sinew 3.0.1 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
data/lib/sinew/csv.rb ADDED
@@ -0,0 +1,89 @@
1
+ require 'csv'
2
+ require 'sterile'
3
+
4
module Sinew
  # Incrementally writes rows to a CSV file. Every value is normalized to a
  # squished plain string before it is written, and a per-column count of
  # non-empty values is kept in #tally.
  class CSV
    # columns - column names handed to #start (nil until started)
    # count   - number of rows emitted so far
    # csv     - the underlying ::CSV writer (nil until started)
    # path    - path of the file being written
    # tally   - Hash of column => number of rows with a non-empty value
    attr_reader :columns, :count, :csv, :path, :tally

    # path - where the csv is written once #start is called. Nothing is opened
    # or created here.
    def initialize(path)
      @count = 0
      @csv = nil
      @path = path
    end

    # Start writing the csv: opens +path+ for writing and emits +columns+ as
    # the header row. Raises RuntimeError if called more than once.
    def start(columns)
      raise 'started twice' if started?

      @columns = columns
      @tally = columns.to_h { [_1, 0] }
      @csv = ::CSV.open(path, 'wb').tap { _1 << columns }
    end

    # has this csv been started?
    def started?
      !@csv.nil?
    end

    # Append a row. +row+ is indexed by column name; values are normalized and
    # written in column order, then the file is flushed.
    #
    # Returns a Hash containing only the columns that had a non-empty value,
    # in case someone wants to pretty print it.
    # Raises RuntimeError if #start has not been called yet.
    def emit(row)
      # fail with a clear message instead of a NoMethodError on nil columns
      raise 'not started' if !started?

      printable = {}
      values = columns.map do |column|
        value = normalize(row[column])
        if value
          printable[column] = value
          tally[column] += 1
        end
        value
      end
      @count += 1

      # emit, flushing right away so the file on disk is always current
      csv << values
      csv.flush

      printable
    end

    # Matches strings made only of printable ASCII (excluding '&', which may
    # start an HTML entity) and whitespace. Such strings skip the Sterile
    # cleanup in #normalize.
    ASCII_ONLY = begin
      chars = (33..126).map(&:chr) - ['&']
      /\A[#{Regexp.escape(chars.join)}\s]+\Z/
    end.freeze

    # Turn +s+ into a clean single-line string, or nil if nothing is left.
    def normalize(s)
      # nokogiri/array/misc => string
      s = if s.respond_to?(:inner_html)
        s.inner_html
      elsif s.is_a?(Array)
        s.join('|')
      else
        s.to_s
      end
      return if s.empty?

      # simple attempt to strip tags. Note that we replace tags with spaces
      s = s.gsub(/<[^>]+>/, ' ')

      if !ASCII_ONLY.match?(s)
        # Converts MS Word 'smart punctuation' to ASCII
        s = Sterile.plain_format(s)

        # &aacute; &amp; etc.
        s = Sterile.decode_entities(s)

        # "šţɽĩɳģ" => "string"
        s = Sterile.transliterate(s)
      end

      # squish
      s = s.strip.gsub(/\s+/, ' ')
      return if s.empty?

      s
    end
  end
end
data/lib/sinew/main.rb CHANGED
@@ -1,98 +1,72 @@
1
- require 'scripto'
2
- require 'sinew/connection'
3
-
4
- #
5
- # Main sinew entry point.
6
- #
7
-
8
1
  module Sinew
9
- class Main < Scripto::Main
10
- attr_reader :runtime_options
2
+ # Helper class used by sinew bin. This exists as an independent class solely
3
+ # for testing, otherwise it would be built into the bin script.
4
+ class Main
5
+ attr_reader :sinew
11
6
 
12
7
  def initialize(options)
13
- super(options)
8
+ options[:output] ||= begin
9
+ src = options[:recipe]
10
+ dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
11
+ dst = dst.sub(%r{^./}, '') # nice to clean this up
12
+ dst
13
+ end
14
14
 
15
- # init
16
- @runtime_options = RuntimeOptions.new
15
+ @sinew = Sinew::Base.new(options)
17
16
  end
18
17
 
19
18
  def run
20
- dsl.run
21
- footer if !quiet?
22
- end
23
-
24
- def quiet?
25
- options[:quiet]
26
- end
27
-
28
- def dsl
29
- @dsl ||= DSL.new(self)
30
- end
31
-
32
- #
33
- # http requests
34
- #
35
-
36
- def http(method, url, options = {})
37
- request = Request.new(method, url, request_options(options))
38
- response = request.perform(connection)
39
-
40
- # always log error messages
41
- if response.error?
42
- puts "xxx http request failed with #{response.code}"
19
+ tm = Time.now
20
+ header if !sinew.options[:silent]
21
+ recipe = sinew.options[:recipe]
22
+ dsl = DSL.new(sinew)
23
+ begin
24
+ dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
25
+ rescue LimitError
26
+ # ignore - this is flow control for --limit
43
27
  end
44
-
45
- response
28
+ footer(Time.now - tm) if !sinew.options[:silent]
46
29
  end
47
30
 
48
- def connection
49
- @connection ||= Connection.create(options: options, runtime_options: runtime_options)
50
- end
51
- protected :connection
31
+ protected
52
32
 
53
33
  #
54
- # output
34
+ # header/footer
55
35
  #
56
36
 
57
- def output
58
- @output ||= Output.new(self)
37
+ def header
38
+ sinew.banner("Writing to #{sinew.csv.path}...")
59
39
  end
60
40
 
61
- #
62
- # helpers
63
- #
41
+ def footer(elapsed)
42
+ csv = sinew.csv
43
+ count = csv.count
64
44
 
65
- def request_options(options)
66
- options.dup.tap do |req|
67
- req[:headers] = {}.tap do |h|
68
- [ runtime_options.headers, options[:headers]].each do
69
- h.merge!(_1) if _1
70
- end
71
- end
72
- req[:proxy] = random_proxy
45
+ if count == 0
46
+ sinew.banner(format('Done in %ds. Nothing written.', elapsed))
47
+ return
73
48
  end
74
- end
75
- protected :request_options
76
-
77
- PROXY_RE = /\A#{URI::PATTERN::HOST}(:\d+)?\Z/.freeze
78
49
 
79
- def random_proxy
80
- return if !options[:proxy]
50
+ # summary
51
+ msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
52
+ sinew.banner(msg)
81
53
 
82
- proxy = options[:proxy].split(',').sample
83
- if proxy !~ PROXY_RE
84
- raise ArgumentError, "invalid proxy #{proxy.inspect}, should be host[:port]"
54
+ # tally
55
+ tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
56
+ len = tally.keys.map { _1.to_s.length }.max
57
+ fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
58
+ tally.each do
59
+ printf(fmt, _1, _2, count, _2 * 100.0 / count)
85
60
  end
86
-
87
- "http://#{proxy}"
88
61
  end
89
- protected :random_proxy
90
62
 
91
- def footer
92
- output.report
93
- finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
94
- banner("#{finished} in #{dsl.elapsed.to_i}s.")
63
+ # simple DSL for .sinew files
64
+ class DSL
65
+ attr_reader :sinew
66
+
67
+ def initialize(sinew)
68
+ @sinew = sinew
69
+ end
95
70
  end
96
- protected :footer
97
71
  end
98
72
  end
@@ -1,5 +1,6 @@
1
1
  module Sinew
2
- module Connection
2
+ module Middleware
3
+ # Minimalist Formatter that logs proxy if present.
3
4
  class LogFormatter < Faraday::Logging::Formatter
4
5
  def request(env)
5
6
  info('req') do
@@ -1,28 +1,19 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
- class Nokogiri::XML::NodeSet
5
- alias old_inner_html inner_html
6
- alias old_inner_text inner_text
4
module Nokogiri
  module XML
    # Reopened so that NodeSet#inner_text and NodeSet#inner_html join the
    # per-node results with a single space instead of the empty string.
    class NodeSet
      # keep the stock implementations reachable under old_* names
      alias_method :old_inner_html, :inner_html
      alias_method :old_inner_text, :inner_text

      # Text of every node in the set, space separated.
      def inner_text
        pieces = map { |node| node.inner_text }
        pieces.join(' ')
      end

      # Inner HTML of every node in the set, space separated. Any arguments
      # are passed through to each node's inner_html.
      def inner_html(*args)
        pieces = map { |node| node.inner_html(*args) }
        pieces.join(' ')
      end
    end
  end
end
@@ -1,72 +1,61 @@
1
- require 'stringio'
2
- require 'zlib'
3
-
4
- #
5
- # An HTTP response.
6
- #
1
+ require 'delegate'
2
+ require 'hashie/mash'
3
+ require 'json'
4
+ require 'nokogiri'
7
5
 
8
6
  module Sinew
9
- class Response
10
- attr_accessor :request, :uri, :body, :code, :headers
7
+ # A wrapper around Faraday::Response, with some parsing helpers.
8
+ class Response < SimpleDelegator
9
+ # Like body, but tries to cleanup whitespace around HTML for easier parsing.
10
+ def html
11
+ @html ||= body.dup.tap do
12
+ # fix invalid utf8
13
+ if _1.encoding == Encoding::UTF_8
14
+ _1.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
15
+ end
11
16
 
12
- #
13
- # factory methods
14
- #
17
+ # squish
18
+ _1.strip!
19
+ _1.gsub!(/\s+/, ' ')
15
20
 
16
- def self.from_network(request, fday_response)
17
- Response.new.tap do
18
- _1.request = request
19
- _1.uri = fday_response.env.url
20
- _1.code = fday_response.status
21
- _1.headers = fday_response.headers.to_h
22
- _1.body = process_body(fday_response)
21
+ # kill whitespace around tags
22
+ _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
23
23
  end
24
24
  end
25
25
 
26
- # helper for decoding bodies before parsing
27
- def self.process_body(response)
28
- body = response.body
29
-
30
- # inflate if necessary
31
- bits = body[0, 10].force_encoding('BINARY')
32
- if bits =~ /\A\x1f\x8b/n
33
- body = Zlib::GzipReader.new(StringIO.new(body)).read
34
- end
35
-
36
- # force to utf-8 if we think this could be text
37
- if body.encoding != Encoding::UTF_8
38
- if content_type = response.headers['content-type']
39
- if content_type =~ /\b(html|javascript|json|text|xml)\b/
40
- body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
41
- end
42
- end
43
- end
26
+ # Return body as JSON
27
+ def json
28
+ @json ||= JSON.parse(body, symbolize_names: true)
29
+ end
44
30
 
45
- body
31
+ # Return JSON body as Hashie::Mash
32
+ def mash
33
+ @mash ||= Hashie::Mash.new(json)
46
34
  end
47
35
 
48
- #
49
- # accessors
50
- #
36
+ # Return body HTML as Nokogiri document
37
+ def noko
38
+ @noko ||= Nokogiri::HTML(html)
39
+ end
51
40
 
52
- def error?
53
- code >= 400
41
+ # Return body XML as Nokogiri document
42
+ def xml
43
+ @xml ||= Nokogiri::XML(html)
54
44
  end
55
45
 
56
- def error_500?
57
- code / 100 >= 5
46
+ # Return the final URI for the request, after redirects
47
+ def url
48
+ env.url
58
49
  end
59
50
 
60
- def redirected?
61
- request.uri != uri
51
+ # Return the cache diskpath for this response
52
+ def diskpath
53
+ env[:httpdisk_diskpath]
62
54
  end
63
55
 
64
- def head_as_json
65
- {
66
- uri: uri,
67
- code: code,
68
- headers: headers,
69
- }
56
+ # Remove cached response from disk, if any
57
+ def uncache
58
+ File.unlink(diskpath) if File.exist?(diskpath)
70
59
  end
71
60
  end
72
61
  end
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = '3.0.1'.freeze
3
+ VERSION = '4.0.0'.freeze
4
4
  end
data/sample.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative 'lib/sinew'
2
+
3
# Sample: drive sinew directly from ruby, writing rows to sample.csv.
sinew = Sinew.new(output: 'sample.csv', verbose: true)

# Fetch the landing page and emit one csv row (url + title) per list link.
home = sinew.get 'http://httpbingo.org'
home.noko.css('ul li a').each do |anchor|
  sinew.csv_emit({ url: anchor[:href], title: anchor.text })
end

# Also fetch the /redirect/2 endpoint.
sinew.get 'http://httpbingo.org/redirect/2'
data/sample.sinew CHANGED
@@ -1,9 +1,9 @@
1
# Sample recipe. `sinew` is supplied by the DSL object the recipe is evaluated in.

# Fetch the landing page and emit one csv row (url + title) per list link.
home = sinew.get 'http://httpbingo.org'
home.noko.css('ul li a').each do |anchor|
  sinew.csv_emit({ url: anchor[:href], title: anchor.text })
end

# Also fetch the /redirect/2 endpoint.
sinew.get 'http://httpbingo.org/redirect/2'
data/sinew.gemspec CHANGED
@@ -3,14 +3,15 @@ $LOAD_PATH.unshift("#{__dir__}/lib")
3
3
  require 'sinew/version'
4
4
 
5
5
  Gem::Specification.new do |s|
6
- s.name = 'sinew'
7
- s.version = Sinew::VERSION
8
- s.license = 'MIT'
9
- s.authors = [ 'Adam Doppelt', 'Nathan Kriege' ]
10
- s.email = [ 'amd@gurge.com' ]
11
- s.homepage = 'http://github.com/gurgeous/sinew'
12
- s.summary = 'Sinew - structured web crawling using recipes.'
6
+ s.name = 'sinew'
7
+ s.version = Sinew::VERSION
8
+ s.authors = ['Adam Doppelt', 'Nathan Kriege']
9
+ s.email = ['amd@gurge.com']
10
+
11
+ s.summary = 'Sinew - structured web crawling using recipes.'
13
12
  s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
13
+ s.homepage = 'http://github.com/gurgeous/sinew'
14
+ s.license = 'MIT'
14
15
  s.required_ruby_version = '>= 2.7'
15
16
 
16
17
  # what's in the gem?
@@ -19,14 +20,16 @@ Gem::Specification.new do |s|
19
20
  end
20
21
  s.bindir = 'bin'
21
22
  s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
22
- s.require_paths = [ 'lib' ]
23
+ s.require_paths = ['lib']
23
24
 
24
- s.add_runtime_dependency 'amazing_print', '~> 1.3'
25
- s.add_runtime_dependency 'faraday', '~> 1.4'
26
- s.add_runtime_dependency 'faraday-encoding', '~> 0'
27
- s.add_runtime_dependency 'httpdisk', '~> 0'
28
- s.add_runtime_dependency 'nokogiri', '~> 1.11'
29
- s.add_runtime_dependency 'scripto', '~> 0'
30
- s.add_runtime_dependency 'slop', '~> 4.8'
31
- s.add_runtime_dependency 'sterile', '~> 1.0'
25
+ # gem dependencies
26
+ s.add_dependency 'amazing_print', '~> 1.3'
27
+ s.add_dependency 'faraday', '~> 1.4'
28
+ s.add_dependency 'faraday-encoding', '~> 0'
29
+ s.add_dependency 'faraday-rate_limiter', '~> 0.0'
30
+ s.add_dependency 'hashie', '~> 4.1'
31
+ s.add_dependency 'httpdisk', '~> 0.5'
32
+ s.add_dependency 'nokogiri', '~> 1.11'
33
+ s.add_dependency 'slop', '~> 4.8'
34
+ s.add_dependency 'sterile', '~> 1.0'
32
35
  end