sinew 3.0.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/sinew/csv.rb ADDED
@@ -0,0 +1,89 @@
1
+ require 'csv'
2
+ require 'sterile'
3
+
4
module Sinew
  # Incremental csv writer. Call #start once with the column list, then #emit
  # for each row. Every cell is cleaned up with #normalize before it is
  # written, and a per-column tally of non-empty cells is kept along the way.
  class CSV
    attr_reader :columns, :count, :csv, :path, :tally

    # path is where the csv will be written. Nothing is opened until #start.
    def initialize(path)
      @count = 0
      @csv = nil
      @path = path
    end

    # Start writing the csv: open path for writing and emit columns as the
    # header row. Raises if called more than once.
    def start(columns)
      raise 'started twice' if started?

      # keep a private copy so the header, rows and tally can't drift apart if
      # the caller mutates its array later
      @columns = columns.dup
      @tally = @columns.to_h { [_1, 0] }
      @csv = ::CSV.open(path, 'wb').tap { _1 << @columns }
    end

    # has this csv been started?
    def started?
      !@csv.nil?
    end

    # Append a row. row is indexed by column (row[column]); columns missing
    # from row are written as empty cells. Returns a hash containing only the
    # non-empty normalized cells, in case someone wants to pretty print it.
    # Raises if #start hasn't been called yet.
    def emit(row)
      raise 'not started' unless started?

      # convert row to cols
      values = columns.map { normalize(row[_1]) }

      # emit first, so count/tally only reflect rows that reached the file
      csv << values
      csv.flush

      # construct printable (our return value) and update the stats
      printable = {}
      columns.zip(values) do |column, value|
        next unless value

        printable[column] = value
        tally[column] += 1
      end
      @count += 1

      printable
    end

    # Matches strings made up solely of whitespace and printable ASCII other
    # than '&'. An ampersand might start an entity, so those strings take the
    # slow path in #normalize.
    ASCII_ONLY = begin
      chars = (33..126).map(&:chr) - ['&']
      /\A[#{Regexp.escape(chars.join)}\s]+\Z/
    end.freeze

    # Turn a cell value into a clean single-line string: tags are replaced
    # with spaces, non-ASCII/entity text is run through Sterile, and
    # whitespace is squished. Returns nil if nothing is left.
    def normalize(s)
      # nokogiri/array/misc => string
      s = if s.respond_to?(:inner_html)
        s.inner_html
      elsif s.is_a?(Array)
        s.join('|')
      else
        s.to_s
      end
      return if s.empty?

      # simple attempt to strip tags. Note that we replace tags with spaces
      s = s.gsub(/<[^>]+>/, ' ')

      unless ASCII_ONLY.match?(s)
        # Converts MS Word 'smart punctuation' to ASCII
        s = Sterile.plain_format(s)

        # &aacute; &amp; etc.
        s = Sterile.decode_entities(s)

        # "šţɽĩɳģ" => "string"
        s = Sterile.transliterate(s)
      end

      # squish
      s = s.strip.gsub(/\s+/, ' ')
      return if s.empty?

      s
    end
  end
end
data/lib/sinew/main.rb CHANGED
@@ -1,98 +1,72 @@
1
- require 'scripto'
2
- require 'sinew/connection'
3
-
4
- #
5
- # Main sinew entry point.
6
- #
7
-
8
1
  module Sinew
9
- class Main < Scripto::Main
10
- attr_reader :runtime_options
2
+ # Helper class used by sinew bin. This exists as an independent class solely
3
+ # for testing, otherwise it would be built into the bin script.
4
+ class Main
5
+ attr_reader :sinew
11
6
 
12
7
  def initialize(options)
13
- super(options)
8
+ options[:output] ||= begin
9
+ src = options[:recipe]
10
+ dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
11
+ dst = dst.sub(%r{^./}, '') # nice to clean this up
12
+ dst
13
+ end
14
14
 
15
- # init
16
- @runtime_options = RuntimeOptions.new
15
+ @sinew = Sinew::Base.new(options)
17
16
  end
18
17
 
19
18
  def run
20
- dsl.run
21
- footer if !quiet?
22
- end
23
-
24
- def quiet?
25
- options[:quiet]
26
- end
27
-
28
- def dsl
29
- @dsl ||= DSL.new(self)
30
- end
31
-
32
- #
33
- # http requests
34
- #
35
-
36
- def http(method, url, options = {})
37
- request = Request.new(method, url, request_options(options))
38
- response = request.perform(connection)
39
-
40
- # always log error messages
41
- if response.error?
42
- puts "xxx http request failed with #{response.code}"
19
+ tm = Time.now
20
+ header if !sinew.options[:silent]
21
+ recipe = sinew.options[:recipe]
22
+ dsl = DSL.new(sinew)
23
+ begin
24
+ dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
25
+ rescue LimitError
26
+ # ignore - this is flow control for --limit
43
27
  end
44
-
45
- response
28
+ footer(Time.now - tm) if !sinew.options[:silent]
46
29
  end
47
30
 
48
- def connection
49
- @connection ||= Connection.create(options: options, runtime_options: runtime_options)
50
- end
51
- protected :connection
31
+ protected
52
32
 
53
33
  #
54
- # output
34
+ # header/footer
55
35
  #
56
36
 
57
- def output
58
- @output ||= Output.new(self)
37
+ def header
38
+ sinew.banner("Writing to #{sinew.csv.path}...")
59
39
  end
60
40
 
61
- #
62
- # helpers
63
- #
41
+ def footer(elapsed)
42
+ csv = sinew.csv
43
+ count = csv.count
64
44
 
65
- def request_options(options)
66
- options.dup.tap do |req|
67
- req[:headers] = {}.tap do |h|
68
- [ runtime_options.headers, options[:headers]].each do
69
- h.merge!(_1) if _1
70
- end
71
- end
72
- req[:proxy] = random_proxy
45
+ if count == 0
46
+ sinew.banner(format('Done in %ds. Nothing written.', elapsed))
47
+ return
73
48
  end
74
- end
75
- protected :request_options
76
-
77
- PROXY_RE = /\A#{URI::PATTERN::HOST}(:\d+)?\Z/.freeze
78
49
 
79
- def random_proxy
80
- return if !options[:proxy]
50
+ # summary
51
+ msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
52
+ sinew.banner(msg)
81
53
 
82
- proxy = options[:proxy].split(',').sample
83
- if proxy !~ PROXY_RE
84
- raise ArgumentError, "invalid proxy #{proxy.inspect}, should be host[:port]"
54
+ # tally
55
+ tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
56
+ len = tally.keys.map { _1.to_s.length }.max
57
+ fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
58
+ tally.each do
59
+ printf(fmt, _1, _2, count, _2 * 100.0 / count)
85
60
  end
86
-
87
- "http://#{proxy}"
88
61
  end
89
- protected :random_proxy
90
62
 
91
- def footer
92
- output.report
93
- finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
94
- banner("#{finished} in #{dsl.elapsed.to_i}s.")
63
+ # simple DSL for .sinew files
64
+ class DSL
65
+ attr_reader :sinew
66
+
67
+ def initialize(sinew)
68
+ @sinew = sinew
69
+ end
95
70
  end
96
- protected :footer
97
71
  end
98
72
  end
@@ -1,5 +1,6 @@
1
1
  module Sinew
2
- module Connection
2
+ module Middleware
3
+ # Minimalist Formatter that logs proxy if present.
3
4
  class LogFormatter < Faraday::Logging::Formatter
4
5
  def request(env)
5
6
  info('req') do
@@ -1,28 +1,19 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
- class Nokogiri::XML::NodeSet
5
- alias old_inner_html inner_html
6
- alias old_inner_text inner_text
4
+ module Nokogiri
5
+ module XML
6
+ class NodeSet
7
+ alias old_inner_html inner_html
8
+ alias old_inner_text inner_text
7
9
 
8
- def inner_text
9
- map(&:inner_text).join(' ')
10
- end
11
-
12
- def inner_html(*args)
13
- map { |i| i.inner_html(*args) }.join(' ')
14
- end
15
- end
10
+ def inner_text
11
+ map(&:inner_text).join(' ')
12
+ end
16
13
 
17
- # text_just_me
18
- class Nokogiri::XML::Node
19
- def text_just_me
20
- t = children.find { |i| i.node_type == Nokogiri::XML::Node::TEXT_NODE }
21
- t&.text
22
- end
23
- end
24
- class Nokogiri::XML::NodeSet
25
- def text_just_me
26
- map(&:text_just_me).join(' ')
14
+ def inner_html(*args)
15
+ map { _1.inner_html(*args) }.join(' ')
16
+ end
17
+ end
27
18
  end
28
19
  end
@@ -1,72 +1,61 @@
1
- require 'stringio'
2
- require 'zlib'
3
-
4
- #
5
- # An HTTP response.
6
- #
1
+ require 'delegate'
2
+ require 'hashie/mash'
3
+ require 'json'
4
+ require 'nokogiri'
7
5
 
8
6
  module Sinew
9
- class Response
10
- attr_accessor :request, :uri, :body, :code, :headers
7
+ # A wrapper around Faraday::Response, with some parsing helpers.
8
+ class Response < SimpleDelegator
9
+ # Like body, but tries to cleanup whitespace around HTML for easier parsing.
10
+ def html
11
+ @html ||= body.dup.tap do
12
+ # fix invalid utf8
13
+ if _1.encoding == Encoding::UTF_8
14
+ _1.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
15
+ end
11
16
 
12
- #
13
- # factory methods
14
- #
17
+ # squish
18
+ _1.strip!
19
+ _1.gsub!(/\s+/, ' ')
15
20
 
16
- def self.from_network(request, fday_response)
17
- Response.new.tap do
18
- _1.request = request
19
- _1.uri = fday_response.env.url
20
- _1.code = fday_response.status
21
- _1.headers = fday_response.headers.to_h
22
- _1.body = process_body(fday_response)
21
+ # kill whitespace around tags
22
+ _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
23
23
  end
24
24
  end
25
25
 
26
- # helper for decoding bodies before parsing
27
- def self.process_body(response)
28
- body = response.body
29
-
30
- # inflate if necessary
31
- bits = body[0, 10].force_encoding('BINARY')
32
- if bits =~ /\A\x1f\x8b/n
33
- body = Zlib::GzipReader.new(StringIO.new(body)).read
34
- end
35
-
36
- # force to utf-8 if we think this could be text
37
- if body.encoding != Encoding::UTF_8
38
- if content_type = response.headers['content-type']
39
- if content_type =~ /\b(html|javascript|json|text|xml)\b/
40
- body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
41
- end
42
- end
43
- end
26
+ # Return body as JSON
27
+ def json
28
+ @json ||= JSON.parse(body, symbolize_names: true)
29
+ end
44
30
 
45
- body
31
+ # Return JSON body as Hashie::Mash
32
+ def mash
33
+ @mash ||= Hashie::Mash.new(json)
46
34
  end
47
35
 
48
- #
49
- # accessors
50
- #
36
+ # Return body HTML as Nokogiri document
37
+ def noko
38
+ @noko ||= Nokogiri::HTML(html)
39
+ end
51
40
 
52
- def error?
53
- code >= 400
41
+ # Return body XML as Nokogiri document
42
+ def xml
43
+ @xml ||= Nokogiri::XML(html)
54
44
  end
55
45
 
56
- def error_500?
57
- code / 100 >= 5
46
+ # Return the final URI for the request, after redirects
47
+ def url
48
+ env.url
58
49
  end
59
50
 
60
- def redirected?
61
- request.uri != uri
51
+ # Return the cache diskpath for this response
52
+ def diskpath
53
+ env[:httpdisk_diskpath]
62
54
  end
63
55
 
64
- def head_as_json
65
- {
66
- uri: uri,
67
- code: code,
68
- headers: headers,
69
- }
56
+ # Remove cached response from disk, if any
57
+ def uncache
58
+ File.unlink(diskpath) if File.exist?(diskpath)
70
59
  end
71
60
  end
72
61
  end
data/lib/sinew/version.rb CHANGED
@@ -1,4 +1,4 @@
1
1
  module Sinew
2
2
  # Gem version
3
- VERSION = '3.0.1'.freeze
3
+ VERSION = '4.0.0'.freeze
4
4
  end
data/sample.rb ADDED
@@ -0,0 +1,13 @@
1
+ require_relative 'lib/sinew'
2
+
3
# Sample script: fetch the httpbingo.org index, hand the href and text of
# every 'ul li a' link to sinew.csv_emit, then request a redirecting url.
sinew = Sinew.new(output: 'sample.csv', verbose: true)

links = sinew.get('http://httpbingo.org').noko.css('ul li a')
links.each do |link|
  sinew.csv_emit({ url: link[:href], title: link.text })
end

sinew.get('http://httpbingo.org/redirect/2')
data/sample.sinew CHANGED
@@ -1,9 +1,9 @@
1
- get 'http://httpbingo.org'
2
- noko.css('ul li a').each do |a|
1
+ response = sinew.get 'http://httpbingo.org'
2
+ response.noko.css('ul li a').each do |a|
3
3
  row = {}
4
4
  row[:url] = a[:href]
5
5
  row[:title] = a.text
6
- csv_emit(row)
6
+ sinew.csv_emit(row)
7
7
  end
8
8
 
9
- get 'http://httpbingo.org/redirect/2'
9
+ sinew.get 'http://httpbingo.org/redirect/2'
data/sinew.gemspec CHANGED
@@ -3,14 +3,15 @@ $LOAD_PATH.unshift("#{__dir__}/lib")
3
3
  require 'sinew/version'
4
4
 
5
5
  Gem::Specification.new do |s|
6
- s.name = 'sinew'
7
- s.version = Sinew::VERSION
8
- s.license = 'MIT'
9
- s.authors = [ 'Adam Doppelt', 'Nathan Kriege' ]
10
- s.email = [ 'amd@gurge.com' ]
11
- s.homepage = 'http://github.com/gurgeous/sinew'
12
- s.summary = 'Sinew - structured web crawling using recipes.'
6
+ s.name = 'sinew'
7
+ s.version = Sinew::VERSION
8
+ s.authors = ['Adam Doppelt', 'Nathan Kriege']
9
+ s.email = ['amd@gurge.com']
10
+
11
+ s.summary = 'Sinew - structured web crawling using recipes.'
13
12
  s.description = 'Crawl web sites easily using ruby recipes, with caching and nokogiri.'
13
+ s.homepage = 'http://github.com/gurgeous/sinew'
14
+ s.license = 'MIT'
14
15
  s.required_ruby_version = '>= 2.7'
15
16
 
16
17
  # what's in the gem?
@@ -19,14 +20,16 @@ Gem::Specification.new do |s|
19
20
  end
20
21
  s.bindir = 'bin'
21
22
  s.executables = s.files.grep(%r{^#{s.bindir}/}) { File.basename(_1) }
22
- s.require_paths = [ 'lib' ]
23
+ s.require_paths = ['lib']
23
24
 
24
- s.add_runtime_dependency 'amazing_print', '~> 1.3'
25
- s.add_runtime_dependency 'faraday', '~> 1.4'
26
- s.add_runtime_dependency 'faraday-encoding', '~> 0'
27
- s.add_runtime_dependency 'httpdisk', '~> 0'
28
- s.add_runtime_dependency 'nokogiri', '~> 1.11'
29
- s.add_runtime_dependency 'scripto', '~> 0'
30
- s.add_runtime_dependency 'slop', '~> 4.8'
31
- s.add_runtime_dependency 'sterile', '~> 1.0'
25
+ # gem dependencies
26
+ s.add_dependency 'amazing_print', '~> 1.3'
27
+ s.add_dependency 'faraday', '~> 1.4'
28
+ s.add_dependency 'faraday-encoding', '~> 0'
29
+ s.add_dependency 'faraday-rate_limiter', '~> 0.0'
30
+ s.add_dependency 'hashie', '~> 4.1'
31
+ s.add_dependency 'httpdisk', '~> 0.5'
32
+ s.add_dependency 'nokogiri', '~> 1.11'
33
+ s.add_dependency 'slop', '~> 4.8'
34
+ s.add_dependency 'sterile', '~> 1.0'
32
35
  end