sinew 2.0.3 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +26 -0
  3. data/.gitignore +3 -5
  4. data/.rubocop.yml +31 -46
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +124 -0
  7. data/README.md +146 -81
  8. data/Rakefile +36 -20
  9. data/bin/sinew +13 -39
  10. data/lib/sinew.rb +23 -10
  11. data/lib/sinew/args.rb +53 -0
  12. data/lib/sinew/base.rb +251 -0
  13. data/lib/sinew/csv.rb +89 -0
  14. data/lib/sinew/main.rb +45 -98
  15. data/lib/sinew/middleware/log_formatter.rb +23 -0
  16. data/lib/sinew/nokogiri_ext.rb +12 -21
  17. data/lib/sinew/response.rb +39 -99
  18. data/lib/sinew/version.rb +1 -1
  19. data/sample.rb +13 -0
  20. data/sample.sinew +4 -4
  21. data/sinew.gemspec +26 -25
  22. metadata +46 -108
  23. data/.travis.yml +0 -4
  24. data/.vscode/extensions.json +0 -3
  25. data/.vscode/settings.json +0 -15
  26. data/lib/sinew/cache.rb +0 -79
  27. data/lib/sinew/core_ext.rb +0 -59
  28. data/lib/sinew/dsl.rb +0 -114
  29. data/lib/sinew/output.rb +0 -149
  30. data/lib/sinew/request.rb +0 -151
  31. data/lib/sinew/runtime_options.rb +0 -28
  32. data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
  33. data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
  34. data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
  35. data/test/legacy/eu.httpbin.org/status,500 +0 -1
  36. data/test/legacy/legacy.sinew +0 -2
  37. data/test/recipes/array_header.sinew +0 -6
  38. data/test/recipes/basic.sinew +0 -8
  39. data/test/recipes/dups.sinew +0 -7
  40. data/test/recipes/implicit_header.sinew +0 -5
  41. data/test/recipes/limit.sinew +0 -11
  42. data/test/recipes/noko.sinew +0 -9
  43. data/test/recipes/uri.sinew +0 -11
  44. data/test/recipes/xml.sinew +0 -8
  45. data/test/test.html +0 -45
  46. data/test/test_cache.rb +0 -69
  47. data/test/test_helper.rb +0 -123
  48. data/test/test_legacy.rb +0 -23
  49. data/test/test_main.rb +0 -34
  50. data/test/test_nokogiri_ext.rb +0 -18
  51. data/test/test_output.rb +0 -56
  52. data/test/test_recipes.rb +0 -60
  53. data/test/test_requests.rb +0 -135
  54. data/test/test_utf8.rb +0 -39
data/lib/sinew/csv.rb ADDED
@@ -0,0 +1,89 @@
1
+ require 'csv'
2
+ require 'sterile'
3
+
4
+ module Sinew
5
+ class CSV
6
+ attr_reader :columns, :count, :csv, :path, :tally
7
+
8
+ def initialize(path)
9
+ @count = 0
10
+ @csv = nil
11
+ @path = path
12
+ end
13
+
14
+ # start writing the csv
15
+ def start(columns)
16
+ raise 'started twice' if started?
17
+
18
+ @columns = columns
19
+ @tally = columns.map { [_1, 0] }.to_h
20
+ @csv = ::CSV.open(path, 'wb').tap do
21
+ _1 << columns
22
+ end
23
+ end
24
+
25
+ # has this csv been started?
26
+ def started?
27
+ @csv != nil
28
+ end
29
+
30
+ # append a row
31
+ def emit(row)
32
+ # convert row to cols, and construct print (our return value)
33
+ print = {}
34
+ row = columns.map do
35
+ value = normalize(row[_1])
36
+ if value
37
+ print[_1] = value
38
+ tally[_1] += 1
39
+ end
40
+ value
41
+ end
42
+ @count += 1
43
+
44
+ # emit
45
+ csv << row
46
+ csv.flush
47
+
48
+ # return in case someone wants to pretty print this
49
+ print
50
+ end
51
+
52
+ ASCII_ONLY = begin
53
+ chars = (33..126).map(&:chr) - ['&']
54
+ /\A[#{Regexp.escape(chars.join)}\s]+\Z/
55
+ end.freeze
56
+
57
+ def normalize(s)
58
+ # nokogiri/array/misc => string
59
+ s = if s.respond_to?(:inner_html)
60
+ s.inner_html
61
+ elsif s.is_a?(Array)
62
+ s.join('|')
63
+ else
64
+ s.to_s
65
+ end
66
+ return if s.empty?
67
+
68
+ # simple attempt to strip tags. Note that we replace tags with spaces
69
+ s = s.gsub(/<[^>]+>/, ' ')
70
+
71
+ if s !~ ASCII_ONLY
72
+ # Converts MS Word 'smart punctuation' to ASCII
73
+ s = Sterile.plain_format(s)
74
+
75
+ # &aacute; &amp; etc.
76
+ s = Sterile.decode_entities(s)
77
+
78
+ # "šţɽĩɳģ" => "string"
79
+ s = Sterile.transliterate(s)
80
+ end
81
+
82
+ # squish
83
+ s = s.strip.gsub(/\s+/, ' ')
84
+ return if s.empty?
85
+
86
+ s
87
+ end
88
+ end
89
+ end
data/lib/sinew/main.rb CHANGED
@@ -1,125 +1,72 @@
1
- require 'scripto'
2
-
3
- #
4
- # Main sinew entry point.
5
- #
6
-
7
1
  module Sinew
8
- class Main < Scripto::Main
9
- attr_reader :runtime_options, :request_tm, :request_count
2
+ # Helper class used by sinew bin. This exists as an independent class solely
3
+ # for testing, otherwise it would be built into the bin script.
4
+ class Main
5
+ attr_reader :sinew
10
6
 
11
7
  def initialize(options)
12
- super(options)
13
-
14
- # init
15
- @runtime_options = RuntimeOptions.new
16
- @request_tm = Time.at(0)
17
- @request_count = 0
18
-
19
- if options[:proxy]
20
- addr, port = options[:proxy].split(':')
21
- runtime_options.httparty_options[:http_proxyaddr] = addr
22
- runtime_options.httparty_options[:http_proxyport] = port || 80
8
+ options[:output] ||= begin
9
+ src = options[:recipe]
10
+ dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
11
+ dst = dst.sub(%r{^./}, '') # nice to clean this up
12
+ dst
23
13
  end
24
- end
25
14
 
26
- def run
27
- dsl.run
28
- footer if !quiet?
15
+ @sinew = Sinew::Base.new(options)
29
16
  end
30
17
 
31
- def quiet?
32
- options[:quiet]
18
+ def run
19
+ tm = Time.now
20
+ header if !sinew.options[:silent]
21
+ recipe = sinew.options[:recipe]
22
+ dsl = DSL.new(sinew)
23
+ begin
24
+ dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
25
+ rescue LimitError
26
+ # ignore - this is flow control for --limit
27
+ end
28
+ footer(Time.now - tm) if !sinew.options[:silent]
33
29
  end
34
30
 
35
- def dsl
36
- @dsl ||= DSL.new(self)
37
- end
31
+ protected
38
32
 
39
33
  #
40
- # http requests and caching
34
+ # header/footer
41
35
  #
42
36
 
43
- def cache
44
- @cache ||= Cache.new(self)
37
+ def header
38
+ sinew.banner("Writing to #{sinew.csv.path}...")
45
39
  end
46
40
 
47
- def http(method, url, options = {})
48
- request = Request.new(self, method, url, options)
49
-
50
- # try to get from cache
51
- response = cache.get(request)
41
+ def footer(elapsed)
42
+ csv = sinew.csv
43
+ count = csv.count
52
44
 
53
- # perform if necessary
54
- if !response
55
- response = perform(request)
56
- cache.set(response)
45
+ if count == 0
46
+ sinew.banner(format('Done in %ds. Nothing written.', elapsed))
47
+ return
57
48
  end
58
49
 
59
- # always log error messages
60
- if response.error?
61
- puts "xxx http request failed with #{response.code}"
62
- end
63
-
64
- response
65
- end
66
-
67
- def perform(request)
68
- before_perform_request(request)
69
-
70
- response = nil
50
+ # summary
51
+ msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
52
+ sinew.banner(msg)
71
53
 
72
- tries = runtime_options.retries + 1
73
- while tries > 0
74
- tries -= 1
75
- begin
76
- @request_count += 1
77
- response = request.perform
78
- rescue Timeout::Error
79
- response = Response.from_timeout(request)
80
- end
81
- break if !response.error_500?
54
+ # tally
55
+ tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
56
+ len = tally.keys.map { _1.to_s.length }.max
57
+ fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
58
+ tally.each do
59
+ printf(fmt, _1, _2, count, _2 * 100.0 / count)
82
60
  end
83
-
84
- response
85
61
  end
86
- protected :perform
87
62
 
88
- #
89
- # output
90
- #
91
-
92
- def output
93
- @output ||= Output.new(self)
94
- end
63
+ # simple DSL for .sinew files
64
+ class DSL
65
+ attr_reader :sinew
95
66
 
96
- #
97
- # helpers
98
- #
99
-
100
- def before_perform_request(request)
101
- # log
102
- if !quiet?
103
- msg = if request.method != 'get'
104
- "req #{request.uri} (#{request.method})"
105
- else
106
- "req #{request.uri}"
107
- end
108
- $stderr.puts msg
67
+ def initialize(sinew)
68
+ @sinew = sinew
109
69
  end
110
-
111
- # rate limit
112
- sleep = (request_tm + runtime_options.rate_limit) - Time.now
113
- sleep(sleep) if sleep > 0
114
- @request_tm = Time.now
115
- end
116
- protected :before_perform_request
117
-
118
- def footer
119
- output.report
120
- finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
121
- banner("#{finished} in #{dsl.elapsed.to_i}s.")
122
70
  end
123
- protected :footer
124
71
  end
125
72
  end
@@ -0,0 +1,23 @@
1
+ module Sinew
2
+ module Middleware
3
+ # Minimalist Formatter that logs proxy if present.
4
+ class LogFormatter < Faraday::Logging::Formatter
5
+ def request(env)
6
+ info('req') do
7
+ # Only log the initial request, not the redirects
8
+ return if env[:redirect]
9
+
10
+ msg = apply_filters(env.url.to_s)
11
+ msg = "#{msg} (#{env.method})" if env.method != :get
12
+ msg = "#{msg} => #{env.request.proxy.uri}" if env.request.proxy
13
+
14
+ msg
15
+ end
16
+ end
17
+
18
+ def response(env)
19
+ # silent
20
+ end
21
+ end
22
+ end
23
+ end
@@ -1,28 +1,19 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
- class Nokogiri::XML::NodeSet
5
- alias old_inner_html inner_html
6
- alias old_inner_text inner_text
4
+ module Nokogiri
5
+ module XML
6
+ class NodeSet
7
+ alias old_inner_html inner_html
8
+ alias old_inner_text inner_text
7
9
 
8
- def inner_text
9
- map(&:inner_text).join(' ')
10
- end
11
-
12
- def inner_html(*args)
13
- map { |i| i.inner_html(*args) }.join(' ')
14
- end
15
- end
10
+ def inner_text
11
+ map(&:inner_text).join(' ')
12
+ end
16
13
 
17
- # text_just_me
18
- class Nokogiri::XML::Node
19
- def text_just_me
20
- t = children.find { |i| i.node_type == Nokogiri::XML::Node::TEXT_NODE }
21
- t&.text
22
- end
23
- end
24
- class Nokogiri::XML::NodeSet
25
- def text_just_me
26
- map(&:text_just_me).join(' ')
14
+ def inner_html(*args)
15
+ map { _1.inner_html(*args) }.join(' ')
16
+ end
17
+ end
27
18
  end
28
19
  end
@@ -1,121 +1,61 @@
1
- require 'stringio'
2
- require 'zlib'
3
-
4
- #
5
- # An HTTP response. Mostly a wrapper around HTTParty.
6
- #
1
+ require 'delegate'
2
+ require 'hashie/mash'
3
+ require 'json'
4
+ require 'nokogiri'
7
5
 
8
6
  module Sinew
9
- class Response
10
- attr_accessor :request, :uri, :body, :code, :headers
11
-
12
- #
13
- # factory methods
14
- #
15
-
16
- def self.from_network(request, party_response)
17
- Response.new.tap do |response|
18
- response.request = request
19
- response.uri = party_response.request.last_uri
20
- response.code = party_response.code
21
- response.headers = party_response.headers.to_h
22
- response.body = process_body(party_response)
23
- end
24
- end
25
-
26
- def self.from_cache(request, body, head)
27
- Response.new.tap do |response|
28
- response.request = request
29
- response.body = body
7
+ # A wrapper around Faraday::Response, with some parsing helpers.
8
+ class Response < SimpleDelegator
9
+ # Like body, but tries to cleanup whitespace around HTML for easier parsing.
10
+ def html
11
+ @html ||= body.dup.tap do
12
+ # fix invalid utf8
13
+ if _1.encoding == Encoding::UTF_8
14
+ _1.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
15
+ end
30
16
 
31
- # defaults
32
- response.uri = request.uri
33
- response.code = 200
34
- response.headers = {}
17
+ # squish
18
+ _1.strip!
19
+ _1.gsub!(/\s+/, ' ')
35
20
 
36
- # overwrite with cached response headers
37
- if head
38
- if head !~ /^{/
39
- return from_legacy_head(response, head)
40
- end
41
- head = JSON.parse(head, symbolize_names: true)
42
- response.uri = URI.parse(head[:uri])
43
- response.code = head[:code]
44
- response.headers = head[:headers]
45
- end
21
+ # kill whitespace around tags
22
+ _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
46
23
  end
47
24
  end
48
25
 
49
- def self.from_timeout(request)
50
- Response.new.tap do |response|
51
- response.request = request
52
- response.uri = request.uri
53
- response.body = 'timeout'
54
- response.code = 999
55
- response.headers = {}
56
- end
26
+ # Return body as JSON
27
+ def json
28
+ @json ||= JSON.parse(body, symbolize_names: true)
57
29
  end
58
30
 
59
- def self.from_legacy_head(response, head)
60
- response.tap do |r|
61
- case head
62
- when /\ACURLER_ERROR/
63
- # error
64
- r.code = 999
65
- when /\AHTTP/
66
- # redirect
67
- location = head.scan(/Location: ([^\r\n]+)/).flatten.last
68
- r.uri += location
69
- else
70
- $stderr.puts "unknown cached /head for #{r.uri}"
71
- end
72
- end
31
+ # Return JSON body as Hashie::Mash
32
+ def mash
33
+ @mash ||= Hashie::Mash.new(json)
73
34
  end
74
35
 
75
- # helper for decoding bodies before parsing
76
- def self.process_body(response)
77
- body = response.body
78
-
79
- # inflate if necessary
80
- bits = body[0, 10].force_encoding('BINARY')
81
- if bits =~ /\A\x1f\x8b/n
82
- body = Zlib::GzipReader.new(StringIO.new(body)).read
83
- end
84
-
85
- # force to utf-8 if we think this could be text
86
- if body.encoding != Encoding::UTF_8
87
- if content_type = response.headers['content-type']
88
- if content_type =~ /\b(html|javascript|json|text|xml)\b/
89
- body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
90
- end
91
- end
92
- end
93
-
94
- body
36
+ # Return body HTML as Nokogiri document
37
+ def noko
38
+ @noko ||= Nokogiri::HTML(html)
95
39
  end
96
40
 
97
- #
98
- # accessors
99
- #
100
-
101
- def error?
102
- code >= 400
41
+ # Return body XML as Nokogiri document
42
+ def xml
43
+ @xml ||= Nokogiri::XML(html)
103
44
  end
104
45
 
105
- def error_500?
106
- code / 100 >= 5
46
+ # Return the final URI for the request, after redirects
47
+ def url
48
+ env.url
107
49
  end
108
50
 
109
- def redirected?
110
- request.uri != uri
51
+ # Return the cache diskpath for this response
52
+ def diskpath
53
+ env[:httpdisk_diskpath]
111
54
  end
112
55
 
113
- def head_as_json
114
- {
115
- uri: uri,
116
- code: code,
117
- headers: headers,
118
- }
56
+ # Remove cached response from disk, if any
57
+ def uncache
58
+ File.unlink(diskpath) if File.exist?(diskpath)
119
59
  end
120
60
  end
121
61
  end