sinew 2.0.3 → 4.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (54) hide show
  1. checksums.yaml +4 -4
  2. data/.github/workflows/test.yml +26 -0
  3. data/.gitignore +3 -5
  4. data/.rubocop.yml +31 -46
  5. data/Gemfile +9 -0
  6. data/Gemfile.lock +124 -0
  7. data/README.md +146 -81
  8. data/Rakefile +36 -20
  9. data/bin/sinew +13 -39
  10. data/lib/sinew.rb +23 -10
  11. data/lib/sinew/args.rb +53 -0
  12. data/lib/sinew/base.rb +251 -0
  13. data/lib/sinew/csv.rb +89 -0
  14. data/lib/sinew/main.rb +45 -98
  15. data/lib/sinew/middleware/log_formatter.rb +23 -0
  16. data/lib/sinew/nokogiri_ext.rb +12 -21
  17. data/lib/sinew/response.rb +39 -99
  18. data/lib/sinew/version.rb +1 -1
  19. data/sample.rb +13 -0
  20. data/sample.sinew +4 -4
  21. data/sinew.gemspec +26 -25
  22. metadata +46 -108
  23. data/.travis.yml +0 -4
  24. data/.vscode/extensions.json +0 -3
  25. data/.vscode/settings.json +0 -15
  26. data/lib/sinew/cache.rb +0 -79
  27. data/lib/sinew/core_ext.rb +0 -59
  28. data/lib/sinew/dsl.rb +0 -114
  29. data/lib/sinew/output.rb +0 -149
  30. data/lib/sinew/request.rb +0 -151
  31. data/lib/sinew/runtime_options.rb +0 -28
  32. data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
  33. data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
  34. data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
  35. data/test/legacy/eu.httpbin.org/status,500 +0 -1
  36. data/test/legacy/legacy.sinew +0 -2
  37. data/test/recipes/array_header.sinew +0 -6
  38. data/test/recipes/basic.sinew +0 -8
  39. data/test/recipes/dups.sinew +0 -7
  40. data/test/recipes/implicit_header.sinew +0 -5
  41. data/test/recipes/limit.sinew +0 -11
  42. data/test/recipes/noko.sinew +0 -9
  43. data/test/recipes/uri.sinew +0 -11
  44. data/test/recipes/xml.sinew +0 -8
  45. data/test/test.html +0 -45
  46. data/test/test_cache.rb +0 -69
  47. data/test/test_helper.rb +0 -123
  48. data/test/test_legacy.rb +0 -23
  49. data/test/test_main.rb +0 -34
  50. data/test/test_nokogiri_ext.rb +0 -18
  51. data/test/test_output.rb +0 -56
  52. data/test/test_recipes.rb +0 -60
  53. data/test/test_requests.rb +0 -135
  54. data/test/test_utf8.rb +0 -39
data/lib/sinew/csv.rb ADDED
@@ -0,0 +1,89 @@
1
+ require 'csv'
2
+ require 'sterile'
3
+
4
+ module Sinew
5
+ class CSV
6
+ attr_reader :columns, :count, :csv, :path, :tally
7
+
8
+ def initialize(path)
9
+ @count = 0
10
+ @csv = nil
11
+ @path = path
12
+ end
13
+
14
+ # start writing the csv
15
+ def start(columns)
16
+ raise 'started twice' if started?
17
+
18
+ @columns = columns
19
+ @tally = columns.map { [_1, 0] }.to_h
20
+ @csv = ::CSV.open(path, 'wb').tap do
21
+ _1 << columns
22
+ end
23
+ end
24
+
25
+ # has this csv been started?
26
+ def started?
27
+ @csv != nil
28
+ end
29
+
30
+ # append a row
31
+ def emit(row)
32
+ # convert row to cols, and construct print (our return value)
33
+ print = {}
34
+ row = columns.map do
35
+ value = normalize(row[_1])
36
+ if value
37
+ print[_1] = value
38
+ tally[_1] += 1
39
+ end
40
+ value
41
+ end
42
+ @count += 1
43
+
44
+ # emit
45
+ csv << row
46
+ csv.flush
47
+
48
+ # return in case someone wants to pretty print this
49
+ print
50
+ end
51
+
52
+ ASCII_ONLY = begin
53
+ chars = (33..126).map(&:chr) - ['&']
54
+ /\A[#{Regexp.escape(chars.join)}\s]+\Z/
55
+ end.freeze
56
+
57
+ def normalize(s)
58
+ # nokogiri/array/misc => string
59
+ s = if s.respond_to?(:inner_html)
60
+ s.inner_html
61
+ elsif s.is_a?(Array)
62
+ s.join('|')
63
+ else
64
+ s.to_s
65
+ end
66
+ return if s.empty?
67
+
68
+ # simple attempt to strip tags. Note that we replace tags with spaces
69
+ s = s.gsub(/<[^>]+>/, ' ')
70
+
71
+ if s !~ ASCII_ONLY
72
+ # Converts MS Word 'smart punctuation' to ASCII
73
+ s = Sterile.plain_format(s)
74
+
75
+ # &aacute; &amp; etc.
76
+ s = Sterile.decode_entities(s)
77
+
78
+ # "šţɽĩɳģ" => "string"
79
+ s = Sterile.transliterate(s)
80
+ end
81
+
82
+ # squish
83
+ s = s.strip.gsub(/\s+/, ' ')
84
+ return if s.empty?
85
+
86
+ s
87
+ end
88
+ end
89
+ end
data/lib/sinew/main.rb CHANGED
@@ -1,125 +1,72 @@
1
- require 'scripto'
2
-
3
- #
4
- # Main sinew entry point.
5
- #
6
-
7
1
  module Sinew
8
- class Main < Scripto::Main
9
- attr_reader :runtime_options, :request_tm, :request_count
2
+ # Helper class used by sinew bin. This exists as an independent class solely
3
+ # for testing, otherwise it would be built into the bin script.
4
+ class Main
5
+ attr_reader :sinew
10
6
 
11
7
  def initialize(options)
12
- super(options)
13
-
14
- # init
15
- @runtime_options = RuntimeOptions.new
16
- @request_tm = Time.at(0)
17
- @request_count = 0
18
-
19
- if options[:proxy]
20
- addr, port = options[:proxy].split(':')
21
- runtime_options.httparty_options[:http_proxyaddr] = addr
22
- runtime_options.httparty_options[:http_proxyport] = port || 80
8
+ options[:output] ||= begin
9
+ src = options[:recipe]
10
+ dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
11
+ dst = dst.sub(%r{^./}, '') # nice to clean this up
12
+ dst
23
13
  end
24
- end
25
14
 
26
- def run
27
- dsl.run
28
- footer if !quiet?
15
+ @sinew = Sinew::Base.new(options)
29
16
  end
30
17
 
31
- def quiet?
32
- options[:quiet]
18
+ def run
19
+ tm = Time.now
20
+ header if !sinew.options[:silent]
21
+ recipe = sinew.options[:recipe]
22
+ dsl = DSL.new(sinew)
23
+ begin
24
+ dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
25
+ rescue LimitError
26
+ # ignore - this is flow control for --limit
27
+ end
28
+ footer(Time.now - tm) if !sinew.options[:silent]
33
29
  end
34
30
 
35
- def dsl
36
- @dsl ||= DSL.new(self)
37
- end
31
+ protected
38
32
 
39
33
  #
40
- # http requests and caching
34
+ # header/footer
41
35
  #
42
36
 
43
- def cache
44
- @cache ||= Cache.new(self)
37
+ def header
38
+ sinew.banner("Writing to #{sinew.csv.path}...")
45
39
  end
46
40
 
47
- def http(method, url, options = {})
48
- request = Request.new(self, method, url, options)
49
-
50
- # try to get from cache
51
- response = cache.get(request)
41
+ def footer(elapsed)
42
+ csv = sinew.csv
43
+ count = csv.count
52
44
 
53
- # perform if necessary
54
- if !response
55
- response = perform(request)
56
- cache.set(response)
45
+ if count == 0
46
+ sinew.banner(format('Done in %ds. Nothing written.', elapsed))
47
+ return
57
48
  end
58
49
 
59
- # always log error messages
60
- if response.error?
61
- puts "xxx http request failed with #{response.code}"
62
- end
63
-
64
- response
65
- end
66
-
67
- def perform(request)
68
- before_perform_request(request)
69
-
70
- response = nil
50
+ # summary
51
+ msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
52
+ sinew.banner(msg)
71
53
 
72
- tries = runtime_options.retries + 1
73
- while tries > 0
74
- tries -= 1
75
- begin
76
- @request_count += 1
77
- response = request.perform
78
- rescue Timeout::Error
79
- response = Response.from_timeout(request)
80
- end
81
- break if !response.error_500?
54
+ # tally
55
+ tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
56
+ len = tally.keys.map { _1.to_s.length }.max
57
+ fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
58
+ tally.each do
59
+ printf(fmt, _1, _2, count, _2 * 100.0 / count)
82
60
  end
83
-
84
- response
85
61
  end
86
- protected :perform
87
62
 
88
- #
89
- # output
90
- #
91
-
92
- def output
93
- @output ||= Output.new(self)
94
- end
63
+ # simple DSL for .sinew files
64
+ class DSL
65
+ attr_reader :sinew
95
66
 
96
- #
97
- # helpers
98
- #
99
-
100
- def before_perform_request(request)
101
- # log
102
- if !quiet?
103
- msg = if request.method != 'get'
104
- "req #{request.uri} (#{request.method})"
105
- else
106
- "req #{request.uri}"
107
- end
108
- $stderr.puts msg
67
+ def initialize(sinew)
68
+ @sinew = sinew
109
69
  end
110
-
111
- # rate limit
112
- sleep = (request_tm + runtime_options.rate_limit) - Time.now
113
- sleep(sleep) if sleep > 0
114
- @request_tm = Time.now
115
- end
116
- protected :before_perform_request
117
-
118
- def footer
119
- output.report
120
- finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
121
- banner("#{finished} in #{dsl.elapsed.to_i}s.")
122
70
  end
123
- protected :footer
124
71
  end
125
72
  end
data/lib/sinew/middleware/log_formatter.rb ADDED
@@ -0,0 +1,23 @@
1
+ module Sinew
2
+ module Middleware
3
+ # Minimalist Formatter that logs proxy if present.
4
+ class LogFormatter < Faraday::Logging::Formatter
5
+ def request(env)
6
+ info('req') do
7
+ # Only log the initial request, not the redirects
8
+ return if env[:redirect]
9
+
10
+ msg = apply_filters(env.url.to_s)
11
+ msg = "#{msg} (#{env.method})" if env.method != :get
12
+ msg = "#{msg} => #{env.request.proxy.uri}" if env.request.proxy
13
+
14
+ msg
15
+ end
16
+ end
17
+
18
+ def response(env)
19
+ # silent
20
+ end
21
+ end
22
+ end
23
+ end
data/lib/sinew/nokogiri_ext.rb CHANGED
@@ -1,28 +1,19 @@
1
1
  require 'nokogiri'
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
- class Nokogiri::XML::NodeSet
5
- alias old_inner_html inner_html
6
- alias old_inner_text inner_text
4
+ module Nokogiri
5
+ module XML
6
+ class NodeSet
7
+ alias old_inner_html inner_html
8
+ alias old_inner_text inner_text
7
9
 
8
- def inner_text
9
- map(&:inner_text).join(' ')
10
- end
11
-
12
- def inner_html(*args)
13
- map { |i| i.inner_html(*args) }.join(' ')
14
- end
15
- end
10
+ def inner_text
11
+ map(&:inner_text).join(' ')
12
+ end
16
13
 
17
- # text_just_me
18
- class Nokogiri::XML::Node
19
- def text_just_me
20
- t = children.find { |i| i.node_type == Nokogiri::XML::Node::TEXT_NODE }
21
- t&.text
22
- end
23
- end
24
- class Nokogiri::XML::NodeSet
25
- def text_just_me
26
- map(&:text_just_me).join(' ')
14
+ def inner_html(*args)
15
+ map { _1.inner_html(*args) }.join(' ')
16
+ end
17
+ end
27
18
  end
28
19
  end
data/lib/sinew/response.rb CHANGED
@@ -1,121 +1,61 @@
1
- require 'stringio'
2
- require 'zlib'
3
-
4
- #
5
- # An HTTP response. Mostly a wrapper around HTTParty.
6
- #
1
+ require 'delegate'
2
+ require 'hashie/mash'
3
+ require 'json'
4
+ require 'nokogiri'
7
5
 
8
6
  module Sinew
9
- class Response
10
- attr_accessor :request, :uri, :body, :code, :headers
11
-
12
- #
13
- # factory methods
14
- #
15
-
16
- def self.from_network(request, party_response)
17
- Response.new.tap do |response|
18
- response.request = request
19
- response.uri = party_response.request.last_uri
20
- response.code = party_response.code
21
- response.headers = party_response.headers.to_h
22
- response.body = process_body(party_response)
23
- end
24
- end
25
-
26
- def self.from_cache(request, body, head)
27
- Response.new.tap do |response|
28
- response.request = request
29
- response.body = body
7
+ # A wrapper around Faraday::Response, with some parsing helpers.
8
+ class Response < SimpleDelegator
9
+ # Like body, but tries to cleanup whitespace around HTML for easier parsing.
10
+ def html
11
+ @html ||= body.dup.tap do
12
+ # fix invalid utf8
13
+ if _1.encoding == Encoding::UTF_8
14
+ _1.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
15
+ end
30
16
 
31
- # defaults
32
- response.uri = request.uri
33
- response.code = 200
34
- response.headers = {}
17
+ # squish
18
+ _1.strip!
19
+ _1.gsub!(/\s+/, ' ')
35
20
 
36
- # overwrite with cached response headers
37
- if head
38
- if head !~ /^{/
39
- return from_legacy_head(response, head)
40
- end
41
- head = JSON.parse(head, symbolize_names: true)
42
- response.uri = URI.parse(head[:uri])
43
- response.code = head[:code]
44
- response.headers = head[:headers]
45
- end
21
+ # kill whitespace around tags
22
+ _1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
46
23
  end
47
24
  end
48
25
 
49
- def self.from_timeout(request)
50
- Response.new.tap do |response|
51
- response.request = request
52
- response.uri = request.uri
53
- response.body = 'timeout'
54
- response.code = 999
55
- response.headers = {}
56
- end
26
+ # Return body as JSON
27
+ def json
28
+ @json ||= JSON.parse(body, symbolize_names: true)
57
29
  end
58
30
 
59
- def self.from_legacy_head(response, head)
60
- response.tap do |r|
61
- case head
62
- when /\ACURLER_ERROR/
63
- # error
64
- r.code = 999
65
- when /\AHTTP/
66
- # redirect
67
- location = head.scan(/Location: ([^\r\n]+)/).flatten.last
68
- r.uri += location
69
- else
70
- $stderr.puts "unknown cached /head for #{r.uri}"
71
- end
72
- end
31
+ # Return JSON body as Hashie::Mash
32
+ def mash
33
+ @mash ||= Hashie::Mash.new(json)
73
34
  end
74
35
 
75
- # helper for decoding bodies before parsing
76
- def self.process_body(response)
77
- body = response.body
78
-
79
- # inflate if necessary
80
- bits = body[0, 10].force_encoding('BINARY')
81
- if bits =~ /\A\x1f\x8b/n
82
- body = Zlib::GzipReader.new(StringIO.new(body)).read
83
- end
84
-
85
- # force to utf-8 if we think this could be text
86
- if body.encoding != Encoding::UTF_8
87
- if content_type = response.headers['content-type']
88
- if content_type =~ /\b(html|javascript|json|text|xml)\b/
89
- body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
90
- end
91
- end
92
- end
93
-
94
- body
36
+ # Return body HTML as Nokogiri document
37
+ def noko
38
+ @noko ||= Nokogiri::HTML(html)
95
39
  end
96
40
 
97
- #
98
- # accessors
99
- #
100
-
101
- def error?
102
- code >= 400
41
+ # Return body XML as Nokogiri document
42
+ def xml
43
+ @xml ||= Nokogiri::XML(html)
103
44
  end
104
45
 
105
- def error_500?
106
- code / 100 >= 5
46
+ # Return the final URI for the request, after redirects
47
+ def url
48
+ env.url
107
49
  end
108
50
 
109
- def redirected?
110
- request.uri != uri
51
+ # Return the cache diskpath for this response
52
+ def diskpath
53
+ env[:httpdisk_diskpath]
111
54
  end
112
55
 
113
- def head_as_json
114
- {
115
- uri: uri,
116
- code: code,
117
- headers: headers,
118
- }
56
+ # Remove cached response from disk, if any
57
+ def uncache
58
+ File.unlink(diskpath) if File.exist?(diskpath)
119
59
  end
120
60
  end
121
61
  end