sinew 2.0.3 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.gitignore +3 -5
- data/.rubocop.yml +31 -46
- data/Gemfile +9 -0
- data/Gemfile.lock +124 -0
- data/README.md +146 -81
- data/Rakefile +36 -20
- data/bin/sinew +13 -39
- data/lib/sinew.rb +23 -10
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +251 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +45 -98
- data/lib/sinew/middleware/log_formatter.rb +23 -0
- data/lib/sinew/nokogiri_ext.rb +12 -21
- data/lib/sinew/response.rb +39 -99
- data/lib/sinew/version.rb +1 -1
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +26 -25
- metadata +46 -108
- data/.travis.yml +0 -4
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -15
- data/lib/sinew/cache.rb +0 -79
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -114
- data/lib/sinew/output.rb +0 -149
- data/lib/sinew/request.rb +0 -151
- data/lib/sinew/runtime_options.rb +0 -28
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/recipes/array_header.sinew +0 -6
- data/test/recipes/basic.sinew +0 -8
- data/test/recipes/dups.sinew +0 -7
- data/test/recipes/implicit_header.sinew +0 -5
- data/test/recipes/limit.sinew +0 -11
- data/test/recipes/noko.sinew +0 -9
- data/test/recipes/uri.sinew +0 -11
- data/test/recipes/xml.sinew +0 -8
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -123
- data/test/test_legacy.rb +0 -23
- data/test/test_main.rb +0 -34
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -56
- data/test/test_recipes.rb +0 -60
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
data/lib/sinew/csv.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'sterile'
|
3
|
+
|
4
|
+
module Sinew
  # Incremental CSV writer. The file at `path` is opened lazily by #start,
  # which also writes the header row; every #emit then appends (and flushes)
  # one row. Along the way it tracks how many rows were written (`count`) and,
  # per column, how many rows had a non-empty value (`tally`).
  class CSV
    attr_reader :columns, :count, :csv, :path, :tally

    # @param path [String] destination file; nothing is opened until #start
    def initialize(path)
      @count = 0
      @csv = nil
      @path = path
    end

    # Start writing the csv: open the file and write `columns` as the header.
    #
    # @param columns [Array] column keys, in output order
    # @raise [RuntimeError] if the csv was already started
    def start(columns)
      raise 'started twice' if started?

      @columns = columns
      @tally = columns.to_h { [_1, 0] }
      @csv = ::CSV.open(path, 'wb').tap { _1 << columns }
    end

    # Has this csv been started?
    def started?
      !@csv.nil?
    end

    # Append a row.
    #
    # @param row [Hash] values keyed by column; keys not in `columns` are ignored
    # @return [Hash] the normalized, non-empty values that were written, in
    #   case someone wants to pretty print them
    # @raise [RuntimeError] if #start has not been called yet
    def emit(row)
      # Without this guard `columns` is nil and the failure is an opaque
      # NoMethodError instead of a message that names the actual mistake.
      raise 'not started' unless started?

      print = {}
      values = columns.map do |column|
        value = normalize(row[column])
        if value
          print[column] = value
          tally[column] += 1
        end
        value
      end
      @count += 1

      # write immediately so the file on disk is always current
      csv << values
      csv.flush

      print
    end

    # Matches strings made only of printable ASCII (minus '&') and whitespace.
    # Such strings skip the Sterile pass in #normalize; '&' is excluded so
    # that anything that might contain an entity still goes through it.
    ASCII_ONLY = begin
      chars = (33..126).map(&:chr) - ['&']
      /\A[#{Regexp.escape(chars.join)}\s]+\Z/
    end.freeze

    # Turn an arbitrary cell value into a clean single-line string.
    #
    # @param s [Object] anything responding to inner_html (e.g. nokogiri
    #   nodes), an Array (joined with '|'), or any other object (to_s)
    # @return [String, nil] the cleaned string, or nil when nothing is left
    def normalize(s)
      # nokogiri/array/misc => string
      s = if s.respond_to?(:inner_html)
        s.inner_html
      elsif s.is_a?(Array)
        s.join('|')
      else
        s.to_s
      end
      return if s.empty?

      # simple attempt to strip tags. Note that we replace tags with spaces
      s = s.gsub(/<[^>]+>/, ' ')

      unless s.match?(ASCII_ONLY)
        # Converts MS Word 'smart punctuation' to ASCII
        s = Sterile.plain_format(s)

        # decode HTML entities
        s = Sterile.decode_entities(s)

        # "šţɽĩɳģ" => "string"
        s = Sterile.transliterate(s)
      end

      # squish
      s = s.strip.gsub(/\s+/, ' ')
      return if s.empty?

      s
    end
  end
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,125 +1,72 @@
|
|
1
|
-
require 'scripto'
|
2
|
-
|
3
|
-
#
|
4
|
-
# Main sinew entry point.
|
5
|
-
#
|
6
|
-
|
7
1
|
module Sinew
  # Helper class used by sinew bin. This exists as an independent class solely
  # for testing, otherwise it would be built into the bin script.
  class Main
    attr_reader :sinew

    # Build the Sinew::Base instance that the recipe will drive.
    #
    # When options[:output] is not set it defaults to the recipe path with its
    # extension replaced by ".csv". Note that `options` is modified in place.
    #
    # @param options [Hash] must contain :recipe; may contain :output
    def initialize(options)
      options[:output] ||= begin
        src = options[:recipe]
        dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
        # Nice to clean this up, but only a literal leading "./" may go. A
        # regex with an unescaped dot (^./) also eats one-letter directories,
        # turning "a/recipe.csv" into "recipe.csv".
        dst.delete_prefix('./')
      end

      @sinew = Sinew::Base.new(options)
    end

    # Evaluate the recipe file in the context of a DSL instance, printing a
    # header before and a footer after unless options[:silent] is set.
    def run
      tm = Time.now
      header if !sinew.options[:silent]
      recipe = sinew.options[:recipe]
      dsl = DSL.new(sinew)
      begin
        # passing the recipe path gives backtraces the real file name
        dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
      rescue LimitError
        # ignore - this is flow control for --limit
      end
      footer(Time.now - tm) if !sinew.options[:silent]
    end

    protected

    #
    # header/footer
    #

    # Announce where the csv is being written.
    def header
      sinew.banner("Writing to #{sinew.csv.path}...")
    end

    # Report elapsed time and, when rows were written, a per-column tally of
    # how many rows had a value for that column.
    #
    # @param elapsed [Numeric] seconds spent running the recipe
    def footer(elapsed)
      csv = sinew.csv
      count = csv.count

      if count == 0
        sinew.banner(format('Done in %ds. Nothing written.', elapsed))
        return
      end

      # summary
      msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
      sinew.banner(msg)

      # tally - most populated columns first, ties broken by column name
      tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
      len = tally.keys.map { _1.to_s.length }.max
      fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
      tally.each do
        printf(fmt, _1, _2, count, _2 * 100.0 / count)
      end
    end

    # simple DSL for .sinew files
    class DSL
      attr_reader :sinew

      # @param sinew [Sinew::Base] exposed to the recipe as `sinew`
      def initialize(sinew)
        @sinew = sinew
      end
    end
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Sinew
  module Middleware
    # Minimalist Formatter that logs proxy if present. Each request produces
    # a single 'req' info line; responses are not logged at all.
    class LogFormatter < Faraday::Logging::Formatter
      # Log the (filtered) request url, followed by the HTTP method when it is
      # not a GET and by the proxy uri when the request goes through a proxy.
      def request(env)
        info('req') do
          # Only log the initial request, not the redirects. This has to be
          # `return` (leaving #request altogether) so nothing is logged.
          return if env[:redirect]

          line = apply_filters(env.url.to_s)
          line = "#{line} (#{env.method})" unless env.method == :get

          proxy = env.request.proxy
          proxy ? "#{line} => #{proxy.uri}" : line
        end
      end

      # Responses are intentionally not logged.
      def response(env)
        # silent
      end
    end
  end
end
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,28 +1,19 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Nokogiri
  module XML
    # Reopened so that the combined text/markup of a NodeSet puts a single
    # space between the contributions of its nodes.
    class NodeSet
      # keep the stock implementations reachable under old_* names
      alias_method :old_inner_html, :inner_html
      alias_method :old_inner_text, :inner_text

      # Text of every node in the set, space separated.
      def inner_text
        map { |node| node.inner_text }.join(' ')
      end

      # Inner html of every node in the set, space separated. Any arguments
      # are handed to each node's own inner_html.
      def inner_html(*args)
        map { |node| node.inner_html(*args) }.join(' ')
      end
    end
  end
end
|
data/lib/sinew/response.rb
CHANGED
@@ -1,121 +1,61 @@
|
|
1
|
-
require '
|
2
|
-
require '
|
3
|
-
|
4
|
-
|
5
|
-
# An HTTP response. Mostly a wrapper around HTTParty.
|
6
|
-
#
|
1
|
+
require 'delegate'
|
2
|
+
require 'hashie/mash'
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
7
5
|
|
8
6
|
module Sinew
  # A wrapper around Faraday::Response, with some parsing helpers. Everything
  # not defined here is delegated to the wrapped response. Parsed forms are
  # memoized, so each is computed at most once per response.
  class Response < SimpleDelegator
    # Like body, but tries to cleanup whitespace around HTML for easier parsing.
    #
    # @return [String] a cleaned copy; body itself is left untouched
    def html
      @html ||= body.dup.tap do |text|
        # fix invalid utf8
        if text.encoding == Encoding::UTF_8
          text.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
        end

        # squish
        text.strip!
        text.gsub!(/\s+/, ' ')

        # kill whitespace around tags
        text.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
      end
    end

    # Return body as JSON (keys are symbolized)
    def json
      @json ||= JSON.parse(body, symbolize_names: true)
    end

    # Return JSON body as Hashie::Mash
    def mash
      @mash ||= Hashie::Mash.new(json)
    end

    # Return body HTML as Nokogiri document (parsed from the cleaned #html)
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    # Return body XML as Nokogiri document (parsed from the cleaned #html)
    def xml
      @xml ||= Nokogiri::XML(html)
    end

    # Return the final URI for the request, after redirects
    def url
      env.url
    end

    # Return the cache diskpath for this response
    #
    # @return [String, nil] nil when env carries no :httpdisk_diskpath
    def diskpath
      env[:httpdisk_diskpath]
    end

    # Remove cached response from disk, if any
    def uncache
      # File.exist?(nil) raises TypeError, so a response without a diskpath
      # must be skipped explicitly for "if any" to hold.
      path = diskpath
      File.unlink(path) if path && File.exist?(path)
    end
  end
end
|