sinew 2.0.3 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.gitignore +3 -5
- data/.rubocop.yml +31 -46
- data/Gemfile +9 -0
- data/Gemfile.lock +124 -0
- data/README.md +146 -81
- data/Rakefile +36 -20
- data/bin/sinew +13 -39
- data/lib/sinew.rb +23 -10
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +251 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +45 -98
- data/lib/sinew/middleware/log_formatter.rb +23 -0
- data/lib/sinew/nokogiri_ext.rb +12 -21
- data/lib/sinew/response.rb +39 -99
- data/lib/sinew/version.rb +1 -1
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +26 -25
- metadata +46 -108
- data/.travis.yml +0 -4
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -15
- data/lib/sinew/cache.rb +0 -79
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -114
- data/lib/sinew/output.rb +0 -149
- data/lib/sinew/request.rb +0 -151
- data/lib/sinew/runtime_options.rb +0 -28
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/recipes/array_header.sinew +0 -6
- data/test/recipes/basic.sinew +0 -8
- data/test/recipes/dups.sinew +0 -7
- data/test/recipes/implicit_header.sinew +0 -5
- data/test/recipes/limit.sinew +0 -11
- data/test/recipes/noko.sinew +0 -9
- data/test/recipes/uri.sinew +0 -11
- data/test/recipes/xml.sinew +0 -8
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -123
- data/test/test_legacy.rb +0 -23
- data/test/test_main.rb +0 -34
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -56
- data/test/test_recipes.rb +0 -60
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
data/lib/sinew/csv.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require 'csv'
|
2
|
+
require 'sterile'
|
3
|
+
|
4
|
+
module Sinew
|
5
|
+
class CSV
|
6
|
+
attr_reader :columns, :count, :csv, :path, :tally
|
7
|
+
|
8
|
+
def initialize(path)
|
9
|
+
@count = 0
|
10
|
+
@csv = nil
|
11
|
+
@path = path
|
12
|
+
end
|
13
|
+
|
14
|
+
# start writing the csv
|
15
|
+
def start(columns)
|
16
|
+
raise 'started twice' if started?
|
17
|
+
|
18
|
+
@columns = columns
|
19
|
+
@tally = columns.map { [_1, 0] }.to_h
|
20
|
+
@csv = ::CSV.open(path, 'wb').tap do
|
21
|
+
_1 << columns
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# has this csv been started?
|
26
|
+
def started?
|
27
|
+
@csv != nil
|
28
|
+
end
|
29
|
+
|
30
|
+
# append a row
|
31
|
+
def emit(row)
|
32
|
+
# convert row to cols, and construct print (our return value)
|
33
|
+
print = {}
|
34
|
+
row = columns.map do
|
35
|
+
value = normalize(row[_1])
|
36
|
+
if value
|
37
|
+
print[_1] = value
|
38
|
+
tally[_1] += 1
|
39
|
+
end
|
40
|
+
value
|
41
|
+
end
|
42
|
+
@count += 1
|
43
|
+
|
44
|
+
# emit
|
45
|
+
csv << row
|
46
|
+
csv.flush
|
47
|
+
|
48
|
+
# return in case someone wants to pretty print this
|
49
|
+
print
|
50
|
+
end
|
51
|
+
|
52
|
+
ASCII_ONLY = begin
|
53
|
+
chars = (33..126).map(&:chr) - ['&']
|
54
|
+
/\A[#{Regexp.escape(chars.join)}\s]+\Z/
|
55
|
+
end.freeze
|
56
|
+
|
57
|
+
def normalize(s)
|
58
|
+
# nokogiri/array/misc => string
|
59
|
+
s = if s.respond_to?(:inner_html)
|
60
|
+
s.inner_html
|
61
|
+
elsif s.is_a?(Array)
|
62
|
+
s.join('|')
|
63
|
+
else
|
64
|
+
s.to_s
|
65
|
+
end
|
66
|
+
return if s.empty?
|
67
|
+
|
68
|
+
# simple attempt to strip tags. Note that we replace tags with spaces
|
69
|
+
s = s.gsub(/<[^>]+>/, ' ')
|
70
|
+
|
71
|
+
if s !~ ASCII_ONLY
|
72
|
+
# Converts MS Word 'smart punctuation' to ASCII
|
73
|
+
s = Sterile.plain_format(s)
|
74
|
+
|
75
|
+
# &aacute; &amp; etc.
|
76
|
+
s = Sterile.decode_entities(s)
|
77
|
+
|
78
|
+
# "šţɽĩɳģ" => "string"
|
79
|
+
s = Sterile.transliterate(s)
|
80
|
+
end
|
81
|
+
|
82
|
+
# squish
|
83
|
+
s = s.strip.gsub(/\s+/, ' ')
|
84
|
+
return if s.empty?
|
85
|
+
|
86
|
+
s
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,125 +1,72 @@
|
|
1
|
-
require 'scripto'
|
2
|
-
|
3
|
-
#
|
4
|
-
# Main sinew entry point.
|
5
|
-
#
|
6
|
-
|
7
1
|
module Sinew
|
8
|
-
class
|
9
|
-
|
2
|
+
# Helper class used by sinew bin. This exists as an independent class solely
|
3
|
+
# for testing, otherwise it would be built into the bin script.
|
4
|
+
class Main
|
5
|
+
attr_reader :sinew
|
10
6
|
|
11
7
|
def initialize(options)
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
@request_count = 0
|
18
|
-
|
19
|
-
if options[:proxy]
|
20
|
-
addr, port = options[:proxy].split(':')
|
21
|
-
runtime_options.httparty_options[:http_proxyaddr] = addr
|
22
|
-
runtime_options.httparty_options[:http_proxyport] = port || 80
|
8
|
+
options[:output] ||= begin
|
9
|
+
src = options[:recipe]
|
10
|
+
dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
|
11
|
+
dst = dst.sub(%r{^./}, '') # nice to clean this up
|
12
|
+
dst
|
23
13
|
end
|
24
|
-
end
|
25
14
|
|
26
|
-
|
27
|
-
dsl.run
|
28
|
-
footer if !quiet?
|
15
|
+
@sinew = Sinew::Base.new(options)
|
29
16
|
end
|
30
17
|
|
31
|
-
def
|
32
|
-
|
18
|
+
def run
|
19
|
+
tm = Time.now
|
20
|
+
header if !sinew.options[:silent]
|
21
|
+
recipe = sinew.options[:recipe]
|
22
|
+
dsl = DSL.new(sinew)
|
23
|
+
begin
|
24
|
+
dsl.instance_eval(File.read(recipe, mode: 'rb'), recipe)
|
25
|
+
rescue LimitError
|
26
|
+
# ignore - this is flow control for --limit
|
27
|
+
end
|
28
|
+
footer(Time.now - tm) if !sinew.options[:silent]
|
33
29
|
end
|
34
30
|
|
35
|
-
|
36
|
-
@dsl ||= DSL.new(self)
|
37
|
-
end
|
31
|
+
protected
|
38
32
|
|
39
33
|
#
|
40
|
-
#
|
34
|
+
# header/footer
|
41
35
|
#
|
42
36
|
|
43
|
-
def
|
44
|
-
|
37
|
+
def header
|
38
|
+
sinew.banner("Writing to #{sinew.csv.path}...")
|
45
39
|
end
|
46
40
|
|
47
|
-
def
|
48
|
-
|
49
|
-
|
50
|
-
# try to get from cache
|
51
|
-
response = cache.get(request)
|
41
|
+
def footer(elapsed)
|
42
|
+
csv = sinew.csv
|
43
|
+
count = csv.count
|
52
44
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
cache.set(response)
|
45
|
+
if count == 0
|
46
|
+
sinew.banner(format('Done in %ds. Nothing written.', elapsed))
|
47
|
+
return
|
57
48
|
end
|
58
49
|
|
59
|
-
#
|
60
|
-
|
61
|
-
|
62
|
-
end
|
63
|
-
|
64
|
-
response
|
65
|
-
end
|
66
|
-
|
67
|
-
def perform(request)
|
68
|
-
before_perform_request(request)
|
69
|
-
|
70
|
-
response = nil
|
50
|
+
# summary
|
51
|
+
msg = format('Done in %ds. Wrote %d rows to %s. Summary:', elapsed, count, csv.path)
|
52
|
+
sinew.banner(msg)
|
71
53
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
rescue Timeout::Error
|
79
|
-
response = Response.from_timeout(request)
|
80
|
-
end
|
81
|
-
break if !response.error_500?
|
54
|
+
# tally
|
55
|
+
tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
|
56
|
+
len = tally.keys.map { _1.to_s.length }.max
|
57
|
+
fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
|
58
|
+
tally.each do
|
59
|
+
printf(fmt, _1, _2, count, _2 * 100.0 / count)
|
82
60
|
end
|
83
|
-
|
84
|
-
response
|
85
61
|
end
|
86
|
-
protected :perform
|
87
62
|
|
88
|
-
#
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
def output
|
93
|
-
@output ||= Output.new(self)
|
94
|
-
end
|
63
|
+
# simple DSL for .sinew files
|
64
|
+
class DSL
|
65
|
+
attr_reader :sinew
|
95
66
|
|
96
|
-
|
97
|
-
|
98
|
-
#
|
99
|
-
|
100
|
-
def before_perform_request(request)
|
101
|
-
# log
|
102
|
-
if !quiet?
|
103
|
-
msg = if request.method != 'get'
|
104
|
-
"req #{request.uri} (#{request.method})"
|
105
|
-
else
|
106
|
-
"req #{request.uri}"
|
107
|
-
end
|
108
|
-
$stderr.puts msg
|
67
|
+
def initialize(sinew)
|
68
|
+
@sinew = sinew
|
109
69
|
end
|
110
|
-
|
111
|
-
# rate limit
|
112
|
-
sleep = (request_tm + runtime_options.rate_limit) - Time.now
|
113
|
-
sleep(sleep) if sleep > 0
|
114
|
-
@request_tm = Time.now
|
115
|
-
end
|
116
|
-
protected :before_perform_request
|
117
|
-
|
118
|
-
def footer
|
119
|
-
output.report
|
120
|
-
finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
|
121
|
-
banner("#{finished} in #{dsl.elapsed.to_i}s.")
|
122
70
|
end
|
123
|
-
protected :footer
|
124
71
|
end
|
125
72
|
end
|
data/lib/sinew/middleware/log_formatter.rb
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
module Sinew
|
2
|
+
module Middleware
|
3
|
+
# Minimalist Formatter that logs proxy if present.
|
4
|
+
class LogFormatter < Faraday::Logging::Formatter
|
5
|
+
def request(env)
|
6
|
+
info('req') do
|
7
|
+
# Only log the initial request, not the redirects
|
8
|
+
return if env[:redirect]
|
9
|
+
|
10
|
+
msg = apply_filters(env.url.to_s)
|
11
|
+
msg = "#{msg} (#{env.method})" if env.method != :get
|
12
|
+
msg = "#{msg} => #{env.request.proxy.uri}" if env.request.proxy
|
13
|
+
|
14
|
+
msg
|
15
|
+
end
|
16
|
+
end
|
17
|
+
|
18
|
+
def response(env)
|
19
|
+
# silent
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,28 +1,19 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Nokogiri
|
5
|
+
module XML
|
6
|
+
class NodeSet
|
7
|
+
alias old_inner_html inner_html
|
8
|
+
alias old_inner_text inner_text
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def inner_html(*args)
|
13
|
-
map { |i| i.inner_html(*args) }.join(' ')
|
14
|
-
end
|
15
|
-
end
|
10
|
+
def inner_text
|
11
|
+
map(&:inner_text).join(' ')
|
12
|
+
end
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
t&.text
|
22
|
-
end
|
23
|
-
end
|
24
|
-
class Nokogiri::XML::NodeSet
|
25
|
-
def text_just_me
|
26
|
-
map(&:text_just_me).join(' ')
|
14
|
+
def inner_html(*args)
|
15
|
+
map { _1.inner_html(*args) }.join(' ')
|
16
|
+
end
|
17
|
+
end
|
27
18
|
end
|
28
19
|
end
|
data/lib/sinew/response.rb
CHANGED
@@ -1,121 +1,61 @@
|
|
1
|
-
require '
|
2
|
-
require '
|
3
|
-
|
4
|
-
|
5
|
-
# An HTTP response. Mostly a wrapper around HTTParty.
|
6
|
-
#
|
1
|
+
require 'delegate'
|
2
|
+
require 'hashie/mash'
|
3
|
+
require 'json'
|
4
|
+
require 'nokogiri'
|
7
5
|
|
8
6
|
module Sinew
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
response.request = request
|
19
|
-
response.uri = party_response.request.last_uri
|
20
|
-
response.code = party_response.code
|
21
|
-
response.headers = party_response.headers.to_h
|
22
|
-
response.body = process_body(party_response)
|
23
|
-
end
|
24
|
-
end
|
25
|
-
|
26
|
-
def self.from_cache(request, body, head)
|
27
|
-
Response.new.tap do |response|
|
28
|
-
response.request = request
|
29
|
-
response.body = body
|
7
|
+
# A wrapper around Faraday::Response, with some parsing helpers.
|
8
|
+
class Response < SimpleDelegator
|
9
|
+
# Like body, but tries to clean up whitespace around HTML for easier parsing.
|
10
|
+
def html
|
11
|
+
@html ||= body.dup.tap do
|
12
|
+
# fix invalid utf8
|
13
|
+
if _1.encoding == Encoding::UTF_8
|
14
|
+
_1.encode!('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
15
|
+
end
|
30
16
|
|
31
|
-
#
|
32
|
-
|
33
|
-
|
34
|
-
response.headers = {}
|
17
|
+
# squish
|
18
|
+
_1.strip!
|
19
|
+
_1.gsub!(/\s+/, ' ')
|
35
20
|
|
36
|
-
#
|
37
|
-
|
38
|
-
if head !~ /^{/
|
39
|
-
return from_legacy_head(response, head)
|
40
|
-
end
|
41
|
-
head = JSON.parse(head, symbolize_names: true)
|
42
|
-
response.uri = URI.parse(head[:uri])
|
43
|
-
response.code = head[:code]
|
44
|
-
response.headers = head[:headers]
|
45
|
-
end
|
21
|
+
# kill whitespace around tags
|
22
|
+
_1.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
|
46
23
|
end
|
47
24
|
end
|
48
25
|
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
response.uri = request.uri
|
53
|
-
response.body = 'timeout'
|
54
|
-
response.code = 999
|
55
|
-
response.headers = {}
|
56
|
-
end
|
26
|
+
# Return body as JSON
|
27
|
+
def json
|
28
|
+
@json ||= JSON.parse(body, symbolize_names: true)
|
57
29
|
end
|
58
30
|
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
when /\ACURLER_ERROR/
|
63
|
-
# error
|
64
|
-
r.code = 999
|
65
|
-
when /\AHTTP/
|
66
|
-
# redirect
|
67
|
-
location = head.scan(/Location: ([^\r\n]+)/).flatten.last
|
68
|
-
r.uri += location
|
69
|
-
else
|
70
|
-
$stderr.puts "unknown cached /head for #{r.uri}"
|
71
|
-
end
|
72
|
-
end
|
31
|
+
# Return JSON body as Hashie::Mash
|
32
|
+
def mash
|
33
|
+
@mash ||= Hashie::Mash.new(json)
|
73
34
|
end
|
74
35
|
|
75
|
-
#
|
76
|
-
def
|
77
|
-
|
78
|
-
|
79
|
-
# inflate if necessary
|
80
|
-
bits = body[0, 10].force_encoding('BINARY')
|
81
|
-
if bits =~ /\A\x1f\x8b/n
|
82
|
-
body = Zlib::GzipReader.new(StringIO.new(body)).read
|
83
|
-
end
|
84
|
-
|
85
|
-
# force to utf-8 if we think this could be text
|
86
|
-
if body.encoding != Encoding::UTF_8
|
87
|
-
if content_type = response.headers['content-type']
|
88
|
-
if content_type =~ /\b(html|javascript|json|text|xml)\b/
|
89
|
-
body = body.encode('UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
90
|
-
end
|
91
|
-
end
|
92
|
-
end
|
93
|
-
|
94
|
-
body
|
36
|
+
# Return body HTML as Nokogiri document
|
37
|
+
def noko
|
38
|
+
@noko ||= Nokogiri::HTML(html)
|
95
39
|
end
|
96
40
|
|
97
|
-
#
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
def error?
|
102
|
-
code >= 400
|
41
|
+
# Return body XML as Nokogiri document
|
42
|
+
def xml
|
43
|
+
@xml ||= Nokogiri::XML(html)
|
103
44
|
end
|
104
45
|
|
105
|
-
|
106
|
-
|
46
|
+
# Return the final URI for the request, after redirects
|
47
|
+
def url
|
48
|
+
env.url
|
107
49
|
end
|
108
50
|
|
109
|
-
|
110
|
-
|
51
|
+
# Return the cache diskpath for this response
|
52
|
+
def diskpath
|
53
|
+
env[:httpdisk_diskpath]
|
111
54
|
end
|
112
55
|
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
code: code,
|
117
|
-
headers: headers,
|
118
|
-
}
|
56
|
+
# Remove cached response from disk, if any
|
57
|
+
def uncache
|
58
|
+
File.unlink(diskpath) if File.exist?(diskpath)
|
119
59
|
end
|
120
60
|
end
|
121
61
|
end
|