sinew 2.0.1 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.rubocop.yml +9 -6
- data/.vscode/settings.json +0 -10
- data/Gemfile +9 -0
- data/LICENSE +1 -1
- data/README.md +77 -58
- data/Rakefile +33 -18
- data/bin/sinew +8 -4
- data/lib/sinew.rb +0 -1
- data/lib/sinew/connection.rb +52 -0
- data/lib/sinew/connection/log_formatter.rb +22 -0
- data/lib/sinew/connection/rate_limit.rb +29 -0
- data/lib/sinew/core_ext.rb +1 -1
- data/lib/sinew/dsl.rb +27 -10
- data/lib/sinew/main.rb +7 -54
- data/lib/sinew/output.rb +26 -19
- data/lib/sinew/request.rb +28 -49
- data/lib/sinew/response.rb +25 -55
- data/lib/sinew/runtime_options.rb +4 -2
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +2 -2
- data/sinew.gemspec +16 -17
- metadata +41 -81
- data/.travis.yml +0 -4
- data/lib/sinew/cache.rb +0 -79
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -113
- data/test/test_legacy.rb +0 -21
- data/test/test_main.rb +0 -46
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -73
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
@@ -0,0 +1,22 @@
|
|
1
|
+
module Sinew
  module Connection
    # Faraday logging formatter that writes one terse 'req' line per request
    # and nothing at all for responses.
    class LogFormatter < Faraday::Logging::Formatter
      # Log the outgoing request via info, passing 'req' as its argument.
      #
      # The logged line is the url (run through apply_filters), followed by
      # " (<method>)" unless the method is :get, followed by
      # " => <proxy uri>" when the request has a proxy.
      def request(env)
        info('req') do
          # Only log the initial request, not the redirects. This `return`
          # leaves #request itself (not just the block), so no line is
          # produced when env[:redirect] is set.
          return if env[:redirect]

          url = apply_filters(env.url.to_s)
          verb = env.method == :get ? '' : " (#{env.method})"
          proxy = env.request.proxy
          via = proxy ? " => #{proxy.uri}" : ''

          "#{url}#{verb}#{via}"
        end
      end

      # Responses are deliberately not logged.
      def response(env)
        # silent
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Sinew
  module Connection
    # Faraday middleware that enforces a minimum gap between requests.
    #
    # The gap is measured from the start of the previous request that was
    # recorded in on_complete; requests flagged with env[:httpdisk] are not
    # recorded and therefore never delay the next request.
    class RateLimit < Faraday::Middleware
      attr_reader :rate_limit

      # app     - the next middleware/adapter, handed to Faraday::Middleware.
      # options - Hash; :rate_limit is the gap in seconds (defaults to 1).
      def initialize(app, options = {})
        super(app)

        @rate_limit = options.fetch(:rate_limit, 1)
        @last_request_tm = nil
        @current_request_tm = nil
      end

      # Pause until rate_limit seconds have passed since the last recorded
      # request (if there was one), then note when this request started.
      def on_request(_env)
        if @last_request_tm
          delay = (@last_request_tm + rate_limit) - Time.now
          sleep(delay) if delay > 0
        end

        @current_request_tm = Time.now
      end

      # Record this request's start time as the new baseline, then clear the
      # in-flight marker.
      def on_complete(env)
        # Only rate limit on uncached requests
        # NOTE(review): env[:httpdisk] presumably marks a cache hit - confirm
        cached = env[:httpdisk]
        @last_request_tm = @current_request_tm unless cached
        @current_request_tm = nil
      end
    end
  end
end
|
data/lib/sinew/core_ext.rb
CHANGED
data/lib/sinew/dsl.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
require '
|
1
|
+
require 'amazing_print'
|
2
2
|
require 'cgi'
|
3
|
+
require 'json'
|
3
4
|
|
4
5
|
#
|
5
6
|
# The DSL available to .sinew files.
|
@@ -7,7 +8,10 @@ require 'cgi'
|
|
7
8
|
|
8
9
|
module Sinew
|
9
10
|
class DSL
|
10
|
-
|
11
|
+
# this is used to break out of --limit
|
12
|
+
class LimitError < StandardError; end
|
13
|
+
|
14
|
+
attr_reader :sinew, :uri, :raw, :code, :elapsed
|
11
15
|
|
12
16
|
def initialize(sinew)
|
13
17
|
@sinew = sinew
|
@@ -15,8 +19,12 @@ module Sinew
|
|
15
19
|
|
16
20
|
def run
|
17
21
|
tm = Time.now
|
18
|
-
|
19
|
-
|
22
|
+
begin
|
23
|
+
recipe = sinew.options[:recipe]
|
24
|
+
instance_eval(File.read(recipe, mode: 'rb'), recipe)
|
25
|
+
rescue LimitError
|
26
|
+
# ignore - this is flow control for --limit
|
27
|
+
end
|
20
28
|
@elapsed = Time.now - tm
|
21
29
|
end
|
22
30
|
|
@@ -45,15 +53,17 @@ module Sinew
|
|
45
53
|
end
|
46
54
|
|
47
55
|
def http(method, url, options = {})
|
48
|
-
#
|
49
|
-
@html
|
56
|
+
# these need to be cleared before each request
|
57
|
+
%i[@html @noko @xml @json].each do |i|
|
58
|
+
instance_variable_set(i, nil)
|
59
|
+
end
|
50
60
|
|
51
|
-
# fetch
|
61
|
+
# fetch and make response available to callers
|
52
62
|
response = sinew.http(method, url, options)
|
63
|
+
@uri, @raw, @code = response.uri, response.body, response.code
|
53
64
|
|
54
|
-
#
|
55
|
-
|
56
|
-
@raw = response.body
|
65
|
+
# don't confuse the user
|
66
|
+
nil
|
57
67
|
end
|
58
68
|
|
59
69
|
#
|
@@ -75,6 +85,10 @@ module Sinew
|
|
75
85
|
@noko ||= Nokogiri::HTML(html)
|
76
86
|
end
|
77
87
|
|
88
|
+
def xml
|
89
|
+
@xml ||= Nokogiri::XML(html)
|
90
|
+
end
|
91
|
+
|
78
92
|
def json
|
79
93
|
@json ||= JSON.parse(raw, symbolize_names: true)
|
80
94
|
end
|
@@ -93,6 +107,9 @@ module Sinew
|
|
93
107
|
|
94
108
|
def csv_emit(row)
|
95
109
|
sinew.output.emit(row)
|
110
|
+
if sinew.output.count == sinew.options[:limit]
|
111
|
+
raise LimitError.new
|
112
|
+
end
|
96
113
|
end
|
97
114
|
end
|
98
115
|
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'scripto'
|
2
|
+
require 'sinew/connection'
|
2
3
|
|
3
4
|
#
|
4
5
|
# Main sinew entry point.
|
@@ -6,15 +7,13 @@ require 'scripto'
|
|
6
7
|
|
7
8
|
module Sinew
|
8
9
|
class Main < Scripto::Main
|
9
|
-
attr_reader :runtime_options
|
10
|
+
attr_reader :runtime_options
|
10
11
|
|
11
12
|
def initialize(options)
|
12
13
|
super(options)
|
13
14
|
|
14
15
|
# init
|
15
16
|
@runtime_options = RuntimeOptions.new
|
16
|
-
@request_tm = Time.at(0)
|
17
|
-
@request_count = 0
|
18
17
|
end
|
19
18
|
|
20
19
|
def run
|
@@ -31,24 +30,12 @@ module Sinew
|
|
31
30
|
end
|
32
31
|
|
33
32
|
#
|
34
|
-
# http requests
|
33
|
+
# http requests
|
35
34
|
#
|
36
35
|
|
37
|
-
def cache
|
38
|
-
@cache ||= Cache.new(self)
|
39
|
-
end
|
40
|
-
|
41
36
|
def http(method, url, options = {})
|
42
37
|
request = Request.new(self, method, url, options)
|
43
|
-
|
44
|
-
# try to get from cache
|
45
|
-
response = cache.get(request)
|
46
|
-
|
47
|
-
# perform if necessary
|
48
|
-
if !response
|
49
|
-
response = perform(request)
|
50
|
-
cache.set(response)
|
51
|
-
end
|
38
|
+
response = request.perform(connection)
|
52
39
|
|
53
40
|
# always log error messages
|
54
41
|
if response.error?
|
@@ -58,26 +45,10 @@ module Sinew
|
|
58
45
|
response
|
59
46
|
end
|
60
47
|
|
61
|
-
def
|
62
|
-
|
63
|
-
|
64
|
-
response = nil
|
65
|
-
|
66
|
-
tries = runtime_options.retries + 1
|
67
|
-
while tries > 0
|
68
|
-
tries -= 1
|
69
|
-
begin
|
70
|
-
@request_count += 1
|
71
|
-
response = request.perform
|
72
|
-
rescue Timeout::Error
|
73
|
-
response = Response.from_timeout(request)
|
74
|
-
end
|
75
|
-
break if !response.error_500?
|
76
|
-
end
|
77
|
-
|
78
|
-
response
|
48
|
+
def connection
|
49
|
+
@connection ||= Connection.create(options: options, runtime_options: runtime_options)
|
79
50
|
end
|
80
|
-
protected :
|
51
|
+
protected :connection
|
81
52
|
|
82
53
|
#
|
83
54
|
# output
|
@@ -91,24 +62,6 @@ module Sinew
|
|
91
62
|
# helpers
|
92
63
|
#
|
93
64
|
|
94
|
-
def before_perform_request(request)
|
95
|
-
# log
|
96
|
-
if !quiet?
|
97
|
-
msg = if request.method != 'get'
|
98
|
-
"req #{request.uri} (#{request.method})"
|
99
|
-
else
|
100
|
-
"req #{request.uri}"
|
101
|
-
end
|
102
|
-
$stderr.puts msg
|
103
|
-
end
|
104
|
-
|
105
|
-
# rate limit
|
106
|
-
sleep = (request_tm + runtime_options.rate_limit) - Time.now
|
107
|
-
sleep(sleep) if sleep > 0
|
108
|
-
@request_tm = Time.now
|
109
|
-
end
|
110
|
-
protected :before_perform_request
|
111
|
-
|
112
65
|
def footer
|
113
66
|
output.report
|
114
67
|
finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
|
data/lib/sinew/output.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'csv'
|
2
|
-
require '
|
2
|
+
require 'set'
|
3
|
+
require 'sterile'
|
3
4
|
|
4
5
|
#
|
5
6
|
# CSV output.
|
@@ -7,11 +8,12 @@ require 'stringex'
|
|
7
8
|
|
8
9
|
module Sinew
|
9
10
|
class Output
|
10
|
-
attr_reader :sinew, :columns, :rows, :csv
|
11
|
+
attr_reader :sinew, :columns, :rows, :urls, :csv
|
11
12
|
|
12
13
|
def initialize(sinew)
|
13
14
|
@sinew = sinew
|
14
15
|
@rows = []
|
16
|
+
@urls = Set.new
|
15
17
|
end
|
16
18
|
|
17
19
|
def filename
|
@@ -41,6 +43,9 @@ module Sinew
|
|
41
43
|
# implicit header if necessary
|
42
44
|
header(row.keys) if !csv
|
43
45
|
|
46
|
+
# don't allow duplicate urls
|
47
|
+
return if dup_url?(row)
|
48
|
+
|
44
49
|
rows << row.dup
|
45
50
|
|
46
51
|
# map columns to row, and normalize along the way
|
@@ -94,27 +99,17 @@ module Sinew
|
|
94
99
|
s.to_s
|
95
100
|
end
|
96
101
|
|
97
|
-
#
|
98
|
-
|
99
|
-
#
|
100
|
-
# github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
|
101
|
-
# github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
|
102
|
-
#
|
103
|
-
|
104
|
-
# <a>b</a> => b
|
105
|
-
s = s.strip_html_tags
|
102
|
+
# strip html tags. Note that we replace tags with spaces
|
103
|
+
s = s.gsub(/<[^>]+>/, ' ')
|
106
104
|
|
107
105
|
# Converts MS Word 'smart punctuation' to ASCII
|
108
|
-
s = s
|
106
|
+
s = Sterile.plain_format(s)
|
109
107
|
|
110
|
-
#
|
111
|
-
s = s
|
108
|
+
# &aacute; &amp; etc.
|
109
|
+
s = Sterile.decode_entities(s)
|
112
110
|
|
113
|
-
#
|
114
|
-
s = s
|
115
|
-
|
116
|
-
# convert unicode => regular characters
|
117
|
-
s = s.to_ascii
|
111
|
+
# "šţɽĩɳģ" => "string"
|
112
|
+
s = Sterile.transliterate(s)
|
118
113
|
|
119
114
|
# squish
|
120
115
|
s = s.squish
|
@@ -122,5 +117,17 @@ module Sinew
|
|
122
117
|
s
|
123
118
|
end
|
124
119
|
protected :normalize
|
120
|
+
|
121
|
+
def dup_url?(row)
|
122
|
+
if url = row[:url]
|
123
|
+
if urls.include?(url)
|
124
|
+
sinew.warning("duplicate url: #{url}") if !sinew.quiet?
|
125
|
+
return true
|
126
|
+
end
|
127
|
+
urls << url
|
128
|
+
end
|
129
|
+
false
|
130
|
+
end
|
131
|
+
protected :dup_url?
|
125
132
|
end
|
126
133
|
end
|
data/lib/sinew/request.rb
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
require 'digest/md5'
|
2
|
-
require 'httparty'
|
3
2
|
require 'htmlentities'
|
4
3
|
|
5
4
|
#
|
6
|
-
# Process a single HTTP request.
|
5
|
+
# Process a single HTTP request.
|
7
6
|
#
|
8
7
|
|
9
8
|
module Sinew
|
@@ -12,29 +11,43 @@ module Sinew
|
|
12
11
|
class Request
|
13
12
|
HTML_ENTITIES = HTMLEntities.new
|
14
13
|
VALID_METHODS = %w[get post patch put delete head options].freeze
|
14
|
+
METHODS_WITH_BODY = %w[patch post put].freeze
|
15
15
|
|
16
|
-
attr_reader :sinew, :method, :uri, :options
|
16
|
+
attr_reader :sinew, :method, :uri, :options
|
17
17
|
|
18
|
-
#
|
18
|
+
# Supported options:
|
19
|
+
# body: Body of http post
|
20
|
+
# headers: Hash of HTTP headers (combined with runtime_options.headers)
|
21
|
+
# query: Hash of query parameters to add to url
|
19
22
|
def initialize(sinew, method, url, options = {})
|
20
23
|
@sinew = sinew
|
21
24
|
@method = method
|
22
25
|
@options = options.dup
|
23
26
|
@uri = parse_url(url)
|
24
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
def proxy
|
30
|
+
@proxy ||= begin
|
31
|
+
if proxies = sinew.options[:proxy]
|
32
|
+
proxies.split(',').sample
|
33
|
+
end
|
34
|
+
end
|
25
35
|
end
|
26
36
|
|
27
37
|
# run the request, return the result
|
28
|
-
def perform
|
38
|
+
def perform(connection)
|
29
39
|
validate!
|
30
40
|
|
31
|
-
# merge global/options headers
|
32
41
|
headers = sinew.runtime_options.headers
|
33
42
|
headers = headers.merge(options[:headers]) if options[:headers]
|
34
|
-
options[:headers] = headers
|
35
43
|
|
36
|
-
|
37
|
-
|
44
|
+
body = options.delete(:body)
|
45
|
+
|
46
|
+
fday_response = connection.send(method, uri, body, headers) do
|
47
|
+
_1.options[:proxy] = proxy
|
48
|
+
end
|
49
|
+
|
50
|
+
Response.from_network(self, fday_response)
|
38
51
|
end
|
39
52
|
|
40
53
|
# We accept sloppy urls and attempt to clean them up
|
@@ -48,11 +61,11 @@ module Sinew
|
|
48
61
|
s = s.gsub(' ', '%20')
|
49
62
|
s = s.gsub("'", '%27')
|
50
63
|
|
51
|
-
# append query manually (instead of letting
|
52
|
-
#
|
64
|
+
# append query manually (instead of letting Faraday handle it) for consistent
|
65
|
+
# Request#uri and Response#uri
|
53
66
|
query = options.delete(:query)
|
54
67
|
if query.present?
|
55
|
-
q =
|
68
|
+
q = Faraday::Utils.default_params_encoder.encode(query)
|
56
69
|
separator = s.include?('?') ? '&' : '?'
|
57
70
|
s = "#{s}#{separator}#{q}"
|
58
71
|
end
|
@@ -61,44 +74,10 @@ module Sinew
|
|
61
74
|
end
|
62
75
|
protected :parse_url
|
63
76
|
|
64
|
-
def calculate_cache_key
|
65
|
-
dir = pathify(uri.host)
|
66
|
-
|
67
|
-
body_key = if body.is_a?(Hash)
|
68
|
-
HTTParty::HashConversions.to_params(body)
|
69
|
-
else
|
70
|
-
body&.dup
|
71
|
-
end
|
72
|
-
|
73
|
-
# build key, as a hash for before_generate_cache_key
|
74
|
-
key = {
|
75
|
-
method: method.dup,
|
76
|
-
path: uri.path,
|
77
|
-
query: uri.query,
|
78
|
-
body: body_key,
|
79
|
-
}
|
80
|
-
key = sinew.runtime_options.before_generate_cache_key.call(key)
|
81
|
-
|
82
|
-
# strip method for gets
|
83
|
-
key.delete(:method) if key[:method] == 'get'
|
84
|
-
|
85
|
-
# pull out the values, join and pathify
|
86
|
-
path = key.values.select(&:present?).join(',')
|
87
|
-
path = pathify(path)
|
88
|
-
|
89
|
-
# shorten long paths
|
90
|
-
if path.length > 250
|
91
|
-
path = Digest::MD5.hexdigest(path)
|
92
|
-
end
|
93
|
-
|
94
|
-
"#{dir}/#{path}"
|
95
|
-
end
|
96
|
-
protected :calculate_cache_key
|
97
|
-
|
98
77
|
def validate!
|
99
78
|
raise "invalid method #{method}" if !VALID_METHODS.include?(method)
|
100
79
|
raise "invalid url #{uri}" if uri.scheme !~ /^http/
|
101
|
-
raise "can't
|
80
|
+
raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
|
102
81
|
raise "Content-Type doesn't make sense without a body" if content_type && !body
|
103
82
|
end
|
104
83
|
protected :validate!
|
@@ -134,7 +113,7 @@ module Sinew
|
|
134
113
|
s = s.gsub(',,', ',')
|
135
114
|
# encode invalid path chars
|
136
115
|
s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |i|
|
137
|
-
hex = i.
|
116
|
+
hex = i.unpack1('H2')
|
138
117
|
"%#{hex}"
|
139
118
|
end
|
140
119
|
# handle empty case
|