sinew 2.0.1 → 3.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +26 -0
- data/.rubocop.yml +9 -6
- data/.vscode/settings.json +0 -10
- data/Gemfile +9 -0
- data/LICENSE +1 -1
- data/README.md +77 -58
- data/Rakefile +33 -18
- data/bin/sinew +8 -4
- data/lib/sinew.rb +0 -1
- data/lib/sinew/connection.rb +52 -0
- data/lib/sinew/connection/log_formatter.rb +22 -0
- data/lib/sinew/connection/rate_limit.rb +29 -0
- data/lib/sinew/core_ext.rb +1 -1
- data/lib/sinew/dsl.rb +27 -10
- data/lib/sinew/main.rb +7 -54
- data/lib/sinew/output.rb +26 -19
- data/lib/sinew/request.rb +28 -49
- data/lib/sinew/response.rb +25 -55
- data/lib/sinew/runtime_options.rb +4 -2
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +2 -2
- data/sinew.gemspec +16 -17
- metadata +41 -81
- data/.travis.yml +0 -4
- data/lib/sinew/cache.rb +0 -79
- data/test/legacy/eu.httpbin.org/head/redirect,3 +0 -51
- data/test/legacy/eu.httpbin.org/head/status,500 +0 -1
- data/test/legacy/eu.httpbin.org/redirect,3 +0 -11
- data/test/legacy/eu.httpbin.org/status,500 +0 -1
- data/test/legacy/legacy.sinew +0 -2
- data/test/test.html +0 -45
- data/test/test_cache.rb +0 -69
- data/test/test_helper.rb +0 -113
- data/test/test_legacy.rb +0 -21
- data/test/test_main.rb +0 -46
- data/test/test_nokogiri_ext.rb +0 -18
- data/test/test_output.rb +0 -73
- data/test/test_requests.rb +0 -135
- data/test/test_utf8.rb +0 -39
@@ -0,0 +1,22 @@
|
|
1
|
+
module Sinew
  module Connection
    # Faraday log formatter that emits one compact 'req' line per request
    # and nothing at all for responses.
    class LogFormatter < Faraday::Logging::Formatter
      # Log an outgoing request at info level.
      #
      # The line contains the (filtered) url, the HTTP method when it is not
      # :get, and the proxy uri when the request goes through a proxy.
      def request(env)
        # Only log the initial request, not the redirects
        return if env[:redirect]

        info('req') do
          parts = [apply_filters(env.url.to_s)]
          parts << "(#{env.method})" unless env.method == :get
          parts << "=> #{env.request.proxy.uri}" if env.request.proxy
          parts.join(' ')
        end
      end

      # Responses are intentionally not logged.
      def response(env)
        # silent
      end
    end
  end
end
|
@@ -0,0 +1,29 @@
|
|
1
|
+
module Sinew
  module Connection
    # Faraday middleware that enforces a minimum interval between requests.
    #
    # Options:
    #   rate_limit: minimum number of seconds between the start of one
    #               rate-limited request and the start of the next
    #               (defaults to 1).
    #
    # Responses flagged with env[:httpdisk] do not count toward the limit.
    class RateLimit < Faraday::Middleware
      attr_reader :rate_limit

      def initialize(app, options = {})
        super(app)

        @last_request_tm = @current_request_tm = nil
        @rate_limit = options.fetch(:rate_limit, 1)
      end

      # Before the request goes out: sleep for whatever is left of the
      # interval since the last rate-limited request, then record when this
      # request started.
      def on_request(_env)
        if @last_request_tm
          delay = (@last_request_tm + rate_limit) - monotonic_now
          sleep(delay) if delay > 0
        end

        @current_request_tm = monotonic_now
      end

      # After the response arrives: remember the start time of this request
      # as the reference point for the next one.
      def on_complete(env)
        # Only rate limit on uncached requests
        @last_request_tm = @current_request_tm unless env[:httpdisk]
        @current_request_tm = nil
      end

      private

      # Seconds from the monotonic clock. Unlike Time.now, this is not
      # affected by system clock adjustments, so the computed delay can
      # never be skewed by a wall-clock jump.
      def monotonic_now
        Process.clock_gettime(Process::CLOCK_MONOTONIC)
      end
    end
  end
end
|
data/lib/sinew/core_ext.rb
CHANGED
data/lib/sinew/dsl.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
|
-
require 'awesome_print'
|
1
|
+
require 'amazing_print'
|
2
2
|
require 'cgi'
|
3
|
+
require 'json'
|
3
4
|
|
4
5
|
#
|
5
6
|
# The DSL available to .sinew files.
|
@@ -7,7 +8,10 @@ require 'cgi'
|
|
7
8
|
|
8
9
|
module Sinew
|
9
10
|
class DSL
|
10
|
-
|
11
|
+
# this is used to break out of --limit
|
12
|
+
class LimitError < StandardError; end
|
13
|
+
|
14
|
+
attr_reader :sinew, :uri, :raw, :code, :elapsed
|
11
15
|
|
12
16
|
def initialize(sinew)
|
13
17
|
@sinew = sinew
|
@@ -15,8 +19,12 @@ module Sinew
|
|
15
19
|
|
16
20
|
def run
|
17
21
|
tm = Time.now
|
18
|
-
|
19
|
-
|
22
|
+
begin
|
23
|
+
recipe = sinew.options[:recipe]
|
24
|
+
instance_eval(File.read(recipe, mode: 'rb'), recipe)
|
25
|
+
rescue LimitError
|
26
|
+
# ignore - this is flow control for --limit
|
27
|
+
end
|
20
28
|
@elapsed = Time.now - tm
|
21
29
|
end
|
22
30
|
|
@@ -45,15 +53,17 @@ module Sinew
|
|
45
53
|
end
|
46
54
|
|
47
55
|
def http(method, url, options = {})
|
48
|
-
#
|
49
|
-
@html
|
56
|
+
# these need to be cleared before each request
|
57
|
+
%i[@html @noko @xml @json].each do |i|
|
58
|
+
instance_variable_set(i, nil)
|
59
|
+
end
|
50
60
|
|
51
|
-
# fetch
|
61
|
+
# fetch and make response available to callers
|
52
62
|
response = sinew.http(method, url, options)
|
63
|
+
@uri, @raw, @code = response.uri, response.body, response.code
|
53
64
|
|
54
|
-
#
|
55
|
-
|
56
|
-
@raw = response.body
|
65
|
+
# don't confuse the user
|
66
|
+
nil
|
57
67
|
end
|
58
68
|
|
59
69
|
#
|
@@ -75,6 +85,10 @@ module Sinew
|
|
75
85
|
@noko ||= Nokogiri::HTML(html)
|
76
86
|
end
|
77
87
|
|
88
|
+
def xml
|
89
|
+
@xml ||= Nokogiri::XML(html)
|
90
|
+
end
|
91
|
+
|
78
92
|
def json
|
79
93
|
@json ||= JSON.parse(raw, symbolize_names: true)
|
80
94
|
end
|
@@ -93,6 +107,9 @@ module Sinew
|
|
93
107
|
|
94
108
|
def csv_emit(row)
|
95
109
|
sinew.output.emit(row)
|
110
|
+
if sinew.output.count == sinew.options[:limit]
|
111
|
+
raise LimitError.new
|
112
|
+
end
|
96
113
|
end
|
97
114
|
end
|
98
115
|
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'scripto'
|
2
|
+
require 'sinew/connection'
|
2
3
|
|
3
4
|
#
|
4
5
|
# Main sinew entry point.
|
@@ -6,15 +7,13 @@ require 'scripto'
|
|
6
7
|
|
7
8
|
module Sinew
|
8
9
|
class Main < Scripto::Main
|
9
|
-
attr_reader :runtime_options
|
10
|
+
attr_reader :runtime_options
|
10
11
|
|
11
12
|
def initialize(options)
|
12
13
|
super(options)
|
13
14
|
|
14
15
|
# init
|
15
16
|
@runtime_options = RuntimeOptions.new
|
16
|
-
@request_tm = Time.at(0)
|
17
|
-
@request_count = 0
|
18
17
|
end
|
19
18
|
|
20
19
|
def run
|
@@ -31,24 +30,12 @@ module Sinew
|
|
31
30
|
end
|
32
31
|
|
33
32
|
#
|
34
|
-
# http requests
|
33
|
+
# http requests
|
35
34
|
#
|
36
35
|
|
37
|
-
def cache
|
38
|
-
@cache ||= Cache.new(self)
|
39
|
-
end
|
40
|
-
|
41
36
|
def http(method, url, options = {})
|
42
37
|
request = Request.new(self, method, url, options)
|
43
|
-
|
44
|
-
# try to get from cache
|
45
|
-
response = cache.get(request)
|
46
|
-
|
47
|
-
# perform if necessary
|
48
|
-
if !response
|
49
|
-
response = perform(request)
|
50
|
-
cache.set(response)
|
51
|
-
end
|
38
|
+
response = request.perform(connection)
|
52
39
|
|
53
40
|
# always log error messages
|
54
41
|
if response.error?
|
@@ -58,26 +45,10 @@ module Sinew
|
|
58
45
|
response
|
59
46
|
end
|
60
47
|
|
61
|
-
def perform(request)
|
62
|
-
|
63
|
-
|
64
|
-
response = nil
|
65
|
-
|
66
|
-
tries = runtime_options.retries + 1
|
67
|
-
while tries > 0
|
68
|
-
tries -= 1
|
69
|
-
begin
|
70
|
-
@request_count += 1
|
71
|
-
response = request.perform
|
72
|
-
rescue Timeout::Error
|
73
|
-
response = Response.from_timeout(request)
|
74
|
-
end
|
75
|
-
break if !response.error_500?
|
76
|
-
end
|
77
|
-
|
78
|
-
response
|
48
|
+
def connection
|
49
|
+
@connection ||= Connection.create(options: options, runtime_options: runtime_options)
|
79
50
|
end
|
80
|
-
protected :perform
|
51
|
+
protected :connection
|
81
52
|
|
82
53
|
#
|
83
54
|
# output
|
@@ -91,24 +62,6 @@ module Sinew
|
|
91
62
|
# helpers
|
92
63
|
#
|
93
64
|
|
94
|
-
def before_perform_request(request)
|
95
|
-
# log
|
96
|
-
if !quiet?
|
97
|
-
msg = if request.method != 'get'
|
98
|
-
"req #{request.uri} (#{request.method})"
|
99
|
-
else
|
100
|
-
"req #{request.uri}"
|
101
|
-
end
|
102
|
-
$stderr.puts msg
|
103
|
-
end
|
104
|
-
|
105
|
-
# rate limit
|
106
|
-
sleep = (request_tm + runtime_options.rate_limit) - Time.now
|
107
|
-
sleep(sleep) if sleep > 0
|
108
|
-
@request_tm = Time.now
|
109
|
-
end
|
110
|
-
protected :before_perform_request
|
111
|
-
|
112
65
|
def footer
|
113
66
|
output.report
|
114
67
|
finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
|
data/lib/sinew/output.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
require 'csv'
|
2
|
-
require 'stringex'
|
2
|
+
require 'set'
|
3
|
+
require 'sterile'
|
3
4
|
|
4
5
|
#
|
5
6
|
# CSV output.
|
@@ -7,11 +8,12 @@ require 'stringex'
|
|
7
8
|
|
8
9
|
module Sinew
|
9
10
|
class Output
|
10
|
-
attr_reader :sinew, :columns, :rows, :csv
|
11
|
+
attr_reader :sinew, :columns, :rows, :urls, :csv
|
11
12
|
|
12
13
|
def initialize(sinew)
|
13
14
|
@sinew = sinew
|
14
15
|
@rows = []
|
16
|
+
@urls = Set.new
|
15
17
|
end
|
16
18
|
|
17
19
|
def filename
|
@@ -41,6 +43,9 @@ module Sinew
|
|
41
43
|
# implicit header if necessary
|
42
44
|
header(row.keys) if !csv
|
43
45
|
|
46
|
+
# don't allow duplicate urls
|
47
|
+
return if dup_url?(row)
|
48
|
+
|
44
49
|
rows << row.dup
|
45
50
|
|
46
51
|
# map columns to row, and normalize along the way
|
@@ -94,27 +99,17 @@ module Sinew
|
|
94
99
|
s.to_s
|
95
100
|
end
|
96
101
|
|
97
|
-
#
|
98
|
-
|
99
|
-
#
|
100
|
-
# github.com/rsl/stringex/blob/master/lib/stringex/string_extensions.rb
|
101
|
-
# github.com/rsl/stringex/blob/master/lib/stringex/localization/conversion_expressions.rb
|
102
|
-
#
|
103
|
-
|
104
|
-
# <a>b</a> => b
|
105
|
-
s = s.strip_html_tags
|
102
|
+
# strip html tags. Note that we replace tags with spaces
|
103
|
+
s = s.gsub(/<[^>]+>/, ' ')
|
106
104
|
|
107
105
|
# Converts MS Word 'smart punctuation' to ASCII
|
108
|
-
s = s
|
106
|
+
s = Sterile.plain_format(s)
|
109
107
|
|
110
|
-
#
|
111
|
-
s = s
|
108
|
+
# á & etc.
|
109
|
+
s = Sterile.decode_entities(s)
|
112
110
|
|
113
|
-
#
|
114
|
-
s = s
|
115
|
-
|
116
|
-
# convert unicode => regular characters
|
117
|
-
s = s.to_ascii
|
111
|
+
# "šţɽĩɳģ" => "string"
|
112
|
+
s = Sterile.transliterate(s)
|
118
113
|
|
119
114
|
# squish
|
120
115
|
s = s.squish
|
@@ -122,5 +117,17 @@ module Sinew
|
|
122
117
|
s
|
123
118
|
end
|
124
119
|
protected :normalize
|
120
|
+
|
121
|
+
def dup_url?(row)
|
122
|
+
if url = row[:url]
|
123
|
+
if urls.include?(url)
|
124
|
+
sinew.warning("duplicate url: #{url}") if !sinew.quiet?
|
125
|
+
return true
|
126
|
+
end
|
127
|
+
urls << url
|
128
|
+
end
|
129
|
+
false
|
130
|
+
end
|
131
|
+
protected :dup_url?
|
125
132
|
end
|
126
133
|
end
|
data/lib/sinew/request.rb
CHANGED
@@ -1,9 +1,8 @@
|
|
1
1
|
require 'digest/md5'
|
2
|
-
require 'httparty'
|
3
2
|
require 'htmlentities'
|
4
3
|
|
5
4
|
#
|
6
|
-
# Process a single HTTP request.
|
5
|
+
# Process a single HTTP request.
|
7
6
|
#
|
8
7
|
|
9
8
|
module Sinew
|
@@ -12,29 +11,43 @@ module Sinew
|
|
12
11
|
class Request
|
13
12
|
HTML_ENTITIES = HTMLEntities.new
|
14
13
|
VALID_METHODS = %w[get post patch put delete head options].freeze
|
14
|
+
METHODS_WITH_BODY = %w[patch post put].freeze
|
15
15
|
|
16
|
-
attr_reader :sinew, :method, :uri, :options
|
16
|
+
attr_reader :sinew, :method, :uri, :options
|
17
17
|
|
18
|
-
#
|
18
|
+
# Supported options:
|
19
|
+
# body: Body of http post
|
20
|
+
# headers: Hash of HTTP headers (combined with runtime_options.headers)
|
21
|
+
# query: Hash of query parameters to add to url
|
19
22
|
def initialize(sinew, method, url, options = {})
|
20
23
|
@sinew = sinew
|
21
24
|
@method = method
|
22
25
|
@options = options.dup
|
23
26
|
@uri = parse_url(url)
|
24
|
-
|
27
|
+
end
|
28
|
+
|
29
|
+
def proxy
|
30
|
+
@proxy ||= begin
|
31
|
+
if proxies = sinew.options[:proxy]
|
32
|
+
proxies.split(',').sample
|
33
|
+
end
|
34
|
+
end
|
25
35
|
end
|
26
36
|
|
27
37
|
# run the request, return the result
|
28
|
-
def perform
|
38
|
+
def perform(connection)
|
29
39
|
validate!
|
30
40
|
|
31
|
-
# merge global/options headers
|
32
41
|
headers = sinew.runtime_options.headers
|
33
42
|
headers = headers.merge(options[:headers]) if options[:headers]
|
34
|
-
options[:headers] = headers
|
35
43
|
|
36
|
-
|
37
|
-
|
44
|
+
body = options.delete(:body)
|
45
|
+
|
46
|
+
fday_response = connection.send(method, uri, body, headers) do
|
47
|
+
_1.options[:proxy] = proxy
|
48
|
+
end
|
49
|
+
|
50
|
+
Response.from_network(self, fday_response)
|
38
51
|
end
|
39
52
|
|
40
53
|
# We accept sloppy urls and attempt to clean them up
|
@@ -48,11 +61,11 @@ module Sinew
|
|
48
61
|
s = s.gsub(' ', '%20')
|
49
62
|
s = s.gsub("'", '%27')
|
50
63
|
|
51
|
-
# append query manually (instead of letting
|
52
|
-
#
|
64
|
+
# append query manually (instead of letting Faraday handle it) for consistent
|
65
|
+
# Request#uri and Response#uri
|
53
66
|
query = options.delete(:query)
|
54
67
|
if query.present?
|
55
|
-
q =
|
68
|
+
q = Faraday::Utils.default_params_encoder.encode(query)
|
56
69
|
separator = s.include?('?') ? '&' : '?'
|
57
70
|
s = "#{s}#{separator}#{q}"
|
58
71
|
end
|
@@ -61,44 +74,10 @@ module Sinew
|
|
61
74
|
end
|
62
75
|
protected :parse_url
|
63
76
|
|
64
|
-
def calculate_cache_key
|
65
|
-
dir = pathify(uri.host)
|
66
|
-
|
67
|
-
body_key = if body.is_a?(Hash)
|
68
|
-
HTTParty::HashConversions.to_params(body)
|
69
|
-
else
|
70
|
-
body&.dup
|
71
|
-
end
|
72
|
-
|
73
|
-
# build key, as a hash for before_generate_cache_key
|
74
|
-
key = {
|
75
|
-
method: method.dup,
|
76
|
-
path: uri.path,
|
77
|
-
query: uri.query,
|
78
|
-
body: body_key,
|
79
|
-
}
|
80
|
-
key = sinew.runtime_options.before_generate_cache_key.call(key)
|
81
|
-
|
82
|
-
# strip method for gets
|
83
|
-
key.delete(:method) if key[:method] == 'get'
|
84
|
-
|
85
|
-
# pull out the values, join and pathify
|
86
|
-
path = key.values.select(&:present?).join(',')
|
87
|
-
path = pathify(path)
|
88
|
-
|
89
|
-
# shorten long paths
|
90
|
-
if path.length > 250
|
91
|
-
path = Digest::MD5.hexdigest(path)
|
92
|
-
end
|
93
|
-
|
94
|
-
"#{dir}/#{path}"
|
95
|
-
end
|
96
|
-
protected :calculate_cache_key
|
97
|
-
|
98
77
|
def validate!
|
99
78
|
raise "invalid method #{method}" if !VALID_METHODS.include?(method)
|
100
79
|
raise "invalid url #{uri}" if uri.scheme !~ /^http/
|
101
|
-
raise "can't
|
80
|
+
raise "can't #{method} with a body" if body && !METHODS_WITH_BODY.include?(method)
|
102
81
|
raise "Content-Type doesn't make sense without a body" if content_type && !body
|
103
82
|
end
|
104
83
|
protected :validate!
|
@@ -134,7 +113,7 @@ module Sinew
|
|
134
113
|
s = s.gsub(',,', ',')
|
135
114
|
# encode invalid path chars
|
136
115
|
s = s.gsub(/[^A-Za-z0-9_.,=-]/) do |i|
|
137
|
-
hex = i.
|
116
|
+
hex = i.unpack1('H2')
|
138
117
|
"%#{hex}"
|
139
118
|
end
|
140
119
|
# handle empty case
|