sinew 3.0.1 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +4 -8
- data/.gitignore +3 -5
- data/.rubocop.yml +13 -48
- data/Gemfile +9 -9
- data/Gemfile.lock +132 -0
- data/LICENSE +1 -1
- data/README.md +113 -48
- data/Rakefile +3 -51
- data/bin/sinew +13 -41
- data/justfile +59 -0
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +252 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +3 -2
- data/lib/sinew/nokogiri_ext.rb +13 -22
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/lib/sinew.rb +23 -9
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +24 -20
- metadata +56 -31
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
data/justfile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
# read gem version
|
3
|
+
gemver := `cat lib/sinew/version.rb | grep -Eo "[0-9]+\.[0-9]+\.[0-9]+"`
|
4
|
+
|
5
|
+
#
|
6
|
+
# dev
|
7
|
+
#
|
8
|
+
|
9
|
+
default: test
|
10
|
+
|
11
|
+
check: lint test
|
12
|
+
|
13
|
+
fmt:
|
14
|
+
bundle exec rubocop -a
|
15
|
+
|
16
|
+
lint:
|
17
|
+
@just banner lint...
|
18
|
+
bundle exec rubocop
|
19
|
+
|
20
|
+
pry:
|
21
|
+
bundle exec pry -I lib -r sinew.rb
|
22
|
+
|
23
|
+
test:
|
24
|
+
@just banner test...
|
25
|
+
bundle exec rake test
|
26
|
+
|
27
|
+
watch:
|
28
|
+
@watchexec --watch lib --watch test --clear bundle exec rake test
|
29
|
+
|
30
|
+
#
|
31
|
+
# ci
|
32
|
+
#
|
33
|
+
|
34
|
+
ci:
|
35
|
+
bundle install
|
36
|
+
just check
|
37
|
+
|
38
|
+
#
|
39
|
+
# gem tasks
|
40
|
+
#
|
41
|
+
|
42
|
+
gem-push: check-git-status
|
43
|
+
@just banner gem build...
|
44
|
+
gem build sinew.gemspec
|
45
|
+
@just banner tag...
|
46
|
+
git tag -a "v{{gemver}}" -m "Tagging {{gemver}}"
|
47
|
+
git push --tags
|
48
|
+
@just banner gem push...
|
49
|
+
gem push "sinew-{{gemver}}.gem"
|
50
|
+
|
51
|
+
#
|
52
|
+
# util
|
53
|
+
#
|
54
|
+
|
55
|
+
banner *ARGS:
|
56
|
+
@printf '\e[42;37;1m[%s] %-72s \e[m\n' "$(date +%H:%M:%S)" "{{ARGS}}"
|
57
|
+
|
58
|
+
check-git-status:
|
59
|
+
@if [ ! -z "$(git status --porcelain)" ]; then echo "git status is dirty, bailing."; exit 1; fi
|
data/lib/sinew/args.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# manually load dependencies here since this is loaded standalone by bin
|
2
|
+
require "httpdisk/slop_duration"
|
3
|
+
require "sinew/version"
|
4
|
+
require "slop"
|
5
|
+
|
6
|
+
#
|
7
|
+
# This is used to parse command line arguments with Slop. We don't set any
|
8
|
+
# defaults in here, relying instead on Sloptions in Sinew::Base. That way
|
9
|
+
# defaults are applied for both command line and embedded usage of Sinew::Base.
|
10
|
+
#
|
11
|
+
|
12
|
+
module Sinew
|
13
|
+
module Args
|
14
|
+
def self.slop(args)
|
15
|
+
slop = Slop.parse(args) do |o|
|
16
|
+
o.banner = "Usage: sinew [options] [recipe.sinew]"
|
17
|
+
o.integer "-l", "--limit", "quit after emitting this many rows"
|
18
|
+
o.string "--proxy", "use host[:port] as HTTP proxy (can be a comma-delimited list)"
|
19
|
+
o.integer "--timeout", "maximum time allowed for the transfer"
|
20
|
+
o.bool "-s", "--silent", "suppress some output"
|
21
|
+
o.bool "-v", "--verbose", "dump emitted rows while running"
|
22
|
+
|
23
|
+
o.separator "From httpdisk:"
|
24
|
+
o.string "--dir", "set custom cache directory"
|
25
|
+
# note: uses slop_duration from HTTPDisk
|
26
|
+
o.duration "--expires", "when to expire cached requests (ex: 1h, 2d, 3w)"
|
27
|
+
o.bool "--force", "don't read anything from cache (but still write)"
|
28
|
+
o.bool "--force-errors", "don't read errors from cache (but still write)"
|
29
|
+
|
30
|
+
# generic
|
31
|
+
o.boolean "--version", "show version" do
|
32
|
+
puts "sinew #{Sinew::VERSION}"
|
33
|
+
exit
|
34
|
+
end
|
35
|
+
o.on("--help", "show this help") do
|
36
|
+
puts o
|
37
|
+
exit
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
# recipe argument
|
42
|
+
recipe = slop.args.first
|
43
|
+
raise Slop::Error, "" if args.empty?
|
44
|
+
raise Slop::Error, "no RECIPE specified" if !recipe
|
45
|
+
raise Slop::Error, "more than one RECIPE specified" if slop.args.length > 1
|
46
|
+
raise Slop::Error, "#{recipe} not found" if !File.exist?(recipe)
|
47
|
+
|
48
|
+
slop.to_h.tap do
|
49
|
+
_1[:recipe] = recipe
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
data/lib/sinew/base.rb
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
require "amazing_print"
|
2
|
+
require "faraday-encoding"
|
3
|
+
require "faraday-rate_limiter"
|
4
|
+
require "faraday/logging/formatter"
|
5
|
+
require "faraday/retry"
|
6
|
+
require "httpdisk"
|
7
|
+
|
8
|
+
module Sinew
|
9
|
+
# Sinew base class, for use in standalone scripts or via the sinew binary.
|
10
|
+
class Base
|
11
|
+
attr_reader :csv, :mutex, :options
|
12
|
+
|
13
|
+
def initialize(opts = {})
|
14
|
+
@mutex = Mutex.new
|
15
|
+
|
16
|
+
#
|
17
|
+
# defaults for Sloptions
|
18
|
+
#
|
19
|
+
|
20
|
+
# default :rate_limit, typically 1
|
21
|
+
default_rate_limit = ENV["SINEW_TEST"] ? 0 : 1
|
22
|
+
|
23
|
+
#
|
24
|
+
# note: uses HTTPDisk::Sloptions
|
25
|
+
#
|
26
|
+
|
27
|
+
@options = HTTPDisk::Sloptions.parse(opts) do
|
28
|
+
# cli
|
29
|
+
_1.integer :limit
|
30
|
+
_1.integer :timeout, default: 30
|
31
|
+
_1.boolean :silent
|
32
|
+
_1.on :proxy, type: [:string, Array]
|
33
|
+
_1.boolean :verbose
|
34
|
+
|
35
|
+
# httpdisk
|
36
|
+
_1.string :dir, default: File.join(ENV["HOME"], ".sinew")
|
37
|
+
_1.integer :expires
|
38
|
+
_1.boolean :force
|
39
|
+
_1.boolean :force_errors
|
40
|
+
_1.array :ignore_params
|
41
|
+
|
42
|
+
# more runtime options
|
43
|
+
_1.hash :headers
|
44
|
+
_1.boolean :insecure
|
45
|
+
_1.string :output, required: true
|
46
|
+
_1.hash :params
|
47
|
+
_1.float :rate_limit, default: default_rate_limit
|
48
|
+
_1.integer :retries, default: 2
|
49
|
+
_1.on :url_prefix, type: [:string, URI]
|
50
|
+
_1.boolean :utf8, default: true
|
51
|
+
end
|
52
|
+
|
53
|
+
@csv = CSV.new(opts[:output])
|
54
|
+
end
|
55
|
+
|
56
|
+
#
|
57
|
+
# requests
|
58
|
+
#
|
59
|
+
|
60
|
+
# http get, returns a Response
|
61
|
+
def get(url, params = nil, headers = nil)
|
62
|
+
faraday_response = faraday.get(url, params, headers) do
|
63
|
+
_1.options[:proxy] = random_proxy
|
64
|
+
end
|
65
|
+
Response.new(faraday_response)
|
66
|
+
end
|
67
|
+
|
68
|
+
# http post, returns a Response. Defaults to form body type.
|
69
|
+
def post(url, body = nil, headers = nil)
|
70
|
+
faraday_response = faraday.post(url, body, headers) do
|
71
|
+
_1.options[:proxy] = random_proxy
|
72
|
+
end
|
73
|
+
Response.new(faraday_response)
|
74
|
+
end
|
75
|
+
|
76
|
+
# http post json, returns a Response
|
77
|
+
def post_json(url, body = nil, headers = nil)
|
78
|
+
body = body.to_json
|
79
|
+
headers = (headers || {}).merge("Content-Type" => "application/json")
|
80
|
+
post(url, body, headers)
|
81
|
+
end
|
82
|
+
|
83
|
+
# Faraday connection for this recipe
|
84
|
+
def faraday
|
85
|
+
mutex.synchronize do
|
86
|
+
@faraday ||= create_faraday
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
#
|
91
|
+
# httpdisk
|
92
|
+
#
|
93
|
+
|
94
|
+
# Returns true if request is cached. Defaults to form body type.
|
95
|
+
def cached?(method, url, params = nil, body = nil)
|
96
|
+
status = status(method, url, params, body)
|
97
|
+
status[:status] != "miss"
|
98
|
+
end
|
99
|
+
|
100
|
+
# Remove cache file, if any. Defaults to form body type.
|
101
|
+
def uncache(method, url, params = nil, body = nil)
|
102
|
+
status = status(method, url, params, body)
|
103
|
+
path = status[:path]
|
104
|
+
File.unlink(path) if File.exist?(path)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Check httpdisk status for this request. Defaults to form body type.
|
108
|
+
def status(method, url, params = nil, body = nil)
|
109
|
+
# if hash, default to url encoded form
|
110
|
+
# see lib/faraday/request/url_encoded.rb
|
111
|
+
if body.is_a?(Hash)
|
112
|
+
body = Faraday::Utils::ParamsHash[body].to_query
|
113
|
+
end
|
114
|
+
|
115
|
+
env = Faraday::Env.new.tap do
|
116
|
+
_1.method = method.to_s.downcase.to_sym
|
117
|
+
_1.request_headers = {}
|
118
|
+
_1.request_body = body
|
119
|
+
_1.url = faraday.build_url(url, params)
|
120
|
+
end
|
121
|
+
httpdisk.status(env)
|
122
|
+
end
|
123
|
+
|
124
|
+
#
|
125
|
+
# csv
|
126
|
+
#
|
127
|
+
|
128
|
+
# Output a csv header. This usually happens automatically, but you can call
|
129
|
+
# this method directly to ensure a consistent set of columns.
|
130
|
+
def csv_header(*columns)
|
131
|
+
csv.start(columns.flatten)
|
132
|
+
end
|
133
|
+
|
134
|
+
# Output a csv row. Row should be any object that can turn into a hash - a
|
135
|
+
# hash, OpenStruct, etc.
|
136
|
+
def csv_emit(row)
|
137
|
+
row = row.to_h
|
138
|
+
mutex.synchronize do
|
139
|
+
# header if necessary
|
140
|
+
csv_header(row.keys) if !csv.started?
|
141
|
+
|
142
|
+
# emit
|
143
|
+
print = csv.emit(row)
|
144
|
+
puts print.ai if options[:verbose]
|
145
|
+
|
146
|
+
# this is caught by Sinew::Main
|
147
|
+
if csv.count == options[:limit]
|
148
|
+
raise LimitError
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
#
|
154
|
+
# stdout
|
155
|
+
#
|
156
|
+
|
157
|
+
RESET = "\e[0m".freeze
|
158
|
+
RED = "\e[1;37;41m".freeze
|
159
|
+
GREEN = "\e[1;37;42m".freeze
|
160
|
+
|
161
|
+
# Print a nice green banner.
|
162
|
+
def banner(msg, color: GREEN)
|
163
|
+
msg = "#{msg} ".ljust(72, " ")
|
164
|
+
msg = "[#{Time.new.strftime("%H:%M:%S")}] #{msg}"
|
165
|
+
msg = "#{color}#{msg}#{RESET}" if $stdout.tty?
|
166
|
+
puts msg
|
167
|
+
end
|
168
|
+
|
169
|
+
# Print a scary red banner and exit.
|
170
|
+
def fatal(msg)
|
171
|
+
banner(msg, color: RED)
|
172
|
+
exit 1
|
173
|
+
end
|
174
|
+
|
175
|
+
protected
|
176
|
+
|
177
|
+
# Return a random proxy.
|
178
|
+
def random_proxy
|
179
|
+
return if !options[:proxy]
|
180
|
+
|
181
|
+
proxies = options[:proxy]
|
182
|
+
proxies = proxies.split(",") if !proxies.is_a?(Array)
|
183
|
+
proxies.sample
|
184
|
+
end
|
185
|
+
|
186
|
+
# Create the Faraday connection for making requests.
|
187
|
+
def create_faraday
|
188
|
+
faraday_options = options.slice(:headers, :params)
|
189
|
+
if options[:insecure]
|
190
|
+
faraday_options[:ssl] = {verify: false}
|
191
|
+
end
|
192
|
+
Faraday.new(nil, faraday_options) do
|
193
|
+
# options
|
194
|
+
if options[:url_prefix]
|
195
|
+
_1.url_prefix = options[:url_prefix]
|
196
|
+
end
|
197
|
+
_1.options.timeout = options[:timeout]
|
198
|
+
|
199
|
+
#
|
200
|
+
# middleware that runs on both disk/network requests
|
201
|
+
#
|
202
|
+
|
203
|
+
# cookie middleware
|
204
|
+
_1.use :cookie_jar
|
205
|
+
|
206
|
+
# auto-encode form bodies
|
207
|
+
_1.request :url_encoded
|
208
|
+
|
209
|
+
# Before httpdisk so each redirect segment is cached
|
210
|
+
# Keep track of redirect status for logger
|
211
|
+
_1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
|
212
|
+
|
213
|
+
#
|
214
|
+
# httpdisk
|
215
|
+
#
|
216
|
+
|
217
|
+
httpdisk_options = options.slice(:dir, :expires, :force, :force_errors, :ignore_params, :utf8)
|
218
|
+
_1.use :httpdisk, httpdisk_options
|
219
|
+
|
220
|
+
#
|
221
|
+
# middleware below is only used if httpdisk uses the network
|
222
|
+
#
|
223
|
+
|
224
|
+
# rate limit
|
225
|
+
rate_limit = options[:rate_limit]
|
226
|
+
_1.request :rate_limiter, interval: rate_limit
|
227
|
+
|
228
|
+
# After httpdisk so that only non-cached requests are logged.
|
229
|
+
# Before retry so that we don't log each retry attempt.
|
230
|
+
_1.response :logger, nil, formatter: Middleware::LogFormatter if !options[:silent]
|
231
|
+
|
232
|
+
retry_options = {
|
233
|
+
max_interval: rate_limit, # very important, negates Retry-After: 86400
|
234
|
+
max: options[:retries],
|
235
|
+
methods: %w[delete get head options patch post put trace],
|
236
|
+
retry_statuses: (500..600).to_a,
|
237
|
+
retry_if: ->(_env, _err) { true }
|
238
|
+
}
|
239
|
+
_1.request :retry, retry_options
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
# find connection's httpdisk instance
|
244
|
+
def httpdisk
|
245
|
+
@httpdisk ||= begin
|
246
|
+
app = faraday.app
|
247
|
+
app = app.app until app.is_a?(HTTPDisk::Client)
|
248
|
+
app
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
252
|
+
end
|
data/lib/sinew/csv.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "sterile"
|
3
|
+
|
4
|
+
module Sinew
|
5
|
+
class CSV
|
6
|
+
attr_reader :columns, :count, :csv, :path, :tally
|
7
|
+
|
8
|
+
def initialize(path)
|
9
|
+
@count = 0
|
10
|
+
@csv = nil
|
11
|
+
@path = path
|
12
|
+
end
|
13
|
+
|
14
|
+
# start writing the csv
|
15
|
+
def start(columns)
|
16
|
+
raise "started twice" if started?
|
17
|
+
|
18
|
+
@columns = columns
|
19
|
+
@tally = columns.map { [_1, 0] }.to_h
|
20
|
+
@csv = ::CSV.open(path, "wb").tap do
|
21
|
+
_1 << columns
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# has this csv been started?
|
26
|
+
def started?
|
27
|
+
@csv != nil
|
28
|
+
end
|
29
|
+
|
30
|
+
# append a row
|
31
|
+
def emit(row)
|
32
|
+
# convert row to cols, and construct print (our return value)
|
33
|
+
print = {}
|
34
|
+
row = columns.map do
|
35
|
+
value = normalize(row[_1])
|
36
|
+
if value
|
37
|
+
print[_1] = value
|
38
|
+
tally[_1] += 1
|
39
|
+
end
|
40
|
+
value
|
41
|
+
end
|
42
|
+
@count += 1
|
43
|
+
|
44
|
+
# emit
|
45
|
+
csv << row
|
46
|
+
csv.flush
|
47
|
+
|
48
|
+
# return in case someone wants to pretty print this
|
49
|
+
print
|
50
|
+
end
|
51
|
+
|
52
|
+
ASCII_ONLY = begin
|
53
|
+
chars = (33..126).map(&:chr) - ["&"]
|
54
|
+
/\A[#{Regexp.escape(chars.join)}\s]+\Z/
|
55
|
+
end.freeze
|
56
|
+
|
57
|
+
def normalize(s)
|
58
|
+
# nokogiri/array/misc => string
|
59
|
+
s = if s.respond_to?(:inner_html)
|
60
|
+
s.inner_html
|
61
|
+
elsif s.is_a?(Array)
|
62
|
+
s.join("|")
|
63
|
+
else
|
64
|
+
s.to_s
|
65
|
+
end
|
66
|
+
return if s.empty?
|
67
|
+
|
68
|
+
# simple attempt to strip tags. Note that we replace tags with spaces
|
69
|
+
s = s.gsub(/<[^>]+>/, " ")
|
70
|
+
|
71
|
+
if s !~ ASCII_ONLY
|
72
|
+
# Converts MS Word 'smart punctuation' to ASCII
|
73
|
+
s = Sterile.plain_format(s)
|
74
|
+
|
75
|
+
# &aacute; &amp; etc.
|
76
|
+
s = Sterile.decode_entities(s)
|
77
|
+
|
78
|
+
# "šţɽĩɳģ" => "string"
|
79
|
+
s = Sterile.transliterate(s)
|
80
|
+
end
|
81
|
+
|
82
|
+
# squish
|
83
|
+
s = s.strip.gsub(/\s+/, " ")
|
84
|
+
return if s.empty?
|
85
|
+
|
86
|
+
s
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,98 +1,72 @@
|
|
1
|
-
require 'scripto'
|
2
|
-
require 'sinew/connection'
|
3
|
-
|
4
|
-
#
|
5
|
-
# Main sinew entry point.
|
6
|
-
#
|
7
|
-
|
8
1
|
module Sinew
|
9
|
-
class
|
10
|
-
|
2
|
+
# Helper class used by sinew bin. This exists as an independent class solely
|
3
|
+
# for testing, otherwise it would be built into the bin script.
|
4
|
+
class Main
|
5
|
+
attr_reader :sinew
|
11
6
|
|
12
7
|
def initialize(options)
|
13
|
-
|
8
|
+
options[:output] ||= begin
|
9
|
+
src = options[:recipe]
|
10
|
+
dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
|
11
|
+
dst = dst.sub(%r{^./}, "") # nice to clean this up
|
12
|
+
dst
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
@runtime_options = RuntimeOptions.new
|
15
|
+
@sinew = Sinew::Base.new(options)
|
17
16
|
end
|
18
17
|
|
19
18
|
def run
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
def dsl
|
29
|
-
@dsl ||= DSL.new(self)
|
30
|
-
end
|
31
|
-
|
32
|
-
#
|
33
|
-
# http requests
|
34
|
-
#
|
35
|
-
|
36
|
-
def http(method, url, options = {})
|
37
|
-
request = Request.new(method, url, request_options(options))
|
38
|
-
response = request.perform(connection)
|
39
|
-
|
40
|
-
# always log error messages
|
41
|
-
if response.error?
|
42
|
-
puts "xxx http request failed with #{response.code}"
|
19
|
+
tm = Time.now
|
20
|
+
header if !sinew.options[:silent]
|
21
|
+
recipe = sinew.options[:recipe]
|
22
|
+
dsl = DSL.new(sinew)
|
23
|
+
begin
|
24
|
+
dsl.instance_eval(File.read(recipe, mode: "rb"), recipe)
|
25
|
+
rescue LimitError
|
26
|
+
# ignore - this is flow control for --limit
|
43
27
|
end
|
44
|
-
|
45
|
-
response
|
28
|
+
footer(Time.now - tm) if !sinew.options[:silent]
|
46
29
|
end
|
47
30
|
|
48
|
-
|
49
|
-
@connection ||= Connection.create(options: options, runtime_options: runtime_options)
|
50
|
-
end
|
51
|
-
protected :connection
|
31
|
+
protected
|
52
32
|
|
53
33
|
#
|
54
|
-
#
|
34
|
+
# header/footer
|
55
35
|
#
|
56
36
|
|
57
|
-
def
|
58
|
-
|
37
|
+
def header
|
38
|
+
sinew.banner("Writing to #{sinew.csv.path}...")
|
59
39
|
end
|
60
40
|
|
61
|
-
|
62
|
-
|
63
|
-
|
41
|
+
def footer(elapsed)
|
42
|
+
csv = sinew.csv
|
43
|
+
count = csv.count
|
64
44
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
[ runtime_options.headers, options[:headers]].each do
|
69
|
-
h.merge!(_1) if _1
|
70
|
-
end
|
71
|
-
end
|
72
|
-
req[:proxy] = random_proxy
|
45
|
+
if count == 0
|
46
|
+
sinew.banner(format("Done in %ds. Nothing written.", elapsed))
|
47
|
+
return
|
73
48
|
end
|
74
|
-
end
|
75
|
-
protected :request_options
|
76
|
-
|
77
|
-
PROXY_RE = /\A#{URI::PATTERN::HOST}(:\d+)?\Z/.freeze
|
78
49
|
|
79
|
-
|
80
|
-
|
50
|
+
# summary
|
51
|
+
msg = format("Done in %ds. Wrote %d rows to %s. Summary:", elapsed, count, csv.path)
|
52
|
+
sinew.banner(msg)
|
81
53
|
|
82
|
-
|
83
|
-
|
84
|
-
|
54
|
+
# tally
|
55
|
+
tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
|
56
|
+
len = tally.keys.map { _1.to_s.length }.max
|
57
|
+
fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
|
58
|
+
tally.each do
|
59
|
+
printf(fmt, _1, _2, count, _2 * 100.0 / count)
|
85
60
|
end
|
86
|
-
|
87
|
-
"http://#{proxy}"
|
88
61
|
end
|
89
|
-
protected :random_proxy
|
90
62
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
63
|
+
# simple DSL for .sinew files
|
64
|
+
class DSL
|
65
|
+
attr_reader :sinew
|
66
|
+
|
67
|
+
def initialize(sinew)
|
68
|
+
@sinew = sinew
|
69
|
+
end
|
95
70
|
end
|
96
|
-
protected :footer
|
97
71
|
end
|
98
72
|
end
|
@@ -1,8 +1,9 @@
|
|
1
1
|
module Sinew
|
2
|
-
module
|
2
|
+
module Middleware
|
3
|
+
# Minimalist Formatter that logs proxy if present.
|
3
4
|
class LogFormatter < Faraday::Logging::Formatter
|
4
5
|
def request(env)
|
5
|
-
info(
|
6
|
+
info("req") do
|
6
7
|
# Only log the initial request, not the redirects
|
7
8
|
return if env[:redirect]
|
8
9
|
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,28 +1,19 @@
|
|
1
|
-
require
|
1
|
+
require "nokogiri"
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Nokogiri
|
5
|
+
module XML
|
6
|
+
class NodeSet
|
7
|
+
alias_method :old_inner_html, :inner_html
|
8
|
+
alias_method :old_inner_text, :inner_text
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def inner_html(*args)
|
13
|
-
map { |i| i.inner_html(*args) }.join(' ')
|
14
|
-
end
|
15
|
-
end
|
10
|
+
def inner_text
|
11
|
+
map(&:inner_text).join(" ")
|
12
|
+
end
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
t&.text
|
22
|
-
end
|
23
|
-
end
|
24
|
-
class Nokogiri::XML::NodeSet
|
25
|
-
def text_just_me
|
26
|
-
map(&:text_just_me).join(' ')
|
14
|
+
def inner_html(*args)
|
15
|
+
map { _1.inner_html(*args) }.join(" ")
|
16
|
+
end
|
17
|
+
end
|
27
18
|
end
|
28
19
|
end
|