sinew 3.0.1 → 4.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.github/workflows/test.yml +4 -8
- data/.gitignore +3 -5
- data/.rubocop.yml +13 -48
- data/Gemfile +9 -9
- data/Gemfile.lock +132 -0
- data/LICENSE +1 -1
- data/README.md +113 -48
- data/Rakefile +3 -51
- data/bin/sinew +13 -41
- data/justfile +59 -0
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +252 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +3 -2
- data/lib/sinew/nokogiri_ext.rb +13 -22
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/lib/sinew.rb +23 -9
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +24 -20
- metadata +56 -31
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
data/justfile
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
|
2
|
+
# read gem version
|
3
|
+
gemver := `cat lib/sinew/version.rb | grep -Eo "[0-9]+\.[0-9]+\.[0-9]+"`
|
4
|
+
|
5
|
+
#
|
6
|
+
# dev
|
7
|
+
#
|
8
|
+
|
9
|
+
default: test
|
10
|
+
|
11
|
+
check: lint test
|
12
|
+
|
13
|
+
fmt:
|
14
|
+
bundle exec rubocop -a
|
15
|
+
|
16
|
+
lint:
|
17
|
+
@just banner lint...
|
18
|
+
bundle exec rubocop
|
19
|
+
|
20
|
+
pry:
|
21
|
+
bundle exec pry -I lib -r sinew.rb
|
22
|
+
|
23
|
+
test:
|
24
|
+
@just banner test...
|
25
|
+
bundle exec rake test
|
26
|
+
|
27
|
+
watch:
|
28
|
+
@watchexec --watch lib --watch test --clear bundle exec rake test
|
29
|
+
|
30
|
+
#
|
31
|
+
# ci
|
32
|
+
#
|
33
|
+
|
34
|
+
ci:
|
35
|
+
bundle install
|
36
|
+
just check
|
37
|
+
|
38
|
+
#
|
39
|
+
# gem tasks
|
40
|
+
#
|
41
|
+
|
42
|
+
gem-push: check-git-status
|
43
|
+
@just banner gem build...
|
44
|
+
gem build sinew.gemspec
|
45
|
+
@just banner tag...
|
46
|
+
git tag -a "v{{gemver}}" -m "Tagging {{gemver}}"
|
47
|
+
git push --tags
|
48
|
+
@just banner gem push...
|
49
|
+
gem push "sinew-{{gemver}}.gem"
|
50
|
+
|
51
|
+
#
|
52
|
+
# util
|
53
|
+
#
|
54
|
+
|
55
|
+
banner *ARGS:
|
56
|
+
@printf '\e[42;37;1m[%s] %-72s \e[m\n' "$(date +%H:%M:%S)" "{{ARGS}}"
|
57
|
+
|
58
|
+
check-git-status:
|
59
|
+
@if [ ! -z "$(git status --porcelain)" ]; then echo "git status is dirty, bailing."; exit 1; fi
|
data/lib/sinew/args.rb
ADDED
@@ -0,0 +1,53 @@
|
|
1
|
+
# manually load dependencies here since this is loaded standalone by bin
|
2
|
+
require "httpdisk/slop_duration"
|
3
|
+
require "sinew/version"
|
4
|
+
require "slop"
|
5
|
+
|
6
|
+
#
|
7
|
+
# This is used to parse command line arguments with Slop. We don't set any
|
8
|
+
# defaults in here, relying instead on Sloptions in Sinew::Base. That way
|
9
|
+
# defaults are applied for both command line and embedded usage of Sinew::Base.
|
10
|
+
#
|
11
|
+
|
12
|
+
module Sinew
|
13
|
+
module Args
|
14
|
+
def self.slop(args)
|
15
|
+
slop = Slop.parse(args) do |o|
|
16
|
+
o.banner = "Usage: sinew [options] [recipe.sinew]"
|
17
|
+
o.integer "-l", "--limit", "quit after emitting this many rows"
|
18
|
+
o.string "--proxy", "use host[:port] as HTTP proxy (can be a comma-delimited list)"
|
19
|
+
o.integer "--timeout", "maximum time allowed for the transfer"
|
20
|
+
o.bool "-s", "--silent", "suppress some output"
|
21
|
+
o.bool "-v", "--verbose", "dump emitted rows while running"
|
22
|
+
|
23
|
+
o.separator "From httpdisk:"
|
24
|
+
o.string "--dir", "set custom cache directory"
|
25
|
+
# note: uses slop_duration from HTTPDisk
|
26
|
+
o.duration "--expires", "when to expire cached requests (ex: 1h, 2d, 3w)"
|
27
|
+
o.bool "--force", "don't read anything from cache (but still write)"
|
28
|
+
o.bool "--force-errors", "don't read errors from cache (but still write)"
|
29
|
+
|
30
|
+
# generic
|
31
|
+
o.boolean "--version", "show version" do
|
32
|
+
puts "sinew #{Sinew::VERSION}"
|
33
|
+
exit
|
34
|
+
end
|
35
|
+
o.on("--help", "show this help") do
|
36
|
+
puts o
|
37
|
+
exit
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
41
|
+
# recipe argument
|
42
|
+
recipe = slop.args.first
|
43
|
+
raise Slop::Error, "" if args.empty?
|
44
|
+
raise Slop::Error, "no RECIPE specified" if !recipe
|
45
|
+
raise Slop::Error, "more than one RECIPE specified" if slop.args.length > 1
|
46
|
+
raise Slop::Error, "#{recipe} not found" if !File.exist?(recipe)
|
47
|
+
|
48
|
+
slop.to_h.tap do
|
49
|
+
_1[:recipe] = recipe
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
data/lib/sinew/base.rb
ADDED
@@ -0,0 +1,252 @@
|
|
1
|
+
require "amazing_print"
|
2
|
+
require "faraday-encoding"
|
3
|
+
require "faraday-rate_limiter"
|
4
|
+
require "faraday/logging/formatter"
|
5
|
+
require "faraday/retry"
|
6
|
+
require "httpdisk"
|
7
|
+
|
8
|
+
module Sinew
|
9
|
+
# Sinew base class, for use in standalone scripts or via the sinew binary.
|
10
|
+
class Base
|
11
|
+
attr_reader :csv, :mutex, :options
|
12
|
+
|
13
|
+
def initialize(opts = {})
|
14
|
+
@mutex = Mutex.new
|
15
|
+
|
16
|
+
#
|
17
|
+
# defaults for Sloptions
|
18
|
+
#
|
19
|
+
|
20
|
+
# default :rate_limit, typically 1
|
21
|
+
default_rate_limit = ENV["SINEW_TEST"] ? 0 : 1
|
22
|
+
|
23
|
+
#
|
24
|
+
# note: uses HTTPDisk::Sloptions
|
25
|
+
#
|
26
|
+
|
27
|
+
@options = HTTPDisk::Sloptions.parse(opts) do
|
28
|
+
# cli
|
29
|
+
_1.integer :limit
|
30
|
+
_1.integer :timeout, default: 30
|
31
|
+
_1.boolean :silent
|
32
|
+
_1.on :proxy, type: [:string, Array]
|
33
|
+
_1.boolean :verbose
|
34
|
+
|
35
|
+
# httpdisk
|
36
|
+
_1.string :dir, default: File.join(ENV["HOME"], ".sinew")
|
37
|
+
_1.integer :expires
|
38
|
+
_1.boolean :force
|
39
|
+
_1.boolean :force_errors
|
40
|
+
_1.array :ignore_params
|
41
|
+
|
42
|
+
# more runtime options
|
43
|
+
_1.hash :headers
|
44
|
+
_1.boolean :insecure
|
45
|
+
_1.string :output, required: true
|
46
|
+
_1.hash :params
|
47
|
+
_1.float :rate_limit, default: default_rate_limit
|
48
|
+
_1.integer :retries, default: 2
|
49
|
+
_1.on :url_prefix, type: [:string, URI]
|
50
|
+
_1.boolean :utf8, default: true
|
51
|
+
end
|
52
|
+
|
53
|
+
@csv = CSV.new(opts[:output])
|
54
|
+
end
|
55
|
+
|
56
|
+
#
|
57
|
+
# requests
|
58
|
+
#
|
59
|
+
|
60
|
+
# http get, returns a Response
|
61
|
+
def get(url, params = nil, headers = nil)
|
62
|
+
faraday_response = faraday.get(url, params, headers) do
|
63
|
+
_1.options[:proxy] = random_proxy
|
64
|
+
end
|
65
|
+
Response.new(faraday_response)
|
66
|
+
end
|
67
|
+
|
68
|
+
# http post, returns a Response. Defaults to form body type.
|
69
|
+
def post(url, body = nil, headers = nil)
|
70
|
+
faraday_response = faraday.post(url, body, headers) do
|
71
|
+
_1.options[:proxy] = random_proxy
|
72
|
+
end
|
73
|
+
Response.new(faraday_response)
|
74
|
+
end
|
75
|
+
|
76
|
+
# http post json, returns a Response
|
77
|
+
def post_json(url, body = nil, headers = nil)
|
78
|
+
body = body.to_json
|
79
|
+
headers = (headers || {}).merge("Content-Type" => "application/json")
|
80
|
+
post(url, body, headers)
|
81
|
+
end
|
82
|
+
|
83
|
+
# Faraday connection for this recipe
|
84
|
+
def faraday
|
85
|
+
mutex.synchronize do
|
86
|
+
@faraday ||= create_faraday
|
87
|
+
end
|
88
|
+
end
|
89
|
+
|
90
|
+
#
|
91
|
+
# httpdisk
|
92
|
+
#
|
93
|
+
|
94
|
+
# Returns true if request is cached. Defaults to form body type.
|
95
|
+
def cached?(method, url, params = nil, body = nil)
|
96
|
+
status = status(method, url, params, body)
|
97
|
+
status[:status] != "miss"
|
98
|
+
end
|
99
|
+
|
100
|
+
# Remove cache file, if any. Defaults to form body type.
|
101
|
+
def uncache(method, url, params = nil, body = nil)
|
102
|
+
status = status(method, url, params, body)
|
103
|
+
path = status[:path]
|
104
|
+
File.unlink(path) if File.exist?(path)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Check httpdisk status for this request. Defaults to form body type.
|
108
|
+
def status(method, url, params = nil, body = nil)
|
109
|
+
# if hash, default to url encoded form
|
110
|
+
# see lib/faraday/request/url_encoded.rb
|
111
|
+
if body.is_a?(Hash)
|
112
|
+
body = Faraday::Utils::ParamsHash[body].to_query
|
113
|
+
end
|
114
|
+
|
115
|
+
env = Faraday::Env.new.tap do
|
116
|
+
_1.method = method.to_s.downcase.to_sym
|
117
|
+
_1.request_headers = {}
|
118
|
+
_1.request_body = body
|
119
|
+
_1.url = faraday.build_url(url, params)
|
120
|
+
end
|
121
|
+
httpdisk.status(env)
|
122
|
+
end
|
123
|
+
|
124
|
+
#
|
125
|
+
# csv
|
126
|
+
#
|
127
|
+
|
128
|
+
# Output a csv header. This usually happens automatically, but you can call
|
129
|
+
# this method directly to ensure a consistent set of columns.
|
130
|
+
def csv_header(*columns)
|
131
|
+
csv.start(columns.flatten)
|
132
|
+
end
|
133
|
+
|
134
|
+
# Output a csv row. Row should be any object that can turn into a hash - a
|
135
|
+
# hash, OpenStruct, etc.
|
136
|
+
def csv_emit(row)
|
137
|
+
row = row.to_h
|
138
|
+
mutex.synchronize do
|
139
|
+
# header if necessary
|
140
|
+
csv_header(row.keys) if !csv.started?
|
141
|
+
|
142
|
+
# emit
|
143
|
+
print = csv.emit(row)
|
144
|
+
puts print.ai if options[:verbose]
|
145
|
+
|
146
|
+
# this is caught by Sinew::Main
|
147
|
+
if csv.count == options[:limit]
|
148
|
+
raise LimitError
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
152
|
+
|
153
|
+
#
|
154
|
+
# stdout
|
155
|
+
#
|
156
|
+
|
157
|
+
RESET = "\e[0m".freeze
|
158
|
+
RED = "\e[1;37;41m".freeze
|
159
|
+
GREEN = "\e[1;37;42m".freeze
|
160
|
+
|
161
|
+
# Print a nice green banner.
|
162
|
+
def banner(msg, color: GREEN)
|
163
|
+
msg = "#{msg} ".ljust(72, " ")
|
164
|
+
msg = "[#{Time.new.strftime("%H:%M:%S")}] #{msg}"
|
165
|
+
msg = "#{color}#{msg}#{RESET}" if $stdout.tty?
|
166
|
+
puts msg
|
167
|
+
end
|
168
|
+
|
169
|
+
# Print a scary red banner and exit.
|
170
|
+
def fatal(msg)
|
171
|
+
banner(msg, color: RED)
|
172
|
+
exit 1
|
173
|
+
end
|
174
|
+
|
175
|
+
protected
|
176
|
+
|
177
|
+
# Return a random proxy.
|
178
|
+
def random_proxy
|
179
|
+
return if !options[:proxy]
|
180
|
+
|
181
|
+
proxies = options[:proxy]
|
182
|
+
proxies = proxies.split(",") if !proxies.is_a?(Array)
|
183
|
+
proxies.sample
|
184
|
+
end
|
185
|
+
|
186
|
+
# Create the Faraday connection for making requests.
|
187
|
+
def create_faraday
|
188
|
+
faraday_options = options.slice(:headers, :params)
|
189
|
+
if options[:insecure]
|
190
|
+
faraday_options[:ssl] = {verify: false}
|
191
|
+
end
|
192
|
+
Faraday.new(nil, faraday_options) do
|
193
|
+
# options
|
194
|
+
if options[:url_prefix]
|
195
|
+
_1.url_prefix = options[:url_prefix]
|
196
|
+
end
|
197
|
+
_1.options.timeout = options[:timeout]
|
198
|
+
|
199
|
+
#
|
200
|
+
# middleware that runs on both disk/network requests
|
201
|
+
#
|
202
|
+
|
203
|
+
# cookie middleware
|
204
|
+
_1.use :cookie_jar
|
205
|
+
|
206
|
+
# auto-encode form bodies
|
207
|
+
_1.request :url_encoded
|
208
|
+
|
209
|
+
# Before httpdisk so each redirect segment is cached
|
210
|
+
# Keep track of redirect status for logger
|
211
|
+
_1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
|
212
|
+
|
213
|
+
#
|
214
|
+
# httpdisk
|
215
|
+
#
|
216
|
+
|
217
|
+
httpdisk_options = options.slice(:dir, :expires, :force, :force_errors, :ignore_params, :utf8)
|
218
|
+
_1.use :httpdisk, httpdisk_options
|
219
|
+
|
220
|
+
#
|
221
|
+
# middleware below is only used if httpdisk uses the network
|
222
|
+
#
|
223
|
+
|
224
|
+
# rate limit
|
225
|
+
rate_limit = options[:rate_limit]
|
226
|
+
_1.request :rate_limiter, interval: rate_limit
|
227
|
+
|
228
|
+
# After httpdisk so that only non-cached requests are logged.
|
229
|
+
# Before retry so that we don't log each retry attempt.
|
230
|
+
_1.response :logger, nil, formatter: Middleware::LogFormatter if !options[:silent]
|
231
|
+
|
232
|
+
retry_options = {
|
233
|
+
max_interval: rate_limit, # very important, negates Retry-After: 86400
|
234
|
+
max: options[:retries],
|
235
|
+
methods: %w[delete get head options patch post put trace],
|
236
|
+
retry_statuses: (500..600).to_a,
|
237
|
+
retry_if: ->(_env, _err) { true }
|
238
|
+
}
|
239
|
+
_1.request :retry, retry_options
|
240
|
+
end
|
241
|
+
end
|
242
|
+
|
243
|
+
# find connection's httpdisk instance
|
244
|
+
def httpdisk
|
245
|
+
@httpdisk ||= begin
|
246
|
+
app = faraday.app
|
247
|
+
app = app.app until app.is_a?(HTTPDisk::Client)
|
248
|
+
app
|
249
|
+
end
|
250
|
+
end
|
251
|
+
end
|
252
|
+
end
|
data/lib/sinew/csv.rb
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
require "csv"
|
2
|
+
require "sterile"
|
3
|
+
|
4
|
+
module Sinew
|
5
|
+
class CSV
|
6
|
+
attr_reader :columns, :count, :csv, :path, :tally
|
7
|
+
|
8
|
+
def initialize(path)
|
9
|
+
@count = 0
|
10
|
+
@csv = nil
|
11
|
+
@path = path
|
12
|
+
end
|
13
|
+
|
14
|
+
# start writing the csv
|
15
|
+
def start(columns)
|
16
|
+
raise "started twice" if started?
|
17
|
+
|
18
|
+
@columns = columns
|
19
|
+
@tally = columns.map { [_1, 0] }.to_h
|
20
|
+
@csv = ::CSV.open(path, "wb").tap do
|
21
|
+
_1 << columns
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# has this csv been started?
|
26
|
+
def started?
|
27
|
+
@csv != nil
|
28
|
+
end
|
29
|
+
|
30
|
+
# append a row
|
31
|
+
def emit(row)
|
32
|
+
# convert row to cols, and construct print (our return value)
|
33
|
+
print = {}
|
34
|
+
row = columns.map do
|
35
|
+
value = normalize(row[_1])
|
36
|
+
if value
|
37
|
+
print[_1] = value
|
38
|
+
tally[_1] += 1
|
39
|
+
end
|
40
|
+
value
|
41
|
+
end
|
42
|
+
@count += 1
|
43
|
+
|
44
|
+
# emit
|
45
|
+
csv << row
|
46
|
+
csv.flush
|
47
|
+
|
48
|
+
# return in case someone wants to pretty print this
|
49
|
+
print
|
50
|
+
end
|
51
|
+
|
52
|
+
ASCII_ONLY = begin
|
53
|
+
chars = (33..126).map(&:chr) - ["&"]
|
54
|
+
/\A[#{Regexp.escape(chars.join)}\s]+\Z/
|
55
|
+
end.freeze
|
56
|
+
|
57
|
+
def normalize(s)
|
58
|
+
# nokogiri/array/misc => string
|
59
|
+
s = if s.respond_to?(:inner_html)
|
60
|
+
s.inner_html
|
61
|
+
elsif s.is_a?(Array)
|
62
|
+
s.join("|")
|
63
|
+
else
|
64
|
+
s.to_s
|
65
|
+
end
|
66
|
+
return if s.empty?
|
67
|
+
|
68
|
+
# simple attempt to strip tags. Note that we replace tags with spaces
|
69
|
+
s = s.gsub(/<[^>]+>/, " ")
|
70
|
+
|
71
|
+
if s !~ ASCII_ONLY
|
72
|
+
# Converts MS Word 'smart punctuation' to ASCII
|
73
|
+
s = Sterile.plain_format(s)
|
74
|
+
|
75
|
+
# decode HTML entities: &aacute; &amp; etc.
|
76
|
+
s = Sterile.decode_entities(s)
|
77
|
+
|
78
|
+
# "šţɽĩɳģ" => "string"
|
79
|
+
s = Sterile.transliterate(s)
|
80
|
+
end
|
81
|
+
|
82
|
+
# squish
|
83
|
+
s = s.strip.gsub(/\s+/, " ")
|
84
|
+
return if s.empty?
|
85
|
+
|
86
|
+
s
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,98 +1,72 @@
|
|
1
|
-
require 'scripto'
|
2
|
-
require 'sinew/connection'
|
3
|
-
|
4
|
-
#
|
5
|
-
# Main sinew entry point.
|
6
|
-
#
|
7
|
-
|
8
1
|
module Sinew
|
9
|
-
class
|
10
|
-
|
2
|
+
# Helper class used by sinew bin. This exists as an independent class solely
|
3
|
+
# for testing, otherwise it would be built into the bin script.
|
4
|
+
class Main
|
5
|
+
attr_reader :sinew
|
11
6
|
|
12
7
|
def initialize(options)
|
13
|
-
|
8
|
+
options[:output] ||= begin
|
9
|
+
src = options[:recipe]
|
10
|
+
dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
|
11
|
+
dst = dst.sub(%r{\A\./}, "") # strip only a literal leading "./" (the dot must be escaped, or "a/foo.csv" would lose its directory)
|
12
|
+
dst
|
13
|
+
end
|
14
14
|
|
15
|
-
|
16
|
-
@runtime_options = RuntimeOptions.new
|
15
|
+
@sinew = Sinew::Base.new(options)
|
17
16
|
end
|
18
17
|
|
19
18
|
def run
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
def dsl
|
29
|
-
@dsl ||= DSL.new(self)
|
30
|
-
end
|
31
|
-
|
32
|
-
#
|
33
|
-
# http requests
|
34
|
-
#
|
35
|
-
|
36
|
-
def http(method, url, options = {})
|
37
|
-
request = Request.new(method, url, request_options(options))
|
38
|
-
response = request.perform(connection)
|
39
|
-
|
40
|
-
# always log error messages
|
41
|
-
if response.error?
|
42
|
-
puts "xxx http request failed with #{response.code}"
|
19
|
+
tm = Time.now
|
20
|
+
header if !sinew.options[:silent]
|
21
|
+
recipe = sinew.options[:recipe]
|
22
|
+
dsl = DSL.new(sinew)
|
23
|
+
begin
|
24
|
+
dsl.instance_eval(File.read(recipe, mode: "rb"), recipe)
|
25
|
+
rescue LimitError
|
26
|
+
# ignore - this is flow control for --limit
|
43
27
|
end
|
44
|
-
|
45
|
-
response
|
28
|
+
footer(Time.now - tm) if !sinew.options[:silent]
|
46
29
|
end
|
47
30
|
|
48
|
-
|
49
|
-
@connection ||= Connection.create(options: options, runtime_options: runtime_options)
|
50
|
-
end
|
51
|
-
protected :connection
|
31
|
+
protected
|
52
32
|
|
53
33
|
#
|
54
|
-
#
|
34
|
+
# header/footer
|
55
35
|
#
|
56
36
|
|
57
|
-
def
|
58
|
-
|
37
|
+
def header
|
38
|
+
sinew.banner("Writing to #{sinew.csv.path}...")
|
59
39
|
end
|
60
40
|
|
61
|
-
|
62
|
-
|
63
|
-
|
41
|
+
def footer(elapsed)
|
42
|
+
csv = sinew.csv
|
43
|
+
count = csv.count
|
64
44
|
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
[ runtime_options.headers, options[:headers]].each do
|
69
|
-
h.merge!(_1) if _1
|
70
|
-
end
|
71
|
-
end
|
72
|
-
req[:proxy] = random_proxy
|
45
|
+
if count == 0
|
46
|
+
sinew.banner(format("Done in %ds. Nothing written.", elapsed))
|
47
|
+
return
|
73
48
|
end
|
74
|
-
end
|
75
|
-
protected :request_options
|
76
|
-
|
77
|
-
PROXY_RE = /\A#{URI::PATTERN::HOST}(:\d+)?\Z/.freeze
|
78
49
|
|
79
|
-
|
80
|
-
|
50
|
+
# summary
|
51
|
+
msg = format("Done in %ds. Wrote %d rows to %s. Summary:", elapsed, count, csv.path)
|
52
|
+
sinew.banner(msg)
|
81
53
|
|
82
|
-
|
83
|
-
|
84
|
-
|
54
|
+
# tally
|
55
|
+
tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
|
56
|
+
len = tally.keys.map { _1.to_s.length }.max
|
57
|
+
fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
|
58
|
+
tally.each do
|
59
|
+
printf(fmt, _1, _2, count, _2 * 100.0 / count)
|
85
60
|
end
|
86
|
-
|
87
|
-
"http://#{proxy}"
|
88
61
|
end
|
89
|
-
protected :random_proxy
|
90
62
|
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
63
|
+
# simple DSL for .sinew files
|
64
|
+
class DSL
|
65
|
+
attr_reader :sinew
|
66
|
+
|
67
|
+
def initialize(sinew)
|
68
|
+
@sinew = sinew
|
69
|
+
end
|
95
70
|
end
|
96
|
-
protected :footer
|
97
71
|
end
|
98
72
|
end
|
data/lib/sinew/{connection → middleware}/log_formatter.rb
@@ -1,8 +1,9 @@
|
|
1
1
|
module Sinew
|
2
|
-
module
|
2
|
+
module Middleware
|
3
|
+
# Minimalist Formatter that logs proxy if present.
|
3
4
|
class LogFormatter < Faraday::Logging::Formatter
|
4
5
|
def request(env)
|
5
|
-
info(
|
6
|
+
info("req") do
|
6
7
|
# Only log the initial request, not the redirects
|
7
8
|
return if env[:redirect]
|
8
9
|
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,28 +1,19 @@
|
|
1
|
-
require
|
1
|
+
require "nokogiri"
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
|
-
|
5
|
-
|
6
|
-
|
4
|
+
module Nokogiri
|
5
|
+
module XML
|
6
|
+
class NodeSet
|
7
|
+
alias_method :old_inner_html, :inner_html
|
8
|
+
alias_method :old_inner_text, :inner_text
|
7
9
|
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
def inner_html(*args)
|
13
|
-
map { |i| i.inner_html(*args) }.join(' ')
|
14
|
-
end
|
15
|
-
end
|
10
|
+
def inner_text
|
11
|
+
map(&:inner_text).join(" ")
|
12
|
+
end
|
16
13
|
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
t&.text
|
22
|
-
end
|
23
|
-
end
|
24
|
-
class Nokogiri::XML::NodeSet
|
25
|
-
def text_just_me
|
26
|
-
map(&:text_just_me).join(' ')
|
14
|
+
def inner_html(*args)
|
15
|
+
map { _1.inner_html(*args) }.join(" ")
|
16
|
+
end
|
17
|
+
end
|
27
18
|
end
|
28
19
|
end
|