sinew 3.0.1 → 4.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/justfile ADDED
@@ -0,0 +1,59 @@
1
+
2
# Read the gem version (first x.y.z string) straight out of version.rb.
# grep reads the file directly; no need to pipe it through cat.
gemver := `grep -Eo "[0-9]+\.[0-9]+\.[0-9]+" lib/sinew/version.rb`

#
# dev
#

# Running `just` with no arguments runs the tests.
default: test

# Lint, then test.
check: lint test

# Auto-correct rubocop offenses.
fmt:
  bundle exec rubocop -a

# Run rubocop without modifying anything.
lint:
  @just banner lint...
  bundle exec rubocop

# Open a pry session with sinew loaded from lib.
pry:
  bundle exec pry -I lib -r sinew.rb

# Run the test suite.
test:
  @just banner test...
  bundle exec rake test

# Re-run the tests whenever lib or test changes.
watch:
  @watchexec --watch lib --watch test --clear bundle exec rake test

#
# ci
#

# Install dependencies, then lint and test.
ci:
  bundle install
  just check

#
# gem tasks
#

# Build the gem, tag the release, push tags, then push the gem.
# Refuses to run when the working tree is dirty (see check-git-status).
gem-push: check-git-status
  @just banner gem build...
  gem build sinew.gemspec
  @just banner tag...
  git tag -a "v{{gemver}}" -m "Tagging {{gemver}}"
  git push --tags
  @just banner gem push...
  gem push "sinew-{{gemver}}.gem"

#
# util
#

# Print a green, timestamped banner containing ARGS.
banner *ARGS:
  @printf '\e[42;37;1m[%s] %-72s \e[m\n' "$(date +%H:%M:%S)" "{{ARGS}}"

# Exit 1 if `git status --porcelain` reports anything.
check-git-status:
  @if [ -n "$(git status --porcelain)" ]; then echo "git status is dirty, bailing."; exit 1; fi
data/lib/sinew/args.rb ADDED
@@ -0,0 +1,53 @@
1
+ # manually load dependencies here since this is loaded standalone by bin
2
+ require "httpdisk/slop_duration"
3
+ require "sinew/version"
4
+ require "slop"
5
+
6
+ #
7
+ # This is used to parse command line arguments with Slop. We don't set any
8
+ # defaults in here, relying instead on Sloptions in Sinew::Base. That way
9
+ # defaults are applied for both command line and embedded usage of Sinew::Base.
10
+ #
11
+
12
module Sinew
  # Command line argument parsing for the sinew binary.
  module Args
    # Parse +args+ with Slop and return the parsed options as a Hash, with the
    # recipe path added under :recipe.
    #
    # Raises Slop::Error when +args+ is empty (with an empty message), when
    # no recipe or more than one recipe is given, or when the recipe file
    # does not exist. --version and --help print and exit.
    def self.slop(args)
      parsed = Slop.parse(args) do |opts|
        opts.banner = "Usage: sinew [options] [recipe.sinew]"
        opts.integer "-l", "--limit", "quit after emitting this many rows"
        opts.string "--proxy", "use host[:port] as HTTP proxy (can be a comma-delimited list)"
        opts.integer "--timeout", "maximum time allowed for the transfer"
        opts.bool "-s", "--silent", "suppress some output"
        opts.bool "-v", "--verbose", "dump emitted rows while running"

        opts.separator "From httpdisk:"
        opts.string "--dir", "set custom cache directory"
        # note: the duration option type comes from httpdisk/slop_duration
        opts.duration "--expires", "when to expire cached requests (ex: 1h, 2d, 3w)"
        opts.bool "--force", "don't read anything from cache (but still write)"
        opts.bool "--force-errors", "don't read errors from cache (but still write)"

        # generic
        opts.boolean "--version", "show version" do
          puts "sinew #{Sinew::VERSION}"
          exit
        end
        opts.on("--help", "show this help") do
          puts opts
          exit
        end
      end

      # whatever is left over after option parsing should be exactly one recipe
      leftovers = parsed.args
      recipe = leftovers.first
      raise Slop::Error, "" if args.empty?
      raise Slop::Error, "no RECIPE specified" unless recipe
      raise Slop::Error, "more than one RECIPE specified" if leftovers.length > 1
      raise Slop::Error, "#{recipe} not found" unless File.exist?(recipe)

      result = parsed.to_h
      result[:recipe] = recipe
      result
    end
  end
end
data/lib/sinew/base.rb ADDED
@@ -0,0 +1,252 @@
1
+ require "amazing_print"
2
+ require "faraday-encoding"
3
+ require "faraday-rate_limiter"
4
+ require "faraday/logging/formatter"
5
+ require "faraday/retry"
6
+ require "httpdisk"
7
+
8
+ module Sinew
9
+ # Sinew base class, for use in standalone scripts or via the sinew binary.
10
+ class Base
11
+ attr_reader :csv, :mutex, :options
12
+
13
# Build a Sinew instance from +opts+. Options are declared and validated
# with HTTPDisk::Sloptions (:output is required); the result is available
# through #options. The csv writer is created from opts[:output].
def initialize(opts = {})
  @mutex = Mutex.new

  # default :rate_limit is 1, or 0 when SINEW_TEST is set
  rate_limit_default = ENV["SINEW_TEST"] ? 0 : 1

  # note: uses HTTPDisk::Sloptions
  @options = HTTPDisk::Sloptions.parse(opts) do |o|
    # cli
    o.integer :limit
    o.integer :timeout, default: 30
    o.boolean :silent
    o.on :proxy, type: [:string, Array]
    o.boolean :verbose

    # httpdisk
    o.string :dir, default: File.join(ENV["HOME"], ".sinew")
    o.integer :expires
    o.boolean :force
    o.boolean :force_errors
    o.array :ignore_params

    # more runtime options
    o.hash :headers
    o.boolean :insecure
    o.string :output, required: true
    o.hash :params
    o.float :rate_limit, default: rate_limit_default
    o.integer :retries, default: 2
    o.on :url_prefix, type: [:string, URI]
    o.boolean :utf8, default: true
  end

  @csv = CSV.new(opts[:output])
end
55
+
56
+ #
57
+ # requests
58
+ #
59
+
60
+ # http get, returns a Response
61
# Perform an http GET through the Faraday connection, picking a proxy via
# random_proxy for this request. Returns the result wrapped in a Response.
def get(url, params = nil, headers = nil)
  raw = faraday.get(url, params, headers) { |req| req.options[:proxy] = random_proxy }
  Response.new(raw)
end
67
+
68
+ # http post, returns a Response. Defaults to form body type.
69
# Perform an http POST through the Faraday connection, picking a proxy via
# random_proxy for this request. Returns the result wrapped in a Response.
def post(url, body = nil, headers = nil)
  raw = faraday.post(url, body, headers) { |req| req.options[:proxy] = random_proxy }
  Response.new(raw)
end
75
+
76
+ # http post json, returns a Response
77
# Perform an http POST with +body+ serialized via to_json and the
# Content-Type header set to application/json (overriding any Content-Type
# passed in +headers+). Delegates to #post and returns its result.
def post_json(url, body = nil, headers = nil)
  json_headers = (headers || {}).merge("Content-Type" => "application/json")
  post(url, body.to_json, json_headers)
end
82
+
83
+ # Faraday connection for this recipe
84
# The Faraday connection for this recipe. Created on first use via
# create_faraday and memoized; creation is guarded by the instance mutex.
def faraday
  mutex.synchronize { @faraday ||= create_faraday }
end
89
+
90
+ #
91
+ # httpdisk
92
+ #
93
+
94
+ # Returns true if request is cached. Defaults to form body type.
95
# True unless #status reports "miss" under :status for this request.
# Arguments are passed straight through to #status.
def cached?(method, url, params = nil, body = nil)
  status(method, url, params, body)[:status] != "miss"
end
99
+
100
+ # Remove cache file, if any. Defaults to form body type.
101
# Remove the cache file for this request, if any. The path is taken from
# the :path entry of #status; arguments are passed straight through.
#
# Returns the result of File.unlink when a file was removed, nil when there
# was nothing to remove.
def uncache(method, url, params = nil, body = nil)
  path = status(method, url, params, body)[:path]
  begin
    # Unlink directly instead of checking File.exist? first, so a file that
    # vanishes between check and unlink (e.g. removed by another thread)
    # does not raise.
    File.unlink(path)
  rescue Errno::ENOENT
    nil
  end
end
106
+
107
+ # Check httpdisk status for this request. Defaults to form body type.
108
# Ask httpdisk for the status of this request. A Faraday::Env is built from
# the method, url/params and body, then handed to httpdisk.status.
def status(method, url, params = nil, body = nil)
  # a Hash body defaults to a url encoded form
  # see lib/faraday/request/url_encoded.rb
  body = Faraday::Utils::ParamsHash[body].to_query if body.is_a?(Hash)

  env = Faraday::Env.new
  env.method = method.to_s.downcase.to_sym
  env.request_headers = {}
  env.request_body = body
  env.url = faraday.build_url(url, params)

  httpdisk.status(env)
end
123
+
124
+ #
125
+ # csv
126
+ #
127
+
128
+ # Output a csv header. This usually happens automatically, but you can call
129
+ # this method directly to ensure a consistent set of columns.
130
# Start the csv with the given columns. Columns may be passed individually
# or as (nested) arrays; they are flattened before being handed to csv.start.
def csv_header(*columns)
  names = columns.flatten
  csv.start(names)
end
133
+
134
+ # Output a csv row. Row should be any object that can turn into a hash - a
135
+ # hash, OpenStruct, etc.
136
# Output a csv row. +row+ is converted with to_h. The whole operation runs
# under the instance mutex:
#
# * the header is written from the row's keys if the csv is not started yet
# * the row is emitted, and pretty printed when options[:verbose] is set
# * LimitError is raised once a positive options[:limit] has been reached
#
# Raises LimitError (the original comment notes this is caught by
# Sinew::Main).
def csv_emit(row)
  row = row.to_h
  mutex.synchronize do
    # header if necessary
    csv_header(row.keys) unless csv.started?

    # emit
    emitted = csv.emit(row)
    puts emitted.ai if options[:verbose]

    # Use >= rather than == so the limit keeps firing if the count has
    # already moved past it (e.g. an earlier LimitError was rescued, or
    # another thread emitted in the meantime). A nil or non-positive limit
    # never triggers, as before.
    limit = options[:limit]
    raise LimitError if limit&.positive? && csv.count >= limit
  end
end
152
+
153
+ #
154
+ # stdout
155
+ #
156
+
157
# ANSI escape sequences used by #banner and #fatal.
RESET = "\e[0m".freeze
RED = "\e[1;37;41m".freeze
GREEN = "\e[1;37;42m".freeze

# Print a timestamped banner line. The message is padded to 72 columns and
# prefixed with [HH:MM:SS]. The line is wrapped in +color+ (green by
# default) and RESET only when $stdout is a tty.
def banner(msg, color: GREEN)
  stamp = Time.new.strftime("%H:%M:%S")
  line = "[#{stamp}] " + "#{msg} ".ljust(72, " ")
  line = "#{color}#{line}#{RESET}" if $stdout.tty?
  puts line
end
168
+
169
+ # Print a scary red banner and exit.
170
# Print +msg+ as a red banner, then terminate the process with exit status 1.
def fatal(msg)
  banner(msg, color: RED)
  exit(1)
end
174
+
175
+ protected
176
+
177
+ # Return a random proxy.
178
# Return a random proxy from options[:proxy], or nil when none is
# configured.
#
# options[:proxy] may be an Array (sampled as is) or a comma-delimited
# String. For a String, whitespace around each entry is stripped and empty
# entries are dropped, so "a:1, b:2," yields "a:1" or "b:2" rather than
# " b:2" or "".
def random_proxy
  proxies = options[:proxy]
  return unless proxies

  unless proxies.is_a?(Array)
    proxies = proxies.split(",").map(&:strip).reject(&:empty?)
  end
  proxies.sample
end
185
+
186
+ # Create the Faraday connection for making requests.
187
# Create the Faraday connection for making requests. Connection level
# options come from options[:headers], options[:params] and, when
# options[:insecure] is set, ssl verification is turned off. The order in
# which middleware is registered below is significant.
def create_faraday
  connection_options = options.slice(:headers, :params)
  connection_options[:ssl] = {verify: false} if options[:insecure]

  Faraday.new(nil, connection_options) do |conn|
    # options
    conn.url_prefix = options[:url_prefix] if options[:url_prefix]
    conn.options.timeout = options[:timeout]

    #
    # middleware that runs on both disk/network requests
    #

    # cookie middleware
    conn.use :cookie_jar

    # auto-encode form bodies
    conn.request :url_encoded

    # Before httpdisk so each redirect segment is cached
    # Keep track of redirect status for logger
    conn.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }

    #
    # httpdisk
    #

    conn.use :httpdisk, options.slice(:dir, :expires, :force, :force_errors, :ignore_params, :utf8)

    #
    # middleware below is only used if httpdisk uses the network
    #

    # rate limit
    interval = options[:rate_limit]
    conn.request :rate_limiter, interval: interval

    # After httpdisk so that only non-cached requests are logged.
    # Before retry so that we don't log each retry attempt.
    conn.response :logger, nil, formatter: Middleware::LogFormatter unless options[:silent]

    conn.request :retry, {
      max_interval: interval, # very important, negates Retry-After: 86400
      max: options[:retries],
      methods: %w[delete get head options patch post put trace],
      retry_statuses: (500..600).to_a,
      retry_if: ->(_env, _err) { true }
    }
  end
end
242
+
243
+ # find connection's httpdisk instance
244
# Find the connection's HTTPDisk::Client instance by walking the chain of
# #app references starting at faraday.app. The result is memoized.
def httpdisk
  return @httpdisk if @httpdisk

  layer = faraday.app
  layer = layer.app until layer.is_a?(HTTPDisk::Client)
  @httpdisk = layer
end
251
+ end
252
+ end
data/lib/sinew/csv.rb ADDED
@@ -0,0 +1,89 @@
1
+ require "csv"
2
+ require "sterile"
3
+
4
module Sinew
  # Writes rows to a csv file at +path+. Values are normalized to squished,
  # tag-free strings before being written. Keeps a running row count and a
  # per-column tally of non-empty values.
  class CSV
    attr_reader :columns, :count, :csv, :path, :tally

    # Nothing is opened until #start is called.
    def initialize(path)
      @count = 0
      @csv = nil
      @path = path
    end

    # Start writing the csv: open the file at path and write +columns+ as the
    # header row. Raises if called more than once. Returns the underlying
    # ::CSV writer.
    def start(columns)
      raise "started twice" if started?

      @columns = columns
      @tally = columns.to_h { |col| [col, 0] }
      writer = ::CSV.open(path, "wb")
      writer << columns
      @csv = writer
    end

    # Has this csv been started?
    def started?
      !@csv.nil?
    end

    # Append a row. +row+ is indexed by each column; values are normalized,
    # written and flushed. Returns a hash of just the non-empty normalized
    # values, in case someone wants to pretty print it.
    def emit(row)
      printable = {}
      values = columns.map do |col|
        normalized = normalize(row[col])
        if normalized
          printable[col] = normalized
          tally[col] += 1
        end
        normalized
      end
      @count += 1

      csv << values
      csv.flush

      printable
    end

    # Matches strings made up only of whitespace and ASCII characters 33..126
    # other than "&". Such strings skip the Sterile conversions in #normalize.
    ASCII_ONLY = begin
      allowed = (33..126).map(&:chr) - ["&"]
      /\A[#{Regexp.escape(allowed.join)}\s]+\Z/
    end.freeze

    # Turn a value into a clean string for the csv, or nil if nothing is
    # left. Objects responding to inner_html use that, arrays are joined with
    # "|", everything else uses to_s.
    def normalize(s)
      text =
        if s.respond_to?(:inner_html)
          s.inner_html
        elsif s.is_a?(Array)
          s.join("|")
        else
          s.to_s
        end
      return if text.empty?

      # simple attempt to strip tags. Note that we replace tags with spaces
      text = text.gsub(/<[^>]+>/, " ")

      unless ASCII_ONLY.match?(text)
        # Converts MS Word 'smart punctuation' to ASCII
        text = Sterile.plain_format(text)
        # &aacute; &amp; etc.
        text = Sterile.decode_entities(text)
        # "šţɽĩɳģ" => "string"
        text = Sterile.transliterate(text)
      end

      # squish
      text = text.strip.gsub(/\s+/, " ")
      text.empty? ? nil : text
    end
  end
end
data/lib/sinew/main.rb CHANGED
@@ -1,98 +1,72 @@
1
- require 'scripto'
2
- require 'sinew/connection'
3
-
4
- #
5
- # Main sinew entry point.
6
- #
7
-
8
1
  module Sinew
9
- class Main < Scripto::Main
10
- attr_reader :runtime_options
2
+ # Helper class used by sinew bin. This exists as an independent class solely
3
+ # for testing, otherwise it would be built into the bin script.
4
+ class Main
5
+ attr_reader :sinew
11
6
 
12
7
  def initialize(options)
13
- super(options)
8
+ options[:output] ||= begin
9
+ src = options[:recipe]
10
+ dst = File.join(File.dirname(src), "#{File.basename(src, File.extname(src))}.csv")
11
+ dst = dst.sub(%r{^./}, "") # nice to clean this up
12
+ dst
13
+ end
14
14
 
15
- # init
16
- @runtime_options = RuntimeOptions.new
15
+ @sinew = Sinew::Base.new(options)
17
16
  end
18
17
 
19
18
  def run
20
- dsl.run
21
- footer if !quiet?
22
- end
23
-
24
- def quiet?
25
- options[:quiet]
26
- end
27
-
28
- def dsl
29
- @dsl ||= DSL.new(self)
30
- end
31
-
32
- #
33
- # http requests
34
- #
35
-
36
- def http(method, url, options = {})
37
- request = Request.new(method, url, request_options(options))
38
- response = request.perform(connection)
39
-
40
- # always log error messages
41
- if response.error?
42
- puts "xxx http request failed with #{response.code}"
19
+ tm = Time.now
20
+ header if !sinew.options[:silent]
21
+ recipe = sinew.options[:recipe]
22
+ dsl = DSL.new(sinew)
23
+ begin
24
+ dsl.instance_eval(File.read(recipe, mode: "rb"), recipe)
25
+ rescue LimitError
26
+ # ignore - this is flow control for --limit
43
27
  end
44
-
45
- response
28
+ footer(Time.now - tm) if !sinew.options[:silent]
46
29
  end
47
30
 
48
- def connection
49
- @connection ||= Connection.create(options: options, runtime_options: runtime_options)
50
- end
51
- protected :connection
31
+ protected
52
32
 
53
33
  #
54
- # output
34
+ # header/footer
55
35
  #
56
36
 
57
- def output
58
- @output ||= Output.new(self)
37
+ def header
38
+ sinew.banner("Writing to #{sinew.csv.path}...")
59
39
  end
60
40
 
61
- #
62
- # helpers
63
- #
41
+ def footer(elapsed)
42
+ csv = sinew.csv
43
+ count = csv.count
64
44
 
65
- def request_options(options)
66
- options.dup.tap do |req|
67
- req[:headers] = {}.tap do |h|
68
- [ runtime_options.headers, options[:headers]].each do
69
- h.merge!(_1) if _1
70
- end
71
- end
72
- req[:proxy] = random_proxy
45
+ if count == 0
46
+ sinew.banner(format("Done in %ds. Nothing written.", elapsed))
47
+ return
73
48
  end
74
- end
75
- protected :request_options
76
-
77
- PROXY_RE = /\A#{URI::PATTERN::HOST}(:\d+)?\Z/.freeze
78
49
 
79
- def random_proxy
80
- return if !options[:proxy]
50
+ # summary
51
+ msg = format("Done in %ds. Wrote %d rows to %s. Summary:", elapsed, count, csv.path)
52
+ sinew.banner(msg)
81
53
 
82
- proxy = options[:proxy].split(',').sample
83
- if proxy !~ PROXY_RE
84
- raise ArgumentError, "invalid proxy #{proxy.inspect}, should be host[:port]"
54
+ # tally
55
+ tally = csv.tally.sort_by { [-_2, _1.to_s] }.to_h
56
+ len = tally.keys.map { _1.to_s.length }.max
57
+ fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
58
+ tally.each do
59
+ printf(fmt, _1, _2, count, _2 * 100.0 / count)
85
60
  end
86
-
87
- "http://#{proxy}"
88
61
  end
89
- protected :random_proxy
90
62
 
91
- def footer
92
- output.report
93
- finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
94
- banner("#{finished} in #{dsl.elapsed.to_i}s.")
63
+ # simple DSL for .sinew files
64
+ class DSL
65
+ attr_reader :sinew
66
+
67
+ def initialize(sinew)
68
+ @sinew = sinew
69
+ end
95
70
  end
96
- protected :footer
97
71
  end
98
72
  end
@@ -1,8 +1,9 @@
1
1
  module Sinew
2
- module Connection
2
+ module Middleware
3
+ # Minimalist Formatter that logs proxy if present.
3
4
  class LogFormatter < Faraday::Logging::Formatter
4
5
  def request(env)
5
- info('req') do
6
+ info("req") do
6
7
  # Only log the initial request, not the redirects
7
8
  return if env[:redirect]
8
9
 
@@ -1,28 +1,19 @@
1
- require 'nokogiri'
1
+ require "nokogiri"
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
- class Nokogiri::XML::NodeSet
5
- alias old_inner_html inner_html
6
- alias old_inner_text inner_text
4
+ module Nokogiri
5
+ module XML
6
+ class NodeSet
7
+ alias_method :old_inner_html, :inner_html
8
+ alias_method :old_inner_text, :inner_text
7
9
 
8
- def inner_text
9
- map(&:inner_text).join(' ')
10
- end
11
-
12
- def inner_html(*args)
13
- map { |i| i.inner_html(*args) }.join(' ')
14
- end
15
- end
10
+ def inner_text
11
+ map(&:inner_text).join(" ")
12
+ end
16
13
 
17
- # text_just_me
18
- class Nokogiri::XML::Node
19
- def text_just_me
20
- t = children.find { |i| i.node_type == Nokogiri::XML::Node::TEXT_NODE }
21
- t&.text
22
- end
23
- end
24
- class Nokogiri::XML::NodeSet
25
- def text_just_me
26
- map(&:text_just_me).join(' ')
14
+ def inner_html(*args)
15
+ map { _1.inner_html(*args) }.join(" ")
16
+ end
17
+ end
27
18
  end
28
19
  end