sinew 3.0.1 → 4.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/justfile ADDED
@@ -0,0 +1,59 @@
1
+
2
# Read the gem version (first x.y.z match) straight from version.rb. grep reads
# the file itself, no need to pipe it through cat.
gemver := `grep -Eo "[0-9]+\.[0-9]+\.[0-9]+" lib/sinew/version.rb`

#
# dev
#

# running `just` with no arguments runs the tests
default: test

# lint, then test
check: lint test

# auto-correct rubocop offenses
fmt:
  bundle exec rubocop -a

lint:
  @just banner lint...
  bundle exec rubocop

# interactive console with sinew loaded
pry:
  bundle exec pry -I lib -r sinew.rb

test:
  @just banner test...
  bundle exec rake test

# re-run tests whenever lib/ or test/ changes
watch:
  @watchexec --watch lib --watch test --clear bundle exec rake test

#
# ci
#

ci:
  bundle install
  just check

#
# gem tasks
#

# build, tag and push the gem. Refuses to run with a dirty working tree.
gem-push: check-git-status
  @just banner gem build...
  gem build sinew.gemspec
  @just banner tag...
  git tag -a "v{{gemver}}" -m "Tagging {{gemver}}"
  git push --tags
  @just banner gem push...
  gem push "sinew-{{gemver}}.gem"

#
# util
#

# print a timestamped green banner
banner *ARGS:
  @printf '\e[42;37;1m[%s] %-72s \e[m\n' "$(date +%H:%M:%S)" "{{ARGS}}"

# fail (message on stderr) if there are uncommitted changes
check-git-status:
  @if [ -n "$(git status --porcelain)" ]; then echo "git status is dirty, bailing." >&2; exit 1; fi
data/lib/sinew/args.rb ADDED
@@ -0,0 +1,53 @@
1
+ # manually load dependencies here since this is loaded standalone by bin
2
+ require "httpdisk/slop_duration"
3
+ require "sinew/version"
4
+ require "slop"
5
+
6
+ #
7
+ # This is used to parse command line arguments with Slop. We don't set any
8
+ # defaults in here, relying instead on Sloptions in Sinew::Base. That way
9
+ # defaults are applied for both command line and embedded usage of Sinew::Base.
10
+ #
11
+
12
module Sinew
  # Command line parsing for the sinew binary.
  module Args
    # Parse +args+ with Slop and return the parsed options as a hash, with the
    # recipe path added under :recipe.
    #
    # --version and --help print their output and exit while parsing. Raises
    # Slop::Error when +args+ is empty (with an empty message), when the number
    # of positional arguments is not exactly one, or when the recipe file does
    # not exist.
    def self.slop(args)
      parsed = Slop.parse(args) do |o|
        o.banner = "Usage: sinew [options] [recipe.sinew]"
        o.integer "-l", "--limit", "quit after emitting this many rows"
        o.string "--proxy", "use host[:port] as HTTP proxy (can be a comma-delimited list)"
        o.integer "--timeout", "maximum time allowed for the transfer"
        o.bool "-s", "--silent", "suppress some output"
        o.bool "-v", "--verbose", "dump emitted rows while running"

        o.separator "From httpdisk:"
        o.string "--dir", "set custom cache directory"
        # note: uses slop_duration from HTTPDisk
        o.duration "--expires", "when to expire cached requests (ex: 1h, 2d, 3w)"
        o.bool "--force", "don't read anything from cache (but still write)"
        o.bool "--force-errors", "don't read errors from cache (but still write)"

        # generic
        o.boolean "--version", "show version" do
          puts "sinew #{Sinew::VERSION}"
          exit
        end
        o.on("--help", "show this help") do
          puts o
          exit
        end
      end

      # the recipe is the one and only positional argument
      positional = parsed.args
      recipe = positional.first
      raise Slop::Error, "" if args.empty?
      raise Slop::Error, "no RECIPE specified" unless recipe
      raise Slop::Error, "more than one RECIPE specified" if positional.length > 1
      raise Slop::Error, "#{recipe} not found" unless File.exist?(recipe)

      options = parsed.to_h
      options[:recipe] = recipe
      options
    end
  end
end
data/lib/sinew/base.rb ADDED
@@ -0,0 +1,252 @@
1
+ require "amazing_print"
2
+ require "faraday-encoding"
3
+ require "faraday-rate_limiter"
4
+ require "faraday/logging/formatter"
5
+ require "faraday/retry"
6
+ require "httpdisk"
7
+
8
+ module Sinew
9
+ # Sinew base class, for use in standalone scripts or via the sinew binary.
10
+ class Base
11
+ attr_reader :csv, :mutex, :options
12
+
13
# Create a new Sinew::Base.
#
# +opts+ is validated and given defaults by HTTPDisk::Sloptions; the result is
# available as #options. Also creates the CSV writer for opts[:output] (the
# :output option is declared as required).
def initialize(opts = {})
  @mutex = Mutex.new

  #
  # defaults for Sloptions
  #

  # default :rate_limit, typically 1 (0 when SINEW_TEST is set)
  default_rate_limit = ENV["SINEW_TEST"] ? 0 : 1

  # default :dir. Dir.home rather than ENV["HOME"], which may be unset and
  # would make File.join raise TypeError even when :dir was passed explicitly.
  default_dir = File.join(Dir.home, ".sinew")

  #
  # note: uses HTTPDisk::Sloptions
  #

  @options = HTTPDisk::Sloptions.parse(opts) do
    # cli
    _1.integer :limit
    _1.integer :timeout, default: 30
    _1.boolean :silent
    _1.on :proxy, type: [:string, Array]
    _1.boolean :verbose

    # httpdisk
    _1.string :dir, default: default_dir
    _1.integer :expires
    _1.boolean :force
    _1.boolean :force_errors
    _1.array :ignore_params

    # more runtime options
    _1.hash :headers
    _1.boolean :insecure
    _1.string :output, required: true
    _1.hash :params
    _1.float :rate_limit, default: default_rate_limit
    _1.integer :retries, default: 2
    _1.on :url_prefix, type: [:string, URI]
    _1.boolean :utf8, default: true
  end

  @csv = CSV.new(opts[:output])
end
55
+
56
+ #
57
+ # requests
58
+ #
59
+
60
# HTTP GET through the recipe's Faraday connection. A proxy chosen by
# random_proxy (possibly nil) is set on each request. Returns a Response
# wrapping the Faraday response.
def get(url, params = nil, headers = nil)
  raw = faraday.get(url, params, headers) { |req| req.options[:proxy] = random_proxy }
  Response.new(raw)
end
67
+
68
# HTTP POST through the recipe's Faraday connection. The body defaults to the
# form body type. A proxy chosen by random_proxy (possibly nil) is set on each
# request. Returns a Response wrapping the Faraday response.
def post(url, body = nil, headers = nil)
  raw = faraday.post(url, body, headers) { |req| req.options[:proxy] = random_proxy }
  Response.new(raw)
end
75
+
76
# HTTP POST with a JSON body: +body+ is serialized with to_json and the
# Content-Type header is set to application/json (merged over any headers
# passed in). Returns a Response.
def post_json(url, body = nil, headers = nil)
  json_type = {"Content-Type" => "application/json"}
  merged = headers ? headers.merge(json_type) : json_type
  post(url, body.to_json, merged)
end
82
+
83
# The Faraday connection for this recipe. Built by create_faraday on first use
# (under the mutex) and memoized afterwards.
def faraday
  mutex.synchronize { @faraday ||= create_faraday }
end
89
+
90
+ #
91
+ # httpdisk
92
+ #
93
+
94
# Is this request cached? True unless the httpdisk status for the request is
# "miss". Hash bodies default to the form body type (see #status).
def cached?(method, url, params = nil, body = nil)
  disk = status(method, url, params, body)
  disk[:status] != "miss"
end
99
+
100
# Remove the cache file for this request, if any. Hash bodies default to the
# form body type (see #status).
#
# Returns the result of File.unlink when a file was removed, nil when there was
# no cache file.
def uncache(method, url, params = nil, body = nil)
  path = status(method, url, params, body)[:path]

  # Unlink and tolerate a missing file, instead of exist?-then-unlink: the file
  # can disappear between the check and the unlink (another thread or process
  # uncaching the same request), which used to surface as Errno::ENOENT.
  begin
    File.unlink(path)
  rescue Errno::ENOENT
    nil
  end
end
106
+
107
# Check httpdisk status for this request, without performing it. Builds a
# Faraday::Env describing the request and hands it to the connection's
# httpdisk instance. Defaults to form body type.
def status(method, url, params = nil, body = nil)
  # a hash body is url encoded, like a form
  # see lib/faraday/request/url_encoded.rb
  body = Faraday::Utils::ParamsHash[body].to_query if body.is_a?(Hash)

  env = Faraday::Env.new
  env.method = method.to_s.downcase.to_sym
  env.request_headers = {}
  env.request_body = body
  env.url = faraday.build_url(url, params)

  httpdisk.status(env)
end
123
+
124
+ #
125
+ # csv
126
+ #
127
+
128
# Start the csv with the given columns (nested arrays are flattened). The
# header is normally written automatically by csv_emit on the first row; call
# this directly to guarantee a consistent set of columns.
def csv_header(*columns)
  names = columns.flatten
  csv.start(names)
end
133
+
134
# Output a csv row. +row+ can be anything that responds to to_h (a hash, an
# OpenStruct, etc). Runs under the mutex: writes the header from the row's keys
# if the csv has not been started, emits the row, pretty prints it when
# :verbose is set, and raises LimitError once the row count equals :limit.
def csv_emit(row)
  record = row.to_h
  mutex.synchronize do
    # header if necessary
    csv_header(record.keys) unless csv.started?

    # emit
    emitted = csv.emit(record)
    puts emitted.ai if options[:verbose]

    # this is caught by Sinew::Main
    raise LimitError if csv.count == options[:limit]
  end
end
152
+
153
+ #
154
+ # stdout
155
+ #
156
+
157
+ RESET = "\e[0m".freeze
158
+ RED = "\e[1;37;41m".freeze
159
+ GREEN = "\e[1;37;42m".freeze
160
+
161
# Print a banner line: "[HH:MM:SS] " followed by the message padded to 72
# columns. The line is wrapped in +color+ / RESET escapes only when stdout is a
# tty. Green by default.
def banner(msg, color: GREEN)
  stamp = Time.new.strftime("%H:%M:%S")
  line = format("[%s] %s", stamp, "#{msg} ".ljust(72, " "))
  line = [color, line, RESET].join if $stdout.tty?
  puts line
end
168
+
169
# Print +msg+ as a red banner, then exit the process with status 1.
def fatal(msg)
  banner(msg, color: RED)
  exit(1)
end
174
+
175
+ protected
176
+
177
# Pick a proxy at random from options[:proxy], which is either an Array or a
# comma-delimited string. Returns nil when no proxy is configured.
def random_proxy
  return unless options[:proxy]

  candidates = options[:proxy]
  candidates = candidates.split(",") unless candidates.is_a?(Array)
  candidates.sample
end
185
+
186
# Create the Faraday connection for making requests. Middleware order matters:
# everything registered before httpdisk runs for both cached and network
# requests, everything after it runs only when httpdisk goes to the network.
def create_faraday
  connection_options = options.slice(:headers, :params)
  connection_options[:ssl] = {verify: false} if options[:insecure]

  Faraday.new(nil, connection_options) do |conn|
    # options
    prefix = options[:url_prefix]
    conn.url_prefix = prefix if prefix
    conn.options.timeout = options[:timeout]

    #
    # middleware that runs on both disk/network requests
    #

    # cookie middleware
    conn.use :cookie_jar

    # auto-encode form bodies
    conn.request :url_encoded

    # Before httpdisk so each redirect segment is cached
    # Keep track of redirect status for logger
    conn.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }

    #
    # httpdisk
    #

    conn.use :httpdisk, options.slice(:dir, :expires, :force, :force_errors, :ignore_params, :utf8)

    #
    # middleware below only used if httpdisk uses the network
    #

    # rate limit
    interval = options[:rate_limit]
    conn.request :rate_limiter, interval: interval

    # After httpdisk so that only non-cached requests are logged.
    # Before retry so that we don't log each retry attempt.
    conn.response :logger, nil, formatter: Middleware::LogFormatter unless options[:silent]

    retry_settings = {
      max_interval: interval, # very important, negates Retry-After: 86400
      max: options[:retries],
      methods: %w[delete get head options patch post put trace],
      retry_statuses: (500..600).to_a,
      retry_if: ->(_env, _err) { true }
    }
    conn.request :retry, retry_settings
  end
end
242
+
243
# Find the connection's httpdisk instance by walking the chain of apps from
# faraday.app until an HTTPDisk::Client is reached. Memoized.
def httpdisk
  return @httpdisk if @httpdisk

  handler = faraday.app
  loop do
    break if handler.is_a?(HTTPDisk::Client)

    handler = handler.app
  end
  @httpdisk = handler
end
251
+ end
252
+ end
data/lib/sinew/csv.rb ADDED
@@ -0,0 +1,89 @@
1
+ require "csv"
2
+ require "sterile"
3
+
4
module Sinew
  # Writes rows to a csv file at +path+. Call #start once with the column
  # names, then #emit for each row. Keeps a running row count and a per-column
  # tally of how many rows had a non-empty value.
  class CSV
    attr_reader :columns, :count, :csv, :path, :tally

    def initialize(path)
      @count = 0
      @csv = nil
      @path = path
    end

    # Start writing the csv: open +path+ for writing and write the header row.
    # Raises if called twice.
    def start(columns)
      raise "started twice" if started?

      @columns = columns
      @tally = columns.to_h { [_1, 0] }
      @csv = ::CSV.open(path, "wb").tap do |out|
        out << columns
      end
    end

    # has this csv been started?
    def started?
      !@csv.nil?
    end

    # Append a row. +row+ is indexed with [] by column; each value is
    # normalized before writing and the file is flushed after every row.
    # Returns a hash of the non-empty normalized values, in case someone wants
    # to pretty print it. Raises if the csv has not been started.
    def emit(row)
      # fail clearly instead of NoMethodError on nil columns
      raise "not started" unless started?

      printable = {}
      values = columns.map do |column|
        value = normalize(row[column])
        if value
          printable[column] = value
          tally[column] += 1
        end
        value
      end
      @count += 1

      # emit
      csv << values
      csv.flush

      printable
    end

    # Matches strings made only of whitespace and printable ASCII (33..126)
    # other than "&". Such strings skip the Sterile conversions in #normalize.
    ASCII_ONLY = begin
      chars = (33..126).map(&:chr) - ["&"]
      /\A[#{Regexp.escape(chars.join)}\s]+\Z/
    end.freeze

    # Turn a value into a clean string for the csv. Returns nil if the value
    # is empty, or becomes empty once stripped.
    def normalize(s)
      # nokogiri/array/misc => string
      s = if s.respond_to?(:inner_html)
        s.inner_html
      elsif s.is_a?(Array)
        s.join("|")
      else
        s.to_s
      end
      return if s.empty?

      # simple attempt to strip tags. Note that we replace tags with spaces
      s = s.gsub(/<[^>]+>/, " ")

      unless ASCII_ONLY.match?(s)
        # Converts MS Word 'smart punctuation' to ASCII
        s = Sterile.plain_format(s)

        # &aacute; &amp; etc.
        s = Sterile.decode_entities(s)

        # "šţɽĩɳģ" => "string"
        s = Sterile.transliterate(s)
      end

      # squish
      s = s.strip.gsub(/\s+/, " ")
      return if s.empty?

      s
    end
  end
end
data/lib/sinew/main.rb CHANGED
@@ -1,98 +1,72 @@
1
- require 'scripto'
2
- require 'sinew/connection'
3
-
4
- #
5
- # Main sinew entry point.
6
- #
7
-
8
1
  module Sinew
9
- class Main < Scripto::Main
10
- attr_reader :runtime_options
2
+ # Helper class used by sinew bin. This exists as an independent class solely
3
+ # for testing, otherwise it would be built into the bin script.
4
class Main
  attr_reader :sinew

  # Create the Sinew::Base for a recipe. +options+ is the hash of parsed
  # command line options; unless :output is set it defaults to the recipe
  # path with its extension replaced by .csv (note that this writes :output
  # into the hash passed in).
  def initialize(options)
    options[:output] ||= default_output(options[:recipe])

    @sinew = Sinew::Base.new(options)
  end

  # Run the recipe: evaluate the recipe file in the context of a DSL instance.
  # A header and a footer (with elapsed time) are printed unless :silent.
  def run
    tm = Time.now
    header if !sinew.options[:silent]
    recipe = sinew.options[:recipe]
    dsl = DSL.new(sinew)
    begin
      dsl.instance_eval(File.read(recipe, mode: "rb"), recipe)
    rescue LimitError
      # ignore - this is flow control for --limit
    end
    footer(Time.now - tm) if !sinew.options[:silent]
  end

  protected

  # Default csv path for a recipe: same directory, same base name, .csv
  # extension. A leading "./" is dropped since it's nice to clean this up.
  #
  # Only a literal "./" prefix is removed. The previous pattern (%r{^./}) had
  # an unescaped dot and also stripped any single-character directory, so
  # "a/foo.sinew" wrote to "foo.csv" instead of "a/foo.csv".
  def default_output(recipe)
    name = "#{File.basename(recipe, File.extname(recipe))}.csv"
    File.join(File.dirname(recipe), name).delete_prefix("./")
  end

  #
  # header/footer
  #

  def header
    sinew.banner("Writing to #{sinew.csv.path}...")
  end

  # Print the elapsed time and, if any rows were written, a per-column
  # summary of how many rows had a value for that column.
  def footer(elapsed)
    csv = sinew.csv
    count = csv.count

    if count == 0
      sinew.banner(format("Done in %ds. Nothing written.", elapsed))
      return
    end

    # summary
    msg = format("Done in %ds. Wrote %d rows to %s. Summary:", elapsed, count, csv.path)
    sinew.banner(msg)

    # tally, most filled columns first, ties broken by name
    tally = csv.tally.sort_by { |column, filled| [-filled, column.to_s] }.to_h
    len = tally.keys.map { _1.to_s.length }.max
    fmt = " %-#{len + 1}s %7d/%-7d %5.1f%%\n"
    tally.each do |column, filled|
      printf(fmt, column, filled, count, filled * 100.0 / count)
    end
  end

  # simple DSL for .sinew files
  class DSL
    attr_reader :sinew

    def initialize(sinew)
      @sinew = sinew
    end
  end
end
98
72
  end
@@ -1,8 +1,9 @@
1
1
  module Sinew
2
- module Connection
2
+ module Middleware
3
+ # Minimalist Formatter that logs proxy if present.
3
4
  class LogFormatter < Faraday::Logging::Formatter
4
5
  def request(env)
5
- info('req') do
6
+ info("req") do
6
7
  # Only log the initial request, not the redirects
7
8
  return if env[:redirect]
8
9
 
@@ -1,28 +1,19 @@
1
- require 'nokogiri'
1
+ require "nokogiri"
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
- class Nokogiri::XML::NodeSet
5
- alias old_inner_html inner_html
6
- alias old_inner_text inner_text
4
module Nokogiri
  module XML
    # Reopened so that the inner_text / inner_html of a set of nodes is the
    # per-node result joined with a SPACE. The original methods stay reachable
    # as old_inner_text / old_inner_html.
    class NodeSet
      alias_method :old_inner_html, :inner_html
      alias_method :old_inner_text, :inner_text

      # inner_text of every node, joined with a space
      def inner_text
        map { |node| node.inner_text }.join(" ")
      end

      # inner_html of every node (args are passed along), joined with a space
      def inner_html(*args)
        map { |node| node.inner_html(*args) }.join(" ")
      end
    end
  end
end
  end