sinew 3.0.1 → 4.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +3 -5
- data/.rubocop.yml +30 -48
- data/Gemfile +4 -4
- data/Gemfile.lock +124 -0
- data/README.md +108 -47
- data/Rakefile +16 -15
- data/bin/sinew +13 -41
- data/lib/sinew.rb +23 -9
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +251 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +2 -1
- data/lib/sinew/nokogiri_ext.rb +12 -21
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +19 -16
- metadata +31 -21
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sinew
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.1
|
4
|
+
version: 4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Doppelt
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-07-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: amazing_print
|
@@ -54,47 +54,61 @@ dependencies:
|
|
54
54
|
- !ruby/object:Gem::Version
|
55
55
|
version: '0'
|
56
56
|
- !ruby/object:Gem::Dependency
|
57
|
-
name:
|
57
|
+
name: faraday-rate_limiter
|
58
58
|
requirement: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
60
|
- - "~>"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: '0'
|
62
|
+
version: '0.0'
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: '0'
|
69
|
+
version: '0.0'
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
|
-
name:
|
71
|
+
name: hashie
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '1
|
76
|
+
version: '4.1'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
79
|
version_requirements: !ruby/object:Gem::Requirement
|
80
80
|
requirements:
|
81
81
|
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
|
-
version: '1
|
83
|
+
version: '4.1'
|
84
84
|
- !ruby/object:Gem::Dependency
|
85
|
-
name:
|
85
|
+
name: httpdisk
|
86
86
|
requirement: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
88
|
- - "~>"
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: '0'
|
90
|
+
version: '0.5'
|
91
91
|
type: :runtime
|
92
92
|
prerelease: false
|
93
93
|
version_requirements: !ruby/object:Gem::Requirement
|
94
94
|
requirements:
|
95
95
|
- - "~>"
|
96
96
|
- !ruby/object:Gem::Version
|
97
|
-
version: '0'
|
97
|
+
version: '0.5'
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: nokogiri
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - "~>"
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '1.11'
|
105
|
+
type: :runtime
|
106
|
+
prerelease: false
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - "~>"
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '1.11'
|
98
112
|
- !ruby/object:Gem::Dependency
|
99
113
|
name: slop
|
100
114
|
requirement: !ruby/object:Gem::Requirement
|
@@ -134,26 +148,22 @@ files:
|
|
134
148
|
- ".github/workflows/test.yml"
|
135
149
|
- ".gitignore"
|
136
150
|
- ".rubocop.yml"
|
137
|
-
- ".vscode/extensions.json"
|
138
|
-
- ".vscode/settings.json"
|
139
151
|
- Gemfile
|
152
|
+
- Gemfile.lock
|
140
153
|
- LICENSE
|
141
154
|
- README.md
|
142
155
|
- Rakefile
|
143
156
|
- bin/sinew
|
144
157
|
- lib/sinew.rb
|
145
|
-
- lib/sinew/connection.rb
|
146
|
-
- lib/sinew/connection/log_formatter.rb
|
147
|
-
- lib/sinew/connection/rate_limit.rb
|
148
|
-
- lib/sinew/core_ext.rb
|
149
|
-
- lib/sinew/dsl.rb
|
158
|
+
- lib/sinew/args.rb
|
159
|
+
- lib/sinew/base.rb
|
160
|
+
- lib/sinew/csv.rb
|
150
161
|
- lib/sinew/main.rb
|
162
|
+
- lib/sinew/middleware/log_formatter.rb
|
151
163
|
- lib/sinew/nokogiri_ext.rb
|
152
|
-
- lib/sinew/output.rb
|
153
|
-
- lib/sinew/request.rb
|
154
164
|
- lib/sinew/response.rb
|
155
|
-
- lib/sinew/runtime_options.rb
|
156
165
|
- lib/sinew/version.rb
|
166
|
+
- sample.rb
|
157
167
|
- sample.sinew
|
158
168
|
- sinew.gemspec
|
159
169
|
homepage: http://github.com/gurgeous/sinew
|
data/.vscode/extensions.json
DELETED
data/.vscode/settings.json
DELETED
data/lib/sinew/connection.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
require 'faraday'
|
2
|
-
require 'faraday-encoding'
|
3
|
-
require 'faraday/logging/formatter'
|
4
|
-
require 'httpdisk'
|
5
|
-
require 'sinew/connection/log_formatter'
|
6
|
-
require 'sinew/connection/rate_limit'
|
7
|
-
|
8
|
-
module Sinew
|
9
|
-
module Connection
|
10
|
-
def self.create(options:, runtime_options:)
|
11
|
-
connection_options = {}
|
12
|
-
connection_options[:ssl] = { verify: false } if runtime_options.insecure
|
13
|
-
|
14
|
-
Faraday.new(nil, connection_options) do
|
15
|
-
_1.use RateLimit, rate_limit: runtime_options.rate_limit
|
16
|
-
|
17
|
-
# auto-encode form bodies
|
18
|
-
_1.request :url_encoded
|
19
|
-
|
20
|
-
# Before httpdisk so each redirect segment is cached
|
21
|
-
# Keep track of redirect status for logger
|
22
|
-
_1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
|
23
|
-
|
24
|
-
# set Ruby string encoding based on Content-Type (should be above httpdisk)
|
25
|
-
_1.response :encoding
|
26
|
-
|
27
|
-
# disk caching
|
28
|
-
httpdisk_options = {
|
29
|
-
dir: options[:cache],
|
30
|
-
force: options[:force],
|
31
|
-
force_errors: options[:force_errors],
|
32
|
-
}.merge(runtime_options.httpdisk_options)
|
33
|
-
|
34
|
-
_1.use :httpdisk, httpdisk_options
|
35
|
-
|
36
|
-
# After httpdisk so that only non-cached requests are logged.
|
37
|
-
# Before retry so that we don't log each retry attempt.
|
38
|
-
_1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
|
39
|
-
|
40
|
-
# After httpdisk so transient failures are not cached
|
41
|
-
retry_options = {
|
42
|
-
interval: runtime_options.rate_limit,
|
43
|
-
max: runtime_options.retries,
|
44
|
-
methods: %w[delete get head options patch post put trace],
|
45
|
-
retry_statuses: (500..600).to_a,
|
46
|
-
retry_if: ->(_env, _err) { true },
|
47
|
-
}
|
48
|
-
_1.request :retry, retry_options
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
data/lib/sinew/connection/rate_limit.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
module Sinew
|
2
|
-
module Connection
|
3
|
-
class RateLimit < Faraday::Middleware
|
4
|
-
attr_reader :rate_limit
|
5
|
-
|
6
|
-
def initialize(app, options = {})
|
7
|
-
super(app)
|
8
|
-
|
9
|
-
@last_request_tm = @current_request_tm = nil
|
10
|
-
@rate_limit = options.fetch(:rate_limit, 1)
|
11
|
-
end
|
12
|
-
|
13
|
-
def on_request(_env)
|
14
|
-
if @last_request_tm
|
15
|
-
sleep = (@last_request_tm + rate_limit) - Time.now
|
16
|
-
sleep(sleep) if sleep > 0
|
17
|
-
end
|
18
|
-
|
19
|
-
@current_request_tm = Time.now
|
20
|
-
end
|
21
|
-
|
22
|
-
def on_complete(env)
|
23
|
-
# Only rate limit on uncached requests
|
24
|
-
@last_request_tm = @current_request_tm unless env[:httpdisk]
|
25
|
-
@current_request_tm = nil
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
data/lib/sinew/core_ext.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# A few core extensions brought over from ActiveSupport. These are handy for
|
3
|
-
# parsing.
|
4
|
-
#
|
5
|
-
|
6
|
-
class String
|
7
|
-
def squish
|
8
|
-
dup.squish!
|
9
|
-
end
|
10
|
-
|
11
|
-
def squish!
|
12
|
-
strip!
|
13
|
-
gsub!(/\s+/, ' ')
|
14
|
-
self
|
15
|
-
end
|
16
|
-
|
17
|
-
def first(limit = 1)
|
18
|
-
if limit == 0
|
19
|
-
''
|
20
|
-
elsif limit >= size
|
21
|
-
dup
|
22
|
-
else
|
23
|
-
self[0..limit - 1]
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
def last(limit = 1)
|
28
|
-
if limit == 0
|
29
|
-
''
|
30
|
-
elsif limit >= size
|
31
|
-
dup
|
32
|
-
else
|
33
|
-
self[-limit..]
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
alias starts_with? start_with?
|
38
|
-
alias ends_with? end_with?
|
39
|
-
end
|
40
|
-
|
41
|
-
#
|
42
|
-
# blank?/present?
|
43
|
-
#
|
44
|
-
|
45
|
-
class Object
|
46
|
-
def blank?
|
47
|
-
respond_to?(:empty?) ? !!empty? : !self
|
48
|
-
end
|
49
|
-
|
50
|
-
def present?
|
51
|
-
!blank?
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
class String
|
56
|
-
def blank?
|
57
|
-
!!(self =~ /\A\s*\z/)
|
58
|
-
end
|
59
|
-
end
|
data/lib/sinew/dsl.rb
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
require 'amazing_print'
|
2
|
-
require 'cgi'
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
#
|
6
|
-
# The DSL available to .sinew files.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class DSL
|
11
|
-
# this is used to break out of --limit
|
12
|
-
class LimitError < StandardError; end
|
13
|
-
|
14
|
-
attr_reader :sinew, :uri, :raw, :code, :elapsed
|
15
|
-
|
16
|
-
def initialize(sinew)
|
17
|
-
@sinew = sinew
|
18
|
-
end
|
19
|
-
|
20
|
-
def run
|
21
|
-
tm = Time.now
|
22
|
-
begin
|
23
|
-
recipe = sinew.options[:recipe]
|
24
|
-
instance_eval(File.read(recipe, mode: 'rb'), recipe)
|
25
|
-
rescue LimitError
|
26
|
-
# ignore - this is flow control for --limit
|
27
|
-
end
|
28
|
-
@elapsed = Time.now - tm
|
29
|
-
end
|
30
|
-
|
31
|
-
#
|
32
|
-
# request
|
33
|
-
#
|
34
|
-
|
35
|
-
def get(url, query = {})
|
36
|
-
http('get', url, query: query)
|
37
|
-
end
|
38
|
-
|
39
|
-
def post(url, form = {})
|
40
|
-
body = form
|
41
|
-
headers = {
|
42
|
-
'Content-Type' => 'application/x-www-form-urlencoded',
|
43
|
-
}
|
44
|
-
http('post', url, body: body, headers: headers)
|
45
|
-
end
|
46
|
-
|
47
|
-
def post_json(url, json = {})
|
48
|
-
body = json.to_json
|
49
|
-
headers = {
|
50
|
-
'Content-Type' => 'application/json',
|
51
|
-
}
|
52
|
-
http('post', url, body: body, headers: headers)
|
53
|
-
end
|
54
|
-
|
55
|
-
def http(method, url, options = {})
|
56
|
-
# these need to be cleared before each request
|
57
|
-
%i[@html @noko @xml @json].each do |i|
|
58
|
-
instance_variable_set(i, nil)
|
59
|
-
end
|
60
|
-
|
61
|
-
# fetch and make response available to callers
|
62
|
-
response = sinew.http(method, url, options)
|
63
|
-
@uri, @raw, @code = response.uri, response.body, response.code
|
64
|
-
|
65
|
-
# don't confuse the user
|
66
|
-
nil
|
67
|
-
end
|
68
|
-
|
69
|
-
#
|
70
|
-
# response
|
71
|
-
#
|
72
|
-
|
73
|
-
def html
|
74
|
-
@html ||= begin
|
75
|
-
s = raw.dup
|
76
|
-
# squish!
|
77
|
-
s.squish!
|
78
|
-
# kill whitespace around tags
|
79
|
-
s.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
|
80
|
-
s
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def noko
|
85
|
-
@noko ||= Nokogiri::HTML(html)
|
86
|
-
end
|
87
|
-
|
88
|
-
def xml
|
89
|
-
@xml ||= Nokogiri::XML(html)
|
90
|
-
end
|
91
|
-
|
92
|
-
def json
|
93
|
-
@json ||= JSON.parse(raw, symbolize_names: true)
|
94
|
-
end
|
95
|
-
|
96
|
-
def url
|
97
|
-
uri.to_s
|
98
|
-
end
|
99
|
-
|
100
|
-
#
|
101
|
-
# csv
|
102
|
-
#
|
103
|
-
|
104
|
-
def csv_header(*args)
|
105
|
-
sinew.output.header(args)
|
106
|
-
end
|
107
|
-
|
108
|
-
def csv_emit(row)
|
109
|
-
sinew.output.emit(row)
|
110
|
-
if sinew.output.count == sinew.options[:limit]
|
111
|
-
raise LimitError.new
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
end
|
data/lib/sinew/output.rb
DELETED
@@ -1,133 +0,0 @@
|
|
1
|
-
require 'csv'
|
2
|
-
require 'set'
|
3
|
-
require 'sterile'
|
4
|
-
|
5
|
-
#
|
6
|
-
# CSV output.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class Output
|
11
|
-
attr_reader :sinew, :columns, :rows, :urls, :csv
|
12
|
-
|
13
|
-
def initialize(sinew)
|
14
|
-
@sinew = sinew
|
15
|
-
@rows = []
|
16
|
-
@urls = Set.new
|
17
|
-
end
|
18
|
-
|
19
|
-
def filename
|
20
|
-
@filename ||= begin
|
21
|
-
recipe = sinew.options[:recipe]
|
22
|
-
ext = File.extname(recipe)
|
23
|
-
if ext.empty?
|
24
|
-
"#{recipe}.csv"
|
25
|
-
else
|
26
|
-
recipe.gsub(ext, '.csv')
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def header(columns)
|
32
|
-
sinew.banner("Writing to #{filename}...") if !sinew.quiet?
|
33
|
-
|
34
|
-
columns = columns.flatten
|
35
|
-
@columns = columns
|
36
|
-
|
37
|
-
# open csv, write header row
|
38
|
-
@csv = CSV.open(filename, 'wb')
|
39
|
-
csv << columns
|
40
|
-
end
|
41
|
-
|
42
|
-
def emit(row)
|
43
|
-
# implicit header if necessary
|
44
|
-
header(row.keys) if !csv
|
45
|
-
|
46
|
-
# don't allow duplicate urls
|
47
|
-
return if dup_url?(row)
|
48
|
-
|
49
|
-
rows << row.dup
|
50
|
-
|
51
|
-
# map columns to row, and normalize along the way
|
52
|
-
print = {}
|
53
|
-
row = columns.map do |i|
|
54
|
-
value = normalize(row[i])
|
55
|
-
print[i] = value if value.present?
|
56
|
-
value
|
57
|
-
end
|
58
|
-
|
59
|
-
# print
|
60
|
-
sinew.vputs print.ai
|
61
|
-
|
62
|
-
csv << row
|
63
|
-
csv.flush
|
64
|
-
end
|
65
|
-
|
66
|
-
def count
|
67
|
-
rows.length
|
68
|
-
end
|
69
|
-
|
70
|
-
def report
|
71
|
-
return if count == 0
|
72
|
-
|
73
|
-
sinew.banner("Got #{count} rows.")
|
74
|
-
|
75
|
-
# calculate counts
|
76
|
-
counts = Hash.new(0)
|
77
|
-
rows.each do |row|
|
78
|
-
row.each_pair { |k, v| counts[k] += 1 if v.present? }
|
79
|
-
end
|
80
|
-
# sort by counts
|
81
|
-
cols = columns.sort_by { |i| [ -counts[i], i ] }
|
82
|
-
|
83
|
-
# report
|
84
|
-
len = cols.map { |i| i.to_s.length }.max
|
85
|
-
fmt = " %-#{len + 1}s %7d / %-7d %6.1f%%\n"
|
86
|
-
cols.each do |col|
|
87
|
-
$stderr.printf(fmt, col, counts[col], count, counts[col] * 100.0 / count)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def normalize(s)
|
92
|
-
# noko/array/misc => string
|
93
|
-
s = case s
|
94
|
-
when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
|
95
|
-
s.inner_html
|
96
|
-
when Array
|
97
|
-
s.map(&:to_s).join('|')
|
98
|
-
else
|
99
|
-
s.to_s
|
100
|
-
end
|
101
|
-
|
102
|
-
# strip html tags. Note that we replace tags with spaces
|
103
|
-
s = s.gsub(/<[^>]+>/, ' ')
|
104
|
-
|
105
|
-
# Converts MS Word 'smart punctuation' to ASCII
|
106
|
-
s = Sterile.plain_format(s)
|
107
|
-
|
108
|
-
# &aacute; &amp; etc.
|
109
|
-
s = Sterile.decode_entities(s)
|
110
|
-
|
111
|
-
# "šţɽĩɳģ" => "string"
|
112
|
-
s = Sterile.transliterate(s)
|
113
|
-
|
114
|
-
# squish
|
115
|
-
s = s.squish
|
116
|
-
|
117
|
-
s
|
118
|
-
end
|
119
|
-
protected :normalize
|
120
|
-
|
121
|
-
def dup_url?(row)
|
122
|
-
if url = row[:url]
|
123
|
-
if urls.include?(url)
|
124
|
-
sinew.warning("duplicate url: #{url}") if !sinew.quiet?
|
125
|
-
return true
|
126
|
-
end
|
127
|
-
urls << url
|
128
|
-
end
|
129
|
-
false
|
130
|
-
end
|
131
|
-
protected :dup_url?
|
132
|
-
end
|
133
|
-
end
|