sinew 3.0.1 → 4.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +3 -5
- data/.rubocop.yml +30 -48
- data/Gemfile +4 -4
- data/Gemfile.lock +124 -0
- data/README.md +108 -47
- data/Rakefile +16 -15
- data/bin/sinew +13 -41
- data/lib/sinew.rb +23 -9
- data/lib/sinew/args.rb +53 -0
- data/lib/sinew/base.rb +251 -0
- data/lib/sinew/csv.rb +89 -0
- data/lib/sinew/main.rb +46 -72
- data/lib/sinew/{connection → middleware}/log_formatter.rb +2 -1
- data/lib/sinew/nokogiri_ext.rb +12 -21
- data/lib/sinew/response.rb +41 -52
- data/lib/sinew/version.rb +1 -1
- data/sample.rb +13 -0
- data/sample.sinew +4 -4
- data/sinew.gemspec +19 -16
- metadata +31 -21
- data/.vscode/extensions.json +0 -3
- data/.vscode/settings.json +0 -5
- data/lib/sinew/connection.rb +0 -52
- data/lib/sinew/connection/rate_limit.rb +0 -29
- data/lib/sinew/core_ext.rb +0 -59
- data/lib/sinew/dsl.rb +0 -115
- data/lib/sinew/output.rb +0 -133
- data/lib/sinew/request.rb +0 -86
- data/lib/sinew/runtime_options.rb +0 -28
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: sinew
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 3.0.1
|
4
|
+
version: 4.0.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Adam Doppelt
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2021-
|
12
|
+
date: 2021-07-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: amazing_print
|
@@ -54,47 +54,61 @@ dependencies:
|
|
54
54
|
- !ruby/object:Gem::Version
|
55
55
|
version: '0'
|
56
56
|
- !ruby/object:Gem::Dependency
|
57
|
-
name:
|
57
|
+
name: faraday-rate_limiter
|
58
58
|
requirement: !ruby/object:Gem::Requirement
|
59
59
|
requirements:
|
60
60
|
- - "~>"
|
61
61
|
- !ruby/object:Gem::Version
|
62
|
-
version: '0'
|
62
|
+
version: '0.0'
|
63
63
|
type: :runtime
|
64
64
|
prerelease: false
|
65
65
|
version_requirements: !ruby/object:Gem::Requirement
|
66
66
|
requirements:
|
67
67
|
- - "~>"
|
68
68
|
- !ruby/object:Gem::Version
|
69
|
-
version: '0'
|
69
|
+
version: '0.0'
|
70
70
|
- !ruby/object:Gem::Dependency
|
71
|
-
name:
|
71
|
+
name: hashie
|
72
72
|
requirement: !ruby/object:Gem::Requirement
|
73
73
|
requirements:
|
74
74
|
- - "~>"
|
75
75
|
- !ruby/object:Gem::Version
|
76
|
-
version: '1
|
76
|
+
version: '4.1'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
79
|
version_requirements: !ruby/object:Gem::Requirement
|
80
80
|
requirements:
|
81
81
|
- - "~>"
|
82
82
|
- !ruby/object:Gem::Version
|
83
|
-
version: '1
|
83
|
+
version: '4.1'
|
84
84
|
- !ruby/object:Gem::Dependency
|
85
|
-
name:
|
85
|
+
name: httpdisk
|
86
86
|
requirement: !ruby/object:Gem::Requirement
|
87
87
|
requirements:
|
88
88
|
- - "~>"
|
89
89
|
- !ruby/object:Gem::Version
|
90
|
-
version: '0'
|
90
|
+
version: '0.5'
|
91
91
|
type: :runtime
|
92
92
|
prerelease: false
|
93
93
|
version_requirements: !ruby/object:Gem::Requirement
|
94
94
|
requirements:
|
95
95
|
- - "~>"
|
96
96
|
- !ruby/object:Gem::Version
|
97
|
-
version: '0'
|
97
|
+
version: '0.5'
|
98
|
+
- !ruby/object:Gem::Dependency
|
99
|
+
name: nokogiri
|
100
|
+
requirement: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - "~>"
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '1.11'
|
105
|
+
type: :runtime
|
106
|
+
prerelease: false
|
107
|
+
version_requirements: !ruby/object:Gem::Requirement
|
108
|
+
requirements:
|
109
|
+
- - "~>"
|
110
|
+
- !ruby/object:Gem::Version
|
111
|
+
version: '1.11'
|
98
112
|
- !ruby/object:Gem::Dependency
|
99
113
|
name: slop
|
100
114
|
requirement: !ruby/object:Gem::Requirement
|
@@ -134,26 +148,22 @@ files:
|
|
134
148
|
- ".github/workflows/test.yml"
|
135
149
|
- ".gitignore"
|
136
150
|
- ".rubocop.yml"
|
137
|
-
- ".vscode/extensions.json"
|
138
|
-
- ".vscode/settings.json"
|
139
151
|
- Gemfile
|
152
|
+
- Gemfile.lock
|
140
153
|
- LICENSE
|
141
154
|
- README.md
|
142
155
|
- Rakefile
|
143
156
|
- bin/sinew
|
144
157
|
- lib/sinew.rb
|
145
|
-
- lib/sinew/connection.rb
|
146
|
-
- lib/sinew/connection/log_formatter.rb
|
147
|
-
- lib/sinew/connection/rate_limit.rb
|
148
|
-
- lib/sinew/core_ext.rb
|
149
|
-
- lib/sinew/dsl.rb
|
158
|
+
- lib/sinew/args.rb
|
159
|
+
- lib/sinew/base.rb
|
160
|
+
- lib/sinew/csv.rb
|
150
161
|
- lib/sinew/main.rb
|
162
|
+
- lib/sinew/middleware/log_formatter.rb
|
151
163
|
- lib/sinew/nokogiri_ext.rb
|
152
|
-
- lib/sinew/output.rb
|
153
|
-
- lib/sinew/request.rb
|
154
164
|
- lib/sinew/response.rb
|
155
|
-
- lib/sinew/runtime_options.rb
|
156
165
|
- lib/sinew/version.rb
|
166
|
+
- sample.rb
|
157
167
|
- sample.sinew
|
158
168
|
- sinew.gemspec
|
159
169
|
homepage: http://github.com/gurgeous/sinew
|
data/.vscode/extensions.json
DELETED
data/.vscode/settings.json
DELETED
data/lib/sinew/connection.rb
DELETED
@@ -1,52 +0,0 @@
|
|
1
|
-
require 'faraday'
|
2
|
-
require 'faraday-encoding'
|
3
|
-
require 'faraday/logging/formatter'
|
4
|
-
require 'httpdisk'
|
5
|
-
require 'sinew/connection/log_formatter'
|
6
|
-
require 'sinew/connection/rate_limit'
|
7
|
-
|
8
|
-
module Sinew
|
9
|
-
module Connection
|
10
|
-
def self.create(options:, runtime_options:)
|
11
|
-
connection_options = {}
|
12
|
-
connection_options[:ssl] = { verify: false } if runtime_options.insecure
|
13
|
-
|
14
|
-
Faraday.new(nil, connection_options) do
|
15
|
-
_1.use RateLimit, rate_limit: runtime_options.rate_limit
|
16
|
-
|
17
|
-
# auto-encode form bodies
|
18
|
-
_1.request :url_encoded
|
19
|
-
|
20
|
-
# Before httpdisk so each redirect segment is cached
|
21
|
-
# Keep track of redirect status for logger
|
22
|
-
_1.response :follow_redirects, callback: ->(_old_env, new_env) { new_env[:redirect] = true }
|
23
|
-
|
24
|
-
# set Ruby string encoding based on Content-Type (should be above httpdisk)
|
25
|
-
_1.response :encoding
|
26
|
-
|
27
|
-
# disk caching
|
28
|
-
httpdisk_options = {
|
29
|
-
dir: options[:cache],
|
30
|
-
force: options[:force],
|
31
|
-
force_errors: options[:force_errors],
|
32
|
-
}.merge(runtime_options.httpdisk_options)
|
33
|
-
|
34
|
-
_1.use :httpdisk, httpdisk_options
|
35
|
-
|
36
|
-
# After httpdisk so that only non-cached requests are logged.
|
37
|
-
# Before retry so that we don't log each retry attempt.
|
38
|
-
_1.response :logger, nil, formatter: LogFormatter if !options[:quiet]
|
39
|
-
|
40
|
-
# After httpdisk so transient failures are not cached
|
41
|
-
retry_options = {
|
42
|
-
interval: runtime_options.rate_limit,
|
43
|
-
max: runtime_options.retries,
|
44
|
-
methods: %w[delete get head options patch post put trace],
|
45
|
-
retry_statuses: (500..600).to_a,
|
46
|
-
retry_if: ->(_env, _err) { true },
|
47
|
-
}
|
48
|
-
_1.request :retry, retry_options
|
49
|
-
end
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
data/lib/sinew/connection/rate_limit.rb
DELETED
@@ -1,29 +0,0 @@
|
|
1
|
-
module Sinew
|
2
|
-
module Connection
|
3
|
-
class RateLimit < Faraday::Middleware
|
4
|
-
attr_reader :rate_limit
|
5
|
-
|
6
|
-
def initialize(app, options = {})
|
7
|
-
super(app)
|
8
|
-
|
9
|
-
@last_request_tm = @current_request_tm = nil
|
10
|
-
@rate_limit = options.fetch(:rate_limit, 1)
|
11
|
-
end
|
12
|
-
|
13
|
-
def on_request(_env)
|
14
|
-
if @last_request_tm
|
15
|
-
sleep = (@last_request_tm + rate_limit) - Time.now
|
16
|
-
sleep(sleep) if sleep > 0
|
17
|
-
end
|
18
|
-
|
19
|
-
@current_request_tm = Time.now
|
20
|
-
end
|
21
|
-
|
22
|
-
def on_complete(env)
|
23
|
-
# Only rate limit on uncached requests
|
24
|
-
@last_request_tm = @current_request_tm unless env[:httpdisk]
|
25
|
-
@current_request_tm = nil
|
26
|
-
end
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
data/lib/sinew/core_ext.rb
DELETED
@@ -1,59 +0,0 @@
|
|
1
|
-
#
|
2
|
-
# A few core extensions brought over from ActiveSupport. These are handy for
|
3
|
-
# parsing.
|
4
|
-
#
|
5
|
-
|
6
|
-
class String
|
7
|
-
def squish
|
8
|
-
dup.squish!
|
9
|
-
end
|
10
|
-
|
11
|
-
def squish!
|
12
|
-
strip!
|
13
|
-
gsub!(/\s+/, ' ')
|
14
|
-
self
|
15
|
-
end
|
16
|
-
|
17
|
-
def first(limit = 1)
|
18
|
-
if limit == 0
|
19
|
-
''
|
20
|
-
elsif limit >= size
|
21
|
-
dup
|
22
|
-
else
|
23
|
-
self[0..limit - 1]
|
24
|
-
end
|
25
|
-
end
|
26
|
-
|
27
|
-
def last(limit = 1)
|
28
|
-
if limit == 0
|
29
|
-
''
|
30
|
-
elsif limit >= size
|
31
|
-
dup
|
32
|
-
else
|
33
|
-
self[-limit..]
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
alias starts_with? start_with?
|
38
|
-
alias ends_with? end_with?
|
39
|
-
end
|
40
|
-
|
41
|
-
#
|
42
|
-
# blank?/present?
|
43
|
-
#
|
44
|
-
|
45
|
-
class Object
|
46
|
-
def blank?
|
47
|
-
respond_to?(:empty?) ? !!empty? : !self
|
48
|
-
end
|
49
|
-
|
50
|
-
def present?
|
51
|
-
!blank?
|
52
|
-
end
|
53
|
-
end
|
54
|
-
|
55
|
-
class String
|
56
|
-
def blank?
|
57
|
-
!!(self =~ /\A\s*\z/)
|
58
|
-
end
|
59
|
-
end
|
data/lib/sinew/dsl.rb
DELETED
@@ -1,115 +0,0 @@
|
|
1
|
-
require 'amazing_print'
|
2
|
-
require 'cgi'
|
3
|
-
require 'json'
|
4
|
-
|
5
|
-
#
|
6
|
-
# The DSL available to .sinew files.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class DSL
|
11
|
-
# this is used to break out of --limit
|
12
|
-
class LimitError < StandardError; end
|
13
|
-
|
14
|
-
attr_reader :sinew, :uri, :raw, :code, :elapsed
|
15
|
-
|
16
|
-
def initialize(sinew)
|
17
|
-
@sinew = sinew
|
18
|
-
end
|
19
|
-
|
20
|
-
def run
|
21
|
-
tm = Time.now
|
22
|
-
begin
|
23
|
-
recipe = sinew.options[:recipe]
|
24
|
-
instance_eval(File.read(recipe, mode: 'rb'), recipe)
|
25
|
-
rescue LimitError
|
26
|
-
# ignore - this is flow control for --limit
|
27
|
-
end
|
28
|
-
@elapsed = Time.now - tm
|
29
|
-
end
|
30
|
-
|
31
|
-
#
|
32
|
-
# request
|
33
|
-
#
|
34
|
-
|
35
|
-
def get(url, query = {})
|
36
|
-
http('get', url, query: query)
|
37
|
-
end
|
38
|
-
|
39
|
-
def post(url, form = {})
|
40
|
-
body = form
|
41
|
-
headers = {
|
42
|
-
'Content-Type' => 'application/x-www-form-urlencoded',
|
43
|
-
}
|
44
|
-
http('post', url, body: body, headers: headers)
|
45
|
-
end
|
46
|
-
|
47
|
-
def post_json(url, json = {})
|
48
|
-
body = json.to_json
|
49
|
-
headers = {
|
50
|
-
'Content-Type' => 'application/json',
|
51
|
-
}
|
52
|
-
http('post', url, body: body, headers: headers)
|
53
|
-
end
|
54
|
-
|
55
|
-
def http(method, url, options = {})
|
56
|
-
# these need to be cleared before each request
|
57
|
-
%i[@html @noko @xml @json].each do |i|
|
58
|
-
instance_variable_set(i, nil)
|
59
|
-
end
|
60
|
-
|
61
|
-
# fetch and make response available to callers
|
62
|
-
response = sinew.http(method, url, options)
|
63
|
-
@uri, @raw, @code = response.uri, response.body, response.code
|
64
|
-
|
65
|
-
# don't confuse the user
|
66
|
-
nil
|
67
|
-
end
|
68
|
-
|
69
|
-
#
|
70
|
-
# response
|
71
|
-
#
|
72
|
-
|
73
|
-
def html
|
74
|
-
@html ||= begin
|
75
|
-
s = raw.dup
|
76
|
-
# squish!
|
77
|
-
s.squish!
|
78
|
-
# kill whitespace around tags
|
79
|
-
s.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
|
80
|
-
s
|
81
|
-
end
|
82
|
-
end
|
83
|
-
|
84
|
-
def noko
|
85
|
-
@noko ||= Nokogiri::HTML(html)
|
86
|
-
end
|
87
|
-
|
88
|
-
def xml
|
89
|
-
@xml ||= Nokogiri::XML(html)
|
90
|
-
end
|
91
|
-
|
92
|
-
def json
|
93
|
-
@json ||= JSON.parse(raw, symbolize_names: true)
|
94
|
-
end
|
95
|
-
|
96
|
-
def url
|
97
|
-
uri.to_s
|
98
|
-
end
|
99
|
-
|
100
|
-
#
|
101
|
-
# csv
|
102
|
-
#
|
103
|
-
|
104
|
-
def csv_header(*args)
|
105
|
-
sinew.output.header(args)
|
106
|
-
end
|
107
|
-
|
108
|
-
def csv_emit(row)
|
109
|
-
sinew.output.emit(row)
|
110
|
-
if sinew.output.count == sinew.options[:limit]
|
111
|
-
raise LimitError.new
|
112
|
-
end
|
113
|
-
end
|
114
|
-
end
|
115
|
-
end
|
data/lib/sinew/output.rb
DELETED
@@ -1,133 +0,0 @@
|
|
1
|
-
require 'csv'
|
2
|
-
require 'set'
|
3
|
-
require 'sterile'
|
4
|
-
|
5
|
-
#
|
6
|
-
# CSV output.
|
7
|
-
#
|
8
|
-
|
9
|
-
module Sinew
|
10
|
-
class Output
|
11
|
-
attr_reader :sinew, :columns, :rows, :urls, :csv
|
12
|
-
|
13
|
-
def initialize(sinew)
|
14
|
-
@sinew = sinew
|
15
|
-
@rows = []
|
16
|
-
@urls = Set.new
|
17
|
-
end
|
18
|
-
|
19
|
-
def filename
|
20
|
-
@filename ||= begin
|
21
|
-
recipe = sinew.options[:recipe]
|
22
|
-
ext = File.extname(recipe)
|
23
|
-
if ext.empty?
|
24
|
-
"#{recipe}.csv"
|
25
|
-
else
|
26
|
-
recipe.gsub(ext, '.csv')
|
27
|
-
end
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def header(columns)
|
32
|
-
sinew.banner("Writing to #{filename}...") if !sinew.quiet?
|
33
|
-
|
34
|
-
columns = columns.flatten
|
35
|
-
@columns = columns
|
36
|
-
|
37
|
-
# open csv, write header row
|
38
|
-
@csv = CSV.open(filename, 'wb')
|
39
|
-
csv << columns
|
40
|
-
end
|
41
|
-
|
42
|
-
def emit(row)
|
43
|
-
# implicit header if necessary
|
44
|
-
header(row.keys) if !csv
|
45
|
-
|
46
|
-
# don't allow duplicate urls
|
47
|
-
return if dup_url?(row)
|
48
|
-
|
49
|
-
rows << row.dup
|
50
|
-
|
51
|
-
# map columns to row, and normalize along the way
|
52
|
-
print = {}
|
53
|
-
row = columns.map do |i|
|
54
|
-
value = normalize(row[i])
|
55
|
-
print[i] = value if value.present?
|
56
|
-
value
|
57
|
-
end
|
58
|
-
|
59
|
-
# print
|
60
|
-
sinew.vputs print.ai
|
61
|
-
|
62
|
-
csv << row
|
63
|
-
csv.flush
|
64
|
-
end
|
65
|
-
|
66
|
-
def count
|
67
|
-
rows.length
|
68
|
-
end
|
69
|
-
|
70
|
-
def report
|
71
|
-
return if count == 0
|
72
|
-
|
73
|
-
sinew.banner("Got #{count} rows.")
|
74
|
-
|
75
|
-
# calculate counts
|
76
|
-
counts = Hash.new(0)
|
77
|
-
rows.each do |row|
|
78
|
-
row.each_pair { |k, v| counts[k] += 1 if v.present? }
|
79
|
-
end
|
80
|
-
# sort by counts
|
81
|
-
cols = columns.sort_by { |i| [ -counts[i], i ] }
|
82
|
-
|
83
|
-
# report
|
84
|
-
len = cols.map { |i| i.to_s.length }.max
|
85
|
-
fmt = " %-#{len + 1}s %7d / %-7d %6.1f%%\n"
|
86
|
-
cols.each do |col|
|
87
|
-
$stderr.printf(fmt, col, counts[col], count, counts[col] * 100.0 / count)
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
def normalize(s)
|
92
|
-
# noko/array/misc => string
|
93
|
-
s = case s
|
94
|
-
when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
|
95
|
-
s.inner_html
|
96
|
-
when Array
|
97
|
-
s.map(&:to_s).join('|')
|
98
|
-
else
|
99
|
-
s.to_s
|
100
|
-
end
|
101
|
-
|
102
|
-
# strip html tags. Note that we replace tags with spaces
|
103
|
-
s = s.gsub(/<[^>]+>/, ' ')
|
104
|
-
|
105
|
-
# Converts MS Word 'smart punctuation' to ASCII
|
106
|
-
s = Sterile.plain_format(s)
|
107
|
-
|
108
|
-
# &aacute; &amp; etc.
|
109
|
-
s = Sterile.decode_entities(s)
|
110
|
-
|
111
|
-
# "šţɽĩɳģ" => "string"
|
112
|
-
s = Sterile.transliterate(s)
|
113
|
-
|
114
|
-
# squish
|
115
|
-
s = s.squish
|
116
|
-
|
117
|
-
s
|
118
|
-
end
|
119
|
-
protected :normalize
|
120
|
-
|
121
|
-
def dup_url?(row)
|
122
|
-
if url = row[:url]
|
123
|
-
if urls.include?(url)
|
124
|
-
sinew.warning("duplicate url: #{url}") if !sinew.quiet?
|
125
|
-
return true
|
126
|
-
end
|
127
|
-
urls << url
|
128
|
-
end
|
129
|
-
false
|
130
|
-
end
|
131
|
-
protected :dup_url?
|
132
|
-
end
|
133
|
-
end
|