sinew 1.0.4 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +49 -0
- data/.travis.yml +4 -0
- data/.vscode/extensions.json +3 -0
- data/.vscode/settings.json +15 -0
- data/Gemfile +1 -1
- data/README.md +153 -12
- data/Rakefile +13 -14
- data/bin/sinew +40 -20
- data/lib/sinew.rb +10 -6
- data/lib/sinew/cache.rb +79 -0
- data/lib/sinew/core_ext.rb +59 -0
- data/lib/sinew/dsl.rb +98 -0
- data/lib/sinew/main.rb +80 -149
- data/lib/sinew/nokogiri_ext.rb +10 -9
- data/lib/sinew/output.rb +126 -0
- data/lib/sinew/request.rb +148 -0
- data/lib/sinew/response.rb +75 -0
- data/lib/sinew/runtime_options.rb +26 -0
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +5 -3
- data/sinew.gemspec +24 -19
- data/test/test.html +40 -34
- data/test/test_cache.rb +69 -0
- data/test/test_helper.rb +113 -0
- data/test/test_main.rb +36 -91
- data/test/test_nokogiri_ext.rb +14 -15
- data/test/test_output.rb +73 -0
- data/test/test_requests.rb +135 -0
- data/test/test_utf8.rb +39 -0
- metadata +103 -48
- data/lib/sinew/curler.rb +0 -173
- data/lib/sinew/text_util.rb +0 -101
- data/lib/sinew/util.rb +0 -236
- data/test/helper.rb +0 -64
- data/test/test_curler.rb +0 -70
- data/test/test_text_util.rb +0 -23
@@ -0,0 +1,59 @@
|
|
1
|
+
#
|
2
|
+
# A few core extensions brought over from ActiveSupport. These are handy for
|
3
|
+
# parsing.
|
4
|
+
#
|
5
|
+
|
6
|
+
class String
  # Returns a copy of the string with leading/trailing whitespace removed and
  # every internal run of whitespace collapsed to a single space.
  def squish
    dup.squish!
  end

  # In-place version of #squish. Always returns self (never nil, unlike the
  # bang methods it is built from), so the result can be used directly.
  def squish!
    # Collapse before stripping: [[:space:]] also matches Unicode whitespace
    # such as a non-breaking space (U+00A0), which strip! and \s both ignore.
    # Once such characters are turned into plain spaces, strip! can trim them.
    gsub!(/[[:space:]]+/, ' ')
    strip!
    self
  end

  # Returns a new string holding the first +limit+ characters (default 1).
  # A limit of 0 gives '', a limit of at least the string size gives a copy
  # of the whole string.
  def first(limit = 1)
    return '' if limit == 0
    return dup if limit >= size

    self[0..limit - 1]
  end

  # Returns a new string holding the last +limit+ characters (default 1).
  # A limit of 0 gives '', a limit of at least the string size gives a copy
  # of the whole string.
  def last(limit = 1)
    return '' if limit == 0
    return dup if limit >= size

    self[-limit..-1]
  end

  # ActiveSupport-style spellings of the built-in predicates.
  alias starts_with? start_with?
  alias ends_with? end_with?
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# blank?/present?
|
43
|
+
#
|
44
|
+
|
45
|
+
class Object
  # True when the object is "empty": for anything that responds to #empty?
  # the answer is that method's result (coerced to true/false); for
  # everything else only nil and false are blank.
  def blank?
    return !self unless respond_to?(:empty?)

    !!empty?
  end

  # Opposite of #blank?.
  def present?
    !blank?
  end
end

class String
  # A string is blank when it is empty or contains only whitespace.
  # [[:space:]] is used instead of \s so that Unicode whitespace such as a
  # non-breaking space (U+00A0) counts as blank too.
  def blank?
    !!(self =~ /\A[[:space:]]*\z/)
  end
end
|
data/lib/sinew/dsl.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'awesome_print'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
#
|
5
|
+
# The DSL available to .sinew files.
|
6
|
+
#
|
7
|
+
|
8
|
+
module Sinew
  #
  # Evaluation context for a recipe: #run instance_evals the recipe file
  # against an instance of this class, so the methods below can be called
  # bare from the recipe.
  #
  class DSL
    attr_reader :sinew, :raw, :uri, :elapsed

    # sinew - the object that requests and csv output are delegated to. The
    #         methods below call #options, #http and #output on it.
    def initialize(sinew)
      @sinew = sinew
    end

    # Read the recipe named by sinew.options[:recipe], evaluate it in the
    # context of this object, and store the time it took in #elapsed.
    def run
      started_at = Time.now
      recipe = sinew.options[:recipe]
      # the path is passed as the second argument so that errors raised by the
      # recipe are reported against the recipe file
      instance_eval(File.read(recipe, mode: 'rb'), recipe)
      @elapsed = Time.now - started_at
    end

    #
    # request
    #

    # GET url, with +query+ forwarded as the :query option. Returns the
    # response body (see #http).
    def get(url, query = {})
      http('get', url, query: query)
    end

    # POST +form+ to url, flagged as application/x-www-form-urlencoded. The
    # form is handed over unencoded. Returns the response body.
    def post(url, form = {})
      headers = { 'Content-Type' => 'application/x-www-form-urlencoded' }
      http('post', url, body: form, headers: headers)
    end

    # POST +json+ serialized with #to_json, flagged as application/json.
    # Returns the response body.
    def post_json(url, json = {})
      headers = { 'Content-Type' => 'application/json' }
      http('post', url, body: json.to_json, headers: headers)
    end

    # Issue a request through sinew.http and make the result the "current"
    # response: #uri and #raw are replaced, and the memoized #html, #noko and
    # #json views are dropped so they get rebuilt from the new body on demand.
    # Returns the response body.
    def http(method, url, options = {})
      # reset (@url is cleared here as well, although nothing in this class
      # assigns it)
      @html = @noko = @json = @url = nil

      # fetch
      response = sinew.http(method, url, options)

      # respond
      @uri = response.uri
      @raw = response.body
    end

    #
    # response
    #

    # The current body with whitespace squished (String#squish! is a core
    # extension defined outside this class) and the single spaces left next
    # to tags removed. Memoized until the next request.
    def html
      @html ||= begin
        tidy = raw.dup
        # squish!
        tidy.squish!
        # kill whitespace around tags
        tidy.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
        tidy
      end
    end

    # #html parsed with Nokogiri::HTML. Memoized until the next request.
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    # #raw parsed as JSON with symbolized keys. Memoized until the next
    # request.
    def json
      @json ||= JSON.parse(raw, symbolize_names: true)
    end

    # String form of #uri.
    def url
      uri.to_s
    end

    #
    # csv
    #

    # Declare the csv columns. Accepts either a list of names or a single
    # array of names; the arguments are flattened so both forms hand
    # sinew.output.header one flat array.
    def csv_header(*args)
      sinew.output.header(args.flatten)
    end

    # Hand one row to sinew.output.emit.
    def csv_emit(row)
      sinew.output.emit(row)
    end
  end
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,188 +1,119 @@
|
|
1
|
-
require
|
2
|
-
require "awesome_print"
|
3
|
-
require "cgi"
|
4
|
-
require "csv"
|
5
|
-
require "htmlentities"
|
6
|
-
require "stringex"
|
1
|
+
require 'scripto'
|
7
2
|
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
#
|
4
|
+
# Main sinew entry point.
|
5
|
+
#
|
11
6
|
|
12
|
-
|
7
|
+
module Sinew
  #
  # Ties the pieces together: owns the DSL, cache and output objects and
  # performs http requests on behalf of the DSL.
  #
  class Main < Scripto::Main
    attr_reader :runtime_options, :request_tm, :request_count

    def initialize(options)
      super(options)

      # request bookkeeping starts from scratch: default runtime options, a
      # "last request" time at the epoch and no requests performed
      @runtime_options = RuntimeOptions.new
      @request_tm = Time.at(0)
      @request_count = 0
    end

    # Run the recipe through the DSL, then print the footer unless quiet.
    def run
      dsl.run
      footer unless quiet?
    end

    # Value of the :quiet option.
    # NOTE(review): #options is not defined in this class - presumably it
    # comes from Scripto::Main; confirm.
    def quiet?
      options[:quiet]
    end

    # Lazily built DSL bound to this instance.
    def dsl
      @dsl ||= DSL.new(self)
    end

    #
    # http requests and caching
    #

    # Lazily built Cache bound to this instance.
    def cache
      @cache ||= Cache.new(self)
    end

    # Build a Request and answer it from the cache when possible; otherwise
    # perform it and hand the response to the cache. Returns the response.
    def http(method, url, options = {})
      request = Request.new(self, method, url, options)

      response = cache.get(request)
      unless response
        # nothing cached - go to the network and remember the result
        response = perform(request)
        cache.set(response)
      end

      # errors are reported regardless of the quiet setting
      puts "xxx http request failed with #{response.code}" if response.error?

      response
    end

    # Perform the request, making up to runtime_options.retries + 1 attempts
    # for as long as the response reports error_500?. A Timeout::Error is
    # converted into a response via Response.from_timeout. Every attempt is
    # counted in request_count.
    def perform(request)
      before_perform_request(request)

      response = nil
      attempts_left = runtime_options.retries + 1
      until attempts_left <= 0
        attempts_left -= 1
        @request_count += 1
        response = begin
          request.perform
        rescue Timeout::Error
          Response.from_timeout(request)
        end
        break unless response.error_500?
      end

      response
    end

    #
    # output
    #

    # Lazily built Output bound to this instance.
    def output
      @output ||= Output.new(self)
    end

    #
    # helpers
    #

    # Called once per #perform, before the first attempt: logs the request to
    # stderr (unless quiet) and sleeps as needed so that at least
    # runtime_options.rate_limit elapses since the previously recorded
    # request time.
    def before_perform_request(request)
      unless quiet?
        line = request.method == 'get' ? "req #{request.uri}" : "req #{request.uri} (#{request.method})"
        $stderr.puts line
      end

      # rate limit
      delay = (request_tm + runtime_options.rate_limit) - Time.now
      sleep(delay) if delay > 0
      @request_tm = Time.now
    end

    # Print the output report and a closing banner. The banner names the
    # output file only when output.count is positive.
    # NOTE(review): #banner is not defined in this class - presumably it
    # comes from Scripto::Main; confirm.
    def footer
      output.report
      finished = 'Finished'
      finished = "Finished #{output.filename}" if output.count > 0
      banner("#{finished} in #{dsl.elapsed.to_i}s.")
    end

    protected :perform, :before_perform_request, :footer
  end
end
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,15 +1,16 @@
|
|
1
|
-
require
|
1
|
+
require 'nokogiri'
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
4
|
class Nokogiri::XML::NodeSet
  # keep the stock implementations reachable under old_* names
  alias_method :old_inner_html, :inner_html
  alias_method :old_inner_text, :inner_text

  # Text of every node in the set, separated by single spaces.
  def inner_text
    map { |node| node.inner_text }.join(' ')
  end

  # Inner html of every node in the set, separated by single spaces. Any
  # arguments are forwarded to each node's inner_html.
  def inner_html(*args)
    map { |node| node.inner_html(*args) }.join(' ')
  end
end
|
15
16
|
|
@@ -17,11 +18,11 @@ end
|
|
17
18
|
class Nokogiri::XML::Node
  # Text of the first direct child that is a text node, or nil when the node
  # has no text node among its children.
  def text_just_me
    children.each do |child|
      return child.text if child.node_type == Nokogiri::XML::Node::TEXT_NODE
    end
    nil
  end
end
|
23
24
|
class Nokogiri::XML::NodeSet
  # text_just_me of every node in the set, joined with single spaces.
  def text_just_me
    map { |node| node.text_just_me }.join(' ')
  end
end
|