sinew 1.0.4 → 2.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +1 -0
- data/.rubocop.yml +49 -0
- data/.travis.yml +4 -0
- data/.vscode/extensions.json +3 -0
- data/.vscode/settings.json +15 -0
- data/Gemfile +1 -1
- data/README.md +153 -12
- data/Rakefile +13 -14
- data/bin/sinew +40 -20
- data/lib/sinew.rb +10 -6
- data/lib/sinew/cache.rb +79 -0
- data/lib/sinew/core_ext.rb +59 -0
- data/lib/sinew/dsl.rb +98 -0
- data/lib/sinew/main.rb +80 -149
- data/lib/sinew/nokogiri_ext.rb +10 -9
- data/lib/sinew/output.rb +126 -0
- data/lib/sinew/request.rb +148 -0
- data/lib/sinew/response.rb +75 -0
- data/lib/sinew/runtime_options.rb +26 -0
- data/lib/sinew/version.rb +1 -1
- data/sample.sinew +5 -3
- data/sinew.gemspec +24 -19
- data/test/test.html +40 -34
- data/test/test_cache.rb +69 -0
- data/test/test_helper.rb +113 -0
- data/test/test_main.rb +36 -91
- data/test/test_nokogiri_ext.rb +14 -15
- data/test/test_output.rb +73 -0
- data/test/test_requests.rb +135 -0
- data/test/test_utf8.rb +39 -0
- metadata +103 -48
- data/lib/sinew/curler.rb +0 -173
- data/lib/sinew/text_util.rb +0 -101
- data/lib/sinew/util.rb +0 -236
- data/test/helper.rb +0 -64
- data/test/test_curler.rb +0 -70
- data/test/test_text_util.rb +0 -23
@@ -0,0 +1,59 @@
|
|
1
|
+
#
|
2
|
+
# A few core extensions brought over from ActiveSupport. These are handy for
|
3
|
+
# parsing.
|
4
|
+
#
|
5
|
+
|
6
|
+
class String
  # Returns a copy of the string with leading/trailing whitespace removed and
  # every internal run of whitespace collapsed to a single space. The receiver
  # is left untouched.
  def squish
    dup.squish!
  end

  # In-place version of #squish. Always returns self (never nil), so it is
  # safe to chain.
  #
  # Uses the POSIX bracket class [[:space:]] rather than \s: in Ruby, \s only
  # matches ASCII whitespace, so non-breaking spaces (U+00A0, i.e. &nbsp;) and
  # other Unicode whitespace would otherwise survive. Collapsing happens first
  # so that strip! only has a single plain space left to trim at either end.
  def squish!
    gsub!(/[[:space:]]+/, ' ')
    strip!
    self
  end

  # Returns the first +limit+ characters as a new string (default: 1).
  # A limit of 0 yields '', and a limit at or beyond the string's size yields
  # a copy of the whole string.
  #
  # NOTE(review): negative limits are not validated; they fall through to the
  # range slice below (e.g. 'hello'.first(-1) => "hell").
  def first(limit = 1)
    if limit == 0
      ''
    elsif limit >= size
      dup
    else
      self[0..limit - 1]
    end
  end

  # Returns the last +limit+ characters as a new string (default: 1).
  # A limit of 0 yields '', and a limit at or beyond the string's size yields
  # a copy of the whole string.
  #
  # NOTE(review): negative limits are not validated; they fall through to the
  # range slice below (e.g. 'hello'.last(-1) => "ello").
  def last(limit = 1)
    if limit == 0
      ''
    elsif limit >= size
      dup
    else
      self[-limit..-1]
    end
  end

  # ActiveSupport-style plural spellings of the built-in predicates.
  alias starts_with? start_with?
  alias ends_with? end_with?
end
|
40
|
+
|
41
|
+
#
|
42
|
+
# blank?/present?
|
43
|
+
#
|
44
|
+
|
45
|
+
class Object
  # True when the object is "empty-ish": objects that respond to #empty?
  # report that result (coerced to true/false); everything else is blank only
  # when it is falsy (nil or false).
  def blank?
    return !self unless respond_to?(:empty?)

    empty? ? true : false
  end

  # Exact opposite of #blank?.
  def present?
    !blank?
  end
end
|
54
|
+
|
55
|
+
class String
  # A string is blank when it is empty or consists solely of whitespace.
  # Always returns true or false.
  #
  # [[:space:]] is used instead of \s because \s only matches ASCII
  # whitespace in Ruby; strings made up of non-breaking spaces (U+00A0, i.e.
  # &nbsp;) or other Unicode whitespace must count as blank too.
  def blank?
    !!(self =~ /\A[[:space:]]*\z/)
  end
end
|
data/lib/sinew/dsl.rb
ADDED
@@ -0,0 +1,98 @@
|
|
1
|
+
require 'awesome_print'
|
2
|
+
require 'cgi'
|
3
|
+
|
4
|
+
#
|
5
|
+
# The DSL available to .sinew files.
|
6
|
+
#
|
7
|
+
|
8
|
+
module Sinew
  # Evaluation context for a recipe: the recipe source is instance_eval'd
  # against an instance of this class, so every public method below can be
  # called from the recipe. All real work is delegated to the +sinew+ object
  # handed to the constructor (it must provide #options, #http and #output).
  class DSL
    attr_reader :sinew, :raw, :uri, :elapsed

    def initialize(sinew)
      @sinew = sinew
    end

    # Reads the file named by sinew.options[:recipe] and evaluates it in the
    # context of this object. The wall-clock duration (in seconds) is stored
    # in #elapsed and returned.
    def run
      started_at = Time.now
      path = sinew.options[:recipe]
      source = File.read(path, mode: 'rb')
      # pass the path along so errors raised by the recipe point at it
      instance_eval(source, path)
      @elapsed = Time.now - started_at
    end

    #
    # request
    #

    # GET +url+, passing +query+ through as the :query option.
    def get(url, query = {})
      http('get', url, query: query)
    end

    # POST +form+ to +url+ with a form-urlencoded content type.
    def post(url, form = {})
      http(
        'post',
        url,
        body: form,
        headers: { 'Content-Type' => 'application/x-www-form-urlencoded' }
      )
    end

    # POST +json+ (serialized with #to_json) to +url+ with a JSON content type.
    def post_json(url, json = {})
      http(
        'post',
        url,
        body: json.to_json,
        headers: { 'Content-Type' => 'application/json' }
      )
    end

    # Issues a request through sinew.http and makes the response the current
    # one: #uri and #raw are replaced and the memoized #html / #noko / #json
    # values are dropped. Returns the response body.
    def http(method, url, options = {})
      # forget everything derived from the previous response
      @html = nil
      @noko = nil
      @json = nil
      @url = nil

      result = sinew.http(method, url, options)

      @uri = result.uri
      @raw = result.body
    end

    #
    # response
    #

    # The current body with whitespace squished and any single space directly
    # before or after a tag removed. Memoized until the next request.
    def html
      return @html if @html

      tidy = raw.dup
      tidy.squish!
      # kill whitespace around tags
      tidy.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
      @html = tidy
    end

    # The tidied #html parsed with Nokogiri::HTML. Memoized until the next
    # request.
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    # The current body parsed as JSON with symbol keys. Memoized until the
    # next request.
    def json
      @json ||= JSON.parse(raw, symbolize_names: true)
    end

    # String form of the current response's uri.
    def url
      uri.to_s
    end

    #
    # csv
    #

    # Hands the given column names (collected into an array) to
    # sinew.output.header.
    def csv_header(*args)
      sinew.output.header(args)
    end

    # Hands +row+ to sinew.output.emit.
    def csv_emit(row)
      sinew.output.emit(row)
    end
  end
end
|
data/lib/sinew/main.rb
CHANGED
@@ -1,188 +1,119 @@
|
|
1
|
-
require
|
2
|
-
require "awesome_print"
|
3
|
-
require "cgi"
|
4
|
-
require "csv"
|
5
|
-
require "htmlentities"
|
6
|
-
require "stringex"
|
1
|
+
require 'scripto'
|
7
2
|
|
8
|
-
|
9
|
-
|
10
|
-
|
3
|
+
#
|
4
|
+
# Main sinew entry point.
|
5
|
+
#
|
11
6
|
|
12
|
-
|
7
|
+
module Sinew
|
8
|
+
class Main < Scripto::Main
|
9
|
+
attr_reader :runtime_options, :request_tm, :request_count
|
13
10
|
|
14
11
|
def initialize(options)
|
15
|
-
|
16
|
-
_run if !@options[:test]
|
17
|
-
end
|
18
|
-
|
19
|
-
def get(url, params = nil)
|
20
|
-
_http(url, params, :get)
|
21
|
-
end
|
12
|
+
super(options)
|
22
13
|
|
23
|
-
|
24
|
-
|
14
|
+
# init
|
15
|
+
@runtime_options = RuntimeOptions.new
|
16
|
+
@request_tm = Time.at(0)
|
17
|
+
@request_count = 0
|
25
18
|
end
|
26
19
|
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
def html
|
32
|
-
@html ||= begin
|
33
|
-
s = TextUtil.html_tidy(@raw)
|
34
|
-
nelements = @raw.count("<")
|
35
|
-
if nelements > 1
|
36
|
-
# is there a problem with tidy?
|
37
|
-
percent = 100 * s.count("<") / nelements
|
38
|
-
if percent < 80
|
39
|
-
# bad xml processing instruction? Try fixing it.
|
40
|
-
maybe = TextUtil.html_tidy(@raw.gsub(/<\?[^>]*?>/, ""))
|
41
|
-
new_percent = 100 * maybe.count("<") / nelements
|
42
|
-
if new_percent > 80
|
43
|
-
# yes!
|
44
|
-
s = maybe
|
45
|
-
else
|
46
|
-
Util.warning "Hm - it looks like tidy ate some of your file (#{percent}%)" if percent < 90
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
s
|
51
|
-
end
|
20
|
+
def run
|
21
|
+
dsl.run
|
22
|
+
footer if !quiet?
|
52
23
|
end
|
53
24
|
|
54
|
-
def
|
55
|
-
|
25
|
+
def quiet?
|
26
|
+
options[:quiet]
|
56
27
|
end
|
57
28
|
|
58
|
-
def
|
59
|
-
@
|
29
|
+
def dsl
|
30
|
+
@dsl ||= DSL.new(self)
|
60
31
|
end
|
61
32
|
|
62
33
|
#
|
63
|
-
#
|
34
|
+
# http requests and caching
|
64
35
|
#
|
65
36
|
|
66
|
-
def
|
67
|
-
|
68
|
-
if args.first.is_a?(String)
|
69
|
-
file = args.shift
|
70
|
-
if file !~ /^\//
|
71
|
-
file = "#{File.dirname(@options[:file])}/#{file}"
|
72
|
-
end
|
73
|
-
else
|
74
|
-
file = @options[:file]
|
75
|
-
end
|
76
|
-
ext = File.extname(file)
|
77
|
-
file = ext.empty? ? "#{file}.csv" : file.gsub(ext, ".csv")
|
78
|
-
|
79
|
-
@path = file
|
80
|
-
@csv = CSV.open(file, "wb")
|
81
|
-
@csv_keys = args
|
82
|
-
@csv << @csv_keys
|
83
|
-
_banner("Writing to #{@path}...")
|
37
|
+
def cache
|
38
|
+
@cache ||= Cache.new(self)
|
84
39
|
end
|
85
40
|
|
86
|
-
def
|
87
|
-
|
41
|
+
def http(method, url, options = {})
|
42
|
+
request = Request.new(self, method, url, options)
|
88
43
|
|
89
|
-
|
90
|
-
|
91
|
-
s = _normalize(row[i], i)
|
92
|
-
print[i] = s if !s.empty?
|
93
|
-
s
|
94
|
-
end
|
95
|
-
$stderr.puts print.ai if @options[:verbose]
|
96
|
-
@csv << row
|
97
|
-
@csv.flush
|
98
|
-
end
|
44
|
+
# try to get from cache
|
45
|
+
response = cache.get(request)
|
99
46
|
|
100
|
-
|
47
|
+
# perform if necessary
|
48
|
+
if !response
|
49
|
+
response = perform(request)
|
50
|
+
cache.set(response)
|
51
|
+
end
|
101
52
|
|
102
|
-
|
103
|
-
|
104
|
-
#
|
105
|
-
options = { user_agent: "sinew/#{VERSION}" }
|
106
|
-
options[:dir] = @options[:cache] if @options[:cache]
|
107
|
-
options[:verbose] = false if @options[:quiet]
|
108
|
-
Curler.new(options)
|
53
|
+
# always log error messages
|
54
|
+
if response.error?
|
55
|
+
puts "xxx http request failed with #{response.code}"
|
109
56
|
end
|
57
|
+
|
58
|
+
response
|
110
59
|
end
|
111
60
|
|
112
|
-
def
|
113
|
-
|
61
|
+
def perform(request)
|
62
|
+
before_perform_request(request)
|
114
63
|
|
115
|
-
|
116
|
-
if !File.exists?(file)
|
117
|
-
Util.fatal("#{file} not found")
|
118
|
-
end
|
64
|
+
response = nil
|
119
65
|
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
124
|
-
|
125
|
-
|
66
|
+
tries = runtime_options.retries + 1
|
67
|
+
while tries > 0
|
68
|
+
tries -= 1
|
69
|
+
begin
|
70
|
+
@request_count += 1
|
71
|
+
response = request.perform
|
72
|
+
rescue Timeout::Error
|
73
|
+
response = Response.from_timeout(request)
|
74
|
+
end
|
75
|
+
break if !response.error_500?
|
126
76
|
end
|
77
|
+
|
78
|
+
response
|
127
79
|
end
|
80
|
+
protected :perform
|
128
81
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
# decode entities
|
134
|
-
url = CODER.decode(url)
|
135
|
-
|
136
|
-
# handle params
|
137
|
-
body = nil
|
138
|
-
if params
|
139
|
-
q = params.map { |key, value| [CGI.escape(key.to_s), CGI.escape(value.to_s)] }.sort
|
140
|
-
q = q.map { |key, value| "#{key}=#{value}" }.join("&")
|
141
|
-
if method == :get
|
142
|
-
separator = url.include?(??) ? "&" : "?"
|
143
|
-
url = "#{url}#{separator}#{q}"
|
144
|
-
else
|
145
|
-
body = q
|
146
|
-
end
|
147
|
-
end
|
82
|
+
#
|
83
|
+
# output
|
84
|
+
#
|
148
85
|
|
149
|
-
|
150
|
-
|
151
|
-
|
86
|
+
def output
|
87
|
+
@output ||= Output.new(self)
|
88
|
+
end
|
89
|
+
|
90
|
+
#
|
91
|
+
# helpers
|
92
|
+
#
|
93
|
+
|
94
|
+
def before_perform_request(request)
|
95
|
+
# log
|
96
|
+
if !quiet?
|
97
|
+
msg = if request.method != 'get'
|
98
|
+
"req #{request.uri} (#{request.method})"
|
152
99
|
else
|
153
|
-
|
100
|
+
"req #{request.uri}"
|
154
101
|
end
|
155
|
-
|
156
|
-
rescue Curler::Error => e
|
157
|
-
$stderr.puts "xxx #{e.message}"
|
158
|
-
@raw = ""
|
102
|
+
$stderr.puts msg
|
159
103
|
end
|
160
104
|
|
161
|
-
#
|
162
|
-
|
163
|
-
|
164
|
-
@
|
165
|
-
@noko = nil
|
166
|
-
end
|
167
|
-
|
168
|
-
def _normalize(s, key = nil)
|
169
|
-
case s
|
170
|
-
when Nokogiri::XML::Element, Nokogiri::XML::NodeSet
|
171
|
-
s = s.inner_html
|
172
|
-
when Array
|
173
|
-
s = s.map { |j| j.to_s }.join("|")
|
174
|
-
else
|
175
|
-
s = s.to_s
|
176
|
-
end
|
177
|
-
s = TextUtil.untag(s)
|
178
|
-
s = s.convert_accented_html_entities
|
179
|
-
s = TextUtil.unent(s)
|
180
|
-
s = s.to_ascii.squish
|
181
|
-
s
|
105
|
+
# rate limit
|
106
|
+
sleep = (request_tm + runtime_options.rate_limit) - Time.now
|
107
|
+
sleep(sleep) if sleep > 0
|
108
|
+
@request_tm = Time.now
|
182
109
|
end
|
110
|
+
protected :before_perform_request
|
183
111
|
|
184
|
-
def
|
185
|
-
|
112
|
+
def footer
|
113
|
+
output.report
|
114
|
+
finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
|
115
|
+
banner("#{finished} in #{dsl.elapsed.to_i}s.")
|
186
116
|
end
|
117
|
+
protected :footer
|
187
118
|
end
|
188
119
|
end
|
data/lib/sinew/nokogiri_ext.rb
CHANGED
@@ -1,15 +1,16 @@
|
|
1
|
-
require
|
1
|
+
require 'nokogiri'
|
2
2
|
|
3
3
|
# modify NodeSet to join with SPACE instead of empty string
|
4
4
|
class Nokogiri::XML::NodeSet
|
5
|
-
alias
|
6
|
-
alias
|
7
|
-
|
5
|
+
alias old_inner_html inner_html
|
6
|
+
alias old_inner_text inner_text
|
7
|
+
|
8
8
|
def inner_text
|
9
|
-
|
9
|
+
map(&:inner_text).join(' ')
|
10
10
|
end
|
11
|
-
|
12
|
-
|
11
|
+
|
12
|
+
def inner_html(*args)
|
13
|
+
map { |i| i.inner_html(*args) }.join(' ')
|
13
14
|
end
|
14
15
|
end
|
15
16
|
|
@@ -17,11 +18,11 @@ end
|
|
17
18
|
class Nokogiri::XML::Node
|
18
19
|
def text_just_me
|
19
20
|
t = children.find { |i| i.node_type == Nokogiri::XML::Node::TEXT_NODE }
|
20
|
-
t
|
21
|
+
t&.text
|
21
22
|
end
|
22
23
|
end
|
23
24
|
class Nokogiri::XML::NodeSet
|
24
25
|
def text_just_me
|
25
|
-
map
|
26
|
+
map(&:text_just_me).join(' ')
|
26
27
|
end
|
27
28
|
end
|