sinew 1.0.4 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
1
+ #
2
+ # A few core extensions brought over from ActiveSupport. These are handy for
3
+ # parsing.
4
+ #
5
+
6
class String
  # Returns a copy of the string with leading/trailing whitespace stripped and
  # every run of whitespace collapsed to a single space. The receiver is not
  # modified.
  def squish
    dup.squish!
  end

  # In-place version of #squish. Always returns self (strip! and gsub! return
  # nil when nothing changed, so their results are deliberately not chained).
  def squish!
    strip!
    gsub!(/\s+/, ' ')
    self
  end

  # Returns the first +limit+ characters as a new string (default: 1).
  # A limit of 0 yields '', a limit >= size yields a copy of the whole string.
  #
  # Raises ArgumentError when +limit+ is negative. String#[start, length]
  # returns nil for a negative length, which is turned into the error here
  # instead of silently returning an arbitrary slice.
  def first(limit = 1)
    self[0, limit] || raise(ArgumentError, 'negative limit')
  end

  # Returns the last +limit+ characters as a new string (default: 1).
  # A limit of 0 yields '', a limit >= size yields a copy of the whole string.
  #
  # Raises ArgumentError when +limit+ is negative.
  def last(limit = 1)
    # clamp the start index at 0 so an oversized limit returns everything
    self[[size - limit, 0].max, limit] || raise(ArgumentError, 'negative limit')
  end

  alias starts_with? start_with?
  alias ends_with? end_with?
end
40
+
41
+ #
42
+ # blank?/present?
43
+ #
44
+
45
class Object
  # True when the object is "empty-ish": objects that respond to empty? are
  # blank when empty? is truthy; everything else is blank only when it is
  # falsy (nil or false). Always returns a real boolean.
  def blank?
    return !self unless respond_to?(:empty?)

    empty? ? true : false
  end

  # Logical opposite of blank?.
  def present?
    blank? ? false : true
  end
end
54
+
55
class String
  # A string is blank when it is empty or consists solely of whitespace.
  # Anchored with \A and \z so the whole string (not just one line) must match.
  def blank?
    !match(/\A\s*\z/).nil?
  end
end
@@ -0,0 +1,98 @@
1
+ require 'awesome_print'
2
+ require 'cgi'
3
+
4
+ #
5
+ # The DSL available to .sinew files.
6
+ #
7
+
8
module Sinew
  # Evaluation context for a recipe file: the recipe source is instance_eval'd
  # against an instance of this class, so every public method below is
  # callable from the recipe.
  class DSL
    attr_reader :sinew, :raw, :uri, :elapsed

    # sinew - the owning object; this class calls #options, #http and #output
    #         on it.
    def initialize(sinew)
      @sinew = sinew
    end

    # Reads the file named by sinew.options[:recipe], evaluates it in the
    # context of this object and records the wall-clock duration in #elapsed.
    def run
      # NOTE(review): local names are left unchanged on purpose - a string
      # passed to instance_eval may be able to see the caller's locals, so
      # renaming them could change what a recipe observes. Confirm before
      # touching.
      tm = Time.now
      recipe = sinew.options[:recipe]
      instance_eval(File.read(recipe, mode: 'rb'), recipe)
      @elapsed = Time.now - tm
    end

    #
    # request
    #

    # Issues a 'get' request, forwarding +query+ under the :query option.
    # Returns the response body (see #http).
    def get(url, query = {})
      http('get', url, query: query)
    end

    # Issues a 'post' request with +form+ as the body and a form-urlencoded
    # Content-Type header. Returns the response body (see #http).
    def post(url, form = {})
      http(
        'post',
        url,
        body: form,
        headers: { 'Content-Type' => 'application/x-www-form-urlencoded' }
      )
    end

    # Issues a 'post' request whose body is +json+ serialized with to_json and
    # a JSON Content-Type header. Returns the response body (see #http).
    def post_json(url, json = {})
      http(
        'post',
        url,
        body: json.to_json,
        headers: { 'Content-Type' => 'application/json' }
      )
    end

    # Delegates the request to sinew.http, then stores the response's uri and
    # body in #uri and #raw. Memoized views of the previous response are
    # cleared before the request is made. Returns the response body.
    def http(method, url, options = {})
      # forget anything derived from the previous response
      @html = nil
      @noko = nil
      @json = nil
      @url = nil

      reply = sinew.http(method, url, options)

      @uri = reply.uri
      @raw = reply.body
    end

    #
    # response
    #

    # Memoized, whitespace-normalized copy of #raw: squished, with the single
    # optional space on either side of each tag removed.
    def html
      return @html if @html

      text = raw.dup
      text.squish!
      # kill whitespace around tags
      text.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
      @html = text
    end

    # Memoized Nokogiri::HTML document built from #html.
    def noko
      return @noko if @noko

      @noko = Nokogiri::HTML(html)
    end

    # Memoized result of parsing #raw as JSON with symbolized keys.
    def json
      return @json if @json

      @json = JSON.parse(raw, symbolize_names: true)
    end

    # String form of #uri.
    def url
      uri.to_s
    end

    #
    # csv
    #

    # Passes the given column names, as one array, to sinew.output.header.
    def csv_header(*args)
      sinew.output.header(args)
    end

    # Passes +row+ to sinew.output.emit.
    def csv_emit(row)
      sinew.output.emit(row)
    end
  end
end
@@ -1,188 +1,119 @@
1
- require "nokogiri" # must be loaded before awesome_print
2
- require "awesome_print"
3
- require "cgi"
4
- require "csv"
5
- require "htmlentities"
6
- require "stringex"
1
+ require 'scripto'
7
2
 
8
- module Sinew
9
- class Main
10
- CODER = HTMLEntities.new
3
+ #
4
+ # Main sinew entry point.
5
+ #
11
6
 
12
- attr_accessor :url, :uri, :raw
7
module Sinew
  # Top-level coordinator: runs the recipe through the DSL, serves http
  # requests (cache first, then the network with retries and rate limiting)
  # and prints a summary at the end. RuntimeOptions, DSL, Cache, Request,
  # Response and Output are defined outside this block.
  class Main < Scripto::Main
    attr_reader :runtime_options, :request_tm, :request_count

    def initialize(options)
      super(options)

      # init
      @request_count = 0
      @request_tm = Time.at(0)
      @runtime_options = RuntimeOptions.new
    end

    # Runs the recipe, then prints the footer unless quiet.
    def run
      dsl.run
      footer unless quiet?
    end

    # Value of the :quiet option (truthy when output should be suppressed).
    def quiet?
      options[:quiet]
    end

    # Lazily built DSL bound to this instance.
    def dsl
      @dsl ||= DSL.new(self)
    end

    #
    # http requests and caching
    #

    # Lazily built Cache bound to this instance.
    def cache
      @cache ||= Cache.new(self)
    end

    # Builds a Request and returns its response. The cache is consulted
    # first; on a miss the request is performed and the result handed to
    # cache.set. A response reporting error? is always logged, whichever path
    # produced it.
    def http(method, url, options = {})
      request = Request.new(self, method, url, options)

      # cached copy if there is one, otherwise perform and store
      response = cache.get(request) || perform(request).tap { |fresh| cache.set(fresh) }

      # always log error messages
      puts "xxx http request failed with #{response.code}" if response.error?

      response
    end

    # Performs the request up to (retries + 1) times, stopping at the first
    # response that is not error_500?. A Timeout::Error is converted into
    # Response.from_timeout. Every attempt increments request_count. Returns
    # the last response obtained.
    def perform(request)
      before_perform_request(request)

      response = nil
      remaining = runtime_options.retries + 1

      while remaining.positive?
        remaining -= 1
        @request_count += 1
        response = begin
          request.perform
        rescue Timeout::Error
          Response.from_timeout(request)
        end
        break unless response.error_500?
      end

      response
    end
    protected :perform

    #
    # output
    #

    # Lazily built Output bound to this instance.
    def output
      @output ||= Output.new(self)
    end

    #
    # helpers
    #

    # Logs the outgoing request to stderr (unless quiet), then sleeps as long
    # as needed so that at least runtime_options.rate_limit seconds separate
    # this request from the previous request_tm, and finally stamps
    # request_tm with the current time.
    def before_perform_request(request)
      # log
      unless quiet?
        if request.method == 'get'
          $stderr.puts "req #{request.uri}"
        else
          $stderr.puts "req #{request.uri} (#{request.method})"
        end
      end

      # rate limit
      delay = (request_tm + runtime_options.rate_limit) - Time.now
      sleep(delay) if delay > 0
      @request_tm = Time.now
    end
    protected :before_perform_request

    # Asks the output for its report, then prints a "Finished" banner with the
    # recipe's elapsed time; the output filename is included only when at
    # least one row was counted.
    def footer
      output.report

      finished =
        if output.count > 0
          "Finished #{output.filename}"
        else
          'Finished'
        end
      banner("#{finished} in #{dsl.elapsed.to_i}s.")
    end
    protected :footer
  end
end
@@ -1,15 +1,16 @@
1
- require "nokogiri"
1
+ require 'nokogiri'
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
4
class Nokogiri::XML::NodeSet
  # keep the stock implementations reachable under old_* names
  alias_method :old_inner_html, :inner_html
  alias_method :old_inner_text, :inner_text

  # Text of every node in the set, joined with a single space.
  def inner_text
    map { |node| node.inner_text }.join(' ')
  end

  # Inner HTML of every node in the set (args are forwarded to each node's
  # inner_html), joined with a single space.
  def inner_html(*args)
    fragments = map { |node| node.inner_html(*args) }
    fragments.join(' ')
  end
end
15
16
 
@@ -17,11 +18,11 @@ end
17
18
class Nokogiri::XML::Node
  # Text of this node's first direct child that is a text node, or nil when
  # it has no such child. Text inside nested elements is not included.
  def text_just_me
    first_text = children.find do |child|
      child.node_type == Nokogiri::XML::Node::TEXT_NODE
    end
    first_text ? first_text.text : nil
  end
end
23
24
class Nokogiri::XML::NodeSet
  # text_just_me of every node in the set, joined with a single space
  # (nodes that return nil contribute an empty string to the join).
  def text_just_me
    pieces = map { |node| node.text_just_me }
    pieces.join(' ')
  end
end