sinew 1.0.4 → 2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,59 @@
1
+ #
2
+ # A few core extensions brought over from ActiveSupport. These are handy for
3
+ # parsing.
4
+ #
5
+
6
class String
  # Returns a copy of the string with surrounding whitespace removed and every
  # internal run of whitespace collapsed to a single space. The receiver is
  # left unchanged.
  def squish
    dup.squish!
  end

  # In-place version of #squish. Always returns self (never nil, unlike the
  # bang methods it is built from), so the result can be chained.
  #
  # Whitespace is matched with the POSIX [[:space:]] class instead of \s so
  # that Unicode whitespace (for example the U+00A0 no-break space) is
  # collapsed as well, which is what ActiveSupport's squish does. Runs are
  # collapsed before stripping because strip! itself only removes ASCII
  # whitespace.
  def squish!
    gsub!(/[[:space:]]+/, ' ')
    strip!
    self
  end

  # Returns the first +limit+ characters as a new string: '' when limit is 0,
  # a copy of the whole string when limit is at least the string's size.
  def first(limit = 1)
    return '' if limit == 0
    return dup if limit >= size

    self[0..limit - 1]
  end

  # Returns the last +limit+ characters as a new string: '' when limit is 0,
  # a copy of the whole string when limit is at least the string's size.
  def last(limit = 1)
    return '' if limit == 0
    return dup if limit >= size

    self[-limit..-1]
  end

  # ActiveSupport-style plural spellings of the built-in predicates.
  alias starts_with? start_with?
  alias ends_with? end_with?
end
40
+
41
+ #
42
+ # blank?/present?
43
+ #
44
+
45
class Object
  # An object is blank when it reports itself empty (if it responds to
  # empty?) or, failing that, when it is falsy (nil or false). Always returns
  # a strict true/false.
  def blank?
    return !self unless respond_to?(:empty?)

    empty? ? true : false
  end

  # Logical opposite of blank?.
  def present?
    blank? ? false : true
  end
end
54
+
55
class String
  # A string is blank when it is empty or contains nothing but whitespace.
  #
  # The POSIX [[:space:]] class is used instead of \s so that Unicode
  # whitespace (for example the U+00A0 no-break space) also counts as blank,
  # which is how ActiveSupport's String#blank? behaves. Returns a strict
  # true/false.
  def blank?
    !!(self =~ /\A[[:space:]]*\z/)
  end
end
@@ -0,0 +1,98 @@
1
+ require 'awesome_print'
2
+ require 'cgi'
3
+
4
+ #
5
+ # The DSL available to .sinew files.
6
+ #
7
+
8
module Sinew
  # Evaluation context for a recipe: the recipe file is instance_eval'd
  # against a DSL object, so every public method here is callable from the
  # recipe. Requests are delegated to the owning sinew object and the most
  # recent response is exposed through raw/html/noko/json/uri/url.
  class DSL
    attr_reader :sinew, :raw, :uri, :elapsed

    # sinew - the owning object; must provide options, http and output.
    def initialize(sinew)
      @sinew = sinew
    end

    # Reads the file named by sinew.options[:recipe] and evaluates it in the
    # context of this object. The wall-clock duration (seconds) is stored in
    # #elapsed and returned.
    def run
      started_at = Time.now
      recipe_path = sinew.options[:recipe]
      # pass the path as the eval file name so backtraces point at the recipe
      instance_eval(File.read(recipe_path, mode: 'rb'), recipe_path)
      @elapsed = Time.now - started_at
    end

    #
    # request
    #

    # GET url, passing +query+ through as the :query option.
    def get(url, query = {})
      http('get', url, query: query)
    end

    # POST +form+ as the request body with a form-urlencoded content type.
    def post(url, form = {})
      form_headers = { 'Content-Type' => 'application/x-www-form-urlencoded' }
      http('post', url, body: form, headers: form_headers)
    end

    # POST +json+ serialized with to_json, with a JSON content type.
    def post_json(url, json = {})
      json_headers = { 'Content-Type' => 'application/json' }
      http('post', url, body: json.to_json, headers: json_headers)
    end

    # Issues the request through sinew.http and makes the response current.
    # Returns the response body.
    def http(method, url, options = {})
      # drop everything memoized from the previous response
      @html = nil
      @noko = nil
      @json = nil
      @url = nil

      response = sinew.http(method, url, options)

      @uri = response.uri
      @raw = response.body
    end

    #
    # response
    #

    # Memoized, whitespace-normalized copy of raw: squished, then with the
    # single space on either side of each tag removed. raw is not modified.
    def html
      @html ||= raw.dup.tap do |text|
        text.squish!
        text.gsub!(/ ?<([^>]+)> ?/, '<\\1>')
      end
    end

    # Memoized Nokogiri document parsed from #html.
    def noko
      @noko ||= Nokogiri::HTML(html)
    end

    # Memoized result of parsing raw as JSON, with symbol keys.
    def json
      @json ||= JSON.parse(raw, symbolize_names: true)
    end

    # String form of the current response uri.
    def url
      uri.to_s
    end

    #
    # csv
    #

    # Forwards the column names, as an array, to sinew.output.header.
    def csv_header(*args)
      sinew.output.header(args)
    end

    # Forwards one row to sinew.output.emit.
    def csv_emit(row)
      sinew.output.emit(row)
    end
  end
end
@@ -1,188 +1,119 @@
1
- require "nokogiri" # must be loaded before awesome_print
2
- require "awesome_print"
3
- require "cgi"
4
- require "csv"
5
- require "htmlentities"
6
- require "stringex"
1
+ require 'scripto'
7
2
 
8
- module Sinew
9
- class Main
10
- CODER = HTMLEntities.new
3
+ #
4
+ # Main sinew entry point.
5
+ #
11
6
 
12
- attr_accessor :url, :uri, :raw
7
module Sinew
  # Top-level object for a sinew run. It owns the DSL, cache and output
  # helpers (each created lazily) and performs HTTP requests on behalf of the
  # DSL, with caching, retries and rate limiting.
  #
  # NOTE(review): options and banner are not defined in this class; they are
  # presumably inherited from Scripto::Main - confirm.
  class Main < Scripto::Main
    attr_reader :runtime_options, :request_tm, :request_count

    def initialize(options)
      super(options)

      @runtime_options = RuntimeOptions.new
      @request_count = 0
      # start at the epoch so the first request should not wait on the rate limit
      @request_tm = Time.at(0)
    end

    # Runs the recipe through the DSL, then prints the footer unless quiet.
    def run
      dsl.run
      footer unless quiet?
    end

    def quiet?
      options[:quiet]
    end

    def dsl
      @dsl ||= DSL.new(self)
    end

    #
    # http requests and caching
    #

    def cache
      @cache ||= Cache.new(self)
    end

    # Builds a Request and returns its response, from the cache when one is
    # available and otherwise by performing the request and caching the
    # result. Failed responses are always reported, cached or not.
    def http(method, url, options = {})
      request = Request.new(self, method, url, options)

      response = cache.get(request)
      unless response
        response = perform(request)
        cache.set(response)
      end

      puts "xxx http request failed with #{response.code}" if response.error?

      response
    end

    # Performs the request, trying up to runtime_options.retries + 1 times
    # for as long as the response reports error_500?. A Timeout::Error is
    # converted into a response via Response.from_timeout. Every attempt is
    # counted in request_count. Returns the last response obtained.
    def perform(request)
      before_perform_request(request)

      response = nil
      attempts_left = runtime_options.retries + 1
      until attempts_left <= 0
        attempts_left -= 1
        begin
          @request_count += 1
          response = request.perform
        rescue Timeout::Error
          response = Response.from_timeout(request)
        end
        break unless response.error_500?
      end

      response
    end

    #
    # output
    #

    def output
      @output ||= Output.new(self)
    end

    #
    # helpers
    #

    # Logs the request to stderr (unless quiet) and then waits until at least
    # runtime_options.rate_limit seconds have passed since the previous
    # request before recording the new request time.
    def before_perform_request(request)
      unless quiet?
        line = request.method == 'get' ? "req #{request.uri}" : "req #{request.uri} (#{request.method})"
        $stderr.puts line
      end

      delay = (request_tm + runtime_options.rate_limit) - Time.now
      sleep(delay) if delay > 0
      @request_tm = Time.now
    end

    # Prints the output report and a closing banner with the elapsed time;
    # the output filename is mentioned only when at least one row was counted.
    def footer
      output.report
      finished = output.count > 0 ? "Finished #{output.filename}" : 'Finished'
      banner("#{finished} in #{dsl.elapsed.to_i}s.")
    end

    protected :perform, :before_perform_request, :footer
  end
end
@@ -1,15 +1,16 @@
1
- require "nokogiri"
1
+ require 'nokogiri'
2
2
 
3
3
  # modify NodeSet to join with SPACE instead of empty string
4
4
  class Nokogiri::XML::NodeSet
5
- alias :old_inner_html :inner_html
6
- alias :old_inner_text :inner_text
7
-
5
+ alias old_inner_html inner_html
6
+ alias old_inner_text inner_text
7
+
8
8
  def inner_text
9
- collect { |i| i.inner_text }.join(" ")
9
+ map(&:inner_text).join(' ')
10
10
  end
11
- def inner_html *args
12
- collect { |i| i.inner_html(*args) }.join(" ")
11
+
12
+ def inner_html(*args)
13
+ map { |i| i.inner_html(*args) }.join(' ')
13
14
  end
14
15
  end
15
16
 
@@ -17,11 +18,11 @@ end
17
18
  class Nokogiri::XML::Node
18
19
  def text_just_me
19
20
  t = children.find { |i| i.node_type == Nokogiri::XML::Node::TEXT_NODE }
20
- t && t.text
21
+ t&.text
21
22
  end
22
23
  end
23
24
  class Nokogiri::XML::NodeSet
24
25
  def text_just_me
25
- map { |i| i.text_just_me }.join(" ")
26
+ map(&:text_just_me).join(' ')
26
27
  end
27
28
  end