monkeyshines 0.0.2
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
data/lib/monkeyshines/runner_core/parsing_runner.rb
@@ -0,0 +1,29 @@
+require 'yaml'
+require 'monkeyshines/runner_core/options'
+
+module Monkeyshines
+
+  #
+  # In general, you should scrape the raw results and parse them in a separate pass.
+  #
+  # But where an external library is already providing cooked results, or it's
+  # otherwise most straightforward to directly emit model objects, you can use
+  # a parsing runner.
+  #
+  class ParsingRunner < Runner
+
+    #
+    # Fetch and store the result.
+    #
+    #
+    def fetch_and_store req
+      result = fetcher.get(req)    # do the url fetch
+      # results.each do |result|
+      result.parse do |obj|
+        dest.save(obj)
+      end
+      #end
+    end
+
+  end
+end
data/lib/monkeyshines/scrape_job/old_paginated.rb
@@ -0,0 +1,343 @@
+# require 'time'
+# module Monkeyshines
+#   #
+#   # Paginated lets you make repeated requests to collect a timeline or
+#   # collection of items.
+#   #
+#   # You will typically want to set the hard_request_limit and max_items for your API.
+#   #
+#   # A Paginated-compatible ScrapeRequest should inherit from or be compatible
+#   # with +Monkeyshines::ScrapeRequest+ and additionally define
+#   # * [#items]     list of individual items in the response; +nil+ if there was an
+#   #                error, +[]+ if the response was well-formed but returned no items.
+#   # * [#num_items] number of items from this response
+#   # * [#span]      the range of (typically) IDs within this scrape. Used to know when
+#   #                we've reached results from a previous session.
+#   #
+#   #
+#   module Paginated
+#     #
+#     # Generates a request for each page to be scraped.
+#     #
+#     # The block must return the fulfilled scrape_request response. This response is
+#     # passed to +#acknowledge+ for bookkeeping, then the next request is
+#     # made.
+#     #
+#     # Scraping stops after max_pages requests or when is_last?(response, page) is true.
+#     #
+#     def each_request pageinfo={}, &block
+#       begin_pagination!
+#       (1..hard_request_limit).each do |page|
+#         response = yield make_request(page, pageinfo)
+#         warn 'nil response' unless response
+#         acknowledge(response, page)
+#         break if is_last?(response, page)
+#       end
+#       finish_pagination!
+#     end
+#
+#     # Set up bookkeeping for pagination tracking.
+#     def begin_pagination!
+#     end
+#
+#     # Finalize bookkeeping at the conclusion of the scrape_job.
+#     def finish_pagination!
+#     end
+#
+#     #
+#     # Feed back info from the scrape.
+#     #
+#     def acknowledge response, page
+#     end
+#
+#     # Return true if the next request would be pointless (true if, perhaps, the
+#     # response had no items, or the API page limit is reached).
+#     def is_last? response, page
+#       ( (page >= max_pages) ||
+#         (response && response.healthy? && (response.num_items < max_items)) )
+#     end
+#
+#     #
+#     # Soft limit on the number of pages to scrape.
+#     #
+#     # Typically, leave this set to the hard_request_limit if you don't know
+#     # beforehand how many pages to scrape, and override is_last? to decide when
+#     # to stop short of the API limit.
+#     #
+#     def max_pages
+#       hard_request_limit
+#     end
+#
+#     # inject class variables
+#     def self.included base
+#       base.class_eval do
+#         # Hard request limit: do not in any case exceed this number of requests
+#         class_inheritable_accessor :hard_request_limit
+#         # max items per page, from the API
+#         class_inheritable_accessor :max_items
+#         #
+#         # Span of items gathered in this scrape_job.
+#         attr_accessor :sess_items, :sess_span, :sess_timespan
+#       end
+#     end
+#   end
+#
+#   module PaginatedTimeline
+#     # Soft limit on the number of pages to scrape.
+#     #
+#     # Typically, leave this set to the hard_request_limit if you don't know
+#     # beforehand how many pages to scrape, and override is_last? to decide when
+#     # to stop short of the API limit.
+#     #
+#     def max_pages
+#       mp = fudge_factor * (n_items - prev_scraped_items) / max_items
+#       return 0 if mp == 0
+#       (mp+1).clamp(1, hard_request_limit).to_i
+#     end
+#     # inject class variables
+#     def self.included base
+#       base.class_eval do
+#         include Monkeyshines::Paginated
+#       end
+#     end
+#
+#     # #
+#     # # Threshold count-per-page against the actual count to get the number of expected pages.
+#     # # Cap the request with max.
+#     # def pages_from_count per_page, count, max=nil
+#     #   num = [ (count.to_f / per_page.to_f).ceil, 0 ].max
+#     #   [num, max].compact.min
+#     # end
+#   end
+#
+#   #
+#   # Scenario: you make paginated search requests with a limit parameter (a
+#   # max_id or min_id, for example).
+#   #
+#   # * request successive pages,
+#   # * use info on the requested page to set the next limit parameter,
+#   # * stop when max_pages is reached or a successful request gives fewer than
+#   #   max_items.
+#   #
+#   #
+#   # The first request is made with the max_id left blank:
+#   #
+#   #   req?min_id=1234&max_id=
+#   #   => [ [8675, ...], ..., [8012, ...] ]  # 100 items
+#   #   req?min_id=1234&max_id=8011
+#   #   => [ [7581, ...], ..., [2044, ...] ]  # 100 items
+#   #   req?min_id=1234&max_id=2043
+#   #   => [ [2012, ...], ..., [1234, ...] ]  # 69 items
+#   #
+#   # * The search terminates when
+#   # ** max_requests requests have been made, or
+#   # ** the limit params' interval is zero, or
+#   # ** a successful response with fewer than max_items is received.
+#   #
+#   # * You will want to save <req?min_id=8676&max_id=""> for a later scrape.
+#   #
+#   module PaginatedWithLimit
+#
+#     #
+#     # Return true if the next request would be pointless (true if, perhaps, the
+#     # response had no items, or the API page limit is reached).
+#     def is_last? response, page
+#       unscraped_span.empty? || super(response, page)
+#     end
+#
+#     # Set up bookkeeping for pagination tracking.
+#     def begin_pagination!
+#       self.sess_items ||= 0
+#       self.sess_span     = UnionInterval.new
+#       self.sess_timespan = UnionInterval.new
+#       super
+#     end
+#
+#     def finish_pagination!
+#       # piw = [(prev_items.to_f ** 0.66), (max_items * hard_request_limit * 4.0)].min
+#       # puts ([Time.now.strftime("%M:%S"), "%-23s"%query_term] + [prev_rate, sess_rate, avg_rate, sess_timespan.size.to_f, prev_items, sess_items, piw, (1000/avg_rate)].map{|s| "%15.4f"%(s||0) }).join("\t") rescue nil
+#       self.prev_rate = avg_rate
+#       if sess_items == (hard_request_limit * max_items)
+#         # bump the rate if we hit the hard cap:
+#         new_rate = [prev_rate * 1.25, 1000/120.0].max
+#         Log.info "Bumping rate on #{query_term} from #{prev_rate} to #{new_rate}"
+#         self.prev_rate = new_rate
+#       end
+#       self.prev_items = prev_items.to_i + sess_items.to_i
+#       self.prev_span  = sess_span + prev_span
+#       self.new_items  = sess_items.to_i + new_items.to_i
+#       self.sess_items    = 0
+#       self.sess_span     = UnionInterval.new
+#       self.sess_timespan = UnionInterval.new
+#       super
+#     end
+#
+#     #
+#     # Feed back info from the scrape.
+#     #
+#     def acknowledge response, page
+#       super response, page
+#       return unless response && response.items
+#       count_new_items response
+#       update_spans    response
+#     end
+#
+#     # account for additional items
+#     def count_new_items response
+#       num_items = response.num_items
+#       # if there was overlap with a previous scrape, we have to count the items by hand
+#       prev_span = self.prev_span
+#       if prev_span.max && response.span && (response.span.min < prev_span.max)
+#         num_items = response.items.inject(0){|n,item| (prev_span.include? item['id']) ? n : n+1 }
+#       end
+#       self.sess_items += num_items
+#     end
+#
+#
+#     def sess_rate
+#       return nil if (!sess_timespan) || (sess_timespan.size == 0)
+#       sess_items.to_f / sess_timespan.size.to_f
+#     end
+#     #
+#     # How often an item rolls in, on average.
+#     #
+#     def avg_rate
+#       return nil if (sess_items.to_f == 0 && (prev_rate.blank? || prev_items.to_f == 0))
+#       prev_weight = prev_items.to_f ** 0.66
+#       sess_weight = sess_items.to_f
+#       prev_weight = [prev_weight, sess_weight*3].min if sess_weight > 0
+#       weighted_sum = (
+#         (prev_rate.to_f * prev_weight) +  # damped previous avg
+#         (sess_rate.to_f * sess_weight) )  # current avg
+#       rt = weighted_sum / (prev_weight + sess_weight)
+#       rt
+#     end
+#
+#
+#     # inject class variables
+#     def self.included base
+#       base.class_eval do
+#         attr_accessor :new_items
+#         # include Monkeyshines::Paginated
+#       end
+#     end
+#   end
+#
+# end
+#
+#
+# module Monkeyshines
+#   module Paginated
+#   end
+#
+#   module PaginatedTimeline
+#     # Soft limit on the number of pages to scrape.
+#     #
+#     # Typically, leave this set to the hard_request_limit if you don't know
+#     # beforehand how many pages to scrape, and override is_last? to decide when
+#     # to stop short of the API limit.
+#     #
+#     def max_pages
+#       mp = fudge_factor * (n_items - prev_scraped_items) / max_items
+#       return 0 if mp == 0
+#       (mp+1).clamp(1, hard_request_limit).to_i
+#     end
+#     # inject class variables
+#     def self.included base
+#       base.class_eval do
+#         include Monkeyshines::Paginated
+#       end
+#     end
+#
+#     # #
+#     # # Threshold count-per-page against the actual count to get the number of expected pages.
+#     # # Cap the request with max.
+#     # def pages_from_count per_page, count, max=nil
+#     #   num = [ (count.to_f / per_page.to_f).ceil, 0 ].max
+#     #   [num, max].compact.min
+#     # end
+#   end
+#
+#   module PaginatedWithRateAndLimit
+#
+#     def after_pagination
+#       # piw = [(prev_items.to_f ** 0.66), (max_items * hard_request_limit * 4.0)].min
+#       # puts ([Time.now.strftime("%M:%S"), "%-23s"%query_term] + [prev_rate, sess_rate, avg_rate, sess_timespan.size.to_f, prev_items, sess_items, piw, (1000/avg_rate)].map{|s| "%15.4f"%(s||0) }).join("\t") rescue nil
+#       self.prev_rate = avg_rate
+#       if sess_items == (hard_request_limit * max_items)
+#         # bump the rate if we hit the hard cap:
+#         new_rate = [prev_rate * 1.25, 1000/120.0].max
+#         Log.info "Bumping rate on #{query_term} from #{prev_rate} to #{new_rate}"
+#         self.prev_rate = new_rate
+#       end
+#       self.prev_items = prev_items.to_i + sess_items.to_i
+#       self.prev_span  = sess_span + prev_span
+#       self.new_items  = sess_items.to_i + new_items.to_i
+#       self.sess_items    = 0
+#       self.sess_span     = UnionInterval.new
+#       self.sess_timespan = UnionInterval.new
+#       super
+#     end
+#
+#     #
+#     # Feed back info from the scrape.
+#     #
+#     def after_fetch response, page
+#       super response, page
+#       return unless response && response.items
+#       count_new_items response
+#       update_spans    response
+#     end
+#
+#     # account for additional items
+#     def count_new_items response
+#       num_items = response.num_items
+#       # if there was overlap with a previous scrape, we have to count the items by hand
+#       prev_span = self.prev_span
+#       if prev_span.max && response.span && (response.span.min < prev_span.max)
+#         num_items = response.items.inject(0){|n,item| (prev_span.include? item['id']) ? n : n+1 }
+#       end
+#       self.sess_items += num_items
+#     end
+#
+#     def update_spans response
+#       # Update intervals
+#       self.sess_span     << response.span
+#       self.sess_timespan << response.timespan
+#     end
+#
+#     # gap between the oldest item scraped in this scrape_job and the last one
+#     # scraped in the previous scrape_job.
+#     def unscraped_span
+#       UnionInterval.new(prev_span_max, sess_span.min)
+#     end
+#     # span of the previous scrape
+#     def prev_span
+#       @prev_span ||= UnionInterval.new(prev_span_min, prev_span_max)
+#     end
+#     def prev_span= min_max
+#       self.prev_span_min, self.prev_span_max = min_max.to_a
+#       @prev_span = UnionInterval.new(prev_span_min, prev_span_max)
+#     end
+#
+#     def sess_rate
+#       return nil if (!sess_timespan) || (sess_timespan.size == 0)
+#       sess_items.to_f / sess_timespan.size.to_f
+#     end
+#     #
+#     # How often an item rolls in, on average.
+#     #
+#     def avg_rate
+#       return nil if (sess_items.to_f == 0 && (prev_rate.blank? || prev_items.to_f == 0))
+#       prev_weight = prev_items.to_f ** 0.66
+#       sess_weight = sess_items.to_f
+#       prev_weight = [prev_weight, sess_weight*3].min if sess_weight > 0
+#       weighted_sum = (
+#         (prev_rate.to_f * prev_weight) +  # damped previous avg
+#         (sess_rate.to_f * sess_weight) )  # current avg
+#       rt = weighted_sum / (prev_weight + sess_weight)
+#       rt
+#     end
+#   end
+#
+# end
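
The limit-parameter walk documented above for PaginatedWithLimit can be sketched independently of the gem. fetch_page below is a hypothetical stand-in for one API call that returns at most 100 id-bearing hashes, newest first; the three break conditions mirror the termination rules listed in the comments.

# Hypothetical sketch of the max_id pagination walk; not gem code.
def walk_timeline min_id, fetch_page, max_items=100, max_requests=10
  max_id = nil                          # first request: max_id left blank
  max_requests.times do                 # stop 1: hard cap on requests
    items = fetch_page.call(min_id, max_id)
    items.each{|item| yield item }
    break if items.size < max_items     # stop 3: short page, we're caught up
    max_id = items.last[:id] - 1        # walk the limit window downward
    break if max_id < min_id            # stop 2: the interval is empty
  end
end

# Fake a 267-item timeline with ids 1234..1500, newest first:
all_ids = (1234..1500).to_a.reverse
fetch_page = lambda{|lo, hi| all_ids.select{|id| id >= lo && (hi.nil? || id <= hi) }.first(100).map{|id| {:id => id} } }
seen = 0
walk_timeline(1234, fetch_page){|item| seen += 1 }
seen   #=> 267, each id visited exactly once, in three requests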
data/lib/monkeyshines/scrape_request.rb
@@ -0,0 +1,136 @@
+require 'digest/md5'
+module Monkeyshines
+  def self.url_encode str
+    return '' if str.blank?
+    str = str.gsub(/ /, '+')
+    Addressable::URI.encode_component(str, Addressable::URI::CharacterClasses::UNRESERVED+'+')
+  end
+
+  def self.url_decode str
+    return '' if str.blank?
+    str = str.gsub(/\+/, ' ')
+    Addressable::URI.unencode(str)
+  end
+
+  XML_ENCODED_BADNESS = { "\r" => "&#13;", "\n" => "&#10;", "\t" => "&#9;" }
+  #
+  # Takes an already-encoded XML string and replaces ONLY the characters in
+  # XML_ENCODED_BADNESS (by default, \r carriage return, \n newline and \t tab)
+  # with their XML encodings (&#13; and so forth). Doesn't do any other
+  # encoding, and leaves existing entities alone.
+  #
+  def self.scrub_xml_encoded_badness str
+    str.chomp.gsub(/[\r\n\t]/){|c| XML_ENCODED_BADNESS[c]}
+  end
+end

+module Monkeyshines
+  #
+  # Base module for scrape requests
+  #
+  module ScrapeRequestCore
+
+    autoload :SignedUrl,          'monkeyshines/scrape_request/signed_url'
+    autoload :Paginated,          'monkeyshines/scrape_request/paginated'
+    autoload :Paginating,         'monkeyshines/scrape_request/paginated'
+    autoload :PaginatedWithLimit, 'monkeyshines/scrape_request/paginated'
+
+    def initialize *args
+      super(*args)
+      if (moreinfo.is_a?(String)) then self.moreinfo = JSON.load(moreinfo) rescue nil end
+      make_url! if (! url)
+    end
+
+    def to_hash *args
+      hsh = super(*args)
+      if hsh['moreinfo'].is_a?(Hash)
+        hsh['moreinfo'] = moreinfo.to_json
+      end
+      hsh
+    end
+
+    def to_a *args
+      to_hash.values_of(*members).to_flat
+    end
+
+    #
+    def healthy?
+      (! url.blank?) && (           # has a URL and either:
+        scraped_at.blank?        || #   hasn't been scraped,
+        (! response_code.blank?) || #   or has, with a response code
+        (! contents.blank?) )       #   or has, with response contents
+    end
+
+    # Set URL from other attributes
+    def make_url!
+      self.url = make_url
+    end
+
+    def response= response
+      return unless response
+      self.contents = Monkeyshines.scrub_xml_encoded_badness(response.body)
+    end
+
+    def url_encode str
+      Monkeyshines.url_encode str
+    end
+
+    def key
+      Digest::MD5.hexdigest(self.url)
+    end
+
+    def req_generation= val
+      (self.moreinfo||={})['req_generation'] = val
+    end
+    def req_generation
+      (self.moreinfo||={})['req_generation']
+    end
+
+    # inject methods at class level
+    module ClassMethods
+      # Builds a URL query string from a hash of key, value pairs.
+      #
+      # Parameters are in sort order by encoded string.
+      #
+      # Ex.
+      #   make_url_query( :foo => 'bar', :q => 'happy meal', :angle => 90 )
+      #   #=> "angle=90&foo=bar&q=happy+meal"
+      #
+      def make_url_query hsh
+        hsh.map{|attr, val| "#{attr}=#{Monkeyshines.url_encode(val)}" }.sort.join("&")
+      end
+    end
+    def self.included base
+      base.class_eval do
+        include ClassMethods
+      end
+    end
+  end
+
+  class ScrapeRequest < TypedStruct.new(
+      [:identifier,       Integer],
+      [:page,             Integer],
+      [:moreinfo,         String],
+      [:url,              String],
+      [:scraped_at,       Bignum],
+      [:response_code,    Integer],
+      [:response_message, String],
+      [:contents,         String]
+      )
+    include ScrapeRequestCore
+  end
+
+  #
+  # A SimpleRequest just holds a URL and the fetch result.
+  #
+  class SimpleRequest < TypedStruct.new(
+      [:url,              String],
+      [:scraped_at,       Bignum],
+      [:response_code,    Integer],
+      [:response_message, String],
+      [:contents,         String]
+      )
+    include ScrapeRequestCore
+  end
+
+end
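
For reference, what the module helpers above produce; this assumes the addressable gem and an extlib/ActiveSupport-style String#blank? (both of which these methods rely on), and the scrub output follows the decimal character references in XML_ENCODED_BADNESS.

Monkeyshines.url_encode 'happy meal'   #=> "happy+meal"  (space becomes +, which stays literal)
Monkeyshines.url_decode 'happy+meal'   #=> "happy meal"
Monkeyshines.scrub_xml_encoded_badness "line one\nline two\n"
#=> "line one&#10;line two"  (trailing newline chomped, inner newline encoded)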