monkeyshines 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
data/lib/monkeyshines/runner_core/parsing_runner.rb
@@ -0,0 +1,29 @@
+require 'yaml'
+require 'monkeyshines/runner_core/options'
+
+module Monkeyshines
+
+  #
+  # In general, you should
+  #
+  # But where an external library is already providing cooked results or it's
+  # otherwise most straightforward to directly emit model objects, you can use
+  # a parsing runner
+  #
+  class ParsingRunner < Runner
+
+    #
+    # Fetch and store result
+    #
+    #
+    def fetch_and_store req
+      result = fetcher.get(req)    # do the url fetch
+      # results.each do |result|
+      result.parse do |obj|
+        dest.save(obj)
+      end
+      #end
+    end
+
+  end
+end
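
For orientation, here is a minimal sketch of the fetch -> parse -> save flow that ParsingRunner#fetch_and_store performs. The StubFetcher, StubResult, and StubStore classes below are hypothetical stand-ins invented for this sketch; only fetch_and_store, fetcher.get, result.parse, and dest.save come from the code above.

    # A toy fetcher whose result responds to #parse, as fetch_and_store expects.
    # StubFetcher / StubResult / StubStore are hypothetical stand-ins.
    class StubResult
      def initialize(body) ; @body = body ; end
      def parse                        # yield each parsed model object
        yield({'id' => 1, 'body' => @body})
      end
    end

    class StubFetcher
      def get(req) ; StubResult.new("contents for #{req}") ; end
    end

    class StubStore
      def save(obj) ; puts "saved #{obj.inspect}" ; end
    end

    # Exercise the same sequence fetch_and_store runs:
    fetcher, dest = StubFetcher.new, StubStore.new
    result = fetcher.get('http://example.com/feed')   # do the url fetch
    result.parse{|obj| dest.save(obj) }                # emit model objects straight to the store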
data/lib/monkeyshines/scrape_job/old_paginated.rb
@@ -0,0 +1,343 @@
+# require 'time'
+# module Monkeyshines
+# #
+# # Paginated lets you make repeated requests to collect a timeline or
+# # collection of items.
+# #
+# # You will typically want to set the
+# #
+# # A Paginated-compatible ScrapeRequest should inherit from or be compatible
+# # with +Monkeyshines::ScrapeRequest+ and additionally define
+# # * [#items] list of individual items in the response; +nil+ if there was an
+# # error, +[]+ if the response was well-formed but returned no items.
+# # * [#num_items] number of items from this response
+# # * [#span] the range of (typically) IDs within this scrape. Used to know when
+# # we've reached results from previous session
+# #
+# #
+# module Paginated
+# #
+# # Generates request for each page to be scraped
+# #
+# # Block must return the fulfilled scrape_request response. This response is
+# # passed to +#acknowledge+ for bookkeeping, then the next request is
+# # made.
+# #
+# # Scraping stops after max_pages requests or when is_last?(response, page)
+# #
+# def each_request pageinfo={}, &block
+# begin_pagination!
+# (1..hard_request_limit).each do |page|
+# response = yield make_request(page, pageinfo)
+# warn 'nil response' unless response
+# acknowledge(response, page)
+# break if is_last?(response, page)
+# end
+# finish_pagination!
+# end
+#
+# # Set up bookkeeping for pagination tracking
+# def begin_pagination!
+# end
+#
+# # Finalize bookkeeping at conclusion of scrape_job.
+# def finish_pagination!
+# end
+#
+# #
+# # Feed back info from the scrape
+# #
+# def acknowledge response, page
+# end
+#
+# # return true if the next request would be pointless (true if, perhaps, the
+# # response had no items, or the API page limit is reached)
+# def is_last? response, page
+# ( (page >= max_pages) ||
+# (response && response.healthy? && (response.num_items < max_items)) )
+# end
+#
+# #
+# # Soft limit on the number of pages to scrape.
+# #
+# # Typically, leave this set to the hard_request_limit if you don't know
+# # beforehand how many pages to scrape, and override is_last? to decide when
+# # to stop short of the API limit
+# #
+# def max_pages
+# hard_request_limit
+# end
+#
+# # inject class variables
+# def self.included base
+# base.class_eval do
+# # Hard request limit: do not in any case exceed this number of requests
+# class_inheritable_accessor :hard_request_limit
+# # max items per page, from API
+# class_inheritable_accessor :max_items
+# #
+# # Span of items gathered in this scrape scrape_job.
+# attr_accessor :sess_items, :sess_span, :sess_timespan
+# end
+# end
+# end
+#
+# module PaginatedTimeline
+# # Soft limit on the number of pages to scrape.
+# #
+# # Typically, leave this set to the hard_request_limit if you don't know
+# # beforehand how many pages to scrape, and override is_last? to decide when
+# # to stop short of the API limit
+# #
+# def max_pages
+# mp = fudge_factor * (n_items - prev_scraped_items) / max_items
+# return 0 if mp == 0
+# (mp+1).clamp(1, hard_request_limit).to_i
+# end
+# # inject class variables
+# def self.included base
+# base.class_eval do
+# include Monkeyshines::Paginated
+# end
+# end
+#
+# # #
+# # # Threshold count-per-page and actual count to get number of expected pages.
+# # # Cap the request with max
+# # def pages_from_count per_page, count, max=nil
+# # num = [ (count.to_f / per_page.to_f).ceil, 0 ].max
+# # [num, max].compact.min
+# # end
+# end
+#
+# #
+# # Scenario: you request paginated search requests with a limit parameter (a
+# # max_id or min_id, for example).
+# #
+# # * request successive pages,
+# # * use info on the requested page to set the next limit parameter
+# # * stop when max_pages is reached or a successful request gives fewer than
+# # max_items
+# #
+# #
+# # The first
+# #
+# # req?min_id=1234&max_id=
+# # => [ [8675, ...], ..., [8012, ...] ] # 100 items
+# # req?min_id=1234&max_id=8011
+# # => [ [7581, ...], ..., [2044, ...] ] # 100 items
+# # req?min_id=1234&max_id=2043
+# # => [ [2012, ...], ..., [1234, ...] ] # 69 items
+# #
+# # * The search terminates when
+# # ** max_requests requests have been made, or
+# # ** the limit params interval is zero, or
+# # ** a successful response with fewer than max_items is received.
+# #
+# # * You will want to save <req?min_id=8676&max_id=""> for later scrape
+# #
+# module PaginatedWithLimit
+#
+# #
+# # Return true if the next request would be pointless (true if, perhaps, the
+# # response had no items, or the API page limit is reached)
+# def is_last? response, page
+# unscraped_span.empty? || super(response, page)
+# end
+#
+# # Set up bookkeeping for pagination tracking
+# def begin_pagination!
+# self.sess_items ||= 0
+# self.sess_span = UnionInterval.new
+# self.sess_timespan = UnionInterval.new
+# super
+# end
+#
+# def finish_pagination!
+# # piw = [(prev_items.to_f ** 0.66), (max_items * hard_request_limit * 4.0)].min
+# # puts ([Time.now.strftime("%M:%S"), "%-23s"%query_term] + [prev_rate, sess_rate, avg_rate, sess_timespan.size.to_f, prev_items, sess_items, piw, (1000/avg_rate)].map{|s| "%15.4f"%(s||0) }).join("\t") rescue nil
+# self.prev_rate = avg_rate
+# if sess_items == (hard_request_limit * max_items)
+# # bump the rate if we hit the hard cap:
+# new_rate = [prev_rate * 1.25, 1000/120.0].max
+# Log.info "Bumping rate on #{query_term} from #{prev_rate} to #{new_rate}"
+# self.prev_rate = new_rate
+# end
+# self.prev_items = prev_items.to_i + sess_items.to_i
+# self.prev_span = sess_span + prev_span
+# self.new_items = sess_items.to_i + new_items.to_i
+# self.sess_items = 0
+# self.sess_span = UnionInterval.new
+# self.sess_timespan = UnionInterval.new
+# super
+# end
+#
+# #
+# # Feed back info from the scrape
+# #
+# def acknowledge response, page
+# super response, page
+# return unless response && response.items
+# count_new_items response
+# update_spans response
+# end
+#
+# # account for additional items
+# def count_new_items response
+# num_items = response.num_items
+# # if there was overlap with a previous scrape, we have to count the items by hand
+# prev_span = self.prev_span
+# if prev_span.max && response.span && (response.span.min < prev_span.max)
+# num_items = response.items.inject(0){|n,item| (prev_span.include? item['id']) ? n : n+1 }
+# end
+# self.sess_items += num_items
+# end
+#
+#
+# def sess_rate
+# return nil if (!sess_timespan) || (sess_timespan.size == 0)
+# sess_items.to_f / sess_timespan.size.to_f
+# end
+# #
+# # How often an item rolls in, on average
+# #
+# def avg_rate
+# return nil if (sess_items.to_f == 0 && (prev_rate.blank? || prev_items.to_f == 0))
+# prev_weight = prev_items.to_f ** 0.66
+# sess_weight = sess_items.to_f
+# prev_weight = [prev_weight, sess_weight*3].min if sess_weight > 0
+# weighted_sum = (
+# (prev_rate.to_f * prev_weight) + # damped previous avg
+# (sess_rate.to_f * sess_weight) ) # current avg
+# rt = weighted_sum / (prev_weight + sess_weight)
+# rt
+# end
+#
+#
+# # inject class variables
+# def self.included base
+# base.class_eval do
+# attr_accessor :new_items
+# # include Monkeyshines::Paginated
+# end
+# end
+# end
+#
+# end
+#
+#
+# module Monkeyshines
+# module Paginated
+# end
+#
+# module PaginatedTimeline
+# # Soft limit on the number of pages to scrape.
+# #
+# # Typically, leave this set to the hard_request_limit if you don't know
+# # beforehand how many pages to scrape, and override is_last? to decide when
+# # to stop short of the API limit
+# #
+# def max_pages
+# mp = fudge_factor * (n_items - prev_scraped_items) / max_items
+# return 0 if mp == 0
+# (mp+1).clamp(1, hard_request_limit).to_i
+# end
+# # inject class variables
+# def self.included base
+# base.class_eval do
+# include Monkeyshines::Paginated
+# end
+# end
+#
+# # #
+# # # Threshold count-per-page and actual count to get number of expected pages.
+# # # Cap the request with max
+# # def pages_from_count per_page, count, max=nil
+# # num = [ (count.to_f / per_page.to_f).ceil, 0 ].max
+# # [num, max].compact.min
+# # end
+# end
+#
+# module PaginatedWithRateAndLimit
+#
+# def after_pagination
+# # piw = [(prev_items.to_f ** 0.66), (max_items * hard_request_limit * 4.0)].min
+# # puts ([Time.now.strftime("%M:%S"), "%-23s"%query_term] + [prev_rate, sess_rate, avg_rate, sess_timespan.size.to_f, prev_items, sess_items, piw, (1000/avg_rate)].map{|s| "%15.4f"%(s||0) }).join("\t") rescue nil
+# self.prev_rate = avg_rate
+# if sess_items == (hard_request_limit * max_items)
+# # bump the rate if we hit the hard cap:
+# new_rate = [prev_rate * 1.25, 1000/120.0].max
+# Log.info "Bumping rate on #{query_term} from #{prev_rate} to #{new_rate}"
+# self.prev_rate = new_rate
+# end
+# self.prev_items = prev_items.to_i + sess_items.to_i
+# self.prev_span = sess_span + prev_span
+# self.new_items = sess_items.to_i + new_items.to_i
+# self.sess_items = 0
+# self.sess_span = UnionInterval.new
+# self.sess_timespan = UnionInterval.new
+# super
+# end
+#
+# #
+# # Feed back info from the scrape
+# #
+# def after_fetch response, page
+# super response, page
+# return unless response && response.items
+# count_new_items response
+# update_spans response
+# end
+#
+# # account for additional items
+# def count_new_items response
+# num_items = response.num_items
+# # if there was overlap with a previous scrape, we have to count the items by hand
+# prev_span = self.prev_span
+# if prev_span.max && response.span && (response.span.min < prev_span.max)
+# num_items = response.items.inject(0){|n,item| (prev_span.include? item['id']) ? n : n+1 }
+# end
+# self.sess_items += num_items
+# end
+#
+# def update_spans response
+# # Update intervals
+# self.sess_span << response.span
+# self.sess_timespan << response.timespan
+# end
+#
+# # gap between oldest scraped in this scrape_job and last one scraped in
+# # previous scrape_job.
+# def unscraped_span
+# UnionInterval.new(prev_span_max, sess_span.min)
+# end
+# # span of previous scrape
+# def prev_span
+# @prev_span ||= UnionInterval.new(prev_span_min, prev_span_max)
+# end
+# def prev_span= min_max
+# self.prev_span_min, self.prev_span_max = min_max.to_a
+# @prev_span = UnionInterval.new(prev_span_min, prev_span_max)
+# end
+#
+# def sess_rate
+# return nil if (!sess_timespan) || (sess_timespan.size == 0)
+# sess_items.to_f / sess_timespan.size.to_f
+# end
+# #
+# # How often an item rolls in, on average
+# #
+# def avg_rate
+# return nil if (sess_items.to_f == 0 && (prev_rate.blank? || prev_items.to_f == 0))
+# prev_weight = prev_items.to_f ** 0.66
+# sess_weight = sess_items.to_f
+# prev_weight = [prev_weight, sess_weight*3].min if sess_weight > 0
+# weighted_sum = (
+# (prev_rate.to_f * prev_weight) + # damped previous avg
+# (sess_rate.to_f * sess_weight) ) # current avg
+# rt = weighted_sum / (prev_weight + sess_weight)
+# rt
+# end
+# end
+#
+# end
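
The pagination contract sketched in those (commented-out) modules is: each_request walks pages 1..hard_request_limit, yields one request per page, and stops early when is_last? decides the next page would be pointless. The standalone loop below only illustrates that flow; FakeTimeline, its constants, and the canned page hashes are hypothetical and are not part of the gem.

    # Standalone illustration of the each_request / is_last? loop described above.
    class FakeTimeline
      HARD_REQUEST_LIMIT = 10   # never exceed this many requests
      MAX_ITEMS          = 100  # API page size

      def each_request
        (1..HARD_REQUEST_LIMIT).each do |page|
          response = yield page             # caller performs the actual fetch
          warn 'nil response' unless response
          break if is_last?(response, page)
        end
      end

      # stop when the API returns a short (or empty) page
      def is_last? response, page
        response && (response[:num_items] < MAX_ITEMS)
      end
    end

    pages = [{num_items: 100}, {num_items: 100}, {num_items: 69}]
    FakeTimeline.new.each_request{|page| puts "fetching page #{page}"; pages[page-1] }
    # fetches pages 1..3, then stops because page 3 returned fewer than MAX_ITEMS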
data/lib/monkeyshines/scrape_request.rb
@@ -0,0 +1,136 @@
+require 'digest/md5'
+module Monkeyshines
+  def self.url_encode str
+    return '' if str.blank?
+    str = str.gsub(/ /, '+')
+    Addressable::URI.encode_component(str, Addressable::URI::CharacterClasses::UNRESERVED+'+')
+  end
+
+  def self.url_decode str
+    return '' if str.blank?
+    str = str.gsub(/\+/, ' ')
+    Addressable::URI.unencode(str)
+  end
+
+  XML_ENCODED_BADNESS = { "\r" => "&#13;", "\n" => "&#10;", "\t" => "&#9;" }
+  #
+  # Takes an already-encoded XML string and replaces ONLY the characters in
+  # XML_ENCODED_BADNESS (by default, \r carriage return, \n newline and \t tab)
+  # with their XML encodings (&#10; and so forth). Doesn't do any other
+  # encoding, and leaves existing entities alone.
+  #
+  def self.scrub_xml_encoded_badness str
+    str.chomp.gsub(/[\r\n\t]/){|c| XML_ENCODED_BADNESS[c]}
+  end
+end
+
+module Monkeyshines
+  #
+  # Base class for Scrape requests
+  #
+  module ScrapeRequestCore
+
+    autoload :SignedUrl,          'monkeyshines/scrape_request/signed_url'
+    autoload :Paginated,          'monkeyshines/scrape_request/paginated'
+    autoload :Paginating,         'monkeyshines/scrape_request/paginated'
+    autoload :PaginatedWithLimit, 'monkeyshines/scrape_request/paginated'
+
+    def initialize *args
+      super *args
+      if (moreinfo.is_a?(String)) then self.moreinfo = JSON.load(moreinfo) rescue nil end
+      make_url! if (! url)
+    end
+
+    def to_hash *args
+      hsh = super *args
+      if hsh['moreinfo'].is_a?(Hash)
+        hsh['moreinfo'] = moreinfo.to_json
+      end
+      hsh
+    end
+
+    def to_a *args
+      to_hash.values_of(*members).to_flat
+    end
+
+    #
+    def healthy?
+      (! url.blank?) && (           # has a URL and either:
+        scraped_at.blank?        || #   hasn't been scraped,
+        (! response_code.blank?) || #   or has, with response code
+        (! contents.blank?) )       #   or has, with response
+    end
+
+    # Set URL from other attributes
+    def make_url!
+      self.url = make_url
+    end
+
+    def response= response
+      return unless response
+      self.contents = Monkeyshines.scrub_xml_encoded_badness(response.body)
+    end
+
+    def url_encode str
+      Monkeyshines.url_encode str
+    end
+
+    def key
+      Digest::MD5.hexdigest(self.url)
+    end
+
+    def req_generation= val
+      (self.moreinfo||={})['req_generation'] = val
+    end
+    def req_generation
+      (self.moreinfo||={})['req_generation']
+    end
+
+    # inject methods at class level
+    module ClassMethods
+      # Builds a URL query string from a hash of key,value pairs
+      #
+      # parameters are in sort order by encoded string
+      #
+      # Ex.
+      #   make_url_query( :foo => 'bar', :q => 'happy meal', :angle => 90 )
+      #   #=> "angle=90&foo=bar&q=happy%20meal"
+      #
+      def make_url_query hsh
+        hsh.map{|attr, val| "#{attr}=#{Monkeyshines.url_encode(val)}" }.sort.join("&")
+      end
+    end
+    def self.included base
+      base.class_eval do
+        include ClassMethods
+      end
+    end
+  end
+
+  class ScrapeRequest < TypedStruct.new(
+      [:identifier,       Integer],
+      [:page,             Integer],
+      [:moreinfo,         String],
+      [:url,              String],
+      [:scraped_at,       Bignum],
+      [:response_code,    Integer],
+      [:response_message, String],
+      [:contents,         String]
+      )
+    include ScrapeRequestCore
+  end
+
+  #
+  # A SimpleRequest just holds a URL and the fetch result.
+  #
+  class SimpleRequest < TypedStruct.new(
+      [:url,              String],
+      [:scraped_at,       Bignum],
+      [:response_code,    Integer],
+      [:response_message, String],
+      [:contents,         String]
+      )
+    include ScrapeRequestCore
+  end
+
+end
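
As a rough usage sketch: make_url_query encodes each key=value pair, sorts the encoded pairs, and joins them with "&", and #key is just the MD5 hex digest of the request URL. The snippet below reimplements that behavior standalone so it runs without the gem's dependencies; the local url_encode using CGI.escape is only an approximation of the Addressable-based Monkeyshines.url_encode above.

    require 'digest/md5'
    require 'cgi'

    # Approximation of Monkeyshines.url_encode (the gem uses Addressable).
    def url_encode(str)
      str.to_s.empty? ? '' : CGI.escape(str.to_s)
    end

    # Same shape as ScrapeRequestCore::ClassMethods#make_url_query: encode, sort, join.
    def make_url_query(hsh)
      hsh.map{|attr, val| "#{attr}=#{url_encode(val)}" }.sort.join("&")
    end

    url = "http://example.com/search?" + make_url_query(foo: 'bar', q: 'happy meal', angle: 90)
    puts url                          #=> ...?angle=90&foo=bar&q=happy+meal
    puts Digest::MD5.hexdigest(url)   # the store key for this request, as in #key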