monkeyshines 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
data/lib/monkeyshines/runner_core/options.rb
@@ -0,0 +1,5 @@
+ module Monkeyshines
+   class Runner
+
+   end
+ end

data/lib/monkeyshines/runner_core/parsing_runner.rb
@@ -0,0 +1,29 @@
+ require 'yaml'
+ require 'monkeyshines/runner_core/options'
+
+ module Monkeyshines
+
+   #
+   # In general, you should
+   #
+   # But where an external library is already providing cooked results or it's
+   # otherwise most straightforward to directly emit model objects, you can use
+   # a parsing runner
+   #
+   class ParsingRunner < Runner
+
+     #
+     # Fetch and store result
+     #
+     #
+     def fetch_and_store req
+       result = fetcher.get(req)  # do the url fetch
+       # results.each do |result|
+       result.parse do |obj|
+         dest.save(obj)
+       end
+       #end
+     end
+
+   end
+ end
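
Editor's note: a minimal sketch of the collaboration ParsingRunner#fetch_and_store relies on, i.e. a fetched result that responds to #parse and yields model objects, and a destination store that responds to #save. FakeResult and ArrayStore below are hypothetical stand-ins for illustration only; they are not part of the gem.

    require 'json'

    # Hypothetical result object: #parse yields each parsed model object,
    # mirroring what ParsingRunner expects from fetcher.get(req).
    class FakeResult
      def initialize(body) ; @body = body ; end
      def parse
        JSON.parse(@body).each{|obj| yield obj }
      end
    end

    # Hypothetical destination store with the #save interface the runner calls.
    class ArrayStore
      attr_reader :saved
      def initialize ; @saved = [] ; end
      def save(obj) ; @saved << obj ; end
    end

    dest   = ArrayStore.new
    result = FakeResult.new('[{"id":1},{"id":2}]')
    result.parse{|obj| dest.save(obj) }   # what fetch_and_store does after the fetch
    dest.saved.length                     # => 2
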

data/lib/monkeyshines/scrape_job/old_paginated.rb
@@ -0,0 +1,343 @@
+ # require 'time'
+ # module Monkeyshines
+ # #
+ # # Paginated lets you make repeated requests to collect a timeline or
+ # # collection of items.
+ # #
+ # # You will typically want to set the
+ # #
+ # # A Paginated-compatible ScrapeRequest should inherit from or be compatible
+ # # with +Monkeyshines::ScrapeRequest+ and additionally define
+ # # * [#items] list of individual items in the response; +nil+ if there was an
+ # # error, +[]+ if the response was well-formed but returned no items.
+ # # * [#num_items] number of items from this response
+ # # * [#span] the range of (typically) IDs within this scrape. Used to know when
+ # # we've reached results from previous session
+ # #
+ # #
+ # module Paginated
+ # #
+ # # Generates request for each page to be scraped
+ # #
+ # # Block must return the fulfilled scrape_request response. This response is
+ # # passed to +#acknowledge+ for bookkeeping, then the next request is
+ # # made.
+ # #
+ # # Scraping stops after max_pages requests or when is_last?(response, page)
+ # #
+ # def each_request pageinfo={}, &block
+ # begin_pagination!
+ # (1..hard_request_limit).each do |page|
+ # response = yield make_request(page, pageinfo)
+ # warn 'nil response' unless response
+ # acknowledge(response, page)
+ # break if is_last?(response, page)
+ # end
+ # finish_pagination!
+ # end
+ #
+ # # Set up bookkeeping for pagination tracking
+ # def begin_pagination!
+ # end
+ #
+ # # Finalize bookkeeping at conclusion of scrape_job.
+ # def finish_pagination!
+ # end
+ #
+ # #
+ # # Feed back info from the scrape
+ # #
+ # def acknowledge response, page
+ # end
+ #
+ # # return true if the next request would be pointless (true if, perhaps, the
+ # # response had no items, or the API page limit is reached)
+ # def is_last? response, page
+ # ( (page >= max_pages) ||
+ # (response && response.healthy? && (response.num_items < max_items)) )
+ # end
+ #
+ # #
+ # # Soft limit on the number of pages to scrape.
+ # #
+ # # Typically, leave this set to the hard_request_limit if you don't know
+ # # beforehand how many pages to scrape, and override is_last? to decide when
+ # # to stop short of the API limit
+ # #
+ # def max_pages
+ # hard_request_limit
+ # end
+ #
+ # # inject class variables
+ # def self.included base
+ # base.class_eval do
+ # # Hard request limit: do not in any case exceed this number of requests
+ # class_inheritable_accessor :hard_request_limit
+ # # max items per page, from API
+ # class_inheritable_accessor :max_items
+ # #
+ # # Span of items gathered in this scrape scrape_job.
+ # attr_accessor :sess_items, :sess_span, :sess_timespan
+ # end
+ # end
+ # end
+ #
+ # module PaginatedTimeline
+ # # Soft limit on the number of pages to scrape.
+ # #
+ # # Typically, leave this set to the hard_request_limit if you don't know
+ # # beforehand how many pages to scrape, and override is_last? to decide when
+ # # to stop short of the API limit
+ # #
+ # def max_pages
+ # mp = fudge_factor * (n_items - prev_scraped_items) / max_items
+ # return 0 if mp == 0
+ # (mp+1).clamp(1, hard_request_limit).to_i
+ # end
+ # # inject class variables
+ # def self.included base
+ # base.class_eval do
+ # include Monkeyshines::Paginated
+ # end
+ # end
+ #
+ # # #
+ # # # Threshold count-per-page and actual count to get number of expected pages.
+ # # # Cap the request with max
+ # # def pages_from_count per_page, count, max=nil
+ # # num = [ (count.to_f / per_page.to_f).ceil, 0 ].max
+ # # [num, max].compact.min
+ # # end
+ # end
+ #
+ # #
+ # # Scenario: you request paginated search requests with a limit parameter (a
+ # # max_id or min_id, for example).
+ # #
+ # # * request successive pages,
+ # # * use info on the requested page to set the next limit parameter
+ # # * stop when max_pages is reached or a successful request gives fewer than
+ # # max_items
+ # #
+ # #
+ # # The first
+ # #
+ # # req?min_id=1234&max_id=
+ # # => [ [8675, ...], ..., [8012, ...] ] # 100 items
+ # # req?min_id=1234&max_id=8011
+ # # => [ [7581, ...], ..., [2044, ...] ] # 100 items
+ # # req?min_id=1234&max_id=2043
+ # # => [ [2012, ...], ..., [1234, ...] ] # 69 items
+ # #
+ # # * The search terminates when
+ # # ** max_requests requests have been made, or
+ # # ** the limit params interval is zero, or
+ # # ** a successful response with fewer than max_items is received.
+ # #
+ # # * You will want to save <req?min_id=8676&max_id=""> for later scrape
+ # #
+ # module PaginatedWithLimit
+ #
+ # #
+ # # Return true if the next request would be pointless (true if, perhaps, the
+ # # response had no items, or the API page limit is reached)
+ # def is_last? response, page
+ # unscraped_span.empty? || super(response, page)
+ # end
+ #
+ # # Set up bookkeeping for pagination tracking
+ # def begin_pagination!
+ # self.sess_items ||= 0
+ # self.sess_span = UnionInterval.new
+ # self.sess_timespan = UnionInterval.new
+ # super
+ # end
+ #
+ # def finish_pagination!
+ # # piw = [(prev_items.to_f ** 0.66), (max_items * hard_request_limit * 4.0)].min
+ # # puts ([Time.now.strftime("%M:%S"), "%-23s"%query_term] + [prev_rate, sess_rate, avg_rate, sess_timespan.size.to_f, prev_items, sess_items, piw, (1000/avg_rate)].map{|s| "%15.4f"%(s||0) }).join("\t") rescue nil
+ # self.prev_rate = avg_rate
+ # if sess_items == (hard_request_limit * max_items)
+ # # bump the rate if we hit the hard cap:
+ # new_rate = [prev_rate * 1.25, 1000/120.0].max
+ # Log.info "Bumping rate on #{query_term} from #{prev_rate} to #{new_rate}"
+ # self.prev_rate = new_rate
+ # end
+ # self.prev_items = prev_items.to_i + sess_items.to_i
+ # self.prev_span = sess_span + prev_span
+ # self.new_items = sess_items.to_i + new_items.to_i
+ # self.sess_items = 0
+ # self.sess_span = UnionInterval.new
+ # self.sess_timespan = UnionInterval.new
+ # super
+ # end
+ #
+ # #
+ # # Feed back info from the scrape
+ # #
+ # def acknowledge response, page
+ # super response, page
+ # return unless response && response.items
+ # count_new_items response
+ # update_spans response
+ # end
+ #
+ # # account for additional items
+ # def count_new_items response
+ # num_items = response.num_items
+ # # if there was overlap with a previous scrape, we have to count the items by hand
+ # prev_span = self.prev_span
+ # if prev_span.max && response.span && (response.span.min < prev_span.max)
+ # num_items = response.items.inject(0){|n,item| (prev_span.include? item['id']) ? n : n+1 }
+ # end
+ # self.sess_items += num_items
+ # end
+ #
+ #
+ # def sess_rate
+ # return nil if (!sess_timespan) || (sess_timespan.size == 0)
+ # sess_items.to_f / sess_timespan.size.to_f
+ # end
+ # #
+ # # How often an item rolls in, on average
+ # #
+ # def avg_rate
+ # return nil if (sess_items.to_f == 0 && (prev_rate.blank? || prev_items.to_f == 0))
+ # prev_weight = prev_items.to_f ** 0.66
+ # sess_weight = sess_items.to_f
+ # prev_weight = [prev_weight, sess_weight*3].min if sess_weight > 0
+ # weighted_sum = (
+ # (prev_rate.to_f * prev_weight) + # damped previous avg
+ # (sess_rate.to_f * sess_weight) ) # current avg
+ # rt = weighted_sum / (prev_weight + sess_weight)
+ # rt
+ # end
+ #
+ #
+ # # inject class variables
+ # def self.included base
+ # base.class_eval do
+ # attr_accessor :new_items
+ # # include Monkeyshines::Paginated
+ # end
+ # end
+ # end
+ #
+ # end
+ #
+ #
+ # module Monkeyshines
+ # module Paginated
+ # end
+ #
+ # module PaginatedTimeline
+ # # Soft limit on the number of pages to scrape.
+ # #
+ # # Typically, leave this set to the hard_request_limit if you don't know
+ # # beforehand how many pages to scrape, and override is_last? to decide when
+ # # to stop short of the API limit
+ # #
+ # def max_pages
+ # mp = fudge_factor * (n_items - prev_scraped_items) / max_items
+ # return 0 if mp == 0
+ # (mp+1).clamp(1, hard_request_limit).to_i
+ # end
+ # # inject class variables
+ # def self.included base
+ # base.class_eval do
+ # include Monkeyshines::Paginated
+ # end
+ # end
+ #
+ # # #
+ # # # Threshold count-per-page and actual count to get number of expected pages.
+ # # # Cap the request with max
+ # # def pages_from_count per_page, count, max=nil
+ # # num = [ (count.to_f / per_page.to_f).ceil, 0 ].max
+ # # [num, max].compact.min
+ # # end
+ # end
+ #
+ # module PaginatedWithRateAndLimit
+ #
+ # def after_pagination
+ # # piw = [(prev_items.to_f ** 0.66), (max_items * hard_request_limit * 4.0)].min
+ # # puts ([Time.now.strftime("%M:%S"), "%-23s"%query_term] + [prev_rate, sess_rate, avg_rate, sess_timespan.size.to_f, prev_items, sess_items, piw, (1000/avg_rate)].map{|s| "%15.4f"%(s||0) }).join("\t") rescue nil
+ # self.prev_rate = avg_rate
+ # if sess_items == (hard_request_limit * max_items)
+ # # bump the rate if we hit the hard cap:
+ # new_rate = [prev_rate * 1.25, 1000/120.0].max
+ # Log.info "Bumping rate on #{query_term} from #{prev_rate} to #{new_rate}"
+ # self.prev_rate = new_rate
+ # end
+ # self.prev_items = prev_items.to_i + sess_items.to_i
+ # self.prev_span = sess_span + prev_span
+ # self.new_items = sess_items.to_i + new_items.to_i
+ # self.sess_items = 0
+ # self.sess_span = UnionInterval.new
+ # self.sess_timespan = UnionInterval.new
+ # super
+ # end
+ #
+ # #
+ # # Feed back info from the scrape
+ # #
+ # def after_fetch response, page
+ # super response, page
+ # return unless response && response.items
+ # count_new_items response
+ # update_spans response
+ # end
+ #
+ # # account for additional items
+ # def count_new_items response
+ # num_items = response.num_items
+ # # if there was overlap with a previous scrape, we have to count the items by hand
+ # prev_span = self.prev_span
+ # if prev_span.max && response.span && (response.span.min < prev_span.max)
+ # num_items = response.items.inject(0){|n,item| (prev_span.include? item['id']) ? n : n+1 }
+ # end
+ # self.sess_items += num_items
+ # end
+ #
+ # def update_spans response
+ # # Update intervals
+ # self.sess_span << response.span
+ # self.sess_timespan << response.timespan
+ # end
+ #
+ # # gap between oldest scraped in this scrape_job and last one scraped in
+ # # previous scrape_job.
+ # def unscraped_span
+ # UnionInterval.new(prev_span_max, sess_span.min)
+ # end
+ # # span of previous scrape
+ # def prev_span
+ # @prev_span ||= UnionInterval.new(prev_span_min, prev_span_max)
+ # end
+ # def prev_span= min_max
+ # self.prev_span_min, self.prev_span_max = min_max.to_a
+ # @prev_span = UnionInterval.new(prev_span_min, prev_span_max)
+ # end
+ #
+ # def sess_rate
+ # return nil if (!sess_timespan) || (sess_timespan.size == 0)
+ # sess_items.to_f / sess_timespan.size.to_f
+ # end
+ # #
+ # # How often an item rolls in, on average
+ # #
+ # def avg_rate
+ # return nil if (sess_items.to_f == 0 && (prev_rate.blank? || prev_items.to_f == 0))
+ # prev_weight = prev_items.to_f ** 0.66
+ # sess_weight = sess_items.to_f
+ # prev_weight = [prev_weight, sess_weight*3].min if sess_weight > 0
+ # weighted_sum = (
+ # (prev_rate.to_f * prev_weight) + # damped previous avg
+ # (sess_rate.to_f * sess_weight) ) # current avg
+ # rt = weighted_sum / (prev_weight + sess_weight)
+ # rt
+ # end
+ # end
+ #
+ # end
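
Editor's note: a hedged sketch of the termination rule the commented-out Paginated module describes in is_last?, namely stop at the hard page cap, or as soon as a healthy response returns fewer than max_items results. The Response struct, MAX_PAGES and MAX_ITEMS below are hypothetical stand-ins for the module's #healthy?/#num_items interface and its class-level accessors.

    # Hypothetical constants standing in for hard_request_limit / max_items.
    MAX_PAGES = 10
    MAX_ITEMS = 100

    # Hypothetical response with the interface the module assumes.
    Response = Struct.new(:healthy, :num_items) do
      def healthy? ; healthy ; end
    end

    def last_page?(response, page)
      (page >= MAX_PAGES) ||
        (response && response.healthy? && (response.num_items < MAX_ITEMS))
    end

    last_page?(Response.new(true, 100), 3)   # => false: full page, keep paginating
    last_page?(Response.new(true,  69), 4)   # => true:  short page ends the scrape
    last_page?(Response.new(true, 100), 10)  # => true:  hard page cap reached
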

data/lib/monkeyshines/scrape_job/recursive.rb
@@ -0,0 +1,9 @@
+ require 'time'
+ require 'monkeyshines/utils/union_interval'
+ module Monkeyshines
+   module ScrapeJob
+     module RecursesJobs
+
+     end
+   end
+ end

data/lib/monkeyshines/scrape_request.rb
@@ -0,0 +1,136 @@
+ require 'digest/md5'
+ module Monkeyshines
+   def self.url_encode str
+     return '' if str.blank?
+     str = str.gsub(/ /, '+')
+     Addressable::URI.encode_component(str, Addressable::URI::CharacterClasses::UNRESERVED+'+')
+   end
+
+   def self.url_decode str
+     return '' if str.blank?
+     str = str.gsub(/\+/, ' ')
+     Addressable::URI.unencode(str)
+   end
+
+   XML_ENCODED_BADNESS = { "\r" => "&#13;", "\n" => "&#10;", "\t" => "&#9;" }
+   #
+   # Takes an already-encoded XML string and replaces ONLY the characters in
+   # XML_ENCODED_BADNESS (by default, \r carriage return, \n newline and \t tab)
+   # with their XML encodings (&#10; and so forth). Doesn't do any other
+   # encoding, and leaves existing entities alone.
+   #
+   def self.scrub_xml_encoded_badness str
+     str.chomp.gsub(/[\r\n\t]/){|c| XML_ENCODED_BADNESS[c]}
+   end
+ end
+
+ module Monkeyshines
+   #
+   # Base class for Scrape requests
+   #
+   module ScrapeRequestCore
+
+     autoload :SignedUrl, 'monkeyshines/scrape_request/signed_url'
+     autoload :Paginated, 'monkeyshines/scrape_request/paginated'
+     autoload :Paginating, 'monkeyshines/scrape_request/paginated'
+     autoload :PaginatedWithLimit, 'monkeyshines/scrape_request/paginated'
+
+     def initialize *args
+       super *args
+       if (moreinfo.is_a?(String)) then self.moreinfo = JSON.load(moreinfo) rescue nil end
+       make_url! if (! url)
+     end
+
+     def to_hash *args
+       hsh = super *args
+       if hsh['moreinfo'].is_a?(Hash)
+         hsh['moreinfo'] = moreinfo.to_json
+       end
+       hsh
+     end
+
+     def to_a *args
+       to_hash.values_of(*members).to_flat
+     end
+
+     #
+     def healthy?
+       (! url.blank?) && (            # has a URL and either:
+         scraped_at.blank? ||         # hasn't been scraped,
+         (! response_code.blank?) ||  # or has, with response code
+         (! contents.blank?) )        # or has, with response
+     end
+
+     # Set URL from other attributes
+     def make_url!
+       self.url = make_url
+     end
+
+     def response= response
+       return unless response
+       self.contents = Monkeyshines.scrub_xml_encoded_badness(response.body)
+     end
+
+     def url_encode str
+       Monkeyshines.url_encode str
+     end
+
+     def key
+       Digest::MD5.hexdigest(self.url)
+     end
+
+     def req_generation= val
+       (self.moreinfo||={})['req_generation'] = val
+     end
+     def req_generation
+       (self.moreinfo||={})['req_generation']
+     end
+
+     # inject methods at class level
+     module ClassMethods
+       # Builds a URL query string from a hash of key,value pairs
+       #
+       # parameters are in sort order by encoded string
+       #
+       # Ex.
+       #   make_url_query( :foo => 'bar', :q => 'happy meal', :angle => 90 )
+       #   #=> "angle=90&foo=bar&q=happy%20meal"
+       #
+       def make_url_query hsh
+         hsh.map{|attr, val| "#{attr}=#{Monkeyshines.url_encode(val)}" }.sort.join("&")
+       end
+     end
+     def self.included base
+       base.class_eval do
+         include ClassMethods
+       end
+     end
+   end
+
+   class ScrapeRequest < TypedStruct.new(
+     [:identifier, Integer],
+     [:page, Integer],
+     [:moreinfo, String],
+     [:url, String],
+     [:scraped_at, Bignum],
+     [:response_code, Integer],
+     [:response_message, String],
+     [:contents, String]
+   )
+     include ScrapeRequestCore
+   end
+
+   #
+   # A SimpleRequest just holds a URL and the fetch result.
+   #
+   class SimpleRequest < TypedStruct.new(
+     [:url, String],
+     [:scraped_at, Bignum],
+     [:response_code, Integer],
+     [:response_message, String],
+     [:contents, String]
+   )
+     include ScrapeRequestCore
+   end
+
+ end
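
Editor's note: a minimal sketch of the query-string convention ScrapeRequestCore::ClassMethods#make_url_query encodes (url-encoded key=value pairs joined in sorted order) and of the MD5-of-URL cache key used by #key. To stay self-contained it uses the standard library's URI escaping as a stand-in for the gem's Addressable-based Monkeyshines.url_encode, so the exact escaping of spaces may differ; the URL below is hypothetical.

    require 'uri'
    require 'digest/md5'

    # Stand-in for make_url_query: encode each pair, then join in sorted order.
    def make_url_query(hsh)
      hsh.map{|attr, val| "#{attr}=#{URI.encode_www_form_component(val.to_s)}" }.sort.join("&")
    end

    query = make_url_query(:foo => 'bar', :q => 'happy meal', :angle => 90)
    # => "angle=90&foo=bar&q=happy+meal"

    url = "http://example.com/search?#{query}"
    # A request's store key is just the MD5 hex digest of its URL, as in ScrapeRequestCore#key:
    Digest::MD5.hexdigest(url)
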