monkeyshines 0.0.2
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
data/lib/monkeyshines/scrape_request/paginated.rb
@@ -0,0 +1,290 @@
+require 'time'
+require 'monkeyshines/utils/union_interval'
+module Monkeyshines
+  module ScrapeRequestCore
+
+    #
+    # Paginated lets you make repeated requests to collect a timeline or
+    # collection of items.
+    #
+    # You will typically want to set the max_items and hard_request_limit.
+    #
+    # A Paginated-compatible ScrapeRequest should inherit from or be compatible
+    # with +Monkeyshines::ScrapeRequest+ and additionally define
+    # * [#items]     list of individual items in the response; +nil+ if there was an
+    #                error, +[]+ if the response was well-formed but returned no items.
+    # * [#num_items] number of items from this response
+    # * [#span]      the range of (typically) IDs within this scrape. Used to know when
+    #                we've reached results from a previous session
+    #
+    #
+    module Paginated
+      #
+      # Soft limit on the number of pages to scrape.
+      #
+      # If we know the max_total_items, use it to set the number of pages;
+      # otherwise, let it run up to the hard limit.
+      #
+      # Typically, use this to set an upper limit that you know beforehand, and
+      # use #is_last? to decide based on the results.
+      #
+      def max_pages
+        return hard_request_limit if (! max_total_items)
+        (max_total_items.to_f / max_items).ceil.clamp(0, hard_request_limit)
+      end
+
+      # Number of items returned in this request
+      def num_items
+        items ? items.length : 0
+      end
+
+      # inject class variables
+      def self.included base
+        base.class_eval do
+          # Hard request limit: do not in any case exceed this number of requests
+          class_inheritable_accessor :hard_request_limit
+
+          # max items per page the API might return
+          class_inheritable_accessor :max_items
+
+          # Total items in all requests, if known ahead of time -- eg. a
+          # twitter_user's statuses_count can be used to set the max_total_items
+          # for TwitterUserTimelineRequests
+          attr_accessor :max_total_items
+        end
+      end
+    end # Paginated
+
+    module Paginating
+      #
+      # Generates a request for each page to be scraped.
+      #
+      # The job class must define a #request_for_page(page) method.
+      #
+      # * request is generated
+      # * ... and yielded to the call block (which must return the fulfilled
+      #   scrape_request response)
+      # * after_fetch method chain invoked
+      #
+      # Scraping stops when is_last?(response, page) is true
+      #
+      def each_request info=nil, &block
+        before_pagination()
+        (1..hard_request_limit).each do |page|
+          request  = request_for_page(page, info)
+          response = yield request
+          after_fetch(response, page)
+          break if is_last?(response, page)
+        end
+        after_pagination()
+      end
+
+      # Return true if the next request would be pointless (true if the response
+      # is missing or only partly full, or the API page limit is reached)
+      def is_last? response, page
+        ( (! response) || (page >= response.max_pages) ||
+          (response.healthy? && partial_response?(response)) )
+      end
+      def partial_response? response
+        (response.num_items < response.max_items)
+      end
+
+      # Bookkeeping/setup preceding pagination
+      def before_pagination
+      end
+
+      # Finalize bookkeeping at conclusion of scrape_job.
+      def after_pagination
+      end
+
+      # Feed back info from the fetch
+      def after_fetch response, page
+      end
+
+      # inject class variables
+      def self.included base
+        base.class_eval do
+          # Hard request limit: do not in any case exceed this number of requests
+          class_inheritable_accessor :hard_request_limit
+        end
+      end
+    end # Paginating
+
+    #
+    # Scenario: you request paginated search requests with a limit parameter (a
+    # max_id or min_id, for example).
+    #
+    # * request successive pages,
+    # * use info on the requested page to set the next limit parameter
+    # * stop when max_pages is reached or a successful request gives fewer than
+    #   max_items
+    #
+    #
+    # The first request leaves the limit parameter blank:
+    #
+    #   req?min_id=1234&max_id=
+    #   => [ [8675, ...], ..., [8012, ...] ]  # 100 items
+    #   req?min_id=1234&max_id=8011
+    #   => [ [7581, ...], ..., [2044, ...] ]  # 100 items
+    #   req?min_id=1234&max_id=2043
+    #   => [ [2012, ...], ..., [1234, ...] ]  #  69 items
+    #
+    # * The search terminates when
+    #   ** max_requests requests have been made, or
+    #   ** the limit params interval is zero, or
+    #   ** a successful response with fewer than max_items is received.
+    #
+    # * You will want to save <req?min_id=8676&max_id=""> for a later scrape
+    #
+    module PaginatedWithLimit
+      # Set up bookkeeping for pagination tracking
+      def before_pagination
+        self.started_at    = Time.now.utc
+        self.sess_span     = UnionInterval.new
+        self.sess_timespan = UnionInterval.new
+        super
+      end
+
+      #
+      # Feed back info from the scrape
+      #
+      def after_fetch response, page
+        super response, page
+        update_spans(response) if (response && response.items)
+      end
+
+      # Update intervals to include new response
+      def update_spans response
+        self.sess_span     << response.span
+        self.sess_timespan << response.timespan
+      end
+
+      # Return true if the next request would be pointless (true if, perhaps, the
+      # response had no items, or the API page limit is reached)
+      def is_last? response, page
+        sess_span.include?(prev_max) || super(response, page)
+      end
+
+      def after_pagination
+        self.prev_max      = [prev_max, sess_span.max].compact.max
+        self.sess_span     = UnionInterval.new
+        self.sess_timespan = UnionInterval.new
+        super
+      end
+
+      # inject class variables
+      def self.included base
+        base.class_eval do
+          # Span of items gathered in this scrape job.
+          attr_accessor :sess_span, :sess_timespan, :started_at
+        end
+      end
+    end # PaginatedWithLimit
+
+    module PaginatedWithRate
+      def before_pagination
+        self.sess_items ||= 0
+        super
+      end
+
+      #
+      # Feed back info from the scrape
+      #
+      def after_fetch response, page
+        super response, page
+        update_counts(response) if (response && response.items)
+        # p [response.items.map{|item| item['id']}.max, response.items.map{|item| item['id']}.min, prev_max, sess_span, response.parsed_contents.slice('max_id','next_page')]
+        # p response.items.map{|item| ("%6.2f" % [Time.now - Time.parse(item['created_at'])])}
+      end
+
+      # Count the new items from this response among the session items
+      def update_counts response
+        self.sess_items += response.num_items
+      end
+
+      RATE_PARAMETERS = {
+        :max_session_timespan  => (60 * 60 * 24 * 5), # 5 days
+        :default_scrape_period => (60 * 60 * 2     ), # 2 hours
+        :max_resched_delay     => (60 * 60 * 24 * 1), # 1 day
+        :min_resched_delay     => (5),                # 5 seconds
+        :sess_weight_slowing   => 0.35, # how fast to converge when rate < average
+        :sess_weight_rising    => 1.0,  # how fast to converge when rate > average
+      }
+
+      # Edge cases to handle:
+      # * session returns one result
+      # * session returns no result
+      # * session results clustered at center of nominal timespan
+      #
+      def recalculate_rate!
+        # If there's no good session timespan, we can fake one out
+        self.sess_timespan.max ||= Time.now.utc
+        self.sess_timespan.min ||= self.last_run
+        # Whatever its origin, limit the session timespan
+        if sess_timespan.size > RATE_PARAMETERS[:max_session_timespan]
+          sess_timespan.min = sess_timespan.max - RATE_PARAMETERS[:max_session_timespan]
+        end
+        # Find and limit the session items rate
+        if self.sess_items.to_f < 2
+          self.sess_items = 2
+          sess_items_rate = self.sess_items.to_f / RATE_PARAMETERS[:default_scrape_period]
+        else
+          # Find the items rate
+          sess_items_rate = self.sess_items.to_f / sess_timespan.size.to_f
+        end
+        # Find and limit the previous items rate
+        self.prev_items_rate = self.prev_items_rate.to_f
+        if self.prev_items_rate == 0
+          self.prev_items_rate = target_items_per_job.to_f / RATE_PARAMETERS[:default_scrape_period]
+          self.delay = RATE_PARAMETERS[:default_scrape_period].to_f
+        end
+
+        # New items rate is a weighted average of new and old.
+        #
+        # If the new rate is faster than the prev_rate, we use a high weight
+        # (~1.0); when it is slower, a lower weight makes us converge gently.
+        sess_wt = (sess_items_rate > prev_items_rate) ? RATE_PARAMETERS[:sess_weight_rising] : RATE_PARAMETERS[:sess_weight_slowing]
+        new_items_rate  = (prev_items_rate + (sess_items_rate * sess_wt)) / (1.0 + sess_wt)
+        new_total_items = prev_items.to_i + sess_items.to_i
+        since_start     = (Time.now.utc - self.started_at).to_f
+        new_period      = (target_items_per_job / new_items_rate)
+        new_delay       = new_period - since_start
+
+        # puts %Q{rates %6.3f %6.3f => %6.3f delay %5.2f %5.2f => %5.2f (%5.2f) want %d sess %d items/%5.1fs -- %10d < %10d -- %s } %
+        #   [sess_items_rate, prev_items_rate, new_items_rate,
+        #    target_items_per_job / sess_items_rate, self.delay, new_period, new_delay,
+        #    target_items_per_job, sess_items, sess_timespan.size.to_f,
+        #    sess_span.max, prev_max,
+        #    self.key]
+
+        Log.info(
+          %Q{resched\tit %4d\t%7.3f\t%7.2f\t%7.2f\t%7.2f\t%7.2f\t%10d\t%s } %
+          [sess_items, sess_timespan.size.to_f, target_items_per_job / sess_items_rate, self.delay, new_period, new_delay, prev_max, self.key])
+
+        self.delay           = new_delay.to_f.clamp(RATE_PARAMETERS[:min_resched_delay], RATE_PARAMETERS[:max_resched_delay])
+        self.prev_items_rate = new_items_rate
+        self.prev_items      = new_total_items
+      end
+
+      #
+      # Recalculate the item rates
+      # using the accumulated responses
+      #
+      def after_pagination
+        recalculate_rate!
+        self.sess_items = 0
+        super
+      end
+
+      # inject class variables
+      def self.included base
+        base.class_eval do
+          # Count of items gathered in this scrape job.
+          attr_accessor :sess_items
+          # How many items we hope to pull in for every job
+          cattr_accessor :target_items_per_job
+        end
+      end
+    end # PaginatedWithRate
+  end
+end
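For orientation, here is a minimal sketch of how these mixins fit together: a response class takes Paginated, a job class takes Paginating, and a block fulfills each request. Everything named here (FeedRequest, FeedJob, the example URL, and the class_inheritable_accessor shim, a stand-in for the ActiveSupport 2.x helper the gem relies on) is invented for illustration and is not part of the gem:

    require 'monkeyshines/scrape_request/paginated'  # assumes the gem's lib dir is on the load path

    # Stand-in for class_inheritable_accessor, which the included hooks call.
    class Class
      def class_inheritable_accessor *names
        names.each do |name|
          singleton_class.send(:attr_accessor, name)    # class-level reader/writer
          define_method(name){ self.class.send(name) }  # instance-level reader
        end
      end
    end

    # Hypothetical response class: Paginated supplies #max_pages and #num_items.
    class FeedRequest
      include Monkeyshines::ScrapeRequestCore::Paginated
      self.hard_request_limit = 10    # never plan past ten pages
      self.max_items          = 100   # the API serves at most 100 items per page
      attr_accessor :url, :items
      def initialize(url)
        @url = url
      end
      def healthy?
        ! items.nil?
      end
    end

    # Hypothetical job class: Paginating drives the loop via #request_for_page.
    class FeedJob
      include Monkeyshines::ScrapeRequestCore::Paginating
      self.hard_request_limit = 10
      def request_for_page page, info=nil
        FeedRequest.new("http://api.example.com/feed.json?page=#{page}")
      end
    end

    FeedJob.new.each_request do |req|
      req.items = []  # a real fetcher would fill this; an empty page reads as "last"
      req             # the block must return the fulfilled response
    end

Because the empty page is healthy but partial, is_last? fires on the first page and the loop stops after a single request.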
data/lib/monkeyshines/scrape_request/raw_json_contents.rb
@@ -0,0 +1,16 @@
+require 'json'
+module Monkeyshines
+  module RawJsonContents
+    def parsed_contents
+      return @parsed_contents if @parsed_contents
+      return nil unless contents
+      begin
+        @parsed_contents = JSON.load(contents.to_s)
+      rescue Exception => e
+        warn "JSON not parsing : #{e.to_s[0..2000].gsub(/[\r\n]+/,"")}" ; return nil
+      end
+      @parsed_contents
+    end
+
+  end
+end
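A quick sketch of the mixin in use; DummyResponse is invented, and the only contract is that the host class expose a #contents string:

    require 'monkeyshines/scrape_request/raw_json_contents'  # assumes the gem's lib dir is on the load path

    class DummyResponse
      include Monkeyshines::RawJsonContents
      attr_accessor :contents
    end

    resp = DummyResponse.new
    resp.contents = '{"id":8675,"text":"hello"}'
    resp.parsed_contents  # => {"id"=>8675, "text"=>"hello"}, memoized thereafter

    bad = DummyResponse.new
    bad.contents = '{not json'
    bad.parsed_contents   # warns "JSON not parsing : ..." and returns nil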
data/lib/monkeyshines/scrape_request/signed_url.rb
@@ -0,0 +1,86 @@
+module Monkeyshines
+  module ScrapeRequestCore
+    module SignedUrl
+
+      def sign_url parsed_uri, request_key
+        qq = parsed_uri.query_values || {}
+        qq.merge!(request_key)
+        qq.merge!(
+          'api_key' => api_key,
+          'nonce'   => nonce,
+          'format'  => 'json')
+        p qq
+        qq  = qq.sort.map{|k,v| k+'='+v }
+        str = [ parsed_uri.path, qq, api_secret].flatten.join("")
+        sig = Digest::MD5.hexdigest(str)
+        [qq, sig]
+      end
+
+      def authed_url(url, request_key)
+        parsed_uri = Addressable::URI.parse(url)
+        qq, sig = sign_url(parsed_uri, request_key)
+        [parsed_uri.scheme, '://', parsed_uri.host, parsed_uri.path, '?', qq.join("&"), "&sig=#{sig}"].join("")
+      end
+
+      def nonce
+        Time.now.utc.to_f.to_s
+      end
+
+      def token_request_url
+        "http://api.friendster.com/v1/token?api_key=#{api_key}&nonce=#{nonce}&format=json"
+      end
+    end
+  end
+end
+
+#
+# class TokenRequest < Base
+#   def authed_url
+#     qq = parsed_uri.query_values.merge(
+#       'api_key'    => api_key,
+#       'nonce'      => nonce,
+#       # 'auth_token' => auth_token,
+#       'format'     => 'json').sort.map{|k,v| k+'='+v }
+#     p qq
+#     str = [
+#       parsed_uri.path,
+#       qq,
+#       api_secret].flatten.join("")
+#     p str
+#     sig = Digest::MD5.hexdigest(str)
+#     qq << "sig=#{sig}"
+#     au = [parsed_uri.scheme, '://', parsed_uri.host, parsed_uri.path, '?', qq.join("&")].join("")
+#     p au
+#     au
+#   end
+# end
+#
+# class SessionRequest < Base
+#   def authed_url(auth_token)
+#     qq = parsed_uri.query_values.merge(
+#       'api_key'    => api_key,
+#       'nonce'      => nonce,
+#       'auth_token' => auth_token,
+#       'format'     => 'json').sort.map{|k,v| k+'='+v }
+#     p qq
+#     str = [
+#       parsed_uri.path,
+#       qq,
+#       api_secret].flatten.join("")
+#     p str
+#     sig = Digest::MD5.hexdigest(str)
+#     qq << "sig=#{sig}"
+#     au = [parsed_uri.scheme, '://', parsed_uri.host, parsed_uri.path, '?', qq.join("&")].join("")
+#     p au
+#     au
+#   end
+#   def make_url()
+#     "http://api.friendster.com/v1/session?"
+#   end
+# end
+#
+# # require 'monkeyshines' ; require 'wuclan' ; require 'wukong' ; require 'addressable/uri' ; require 'rest_client' ; scrape_config = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
+# # load(ENV['HOME']+'/ics/wuclan/lib/wuclan/friendster/scrape/base.rb') ; Wuclan::Friendster::Scrape::Base.api_key = scrape_config[:friendster_api][:api_key] ; tokreq = Wuclan::Friendster::Scrape::TokenRequest.new(scrape_config[:friendster_api][:user_id]) ; tok= RestClient.post(tokreq.authed_url, {}).gsub(/\"/,"")
+# # sessreq = Wuclan::Friendster::Scrape::SessionRequest.new(scrape_config[:friendster_api][:user_id])
+# # sessreq.auth_token = '' ; sessreq.make_url! ; RestClient.post(sessreq.url+'&sig='+sessreq.url_sig[1], {})
+# # # => "{"session_key":"....","uid":"...","expires":"..."}"
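To make the signing recipe concrete: the query parameters are merged with api_key, nonce and format, sorted, joined as k=v pairs, and the MD5 hex digest of path + pairs + api_secret is appended as the sig parameter. The sketch below uses invented credentials and a hypothetical host class; the mixin itself expects its host to supply #api_key and #api_secret and assumes Digest::MD5 and Addressable::URI are already loaded:

    require 'digest/md5'
    require 'addressable/uri'
    require 'monkeyshines/scrape_request/signed_url'  # assumes the gem's lib dir is on the load path

    class FriendsterRequest  # hypothetical host class for the mixin
      include Monkeyshines::ScrapeRequestCore::SignedUrl
      attr_accessor :api_key, :api_secret
      def initialize api_key, api_secret
        @api_key, @api_secret = api_key, api_secret
      end
    end

    req = FriendsterRequest.new('my_api_key', 'my_api_secret')
    req.authed_url('http://api.friendster.com/v1/user/12345', 'uid' => '12345')
    # => "http://api.friendster.com/v1/user/12345?api_key=my_api_key&format=json&nonce=...&uid=12345&sig=<md5 hex>"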
data/lib/monkeyshines/store.rb
@@ -0,0 +1,14 @@
+module Monkeyshines
+  module Store
+    extend FactoryModule
+    autoload :Base,                 'monkeyshines/store/base'
+    autoload :FlatFileStore,        'monkeyshines/store/flat_file_store'
+    autoload :ConditionalStore,     'monkeyshines/store/conditional_store'
+    autoload :ChunkedFlatFileStore, 'monkeyshines/store/chunked_flat_file_store'
+    autoload :KeyStore,             'monkeyshines/store/key_store'
+    autoload :TokyoTdbKeyStore,     'monkeyshines/store/tokyo_tdb_key_store'
+    autoload :TyrantTdbKeyStore,    'monkeyshines/store/tyrant_tdb_key_store'
+    autoload :TyrantRdbKeyStore,    'monkeyshines/store/tyrant_rdb_key_store'
+    autoload :ReadThruStore,        'monkeyshines/store/read_thru_store'
+  end
+end
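Since every store is autoload-ed, requiring 'monkeyshines/store' is cheap: a store's file is pulled in only when its constant is first referenced. A minimal sketch (the filename is invented, and passing :filename as an option is inferred from ChunkedFlatFileStore's super call further down):

    require 'logger'
    require 'monkeyshines/store'
    Log = Logger.new($stderr) unless defined?(Log)  # Store::Base logs on creation

    # Touching the constant triggers the autoload of
    # 'monkeyshines/store/flat_file_store'; the other store files stay unloaded.
    store = Monkeyshines::Store::FlatFileStore.new(:filename => '/tmp/scraped.tsv')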
data/lib/monkeyshines/store/base.rb
@@ -0,0 +1,29 @@
+module Monkeyshines
+  module Store
+    class Base
+      attr_accessor :options
+      def initialize _options={}
+        self.options = _options
+        Log.info "Creating #{self.class}"
+      end
+
+      # Yield each stored record, rebuilt as an instance of klass from its non-key fields
+      def each_as klass, &block
+        self.each do |*args|
+          begin
+            item = klass.new *args[1..-1]
+          rescue Exception => e
+            Log.info [args, e.to_s, self].join("\t")
+            raise e
+          end
+          yield item
+        end
+      end
+
+      def log_line
+        nil
+      end
+
+    end
+  end
+end
data/lib/monkeyshines/store/chunked_flat_file_store.rb
@@ -0,0 +1,37 @@
+module Monkeyshines
+  module Store
+    class ChunkedFlatFileStore < Monkeyshines::Store::FlatFileStore
+      attr_accessor :filename_pattern, :chunk_monitor, :handle
+
+      DEFAULT_OPTIONS = {
+        :chunktime => 4*60*60, # default 4 hours
+        :pattern   => ":rootdir/:date/:handle+:timestamp-:pid.tsv",
+        :rootdir   => nil,
+        :filemode  => 'w',
+      }
+
+      def initialize _options
+        self.options = DEFAULT_OPTIONS.deep_merge(_options)
+        raise "You don't really want a chunk time this small: #{options[:chunktime]}" unless options[:chunktime] > 600
+        self.chunk_monitor    = Monkeyshines::Monitor::PeriodicMonitor.new( :time => options[:chunktime] )
+        self.handle           = options[:handle] || Monkeyshines::CONFIG[:handle]
+        self.filename_pattern = Monkeyshines::Utils::FilenamePattern.new(options[:pattern], :handle => handle, :rootdir => options[:rootdir])
+        super options.merge(:filename => filename_pattern.make())
+        self.mkdir!
+      end
+
+      def save *args
+        result = super *args
+        chunk_monitor.periodically do
+          new_filename = filename_pattern.make()
+          Log.info "Rotating chunked file #{filename} into #{new_filename}"
+          self.close
+          @filename = new_filename
+          self.mkdir!
+        end
+        result
+      end
+
+    end
+  end
+end
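Putting the options together, a sketch of constructing one (rootdir and handle are invented, and this assumes the gem plus its Monkeyshines::CONFIG are loaded): the :pattern tokens expand to a dated, timestamped path, and the PeriodicMonitor rolls the file whenever :chunktime elapses between saves.

    store = Monkeyshines::Store::ChunkedFlatFileStore.new(
      :rootdir   => '/data/ripd',   # fills :rootdir in the filename pattern
      :handle    => 'com.twitter',  # fills :handle; defaults to Monkeyshines::CONFIG[:handle]
      :chunktime => 4*60*60         # rotate every four hours; must exceed 600 seconds
    )
    # Each save appends to the current chunk; once chunktime has elapsed the
    # monitor fires and the store rolls over to a freshly named file.
    store.save(scrape_request)      # scrape_request: whatever FlatFileStore#save accepts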