wuclan 0.2.0
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
--- /dev/null
+++ b/data/examples/twitter/old/load_twitter_search_jobs.rb
@@ -0,0 +1,157 @@
+#!/usr/bin/env ruby
+$: << ENV['WUKONG_PATH']
+require File.dirname(__FILE__)+'/config/config_private'
+require 'rubygems'
+require 'trollop'
+require 'wukong'
+require 'monkeyshines'
+#
+require 'wuclan/twitter/scrape' ; include Wuclan
+require 'monkeyshines/utils/uri'
+require 'monkeyshines/fetcher/http_fetcher'
+#
+# Command line options
+#
+opts = Trollop::options do
+  opt :handle,            "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+  opt :items_per_job,     "Desired item count per job", :default => 675
+  opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 30*1
+  opt :job_db,            "Tokyo tyrant db host", :default => ':1978', :type => String
+  opt :log,               "Log file name; leave blank to use STDERR", :type => String
+  # import from file
+  opt :from,              "Location of scrape store to load from", :type => String
+  # output storage
+  opt :chunk_time,        "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+  opt :dest_dir,          "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+  opt :dest_pattern,      "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
+end
+
+module Wuclan
+  module Domains
+    module Twitter
+      module Scrape
+        TwitterSearchJob = Struct.new(
+          :query_term,
+          :priority,
+          :prev_items,
+          :prev_rate,
+          :prev_span_min,
+          :prev_span_max
+        )
+      end
+    end
+  end
+end
+
+
+# Queue of request import_jobs, with reschedule requests
+beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+# Scrape requests by HTTP
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+# Log every 60 seconds
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+# Persist scrape_job jobs in distributed DB
+job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+# Import
+if opts[:from]
+  import_jobs = Monkeyshines::Store::FlatFileStore.new(opts[:from], :filemode => 'r')
+end
+
+#
+# Keep one unique copy of each scrape_job. The most senior instance (the one
+# with the highest prev_items) wins.
+#
+SCRAPES = { }
+def add_scrape_job scrape_job
+  return if SCRAPES[scrape_job.query_term] &&
+    (SCRAPES[scrape_job.query_term].prev_items.to_i >= scrape_job.prev_items.to_i)
+  SCRAPES[scrape_job.query_term] = scrape_job
+end
+
+Monkeyshines::RequestStream::BeanstalkQueue.class_eval do
+  #
+  # An (extremely dangerous) routine to examine all the jobs in the queue --
+  # since I don't know another way we pull all of them out and then put all of
+  # them back in.
+  #
+  def scrub_all &block
+    job_queue.connect()
+    File.open("/tmp/qjobs-#{Time.now.strftime("%H%M%S")}.tsv", "w") do |dump|
+      loop do
+        # Kick a bunch of jobs across all connections
+        $stderr.puts job_queue_stats.inspect
+        kicked = job_queue.open_connections.map{|conxn| conxn.kick(20) }
+        kicked = kicked.inject(0){|sum, n| sum += n }
+        # For all the jobs we can get our hands on quickly,
+        while(qjob = reserve_job!(0.5)) do
+          # send it in for processing
+          scrape_job = scrape_job_from_qjob(qjob)
+          yield scrape_job
+          # last recourse in case something goes wrong.
+          dump << scrape_job.to_flat.join("\t")+"\n"
+          # and remove it from the pool
+          qjob.delete
+        end
+        # stop when there's no more qjobs
+        break if (job_queue_total_jobs == 0) && (!job_queue.peek_ready)
+      end
+    end
+  end
+end
+
+begin
+  #
+  # Catalog the jobs in the persistent store
+  #
+  job_store.each do |key, hsh|
+    scrape_job = Twitter::Scrape::TwitterSearchJob.from_hash hsh
+    periodic_log.periodically{ [scrape_job] }
+    add_scrape_job scrape_job
+  end
+
+  #
+  # Catalog the jobs in the transient queue
+  #
+  request_queue.scrub_all do |scrape_job|
+    periodic_log.periodically{ [scrape_job] }
+    add_scrape_job scrape_job
+  end
+
+  #
+  # Import jobs from a static file
+  #
+  import_jobs.each_as(Twitter::Scrape::TwitterSearchJob) do |scrape_job|
+    next if (scrape_job.query_term =~ /^#/) || (scrape_job.query_term.blank?)
+    periodic_log.periodically{ [scrape_job] }
+    add_scrape_job scrape_job
+    # SCRAPES[scrape_job.query_term].priority = scrape_job.priority unless scrape_job.priority.blank?
+    # SCRAPES[scrape_job.query_term] = scrape_job
+  end
+rescue Exception => e
+  warn e
+ensure
+  #
+  # Serialize them to disk
+  #
+  sorted = SCRAPES.sort_by{|term,scrape_job| [scrape_job.priority||65536, -(scrape_job.prev_rate||1440), term] }
+  sorted.each do |term, scrape_job|
+    # scrape_job.prev_rate  = [scrape_job.prev_rate.to_f, 0.01].max if scrape_job.prev_rate
+    # scrape_job.prev_items = 1000
+    puts scrape_job.to_flat[1..-1].join("\t")
+  end
+end
+
+request_queue.min_resched_delay = 10
+sorted.each do |term, scrape_job|
+  #
+  # Persist the updated job to the job_store db, so that we can restart queue easily
+  job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+
+  #
+  # re-enqueue the job. If it's run before, accelerate its next call; if never
+  # run before schedule for immediate run.
+  delay = (scrape_job.prev_rate ? request_queue.delay_to_next_scrape(scrape_job)/3 : 0)
+  request_queue.save scrape_job, scrape_job.priority, delay
+end
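
The loader above merges job definitions from three sources (the tyrant job store, the live beanstalk queue, and an optional flat file), keeping only the most senior copy of each query term. A minimal standalone sketch of that merge rule, with a hypothetical Job struct standing in for TwitterSearchJob:

    # Seniority rule: among duplicate jobs for a term, the copy that has
    # already seen the most items wins.
    Job = Struct.new(:query_term, :prev_items)

    SCRAPES = {}
    def add_scrape_job job
      seen = SCRAPES[job.query_term]
      return if seen && (seen.prev_items.to_i >= job.prev_items.to_i)
      SCRAPES[job.query_term] = job
    end

    add_scrape_job Job.new('#ruby', 120)
    add_scrape_job Job.new('#ruby',  40)  # ignored: less senior
    add_scrape_job Job.new('#ruby', 300)  # replaces the 120-item copy
    puts SCRAPES['#ruby'].prev_items      # => 300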
--- /dev/null
+++ b/data/examples/twitter/old/scrape_twitter_api.rb
@@ -0,0 +1,104 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'monkeyshines'
+require 'monkeyshines/runner/base'
+require 'pathname'
+
+#
+require 'wuclan/twitter'
+# un-namespace request classes.
+include Wuclan::Twitter::Scrape
+
+WORK_DIR = Pathname.new(File.dirname(__FILE__)+"/rawd").realpath.to_s
+
+# ===========================================================================
+#
+# scrape_shorturls.rb --
+#
+# To scrape from a list of shortened urls:
+#
+#   ./shorturl_random_scrape.rb --from-type=FlatFileStore --from=request_urls.tsv
+#
+# To do a random scrape:
+#
+#   ./shorturl_random_scrape.rb --from-type=RandomUrlStream --base-url=tinyurl.com
+#     --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
+#
+#
+opts = Trollop::options do
+  opt :log,          "Log to file instead of STDERR"
+  # input from file
+  opt :from,         "URI for scrape store to load from", :type => String
+  opt :skip,         "Initial lines to skip", :type => Integer
+  # output storage
+  opt :cache_loc,    "URI for cache server", :type => String, :default => ':10022'
+  opt :chunk_time,   "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+  opt :dest_dir,     "Filename base to store output. default ./work/ripd", :default => WORK_DIR+'/ripd'
+  opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:date/:handle+:timestamp-:pid.tsv"
+end
+opts[:handle] ||= 'com.twitter'
+scrape_config = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
+opts.merge! scrape_config
+
+# ******************** Log ********************
+if (opts[:log])
+  opts[:log] = (WORK_DIR+'/log/'+File.basename(opts[:from],'.tsv'))
+  $stdout = $stderr = File.open(opts[:log]+"-console.log", "a")
+end
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 1, :time => 30)
+
+#
+# ******************** Load from store ********************
+#
+class TwitterRequestStream < Monkeyshines::RequestStream::Base
+  def each *args
+    request_store.each(*args) do |twitter_user_id, *_|
+      yield TwitterUserRequest.new(twitter_user_id, 1, "")
+    end
+  end
+end
+src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
+src_store.skip!(opts[:skip].to_i) if opts[:skip]
+request_stream = TwitterRequestStream.new TwitterUserRequest, src_store
+
+#
+# ******************** Store output ********************
+#
+# Track visited URLs with key-value database
+#
+dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(opts[:cache_loc])
+
+#
+# Store the data into flat files
+#
+dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], :handle => opts[:handle], :dest_dir => opts[:dest_dir])
+dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern, opts[:chunk_time].to_i, opts)
+
+#
+# Conditional store uses the key-value DB to boss around the flat files --
+# requests are only made (and thus data is only output) if the url is missing
+# from the key-value store.
+#
+dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
+
+#
+# ******************** Fetcher ********************
+#
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new opts[:twitter_api]
+
+
+#
+# ******************** Do this thing ********************
+#
+Log.info "Beginning scrape itself"
+request_stream.each do |req|
+  # conditional store only calls fetcher if url key is missing.
+  result = dest_store.set(req.url) do
+    response = fetcher.get(req)     # do the url fetch
+    next unless response.healthy?   # don't store bad fetches
+    [response.scraped_at, response] # timestamp into cache, result into flat file
+  end
+  periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
+end
+dest_store.close
+fetcher.close
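
The heart of the script above is the ConditionalStore: a key-value cache that gates the flat-file output, so the expensive fetch block runs only when the url has never been seen. A rough sketch of that contract (class name and internals here are invented; only the behavior described in the comments above is assumed):

    class SketchConditionalStore
      def initialize(cache, files)
        @cache, @files = cache, files
      end

      # Run the block only on a cache miss; cache its first element and
      # append its second to the flat-file store.
      def set(key)
        return :already_seen if @cache.key?(key)
        cache_val, file_val = yield          # e.g. [scraped_at, response]
        return :skipped if cache_val.nil?    # block bailed on a bad fetch
        @cache[key] = cache_val
        @files << file_val
        :stored
      end
    end

    store = SketchConditionalStore.new({}, [])
    store.set('http://example.com/u/1') { [Time.now, 'body'] }          # => :stored
    store.set('http://example.com/u/1') { raise 'never fetched twice' } # => :already_seen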
--- /dev/null
+++ b/data/examples/twitter/old/scrape_twitter_search.rb
@@ -0,0 +1,57 @@
+#!/usr/bin/env ruby
+$: << ENV['WUKONG_PATH']
+require File.dirname(__FILE__)+'/config/config_private'
+require 'rubygems'
+require 'trollop'
+require 'wukong'
+require 'monkeyshines'
+require 'wuclan/twitter/scrape' ; include Wuclan
+
+require 'monkeyshines/fetcher/http_fetcher'
+require 'monkeyshines/utils/filename_pattern'
+#
+# Command line options
+#
+opts = Trollop::options do
+  opt :handle,            "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+  opt :items_per_job,     "Desired item count per job", :default => 1000
+  opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 20*1
+  opt :job_db,            "Tokyo tyrant db host", :default => ':1978', :type => String
+  opt :log,               "Log file name; leave blank to use STDERR", :type => String
+  # output storage
+  opt :chunk_time,        "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+  opt :dest_dir,          "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+  opt :dest_pattern,      "Pattern for dump file output", :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+end
+Trollop::die :dest_dir unless opts[:dest_dir]
+
+# Queue of request scrape_jobs, with reschedule requests
+beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+# Scrape Store for completed requests
+dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], opts.slice(:handle, :dest_dir))
+dest = Monkeyshines::Store::ChunkedFlatFileStore.new dest_pattern, opts[:chunk_time].to_i
+# Scrape requests by HTTP
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+# Log every 60 seconds
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+# Persist scrape_job jobs in distributed DB
+job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+request_queue.each do |scrape_job|
+  # Run through all pages for this search term
+  scrape_job.each_request do |req|
+    # Fetch request
+    response = fetcher.get(req)
+    # save it if successful
+    dest.save response if response
+    # log progress
+    periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+    # return it to the scrape_job for bookkeeping
+    response
+  end
+  # Persist the updated job to the scrape_jobs db, so that we can restart queue easily
+  job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+  # sleep 0.5
+end
+request_queue.close
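
Note the shape of the loop above: each_request is a pagination handshake in which the job yields successive page requests and the caller returns each response so the job can decide whether another page is warranted. A schematic version of that handshake (the stop-on-short-page logic is invented for illustration; the real TwitterSearchJob tracks prev_items and rate):

    class PaginatedJob
      PER_PAGE = 100

      def initialize(term)
        @term, @page = term, 1
      end

      # Yield one request per page; the block's return value is the
      # response, which decides whether to continue.
      def each_request
        loop do
          response = yield "search?q=#{@term}&page=#{@page}"
          break if response.nil? || response[:num_items] < PER_PAGE
          @page += 1
        end
      end
    end

    job = PaginatedJob.new('wukong')
    job.each_request do |req|
      num = req.include?('page=3') ? 40 : 100  # fake fetch: page 3 is short
      { :num_items => num }                    # handed back for bookkeeping
    end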
--- /dev/null
+++ b/data/examples/twitter/old/scrape_twitter_trending.rb
@@ -0,0 +1,73 @@
+#!/usr/bin/env ruby
+$: << ENV['WUKONG_PATH']
+require File.dirname(__FILE__)+'/config/config_private'
+require 'rubygems'
+require 'trollop'
+require 'wukong'
+require 'monkeyshines'
+require 'wuclan/twitter/scrape' ; include Wuclan
+
+require 'monkeyshines/fetcher/http_fetcher'
+require 'monkeyshines/utils/filename_pattern'
+
+#
+# Command line options
+#
+opts = Trollop::options do
+  opt :dumpfile_dir,        "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+  opt :dumpfile_pattern,    "Pattern for dump file output",
+      :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+  opt :dumpfile_chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer,
+      :default => 60*60*24
+  opt :handle,              "Handle to uniquely identify this scrape",
+      :default => 'com.twitter.search'
+  opt :min_resched_delay,   "Don't run jobs more often than this (in seconds)",
+      :default => 60*1
+end
+Trollop::die :dumpfile_dir unless opts[:dumpfile_dir]
+
+# Queue of request jobs, with reschedule requests
+# opts[:beanstalk_tube] ||= opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay)) # , :beanstalk_tube
+# Scrape Store for completed requests
+dumpfile_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dumpfile_pattern], opts.slice(:handle, :dumpfile_dir))
+store = Monkeyshines::Store::ChunkedFlatFileStore.new dumpfile_pattern, opts[:dumpfile_chunk_time].to_i
+# Scrape requests by HTTP
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+# Log every 60 seconds
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+
+
+class TwitterTrendingJob < Struct.new(
+    :query_term,
+    :priority,
+    :period
+    )
+
+end
+
+
+# %w[
+#   http://search.twitter.com/trends/current.format , 60*60
+#   http://search.twitter.com/trends/daily.json?date=2009-03-19
+#   http://search.twitter.com/trends/weekly.json?date=2009-03-19
+# ]
+
+
+
+request_queue.each do |scrape_job|
+  # Run through all pages for this search term
+  scrape_job.each_request do |req|
+    # Make request
+    response = fetcher.get(req)
+    # save it if successful
+    store.save response if response
+    # log progress
+    periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+    # return it to the scrape_job for bookkeeping
+    response
+  end
+end
+request_queue.finish
+
+# Twitter::Scrape::Scrape_Job.hard_request_limit = 15
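
The commented-out URL list hints at the role of TwitterTrendingJob's :period field: each trending endpoint has its own natural refresh interval. One plausible wiring, purely illustrative (only the hourly period for current is annotated in the source; the other two periods are assumptions):

    TwitterTrendingJob = Struct.new(:query_term, :priority, :period)

    TRENDING_PERIODS = {
      'http://search.twitter.com/trends/current.json' => 60*60,       # annotated above
      'http://search.twitter.com/trends/daily.json'   => 60*60*24,    # assumed
      'http://search.twitter.com/trends/weekly.json'  => 60*60*24*7,  # assumed
    }

    TRENDING_PERIODS.each do |url, period|
      job = TwitterTrendingJob.new(url, nil, period)
      puts [job.query_term, "refresh every #{job.period}s"].join("\t")
    end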
--- /dev/null
+++ b/data/examples/twitter/parse/parse_twitter_requests.rb
@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+#$: << ENV['WUKONG_PATH']
+require 'rubygems'
+require 'wukong'
+require 'monkeyshines'
+
+require 'wuclan/twitter'
+# if you're anyone but original author this next require is useless but harmless.
+require 'wuclan/twitter/scrape/old_skool_request_classes'
+# un-namespace request classes.
+include Wuclan::Twitter::Scrape
+include Wuclan::Twitter::Model
+
+#
+#
+# Instantiate each incoming request.
+# Stream out the contained classes it generates.
+#
+#
+class TwitterRequestParser < Wukong::Streamer::StructStreamer
+
+  def process request, *args, &block
+    request.parse(*args) do |obj|
+      next if obj.is_a? BadRecord
+      yield obj.to_flat(false)
+    end
+  end
+end
+
+#
+# We want to record each individual state of the resource, with the last-seen of
+# its timestamps (if there are many). So if we saw
+#
+#   rsrc  id  screen_name  followers_count  friends_count  (... more)
+#   user  23  skidoo       47               61
+#   user  23  skidoo       48               62
+#   user  23  skidoo       48               62
+#   user  23  skidoo       52               62
+#   user  23  skidoo       52               63
+#
+#
+class TwitterRequestUniqer < Wukong::Streamer::UniqByLastReducer
+  include Wukong::Streamer::StructRecordizer
+
+  attr_accessor :uniquer_count
+
+  #
+  #
+  #
+  #
+  # for immutable objects we can just work off their ID.
+  #
+  # for mutable objects we want to record each unique state: all the fields
+  # apart from the scraped_at timestamp.
+  #
+  def get_key obj
+    case obj
+    when Tweet
+      obj.id
+    when AFollowsB, AFavoritesB, ARepliesB, AAtsignsB, AAtsignsBId, ARetweetsB, ARetweetsBId, TwitterUserId
+      obj.key
+    when TwitterUser, TwitterUserProfile, TwitterUserStyle, TwitterUserPartial
+      [obj.id] + obj.to_a[2..-1]
+    else
+      raise "Don't know how to extract key from #{obj.class}"
+    end
+  end
+
+  def start! *args
+    self.uniquer_count = 0
+    super *args
+  end
+
+  def accumulate obj
+    self.uniquer_count += 1
+    self.final_value = [self.uniquer_count, obj.to_flat].flatten
+  end
+end
+
+# This makes the script go.
+Wukong::Script.new(TwitterRequestParser, TwitterRequestUniqer).run
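
The get_key dispatch above is what gives the uniquer its semantics: immutable records (tweets, edges) dedupe on their id or key alone, while mutable user records key on their full field state minus the scraped_at timestamp, so every distinct observed state survives and repeated sightings of the same state collapse. A toy demonstration (FakeUser stands in for TwitterUser; that field 1 is the scraped_at timestamp is an assumption about the real layout):

    FakeUser = Struct.new(:id, :scraped_at, :followers_count, :friends_count)

    # Same recipe as get_key above: the id plus every field after scraped_at.
    def state_key(obj)
      [obj.id] + obj.to_a[2..-1]
    end

    a = FakeUser.new(23, '20090301', 47, 61)
    b = FakeUser.new(23, '20090302', 47, 61)  # same state, later timestamp
    c = FakeUser.new(23, '20090303', 48, 62)  # genuinely new state

    puts state_key(a) == state_key(b)  # => true:  b collapses onto a
    puts state_key(a) == state_key(c)  # => false: c is kept as a new state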
--- /dev/null
+++ b/data/examples/twitter/parse/parse_twitter_search_requests.rb
@@ -0,0 +1,28 @@
+#!/usr/bin/env ruby
+#$: << ENV['WUKONG_PATH']
+require 'rubygems'
+require 'wukong'
+require 'monkeyshines'
+
+require 'wuclan/twitter'
+require 'wuclan/twitter/scrape/twitter_search_request'
+require 'wuclan/twitter/parse/twitter_search_parse'
+include Wuclan::Twitter::Scrape
+
+#
+#
+# Instantiate each incoming request.
+# Stream out the contained classes it generates.
+#
+#
+class TwitterRequestParser < Wukong::Streamer::StructStreamer
+  def process request, *args, &block
+    request.parse(*args) do |obj|
+      next if obj.is_a? BadRecord
+      yield obj.to_flat(false)
+    end
+  end
+end
+
+# This makes the script go.
+Wukong::Script.new(TwitterRequestParser, nil).run
--- /dev/null
+++ b/data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb
@@ -0,0 +1,61 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'monkeyshines'
+require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
+
+#
+# * jobs stream from a flat file
+#
+# * each job generates one or several requests (ex: followers_ids,
+#   friends_ids, user_timeline, favorites). Paginated requests stop when results
+#   overlap the prev_max item, as tracked from a central store).
+#
+# * Each request is fetched with the standard HTTP fetcher.
+#
+# * Jobs are rescheduled based on the observed item rate
+#
+# * results are sent to a ChunkedFlatFileStore
+#
+
+#
+# Follow-on requests to make for each user
+# You can also specify these with --source-fetches on the command line
+#
+DEFAULT_SOURCE_FETCHES = [
+  :user,
+  # :followers_ids, :friends_ids,
+  :followers, :friends,
+  # :favorites
+]
+
+Monkeyshines::CMDLINE_OPTIONS << [:source_fetches, "Follow-on requests to make. Default '#{DEFAULT_SOURCE_FETCHES.join(',')}'", { :default => DEFAULT_SOURCE_FETCHES.join(',') }]
+Monkeyshines::CMDLINE_OPTIONS << [:source_skip, "Initial lines to skip", { :default => 1 }]
+
+# Setup
+WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
+Monkeyshines.load_global_options!
+Monkeyshines.load_cmdline_options!
+Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
+
+#
+# Don't spend all day on follow-on requests
+#
+{ TwitterFollowersRequest => 10,
+  TwitterFriendsRequest   => 10,
+  TwitterFavoritesRequest =>  4, }.each{|klass, limit| klass.hard_request_limit = limit }
+
+#
+# Set up scraper
+#
+scraper = Monkeyshines::Runner.new({
+  :log    => { :iters => 100, :dest => Monkeyshines::CONFIG[:handle] },
+  :source => { :type => TwitterRequestStream },
+  :dest   => { :type => :chunked_flat_file_store, :rootdir => WORK_DIR },
+  # :fetcher => { :type => TwitterFakeFetcher },
+  :sleep_time => 0,
+})
+
+#
+# Run scraper
+#
+scraper.run
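
The hard_request_limit settings above cap how many paginated follow-on fetches any single user can cost, so one heavily-followed account can't stall the whole run. A minimal sketch of such a class-level budget (the accessor name mirrors the diff; everything else here is invented):

    class SketchRequest
      class << self
        attr_accessor :hard_request_limit
      end

      # Fetch at most hard_request_limit pages, however many the user has.
      def each_page
        1.upto(self.class.hard_request_limit) { |page| yield page }
      end
    end

    class FollowersRequest < SketchRequest; end
    FollowersRequest.hard_request_limit = 10

    pages = []
    FollowersRequest.new.each_page { |p| pages << p }
    puts pages.size  # => 10: ten pages max per user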