wuclan 0.2.0
This diff shows the content of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
data/examples/twitter/old/load_twitter_search_jobs.rb
@@ -0,0 +1,157 @@
+#!/usr/bin/env ruby
+$: << ENV['WUKONG_PATH']
+require File.dirname(__FILE__)+'/config/config_private'
+require 'rubygems'
+require 'trollop'
+require 'wukong'
+require 'monkeyshines'
+#
+require 'wuclan/twitter/scrape' ; include Wuclan
+require 'monkeyshines/utils/uri'
+require 'monkeyshines/fetcher/http_fetcher'
+#
+# Command line options
+#
+opts = Trollop::options do
+  opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+  opt :items_per_job, "Desired item count per job", :default => 675
+  opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 30*1
+  opt :job_db, "Tokyo tyrant db host", :default => ':1978', :type => String
+  opt :log, "Log file name; leave blank to use STDERR", :type => String
+  # import from file
+  opt :from, "Location of scrape store to load from", :type => String
+  # output storage
+  opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+  opt :dest_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+  opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
+end
+
+module Wuclan
+  module Domains
+    module Twitter
+      module Scrape
+        TwitterSearchJob = Struct.new(
+          :query_term,
+          :priority,
+          :prev_items,
+          :prev_rate,
+          :prev_span_min,
+          :prev_span_max
+        )
+      end
+    end
+  end
+end
+
+
+# Queue of request import_jobs, with reschedule requests
+beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+# Scrape requests by HTTP
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+# Log every 60 seconds
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+# Persist scrape_job jobs in distributed DB
+job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+# Import
+if opts[:from]
+  import_jobs = Monkeyshines::Store::FlatFileStore.new(opts[:from], :filemode => 'r')
+end
+
+#
+# Keep one unique copy of each scrape_job. The most senior instance (the one
+# with the highest prev_items) wins.
+#
+SCRAPES = { }
+def add_scrape_job scrape_job
+  return if SCRAPES[scrape_job.query_term] &&
+    (SCRAPES[scrape_job.query_term].prev_items.to_i >= scrape_job.prev_items.to_i)
+  SCRAPES[scrape_job.query_term] = scrape_job
+end
+
+Monkeyshines::RequestStream::BeanstalkQueue.class_eval do
+  #
+  # An (extremely dangerous) routine to examine all the jobs in the queue--
+  # since I don't know another way we pull all of them out and then put all of
+  # them back in.
+  #
+  def scrub_all &block
+    job_queue.connect()
+    File.open("/tmp/qjobs-#{Time.now.strftime("%H%M%S")}.tsv", "w") do |dump|
+      loop do
+        # Kick a bunch of jobs across all connections
+        $stderr.puts job_queue_stats.inspect
+        kicked = job_queue.open_connections.map{|conxn| conxn.kick(20) }
+        kicked = kicked.inject(0){|sum, n| sum += n }
+        # For all the jobs we can get our hands on quickly,
+        while(qjob = reserve_job!(0.5)) do
+          # send it in for processing
+          scrape_job = scrape_job_from_qjob(qjob)
+          yield scrape_job
+          # last recourse in case something goes wrong.
+          dump << scrape_job.to_flat.join("\t")+"\n"
+          # and remove it from the pool
+          qjob.delete
+        end
+        # stop when there's no more qjobs
+        break if (job_queue_total_jobs == 0) && (!job_queue.peek_ready)
+      end
+    end
+  end
+end
+
+begin
+  #
+  # Catalog the jobs in the persistent store
+  #
+  job_store.each do |key, hsh|
+    scrape_job = Twitter::Scrape::TwitterSearchJob.from_hash hsh
+    periodic_log.periodically{ [scrape_job] }
+    add_scrape_job scrape_job
+  end
+
+  #
+  # Catalog the jobs in the transient queue
+  #
+  request_queue.scrub_all do |scrape_job|
+    periodic_log.periodically{ [scrape_job] }
+    add_scrape_job scrape_job
+  end
+
+  #
+  # Import jobs from a static file
+  #
+  import_jobs.each_as(Twitter::Scrape::TwitterSearchJob) do |scrape_job|
+    next if (scrape_job.query_term =~ /^#/) || (scrape_job.query_term.blank?)
+    periodic_log.periodically{ [scrape_job] }
+    add_scrape_job scrape_job
+    # SCRAPES[scrape_job.query_term].priority = scrape_job.priority unless scrape_job.priority.blank?
+    # SCRAPES[scrape_job.query_term] = scrape_job
+  end
+rescue Exception => e
+  warn e
+ensure
+  #
+  # Serialize them to disk
+  #
+  sorted = SCRAPES.sort_by{|term,scrape_job| [scrape_job.priority||65536, -(scrape_job.prev_rate||1440), term] }
+  sorted.each do |term, scrape_job|
+    # scrape_job.prev_rate = [scrape_job.prev_rate.to_f, 0.01].max if scrape_job.prev_rate
+    # scrape_job.prev_items = 1000
+    puts scrape_job.to_flat[1..-1].join("\t")
+  end
+end
+
+request_queue.min_resched_delay = 10
+sorted.each do |term, scrape_job|
+  #
+  # Persist the updated job to the job_store db, so that we can restart queue easily
+  job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+
+  #
+  # re-enqueue the job. If it's run before, accelerate its next call; if never
+  # run before schedule for immediate run.
+  delay = (scrape_job.prev_rate ? request_queue.delay_to_next_scrape(scrape_job)/3 : 0)
+  request_queue.save scrape_job, scrape_job.priority, delay
+end
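Three sources feed the same SCRAPES hash above (the tyrant job store, the beanstalk queue, the seed file), and add_scrape_job arbitrates between them: the copy that has seen the most items wins. A minimal sketch of that rule in isolation, with OpenStruct standing in for TwitterSearchJob:

    require 'ostruct'

    SCRAPES = {}

    # Keep the most senior copy of each job: highest prev_items wins.
    def add_scrape_job(job)
      reigning = SCRAPES[job.query_term]
      return if reigning && reigning.prev_items.to_i >= job.prev_items.to_i
      SCRAPES[job.query_term] = job
    end

    add_scrape_job OpenStruct.new(:query_term => 'wukong', :prev_items => 10)
    add_scrape_job OpenStruct.new(:query_term => 'wukong', :prev_items => 3)  # junior copy: ignored
    SCRAPES['wukong'].prev_items  # => 10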
data/examples/twitter/old/scrape_twitter_api.rb
@@ -0,0 +1,104 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'monkeyshines'
+require 'monkeyshines/runner/base'
+require 'pathname'
+
+#
+require 'wuclan/twitter'
+# un-namespace request classes.
+include Wuclan::Twitter::Scrape
+
+WORK_DIR = Pathname.new(File.dirname(__FILE__)+"/rawd").realpath.to_s
+
+# ===========================================================================
+#
+# scrape_shorturls.rb --
+#
+# To scrape from a list of shortened urls:
+#
+#   ./shorturl_random_scrape.rb --from-type=FlatFileStore --from=request_urls.tsv
+#
+# To do a random scrape:
+#
+#   ./shorturl_random_scrape.rb --from-type=RandomUrlStream --base-url=tinyurl.com
+#     --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
+#
+#
+opts = Trollop::options do
+  opt :log, "Log to file instead of STDERR"
+  # input from file
+  opt :from, "URI for scrape store to load from", :type => String
+  opt :skip, "Initial lines to skip", :type => Integer
+  # output storage
+  opt :cache_loc, "URI for cache server", :type => String, :default => ':10022'
+  opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+  opt :dest_dir, "Filename base to store output. default ./work/ripd", :default => WORK_DIR+'/ripd'
+  opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:date/:handle+:timestamp-:pid.tsv"
+end
+opts[:handle] ||= 'com.twitter'
+scrape_config = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
+opts.merge! scrape_config
+
+# ******************** Log ********************
+if (opts[:log])
+  opts[:log] = (WORK_DIR+'/log/'+File.basename(opts[:from],'.tsv'))
+  $stdout = $stderr = File.open(opts[:log]+"-console.log", "a")
+end
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 1, :time => 30)
+
+#
+# ******************** Load from store ********************
+#
+class TwitterRequestStream < Monkeyshines::RequestStream::Base
+  def each *args
+    request_store.each(*args) do |twitter_user_id, *_|
+      yield TwitterUserRequest.new(twitter_user_id, 1, "" )
+    end
+  end
+end
+src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
+src_store.skip!(opts[:skip].to_i) if opts[:skip]
+request_stream = TwitterRequestStream.new TwitterUserRequest, src_store
+
+#
+# ******************** Store output ********************
+#
+# Track visited URLs with key-value database
+#
+dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(opts[:cache_loc])
+
+#
+# Store the data into flat files
+#
+dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], :handle => opts[:handle], :dest_dir => opts[:dest_dir])
+dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern, opts[:chunk_time].to_i, opts)
+
+#
+# Conditional store uses the key-value DB to boss around the flat files --
+# requests are only made (and thus data is only output) if the url is missing
+# from the key-value store.
+#
+dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
+
+#
+# ******************** Fetcher ********************
+#
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new opts[:twitter_api]
+
+
+#
+# ******************** Do this thing ********************
+#
+Log.info "Beginning scrape itself"
+request_stream.each do |req|
+  # conditional store only calls fetcher if url key is missing.
+  result = dest_store.set(req.url) do
+    response = fetcher.get(req)      # do the url fetch
+    next unless response.healthy?    # don't store bad fetches
+    [response.scraped_at, response]  # timestamp into cache, result into flat file
+  end
+  periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
+end
+dest_store.close
+fetcher.close
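The ConditionalStore comment above is the heart of this script: the key-value cache is consulted first, the expensive block (the HTTP fetch) only runs on a miss, and whatever the block returns is split into a cache entry and a flat-file record. A minimal sketch of that contract, with a plain Hash and Array as hypothetical stand-ins for the tyrant cache and chunked flat-file store:

    # Sketch of the ConditionalStore contract: run the block only on a cache miss.
    class ConditionalStoreSketch
      attr_reader :misses

      def initialize(cache, files)
        @cache, @files, @misses = cache, files, 0
      end

      # Returns nil on a cache hit; otherwise runs the block, which yields
      # [cache_value, record] -- cache_value is memoized under key, record
      # goes to the flat-file store. A block that bails out yields nil.
      def set(key)
        return nil if @cache.key?(key)
        @misses += 1
        cache_value, record = yield
        return nil unless record
        @cache[key] = cache_value
        @files << record
        cache_value
      end
    end

    store = ConditionalStoreSketch.new({}, [])
    store.set('http://example.com/a') { [Time.now.to_i, 'fetched page A'] }
    store.set('http://example.com/a') { raise 'never reached' }  # hit: block skipped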
data/examples/twitter/old/scrape_twitter_search.rb
@@ -0,0 +1,57 @@
+#!/usr/bin/env ruby
+$: << ENV['WUKONG_PATH']
+require File.dirname(__FILE__)+'/config/config_private'
+require 'rubygems'
+require 'trollop'
+require 'wukong'
+require 'monkeyshines'
+require 'wuclan/twitter/scrape' ; include Wuclan
+
+require 'monkeyshines/fetcher/http_fetcher'
+require 'monkeyshines/utils/filename_pattern'
+#
+# Command line options
+#
+opts = Trollop::options do
+  opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+  opt :items_per_job, "Desired item count per job", :default => 1000
+  opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 20*1
+  opt :job_db, "Tokyo tyrant db host", :default => ':1978', :type => String
+  opt :log, "Log file name; leave blank to use STDERR", :type => String
+  # output storage
+  opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+  opt :dest_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+  opt :dest_pattern, "Pattern for dump file output", :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+end
+Trollop::die :dest_dir unless opts[:dest_dir]
+
+# Queue of request scrape_jobs, with reschedule requests
+beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+# Scrape Store for completed requests
+dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], opts.slice(:handle, :dest_dir))
+dest = Monkeyshines::Store::ChunkedFlatFileStore.new dest_pattern, opts[:chunk_time].to_i
+# Scrape requests by HTTP
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+# Log every 60 seconds
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+# Persist scrape_job jobs in distributed DB
+job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+request_queue.each do |scrape_job|
+  # Run through all pages for this search term
+  scrape_job.each_request do |req|
+    # Fetch request
+    response = fetcher.get(req)
+    # save it if successful
+    dest.save response if response
+    # log progress
+    periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+    # return it to the scrape_job for bookkeeping
+    response
+  end
+  # Persist the updated job to the scrape_jobs db, so that we can restart queue easily
+  job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+  # sleep 0.5
+end
+request_queue.close
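The loop above leans entirely on scrape_job.each_request: the job emits one request per results page, sees each response handed back at the end of the block, and uses it to decide whether another page is warranted. A minimal sketch of that paging contract, under stated assumptions -- PageRequest, PageResponse, and the constants here are illustrative, not the wuclan API:

    # Sketch of the each_request paging contract: yield one request per page,
    # stop when a page comes back short or an assumed page cap is reached.
    PageRequest  = Struct.new(:query_term, :page)
    PageResponse = Struct.new(:num_items)

    class SearchJobSketch
      PER_PAGE  = 100  # items on a full page (assumed)
      MAX_PAGES = 15   # assumed hard cap on pages per job

      def initialize(query_term)
        @query_term = query_term
      end

      def each_request
        1.upto(MAX_PAGES) do |page|
          response = yield PageRequest.new(@query_term, page)
          break if response.nil? || response.num_items < PER_PAGE  # short page: done
        end
      end
    end

    job = SearchJobSketch.new('wukong')
    job.each_request do |req|
      PageResponse.new(req.page < 3 ? 100 : 7)  # pretend page 3 comes back short
    end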
data/examples/twitter/old/scrape_twitter_trending.rb
@@ -0,0 +1,73 @@
+#!/usr/bin/env ruby
+$: << ENV['WUKONG_PATH']
+require File.dirname(__FILE__)+'/config/config_private'
+require 'rubygems'
+require 'trollop'
+require 'wukong'
+require 'monkeyshines'
+require 'wuclan/twitter/scrape' ; include Wuclan
+
+require 'monkeyshines/fetcher/http_fetcher'
+require 'monkeyshines/utils/filename_pattern'
+
+#
+# Command line options
+#
+opts = Trollop::options do
+  opt :dumpfile_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+  opt :dumpfile_pattern, "Pattern for dump file output",
+    :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+  opt :dumpfile_chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer,
+    :default => 60*60*24
+  opt :handle, "Handle to uniquely identify this scrape",
+    :default => 'com.twitter.search'
+  opt :min_resched_delay, "Don't run jobs more often than this (in seconds)",
+    :default => 60*1
+end
+Trollop::die :dumpfile_dir unless opts[:dumpfile_dir]
+
+# Queue of request jobs, with reschedule requests
+# opts[:beanstalk_tube] ||= opts[:handle].gsub(/\w+/,'_')
+request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay)) # , :beanstalk_tube
+# Scrape Store for completed requests
+dumpfile_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dumpfile_pattern], opts.slice(:handle, :dumpfile_dir))
+store = Monkeyshines::Store::ChunkedFlatFileStore.new dumpfile_pattern, opts[:dumpfile_chunk_time].to_i
+# Scrape requests by HTTP
+fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+# Log every 60 seconds
+periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+
+
+class TwitterTrendingJob < Struct.new(
+    :query_term,
+    :priority,
+    :period
+    )
+
+end
+
+
+# %w[
+#   http://search.twitter.com/trends/current.format , 60*60
+#   http://search.twitter.com/trends/daily.json?date=2009-03-19
+#   http://search.twitter.com/trends/weekly.json?date=2009-03-19
+# ]
+
+
+
+request_queue.each do |scrape_job|
+  # Run through all pages for this search term
+  scrape_job.each_request do |req|
+    # Make request
+    response = fetcher.get(req)
+    # save it if successful
+    store.save response if response
+    # log progress
+    periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+    # return it to the scrape_job for bookkeeping
+    response
+  end
+end
+request_queue.finish
+
+# Twitter::Scrape::Scrape_Job.hard_request_limit = 15
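TwitterTrendingJob never gets wired up in this file, but the commented-out %w[ ... ] block records the intent: each trends endpoint would become a job whose period field sets how often it is rescheduled. A hedged sketch of that pairing -- TrendingFeed is a stand-in Struct mirroring TwitterTrendingJob, the '.format' placeholder is rendered as .json, and only the hourly 60*60 period comes from the comment; the other periods are guesses:

    # Pair each trends endpoint from the comment above with a polling period.
    TrendingFeed = Struct.new(:query_term, :priority, :period)

    feeds = [
      TrendingFeed.new('http://search.twitter.com/trends/current.json', 1, 60*60),      # hourly (from comment)
      TrendingFeed.new('http://search.twitter.com/trends/daily.json',   1, 60*60*24),   # guessed
      TrendingFeed.new('http://search.twitter.com/trends/weekly.json',  1, 60*60*24*7), # guessed
    ]
    feeds.each{|feed| puts feed.to_a.join("\t") }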
data/examples/twitter/parse/parse_twitter_requests.rb
@@ -0,0 +1,81 @@
+#!/usr/bin/env ruby
+#$: << ENV['WUKONG_PATH']
+require 'rubygems'
+require 'wukong'
+require 'monkeyshines'
+
+require 'wuclan/twitter'
+# if you're anyone but original author this next require is useless but harmless.
+require 'wuclan/twitter/scrape/old_skool_request_classes'
+# un-namespace request classes.
+include Wuclan::Twitter::Scrape
+include Wuclan::Twitter::Model
+
+#
+#
+# Instantiate each incoming request.
+# Stream out the contained classes it generates.
+#
+#
+class TwitterRequestParser < Wukong::Streamer::StructStreamer
+
+  def process request, *args, &block
+    request.parse(*args) do |obj|
+      next if obj.is_a? BadRecord
+      yield obj.to_flat(false)
+    end
+  end
+end
+
+#
+# We want to record each individual state of the resource, with the last-seen of
+# its timestamps (if there are many). So if we saw
+#
+#   rsrc  id  screen_name  followers_count  friends_count  (... more)
+#   user  23  skidoo       47               61
+#   user  23  skidoo       48               62
+#   user  23  skidoo       48               62
+#   user  23  skidoo       52               62
+#   user  23  skidoo       52               63
+#
+#
+class TwitterRequestUniqer < Wukong::Streamer::UniqByLastReducer
+  include Wukong::Streamer::StructRecordizer
+
+  attr_accessor :uniquer_count
+
+  #
+  #
+  #
+  #
+  # for immutable objects we can just work off their ID.
+  #
+  # for mutable objects we want to record each unique state: all the fields
+  # apart from the scraped_at timestamp.
+  #
+  def get_key obj
+    case obj
+    when Tweet
+      obj.id
+    when AFollowsB, AFavoritesB, ARepliesB, AAtsignsB, AAtsignsBId, ARetweetsB, ARetweetsBId, TwitterUserId
+      obj.key
+    when TwitterUser, TwitterUserProfile, TwitterUserStyle, TwitterUserPartial
+      [obj.id] + obj.to_a[2..-1]
+    else
+      raise "Don't know how to extract key from #{obj.class}"
+    end
+  end
+
+  def start! *args
+    self.uniquer_count = 0
+    super *args
+  end
+
+  def accumulate obj
+    self.uniquer_count += 1
+    self.final_value = [self.uniquer_count, obj.to_flat].flatten
+  end
+end
+
+# This makes the script go.
+Wukong::Script.new(TwitterRequestParser, TwitterRequestUniqer).run
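The skidoo table in the comments captures what TwitterRequestUniqer is for: key on every field except the scraped_at timestamp, so identical states collapse into one record carrying the last sighting, while real changes (47 to 48 to 52 followers) all survive. A minimal sketch of that reduce, independent of Wukong, where the :key field plays the role of get_key above:

    # Keep one record per distinct state, holding the last-seen sighting of each.
    # Assumes input arrives grouped by key, as it does between Hadoop map and reduce.
    def uniq_by_last(records)
      out = []
      records.each do |rec|
        if out.last && out.last[:key] == rec[:key]
          out[-1] = rec   # same state seen again: keep the later sighting
        else
          out << rec      # state changed: start a new record
        end
      end
      out
    end

    rows = [
      { :key => [23, 'skidoo', 47, 61], :scraped_at => '20090301' },
      { :key => [23, 'skidoo', 48, 62], :scraped_at => '20090302' },
      { :key => [23, 'skidoo', 48, 62], :scraped_at => '20090303' },  # duplicate state
      { :key => [23, 'skidoo', 52, 62], :scraped_at => '20090304' },
    ]
    uniq_by_last(rows).map{|r| r[:scraped_at] }  # => ["20090301", "20090303", "20090304"]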
data/examples/twitter/parse/parse_twitter_search_requests.rb
@@ -0,0 +1,28 @@
+#!/usr/bin/env ruby
+#$: << ENV['WUKONG_PATH']
+require 'rubygems'
+require 'wukong'
+require 'monkeyshines'
+
+require 'wuclan/twitter'
+require 'wuclan/twitter/scrape/twitter_search_request'
+require 'wuclan/twitter/parse/twitter_search_parse'
+include Wuclan::Twitter::Scrape
+
+#
+#
+# Instantiate each incoming request.
+# Stream out the contained classes it generates.
+#
+#
+class TwitterRequestParser < Wukong::Streamer::StructStreamer
+  def process request, *args, &block
+    request.parse(*args) do |obj|
+      next if obj.is_a? BadRecord
+      yield obj.to_flat(false)
+    end
+  end
+end
+
+# This makes the script go.
+Wukong::Script.new(TwitterRequestParser, nil).run
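Passing nil as the reducer makes Wukong run this as a map-only job: the same TwitterRequestParser as the previous file, with no uniquing pass. The process contract is easy to see in miniature; everything below is a stand-in rather than the wuclan classes, with BadRecord marking unparseable entries that get skipped:

    # Stand-in for the StructStreamer flow: each request parses itself and
    # yields the objects it contains; process() skips bad records and re-emits.
    class BadRecord; end

    FakeRequest = Struct.new(:raw) do
      def parse
        raw.split('|').each do |chunk|
          yield(chunk.empty? ? BadRecord.new : chunk)
        end
      end
    end

    def process(request)
      request.parse do |obj|
        next if obj.is_a? BadRecord
        yield obj
      end
    end

    emitted = []
    process(FakeRequest.new('tweet_a||tweet_b')) {|obj| emitted << obj }
    emitted  # => ["tweet_a", "tweet_b"]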
data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb
@@ -0,0 +1,61 @@
+#!/usr/bin/env ruby
+require 'rubygems'
+require 'monkeyshines'
+require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
+
+#
+# * jobs stream from a flat file
+#
+# * each job generates one or several requests (ex: followers_ids,
+#   friends_ids, user_timeline, favorites). Paginated requests stop when results
+#   overlap the prev_max item, as tracked from a central store).
+#
+# * Each request is fetched with the standard HTTP fetcher.
+#
+# * Jobs are rescheduled based on the observed item rate
+#
+# * results are sent to a ChunkedFlatFileStore
+#
+
+#
+# Follow-on requests to make for each user
+# You can also specify these with --source-fetches on the command line
+#
+DEFAULT_SOURCE_FETCHES = [
+  :user,
+  # :followers_ids, :friends_ids,
+  :followers, :friends,
+  # :favorites
+]
+
+Monkeyshines::CMDLINE_OPTIONS << [:source_fetches, "Follow-on requests to make. Default '#{DEFAULT_SOURCE_FETCHES.join(',')}'", { :default => DEFAULT_SOURCE_FETCHES.join(',') }]
+Monkeyshines::CMDLINE_OPTIONS << [:source_skip, "Initial lines to skip", { :default => 1 }]
+
+# Setup
+WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
+Monkeyshines.load_global_options!
+Monkeyshines.load_cmdline_options!
+Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
+
+#
+# Don't spend all day on follow-on requests
+#
+{ TwitterFollowersRequest => 10,
+  TwitterFriendsRequest   => 10,
+  TwitterFavoritesRequest =>  4, }.each{|klass, limit| klass.hard_request_limit = limit }
+
+#
+# Set up scraper
+#
+scraper = Monkeyshines::Runner.new({
+  :log    => { :iters => 100, :dest => Monkeyshines::CONFIG[:handle] },
+  :source => { :type => TwitterRequestStream },
+  :dest   => { :type => :chunked_flat_file_store, :rootdir => WORK_DIR },
+  # :fetcher => { :type => TwitterFakeFetcher },
+  :sleep_time => 0,
+})
+
+#
+# Run scraper
+#
+scraper.run
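The hash that assigns hard_request_limit per class is the throttle behind "Don't spend all day on follow-on requests": each paginated request type carries a class-level cap on how many pages it will ever ask for. A minimal sketch of the pattern; the class and method names here are illustrative, not the wuclan API:

    # A class-level page cap, settable per request class as in the hash above.
    class PagedRequestSketch
      class << self
        attr_accessor :hard_request_limit
      end
      self.hard_request_limit = 10

      def initialize(user_id)
        @user_id = user_id
      end

      # Yield page numbers up to the cap; a caller can still break earlier.
      def each_page
        1.upto(self.class.hard_request_limit) {|page| yield page }
      end
    end

    PagedRequestSketch.hard_request_limit = 4  # cf. TwitterFavoritesRequest => 4 above
    pages = []
    PagedRequestSketch.new(23).each_page {|p| pages << p }
    pages  # => [1, 2, 3, 4]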