wuclan 0.2.0

Files changed (111)
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
data/examples/twitter/old/load_twitter_search_jobs.rb
@@ -0,0 +1,157 @@
+ #!/usr/bin/env ruby
+ $: << ENV['WUKONG_PATH']
+ require File.dirname(__FILE__)+'/config/config_private'
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ #
+ require 'wuclan/twitter/scrape' ; include Wuclan
+ require 'monkeyshines/utils/uri'
+ require 'monkeyshines/fetcher/http_fetcher'
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+   opt :items_per_job, "Desired item count per job", :default => 675
+   opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 30*1
+   opt :job_db, "Tokyo tyrant db host", :default => ':1978', :type => String
+   opt :log, "Log file name; leave blank to use STDERR", :type => String
+   # import from file
+   opt :from, "Location of scrape store to load from", :type => String
+   # output storage
+   opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+   opt :dest_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+   opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
+ end
+
+ module Wuclan
+   module Domains
+     module Twitter
+       module Scrape
+         TwitterSearchJob = Struct.new(
+           :query_term,
+           :priority,
+           :prev_items,
+           :prev_rate,
+           :prev_span_min,
+           :prev_span_max
+         )
+       end
+     end
+   end
+ end
+
+
+ # Queue of request import_jobs, with reschedule requests
+ beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+ request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+ # Scrape requests by HTTP
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+ # Log every 60 seconds
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+ # Persist scrape_job jobs in distributed DB
+ job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+ # Import
+ if opts[:from]
+   import_jobs = Monkeyshines::Store::FlatFileStore.new(opts[:from], :filemode => 'r')
+ end
+
+ #
+ # Keep one unique copy of each scrape_job. The most senior instance (the one
+ # with the highest prev_items) wins.
+ #
+ SCRAPES = { }
+ def add_scrape_job scrape_job
+   return if SCRAPES[scrape_job.query_term] &&
+     (SCRAPES[scrape_job.query_term].prev_items.to_i >= scrape_job.prev_items.to_i)
+   SCRAPES[scrape_job.query_term] = scrape_job
+ end
+
+ Monkeyshines::RequestStream::BeanstalkQueue.class_eval do
+   #
+   # An (extremely dangerous) routine to examine all the jobs in the queue--
+   # since I don't know another way we pull all of them out and then put all of
+   # them back in.
+   #
+   def scrub_all &block
+     job_queue.connect()
+     File.open("/tmp/qjobs-#{Time.now.strftime("%H%M%S")}.tsv", "w") do |dump|
+       loop do
+         # Kick a bunch of jobs across all connections
+         $stderr.puts job_queue_stats.inspect
+         kicked = job_queue.open_connections.map{|conxn| conxn.kick(20) }
+         kicked = kicked.inject(0){|sum, n| sum += n }
+         # For all the jobs we can get our hands on quickly,
+         while(qjob = reserve_job!(0.5)) do
+           # send it in for processing
+           scrape_job = scrape_job_from_qjob(qjob)
+           yield scrape_job
+           # last recourse in case something goes wrong.
+           dump << scrape_job.to_flat.join("\t")+"\n"
+           # and remove it from the pool
+           qjob.delete
+         end
+         # stop when there's no more qjobs
+         break if (job_queue_total_jobs == 0) && (!job_queue.peek_ready)
+       end
+     end
+   end
+ end
+
+ begin
+   #
+   # Catalog the jobs in the persistent store
+   #
+   job_store.each do |key, hsh|
+     scrape_job = Twitter::Scrape::TwitterSearchJob.from_hash hsh
+     periodic_log.periodically{ [scrape_job] }
+     add_scrape_job scrape_job
+   end
+
+   #
+   # Catalog the jobs in the transient queue
+   #
+   request_queue.scrub_all do |scrape_job|
+     periodic_log.periodically{ [scrape_job] }
+     add_scrape_job scrape_job
+   end
+
+   #
+   # Import jobs from a static file
+   #
+   import_jobs.each_as(Twitter::Scrape::TwitterSearchJob) do |scrape_job|
+     next if (scrape_job.query_term =~ /^#/) || (scrape_job.query_term.blank?)
+     periodic_log.periodically{ [scrape_job] }
+     add_scrape_job scrape_job
+     # SCRAPES[scrape_job.query_term].priority = scrape_job.priority unless scrape_job.priority.blank?
+     # SCRAPES[scrape_job.query_term] = scrape_job
+   end
+ rescue Exception => e
+   warn e
+ ensure
+   #
+   # Serialize them to disk
+   #
+   sorted = SCRAPES.sort_by{|term,scrape_job| [scrape_job.priority||65536, -(scrape_job.prev_rate||1440), term] }
+   sorted.each do |term, scrape_job|
+     # scrape_job.prev_rate = [scrape_job.prev_rate.to_f, 0.01].max if scrape_job.prev_rate
+     # scrape_job.prev_items = 1000
+     puts scrape_job.to_flat[1..-1].join("\t")
+   end
+ end
+
+ request_queue.min_resched_delay = 10
+ sorted.each do |term, scrape_job|
+   #
+   # Persist the updated job to the job_store db, so that we can restart queue easily
+   job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+
+   #
+   # re-enqueue the job. If it's run before, accelerate its next call; if never
+   # run before schedule for immediate run.
+   delay = (scrape_job.prev_rate ? request_queue.delay_to_next_scrape(scrape_job)/3 : 0)
+   request_queue.save scrape_job, scrape_job.priority, delay
+ end
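
The TwitterSearchJob struct defined above travels in three forms: as an ordered TSV row (to_flat, dumped to /tmp and to stdout), as a hash in the Tokyo Tyrant job_store (to_hash / from_hash), and as a payload on the beanstalk queue. The serialization methods come from Wukong's struct models, which are not shown in this diff, so the following is only a plain-Ruby sketch of the assumed round trip; to_flat and from_hash here are stand-ins, not the real implementations.

TwitterSearchJob = Struct.new(:query_term, :priority, :prev_items, :prev_rate,
                              :prev_span_min, :prev_span_max) do
  # Assumed behavior: emit members in declaration order, ready for join("\t")
  def to_flat
    to_a.map{|v| v.to_s }
  end
  # Assumed behavior: rebuild a job from the hash kept in the tyrant job_store
  def self.from_hash hsh
    new(*members.map{|m| hsh[m.to_s] })
  end
end

job = TwitterSearchJob.new('hadoop', 65536, 120, 4.2, nil, nil)
puts job.to_flat.join("\t")    # one TSV row per job
job2 = TwitterSearchJob.from_hash('query_term' => 'hadoop', 'prev_items' => '120')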
data/examples/twitter/old/scrape_twitter_api.rb
@@ -0,0 +1,104 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'monkeyshines/runner/base'
+ require 'pathname'
+
+ #
+ require 'wuclan/twitter'
+ # un-namespace request classes.
+ include Wuclan::Twitter::Scrape
+
+ WORK_DIR = Pathname.new(File.dirname(__FILE__)+"/rawd").realpath.to_s
+
+ # ===========================================================================
+ #
+ # scrape_shorturls.rb --
+ #
+ # To scrape from a list of shortened urls:
+ #
+ #   ./shorturl_random_scrape.rb --from-type=FlatFileStore --from=request_urls.tsv
+ #
+ # To do a random scrape:
+ #
+ #   ./shorturl_random_scrape.rb --from-type=RandomUrlStream --base-url=tinyurl.com
+ #     --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
+ #
+ #
+ opts = Trollop::options do
+   opt :log, "Log to file instead of STDERR"
+   # input from file
+   opt :from, "URI for scrape store to load from", :type => String
+   opt :skip, "Initial lines to skip", :type => Integer
+   # output storage
+   opt :cache_loc, "URI for cache server", :type => String, :default => ':10022'
+   opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+   opt :dest_dir, "Filename base to store output. default ./work/ripd", :default => WORK_DIR+'/ripd'
+   opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:date/:handle+:timestamp-:pid.tsv"
+ end
+ opts[:handle] ||= 'com.twitter'
+ scrape_config = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
+ opts.merge! scrape_config
+
+ # ******************** Log ********************
+ if (opts[:log])
+   opts[:log] = (WORK_DIR+'/log/'+File.basename(opts[:from],'.tsv'))
+   $stdout = $stderr = File.open(opts[:log]+"-console.log", "a")
+ end
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 1, :time => 30)
+
+ #
+ # ******************** Load from store ********************
+ #
+ class TwitterRequestStream < Monkeyshines::RequestStream::Base
+   def each *args
+     request_store.each(*args) do |twitter_user_id, *_|
+       yield TwitterUserRequest.new(twitter_user_id, 1, "" )
+     end
+   end
+ end
+ src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
+ src_store.skip!(opts[:skip].to_i) if opts[:skip]
+ request_stream = TwitterRequestStream.new TwitterUserRequest, src_store
+
+ #
+ # ******************** Store output ********************
+ #
+ # Track visited URLs with key-value database
+ #
+ dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(opts[:cache_loc])
+
+ #
+ # Store the data into flat files
+ #
+ dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], :handle => opts[:handle], :dest_dir => opts[:dest_dir])
+ dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern, opts[:chunk_time].to_i, opts)
+
+ #
+ # Conditional store uses the key-value DB to boss around the flat files --
+ # requests are only made (and thus data is only output) if the url is missing
+ # from the key-value store.
+ #
+ dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
+
+ #
+ # ******************** Fetcher ********************
+ #
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new opts[:twitter_api]
+
+
+ #
+ # ******************** Do this thing ********************
+ #
+ Log.info "Beginning scrape itself"
+ request_stream.each do |req|
+   # conditional store only calls fetcher if url key is missing.
+   result = dest_store.set(req.url) do
+     response = fetcher.get(req) # do the url fetch
+     next unless response.healthy? # don't store bad fetches
+     [response.scraped_at, response] # timestamp into cache, result into flat file
+   end
+   periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
+ end
+ dest_store.close
+ fetcher.close
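
The interesting wiring in this script is Monkeyshines::Store::ConditionalStore: the Tokyo Tyrant cache decides whether a URL still needs fetching, and the flat-file store only ever sees successful responses. Its internals are not part of this diff, so here is a minimal stand-in (a hypothetical SketchConditionalStore using a plain Hash and Array in place of tyrant and chunked files) that illustrates only the control flow the comments describe: the block runs on a cache miss, its first element is memoized in the cache, and its second element is appended to the output store.

class SketchConditionalStore
  attr_reader :misses
  def initialize cache, files
    @cache, @files, @misses = cache, files, 0
  end
  def set key
    return :hit if @cache.key?(key)   # seen before: skip the fetch entirely
    cache_val, file_val = yield       # miss: run the (expensive) block
    return nil if cache_val.nil?      # block bailed out (e.g. bad fetch)
    @misses += 1
    @cache[key] = cache_val           # e.g. the scraped_at timestamp
    @files << file_val                # e.g. the full response
    :miss
  end
end

store = SketchConditionalStore.new({}, [])
store.set('http://example.com/a'){ [Time.now.to_i, 'response body'] }   # fetches
store.set('http://example.com/a'){ raise 'never reached: cache hit' }   # skipped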
data/examples/twitter/old/scrape_twitter_search.rb
@@ -0,0 +1,57 @@
+ #!/usr/bin/env ruby
+ $: << ENV['WUKONG_PATH']
+ require File.dirname(__FILE__)+'/config/config_private'
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'wuclan/twitter/scrape' ; include Wuclan
+
+ require 'monkeyshines/fetcher/http_fetcher'
+ require 'monkeyshines/utils/filename_pattern'
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+   opt :items_per_job, "Desired item count per job", :default => 1000
+   opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 20*1
+   opt :job_db, "Tokyo tyrant db host", :default => ':1978', :type => String
+   opt :log, "Log file name; leave blank to use STDERR", :type => String
+   # output storage
+   opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+   opt :dest_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+   opt :dest_pattern, "Pattern for dump file output", :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+ end
+ Trollop::die :dest_dir unless opts[:dest_dir]
+
+ # Queue of request scrape_jobs, with reschedule requests
+ beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+ request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+ # Scrape Store for completed requests
+ dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], opts.slice(:handle, :dest_dir))
+ dest = Monkeyshines::Store::ChunkedFlatFileStore.new dest_pattern, opts[:chunk_time].to_i
+ # Scrape requests by HTTP
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+ # Log every 60 seconds
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+ # Persist scrape_job jobs in distributed DB
+ job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+ request_queue.each do |scrape_job|
+   # Run through all pages for this search term
+   scrape_job.each_request do |req|
+     # Fetch request
+     response = fetcher.get(req)
+     # save it if successful
+     dest.save response if response
+     # log progress
+     periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+     # return it to the scrape_job for bookkeeping
+     response
+   end
+   # Persist the updated job to the scrape_jobs db, so that we can restart queue easily
+   job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+   # sleep 0.5
+ end
+ request_queue.close
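
Several of these scripts hand a --dest_pattern such as ":dest_dir/:date/:handle+:timestamp-:pid.tsv" (or the library's DEFAULT_PATTERN_STR) to Monkeyshines::Utils::FilenamePattern along with :handle and :dest_dir values. The actual substitution rules live in monkeyshines and are not part of this diff; the sketch below, with a hypothetical expand_pattern helper and made-up token values, only illustrates how such a pattern plausibly expands into a chunked dump-file path.

def expand_pattern pattern, tokens
  # replace each :token with its value; leave unknown tokens untouched
  pattern.gsub(/:(\w+)/){ (tokens[$1.to_sym] || ":#{$1}").to_s }
end

tokens = {
  :dest_dir => '/data/ripd',
  :handle   => 'com.twitter.search',
  :date     => '20090715',
  :datetime => '20090715120000',
  :pid      => 4242,
}
puts expand_pattern(":dest_dir/:date/:handle+:datetime-:pid.tsv", tokens)
# => /data/ripd/20090715/com.twitter.search+20090715120000-4242.tsv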
data/examples/twitter/old/scrape_twitter_trending.rb
@@ -0,0 +1,73 @@
+ #!/usr/bin/env ruby
+ $: << ENV['WUKONG_PATH']
+ require File.dirname(__FILE__)+'/config/config_private'
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'wuclan/twitter/scrape' ; include Wuclan
+
+ require 'monkeyshines/fetcher/http_fetcher'
+ require 'monkeyshines/utils/filename_pattern'
+
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :dumpfile_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+   opt :dumpfile_pattern, "Pattern for dump file output",
+     :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+   opt :dumpfile_chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer,
+     :default => 60*60*24
+   opt :handle, "Handle to uniquely identify this scrape",
+     :default => 'com.twitter.search'
+   opt :min_resched_delay, "Don't run jobs more often than this (in seconds)",
+     :default => 60*1
+ end
+ Trollop::die :dumpfile_dir unless opts[:dumpfile_dir]
+
+ # Queue of request jobs, with reschedule requests
+ # opts[:beanstalk_tube] ||= opts[:handle].gsub(/\w+/,'_')
+ request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay)) # , :beanstalk_tube
+ # Scrape Store for completed requests
+ dumpfile_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dumpfile_pattern], opts.slice(:handle, :dumpfile_dir))
+ store = Monkeyshines::Store::ChunkedFlatFileStore.new dumpfile_pattern, opts[:dumpfile_chunk_time].to_i
+ # Scrape requests by HTTP
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+ # Log every 60 seconds
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+
+
+ class TwitterTrendingJob < Struct.new(
+     :query_term,
+     :priority,
+     :period
+   )
+
+ end
+
+
+ # %w[
+ #   http://search.twitter.com/trends/current.format , 60*60
+ #   http://search.twitter.com/trends/daily.json?date=2009-03-19
+ #   http://search.twitter.com/trends/weekly.json?date=2009-03-19
+ # ]
+
+
+
+ request_queue.each do |scrape_job|
+   # Run through all pages for this search term
+   scrape_job.each_request do |req|
+     # Make request
+     response = fetcher.get(req)
+     # save it if successful
+     store.save response if response
+     # log progress
+     periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+     # return it to the scrape_job for bookkeeping
+     response
+   end
+ end
+ request_queue.finish
+
+ # Twitter::Scrape::Scrape_Job.hard_request_limit = 15
data/examples/twitter/parse/parse_twitter_requests.rb
@@ -0,0 +1,81 @@
+ #!/usr/bin/env ruby
+ #$: << ENV['WUKONG_PATH']
+ require 'rubygems'
+ require 'wukong'
+ require 'monkeyshines'
+
+ require 'wuclan/twitter'
+ # if you're anyone but original author this next require is useless but harmless.
+ require 'wuclan/twitter/scrape/old_skool_request_classes'
+ # un-namespace request classes.
+ include Wuclan::Twitter::Scrape
+ include Wuclan::Twitter::Model
+
+ #
+ #
+ # Instantiate each incoming request.
+ # Stream out the contained classes it generates.
+ #
+ #
+ class TwitterRequestParser < Wukong::Streamer::StructStreamer
+
+   def process request, *args, &block
+     request.parse(*args) do |obj|
+       next if obj.is_a? BadRecord
+       yield obj.to_flat(false)
+     end
+   end
+ end
+
+ #
+ # We want to record each individual state of the resource, with the last-seen of
+ # its timestamps (if there are many). So if we saw
+ #
+ #   rsrc  id  screen_name  followers_count  friends_count  (... more)
+ #   user  23  skidoo       47               61
+ #   user  23  skidoo       48               62
+ #   user  23  skidoo       48               62
+ #   user  23  skidoo       52               62
+ #   user  23  skidoo       52               63
+ #
+ #
+ class TwitterRequestUniqer < Wukong::Streamer::UniqByLastReducer
+   include Wukong::Streamer::StructRecordizer
+
+   attr_accessor :uniquer_count
+
+   #
+   #
+   #
+   #
+   # for immutable objects we can just work off their ID.
+   #
+   # for mutable objects we want to record each unique state: all the fields
+   # apart from the scraped_at timestamp.
+   #
+   def get_key obj
+     case obj
+     when Tweet
+       obj.id
+     when AFollowsB, AFavoritesB, ARepliesB, AAtsignsB, AAtsignsBId, ARetweetsB, ARetweetsBId, TwitterUserId
+       obj.key
+     when TwitterUser, TwitterUserProfile, TwitterUserStyle, TwitterUserPartial
+       [obj.id] + obj.to_a[2..-1]
+     else
+       raise "Don't know how to extract key from #{obj.class}"
+     end
+   end
+
+   def start! *args
+     self.uniquer_count = 0
+     super *args
+   end
+
+   def accumulate obj
+     self.uniquer_count += 1
+     self.final_value = [self.uniquer_count, obj.to_flat].flatten
+   end
+ end
+
+ # This makes the script go.
+ Wukong::Script.new(TwitterRequestParser, TwitterRequestUniqer).run
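
TwitterRequestUniqer above runs as a Wukong (Hadoop streaming) reducer, so the grouping and keep-the-last behavior comes from UniqByLastReducer rather than from this file. The idea itself is simple enough to state in plain Ruby, which is all the sketch below does: rows describing a mutable resource are keyed on every field except the trailing scraped_at timestamp, so each distinct state is kept exactly once, carrying its last-seen sighting.

rows = [
  # [rsrc, id, screen_name, followers_count, friends_count, scraped_at]
  ['user', 23, 'skidoo', 47, 61, '20090701'],
  ['user', 23, 'skidoo', 48, 62, '20090702'],
  ['user', 23, 'skidoo', 48, 62, '20090703'],   # same state, later sighting
  ['user', 23, 'skidoo', 52, 62, '20090704'],
]

last_by_state = {}
rows.each do |row|
  key = row[0..-2]            # everything except scraped_at
  last_by_state[key] = row    # later rows overwrite earlier identical states
end

last_by_state.each_value{|row| puts row.join("\t") }
# three distinct states survive; the 48/62 state keeps its 20090703 sighting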
data/examples/twitter/parse/parse_twitter_search_requests.rb
@@ -0,0 +1,28 @@
+ #!/usr/bin/env ruby
+ #$: << ENV['WUKONG_PATH']
+ require 'rubygems'
+ require 'wukong'
+ require 'monkeyshines'
+
+ require 'wuclan/twitter'
+ require 'wuclan/twitter/scrape/twitter_search_request'
+ require 'wuclan/twitter/parse/twitter_search_parse'
+ include Wuclan::Twitter::Scrape
+
+ #
+ #
+ # Instantiate each incoming request.
+ # Stream out the contained classes it generates.
+ #
+ #
+ class TwitterRequestParser < Wukong::Streamer::StructStreamer
+   def process request, *args, &block
+     request.parse(*args) do |obj|
+       next if obj.is_a? BadRecord
+       yield obj.to_flat(false)
+     end
+   end
+ end
+
+ # This makes the script go.
+ Wukong::Script.new(TwitterRequestParser, nil).run
data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb
@@ -0,0 +1,61 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
+
+ #
+ # * jobs stream from a flat file
+ #
+ # * each job generates one or several requests (ex: followers_ids,
+ #   friends_ids, user_timeline, favorites). Paginated requests stop when results
+ #   overlap the prev_max item, as tracked from a central store).
+ #
+ # * Each request is fetched with the standard HTTP fetcher.
+ #
+ # * Jobs are rescheduled based on the observed item rate
+ #
+ # * results are sent to a ChunkedFlatFileStore
+ #
+
+ #
+ # Follow-on requests to make for each user
+ # You can also specify these with --source-fetches on the command line
+ #
+ DEFAULT_SOURCE_FETCHES = [
+   :user,
+   # :followers_ids, :friends_ids,
+   :followers, :friends,
+   # :favorites
+ ]
+
+ Monkeyshines::CMDLINE_OPTIONS << [:source_fetches, "Follow-on requests to make. Default '#{DEFAULT_SOURCE_FETCHES.join(',')}'", { :default => DEFAULT_SOURCE_FETCHES.join(',') }]
+ Monkeyshines::CMDLINE_OPTIONS << [:source_skip, "Initial lines to skip", { :default => 1 }]
+
+ # Setup
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
+ Monkeyshines.load_global_options!
+ Monkeyshines.load_cmdline_options!
+ Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
+
+ #
+ # Don't spend all day on follow-on requests
+ #
+ { TwitterFollowersRequest => 10,
+   TwitterFriendsRequest => 10,
+   TwitterFavoritesRequest => 4, }.each{|klass, limit| klass.hard_request_limit = limit }
+
+ #
+ # Set up scraper
+ #
+ scraper = Monkeyshines::Runner.new({
+   :log => { :iters => 100, :dest => Monkeyshines::CONFIG[:handle] },
+   :source => { :type => TwitterRequestStream },
+   :dest => { :type => :chunked_flat_file_store, :rootdir => WORK_DIR },
+   # :fetcher => { :type => TwitterFakeFetcher },
+   :sleep_time => 0,
+ })
+
+ #
+ # Run scraper
+ #
+ scraper.run
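
The header comment says follow-on requests are paginated until results overlap the prev_max item, and the hash above caps each request class with hard_request_limit so one prolific user cannot eat the whole run. The real cutoff logic lives in the wuclan request classes, not in this file; the sketch below, with a hypothetical pages_to_fetch helper, merely illustrates those two stop conditions.

def pages_to_fetch page_min_ids, prev_max_id, hard_request_limit
  fetched = []
  page_min_ids.each_with_index do |min_id, page|
    break if page >= hard_request_limit   # per-class cap, as set above
    fetched << page
    break if min_id <= prev_max_id        # page overlaps items we already have
  end
  fetched
end

# pages whose oldest item ids are 900, 700, 500, ...; we had previously seen up to id 650
puts pages_to_fetch([900, 700, 500, 300], 650, 10).inspect   # => [0, 1, 2]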
data/examples/twitter/scrape_twitter_api/seed.tsv
@@ -0,0 +1,4 @@
+ infochimps
+ AustinOnRails
+ hadoop
+ mrflip
data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh
@@ -0,0 +1,2 @@
+ script_dir=`dirname $0`
+ ttserver -port 10022 $script_dir/rawd/distdb/twitter_api-`hostname`.tch >> $script_dir/rawd/log/twitter_api-ttserver-`datename`.log 2>&1