wuclan 0.2.0

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (111)
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
data/examples/twitter/old/load_twitter_search_jobs.rb
@@ -0,0 +1,157 @@
+ #!/usr/bin/env ruby
+ $: << ENV['WUKONG_PATH']
+ require File.dirname(__FILE__)+'/config/config_private'
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ #
+ require 'wuclan/twitter/scrape' ; include Wuclan
+ require 'monkeyshines/utils/uri'
+ require 'monkeyshines/fetcher/http_fetcher'
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+   opt :items_per_job, "Desired item count per job", :default => 675
+   opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 30*1
+   opt :job_db, "Tokyo tyrant db host", :default => ':1978', :type => String
+   opt :log, "Log file name; leave blank to use STDERR", :type => String
+   # import from file
+   opt :from, "Location of scrape store to load from", :type => String
+   # output storage
+   opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+   opt :dest_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+   opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:datetime-:pid.tsv"
+ end
+
+ module Wuclan
+   module Domains
+     module Twitter
+       module Scrape
+         TwitterSearchJob = Struct.new(
+           :query_term,
+           :priority,
+           :prev_items,
+           :prev_rate,
+           :prev_span_min,
+           :prev_span_max
+         )
+       end
+     end
+   end
+ end
+
+
+ # Queue of request import_jobs, with reschedule requests
+ beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+ request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+ # Scrape requests by HTTP
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+ # Log every 60 seconds
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+ # Persist scrape_jobs in the distributed DB
+ job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+ # Import
+ if opts[:from]
+   import_jobs = Monkeyshines::Store::FlatFileStore.new(opts[:from], :filemode => 'r')
+ end
+
+ #
+ # Keep one unique copy of each scrape_job. The most senior instance (the one
+ # with the highest prev_items) wins.
+ #
+ SCRAPES = { }
+ def add_scrape_job scrape_job
+   return if SCRAPES[scrape_job.query_term] &&
+     (SCRAPES[scrape_job.query_term].prev_items.to_i >= scrape_job.prev_items.to_i)
+   SCRAPES[scrape_job.query_term] = scrape_job
+ end
+
+ Monkeyshines::RequestStream::BeanstalkQueue.class_eval do
+   #
+   # An (extremely dangerous) routine to examine all the jobs in the queue --
+   # since I don't know another way, we pull all of them out and then put all of
+   # them back in.
+   #
+   def scrub_all &block
+     job_queue.connect()
+     File.open("/tmp/qjobs-#{Time.now.strftime("%H%M%S")}.tsv", "w") do |dump|
+       loop do
+         # Kick a bunch of jobs across all connections
+         $stderr.puts job_queue_stats.inspect
+         kicked = job_queue.open_connections.map{|conxn| conxn.kick(20) }
+         kicked = kicked.inject(0){|sum, n| sum += n }
+         # For all the jobs we can get our hands on quickly,
+         while (qjob = reserve_job!(0.5)) do
+           # send it in for processing
+           scrape_job = scrape_job_from_qjob(qjob)
+           yield scrape_job
+           # last recourse in case something goes wrong.
+           dump << scrape_job.to_flat.join("\t")+"\n"
+           # and remove it from the pool
+           qjob.delete
+         end
+         # stop when there are no more qjobs
+         break if (job_queue_total_jobs == 0) && (!job_queue.peek_ready)
+       end
+     end
+   end
+ end
+
+ begin
+   #
+   # Catalog the jobs in the persistent store
+   #
+   job_store.each do |key, hsh|
+     scrape_job = Twitter::Scrape::TwitterSearchJob.from_hash hsh
+     periodic_log.periodically{ [scrape_job] }
+     add_scrape_job scrape_job
+   end
+
+   #
+   # Catalog the jobs in the transient queue
+   #
+   request_queue.scrub_all do |scrape_job|
+     periodic_log.periodically{ [scrape_job] }
+     add_scrape_job scrape_job
+   end
+
+   #
+   # Import jobs from a static file
+   #
+   import_jobs.each_as(Twitter::Scrape::TwitterSearchJob) do |scrape_job|
+     next if (scrape_job.query_term =~ /^#/) || (scrape_job.query_term.blank?)
+     periodic_log.periodically{ [scrape_job] }
+     add_scrape_job scrape_job
+     # SCRAPES[scrape_job.query_term].priority = scrape_job.priority unless scrape_job.priority.blank?
+     # SCRAPES[scrape_job.query_term] = scrape_job
+   end
+ rescue Exception => e
+   warn e
+ ensure
+   #
+   # Serialize them to disk
+   #
+   sorted = SCRAPES.sort_by{|term, scrape_job| [scrape_job.priority||65536, -(scrape_job.prev_rate||1440), term] }
+   sorted.each do |term, scrape_job|
+     # scrape_job.prev_rate = [scrape_job.prev_rate.to_f, 0.01].max if scrape_job.prev_rate
+     # scrape_job.prev_items = 1000
+     puts scrape_job.to_flat[1..-1].join("\t")
+   end
+ end
+
+ request_queue.min_resched_delay = 10
+ sorted.each do |term, scrape_job|
+   #
+   # Persist the updated job to the job_store db, so that we can restart the queue easily
+   job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+
+   #
+   # Re-enqueue the job. If it has run before, accelerate its next call; if it has
+   # never run, schedule it for an immediate run.
+   delay = (scrape_job.prev_rate ? request_queue.delay_to_next_scrape(scrape_job)/3 : 0)
+   request_queue.save scrape_job, scrape_job.priority, delay
+ end
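
A minimal, self-contained sketch of this loader's core logic (the add_scrape_job seniority rule and the final priority sort), runnable with the Ruby standard library alone. The trimmed-down Struct and the sample jobs are invented for illustration:

# "Most senior copy wins": for a repeated query_term, keep the instance
# that has seen the most items.
TwitterSearchJob = Struct.new(:query_term, :priority, :prev_items, :prev_rate)

SCRAPES = {}
def add_scrape_job scrape_job
  seen = SCRAPES[scrape_job.query_term]
  return if seen && (seen.prev_items.to_i >= scrape_job.prev_items.to_i)
  SCRAPES[scrape_job.query_term] = scrape_job
end

add_scrape_job TwitterSearchJob.new('hadoop', 100, 50, 12.0)
add_scrape_job TwitterSearchJob.new('hadoop', 100, 30, 12.0) # ignored: junior copy
add_scrape_job TwitterSearchJob.new('rails',  200, 80,  3.0)

# Lowest priority number first; among equals, highest prev_rate first.
sorted = SCRAPES.sort_by{|term, job| [job.priority || 65536, -(job.prev_rate || 1440), term] }
sorted.each{|term, job| puts job.to_a.join("\t") } # the 'hadoop' row (prev_items 50) prints first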
data/examples/twitter/old/scrape_twitter_api.rb
@@ -0,0 +1,104 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'monkeyshines/runner/base'
+ require 'pathname'
+
+ #
+ require 'wuclan/twitter'
+ # un-namespace request classes.
+ include Wuclan::Twitter::Scrape
+
+ WORK_DIR = Pathname.new(File.dirname(__FILE__)+"/rawd").realpath.to_s
+
+ # ===========================================================================
+ #
+ # scrape_shorturls.rb --
+ #
+ # To scrape from a list of shortened urls:
+ #
+ #   ./shorturl_random_scrape.rb --from-type=FlatFileStore --from=request_urls.tsv
+ #
+ # To do a random scrape:
+ #
+ #   ./shorturl_random_scrape.rb --from-type=RandomUrlStream --base-url=tinyurl.com
+ #     --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
+ #
+ #
+ opts = Trollop::options do
+   opt :log, "Log to file instead of STDERR"
+   # input from file
+   opt :from, "URI for scrape store to load from", :type => String
+   opt :skip, "Initial lines to skip", :type => Integer
+   # output storage
+   opt :cache_loc, "URI for cache server", :type => String, :default => ':10022'
+   opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+   opt :dest_dir, "Filename base to store output. default ./work/ripd", :default => WORK_DIR+'/ripd'
+   opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:date/:handle+:timestamp-:pid.tsv"
+ end
+ opts[:handle] ||= 'com.twitter'
+ scrape_config = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
+ opts.merge! scrape_config
+
+ # ******************** Log ********************
+ if (opts[:log])
+   opts[:log] = (WORK_DIR+'/log/'+File.basename(opts[:from],'.tsv'))
+   $stdout = $stderr = File.open(opts[:log]+"-console.log", "a")
+ end
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 1, :time => 30)
+
+ #
+ # ******************** Load from store ********************
+ #
+ class TwitterRequestStream < Monkeyshines::RequestStream::Base
+   def each *args
+     request_store.each(*args) do |twitter_user_id, *_|
+       yield TwitterUserRequest.new(twitter_user_id, 1, "")
+     end
+   end
+ end
+ src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
+ src_store.skip!(opts[:skip].to_i) if opts[:skip]
+ request_stream = TwitterRequestStream.new TwitterUserRequest, src_store
+
+ #
+ # ******************** Store output ********************
+ #
+ # Track visited URLs with a key-value database
+ #
+ dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(opts[:cache_loc])
+
+ #
+ # Store the data into flat files
+ #
+ dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], :handle => opts[:handle], :dest_dir => opts[:dest_dir])
+ dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern, opts[:chunk_time].to_i, opts)
+
+ #
+ # Conditional store uses the key-value DB to boss around the flat files --
+ # requests are only made (and thus data is only output) if the url is missing
+ # from the key-value store.
+ #
+ dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
+
+ #
+ # ******************** Fetcher ********************
+ #
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new opts[:twitter_api]
+
+
+ #
+ # ******************** Do this thing ********************
+ #
+ Log.info "Beginning scrape itself"
+ request_stream.each do |req|
+   # the conditional store only calls the fetcher if the url key is missing.
+   result = dest_store.set(req.url) do
+     response = fetcher.get(req)      # do the url fetch
+     next unless response.healthy?   # don't store bad fetches
+     [response.scraped_at, response] # timestamp into cache, result into flat file
+   end
+   periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
+ end
+ dest_store.close
+ fetcher.close
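
The ConditionalStore wiring above is the piece worth internalizing: the key-value cache decides whether a fetch happens at all, and the flat file only ever sees successful fetches. A toy stand-in, written against the assumed semantics of Monkeyshines::Store::ConditionalStore rather than its actual code:

# Toy conditional store: the block runs only on a cache miss. When the block
# bails out (the `next unless response.healthy?` above), it yields nil and
# nothing is cached or written, so the URL gets retried on a later run.
class ToyConditionalStore
  attr_reader :misses
  def initialize cache, flat_file
    @cache, @flat_file, @misses = cache, flat_file, 0
  end
  def set key
    return nil if @cache.key?(key) # already fetched: skip entirely
    @misses += 1
    cache_val, file_val = yield    # the fetch happens inside the block
    return nil unless cache_val    # bad fetch: store nothing
    @cache[key] = cache_val
    @flat_file << file_val
    cache_val
  end
end

store = ToyConditionalStore.new({}, [])
store.set('http://example.com/a'){ [Time.now.to_i, 'page body'] } # miss: block runs
store.set('http://example.com/a'){ raise 'never called' }         # hit: block skipped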
data/examples/twitter/old/scrape_twitter_search.rb
@@ -0,0 +1,57 @@
+ #!/usr/bin/env ruby
+ $: << ENV['WUKONG_PATH']
+ require File.dirname(__FILE__)+'/config/config_private'
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'wuclan/twitter/scrape' ; include Wuclan
+
+ require 'monkeyshines/fetcher/http_fetcher'
+ require 'monkeyshines/utils/filename_pattern'
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :handle, "Handle to uniquely identify this scrape", :default => 'com.twitter.search'
+   opt :items_per_job, "Desired item count per job", :default => 1000
+   opt :min_resched_delay, "Don't run jobs more often than this (in seconds)", :default => 20*1
+   opt :job_db, "Tokyo tyrant db host", :default => ':1978', :type => String
+   opt :log, "Log file name; leave blank to use STDERR", :type => String
+   # output storage
+   opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
+   opt :dest_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+   opt :dest_pattern, "Pattern for dump file output", :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+ end
+ Trollop::die :dest_dir unless opts[:dest_dir]
+
+ # Queue of request scrape_jobs, with reschedule requests
+ beanstalk_tube = opts[:handle].gsub(/\w+/,'_')
+ request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay))
+ # Scrape Store for completed requests
+ dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], opts.slice(:handle, :dest_dir))
+ dest = Monkeyshines::Store::ChunkedFlatFileStore.new dest_pattern, opts[:chunk_time].to_i
+ # Scrape requests by HTTP
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+ # Log every 60 seconds
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+ # Persist scrape_jobs in the distributed DB
+ job_store = Monkeyshines::Store::TyrantTdbKeyStore.new(opts[:job_db])
+
+ request_queue.each do |scrape_job|
+   # Run through all pages for this search term
+   scrape_job.each_request do |req|
+     # Fetch request
+     response = fetcher.get(req)
+     # save it if successful
+     dest.save response if response
+     # log progress
+     periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+     # return it to the scrape_job for bookkeeping
+     response
+   end
+   # Persist the updated job to the scrape_jobs db, so that we can restart the queue easily
+   job_store.save "#{scrape_job.class}-#{scrape_job.query_term}", scrape_job.to_hash.compact
+   # sleep 0.5
+ end
+ request_queue.close
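
Everything interesting in this script hides behind scrape_job.each_request: the job yields successive page requests and watches the block's return value (the response) to decide when to stop paginating. Here is a sketch of that contract with invented stopping rules; the real logic lives in wuclan's request classes:

# Hypothetical paginated job illustrating the each_request contract above.
class ToySearchJob < Struct.new(:query_term)
  HARD_REQUEST_LIMIT = 15 # cf. hard_request_limit on the real request classes

  def each_request
    1.upto(HARD_REQUEST_LIMIT) do |page|
      response = yield "http://search.example.com/search.json?q=#{query_term}&page=#{page}"
      break if response.nil?              # failed fetch: give up on this job
      break if response[:num_items] < 100 # short page: no more results
    end
  end
end

ToySearchJob.new('hadoop').each_request do |url|
  { :num_items => rand(120) } # stand-in for fetcher.get(req)
end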
data/examples/twitter/old/scrape_twitter_trending.rb
@@ -0,0 +1,73 @@
+ #!/usr/bin/env ruby
+ $: << ENV['WUKONG_PATH']
+ require File.dirname(__FILE__)+'/config/config_private'
+ require 'rubygems'
+ require 'trollop'
+ require 'wukong'
+ require 'monkeyshines'
+ require 'wuclan/twitter/scrape' ; include Wuclan
+
+ require 'monkeyshines/fetcher/http_fetcher'
+ require 'monkeyshines/utils/filename_pattern'
+
+ #
+ # Command line options
+ #
+ opts = Trollop::options do
+   opt :dumpfile_dir, "Filename base to store output. e.g. --dump_basename=/data/ripd", :type => String
+   opt :dumpfile_pattern, "Pattern for dump file output",
+     :default => Monkeyshines::Utils::FilenamePattern::DEFAULT_PATTERN_STR
+   opt :dumpfile_chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer,
+     :default => 60*60*24
+   opt :handle, "Handle to uniquely identify this scrape",
+     :default => 'com.twitter.search'
+   opt :min_resched_delay, "Don't run jobs more often than this (in seconds)",
+     :default => 60*1
+ end
+ Trollop::die :dumpfile_dir unless opts[:dumpfile_dir]
+
+ # Queue of request jobs, with reschedule requests
+ # opts[:beanstalk_tube] ||= opts[:handle].gsub(/\w+/,'_')
+ request_queue = Monkeyshines::RequestStream::BeanstalkQueue.new(nil, Twitter::Scrape::TwitterSearchJob, opts[:items_per_job], opts.slice(:min_resched_delay)) # , :beanstalk_tube
+ # Scrape Store for completed requests
+ dumpfile_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dumpfile_pattern], opts.slice(:handle, :dumpfile_dir))
+ store = Monkeyshines::Store::ChunkedFlatFileStore.new dumpfile_pattern, opts[:dumpfile_chunk_time].to_i
+ # Scrape requests by HTTP
+ fetcher = Monkeyshines::Fetcher::HttpFetcher.new Monkeyshines::CONFIG[:twitter]
+ # Log every 60 seconds
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:time => 60)
+
+
+ class TwitterTrendingJob < Struct.new(
+     :query_term,
+     :priority,
+     :period
+     )
+
+ end
+
+
+ # %w[
+ #   http://search.twitter.com/trends/current.format , 60*60
+ #   http://search.twitter.com/trends/daily.json?date=2009-03-19
+ #   http://search.twitter.com/trends/weekly.json?date=2009-03-19
+ # ]
+
+
+
+ request_queue.each do |scrape_job|
+   # Run through all pages for this search term
+   scrape_job.each_request do |req|
+     # Make request
+     response = fetcher.get(req)
+     # save it if successful
+     store.save response if response
+     # log progress
+     periodic_log.periodically{ ["%7d"%response.num_items, response.url] }
+     # return it to the scrape_job for bookkeeping
+     response
+   end
+ end
+ request_queue.finish
+
+ # Twitter::Scrape::Scrape_Job.hard_request_limit = 15
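
Like the two scripts before it, this one names its dump files from a :token pattern (:dest_dir/:date/:handle+:timestamp-:pid.tsv and friends). A guess at how such a pattern expands; Monkeyshines' real FilenamePattern may differ in its token set and defaults:

# Toy expansion of dump-file pattern tokens like those used above.
def expand_pattern pattern, tokens = {}
  now = Time.now.utc
  builtin = {
    :date      => now.strftime('%Y%m%d'),
    :timestamp => now.strftime('%Y%m%d%H%M%S'),
    :pid       => Process.pid,
  }
  pattern.gsub(/:(\w+)/){ (tokens[$1.to_sym] || builtin[$1.to_sym]).to_s }
end

puts expand_pattern(":dumpfile_dir/:date/:handle+:timestamp-:pid.tsv",
  :dumpfile_dir => '/data/ripd', :handle => 'com.twitter.search')
# e.g. "/data/ripd/20090319/com.twitter.search+20090319153000-4242.tsv"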
data/examples/twitter/parse/parse_twitter_requests.rb
@@ -0,0 +1,81 @@
+ #!/usr/bin/env ruby
+ #$: << ENV['WUKONG_PATH']
+ require 'rubygems'
+ require 'wukong'
+ require 'monkeyshines'
+
+ require 'wuclan/twitter'
+ # if you're anyone but the original author, this next require is useless but harmless.
+ require 'wuclan/twitter/scrape/old_skool_request_classes'
+ # un-namespace request classes.
+ include Wuclan::Twitter::Scrape
+ include Wuclan::Twitter::Model
+
+ #
+ #
+ # Instantiate each incoming request.
+ # Stream out the contained classes it generates.
+ #
+ #
+ class TwitterRequestParser < Wukong::Streamer::StructStreamer
+
+   def process request, *args, &block
+     request.parse(*args) do |obj|
+       next if obj.is_a? BadRecord
+       yield obj.to_flat(false)
+     end
+   end
+ end
+
+ #
+ # We want to record each individual state of the resource, with the last-seen of
+ # its timestamps (if there are many). So if we saw
+ #
+ #   rsrc  id  screen_name  followers_count  friends_count  (... more)
+ #   user  23  skidoo       47               61
+ #   user  23  skidoo       48               62
+ #   user  23  skidoo       48               62
+ #   user  23  skidoo       52               62
+ #   user  23  skidoo       52               63
+ #
+ #
+ class TwitterRequestUniqer < Wukong::Streamer::UniqByLastReducer
+   include Wukong::Streamer::StructRecordizer
+
+   attr_accessor :uniquer_count
+
+   #
+   #
+   #
+   #
+   # for immutable objects we can just work off their ID.
+   #
+   # for mutable objects we want to record each unique state: all the fields
+   # apart from the scraped_at timestamp.
+   #
+   def get_key obj
+     case obj
+     when Tweet
+       obj.id
+     when AFollowsB, AFavoritesB, ARepliesB, AAtsignsB, AAtsignsBId, ARetweetsB, ARetweetsBId, TwitterUserId
+       obj.key
+     when TwitterUser, TwitterUserProfile, TwitterUserStyle, TwitterUserPartial
+       [obj.id] + obj.to_a[2..-1]
+     else
+       raise "Don't know how to extract key from #{obj.class}"
+     end
+   end
+
+   def start! *args
+     self.uniquer_count = 0
+     super *args
+   end
+
+   def accumulate obj
+     self.uniquer_count += 1
+     self.final_value = [self.uniquer_count, obj.to_flat].flatten
+   end
+ end
+
+ # This makes the script go.
+ Wukong::Script.new(TwitterRequestParser, TwitterRequestUniqer).run
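
The uniquer's trick is in get_key: immutable records key on their id, while mutable records key on their full state minus the scraped_at timestamp, so every distinct state survives and exact duplicates collapse to the last-seen copy with a count. A self-contained sketch of that reduce step, written against the assumed semantics of Wukong's UniqByLastReducer:

# Toy uniq-by-last reducer: records sharing a key collapse to the
# last-seen instance, tagged with how many copies were seen.
class ToyUniqByLast
  def initialize &key_block
    @key_block = key_block
    @groups = {} # key => [count, last_obj]
  end

  def accumulate obj
    key = @key_block.call(obj)
    count, _ = @groups[key]
    @groups[key] = [(count || 0) + 1, obj]
  end

  def each_final &block
    @groups.each_value(&block)
  end
end

User = Struct.new(:id, :screen_name, :followers, :scraped_at)
# Mutable record: key on everything except the scrape timestamp.
uniquer = ToyUniqByLast.new{|u| [u.id, u.screen_name, u.followers] }
uniquer.accumulate User.new(23, 'skidoo', 47, '20090301')
uniquer.accumulate User.new(23, 'skidoo', 47, '20090302') # same state, later scrape
uniquer.accumulate User.new(23, 'skidoo', 48, '20090303') # new state
uniquer.each_final{|count, user| puts [count, *user.to_a].join("\t") }
# Two rows come out: state 47 (count 2, last copy kept) and state 48 (count 1).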
data/examples/twitter/parse/parse_twitter_search_requests.rb
@@ -0,0 +1,28 @@
+ #!/usr/bin/env ruby
+ #$: << ENV['WUKONG_PATH']
+ require 'rubygems'
+ require 'wukong'
+ require 'monkeyshines'
+
+ require 'wuclan/twitter'
+ require 'wuclan/twitter/scrape/twitter_search_request'
+ require 'wuclan/twitter/parse/twitter_search_parse'
+ include Wuclan::Twitter::Scrape
+
+ #
+ #
+ # Instantiate each incoming request.
+ # Stream out the contained classes it generates.
+ #
+ #
+ class TwitterRequestParser < Wukong::Streamer::StructStreamer
+   def process request, *args, &block
+     request.parse(*args) do |obj|
+       next if obj.is_a? BadRecord
+       yield obj.to_flat(false)
+     end
+   end
+ end
+
+ # This makes the script go.
+ Wukong::Script.new(TwitterRequestParser, nil).run
data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb
@@ -0,0 +1,61 @@
+ #!/usr/bin/env ruby
+ require 'rubygems'
+ require 'monkeyshines'
+ require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
+
+ #
+ # * jobs stream from a flat file
+ #
+ # * each job generates one or several requests (ex: followers_ids,
+ #   friends_ids, user_timeline, favorites). Paginated requests stop when results
+ #   overlap the prev_max item (as tracked from a central store).
+ #
+ # * each request is fetched with the standard HTTP fetcher.
+ #
+ # * jobs are rescheduled based on the observed item rate
+ #
+ # * results are sent to a ChunkedFlatFileStore
+ #
+
+ #
+ # Follow-on requests to make for each user
+ # You can also specify these with --source-fetches on the command line
+ #
+ DEFAULT_SOURCE_FETCHES = [
+   :user,
+   # :followers_ids, :friends_ids,
+   :followers, :friends,
+   # :favorites
+ ]
+
+ Monkeyshines::CMDLINE_OPTIONS << [:source_fetches, "Follow-on requests to make. Default '#{DEFAULT_SOURCE_FETCHES.join(',')}'", { :default => DEFAULT_SOURCE_FETCHES.join(',') }]
+ Monkeyshines::CMDLINE_OPTIONS << [:source_skip, "Initial lines to skip", { :default => 1 }]
+
+ # Setup
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
+ Monkeyshines.load_global_options!
+ Monkeyshines.load_cmdline_options!
+ Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
+
+ #
+ # Don't spend all day on follow-on requests
+ #
+ { TwitterFollowersRequest => 10,
+   TwitterFriendsRequest   => 10,
+   TwitterFavoritesRequest => 4, }.each{|klass, limit| klass.hard_request_limit = limit }
+
+ #
+ # Set up scraper
+ #
+ scraper = Monkeyshines::Runner.new({
+   :log        => { :iters => 100, :dest => Monkeyshines::CONFIG[:handle] },
+   :source     => { :type => TwitterRequestStream },
+   :dest       => { :type => :chunked_flat_file_store, :rootdir => WORK_DIR },
+   # :fetcher  => { :type => TwitterFakeFetcher },
+   :sleep_time => 0,
+ })
+
+ #
+ # Run scraper
+ #
+ scraper.run
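
The hash-driven block above ({ TwitterFollowersRequest => 10, ... }) caps how many paginated calls each request class may make. A sketch of the class-level attribute that setup implies; the real wuclan request classes are more involved:

# Hypothetical per-class page cap, configured the same hash-driven way.
class ToyPaginatedRequest
  class << self
    attr_accessor :hard_request_limit
  end

  def each_page
    1.upto(self.class.hard_request_limit){|page| yield page }
  end
end

class ToyFollowersRequest < ToyPaginatedRequest ; end
class ToyFavoritesRequest < ToyPaginatedRequest ; end

{ ToyFollowersRequest => 10,
  ToyFavoritesRequest => 4 }.each{|klass, limit| klass.hard_request_limit = limit }

ToyFavoritesRequest.new.each_page{|n| puts "fetch page #{n}" } # stops after page 4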
data/examples/twitter/scrape_twitter_api/seed.tsv
@@ -0,0 +1,4 @@
+ infochimps
+ AustinOnRails
+ hadoop
+ mrflip
data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh
@@ -0,0 +1,2 @@
+ script_dir=`dirname $0`
+ ttserver -port 10022 $script_dir/rawd/distdb/twitter_api-`hostname`.tch >> $script_dir/rawd/log/twitter_api-ttserver-`datename`.log 2>&1