wuclan 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (111) hide show
  1. data/LICENSE.textile +20 -0
  2. data/README.textile +28 -0
  3. data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
  4. data/examples/analyze/strong_links/main.rb +51 -0
  5. data/examples/analyze/word_count/dump_schema.rb +13 -0
  6. data/examples/analyze/word_count/freq_user.rb +31 -0
  7. data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
  8. data/examples/analyze/word_count/word_count.pig +43 -0
  9. data/examples/analyze/word_count/word_count.rb +34 -0
  10. data/examples/lastfm/scrape/load_lastfm.rb +31 -0
  11. data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
  12. data/examples/lastfm/scrape/seed.tsv +147 -0
  13. data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
  14. data/examples/twitter/old/scrape_twitter_api.rb +104 -0
  15. data/examples/twitter/old/scrape_twitter_search.rb +57 -0
  16. data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
  17. data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
  18. data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
  19. data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
  20. data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
  21. data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
  22. data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
  23. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
  24. data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
  25. data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
  26. data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
  27. data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
  28. data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
  29. data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
  30. data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
  31. data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
  32. data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
  33. data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
  34. data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
  35. data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
  36. data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
  37. data/lib/old/twitter_api.rb +88 -0
  38. data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
  39. data/lib/wuclan/delicious/delicious_models.rb +26 -0
  40. data/lib/wuclan/delicious/delicious_request.rb +65 -0
  41. data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
  42. data/lib/wuclan/friendster.rb +7 -0
  43. data/lib/wuclan/lastfm/model/base.rb +49 -0
  44. data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
  45. data/lib/wuclan/lastfm/scrape/base.rb +195 -0
  46. data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
  47. data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
  48. data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
  49. data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
  50. data/lib/wuclan/lastfm/scrape.rb +12 -0
  51. data/lib/wuclan/lastfm.rb +7 -0
  52. data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
  53. data/lib/wuclan/metrics/user_metrics.rb +443 -0
  54. data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
  55. data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
  56. data/lib/wuclan/metrics.rb +0 -0
  57. data/lib/wuclan/myspace.rb +21 -0
  58. data/lib/wuclan/open_social/model/base.rb +0 -0
  59. data/lib/wuclan/open_social/scrape/base.rb +111 -0
  60. data/lib/wuclan/open_social/scrape_request.rb +6 -0
  61. data/lib/wuclan/open_social.rb +0 -0
  62. data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
  63. data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
  64. data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
  65. data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
  66. data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
  67. data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
  68. data/lib/wuclan/twitter/api_response_examples.textile +300 -0
  69. data/lib/wuclan/twitter/model/base.rb +72 -0
  70. data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
  71. data/lib/wuclan/twitter/model/relationship.rb +176 -0
  72. data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
  73. data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
  74. data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
  75. data/lib/wuclan/twitter/model/text_element.rb +38 -0
  76. data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
  77. data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
  78. data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
  79. data/lib/wuclan/twitter/model/tweet.rb +74 -0
  80. data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
  81. data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
  82. data/lib/wuclan/twitter/model.rb +21 -0
  83. data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
  84. data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
  85. data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
  86. data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
  87. data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
  88. data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
  89. data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
  90. data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
  91. data/lib/wuclan/twitter/scrape/base.rb +97 -0
  92. data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
  93. data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
  94. data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
  95. data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
  96. data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
  97. data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
  98. data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
  99. data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
  100. data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
  101. data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
  102. data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
  103. data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
  104. data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
  105. data/lib/wuclan/twitter/scrape.rb +27 -0
  106. data/lib/wuclan/twitter.rb +7 -0
  107. data/lib/wuclan.rb +1 -0
  108. data/spec/spec_helper.rb +9 -0
  109. data/spec/wuclan_spec.rb +7 -0
  110. data/wuclan.gemspec +184 -0
  111. metadata +219 -0
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env ruby
2
+ Daemonz::ProcTable.ps_emulation.find_all{|proc| proc.cmdline =~ %r{ruby \S+/test_spewer\.rb} }.each do |proc|
3
+ Process.kill 15, proc.pid.to_i
4
+ end
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'sys/proctable'
4
+ include Sys
5
+
6
+ # Sys::ProcTable.ps{ |proc|
7
+ # puts proc.to_a.join("\t")
8
+ # }
9
+
10
+ process_exe = File.dirname(__FILE__)+'/test_spewer.rb'
11
+ require 'ps_emulation'
12
+
13
+ # p [Daemonz::ProcTable.ps_emulation, Sys::ProcTable.ps.first]
14
+
15
+ p process_exe
16
+ Daemonz::ProcTable.ps_emulation.each do |proc|
17
+ next unless (proc.cmdline =~ %r{ruby \S+/test_spewer\.rb})
18
+ puts proc.to_a.join("\t")
19
+ end
@@ -0,0 +1,111 @@
1
+ #
2
+ # This code taken from the daemonz gem
3
+ # http://github.com/costan/daemonz
4
+ # from the file lib/daemonz/process.rb:
5
+ # http://github.com/costan/daemonz/raw/master/lib/daemonz/process.rb
6
+ #
7
+ module Daemonz
8
+
9
+ # Mocks the sys-proctable gem using ps.
10
+ #
11
+ # This is useful even if sys-proctable is available, because it may fail for
12
+ # random reasons.
13
+ module ProcTable
14
+ #
15
+ # Pure-ruby equivalent to Sys::Proctable's ProcInfo struct. This
16
+ # is somewhat wasteful, as we only fill in a few of the fields.
17
+ #
18
+ class FakeProcInfo < Struct.new(
19
+ :pid, :ppid, :pgid, :ruid, :rgid, :comm, :state, :pctcpu,
20
+ :oncpu, :tnum, :tdev, :wmesg, :rtime, :priority, :usrpri, :nice,
21
+ :cmdline,:starttime, :maxrss, :ixrss, :idrss, :isrss, :minflt, :majflt,
22
+ :nswap, :inblock, :oublock, :msgsnd, :msgrcv, :nsignals, :nvcsw,
23
+ :nivcsw, :utime, :stime )
24
+ end
25
+
26
+ #
27
+ # Emulate the Sys::ProcTable.ps method by shelling out to
28
+ #
29
+ def self.ps_emulation
30
+ retval = []
31
+ ps_output = `ps ax`
32
+ ps_output.each_line do |pline|
33
+ pdata = pline.chomp.split(nil, 5)
34
+ pinfo = FakeProcInfo.new
35
+ pinfo.pid = pdata[0].strip!
36
+ pinfo.cmdline = pdata[4].strip!
37
+ retval << pinfo
38
+ end
39
+ retval.shift
40
+ return retval
41
+ end
42
+
43
+
44
+ # ===========================================================================
45
+ #
46
+ # Know when to hold em and when to fold em:
47
+ #
48
+ # We want to use sys/proctable if it is available, succeeds, and has a
49
+ # complete implementation on our platform.
50
+ #
51
+ # * sys/proctable is broken on OSX (possibly just 10.5 Leopard): fake it.
52
+ # * if require 'sys/proctable' fails, then fake it.
53
+ # * Otherwise, as a first resort use sys/proctable;
54
+ # * but if the call fails, fake it.
55
+ #
56
+ def self.require_sys_proctable_or_die_trying!
57
+ begin
58
+ require 'sys/proctable'
59
+ require 'sys/uname'
60
+ return false if Sys::Uname.sysname =~ /Darwin/ # sys-proctable is broken on OSX (see note above): report it as unavailable
61
+ return true
62
+ rescue Exception # deliberately broad: ANY failure to load (not just LoadError) means we must fall back to the ps emulation
63
+ return false
64
+ end
65
+ end
66
+
67
+ # ===========================================================================
68
+ #
69
+ # Define either the fake ProcTable.ps or a defensive-driver ProcTable.ps
70
+ #
71
+ unless self.require_sys_proctable_or_die_trying!
72
+ # The accelerated version is not available, use the slow version all the time.
73
+ def self.ps
74
+ self.ps_emulation
75
+ end
76
+ else
77
+ #
78
+ # If sys/proctable exists
79
+ #
80
+ def self.ps
81
+ # We don't use ps_emulation all the time because sys-proctable is
82
+ # faster. We only pay the performance penalty when sys-proctable fails.
83
+ begin ; Sys::ProcTable.ps
84
+ rescue Exception ; self.ps_emulation ; end
85
+ end
86
+ end
87
+
88
+ # ===========================================================================
89
+ #
90
+ # returns information about a process or all the running processes
91
+ #
92
+ def self.process_info(pid = nil)
93
+ info = Hash.new
94
+ Daemonz::ProcTable.ps.each do |process|
95
+ item = { :cmdline => process.cmdline, :pid => process.pid.to_s }
96
+ if pid.nil?
97
+ info[process.pid.to_s] = item # no pid given: accumulate every process, keyed by pid string
98
+ else
99
+ return item if item[:pid].to_s == pid.to_s # pid given: short-circuit with the first matching process
100
+ end
101
+ end
102
+
103
+ if pid.nil?
104
+ return info
105
+ else
106
+ return nil # pid was given but no matching process was seen
107
+ end
108
+ end
109
+
110
+ end
111
+ end
@@ -0,0 +1,110 @@
1
+ $: << File.dirname(__FILE__)+'/../../lib'
2
+ $: << File.dirname(__FILE__)+'/../../../wukong/lib'
3
+ $: << File.dirname(__FILE__)+'/../../../monkeyshines/lib'
4
+ require 'edamame'
5
+ require 'edamame/monitoring'
6
+ require 'ps_emulation'
7
+
8
+ class ScraperGod < GodProcess
9
+ DEFAULT_OPTIONS = {
10
+ :max_file_age => 21600, # 6 hours
11
+ :max_file_size => 1000000, # ~1MB: deliberately small for this test spewer; the production hosebird config uses 120000000, about 10% less than a 128MB hadoop block
12
+ :max_file_size_interval => 10.seconds,
13
+ :process_exe => File.dirname(__FILE__)+'/test_spewer.rb',
14
+ }
15
+ def self.default_options() super.deep_merge(ScraperGod::DEFAULT_OPTIONS) ; end
16
+ def self.site_options() super.deep_merge(global_site_options[:scraper_god]||{}) ; end
17
+ def self.kind
18
+ :scraper
19
+ end
20
+ def handle
21
+ self.class.kind.to_s + '_' + (options[:handle] || File.basename(options[:process_exe])).to_s
22
+ end
23
+
24
+ def mkdirs!
25
+ super()
26
+ FileUtils.mkdir_p options[:data_dir]
27
+ p options[:process_exe]
28
+ end
29
+
30
+ def setup_restart watcher
31
+ super(watcher)
32
+ watcher.restart_if do |restart|
33
+ restart.condition(:file_size_of_oldest_datestamped) do |c|
34
+ c.data_dir = options[:data_dir]
35
+ c.max_size = options[:max_file_size]
36
+ c.interval = options[:max_file_size_interval] if options[:max_file_size_interval]
37
+ end
38
+ end
39
+ end
40
+
41
+ def start_command
42
+ [
43
+ options[:process_exe],
44
+ "--handle=#{handle}",
45
+ ].flatten.compact.join(" ")
46
+ end
47
+ def restart_command() nil ; end
48
+ def stop_command() nil ; end
49
+ end
50
+
51
+ module God
52
+ module Conditions
53
+
54
+ #
55
+ # Poll condition that fires once the file at +path+ has grown to at least +max_size+ bytes.
56
+ #
57
+ class FileSize < PollCondition
58
+ attr_accessor :path, :max_size
59
+
60
+ def initialize
61
+ super
62
+ self.path = nil
63
+ self.max_size = nil
64
+ end
65
+
66
+ def valid?
67
+ valid = true
68
+ valid &= complain("Attribute 'path' must be specified", self) if self.path.nil?
69
+ valid &= complain("Attribute 'max_size' must be specified", self) if self.max_size.nil?
70
+ valid
71
+ end
72
+
73
+ def test
74
+ return false if self.path.blank? || (! File.exists?(self.path))
75
+ File.size(self.path) >= self.max_size
76
+ end
77
+ end
78
+
79
+ #
80
+ # Look for the last-datestamped file in the last-datestamped directory
81
+ #
82
+ class FileSizeOfOldestDatestamped < FileSize
83
+ attr_accessor :data_dir
84
+
85
+ def initialize
86
+ super
87
+ self.data_dir = nil
88
+ end
89
+
90
+ def valid?
91
+ valid = super()
92
+ valid &= complain("Attribute 'data_dir' must be specified", self) if self.data_dir.nil?
93
+ valid
94
+ end
95
+
96
+ def path
97
+ last_dir = Dir[data_dir+'/[0-9]*'].
98
+ find_all{|dir| File.directory?(dir) && (File.basename(dir) =~ /^\d+$/)}.
99
+ sort.last
100
+ Dir["#{last_dir}/*"].sort.last || ''
101
+ end
102
+
103
+ end
104
+
105
+ end
106
+ end
107
+
108
+ WORK_DIR = File.dirname(__FILE__)+'/work'
109
+
110
+ ScraperGod.create :data_dir => WORK_DIR
@@ -0,0 +1,20 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'edamame'
4
+ require 'monkeyshines'
5
+ require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
6
+ # Setup
7
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
8
+ Monkeyshines.load_global_options!
9
+ Monkeyshines.load_cmdline_options!
10
+ # Monkeyshines::CONFIG[:handle] = :test
11
+
12
+ # set up a chunked store -- will save timestamped files
13
+ # in the form :rootdir/:date/:handle+:timestamp-:pid.tsv",
14
+ dest = Monkeyshines::Store::ChunkedFlatFileStore.new(:rootdir => WORK_DIR)
15
+
16
+ # make those files fill up at about 40k/sec
17
+ loop do
18
+ dest << ( Time.now.utc.to_s + " " + "*"*4064)
19
+ sleep 0.5
20
+ end
@@ -0,0 +1,10 @@
1
+ :process_groups:
2
+ :twitter_hosebird:
3
+ :type: :scraper
4
+ :max_file_age: 21600 # 6 hours
5
+ :max_file_size: 120000000 # About 10% less than 128MB, so it should mostly fill one or two blocks on a hadoop file system
6
+ :hosebird_level: spritzer # spritzer, gardenhose, etc.
7
+ :user_pass_file: ...
8
+ :data_dir: /data/ripd/com.tw/com.twitter.stream
9
+ :log_dir: /data/log/com.tw/com.twitter.stream
10
+ :filename_pattern: ":date/stream.twitter.com+:datetime-:pid-:hostname.json"
@@ -0,0 +1,38 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'edamame'
4
+ require 'monkeyshines'
5
+ require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
6
+ require 'monkeyshines/utils/filename_pattern'
7
+ # Setup
8
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
9
+ Monkeyshines.load_global_options!
10
+ Monkeyshines.load_cmdline_options!
11
+ Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
12
+ include Edamame::Scheduling
13
+
14
+ #
15
+ # You can also specify these with --source-fetches on the command line
16
+ #
17
+ class TwitterScraper < Monkeyshines::Runner
18
+ def self.define_cmdline_options &block
19
+ super(&block)
20
+ yield(:dest_filename, "URI for scrape store to dump into", :type => String)
21
+ end
22
+ end
23
+
24
+ Monkeyshines::CONFIG[:dest][:filename] ||= Monkeyshines::Utils::FilenamePattern.new(":dest_dir/seed/jobs-:handle-:hostname-:timestamp.tsv", :dest_dir => WORK_DIR ).to_s
25
+ default_tube = Monkeyshines::CONFIG[:handle].to_s.gsub(/\W+/,'').gsub(/_/,'-')
26
+
27
+ source = Edamame::PersistentQueue.new( :tube => default_tube,
28
+ :queue => { :uris => ['localhost:11220'], },
29
+ :store => { :uri => ':11221', }
30
+ )
31
+ dest = Monkeyshines::Store::FlatFileStore.new :filename => Monkeyshines::CONFIG[:dest][:filename], :filemode => 'w'
32
+ Log.info "Sucking from tube '#{default_tube}' on s#{source.options[:store][:uri]}/q#{source.options[:queue][:uris]}, dumping into #{dest.filename}"
33
+ source.each do |job|
34
+ # dest << ( [Monkeyshines.url_decode(job.obj[:key])] + job.values_of(:priority, :prev_rate, :prev_items, :prev_span_min, :prev_span_max) )
35
+ # last_run = Time.parse(job.last_run)
36
+ last_run = job.last_run ? job.last_run.strftime("%Y%m%d%H%M%S") : nil
37
+ dest << ( [Monkeyshines.url_decode(job.obj[:key])] + job.values_of(:priority, :prev_items_rate, :prev_max, :prev_items, :delay) + [last_run] )
38
+ end
@@ -0,0 +1,63 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'edamame'
4
+ require 'monkeyshines'
5
+ require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
6
+ # Setup
7
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
8
+ Monkeyshines.load_global_options!
9
+ Monkeyshines.load_cmdline_options!
10
+ Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
11
+
12
+ include Edamame::Scheduling
13
+
14
+ #
15
+ # You can also specify these with --source-fetches on the command line
16
+ #
17
+ class TwitterScraper < Monkeyshines::Runner
18
+ def self.define_cmdline_options &block
19
+ super(&block)
20
+ yield(:source_filename, "URI for scrape store to load from", :type => String, :required => true)
21
+ end
22
+ end
23
+
24
+ default_tube = Monkeyshines::CONFIG[:handle].to_s.gsub(/\W+/,'').gsub(/_/,'-')
25
+ p default_tube
26
+
27
+ DEFAULT_PRIORITY = 65536
28
+ DEFAULT_TTR = nil
29
+ DEFAULT_RESERVE_TIMEOUT = 15
30
+ IMMEDIATELY = 0
31
+ source = Monkeyshines::Store::FlatFileStore.new :filename => Monkeyshines::CONFIG[:source][:filename]
32
+ dest = Edamame::PersistentQueue.new( :tube => default_tube,
33
+ :queue => { :uris => ['localhost:11220'], },
34
+ :store => { :uri => ':11221', }
35
+ )
36
+ source.each do |query_term, priority, prev_items_rate, prev_max, prev_items, delay, last_run|
37
+ # |query_term, priority, prev_rate, prev_items, prev_span_min, prev_span_max|
38
+ query_term.strip!
39
+ query_term = Monkeyshines.url_encode(query_term)
40
+ priority = priority.to_i
41
+ priority = DEFAULT_PRIORITY if (priority == 0 )
42
+ prev_items_rate = prev_items_rate.to_f
43
+ prev_items_rate = nil if (prev_items_rate < 1e-6)
44
+ prev_max = prev_max.to_i
45
+ prev_items = prev_items.to_i
46
+ delay = delay.to_f
47
+ last_run = last_run ? DateTime.parse(last_run) : nil
48
+
49
+ twitter_search = { :type => 'TwitterSearchRequest', :key => query_term }
50
+
51
+ job = Edamame::Job.new(default_tube, priority, nil, 1,
52
+ Recurring.new(delay, prev_max, prev_items, prev_items_rate),
53
+ twitter_search
54
+ )
55
+ dest.put job, job.priority, IMMEDIATELY
56
+ end
57
+
58
+
59
+ # loop do
60
+ # job = dest.reserve(DEFAULT_RESERVE_TIMEOUT) or break
61
+ # p [job, job.priority, job.scheduling, job.obj]
62
+ # # dest.delete job
63
+ # end
@@ -0,0 +1,44 @@
1
+ #!/usr/bin/env ruby
2
+ require 'rubygems'
3
+ require 'edamame'
4
+ require 'monkeyshines'
5
+ require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
6
+ # Setup
7
+ WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
8
+ Monkeyshines.load_global_options!
9
+ Monkeyshines.load_cmdline_options!
10
+ Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
11
+
12
+ #
13
+ # * jobs stream from an edamame job queue.
14
+ # * each job generates 1 to 15 paginated requests, stopping when a response
15
+ # overlaps the prev_max item.
16
+ # * Each request is fetched with the standard HTTP fetcher.
17
+ # * jobs are rescheduled based on the observed item rate
18
+ # * results are sent to a ChunkedFlatFileStore
19
+ #
20
+
21
+ #
22
+ # Run scraper
23
+ #
24
+ loop do
25
+ begin
26
+ #
27
+ # Create scraper
28
+ #
29
+ scraper = Monkeyshines::Runner.new({
30
+ :log => { :iters => 600, :time => 150, :dest => nil }, # Monkeyshines::CONFIG[:handle]
31
+ :source => { :type => TwitterSearchRequestStream, :queue_request_timeout => (10 * 60),
32
+ :queue => { :uris => ['localhost:11220'], },
33
+ :store => { :uri => ':11221', }, },
34
+ :dest => { :type => :chunked_flat_file_store, :rootdir => WORK_DIR, :filemode => 'a' },
35
+ # :dest => { :type => :flat_file_store, :filename => WORK_DIR+"/test_output.tsv" },
36
+ # :fetcher => { :type => TwitterSearchFakeFetcher },
37
+ :sleep_time => 1 ,
38
+ })
39
+ Log.info "Starting a run!"
40
+ scraper.run
41
+ rescue Beanstalk::NotFoundError => e
42
+ warn e
43
+ end
44
+ end
@@ -0,0 +1,25 @@
1
+ $: << File.dirname(__FILE__)+'/../../../../edamame/lib'
2
+ require 'edamame/monitoring'
3
+ WORK_DIR = File.dirname(__FILE__)+'/work'
4
+
5
+ #
6
+ # For debugging:
7
+ #
8
+ # sudo god -c this_file.god -D
9
+ #
10
+ # (for production, use the etc/initc.d script in this directory)
11
+ #
12
+ # TODO: define an EdamameDirector that lets us name these collections.
13
+ #
14
+ THE_FAITHFUL = [
15
+ # twitter_search
16
+ [BeanstalkdGod, { :port => 11240, :max_mem_usage => 100.megabytes, }],
17
+ [TyrantGod, { :port => 11241, :db_dirname => WORK_DIR, :db_name => "twitter_search-queue.tct" }],
18
+ #
19
+ # [TyrantGod, { :port => 11249, :db_dirname => WORK_DIR, :db_name => "twitter_search-flat.tct" }],
20
+ ]
21
+
22
+ THE_FAITHFUL.each do |klass, config|
23
+ proc = klass.create(config.merge :flapping_notify => 'default')
24
+ proc.mkdirs!
25
+ end
@@ -0,0 +1,88 @@
1
+ module TwitterApi
2
+ #
3
+ # The URI for a given resource
4
+ #
5
+ def gen_url
6
+ case context
7
+ when :user, :followers, :friends, :favorites, :timeline
8
+ "http://twitter.com/#{resource_path}/#{identifier}.json?page=#{page}"
9
+ when :followers_ids, :friends_ids
10
+ "http://twitter.com/#{resource_path}/#{identifier}.json"
11
+ when :user_timeline
12
+ "http://twitter.com/#{resource_path}/#{identifier}.json?page=#{page}&count=200"
13
+ # when :public_timeline
14
+ # when :search
15
+ # "http://search.twitter.com/search.json?q=#{query}"
16
+ else
17
+ raise "Don't know how to retrieve #{context} yet"
18
+ end
19
+ end
20
+
21
+ # Regular expression to grok resource from uri
22
+ GROK_URI_RE = %r{http://twitter.com/(\w+/\w+)/(\w+)\.json\?page=(\d+)}
23
+
24
+ # Context <=> resource mapping
25
+ #
26
+ # aka. repairing the non-REST uri's
27
+ RESOURCE_PATH_FROM_CONTEXT = {
28
+ :user => 'users/show',
29
+ :followers_ids => 'followers/ids',
30
+ :friends_ids => 'friends/ids',
31
+ :followers => 'statuses/followers',
32
+ :friends => 'statuses/friends',
33
+ :favorites => 'favorites',
34
+ :timeline => 'statuses/user_timeline',
35
+ :user_timeline => 'statuses/user_timeline',
36
+ :public_timeline => 'statuses/public_timeline'
37
+ }
38
+ # Get url resource for context
39
+ def resource_path
40
+ RESOURCE_PATH_FROM_CONTEXT[context.to_sym]
41
+ end
42
+
43
+ def self.pages_from_count per_page, count, max=nil
44
+ num = [ (count.to_f / per_page.to_f).ceil, 0 ].max # pages needed to cover count items, floored at 0
45
+ [num, max].compact.min # clamp to max pages when given; compact lets max default to nil (no cap)
46
+ end
47
+ def self.pages context, thing
48
+ case context
49
+ when :favorites then pages_from_count( 20, thing.favourites_count, 20)
50
+ when :friends then pages_from_count(100, thing.friends_count, 10)
51
+ when :followers then pages_from_count(100, thing.followers_count, 10)
52
+ when :followers_ids then thing.followers_count == 0 ? 0 : 1
53
+ when :friends_ids then thing.friends_count == 0 ? 0 : 1
54
+ when :user then 1
55
+ when :public_timeline then 1
56
+ when :user_timeline then pages_from_count(200, thing.statuses_count, 20)
57
+ when :search then pages_from_count(100, 1500)
58
+ else raise "need to define pages for context #{context}"
59
+ end
60
+ end
61
+
62
+ module ClassMethods
63
+ # Get context from url resource
64
+ def context_for_resource(resource)
65
+ RESOURCE_PATH_FROM_CONTEXT.invert[resource] or raise("Wrong resource specification #{resource}")
66
+ end
67
+ end
68
+
69
+ def self.included base
70
+ base.extend ClassMethods
71
+ end
72
+ end
73
+
74
+ # language: http://en.wikipedia.org/wiki/ISO_639-1
75
+ #
76
+ # * Find tweets containing a word: http://search.twitter.com/search.atom?q=twitter
77
+ # * Find tweets from a user: http://search.twitter.com/search.atom?q=from%3Aalexiskold
78
+ # * Find tweets to a user: http://search.twitter.com/search.atom?q=to%3Atechcrunch
79
+ # * Find tweets referencing a user: http://search.twitter.com/search.atom?q=%40mashable
80
+ # * Find tweets containing a hashtag: http://search.twitter.com/search.atom?q=%23haiku
81
+ # * Combine any of the operators together: http://search.twitter.com/search.atom?q=movie+%3A%29
82
+ #
83
+ # * lang: restricts tweets to the given language, given by an ISO 639-1 code. Ex: http://search.twitter.com/search.atom?lang=en&q=devo
84
+ # * rpp: the number of tweets to return per page, up to a max of 100. Ex: http://search.twitter.com/search.atom?lang=en&q=devo&rpp=15
85
+ # * page: the page number (starting at 1) to return, up to a max of roughly 1500 results (based on rpp * page)
86
+ # * since_id: returns tweets with status ids greater than the given id.
87
+ # * geocode: returns tweets by users located within a given radius of the given latitude/longitude, where the user's location is taken from their Twitter profile. The parameter value is specified by "latitide,longitude,radius", where radius units must be specified as either "mi" (miles) or "km" (kilometers). Ex: http://search.twitter.com/search.atom?geocode=40.757929%2C-73.985506%2C25km. Note that you cannot use the near operator via the API to geocode arbitrary locations; however you can use this geocode parameter to search near geocodes directly.
88
+ # * show_user: when "true", adds "<user>:" to the beginning of the tweet. This is useful for readers that do not display Atom's author field. The default is "false".
@@ -0,0 +1,31 @@
1
+ module Wuclan
2
+ module Delicious
3
+ module DeliciousRequest
4
+ end
5
+
6
+ # Recent bookmarks by tag: http://delicious.com/tag/{tag[+tag+...+tag]}?detail=3
7
+ # Popular bookmarks by tag: http://delicious.com/popular/{tag}?detail=3
8
+ # Bookmarks for a specific user by tag(s): http://delicious.com/{username}/{tag[+tag+...+tag]}?detail=3
9
+ # Bookmarks for a specific user: http://delicious.com/{username}?detail=3
10
+ class TagRequest
11
+ attr_accessor :scope
12
+ def initialize scope
13
+ end
14
+
15
+ # pages
16
+ # count
17
+ end
18
+
19
+ # A list of all public tags for a user: http://delicious.com/tags/{username}?view=all
20
+ class UserTagsRequest
21
+ end
22
+
23
+ # Recent bookmarks for a URL: http://delicious.com/url/{url md5}
24
+ class UrlInfoRequest
25
+ end
26
+
27
+ # A list of a user's network members: http://delicious.com/network/{username}
28
+ class FriendsFollowersRequest
29
+ end
30
+ end
31
+ end
@@ -0,0 +1,26 @@
1
+ module Wuclan
2
+ module Models
3
+
4
+ #
5
+ # Models for the delicious.com (formerly del.icio.us) social network
6
+ #
7
+ # Link: has tags, tagged by socialites
8
+ # Socialite: describes links with tabs, uses tags, follows/followedby socialites
9
+ # Tag: tags links, used by socialites
10
+
11
+ class DeliciousLink < Struct.new(
12
+ :delicious_link_id, :url, :title, :taggers_count)
13
+ end
14
+ class DeliciousTag < Struct.new(
15
+ :name )
16
+ end
17
+ class DeliciousUser < Struct.new(
18
+ :id, :scraped_at, :screen_name, :protected, :followers_count, :friends_count, :taggings_count, :name, :description, :bio_url )
19
+ end
20
+
21
+ class DeliciousTagging < Struct.new(
22
+ :tag_name, :delicious_link_id, :screen_name, :created_at, :text, :description)
23
+ end
24
+
25
+ end
26
+ end
@@ -0,0 +1,65 @@
1
+ module Wuclan
2
+ module Delicious
3
+ module DeliciousRequest
4
+ end
5
+
6
+ # Recent bookmarks by tag: http://feeds.delicious.com/v2/{format}/tag/{tag[+tag+...+tag]}
7
+ # Popular bookmarks by tag: http://feeds.delicious.com/v2/{format}/popular/{tag}
8
+ # Bookmarks for a specific user by tag(s): http://feeds.delicious.com/v2/{format}/{username}/{tag[+tag+...+tag]}
9
+ # Bookmarks for a specific user: http://feeds.delicious.com/v2/{format}/{username}
10
+ class TagRequest
11
+ attr_accessor :scope
12
+ def initialize scope
13
+ end
14
+
15
+ # pages
16
+ # count
17
+ end
18
+
19
+ # A list of all public tags for a user: http://feeds.delicious.com/v2/{format}/tags/{username}
20
+ class UserTagsRequest
21
+ end
22
+
23
+ # Public summary information about a user (as seen in the network badge): http://feeds.delicious.com/v2/{format}/userinfo/{username}
24
+ class UserInfoRequest
25
+ end
26
+
27
+ # Recent bookmarks for a URL: http://feeds.delicious.com/v2/{format}/url/{url md5}
28
+ class UrlInfoRequest
29
+ end
30
+
31
+ # A list of a user's network members: http://feeds.delicious.com/v2/{format}/networkmembers/{username}
32
+ class FollowersRequest
33
+ end
34
+
35
+ # A list of a user's network fans: http://feeds.delicious.com/v2/{format}/networkfans/{username}
36
+ class FriendsRequest
37
+ end
38
+ end
39
+ end
40
+
41
+
42
+ # Recent bookmarks by tag: http://feeds.delicious.com/v2/{format}/tag/{tag[+tag+...+tag]}
43
+ # Popular bookmarks by tag: http://feeds.delicious.com/v2/{format}/popular/{tag}
44
+ # Bookmarks for a specific user: http://feeds.delicious.com/v2/{format}/{username}
45
+ # Bookmarks for a specific user by tag(s): http://feeds.delicious.com/v2/{format}/{username}/{tag[+tag+...+tag]}
46
+ # Public summary information about a user (as seen in the network badge): http://feeds.delicious.com/v2/{format}/userinfo/{username}
47
+ # A list of all public tags for a user: http://feeds.delicious.com/v2/{format}/tags/{username}
48
+ # Recent bookmarks for a URL: http://feeds.delicious.com/v2/{format}/url/{url md5}
49
+ # A list of a user's network members: http://feeds.delicious.com/v2/{format}/networkmembers/{username}
50
+ # A list of a user's network fans: http://feeds.delicious.com/v2/{format}/networkfans/{username}
51
+
52
+ # Bookmarks from the hotlist: http://feeds.delicious.com/v2/{format}
53
+ # Recent bookmarks: http://feeds.delicious.com/v2/{format}/recent
54
+ # Popular bookmarks: http://feeds.delicious.com/v2/{format}/popular
55
+ # Recent site alerts (as seen in the top-of-page alert bar on the site): http://feeds.delicious.com/v2/{format}/alerts
56
+ # Bookmarks from a user's subscriptions: http://feeds.delicious.com/v2/{format}/subscriptions/{username}
57
+ # Bookmarks from members of a user's network: http://feeds.delicious.com/v2/{format}/network/{username}
58
+ # Bookmarks from members of a user's network by tag: http://feeds.delicious.com/v2/{format}/network/{username}/{tag[+tag+...+tag]}
59
+ # Summary information about a URL (as seen in the tagometer): http://feeds.delicious.com/v2/json/urlinfo/{url md5}
60
+
61
+ # Private bookmarks for a specific user: http://feeds.delicious.com/v2/{format}/{username}?private={key}
62
+ # Private bookmarks for a specific user by tag(s): http://feeds.delicious.com/v2/{format}/{username}/{tag[+tag+...+tag]}?private={key}
63
+ # Private feed for a user's inbox bookmarks from others: http://feeds.delicious.com/v2/{format}/inbox/{username}?private={key}
64
+ # Bookmarks from members of a user's private network: http://feeds.delicious.com/v2/{format}/network/{username}?private={key}
65
+ # Bookmarks from members of a user's private network by tag: http://feeds.delicious.com/v2/{format}/network/{username}/{tag[+tag+...+tag]}?private={key}