wuclan 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE.textile +20 -0
- data/README.textile +28 -0
- data/examples/analyze/strong_links/gen_multi_edge.rb +103 -0
- data/examples/analyze/strong_links/main.rb +51 -0
- data/examples/analyze/word_count/dump_schema.rb +13 -0
- data/examples/analyze/word_count/freq_user.rb +31 -0
- data/examples/analyze/word_count/freq_whole_corpus.rb +27 -0
- data/examples/analyze/word_count/word_count.pig +43 -0
- data/examples/analyze/word_count/word_count.rb +34 -0
- data/examples/lastfm/scrape/load_lastfm.rb +31 -0
- data/examples/lastfm/scrape/scrape_lastfm.rb +47 -0
- data/examples/lastfm/scrape/seed.tsv +147 -0
- data/examples/twitter/old/load_twitter_search_jobs.rb +157 -0
- data/examples/twitter/old/scrape_twitter_api.rb +104 -0
- data/examples/twitter/old/scrape_twitter_search.rb +57 -0
- data/examples/twitter/old/scrape_twitter_trending.rb +73 -0
- data/examples/twitter/parse/parse_twitter_requests.rb +81 -0
- data/examples/twitter/parse/parse_twitter_search_requests.rb +28 -0
- data/examples/twitter/scrape_twitter_api/scrape_twitter_api.rb +61 -0
- data/examples/twitter/scrape_twitter_api/seed.tsv +4 -0
- data/examples/twitter/scrape_twitter_api/start_cache_twitter.sh +2 -0
- data/examples/twitter/scrape_twitter_api/support/make_request_stats.rb +291 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_1.rb +98 -0
- data/examples/twitter/scrape_twitter_api/support/make_requests_by_id_and_date_2.pig +4 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_search_jobs.tsv +6 -0
- data/examples/twitter/scrape_twitter_api/support/twitter_trending_seed.tsv +725 -0
- data/examples/twitter/scrape_twitter_hosebird/edamame-killall +4 -0
- data/examples/twitter/scrape_twitter_hosebird/foo.rb +19 -0
- data/examples/twitter/scrape_twitter_hosebird/ps_emulation.rb +111 -0
- data/examples/twitter/scrape_twitter_hosebird/scrape_twitter_hosebird.rb +110 -0
- data/examples/twitter/scrape_twitter_hosebird/test_spewer.rb +20 -0
- data/examples/twitter/scrape_twitter_hosebird/twitter_hosebird_god.yaml +10 -0
- data/examples/twitter/scrape_twitter_search/dump_twitter_search_jobs.rb +38 -0
- data/examples/twitter/scrape_twitter_search/load_twitter_search_jobs.rb +63 -0
- data/examples/twitter/scrape_twitter_search/scrape_twitter_search.rb +44 -0
- data/examples/twitter/scrape_twitter_search/twitter_search_daemons.god +25 -0
- data/lib/old/twitter_api.rb +88 -0
- data/lib/wuclan/delicious/delicious_html_request.rb +31 -0
- data/lib/wuclan/delicious/delicious_models.rb +26 -0
- data/lib/wuclan/delicious/delicious_request.rb +65 -0
- data/lib/wuclan/friendfeed/scrape/friendfeed_search_request.rb +60 -0
- data/lib/wuclan/friendster.rb +7 -0
- data/lib/wuclan/lastfm/model/base.rb +49 -0
- data/lib/wuclan/lastfm/model/sample_responses.txt +16 -0
- data/lib/wuclan/lastfm/scrape/base.rb +195 -0
- data/lib/wuclan/lastfm/scrape/concrete.rb +143 -0
- data/lib/wuclan/lastfm/scrape/lastfm_job.rb +12 -0
- data/lib/wuclan/lastfm/scrape/lastfm_request_stream.rb +17 -0
- data/lib/wuclan/lastfm/scrape/recursive_requests.rb +154 -0
- data/lib/wuclan/lastfm/scrape.rb +12 -0
- data/lib/wuclan/lastfm.rb +7 -0
- data/lib/wuclan/metrics/user_graph_metrics.rb +99 -0
- data/lib/wuclan/metrics/user_metrics.rb +443 -0
- data/lib/wuclan/metrics/user_metrics_basic.rb +277 -0
- data/lib/wuclan/metrics/user_scraping_metrics.rb +64 -0
- data/lib/wuclan/metrics.rb +0 -0
- data/lib/wuclan/myspace.rb +21 -0
- data/lib/wuclan/open_social/model/base.rb +0 -0
- data/lib/wuclan/open_social/scrape/base.rb +111 -0
- data/lib/wuclan/open_social/scrape_request.rb +6 -0
- data/lib/wuclan/open_social.rb +0 -0
- data/lib/wuclan/rdf_output/relationship_rdf.rb +47 -0
- data/lib/wuclan/rdf_output/text_element_rdf.rb +64 -0
- data/lib/wuclan/rdf_output/tweet_rdf.rb +10 -0
- data/lib/wuclan/rdf_output/twitter_rdf.rb +84 -0
- data/lib/wuclan/rdf_output/twitter_user_rdf.rb +12 -0
- data/lib/wuclan/shorturl/shorturl_request.rb +271 -0
- data/lib/wuclan/twitter/api_response_examples.textile +300 -0
- data/lib/wuclan/twitter/model/base.rb +72 -0
- data/lib/wuclan/twitter/model/multi_edge.rb +31 -0
- data/lib/wuclan/twitter/model/relationship.rb +176 -0
- data/lib/wuclan/twitter/model/text_element/extract_info_tests.rb +83 -0
- data/lib/wuclan/twitter/model/text_element/grok_tweets.rb +96 -0
- data/lib/wuclan/twitter/model/text_element/more_regexes.rb +370 -0
- data/lib/wuclan/twitter/model/text_element.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tokenize.rb +38 -0
- data/lib/wuclan/twitter/model/tweet/tweet_regexes.rb +202 -0
- data/lib/wuclan/twitter/model/tweet/tweet_token.rb +79 -0
- data/lib/wuclan/twitter/model/tweet.rb +74 -0
- data/lib/wuclan/twitter/model/twitter_user/style/color_to_hsv.rb +57 -0
- data/lib/wuclan/twitter/model/twitter_user.rb +145 -0
- data/lib/wuclan/twitter/model.rb +21 -0
- data/lib/wuclan/twitter/parse/ff_ids_parser.rb +27 -0
- data/lib/wuclan/twitter/parse/friends_followers_parser.rb +52 -0
- data/lib/wuclan/twitter/parse/generic_json_parser.rb +26 -0
- data/lib/wuclan/twitter/parse/json_tweet.rb +63 -0
- data/lib/wuclan/twitter/parse/json_twitter_user.rb +122 -0
- data/lib/wuclan/twitter/parse/public_timeline_parser.rb +54 -0
- data/lib/wuclan/twitter/parse/twitter_search_parse.rb +60 -0
- data/lib/wuclan/twitter/parse/user_parser.rb +30 -0
- data/lib/wuclan/twitter/scrape/base.rb +97 -0
- data/lib/wuclan/twitter/scrape/old_skool_request_classes.rb +40 -0
- data/lib/wuclan/twitter/scrape/twitter_fake_fetcher.rb +31 -0
- data/lib/wuclan/twitter/scrape/twitter_ff_ids_request.rb +75 -0
- data/lib/wuclan/twitter/scrape/twitter_followers_request.rb +135 -0
- data/lib/wuclan/twitter/scrape/twitter_json_response.rb +124 -0
- data/lib/wuclan/twitter/scrape/twitter_request_stream.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_fake_fetcher.rb +44 -0
- data/lib/wuclan/twitter/scrape/twitter_search_flat_stream.rb +30 -0
- data/lib/wuclan/twitter/scrape/twitter_search_job.rb +25 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request.rb +70 -0
- data/lib/wuclan/twitter/scrape/twitter_search_request_stream.rb +19 -0
- data/lib/wuclan/twitter/scrape/twitter_timeline_request.rb +72 -0
- data/lib/wuclan/twitter/scrape/twitter_user_request.rb +64 -0
- data/lib/wuclan/twitter/scrape.rb +27 -0
- data/lib/wuclan/twitter.rb +7 -0
- data/lib/wuclan.rb +1 -0
- data/spec/spec_helper.rb +9 -0
- data/spec/wuclan_spec.rb +7 -0
- data/wuclan.gemspec +184 -0
- metadata +219 -0
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
require 'sys/proctable'
|
|
4
|
+
include Sys
|
|
5
|
+
|
|
6
|
+
# Sys::ProcTable.ps{ |proc|
|
|
7
|
+
# puts proc.to_a.join("\t")
|
|
8
|
+
# }
|
|
9
|
+
|
|
10
|
+
# Location of the spewer script, relative to this file (printed below for reference).
process_exe = "#{File.dirname(__FILE__)}/test_spewer.rb"
require 'ps_emulation'

# p [Daemonz::ProcTable.ps_emulation, Sys::ProcTable.ps.first]

p process_exe

# List every process whose command line looks like `ruby <dir>/test_spewer.rb`,
# one tab-separated row per process.
spewers = Daemonz::ProcTable.ps_emulation.select do |entry|
  entry.cmdline =~ %r{ruby \S+/test_spewer\.rb}
end
spewers.each do |entry|
  puts entry.to_a.join("\t")
end
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
#
|
|
2
|
+
# This code taken from the daemonz gem
|
|
3
|
+
# http://github.com/costan/daemonz
|
|
4
|
+
# from the file lib/daemonz/process.rb:
|
|
5
|
+
# http://github.com/costan/daemonz/raw/master/lib/daemonz/process.rb
|
|
6
|
+
#
|
|
7
|
+
module Daemonz

  # Mocks the sys-proctable gem using ps.
  #
  # This is useful even if sys-proctable is available, because it may fail for
  # random reasons.
  module ProcTable
    #
    # Pure-ruby equivalent to Sys::Proctable's ProcInfo struct. This
    # is somewhat wasteful, as we only fill in a few of the fields
    # (ps_emulation sets only +pid+ and +cmdline+).
    #
    class FakeProcInfo < Struct.new(
        :pid,     :ppid,      :pgid,    :ruid,    :rgid,    :comm,     :state,  :pctcpu,
        :oncpu,   :tnum,      :tdev,    :wmesg,   :rtime,   :priority, :usrpri, :nice,
        :cmdline, :starttime, :maxrss,  :ixrss,   :idrss,   :isrss,    :minflt, :majflt,
        :nswap,   :inblock,   :oublock, :msgsnd,  :msgrcv,  :nsignals, :nvcsw,
        :nivcsw,  :utime,     :stime )
    end

    #
    # Emulate the Sys::ProcTable.ps method by shelling out to `ps ax`.
    #
    # Returns an Array of FakeProcInfo, one per process line. The first
    # line of the ps output (the column header) is dropped. +pid+ is the
    # first whitespace-separated column, as a String; +cmdline+ is everything
    # from the fifth column on, or nil when the line has fewer than five
    # columns. Blank lines are ignored.
    #
    def self.ps_emulation
      retval = []
      ps_output = `ps ax`
      header_skipped = false
      ps_output.each_line do |pline|
        # the first line is the "PID TTY STAT TIME COMMAND" header
        unless header_skipped
          header_skipped = true
          next
        end
        pdata = pline.chomp.split(nil, 5)
        next if pdata.empty?
        pinfo = FakeProcInfo.new
        # NB: String#strip (not strip!) -- strip! returns nil when there is
        # nothing to strip, which would leave pid and cmdline unset.
        pinfo.pid     = pdata[0].strip
        pinfo.cmdline = pdata[4].strip unless pdata[4].nil?
        retval << pinfo
      end
      return retval
    end


    # ===========================================================================
    #
    # Know when to hold em and when to fold em:
    #
    # We want to use sys/proctable if it is available, succeeds, and has a
    # complete implementation on our platform.
    #
    # * sys/proctable is broken on OSX (possibly just 10.5 Leopard): fake it.
    # * if require 'sys/proctable' fails, then fake it.
    # * Otherwise, as a first resort use sys/proctable;
    # * but if the call fails, fake it.
    #
    # Returns true when sys/proctable should be used, false otherwise.
    #
    def self.require_sys_proctable_or_die_trying!
      begin
        require 'sys/proctable'
        require 'sys/uname'
        return false if Sys::Uname.sysname =~ /Darwin/
        return true
      rescue ScriptError, StandardError
        # LoadError is a ScriptError, so a missing gem lands here; signals and
        # exit requests are deliberately NOT swallowed.
        return false
      end
    end

    # ===========================================================================
    #
    # Define either the fake ProcTable.ps or a defensive-driver ProcTable.ps
    #
    unless self.require_sys_proctable_or_die_trying!
      # The accelerated version is not available, use the slow version all the time.
      def self.ps
        self.ps_emulation
      end
    else
      #
      # If sys/proctable exists
      #
      def self.ps
        # We don't use ps_emulation all the time because sys-proctable is
        # faster. We only pay the performance penalty when sys-proctable fails.
        begin ; Sys::ProcTable.ps
        rescue ScriptError, StandardError ; self.ps_emulation ; end
      end
    end

    # ===========================================================================
    #
    # returns information about a process or all the running processes
    #
    # With no +pid+: a Hash mapping each pid (as a String) to
    # { :cmdline => ..., :pid => ... }.
    # With a +pid+: that process's item hash, or nil if no process matches.
    #
    def self.process_info(pid = nil)
      info = Hash.new
      Daemonz::ProcTable.ps.each do |process|
        item = { :cmdline => process.cmdline, :pid => process.pid.to_s }
        if pid.nil?
          info[process.pid.to_s] = item
        else
          return item if item[:pid].to_s == pid.to_s
        end
      end

      if pid.nil?
        return info
      else
        return nil
      end
    end

  end
end
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
$: << File.dirname(__FILE__)+'/../../lib'
|
|
2
|
+
$: << File.dirname(__FILE__)+'/../../../wukong/lib'
|
|
3
|
+
$: << File.dirname(__FILE__)+'/../../../monkeyshines/lib'
|
|
4
|
+
require 'edamame'
|
|
5
|
+
require 'edamame/monitoring'
|
|
6
|
+
require 'ps_emulation'
|
|
7
|
+
|
|
8
|
+
#
# God process description for a scraper script: restarts the scraper once the
# newest datestamped output file grows past :max_file_size.
#
class ScraperGod < GodProcess
  DEFAULT_OPTIONS = {
    :max_file_age           => 21600,      # 6 hours
    :max_file_size          => 1000000,    # 1,000,000 (roughly 1 MB)
    :max_file_size_interval => 10.seconds,
    :process_exe            => File.dirname(__FILE__)+'/test_spewer.rb',
  }

  # Parent defaults, overlaid with ScraperGod::DEFAULT_OPTIONS.
  def self.default_options()
    super.deep_merge(ScraperGod::DEFAULT_OPTIONS)
  end

  # Parent site options, overlaid with the :scraper_god section (if any) of
  # the global site options.
  def self.site_options()
    super.deep_merge(global_site_options[:scraper_god]||{})
  end

  def self.kind
    :scraper
  end

  # "<kind>_<handle>", falling back to the executable's basename when no
  # :handle option is given.
  def handle
    suffix = options[:handle] || File.basename(options[:process_exe])
    "#{self.class.kind}_#{suffix}"
  end

  # Creates the parent's directories plus the data directory, then prints the
  # executable path.
  def mkdirs!
    super()
    FileUtils.mkdir_p options[:data_dir]
    p options[:process_exe]
  end

  # Adds a restart rule on top of the parent's: restart when the
  # :file_size_of_oldest_datestamped condition fires.
  def setup_restart watcher
    super(watcher)
    watcher.restart_if do |restart|
      restart.condition(:file_size_of_oldest_datestamped) do |cond|
        cond.data_dir = options[:data_dir]
        cond.max_size = options[:max_file_size]
        poll_interval = options[:max_file_size_interval]
        cond.interval = poll_interval if poll_interval
      end
    end
  end

  # Command line used to launch the scraper.
  def start_command
    parts = [ options[:process_exe], "--handle=#{handle}" ]
    parts.flatten.compact.join(" ")
  end

  def restart_command()
    nil
  end

  def stop_command()
    nil
  end
end
|
|
50
|
+
|
|
51
|
+
module God
  module Conditions

    #
    # Poll condition that is true once the file at +path+ exists and is at
    # least +max_size+ bytes long.
    #
    class FileSize < PollCondition
      attr_accessor :path, :max_size

      def initialize
        super
        self.path     = nil
        self.max_size = nil
      end

      # Both +path+ and +max_size+ must be set.
      def valid?
        valid = true
        valid &= complain("Attribute 'path' must be specified", self)     if self.path.nil?
        valid &= complain("Attribute 'max_size' must be specified", self) if self.max_size.nil?
        valid
      end

      # false when there is no path or no such file; otherwise whether the
      # file has reached max_size.
      def test
        # path may be computed (see subclasses), so look it up only once
        target = self.path
        # File.exist? rather than File.exists?: the latter was removed in Ruby 3.2
        return false if target.blank? || (! File.exist?(target))
        File.size(target) >= self.max_size
      end
    end

    #
    # Look for the last-datestamped file in the last-datestamped directory
    #
    # (Despite the class name, both lookups take the LAST entry in sort order.)
    #
    class FileSizeOfOldestDatestamped < FileSize
      attr_accessor :data_dir

      def initialize
        super
        self.data_dir = nil
      end

      # As FileSize#valid?, and +data_dir+ must be set too.
      def valid?
        valid = super()
        valid &= complain("Attribute 'data_dir' must be specified", self) if self.data_dir.nil?
        valid
      end

      # Path of the last (by name) file inside the last (by name) all-digits
      # subdirectory of data_dir, or '' when there is no such file.
      def path
        # Without a data_dir there is nothing to look at; returning '' lets
        # valid? report the missing attribute instead of raising here.
        return '' if data_dir.nil?
        last_dir = Dir[data_dir+'/[0-9]*'].
          find_all{|dir| File.directory?(dir) && (File.basename(dir) =~ /^\d+$/)}.
          sort.last
        # No datestamped directory yet: don't fall through to globbing "/*".
        return '' if last_dir.nil?
        Dir["#{last_dir}/*"].sort.last || ''
      end

    end

  end
end
|
|
107
|
+
|
|
108
|
+
# Scraper output goes under ./work, next to this file.
WORK_DIR = "#{File.dirname(__FILE__)}/work"

ScraperGod.create(:data_dir => WORK_DIR)
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
require 'edamame'
|
|
4
|
+
require 'monkeyshines'
|
|
5
|
+
require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
|
|
6
|
+
# Setup
WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
Monkeyshines.load_global_options!
Monkeyshines.load_cmdline_options!
# Monkeyshines::CONFIG[:handle] = :test

# set up a chunked store -- will save timestamped files
# in the form :rootdir/:date/:handle+:timestamp-:pid.tsv",
dest = Monkeyshines::Store::ChunkedFlatFileStore.new(:rootdir => WORK_DIR)

# Write one record -- a UTC timestamp followed by 4064 asterisks, roughly
# 4 KB -- every half second, forever.
filler = "*" * 4064
loop do
  dest << "#{Time.now.utc} #{filler}"
  sleep 0.5
end
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
:process_groups:
|
|
2
|
+
:twitter_hosebird:
|
|
3
|
+
:type: :scraper
|
|
4
|
+
:max_file_age: 21600 # 6 hours
|
|
5
|
+
:max_file_size: 120000000 # About 10% less than 128MB, so it should mostly fill one or two blocks on a hadoop file system
|
|
6
|
+
:hosebird_level: spritzer # spritzer, gardenhose, etc.
|
|
7
|
+
:user_pass_file: ...
|
|
8
|
+
:data_dir: /data/ripd/com.tw/com.twitter.stream
|
|
9
|
+
:log_dir: /data/log/com.tw/com.twitter.stream
|
|
10
|
+
:filename_pattern: ":date/stream.twitter.com+:datetime-:pid-:hostname.json"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
require 'edamame'
|
|
4
|
+
require 'monkeyshines'
|
|
5
|
+
require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
|
|
6
|
+
require 'monkeyshines/utils/filename_pattern'
|
|
7
|
+
# Setup
WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
Monkeyshines.load_global_options!
Monkeyshines.load_cmdline_options!
Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]
include Edamame::Scheduling

#
# You can also specify these with --source-fetches on the command line
#
class TwitterScraper < Monkeyshines::Runner
  # Adds the --dest-filename option on top of the runner's standard ones.
  def self.define_cmdline_options &block
    super(&block)
    yield(:dest_filename, "URI for scrape store to dump into", :type => String)
  end
end

# Default dump file, used only when no destination filename was configured.
Monkeyshines::CONFIG[:dest][:filename] ||= Monkeyshines::Utils::FilenamePattern.new(
  ":dest_dir/seed/jobs-:handle-:hostname-:timestamp.tsv",
  :dest_dir => WORK_DIR ).to_s

# Tube name: the handle with non-word characters removed and '_' turned into '-'.
tube_name = Monkeyshines::CONFIG[:handle].to_s.gsub(/\W+/,'').gsub(/_/,'-')

job_queue = Edamame::PersistentQueue.new(
  :tube  => tube_name,
  :queue => { :uris => ['localhost:11220'], },
  :store => { :uri => ':11221', }
  )
dump_store = Monkeyshines::Store::FlatFileStore.new(
  :filename => Monkeyshines::CONFIG[:dest][:filename],
  :filemode => 'w')
Log.info "Sucking from tube '#{tube_name}' on s#{job_queue.options[:store][:uri]}/q#{job_queue.options[:queue][:uris]}, dumping into #{dump_store.filename}"

# One row per job: decoded key, scheduling stats, then last_run as
# YYYYmmddHHMMSS (nil when the job has no last_run).
job_queue.each do |job|
  # dest << ( [Monkeyshines.url_decode(job.obj[:key])] + job.values_of(:priority, :prev_rate, :prev_items, :prev_span_min, :prev_span_max) )
  # last_run = Time.parse(job.last_run)
  last_run_stamp = job.last_run ? job.last_run.strftime("%Y%m%d%H%M%S") : nil
  key_column     = [Monkeyshines.url_decode(job.obj[:key])]
  stat_columns   = job.values_of(:priority, :prev_items_rate, :prev_max, :prev_items, :delay)
  dump_store << ( key_column + stat_columns + [last_run_stamp] )
end
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
require 'edamame'
|
|
4
|
+
require 'monkeyshines'
|
|
5
|
+
require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
|
|
6
|
+
# Setup
WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
Monkeyshines.load_global_options!
Monkeyshines.load_cmdline_options!
Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]

include Edamame::Scheduling

#
# You can also specify these with --source-fetches on the command line
#
class TwitterScraper < Monkeyshines::Runner
  # Adds the required --source-filename option on top of the runner's
  # standard ones.
  def self.define_cmdline_options &block
    super(&block)
    yield(:source_filename, "URI for scrape store to load from", :type => String, :required => true)
  end
end

# Tube name: the handle with non-word characters removed and '_' turned into '-'.
default_tube = Monkeyshines::CONFIG[:handle].to_s.gsub(/\W+/,'').gsub(/_/,'-')
p default_tube

DEFAULT_PRIORITY = 65536
DEFAULT_TTR = nil
DEFAULT_RESERVE_TIMEOUT = 15
IMMEDIATELY = 0
source = Monkeyshines::Store::FlatFileStore.new :filename => Monkeyshines::CONFIG[:source][:filename]
dest = Edamame::PersistentQueue.new( :tube => default_tube,
  :queue => { :uris => ['localhost:11220'], },
  :store => { :uri => ':11221', }
  )

# Each source row describes one search job; turn it into a recurring
# Edamame job and queue it for immediate execution.
source.each do |query_term, priority, prev_items_rate, prev_max, prev_items, delay, last_run|
  # |query_term, priority, prev_rate, prev_items, prev_span_min, prev_span_max|
  query_term.strip!
  query_term = Monkeyshines.url_encode(query_term)
  priority = priority.to_i
  priority = DEFAULT_PRIORITY if (priority == 0 )   # missing/unparseable priority
  prev_items_rate = prev_items_rate.to_f
  prev_items_rate = nil if (prev_items_rate < 1e-6) # treat ~zero as "no rate known"
  prev_max = prev_max.to_i
  prev_items = prev_items.to_i
  delay = delay.to_f
  # A job that has never run is dumped with a nil last_run, which may come
  # back as an empty column; DateTime.parse('') raises ArgumentError, so
  # treat blank the same as missing.
  # NOTE(review): last_run is parsed but not passed to the job below -- confirm
  # whether that is intended.
  last_run = (last_run.nil? || last_run.to_s.strip.empty?) ? nil : DateTime.parse(last_run)

  twitter_search = { :type => 'TwitterSearchRequest', :key => query_term }

  job = Edamame::Job.new(default_tube, priority, nil, 1,
    Recurring.new(delay, prev_max, prev_items, prev_items_rate),
    twitter_search
    )
  dest.put job, job.priority, IMMEDIATELY
end
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# loop do
|
|
60
|
+
# job = dest.reserve(DEFAULT_RESERVE_TIMEOUT) or break
|
|
61
|
+
# p [job, job.priority, job.scheduling, job.obj]
|
|
62
|
+
# # dest.delete job
|
|
63
|
+
# end
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
#!/usr/bin/env ruby
|
|
2
|
+
require 'rubygems'
|
|
3
|
+
require 'edamame'
|
|
4
|
+
require 'monkeyshines'
|
|
5
|
+
require 'wuclan/twitter' ; include Wuclan::Twitter::Scrape
|
|
6
|
+
# Setup
WORK_DIR = Subdir[__FILE__,'work'].expand_path.to_s
Monkeyshines.load_global_options!
Monkeyshines.load_cmdline_options!
Monkeyshines::CONFIG[:fetcher] = Monkeyshines::CONFIG[:twitter_api]

#
# * jobs stream from an edamame job queue.
# * each job generates 1 to 15 paginated requests, stopping when a response
#   overlaps the prev_max item.
# * Each request is fetched with the standard HTTP fetcher.
# * jobs are rescheduled based on the observed item rate
# * results are sent to a ChunkedFlatFileStore
#

#
# Run scraper: build a fresh runner and run it, over and over. A
# Beanstalk::NotFoundError is reported as a warning and the loop starts over.
#
loop do
  begin
    #
    # Create scraper
    #
    log_options = { :iters => 600, :time => 150, :dest => nil } # Monkeyshines::CONFIG[:handle]
    source_options = {
      :type                  => TwitterSearchRequestStream,
      :queue_request_timeout => (10 * 60),
      :queue                 => { :uris => ['localhost:11220'], },
      :store                 => { :uri => ':11221', },
    }
    dest_options = { :type => :chunked_flat_file_store, :rootdir => WORK_DIR, :filemode => 'a' }
    # dest_options = { :type => :flat_file_store, :filename => WORK_DIR+"/test_output.tsv" }
    # :fetcher => { :type => TwitterSearchFakeFetcher },
    scraper = Monkeyshines::Runner.new({
        :log        => log_options,
        :source     => source_options,
        :dest       => dest_options,
        :sleep_time => 1 ,
      })
    Log.info "Starting a run!"
    scraper.run
  rescue Beanstalk::NotFoundError => err
    warn err
  end
end
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
$: << File.dirname(__FILE__)+'/../../../../edamame/lib'
|
|
2
|
+
require 'edamame/monitoring'
|
|
3
|
+
WORK_DIR = "#{File.dirname(__FILE__)}/work"

#
# For debugging:
#
#   sudo god -c this_file.god -D
#
# (for production, use the etc/init.d script in this directory)
#
# TODO: define an EdamameDirector that lets us name these collections.
#
THE_FAITHFUL = [
  # twitter_search
  [BeanstalkdGod, { :port => 11240, :max_mem_usage => 100.megabytes, }],
  [TyrantGod,     { :port => 11241, :db_dirname => WORK_DIR, :db_name => "twitter_search-queue.tct" }],
  #
  # [TyrantGod,   { :port => 11249, :db_dirname => WORK_DIR, :db_name => "twitter_search-flat.tct" }],
]

# Create each process description (with flapping notification set to
# 'default') and make its directories.
THE_FAITHFUL.each do |god_class, settings|
  watched = god_class.create(settings.merge(:flapping_notify => 'default'))
  watched.mkdirs!
end
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
#
# Mixin describing the (old) twitter.com JSON API URLs.
#
# gen_url and resource_path read +context+ from the including object;
# gen_url also reads +identifier+ and +page+.
#
module TwitterApi
  #
  # The URI for a given resource
  #
  # +context+ may be a Symbol or a String (it is normalised with to_sym, as
  # resource_path already does). Raises a RuntimeError for contexts with no
  # URL template.
  #
  def gen_url
    ctx = context.respond_to?(:to_sym) ? context.to_sym : context
    case ctx
    when :user, :followers, :friends, :favorites, :timeline
      "http://twitter.com/#{resource_path}/#{identifier}.json?page=#{page}"
    when :followers_ids, :friends_ids
      "http://twitter.com/#{resource_path}/#{identifier}.json"
    when :user_timeline
      "http://twitter.com/#{resource_path}/#{identifier}.json?page=#{page}&count=200"
    # when :public_timeline
    # when :search
    #   "http://search.twitter.com/search.json?q=#{query}"
    else
      raise "Don't know how to retrieve #{context} yet"
    end
  end

  # Regular expression to grok resource from uri
  #
  # Captures: resource path, identifier, page. Note that it only matches
  # two-segment resource paths followed by a ?page= query, so the URLs
  # generated for favorites and for the *_ids contexts do not match it.
  GROK_URI_RE = %r{http://twitter.com/(\w+/\w+)/(\w+)\.json\?page=(\d+)}

  # Context <=> resource mapping
  #
  # aka. repairing the non-REST uri's
  RESOURCE_PATH_FROM_CONTEXT = {
    :user            => 'users/show',
    :followers_ids   => 'followers/ids',
    :friends_ids     => 'friends/ids',
    :followers       => 'statuses/followers',
    :friends         => 'statuses/friends',
    :favorites       => 'favorites',
    :timeline        => 'statuses/user_timeline',
    :user_timeline   => 'statuses/user_timeline',
    :public_timeline => 'statuses/public_timeline'
  }
  # Get url resource for context
  def resource_path
    RESOURCE_PATH_FROM_CONTEXT[context.to_sym]
  end

  # Number of pages needed to hold +count+ items at +per_page+ items each:
  # never negative, and capped at +max+ when +max+ is given.
  def self.pages_from_count per_page, count, max=nil
    num = [ (count.to_f / per_page.to_f).ceil, 0 ].max
    [num, max].compact.min
  end

  # Number of pages to request for +thing+ in the given +context+ (Symbol or
  # String). Raises a RuntimeError for unknown contexts.
  def self.pages context, thing
    ctx = context.respond_to?(:to_sym) ? context.to_sym : context
    case ctx
    when :favorites       then pages_from_count( 20, thing.favourites_count, 20)
    when :friends         then pages_from_count(100, thing.friends_count,    10)
    when :followers       then pages_from_count(100, thing.followers_count,  10)
    when :followers_ids   then thing.followers_count == 0 ? 0 : 1
    when :friends_ids     then thing.friends_count   == 0 ? 0 : 1
    when :user            then 1
    when :public_timeline then 1
    when :user_timeline   then pages_from_count(200, thing.statuses_count,   20)
    when :search          then pages_from_count(100, 1500)
    else raise "need to define pages for context #{context}"
    end
  end

  module ClassMethods
    # Get context from url resource
    #
    # 'statuses/user_timeline' belongs to two contexts; the inverted hash
    # keeps only one of them.
    def context_for_resource(resource)
      RESOURCE_PATH_FROM_CONTEXT.invert[resource] or raise("Wrong resource specification #{resource}")
    end
  end

  # Including classes also get the ClassMethods.
  def self.included base
    base.extend ClassMethods
  end
end
|
|
73
|
+
|
|
74
|
+
# language: http://en.wikipedia.org/wiki/ISO_639-1
|
|
75
|
+
#
|
|
76
|
+
# * Find tweets containing a word: http://search.twitter.com/search.atom?q=twitter
|
|
77
|
+
# * Find tweets from a user: http://search.twitter.com/search.atom?q=from%3Aalexiskold
|
|
78
|
+
# * Find tweets to a user: http://search.twitter.com/search.atom?q=to%3Atechcrunch
|
|
79
|
+
# * Find tweets referencing a user: http://search.twitter.com/search.atom?q=%40mashable
|
|
80
|
+
# * Find tweets containing a hashtag: http://search.twitter.com/search.atom?q=%23haiku
|
|
81
|
+
# * Combine any of the operators together: http://search.twitter.com/search.atom?q=movie+%3A%29
|
|
82
|
+
#
|
|
83
|
+
# * lang: restricts tweets to the given language, given by an ISO 639-1 code. Ex: http://search.twitter.com/search.atom?lang=en&q=devo
|
|
84
|
+
# * rpp: the number of tweets to return per page, up to a max of 100. Ex: http://search.twitter.com/search.atom?lang=en&q=devo&rpp=15
|
|
85
|
+
# * page: the page number (starting at 1) to return, up to a max of roughly 1500 results (based on rpp * page)
|
|
86
|
+
# * since_id: returns tweets with status ids greater than the given id.
|
|
87
|
+
# * geocode: returns tweets by users located within a given radius of the given latitude/longitude, where the user's location is taken from their Twitter profile. The parameter value is specified by "latitude,longitude,radius", where radius units must be specified as either "mi" (miles) or "km" (kilometers). Ex: http://search.twitter.com/search.atom?geocode=40.757929%2C-73.985506%2C25km. Note that you cannot use the near operator via the API to geocode arbitrary locations; however you can use this geocode parameter to search near geocodes directly.
|
|
88
|
+
# * show_user: when "true", adds "<user>:" to the beginning of the tweet. This is useful for readers that do not display Atom's author field. The default is "false".
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
module Wuclan
  module Delicious
    module DeliciousRequest
    end

    # Recent bookmarks by tag:                 http://delicious.com/tag/{tag[+tag+...+tag]}?detail=3
    # Popular bookmarks by tag:                http://delicious.com/popular/{tag}?detail=3
    # Bookmarks for a specific user by tag(s): http://delicious.com/{username}/{tag[+tag+...+tag]}?detail=3
    # Bookmarks for a specific user:           http://delicious.com/{username}?detail=3
    class TagRequest
      attr_accessor :scope

      # Remember the scope this request was created with, so that the
      # +scope+ accessor returns it.
      def initialize scope
        @scope = scope
      end

      # pages
      # count
    end

    # A list of all public tags for a user: http://delicious.com/tags/{username}?view=all
    class UserTagsRequest
    end

    # Recent bookmarks for a URL: http://delicious.com/url/{url md5}
    class UrlInfoRequest
    end

    # A list of a user's network members: http://delicious.com/network/{username}
    class FriendsFollowersRequest
    end
  end
end
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
module Wuclan
  module Models

    #
    # Models for the delicious.com (formerly del.icio.us) social network
    #
    # Link:      has tags, tagged by socialites
    # Socialite: describes links with tags, uses tags, follows/followedby socialites
    # Tag:       tags links, used by socialites

    # A bookmarked URL.
    class DeliciousLink < Struct.new(
        :delicious_link_id,
        :url,
        :title,
        :taggers_count
        )
    end

    # A tag, identified by its name only.
    class DeliciousTag < Struct.new(
        :name
        )
    end

    # A delicious user (a "socialite").
    class DeliciousUser < Struct.new(
        :id,
        :scraped_at,
        :screen_name,
        :protected,
        :followers_count,
        :friends_count,
        :taggings_count,
        :name,
        :description,
        :bio_url
        )
    end

    # One user applying one tag to one link.
    class DeliciousTagging < Struct.new(
        :tag_name,
        :delicious_link_id,
        :screen_name,
        :created_at,
        :text,
        :description
        )
    end

  end
end
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
module Wuclan
  module Delicious
    module DeliciousRequest
    end

    # Recent bookmarks by tag:                 http://feeds.delicious.com/v2/{format}/tag/{tag[+tag+...+tag]}
    # Popular bookmarks by tag:                http://feeds.delicious.com/v2/{format}/popular/{tag}
    # Bookmarks for a specific user by tag(s): http://feeds.delicious.com/v2/{format}/{username}/{tag[+tag+...+tag]}
    # Bookmarks for a specific user:           http://feeds.delicious.com/v2/{format}/{username}
    class TagRequest
      attr_accessor :scope

      # Remember the scope this request was created with, so that the
      # +scope+ accessor returns it.
      def initialize scope
        @scope = scope
      end

      # pages
      # count
    end

    # A list of all public tags for a user: http://feeds.delicious.com/v2/{format}/tags/{username}
    class UserTagsRequest
    end

    # Public summary information about a user (as seen in the network badge): http://feeds.delicious.com/v2/{format}/userinfo/{username}
    class UserInfoRequest
    end

    # Recent bookmarks for a URL: http://feeds.delicious.com/v2/{format}/url/{url md5}
    class UrlInfoRequest
    end

    # A list of a user's network members: http://feeds.delicious.com/v2/{format}/networkmembers/{username}
    # NOTE(review): "network members" paired with Followers (and "fans" with
    # Friends, below) may be swapped -- confirm against the delicious feed docs.
    class FollowersRequest
    end

    # A list of a user's network fans: http://feeds.delicious.com/v2/{format}/networkfans/{username}
    class FriendsRequest
    end
  end
end
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# Recent bookmarks by tag: http://feeds.delicious.com/v2/{format}/tag/{tag[+tag+...+tag]}
|
|
43
|
+
# Popular bookmarks by tag: http://feeds.delicious.com/v2/{format}/popular/{tag}
|
|
44
|
+
# Bookmarks for a specific user: http://feeds.delicious.com/v2/{format}/{username}
|
|
45
|
+
# Bookmarks for a specific user by tag(s): http://feeds.delicious.com/v2/{format}/{username}/{tag[+tag+...+tag]}
|
|
46
|
+
# Public summary information about a user (as seen in the network badge): http://feeds.delicious.com/v2/{format}/userinfo/{username}
|
|
47
|
+
# A list of all public tags for a user: http://feeds.delicious.com/v2/{format}/tags/{username}
|
|
48
|
+
# Recent bookmarks for a URL: http://feeds.delicious.com/v2/{format}/url/{url md5}
|
|
49
|
+
# A list of a user's network members: http://feeds.delicious.com/v2/{format}/networkmembers/{username}
|
|
50
|
+
# A list of a user's network fans: http://feeds.delicious.com/v2/{format}/networkfans/{username}
|
|
51
|
+
|
|
52
|
+
# Bookmarks from the hotlist: http://feeds.delicious.com/v2/{format}
|
|
53
|
+
# Recent bookmarks: http://feeds.delicious.com/v2/{format}/recent
|
|
54
|
+
# Popular bookmarks: http://feeds.delicious.com/v2/{format}/popular
|
|
55
|
+
# Recent site alerts (as seen in the top-of-page alert bar on the site): http://feeds.delicious.com/v2/{format}/alerts
|
|
56
|
+
# Bookmarks from a user's subscriptions: http://feeds.delicious.com/v2/{format}/subscriptions/{username}
|
|
57
|
+
# Bookmarks from members of a user's network: http://feeds.delicious.com/v2/{format}/network/{username}
|
|
58
|
+
# Bookmarks from members of a user's network by tag: http://feeds.delicious.com/v2/{format}/network/{username}/{tag[+tag+...+tag]}
|
|
59
|
+
# Summary information about a URL (as seen in the tagometer): http://feeds.delicious.com/v2/json/urlinfo/{url md5}
|
|
60
|
+
|
|
61
|
+
# Private bookmarks for a specific user: http://feeds.delicious.com/v2/{format}/{username}?private={key}
|
|
62
|
+
# Private bookmarks for a specific user by tag(s): http://feeds.delicious.com/v2/{format}/{username}/{tag[+tag+...+tag]}?private={key}
|
|
63
|
+
# Private feed for a user's inbox bookmarks from others: http://feeds.delicious.com/v2/{format}/inbox/{username}?private={key}
|
|
64
|
+
# Bookmarks from members of a user's private network: http://feeds.delicious.com/v2/{format}/network/{username}?private={key}
|
|
65
|
+
# Bookmarks from members of a user's private network by tag: http://feeds.delicious.com/v2/{format}/network/{username}/{tag[+tag+...+tag]}?private={key}
|