monkeyshines 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- data/examples/shorturls/scrape_shorturls.rb +89 -35
- data/examples/shorturls/shorturl_stats.rb +37 -0
- data/examples/shorturls/split_short_urls.rb +21 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +118 -6
- data/lib/monkeyshines/fetcher/http_fetcher.rb +10 -9
- data/lib/monkeyshines/monitor/periodic_logger.rb +2 -1
- data/lib/monkeyshines/monitor/periodic_monitor.rb +9 -2
- data/lib/monkeyshines/options.rb +2 -0
- data/lib/monkeyshines/runner.rb +1 -0
- data/lib/monkeyshines/store/flat_file_store.rb +6 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +2 -1
- data/lib/monkeyshines/utils/filename_pattern.rb +1 -1
- metadata +53 -29
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +0 -66
- data/examples/shorturls/old/shorturl_stats.rb +0 -81
@@ -1,16 +1,19 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__); $: << File.dirname(__FILE__)+'/../../../graphiterb/lib'
|
3
3
|
require 'rubygems'
|
4
4
|
require 'wukong'
|
5
5
|
require 'monkeyshines'
|
6
|
+
require 'configliere'
|
6
7
|
#
|
7
8
|
require 'shorturl_request'
|
8
9
|
require 'shorturl_sequence'
|
10
|
+
require 'shorturl_stats'
|
9
11
|
require 'monkeyshines/utils/uri'
|
10
12
|
require 'monkeyshines/utils/filename_pattern'
|
11
13
|
require 'monkeyshines/store/conditional_store'
|
12
14
|
require 'monkeyshines/fetcher/http_head_fetcher'
|
13
|
-
require '
|
15
|
+
require 'graphiterb' # needs graphiterb - simple ruby interface for graphite
|
16
|
+
# require 'trollop' # gem install trollop
|
14
17
|
|
15
18
|
# ===========================================================================
|
16
19
|
#
|
@@ -26,39 +29,74 @@ require 'trollop' # gem install trollop
|
|
26
29
|
# --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
|
27
30
|
#
|
28
31
|
#
|
29
|
-
opts = Trollop::options do
|
30
|
-
opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
|
31
|
-
opt :log, "Log file name; leave blank to use STDERR", :type => String
|
32
|
-
# input from file
|
33
|
-
opt :from, "Location of URLs to scrape", :type => String
|
34
|
-
opt :skip, "Initial lines to skip", :type => Integer
|
35
|
-
# OR do a random walk
|
36
|
-
opt :random, "Generate and visit random URL suffixes"
|
37
|
-
opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
38
|
-
opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
39
|
-
opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
|
40
|
-
# output storage
|
41
|
-
opt :cache_loc, "URI for cache server", :type => String
|
42
|
-
opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
|
43
|
-
opt :dest_dir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
|
44
|
-
opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
|
45
|
-
end
|
46
|
-
handle = opts[:base_url].gsub(/\.com$/,'').gsub(/\W+/,'')
|
47
32
|
|
33
|
+
Configliere.use :commandline, :config_file, :define
|
34
|
+
Settings.read 'shorturls.yaml' #~/.configliere/shorturls.yaml
|
35
|
+
Settings.define :base_url, :description => "Host part of URL: eg tinyurl.com", :type => String, :required => true
|
36
|
+
# Settings.define :log, :description => "Log file name; leave blank to use STDERR", :type => String
|
37
|
+
Settings.define :log_time, :description => "Log time interval, in seconds, for periodic logger and Graphite logger", :type => Integer, :default => 60
|
38
|
+
Settings.define :log_iters, :description => "Log iteration interval for periodic logger and Graphite logger", :type => Integer, :default => 10000
|
39
|
+
# input from file
|
40
|
+
Settings.define :file_from, :description => "Location of URLs to scrape", :type => String
|
41
|
+
Settings.define :file_skip, :description => "Initial lines to skip", :type => Integer
|
42
|
+
# OR do a random walk
|
43
|
+
Settings.define :random, :description => "Generate and visit random URL suffixes"
|
44
|
+
Settings.define :random_min, :description => "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
45
|
+
Settings.define :random_max, :description => "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
46
|
+
Settings.define :random_radix, :description => "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
|
47
|
+
# output storage
|
48
|
+
Settings.define :cache_loc, :description => "URI for cache server", :type => String
|
49
|
+
Settings.define :chunk_time, :description => "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
|
50
|
+
Settings.define :rootdir, :description => "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd/shorturls'
|
51
|
+
Settings.define :dest_pattern, :description => "Pattern for dump file output", :default => ":rootdir/:date/:handle+:timestamp-:pid.tsv"
|
52
|
+
Settings.resolve!
|
53
|
+
Log = Logger.new($stderr) unless defined?(Log)
|
54
|
+
|
55
|
+
# Removed trollop optioning, added in configliere instead
|
56
|
+
# opts = Trollop::options do
|
57
|
+
# opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
|
58
|
+
# opt :log, "Log file name; leave blank to use STDERR", :type => String
|
59
|
+
# # input from file
|
60
|
+
# opt :from, "Location of URLs to scrape", :type => String
|
61
|
+
# opt :skip, "Initial lines to skip", :type => Integer
|
62
|
+
# # OR do a random walk
|
63
|
+
# opt :random, "Generate and visit random URL suffixes"
|
64
|
+
# opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
65
|
+
# opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
66
|
+
# opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
|
67
|
+
# # output storage
|
68
|
+
# opt :cache_loc, "URI for cache server", :type => String
|
69
|
+
# opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
|
70
|
+
# opt :rootdir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
|
71
|
+
# opt :dest_pattern, "Pattern for dump file output", :default => ":rootdir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
|
72
|
+
# end
|
73
|
+
handle = Settings.base_url.gsub(/\.com$/,'').gsub(/\W+/,'')
|
74
|
+
hostname ||= `hostname`.chomp.gsub(".","_")
|
75
|
+
|
76
|
+
|
77
|
+
#
|
48
78
|
# ******************** Log ********************
|
49
|
-
|
50
|
-
|
79
|
+
#
|
80
|
+
# (I don't think the log file name ever gets used)
|
81
|
+
# Settings.log = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (Settings.log=='')
|
82
|
+
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
|
83
|
+
|
84
|
+
#
|
85
|
+
# ******************** Graphite Sender ***********************
|
86
|
+
#
|
87
|
+
graphite_sender = Graphiterb::GraphiteLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
|
51
88
|
|
52
89
|
#
|
53
90
|
# ******************** Load from store or random walk ********************
|
54
91
|
#
|
55
|
-
if
|
56
|
-
|
57
|
-
src_store.
|
58
|
-
|
92
|
+
if Settings.file_from
|
93
|
+
# Settings.filename = Settings.file_from
|
94
|
+
src_store = Monkeyshines::Store::FlatFileStore.new(:filename => Settings.file_from, :skip => Settings.file_skip.to_i) # + {:filemode => 'r'}
|
95
|
+
# src_store.skip!(Settings.file_skip.to_i) if Settings.file_skip
|
96
|
+
elsif Settings.random
|
59
97
|
src_store = Monkeyshines::Store::RandomUrlStream.new_from_command_line(opts)
|
60
98
|
else
|
61
|
-
|
99
|
+
Settings.die "Need to either say --random or --file_from=filename"
|
62
100
|
end
|
63
101
|
|
64
102
|
#
|
@@ -67,30 +105,37 @@ end
|
|
67
105
|
# Track visited URLs with key-value database
|
68
106
|
#
|
69
107
|
RDB_PORTS = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
|
70
|
-
cache_loc =
|
71
|
-
dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(cache_loc)
|
108
|
+
cache_loc = Settings.cache_loc || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
|
109
|
+
dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(:uri => cache_loc)
|
110
|
+
|
111
|
+
|
72
112
|
# dest_cache = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)
|
73
113
|
|
74
114
|
#
|
75
115
|
# Store the data into flat files
|
76
116
|
#
|
77
|
-
dest_pattern = Monkeyshines::Utils::FilenamePattern.new(
|
78
|
-
:handle => 'shorturl-'+handle, :
|
79
|
-
dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern,
|
80
|
-
|
117
|
+
dest_pattern = Monkeyshines::Utils::FilenamePattern.new(Settings.dest_pattern,
|
118
|
+
:handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
|
119
|
+
dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(:pattern => Settings.dest_pattern,
|
120
|
+
:chunk_time => Settings.chunk_time.to_i, :handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
|
81
121
|
|
82
122
|
#
|
83
123
|
# Conditional store uses the key-value DB to boss around the flat files --
|
84
124
|
# requests are only made (and thus data is only output) if the url is missing
|
85
125
|
# from the key-value store.
|
86
126
|
#
|
87
|
-
dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
|
127
|
+
dest_store = Monkeyshines::Store::ConditionalStore.new(:cache => dest_cache, :store => dest_files)
|
88
128
|
|
89
129
|
#
|
90
130
|
# ******************** Fetcher ********************
|
91
131
|
#
|
92
132
|
fetcher = Monkeyshines::Fetcher::HttpHeadFetcher.new
|
93
133
|
|
134
|
+
#
|
135
|
+
# ******************** Success/Fail stats ********************
|
136
|
+
#
|
137
|
+
stats = ShorturlStats.new(0,0,0,0)
|
138
|
+
|
94
139
|
#
|
95
140
|
# ******************** Do this thing ********************
|
96
141
|
#
|
@@ -104,9 +149,18 @@ src_store.each do |bareurl, *args|
|
|
104
149
|
result = dest_store.set( req.url ) do
|
105
150
|
response = fetcher.get(req) # do the url fetch
|
106
151
|
next unless response.response_code || response.contents # don't store bad fetches
|
152
|
+
stats.code_sort(response.response_code) # count successes (301) and failures (404)
|
107
153
|
[response.scraped_at, response] # timestamp into cache, result into flat file
|
108
154
|
end
|
109
|
-
periodic_log.periodically{ ["%7d"%
|
155
|
+
periodic_log.periodically{ ["%7d"%stats.success_tot, 'successes', "%7d"%stats.failure_tot, 'failures', dest_store.size, req.response_code, result, req.url] }
|
156
|
+
graphite_sender.periodically do |metrics, iter, since|
|
157
|
+
rates = stats.rates_inst
|
158
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_rate", rates[0]]
|
159
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_rate", rates[1]]
|
160
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_tot_rate", stats.rates_tot[0]]
|
161
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_tot_rate", stats.rates_tot[1]]
|
162
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.current_file_size", dest_files.size]
|
163
|
+
end
|
110
164
|
end
|
111
165
|
dest_store.close
|
112
166
|
fetcher.close
|
@@ -0,0 +1,37 @@
|
|
1
|
+
#
# Running tally of short-URL fetch outcomes, bucketed by HTTP status code.
#
# success_tot, failure_tot:: counts accumulated over the life of the object
# success_last, fail_last::  counts accumulated since the previous call to
#                            #rates_inst, which resets them to zero
#
# Any member left nil (e.g. ShorturlStats.new with no arguments) is treated
# as zero.
#
class ShorturlStats < Struct.new(
    :success_tot,
    :failure_tot,
    :success_last,
    :fail_last
    )

  #
  # Files one response code into the tally.
  #
  # +code+ may be an Integer or a String; it is compared as a string.
  # A three-digit 4xx code counts as a failure, a three-digit 3xx code counts
  # as a success. Anything else is left out of the counts and reported
  # through Log.warn.
  #
  def code_sort code
    case code.to_s
    # Anchored so that only a whole three-digit code matches, not any longer
    # string that merely contains one.
    when /\A4\d{2}\z/
      self.failure_tot = (failure_tot || 0) + 1
      self.fail_last   = (fail_last   || 0) + 1
    when /\A3\d{2}\z/
      self.success_tot  = (success_tot  || 0) + 1
      self.success_last = (success_last || 0) + 1
    else
      Log.warn "Code #{code} not included in stats."
    end
  end

  #
  # Success and failure fractions since the previous call, as
  # [success_rate, failure_rate]. Returns [0,0] when nothing was counted in
  # that window.
  #
  # Calling this resets the per-window counters, so call it once per
  # reporting interval and reuse the returned pair.
  #
  def rates_inst
    rates = fractions(success_last, fail_last)
    self.success_last = 0
    self.fail_last    = 0
    rates
  end

  #
  # Success and failure fractions over the life of the object, as
  # [success_rate, failure_rate]. Returns [0,0] when nothing has been counted.
  #
  def rates_tot
    fractions(success_tot, failure_tot)
  end

private

  # Turns a pair of counts into the pair of their shares of the combined
  # total; [0,0] when the combined total is zero (avoids dividing by zero).
  def fractions successes, failures
    total = successes.to_f + failures.to_f
    return [0,0] if total == 0
    [successes.to_f / total, failures.to_f / total]
  end

end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby

#
# Reads a tab-separated dump, takes the last field of each line as a URL, and
# copies the short-URL part of it into one of three files: bit.ly links,
# tinyurl.com links, and links from the other shorteners listed in
# OTHER_SHORTURL_RE. One URL is written per line.
#

WORK_DIR = '/data/rawd/social/networks/twitter_friends/tokens_by_month/'

OTHER_SHORTURL_RE =
  %r{.*(http://(?:1link.in|4url.cc|6url.com|adjix.com|ad.vu|bellypath.com|bkite.com|budurl.com|canurl.com|chod.sk|cli.gs|decenturl.com|dn.vc|doiop.com|dwarfurl.com|easyuri.com|easyurl.net|ff.im|go2cut.com|gonext.org|hulu.com|hypem.com|ifood.tv|ilix.in|is.gd|ix.it|jdem.cz|jijr.com|kissa.be|kurl.us|litturl.com|lnkurl.com|memurl.com|metamark.net|miklos.dk|minilien.com|minurl.org|muhlink.com|myurl.in|myurl.us|notlong.com|ow.ly|plexp.com|poprl.com|qurlyq.com|redirx.com|s3nt.com|shorterlink.com|shortlinks.co.uk|short.to|shorturl.com|shrinklink.co.uk|shrinkurl.us|shrt.st|shurl.net|simurl.com|shorl.com|smarturl.eu|snipr.com|snipurl.com|snurl.com|sn.vc|starturl.com|surl.co.uk|tighturl.com|timesurl.at|tiny123.com|tiny.cc|tinylink.com|tobtr.com|traceurl.com|tr.im|tweetburner.com|twitpwr.com|twitthis.com|twurl.nl|u.mavrev.com|ur1.ca|url9.com|urlborg.com|urlbrief.com|urlcover.com|urlcut.com|urlhawk.com|url-press.com|urlsmash.com|urltea.com|urlvi.be|vimeo.com|wlink.us|xaddr.com|xil.in|xrl.us|x.se|xs.md|yatuc.com|yep.it|yweb.com|zi.ma|w3t.org)/.+)}

# Each pattern captures the short URL in group 1; the matching path is where
# those captures are written. A URL is tested against every pattern, in this
# order, so it lands in every file whose pattern it matches.
DESTINATIONS = [
  [ %r{.*(http://bit.ly/.+)},      '/home/doncarlo/shorturls/shorturls_bitly'   ],
  [ %r{.*(http://tinyurl.com/.+)}, '/home/doncarlo/shorturls/shorturls_tinyurl' ],
  [ OTHER_SHORTURL_RE,             '/home/doncarlo/shorturls/shorturls_other'   ],
]

sinks = []
begin
  # Open the outputs one at a time so that the ensure clause closes exactly
  # the ones that were opened, even if a later open fails.
  DESTINATIONS.each do |pattern, path|
    sinks << [pattern, File.open(path, 'w')]
  end

  # File.foreach closes the input file itself when the block finishes.
  File.foreach(WORK_DIR + 'urls_by_month-20091111.tsv') do |line|
    url = line.chomp.split("\t")[-1]
    sinks.each do |pattern, out|
      out << $1 + "\n" if url =~ pattern
    end
  end
ensure
  # Close (and thereby flush) every output that was opened.
  sinks.each { |_pattern, out| out.close }
end
|
21
|
+
|
@@ -1,25 +1,137 @@
|
|
1
1
|
require 'net/http'
|
2
|
+
require 'oauth'
|
2
3
|
Net::HTTP.version_1_2
|
3
4
|
module Monkeyshines
|
4
5
|
module Fetcher
|
5
6
|
|
6
7
|
#
|
7
|
-
class AuthedHttpFetcher
|
8
|
-
|
8
|
+
class AuthedHttpFetcher < HttpFetcher
|
9
|
+
attr_accessor :auth_params, :oauth_token, :oauth_secret, :consumer_key, :consumer_secret, :site, :authorize_path
|
10
|
+
#
|
11
|
+
# All the stuff below was copied from http://github.com/moomerman/twitter_oauth in the client.rb file
|
12
|
+
#
|
13
|
+
# def initialize(options = {})
|
14
|
+
# @consumer_key = options[:consumer_key]
|
15
|
+
# @consumer_secret = options[:consumer_secret]
|
16
|
+
# @token = options[:token]
|
17
|
+
# @secret = options[:secret]
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# def authorize(token, secret, options = {})
|
21
|
+
# request_token = OAuth::RequestToken.new(
|
22
|
+
# consumer, token, secret
|
23
|
+
# )
|
24
|
+
# @access_token = request_token.get_access_token(options)
|
25
|
+
# @token = @access_token.token
|
26
|
+
# @secret = @access_token.secret
|
27
|
+
# @access_token
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# def show(username)
|
31
|
+
# get("/users/show/#{username}.json")
|
32
|
+
# end
|
33
|
+
#
|
34
|
+
# # Returns the string "ok" in the requested format with a 200 OK HTTP status code.
|
35
|
+
# def test
|
36
|
+
# get("/help/test.json")
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# def request_token(options={})
|
40
|
+
# consumer.get_request_token(options)
|
41
|
+
# end
|
42
|
+
#
|
43
|
+
# def authentication_request_token(options={})
|
44
|
+
# consumer.options[:authorize_path] = '/oauth/authenticate'
|
45
|
+
# request_token(options)
|
46
|
+
# end
|
47
|
+
#
|
48
|
+
# private
|
49
|
+
#
|
50
|
+
# def consumer
|
51
|
+
# @consumer ||= OAuth::Consumer.new(
|
52
|
+
# @consumer_key,
|
53
|
+
# @consumer_secret,
|
54
|
+
# { :site => "http://api.twitter.com" }
|
55
|
+
# )
|
56
|
+
# end
|
57
|
+
#
|
58
|
+
# def access_token
|
59
|
+
# @access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
|
60
|
+
# end
|
61
|
+
#
|
62
|
+
# def get(path, headers={})
|
63
|
+
# headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
|
64
|
+
# oauth_response = access_token.get("/1#{path}", headers)
|
65
|
+
# JSON.parse(oauth_response.body)
|
66
|
+
# end
|
67
|
+
#
|
68
|
+
# def post(path, body='', headers={})
|
69
|
+
# headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
|
70
|
+
# oauth_response = access_token.post("/1#{path}", body, headers)
|
71
|
+
# JSON.parse(oauth_response.body)
|
72
|
+
# end
|
73
|
+
#
|
74
|
+
# def delete(path, headers={})
|
75
|
+
# headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
|
76
|
+
# oauth_response = access_token.delete("/1#{path}", headers)
|
77
|
+
# JSON.parse(oauth_response.body)
|
78
|
+
# end
|
9
79
|
|
10
|
-
|
80
|
+
|
81
|
+
def initialize _options={}
|
82
|
+
super _options
|
83
|
+
# These should get called by calling super, right?
|
84
|
+
# self.username = options[:username]
|
85
|
+
# self.password = options[:password]
|
86
|
+
# self.http_req_options = {}
|
87
|
+
# self.http_req_options["User-Agent"] = options[:user_agent] || USER_AGENT
|
88
|
+
# self.http_req_options["Connection"] = "keep-alive"
|
89
|
+
self.oauth_token = options[:oauth_token]
|
90
|
+
self.oauth_secret = options[:oauth_token_secret]
|
91
|
+
self.consumer_key = options[:consumer_key]
|
92
|
+
self.consumer_secret = options[:consumer_secret]
|
93
|
+
self.site = options[:site]
|
94
|
+
self.authorize_path = options[:authorize_path]
|
95
|
+
end
|
96
|
+
|
97
|
+
def request_token(options={})
|
98
|
+
consumer.options[:authorize_path] = @authorize_path
|
99
|
+
consumer.get_request_token(options)
|
11
100
|
end
|
12
101
|
|
13
|
-
def authorize
|
102
|
+
def authorize(token, secret, options = {})
|
103
|
+
request_token = OAuth::RequestToken.new(
|
104
|
+
consumer, token, secret
|
105
|
+
)
|
106
|
+
@access_token = request_token.get_access_token(options)
|
107
|
+
@token = @access_token.token
|
108
|
+
@secret = @access_token.secret
|
109
|
+
@access_token
|
14
110
|
end
|
15
111
|
|
16
112
|
def get_access_token
|
17
113
|
end
|
18
114
|
|
19
|
-
def
|
115
|
+
def oauth_token
|
116
|
+
@oauth_token
|
117
|
+
end
|
118
|
+
|
119
|
+
def oauth_secret
|
120
|
+
@oauth_secret
|
121
|
+
end
|
122
|
+
|
123
|
+
def consumer
|
124
|
+
@consumer ||= OAuth::Consumer.new(
|
125
|
+
@consumer_key,
|
126
|
+
@consumer_secret,
|
127
|
+
{ :site => @site }
|
128
|
+
)
|
20
129
|
end
|
21
|
-
|
130
|
+
|
131
|
+
def access_token
|
132
|
+
@access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
|
22
133
|
end
|
134
|
+
|
23
135
|
def session_key
|
24
136
|
end
|
25
137
|
|
@@ -82,17 +82,18 @@ module Monkeyshines
|
|
82
82
|
# Response-based sleep time
|
83
83
|
sleep_time = 0
|
84
84
|
case response
|
85
|
-
when Net::HTTPSuccess then return
|
86
|
-
when Net::HTTPRedirection then return
|
87
|
-
when Net::HTTPBadRequest then sleep_time =
|
88
|
-
when Net::HTTPUnauthorized then sleep_time = 0
|
89
|
-
when Net::HTTPForbidden then sleep_time =
|
90
|
-
when Net::HTTPNotFound then sleep_time = 0
|
91
|
-
when Net::HTTPServiceUnavailable then sleep_time =
|
92
|
-
when Net::HTTPServerError then sleep_time = 2
|
85
|
+
when Net::HTTPSuccess then return # 2xx
|
86
|
+
when Net::HTTPRedirection then return # 3xx
|
87
|
+
when Net::HTTPBadRequest then sleep_time = 10 # 400 (rate limit, probably)
|
88
|
+
when Net::HTTPUnauthorized then sleep_time = 0 # 401 (protected user, probably)
|
89
|
+
when Net::HTTPForbidden then sleep_time = 10 # 403 update limit
|
90
|
+
when Net::HTTPNotFound then sleep_time = 0 # 404 deleted or suspended
|
91
|
+
when Net::HTTPServiceUnavailable then sleep_time = 10 # 503 Fail Whale
|
92
|
+
when Net::HTTPServerError then sleep_time = 2 # 5xx All other server errors
|
93
93
|
else sleep_time = 1
|
94
94
|
end
|
95
|
-
|
95
|
+
sleep_time += response['retry-after'].to_i rescue 0
|
96
|
+
Log.warn "Received #{response.code} and retry-after #{response['retry-after']}, sleeping #{sleep_time} ('#{response.message[0..200].gsub(%r{[\r\n\t]}, " ")}' from #{@host}+#{@connection_opened_at}): '#{response.body[0..200].gsub(%r{[\r\n\t]}, " ")}'"
|
96
97
|
sleep sleep_time
|
97
98
|
end
|
98
99
|
|
@@ -24,7 +24,8 @@ module Monkeyshines
|
|
24
24
|
#
|
25
25
|
def periodically &block
|
26
26
|
super do
|
27
|
-
|
27
|
+
now = Time.now.utc.to_f
|
28
|
+
result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
|
28
29
|
Log.info result.join("\t")
|
29
30
|
end
|
30
31
|
end
|
@@ -18,12 +18,13 @@ module Monkeyshines
|
|
18
18
|
#
|
19
19
|
class PeriodicMonitor
|
20
20
|
attr_accessor :time_interval, :iter_interval
|
21
|
-
attr_accessor :last_time, :iter, :started_at
|
21
|
+
attr_accessor :last_time, :current_iter, :iter, :started_at
|
22
22
|
|
23
23
|
def initialize options={}
|
24
24
|
self.started_at = Time.now.utc.to_f
|
25
25
|
self.last_time = started_at
|
26
26
|
self.iter = 0
|
27
|
+
self.current_iter = 0
|
27
28
|
self.time_interval = options[:time]
|
28
29
|
self.iter_interval = options[:iters]
|
29
30
|
end
|
@@ -42,10 +43,14 @@ module Monkeyshines
|
|
42
43
|
def since
|
43
44
|
Time.now.utc.to_f - started_at
|
44
45
|
end
|
45
|
-
#
|
46
|
+
# Overall iterations per second
|
46
47
|
def rate
|
47
48
|
iter.to_f / since.to_f
|
48
49
|
end
|
50
|
+
# "Instantaneous" iterations per second
|
51
|
+
def inst_rate now
|
52
|
+
current_iter.to_f / (now-last_time).to_f
|
53
|
+
end
|
49
54
|
|
50
55
|
#
|
51
56
|
# if the interval conditions are met, executes block; otherwise just does
|
@@ -53,10 +58,12 @@ module Monkeyshines
|
|
53
58
|
#
|
54
59
|
def periodically &block
|
55
60
|
self.iter += 1
|
61
|
+
self.current_iter += 1
|
56
62
|
now = Time.now.utc.to_f
|
57
63
|
if enough_iterations? || enough_time?(now)
|
58
64
|
block.call(iter, (now-last_time))
|
59
65
|
self.last_time = now
|
66
|
+
self.current_iter = 0
|
60
67
|
end
|
61
68
|
end
|
62
69
|
end
|
data/lib/monkeyshines/options.rb
CHANGED
data/lib/monkeyshines/runner.rb
CHANGED
@@ -142,6 +142,7 @@ module Monkeyshines
|
|
142
142
|
def setup_main_log
|
143
143
|
unless options[:log][:dest].blank?
|
144
144
|
log_file = "%s/log/%s" % [WORK_DIR, options[:log][:dest]]
|
145
|
+
require 'fileutils'
|
145
146
|
FileUtils.mkdir_p(File.dirname(log_file))
|
146
147
|
$stdout = $stderr = File.open( log_file+"-console.log", "a" )
|
147
148
|
end
|
@@ -12,13 +12,14 @@ module Monkeyshines
|
|
12
12
|
def initialize options
|
13
13
|
raise "URI for #{self.class} is required" if options[:uri].blank?
|
14
14
|
self.db_host, self.db_port = options[:uri].to_s.split(':')
|
15
|
+
self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
|
15
16
|
super options
|
16
17
|
end
|
17
18
|
|
18
19
|
def db
|
19
20
|
return @db if @db
|
20
21
|
@db ||= TokyoTyrant::RDB.new
|
21
|
-
@db.open(db_host, db_port) or raise("Can't open DB #{db_host}
|
22
|
+
@db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
|
22
23
|
@db
|
23
24
|
end
|
24
25
|
|
@@ -75,7 +75,7 @@ module Monkeyshines
|
|
75
75
|
|
76
76
|
# Memoized: the hostname for the machine running this script.
|
77
77
|
def hostname
|
78
|
-
@hostname ||= ENV['HOSTNAME'] || `hostname
|
78
|
+
@hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
|
79
79
|
end
|
80
80
|
# Memoized: the Process ID for this invocation.
|
81
81
|
def pid
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: monkeyshines
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 19
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Philip (flip) Kromer
|
@@ -9,39 +15,51 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
18
|
+
date: 2010-07-15 00:00:00 +00:00
|
13
19
|
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: addressable
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
20
26
|
requirements:
|
21
27
|
- - ">="
|
22
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
23
32
|
version: "0"
|
24
|
-
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
25
35
|
- !ruby/object:Gem::Dependency
|
26
36
|
name: uuid
|
27
|
-
|
28
|
-
|
29
|
-
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
30
40
|
requirements:
|
31
41
|
- - ">="
|
32
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
33
46
|
version: "0"
|
34
|
-
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
35
49
|
- !ruby/object:Gem::Dependency
|
36
50
|
name: wukong
|
37
|
-
|
38
|
-
|
39
|
-
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
40
54
|
requirements:
|
41
55
|
- - ">="
|
42
56
|
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
43
60
|
version: "0"
|
44
|
-
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
45
63
|
description: A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.
|
46
64
|
email: flip@infochimps.org
|
47
65
|
executables: []
|
@@ -64,12 +82,12 @@ files:
|
|
64
82
|
- examples/shorturls/bulkload_shorturls.rb
|
65
83
|
- examples/shorturls/extract_urls.rb
|
66
84
|
- examples/shorturls/multiplex_shorturl_cache.rb
|
67
|
-
- examples/shorturls/old/multidump_and_fix_shorturls.rb
|
68
|
-
- examples/shorturls/old/shorturl_stats.rb
|
69
85
|
- examples/shorturls/scrape_shorturls.rb
|
70
86
|
- examples/shorturls/shorturl_request.rb
|
71
87
|
- examples/shorturls/shorturl_sequence.rb
|
72
88
|
- examples/shorturls/shorturl_start_tyrant.sh
|
89
|
+
- examples/shorturls/shorturl_stats.rb
|
90
|
+
- examples/shorturls/split_short_urls.rb
|
73
91
|
- examples/shorturls/start_shorturl_cache.sh
|
74
92
|
- lib/monkeyshines.rb
|
75
93
|
- lib/monkeyshines/extensions.rb
|
@@ -139,37 +157,43 @@ rdoc_options:
|
|
139
157
|
require_paths:
|
140
158
|
- lib
|
141
159
|
required_ruby_version: !ruby/object:Gem::Requirement
|
160
|
+
none: false
|
142
161
|
requirements:
|
143
162
|
- - ">="
|
144
163
|
- !ruby/object:Gem::Version
|
164
|
+
hash: 3
|
165
|
+
segments:
|
166
|
+
- 0
|
145
167
|
version: "0"
|
146
|
-
version:
|
147
168
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
148
170
|
requirements:
|
149
171
|
- - ">="
|
150
172
|
- !ruby/object:Gem::Version
|
173
|
+
hash: 3
|
174
|
+
segments:
|
175
|
+
- 0
|
151
176
|
version: "0"
|
152
|
-
version:
|
153
177
|
requirements: []
|
154
178
|
|
155
179
|
rubyforge_project:
|
156
|
-
rubygems_version: 1.3.
|
180
|
+
rubygems_version: 1.3.7
|
157
181
|
signing_key:
|
158
182
|
specification_version: 3
|
159
183
|
summary: A simple scraper for directed scrapes of APIs, feed or structured HTML.
|
160
184
|
test_files:
|
161
185
|
- spec/monkeyshines_spec.rb
|
162
186
|
- spec/spec_helper.rb
|
163
|
-
- examples/
|
164
|
-
- examples/
|
165
|
-
- examples/rename_tree/rename_ripd_tree.rb
|
166
|
-
- examples/rss_feeds/scrape_rss_feeds.rb
|
167
|
-
- examples/shorturls/bulkdump_shorturls.rb
|
168
|
-
- examples/shorturls/bulkload_shorturls.rb
|
169
|
-
- examples/shorturls/extract_urls.rb
|
187
|
+
- examples/shorturls/shorturl_stats.rb
|
188
|
+
- examples/shorturls/shorturl_request.rb
|
170
189
|
- examples/shorturls/multiplex_shorturl_cache.rb
|
171
|
-
- examples/shorturls/
|
172
|
-
- examples/shorturls/old/shorturl_stats.rb
|
190
|
+
- examples/shorturls/bulkload_shorturls.rb
|
173
191
|
- examples/shorturls/scrape_shorturls.rb
|
174
|
-
- examples/shorturls/
|
192
|
+
- examples/shorturls/extract_urls.rb
|
193
|
+
- examples/shorturls/bulkdump_shorturls.rb
|
175
194
|
- examples/shorturls/shorturl_sequence.rb
|
195
|
+
- examples/shorturls/split_short_urls.rb
|
196
|
+
- examples/rename_tree/rename_hdp_tree.rb
|
197
|
+
- examples/rename_tree/rename_ripd_tree.rb
|
198
|
+
- examples/rss_feeds/scrape_rss_feeds.rb
|
199
|
+
- examples/bulk_urls/scrape_bulk_urls.rb
|
@@ -1,66 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
-
require 'rubygems'
|
4
|
-
require 'trollop'
|
5
|
-
require 'wukong'
|
6
|
-
require 'monkeyshines'
|
7
|
-
require 'shorturl_request'
|
8
|
-
require 'shorturl_sequence'
|
9
|
-
require 'monkeyshines/utils/uri'
|
10
|
-
|
11
|
-
#
|
12
|
-
# Command line options
|
13
|
-
#
|
14
|
-
opts = Trollop::options do
|
15
|
-
opt :from_type, 'Class name for scrape store to load from', :type => String
|
16
|
-
opt :from, 'URI for scrape store to load from', :type => String
|
17
|
-
opt :into, 'Filename for flat TSV dump', :type => String
|
18
|
-
opt :log, 'File to store log', :type => String
|
19
|
-
end
|
20
|
-
Trollop::die :from_type unless opts[:from_type]
|
21
|
-
|
22
|
-
# ******************** Read From ********************
|
23
|
-
src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
|
24
|
-
src_store = src_store_klass.new(opts[:from])
|
25
|
-
Log.info "Loaded store with #{src_store.size}"
|
26
|
-
|
27
|
-
# ******************** Write into ********************
|
28
|
-
DUMPFILE_BASE = opts[:into]
|
29
|
-
def make_store uri
|
30
|
-
Monkeyshines::Store::FlatFileStore.new "#{DUMPFILE_BASE+"-"+uri}.tsv", :filemode => 'w'
|
31
|
-
end
|
32
|
-
dests = { }
|
33
|
-
[ 'tinyurl', 'bitly', 'other'
|
34
|
-
].each do |handle|
|
35
|
-
dests[handle] = make_store handle
|
36
|
-
end
|
37
|
-
|
38
|
-
# ******************** Log ********************
|
39
|
-
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
|
40
|
-
|
41
|
-
# ******************** Cross Load ********************
|
42
|
-
# Read , process, dump
|
43
|
-
iter = 0
|
44
|
-
src_store.each do |key, hsh|
|
45
|
-
hsh['contents'] ||= hsh.delete 'expanded_url'
|
46
|
-
hsh['response_code'] = nil if hsh['response_code'] == 'nil'
|
47
|
-
hsh['contents'] = nil if hsh['contents'] == 'nil'
|
48
|
-
unless hsh['contents'] || hsh['response_code']
|
49
|
-
# Log.info "removing #{hsh.inspect}"
|
50
|
-
src_store.db.out(key)
|
51
|
-
next
|
52
|
-
end
|
53
|
-
hsh['response_message'] = nil if hsh['response_message'] == 'nil'
|
54
|
-
hsh['url'] ||= hsh.delete 'short_url'
|
55
|
-
req = ShorturlRequest.from_hash hsh
|
56
|
-
periodic_log.periodically{ [src_store.size, req.to_flat] }
|
57
|
-
|
58
|
-
req.contents = Addressable::URI.scrub_url req.contents if req.contents
|
59
|
-
|
60
|
-
case
|
61
|
-
when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].save req
|
62
|
-
when (key =~ %r{^http://bit.ly/(.*)}) then dests['bitly' ].save req
|
63
|
-
else dests['other' ].save req
|
64
|
-
end
|
65
|
-
# src_store.save(key, req.to_hash.compact)
|
66
|
-
end
|
@@ -1,81 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
-
#require 'rubygems'
|
4
|
-
# require 'wukong'
|
5
|
-
require 'monkeyshines'
|
6
|
-
# require 'monkeyshines/utils/uri'
|
7
|
-
# require 'monkeyshines/utils/filename_pattern'
|
8
|
-
# require 'monkeyshines/store/conditional_store'
|
9
|
-
# require 'monkeyshines/fetcher/http_head_fetcher'
|
10
|
-
# require 'trollop' # gem install trollop
|
11
|
-
# require 'shorturl_request'
|
12
|
-
require 'shorturl_sequence'
|
13
|
-
|
14
|
-
digits = { } ; (('0'..'9').to_a+('a'..'z').to_a).each do |ch| digits[ch] = 0 end
|
15
|
-
|
16
|
-
# (1..10000).each do |idx|
|
17
|
-
# s = ShorturlSequence.encode_integer idx, 36
|
18
|
-
# digits[s[0..0]] += 1
|
19
|
-
# end
|
20
|
-
# p digits
|
21
|
-
# puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}
|
22
|
-
|
23
|
-
class Histo
|
24
|
-
attr_accessor :buckets
|
25
|
-
def initialize
|
26
|
-
self.buckets = { }
|
27
|
-
end
|
28
|
-
def << val
|
29
|
-
buckets[val] ||= 0
|
30
|
-
buckets[val] += 1
|
31
|
-
end
|
32
|
-
def dump
|
33
|
-
buckets.sort.each do |val, count|
|
34
|
-
puts "%10d\t%s"%[count,val]
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
len_histo = Histo.new
|
40
|
-
num_histo = Histo.new
|
41
|
-
ltr_histo = Histo.new
|
42
|
-
iter = 0
|
43
|
-
|
44
|
-
# 123456789-123456789-
|
45
|
-
# http://bit.ly/
|
46
|
-
# http://tinyurl.com/
|
47
|
-
BASE_URL = "http://is.gd/"
|
48
|
-
RADIX = 62
|
49
|
-
HANDLE = BASE_URL.gsub(%r{^http://},'').gsub(/\.com$/,'').gsub(/\W+/,'')
|
50
|
-
BASE_URL_LEN = BASE_URL.length
|
51
|
-
MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
|
52
|
-
SIX_CHARS = RADIX**6
|
53
|
-
File.open("rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv"
|
54
|
-
) do |reqfile|
|
55
|
-
reqfile.each do |url|
|
56
|
-
#decode
|
57
|
-
next unless url.length <= MAX_TAIL_LEN
|
58
|
-
tail = url.chomp.strip[BASE_URL_LEN..-1] || ''
|
59
|
-
# tail.downcase!
|
60
|
-
asnum = ShorturlSequence.decode_str tail, RADIX rescue nil # tail.to_i(36) rescue -1
|
61
|
-
next unless asnum && asnum < SIX_CHARS
|
62
|
-
size = (asnum / 1_000_000)
|
63
|
-
len = tail.length
|
64
|
-
# track stats
|
65
|
-
len_histo << len
|
66
|
-
num_histo << size
|
67
|
-
ltr_histo << "%s-%s" % [len, tail[0..0]] # + (len > 1 ? '.'* (len-1) : '')
|
68
|
-
puts iter if ((iter += 1) % 1_000_000 == 0)
|
69
|
-
|
70
|
-
end
|
71
|
-
end
|
72
|
-
puts "Integer magnitude of decoded (M):"
|
73
|
-
num_histo.dump
|
74
|
-
puts "Length of encoded:"
|
75
|
-
len_histo.dump
|
76
|
-
puts "First Letter:"
|
77
|
-
ltr_histo.dump
|
78
|
-
|
79
|
-
|
80
|
-
# puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
|
81
|
-
# puts [asnum, tail, url].inspect
|