monkeyshines 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/examples/shorturls/scrape_shorturls.rb +89 -35
- data/examples/shorturls/shorturl_stats.rb +37 -0
- data/examples/shorturls/split_short_urls.rb +21 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +118 -6
- data/lib/monkeyshines/fetcher/http_fetcher.rb +10 -9
- data/lib/monkeyshines/monitor/periodic_logger.rb +2 -1
- data/lib/monkeyshines/monitor/periodic_monitor.rb +9 -2
- data/lib/monkeyshines/options.rb +2 -0
- data/lib/monkeyshines/runner.rb +1 -0
- data/lib/monkeyshines/store/flat_file_store.rb +6 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +2 -1
- data/lib/monkeyshines/utils/filename_pattern.rb +1 -1
- metadata +53 -29
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +0 -66
- data/examples/shorturls/old/shorturl_stats.rb +0 -81
@@ -1,16 +1,19 @@
|
|
1
1
|
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__); $: << File.dirname(__FILE__)+'/../../../graphiterb/lib'
|
3
3
|
require 'rubygems'
|
4
4
|
require 'wukong'
|
5
5
|
require 'monkeyshines'
|
6
|
+
require 'configliere'
|
6
7
|
#
|
7
8
|
require 'shorturl_request'
|
8
9
|
require 'shorturl_sequence'
|
10
|
+
require 'shorturl_stats'
|
9
11
|
require 'monkeyshines/utils/uri'
|
10
12
|
require 'monkeyshines/utils/filename_pattern'
|
11
13
|
require 'monkeyshines/store/conditional_store'
|
12
14
|
require 'monkeyshines/fetcher/http_head_fetcher'
|
13
|
-
require '
|
15
|
+
require 'graphiterb' # needs graphiterb - simple ruby interface for graphite
|
16
|
+
# require 'trollop' # gem install trollop
|
14
17
|
|
15
18
|
# ===========================================================================
|
16
19
|
#
|
@@ -26,39 +29,74 @@ require 'trollop' # gem install trollop
|
|
26
29
|
# --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
|
27
30
|
#
|
28
31
|
#
|
29
|
-
opts = Trollop::options do
|
30
|
-
opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
|
31
|
-
opt :log, "Log file name; leave blank to use STDERR", :type => String
|
32
|
-
# input from file
|
33
|
-
opt :from, "Location of URLs to scrape", :type => String
|
34
|
-
opt :skip, "Initial lines to skip", :type => Integer
|
35
|
-
# OR do a random walk
|
36
|
-
opt :random, "Generate and visit random URL suffixes"
|
37
|
-
opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
38
|
-
opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
39
|
-
opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
|
40
|
-
# output storage
|
41
|
-
opt :cache_loc, "URI for cache server", :type => String
|
42
|
-
opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
|
43
|
-
opt :dest_dir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
|
44
|
-
opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
|
45
|
-
end
|
46
|
-
handle = opts[:base_url].gsub(/\.com$/,'').gsub(/\W+/,'')
|
47
32
|
|
33
|
+
Configliere.use :commandline, :config_file, :define
|
34
|
+
Settings.read 'shorturls.yaml' #~/.configliere/shorturls.yaml
|
35
|
+
Settings.define :base_url, :description => "Host part of URL: eg tinyurl.com", :type => String, :required => true
|
36
|
+
# Settings.define :log, :description => "Log file name; leave blank to use STDERR", :type => String
|
37
|
+
Settings.define :log_time, :description => "Log time interval, in seconds, for periodic logger and Graphite logger", :type => Integer, :default => 60
|
38
|
+
Settings.define :log_iters, :description => "Log iteration interval for periodic logger and Graphite logger", :type => Integer, :default => 10000
|
39
|
+
# input from file
|
40
|
+
Settings.define :file_from, :description => "Location of URLs to scrape", :type => String
|
41
|
+
Settings.define :file_skip, :description => "Initial lines to skip", :type => Integer
|
42
|
+
# OR do a random walk
|
43
|
+
Settings.define :random, :description => "Generate and visit random URL suffixes"
|
44
|
+
Settings.define :random_min, :description => "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
45
|
+
Settings.define :random_max, :description => "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
46
|
+
Settings.define :random_radix, :description => "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
|
47
|
+
# output storage
|
48
|
+
Settings.define :cache_loc, :description => "URI for cache server", :type => String
|
49
|
+
Settings.define :chunk_time, :description => "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
|
50
|
+
Settings.define :rootdir, :description => "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd/shorturls'
|
51
|
+
Settings.define :dest_pattern, :description => "Pattern for dump file output", :default => ":rootdir/:date/:handle+:timestamp-:pid.tsv"
|
52
|
+
Settings.resolve!
|
53
|
+
Log = Logger.new($stderr) unless defined?(Log)
|
54
|
+
|
55
|
+
# Removed trollop optioning, added in configliere instead
|
56
|
+
# opts = Trollop::options do
|
57
|
+
# opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
|
58
|
+
# opt :log, "Log file name; leave blank to use STDERR", :type => String
|
59
|
+
# # input from file
|
60
|
+
# opt :from, "Location of URLs to scrape", :type => String
|
61
|
+
# opt :skip, "Initial lines to skip", :type => Integer
|
62
|
+
# # OR do a random walk
|
63
|
+
# opt :random, "Generate and visit random URL suffixes"
|
64
|
+
# opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
65
|
+
# opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
66
|
+
# opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
|
67
|
+
# # output storage
|
68
|
+
# opt :cache_loc, "URI for cache server", :type => String
|
69
|
+
# opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
|
70
|
+
# opt :rootdir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
|
71
|
+
# opt :dest_pattern, "Pattern for dump file output", :default => ":rootdir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
|
72
|
+
# end
|
73
|
+
handle = Settings.base_url.gsub(/\.com$/,'').gsub(/\W+/,'')
|
74
|
+
hostname ||= `hostname`.chomp.gsub(".","_")
|
75
|
+
|
76
|
+
|
77
|
+
#
|
48
78
|
# ******************** Log ********************
|
49
|
-
|
50
|
-
|
79
|
+
#
|
80
|
+
# (I don't think the log file name ever gets used)
|
81
|
+
# Settings.log = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (Settings.log=='')
|
82
|
+
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
|
83
|
+
|
84
|
+
#
|
85
|
+
# ******************** Graphite Sender ***********************
|
86
|
+
#
|
87
|
+
graphite_sender = Graphiterb::GraphiteLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
|
51
88
|
|
52
89
|
#
|
53
90
|
# ******************** Load from store or random walk ********************
|
54
91
|
#
|
55
|
-
if
|
56
|
-
|
57
|
-
src_store.
|
58
|
-
|
92
|
+
if Settings.file_from
|
93
|
+
# Settings.filename = Settings.file_from
|
94
|
+
src_store = Monkeyshines::Store::FlatFileStore.new(:filename => Settings.file_from, :skip => Settings.file_skip.to_i) # + {:filemode => 'r'}
|
95
|
+
# src_store.skip!(Settings.file_skip.to_i) if Settings.file_skip
|
96
|
+
elsif Settings.random
|
59
97
|
src_store = Monkeyshines::Store::RandomUrlStream.new_from_command_line(opts)
|
60
98
|
else
|
61
|
-
|
99
|
+
Settings.die "Need to either say --random or --file_from=filename"
|
62
100
|
end
|
63
101
|
|
64
102
|
#
|
@@ -67,30 +105,37 @@ end
|
|
67
105
|
# Track visited URLs with key-value database
|
68
106
|
#
|
69
107
|
RDB_PORTS = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
|
70
|
-
cache_loc =
|
71
|
-
dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(cache_loc)
|
108
|
+
cache_loc = Settings.cache_loc || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
|
109
|
+
dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(:uri => cache_loc)
|
110
|
+
|
111
|
+
|
72
112
|
# dest_cache = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)
|
73
113
|
|
74
114
|
#
|
75
115
|
# Store the data into flat files
|
76
116
|
#
|
77
|
-
dest_pattern = Monkeyshines::Utils::FilenamePattern.new(
|
78
|
-
:handle => 'shorturl-'+handle, :
|
79
|
-
dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern,
|
80
|
-
|
117
|
+
dest_pattern = Monkeyshines::Utils::FilenamePattern.new(Settings.dest_pattern,
|
118
|
+
:handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
|
119
|
+
dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(:pattern => Settings.dest_pattern,
|
120
|
+
:chunk_time => Settings.chunk_time.to_i, :handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
|
81
121
|
|
82
122
|
#
|
83
123
|
# Conditional store uses the key-value DB to boss around the flat files --
|
84
124
|
# requests are only made (and thus data is only output) if the url is missing
|
85
125
|
# from the key-value store.
|
86
126
|
#
|
87
|
-
dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
|
127
|
+
dest_store = Monkeyshines::Store::ConditionalStore.new(:cache => dest_cache, :store => dest_files)
|
88
128
|
|
89
129
|
#
|
90
130
|
# ******************** Fetcher ********************
|
91
131
|
#
|
92
132
|
fetcher = Monkeyshines::Fetcher::HttpHeadFetcher.new
|
93
133
|
|
134
|
+
#
|
135
|
+
# ******************** Success/Fail stats ********************
|
136
|
+
#
|
137
|
+
stats = ShorturlStats.new(0,0,0,0)
|
138
|
+
|
94
139
|
#
|
95
140
|
# ******************** Do this thing ********************
|
96
141
|
#
|
@@ -104,9 +149,18 @@ src_store.each do |bareurl, *args|
|
|
104
149
|
result = dest_store.set( req.url ) do
|
105
150
|
response = fetcher.get(req) # do the url fetch
|
106
151
|
next unless response.response_code || response.contents # don't store bad fetches
|
152
|
+
stats.code_sort(response.response_code) # count successes (301) and failures (404)
|
107
153
|
[response.scraped_at, response] # timestamp into cache, result into flat file
|
108
154
|
end
|
109
|
-
periodic_log.periodically{ ["%7d"%
|
155
|
+
periodic_log.periodically{ ["%7d"%stats.success_tot, 'successes', "%7d"%stats.failure_tot, 'failures', dest_store.size, req.response_code, result, req.url] }
|
156
|
+
graphite_sender.periodically do |metrics, iter, since|
|
157
|
+
rates = stats.rates_inst
|
158
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_rate", rates[0]]
|
159
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_rate", rates[1]]
|
160
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_tot_rate", stats.rates_tot[0]]
|
161
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_tot_rate", stats.rates_tot[1]]
|
162
|
+
metrics << ["scraper.#{hostname}.shorturl.#{handle}.current_file_size", dest_files.size]
|
163
|
+
end
|
110
164
|
end
|
111
165
|
dest_store.close
|
112
166
|
fetcher.close
|
@@ -0,0 +1,37 @@
|
|
1
|
+
class ShorturlStats < Struct.new(
|
2
|
+
:success_tot,
|
3
|
+
:failure_tot,
|
4
|
+
:success_last,
|
5
|
+
:fail_last
|
6
|
+
)
|
7
|
+
|
8
|
+
def code_sort code
|
9
|
+
case code.to_s
|
10
|
+
when /4\d{2}/
|
11
|
+
self.failure_tot += 1
|
12
|
+
self.fail_last += 1
|
13
|
+
when /3\d{2}/
|
14
|
+
self.success_tot += 1
|
15
|
+
self.success_last += 1
|
16
|
+
else
|
17
|
+
Log.warn "Code #{code} not included in stats."
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
def rates_inst
|
22
|
+
return [0,0] if (self.success_last.to_f + self.fail_last.to_f) == 0
|
23
|
+
s_rate = (self.success_last.to_f)/(self.success_last.to_f + self.fail_last.to_f)
|
24
|
+
f_rate = (self.fail_last.to_f)/(self.success_last.to_f + self.fail_last.to_f)
|
25
|
+
self.success_last = 0
|
26
|
+
self.fail_last = 0
|
27
|
+
[s_rate,f_rate]
|
28
|
+
end
|
29
|
+
|
30
|
+
def rates_tot
|
31
|
+
return [0,0] if (self.success_tot.to_f + self.failure_tot.to_f) == 0
|
32
|
+
st_rate = (self.success_tot.to_f)/(self.success_tot.to_f + self.failure_tot.to_f)
|
33
|
+
ft_rate = (self.failure_tot.to_f)/(self.success_tot.to_f + self.failure_tot.to_f)
|
34
|
+
[st_rate,ft_rate]
|
35
|
+
end
|
36
|
+
|
37
|
+
end
|
@@ -0,0 +1,21 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
WORK_DIR = '/data/rawd/social/networks/twitter_friends/tokens_by_month/'
|
4
|
+
|
5
|
+
OTHER_SHORTURL_RE =
|
6
|
+
%r{.*(http://(?:1link.in|4url.cc|6url.com|adjix.com|ad.vu|bellypath.com|bkite.com|budurl.com|canurl.com|chod.sk|cli.gs|decenturl.com|dn.vc|doiop.com|dwarfurl.com|easyuri.com|easyurl.net|ff.im|go2cut.com|gonext.org|hulu.com|hypem.com|ifood.tv|ilix.in|is.gd|ix.it|jdem.cz|jijr.com|kissa.be|kurl.us|litturl.com|lnkurl.com|memurl.com|metamark.net|miklos.dk|minilien.com|minurl.org|muhlink.com|myurl.in|myurl.us|notlong.com|ow.ly|plexp.com|poprl.com|qurlyq.com|redirx.com|s3nt.com|shorterlink.com|shortlinks.co.uk|short.to|shorturl.com|shrinklink.co.uk|shrinkurl.us|shrt.st|shurl.net|simurl.com|shorl.com|smarturl.eu|snipr.com|snipurl.com|snurl.com|sn.vc|starturl.com|surl.co.uk|tighturl.com|timesurl.at|tiny123.com|tiny.cc|tinylink.com|tobtr.com|traceurl.com|tr.im|tweetburner.com|twitpwr.com|twitthis.com|twurl.nl|u.mavrev.com|ur1.ca|url9.com|urlborg.com|urlbrief.com|urlcover.com|urlcut.com|urlhawk.com|url-press.com|urlsmash.com|urltea.com|urlvi.be|vimeo.com|wlink.us|xaddr.com|xil.in|xrl.us|x.se|xs.md|yatuc.com|yep.it|yweb.com|zi.ma|w3t.org)/.+)}
|
7
|
+
|
8
|
+
bitly_file = File.open('/home/doncarlo/shorturls/shorturls_bitly','w')
|
9
|
+
tinyurl_file = File.open('/home/doncarlo/shorturls/shorturls_tinyurl','w')
|
10
|
+
otherurl_file = File.open('/home/doncarlo/shorturls/shorturls_other','w')
|
11
|
+
|
12
|
+
|
13
|
+
|
14
|
+
File.open(WORK_DIR + 'urls_by_month-20091111.tsv').each do |line|
|
15
|
+
line.chomp!
|
16
|
+
url = line.split("\t")[-1]
|
17
|
+
bitly_file << $1 + "\n" if url =~ %r{.*(http://bit.ly/.+)}
|
18
|
+
tinyurl_file << $1 + "\n" if url =~ %r{.*(http://tinyurl.com/.+)}
|
19
|
+
otherurl_file << $1 + "\n" if url =~ OTHER_SHORTURL_RE
|
20
|
+
end
|
21
|
+
|
@@ -1,25 +1,137 @@
|
|
1
1
|
require 'net/http'
|
2
|
+
require 'oauth'
|
2
3
|
Net::HTTP.version_1_2
|
3
4
|
module Monkeyshines
|
4
5
|
module Fetcher
|
5
6
|
|
6
7
|
#
|
7
|
-
class AuthedHttpFetcher
|
8
|
-
|
8
|
+
class AuthedHttpFetcher < HttpFetcher
|
9
|
+
attr_accessor :auth_params, :oauth_token, :oauth_secret, :consumer_key, :consumer_secret, :site, :authorize_path
|
10
|
+
#
|
11
|
+
# All the stuff below was copied from http://github.com/moomerman/twitter_oauth in the client.rb file
|
12
|
+
#
|
13
|
+
# def initialize(options = {})
|
14
|
+
# @consumer_key = options[:consumer_key]
|
15
|
+
# @consumer_secret = options[:consumer_secret]
|
16
|
+
# @token = options[:token]
|
17
|
+
# @secret = options[:secret]
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# def authorize(token, secret, options = {})
|
21
|
+
# request_token = OAuth::RequestToken.new(
|
22
|
+
# consumer, token, secret
|
23
|
+
# )
|
24
|
+
# @access_token = request_token.get_access_token(options)
|
25
|
+
# @token = @access_token.token
|
26
|
+
# @secret = @access_token.secret
|
27
|
+
# @access_token
|
28
|
+
# end
|
29
|
+
#
|
30
|
+
# def show(username)
|
31
|
+
# get("/users/show/#{username}.json")
|
32
|
+
# end
|
33
|
+
#
|
34
|
+
# # Returns the string "ok" in the requested format with a 200 OK HTTP status code.
|
35
|
+
# def test
|
36
|
+
# get("/help/test.json")
|
37
|
+
# end
|
38
|
+
#
|
39
|
+
# def request_token(options={})
|
40
|
+
# consumer.get_request_token(options)
|
41
|
+
# end
|
42
|
+
#
|
43
|
+
# def authentication_request_token(options={})
|
44
|
+
# consumer.options[:authorize_path] = '/oauth/authenticate'
|
45
|
+
# request_token(options)
|
46
|
+
# end
|
47
|
+
#
|
48
|
+
# private
|
49
|
+
#
|
50
|
+
# def consumer
|
51
|
+
# @consumer ||= OAuth::Consumer.new(
|
52
|
+
# @consumer_key,
|
53
|
+
# @consumer_secret,
|
54
|
+
# { :site => "http://api.twitter.com" }
|
55
|
+
# )
|
56
|
+
# end
|
57
|
+
#
|
58
|
+
# def access_token
|
59
|
+
# @access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
|
60
|
+
# end
|
61
|
+
#
|
62
|
+
# def get(path, headers={})
|
63
|
+
# headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
|
64
|
+
# oauth_response = access_token.get("/1#{path}", headers)
|
65
|
+
# JSON.parse(oauth_response.body)
|
66
|
+
# end
|
67
|
+
#
|
68
|
+
# def post(path, body='', headers={})
|
69
|
+
# headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
|
70
|
+
# oauth_response = access_token.post("/1#{path}", body, headers)
|
71
|
+
# JSON.parse(oauth_response.body)
|
72
|
+
# end
|
73
|
+
#
|
74
|
+
# def delete(path, headers={})
|
75
|
+
# headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
|
76
|
+
# oauth_response = access_token.delete("/1#{path}", headers)
|
77
|
+
# JSON.parse(oauth_response.body)
|
78
|
+
# end
|
9
79
|
|
10
|
-
|
80
|
+
|
81
|
+
def initialize _options={}
|
82
|
+
super _options
|
83
|
+
# These should get called by calling super, right?
|
84
|
+
# self.username = options[:username]
|
85
|
+
# self.password = options[:password]
|
86
|
+
# self.http_req_options = {}
|
87
|
+
# self.http_req_options["User-Agent"] = options[:user_agent] || USER_AGENT
|
88
|
+
# self.http_req_options["Connection"] = "keep-alive"
|
89
|
+
self.oauth_token = options[:oauth_token]
|
90
|
+
self.oauth_secret = options[:oauth_token_secret]
|
91
|
+
self.consumer_key = options[:consumer_key]
|
92
|
+
self.consumer_secret = options[:consumer_secret]
|
93
|
+
self.site = options[:site]
|
94
|
+
self.authorize_path = options[:authorize_path]
|
95
|
+
end
|
96
|
+
|
97
|
+
def request_token(options={})
|
98
|
+
consumer.options[:authorize_path] = @authorize_path
|
99
|
+
consumer.get_request_token(options)
|
11
100
|
end
|
12
101
|
|
13
|
-
def authorize
|
102
|
+
def authorize(token, secret, options = {})
|
103
|
+
request_token = OAuth::RequestToken.new(
|
104
|
+
consumer, token, secret
|
105
|
+
)
|
106
|
+
@access_token = request_token.get_access_token(options)
|
107
|
+
@token = @access_token.token
|
108
|
+
@secret = @access_token.secret
|
109
|
+
@access_token
|
14
110
|
end
|
15
111
|
|
16
112
|
def get_access_token
|
17
113
|
end
|
18
114
|
|
19
|
-
def
|
115
|
+
def oauth_token
|
116
|
+
@oauth_token
|
117
|
+
end
|
118
|
+
|
119
|
+
def oauth_secret
|
120
|
+
@oauth_secret
|
121
|
+
end
|
122
|
+
|
123
|
+
def consumer
|
124
|
+
@consumer ||= OAuth::Consumer.new(
|
125
|
+
@consumer_key,
|
126
|
+
@consumer_secret,
|
127
|
+
{ :site => @site }
|
128
|
+
)
|
20
129
|
end
|
21
|
-
|
130
|
+
|
131
|
+
def access_token
|
132
|
+
@access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
|
22
133
|
end
|
134
|
+
|
23
135
|
def session_key
|
24
136
|
end
|
25
137
|
|
@@ -82,17 +82,18 @@ module Monkeyshines
|
|
82
82
|
# Response-based sleep time
|
83
83
|
sleep_time = 0
|
84
84
|
case response
|
85
|
-
when Net::HTTPSuccess then return
|
86
|
-
when Net::HTTPRedirection then return
|
87
|
-
when Net::HTTPBadRequest then sleep_time =
|
88
|
-
when Net::HTTPUnauthorized then sleep_time = 0
|
89
|
-
when Net::HTTPForbidden then sleep_time =
|
90
|
-
when Net::HTTPNotFound then sleep_time = 0
|
91
|
-
when Net::HTTPServiceUnavailable then sleep_time =
|
92
|
-
when Net::HTTPServerError then sleep_time = 2
|
85
|
+
when Net::HTTPSuccess then return # 2xx
|
86
|
+
when Net::HTTPRedirection then return # 3xx
|
87
|
+
when Net::HTTPBadRequest then sleep_time = 10 # 400 (rate limit, probably)
|
88
|
+
when Net::HTTPUnauthorized then sleep_time = 0 # 401 (protected user, probably)
|
89
|
+
when Net::HTTPForbidden then sleep_time = 10 # 403 update limit
|
90
|
+
when Net::HTTPNotFound then sleep_time = 0 # 404 deleted or suspended
|
91
|
+
when Net::HTTPServiceUnavailable then sleep_time = 10 # 503 Fail Whale
|
92
|
+
when Net::HTTPServerError then sleep_time = 2 # 5xx All other server errors
|
93
93
|
else sleep_time = 1
|
94
94
|
end
|
95
|
-
|
95
|
+
sleep_time += response['retry-after'].to_i rescue 0
|
96
|
+
Log.warn "Received #{response.code} and retry-after #{response['retry-after']}, sleeping #{sleep_time} ('#{response.message[0..200].gsub(%r{[\r\n\t]}, " ")}' from #{@host}+#{@connection_opened_at}): '#{response.body[0..200].gsub(%r{[\r\n\t]}, " ")}'"
|
96
97
|
sleep sleep_time
|
97
98
|
end
|
98
99
|
|
@@ -24,7 +24,8 @@ module Monkeyshines
|
|
24
24
|
#
|
25
25
|
def periodically &block
|
26
26
|
super do
|
27
|
-
|
27
|
+
now = Time.now.utc.to_f
|
28
|
+
result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
|
28
29
|
Log.info result.join("\t")
|
29
30
|
end
|
30
31
|
end
|
@@ -18,12 +18,13 @@ module Monkeyshines
|
|
18
18
|
#
|
19
19
|
class PeriodicMonitor
|
20
20
|
attr_accessor :time_interval, :iter_interval
|
21
|
-
attr_accessor :last_time, :iter, :started_at
|
21
|
+
attr_accessor :last_time, :current_iter, :iter, :started_at
|
22
22
|
|
23
23
|
def initialize options={}
|
24
24
|
self.started_at = Time.now.utc.to_f
|
25
25
|
self.last_time = started_at
|
26
26
|
self.iter = 0
|
27
|
+
self.current_iter = 0
|
27
28
|
self.time_interval = options[:time]
|
28
29
|
self.iter_interval = options[:iters]
|
29
30
|
end
|
@@ -42,10 +43,14 @@ module Monkeyshines
|
|
42
43
|
def since
|
43
44
|
Time.now.utc.to_f - started_at
|
44
45
|
end
|
45
|
-
#
|
46
|
+
# Overall iterations per second
|
46
47
|
def rate
|
47
48
|
iter.to_f / since.to_f
|
48
49
|
end
|
50
|
+
# "Instantaneous" iterations per second
|
51
|
+
def inst_rate now
|
52
|
+
current_iter.to_f / (now-last_time).to_f
|
53
|
+
end
|
49
54
|
|
50
55
|
#
|
51
56
|
# if the interval conditions are met, executes block; otherwise just does
|
@@ -53,10 +58,12 @@ module Monkeyshines
|
|
53
58
|
#
|
54
59
|
def periodically &block
|
55
60
|
self.iter += 1
|
61
|
+
self.current_iter += 1
|
56
62
|
now = Time.now.utc.to_f
|
57
63
|
if enough_iterations? || enough_time?(now)
|
58
64
|
block.call(iter, (now-last_time))
|
59
65
|
self.last_time = now
|
66
|
+
self.current_iter = 0
|
60
67
|
end
|
61
68
|
end
|
62
69
|
end
|
data/lib/monkeyshines/options.rb
CHANGED
data/lib/monkeyshines/runner.rb
CHANGED
@@ -142,6 +142,7 @@ module Monkeyshines
|
|
142
142
|
def setup_main_log
|
143
143
|
unless options[:log][:dest].blank?
|
144
144
|
log_file = "%s/log/%s" % [WORK_DIR, options[:log][:dest]]
|
145
|
+
require 'fileutils'
|
145
146
|
FileUtils.mkdir_p(File.dirname(log_file))
|
146
147
|
$stdout = $stderr = File.open( log_file+"-console.log", "a" )
|
147
148
|
end
|
@@ -12,13 +12,14 @@ module Monkeyshines
|
|
12
12
|
def initialize options
|
13
13
|
raise "URI for #{self.class} is required" if options[:uri].blank?
|
14
14
|
self.db_host, self.db_port = options[:uri].to_s.split(':')
|
15
|
+
self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
|
15
16
|
super options
|
16
17
|
end
|
17
18
|
|
18
19
|
def db
|
19
20
|
return @db if @db
|
20
21
|
@db ||= TokyoTyrant::RDB.new
|
21
|
-
@db.open(db_host, db_port) or raise("Can't open DB #{db_host}
|
22
|
+
@db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
|
22
23
|
@db
|
23
24
|
end
|
24
25
|
|
@@ -75,7 +75,7 @@ module Monkeyshines
|
|
75
75
|
|
76
76
|
# Memoized: the hostname for the machine running this script.
|
77
77
|
def hostname
|
78
|
-
@hostname ||= ENV['HOSTNAME'] || `hostname
|
78
|
+
@hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
|
79
79
|
end
|
80
80
|
# Memoized: the Process ID for this invocation.
|
81
81
|
def pid
|
metadata
CHANGED
@@ -1,7 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: monkeyshines
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
4
|
+
hash: 19
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 2
|
9
|
+
- 2
|
10
|
+
version: 0.2.2
|
5
11
|
platform: ruby
|
6
12
|
authors:
|
7
13
|
- Philip (flip) Kromer
|
@@ -9,39 +15,51 @@ autorequire:
|
|
9
15
|
bindir: bin
|
10
16
|
cert_chain: []
|
11
17
|
|
12
|
-
date:
|
18
|
+
date: 2010-07-15 00:00:00 +00:00
|
13
19
|
default_executable:
|
14
20
|
dependencies:
|
15
21
|
- !ruby/object:Gem::Dependency
|
16
22
|
name: addressable
|
17
|
-
|
18
|
-
|
19
|
-
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
20
26
|
requirements:
|
21
27
|
- - ">="
|
22
28
|
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
23
32
|
version: "0"
|
24
|
-
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
25
35
|
- !ruby/object:Gem::Dependency
|
26
36
|
name: uuid
|
27
|
-
|
28
|
-
|
29
|
-
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
30
40
|
requirements:
|
31
41
|
- - ">="
|
32
42
|
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
33
46
|
version: "0"
|
34
|
-
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
35
49
|
- !ruby/object:Gem::Dependency
|
36
50
|
name: wukong
|
37
|
-
|
38
|
-
|
39
|
-
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
40
54
|
requirements:
|
41
55
|
- - ">="
|
42
56
|
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
43
60
|
version: "0"
|
44
|
-
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
45
63
|
description: A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.
|
46
64
|
email: flip@infochimps.org
|
47
65
|
executables: []
|
@@ -64,12 +82,12 @@ files:
|
|
64
82
|
- examples/shorturls/bulkload_shorturls.rb
|
65
83
|
- examples/shorturls/extract_urls.rb
|
66
84
|
- examples/shorturls/multiplex_shorturl_cache.rb
|
67
|
-
- examples/shorturls/old/multidump_and_fix_shorturls.rb
|
68
|
-
- examples/shorturls/old/shorturl_stats.rb
|
69
85
|
- examples/shorturls/scrape_shorturls.rb
|
70
86
|
- examples/shorturls/shorturl_request.rb
|
71
87
|
- examples/shorturls/shorturl_sequence.rb
|
72
88
|
- examples/shorturls/shorturl_start_tyrant.sh
|
89
|
+
- examples/shorturls/shorturl_stats.rb
|
90
|
+
- examples/shorturls/split_short_urls.rb
|
73
91
|
- examples/shorturls/start_shorturl_cache.sh
|
74
92
|
- lib/monkeyshines.rb
|
75
93
|
- lib/monkeyshines/extensions.rb
|
@@ -139,37 +157,43 @@ rdoc_options:
|
|
139
157
|
require_paths:
|
140
158
|
- lib
|
141
159
|
required_ruby_version: !ruby/object:Gem::Requirement
|
160
|
+
none: false
|
142
161
|
requirements:
|
143
162
|
- - ">="
|
144
163
|
- !ruby/object:Gem::Version
|
164
|
+
hash: 3
|
165
|
+
segments:
|
166
|
+
- 0
|
145
167
|
version: "0"
|
146
|
-
version:
|
147
168
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
169
|
+
none: false
|
148
170
|
requirements:
|
149
171
|
- - ">="
|
150
172
|
- !ruby/object:Gem::Version
|
173
|
+
hash: 3
|
174
|
+
segments:
|
175
|
+
- 0
|
151
176
|
version: "0"
|
152
|
-
version:
|
153
177
|
requirements: []
|
154
178
|
|
155
179
|
rubyforge_project:
|
156
|
-
rubygems_version: 1.3.
|
180
|
+
rubygems_version: 1.3.7
|
157
181
|
signing_key:
|
158
182
|
specification_version: 3
|
159
183
|
summary: A simple scraper for directed scrapes of APIs, feed or structured HTML.
|
160
184
|
test_files:
|
161
185
|
- spec/monkeyshines_spec.rb
|
162
186
|
- spec/spec_helper.rb
|
163
|
-
- examples/
|
164
|
-
- examples/
|
165
|
-
- examples/rename_tree/rename_ripd_tree.rb
|
166
|
-
- examples/rss_feeds/scrape_rss_feeds.rb
|
167
|
-
- examples/shorturls/bulkdump_shorturls.rb
|
168
|
-
- examples/shorturls/bulkload_shorturls.rb
|
169
|
-
- examples/shorturls/extract_urls.rb
|
187
|
+
- examples/shorturls/shorturl_stats.rb
|
188
|
+
- examples/shorturls/shorturl_request.rb
|
170
189
|
- examples/shorturls/multiplex_shorturl_cache.rb
|
171
|
-
- examples/shorturls/
|
172
|
-
- examples/shorturls/old/shorturl_stats.rb
|
190
|
+
- examples/shorturls/bulkload_shorturls.rb
|
173
191
|
- examples/shorturls/scrape_shorturls.rb
|
174
|
-
- examples/shorturls/
|
192
|
+
- examples/shorturls/extract_urls.rb
|
193
|
+
- examples/shorturls/bulkdump_shorturls.rb
|
175
194
|
- examples/shorturls/shorturl_sequence.rb
|
195
|
+
- examples/shorturls/split_short_urls.rb
|
196
|
+
- examples/rename_tree/rename_hdp_tree.rb
|
197
|
+
- examples/rename_tree/rename_ripd_tree.rb
|
198
|
+
- examples/rss_feeds/scrape_rss_feeds.rb
|
199
|
+
- examples/bulk_urls/scrape_bulk_urls.rb
|
@@ -1,66 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
-
require 'rubygems'
|
4
|
-
require 'trollop'
|
5
|
-
require 'wukong'
|
6
|
-
require 'monkeyshines'
|
7
|
-
require 'shorturl_request'
|
8
|
-
require 'shorturl_sequence'
|
9
|
-
require 'monkeyshines/utils/uri'
|
10
|
-
|
11
|
-
#
|
12
|
-
# Command line options
|
13
|
-
#
|
14
|
-
opts = Trollop::options do
|
15
|
-
opt :from_type, 'Class name for scrape store to load from', :type => String
|
16
|
-
opt :from, 'URI for scrape store to load from', :type => String
|
17
|
-
opt :into, 'Filename for flat TSV dump', :type => String
|
18
|
-
opt :log, 'File to store log', :type => String
|
19
|
-
end
|
20
|
-
Trollop::die :from_type unless opts[:from_type]
|
21
|
-
|
22
|
-
# ******************** Read From ********************
|
23
|
-
src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
|
24
|
-
src_store = src_store_klass.new(opts[:from])
|
25
|
-
Log.info "Loaded store with #{src_store.size}"
|
26
|
-
|
27
|
-
# ******************** Write into ********************
|
28
|
-
DUMPFILE_BASE = opts[:into]
|
29
|
-
def make_store uri
|
30
|
-
Monkeyshines::Store::FlatFileStore.new "#{DUMPFILE_BASE+"-"+uri}.tsv", :filemode => 'w'
|
31
|
-
end
|
32
|
-
dests = { }
|
33
|
-
[ 'tinyurl', 'bitly', 'other'
|
34
|
-
].each do |handle|
|
35
|
-
dests[handle] = make_store handle
|
36
|
-
end
|
37
|
-
|
38
|
-
# ******************** Log ********************
|
39
|
-
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
|
40
|
-
|
41
|
-
# ******************** Cross Load ********************
|
42
|
-
# Read , process, dump
|
43
|
-
iter = 0
|
44
|
-
src_store.each do |key, hsh|
|
45
|
-
hsh['contents'] ||= hsh.delete 'expanded_url'
|
46
|
-
hsh['response_code'] = nil if hsh['response_code'] == 'nil'
|
47
|
-
hsh['contents'] = nil if hsh['contents'] == 'nil'
|
48
|
-
unless hsh['contents'] || hsh['response_code']
|
49
|
-
# Log.info "removing #{hsh.inspect}"
|
50
|
-
src_store.db.out(key)
|
51
|
-
next
|
52
|
-
end
|
53
|
-
hsh['response_message'] = nil if hsh['response_message'] == 'nil'
|
54
|
-
hsh['url'] ||= hsh.delete 'short_url'
|
55
|
-
req = ShorturlRequest.from_hash hsh
|
56
|
-
periodic_log.periodically{ [src_store.size, req.to_flat] }
|
57
|
-
|
58
|
-
req.contents = Addressable::URI.scrub_url req.contents if req.contents
|
59
|
-
|
60
|
-
case
|
61
|
-
when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].save req
|
62
|
-
when (key =~ %r{^http://bit.ly/(.*)}) then dests['bitly' ].save req
|
63
|
-
else dests['other' ].save req
|
64
|
-
end
|
65
|
-
# src_store.save(key, req.to_hash.compact)
|
66
|
-
end
|
@@ -1,81 +0,0 @@
|
|
1
|
-
#!/usr/bin/env ruby
|
2
|
-
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
-
#require 'rubygems'
|
4
|
-
# require 'wukong'
|
5
|
-
require 'monkeyshines'
|
6
|
-
# require 'monkeyshines/utils/uri'
|
7
|
-
# require 'monkeyshines/utils/filename_pattern'
|
8
|
-
# require 'monkeyshines/store/conditional_store'
|
9
|
-
# require 'monkeyshines/fetcher/http_head_fetcher'
|
10
|
-
# require 'trollop' # gem install trollop
|
11
|
-
# require 'shorturl_request'
|
12
|
-
require 'shorturl_sequence'
|
13
|
-
|
14
|
-
digits = { } ; (('0'..'9').to_a+('a'..'z').to_a).each do |ch| digits[ch] = 0 end
|
15
|
-
|
16
|
-
# (1..10000).each do |idx|
|
17
|
-
# s = ShorturlSequence.encode_integer idx, 36
|
18
|
-
# digits[s[0..0]] += 1
|
19
|
-
# end
|
20
|
-
# p digits
|
21
|
-
# puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}
|
22
|
-
|
23
|
-
class Histo
|
24
|
-
attr_accessor :buckets
|
25
|
-
def initialize
|
26
|
-
self.buckets = { }
|
27
|
-
end
|
28
|
-
def << val
|
29
|
-
buckets[val] ||= 0
|
30
|
-
buckets[val] += 1
|
31
|
-
end
|
32
|
-
def dump
|
33
|
-
buckets.sort.each do |val, count|
|
34
|
-
puts "%10d\t%s"%[count,val]
|
35
|
-
end
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
len_histo = Histo.new
|
40
|
-
num_histo = Histo.new
|
41
|
-
ltr_histo = Histo.new
|
42
|
-
iter = 0
|
43
|
-
|
44
|
-
# 123456789-123456789-
|
45
|
-
# http://bit.ly/
|
46
|
-
# http://tinyurl.com/
|
47
|
-
BASE_URL = "http://is.gd/"
|
48
|
-
RADIX = 62
|
49
|
-
HANDLE = BASE_URL.gsub(%r{^http://},'').gsub(/\.com$/,'').gsub(/\W+/,'')
|
50
|
-
BASE_URL_LEN = BASE_URL.length
|
51
|
-
MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
|
52
|
-
SIX_CHARS = RADIX**6
|
53
|
-
File.open("rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv"
|
54
|
-
) do |reqfile|
|
55
|
-
reqfile.each do |url|
|
56
|
-
#decode
|
57
|
-
next unless url.length <= MAX_TAIL_LEN
|
58
|
-
tail = url.chomp.strip[BASE_URL_LEN..-1] || ''
|
59
|
-
# tail.downcase!
|
60
|
-
asnum = ShorturlSequence.decode_str tail, RADIX rescue nil # tail.to_i(36) rescue -1
|
61
|
-
next unless asnum && asnum < SIX_CHARS
|
62
|
-
size = (asnum / 1_000_000)
|
63
|
-
len = tail.length
|
64
|
-
# track stats
|
65
|
-
len_histo << len
|
66
|
-
num_histo << size
|
67
|
-
ltr_histo << "%s-%s" % [len, tail[0..0]] # + (len > 1 ? '.'* (len-1) : '')
|
68
|
-
puts iter if ((iter += 1) % 1_000_000 == 0)
|
69
|
-
|
70
|
-
end
|
71
|
-
end
|
72
|
-
puts "Integer magnitude of decoded (M):"
|
73
|
-
num_histo.dump
|
74
|
-
puts "Length of encoded:"
|
75
|
-
len_histo.dump
|
76
|
-
puts "First Letter:"
|
77
|
-
ltr_histo.dump
|
78
|
-
|
79
|
-
|
80
|
-
# puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
|
81
|
-
# puts [asnum, tail, url].inspect
|