monkeyshines 0.2.1 → 0.2.2

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,16 +1,19 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
2
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__); $: << File.dirname(__FILE__)+'/../../../graphiterb/lib'
3
3
  require 'rubygems'
4
4
  require 'wukong'
5
5
  require 'monkeyshines'
6
+ require 'configliere'
6
7
  #
7
8
  require 'shorturl_request'
8
9
  require 'shorturl_sequence'
10
+ require 'shorturl_stats'
9
11
  require 'monkeyshines/utils/uri'
10
12
  require 'monkeyshines/utils/filename_pattern'
11
13
  require 'monkeyshines/store/conditional_store'
12
14
  require 'monkeyshines/fetcher/http_head_fetcher'
13
- require 'trollop' # gem install trollop
15
+ require 'graphiterb' # needs graphiterb - simple ruby interface for graphite
16
+ # require 'trollop' # gem install trollop
14
17
 
15
18
  # ===========================================================================
16
19
  #
@@ -26,39 +29,74 @@ require 'trollop' # gem install trollop
26
29
  # --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
27
30
  #
28
31
  #
29
- opts = Trollop::options do
30
- opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
31
- opt :log, "Log file name; leave blank to use STDERR", :type => String
32
- # input from file
33
- opt :from, "Location of URLs to scrape", :type => String
34
- opt :skip, "Initial lines to skip", :type => Integer
35
- # OR do a random walk
36
- opt :random, "Generate and visit random URL suffixes"
37
- opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
38
- opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
39
- opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
40
- # output storage
41
- opt :cache_loc, "URI for cache server", :type => String
42
- opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
43
- opt :dest_dir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
44
- opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
45
- end
46
- handle = opts[:base_url].gsub(/\.com$/,'').gsub(/\W+/,'')
47
32
 
33
+ Configliere.use :commandline, :config_file, :define
34
+ Settings.read 'shorturls.yaml' #~/.configliere/shorturls.yaml
35
+ Settings.define :base_url, :description => "Host part of URL: eg tinyurl.com", :type => String, :required => true
36
+ # Settings.define :log, :description => "Log file name; leave blank to use STDERR", :type => String
37
+ Settings.define :log_time, :description => "Log time interval, in seconds, for periodic logger and Graphite logger", :type => Integer, :default => 60
38
+ Settings.define :log_iters, :description => "Log iteration interval for periodic logger and Graphite logger", :type => Integer, :default => 10000
39
+ # input from file
40
+ Settings.define :file_from, :description => "Location of URLs to scrape", :type => String
41
+ Settings.define :file_skip, :description => "Initial lines to skip", :type => Integer
42
+ # OR do a random walk
43
+ Settings.define :random, :description => "Generate and visit random URL suffixes"
44
+ Settings.define :random_min, :description => "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
45
+ Settings.define :random_max, :description => "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
46
+ Settings.define :random_radix, :description => "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
47
+ # output storage
48
+ Settings.define :cache_loc, :description => "URI for cache server", :type => String
49
+ Settings.define :chunk_time, :description => "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
50
+ Settings.define :rootdir, :description => "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd/shorturls'
51
+ Settings.define :dest_pattern, :description => "Pattern for dump file output", :default => ":rootdir/:date/:handle+:timestamp-:pid.tsv"
52
+ Settings.resolve!
53
+ Log = Logger.new($stderr) unless defined?(Log)
54
+
55
+ # Removed trollop optioning, added in configliere instead
56
+ # opts = Trollop::options do
57
+ # opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
58
+ # opt :log, "Log file name; leave blank to use STDERR", :type => String
59
+ # # input from file
60
+ # opt :from, "Location of URLs to scrape", :type => String
61
+ # opt :skip, "Initial lines to skip", :type => Integer
62
+ # # OR do a random walk
63
+ # opt :random, "Generate and visit random URL suffixes"
64
+ # opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
65
+ # opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
66
+ # opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
67
+ # # output storage
68
+ # opt :cache_loc, "URI for cache server", :type => String
69
+ # opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
70
+ # opt :rootdir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
71
+ # opt :dest_pattern, "Pattern for dump file output", :default => ":rootdir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
72
+ # end
73
+ handle = Settings.base_url.gsub(/\.com$/,'').gsub(/\W+/,'')
74
+ hostname ||= `hostname`.chomp.gsub(".","_")
75
+
76
+
77
+ #
48
78
  # ******************** Log ********************
49
- opts[:log] = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (opts[:log]=='')
50
- periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 10000, :time => 30)
79
+ #
80
+ # (I don't think the log file name ever gets used)
81
+ # Settings.log = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (Settings.log=='')
82
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
83
+
84
+ #
85
+ # ******************** Graphite Sender ***********************
86
+ #
87
+ graphite_sender = Graphiterb::GraphiteLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
51
88
 
52
89
  #
53
90
  # ******************** Load from store or random walk ********************
54
91
  #
55
- if opts[:from]
56
- src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
57
- src_store.skip!(opts[:skip].to_i) if opts[:skip]
58
- elsif opts[:random]
92
+ if Settings.file_from
93
+ # Settings.filename = Settings.file_from
94
+ src_store = Monkeyshines::Store::FlatFileStore.new(:filename => Settings.file_from, :skip => Settings.file_skip.to_i) # + {:filemode => 'r'}
95
+ # src_store.skip!(Settings.file_skip.to_i) if Settings.file_skip
96
+ elsif Settings.random
59
97
  src_store = Monkeyshines::Store::RandomUrlStream.new_from_command_line(opts)
60
98
  else
61
- Trollop::die "Need to either say --random or --from=filename"
99
+ Settings.die "Need to either say --random or --file_from=filename"
62
100
  end
63
101
 
64
102
  #
@@ -67,30 +105,37 @@ end
67
105
  # Track visited URLs with key-value database
68
106
  #
69
107
  RDB_PORTS = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
70
- cache_loc = opts[:cache_loc] || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
71
- dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(cache_loc)
108
+ cache_loc = Settings.cache_loc || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
109
+ dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(:uri => cache_loc)
110
+
111
+
72
112
  # dest_cache = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)
73
113
 
74
114
  #
75
115
  # Store the data into flat files
76
116
  #
77
- dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern],
78
- :handle => 'shorturl-'+handle, :dest_dir => opts[:dest_dir])
79
- dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern,
80
- opts[:chunk_time].to_i, opts)
117
+ dest_pattern = Monkeyshines::Utils::FilenamePattern.new(Settings.dest_pattern,
118
+ :handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
119
+ dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(:pattern => Settings.dest_pattern,
120
+ :chunk_time => Settings.chunk_time.to_i, :handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
81
121
 
82
122
  #
83
123
  # Conditional store uses the key-value DB to boss around the flat files --
84
124
  # requests are only made (and thus data is only output) if the url is missing
85
125
  # from the key-value store.
86
126
  #
87
- dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
127
+ dest_store = Monkeyshines::Store::ConditionalStore.new(:cache => dest_cache, :store => dest_files)
88
128
 
89
129
  #
90
130
  # ******************** Fetcher ********************
91
131
  #
92
132
  fetcher = Monkeyshines::Fetcher::HttpHeadFetcher.new
93
133
 
134
+ #
135
+ # ******************** Success/Fail stats ********************
136
+ #
137
+ stats = ShorturlStats.new(0,0,0,0)
138
+
94
139
  #
95
140
  # ******************** Do this thing ********************
96
141
  #
@@ -104,9 +149,18 @@ src_store.each do |bareurl, *args|
104
149
  result = dest_store.set( req.url ) do
105
150
  response = fetcher.get(req) # do the url fetch
106
151
  next unless response.response_code || response.contents # don't store bad fetches
152
+ stats.code_sort(response.response_code) # count successes (301) and failures (404)
107
153
  [response.scraped_at, response] # timestamp into cache, result into flat file
108
154
  end
109
- periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
155
+ periodic_log.periodically{ ["%7d"%stats.success_tot, 'successes', "%7d"%stats.failure_tot, 'failures', dest_store.size, req.response_code, result, req.url] }
156
+ graphite_sender.periodically do |metrics, iter, since|
157
+ rates = stats.rates_inst
158
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_rate", rates[0]]
159
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_rate", rates[1]]
160
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_tot_rate", stats.rates_tot[0]]
161
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_tot_rate", stats.rates_tot[1]]
162
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.current_file_size", dest_files.size]
163
+ end
110
164
  end
111
165
  dest_store.close
112
166
  fetcher.close
@@ -0,0 +1,37 @@
1
+ class ShorturlStats < Struct.new(
2
+ :success_tot,
3
+ :failure_tot,
4
+ :success_last,
5
+ :fail_last
6
+ )
7
+
8
+ def code_sort code
9
+ case code.to_s
10
+ when /4\d{2}/
11
+ self.failure_tot += 1
12
+ self.fail_last += 1
13
+ when /3\d{2}/
14
+ self.success_tot += 1
15
+ self.success_last += 1
16
+ else
17
+ Log.warn "Code #{code} not included in stats."
18
+ end
19
+ end
20
+
21
+ def rates_inst
22
+ return [0,0] if (self.success_last.to_f + self.fail_last.to_f) == 0
23
+ s_rate = (self.success_last.to_f)/(self.success_last.to_f + self.fail_last.to_f)
24
+ f_rate = (self.fail_last.to_f)/(self.success_last.to_f + self.fail_last.to_f)
25
+ self.success_last = 0
26
+ self.fail_last = 0
27
+ [s_rate,f_rate]
28
+ end
29
+
30
+ def rates_tot
31
+ return [0,0] if (self.success_tot.to_f + self.failure_tot.to_f) == 0
32
+ st_rate = (self.success_tot.to_f)/(self.success_tot.to_f + self.failure_tot.to_f)
33
+ ft_rate = (self.failure_tot.to_f)/(self.success_tot.to_f + self.failure_tot.to_f)
34
+ [st_rate,ft_rate]
35
+ end
36
+
37
+ end
@@ -0,0 +1,21 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ WORK_DIR = '/data/rawd/social/networks/twitter_friends/tokens_by_month/'
4
+
5
+ OTHER_SHORTURL_RE =
6
+ %r{.*(http://(?:1link.in|4url.cc|6url.com|adjix.com|ad.vu|bellypath.com|bkite.com|budurl.com|canurl.com|chod.sk|cli.gs|decenturl.com|dn.vc|doiop.com|dwarfurl.com|easyuri.com|easyurl.net|ff.im|go2cut.com|gonext.org|hulu.com|hypem.com|ifood.tv|ilix.in|is.gd|ix.it|jdem.cz|jijr.com|kissa.be|kurl.us|litturl.com|lnkurl.com|memurl.com|metamark.net|miklos.dk|minilien.com|minurl.org|muhlink.com|myurl.in|myurl.us|notlong.com|ow.ly|plexp.com|poprl.com|qurlyq.com|redirx.com|s3nt.com|shorterlink.com|shortlinks.co.uk|short.to|shorturl.com|shrinklink.co.uk|shrinkurl.us|shrt.st|shurl.net|simurl.com|shorl.com|smarturl.eu|snipr.com|snipurl.com|snurl.com|sn.vc|starturl.com|surl.co.uk|tighturl.com|timesurl.at|tiny123.com|tiny.cc|tinylink.com|tobtr.com|traceurl.com|tr.im|tweetburner.com|twitpwr.com|twitthis.com|twurl.nl|u.mavrev.com|ur1.ca|url9.com|urlborg.com|urlbrief.com|urlcover.com|urlcut.com|urlhawk.com|url-press.com|urlsmash.com|urltea.com|urlvi.be|vimeo.com|wlink.us|xaddr.com|xil.in|xrl.us|x.se|xs.md|yatuc.com|yep.it|yweb.com|zi.ma|w3t.org)/.+)}
7
+
8
+ bitly_file = File.open('/home/doncarlo/shorturls/shorturls_bitly','w')
9
+ tinyurl_file = File.open('/home/doncarlo/shorturls/shorturls_tinyurl','w')
10
+ otherurl_file = File.open('/home/doncarlo/shorturls/shorturls_other','w')
11
+
12
+
13
+
14
+ File.open(WORK_DIR + 'urls_by_month-20091111.tsv').each do |line|
15
+ line.chomp!
16
+ url = line.split("\t")[-1]
17
+ bitly_file << $1 + "\n" if url =~ %r{.*(http://bit.ly/.+)}
18
+ tinyurl_file << $1 + "\n" if url =~ %r{.*(http://tinyurl.com/.+)}
19
+ otherurl_file << $1 + "\n" if url =~ OTHER_SHORTURL_RE
20
+ end
21
+
@@ -1,25 +1,137 @@
1
1
  require 'net/http'
2
+ require 'oauth'
2
3
  Net::HTTP.version_1_2
3
4
  module Monkeyshines
4
5
  module Fetcher
5
6
 
6
7
  #
7
- class AuthedHttpFetcher
8
- cattr_accessor :auth_params
8
+ class AuthedHttpFetcher < HttpFetcher
9
+ attr_accessor :auth_params, :oauth_token, :oauth_secret, :consumer_key, :consumer_secret, :site, :authorize_path
10
+ #
11
+ # All the stuff below was copied from http://github.com/moomerman/twitter_oauth in the client.rb file
12
+ #
13
+ # def initialize(options = {})
14
+ # @consumer_key = options[:consumer_key]
15
+ # @consumer_secret = options[:consumer_secret]
16
+ # @token = options[:token]
17
+ # @secret = options[:secret]
18
+ # end
19
+ #
20
+ # def authorize(token, secret, options = {})
21
+ # request_token = OAuth::RequestToken.new(
22
+ # consumer, token, secret
23
+ # )
24
+ # @access_token = request_token.get_access_token(options)
25
+ # @token = @access_token.token
26
+ # @secret = @access_token.secret
27
+ # @access_token
28
+ # end
29
+ #
30
+ # def show(username)
31
+ # get("/users/show/#{username}.json")
32
+ # end
33
+ #
34
+ # # Returns the string "ok" in the requested format with a 200 OK HTTP status code.
35
+ # def test
36
+ # get("/help/test.json")
37
+ # end
38
+ #
39
+ # def request_token(options={})
40
+ # consumer.get_request_token(options)
41
+ # end
42
+ #
43
+ # def authentication_request_token(options={})
44
+ # consumer.options[:authorize_path] = '/oauth/authenticate'
45
+ # request_token(options)
46
+ # end
47
+ #
48
+ # private
49
+ #
50
+ # def consumer
51
+ # @consumer ||= OAuth::Consumer.new(
52
+ # @consumer_key,
53
+ # @consumer_secret,
54
+ # { :site => "http://api.twitter.com" }
55
+ # )
56
+ # end
57
+ #
58
+ # def access_token
59
+ # @access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
60
+ # end
61
+ #
62
+ # def get(path, headers={})
63
+ # headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
64
+ # oauth_response = access_token.get("/1#{path}", headers)
65
+ # JSON.parse(oauth_response.body)
66
+ # end
67
+ #
68
+ # def post(path, body='', headers={})
69
+ # headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
70
+ # oauth_response = access_token.post("/1#{path}", body, headers)
71
+ # JSON.parse(oauth_response.body)
72
+ # end
73
+ #
74
+ # def delete(path, headers={})
75
+ # headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
76
+ # oauth_response = access_token.delete("/1#{path}", headers)
77
+ # JSON.parse(oauth_response.body)
78
+ # end
9
79
 
10
- def get_request_token
80
+
81
+ def initialize _options={}
82
+ super _options
83
+ # These should get called by calling super, right?
84
+ # self.username = options[:username]
85
+ # self.password = options[:password]
86
+ # self.http_req_options = {}
87
+ # self.http_req_options["User-Agent"] = options[:user_agent] || USER_AGENT
88
+ # self.http_req_options["Connection"] = "keep-alive"
89
+ self.oauth_token = options[:oauth_token]
90
+ self.oauth_secret = options[:oauth_token_secret]
91
+ self.consumer_key = options[:consumer_key]
92
+ self.consumer_secret = options[:consumer_secret]
93
+ self.site = options[:site]
94
+ self.authorize_path = options[:authorize_path]
95
+ end
96
+
97
+ def request_token(options={})
98
+ consumer.options[:authorize_path] = @authorize_path
99
+ consumer.get_request_token(options)
11
100
  end
12
101
 
13
- def authorize
102
+ def authorize(token, secret, options = {})
103
+ request_token = OAuth::RequestToken.new(
104
+ consumer, token, secret
105
+ )
106
+ @access_token = request_token.get_access_token(options)
107
+ @token = @access_token.token
108
+ @secret = @access_token.secret
109
+ @access_token
14
110
  end
15
111
 
16
112
  def get_access_token
17
113
  end
18
114
 
19
- def api_key
115
+ def oauth_token
116
+ @oauth_token
117
+ end
118
+
119
+ def oauth_secret
120
+ @oauth_secret
121
+ end
122
+
123
+ def consumer
124
+ @consumer ||= OAuth::Consumer.new(
125
+ @consumer_key,
126
+ @consumer_secret,
127
+ { :site => @site }
128
+ )
20
129
  end
21
- def api_secret
130
+
131
+ def access_token
132
+ @access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
22
133
  end
134
+
23
135
  def session_key
24
136
  end
25
137
 
@@ -82,17 +82,18 @@ module Monkeyshines
82
82
  # Response-based sleep time
83
83
  sleep_time = 0
84
84
  case response
85
- when Net::HTTPSuccess then return # 2xx
86
- when Net::HTTPRedirection then return # 3xx
87
- when Net::HTTPBadRequest then sleep_time = 5 # 400 (rate limit, probably)
88
- when Net::HTTPUnauthorized then sleep_time = 0 # 401 (protected user, probably)
89
- when Net::HTTPForbidden then sleep_time = 4 # 403 update limit
90
- when Net::HTTPNotFound then sleep_time = 0 # 404 deleted
91
- when Net::HTTPServiceUnavailable then sleep_time = 15 # 503 Fail Whale
92
- when Net::HTTPServerError then sleep_time = 2 # 5xx All other server errors
85
+ when Net::HTTPSuccess then return # 2xx
86
+ when Net::HTTPRedirection then return # 3xx
87
+ when Net::HTTPBadRequest then sleep_time = 10 # 400 (rate limit, probably)
88
+ when Net::HTTPUnauthorized then sleep_time = 0 # 401 (protected user, probably)
89
+ when Net::HTTPForbidden then sleep_time = 10 # 403 update limit
90
+ when Net::HTTPNotFound then sleep_time = 0 # 404 deleted or suspended
91
+ when Net::HTTPServiceUnavailable then sleep_time = 10 # 503 Fail Whale
92
+ when Net::HTTPServerError then sleep_time = 2 # 5xx All other server errors
93
93
  else sleep_time = 1
94
94
  end
95
- Log.warn "Received #{response.code}, sleeping #{sleep_time} ('#{response.message[0..200].gsub(%r{[\r\n\t]}, " ")}' from #{@host}+#{@connection_opened_at})"
95
+ sleep_time += response['retry-after'].to_i rescue 0
96
+ Log.warn "Received #{response.code} and retry-after #{response['retry-after']}, sleeping #{sleep_time} ('#{response.message[0..200].gsub(%r{[\r\n\t]}, " ")}' from #{@host}+#{@connection_opened_at}): '#{response.body[0..200].gsub(%r{[\r\n\t]}, " ")}'"
96
97
  sleep sleep_time
97
98
  end
98
99
 
@@ -24,7 +24,8 @@ module Monkeyshines
24
24
  #
25
25
  def periodically &block
26
26
  super do
27
- result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%rate, (block ? block.call : nil) ].flatten.compact
27
+ now = Time.now.utc.to_f
28
+ result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
28
29
  Log.info result.join("\t")
29
30
  end
30
31
  end
@@ -18,12 +18,13 @@ module Monkeyshines
18
18
  #
19
19
  class PeriodicMonitor
20
20
  attr_accessor :time_interval, :iter_interval
21
- attr_accessor :last_time, :iter, :started_at
21
+ attr_accessor :last_time, :current_iter, :iter, :started_at
22
22
 
23
23
  def initialize options={}
24
24
  self.started_at = Time.now.utc.to_f
25
25
  self.last_time = started_at
26
26
  self.iter = 0
27
+ self.current_iter = 0
27
28
  self.time_interval = options[:time]
28
29
  self.iter_interval = options[:iters]
29
30
  end
@@ -42,10 +43,14 @@ module Monkeyshines
42
43
  def since
43
44
  Time.now.utc.to_f - started_at
44
45
  end
45
- # Iterations per second
46
+ # Overall iterations per second
46
47
  def rate
47
48
  iter.to_f / since.to_f
48
49
  end
50
+ # "Instantaneous" iterations per second
51
+ def inst_rate now
52
+ current_iter.to_f / (now-last_time).to_f
53
+ end
49
54
 
50
55
  #
51
56
  # if the interval conditions are met, executes block; otherwise just does
@@ -53,10 +58,12 @@ module Monkeyshines
53
58
  #
54
59
  def periodically &block
55
60
  self.iter += 1
61
+ self.current_iter += 1
56
62
  now = Time.now.utc.to_f
57
63
  if enough_iterations? || enough_time?(now)
58
64
  block.call(iter, (now-last_time))
59
65
  self.last_time = now
66
+ self.current_iter = 0
60
67
  end
61
68
  end
62
69
  end
@@ -1,3 +1,5 @@
1
+ require 'yaml'
2
+ require 'monkeyshines/runner_core/options'
1
3
  require 'monkeyshines/utils/trollop'
2
4
  module Monkeyshines
3
5
 
@@ -142,6 +142,7 @@ module Monkeyshines
142
142
  def setup_main_log
143
143
  unless options[:log][:dest].blank?
144
144
  log_file = "%s/log/%s" % [WORK_DIR, options[:log][:dest]]
145
+ require 'fileutils'
145
146
  FileUtils.mkdir_p(File.dirname(log_file))
146
147
  $stdout = $stderr = File.open( log_file+"-console.log", "a" )
147
148
  end
@@ -67,6 +67,12 @@ module Monkeyshines
67
67
  file << obj.to_flat.join("\t")+"\n"
68
68
  obj
69
69
  end
70
+
71
+ # returns the size of the current file
72
+ def size
73
+ return 0 if !@file
74
+ File.size(filename)
75
+ end
70
76
 
71
77
  def set key, *args, &block
72
78
  tok, obj = block.call
@@ -12,13 +12,14 @@ module Monkeyshines
12
12
  def initialize options
13
13
  raise "URI for #{self.class} is required" if options[:uri].blank?
14
14
  self.db_host, self.db_port = options[:uri].to_s.split(':')
15
+ self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
15
16
  super options
16
17
  end
17
18
 
18
19
  def db
19
20
  return @db if @db
20
21
  @db ||= TokyoTyrant::RDB.new
21
- @db.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
22
+ @db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
22
23
  @db
23
24
  end
24
25
 
@@ -75,7 +75,7 @@ module Monkeyshines
75
75
 
76
76
  # Memoized: the hostname for the machine running this script.
77
77
  def hostname
78
- @hostname ||= ENV['HOSTNAME'] || `hostname`
78
+ @hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
79
79
  end
80
80
  # Memoized: the Process ID for this invocation.
81
81
  def pid
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: monkeyshines
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 2
10
+ version: 0.2.2
5
11
  platform: ruby
6
12
  authors:
7
13
  - Philip (flip) Kromer
@@ -9,39 +15,51 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2009-11-02 00:00:00 -06:00
18
+ date: 2010-07-15 00:00:00 +00:00
13
19
  default_executable:
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: addressable
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
20
26
  requirements:
21
27
  - - ">="
22
28
  - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
23
32
  version: "0"
24
- version:
33
+ type: :runtime
34
+ version_requirements: *id001
25
35
  - !ruby/object:Gem::Dependency
26
36
  name: uuid
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
30
40
  requirements:
31
41
  - - ">="
32
42
  - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
33
46
  version: "0"
34
- version:
47
+ type: :runtime
48
+ version_requirements: *id002
35
49
  - !ruby/object:Gem::Dependency
36
50
  name: wukong
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
40
54
  requirements:
41
55
  - - ">="
42
56
  - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
43
60
  version: "0"
44
- version:
61
+ type: :runtime
62
+ version_requirements: *id003
45
63
  description: A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.
46
64
  email: flip@infochimps.org
47
65
  executables: []
@@ -64,12 +82,12 @@ files:
64
82
  - examples/shorturls/bulkload_shorturls.rb
65
83
  - examples/shorturls/extract_urls.rb
66
84
  - examples/shorturls/multiplex_shorturl_cache.rb
67
- - examples/shorturls/old/multidump_and_fix_shorturls.rb
68
- - examples/shorturls/old/shorturl_stats.rb
69
85
  - examples/shorturls/scrape_shorturls.rb
70
86
  - examples/shorturls/shorturl_request.rb
71
87
  - examples/shorturls/shorturl_sequence.rb
72
88
  - examples/shorturls/shorturl_start_tyrant.sh
89
+ - examples/shorturls/shorturl_stats.rb
90
+ - examples/shorturls/split_short_urls.rb
73
91
  - examples/shorturls/start_shorturl_cache.sh
74
92
  - lib/monkeyshines.rb
75
93
  - lib/monkeyshines/extensions.rb
@@ -139,37 +157,43 @@ rdoc_options:
139
157
  require_paths:
140
158
  - lib
141
159
  required_ruby_version: !ruby/object:Gem::Requirement
160
+ none: false
142
161
  requirements:
143
162
  - - ">="
144
163
  - !ruby/object:Gem::Version
164
+ hash: 3
165
+ segments:
166
+ - 0
145
167
  version: "0"
146
- version:
147
168
  required_rubygems_version: !ruby/object:Gem::Requirement
169
+ none: false
148
170
  requirements:
149
171
  - - ">="
150
172
  - !ruby/object:Gem::Version
173
+ hash: 3
174
+ segments:
175
+ - 0
151
176
  version: "0"
152
- version:
153
177
  requirements: []
154
178
 
155
179
  rubyforge_project:
156
- rubygems_version: 1.3.5
180
+ rubygems_version: 1.3.7
157
181
  signing_key:
158
182
  specification_version: 3
159
183
  summary: A simple scraper for directed scrapes of APIs, feed or structured HTML.
160
184
  test_files:
161
185
  - spec/monkeyshines_spec.rb
162
186
  - spec/spec_helper.rb
163
- - examples/bulk_urls/scrape_bulk_urls.rb
164
- - examples/rename_tree/rename_hdp_tree.rb
165
- - examples/rename_tree/rename_ripd_tree.rb
166
- - examples/rss_feeds/scrape_rss_feeds.rb
167
- - examples/shorturls/bulkdump_shorturls.rb
168
- - examples/shorturls/bulkload_shorturls.rb
169
- - examples/shorturls/extract_urls.rb
187
+ - examples/shorturls/shorturl_stats.rb
188
+ - examples/shorturls/shorturl_request.rb
170
189
  - examples/shorturls/multiplex_shorturl_cache.rb
171
- - examples/shorturls/old/multidump_and_fix_shorturls.rb
172
- - examples/shorturls/old/shorturl_stats.rb
190
+ - examples/shorturls/bulkload_shorturls.rb
173
191
  - examples/shorturls/scrape_shorturls.rb
174
- - examples/shorturls/shorturl_request.rb
192
+ - examples/shorturls/extract_urls.rb
193
+ - examples/shorturls/bulkdump_shorturls.rb
175
194
  - examples/shorturls/shorturl_sequence.rb
195
+ - examples/shorturls/split_short_urls.rb
196
+ - examples/rename_tree/rename_hdp_tree.rb
197
+ - examples/rename_tree/rename_ripd_tree.rb
198
+ - examples/rss_feeds/scrape_rss_feeds.rb
199
+ - examples/bulk_urls/scrape_bulk_urls.rb
@@ -1,66 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
3
- require 'rubygems'
4
- require 'trollop'
5
- require 'wukong'
6
- require 'monkeyshines'
7
- require 'shorturl_request'
8
- require 'shorturl_sequence'
9
- require 'monkeyshines/utils/uri'
10
-
11
- #
12
- # Command line options
13
- #
14
- opts = Trollop::options do
15
- opt :from_type, 'Class name for scrape store to load from', :type => String
16
- opt :from, 'URI for scrape store to load from', :type => String
17
- opt :into, 'Filename for flat TSV dump', :type => String
18
- opt :log, 'File to store log', :type => String
19
- end
20
- Trollop::die :from_type unless opts[:from_type]
21
-
22
- # ******************** Read From ********************
23
- src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
24
- src_store = src_store_klass.new(opts[:from])
25
- Log.info "Loaded store with #{src_store.size}"
26
-
27
- # ******************** Write into ********************
28
- DUMPFILE_BASE = opts[:into]
29
- def make_store uri
30
- Monkeyshines::Store::FlatFileStore.new "#{DUMPFILE_BASE+"-"+uri}.tsv", :filemode => 'w'
31
- end
32
- dests = { }
33
- [ 'tinyurl', 'bitly', 'other'
34
- ].each do |handle|
35
- dests[handle] = make_store handle
36
- end
37
-
38
- # ******************** Log ********************
39
- periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
40
-
41
- # ******************** Cross Load ********************
42
- # Read , process, dump
43
- iter = 0
44
- src_store.each do |key, hsh|
45
- hsh['contents'] ||= hsh.delete 'expanded_url'
46
- hsh['response_code'] = nil if hsh['response_code'] == 'nil'
47
- hsh['contents'] = nil if hsh['contents'] == 'nil'
48
- unless hsh['contents'] || hsh['response_code']
49
- # Log.info "removing #{hsh.inspect}"
50
- src_store.db.out(key)
51
- next
52
- end
53
- hsh['response_message'] = nil if hsh['response_message'] == 'nil'
54
- hsh['url'] ||= hsh.delete 'short_url'
55
- req = ShorturlRequest.from_hash hsh
56
- periodic_log.periodically{ [src_store.size, req.to_flat] }
57
-
58
- req.contents = Addressable::URI.scrub_url req.contents if req.contents
59
-
60
- case
61
- when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].save req
62
- when (key =~ %r{^http://bit.ly/(.*)}) then dests['bitly' ].save req
63
- else dests['other' ].save req
64
- end
65
- # src_store.save(key, req.to_hash.compact)
66
- end
@@ -1,81 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
3
- #require 'rubygems'
4
- # require 'wukong'
5
- require 'monkeyshines'
6
- # require 'monkeyshines/utils/uri'
7
- # require 'monkeyshines/utils/filename_pattern'
8
- # require 'monkeyshines/store/conditional_store'
9
- # require 'monkeyshines/fetcher/http_head_fetcher'
10
- # require 'trollop' # gem install trollop
11
- # require 'shorturl_request'
12
- require 'shorturl_sequence'
13
-
14
- digits = { } ; (('0'..'9').to_a+('a'..'z').to_a).each do |ch| digits[ch] = 0 end
15
-
16
- # (1..10000).each do |idx|
17
- # s = ShorturlSequence.encode_integer idx, 36
18
- # digits[s[0..0]] += 1
19
- # end
20
- # p digits
21
- # puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}
22
-
23
- class Histo
24
- attr_accessor :buckets
25
- def initialize
26
- self.buckets = { }
27
- end
28
- def << val
29
- buckets[val] ||= 0
30
- buckets[val] += 1
31
- end
32
- def dump
33
- buckets.sort.each do |val, count|
34
- puts "%10d\t%s"%[count,val]
35
- end
36
- end
37
- end
38
-
39
- len_histo = Histo.new
40
- num_histo = Histo.new
41
- ltr_histo = Histo.new
42
- iter = 0
43
-
44
- # 123456789-123456789-
45
- # http://bit.ly/
46
- # http://tinyurl.com/
47
- BASE_URL = "http://is.gd/"
48
- RADIX = 62
49
- HANDLE = BASE_URL.gsub(%r{^http://},'').gsub(/\.com$/,'').gsub(/\W+/,'')
50
- BASE_URL_LEN = BASE_URL.length
51
- MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
52
- SIX_CHARS = RADIX**6
53
- File.open("rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv"
54
- ) do |reqfile|
55
- reqfile.each do |url|
56
- #decode
57
- next unless url.length <= MAX_TAIL_LEN
58
- tail = url.chomp.strip[BASE_URL_LEN..-1] || ''
59
- # tail.downcase!
60
- asnum = ShorturlSequence.decode_str tail, RADIX rescue nil # tail.to_i(36) rescue -1
61
- next unless asnum && asnum < SIX_CHARS
62
- size = (asnum / 1_000_000)
63
- len = tail.length
64
- # track stats
65
- len_histo << len
66
- num_histo << size
67
- ltr_histo << "%s-%s" % [len, tail[0..0]] # + (len > 1 ? '.'* (len-1) : '')
68
- puts iter if ((iter += 1) % 1_000_000 == 0)
69
-
70
- end
71
- end
72
- puts "Integer magnitude of decoded (M):"
73
- num_histo.dump
74
- puts "Length of encoded:"
75
- len_histo.dump
76
- puts "First Letter:"
77
- ltr_histo.dump
78
-
79
-
80
- # puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
81
- # puts [asnum, tail, url].inspect