monkeyshines 0.2.1 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,16 +1,19 @@
1
1
  #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
2
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__); $: << File.dirname(__FILE__)+'/../../../graphiterb/lib'
3
3
  require 'rubygems'
4
4
  require 'wukong'
5
5
  require 'monkeyshines'
6
+ require 'configliere'
6
7
  #
7
8
  require 'shorturl_request'
8
9
  require 'shorturl_sequence'
10
+ require 'shorturl_stats'
9
11
  require 'monkeyshines/utils/uri'
10
12
  require 'monkeyshines/utils/filename_pattern'
11
13
  require 'monkeyshines/store/conditional_store'
12
14
  require 'monkeyshines/fetcher/http_head_fetcher'
13
- require 'trollop' # gem install trollop
15
+ require 'graphiterb' # needs graphiterb - simple ruby interface for graphite
16
+ # require 'trollop' # gem install trollop
14
17
 
15
18
  # ===========================================================================
16
19
  #
@@ -26,39 +29,74 @@ require 'trollop' # gem install trollop
26
29
  # --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding_radix=
27
30
  #
28
31
  #
29
- opts = Trollop::options do
30
- opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
31
- opt :log, "Log file name; leave blank to use STDERR", :type => String
32
- # input from file
33
- opt :from, "Location of URLs to scrape", :type => String
34
- opt :skip, "Initial lines to skip", :type => Integer
35
- # OR do a random walk
36
- opt :random, "Generate and visit random URL suffixes"
37
- opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
38
- opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
39
- opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
40
- # output storage
41
- opt :cache_loc, "URI for cache server", :type => String
42
- opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
43
- opt :dest_dir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
44
- opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
45
- end
46
- handle = opts[:base_url].gsub(/\.com$/,'').gsub(/\W+/,'')
47
32
 
33
+ Configliere.use :commandline, :config_file, :define
34
+ Settings.read 'shorturls.yaml' #~/.configliere/shorturls.yaml
35
+ Settings.define :base_url, :description => "Host part of URL: eg tinyurl.com", :type => String, :required => true
36
+ # Settings.define :log, :description => "Log file name; leave blank to use STDERR", :type => String
37
+ Settings.define :log_time, :description => "Log time interval, in seconds, for periodic logger and Graphite logger", :type => Integer, :default => 60
38
+ Settings.define :log_iters, :description => "Log iteration interval for periodic logger and Graphite logger", :type => Integer, :default => 10000
39
+ # input from file
40
+ Settings.define :file_from, :description => "Location of URLs to scrape", :type => String
41
+ Settings.define :file_skip, :description => "Initial lines to skip", :type => Integer
42
+ # OR do a random walk
43
+ Settings.define :random, :description => "Generate and visit random URL suffixes"
44
+ Settings.define :random_min, :description => "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
45
+ Settings.define :random_max, :description => "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
46
+ Settings.define :random_radix, :description => "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
47
+ # output storage
48
+ Settings.define :cache_loc, :description => "URI for cache server", :type => String
49
+ Settings.define :chunk_time, :description => "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
50
+ Settings.define :rootdir, :description => "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd/shorturls'
51
+ Settings.define :dest_pattern, :description => "Pattern for dump file output", :default => ":rootdir/:date/:handle+:timestamp-:pid.tsv"
52
+ Settings.resolve!
53
+ Log = Logger.new($stderr) unless defined?(Log)
54
+
55
+ # Removed trollop optioning, added in configliere instead
56
+ # opts = Trollop::options do
57
+ # opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
58
+ # opt :log, "Log file name; leave blank to use STDERR", :type => String
59
+ # # input from file
60
+ # opt :from, "Location of URLs to scrape", :type => String
61
+ # opt :skip, "Initial lines to skip", :type => Integer
62
+ # # OR do a random walk
63
+ # opt :random, "Generate and visit random URL suffixes"
64
+ # opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
65
+ # opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
66
+ # opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
67
+ # # output storage
68
+ # opt :cache_loc, "URI for cache server", :type => String
69
+ # opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
70
+ # opt :rootdir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
71
+ # opt :dest_pattern, "Pattern for dump file output", :default => ":rootdir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
72
+ # end
73
+ handle = Settings.base_url.gsub(/\.com$/,'').gsub(/\W+/,'')
74
+ hostname ||= `hostname`.chomp.gsub(".","_")
75
+
76
+
77
+ #
48
78
  # ******************** Log ********************
49
- opts[:log] = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (opts[:log]=='')
50
- periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 10000, :time => 30)
79
+ #
80
+ # (I don't think the log file name ever gets used)
81
+ # Settings.log = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (Settings.log=='')
82
+ periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
83
+
84
+ #
85
+ # ******************** Graphite Sender ***********************
86
+ #
87
+ graphite_sender = Graphiterb::GraphiteLogger.new(:iters => Settings.log_iters, :time => Settings.log_time)
51
88
 
52
89
  #
53
90
  # ******************** Load from store or random walk ********************
54
91
  #
55
- if opts[:from]
56
- src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
57
- src_store.skip!(opts[:skip].to_i) if opts[:skip]
58
- elsif opts[:random]
92
+ if Settings.file_from
93
+ # Settings.filename = Settings.file_from
94
+ src_store = Monkeyshines::Store::FlatFileStore.new(:filename => Settings.file_from, :skip => Settings.file_skip.to_i) # + {:filemode => 'r'}
95
+ # src_store.skip!(Settings.file_skip.to_i) if Settings.file_skip
96
+ elsif Settings.random
59
97
  src_store = Monkeyshines::Store::RandomUrlStream.new_from_command_line(opts)
60
98
  else
61
- Trollop::die "Need to either say --random or --from=filename"
99
+ Settings.die "Need to either say --random or --file_from=filename"
62
100
  end
63
101
 
64
102
  #
@@ -67,30 +105,37 @@ end
67
105
  # Track visited URLs with key-value database
68
106
  #
69
107
  RDB_PORTS = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
70
- cache_loc = opts[:cache_loc] || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
71
- dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(cache_loc)
108
+ cache_loc = Settings.cache_loc || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
109
+ dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(:uri => cache_loc)
110
+
111
+
72
112
  # dest_cache = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)
73
113
 
74
114
  #
75
115
  # Store the data into flat files
76
116
  #
77
- dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern],
78
- :handle => 'shorturl-'+handle, :dest_dir => opts[:dest_dir])
79
- dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern,
80
- opts[:chunk_time].to_i, opts)
117
+ dest_pattern = Monkeyshines::Utils::FilenamePattern.new(Settings.dest_pattern,
118
+ :handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
119
+ dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(:pattern => Settings.dest_pattern,
120
+ :chunk_time => Settings.chunk_time.to_i, :handle => 'shorturl-'+handle, :rootdir => Settings.rootdir)
81
121
 
82
122
  #
83
123
  # Conditional store uses the key-value DB to boss around the flat files --
84
124
  # requests are only made (and thus data is only output) if the url is missing
85
125
  # from the key-value store.
86
126
  #
87
- dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
127
+ dest_store = Monkeyshines::Store::ConditionalStore.new(:cache => dest_cache, :store => dest_files)
88
128
 
89
129
  #
90
130
  # ******************** Fetcher ********************
91
131
  #
92
132
  fetcher = Monkeyshines::Fetcher::HttpHeadFetcher.new
93
133
 
134
+ #
135
+ # ******************** Success/Fail stats ********************
136
+ #
137
+ stats = ShorturlStats.new(0,0,0,0)
138
+
94
139
  #
95
140
  # ******************** Do this thing ********************
96
141
  #
@@ -104,9 +149,18 @@ src_store.each do |bareurl, *args|
104
149
  result = dest_store.set( req.url ) do
105
150
  response = fetcher.get(req) # do the url fetch
106
151
  next unless response.response_code || response.contents # don't store bad fetches
152
+ stats.code_sort(response.response_code) # count successes (301) and failures (404)
107
153
  [response.scraped_at, response] # timestamp into cache, result into flat file
108
154
  end
109
- periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
155
+ periodic_log.periodically{ ["%7d"%stats.success_tot, 'successes', "%7d"%stats.failure_tot, 'failures', dest_store.size, req.response_code, result, req.url] }
156
+ graphite_sender.periodically do |metrics, iter, since|
157
+ rates = stats.rates_inst
158
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_rate", rates[0]]
159
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_rate", rates[1]]
160
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.success_tot_rate", stats.rates_tot[0]]
161
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.failure_tot_rate", stats.rates_tot[1]]
162
+ metrics << ["scraper.#{hostname}.shorturl.#{handle}.current_file_size", dest_files.size]
163
+ end
110
164
  end
111
165
  dest_store.close
112
166
  fetcher.close
@@ -0,0 +1,37 @@
1
#
# Running tally of short-URL fetch outcomes, used to report success/failure
# ratios.
#
# Fields (all are expected to be numeric; the scraper seeds them with 0):
#   success_tot  -- responses counted as successes (3xx) since creation
#   failure_tot  -- responses counted as failures (4xx) since creation
#   success_last -- successes counted since the last call to #rates_inst
#   fail_last    -- failures counted since the last call to #rates_inst
#
class ShorturlStats < Struct.new(
    :success_tot,
    :failure_tot,
    :success_last,
    :fail_last
  )

  # Anchored with \A..\z so that only an exact three-digit status code
  # matches; an unanchored pattern would also accept e.g. "1404".
  REDIRECT_CODE     = /\A3\d{2}\z/
  CLIENT_ERROR_CODE = /\A4\d{2}\z/

  #
  # Tally one response code: 4xx bumps the failure counters, 3xx bumps the
  # success counters. Anything else is left uncounted and reported through
  # Log.warn.
  #
  # code may be anything responding to #to_s (Integer, String, nil).
  #
  def code_sort code
    case code.to_s
    when CLIENT_ERROR_CODE
      self.failure_tot += 1
      self.fail_last   += 1
    when REDIRECT_CODE
      self.success_tot  += 1
      self.success_last += 1
    else
      Log.warn "Code #{code} not included in stats."
    end
  end

  #
  # [success_ratio, failure_ratio] for the responses tallied since the previous
  # call, or [0,0] when nothing was tallied. Calling this zeroes the
  # since-last-call counters, so each call starts a fresh window.
  #
  def rates_inst
    rates = ratios(success_last, fail_last)
    self.success_last = 0
    self.fail_last    = 0
    rates
  end

  #
  # [success_ratio, failure_ratio] over everything tallied so far, or [0,0]
  # when nothing has been tallied. Does not modify any counter.
  #
  def rates_tot
    ratios(success_tot, failure_tot)
  end

  private

  # Fraction of the combined count contributed by each of the two tallies.
  def ratios successes, failures
    total = successes.to_f + failures.to_f
    return [0,0] if total == 0
    [successes.to_f / total, failures.to_f / total]
  end

end
@@ -0,0 +1,21 @@
1
#!/usr/bin/env ruby
#
# Reads a tab-separated dump of URLs, takes the last field of every line, and
# copies each short URL found there into one of three output files: bit.ly
# links, tinyurl.com links, or links on any of the other shortener hosts listed
# in OTHER_SHORTURL_RE. A field may match more than one pattern and is then
# written to each matching file.
#
# Every file is opened in block form so its handle is closed (and its buffered
# output flushed) even if processing raises part-way through.
#

WORK_DIR = '/data/rawd/social/networks/twitter_friends/tokens_by_month/'

# The leading greedy ".*" means the capture starts at the LAST occurrence of a
# listed host in the field.
OTHER_SHORTURL_RE =
  %r{.*(http://(?:1link.in|4url.cc|6url.com|adjix.com|ad.vu|bellypath.com|bkite.com|budurl.com|canurl.com|chod.sk|cli.gs|decenturl.com|dn.vc|doiop.com|dwarfurl.com|easyuri.com|easyurl.net|ff.im|go2cut.com|gonext.org|hulu.com|hypem.com|ifood.tv|ilix.in|is.gd|ix.it|jdem.cz|jijr.com|kissa.be|kurl.us|litturl.com|lnkurl.com|memurl.com|metamark.net|miklos.dk|minilien.com|minurl.org|muhlink.com|myurl.in|myurl.us|notlong.com|ow.ly|plexp.com|poprl.com|qurlyq.com|redirx.com|s3nt.com|shorterlink.com|shortlinks.co.uk|short.to|shorturl.com|shrinklink.co.uk|shrinkurl.us|shrt.st|shurl.net|simurl.com|shorl.com|smarturl.eu|snipr.com|snipurl.com|snurl.com|sn.vc|starturl.com|surl.co.uk|tighturl.com|timesurl.at|tiny123.com|tiny.cc|tinylink.com|tobtr.com|traceurl.com|tr.im|tweetburner.com|twitpwr.com|twitthis.com|twurl.nl|u.mavrev.com|ur1.ca|url9.com|urlborg.com|urlbrief.com|urlcover.com|urlcut.com|urlhawk.com|url-press.com|urlsmash.com|urltea.com|urlvi.be|vimeo.com|wlink.us|xaddr.com|xil.in|xrl.us|x.se|xs.md|yatuc.com|yep.it|yweb.com|zi.ma|w3t.org)/.+)}

bitly_re   = %r{.*(http://bit.ly/.+)}
tinyurl_re = %r{.*(http://tinyurl.com/.+)}

File.open('/home/doncarlo/shorturls/shorturls_bitly','w') do |bitly_file|
  File.open('/home/doncarlo/shorturls/shorturls_tinyurl','w') do |tinyurl_file|
    File.open('/home/doncarlo/shorturls/shorturls_other','w') do |otherurl_file|

      File.foreach(WORK_DIR + 'urls_by_month-20091111.tsv') do |line|
        # Last tab-separated field; nil for a blank line, which matches nothing.
        url = line.chomp.split("\t")[-1]
        next unless url

        if (found = bitly_re.match(url))          then bitly_file    << found[1] + "\n" end
        if (found = tinyurl_re.match(url))        then tinyurl_file  << found[1] + "\n" end
        if (found = OTHER_SHORTURL_RE.match(url)) then otherurl_file << found[1] + "\n" end
      end

    end
  end
end
21
+
@@ -1,25 +1,137 @@
1
1
  require 'net/http'
2
+ require 'oauth'
2
3
  Net::HTTP.version_1_2
3
4
  module Monkeyshines
4
5
  module Fetcher
5
6
 
6
7
  #
7
- class AuthedHttpFetcher
8
- cattr_accessor :auth_params
8
+ class AuthedHttpFetcher < HttpFetcher
9
+ attr_accessor :auth_params, :oauth_token, :oauth_secret, :consumer_key, :consumer_secret, :site, :authorize_path
10
+ #
11
+ # All the stuff below was copied from http://github.com/moomerman/twitter_oauth in the client.rb file
12
+ #
13
+ # def initialize(options = {})
14
+ # @consumer_key = options[:consumer_key]
15
+ # @consumer_secret = options[:consumer_secret]
16
+ # @token = options[:token]
17
+ # @secret = options[:secret]
18
+ # end
19
+ #
20
+ # def authorize(token, secret, options = {})
21
+ # request_token = OAuth::RequestToken.new(
22
+ # consumer, token, secret
23
+ # )
24
+ # @access_token = request_token.get_access_token(options)
25
+ # @token = @access_token.token
26
+ # @secret = @access_token.secret
27
+ # @access_token
28
+ # end
29
+ #
30
+ # def show(username)
31
+ # get("/users/show/#{username}.json")
32
+ # end
33
+ #
34
+ # # Returns the string "ok" in the requested format with a 200 OK HTTP status code.
35
+ # def test
36
+ # get("/help/test.json")
37
+ # end
38
+ #
39
+ # def request_token(options={})
40
+ # consumer.get_request_token(options)
41
+ # end
42
+ #
43
+ # def authentication_request_token(options={})
44
+ # consumer.options[:authorize_path] = '/oauth/authenticate'
45
+ # request_token(options)
46
+ # end
47
+ #
48
+ # private
49
+ #
50
+ # def consumer
51
+ # @consumer ||= OAuth::Consumer.new(
52
+ # @consumer_key,
53
+ # @consumer_secret,
54
+ # { :site => "http://api.twitter.com" }
55
+ # )
56
+ # end
57
+ #
58
+ # def access_token
59
+ # @access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
60
+ # end
61
+ #
62
+ # def get(path, headers={})
63
+ # headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
64
+ # oauth_response = access_token.get("/1#{path}", headers)
65
+ # JSON.parse(oauth_response.body)
66
+ # end
67
+ #
68
+ # def post(path, body='', headers={})
69
+ # headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
70
+ # oauth_response = access_token.post("/1#{path}", body, headers)
71
+ # JSON.parse(oauth_response.body)
72
+ # end
73
+ #
74
+ # def delete(path, headers={})
75
+ # headers.merge!("User-Agent" => "twitter_oauth gem v#{TwitterOAuth::VERSION}")
76
+ # oauth_response = access_token.delete("/1#{path}", headers)
77
+ # JSON.parse(oauth_response.body)
78
+ # end
9
79
 
10
- def get_request_token
80
+
81
+ def initialize _options={}
82
+ super _options
83
+ # These should get called by calling super, right?
84
+ # self.username = options[:username]
85
+ # self.password = options[:password]
86
+ # self.http_req_options = {}
87
+ # self.http_req_options["User-Agent"] = options[:user_agent] || USER_AGENT
88
+ # self.http_req_options["Connection"] = "keep-alive"
89
+ self.oauth_token = options[:oauth_token]
90
+ self.oauth_secret = options[:oauth_token_secret]
91
+ self.consumer_key = options[:consumer_key]
92
+ self.consumer_secret = options[:consumer_secret]
93
+ self.site = options[:site]
94
+ self.authorize_path = options[:authorize_path]
95
+ end
96
+
97
+ def request_token(options={})
98
+ consumer.options[:authorize_path] = @authorize_path
99
+ consumer.get_request_token(options)
11
100
  end
12
101
 
13
- def authorize
102
+ def authorize(token, secret, options = {})
103
+ request_token = OAuth::RequestToken.new(
104
+ consumer, token, secret
105
+ )
106
+ @access_token = request_token.get_access_token(options)
107
+ @token = @access_token.token
108
+ @secret = @access_token.secret
109
+ @access_token
14
110
  end
15
111
 
16
112
  def get_access_token
17
113
  end
18
114
 
19
- def api_key
115
+ def oauth_token
116
+ @oauth_token
117
+ end
118
+
119
+ def oauth_secret
120
+ @oauth_secret
121
+ end
122
+
123
+ def consumer
124
+ @consumer ||= OAuth::Consumer.new(
125
+ @consumer_key,
126
+ @consumer_secret,
127
+ { :site => @site }
128
+ )
20
129
  end
21
- def api_secret
130
+
131
+ def access_token
132
+ @access_token ||= OAuth::AccessToken.new(consumer, @token, @secret)
22
133
  end
134
+
23
135
  def session_key
24
136
  end
25
137
 
@@ -82,17 +82,18 @@ module Monkeyshines
82
82
  # Response-based sleep time
83
83
  sleep_time = 0
84
84
  case response
85
- when Net::HTTPSuccess then return # 2xx
86
- when Net::HTTPRedirection then return # 3xx
87
- when Net::HTTPBadRequest then sleep_time = 5 # 400 (rate limit, probably)
88
- when Net::HTTPUnauthorized then sleep_time = 0 # 401 (protected user, probably)
89
- when Net::HTTPForbidden then sleep_time = 4 # 403 update limit
90
- when Net::HTTPNotFound then sleep_time = 0 # 404 deleted
91
- when Net::HTTPServiceUnavailable then sleep_time = 15 # 503 Fail Whale
92
- when Net::HTTPServerError then sleep_time = 2 # 5xx All other server errors
85
+ when Net::HTTPSuccess then return # 2xx
86
+ when Net::HTTPRedirection then return # 3xx
87
+ when Net::HTTPBadRequest then sleep_time = 10 # 400 (rate limit, probably)
88
+ when Net::HTTPUnauthorized then sleep_time = 0 # 401 (protected user, probably)
89
+ when Net::HTTPForbidden then sleep_time = 10 # 403 update limit
90
+ when Net::HTTPNotFound then sleep_time = 0 # 404 deleted or suspended
91
+ when Net::HTTPServiceUnavailable then sleep_time = 10 # 503 Fail Whale
92
+ when Net::HTTPServerError then sleep_time = 2 # 5xx All other server errors
93
93
  else sleep_time = 1
94
94
  end
95
- Log.warn "Received #{response.code}, sleeping #{sleep_time} ('#{response.message[0..200].gsub(%r{[\r\n\t]}, " ")}' from #{@host}+#{@connection_opened_at})"
95
+ sleep_time += response['retry-after'].to_i rescue 0
96
+ Log.warn "Received #{response.code} and retry-after #{response['retry-after']}, sleeping #{sleep_time} ('#{response.message[0..200].gsub(%r{[\r\n\t]}, " ")}' from #{@host}+#{@connection_opened_at}): '#{response.body[0..200].gsub(%r{[\r\n\t]}, " ")}'"
96
97
  sleep sleep_time
97
98
  end
98
99
 
@@ -24,7 +24,8 @@ module Monkeyshines
24
24
  #
25
25
  def periodically &block
26
26
  super do
27
- result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%rate, (block ? block.call : nil) ].flatten.compact
27
+ now = Time.now.utc.to_f
28
+ result = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%inst_rate(now), (block ? block.call : nil) ].flatten.compact
28
29
  Log.info result.join("\t")
29
30
  end
30
31
  end
@@ -18,12 +18,13 @@ module Monkeyshines
18
18
  #
19
19
  class PeriodicMonitor
20
20
  attr_accessor :time_interval, :iter_interval
21
- attr_accessor :last_time, :iter, :started_at
21
+ attr_accessor :last_time, :current_iter, :iter, :started_at
22
22
 
23
23
  def initialize options={}
24
24
  self.started_at = Time.now.utc.to_f
25
25
  self.last_time = started_at
26
26
  self.iter = 0
27
+ self.current_iter = 0
27
28
  self.time_interval = options[:time]
28
29
  self.iter_interval = options[:iters]
29
30
  end
@@ -42,10 +43,14 @@ module Monkeyshines
42
43
  def since
43
44
  Time.now.utc.to_f - started_at
44
45
  end
45
- # Iterations per second
46
+ # Overall iterations per second
46
47
  def rate
47
48
  iter.to_f / since.to_f
48
49
  end
50
# "Instantaneous" iterations per second: iterations counted in current_iter
# divided by the seconds elapsed between last_time and +now+.
#
# now -- current time as float seconds, on the same clock as last_time.
#
# Returns 0.0 when no time has elapsed, rather than dividing by zero and
# handing Infinity or NaN to the caller.
def inst_rate now
  elapsed = (now-last_time).to_f
  return 0.0 unless elapsed > 0
  current_iter.to_f / elapsed
end
49
54
 
50
55
  #
51
56
  # if the interval conditions are met, executes block; otherwise just does
@@ -53,10 +58,12 @@ module Monkeyshines
53
58
  #
54
59
  def periodically &block
55
60
  self.iter += 1
61
+ self.current_iter += 1
56
62
  now = Time.now.utc.to_f
57
63
  if enough_iterations? || enough_time?(now)
58
64
  block.call(iter, (now-last_time))
59
65
  self.last_time = now
66
+ self.current_iter = 0
60
67
  end
61
68
  end
62
69
  end
@@ -1,3 +1,5 @@
1
+ require 'yaml'
2
+ require 'monkeyshines/runner_core/options'
1
3
  require 'monkeyshines/utils/trollop'
2
4
  module Monkeyshines
3
5
 
@@ -142,6 +142,7 @@ module Monkeyshines
142
142
  def setup_main_log
143
143
  unless options[:log][:dest].blank?
144
144
  log_file = "%s/log/%s" % [WORK_DIR, options[:log][:dest]]
145
+ require 'fileutils'
145
146
  FileUtils.mkdir_p(File.dirname(log_file))
146
147
  $stdout = $stderr = File.open( log_file+"-console.log", "a" )
147
148
  end
@@ -67,6 +67,12 @@ module Monkeyshines
67
67
  file << obj.to_flat.join("\t")+"\n"
68
68
  obj
69
69
  end
70
+
71
# Size in bytes of the file at +filename+, as currently reported by the
# filesystem.
#
# Returns 0 when no file handle has been set (@file is nil/false), and also
# when the path is missing or empty, so a caller that only wants a number for
# reporting never sees Errno::ENOENT.
def size
  return 0 unless @file
  # File.size? gives nil for a missing or zero-length file instead of raising.
  File.size?(filename) || 0
end
70
76
 
71
77
  def set key, *args, &block
72
78
  tok, obj = block.call
@@ -12,13 +12,14 @@ module Monkeyshines
12
12
  def initialize options
13
13
  raise "URI for #{self.class} is required" if options[:uri].blank?
14
14
  self.db_host, self.db_port = options[:uri].to_s.split(':')
15
+ self.db_host.gsub!(/^(localhost|127\.0\.0\.1)$/,'')
15
16
  super options
16
17
  end
17
18
 
18
19
  def db
19
20
  return @db if @db
20
21
  @db ||= TokyoTyrant::RDB.new
21
- @db.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
22
+ @db.open(db_host, db_port) or raise("Can't open DB at host #{db_host} port #{db_port}. Pass in host:port' #{@db.ecode}: #{@db.errmsg(@db.ecode)}")
22
23
  @db
23
24
  end
24
25
 
@@ -75,7 +75,7 @@ module Monkeyshines
75
75
 
76
76
  # Memoized: the hostname for the machine running this script.
77
77
  def hostname
78
- @hostname ||= ENV['HOSTNAME'] || `hostname`
78
+ @hostname ||= ENV['HOSTNAME'] || `hostname`.delete("\n")
79
79
  end
80
80
  # Memoized: the Process ID for this invocation.
81
81
  def pid
metadata CHANGED
@@ -1,7 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: monkeyshines
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ hash: 19
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 2
9
+ - 2
10
+ version: 0.2.2
5
11
  platform: ruby
6
12
  authors:
7
13
  - Philip (flip) Kromer
@@ -9,39 +15,51 @@ autorequire:
9
15
  bindir: bin
10
16
  cert_chain: []
11
17
 
12
- date: 2009-11-02 00:00:00 -06:00
18
+ date: 2010-07-15 00:00:00 +00:00
13
19
  default_executable:
14
20
  dependencies:
15
21
  - !ruby/object:Gem::Dependency
16
22
  name: addressable
17
- type: :runtime
18
- version_requirement:
19
- version_requirements: !ruby/object:Gem::Requirement
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
20
26
  requirements:
21
27
  - - ">="
22
28
  - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
23
32
  version: "0"
24
- version:
33
+ type: :runtime
34
+ version_requirements: *id001
25
35
  - !ruby/object:Gem::Dependency
26
36
  name: uuid
27
- type: :runtime
28
- version_requirement:
29
- version_requirements: !ruby/object:Gem::Requirement
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
30
40
  requirements:
31
41
  - - ">="
32
42
  - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
33
46
  version: "0"
34
- version:
47
+ type: :runtime
48
+ version_requirements: *id002
35
49
  - !ruby/object:Gem::Dependency
36
50
  name: wukong
37
- type: :runtime
38
- version_requirement:
39
- version_requirements: !ruby/object:Gem::Requirement
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
40
54
  requirements:
41
55
  - - ">="
42
56
  - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
43
60
  version: "0"
44
- version:
61
+ type: :runtime
62
+ version_requirements: *id003
45
63
  description: A simple scraper for directed scrapes of APIs, feed or structured HTML. Plays nicely with wuclan and wukong.
46
64
  email: flip@infochimps.org
47
65
  executables: []
@@ -64,12 +82,12 @@ files:
64
82
  - examples/shorturls/bulkload_shorturls.rb
65
83
  - examples/shorturls/extract_urls.rb
66
84
  - examples/shorturls/multiplex_shorturl_cache.rb
67
- - examples/shorturls/old/multidump_and_fix_shorturls.rb
68
- - examples/shorturls/old/shorturl_stats.rb
69
85
  - examples/shorturls/scrape_shorturls.rb
70
86
  - examples/shorturls/shorturl_request.rb
71
87
  - examples/shorturls/shorturl_sequence.rb
72
88
  - examples/shorturls/shorturl_start_tyrant.sh
89
+ - examples/shorturls/shorturl_stats.rb
90
+ - examples/shorturls/split_short_urls.rb
73
91
  - examples/shorturls/start_shorturl_cache.sh
74
92
  - lib/monkeyshines.rb
75
93
  - lib/monkeyshines/extensions.rb
@@ -139,37 +157,43 @@ rdoc_options:
139
157
  require_paths:
140
158
  - lib
141
159
  required_ruby_version: !ruby/object:Gem::Requirement
160
+ none: false
142
161
  requirements:
143
162
  - - ">="
144
163
  - !ruby/object:Gem::Version
164
+ hash: 3
165
+ segments:
166
+ - 0
145
167
  version: "0"
146
- version:
147
168
  required_rubygems_version: !ruby/object:Gem::Requirement
169
+ none: false
148
170
  requirements:
149
171
  - - ">="
150
172
  - !ruby/object:Gem::Version
173
+ hash: 3
174
+ segments:
175
+ - 0
151
176
  version: "0"
152
- version:
153
177
  requirements: []
154
178
 
155
179
  rubyforge_project:
156
- rubygems_version: 1.3.5
180
+ rubygems_version: 1.3.7
157
181
  signing_key:
158
182
  specification_version: 3
159
183
  summary: A simple scraper for directed scrapes of APIs, feed or structured HTML.
160
184
  test_files:
161
185
  - spec/monkeyshines_spec.rb
162
186
  - spec/spec_helper.rb
163
- - examples/bulk_urls/scrape_bulk_urls.rb
164
- - examples/rename_tree/rename_hdp_tree.rb
165
- - examples/rename_tree/rename_ripd_tree.rb
166
- - examples/rss_feeds/scrape_rss_feeds.rb
167
- - examples/shorturls/bulkdump_shorturls.rb
168
- - examples/shorturls/bulkload_shorturls.rb
169
- - examples/shorturls/extract_urls.rb
187
+ - examples/shorturls/shorturl_stats.rb
188
+ - examples/shorturls/shorturl_request.rb
170
189
  - examples/shorturls/multiplex_shorturl_cache.rb
171
- - examples/shorturls/old/multidump_and_fix_shorturls.rb
172
- - examples/shorturls/old/shorturl_stats.rb
190
+ - examples/shorturls/bulkload_shorturls.rb
173
191
  - examples/shorturls/scrape_shorturls.rb
174
- - examples/shorturls/shorturl_request.rb
192
+ - examples/shorturls/extract_urls.rb
193
+ - examples/shorturls/bulkdump_shorturls.rb
175
194
  - examples/shorturls/shorturl_sequence.rb
195
+ - examples/shorturls/split_short_urls.rb
196
+ - examples/rename_tree/rename_hdp_tree.rb
197
+ - examples/rename_tree/rename_ripd_tree.rb
198
+ - examples/rss_feeds/scrape_rss_feeds.rb
199
+ - examples/bulk_urls/scrape_bulk_urls.rb
@@ -1,66 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
3
- require 'rubygems'
4
- require 'trollop'
5
- require 'wukong'
6
- require 'monkeyshines'
7
- require 'shorturl_request'
8
- require 'shorturl_sequence'
9
- require 'monkeyshines/utils/uri'
10
-
11
- #
12
- # Command line options
13
- #
14
- opts = Trollop::options do
15
- opt :from_type, 'Class name for scrape store to load from', :type => String
16
- opt :from, 'URI for scrape store to load from', :type => String
17
- opt :into, 'Filename for flat TSV dump', :type => String
18
- opt :log, 'File to store log', :type => String
19
- end
20
- Trollop::die :from_type unless opts[:from_type]
21
-
22
- # ******************** Read From ********************
23
- src_store_klass = Wukong.class_from_resource('Monkeyshines::Store::'+opts[:from_type])
24
- src_store = src_store_klass.new(opts[:from])
25
- Log.info "Loaded store with #{src_store.size}"
26
-
27
- # ******************** Write into ********************
28
- DUMPFILE_BASE = opts[:into]
29
- def make_store uri
30
- Monkeyshines::Store::FlatFileStore.new "#{DUMPFILE_BASE+"-"+uri}.tsv", :filemode => 'w'
31
- end
32
- dests = { }
33
- [ 'tinyurl', 'bitly', 'other'
34
- ].each do |handle|
35
- dests[handle] = make_store handle
36
- end
37
-
38
- # ******************** Log ********************
39
- periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 20_000, :time => 30)
40
-
41
- # ******************** Cross Load ********************
42
- # Read , process, dump
43
- iter = 0
44
- src_store.each do |key, hsh|
45
- hsh['contents'] ||= hsh.delete 'expanded_url'
46
- hsh['response_code'] = nil if hsh['response_code'] == 'nil'
47
- hsh['contents'] = nil if hsh['contents'] == 'nil'
48
- unless hsh['contents'] || hsh['response_code']
49
- # Log.info "removing #{hsh.inspect}"
50
- src_store.db.out(key)
51
- next
52
- end
53
- hsh['response_message'] = nil if hsh['response_message'] == 'nil'
54
- hsh['url'] ||= hsh.delete 'short_url'
55
- req = ShorturlRequest.from_hash hsh
56
- periodic_log.periodically{ [src_store.size, req.to_flat] }
57
-
58
- req.contents = Addressable::URI.scrub_url req.contents if req.contents
59
-
60
- case
61
- when (key =~ %r{^http://tinyurl.com/(.*)}) then dests['tinyurl'].save req
62
- when (key =~ %r{^http://bit.ly/(.*)}) then dests['bitly' ].save req
63
- else dests['other' ].save req
64
- end
65
- # src_store.save(key, req.to_hash.compact)
66
- end
@@ -1,81 +0,0 @@
1
- #!/usr/bin/env ruby
2
- $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
3
- #require 'rubygems'
4
- # require 'wukong'
5
- require 'monkeyshines'
6
- # require 'monkeyshines/utils/uri'
7
- # require 'monkeyshines/utils/filename_pattern'
8
- # require 'monkeyshines/store/conditional_store'
9
- # require 'monkeyshines/fetcher/http_head_fetcher'
10
- # require 'trollop' # gem install trollop
11
- # require 'shorturl_request'
12
- require 'shorturl_sequence'
13
-
14
- digits = { } ; (('0'..'9').to_a+('a'..'z').to_a).each do |ch| digits[ch] = 0 end
15
-
16
- # (1..10000).each do |idx|
17
- # s = ShorturlSequence.encode_integer idx, 36
18
- # digits[s[0..0]] += 1
19
- # end
20
- # p digits
21
- # puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}
22
-
23
- class Histo
24
- attr_accessor :buckets
25
- def initialize
26
- self.buckets = { }
27
- end
28
- def << val
29
- buckets[val] ||= 0
30
- buckets[val] += 1
31
- end
32
- def dump
33
- buckets.sort.each do |val, count|
34
- puts "%10d\t%s"%[count,val]
35
- end
36
- end
37
- end
38
-
39
- len_histo = Histo.new
40
- num_histo = Histo.new
41
- ltr_histo = Histo.new
42
- iter = 0
43
-
44
- # 123456789-123456789-
45
- # http://bit.ly/
46
- # http://tinyurl.com/
47
- BASE_URL = "http://is.gd/"
48
- RADIX = 62
49
- HANDLE = BASE_URL.gsub(%r{^http://},'').gsub(/\.com$/,'').gsub(/\W+/,'')
50
- BASE_URL_LEN = BASE_URL.length
51
- MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
52
- SIX_CHARS = RADIX**6
53
- File.open("rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv"
54
- ) do |reqfile|
55
- reqfile.each do |url|
56
- #decode
57
- next unless url.length <= MAX_TAIL_LEN
58
- tail = url.chomp.strip[BASE_URL_LEN..-1] || ''
59
- # tail.downcase!
60
- asnum = ShorturlSequence.decode_str tail, RADIX rescue nil # tail.to_i(36) rescue -1
61
- next unless asnum && asnum < SIX_CHARS
62
- size = (asnum / 1_000_000)
63
- len = tail.length
64
- # track stats
65
- len_histo << len
66
- num_histo << size
67
- ltr_histo << "%s-%s" % [len, tail[0..0]] # + (len > 1 ? '.'* (len-1) : '')
68
- puts iter if ((iter += 1) % 1_000_000 == 0)
69
-
70
- end
71
- end
72
- puts "Integer magnitude of decoded (M):"
73
- num_histo.dump
74
- puts "Length of encoded:"
75
- len_histo.dump
76
- puts "First Letter:"
77
- ltr_histo.dump
78
-
79
-
80
- # puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
81
- # puts [asnum, tail, url].inspect