monkeyshines 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
@@ -0,0 +1,81 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
+
#require 'rubygems'
|
4
|
+
# require 'wukong'
|
5
|
+
require 'monkeyshines'
|
6
|
+
# require 'monkeyshines/utils/uri'
|
7
|
+
# require 'monkeyshines/utils/filename_pattern'
|
8
|
+
# require 'monkeyshines/store/conditional_store'
|
9
|
+
# require 'monkeyshines/fetcher/http_head_fetcher'
|
10
|
+
# require 'trollop' # gem install trollop
|
11
|
+
# require 'shorturl_request'
|
12
|
+
require 'shorturl_sequence'
|
13
|
+
|
14
|
+
digits = Hash[(('0'..'9').to_a + ('a'..'z').to_a).map { |chr| [chr, 0] }] # zeroed counter for each base-36 digit
|
15
|
+
|
16
|
+
# (1..10000).each do |idx|
|
17
|
+
# s = ShorturlSequence.encode_integer idx, 36
|
18
|
+
# digits[s[0..0]] += 1
|
19
|
+
# end
|
20
|
+
# p digits
|
21
|
+
# puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}
|
22
|
+
|
23
|
+
# Minimal counting histogram: tallies how many times each value was recorded.
class Histo
  # Hash mapping each recorded value to the number of times it was seen.
  attr_accessor :buckets

  def initialize
    @buckets = {}
  end

  # Record one occurrence of +val+. Evaluates to the updated count for +val+.
  def << val
    buckets[val] = (buckets[val] || 0) + 1
  end

  # Print one "count<TAB>value" line per bucket, in sorted bucket order.
  def dump
    buckets.sort.each { |val, count| puts format("%10d\t%s", count, val) }
  end
end
|
38
|
+
|
39
|
+
# One histogram each for: token length, decoded magnitude (in millions),
# and "length-first character" of the token.
len_histo, num_histo, ltr_histo = Histo.new, Histo.new, Histo.new
iter = 0

#          123456789-123456789-
#          http://bit.ly/
#          http://tinyurl.com/
BASE_URL     = "http://is.gd/"
RADIX        = 62
# Handle used in the request file name, derived from BASE_URL.
HANDLE       = BASE_URL.gsub(%r{^http://},'').gsub(/\.com$/,'').gsub(/\W+/,'')
BASE_URL_LEN = BASE_URL.length
MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
SIX_CHARS    = RADIX**6

request_path = "rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv"
File.open(request_path) do |reqfile|
  reqfile.each do |line|
    # Ignore lines longer than the base url plus a short token.
    next if line.length > MAX_TAIL_LEN
    tail = line.chomp.strip[BASE_URL_LEN..-1] || ''
    # tail.downcase!
    # Any error raised while decoding makes the line count as undecodable.
    asnum = begin
      ShorturlSequence.decode_str(tail, RADIX) # tail.to_i(36) rescue -1
    rescue
      nil
    end
    next unless asnum && asnum < SIX_CHARS

    # track stats
    len = tail.length
    len_histo << len
    num_histo << (asnum / 1_000_000)
    ltr_histo << ("%s-%s" % [len, tail[0..0]]) # + (len > 1 ? '.'* (len-1) : '')

    # Progress marker every million counted lines.
    iter += 1
    puts iter if iter % 1_000_000 == 0
  end
end

[
  ["Integer magnitude of decoded (M):", num_histo],
  ["Length of encoded:",                len_histo],
  ["First Letter:",                     ltr_histo],
].each do |title, histo|
  puts title
  histo.dump
end
|
78
|
+
|
79
|
+
|
80
|
+
# puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
|
81
|
+
# puts [asnum, tail, url].inspect
|
@@ -0,0 +1,112 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong'
|
5
|
+
require 'monkeyshines'
|
6
|
+
#
|
7
|
+
require 'shorturl_request'
|
8
|
+
require 'shorturl_sequence'
|
9
|
+
require 'monkeyshines/utils/uri'
|
10
|
+
require 'monkeyshines/utils/filename_pattern'
|
11
|
+
require 'monkeyshines/store/conditional_store'
|
12
|
+
require 'monkeyshines/fetcher/http_head_fetcher'
|
13
|
+
require 'trollop' # gem install trollop
|
14
|
+
|
15
|
+
# ===========================================================================
|
16
|
+
#
|
17
|
+
# scrape_shorturls.rb --
|
18
|
+
#
|
19
|
+
# To scrape from a list of shortened urls:
|
20
|
+
#
|
21
|
+
# ./scrape_shorturls.rb --base-url=tinyurl.com --from=request_urls.tsv
|
22
|
+
#
|
23
|
+
# To do a random scrape:
|
24
|
+
#
|
25
|
+
# ./scrape_shorturls.rb --random --base-url=tinyurl.com
|
26
|
+
# --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding-radix=
|
27
|
+
#
|
28
|
+
#
|
29
|
+
# Command-line options. --base-url is mandatory.
opts = Trollop::options do
  opt :base_url,       "Host part of URL: eg tinyurl.com", :type => String, :required => true
  opt :log,            "Log file name; leave blank to use STDERR", :type => String
  # input from file
  opt :from,           "Location of URLs to scrape", :type => String
  opt :skip,           "Initial lines to skip", :type => Integer
  # OR do a random walk
  opt :random,         "Generate and visit random URL suffixes"
  opt :min_limit,      "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
  opt :max_limit,      "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
  opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
  # output storage
  opt :cache_loc,      "URI for cache server", :type => String
  opt :chunk_time,     "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
  opt :dest_dir,       "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
  opt :dest_pattern,   "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
end

# Short identifier for the shortener: 'tinyurl.com' => 'tinyurl', 'bit.ly' => 'bitly'.
# A leading http:// and any trailing slashes are dropped first, so that
# 'http://tinyurl.com/' produces the same handle as the bare host 'tinyurl.com'.
handle = opts[:base_url].sub(%r{\Ahttp://}, '').sub(%r{/+\z}, '').gsub(/\.com$/,'').gsub(/\W+/,'')
|
47
|
+
|
48
|
+
# ******************** Log ********************
|
49
|
+
# An explicitly empty --log value is replaced by a timestamped file under WORK_DIR/log.
if opts[:log] == ''
  opts[:log] = WORK_DIR + "/log/shorturls_#{handle}-#{Time.now.to_flat}.log"
end
# Progress reporter, configured with :iters => 10000 and :time => 30.
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 10000, :time => 30)
|
51
|
+
|
52
|
+
#
|
53
|
+
# ******************** Load from store or random walk ********************
|
54
|
+
#
|
55
|
+
# Choose the request source: a flat file when --from is given (optionally
# skipping --skip leading lines), otherwise a random walk when --random is
# given. With neither, bail out with a usage error.
src_store =
  if opts[:from]
    file_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
    file_store.skip!(opts[:skip].to_i) if opts[:skip]
    file_store
  elsif opts[:random]
    Monkeyshines::Store::RandomUrlStream.new_from_command_line(opts)
  else
    Trollop::die "Need to either say --random or --from=filename"
  end
|
63
|
+
|
64
|
+
#
|
65
|
+
# ******************** Store output ********************
|
66
|
+
#
|
67
|
+
# Track visited URLs with key-value database
|
68
|
+
#
|
69
|
+
# Cache server address for each known handle.
RDB_PORTS = {
  'tinyurl' => "localhost:10042",
  'bitly'   => "localhost:10043",
  'other'   => "localhost:10044",
}
# An explicit --cache-loc wins; otherwise look the handle up in RDB_PORTS.
cache_loc = opts[:cache_loc] || RDB_PORTS[handle]
raise "Need a handle (bitly, tinyurl or other)." unless cache_loc
dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(cache_loc)
# dest_cache = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)

#
# Store the data into flat files
#
pattern_tokens = { :handle => "shorturl-#{handle}", :dest_dir => opts[:dest_dir] }
dest_pattern   = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern], pattern_tokens)
chunk_seconds  = opts[:chunk_time].to_i
dest_files     = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern, chunk_seconds, opts)

#
# Conditional store uses the key-value DB to boss around the flat files --
# requests are only made (and thus data is only output) if the url is missing
# from the key-value store.
#
dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
|
88
|
+
|
89
|
+
#
|
90
|
+
# ******************** Fetcher ********************
|
91
|
+
#
|
92
|
+
fetcher = Monkeyshines::Fetcher::HttpHeadFetcher.new

#
# ******************** Do this thing ********************
#
Log.info "Beginning scrape itself"
src_store.each do |bareurl, *args|
  # prepare the request; urls on the hard-coded host list below are skipped
  next if bareurl =~ %r{\Ahttp://(poprl.com|short.to|timesurl.at|bkite.com)}
  req = ShorturlRequest.new(bareurl, *args)

  # conditional store only calls fetcher if url key is missing.
  result = dest_store.set(req.url) do
    response = fetcher.get(req) # do the url fetch
    # A response with neither a code nor contents is a bad fetch: the block
    # then evaluates to nil instead of a [timestamp, response] pair.
    if response.response_code || response.contents
      [response.scraped_at, response] # timestamp into cache, result into flat file
    end
  end

  periodic_log.periodically do
    [format("%7d", dest_store.misses), 'misses', dest_store.size, req.response_code, result, req.url]
  end
end
dest_store.close
fetcher.close
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#
# One request against a url shortener: the short url, when it was scraped,
# the response status, and the contents (the redirect destination).
#
class ShorturlRequest < Struct.new(
    :url,
    :scraped_at,
    :response_code, :response_message,
    :contents
    )
  alias_method :short_url=,    :url=
  alias_method :expanded_url=, :contents=
  alias_method :expanded_url,  :contents

  #
  # All we care about is the redirect destination.
  #
  # +response+ is anything indexable by header name; its "location" entry
  # becomes the contents.
  #
  def response= response
    self.contents = response["location"]
  end

  #
  # The major shortening services
  #
  # Do any of the mainstream shorteners use in-band characters besides \w
  # alphanum and - dash?  (idek.net uses a ~ and pastoid.com a + but they
  # are not popular enough to justify the annoyance of allowing extra
  # chars).
  #
  SHORTURL_HOSTS = %w[
    1link.in 4url.cc 6url.com adjix.com ad.vu bellypath.com bit.ly bkite.com
    budurl.com canurl.com chod.sk cli.gs decenturl.com dn.vc doiop.com
    dwarfurl.com easyuri.com easyurl.net ff.im go2cut.com gonext.org hulu.com
    hypem.com ifood.tv ilix.in is.gd ix.it jdem.cz jijr.com kissa.be kurl.us
    litturl.com lnkurl.com memurl.com metamark.net miklos.dk minilien.com
    minurl.org muhlink.com myurl.in myurl.us notlong.com ow.ly plexp.com
    poprl.com qurlyq.com redirx.com s3nt.com shorterlink.com shortlinks.co.uk
    short.to shorturl.com shrinklink.co.uk shrinkurl.us shrt.st shurl.net
    simurl.com shorl.com smarturl.eu snipr.com snipurl.com snurl.com sn.vc
    starturl.com surl.co.uk tighturl.com timesurl.at tiny123.com tiny.cc
    tinylink.com tinyurl.com tobtr.com traceurl.com tr.im tweetburner.com
    twitpwr.com twitthis.com twurl.nl u.mavrev.com ur1.ca url9.com urlborg.com
    urlbrief.com urlcover.com urlcut.com urlhawk.com url-press.com
    urlsmash.com urltea.com urlvi.be vimeo.com wlink.us xaddr.com xil.in
    xrl.us x.se xs.md yatuc.com yep.it yweb.com zi.ma w3t.org
  ].freeze

  # Matches "http://<known host>/<at least one character>". Host names are
  # escaped via Regexp.union so that a '.' only matches a literal dot (an
  # unescaped 'u.mavrev.com' would also match 'u-mavrev.com').
  SHORTURL_RE = %r{\Ahttp://(?:#{Regexp.union(SHORTURL_HOSTS).source})/.}

  # Truthy (the match index) when +url+ points at one of the known
  # shorteners with a non-empty path, nil otherwise.
  def self.is_shorturl? url
    url.to_s =~ SHORTURL_RE
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
#
# Base-62 conversion using the digit order 0-9, a-z, A-Z
# ('0' => 0, 'a' => 10, 'A' => 36, 'Z' => 61).
#
module Base62
  # http://refactormycode.com/codes/125-base-62-encoding
  BASE62_CHARS = (('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a).freeze
  # Reverse lookup: digit character => digit value.
  BASE62_MAP   = {}
  BASE62_CHARS.each_with_index { |ch, num| BASE62_MAP[ch] = num }
  BASE62_MAP.freeze

  # Encode the non-negative integer +i+ as a base-62 string.
  #
  # Raises ArgumentError for a negative +i+, which has no encoding in this
  # scheme (it used to come back as an empty string).
  def self.i_to_s i
    raise ArgumentError, "Can't base62-encode negative number #{i}" if i < 0
    return '0' if i == 0
    digits = []
    while i > 0
      i, remainder = i.divmod(62)
      digits.unshift BASE62_CHARS[remainder]
    end
    digits.join
  end

  # Decode the base-62 string +str+ into an integer; '' decodes to 0.
  #
  # Raises ArgumentError when +str+ holds a character outside 0-9a-zA-Z.
  def self.s_to_i str
    str.each_char.inject(0) do |total, ch|
      digit = BASE62_MAP[ch]
      raise ArgumentError, "Invalid base62 character #{ch.inspect} in #{str.inspect}" if digit.nil?
      total * 62 + digit
    end
  end
end
|
24
|
+
|
25
|
+
#
# Converts between sequence numbers and shortened-url tokens for the two
# supported radixes: 36 (via Integer#to_s / String#to_i) and 62 (via Base62).
#
module ShorturlSequence
  # Encode the integer +i+ as a token in the given +radix+ (36 or 62,
  # given as a number or a string). Raises for any other radix.
  def self.encode_integer i, radix
    case radix.to_s
    when '36' then i.to_s(36)
    when '62' then Base62.i_to_s(i)
    else
      raise "Can't encode into base #{radix}"
    end
  end

  # Decode the token +s+ from the given +radix+ (36 or 62) into an integer.
  # Trailing non-word characters (slash, newline, ...) are stripped first.
  # Raises for any other radix.
  def self.decode_str s, radix
    s = s.gsub(%r{\W+$},'')
    case radix.to_s
    when '36' then s.to_i(36)
    when '62' then Base62.s_to_i(s) # was passed the undefined local `i`
    else
      raise "Can't decode from base #{radix}"
    end
  end
end
|
45
|
+
|
46
|
+
# A shortened url: a service base url plus the token identifying one link.
class Shorturl
  attr_accessor :base_url
  attr_accessor :token

  def initialize token
    self.token = token
  end
end

# A Shorturl whose token is a base-62 number.
#
# Inherits from Shorturl so that +token+, +base_url+ and the one-argument
# constructor these methods rely on are actually available.
class Shorturl62 < Shorturl
  # Integer value of the base-62 token.
  def to_i
    Base62.s_to_i token
  end

  def to_s
    url
  end

  # Full url: base url and token joined by a slash.
  def url
    "#{base_url}/#{token}"
  end
end

# Base-62 short url with the base url fixed to http://is.gd
class IsgdShorturl < Shorturl62
  def base_url
    'http://is.gd'
  end
end
|
73
|
+
|
74
|
+
#
# Walks the shortened urls of one service in index order, from min_limit
# up to max_limit.
#
class Monkeyshines::Store::SequentialUrlStream
  # Upper index used when no max_limit is supplied.
  DEFAULT_MAX_URLSTR = '1zzzzz'.to_i(36)
  # Encoding radix to fall back on, keyed by normalized base url.
  DEFAULT_RADIX = {
    'http://tinyurl.com/' => 36,
    'http://bit.ly/'      => 62,
    'http://is.gd/'       => 62,
  }
  attr_accessor :base_url, :min_limit, :span, :encoding_radix

  # base_url::       host or url of the service; normalized by fix_url
  # min_limit::      first index to visit (default 0)
  # max_limit::      upper index (default DEFAULT_MAX_URLSTR)
  # encoding_radix:: 36 or 62; looked up in DEFAULT_RADIX when omitted
  #
  # Raises unless the resulting radix is 36 or 62.
  def initialize base_url, min_limit=0, max_limit=nil, encoding_radix=nil
    self.base_url       = self.class.fix_url(base_url)
    self.min_limit      = min_limit.to_i
    max_limit         ||= DEFAULT_MAX_URLSTR
    self.span           = max_limit.to_i - self.min_limit
    self.encoding_radix = (encoding_radix || DEFAULT_RADIX[self.base_url]).to_i
    raise "Please specify either encoding_radix of 36 or 62" unless [36, 62].include?(self.encoding_radix)
  end

  # Ensure +url+ starts with 'http://' and ends with '/'.
  def self.fix_url url
    url = 'http://' + url unless (url[0..6]=='http://')
    url = url + '/'       unless (url[-1..-1]=='/')
    url
  end

  # Upper index of the range. Only min_limit and span are stored, so it is
  # derived from them (each used to call this name without it being defined).
  def max_limit
    min_limit + span
  end

  # The url for sequence number +idx+: base url plus the encoded token.
  def url_for idx
    base_url + ShorturlSequence.encode_integer(idx, encoding_radix)
  end

  # Yields the url for every index from min_limit through max_limit, in order.
  # Without a block, returns an enumerator over the same urls.
  def each *args, &block
    return to_enum(:each, *args) unless block
    (min_limit..max_limit).each { |idx| block.call(url_for(idx)) }
  end

  # Build a stream from parsed command-line options merged over
  # +default_opts+; dies with a usage error when :base_url is blank.
  def self.new_from_command_line cmdline_opts, default_opts={}
    options = default_opts.merge(cmdline_opts)
    Trollop::die :base_url if options[:base_url].blank?
    self.new *options.values_of(:base_url, :min_limit, :max_limit, :encoding_radix)
  end
end
|
108
|
+
|
109
|
+
#
# Same configuration as SequentialUrlStream, but visits randomly chosen
# indexes instead of walking the range in order.
#
class Monkeyshines::Store::RandomUrlStream < Monkeyshines::Store::SequentialUrlStream
  # An infinite stream of urls in range: keeps yielding a freshly drawn url
  # on every pass.
  def each *args, &block
    loop { yield url_in_range }
  end

  # One random url: base url plus the encoded token for an index drawn as
  # rand(span) offset by min_limit.
  def url_in_range
    offset = rand(span)
    base_url + ShorturlSequence.encode_integer(offset + min_limit, encoding_radix)
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# Directory containing this script; the database files live in its distdb/ subdirectory.
# "$0" is quoted so a path containing spaces survives.
script_dir=$(dirname "$0")

# nohup ttserver -port 10001 "$script_dir/distdb/shorturl_scrapes-tinyurl.tct#bnum=40000000#opts=l" 2>&1 >> log/ttserver-shorturl_scrapes-tinyurl+`date "+%Y%m%d"`.log &
# nohup ttserver -port 10002 "$script_dir/distdb/shorturl_scrapes-bitly.tct#bnum=20000000#opts=l" 2>&1 >> log/ttserver-shorturl_scrapes-bitly+`date "+%Y%m%d"`.log &
# nohup ttserver -port 10003 "$script_dir/distdb/shorturl_scrapes-other.tct#bnum=20000000#opts=l" 2>&1 >> log/ttserver-shorturl_scrapes-other+`date "+%Y%m%d"`.log &

#
# Start shorturl readthru cache TokyoTyrant servers
#
# Redirections are processed left to right, so ">> file" must come before
# "2>&1" for stderr to land in the log file together with stdout.
nohup ttserver -port 10042 "$script_dir/distdb/shorturl_reqs-tinyurl.tch#bnum=40000000#opts=l" >> log/ttserver-shorturl_reqs-tinyurl+`date "+%Y%m%d"`.log 2>&1 &
nohup ttserver -port 10043 "$script_dir/distdb/shorturl_reqs-bitly.tch#bnum=20000000#opts=l" >> log/ttserver-shorturl_reqs-bitly+`date "+%Y%m%d"`.log 2>&1 &
nohup ttserver -port 10044 "$script_dir/distdb/shorturl_reqs-other.tch#bnum=20000000#opts=l" >> log/ttserver-shorturl_reqs-other+`date "+%Y%m%d"`.log 2>&1 &
|
15
|
+
|
16
|
+
# nohup ttserver -port 10069 "$script_dir/distdb/shorturl_reqs-foo.tch#bnum=40000000#opts=l" 2>&1 >> log/ttserver-shorturl_reqs-tinyurl+`date "+%Y%m%d"`.log &
|
data/lib/monkeyshines.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'monkeyshines/extensions'
|
2
|
+
require 'monkeyshines/utils/logger'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/extensions/pathname'
|
5
|
+
require 'monkeyshines/utils/factory_module'
|
6
|
+
require 'monkeyshines/utils/uri'
|
7
|
+
require 'monkeyshines/utils/filename_pattern'
|
8
|
+
require 'monkeyshines/options'
|
9
|
+
require 'monkeyshines/scrape_request'
|
10
|
+
|
11
|
+
module Monkeyshines
  # Register each constant for autoloading from the file listed next to it.
  [
    [:ScrapeRequest,     'monkeyshines/scrape_request'],
    [:ScrapeRequestCore, 'monkeyshines/scrape_request'],
    [:RequestStream,     'monkeyshines/request_stream'],
    [:Store,             'monkeyshines/store'],
    [:Fetcher,           'monkeyshines/fetcher'],
    [:Monitor,           'monkeyshines/monitor'],
    [:Runner,            'monkeyshines/runner'],
    [:RawJsonContents,   'monkeyshines/scrape_request/raw_json_contents'],
  ].each { |const_name, path| autoload const_name, path }

  # Dumping ground for configuration values; left alone if CONFIG is already defined.
  CONFIG = {} unless defined?(CONFIG)

end
|
25
|
+
|
26
|
+
#
|
27
|
+
# A convenient logger.
|
28
|
+
#
|
29
|
+
# Define NO_MONKEYSHINES_LOG (or define Log yourself) to prevent its creation
|
30
|
+
#
|
31
|
+
Log = Monkeyshines.logger if !defined?(Log) && !defined?(NO_MONKEYSHINES_LOG) # skipped when either constant already exists
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class Numeric
  # Restrict the receiver to the interval [min, max].
  #
  # min:: lower bound, or nil for no lower bound. May also be a Range when
  #       +max+ is omitted, matching the standard Comparable#clamp(range)
  #       form that this method would otherwise shadow.
  # max:: upper bound; nil (or omitted) for no upper bound.
  #
  # Returns +min+ when the receiver is <= min, +max+ when it is >= max, and
  # the receiver itself otherwise. Raises ArgumentError for a Range that
  # excludes a non-nil end.
  def clamp min, max=nil
    if max.nil? && min.is_a?(Range)
      range = min
      if range.exclude_end? && !range.end.nil?
        raise ArgumentError, "cannot clamp with an exclusive range"
      end
      min, max = range.begin, range.end
    end
    return min if min && (self <= min)
    return max if max && (self >= max)
    self
  end
end
|
8
|
+
|
9
|
+
|
10
|
+
class Hash
  # Fold all the given hashes together with deep_merge, left to right,
  # starting from an empty hash. Returns {} when called with no arguments.
  def self.deep_sum *args
    merged = {}
    args.each { |options| merged = merged.deep_merge(options) }
    merged
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module Monkeyshines
  # Namespace for the fetcher classes; each one is autoloaded from its own file.
  module Fetcher
    extend FactoryModule

    {
      :Base            => 'monkeyshines/fetcher/base',
      :FakeFetcher     => 'monkeyshines/fetcher/fake_fetcher',
      :HttpFetcher     => 'monkeyshines/fetcher/http_fetcher',
      :HttpHeadFetcher => 'monkeyshines/fetcher/http_head_fetcher',
    }.each { |const_name, path| autoload const_name, path }

  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
Net::HTTP.version_1_2
|
3
|
+
module Monkeyshines
  module Fetcher

    #
    # Skeleton of a fetcher that authenticates its requests. Every hook
    # below has an empty body and therefore returns nil.
    #
    class AuthedHttpFetcher
      cattr_accessor :auth_params

      def get_request_token; end

      def authorize; end

      def get_access_token; end

      def api_key; end

      def api_secret; end

      def session_key; end

      # authenticate request: asks for a session key when none is present.
      # NOTE(review): get_session_key is not defined anywhere in this class --
      # confirm where it is expected to come from.
      def authenticate req
        return if session_key
        get_session_key
      end

    end

  end
end
|