monkeyshines 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
@@ -0,0 +1,81 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
+
#require 'rubygems'
|
4
|
+
# require 'wukong'
|
5
|
+
require 'monkeyshines'
|
6
|
+
# require 'monkeyshines/utils/uri'
|
7
|
+
# require 'monkeyshines/utils/filename_pattern'
|
8
|
+
# require 'monkeyshines/store/conditional_store'
|
9
|
+
# require 'monkeyshines/fetcher/http_head_fetcher'
|
10
|
+
# require 'trollop' # gem install trollop
|
11
|
+
# require 'shorturl_request'
|
12
|
+
require 'shorturl_sequence'
|
13
|
+
|
14
|
+
digits = { } ; (('0'..'9').to_a+('a'..'z').to_a).each do |ch| digits[ch] = 0 end
|
15
|
+
|
16
|
+
# (1..10000).each do |idx|
|
17
|
+
# s = ShorturlSequence.encode_integer idx, 36
|
18
|
+
# digits[s[0..0]] += 1
|
19
|
+
# end
|
20
|
+
# p digits
|
21
|
+
# puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}
|
22
|
+
|
23
|
+
#
# Minimal counting histogram: tallies how many times each value was pushed.
#
class Histo
  # Hash mapping each observed value to the number of times it was pushed.
  attr_accessor :buckets

  def initialize
    self.buckets = { }
  end

  #
  # Record one occurrence of +val+.
  #
  # Returns the histogram itself so that pushes can be chained
  # (histo << a << b), which is the usual contract for <<.
  #
  def <<(val)
    buckets[val] = (buckets[val] || 0) + 1
    self
  end

  #
  # Write one "count<TAB>value" line per bucket, ordered by value.
  #
  # +io+ receives the lines (anything responding to #puts); it defaults to
  # $stdout, which is where the report was always printed before.
  #
  def dump(io = $stdout)
    buckets.sort.each do |val, count|
      io.puts "%10d\t%s" % [count, val]
    end
  end
end
|
38
|
+
|
39
|
+
len_histo = Histo.new
|
40
|
+
num_histo = Histo.new
|
41
|
+
ltr_histo = Histo.new
|
42
|
+
iter = 0
|
43
|
+
|
44
|
+
# 123456789-123456789-
|
45
|
+
# http://bit.ly/
|
46
|
+
# http://tinyurl.com/
|
47
|
+
BASE_URL = "http://is.gd/"
|
48
|
+
RADIX = 62
|
49
|
+
HANDLE = BASE_URL.gsub(%r{^http://},'').gsub(/\.com$/,'').gsub(/\W+/,'')
|
50
|
+
BASE_URL_LEN = BASE_URL.length
|
51
|
+
MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
|
52
|
+
SIX_CHARS = RADIX**6
|
53
|
+
File.open("rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv"
|
54
|
+
) do |reqfile|
|
55
|
+
reqfile.each do |url|
|
56
|
+
#decode
|
57
|
+
next unless url.length <= MAX_TAIL_LEN
|
58
|
+
tail = url.chomp.strip[BASE_URL_LEN..-1] || ''
|
59
|
+
# tail.downcase!
|
60
|
+
asnum = ShorturlSequence.decode_str tail, RADIX rescue nil # tail.to_i(36) rescue -1
|
61
|
+
next unless asnum && asnum < SIX_CHARS
|
62
|
+
size = (asnum / 1_000_000)
|
63
|
+
len = tail.length
|
64
|
+
# track stats
|
65
|
+
len_histo << len
|
66
|
+
num_histo << size
|
67
|
+
ltr_histo << "%s-%s" % [len, tail[0..0]] # + (len > 1 ? '.'* (len-1) : '')
|
68
|
+
puts iter if ((iter += 1) % 1_000_000 == 0)
|
69
|
+
|
70
|
+
end
|
71
|
+
end
|
72
|
+
puts "Integer magnitude of decoded (M):"
|
73
|
+
num_histo.dump
|
74
|
+
puts "Length of encoded:"
|
75
|
+
len_histo.dump
|
76
|
+
puts "First Letter:"
|
77
|
+
ltr_histo.dump
|
78
|
+
|
79
|
+
|
80
|
+
# puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
|
81
|
+
# puts [asnum, tail, url].inspect
|
@@ -0,0 +1,112 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
$: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
|
3
|
+
require 'rubygems'
|
4
|
+
require 'wukong'
|
5
|
+
require 'monkeyshines'
|
6
|
+
#
|
7
|
+
require 'shorturl_request'
|
8
|
+
require 'shorturl_sequence'
|
9
|
+
require 'monkeyshines/utils/uri'
|
10
|
+
require 'monkeyshines/utils/filename_pattern'
|
11
|
+
require 'monkeyshines/store/conditional_store'
|
12
|
+
require 'monkeyshines/fetcher/http_head_fetcher'
|
13
|
+
require 'trollop' # gem install trollop
|
14
|
+
|
15
|
+
# ===========================================================================
|
16
|
+
#
|
17
|
+
# scrape_shorturls.rb --
|
18
|
+
#
|
19
|
+
# To scrape from a list of shortened urls:
|
20
|
+
#
|
21
|
+
#   ./scrape_shorturls.rb --base-url=tinyurl.com --from=request_urls.tsv
|
22
|
+
#
|
23
|
+
# To do a random scrape:
|
24
|
+
#
|
25
|
+
#   ./scrape_shorturls.rb --random --base-url=tinyurl.com
|
26
|
+
#     --base-url="http://tinyurl.com" --min-limit= --max-limit= --encoding-radix=
|
27
|
+
#
|
28
|
+
#
|
29
|
+
opts = Trollop::options do
|
30
|
+
opt :base_url, "Host part of URL: eg tinyurl.com", :type => String, :required => true
|
31
|
+
opt :log, "Log file name; leave blank to use STDERR", :type => String
|
32
|
+
# input from file
|
33
|
+
opt :from, "Location of URLs to scrape", :type => String
|
34
|
+
opt :skip, "Initial lines to skip", :type => Integer
|
35
|
+
# OR do a random walk
|
36
|
+
opt :random, "Generate and visit random URL suffixes"
|
37
|
+
opt :min_limit, "Smallest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
38
|
+
opt :max_limit, "Largest sequential URL to randomly visit", :type => Integer # default in shorturl_sequence.rb
|
39
|
+
opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive", :type => Integer, :default => 36
|
40
|
+
# output storage
|
41
|
+
opt :cache_loc, "URI for cache server", :type => String
|
42
|
+
opt :chunk_time, "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
|
43
|
+
opt :dest_dir, "Filename base for output, def /data/ripd", :type => String, :default => '/data/ripd'
|
44
|
+
opt :dest_pattern, "Pattern for dump file output", :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
|
45
|
+
end
|
46
|
+
handle = opts[:base_url].gsub(/\.com$/,'').gsub(/\W+/,'')
|
47
|
+
|
48
|
+
# ******************** Log ********************
|
49
|
+
opts[:log] = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (opts[:log]=='')
|
50
|
+
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 10000, :time => 30)
|
51
|
+
|
52
|
+
#
|
53
|
+
# ******************** Load from store or random walk ********************
|
54
|
+
#
|
55
|
+
if opts[:from]
|
56
|
+
src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
|
57
|
+
src_store.skip!(opts[:skip].to_i) if opts[:skip]
|
58
|
+
elsif opts[:random]
|
59
|
+
src_store = Monkeyshines::Store::RandomUrlStream.new_from_command_line(opts)
|
60
|
+
else
|
61
|
+
Trollop::die "Need to either say --random or --from=filename"
|
62
|
+
end
|
63
|
+
|
64
|
+
#
|
65
|
+
# ******************** Store output ********************
|
66
|
+
#
|
67
|
+
# Track visited URLs with key-value database
|
68
|
+
#
|
69
|
+
RDB_PORTS = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
|
70
|
+
cache_loc = opts[:cache_loc] || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
|
71
|
+
dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(cache_loc)
|
72
|
+
# dest_cache = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)
|
73
|
+
|
74
|
+
#
|
75
|
+
# Store the data into flat files
|
76
|
+
#
|
77
|
+
dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern],
|
78
|
+
:handle => 'shorturl-'+handle, :dest_dir => opts[:dest_dir])
|
79
|
+
dest_files = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern,
|
80
|
+
opts[:chunk_time].to_i, opts)
|
81
|
+
|
82
|
+
#
|
83
|
+
# Conditional store uses the key-value DB to boss around the flat files --
|
84
|
+
# requests are only made (and thus data is only output) if the url is missing
|
85
|
+
# from the key-value store.
|
86
|
+
#
|
87
|
+
dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)
|
88
|
+
|
89
|
+
#
|
90
|
+
# ******************** Fetcher ********************
|
91
|
+
#
|
92
|
+
fetcher = Monkeyshines::Fetcher::HttpHeadFetcher.new
|
93
|
+
|
94
|
+
#
|
95
|
+
# ******************** Do this thing ********************
|
96
|
+
#
|
97
|
+
Log.info "Beginning scrape itself"
|
98
|
+
src_store.each do |bareurl, *args|
|
99
|
+
# prepare the request
|
100
|
+
next if bareurl =~ %r{\Ahttp://(poprl.com|short.to|timesurl.at|bkite.com)}
|
101
|
+
req = ShorturlRequest.new(bareurl, *args)
|
102
|
+
|
103
|
+
# conditional store only calls fetcher if url key is missing.
|
104
|
+
result = dest_store.set( req.url ) do
|
105
|
+
response = fetcher.get(req) # do the url fetch
|
106
|
+
next unless response.response_code || response.contents # don't store bad fetches
|
107
|
+
[response.scraped_at, response] # timestamp into cache, result into flat file
|
108
|
+
end
|
109
|
+
periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
|
110
|
+
end
|
111
|
+
dest_store.close
|
112
|
+
fetcher.close
|
@@ -0,0 +1,29 @@
|
|
1
|
+
#
# One request against a URL-shortening service: the short url that was asked
# for, when it was scraped, the response status, and -- in +contents+ -- the
# url it redirects to.
#
class ShorturlRequest < Struct.new(
    :url,
    :scraped_at,
    :response_code, :response_message,
    :contents
    )
  # Domain-flavored names for the generic struct members.
  alias_method :short_url,     :url
  alias_method :short_url=,    :url=
  alias_method :expanded_url=, :contents=
  alias_method :expanded_url,  :contents

  #
  # All we care about is the redirect destination.
  #
  # +response+ is anything indexable by header name (response["location"]);
  # the value of its "location" entry becomes +contents+.
  #
  def response=(response)
    self.contents = response["location"]
  end

  #
  # The major shortening services
  #
  # Do any of the mainstream shorteners use in-band characters besides \w
  # alphanum and - dash?  (idek.net uses a ~ and pastoid.com a + but they
  # are not popular enough to justify the annoyance of allowing extra
  # chars).
  #
  SHORTURL_HOSTS = %w[
    1link.in 4url.cc 6url.com adjix.com ad.vu bellypath.com bit.ly bkite.com
    budurl.com canurl.com chod.sk cli.gs decenturl.com dn.vc doiop.com
    dwarfurl.com easyuri.com easyurl.net ff.im go2cut.com gonext.org hulu.com
    hypem.com ifood.tv ilix.in is.gd ix.it jdem.cz jijr.com kissa.be kurl.us
    litturl.com lnkurl.com memurl.com metamark.net miklos.dk minilien.com
    minurl.org muhlink.com myurl.in myurl.us notlong.com ow.ly plexp.com
    poprl.com qurlyq.com redirx.com s3nt.com shorterlink.com shortlinks.co.uk
    short.to shorturl.com shrinklink.co.uk shrinkurl.us shrt.st shurl.net
    simurl.com shorl.com smarturl.eu snipr.com snipurl.com snurl.com sn.vc
    starturl.com surl.co.uk tighturl.com timesurl.at tiny123.com tiny.cc
    tinylink.com tinyurl.com tobtr.com traceurl.com tr.im tweetburner.com
    twitpwr.com twitthis.com twurl.nl u.mavrev.com ur1.ca url9.com urlborg.com
    urlbrief.com urlcover.com urlcut.com urlhawk.com url-press.com urlsmash.com
    urltea.com urlvi.be vimeo.com wlink.us xaddr.com xil.in xrl.us x.se xs.md
    yatuc.com yep.it yweb.com zi.ma w3t.org
  ].freeze

  # Matches "http://<known host>/<at least one character>".  The host names
  # are escaped via Regexp.union so that each "." only matches a literal dot
  # (unescaped, "bit.ly" would also accept e.g. "bitxly").
  SHORTURL_RE = %r{\Ahttp://(?:#{Regexp.union(*SHORTURL_HOSTS).source})/.}

  #
  # Does +url+ point at one of the known shortening services?
  #
  # Returns the match position (0, which is truthy) on a match, nil otherwise.
  #
  def self.is_shorturl?(url)
    url.to_s =~ SHORTURL_RE
  end
end
|
@@ -0,0 +1,121 @@
|
|
1
|
+
#
# Base-62 integer <-> string conversion using the digit order
# 0-9, a-z, A-Z (so 'a' is 10 and 'Z' is 61).
#
module Base62
  # http://refactormycode.com/codes/125-base-62-encoding
  BASE62_CHARS = (('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a).freeze
  # Reverse lookup: digit character => its numeric value.
  BASE62_MAP   = {}
  BASE62_CHARS.each_with_index{|ch, num| BASE62_MAP[ch] = num }
  BASE62_MAP.freeze

  #
  # Encode the non-negative integer +i+ as a base-62 string.
  #
  # Raises ArgumentError for a negative +i+ (which has no representation in
  # this digit set) rather than quietly returning an empty string.
  #
  def self.i_to_s(i)
    raise ArgumentError, "Can't base-62 encode negative number #{i}" if i < 0
    return '0' if i == 0
    digits = []
    while i > 0
      i, remainder = i.divmod(62)
      digits.unshift BASE62_CHARS[remainder]   # least-significant digit comes out first
    end
    digits.join
  end

  #
  # Decode the base-62 string +str+ into an integer.  The empty string
  # decodes to 0.
  #
  # Raises ArgumentError if +str+ holds a character outside 0-9a-zA-Z.
  #
  def self.s_to_i(str)
    str.each_char.inject(0) do |total, ch|
      digit = BASE62_MAP[ch]
      raise ArgumentError, "Invalid base-62 digit #{ch.inspect} in #{str.inspect}" if digit.nil?
      total * 62 + digit
    end
  end
end
|
24
|
+
|
25
|
+
#
# Converts between a short url's sequence number and its token, for the two
# supported alphabets: radix 36 (Ruby's built-in 0-9a-z) and radix 62
# (delegated to Base62).
#
module ShorturlSequence
  #
  # Encode integer +i+ as a token in the given +radix+ (36 or 62; an Integer
  # or its String form are both accepted).
  #
  # Raises RuntimeError for any other radix.
  #
  def self.encode_integer(i, radix)
    case radix.to_s
    when '36' then i.to_s(36)
    when '62' then Base62.i_to_s(i)
    else
      raise "Can't encode into base #{radix}"
    end
  end

  #
  # Decode token +s+ (any trailing non-word characters, such as a newline,
  # are dropped first) from the given +radix+ (36 or 62) into an integer.
  #
  # Raises RuntimeError for any other radix.
  #
  def self.decode_str(s, radix)
    s = s.gsub(%r{\W+\z},'')
    case radix.to_s
    when '36' then s.to_i(36)
    when '62' then Base62.s_to_i(s)   # decode the cleaned-up token
    else
      raise "Can't decode from base #{radix}"
    end
  end
end
|
45
|
+
|
46
|
+
#
# A short url, identified by its token (the part after the service's base url).
#
class Shorturl
  attr_accessor :base_url
  attr_accessor :token

  def initialize(token)
    self.token = token
  end
end

#
# A short url whose token is a base-62 number.
#
# Inherits from Shorturl: the methods below rely on +token+ and +base_url+,
# which are only defined there.
#
class Shorturl62 < Shorturl
  # The token decoded as a base-62 integer.
  def to_i
    Base62.s_to_i token
  end

  # Same as #url.
  def to_s
    url
  end

  # The full short url: base url, a slash, then the token.
  def url
    "#{base_url}/#{token}"
  end
end

#
# Short urls on is.gd: base-62 tokens under a fixed base url.
#
class IsgdShorturl < Shorturl62
  def base_url
    'http://is.gd'
  end
end
|
73
|
+
|
74
|
+
#
# A stream of short urls for one service, generated by walking the sequence
# numbers from +min_limit+ to +max_limit+ in order and encoding each as a token.
#
class Monkeyshines::Store::SequentialUrlStream
  # Upper limit used when none is given.
  DEFAULT_MAX_URLSTR = '1zzzzz'.to_i(36)
  # Token radix to assume for services we know about, keyed by fixed-up base url.
  DEFAULT_RADIX = {
    'http://tinyurl.com/' => 36,
    'http://bit.ly/'      => 62,
    'http://is.gd/'       => 62,
  }
  attr_accessor :base_url, :min_limit, :span, :encoding_radix

  #
  # +base_url+       service host or url; normalized by fix_url
  # +min_limit+      lowest sequence number to visit (default 0)
  # +max_limit+      highest sequence number (default DEFAULT_MAX_URLSTR);
  #                  stored as +span+, its distance from +min_limit+
  # +encoding_radix+ 36 or 62; looked up in DEFAULT_RADIX when omitted
  #
  # Raises RuntimeError unless the radix ends up as 36 or 62.
  #
  def initialize(base_url, min_limit=0, max_limit=nil, encoding_radix=nil)
    self.base_url       = self.class.fix_url(base_url)
    self.min_limit      = min_limit.to_i
    max_limit         ||= DEFAULT_MAX_URLSTR
    self.span           = max_limit.to_i - self.min_limit
    self.encoding_radix = (encoding_radix || DEFAULT_RADIX[self.base_url]).to_i
    raise "Please specify either encoding_radix of 36 or 62" unless [36, 62].include?(self.encoding_radix)
  end

  # The upper limit, recovered from the stored +min_limit+ and +span+.
  def max_limit
    min_limit + span
  end

  # Ensure +url+ starts with 'http://' and ends with a '/'.
  def self.fix_url(url)
    url = 'http://' + url unless (url[0..6]=='http://')
    url = url + '/'       unless (url[-1..-1]=='/')
    url
  end

  #
  # Yields the url for every sequence number from +min_limit+ through
  # +max_limit+, in order.  (A finite stream.)
  #
  def each(*args, &block)
    (min_limit..max_limit).each do |idx|
      yield base_url + ShorturlSequence.encode_integer(idx, encoding_radix)
    end
  end

  #
  # Build a stream from a command-line options hash, using its :base_url,
  # :min_limit, :max_limit and :encoding_radix entries.  +default_opts+
  # supplies values for anything the command line leaves out.
  #
  def self.new_from_command_line(cmdline_opts, default_opts={})
    options = default_opts.merge(cmdline_opts)
    Trollop::die :base_url if options[:base_url].blank?
    self.new(*options.values_of(:base_url, :min_limit, :max_limit, :encoding_radix))
  end
end
|
108
|
+
|
109
|
+
#
# Like SequentialUrlStream, but visits randomly chosen sequence numbers
# instead of walking them in order.
#
class Monkeyshines::Store::RandomUrlStream < Monkeyshines::Store::SequentialUrlStream
  # Never terminates on its own: yields one freshly drawn url per iteration.
  def each *args, &block
    loop { yield url_in_range }
  end

  #
  # One url whose sequence number is +min_limit+ plus a random offset drawn
  # with rand(span), encoded in +encoding_radix+ and appended to +base_url+.
  #
  def url_in_range
    offset = rand(span)
    token  = ShorturlSequence.encode_integer(offset + min_limit, encoding_radix)
    base_url + token
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
#!/usr/bin/env bash

# Directory holding this script; the database files live in distdb/ beneath it.
script_dir=$(dirname "$0")
# Date stamp used in the log file names.
today=$(date "+%Y%m%d")

# NOTE on redirection order: redirections are processed left to right, so
# stdout has to be pointed at the log file *before* stderr is duplicated onto
# it (">> file 2>&1").  Written the other way round ("2>&1 >> file"), stderr
# keeps going to the terminal and never reaches the log.

# nohup ttserver -port 10001 "$script_dir/distdb/shorturl_scrapes-tinyurl.tct#bnum=40000000#opts=l" >> "log/ttserver-shorturl_scrapes-tinyurl+$today.log" 2>&1 &
# nohup ttserver -port 10002 "$script_dir/distdb/shorturl_scrapes-bitly.tct#bnum=20000000#opts=l"   >> "log/ttserver-shorturl_scrapes-bitly+$today.log"   2>&1 &
# nohup ttserver -port 10003 "$script_dir/distdb/shorturl_scrapes-other.tct#bnum=20000000#opts=l"   >> "log/ttserver-shorturl_scrapes-other+$today.log"   2>&1 &

#
# Start shorturl readthru cache TokyoTyrant servers
#
nohup ttserver -port 10042 "$script_dir/distdb/shorturl_reqs-tinyurl.tch#bnum=40000000#opts=l" >> "log/ttserver-shorturl_reqs-tinyurl+$today.log" 2>&1 &
nohup ttserver -port 10043 "$script_dir/distdb/shorturl_reqs-bitly.tch#bnum=20000000#opts=l"   >> "log/ttserver-shorturl_reqs-bitly+$today.log"   2>&1 &
nohup ttserver -port 10044 "$script_dir/distdb/shorturl_reqs-other.tch#bnum=20000000#opts=l"   >> "log/ttserver-shorturl_reqs-other+$today.log"   2>&1 &

# nohup ttserver -port 10069 "$script_dir/distdb/shorturl_reqs-foo.tch#bnum=40000000#opts=l" >> "log/ttserver-shorturl_reqs-tinyurl+$today.log" 2>&1 &
|
data/lib/monkeyshines.rb
ADDED
@@ -0,0 +1,31 @@
|
|
1
|
+
require 'monkeyshines/extensions'
|
2
|
+
require 'monkeyshines/utils/logger'
|
3
|
+
require 'wukong'
|
4
|
+
require 'wukong/extensions/pathname'
|
5
|
+
require 'monkeyshines/utils/factory_module'
|
6
|
+
require 'monkeyshines/utils/uri'
|
7
|
+
require 'monkeyshines/utils/filename_pattern'
|
8
|
+
require 'monkeyshines/options'
|
9
|
+
require 'monkeyshines/scrape_request'
|
10
|
+
|
11
|
+
module Monkeyshines
  # Constants that are loaded on first reference, paired with the file that
  # defines each one.
  [
    [:ScrapeRequest,     'monkeyshines/scrape_request'],
    [:ScrapeRequestCore, 'monkeyshines/scrape_request'],
    [:RequestStream,     'monkeyshines/request_stream'],
    [:Store,             'monkeyshines/store'],
    [:Fetcher,           'monkeyshines/fetcher'],
    [:Monitor,           'monkeyshines/monitor'],
    [:Runner,            'monkeyshines/runner'],
    [:RawJsonContents,   'monkeyshines/scrape_request/raw_json_contents'],
  ].each do |const_name, path|
    autoload const_name, path
  end

  # Dumping ground for configuration values.  Left alone if a CONFIG constant
  # is already visible from here.
  CONFIG = {} unless defined?(CONFIG)

end
|
25
|
+
|
26
|
+
#
|
27
|
+
# A convenient logger.
|
28
|
+
#
|
29
|
+
# Define NO_MONKEYSHINES_LOG (or define Log yourself) to prevent its creation
|
30
|
+
#
|
31
|
+
Log = Monkeyshines.logger unless (defined?(Log) || defined?(NO_MONKEYSHINES_LOG))
|
@@ -0,0 +1,16 @@
|
|
1
|
+
class Numeric
  #
  # Constrain the receiver to the interval [min, max].
  #
  # Either bound may be nil, meaning "unbounded on that side".  Returns +min+
  # when the receiver is less than or equal to it, +max+ when greater than or
  # equal to it, and the receiver itself otherwise.
  #
  # Because this definition applies to every Numeric, it also accepts the
  # single-Range calling form of the core clamp method -- n.clamp(1..5) --
  # so code written against that form keeps working.  As in core Ruby, a
  # range that excludes its end is rejected with ArgumentError.
  #
  def clamp(min, max = nil)
    if min.is_a?(Range)
      raise ArgumentError, 'wrong number of arguments (given 2, expected 1)' unless max.nil?
      range = min
      raise ArgumentError, 'cannot clamp with an exclusive range' if range.exclude_end? && !range.end.nil?
      min, max = range.begin, range.end
    end
    return min if min && (self <= min)
    return max if max && (self >= max)
    self
  end
end
|
8
|
+
|
9
|
+
|
10
|
+
class Hash
  #
  # Combine any number of hashes into one by deep-merging them left to right,
  # starting from an empty hash.  With no arguments the result is {}.
  #
  # Relies on Hash#deep_merge, which is not defined here -- presumably it is
  # supplied by an extension library loaded elsewhere.
  #
  def self.deep_sum *args
    merged = {}
    args.each { |options| merged = merged.deep_merge(options) }
    merged
  end
end
|
@@ -0,0 +1,10 @@
|
|
1
|
+
module Monkeyshines
  #
  # Namespace for the fetcher classes.  Each one is loaded on first reference
  # from the file listed beside it.
  #
  module Fetcher
    extend FactoryModule

    [
      [:Base,            'monkeyshines/fetcher/base'],
      [:FakeFetcher,     'monkeyshines/fetcher/fake_fetcher'],
      [:HttpFetcher,     'monkeyshines/fetcher/http_fetcher'],
      [:HttpHeadFetcher, 'monkeyshines/fetcher/http_head_fetcher'],
    ].each do |const_name, path|
      autoload const_name, path
    end

  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
require 'net/http'
|
2
|
+
Net::HTTP.version_1_2
|
3
|
+
module Monkeyshines
  module Fetcher

    #
    # Skeleton for a fetcher that authenticates its requests.  Every step
    # below is still an empty placeholder that returns nil.
    #
    class AuthedHttpFetcher
      cattr_accessor :auth_params

      # Placeholder: not implemented yet.
      def get_request_token
      end

      # Placeholder: not implemented yet.
      def authorize
      end

      # Placeholder: not implemented yet.
      def get_access_token
      end

      # Placeholder: not implemented yet.
      def api_key
      end
      # Placeholder: not implemented yet.
      def api_secret
      end
      # Placeholder: not implemented yet.
      def session_key
      end

      # Placeholder for obtaining a session key.  #authenticate calls this
      # method, which previously did not exist at all, so every call to
      # #authenticate raised NoMethodError.
      # TODO: implement -- the intended behaviour is not visible here.
      def get_session_key
      end

      # authenticate request
      #
      # Asks for a session key when there is none yet.  +req+ is currently
      # unused.
      def authenticate(req)
        get_session_key unless session_key
      end


    end

  end
end
|