monkeyshines 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
@@ -0,0 +1,81 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
3
+ #require 'rubygems'
4
+ # require 'wukong'
5
+ require 'monkeyshines'
6
+ # require 'monkeyshines/utils/uri'
7
+ # require 'monkeyshines/utils/filename_pattern'
8
+ # require 'monkeyshines/store/conditional_store'
9
+ # require 'monkeyshines/fetcher/http_head_fetcher'
10
+ # require 'trollop' # gem install trollop
11
+ # require 'shorturl_request'
12
+ require 'shorturl_sequence'
13
+
14
+ digits = { } ; (('0'..'9').to_a+('a'..'z').to_a).each do |ch| digits[ch] = 0 end
15
+
16
+ # (1..10000).each do |idx|
17
+ # s = ShorturlSequence.encode_integer idx, 36
18
+ # digits[s[0..0]] += 1
19
+ # end
20
+ # p digits
21
+ # puts digits.sort.map{|ch,ct| "%-7s\t%10d"%[ch,ct]}
22
+
23
#
# Minimal frequency counter: tallies how many times each value was recorded.
#
class Histo
  # Hash of recorded value => number of times it has been recorded.
  attr_accessor :buckets

  def initialize
    self.buckets = { }
  end

  # Record one occurrence of +val+.
  #
  # Returns the histogram itself so that recordings can be chained
  # (histo << a << b), as is conventional for <<.
  def <<(val)
    buckets[val] = buckets.fetch(val, 0) + 1
    self
  end

  # Write one "count<TAB>value" line per bucket, ordered by value.
  #
  # io:: anything responding to #puts; defaults to $stdout, which is where
  #      a bare +puts+ writes.
  def dump(io = $stdout)
    buckets.sort.each do |val, count|
      io.puts "%10d\t%s" % [count, val]
    end
  end
end
38
+
39
# One histogram per statistic gathered from the request file.
len_histo, num_histo, ltr_histo = Histo.new, Histo.new, Histo.new
iter = 0

#           123456789-123456789-
# http://bit.ly/
# http://tinyurl.com/
BASE_URL     = "http://is.gd/"
RADIX        = 62
# Shortener name with the scheme, a trailing ".com" and punctuation removed.
HANDLE       = BASE_URL.gsub(%r{^http://},'').gsub(/\.com$/,'').gsub(/\W+/,'')
BASE_URL_LEN = BASE_URL.length
# Longest raw line (as read, line terminator included) that is still tallied.
MAX_TAIL_LEN = BASE_URL_LEN + 2 + 6
# Decoded values at or above RADIX**6 are ignored.
SIX_CHARS    = RADIX**6

request_path = "rawd/req/shorturl_requests-20090710-#{HANDLE}.tsv"
File.open(request_path) do |reqfile|
  reqfile.each do |url|
    # decode
    next if url.length > MAX_TAIL_LEN
    tail = url.chomp.strip[BASE_URL_LEN..-1] || ''
    # Any StandardError raised while decoding just skips the line.
    asnum =
      begin
        ShorturlSequence.decode_str(tail, RADIX)
      rescue StandardError
        nil
      end
    next if !asnum || asnum >= SIX_CHARS
    size = asnum / 1_000_000
    len  = tail.length
    # track stats
    len_histo << len
    num_histo << size
    ltr_histo << ("%s-%s" % [len, tail[0..0]])
    # progress marker every million tallied lines
    iter += 1
    puts iter if (iter % 1_000_000) == 0
  end
end

puts "Integer magnitude of decoded (M):"
num_histo.dump
puts "Length of encoded:"
len_histo.dump
puts "First Letter:"
ltr_histo.dump
78
+
79
+
80
+ # puts tail.length # [tail.length, tail, tail[-1].to_i].join("\t")
81
+ # puts [asnum, tail, url].inspect
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env ruby
2
+ $: << File.dirname(__FILE__)+'/../../lib'; $: << File.dirname(__FILE__)
3
+ require 'rubygems'
4
+ require 'wukong'
5
+ require 'monkeyshines'
6
+ #
7
+ require 'shorturl_request'
8
+ require 'shorturl_sequence'
9
+ require 'monkeyshines/utils/uri'
10
+ require 'monkeyshines/utils/filename_pattern'
11
+ require 'monkeyshines/store/conditional_store'
12
+ require 'monkeyshines/fetcher/http_head_fetcher'
13
+ require 'trollop' # gem install trollop
14
+
15
# ===========================================================================
#
# scrape_shorturls.rb --
#
# To scrape from a list of shortened urls:
#
#    ./scrape_shorturls.rb --base-url=tinyurl.com --from=request_urls.tsv
#
# To do a random scrape:
#
#    ./scrape_shorturls.rb --base-url=tinyurl.com --random
#       --min-limit= --max-limit= --encoding-radix=
#
#
opts = Trollop::options do
  opt :base_url,       "Host part of URL: eg tinyurl.com",             :type => String, :required => true
  opt :log,            "Log file name; leave blank to use STDERR",     :type => String
  # input from file
  opt :from,           "Location of URLs to scrape",                   :type => String
  opt :skip,           "Initial lines to skip",                        :type => Integer
  # OR do a random walk
  opt :random,         "Generate and visit random URL suffixes"
  opt :min_limit,      "Smallest sequential URL to randomly visit",    :type => Integer # default in shorturl_sequence.rb
  opt :max_limit,      "Largest sequential URL to randomly visit",     :type => Integer # default in shorturl_sequence.rb
  opt :encoding_radix, "36 for most, 62 if URLs are case-sensitive",   :type => Integer, :default => 36
  # output storage
  opt :cache_loc,      "URI for cache server",                         :type => String
  opt :chunk_time,     "Frequency to rotate chunk files (in seconds)", :type => Integer, :default => 60*60*4
  opt :dest_dir,       "Filename base for output, def /data/ripd",     :type => String, :default => '/data/ripd'
  opt :dest_pattern,   "Pattern for dump file output",                 :default => ":dest_dir/:handle_prefix/:handle/:date/:handle+:timestamp-:pid.tsv"
end
# Short name for the service: trailing ".com" and all punctuation removed.
handle = opts[:base_url].gsub(/\.com$/,'').gsub(/\W+/,'')

# ******************** Log ********************
# NOTE(review): WORK_DIR is not defined in this script; presumably it comes
# from one of the required libraries -- confirm before relying on --log=''.
opts[:log] = (WORK_DIR+"/log/shorturls_#{handle}-#{Time.now.to_flat}.log") if (opts[:log]=='')
periodic_log = Monkeyshines::Monitor::PeriodicLogger.new(:iters => 10000, :time => 30)

#
# ******************** Load from store or random walk ********************
#
if opts[:from]
  src_store = Monkeyshines::Store::FlatFileStore.new_from_command_line(opts, :filemode => 'r')
  src_store.skip!(opts[:skip].to_i) if opts[:skip]
elsif opts[:random]
  src_store = Monkeyshines::Store::RandomUrlStream.new_from_command_line(opts)
else
  Trollop::die "Need to either say --random or --from=filename"
end

#
# ******************** Store output ********************
#
# Track visited URLs with key-value database
#
RDB_PORTS  = { 'tinyurl' => "localhost:10042", 'bitly' => "localhost:10043", 'other' => "localhost:10044" }
cache_loc  = opts[:cache_loc] || RDB_PORTS[handle] or raise "Need a handle (bitly, tinyurl or other)."
dest_cache = Monkeyshines::Store::TyrantRdbKeyStore.new(cache_loc)
# dest_cache = Monkeyshines::Store::MultiplexShorturlCache.new(RDB_PORTS)

#
# Store the data into flat files
#
dest_pattern = Monkeyshines::Utils::FilenamePattern.new(opts[:dest_pattern],
  :handle => 'shorturl-'+handle, :dest_dir => opts[:dest_dir])
dest_files   = Monkeyshines::Store::ChunkedFlatFileStore.new(dest_pattern,
  opts[:chunk_time].to_i, opts)

#
# Conditional store uses the key-value DB to boss around the flat files --
# requests are only made (and thus data is only output) if the url is missing
# from the key-value store.
#
dest_store = Monkeyshines::Store::ConditionalStore.new(dest_cache, dest_files)

#
# ******************** Fetcher ********************
#
fetcher = Monkeyshines::Fetcher::HttpHeadFetcher.new

#
# ******************** Do this thing ********************
#
Log.info "Beginning scrape itself"
src_store.each do |bareurl, *args|
  # prepare the request; urls on these hosts are never fetched.
  # (Dots are escaped so that they only match a literal '.')
  next if bareurl =~ %r{\Ahttp://(poprl\.com|short\.to|timesurl\.at|bkite\.com)}
  req = ShorturlRequest.new(bareurl, *args)

  # conditional store only calls fetcher if url key is missing.
  result = dest_store.set( req.url ) do
    response = fetcher.get(req)                                     # do the url fetch
    next unless response.response_code || response.contents        # don't store bad fetches
    [response.scraped_at, response]                                 # timestamp into cache, result into flat file
  end
  periodic_log.periodically{ ["%7d"%dest_store.misses, 'misses', dest_store.size, req.response_code, result, req.url] }
end
dest_store.close
fetcher.close
@@ -0,0 +1,29 @@
1
+ class ShorturlRequest < Struct.new(
2
+ :url,
3
+ :scraped_at,
4
+ :response_code, :response_message,
5
+ :contents
6
+ )
7
+ alias_method :short_url=, :url=
8
+ alias_method :expanded_url=, :contents=
9
+ alias_method :expanded_url, :contents
10
+ #
11
+ # All we care about is the redirect destination.
12
+ #
13
+ def response= response
14
+ self.contents = response["location"]
15
+ end
16
+
17
+ #
18
+ # The major shortening services
19
+ #
20
+ # Do any of the mainstream shorteners use in-band characters besides \w
21
+ # alphanum and - dash? (idek.net uses a ~ and pastoid.com a + but they
22
+ # are not popular enough to justify the annoyance of allowing extra
23
+ # chars).
24
+ #
25
+ SHORTURL_RE = %r{\Ahttp://(?:1link.in|4url.cc|6url.com|adjix.com|ad.vu|bellypath.com|bit.ly|bkite.com|budurl.com|canurl.com|chod.sk|cli.gs|decenturl.com|dn.vc|doiop.com|dwarfurl.com|easyuri.com|easyurl.net|ff.im|go2cut.com|gonext.org|hulu.com|hypem.com|ifood.tv|ilix.in|is.gd|ix.it|jdem.cz|jijr.com|kissa.be|kurl.us|litturl.com|lnkurl.com|memurl.com|metamark.net|miklos.dk|minilien.com|minurl.org|muhlink.com|myurl.in|myurl.us|notlong.com|ow.ly|plexp.com|poprl.com|qurlyq.com|redirx.com|s3nt.com|shorterlink.com|shortlinks.co.uk|short.to|shorturl.com|shrinklink.co.uk|shrinkurl.us|shrt.st|shurl.net|simurl.com|shorl.com|smarturl.eu|snipr.com|snipurl.com|snurl.com|sn.vc|starturl.com|surl.co.uk|tighturl.com|timesurl.at|tiny123.com|tiny.cc|tinylink.com|tinyurl.com|tobtr.com|traceurl.com|tr.im|tweetburner.com|twitpwr.com|twitthis.com|twurl.nl|u.mavrev.com|ur1.ca|url9.com|urlborg.com|urlbrief.com|urlcover.com|urlcut.com|urlhawk.com|url-press.com|urlsmash.com|urltea.com|urlvi.be|vimeo.com|wlink.us|xaddr.com|xil.in|xrl.us|x.se|xs.md|yatuc.com|yep.it|yweb.com|zi.ma|w3t.org)/.}
26
+ def self.is_shorturl? url
27
+ url.to_s =~ SHORTURL_RE
28
+ end
29
+ end
@@ -0,0 +1,121 @@
1
#
# Base-62 integer <=> string conversion using the digits 0-9, a-z, A-Z
# (in that order, so 'a' is 10 and 'A' is 36).
#
module Base62
  # http://refactormycode.com/codes/125-base-62-encoding
  BASE62_CHARS = (('0'..'9').to_a + ('a'..'z').to_a + ('A'..'Z').to_a).freeze
  # digit character => numeric value
  BASE62_MAP   = {}
  BASE62_CHARS.each_with_index{|ch, num| BASE62_MAP[ch] = num }
  BASE62_MAP.freeze

  # Encode the non-negative integer +i+ as a base-62 string.
  #
  # Raises ArgumentError for negative input (which previously encoded,
  # silently, to the empty string).
  def self.i_to_s(i)
    raise ArgumentError, "Can't base-62 encode negative number #{i}" if i < 0
    return '0' if i == 0
    digits = []
    while i > 0
      digits.unshift BASE62_CHARS[i.modulo(62)]   # least-significant digit first
      i /= 62
    end
    digits.join
  end

  # Decode the base-62 string +str+ into an integer; the empty string
  # decodes to 0.
  #
  # Raises ArgumentError if +str+ contains a character outside 0-9a-zA-Z.
  def self.s_to_i(str)
    str.to_s.each_char.inject(0) do |i_out, ch|
      digit = BASE62_MAP[ch]
      raise ArgumentError, "Invalid base-62 character #{ch.inspect} in #{str.inspect}" if digit.nil?
      i_out * 62 + digit
    end
  end
end
24
+
25
#
# Converts between integers and the path tokens used by url shorteners,
# in either base 36 or base 62.
#
module ShorturlSequence
  # Encode integer +i+ in the given +radix+ (36 or 62, as Integer or String).
  #
  # Raises RuntimeError for any other radix.
  def self.encode_integer(i, radix)
    case radix.to_s
    when '36' then i.to_s(36)
    when '62' then Base62.i_to_s(i)
    else
      raise "Can't encode into base #{radix}"
    end
  end

  # Decode the token +s+ from the given +radix+ (36 or 62) into an integer.
  # Trailing non-word characters (e.g. a newline) are ignored.
  #
  # Raises RuntimeError for any other radix.
  def self.decode_str(s, radix)
    s = s.gsub(%r{\W+$},'')
    case radix.to_s
    when '36' then s.to_i(36)
    when '62' then Base62.s_to_i(s)   # was passed an undefined local, so base 62 always raised NameError
    else
      raise "Can't decode from base #{radix}"
    end
  end
end
45
+
46
#
# A shortened url, identified by the token that follows the service's
# base url.
#
class Shorturl
  attr_accessor :base_url
  attr_accessor :token

  def initialize(token)
    self.token = token
  end
end

#
# A Shorturl whose token is a base-62 number.
#
# Inherits from Shorturl: the methods below rely on #token and #base_url,
# which this class does not define itself.
#
class Shorturl62 < Shorturl
  # Integer value of the token, decoded as base 62.
  def to_i
    Base62.s_to_i token
  end

  def to_s
    url
  end

  # Full url: base url and token joined with a '/'.
  def url
    "#{base_url}/#{token}"
  end
end

#
# Base-62 short url on is.gd
#
class IsgdShorturl < Shorturl62
  def base_url
    'http://is.gd'
  end
end
73
+
74
#
# Generates shortener urls for every index from min_limit up to max_limit,
# encoding each index in the shortener's radix.
#
class Monkeyshines::Store::SequentialUrlStream
  # Upper index used when no max_limit is given.
  DEFAULT_MAX_URLSTR = '1zzzzz'.to_i(36)
  # Encoding radix assumed for known services when none is given.
  DEFAULT_RADIX = {
    'http://tinyurl.com/' => 36,
    'http://bit.ly/'      => 62,
    'http://is.gd/'       => 62,
  }
  attr_accessor :base_url, :min_limit, :span, :encoding_radix

  # base_url::       service url; scheme and trailing slash are added if missing
  # min_limit::      smallest index to visit
  # max_limit::      largest index to visit (DEFAULT_MAX_URLSTR if nil)
  # encoding_radix:: 36 or 62; looked up in DEFAULT_RADIX if nil
  #
  # Raises RuntimeError unless the resulting radix is 36 or 62.
  def initialize(base_url, min_limit=0, max_limit=nil, encoding_radix=nil)
    self.base_url       = self.class.fix_url(base_url)
    self.min_limit      = min_limit.to_i
    max_limit         ||= DEFAULT_MAX_URLSTR
    # only the width of the range is stored; see #max_limit
    self.span           = max_limit.to_i - self.min_limit
    self.encoding_radix = (encoding_radix || DEFAULT_RADIX[self.base_url]).to_i
    raise "Please specify either encoding_radix of 36 or 62" unless [36, 62].include?(self.encoding_radix)
  end

  # Upper end of the index range, recovered from min_limit and span.
  # (#each used to reference max_limit, which was never defined.)
  def max_limit
    min_limit + span
  end

  # Ensure +url+ starts with 'http://' and ends with '/'.
  def self.fix_url(url)
    url = 'http://' + url unless (url[0..6]=='http://')
    url = url + '/'       unless (url[-1..-1]=='/')
    url
  end

  # Yields the url for every index from min_limit to max_limit, in order.
  #
  # NOTE(review): yields full url strings, matching what RandomUrlStream
  # yields for the same consumer loop -- confirm that this is the intent.
  def each(*args, &block)
    (min_limit..max_limit).each do |idx|
      yield base_url + ShorturlSequence.encode_integer(idx, encoding_radix)
    end
  end

  # Build a stream from parsed command-line options (plus defaults).
  # Dies via Trollop if :base_url is blank.
  def self.new_from_command_line(cmdline_opts, default_opts={})
    options = default_opts.merge(cmdline_opts)
    Trollop::die :base_url if options[:base_url].blank?
    self.new(*options.values_of(:base_url, :min_limit, :max_limit, :encoding_radix))
  end
end
108
+
109
#
# Like SequentialUrlStream, but visits indexes at random and never finishes.
#
class Monkeyshines::Store::RandomUrlStream < Monkeyshines::Store::SequentialUrlStream
  # Yields one randomly chosen url after another, forever.
  def each(*args, &block)
    loop{ yield url_in_range }
  end

  # A url whose index is min_limit plus a random offset drawn with rand(span).
  def url_in_range
    offset = rand(span)
    token  = ShorturlSequence.encode_integer(offset + min_limit, encoding_radix)
    base_url + token
  end
end
@@ -0,0 +1,16 @@
1
#!/usr/bin/env bash

# Directory containing this script; the database files live beneath it.
script_dir=$(dirname "$0")

# nohup ttserver -port 10001 "$script_dir/distdb/shorturl_scrapes-tinyurl.tct#bnum=40000000#opts=l" 2>&1 >> log/ttserver-shorturl_scrapes-tinyurl+`date "+%Y%m%d"`.log &
# nohup ttserver -port 10002 "$script_dir/distdb/shorturl_scrapes-bitly.tct#bnum=20000000#opts=l"   2>&1 >> log/ttserver-shorturl_scrapes-bitly+`date "+%Y%m%d"`.log &
# nohup ttserver -port 10003 "$script_dir/distdb/shorturl_scrapes-other.tct#bnum=20000000#opts=l"   2>&1 >> log/ttserver-shorturl_scrapes-other+`date "+%Y%m%d"`.log &

#
# Start shorturl readthru cache TokyoTyrant servers
#
# Redirections are processed left to right, so stdout is pointed at the log
# first and stderr is then duplicated onto it; written the other way round
# ("2>&1 >> file") stderr is duplicated onto the ORIGINAL stdout instead.
#
# NOTE(review): log paths are relative to the current directory, not to
# $script_dir -- confirm that this is intended.
#
nohup ttserver -port 10042 "$script_dir/distdb/shorturl_reqs-tinyurl.tch#bnum=40000000#opts=l" >> "log/ttserver-shorturl_reqs-tinyurl+$(date "+%Y%m%d").log" 2>&1 &
nohup ttserver -port 10043 "$script_dir/distdb/shorturl_reqs-bitly.tch#bnum=20000000#opts=l"   >> "log/ttserver-shorturl_reqs-bitly+$(date "+%Y%m%d").log"   2>&1 &
nohup ttserver -port 10044 "$script_dir/distdb/shorturl_reqs-other.tch#bnum=20000000#opts=l"   >> "log/ttserver-shorturl_reqs-other+$(date "+%Y%m%d").log"   2>&1 &

# nohup ttserver -port 10069 "$script_dir/distdb/shorturl_reqs-foo.tch#bnum=40000000#opts=l" 2>&1 >> log/ttserver-shorturl_reqs-tinyurl+`date "+%Y%m%d"`.log &
@@ -0,0 +1,2 @@
1
# Start the shorturl cache TokyoTyrant server in the foreground, appending
# both stdout and stderr to a log file under the script's work directory.
#
# Expansions are quoted so that a script path containing whitespace is not
# split into several words.
#
# NOTE(review): `datename` is not a standard utility; presumably a local
# helper on PATH that prints a date stamp -- confirm it is installed.
script_dir=$(dirname "$0")
ttserver -port 10040 "$script_dir/work/distdb/shorturl_cache.tct" >> "$script_dir/work/log/shorturl_cache-$(datename).log" 2>&1
@@ -0,0 +1,31 @@
1
+ require 'monkeyshines/extensions'
2
+ require 'monkeyshines/utils/logger'
3
+ require 'wukong'
4
+ require 'wukong/extensions/pathname'
5
+ require 'monkeyshines/utils/factory_module'
6
+ require 'monkeyshines/utils/uri'
7
+ require 'monkeyshines/utils/filename_pattern'
8
+ require 'monkeyshines/options'
9
+ require 'monkeyshines/scrape_request'
10
+
11
module Monkeyshines
  # Constant name / defining file pairs, registered for autoload so that
  # each file is only required when its constant is first referenced.
  [
    [:ScrapeRequest,     'monkeyshines/scrape_request'],
    [:ScrapeRequestCore, 'monkeyshines/scrape_request'],
    [:RequestStream,     'monkeyshines/request_stream'],
    [:Store,             'monkeyshines/store'],
    [:Fetcher,           'monkeyshines/fetcher'],
    [:Monitor,           'monkeyshines/monitor'],
    [:Runner,            'monkeyshines/runner'],
    [:RawJsonContents,   'monkeyshines/scrape_request/raw_json_contents'],
  ].each do |const_name, path|
    autoload const_name, path
  end

  # Dumping ground for configuration values
  CONFIG = {} unless defined?(CONFIG)

end
25
+
26
+ #
27
+ # A convenient logger.
28
+ #
29
+ # Define NO_MONKEYSHINES_LOG (or define Log yourself) to prevent its creation
30
+ #
31
+ Log = Monkeyshines.logger unless (defined?(Log) || defined?(NO_MONKEYSHINES_LOG))
@@ -0,0 +1,16 @@
1
class Numeric
  #
  # Restrict the number to the interval [min, max].
  #
  # Either bound may be nil, meaning "unbounded on that side". Returns +min+
  # if the number is at or below it, +max+ if at or above it, else self.
  #
  # Because this definition takes precedence over the standard library's
  # Comparable#clamp for all numbers, the one-argument Range form of that
  # method -- clamp(min..max) -- is accepted as well, so that code written
  # against the stdlib signature keeps working.
  #
  # Raises ArgumentError if a Range is combined with a second argument, or
  # if the Range excludes a non-nil end.
  #
  def clamp(min, max = nil)
    if min.is_a?(Range)
      raise ArgumentError, 'wrong number of arguments (a Range takes no second argument)' unless max.nil?
      range = min
      raise ArgumentError, 'cannot clamp with an exclusive range' if range.exclude_end? && !range.end.nil?
      min, max = range.begin, range.end
    end
    return min if min && (self <= min)
    return max if max && (self >= max)
    self
  end
end
8
+
9
+
10
class Hash
  # Fold every hash in +args+ into one, left to right, by deep-merging each
  # onto the accumulated result. With no arguments the result is an empty hash.
  def self.deep_sum(*args)
    merged = {}
    args.each{|options| merged = merged.deep_merge(options) }
    merged
  end
end
@@ -0,0 +1,10 @@
1
module Monkeyshines
  #
  # Namespace for the fetcher classes; each is registered for autoload so
  # its file is only required when the constant is first referenced.
  #
  module Fetcher
    extend FactoryModule

    [
      [:Base,            'monkeyshines/fetcher/base'],
      [:FakeFetcher,     'monkeyshines/fetcher/fake_fetcher'],
      [:HttpFetcher,     'monkeyshines/fetcher/http_fetcher'],
      [:HttpHeadFetcher, 'monkeyshines/fetcher/http_head_fetcher'],
    ].each do |const_name, path|
      autoload const_name, path
    end

  end
end
@@ -0,0 +1,35 @@
1
+ require 'net/http'
2
+ Net::HTTP.version_1_2
3
module Monkeyshines
  module Fetcher

    #
    # Skeleton of a fetcher that authenticates its requests. Every step is
    # still an empty placeholder that returns nil.
    #
    class AuthedHttpFetcher
      cattr_accessor :auth_params

      # -- token exchange steps (placeholders) --
      def get_request_token ; end
      def authorize         ; end
      def get_access_token  ; end

      # -- credentials (placeholders) --
      def api_key     ; end
      def api_secret  ; end
      def session_key ; end

      # authenticate request
      #
      # NOTE(review): get_session_key is not defined in this class, and
      # session_key above always returns nil, so as written this call
      # raises unless the method is supplied elsewhere -- confirm.
      def authenticate req
        get_session_key unless session_key
      end


    end

  end
end