monkeyshines 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
@@ -0,0 +1,44 @@
1
module Monkeyshines
  module Fetcher
    #
    # Base URL fetcher.
    #
    # Subclasses must provide
    #   get(scrape_request)
    # returning the same scrape_request with its contents, scraped_at and
    # response fields appropriately filled in.
    #
    class Base
      # Hash of configuration options handed to #initialize; read by subclasses.
      attr_accessor :options

      #
      # Options hash configures any subclass behavior
      #
      def initialize _options={}
        self.options = _options
      end

      # Make request, return satisfied scrape_request.
      # No-op stub: subclasses implement the actual fetch.
      def get scrape_request
      end

      # Inscribes request with credentials.
      # No-op stub: subclasses that authenticate (e.g. HttpFetcher) override.
      def authenticate req
      end

      # Based on the response code, sleep (in case servers are overheating) and
      # log response.
      #
      # BUGFIX: the original body called bare +sleep+ with no argument, which
      # in Ruby sleeps *forever* -- any subclass that didn't override #backoff
      # would hang on its first request. The base stub is now a no-op;
      # subclasses implement real response-driven backoff.
      def backoff response
      end

      # A compact timestamp, created each time it's called
      # (+to_flat+ comes from the monkeyshines Time extensions).
      def self.timestamp
        Time.now.utc.to_flat
      end

      # Release any persistent connections to the remote server.
      # No-op stub: subclasses holding connections override.
      def close
      end
    end
  end
end
@@ -0,0 +1,19 @@
1
module Monkeyshines
  module Fetcher
    # Lightweight stand-in for an HTTP response: code, message and body.
    FakeResponse = Struct.new(:code, :message, :body)

    # Fetcher that never touches the network: every request is satisfied
    # with a canned 200 response whose JSON body echoes the requested URL.
    class FakeFetcher < Base

      # Fake a satisfied scrape_request
      def get scrape_request
        body = { :fetched => scrape_request.url }.to_json
        fake = FakeResponse.new('200', 'OK', body)
        scrape_request.scraped_at       = Time.now.utc.to_flat
        scrape_request.response_code    = fake.code
        scrape_request.response_message = fake.message
        scrape_request.response         = fake
        scrape_request
      end

    end
  end
end
@@ -0,0 +1,127 @@
1
+ require 'net/http'
2
+ Net::HTTP.version_1_2
3
module Monkeyshines
  module Fetcher
    #
    # Opens a persistent connection and makes repeated requests.
    #
    # * authentication
    # * backoff and logging on client or server errors
    #
    class HttpFetcher < Base

      #
      # Notes:
      #
      # On HTTP:
      # * "RFC 2616 (HTTP/1.1)":http://tools.ietf.org/html/rfc2616
      # * "Header Fields":http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html
      # * "Notes on Keep-Alive":http://www.hpl.hp.com/personal/ange/archives/archives-95/http-wg-archive/1661.html
      #
      # * "right_http_connection is another HTTP lib":http://github.com/rightscale/right_http_connection/tree/master/lib/right_http_connection.rb

      # amount to throttle non-persistent connections: applied in #backoff
      # whenever the server did not keep the connection alive.
      CNXN_SLEEP_TIME = 0.5
      # Default user agent presented to servers
      USER_AGENT = "Net::HTTP #{RUBY_VERSION}"
      attr_accessor :connection_opened_at, :username, :password, :http_req_options

      # Options:
      #   :username, :password -- basic-auth credentials (both required to authenticate)
      #   :user_agent          -- overrides the USER_AGENT header
      #   :debug_requests      -- dump wire traffic to $stderr
      def initialize _options={}
        super _options
        self.username = options[:username]
        self.password = options[:password]
        self.http_req_options = {}
        self.http_req_options["User-Agent"] = options[:user_agent] || USER_AGENT
        self.http_req_options["Connection"] = "keep-alive"
      end

      #
      # Current session (starting a new one if necessary).
      # If the host or port has changed, closes old conxn and opens new one.
      #
      # BUGFIX: the +port+ argument was previously accepted but never used --
      # the connection was built with Net::HTTP.new(@host) only, so any URL
      # with an explicit non-default port was silently fetched on port 80,
      # and the reuse check ignored the port entirely. The port now
      # participates in both connection reuse and connection opening.
      #
      def http host, port=nil
        port ||= Net::HTTP.default_port
        return @http if (@http && (@http.started?) && (@host == host) && (@port == port))
        close if (@http && (@http.started?))
        @host, @port = host, port
        @connection_opened_at = Time.now
        Log.info "Opening HTTP connection for #{@host} at #{@connection_opened_at}"
        @http = Net::HTTP.new(@host, @port)
        @http.set_debug_output($stderr) if options[:debug_requests]
        @http.start
      end

      # Close the current session, if any
      def close
        if @http && @http.started?
          @http.finish
          Log.info "Closing HTTP connection for #{@host} from #{@connection_opened_at}"
        end
        @http = nil
      end

      # Build and dispatch a GET request for +url_str+.
      # (path_query is private on URI::HTTP, hence the +send+.)
      def perform_request url_str
        url = URI.parse(url_str)
        req = Net::HTTP::Get.new(url.send(:path_query), http_req_options)
        authenticate req
        http(url.host, url.port).request req
      end

      # Inscribe request with basic-auth credentials, when both are present
      def authenticate req
        req.basic_auth(username, password) if username && password
      end

      #
      # Based on the response code, sleep (in case servers are overheating) and
      # log response.
      #
      def backoff response
        # backoff when server isn't persisting connection
        sleep CNXN_SLEEP_TIME if (! @http.started?)
        # Response-based sleep time
        sleep_time = 0
        case response
        when Net::HTTPSuccess then return # 2xx
        when Net::HTTPRedirection then return # 3xx
        when Net::HTTPBadRequest then sleep_time = 5 # 400 (rate limit, probably)
        when Net::HTTPUnauthorized then sleep_time = 0 # 401 (protected user, probably)
        when Net::HTTPForbidden then sleep_time = 4 # 403 update limit
        when Net::HTTPNotFound then sleep_time = 0 # 404 deleted
        when Net::HTTPServiceUnavailable then sleep_time = 9 # 503 Fail Whale
        when Net::HTTPServerError then sleep_time = 2 # 5xx All other server errors
        else sleep_time = 1
        end
        Log.warn "Received #{response.code}, sleeping #{sleep_time} ('#{response.message[0..200].gsub(%r{[\r\n\t]}, " ")}' from #{@host}+#{@connection_opened_at})"
        sleep sleep_time
      end

      # Make request, return satisfied scrape_request (response fields and
      # scraped_at are filled in; errors are logged and the connection reset).
      def get scrape_request
        begin
          response = perform_request(scrape_request.url)
          scrape_request.response_code = response.code
          scrape_request.response_message = response.message[0..200].gsub(/[\n\r\t]+/, ' ')
          scrape_request.response = response
          backoff response
        rescue StandardError, SignalException, Timeout::Error => e
          # NOTE(review): rescuing SignalException swallows Ctrl-C/TERM here;
          # presumably deliberate so a long scrape survives stray signals, but
          # it makes the process hard to interrupt -- confirm intent.
          Log.warn ["Recovering from fetcher error:", e.to_s, scrape_request.inspect[0..2000].gsub(/[\n\r]+/, ' ')].join("\t")
          close # restart the connection
        rescue Exception => e
          Log.warn e
          raise e
        end
        scrape_request.scraped_at = self.class.timestamp
        scrape_request
      end

      # Like #get, but logs the wall-clock seconds the fetch took
      def get_and_report_timing *args
        start = Time.now.to_f
        response = get *args
        Log.info( Time.now.to_f - start )
        response
      end
    end

  end
end
@@ -0,0 +1,23 @@
1
+ require 'monkeyshines/fetcher/http_fetcher'
2
module Monkeyshines
  module Fetcher
    #
    # Requests the HEAD only, for cases where you don't need to know actual page
    # contents (e.g. you're looking for server info or scraping URL shorteners)
    #
    class HttpHeadFetcher < HttpFetcher

      #
      # Build and dispatch request.
      # HEAD only -- no reason to transfer the body.
      # (path_query is private on URI::HTTP, hence the +send+.)
      #
      def perform_request url_str
        target   = URI.parse(url_str)
        head_req = Net::HTTP::Head.new(target.send(:path_query), http_req_options)
        authenticate head_req
        session = http(target.host, target.port)
        session.request head_req
      end

    end
  end
end
@@ -0,0 +1,7 @@
1
module Monkeyshines
  # Namespace for periodic progress monitoring; constants are loaded
  # lazily on first reference.
  module Monitor
    autoload :PeriodicLogger,  'monkeyshines/monitor/periodic_logger'
    autoload :PeriodicMonitor, 'monkeyshines/monitor/periodic_monitor'
  end
end
7
+
@@ -0,0 +1,23 @@
1
require 'monkeyshines/monitor/periodic_monitor'
module Monkeyshines
  module Monitor
    #
    # Mixin for a file-backed store that periodically rolls over to a fresh
    # chunk file named by a FilenamePattern.
    #
    # Expects the including class to provide: +close+, +filename=+,
    # +dump_file+, +chunk_monitor+ (a PeriodicMonitor), and an #initialize
    # that accepts a filename (reached via +super+) -- none of these are
    # defined here; verify against the including store class.
    #
    module ChunkedStore
      # FilenamePattern used to generate each successive chunk's filename.
      attr_accessor :file_pattern

      # Stores the pattern and initializes the underlying store with the
      # first generated filename (file_pattern.make).
      def initialize file_pattern
        self.file_pattern = file_pattern
        super file_pattern.make
      end

      # Close the current chunk, point at a freshly generated filename, and
      # (presumably) reopen via dump_file -- confirm dump_file reopens.
      def close_and_reopen
        close
        self.filename = file_pattern.make
        dump_file
      end

      # Save a record, periodically rotating to a new chunk file first.
      #
      # NOTE(review): this calls +close_rename_and_open+, but the method
      # defined above is +close_and_reopen+ -- looks like a name mismatch
      # (one would raise NoMethodError unless the includer defines
      # close_rename_and_open); confirm against the including class.
      def save *args
        chunk_monitor.periodically{ close_rename_and_open }
        super *args
      end
    end
  end
end
@@ -0,0 +1,33 @@
1
module Monkeyshines
  module Monitor

    #
    # Emits a log line but only every +iter_interval+ calls or +time_interval+
    # lapse.
    #
    # Since the contents of the block aren't called until the criteria are met,
    # you can put relatively expensive operations in the log without killing
    # your iteration time.
    #
    class PeriodicLogger < PeriodicMonitor
      #
      # Call with a block that returns a string or array to log; the block's
      # result is appended to the standard iteration/elapsed/rate fields.
      #
      # Ex: log if it has been at least 5 minutes since last announcement:
      #
      #   periodic_logger = Monkeyshines::Monitor::PeriodicLogger.new(:time => 300)
      #   loop do
      #     # ... stuff ...
      #     periodic_logger.periodically{ [morbenfactor, crunkosity, exuberance] }
      #   end
      #
      def periodically &block
        super do
          fields = [ "%10d"%iter, "%7.1f"%since, "%7.1f"%rate ]
          fields << block.call if block
          Log.info fields.flatten.compact.join("\t")
        end
      end
    end
  end
end
@@ -0,0 +1,65 @@
1
module Monkeyshines
  module Monitor
    #
    # Accepts a lightweight call every iteration.
    #
    # Once either a time or an iteration criterion is met, executes the block
    # and resets the timer until next execution.
    #
    # Note that the +time_interval+ is measured *execution to execution* and not
    # in multiples of iter_interval. Say I set a time_interval of 300s, and
    # happen to iterate at 297s and 310s after start. Then the monitor will
    # execute at 310s, and the next execution will happen on or after 610s.
    #
    # Also note that when *either* criterion is met, *both* criteria are
    # reset. Say I set a time interval of 300s and an +iter_interval+ of 10_000;
    # and that at 250s I reach iteration 10_000. Then the monitor will execute
    # on or after 20_000 iteration or 550s, whichever happens first.
    #
    class PeriodicMonitor
      # Trigger thresholds (either may be nil, disabling that criterion).
      attr_accessor :time_interval, :iter_interval
      # Bookkeeping: epoch-seconds of last firing, call count, creation time.
      attr_accessor :last_time, :iter, :started_at

      # options[:time]  -- seconds between executions
      # options[:iters] -- iterations between executions
      def initialize options={}
        self.time_interval = options[:time]
        self.iter_interval = options[:iters]
        self.iter          = 0
        self.started_at    = Time.now.utc.to_f
        self.last_time     = self.started_at
      end

      # True if the call count has hit a multiple of +iter_interval+
      # (nil when no iteration criterion is set).
      def enough_iterations?
        iter_interval && (iter % iter_interval).zero?
      end

      # True if more than +time_interval+ seconds have elapsed since the last
      # execution (nil when no time criterion is set).
      def enough_time? now
        time_interval && ((now - last_time) > time_interval)
      end

      # Seconds since this monitor was created
      def since
        Time.now.utc.to_f - started_at
      end

      # Iterations per second over the monitor's whole lifetime
      def rate
        iter.to_f / since.to_f
      end

      #
      # if the interval conditions are met, executes block (passing the
      # iteration count and seconds since last firing); otherwise just does
      # bookkeeping and returns.
      #
      def periodically &block
        self.iter += 1
        now = Time.now.utc.to_f
        return unless enough_iterations? || enough_time?(now)
        block.call(iter, (now - last_time))
        self.last_time = now
      end
    end

  end
end
@@ -0,0 +1,59 @@
1
require 'monkeyshines/utils/trollop'
module Monkeyshines

  # Command-line options every monkeyshines scraper accepts; each entry is
  # the argument list handed to Trollop::Parser#opt.
  CMDLINE_OPTIONS = [
    [:handle, "Identifying string for scrape", { :type => String, :required => true } ],
    [:source_filename, "URI for scrape store to load from", { :type => String } ],
    [:dest_filename, "Filename for results", { :type => String } ],
    [:log_dest, "Log file location", { :type => String } ],
  ]

  #
  # Load the YAML file ~/.monkeyshines
  # and toss it into Monkeyshines::CONFIG
  #
  # With no +keys+, merges the whole file; otherwise merges only the named
  # top-level sections. (blank? / deep_merge! are extension methods --
  # presumably from monkeyshines/extensions or activesupport; verify.)
  #
  # NOTE(review): YAML.load here parses the user's own config file, so
  # untrusted-input concerns don't apply, but safe_load would still be the
  # conservative choice.
  #
  def self.load_global_options! *keys
    all_defaults = YAML.load(File.open(ENV['HOME']+'/.monkeyshines'))
    if keys.blank?
      CONFIG.deep_merge! all_defaults
    else
      keys.each do |key|
        CONFIG.deep_merge!( all_defaults[key] || {} )
      end
    end
  end

  # Merge command-line options (see .options_from_cmdline) into
  # Monkeyshines::CONFIG.
  def self.load_cmdline_options!
    Monkeyshines::CONFIG.deep_merge! options_from_cmdline
  end

  #
  # Takes the values set on the command line
  # and merges them into the options hash:
  #   --source-filename
  # sets the value for options[:source][:filename], etc.
  #
  # Trollop's synthetic "<opt>_given" keys are skipped; each remaining key is
  # split on underscores into a nested path for deep_set. The :handle value
  # is normalized by stripping non-word characters and symbolizing.
  #
  def self.options_from_cmdline
    result = {}
    cmdline = self.get_cmdline_args
    cmdline.each do |key, val|
      next if key.to_s =~ /_given$/
      args = key.to_s.split(/_/).map(&:to_sym)+[val]
      result.deep_set(*args) # if val
    end
    result[:handle] = result[:handle].to_s.gsub(/\W/,'').to_sym
    result
  end

  # Use the trollop options defined in Monkeyshines::CMDLINE_OPTIONS
  # to extract command-line args.
  #
  # NOTE(review): Trollop::do_parse_args is not part of stock trollop's
  # public API -- presumably defined in the trollop copy vendored under
  # monkeyshines/utils; confirm against that file.
  def self.get_cmdline_args
    cmdline = Trollop::Parser.new
    Monkeyshines::CMDLINE_OPTIONS.each do |args|
      cmdline.opt *args
    end
    Trollop::do_parse_args(cmdline)
  end

end
@@ -0,0 +1,26 @@
1
+ require 'monkeyshines/runner'
2
+
3
module Monkeyshines
  # Runner that, after bookkeeping each result, enqueues any follow-on
  # requests the result generates -- up to GENERATION_LIMIT generations deep.
  class RecursiveRunner < Monkeyshines::Runner
    # Maximum recursion depth for generated requests.
    GENERATION_LIMIT = 5

    #
    # Generate requests that ensue from this one
    #
    # if GENERATION_LIMIT is 5, requests at generation 4 *do* generate recursive
    # jobs, ones at generation 5 do not (so, generation 6 shouldn't exist)
    #
    # FIX: removed a dead local (+iter = 0+) that was assigned and never read;
    # flattened the nested conditional into guard clauses. Behavior is
    # otherwise unchanged.
    #
    def bookkeep result
      super result
      return unless result
      result.req_generation = result.req_generation.to_i
      return if result.req_generation >= GENERATION_LIMIT
      # push each follow-on request back onto the source queue
      result.recursive_requests do |rec_req|
        source.put rec_req
      end
    end

  end
end
@@ -0,0 +1,57 @@
1
+ require 'right_aws'
2
module Monkeyshines
  module Repository
    #
    # Abstract interface for a versioned key/value repository (the S3
    # repository elsewhere in this gem presumably implements it; verify).
    # Most methods here are empty stubs that concrete subclasses override.
    #
    class Base

      # Does the repository hold +key+? (stub)
      def exists?(key)
      end
      alias_method :include?, :exists?

      # Store +val+ under +key+. (stub)
      def put key, val
      end

      # Retrieve the value stored under +key+. (stub)
      def get key
      end

      # Open any underlying connection/handle. (stub)
      def open
      end

      # Release any underlying connection/handle. (stub)
      def close
      end

      # URI locating +key+ within the repository. (stub)
      def uri key
      end

      # MD5 checksum for +key+, read from its metadata.
      def md5 key
        metadata key, :md5
      end
      alias_method :checksum, :md5

      # Stored size of +key+. (stub)
      def size key
      end

      # Stored timestamp of +key+. (stub)
      def timestamp key
      end

      # By default,
      # size+timestamp-md5
      # Ex:
      # 1251777182+20090222121200-577416a26499f6facf45973298be5276
      #
      # NOTE(review): size, timestamp and md5 are all defined to require a
      # +key+ argument, but are invoked here with none -- as written this
      # raises ArgumentError. Looks like version_id should itself take a key
      # and pass it through; confirm intended signature.
      def version_id
        "#{size}+#{timestamp}-#{md5}"
      end

      # Per-key metadata cache.
      # NOTE(review): this hash is read below but never written in this
      # class -- presumably subclasses (or a future version) populate it;
      # as-is every metadata call falls through to get_metadata.
      CACHED_METADATA = {}

      # Metadata hash for +key+, or a single +datum+ from it.
      # NOTE(review): if get_metadata returns nil (as the stub below does)
      # and +datum+ is given, attrs[datum] raises NoMethodError -- subclasses
      # must return a hash.
      def metadata key, datum=nil
        attrs = CACHED_METADATA[key] || get_metadata(key, datum)
        datum ? attrs[datum] : attrs
      end

      # fetch metadata for +key+ from the backing store. (stub)
      def get_metadata key, datum=nil
        #
      end

    end
  end
end