monkeyshines 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (85) hide show
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
@@ -0,0 +1,57 @@
1
module Monkeyshines
  module Store
    #
    # Couples a key-value +cache+ with a bulk +store+: a value is only
    # computed and written to the store when its key is not yet in the cache.
    #
    # Both members are built with Monkeyshines::Store.create from
    # options[:cache] and options[:store].
    #
    # The cache must respond to #include?, #set_nr, #size and #close; the
    # store must respond to #<< and #close.
    #
    # NOTE(review): an earlier comment listed a plain Hash as an acceptable
    # cache, but #set_nr and #close are called on it here -- confirm.
    #
    class ConditionalStore < Monkeyshines::Store::Base
      attr_accessor :options, :cache, :store, :misses

      DEFAULT_OPTIONS = {
        :cache => { :type => :tyrant_rdb_key_store },
        :store => { :type => :chunked_flat_file_store },
      }

      #
      # +_options+ is deep-merged over DEFAULT_OPTIONS; its :cache and :store
      # entries are handed to Monkeyshines::Store.create.
      #
      def initialize(_options)
        self.options = DEFAULT_OPTIONS.deep_merge(_options)
        self.cache   = Monkeyshines::Store.create(options[:cache])
        self.store   = Monkeyshines::Store.create(options[:store])
        self.misses  = 0
      end

      #
      # Unless +force+ is set, does nothing when +key+ is already cached.
      # Otherwise calls the block, which must return [cache_val, store_val].
      # A falsy cache_val aborts without saving anything.
      #
      # Returns store_val when something was saved, nil otherwise.
      #
      # Ex:
      #   rt_store.set(url) do
      #     fetcher.get url # will only be called if url isn't in rt_store
      #   end
      #
      def set(key, force = nil, &block)
        unless force
          return if cache.include?(key)
        end
        cache_val, store_val = block.call
        return unless cache_val
        cache.set_nr(key, cache_val)  # record the key in the cache
        store << store_val            # persist the payload
        self.misses += 1              # count the cache miss
        store_val
      end

      # Number of entries, as reported by the cache.
      def size
        cache.size
      end

      # Fields for a progress log line: cache size and the miss count.
      def log_line
        [size, format("%8d misses", misses)]
      end

      # Closes the cache, then the store.
      def close
        cache.close
        store.close
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Monkeyshines
  module Store
    # Placeholder factory: +generate+ accepts a type and options but has no
    # implementation yet and always returns nil.
    class Factory
      class << self
        # Currently a no-op; both arguments are ignored.
        def generate(type, opts)
          nil
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'fileutils'; include FileUtils
2
+
3
module Monkeyshines
  module Store
    #
    # Store backed by a single tab-separated flat file.
    #
    # Reading (#each) yields the tab-split fields of each line; writing
    # (#save, #<<, #set) appends obj.to_flat joined with tabs.
    #
    class FlatFileStore < Store::Base
      attr_accessor :filename, :filemode

      #
      # Options:
      # +:filename+ : path of the file (required)
      # +:filemode+ : mode handed to File.open; defaults to 'r'
      # +:skip+     : number of leading lines to read past immediately
      #
      # Raises if +:filename+ is missing.
      #
      def initialize(options = {})
        Log.debug "New #{self.class} as #{options.inspect}"
        self.filename = options[:filename]
        raise "Missing filename in #{self.class}" unless filename
        self.filemode = options[:filemode] || 'r'
        skip!(options[:skip]) if options[:skip]
      end

      #
      # Yields the tab-separated fields of every line, ignoring lines that
      # start with '#' and lines with no fields.
      #
      def each(&block)
        file.each do |line|
          next if line[0..0] == '#'
          attrs = line.chomp.split("\t")
          next if attrs.empty?
          yield(*attrs)
        end
      end

      #
      # Read ahead n_lines lines in the file.
      #
      # Stops quietly at end of file instead of raising EOFError when the
      # file holds fewer than n_lines lines.
      #
      def skip!(n_lines)
        Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
        n_lines.times do
          break unless file.gets
        end
      end

      #
      # Lazily open +filename+ with +filemode+ and memoize the handle.
      # Does not create the directory; call #mkdir! first if that is needed.
      #
      def file
        return @file if @file
        Log.info "Opening file #{filename} with mode #{filemode}"
        @file = File.open(filename, filemode)
      end

      # Close the file (if open) and forget the handle.
      def close
        @file.close if @file
        @file = nil
      end

      # Ensure the file's directory exists
      def mkdir!
        dir = File.dirname(filename)
        return if File.directory?(dir)
        Log.info "Making directory #{dir}"
        FileUtils.mkdir_p dir
      end

      # Write obj.to_flat as one tab-separated line; returns obj.
      def save(obj)
        file << obj.to_flat.join("\t") + "\n"
        obj
      end

      #
      # Calls the block, which must return [token, obj], and saves obj.
      # +key+, any extra arguments and the token are ignored.
      #
      def set(key, *args, &block)
        _tok, obj = block.call
        save obj
      end

      # delegates to +#save+ -- writes the object to the file
      def <<(obj)
        save obj
      end

    end
  end
end
84
+
@@ -0,0 +1,51 @@
1
module Monkeyshines
  module Store
    #
    # Base class for stores that wrap a key-value database handle.
    #
    class KeyStore < Monkeyshines::Store::Base
      # The backing database. This class calls #[], #[]=, #iterinit,
      # #iternext, #close and #size on it.
      attr_accessor :db

      #
      # Walks the whole DB using its iterinit/iternext cursor, in whatever
      # order the DB returns keys, yielding each pair:
      #
      #   key_store.each do |key, val|
      #     # ... stuff ...
      #   end
      #
      def each(&block)
        db.iterinit
        loop do
          key = db.iternext
          break unless key
          yield key, db[key]
        end
      end

      # Store +val+ under +key+; a nil/false value is silently skipped.
      def set(key, val)
        return unless val
        db[key] = val
      end

      alias_method :save, :set

      # Fetch the value stored under +key+.
      def get(key)
        db[key]
      end

      # Hash-style read access; same as #get.
      def [](key)
        db[key]
      end

      # Close the backing database.
      def close
        db.close
      end

      # Number of records, as reported by the backing database.
      def size
        db.size
      end

      #
      # Build a store from command-line options merged over +default_opts+;
      # the merged :store_db entry is passed to the constructor.
      #
      # obvs only works when there's just one store
      #
      def self.new_from_command_line(cmdline_opts, default_opts = {})
        merged = default_opts.merge(cmdline_opts)
        new(merged[:store_db])
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Monkeyshines
  module Store
    #
    # A store that discards everything: iterating yields nothing and
    # setting a value does nothing.
    #
    class NullStore < Monkeyshines::Store::Base

      # Never yields; returns nil.
      def each(*args, &block)
        nil
      end

      # Does nothing! Accepts any arguments and returns nil.
      def set(*args)
        nil
      end

    end
  end
end
@@ -0,0 +1,22 @@
1
module Monkeyshines
  module Store
    #
    # Key store that only computes a value when its key is missing.
    #
    class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore

      #
      # Unless +force+ is set, returns immediately (without calling the
      # block) when the db already has +key+. Otherwise the block's result is
      # saved through the parent's #set; a nil/false result is not saved.
      #
      # Ex:
      #   rt_store.set(url) do
      #     fetcher.get url # will only be called if url isn't in rt_store
      #   end
      #
      def set(key, force = nil, &block)
        unless force
          return if db.has_key?(key)
        end
        result = block.call
        return unless result
        super(key, result)
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'tokyocabinet'
2
module Monkeyshines
  module Store
    #
    # KeyStore backed by a local TokyoCabinet table database (TDB).
    #
    class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore

      #
      # +db_uri+ is the filename or URI of a TokyoCabinet table-style DB; it
      # is opened with the OWRITER flag only. Remaining arguments go to the
      # parent constructor. Raises if the DB cannot be opened.
      #
      # NOTE(review): an earlier comment mentioned a create_db flag for
      # creating a missing DB file; no such option is implemented here.
      #
      def initialize(db_uri, *args)
        self.db = TokyoCabinet::TDB.new
        opened = db.open(db_uri, TokyoCabinet::TDB::OWRITER)
        raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}" unless opened
        super(*args)
      end

      # Yields klass.from_hash(record) for every record in the DB.
      def each_as(klass, &block)
        each do |_key, hsh|
          yield klass.from_hash(hsh)
        end
      end

      # Store val.to_hash (compacted) under +key+; a nil/false val is skipped.
      def set(key, val)
        return unless val
        db.put(key, val.to_hash.compact)
      end

      # Record count, from the DB's #rnum.
      def size
        db.rnum
      end

    end #class
  end
end
@@ -0,0 +1,56 @@
1
+ require 'tokyotyrant'
2
module Monkeyshines
  module Store

    #
    # KeyStore backed by a TokyoTyrant::RDB connection, reached at the
    # host:port given in options[:uri].
    #
    class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
      attr_accessor :db_host, :db_port

      #
      # +options[:uri]+ is the "host:port" of the key store (required).
      # Raises if it is blank. The options are passed on to the parent.
      #
      def initialize(options)
        raise "URI for #{self.class} is required" if options[:uri].blank?
        self.db_host, self.db_port = options[:uri].to_s.split(':')
        super options
      end

      #
      # Lazily opened connection. The handle is memoized only after #open
      # succeeds, so a failed attempt raises and the next call retries
      # instead of handing back an unopened handle.
      #
      def db
        return @db if @db
        handle = TokyoTyrant::RDB.new
        handle.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{handle.ecode}: #{handle.errmsg(handle.ecode)}")
        @db = handle
      end

      # Close the connection (if any) and forget it.
      def close
        @db.close if @db
        @db = nil
      end

      # Save the value into the database without waiting for a response.
      # A nil/false value is skipped.
      def set_nr(key, val)
        db.putnr key, val if val
      end

      # Record count, from the DB's #rnum.
      def size
        db.rnum
      end

      # True when the DB has the given key; arguments go to db.has_key?.
      def include?(*args)
        db.has_key?(*args)
      end

      # Disabled memcache-based alternative, kept for reference:
      #
      # require 'memcache'
      # def initialize db_uri=nil, *args
      #   # db_uri ||= ':1978'
      #   # self.db_host, self.db_port = db_uri.split(':')
      #   self.db = MemCache.new(db_uri, :no_reply => true)
      #   if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
      #   super *args
      # end
      #
      # def size
      #   db.stats
      # end

    end #class
  end
end
56
+
@@ -0,0 +1,20 @@
1
+ require 'tokyotyrant'
2
+ require 'tyrant_rdb_key_store'
3
module Monkeyshines
  module Store
    #
    # KeyStore backed by a TokyoTyrant table connection
    # (TokyoTyrant::RDBTBL). Everything except the connection type is
    # inherited from TyrantRdbKeyStore.
    #
    class TyrantTdbKeyStore < TyrantRdbKeyStore

      #
      # Lazily opened table connection. The handle is memoized only after
      # #open succeeds, so a failed attempt raises and the next call retries
      # instead of handing back an unopened handle.
      #
      def db
        return @db if @db
        handle = TokyoTyrant::RDBTBL.new
        handle.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{handle.ecode}: #{handle.errmsg(handle.ecode)}")
        @db = handle
      end

    end #class
  end
end
20
+
@@ -0,0 +1,106 @@
1
#
# Makes a module behave as a factory.
#
# A module that extends FactoryModule gets these module-level methods:
#
# * new(klass_name, *args) -- finds the class for klass_name and
#   instantiates it with *args.
# * from_hash(plan)        -- for a Hash plan, finds the class named by its
#   :type (or 'type') entry and calls that class's from_hash(plan).
# * create(plan)           -- for a Hash plan, instantiates the class named
#   by its :type entry with the plan; for a Symbol, instantiates that class
#   with no arguments; anything else is returned unchanged.
# * get_class(klass_name)  -- just the class lookup.
#
# If +klass_name+ is a Class it is used directly. Otherwise it is converted
# to a string, camelized, and looked up with the extending module's
# find_const, so the name is interpreted relative to that module's
# namespace. Lookups are memoized in FACTORY_CLASSES.
#
# Example. Given:
#
#   module DocSource
#     extend FactoryModule
#   end
#
#   # ... elsewhere ...
#   module DocSource
#     # load docs from file
#     class FileDocSource
#       def initialize filename
#         #...
#       end
#     end
#
#     # load docs from a database
#     class MySqlDocSource
#       def initialize host, port, user, password
#         # ...
#       end
#     end
#   end
#
# Then:
#   DocSource.new :file_doc_source, '/tmp/foo.doc'   # => a DocSource::FileDocSource
#   DocSource.new :MySqlDocSource, 'localhost', 6666 # => a DocSource::MySqlDocSource
#
module FactoryModule
  # Hook run on `extend FactoryModule`: installs the factory methods
  # directly on the extending module.
  def self.extended(base)
    base.singleton_class.class_eval do

      # Instantiate the class named by +klass_name+ with +args+.
      def new(klass_name, *args)
        FactoryModule.get_class(self, klass_name).new(*args)
      end

      # Non-Hash plans are returned untouched. A Hash plan must carry a
      # :type (or 'type') entry, otherwise this raises.
      def from_hash(plan)
        return plan unless plan.is_a?(Hash)
        klass_name = plan[:type] || plan['type']
        raise "Fat, drunk, and stupid is no way to go through life, son. You need a plan: #{plan.inspect}" unless klass_name
        FactoryModule.get_class(self, klass_name).from_hash(plan)
      end

      # Build an object from a Hash or Symbol plan; any other plan is
      # returned as-is.
      def create(plan)
        if plan.is_a?(Hash)
          FactoryModule.get_class(self, plan[:type] || plan['type']).new(plan)
        elsif plan.is_a?(Symbol)
          FactoryModule.get_class(self, plan).new
        else
          plan
        end
      end

    end
  end

  # Class lookup relative to the extending module.
  def get_class(klass_name)
    FactoryModule.get_class(self, klass_name)
  end

  # Memo of resolved classes, keyed by [scope, klass_name].
  FACTORY_CLASSES = {}

  #
  # Resolve +klass_name+ within +scope+. A Class is returned as given; any
  # other name is camelized and looked up with scope.find_const (find_const
  # from wukong/extensions/module via extlib). A NameError during lookup is
  # re-raised as a RuntimeError naming the missing class and the scope.
  #
  def self.get_class(scope, klass_name)
    cache_key = [scope, klass_name]
    cached = FACTORY_CLASSES[cache_key]
    return cached if cached
    klass =
      if klass_name.is_a?(Class)
        klass_name
      else
        begin
          scope.find_const(klass_name.to_s.camelize)
        rescue NameError
          raise "Can't find #{klass_name.inspect} in #{scope}"
        end
      end
    FACTORY_CLASSES[cache_key] = klass
  end

  #
  # Resolve several classes at once. +klass_names+ is an array, or a
  # comma-separated String. String names are joined as
  # prefix_name_suffix (nil parts dropped) before lookup; non-String names
  # are looked up unchanged.
  #
  #   FactoryModule.list_of_classes(Wuclan::Twitter::Scrape, 'followers_ids,friends_ids', nil, 'request')
  #   # looks up 'followers_ids_request' and 'friends_ids_request', i.e.
  #   # => [Wuclan::Twitter::Scrape::FollowersIdsRequest, Wuclan::Twitter::Scrape::FriendsIdsRequest]
  #
  def self.list_of_classes(scope, klass_names, prefix = nil, suffix = nil)
    names = klass_names.is_a?(String) ? klass_names.split(',') : klass_names
    names.map do |name|
      name = [prefix, name, suffix].compact.join('_') if name.is_a?(String)
      get_class(scope, name)
    end
  end

end