monkeyshines 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. data/.document +4 -0
  2. data/.gitignore +43 -0
  3. data/LICENSE +20 -0
  4. data/LICENSE.textile +20 -0
  5. data/README.textile +125 -0
  6. data/Rakefile +105 -0
  7. data/VERSION +1 -0
  8. data/examples/.gitignore +4 -0
  9. data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
  10. data/examples/rename_tree/rename_hdp_tree.rb +151 -0
  11. data/examples/rename_tree/rename_ripd_tree.rb +82 -0
  12. data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
  13. data/examples/shorturls/README.textile +111 -0
  14. data/examples/shorturls/bulkdump_shorturls.rb +46 -0
  15. data/examples/shorturls/bulkload_shorturls.rb +45 -0
  16. data/examples/shorturls/extract_urls.rb +12 -0
  17. data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
  18. data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
  19. data/examples/shorturls/old/shorturl_stats.rb +81 -0
  20. data/examples/shorturls/scrape_shorturls.rb +112 -0
  21. data/examples/shorturls/shorturl_request.rb +29 -0
  22. data/examples/shorturls/shorturl_sequence.rb +121 -0
  23. data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
  24. data/examples/shorturls/start_shorturl_cache.sh +2 -0
  25. data/lib/monkeyshines.rb +31 -0
  26. data/lib/monkeyshines/extensions.rb +16 -0
  27. data/lib/monkeyshines/fetcher.rb +10 -0
  28. data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
  29. data/lib/monkeyshines/fetcher/base.rb +44 -0
  30. data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
  31. data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
  32. data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
  33. data/lib/monkeyshines/monitor.rb +7 -0
  34. data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
  35. data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
  36. data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
  37. data/lib/monkeyshines/options.rb +59 -0
  38. data/lib/monkeyshines/recursive_runner.rb +26 -0
  39. data/lib/monkeyshines/repository/base.rb +57 -0
  40. data/lib/monkeyshines/repository/s3.rb +169 -0
  41. data/lib/monkeyshines/request_stream.rb +11 -0
  42. data/lib/monkeyshines/request_stream/base.rb +32 -0
  43. data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
  44. data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
  45. data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
  46. data/lib/monkeyshines/runner.rb +161 -0
  47. data/lib/monkeyshines/runner_core/options.rb +5 -0
  48. data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
  49. data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
  50. data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
  51. data/lib/monkeyshines/scrape_request.rb +136 -0
  52. data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
  53. data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
  54. data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
  55. data/lib/monkeyshines/store.rb +14 -0
  56. data/lib/monkeyshines/store/base.rb +29 -0
  57. data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
  58. data/lib/monkeyshines/store/conditional_store.rb +57 -0
  59. data/lib/monkeyshines/store/factory.rb +8 -0
  60. data/lib/monkeyshines/store/flat_file_store.rb +84 -0
  61. data/lib/monkeyshines/store/key_store.rb +51 -0
  62. data/lib/monkeyshines/store/null_store.rb +15 -0
  63. data/lib/monkeyshines/store/read_thru_store.rb +22 -0
  64. data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
  65. data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
  66. data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
  67. data/lib/monkeyshines/utils/factory_module.rb +106 -0
  68. data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
  69. data/lib/monkeyshines/utils/logger.rb +15 -0
  70. data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
  71. data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
  72. data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
  73. data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
  74. data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
  75. data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
  76. data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
  77. data/lib/monkeyshines/utils/trollop.rb +744 -0
  78. data/lib/monkeyshines/utils/union_interval.rb +52 -0
  79. data/lib/monkeyshines/utils/uri.rb +70 -0
  80. data/lib/monkeyshines/utils/uuid.rb +32 -0
  81. data/monkeyshines.gemspec +147 -0
  82. data/scrape_from_file.rb +44 -0
  83. data/spec/monkeyshines_spec.rb +7 -0
  84. data/spec/spec_helper.rb +9 -0
  85. metadata +183 -0
@@ -0,0 +1,57 @@
1
module Monkeyshines
  module Store
    #
    # Pairs a key cache with a bulk store: the caller's block is run, and its
    # result recorded, only for keys the cache does not already hold.
    #
    class ConditionalStore < Monkeyshines::Store::Base
      attr_accessor :options, :cache, :store, :misses

      # Plans used for the cache and the store when the caller gives none.
      DEFAULT_OPTIONS = {
        :cache => { :type => :tyrant_rdb_key_store },
        :store => { :type => :chunked_flat_file_store },
      }

      #
      # +_options+ is deep-merged over DEFAULT_OPTIONS; its +:cache+ and
      # +:store+ entries are each handed to Monkeyshines::Store.create.
      #
      # The cache object is sent #include?, #set_nr, #size and #close
      # (Monkeyshines::Store::TyrantRdbKeyStore, the default, provides these);
      # the store object is sent #<< and #close.
      #
      def initialize(_options)
        self.options = DEFAULT_OPTIONS.deep_merge(_options)
        self.cache   = Monkeyshines::Store.create(options[:cache])
        self.store   = Monkeyshines::Store.create(options[:store])
        self.misses  = 0
      end

      #
      # Runs the block only when +key+ is absent from the cache (or +force+ is
      # truthy). The block must return a pair [cache_val, store_val]:
      # cache_val is recorded in the cache under +key+ and store_val is
      # appended to the store. When cache_val is nil or false nothing is
      # recorded.
      #
      # Returns store_val when something was recorded, nil otherwise.
      #
      # Ex:
      #   cond_store.set(url) do
      #     response = fetcher.get url   # only runs if url isn't cached yet
      #     [response.timestamp, response]
      #   end
      #
      def set(key, force = nil, &block)
        # the cache is not consulted at all when force is given
        return unless force || !cache.include?(key)
        fetched = block.call
        cache_val, store_val = fetched
        return if !cache_val
        cache.set_nr(key, cache_val)   # remember that this key has been seen
        store << store_val             # persist the payload
        self.misses += 1               # one more cache miss handled
        store_val
      end

      # Number of entries, as reported by the cache.
      def size
        cache.size
      end

      # Fields for periodic logging: the cache size and the miss count.
      def log_line
        miss_report = "%8d misses"%misses
        [size, miss_report]
      end

      # Closes the cache first, then the store.
      def close
        cache.close
        store.close
      end
    end
  end
end
@@ -0,0 +1,8 @@
1
module Monkeyshines
  module Store
    # Placeholder factory class: its only method has an empty body.
    class Factory
      class << self
        # Accepts a store +type+ and +opts+; currently does nothing and
        # returns nil.
        def generate(type, opts)
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
1
+ require 'fileutils'; include FileUtils
2
+
3
module Monkeyshines
  module Store
    #
    # A store backed by one flat file holding a tab-separated record per line.
    #
    class FlatFileStore < Store::Base
      attr_accessor :filename, :filemode

      #
      # Recognized options:
      # +:filename+ : path of the backing file (required)
      # +:filemode+ : mode handed to File.open; defaults to 'r'
      # +:skip+     : number of leading lines to read past immediately
      #
      # Raises a RuntimeError when +:filename+ is missing.
      #
      def initialize(options = {})
        Log.debug "New #{self.class} as #{options.inspect}"
        path = options[:filename]
        self.filename = path
        raise "Missing filename in #{self.class}" unless path
        self.filemode = options[:filemode] || 'r'
        skip!(options[:skip]) if options[:skip]
      end

      #
      # Yields the tab-separated fields of every line in the file, passing
      # over lines that begin with '#' and lines that hold no fields.
      #
      def each
        file.each do |line|
          next if line.start_with?('#')
          attrs = line.chomp.split("\t")
          # split always returns an Array, so empty? covers the blank case
          next if attrs.empty?
          yield(*attrs)
        end
      end

      #
      # Read ahead n_lines lines in the file. Stops quietly at end of file
      # instead of raising EOFError when the file holds fewer lines.
      #
      # Returns n_lines when that many lines were skipped, nil when the end
      # of the file was reached first.
      #
      def skip!(n_lines)
        Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
        n_lines.times do
          break if file.gets.nil?
        end
      end

      #
      # Open the file (memoized until #close). For write and append modes
      # the file's directory is created first if it does not exist.
      #
      def file
        return @file if @file
        mkdir! if filemode.to_s =~ /\A[wa]/
        Log.info "Opening file #{filename} with mode #{filemode}"
        @file = File.open(filename, filemode)
      end

      # Close the dump file; a later call to #file reopens it.
      def close
        @file.close if @file
        @file = nil
      end

      # Ensure the file's directory exists
      def mkdir!
        dir = File.dirname(filename)
        return if File.directory?(dir)
        Log.info "Making directory #{dir}"
        FileUtils.mkdir_p dir
      end

      # Write obj.to_flat to the file as one tab-joined line; returns obj.
      def save(obj)
        file << "#{obj.to_flat.join("\t")}\n"
        obj
      end

      #
      # Calls the block, which must return a pair whose second element is the
      # object to save. +key+ and the pair's first element are not used.
      #
      def set(key, *args, &block)
        _token, obj = block.call
        save obj
      end

      # delegates to +#save+ -- writes the object to the file
      def <<(obj)
        save obj
      end

    end
  end
end
84
+
@@ -0,0 +1,51 @@
1
module Monkeyshines
  module Store
    #
    # Common behaviour for stores that keep values under keys in a backing
    # database object.
    #
    class KeyStore < Monkeyshines::Store::Base
      # The actual backing store. The methods below send it #iterinit,
      # #iternext, #[], #[]=, #close and #size.
      attr_accessor :db

      #
      # Walks the whole DB, in whatever order the DB hands keys out, yielding
      # each key together with its value:
      #
      #   key_store.each do |key, val|
      #     # ... stuff ...
      #   end
      #
      def each
        db.iterinit
        loop do
          key = db.iternext
          break unless key   # a nil/false key marks the end of the iteration
          yield key, db[key]
        end
      end

      # Save the value into the database; nil and false values are ignored.
      def set(key, val)
        db[key] = val if val
      end

      # NOTE: the alias is bound to the #set defined just above, so #save
      # keeps calling this implementation even in subclasses that redefine
      # #set.
      alias save set

      # Fetch the value stored under +key+.
      def get(key)
        db[key]
      end

      # Hash-style read access; same lookup as #get.
      def [](key)
        db[key]
      end

      # Close the backing database.
      def close
        db.close
      end

      # Number of entries, as reported by the backing database.
      def size
        db.size
      end

      #
      # Load from standard command-line options: +cmdline_opts+ is merged over
      # +default_opts+ and the resulting +:store_db+ entry is passed to .new.
      #
      # obvs only works when there's just one store
      #
      def self.new_from_command_line(cmdline_opts, default_opts = {})
        merged = default_opts.merge(cmdline_opts)
        new(merged[:store_db])
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Monkeyshines
  module Store
    # A store whose #each and #set have empty bodies.
    class NullStore < Monkeyshines::Store::Base

      # Never yields; returns nil.
      def each(*_args, &_block); end

      # Does nothing! Returns nil whatever it is given.
      def set(*_args); end

    end
  end
end
@@ -0,0 +1,22 @@
1
module Monkeyshines
  module Store
    #
    # A key store that computes a value only for keys it does not hold yet.
    #
    class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore

      #
      # When +key+ is already in the db (and +force+ is not given) the block
      # is never called and nil is returned. Otherwise the block's result is
      # saved under +key+ via the superclass #set; a nil or false result is
      # not saved.
      #
      # Ex:
      #   rt_store.set(url) do
      #     fetcher.get url # will only be called if url isn't in rt_store
      #   end
      #
      def set(key, force = nil, &block)
        # the db is not consulted at all when force is given
        return unless force || !db.has_key?(key)
        result = block.call
        return unless result
        super(key, result)
      end

    end
  end
end
@@ -0,0 +1,33 @@
1
+ require 'tokyocabinet'
2
module Monkeyshines
  module Store
    #
    # Implementation of KeyStore with a Local TokyoCabinet table database (TDB)
    #
    class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore

      #
      # +db_uri+ is handed to TokyoCabinet::TDB#open together with the OWRITER
      # flag only (no create flag is passed). Any further arguments go to the
      # superclass initializer.
      #
      # Raises a RuntimeError when the open call reports failure.
      #
      def initialize(db_uri, *args)
        self.db = TokyoCabinet::TDB.new
        opened = db.open(db_uri, TokyoCabinet::TDB::OWRITER)
        raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}" unless opened
        super(*args)
      end

      #
      # Iterates over the whole DB, yielding klass.from_hash(value) for each
      # stored value; the keys are not passed on.
      #
      def each_as(klass)
        each { |_key, hsh| yield klass.from_hash(hsh) }
      end

      # Stores val.to_hash.compact under +key+; nil and false values are ignored.
      def set(key, val)
        db.put(key, val.to_hash.compact) if val
      end

      # Record count, as reported by the database's #rnum.
      def size
        db.rnum
      end

    end #class
  end
end
@@ -0,0 +1,56 @@
1
+ require 'tokyotyrant'
2
module Monkeyshines
  module Store

    #
    # Implementation of KeyStore on top of a TokyoTyrant::RDB connection,
    # addressed by a host:port URI.
    #
    class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
      attr_accessor :db_host, :db_port

      #
      # +options[:uri]+ is the host:port uri of the key store (required); it
      # is split on ':' into db_host and db_port. The options are then passed
      # on to the superclass initializer.
      #
      # Raises a RuntimeError when the uri is blank.
      #
      def initialize(options)
        raise "URI for #{self.class} is required" if options[:uri].blank?
        self.db_host, self.db_port = options[:uri].to_s.split(':')
        super options
      end

      #
      # The connection, opened on first use and memoized until #close.
      #
      # The handle is memoized only once the open call has succeeded, so a
      # failed attempt raises and leaves nothing behind: the next call tries
      # to connect again instead of handing back an unopened handle.
      #
      def db
        return @db if @db
        connection = TokyoTyrant::RDB.new
        unless connection.open(db_host, db_port)
          raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{connection.ecode}: #{connection.errmsg(connection.ecode)}")
        end
        @db = connection
      end

      # Closes the connection if one is open; a later #db call reconnects.
      def close
        @db.close if @db
        @db = nil
      end

      # Save the value into the database without waiting for a response.
      # nil and false values are ignored.
      def set_nr(key, val)
        db.putnr key, val if val
      end

      # Record count, as reported by the connection's #rnum.
      def size
        db.rnum
      end

      # True when the database holds the given key.
      def include?(*args)
        db.has_key?(*args)
      end

      # Disabled MemCache-based variant, kept for reference:
      #
      # require 'memcache'
      # def initialize db_uri=nil, *args
      #   # db_uri ||= ':1978'
      #   # self.db_host, self.db_port = db_uri.split(':')
      #   self.db = MemCache.new(db_uri, :no_reply => true)
      #   if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
      #   super *args
      # end
      #
      # def size
      #   db.stats
      # end

    end #class
  end
end
56
+
@@ -0,0 +1,20 @@
1
+ require 'tokyotyrant'
2
+ require 'tyrant_rdb_key_store'
3
module Monkeyshines
  module Store
    #
    # Variant of TyrantRdbKeyStore whose connection is a TokyoTyrant::RDBTBL
    # (table database) handle instead of a TokyoTyrant::RDB one. Everything
    # else, including the host:port handling, is inherited.
    #
    class TyrantTdbKeyStore < TyrantRdbKeyStore

      #
      # The connection, opened on first use and memoized.
      #
      # The handle is memoized only once the open call has succeeded, so a
      # failed attempt raises and leaves nothing behind: the next call tries
      # to connect again instead of handing back an unopened handle.
      #
      def db
        return @db if @db
        connection = TokyoTyrant::RDBTBL.new
        unless connection.open(db_host, db_port)
          raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{connection.ecode}: #{connection.errmsg(connection.ecode)}")
        end
        @db = connection
      end

    end #class
  end
end
20
+
@@ -0,0 +1,106 @@
1
#
# Makes a module behave as a factory.
#
# A module that extends FactoryModule gets a method #new(klass_name, *args):
# this finds the class corresponding to klass_name and creates it with *args as
# arguments (a block, if given, is passed along too).
#
# if +klass_name+ is a class, it's used directly. Otherwise, it's converted to
# a class: its string form is camelized and looked up with find_const relative
# to the extending module's namespace. (So, in the example below,
# :file_doc_source, :FileDocSource and the class DocSource::FileDocSource
# itself all name the same class.)
#
# The extending module also gets:
#   .from_hash(plan) -- for a Hash, calls from_hash(plan) on the class named by
#                       plan[:type] (or plan['type']); anything else is
#                       returned untouched.
#   .create(plan)    -- for a Hash, calls new(plan) on the class named by its
#                       type entry; for a Symbol, calls new() on the class it
#                       names; anything else is returned untouched.
# Both raise a RuntimeError for a Hash that has no type entry.
#
# Example. Given:
#
#   module DocSource
#     extend FactoryModule
#   end
#
#   # ... elsewhere ...
#   module DocSource
#     # load docs from file
#     class FileDocSource
#       def initialize filename
#         #...
#       end
#     end
#
#     # load docs from a MySQL server
#     class MySqlDocSource
#       def initialize host, port, user, password
#         # ...
#       end
#     end
#   end
#
# Then:
#   DocSource.new :file_doc_source, '/tmp/foo.doc'                   # => a DocSource::FileDocSource
#   DocSource.new :MySqlDocSource, 'localhost', 6666, 'me', 'secret' # => a DocSource::MySqlDocSource
#
#
module FactoryModule
  def self.extended(base)
    base.class_eval do

      # Instantiate the class named by klass_name with the remaining
      # arguments and block.
      def self.new(klass_name, *args, &block)
        FactoryModule.get_class(self, klass_name).new(*args, &block)
      end

      # Rebuild an object from a plan hash via its class's own .from_hash;
      # non-Hash plans are returned as they are.
      def self.from_hash(plan)
        return plan unless plan.is_a?(Hash)
        klass_name = FactoryModule.klass_name_from_plan(plan)
        FactoryModule.get_class(self, klass_name).from_hash(plan)
      end

      # Build an object from a plan: a Hash (class named by its type entry,
      # constructed with the whole hash) or a Symbol (class constructed with
      # no arguments). Anything else is assumed to be already built and is
      # returned as it is.
      def self.create(plan)
        case plan
        when Hash
          klass_name = FactoryModule.klass_name_from_plan(plan)
          FactoryModule.get_class(self, klass_name).new(plan)
        when Symbol
          FactoryModule.get_class(self, plan).new
        else
          plan
        end
      end

    end
  end

  # Resolve klass_name relative to the extending module.
  def get_class(klass_name)
    FactoryModule.get_class(self, klass_name)
  end

  #
  # The class name held in a plan hash's :type (or 'type') entry.
  # Raises a RuntimeError when the plan has neither.
  #
  def self.klass_name_from_plan(plan)
    klass_name = plan[:type] || plan['type']
    return klass_name if klass_name
    raise "Fat, drunk, and stupid is no way to go through life, son. You need a plan: #{plan.inspect}"
  end

  # Cache of resolved classes, keyed by [scope, klass_name].
  FACTORY_CLASSES = {}

  #
  # Resolve klass_name to a class within scope, remembering the answer.
  #
  # A Class is used directly; anything else is turned into a string,
  # camelized and looked up with scope.find_const
  # (find_const from wukong/extensions/module via extlib).
  #
  # Raises a RuntimeError when the lookup raises a NameError.
  #
  def self.get_class(scope, klass_name)
    cache_key = [scope, klass_name]
    cached = FACTORY_CLASSES[cache_key]
    return cached if cached
    klass =
      if klass_name.is_a?(Class)
        klass_name
      else
        begin
          scope.find_const(klass_name.to_s.camelize)
        rescue NameError
          raise "Can't find #{klass_name.inspect} in #{scope}"
        end
      end
    FACTORY_CLASSES[cache_key] = klass
  end

  #
  # Resolve several names at once. klass_names is an array or a
  # comma-separated string; each String name is joined with the optional
  # prefix and suffix by '_' before being resolved.
  #
  #   FactoryModule.list_of_classes(Wuclan::Twitter::Scrape, 'followers_ids,friends_ids', nil, 'request')
  #   # => [Wuclan::Twitter::Scrape::FollowersIdsRequest, Wuclan::Twitter::Scrape::FriendsIdsRequest]
  #
  def self.list_of_classes(scope, klass_names, prefix = nil, suffix = nil)
    klass_names = klass_names.split(',') if klass_names.is_a?(String)
    klass_names.map do |klass_name|
      klass_name = [prefix, klass_name, suffix].compact.join('_') if klass_name.is_a?(String)
      get_class(scope, klass_name)
    end
  end

end