monkeyshines 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
module Monkeyshines
  module Store
    #
    # Couples a key +cache+ with a bulk +store+: a record is produced and
    # written to +store+ only when its key is not yet in +cache+ (or when the
    # caller forces it).
    #
    class ConditionalStore < Monkeyshines::Store::Base
      attr_accessor :options, :cache, :store, :misses

      DEFAULT_OPTIONS = {
        :cache => { :type => :tyrant_rdb_key_store },
        :store => { :type => :chunked_flat_file_store },
      }

      #
      # Deep-merges +_options+ over DEFAULT_OPTIONS, then builds the cache and
      # the store (in that order) through Monkeyshines::Store.create.
      #
      # This class sends +cache+ the messages #include?, #set_nr, #size and
      # #close, and sends +store+ the messages #<< and #close.
      # Monkeyshines::Store::TyrantRdbKeyStore is a cromulent cache; a bare
      # Hash is not, since it lacks #set_nr and #close.
      #
      def initialize _options
        merged = DEFAULT_OPTIONS.deep_merge(_options)
        self.options = merged
        self.cache   = Monkeyshines::Store.create(options[:cache])
        self.store   = Monkeyshines::Store.create(options[:store])
        self.misses  = 0
      end

      #
      # Calls the block and records its result unless +key+ is already cached.
      #
      # The block must return a pair [cache_value, store_value]. When +key+ is
      # in the cache and +force+ is falsy, the block is never called and nil
      # is returned. When the block's cache_value is falsy nothing is saved
      # and nil is returned. Otherwise returns store_value.
      #
      # Ex:
      #   rt_store.set(url) do
      #     fetcher.get url # will only be called if url isn't in rt_store
      #   end
      #
      def set key, force=nil, &block
        already_cached = !force && cache.include?(key)
        return if already_cached

        cache_entry, record = block.call
        return unless cache_entry

        cache.set_nr(key, cache_entry)  # remember the key
        store << record                 # persist the payload
        self.misses += 1                # count the cache miss
        record
      end

      # Number of entries, as reported by the cache.
      def size
        cache.size
      end

      # Fields for a progress log line: the cache size and the miss count.
      def log_line
        [size, format("%8d misses", misses)]
      end

      # Closes the cache first, then the store.
      def close
        cache.close
        store.close
      end
    end
  end
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'fileutils'; include FileUtils
|
2
|
+
|
3
|
+
module Monkeyshines
  module Store
    #
    # Store backed by a single flat file holding one tab-separated record per
    # line.
    #
    class FlatFileStore < Store::Base
      attr_accessor :filename, :filemode

      #
      # Options:
      # +:filename+ : path of the file (required; raises if missing)
      # +:filemode+ : mode handed to File.open, 'r' when not given
      # +:skip+     : number of lines to read past immediately
      #
      def initialize options={}
        Log.debug "New #{self.class} as #{options.inspect}"
        (self.filename = options[:filename]) || raise("Missing filename in #{self.class}")
        self.filemode = options.fetch(:filemode, nil) || 'r'
        lines_to_skip = options[:skip]
        skip!(lines_to_skip) if lines_to_skip
      end

      #
      # Yields the tab-separated fields of every line in the file, passing
      # over lines that begin with '#' and lines with no fields.
      #
      def each &block
        file.each do |line|
          next if line.start_with?('#')
          fields = line.chomp.split("\t")
          next if fields.blank?
          yield(*fields)
        end
      end

      #
      # Read ahead n_lines lines in the file
      #
      def skip! n_lines
        Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
        n_lines.times { file.readline }
      end

      #
      # The open file handle; opened with +filemode+ on first use and reused
      # afterwards. Does not create the directory -- see #mkdir! for that.
      #
      def file
        @file ||= begin
          Log.info "Opening file #{filename} with mode #{filemode}"
          File.open(filename, filemode)
        end
      end

      # Close the file, if open, and forget the handle
      def close
        @file.close if @file
        @file = nil
      end

      # Ensure the file's directory exists
      def mkdir!
        parent = File.dirname(filename)
        unless File.directory?(parent)
          Log.info "Making directory #{parent}"
          FileUtils.mkdir_p parent
        end
      end

      # Append obj.to_flat, tab-joined, as one line of the file; returns obj
      def save obj
        file << "#{obj.to_flat.join("\t")}\n"
        obj
      end

      # Calls the block, which returns a pair, and saves the pair's second
      # element; +key+ and any extra arguments are ignored.
      def set key, *args, &block
        _token, record = block.call
        save record
      end

      # delegates to +#save+ -- writes the object to the file
      def <<(obj)
        save(obj)
      end

    end
  end
end
|
84
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Monkeyshines
  module Store
    #
    # Key-value store that forwards to a backing database object.
    #
    class KeyStore < Monkeyshines::Store::Base
      # The actual backing store. This class calls #[], #[]=, #iterinit,
      # #iternext, #close and #size on it.
      attr_accessor :db

      #
      # Walks the whole DB, in whatever order the DB hands keys back, yielding
      # each |key, val| pair:
      #
      #   key_store.each do |key, val|
      #     # ... stuff ...
      #   end
      #
      def each &block
        db.iterinit
        loop do
          key = db.iternext
          break unless key   # a falsy key ends the walk
          yield key, db[key]
        end
      end

      # Save the value into the database; a falsy value is ignored
      def set(key, val)
        db[key] = val if val
      end

      alias_method :save, :set

      # Value stored under +key+
      def get(key)
        db[key]
      end

      # Value stored under +key+
      def [](key)
        db[key]
      end

      # Close the backing database
      def close
        db.close
      end

      # Size as reported by the backing database
      def size
        db.size
      end

      #
      # Build a store from standard command-line options: +cmdline_opts+ is
      # merged over +default_opts+ and the resulting :store_db value is passed
      # to .new.
      #
      # obvs only works when there's just one store
      #
      def self.new_from_command_line cmdline_opts, default_opts={}
        merged = default_opts.merge(cmdline_opts)
        new(merged[:store_db])
      end
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Monkeyshines
  module Store
    #
    # Key store that only computes and saves a value for keys it does not
    # already hold.
    #
    class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore

      #
      # Saves the block's result under +key+ when the key is absent, or when
      # +force+ is truthy. If the key is present and +force+ is falsy the
      # block is never called. A falsy block result is not saved.
      #
      # Ex:
      #   rt_store.set(url) do
      #     fetcher.get url # will only be called if url isn't in rt_store
      #   end
      #
      def set key, force=nil, &block
        already_stored = !force && db.has_key?(key)
        return if already_stored

        fetched = block.call
        return unless fetched

        super(key, fetched)
      end

    end
  end
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'tokyocabinet'
|
2
|
+
module Monkeyshines
  module Store
    #
    # Implementation of KeyStore with a Local TokyoCabinet table database (TDB)
    #
    class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore

      #
      # Opens the table database at +db_uri+ and passes any remaining
      # arguments on to the superclass initializer. Raises if the open fails.
      #
      # Only the OWRITER flag is given to #open here.
      # NOTE(review): presumably a missing DB file is therefore not created --
      # confirm against the TokyoCabinet docs.
      #
      def initialize db_uri, *args
        self.db = TokyoCabinet::TDB.new
        opened = db.open(db_uri, TokyoCabinet::TDB::OWRITER)
        raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}" unless opened
        super(*args)
      end

      # Yields each stored value converted with klass.from_hash; keys are
      # not passed to the block.
      def each_as klass, &block
        each { |_key, hsh| yield klass.from_hash(hsh) }
      end

      # Store val.to_hash.compact under +key+; a falsy val is ignored
      def set(key, val)
        db.put(key, val.to_hash.compact) if val
      end

      # Record count as reported by the database
      def size
        db.rnum
      end

    end #class
  end
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'tokyotyrant'
|
2
|
+
module Monkeyshines
  module Store

    #
    # Implementation of KeyStore on a TokyoTyrant::RDB connection, addressed
    # by host and port.
    #
    class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
      attr_accessor :db_host, :db_port

      #
      # +options[:uri]+ : the "host:port" address of the key store (required;
      # raises if blank). The full options hash is passed on to the superclass
      # initializer.
      #
      def initialize options
        raise "URI for #{self.class} is required" if options[:uri].blank?
        # db_port stays the String produced by split.
        # NOTE(review): confirm the client coerces it to a number.
        self.db_host, self.db_port = options[:uri].to_s.split(':')
        super options
      end

      #
      # The connection, opened on first use and reused afterwards.
      #
      # The handle is only remembered once #open has succeeded, so a failed
      # attempt raises and a later call tries again rather than handing back
      # a handle that was never opened.
      #
      def db
        return @db if @db
        handle = TokyoTyrant::RDB.new
        handle.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{handle.ecode}: #{handle.errmsg(handle.ecode)}")
        @db = handle
      end

      # Close the connection, if any, and forget it; the next #db call reopens.
      def close
        @db.close if @db
        @db = nil
      end

      # Save the value into the database without waiting for a response.
      # A falsy value is ignored.
      def set_nr(key, val)
        db.putnr key, val if val
      end

      # Record count as reported by the database
      def size() db.rnum end

      # True when the database holds the given key
      def include? *args
        db.has_key?(*args)
      end

      # Disabled alternative implementation on top of memcache, kept for
      # reference:
      #
      # require 'memcache'
      # def initialize db_uri=nil, *args
      #   # db_uri ||= ':1978'
      #   # self.db_host, self.db_port = db_uri.split(':')
      #   self.db = MemCache.new(db_uri, :no_reply => true)
      #   if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
      #   super *args
      # end
      #
      # def size
      #   db.stats
      # end

    end #class
  end
end
|
56
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'tokyotyrant'
|
2
|
+
require 'tyrant_rdb_key_store'
|
3
|
+
module Monkeyshines
  module Store
    #
    # Implementation of KeyStore on a TokyoTyrant::RDBTBL (table) connection.
    #
    # Everything except the connection class is inherited from
    # TyrantRdbKeyStore, including the db_host / db_port accessors read here.
    #
    class TyrantTdbKeyStore < TyrantRdbKeyStore

      #
      # The table connection, opened on first use and reused afterwards.
      #
      # The handle is only remembered once #open has succeeded, so a failed
      # attempt raises and a later call tries again rather than handing back
      # a handle that was never opened.
      #
      def db
        return @db if @db
        handle = TokyoTyrant::RDBTBL.new
        handle.open(db_host, db_port) or raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{handle.ecode}: #{handle.errmsg(handle.ecode)}")
        @db = handle
      end

    end #class
  end
end
|
20
|
+
|
@@ -0,0 +1,106 @@
|
|
1
|
+
#
|
2
|
+
# Makes a module behave as a factory.
|
3
|
+
#
|
4
|
+
# A module that extends FactoryModule gets a method #new(klass_name, *args):
|
5
|
+
# this finds the class corresponding to klass_name and creates it with *args as
|
6
|
+
# arguments.
|
7
|
+
#
|
8
|
+
# if +klass_name+ is a class, it's used directly. Otherwise, it's converted to
|
9
|
+
# a class, and can be in underscored form (mysql_doc_source) or namespace form
|
10
|
+
# (FileSources::WordDoc); the name is interpreted relative to the extending
|
11
|
+
# module's namespace. (So, in the example below, :file_doc_source,
|
12
|
+
# FileDocSource and DocSource::FileDocSource are all meant to name the same class.)
|
13
|
+
#
|
14
|
+
# Example. Given:
|
15
|
+
#
|
16
|
+
# module DocSource
|
17
|
+
# extend FactoryModule
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# # ... elsewhere ...
|
21
|
+
# module DocSource
|
22
|
+
# # load docs from file
|
23
|
+
# class FileDocSource
|
24
|
+
# def initialize filename
|
25
|
+
# #...
|
26
|
+
# end
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# # load docs from web
|
30
|
+
# class MySqlDocSource
|
31
|
+
# def initialize host, port, user, password
|
32
|
+
# # ...
|
33
|
+
# end
|
34
|
+
# end
|
35
|
+
# end
|
36
|
+
#
|
37
|
+
# Then:
|
38
|
+
# DocSource.new :file_doc_source, '/tmp/foo.doc' # => returns DocSource::FileDocSource
|
39
|
+
# DocSource.new :MySqlDocSource, 'localhost', 6666 # => returns DocSource::MySqlDocSource
|
40
|
+
#
|
41
|
+
#
|
42
|
+
module FactoryModule
  #
  # Hook run when a module does `extend FactoryModule`: gives that module the
  # class-level builders .new, .from_hash and .create.
  #
  def self.extended base
    base.class_eval do

      # Find the class named by +klass_name+ within this module and
      # instantiate it with +args+.
      def self.new klass_name, *args
        FactoryModule.get_class(self, klass_name).new(*args)
      end

      # Build an object from a hash whose :type (or 'type') entry names the
      # class; the whole hash is handed to that class's .from_hash.
      # Anything that is not a Hash is returned untouched.
      # Raises if the hash carries no type.
      def self.from_hash plan
        return plan unless plan.is_a?(Hash)
        klass_name = (plan[:type] || plan['type']) or raise "Fat, drunk, and stupid is no way to go through life, son. You need a plan: #{plan.inspect}"
        FactoryModule.get_class(self, klass_name).from_hash(plan)
      end

      # Turn a plan into an object:
      # * Hash   -- the class named by :type / 'type' is created with the hash
      # * Symbol -- the named class is created with no arguments
      # * anything else is returned as is
      def self.create plan
        case plan
        # (a check for "plan is already one of ours" was left disabled here:
        #  plan.class.ancestors.include? self)
        when Hash
          klass_name = plan[:type] || plan['type']
          FactoryModule.get_class(self, klass_name).new(plan)
        when Symbol
          FactoryModule.get_class(self, plan).new()
        else
          plan
        end
      end

    end
  end

  # Look up +klass_name+ relative to the extending module.
  def get_class klass_name
    FactoryModule.get_class self, klass_name
  end

  # Memo of resolved classes, keyed by [scope, klass_name].
  FACTORY_CLASSES = {}

  #
  # Resolve +klass_name+ to a class within +scope+.
  #
  # A Class is used directly; anything else is stringified, camelized and
  # looked up with scope.find_const (find_const comes from
  # wukong/extensions/module via extlib). Successful lookups are memoized.
  #
  # Raises a RuntimeError when the name cannot be found.
  #
  def self.get_class scope, klass_name
    cache_key = [scope, klass_name]
    cached = FACTORY_CLASSES[cache_key]
    return cached if cached

    if klass_name.is_a? Class
      klass = klass_name
    else
      begin
        klass = scope.find_const(klass_name.to_s.camelize)
      rescue NameError
        raise "Can't find #{klass_name.inspect} in #{scope}"
      end
    end
    FACTORY_CLASSES[cache_key] = klass
  end

  #
  # Resolve several class names at once.
  #
  # +klass_names+ is an array, or a comma-separated string. Each String name
  # has surrounding whitespace removed and is joined with +prefix+ and
  # +suffix+ (when given) by underscores before lookup; non-String entries
  # are looked up unchanged.
  #
  #   FactoryModule.list_of_classes(Wuclan::Twitter::Scrape, 'followers_ids,friends_ids', nil, 'request')
  #   # => [Wuclan::Twitter::Scrape::FollowersIdsRequest, Wuclan::Twitter::Scrape::FriendsIdsRequest]
  #
  def self.list_of_classes scope, klass_names, prefix=nil, suffix=nil
    klass_names = klass_names.split(',') if klass_names.is_a?(String)
    klass_names.map do |klass_name|
      if klass_name.is_a?(String)
        # tolerate 'a, b' style lists: stray spaces can never be part of a name
        klass_name = [prefix, klass_name.strip, suffix].compact.join('_')
      end
      get_class(scope, klass_name)
    end
  end

end
|