monkeyshines 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/.document +4 -0
- data/.gitignore +43 -0
- data/LICENSE +20 -0
- data/LICENSE.textile +20 -0
- data/README.textile +125 -0
- data/Rakefile +105 -0
- data/VERSION +1 -0
- data/examples/.gitignore +4 -0
- data/examples/bulk_urls/scrape_bulk_urls.rb +64 -0
- data/examples/rename_tree/rename_hdp_tree.rb +151 -0
- data/examples/rename_tree/rename_ripd_tree.rb +82 -0
- data/examples/rss_feeds/scrape_rss_feeds.rb +52 -0
- data/examples/shorturls/README.textile +111 -0
- data/examples/shorturls/bulkdump_shorturls.rb +46 -0
- data/examples/shorturls/bulkload_shorturls.rb +45 -0
- data/examples/shorturls/extract_urls.rb +12 -0
- data/examples/shorturls/multiplex_shorturl_cache.rb +32 -0
- data/examples/shorturls/old/multidump_and_fix_shorturls.rb +66 -0
- data/examples/shorturls/old/shorturl_stats.rb +81 -0
- data/examples/shorturls/scrape_shorturls.rb +112 -0
- data/examples/shorturls/shorturl_request.rb +29 -0
- data/examples/shorturls/shorturl_sequence.rb +121 -0
- data/examples/shorturls/shorturl_start_tyrant.sh +16 -0
- data/examples/shorturls/start_shorturl_cache.sh +2 -0
- data/lib/monkeyshines.rb +31 -0
- data/lib/monkeyshines/extensions.rb +16 -0
- data/lib/monkeyshines/fetcher.rb +10 -0
- data/lib/monkeyshines/fetcher/authed_http_fetcher.rb +35 -0
- data/lib/monkeyshines/fetcher/base.rb +44 -0
- data/lib/monkeyshines/fetcher/fake_fetcher.rb +19 -0
- data/lib/monkeyshines/fetcher/http_fetcher.rb +127 -0
- data/lib/monkeyshines/fetcher/http_head_fetcher.rb +23 -0
- data/lib/monkeyshines/monitor.rb +7 -0
- data/lib/monkeyshines/monitor/chunked_store.rb +23 -0
- data/lib/monkeyshines/monitor/periodic_logger.rb +33 -0
- data/lib/monkeyshines/monitor/periodic_monitor.rb +65 -0
- data/lib/monkeyshines/options.rb +59 -0
- data/lib/monkeyshines/recursive_runner.rb +26 -0
- data/lib/monkeyshines/repository/base.rb +57 -0
- data/lib/monkeyshines/repository/s3.rb +169 -0
- data/lib/monkeyshines/request_stream.rb +11 -0
- data/lib/monkeyshines/request_stream/base.rb +32 -0
- data/lib/monkeyshines/request_stream/edamame_queue.rb +54 -0
- data/lib/monkeyshines/request_stream/klass_request_stream.rb +39 -0
- data/lib/monkeyshines/request_stream/simple_request_stream.rb +22 -0
- data/lib/monkeyshines/runner.rb +161 -0
- data/lib/monkeyshines/runner_core/options.rb +5 -0
- data/lib/monkeyshines/runner_core/parsing_runner.rb +29 -0
- data/lib/monkeyshines/scrape_job/old_paginated.rb +343 -0
- data/lib/monkeyshines/scrape_job/recursive.rb +9 -0
- data/lib/monkeyshines/scrape_request.rb +136 -0
- data/lib/monkeyshines/scrape_request/paginated.rb +290 -0
- data/lib/monkeyshines/scrape_request/raw_json_contents.rb +16 -0
- data/lib/monkeyshines/scrape_request/signed_url.rb +86 -0
- data/lib/monkeyshines/store.rb +14 -0
- data/lib/monkeyshines/store/base.rb +29 -0
- data/lib/monkeyshines/store/chunked_flat_file_store.rb +37 -0
- data/lib/monkeyshines/store/conditional_store.rb +57 -0
- data/lib/monkeyshines/store/factory.rb +8 -0
- data/lib/monkeyshines/store/flat_file_store.rb +84 -0
- data/lib/monkeyshines/store/key_store.rb +51 -0
- data/lib/monkeyshines/store/null_store.rb +15 -0
- data/lib/monkeyshines/store/read_thru_store.rb +22 -0
- data/lib/monkeyshines/store/tokyo_tdb_key_store.rb +33 -0
- data/lib/monkeyshines/store/tyrant_rdb_key_store.rb +56 -0
- data/lib/monkeyshines/store/tyrant_tdb_key_store.rb +20 -0
- data/lib/monkeyshines/utils/factory_module.rb +106 -0
- data/lib/monkeyshines/utils/filename_pattern.rb +134 -0
- data/lib/monkeyshines/utils/logger.rb +15 -0
- data/lib/monkeyshines/utils/trollop-1.14/FAQ.txt +84 -0
- data/lib/monkeyshines/utils/trollop-1.14/History.txt +101 -0
- data/lib/monkeyshines/utils/trollop-1.14/Manifest.txt +7 -0
- data/lib/monkeyshines/utils/trollop-1.14/README.txt +40 -0
- data/lib/monkeyshines/utils/trollop-1.14/Rakefile +36 -0
- data/lib/monkeyshines/utils/trollop-1.14/lib/trollop.rb +744 -0
- data/lib/monkeyshines/utils/trollop-1.14/test/test_trollop.rb +1048 -0
- data/lib/monkeyshines/utils/trollop.rb +744 -0
- data/lib/monkeyshines/utils/union_interval.rb +52 -0
- data/lib/monkeyshines/utils/uri.rb +70 -0
- data/lib/monkeyshines/utils/uuid.rb +32 -0
- data/monkeyshines.gemspec +147 -0
- data/scrape_from_file.rb +44 -0
- data/spec/monkeyshines_spec.rb +7 -0
- data/spec/spec_helper.rb +9 -0
- metadata +183 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
module Monkeyshines
|
2
|
+
module Store
|
3
|
+
class ConditionalStore < Monkeyshines::Store::Base
  # Factory plans used for whichever of :cache / :store the caller
  # does not override.
  DEFAULT_OPTIONS = {
    :cache => { :type => :tyrant_rdb_key_store },
    :store => { :type => :chunked_flat_file_store },
  }

  attr_accessor :options, :cache, :store, :misses

  #
  # Builds the cache and the store from factory plans.
  #
  # +_options+ is deep-merged over DEFAULT_OPTIONS; its :cache and :store
  # entries are handed to Monkeyshines::Store.create.
  #
  # As used by this class, +cache+ must respond to #include?, #set_nr,
  # #size and #close, and +store+ must respond to #<< and #close.
  #
  def initialize _options
    merged_options = DEFAULT_OPTIONS.deep_merge(_options)
    self.options = merged_options
    self.cache   = Monkeyshines::Store.create(options[:cache])
    self.store   = Monkeyshines::Store.create(options[:store])
    self.misses  = 0
  end

  #
  # Runs the block only when +key+ is not yet in the cache (or when
  # +force+ is truthy). The block must return a pair
  # [cache_val, store_val]: cache_val is recorded in the cache under
  # +key+, store_val is appended to the store.
  #
  # Returns store_val when something was saved; returns nil when the key
  # was already cached or the block produced a falsy cache_val.
  #
  # Ex:
  #   cond_store.set(url) do
  #     fetcher.get url # only called if url isn't in the cache
  #   end
  #
  def set key, force=nil, &block
    return unless force || !cache.include?(key)
    cache_val, store_val = block.call
    if cache_val
      cache.set_nr(key, cache_val)  # remember that we have seen this key
      store << store_val            # persist the payload
      self.misses += 1              # every save is a cache miss
      store_val
    end
  end

  # Number of entries, as reported by the cache.
  def size
    cache.size
  end

  # Pieces for a progress log line: cache size and the miss counter.
  def log_line
    [size, format("%8d misses", misses)]
  end

  # Closes the cache first, then the store.
  def close
    cache.close
    store.close
  end
end
|
56
|
+
end
|
57
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require 'fileutils'; include FileUtils
|
2
|
+
|
3
|
+
module Monkeyshines
|
4
|
+
module Store
|
5
|
+
#
|
6
|
+
class FlatFileStore < Store::Base
  attr_accessor :filename, :filemode

  #
  # Options:
  # +:filename+ : path of the file to use (required)
  # +:filemode+ : mode string passed to File.open; defaults to 'r'
  # +:skip+     : if given, that many lines are read and discarded up front
  #
  def initialize options={}
    Log.debug "New #{self.class} as #{options.inspect}"
    given_name = options[:filename]
    self.filename = given_name
    raise "Missing filename in #{self.class}" unless given_name
    self.filemode = options[:filemode] || 'r'
    lines_to_skip = options[:skip]
    skip!(lines_to_skip) if lines_to_skip
  end

  #
  # Yields the tab-separated fields of each line in the file.
  # Lines starting with '#' and lines with no fields are passed over.
  #
  def each &block
    file.each do |line|
      next if line.start_with?('#')
      fields = line.chomp.split("\t")
      yield(*fields) unless fields.blank?
    end
  end

  #
  # Read ahead n_lines lines in the file, discarding them.
  #
  def skip! n_lines
    Log.info "Skipping #{n_lines} in #{self.class}:#{filename}"
    n_lines.times { file.readline }
  end

  #
  # The open file handle; opened with +filemode+ on first use and
  # reused afterwards. (Does not create the directory -- see #mkdir!)
  #
  def file
    @file ||= begin
      Log.info "Opening file #{filename} with mode #{filemode}"
      File.open(filename, filemode)
    end
  end

  # Close the file if it is open and forget the handle.
  def close
    @file && @file.close
    @file = nil
  end

  # Ensure the file's directory exists, creating it if necessary.
  def mkdir!
    parent = File.dirname(filename)
    unless File.directory?(parent)
      Log.info "Making directory #{parent}"
      FileUtils.mkdir_p parent
    end
  end

  # Write obj.to_flat as one tab-separated line; returns obj.
  def save obj
    row = obj.to_flat.join("\t")
    file << (row + "\n")
    obj
  end

  # Calls the block, which returns a [token, object] pair, and saves
  # the object. +key+ and any extra args are ignored.
  def set key, *args, &block
    _tok, obj = block.call
    save(obj)
  end

  # delegates to +#save+ -- writes the object to the file
  def <<(obj)
    save(obj)
  end

end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
@@ -0,0 +1,51 @@
|
|
1
|
+
module Monkeyshines
|
2
|
+
module Store
|
3
|
+
class KeyStore < Monkeyshines::Store::Base
  # The backing database. This class calls #[], #[]=, #iterinit,
  # #iternext, #close and #size on it.
  attr_accessor :db

  #
  # Walks every record in the DB, in whatever order the DB hands them
  # out, yielding |key, val| for each.
  #
  #   key_store.each do |key, val|
  #     # ... stuff ...
  #   end
  #
  def each &block
    db.iterinit
    loop do
      key = db.iternext
      break unless key
      yield key, db[key]
    end
  end

  # Save the value into the database; a falsy value is not saved.
  def set(key, val)
    db[key] = val if val
  end
  alias_method :save, :set

  # Fetch the value stored under +key+.
  def get(key)
    db[key]
  end

  # Same lookup as #get.
  def [](key)
    db[key]
  end

  # Close the backing database.
  def close()
    db.close
  end

  # Size as reported by the backing database.
  def size()
    db.size
  end

  #
  # Build a store from standard command-line options: +cmdline_opts+ is
  # merged over +default_opts+ and the resulting :store_db entry is
  # passed to .new.
  #
  # obvs only works when there's just one store
  #
  def self.new_from_command_line cmdline_opts, default_opts={}
    merged = default_opts.merge(cmdline_opts)
    new(merged[:store_db])
  end
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
module Monkeyshines
|
2
|
+
module Store
|
3
|
+
class ReadThruStore < Monkeyshines::Store::TyrantTdbKeyStore

  #
  # Stores the block's result under +key+ -- but only calls the block
  # when the key is missing from the DB or +force+ is truthy.
  # A falsy block result is not stored.
  #
  # Ex:
  #   rt_store.set(url) do
  #     fetcher.get url # will only be called if url isn't in rt_store
  #   end
  #
  def set key, force=nil, &block
    return unless force || !db.has_key?(key)
    result = block.call
    return unless result
    super(key, result)
  end

end
|
21
|
+
end
|
22
|
+
end
|
@@ -0,0 +1,33 @@
|
|
1
|
+
require 'tokyocabinet'
|
2
|
+
module Monkeyshines
|
3
|
+
module Store
|
4
|
+
#
|
5
|
+
# Implementation of KeyStore with a Local TokyoCabinet table database (TDB)
|
6
|
+
#
|
7
|
+
class TokyoTdbKeyStore < Monkeyshines::Store::KeyStore

  # +db_uri+ is handed to TokyoCabinet::TDB#open together with the
  # OWRITER flag; any remaining arguments go to the superclass
  # constructor. Raises if the database cannot be opened.
  def initialize db_uri, *args
    self.db = TokyoCabinet::TDB.new
    opened = db.open(db_uri, TokyoCabinet::TDB::OWRITER)
    raise "#{self.class.to_s}: Can't open TokyoCabinet TDB #{db_uri}" unless opened
    super(*args)
  end

  # Iterates the DB, yielding klass.from_hash(record) for each record.
  def each_as klass, &block
    each { |_key, hsh| yield klass.from_hash(hsh) }
  end

  # Stores val.to_hash.compact under +key+; a falsy +val+ is not stored.
  def set(key, val)
    db.put(key, val.to_hash.compact) if val
  end

  # Record count as reported by the DB's #rnum.
  def size()
    db.rnum
  end

end #class
|
32
|
+
end
|
33
|
+
end
|
@@ -0,0 +1,56 @@
|
|
1
|
+
require 'tokyotyrant'
|
2
|
+
module Monkeyshines
|
3
|
+
module Store
|
4
|
+
|
5
|
+
#
|
6
|
+
# Implementation of KeyStore backed by a Tokyo Tyrant server (RDB), reached at host:port
|
7
|
+
#
|
8
|
+
class TyrantRdbKeyStore < Monkeyshines::Store::KeyStore
  attr_accessor :db_host, :db_port

  # +options[:uri]+ is the "host:port" address of the key store; it is
  # required. The whole options hash is passed on to the superclass.
  #
  # NOTE(review): db_port stays a String after the split -- confirm the
  # TokyoTyrant binding in use accepts that.
  def initialize options
    raise "URI for #{self.class} is required" if options[:uri].blank?
    self.db_host, self.db_port = options[:uri].to_s.split(':')
    super options
  end

  # The connection, opened on first use.
  #
  # The handle is only memoized once #open has succeeded, so a failed
  # attempt raises every time rather than leaving an unopened handle
  # cached for later callers.
  def db
    return @db if @db
    handle = TokyoTyrant::RDB.new
    unless handle.open(db_host, db_port)
      raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{handle.ecode}: #{handle.errmsg(handle.ecode)}")
    end
    @db = handle
  end

  # Close the connection if one is open and forget it; a later call to
  # #db opens a new one.
  def close
    @db.close if @db
    @db = nil
  end

  # Save the value into the database without waiting for a response.
  # A falsy +val+ is not sent.
  def set_nr(key, val)
    db.putnr key, val if val
  end

  # Record count as reported by the DB's #rnum.
  def size() db.rnum end

  # True if the DB has the given key (delegates to has_key?).
  def include? *args
    db.has_key?(*args)
  end

  # require 'memcache'
  # def initialize db_uri=nil, *args
  #   # db_uri ||= ':1978'
  #   # self.db_host, self.db_port = db_uri.split(':')
  #   self.db = MemCache.new(db_uri, :no_reply => true)
  #   if !self.db then raise("Can't open DB #{db_uri}. Pass in host:port, default is ':1978' #{db.ecode}: #{db.errmsg(db.ecode)}") end
  #   super *args
  # end
  #
  # def size
  #   db.stats
  # end

end #class
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'tokyotyrant'
|
2
|
+
require 'tyrant_rdb_key_store'
|
3
|
+
module Monkeyshines
|
4
|
+
module Store
|
5
|
+
#
|
6
|
+
# Implementation of KeyStore backed by a Tokyo Tyrant server's table database (RDBTBL)
|
7
|
+
#
|
8
|
+
class TyrantTdbKeyStore < TyrantRdbKeyStore

  # The table-database connection, opened on first use with the
  # inherited db_host / db_port.
  #
  # The handle is only memoized once #open has succeeded, so a failed
  # attempt raises every time rather than leaving an unopened handle
  # cached for later callers.
  def db
    return @db if @db
    handle = TokyoTyrant::RDBTBL.new
    unless handle.open(db_host, db_port)
      raise("Can't open DB #{db_host}:#{db_port}. Pass in host:port' #{handle.ecode}: #{handle.errmsg(handle.ecode)}")
    end
    @db = handle
  end

end #class
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
@@ -0,0 +1,106 @@
|
|
1
|
+
#
|
2
|
+
# Makes a module behave as a factory.
|
3
|
+
#
|
4
|
+
# A module that extends FactoryModule gets a method #new(klass_name, *args):
|
5
|
+
# this finds the class corresponding to klass_name and creates it with *args as
|
6
|
+
# arguments.
|
7
|
+
#
|
8
|
+
# if +klass_name+ is a class, it's used directly. Otherwise, it's converted to
|
9
|
+
# a class, and can be in underscored form (mysql_doc_source) or namespace form
|
10
|
+
# (FileSources::WordDoc); the name is interpreted relative to the extending
|
11
|
+
# module's namespace. (So, in the example below, :file_doc_source,
|
12
|
+
# FileDocSource and DocSource::FileDocSource are all meant to name the same class.)
|
13
|
+
#
|
14
|
+
# Example. Given:
|
15
|
+
#
|
16
|
+
# module DocSource
|
17
|
+
# extend FactoryModule
|
18
|
+
# end
|
19
|
+
#
|
20
|
+
# # ... elsewhere ...
|
21
|
+
# module DocSource
|
22
|
+
# # load docs from file
|
23
|
+
# class FileDocSource
|
24
|
+
# def initialize filename
|
25
|
+
# #...
|
26
|
+
# end
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# # load docs from web
|
30
|
+
# class MySqlDocSource
|
31
|
+
# def initialize host, port, user, password
|
32
|
+
# # ...
|
33
|
+
# end
|
34
|
+
# end
|
35
|
+
# end
|
36
|
+
#
|
37
|
+
# Then:
|
38
|
+
# DocSource.new :file_doc_source, '/tmp/foo.doc' # => returns DocSource::FileDocSource
|
39
|
+
# DocSource.new :MySqlDocSource, 'localhost', 6666 # => returns DocSource::MySqlDocSource
|
40
|
+
#
|
41
|
+
#
|
42
|
+
module FactoryModule
  # Hook run when a module does `extend FactoryModule`: gives that
  # module the factory entry points .new, .from_hash and .create.
  def self.extended base
    base.class_eval do

      # Look up +klass_name+ within this module and instantiate it
      # with +args+.
      def self.new klass_name, *args
        FactoryModule.get_class(self, klass_name).new(*args)
      end

      # Build an object from a plan hash by delegating to the
      # .from_hash of the class named in plan[:type] / plan['type'].
      # Anything that is not a Hash is returned untouched.
      # Raises if the plan has no type.
      def self.from_hash plan
        return plan unless plan.is_a?(Hash)
        klass_name = (plan[:type] || plan['type']) or raise "Fat, drunk, and stupid is no way to go through life, son. You need a plan: #{plan.inspect}"
        FactoryModule.get_class(self, klass_name).from_hash(plan)
      end

      # Turn a plan into an object:
      # * Hash   -- instantiate the class named by :type / 'type',
      #             passing the whole hash to its constructor
      # * Symbol -- instantiate the named class with no arguments
      # * anything else is assumed to be built already and is returned
      # Raises if a Hash plan has no type.
      def self.create plan
        case plan
        when Hash
          klass_name = plan[:type] || plan['type']
          # Fail here with a clear message instead of attempting a
          # class lookup for nil.
          raise "Missing :type in factory plan: #{plan.inspect}" unless klass_name
          FactoryModule.get_class(self, klass_name).new(plan)
        when Symbol
          FactoryModule.get_class(self, plan).new()
        else
          plan
        end
      end

    end
  end

  # Resolve +klass_name+ relative to the extending module.
  def get_class klass_name
    FactoryModule.get_class self, klass_name
  end

  # Cache of resolved classes, keyed by [scope, klass_name].
  FACTORY_CLASSES = {}

  # Resolve +klass_name+ within +scope+. A Class is used as-is; anything
  # else is stringified, camelized and looked up with scope.find_const
  # (find_const from wukong/extensions/module via extlib).
  # Results are cached. Raises if the name cannot be resolved.
  def self.get_class scope, klass_name
    cache_key = [scope, klass_name]
    cached = FACTORY_CLASSES[cache_key]
    return cached if cached
    if klass_name.is_a? Class
      klass = klass_name
    else
      begin
        klass = scope.find_const(klass_name.to_s.camelize)
      rescue NameError
        raise "Can't find #{klass_name.inspect} in #{scope}"
      end
    end
    FACTORY_CLASSES[cache_key] = klass
  end

  #
  # Resolve several names at once. +klass_names+ is an array, or a
  # comma-separated string (whitespace around the commas is ignored).
  # String names are joined with +prefix+ / +suffix+ using '_' first.
  #
  #   FactoryModule.list_of_classes(Wuclan::Twitter::Scrape, 'followers_ids,friends_ids', 'request')
  #   # => [Wuclan::Twitter::Scrape::FollowersIdsRequest, Wuclan::Twitter::Scrape::FriendsIdsRequest]
  #
  def self.list_of_classes scope, klass_names, prefix=nil, suffix=nil
    if klass_names.is_a?(String)
      klass_names = klass_names.split(',').map { |name| name.strip }
    end
    klass_names.map do |klass_name|
      klass_name = [prefix, klass_name, suffix].compact.join('_') if klass_name.is_a?(String)
      get_class(scope, klass_name)
    end
  end

end
|