metacrunch 3.1.4 → 4.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +5 -11
- data/Rakefile +1 -0
- data/Readme.md +98 -90
- data/lib/metacrunch.rb +0 -5
- data/lib/metacrunch/cli.rb +22 -61
- data/lib/metacrunch/job.rb +65 -84
- data/lib/metacrunch/job/dsl.rb +10 -14
- data/lib/metacrunch/job/dsl/options.rb +80 -0
- data/lib/metacrunch/job/dsl/options/dsl.rb +21 -0
- data/lib/metacrunch/version.rb +1 -1
- data/metacrunch.gemspec +2 -6
- metadata +10 -68
- data/lib/metacrunch/db.rb +0 -8
- data/lib/metacrunch/db/reader.rb +0 -33
- data/lib/metacrunch/db/writer.rb +0 -55
- data/lib/metacrunch/fs.rb +0 -6
- data/lib/metacrunch/fs/entry.rb +0 -17
- data/lib/metacrunch/fs/reader.rb +0 -63
- data/lib/metacrunch/job/dsl/option_support.rb +0 -102
- data/lib/metacrunch/parallel_processable_reader.rb +0 -21
- data/lib/metacrunch/redis.rb +0 -8
- data/lib/metacrunch/redis/queue_reader.rb +0 -43
- data/lib/metacrunch/redis/queue_writer.rb +0 -39
- data/lib/metacrunch/redis/writer.rb +0 -33
data/lib/metacrunch/db.rb
DELETED
data/lib/metacrunch/db/reader.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
require "metacrunch/db"
|
2
|
-
|
3
|
-
module Metacrunch
  # Reads rows from a Sequel dataset page by page and yields them one at a time.
  class Db::Reader

    # @param database_connection_or_url [String, Object] a String is treated as a
    #   connection URL and passed to `Sequel.connect`; anything else is used as
    #   the database object as is.
    # @param dataset_proc [#call] called with the database object; its result
    #   (with any limit removed via `unlimited`) is the dataset that gets read.
    # @param options [Hash] `:rows_per_fetch` (default 1000) is consumed by this
    #   class; the remaining entries are forwarded to `Sequel.connect` when a
    #   URL was given.
    # @raise [ArgumentError] if the resulting dataset has no `:order` option.
    def initialize(database_connection_or_url, dataset_proc, options = {})
      # Work on a copy so the caller's hash keeps its :rows_per_fetch entry.
      options = options.dup
      @rows_per_fetch = options.delete(:rows_per_fetch) || 1000

      @db = if database_connection_or_url.is_a?(String)
        Sequel.connect(database_connection_or_url, options)
      else
        database_connection_or_url
      end

      @dataset = dataset_proc.call(@db).unlimited

      unless @dataset.opts[:order]
        raise ArgumentError, "Metacrunch::Db::Reader requires the dataset be ordered."
      end
    end

    # Yields every row of the dataset, fetched in pages of `rows_per_fetch`
    # rows using Sequel's `paged_each` with the `:filter` strategy.
    #
    # @yieldparam row the row as produced by the dataset
    # @return [Enumerator] when called without a block
    # @return [self] when called with a block
    def each(&block)
      return enum_for(__method__) unless block_given?

      @dataset.paged_each(rows_per_fetch: @rows_per_fetch, strategy: :filter) do |row|
        yield(row)
      end

      self
    end

  end
end
|
data/lib/metacrunch/db/writer.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
require "metacrunch/db"
|
2
|
-
|
3
|
-
module Metacrunch
  # Writes records into a Sequel dataset, either as plain inserts or as
  # "update, and insert when the update did not hit exactly one row".
  class Db::Writer

    # @param database_connection_or_url [String, Object] a String is treated as a
    #   connection URL and passed to `Sequel.connect`; anything else is used as
    #   the database object as is.
    # @param dataset_proc [#call] called with the database object; must return
    #   the dataset to write to.
    # @param options [Hash] the keys below are consumed by this class; the
    #   remaining entries are forwarded to `Sequel.connect` when a URL was given.
    # @option options [Boolean] :use_upsert (false) update-or-insert instead of insert
    # @option options [Symbol] :id_key (:id) key used to look up existing rows
    # @option options [Symbol] :isolation_level (:repeatable) isolation for array writes
    # @option options [Integer] :transaction_retries (5) retries for array writes
    def initialize(database_connection_or_url, dataset_proc, options = {})
      # Work on a copy so the caller's hash is not stripped of the keys below.
      options = options.dup
      @use_upsert = options.delete(:use_upsert) || false
      @id_key = options.delete(:id_key) || :id
      @isolation_level = options.delete(:isolation_level) || :repeatable
      @transaction_retries = options.delete(:transaction_retries) || 5

      @db = if database_connection_or_url.is_a?(String)
        Sequel.connect(database_connection_or_url, options)
      else
        database_connection_or_url
      end

      @dataset = dataset_proc.call(@db)
    end

    # Writes a single record, or an Array of records inside one transaction.
    #
    # @param data [Hash, Array<Hash>, nil] record(s) to write; nil records are skipped
    def write(data)
      if data.is_a?(Array)
        @db.transaction(isolation: @isolation_level, num_retries: @transaction_retries) do
          data.each { |record| insert_or_upsert(record) }
        end
      else
        insert_or_upsert(data)
      end
    end

    # Disconnects the underlying database object.
    def close
      @db.disconnect
    end

  private

    # Dispatches to #upsert or #insert depending on the :use_upsert option.
    def insert_or_upsert(data)
      @use_upsert ? upsert(data) : insert(data)
    end

    # Inserts the record; a nil record is ignored.
    def insert(data)
      @dataset.insert(data) if data
    end

    # Updates the row(s) whose id key matches the record and falls back to an
    # insert unless exactly one row was updated. A nil record is ignored.
    def upsert(data)
      return unless data

      matching = @dataset.where(@id_key => data[@id_key])
      insert(data) if 1 != matching.update(data)
    end

  end
end
|
data/lib/metacrunch/fs.rb
DELETED
data/lib/metacrunch/fs/entry.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
module Metacrunch
  # Describes one file produced by a file system reader: the name of the file
  # on disk, the name inside an archive (if any) and the file contents.
  class Fs::Entry

    attr_reader :filename, :archive_filename, :contents

    # @param filename [String] name of the file that was read
    # @param archive_filename [String, nil] name of the entry inside an archive;
    #   passed through `presence` (not defined in this file, presumably
    #   ActiveSupport's, which turns blank values into nil)
    # @param contents [String, nil] the data that was read
    def initialize(filename:, archive_filename: nil, contents: nil)
      @contents         = contents
      @archive_filename = archive_filename.presence
      @filename         = filename
    end

    # @return [Boolean] true when an archive entry name is set
    def from_archive?
      !@archive_filename.nil?
    end

  end
end
|
data/lib/metacrunch/fs/reader.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
require "metacrunch/fs"
|
2
|
-
require "rubygems/package"
|
3
|
-
|
4
|
-
module Metacrunch
  # Reads a list of files and yields one Fs::Entry per regular file, or one
  # per file entry for tar archives (`.tar`, `.tar.gz`, `.tgz`). Files ending
  # in `.gz`/`.tgz` are read through Zlib::GzipReader.
  class Fs::Reader
    include Metacrunch::ParallelProcessableReader

    # @param filenames [String, Array<String>, nil] file name(s) to read; blank
    #   names are dropped via `presence` (not defined in this file, presumably
    #   ActiveSupport's)
    def initialize(filenames = nil)
      @filenames = [*filenames].map { |f| f.presence }.compact
    end

    # Yields an Fs::Entry for each file. Starts at `process_index` and advances
    # by `number_of_processes`, so each process handles its own share of files.
    #
    # @yieldparam entry [Fs::Entry]
    # @return [Enumerator] when called without a block
    def each(&block)
      return enum_for(__method__) unless block_given?

      offset = process_index

      while offset < @filenames.count
        filename = @filenames[offset]

        if is_archive?(filename)
          read_archive(filename, &block)
        else
          read_regular_file(filename, &block)
        end

        offset += number_of_processes
      end
    end

  private

    # True when the name looks like a (possibly gzipped) tar archive.
    def is_archive?(filename)
      filename.end_with?(".tar", ".tar.gz", ".tgz")
    end

    # True when the name indicates gzip compression.
    def is_gzip_file?(filename)
      filename.end_with?(".gz", ".tgz")
    end

    # Opens the file (through gzip when needed), passes the IO to the block and
    # closes it afterwards, even when the block raises or is left early.
    def open_file(filename, &block)
      if is_gzip_file?(filename)
        Zlib::GzipReader.open(filename, &block)
      else
        File.open(filename, "r", &block)
      end
    end

    # Yields a single entry with the whole file contents. Names that do not
    # refer to a regular file are skipped silently.
    def read_regular_file(filename, &block)
      return unless File.file?(filename)

      contents = open_file(filename) { |io| io.read }
      yield Fs::Entry.new(filename: filename, archive_filename: nil, contents: contents)
    end

    # Yields one entry per file contained in the tar archive.
    def read_archive(filename, &block)
      open_file(filename) do |io|
        Gem::Package::TarReader.new(io).each do |tar_entry|
          next unless tar_entry.file?

          yield Fs::Entry.new(
            filename: filename,
            archive_filename: tar_entry.full_name,
            contents: tar_entry.read
          )
        end
      end
    end

  end
end
|
@@ -1,102 +0,0 @@
|
|
1
|
-
module Metacrunch
  # Lets a job declare its own command line options and parses them with
  # OptionParser.
  class Job::Dsl::OptionSupport

    # Evaluates the block against the option registry, registers every option
    # that has parser arguments with OptionParser, parses `args` and returns
    # the resulting values.
    #
    # Prints an error plus the help text and exits with status 1 when a
    # required option is blank, or when `require_args` is set and no
    # positional arguments remain after parsing.
    #
    # @param args [Array<String>, nil] command line arguments for the job
    # @param require_args [Boolean] whether positional arguments are mandatory
    # @yield evaluated in the context of the registry (call `add` in there)
    # @return [Hash{Symbol => Object}] option values keyed by option name
    def register_options(args, require_args: false, &block)
      options = {}
      # Without a block there is nothing to declare; instance_eval would raise.
      registry.instance_eval(&block) if block_given?

      registry.each do |key, opt_def|
        # Set default value
        options[key] = opt_def[:default]

        # Register with OptionParser
        if opt_def[:args].present?
          option = parser.define(*opt_def[:args]) { |value| options[key] = value }

          option.desc << "REQUIRED" if opt_def[:required]
          option.desc << "DEFAULT: #{opt_def[:default]}" if opt_def[:default].present?

          parser_options[key] = option
        end
      end

      # Finally parse CLI options with OptionParser
      args = parser.parse(args || [])

      # Make sure required options are present
      ensure_required_options!(options)

      # Make sure args are present if required
      ensure_required_args!(args) if require_args

      options
    end

  private

    def parser
      @parser ||= OptionParser.new do |parser|
        parser.banner = "Usage: metacrunch [options] JOB_FILE @@ [job-options] [ARGS]\nJob options:"
      end
    end

    # OptionParser switches keyed by option name.
    def parser_options
      @parser_options ||= {}
    end

    def registry
      @registry ||= OptionRegistry.new
    end

    # Exits with an error message when a required option has no value.
    def ensure_required_options!(options)
      registry.each do |key, opt_def|
        next unless opt_def[:required] && options[key].blank?

        # An option declared without parser arguments has no switch object, so
        # fall back to the option name instead of failing on nil.
        option = parser_options[key]
        long_option = option.try(:long).try(:[], 0)
        short_option = option.try(:short).try(:[], 0)

        puts "Error: Required job option `#{long_option || short_option || key}` missing."
        puts parser.help

        exit(1)
      end
    end

    # Exits with an error message when no positional arguments are left.
    def ensure_required_args!(args)
      if args.blank?
        puts "Error: Required ARGS are missing."
        puts parser.help

        exit(1)
      end
    end

    # Collects the option definitions declared inside the `register_options`
    # block.
    class OptionRegistry

      # @param name [#to_sym] option name, used as key of the resulting values
      # @param args [Array] arguments handed to `OptionParser#define`
      # @param default [Object, nil] value used when the option is not given
      # @param required [Boolean] whether a value must be present
      # @raise [ArgumentError] if both `default` and `required` are set
      def add(name, *args, default: nil, required: false)
        if default && required
          raise ArgumentError, "You can't use `default` and `required` option at the same time."
        end

        options[name.to_sym] = {
          args: args,
          default: default,
          required: required
        }
      end

      # Iterates over `[name, definition]` pairs.
      def each(&block)
        options.each(&block)
      end

    private

      def options
        @options ||= {}
      end
    end

  end
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
module Metacrunch
  # Mixin for readers that can split their input between several processes.
  # It only stores the two settings; how they are applied is up to the
  # including reader.
  module ParallelProcessableReader

    # Stores the parallel processing settings after validating them.
    #
    # @param number_of_processes [Integer] total number of processes, >= 1
    # @param process_index [Integer] zero-based index of this process; must be
    #   smaller than `number_of_processes`, otherwise the index would not
    #   identify one of the processes
    # @raise [ArgumentError] if one of the values is out of range; the
    #   previously stored settings are left untouched in that case
    def set_parallel_process_options(number_of_processes: 1, process_index: 0)
      raise ArgumentError, "number_of_processes must be >= 1" if number_of_processes < 1
      raise ArgumentError, "process_index must be >= 0" if process_index < 0

      if process_index >= number_of_processes
        raise ArgumentError, "process_index must be < number_of_processes"
      end

      @number_of_processes = number_of_processes
      @process_index = process_index
    end

    # @return [Integer] the configured number of processes, 1 when unset
    def number_of_processes
      @number_of_processes || 1
    end

    # @return [Integer] the configured process index, 0 when unset
    def process_index
      @process_index || 0
    end

  end
end
|
data/lib/metacrunch/redis.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
require "metacrunch/redis"
|
2
|
-
|
3
|
-
module Metacrunch
  # Reads JSON encoded items from the head of a Redis list and yields the
  # parsed values.
  class Redis::QueueReader
    include Metacrunch::ParallelProcessableReader

    # @param redis_connection_or_url [String, Object] a String is treated as a
    #   URL and passed to `::Redis.new(url: ...)`; anything else is used as the
    #   Redis client as is.
    # @param queue_name [String] name of the Redis list to read from
    # @param options [Hash] not modified by this method
    # @option options [Boolean] :blocking (false) pop with BLPOP instead of LPOP
    # @option options [Integer] :blocking_timeout (0) timeout handed to `blpop`
    # @raise [ArgumentError] if `queue_name` is not a String
    def initialize(redis_connection_or_url, queue_name, options = {})
      raise ArgumentError, "queue_name must be a string" unless queue_name.is_a?(String)

      @queue_name = queue_name
      @blocking_mode = options[:blocking] || false
      @blocking_timeout = options[:blocking_timeout] || 0

      @redis = if redis_connection_or_url.is_a?(String)
        ::Redis.new(url: redis_connection_or_url)
      else
        redis_connection_or_url
      end
    end

    # Yields each item popped from the list, parsed with `JSON.parse`.
    #
    # In blocking mode this loops forever and only ends when the block breaks
    # or an error is raised; nil is yielded whenever `blpop` delivers no value.
    # In non-blocking mode it stops as soon as `lpop` returns nothing.
    #
    # @return [Enumerator] when called without a block
    # @return [self] after the list has been drained (non-blocking mode)
    def each(&block)
      return enum_for(__method__) unless block_given?

      if @blocking_mode
        while true
          # blpop answers with a [list_name, value] pair; only the value matters.
          _list, result = @redis.blpop(@queue_name, timeout: @blocking_timeout)
          yield(result.present? ? JSON.parse(result) : nil)
        end
      else
        while (result = @redis.lpop(@queue_name))
          yield JSON.parse(result)
        end
      end

      self
    end

  end
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
require "metacrunch/redis"
|
2
|
-
|
3
|
-
module Metacrunch
  # Appends items to the tail of a Redis list.
  class Redis::QueueWriter

    # @param redis_connection_or_url [String, Object] a String is treated as a
    #   URL and passed to `::Redis.new(url: ...)`; anything else is used as the
    #   Redis client as is.
    # @param queue_name [String] name of the Redis list to write to
    # @param options [Hash] not modified by this method
    # @option options [Boolean] :save_on_close (true) issue `bgsave` in #close
    # @raise [ArgumentError] if `queue_name` is not a String
    def initialize(redis_connection_or_url, queue_name, options = {})
      raise ArgumentError, "queue_name must be a string" unless queue_name.is_a?(String)

      @queue_name = queue_name

      # `value || true` would turn an explicit false back into true, so only
      # fall back to the default when the option is absent (or nil).
      save_on_close = options[:save_on_close]
      @save_on_close = save_on_close.nil? ? true : save_on_close

      @redis = if redis_connection_or_url.is_a?(String)
        ::Redis.new(url: redis_connection_or_url)
      else
        redis_connection_or_url
      end
    end

    # Pushes `data` onto the list with `rpush`, passing it to the client as
    # given.
    #
    # When the client raises a RuntimeError whose message mentions
    # "maxmemory", the write is retried every 10 seconds without an upper
    # bound; any other RuntimeError is re-raised.
    def write(data)
      @redis.rpush(@queue_name, data)
    rescue RuntimeError => e
      raise unless e.message =~ /maxmemory/

      puts "Redis has reached maxmemory. Waiting 10 seconds and trying again..."
      sleep(10)
      retry
    end

    # Triggers a background save (unless disabled) and closes the connection.
    def close
      return unless @redis

      @redis.bgsave if @save_on_close
      @redis.close
    end

  end
end
|
@@ -1,33 +0,0 @@
|
|
1
|
-
require "metacrunch/redis"
|
2
|
-
|
3
|
-
module Metacrunch
  # Stores records in Redis as JSON strings, keyed by one of their fields.
  class Redis::Writer

    # @param redis_connection_or_url [String, Object] a String is treated as a
    #   URL and passed to `::Redis.new(url: ...)`; anything else is used as the
    #   Redis client as is.
    # @param options [Hash] not modified by this method
    # @option options [Boolean] :save_on_close (true) issue `bgsave` in #close
    # @option options [Object] :key (:key) field of the record that holds the Redis key
    def initialize(redis_connection_or_url, options = {})
      # `value || true` would turn an explicit false back into true, so only
      # fall back to the default when the option is absent (or nil).
      save_on_close = options[:save_on_close]
      @save_on_close = save_on_close.nil? ? true : save_on_close

      @key = options[:key] || :key

      @redis = if redis_connection_or_url.is_a?(String)
        ::Redis.new(url: redis_connection_or_url)
      else
        redis_connection_or_url
      end
    end

    # Stores `data.to_json` under the string form of the record's key field.
    #
    # @param data [#[], #to_json] the record to store
    # @raise [ArgumentError] if the record has no value in the key field
    def write(data)
      key = data[@key]
      raise ArgumentError, "No key found in data. Tried '#{@key}' but didn't found a value." unless key

      @redis.set(key.to_s, data.to_json)
    end

    # Triggers a background save (unless disabled) and closes the connection.
    def close
      return unless @redis

      @redis.bgsave if @save_on_close
      @redis.close
    end

  end
end
|