metacrunch 3.1.4 → 4.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +2 -2
- data/Gemfile +5 -11
- data/Rakefile +1 -0
- data/Readme.md +98 -90
- data/lib/metacrunch.rb +0 -5
- data/lib/metacrunch/cli.rb +22 -61
- data/lib/metacrunch/job.rb +65 -84
- data/lib/metacrunch/job/dsl.rb +10 -14
- data/lib/metacrunch/job/dsl/options.rb +80 -0
- data/lib/metacrunch/job/dsl/options/dsl.rb +21 -0
- data/lib/metacrunch/version.rb +1 -1
- data/metacrunch.gemspec +2 -6
- metadata +10 -68
- data/lib/metacrunch/db.rb +0 -8
- data/lib/metacrunch/db/reader.rb +0 -33
- data/lib/metacrunch/db/writer.rb +0 -55
- data/lib/metacrunch/fs.rb +0 -6
- data/lib/metacrunch/fs/entry.rb +0 -17
- data/lib/metacrunch/fs/reader.rb +0 -63
- data/lib/metacrunch/job/dsl/option_support.rb +0 -102
- data/lib/metacrunch/parallel_processable_reader.rb +0 -21
- data/lib/metacrunch/redis.rb +0 -8
- data/lib/metacrunch/redis/queue_reader.rb +0 -43
- data/lib/metacrunch/redis/queue_writer.rb +0 -39
- data/lib/metacrunch/redis/writer.rb +0 -33
data/lib/metacrunch/db.rb
DELETED
data/lib/metacrunch/db/reader.rb
DELETED
@@ -1,33 +0,0 @@
|
|
1
|
-
require "metacrunch/db"
|
2
|
-
|
3
|
-
module Metacrunch
|
4
|
-
class Db::Reader
|
5
|
-
|
6
|
-
def initialize(database_connection_or_url, dataset_proc, options = {})
|
7
|
-
@rows_per_fetch = options.delete(:rows_per_fetch) || 1000
|
8
|
-
|
9
|
-
@db = if database_connection_or_url.is_a?(String)
|
10
|
-
Sequel.connect(database_connection_or_url, options)
|
11
|
-
else
|
12
|
-
database_connection_or_url
|
13
|
-
end
|
14
|
-
|
15
|
-
@dataset = dataset_proc.call(@db).unlimited
|
16
|
-
|
17
|
-
unless @dataset.opts[:order]
|
18
|
-
raise ArgumentError, "Metacrunch::Db::Reader requires the dataset be ordered."
|
19
|
-
end
|
20
|
-
end
|
21
|
-
|
22
|
-
def each(&block)
|
23
|
-
return enum_for(__method__) unless block_given?
|
24
|
-
|
25
|
-
@dataset.paged_each(rows_per_fetch: @rows_per_fetch, strategy: :filter) do |row|
|
26
|
-
yield(row)
|
27
|
-
end
|
28
|
-
|
29
|
-
self
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
end
|
data/lib/metacrunch/db/writer.rb
DELETED
@@ -1,55 +0,0 @@
|
|
1
|
-
require "metacrunch/db"
|
2
|
-
|
3
|
-
module Metacrunch
|
4
|
-
class Db::Writer
|
5
|
-
|
6
|
-
def initialize(database_connection_or_url, dataset_proc, options = {})
|
7
|
-
@use_upsert = options.delete(:use_upsert) || false
|
8
|
-
@id_key = options.delete(:id_key) || :id
|
9
|
-
@isolation_level = options.delete(:isolation_level) || :repeatable
|
10
|
-
@transaction_retries = options.delete(:transaction_retries) || 5
|
11
|
-
|
12
|
-
@db = if database_connection_or_url.is_a?(String)
|
13
|
-
Sequel.connect(database_connection_or_url, options)
|
14
|
-
else
|
15
|
-
database_connection_or_url
|
16
|
-
end
|
17
|
-
|
18
|
-
@dataset = dataset_proc.call(@db)
|
19
|
-
end
|
20
|
-
|
21
|
-
def write(data)
|
22
|
-
if data.is_a?(Array)
|
23
|
-
@db.transaction(isolation: @isolation_level, num_retries: @transaction_retries) do
|
24
|
-
data.each{|d| insert_or_upsert(d) }
|
25
|
-
end
|
26
|
-
else
|
27
|
-
insert_or_upsert(data)
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def close
|
32
|
-
@db.disconnect
|
33
|
-
end
|
34
|
-
|
35
|
-
private
|
36
|
-
|
37
|
-
def insert_or_upsert(data)
|
38
|
-
@use_upsert ? upsert(data) : insert(data)
|
39
|
-
end
|
40
|
-
|
41
|
-
def insert(data)
|
42
|
-
@dataset.insert(data) if data
|
43
|
-
end
|
44
|
-
|
45
|
-
def upsert(data)
|
46
|
-
if data
|
47
|
-
rec = @dataset.where(@id_key => data[@id_key])
|
48
|
-
if 1 != rec.update(data)
|
49
|
-
insert(data)
|
50
|
-
end
|
51
|
-
end
|
52
|
-
end
|
53
|
-
|
54
|
-
end
|
55
|
-
end
|
data/lib/metacrunch/fs.rb
DELETED
data/lib/metacrunch/fs/entry.rb
DELETED
@@ -1,17 +0,0 @@
|
|
1
|
-
module Metacrunch
|
2
|
-
class Fs::Entry
|
3
|
-
|
4
|
-
attr_reader :filename, :archive_filename, :contents
|
5
|
-
|
6
|
-
def initialize(filename:, archive_filename: nil, contents: nil)
|
7
|
-
@filename = filename
|
8
|
-
@archive_filename = archive_filename.presence
|
9
|
-
@contents = contents
|
10
|
-
end
|
11
|
-
|
12
|
-
def from_archive?
|
13
|
-
@archive_filename != nil
|
14
|
-
end
|
15
|
-
|
16
|
-
end
|
17
|
-
end
|
data/lib/metacrunch/fs/reader.rb
DELETED
@@ -1,63 +0,0 @@
|
|
1
|
-
require "metacrunch/fs"
|
2
|
-
require "rubygems/package"
|
3
|
-
|
4
|
-
module Metacrunch
|
5
|
-
class Fs::Reader
|
6
|
-
include Metacrunch::ParallelProcessableReader
|
7
|
-
|
8
|
-
def initialize(filenames = nil)
|
9
|
-
@filenames = [*filenames].map{|f| f.presence}.compact
|
10
|
-
end
|
11
|
-
|
12
|
-
def each(&block)
|
13
|
-
return enum_for(__method__) unless block_given?
|
14
|
-
|
15
|
-
offset = 0 + process_index
|
16
|
-
|
17
|
-
while offset < @filenames.count do
|
18
|
-
_filename = @filenames[offset]
|
19
|
-
|
20
|
-
if is_archive?(_filename)
|
21
|
-
read_archive(_filename, &block)
|
22
|
-
else
|
23
|
-
read_regular_file(_filename, &block)
|
24
|
-
end
|
25
|
-
|
26
|
-
offset += number_of_processes
|
27
|
-
end
|
28
|
-
end
|
29
|
-
|
30
|
-
private
|
31
|
-
|
32
|
-
def is_archive?(filename)
|
33
|
-
filename.ends_with?(".tar") || filename.ends_with?(".tar.gz") || filename.ends_with?(".tgz")
|
34
|
-
end
|
35
|
-
|
36
|
-
def is_gzip_file?(filename)
|
37
|
-
filename.ends_with?(".gz") || filename.ends_with?(".tgz")
|
38
|
-
end
|
39
|
-
|
40
|
-
def read_regular_file(filename, &block)
|
41
|
-
if File.file?(filename)
|
42
|
-
io = is_gzip_file?(filename) ? Zlib::GzipReader.open(filename) : File.open(filename, "r")
|
43
|
-
yield Fs::Entry.new(filename: filename, archive_filename: nil, contents: io.read)
|
44
|
-
end
|
45
|
-
end
|
46
|
-
|
47
|
-
def read_archive(filename, &block)
|
48
|
-
io = is_gzip_file?(filename) ? Zlib::GzipReader.open(filename) : File.open(filename, "r")
|
49
|
-
tarReader = Gem::Package::TarReader.new(io)
|
50
|
-
|
51
|
-
tarReader.each do |_tar_entry|
|
52
|
-
if _tar_entry.file?
|
53
|
-
yield Fs::Entry.new(
|
54
|
-
filename: filename,
|
55
|
-
archive_filename: _tar_entry.full_name,
|
56
|
-
contents: _tar_entry.read
|
57
|
-
)
|
58
|
-
end
|
59
|
-
end
|
60
|
-
end
|
61
|
-
|
62
|
-
end
|
63
|
-
end
|
@@ -1,102 +0,0 @@
|
|
1
|
-
module Metacrunch
|
2
|
-
class Job::Dsl::OptionSupport
|
3
|
-
|
4
|
-
def register_options(args, require_args: false, &block)
|
5
|
-
options = {}
|
6
|
-
registry.instance_eval(&block)
|
7
|
-
|
8
|
-
registry.each do |key, opt_def|
|
9
|
-
# Set default value
|
10
|
-
options[key] = opt_def[:default]
|
11
|
-
|
12
|
-
# Register with OptionParser
|
13
|
-
if opt_def[:args].present?
|
14
|
-
option = parser.define(*opt_def[:args]) { |value| options[key] = value }
|
15
|
-
|
16
|
-
option.desc << "REQUIRED" if opt_def[:required]
|
17
|
-
option.desc << "DEFAULT: #{opt_def[:default]}" if opt_def[:default].present?
|
18
|
-
|
19
|
-
parser_options[key] = option
|
20
|
-
end
|
21
|
-
end
|
22
|
-
|
23
|
-
# Finally parse CLI options with OptionParser
|
24
|
-
args = parser.parse(args || [])
|
25
|
-
|
26
|
-
# Make sure required options are present
|
27
|
-
ensure_required_options!(options)
|
28
|
-
|
29
|
-
# Make sure args are present if required
|
30
|
-
ensure_required_args!(args) if require_args
|
31
|
-
|
32
|
-
options
|
33
|
-
end
|
34
|
-
|
35
|
-
private
|
36
|
-
|
37
|
-
def parser
|
38
|
-
@parser ||= OptionParser.new do |parser|
|
39
|
-
parser.banner = "Usage: metacrunch [options] JOB_FILE @@ [job-options] [ARGS]\nJob options:"
|
40
|
-
end
|
41
|
-
end
|
42
|
-
|
43
|
-
def parser_options
|
44
|
-
@parser_options ||= {}
|
45
|
-
end
|
46
|
-
|
47
|
-
def registry
|
48
|
-
@registry ||= OptionRegistry.new
|
49
|
-
end
|
50
|
-
|
51
|
-
def ensure_required_options!(options)
|
52
|
-
registry.each do |key, opt_def|
|
53
|
-
if opt_def[:required] && options[key].blank?
|
54
|
-
long_option = parser_options[key].long.try(:[], 0)
|
55
|
-
short_option = parser_options[key].short.try(:[], 0)
|
56
|
-
|
57
|
-
puts "Error: Required job option `#{long_option || short_option}` missing."
|
58
|
-
puts parser.help
|
59
|
-
|
60
|
-
exit(1)
|
61
|
-
end
|
62
|
-
end
|
63
|
-
end
|
64
|
-
|
65
|
-
def ensure_required_args!(args)
|
66
|
-
if args.blank?
|
67
|
-
puts "Error: Required ARGS are missing."
|
68
|
-
puts parser.help
|
69
|
-
|
70
|
-
exit(1)
|
71
|
-
end
|
72
|
-
end
|
73
|
-
|
74
|
-
private
|
75
|
-
|
76
|
-
class OptionRegistry
|
77
|
-
|
78
|
-
def add(name, *args, default: nil, required: false)
|
79
|
-
if default && required
|
80
|
-
raise ArgumentError, "You can't use `default` and `required` option at the same time."
|
81
|
-
end
|
82
|
-
|
83
|
-
options[name.to_sym] = {
|
84
|
-
args: args,
|
85
|
-
default: default,
|
86
|
-
required: required
|
87
|
-
}
|
88
|
-
end
|
89
|
-
|
90
|
-
def each(&block)
|
91
|
-
options.each(&block)
|
92
|
-
end
|
93
|
-
|
94
|
-
private
|
95
|
-
|
96
|
-
def options
|
97
|
-
@options ||= {}
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
end
|
102
|
-
end
|
@@ -1,21 +0,0 @@
|
|
1
|
-
module Metacrunch
|
2
|
-
module ParallelProcessableReader
|
3
|
-
|
4
|
-
def set_parallel_process_options(number_of_processes: 1, process_index: 0)
|
5
|
-
raise ArgumentError, "number_of_processes must be >= 1" if number_of_processes < 1
|
6
|
-
raise ArgumentError, "process_index must be >= 0" if process_index < 0
|
7
|
-
|
8
|
-
@number_of_processes = number_of_processes
|
9
|
-
@process_index = process_index
|
10
|
-
end
|
11
|
-
|
12
|
-
def number_of_processes
|
13
|
-
@number_of_processes || 1
|
14
|
-
end
|
15
|
-
|
16
|
-
def process_index
|
17
|
-
@process_index || 0
|
18
|
-
end
|
19
|
-
|
20
|
-
end
|
21
|
-
end
|
data/lib/metacrunch/redis.rb
DELETED
@@ -1,43 +0,0 @@
|
|
1
|
-
require "metacrunch/redis"
|
2
|
-
|
3
|
-
module Metacrunch
|
4
|
-
class Redis::QueueReader
|
5
|
-
include Metacrunch::ParallelProcessableReader
|
6
|
-
|
7
|
-
def initialize(redis_connection_or_url, queue_name, options = {})
|
8
|
-
@queue_name = queue_name
|
9
|
-
raise ArgumentError, "queue_name must be a string" unless queue_name.is_a?(String)
|
10
|
-
|
11
|
-
@blocking_mode = options.delete(:blocking) || false
|
12
|
-
@blocking_timeout = options.delete(:blocking_timeout) || 0
|
13
|
-
|
14
|
-
@redis = if redis_connection_or_url.is_a?(String)
|
15
|
-
::Redis.new(url: redis_connection_or_url)
|
16
|
-
else
|
17
|
-
redis_connection_or_url
|
18
|
-
end
|
19
|
-
end
|
20
|
-
|
21
|
-
def each(&block)
|
22
|
-
return enum_for(__method__) unless block_given?
|
23
|
-
|
24
|
-
if @blocking_mode
|
25
|
-
while true
|
26
|
-
list, result = @redis.blpop(@queue_name, timeout: @blocking_timeout)
|
27
|
-
if result.present?
|
28
|
-
yield JSON.parse(result)
|
29
|
-
else
|
30
|
-
yield nil
|
31
|
-
end
|
32
|
-
end
|
33
|
-
else
|
34
|
-
while result = @redis.lpop(@queue_name)
|
35
|
-
yield JSON.parse(result)
|
36
|
-
end
|
37
|
-
end
|
38
|
-
|
39
|
-
self
|
40
|
-
end
|
41
|
-
|
42
|
-
end
|
43
|
-
end
|
@@ -1,39 +0,0 @@
|
|
1
|
-
require "metacrunch/redis"
|
2
|
-
|
3
|
-
module Metacrunch
|
4
|
-
class Redis::QueueWriter
|
5
|
-
|
6
|
-
def initialize(redis_connection_or_url, queue_name, options = {})
|
7
|
-
@queue_name = queue_name
|
8
|
-
raise ArgumentError, "queue_name must be a string" unless queue_name.is_a?(String)
|
9
|
-
|
10
|
-
@save_on_close = options.delete(:save_on_close) || true
|
11
|
-
|
12
|
-
@redis = if redis_connection_or_url.is_a?(String)
|
13
|
-
::Redis.new(url: redis_connection_or_url)
|
14
|
-
else
|
15
|
-
redis_connection_or_url
|
16
|
-
end
|
17
|
-
end
|
18
|
-
|
19
|
-
def write(data)
|
20
|
-
@redis.rpush(@queue_name, data)
|
21
|
-
rescue RuntimeError => e
|
22
|
-
if e.message =~ /maxmemory/
|
23
|
-
puts "Redis has reached maxmemory. Waiting 10 seconds and trying again..."
|
24
|
-
sleep(10)
|
25
|
-
retry
|
26
|
-
else
|
27
|
-
raise e
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
|
-
def close
|
32
|
-
if @redis
|
33
|
-
@redis.bgsave if @save_on_close
|
34
|
-
@redis.close
|
35
|
-
end
|
36
|
-
end
|
37
|
-
|
38
|
-
end
|
39
|
-
end
|
@@ -1,33 +0,0 @@
|
|
1
|
-
require "metacrunch/redis"
|
2
|
-
|
3
|
-
module Metacrunch
|
4
|
-
class Redis::Writer
|
5
|
-
|
6
|
-
def initialize(redis_connection_or_url, options = {})
|
7
|
-
@save_on_close = options.delete(:save_on_close) || true
|
8
|
-
|
9
|
-
@key = options.delete(:key) || :key
|
10
|
-
|
11
|
-
@redis = if redis_connection_or_url.is_a?(String)
|
12
|
-
::Redis.new(url: redis_connection_or_url)
|
13
|
-
else
|
14
|
-
redis_connection_or_url
|
15
|
-
end
|
16
|
-
end
|
17
|
-
|
18
|
-
def write(data)
|
19
|
-
key = data[@key]
|
20
|
-
raise ArgumentError, "No key found in data. Tried '#{@key}' but didn't found a value." unless key
|
21
|
-
|
22
|
-
@redis.set(key.to_s, data.to_json)
|
23
|
-
end
|
24
|
-
|
25
|
-
def close
|
26
|
-
if @redis
|
27
|
-
@redis.bgsave if @save_on_close
|
28
|
-
@redis.close
|
29
|
-
end
|
30
|
-
end
|
31
|
-
|
32
|
-
end
|
33
|
-
end
|