metacrunch 3.1.4 → 4.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/lib/metacrunch/db.rb DELETED
@@ -1,8 +0,0 @@
1
- require "sequel"
2
-
3
- module Metacrunch
4
- class Db
5
- require_relative "db/reader"
6
- require_relative "db/writer"
7
- end
8
- end
@@ -1,33 +0,0 @@
1
- require "metacrunch/db"
2
-
3
- module Metacrunch
4
- class Db::Reader
5
-
6
- def initialize(database_connection_or_url, dataset_proc, options = {})
7
- @rows_per_fetch = options.delete(:rows_per_fetch) || 1000
8
-
9
- @db = if database_connection_or_url.is_a?(String)
10
- Sequel.connect(database_connection_or_url, options)
11
- else
12
- database_connection_or_url
13
- end
14
-
15
- @dataset = dataset_proc.call(@db).unlimited
16
-
17
- unless @dataset.opts[:order]
18
- raise ArgumentError, "Metacrunch::Db::Reader requires the dataset be ordered."
19
- end
20
- end
21
-
22
- def each(&block)
23
- return enum_for(__method__) unless block_given?
24
-
25
- @dataset.paged_each(rows_per_fetch: @rows_per_fetch, strategy: :filter) do |row|
26
- yield(row)
27
- end
28
-
29
- self
30
- end
31
-
32
- end
33
- end
@@ -1,55 +0,0 @@
1
- require "metacrunch/db"
2
-
3
- module Metacrunch
4
- class Db::Writer
5
-
6
- def initialize(database_connection_or_url, dataset_proc, options = {})
7
- @use_upsert = options.delete(:use_upsert) || false
8
- @id_key = options.delete(:id_key) || :id
9
- @isolation_level = options.delete(:isolation_level) || :repeatable
10
- @transaction_retries = options.delete(:transaction_retries) || 5
11
-
12
- @db = if database_connection_or_url.is_a?(String)
13
- Sequel.connect(database_connection_or_url, options)
14
- else
15
- database_connection_or_url
16
- end
17
-
18
- @dataset = dataset_proc.call(@db)
19
- end
20
-
21
- def write(data)
22
- if data.is_a?(Array)
23
- @db.transaction(isolation: @isolation_level, num_retries: @transaction_retries) do
24
- data.each{|d| insert_or_upsert(d) }
25
- end
26
- else
27
- insert_or_upsert(data)
28
- end
29
- end
30
-
31
- def close
32
- @db.disconnect
33
- end
34
-
35
- private
36
-
37
- def insert_or_upsert(data)
38
- @use_upsert ? upsert(data) : insert(data)
39
- end
40
-
41
- def insert(data)
42
- @dataset.insert(data) if data
43
- end
44
-
45
- def upsert(data)
46
- if data
47
- rec = @dataset.where(@id_key => data[@id_key])
48
- if 1 != rec.update(data)
49
- insert(data)
50
- end
51
- end
52
- end
53
-
54
- end
55
- end
data/lib/metacrunch/fs.rb DELETED
@@ -1,6 +0,0 @@
1
- module Metacrunch
2
- module Fs
3
- require_relative "fs/reader"
4
- require_relative "fs/entry"
5
- end
6
- end
@@ -1,17 +0,0 @@
1
- module Metacrunch
2
- class Fs::Entry
3
-
4
- attr_reader :filename, :archive_filename, :contents
5
-
6
- def initialize(filename:, archive_filename: nil, contents: nil)
7
- @filename = filename
8
- @archive_filename = archive_filename.presence
9
- @contents = contents
10
- end
11
-
12
- def from_archive?
13
- @archive_filename != nil
14
- end
15
-
16
- end
17
- end
@@ -1,63 +0,0 @@
1
- require "metacrunch/fs"
2
- require "rubygems/package"
3
-
4
- module Metacrunch
5
- class Fs::Reader
6
- include Metacrunch::ParallelProcessableReader
7
-
8
- def initialize(filenames = nil)
9
- @filenames = [*filenames].map{|f| f.presence}.compact
10
- end
11
-
12
- def each(&block)
13
- return enum_for(__method__) unless block_given?
14
-
15
- offset = 0 + process_index
16
-
17
- while offset < @filenames.count do
18
- _filename = @filenames[offset]
19
-
20
- if is_archive?(_filename)
21
- read_archive(_filename, &block)
22
- else
23
- read_regular_file(_filename, &block)
24
- end
25
-
26
- offset += number_of_processes
27
- end
28
- end
29
-
30
- private
31
-
32
- def is_archive?(filename)
33
- filename.ends_with?(".tar") || filename.ends_with?(".tar.gz") || filename.ends_with?(".tgz")
34
- end
35
-
36
- def is_gzip_file?(filename)
37
- filename.ends_with?(".gz") || filename.ends_with?(".tgz")
38
- end
39
-
40
- def read_regular_file(filename, &block)
41
- if File.file?(filename)
42
- io = is_gzip_file?(filename) ? Zlib::GzipReader.open(filename) : File.open(filename, "r")
43
- yield Fs::Entry.new(filename: filename, archive_filename: nil, contents: io.read)
44
- end
45
- end
46
-
47
- def read_archive(filename, &block)
48
- io = is_gzip_file?(filename) ? Zlib::GzipReader.open(filename) : File.open(filename, "r")
49
- tarReader = Gem::Package::TarReader.new(io)
50
-
51
- tarReader.each do |_tar_entry|
52
- if _tar_entry.file?
53
- yield Fs::Entry.new(
54
- filename: filename,
55
- archive_filename: _tar_entry.full_name,
56
- contents: _tar_entry.read
57
- )
58
- end
59
- end
60
- end
61
-
62
- end
63
- end
@@ -1,102 +0,0 @@
1
- module Metacrunch
2
- class Job::Dsl::OptionSupport
3
-
4
- def register_options(args, require_args: false, &block)
5
- options = {}
6
- registry.instance_eval(&block)
7
-
8
- registry.each do |key, opt_def|
9
- # Set default value
10
- options[key] = opt_def[:default]
11
-
12
- # Register with OptionParser
13
- if opt_def[:args].present?
14
- option = parser.define(*opt_def[:args]) { |value| options[key] = value }
15
-
16
- option.desc << "REQUIRED" if opt_def[:required]
17
- option.desc << "DEFAULT: #{opt_def[:default]}" if opt_def[:default].present?
18
-
19
- parser_options[key] = option
20
- end
21
- end
22
-
23
- # Finally parse CLI options with OptionParser
24
- args = parser.parse(args || [])
25
-
26
- # Make sure required options are present
27
- ensure_required_options!(options)
28
-
29
- # Make sure args are present if required
30
- ensure_required_args!(args) if require_args
31
-
32
- options
33
- end
34
-
35
- private
36
-
37
- def parser
38
- @parser ||= OptionParser.new do |parser|
39
- parser.banner = "Usage: metacrunch [options] JOB_FILE @@ [job-options] [ARGS]\nJob options:"
40
- end
41
- end
42
-
43
- def parser_options
44
- @parser_options ||= {}
45
- end
46
-
47
- def registry
48
- @registry ||= OptionRegistry.new
49
- end
50
-
51
- def ensure_required_options!(options)
52
- registry.each do |key, opt_def|
53
- if opt_def[:required] && options[key].blank?
54
- long_option = parser_options[key].long.try(:[], 0)
55
- short_option = parser_options[key].short.try(:[], 0)
56
-
57
- puts "Error: Required job option `#{long_option || short_option}` missing."
58
- puts parser.help
59
-
60
- exit(1)
61
- end
62
- end
63
- end
64
-
65
- def ensure_required_args!(args)
66
- if args.blank?
67
- puts "Error: Required ARGS are missing."
68
- puts parser.help
69
-
70
- exit(1)
71
- end
72
- end
73
-
74
- private
75
-
76
- class OptionRegistry
77
-
78
- def add(name, *args, default: nil, required: false)
79
- if default && required
80
- raise ArgumentError, "You can't use `default` and `required` option at the same time."
81
- end
82
-
83
- options[name.to_sym] = {
84
- args: args,
85
- default: default,
86
- required: required
87
- }
88
- end
89
-
90
- def each(&block)
91
- options.each(&block)
92
- end
93
-
94
- private
95
-
96
- def options
97
- @options ||= {}
98
- end
99
- end
100
-
101
- end
102
- end
@@ -1,21 +0,0 @@
1
- module Metacrunch
2
- module ParallelProcessableReader
3
-
4
- def set_parallel_process_options(number_of_processes: 1, process_index: 0)
5
- raise ArgumentError, "number_of_processes must be >= 1" if number_of_processes < 1
6
- raise ArgumentError, "process_index must be >= 0" if process_index < 0
7
-
8
- @number_of_processes = number_of_processes
9
- @process_index = process_index
10
- end
11
-
12
- def number_of_processes
13
- @number_of_processes || 1
14
- end
15
-
16
- def process_index
17
- @process_index || 0
18
- end
19
-
20
- end
21
- end
@@ -1,8 +0,0 @@
1
- require "redis"
2
-
3
- module Metacrunch
4
- class Redis
5
- require_relative "redis/queue_reader"
6
- require_relative "redis/queue_writer"
7
- end
8
- end
@@ -1,43 +0,0 @@
1
- require "metacrunch/redis"
2
-
3
- module Metacrunch
4
- class Redis::QueueReader
5
- include Metacrunch::ParallelProcessableReader
6
-
7
- def initialize(redis_connection_or_url, queue_name, options = {})
8
- @queue_name = queue_name
9
- raise ArgumentError, "queue_name must be a string" unless queue_name.is_a?(String)
10
-
11
- @blocking_mode = options.delete(:blocking) || false
12
- @blocking_timeout = options.delete(:blocking_timeout) || 0
13
-
14
- @redis = if redis_connection_or_url.is_a?(String)
15
- ::Redis.new(url: redis_connection_or_url)
16
- else
17
- redis_connection_or_url
18
- end
19
- end
20
-
21
- def each(&block)
22
- return enum_for(__method__) unless block_given?
23
-
24
- if @blocking_mode
25
- while true
26
- list, result = @redis.blpop(@queue_name, timeout: @blocking_timeout)
27
- if result.present?
28
- yield JSON.parse(result)
29
- else
30
- yield nil
31
- end
32
- end
33
- else
34
- while result = @redis.lpop(@queue_name)
35
- yield JSON.parse(result)
36
- end
37
- end
38
-
39
- self
40
- end
41
-
42
- end
43
- end
@@ -1,39 +0,0 @@
1
- require "metacrunch/redis"
2
-
3
- module Metacrunch
4
- class Redis::QueueWriter
5
-
6
- def initialize(redis_connection_or_url, queue_name, options = {})
7
- @queue_name = queue_name
8
- raise ArgumentError, "queue_name must be a string" unless queue_name.is_a?(String)
9
-
10
- @save_on_close = options.delete(:save_on_close) || true
11
-
12
- @redis = if redis_connection_or_url.is_a?(String)
13
- ::Redis.new(url: redis_connection_or_url)
14
- else
15
- redis_connection_or_url
16
- end
17
- end
18
-
19
- def write(data)
20
- @redis.rpush(@queue_name, data)
21
- rescue RuntimeError => e
22
- if e.message =~ /maxmemory/
23
- puts "Redis has reached maxmemory. Waiting 10 seconds and trying again..."
24
- sleep(10)
25
- retry
26
- else
27
- raise e
28
- end
29
- end
30
-
31
- def close
32
- if @redis
33
- @redis.bgsave if @save_on_close
34
- @redis.close
35
- end
36
- end
37
-
38
- end
39
- end
@@ -1,33 +0,0 @@
1
- require "metacrunch/redis"
2
-
3
- module Metacrunch
4
- class Redis::Writer
5
-
6
- def initialize(redis_connection_or_url, options = {})
7
- @save_on_close = options.delete(:save_on_close) || true
8
-
9
- @key = options.delete(:key) || :key
10
-
11
- @redis = if redis_connection_or_url.is_a?(String)
12
- ::Redis.new(url: redis_connection_or_url)
13
- else
14
- redis_connection_or_url
15
- end
16
- end
17
-
18
- def write(data)
19
- key = data[@key]
20
- raise ArgumentError, "No key found in data. Tried '#{@key}' but didn't found a value." unless key
21
-
22
- @redis.set(key.to_s, data.to_json)
23
- end
24
-
25
- def close
26
- if @redis
27
- @redis.bgsave if @save_on_close
28
- @redis.close
29
- end
30
- end
31
-
32
- end
33
- end