s3_data_packer 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,99 @@
+ module S3DataPacker
+   class Configuration
+     # Standard logger to output information
+     attr_accessor :logger
+
+     # How many threads to run for reading and processing items. This needs to be
+     # balanced against the speed at which item keys are gathered, to avoid
+     # emptying the queue too early.
+     attr_accessor :thread_count
+
+     # Time in seconds a thread sleeps when there are no pending items in the queue.
+     attr_accessor :thread_sleep_time
+
+     # Time in seconds a thread waits when it finds the lock taken.
+     attr_accessor :thread_lock_wait_time
+
+     # Maximum number of items to keep in the queue, so it does not overflow while
+     # workers process items.
+     attr_accessor :max_queue_size
+
+     # Time in seconds to wait, once the queue has reached max_queue_size, before
+     # adding new items.
+     attr_accessor :max_queue_wait
+
+     # Directory for working files. Make sure you have write permissions on the
+     # path. If the path does not exist, the packer will try to create it before
+     # using it.
+     attr_accessor :workdir
+
+     # Whether to keep or delete the finalized batch file. Set to false if you want
+     # to keep the output files in the workdir.
+     attr_accessor :cleanup_batch
+
+     # Whether to compress the final batch file. If set to true, the output file
+     # is compressed with gzip and the uncompressed file is removed.
+     attr_accessor :compress_batch
+
+     # Number of items in each final batch file
+     attr_accessor :batch_size
+
+     attr_accessor :s3_api_key
+     attr_accessor :s3_api_secret
+     attr_accessor :s3_region
+
+     # String prefix to include in output filenames for the batches
+     attr_accessor :output_filename_prefix
+
+     # String suffix to include in output filenames for the batches
+     attr_accessor :output_filename_suffix
+
+     # Pattern of keys used to construct output filenames
+     attr_accessor :output_filename_pattern
+
+     # Separator character used to join the values that make up a filename
+     attr_accessor :output_filename_splitter
+
+     def initialize
+       @thread_count = 2
+       @thread_sleep_time = 1
+       @thread_lock_wait_time = 1
+       @max_queue_size = 10000
+       @max_queue_wait = 5
+       @batch_size = 100000
+       @workdir = 'tmp/s3_data_packer'
+       @cleanup_batch = true
+       @compress_batch = true
+       @output_filename_prefix = nil
+       @output_filename_suffix = 'batch'
+       @output_filename_pattern = %i[timecode_int suffix]
+       @output_filename_splitter = '_'
+     end
+
+     def compress_batch?
+       compress_batch == true
+     end
+
+     def cleanup_batch?
+       cleanup_batch == true
+     end
+
+     def s3_credentials?
+       s3_api_key && s3_api_secret
+     end
+
+     def default_s3_credentials
+       return nil unless s3_credentials?
+
+       Aws::Credentials.new(s3_api_key, s3_api_secret)
+     end
+
+     def filename_generator_defaults
+       { prefix: output_filename_prefix,
+         suffix: output_filename_suffix,
+         pattern: output_filename_pattern,
+         splitter: output_filename_splitter }
+     end
+   end
+ end
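
For reference, a minimal configuration sketch using the accessors above via S3DataPacker.configure (defined in the gem's entry file further down); the values are illustrative, not recommendations:

    S3DataPacker.configure do |config|
      config.logger        = Logger.new($stdout)
      config.thread_count  = 4
      config.batch_size    = 50_000
      config.s3_api_key    = ENV['AWS_ACCESS_KEY_ID']
      config.s3_api_secret = ENV['AWS_SECRET_ACCESS_KEY']
      config.s3_region     = ENV['AWS_REGION']
    end
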
@@ -0,0 +1,45 @@
+ module S3DataPacker
+   class FilenameGenerator
+     attr_reader :pattern, :number, :splitter
+
+     def initialize(opts = {})
+       @number = opts[:start_at] || 1
+       @prefix = opts[:prefix] || default_options[:prefix]
+       @suffix = opts[:suffix] || default_options[:suffix]
+       @pattern = opts[:pattern] || default_options[:pattern]
+       @splitter = opts[:splitter] || default_options[:splitter]
+       validate_pattern!
+     end
+
+     def timecode_int; Time.now.to_i; end
+     def timecode_dec; Time.now.to_f; end
+     def timestamp; Time.now.strftime('%Y%m%d%H%M%S'); end
+     def datestamp; Time.now.strftime('%Y%m%d'); end
+     def prefix; @prefix; end
+     def suffix; @suffix; end
+
+     def generate!
+       name = pattern.map { |key| send(key) }
+       name.delete_if { |value| value.nil? || value == '' }
+       name = name.map(&:to_s).join(splitter)
+       @number += 1
+       name
+     end
+
+     private
+
+     def default_options
+       @default_options ||= S3DataPacker.config.filename_generator_defaults
+     end
+
+     def validate_pattern!
+       valid = %i[timecode_int timecode_dec number timestamp datestamp prefix suffix]
+       pattern.each do |item|
+         raise ArgumentError, "Invalid pattern key, has to be a symbol" unless item.is_a?(Symbol)
+         raise ArgumentError, "Invalid pattern key #{item}. Allowed: #{valid.join(', ')}" unless valid.include?(item)
+       end
+     end
+   end
+ end
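
A quick sketch of the generator in use: with an explicit prefix and a pattern including the incrementing :number, each call produces a distinct name (the suffix 'batch' and splitter '_' come from the configuration defaults shown above):

    generator = S3DataPacker::FilenameGenerator.new(prefix: 'users', pattern: %i[prefix number suffix])
    generator.generate!  # => "users_1_batch"
    generator.generate!  # => "users_2_batch"
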
@@ -0,0 +1,93 @@
+ module S3DataPacker
+   class JSONBatch
+     attr_reader :delimitter, :batch, :item_count
+
+     def initialize(opts = {})
+       @delimitter = "\r\n"
+       @workdir = opts[:workdir]
+       @filename_generator = opts[:filename_generator]
+       @pre_processor = opts[:pre_processor] # Should be a proc
+       @size = opts[:size]
+       @item_count = 0
+       init_workdir!
+     end
+
+     def size
+       @size ||= S3DataPacker.config.batch_size
+     end
+
+     def workdir
+       @workdir ||= S3DataPacker.config.workdir
+     end
+
+     def filename_generator
+       @filename_generator ||= S3DataPacker::FilenameGenerator.new
+     end
+
+     def full?
+       item_count >= size
+     end
+
+     def generate_filename
+       name = filename_generator.generate!
+       "#{workdir}/#{name}.json"
+     end
+
+     def new_file!
+       close! if @batch
+       @batch = File.open(generate_filename, 'w')
+     end
+
+     def append_data!(data)
+       digested = pre_process_data(data)
+       batch << "#{digested}#{delimitter}"
+       @item_count += 1
+     end
+
+     def path
+       batch.path
+     end
+
+     def close!
+       batch.close
+     end
+
+     def delete!
+       return reset! unless @batch
+       close! unless @batch.closed?
+       File.delete(path) if File.exist?(path)
+       reset!
+     end
+
+     def finalize!
+       close! unless batch.closed?
+       final_path = batch.path
+       final_path = compress! if S3DataPacker.config.compress_batch?
+       reset!
+       final_path
+     end
+
+     private
+
+     def pre_process_data(raw_data)
+       # Transformations here, return string for this one
+       return @pre_processor.call(raw_data) if @pre_processor
+       raw_data
+     end
+
+     def init_workdir!
+       Dir.mkdir(workdir) unless Dir.exist?(workdir)
+     end
+
+     def compress!
+       new_path = "#{batch.path}.gz"
+       `gzip #{batch.path}`
+       new_path
+     end
+
+     def reset!
+       @batch = nil
+       @item_count = 0
+     end
+   end
+ end
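
A sketch of driving a batch by hand; the pre_processor proc must return a string (here, one JSON line per item), and the record shape is hypothetical:

    batch = S3DataPacker::JSONBatch.new(pre_processor: ->(record) { record.to_json })
    batch.new_file!                              # opens workdir/<generated-name>.json
    batch.append_data!(id: 1, name: 'example')   # writes one delimited line
    batch.full?                                  # true once item_count reaches size
    final_path = batch.finalize!                 # closes, gzips if compress_batch?
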
@@ -0,0 +1,105 @@
+ module S3DataPacker
+   class Packer
+     module Error
+       class DeadWorkers < StandardError; end
+     end
+
+     attr_reader :source, :target, :output
+
+     def initialize(opts = {})
+       @source = opts[:source]
+       @target = opts[:target]
+       @output = opts[:output] || S3DataPacker::JSONBatch.new
+     end
+
+     def summary
+       @summary ||= S3DataPacker::Summary.new
+     end
+
+     def logger
+       @logger ||= S3DataPacker.logger
+     end
+
+     def workers
+       @workers ||= S3DataPacker::ThreadSet.new
+     end
+
+     def pack!
+       log "Packing data from #{source.name} to #{target.name} ..."
+       boot_workers!
+
+       @start_time = Time.now
+       begin
+         each_item { |item| workers.queue.add!(item) }
+         finalize_processing!
+       rescue Exception => e
+         log "Unexpected error, killing threads", :error
+         raise e
+       ensure
+         workers.kill!
+       end
+     end
+
+     def process_item(data)
+       output.append_data! data
+       summary.count_processed
+       if output.full?
+         flush_batch!
+         output.new_file!
+       end
+     end
+
+     private
+
+     def finalize_processing!
+       log "No more items found to enqueue, signaling workers to finish"
+       workers.finish!
+       workers.wait!
+       workers.kill!
+       log "Pushing last open batch #{output.path}"
+       flush_batch!
+       summary.set_time(@start_time, Time.now)
+       log "Finished\n#{summary.flush!}"
+     end
+
+     def each_item(&block)
+       source.each do |item|
+         if workers.dead?
+           log "Workers died", :error
+           raise Error::DeadWorkers, 'Workers died'
+         end
+         summary.count_item
+         yield item
+       end
+     end
+
+     def flush_batch!
+       summary.count_batch
+       final_filename = output.finalize!
+       send_file!(final_filename)
+     end
+
+     def send_file!(file)
+       target.save_file file
+     end
+
+     def boot_workers!
+       output.new_file!
+       workers.spawn_threads! do |item|
+         data = source.fetch(item)
+         workers.lock.synchronize { process_item(data) }
+         post_process_item(item)
+       end
+     end
+
+     def post_process_item(item)
+       # Do something with the key after processed
+       nil
+     end
+
+     def log(message, level = :info)
+       logger.send level, "Main process: #{message}"
+     end
+   end
+ end
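
Packer is duck-typed: any source responding to name/each/fetch and any target responding to name/save_file will do, as the adapter classes later in this diff show. A minimal in-memory sketch (class names are illustrative; assumes require 'json' and 'fileutils', and an existing output/ directory):

    class ArraySource
      def initialize(items)
        @items = items
      end

      def name
        'in-memory-source'
      end

      def each(&block)
        @items.each(&block)
      end

      def fetch(item)
        { id: item }.to_json   # Packer appends whatever fetch returns to the batch
      end
    end

    class DirTarget
      def name
        'local-dir'
      end

      def save_file(filepath)
        FileUtils.mv(filepath, 'output/')
      end
    end

    S3DataPacker::Packer.new(source: ArraySource.new((1..100).to_a), target: DirTarget.new).pack!
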
@@ -0,0 +1,46 @@
+ module S3DataPacker
+   class Queue
+     attr_reader :total_items
+
+     def initialize(opts = {})
+       @max_items = opts[:max_items]
+       @wait_time = opts[:wait_time]
+       @total_items = 0
+     end
+
+     def max_items
+       @max_items ||= S3DataPacker.config.max_queue_size
+     end
+
+     def wait_time
+       @wait_time ||= S3DataPacker.config.max_queue_wait
+     end
+
+     def items
+       @items ||= []
+     end
+
+     def add!(item)
+       items << item
+       @total_items += 1
+       if size >= max_items
+         S3DataPacker.logger.info "Queue full, pausing"
+         sleep(wait_time)
+         S3DataPacker.logger.info "Resuming queue"
+       end
+     end
+
+     def fetch!
+       items.shift
+     end
+
+     def size
+       items.size
+     end
+
+     def reset!
+       @items = []
+       @total_items = 0
+     end
+   end
+ end
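
add! applies simple backpressure: once size reaches max_items, the producer sleeps for wait_time before continuing. The queue holds no lock of its own; batch writes are serialized by ThreadSet's mutex instead. A sketch:

    queue = S3DataPacker::Queue.new(max_items: 2, wait_time: 1)
    queue.add!('key-1')
    queue.add!('key-2')   # size hits max_items: logs "Queue full, pausing", sleeps 1s
    queue.fetch!          # => "key-1" (FIFO via Array#shift)
    queue.total_items     # => 2 (lifetime count, unaffected by fetch!)
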
@@ -0,0 +1,28 @@
+ module S3DataPacker
+   module Sources
+     class Object
+       def initialize(object:, each_method: :each, fetch_method: :fetch, name_method: :name)
+         @object = object
+         @each_method = each_method
+         @fetch_method = fetch_method
+         @name_method = name_method
+       end
+
+       def name
+         @object.send(@name_method)
+       end
+
+       def each(&block)
+         @object.send(@each_method) do |item|
+           yield item
+         end
+       end
+
+       def fetch(item)
+         @object.send(@fetch_method, item)
+       end
+     end
+   end
+ end
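
This adapter lets any existing object act as a source by remapping method names. For instance, wrapping a hypothetical Catalog whose API uses different verbs:

    catalog = Catalog.new  # hypothetical object responding to #each_sku, #lookup, #label
    source = S3DataPacker::Sources::Object.new(
      object: catalog,
      each_method: :each_sku,  # enumerates item keys
      fetch_method: :lookup,   # resolves one key to its data
      name_method: :label      # display name used in Packer logs
    )
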
@@ -0,0 +1,21 @@
+ module S3DataPacker
+   module Sources
+     class S3Bucket < S3DataPacker::Bucket
+       def name
+         "s3://#{bucket_name}/#{path}"
+       end
+
+       def each(&block)
+         each_key do |key|
+           yield key
+         end
+       end
+
+       def fetch(key)
+         download(key)
+       end
+     end
+   end
+ end
@@ -0,0 +1,59 @@
+ module S3DataPacker
+   class Summary
+     def stats
+       @stats ||= {}
+     end
+
+     def count_item
+       stats[:total_items] ||= 0
+       stats[:total_items] += 1
+     end
+
+     def count_processed
+       stats[:processed] ||= 0
+       stats[:processed] += 1
+     end
+
+     def count_batch
+       stats[:batches] ||= 0
+       stats[:batches] += 1
+     end
+
+     def set_time(start_time, end_time)
+       stats[:elapsed] = "#{(end_time.to_i - start_time.to_i)} seconds"
+     end
+
+     def total_items
+       stats[:total_items] || 0
+     end
+
+     def processed
+       stats[:processed] || 0
+     end
+
+     def batches
+       stats[:batches] || 0
+     end
+
+     def elapsed
+       stats[:elapsed]
+     end
+
+     def flush!
+       output = [
+         "Summary:",
+         "Total Items: #{stats[:total_items]}",
+         "Processed Items: #{stats[:processed]}",
+         "Batches: #{stats[:batches]}",
+         "Elapsed: #{stats[:elapsed]}"
+       ].join("\n")
+       reset!
+       output
+     end
+
+     def reset!
+       @stats = {}
+     end
+   end
+ end
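
Summary is a plain counter bag; flush! renders the report and resets the stats:

    summary = S3DataPacker::Summary.new
    3.times { summary.count_item }
    2.times { summary.count_processed }
    summary.count_batch
    puts summary.flush!
    # Summary:
    # Total Items: 3
    # Processed Items: 2
    # Batches: 1
    # Elapsed:          (blank unless set_time was called)
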
@@ -0,0 +1,21 @@
+ module S3DataPacker
+   module Targets
+     class Object
+       def initialize(object:, name_method: :name, save_file_method: :save_file)
+         @object = object
+         @name_method = name_method
+         @save_file_method = save_file_method
+       end
+
+       def name
+         @object.send(@name_method)
+       end
+
+       def save_file(filepath)
+         @object.send(@save_file_method, filepath)
+       end
+     end
+   end
+ end
@@ -0,0 +1,16 @@
+ module S3DataPacker
+   module Targets
+     class S3Bucket < S3DataPacker::Bucket
+       def name
+         "s3://#{bucket_name}/#{path}"
+       end
+
+       def save_file(filepath)
+         upload(filepath)
+         File.delete(filepath) if S3DataPacker.config.cleanup_batch?
+       end
+     end
+   end
+ end
@@ -0,0 +1,98 @@
+ module S3DataPacker
+   class ThreadSet
+     attr_reader :lock, :workers, :queue
+
+     def initialize(opts = {})
+       @lock = Mutex.new
+       @workers = []
+       @finish = false
+       @queue = S3DataPacker::Queue.new
+     end
+
+     def wait_time
+       @wait_time ||= S3DataPacker.config.thread_sleep_time
+     end
+
+     def lock_wait_time
+       @lock_wait_time ||= S3DataPacker.config.thread_lock_wait_time
+     end
+
+     def thread_count
+       @thread_count ||= S3DataPacker.config.thread_count
+     end
+
+     def dead?
+       workers.map(&:status).uniq == [nil] || workers.map(&:status).uniq == [false]
+     end
+
+     def kill!
+       log 'All', "Killing #{workers.size} workers"
+       workers.map(&:kill)
+     end
+
+     def reset!
+       return unless dead?
+       @finish = false
+       @workers = []
+     end
+
+     def finish!
+       @finish = true
+     end
+
+     def finished?
+       @finish == true && queue.size == 0
+     end
+
+     def log(id, message, level = :info)
+       logger.send level, "Thread #{id}: #{message}"
+     end
+
+     def wait!
+       workers.map(&:join)
+     end
+
+     def spawn_thread!(id, &block)
+       @workers << Thread.new do
+         log id, "Started!"
+         loop do
+           if finished?
+             log id, "Finish signal up and no more work to pull - Exiting"
+             break
+           end
+           item = queue.fetch!
+           if item
+             log id, "Processing item #{item}", :debug
+             begin
+               yield item
+             rescue ThreadError
+               log id, "Locked, retry in #{lock_wait_time}", :warn
+               sleep(lock_wait_time)
+               retry
+             end
+           else
+             log id, "No more work found, sleeping for #{wait_time}"
+             sleep(wait_time)
+           end
+         rescue Exception => e
+           log id, 'Unexpected error!'
+           raise e
+         end
+       end
+     end
+
+     def spawn_threads!(&block)
+       logger.info "Spawning #{thread_count} threads"
+       thread_count.times do |id|
+         spawn_thread!(id, &block)
+       end
+     end
+
+     private
+
+     def logger
+       @logger ||= S3DataPacker.logger
+     end
+   end
+ end
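
A standalone sketch of the worker lifecycle: spawn threads with a processing block, feed the shared queue, raise the finish flag, and join. Workers log through S3DataPacker.logger, so point it somewhere writable first:

    S3DataPacker.config.logger = Logger.new($stdout)

    set = S3DataPacker::ThreadSet.new
    set.spawn_threads! { |item| puts "processed #{item}" }
    10.times { |i| set.queue.add!(i) }
    set.finish!   # workers exit once the queue drains
    set.wait!     # join all worker threads
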
@@ -0,0 +1,3 @@
+ module S3DataPacker
+   VERSION = "0.2.0"
+ end
@@ -0,0 +1,41 @@
+ require 'csv'
+ require 'json'
+ require 'logger'
+ require 'mime/types/full'
+ require 'aws-sdk-s3'
+
+ require "s3_data_packer/version"
+ require 's3_data_packer/configuration'
+ require 's3_data_packer/packer'
+ require 's3_data_packer/queue'
+ require 's3_data_packer/thread_set'
+ require 's3_data_packer/summary'
+ require 's3_data_packer/json_batch'
+ require 's3_data_packer/bucket'
+ require 's3_data_packer/filename_generator'
+
+ require 's3_data_packer/sources/object'
+ require 's3_data_packer/sources/s3_bucket'
+
+ require 's3_data_packer/targets/s3_bucket'
+ require 's3_data_packer/targets/object'
+
+ module S3DataPacker
+   class << self
+     def configuration
+       @configuration ||= Configuration.new
+     end
+
+     alias config configuration
+
+     def configure
+       yield configuration
+     end
+
+     def logger
+       @logger ||= config.logger || Logger.new('log/s3_data_packer.log')
+     end
+   end
+ end
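
Putting it all together for an S3-to-S3 run. Note that S3DataPacker::Bucket is required above but is not part of this diff, so the constructor keywords below are assumptions, not the confirmed signature:

    require 's3_data_packer'

    # configure credentials as in the Configuration example above, then:
    source = S3DataPacker::Sources::S3Bucket.new(bucket_name: 'raw-items', path: 'items/')   # assumed kwargs
    target = S3DataPacker::Targets::S3Bucket.new(bucket_name: 'packed', path: 'batches/')    # assumed kwargs

    S3DataPacker::Packer.new(source: source, target: target).pack!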