s3_data_packer 0.2.0

--- /dev/null
+++ b/lib/s3_data_packer/configuration.rb
@@ -0,0 +1,99 @@
+ module S3DataPacker
+   class Configuration
+     # Standard logger used to output information
+     attr_accessor :logger
+
+     # How many threads to run for reading and processing items. This needs to be
+     # balanced against the speed at which item keys are gathered, to prevent
+     # emptying the queue too early.
+     attr_accessor :thread_count
+
+     # Time in seconds to let a thread sleep when there are no pending items in the queue.
+     attr_accessor :thread_sleep_time
+
+     # Time in seconds for a thread to wait when locked.
+     attr_accessor :thread_lock_wait_time
+
+     # Maximum number of items to keep in the queue, so it does not overflow while
+     # workers process items.
+     attr_accessor :max_queue_size
+
+     # Time in seconds to wait, once the queue has reached max_queue_size, before
+     # adding new items.
+     attr_accessor :max_queue_wait
+
+     # Directory for working files. Make sure you have permissions on this path.
+     # If the path does not exist, the packer will try to create it before using
+     # it.
+     attr_accessor :workdir
+
+     # Whether to keep or delete the finalized batch file. Set to false if you want to
+     # keep the output files in the workdir.
+     attr_accessor :cleanup_batch
+
+     # Whether to compress the final batch file. If set to true, the output file
+     # will be compressed with gzip, and the uncompressed file will be removed.
+     attr_accessor :compress_batch
+
+     # Maximum number of items per batch file
+     attr_accessor :batch_size
+
+     attr_accessor :s3_api_key
+     attr_accessor :s3_api_secret
+     attr_accessor :s3_region
+
+     # String prefix to include in output filenames for the batches
+     attr_accessor :output_filename_prefix
+
+     # String suffix to include in output filenames for the batches
+     attr_accessor :output_filename_suffix
+
+     # Pattern used to construct output filenames
+     attr_accessor :output_filename_pattern
+
+     # Separator character used to join values when generating a filename
+     attr_accessor :output_filename_splitter
+
+     def initialize
+       @thread_count = 2
+       @thread_sleep_time = 1
+       @thread_lock_wait_time = 1
+       @max_queue_size = 10000
+       @max_queue_wait = 5
+       @batch_size = 100000
+       @workdir = 'tmp/s3_data_packer'
+       @cleanup_batch = true
+       @compress_batch = true
+       @output_filename_prefix = nil
+       @output_filename_suffix = 'batch'
+       @output_filename_pattern = %i[timecode_int suffix]
+       @output_filename_splitter = '_'
+     end
+
+     def compress_batch?
+       compress_batch == true
+     end
+
+     def cleanup_batch?
+       cleanup_batch == true
+     end
+
+     def s3_credentials?
+       s3_api_key && s3_api_secret
+     end
+
+     def default_s3_credentials
+       return nil unless s3_credentials?
+
+       Aws::Credentials.new(s3_api_key, s3_api_secret)
+     end
+
+     def filename_generator_defaults
+       { prefix: output_filename_prefix,
+         suffix: output_filename_suffix,
+         pattern: output_filename_pattern,
+         splitter: output_filename_splitter }
+     end
+
+   end
+ end
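
A minimal configuration sketch, assuming the defaults above; the accessor names come straight from this file, and reading credentials from ENV is just one illustrative option:

S3DataPacker.configure do |config|
  config.thread_count  = 4
  config.batch_size    = 50_000
  config.s3_api_key    = ENV['AWS_ACCESS_KEY_ID']    # illustrative credential source
  config.s3_api_secret = ENV['AWS_SECRET_ACCESS_KEY']
  config.s3_region     = 'us-east-1'
end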
--- /dev/null
+++ b/lib/s3_data_packer/filename_generator.rb
@@ -0,0 +1,45 @@
+ module S3DataPacker
+   class FilenameGenerator
+     attr_reader :pattern, :number, :splitter
+
+     def initialize(opts = {})
+       @number = opts[:start_at] || 1
+       @prefix = opts[:prefix] || default_options[:prefix]
+       @suffix = opts[:suffix] || default_options[:suffix]
+       @pattern = opts[:pattern] || default_options[:pattern]
+       @splitter = opts[:splitter] || default_options[:splitter]
+       validate_pattern!
+     end
+
+     def timecode_int; Time.now.to_i; end
+     def timecode_dec; Time.now.to_f; end
+     def number; @number; end
+     def timestamp; Time.now.strftime('%Y%m%d%H%M%S'); end
+     def datestamp; Time.now.strftime('%Y%m%d'); end
+     def prefix; @prefix; end
+     def suffix; @suffix; end
+
+     def generate!
+       name = pattern.map { |key| send(key) }
+       name.delete_if { |value| value.nil? || value == '' }
+       name = name.map(&:to_s).join(splitter)
+       @number += 1
+       name
+     end
+
+     private
+
+     def default_options
+       @default_options ||= S3DataPacker.config.filename_generator_defaults
+     end
+
+     def validate_pattern!
+       valid = %i[timecode_int timecode_dec number timestamp datestamp prefix suffix]
+       pattern.each do |item|
+         raise ArgumentError, "Invalid pattern key, must be a Symbol" unless Symbol === item
+         raise ArgumentError, "Invalid pattern key #{item}. Allowed: #{valid.join(', ')}" unless valid.include?(item)
+       end
+     end
+
+   end
+ end
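
For illustration, a generator built with explicit options (every key below is from the valid-pattern list; number increments on each generate! call):

gen = S3DataPacker::FilenameGenerator.new(
  prefix:   'export',
  suffix:   'batch',
  pattern:  %i[prefix number datestamp suffix],
  splitter: '-'
)
gen.generate! # => "export-1-20240101-batch" (datestamp varies by day)
gen.generate! # => "export-2-20240101-batch"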
--- /dev/null
+++ b/lib/s3_data_packer/json_batch.rb
@@ -0,0 +1,93 @@
+ module S3DataPacker
+   class JSONBatch
+     attr_reader :delimiter, :batch, :item_count
+
+     def initialize(opts = {})
+       @delimiter = "\r\n"
+       @workdir = opts[:workdir]
+       @filename_generator = opts[:filename_generator]
+       @pre_processor = opts[:pre_processor] # Expected to be a Proc returning a String
+       @size = opts[:size]
+       @item_count = 0
+       init_workdir!
+     end
+
+     def size
+       @size ||= S3DataPacker.config.batch_size
+     end
+
+     def workdir
+       @workdir ||= S3DataPacker.config.workdir
+     end
+
+     def filename_generator
+       @filename_generator ||= S3DataPacker::FilenameGenerator.new
+     end
+
+     def full?
+       item_count >= size
+     end
+
+     def generate_filename
+       name = filename_generator.generate!
+       "#{workdir}/#{name}.json"
+     end
+
+     def new_file!
+       close! if @batch
+       @batch = File.open(generate_filename, 'w')
+     end
+
+     def append_data!(data)
+       digested = pre_process_data(data)
+       batch << "#{digested}#{delimiter}"
+       @item_count += 1
+     end
+
+     def path
+       batch.path
+     end
+
+     def close!
+       batch.close
+     end
+
+     def delete!
+       close! if @batch && !@batch.closed?
+       File.delete(path) if @batch && File.exist?(path)
+       reset!
+     end
+
+     def finalize!
+       close! unless batch.closed?
+       final_path = batch.path
+       final_path = compress! if S3DataPacker.config.compress_batch?
+       reset!
+       final_path
+     end
+
+     private
+
+     def pre_process_data(raw_data)
+       # Apply any transformation here; must return a String
+       return @pre_processor.call(raw_data) if @pre_processor
+       raw_data
+     end
+
+     def init_workdir!
+       Dir.mkdir(workdir) unless Dir.exist?(workdir)
+     end
+
+     def compress!
+       new_path = "#{batch.path}.gz"
+       system('gzip', batch.path) # avoids shell interpolation of the path
+       new_path
+     end
+
+     def reset!
+       @batch = nil
+       @item_count = 0
+     end
+
+   end
+ end
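
A sketch of the batch lifecycle, assuming the workdir's parent directory exists and the default configuration (compress_batch? is true, so finalize! shells out to gzip):

batch = S3DataPacker::JSONBatch.new(
  size: 2,
  pre_processor: ->(record) { record.to_json } # must return a String
)
batch.new_file!
batch.append_data!(id: 1)
batch.append_data!(id: 2)
batch.full?            # => true
path = batch.finalize! # => e.g. "tmp/s3_data_packer/1700000000_batch.json.gz"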
--- /dev/null
+++ b/lib/s3_data_packer/packer.rb
@@ -0,0 +1,105 @@
+ module S3DataPacker
+   class Packer
+     module Error
+       class DeadWorkers < StandardError; end
+     end
+
+     attr_reader :source, :target, :output
+
+     def initialize(opts = {})
+       @source = opts[:source]
+       @target = opts[:target]
+       @output = opts[:output] || S3DataPacker::JSONBatch.new
+     end
+
+     def summary
+       @summary ||= S3DataPacker::Summary.new
+     end
+
+     def logger
+       @logger ||= S3DataPacker.logger
+     end
+
+     def workers
+       @workers ||= S3DataPacker::ThreadSet.new
+     end
+
+     def pack!
+       log "Packing data from #{source.name} to #{target.name} ..."
+       boot_workers!
+
+       @start_time = Time.now
+       begin
+         each_item { |item| workers.queue.add!(item) }
+         finalize_processing!
+       rescue Exception => e
+         log "Unexpected error, killing threads", :error
+         raise e
+       ensure
+         workers.kill!
+       end
+     end
+
+     def process_item(data)
+       output.append_data! data
+       summary.count_processed
+       if output.full?
+         flush_batch!
+         output.new_file!
+       end
+     end
+
+     private
+
+     def finalize_processing!
+       log "No more items found to enqueue, signaling workers to finish"
+       workers.finish!
+       workers.wait!
+       workers.kill!
+       log "Pushing last open batch #{output.path}"
+       flush_batch!
+       summary.set_time(@start_time, Time.now)
+       log "Finished\n#{summary.flush!}"
+     end
+
+     def each_item(&block)
+       source.each do |item|
+         if workers.dead?
+           log "Workers died", :error
+           raise Error::DeadWorkers, 'Workers died'
+         end
+         summary.count_item
+         yield item
+       end
+     end
+
+     def flush_batch!
+       summary.count_batch
+       final_filename = output.finalize!
+       send_file!(final_filename)
+     end
+
+     def send_file!(file)
+       target.save_file file
+     end
+
+     def boot_workers!
+       output.new_file!
+       workers.spawn_threads! do |item|
+         data = source.fetch(item)
+         workers.lock.synchronize { process_item(data) }
+         post_process_item(item)
+       end
+     end
+
+     def post_process_item(item)
+       # Hook: do something with the key after it has been processed
+       nil
+     end
+
+     def log(message, level = :info)
+       logger.send level, "Main process: #{message}"
+     end
+
+   end
+ end
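
End to end, the packer only needs an object with each/fetch/name and one with name/save_file; a hedged sketch using the generic wrappers defined later in this diff (my_store and my_uploader are hypothetical):

source = S3DataPacker::Sources::Object.new(
  object: my_store,          # hypothetical: yields keys, fetches raw data
  each_method:  :each_key,
  fetch_method: :read,
  name_method:  :to_s
)
target = S3DataPacker::Targets::Object.new(
  object: my_uploader,       # hypothetical: responds to #name and #store
  save_file_method: :store
)
S3DataPacker::Packer.new(source: source, target: target).pack!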
--- /dev/null
+++ b/lib/s3_data_packer/queue.rb
@@ -0,0 +1,46 @@
+ module S3DataPacker
+   class Queue
+     attr_reader :total_items
+
+     def initialize(opts = {})
+       @max_items = opts[:max_items]
+       @wait_time = opts[:wait_time]
+       @total_items = 0
+     end
+
+     def max_items
+       @max_items ||= S3DataPacker.config.max_queue_size
+     end
+
+     def wait_time
+       @wait_time ||= S3DataPacker.config.max_queue_wait
+     end
+
+     def items
+       @items ||= []
+     end
+
+     def add!(item)
+       items << item
+       @total_items += 1
+       if size >= max_items
+         S3DataPacker.logger.info "Queue full, pausing"
+         sleep(wait_time)
+         S3DataPacker.logger.info "Resuming queue"
+       end
+     end
+
+     def fetch!
+       items.shift
+     end
+
+     def size
+       items.size
+     end
+
+     def reset!
+       @items = []
+       @total_items = 0
+     end
+   end
+ end
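
The queue is a plain array with crude back-pressure: add! sleeps once max_items is reached, and fetch! pops from the front. A small sketch (assuming a writable default log path, since add! logs when full):

queue = S3DataPacker::Queue.new(max_items: 2, wait_time: 1)
queue.add!('a')
queue.add!('b')   # hits max_items, logs "Queue full, pausing" and sleeps 1s
queue.fetch!      # => "a" (FIFO via Array#shift)
queue.total_items # => 2 (running total, not the current size)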
--- /dev/null
+++ b/lib/s3_data_packer/sources/object.rb
@@ -0,0 +1,28 @@
+ module S3DataPacker
+   module Sources
+     class Object
+
+       def initialize(object:, each_method: :each, fetch_method: :fetch, name_method: :name)
+         @object = object
+         @each_method = each_method
+         @fetch_method = fetch_method
+         @name_method = name_method
+       end
+
+       def name
+         @object.send(@name_method)
+       end
+
+       def each(&block)
+         @object.send(@each_method) do |item|
+           yield item
+         end
+       end
+
+       def fetch(item)
+         @object.send(@fetch_method, item)
+       end
+
+     end
+   end
+ end
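
Since the wrapper dispatches with send, any enumerable-ish object can act as a source; for example (hypothetical ActiveRecord-style model and serializer method):

source = S3DataPacker::Sources::Object.new(
  object: User,                  # hypothetical model class
  each_method:  :find_each,      # yields one record at a time
  fetch_method: :serialize_item, # hypothetical; receives each yielded record
  name_method:  :table_name
)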
--- /dev/null
+++ b/lib/s3_data_packer/sources/s3_bucket.rb
@@ -0,0 +1,21 @@
+ module S3DataPacker
+   module Sources
+
+     class S3Bucket < S3DataPacker::Bucket
+       def name
+         "s3://#{bucket_name}/#{path}"
+       end
+
+       def each(&block)
+         each_key do |key|
+           yield key
+         end
+       end
+
+       def fetch(key)
+         download(key)
+       end
+     end
+
+   end
+ end
--- /dev/null
+++ b/lib/s3_data_packer/summary.rb
@@ -0,0 +1,59 @@
+ module S3DataPacker
+   class Summary
+     def stats
+       @stats ||= {}
+     end
+
+     def count_item
+       stats[:total_items] ||= 0
+       stats[:total_items] += 1
+     end
+
+     def count_processed
+       stats[:processed] ||= 0
+       stats[:processed] += 1
+     end
+
+     def count_batch
+       stats[:batches] ||= 0
+       stats[:batches] += 1
+     end
+
+     def set_time(start_time, end_time)
+       stats[:elapsed] = "#{(end_time.to_i - start_time.to_i)} seconds"
+     end
+
+     def total_items
+       stats[:total_items] || 0
+     end
+
+     def processed
+       stats[:processed] || 0
+     end
+
+     def batches
+       stats[:batches] || 0
+     end
+
+     def elapsed
+       stats[:elapsed]
+     end
+
+     def flush!
+       output = [
+         "Summary:",
+         "Total Items: #{stats[:total_items]}",
+         "Processed Items: #{stats[:processed]}",
+         "Batches: #{stats[:batches]}",
+         "Elapsed: #{stats[:elapsed]}"
+       ].join("\n")
+       reset!
+       output
+     end
+
+     def reset!
+       @stats = {}
+     end
+
+   end
+ end
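
Counters accumulate in the stats hash, and flush! renders them and resets; a quick sketch:

summary = S3DataPacker::Summary.new
3.times { summary.count_item }
2.times { summary.count_processed }
summary.count_batch
summary.set_time(Time.now - 10, Time.now)
puts summary.flush!
# Summary:
# Total Items: 3
# Processed Items: 2
# Batches: 1
# Elapsed: 10 seconds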
--- /dev/null
+++ b/lib/s3_data_packer/targets/object.rb
@@ -0,0 +1,21 @@
+ module S3DataPacker
+   module Targets
+     class Object
+
+       def initialize(object:, name_method: :name, save_file_method: :save_file)
+         @object = object
+         @name_method = name_method
+         @save_file_method = save_file_method
+       end
+
+       def name
+         @object.send(@name_method)
+       end
+
+       def save_file(filepath)
+         @object.send(@save_file_method, filepath)
+       end
+
+     end
+   end
+ end
--- /dev/null
+++ b/lib/s3_data_packer/targets/s3_bucket.rb
@@ -0,0 +1,16 @@
+ module S3DataPacker
+   module Targets
+
+     class S3Bucket < S3DataPacker::Bucket
+       def name
+         "s3://#{bucket_name}/#{path}"
+       end
+
+       def save_file(filepath)
+         upload(filepath)
+         File.delete(filepath) if S3DataPacker.config.cleanup_batch?
+       end
+     end
+
+   end
+ end
--- /dev/null
+++ b/lib/s3_data_packer/thread_set.rb
@@ -0,0 +1,98 @@
+ module S3DataPacker
+   class ThreadSet
+     attr_reader :lock, :workers, :queue
+
+     def initialize(opts = {})
+       @lock = Mutex.new
+       @workers = []
+       @finish = false
+       @queue = S3DataPacker::Queue.new
+     end
+
+     def wait_time
+       @wait_time ||= S3DataPacker.config.thread_sleep_time
+     end
+
+     def lock_wait_time
+       @lock_wait_time ||= S3DataPacker.config.thread_lock_wait_time
+     end
+
+     def thread_count
+       @thread_count ||= S3DataPacker.config.thread_count
+     end
+
+     def dead?
+       workers.map(&:status).uniq == [nil] || workers.map(&:status).uniq == [false]
+     end
+
+     def kill!
+       log 'All', "Killing #{workers.size} workers"
+       workers.map(&:kill)
+     end
+
+     def reset!
+       return unless dead?
+       @finish = false
+       @workers = []
+     end
+
+     def finish!
+       @finish = true
+     end
+
+     def finished?
+       @finish == true && queue.size == 0
+     end
+
+     def log(id, message, level = :info)
+       logger.send level, "Thread #{id}: #{message}"
+     end
+
+     def wait!
+       workers.map(&:join)
+     end
+
+     def spawn_thread!(id, &block)
+       @workers << Thread.new do
+         log id, "Started!"
+         loop do
+           if finished?
+             log id, "Finish signal up and no more work to pull - exiting"
+             break
+           end
+           item = queue.fetch!
+           if item
+             log id, "Processing item #{item}", :debug
+             begin
+               yield item
+             rescue ThreadError
+               log id, "Locked, retrying in #{lock_wait_time}", :warn
+               sleep(lock_wait_time)
+               retry
+             end
+           else
+             log id, "No more work found, sleeping for #{wait_time}"
+             sleep(wait_time)
+           end
+         rescue Exception => e
+           log id, "Unexpected error: #{e.message}", :error
+           raise e
+         end
+       end
+     end
+
+     def spawn_threads!(&block)
+       logger.info "Spawning #{thread_count} threads"
+       thread_count.times do |id|
+         spawn_thread!(id, &block)
+       end
+     end
+
+     private
+
+     def logger
+       @logger ||= S3DataPacker.logger
+     end
+
+   end
+ end
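
Workers pull from the shared queue until finish! is signaled and the queue drains; a sketch of the same loop Packer drives, assuming the default configuration and a writable log path:

set = S3DataPacker::ThreadSet.new
set.spawn_threads! do |item|
  set.lock.synchronize { puts "processing #{item}" }
end
%w[a b c].each { |item| set.queue.add!(item) }
set.finish! # workers exit once the queue is empty
set.wait!   # join all workers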
--- /dev/null
+++ b/lib/s3_data_packer/version.rb
@@ -0,0 +1,3 @@
+ module S3DataPacker
+   VERSION = "0.2.0"
+ end
--- /dev/null
+++ b/lib/s3_data_packer.rb
@@ -0,0 +1,41 @@
+ require 'csv'
+ require 'json'
+ require 'logger'
+ require 'mime/types/full'
+ require 'aws-sdk-s3'
+
+ require "s3_data_packer/version"
+ require 's3_data_packer/configuration'
+ require 's3_data_packer/packer'
+ require 's3_data_packer/queue'
+ require 's3_data_packer/thread_set'
+ require 's3_data_packer/summary'
+ require 's3_data_packer/json_batch'
+ require 's3_data_packer/bucket'
+ require 's3_data_packer/filename_generator'
+
+ require 's3_data_packer/sources/object'
+ require 's3_data_packer/sources/s3_bucket'
+
+ require 's3_data_packer/targets/s3_bucket'
+ require 's3_data_packer/targets/object'
+
+ module S3DataPacker
+   class << self
+     attr_reader :configuration
+
+     def configuration
+       @configuration ||= Configuration.new
+     end
+
+     alias config configuration
+
+     def configure
+       yield configuration
+     end
+
+     def logger
+       @logger ||= config.logger || Logger.new('log/s3_data_packer.log')
+     end
+   end
+ end
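
Note that logger memoizes its result, so a custom logger has to be set before first use; a small sketch:

S3DataPacker.configure { |c| c.logger = Logger.new($stdout) }
S3DataPacker.logger.info 'packing started' # falls back to log/s3_data_packer.log when unset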