s3_data_packer 0.2.0
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +8 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +54 -0
- data/LICENSE.txt +21 -0
- data/README.md +258 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/s3_data_packer/bucket.rb +88 -0
- data/lib/s3_data_packer/configuration.rb +99 -0
- data/lib/s3_data_packer/filename_generator.rb +45 -0
- data/lib/s3_data_packer/json_batch.rb +93 -0
- data/lib/s3_data_packer/packer.rb +105 -0
- data/lib/s3_data_packer/queue.rb +46 -0
- data/lib/s3_data_packer/sources/object.rb +28 -0
- data/lib/s3_data_packer/sources/s3_bucket.rb +21 -0
- data/lib/s3_data_packer/summary.rb +59 -0
- data/lib/s3_data_packer/targets/object.rb +21 -0
- data/lib/s3_data_packer/targets/s3_bucket.rb +16 -0
- data/lib/s3_data_packer/thread_set.rb +98 -0
- data/lib/s3_data_packer/version.rb +3 -0
- data/lib/s3_data_packer.rb +41 -0
- data/s3_data_packer.gemspec +41 -0
- metadata +174 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: da63067cb5437094daab442088507425866787ad176a98a0f7e515ae54f142fe
  data.tar.gz: 249ffbb44f8e245c8383e17c1c6ae754e477935e8b461ed9193b8a12beb4394f
SHA512:
  metadata.gz: 7f204a257f1b92d40e2e425b62e66e88238f06205e2da40940243cd086ce66005eb00ed25d3326bb25b6ee65351ca46a85165f7117d8526ec9f9de22a2cff165
  data.tar.gz: 9c6681710e72702cb85fa63bf91a891afc32771626449f5dfc314ebd2f69fd8a52a279d0d1d889a15b4a016e9695ad2846f3eb395244df17f45ed7d5b5e4e0bf
data/.gitignore
ADDED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
s3_data_packer
data/.ruby-version
ADDED
@@ -0,0 +1 @@
2.5.9
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, gender identity and expression, level of experience,
nationality, personal appearance, race, religion, or sexual identity and
orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at rayko.drg@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at [http://contributor-covenant.org/version/1/4][version]

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,54 @@
PATH
  remote: .
  specs:
    s3_data_packer (0.2.0)
      aws-sdk-s3 (~> 1)
      mime-types (~> 3)

GEM
  remote: https://rubygems.org/
  specs:
    aws-eventstream (1.1.1)
    aws-partitions (1.492.0)
    aws-sdk-core (3.119.1)
      aws-eventstream (~> 1, >= 1.0.2)
      aws-partitions (~> 1, >= 1.239.0)
      aws-sigv4 (~> 1.1)
      jmespath (~> 1.0)
    aws-sdk-kms (1.47.0)
      aws-sdk-core (~> 3, >= 3.119.0)
      aws-sigv4 (~> 1.1)
    aws-sdk-s3 (1.100.0)
      aws-sdk-core (~> 3, >= 3.119.0)
      aws-sdk-kms (~> 1)
      aws-sigv4 (~> 1.1)
    aws-sigv4 (1.2.4)
      aws-eventstream (~> 1, >= 1.0.2)
    byebug (11.1.3)
    docile (1.4.0)
    jmespath (1.4.0)
    mime-types (3.3.1)
      mime-types-data (~> 3.2015)
    mime-types-data (3.2021.0704)
    minitest (5.14.4)
    rake (10.5.0)
    simplecov (0.21.2)
      docile (~> 1.1)
      simplecov-html (~> 0.11)
      simplecov_json_formatter (~> 0.1)
    simplecov-html (0.12.3)
    simplecov_json_formatter (0.1.3)

PLATFORMS
  ruby

DEPENDENCIES
  bundler (~> 1.16)
  byebug
  minitest (~> 5.0)
  rake (~> 10.0)
  s3_data_packer!
  simplecov

BUNDLED WITH
   1.16.6
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2021 Rayko

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,258 @@
# S3DataPacker

This small packer reads a large number of individual files from an S3 location, each representing a single
item in JSON format, and packs them into larger batches, with the option of compressing the final batch.
This reduces the total number of files and, if compression is enabled, the total storage size of the data.

The idea is to take data dumped on S3 in this way and prepare a more optimal layout for setting up a
querying system on top of it with AWS Athena.

For now, S3DataPacker supports JSON items in a one-item-per-file layout, GZip compression if enabled, and
only S3-to-S3 transfers, though the source and target buckets can be different buckets, or even on
different accounts, if the proper credentials are provided.

## Installation

Add this line to your application's Gemfile:

```ruby
gem 's3_data_packer'
```

Or use the `main` branch from the repo:

```ruby
gem 's3_data_packer', git: 'https://github.com/rayko/s3_data_packer.git', branch: 'main'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install s3_data_packer

## Configurations

There is a good number of options that alter how the data is consumed. Below is the list of all defaults
out of the box:

```ruby
S3DataPacker.configure do |config|
  config.logger = Logger.new('log/s3_data_packer.log') # Standard logger for information
  config.thread_count = 2 # How many threads to run
  config.thread_sleep_time = 1 # How long to wait when there's no work in the queue
  config.thread_lock_wait_time = 1 # How long to wait after a lock error before retrying
  config.max_queue_size = 10000 # How big the queue can get during processing
  config.max_queue_wait = 5 # How long to wait, once the queue reaches max_queue_size, before continuing
  config.workdir = 'tmp/s3_data_packer' # Where to keep output files until they are pushed to the target location
  config.cleanup_batch = true # Whether or not to remove pushed batches locally
  config.compress_batch = true # Whether or not to compress batches with GZip
  config.batch_size = 100000 # How many items to fit in a batch
  config.s3_api_key = nil # Default API key for an AWS account
  config.s3_api_secret = nil # Default API secret for an AWS account
  config.s3_region = nil # Default region for the buckets to use
  config.output_filename_prefix = nil # Static prefix to prepend to output filenames
  config.output_filename_suffix = 'batch' # Static suffix to append to output filenames
  config.output_filename_pattern = %i[timecode_int suffix] # Simple pattern to construct output filenames (more on that below)
  config.output_filename_splitter = '_' # Character used to join pattern elements into the final filename
end
```

### S3 Credentials

There are two main ways to set credentials, depending on the context. Buckets can be configured in place
with user-provided credentials for both source and target locations.

If the source and target locations are on the same account and region and use the same credentials, the
options above can be set so those credentials are always used.

AWS credentials in the configuration are optional, and are just a shortcut to avoid setting credentials on
each run.

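For example, credentials can be given in place when instantiating a bucket (a minimal sketch; the
environment variable names are placeholders, and `:credentials`/`:region` are the per-bucket overrides
described under Usage below):

```ruby
# Placeholder env var names; any Aws::Credentials instance works here.
credentials = Aws::Credentials.new(ENV['PACKER_AWS_KEY'], ENV['PACKER_AWS_SECRET'])

source_bucket = S3DataPacker::Sources::S3Bucket.new name: 'my-bucket',
                                                    path: 'some/location',
                                                    credentials: credentials,
                                                    region: 'us-east-1'
```
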
### Thread options

Various thread options are available to moderate how the process runs. Depending on the hardware available,
the thread count can be adjusted to speed up the process. However, if there are enough threads, the queue
might run empty too soon, in which case threads sleep for the given amount of time to let the queue gather
some items to work on.

All timing settings should be adjusted depending on where the packer is going to run and the resources
available.

### Output filename options

A couple of parameters control how filenames are generated consistently. The simplest options,
`:output_filename_prefix`, `:output_filename_suffix` and `:output_filename_splitter`, are straightforward.
The `:output_filename_pattern` option is a bit more involved: it dictates the order and the kind of values
used when generating a filename. When a new name needs to be generated, each item in the pattern is
translated to a value of some kind, and the values are joined with the `:output_filename_splitter`
character. The contents of the pattern array must be `Symbol` names and can only be one of the following:

- `:timecode_int` -> current standard time in seconds (`Time.now.to_i`)
- `:timecode_dec` -> current standard time with milliseconds (`Time.now.to_f`)
- `:number` -> a simple number that grows as new names are generated
- `:timestamp` -> simple timestamp with format YYYYMMDDhhmmss
- `:datestamp` -> simple datestamp with format YYYYMMDD
- `:prefix` -> given static string to use as a prefix in the name
- `:suffix` -> given static string to use as a suffix in the name

Different patterns generate different names with the same structure. The important part is to always
include a variable element so newer files do not overwrite previous data.

A few examples of different patterns, with the prefix set to 'data' and the suffix set to 'batch':

- `[:timecode_int, :suffix]` -> 1111111111_batch 1111111112_batch 1111111113_batch ...
- `[:datestamp, :number]` -> 20200101_1 20200101_2 20200101_3 ...
- `[:prefix, :number, :suffix]` -> data_1_batch data_2_batch data_3_batch ...

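To make the pattern semantics concrete, here is a minimal sketch of how a pattern array could be expanded
into a filename. This is illustrative only, not the gem's actual `FilenameGenerator`; the static prefix and
suffix values are hardcoded for the example:

```ruby
# Maps each pattern symbol to a value; `n` is the running batch counter.
PARTS = {
  timecode_int: ->(n) { Time.now.to_i.to_s },
  timecode_dec: ->(n) { Time.now.to_f.to_s },
  number:       ->(n) { n.to_s },
  timestamp:    ->(n) { Time.now.strftime('%Y%m%d%H%M%S') },
  datestamp:    ->(n) { Time.now.strftime('%Y%m%d') },
  prefix:       ->(n) { 'data' },  # static string, as configured
  suffix:       ->(n) { 'batch' }  # static string, as configured
}

def build_filename(pattern, count, splitter = '_')
  pattern.map { |part| PARTS.fetch(part).call(count) }.join(splitter)
end

build_filename(%i[datestamp number], 1)     # => e.g. "20200101_1"
build_filename(%i[prefix number suffix], 2) # => "data_2_batch"
```
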
## Usage

The simplest setup for this simple file processor is to set the AWS credentials and region through the
configuration as shown above. Make sure `config.workdir` is set and that the location exists on the local
machine.

To launch the packer, the only thing needed out of the box is to instantiate a source bucket and a target
bucket:

```ruby
source_bucket = S3DataPacker::Sources::S3Bucket.new name: 'my-bucket', path: 'some/location'
target_bucket = S3DataPacker::Targets::S3Bucket.new name: 'other-bucket', path: 'my/destination'
```

You can override the configured AWS credentials with the `:credentials` option, as well as the region with
`:region`. `:credentials` needs to be an instance of `Aws::Credentials`. Setting it up this way should
allow for more complex role handling, since the instance passed in the `:credentials` option is fed
directly to `Aws::S3::Resource` and `Aws::S3::Client` to interface with the S3 buckets.

Once the buckets are instantiated you can call the packer:

```ruby
packer = S3DataPacker::Packer.new source: source_bucket, target: target_bucket
packer.pack!
```

### How does it work?

Based on the sample above, once `#pack!` is called a set of threads boots up, a new file is opened in
`config.workdir` (without further configuration it will generally be named something like
`123123123_batch.json`), and the packer starts iterating over all keys under the source path
`some/location`.

Each listed key enters the queue for the threads; the threads then take each key in the queue, download
the data into memory (no intermediate file is created for it), append the data to the currently opened
batch, and continue with the next key.

As items are appended, when the target size `config.batch_size` is reached, the current batch is closed,
compressed with GZip, and uploaded to the target bucket at the specified location `my/destination`. Once
the file is pushed, the local copy is deleted, and a new batch is opened to continue appending items.

When all the keys have been listed, the packer waits for the threads to finish any remaining items in the
queue, and the last open batch, which likely hasn't reached the target size, is then closed and pushed
like the others.

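In outline, the flow can be pictured with a small toy script like this. It is an illustrative sketch only,
not the gem's actual implementation; `fetch` stands in for the S3 download and the arrays stand in for
batch files:

```ruby
queue      = SizedQueue.new(100)            # bounded, like max_queue_size
batch_size = 5
batches    = []                             # stands in for "closed and uploaded" batches
current    = []
mutex      = Mutex.new

fetch = ->(key) { %({"id": #{key}}) }       # stand-in for downloading one S3 object

workers = Array.new(2) do                   # like thread_count
  Thread.new do
    while (key = queue.pop)                 # nil signals shutdown
      data = fetch.call(key)
      mutex.synchronize do
        current << data
        if current.size >= batch_size       # batch full: "close, compress, upload"
          batches << current
          current = []
        end
      end
    end
  end
end

(1..12).each { |key| queue << key }         # the key listing ("each_key")
workers.size.times { queue << nil }         # tell workers there is no more work
workers.each(&:join)
batches << current unless current.empty?    # push the final partial batch
```
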
And that's basically it. There are a few places where additional processing could be introduced, but that's
a feature for later.

There is nothing special about the source and target buckets; they can be the same bucket, or on different
accounts or regions. However, it is not recommended to set the source and target to the same bucket and
path.

### Custom Sources/Targets

It is possible to define a custom source and target so the packer reads data from somewhere other than an
S3 bucket, and puts the resulting batches somewhere else. The `S3DataPacker::Packer` takes `:source` and
`:target` parameters for this. At the moment, there are 2 source classes provided:

- `S3DataPacker::Sources::S3Bucket`
- `S3DataPacker::Sources::Object`

And 2 pre-defined targets:

- `S3DataPacker::Targets::S3Bucket`
- `S3DataPacker::Targets::Object`

Both bucket-related classes operate in the same way: you define the name and path of the buckets to read
and write the data, as in the main example above. Be sure to configure credentials to use these.

The object source is essentially a wrapper around some other custom object, passing down which methods the
packer should call on it. Any object you pass to the object source needs to respond to:

- `#name`: mostly used for logging
- `#each`: takes a block and iterates over items
- `#fetch`: takes an identifier and returns the actual data of the item

The `#each` and `#fetch` methods are split like this mainly because the packer is threaded: it expects to
iterate over keys, IDs or some other small piece of information in one thread, and use that information to
retrieve the full object data on other threads. This keeps the queue small in byte size.

By default the object source expects those method names to be defined on the object provided. If the
object already has methods that do this under different names, the method names can be passed like so:

```ruby
S3DataPacker::Sources::Object.new object: my_object,
                                  each_method: :iterate,
                                  fetch_method: :find,
                                  name_method: :display_name
```

As long as `#each` yields items (strings, IDs, whatever) and `#fetch` returns JSON data for an item, this
should work.

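For instance, a folder of local JSON files could be wrapped as a source like this (a hypothetical class
written against the contract above; the glob path is a placeholder):

```ruby
class FolderSource
  # Used mostly for logging
  def name
    'local-folder'
  end

  # Yields lightweight identifiers (here, file paths)
  def each(&block)
    Dir.glob('data/items/*.json').each(&block)
  end

  # Resolves an identifier to the item's JSON data
  def fetch(path)
    File.read(path)
  end
end

source = S3DataPacker::Sources::Object.new object: FolderSource.new
```
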
For targets, there's also `S3DataPacker::Targets::Object`, which can be used in a similar way; the only 2
methods for it are:

- `#name`: for the same purpose as the sources' `#name` method
- `#save_file`: takes a path parameter

It can also be configured with other method names if needed:

```ruby
S3DataPacker::Targets::Object.new object: my_object,
                                  name_method: :custom_name,
                                  save_file_method: :save!
```

It is also possible to build a custom source/target class outside of the pre-defined ones that does
anything needed, and pass it down for the packer instance to use. As long as the few needed methods are
there, it should work just fine.

In some cases it might be useful to unify the iterate/fetch mechanics. This can be done by bypassing
`#fetch`: if the iterator behind `#each` needs to yield the actual data right there, writing a `#fetch`
method that simply returns whatever was passed in effectively makes the packer's queue hold the actual
data. This might be useful in some cases, though it may call for a smaller `max_queue_size` to prevent
holding too much data in the queue.

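A minimal sketch of that pass-through idea (the class is hypothetical):

```ruby
class InlineSource
  def initialize(items)
    @items = items # e.g. an array of JSON strings
  end

  def name
    'inline'
  end

  # Yields the actual data instead of a key or ID
  def each(&block)
    @items.each(&block)
  end

  # Identity fetch: the "identifier" already is the data
  def fetch(item)
    item
  end
end
```
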
I believe that with these tools the packer can handle JSON packing in most cases, including:

- Reading database records and serializing them into JSON
- Reading S3 buckets (as originally intended)
- Reading NoSQL items
- Reading one file or a set of files
- Writing batches into S3 buckets (as originally intended)
- Writing batches into the filesystem at some custom location
- Writing batches into some other custom location

At the very least, it covers the cases where I intend to use it.

## Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the
tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/rayko/s3_data_packer.
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to
adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.

## License

The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

## Code of Conduct

Everyone interacting in the S3DataPacker project's codebases, issue trackers, chat rooms and mailing lists
is expected to follow the [code of conduct](https://github.com/rayko/s3_data_packer/blob/master/CODE_OF_CONDUCT.md).
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "s3_data_packer"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
data/bin/setup
ADDED
data/lib/s3_data_packer/bucket.rb
ADDED
@@ -0,0 +1,88 @@
module S3DataPacker
  class Bucket
    attr_reader :bucket_name, :path

    def initialize opts = {}
      @bucket_name = opts[:bucket_name]
      @credentials = opts[:credentials]
      @region = opts[:region]
      @path = opts[:path]
    end

    def credentials
      @credentials ||= S3DataPacker.config.default_s3_credentials
    end

    def region
      @region ||= S3DataPacker.config.s3_region
    end

    def logger
      @logger ||= S3DataPacker.logger
    end

    def each_key &block
      bucket.objects(prefix: path).each do |item|
        yield item.key
      end
    end

    def exist?(key)
      request! { object(key).exists? }
    end

    def download(key)
      data = request! { object(key).get }
      logger.warn "missing key #{key}" unless data
      return nil unless data
      data.body.read
    end

    def upload(file, opts={})
      raise ArgumentError, 'File does not exist' unless File.exist?(file)
      key = "#{path}/#{File.basename(file)}"
      raise ArgumentError, "File #{File.basename(file)} already exists in target location" if exist?(key)
      metadata = opts
      metadata[:content_type] ||= file_mime_type(file)
      metadata[:content_disposition] ||= 'attachment'
      request! { object(key).upload_file(file, metadata) }
      logger.info "Uploaded #{file} to s3://#{bucket_name}/#{key}"
    end

    private

    def request! &block
      begin
        yield
      rescue Aws::S3::Errors::InternalError
        logger.warn "Aws::S3::Errors::InternalError, retrying in 1 second"
        sleep(1)
        retry
      rescue Aws::S3::Errors::NoSuchKey
        return nil
      end
    end

    def file_mime_type(file)
      begin
        MIME::Types.type_for(file).first.content_type
      rescue StandardError
        logger.error "Could not guess MIME type of #{file}"
        return nil
      end
    end

    def object(key)
      bucket.object(key)
    end

    def bucket
      @bucket ||= resource.bucket(bucket_name)
    end

    def resource
      @resource ||= ::Aws::S3::Resource.new(region: region, credentials: credentials)
    end

  end
end