s3_data_packer 0.2.0
- checksums.yaml +7 -0
- data/.gitignore +15 -0
- data/.ruby-gemset +1 -0
- data/.ruby-version +1 -0
- data/.travis.yml +7 -0
- data/CHANGELOG.md +8 -0
- data/CODE_OF_CONDUCT.md +74 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +54 -0
- data/LICENSE.txt +21 -0
- data/README.md +258 -0
- data/Rakefile +10 -0
- data/bin/console +14 -0
- data/bin/setup +8 -0
- data/lib/s3_data_packer/bucket.rb +88 -0
- data/lib/s3_data_packer/configuration.rb +99 -0
- data/lib/s3_data_packer/filename_generator.rb +45 -0
- data/lib/s3_data_packer/json_batch.rb +93 -0
- data/lib/s3_data_packer/packer.rb +105 -0
- data/lib/s3_data_packer/queue.rb +46 -0
- data/lib/s3_data_packer/sources/object.rb +28 -0
- data/lib/s3_data_packer/sources/s3_bucket.rb +21 -0
- data/lib/s3_data_packer/summary.rb +59 -0
- data/lib/s3_data_packer/targets/object.rb +21 -0
- data/lib/s3_data_packer/targets/s3_bucket.rb +16 -0
- data/lib/s3_data_packer/thread_set.rb +98 -0
- data/lib/s3_data_packer/version.rb +3 -0
- data/lib/s3_data_packer.rb +41 -0
- data/s3_data_packer.gemspec +41 -0
- metadata +174 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA256:
  metadata.gz: da63067cb5437094daab442088507425866787ad176a98a0f7e515ae54f142fe
  data.tar.gz: 249ffbb44f8e245c8383e17c1c6ae754e477935e8b461ed9193b8a12beb4394f
SHA512:
  metadata.gz: 7f204a257f1b92d40e2e425b62e66e88238f06205e2da40940243cd086ce66005eb00ed25d3326bb25b6ee65351ca46a85165f7117d8526ec9f9de22a2cff165
  data.tar.gz: 9c6681710e72702cb85fa63bf91a891afc32771626449f5dfc314ebd2f69fd8a52a279d0d1d889a15b4a016e9695ad2846f3eb395244df17f45ed7d5b5e4e0bf
data/.gitignore
ADDED
data/.ruby-gemset
ADDED
@@ -0,0 +1 @@
s3_data_packer
data/.ruby-version
ADDED
@@ -0,0 +1 @@
2.5.9
data/.travis.yml
ADDED
data/CHANGELOG.md
ADDED
data/CODE_OF_CONDUCT.md
ADDED
@@ -0,0 +1,74 @@
# Contributor Covenant Code of Conduct

## Our Pledge

In the interest of fostering an open and welcoming environment, we as
contributors and maintainers pledge to making participation in our project and
our community a harassment-free experience for everyone, regardless of age, body
size, disability, ethnicity, gender identity and expression, level of experience,
nationality, personal appearance, race, religion, or sexual identity and
orientation.

## Our Standards

Examples of behavior that contributes to creating a positive environment
include:

* Using welcoming and inclusive language
* Being respectful of differing viewpoints and experiences
* Gracefully accepting constructive criticism
* Focusing on what is best for the community
* Showing empathy towards other community members

Examples of unacceptable behavior by participants include:

* The use of sexualized language or imagery and unwelcome sexual attention or
advances
* Trolling, insulting/derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or electronic
address, without explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting

## Our Responsibilities

Project maintainers are responsible for clarifying the standards of acceptable
behavior and are expected to take appropriate and fair corrective action in
response to any instances of unacceptable behavior.

Project maintainers have the right and responsibility to remove, edit, or
reject comments, commits, code, wiki edits, issues, and other contributions
that are not aligned to this Code of Conduct, or to ban temporarily or
permanently any contributor for other behaviors that they deem inappropriate,
threatening, offensive, or harmful.

## Scope

This Code of Conduct applies both within project spaces and in public spaces
when an individual is representing the project or its community. Examples of
representing a project or community include using an official project e-mail
address, posting via an official social media account, or acting as an appointed
representative at an online or offline event. Representation of a project may be
further defined and clarified by project maintainers.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported by contacting the project team at rayko.drg@gmail.com. All
complaints will be reviewed and investigated and will result in a response that
is deemed necessary and appropriate to the circumstances. The project team is
obligated to maintain confidentiality with regard to the reporter of an incident.
Further details of specific enforcement policies may be posted separately.

Project maintainers who do not follow or enforce the Code of Conduct in good
faith may face temporary or permanent repercussions as determined by other
members of the project's leadership.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
available at [http://contributor-covenant.org/version/1/4][version]

[homepage]: http://contributor-covenant.org
[version]: http://contributor-covenant.org/version/1/4/
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,54 @@
PATH
  remote: .
  specs:
    s3_data_packer (0.2.0)
      aws-sdk-s3 (~> 1)
      mime-types (~> 3)

GEM
  remote: https://rubygems.org/
  specs:
    aws-eventstream (1.1.1)
    aws-partitions (1.492.0)
    aws-sdk-core (3.119.1)
      aws-eventstream (~> 1, >= 1.0.2)
      aws-partitions (~> 1, >= 1.239.0)
      aws-sigv4 (~> 1.1)
      jmespath (~> 1.0)
    aws-sdk-kms (1.47.0)
      aws-sdk-core (~> 3, >= 3.119.0)
      aws-sigv4 (~> 1.1)
    aws-sdk-s3 (1.100.0)
      aws-sdk-core (~> 3, >= 3.119.0)
      aws-sdk-kms (~> 1)
      aws-sigv4 (~> 1.1)
    aws-sigv4 (1.2.4)
      aws-eventstream (~> 1, >= 1.0.2)
    byebug (11.1.3)
    docile (1.4.0)
    jmespath (1.4.0)
    mime-types (3.3.1)
      mime-types-data (~> 3.2015)
    mime-types-data (3.2021.0704)
    minitest (5.14.4)
    rake (10.5.0)
    simplecov (0.21.2)
      docile (~> 1.1)
      simplecov-html (~> 0.11)
      simplecov_json_formatter (~> 0.1)
    simplecov-html (0.12.3)
    simplecov_json_formatter (0.1.3)

PLATFORMS
  ruby

DEPENDENCIES
  bundler (~> 1.16)
  byebug
  minitest (~> 5.0)
  rake (~> 10.0)
  s3_data_packer!
  simplecov

BUNDLED WITH
   1.16.6
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2021 Rayko

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
data/README.md
ADDED
@@ -0,0 +1,258 @@
# S3DataPacker

This small packer reads a large number of individual files from an S3 location, each representing a single
item in JSON format, and packs them into larger batches, with the option of compressing the final batch.
This reduces the total number of files and, if compression is enabled, the total storage size of the data.

The idea is to take data dumped on S3 in this way and prepare a more optimal layout for setting up a
querying system on top of it with AWS Athena.

For now, S3DataPacker supports JSON items in a one-item-per-file layout, GZip compression if enabled, and
only S3-to-S3 transfers, though the source and target buckets can be different buckets, or even on
different accounts, if the proper credentials are provided.

## Installation

Add this line to your application's Gemfile:

```ruby
gem 's3_data_packer'
```

Or use the `main` branch from the repo:

```ruby
gem 's3_data_packer', git: 'https://github.com/rayko/s3_data_packer.git', branch: 'main'
```

And then execute:

    $ bundle

Or install it yourself as:

    $ gem install s3_data_packer

## Configurations

There is a good number of options that alter how the data is consumed. Below is the list of all defaults
out of the box:

```ruby
S3DataPacker.configure do |config|
  config.logger = Logger.new('log/s3_data_packer.log') # Standard logger for information
  config.thread_count = 2 # How many threads to run
  config.thread_sleep_time = 1 # How long to wait when there's no work in the queue
  config.thread_lock_wait_time = 1 # How long to wait after a lock error before retrying
  config.max_queue_size = 10000 # How big the queue can get during processing
  config.max_queue_wait = 5 # How long to wait, once the queue reaches max_queue_size, before continuing
  config.workdir = 'tmp/s3_data_packer' # Where to keep output files until they are pushed to the target location
  config.cleanup_batch = true # Whether or not to remove pushed batches locally
  config.compress_batch = true # Whether or not to compress batches with GZip
  config.batch_size = 100000 # How many items to fit in a batch
  config.s3_api_key = nil # Default API key for an AWS account
  config.s3_api_secret = nil # Default API secret for an AWS account
  config.s3_region = nil # Default region for the buckets to use
  config.output_filename_prefix = nil # Static prefix to prepend to output filenames
  config.output_filename_suffix = 'batch' # Static suffix to append to output filenames
  config.output_filename_pattern = %i[timecode_int suffix] # Simple pattern to construct output filenames (more on that below)
  config.output_filename_splitter = '_' # Character used to join pattern elements into the final filename
end
```

### S3 Credentials

There are two main ways to set credentials, depending on the context. Buckets can be configured in place
with user-provided credentials for both source and target locations.

If the source and target locations are on the same account and region and use the same credentials, the
options above can be set so those credentials are always used.

AWS credentials in the configuration are optional, and are just a shortcut to avoid setting credentials on
each run.

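For example, credentials can be given in place when instantiating a bucket (a minimal sketch; the
environment variable names are placeholders, and `:credentials`/`:region` are the per-bucket overrides
described under Usage below):

```ruby
# Placeholder env var names; any Aws::Credentials instance works here.
credentials = Aws::Credentials.new(ENV['PACKER_AWS_KEY'], ENV['PACKER_AWS_SECRET'])

source_bucket = S3DataPacker::Sources::S3Bucket.new name: 'my-bucket',
                                                    path: 'some/location',
                                                    credentials: credentials,
                                                    region: 'us-east-1'
```
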
### Thread options

Various thread options are available to moderate how the process runs. Depending on the hardware available,
the thread count can be adjusted to speed up the process. However, if there are enough threads, the queue
might run empty too soon, in which case threads sleep for the given amount of time to let the queue gather
some items to work on.

All timing settings should be adjusted depending on where the packer is going to run and the resources
available.

### Output filename options

A couple of parameters control how filenames are generated consistently. The simplest options,
`:output_filename_prefix`, `:output_filename_suffix` and `:output_filename_splitter`, are straightforward.
The `:output_filename_pattern` option is a bit more involved: it dictates the order and the kind of values
used when generating a filename. When a new name needs to be generated, each item in the pattern is
translated to a value of some kind, and the values are joined with the `:output_filename_splitter`
character. The contents of the pattern array must be `Symbol` names and can only be one of the following:

- `:timecode_int` -> current standard time in seconds (`Time.now.to_i`)
- `:timecode_dec` -> current standard time with milliseconds (`Time.now.to_f`)
- `:number` -> a simple number that grows as new names are generated
- `:timestamp` -> simple timestamp with format YYYYMMDDhhmmss
- `:datestamp` -> simple datestamp with format YYYYMMDD
- `:prefix` -> given static string to use as a prefix in the name
- `:suffix` -> given static string to use as a suffix in the name

Different patterns generate different names with the same structure. The important part is to always
include a variable element so newer files do not overwrite previous data.

A few examples of different patterns, with the prefix set to 'data' and the suffix set to 'batch':

- `[:timecode_int, :suffix]` -> 1111111111_batch 1111111112_batch 1111111113_batch ...
- `[:datestamp, :number]` -> 20200101_1 20200101_2 20200101_3 ...
- `[:prefix, :number, :suffix]` -> data_1_batch data_2_batch data_3_batch ...

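To make the pattern semantics concrete, here is a minimal sketch of how a pattern array could be expanded
into a filename. This is illustrative only, not the gem's actual `FilenameGenerator`; the static prefix and
suffix values are hardcoded for the example:

```ruby
# Maps each pattern symbol to a value; `n` is the running batch counter.
PARTS = {
  timecode_int: ->(n) { Time.now.to_i.to_s },
  timecode_dec: ->(n) { Time.now.to_f.to_s },
  number:       ->(n) { n.to_s },
  timestamp:    ->(n) { Time.now.strftime('%Y%m%d%H%M%S') },
  datestamp:    ->(n) { Time.now.strftime('%Y%m%d') },
  prefix:       ->(n) { 'data' },  # static string, as configured
  suffix:       ->(n) { 'batch' }  # static string, as configured
}

def build_filename(pattern, count, splitter = '_')
  pattern.map { |part| PARTS.fetch(part).call(count) }.join(splitter)
end

build_filename(%i[datestamp number], 1)     # => e.g. "20200101_1"
build_filename(%i[prefix number suffix], 2) # => "data_2_batch"
```
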
## Usage

The simplest setup for this simple file processor is to set the AWS credentials and region through the
configuration as shown above. Make sure `config.workdir` is set and that the location exists on the local
machine.

To launch the packer, the only thing needed out of the box is to instantiate a source bucket and a target
bucket:

```ruby
source_bucket = S3DataPacker::Sources::S3Bucket.new name: 'my-bucket', path: 'some/location'
target_bucket = S3DataPacker::Targets::S3Bucket.new name: 'other-bucket', path: 'my/destination'
```

You can override the configured AWS credentials with the `:credentials` option, as well as the region with
`:region`. `:credentials` needs to be an instance of `Aws::Credentials`. Setting it up this way should
allow for more complex role handling, since the instance passed in the `:credentials` option is fed
directly to `Aws::S3::Resource` and `Aws::S3::Client` to interface with the S3 buckets.

Once the buckets are instantiated you can call the packer:

```ruby
packer = S3DataPacker::Packer.new source: source_bucket, target: target_bucket
packer.pack!
```

### How does it work?

Based on the sample above, once `#pack!` is called a set of threads boots up, a new file is opened in
`config.workdir` (without further configuration it will generally be named something like
`123123123_batch.json`), and the packer starts iterating over all keys under the source path
`some/location`.

Each listed key enters the queue for the threads; the threads then take each key in the queue, download
the data into memory (no intermediate file is created for it), append the data to the currently opened
batch, and continue with the next key.

As items are appended, when the target size `config.batch_size` is reached, the current batch is closed,
compressed with GZip, and uploaded to the target bucket at the specified location `my/destination`. Once
the file is pushed, the local copy is deleted, and a new batch is opened to continue appending items.

When all the keys have been listed, the packer waits for the threads to finish any remaining items in the
queue, and the last open batch, which likely hasn't reached the target size, is then closed and pushed
like the others.

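In outline, the flow can be pictured with a small toy script like this. It is an illustrative sketch only,
not the gem's actual implementation; `fetch` stands in for the S3 download and the arrays stand in for
batch files:

```ruby
queue      = SizedQueue.new(100)            # bounded, like max_queue_size
batch_size = 5
batches    = []                             # stands in for "closed and uploaded" batches
current    = []
mutex      = Mutex.new

fetch = ->(key) { %({"id": #{key}}) }       # stand-in for downloading one S3 object

workers = Array.new(2) do                   # like thread_count
  Thread.new do
    while (key = queue.pop)                 # nil signals shutdown
      data = fetch.call(key)
      mutex.synchronize do
        current << data
        if current.size >= batch_size       # batch full: "close, compress, upload"
          batches << current
          current = []
        end
      end
    end
  end
end

(1..12).each { |key| queue << key }         # the key listing ("each_key")
workers.size.times { queue << nil }         # tell workers there is no more work
workers.each(&:join)
batches << current unless current.empty?    # push the final partial batch
```
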
And that's basically it. There are a few places where additional processing could be introduced, but that's
a feature for later.

There is nothing special about the source and target buckets; they can be the same bucket, or on different
accounts or regions. However, it is not recommended to set the source and target to the same bucket and
path.

### Custom Sources/Targets

It is possible to define a custom source and target so the packer reads data from somewhere other than an
S3 bucket, and puts the resulting batches somewhere else. The `S3DataPacker::Packer` takes `:source` and
`:target` parameters for this. At the moment, there are 2 source classes provided:

- `S3DataPacker::Sources::S3Bucket`
- `S3DataPacker::Sources::Object`

And 2 pre-defined targets:

- `S3DataPacker::Targets::S3Bucket`
- `S3DataPacker::Targets::Object`

Both bucket-related classes operate in the same way: you define the name and path of the buckets to read
and write the data, as in the main example above. Be sure to configure credentials to use these.

The object source is essentially a wrapper around some other custom object, passing down which methods the
packer should call on it. Any object you pass to the object source needs to respond to:

- `#name`: mostly used for logging
- `#each`: takes a block and iterates over items
- `#fetch`: takes an identifier and returns the actual data of the item

The `#each` and `#fetch` methods are split like this mainly because the packer is threaded: it expects to
iterate over keys, IDs or some other small piece of information in one thread, and use that information to
retrieve the full object data on other threads. This keeps the queue small in byte size.

By default the object source expects those method names to be defined on the object provided. If the
object already has methods that do this under different names, the method names can be passed like so:

```ruby
S3DataPacker::Sources::Object.new object: my_object,
                                  each_method: :iterate,
                                  fetch_method: :find,
                                  name_method: :display_name
```

As long as `#each` yields items (strings, IDs, whatever) and `#fetch` returns JSON data for an item, this
should work.

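For instance, a folder of local JSON files could be wrapped as a source like this (a hypothetical class
written against the contract above; the glob path is a placeholder):

```ruby
class FolderSource
  # Used mostly for logging
  def name
    'local-folder'
  end

  # Yields lightweight identifiers (here, file paths)
  def each(&block)
    Dir.glob('data/items/*.json').each(&block)
  end

  # Resolves an identifier to the item's JSON data
  def fetch(path)
    File.read(path)
  end
end

source = S3DataPacker::Sources::Object.new object: FolderSource.new
```
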
For targets, there's also `S3DataPacker::Targets::Object`, which can be used in a similar way; the only 2
methods for it are:

- `#name`: for the same purpose as the sources' `#name` method
- `#save_file`: takes a path parameter

It can also be configured with other method names if needed:

```ruby
S3DataPacker::Targets::Object.new object: my_object,
                                  name_method: :custom_name,
                                  save_file_method: :save!
```

It is also possible to build a custom source/target class outside of the pre-defined ones that does
anything needed, and pass it down for the packer instance to use. As long as the few needed methods are
there, it should work just fine.

In some cases it might be useful to unify the iterate/fetch mechanics. This can be done by bypassing
`#fetch`: if the iterator behind `#each` needs to yield the actual data right there, writing a `#fetch`
method that simply returns whatever was passed in effectively makes the packer's queue hold the actual
data. This might be useful in some cases, though it may call for a smaller `max_queue_size` to prevent
holding too much data in the queue.

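A minimal sketch of that pass-through idea (the class is hypothetical):

```ruby
class InlineSource
  def initialize(items)
    @items = items # e.g. an array of JSON strings
  end

  def name
    'inline'
  end

  # Yields the actual data instead of a key or ID
  def each(&block)
    @items.each(&block)
  end

  # Identity fetch: the "identifier" already is the data
  def fetch(item)
    item
  end
end
```
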
I believe that with these tools the packer can handle JSON packing in most cases, including:

- Reading database records and serializing them into JSON
- Reading S3 buckets (as originally intended)
- Reading NoSQL items
- Reading one file or a set of files
- Writing batches into S3 buckets (as originally intended)
- Writing batches into the filesystem at some custom location
- Writing batches into some other custom location

At the very least, it covers the cases where I intend to use it.

## Development

After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake test` to run the
tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.

## Contributing

Bug reports and pull requests are welcome on GitHub at https://github.com/rayko/s3_data_packer.
This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to
adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.

## License

The gem is available as open source under the terms of the [MIT License](https://opensource.org/licenses/MIT).

## Code of Conduct

Everyone interacting in the S3DataPacker project's codebases, issue trackers, chat rooms and mailing lists
is expected to follow the [code of conduct](https://github.com/rayko/s3_data_packer/blob/master/CODE_OF_CONDUCT.md).
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
#!/usr/bin/env ruby

require "bundler/setup"
require "s3_data_packer"

# You can add fixtures and/or initialization code here to make experimenting
# with your gem easier. You can also use a different console, if you like.

# (If you use this, don't forget to add pry to your Gemfile!)
# require "pry"
# Pry.start

require "irb"
IRB.start(__FILE__)
data/bin/setup
ADDED
data/lib/s3_data_packer/bucket.rb
ADDED
@@ -0,0 +1,88 @@
module S3DataPacker
  class Bucket
    attr_reader :bucket_name, :path

    def initialize opts = {}
      @bucket_name = opts[:bucket_name]
      @credentials = opts[:credentials]
      @region = opts[:region]
      @path = opts[:path]
    end

    def credentials
      @credentials ||= S3DataPacker.config.default_s3_credentials
    end

    def region
      @region ||= S3DataPacker.config.s3_region
    end

    def logger
      @logger ||= S3DataPacker.logger
    end

    def each_key &block
      bucket.objects(prefix: path).each do |item|
        yield item.key
      end
    end

    def exist?(key)
      request! { object(key).exists? }
    end

    def download(key)
      data = request! { object(key).get }
      logger.warn "missing key #{key}" unless data
      return nil unless data
      data.body.read
    end

    def upload(file, opts={})
      raise ArgumentError, 'File does not exist' unless File.exist?(file)
      key = "#{path}/#{File.basename(file)}"
      raise ArgumentError, "File #{File.basename(file)} already exists in target location" if exist?(key)
      metadata = opts
      metadata[:content_type] ||= file_mime_type(file)
      metadata[:content_disposition] ||= 'attachment'
      request! { object(key).upload_file(file, metadata) }
      logger.info "Uploaded #{file} to s3://#{bucket_name}/#{key}"
    end

    private

    def request! &block
      begin
        yield
      rescue Aws::S3::Errors::InternalError
        logger.warn "Aws::S3::Errors::InternalError, retrying in 1 second"
        sleep(1)
        retry
      rescue Aws::S3::Errors::NoSuchKey
        return nil
      end
    end

    def file_mime_type(file)
      begin
        MIME::Types.type_for(file).first.content_type
      rescue StandardError
        logger.error "Could not guess MIME type of #{file}"
        return nil
      end
    end

    def object(key)
      bucket.object(key)
    end

    def bucket
      @bucket ||= resource.bucket(bucket_name)
    end

    def resource
      @resource ||= ::Aws::S3::Resource.new(region: region, credentials: credentials)
    end

  end
end