bricolage-streamingload 0.15.0 → 0.15.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +5 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +54 -0
- data/LICENSES +21 -0
- data/RELEASE.md +116 -0
- data/Rakefile +3 -0
- data/bricolage-streamingload.gemspec +26 -0
- data/config/production/database.yml +66 -0
- data/config/production/password.yml +5 -0
- data/config/production/streamingload.yml +20 -0
- data/config/production/variable.yml +5 -0
- data/lib/bricolage/streamingload/jobparams.rb +1 -1
- data/lib/bricolage/streamingload/version.rb +1 -1
- data/sample/sqs-message.txt +38 -0
- data/sample/sqs-result.txt +18 -0
- data/strload_load_logs.ct +13 -0
- data/testschema/strload_test.ct +11 -0
- data/testschema/with_work_table.job +4 -0
- data/testschema/with_work_table.sql +1 -0
- data/utils/init_strload_tables.sql +13 -0
- data/utils/strload-stat.sql +36 -0
- metadata +35 -33
- data/test/all.rb +0 -3
- data/test/streamingload/test_dispatcher.rb +0 -241
- data/test/streamingload/test_dispatchermessage.rb +0 -31
- data/test/streamingload/test_job.rb +0 -620
- data/test/test_sqsdatasource.rb +0 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 3e5d795da31bb8940f14d95be7ae2710ffdf026ab07bed08c619c20b25ce91e6
|
4
|
+
data.tar.gz: ce467fbb158a9a9a3a42eaf07a23c9741cf0f8721511a2cf66295d3f59f0c131
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22008a5432ecf609084750804e39b5ac20e71f477c86cdb71066b4c5adea0a370ee15316bfba9972196ad431b0751afa4ea1f18b9c0d1524acd9f1a896994ad6
|
7
|
+
data.tar.gz: 84b45b8a683d971965ce2162ce5a2e9b47ffff015242290266dd64409e9c9890c7c52c5ed692a91010cbed076de79d1383d9f1c0d31619924f158dbb789ed76c
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
bricolage-streamingload (0.15.1)
|
5
|
+
aws-sdk-s3 (~> 1.8)
|
6
|
+
aws-sdk-sqs (~> 1.3)
|
7
|
+
bricolage (>= 5.29.2)
|
8
|
+
pg (~> 0.18.0)
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: https://rubygems.org/
|
12
|
+
specs:
|
13
|
+
aws-eventstream (1.0.1)
|
14
|
+
aws-partitions (1.100.0)
|
15
|
+
aws-sdk-core (3.24.1)
|
16
|
+
aws-eventstream (~> 1.0)
|
17
|
+
aws-partitions (~> 1.0)
|
18
|
+
aws-sigv4 (~> 1.0)
|
19
|
+
jmespath (~> 1.0)
|
20
|
+
aws-sdk-kms (1.7.0)
|
21
|
+
aws-sdk-core (~> 3)
|
22
|
+
aws-sigv4 (~> 1.0)
|
23
|
+
aws-sdk-s3 (1.17.0)
|
24
|
+
aws-sdk-core (~> 3, >= 3.21.2)
|
25
|
+
aws-sdk-kms (~> 1)
|
26
|
+
aws-sigv4 (~> 1.0)
|
27
|
+
aws-sdk-sns (1.3.0)
|
28
|
+
aws-sdk-core (~> 3)
|
29
|
+
aws-sigv4 (~> 1.0)
|
30
|
+
aws-sdk-sqs (1.4.0)
|
31
|
+
aws-sdk-core (~> 3)
|
32
|
+
aws-sigv4 (~> 1.0)
|
33
|
+
aws-sigv4 (1.0.3)
|
34
|
+
bricolage (5.29.2)
|
35
|
+
aws-sdk-s3 (~> 1)
|
36
|
+
aws-sdk-sns (~> 1)
|
37
|
+
pg (~> 0.18.0)
|
38
|
+
jmespath (1.4.0)
|
39
|
+
pg (0.18.4)
|
40
|
+
power_assert (1.1.3)
|
41
|
+
rake (12.3.1)
|
42
|
+
test-unit (3.2.8)
|
43
|
+
power_assert
|
44
|
+
|
45
|
+
PLATFORMS
|
46
|
+
ruby
|
47
|
+
|
48
|
+
DEPENDENCIES
|
49
|
+
bricolage-streamingload!
|
50
|
+
rake
|
51
|
+
test-unit
|
52
|
+
|
53
|
+
BUNDLED WITH
|
54
|
+
1.16.1
|
data/LICENSES
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2016 Minero Aoki, Shimpei Kodama
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/RELEASE.md
ADDED
@@ -0,0 +1,116 @@
|
|
1
|
+
# Bricolage Streaming Load Release Note
|
2
|
+
|
3
|
+
## version 0.15.1
|
4
|
+
|
5
|
+
- [new] Use the schema name in the parameters table by default (no more "no such variable: XXXX_schema" error).
|
6
|
+
|
7
|
+
## version 0.15.0
|
8
|
+
|
9
|
+
- [CHANGE] Upgrade AWS-SDK to v3. This version requires at least Bricolage 5.25.
|
10
|
+
|
11
|
+
## version 0.14.2
|
12
|
+
|
13
|
+
- Reduces loading retry count
|
14
|
+
|
15
|
+
## version 0.14.1
|
16
|
+
|
17
|
+
- [fix] task logging did not work
|
18
|
+
|
19
|
+
## version 0.14.0
|
20
|
+
|
21
|
+
- [new] Logs (task - object) relationships to S3 on dispatch, as a DB backup.
|
22
|
+
|
23
|
+
## version 0.13.0
|
24
|
+
|
25
|
+
- grand refactoring
|
26
|
+
|
27
|
+
## version 0.12.0
|
28
|
+
|
29
|
+
- [CHANGE] Adds task_id column to the log table (strload_load_logs).
|
30
|
+
|
31
|
+
## version 0.11.0
|
32
|
+
|
33
|
+
- Loosen dependent Bricolage version
|
34
|
+
|
35
|
+
## version 0.10.2
|
36
|
+
|
37
|
+
- [new] New parameter dispatch-interval.
|
38
|
+
|
39
|
+
## version 0.10.1
|
40
|
+
|
41
|
+
- [fix] Fixes simple variable ref bug.
|
42
|
+
|
43
|
+
## version 0.10.0
|
44
|
+
|
45
|
+
- [new] Automatically complement strload_jobs status with Redshift-side log table.
|
46
|
+
|
47
|
+
## version 0.9.0
|
48
|
+
|
49
|
+
- [new] Introduces Redshift-side load log table (strload_load_logs) for load duplication checking.
|
50
|
+
|
51
|
+
## version 0.8.1
|
52
|
+
|
53
|
+
- [fix] tmp: Do not retry on data connection failure.
|
54
|
+
|
55
|
+
## version 0.8.0
|
56
|
+
|
57
|
+
- [CHANGE] Loader retries failed load tasks automatically. Streaming loader does NOT delete a task from the queue on job failures.
|
58
|
+
- [enhancement] Rewrites loader to get better error handling (ensures a log record is written in a wider range of erroneous situations).
|
59
|
+
|
60
|
+
## version 0.7.1
|
61
|
+
|
62
|
+
- fix utilities
|
63
|
+
|
64
|
+
## version 0.7.0
|
65
|
+
|
66
|
+
- [CHANGE] SQS data source requires "region" attribute.
|
67
|
+
|
68
|
+
## version 0.6.2
|
69
|
+
|
70
|
+
- [new] AWS access key id and secret key are now optional for SQS data sources (to allow using EC2 instance or ECS task attached IAM roles).
|
71
|
+
- [new] New utility commands send-data-event, send-shutdown, send-checkpoint, send-load-task.
|
72
|
+
- Adds sample config files.
|
73
|
+
|
74
|
+
## version 0.6.1
|
75
|
+
|
76
|
+
- [fix] dispatcher: Default ctl data source was wrong.
|
77
|
+
- [fix] dispatcher: Detects S3 events by "s3" attribute instead of "eventSource" attribute, to allow fake S3 events (from non-S3 system).
|
78
|
+
- [fix] dispatcher: SNS alert is now optional.
|
79
|
+
- [fix] dispatcher: Correctly deletes unknown format messages.
|
80
|
+
- [enhancement] Adds more logging messages.
|
81
|
+
|
82
|
+
## version 0.6.0
|
83
|
+
|
84
|
+
- [CHANGE] Adds loaded column to strload_objects table to record if the object is really loaded or not.
|
85
|
+
- [CHANGE] Now strload_objects' object_url is unique. Duplicated objects are stored in another table, strload_dup_objects.
|
86
|
+
- [CHANGE] Now strload_table has table_id column, which is the primary key.
|
87
|
+
- [new] Loader daemon supports new command line option --working-dir, to support symbolic linked path, such as Capistrano deploy target (current/).
|
88
|
+
- [new] Keeps Redshift manifest file for later inspection.
|
89
|
+
- [enhancement] Reduces the number of Redshift writer transactions (1 transaction for 1 loading).
|
90
|
+
- [enhancement] Delay dispatching tasks until current event batch is processed, to avoid unexpected visibility timeout.
|
91
|
+
- [enhancement] Adds more logging messages.
|
92
|
+
|
93
|
+
## version 0.5.1
|
94
|
+
|
95
|
+
- [fix] Fixes slow query
|
96
|
+
|
97
|
+
## version 0.5.0
|
98
|
+
|
99
|
+
- [new] Introduces FLUSHTABLE dispatcher event
|
100
|
+
|
101
|
+
## version 0.4.0
|
102
|
+
|
103
|
+
- [new] Introduces CHECKPOINT dispatcher event
|
104
|
+
|
105
|
+
## version 0.3.0
|
106
|
+
|
107
|
+
- [new] Supports SNS notification
|
108
|
+
|
109
|
+
## version 0.2.0
|
110
|
+
|
111
|
+
- not released
|
112
|
+
- [fix] Fixes async delete timing
|
113
|
+
|
114
|
+
## version 0.1.0
|
115
|
+
|
116
|
+
- 2016-07-13 works 1 month
|
data/Rakefile
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative 'lib/bricolage/streamingload/version'
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.platform = Gem::Platform::RUBY
|
5
|
+
s.name = 'bricolage-streamingload'
|
6
|
+
s.version = Bricolage::StreamingLoad::VERSION
|
7
|
+
s.summary = 'Bricolage Streaming Load Daemon'
|
8
|
+
s.description = 'Bricolage Streaming Load Daemon loads S3 data files to Redshift continuously.'
|
9
|
+
s.license = 'MIT'
|
10
|
+
|
11
|
+
s.author = ['Minero Aoki', 'Shimpei Kodama']
|
12
|
+
s.email = ['aamine@loveruby.net']
|
13
|
+
s.homepage = 'https://github.com/aamine/bricolage-streamingload'
|
14
|
+
|
15
|
+
s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
|
16
|
+
s.executables = s.files.grep(%r{bin/}).map {|path| File.basename(path) }
|
17
|
+
s.require_path = 'lib'
|
18
|
+
|
19
|
+
s.required_ruby_version = '>= 2.3.0'
|
20
|
+
s.add_dependency 'bricolage', '>= 5.29.2'
|
21
|
+
s.add_dependency 'pg', '~> 0.18.0'
|
22
|
+
s.add_dependency 'aws-sdk-s3', '~> 1.8'
|
23
|
+
s.add_dependency 'aws-sdk-sqs', '~> 1.3'
|
24
|
+
s.add_development_dependency 'rake'
|
25
|
+
s.add_development_dependency 'test-unit'
|
26
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
### Databases
|
2
|
+
|
3
|
+
db_ctl:
|
4
|
+
type: psql
|
5
|
+
host: localhost
|
6
|
+
port: 5432
|
7
|
+
database: bricolage
|
8
|
+
username: bricolage
|
9
|
+
# Get password from password.yml
|
10
|
+
password: <%= password 'postgres_bricolage_password' %>
|
11
|
+
encoding: utf8
|
12
|
+
|
13
|
+
db_data: &db_data
|
14
|
+
type: psql
|
15
|
+
host: redshift.host
|
16
|
+
port: 5439
|
17
|
+
database: production
|
18
|
+
username: bricolage
|
19
|
+
# Get password from password.yml
|
20
|
+
password: <%= password 'redshift_bricolage_password' %>
|
21
|
+
encoding: utf8
|
22
|
+
|
23
|
+
sql:
|
24
|
+
<<: *db_data
|
25
|
+
|
26
|
+
### SQS
|
27
|
+
|
28
|
+
sqs_event:
|
29
|
+
type: sqs
|
30
|
+
region: ap-northeast-1
|
31
|
+
url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-events
|
32
|
+
max_number_of_messages: 10
|
33
|
+
visibility_timeout: 600
|
34
|
+
wait_time_seconds: 10
|
35
|
+
# Enable following lines if you use access key explicitly.
|
36
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
37
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
38
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
39
|
+
|
40
|
+
sqs_task:
|
41
|
+
type: sqs
|
42
|
+
region: ap-northeast-1
|
43
|
+
url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-tasks
|
44
|
+
max_number_of_messages: 1
|
45
|
+
visibility_timeout: 1800
|
46
|
+
wait_time_seconds: 10
|
47
|
+
# Enable following lines if you use access key explicitly.
|
48
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
49
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
50
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
51
|
+
|
52
|
+
### S3
|
53
|
+
|
54
|
+
s3_ctl: &s3_ctl
|
55
|
+
type: s3
|
56
|
+
region: ap-northeast-1
|
57
|
+
endpoint: s3-ap-northeast-1.amazonaws.com
|
58
|
+
bucket: bricolagectl.ap-northeast-1
|
59
|
+
prefix: development/strload
|
60
|
+
# Enable following lines if you use access key explicitly.
|
61
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
62
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
63
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
64
|
+
|
65
|
+
s3:
|
66
|
+
<<: *s3_ctl
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#event-queue-ds: sqs_event
|
2
|
+
#task-queue-ds: sqs_task
|
3
|
+
|
4
|
+
#ctl-postgres-ds: db_ctl
|
5
|
+
#ctl-s3-ds: s3_ctl
|
6
|
+
|
7
|
+
#redshift-ds: db_data
|
8
|
+
#log-table: strload_load_logs
|
9
|
+
|
10
|
+
#dispatch-interval: 60
|
11
|
+
|
12
|
+
# To Enable SNS notification
|
13
|
+
#sns-ds: sns
|
14
|
+
#alert-level: warn
|
15
|
+
|
16
|
+
url_patterns:
|
17
|
+
-
|
18
|
+
url: "s3://some-log-bucket/\\w{4}\\.\\w+?\\.\\w+?\\.(?<schema>\\w+)\\.(?<table>\\w+)/\\d{4}/\\d{2}/\\d{2}/.*\\.gz"
|
19
|
+
schema: "%schema"
|
20
|
+
table: "%table"
|
@@ -0,0 +1,38 @@
|
|
1
|
+
{
|
2
|
+
"Records" => [
|
3
|
+
{
|
4
|
+
"eventVersion" => "2.0",
|
5
|
+
"eventSource" => "aws:s3",
|
6
|
+
"awsRegion" => "ap-northeast-1",
|
7
|
+
"eventTime" => "2016-02-13T11:40:08.001Z",
|
8
|
+
"eventName" => "ObjectCreated:Put",
|
9
|
+
"userIdentity" => {
|
10
|
+
"principalId" => "AWS:AIXXXXXXXXXXXXXXXXX6A"
|
11
|
+
},
|
12
|
+
"requestParameters" => {
|
13
|
+
"sourceIPAddress" => "111.222.111.90"
|
14
|
+
},
|
15
|
+
"responseElements" => {
|
16
|
+
"x-amz-request-id" => "1111AAAA9999AAAA",
|
17
|
+
"x-amz-id-2" => "6p9IZG+R+xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxFyXiNMMB4="
|
18
|
+
},
|
19
|
+
"s3" => {
|
20
|
+
"s3SchemaVersion" => "1.0",
|
21
|
+
"configurationId" => "ObjectCreatedEvents",
|
22
|
+
"bucket" => {
|
23
|
+
"name" => "some-bucket",
|
24
|
+
"ownerIdentity" => {
|
25
|
+
"principalId" => "AAAAIIIIBBBB11"
|
26
|
+
},
|
27
|
+
"arn" => "arn:aws:s3:::some-bucket"
|
28
|
+
},
|
29
|
+
"object" => {
|
30
|
+
"key" => "development/logs/schema.table/20160125_0000_0_de37c5ad-d26a-42cc-a141-475676b65f69.gz",
|
31
|
+
"size" => 1302957,
|
32
|
+
"eTag" => "d704db7f9cb77b1ecb981c534526b542",
|
33
|
+
"sequencer" => "111122223333444499"
|
34
|
+
}
|
35
|
+
}
|
36
|
+
}
|
37
|
+
]
|
38
|
+
}
|
@@ -0,0 +1,18 @@
|
|
1
|
+
#<struct Aws::SQS::Types::ReceiveMessageResult
|
2
|
+
messages = [
|
3
|
+
#<struct Aws::SQS::Types::Message
|
4
|
+
message_id = "11112222-b37f-4281-aee2-696408c482be",
|
5
|
+
receipt_handle = "AQEBErp4Vv159Hvx5oXSioWu7Ov1Jj1ht423LwTTUlgLohmKh02gAMAZ7kN1J4+aLzM54BRuMatnifWPH3cBh6rtdSWlllysBVmXDRKj83VCaseKPpq+Vdw/bLZrPEYRSGPuhKsHa2DFjo67KeND4AqasZO7lOCJX+YcsbqsqAtcODlDWlVbfedPkKIGM3Scra3uY9ysGjNx/zWuJ88fCtvRpu9tgk0nG7srHAm84Mxj5ArakJnKnJMbtkD/+lvm0Vqi2YCkXonS8+BP8gK4kSTbl1uv/Gp7ZfxkjKQJBB8xQiYo7e7qKLwhbImMZtO5tqDKzIKFVfMi1G2ODF+tEj4Ce+ryMRGA50GhON2ETCyofsl1T7Wdr61IEOV0NFlonGTWFf4q/1r3OPhFAchp+tYnyA==",
|
6
|
+
md5_of_body = "00006bf43abdff178ca0ffa96205aaaa",
|
7
|
+
body = "{\"Records\":[{\"eventVersion\":\"2.0\",\"eventSource\":\"aws:s3\",\"awsRegion\":\"ap-northeast-1\",\"eventTime\":\"2016-02-13T11:40:07.268Z\",\"eventName\":\"ObjectCreated:Put\",\"userIdentity\":{\"principalId\":\"AWS:AAAAJKS3A4VEF45XCAAAA\"},\"requestParameters\":{\"sourceIPAddress\":\"111.222.111.90\"},\"responseElements\":{\"x-amz-request-id\":\"10AAAA31A1EDCCCC\",\"x-amz-id-2\":\"YT+U/PxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxzfFGPjqP5AQg=\"},\"s3\":{\"s3SchemaVersion\":\"1.0\",\"configurationId\":\"LogStreamDev\",\"bucket\":{\"name\":\"redshift-copy-buffer\",\"ownerIdentity\":{\"principalId\":\"AAAAFIDWR40000\"},\"arn\":\"arn:aws:s3:::redshift-copy-buffer\"},\"object\":{\"key\":\"development/logs/schema.table/20160125_0000_0_2d0b43ad-0db9-4655-bd98-b7464b123763.gz\",\"size\":1238953,\"eTag\":\"aaaa196c3935f4957c7bb645f9780000\",\"sequencer\":\"0000BF161697C1AAAA\"}}}]}",
|
8
|
+
attributes = {
|
9
|
+
"SenderId" => "FACE0VEO02BJMF37H2JKW",
|
10
|
+
"ApproximateFirstReceiveTimestamp" => "1455364193429",
|
11
|
+
"ApproximateReceiveCount" => "1",
|
12
|
+
"SentTimestamp" => "1455363607341"
|
13
|
+
},
|
14
|
+
md5_of_message_attributes = nil,
|
15
|
+
message_attributes = {}
|
16
|
+
>
|
17
|
+
]
|
18
|
+
>
|
@@ -0,0 +1,13 @@
|
|
1
|
+
--dest-table: bricolage.strload_load_logs
|
2
|
+
|
3
|
+
/*
|
4
|
+
Redshift-side log table
|
5
|
+
*/
|
6
|
+
create table $dest_table
|
7
|
+
( job_id bigint encode raw
|
8
|
+
, task_id bigint encode zstd
|
9
|
+
, finish_time timestamp encode delta
|
10
|
+
)
|
11
|
+
distkey (job_id)
|
12
|
+
sortkey (job_id)
|
13
|
+
;
|