bricolage-streamingload 0.14.1 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +5 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +54 -0
- data/LICENSES +21 -0
- data/RELEASE.md +127 -0
- data/Rakefile +3 -0
- data/bin/send-data-event +1 -1
- data/bricolage-streamingload.gemspec +26 -0
- data/config/production/database.yml +66 -0
- data/config/production/password.yml +5 -0
- data/config/production/streamingload.yml +20 -0
- data/config/production/variable.yml +5 -0
- data/lib/bricolage/sqsdatasource.rb +1 -1
- data/lib/bricolage/streamingload/dispatcher.rb +0 -1
- data/lib/bricolage/streamingload/job.rb +8 -2
- data/lib/bricolage/streamingload/jobparams.rb +1 -1
- data/lib/bricolage/streamingload/manifest.rb +2 -0
- data/lib/bricolage/streamingload/taskhandler.rb +5 -2
- data/lib/bricolage/streamingload/version.rb +1 -1
- data/sample/sqs-message.txt +38 -0
- data/sample/sqs-result.txt +18 -0
- data/strload_load_logs.ct +13 -0
- data/testschema/strload_test.ct +11 -0
- data/testschema/testlog.json.gz +0 -0
- data/testschema/with_work_table.job +4 -0
- data/testschema/with_work_table.sql +1 -0
- data/utils/init_strload_tables.sql +13 -0
- data/utils/strload-stat.sql +36 -0
- metadata +39 -23
- data/test/all.rb +0 -3
- data/test/streamingload/test_dispatcher.rb +0 -241
- data/test/streamingload/test_dispatchermessage.rb +0 -31
- data/test/streamingload/test_job.rb +0 -590
- data/test/test_sqsdatasource.rb +0 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: e799760fb58dd3ec1dcd3eeaf74b068a0336a21a32488f04dc0353d44514c0cd
|
4
|
+
data.tar.gz: 1ec9bfe0cadd6537a352e99cf7b61d53122ae17a67dd31c733e4e2e0c7890849
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 700157b7be5a53753056d7a830d5502c117cbcae474a97be17a72cb18b58a8aa48ea927aaf333040e6674c098413e91298a6bfe112b1fcd58293d32c4b207068
|
7
|
+
data.tar.gz: bb63e1c5e7f646003ed07c0f12758cb79bd316227033d43ae2cb0165a203d3706bfdf6ea2eae8ef681e8d9c02285fcca5a98ab4f8d7b0c3a78ab3a11b853fe31
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
bricolage-streamingload (0.15.2)
|
5
|
+
aws-sdk-s3 (~> 1.8)
|
6
|
+
aws-sdk-sqs (~> 1.3)
|
7
|
+
bricolage (>= 5.29.2)
|
8
|
+
pg (~> 0.18.0)
|
9
|
+
|
10
|
+
GEM
|
11
|
+
remote: https://rubygems.org/
|
12
|
+
specs:
|
13
|
+
aws-eventstream (1.0.1)
|
14
|
+
aws-partitions (1.102.0)
|
15
|
+
aws-sdk-core (3.25.0)
|
16
|
+
aws-eventstream (~> 1.0)
|
17
|
+
aws-partitions (~> 1.0)
|
18
|
+
aws-sigv4 (~> 1.0)
|
19
|
+
jmespath (~> 1.0)
|
20
|
+
aws-sdk-kms (1.7.0)
|
21
|
+
aws-sdk-core (~> 3)
|
22
|
+
aws-sigv4 (~> 1.0)
|
23
|
+
aws-sdk-s3 (1.17.1)
|
24
|
+
aws-sdk-core (~> 3, >= 3.21.2)
|
25
|
+
aws-sdk-kms (~> 1)
|
26
|
+
aws-sigv4 (~> 1.0)
|
27
|
+
aws-sdk-sns (1.3.0)
|
28
|
+
aws-sdk-core (~> 3)
|
29
|
+
aws-sigv4 (~> 1.0)
|
30
|
+
aws-sdk-sqs (1.4.0)
|
31
|
+
aws-sdk-core (~> 3)
|
32
|
+
aws-sigv4 (~> 1.0)
|
33
|
+
aws-sigv4 (1.0.3)
|
34
|
+
bricolage (5.29.2)
|
35
|
+
aws-sdk-s3 (~> 1)
|
36
|
+
aws-sdk-sns (~> 1)
|
37
|
+
pg (~> 0.18.0)
|
38
|
+
jmespath (1.4.0)
|
39
|
+
pg (0.18.4)
|
40
|
+
power_assert (1.1.3)
|
41
|
+
rake (12.3.1)
|
42
|
+
test-unit (3.2.8)
|
43
|
+
power_assert
|
44
|
+
|
45
|
+
PLATFORMS
|
46
|
+
ruby
|
47
|
+
|
48
|
+
DEPENDENCIES
|
49
|
+
bricolage-streamingload!
|
50
|
+
rake
|
51
|
+
test-unit
|
52
|
+
|
53
|
+
BUNDLED WITH
|
54
|
+
1.16.1
|
data/LICENSES
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2016 Minero Aoki, Shimpei Kodama
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/RELEASE.md
ADDED
@@ -0,0 +1,127 @@
|
|
1
|
+
# Bricolage Streaming Load Release Note
|
2
|
+
|
3
|
+
## version 0.16.0
|
4
|
+
|
5
|
+
- [new] Retries error tasks as well, not only failure tasks.
|
6
|
+
Error tasks are assumed to be "non-retriable" (e.g. DB login errors), but some of them can actually be resolved
|
7
|
+
by retrying in an ECS environment. Since retrying is not expensive (and human operation is relatively expensive),
|
8
|
+
we'll retry on all failures and errors.
|
9
|
+
|
10
|
+
## version 0.15.2
|
11
|
+
|
12
|
+
- [fix] send-data-event: Send objectSize=0 to indicate this message is fake.
|
13
|
+
|
14
|
+
## version 0.15.1
|
15
|
+
|
16
|
+
- [new] Use the schema name in the parameters table by default (no more "no such variable: XXXX_schema" error).
|
17
|
+
|
18
|
+
## version 0.15.0
|
19
|
+
|
20
|
+
- [CHANGE] Upgrade AWS-SDK to v3. This version requires at least Bricolage 5.25.
|
21
|
+
|
22
|
+
## version 0.14.2
|
23
|
+
|
24
|
+
- Reduces loading retry count
|
25
|
+
|
26
|
+
## version 0.14.1
|
27
|
+
|
28
|
+
- [fix] task logging did not work
|
29
|
+
|
30
|
+
## version 0.14.0
|
31
|
+
|
32
|
+
- [new] Logs (task - object) relationships to S3 on dispatch, as a DB backup.
|
33
|
+
|
34
|
+
## version 0.13.0
|
35
|
+
|
36
|
+
- grand refactoring
|
37
|
+
|
38
|
+
## version 0.12.0
|
39
|
+
|
40
|
+
- [CHANGE] Adds task_id column to the log table (strload_load_logs).
|
41
|
+
|
42
|
+
## version 0.11.0
|
43
|
+
|
44
|
+
- Loosen dependent Bricolage version
|
45
|
+
|
46
|
+
## version 0.10.2
|
47
|
+
|
48
|
+
- [new] New parameter dispatch-interval.
|
49
|
+
|
50
|
+
## version 0.10.1
|
51
|
+
|
52
|
+
- [fix] Fixes simple variable ref bug.
|
53
|
+
|
54
|
+
## version 0.10.0
|
55
|
+
|
56
|
+
- [new] Automatically complement strload_jobs status with Redshift-side log table.
|
57
|
+
|
58
|
+
## version 0.9.0
|
59
|
+
|
60
|
+
- [new] Introduces Redshift-side load log table (strload_load_logs) for load duplication checking.
|
61
|
+
|
62
|
+
## version 0.8.1
|
63
|
+
|
64
|
+
- [fix] tmp: Do not retry on data connection failure.
|
65
|
+
|
66
|
+
## version 0.8.0
|
67
|
+
|
68
|
+
- [CHANGE] Loader retries failed load tasks automatically. Streaming loader does NOT delete a task from the queue on job failures.
|
69
|
+
- [enhancement] Rewrites loader to get better error handling (ensures a log record is written in a wider range of erroneous situations).
|
70
|
+
|
71
|
+
## version 0.7.1
|
72
|
+
|
73
|
+
- fix utilities
|
74
|
+
|
75
|
+
## version 0.7.0
|
76
|
+
|
77
|
+
- [CHANGE] SQS data source requires "region" attribute.
|
78
|
+
|
79
|
+
## version 0.6.2
|
80
|
+
|
81
|
+
- [new] AWS access key id and secret key are now optional for SQS data sources (to allow using EC2 instance or ECS task attached IAM roles).
|
82
|
+
- [new] New utility commands send-data-event, send-shutdown, send-checkpoint, send-load-task.
|
83
|
+
- Adds sample config files.
|
84
|
+
|
85
|
+
## version 0.6.1
|
86
|
+
|
87
|
+
- [fix] dispatcher: Default ctl data source was wrong.
|
88
|
+
- [fix] dispatcher: Detects S3 events by "s3" attribute instead of "eventSource" attribute, to allow fake S3 events (from non-S3 system).
|
89
|
+
- [fix] dispatcher: SNS alert is now optional.
|
90
|
+
- [fix] dispatcher: Correctly deletes unknown format messages.
|
91
|
+
- [enhancement] Adds more logging messages.
|
92
|
+
|
93
|
+
## version 0.6.0
|
94
|
+
|
95
|
+
- [CHANGE] Adds loaded column to strload_objects table to record if the object is really loaded or not.
|
96
|
+
- [CHANGE] Now strload_objects' object_url is unique. Duplicated objects are stored in another table, strload_dup_objects.
|
97
|
+
- [CHANGE] Now strload_table has table_id column, which is the primary key.
|
98
|
+
- [new] Loader daemon supports new command line option --working-dir, to support symbolic linked path, such as Capistrano deploy target (current/).
|
99
|
+
- [new] Keeps Redshift manifest file for later inspection.
|
100
|
+
- [enhancement] Reduces the number of Redshift writer transactions (1 transaction for 1 loading).
|
101
|
+
- [enhancement] Delay dispatching tasks until current event batch is processed, to avoid unexpected visibility timeout.
|
102
|
+
- [enhancement] Adds more logging messages.
|
103
|
+
|
104
|
+
## version 0.5.1
|
105
|
+
|
106
|
+
- [fix] Fixes slow query
|
107
|
+
|
108
|
+
## version 0.5.0
|
109
|
+
|
110
|
+
- [new] Introduces FLUSHTABLE dispatcher event
|
111
|
+
|
112
|
+
## version 0.4.0
|
113
|
+
|
114
|
+
- [new] Introduces CHECKPOINT dispatcher event
|
115
|
+
|
116
|
+
## version 0.3.0
|
117
|
+
|
118
|
+
- [new] Supports SNS notification
|
119
|
+
|
120
|
+
## version 0.2.0
|
121
|
+
|
122
|
+
- not released
|
123
|
+
- [fix] Fixes async delete timing
|
124
|
+
|
125
|
+
## version 0.1.0
|
126
|
+
|
127
|
+
- 2016-07-13 works 1 month
|
data/Rakefile
ADDED
data/bin/send-data-event
CHANGED
@@ -0,0 +1,26 @@
|
|
1
|
+
require_relative 'lib/bricolage/streamingload/version'
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.platform = Gem::Platform::RUBY
|
5
|
+
s.name = 'bricolage-streamingload'
|
6
|
+
s.version = Bricolage::StreamingLoad::VERSION
|
7
|
+
s.summary = 'Bricolage Streaming Load Daemon'
|
8
|
+
s.description = 'Bricolage Streaming Load Daemon loads S3 data files to Redshift continuously.'
|
9
|
+
s.license = 'MIT'
|
10
|
+
|
11
|
+
s.author = ['Minero Aoki', 'Shimpei Kodama']
|
12
|
+
s.email = ['aamine@loveruby.net']
|
13
|
+
s.homepage = 'https://github.com/aamine/bricolage-streamingload'
|
14
|
+
|
15
|
+
s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
|
16
|
+
s.executables = s.files.grep(%r{bin/}).map {|path| File.basename(path) }
|
17
|
+
s.require_path = 'lib'
|
18
|
+
|
19
|
+
s.required_ruby_version = '>= 2.3.0'
|
20
|
+
s.add_dependency 'bricolage', '>= 5.29.2'
|
21
|
+
s.add_dependency 'pg', '~> 0.18.0'
|
22
|
+
s.add_dependency 'aws-sdk-s3', '~> 1.8'
|
23
|
+
s.add_dependency 'aws-sdk-sqs', '~> 1.3'
|
24
|
+
s.add_development_dependency 'rake'
|
25
|
+
s.add_development_dependency 'test-unit'
|
26
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
### Databases
|
2
|
+
|
3
|
+
db_ctl:
|
4
|
+
type: psql
|
5
|
+
host: localhost
|
6
|
+
port: 5432
|
7
|
+
database: bricolage
|
8
|
+
username: bricolage
|
9
|
+
# Get password from password.yml
|
10
|
+
password: <%= password 'postgres_bricolage_password' %>
|
11
|
+
encoding: utf8
|
12
|
+
|
13
|
+
db_data: &db_data
|
14
|
+
type: psql
|
15
|
+
host: redshift.host
|
16
|
+
port: 5439
|
17
|
+
database: production
|
18
|
+
username: bricolage
|
19
|
+
# Get password from password.yml
|
20
|
+
password: <%= password 'redshift_bricolage_password' %>
|
21
|
+
encoding: utf8
|
22
|
+
|
23
|
+
sql:
|
24
|
+
<<: *db_data
|
25
|
+
|
26
|
+
### SQS
|
27
|
+
|
28
|
+
sqs_event:
|
29
|
+
type: sqs
|
30
|
+
region: ap-northeast-1
|
31
|
+
url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-events
|
32
|
+
max_number_of_messages: 10
|
33
|
+
visibility_timeout: 600
|
34
|
+
wait_time_seconds: 10
|
35
|
+
# Enable following lines if you use access key explicitly.
|
36
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
37
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
38
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
39
|
+
|
40
|
+
sqs_task:
|
41
|
+
type: sqs
|
42
|
+
region: ap-northeast-1
|
43
|
+
url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-tasks
|
44
|
+
max_number_of_messages: 1
|
45
|
+
visibility_timeout: 1800
|
46
|
+
wait_time_seconds: 10
|
47
|
+
# Enable following lines if you use access key explicitly.
|
48
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
49
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
50
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
51
|
+
|
52
|
+
### S3
|
53
|
+
|
54
|
+
s3_ctl: &s3_ctl
|
55
|
+
type: s3
|
56
|
+
region: ap-northeast-1
|
57
|
+
endpoint: s3-ap-northeast-1.amazonaws.com
|
58
|
+
bucket: bricolagectl.ap-northeast-1
|
59
|
+
prefix: development/strload
|
60
|
+
# Enable following lines if you use access key explicitly.
|
61
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
62
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
63
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
64
|
+
|
65
|
+
s3:
|
66
|
+
<<: *s3_ctl
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#event-queue-ds: sqs_event
|
2
|
+
#task-queue-ds: sqs_task
|
3
|
+
|
4
|
+
#ctl-postgres-ds: db_ctl
|
5
|
+
#ctl-s3-ds: s3_ctl
|
6
|
+
|
7
|
+
#redshift-ds: db_data
|
8
|
+
#log-table: strload_load_logs
|
9
|
+
|
10
|
+
#dispatch-interval: 60
|
11
|
+
|
12
|
+
# To Enable SNS notification
|
13
|
+
#sns-ds: sns
|
14
|
+
#alert-level: warn
|
15
|
+
|
16
|
+
url_patterns:
|
17
|
+
-
|
18
|
+
url: "s3://some-log-bucket/\\w{4}\\.\\w+?\\.\\w+?\\.(?<schema>\\w+)\\.(?<table>\\w+)/\\d{4}/\\d{2}/\\d{2}/.*\\.gz"
|
19
|
+
schema: "%schema"
|
20
|
+
table: "%table"
|
@@ -9,7 +9,6 @@ require 'bricolage/streamingload/chunkrouter'
|
|
9
9
|
require 'bricolage/streamingload/chunkbuffer'
|
10
10
|
require 'bricolage/streamingload/loadtasklogger'
|
11
11
|
require 'bricolage/streamingload/alertinglogger'
|
12
|
-
require 'aws-sdk'
|
13
12
|
require 'yaml'
|
14
13
|
require 'optparse'
|
15
14
|
require 'fileutils'
|
@@ -63,13 +63,13 @@ module Bricolage
|
|
63
63
|
return false
|
64
64
|
rescue JobError => ex
|
65
65
|
@logger.error ex.message
|
66
|
-
return
|
66
|
+
return false
|
67
67
|
rescue Exception => ex
|
68
68
|
@logger.exception ex
|
69
69
|
return true
|
70
70
|
end
|
71
71
|
|
72
|
-
MAX_RETRY =
|
72
|
+
MAX_RETRY = 2
|
73
73
|
|
74
74
|
def execute_task
|
75
75
|
@process_id = "#{Socket.gethostname}-#{$$}"
|
@@ -241,6 +241,12 @@ module Bricolage
|
|
241
241
|
;
|
242
242
|
EndSQL
|
243
243
|
@logger.info "load succeeded: #{manifest.url}"
|
244
|
+
rescue JobFailure => ex
|
245
|
+
if /stl_load_errors/ =~ ex.message
|
246
|
+
# We cannot resolve this load error by retry, give up now.
|
247
|
+
raise JobError, ex.message
|
248
|
+
end
|
249
|
+
raise
|
244
250
|
end
|
245
251
|
|
246
252
|
def write_load_log(log)
|
@@ -149,12 +149,15 @@ module Bricolage
|
|
149
149
|
|
150
150
|
class NoopJob
|
151
151
|
|
152
|
-
def initialize(context:, ctl_ds:, task_id:, force: false, logger:)
|
152
|
+
def initialize(context:, ctl_ds:, data_ds:, log_table:, task_id:, force: false, logger:)
|
153
153
|
@ctx = context
|
154
154
|
@ctl_ds = ctl_ds
|
155
|
+
@data_ds = data_ds
|
156
|
+
@log_table = log_table
|
155
157
|
@task_id = task_id
|
156
158
|
@force = force
|
157
159
|
@logger = logger
|
160
|
+
@working_dir = Dir.getwd
|
158
161
|
end
|
159
162
|
|
160
163
|
def execute(fail_fast: false)
|
@@ -164,7 +167,7 @@ module Bricolage
|
|
164
167
|
end
|
165
168
|
|
166
169
|
def execute_task
|
167
|
-
@logger.info "execute_task: task_id=#{@task_id} force=#{@force} ctx=#{@ctx.home_path} ctl_ds=#{@ctl_ds.name} dir=#{@working_dir}"
|
170
|
+
@logger.info "execute_task: task_id=#{@task_id} force=#{@force} ctx=#{@ctx.home_path} ctl_ds=#{@ctl_ds.name} data_ds=#{@data_ds.name} dir=#{@working_dir}"
|
168
171
|
end
|
169
172
|
|
170
173
|
end
|