bricolage-streamingload 0.15.0 → 0.17.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.gitignore +5 -0
- data/Gemfile +2 -0
- data/Gemfile.lock +61 -0
- data/LICENSES +21 -0
- data/RELEASE.md +135 -0
- data/Rakefile +3 -0
- data/bin/send-data-event +1 -1
- data/bricolage-streamingload.gemspec +27 -0
- data/config/production/database.yml +66 -0
- data/config/production/password.yml +5 -0
- data/config/production/streamingload.yml +20 -0
- data/config/production/variable.yml +5 -0
- data/lib/bricolage/streamingload/dispatcher.rb +8 -1
- data/lib/bricolage/streamingload/job.rb +14 -3
- data/lib/bricolage/streamingload/jobparams.rb +1 -1
- data/lib/bricolage/streamingload/taskhandler.rb +5 -2
- data/lib/bricolage/streamingload/version.rb +1 -1
- data/sample/sqs-message.txt +38 -0
- data/sample/sqs-result.txt +18 -0
- data/strload_load_logs.ct +13 -0
- data/testschema/strload_test.ct +11 -0
- data/testschema/testlog.json.gz +0 -0
- data/testschema/with_work_table.job +4 -0
- data/testschema/with_work_table.sql +1 -0
- data/utils/init_strload_tables.sql +13 -0
- data/utils/strload-stat.sql +36 -0
- metadata +46 -30
- data/test/all.rb +0 -3
- data/test/streamingload/test_dispatcher.rb +0 -241
- data/test/streamingload/test_dispatchermessage.rb +0 -31
- data/test/streamingload/test_job.rb +0 -620
- data/test/test_sqsdatasource.rb +0 -55
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 8f3e8cfb62766f9de7ef2ff91a8b9a7defce6403560338973734af68422d2fcb
|
4
|
+
data.tar.gz: 359732332455a7db82c20cb3ebc2ccfcdc830d35595dfdae7b515e57cafd77ac
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 80365541531153a42be70362e04e8d6ee89eb639df4d9fcfb899c95e6516b92be8ac81e9b1da27acb14f919e5d2abdc408ebe2e75460c65063e01b1e7b718418
|
7
|
+
data.tar.gz: c90d2c27285681a950bb1d95c076a83bbd95cbb0863651e1b26ab0f12690ee58da8b5759187f65a8cc883fc4ea680c76706fa337a93d5011a58ea9bec7a60e83
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,61 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
bricolage-streamingload (0.16.1)
|
5
|
+
aws-sdk-s3 (~> 1.8)
|
6
|
+
aws-sdk-sqs (~> 1.3)
|
7
|
+
bricolage (~> 5.30)
|
8
|
+
pg (~> 0.18.0)
|
9
|
+
sentry-raven (~> 3.0)
|
10
|
+
|
11
|
+
GEM
|
12
|
+
remote: https://rubygems.org/
|
13
|
+
specs:
|
14
|
+
aws-eventstream (1.1.0)
|
15
|
+
aws-partitions (1.351.0)
|
16
|
+
aws-sdk-core (3.104.3)
|
17
|
+
aws-eventstream (~> 1, >= 1.0.2)
|
18
|
+
aws-partitions (~> 1, >= 1.239.0)
|
19
|
+
aws-sigv4 (~> 1.1)
|
20
|
+
jmespath (~> 1.0)
|
21
|
+
aws-sdk-kms (1.36.0)
|
22
|
+
aws-sdk-core (~> 3, >= 3.99.0)
|
23
|
+
aws-sigv4 (~> 1.1)
|
24
|
+
aws-sdk-s3 (1.75.0)
|
25
|
+
aws-sdk-core (~> 3, >= 3.104.1)
|
26
|
+
aws-sdk-kms (~> 1)
|
27
|
+
aws-sigv4 (~> 1.1)
|
28
|
+
aws-sdk-sns (1.28.0)
|
29
|
+
aws-sdk-core (~> 3, >= 3.99.0)
|
30
|
+
aws-sigv4 (~> 1.1)
|
31
|
+
aws-sdk-sqs (1.30.0)
|
32
|
+
aws-sdk-core (~> 3, >= 3.99.0)
|
33
|
+
aws-sigv4 (~> 1.1)
|
34
|
+
aws-sigv4 (1.2.1)
|
35
|
+
aws-eventstream (~> 1, >= 1.0.2)
|
36
|
+
bricolage (5.30.0)
|
37
|
+
aws-sdk-s3 (~> 1)
|
38
|
+
aws-sdk-sns (~> 1)
|
39
|
+
pg (~> 0.18.0)
|
40
|
+
faraday (1.0.1)
|
41
|
+
multipart-post (>= 1.2, < 3)
|
42
|
+
jmespath (1.4.0)
|
43
|
+
multipart-post (2.1.1)
|
44
|
+
pg (0.18.4)
|
45
|
+
power_assert (1.1.3)
|
46
|
+
rake (12.3.3)
|
47
|
+
sentry-raven (3.0.0)
|
48
|
+
faraday (>= 1.0)
|
49
|
+
test-unit (3.2.8)
|
50
|
+
power_assert
|
51
|
+
|
52
|
+
PLATFORMS
|
53
|
+
ruby
|
54
|
+
|
55
|
+
DEPENDENCIES
|
56
|
+
bricolage-streamingload!
|
57
|
+
rake
|
58
|
+
test-unit
|
59
|
+
|
60
|
+
BUNDLED WITH
|
61
|
+
1.17.2
|
data/LICENSES
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
The MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2016 Minero Aoki, Shimpei Kodama
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in
|
13
|
+
all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
21
|
+
THE SOFTWARE.
|
data/RELEASE.md
ADDED
@@ -0,0 +1,135 @@
|
|
1
|
+
# Bricolage Streaming Load Release Note
|
2
|
+
|
3
|
+
## version 0.17.0
|
4
|
+
|
5
|
+
- [new] Introduces Sentry error reporting. Define SENTRY_DSN environment variable to enable it.
|
6
|
+
|
7
|
+
## version 0.16.1
|
8
|
+
|
9
|
+
- [fix] Stop retrying after 2 retries (3 attempts in total).
|
10
|
+
|
11
|
+
## version 0.16.0
|
12
|
+
|
13
|
+
- [new] Retry for also error tasks, not only failure tasks.
|
14
|
+
Error tasks are assumed as "non-retriable" e.g. DB login error, but some of them are really able to be resolved
|
15
|
+
by retrying on ECS environment. Retrying is not expensive (and human operation is relatively expensive),
|
16
|
+
we'll retry on all failures and errors.
|
17
|
+
|
18
|
+
## version 0.15.2
|
19
|
+
|
20
|
+
- [fix] send-data-event: Send objectSize=0 to indicate this message is fake.
|
21
|
+
|
22
|
+
## version 0.15.1
|
23
|
+
|
24
|
+
- [new] Use the schema name in the parameters table by default (no more "no such variable: XXXX_schema" error).
|
25
|
+
|
26
|
+
## version 0.15.0
|
27
|
+
|
28
|
+
- [CHANGE] Upgrade AWS-SDK to v3. This version requires at least Bricolage 5.25.
|
29
|
+
|
30
|
+
## version 0.14.2
|
31
|
+
|
32
|
+
- Reduces loading retry count
|
33
|
+
|
34
|
+
## version 0.14.1
|
35
|
+
|
36
|
+
- [fix] task logging did not work
|
37
|
+
|
38
|
+
## version 0.14.0
|
39
|
+
|
40
|
+
- [new] Logs (task - object) relationships to S3 on dispatch, as a DB backup.
|
41
|
+
|
42
|
+
## version 0.13.0
|
43
|
+
|
44
|
+
- grand refactoring
|
45
|
+
|
46
|
+
## version 0.12.0
|
47
|
+
|
48
|
+
- [CHANGE] Adds task_id column to the log table (strload_load_logs).
|
49
|
+
|
50
|
+
## version 0.11.0
|
51
|
+
|
52
|
+
- Loosen dependent Bricolage version
|
53
|
+
|
54
|
+
## version 0.10.2
|
55
|
+
|
56
|
+
- [new] New parameter dispatch-interval.
|
57
|
+
|
58
|
+
## version 0.10.1
|
59
|
+
|
60
|
+
- [fix] Fixes simple variable ref bug.
|
61
|
+
|
62
|
+
## version 0.10.0
|
63
|
+
|
64
|
+
- [new] Automatically complement strload_jobs status with Redshift-side log table.
|
65
|
+
|
66
|
+
## version 0.9.0
|
67
|
+
|
68
|
+
- [new] Introduces Redshift-side load log table (strload_load_logs) for load duplication checking.
|
69
|
+
|
70
|
+
## version 0.8.1
|
71
|
+
|
72
|
+
- [fix] tmp: Do not retry on data connection failure.
|
73
|
+
|
74
|
+
## version 0.8.0
|
75
|
+
|
76
|
+
- [CHANGE] Loader retries failed load tasks automatically. Streaming loader does NOT delete a task from the queue on job failures.
|
77
|
+
- [enhancement] Rewrites loader to get better error handling (ensure to write log record in the wider range of erroneous situations).
|
78
|
+
|
79
|
+
## version 0.7.1
|
80
|
+
|
81
|
+
- fix utilities
|
82
|
+
|
83
|
+
## version 0.7.0
|
84
|
+
|
85
|
+
- [CHANGE] SQS data source requires "region" attribute.
|
86
|
+
|
87
|
+
## version 0.6.2
|
88
|
+
|
89
|
+
- [new] AWS access key id and secret key are now optional for SQS data sources (to allow using EC2 instance or ECS task attached IAM roles).
|
90
|
+
- [new] New utility commands send-data-event, send-shutdown, send-checkpoint, send-load-task.
|
91
|
+
- Adds sample config files.
|
92
|
+
|
93
|
+
## version 0.6.1
|
94
|
+
|
95
|
+
- [fix] dispatcher: Default ctl data source was wrong.
|
96
|
+
- [fix] dispatcher: Detects S3 events by "s3" attribute instead of "eventSource" attribute, to allow fake S3 events (from non-S3 system).
|
97
|
+
- [fix] dispatcher: SNS alert is now optional.
|
98
|
+
- [fix] dispatcher: Correctly deletes unknown format messages.
|
99
|
+
- [enhancement] Adds more logging messages.
|
100
|
+
|
101
|
+
## version 0.6.0
|
102
|
+
|
103
|
+
- [CHANGE] Adds loaded column to strload_objects table to record if the object is really loaded or not.
|
104
|
+
- [CHANGE] Now strload_objects' object_url is unique. Duplicated objects are stored in another table, strload_dup_objects.
|
105
|
+
- [CHANGE] Now strload_table has table_id column, which is the primary key.
|
106
|
+
- [new] Loader daemon supports new command line option --working-dir, to support symbolic linked path, such as Capistrano deploy target (current/).
|
107
|
+
- [new] Keeps Redshift manifest file for later inspection.
|
108
|
+
- [enhancement] Reduces the number of Redshift writer transactions (1 transaction for 1 loading).
|
109
|
+
- [enhancement] Delay dispatching tasks until current event batch is processed, to avoid unexpected visibility timeout.
|
110
|
+
- [enhancement] Adds more logging messages.
|
111
|
+
|
112
|
+
## version 0.5.1
|
113
|
+
|
114
|
+
- [fix] Fixes slow query
|
115
|
+
|
116
|
+
## version 0.5.0
|
117
|
+
|
118
|
+
- [new] Introduces FLUSHTABLE dispatcher event
|
119
|
+
|
120
|
+
## version 0.4.0
|
121
|
+
|
122
|
+
- [new] Introduces CHECKPOINT dispatcher event
|
123
|
+
|
124
|
+
## version 0.3.0
|
125
|
+
|
126
|
+
- [new] Supports SNS notification
|
127
|
+
|
128
|
+
## version 0.2.0
|
129
|
+
|
130
|
+
- not released
|
131
|
+
- [fix] Fixes async delete timing
|
132
|
+
|
133
|
+
## version 0.1.0
|
134
|
+
|
135
|
+
- 2016-07-13 works 1 month
|
data/Rakefile
ADDED
data/bin/send-data-event
CHANGED
@@ -0,0 +1,27 @@
|
|
1
|
+
require_relative 'lib/bricolage/streamingload/version'
|
2
|
+
|
3
|
+
Gem::Specification.new do |s|
|
4
|
+
s.platform = Gem::Platform::RUBY
|
5
|
+
s.name = 'bricolage-streamingload'
|
6
|
+
s.version = Bricolage::StreamingLoad::VERSION
|
7
|
+
s.summary = 'Bricolage Streaming Load Daemon'
|
8
|
+
s.description = 'Bricolage Streaming Load Daemon loads S3 data files to Redshift continuously.'
|
9
|
+
s.license = 'MIT'
|
10
|
+
|
11
|
+
s.author = ['Minero Aoki', 'Shimpei Kodama']
|
12
|
+
s.email = ['aamine@loveruby.net']
|
13
|
+
s.homepage = 'https://github.com/aamine/bricolage-streamingload'
|
14
|
+
|
15
|
+
s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
|
16
|
+
s.executables = s.files.grep(%r{bin/}).map {|path| File.basename(path) }
|
17
|
+
s.require_path = 'lib'
|
18
|
+
|
19
|
+
s.required_ruby_version = '>= 2.3.0'
|
20
|
+
s.add_dependency 'bricolage', '~> 5.30'
|
21
|
+
s.add_dependency 'pg', '~> 0.18.0'
|
22
|
+
s.add_dependency 'aws-sdk-s3', '~> 1.8'
|
23
|
+
s.add_dependency 'aws-sdk-sqs', '~> 1.3'
|
24
|
+
s.add_dependency 'sentry-raven', '~> 3.0'
|
25
|
+
s.add_development_dependency 'rake'
|
26
|
+
s.add_development_dependency 'test-unit'
|
27
|
+
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
### Databases
|
2
|
+
|
3
|
+
db_ctl:
|
4
|
+
type: psql
|
5
|
+
host: localhost
|
6
|
+
port: 5432
|
7
|
+
database: bricolage
|
8
|
+
username: bricolage
|
9
|
+
# Get password from password.yml
|
10
|
+
password: <%= password 'postgres_bricolage_password' %>
|
11
|
+
encoding: utf8
|
12
|
+
|
13
|
+
db_data: &db_data
|
14
|
+
type: psql
|
15
|
+
host: redshift.host
|
16
|
+
port: 5439
|
17
|
+
database: production
|
18
|
+
username: bricolage
|
19
|
+
# Get password from password.yml
|
20
|
+
password: <%= password 'redshift_bricolage_password' %>
|
21
|
+
encoding: utf8
|
22
|
+
|
23
|
+
sql:
|
24
|
+
<<: *db_data
|
25
|
+
|
26
|
+
### SQS
|
27
|
+
|
28
|
+
sqs_event:
|
29
|
+
type: sqs
|
30
|
+
region: ap-northeast-1
|
31
|
+
url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-events
|
32
|
+
max_number_of_messages: 10
|
33
|
+
visibility_timeout: 600
|
34
|
+
wait_time_seconds: 10
|
35
|
+
# Enable following lines if you use access key explicitly.
|
36
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
37
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
38
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
39
|
+
|
40
|
+
sqs_task:
|
41
|
+
type: sqs
|
42
|
+
region: ap-northeast-1
|
43
|
+
url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-tasks
|
44
|
+
max_number_of_messages: 1
|
45
|
+
visibility_timeout: 1800
|
46
|
+
wait_time_seconds: 10
|
47
|
+
# Enable following lines if you use access key explicitly.
|
48
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
49
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
50
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
51
|
+
|
52
|
+
### S3
|
53
|
+
|
54
|
+
s3_ctl: &s3_ctl
|
55
|
+
type: s3
|
56
|
+
region: ap-northeast-1
|
57
|
+
endpoint: s3-ap-northeast-1.amazonaws.com
|
58
|
+
bucket: bricolagectl.ap-northeast-1
|
59
|
+
prefix: development/strload
|
60
|
+
# Enable following lines if you use access key explicitly.
|
61
|
+
# Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
|
62
|
+
#access_key_id: "<%%= password 'aws_access_key_id' %>"
|
63
|
+
#secret_access_key: "<%%= password 'aws_secret_access_key' %>"
|
64
|
+
|
65
|
+
s3:
|
66
|
+
<<: *s3_ctl
|
@@ -0,0 +1,20 @@
|
|
1
|
+
#event-queue-ds: sqs_event
|
2
|
+
#task-queue-ds: sqs_task
|
3
|
+
|
4
|
+
#ctl-postgres-ds: db_ctl
|
5
|
+
#ctl-s3-ds: s3_ctl
|
6
|
+
|
7
|
+
#redshift-ds: db_data
|
8
|
+
#log-table: strload_load_logs
|
9
|
+
|
10
|
+
#dispatch-interval: 60
|
11
|
+
|
12
|
+
# To Enable SNS notification
|
13
|
+
#sns-ds: sns
|
14
|
+
#alert-level: warn
|
15
|
+
|
16
|
+
url_patterns:
|
17
|
+
-
|
18
|
+
url: "s3://some-log-bucket/\\w{4}\\.\\w+?\\.\\w+?\\.(?<schema>\\w+)\\.(?<table>\\w+)/\\d{4}/\\d{2}/\\d{2}/.*\\.gz"
|
19
|
+
schema: "%schema"
|
20
|
+
table: "%table"
|
@@ -10,8 +10,9 @@ require 'bricolage/streamingload/chunkbuffer'
|
|
10
10
|
require 'bricolage/streamingload/loadtasklogger'
|
11
11
|
require 'bricolage/streamingload/alertinglogger'
|
12
12
|
require 'yaml'
|
13
|
-
require 'optparse'
|
14
13
|
require 'fileutils'
|
14
|
+
require 'raven'
|
15
|
+
require 'optparse'
|
15
16
|
|
16
17
|
module Bricolage
|
17
18
|
|
@@ -20,6 +21,12 @@ module Bricolage
|
|
20
21
|
class Dispatcher < SQSDataSource::MessageHandler
|
21
22
|
|
22
23
|
def Dispatcher.main
|
24
|
+
Raven.capture {
|
25
|
+
_main
|
26
|
+
}
|
27
|
+
end
|
28
|
+
|
29
|
+
def Dispatcher._main
|
23
30
|
opts = DispatcherOptions.new(ARGV)
|
24
31
|
opts.parse
|
25
32
|
unless opts.rest_arguments.size == 1
|
@@ -3,6 +3,7 @@ require 'bricolage/streamingload/manifest'
|
|
3
3
|
require 'bricolage/sqlutils'
|
4
4
|
require 'socket'
|
5
5
|
require 'json'
|
6
|
+
require 'raven'
|
6
7
|
|
7
8
|
module Bricolage
|
8
9
|
|
@@ -44,7 +45,8 @@ module Bricolage
|
|
44
45
|
def execute(fail_fast: false)
|
45
46
|
execute_task
|
46
47
|
return true
|
47
|
-
rescue JobCancelled
|
48
|
+
rescue JobCancelled => ex
|
49
|
+
Raven.capture_exception(ex)
|
48
50
|
return true
|
49
51
|
rescue JobDuplicated
|
50
52
|
return true
|
@@ -52,20 +54,25 @@ module Bricolage
|
|
52
54
|
return false
|
53
55
|
rescue ControlConnectionFailed => ex
|
54
56
|
@logger.error ex.message
|
57
|
+
Raven.capture_exception(ex)
|
55
58
|
wait_for_connection('ctl', @ctl_ds) unless fail_fast
|
56
59
|
return false
|
57
60
|
rescue DataConnectionFailed => ex
|
58
61
|
@logger.error ex.message
|
62
|
+
Raven.capture_exception(ex)
|
59
63
|
wait_for_connection('data', @data_ds) unless fail_fast
|
60
64
|
return false
|
61
65
|
rescue JobFailure => ex
|
62
66
|
@logger.error ex.message
|
67
|
+
Raven.capture_exception(ex)
|
63
68
|
return false
|
64
69
|
rescue JobError => ex
|
65
70
|
@logger.error ex.message
|
66
|
-
|
71
|
+
Raven.capture_exception(ex)
|
72
|
+
return false
|
67
73
|
rescue Exception => ex
|
68
74
|
@logger.exception ex
|
75
|
+
Raven.capture_exception(ex)
|
69
76
|
return true
|
70
77
|
end
|
71
78
|
|
@@ -126,7 +133,11 @@ module Bricolage
|
|
126
133
|
raise
|
127
134
|
rescue JobError => ex
|
128
135
|
ctl.open {
|
129
|
-
|
136
|
+
fail_count = @task.failure_count
|
137
|
+
final_retry = (fail_count >= MAX_RETRY)
|
138
|
+
retry_msg = (fail_count > 0) ? "(retry\##{fail_count}#{final_retry ? ' FINAL' : ''}) " : ''
|
139
|
+
ctl.abort_job job_id, 'error', retry_msg + ex.message.lines.first.strip
|
140
|
+
raise JobCancelled, "retry count exceeds limit: task_id=#{@task_id}" if final_retry
|
130
141
|
}
|
131
142
|
raise
|
132
143
|
rescue Exception => ex
|