bricolage-streamingload 0.15.0 → 0.17.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: c74f043409945f4dc970472a1a7735c67a7ba930
4
- data.tar.gz: 9ce73867f02aff49a7391d168c791ce4be4d3727
2
+ SHA256:
3
+ metadata.gz: 8f3e8cfb62766f9de7ef2ff91a8b9a7defce6403560338973734af68422d2fcb
4
+ data.tar.gz: 359732332455a7db82c20cb3ebc2ccfcdc830d35595dfdae7b515e57cafd77ac
5
5
  SHA512:
6
- metadata.gz: 4a6a1a3c6fbb28d1048481072596fc944e0732218bcddfd09dd9555a4d0a43b900f7806627c2f4ec4685b36cbf3a93ded8b173bd7a75917d652bef7d7a6a0739
7
- data.tar.gz: ffa357b2233b11b9820ffeda35480c74a328904d531a1f67b716fc115d584979e4eff2332239c557179687121527d3e38c90afcc02d102b4e9a3f1e3fe12de70
6
+ metadata.gz: 80365541531153a42be70362e04e8d6ee89eb639df4d9fcfb899c95e6516b92be8ac81e9b1da27acb14f919e5d2abdc408ebe2e75460c65063e01b1e7b718418
7
+ data.tar.gz: c90d2c27285681a950bb1d95c076a83bbd95cbb0863651e1b26ab0f12690ee58da8b5759187f65a8cc883fc4ea680c76706fa337a93d5011a58ea9bec7a60e83
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+
4
+ /config/development/
5
+ /config/test/
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "https://rubygems.org"
2
+ gemspec
@@ -0,0 +1,61 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ bricolage-streamingload (0.16.1)
5
+ aws-sdk-s3 (~> 1.8)
6
+ aws-sdk-sqs (~> 1.3)
7
+ bricolage (~> 5.30)
8
+ pg (~> 0.18.0)
9
+ sentry-raven (~> 3.0)
10
+
11
+ GEM
12
+ remote: https://rubygems.org/
13
+ specs:
14
+ aws-eventstream (1.1.0)
15
+ aws-partitions (1.351.0)
16
+ aws-sdk-core (3.104.3)
17
+ aws-eventstream (~> 1, >= 1.0.2)
18
+ aws-partitions (~> 1, >= 1.239.0)
19
+ aws-sigv4 (~> 1.1)
20
+ jmespath (~> 1.0)
21
+ aws-sdk-kms (1.36.0)
22
+ aws-sdk-core (~> 3, >= 3.99.0)
23
+ aws-sigv4 (~> 1.1)
24
+ aws-sdk-s3 (1.75.0)
25
+ aws-sdk-core (~> 3, >= 3.104.1)
26
+ aws-sdk-kms (~> 1)
27
+ aws-sigv4 (~> 1.1)
28
+ aws-sdk-sns (1.28.0)
29
+ aws-sdk-core (~> 3, >= 3.99.0)
30
+ aws-sigv4 (~> 1.1)
31
+ aws-sdk-sqs (1.30.0)
32
+ aws-sdk-core (~> 3, >= 3.99.0)
33
+ aws-sigv4 (~> 1.1)
34
+ aws-sigv4 (1.2.1)
35
+ aws-eventstream (~> 1, >= 1.0.2)
36
+ bricolage (5.30.0)
37
+ aws-sdk-s3 (~> 1)
38
+ aws-sdk-sns (~> 1)
39
+ pg (~> 0.18.0)
40
+ faraday (1.0.1)
41
+ multipart-post (>= 1.2, < 3)
42
+ jmespath (1.4.0)
43
+ multipart-post (2.1.1)
44
+ pg (0.18.4)
45
+ power_assert (1.1.3)
46
+ rake (12.3.3)
47
+ sentry-raven (3.0.0)
48
+ faraday (>= 1.0)
49
+ test-unit (3.2.8)
50
+ power_assert
51
+
52
+ PLATFORMS
53
+ ruby
54
+
55
+ DEPENDENCIES
56
+ bricolage-streamingload!
57
+ rake
58
+ test-unit
59
+
60
+ BUNDLED WITH
61
+ 1.17.2
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2016 Minero Aoki, Shimpei Kodama
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,135 @@
1
+ # Bricolage Streaming Load Release Note
2
+
3
+ ## version 0.17.0
4
+
5
+ - [new] Introduces Sentry error reporting. Define SENTRY_DSN environment variable to enable it.
6
+
7
+ ## version 0.16.1
8
+
9
+ - [fix] Stop retrying after 2 retries (3 attempts in total).
10
+
11
+ ## version 0.16.0
12
+
13
+ - [new] Retries error tasks as well, not only failure tasks.
14
+ Error tasks are assumed to be "non-retriable" (e.g. DB login error), but some of them can actually be resolved
15
+ by retrying on ECS environment. Retrying is not expensive (and human operation is relatively expensive),
16
+ so we'll retry on all failures and errors.
17
+
18
+ ## version 0.15.2
19
+
20
+ - [fix] send-data-event: Send objectSize=0 to indicate this message is fake.
21
+
22
+ ## version 0.15.1
23
+
24
+ - [new] Use the schema name in the parameters table by default (no more "no such variable: XXXX_schema" error).
25
+
26
+ ## version 0.15.0
27
+
28
+ - [CHANGE] Upgrade AWS-SDK to v3. This version requires at least Bricolage 5.25.
29
+
30
+ ## version 0.14.2
31
+
32
+ - Reduces loading retry count
33
+
34
+ ## version 0.14.1
35
+
36
+ - [fix] task logging did not work
37
+
38
+ ## version 0.14.0
39
+
40
+ - [new] Logs (task - object) relationships to S3 on dispatch, as a DB backup.
41
+
42
+ ## version 0.13.0
43
+
44
+ - grand refactoring
45
+
46
+ ## version 0.12.0
47
+
48
+ - [CHANGE] Adds task_id column to the log table (strload_load_logs).
49
+
50
+ ## version 0.11.0
51
+
52
+ - Loosen dependent Bricolage version
53
+
54
+ ## version 0.10.2
55
+
56
+ - [new] New parameter dispatch-interval.
57
+
58
+ ## version 0.10.1
59
+
60
+ - [fix] Fixes simple variable ref bug.
61
+
62
+ ## version 0.10.0
63
+
64
+ - [new] Automatically complement strload_jobs status with Redshift-side log table.
65
+
66
+ ## version 0.9.0
67
+
68
+ - [new] Introduces Redshift-side load log table (strload_load_logs) for load duplication checking.
69
+
70
+ ## version 0.8.1
71
+
72
+ - [fix] tmp: Do not retry on data connection failure.
73
+
74
+ ## version 0.8.0
75
+
76
+ - [CHANGE] Loader retries failed load tasks automatically. Streaming loader does NOT delete a task from the queue on job failures.
77
+ - [enhancement] Rewrites loader to get better error handling (ensures a log record is written in a wider range of erroneous situations).
78
+
79
+ ## version 0.7.1
80
+
81
+ - fix utilities
82
+
83
+ ## version 0.7.0
84
+
85
+ - [CHANGE] SQS data source requires "region" attribute.
86
+
87
+ ## version 0.6.2
88
+
89
+ - [new] AWS access key id and secret key are now optional for SQS data sources (to allow using EC2 instance or ECS task attached IAM roles).
90
+ - [new] New utility commands send-data-event, send-shutdown, send-checkpoint, send-load-task.
91
+ - Adds sample config files.
92
+
93
+ ## version 0.6.1
94
+
95
+ - [fix] dispatcher: Default ctl data source was wrong.
96
+ - [fix] dispatcher: Detects S3 events by "s3" attribute instead of "eventSource" attribute, to allow fake S3 events (from non-S3 system).
97
+ - [fix] dispatcher: SNS alert is now optional.
98
+ - [fix] dispatcher: Correctly deletes unknown format messages.
99
+ - [enhancement] Adds more logging messages.
100
+
101
+ ## version 0.6.0
102
+
103
+ - [CHANGE] Adds loaded column to strload_objects table to record if the object is really loaded or not.
104
+ - [CHANGE] Now strload_objects' object_url is unique. Duplicated objects are stored in another table, strload_dup_objects.
105
+ - [CHANGE] Now strload_table has table_id column, which is the primary key.
106
+ - [new] Loader daemon supports new command line option --working-dir, to support symbolic linked path, such as Capistrano deploy target (current/).
107
+ - [new] Keeps Redshift manifest file for later inspection.
108
+ - [enhancement] Reduces the number of Redshift writer transactions (1 transaction for 1 loading).
109
+ - [enhancement] Delay dispatching tasks until current event batch is processed, to avoid unexpected visibility timeout.
110
+ - [enhancement] Adds more logging messages.
111
+
112
+ ## version 0.5.1
113
+
114
+ - [fix] Fixes slow query
115
+
116
+ ## version 0.5.0
117
+
118
+ - [new] Introduces FLUSHTABLE dispatcher event
119
+
120
+ ## version 0.4.0
121
+
122
+ - [new] Introduces CHECKPOINT dispatcher event
123
+
124
+ ## version 0.3.0
125
+
126
+ - [new] Supports SNS notification
127
+
128
+ ## version 0.2.0
129
+
130
+ - not released
131
+ - [fix] Fixes async delete timing
132
+
133
+ ## version 0.1.0
134
+
135
+ - 2016-07-13 works 1 month
@@ -0,0 +1,3 @@
1
+ task :test do
2
+ load "#{__dir__}/test/all.rb"
3
+ end
@@ -28,7 +28,7 @@ ARGF.each do |line|
28
28
  },
29
29
  object: {
30
30
  key: key,
31
- size: 1
31
+ size: 0
32
32
  }
33
33
  },
34
34
  noDispatch: no_dispatch
@@ -0,0 +1,27 @@
1
+ require_relative 'lib/bricolage/streamingload/version'
2
+
3
+ Gem::Specification.new do |s|
4
+ s.platform = Gem::Platform::RUBY
5
+ s.name = 'bricolage-streamingload'
6
+ s.version = Bricolage::StreamingLoad::VERSION
7
+ s.summary = 'Bricolage Streaming Load Daemon'
8
+ s.description = 'Bricolage Streaming Load Daemon loads S3 data files to Redshift continuously.'
9
+ s.license = 'MIT'
10
+
11
+ s.author = ['Minero Aoki', 'Shimpei Kodama']
12
+ s.email = ['aamine@loveruby.net']
13
+ s.homepage = 'https://github.com/aamine/bricolage-streamingload'
14
+
15
+ s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
16
+ s.executables = s.files.grep(%r{bin/}).map {|path| File.basename(path) }
17
+ s.require_path = 'lib'
18
+
19
+ s.required_ruby_version = '>= 2.3.0'
20
+ s.add_dependency 'bricolage', '~> 5.30'
21
+ s.add_dependency 'pg', '~> 0.18.0'
22
+ s.add_dependency 'aws-sdk-s3', '~> 1.8'
23
+ s.add_dependency 'aws-sdk-sqs', '~> 1.3'
24
+ s.add_dependency 'sentry-raven', '~> 3.0'
25
+ s.add_development_dependency 'rake'
26
+ s.add_development_dependency 'test-unit'
27
+ end
@@ -0,0 +1,66 @@
1
+ ### Databases
2
+
3
+ db_ctl:
4
+ type: psql
5
+ host: localhost
6
+ port: 5432
7
+ database: bricolage
8
+ username: bricolage
9
+ # Get password from password.yml
10
+ password: <%= password 'postgres_bricolage_password' %>
11
+ encoding: utf8
12
+
13
+ db_data: &db_data
14
+ type: psql
15
+ host: redshift.host
16
+ port: 5439
17
+ database: production
18
+ username: bricolage
19
+ # Get password from password.yml
20
+ password: <%= password 'redshift_bricolage_password' %>
21
+ encoding: utf8
22
+
23
+ sql:
24
+ <<: *db_data
25
+
26
+ ### SQS
27
+
28
+ sqs_event:
29
+ type: sqs
30
+ region: ap-northeast-1
31
+ url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-events
32
+ max_number_of_messages: 10
33
+ visibility_timeout: 600
34
+ wait_time_seconds: 10
35
+ # Enable following lines if you use access key explicitly.
36
+ # Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
37
+ #access_key_id: "<%%= password 'aws_access_key_id' %>"
38
+ #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
39
+
40
+ sqs_task:
41
+ type: sqs
42
+ region: ap-northeast-1
43
+ url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-tasks
44
+ max_number_of_messages: 1
45
+ visibility_timeout: 1800
46
+ wait_time_seconds: 10
47
+ # Enable following lines if you use access key explicitly.
48
+ # Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
49
+ #access_key_id: "<%%= password 'aws_access_key_id' %>"
50
+ #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
51
+
52
+ ### S3
53
+
54
+ s3_ctl: &s3_ctl
55
+ type: s3
56
+ region: ap-northeast-1
57
+ endpoint: s3-ap-northeast-1.amazonaws.com
58
+ bucket: bricolagectl.ap-northeast-1
59
+ prefix: development/strload
60
+ # Enable following lines if you use access key explicitly.
61
+ # Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
62
+ #access_key_id: "<%%= password 'aws_access_key_id' %>"
63
+ #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
64
+
65
+ s3:
66
+ <<: *s3_ctl
@@ -0,0 +1,5 @@
1
+ # Never commit this file
2
+ redshift_bricolage_password: xxxxxxxxxxx
3
+ postgres_bricolage_password: xxxxxxxxxxx
4
+ aws_access_key_id: "AKIAAAAAAAAAAAAAAAAA"
5
+ aws_secret_access_key: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
@@ -0,0 +1,20 @@
1
+ #event-queue-ds: sqs_event
2
+ #task-queue-ds: sqs_task
3
+
4
+ #ctl-postgres-ds: db_ctl
5
+ #ctl-s3-ds: s3_ctl
6
+
7
+ #redshift-ds: db_data
8
+ #log-table: strload_load_logs
9
+
10
+ #dispatch-interval: 60
11
+
12
+ # To enable SNS notification
13
+ #sns-ds: sns
14
+ #alert-level: warn
15
+
16
+ url_patterns:
17
+ -
18
+ url: "s3://some-log-bucket/\\w{4}\\.\\w+?\\.\\w+?\\.(?<schema>\\w+)\\.(?<table>\\w+)/\\d{4}/\\d{2}/\\d{2}/.*\\.gz"
19
+ schema: "%schema"
20
+ table: "%table"
@@ -0,0 +1,5 @@
1
+ defaults:
2
+ redshift-ds: db_data
3
+ ctl-ds: s3_ctl
4
+
5
+ dwh_schema: dwh
@@ -10,8 +10,9 @@ require 'bricolage/streamingload/chunkbuffer'
10
10
  require 'bricolage/streamingload/loadtasklogger'
11
11
  require 'bricolage/streamingload/alertinglogger'
12
12
  require 'yaml'
13
- require 'optparse'
14
13
  require 'fileutils'
14
+ require 'raven'
15
+ require 'optparse'
15
16
 
16
17
  module Bricolage
17
18
 
@@ -20,6 +21,12 @@ module Bricolage
20
21
  class Dispatcher < SQSDataSource::MessageHandler
21
22
 
22
23
  def Dispatcher.main
24
+ Raven.capture {
25
+ _main
26
+ }
27
+ end
28
+
29
+ def Dispatcher._main
23
30
  opts = DispatcherOptions.new(ARGV)
24
31
  opts.parse
25
32
  unless opts.rest_arguments.size == 1
@@ -3,6 +3,7 @@ require 'bricolage/streamingload/manifest'
3
3
  require 'bricolage/sqlutils'
4
4
  require 'socket'
5
5
  require 'json'
6
+ require 'raven'
6
7
 
7
8
  module Bricolage
8
9
 
@@ -44,7 +45,8 @@ module Bricolage
44
45
  def execute(fail_fast: false)
45
46
  execute_task
46
47
  return true
47
- rescue JobCancelled
48
+ rescue JobCancelled => ex
49
+ Raven.capture_exception(ex)
48
50
  return true
49
51
  rescue JobDuplicated
50
52
  return true
@@ -52,20 +54,25 @@ module Bricolage
52
54
  return false
53
55
  rescue ControlConnectionFailed => ex
54
56
  @logger.error ex.message
57
+ Raven.capture_exception(ex)
55
58
  wait_for_connection('ctl', @ctl_ds) unless fail_fast
56
59
  return false
57
60
  rescue DataConnectionFailed => ex
58
61
  @logger.error ex.message
62
+ Raven.capture_exception(ex)
59
63
  wait_for_connection('data', @data_ds) unless fail_fast
60
64
  return false
61
65
  rescue JobFailure => ex
62
66
  @logger.error ex.message
67
+ Raven.capture_exception(ex)
63
68
  return false
64
69
  rescue JobError => ex
65
70
  @logger.error ex.message
66
- return true
71
+ Raven.capture_exception(ex)
72
+ return false
67
73
  rescue Exception => ex
68
74
  @logger.exception ex
75
+ Raven.capture_exception(ex)
69
76
  return true
70
77
  end
71
78
 
@@ -126,7 +133,11 @@ module Bricolage
126
133
  raise
127
134
  rescue JobError => ex
128
135
  ctl.open {
129
- ctl.abort_job job_id, 'error', ex.message.lines.first.strip
136
+ fail_count = @task.failure_count
137
+ final_retry = (fail_count >= MAX_RETRY)
138
+ retry_msg = (fail_count > 0) ? "(retry\##{fail_count}#{final_retry ? ' FINAL' : ''}) " : ''
139
+ ctl.abort_job job_id, 'error', retry_msg + ex.message.lines.first.strip
140
+ raise JobCancelled, "retry count exceeds limit: task_id=#{@task_id}" if final_retry
130
141
  }
131
142
  raise
132
143
  rescue Exception => ex