bricolage-streamingload 0.14.2 → 0.16.1

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
- SHA1:
-   metadata.gz: 01102c31f0c9f92fb7d6653c130f2a06633cf1fc
-   data.tar.gz: 4b160bb1a4c176faa0451319d94c8d6aa2404f85
+ SHA256:
+   metadata.gz: 3374ca3db7400fe9dc36658bf78b3d6516f0acaf4488eb81df1d8054196b055f
+   data.tar.gz: 96735f5f850c2769b6031dc9472e27406ba79c9087d9333ac3bc2cddd0113ebc
  SHA512:
-   metadata.gz: cd84fa57a717eda77e19710004df60ee780311e3ef89d8668f1d65ef0b67455911339194dae90c914e2c60f56eece109d2ca460998ad918a0d554a273aa5b950
-   data.tar.gz: 7e981e9c417716f0b5b51a90b442596849c906f62184222b4b2dd9347f3e4c510e2400748f93989ab60b46a95974b5842d5b557be721ac01362f3a410e25a678
+   metadata.gz: 0b3a4caedabfea4579ad55942fb9e469821e9a82c50f7e09f782e37df00602ecb419ca47848339f0094dfeca3dc271337737d5fc0390c6e740e364c3dd84c174
+   data.tar.gz: 741586ac3cf7a4b4b5983af0a5775db34bf768a6b8a321347fd51a2970efc516ab5eb8516bdb3ecc68ca2ad82a4b923188eaba13e922b67ce06f953323232e40
@@ -0,0 +1,5 @@
+ *.gem
+ .bundle
+
+ /config/development/
+ /config/test/
data/Gemfile ADDED
@@ -0,0 +1,2 @@
+ source "https://rubygems.org"
+ gemspec
@@ -0,0 +1,54 @@
+ PATH
+   remote: .
+   specs:
+     bricolage-streamingload (0.15.2)
+       aws-sdk-s3 (~> 1.8)
+       aws-sdk-sqs (~> 1.3)
+       bricolage (>= 5.29.2)
+       pg (~> 0.18.0)
+
+ GEM
+   remote: https://rubygems.org/
+   specs:
+     aws-eventstream (1.0.1)
+     aws-partitions (1.102.0)
+     aws-sdk-core (3.25.0)
+       aws-eventstream (~> 1.0)
+       aws-partitions (~> 1.0)
+       aws-sigv4 (~> 1.0)
+       jmespath (~> 1.0)
+     aws-sdk-kms (1.7.0)
+       aws-sdk-core (~> 3)
+       aws-sigv4 (~> 1.0)
+     aws-sdk-s3 (1.17.1)
+       aws-sdk-core (~> 3, >= 3.21.2)
+       aws-sdk-kms (~> 1)
+       aws-sigv4 (~> 1.0)
+     aws-sdk-sns (1.3.0)
+       aws-sdk-core (~> 3)
+       aws-sigv4 (~> 1.0)
+     aws-sdk-sqs (1.4.0)
+       aws-sdk-core (~> 3)
+       aws-sigv4 (~> 1.0)
+     aws-sigv4 (1.0.3)
+     bricolage (5.29.2)
+       aws-sdk-s3 (~> 1)
+       aws-sdk-sns (~> 1)
+       pg (~> 0.18.0)
+     jmespath (1.4.0)
+     pg (0.18.4)
+     power_assert (1.1.3)
+     rake (12.3.1)
+     test-unit (3.2.8)
+       power_assert
+
+ PLATFORMS
+   ruby
+
+ DEPENDENCIES
+   bricolage-streamingload!
+   rake
+   test-unit
+
+ BUNDLED WITH
+    1.16.1
@@ -0,0 +1,21 @@
+ The MIT License
+
+ Copyright (c) 2016 Minero Aoki, Shimpei Kodama
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
@@ -0,0 +1,131 @@
+ # Bricolage Streaming Load Release Note
+
+ ## version 0.16.1
+
+ - [fix] Stop retrying after 2 retries (3 trials in total).
+
+ ## version 0.16.0
+
+ - [new] Retry error tasks as well, not only failure tasks.
+   Error tasks are assumed to be non-retriable (e.g. a DB login error), but some of them can actually be resolved
+   by retrying in an ECS environment. Since retrying is not expensive (and human operation is relatively expensive),
+   we now retry on all failures and errors.
+
+ ## version 0.15.2
+
+ - [fix] send-data-event: Send objectSize=0 to indicate that this message is fake.
+
+ ## version 0.15.1
+
+ - [new] Use the schema name in the parameters table by default (no more "no such variable: XXXX_schema" error).
+
+ ## version 0.15.0
+
+ - [CHANGE] Upgrade AWS-SDK to v3. This version requires at least Bricolage 5.25.
+
+ ## version 0.14.2
+
+ - Reduces the loading retry count.
+
+ ## version 0.14.1
+
+ - [fix] Task logging did not work.
+
+ ## version 0.14.0
+
+ - [new] Logs (task - object) relationships to S3 on dispatch, as a DB backup.
+
+ ## version 0.13.0
+
+ - Grand refactoring.
+
+ ## version 0.12.0
+
+ - [CHANGE] Adds a task_id column to the log table (strload_load_logs).
+
+ ## version 0.11.0
+
+ - Loosens the dependent Bricolage version.
+
+ ## version 0.10.2
+
+ - [new] New parameter dispatch-interval.
+
+ ## version 0.10.1
+
+ - [fix] Fixes a simple variable reference bug.
+
+ ## version 0.10.0
+
+ - [new] Automatically complement strload_jobs status with the Redshift-side log table.
+
+ ## version 0.9.0
+
+ - [new] Introduces a Redshift-side load log table (strload_load_logs) for load duplication checking.
+
+ ## version 0.8.1
+
+ - [fix] tmp: Do not retry on data connection failure.
+
+ ## version 0.8.0
+
+ - [CHANGE] The loader retries failed load tasks automatically. The streaming loader does NOT delete a task from the queue on job failures.
+ - [enhancement] Rewrites the loader for better error handling (ensures a log record is written in a wider range of erroneous situations).
+
+ ## version 0.7.1
+
+ - Fixes utilities.
+
+ ## version 0.7.0
+
+ - [CHANGE] SQS data sources require a "region" attribute.
+
+ ## version 0.6.2
+
+ - [new] AWS access key id and secret key are now optional for SQS data sources (to allow using IAM roles attached to an EC2 instance or ECS task).
+ - [new] New utility commands send-data-event, send-shutdown, send-checkpoint, send-load-task.
+ - Adds sample config files.
+
+ ## version 0.6.1
+
+ - [fix] dispatcher: The default ctl data source was wrong.
+ - [fix] dispatcher: Detects S3 events by the "s3" attribute instead of the "eventSource" attribute, to allow fake S3 events (from non-S3 systems).
+ - [fix] dispatcher: SNS alerting is now optional.
+ - [fix] dispatcher: Correctly deletes messages in unknown formats.
+ - [enhancement] Adds more logging messages.
+
+ ## version 0.6.0
+
+ - [CHANGE] Adds a loaded column to the strload_objects table to record whether the object was really loaded or not.
+ - [CHANGE] strload_objects' object_url is now unique. Duplicated objects are stored in another table, strload_dup_objects.
+ - [CHANGE] strload_table now has a table_id column, which is the primary key.
+ - [new] The loader daemon supports a new command line option --working-dir, to support symbolically linked paths such as a Capistrano deploy target (current/).
+ - [new] Keeps the Redshift manifest file for later inspection.
+ - [enhancement] Reduces the number of Redshift writer transactions (1 transaction per load).
+ - [enhancement] Delays dispatching tasks until the current event batch is processed, to avoid unexpected visibility timeouts.
+ - [enhancement] Adds more logging messages.
+
+ ## version 0.5.1
+
+ - [fix] Fixes a slow query.
+
+ ## version 0.5.0
+
+ - [new] Introduces the FLUSHTABLE dispatcher event.
+
+ ## version 0.4.0
+
+ - [new] Introduces the CHECKPOINT dispatcher event.
+
+ ## version 0.3.0
+
+ - [new] Supports SNS notification.
+
+ ## version 0.2.0
+
+ - not released
+ - [fix] Fixes async delete timing.
+
+ ## version 0.1.0
+
+ - 2016-07-13: has been working for 1 month.
@@ -0,0 +1,3 @@
+ task :test do
+   load "#{__dir__}/test/all.rb"
+ end
@@ -28,7 +28,7 @@ ARGF.each do |line|
        },
        object: {
          key: key,
-         size: 1
+         size: 0
        }
      },
      noDispatch: no_dispatch
@@ -0,0 +1,26 @@
+ require_relative 'lib/bricolage/streamingload/version'
+
+ Gem::Specification.new do |s|
+   s.platform = Gem::Platform::RUBY
+   s.name = 'bricolage-streamingload'
+   s.version = Bricolage::StreamingLoad::VERSION
+   s.summary = 'Bricolage Streaming Load Daemon'
+   s.description = 'Bricolage Streaming Load Daemon loads S3 data files to Redshift continuously.'
+   s.license = 'MIT'
+
+   s.author = ['Minero Aoki', 'Shimpei Kodama']
+   s.email = ['aamine@loveruby.net']
+   s.homepage = 'https://github.com/aamine/bricolage-streamingload'
+
+   s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
+   s.executables = s.files.grep(%r{bin/}).map {|path| File.basename(path) }
+   s.require_path = 'lib'
+
+   s.required_ruby_version = '>= 2.3.0'
+   s.add_dependency 'bricolage', '>= 5.29.2'
+   s.add_dependency 'pg', '~> 0.18.0'
+   s.add_dependency 'aws-sdk-s3', '~> 1.8'
+   s.add_dependency 'aws-sdk-sqs', '~> 1.3'
+   s.add_development_dependency 'rake'
+   s.add_development_dependency 'test-unit'
+ end
@@ -0,0 +1,66 @@
+ ### Databases
+
+ db_ctl:
+   type: psql
+   host: localhost
+   port: 5432
+   database: bricolage
+   username: bricolage
+   # Get password from password.yml
+   password: <%= password 'postgres_bricolage_password' %>
+   encoding: utf8
+
+ db_data: &db_data
+   type: psql
+   host: redshift.host
+   port: 5439
+   database: production
+   username: bricolage
+   # Get password from password.yml
+   password: <%= password 'redshift_bricolage_password' %>
+   encoding: utf8
+
+ sql:
+   <<: *db_data
+
+ ### SQS
+
+ sqs_event:
+   type: sqs
+   region: ap-northeast-1
+   url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-events
+   max_number_of_messages: 10
+   visibility_timeout: 600
+   wait_time_seconds: 10
+   # Enable the following lines if you use an access key explicitly.
+   # Otherwise Bricolage uses the IAM role attached to the EC2 instance or ECS task.
+   #access_key_id: "<%%= password 'aws_access_key_id' %>"
+   #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
+
+ sqs_task:
+   type: sqs
+   region: ap-northeast-1
+   url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-tasks
+   max_number_of_messages: 1
+   visibility_timeout: 1800
+   wait_time_seconds: 10
+   # Enable the following lines if you use an access key explicitly.
+   # Otherwise Bricolage uses the IAM role attached to the EC2 instance or ECS task.
+   #access_key_id: "<%%= password 'aws_access_key_id' %>"
+   #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
+
+ ### S3
+
+ s3_ctl: &s3_ctl
+   type: s3
+   region: ap-northeast-1
+   endpoint: s3-ap-northeast-1.amazonaws.com
+   bucket: bricolagectl.ap-northeast-1
+   prefix: development/strload
+   # Enable the following lines if you use an access key explicitly.
+   # Otherwise Bricolage uses the IAM role attached to the EC2 instance or ECS task.
+   #access_key_id: "<%%= password 'aws_access_key_id' %>"
+   #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
+
+ s3:
+   <<: *s3_ctl
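Note the `&db_data` / `<<: *db_data` and `&s3_ctl` / `<<: *s3_ctl` pairs above: the `sql` and `s3` data sources simply reuse the settings of the entries they point to through standard YAML anchors and merge keys. A standalone sketch of that mechanism (plain Psych, not Bricolage's own config loader; the snippet is a trimmed stand-in):

```ruby
require 'yaml'

# Standalone illustration of the anchor/merge pattern used in the config above;
# this is a trimmed stand-in, not the gem's loader.
yaml = <<~'YAML'
  db_data: &db_data
    type: psql
    host: redshift.host
    port: 5439
  sql:
    <<: *db_data
YAML

# Psych 4 (Ruby 3.1+) rejects aliases in YAML.load, so fall back accordingly.
doc = YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(yaml) : YAML.load(yaml)
puts doc['sql']['host']   # => redshift.host
```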
@@ -0,0 +1,5 @@
+ # Never commit this file
+ redshift_bricolage_password: xxxxxxxxxxx
+ postgres_bricolage_password: xxxxxxxxxxx
+ aws_access_key_id: "AKIAAAAAAAAAAAAAAAAA"
+ aws_secret_access_key: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
@@ -0,0 +1,20 @@
+ #event-queue-ds: sqs_event
+ #task-queue-ds: sqs_task
+
+ #ctl-postgres-ds: db_ctl
+ #ctl-s3-ds: s3_ctl
+
+ #redshift-ds: db_data
+ #log-table: strload_load_logs
+
+ #dispatch-interval: 60
+
+ # To enable SNS notification
+ #sns-ds: sns
+ #alert-level: warn
+
+ url_patterns:
+   -
+     url: "s3://some-log-bucket/\\w{4}\\.\\w+?\\.\\w+?\\.(?<schema>\\w+)\\.(?<table>\\w+)/\\d{4}/\\d{2}/\\d{2}/.*\\.gz"
+     schema: "%schema"
+     table: "%table"
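The `url_patterns` entry maps each incoming S3 object URL to a schema and table through the named captures `(?<schema>...)` and `(?<table>...)`, which the `%schema` / `%table` placeholders then reference. A minimal sketch of how such a pattern matches, using a plain Ruby Regexp and a made-up object key (this is not the dispatcher's actual matching code):

```ruby
# Hedged illustration of the url_patterns regexp above; the bucket layout and
# the sample URL are hypothetical.
pattern = Regexp.new(
  's3://some-log-bucket/\w{4}\.\w+?\.\w+?\.(?<schema>\w+)\.(?<table>\w+)/\d{4}/\d{2}/\d{2}/.*\.gz'
)
url = "s3://some-log-bucket/logs.app.prod.dwh.pageviews/2018/04/01/part-0001.gz"

if (m = pattern.match(url))
  puts "schema=#{m[:schema]} table=#{m[:table]}"   # => schema=dwh table=pageviews
end
```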
@@ -0,0 +1,5 @@
+ defaults:
+   redshift-ds: db_data
+   ctl-ds: s3_ctl
+
+ dwh_schema: dwh
@@ -1,6 +1,6 @@
  require 'bricolage/datasource'
  require 'securerandom'
- require 'aws-sdk'
+ require 'aws-sdk-sqs'
  require 'json'
  require 'time'
@@ -9,7 +9,6 @@ require 'bricolage/streamingload/chunkrouter'
  require 'bricolage/streamingload/chunkbuffer'
  require 'bricolage/streamingload/loadtasklogger'
  require 'bricolage/streamingload/alertinglogger'
- require 'aws-sdk'
  require 'yaml'
  require 'optparse'
  require 'fileutils'
@@ -63,7 +63,7 @@ module Bricolage
      return false
    rescue JobError => ex
      @logger.error ex.message
-     return true
+     return false
    rescue Exception => ex
      @logger.exception ex
      return true
@@ -126,7 +126,11 @@ module Bricolage
      raise
    rescue JobError => ex
      ctl.open {
-       ctl.abort_job job_id, 'error', ex.message.lines.first.strip
+       fail_count = @task.failure_count
+       final_retry = (fail_count >= MAX_RETRY)
+       retry_msg = (fail_count > 0) ? "(retry\##{fail_count}#{final_retry ? ' FINAL' : ''}) " : ''
+       ctl.abort_job job_id, 'error', retry_msg + ex.message.lines.first.strip
+       raise JobCancelled, "retry count exceeds limit: task_id=#{@task_id}" if final_retry
      }
      raise
    rescue Exception => ex
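This hunk implements the 0.16.x retry cap from the release notes: once the task's failure count reaches `MAX_RETRY`, the abort message is tagged `FINAL` and `JobCancelled` is raised so the task is not retried again. The message formatting can be read in isolation with the sketch below (the value of MAX_RETRY and the sample inputs are assumptions for illustration, not taken from the gem):

```ruby
# Hedged sketch of the retry-message formatting above; MAX_RETRY = 2 and the
# sample failure counts / error text are illustrative assumptions.
MAX_RETRY = 2

def abort_message(fail_count, error_message)
  final_retry = (fail_count >= MAX_RETRY)
  retry_msg = (fail_count > 0) ? "(retry\##{fail_count}#{final_retry ? ' FINAL' : ''}) " : ''
  retry_msg + error_message.lines.first.strip
end

puts abort_message(0, "load failed: relation does not exist\n")
# => load failed: relation does not exist
puts abort_message(1, "load failed: relation does not exist\n")
# => (retry#1) load failed: relation does not exist
puts abort_message(2, "load failed: relation does not exist\n")
# => (retry#2 FINAL) load failed: relation does not exist
```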
@@ -35,7 +35,7 @@ module Bricolage
    end
 
    def JobParams.resolve_schema(ctx, schema)
-     ctx.global_variables["#{schema}_schema"] || schema
+     ctx.global_variables.get_force("#{schema}_schema") || schema
    end
    private_class_method :resolve_schema
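This is the 0.15.1 change from the release notes: schema resolution no longer raises "no such variable: XXXX_schema" when the variable is undefined, and instead falls back to the schema name given in the parameters table. A hash-based imitation of that fallback (Bricolage's variable API is not shown in this diff, so the Hash and the sample variables below are stand-ins):

```ruby
# Illustration only: a plain Hash stands in for Bricolage's global variable
# table, and the variable names/values are made up.
def resolve_schema(global_variables, schema)
  global_variables["#{schema}_schema"] || schema
end

vars = { "pv_schema" => "pageview_history" }
puts resolve_schema(vars, "pv")    # => pageview_history  (variable defined: remapped)
puts resolve_schema(vars, "dwh")   # => dwh               (no dwh_schema variable: name used as-is)
```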
@@ -1,3 +1,5 @@
+ require 'aws-sdk-s3'
+
  module Bricolage
 
    module StreamingLoad
@@ -149,12 +149,15 @@ module Bricolage
 
    class NoopJob
 
-     def initialize(context:, ctl_ds:, task_id:, force: false, logger:)
+     def initialize(context:, ctl_ds:, data_ds:, log_table:, task_id:, force: false, logger:)
        @ctx = context
        @ctl_ds = ctl_ds
+       @data_ds = data_ds
+       @log_table = log_table
        @task_id = task_id
        @force = force
        @logger = logger
+       @working_dir = Dir.getwd
      end
 
      def execute(fail_fast: false)
@@ -164,7 +167,7 @@ module Bricolage
      end
 
      def execute_task
-       @logger.info "execute_task: task_id=#{@task_id} force=#{@force} ctx=#{@ctx.home_path} ctl_ds=#{@ctl_ds.name} dir=#{@working_dir}"
+       @logger.info "execute_task: task_id=#{@task_id} force=#{@force} ctx=#{@ctx.home_path} ctl_ds=#{@ctl_ds.name} data_ds=#{@data_ds.name} dir=#{@working_dir}"
      end
    end
  end