bricolage-streamingload 0.14.2 → 0.16.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 01102c31f0c9f92fb7d6653c130f2a06633cf1fc
4
- data.tar.gz: 4b160bb1a4c176faa0451319d94c8d6aa2404f85
2
+ SHA256:
3
+ metadata.gz: 3374ca3db7400fe9dc36658bf78b3d6516f0acaf4488eb81df1d8054196b055f
4
+ data.tar.gz: 96735f5f850c2769b6031dc9472e27406ba79c9087d9333ac3bc2cddd0113ebc
5
5
  SHA512:
6
- metadata.gz: cd84fa57a717eda77e19710004df60ee780311e3ef89d8668f1d65ef0b67455911339194dae90c914e2c60f56eece109d2ca460998ad918a0d554a273aa5b950
7
- data.tar.gz: 7e981e9c417716f0b5b51a90b442596849c906f62184222b4b2dd9347f3e4c510e2400748f93989ab60b46a95974b5842d5b557be721ac01362f3a410e25a678
6
+ metadata.gz: 0b3a4caedabfea4579ad55942fb9e469821e9a82c50f7e09f782e37df00602ecb419ca47848339f0094dfeca3dc271337737d5fc0390c6e740e364c3dd84c174
7
+ data.tar.gz: 741586ac3cf7a4b4b5983af0a5775db34bf768a6b8a321347fd51a2970efc516ab5eb8516bdb3ecc68ca2ad82a4b923188eaba13e922b67ce06f953323232e40
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+
4
+ /config/development/
5
+ /config/test/
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source "https://rubygems.org"
2
+ gemspec
@@ -0,0 +1,54 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ bricolage-streamingload (0.15.2)
5
+ aws-sdk-s3 (~> 1.8)
6
+ aws-sdk-sqs (~> 1.3)
7
+ bricolage (>= 5.29.2)
8
+ pg (~> 0.18.0)
9
+
10
+ GEM
11
+ remote: https://rubygems.org/
12
+ specs:
13
+ aws-eventstream (1.0.1)
14
+ aws-partitions (1.102.0)
15
+ aws-sdk-core (3.25.0)
16
+ aws-eventstream (~> 1.0)
17
+ aws-partitions (~> 1.0)
18
+ aws-sigv4 (~> 1.0)
19
+ jmespath (~> 1.0)
20
+ aws-sdk-kms (1.7.0)
21
+ aws-sdk-core (~> 3)
22
+ aws-sigv4 (~> 1.0)
23
+ aws-sdk-s3 (1.17.1)
24
+ aws-sdk-core (~> 3, >= 3.21.2)
25
+ aws-sdk-kms (~> 1)
26
+ aws-sigv4 (~> 1.0)
27
+ aws-sdk-sns (1.3.0)
28
+ aws-sdk-core (~> 3)
29
+ aws-sigv4 (~> 1.0)
30
+ aws-sdk-sqs (1.4.0)
31
+ aws-sdk-core (~> 3)
32
+ aws-sigv4 (~> 1.0)
33
+ aws-sigv4 (1.0.3)
34
+ bricolage (5.29.2)
35
+ aws-sdk-s3 (~> 1)
36
+ aws-sdk-sns (~> 1)
37
+ pg (~> 0.18.0)
38
+ jmespath (1.4.0)
39
+ pg (0.18.4)
40
+ power_assert (1.1.3)
41
+ rake (12.3.1)
42
+ test-unit (3.2.8)
43
+ power_assert
44
+
45
+ PLATFORMS
46
+ ruby
47
+
48
+ DEPENDENCIES
49
+ bricolage-streamingload!
50
+ rake
51
+ test-unit
52
+
53
+ BUNDLED WITH
54
+ 1.16.1
@@ -0,0 +1,21 @@
1
+ The MIT License
2
+
3
+ Copyright (c) 2016 Minero Aoki, Shimpei Kodama
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
@@ -0,0 +1,131 @@
1
+ # Bricolage Streaming Load Release Note
2
+
3
+ ## version 0.16.1
4
+
5
+ - [fix] Stop retrying after 2 retries (3 attempts in total).
6
+
7
+ ## version 0.16.0
8
+
9
+ - [new] Retry error tasks as well, not only failure tasks.
10
+ Error tasks are assumed to be "non-retriable" (e.g. DB login error), but some of them can actually be resolved
11
+ by retrying in an ECS environment. Since retrying is not expensive (and human operation is relatively expensive),
12
+ we'll retry on all failures and errors.
13
+
14
+ ## version 0.15.2
15
+
16
+ - [fix] send-data-event: Send objectSize=0 to indicate this message is fake.
17
+
18
+ ## version 0.15.1
19
+
20
+ - [new] Use the schema name in the parameters table by default (no more "no such variable: XXXX_schema" error).
21
+
22
+ ## version 0.15.0
23
+
24
+ - [CHANGE] Upgrade AWS-SDK to v3. This version requires at least Bricolage 5.25.
25
+
26
+ ## version 0.14.2
27
+
28
+ - Reduces loading retry count
29
+
30
+ ## version 0.14.1
31
+
32
+ - [fix] task logging did not work
33
+
34
+ ## version 0.14.0
35
+
36
+ - [new] Logs (task - object) relationships to S3 on dispatch, as a DB backup.
37
+
38
+ ## version 0.13.0
39
+
40
+ - grand refactoring
41
+
42
+ ## version 0.12.0
43
+
44
+ - [CHANGE] Adds task_id column to the log table (strload_load_logs).
45
+
46
+ ## version 0.11.0
47
+
48
+ - Loosen dependent Bricolage version
49
+
50
+ ## version 0.10.2
51
+
52
+ - [new] New parameter dispatch-interval.
53
+
54
+ ## version 0.10.1
55
+
56
+ - [fix] Fixes simple variable ref bug.
57
+
58
+ ## version 0.10.0
59
+
60
+ - [new] Automatically complement strload_jobs status with Redshift-side log table.
61
+
62
+ ## version 0.9.0
63
+
64
+ - [new] Introduces Redshift-side load log table (strload_load_logs) for load duplication checking.
65
+
66
+ ## version 0.8.1
67
+
68
+ - [fix] tmp: Do not retry on data connection failure.
69
+
70
+ ## version 0.8.0
71
+
72
+ - [CHANGE] Loader retries failed load tasks automatically. Streaming loader does NOT delete a task from the queue on job failures.
73
+ - [enhancement] Rewrites loader to get better error handling (ensures that a log record is written in a wider range of erroneous situations).
74
+
75
+ ## version 0.7.1
76
+
77
+ - fix utilities
78
+
79
+ ## version 0.7.0
80
+
81
+ - [CHANGE] SQS data source requires "region" attribute.
82
+
83
+ ## version 0.6.2
84
+
85
+ - [new] AWS access key id and secret key are now optional for SQS data sources (to allow using EC2 instance or ECS task attached IAM roles).
86
+ - [new] New utility commands send-data-event, send-shutdown, send-checkpoint, send-load-task.
87
+ - Adds sample config files.
88
+
89
+ ## version 0.6.1
90
+
91
+ - [fix] dispatcher: Default ctl data source was wrong.
92
+ - [fix] dispatcher: Detects S3 events by "s3" attribute instead of "eventSource" attribute, to allow fake S3 events (from non-S3 system).
93
+ - [fix] dispatcher: SNS alert is now optional.
94
+ - [fix] dispatcher: Correctly deletes unknown format messages.
95
+ - [enhancement] Adds more logging messages.
96
+
97
+ ## version 0.6.0
98
+
99
+ - [CHANGE] Adds loaded column to strload_objects table to record if the object is really loaded or not.
100
+ - [CHANGE] Now strload_objects' object_url is unique. Duplicated objects are stored in another table, strload_dup_objects.
101
+ - [CHANGE] Now strload_table has table_id column, which is the primary key.
102
+ - [new] Loader daemon supports new command line option --working-dir, to support symbolic linked path, such as Capistrano deploy target (current/).
103
+ - [new] Keeps Redshift manifest file for later inspection.
104
+ - [enhancement] Reduces the number of Redshift writer transactions (1 transaction for 1 loading).
105
+ - [enhancement] Delay dispatching tasks until current event batch is processed, to avoid unexpected visibility timeout.
106
+ - [enhancement] Adds more logging messages.
107
+
108
+ ## version 0.5.1
109
+
110
+ - [fix] Fixes slow query
111
+
112
+ ## version 0.5.0
113
+
114
+ - [new] Introduces FLUSHTABLE dispatcher event
115
+
116
+ ## version 0.4.0
117
+
118
+ - [new] Introduces CHECKPOINT dispatcher event
119
+
120
+ ## version 0.3.0
121
+
122
+ - [new] Supports SNS notification
123
+
124
+ ## version 0.2.0
125
+
126
+ - not released
127
+ - [fix] Fixes async delete timing
128
+
129
+ ## version 0.1.0
130
+
131
+ - 2016-07-13 works 1 month
@@ -0,0 +1,3 @@
1
+ task :test do
2
+ load "#{__dir__}/test/all.rb"
3
+ end
@@ -28,7 +28,7 @@ ARGF.each do |line|
28
28
  },
29
29
  object: {
30
30
  key: key,
31
- size: 1
31
+ size: 0
32
32
  }
33
33
  },
34
34
  noDispatch: no_dispatch
@@ -0,0 +1,26 @@
1
+ require_relative 'lib/bricolage/streamingload/version'
2
+
3
+ Gem::Specification.new do |s|
4
+ s.platform = Gem::Platform::RUBY
5
+ s.name = 'bricolage-streamingload'
6
+ s.version = Bricolage::StreamingLoad::VERSION
7
+ s.summary = 'Bricolage Streaming Load Daemon'
8
+ s.description = 'Bricolage Streaming Load Daemon loads S3 data files to Redshift continuously.'
9
+ s.license = 'MIT'
10
+
11
+ s.author = ['Minero Aoki', 'Shimpei Kodama']
12
+ s.email = ['aamine@loveruby.net']
13
+ s.homepage = 'https://github.com/aamine/bricolage-streamingload'
14
+
15
+ s.files = `git ls-files -z`.split("\x0").reject {|f| f.match(%r{^(test|spec|features)/}) }
16
+ s.executables = s.files.grep(%r{bin/}).map {|path| File.basename(path) }
17
+ s.require_path = 'lib'
18
+
19
+ s.required_ruby_version = '>= 2.3.0'
20
+ s.add_dependency 'bricolage', '>= 5.29.2'
21
+ s.add_dependency 'pg', '~> 0.18.0'
22
+ s.add_dependency 'aws-sdk-s3', '~> 1.8'
23
+ s.add_dependency 'aws-sdk-sqs', '~> 1.3'
24
+ s.add_development_dependency 'rake'
25
+ s.add_development_dependency 'test-unit'
26
+ end
@@ -0,0 +1,66 @@
1
+ ### Databases
2
+
3
+ db_ctl:
4
+ type: psql
5
+ host: localhost
6
+ port: 5432
7
+ database: bricolage
8
+ username: bricolage
9
+ # Get password from password.yml
10
+ password: <%= password 'postgres_bricolage_password' %>
11
+ encoding: utf8
12
+
13
+ db_data: &db_data
14
+ type: psql
15
+ host: redshift.host
16
+ port: 5439
17
+ database: production
18
+ username: bricolage
19
+ # Get password from password.yml
20
+ password: <%= password 'redshift_bricolage_password' %>
21
+ encoding: utf8
22
+
23
+ sql:
24
+ <<: *db_data
25
+
26
+ ### SQS
27
+
28
+ sqs_event:
29
+ type: sqs
30
+ region: ap-northeast-1
31
+ url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-events
32
+ max_number_of_messages: 10
33
+ visibility_timeout: 600
34
+ wait_time_seconds: 10
35
+ # Enable the following lines if you use an access key explicitly.
36
+ # Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
37
+ #access_key_id: "<%%= password 'aws_access_key_id' %>"
38
+ #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
39
+
40
+ sqs_task:
41
+ type: sqs
42
+ region: ap-northeast-1
43
+ url: https://sqs.ap-northeast-1.amazonaws.com/111111111111/bricolage-tasks
44
+ max_number_of_messages: 1
45
+ visibility_timeout: 1800
46
+ wait_time_seconds: 10
47
+ # Enable the following lines if you use an access key explicitly.
48
+ # Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
49
+ #access_key_id: "<%%= password 'aws_access_key_id' %>"
50
+ #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
51
+
52
+ ### S3
53
+
54
+ s3_ctl: &s3_ctl
55
+ type: s3
56
+ region: ap-northeast-1
57
+ endpoint: s3-ap-northeast-1.amazonaws.com
58
+ bucket: bricolagectl.ap-northeast-1
59
+ prefix: development/strload
60
+ # Enable the following lines if you use an access key explicitly.
61
+ # Otherwise Bricolage uses EC2 instance or ECS task attached IAM role.
62
+ #access_key_id: "<%%= password 'aws_access_key_id' %>"
63
+ #secret_access_key: "<%%= password 'aws_secret_access_key' %>"
64
+
65
+ s3:
66
+ <<: *s3_ctl
@@ -0,0 +1,5 @@
1
+ # Never commit this file
2
+ redshift_bricolage_password: xxxxxxxxxxx
3
+ postgres_bricolage_password: xxxxxxxxxxx
4
+ aws_access_key_id: "AKIAAAAAAAAAAAAAAAAA"
5
+ aws_secret_access_key: "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
@@ -0,0 +1,20 @@
1
+ #event-queue-ds: sqs_event
2
+ #task-queue-ds: sqs_task
3
+
4
+ #ctl-postgres-ds: db_ctl
5
+ #ctl-s3-ds: s3_ctl
6
+
7
+ #redshift-ds: db_data
8
+ #log-table: strload_load_logs
9
+
10
+ #dispatch-interval: 60
11
+
12
+ # To enable SNS notification
13
+ #sns-ds: sns
14
+ #alert-level: warn
15
+
16
+ url_patterns:
17
+ -
18
+ url: "s3://some-log-bucket/\\w{4}\\.\\w+?\\.\\w+?\\.(?<schema>\\w+)\\.(?<table>\\w+)/\\d{4}/\\d{2}/\\d{2}/.*\\.gz"
19
+ schema: "%schema"
20
+ table: "%table"
@@ -0,0 +1,5 @@
1
+ defaults:
2
+ redshift-ds: db_data
3
+ ctl-ds: s3_ctl
4
+
5
+ dwh_schema: dwh
@@ -1,6 +1,6 @@
1
1
  require 'bricolage/datasource'
2
2
  require 'securerandom'
3
- require 'aws-sdk'
3
+ require 'aws-sdk-sqs'
4
4
  require 'json'
5
5
  require 'time'
6
6
 
@@ -9,7 +9,6 @@ require 'bricolage/streamingload/chunkrouter'
9
9
  require 'bricolage/streamingload/chunkbuffer'
10
10
  require 'bricolage/streamingload/loadtasklogger'
11
11
  require 'bricolage/streamingload/alertinglogger'
12
- require 'aws-sdk'
13
12
  require 'yaml'
14
13
  require 'optparse'
15
14
  require 'fileutils'
@@ -63,7 +63,7 @@ module Bricolage
63
63
  return false
64
64
  rescue JobError => ex
65
65
  @logger.error ex.message
66
- return true
66
+ return false
67
67
  rescue Exception => ex
68
68
  @logger.exception ex
69
69
  return true
@@ -126,7 +126,11 @@ module Bricolage
126
126
  raise
127
127
  rescue JobError => ex
128
128
  ctl.open {
129
- ctl.abort_job job_id, 'error', ex.message.lines.first.strip
129
+ fail_count = @task.failure_count
130
+ final_retry = (fail_count >= MAX_RETRY)
131
+ retry_msg = (fail_count > 0) ? "(retry\##{fail_count}#{final_retry ? ' FINAL' : ''}) " : ''
132
+ ctl.abort_job job_id, 'error', retry_msg + ex.message.lines.first.strip
133
+ raise JobCancelled, "retry count exceeds limit: task_id=#{@task_id}" if final_retry
130
134
  }
131
135
  raise
132
136
  rescue Exception => ex
@@ -35,7 +35,7 @@ module Bricolage
35
35
  end
36
36
 
37
37
  def JobParams.resolve_schema(ctx, schema)
38
- ctx.global_variables["#{schema}_schema"] || schema
38
+ ctx.global_variables.get_force("#{schema}_schema") || schema
39
39
  end
40
40
  private_class_method :resolve_schema
41
41
 
@@ -1,3 +1,5 @@
1
+ require 'aws-sdk-s3'
2
+
1
3
  module Bricolage
2
4
 
3
5
  module StreamingLoad
@@ -149,12 +149,15 @@ module Bricolage
149
149
 
150
150
  class NoopJob
151
151
 
152
- def initialize(context:, ctl_ds:, task_id:, force: false, logger:)
152
+ def initialize(context:, ctl_ds:, data_ds:, log_table:, task_id:, force: false, logger:)
153
153
  @ctx = context
154
154
  @ctl_ds = ctl_ds
155
+ @data_ds = data_ds
156
+ @log_table = log_table
155
157
  @task_id = task_id
156
158
  @force = force
157
159
  @logger = logger
160
+ @working_dir = Dir.getwd
158
161
  end
159
162
 
160
163
  def execute(fail_fast: false)
@@ -164,7 +167,7 @@ module Bricolage
164
167
  end
165
168
 
166
169
  def execute_task
167
- @logger.info "execute_task: task_id=#{@task_id} force=#{@force} ctx=#{@ctx.home_path} ctl_ds=#{@ctl_ds.name} dir=#{@working_dir}"
170
+ @logger.info "execute_task: task_id=#{@task_id} force=#{@force} ctx=#{@ctx.home_path} ctl_ds=#{@ctl_ds.name} data_ds=#{@data_ds.name} dir=#{@working_dir}"
168
171
  end
169
172
 
170
173
  end