embulk-output-vertica 0.4.1 → 0.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: dd4013fbcdb34ce19df2fe94eb19b226d92d6b83
4
- data.tar.gz: 5e99ffef341da9c253a6f5b6420239c7e2000999
3
+ metadata.gz: 14be9147838a9b3e7e7c7ace08b5d26b491538cf
4
+ data.tar.gz: 1f53ce38472f8c015e38e2b502cb652dfcaeef6e
5
5
  SHA512:
6
- metadata.gz: dabf1e7a0a5b6767dbd7352c42e777b37012e3172ef46024cb515298f8b734a8e64aa6d0b685b7fa920e41821a9351183ab9a7764ecb0a0790cf0190055ff563
7
- data.tar.gz: bc761f55280145dde00983580b960af45b03e50f4ce32f4a6338cc8154a7b76d7f78280657828f5780b60f2370987415d6d3ba678535814f6e75df875df18a67
6
+ metadata.gz: f265d51ec0ffc498cbeaf66f6b86622d2cce48f2d9d5cac846250930bec95376b3dde1069f5aff12fd3c89a16d50172c3faed476652fd1cae9ac9d7eb582a04c
7
+ data.tar.gz: 9eebc8e2346c03d908c9ca2c735623a6890d2e6ae852c3083efef543694b905e6d9d3babd24e1c7a9ced34433e13ee3e42a1bd0e28babfab6ab070e8009e8ead
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ # 0.5.0 (2015/12/04)
2
+
3
+ Changes:
4
+
5
+ * Use thread pool instead of connection pool #13
6
+
1
7
  # 0.4.1 (2015/12/04)
2
8
 
3
9
  Fixes:
data/README.md CHANGED
@@ -18,8 +18,7 @@
18
18
  - **table**: table name (string, required)
19
19
  - **mode**: "insert" or "replace". See below. (string, default: insert)
20
20
  - **copy_mode**: specifies how data is loaded into the database. See vertica documents for details. (`AUTO`, `DIRECT`, or `TRICKLE`. default: `AUTO`)
21
- - **pool**: number of connection pools, this number controls number of concurrency to issue COPY statements (integer, default: processor_count, that is, number of threads in input plugin)
22
- - **pool_timeout**: timeout to checkout a connection from connection pools (seconds, default: 600)
21
+ - **pool**: number of output threads; this controls how many COPY statements are issued concurrently (integer, default: processor_count, that is, the number of threads in the input plugin)
23
22
  - **abort_on_error**: stops the COPY command if a row is rejected and rolls back the command. No data is loaded. (bool, default: false)
24
23
  - **reject_on_materialized_type_error**: uses `reject_on_materialized_type_error` option for fjsonparser(). This rejects rows if any of column types and value types do not fit, ex) double value into INT column fails. See vertica documents for details. (bool, default: false)
25
24
  - **default_timezone**: the default timezone for column_options (string, default is "UTC")
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-output-vertica"
3
- spec.version = "0.4.1"
3
+ spec.version = "0.5.0"
4
4
  spec.authors = ["eiji.sekiya", "Naotoshi Seo"]
5
5
  spec.email = ["eiji.sekiya.0326@gmail.com", "sonots@gmail.com"]
6
6
  spec.summary = "Vertica output plugin for Embulk"
@@ -15,7 +15,6 @@ Gem::Specification.new do |spec|
15
15
 
16
16
  spec.add_dependency "jvertica", "~> 0.2"
17
17
  spec.add_dependency "tzinfo"
18
- spec.add_dependency "connection_pool"
19
18
  spec.add_development_dependency "bundler", "~> 1.7"
20
19
  spec.add_development_dependency "rake", "~> 10.0"
21
20
  end
@@ -0,0 +1,148 @@
1
+ module Embulk
2
+ module Output
3
+ class Vertica < OutputPlugin
4
# Fixed-size pool of OutputThread workers. Pages are handed to the workers
# in round-robin order.
class OutputThreadPool
  # @param task [Hash] plugin task; 'default_timezone' and 'column_options'
  #   are read here to build the value converters, and the whole hash is
  #   passed on to every OutputThread
  # @param schema [Object] schema handed to ValueConverterFactory and to
  #   every OutputThread
  # @param size [Integer] number of OutputThread workers to create
  def initialize(task, schema, size)
    @size = size
    # The converters are built once and shared by all workers.
    converters = ValueConverterFactory.create_converters(schema, task['default_timezone'], task['column_options'])
    @output_threads = size.times.map { OutputThread.new(task, schema, converters) }
    @current_index = 0
    # Vertica#add calls #enqueue from every plugin thread, so the
    # round-robin cursor must not be read and advanced concurrently.
    @mutex = Mutex.new
  end

  # Hands a page to the next worker in round-robin order.
  #
  # The cursor is advanced only after the worker accepted the page, and the
  # whole hand-over happens under the lock so that two callers can neither
  # pick the same slot nor push to the same worker at the same time.
  #
  # @param page [Object] page forwarded to OutputThread#enqueue
  # @return [Integer] index of the worker that receives the next page
  def enqueue(page)
    @mutex.synchronize do
      @output_threads[@current_index].enqueue(page)
      @current_index = (@current_index + 1) % @size
    end
  end

  # Starts every worker.
  #
  # @return [Array] whatever each OutputThread#start returned
  def start
    @output_threads.map(&:start)
  end

  # Commits every worker in turn and collects their reports.
  #
  # @return [Array] one task report per worker, in worker order
  def commit
    @output_threads.map(&:commit)
  end
end
25
+
26
# Worker that streams queued pages into one COPY statement on its own
# thread, using a connection obtained from Vertica.connect.
class OutputThread
  # @param task [Hash] plugin task; the keys read here are 'schema',
  #   'table', 'temp_table', 'copy_mode', 'abort_on_error' and
  #   'reject_on_materialized_type_error' (the hash is also given to
  #   Vertica.connect)
  # @param schema [Object] must respond to #names with the column names
  # @param converters [Hash] column name => callable converting one value
  def initialize(task, schema, converters)
    @task = task
    @schema = schema
    # Capacity of one: a producer blocks in #enqueue while a page is
    # already waiting to be picked up by the worker.
    @queue = SizedQueue.new(1)
    @converters = converters
    @num_input_rows = 0
    @num_output_rows = 0
    @num_rejected_rows = 0
    # True only while #run is able to consume the queue.
    @thread_active = false
  end

  # Queues a page for the worker thread.
  #
  # If the worker has stopped because of an error, that error is re-raised
  # here through Thread#join. If it has stopped normally the page is
  # ignored.
  #
  # @param page [Object] an enumerable of records
  def enqueue(page)
    unless @thread_active && @thread.alive?
      @thread.join # raise the same error raised inside thread
      return
    end
    Embulk.logger.trace { "embulk-output-vertica: enqueued" }
    @queue.push(page)
  end

  # Thread body: opens a connection, runs one COPY fed from the queue until
  # the 'finish' marker arrives, then commits. On java.sql.SQLDataException
  # the transaction is rolled back and the error re-raised, which ends the
  # thread.
  def run
    Embulk.logger.debug { "embulk-output-vertica: thread started" }
    Vertica.connect(@task) do |jv|
      json = nil # declared out here so the rescue below can log the last row
      begin
        num_output_rows, rejects = copy(jv, copy_sql) do |stdin|
          while page = @queue.pop
            if page == 'finish'
              Embulk.logger.debug { "embulk-output-vertica: thread finished" }
              break
            end
            Embulk.logger.trace { "embulk-output-vertica: dequeued" }

            page.each do |record|
              json = to_json(record)
              Embulk.logger.trace { "embulk-output-vertica: to_json #{json}" }
              stdin << json << "\n"
              @num_input_rows += 1
            end
          end
        end
        @num_output_rows += num_output_rows
        @num_rejected_rows += rejects.size
        jv.commit
        Embulk.logger.info { "embulk-output-vertica: COMMIT!" }
      rescue java.sql.SQLDataException => e
        jv.rollback
        if @task['reject_on_materialized_type_error'] && e.message =~ /Rejected by user-defined parser/
          Embulk.logger.warn "embulk-output-vertica: ROLLBACK! some of column types and values types do not fit #{json}"
        else
          Embulk.logger.warn "embulk-output-vertica: ROLLBACK!"
        end
        raise e # die transaction
      end
    end
  ensure
    # Nobody will pop from the queue any more. Without this, a producer
    # already blocked in @queue.push (queue full) would wait forever once
    # the worker has died.
    @thread_active = false
    drain_queue
  end

  # Launches the worker thread running #run.
  #
  # @return [Thread]
  def start
    @thread_active = true
    @thread = Thread.new(&method(:run))
  end

  # Tells the worker that no more pages will come, waits for it and
  # returns its counters.
  #
  # @return [Hash] 'num_input_rows', 'num_output_rows', 'num_rejected_rows'
  # @raise the error that terminated the worker thread, if any
  def commit
    @queue.push('finish') if @thread_active && @thread.alive?
    Thread.pass
    @thread.join # the same error with run would be raised at here

    {
      'num_input_rows' => @num_input_rows,
      'num_output_rows' => @num_output_rows,
      'num_rejected_rows' => @num_rejected_rows,
    }
  end

  # private

  # Logs the statement and runs it through conn.copy, passing the block on.
  #
  # @return whatever conn.copy returns; #run destructures it as
  #   (number of output rows, rejects)
  def copy(conn, sql, &block)
    Embulk.logger.debug "embulk-output-vertica: #{sql}"
    conn.copy(sql, &block)
  end

  # COPY statement targeting the temp table; built once and memoized.
  def copy_sql
    @copy_sql ||= "COPY #{quoted_schema}.#{quoted_temp_table} FROM STDIN#{fjsonparser}#{copy_mode}#{abort_on_error} NO COMMIT"
  end

  # Serializes one record as a JSON object keyed by column name, after
  # passing every value through that column's converter.
  def to_json(record)
    pairs = @schema.names.zip(record).map do |column_name, value|
      [column_name, @converters[column_name].call(value)]
    end
    Hash[pairs].to_json
  end

  def quoted_schema
    ::Jvertica.quote_identifier(@task['schema'])
  end

  def quoted_table
    ::Jvertica.quote_identifier(@task['table'])
  end

  def quoted_temp_table
    ::Jvertica.quote_identifier(@task['temp_table'])
  end

  # COPY clause fragment: the configured copy mode with a leading space.
  def copy_mode
    " #{@task['copy_mode']}"
  end

  # COPY clause fragment: ' ABORT ON ERROR' when enabled, otherwise empty.
  def abort_on_error
    @task['abort_on_error'] ? ' ABORT ON ERROR' : ''
  end

  # COPY clause fragment selecting fjsonparser with its option, if any.
  def fjsonparser
    " PARSER fjsonparser(#{reject_on_materialized_type_error})"
  end

  # Argument for fjsonparser(): the option text when enabled, else empty.
  def reject_on_materialized_type_error
    @task['reject_on_materialized_type_error'] ? 'reject_on_materialized_type_error=true' : ''
  end

  # Empties the queue without blocking; each pop frees a slot for a
  # producer waiting in @queue.push.
  def drain_queue
    loop { @queue.pop(true) }
  rescue ThreadError
    nil # the queue is empty
  end
end
146
+ end
147
+ end
148
+ end
@@ -1,6 +1,6 @@
1
1
  require 'jvertica'
2
- require 'connection_pool'
3
2
  require_relative 'vertica/value_converter_factory'
3
+ require_relative 'vertica/output_thread'
4
4
 
5
5
  module Embulk
6
6
  module Output
@@ -10,8 +10,8 @@ module Embulk
10
10
  class Error < StandardError; end
11
11
  class NotSupportedType < Error; end
12
12
 
13
- def self.connection_pool
14
- @connection_pool ||= @connection_pool_proc.call
13
+ def self.thread_pool
14
+ @thread_pool ||= @thread_pool_proc.call
15
15
  end
16
16
 
17
17
  def self.transaction(config, schema, processor_count, &control)
@@ -27,24 +27,14 @@ module Embulk
27
27
  'mode' => config.param('mode', :string, :default => 'insert'),
28
28
  'copy_mode' => config.param('copy_mode', :string, :default => 'AUTO'),
29
29
  'abort_on_error' => config.param('abort_on_error', :bool, :default => false),
30
- 'default_timezone' => config.param('default_timezone', :string, :default => 'UTC'),
30
+ 'default_timezone' => config.param('default_timezone', :string, :default => 'UTC'),
31
31
  'column_options' => config.param('column_options', :hash, :default => {}),
32
32
  'reject_on_materialized_type_error' => config.param('reject_on_materialized_type_error', :bool, :default => false),
33
33
  'pool' => config.param('pool', :integer, :default => processor_count),
34
- 'pool_timeout' => config.param('pool_timeout', :integer, :default => 600),
35
34
  }
36
- task['user'] ||= task['username']
37
35
 
38
- @connection_pool_proc = Proc.new do
39
- ConnectionPool.new(size: task['pool'], timeout: task['pool_timeout']) do
40
- ::Jvertica.connect({
41
- host: task['host'],
42
- port: task['port'],
43
- user: task['user'],
44
- password: task['password'],
45
- database: task['database'],
46
- })
47
- end
36
+ @thread_pool_proc = Proc.new do
37
+ OutputThreadPool.new(task, schema, task['pool'])
48
38
  end
49
39
 
50
40
  task['user'] ||= task['username']
@@ -73,7 +63,7 @@ module Embulk
73
63
  sql_schema_table = self.sql_schema_from_embulk_schema(schema, task['column_options'])
74
64
 
75
65
  # create the target table
76
- connection_pool.with do |jv|
66
+ connect(task) do |jv|
77
67
  query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_table}]) if task['mode'] == 'REPLACE'
78
68
  query(jv, %[CREATE TABLE IF NOT EXISTS #{quoted_schema}.#{quoted_table} (#{sql_schema_table})])
79
69
  end
@@ -81,7 +71,7 @@ module Embulk
81
71
  sql_schema_temp_table = self.sql_schema_from_table(task)
82
72
 
83
73
  # create a temp table
84
- connection_pool.with do |jv|
74
+ connect(task) do |jv|
85
75
  query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_temp_table}])
86
76
  query(jv, %[CREATE TABLE #{quoted_schema}.#{quoted_temp_table} (#{sql_schema_temp_table})])
87
77
  # Create the internal Vertica projection beforehand; otherwise, parallel copies lock the table to create a projection and we sometimes get an S Lock error
@@ -96,30 +86,21 @@ module Embulk
96
86
 
97
87
  begin
98
88
  # insert data into the temp table
99
- task_reports = yield(task) # obtain an array of task_reports where one report is of a task
100
- connection_pool.shutdown do |jv| # just don't know how to loop all connections
101
- jv.commit
102
- Embulk.logger.info { "embulk-output-vertica: COMMIT!" }
103
- jv.close rescue nil
104
- end
105
- @connection_pool = nil
89
+ thread_pool.start
90
+ yield(task)
91
+ task_reports = thread_pool.commit
106
92
  Embulk.logger.info { "embulk-output-vertica: task_reports: #{task_reports.to_json}" }
107
93
 
108
94
  # insert select from the temp table
109
- connection_pool.with do |jv|
95
+ connect(task) do |jv|
110
96
  query(jv, %[INSERT INTO #{quoted_schema}.#{quoted_table} SELECT * FROM #{quoted_schema}.#{quoted_temp_table}])
111
97
  jv.commit
112
98
  end
113
99
  ensure
114
- connection_pool.with do |jv|
100
+ connect(task) do |jv|
115
101
  # clean up the temp table
116
- Embulk.logger.debug { "embulk-output-vertica: select count #{query(jv, %[SELECT count(*) FROM #{quoted_schema}.#{quoted_temp_table}]).map {|row| row.to_h }.join("\n") rescue nil}" }
117
- Embulk.logger.trace { "embulk-output-vertica: select limit 10\n#{query(jv, %[SELECT * FROM #{quoted_schema}.#{quoted_temp_table} LIMIT 10]).map {|row| row.to_h }.join("\n") rescue nil}" }
118
102
  query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_temp_table}])
119
- end
120
-
121
- connection_pool.shutdown do |jv|
122
- jv.close rescue nil
103
+ Embulk.logger.debug { "embulk-output-vertica: select result\n#{query(jv, %[SELECT * FROM #{quoted_schema}.#{quoted_table} LIMIT 10]).map {|row| row.to_h }.join("\n") rescue nil}" }
123
104
  end
124
105
  end
125
106
  # this is for the -o next_config option; add some parameters for the next execution if wanted
@@ -130,46 +111,15 @@ module Embulk
130
111
  # instance is created on each thread
131
112
  def initialize(task, schema, index)
132
113
  super
133
- @converters = ValueConverterFactory.create_converters(schema, task['default_timezone'], task['column_options'])
134
- Embulk.logger.trace { @converters.to_s }
135
- @num_input_rows = 0
136
- @num_output_rows = 0
137
- @num_rejected_rows = 0
138
- end
139
-
140
- def connection_pool
141
- self.class.connection_pool
142
114
  end
143
115
 
116
+ # called for each page in each thread
144
117
  def close
145
- # do not close connection_pool on each thread / page
146
118
  end
147
119
 
120
+ # called for each page in each thread
148
121
  def add(page)
149
- connection_pool.with do |jv| # block if no available connection left
150
- json = nil # for log
151
- begin
152
- num_output_rows, rejects = copy(jv, copy_sql) do |stdin|
153
- page.each do |record|
154
- json = to_json(record)
155
- Embulk.logger.debug { "embulk-output-vertica: to_json #{json}" }
156
- stdin << json << "\n"
157
- @num_input_rows += 1
158
- end
159
- end
160
- num_rejected_rows = rejects.size
161
- @num_output_rows += num_output_rows
162
- @num_rejected_rows += num_rejected_rows
163
- rescue java.sql.SQLDataException => e
164
- jv.rollback
165
- if @task['reject_on_materialized_type_error'] and e.message =~ /Rejected by user-defined parser/
166
- Embulk.logger.warn "embulk-output-vertica: ROLLBACK! some of column types and values types do not fit #{json}"
167
- else
168
- Embulk.logger.warn "embulk-output-vertica: ROLLBACK!"
169
- end
170
- raise e # die transaction
171
- end
172
- end
122
+ self.class.thread_pool.enqueue(page)
173
123
  end
174
124
 
175
125
  def finish
@@ -178,19 +128,33 @@ module Embulk
178
128
  def abort
179
129
  end
180
130
 
181
- # this is called after processing all pages in a thread
182
- # we do commit on #transaction for all connection pools, not at here
131
+ # called after processing all pages in each thread
132
+ # we commit in #transaction for the whole thread pool, not here
183
133
  def commit
184
- Embulk.logger.debug { "embulk-output-vertica: #{@num_output_rows} rows" }
185
- task_report = {
186
- "num_input_rows" => @num_input_rows,
187
- "num_output_rows" => @num_output_rows,
188
- "num_rejected_rows" => @num_rejected_rows,
189
- }
134
+ {}
190
135
  end
191
136
 
192
137
  private
193
138
 
139
# Opens a connection with ::Jvertica.connect using the task's 'host',
# 'port', 'user', 'password' and 'database' entries.
#
# Without a block the open connection is returned and the caller has to
# close it. With a block the connection is yielded and closed afterwards,
# whether or not the block raised.
#
# @param task [Hash] plugin task holding the connection settings
# @yield [jv] the open connection
# @return the connection (already closed when a block was given)
def self.connect(task)
  jv = ::Jvertica.connect({
    host: task['host'],
    port: task['port'],
    user: task['user'],
    password: task['password'],
    database: task['database'],
  })
  return jv unless block_given?

  completed = false
  begin
    yield jv
    completed = true
  ensure
    begin
      jv.close
    rescue StandardError => close_error
      # A close failure surfaces only when the block itself completed.
      # Otherwise it would replace the error that is already propagating
      # from the block, so it is logged instead.
      raise if completed
      Embulk.logger.warn "embulk-output-vertica: failed to close connection: #{close_error.message}"
    end
  end
  jv
end
157
+
194
158
  # @param [Schema] schema embulk defined column types
195
159
  # @param [Hash] column_options user defined column types
196
160
  # @return [String] sql schema used to CREATE TABLE
@@ -224,7 +188,7 @@ module Embulk
224
188
  "WHERE table_schema = #{quoted_schema} AND table_name = #{quoted_table}"
225
189
 
226
190
  sql_schema = {}
227
- connection_pool.with do |jv|
191
+ connect(task) do |jv|
228
192
  result = query(jv, sql)
229
193
  sql_schema = result.map {|row| [row[0], row[1]] }
230
194
  end
@@ -239,49 +203,6 @@ module Embulk
239
203
  def query(conn, sql)
240
204
  self.class.query(conn, sql)
241
205
  end
242
-
243
- def copy(conn, sql, &block)
244
- Embulk.logger.debug "embulk-output-vertica: #{sql}"
245
- results, rejects = conn.copy(sql, &block)
246
- end
247
-
248
- def copy_sql
249
- @copy_sql ||= "COPY #{quoted_schema}.#{quoted_temp_table} FROM STDIN#{fjsonparser}#{copy_mode}#{abort_on_error} NO COMMIT"
250
- end
251
-
252
- def to_json(record)
253
- Hash[*(schema.names.zip(record).map do |column_name, value|
254
- [column_name, @converters[column_name].call(value)]
255
- end.flatten!(1))].to_json
256
- end
257
-
258
- def quoted_schema
259
- ::Jvertica.quote_identifier(@task['schema'])
260
- end
261
-
262
- def quoted_table
263
- ::Jvertica.quote_identifier(@task['table'])
264
- end
265
-
266
- def quoted_temp_table
267
- ::Jvertica.quote_identifier(@task['temp_table'])
268
- end
269
-
270
- def copy_mode
271
- " #{@task['copy_mode']}"
272
- end
273
-
274
- def abort_on_error
275
- @task['abort_on_error'] ? ' ABORT ON ERROR' : ''
276
- end
277
-
278
- def fjsonparser
279
- " PARSER fjsonparser(#{reject_on_materialized_type_error})"
280
- end
281
-
282
- def reject_on_materialized_type_error
283
- @task['reject_on_materialized_type_error'] ? 'reject_on_materialized_type_error=true' : ''
284
- end
285
206
  end
286
207
  end
287
208
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-vertica
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - eiji.sekiya
@@ -39,20 +39,6 @@ dependencies:
39
39
  version: '0'
40
40
  prerelease: false
41
41
  type: :runtime
42
- - !ruby/object:Gem::Dependency
43
- name: connection_pool
44
- version_requirements: !ruby/object:Gem::Requirement
45
- requirements:
46
- - - ">="
47
- - !ruby/object:Gem::Version
48
- version: '0'
49
- requirement: !ruby/object:Gem::Requirement
50
- requirements:
51
- - - ">="
52
- - !ruby/object:Gem::Version
53
- version: '0'
54
- prerelease: false
55
- type: :runtime
56
42
  - !ruby/object:Gem::Dependency
57
43
  name: bundler
58
44
  version_requirements: !ruby/object:Gem::Requirement
@@ -99,6 +85,7 @@ files:
99
85
  - example.csv
100
86
  - example.yml
101
87
  - lib/embulk/output/vertica.rb
88
+ - lib/embulk/output/vertica/output_thread.rb
102
89
  - lib/embulk/output/vertica/value_converter_factory.rb
103
90
  homepage: https://github.com/eratostennis/embulk-output-vertica
104
91
  licenses: