embulk-output-vertica 0.4.1 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +1 -2
- data/embulk-output-vertica.gemspec +1 -2
- data/lib/embulk/output/vertica/output_thread.rb +148 -0
- data/lib/embulk/output/vertica.rb +40 -119
- metadata +2 -15
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 14be9147838a9b3e7e7c7ace08b5d26b491538cf
|
4
|
+
data.tar.gz: 1f53ce38472f8c015e38e2b502cb652dfcaeef6e
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: f265d51ec0ffc498cbeaf66f6b86622d2cce48f2d9d5cac846250930bec95376b3dde1069f5aff12fd3c89a16d50172c3faed476652fd1cae9ac9d7eb582a04c
|
7
|
+
data.tar.gz: 9eebc8e2346c03d908c9ca2c735623a6890d2e6ae852c3083efef543694b905e6d9d3babd24e1c7a9ced34433e13ee3e42a1bd0e28babfab6ab070e8009e8ead
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -18,8 +18,7 @@
|
|
18
18
|
- **table**: table name (string, required)
|
19
19
|
- **mode**: "insert", or "replace". See below. (string, default: insert)
|
20
20
|
- **copy_mode**: specifies how data is loaded into the database. See vertica documents for details. (`AUTO`, `DIRECT`, or `TRICKLE`. default: `AUTO`)
|
21
|
-
- **pool**: number of
|
22
|
-
- **pool_timeout**: timeout to checkout a connection from connection pools (seconds, default: 600)
|
21
|
+
- **pool**: number of output threads; this controls how many COPY statements are issued concurrently (integer, default: processor_count, that is, the number of threads in the input plugin)
|
23
22
|
- **abort_on_error**: stops the COPY command if a row is rejected and rolls back the command. No data is loaded. (bool, default: false)
|
24
23
|
- **reject_on_materialized_type_error**: uses `reject_on_materialized_type_error` option for fjsonparser(). This rejects rows if any of column types and value types do not fit, ex) double value into INT column fails. See vertica documents for details. (bool, default: false)
|
25
24
|
- **default_timezone**: the default timezone for column_options (string, default is "UTC")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-vertica"
|
3
|
-
spec.version = "0.
|
3
|
+
spec.version = "0.5.0"
|
4
4
|
spec.authors = ["eiji.sekiya", "Naotoshi Seo"]
|
5
5
|
spec.email = ["eiji.sekiya.0326@gmail.com", "sonots@gmail.com"]
|
6
6
|
spec.summary = "Vertica output plugin for Embulk"
|
@@ -15,7 +15,6 @@ Gem::Specification.new do |spec|
|
|
15
15
|
|
16
16
|
spec.add_dependency "jvertica", "~> 0.2"
|
17
17
|
spec.add_dependency "tzinfo"
|
18
|
-
spec.add_dependency "connection_pool"
|
19
18
|
spec.add_development_dependency "bundler", "~> 1.7"
|
20
19
|
spec.add_development_dependency "rake", "~> 10.0"
|
21
20
|
end
|
@@ -0,0 +1,148 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Output
|
3
|
+
class Vertica < OutputPlugin
|
4
|
+
class OutputThreadPool
|
5
|
+
def initialize(task, schema, size)
|
6
|
+
@size = size
|
7
|
+
converters = ValueConverterFactory.create_converters(schema, task['default_timezone'], task['column_options'])
|
8
|
+
@output_threads = size.times.map { OutputThread.new(task, schema, converters) }
|
9
|
+
@current_index = 0
|
10
|
+
end
|
11
|
+
|
12
|
+
def enqueue(page)
|
13
|
+
@output_threads[@current_index].enqueue(page)
|
14
|
+
@current_index = (@current_index + 1) % @size
|
15
|
+
end
|
16
|
+
|
17
|
+
def start
|
18
|
+
@size.times.map {|i| @output_threads[i].start }
|
19
|
+
end
|
20
|
+
|
21
|
+
def commit
|
22
|
+
task_reports = @size.times.map {|i| @output_threads[i].commit }
|
23
|
+
end
|
24
|
+
end
|
25
|
+
|
26
|
+
class OutputThread
|
27
|
+
def initialize(task, schema, converters)
|
28
|
+
@task = task
|
29
|
+
@schema = schema
|
30
|
+
@queue = SizedQueue.new(1)
|
31
|
+
@converters = converters
|
32
|
+
@num_input_rows = 0
|
33
|
+
@num_output_rows = 0
|
34
|
+
@num_rejected_rows = 0
|
35
|
+
end
|
36
|
+
|
37
|
+
def enqueue(page)
|
38
|
+
if @thread.status.nil? # thread died by an error
|
39
|
+
@thread.join # raise the same error raised inside thread
|
40
|
+
end
|
41
|
+
if @thread.alive?
|
42
|
+
Embulk.logger.trace { "embulk-output-vertica: enqueued" }
|
43
|
+
@queue.push(page)
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
def run
|
48
|
+
Embulk.logger.debug { "embulk-output-vertica: thread started" }
|
49
|
+
Vertica.connect(@task) do |jv|
|
50
|
+
json = nil # for log
|
51
|
+
begin
|
52
|
+
num_output_rows, rejects = copy(jv, copy_sql) do |stdin|
|
53
|
+
while page = @queue.pop
|
54
|
+
if page == 'finish'
|
55
|
+
Embulk.logger.debug { "embulk-output-vertica: thread finished" }
|
56
|
+
break
|
57
|
+
end
|
58
|
+
Embulk.logger.trace { "embulk-output-vertica: dequeued" }
|
59
|
+
|
60
|
+
page.each do |record|
|
61
|
+
json = to_json(record)
|
62
|
+
Embulk.logger.trace { "embulk-output-vertica: to_json #{json}" }
|
63
|
+
stdin << json << "\n"
|
64
|
+
@num_input_rows += 1
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
68
|
+
num_rejected_rows = rejects.size
|
69
|
+
@num_output_rows += num_output_rows
|
70
|
+
@num_rejected_rows += num_rejected_rows
|
71
|
+
jv.commit
|
72
|
+
Embulk.logger.info { "embulk-output-vertica: COMMIT!" }
|
73
|
+
rescue java.sql.SQLDataException => e
|
74
|
+
jv.rollback
|
75
|
+
if @task['reject_on_materialized_type_error'] and e.message =~ /Rejected by user-defined parser/
|
76
|
+
Embulk.logger.warn "embulk-output-vertica: ROLLBACK! some of column types and values types do not fit #{json}"
|
77
|
+
else
|
78
|
+
Embulk.logger.warn "embulk-output-vertica: ROLLBACK!"
|
79
|
+
end
|
80
|
+
raise e # die transaction
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def start
|
86
|
+
@thread = Thread.new(&method(:run))
|
87
|
+
end
|
88
|
+
|
89
|
+
def commit
|
90
|
+
@queue.push('finish') if @thread.alive?
|
91
|
+
Thread.pass
|
92
|
+
@thread.join # the same error with run would be raised at here
|
93
|
+
|
94
|
+
task_report = {
|
95
|
+
'num_input_rows' => @num_input_rows,
|
96
|
+
'num_output_rows' => @num_output_rows,
|
97
|
+
'num_rejected_rows' => @num_rejected_rows,
|
98
|
+
}
|
99
|
+
end
|
100
|
+
|
101
|
+
# private
|
102
|
+
|
103
|
+
def copy(conn, sql, &block)
|
104
|
+
Embulk.logger.debug "embulk-output-vertica: #{sql}"
|
105
|
+
results, rejects = conn.copy(sql, &block)
|
106
|
+
end
|
107
|
+
|
108
|
+
def copy_sql
|
109
|
+
@copy_sql ||= "COPY #{quoted_schema}.#{quoted_temp_table} FROM STDIN#{fjsonparser}#{copy_mode}#{abort_on_error} NO COMMIT"
|
110
|
+
end
|
111
|
+
|
112
|
+
def to_json(record)
|
113
|
+
Hash[*(@schema.names.zip(record).map do |column_name, value|
|
114
|
+
[column_name, @converters[column_name].call(value)]
|
115
|
+
end.flatten!(1))].to_json
|
116
|
+
end
|
117
|
+
|
118
|
+
def quoted_schema
|
119
|
+
::Jvertica.quote_identifier(@task['schema'])
|
120
|
+
end
|
121
|
+
|
122
|
+
def quoted_table
|
123
|
+
::Jvertica.quote_identifier(@task['table'])
|
124
|
+
end
|
125
|
+
|
126
|
+
def quoted_temp_table
|
127
|
+
::Jvertica.quote_identifier(@task['temp_table'])
|
128
|
+
end
|
129
|
+
|
130
|
+
def copy_mode
|
131
|
+
" #{@task['copy_mode']}"
|
132
|
+
end
|
133
|
+
|
134
|
+
def abort_on_error
|
135
|
+
@task['abort_on_error'] ? ' ABORT ON ERROR' : ''
|
136
|
+
end
|
137
|
+
|
138
|
+
def fjsonparser
|
139
|
+
" PARSER fjsonparser(#{reject_on_materialized_type_error})"
|
140
|
+
end
|
141
|
+
|
142
|
+
def reject_on_materialized_type_error
|
143
|
+
@task['reject_on_materialized_type_error'] ? 'reject_on_materialized_type_error=true' : ''
|
144
|
+
end
|
145
|
+
end
|
146
|
+
end
|
147
|
+
end
|
148
|
+
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
require 'jvertica'
|
2
|
-
require 'connection_pool'
|
3
2
|
require_relative 'vertica/value_converter_factory'
|
3
|
+
require_relative 'vertica/output_thread'
|
4
4
|
|
5
5
|
module Embulk
|
6
6
|
module Output
|
@@ -10,8 +10,8 @@ module Embulk
|
|
10
10
|
class Error < StandardError; end
|
11
11
|
class NotSupportedType < Error; end
|
12
12
|
|
13
|
-
def self.
|
14
|
-
@
|
13
|
+
def self.thread_pool
|
14
|
+
@thread_pool ||= @thread_pool_proc.call
|
15
15
|
end
|
16
16
|
|
17
17
|
def self.transaction(config, schema, processor_count, &control)
|
@@ -27,24 +27,14 @@ module Embulk
|
|
27
27
|
'mode' => config.param('mode', :string, :default => 'insert'),
|
28
28
|
'copy_mode' => config.param('copy_mode', :string, :default => 'AUTO'),
|
29
29
|
'abort_on_error' => config.param('abort_on_error', :bool, :default => false),
|
30
|
-
'default_timezone' => config.param('default_timezone', :string,
|
30
|
+
'default_timezone' => config.param('default_timezone', :string, :default => 'UTC'),
|
31
31
|
'column_options' => config.param('column_options', :hash, :default => {}),
|
32
32
|
'reject_on_materialized_type_error' => config.param('reject_on_materialized_type_error', :bool, :default => false),
|
33
33
|
'pool' => config.param('pool', :integer, :default => processor_count),
|
34
|
-
'pool_timeout' => config.param('pool_timeout', :integer, :default => 600),
|
35
34
|
}
|
36
|
-
task['user'] ||= task['username']
|
37
35
|
|
38
|
-
@
|
39
|
-
|
40
|
-
::Jvertica.connect({
|
41
|
-
host: task['host'],
|
42
|
-
port: task['port'],
|
43
|
-
user: task['user'],
|
44
|
-
password: task['password'],
|
45
|
-
database: task['database'],
|
46
|
-
})
|
47
|
-
end
|
36
|
+
@thread_pool_proc = Proc.new do
|
37
|
+
OutputThreadPool.new(task, schema, task['pool'])
|
48
38
|
end
|
49
39
|
|
50
40
|
task['user'] ||= task['username']
|
@@ -73,7 +63,7 @@ module Embulk
|
|
73
63
|
sql_schema_table = self.sql_schema_from_embulk_schema(schema, task['column_options'])
|
74
64
|
|
75
65
|
# create the target table
|
76
|
-
|
66
|
+
connect(task) do |jv|
|
77
67
|
query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_table}]) if task['mode'] == 'REPLACE'
|
78
68
|
query(jv, %[CREATE TABLE IF NOT EXISTS #{quoted_schema}.#{quoted_table} (#{sql_schema_table})])
|
79
69
|
end
|
@@ -81,7 +71,7 @@ module Embulk
|
|
81
71
|
sql_schema_temp_table = self.sql_schema_from_table(task)
|
82
72
|
|
83
73
|
# create a temp table
|
84
|
-
|
74
|
+
connect(task) do |jv|
|
85
75
|
query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_temp_table}])
|
86
76
|
query(jv, %[CREATE TABLE #{quoted_schema}.#{quoted_temp_table} (#{sql_schema_temp_table})])
|
87
77
|
# Create the internal vertica projection beforehand; otherwise, parallel copies lock the table to create a projection and we sometimes get an S Lock error
|
@@ -96,30 +86,21 @@ module Embulk
|
|
96
86
|
|
97
87
|
begin
|
98
88
|
# insert data into the temp table
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
Embulk.logger.info { "embulk-output-vertica: COMMIT!" }
|
103
|
-
jv.close rescue nil
|
104
|
-
end
|
105
|
-
@connection_pool = nil
|
89
|
+
thread_pool.start
|
90
|
+
yield(task)
|
91
|
+
task_reports = thread_pool.commit
|
106
92
|
Embulk.logger.info { "embulk-output-vertica: task_reports: #{task_reports.to_json}" }
|
107
93
|
|
108
94
|
# insert select from the temp table
|
109
|
-
|
95
|
+
connect(task) do |jv|
|
110
96
|
query(jv, %[INSERT INTO #{quoted_schema}.#{quoted_table} SELECT * FROM #{quoted_schema}.#{quoted_temp_table}])
|
111
97
|
jv.commit
|
112
98
|
end
|
113
99
|
ensure
|
114
|
-
|
100
|
+
connect(task) do |jv|
|
115
101
|
# clean up the temp table
|
116
|
-
Embulk.logger.debug { "embulk-output-vertica: select count #{query(jv, %[SELECT count(*) FROM #{quoted_schema}.#{quoted_temp_table}]).map {|row| row.to_h }.join("\n") rescue nil}" }
|
117
|
-
Embulk.logger.trace { "embulk-output-vertica: select limit 10\n#{query(jv, %[SELECT * FROM #{quoted_schema}.#{quoted_temp_table} LIMIT 10]).map {|row| row.to_h }.join("\n") rescue nil}" }
|
118
102
|
query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_temp_table}])
|
119
|
-
|
120
|
-
|
121
|
-
connection_pool.shutdown do |jv|
|
122
|
-
jv.close rescue nil
|
103
|
+
Embulk.logger.debug { "embulk-output-vertica: select result\n#{query(jv, %[SELECT * FROM #{quoted_schema}.#{quoted_table} LIMIT 10]).map {|row| row.to_h }.join("\n") rescue nil}" }
|
123
104
|
end
|
124
105
|
end
|
125
106
|
# this is for the -o next_config option; add some parameters for the next execution if wanted
|
@@ -130,46 +111,15 @@ module Embulk
|
|
130
111
|
# instance is created on each thread
|
131
112
|
def initialize(task, schema, index)
|
132
113
|
super
|
133
|
-
@converters = ValueConverterFactory.create_converters(schema, task['default_timezone'], task['column_options'])
|
134
|
-
Embulk.logger.trace { @converters.to_s }
|
135
|
-
@num_input_rows = 0
|
136
|
-
@num_output_rows = 0
|
137
|
-
@num_rejected_rows = 0
|
138
|
-
end
|
139
|
-
|
140
|
-
def connection_pool
|
141
|
-
self.class.connection_pool
|
142
114
|
end
|
143
115
|
|
116
|
+
# called for each page in each thread
|
144
117
|
def close
|
145
|
-
# do not close connection_pool on each thread / page
|
146
118
|
end
|
147
119
|
|
120
|
+
# called for each page in each thread
|
148
121
|
def add(page)
|
149
|
-
|
150
|
-
json = nil # for log
|
151
|
-
begin
|
152
|
-
num_output_rows, rejects = copy(jv, copy_sql) do |stdin|
|
153
|
-
page.each do |record|
|
154
|
-
json = to_json(record)
|
155
|
-
Embulk.logger.debug { "embulk-output-vertica: to_json #{json}" }
|
156
|
-
stdin << json << "\n"
|
157
|
-
@num_input_rows += 1
|
158
|
-
end
|
159
|
-
end
|
160
|
-
num_rejected_rows = rejects.size
|
161
|
-
@num_output_rows += num_output_rows
|
162
|
-
@num_rejected_rows += num_rejected_rows
|
163
|
-
rescue java.sql.SQLDataException => e
|
164
|
-
jv.rollback
|
165
|
-
if @task['reject_on_materialized_type_error'] and e.message =~ /Rejected by user-defined parser/
|
166
|
-
Embulk.logger.warn "embulk-output-vertica: ROLLBACK! some of column types and values types do not fit #{json}"
|
167
|
-
else
|
168
|
-
Embulk.logger.warn "embulk-output-vertica: ROLLBACK!"
|
169
|
-
end
|
170
|
-
raise e # die transaction
|
171
|
-
end
|
172
|
-
end
|
122
|
+
self.class.thread_pool.enqueue(page)
|
173
123
|
end
|
174
124
|
|
175
125
|
def finish
|
@@ -178,19 +128,33 @@ module Embulk
|
|
178
128
|
def abort
|
179
129
|
end
|
180
130
|
|
181
|
-
#
|
182
|
-
# we do commit on #transaction for all
|
131
|
+
# called after processing all pages in each thread
|
132
|
+
# we do commit on #transaction for all pools, not here
|
183
133
|
def commit
|
184
|
-
|
185
|
-
task_report = {
|
186
|
-
"num_input_rows" => @num_input_rows,
|
187
|
-
"num_output_rows" => @num_output_rows,
|
188
|
-
"num_rejected_rows" => @num_rejected_rows,
|
189
|
-
}
|
134
|
+
{}
|
190
135
|
end
|
191
136
|
|
192
137
|
private
|
193
138
|
|
139
|
+
def self.connect(task)
|
140
|
+
jv = ::Jvertica.connect({
|
141
|
+
host: task['host'],
|
142
|
+
port: task['port'],
|
143
|
+
user: task['user'],
|
144
|
+
password: task['password'],
|
145
|
+
database: task['database'],
|
146
|
+
})
|
147
|
+
|
148
|
+
if block_given?
|
149
|
+
begin
|
150
|
+
yield jv
|
151
|
+
ensure
|
152
|
+
jv.close
|
153
|
+
end
|
154
|
+
end
|
155
|
+
jv
|
156
|
+
end
|
157
|
+
|
194
158
|
# @param [Schema] schema embulk defined column types
|
195
159
|
# @param [Hash] column_options user defined column types
|
196
160
|
# @return [String] sql schema used to CREATE TABLE
|
@@ -224,7 +188,7 @@ module Embulk
|
|
224
188
|
"WHERE table_schema = #{quoted_schema} AND table_name = #{quoted_table}"
|
225
189
|
|
226
190
|
sql_schema = {}
|
227
|
-
|
191
|
+
connect(task) do |jv|
|
228
192
|
result = query(jv, sql)
|
229
193
|
sql_schema = result.map {|row| [row[0], row[1]] }
|
230
194
|
end
|
@@ -239,49 +203,6 @@ module Embulk
|
|
239
203
|
def query(conn, sql)
|
240
204
|
self.class.query(conn, sql)
|
241
205
|
end
|
242
|
-
|
243
|
-
def copy(conn, sql, &block)
|
244
|
-
Embulk.logger.debug "embulk-output-vertica: #{sql}"
|
245
|
-
results, rejects = conn.copy(sql, &block)
|
246
|
-
end
|
247
|
-
|
248
|
-
def copy_sql
|
249
|
-
@copy_sql ||= "COPY #{quoted_schema}.#{quoted_temp_table} FROM STDIN#{fjsonparser}#{copy_mode}#{abort_on_error} NO COMMIT"
|
250
|
-
end
|
251
|
-
|
252
|
-
def to_json(record)
|
253
|
-
Hash[*(schema.names.zip(record).map do |column_name, value|
|
254
|
-
[column_name, @converters[column_name].call(value)]
|
255
|
-
end.flatten!(1))].to_json
|
256
|
-
end
|
257
|
-
|
258
|
-
def quoted_schema
|
259
|
-
::Jvertica.quote_identifier(@task['schema'])
|
260
|
-
end
|
261
|
-
|
262
|
-
def quoted_table
|
263
|
-
::Jvertica.quote_identifier(@task['table'])
|
264
|
-
end
|
265
|
-
|
266
|
-
def quoted_temp_table
|
267
|
-
::Jvertica.quote_identifier(@task['temp_table'])
|
268
|
-
end
|
269
|
-
|
270
|
-
def copy_mode
|
271
|
-
" #{@task['copy_mode']}"
|
272
|
-
end
|
273
|
-
|
274
|
-
def abort_on_error
|
275
|
-
@task['abort_on_error'] ? ' ABORT ON ERROR' : ''
|
276
|
-
end
|
277
|
-
|
278
|
-
def fjsonparser
|
279
|
-
" PARSER fjsonparser(#{reject_on_materialized_type_error})"
|
280
|
-
end
|
281
|
-
|
282
|
-
def reject_on_materialized_type_error
|
283
|
-
@task['reject_on_materialized_type_error'] ? 'reject_on_materialized_type_error=true' : ''
|
284
|
-
end
|
285
206
|
end
|
286
207
|
end
|
287
208
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-vertica
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.5.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- eiji.sekiya
|
@@ -39,20 +39,6 @@ dependencies:
|
|
39
39
|
version: '0'
|
40
40
|
prerelease: false
|
41
41
|
type: :runtime
|
42
|
-
- !ruby/object:Gem::Dependency
|
43
|
-
name: connection_pool
|
44
|
-
version_requirements: !ruby/object:Gem::Requirement
|
45
|
-
requirements:
|
46
|
-
- - ">="
|
47
|
-
- !ruby/object:Gem::Version
|
48
|
-
version: '0'
|
49
|
-
requirement: !ruby/object:Gem::Requirement
|
50
|
-
requirements:
|
51
|
-
- - ">="
|
52
|
-
- !ruby/object:Gem::Version
|
53
|
-
version: '0'
|
54
|
-
prerelease: false
|
55
|
-
type: :runtime
|
56
42
|
- !ruby/object:Gem::Dependency
|
57
43
|
name: bundler
|
58
44
|
version_requirements: !ruby/object:Gem::Requirement
|
@@ -99,6 +85,7 @@ files:
|
|
99
85
|
- example.csv
|
100
86
|
- example.yml
|
101
87
|
- lib/embulk/output/vertica.rb
|
88
|
+
- lib/embulk/output/vertica/output_thread.rb
|
102
89
|
- lib/embulk/output/vertica/value_converter_factory.rb
|
103
90
|
homepage: https://github.com/eratostennis/embulk-output-vertica
|
104
91
|
licenses:
|