embulk-output-vertica 0.3.1 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +2 -0
- data/embulk-output-vertica.gemspec +2 -1
- data/lib/embulk/output/vertica.rb +80 -62
- metadata +16 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: b178cb9ffe687570117d6aa8edfb55e7c77c446d
|
4
|
+
data.tar.gz: 72195fdd08f4929162602381bcd79432545067c0
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3f132dba10f45571d856cf33478b80f73c154413dd78268a43510461f24f223b04def25cada97e204b3053c6d2ebd09054a6d08816bfe4d26cf102b908ff5bc4
|
7
|
+
data.tar.gz: 10fcde36ab57437e724b4e0eb63586fcb218b300d799875777b15ba4769e9e3402dc85cc3b77f882897fdc73515e258b2ce3e57261d79eb98d921b8bab226b7f
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -18,6 +18,8 @@
|
|
18
18
|
- **table**: table name (string, required)
|
19
19
|
- **mode**: "insert", or "replace". See below. (string, default: insert)
|
20
20
|
- **copy_mode**: specifies how data is loaded into the database. See vertica documents for details. (`AUTO`, `DIRECT`, or `TRICKLE`. default: `AUTO`)
|
21
|
+
- **pool**: size of the connection pool; this controls how many COPY statements can be issued concurrently (integer, default: processor_count, that is, the number of threads in the input plugin)
|
22
|
+
- **pool_timeout**: timeout for checking out a connection from the connection pool (seconds, default: 600)
|
21
23
|
- **abort_on_error**: stops the COPY command if a row is rejected and rolls back the command. No data is loaded. (bool, default: false)
|
22
24
|
- **reject_on_materialized_type_error**: uses the `reject_on_materialized_type_error` option for fjsonparser(). This rejects rows whose value types do not fit the column types, e.g., loading a double value into an INT column fails. See vertica documents for details. (bool, default: false)
|
23
25
|
- **default_timezone**: the default timezone for column_options (string, default is "UTC")
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Gem::Specification.new do |spec|
|
2
2
|
spec.name = "embulk-output-vertica"
|
3
|
-
spec.version = "0.3.1"
|
3
|
+
spec.version = "0.4.0"
|
4
4
|
spec.authors = ["eiji.sekiya", "Naotoshi Seo"]
|
5
5
|
spec.email = ["eiji.sekiya.0326@gmail.com", "sonots@gmail.com"]
|
6
6
|
spec.summary = "Vertica output plugin for Embulk"
|
@@ -15,6 +15,7 @@ Gem::Specification.new do |spec|
|
|
15
15
|
|
16
16
|
spec.add_dependency "jvertica", "~> 0.2"
|
17
17
|
spec.add_dependency "tzinfo"
|
18
|
+
spec.add_dependency "connection_pool"
|
18
19
|
spec.add_development_dependency "bundler", "~> 1.7"
|
19
20
|
spec.add_development_dependency "rake", "~> 10.0"
|
20
21
|
end
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'jvertica'
|
2
|
+
require 'connection_pool'
|
2
3
|
require_relative 'vertica/value_converter_factory'
|
3
4
|
|
4
5
|
module Embulk
|
@@ -9,23 +10,42 @@ module Embulk
|
|
9
10
|
class Error < StandardError; end
|
10
11
|
class NotSupportedType < Error; end
|
11
12
|
|
13
|
+
def self.connection_pool
|
14
|
+
@connection_pool ||= @connection_pool_proc.call
|
15
|
+
end
|
16
|
+
|
12
17
|
def self.transaction(config, schema, processor_count, &control)
|
13
18
|
task = {
|
14
|
-
'host' => config.param('host',
|
15
|
-
'port' => config.param('port',
|
16
|
-
'user' => config.param('user',
|
17
|
-
'username' => config.param('username',
|
18
|
-
'password' => config.param('password',
|
19
|
-
'database' => config.param('database',
|
20
|
-
'schema' => config.param('schema',
|
21
|
-
'table' => config.param('table',
|
22
|
-
'mode' => config.param('mode',
|
23
|
-
'copy_mode' => config.param('copy_mode',
|
24
|
-
'abort_on_error' => config.param('abort_on_error',
|
25
|
-
'default_timezone' => config.param('default_timezone', :string,
|
26
|
-
'column_options' => config.param('column_options',
|
19
|
+
'host' => config.param('host', :string, :default => 'localhost'),
|
20
|
+
'port' => config.param('port', :integer, :default => 5433),
|
21
|
+
'user' => config.param('user', :string, :default => nil),
|
22
|
+
'username' => config.param('username', :string, :default => nil), # alias to :user for backward compatibility
|
23
|
+
'password' => config.param('password', :string, :default => ''),
|
24
|
+
'database' => config.param('database', :string, :default => 'vdb'),
|
25
|
+
'schema' => config.param('schema', :string, :default => 'public'),
|
26
|
+
'table' => config.param('table', :string),
|
27
|
+
'mode' => config.param('mode', :string, :default => 'insert'),
|
28
|
+
'copy_mode' => config.param('copy_mode', :string, :default => 'AUTO'),
|
29
|
+
'abort_on_error' => config.param('abort_on_error', :bool, :default => false),
|
30
|
+
'default_timezone' => config.param('default_timezone', :string, :default => 'UTC'),
|
31
|
+
'column_options' => config.param('column_options', :hash, :default => {}),
|
27
32
|
'reject_on_materialized_type_error' => config.param('reject_on_materialized_type_error', :bool, :default => false),
|
33
|
+
'pool' => config.param('pool', :integer, :default => processor_count),
|
34
|
+
'pool_timeout' => config.param('pool_timeout', :integer, :default => 600),
|
28
35
|
}
|
36
|
+
task['user'] ||= task['username']
|
37
|
+
|
38
|
+
@connection_pool_proc = Proc.new do
|
39
|
+
ConnectionPool.new(size: task['pool'], timeout: task['pool_timeout']) do
|
40
|
+
::Jvertica.connect({
|
41
|
+
host: task['host'],
|
42
|
+
port: task['port'],
|
43
|
+
user: task['user'],
|
44
|
+
password: task['password'],
|
45
|
+
database: task['database'],
|
46
|
+
})
|
47
|
+
end
|
48
|
+
end
|
29
49
|
|
30
50
|
task['user'] ||= task['username']
|
31
51
|
unless task['user']
|
@@ -53,7 +73,7 @@ module Embulk
|
|
53
73
|
sql_schema_table = self.sql_schema_from_embulk_schema(schema, task['column_options'])
|
54
74
|
|
55
75
|
# create the target table
|
56
|
-
|
76
|
+
connection_pool.with do |jv|
|
57
77
|
query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_table}]) if task['mode'] == 'REPLACE'
|
58
78
|
query(jv, %[CREATE TABLE IF NOT EXISTS #{quoted_schema}.#{quoted_table} (#{sql_schema_table})])
|
59
79
|
end
|
@@ -61,7 +81,7 @@ module Embulk
|
|
61
81
|
sql_schema_temp_table = self.sql_schema_from_table(task)
|
62
82
|
|
63
83
|
# create a temp table
|
64
|
-
|
84
|
+
connection_pool.with do |jv|
|
65
85
|
query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_temp_table}])
|
66
86
|
query(jv, %[CREATE TABLE #{quoted_schema}.#{quoted_temp_table} (#{sql_schema_temp_table})])
|
67
87
|
end
|
@@ -69,18 +89,29 @@ module Embulk
|
|
69
89
|
begin
|
70
90
|
# insert data into the temp table
|
71
91
|
task_reports = yield(task) # obtain an array of task_reports where one report is of a task
|
92
|
+
connection_pool.shutdown do |jv| # just don't know how to loop all connections
|
93
|
+
jv.commit
|
94
|
+
Embulk.logger.info { "embulk-output-vertica: COMMIT!" }
|
95
|
+
jv.close rescue nil
|
96
|
+
end
|
97
|
+
@connection_pool = nil
|
72
98
|
Embulk.logger.info { "embulk-output-vertica: task_reports: #{task_reports.to_json}" }
|
73
99
|
|
74
100
|
# insert select from the temp table
|
75
|
-
|
101
|
+
connection_pool.with do |jv|
|
76
102
|
query(jv, %[INSERT INTO #{quoted_schema}.#{quoted_table} SELECT * FROM #{quoted_schema}.#{quoted_temp_table}])
|
77
103
|
jv.commit
|
78
104
|
end
|
79
105
|
ensure
|
80
|
-
|
106
|
+
connection_pool.with do |jv|
|
81
107
|
# clean up the temp table
|
108
|
+
Embulk.logger.debug { "embulk-output-vertica: select count #{query(jv, %[SELECT count(*) FROM #{quoted_schema}.#{quoted_temp_table}]).map {|row| row.to_h }.join("\n") rescue nil}" }
|
109
|
+
Embulk.logger.trace { "embulk-output-vertica: select limit 10\n#{query(jv, %[SELECT * FROM #{quoted_schema}.#{quoted_temp_table} LIMIT 10]).map {|row| row.to_h }.join("\n") rescue nil}" }
|
82
110
|
query(jv, %[DROP TABLE IF EXISTS #{quoted_schema}.#{quoted_temp_table}])
|
83
|
-
|
111
|
+
end
|
112
|
+
|
113
|
+
connection_pool.shutdown do |jv|
|
114
|
+
jv.close rescue nil
|
84
115
|
end
|
85
116
|
end
|
86
117
|
# this is for the -o next_config option; add some parameters for the next execution if needed
|
@@ -91,39 +122,44 @@ module Embulk
|
|
91
122
|
def initialize(task, schema, index)
|
92
123
|
super
|
93
124
|
@converters = ValueConverterFactory.create_converters(schema, task['default_timezone'], task['column_options'])
|
94
|
-
Embulk.logger.
|
95
|
-
@jv = self.class.connect(task)
|
125
|
+
Embulk.logger.trace { @converters.to_s }
|
96
126
|
@num_input_rows = 0
|
97
127
|
@num_output_rows = 0
|
98
128
|
@num_rejected_rows = 0
|
99
129
|
end
|
100
130
|
|
131
|
+
def connection_pool
|
132
|
+
self.class.connection_pool
|
133
|
+
end
|
134
|
+
|
101
135
|
def close
|
102
|
-
|
136
|
+
# do not close connection_pool on each thread / page
|
103
137
|
end
|
104
138
|
|
105
139
|
def add(page)
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
140
|
+
connection_pool.with do |jv| # block if no available connection left
|
141
|
+
json = nil # for log
|
142
|
+
begin
|
143
|
+
num_output_rows, rejects = copy(jv, copy_sql) do |stdin|
|
144
|
+
page.each do |record|
|
145
|
+
json = to_json(record)
|
146
|
+
Embulk.logger.debug { "embulk-output-vertica: to_json #{json}" }
|
147
|
+
stdin << json << "\n"
|
148
|
+
@num_input_rows += 1
|
149
|
+
end
|
114
150
|
end
|
151
|
+
num_rejected_rows = rejects.size
|
152
|
+
@num_output_rows += num_output_rows
|
153
|
+
@num_rejected_rows += num_rejected_rows
|
154
|
+
rescue java.sql.SQLDataException => e
|
155
|
+
jv.rollback
|
156
|
+
if @task['reject_on_materialized_type_error'] and e.message =~ /Rejected by user-defined parser/
|
157
|
+
Embulk.logger.warn "embulk-output-vertica: ROLLBACK! some of column types and values types do not fit #{json}"
|
158
|
+
else
|
159
|
+
Embulk.logger.warn "embulk-output-vertica: ROLLBACK!"
|
160
|
+
end
|
161
|
+
raise e # die transaction
|
115
162
|
end
|
116
|
-
num_rejected_rows = rejects.size
|
117
|
-
@num_output_rows += num_output_rows
|
118
|
-
@num_rejected_rows += num_rejected_rows
|
119
|
-
rescue java.sql.SQLDataException => e
|
120
|
-
@jv.rollback
|
121
|
-
if @task['reject_on_materialized_type_error'] and e.message =~ /Rejected by user-defined parser/
|
122
|
-
Embulk.logger.warn "embulk-output-vertica: ROLLBACK! some of column types and values types do not fit #{json}"
|
123
|
-
else
|
124
|
-
Embulk.logger.warn "embulk-output-vertica: ROLLBACK!"
|
125
|
-
end
|
126
|
-
raise e # die transaction
|
127
163
|
end
|
128
164
|
end
|
129
165
|
|
@@ -133,9 +169,10 @@ module Embulk
|
|
133
169
|
def abort
|
134
170
|
end
|
135
171
|
|
172
|
+
# this is called after processing all pages in a thread
|
173
|
+
# the commit is done in #transaction for all pooled connections, not here
|
136
174
|
def commit
|
137
|
-
@
|
138
|
-
Embulk.logger.debug { "embulk-output-vertica: COMMIT! #{@num_output_rows} rows" }
|
175
|
+
Embulk.logger.debug { "embulk-output-vertica: #{@num_output_rows} rows" }
|
139
176
|
task_report = {
|
140
177
|
"num_input_rows" => @num_input_rows,
|
141
178
|
"num_output_rows" => @num_output_rows,
|
@@ -145,25 +182,6 @@ module Embulk
|
|
145
182
|
|
146
183
|
private
|
147
184
|
|
148
|
-
def self.connect(task)
|
149
|
-
jv = ::Jvertica.connect({
|
150
|
-
host: task['host'],
|
151
|
-
port: task['port'],
|
152
|
-
user: task['user'],
|
153
|
-
password: task['password'],
|
154
|
-
database: task['database'],
|
155
|
-
})
|
156
|
-
|
157
|
-
if block_given?
|
158
|
-
begin
|
159
|
-
yield jv
|
160
|
-
ensure
|
161
|
-
jv.close
|
162
|
-
end
|
163
|
-
end
|
164
|
-
jv
|
165
|
-
end
|
166
|
-
|
167
185
|
# @param [Schema] schema embulk defined column types
|
168
186
|
# @param [Hash] column_options user defined column types
|
169
187
|
# @return [String] sql schema used to CREATE TABLE
|
@@ -197,7 +215,7 @@ module Embulk
|
|
197
215
|
"WHERE table_schema = #{quoted_schema} AND table_name = #{quoted_table}"
|
198
216
|
|
199
217
|
sql_schema = {}
|
200
|
-
|
218
|
+
connection_pool.with do |jv|
|
201
219
|
result = query(jv, sql)
|
202
220
|
sql_schema = result.map {|row| [row[0], row[1]] }
|
203
221
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-vertica
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- eiji.sekiya
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2015-11-
|
12
|
+
date: 2015-11-24 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: jvertica
|
@@ -39,6 +39,20 @@ dependencies:
|
|
39
39
|
version: '0'
|
40
40
|
prerelease: false
|
41
41
|
type: :runtime
|
42
|
+
- !ruby/object:Gem::Dependency
|
43
|
+
name: connection_pool
|
44
|
+
version_requirements: !ruby/object:Gem::Requirement
|
45
|
+
requirements:
|
46
|
+
- - ">="
|
47
|
+
- !ruby/object:Gem::Version
|
48
|
+
version: '0'
|
49
|
+
requirement: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
prerelease: false
|
55
|
+
type: :runtime
|
42
56
|
- !ruby/object:Gem::Dependency
|
43
57
|
name: bundler
|
44
58
|
version_requirements: !ruby/object:Gem::Requirement
|