embulk-output-bigquery 0.2.3 → 0.3.0.pre1

Files changed (63)
  1. checksums.yaml +4 -4
  2. data/.gitignore +6 -12
  3. data/CHANGELOG.md +18 -0
  4. data/Gemfile +8 -0
  5. data/LICENSE.txt +20 -0
  6. data/README.md +165 -39
  7. data/Rakefile +11 -0
  8. data/embulk-output-bigquery.gemspec +20 -0
  9. data/example/config_client_options.yml +33 -0
  10. data/example/config_csv.yml +30 -0
  11. data/example/config_delete_in_advance.yml +29 -0
  12. data/example/config_expose_errors.yml +30 -0
  13. data/example/config_guess_from_embulk_schema.yml +29 -0
  14. data/example/config_guess_with_column_options.yml +40 -0
  15. data/example/config_gzip.yml +30 -0
  16. data/example/config_jsonl.yml +30 -0
  17. data/example/config_mode_append.yml +30 -0
  18. data/example/config_mode_append_direct.yml +30 -0
  19. data/example/config_payload_column.yml +20 -0
  20. data/example/config_payload_column_index.yml +20 -0
  21. data/example/config_prevent_duplicate_insert.yml +30 -0
  22. data/example/config_replace.yml +30 -0
  23. data/example/config_replace_backup.yml +32 -0
  24. data/example/config_skip_file_generation.yml +32 -0
  25. data/example/config_table_strftime.yml +30 -0
  26. data/example/config_template_table.yml +21 -0
  27. data/example/config_uncompressed.yml +30 -0
  28. data/example/config_with_rehearsal.yml +32 -0
  29. data/example/example.csv +17 -0
  30. data/example/example.jsonl +16 -0
  31. data/example/example.yml +30 -0
  32. data/example/json_key.json +12 -0
  33. data/example/nested_example.jsonl +16 -0
  34. data/example/schema.json +30 -0
  35. data/example/schema_expose_errors.json +30 -0
  36. data/lib/embulk/output/bigquery.rb +388 -3
  37. data/lib/embulk/output/bigquery/bigquery_client.rb +396 -0
  38. data/lib/embulk/output/bigquery/file_writer.rb +103 -0
  39. data/lib/embulk/output/bigquery/helper.rb +78 -0
  40. data/lib/embulk/output/bigquery/value_converter_factory.rb +292 -0
  41. data/test/helper.rb +13 -0
  42. data/test/test_bigquery_client.rb +166 -0
  43. data/test/test_configure.rb +254 -0
  44. data/test/test_example.rb +34 -0
  45. data/test/test_file_writer.rb +129 -0
  46. data/test/test_helper.rb +103 -0
  47. data/test/test_transaction.rb +129 -0
  48. data/test/test_value_converter_factory.rb +316 -0
  49. metadata +114 -45
  50. data/build.gradle +0 -80
  51. data/config/checkstyle/checkstyle.xml +0 -128
  52. data/config/checkstyle/default.xml +0 -108
  53. data/gradle/wrapper/gradle-wrapper.jar +0 -0
  54. data/gradle/wrapper/gradle-wrapper.properties +0 -6
  55. data/gradlew +0 -164
  56. data/gradlew.bat +0 -90
  57. data/settings.gradle +0 -2
  58. data/src/main/java/org/embulk/output/BigqueryAuthentication.java +0 -117
  59. data/src/main/java/org/embulk/output/BigqueryOutputPlugin.java +0 -508
  60. data/src/main/java/org/embulk/output/BigqueryWriter.java +0 -575
  61. data/src/test/java/org/embulk/output/TestBigqueryAuthentication.java +0 -5
  62. data/src/test/java/org/embulk/output/TestBigqueryOutputPlugin.java +0 -5
  63. data/src/test/java/org/embulk/output/TestBigqueryWriter.java +0 -5
data/lib/embulk/output/bigquery/helper.rb ADDED
@@ -0,0 +1,78 @@
+ require 'digest/md5'
+
+ module Embulk
+   module Output
+     class Bigquery < OutputPlugin
+       class Helper
+         def self.bq_type_from_embulk_type(embulk_type)
+           case embulk_type
+           when :boolean then 'BOOLEAN'
+           when :long then 'INTEGER'
+           when :double then 'FLOAT'
+           when :string then 'STRING'
+           when :timestamp then 'TIMESTAMP'
+           when :json then 'STRING' # NOTE: Default is not RECORD since it requires `fields`
+           else raise ArgumentError, "embulk type #{embulk_type} is not supported"
+           end
+         end
+
+         # @return [Hash] name => column_option.
+         # ToDo: recursively map fields?
+         def self.column_options_map(column_options)
+           (column_options || {}).map do |column_option|
+             [column_option['name'], column_option]
+           end.to_h
+         end
+
+         def self.fields_from_embulk_schema(task, schema)
+           column_options_map = self.column_options_map(task['column_options'])
+           schema.map do |column|
+             column_name = column[:name]
+             embulk_type = column[:type]
+             column_option = column_options_map[column_name] || {}
+             {}.tap do |field|
+               field[:name] = column_name
+               field[:type] = (column_option['type'] || bq_type_from_embulk_type(embulk_type)).upcase
+               field[:mode] = column_option['mode'] if column_option['mode']
+               field[:fields] = deep_symbolize_keys(column_option['fields']) if column_option['fields']
+             end
+           end
+         end
+
+         def self.deep_symbolize_keys(obj)
+           if obj.is_a?(Hash)
+             obj.inject({}) do |options, (key, value)|
+               options[(key.to_sym rescue key) || key] = deep_symbolize_keys(value)
+               options
+             end
+           elsif obj.is_a?(Array)
+             obj.map {|value| deep_symbolize_keys(value) }
+           else
+             obj
+           end
+         end
+
+         def self.create_job_id(task, path, table, fields)
+           elements = [
+             Digest::MD5.file(path).hexdigest,
+             task['dataset'],
+             table,
+             fields,
+             task['source_format'],
+             task['max_bad_records'],
+             task['field_delimiter'],
+             task['encoding'],
+             task['ignore_unknown_values'],
+             task['allow_quoted_newlines'],
+           ]
+
+           str = elements.map(&:to_s).join('')
+           md5 = Digest::MD5.hexdigest(str)
+           job_id = "embulk_job_#{md5}"
+           Embulk.logger.debug { "embulk-output-bigquery: create_job_id(#{path}, #{table}) #=> #{job_id}" }
+           job_id
+         end
+       end
+     end
+   end
+ end
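
Helper.fields_from_embulk_schema is what turns an Embulk schema plus column_options into the field list handed to BigQuery's load job. Below is a minimal sketch of that mapping, not part of the diff, assuming the gem and the Embulk runtime are installed and loadable; the hash-based schema is a stand-in for Embulk's Schema, whose columns also respond to [:name] and [:type].

  # Hedged sketch: assumes `Embulk.setup` works outside a real plugin run.
  require 'embulk'
  Embulk.setup
  require 'embulk/output/bigquery/helper'

  task = {
    'column_options' => [
      # override the :json default (STRING) with a nested RECORD
      {'name' => 'payload', 'type' => 'RECORD',
       'fields' => [{'name' => 'key', 'type' => 'STRING'}]},
    ],
  }
  schema = [
    {name: 'id',      type: :long},   # stand-ins for Embulk::Column
    {name: 'payload', type: :json},
  ]

  p Embulk::Output::Bigquery::Helper.fields_from_embulk_schema(task, schema)
  # => [{:name=>"id", :type=>"INTEGER"},
  #     {:name=>"payload", :type=>"RECORD",
  #      :fields=>[{:name=>"key", :type=>"STRING"}]}]

Note how a column_option wins over the default type mapping, and how RECORD fields pass through deep_symbolize_keys so the Google API client accepts them.
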
data/lib/embulk/output/bigquery/value_converter_factory.rb ADDED
@@ -0,0 +1,292 @@
+ require 'time'
+ require 'tzinfo'
+ require 'json'
+ require_relative 'helper'
+
+ module Embulk
+   module Output
+     class Bigquery < OutputPlugin
+       class ValueConverterFactory
+         class NotSupportedType < StandardError; end
+         class TypeCastError < StandardError; end
+
+         # ref. https://cloud.google.com/bigquery/preparing-data-for-bigquery
+
+         DEFAULT_TIMESTAMP_FORMAT = "%Y-%m-%d %H:%M:%S.%6N" # BigQuery timestamp format
+         DEFAULT_TIMEZONE = "UTC"
+
+         # @param [Hash] task
+         # @option task [String] default_timestamp_format
+         # @option task [String] default_timezone
+         # @option task [Hash] column_options user defined column types
+         # @param [Schema] schema embulk defined column types
+         # @return [Array] an array whose key is column_index, and value is its converter (Proc)
+         def self.create_converters(task, schema)
+           column_options_map = Helper.column_options_map(task['column_options'])
+           default_timestamp_format = task['default_timestamp_format']
+           default_timezone = task['default_timezone']
+           schema.map do |column|
+             column_name = column[:name]
+             embulk_type = column[:type]
+             column_option = column_options_map[column_name] || {}
+             self.new(
+               embulk_type, column_option['type'],
+               timestamp_format: column_option['timestamp_format'],
+               timezone: column_option['timezone'],
+               strict: column_option['strict'],
+               default_timestamp_format: default_timestamp_format,
+               default_timezone: default_timezone,
+             ).create_converter
+           end
+         end
+
+         attr_reader :embulk_type, :type, :timestamp_format, :timezone, :zone_offset, :strict
+
+         def initialize(
+           embulk_type, type = nil,
+           timestamp_format: nil, timezone: nil, strict: nil,
+           default_timestamp_format: DEFAULT_TIMESTAMP_FORMAT,
+           default_timezone: DEFAULT_TIMEZONE
+         )
+           @embulk_type = embulk_type
+           @type = (type || Helper.bq_type_from_embulk_type(embulk_type)).upcase
+           @timestamp_format = timestamp_format
+           @default_timestamp_format = default_timestamp_format
+           @timezone = timezone || default_timezone
+           @zone_offset = get_zone_offset(@timezone) if @timezone
+           @strict = strict.nil? ? true : strict
+         end
+
+         def create_converter
+           case embulk_type
+           when :boolean then boolean_converter
+           when :long then long_converter
+           when :double then double_converter
+           when :string then string_converter
+           when :timestamp then timestamp_converter
+           when :json then json_converter
+           else raise NotSupportedType, "embulk type #{embulk_type} is not supported"
+           end
+         end
+
+         def with_typecast_error(val)
+           begin
+             yield(val)
+           rescue => e
+             raise_typecast_error(val)
+           end
+         end
+
+         def raise_typecast_error(val)
+           message = "cannot cast #{@embulk_type} `#{val}` to #{@type}"
+           if @strict
+             raise TypeCastError, message
+           else
+             Embulk.logger.trace { message }
+             return nil
+           end
+         end
+
+         def boolean_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               val
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for boolean column"
+           end
+         end
+
+         def long_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               next nil if val.nil?
+               next true if val == 1
+               next false if val == 0
+               raise_typecast_error(val)
+             }
+           when 'INTEGER'
+             Proc.new {|val|
+               val
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for long column"
+           end
+         end
+
+         def double_converter
+           case type
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_i
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               val
+             }
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_s
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for double column"
+           end
+         end
+
+         def string_converter
+           case type
+           when 'BOOLEAN'
+             Proc.new {|val|
+               next nil if val.nil?
+               next true if val == 'true'.freeze
+               next false if val == 'false'.freeze
+               raise_typecast_error(val)
+             }
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 Integer(val)
+               end
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 Float(val)
+               end
+             }
+           when 'STRING'
+             Proc.new {|val|
+               val
+             }
+           when 'TIMESTAMP'
+             if @timestamp_format
+               Proc.new {|val|
+                 next nil if val.nil?
+                 with_typecast_error(val) do |val|
+                   strptime_with_zone(val, @timestamp_format, zone_offset).to_f
+                 end
+               }
+             else
+               Proc.new {|val|
+                 next nil if val.nil?
+                 val # Users must care of BQ timestamp format
+               }
+             end
+           when 'RECORD'
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 JSON.parse(val)
+               end
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for string column"
+           end
+         end
+
+         def timestamp_converter
+           case type
+           when 'INTEGER'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_i
+             }
+           when 'FLOAT'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f
+             }
+           when 'STRING'
+             _timestamp_format = @timestamp_format || @default_timestamp_format
+             Proc.new {|val|
+               next nil if val.nil?
+               with_typecast_error(val) do |val|
+                 val.localtime(zone_offset).strftime(_timestamp_format)
+               end
+             }
+           when 'TIMESTAMP'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_f # BigQuery supports UNIX timestamp
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for timestamp column"
+           end
+         end
+
+         # ToDo: recursive conversion
+         def json_converter
+           case type
+           when 'STRING'
+             Proc.new {|val|
+               next nil if val.nil?
+               val.to_json
+             }
+           when 'RECORD'
+             Proc.new {|val|
+               val
+             }
+           else
+             raise NotSupportedType, "cannot take column type #{type} for json column"
+           end
+         end
+
+         private
+
+         # [+-]HH:MM, [+-]HHMM, [+-]HH
+         NUMERIC_PATTERN = %r{\A[+-]\d\d(:?\d\d)?\z}
+
+         # Region/Zone, Region/Zone/Zone
+         NAME_PATTERN = %r{\A[^/]+/[^/]+(/[^/]+)?\z}
+
+         def strptime_with_zone(date, timestamp_format, zone_offset)
+           time = Time.strptime(date, timestamp_format)
+           utc_offset = time.utc_offset
+           time.localtime(zone_offset) + utc_offset - zone_offset
+         end
+
+         def get_zone_offset(timezone)
+           if NUMERIC_PATTERN === timezone
+             Time.zone_offset(timezone)
+           elsif NAME_PATTERN === timezone || 'UTC' == timezone
+             tz = TZInfo::Timezone.get(timezone)
+             tz.period_for_utc(Time.now).utc_total_offset
+           else
+             raise ArgumentError, "timezone format is invalid: #{timezone}"
+           end
+         end
+       end
+     end
+   end
+ end
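
The factory builds one Proc per column, keyed by the Embulk type and the target BigQuery type. To make the semantics concrete, here is a hedged sketch (not part of the diff) exercising two representative conversions; it assumes the gem is installed and the Embulk runtime is set up the same way test/helper.rb below does it.

  # Hedged sketch: Embulk.setup and a logger are assumed, as in test/helper.rb.
  require 'embulk'
  Embulk.setup
  Embulk.logger = Embulk::Logger.new('/dev/null')
  require 'embulk/output/bigquery/value_converter_factory'

  factory = Embulk::Output::Bigquery::ValueConverterFactory

  # A string column loaded into a TIMESTAMP field: parsed with the given
  # format in the given zone, emitted as UNIX seconds.
  to_ts = factory.new(:string, 'TIMESTAMP',
                      timestamp_format: '%Y-%m-%d',
                      timezone: 'Asia/Tokyo').create_converter
  to_ts.call('2016-02-26')    # => 1456412400.0 (2016-02-26 00:00 JST)

  # strict defaults to true, so bad input raises TypeCastError;
  # with strict: false the cast failure is logged at trace level and
  # the converter yields nil instead.
  loose = factory.new(:string, 'INTEGER', strict: false).create_converter
  loose.call('not-a-number')  # => nil
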
data/test/helper.rb ADDED
@@ -0,0 +1,13 @@
+ #!/usr/bin/env ruby
+
+ require 'test/unit'
+ require 'test/unit/rr'
+
+ # require 'embulk/java/bootstrap'
+ require 'embulk'
+ Embulk.setup
+ Embulk.logger = Embulk::Logger.new('/dev/null')
+
+ APP_ROOT = File.expand_path('../', __dir__)
+ EXAMPLE_ROOT = File.expand_path('../example', __dir__)
+ TEST_ROOT = File.expand_path(File.dirname(__FILE__))
data/test/test_bigquery_client.rb ADDED
@@ -0,0 +1,166 @@
+ require_relative './helper'
+ require 'embulk/output/bigquery/bigquery_client'
+ require 'csv'
+
+ # 1. Prepare /tmp/your-project-000.json
+ # 2. CONNECT=1 bundle exec ruby test/test_bigquery_client.rb
+
+ if ENV['CONNECT']
+   module Embulk
+     class Output::Bigquery
+       class TestBigqueryClient < Test::Unit::TestCase
+         class << self
+           def startup
+             FileUtils.mkdir_p('tmp')
+           end
+
+           def shutdown
+             FileUtils.rm_rf('tmp')
+           end
+         end
+
+         def client(task = {})
+           task = least_task.merge(task)
+           BigqueryClient.new(task, schema)
+         end
+
+         def least_task
+           {
+             'project' => JSON.parse(File.read('/tmp/your-project-000.json'))['project_id'],
+             'dataset' => 'your_dataset_name',
+             'table' => 'your_table_name',
+             'auth_method' => 'json_key',
+             'json_keyfile' => '/tmp/your-project-000.json',
+             'retries' => 3,
+             'timeout_sec' => 300,
+             'open_timeout_sec' => 300,
+             'job_status_max_polling_time' => 3600,
+             'job_status_polling_interval' => 10,
+             'source_format' => 'CSV'
+           }
+         end
+
+         def schema
+           Schema.new([
+             Column.new({index: 0, name: 'boolean', type: :boolean}),
+             Column.new({index: 1, name: 'long', type: :long}),
+             Column.new({index: 2, name: 'double', type: :double}),
+             Column.new({index: 3, name: 'string', type: :string}),
+             Column.new({index: 4, name: 'timestamp', type: :timestamp}),
+             Column.new({index: 5, name: 'json', type: :json}),
+           ])
+         end
+
+         def record
+           [true,1,1.1,'1',Time.parse("2016-02-26 +00:00"),'{"foo":"bar"}']
+         end
+
+         sub_test_case "client" do
+           def test_json_keyfile
+             assert_nothing_raised { BigqueryClient.new(least_task, schema).client }
+           end
+
+           def test_p12_keyfile
+             # pending
+           end
+         end
+
+         sub_test_case "create_dataset" do
+           def test_create_dataset
+             assert_nothing_raised { client.create_dataset }
+           end
+
+           def test_create_dataset_with_reference
+             response = client.get_dataset
+             any_instance_of(BigqueryClient) do |obj|
+               mock(obj).get_dataset('your_dataset_name') { response }
+             end
+             assert_nothing_raised do
+               client.create_dataset('your_dataset_name_old', reference: 'your_dataset_name')
+             end
+           end
+         end
+
+         sub_test_case "get_dataset" do
+           def test_get_dataset
+             assert_nothing_raised { client.create_dataset }
+             assert_nothing_raised { client.get_dataset }
+           end
+
+           def test_get_dataset_not_found
+             assert_raise(NotFoundError) {
+               client.get_dataset('something_does_not_exist')
+             }
+           end
+         end
+
+         sub_test_case "create_table" do
+           def test_create_table
+             client.delete_table('your_table_name')
+             assert_nothing_raised { client.create_table('your_table_name') }
+           end
+
+           def test_create_table_already_exists
+             assert_nothing_raised { client.create_table('your_table_name') }
+           end
+         end
+
+         sub_test_case "delete_table" do
+           def test_delete_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.delete_table('your_table_name') }
+           end
+
+           def test_delete_table_not_found
+             assert_nothing_raised { client.delete_table('your_table_name') }
+           end
+         end
+
+         sub_test_case "get_table" do
+           def test_get_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.get_table('your_table_name') }
+           end
+
+           def test_get_table_not_found
+             client.delete_table('your_table_name')
+             assert_raise(NotFoundError) {
+               client.get_table('your_table_name')
+             }
+           end
+         end
+
+         sub_test_case "fields" do
+           def test_fields_from_table
+             client.create_table('your_table_name')
+             fields = client.fields_from_table('your_table_name')
+             expected = [
+               {:type=>"BOOLEAN", :name=>"boolean"},
+               {:type=>"INTEGER", :name=>"long"},
+               {:type=>"FLOAT", :name=>"double"},
+               {:type=>"STRING", :name=>"string"},
+               {:type=>"TIMESTAMP", :name=>"timestamp"},
+               {:type=>"STRING", :name=>"json"},
+             ]
+             assert_equal expected, fields
+           end
+         end
+
+         sub_test_case "copy" do
+           def test_create_table
+             client.create_table('your_table_name')
+             assert_nothing_raised { client.copy('your_table_name', 'your_table_name_old') }
+           end
+         end
+
+         sub_test_case "load" do
+           def test_load
+             client.create_table('your_table_name')
+             File.write("tmp/your_file_name.csv", record.to_csv)
+             assert_nothing_raised { client.load("tmp/your_file_name.csv", 'your_table_name') }
+           end
+         end
+       end
+     end
+   end
+ end
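
For reference, a hedged sketch of driving BigqueryClient outside the test harness, built from the same least_task shape and the same calls (create_dataset, create_table, load) the tests above exercise. It is not part of the diff: every name and path is a placeholder, and a real service-account JSON keyfile is assumed.

  # Hedged sketch, not a definitive usage guide for the plugin.
  require 'embulk'
  Embulk.setup
  Embulk.logger = Embulk::Logger.new('/dev/null')
  require 'embulk/output/bigquery/bigquery_client'
  require 'fileutils'

  task = {
    'project'                     => 'your-project',              # placeholder
    'dataset'                     => 'your_dataset_name',
    'table'                       => 'your_table_name',
    'auth_method'                 => 'json_key',
    'json_keyfile'                => '/tmp/your-project-000.json', # must exist
    'retries'                     => 3,
    'timeout_sec'                 => 300,
    'open_timeout_sec'            => 300,
    'job_status_max_polling_time' => 3600,
    'job_status_polling_interval' => 10,
    'source_format'               => 'CSV',
  }

  schema = Embulk::Schema.new([
    Embulk::Column.new(index: 0, name: 'id',   type: :long),
    Embulk::Column.new(index: 1, name: 'name', type: :string),
  ])

  client = Embulk::Output::Bigquery::BigqueryClient.new(task, schema)
  client.create_dataset                  # calling it twice raises nothing, per the tests
  client.create_table('your_table_name')

  FileUtils.mkdir_p('tmp')
  File.write('tmp/your_file_name.csv', "1,alice\n")
  client.load('tmp/your_file_name.csv', 'your_table_name')
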