fluent-plugin-bigquery 0.2.15 → 0.2.16

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 27f275ca7fb430c0576f0358736759c256b18492
4
- data.tar.gz: 0f910671b2e06f5f3af370a88122c98fc275fa02
3
+ metadata.gz: 6283655314f920c8d3f1bab8f387d96c6fe79da0
4
+ data.tar.gz: f1016e03203cf12c4c26ad62f1c3a05926423fa7
5
5
  SHA512:
6
- metadata.gz: a7014fe6b9f39479a08d8440f6d5a2e4f7524e89c72e07d77008ead605bb430963893ae88ab77934f4a6aff9a9e903af6d7d23820a5885df38c00203248991cd
7
- data.tar.gz: 5115561a849c5a5f3150c2254e536fd1b645757853c40860b35ee733a80e16452edb99b630a049f247d39ee56494f64632c1b2891544e7296d1549d63c781d82
6
+ metadata.gz: 15e484f5df810cd5736711bd70df5a9e34950e10c77118a3b6097fba6f9c1efd9641ac515df547a15b2f9dac653deb3d5b2fa665541a47bf43dba750754d584e
7
+ data.tar.gz: 1bbcea1f4ec490c69028eca66032b6dca734231fedf431951618b1d5ad08a354395f357e31a028f3e60531ec15e831bbbb0d4f9a70bd48082a57562885564023
data/README.md CHANGED
@@ -5,7 +5,7 @@
5
5
  * insert data over streaming inserts
6
6
  * for continuous real-time insertions
7
7
  * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
8
- * (NOT IMPLEMENTED) load data
8
+ * load data
9
9
  * for data loading as batch jobs, for big amount of data
10
10
  * https://developers.google.com/bigquery/loading-data-into-bigquery
11
11
 
@@ -20,7 +20,7 @@ Configure insert specifications with target table schema, with your credentials.
20
20
 
21
21
  ```apache
22
22
  <match dummy>
23
- type bigquery
23
+ @type bigquery
24
24
 
25
25
  method insert # default
26
26
 
@@ -47,7 +47,7 @@ For high rate inserts over streaming inserts, you should specify flush intervals
47
47
 
48
48
  ```apache
49
49
  <match dummy>
50
- type bigquery
50
+ @type bigquery
51
51
 
52
52
  method insert # default
53
53
 
@@ -106,6 +106,37 @@ Important options for high rate events are:
106
106
  See [Quota policy](https://cloud.google.com/bigquery/streaming-data-into-bigquery#quota)
107
107
  section in the Google BigQuery document.
108
108
 
109
+ ### Load
110
+ ```apache
111
+ <match bigquery>
112
+ @type bigquery
113
+
114
+ method load
115
+ buffer_type file
116
+ buffer_path bigquery.*.buffer
117
+ flush_interval 1800
118
+ flush_at_shutdown true
119
+ try_flush_interval 1
120
+ utc
121
+
122
+ auth_method json_key
123
+ json_key json_key_path.json
124
+
125
+ time_format %s
126
+ time_field time
127
+
128
+ project yourproject_id
129
+ dataset yourdataset_id
130
+ auto_create_table true
131
+ table yourtable%{time_slice}
132
+ schema_path bq_schema.json
133
+ </match>
134
+ ```
135
+
136
+ I recommend using a file buffer and a long flush interval.
137
+
138
+ __CAUTION: the default `flush_interval` is still `0.25` even when `method` is `load` in the current version, so set it explicitly as in the example above.__
139
+
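For reference, the new `load` path (added to `lib/fluent/plugin/out_bigquery.rb` later in this diff) submits a BigQuery load job through `google-api-client` and uploads the buffered chunk as newline-delimited JSON. The following is only a minimal standalone sketch of that call, mirroring the `insert_job` invocation from this diff; the credentials setup, file name, and project/dataset/table ids are placeholders, and the table schema the plugin also sends is omitted:

```ruby
# Minimal standalone sketch of what `method load` does under the hood
# (mirrors the insert_job call added to lib/fluent/plugin/out_bigquery.rb
# in this diff). Credentials, the NDJSON file name, and the project/dataset/
# table ids below are placeholders; the plugin also sends the table schema,
# which is omitted here for brevity.
require 'google/apis/bigquery_v2'
require 'googleauth'

client = Google::Apis::BigqueryV2::BigqueryService.new
client.authorization = Google::Auth.get_application_default(
  ['https://www.googleapis.com/auth/bigquery']
)

File.open('buffered_chunk.ndjson') do |upload_source|
  client.insert_job('yourproject_id', {
    configuration: {
      load: {
        destination_table: {
          project_id: 'yourproject_id',
          dataset_id: 'yourdataset_id',
          table_id:   'yourtable_20160316',
        },
        write_disposition: 'WRITE_APPEND',
        source_format: 'NEWLINE_DELIMITED_JSON'
      }
    }
  }, {upload_source: upload_source, content_type: 'application/octet-stream'})
end
```

In the plugin, the chunk is read directly from the file buffer (or spooled to a tempfile when a memory buffer is used) before being passed as the upload source, which is another reason to prefer the file buffer recommended above.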
109
140
  ### Authentication
110
141
 
111
142
  There are two methods supported to fetch access token for the service account.
@@ -127,7 +158,7 @@ download its JSON key and deploy the key with fluentd.
127
158
 
128
159
  ```apache
129
160
  <match dummy>
130
- type bigquery
161
+ @type bigquery
131
162
 
132
163
  auth_method json_key
133
164
  json_key /home/username/.keys/00000000000000000000000000000000-jsonkey.json
@@ -144,7 +175,7 @@ You need to only include `private_key` and `client_email` key from JSON key file
144
175
 
145
176
  ```apache
146
177
  <match dummy>
147
- type bigquery
178
+ @type bigquery
148
179
 
149
180
  auth_method json_key
150
181
  json_key {"private_key": "-----BEGIN PRIVATE KEY-----\n...", "client_email": "xxx@developer.gserviceaccount.com"}
@@ -165,7 +196,7 @@ Compute Engine instance, then you can configure fluentd like this.
165
196
 
166
197
  ```apache
167
198
  <match dummy>
168
- type bigquery
199
+ @type bigquery
169
200
 
170
201
  auth_method compute_engine
171
202
 
@@ -198,6 +229,7 @@ In this authentication method, the credentials returned are determined by the en
198
229
 
199
230
  ### Table id formatting
200
231
 
232
+ #### strftime formatting
201
233
  `table` and `tables` options accept [Time#strftime](http://ruby-doc.org/core-1.9.3/Time.html#method-i-strftime)
202
234
  format to construct table ids.
203
235
  Table ids are formatted at runtime
@@ -208,7 +240,7 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
208
240
 
209
241
  ```apache
210
242
  <match dummy>
211
- type bigquery
243
+ @type bigquery
212
244
 
213
245
  ...
214
246
 
@@ -220,8 +252,11 @@ data is inserted into tables `accesslog_2014_08`, `accesslog_2014_09` and so on.
220
252
  </match>
221
253
  ```
222
254
 
255
+ #### record attribute formatting
223
256
  The format can be suffixed with attribute name.
224
257
 
258
+ __NOTE: This feature is available only if `method` is `insert`, because it has a performance impact. Prefer `%{time_slice}` instead.__
259
+
225
260
  ```apache
226
261
  <match dummy>
227
262
  ...
@@ -233,23 +268,39 @@ The format can be suffixed with attribute name.
233
268
  If attribute name is given, the time to be used for formatting is value of each row.
234
269
  The value for the time should be a UNIX time.
235
270
 
271
+ #### time_slice_key formatting
236
272
  Or, the options can use the `%{time_slice}` placeholder.
237
273
  `%{time_slice}` is replaced by formatted time slice key at runtime.
238
274
 
239
275
  ```apache
240
276
  <match dummy>
241
- type bigquery
242
-
277
+ @type bigquery
278
+
243
279
  ...
244
-
245
- project yourproject_id
246
- dataset yourdataset_id
247
280
  table accesslog%{time_slice}
248
-
249
281
  ...
250
282
  </match>
251
283
  ```
252
284
 
285
+ #### record attribute value formatting
286
+ Alternatively, the `${attr_name}` placeholder is available to use the value of a record attribute as part of the table id.
287
+ `${attr_name}` is replaced by the string value of the attribute specified by `attr_name`.
288
+
289
+ __NOTE: This feature is available only if `method` is `insert`.__
290
+
291
+ ```apache
292
+ <match dummy>
293
+ ...
294
+ table accesslog_%Y_%m_${subdomain}
295
+ ...
296
+ </match>
297
+ ```
298
+
299
+ For example, if the value of the `subdomain` attribute is `"bq.fluent"`, the table id will be like `"accesslog_2016_03_bqfluent"`.
300
+
301
+ - Any type of attribute value is allowed, because its stringified value is used as the replacement.
302
+ - Acceptable characters are alphanumeric characters and `_`; all other characters are removed (see the sketch below).
303
+
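For illustration only, the replacement behaves roughly like the sketch below, which mirrors the `gsub` added to `generate_table_id` in this diff. `expand_table_id` is a made-up helper name for this example, not part of the plugin's API.

```ruby
# Replace each ${attr_name} with the stringified attribute value, strip
# non-word characters, then apply strftime to the remaining format string.
def expand_table_id(format, time, record)
  expanded = format.gsub(/\$\{\s*(\w+)\s*\}/) do
    record[Regexp.last_match(1)].to_s.gsub(/[^\w]/, '')
  end
  time.strftime(expanded)
end

expand_table_id('accesslog_%Y_%m_${subdomain}', Time.utc(2016, 3, 16), 'subdomain' => 'bq.fluent')
# => "accesslog_2016_03_bqfluent"
```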
253
304
  ### Dynamic table creating
254
305
 
255
306
  When `auto_create_table` is set to `true`, try to create the table using BigQuery API when insertion failed with code=404 "Not Found: Table ...".
@@ -259,7 +310,7 @@ NOTE: `auto_create_table` option cannot be used with `fetch_schema`. You should
259
310
 
260
311
  ```apache
261
312
  <match dummy>
262
- type bigquery
313
+ @type bigquery
263
314
 
264
315
  ...
265
316
 
@@ -283,7 +334,7 @@ you can also specify nested fields by prefixing their belonging record fields.
283
334
 
284
335
  ```apache
285
336
  <match dummy>
286
- type bigquery
337
+ @type bigquery
287
338
 
288
339
  ...
289
340
 
@@ -322,7 +373,7 @@ The second method is to specify a path to a BigQuery schema file instead of list
322
373
 
323
374
  ```apache
324
375
  <match dummy>
325
- type bigquery
376
+ @type bigquery
326
377
 
327
378
  ...
328
379
 
@@ -339,7 +390,7 @@ The third method is to set `fetch_schema` to `true` to enable fetch a schema usi
339
390
 
340
391
  ```apache
341
392
  <match dummy>
342
- type bigquery
393
+ @type bigquery
343
394
 
344
395
  ...
345
396
 
@@ -363,7 +414,7 @@ You can set `insert_id_field` option to specify the field to use as `insertId` p
363
414
 
364
415
  ```apache
365
416
  <match dummy>
366
- type bigquery
417
+ @type bigquery
367
418
 
368
419
  ...
369
420
 
@@ -11,7 +11,7 @@ Gem::Specification.new do |spec|
11
11
  spec.description = %q{Fluentd plugin to store data on Google BigQuery, by load, or by stream inserts}
12
12
  spec.summary = %q{Fluentd plugin to store data on Google BigQuery}
13
13
  spec.homepage = "https://github.com/kaizenplatform/fluent-plugin-bigquery"
14
- spec.license = "APLv2"
14
+ spec.license = "Apache-2.0"
15
15
 
16
16
  spec.files = `git ls-files`.split($/)
17
17
  spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
@@ -23,9 +23,10 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "test-unit", "~> 3.0.2"
24
24
  spec.add_development_dependency "test-unit-rr", "~> 1.0.3"
25
25
 
26
- spec.add_runtime_dependency "google-api-client", "~> 0.9.1"
26
+ spec.add_runtime_dependency "google-api-client", "~> 0.9.3"
27
27
  spec.add_runtime_dependency "googleauth", ">= 0.5.0"
28
28
  spec.add_runtime_dependency "multi_json"
29
+ spec.add_runtime_dependency "activesupport", ">= 3.2"
29
30
  spec.add_runtime_dependency "fluentd"
30
31
  spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
31
32
  spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.3.0"
@@ -1,6 +1,6 @@
1
1
  module Fluent
2
2
  module BigQueryPlugin
3
- VERSION = "0.2.15"
3
+ VERSION = "0.2.16"
4
4
  end
5
5
  end
6
6
 
@@ -92,7 +92,7 @@ module Fluent
92
92
 
93
93
  config_param :insert_id_field, :string, default: nil
94
94
 
95
- config_param :method, :string, default: 'insert' # or 'load' # TODO: not implemented now
95
+ config_param :method, :string, default: 'insert' # or 'load'
96
96
 
97
97
  config_param :load_size_limit, :integer, default: 1000**4 # < 1TB (1024^4) # TODO: not implemented now
98
98
  ### method: 'load'
@@ -150,6 +150,14 @@ module Fluent
150
150
  def configure(conf)
151
151
  super
152
152
 
153
+ if @method == "insert"
154
+ extend(InsertImplementation)
155
+ elsif @method == "load"
156
+ extend(LoadImplementation)
157
+ else
158
+ raise Fluent::ConfigError, "'method' must be 'insert' or 'load'"
159
+ end
160
+
153
161
  case @auth_method
154
162
  when 'private_key'
155
163
  unless @email && @private_key_path
@@ -286,6 +294,12 @@ module Fluent
286
294
  else
287
295
  current_time
288
296
  end
297
+ if row && format =~ /\$\{/
298
+ json = row[:json]
299
+ format.gsub!(/\$\{\s*(\w+)\s*\}/) do |m|
300
+ row[:json][$1.to_sym].to_s.gsub(/[^\w]/, '')
301
+ end
302
+ end
289
303
  table_id = time.strftime(format)
290
304
 
291
305
  if chunk
@@ -321,29 +335,6 @@ module Fluent
321
335
  raise "failed to create table in bigquery" # TODO: error class
322
336
  end
323
337
 
324
- def insert(table_id, rows)
325
- client.insert_all_table_data(@project, @dataset, table_id, {
326
- rows: rows
327
- }, {})
328
- rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
329
- # api_error? -> client cache clear
330
- @cached_client = nil
331
-
332
- message = e.message
333
- if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ message.to_s
334
- # Table Not Found: Auto Create Table
335
- create_table(table_id)
336
- raise "table created. send rows next time."
337
- end
338
- log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
339
- raise "failed to insert into bigquery" # TODO: error class
340
- end
341
-
342
- def load
343
- # https://developers.google.com/bigquery/loading-data-into-bigquery#loaddatapostrequest
344
- raise NotImplementedError # TODO
345
- end
346
-
347
338
  def replace_record_key(record)
348
339
  new_record = {}
349
340
  record.each do |key, _|
@@ -366,44 +357,13 @@ module Fluent
366
357
  record
367
358
  end
368
359
 
369
- def format(tag, time, record)
370
- buf = ''
371
-
372
- if @replace_record_key
373
- record = replace_record_key(record)
374
- end
375
-
376
- if @convert_hash_to_json
377
- record = convert_hash_to_json(record)
378
- end
379
-
380
- row = @fields.format(@add_time_field.call(record, time))
381
- unless row.empty?
382
- row = {"json" => row}
383
- row['insert_id'] = @get_insert_id.call(record) if @get_insert_id
384
- buf << row.to_msgpack
385
- end
386
- buf
387
- end
388
-
389
360
  def write(chunk)
390
- rows = []
391
- chunk.msgpack_each do |row_object|
392
- # TODO: row size limit
393
- rows << row_object.deep_symbolize_keys
394
- end
395
-
396
- # TODO: method
397
-
398
- insert_table_format = @tables_mutex.synchronize do
361
+ table_id_format = @tables_mutex.synchronize do
399
362
  t = @tables_queue.shift
400
363
  @tables_queue.push t
401
364
  t
402
365
  end
403
-
404
- rows.group_by {|row| generate_table_id(insert_table_format, Time.at(Fluent::Engine.now), row, chunk) }.each do |table_id, rows|
405
- insert(table_id, rows)
406
- end
366
+ _write(chunk, table_id_format)
407
367
  end
408
368
 
409
369
  def fetch_schema
@@ -422,6 +382,137 @@ module Fluent
422
382
  raise "failed to fetch schema from bigquery" # TODO: error class
423
383
  end
424
384
 
385
+ module InsertImplementation
386
+ def format(tag, time, record)
387
+ buf = ''
388
+
389
+ if @replace_record_key
390
+ record = replace_record_key(record)
391
+ end
392
+
393
+ if @convert_hash_to_json
394
+ record = convert_hash_to_json(record)
395
+ end
396
+
397
+ row = @fields.format(@add_time_field.call(record, time))
398
+ unless row.empty?
399
+ row = {"json" => row}
400
+ row['insert_id'] = @get_insert_id.call(record) if @get_insert_id
401
+ buf << row.to_msgpack
402
+ end
403
+ buf
404
+ end
405
+
406
+ def _write(chunk, table_format)
407
+ rows = []
408
+ chunk.msgpack_each do |row_object|
409
+ # TODO: row size limit
410
+ rows << row_object.deep_symbolize_keys
411
+ end
412
+
413
+ rows.group_by {|row| generate_table_id(table_format, Time.at(Fluent::Engine.now), row, chunk) }.each do |table_id, group|
414
+ insert(table_id, group)
415
+ end
416
+ end
417
+
418
+ def insert(table_id, rows)
419
+ client.insert_all_table_data(@project, @dataset, table_id, {
420
+ rows: rows
421
+ }, {})
422
+ rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
423
+ # api_error? -> client cache clear
424
+ @cached_client = nil
425
+
426
+ message = e.message
427
+ if @auto_create_table && e.status_code == 404 && /Not Found: Table/i =~ message.to_s
428
+ # Table Not Found: Auto Create Table
429
+ create_table(table_id)
430
+ raise "table created. send rows next time."
431
+ end
432
+ log.error "tabledata.insertAll API", project_id: @project, dataset: @dataset, table: table_id, code: e.status_code, message: message
433
+ raise "failed to insert into bigquery" # TODO: error class
434
+ end
435
+ end
436
+
437
+ module LoadImplementation
438
+ def format(tag, time, record)
439
+ buf = ''
440
+
441
+ if @replace_record_key
442
+ record = replace_record_key(record)
443
+ end
444
+ row = @fields.format(@add_time_field.call(record, time))
445
+ unless row.empty?
446
+ buf << MultiJson.dump(row) + "\n"
447
+ end
448
+ buf
449
+ end
450
+
451
+ def _write(chunk, table_id_format)
452
+ table_id = generate_table_id(table_id_format, Time.at(Fluent::Engine.now), nil, chunk)
453
+ load(chunk, table_id)
454
+ end
455
+
456
+ def load(chunk, table_id)
457
+ res = nil
458
+ create_upload_source(chunk) do |upload_source|
459
+ res = client.insert_job(@project, {
460
+ configuration: {
461
+ load: {
462
+ destination_table: {
463
+ project_id: @project,
464
+ dataset_id: @dataset,
465
+ table_id: table_id,
466
+ },
467
+ schema: {
468
+ fields: @fields.to_a,
469
+ },
470
+ write_disposition: "WRITE_APPEND",
471
+ source_format: "NEWLINE_DELIMITED_JSON"
472
+ }
473
+ }
474
+ }, {upload_source: upload_source, content_type: "application/octet-stream"})
475
+ end
476
+ wait_load(res, table_id)
477
+ end
478
+
479
+ private
480
+
481
+ def wait_load(res, table_id)
482
+ wait_interval = 10
483
+ _response = res
484
+ until _response.status.state == "DONE"
485
+ log.debug "wait for load job finish", state: _response.status.state
486
+ sleep wait_interval
487
+ _response = client.get_job(@project, _response.job_reference.job_id)
488
+ end
489
+
490
+ if _response.status.error_result
491
+ log.error "job.insert API", project_id: @project, dataset: @dataset, table: table_id, message: _response.status.error_result.message
492
+ raise "failed to load into bigquery"
493
+ end
494
+
495
+ log.debug "finish load job", state: _response.status.state
496
+ end
497
+
498
+ def create_upload_source(chunk)
499
+ chunk_is_file = @buffer_type == 'file'
500
+ if chunk_is_file
501
+ File.open(chunk.path) do |file|
502
+ yield file
503
+ end
504
+ else
505
+ Tempfile.open("chunk-tmp") do |file|
506
+ file.binmode
507
+ chunk.write_to(file)
508
+ file.sync
509
+ file.rewind
510
+ yield file
511
+ end
512
+ end
513
+ end
514
+ end
515
+
425
516
  class FieldSchema
426
517
  def initialize(name, mode = :nullable)
427
518
  unless [:nullable, :required, :repeated].include?(mode)
@@ -27,7 +27,6 @@ require 'fluent/plugin/buf_memory'
27
27
  require 'fluent/plugin/buf_file'
28
28
 
29
29
  require 'fluent/plugin/out_bigquery'
30
- require 'fluent/plugin/bigquery/load_request_body_wrapper'
31
30
 
32
31
  require 'rr'
33
32
 
@@ -710,6 +710,35 @@ class BigQueryOutputTest < Test::Unit::TestCase
710
710
  assert_equal expected, MessagePack.unpack(buf)
711
711
  end
712
712
 
713
+ def test_format_for_load
714
+ now = Time.now
715
+ input = [
716
+ now,
717
+ {
718
+ "uuid" => "9ABFF756-0267-4247-847F-0895B65F0938",
719
+ }
720
+ ]
721
+ expected = MultiJson.dump({
722
+ "uuid" => "9ABFF756-0267-4247-847F-0895B65F0938",
723
+ }) + "\n"
724
+
725
+ driver = create_driver(<<-CONFIG)
726
+ method load
727
+ table foo
728
+ email foo@bar.example
729
+ private_key_path /path/to/key
730
+ project yourproject_id
731
+ dataset yourdataset_id
732
+
733
+ field_string uuid
734
+ CONFIG
735
+ driver.instance.start
736
+ buf = driver.instance.format_stream("my.tag", [input])
737
+ driver.instance.shutdown
738
+
739
+ assert_equal expected, buf
740
+ end
741
+
713
742
  def test_empty_value_in_required
714
743
  now = Time.now
715
744
  input = [
@@ -857,6 +886,66 @@ class BigQueryOutputTest < Test::Unit::TestCase
857
886
  driver.instance.shutdown
858
887
  end
859
888
 
889
+ def test_write_for_load
890
+ schema_path = File.join(File.dirname(__FILE__), "testdata", "sudo.schema")
891
+ entry = {a: "b"}, {b: "c"}
892
+ driver = create_driver(<<-CONFIG)
893
+ method load
894
+ table foo
895
+ email foo@bar.example
896
+ private_key_path /path/to/key
897
+ project yourproject_id
898
+ dataset yourdataset_id
899
+
900
+ time_format %s
901
+ time_field time
902
+
903
+ schema_path #{schema_path}
904
+ field_integer time
905
+ CONFIG
906
+ schema_fields = MultiJson.load(File.read(schema_path)).map(&:deep_symbolize_keys).tap do |h|
907
+ h[0][:type] = "INTEGER"
908
+ h[0][:mode] = "NULLABLE"
909
+ end
910
+
911
+ chunk = Fluent::MemoryBufferChunk.new("my.tag")
912
+ io = StringIO.new("hello")
913
+ mock(driver.instance).create_upload_source(chunk).yields(io)
914
+ mock_client(driver) do |expect|
915
+ expect.insert_job('yourproject_id', {
916
+ configuration: {
917
+ load: {
918
+ destination_table: {
919
+ project_id: 'yourproject_id',
920
+ dataset_id: 'yourdataset_id',
921
+ table_id: 'foo',
922
+ },
923
+ schema: {
924
+ fields: schema_fields,
925
+ },
926
+ write_disposition: "WRITE_APPEND",
927
+ source_format: "NEWLINE_DELIMITED_JSON"
928
+ }
929
+ }
930
+ }, {upload_source: io, content_type: "application/octet-stream"}) {
931
+ s = stub!
932
+ status_stub = stub!
933
+ s.status { status_stub }
934
+ status_stub.state { "DONE" }
935
+ status_stub.error_result { nil }
936
+ s
937
+ }
938
+ end
939
+
940
+ entry.each do |e|
941
+ chunk << MultiJson.dump(e) + "\n"
942
+ end
943
+
944
+ driver.instance.start
945
+ driver.instance.write(chunk)
946
+ driver.instance.shutdown
947
+ end
948
+
860
949
  def test_write_with_row_based_table_id_formatting
861
950
  entry = [
862
951
  {json: {a: "b", created_at: Time.local(2014,8,20,9,0,0).to_i}},
@@ -935,6 +1024,26 @@ class BigQueryOutputTest < Test::Unit::TestCase
935
1024
  assert_equal 'foo_20140811', table_id
936
1025
  end
937
1026
 
1027
+ def test_generate_table_id_with_attribute_replacement
1028
+ driver = create_driver
1029
+ table_id_format = 'foo_%Y_%m_%d_${baz}'
1030
+ current_time = Time.now
1031
+ time = Time.local(2014, 8, 11, 21, 20, 56)
1032
+ [
1033
+ [ { baz: 1234 }, 'foo_2014_08_11_1234' ],
1034
+ [ { baz: 'piyo' }, 'foo_2014_08_11_piyo' ],
1035
+ [ { baz: true }, 'foo_2014_08_11_true' ],
1036
+ [ { baz: nil }, 'foo_2014_08_11_' ],
1037
+ [ { baz: '' }, 'foo_2014_08_11_' ],
1038
+ [ { baz: "_X-Y.Z !\n" }, 'foo_2014_08_11__XYZ' ],
1039
+ [ { baz: { xyz: 1 } }, 'foo_2014_08_11_xyz1' ],
1040
+ ].each do |attrs, expected|
1041
+ row = { json: { created_at: Time.local(2014,8,10,21,20,57).to_i }.merge(attrs) }
1042
+ table_id = driver.instance.generate_table_id(table_id_format, time, row)
1043
+ assert_equal expected, table_id
1044
+ end
1045
+ end
1046
+
938
1047
  def test_auto_create_table_by_bigquery_api
939
1048
  now = Time.now
940
1049
  message = {
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: fluent-plugin-bigquery
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.15
4
+ version: 0.2.16
5
5
  platform: ruby
6
6
  authors:
7
7
  - Naoya Ito
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-01-25 00:00:00.000000000 Z
11
+ date: 2016-03-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake
@@ -72,14 +72,14 @@ dependencies:
72
72
  requirements:
73
73
  - - "~>"
74
74
  - !ruby/object:Gem::Version
75
- version: 0.9.1
75
+ version: 0.9.3
76
76
  type: :runtime
77
77
  prerelease: false
78
78
  version_requirements: !ruby/object:Gem::Requirement
79
79
  requirements:
80
80
  - - "~>"
81
81
  - !ruby/object:Gem::Version
82
- version: 0.9.1
82
+ version: 0.9.3
83
83
  - !ruby/object:Gem::Dependency
84
84
  name: googleauth
85
85
  requirement: !ruby/object:Gem::Requirement
@@ -108,6 +108,20 @@ dependencies:
108
108
  - - ">="
109
109
  - !ruby/object:Gem::Version
110
110
  version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: activesupport
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '3.2'
118
+ type: :runtime
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '3.2'
111
125
  - !ruby/object:Gem::Dependency
112
126
  name: fluentd
113
127
  requirement: !ruby/object:Gem::Requirement
@@ -193,7 +207,6 @@ files:
193
207
  - README.md
194
208
  - Rakefile
195
209
  - fluent-plugin-bigquery.gemspec
196
- - lib/fluent/plugin/bigquery/load_request_body_wrapper.rb
197
210
  - lib/fluent/plugin/bigquery/version.rb
198
211
  - lib/fluent/plugin/out_bigquery.rb
199
212
  - test/helper.rb
@@ -201,10 +214,9 @@ files:
201
214
  - test/plugin/testdata/apache.schema
202
215
  - test/plugin/testdata/json_key.json
203
216
  - test/plugin/testdata/sudo.schema
204
- - test/test_load_request_body_wrapper.rb
205
217
  homepage: https://github.com/kaizenplatform/fluent-plugin-bigquery
206
218
  licenses:
207
- - APLv2
219
+ - Apache-2.0
208
220
  metadata: {}
209
221
  post_install_message:
210
222
  rdoc_options: []
@@ -232,4 +244,3 @@ test_files:
232
244
  - test/plugin/testdata/apache.schema
233
245
  - test/plugin/testdata/json_key.json
234
246
  - test/plugin/testdata/sudo.schema
235
- - test/test_load_request_body_wrapper.rb
@@ -1,173 +0,0 @@
1
- module Fluent
2
- module BigQueryPlugin
3
- class LoadRequestBodyWrapper
4
- # body can be a instance of IO (#rewind, #read, #to_str)
5
- # http://rubydoc.info/github/google/google-api-ruby-client/Google/APIClient/Request#body-instance_method
6
-
7
- # http://rubydoc.info/github/google/google-api-ruby-client/Google/APIClient#execute-instance_method
8
- # (Google::APIClient::Method) api_method: The method object or the RPC name of the method being executed.
9
- # (Hash, Array) parameters: The parameters to send to the method.
10
- # (String) body: The body of the request.
11
- # (Hash, Array) headers: The HTTP headers for the request.
12
- # (Hash) options: A set of options for the request, of which:
13
- # (#generate_authenticated_request) :authorization (default: true)
14
- # - The authorization mechanism for the response. Used only if :authenticated is true.
15
- # (TrueClass, FalseClass) :authenticated (default: true)
16
- # - true if the request must be signed or somehow authenticated, false otherwise.
17
- # (TrueClass, FalseClass) :gzip (default: true) - true if gzip enabled, false otherwise.
18
-
19
- # https://developers.google.com/bigquery/loading-data-into-bigquery#loaddatapostrequest
20
-
21
- JSON_PRETTY_DUMP = JSON::State.new(space: " ", indent:" ", object_nl:"\n", array_nl:"\n")
22
-
23
- CONTENT_TYPE_FIRST = "Content-Type: application/json; charset=UTF-8\n\n"
24
- CONTENT_TYPE_SECOND = "Content-Type: application/octet-stream\n\n"
25
-
26
- MULTIPART_BOUNDARY = "--xxx\n"
27
- MULTIPART_BOUNDARY_END = "--xxx--\n"
28
-
29
- def initialize(project_id, dataset_id, table_id, field_defs, buffer)
30
- @metadata = {
31
- configuration: {
32
- load: {
33
- sourceFormat: "<required for JSON files>",
34
- schema: {
35
- fields: field_defs
36
- },
37
- destinationTable: {
38
- projectId: project_id,
39
- datasetId: dataset_id,
40
- tableId: table_id
41
- }
42
- }
43
- }
44
- }
45
-
46
- @non_buffer = MULTIPART_BOUNDARY + CONTENT_TYPE_FIRST + @metadata.to_json(JSON_PRETTY_DUMP) + "\n" +
47
- MULTIPART_BOUNDARY + CONTENT_TYPE_SECOND
48
- @non_buffer.force_encoding("ASCII-8BIT")
49
- @non_buffer_bytesize = @non_buffer.bytesize
50
-
51
- @buffer = buffer # read
52
- @buffer_bytesize = @buffer.size # Fluentd Buffer Chunk #size -> bytesize
53
-
54
- @footer = MULTIPART_BOUNDARY_END.force_encoding("ASCII-8BIT")
55
-
56
- @contents_bytesize = @non_buffer_bytesize + @buffer_bytesize
57
- @total_bytesize = @contents_bytesize + MULTIPART_BOUNDARY_END.bytesize
58
-
59
- @whole_data = nil
60
-
61
- @counter = 0
62
- @eof = false
63
- end
64
-
65
- # sample_body = <<EOF
66
- # --xxx
67
- # Content-Type: application/json; charset=UTF-8
68
- #
69
- # {
70
- # "configuration": {
71
- # "load": {
72
- # "sourceFormat": "<required for JSON files>",
73
- # "schema": {
74
- # "fields": [
75
- # {"name":"f1", "type":"STRING"},
76
- # {"name":"f2", "type":"INTEGER"}
77
- # ]
78
- # },
79
- # "destinationTable": {
80
- # "projectId": "projectId",
81
- # "datasetId": "datasetId",
82
- # "tableId": "tableId"
83
- # }
84
- # }
85
- # }
86
- # }
87
- # --xxx
88
- # Content-Type: application/octet-stream
89
- #
90
- # <your data>
91
- # --xxx--
92
- # EOF
93
- def rewind
94
- @counter = 0
95
- @eof = false
96
- end
97
-
98
- def eof?
99
- @eof
100
- end
101
-
102
- def to_str
103
- rewind
104
- self.read # all data
105
- end
106
-
107
- def read(length=nil, outbuf="")
108
- raise ArgumentError, "negative read length" if length && length < 0
109
- return (length.nil? || length == 0) ? "" : nil if @eof
110
- return outbuf if length == 0
111
-
112
- # read all data
113
- if length.nil? || length >= @total_bytesize
114
- @whole_data ||= @buffer.read.force_encoding("ASCII-8BIT")
115
-
116
- if @counter.zero?
117
- outbuf.replace(@non_buffer)
118
- outbuf << @whole_data
119
- outbuf << @footer
120
- elsif @counter < @non_buffer_bytesize
121
- outbuf.replace(@non_buffer[ @counter .. -1 ])
122
- outbuf << @whole_data
123
- outbuf << @footer
124
- elsif @counter < @contents_bytesize
125
- outbuf.replace(@whole_data[ (@counter - @non_buffer_bytesize) .. -1 ])
126
- outbuf << @footer
127
- else
128
- outbuf.replace(@footer[ (@counter - @contents_bytesize) .. -1 ])
129
- end
130
- @counter = @total_bytesize
131
- @eof = true
132
- return outbuf
133
- end
134
-
135
- # In ruby script level (non-ext module), we cannot prevent to change outbuf length or object re-assignment
136
- outbuf.replace("")
137
-
138
- # return first part (metadata)
139
- if @counter < @non_buffer_bytesize
140
- non_buffer_part = @non_buffer[@counter, length]
141
- if non_buffer_part
142
- outbuf << non_buffer_part
143
- length -= non_buffer_part.bytesize
144
- @counter += non_buffer_part.bytesize
145
- end
146
- end
147
- return outbuf if length < 1
148
-
149
- # return second part (buffer content)
150
- if @counter < @contents_bytesize
151
- @whole_data ||= @buffer.read.force_encoding("ASCII-8BIT")
152
- buffer_part = @whole_data[@counter - @non_buffer_bytesize, length]
153
- if buffer_part
154
- outbuf << buffer_part
155
- length -= buffer_part.bytesize
156
- @counter += buffer_part.bytesize
157
- end
158
- end
159
- return outbuf if length < 1
160
-
161
- # return footer
162
- footer_part = @footer[@counter - @contents_bytesize, length]
163
- if footer_part
164
- outbuf << footer_part
165
- @counter += footer_part.bytesize
166
- @eof = true if @counter >= @total_bytesize
167
- end
168
-
169
- outbuf
170
- end
171
- end
172
- end
173
- end
@@ -1,190 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- require 'helper'
3
- require 'json'
4
- require 'tempfile'
5
-
6
- class LoadRequestBodyWrapperTest < Test::Unit::TestCase
7
- def content_alphabet(repeat)
8
- (0...repeat).map{|i| "#{i}0123456789\n" }.join
9
- end
10
-
11
- def content_kana(repeat)
12
- (0...repeat).map{|i| "#{i}あいうえおかきくけこ\n" }.join
13
- end
14
-
15
- def mem_chunk(repeat=10, kana=false)
16
- content = kana ? content_kana(repeat) : content_alphabet(repeat)
17
- Fluent::MemoryBufferChunk.new('bc_mem', content)
18
- end
19
-
20
- def file_chunk(repeat=10, kana=false)
21
- content = kana ? content_kana(repeat) : content_alphabet(repeat)
22
- tmpfile = Tempfile.new('fluent_bigquery_plugin_test')
23
- buf = Fluent::FileBufferChunk.new('bc_mem', tmpfile.path, tmpfile.object_id)
24
- buf << content
25
- buf
26
- end
27
-
28
- def field_defs
29
- [{"name" => "field1", "type" => "STRING"}, {"name" => "field2", "type" => "INTEGER"}]
30
- end
31
-
32
- def check_meta(blank, first, last)
33
- assert_equal "", blank
34
-
35
- header1, body1 = first.split("\n\n")
36
- assert_equal "Content-Type: application/json; charset=UTF-8", header1
37
- metadata = JSON.parse(body1)
38
- assert_equal "<required for JSON files>", metadata["configuration"]["load"]["sourceFormat"]
39
- assert_equal "field1", metadata["configuration"]["load"]["schema"]["fields"][0]["name"]
40
- assert_equal "STRING", metadata["configuration"]["load"]["schema"]["fields"][0]["type"]
41
- assert_equal "field2", metadata["configuration"]["load"]["schema"]["fields"][1]["name"]
42
- assert_equal "INTEGER", metadata["configuration"]["load"]["schema"]["fields"][1]["type"]
43
- assert_equal "pname1", metadata["configuration"]["load"]["destinationTable"]["projectId"]
44
- assert_equal "dname1", metadata["configuration"]["load"]["destinationTable"]["datasetId"]
45
- assert_equal "tname1", metadata["configuration"]["load"]["destinationTable"]["tableId"]
46
-
47
- assert_equal "--\n", last
48
- end
49
-
50
- def check_ascii(data)
51
- blank, first, second, last = data.split(/--xxx\n?/)
52
-
53
- check_meta(blank, first, last)
54
-
55
- header2, body2 = second.split("\n\n")
56
- assert_equal "Content-Type: application/octet-stream", header2
57
- i = 0
58
- body2.each_line do |line|
59
- assert_equal "#{i}0123456789\n", line
60
- i += 1
61
- end
62
- end
63
-
64
- def check_kana(data)
65
- blank, first, second, last = data.split(/--xxx\n?/)
66
-
67
- check_meta(blank, first, last)
68
-
69
- header2, body2 = second.split("\n\n")
70
- assert_equal "Content-Type: application/octet-stream", header2
71
- i = 0
72
- body2.each_line do |line|
73
- assert_equal "#{i}あいうえおかきくけこ\n", line
74
- i += 1
75
- end
76
- end
77
-
78
- def setup
79
- @klass = Fluent::BigQueryPlugin::LoadRequestBodyWrapper
80
- self
81
- end
82
-
83
- def test_memory_buf
84
- d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(10))
85
- data1 = d1.read.force_encoding("UTF-8")
86
- check_ascii(data1)
87
-
88
- d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(10))
89
- data2 = ""
90
- while !d2.eof? do
91
- buf = " "
92
- objid = buf.object_id
93
- data2 << d2.read(20, buf)
94
- assert_equal objid, buf.object_id
95
- end
96
- data2.force_encoding("UTF-8")
97
-
98
- assert_equal data1.size, data2.size
99
- end
100
-
101
- def test_memory_buf2
102
- d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000))
103
- data1 = d1.read.force_encoding("UTF-8")
104
- check_ascii(data1)
105
-
106
- d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000))
107
- data2 = ""
108
- while !d2.eof? do
109
- buf = " "
110
- objid = buf.object_id
111
- data2 << d2.read(2048, buf)
112
- assert_equal objid, buf.object_id
113
- end
114
- data2.force_encoding("UTF-8")
115
-
116
- assert_equal data1.size, data2.size
117
- end
118
-
119
- def test_memory_buf3 # kana
120
- d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000, true))
121
- data1 = d1.read.force_encoding("UTF-8")
122
- check_kana(data1)
123
-
124
- d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000, true))
125
- data2 = ""
126
- while !d2.eof? do
127
- buf = " "
128
- objid = buf.object_id
129
- data2 << d2.read(2048, buf)
130
- assert_equal objid, buf.object_id
131
- end
132
- data2.force_encoding("UTF-8")
133
-
134
- assert_equal data1.size, data2.size
135
- end
136
-
137
- def test_file_buf
138
- d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(10))
139
- data1 = d1.read.force_encoding("UTF-8")
140
- check_ascii(data1)
141
-
142
- d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(10))
143
- data2 = ""
144
- while !d2.eof? do
145
- buf = " "
146
- objid = buf.object_id
147
- data2 << d2.read(20, buf)
148
- assert_equal objid, buf.object_id
149
- end
150
- data2.force_encoding("UTF-8")
151
-
152
- assert_equal data1.size, data2.size
153
- end
154
-
155
- def test_file_buf2
156
- d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000))
157
- data1 = d1.read.force_encoding("UTF-8")
158
- check_ascii(data1)
159
-
160
- d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000))
161
- data2 = ""
162
- while !d2.eof? do
163
- buf = " "
164
- objid = buf.object_id
165
- data2 << d2.read(20480, buf)
166
- assert_equal objid, buf.object_id
167
- end
168
- data2.force_encoding("UTF-8")
169
-
170
- assert_equal data1.size, data2.size
171
- end
172
-
173
- def test_file_buf3 # kana
174
- d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000, true))
175
- data1 = d1.read.force_encoding("UTF-8")
176
- check_kana(data1)
177
-
178
- d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000, true))
179
- data2 = ""
180
- while !d2.eof? do
181
- buf = " "
182
- objid = buf.object_id
183
- data2 << d2.read(20480, buf)
184
- assert_equal objid, buf.object_id
185
- end
186
- data2.force_encoding("UTF-8")
187
-
188
- assert_equal data1.size, data2.size
189
- end
190
- end