fluent-plugin-bigquery 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 793b3fbd0189044538497bcef9dc244adf987d86
4
+ data.tar.gz: 115ff35e20bf3e3fe58e54978f7e659678e14a17
5
+ SHA512:
6
+ metadata.gz: d230732372df108fbcdf59aec5485f2837028531fb1cb9edf0582dfc31654a747bb35494e9de646067e295960709e778d9e69f41f80f1a8810e122b952c971e4
7
+ data.tar.gz: fc7dbd59a34a44f9f8f2aef3829fc352636c58a5fad3d8ddbf661368c912d8a2ba5d6f2080c66fc85c32f615916c3ad980f3e4e3a9b6e967bc8bfc1fc5712e2c
data/.gitignore ADDED
@@ -0,0 +1,18 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
18
+ script/
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in fluent-plugin-bigquery.gemspec
4
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,13 @@
1
+ Copyright (c) 2012- TAGOMORI Satoshi
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License");
4
+ you may not use this file except in compliance with the License.
5
+ You may obtain a copy of the License at
6
+
7
+ http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ Unless required by applicable law or agreed to in writing, software
10
+ distributed under the License is distributed on an "AS IS" BASIS,
11
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ See the License for the specific language governing permissions and
13
+ limitations under the License.
data/README.md ADDED
@@ -0,0 +1,140 @@
1
+ # fluent-plugin-bigquery
2
+
3
+ Fluentd output plugin to load/insert data into Google BigQuery.
4
+
5
+ * insert data over streaming inserts
6
+ * for continuous real-time insertions, under many limitations
7
+ * https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
8
+ * (NOT IMPLEMENTED) load data
9
+ * for data loading as batch jobs, for large amounts of data
10
+ * https://developers.google.com/bigquery/loading-data-into-bigquery
11
+
12
+ The current version of this plugin supports the Google API with Service Account Authentication only; OAuth is not supported.
13
+
14
+ ## Configuration
15
+
16
+ ### Streaming inserts
17
+
18
+ For service account authentication, generate a service account private key file and note the service account email, then upload the private key file to your server.
19
+
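+ Under the hood, the plugin authenticates roughly as below (a minimal sketch mirroring the `client` method in `lib/fluent/plugin/out_bigquery.rb`; the key path and service account email are placeholders):
+
+ ```ruby
+ require 'google/api_client'
+
+ # load the PKCS12 private key and build a JWT asserter for the BigQuery scope
+ key = Google::APIClient::PKCS12.load_key(
+   '/home/username/.keys/00000000000000000000000000000000-privatekey.p12',
+   'notasecret'
+ )
+ asserter = Google::APIClient::JWTAsserter.new(
+   'xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com',
+   'https://www.googleapis.com/auth/bigquery',
+   key
+ )
+
+ client = Google::APIClient.new(:application_name => 'Fluentd BigQuery plugin')
+ client.authorization = asserter.authorize
+ ```
+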
20
+ Configure the insert specification with the target table schema and your credentials. This is the minimum configuration:
21
+
22
+ ```apache
23
+ <match dummy>
24
+ type bigquery
25
+
26
+ method insert # default
27
+
28
+ email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
29
+ private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
30
+ # private_key_passphrase notasecret # default
31
+
32
+ project yourproject_id
33
+ dataset yourdataset_id
34
+ table tablename
35
+
36
+ time_format %s
37
+ time_field time
38
+
39
+ field_integer time,status,bytes
40
+ field_string rhost,vhost,path,method,protocol,agent,referer
41
+ field_float requestime
42
+ field_boolean bot_access,loginsession
43
+ </match>
44
+ ```
45
+
46
+ For high-rate streaming inserts, you should also tune the flush interval and buffer chunk options:
47
+
48
+ ```apache
49
+ <match dummy>
50
+ type bigquery
51
+
52
+ method insert # default
53
+
54
+ flush_interval 1 # flush as frequently as possible
55
+
56
+ buffer_chunk_records_limit 300 # default rate limit for users is 100
57
+ buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
58
+
59
+ num_threads 16
60
+
61
+ email xxxxxxxxxxxx-xxxxxxxxxxxxxxxxxxxxxx@developer.gserviceaccount.com
62
+ private_key_path /home/username/.keys/00000000000000000000000000000000-privatekey.p12
63
+ # private_key_passphrase notasecret # default
64
+
65
+ project yourproject_id
66
+ dataset yourdataset_id
67
+ tables accesslog1,accesslog2,accesslog3
68
+
69
+ time_format %s
70
+ time_field time
71
+
72
+ field_integer time,status,bytes
73
+ field_string rhost,vhost,path,method,protocol,agent,referer
74
+ field_float requestime
75
+ field_boolean bot_access,loginsession
76
+ </match>
77
+ ```
78
+
79
+ Important options for high-rate events are:
80
+
81
+ * `tables`
82
+ * two or more tables can be specified, separated by ','
83
+ * `out_bigquery` distributes inserts over these tables (table sharding)
84
+ * all of them must have the same schema
85
+ * `buffer_chunk_records_limit`
86
+ * the streaming inserts API limits insertions to 100 records per second, per table
87
+ * the default average rate limit is 100 rows/second, and the spike rate limit is 1,000 rows/second
88
+ * `out_bigquery` flushes 100 buffered records per insert API call
89
+ * `buffer_queue_limit`
90
+ * BigQuery streaming inserts need very small buffer chunks
91
+ * for high-rate events, `buffer_queue_limit` should be configured with a big number
92
+ * with the default configuration, up to about 1GB of memory may be used while the network is down:
93
+ * `buffer_chunk_limit (default 1MB)` x `buffer_queue_limit (default 1024)` (see the sketch after this list)
94
+ * `num_threads`
95
+ * threads for insert API calls made in parallel
96
+ * specify this option for 100 or more records per second
97
+ * 10 or more threads seem good for inserts over the internet
98
+ * fewer threads may be enough for Google Compute Engine instances (with low latency to BigQuery)
99
+ * `flush_interval`
100
+ * `1` is the lowest possible value without patches, on Fluentd v0.10.41 or earlier
101
+ * see `patches` below
102
+
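+ The worst-case buffer footprint is simply the chunk size multiplied by the queue length. A minimal sketch of that arithmetic, using this plugin's defaults (`buffer_chunk_limit 1000000`, `buffer_queue_limit 1024`) and the tuned values from the example above:
+
+ ```ruby
+ # worst-case bytes buffered = buffer_chunk_limit * buffer_queue_limit
+ default_bytes = 1_000_000 * 1024    # => 1_024_000_000 (about 1GB)
+ tuned_bytes   = 1_000_000 * 10_240  # => 10_240_000_000 (about 10GB)
+
+ puts "default: %.1f GB" % (default_bytes / 1000.0**3)
+ puts "tuned:   %.1f GB" % (tuned_bytes / 1000.0**3)
+ ```
+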
103
+ ### patches
104
+
105
+ This plugin depends on `fluent-plugin-buffer-lightening`, which includes a monkey patch module for the BufferedOutput plugin to realize high-rate and low-latency flushing. With this patch, sub-second flushing is available.
106
+
107
+ To use this feature, execute fluentd with the `-r fluent/plugin/output_try_flush_interval_patch` option,
108
+ and configure `flush_interval` and `try_flush_interval` with floating point values.
109
+
110
+ ```apache
111
+ <match dummy>
112
+ type bigquery
113
+
114
+ method insert # default
115
+
116
+ flush_interval 0.2
117
+ try_flush_interval 0.05
118
+
119
+ buffer_chunk_records_limit 300 # default rate limit for users is 100
120
+ buffer_queue_limit 10240 # 1MB * 10240 -> 10GB!
121
+
122
+ num_threads 16
123
+
124
+ # credentials, project/dataset/table and schema specs.
125
+ </match>
126
+ ```
127
+
128
+ With this configuration, flushing happens within 0.25 seconds (`flush_interval` 0.2 + `try_flush_interval` 0.05) of record input in the worst case.
129
+
130
+ ## TODO
131
+
132
+ * support Load API
133
+ * with automatically configured flush/buffer options
134
+ * support RECORD field
135
+ * and support optional data fields
136
+ * support NULLABLE/REQUIRED/REPEATED field options
137
+ * OAuth installed application credentials support
138
+ * Google API discovery expiration
139
+ * Error classes
140
+ * check row size limits
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env rake
2
+ require "bundler/gem_tasks"
3
+
4
+ require 'rake/testtask'
5
+ Rake::TestTask.new(:test) do |test|
6
+ test.libs << 'lib' << 'test'
7
+ test.pattern = 'test/**/test_*.rb'
8
+ test.verbose = true
9
+ end
10
+
11
+ task :default => :test
data/fluent-plugin-bigquery.gemspec ADDED
@@ -0,0 +1,29 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'fluent/plugin/bigquery/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "fluent-plugin-bigquery"
8
+ spec.version = Fluent::BigQueryPlugin::VERSION
9
+ spec.authors = ["TAGOMORI Satoshi"]
10
+ spec.email = ["tagomoris@gmail.com"]
11
+ spec.description = %q{Fluentd plugin to store data on Google BigQuery, by load, or by stream inserts}
12
+ spec.summary = %q{Fluentd plugin to store data on Google BigQuery}
13
+ spec.homepage = "https://github.com/tagomoris/fluent-plugin-bigquery"
14
+ spec.license = "APLv2"
15
+
16
+ spec.files = `git ls-files`.split($/)
17
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
18
+ spec.test_files = spec.files.grep(%r{^(test|spec|features)/})
19
+ spec.require_paths = ["lib"]
20
+
21
+ spec.add_development_dependency "rake"
22
+ spec.add_runtime_dependency "google-api-client", "~> 0.6.4"
23
+ spec.add_runtime_dependency "fluentd"
24
+ spec.add_runtime_dependency "fluent-mixin-plaintextformatter", '>= 0.2.1'
25
+ spec.add_runtime_dependency "fluent-mixin-config-placeholders", ">= 0.2.0"
26
+ spec.add_runtime_dependency "fluent-plugin-buffer-lightening"
27
+
28
+ spec.add_development_dependency "fluent-plugin-dummydata-producer"
29
+ end
data/lib/fluent/plugin/bigquery/load_request_body_wrapper.rb ADDED
@@ -0,0 +1,173 @@
1
+ module Fluent
2
+ module BigQueryPlugin
3
+ class LoadRequestBodyWrapper
4
+ # body can be an instance of IO (#rewind, #read, #to_str)
5
+ # http://rubydoc.info/github/google/google-api-ruby-client/Google/APIClient/Request#body-instance_method
6
+
7
+ # http://rubydoc.info/github/google/google-api-ruby-client/Google/APIClient#execute-instance_method
8
+ # (Google::APIClient::Method) api_method: The method object or the RPC name of the method being executed.
9
+ # (Hash, Array) parameters: The parameters to send to the method.
10
+ # (String) body: The body of the request.
11
+ # (Hash, Array) headers: The HTTP headers for the request.
12
+ # (Hash) options: A set of options for the request, of which:
13
+ # (#generate_authenticated_request) :authorization (default: true)
14
+ # - The authorization mechanism for the response. Used only if :authenticated is true.
15
+ # (TrueClass, FalseClass) :authenticated (default: true)
16
+ # - true if the request must be signed or somehow authenticated, false otherwise.
17
+ # (TrueClass, FalseClass) :gzip (default: true) - true if gzip enabled, false otherwise.
18
+
19
+ # https://developers.google.com/bigquery/loading-data-into-bigquery#loaddatapostrequest
20
+
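+ # Intended usage (a sketch only, and an assumption: the load API is not implemented yet):
+ #   body = LoadRequestBodyWrapper.new(project, dataset, table, field_defs, chunk)
+ #   client.execute(:api_method => bq.jobs.insert,
+ #                  :parameters => {'projectId' => project},
+ #                  :body       => body,
+ #                  :headers    => {'Content-Type' => 'multipart/related; boundary=xxx'})
+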
21
+ JSON_PRETTY_DUMP = JSON::State.new(space: " ", indent:" ", object_nl:"\n", array_nl:"\n")
22
+
23
+ CONTENT_TYPE_FIRST = "Content-Type: application/json; charset=UTF-8\n\n"
24
+ CONTENT_TYPE_SECOND = "Content-Type: application/octet-stream\n\n"
25
+
26
+ MULTIPART_BOUNDARY = "--xxx\n"
27
+ MULTIPART_BOUNDARY_END = "--xxx--\n"
28
+
29
+ def initialize(project_id, dataset_id, table_id, field_defs, buffer)
30
+ @metadata = {
31
+ configuration: {
32
+ load: {
33
+ sourceFormat: "<required for JSON files>",
34
+ schema: {
35
+ fields: field_defs
36
+ },
37
+ destinationTable: {
38
+ projectId: project_id,
39
+ datasetId: dataset_id,
40
+ tableId: table_id
41
+ }
42
+ }
43
+ }
44
+ }
45
+
46
+ @non_buffer = MULTIPART_BOUNDARY + CONTENT_TYPE_FIRST + @metadata.to_json(JSON_PRETTY_DUMP) + "\n" +
47
+ MULTIPART_BOUNDARY + CONTENT_TYPE_SECOND
48
+ @non_buffer.force_encoding("ASCII-8BIT")
49
+ @non_buffer_bytesize = @non_buffer.bytesize
50
+
51
+ @buffer = buffer # read
52
+ @buffer_bytesize = @buffer.size # Fluentd Buffer Chunk #size -> bytesize
53
+
54
+ @footer = MULTIPART_BOUNDARY_END.force_encoding("ASCII-8BIT")
55
+
56
+ @contents_bytesize = @non_buffer_bytesize + @buffer_bytesize
57
+ @total_bytesize = @contents_bytesize + MULTIPART_BOUNDARY_END.bytesize
58
+
59
+ @whole_data = nil
60
+
61
+ @counter = 0
62
+ @eof = false
63
+ end
64
+
65
+ # sample_body = <<EOF
66
+ # --xxx
67
+ # Content-Type: application/json; charset=UTF-8
68
+ #
69
+ # {
70
+ # "configuration": {
71
+ # "load": {
72
+ # "sourceFormat": "<required for JSON files>",
73
+ # "schema": {
74
+ # "fields": [
75
+ # {"name":"f1", "type":"STRING"},
76
+ # {"name":"f2", "type":"INTEGER"}
77
+ # ]
78
+ # },
79
+ # "destinationTable": {
80
+ # "projectId": "projectId",
81
+ # "datasetId": "datasetId",
82
+ # "tableId": "tableId"
83
+ # }
84
+ # }
85
+ # }
86
+ # }
87
+ # --xxx
88
+ # Content-Type: application/octet-stream
89
+ #
90
+ # <your data>
91
+ # --xxx--
92
+ # EOF
93
+ def rewind
94
+ @counter = 0
95
+ @eof = false
96
+ end
97
+
98
+ def eof?
99
+ @eof
100
+ end
101
+
102
+ def to_str
103
+ rewind
104
+ self.read # all data
105
+ end
106
+
107
+ def read(length=nil, outbuf="")
108
+ raise ArgumentError, "negative read length" if length && length < 0
109
+ return (length.nil? || length == 0) ? "" : nil if @eof
110
+ return outbuf if length == 0
111
+
112
+ # read all data
113
+ if length.nil? || length >= @total_bytesize
114
+ @whole_data ||= @buffer.read.force_encoding("ASCII-8BIT")
115
+
116
+ if @counter.zero?
117
+ outbuf.replace(@non_buffer)
118
+ outbuf << @whole_data
119
+ outbuf << @footer
120
+ elsif @counter < @non_buffer_bytesize
121
+ outbuf.replace(@non_buffer[ @counter .. -1 ])
122
+ outbuf << @whole_data
123
+ outbuf << @footer
124
+ elsif @counter < @contents_bytesize
125
+ outbuf.replace(@whole_data[ (@counter - @non_buffer_bytesize) .. -1 ])
126
+ outbuf << @footer
127
+ else
128
+ outbuf.replace(@footer[ (@counter - @contents_bytesize) .. -1 ])
129
+ end
130
+ @counter = @total_bytesize
131
+ @eof = true
132
+ return outbuf
133
+ end
134
+
135
+ # At the Ruby script level (not a C extension), we cannot prevent callers from changing outbuf's length or re-assigning the object
136
+ outbuf.replace("")
137
+
138
+ # return first part (metadata)
139
+ if @counter < @non_buffer_bytesize
140
+ non_buffer_part = @non_buffer[@counter, length]
141
+ if non_buffer_part
142
+ outbuf << non_buffer_part
143
+ length -= non_buffer_part.bytesize
144
+ @counter += non_buffer_part.bytesize
145
+ end
146
+ end
147
+ return outbuf if length < 1
148
+
149
+ # return second part (buffer content)
150
+ if @counter < @contents_bytesize
151
+ @whole_data ||= @buffer.read.force_encoding("ASCII-8BIT")
152
+ buffer_part = @whole_data[@counter - @non_buffer_bytesize, length]
153
+ if buffer_part
154
+ outbuf << buffer_part
155
+ length -= buffer_part.bytesize
156
+ @counter += buffer_part.bytesize
157
+ end
158
+ end
159
+ return outbuf if length < 1
160
+
161
+ # return footer
162
+ footer_part = @footer[@counter - @contents_bytesize, length]
163
+ if footer_part
164
+ outbuf << footer_part
165
+ @counter += footer_part.bytesize
166
+ @eof = true if @counter >= @total_bytesize
167
+ end
168
+
169
+ outbuf
170
+ end
171
+ end
172
+ end
173
+ end
data/lib/fluent/plugin/bigquery/version.rb ADDED
@@ -0,0 +1,6 @@
1
+ module Fluent
2
+ module BigQueryPlugin
3
+ VERSION = "0.0.1"
4
+ end
5
+ end
6
+
data/lib/fluent/plugin/out_bigquery.rb ADDED
@@ -0,0 +1,296 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ require 'fluent/plugin/bigquery/version'
4
+
5
+ require 'fluent/mixin/config_placeholders'
6
+ require 'fluent/mixin/plaintextformatter'
7
+
8
+ ## TODO: load implementation
9
+ # require 'fluent/plugin/bigquery/load_request_body_wrapper'
10
+
11
+ require 'fluent/plugin/output_try_flush_interval_patch'
12
+
13
+ module Fluent
14
+ ### TODO: error classes for each api error responses
15
+ # class BigQueryAPIError < StandardError
16
+ # end
17
+
18
+ class BigQueryOutput < BufferedOutput
19
+ Fluent::Plugin.register_output('bigquery', self)
20
+
21
+ # https://developers.google.com/bigquery/browser-tool-quickstart
22
+ # https://developers.google.com/bigquery/bigquery-api-quickstart
23
+
24
+ config_set_default :buffer_type, 'lightening'
25
+
26
+ config_set_default :flush_interval, 0.25
27
+ config_set_default :try_flush_interval, 0.05
28
+
29
+ config_set_default :buffer_chunk_records_limit, 100
30
+ config_set_default :buffer_chunk_limit, 1000000
31
+ config_set_default :buffer_queue_limit, 1024
32
+
33
+ ### for loads
34
+ ### TODO: different default values for buffering between 'load' and insert
35
+ # config_set_default :flush_interval, 1800 # 30min => 48 imports/day
36
+ # config_set_default :buffer_chunk_limit, 1000**4 # 1.0*10^12 < 1TB (1024^4)
37
+
38
+ ### OAuth credential
39
+ # config_param :client_id, :string
40
+ # config_param :client_secret, :string
41
+
42
+ ### Service Account credential
43
+ config_param :email, :string
44
+ config_param :private_key_path, :string
45
+ config_param :private_key_passphrase, :string, :default => 'notasecret'
46
+
47
+ # see as simple reference
48
+ # https://github.com/abronte/BigQuery/blob/master/lib/bigquery.rb
49
+ config_param :project, :string
50
+
51
+ # dataset_name
52
+ # The name can be up to 1,024 characters long, and consist of A-Z, a-z, 0-9, and the underscore,
53
+ # but it cannot start with a number or underscore, or have spaces.
54
+ config_param :dataset, :string
55
+
56
+ # table_id
57
+ # In Table ID, enter a name for your new table. Naming rules are the same as for your dataset.
58
+ config_param :table, :string, :default => nil
59
+ config_param :tables, :string, :default => nil
60
+
61
+ config_param :field_string, :string, :default => nil
62
+ config_param :field_integer, :string, :default => nil
63
+ config_param :field_float, :string, :default => nil
64
+ config_param :field_boolean, :string, :default => nil
65
+ ### TODO: record field stream inserts don't work well?
66
+ ### At table creation, table type json + field type record -> field type validation fails
67
+ ### At streaming inserts, schema cannot be specified
68
+ # config_param :field_record, :string, :default => nil
69
+ # config_param :optional_data_field, :string, :default => nil
70
+
71
+ config_param :time_format, :string, :default => nil
72
+ config_param :localtime, :bool, :default => nil
73
+ config_param :utc, :bool, :default => nil
74
+ config_param :time_field, :string, :default => nil
75
+
76
+ config_param :method, :string, :default => 'insert' # or 'load' # TODO: not implemented now
77
+
78
+ config_param :load_size_limit, :integer, :default => 1000**4 # < 1TB (1024^4) # TODO: not implemented now
79
+ ### method: 'load'
80
+ # https://developers.google.com/bigquery/loading-data-into-bigquery
81
+ # Maximum File Sizes:
82
+ # File Type Compressed Uncompressed
83
+ # CSV 1 GB With new-lines in strings: 4 GB
84
+ # Without new-lines in strings: 1 TB
85
+ # JSON 1 GB 1 TB
86
+
87
+ config_param :row_size_limit, :integer, :default => 100*1000 # < 100KB # configurable in google ?
88
+ # config_param :insert_size_limit, :integer, :default => 1000**2 # < 1MB
89
+ # config_param :rows_per_second_limit, :integer, :default => 1000 # spike limit
90
+ ### method: 'insert' (streaming data inserts support)
91
+ # https://developers.google.com/bigquery/streaming-data-into-bigquery#usecases
92
+ # Maximum row size: 100 KB
93
+ # Maximum data size of all rows, per insert: 1 MB
94
+ # Maximum rows per second: 100 rows per second, per table, with allowed and occasional bursts of up to 1,000 rows per second.
95
+ # If you exceed 100 rows per second for an extended period of time, throttling might occur.
96
+ ### Toooooooooooooo short/small per inserts and row!
97
+
98
+ ### Table types
99
+ # https://developers.google.com/bigquery/docs/tables
100
+ #
101
+ # type - The following data types are supported; see Data Formats for details on each data type:
102
+ # STRING
103
+ # INTEGER
104
+ # FLOAT
105
+ # BOOLEAN
106
+ # RECORD A JSON object, used when importing nested records. This type is only available when using JSON source files.
107
+ #
108
+ # mode - Whether a field can be null. The following values are supported:
109
+ # NULLABLE - The cell can be null.
110
+ # REQUIRED - The cell cannot be null.
111
+ # REPEATED - Zero or more repeated simple or nested subfields. This mode is only supported when using JSON source files.
112
+
113
+ def initialize
114
+ super
115
+ require 'google/api_client'
116
+ require 'google/api_client/client_secrets'
117
+ require 'google/api_client/auth/installed_app'
118
+ end
119
+
120
+ def configure(conf)
121
+ super
122
+
123
+ if (!@table && !@tables) || (@table && @tables)
124
+ raise Fluent::ConfigError, "'table' or 'tables' must be specified, but not both"
125
+ end
126
+
127
+ @tablelist = @tables ? @tables.split(',') : [@table]
128
+
129
+ @fields = {}
130
+ if @field_string
131
+ @field_string.split(',').each do |fieldname|
132
+ @fields[fieldname] = :string
133
+ end
134
+ end
135
+ if @field_integer
136
+ @field_integer.split(',').each do |fieldname|
137
+ @fields[fieldname] = :integer
138
+ end
139
+ end
140
+ if @field_float
141
+ @field_float.split(',').each do |fieldname|
142
+ @fields[fieldname] = :float
143
+ end
144
+ end
145
+ if @field_boolean
146
+ @field_boolean.split(',').each do |fieldname|
147
+ @fields[fieldname] = :boolean
148
+ end
149
+ end
150
+
151
+ if @localtime.nil?
152
+ if @utc
153
+ @localtime = false
154
+ end
155
+ end
156
+ @timef = TimeFormatter.new(@time_format, @localtime)
157
+ end
158
+
159
+ def start
160
+ super
161
+
162
+ @bq = client.discovered_api("bigquery", "v2") # TODO: refresh with specified expiration
163
+ @cached_client = nil
164
+ @cached_client_expiration = nil
165
+
166
+ @tables_queue = @tablelist.dup.shuffle
167
+ @tables_mutex = Mutex.new
168
+ end
169
+
170
+ def shutdown
171
+ super
172
+ # nothing to do
173
+ end
174
+
175
+ def client
176
+ return @cached_client if @cached_client && @cached_client_expiration > Time.now
177
+
178
+ client = Google::APIClient.new(
179
+ :application_name => 'Fluentd BigQuery plugin',
180
+ :application_version => Fluent::BigQueryPlugin::VERSION
181
+ )
182
+
183
+ key = Google::APIClient::PKCS12.load_key( @private_key_path, @private_key_passphrase )
184
+ asserter = Google::APIClient::JWTAsserter.new(
185
+ @email,
186
+ "https://www.googleapis.com/auth/bigquery",
187
+ key
188
+ )
189
+ # refresh_auth
190
+ client.authorization = asserter.authorize
191
+ @cached_client_expiration = Time.now + 1800
192
+ @cached_client = client
193
+ end
194
+
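+ # rows: an Array of {"json" => record_hash} objects, as produced by #format_stream
+ # (the request body format expected by the tabledata.insertAll API)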
195
+ def insert(table_id, rows)
196
+ res = client().execute(
197
+ :api_method => @bq.tabledata.insert_all,
198
+ :parameters => {
199
+ 'projectId' => @project,
200
+ 'datasetId' => @dataset,
201
+ 'tableId' => table_id,
202
+ },
203
+ :body_object => {
204
+ "rows" => rows
205
+ }
206
+ )
207
+ if res.status != 200
208
+ # api_error? -> client cache clear
209
+ @cached_client = nil
210
+
211
+ message = res.body
212
+ if res.body =~ /^\{/
213
+ begin
214
+ res_obj = JSON.parse(res.body)
215
+ message = res_obj['error']['message'] || res.body
216
+ rescue => e
217
+ $log.warn "Parse error: google api error response body", :body => res.body
218
+ end
219
+ end
220
+ $log.error "tabledata.insertAll API", :project_id => @project_id, :dataset => @dataset_id, :table => table_id, :code => res.status, :message => message
221
+ raise "failed to insert into bigquery" # TODO: error class
222
+ end
223
+ end
224
+
225
+ def load
226
+ # https://developers.google.com/bigquery/loading-data-into-bigquery#loaddatapostrequest
227
+ raise NotImplementedError # TODO
228
+ end
229
+
230
+ def format_record(record)
231
+ out = {}
232
+ @fields.each do |key, type|
233
+ value = record[key]
234
+ next if value.nil? # field does not exist, or has a null value
235
+ out[key] = case type
236
+ when :string then record[key].to_s
237
+ when :integer then record[key].to_i
238
+ when :float then record[key].to_f
239
+ when :boolean then !!record[key]
240
+ # when :record
241
+ else
242
+ raise "BUG: unknown field type #{type}"
243
+ end
244
+ end
245
+ out
246
+ end
247
+
248
+ def format_stream(tag, es)
249
+ super
250
+ buf = ''
251
+ es.each do |time, record|
252
+ row = if @time_field
253
+ format_record(record.merge({@time_field => @timef.format(time)}))
254
+ else
255
+ format_record(record)
256
+ end
257
+ buf << {"json" => row}.to_msgpack unless row.empty?
258
+ end
259
+ buf
260
+ end
261
+
262
+ def write(chunk)
263
+ rows = []
264
+ chunk.msgpack_each do |row_object|
265
+ # TODO: row size limit
266
+ rows << row_object
267
+ end
268
+
269
+ # TODO: method
270
+
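+ # pick the next target table in round-robin order (table sharding across `tables`)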
271
+ insert_table = @tables_mutex.synchronize do
272
+ t = @tables_queue.shift
273
+ @tables_queue.push t
274
+ t
275
+ end
276
+ insert(insert_table, rows)
277
+ end
278
+
279
+ # def client_oauth # not implemented
280
+ # raise NotImplementedError, "OAuth needs browser authentication..."
281
+ #
282
+ # client = Google::APIClient.new(
283
+ # :application_name => 'Example Ruby application',
284
+ # :application_version => '1.0.0'
285
+ # )
286
+ # bigquery = client.discovered_api('bigquery', 'v2')
287
+ # flow = Google::APIClient::InstalledAppFlow.new(
288
+ # :client_id => @client_id
289
+ # :client_secret => @client_secret
290
+ # :scope => ['https://www.googleapis.com/auth/bigquery']
291
+ # )
292
+ # client.authorization = flow.authorize # browser authentication !
293
+ # client
294
+ # end
295
+ end
296
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,33 @@
1
+ require 'rubygems'
2
+ require 'bundler'
3
+ begin
4
+ Bundler.setup(:default, :development)
5
+ rescue Bundler::BundlerError => e
6
+ $stderr.puts e.message
7
+ $stderr.puts "Run `bundle install` to install missing gems"
8
+ exit e.status_code
9
+ end
10
+ require 'test/unit'
11
+
12
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
13
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
14
+ require 'fluent/test'
15
+ unless ENV.has_key?('VERBOSE')
16
+ nulllogger = Object.new
17
+ nulllogger.instance_eval {|obj|
18
+ def method_missing(method, *args)
19
+ # pass
20
+ end
21
+ }
22
+ $log = nulllogger
23
+ end
24
+
25
+ require 'fluent/buffer'
26
+ require 'fluent/plugin/buf_memory'
27
+ require 'fluent/plugin/buf_file'
28
+
29
+ require 'fluent/plugin/out_bigquery'
30
+ require 'fluent/plugin/bigquery/load_request_body_wrapper'
31
+
32
+ class Test::Unit::TestCase
33
+ end
data/test/test_load_request_body_wrapper.rb ADDED
@@ -0,0 +1,190 @@
1
+ # -*- coding: utf-8 -*-
2
+ require 'helper'
3
+ require 'json'
4
+ require 'tempfile'
5
+
6
+ class LoadRequestBodyWrapperTest < Test::Unit::TestCase
7
+ def content_alphabet(repeat)
8
+ (0...repeat).map{|i| "#{i}0123456789\n" }.join
9
+ end
10
+
11
+ def content_kana(repeat)
12
+ (0...repeat).map{|i| "#{i}あいうえおかきくけこ\n" }.join
13
+ end
14
+
15
+ def mem_chunk(repeat=10, kana=false)
16
+ content = kana ? content_kana(repeat) : content_alphabet(repeat)
17
+ Fluent::MemoryBufferChunk.new('bc_mem', content)
18
+ end
19
+
20
+ def file_chunk(repeat=10, kana=false)
21
+ content = kana ? content_kana(repeat) : content_alphabet(repeat)
22
+ tmpfile = Tempfile.new('fluent_bigquery_plugin_test')
23
+ buf = Fluent::FileBufferChunk.new('bc_mem', tmpfile.path, tmpfile.object_id)
24
+ buf << content
25
+ buf
26
+ end
27
+
28
+ def field_defs
29
+ [{"name" => "field1", "type" => "STRING"}, {"name" => "field2", "type" => "INTEGER"}]
30
+ end
31
+
32
+ def check_meta(blank, first, last)
33
+ assert_equal "", blank
34
+
35
+ header1, body1 = first.split("\n\n")
36
+ assert_equal "Content-Type: application/json; charset=UTF-8", header1
37
+ metadata = JSON.parse(body1)
38
+ assert_equal "<required for JSON files>", metadata["configuration"]["load"]["sourceFormat"]
39
+ assert_equal "field1", metadata["configuration"]["load"]["schema"]["fields"][0]["name"]
40
+ assert_equal "STRING", metadata["configuration"]["load"]["schema"]["fields"][0]["type"]
41
+ assert_equal "field2", metadata["configuration"]["load"]["schema"]["fields"][1]["name"]
42
+ assert_equal "INTEGER", metadata["configuration"]["load"]["schema"]["fields"][1]["type"]
43
+ assert_equal "pname1", metadata["configuration"]["load"]["destinationTable"]["projectId"]
44
+ assert_equal "dname1", metadata["configuration"]["load"]["destinationTable"]["datasetId"]
45
+ assert_equal "tname1", metadata["configuration"]["load"]["destinationTable"]["tableId"]
46
+
47
+ assert_equal "--\n", last
48
+ end
49
+
50
+ def check_ascii(data)
51
+ blank, first, second, last = data.split(/--xxx\n?/)
52
+
53
+ check_meta(blank, first, last)
54
+
55
+ header2, body2 = second.split("\n\n")
56
+ assert_equal "Content-Type: application/octet-stream", header2
57
+ i = 0
58
+ body2.each_line do |line|
59
+ assert_equal "#{i}0123456789\n", line
60
+ i += 1
61
+ end
62
+ end
63
+
64
+ def check_kana(data)
65
+ blank, first, second, last = data.split(/--xxx\n?/)
66
+
67
+ check_meta(blank, first, last)
68
+
69
+ header2, body2 = second.split("\n\n")
70
+ assert_equal "Content-Type: application/octet-stream", header2
71
+ i = 0
72
+ body2.each_line do |line|
73
+ assert_equal "#{i}あいうえおかきくけこ\n", line
74
+ i += 1
75
+ end
76
+ end
77
+
78
+ def setup
79
+ @klass = Fluent::BigQueryPlugin::LoadRequestBodyWrapper
80
+ self
81
+ end
82
+
83
+ def test_memory_buf
84
+ d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(10))
85
+ data1 = d1.read.force_encoding("UTF-8")
86
+ check_ascii(data1)
87
+
88
+ d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(10))
89
+ data2 = ""
90
+ while !d2.eof? do
91
+ buf = " "
92
+ objid = buf.object_id
93
+ data2 << d2.read(20, buf)
94
+ assert_equal objid, buf.object_id
95
+ end
96
+ data2.force_encoding("UTF-8")
97
+
98
+ assert_equal data1.size, data2.size
99
+ end
100
+
101
+ def test_memory_buf2
102
+ d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000))
103
+ data1 = d1.read.force_encoding("UTF-8")
104
+ check_ascii(data1)
105
+
106
+ d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000))
107
+ data2 = ""
108
+ while !d2.eof? do
109
+ buf = " "
110
+ objid = buf.object_id
111
+ data2 << d2.read(2048, buf)
112
+ assert_equal objid, buf.object_id
113
+ end
114
+ data2.force_encoding("UTF-8")
115
+
116
+ assert_equal data1.size, data2.size
117
+ end
118
+
119
+ def test_memory_buf3 # kana
120
+ d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000, true))
121
+ data1 = d1.read.force_encoding("UTF-8")
122
+ check_kana(data1)
123
+
124
+ d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), mem_chunk(100000, true))
125
+ data2 = ""
126
+ while !d2.eof? do
127
+ buf = " "
128
+ objid = buf.object_id
129
+ data2 << d2.read(2048, buf)
130
+ assert_equal objid, buf.object_id
131
+ end
132
+ data2.force_encoding("UTF-8")
133
+
134
+ assert_equal data1.size, data2.size
135
+ end
136
+
137
+ def test_file_buf
138
+ d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(10))
139
+ data1 = d1.read.force_encoding("UTF-8")
140
+ check_ascii(data1)
141
+
142
+ d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(10))
143
+ data2 = ""
144
+ while !d2.eof? do
145
+ buf = " "
146
+ objid = buf.object_id
147
+ data2 << d2.read(20, buf)
148
+ assert_equal objid, buf.object_id
149
+ end
150
+ data2.force_encoding("UTF-8")
151
+
152
+ assert_equal data1.size, data2.size
153
+ end
154
+
155
+ def test_file_buf2
156
+ d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000))
157
+ data1 = d1.read.force_encoding("UTF-8")
158
+ check_ascii(data1)
159
+
160
+ d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000))
161
+ data2 = ""
162
+ while !d2.eof? do
163
+ buf = " "
164
+ objid = buf.object_id
165
+ data2 << d2.read(20480, buf)
166
+ assert_equal objid, buf.object_id
167
+ end
168
+ data2.force_encoding("UTF-8")
169
+
170
+ assert_equal data1.size, data2.size
171
+ end
172
+
173
+ def test_file_buf3 # kana
174
+ d1 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000, true))
175
+ data1 = d1.read.force_encoding("UTF-8")
176
+ check_kana(data1)
177
+
178
+ d2 = @klass.new('pname1', 'dname1', 'tname1', field_defs(), file_chunk(100000, true))
179
+ data2 = ""
180
+ while !d2.eof? do
181
+ buf = " "
182
+ objid = buf.object_id
183
+ data2 << d2.read(20480, buf)
184
+ assert_equal objid, buf.object_id
185
+ end
186
+ data2.force_encoding("UTF-8")
187
+
188
+ assert_equal data1.size, data2.size
189
+ end
190
+ end
metadata ADDED
@@ -0,0 +1,157 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: fluent-plugin-bigquery
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - TAGOMORI Satoshi
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-12-23 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rake
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - '>='
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - '>='
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: google-api-client
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ~>
32
+ - !ruby/object:Gem::Version
33
+ version: 0.6.4
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ~>
39
+ - !ruby/object:Gem::Version
40
+ version: 0.6.4
41
+ - !ruby/object:Gem::Dependency
42
+ name: fluentd
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - '>='
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: fluent-mixin-plaintextformatter
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - '>='
60
+ - !ruby/object:Gem::Version
61
+ version: 0.2.1
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - '>='
67
+ - !ruby/object:Gem::Version
68
+ version: 0.2.1
69
+ - !ruby/object:Gem::Dependency
70
+ name: fluent-mixin-config-placeholders
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: 0.2.0
76
+ type: :runtime
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - '>='
81
+ - !ruby/object:Gem::Version
82
+ version: 0.2.0
83
+ - !ruby/object:Gem::Dependency
84
+ name: fluent-plugin-buffer-lightening
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - '>='
88
+ - !ruby/object:Gem::Version
89
+ version: '0'
90
+ type: :runtime
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - '>='
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: fluent-plugin-dummydata-producer
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - '>='
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - '>='
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Fluentd plugin to store data on Google BigQuery, by load, or by stream
112
+ inserts
113
+ email:
114
+ - tagomoris@gmail.com
115
+ executables: []
116
+ extensions: []
117
+ extra_rdoc_files: []
118
+ files:
119
+ - .gitignore
120
+ - Gemfile
121
+ - LICENSE.txt
122
+ - README.md
123
+ - Rakefile
124
+ - fluent-plugin-bigquery.gemspec
125
+ - lib/fluent/plugin/bigquery/load_request_body_wrapper.rb
126
+ - lib/fluent/plugin/bigquery/version.rb
127
+ - lib/fluent/plugin/out_bigquery.rb
128
+ - test/helper.rb
129
+ - test/test_load_request_body_wrapper.rb
130
+ homepage: https://github.com/tagomoris/fluent-plugin-bigquery
131
+ licenses:
132
+ - APLv2
133
+ metadata: {}
134
+ post_install_message:
135
+ rdoc_options: []
136
+ require_paths:
137
+ - lib
138
+ required_ruby_version: !ruby/object:Gem::Requirement
139
+ requirements:
140
+ - - '>='
141
+ - !ruby/object:Gem::Version
142
+ version: '0'
143
+ required_rubygems_version: !ruby/object:Gem::Requirement
144
+ requirements:
145
+ - - '>='
146
+ - !ruby/object:Gem::Version
147
+ version: '0'
148
+ requirements: []
149
+ rubyforge_project:
150
+ rubygems_version: 2.0.3
151
+ signing_key:
152
+ specification_version: 4
153
+ summary: Fluentd plugin to store data on Google BigQuery
154
+ test_files:
155
+ - test/helper.rb
156
+ - test/test_load_request_body_wrapper.rb
157
+ has_rdoc: