triglav-agent-bigquery 1.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b32f5948041f7c59be88aeaa8964818c2bf7cc9d
4
+ data.tar.gz: b3f3a1c2eec2353ab8c49c0ab967894b6b2ce1b5
5
+ SHA512:
6
+ metadata.gz: 31b0ad8eb808a81da42f62081f11e4d5ce78dc07ab9056d0427a98f48725d735e1fbf5ee0f358dd1d64f1588f76c3f96866dbf019c1889731c84ba047bd92804
7
+ data.tar.gz: b6f5be6197a4d92cbe089956049c5102754d3e3ef3f8cb6791bfa4e3ba555561f840ce518a998df2516d1e8dd0f808a101945c20f0ca725bbf42f6f11b6c0160
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .env
11
+ /status.yml
12
+ /token.yml
13
+ /config.yml
14
+ .ruby-version
15
+ *.json
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.0
4
+ - 2.4.0
5
+ before_install:
6
+ - gem install bundler -v 1.11.2
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing other's private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer at sonots@gmail.com. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+ gem 'triglav_client', git: 'https://github.com/triglav-dataflow/triglav-client-ruby'
5
+ gem 'triglav-agent', git: 'https://github.com/triglav-dataflow/triglav-agent-framework-ruby'
6
+ gem 'pry-byebug'
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Triglav Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,135 @@
1
+ # Triglav::Agent::Bigquery
2
+
3
+ Triglav Agent for BigQuery
4
+
5
+ ## Requirements
6
+
7
+ * Ruby >= 2.3.0
8
+
9
+ ## Prerequisites
10
+
11
+ * BigQuery view is not supported
12
+
13
+ ## Installation
14
+
15
+ Add this line to your application's Gemfile:
16
+
17
+ ```ruby
18
+ gem 'triglav-agent-bigquery'
19
+ ```
20
+
21
+ And then execute:
22
+
23
+ $ bundle
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install triglav-agent-bigquery
28
+
29
+ ## CLI
30
+
31
+ ```
32
+ Usage: triglav-agent-bigquery [options]
33
+ -c, --config VALUE Config file (default: config.yml)
34
+ -s, --status VALUE Status storage file (default: status.yml)
35
+ -t, --token VALUE Triglav access token storage file (default: token.yml)
36
+ --dotenv Load environment variables from .env file (default: false)
37
+ -h, --help help
38
+ --log VALUE Log path (default: STDOUT)
39
+ --log-level VALUE Log level (default: info)
40
+ ```
41
+
42
+ Run as:
43
+
44
+ ```
45
+ TRIGLAV_ENV=development bundle exec triglav-agent-bigquery --dotenv -c config.yml
46
+ ```
47
+
48
+ ## Configuration
49
+
50
+ Prepare config.yml as [example/config.yml](./example/config.yml).
51
+
52
+ You can use erb template. You may load environment variables from .env file with `--dotenv` option as an [example/example.env](./example/example.env) file shows.
53
+
54
+ ### serverengine section
55
+
56
+ You can specify any [serverengine](https://github.com/fluent/serverengine) options at this section
57
+
58
+ ### triglav section
59
+
60
+ Specify triglav api url, and a credential to authenticate.
61
+
62
+ The access token obtained is stored into a token storage file (--token option).
63
+
64
+ ### bigquery section
65
+
66
+ This section is the special section for triglav-agent-bigquery.
67
+
68
+ * **monitor_interval**: The interval to watch tables (number, default: 60)
69
+ * **connection_info**: key-value pairs of bigquery connection info where keys are resource URI patterns written as regular expressions, and values are connection information
70
+ * **auth_method**: Authentication method. Must be one of `service_account`, `authorized_user` (for oauth2), `compute_engine`, and `application_default`. Default obtains from credentials.
71
+ * **credentials_file**: Credentials file path such as service account json.
72
+ * **credentials**: Instead of `credentials_file`, you may pass json contents as a string
73
+
74
+ ### Specification of Resource URI
75
+
76
+ Resource URI must be a form of:
77
+
78
+ ```
79
+ https://bigquery.cloud.google.com/table/#{project}:#{dataset}.#{table}
80
+ ```
81
+
82
+ `#{table}` also accepts strftime formatted suffix such as
83
+
84
+ ```
85
+ #{table}_%Y%m%d
86
+ ```
87
+
88
+ and strftime formatted partition decorator for a partitioned table such as
89
+
90
+ ```
91
+ #{table}$%Y%m%d
92
+ ```
93
+
94
+ ## How it behaves
95
+
96
+ 1. Authenticate with triglav
97
+ * Store the access token into the token storage file
98
+ * Read the token from the token storage file next time
99
+ * Refresh the access token if it is expired
100
+ 2. Repeat the following every `monitor_interval` seconds:
101
+ 3. Obtain resource (table) lists of the specified prefix (keys of connection_info) from triglav.
102
+ 4. Connect to bigquery with an appropriate connection info for a resource uri, and find tables which are newer than last check.
103
+ 5. Store checking information into the status storage file for the next time check.
104
+
105
+ ## Development
106
+
107
+ ### Prepare
108
+
109
+ ```
110
+ ./prepare.sh
111
+ ```
112
+
113
+ Edit `.env` or `config.yml` file directly.
114
+
115
+ ### Start
116
+
117
+ Start up triglav api on localhost.
118
+
119
+ Run triglav-agent-bigquery as:
120
+
121
+ ```
122
+ TRIGLAV_ENV=development bundle exec triglav-agent-bigquery --dotenv --debug -c example/config.yml
123
+ ```
124
+
125
+ The debug mode with --debug option ignores the `last_modified_time` value in status file.
126
+
127
+ ## Contributing
128
+
129
+ Bug reports and pull requests are welcome on GitHub at https://github.com/triglav-dataflow/triglav-agent-bigquery. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
130
+
131
+
132
+ ## License
133
+
134
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
135
+
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rake/testtask'
4
+ desc 'Run test_unit based test'
5
+ Rake::TestTask.new(:test) do |t|
6
+ t.libs << "test"
7
+ t.test_files = Dir["test/**/test_*.rb"].sort
8
+ t.verbose = false
9
+ t.warning = false
10
+ end
11
+ task :default => :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "triglav/agent/bigquery"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,37 @@
1
+ defaults: &defaults
2
+ serverengine:
3
+ log: 'STDOUT'
4
+ log_level: 'debug'
5
+ log_rotate_age: 5
6
+ log_rotate_size: 10485760
7
+ triglav:
8
+ url: <%= ENV['TRIGLAV_URL'] || 'http://localhost:7800' %>
9
+ credential:
10
+ username: <%= ENV['TRIGLAV_USERNAME'] || 'triglav_test' %>
11
+ password: <%= ENV['TRIGLAV_PASSWORD'] || 'triglav_test' %>
12
+ authenticator: local
13
+ timeout: 60
14
+ debugging: false
15
+ retries: 3
16
+ retry_interval: 3 # sec
17
+ bigquery:
18
+ monitor_interval: 5
19
+ retries: 5
20
+ timeout_sec: 300
21
+ open_timeout_sec: 300
22
+ connection_info:
23
+ "https://bigquery.cloud.google.com/table/<%= ENV['GOOGLE_PROJECT'] || 'your-project' %>":
24
+ # auth_method: # service_account, authorized_user, compute_engine, or application_default. default: get type from credentials
25
+ credentials_file: ~/.config/gcloud/application_default_credentials.json
26
+ # credentials: |
27
+ # {
28
+ # "private_key_id": "123456789",
29
+ # "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
30
+ # "client_email": "..."
31
+ # }
32
+
33
+ development:
34
+ <<: *defaults
35
+
36
+ test:
37
+ <<: *defaults
@@ -0,0 +1,8 @@
1
+ TRIGLAV_URL=http://localhost:7800
2
+ TRIGLAV_USERNAME=triglav_test
3
+ TRIGLAV_PASSWORD=triglav_test
4
+ VERTICA_HOST=xxx.xxx.xxx.xxx
5
+ VERTICA_PORT=5433
6
+ VERTICA_DATABASE=vdb
7
+ VERTICA_USER=dbread
8
+ VERTICA_PASSWORD=daerbd
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'triglav/agent/bigquery'
4
+ Triglav::Agent::Configuration.configure do |config|
5
+ config.name = :bigquery
6
+ # config.cli_class = Triglav::Agent::Bigquery::CLI
7
+ # config.setting_class = Triglav::Agent::Bigquery::Setting
8
+ # config.worker_module = Triglav::Agent::Bigquery::Worker
9
+ # config.processor_class = Triglav::Agent::Bigquery::Processor
10
+ config.monitor_class = Triglav::Agent::Bigquery::Monitor
11
+ config.connection_class = Triglav::Agent::Bigquery::Connection
12
+ end
13
+ Triglav::Agent::Configuration.cli_class.new.run
@@ -0,0 +1,279 @@
1
+ require 'triglav/agent/base/connection'
2
+ require 'google/apis/bigquery_v2'
3
+ require 'google/api_client/auth/key_utils'
4
+ require 'securerandom'
5
+ require 'ini_file'
6
+
7
+ # monkey patch not to create representable objects which consumes lots of memory
8
+ # @see http://qiita.com/sonots/items/1271f3d426cda6c891c0
9
+ module Google
10
+ module Apis
11
+ module BigqueryV2
12
+ class BigqueryService < Google::Apis::Core::BaseService
13
+ def get_job_query_results(project_id, job_id, max_results: nil, page_token: nil, start_index: nil, timeout_ms: nil, fields: nil, quota_user: nil, user_ip: nil, options: nil, &block)
14
+ command = make_simple_command(:get, 'projects/{projectId}/queries/{jobId}', options)
15
+ # command.response_representation = Google::Apis::BigqueryV2::GetQueryResultsResponse::Representation # monkey patch
16
+ command.response_class = Google::Apis::BigqueryV2::GetQueryResultsResponse
17
+ command.params['projectId'] = project_id unless project_id.nil?
18
+ command.params['jobId'] = job_id unless job_id.nil?
19
+ command.query['maxResults'] = max_results unless max_results.nil?
20
+ command.query['pageToken'] = page_token unless page_token.nil?
21
+ command.query['startIndex'] = start_index unless start_index.nil?
22
+ command.query['timeoutMs'] = timeout_ms unless timeout_ms.nil?
23
+ command.query['fields'] = fields unless fields.nil?
24
+ command.query['quotaUser'] = quota_user unless quota_user.nil?
25
+ command.query['userIp'] = user_ip unless user_ip.nil?
26
+ execute_or_queue_command(command, &block)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ module Triglav::Agent
34
+ module Bigquery
35
+ class Connection < Base::Connection
36
+ attr_reader :connection_info
37
+
38
+ class Error < StandardError; end
39
+ class NotFoundError < Error; end
40
+ class ConfigError < Error; end
41
+
42
+ def initialize(connection_info)
43
+ @connection_info = connection_info
44
+ end
45
+
46
+ def close # Intentionally a no-op: there is nothing to release here (see below).
47
+ # google-api-ruby-client uses hurley and patches it to use httpclient gem inside.
48
+ # httpclient gem manages its connections in its connection pool, and
49
+ # releases or reuses its connections automatically.
50
+ #
51
+ # ADVANCED NOTE: httpclient gem itself has its own connection pool, so the connection pool
52
+ # mechanism of triglav-agent-framework is just useless.
53
+ # httpclient gem creates as many new connections as are required, so the number of
54
+ # connections typically will be the same as the number of threads (?).
55
+ end
56
+
57
+ def client
58
+ return @cached_client if @cached_client && @cached_client_expiration > Time.now
59
+
60
+ client = Google::Apis::BigqueryV2::BigqueryService.new
61
+ client.request_options.retries = retries
62
+ client.request_options.timeout_sec = timeout_sec
63
+ client.request_options.open_timeout_sec = open_timeout_sec
64
+
65
+ scope = "https://www.googleapis.com/auth/bigquery"
66
+
67
+ case auth_method
68
+ when 'authorized_user'
69
+ auth = Signet::OAuth2::Client.new(
70
+ token_credential_uri: "https://accounts.google.com/o/oauth2/token",
71
+ audience: "https://accounts.google.com/o/oauth2/token",
72
+ scope: scope,
73
+ client_id: credentials['client_id'],
74
+ client_secret: credentials['client_secret'],
75
+ refresh_token: credentials['refresh_token']
76
+ )
77
+ auth.refresh!
78
+ when 'compute_engine'
79
+ auth = Google::Auth::GCECredentials.new
80
+ when 'service_account'
81
+ key = StringIO.new(credentials.to_json)
82
+ auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
83
+ when 'application_default'
84
+ auth = Google::Auth.get_application_default([scope])
85
+ else
86
+ raise ConfigError, "Unknown auth method: #{auth_method}"
87
+ end
88
+
89
+ client.authorization = auth
90
+
91
+ @cached_client_expiration = Time.now + 1800
92
+ @cached_client = client
93
+ end
94
+
95
+ # @return [Hash] {id:, creation_time:, last_modified_time:, location:, num_bytes:, num_rows:}
96
+ #
97
+ # creation_time [Integer] milli sec
98
+ # last_modified_time [Integer] milli sec
99
+ def get_table(project: nil, dataset:, table:)
100
+ project ||= self.project
101
+ begin
102
+ $logger.debug { "Get table... #{project}:#{dataset}.#{table}" }
103
+ response = client.get_table(project, dataset, table)
104
+ rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
105
+ if e.status_code == 404 # not found
106
+ raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
107
+ end
108
+
109
+ response = {status_code: e.status_code, message: e.message, error_class: e.class}
110
+ raise Error, "Failed to get_table(#{project}, #{dataset}, #{table}), response:#{response}"
111
+ end
112
+
113
+ result = {
114
+ id: response.id, # project:dataset.table
115
+ creation_time: response.creation_time.to_i, # millisec
116
+ last_modified_time: response.last_modified_time.to_i, # millisec
117
+ location: response.location,
118
+ num_bytes: response.num_bytes.to_i,
119
+ num_rows: response.num_rows.to_i,
120
+ }
121
+ end
122
+
123
+ # @return [Array] [partition_id, creation_time, last_modified_time]
124
+ #
125
+ # partition_id [String] partition id such as "20160307"
126
+ # creation_time [Integer] milli sec
127
+ # last_modified_time [Integer] milli sec
128
+ def get_partitions_summary(project: nil, dataset:, table:, limit: nil)
129
+ project ||= self.project
130
+ limit_stmt = limit ? " LIMIT #{limit.to_i}" : ""
131
+ result = query(
132
+ "select partition_id,creation_time,last_modified_time " \
133
+ "from [#{project}:#{dataset}.#{table}$__PARTITIONS_SUMMARY__] " \
134
+ "order by partition_id asc#{limit_stmt}"
135
+ )
136
+ result[:rows].map {|r| v = r[:f].map {|c| c[:v] }; [v[0], v[1].to_i, v[2].to_i] }
137
+ end
138
+
139
+ def project
140
+ @project ||= ENV['GOOGLE_PROJECT'] || @connection_info.fetch(:project, nil) || credentials['project_id']
141
+ @project ||= credentials['client_email'].chomp('.iam.gserviceaccount.com').split('@').last if credentials['client_email']
142
+ @project ||= project_default
143
+ end
144
+
145
+ private
146
+
147
+ def query(q, options = {})
148
+ started = Time.now
149
+ current_row = 0
150
+
151
+ body = {
152
+ job_reference: {
153
+ project_id: project,
154
+ job_id: "job_#{SecureRandom.uuid}",
155
+ },
156
+ configuration: {
157
+ query: {
158
+ query: q,
159
+ use_legacy_sql: true,
160
+ use_query_cache: true,
161
+ },
162
+ dry_run: options[:dry_run],
163
+ },
164
+ }
165
+ opts = {}
166
+
167
+ $logger.info { "insert_job(#{project}, #{body}, #{opts})" }
168
+ job_res = client.insert_job(project, body, opts)
169
+
170
+ if options[:dry_run]
171
+ {
172
+ totalRows: nil,
173
+ totalBytesProcessed: job_res.statistics.query.total_bytes_processed,
174
+ cacheHit: job_res.statistics.query.cache_hit,
175
+ }
176
+ else
177
+ job_id = job_res.job_reference.job_id
178
+
179
+ res = {}
180
+ while true
181
+ res = JSON.parse(client.get_job_query_results(
182
+ project,
183
+ job_id,
184
+ ), symbolize_names: true)
185
+ break if res[:jobComplete]
186
+ sleep 3
187
+
188
+ if (Time.now - started).to_i > HARD_TIMEOUT_SEC
189
+ raise RuntimeError.new("Query is timeout")
190
+ end
191
+ end
192
+
193
+ if res[:rows]
194
+ # res[:rows].each(&block)
195
+ current_row += res[:rows].size
196
+ end
197
+ total_rows = res[:totalRows].to_i
198
+
199
+ while current_row < total_rows
200
+ res = JSON.parse(client.get_job_query_results(
201
+ project,
202
+ job_id,
203
+ start_index: current_row
204
+ ), symbolize_names: true)
205
+ if res[:rows]
206
+ res[:rows].each(&block)
207
+ current_row += res[:rows].size
208
+ end
209
+ end
210
+
211
+ res
212
+ end
213
+ end
214
+
215
+ # compute_engine, authorized_user, service_account
216
+ def auth_method
217
+ @auth_method ||= ENV['AUTH_METHOD'] || @connection_info.fetch(:auth_method, nil) || credentials['type'] || 'compute_engine'
218
+ end
219
+
220
+ def credentials
221
+ JSON.parse(@connection_info.fetch(:credentials, nil) || File.read(credentials_file))
222
+ end
223
+
224
+ def credentials_file
225
+ @credentials_file ||= File.expand_path(
226
+ # ref. https://developers.google.com/identity/protocols/application-default-credentials
227
+ ENV['GOOGLE_APPLICATION_CREDENTIALS'] ||
228
+ @connection_info.fetch(:credentials_file, nil) ||
229
+ (File.exist?(global_application_default_credentials_file) ? global_application_default_credentials_file : application_default_credentials_file)
230
+ )
231
+ end
232
+
233
+ def application_default_credentials_file
234
+ @application_default_credentials_file ||= File.expand_path("~/.config/gcloud/application_default_credentials.json")
235
+ end
236
+
237
+ def global_application_default_credentials_file
238
+ @global_application_default_credentials_file ||= '/etc/google/auth/application_default_credentials.json'
239
+ end
240
+
241
+ def config_default_file
242
+ File.expand_path('~/.config/gcloud/configurations/config_default')
243
+ end
244
+
245
+ def config_default
246
+ # {core:{account:'xxx',project:'xxx'},compute:{zone:'xxx}}
247
+ @config_default ||= File.readable?(config_default_file) ? IniFile.load(config_default_file).to_hash : {}
248
+ end
249
+
250
+ def service_account_default
251
+ (config_default[:core] || {})[:account]
252
+ end
253
+
254
+ def project_default
255
+ (config_default[:core] || {})[:project]
256
+ end
257
+
258
+ def zone_default
259
+ (config_default[:compute] || {})[:zone]
260
+ end
261
+
262
+ def service_account
263
+ @service_account ||= ENV['GOOGLE_SERVICE_ACCOUNT'] || @connection_info.fetch(:service_account, nil) || credentials['client_email'] || service_account_default
264
+ end
265
+
266
+ def retries
267
+ @retries ||= ENV['RETRIES'] || @connection_info.fetch(:retries, nil) || $setting.dig(:bigquery, :retries) || 5
268
+ end
269
+
270
+ def timeout_sec
271
+ @timeout_sec ||= ENV['TIMEOUT_SEC'] || @connection_info.fetch(:timeout_sec, nil) || $setting.dig(:bigquery, :timeout_sec) || 300
272
+ end
273
+
274
+ def open_timeout_sec
275
+ @open_timeout_sec ||= ENV['OPEN_TIMEOUT_SEC'] || @connection_info.fetch(:open_timeout_sec, nil) || $setting.dig(:bigquery, :open_timeout_sec) || 300
276
+ end
277
+ end
278
+ end
279
+ end
@@ -0,0 +1,230 @@
1
+ require 'triglav/agent/base/monitor'
2
+ require 'uri'
3
+ require 'cgi'
4
+ require 'securerandom'
5
+
6
+ module Triglav::Agent
7
+ module Bigquery
8
+ class Monitor < Base::Monitor
9
+ attr_reader :connection, :resource_uri_prefix, :resource
10
+
11
+ # @param [Triglav::Agent::Bigquery::Connection] connection
12
+ # @param [String] resource_uri_prefix
13
+ # @param [TriglavClient::ResourceResponse] resource
14
+ # resource:
15
+ # uri: https://bigquery.cloud.google.com/table/project:dataset.table
16
+ # unit: 'daily', 'hourly', or 'singular'
17
+ # timezone: '+09:00'
18
+ # span_in_days: 32
19
+ def initialize(connection, resource_uri_prefix, resource)
20
+ @connection = connection
21
+ @resource_uri_prefix = resource_uri_prefix
22
+ @resource = resource
23
+ @status = Triglav::Agent::Status.new(resource_uri_prefix, resource.uri)
24
+ end
25
+
26
+ def process
27
+ unless resource_valid?
28
+ $logger.warn { "Broken resource: #{resource.to_s}" }
29
+ return nil
30
+ end
31
+
32
+ $logger.debug { "Start process #{resource.uri}" }
33
+
34
+ events, new_last_modified_times = get_events
35
+
36
+ $logger.debug { "Finish process #{resource.uri}" }
37
+
38
+ return nil if events.nil? || events.empty?
39
+ yield(events) if block_given? # send_message
40
+ update_status_file(new_last_modified_times)
41
+ true
42
+ end
43
+
44
+ private
45
+
46
+ def last_modified_times
47
+ @last_modified_times ||= get_last_modified_times
48
+ end
49
+
50
+ def get_events
51
+ if partitioned_table?
52
+ new_last_modified_times = get_new_last_modified_times_for_partitioned_table
53
+ else
54
+ new_last_modified_times = get_new_last_modified_times_for_non_partitioned_table
55
+ end
56
+ latest_tables = select_latest_tables(new_last_modified_times)
57
+ events = build_events(latest_tables)
58
+ [events, new_last_modified_times]
59
+ rescue => e
60
+ $logger.warn { "#{e.class} #{e.message} #{e.backtrace.join("\n ")}" }
61
+ nil
62
+ end
63
+
64
+ def update_status_file(last_modified_times)
65
+ last_modified_times[:max] = last_modified_times.values.max
66
+ @status.set(last_modified_times)
67
+ end
68
+
69
+ def get_last_modified_times
70
+ max_last_modified_time = @status.getsetnx([:max], $setting.debug? ? 0 : get_current_time)
71
+ last_modified_times = @status.get
72
+ removes = last_modified_times.keys - tables.keys
73
+ appends = tables.keys - last_modified_times.keys
74
+ removes.each {|table| last_modified_times.delete(table) }
75
+ appends.each {|table| last_modified_times[table] = max_last_modified_time }
76
+ last_modified_times
77
+ end
78
+
79
+ def get_current_time
80
+ (Time.now.to_f * 1000).to_i # msec
81
+ end
82
+
83
+ def resource_valid?
84
+ self.class.resource_valid?(resource)
85
+ end
86
+
87
+ def self.resource_valid?(resource)
88
+ resource_unit_valid?(resource) && !resource.timezone.nil? && !resource.span_in_days.nil?
89
+ end
90
+
91
+ # Two or more combinations are not allowed for hdfs because
92
+ # * hourly should have %d, %H
93
+ # * daily should have %d, but not have %H
94
+ # * singular should not have %d
95
+ # These conditions conflict.
96
+ def self.resource_unit_valid?(resource)
97
+ units = resource.unit.split(',').sort
98
+ return false if units.size >= 2
99
+ if units.include?('hourly')
100
+ return false unless resource.uri.match(/%H/)
101
+ end
102
+ # if units.include?('daily')
103
+ # return false unless resource.uri.match(/%d/)
104
+ # end
105
+ if units.include?('singular')
106
+ return false if resource.uri.match(/%[YmdH]/)
107
+ end
108
+ true
109
+ end
110
+
111
+ def dates
112
+ return @dates if @dates
113
+ now = Time.now.localtime(resource.timezone)
114
+ @dates = resource.span_in_days.times.map do |i|
115
+ (now - (i * 86000)).to_date
116
+ end
117
+ end
118
+
119
+ def project_dataset_table
120
+ @project_dataset_table ||= resource.uri.split('/').last
121
+ end
122
+
123
+ def project
124
+ @project ||= project_dataset_table.split(':').first
125
+ end
126
+
127
+ def dataset
128
+ @dataset ||= project_dataset_table.split(':').last.chomp(".#{table}")
129
+ end
130
+
131
+ def table
132
+ @table ||= project_dataset_table.split('.').last
133
+ end
134
+
135
+ def partitioned_table?
136
+ table.include?('$')
137
+ end
138
+
139
+ def table_without_partition
140
+ @table_without_partition ||= table.split('$').first
141
+ end
142
+
143
+ def dates
144
+ return @dates if @dates
145
+ now = Time.now.localtime(resource.timezone)
146
+ @dates = resource.span_in_days.times.map do |i|
147
+ (now - (i * 86000)).to_date
148
+ end
149
+ end
150
+
151
+ def tables
152
+ return @tables if @tables
153
+ tables = {}
154
+ # If table becomes same, use newer date
155
+ case resource.unit
156
+ when 'hourly'
157
+ dates.each do |date|
158
+ date_time = date.to_time
159
+ (0..23).each do |hour|
160
+ _table = (date_time + hour * 3600).strftime(table)
161
+ tables[_table.to_sym] = [date, hour]
162
+ end
163
+ end
164
+ when 'daily'
165
+ hour = 0
166
+ dates.each do |date|
167
+ _table = date.strftime(table)
168
+ tables[_table.to_sym] = [date, hour]
169
+ end
170
+ when 'singular'
171
+ tables[table.to_sym] = [nil, nil]
172
+ end
173
+ @tables = tables
174
+ end
175
+
176
+ def get_new_last_modified_times_for_partitioned_table
177
+ rows = connection.get_partitions_summary(
178
+ project: project, dataset: dataset, table: table_without_partition, limit: resource.span_in_days
179
+ )
180
+ new_last_modified_times = {}
181
+ rows.each do |partition, creation_time, last_modified_time|
182
+ new_last_modified_times["#{table_without_partition}$#{partition}".to_sym] = last_modified_time
183
+ end
184
+ new_last_modified_times
185
+ end
186
+
187
+ def get_new_last_modified_times_for_non_partitioned_table
188
+ new_last_modified_times = {}
189
+ tables.each do |table, date_hour|
190
+ begin
191
+ result = connection.get_table(project: project, dataset: dataset, table: table)
192
+ new_last_modified_times[table.to_sym] = result[:last_modified_time]
193
+ rescue Connection::NotFoundError => e
194
+ $logger.debug { "#{project}:#{dataset}.#{table.to_s} #=> does not exist" }
195
+ rescue Connection::Error => e
196
+ $logger.warn { "#{project}:#{dataset}.#{table.to_s} #=> #{e.class} #{e.message}" }
197
+ end
198
+ end
199
+ new_last_modified_times
200
+ end
201
+
202
+ def select_latest_tables(new_last_modified_times)
203
+ new_last_modified_times.select do |table, last_modified_time|
204
+ is_newer = last_modified_time > (last_modified_times[table] || 0)
205
+ $logger.debug { "#{project}:#{dataset}.#{table} #=> latest_modified_time:#{last_modified_time}, is_newer:#{is_newer}" }
206
+ is_newer
207
+ end
208
+ end
209
+
210
+ def build_events(latest_tables)
211
+ latest_tables.map do |table, last_modified_time|
212
+ date, hour = date_hour = tables[table]
213
+ {
214
+ uuid: SecureRandom.uuid,
215
+ resource_uri: resource.uri,
216
+ resource_unit: resource.unit,
217
+ resource_time: date_hour_to_i(date, hour, resource.timezone),
218
+ resource_timezone: resource.timezone,
219
+ payload: {table: table.to_sym, last_modified_time: last_modified_time}.to_json, # msec
220
+ }
221
+ end
222
+ end
223
+
224
+ def date_hour_to_i(date, hour, timezone)
225
+ return 0 if date.nil?
226
+ Time.strptime("#{date.to_s} #{hour.to_i} #{timezone}", '%Y-%m-%d %H %z').to_i
227
+ end
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,7 @@
1
+ module Triglav
2
+ module Agent
3
+ module Bigquery
4
+ VERSION = "1.0.0.rc1"
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,11 @@
1
+ module Triglav
2
+ module Agent
3
+ module Bigquery
4
+ end
5
+ end
6
+ end
7
+
8
+ require 'triglav-agent'
9
+ require 'triglav/agent/bigquery/connection'
10
+ require 'triglav/agent/bigquery/version'
11
+ require 'triglav/agent/bigquery/monitor'
data/prepare.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/sh
2
+ test -f config.yml || cp example/config.yml config.yml
3
+ test -f .env || cp example/example.env .env
data/start.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/sh
2
+ ABSPATH="$(cd "$(dirname "$0")" && pwd)/$(basename "$0")"
3
+ APP_ROOT="$(dirname "$ABSPATH")"
4
+ if [ -z "${SHARED_ROOT}" ]; then SHARED_ROOT=.; fi
5
+ # Build the command as positional parameters so that paths containing spaces stay single arguments.
6
+ set -- bundle exec triglav-agent-bigquery --dotenv -c config.yml --status "${SHARED_ROOT}/status.yml" --token "${SHARED_ROOT}/token.yml"
7
+ echo "$@"
8
+ "$@"
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'triglav/agent/bigquery/version'
5
+
6
+ # Gem specification for triglav-agent-bigquery.
+ Gem::Specification.new do |spec|
7
+ spec.name = "triglav-agent-bigquery"
8
+ spec.version = Triglav::Agent::Bigquery::VERSION
9
+ spec.authors = ["Triglav Team"]
10
+ spec.email = ["triglav_admin_my@dena.jp"]
11
+
12
+ spec.summary = %q{BigQuery agent for triglav, data-driven workflow tool.}
13
+ spec.description = %q{BigQuery agent for triglav, data-driven workflow tool.}
14
+ spec.homepage = "https://github.com/triglav-dataflow/triglav-agent-bigquery"
15
+ spec.license = "MIT"
16
+
17
+ # Package every git-tracked file except those under test/, spec/ or features/.
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ # Every packaged file under exe/ is exposed as an executable, by basename.
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ["lib"]
21
+
22
+ # Runtime dependencies.
+ # NOTE(review): none of these carries a version constraint, so any released
+ # version satisfies them - consider adding bounds once compatible ranges are known.
+ spec.add_dependency "triglav-agent"
23
+ spec.add_dependency "triglav_client"
24
+ spec.add_dependency "google-api-client"
25
+ spec.add_dependency "ini_file"
26
+
27
+ # Development-only dependencies (build and test tooling).
+ spec.add_development_dependency "bundler", "~> 1.11"
28
+ spec.add_development_dependency "rake", "~> 10.0"
29
+ spec.add_development_dependency "test-unit"
30
+ spec.add_development_dependency "test-unit-rr"
31
+ spec.add_development_dependency "test-unit-power_assert"
32
+ spec.add_development_dependency "timecop"
33
+ end
metadata ADDED
@@ -0,0 +1,205 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: triglav-agent-bigquery
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.rc1
5
+ platform: ruby
6
+ authors:
7
+ - Triglav Team
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-03-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: triglav-agent
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: triglav_client
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: google-api-client
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: ini_file
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.11'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.11'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: test-unit
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: test-unit-rr
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: test-unit-power_assert
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: timecop
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ description: BigQuery agent for triglav, data-driven workflow tool.
154
+ email:
155
+ - triglav_admin_my@dena.jp
156
+ executables:
157
+ - triglav-agent-bigquery
158
+ extensions: []
159
+ extra_rdoc_files: []
160
+ files:
161
+ - ".gitignore"
162
+ - ".rspec"
163
+ - ".travis.yml"
164
+ - CODE_OF_CONDUCT.md
165
+ - Gemfile
166
+ - LICENSE.txt
167
+ - README.md
168
+ - Rakefile
169
+ - bin/console
170
+ - bin/setup
171
+ - example/config.yml
172
+ - example/example.env
173
+ - exe/triglav-agent-bigquery
174
+ - lib/triglav/agent/bigquery.rb
175
+ - lib/triglav/agent/bigquery/connection.rb
176
+ - lib/triglav/agent/bigquery/monitor.rb
177
+ - lib/triglav/agent/bigquery/version.rb
178
+ - prepare.sh
179
+ - start.sh
180
+ - triglav-agent-bigquery.gemspec
181
+ homepage: https://github.com/triglav-dataflow/triglav-agent-bigquery
182
+ licenses:
183
+ - MIT
184
+ metadata: {}
185
+ post_install_message:
186
+ rdoc_options: []
187
+ require_paths:
188
+ - lib
189
+ required_ruby_version: !ruby/object:Gem::Requirement
190
+ requirements:
191
+ - - ">="
192
+ - !ruby/object:Gem::Version
193
+ version: '0'
194
+ required_rubygems_version: !ruby/object:Gem::Requirement
195
+ requirements:
196
+ - - ">"
197
+ - !ruby/object:Gem::Version
198
+ version: 1.3.1
199
+ requirements: []
200
+ rubyforge_project:
201
+ rubygems_version: 2.5.2
202
+ signing_key:
203
+ specification_version: 4
204
+ summary: BigQuery agent for triglav, data-driven workflow tool.
205
+ test_files: []