triglav-agent-bigquery 1.0.0.rc1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: b32f5948041f7c59be88aeaa8964818c2bf7cc9d
4
+ data.tar.gz: b3f3a1c2eec2353ab8c49c0ab967894b6b2ce1b5
5
+ SHA512:
6
+ metadata.gz: 31b0ad8eb808a81da42f62081f11e4d5ce78dc07ab9056d0427a98f48725d735e1fbf5ee0f358dd1d64f1588f76c3f96866dbf019c1889731c84ba047bd92804
7
+ data.tar.gz: b6f5be6197a4d92cbe089956049c5102754d3e3ef3f8cb6791bfa4e3ba555561f840ce518a998df2516d1e8dd0f808a101945c20f0ca725bbf42f6f11b6c0160
data/.gitignore ADDED
@@ -0,0 +1,15 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
10
+ .env
11
+ /status.yml
12
+ /token.yml
13
+ /config.yml
14
+ .ruby-version
15
+ *.json
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,6 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.3.0
4
+ - 2.4.0
5
+ before_install:
6
+ - gem install bundler -v 1.11.2
@@ -0,0 +1,49 @@
1
+ # Contributor Code of Conduct
2
+
3
+ As contributors and maintainers of this project, and in the interest of
4
+ fostering an open and welcoming community, we pledge to respect all people who
5
+ contribute through reporting issues, posting feature requests, updating
6
+ documentation, submitting pull requests or patches, and other activities.
7
+
8
+ We are committed to making participation in this project a harassment-free
9
+ experience for everyone, regardless of level of experience, gender, gender
10
+ identity and expression, sexual orientation, disability, personal appearance,
11
+ body size, race, ethnicity, age, religion, or nationality.
12
+
13
+ Examples of unacceptable behavior by participants include:
14
+
15
+ * The use of sexualized language or imagery
16
+ * Personal attacks
17
+ * Trolling or insulting/derogatory comments
18
+ * Public or private harassment
19
+ * Publishing other's private information, such as physical or electronic
20
+ addresses, without explicit permission
21
+ * Other unethical or unprofessional conduct
22
+
23
+ Project maintainers have the right and responsibility to remove, edit, or
24
+ reject comments, commits, code, wiki edits, issues, and other contributions
25
+ that are not aligned to this Code of Conduct, or to ban temporarily or
26
+ permanently any contributor for other behaviors that they deem inappropriate,
27
+ threatening, offensive, or harmful.
28
+
29
+ By adopting this Code of Conduct, project maintainers commit themselves to
30
+ fairly and consistently applying these principles to every aspect of managing
31
+ this project. Project maintainers who do not follow or enforce the Code of
32
+ Conduct may be permanently removed from the project team.
33
+
34
+ This code of conduct applies both within project spaces and in public spaces
35
+ when an individual is representing the project or its community.
36
+
37
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
38
+ reported by contacting a project maintainer at sonots@gmail.com. All
39
+ complaints will be reviewed and investigated and will result in a response that
40
+ is deemed necessary and appropriate to the circumstances. Maintainers are
41
+ obligated to maintain confidentiality with regard to the reporter of an
42
+ incident.
43
+
44
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage],
45
+ version 1.3.0, available at
46
+ [http://contributor-covenant.org/version/1/3/0/][version]
47
+
48
+ [homepage]: http://contributor-covenant.org
49
+ [version]: http://contributor-covenant.org/version/1/3/0/
data/Gemfile ADDED
@@ -0,0 +1,6 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+ gem 'triglav_client', git: 'https://github.com/triglav-dataflow/triglav-client-ruby'
5
+ gem 'triglav-agent', git: 'https://github.com/triglav-dataflow/triglav-agent-framework-ruby'
6
+ gem 'pry-byebug'
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ The MIT License (MIT)
2
+
3
+ Copyright (c) 2016 Triglav Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21
+ THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,135 @@
1
+ # Triglav::Agent::Bigquery
2
+
3
+ Triglav Agent for BigQuery
4
+
5
+ ## Requirements
6
+
7
+ * Ruby >= 2.3.0
8
+
9
+ ## Prerequisites
10
+
11
+ * BigQuery view is not supported
12
+
13
+ ## Installation
14
+
15
+ Add this line to your application's Gemfile:
16
+
17
+ ```ruby
18
+ gem 'triglav-agent-bigquery'
19
+ ```
20
+
21
+ And then execute:
22
+
23
+ $ bundle
24
+
25
+ Or install it yourself as:
26
+
27
+ $ gem install triglav-agent-bigquery
28
+
29
+ ## CLI
30
+
31
+ ```
32
+ Usage: triglav-agent-bigquery [options]
33
+ -c, --config VALUE Config file (default: config.yml)
34
+ -s, --status VALUE Status storage file (default: status.yml)
35
+ -t, --token VALUE Triglav access token storage file (default: token.yml)
36
+ --dotenv Load environment variables from .env file (default: false)
37
+ -h, --help help
38
+ --log VALUE Log path (default: STDOUT)
39
+ --log-level VALUE Log level (default: info)
40
+ ```
41
+
42
+ Run as:
43
+
44
+ ```
45
+ TRIGLAV_ENV=development bundle exec triglav-agent-bigquery --dotenv -c config.yml
46
+ ```
47
+
48
+ ## Configuration
49
+
50
+ Prepare config.yml as [example/config.yml](./example/config.yml).
51
+
52
+ You can use ERB templates in config.yml. You may also load environment variables from a `.env` file with the `--dotenv` option, as [example/example.env](./example/example.env) shows.
53
+
54
+ ### serverengine section
55
+
56
+ You can specify any [serverengine](https://github.com/fluent/serverengine) options at this section
57
+
58
+ ### triglav section
59
+
60
+ Specify triglav api url, and a credential to authenticate.
61
+
62
+ The access token obtained is stored into a token storage file (--token option).
63
+
64
+ ### bigquery section
65
+
66
+ This section is the special section for triglav-agent-bigquery.
67
+
68
+ * **monitor_interval**: The interval to watch tables (number, default: 60)
69
+ * **connection_info**: key-value pairs of bigquery connection info, where keys are resource URI patterns written as regular expressions and values are connection information
70
+ * **auth_method**: Authentication method. Must be one of `service_account`, `authorized_user` (for oauth2), `compute_engine`, and `application_default`. If omitted, the method is taken from the `type` field of the credentials.
71
+ * **credentials_file**: Credentials file path such as service account json.
72
+ * **credentials**: Instead of `credentials_file`, you may pass json contents as a string
73
+
74
+ ### Specification of Resource URI
75
+
76
+ Resource URI must be a form of:
77
+
78
+ ```
79
+ https://bigquery.cloud.google.com/table/#{project}:#{dataset}.#{table}
80
+ ```
81
+
82
+ `#{table}` also accepts strftime formatted suffix such as
83
+
84
+ ```
85
+ #{table}_%Y%m%d
86
+ ```
87
+
88
+ and strftime formatted partition decorator for a partitioned table such as
89
+
90
+ ```
91
+ #{table}$%Y%m%d
92
+ ```
93
+
94
+ ## How it behaves
95
+
96
+ 1. Authenticate with triglav
97
+ * Store the access token into the token storage file
98
+ * Read the token from the token storage file next time
99
+ * Refresh the access token if it is expired
100
+ 2. Repeat the following steps every `monitor_interval` seconds:
101
+ 3. Obtain resource (table) lists of the specified prefix (keys of connection_info) from triglav.
102
+ 4. Connect to bigquery with an appropriate connection info for a resource uri, and find tables which are newer than last check.
103
+ 5. Store checking information into the status storage file for the next time check.
104
+
105
+ ## Development
106
+
107
+ ### Prepare
108
+
109
+ ```
110
+ ./prepare.sh
111
+ ```
112
+
113
+ Edit `.env` or `config.yml` file directly.
114
+
115
+ ### Start
116
+
117
+ Start up triglav api on localhost.
118
+
119
+ Run triglav-agent-bigquery as:
120
+
121
+ ```
122
+ TRIGLAV_ENV=development bundle exec triglav-agent-bigquery --dotenv --debug -c example/config.yml
123
+ ```
124
+
125
+ The debug mode with --debug option ignores the `last_modified_time` value in status file.
126
+
127
+ ## Contributing
128
+
129
+ Bug reports and pull requests are welcome on GitHub at https://github.com/triglav-dataflow/triglav-agent-bigquery. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
130
+
131
+
132
+ ## License
133
+
134
+ The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).
135
+
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+
3
+ require 'rake/testtask'
4
+ desc 'Run test_unit based test'
5
+ Rake::TestTask.new(:test) do |t|
6
+ t.libs << "test"
7
+ t.test_files = Dir["test/**/test_*.rb"].sort
8
+ t.verbose = false
9
+ t.warning = false
10
+ end
11
+ task :default => :test
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "triglav/agent/bigquery"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,8 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+ set -vx
5
+
6
+ bundle install
7
+
8
+ # Do any other automated setup that you need to do here
@@ -0,0 +1,37 @@
1
+ defaults: &defaults
2
+ serverengine:
3
+ log: 'STDOUT'
4
+ log_level: 'debug'
5
+ log_rotate_age: 5
6
+ log_rotate_size: 10485760
7
+ triglav:
8
+ url: <%= ENV['TRIGLAV_URL'] || 'http://localhost:7800' %>
9
+ credential:
10
+ username: <%= ENV['TRIGLAV_USERNAME'] || 'triglav_test' %>
11
+ password: <%= ENV['TRIGLAV_PASSWORD'] || 'triglav_test' %>
12
+ authenticator: local
13
+ timeout: 60
14
+ debugging: false
15
+ retries: 3
16
+ retry_interval: 3 # sec
17
+ bigquery:
18
+ monitor_interval: 5
19
+ retries: 5
20
+ timeout_sec: 300
21
+ open_timeout_sec: 300
22
+ connection_info:
23
+ "https://bigquery.cloud.google.com/table/<%= ENV['GOOGLE_PROJECT'] || 'your-project' %>":
24
+ # auth_method: # service_account, authorized_user, compute_engine, or application_default. default: get type from credentials
25
+ credentials_file: ~/.config/gcloud/application_default_credentials.json
26
+ # credentials: |
27
+ # {
28
+ # "private_key_id": "123456789",
29
+ # "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
30
+ # "client_email": "..."
31
+ # }
32
+
33
+ development:
34
+ <<: *defaults
35
+
36
+ test:
37
+ <<: *defaults
@@ -0,0 +1,8 @@
1
+ TRIGLAV_URL=http://localhost:7800
2
+ TRIGLAV_USERNAME=triglav_test
3
+ TRIGLAV_PASSWORD=triglav_test
4
+ VERTICA_HOST=xxx.xxx.xxx.xxx
5
+ VERTICA_PORT=5433
6
+ VERTICA_DATABASE=vdb
7
+ VERTICA_USER=dbread
8
+ VERTICA_PASSWORD=daerbd
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'triglav/agent/bigquery'
4
+ Triglav::Agent::Configuration.configure do |config|
5
+ config.name = :bigquery
6
+ # config.cli_class = Triglav::Agent::Bigquery::CLI
7
+ # config.setting_class = Triglav::Agent::Bigquery::Setting
8
+ # config.worker_module = Triglav::Agent::Bigquery::Worker
9
+ # config.processor_class = Triglav::Agent::Bigquery::Processor
10
+ config.monitor_class = Triglav::Agent::Bigquery::Monitor
11
+ config.connection_class = Triglav::Agent::Bigquery::Connection
12
+ end
13
+ Triglav::Agent::Configuration.cli_class.new.run
@@ -0,0 +1,279 @@
1
+ require 'triglav/agent/base/connection'
2
+ require 'google/apis/bigquery_v2'
3
+ require 'google/api_client/auth/key_utils'
4
+ require 'securerandom'
5
+ require 'ini_file'
6
+
7
+ # monkey patch not to create representable objects which consumes lots of memory
8
+ # @see http://qiita.com/sonots/items/1271f3d426cda6c891c0
9
+ module Google
10
+ module Apis
11
+ module BigqueryV2
12
+ class BigqueryService < Google::Apis::Core::BaseService
13
+ def get_job_query_results(project_id, job_id, max_results: nil, page_token: nil, start_index: nil, timeout_ms: nil, fields: nil, quota_user: nil, user_ip: nil, options: nil, &block)
14
+ command = make_simple_command(:get, 'projects/{projectId}/queries/{jobId}', options)
15
+ # command.response_representation = Google::Apis::BigqueryV2::GetQueryResultsResponse::Representation # monkey patch
16
+ command.response_class = Google::Apis::BigqueryV2::GetQueryResultsResponse
17
+ command.params['projectId'] = project_id unless project_id.nil?
18
+ command.params['jobId'] = job_id unless job_id.nil?
19
+ command.query['maxResults'] = max_results unless max_results.nil?
20
+ command.query['pageToken'] = page_token unless page_token.nil?
21
+ command.query['startIndex'] = start_index unless start_index.nil?
22
+ command.query['timeoutMs'] = timeout_ms unless timeout_ms.nil?
23
+ command.query['fields'] = fields unless fields.nil?
24
+ command.query['quotaUser'] = quota_user unless quota_user.nil?
25
+ command.query['userIp'] = user_ip unless user_ip.nil?
26
+ execute_or_queue_command(command, &block)
27
+ end
28
+ end
29
+ end
30
+ end
31
+ end
32
+
33
+ module Triglav::Agent
34
+ module Bigquery
35
+ class Connection < Base::Connection
36
+ attr_reader :connection_info
37
+
38
+ class Error < StandardError; end
39
+ class NotFoundError < Error; end
40
+ class ConfigError < Error; end
41
+
42
+ def initialize(connection_info)
43
+ @connection_info = connection_info
44
+ end
45
+
46
+ def close
47
+ # Intentionally a no-op. google-api-ruby-client uses hurley and patches it to use the httpclient gem inside.
48
+ # The httpclient gem manages its connections in its own connection pool, and
49
+ # releases or reuses those connections automatically.
50
+ #
51
+ # ADVANCE NOTE: the httpclient gem itself has its own connection pool, so the connection pool
52
+ # mechanism of triglav-agent-framework is just useless here.
53
+ # The httpclient gem creates a new connection whenever one is required, so the number of
54
+ # connections typically will be the same as the number of threads (?).
55
+ end
56
+
57
+ def client
58
+ return @cached_client if @cached_client && @cached_client_expiration > Time.now
59
+
60
+ client = Google::Apis::BigqueryV2::BigqueryService.new
61
+ client.request_options.retries = retries
62
+ client.request_options.timeout_sec = timeout_sec
63
+ client.request_options.open_timeout_sec = open_timeout_sec
64
+
65
+ scope = "https://www.googleapis.com/auth/bigquery"
66
+
67
+ case auth_method
68
+ when 'authorized_user'
69
+ auth = Signet::OAuth2::Client.new(
70
+ token_credential_uri: "https://accounts.google.com/o/oauth2/token",
71
+ audience: "https://accounts.google.com/o/oauth2/token",
72
+ scope: scope,
73
+ client_id: credentials['client_id'],
74
+ client_secret: credentials['client_secret'],
75
+ refresh_token: credentials['refresh_token']
76
+ )
77
+ auth.refresh!
78
+ when 'compute_engine'
79
+ auth = Google::Auth::GCECredentials.new
80
+ when 'service_account'
81
+ key = StringIO.new(credentials.to_json)
82
+ auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
83
+ when 'application_default'
84
+ auth = Google::Auth.get_application_default([scope])
85
+ else
86
+ raise ConfigError, "Unknown auth method: #{auth_method}"
87
+ end
88
+
89
+ client.authorization = auth
90
+
91
+ @cached_client_expiration = Time.now + 1800
92
+ @cached_client = client
93
+ end
94
+
95
+ # @return [Hash] {id:, creation_time:, last_modified_time:, location:, num_bytes:, num_rows:}
96
+ #
97
+ # creation_time [Integer] milli sec
98
+ # last_modified_time [Integer] milli sec
99
+ def get_table(project: nil, dataset:, table:)
100
+ project ||= self.project
101
+ begin
102
+ $logger.debug { "Get table... #{project}:#{dataset}.#{table}" }
103
+ response = client.get_table(project, dataset, table)
104
+ rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
105
+ if e.status_code == 404 # not found
106
+ raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
107
+ end
108
+
109
+ response = {status_code: e.status_code, message: e.message, error_class: e.class}
110
+ raise Error, "Failed to get_table(#{project}, #{dataset}, #{table}), response:#{response}"
111
+ end
112
+
113
+ result = {
114
+ id: response.id, # project:dataset.table
115
+ creation_time: response.creation_time.to_i, # millisec
116
+ last_modified_time: response.last_modified_time.to_i, # millisec
117
+ location: response.location,
118
+ num_bytes: response.num_bytes.to_i,
119
+ num_rows: response.num_rows.to_i,
120
+ }
121
+ end
122
+
123
+ # @return [Array] [partition_id, creation_time, last_modified_time]
124
+ #
125
+ # partition_id [String] partition id such as "20160307"
126
+ # creation_time [Integer] milli sec
127
+ # last_modified_time [Integer] milli sec
128
+ def get_partitions_summary(project: nil, dataset:, table:, limit: nil)
129
+ project ||= self.project
130
+ limit_stmt = limit ? " LIMIT #{limit.to_i}" : ""
131
+ result = query(
132
+ "select partition_id,creation_time,last_modified_time " \
133
+ "from [#{project}:#{dataset}.#{table}$__PARTITIONS_SUMMARY__] " \
134
+ "order by partition_id asc#{limit_stmt}"
135
+ )
136
+ result[:rows].map {|r| v = r[:f].map {|c| c[:v] }; [v[0], v[1].to_i, v[2].to_i] }
137
+ end
138
+
139
+ def project
140
+ @project ||= ENV['GOOGLE_PROJECT'] || @connection_info.fetch(:project, nil) || credentials['project_id']
141
+ @project ||= credentials['client_email'].chomp('.iam.gserviceaccount.com').split('@').last if credentials['client_email']
142
+ @project ||= project_default
143
+ end
144
+
145
+ private
146
+
147
+ def query(q, options = {})
148
+ started = Time.now
149
+ current_row = 0
150
+
151
+ body = {
152
+ job_reference: {
153
+ project_id: project,
154
+ job_id: "job_#{SecureRandom.uuid}",
155
+ },
156
+ configuration: {
157
+ query: {
158
+ query: q,
159
+ use_legacy_sql: true,
160
+ use_query_cache: true,
161
+ },
162
+ dry_run: options[:dry_run],
163
+ },
164
+ }
165
+ opts = {}
166
+
167
+ $logger.info { "insert_job(#{project}, #{body}, #{opts})" }
168
+ job_res = client.insert_job(project, body, opts)
169
+
170
+ if options[:dry_run]
171
+ {
172
+ totalRows: nil,
173
+ totalBytesProcessed: job_res.statistics.query.total_bytes_processed,
174
+ cacheHit: job_res.statistics.query.cache_hit,
175
+ }
176
+ else
177
+ job_id = job_res.job_reference.job_id
178
+
179
+ res = {}
180
+ while true
181
+ res = JSON.parse(client.get_job_query_results(
182
+ project,
183
+ job_id,
184
+ ), symbolize_names: true)
185
+ break if res[:jobComplete]
186
+ sleep 3
187
+
188
+ if (Time.now - started).to_i > HARD_TIMEOUT_SEC
189
+ raise RuntimeError.new("Query is timeout")
190
+ end
191
+ end
192
+
193
+ if res[:rows]
194
+ # res[:rows].each(&block)
195
+ current_row += res[:rows].size
196
+ end
197
+ total_rows = res[:totalRows].to_i
198
+
199
+ while current_row < total_rows
200
+ res = JSON.parse(client.get_job_query_results(
201
+ project,
202
+ job_id,
203
+ start_index: current_row
204
+ ), symbolize_names: true)
205
+ if res[:rows]
206
+ res[:rows].each(&block)
207
+ current_row += res[:rows].size
208
+ end
209
+ end
210
+
211
+ res
212
+ end
213
+ end
214
+
215
+ # compute_engine, authorized_user, service_account
216
+ def auth_method
217
+ @auth_method ||= ENV['AUTH_METHOD'] || @connection_info.fetch(:auth_method, nil) || credentials['type'] || 'compute_engine'
218
+ end
219
+
220
+ def credentials
221
+ JSON.parse(@connection_info.fetch(:credentials, nil) || File.read(credentials_file))
222
+ end
223
+
224
+ def credentials_file
225
+ @credentials_file ||= File.expand_path(
226
+ # ref. https://developers.google.com/identity/protocols/application-default-credentials
227
+ ENV['GOOGLE_APPLICATION_CREDENTIALS'] ||
228
+ @connection_info.fetch(:credentials_file, nil) ||
229
+ (File.exist?(global_application_default_credentials_file) ? global_application_default_credentials_file : application_default_credentials_file)
230
+ )
231
+ end
232
+
233
+ def application_default_credentials_file
234
+ @application_default_credentials_file ||= File.expand_path("~/.config/gcloud/application_default_credentials.json")
235
+ end
236
+
237
+ def global_application_default_credentials_file
238
+ @global_application_default_credentials_file ||= '/etc/google/auth/application_default_credentials.json'
239
+ end
240
+
241
+ def config_default_file
242
+ File.expand_path('~/.config/gcloud/configurations/config_default')
243
+ end
244
+
245
+ def config_default
246
+ # Memoized Hash loaded from config_default_file, e.g. {core:{account:'xxx',project:'xxx'},compute:{zone:'xxx'}}; {} when the file is not readable.
247
+ @config_default ||= File.readable?(config_default_file) ? IniFile.load(config_default_file).to_hash : {}
248
+ end
249
+
250
+ def service_account_default
251
+ (config_default[:core] || {})[:account]
252
+ end
253
+
254
+ def project_default
255
+ (config_default[:core] || {})[:project]
256
+ end
257
+
258
+ def zone_default
259
+ (config_default[:compute] || {})[:zone]
260
+ end
261
+
262
+ def service_account
263
+ @service_account ||= ENV['GOOGLE_SERVICE_ACCOUNT'] || @connection_info.fetch(:service_account, nil) || credentials['client_email'] || service_account_default
264
+ end
265
+
266
+ def retries
267
+ @retries ||= ENV['RETRIES'] || @connection_info.fetch(:retries, nil) || $setting.dig(:bigquery, :retries) || 5
268
+ end
269
+
270
+ def timeout_sec
271
+ @timeout_sec ||= ENV['TIMEOUT_SEC'] || @connection_info.fetch(:timeout_sec, nil) || $setting.dig(:bigquery, :timeout_sec) || 300
272
+ end
273
+
274
+ def open_timeout_sec
275
+ @open_timeout_sec ||= ENV['OPEN_TIMEOUT_SEC'] || @connection_info.fetch(:open_timeout_sec, nil) || $setting.dig(:bigquery, :open_timeout_sec) || 300
276
+ end
277
+ end
278
+ end
279
+ end
@@ -0,0 +1,230 @@
1
+ require 'triglav/agent/base/monitor'
2
+ require 'uri'
3
+ require 'cgi'
4
+ require 'securerandom'
5
+
6
+ module Triglav::Agent
7
+ module Bigquery
8
+ class Monitor < Base::Monitor
9
+ attr_reader :connection, :resource_uri_prefix, :resource
10
+
11
+ # @param [Triglav::Agent::Bigquery::Connection] connection
12
+ # @param [String] resource_uri_prefix
13
+ # @param [TriglavClient::ResourceResponse] resource
14
+ # resource:
15
+ # uri: https://bigquery.cloud.google.com/table/project:dataset.table
16
+ # unit: 'daily', 'hourly', or 'singular'
17
+ # timezone: '+09:00'
18
+ # span_in_days: 32
19
+ def initialize(connection, resource_uri_prefix, resource)
20
+ @connection = connection
21
+ @resource_uri_prefix = resource_uri_prefix
22
+ @resource = resource
23
+ @status = Triglav::Agent::Status.new(resource_uri_prefix, resource.uri)
24
+ end
25
+
26
+ def process
27
+ unless resource_valid?
28
+ $logger.warn { "Broken resource: #{resource.to_s}" }
29
+ return nil
30
+ end
31
+
32
+ $logger.debug { "Start process #{resource.uri}" }
33
+
34
+ events, new_last_modified_times = get_events
35
+
36
+ $logger.debug { "Finish process #{resource.uri}" }
37
+
38
+ return nil if events.nil? || events.empty?
39
+ yield(events) if block_given? # send_message
40
+ update_status_file(new_last_modified_times)
41
+ true
42
+ end
43
+
44
+ private
45
+
46
+ def last_modified_times
47
+ @last_modified_times ||= get_last_modified_times
48
+ end
49
+
50
+ def get_events
51
+ if partitioned_table?
52
+ new_last_modified_times = get_new_last_modified_times_for_partitioned_table
53
+ else
54
+ new_last_modified_times = get_new_last_modified_times_for_non_partitioned_table
55
+ end
56
+ latest_tables = select_latest_tables(new_last_modified_times)
57
+ events = build_events(latest_tables)
58
+ [events, new_last_modified_times]
59
+ rescue => e
60
+ $logger.warn { "#{e.class} #{e.message} #{e.backtrace.join("\n ")}" }
61
+ nil
62
+ end
63
+
64
+ def update_status_file(last_modified_times)
65
+ last_modified_times[:max] = last_modified_times.values.max
66
+ @status.set(last_modified_times)
67
+ end
68
+
69
+ def get_last_modified_times
70
+ max_last_modified_time = @status.getsetnx([:max], $setting.debug? ? 0 : get_current_time)
71
+ last_modified_times = @status.get
72
+ removes = last_modified_times.keys - tables.keys
73
+ appends = tables.keys - last_modified_times.keys
74
+ removes.each {|table| last_modified_times.delete(table) }
75
+ appends.each {|table| last_modified_times[table] = max_last_modified_time }
76
+ last_modified_times
77
+ end
78
+
79
+ def get_current_time
80
+ (Time.now.to_f * 1000).to_i # msec
81
+ end
82
+
83
+ def resource_valid?
84
+ self.class.resource_valid?(resource)
85
+ end
86
+
87
+ def self.resource_valid?(resource)
88
+ resource_unit_valid?(resource) && !resource.timezone.nil? && !resource.span_in_days.nil?
89
+ end
90
+
91
+ # Two or more combinations are not allowed for bigquery because
92
+ # * hourly should have %d, %H
93
+ # * daily should have %d, but not have %H
94
+ # * singular should not have %d
95
+ # These conditions conflict.
96
+ def self.resource_unit_valid?(resource)
97
+ units = resource.unit.split(',').sort
98
+ return false if units.size >= 2
99
+ if units.include?('hourly')
100
+ return false unless resource.uri.match(/%H/)
101
+ end
102
+ # if units.include?('daily')
103
+ # return false unless resource.uri.match(/%d/)
104
+ # end
105
+ if units.include?('singular')
106
+ return false if resource.uri.match(/%[YmdH]/)
107
+ end
108
+ true
109
+ end
110
+
111
+ def dates
112
+ return @dates if @dates
113
+ now = Time.now.localtime(resource.timezone)
114
+ @dates = resource.span_in_days.times.map do |i|
115
+ (now - (i * 86000)).to_date
116
+ end
117
+ end
118
+
119
+ def project_dataset_table
120
+ @project_dataset_table ||= resource.uri.split('/').last
121
+ end
122
+
123
+ def project
124
+ @project ||= project_dataset_table.split(':').first
125
+ end
126
+
127
+ def dataset
128
+ @dataset ||= project_dataset_table.split(':').last.chomp(".#{table}")
129
+ end
130
+
131
+ def table
132
+ @table ||= project_dataset_table.split('.').last
133
+ end
134
+
135
+ def partitioned_table?
136
+ table.include?('$')
137
+ end
138
+
139
+ def table_without_partition
140
+ @table_without_partition ||= table.split('$').first
141
+ end
142
+
143
+ def dates
144
+ return @dates if @dates
145
+ now = Time.now.localtime(resource.timezone)
146
+ @dates = resource.span_in_days.times.map do |i|
147
+ (now - (i * 86000)).to_date
148
+ end
149
+ end
150
+
151
+ def tables
152
+ return @tables if @tables
153
+ tables = {}
154
+ # If table becomes same, use newer date
155
+ case resource.unit
156
+ when 'hourly'
157
+ dates.each do |date|
158
+ date_time = date.to_time
159
+ (0..23).each do |hour|
160
+ _table = (date_time + hour * 3600).strftime(table)
161
+ tables[_table.to_sym] = [date, hour]
162
+ end
163
+ end
164
+ when 'daily'
165
+ hour = 0
166
+ dates.each do |date|
167
+ _table = date.strftime(table)
168
+ tables[_table.to_sym] = [date, hour]
169
+ end
170
+ when 'singular'
171
+ tables[table.to_sym] = [nil, nil]
172
+ end
173
+ @tables = tables
174
+ end
175
+
176
+ def get_new_last_modified_times_for_partitioned_table
177
+ rows = connection.get_partitions_summary(
178
+ project: project, dataset: dataset, table: table_without_partition, limit: resource.span_in_days
179
+ )
180
+ new_last_modified_times = {}
181
+ rows.each do |partition, creation_time, last_modified_time|
182
+ new_last_modified_times["#{table_without_partition}$#{partition}".to_sym] = last_modified_time
183
+ end
184
+ new_last_modified_times
185
+ end
186
+
187
+ def get_new_last_modified_times_for_non_partitioned_table
188
+ new_last_modified_times = {}
189
+ tables.each do |table, date_hour|
190
+ begin
191
+ result = connection.get_table(project: project, dataset: dataset, table: table)
192
+ new_last_modified_times[table.to_sym] = result[:last_modified_time]
193
+ rescue Connection::NotFoundError => e
194
+ $logger.debug { "#{project}:#{dataset}.#{table.to_s} #=> does not exist" }
195
+ rescue Connection::Error => e
196
+ $logger.warn { "#{project}:#{dataset}.#{table.to_s} #=> #{e.class} #{e.message}" }
197
+ end
198
+ end
199
+ new_last_modified_times
200
+ end
201
+
202
+ def select_latest_tables(new_last_modified_times)
203
+ new_last_modified_times.select do |table, last_modified_time|
204
+ is_newer = last_modified_time > (last_modified_times[table] || 0)
205
+ $logger.debug { "#{project}:#{dataset}.#{table} #=> latest_modified_time:#{last_modified_time}, is_newer:#{is_newer}" }
206
+ is_newer
207
+ end
208
+ end
209
+
210
+ def build_events(latest_tables)
211
+ latest_tables.map do |table, last_modified_time|
212
+ date, hour = date_hour = tables[table]
213
+ {
214
+ uuid: SecureRandom.uuid,
215
+ resource_uri: resource.uri,
216
+ resource_unit: resource.unit,
217
+ resource_time: date_hour_to_i(date, hour, resource.timezone),
218
+ resource_timezone: resource.timezone,
219
+ payload: {table: table.to_sym, last_modified_time: last_modified_time}.to_json, # msec
220
+ }
221
+ end
222
+ end
223
+
224
+ def date_hour_to_i(date, hour, timezone)
225
+ return 0 if date.nil?
226
+ Time.strptime("#{date.to_s} #{hour.to_i} #{timezone}", '%Y-%m-%d %H %z').to_i
227
+ end
228
+ end
229
+ end
230
+ end
@@ -0,0 +1,7 @@
1
+ module Triglav
2
+ module Agent
3
+ module Bigquery
4
+ VERSION = "1.0.0.rc1"
5
+ end
6
+ end
7
+ end
@@ -0,0 +1,11 @@
1
+ module Triglav
2
+ module Agent
3
+ module Bigquery
4
+ end
5
+ end
6
+ end
7
+
8
+ require 'triglav-agent'
9
+ require 'triglav/agent/bigquery/connection'
10
+ require 'triglav/agent/bigquery/version'
11
+ require 'triglav/agent/bigquery/monitor'
data/prepare.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/sh
2
+ test -f config.yml || cp example/config.yml config.yml
3
+ test -f .env || cp example/example.env .env
data/start.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/sh
2
+ ABSPATH=$(cd $(dirname $0) && pwd)/$(basename $0)
3
+ APP_ROOT=$(dirname $ABSPATH)
4
+ if [ -z "${SHARED_ROOT}" ]; then SHARED_ROOT=.; fi
5
+
6
+ CMD="bundle exec triglav-agent-bigquery --dotenv -c config.yml --status ${SHARED_ROOT}/status.yml --token ${SHARED_ROOT}/token.yml"
7
+ echo $CMD
8
+ $CMD
@@ -0,0 +1,33 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'triglav/agent/bigquery/version'
5
+
6
+ Gem::Specification.new do |spec|
7
+ spec.name = "triglav-agent-bigquery"
8
+ spec.version = Triglav::Agent::Bigquery::VERSION
9
+ spec.authors = ["Triglav Team"]
10
+ spec.email = ["triglav_admin_my@dena.jp"]
11
+
12
+ spec.summary = %q{BigQuery agent for triglav, data-driven workflow tool.}
13
+ spec.description = %q{BigQuery agent for triglav, data-driven workflow tool.}
14
+ spec.homepage = "https://github.com/triglav-dataflow/triglav-agent-bigquery"
15
+ spec.license = "MIT"
16
+
17
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
18
+ spec.bindir = "exe"
19
+ spec.executables = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
20
+ spec.require_paths = ["lib"]
21
+
22
+ spec.add_dependency "triglav-agent"
23
+ spec.add_dependency "triglav_client"
24
+ spec.add_dependency "google-api-client"
25
+ spec.add_dependency "ini_file"
26
+
27
+ spec.add_development_dependency "bundler", "~> 1.11"
28
+ spec.add_development_dependency "rake", "~> 10.0"
29
+ spec.add_development_dependency "test-unit"
30
+ spec.add_development_dependency "test-unit-rr"
31
+ spec.add_development_dependency "test-unit-power_assert"
32
+ spec.add_development_dependency "timecop"
33
+ end
metadata ADDED
@@ -0,0 +1,205 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: triglav-agent-bigquery
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0.rc1
5
+ platform: ruby
6
+ authors:
7
+ - Triglav Team
8
+ autorequire:
9
+ bindir: exe
10
+ cert_chain: []
11
+ date: 2017-03-14 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: triglav-agent
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ name: triglav_client
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - ">="
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ name: google-api-client
43
+ requirement: !ruby/object:Gem::Requirement
44
+ requirements:
45
+ - - ">="
46
+ - !ruby/object:Gem::Version
47
+ version: '0'
48
+ type: :runtime
49
+ prerelease: false
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: ini_file
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ - !ruby/object:Gem::Dependency
70
+ name: bundler
71
+ requirement: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - "~>"
74
+ - !ruby/object:Gem::Version
75
+ version: '1.11'
76
+ type: :development
77
+ prerelease: false
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - "~>"
81
+ - !ruby/object:Gem::Version
82
+ version: '1.11'
83
+ - !ruby/object:Gem::Dependency
84
+ name: rake
85
+ requirement: !ruby/object:Gem::Requirement
86
+ requirements:
87
+ - - "~>"
88
+ - !ruby/object:Gem::Version
89
+ version: '10.0'
90
+ type: :development
91
+ prerelease: false
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - "~>"
95
+ - !ruby/object:Gem::Version
96
+ version: '10.0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: test-unit
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ - !ruby/object:Gem::Dependency
112
+ name: test-unit-rr
113
+ requirement: !ruby/object:Gem::Requirement
114
+ requirements:
115
+ - - ">="
116
+ - !ruby/object:Gem::Version
117
+ version: '0'
118
+ type: :development
119
+ prerelease: false
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ requirements:
122
+ - - ">="
123
+ - !ruby/object:Gem::Version
124
+ version: '0'
125
+ - !ruby/object:Gem::Dependency
126
+ name: test-unit-power_assert
127
+ requirement: !ruby/object:Gem::Requirement
128
+ requirements:
129
+ - - ">="
130
+ - !ruby/object:Gem::Version
131
+ version: '0'
132
+ type: :development
133
+ prerelease: false
134
+ version_requirements: !ruby/object:Gem::Requirement
135
+ requirements:
136
+ - - ">="
137
+ - !ruby/object:Gem::Version
138
+ version: '0'
139
+ - !ruby/object:Gem::Dependency
140
+ name: timecop
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - ">="
144
+ - !ruby/object:Gem::Version
145
+ version: '0'
146
+ type: :development
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - ">="
151
+ - !ruby/object:Gem::Version
152
+ version: '0'
153
+ description: BigQuery agent for triglav, data-driven workflow tool.
154
+ email:
155
+ - triglav_admin_my@dena.jp
156
+ executables:
157
+ - triglav-agent-bigquery
158
+ extensions: []
159
+ extra_rdoc_files: []
160
+ files:
161
+ - ".gitignore"
162
+ - ".rspec"
163
+ - ".travis.yml"
164
+ - CODE_OF_CONDUCT.md
165
+ - Gemfile
166
+ - LICENSE.txt
167
+ - README.md
168
+ - Rakefile
169
+ - bin/console
170
+ - bin/setup
171
+ - example/config.yml
172
+ - example/example.env
173
+ - exe/triglav-agent-bigquery
174
+ - lib/triglav/agent/bigquery.rb
175
+ - lib/triglav/agent/bigquery/connection.rb
176
+ - lib/triglav/agent/bigquery/monitor.rb
177
+ - lib/triglav/agent/bigquery/version.rb
178
+ - prepare.sh
179
+ - start.sh
180
+ - triglav-agent-bigquery.gemspec
181
+ homepage: https://github.com/triglav-dataflow/triglav-agent-bigquery
182
+ licenses:
183
+ - MIT
184
+ metadata: {}
185
+ post_install_message:
186
+ rdoc_options: []
187
+ require_paths:
188
+ - lib
189
+ required_ruby_version: !ruby/object:Gem::Requirement
190
+ requirements:
191
+ - - ">="
192
+ - !ruby/object:Gem::Version
193
+ version: '0'
194
+ required_rubygems_version: !ruby/object:Gem::Requirement
195
+ requirements:
196
+ - - ">"
197
+ - !ruby/object:Gem::Version
198
+ version: 1.3.1
199
+ requirements: []
200
+ rubyforge_project:
201
+ rubygems_version: 2.5.2
202
+ signing_key:
203
+ specification_version: 4
204
+ summary: BigQuery agent for triglav, data-driven workflow tool.
205
+ test_files: []