RubyGems - triglav-agent-bigquery - Versions diffs - 1.0.0.rc1 - Mend

triglav-agent-bigquery 1.0.0.rc1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +7 -0
data/.gitignore +15 -0
data/.rspec +2 -0
data/.travis.yml +6 -0
data/CODE_OF_CONDUCT.md +49 -0
data/Gemfile +6 -0
data/LICENSE.txt +21 -0
data/README.md +135 -0
data/Rakefile +11 -0
data/bin/console +14 -0
data/bin/setup +8 -0
data/example/config.yml +37 -0
data/example/example.env +8 -0
data/exe/triglav-agent-bigquery +13 -0
data/lib/triglav/agent/bigquery/connection.rb +279 -0
data/lib/triglav/agent/bigquery/monitor.rb +230 -0
data/lib/triglav/agent/bigquery/version.rb +7 -0
data/lib/triglav/agent/bigquery.rb +11 -0
data/prepare.sh +3 -0
data/start.sh +8 -0
data/triglav-agent-bigquery.gemspec +33 -0
metadata +205 -0

checksums.yaml ADDED Viewed

@@ -0,0 +1,7 @@
+---
+SHA1:
+  metadata.gz: b32f5948041f7c59be88aeaa8964818c2bf7cc9d
+  data.tar.gz: b3f3a1c2eec2353ab8c49c0ab967894b6b2ce1b5
+SHA512:
+  metadata.gz: 31b0ad8eb808a81da42f62081f11e4d5ce78dc07ab9056d0427a98f48725d735e1fbf5ee0f358dd1d64f1588f76c3f96866dbf019c1889731c84ba047bd92804
+  data.tar.gz: b6f5be6197a4d92cbe089956049c5102754d3e3ef3f8cb6791bfa4e3ba555561f840ce518a998df2516d1e8dd0f808a101945c20f0ca725bbf42f6f11b6c0160

data/.gitignore ADDED Viewed

@@ -0,0 +1,15 @@
+/.bundle/
+/.yardoc
+/Gemfile.lock
+/_yardoc/
+/coverage/
+/doc/
+/pkg/
+/spec/reports/
+/tmp/
+.env
+/status.yml
+/token.yml
+/config.yml
+.ruby-version
+*.json

data/.rspec ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ --format documentation
2	+ --color

data/.travis.yml ADDED Viewed

@@ -0,0 +1,6 @@
+language: ruby
+rvm:
+  - 2.3.0
+  - 2.4.0
+before_install:
+  - gem install bundler -v 1.11.2

data/CODE_OF_CONDUCT.md ADDED Viewed

@@ -0,0 +1,49 @@
+# Contributor Code of Conduct
+As contributors and maintainers of this project, and in the interest of
+fostering an open and welcoming community, we pledge to respect all people who
+contribute through reporting issues, posting feature requests, updating
+documentation, submitting pull requests or patches, and other activities.
+We are committed to making participation in this project a harassment-free
+experience for everyone, regardless of level of experience, gender, gender
+identity and expression, sexual orientation, disability, personal appearance,
+body size, race, ethnicity, age, religion, or nationality.
+Examples of unacceptable behavior by participants include:
+* The use of sexualized language or imagery
+* Personal attacks
+* Trolling or insulting/derogatory comments
+* Public or private harassment
+* Publishing other's private information, such as physical or electronic
+  addresses, without explicit permission
+* Other unethical or unprofessional conduct
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+By adopting this Code of Conduct, project maintainers commit themselves to
+fairly and consistently applying these principles to every aspect of managing
+this project. Project maintainers who do not follow or enforce the Code of
+Conduct may be permanently removed from the project team.
+This code of conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community.
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting a project maintainer at sonots@gmail.com. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. Maintainers are
+obligated to maintain confidentiality with regard to the reporter of an
+incident.
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 1.3.0, available at
+[http://contributor-covenant.org/version/1/3/0/][version]
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/3/0/

data/Gemfile ADDED Viewed

@@ -0,0 +1,6 @@
+source 'https://rubygems.org'
+gemspec
+gem 'triglav_client', git: 'https://github.com/triglav-dataflow/triglav-client-ruby'
+gem 'triglav-agent', git: 'https://github.com/triglav-dataflow/triglav-agent-framework-ruby'
+gem 'pry-byebug'

data/LICENSE.txt ADDED Viewed

@@ -0,0 +1,21 @@
+The MIT License (MIT)
+Copyright (c) 2016 Triglav Team
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.

data/README.md ADDED Viewed

@@ -0,0 +1,135 @@
+# Triglav::Agent::Bigquery
+Triglav Agent for BigQuery
+## Requirements
+* Ruby >= 2.3.0
+## Prerequisites
+* BigQuery view is not supported
+## Installation
+Add this line to your application's Gemfile:
+```ruby
+gem 'triglav-agent-bigquery'
+```
+And then execute:
+    $ bundle
+Or install it yourself as:
+    $ gem install triglav-agent-bigquery
+## CLI
+```
+Usage: triglav-agent-bigquery [options]
+    -c, --config VALUE               Config file (default: config.yml)
+    -s, --status VALUE               Status storage file (default: status.yml)
+    -t, --token VALUE                Triglav access token storage file (default: token.yml)
+        --dotenv                     Load environment variables from .env file (default: false)
+    -h, --help                       help
+        --log VALUE                  Log path (default: STDOUT)
+        --log-level VALUE            Log level (default: info)
+```
+Run as:
+```
+TRIGLAV_ENV=development bundle exec triglav-agent-bigquery --dotenv -c config.yml
+```
+## Configuration
+Prepare config.yml as [example/config.yml](./example/config.yml).
+You can use erb template. You may load environment variables from .env file with `--dotenv` option as an [example/example.env](./example/example.env) file shows.
+### serverengine section
+You can specify any [serverengine](https://github.com/fluent/serverengine) options at this section
+### triglav section
+Specify triglav api url, and a credential to authenticate.
+The access token obtained is stored into a token storage file (--token option).
+### bigquery section
+This section is the special section for triglav-agent-bigquery.
+* **monitor_interval**: The interval to watch tables (number, default: 60)
+* **connection_info**: key-value pairs of BigQuery connection info, where each key is a resource URI pattern (a regular expression) and each value is the connection information used for matching resources
+  * **auth_method**: Authentication method. Must be one of `service_account`, `authorized_user` (for oauth2), `compute_engine`, and `application_default`. If omitted, the method is taken from the `type` field of the credentials.
+  * **credentials_file**: Credentials file path such as service account json.
+  * **credentials**: Instead of `credentials_file`, you may pass json contents as a string
+### Specification of Resource URI
+Resource URI must be a form of:
+```
+https://bigquery.cloud.google.com/table/#{project}:#{dataset}.#{table}
+```
+`#{table}` also accepts strftime formatted suffix such as
+```
+#{table}_%Y%m%d
+```
+and strftime formatted partition decorator for a partitioned table such as
+```
+#{table}$%Y%m%d
+```
+## How it behaves
+1. Authenticate with triglav
+  * Store the access token into the token storage file
+  * Read the token from the token storage file next time
+  * Refresh the access token if it is expired
+2. Repeat the following steps every `monitor_interval` seconds:
+3. Obtain resource (table) lists of the specified prefix (keys of connection_info) from triglav.
+4. Connect to bigquery with an appropriate connection info for a resource uri, and find tables which are newer than last check.
+5. Store checking information into the status storage file for the next time check.
+## Development
+### Prepare
+```
+./prepare.sh
+```
+Edit `.env` or `config.yml` file directly.
+### Start
+Start up triglav api on localhost.
+Run triglav-agent-bigquery as:
+```
+TRIGLAV_ENV=development bundle exec triglav-agent-bigquery --dotenv --debug -c example/config.yml
+```
+The debug mode with --debug option ignores the `last_modified_time` value in status file.
+## Contributing
+Bug reports and pull requests are welcome on GitHub at https://github.com/triglav-dataflow/triglav-agent-bigquery. This project is intended to be a safe, welcoming space for collaboration, and contributors are expected to adhere to the [Contributor Covenant](http://contributor-covenant.org) code of conduct.
+## License
+The gem is available as open source under the terms of the [MIT License](http://opensource.org/licenses/MIT).

data/Rakefile ADDED Viewed

@@ -0,0 +1,11 @@
+require "bundler/gem_tasks"
+require 'rake/testtask'
+desc 'Run test_unit based test'
+Rake::TestTask.new(:test) do |t|
+  t.libs << "test"
+  t.test_files = Dir["test/**/test_*.rb"].sort
+  t.verbose = false
+  t.warning = false
+end
+task :default => :test

data/bin/console ADDED Viewed

@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+require "bundler/setup"
+require "triglav/agent/bigquery"
+# You can add fixtures and/or initialization code here to make experimenting
+# with your gem easier. You can also use a different console, if you like.
+# (If you use this, don't forget to add pry to your Gemfile!)
+# require "pry"
+# Pry.start
+require "irb"
+IRB.start

data/bin/setup ADDED Viewed

@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+set -euo pipefail
+IFS=$'\n\t'
+set -vx
+bundle install
+# Do any other automated setup that you need to do here

data/example/config.yml ADDED Viewed

@@ -0,0 +1,37 @@
+defaults: &defaults
+  serverengine:
+    log: 'STDOUT'
+    log_level: 'debug'
+    log_rotate_age: 5
+    log_rotate_size: 10485760
+  triglav:
+    url: <%= ENV['TRIGLAV_URL'] || 'http://localhost:7800' %>
+    credential:
+      username: <%= ENV['TRIGLAV_USERNAME'] || 'triglav_test' %>
+      password: <%= ENV['TRIGLAV_PASSWORD'] || 'triglav_test' %>
+      authenticator: local
+    timeout: 60
+    debugging: false
+    retries: 3
+    retry_interval: 3 # sec
+  bigquery:
+    monitor_interval: 5
+    retries: 5
+    timeout_sec: 300
+    open_timeout_sec: 300
+    connection_info:
+      "https://bigquery.cloud.google.com/table/<%= ENV['GOOGLE_PROJECT'] || 'your-project' %>":
+        # auth_method: # service_account, authorized_user, compute_engine, or application_default. default: get type from credentials
+        credentials_file: ~/.config/gcloud/application_default_credentials.json
+        # credentials: |
+        #   {
+        #     "private_key_id": "123456789",
+        #     "private_key": "-----BEGIN PRIVATE KEY-----\nABCDEF",
+        #     "client_email": "..."
+        #   }
+development:
+  <<: *defaults
+test:
+  <<: *defaults

data/example/example.env ADDED Viewed

@@ -0,0 +1,8 @@
+TRIGLAV_URL=http://localhost:7800
+TRIGLAV_USERNAME=triglav_test
+TRIGLAV_PASSWORD=triglav_test
+VERTICA_HOST=xxx.xxx.xxx.xxx
+VERTICA_PORT=5433
+VERTICA_DATABASE=vdb
+VERTICA_USER=dbread
+VERTICA_PASSWORD=daerbd

data/exe/triglav-agent-bigquery ADDED Viewed

@@ -0,0 +1,13 @@
+#!/usr/bin/env ruby
+require 'triglav/agent/bigquery'
+Triglav::Agent::Configuration.configure do |config|
+  config.name = :bigquery
+  # config.cli_class = Triglav::Agent::Bigquery::CLI
+  # config.setting_class = Triglav::Agent::Bigquery::Setting
+  # config.worker_module = Triglav::Agent::Bigquery::Worker
+  # config.processor_class = Triglav::Agent::Bigquery::Processor
+  config.monitor_class = Triglav::Agent::Bigquery::Monitor
+  config.connection_class = Triglav::Agent::Bigquery::Connection
+end
+Triglav::Agent::Configuration.cli_class.new.run

data/lib/triglav/agent/bigquery/connection.rb ADDED Viewed

@@ -0,0 +1,279 @@
+require 'triglav/agent/base/connection'
+require 'google/apis/bigquery_v2'
+require 'google/api_client/auth/key_utils'
+require 'securerandom'
+require 'ini_file'
+# monkey patch not to create representable objects which consumes lots of memory
+# @see http://qiita.com/sonots/items/1271f3d426cda6c891c0
+module Google
+  module Apis
+    module BigqueryV2
+      class BigqueryService < Google::Apis::Core::BaseService
+        def get_job_query_results(project_id, job_id, max_results: nil, page_token: nil, start_index: nil, timeout_ms: nil, fields: nil, quota_user: nil, user_ip: nil, options: nil, &block)
+          command =  make_simple_command(:get, 'projects/{projectId}/queries/{jobId}', options)
+          # command.response_representation = Google::Apis::BigqueryV2::GetQueryResultsResponse::Representation # monkey patch
+          command.response_class = Google::Apis::BigqueryV2::GetQueryResultsResponse
+          command.params['projectId'] = project_id unless project_id.nil?
+          command.params['jobId'] = job_id unless job_id.nil?
+          command.query['maxResults'] = max_results unless max_results.nil?
+          command.query['pageToken'] = page_token unless page_token.nil?
+          command.query['startIndex'] = start_index unless start_index.nil?
+          command.query['timeoutMs'] = timeout_ms unless timeout_ms.nil?
+          command.query['fields'] = fields unless fields.nil?
+          command.query['quotaUser'] = quota_user unless quota_user.nil?
+          command.query['userIp'] = user_ip unless user_ip.nil?
+          execute_or_queue_command(command, &block)
+        end
+      end
+    end
+  end
+end
+module Triglav::Agent
+  module Bigquery
+    class Connection < Base::Connection
+      attr_reader :connection_info
+      class Error < StandardError; end
+      class NotFoundError < Error; end
+      class ConfigError < Error; end
+      def initialize(connection_info)
+        @connection_info = connection_info
+      end
+      def close
+        # Intentionally a no-op: there is nothing to release explicitly here.
+        #
+        # google-api-ruby-client uses hurley and patches it to use httpclient gem inside.
+        # httpclient gem manages its connections in its connection pool, and
+        # releases or reuses its connections automatically.
+        #
+        # ADVANCED NOTE: httpclient gem itself has its own connection pool, so the connection pool
+        # mechanism of triglav-agent-framework is not useful for this agent.
+        # httpclient gem creates new connections as they are required, so the number of
+        # connections typically will be the same as the number of threads (?).
+      end
+      def client
+        return @cached_client if @cached_client && @cached_client_expiration > Time.now
+        client = Google::Apis::BigqueryV2::BigqueryService.new
+        client.request_options.retries = retries
+        client.request_options.timeout_sec = timeout_sec
+        client.request_options.open_timeout_sec = open_timeout_sec
+        scope = "https://www.googleapis.com/auth/bigquery"
+        case auth_method
+        when 'authorized_user'
+          auth = Signet::OAuth2::Client.new(
+            token_credential_uri: "https://accounts.google.com/o/oauth2/token",
+            audience: "https://accounts.google.com/o/oauth2/token",
+            scope: scope,
+            client_id:     credentials['client_id'],
+            client_secret: credentials['client_secret'],
+            refresh_token: credentials['refresh_token']
+          )
+          auth.refresh!
+        when 'compute_engine'
+          auth = Google::Auth::GCECredentials.new
+        when 'service_account'
+          key = StringIO.new(credentials.to_json)
+          auth = Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: key, scope: scope)
+        when 'application_default'
+          auth = Google::Auth.get_application_default([scope])
+        else
+          raise ConfigError, "Unknown auth method: #{auth_method}"
+        end
+        client.authorization = auth
+        @cached_client_expiration = Time.now + 1800
+        @cached_client = client
+      end
+      # @return [Hash] {id:, creation_time:, last_modified_time:, location:, num_bytes:, num_rows:}
+      #
+      # creation_time [Integer] milli sec
+      # last_modified_time [Integer] milli sec
+      def get_table(project: nil, dataset:, table:)
+        project ||= self.project
+        begin
+          $logger.debug { "Get table... #{project}:#{dataset}.#{table}" }
+          response = client.get_table(project, dataset, table)
+        rescue Google::Apis::ServerError, Google::Apis::ClientError, Google::Apis::AuthorizationError => e
+          if e.status_code == 404 # not found
+            raise NotFoundError, "Table #{project}:#{dataset}.#{table} is not found"
+          end
+          response = {status_code: e.status_code, message: e.message, error_class: e.class}
+          raise Error, "Failed to get_table(#{project}, #{dataset}, #{table}), response:#{response}"
+        end
+        result = {
+          id: response.id, # project:dataset.table
+          creation_time: response.creation_time.to_i, # millisec
+          last_modified_time: response.last_modified_time.to_i, # millisec
+          location: response.location,
+          num_bytes: response.num_bytes.to_i,
+          num_rows: response.num_rows.to_i,
+        }
+      end
+      # @return [Array] [partition_id, creation_time, last_modified_time]
+      #
+      # partition_id [String] partition id such as "20160307"
+      # creation_time [Integer] milli sec
+      # last_modified_time [Integer] milli sec
+      def get_partitions_summary(project: nil, dataset:, table:, limit: nil)
+        project ||= self.project
+        limit_stmt = limit ? " LIMIT #{limit.to_i}" : ""
+        result = query(
+          "select partition_id,creation_time,last_modified_time " \
+          "from [#{project}:#{dataset}.#{table}$__PARTITIONS_SUMMARY__] " \
+          "order by partition_id asc#{limit_stmt}"
+        )
+        result[:rows].map {|r| v = r[:f].map {|c| c[:v] }; [v[0], v[1].to_i, v[2].to_i] }
+      end
+      def project
+        @project ||= ENV['GOOGLE_PROJECT'] || @connection_info.fetch(:project, nil) || credentials['project_id']
+        @project ||= credentials['client_email'].chomp('.iam.gserviceaccount.com').split('@').last if credentials['client_email']
+        @project ||= project_default
+      end
+      private
+      def query(q, options = {})
+        started = Time.now
+        current_row = 0
+        body  = {
+          job_reference: {
+            project_id: project,
+            job_id: "job_#{SecureRandom.uuid}",
+          },
+          configuration: {
+            query: {
+              query: q,
+              use_legacy_sql: true,
+              use_query_cache: true,
+            },
+            dry_run: options[:dry_run],
+          },
+        }
+        opts = {}
+        $logger.info { "insert_job(#{project}, #{body}, #{opts})" }
+        job_res = client.insert_job(project, body, opts)
+        if options[:dry_run]
+          {
+            totalRows: nil,
+            totalBytesProcessed: job_res.statistics.query.total_bytes_processed,
+            cacheHit: job_res.statistics.query.cache_hit,
+          }
+        else
+          job_id = job_res.job_reference.job_id
+          res = {}
+          while true
+            res = JSON.parse(client.get_job_query_results(
+              project,
+              job_id,
+            ), symbolize_names: true)
+            break if res[:jobComplete]
+            sleep 3
+            if (Time.now - started).to_i > HARD_TIMEOUT_SEC
+              raise RuntimeError.new("Query is timeout")
+            end
+          end
+          if res[:rows]
+            # res[:rows].each(&block)
+            current_row += res[:rows].size
+          end
+          total_rows = res[:totalRows].to_i
+          while current_row < total_rows
+            res = JSON.parse(client.get_job_query_results(
+              project,
+              job_id,
+              start_index: current_row
+            ), symbolize_names: true)
+            if res[:rows]
+              res[:rows].each(&block)
+              current_row += res[:rows].size
+            end
+          end
+          res
+        end
+      end
+      # compute_engine, authorized_user, service_account
+      def auth_method
+        @auth_method ||= ENV['AUTH_METHOD'] || @connection_info.fetch(:auth_method, nil) || credentials['type'] || 'compute_engine'
+      end
+      def credentials
+        JSON.parse(@connection_info.fetch(:credentials, nil) || File.read(credentials_file))
+      end
+      def credentials_file
+        @credentials_file ||= File.expand_path(
+          # ref. https://developers.google.com/identity/protocols/application-default-credentials
+          ENV['GOOGLE_APPLICATION_CREDENTIALS'] ||
+          @connection_info.fetch(:credentials_file, nil) ||
+          (File.exist?(global_application_default_credentials_file) ? global_application_default_credentials_file : application_default_credentials_file)
+        )
+      end
+      def application_default_credentials_file
+        @application_default_credentials_file ||= File.expand_path("~/.config/gcloud/application_default_credentials.json")
+      end
+      def global_application_default_credentials_file
+        @global_application_default_credentials_file ||= '/etc/google/auth/application_default_credentials.json'
+      end
+      def config_default_file
+        File.expand_path('~/.config/gcloud/configurations/config_default')
+      end
+      def config_default
+        # {core:{account:'xxx',project:'xxx'},compute:{zone:'xxx}}
+        @config_default ||= File.readable?(config_default_file) ? IniFile.load(config_default_file).to_hash : {}
+      end
+      def service_account_default
+        (config_default[:core] || {})[:account]
+      end
+      def project_default
+        (config_default[:core] || {})[:project]
+      end
+      def zone_default
+        (config_default[:compute] || {})[:zone]
+      end
+      def service_account
+        @service_account ||= ENV['GOOGLE_SERVICE_ACCOUNT'] || @connection_info.fetch(:service_account, nil) || credentials['client_email'] || service_account_default
+      end
+      def retries
+        @retries ||= ENV['RETRIES'] || @connection_info.fetch(:retries, nil) || $setting.dig(:bigquery, :retries) || 5
+      end
+      def timeout_sec
+        @timeout_sec ||= ENV['TIMEOUT_SEC'] || @connection_info.fetch(:timeout_sec, nil) || $setting.dig(:bigquery, :timeout_sec) || 300
+      end
+      def open_timeout_sec
+        @open_timeout_sec ||= ENV['OPEN_TIMEOUT_SEC'] || @connection_info.fetch(:open_timeout_sec, nil) || $setting.dig(:bigquery, :open_timeout_sec) || 300
+      end
+    end
+  end
+end

data/lib/triglav/agent/bigquery/monitor.rb ADDED Viewed

@@ -0,0 +1,230 @@
+require 'triglav/agent/base/monitor'
+require 'uri'
+require 'cgi'
+require 'securerandom'
+module Triglav::Agent
+  module Bigquery
+    class Monitor < Base::Monitor
+      attr_reader :connection, :resource_uri_prefix, :resource
+      # @param [Triglav::Agent::Bigquery::Connection] connection
+      # @param [String] resource_uri_prefix
+      # @param [TriglavClient::ResourceResponse] resource
+      # resource:
+      #   uri: https://bigquery.cloud.google.com/table/project:dataset.table
+      #   unit: 'daily', 'hourly', or 'singular'
+      #   timezone: '+09:00'
+      #   span_in_days: 32
+      def initialize(connection, resource_uri_prefix, resource)
+        @connection = connection
+        @resource_uri_prefix = resource_uri_prefix
+        @resource = resource
+        @status = Triglav::Agent::Status.new(resource_uri_prefix, resource.uri)
+      end
+      def process
+        unless resource_valid?
+          $logger.warn { "Broken resource: #{resource.to_s}" }
+          return nil
+        end
+        $logger.debug { "Start process #{resource.uri}" }
+        events, new_last_modified_times = get_events
+        $logger.debug { "Finish process #{resource.uri}" }
+        return nil if events.nil? || events.empty?
+        yield(events) if block_given? # send_message
+        update_status_file(new_last_modified_times)
+        true
+      end
+      private
+      def last_modified_times
+        @last_modified_times ||= get_last_modified_times
+      end
+      def get_events
+        if partitioned_table?
+          new_last_modified_times = get_new_last_modified_times_for_partitioned_table
+        else
+          new_last_modified_times = get_new_last_modified_times_for_non_partitioned_table
+        end
+        latest_tables = select_latest_tables(new_last_modified_times)
+        events = build_events(latest_tables)
+        [events, new_last_modified_times]
+      rescue => e
+        $logger.warn { "#{e.class} #{e.message} #{e.backtrace.join("\n  ")}" }
+        nil
+      end
+      def update_status_file(last_modified_times)
+        last_modified_times[:max] = last_modified_times.values.max
+        @status.set(last_modified_times)
+      end
+      def get_last_modified_times
+        max_last_modified_time = @status.getsetnx([:max], $setting.debug? ? 0 : get_current_time)
+        last_modified_times = @status.get
+        removes = last_modified_times.keys - tables.keys
+        appends = tables.keys - last_modified_times.keys
+        removes.each {|table| last_modified_times.delete(table) }
+        appends.each {|table| last_modified_times[table] = max_last_modified_time }
+        last_modified_times
+      end
+      def get_current_time
+        (Time.now.to_f * 1000).to_i # msec
+      end
+      def resource_valid?
+        self.class.resource_valid?(resource)
+      end
+      def self.resource_valid?(resource)
+        resource_unit_valid?(resource) && !resource.timezone.nil? && !resource.span_in_days.nil?
+      end
+      # Combining two or more units is not allowed because
+      # * hourly should have %d, %H
+      # * daily should have %d, but not have %H
+      # * singular should not have %d
+      # These conditions conflict.
+      def self.resource_unit_valid?(resource)
+        units = resource.unit.split(',').sort
+        return false if units.size >= 2
+        if units.include?('hourly')
+          return false unless resource.uri.match(/%H/)
+        end
+        # if units.include?('daily')
+        #   return false unless resource.uri.match(/%d/)
+        # end
+        if units.include?('singular')
+          return false if resource.uri.match(/%[YmdH]/)
+        end
+        true
+      end
+      def dates
+        return @dates if @dates
+        now = Time.now.localtime(resource.timezone)
+        @dates = resource.span_in_days.times.map do |i|
+          (now - (i * 86000)).to_date
+        end
+      end
+      def project_dataset_table
+        @project_dataset_table ||= resource.uri.split('/').last
+      end
+      def project
+        @project ||= project_dataset_table.split(':').first
+      end
+      def dataset
+        @dataset ||= project_dataset_table.split(':').last.chomp(".#{table}")
+      end
+      def table
+        @table ||= project_dataset_table.split('.').last
+      end
+      def partitioned_table?
+        table.include?('$')
+      end
+      def table_without_partition
+        @table_without_partition ||= table.split('$').first
+      end
+      def dates
+        return @dates if @dates
+        now = Time.now.localtime(resource.timezone)
+        @dates = resource.span_in_days.times.map do |i|
+          (now - (i * 86000)).to_date
+        end
+      end
+      def tables
+        return @tables if @tables
+        tables = {}
+        # If table becomes same, use newer date
+        case resource.unit
+        when 'hourly'
+          dates.each do |date|
+            date_time = date.to_time
+            (0..23).each do |hour|
+              _table = (date_time + hour * 3600).strftime(table)
+              tables[_table.to_sym] = [date, hour]
+            end
+          end
+        when 'daily'
+          hour = 0
+          dates.each do |date|
+            _table = date.strftime(table)
+            tables[_table.to_sym] = [date, hour]
+          end
+        when 'singular'
+          tables[table.to_sym] = [nil, nil]
+        end
+        @tables = tables
+      end
+      def get_new_last_modified_times_for_partitioned_table
+        rows = connection.get_partitions_summary(
+          project: project, dataset: dataset, table: table_without_partition, limit: resource.span_in_days
+        )
+        new_last_modified_times = {}
+        rows.each do |partition, creation_time, last_modified_time|
+          new_last_modified_times["#{table_without_partition}$#{partition}".to_sym] = last_modified_time
+        end
+        new_last_modified_times
+      end
+      def get_new_last_modified_times_for_non_partitioned_table
+        new_last_modified_times = {}
+        tables.each do |table, date_hour|
+          begin
+            result = connection.get_table(project: project, dataset: dataset, table: table)
+            new_last_modified_times[table.to_sym] = result[:last_modified_time]
+          rescue Connection::NotFoundError => e
+            $logger.debug { "#{project}:#{dataset}.#{table.to_s} #=> does not exist" }
+          rescue Connection::Error => e
+            $logger.warn { "#{project}:#{dataset}.#{table.to_s} #=> #{e.class} #{e.message}" }
+          end
+        end
+        new_last_modified_times
+      end
+      def select_latest_tables(new_last_modified_times)
+        new_last_modified_times.select do |table, last_modified_time|
+          is_newer = last_modified_time > (last_modified_times[table] || 0)
+          $logger.debug { "#{project}:#{dataset}.#{table} #=> latest_modified_time:#{last_modified_time}, is_newer:#{is_newer}" }
+          is_newer
+        end
+      end
+      def build_events(latest_tables)
+        latest_tables.map do |table, last_modified_time|
+          date, hour = date_hour = tables[table]
+          {
+            uuid: SecureRandom.uuid,
+            resource_uri: resource.uri,
+            resource_unit: resource.unit,
+            resource_time: date_hour_to_i(date, hour, resource.timezone),
+            resource_timezone: resource.timezone,
+            payload: {table: table.to_sym, last_modified_time: last_modified_time}.to_json, # msec
+          }
+        end
+      end
+      def date_hour_to_i(date, hour, timezone)
+        return 0 if date.nil?
+        Time.strptime("#{date.to_s} #{hour.to_i} #{timezone}", '%Y-%m-%d %H %z').to_i
+      end
+    end
+  end
+end

data/lib/triglav/agent/bigquery/version.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module Triglav
+  module Agent
+    module Bigquery
+      VERSION = "1.0.0.rc1"
+    end
+  end
+end

data/lib/triglav/agent/bigquery.rb ADDED Viewed

@@ -0,0 +1,11 @@
+# Entry point of the library. Declares the Triglav::Agent::Bigquery namespace
+# up front, before the require statements that follow load the connection,
+# version and monitor files.
+module Triglav
+  module Agent
+    module Bigquery
+    end
+  end
+end
+require 'triglav-agent'
+require 'triglav/agent/bigquery/connection'
+require 'triglav/agent/bigquery/version'
+require 'triglav/agent/bigquery/monitor'

data/prepare.sh ADDED Viewed

@@ -0,0 +1,3 @@
+#!/bin/sh
+test -f config.yml || cp example/config.yml config.yml
+test -f .env || cp example/example.env .env

data/start.sh ADDED Viewed

@@ -0,0 +1,8 @@
+#!/bin/sh
+ABSPATH=$(cd $(dirname $0) && pwd)/$(basename $0)
+APP_ROOT=$(dirname $ABSPATH)
+if [ -z "${SHARED_ROOT}" ]; then SHARED_ROOT=.; fi
+CMD="bundle exec triglav-agent-bigquery --dotenv -c config.yml --status ${SHARED_ROOT}/status.yml --token ${SHARED_ROOT}/token.yml"
+echo $CMD
+$CMD

data/triglav-agent-bigquery.gemspec ADDED Viewed

@@ -0,0 +1,33 @@
+# coding: utf-8
+lib = File.expand_path('../lib', __FILE__)
+$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
+require 'triglav/agent/bigquery/version'
+Gem::Specification.new do |spec|
+  spec.name          = "triglav-agent-bigquery"
+  spec.version       = Triglav::Agent::Bigquery::VERSION
+  spec.authors       = ["Triglav Team"]
+  spec.email         = ["triglav_admin_my@dena.jp"]
+  spec.summary       = %q{BigQuery agent for triglav, data-driven workflow tool.}
+  spec.description   = %q{BigQuery agent for triglav, data-driven workflow tool.}
+  spec.homepage      = "https://github.com/triglav-dataflow/triglav-agent-bigquery"
+  spec.license       = "MIT"
+  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
+  spec.bindir        = "exe"
+  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
+  spec.require_paths = ["lib"]
+  spec.add_dependency "triglav-agent"
+  spec.add_dependency "triglav_client"
+  spec.add_dependency "google-api-client"
+  spec.add_dependency "ini_file"
+  spec.add_development_dependency "bundler", "~> 1.11"
+  spec.add_development_dependency "rake", "~> 10.0"
+  spec.add_development_dependency "test-unit"
+  spec.add_development_dependency "test-unit-rr"
+  spec.add_development_dependency "test-unit-power_assert"
+  spec.add_development_dependency "timecop"
+end

metadata ADDED Viewed

@@ -0,0 +1,205 @@
+--- !ruby/object:Gem::Specification
+name: triglav-agent-bigquery
+version: !ruby/object:Gem::Version
+  version: 1.0.0.rc1
+platform: ruby
+authors:
+- Triglav Team
+autorequire:
+bindir: exe
+cert_chain: []
+date: 2017-03-14 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: triglav-agent
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: triglav_client
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: google-api-client
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: ini_file
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: bundler
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.11'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '1.11'
+- !ruby/object:Gem::Dependency
+  name: rake
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - "~>"
+      - !ruby/object:Gem::Version
+        version: '10.0'
+- !ruby/object:Gem::Dependency
+  name: test-unit
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: test-unit-rr
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: test-unit-power_assert
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+- !ruby/object:Gem::Dependency
+  name: timecop
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+description: BigQuery agent for triglav, data-driven workflow tool.
+email:
+- triglav_admin_my@dena.jp
+executables:
+- triglav-agent-bigquery
+extensions: []
+extra_rdoc_files: []
+files:
+- ".gitignore"
+- ".rspec"
+- ".travis.yml"
+- CODE_OF_CONDUCT.md
+- Gemfile
+- LICENSE.txt
+- README.md
+- Rakefile
+- bin/console
+- bin/setup
+- example/config.yml
+- example/example.env
+- exe/triglav-agent-bigquery
+- lib/triglav/agent/bigquery.rb
+- lib/triglav/agent/bigquery/connection.rb
+- lib/triglav/agent/bigquery/monitor.rb
+- lib/triglav/agent/bigquery/version.rb
+- prepare.sh
+- start.sh
+- triglav-agent-bigquery.gemspec
+homepage: https://github.com/triglav-dataflow/triglav-agent-bigquery
+licenses:
+- MIT
+metadata: {}
+post_install_message:
+rdoc_options: []
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: '0'
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 1.3.1
+requirements: []
+rubyforge_project:
+rubygems_version: 2.5.2
+signing_key:
+specification_version: 4
+summary: BigQuery agent for triglav, data-driven workflow tool.
+test_files: []