export_to_gcloud 0.9.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA1:
+   metadata.gz: b10b7cddc6441b96f235f33936eb9560e9662f86
+   data.tar.gz: 91f284abd485f6c6bbf8e7cd2ff99d49e3983157
+ SHA512:
+   metadata.gz: ac65925a3d33b4b0d081e170a5a0f0a7d29d86e08a7490b9703b05e9163628514ef815db66c9c09422e50ab89d73376753b7e402935062b915a26e02eefb1129
+   data.tar.gz: 0a3b60ce3cfdc1c6523a9700701f502d0a6cf8e7329be4bc63bb72d4c41e373e7294ccf31fa2ccec7f9d3b3de5ecaee62cb6ad37b6ae5ed93422e54fdbb982bd
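
To verify a downloaded copy of the gem against these digests, unpack the .gem archive (it is a plain tar file containing metadata.gz and data.tar.gz) and recompute the hashes — a minimal sketch, assuming the two files sit in the current directory:

require 'digest'

# e.g. after: tar -xf export_to_gcloud-0.9.0.gem
%w[metadata.gz data.tar.gz].each do |name|
  puts "#{name} SHA1:   #{Digest::SHA1.file(name).hexdigest}"
  puts "#{name} SHA512: #{Digest::SHA512.file(name).hexdigest}"
end
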
data/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2016 Ondřej Želazko
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
data/lib/export_to_gcloud/exporter/context.rb ADDED
@@ -0,0 +1,49 @@
+ class ExportToGcloud::Exporter::Context
+
+   attr_reader :client
+
+   OPTIONS = %i[dump_path storage_prefix bucket dataset].freeze
+
+   def initialize client, **opts
+     @client = client
+     set opts
+   end
+
+   def set **opts
+     OPTIONS.each do |key|
+       value = opts[key]
+       send "set_#{key}", value if value
+     end
+     self
+   end
+
+   def set_dump_path path
+     @dump_path = Pathname.new path
+   end
+
+   def set_storage_prefix prefix
+     @storage_prefix = prefix
+   end
+
+   def set_bucket bucket
+     bucket = client.storage.bucket bucket if String === bucket
+     @bucket = bucket
+   end
+
+   def set_dataset dataset
+     dataset = client.bigquery.dataset dataset if String === dataset
+     @dataset = dataset
+   end
+
+   OPTIONS.each do |key|
+     define_method key do
+       value = instance_variable_get "@#{key}"
+       value || raise("Undefined value for #{key} in exporter options!")
+     end
+   end
+
+   def copy
+     self.class.new client, OPTIONS.inject({}){|h, k| h[k] = instance_variable_get "@#{k}"; h}
+   end
+
+ end
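
A Context bundles the Gcloud client with the four OPTIONS above; bucket and dataset accept either names (resolved through the client) or already-fetched objects. A minimal sketch, assuming `ExportToGcloud.setup` has already been called and using placeholder names:

context = ExportToGcloud::Exporter::Context.new ExportToGcloud.client,
  dump_path:      '/tmp/exports',   # local scratch dir for CSV dumps
  storage_prefix: 'exports/',       # prepended to storage object names
  bucket:         'my-bucket',      # resolved via client.storage.bucket
  dataset:        'my_dataset'      # resolved via client.bigquery.dataset

context.dump_path   # => Pathname for /tmp/exports
context.copy        # independent context with the same options
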
data/lib/export_to_gcloud/exporter/definition.rb ADDED
@@ -0,0 +1,47 @@
+ class ExportToGcloud::Exporter::Definition < OpenStruct
+
+   def initialize exporter_type, attrs
+     super attrs.merge!(type: exporter_type)
+   end
+
+   def validate!
+     (String === name && !name.empty?) || raise('`name` must be defined!')
+     Proc === bq_schema || raise('`bq_schema` must be defined as a Proc!')
+     data || raise('`data` must be defined!')
+     type.validate_definition! self if type.respond_to? 'validate_definition!'
+   end
+
+   def get_data *args
+     Proc === data ? data.call(*args) : data
+   end
+
+   def get_bq_table_name
+     bq_table_name || name
+   end
+
+   def self.set_last_definition klass, attrs={}, &block
+     last_definition = new klass, attrs
+     block.call last_definition if block
+
+     last_definition.validate!
+     @last_definition = last_definition
+   end
+
+   def self.load_definition name, finder
+     file_path = finder.call name
+     load file_path
+     definition = @last_definition
+     @last_definition = nil
+
+     unless definition
+       raise("File #{file_path.to_s} must define exporter for '#{name}'!")
+     end
+
+     unless definition.name == name
+       raise "File #{file_path.to_s} defines '#{definition.name}' instead of '#{name}'"
+     end
+
+     definition
+   end
+
+ end
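
Definitions are plain Ruby files located by a resolver and executed with `load`; each file is expected to end up calling `set_last_definition`, normally through an exporter class's `define` (shown in exporter.rb below). A hypothetical exports/products.rb, assuming the gcloud 0.5 schema DSL (schema.integer, schema.string):

# exports/products.rb
ExportToGcloud::CSVExporter.define name: 'products' do |d|
  d.bq_schema = ->(schema) {
    schema.integer 'id', mode: :required
    schema.string  'title'
  }
  d.data = [[1, 'first'], [2, 'second']]
end

`validate!` then enforces the three contracts above: a non-empty String name, a Proc bq_schema, and some data (static or a Proc).
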
data/lib/export_to_gcloud/exporters/csv_exporter.rb ADDED
@@ -0,0 +1,18 @@
+
+ module ExportToGcloud
+
+   class CSVExporter < Exporter
+
+     def create_data_file! file, *part_data
+       data = @definition.get_data(*part_data)
+
+       csv_data = CSV.generate col_sep: ';', force_quotes: false do |csv|
+         data.each{|row| csv << row}
+       end
+
+       File.write file.to_path, csv_data
+     end
+
+   end
+
+ end
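
For reference, the writer above joins each row with `;` and leaves quoting to the CSV library — a standalone sketch of what ends up in the local dump file:

require 'csv'

rows = [[1, 'red'], [2, 'blue']]
csv  = CSV.generate col_sep: ';', force_quotes: false do |out|
  rows.each{|row| out << row}
end
csv   # => "1;red\n2;blue\n"
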
data/lib/export_to_gcloud/exporters/exporter.rb ADDED
@@ -0,0 +1,112 @@
+
+ module ExportToGcloud
+
+   class Exporter
+
+     def initialize definition, context
+       @definition = definition
+       @context = context
+
+       @parts = []
+       case definition.parts
+       when Array then definition.parts.each{|label, *part_args| add_data_part *part_args, label: label}
+       when Proc then definition.parts.call self
+       end
+     end
+
+     def local_file_path label
+       @context.dump_path.join "#{@definition.name}#{prepend_underscore label}.csv"
+     end
+
+     def storage_file_path label
+       prefix = @definition.storage_prefix || @context.storage_prefix
+       "#{prefix}#{@definition.name}#{prepend_underscore label}.csv"
+     end
+
+     def add_data_part *args, label: nil
+       args.unshift(label ? label.to_s : (@parts.length+1).to_s)
+       @parts << args
+     end
+
+     def process_all_parts! recreate_table=true
+       add_data_part label: 'all' if @parts.empty?
+       recreate_bq_table! if recreate_table
+
+       @parts.map{|*args| process_part! *args}
+     end
+
+     def process_part! label, *part_args
+       file = local_file_path label
+       create_data_file! file, *part_args
+
+       storage_name = storage_file_path label
+       gcloud_file = upload_file! file, storage_name
+       start_load_job gcloud_file
+     end
+
+     def create_data_file! file, *part_data
+       File.write file.to_path, @definition.get_data(*part_data)
+     end
+
+     def upload_file!(file, storage_name)
+       file = compress_file! file
+       gcloud_file = @context.bucket.create_file file, storage_name, chunk_size: 2**21 # 2MB
+       file.delete
+       gcloud_file
+     end
+
+     def get_storage_files
+       @parts.map do |label, *_|
+         @context.bucket.file storage_file_path(label)
+       end.compact
+     end
+
+     def bq_table
+       unless defined? @bq_table
+         @bq_table = @context.dataset.table @definition.get_bq_table_name
+       end
+       @bq_table
+     end
+
+     def recreate_bq_table!
+       bq_table.delete if bq_table
+       @bq_table = @context.dataset.create_table @definition.get_bq_table_name, &@definition.bq_schema
+     end
+
+     def start_load_job gcloud_file, **_load_settings
+       load_settings = {
+         format: 'csv',
+         quote: '"',
+         delimiter: ';',
+         create: 'never',
+         write: 'append',
+         max_bad_records: 0
+       }
+       load_settings.merge! _load_settings unless _load_settings.empty?
+       bq_table.load gcloud_file, **load_settings
+     end
+
+     def self.define **kwargs, &block
+       ::ExportToGcloud::Exporter::Definition.set_last_definition self, kwargs, &block
+     end
+
+     private
+
+     def compress_file!(original_file)
+       err = %x(pigz -f9 #{original_file.to_path} 2>&1)
+       compressed_file = Pathname.new "#{original_file.to_path}.gz"
+       raise "Compression of #{original_file.to_path} failed: #{err}" unless compressed_file.exist?
+       original_file.delete if original_file.exist?
+       compressed_file
+     end
+
+     def prepend_underscore text
+       "_#{text}" if String === text && !text.empty?
+     end
+
+   end
+
+ end
+
+ require_relative '../exporter/definition'
+ require_relative '../exporter/context'
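
The `parts` hook splits one export into several files and load jobs: each registered part becomes its own CSV dump, storage object and BigQuery append. A hypothetical multi-part definition (rows_for_day is a placeholder helper):

ExportToGcloud::CSVExporter.define name: 'events' do |d|
  d.bq_schema = ->(s) { s.integer 'id'; s.string 'payload' }
  d.data      = ->(day) { rows_for_day day }   # hypothetical
  d.parts     = ->(exporter) {
    %w[monday tuesday].each{|day| exporter.add_data_part day, label: day}
  }
end

Running `process_all_parts!` recreates the BigQuery table once and returns one load job per part — here events_monday.csv and events_tuesday.csv (gzip-compressed before upload). Note that `compress_file!` shells out to pigz, so that binary must be on the PATH.
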
data/lib/export_to_gcloud/exporters/pg_exporter.rb ADDED
@@ -0,0 +1,37 @@
+
+ module ExportToGcloud
+
+   class PGExporter < Exporter
+
+     def create_data_file! file, *part_data
+       sql = @definition.get_data(*part_data)
+
+       schema = ::Gcloud::Bigquery::Table::Schema.new nil
+       @definition.bq_schema.call schema
+       string_fields = schema.fields.select{|f| f['type']=='STRING'}.map{|f| f['name']}
+
+       force_quote = if string_fields.empty?
+         ''
+       else
+         ", FORCE_QUOTE (#{string_fields.join ', '})"
+       end
+       sql = "COPY (#{sql}) TO '#{file.to_path}' WITH (FORMAT CSV, DELIMITER ';', QUOTE '\"'#{force_quote});"
+
+
+       executor = @definition.get_sql_executor || self.class.default_executor
+       executor.call sql
+     end
+
+     def self.validate_definition! definition
+       definition.get_sql_executor || default_executor || raise('`sql_executor` needs to be defined!')
+     end
+
+     class << self
+
+       attr_accessor :default_executor
+
+     end
+
+   end
+
+ end
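
PGExporter renders the definition's SQL into a server-side COPY ... TO statement, so the executor is any callable that can run it, and the PostgreSQL server itself must be able to write to dump_path. A hypothetical wiring using the pg gem:

require 'pg'

connection = PG.connect dbname: 'shop'
ExportToGcloud::PGExporter.default_executor = ->(sql) { connection.exec sql }

# exports/orders.rb
ExportToGcloud::PGExporter.define name: 'orders' do |d|
  d.bq_schema = ->(s) { s.integer 'id'; s.string 'state' }
  d.data      = "SELECT id, state FROM orders"
end

STRING columns are force-quoted in the COPY output, presumably so empty strings stay distinguishable from NULLs when BigQuery reads the CSV back.
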
data/lib/export_to_gcloud/library.rb ADDED
@@ -0,0 +1,67 @@
+ require 'gcloud'
+ require 'gcloud/bigquery'
+
+ # large files uploading
+ require 'httpclient'
+ Faraday.default_adapter = :httpclient
+
+ # monkeypatch :/ some issue in google-api
+ # see http://googlecloudplatform.github.io/gcloud-ruby/docs/master/Gcloud/Storage.html
+ # -> A note about large uploads
+ require 'google/api_client'
+ Faraday::Response.register_middleware gzip: Faraday::Response::Middleware
+
+ module ExportToGcloud
+
+   def self.definitions_resolver= proc
+     @definitions_resolver = proc
+   end
+
+   # waits for BigQuery jobs
+   # - send a block to do something with failed
+   def self.wait_for_load_jobs(jobs, &block)
+     jobs_left = jobs.dup
+     failed = []
+     sleeper = ->(_retries) {sleep 2 * _retries + 5}
+     retries = 0
+
+     until jobs_left.empty?
+       sleeper.call retries
+       retries += 1
+       jobs_left.each &:reload!
+       jobs_left.delete_if do |j|
+         if j.done?
+           failed << {id: j.job_id, error: j.error, sources: j.sources} if j.failed?
+           true
+         end
+       end
+     end
+
+     block.call failed unless failed.empty?
+   end
+
+   def self.get_exporter name, context
+     name = name.to_s
+
+     @definitions ||= {}
+     unless @definitions.has_key? name
+       @definitions[name] = ::ExportToGcloud::Exporter::Definition.load_definition name, @definitions_resolver
+     end
+
+     definition = @definitions[name]
+     definition.type.new definition, context
+   end
+
+   def self.create_context **opts
+     ::ExportToGcloud::Exporter::Context.new client, opts
+   end
+
+ end
+
+ require 'pathname'
+ require 'ostruct'
+ require 'csv'
+
+ require_relative 'exporters/exporter'
+ require_relative 'exporters/csv_exporter'
+ require_relative 'exporters/pg_exporter'
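
Putting the pieces together — a hypothetical end-to-end run, assuming the client has been configured via ExportToGcloud.setup (next file) and definition files live under exports/:

ExportToGcloud.definitions_resolver = ->(name) { "exports/#{name}.rb" }

context  = ExportToGcloud.create_context dump_path: '/tmp/exports',
                                         storage_prefix: 'exports/',
                                         bucket:  'my-bucket',
                                         dataset: 'my_dataset'
exporter = ExportToGcloud.get_exporter :orders, context
jobs     = exporter.process_all_parts!

ExportToGcloud.wait_for_load_jobs(jobs) do |failed|
  failed.each{|f| warn "load job #{f[:id]} failed: #{f[:error]}"}
end

The polling loop backs off linearly (5, 7, 9, ... seconds) and hands failed jobs to the block; note the block is only called when something actually failed.
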
data/lib/export_to_gcloud/version.rb ADDED
@@ -0,0 +1,5 @@
+ module ExportToGcloud
+
+   VERSION = '0.9.0'
+
+ end
data/lib/export_to_gcloud.rb ADDED
@@ -0,0 +1,16 @@
+ require_relative 'export_to_gcloud/version'
+
+ module ExportToGcloud
+
+   def self.setup project_name:, config_file:, definitions_resolver: nil
+     require_relative 'export_to_gcloud/library'
+
+     self.definitions_resolver = definitions_resolver if definitions_resolver
+     @client = ::Gcloud.new project_name, config_file
+   end
+
+   def self.client
+     @client || raise('Gcloud client not present. call ExportToGcloud#setup first.')
+   end
+
+ end
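
Boot sketch — project id and keyfile path are placeholders; `::Gcloud.new project_name, config_file` is exactly the call `setup` makes:

require 'export_to_gcloud'

ExportToGcloud.setup project_name: 'my-project',
                     config_file:  '/secrets/gcloud-keyfile.json',
                     definitions_resolver: ->(name) { "exports/#{name}.rb" }
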
metadata ADDED
@@ -0,0 +1,80 @@
+ --- !ruby/object:Gem::Specification
+ name: export_to_gcloud
+ version: !ruby/object:Gem::Version
+   version: 0.9.0
+ platform: ruby
+ authors:
+ - Ondřej Želazko
+ autorequire:
+ bindir: bin
+ cert_chain: []
+ date: 2016-11-07 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: gcloud
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.5.0
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: 0.5.0
+ - !ruby/object:Gem::Dependency
+   name: httpclient
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.8'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - "~>"
+       - !ruby/object:Gem::Version
+         version: '2.8'
+ description: A simple helper to export data to BigQuery via Google Cloud Storage
+ email: zelazk.o@email.cz
+ executables: []
+ extensions: []
+ extra_rdoc_files: []
+ files:
+ - LICENSE
+ - lib/export_to_gcloud.rb
+ - lib/export_to_gcloud/exporter/context.rb
+ - lib/export_to_gcloud/exporter/definition.rb
+ - lib/export_to_gcloud/exporters/csv_exporter.rb
+ - lib/export_to_gcloud/exporters/exporter.rb
+ - lib/export_to_gcloud/exporters/pg_exporter.rb
+ - lib/export_to_gcloud/library.rb
+ - lib/export_to_gcloud/version.rb
+ homepage: https://github.com/doooby/export_to_gcloud
+ licenses:
+ - MIT
+ metadata: {}
+ post_install_message:
+ rdoc_options: []
+ require_paths:
+ - lib
+ required_ruby_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ required_rubygems_version: !ruby/object:Gem::Requirement
+   requirements:
+   - - ">="
+     - !ruby/object:Gem::Version
+       version: '0'
+ requirements: []
+ rubyforge_project:
+ rubygems_version: 2.5.1
+ signing_key:
+ specification_version: 4
+ summary: Exporter to BigQuery
+ test_files: []
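
To pull the gem with the dependency bounds above (gcloud ~> 0.5.0, httpclient ~> 2.8):

# Gemfile
gem 'export_to_gcloud', '~> 0.9.0'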