export_to_gcloud 0.9.0
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/lib/export_to_gcloud/exporter/context.rb +49 -0
- data/lib/export_to_gcloud/exporter/definition.rb +47 -0
- data/lib/export_to_gcloud/exporters/csv_exporter.rb +18 -0
- data/lib/export_to_gcloud/exporters/exporter.rb +112 -0
- data/lib/export_to_gcloud/exporters/pg_exporter.rb +37 -0
- data/lib/export_to_gcloud/library.rb +67 -0
- data/lib/export_to_gcloud/version.rb +5 -0
- data/lib/export_to_gcloud.rb +16 -0
- metadata +80 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
---
SHA1:
  metadata.gz: b10b7cddc6441b96f235f33936eb9560e9662f86
  data.tar.gz: 91f284abd485f6c6bbf8e7cd2ff99d49e3983157
SHA512:
  metadata.gz: ac65925a3d33b4b0d081e170a5a0f0a7d29d86e08a7490b9703b05e9163628514ef815db66c9c09422e50ab89d73376753b7e402935062b915a26e02eefb1129
  data.tar.gz: 0a3b60ce3cfdc1c6523a9700701f502d0a6cf8e7329be4bc63bb72d4c41e373e7294ccf31fa2ccec7f9d3b3de5ecaee62cb6ad37b6ae5ed93422e54fdbb982bd
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2016 Ondřej Želazko

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
data/lib/export_to_gcloud/exporter/context.rb
ADDED
@@ -0,0 +1,49 @@
class ExportToGcloud::Exporter::Context

  attr_reader :client

  OPTIONS = %i[dump_path storage_prefix bucket dataset].freeze

  def initialize client, **opts
    @client = client
    set opts
  end

  def set **opts
    OPTIONS.each do |key|
      value = opts[key]
      send "set_#{key}", value if value
    end
    self
  end

  def set_dump_path path
    @dump_path = Pathname.new path
  end

  def set_storage_prefix prefix
    @storage_prefix = prefix
  end

  def set_bucket bucket
    bucket = client.storage.bucket bucket if String === bucket
    @bucket = bucket
  end

  def set_dataset dataset
    dataset = client.bigquery.dataset dataset if String === dataset
    @dataset = dataset
  end

  OPTIONS.each do |key|
    define_method key do
      value = instance_variable_get "@#{key}"
      value || raise("Undefined value for #{key} in exporter options!")
    end
  end

  def copy
    self.class.new client, OPTIONS.inject({}){|h, k| h[k] = instance_variable_get "@#{k}"; h}
  end

end
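For orientation: a minimal sketch of building a Context by hand (assuming ExportToGcloud.setup has already run; the paths, bucket and dataset names below are made-up placeholders). Plain strings for bucket/dataset are resolved through the client, while already-resolved objects are accepted as-is.

# Sketch only -- every option value below is a hypothetical example.
context = ExportToGcloud::Exporter::Context.new ExportToGcloud.client,
  dump_path: '/tmp/exports',        # local directory for CSV dumps
  storage_prefix: 'daily/',         # prefix for object names in the bucket
  bucket: 'exports-bucket',         # resolved via client.storage.bucket
  dataset: 'analytics'              # resolved via client.bigquery.dataset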
data/lib/export_to_gcloud/exporter/definition.rb
ADDED
@@ -0,0 +1,47 @@
class ExportToGcloud::Exporter::Definition < OpenStruct

  def initialize exporter_type, attrs
    super attrs.merge!(type: exporter_type)
  end

  def validate!
    (String === name && !name.empty?) || raise('`name` must be defined!')
    Proc === bq_schema || raise('`bq_schema` must be defined as a Proc!')
    data || raise('`data` must be defined!')
    type.validate_definition! self if type.respond_to? 'validate_definition!'
  end

  def get_data *args
    Proc === data ? data.call(*args) : data
  end

  def get_bq_table_name
    bq_table_name || name
  end

  def self.set_last_definition klass, attrs={}, &block
    last_definition = new klass, attrs
    block.call last_definition if block

    last_definition.validate!
    @last_definition = last_definition
  end

  def self.load_definition name, finder
    file_path = finder.call name
    load file_path
    definition = @last_definition
    @last_definition = nil

    unless definition
      raise("File #{file_path.to_s} must define exporter for '#{name}'!")
    end

    unless definition.name == name
      raise "File #{file_path.to_s} defines '#{definition.name}' instead of '#{name}'"
    end

    definition
  end

end
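Definitions live in standalone files located by the resolver and executed with `load`; each file is expected to call `define` on an exporter class, which funnels into set_last_definition above. A minimal sketch of such a file (the file name, rows and schema are made-up examples):

# Sketch of a hypothetical definitions/products.rb
ExportToGcloud::CSVExporter.define name: 'products' do |d|
  d.data = ->(*){ [[1, 'apple'], [2, 'pear']] }  # rows handed to the CSV writer
  d.bq_schema = ->(schema){                      # builds the BigQuery table schema
    schema.integer 'id'
    schema.string 'name'
  }
end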
data/lib/export_to_gcloud/exporters/csv_exporter.rb
ADDED
@@ -0,0 +1,18 @@

module ExportToGcloud

  class CSVExporter < Exporter

    def create_data_file! file, *part_data
      data = @definition.get_data(*part_data)

      csv_data = CSV.generate col_sep: ';', force_quotes: false do |csv|
        data.each{|row| csv << row}
      end

      File.write file.to_path, csv_data
    end

  end

end
data/lib/export_to_gcloud/exporters/exporter.rb
ADDED
@@ -0,0 +1,112 @@

module ExportToGcloud

  class Exporter

    def initialize definition, context
      @definition = definition
      @context = context

      @parts = []
      case definition.parts
        when Array then definition.parts.each{|label, *part_args| add_data_part *part_args, label: label}
        when Proc then definition.parts.call self
      end
    end

    def local_file_path label
      @context.dump_path.join "#{@definition.name}#{prepend_underscore label}.csv"
    end

    def storage_file_path label
      prefix = @definition.storage_prefix || @context.storage_prefix
      "#{prefix}#{@definition.name}#{prepend_underscore label}.csv"
    end

    def add_data_part *args, label:nil
      args.unshift(label ? label.to_s : (@parts.length+1).to_s)
      @parts << args
    end

    def process_all_parts! recreate_table=true
      add_data_part label: 'all' if @parts.empty?
      recreate_bq_table! if recreate_table

      @parts.map{|*args| process_part! *args}
    end

    def process_part! label, *part_args
      file = local_file_path label
      create_data_file! file, *part_args

      storage_name = storage_file_path label
      gcloud_file = upload_file! file, storage_name
      start_load_job gcloud_file
    end

    def create_data_file! file, *part_data
      File.write file.to_path, @definition.get_data(*part_data)
    end

    def upload_file!(file, storage_name)
      file = compress_file! file
      gcloud_file = @context.bucket.create_file file, storage_name, chunk_size: 2**21 # 2MB
      file.delete
      gcloud_file
    end

    def get_storage_files
      @parts.map do |label, *_|
        @context.bucket.file storage_file_path(label)
      end.compact
    end

    def bq_table
      unless defined? @bq_table
        @bq_table = @context.dataset.table @definition.get_bq_table_name
      end
      @bq_table
    end

    def recreate_bq_table!
      bq_table.delete if bq_table
      @bq_table = @context.dataset.create_table @definition.get_bq_table_name, &@definition.bq_schema
    end

    def start_load_job gcloud_file, **_load_settings
      load_settings = {
        format: 'csv',
        quote: '"',
        delimiter: ';',
        create: 'never',
        write: 'append',
        max_bad_records: 0
      }
      load_settings.merge! _load_settings unless _load_settings.empty?
      bq_table.load gcloud_file, **load_settings
    end

    def self.define **kwargs, &block
      ::ExportToGcloud::Exporter::Definition.set_last_definition self, kwargs, &block
    end

    private

    def compress_file!(original_file)
      err = %x(pigz -f9 #{original_file.to_path} 2>&1)
      compressed_file = Pathname.new "#{original_file.to_path}.gz"
      raise "Compression of #{original_file.to_path} failed: #{err}" unless compressed_file.exist?
      original_file.delete if original_file.exist?
      compressed_file
    end

    def prepend_underscore text
      "_#{text}" if String === text && !text.empty?
    end

  end

end

require_relative '../exporter/definition'
require_relative '../exporter/context'
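The parts mechanism splits one export into several files and load jobs. As initialize above shows, definition.parts may be a static Array of [label, *args] tuples, or a Proc that receives the exporter and calls add_data_part itself; each part's extra args are later passed through to get_data. A sketch inside a definition block (labels and args are illustrative):

# Both forms are equivalent in spirit:
d.parts = [[:eu, 'EU'], [:us, 'US']]    # static [label, *args] tuples
d.parts = ->(exporter){                 # or built programmatically
  (2014..2016).each{|y| exporter.add_data_part y, label: "y#{y}"}
}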
data/lib/export_to_gcloud/exporters/pg_exporter.rb
ADDED
@@ -0,0 +1,37 @@

module ExportToGcloud

  class PGExporter < Exporter

    def create_data_file! file, *part_data
      sql = @definition.get_data(*part_data)

      schema = ::Gcloud::Bigquery::Table::Schema.new nil
      @definition.bq_schema.call schema
      string_fields = schema.fields.select{|f| f['type']=='STRING'}.map{|f| f['name']}

      force_quote = if string_fields.empty?
        ''
      else
        ", FORCE_QUOTE (#{string_fields.join ', '})"
      end
      sql = "COPY (#{sql}) TO '#{file.to_path}' WITH (FORMAT CSV, DELIMITER ';', QUOTE '\"'#{force_quote});"

      executor = @definition.get_sql_executor || self.class.default_executor
      executor.call sql
    end

    def self.validate_definition! definition
      definition.get_sql_executor || default_executor || raise('`sql_executor` needs to be defined!')
    end

    class << self

      attr_accessor :default_executor

    end

  end

end
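PGExporter wraps the definition's SQL in a server-side COPY statement, so something must execute it. A sketch of wiring the class-level default_executor with the pg gem (the gem choice and connection details are assumptions, not part of this library):

require 'pg'
connection = PG.connect dbname: 'warehouse'   # hypothetical connection
ExportToGcloud::PGExporter.default_executor = ->(sql){ connection.exec sql }

Note that COPY ... TO '<path>' writes the file on the database server, so dump_path must point somewhere the server process can write and the exporter can read.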
data/lib/export_to_gcloud/library.rb
ADDED
@@ -0,0 +1,67 @@
require 'gcloud'
require 'gcloud/bigquery'

# large files uploading
require 'httpclient'
Faraday.default_adapter = :httpclient

# monkeypatch :/ some issue in google-api
# see http://googlecloudplatform.github.io/gcloud-ruby/docs/master/Gcloud/Storage.html
# -> A note about large uploads
require 'google/api_client'
Faraday::Response.register_middleware gzip: Faraday::Response::Middleware

module ExportToGcloud

  def self.definitions_resolver= proc
    @definitions_resolver = proc
  end

  # waits for BigQuery jobs
  # - send a block to do something with failed
  def self.wait_for_load_jobs(jobs, &block)
    jobs_left = jobs.dup
    failed = []
    sleeper = ->(_retries) {sleep 2 * _retries + 5}
    retries = 0

    until jobs_left.empty?
      sleeper.call retries
      retries += 1
      jobs_left.each &:reload!
      jobs_left.delete_if do |j|
        if j.done?
          failed << {id: j.job_id, error: j.error, sources: j.sources} if j.failed?
          true
        end
      end
    end

    block.call failed unless failed.empty?
  end

  def self.get_exporter name, context
    name = name.to_s

    @definitions ||= {}
    unless @definitions.has_key? name
      @definitions[name] = ::ExportToGcloud::Exporter::Definition.load_definition name, @definitions_resolver
    end

    definition = @definitions[name]
    definition.type.new definition, context
  end

  def self.create_context **opts
    ::ExportToGcloud::Exporter::Context.new client, opts
  end

end

require 'pathname'
require 'ostruct'
require 'csv'

require_relative 'exporters/exporter'
require_relative 'exporters/csv_exporter'
require_relative 'exporters/pg_exporter'
data/lib/export_to_gcloud.rb
ADDED
@@ -0,0 +1,16 @@
require_relative 'export_to_gcloud/version'

module ExportToGcloud

  def self.setup project_name:, config_file:, definitions_resolver:nil
    require_relative 'export_to_gcloud/library'

    self.definitions_resolver = definitions_resolver if definitions_resolver
    @client = ::Gcloud.new project_name, config_file
  end

  def self.client
    @client || raise('Gcloud client not present. call ExportToGcloud#setup first.')
  end

end
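Putting the pieces together, a typical run would look roughly like this (the project, keyfile, resolver and option values are all placeholders, not values prescribed by the gem):

# End-to-end sketch; every literal below is a made-up example value.
ExportToGcloud.setup project_name: 'my-project',
                     config_file: '/secrets/gcloud.json',
                     definitions_resolver: ->(name){ "definitions/#{name}.rb" }

context = ExportToGcloud.create_context dump_path: '/tmp/exports',
  storage_prefix: 'daily/', bucket: 'exports-bucket', dataset: 'analytics'

exporter = ExportToGcloud.get_exporter :products, context
jobs = exporter.process_all_parts!                  # returns BigQuery load jobs
ExportToGcloud.wait_for_load_jobs(jobs){|failed| raise failed.inspect}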
metadata
ADDED
@@ -0,0 +1,80 @@
--- !ruby/object:Gem::Specification
name: export_to_gcloud
version: !ruby/object:Gem::Version
  version: 0.9.0
platform: ruby
authors:
- Ondřej Želazko
autorequire:
bindir: bin
cert_chain: []
date: 2016-11-07 00:00:00.000000000 Z
dependencies:
- !ruby/object:Gem::Dependency
  name: gcloud
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 0.5.0
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: 0.5.0
- !ruby/object:Gem::Dependency
  name: httpclient
  requirement: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '2.8'
  type: :runtime
  prerelease: false
  version_requirements: !ruby/object:Gem::Requirement
    requirements:
    - - "~>"
      - !ruby/object:Gem::Version
        version: '2.8'
description: A simple helper to export data to BigQuery via Google Drive
email: zelazk.o@email.cz
executables: []
extensions: []
extra_rdoc_files: []
files:
- LICENSE
- lib/export_to_gcloud.rb
- lib/export_to_gcloud/exporter/context.rb
- lib/export_to_gcloud/exporter/definition.rb
- lib/export_to_gcloud/exporters/csv_exporter.rb
- lib/export_to_gcloud/exporters/exporter.rb
- lib/export_to_gcloud/exporters/pg_exporter.rb
- lib/export_to_gcloud/library.rb
- lib/export_to_gcloud/version.rb
homepage: https://github.com/doooby/export_to_gcloud
licenses:
- MIT
metadata: {}
post_install_message:
rdoc_options: []
require_paths:
- lib
required_ruby_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="
    - !ruby/object:Gem::Version
      version: '0'
requirements: []
rubyforge_project:
rubygems_version: 2.5.1
signing_key:
specification_version: 4
summary: Exporter to BigQuery
test_files: []