bricolage-spreadsheet 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 465aebc471ab4d6adfac54aedc78c2766204d202c49009c29c848e1ef391519e
4
+ data.tar.gz: b8d4b1e847bd67d8f865ed9b5fd42ba05f0d34a08a30787166d1bee09a2b0320
5
+ SHA512:
6
+ metadata.gz: 35d5f63b44638ffc54ff4279a60e95e50337137fb635c2fe5c517db0ccf0fd6ae731427fe5997575610b8e3213eaedb9f513d73c9e51fc64c81eebc46af6e7ee
7
+ data.tar.gz: add032b531a4d56067e1cf937746777a1be317084b0c763afb2079551fc7407cbd3d4fdb698ec38ae96772eb9a09f624d7ec4dde3004966c31813ed4f77eb494
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # bricolage-spreadsheet
2
+
3
+ Google Spreadsheet-related job classes for the Bricolage batch job framework.
4
+
5
+ ## Home Page
6
+
7
+ https://github.com/bricolages/bricolage-spreadsheet
8
+
9
+ ## Usage
10
+
11
+ Add the following line to your Gemfile:
12
+ ```
13
+ gem 'bricolage-spreadsheet'
14
+ ```
15
+
16
+ Job Options
17
+
18
+ ```
19
+ % bundle exec bricolage spreadsheet-import -h
20
+ Usage: bricolage spreadsheet-import [job_class_options]
21
+ --src-ds=NAME [optional] Main data source. [default: spreadsheet]
22
+ --sheet-id=ID Spreadsheet ID
23
+ --range=RANGE_EXPR Spreadsheet Range
24
+ --format=VALUE [optional] Data file format. (csv, json)
25
+ --value-render-option=VALUE [optional] For values with format on sheets (FORMATTED_VALUE, UNFORMATTED_VALUE, FORMULA)
26
+ --s3-ds=NAME [optional] Main data source. [default: s3]
27
+ --s3-file=PATH Target file name.
28
+ --dest-ds=NAME [optional] Main data source. [default: psql]
29
+ --dest-table=[SCHEMA.]TABLE [optional] Target table name.
30
+ --options=OPTIONS [optional] Loader options.
31
+ --table-def=PATH Create table file.
32
+ --no-backup [optional] Drop dest table with suffix "_old".
33
+ --analyze [optional] ANALYZE table after SQL is executed.
34
+ --grant=KEY:VALUE [optional] GRANT table after SQL is executed. (required keys: privilege, to)
35
+ --gzip [optional] Compress Temporary files.
36
+ -v, --variable=NAME=VALUE Set variable.
37
+ --help Shows this message and quit.
38
+ --version Shows program version and quit.
39
+ ```
40
+
41
+ ## License
42
+
43
+ MIT license.
44
+ See the LICENSES file for details.
45
+
46
+ ## Credit
47
+
48
+ Author: Shimpei Kodama
49
+
50
+ This software was written during working hours at Cookpad, Inc.
@@ -0,0 +1,72 @@
1
+ require 'bricolage/psqldatasource'
2
+
3
module Bricolage
  # Job class "spreadsheet-import".
  #
  # Step 1 exports a Google Spreadsheet range to an S3 file through the
  # source data source. Step 2 loads that file into a work table on the
  # destination data source and renames the work table into place, keeping
  # the previous table under the "_old" suffix unless no-backup is set.
  JobClass.define('spreadsheet-import') {
    parameters {|params|
      # S3Export
      params.add DataSourceParam.new('spreadsheet', 'src-ds')
      params.add StringParam.new('sheet-id', 'ID', 'Google Spreadsheet ID')
      params.add StringParam.new('range', 'RANGE_EXPR', 'Google Spreadsheet Range Expression')
      params.add EnumParam.new('format', %w(csv json), 'Intermediate data file format.', default: 'json')
      params.add EnumParam.new('value-render-option', %w(FORMATTED_VALUE UNFORMATTED_VALUE FORMULA), 'For values with format on sheets', default: 'FORMATTED_VALUE')
      params.add DataSourceParam.new('s3', 's3-ds')
      params.add DestFileParam.new('s3-file')

      # Load
      params.add DataSourceParam.new('psql', 'dest-ds')
      params.add DestTableParam.new
      params.add KeyValuePairsParam.new('options', 'OPTIONS', 'Loader options.',
          optional: true, default: PSQLLoadOptions.new,
          value_handler: lambda {|value, ctx, vars| PSQLLoadOptions.parse(value) })
      params.add SQLFileParam.new('table-def', 'PATH', 'Create table file.')
      params.add OptionalBoolParam.new('no-backup', 'Drop dest table with suffix "_old".', default: false)

      # Misc
      params.add OptionalBoolParam.new('analyze', 'ANALYZE table after SQL is executed.', default: true)
      params.add KeyValuePairsParam.new('grant', 'KEY:VALUE', 'GRANT table after SQL is executed. (required keys: privilege, to)')
      params.add OptionalBoolParam.new('gzip', 'Compress Temporary files.')
    }

    script {|params, script|
      # S3Export
      script.task(params['src-ds']) {|task|
        task.s3export params['sheet-id'],
                      params['range'],
                      params['format'],
                      params['value-render-option'],
                      params['s3-ds'],
                      params['s3-file'],
                      params['gzip']
      }

      # Load
      script.task(params['dest-ds']) {|task|
        prev_table = '${dest_table}_old'
        work_table = '${dest_table}_wk'

        task.transaction {
          # CREATE
          task.drop_force prev_table
          task.drop_force work_table
          # Point the table definition at the work table. The braced and the
          # bare variable forms are matched separately: with a single pattern
          # ending in `\}?\b`, a "${dest_table}" followed by a non-word
          # character (space, parenthesis) only matched up to "${dest_table",
          # because no word boundary exists after "}", and the replacement
          # left a stray "}" behind.
          task.exec params['table-def'].replace(/\$(?:\{dest_table\}|dest_table\b)/, work_table)

          # COPY
          options = params['gzip'] ? params['options'].merge('gzip' => params['gzip']) : params['options']
          task.load params['s3-ds'], params['s3-file'], work_table,
              params['format'], nil, options

          # GRANT, ANALYZE
          task.grant_if params['grant'], work_table
          task.analyze_if params['analyze'], work_table

          # RENAME
          task.create_dummy_table '${dest_table}'
          task.rename_table params['dest-table'].to_s, "#{params['dest-table'].name}_old"
          task.rename_table work_table, params['dest-table'].name
        }
        # No Backup
        task.drop_force prev_table if params['no-backup']
      }
    }
  } #spreadsheet-import job class
end
@@ -0,0 +1,5 @@
1
+ require 'bricolage/jobclass'
2
+ require 'pathname'
3
+
4
# Register the gem's bundled "jobclass" directory (a sibling of the directory
# holding this file) on the Bricolage job class load path.
gem_root = Pathname(__dir__).realpath.parent.cleanpath
jobclass_path = gem_root + 'jobclass'
Bricolage::JobClass.add_load_path(jobclass_path)
@@ -0,0 +1,181 @@
1
+ require 'bricolage/datasource'
2
+ require 'bricolage/jobresult'
3
+ require 'google/apis/sheets_v4'
4
+ require 'googleauth'
5
+ require 'csv'
6
+ require 'json'
7
+ require 'uri'
8
+ require 'tempfile'
9
+ require 'zlib'
10
+ require 'pathname'
11
+
12
+ module Bricolage
13
+
14
+ class SpreadsheetDataSource < DataSource
15
+ declare_type 'spreadsheet'
16
+
17
+ SCOPE_BASE = "Google::Apis::SheetsV4::"
18
+ DEFAULT_SCOPE = 'AUTH_SPREADSHEETS_READONLY'
19
+ DEFAULT_APPLICATION_NAME = 'Bricolage'
20
+
21
# @param credentials [Hash, String, nil] service-account key as a Hash, or a
#   path to a key file; when nil, #credential falls back to the
#   GOOGLE_APPLICATION_CREDENTIALS environment variable
# @param scope [String, nil] name of a scope constant under
#   Google::Apis::SheetsV4 (defaults to DEFAULT_SCOPE)
# @param application_name [String, nil] defaults to DEFAULT_APPLICATION_NAME
# @raise [NameError] when the scope constant does not exist
def initialize(credentials: nil, scope: nil, application_name: nil)
  @credentials = credentials
  # Resolve the constant to its value (the OAuth scope URL). Keeping only the
  # constant's *name* as a string would send that name to the authorization
  # server as the requested scope.
  @scope = Object.const_get("#{SCOPE_BASE}#{scope || DEFAULT_SCOPE}")
  @application_name = application_name || DEFAULT_APPLICATION_NAME
end
26
+
27
+ attr_reader :scope, :application_name
28
+
29
# Returns a new SpreadsheetTask bound to this data source.
def new_task
  SpreadsheetTask.new(self)
end
32
+
33
# Yields each non-empty row of the given spreadsheet range.
#
# @param sheet_id [String] spreadsheet ID
# @param range [String] range expression
# @param get_options [Hash] keyword options forwarded to
#   get_spreadsheet_values (e.g. value_render_option:)
# @yieldparam row [Array] cell values of one row
# @return [Enumerator] when no block is given
def rows(sheet_id, range, **get_options)
  return enum_for(:rows, sheet_id, range, **get_options) unless block_given?

  response = service.get_spreadsheet_values(sheet_id, range, **get_options)
  # `values` is nil when the requested range holds no data at all.
  (response.values || []).each do |row|
    # Cells are not always Strings (e.g. numbers or booleans with
    # UNFORMATTED_VALUE), so test emptiness on the string form.
    next if row.all? { |cell| cell.to_s.empty? } # skip empty row

    yield row
  end
end
41
+
42
# Yields every row of the range rendered by the formatter for `format`.
#
# The first row is remembered as the field list and handed to the formatter
# together with each row; it is itself skipped when the formatter reports
# skip_header?.
#
# @param format [String] formatter name passed to RowFormatterFactory
# @return [Enumerator] when no block is given
def formatted_rows(sheet_id, range, format = 'csv', **get_options)
  unless block_given?
    return enum_for(:formatted_rows, sheet_id, range, format, **get_options)
  end

  formatter = RowFormatterFactory.new_formatter(format)
  header = []
  rows(sheet_id, range, **get_options).each_with_index do |cells, position|
    is_first = (position == 0)
    header = cells if is_first
    next if is_first && formatter.skip_header?

    yield formatter.format(cells, header)
  end
end
54
+
55
# Returns the service-account key as an in-memory IO.
#
# Sources, in order: the `credentials` Hash (serialized to JSON), the file at
# the `credentials` path, then the file named by GOOGLE_APPLICATION_CREDENTIALS.
# File contents are read eagerly into a StringIO so that no file handle is
# left open for the caller to close.
#
# @return [StringIO] readable JSON key
# @raise [ParameterError] when no usable credentials are configured
def credential
  case @credentials
  when Hash
    StringIO.new(@credentials.to_json)
  when String, Pathname
    unless File.exist?(@credentials)
      raise ParameterError, "credentials must be a JSON or PATH. credentials=#{@credentials}"
    end
    StringIO.new(File.read(@credentials))
  when nil, false
    env_path = ENV['GOOGLE_APPLICATION_CREDENTIALS']
    unless env_path
      raise ParameterError, "credentials or GOOGLE_APPLICATION_CREDENTIALS is required."
    end
    StringIO.new(File.read(env_path))
  else
    # Any other type used to surface as a TypeError from Pathname.new.
    raise ParameterError, "credentials must be a JSON or PATH. credentials=#{@credentials}"
  end
end
68
+
69
# Task for the spreadsheet data source; currently offers one action that
# exports a spreadsheet range to an S3 object.
class SpreadsheetTask < DataSourceTask

  # Adds an S3ExportAction built from the given arguments to this task.
  def s3export(sheet_id, range, format, value_render_option, dest_ds, dest_file, gzip)
    add S3ExportAction.new(sheet_id, range, format, value_render_option, dest_ds, dest_file, gzip)
  end

  # Fetches formatted rows from the spreadsheet, writes them to a temporary
  # file (gzip-compressed when requested) and uploads that file through the
  # destination data source.
  class S3ExportAction < Action
    def initialize(sheet_id, range, format, value_render_option, dest_ds, dest_file, gzip)
      @sheet_id = sheet_id
      @range = range
      @format = format
      @value_render_option = value_render_option
      @dest_ds = dest_ds
      @dest_file = dest_file
      @gzip = gzip
    end

    attr_reader :sheet_id, :range, :dest_ds, :dest_file, :gzip

    # Range escaped for display inside the request URL shown by #source.
    def url_encoded_range
      # URI.encode is obsolete and was removed in Ruby 3.0; it delegated to
      # URI::DEFAULT_PARSER.escape, which is called directly here.
      @url_encoded_range ||= URI::DEFAULT_PARSER.escape(range)
    end

    # Upper-cased render option, or nil when none was given.
    def value_render_option
      @value_render_option&.upcase
    end

    # Lower-cased format name.
    def format
      @format.downcase
    end

    # Human-readable description of the export, written to the log by #run.
    def source
      <<~SOURCE
        "GET https://sheets.googleapis.com/v4/spreadsheets/#{sheet_id}/values/#{url_encoded_range}"
        "PUT s3://#{dest_ds.bucket_name}/#{dest_ds.prefix}/#{dest_file}"
      SOURCE
    end

    # Runs the export.
    #
    # @return [nil]
    def run
      ds.logger.info source
      rows = ds.formatted_rows(sheet_id, range, format, value_render_option: value_render_option)
      Tempfile.open do |f|
        f = Zlib::GzipWriter.wrap(f) if gzip
        f.write rows.to_a.join("\n")
        f.close # flush
        dest_ds.object(dest_file).upload_file(f.path)
      end
      nil
    end

  end #S3ExportAction
end #SpreadsheetTask
121
+
122
+ private
123
+
124
# Returns the memoized, authorized Sheets API client.
#
# The client is stored only after it has been fully configured. If building
# the authorizer raises, nothing is cached and the next call starts over,
# instead of returning a client that has no authorization attached.
def service
  @service ||= Google::Apis::SheetsV4::SheetsService.new.tap do |sheets|
    sheets.client_options.application_name = application_name
    sheets.authorization = authorizer
  end
end
131
+
132
# Returns the memoized service-account credentials object, created from
# #credential and #scope; an access token is fetched when it is first built.
def authorizer
  unless @authorizer
    @authorizer = Google::Auth::ServiceAccountCredentials.make_creds(
      json_key_io: credential,
      scope: scope,
    )
    # token lifetime is 3600 sec
    # May need to implement refresh logic for long running app???
    @authorizer.fetch_access_token!
  end
  @authorizer
end
143
+
144
# Creates the formatter that renders one spreadsheet row as one output line.
class RowFormatterFactory
  # Instantiates the "<FORMAT>Formatter" constant for the given format name.
  #
  # @param format [String, Symbol] format name in any letter case
  # @return [Object] a new formatter responding to #format and #skip_header?
  # @raise [NameError] when no formatter constant matches the format
  def self.new_formatter(format)
    # String#upcase! returns nil when the string is already upper case and
    # mutates the caller's string (failing on frozen ones); use the
    # non-destructive form instead.
    const_get("#{format.to_s.upcase}Formatter").new
  end

  # Renders a row as a single CSV line with every field quoted.
  class CSVFormatter
    # @param values [Array] cell values of the row
    # @param fields [Array] header row (unused by this formatter)
    # @return [String] CSV line without a trailing row separator
    def format(values, fields)
      # Cells may be numbers, booleans or nil, so stringify before escaping.
      # Embedded newlines become a literal backslash + "n".
      escaped_values = values.map { |v| v.to_s.gsub(/\n/, '\\n') }
      # row_sep: nil keeps the generated line free of a trailing newline.
      CSV.generate_line(escaped_values, row_sep: nil, quote_char: '"', force_quotes: true)
    end

    # The header row is emitted like any other row.
    def skip_header?
      false
    end
  end

  # Renders a row as a JSON object keyed by snake_cased header names.
  class JSONFormatter
    # @param values [Array] cell values of the row
    # @param fields [Array] header row providing the keys
    # @return [String] JSON object text
    def format(values, fields)
      # https://stackoverflow.com/questions/1509915/converting-camel-case-to-underscore-case-in-ruby
      normalized_fields = fields.map { |f|
        # Header cells may be non-String; stringify before normalizing.
        f.to_s.gsub(/::/, '/').
          gsub(/([A-Z]+)([A-Z][a-z])/, '\1_\2').
          gsub(/([a-z\d])([A-Z])/, '\1_\2').
          tr('- ', '__').
          downcase
      }
      normalized_fields.zip(values).to_h.to_json
    end

    # The header row only supplies keys and is not emitted as data.
    def skip_header?
      true
    end
  end
end # RowFormatterFactory
180
+ end
181
+ end
metadata ADDED
@@ -0,0 +1,74 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: bricolage-spreadsheet
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Shimpei Kodama
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2021-04-19 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bricolage
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: 5.26.0
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - "~>"
25
+ - !ruby/object:Gem::Version
26
+ version: 5.26.0
27
+ - !ruby/object:Gem::Dependency
28
+ name: google-apis-sheets_v4
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - "~>"
32
+ - !ruby/object:Gem::Version
33
+ version: 0.4.0
34
+ type: :runtime
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - "~>"
39
+ - !ruby/object:Gem::Version
40
+ version: 0.4.0
41
+ description:
42
+ email: shimpeko@gmail.com
43
+ executables: []
44
+ extensions: []
45
+ extra_rdoc_files: []
46
+ files:
47
+ - README.md
48
+ - jobclass/spreadsheet-import.rb
49
+ - lib/bricolage-spreadsheet.rb
50
+ - lib/bricolage/spreadsheetdatasource.rb
51
+ homepage: https://github.com/bricolages/bricolage-spreadsheet
52
+ licenses:
53
+ - MIT
54
+ metadata: {}
55
+ post_install_message:
56
+ rdoc_options: []
57
+ require_paths:
58
+ - lib
59
+ required_ruby_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - "~>"
62
+ - !ruby/object:Gem::Version
63
+ version: 2.7.0
64
+ required_rubygems_version: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
69
+ requirements: []
70
+ rubygems_version: 3.1.4
71
+ signing_key:
72
+ specification_version: 4
73
+ summary: Google Spreadsheet-related job classes for Bricolage batch framework
74
+ test_files: []