embulk-input-google_spreadsheets 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.travis.yml +12 -0
  4. data/CHANGELOG.md +67 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +94 -0
  8. data/Rakefile +13 -0
  9. data/embulk-input-google_spreadsheets.gemspec +24 -0
  10. data/example/config_authorized_user.yml +19 -0
  11. data/example/config_authorized_user.yml.liquid +25 -0
  12. data/example/config_authorized_user_emoji_worksheet.yml +19 -0
  13. data/example/config_authorized_user_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  14. data/example/config_authorized_user_large_data.yml +19 -0
  15. data/example/config_authorized_user_no_data.yml +18 -0
  16. data/example/config_service_account.yml +19 -0
  17. data/example/config_service_account_emoji_worksheet.yml +19 -0
  18. data/example/config_service_account_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  19. data/example/config_service_account_large_data.yml +19 -0
  20. data/example/config_service_account_no_data.yml +18 -0
  21. data/example/setup_authorized_user_credentials.rb +34 -0
  22. data/lib/embulk/input/google_spreadsheets.rb +182 -0
  23. data/lib/embulk/input/google_spreadsheets/auth.rb +63 -0
  24. data/lib/embulk/input/google_spreadsheets/error.rb +36 -0
  25. data/lib/embulk/input/google_spreadsheets/pager.rb +107 -0
  26. data/lib/embulk/input/google_spreadsheets/pager_util.rb +28 -0
  27. data/lib/embulk/input/google_spreadsheets/record_typecaster.rb +73 -0
  28. data/lib/embulk/input/google_spreadsheets/spreadsheets_client.rb +75 -0
  29. data/lib/embulk/input/google_spreadsheets/spreadsheets_url_util.rb +23 -0
  30. data/lib/embulk/input/google_spreadsheets/typecast/base.rb +62 -0
  31. data/lib/embulk/input/google_spreadsheets/typecast/loose_typecast.rb +84 -0
  32. data/lib/embulk/input/google_spreadsheets/typecast/minimal_typecast.rb +109 -0
  33. data/lib/embulk/input/google_spreadsheets/typecast/strict_typecast.rb +236 -0
  34. data/lib/embulk/input/google_spreadsheets/typecast/timestamp_format_util.rb +29 -0
  35. data/lib/embulk/input/google_spreadsheets/typecast_factory.rb +34 -0
  36. data/test/assert_embulk_nothing_raised.rb +11 -0
  37. data/test/assert_embulk_raise.rb +11 -0
  38. data/test/dummy.key +27 -0
  39. data/test/helper.rb +21 -0
  40. data/test/test_auth.rb +82 -0
  41. data/test/test_configure.rb +155 -0
  42. data/test/test_loose_typecast.rb +194 -0
  43. data/test/test_minimal_typecast.rb +616 -0
  44. data/test/test_pager_util.rb +24 -0
  45. data/test/test_run_examples.rb +125 -0
  46. data/test/test_spreadsheets_client.rb +87 -0
  47. data/test/test_spreadsheets_url_util.rb +23 -0
  48. data/test/test_strict_typecast.rb +666 -0
  49. data/test/test_typecast_factory.rb +36 -0
  50. metadata +220 -0
@@ -0,0 +1,19 @@
1
+ in:
2
+ type: google_spreadsheets
3
+ auth_method: service_account
4
+ json_keyfile: example/service_account_credentials.json
5
+ spreadsheets_url: https://docs.google.com/spreadsheets/d/1Cxz-LudQuhRAGZL8mBoHs6mRnpjODpyF4Rwc5UYoV1E/edit#gid=0
6
+ worksheet_title: Large Data
7
+ start_row: 2
8
+ default_timezone: Asia/Tokyo
9
+ null_string: '\N'
10
+ default_typecast: strict
11
+ columns:
12
+ - {name: _c1, type: boolean}
13
+ - {name: _c2, type: string}
14
+ - {name: _c3, type: long}
15
+ - {name: _c4, type: double}
16
+ - {name: _c5, type: timestamp, format: '%Y-%m-%d %H:%M:%S.%N'}
17
+ - {name: _c6, type: timestamp, format: '%Y-%m-%d'}
18
+ out:
19
+ type: stdout
@@ -0,0 +1,18 @@
1
+ in:
2
+ type: google_spreadsheets
3
+ auth_method: service_account
4
+ json_keyfile: example/service_account_credentials.json
5
+ spreadsheets_url: https://docs.google.com/spreadsheets/d/1Cxz-LudQuhRAGZL8mBoHs6mRnpjODpyF4Rwc5UYoV1E/edit#gid=0
6
+ worksheet_title: No Data
7
+ default_timezone: Asia/Tokyo
8
+ null_string: '\N'
9
+ default_typecast: strict
10
+ columns:
11
+ - {name: _c1, type: boolean}
12
+ - {name: _c2, type: string}
13
+ - {name: _c3, type: long}
14
+ - {name: _c4, type: double}
15
+ - {name: _c5, type: timestamp, format: '%Y-%m-%d %H:%M:%S.%N'}
16
+ - {name: _c6, type: timestamp, format: '%Y-%m-%d'}
17
+ out:
18
+ type: stdout
@@ -0,0 +1,34 @@
1
+ require 'googleauth'
2
+ require 'google/apis/sheets_v4'
3
+ require 'highline/import'
4
+ require 'json'
5
+
6
# Interactive helper that creates `authorized_user_credentials.json` next to
# this script. It asks for an OAuth 2.0 client ID / secret, walks the user
# through the authorization-code flow and stores the resulting refresh token.

puts 'Before setup, open this page https://developers.google.com/identity/protocols/OAuth2'
puts 'then get OAuth 2.0 credentials such as a client ID and client secret according to the above page.'
puts

# NOTE(review): the 'urn:ietf:wg:oauth:2.0:oob' redirect is the manual
# copy/paste flow; confirm it is still accepted by Google for your client type.
credentials = Google::Auth::UserRefreshCredentials.new(
  client_id: ask('Enter client_id: '),
  client_secret: ask('Enter client_secret: '),
  scope: Google::Apis::SheetsV4::AUTH_SPREADSHEETS_READONLY,
  redirect_uri: 'urn:ietf:wg:oauth:2.0:oob'
)

# The authorization code is a secret, so it is read without echoing it.
credentials.code = ask(
  "1. Open this page '#{credentials.authorization_uri}'.\n" \
  '2. Enter the authorization code shown in the page: '
) { |q| q.echo = false }

# Exchanges the authorization code for tokens (including the refresh token).
credentials.fetch_access_token!

data = {
  client_id: credentials.client_id,
  client_secret: credentials.client_secret,
  refresh_token: credentials.refresh_token,
}.to_json
file = File.expand_path('authorized_user_credentials.json', __dir__)
# The file holds the client secret and a long-lived refresh token, so it is
# created readable/writable by the owner only. `chmod` also tightens the mode
# when the file already existed (the mode passed to `open` only applies on
# creation).
File.open(file, 'w', 0o600) do |f|
  f.chmod(0o600)
  f.write(data)
end

puts "Success. See '#{file}'."
@@ -0,0 +1,182 @@
1
+ require_relative 'google_spreadsheets/error'
2
+ require_relative 'google_spreadsheets/record_typecaster'
3
+ require_relative 'google_spreadsheets/auth'
4
+ require_relative 'google_spreadsheets/spreadsheets_client'
5
+ require_relative 'google_spreadsheets/pager'
6
+
7
+ module Embulk
8
+ module Input
9
+
10
+ class GoogleSpreadsheets < InputPlugin
11
+ Plugin.register_input('google_spreadsheets', self)
12
+
13
+ # Supports configuration by file path or by inline content, in the same way as org.embulk.spi.unit.LocalFile
14
+ # json_keyfile:
15
+ # content: |
16
# Custom config type for `json_keyfile`: accepts either a file path or a
# hash carrying the file content inline under the 'content' key.
class LocalFile
  # Resolves the configured value to the JSON key text.
  #
  # @param v [String, Hash, nil] a path to read, a hash with a 'content'
  #   entry, or nil when nothing is configured
  # @return [String, nil] the JSON string (nil for nil input or a hash
  #   without 'content')
  # @raise [ArgumentError] if the value is of any other type; previously such
  #   values were silently turned into nil
  def self.load(v)
    case v
    when nil
      nil
    when String
      File.read(v)
    when Hash
      v['content']
    else
      raise ArgumentError, "`embulk-input-google_spreadsheets`: Invalid value '#{v}' for json_keyfile"
    end
  end
end
26
+
27
# Custom config type for the `columns` option: validates that the value is an
# array of hashes and fills in per-column defaults.
#
# The defaults are kept as class-level state so that the caller can override
# them (see the writers below) before `load` is invoked.
class CustomColumns
  # Validates and completes the column definitions.
  #
  # NOTE: if raised, rescue and re-raise as Embulk::ConfigError
  #
  # @param v [Array<Hash>] column definitions keyed by strings
  #   ('name', 'type', 'format', 'timezone', 'typecast')
  # @return [Array<Hash>] new hashes with defaults applied; the input is not
  #   modified
  # @raise [RuntimeError] if `v` is not an array of hashes, or a column has
  #   no 'type'
  def self.load(v)
    raise "`embulk-input-google_spreadsheets`: Invalid value '#{v}' for :array_of_hash" unless v.is_a?(Array)
    v.each do |c|
      raise "`embulk-input-google_spreadsheets`: Invalid value '#{v}' for :array_of_hash" unless c.is_a?(Hash)
      # 'type' is converted with `to_sym` when the schema is built, so a
      # missing type is reported here with a clear message instead.
      raise "`embulk-input-google_spreadsheets`: `type` is required for column '#{c}'" if c['type'].nil?
    end

    complete_default(v)
  end

  # Returns copies of the given columns with missing settings filled in:
  # 'format' and 'timezone' for timestamp columns, 'typecast' for all.
  #
  # @param columns [Array<Hash>]
  # @return [Array<Hash>]
  def self.complete_default(columns)
    columns.map do |column|
      completed = column.dup
      if completed['type'] == 'timestamp'
        completed['format'] ||= default_format
        completed['timezone'] ||= default_timezone
      end
      completed['typecast'] ||= default_typecast
      completed
    end
  end

  # @return [String] timestamp format used when a column has none
  def self.default_format
    # ref. https://github.com/embulk/embulk/blob/936c5d5a20af3086f7d1e5779a89035105bb975b/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java#L10
    # `Time.strptime` does not support `%6N`, so use `%N` instead.
    @default_format ||= '%Y-%m-%d %H:%M:%S.%N %z'
  end

  # @param format [String, nil] nil restores the built-in default
  def self.default_format=(format)
    @default_format = format
  end

  # @return [String] timezone used when a timestamp column has none
  def self.default_timezone
    @default_timezone ||= 'UTC'
  end

  # @param timezone [String, nil] nil restores the built-in default
  def self.default_timezone=(timezone)
    @default_timezone = timezone
  end

  # @return [String] typecast mode used when a column has none
  def self.default_typecast
    @default_typecast ||= 'strict'
  end

  # @param typecast [String, nil] nil restores the built-in default
  def self.default_typecast=(typecast)
    @default_typecast = typecast
  end
end
76
+
77
# Class-level access to the logger provided by `::Embulk.logger`.
def self.logger
  ::Embulk.logger
end
80
+
81
# Instance-level shortcut that delegates to the class-level logger.
def logger
  self.class.logger
end
84
+
85
# Builds the task hash from the plugin configuration.
#
# @param config [#param] configuration source; each option is read with
#   `config.param(name, type, default: ...)`
# @return [Hash] the task, keyed by option name, plus the derived
#   'end_column'
def self.configure(config)
  task = {}
  # auth_method:
  #   - service_account
  #   - authorized_user
  #   - compute_engine
  #   - application_default
  task['auth_method'] = config.param('auth_method', :string, default: 'authorized_user')
  # json_keyfile: Fullpath of json key
  #   if `auth_method` is `authorized_user`, this plugin supposes the format
  #   is the below.
  #   {
  #     "client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
  #     "client_secret":"xxxxxxxxxxx",
  #     "refresh_token":"xxxxxxxxxxx"
  #   }
  #
  #   if `auth_method` is `compute_engine` or `application_default`, this
  #   option is not required.
  task['json_keyfile'] = config.param('json_keyfile', LocalFile, default: nil)
  task['spreadsheets_url'] = config.param('spreadsheets_url', :string)
  task['worksheet_title'] = config.param('worksheet_title', :string)
  task['start_column'] = config.param('start_column', :integer, default: 1)
  task['start_row'] = config.param('start_row', :integer, default: 1)
  task['end_row'] = config.param('end_row', :integer, default: -1)
  task['max_fetch_rows'] = config.param('max_fetch_rows', :integer, default: 10000)
  task['null_string'] = config.param('null_string', :string, default: '')
  task['stop_on_invalid_record'] = config.param('stop_on_invalid_record', :bool, default: true)

  # CustomColumns keeps its defaults as class-level state. Clear whatever an
  # earlier `configure` call stored there, so the `default:` values read
  # below are the built-in ones (the readers fall back to them when the
  # stored value is nil) and not the previous configuration's.
  CustomColumns.default_format = nil
  CustomColumns.default_timezone = nil
  CustomColumns.default_typecast = nil

  # columns: this option supposes an array of hash has the below structure.
  #   - name
  #   - type
  #   - format
  #   - timezone
  #   - typecast: default: strict
  # The three defaults must be stored on CustomColumns before 'columns' is
  # loaded, because CustomColumns.load reads them while completing columns.
  CustomColumns.default_format = task['default_timestamp_format'] =
    config.param('default_timestamp_format', :string, default: CustomColumns.default_format)
  CustomColumns.default_timezone = task['default_timezone'] =
    config.param('default_timezone', :string, default: CustomColumns.default_timezone)
  CustomColumns.default_typecast = task['default_typecast'] =
    config.param('default_typecast', :string, default: CustomColumns.default_typecast)
  task['columns'] = config.param('columns', CustomColumns)

  # The fetched area is as wide as the number of configured columns.
  task['end_column'] = task['start_column'] + task['columns'].length - 1

  # The key file content is a secret, so it is left out of the log.
  logger.debug { "`embulk-input-google_spreadsheets`: configured task '#{task.reject { |k, _| k == 'json_keyfile' }.to_json}'" }
  task
end
129
+
130
# Converts the configured column hashes into `Column` objects, numbering
# them by their position in `task['columns']`.
#
# @param task [Hash] task holding the 'columns' array
# @return [Array<Column>]
def self.configure_columns(task)
  task['columns'].each_with_index.map do |column, index|
    Column.new(index, column['name'], column['type'].to_sym, column['format'])
  end
end
135
+
136
# Entry point: configures the task and its columns, then hands over to
# `resume` with a task count of 1.
#
# @param config [Object] plugin configuration passed on to `configure`
# @return [Hash] whatever `resume` returns
def self.transaction(config, &control)
  task = configure(config)
  resume(task, configure_columns(task), 1, &control)
end
141
+
142
# Runs the tasks by yielding to the given block and returns the next config
# diff, which is always empty for this plugin.
#
# @param task [Hash] the configured task
# @param columns [Array] the schema columns
# @param count [Integer] number of tasks to run
# @yield [task, columns, count] runs the tasks; the block's result (the
#   task reports) is not used
# @return [Hash] an empty config diff
def self.resume(task, columns, count, &control)
  yield(task, columns, count)

  {}
end
148
+
149
+ attr_reader :typecaster, :client
150
+
151
# Prepares the collaborators used by `run`: the record typecaster and the
# spreadsheets client (built with its auth and pager), all from `task`.
def init
  @typecaster = RecordTypecaster.new(task)

  auth = Auth.new(task)
  pager = Pager.new(task)
  @client = SpreadsheetsClient.new(task, auth: auth, pager: pager)
end
155
+
156
# Value of the `stop_on_invalid_record` option stored in the task; `run`
# uses it to decide between raising and skipping a failing record.
def stop_on_invalid_record?
  task['stop_on_invalid_record']
end
159
+
160
# Reads every worksheet record through the client, typecasts it and adds it
# to the page builder.
#
# A record that fails is either fatal or skipped with a warning, depending
# on `stop_on_invalid_record?`. When fatal, ConfigError and DataError are
# re-raised unchanged and anything else is wrapped in DataError.
#
# @return [Hash] an empty task report
def run
  client.worksheet_each_record do |record|
    begin
      # `record` is reassigned on purpose: the warning below then shows the
      # typecast record when the failure happened while adding it.
      record = typecaster.transform_by_columns(record)
      page_builder.add(record)
    rescue => error
      if stop_on_invalid_record?
        raise error if error.is_a?(ConfigError) || error.is_a?(DataError)
        raise DataError.new(error)
      end
      logger.warn { "`embulk-input-google_spreadsheets`: Error '#{error}' occurred. Skip '#{record}'" }
    end
  end

  page_builder.finish

  {}
end
179
+ end
180
+ end
181
+ end
182
+
@@ -0,0 +1,63 @@
1
+ require 'googleauth'
2
+ require 'google/apis/sheets_v4'
3
+
4
+ module Embulk
5
+ module Input
6
+ class GoogleSpreadsheets < InputPlugin
7
# Creates Google credentials for the configured `auth_method`, limited to
# the read-only spreadsheets scope.
class Auth
  attr_reader :auth_method

  # @param task [Hash] task providing 'auth_method' and 'json_keyfile'
  #   (the JSON key text, possibly nil)
  def initialize(task)
    @auth_method = task['auth_method']
    @json_key = task['json_keyfile']
  end

  # Builds the credentials object matching `auth_method`.
  #
  # @return [Object] credentials created by the googleauth library
  # @raise [ConfigError] if `auth_method` is not one of the supported values
  def authenticate
    case auth_method
    when 'authorized_user'
      Google::Auth::UserRefreshCredentials.make_creds(json_key_io: credentials_io, scope: scope)
    when 'compute_engine'
      Google::Auth::GCECredentials.new
    when 'service_account'
      Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: credentials_io, scope: scope)
    when 'application_default'
      Google::Auth.get_application_default([scope])
    else
      raise ConfigError.new("Unknown auth method: #{auth_method}")
    end
  end

  # @return [String] the OAuth scope requested for every auth method
  def scope
    Google::Apis::SheetsV4::AUTH_SPREADSHEETS_READONLY
  end

  private

  # Wraps the parsed key, serialized back to JSON, in an IO for `make_creds`.
  def credentials_io
    StringIO.new(credentials.to_json)
  end

  # Parses the configured JSON key; without one, the credentials file chosen
  # by `credentials_file` is read instead.
  def credentials
    JSON.parse(@json_key || File.read(credentials_file))
  end

  # Path of the fallback credentials file: the global file when it exists,
  # otherwise the per-user one.
  def credentials_file
    @credentials_file ||= begin
      # ref. https://developers.google.com/identity/protocols/application-default-credentials
      global_file = global_application_default_credentials_file
      File.expand_path(File.exist?(global_file) ? global_file : application_default_credentials_file)
    end
  end

  # Per-user application default credentials written by gcloud.
  def application_default_credentials_file
    @application_default_credentials_file ||=
      File.expand_path('~/.config/gcloud/application_default_credentials.json')
  end

  # System-wide application default credentials.
  def global_application_default_credentials_file
    @global_application_default_credentials_file ||=
      '/etc/google/auth/application_default_credentials.json'
  end
end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,36 @@
1
+ module Embulk
2
+ module Input
3
+
4
+ class GoogleSpreadsheets < InputPlugin
5
+
6
# Mixin for exception classes: builds the exception message from another
# error (or a plain string), including its backtrace and its chain of causes.
module Traceable
  # @param e [Exception, String] the error (or message) being wrapped
  # @param more_msg [String, nil] extra text appended to the first line
  def initialize(e, more_msg = nil)
    message = String.new
    message << "(#{e.class}) " unless e.is_a?(String)
    message << "#{e}#{more_msg}\n"
    message << traceable_backtrace(e)

    while e.respond_to?(:cause) && e.cause
      # Java Exception cannot follow the JRuby causes.
      e = e.cause
      message << "Caused by (#{e.class}) #{e}\n"
      message << traceable_backtrace(e)
    end

    super(message)
  end

  private

  # Formats the backtrace of `error` as "\tat ..." lines. Returns an empty
  # string when there is nothing to print: the object has no backtrace
  # method, or the backtrace is nil (an exception that was never raised) or
  # empty.
  def traceable_backtrace(error)
    return '' unless error.respond_to?(:backtrace)

    trace = error.backtrace
    return '' if trace.nil? || trace.empty?

    "\tat #{trace.join("\n\tat ")}\n"
  end
end
22
+
23
# Plugin configuration error: an ::Embulk::ConfigError whose message is
# built by Traceable.
class ConfigError < ::Embulk::ConfigError
  include Traceable
end
26
+
27
# Plugin data error: an ::Embulk::DataError whose message is built by
# Traceable.
class DataError < ::Embulk::DataError
  include Traceable
end
30
+
31
# Data error subtype for typecast failures; adds no behavior of its own.
class TypecastError < DataError
end
33
+
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,107 @@
1
+ require_relative 'pager_util'
2
+
3
+ module Embulk
4
+ module Input
5
+ class GoogleSpreadsheets < InputPlugin
6
# Walks a worksheet area page by page (at most `max_fetch_rows` rows per
# request) and yields each fetched record.
class Pager
  attr_reader :start_row, :start_column, :end_row, :end_column, :max_fetch_rows

  # @param task [Hash] task providing 'start_row', 'start_column', 'end_row',
  #   'end_column' and 'max_fetch_rows'
  # @raise [ConfigError] if the area is empty or `max_fetch_rows` is not
  #   positive
  def initialize(task)
    @start_row = task['start_row']
    @start_column = task['start_column']
    @end_row = task['end_row']
    @end_column = task['end_column']
    @max_fetch_rows = task['max_fetch_rows']

    validate!
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Fetches the configured area page by page and yields every record.
  #
  # Iteration stops when a request returns no values, when a page yields
  # fewer than `max_fetch_rows` records, or when the last accessible row has
  # been fetched. Without an `end_row` limit, the first empty record also
  # ends the iteration.
  #
  # @param client [#worksheet_max_row_num, #worksheet_values]
  # @yield [record] each fetched record
  # @raise [ConfigError] if `end_row` exceeds the worksheet's max row
  def each_record(client, &block)
    max_row_num = max_accessible_row_num(client)

    total_fetched_rows = 0
    last_fetched_row_num = start_row - 1
    loop do
      start_row_num = last_fetched_row_num + 1
      # Nothing is left when the next row lies beyond the last accessible
      # one (e.g. `start_row` is larger than the worksheet). Stop instead of
      # requesting a range whose end precedes its start.
      if start_row_num > max_row_num
        logger.warn { '`embulk-input-google_spreadsheets`: no data is found.' } if total_fetched_rows <= 0
        break
      end
      end_row_num = [last_fetched_row_num + max_fetch_rows, max_row_num].min

      cell_range = range(start_row_num, end_row_num)
      page = client.worksheet_values(cell_range)
      unless page # no values
        logger.warn { '`embulk-input-google_spreadsheets`: no data is found.' } if total_fetched_rows <= 0
        break
      end

      num_fetched_rows = 0
      page.each do |record|
        # Without an explicit end row, the first empty record marks the end
        # of the data.
        break if no_limit? && empty_record?(record)
        num_fetched_rows += 1
        yield(record)
      end
      total_fetched_rows += num_fetched_rows
      logger.info { "`embulk-input-google_spreadsheets`: fetched #{num_fetched_rows} rows in #{cell_range} (total: #{total_fetched_rows} rows)" }
      # A short page means the data ended inside this page.
      break if num_fetched_rows < max_fetch_rows

      last_fetched_row_num = end_row_num
      break if last_fetched_row_num >= max_row_num
    end
  end

  private

  # Rejects an empty area and a non-positive page size.
  def validate!
    if (has_limit? && start_row > end_row) || start_column > end_column
      raise ConfigError.new("`embulk-input-google_spreadsheets`: Area does not exist. Please check start & end for row and column. start_row: '#{start_row}', end_row: '#{end_row}', start_col: '#{start_column}', end_col: '#{end_column}'")
    end
    if max_fetch_rows <= 0
      raise ConfigError.new('`embulk-input-google_spreadsheets`: `max_fetch_rows` must be positive integer.')
    end
  end

  # Last row that may be requested: `end_row` when a limit is configured,
  # otherwise the worksheet's max row reported by the client.
  def max_accessible_row_num(client)
    sheets_max = client.worksheet_max_row_num
    if end_row > sheets_max
      raise ConfigError.new("`embulk-input-google_spreadsheets`: end_row `#{end_row}` is larger than spreadsheets max row `#{sheets_max}`")
    end

    return sheets_max if no_limit?

    end_row
  end

  # A record is empty when it is nil, has no cells, or every cell is nil or
  # empty. Cells that do not respond to `empty?` (e.g. numbers) count as
  # values.
  def empty_record?(record)
    return true unless record
    return true if record.empty?
    record.all? { |v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
  end

  # An `end_row` of zero or less means "no limit".
  def no_limit?
    end_row <= 0
  end

  def has_limit?
    !no_limit?
  end

  def start_column_name
    @start_column_name ||= PagerUtil.num2col(start_column)
  end

  def end_column_name
    @end_column_name ||= PagerUtil.num2col(end_column)
  end

  # Range string for the given rows, e.g. "A2:C10".
  def range(start_row_num, end_row_num)
    "#{start_column_name}#{start_row_num}:#{end_column_name}#{end_row_num}"
  end
end
105
+ end
106
+ end
107
+ end