embulk-input-google_spreadsheets 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.travis.yml +12 -0
  4. data/CHANGELOG.md +67 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +94 -0
  8. data/Rakefile +13 -0
  9. data/embulk-input-google_spreadsheets.gemspec +24 -0
  10. data/example/config_authorized_user.yml +19 -0
  11. data/example/config_authorized_user.yml.liquid +25 -0
  12. data/example/config_authorized_user_emoji_worksheet.yml +19 -0
  13. data/example/config_authorized_user_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  14. data/example/config_authorized_user_large_data.yml +19 -0
  15. data/example/config_authorized_user_no_data.yml +18 -0
  16. data/example/config_service_account.yml +19 -0
  17. data/example/config_service_account_emoji_worksheet.yml +19 -0
  18. data/example/config_service_account_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  19. data/example/config_service_account_large_data.yml +19 -0
  20. data/example/config_service_account_no_data.yml +18 -0
  21. data/example/setup_authorized_user_credentials.rb +34 -0
  22. data/lib/embulk/input/google_spreadsheets.rb +182 -0
  23. data/lib/embulk/input/google_spreadsheets/auth.rb +63 -0
  24. data/lib/embulk/input/google_spreadsheets/error.rb +36 -0
  25. data/lib/embulk/input/google_spreadsheets/pager.rb +107 -0
  26. data/lib/embulk/input/google_spreadsheets/pager_util.rb +28 -0
  27. data/lib/embulk/input/google_spreadsheets/record_typecaster.rb +73 -0
  28. data/lib/embulk/input/google_spreadsheets/spreadsheets_client.rb +75 -0
  29. data/lib/embulk/input/google_spreadsheets/spreadsheets_url_util.rb +23 -0
  30. data/lib/embulk/input/google_spreadsheets/typecast/base.rb +62 -0
  31. data/lib/embulk/input/google_spreadsheets/typecast/loose_typecast.rb +84 -0
  32. data/lib/embulk/input/google_spreadsheets/typecast/minimal_typecast.rb +109 -0
  33. data/lib/embulk/input/google_spreadsheets/typecast/strict_typecast.rb +236 -0
  34. data/lib/embulk/input/google_spreadsheets/typecast/timestamp_format_util.rb +29 -0
  35. data/lib/embulk/input/google_spreadsheets/typecast_factory.rb +34 -0
  36. data/test/assert_embulk_nothing_raised.rb +11 -0
  37. data/test/assert_embulk_raise.rb +11 -0
  38. data/test/dummy.key +27 -0
  39. data/test/helper.rb +21 -0
  40. data/test/test_auth.rb +82 -0
  41. data/test/test_configure.rb +155 -0
  42. data/test/test_loose_typecast.rb +194 -0
  43. data/test/test_minimal_typecast.rb +616 -0
  44. data/test/test_pager_util.rb +24 -0
  45. data/test/test_run_examples.rb +125 -0
  46. data/test/test_spreadsheets_client.rb +87 -0
  47. data/test/test_spreadsheets_url_util.rb +23 -0
  48. data/test/test_strict_typecast.rb +666 -0
  49. data/test/test_typecast_factory.rb +36 -0
  50. metadata +220 -0
@@ -0,0 +1,19 @@
1
+ in:
2
+ type: google_spreadsheets
3
+ auth_method: service_account
4
+ json_keyfile: example/service_account_credentials.json
5
+ spreadsheets_url: https://docs.google.com/spreadsheets/d/1Cxz-LudQuhRAGZL8mBoHs6mRnpjODpyF4Rwc5UYoV1E/edit#gid=0
6
+ worksheet_title: Large Data
7
+ start_row: 2
8
+ default_timezone: Asia/Tokyo
9
+ null_string: '\N'
10
+ default_typecast: strict
11
+ columns:
12
+ - {name: _c1, type: boolean}
13
+ - {name: _c2, type: string}
14
+ - {name: _c3, type: long}
15
+ - {name: _c4, type: double}
16
+ - {name: _c5, type: timestamp, format: '%Y-%m-%d %H:%M:%S.%N'}
17
+ - {name: _c6, type: timestamp, format: '%Y-%m-%d'}
18
+ out:
19
+ type: stdout
@@ -0,0 +1,18 @@
1
+ in:
2
+ type: google_spreadsheets
3
+ auth_method: service_account
4
+ json_keyfile: example/service_account_credentials.json
5
+ spreadsheets_url: https://docs.google.com/spreadsheets/d/1Cxz-LudQuhRAGZL8mBoHs6mRnpjODpyF4Rwc5UYoV1E/edit#gid=0
6
+ worksheet_title: No Data
7
+ default_timezone: Asia/Tokyo
8
+ null_string: '\N'
9
+ default_typecast: strict
10
+ columns:
11
+ - {name: _c1, type: boolean}
12
+ - {name: _c2, type: string}
13
+ - {name: _c3, type: long}
14
+ - {name: _c4, type: double}
15
+ - {name: _c5, type: timestamp, format: '%Y-%m-%d %H:%M:%S.%N'}
16
+ - {name: _c6, type: timestamp, format: '%Y-%m-%d'}
17
+ out:
18
+ type: stdout
@@ -0,0 +1,34 @@
1
# Interactive helper script: asks for an OAuth 2.0 client ID and client secret,
# walks the user through the authorization-code step, and writes the resulting
# `authorized_user_credentials.json` next to this script.
require 'googleauth'
require 'google/apis/sheets_v4'
require 'highline/import'
require 'json'

puts 'Before setup, open this page https://developers.google.com/identity/protocols/OAuth2'
puts 'then get OAuth 2.0 credentials such as a client ID and client secret according to the above page.'
puts

credentials = Google::Auth::UserRefreshCredentials.new(
  client_id: ask('Enter client_id: '),
  client_secret: ask('Enter client_secret: '),
  scope: Google::Apis::SheetsV4::AUTH_SPREADSHEETS_READONLY,
  redirect_uri: 'urn:ietf:wg:oauth:2.0:oob'
)

# The authorization code is typed with echo disabled so it does not stay on screen.
credentials.code = ask(
  "1. Open this page '#{credentials.authorization_uri}'.\n" \
  '2. Enter the authorization code shown in the page: '
) { |q| q.echo = false }

credentials.fetch_access_token!

data = {
  client_id: credentials.client_id,
  client_secret: credentials.client_secret,
  refresh_token: credentials.refresh_token,
}.to_json
file = File.expand_path('authorized_user_credentials.json', __dir__)

# The file holds a client secret and a refresh token, so keep it readable by
# the owner only. The permission argument of `File.open` applies only when the
# file is created, hence the explicit chmod for a file that already exists.
File.open(file, 'w', 0o600) do |f|
  f.chmod(0o600)
  f.write(data)
end

puts "Success. See '#{file}'."
@@ -0,0 +1,182 @@
1
+ require_relative 'google_spreadsheets/error'
2
+ require_relative 'google_spreadsheets/record_typecaster'
3
+ require_relative 'google_spreadsheets/auth'
4
+ require_relative 'google_spreadsheets/spreadsheets_client'
5
+ require_relative 'google_spreadsheets/pager'
6
+
7
+ module Embulk
8
+ module Input
9
+
10
+ class GoogleSpreadsheets < InputPlugin
11
+ Plugin.register_input('google_spreadsheets', self)
12
+
13
# Loader for the `json_keyfile` option. It supports the two shapes accepted by
# org.embulk.spi.unit.LocalFile: a file path, or inline content.
#   json_keyfile: /path/to/key.json
#   json_keyfile:
#     content: |
#       {...}
class LocalFile
  # Returns the JSON string for the configured key file.
  #
  # @param v [String, Hash, nil] a file path, a Hash with a `content` entry, or nil
  # @return [String, nil] the file content / inline content; nil when `v` is nil
  # @raise [RuntimeError] when `v` has any other shape. An unsupported value used
  #   to be turned into nil silently, which hides a misconfiguration.
  #   NOTE(review): presumably the caller rescues this and re-raises it as
  #   Embulk::ConfigError, as the NOTE on CustomColumns.load says - confirm.
  # @raise [SystemCallError] when the file at path `v` cannot be read
  def self.load(v)
    case v
    when nil
      nil
    when String
      File.read(v)
    when Hash
      v.fetch('content') do
        raise '`embulk-input-google_spreadsheets`: Invalid value for :local_file, `content` is required'
      end
    else
      raise "`embulk-input-google_spreadsheets`: Invalid value '#{v}' for :local_file"
    end
  end
end
26
+
27
# Loader for the `columns` option: checks that the value is an array of hashes
# and fills in per-column defaults. The defaults are kept as class-level state
# and can be replaced through the `default_*=` writers.
class CustomColumns
  class << self
    # Assigning nil makes the matching reader fall back to its built-in value.
    attr_writer :default_format, :default_timezone, :default_typecast

    # NOTE: if raised, rescue and re-raise as Embulk::ConfigError
    #
    # @param v [Array<Hash>] column definitions
    # @return [Array<Hash>] copies of the definitions with defaults filled in
    # @raise [RuntimeError] when `v` is not an Array made only of Hashes
    def load(v)
      well_formed = v.is_a?(Array) && v.all? { |column| column.is_a?(Hash) }
      raise "`embulk-input-google_spreadsheets`: Invalid value '#{v}' for :array_of_hash" unless well_formed

      complete_default(v)
    end

    # Returns new hashes; the given column hashes are left untouched.
    # `format` and `timezone` are only defaulted for `timestamp` columns,
    # `typecast` is defaulted for every column.
    def complete_default(columns)
      columns.map do |column|
        completed = column.dup
        if completed['type'] == 'timestamp'
          completed['format'] ||= default_format
          completed['timezone'] ||= default_timezone
        end
        completed['typecast'] ||= default_typecast
        completed
      end
    end

    def default_format
      # ref. https://github.com/embulk/embulk/blob/936c5d5a20af3086f7d1e5779a89035105bb975b/embulk-core/src/main/java/org/embulk/spi/type/TimestampType.java#L10
      # `Time.strptime` does not support `%6N`, so use `%N` instead.
      @default_format ||= '%Y-%m-%d %H:%M:%S.%N %z'
    end

    def default_timezone
      @default_timezone ||= 'UTC'
    end

    def default_typecast
      @default_typecast ||= 'strict'
    end
  end
end
76
+
77
# Class-level logger: hands back the logger exposed by the top-level Embulk module.
def self.logger
  ::Embulk.logger
end
80
+
81
# Instance-level logger: the same object as the class-level `logger`.
def logger
  self.class.logger
end
84
+
85
# Translates the plugin configuration into the task hash used by the rest of
# the plugin.
#
# @param config [#param] configuration object; only `param(name, type, default: ...)` is used here
# @return [Hash] task with String keys (`end_column` is derived, not configured)
def self.configure(config)
  # The column defaults live in class-level state on CustomColumns and are
  # overwritten below. Clear them first so that values set by an earlier
  # `configure` call in the same process do not leak in as the defaults of this
  # one; the readers fall back to the built-in values when unset.
  CustomColumns.default_format = nil
  CustomColumns.default_timezone = nil
  CustomColumns.default_typecast = nil

  task = {}
  # auth_method:
  #   - service_account
  #   - authorized_user
  #   - compute_engine
  #   - application_default
  task['auth_method'] = config.param('auth_method', :string, default: 'authorized_user')
  # json_keyfile: Fullpath of json key
  #   if `auth_method` is `authorized_user`, this plugin supposes the format
  #   is the below.
  #   {
  #     "client_id":"xxxxxxxxxxx.apps.googleusercontent.com",
  #     "client_secret":"xxxxxxxxxxx",
  #     "refresh_token":"xxxxxxxxxxx"
  #   }
  #
  #   if `auth_method` is `compute_engine` or `application_default`, this
  #   option is not required.
  task['json_keyfile'] = config.param('json_keyfile', LocalFile, default: nil)
  task['spreadsheets_url'] = config.param('spreadsheets_url', :string)
  task['worksheet_title'] = config.param('worksheet_title', :string)
  task['start_column'] = config.param('start_column', :integer, default: 1)
  task['start_row'] = config.param('start_row', :integer, default: 1)
  task['end_row'] = config.param('end_row', :integer, default: -1)
  task['max_fetch_rows'] = config.param('max_fetch_rows', :integer, default: 10000)
  task['null_string'] = config.param('null_string', :string, default: '')
  task['stop_on_invalid_record'] = config.param('stop_on_invalid_record', :bool, default: true)
  # columns: this option supposes an array of hash has the below structure.
  #   - name
  #   - type
  #   - format
  #   - timezone
  #   - typecast: default: strict
  # The three defaults must be stored on CustomColumns before `columns` is
  # loaded, because CustomColumns reads them while completing each column.
  CustomColumns.default_format = task['default_timestamp_format'] = config.param('default_timestamp_format', :string, default: CustomColumns.default_format)
  CustomColumns.default_timezone = task['default_timezone'] = config.param('default_timezone', :string, default: CustomColumns.default_timezone)
  CustomColumns.default_typecast = task['default_typecast'] = config.param('default_typecast', :string, default: CustomColumns.default_typecast)
  task['columns'] = config.param('columns', CustomColumns)

  # One sheet column per configured column, starting at `start_column`.
  task['end_column'] = task['start_column'] + task['columns'].length - 1

  # The key file content is left out of the log line.
  logger.debug { "`embulk-input-google_spreadsheets`: configured task '#{task.reject { |k, _| k == 'json_keyfile' }.to_json}'" }
  task
end
129
+
130
# Builds one Column per entry of task['columns'], using the entry's position as
# the column index and its `type` converted to a Symbol.
def self.configure_columns(task)
  task['columns'].each_with_index.map do |column, index|
    Column.new(index, column['name'], column['type'].to_sym, column['format'])
  end
end
135
+
136
# Entry point: derives the task and the columns from the configuration, then
# delegates to `resume` with a count of 1.
def self.transaction(config, &control)
  task = configure(config)
  resume(task, configure_columns(task), 1, &control)
end
141
+
142
# Yields (task, columns, count) to the given block and returns an empty Hash as
# the next config diff; whatever the block returns is not used.
def self.resume(task, columns, count, &control)
  yield(task, columns, count)

  {}
end
148
+
149
+ attr_reader :typecaster, :client
150
+
151
# Builds the collaborators from the task: the record typecaster, and the
# spreadsheets client wired with its auth and pager objects.
def init
  @typecaster = RecordTypecaster.new(task)

  auth = Auth.new(task)
  pager = Pager.new(task)
  @client = SpreadsheetsClient.new(task, auth: auth, pager: pager)
end
155
+
156
# Value of the `stop_on_invalid_record` task entry; `run` consults it to decide
# whether a failing record raises or is only logged.
def stop_on_invalid_record?
  task['stop_on_invalid_record']
end
159
+
160
# Reads every worksheet record from the client, typecasts it and adds it to the
# page builder.
#
# A record that fails is handled according to `stop_on_invalid_record?`:
# when set, the error is raised (this plugin's ConfigError/DataError as is,
# anything else wrapped in DataError); otherwise a warning is logged and the
# loop moves on.
#
# @return [Hash] empty task report
def run
  client.worksheet_each_record do |record|
    begin
      record = typecaster.transform_by_columns(record)
      page_builder.add(record)
    rescue => e
      if stop_on_invalid_record?
        case e
        when ConfigError, DataError then raise e
        else raise DataError.new(e)
        end
      else
        logger.warn{ "`embulk-input-google_spreadsheets`: Error '#{e}' occurred. Skip '#{record}'" }
      end
    end
  end

  page_builder.finish

  {}
end
179
+ end
180
+ end
181
+ end
182
+
@@ -0,0 +1,63 @@
1
+ require 'googleauth'
2
+ require 'google/apis/sheets_v4'
3
+
4
+ module Embulk
5
+ module Input
6
+ class GoogleSpreadsheets < InputPlugin
7
# Creates Google credentials for the configured `auth_method`, with the
# read-only spreadsheets scope.
class Auth
  attr_reader :auth_method

  # @param task [Hash] reads `auth_method` and `json_keyfile` (the key JSON string, or nil)
  def initialize(task)
    @auth_method = task['auth_method']
    @json_key = task['json_keyfile']
  end

  # @return the credentials object built by the googleauth class matching `auth_method`
  # @raise [ConfigError] when `auth_method` is not one of the four known values
  def authenticate
    case auth_method
    when 'authorized_user'
      Google::Auth::UserRefreshCredentials.make_creds(json_key_io: credentials_io, scope: scope)
    when 'compute_engine'
      Google::Auth::GCECredentials.new
    when 'service_account'
      Google::Auth::ServiceAccountCredentials.make_creds(json_key_io: credentials_io, scope: scope)
    when 'application_default'
      Google::Auth.get_application_default([scope])
    else
      raise ConfigError.new("Unknown auth method: #{auth_method}")
    end
  end

  def scope
    Google::Apis::SheetsV4::AUTH_SPREADSHEETS_READONLY
  end

  private

  # The parsed credentials, re-serialized into an IO as `make_creds` takes them.
  def credentials_io
    StringIO.new(credentials.to_json)
  end

  # Parsed key JSON. Without a configured key, the credentials file chosen by
  # `credentials_file` is read instead.
  def credentials
    JSON.parse(@json_key || File.read(credentials_file))
  end

  # ref. https://developers.google.com/identity/protocols/application-default-credentials
  # The global file wins when it exists; otherwise the per-user file is used.
  def credentials_file
    @credentials_file ||= begin
      chosen =
        if File.exist?(global_application_default_credentials_file)
          global_application_default_credentials_file
        else
          application_default_credentials_file
        end
      File.expand_path(chosen)
    end
  end

  def application_default_credentials_file
    @application_default_credentials_file ||=
      File.expand_path('~/.config/gcloud/application_default_credentials.json')
  end

  def global_application_default_credentials_file
    @global_application_default_credentials_file ||=
      '/etc/google/auth/application_default_credentials.json'
  end
end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,36 @@
1
+ module Embulk
2
+ module Input
3
+
4
+ class GoogleSpreadsheets < InputPlugin
5
+
6
# Mixin for error classes: builds the error message from another error (or a
# plain String), appending its backtrace and the chain of its causes.
module Traceable
  # @param e [Exception, String] the error to wrap, or a plain message
  # @param more_msg [String, nil] text appended right after `e` on the first line
  def initialize(e, more_msg = nil)
    # Formats the backtrace of `error`. The result is empty when there is
    # none: `backtrace` is nil for an exception that was never raised, which
    # used to make `join` fail with NoMethodError here.
    trace_of = lambda do |error|
      frames = error.backtrace if error.respond_to?(:backtrace)
      frames ? "\tat #{frames.join("\n\tat ")}\n" : ''
    end

    message = e.is_a?(String) ? '' : "(#{e.class}) "
    message << "#{e}#{more_msg}\n"
    message << trace_of.call(e)

    while e.respond_to?(:cause) && e.cause
      # Java Exception cannot follow the JRuby causes.
      message << "Caused by (#{e.cause.class}) #{e.cause}\n"
      message << trace_of.call(e.cause)
      e = e.cause
    end

    super(message)
  end
end
22
+
23
# Plugin-specific configuration error: an ::Embulk::ConfigError whose message
# is built by Traceable.
class ConfigError < ::Embulk::ConfigError
  include Traceable
end
26
+
27
# Plugin-specific data error: an ::Embulk::DataError whose message is built by
# Traceable.
class DataError < ::Embulk::DataError
  include Traceable
end
30
+
31
# A DataError subclass with no behavior of its own.
# NOTE(review): presumably raised by the typecast classes - confirm.
class TypecastError < DataError
end
33
+
34
+ end
35
+ end
36
+ end
@@ -0,0 +1,107 @@
1
+ require_relative 'pager_util'
2
+
3
+ module Embulk
4
+ module Input
5
+ class GoogleSpreadsheets < InputPlugin
6
# Reads a rectangular worksheet area page by page, at most `max_fetch_rows`
# rows per request, and yields the rows one by one.
#
# `end_row <= 0` means "no limit": reading then goes on until the sheet's last
# row, a request that returns no values, or the first empty row.
class Pager
  attr_reader :start_row, :start_column, :end_row, :end_column, :max_fetch_rows

  # @param task [Hash] reads `start_row`, `start_column`, `end_row`, `end_column`, `max_fetch_rows`
  # @raise [ConfigError] when the area is empty or `max_fetch_rows` is not positive
  def initialize(task)
    @start_row = task['start_row']
    @start_column = task['start_column']
    @end_row = task['end_row']
    @end_column = task['end_column']
    @max_fetch_rows = task['max_fetch_rows']

    validate!
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Yields every record of the configured area, top to bottom.
  #
  # @param client [#worksheet_max_row_num, #worksheet_values] `worksheet_values(range)`
  #   is expected to return an enumerable of rows, or a falsy value when the range has no values
  # @yieldparam record one row as returned by the client
  # @return [nil]
  # @raise [ConfigError] when `end_row` is larger than the sheet's row count
  def each_record(client, &block)
    max_row_num = max_accessible_row_num(client)

    # Only reachable without a row limit (`validate!` and the `end_row` check
    # cover the limited case). Asking for rows past the end would build an
    # inverted range such as "A5:B3", so report "no data" instead.
    if start_row > max_row_num
      logger.warn { '`embulk-input-google_spreadsheets`: no data is found.' }
      return
    end

    total_fetched_rows = 0
    last_fetched_row_num = start_row - 1
    loop do
      start_row_num = last_fetched_row_num + 1
      end_row_num = [last_fetched_row_num + max_fetch_rows, max_row_num].min

      cell_range = range(start_row_num, end_row_num)
      page = client.worksheet_values(cell_range)
      unless page # no values
        logger.warn { '`embulk-input-google_spreadsheets`: no data is found.' } if total_fetched_rows <= 0
        break
      end

      num_fetched_rows = 0
      page.each do |record|
        # Without a row limit the first empty row marks the end of the data.
        break if no_limit? && empty_record?(record)
        num_fetched_rows += 1
        yield(record)
      end
      total_fetched_rows += num_fetched_rows
      logger.info { "`embulk-input-google_spreadsheets`: fetched #{num_fetched_rows} rows in #{cell_range} (total: #{total_fetched_rows} rows)" }
      # A short page means the data ended inside this page.
      break if num_fetched_rows < max_fetch_rows

      last_fetched_row_num = end_row_num
      break if last_fetched_row_num >= max_row_num
    end
  end

  private

  def validate!
    if (has_limit? && start_row > end_row) || start_column > end_column
      raise ConfigError.new("`embulk-input-google_spreadsheets`: Area does not exist. Please check start & end for row and column. start_row: '#{start_row}', end_row: '#{end_row}', start_col: '#{start_column}', end_col: '#{end_column}'")
    end
    if max_fetch_rows <= 0
      raise ConfigError.new('`embulk-input-google_spreadsheets`: `max_fetch_rows` must be positive integer.')
    end
  end

  # Last row that may be requested: `end_row` when a limit is set, otherwise
  # the row count reported by the client.
  def max_accessible_row_num(client)
    sheets_max = client.worksheet_max_row_num
    if end_row > sheets_max
      raise ConfigError.new("`embulk-input-google_spreadsheets`: end_row `#{end_row}` is larger than spreadsheets max row `#{sheets_max}`")
    end

    return sheets_max if no_limit?

    end_row
  end

  # A record is empty when it is missing, has no cells, or every cell is nil
  # or empty. Cells that do not respond to `empty?` count as values.
  def empty_record?(record)
    return true unless record
    return true if record.empty?
    record.all? { |v| v.nil? || (v.respond_to?(:empty?) && v.empty?) }
  end

  def no_limit?
    end_row <= 0
  end

  def has_limit?
    !no_limit?
  end

  def start_column_name
    @start_column_name ||= PagerUtil.num2col(start_column)
  end

  def end_column_name
    @end_column_name ||= PagerUtil.num2col(end_column)
  end

  # Range string in the form "<start col><start row>:<end col><end row>".
  def range(start_row_num, end_row_num)
    "#{start_column_name}#{start_row_num}:#{end_column_name}#{end_row_num}"
  end
end
105
+ end
106
+ end
107
+ end