embulk-input-google_spreadsheets 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +12 -0
- data/CHANGELOG.md +67 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +94 -0
- data/Rakefile +13 -0
- data/embulk-input-google_spreadsheets.gemspec +24 -0
- data/example/config_authorized_user.yml +19 -0
- data/example/config_authorized_user.yml.liquid +25 -0
- data/example/config_authorized_user_emoji_worksheet.yml +19 -0
- data/example/config_authorized_user_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
- data/example/config_authorized_user_large_data.yml +19 -0
- data/example/config_authorized_user_no_data.yml +18 -0
- data/example/config_service_account.yml +19 -0
- data/example/config_service_account_emoji_worksheet.yml +19 -0
- data/example/config_service_account_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
- data/example/config_service_account_large_data.yml +19 -0
- data/example/config_service_account_no_data.yml +18 -0
- data/example/setup_authorized_user_credentials.rb +34 -0
- data/lib/embulk/input/google_spreadsheets.rb +182 -0
- data/lib/embulk/input/google_spreadsheets/auth.rb +63 -0
- data/lib/embulk/input/google_spreadsheets/error.rb +36 -0
- data/lib/embulk/input/google_spreadsheets/pager.rb +107 -0
- data/lib/embulk/input/google_spreadsheets/pager_util.rb +28 -0
- data/lib/embulk/input/google_spreadsheets/record_typecaster.rb +73 -0
- data/lib/embulk/input/google_spreadsheets/spreadsheets_client.rb +75 -0
- data/lib/embulk/input/google_spreadsheets/spreadsheets_url_util.rb +23 -0
- data/lib/embulk/input/google_spreadsheets/typecast/base.rb +62 -0
- data/lib/embulk/input/google_spreadsheets/typecast/loose_typecast.rb +84 -0
- data/lib/embulk/input/google_spreadsheets/typecast/minimal_typecast.rb +109 -0
- data/lib/embulk/input/google_spreadsheets/typecast/strict_typecast.rb +236 -0
- data/lib/embulk/input/google_spreadsheets/typecast/timestamp_format_util.rb +29 -0
- data/lib/embulk/input/google_spreadsheets/typecast_factory.rb +34 -0
- data/test/assert_embulk_nothing_raised.rb +11 -0
- data/test/assert_embulk_raise.rb +11 -0
- data/test/dummy.key +27 -0
- data/test/helper.rb +21 -0
- data/test/test_auth.rb +82 -0
- data/test/test_configure.rb +155 -0
- data/test/test_loose_typecast.rb +194 -0
- data/test/test_minimal_typecast.rb +616 -0
- data/test/test_pager_util.rb +24 -0
- data/test/test_run_examples.rb +125 -0
- data/test/test_spreadsheets_client.rb +87 -0
- data/test/test_spreadsheets_url_util.rb +23 -0
- data/test/test_strict_typecast.rb +666 -0
- data/test/test_typecast_factory.rb +36 -0
- metadata +220 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class GoogleSpreadsheets < InputPlugin
|
4
|
+
# Helpers for turning numeric worksheet positions into column labels.
module PagerUtil

  # Converts a 1-based column number into a letter label using a
  # zero-less positional system (with the defaults: 1 => "A", 26 => "Z",
  # 27 => "AA", 703 => "AAA").
  #
  # @param num [Integer] 1-based column number; values <= 0 produce "".
  # @param base [Integer] number of symbols in the alphabet.
  # @param offset [Integer] character code of the first symbol.
  # @return [String] the column label.
  def self.num2col(num, base = default_base, offset = default_offset)
    letters = []
    while num > 0
      # There is no "zero" letter, so shift to 0-based before splitting
      # off the least significant digit.
      num, digit = (num - 1).divmod(base)
      letters.unshift((digit + offset).chr)
    end
    letters.join
  end

  # The two helpers below are intentionally plain public singleton methods:
  # a bare `private` keyword does not apply to `def self.` definitions, so
  # it is not used here.

  # @return [Integer] character code of the first column letter ("A").
  def self.default_offset
    @default_offset ||= 'A'.ord
  end

  # @return [Integer] number of letters in the alphabet.
  def self.default_base
    @default_base ||= 26 # number of alphabet
  end
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require_relative 'typecast_factory'
|
2
|
+
|
3
|
+
|
4
|
+
module Embulk
|
5
|
+
module Input
|
6
|
+
class GoogleSpreadsheets < InputPlugin
|
7
|
+
# Converts one raw worksheet row into values typed according to the
# `columns` entries of the task.
class RecordTypecaster

  attr_reader :column_names, :column_details

  # @param task [Hash] task whose 'columns' entry is a list of column
  #   definitions ('name', 'type', 'format', 'timezone', 'typecast').
  def initialize(task)
    @column_names = task['columns'].map { |column| column['name'] }
    @column_details = configure_column_details(task)
  end

  # Builds a lookup from column name to its position, declared type,
  # timestamp options and the typecast object created for it.
  #
  # @param task [Hash] same task passed to #initialize.
  # @return [Hash{String => Hash}] details keyed by column name.
  def configure_column_details(task)
    columns = task['columns'].dup
    _column_details = columns.each_with_index.each_with_object({}) do |(column, position), details|
      details[column['name']] = {
        'index'    => position,
        'name'     => column['name'],
        'type'     => column['type'].to_sym,
        'format'   => column['format'],
        'timezone' => column['timezone'],
        'typecast' => TypecastFactory.create(column['typecast'], task),
      }
    end

    logger.debug { "`embulk-input-google_spreadsheets`: configured column details '#{_column_details.to_json}'"}
    _column_details
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Typecasts every configured column of +record+, in column order.
  #
  # @param record [Array] raw row; each column reads the element at its
  #   own configured position.
  # @return [Array] the typecast values.
  # @raise [TypecastError] wraps any StandardError raised while casting
  #   (including the unsupported-type error) and appends column info.
  def transform_by_columns(record)
    column_names.map do |n|
      d = column_details[n]
      caster = d['typecast']
      value = record[d['index']]
      type = d['type']

      begin
        case type
        when :string    then caster.as_string(value)
        when :long      then caster.as_long(value)
        when :double    then caster.as_double(value)
        when :boolean   then caster.as_boolean(value)
        when :timestamp then caster.as_timestamp(value, d['format'], d['timezone'])
        when :json      then caster.as_json(value)
        else
          raise ConfigError.new("`google_spreadsheets`: Unsupported type `#{type}`")
        end
      rescue => e
        # for adding column information
        raise TypecastError.new(e, ", column: #{n}, column_detail: #{d.to_json}")
      end
    end
  end

end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'google/apis/sheets_v4'
|
2
|
+
require_relative 'spreadsheets_url_util'
|
3
|
+
|
4
|
+
module Embulk
|
5
|
+
module Input
|
6
|
+
class GoogleSpreadsheets < InputPlugin
|
7
|
+
|
8
|
+
# Wrapper around the Google Sheets v4 service for a single worksheet,
# identified by the spreadsheet URL and worksheet title taken from the task.
class SpreadsheetsClient

  attr_accessor :spreadsheets_url, :worksheet_title, :auth, :pager

  # @param task [Hash] reads 'spreadsheets_url' and 'worksheet_title'.
  # @param auth [#authenticate] supplies the service authorization.
  # @param pager [#each_record] drives record iteration.
  def initialize(task, auth:, pager:)
    @auth = auth
    @pager = pager
    @spreadsheets_url = task['spreadsheets_url']
    @worksheet_title = task['worksheet_title']
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # @return [String] application name reported to the API client.
  def application_name
    @application_name ||= 'embulk-input-google_spreadsheets'
  end

  # @return [String, nil] id captured from the spreadsheet URL.
  def spreadsheets_id
    SpreadsheetsUrlUtil.capture_id(spreadsheets_url)
  end

  # Requests spreadsheet metadata (no grid data) restricted to the
  # configured worksheet. Not cached: each call issues a request.
  def spreadsheets
    service.get_spreadsheet(
      spreadsheets_id,
      ranges: worksheet_title,
      include_grid_data: false
    )
  end

  # @return the first sheet of the metadata response.
  def worksheet
    sheets = spreadsheets.sheets
    sheets.first
  end

  def worksheet_properties
    sheet = worksheet
    sheet.properties
  end

  def worksheet_grid_properties
    properties = worksheet_properties
    properties.grid_properties
  end

  # @return row count reported by the worksheet grid properties.
  def worksheet_max_row_num
    grid = worksheet_grid_properties
    grid.row_count
  end

  # @return column count reported by the worksheet grid properties.
  def worksheet_max_column_num
    grid = worksheet_grid_properties
    grid.column_count
  end

  # Loads the cell values for +range+ within the configured worksheet.
  #
  # @param range range expression; it is prefixed with "<worksheet_title>!".
  # @return the `values` of the API response.
  def worksheet_values(range)
    range = "#{worksheet_title}!#{range}"
    logger.info { "`embulk-input-google_spreadsheets`: load data from spreadsheet: '#{spreadsheets_url}', range: '#{range}'" }
    response = service.get_spreadsheet_values(spreadsheets_id, range)
    response.values
  end

  # Delegates record iteration to the pager, handing it this client.
  def worksheet_each_record(&block)
    pager.each_record(self, &block)
  end

  # Lazily builds the Sheets service; it is memoized only after the
  # application name and authorization have both been set.
  def service
    return @service if @service

    sheets_service = Google::Apis::SheetsV4::SheetsService.new
    sheets_service.client_options.application_name = application_name
    sheets_service.authorization = auth.authenticate
    @service = sheets_service
  end

end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class GoogleSpreadsheets < InputPlugin
|
4
|
+
# Helpers for extracting the spreadsheet id from a Google Sheets URL.
module SpreadsheetsUrlUtil

  # Extracts the spreadsheet id, i.e. the path segment that directly
  # follows the base URL and is itself followed by a "/".
  #
  # @param url [String] spreadsheet URL,
  #   e.g. "https://docs.google.com/spreadsheets/d/<id>/edit#gid=0".
  # @return [String, nil] the id, or nil when the URL does not match.
  def self.capture_id(url)
    url[capture_id_regex, 1]
  end

  # @return [String] URL prefix that precedes the spreadsheet id.
  def self.base_url
    @base_url ||= 'https://docs.google.com/spreadsheets/d/'
  end

  # @return [Regexp] pattern whose first group captures the id.
  def self.capture_id_regex
    # Escape the prefix so that the dots in the host name match literal
    # dots only, instead of acting as "any character" wildcards.
    @capture_id_regex ||= %r{#{Regexp.escape(base_url)}([^/]+)/.*}
  end
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class GoogleSpreadsheets < InputPlugin
|
4
|
+
module Typecast
|
5
|
+
# Abstract base for typecast strategies. Subclasses override the `as_*`
# methods; each default implementation raises NotImplementedError.
class Base

  attr_reader :null_string

  # @param task [Hash] reads 'null_string', the cell text treated as null.
  def initialize(task)
    @null_string = task['null_string']
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Serializes the class name (under JSON.create_id) and every instance
  # variable of this object.
  #
  # @return [String] JSON text.
  def to_json(*args) # for logging
    spec = instance_variables.each_with_object({ JSON.create_id => self.class.name }) do |ivar, acc|
      acc[ivar] = instance_variable_get(ivar)
    end
    spec.to_json(*args)
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def as_string(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def as_long(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def as_double(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def as_boolean(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  # The format and timezone are optional so that this signature agrees
  # with the subclass overrides, which default both to nil.
  #
  # @raise [NotImplementedError] always; subclasses must override.
  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  # @raise [NotImplementedError] always; subclasses must override.
  def as_json(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  protected

  # @return [Boolean] true only when +value+ is a String equal to the
  #   configured null string.
  def null_string?(value)
    value.is_a?(String) && value == null_string
  end
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require_relative 'strict_typecast'
|
2
|
+
|
3
|
+
module Embulk
|
4
|
+
module Input
|
5
|
+
class GoogleSpreadsheets < InputPlugin
|
6
|
+
module Typecast
|
7
|
+
# Typecast strategy that behaves like its superclass but substitutes nil
# for any value the superclass rejects with a TypecastError. Every other
# error propagates unchanged.
class LooseTypecast < StrictTypecast
  # @return the superclass result, or nil on TypecastError.
  def as_string(value)
    nil_on_typecast_error { super }
  end

  # @return the superclass result, or nil on TypecastError.
  def as_long(value)
    nil_on_typecast_error { super }
  end

  # @return the superclass result, or nil on TypecastError.
  def as_double(value)
    nil_on_typecast_error { super }
  end

  # @return the superclass result, or nil on TypecastError.
  def as_boolean(value)
    nil_on_typecast_error { super }
  end

  # @return the superclass result, or nil on TypecastError.
  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    nil_on_typecast_error { super }
  end

  # @return the superclass result, or nil on TypecastError.
  def as_json(value)
    nil_on_typecast_error { super }
  end

  private

  # Runs the block and converts a TypecastError into a nil result,
  # logging the reason at trace level.
  # NOTE(review): assumes TypecastError descends from StandardError, as
  # the previous bare `rescue => e` + `is_a?` check did — confirm.
  def nil_on_typecast_error
    yield
  rescue TypecastError => e
    logger.trace{"`embulk-input-google_spreadsheets`: Fallback to nil, because of '#{e}'"}
    nil
  end

end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require 'time_with_zone'
|
2
|
+
require_relative 'base'
|
3
|
+
require_relative 'timestamp_format_util'
|
4
|
+
|
5
|
+
module Embulk
|
6
|
+
module Input
|
7
|
+
class GoogleSpreadsheets < InputPlugin
|
8
|
+
module Typecast
|
9
|
+
# Typecast strategy built on Ruby's own conversions (`to_s`, `to_i`,
# `to_f`, `Time` parsing, `JSON.parse`). For every method, nil and the
# configured null string both yield nil.
class MinimalTypecast < Base

  # @return [String, nil]
  # @raise [TypecastError] when the value lacks a needed method.
  def as_string(value)
    return nil if value.nil? || null_string?(value)

    value.to_s
  rescue NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to String: \"#{value}\" because of '#{e}'"
  end

  # @return [Integer, nil]
  # @raise [TypecastError] when the value lacks a needed method.
  def as_long(value)
    return nil if value.nil? || null_string?(value)

    value.to_i
  rescue NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Long: \"#{value}\" because of '#{e}'"
  end

  # @return [Float, nil]
  # @raise [TypecastError] when the value lacks a needed method.
  def as_double(value)
    return nil if value.nil? || null_string?(value)

    value.to_f
  rescue NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Double: \"#{value}\" because of '#{e}'"
  end

  # Accepts real booleans and the strings "true"/"false" in any letter case.
  #
  # @return [Boolean, nil]
  # @raise [TypecastError] for any other string or any other class.
  def as_boolean(value)
    return nil if value.nil? || null_string?(value)

    case value
    when TrueClass, FalseClass
      value
    when String
      # The lower-cased text is also what appears in the error message.
      value = value.downcase
      return true if value == 'true'
      return false if value == 'false'

      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast '#{value}' to a boolean value."
    else
      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to a boolean value: \"#{value}\""
    end
  end

  # Parses a timestamp. With a format, `Time.strptime` is used when the
  # format is reported as timezone-aware or when no timezone is given;
  # otherwise the zone-aware strptime is used. Without a format, the value
  # is parsed with the timezone if one is given, else with `Time.parse`.
  #
  # @return [Time, nil]
  # @raise [TypecastError] on ArgumentError, TypeError or NoMethodError.
  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    return nil if value.nil? || null_string?(value)

    if timestamp_format
      if TimestampFormatUtil.timezone_format?(timestamp_format) || !timezone
        Time.strptime(value, timestamp_format)
      else
        TimeWithZone.strptime_with_zone(value, timestamp_format, timezone)
      end
    elsif timezone
      TimeWithZone.parse_with_zone(value, timezone)
    else
      Time.parse(value)
    end
  rescue ArgumentError, TypeError, NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Time: \"#{value}\" because of '#{e}'"
  end

  # Strings are parsed as JSON; booleans, numbers, arrays and hashes are
  # passed through untouched.
  #
  # @return [Object, nil]
  # @raise [TypecastError] for unparsable strings, Time values and any
  #   other class.
  def as_json(value)
    return nil if value.nil? || null_string?(value)

    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/lib/embulk/page_builder.rb#L20
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java#L97
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java#L66
    # cf. https://github.com/embulk/embulk/blob/997c7beb89d42122f7cb6fe844f8ca79a3cb666c/embulk-core/src/main/java/org/embulk/spi/util/dynamic/JsonColumnSetter.java#L50
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java#L47
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java#L57
    # NOTE: Judging from the code above, any object can be set as JSON
    #       (it must be a primitive type or respond to `to_msgpack`).
    case value
    when String
      begin
        JSON.parse(value)
      rescue JSON::ParserError => e
        raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\" because of '#{e}'"
      end
    when TrueClass, FalseClass, Integer, Float, Array, Hash
      value
    when Time
      # TODO: support Time class. Raise for now to avoid format/timezone trouble.
      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast Time to JSON: \"#{value}\""
    else
      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\""
    end
  end

end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|