embulk-input-google_spreadsheets 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +10 -0
- data/.travis.yml +12 -0
- data/CHANGELOG.md +67 -0
- data/Gemfile +3 -0
- data/LICENSE.txt +21 -0
- data/README.md +94 -0
- data/Rakefile +13 -0
- data/embulk-input-google_spreadsheets.gemspec +24 -0
- data/example/config_authorized_user.yml +19 -0
- data/example/config_authorized_user.yml.liquid +25 -0
- data/example/config_authorized_user_emoji_worksheet.yml +19 -0
- data/example/config_authorized_user_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
- data/example/config_authorized_user_large_data.yml +19 -0
- data/example/config_authorized_user_no_data.yml +18 -0
- data/example/config_service_account.yml +19 -0
- data/example/config_service_account_emoji_worksheet.yml +19 -0
- data/example/config_service_account_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
- data/example/config_service_account_large_data.yml +19 -0
- data/example/config_service_account_no_data.yml +18 -0
- data/example/setup_authorized_user_credentials.rb +34 -0
- data/lib/embulk/input/google_spreadsheets.rb +182 -0
- data/lib/embulk/input/google_spreadsheets/auth.rb +63 -0
- data/lib/embulk/input/google_spreadsheets/error.rb +36 -0
- data/lib/embulk/input/google_spreadsheets/pager.rb +107 -0
- data/lib/embulk/input/google_spreadsheets/pager_util.rb +28 -0
- data/lib/embulk/input/google_spreadsheets/record_typecaster.rb +73 -0
- data/lib/embulk/input/google_spreadsheets/spreadsheets_client.rb +75 -0
- data/lib/embulk/input/google_spreadsheets/spreadsheets_url_util.rb +23 -0
- data/lib/embulk/input/google_spreadsheets/typecast/base.rb +62 -0
- data/lib/embulk/input/google_spreadsheets/typecast/loose_typecast.rb +84 -0
- data/lib/embulk/input/google_spreadsheets/typecast/minimal_typecast.rb +109 -0
- data/lib/embulk/input/google_spreadsheets/typecast/strict_typecast.rb +236 -0
- data/lib/embulk/input/google_spreadsheets/typecast/timestamp_format_util.rb +29 -0
- data/lib/embulk/input/google_spreadsheets/typecast_factory.rb +34 -0
- data/test/assert_embulk_nothing_raised.rb +11 -0
- data/test/assert_embulk_raise.rb +11 -0
- data/test/dummy.key +27 -0
- data/test/helper.rb +21 -0
- data/test/test_auth.rb +82 -0
- data/test/test_configure.rb +155 -0
- data/test/test_loose_typecast.rb +194 -0
- data/test/test_minimal_typecast.rb +616 -0
- data/test/test_pager_util.rb +24 -0
- data/test/test_run_examples.rb +125 -0
- data/test/test_spreadsheets_client.rb +87 -0
- data/test/test_spreadsheets_url_util.rb +23 -0
- data/test/test_strict_typecast.rb +666 -0
- data/test/test_typecast_factory.rb +36 -0
- metadata +220 -0
@@ -0,0 +1,28 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class GoogleSpreadsheets < InputPlugin
|
4
|
+
# Helpers for turning numeric worksheet positions into column labels.
module PagerUtil

  # Converts a 1-based column number into a spreadsheet-style column label
  # (1 => "A", 26 => "Z", 27 => "AA", ...).
  #
  # The conversion is "bijective": there is no zero digit, which is why the
  # number is decremented before each digit is extracted.
  #
  # @param num [Integer] 1-based column number
  # @param base [Integer] number of distinct symbols per digit
  # @param offset [Integer] character code of the first symbol
  # @return [String] the label; an empty string when `num` is zero or negative
  def self.num2col(num, base = default_base, offset = default_offset)
    letters = []
    while num > 0
      num -= 1
      letters.unshift((num % base + offset).chr)
      num /= base
    end
    letters.join
  end

  # NOTE: the helpers below are public module methods. A bare `private`
  # keyword does not apply to `def self.` definitions, so none is used here.

  # @return [Integer] character code of 'A', the first column symbol
  def self.default_offset
    @default_offset ||= 'A'.ord
  end

  # @return [Integer] number of letters in the alphabet
  def self.default_base
    @default_base ||= 26 # number of alphabet
  end
end
|
26
|
+
end
|
27
|
+
end
|
28
|
+
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
require_relative 'typecast_factory'
|
2
|
+
|
3
|
+
|
4
|
+
module Embulk
|
5
|
+
module Input
|
6
|
+
class GoogleSpreadsheets < InputPlugin
|
7
|
+
# Casts one raw worksheet row into values matching the configured columns.
class RecordTypecaster

  attr_reader :column_names, :column_details

  # @param task [Hash] plugin task; `task['columns']` is read as an array of
  #   hashes with the keys 'name', 'type', 'format', 'timezone', 'typecast'
  def initialize(task)
    @column_names = task['columns'].map { |c| c['name'] }
    @column_details = configure_column_details(task)
  end

  # Builds a lookup table keyed by column name. Each entry records the
  # column's position, its type as a Symbol, its format/timezone and the
  # typecast object created by TypecastFactory for that column.
  #
  # @param task [Hash] plugin task (see #initialize)
  # @return [Hash{String => Hash}] column name => detail hash
  def configure_column_details(task)
    details = task['columns'].each_with_index.each_with_object({}) do |(column, index), acc|
      acc[column['name']] = {
        'index'    => index,
        'name'     => column['name'],
        'type'     => column['type'].to_sym,
        'format'   => column['format'],
        'timezone' => column['timezone'],
        'typecast' => TypecastFactory.create(column['typecast'], task)
      }
    end

    logger.debug { "`embulk-input-google_spreadsheets`: configured column details '#{details.to_json}'"}
    details
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Typecasts every configured column of a single record.
  #
  # Any StandardError raised while casting a value is re-raised as a
  # TypecastError carrying the column name and detail. This includes the
  # ConfigError raised below for an unsupported type, because it is raised
  # inside the same begin/rescue.
  #
  # @param record [Array] raw row values, indexed by column position
  # @return [Array] typecasted values in `column_names` order
  def transform_by_columns(record)
    column_names.map do |n|
      d = column_details[n]
      typecast = d['typecast']
      value = record[d['index']]
      type = d['type']

      begin
        case type
        when :string
          typecast.as_string(value)
        when :long
          typecast.as_long(value)
        when :double
          typecast.as_double(value)
        when :boolean
          typecast.as_boolean(value)
        when :timestamp
          typecast.as_timestamp(value, d['format'], d['timezone'])
        when :json
          typecast.as_json(value)
        else
          raise ConfigError.new("`google_spreadsheets`: Unsupported type `#{type}`")
        end
      rescue => e
        # for adding column information
        raise TypecastError.new(e, ", column: #{n}, column_detail: #{d.to_json}")
      end
    end
  end

end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require 'google/apis/sheets_v4'
|
2
|
+
require_relative 'spreadsheets_url_util'
|
3
|
+
|
4
|
+
module Embulk
|
5
|
+
module Input
|
6
|
+
class GoogleSpreadsheets < InputPlugin
|
7
|
+
|
8
|
+
# Thin wrapper around the Sheets v4 service for one worksheet of one
# spreadsheet, as configured by the plugin task.
class SpreadsheetsClient

  attr_accessor :spreadsheets_url, :worksheet_title, :auth, :pager

  # @param task [Hash] plugin task; reads 'spreadsheets_url' and 'worksheet_title'
  # @param auth [#authenticate] object whose `authenticate` result is used as
  #   the service authorization
  # @param pager [#each_record] object that drives record iteration
  def initialize(task, auth:, pager:)
    @spreadsheets_url = task['spreadsheets_url']
    @worksheet_title = task['worksheet_title']
    @auth = auth
    @pager = pager
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # @return [String] application name reported to the API client
  def application_name
    @application_name ||= 'embulk-input-google_spreadsheets'
  end

  # @return [String, nil] whatever SpreadsheetsUrlUtil.capture_id extracts
  #   from the configured URL
  def spreadsheets_id
    SpreadsheetsUrlUtil.capture_id(spreadsheets_url)
  end

  # Fetches spreadsheet metadata (no grid data) restricted to the configured
  # worksheet. The result is not memoized: every call, and every helper
  # below that goes through it, issues a new `get_spreadsheet` request.
  def spreadsheets
    service.get_spreadsheet(spreadsheets_id, ranges: worksheet_title, include_grid_data: false)
  end

  # @return the first sheet of the metadata response
  def worksheet
    spreadsheets.sheets.first
  end

  def worksheet_properties
    worksheet.properties
  end

  def worksheet_grid_properties
    worksheet_properties.grid_properties
  end

  # @return the `row_count` of the worksheet's grid properties
  def worksheet_max_row_num
    worksheet_grid_properties.row_count
  end

  # @return the `column_count` of the worksheet's grid properties
  def worksheet_max_column_num
    worksheet_grid_properties.column_count
  end

  # Loads the cell values for a range of the configured worksheet.
  #
  # @param range [String] range without the sheet name; the worksheet title
  #   and "!" are prepended here
  # @return the `values` of the `get_spreadsheet_values` response
  def worksheet_values(range)
    qualified_range = "#{worksheet_title}!#{range}"
    logger.info { "`embulk-input-google_spreadsheets`: load data from spreadsheet: '#{spreadsheets_url}', range: '#{qualified_range}'" }
    service.get_spreadsheet_values(spreadsheets_id, qualified_range).values
  end

  # Delegates iteration to the pager, passing this client and the block.
  def worksheet_each_record(&block)
    pager.each_record(self, &block)
  end

  # Lazily builds the Sheets service (created once per client) and sets its
  # application name and authorization.
  def service
    @service ||= Google::Apis::SheetsV4::SheetsService.new.tap do |s|
      s.client_options.application_name = application_name
      s.authorization = auth.authenticate
    end
  end

end
|
73
|
+
end
|
74
|
+
end
|
75
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class GoogleSpreadsheets < InputPlugin
|
4
|
+
# Helpers for pulling the spreadsheet id out of a Google Sheets URL.
module SpreadsheetsUrlUtil

  # Extracts the spreadsheet id, i.e. the path segment that directly follows
  # `base_url`.
  #
  # @param url [String] spreadsheet URL
  # @return [String, nil] the id, or nil when the URL does not contain `base_url`
  def self.capture_id(url)
    url[capture_id_regex, 1]
  end

  # @return [String] URL prefix that precedes the spreadsheet id
  def self.base_url
    @base_url ||= 'https://docs.google.com/spreadsheets/d/'
  end

  # The prefix is escaped so its dots match literally, and the id ends at the
  # first "/", "?" or "#". A trailing slash is not required, so URLs that
  # stop right after the id are accepted too.
  #
  # @return [Regexp] pattern whose first capture group is the spreadsheet id
  def self.capture_id_regex
    @capture_id_regex ||= %r{#{Regexp.escape(base_url)}([^/?#]+)}
  end
end
|
20
|
+
end
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
@@ -0,0 +1,62 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class GoogleSpreadsheets < InputPlugin
|
4
|
+
module Typecast
|
5
|
+
# Abstract base for typecast strategies. Subclasses override the `as_*`
# methods; each default implementation here raises NotImplementedError.
class Base

  attr_reader :null_string

  # @param task [Hash] plugin task; `task['null_string']` is the cell text
  #   that `null_string?` compares against
  def initialize(task)
    @null_string = task['null_string']
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Serializes the class name (under `JSON.create_id`) followed by every
  # instance variable.
  def to_json(*args) # for logging
    spec = instance_variables.each_with_object({ JSON.create_id => self.class.name }) do |ivar, acc|
      acc[ivar] = instance_variable_get(ivar)
    end
    spec.to_json(*args)
  end

  def as_string(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_long(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_double(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_boolean(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_timestamp(value, timestamp_format, timezone)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_json(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  protected

  # @param value [Object] raw cell value
  # @return [Boolean] true only when `value` is a String equal to `null_string`
  def null_string?(value)
    value.is_a?(String) && value == null_string
  end
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
end
|
62
|
+
end
|
@@ -0,0 +1,84 @@
|
|
1
|
+
require_relative 'strict_typecast'
|
2
|
+
|
3
|
+
module Embulk
|
4
|
+
module Input
|
5
|
+
class GoogleSpreadsheets < InputPlugin
|
6
|
+
module Typecast
|
7
|
+
# Same conversions as StrictTypecast, except that a TypecastError raised by
# the parent implementation is logged at trace level and turned into nil.
# Any other exception propagates unchanged.
class LooseTypecast < StrictTypecast
  def as_string(value)
    nil_on_typecast_error { super }
  end

  def as_long(value)
    nil_on_typecast_error { super }
  end

  def as_double(value)
    nil_on_typecast_error { super }
  end

  def as_boolean(value)
    nil_on_typecast_error { super }
  end

  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    nil_on_typecast_error { super }
  end

  def as_json(value)
    nil_on_typecast_error { super }
  end

  private

  # Yields to the block and returns its result. Only TypecastError is
  # rescued; it is logged and replaced by nil.
  def nil_on_typecast_error
    yield
  rescue TypecastError => e
    logger.trace{"`embulk-input-google_spreadsheets`: Fallback to nil, because of '#{e}'"}
    nil
  end

end
|
81
|
+
end
|
82
|
+
end
|
83
|
+
end
|
84
|
+
end
|
@@ -0,0 +1,109 @@
|
|
1
|
+
require 'time_with_zone'
|
2
|
+
require_relative 'base'
|
3
|
+
require_relative 'timestamp_format_util'
|
4
|
+
|
5
|
+
module Embulk
|
6
|
+
module Input
|
7
|
+
class GoogleSpreadsheets < InputPlugin
|
8
|
+
module Typecast
|
9
|
+
# Typecast strategy that relies on Ruby's own conversion methods
# (`to_s`, `to_i`, `to_f`, `Time.parse`, `JSON.parse`).
#
# Every `as_*` method returns nil for a nil value or for a String equal to
# the configured null string, and raises TypecastError when conversion fails.
class MinimalTypecast < Base

  # @return [String, nil] `value.to_s`
  def as_string(value)
    return nil if value.nil?
    return nil if null_string?(value)
    value.to_s
  rescue NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to String: \"#{value}\" because of '#{e}'"
  end

  # @return [Integer, nil] `value.to_i`
  def as_long(value)
    return nil if value.nil?
    return nil if null_string?(value)
    value.to_i
  rescue NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Long: \"#{value}\" because of '#{e}'"
  end

  # @return [Float, nil] `value.to_f`
  def as_double(value)
    return nil if value.nil?
    return nil if null_string?(value)
    value.to_f
  rescue NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Double: \"#{value}\" because of '#{e}'"
  end

  # Accepts real booleans and the strings "true"/"false" in any letter case.
  #
  # @return [Boolean, nil]
  # @raise [TypecastError] for any other string or any other class
  def as_boolean(value)
    return nil if value.nil?
    return nil if null_string?(value)

    case value
    when TrueClass, FalseClass
      value
    when String
      lowered = value.downcase
      return true if lowered == 'true'
      return false if lowered == 'false'
      # The message reports the downcased text, not the raw cell value.
      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast '#{lowered}' to a boolean value."
    else
      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to a boolean value: \"#{value}\""
    end
  end

  # Parses a timestamp. The parser is chosen in this order:
  #   1. the format passes TimestampFormatUtil.timezone_format? -> Time.strptime
  #   2. format and timezone are both given -> TimeWithZone.strptime_with_zone
  #   3. only a timezone is given           -> TimeWithZone.parse_with_zone
  #   4. only a format is given             -> Time.strptime
  #   5. neither is given                   -> Time.parse
  #
  # @return [Time, nil]
  # @raise [TypecastError] when ArgumentError, TypeError or NoMethodError is
  #   raised while parsing
  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    return nil if value.nil?
    return nil if null_string?(value)

    if timestamp_format && TimestampFormatUtil.timezone_format?(timestamp_format)
      Time.strptime(value, timestamp_format)
    elsif timestamp_format && timezone
      TimeWithZone.strptime_with_zone(value, timestamp_format, timezone)
    elsif timezone
      TimeWithZone.parse_with_zone(value, timezone)
    elsif timestamp_format
      Time.strptime(value, timestamp_format)
    else
      Time.parse(value)
    end
  rescue ArgumentError, TypeError, NoMethodError => e
    raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Time: \"#{value}\" because of '#{e}'"
  end

  # Booleans, Integers, Floats, Arrays and Hashes pass through unchanged;
  # Strings are parsed as JSON; Time and every other class are rejected.
  #
  # @return [Object, nil]
  # @raise [TypecastError] on unparsable strings and unsupported classes
  def as_json(value)
    return nil if value.nil?
    return nil if null_string?(value)

    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/lib/embulk/page_builder.rb#L20
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java#L97
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java#L66
    # cf. https://github.com/embulk/embulk/blob/997c7beb89d42122f7cb6fe844f8ca79a3cb666c/embulk-core/src/main/java/org/embulk/spi/util/dynamic/JsonColumnSetter.java#L50
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java#L47
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java#L57
    # NOTE: As far as the code above shows, any object can be set as JSON
    #       (it must be a primitive type or must have a `to_msgpack` method).
    case value
    when TrueClass, FalseClass, Integer, Float, Array, Hash
      value
    when String
      begin
        JSON.parse(value)
      rescue JSON::ParserError => e
        raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\" because of '#{e}'"
      end
    when Time
      # TODO: support the Time class. For now an error is raised to avoid format/timezone trouble.
      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast Time to JSON: \"#{value}\""
    else
      raise TypecastError.new "`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\""
    end
  end

end
|
106
|
+
end
|
107
|
+
end
|
108
|
+
end
|
109
|
+
end
|