embulk-input-google_spreadsheets 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.travis.yml +12 -0
  4. data/CHANGELOG.md +67 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +94 -0
  8. data/Rakefile +13 -0
  9. data/embulk-input-google_spreadsheets.gemspec +24 -0
  10. data/example/config_authorized_user.yml +19 -0
  11. data/example/config_authorized_user.yml.liquid +25 -0
  12. data/example/config_authorized_user_emoji_worksheet.yml +19 -0
  13. data/example/config_authorized_user_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  14. data/example/config_authorized_user_large_data.yml +19 -0
  15. data/example/config_authorized_user_no_data.yml +18 -0
  16. data/example/config_service_account.yml +19 -0
  17. data/example/config_service_account_emoji_worksheet.yml +19 -0
  18. data/example/config_service_account_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  19. data/example/config_service_account_large_data.yml +19 -0
  20. data/example/config_service_account_no_data.yml +18 -0
  21. data/example/setup_authorized_user_credentials.rb +34 -0
  22. data/lib/embulk/input/google_spreadsheets.rb +182 -0
  23. data/lib/embulk/input/google_spreadsheets/auth.rb +63 -0
  24. data/lib/embulk/input/google_spreadsheets/error.rb +36 -0
  25. data/lib/embulk/input/google_spreadsheets/pager.rb +107 -0
  26. data/lib/embulk/input/google_spreadsheets/pager_util.rb +28 -0
  27. data/lib/embulk/input/google_spreadsheets/record_typecaster.rb +73 -0
  28. data/lib/embulk/input/google_spreadsheets/spreadsheets_client.rb +75 -0
  29. data/lib/embulk/input/google_spreadsheets/spreadsheets_url_util.rb +23 -0
  30. data/lib/embulk/input/google_spreadsheets/typecast/base.rb +62 -0
  31. data/lib/embulk/input/google_spreadsheets/typecast/loose_typecast.rb +84 -0
  32. data/lib/embulk/input/google_spreadsheets/typecast/minimal_typecast.rb +109 -0
  33. data/lib/embulk/input/google_spreadsheets/typecast/strict_typecast.rb +236 -0
  34. data/lib/embulk/input/google_spreadsheets/typecast/timestamp_format_util.rb +29 -0
  35. data/lib/embulk/input/google_spreadsheets/typecast_factory.rb +34 -0
  36. data/test/assert_embulk_nothing_raised.rb +11 -0
  37. data/test/assert_embulk_raise.rb +11 -0
  38. data/test/dummy.key +27 -0
  39. data/test/helper.rb +21 -0
  40. data/test/test_auth.rb +82 -0
  41. data/test/test_configure.rb +155 -0
  42. data/test/test_loose_typecast.rb +194 -0
  43. data/test/test_minimal_typecast.rb +616 -0
  44. data/test/test_pager_util.rb +24 -0
  45. data/test/test_run_examples.rb +125 -0
  46. data/test/test_spreadsheets_client.rb +87 -0
  47. data/test/test_spreadsheets_url_util.rb +23 -0
  48. data/test/test_strict_typecast.rb +666 -0
  49. data/test/test_typecast_factory.rb +36 -0
  50. metadata +220 -0
@@ -0,0 +1,28 @@
1
+ module Embulk
2
+ module Input
3
+ class GoogleSpreadsheets < InputPlugin
4
# Utility for turning numeric column positions into spreadsheet column letters.
module PagerUtil

  # Converts a 1-based column number into column letters using a bijective
  # numeral system (no zero digit): 1 => "A", 26 => "Z", 27 => "AA", 703 => "AAA".
  #
  # @param num [Integer] 1-based column number
  # @param base [Integer] how many distinct symbols one digit can take
  # @param offset [Integer] character code of the first symbol
  # @return [String] the column label; an empty string when +num+ is zero or negative
  def self.num2col(num, base = default_base, offset = default_offset)
    letters = []
    while num > 0
      num -= 1 # shift to 0-based so that a multiple of `base` maps to the last symbol
      letters.unshift((num % base + offset).chr)
      num /= base
    end
    letters.join
  end

  # NOTE: these helpers were previously listed under a bare `private`, which
  # does not apply to `def self.` methods. They have therefore always been
  # publicly callable; the misleading keyword is dropped and they stay public.

  # @return [Integer] character code of 'A', the first column symbol
  def self.default_offset
    @default_offset ||= 'A'.ord
  end

  # @return [Integer] number of symbols per digit
  def self.default_base
    @default_base ||= 26 # number of alphabet
  end
end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,73 @@
1
+ require_relative 'typecast_factory'
2
+
3
+
4
+ module Embulk
5
+ module Input
6
+ class GoogleSpreadsheets < InputPlugin
7
# Applies the per-column typecast configuration from `task['columns']`
# to one worksheet row at a time.
class RecordTypecaster

  attr_reader :column_names, :column_details

  # @param task [Hash] plugin task; `task['columns']` is read for the column
  #   names and for each column's type/format/timezone/typecast settings
  def initialize(task)
    @column_names = task['columns'].map { |column| column['name'] }
    @column_details = configure_column_details(task)
  end

  # Builds a lookup keyed by column name. Each entry records the column's
  # position, name, type (as a Symbol), format, timezone and the typecast
  # object returned by TypecastFactory.
  #
  # @param task [Hash] plugin task
  # @return [Hash] column name => detail hash
  def configure_column_details(task)
    details_by_name = {}
    task['columns'].each_with_index do |column, position|
      details_by_name[column['name']] = {
        'index' => position,
        'name' => column['name'],
        'type' => column['type'].to_sym,
        'format' => column['format'],
        'timezone' => column['timezone'],
        'typecast' => TypecastFactory.create(column['typecast'], task)
      }
    end

    logger.debug { "`embulk-input-google_spreadsheets`: configured column details '#{details_by_name.to_json}'"}
    details_by_name
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Typecasts one row. Values are picked from +record+ by each column's
  # configured index and returned in `column_names` order.
  #
  # @param record [Array] raw row values
  # @return [Array] typecast values
  # @raise [TypecastError] wraps any StandardError raised while casting
  #   (including the ConfigError for an unsupported type) and appends the
  #   column name and column detail
  def transform_by_columns(record)
    column_names.map do |name|
      detail = column_details[name]
      caster = detail['typecast']
      raw = record[detail['index']]
      kind = detail['type']

      begin
        case kind
        when :string    then caster.as_string(raw)
        when :long      then caster.as_long(raw)
        when :double    then caster.as_double(raw)
        when :boolean   then caster.as_boolean(raw)
        when :timestamp then caster.as_timestamp(raw, detail['format'], detail['timezone'])
        when :json      then caster.as_json(raw)
        else
          raise ConfigError.new("`google_spreadsheets`: Unsupported type `#{kind}`")
        end
      rescue => e
        # for adding column information
        raise TypecastError.new(e, ", column: #{name}, column_detail: #{detail.to_json}")
      end
    end
  end

end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,75 @@
1
+ require 'google/apis/sheets_v4'
2
+ require_relative 'spreadsheets_url_util'
3
+
4
+ module Embulk
5
+ module Input
6
+ class GoogleSpreadsheets < InputPlugin
7
+
8
# Wraps a Sheets v4 service object for one spreadsheet URL / worksheet title
# taken from the task, and delegates record iteration to the given pager.
class SpreadsheetsClient

  attr_accessor :spreadsheets_url, :worksheet_title, :auth, :pager

  # @param task [Hash] plugin task; reads 'spreadsheets_url' and 'worksheet_title'
  # @param auth [#authenticate] object whose `authenticate` result is used as
  #   the service authorization
  # @param pager [#each_record] object that drives iteration over records
  def initialize(task, auth:, pager:)
    @spreadsheets_url = task['spreadsheets_url']
    @worksheet_title = task['worksheet_title']
    @auth = auth
    @pager = pager
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # @return [String] application name set on the service client options
  def application_name
    @application_name ||= 'embulk-input-google_spreadsheets'
  end

  # @return [String, nil] whatever SpreadsheetsUrlUtil.capture_id extracts
  #   from the configured URL
  def spreadsheets_id
    SpreadsheetsUrlUtil.capture_id(spreadsheets_url)
  end

  # Requests the spreadsheet restricted to the configured worksheet, without
  # grid data. Not memoised: every call issues a new `get_spreadsheet`.
  def spreadsheets
    service.get_spreadsheet(spreadsheets_id, ranges: worksheet_title, include_grid_data: false)
  end

  # @return [Object] first sheet of the `spreadsheets` response
  def worksheet
    spreadsheets.sheets.first
  end

  def worksheet_properties
    worksheet.properties
  end

  def worksheet_grid_properties
    worksheet_properties.grid_properties
  end

  # @return [Integer] `row_count` of the worksheet grid properties
  def worksheet_max_row_num
    worksheet_grid_properties.row_count
  end

  # @return [Integer] `column_count` of the worksheet grid properties
  def worksheet_max_column_num
    worksheet_grid_properties.column_count
  end

  # Fetches the values of +range+ within the configured worksheet.
  #
  # @param range [String] range without the sheet name; the worksheet title
  #   and "!" are prepended before the request
  # @return [Object] `values` of the `get_spreadsheet_values` response
  def worksheet_values(range)
    qualified_range = "#{worksheet_title}!#{range}"
    logger.info { "`embulk-input-google_spreadsheets`: load data from spreadsheet: '#{spreadsheets_url}', range: '#{qualified_range}'" }
    response = service.get_spreadsheet_values(spreadsheets_id, qualified_range)
    response.values
  end

  # Yields records by handing this client and the block to the pager.
  def worksheet_each_record(&block)
    pager.each_record(self, &block)
  end

  # Lazily builds the Sheets service; it is cached only after both the
  # application name and the authorization have been set successfully.
  def service
    return @service if @service

    sheets_service = Google::Apis::SheetsV4::SheetsService.new
    sheets_service.client_options.application_name = application_name
    sheets_service.authorization = auth.authenticate
    @service = sheets_service
  end

end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,23 @@
1
+ module Embulk
2
+ module Input
3
+ class GoogleSpreadsheets < InputPlugin
4
# Helpers for picking the spreadsheet ID out of a Google Spreadsheets URL.
module SpreadsheetsUrlUtil

  # Extracts the spreadsheet ID that follows `base_url` in +url+.
  #
  # The ID ends at the next "/", "?" or "#", or at the end of the string, so
  # URLs without a trailing path segment (".../d/<id>" or ".../d/<id>?usp=sharing")
  # are accepted as well as ".../d/<id>/edit#gid=0".
  #
  # @param url [String, nil] spreadsheet URL
  # @return [String, nil] the ID, or nil when +url+ is nil or does not contain
  #   `base_url` followed by an ID
  def self.capture_id(url)
    url.to_s[capture_id_regex, 1]
  end

  # @return [String] URL prefix that directly precedes the spreadsheet ID
  def self.base_url
    @base_url ||= 'https://docs.google.com/spreadsheets/d/'
  end

  # @return [Regexp] pattern whose first capture group is the spreadsheet ID
  def self.capture_id_regex
    # Regexp.escape keeps the dots in the host name literal instead of
    # letting them match any character.
    @capture_id_regex ||= %r{#{Regexp.escape(base_url)}([^/?#]+)}
  end
end
20
+ end
21
+ end
22
+ end
23
+
@@ -0,0 +1,62 @@
1
+ module Embulk
2
+ module Input
3
+ class GoogleSpreadsheets < InputPlugin
4
+ module Typecast
5
# Abstract parent of the typecast classes. It stores the configured null
# string and declares the `as_*` conversions that subclasses must override.
class Base

  attr_reader :null_string

  # @param task [Hash] plugin task; only 'null_string' is read here
  def initialize(task)
    @null_string = task['null_string']
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Serialises the class name (under JSON.create_id) followed by every
  # instance variable, keyed by the variable's name.
  def to_json(*args) # for logging
    snapshot = instance_variables.each_with_object({ JSON.create_id => self.class.name }) do |ivar, acc|
      acc[ivar] = instance_variable_get(ivar)
    end
    snapshot.to_json(*args)
  end

  # The conversions below are placeholders; each one raises until overridden.

  def as_string(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_long(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_double(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_boolean(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_timestamp(value, timestamp_format, timezone)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  def as_json(value)
    raise NotImplementedError, '`embulk-input-google_spreadsheets`: override this.'
  end

  protected

  # @param value [Object] candidate cell value
  # @return [Boolean] true only when +value+ is a String equal to `null_string`
  def null_string?(value)
    (value.is_a?(String) && value == null_string) ? true : false
  end
end
59
+ end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,84 @@
1
+ require_relative 'strict_typecast'
2
+
3
+ module Embulk
4
+ module Input
5
+ class GoogleSpreadsheets < InputPlugin
6
+ module Typecast
7
# Lenient flavour of StrictTypecast: every `as_*` conversion delegates to the
# parent implementation and yields nil instead of raising when the parent
# fails with a TypecastError. Any other error is re-raised untouched.
class LooseTypecast < StrictTypecast
  def as_string(value)
    nil_on_typecast_error { super }
  end

  def as_long(value)
    nil_on_typecast_error { super }
  end

  def as_double(value)
    nil_on_typecast_error { super }
  end

  def as_boolean(value)
    nil_on_typecast_error { super }
  end

  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    nil_on_typecast_error { super }
  end

  def as_json(value)
    nil_on_typecast_error { super }
  end

  private

  # Runs the given block and returns its result. A TypecastError raised by
  # the block is logged at trace level and replaced by nil; every other
  # StandardError propagates.
  def nil_on_typecast_error
    yield
  rescue => e
    raise e unless e.is_a?(TypecastError)

    logger.trace { "`embulk-input-google_spreadsheets`: Fallback to nil, because of '#{e}'" }
    nil
  end

end
81
+ end
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,109 @@
1
+ require 'time_with_zone'
2
+ require_relative 'base'
3
+ require_relative 'timestamp_format_util'
4
+
5
+ module Embulk
6
+ module Input
7
+ class GoogleSpreadsheets < InputPlugin
8
+ module Typecast
9
# Typecast implementation that relies on Ruby's own conversion methods
# (`to_s`, `to_i`, `to_f`, `Time.strptime`/`Time.parse`, `JSON.parse`).
# In every conversion, nil and the configured null string map to nil.
class MinimalTypecast < Base

  # @return [String, nil] `value.to_s`
  # @raise [TypecastError] when a NoMethodError occurs during the conversion
  def as_string(value)
    return nil if value.nil? || null_string?(value)

    value.to_s
  rescue NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to String: \"#{value}\" because of '#{e}'")
  end

  # @return [Integer, nil] `value.to_i`
  # @raise [TypecastError] when a NoMethodError occurs during the conversion
  def as_long(value)
    return nil if value.nil? || null_string?(value)

    value.to_i
  rescue NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Long: \"#{value}\" because of '#{e}'")
  end

  # @return [Float, nil] `value.to_f`
  # @raise [TypecastError] when a NoMethodError occurs during the conversion
  def as_double(value)
    return nil if value.nil? || null_string?(value)

    value.to_f
  rescue NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Double: \"#{value}\" because of '#{e}'")
  end

  # Accepts true/false as they are and the strings "true"/"false" in any
  # letter case.
  #
  # @return [Boolean, nil]
  # @raise [TypecastError] for any other string or any other class
  def as_boolean(value)
    return nil if value.nil? || null_string?(value)

    case value
    when TrueClass, FalseClass
      value
    when String
      lowered = value.downcase
      return true if lowered == 'true'
      return false if lowered == 'false'

      raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast '#{lowered}' to a boolean value.")
    else
      raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to a boolean value: \"#{value}\"")
    end
  end

  # Parses +value+ into a Time.
  #
  # * with a format: `Time.strptime` is used when the format itself is
  #   reported as a timezone format by TimestampFormatUtil or when no
  #   timezone is given; otherwise `TimeWithZone.strptime_with_zone`
  # * without a format: `TimeWithZone.parse_with_zone` when a timezone is
  #   given, else `Time.parse`
  #
  # @raise [TypecastError] on ArgumentError, TypeError or NoMethodError
  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    return nil if value.nil? || null_string?(value)

    if timestamp_format
      if TimestampFormatUtil.timezone_format?(timestamp_format) || !timezone
        Time.strptime(value, timestamp_format)
      else
        TimeWithZone.strptime_with_zone(value, timestamp_format, timezone)
      end
    elsif timezone
      TimeWithZone.parse_with_zone(value, timezone)
    else
      Time.parse(value)
    end
  rescue ArgumentError, TypeError, NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Time: \"#{value}\" because of '#{e}'")
  end

  # Strings are parsed with `JSON.parse`; booleans, Integer, Float, Array and
  # Hash values are returned unchanged; everything else is rejected.
  #
  # @raise [TypecastError] for unparsable strings, Time values and any other class
  def as_json(value)
    return nil if value.nil? || null_string?(value)

    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/lib/embulk/page_builder.rb#L20
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java#L97
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java#L66
    # cf. https://github.com/embulk/embulk/blob/997c7beb89d42122f7cb6fe844f8ca79a3cb666c/embulk-core/src/main/java/org/embulk/spi/util/dynamic/JsonColumnSetter.java#L50
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java#L47
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java#L57
    # NOTE: Judging from the code above, any object can be set as Json
    #       (it must be a primitive type or must have a `to_msgpack` method.)
    case value
    when String
      begin
        JSON.parse(value)
      rescue JSON::ParserError => e
        raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\" because of '#{e}'")
      end
    when TrueClass, FalseClass, Integer, Float, Array, Hash
      value
    when Time
      # TODO: support Time class. For now an error is raised to avoid format/timezone trouble.
      raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast Time to JSON: \"#{value}\"")
    else
      raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\"")
    end
  end

end
106
+ end
107
+ end
108
+ end
109
+ end