embulk-input-google_spreadsheets 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +10 -0
  3. data/.travis.yml +12 -0
  4. data/CHANGELOG.md +67 -0
  5. data/Gemfile +3 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +94 -0
  8. data/Rakefile +13 -0
  9. data/embulk-input-google_spreadsheets.gemspec +24 -0
  10. data/example/config_authorized_user.yml +19 -0
  11. data/example/config_authorized_user.yml.liquid +25 -0
  12. data/example/config_authorized_user_emoji_worksheet.yml +19 -0
  13. data/example/config_authorized_user_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  14. data/example/config_authorized_user_large_data.yml +19 -0
  15. data/example/config_authorized_user_no_data.yml +18 -0
  16. data/example/config_service_account.yml +19 -0
  17. data/example/config_service_account_emoji_worksheet.yml +19 -0
  18. data/example/config_service_account_empty_rows_appears_at_the_same_as_max_fetch_rows.yml +20 -0
  19. data/example/config_service_account_large_data.yml +19 -0
  20. data/example/config_service_account_no_data.yml +18 -0
  21. data/example/setup_authorized_user_credentials.rb +34 -0
  22. data/lib/embulk/input/google_spreadsheets.rb +182 -0
  23. data/lib/embulk/input/google_spreadsheets/auth.rb +63 -0
  24. data/lib/embulk/input/google_spreadsheets/error.rb +36 -0
  25. data/lib/embulk/input/google_spreadsheets/pager.rb +107 -0
  26. data/lib/embulk/input/google_spreadsheets/pager_util.rb +28 -0
  27. data/lib/embulk/input/google_spreadsheets/record_typecaster.rb +73 -0
  28. data/lib/embulk/input/google_spreadsheets/spreadsheets_client.rb +75 -0
  29. data/lib/embulk/input/google_spreadsheets/spreadsheets_url_util.rb +23 -0
  30. data/lib/embulk/input/google_spreadsheets/typecast/base.rb +62 -0
  31. data/lib/embulk/input/google_spreadsheets/typecast/loose_typecast.rb +84 -0
  32. data/lib/embulk/input/google_spreadsheets/typecast/minimal_typecast.rb +109 -0
  33. data/lib/embulk/input/google_spreadsheets/typecast/strict_typecast.rb +236 -0
  34. data/lib/embulk/input/google_spreadsheets/typecast/timestamp_format_util.rb +29 -0
  35. data/lib/embulk/input/google_spreadsheets/typecast_factory.rb +34 -0
  36. data/test/assert_embulk_nothing_raised.rb +11 -0
  37. data/test/assert_embulk_raise.rb +11 -0
  38. data/test/dummy.key +27 -0
  39. data/test/helper.rb +21 -0
  40. data/test/test_auth.rb +82 -0
  41. data/test/test_configure.rb +155 -0
  42. data/test/test_loose_typecast.rb +194 -0
  43. data/test/test_minimal_typecast.rb +616 -0
  44. data/test/test_pager_util.rb +24 -0
  45. data/test/test_run_examples.rb +125 -0
  46. data/test/test_spreadsheets_client.rb +87 -0
  47. data/test/test_spreadsheets_url_util.rb +23 -0
  48. data/test/test_strict_typecast.rb +666 -0
  49. data/test/test_typecast_factory.rb +36 -0
  50. metadata +220 -0
@@ -0,0 +1,28 @@
1
+ module Embulk
2
+ module Input
3
+ class GoogleSpreadsheets < InputPlugin
4
# Helpers for translating numeric positions into spreadsheet column labels.
module PagerUtil
  # Character code of the first column letter.
  DEFAULT_OFFSET = 'A'.ord
  # Number of letters available for one label position (A-Z).
  DEFAULT_BASE = 26

  # Converts a 1-based column number into its letter label
  # (1 => "A", 26 => "Z", 27 => "AA", 703 => "AAA").
  #
  # @param num [Integer] 1-based column number
  # @param base [Integer] number of distinct symbols per position
  # @param offset [Integer] character code of the first symbol
  # @return [String] the label; an empty string when +num+ is zero or negative
  def self.num2col(num, base = default_base, offset = default_offset)
    letters = []
    while num > 0
      # Labels have no "zero" symbol, so shift to a 0-based value before
      # splitting off the lowest position.
      num, digit = (num - 1).divmod(base)
      letters.unshift((digit + offset).chr)
    end
    letters.join
  end

  # NOTE: these two helpers used to sit below a bare `private`, which has no
  # effect on `def self.` methods. They have therefore always been callable
  # from outside and are kept that way for backward compatibility.

  # @return [Integer] character code of 'A'
  def self.default_offset
    DEFAULT_OFFSET
  end

  # @return [Integer] number of letters in the alphabet
  def self.default_base
    DEFAULT_BASE
  end
end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,73 @@
1
+ require_relative 'typecast_factory'
2
+
3
+
4
+ module Embulk
5
+ module Input
6
+ class GoogleSpreadsheets < InputPlugin
7
# Converts raw worksheet rows (arrays of cell values) into arrays of values
# typed according to the `columns` section of the task.
class RecordTypecaster
  # Supported column types and the typecast method that handles each of them.
  TYPECAST_METHODS = {
    string: :as_string,
    long: :as_long,
    double: :as_double,
    boolean: :as_boolean,
    timestamp: :as_timestamp,
    json: :as_json,
  }.freeze

  attr_reader :column_names, :column_details

  # @param task [Hash] task whose 'columns' entry is an array of column
  #   definitions with string keys ('name', 'type', 'format', 'timezone',
  #   'typecast')
  def initialize(task)
    @column_names = task['columns'].map { |c| c['name'] }
    @column_details = configure_column_details(task)
  end

  # Builds the per-column lookup table, keyed by column name.
  #
  # @param task [Hash] see #initialize
  # @return [Hash{String => Hash}] column name => detail hash holding the
  #   column position, name, type (as a Symbol), format, timezone and the
  #   typecast object created by TypecastFactory
  def configure_column_details(task)
    details = task['columns'].each_with_index.each_with_object({}) do |(column, index), acc|
      acc[column['name']] = {
        'index' => index,
        'name' => column['name'],
        'type' => column['type'].to_sym,
        'format' => column['format'],
        'timezone' => column['timezone'],
        'typecast' => TypecastFactory.create(column['typecast'], task),
      }
    end

    logger.debug { "`embulk-input-google_spreadsheets`: configured column details '#{details.to_json}'"}
    details
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Typecasts one record, column by column, in configured column order.
  #
  # @param record [Array] raw cell values, addressed by column position
  # @return [Array] typecast values
  # @raise [ConfigError] when a column is configured with an unsupported type
  # @raise [TypecastError] when a value cannot be converted; the message is
  #   extended with the column name and its details
  def transform_by_columns(record)
    column_names.map do |name|
      detail = column_details[name]
      typecast = detail['typecast']
      value = record[detail['index']]
      type = detail['type']

      # Validate the type outside the rescued region: a bad configuration
      # must surface as ConfigError and not be re-labelled as a TypecastError.
      caster = TYPECAST_METHODS[type]
      raise ConfigError.new("`google_spreadsheets`: Unsupported type `#{type}`") unless caster

      begin
        if type == :timestamp
          typecast.public_send(caster, value, detail['format'], detail['timezone'])
        else
          typecast.public_send(caster, value)
        end
      rescue => e
        # for adding column information
        raise TypecastError.new(e, ", column: #{name}, column_detail: #{detail.to_json}")
      end
    end
  end
end
71
+ end
72
+ end
73
+ end
@@ -0,0 +1,75 @@
1
+ require 'google/apis/sheets_v4'
2
+ require_relative 'spreadsheets_url_util'
3
+
4
+ module Embulk
5
+ module Input
6
+ class GoogleSpreadsheets < InputPlugin
7
+
8
# Wrapper around Google::Apis::SheetsV4::SheetsService bound to one
# spreadsheet URL and one worksheet title taken from the task.
class SpreadsheetsClient
  attr_accessor :spreadsheets_url, :worksheet_title, :auth, :pager

  # @param task [Hash] task providing 'spreadsheets_url' and 'worksheet_title'
  # @param auth [#authenticate] object whose #authenticate result is used as
  #   the service authorization
  # @param pager [#each_record] object that drives record iteration
  def initialize(task, auth:, pager:)
    @auth = auth
    @pager = pager
    @worksheet_title = task['worksheet_title']
    @spreadsheets_url = task['spreadsheets_url']
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Application name reported through the service client options.
  def application_name
    @application_name ||= 'embulk-input-google_spreadsheets'
  end

  # Spreadsheet id extracted from the configured URL by SpreadsheetsUrlUtil.
  def spreadsheets_id
    SpreadsheetsUrlUtil.capture_id(spreadsheets_url)
  end

  # Calls `get_spreadsheet` for the configured spreadsheet, passing the
  # worksheet title as `ranges` and `include_grid_data: false`.
  # Every call issues a new request; nothing is cached here.
  def spreadsheets
    service.get_spreadsheet(
      spreadsheets_id,
      ranges: worksheet_title,
      include_grid_data: false
    )
  end

  # First sheet of the `get_spreadsheet` response.
  def worksheet
    spreadsheets.sheets.first
  end

  def worksheet_properties
    worksheet.properties
  end

  def worksheet_grid_properties
    worksheet_properties.grid_properties
  end

  # Row count reported by the worksheet grid properties.
  def worksheet_max_row_num
    worksheet_grid_properties.row_count
  end

  # Column count reported by the worksheet grid properties.
  def worksheet_max_column_num
    worksheet_grid_properties.column_count
  end

  # Loads the values of +range+ within the configured worksheet.
  #
  # @param range [String] range without the worksheet part; it is prefixed
  #   with "<worksheet_title>!" before the request
  # @return the `values` of the `get_spreadsheet_values` response
  def worksheet_values(range)
    qualified_range = "#{worksheet_title}!#{range}"
    logger.info { "`embulk-input-google_spreadsheets`: load data from spreadsheet: '#{spreadsheets_url}', range: '#{qualified_range}'" }
    response = service.get_spreadsheet_values(spreadsheets_id, qualified_range)
    response.values
  end

  # Delegates record iteration to the pager, handing it this client.
  def worksheet_each_record(&block)
    pager.each_record(self, &block)
  end

  # Lazily built SheetsService; it is only stored once fully configured.
  def service
    return @service if @service

    sheets = Google::Apis::SheetsV4::SheetsService.new
    sheets.client_options.application_name = application_name
    sheets.authorization = auth.authenticate
    @service = sheets
  end
end
73
+ end
74
+ end
75
+ end
@@ -0,0 +1,23 @@
1
+ module Embulk
2
+ module Input
3
+ class GoogleSpreadsheets < InputPlugin
4
# Helpers for working with Google Spreadsheets URLs.
module SpreadsheetsUrlUtil
  # Extracts the spreadsheet id from a Google Spreadsheets URL.
  #
  # The id is the path segment that directly follows #base_url. It ends at
  # the next `/`, `?`, `#` or at the end of the string, so both
  # ".../d/<id>/edit#gid=0" and a bare ".../d/<id>" are accepted.
  #
  # @param url [String, nil] spreadsheet URL
  # @return [String, nil] the id, or nil when +url+ is nil or does not
  #   contain a spreadsheet id
  def self.capture_id(url)
    url.to_s[capture_id_regex, 1]
  end

  # @return [String] URL prefix that precedes the spreadsheet id
  def self.base_url
    @base_url ||= 'https://docs.google.com/spreadsheets/d/'
  end

  # @return [Regexp] pattern whose first group captures the spreadsheet id
  def self.capture_id_regex
    # The prefix is escaped so that the dots in the host name only match
    # literal dots instead of any character.
    @capture_id_regex ||= %r{#{Regexp.escape(base_url)}([^/?#]+)}
  end
end
20
+ end
21
+ end
22
+ end
23
+
@@ -0,0 +1,62 @@
1
+ module Embulk
2
+ module Input
3
+ class GoogleSpreadsheets < InputPlugin
4
+ module Typecast
5
# Abstract base class for typecast strategies. Subclasses override the
# `as_*` methods; every `as_*` method defined here raises NotImplementedError.
#
# NOTE: NotImplementedError is a ScriptError, not a StandardError, so a bare
# `rescue` in calling code does not catch it.
class Base
  # Message raised by every conversion method that has not been overridden.
  OVERRIDE_MESSAGE = '`embulk-input-google_spreadsheets`: override this.'.freeze

  attr_reader :null_string

  # @param task [Hash] task; its 'null_string' entry is the cell text that
  #   #null_string? recognises
  def initialize(task)
    @null_string = task['null_string']
  end

  def logger
    GoogleSpreadsheets.logger
  end

  # Serialises the class name (under JSON.create_id) and all instance
  # variables of this object.
  #
  # @return [String] JSON document
  def to_json(*args) # for logging
    spec = instance_variables.each_with_object(JSON.create_id => self.class.name) do |ivar, acc|
      acc[ivar] = instance_variable_get(ivar)
    end
    spec.to_json(*args)
  end

  # @raise [NotImplementedError] always; subclasses override this
  def as_string(value)
    raise NotImplementedError, OVERRIDE_MESSAGE
  end

  # @raise [NotImplementedError] always; subclasses override this
  def as_long(value)
    raise NotImplementedError, OVERRIDE_MESSAGE
  end

  # @raise [NotImplementedError] always; subclasses override this
  def as_double(value)
    raise NotImplementedError, OVERRIDE_MESSAGE
  end

  # @raise [NotImplementedError] always; subclasses override this
  def as_boolean(value)
    raise NotImplementedError, OVERRIDE_MESSAGE
  end

  # @raise [NotImplementedError] always; subclasses override this
  def as_timestamp(value, timestamp_format, timezone)
    raise NotImplementedError, OVERRIDE_MESSAGE
  end

  # @raise [NotImplementedError] always; subclasses override this
  def as_json(value)
    raise NotImplementedError, OVERRIDE_MESSAGE
  end

  protected

  # @param value [Object] cell value
  # @return [Boolean] true only when +value+ is a String equal to the
  #   configured null string
  def null_string?(value)
    value.is_a?(String) && value == null_string
  end
end
59
+ end
60
+ end
61
+ end
62
+ end
@@ -0,0 +1,84 @@
1
+ require_relative 'strict_typecast'
2
+
3
+ module Embulk
4
+ module Input
5
+ class GoogleSpreadsheets < InputPlugin
6
+ module Typecast
7
# Variant of StrictTypecast in which a TypecastError raised by the inherited
# conversion is logged at trace level and turned into nil. Any other error
# propagates unchanged.
class LooseTypecast < StrictTypecast
  # @return the inherited conversion result, or nil on TypecastError
  def as_string(value)
    nil_on_typecast_error { super }
  end

  # @return the inherited conversion result, or nil on TypecastError
  def as_long(value)
    nil_on_typecast_error { super }
  end

  # @return the inherited conversion result, or nil on TypecastError
  def as_double(value)
    nil_on_typecast_error { super }
  end

  # @return the inherited conversion result, or nil on TypecastError
  def as_boolean(value)
    nil_on_typecast_error { super }
  end

  # @return the inherited conversion result, or nil on TypecastError
  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    nil_on_typecast_error { super }
  end

  # @return the inherited conversion result, or nil on TypecastError
  def as_json(value)
    nil_on_typecast_error { super }
  end

  private

  # Runs the given conversion block and maps a TypecastError to nil.
  # Rescuing only TypecastError lets every other error pass through untouched.
  def nil_on_typecast_error
    yield
  rescue TypecastError => e
    logger.trace{"`embulk-input-google_spreadsheets`: Fallback to nil, because of '#{e}'"}
    nil
  end
end
81
+ end
82
+ end
83
+ end
84
+ end
@@ -0,0 +1,109 @@
1
+ require 'time_with_zone'
2
+ require_relative 'base'
3
+ require_relative 'timestamp_format_util'
4
+
5
+ module Embulk
6
+ module Input
7
+ class GoogleSpreadsheets < InputPlugin
8
+ module Typecast
9
# Typecast strategy built on Ruby's own conversions (`to_s`, `to_i`, `to_f`,
# `Time.parse`, `JSON.parse`, ...). In every method a nil value or a value
# equal to the configured null string yields nil.
class MinimalTypecast < Base
  # Lower-cased cell texts accepted as boolean values.
  BOOLEAN_WORDS = { 'true' => true, 'false' => false }.freeze

  # @return [String, nil] `value.to_s`
  # @raise [TypecastError] when the value does not support the conversion
  def as_string(value)
    return nil if value.nil? || null_string?(value)

    value.to_s
  rescue NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to String: \"#{value}\" because of '#{e}'")
  end

  # @return [Integer, nil] `value.to_i`
  # @raise [TypecastError] when the value does not support the conversion
  def as_long(value)
    return nil if value.nil? || null_string?(value)

    value.to_i
  rescue NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Long: \"#{value}\" because of '#{e}'")
  end

  # @return [Float, nil] `value.to_f`
  # @raise [TypecastError] when the value does not support the conversion
  def as_double(value)
    return nil if value.nil? || null_string?(value)

    value.to_f
  rescue NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Double: \"#{value}\" because of '#{e}'")
  end

  # Accepts true/false and the strings "true"/"false" in any letter case.
  #
  # @return [Boolean, nil]
  # @raise [TypecastError] for any other string or any other class
  def as_boolean(value)
    return nil if value.nil? || null_string?(value)

    case value
    when String
      lowered = value.downcase
      BOOLEAN_WORDS.fetch(lowered) do
        raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast '#{lowered}' to a boolean value.")
      end
    when TrueClass, FalseClass
      value
    else
      raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to a boolean value: \"#{value}\"")
    end
  end

  # Parses +value+ as a time.
  #
  # With a format: `Time.strptime` is used when the format itself carries
  # timezone information (per TimestampFormatUtil.timezone_format?) or when
  # no timezone was given; otherwise `TimeWithZone.strptime_with_zone`.
  # Without a format: `TimeWithZone.parse_with_zone` when a timezone was
  # given, else `Time.parse`.
  #
  # @return [Time, nil]
  # @raise [TypecastError] when parsing raises ArgumentError, TypeError or
  #   NoMethodError
  def as_timestamp(value, timestamp_format = nil, timezone = nil)
    return nil if value.nil? || null_string?(value)

    if timestamp_format
      zone_in_format = TimestampFormatUtil.timezone_format?(timestamp_format)
      if zone_in_format || !timezone
        Time.strptime(value, timestamp_format)
      else
        TimeWithZone.strptime_with_zone(value, timestamp_format, timezone)
      end
    elsif timezone
      TimeWithZone.parse_with_zone(value, timezone)
    else
      Time.parse(value)
    end
  rescue ArgumentError, TypeError, NoMethodError => e
    raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to Time: \"#{value}\" because of '#{e}'")
  end

  # Strings are parsed as JSON; booleans, numbers, arrays and hashes are
  # returned unchanged; everything else (including Time) is rejected.
  #
  # @return [Object, nil]
  # @raise [TypecastError] on unparsable strings and unsupported classes
  def as_json(value)
    return nil if value.nil? || null_string?(value)

    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/lib/embulk/page_builder.rb#L20
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicPageBuilder.java#L97
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/DynamicColumnSetterFactory.java#L66
    # cf. https://github.com/embulk/embulk/blob/997c7beb89d42122f7cb6fe844f8ca79a3cb666c/embulk-core/src/main/java/org/embulk/spi/util/dynamic/JsonColumnSetter.java#L50
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/util/dynamic/AbstractDynamicColumnSetter.java#L47
    # cf. https://github.com/embulk/embulk/blob/191ffd50e555565be77f810db15a21ba66cb7bf6/embulk-core/src/main/java/org/embulk/spi/json/RubyValueApi.java#L57
    # NOTE: According to the code referenced above, any object can be set as
    #       Json (it must be a primitive type or have a `to_msgpack` method).
    case value
    when String
      begin
        JSON.parse(value)
      rescue JSON::ParserError => e
        raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\" because of '#{e}'")
      end
    when TrueClass, FalseClass, Integer, Float, Array, Hash
      value
    when Time
      # TODO: support Time class. Raises for now to avoid format/timezone trouble.
      raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast Time to JSON: \"#{value}\"")
    else
      raise TypecastError.new("`embulk-input-google_spreadsheets`: cannot typecast #{value.class} to JSON: \"#{value}\"")
    end
  end
end
106
+ end
107
+ end
108
+ end
109
+ end