embulk-input-mixpanel 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a9b930812eda7043923b65baf311b686f6a75955
4
- data.tar.gz: 9e7c02c4232d7cfa94a0e2fe051d6cc18122a33b
3
+ metadata.gz: 48c99d14bae13fa0257e070bc5b99d444679392a
4
+ data.tar.gz: 8327dd45de51b2dab42754f212e5c6861621de7f
5
5
  SHA512:
6
- metadata.gz: e77beb053f17adedbcb86a87004871721ed08bcca888e7d3b0754e673c3b0ec181231432145732846b005d6ebc84429ab5457aa4bbee5a9e403ce5984c915382
7
- data.tar.gz: 8b23002ec143aef8aa686e67f2c58eb7eb2b62fd2d7bc05aa913b773eda93d845dc349543a5d0f9e010784049ca4de7bd58a3056ec8d63343498dbcd0ed4a743
6
+ metadata.gz: ecb93dd7bade9b667d94a5f5cea247d6f46b1e1ad87d6046f6377f9495456e40f91b97a44a1440cbd04b8d4bbb15a2a223fbed7612469e08e427e32bc4e94db9
7
+ data.tar.gz: 94e3728eed0ce17d7178557556077f1d37049881567715523955c9888560d4886d774440206570cbf463822f2f82f19b2d216aa293bd8be1d1f0b27d3ea566a1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.3.3 - 2015-10-29
2
+
3
+ * [enhancement] Exponential backoff retry [#31](https://github.com/treasure-data/embulk-input-mixpanel/pull/31)
4
+ * [enhancement] Treat unguessed columns [#30](https://github.com/treasure-data/embulk-input-mixpanel/pull/30)
5
+ * [enhancement] Loosely guess [#27](https://github.com/treasure-data/embulk-input-mixpanel/pull/27)
6
+ * [maintenance] Refactor [#26](https://github.com/treasure-data/embulk-input-mixpanel/pull/26)
7
+
1
8
  ## 0.3.2 - 2015-10-06
2
9
 
3
10
  * [enhancement] Support embulk 0.7 [#25](https://github.com/treasure-data/embulk-input-mixpanel/pull/25)
data/README.md CHANGED
@@ -38,9 +38,13 @@ To get it, you should log in mixpanel website, and click gear icon at the lower
38
38
  - NOTE: Mixpanel API supports to export data from at least 2 days before to at most the previous day.
39
39
  - **fetch_days**: Count of days range for exporting (integer, optional, default: from_date - (today - 1))
40
40
  - NOTE: Mixpanel doesn't support to from_date > today - 2
41
+ - **fetch_unknown_columns**: Whether this plugin fetches unknown columns, i.e. columns that are not configured in `columns` (boolean, optional, default: true)
42
+ - NOTE: If true, the unknown columns' data is stored as JSON in an `unknown_columns` column.
41
43
  - **event**: The event or events to filter data (array, optional, default: nil)
42
44
  - **where**: Expression to filter data (c.f. https://mixpanel.com/docs/api-documentation/data-export-api#segmentation-expressions) (string, optional, default: nil)
43
45
  - **bucket**: The data bucket to filter data (string, optional, default: nil)
46
+ - **retry_initial_wait_sec**: Initial wait time in seconds for exponential backoff (integer, default: 1)
47
+ - **retry_limit**: Maximum number of retries (integer, default: 5)
44
48
 
45
49
  ## Example
46
50
 
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-input-mixpanel"
4
- spec.version = "0.3.2"
4
+ spec.version = "0.3.3"
5
5
  spec.authors = ["yoshihara", "uu59"]
6
6
  spec.summary = "Mixpanel input plugin for Embulk"
7
7
  spec.description = "Loads records from Mixpanel."
@@ -1,5 +1,7 @@
1
1
  require "tzinfo"
2
2
  require "embulk/input/mixpanel_api/client"
3
+ require "range_generator"
4
+ require "timezone_validator"
3
5
 
4
6
  module Embulk
5
7
  module Input
@@ -7,6 +9,7 @@ module Embulk
7
9
  Plugin.register_input("mixpanel", self)
8
10
 
9
11
  GUESS_RECORDS_COUNT = 10
12
+ NOT_PROPERTY_COLUMN = "event".freeze
10
13
 
11
14
  # NOTE: It takes long time to fetch data between from_date to
12
15
  # to_date by one API request. So this plugin fetches data
@@ -14,68 +17,36 @@ module Embulk
14
17
  SLICE_DAYS_COUNT = 7
15
18
 
16
19
  def self.transaction(config, &control)
17
- task = {}
18
-
19
- task[:params] = export_params(config)
20
-
21
- begin
22
- from_date_str = config.param(:from_date, :string, default: (Date.today - 2).to_s)
23
- from_date = Date.parse(from_date_str)
24
- rescue ArgumentError # invalid date
25
- raise ConfigError.new "from_date '#{from_date_str}' is invalid date"
26
- end
27
-
28
- if from_date > Date.today - 1
29
- Embulk.logger.warn "Mixpanel allow 2 days before to from_date, so no data is input."
30
- target_dates = []
31
- else
32
- days = config.param(:fetch_days, :integer, default: nil)
33
-
34
- if days.nil?
35
- # When no 'days' is specified in config file, so dates is
36
- # generated by from_date and yeasterday.
37
- dates = from_date..(Date.today - 1)
38
- elsif days < 1
39
- raise ConfigError.new "days '#{days}' is invalid. Please specify bigger number than 0."
40
- else
41
- # When 'days' is specified in config file and it is satisfied,
42
- # so it is used for dates.
43
- dates = from_date..(from_date + days - 1)
44
- end
45
-
46
- target_dates = dates.find_all {|date| date < Date.today}
47
-
48
- Embulk.logger.info "Try to fetch data from #{target_dates.first} to #{target_dates.last}"
49
-
50
- overtimes = dates.to_a - target_dates
51
- unless overtimes.empty?
52
- Embulk.logger.warn "These dates are too early access, ignored them: from #{overtimes.first} to #{overtimes.last}"
53
- end
54
- end
55
-
56
- task[:dates] = target_dates.map {|date| date.to_s}
57
-
58
- task[:api_key] = config.param(:api_key, :string)
59
- task[:api_secret] = config.param(:api_secret, :string)
60
- task[:timezone] = config.param(:timezone, :string)
61
-
62
- begin
63
- # raises exception if timezone is invalid string
64
- TZInfo::Timezone.get(task[:timezone])
65
- rescue => e
66
- Embulk.logger.error "'#{task[:timezone]}' is invalid timezone"
67
- raise ConfigError.new e.message
68
- end
20
+ timezone = config.param(:timezone, :string)
21
+ TimezoneValidator.new(timezone).validate
22
+
23
+ from_date = config.param(:from_date, :string, default: (Date.today - 2).to_s)
24
+ fetch_days = config.param(:fetch_days, :integer, default: nil)
25
+ range = RangeGenerator.new(from_date, fetch_days).generate_range
26
+ Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
27
+
28
+ task = {
29
+ params: export_params(config),
30
+ dates: range,
31
+ timezone: timezone,
32
+ api_key: config.param(:api_key, :string),
33
+ api_secret: config.param(:api_secret, :string),
34
+ schema: config.param(:columns, :array),
35
+ fetch_unknown_columns: config.param(:fetch_unknown_columns, :bool, default: true),
36
+ retry_initial_wait_sec: config.param(:retry_initial_wait_sec, :integer, default: 1),
37
+ retry_limit: config.param(:retry_limit, :integer, default: 5),
38
+ }
69
39
 
70
- columns = []
71
- task[:schema] = config.param(:columns, :array)
72
- task[:schema].each do |column|
40
+ columns = task[:schema].map do |column|
73
41
  name = column["name"]
74
42
  type = column["type"].to_sym
75
43
 
76
- columns << Column.new(nil, name, type, column["format"])
44
+ Column.new(nil, name, type, column["format"])
77
45
  end
78
46
 
47
+ # for unknown columns
48
+ columns << Column.new(nil, "unknown_columns", :string)
49
+
79
50
  resume(task, columns, 1, &control)
80
51
  end
81
52
 
@@ -94,36 +65,15 @@ module Embulk
94
65
  def self.guess(config)
95
66
  client = MixpanelApi::Client.new(config.param(:api_key, :string), config.param(:api_secret, :string))
96
67
 
97
- from_date_str = config.param(:from_date, :string, default: (Date.today - 1 - SLICE_DAYS_COUNT).to_s)
98
-
99
- from_date = Date.parse(from_date_str)
68
+ range = guess_range(config)
69
+ Embulk.logger.info "Guessing schema using #{range.first}..#{range.last} records"
100
70
 
101
- if from_date > Date.today - 1
102
- raise ConfigError.new "Please specify date later than yesterday (inclusive) as 'from_date'"
103
- end
104
-
105
- # NOTE: to_date is yeasterday if from_date..Date.Today doesn't have
106
- # more SLICE_DAYS_COUNT days.
107
- to_date = [from_date + SLICE_DAYS_COUNT, Date.today - 1].min
108
-
109
- params = export_params(config)
110
- params = params.merge(
111
- from_date: from_date.to_s,
112
- to_date: to_date.to_s,
71
+ params = export_params(config).merge(
72
+ from_date: range.first,
73
+ to_date: range.last,
113
74
  )
114
75
 
115
- records = client.export(params)
116
- sample_records = records.first(GUESS_RECORDS_COUNT)
117
- properties = Guess::SchemaGuess.from_hash_records(sample_records.map{|r| r["properties"]})
118
- columns = properties.map do |col|
119
- result = {
120
- name: col.name,
121
- type: col.type,
122
- }
123
- result[:format] = col.format if col.format
124
- result
125
- end
126
- columns.unshift(name: "event", type: :string)
76
+ columns = guess_from_records(client.export(params))
127
77
  return {"columns" => columns}
128
78
  end
129
79
 
@@ -134,33 +84,18 @@ module Embulk
134
84
  @timezone = task[:timezone]
135
85
  @schema = task[:schema]
136
86
  @dates = task[:dates]
87
+ @fetch_unknown_columns = task[:fetch_unknown_columns]
137
88
  end
138
89
 
139
90
  def run
140
- client = MixpanelApi::Client.new(@api_key, @api_secret)
141
91
  @dates.each_slice(SLICE_DAYS_COUNT) do |dates|
142
- from_date = dates.first
143
- to_date = dates.last
144
- Embulk.logger.info "Fetching data from #{from_date} to #{to_date} ..."
145
-
146
- params = @params.merge(
147
- "from_date" => from_date,
148
- "to_date" => to_date
149
- )
150
-
151
- records = client.export(params)
152
-
153
- records.each do |record|
154
- values = @schema.map do |column|
155
- case column["name"]
156
- when "event"
157
- record["event"]
158
- when "time"
159
- time = record["properties"]["time"]
160
- adjust_timezone(time)
161
- else
162
- record["properties"][column["name"]]
163
- end
92
+ Embulk.logger.info "Fetching data from #{dates.first} to #{dates.last} ..."
93
+
94
+ fetch(dates).each do |record|
95
+ values = extract_values(record)
96
+ if @fetch_unknown_columns
97
+ unknown_values = extract_unknown_values(record)
98
+ values << unknown_values.to_json
164
99
  end
165
100
  page_builder.add(values)
166
101
  end
@@ -176,6 +111,50 @@ module Embulk
176
111
 
177
112
  private
178
113
 
114
+ def extract_values(record)
115
+ @schema.map do |column|
116
+ extract_value(record, column["name"])
117
+ end
118
+ end
119
+
120
+ def extract_value(record, name)
121
+ case name
122
+ when NOT_PROPERTY_COLUMN
123
+ record[NOT_PROPERTY_COLUMN]
124
+ when "time"
125
+ time = record["properties"]["time"]
126
+ adjust_timezone(time)
127
+ else
128
+ record["properties"][name]
129
+ end
130
+ end
131
+
132
+ def extract_unknown_values(record)
133
+ record_keys = record["properties"].keys + [NOT_PROPERTY_COLUMN]
134
+ schema_keys = @schema.map {|column| column["name"]}
135
+ unknown_keys = record_keys - schema_keys
136
+
137
+ unless unknown_keys.empty?
138
+ Embulk.logger.warn("Unknown columns exists in record: #{unknown_keys.join(', ')}")
139
+ end
140
+
141
+ unknown_keys.inject({}) do |result, key|
142
+ result[key] = extract_value(record, key)
143
+ result
144
+ end
145
+ end
146
+
147
+ def fetch(dates)
148
+ from_date = dates.first
149
+ to_date = dates.last
150
+ params = @params.merge(
151
+ "from_date" => from_date,
152
+ "to_date" => to_date,
153
+ )
154
+ client = MixpanelApi::Client.new(@api_key, @api_secret)
155
+ client.export_with_retry(params, task[:retry_initial_wait_sec], task[:retry_limit])
156
+ end
157
+
179
158
  def adjust_timezone(epoch)
180
159
  # Adjust timezone offset to get UTC time
181
160
  # c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
@@ -203,6 +182,34 @@ module Embulk
203
182
  bucket: config.param(:bucket, :string, default: nil),
204
183
  }
205
184
  end
185
+
186
+ def self.default_guess_start_date
187
+ Date.today - SLICE_DAYS_COUNT - 1
188
+ end
189
+
190
+ def self.guess_range(config)
191
+ from_date = config.param(:from_date, :string, default: default_guess_start_date.to_s)
192
+ fetch_days = config.param(:fetch_days, :integer, default: SLICE_DAYS_COUNT)
193
+ range = RangeGenerator.new(from_date, fetch_days).generate_range
194
+ if range.empty?
195
+ return default_guess_start_date..(Date.today - 1)
196
+ end
197
+ range
198
+ end
199
+
200
+ def self.guess_from_records(records)
201
+ sample_props = records.first(GUESS_RECORDS_COUNT).map{|r| r["properties"]}
202
+ schema = Guess::SchemaGuess.from_hash_records(sample_props)
203
+ columns = schema.map do |col|
204
+ result = {
205
+ name: col.name,
206
+ type: col.type,
207
+ }
208
+ result[:format] = col.format if col.format
209
+ result
210
+ end
211
+ columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
212
+ end
206
213
  end
207
214
 
208
215
  end
@@ -15,32 +15,68 @@ module Embulk
15
15
  @api_secret = api_secret
16
16
  end
17
17
 
18
+ def export_with_retry(params = {}, retry_initial_wait_sec, retry_limit)
19
+ body = with_retry(retry_initial_wait_sec, retry_limit) do
20
+ request(params)
21
+ end
22
+
23
+ response_to_enum(body)
24
+ end
25
+
18
26
  def export(params = {})
27
+ body = request(params)
28
+ response_to_enum(body)
29
+ end
30
+
31
+ private
32
+
33
+ def response_to_enum(response_body)
34
+ Enumerator.new do |y|
35
+ response_body.lines.each do |json|
36
+ # TODO: raise Embulk::DataError when invalid json given for Embulk 0.7+
37
+ y << JSON.parse(json)
38
+ end
39
+ end
40
+ end
41
+
42
+ def request(params)
19
43
  # https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel
20
44
  params[:expire] ||= Time.now.to_i + TIMEOUT_SECONDS
21
45
  params[:sig] = signature(params)
22
-
23
46
  Embulk.logger.debug "Export param: #{params.to_s}"
24
47
 
25
48
  response = httpclient.get(ENDPOINT_EXPORT, params)
26
-
27
49
  Embulk.logger.debug "response code: #{response.code}"
28
-
29
- if (400..499).include?(response.code)
50
+ case response.code
51
+ when 400..499
30
52
  raise ConfigError.new response.body
31
- elsif response.code >= 500
53
+ when 500..599
32
54
  raise RuntimeError, response.body
33
55
  end
56
+ response.body
57
+ end
34
58
 
35
- Enumerator.new do |y|
36
- response.body.lines.each do |json|
37
- y << JSON.parse(json)
59
+ def with_retry(initial_wait, retry_limit, &block)
60
+ retry_count = 0
61
+ wait_sec = initial_wait
62
+ begin
63
+ yield
64
+ rescue Embulk::ConfigError => e # TODO: rescue Embulk::DataError for Embulk 0.7+
65
+ # Don't retry
66
+ raise e
67
+ rescue => e
68
+ if retry_limit <= retry_count
69
+ Embulk.logger.error "'#{e}(#{e.class})' error occured and reached retry limit (#{retry_limit} times)"
70
+ raise e
38
71
  end
72
+ retry_count += 1
73
+ Embulk.logger.warn "Retrying after #{wait_sec} seconds [#{retry_count}/#{retry_limit}] '#{e}(#{e.class})' error occured"
74
+ sleep wait_sec
75
+ wait_sec *= 2
76
+ retry
39
77
  end
40
78
  end
41
79
 
42
- private
43
-
44
80
  def signature(params)
45
81
  # https://mixpanel.com/docs/api-documentation/data-export-api#auth-implementation
46
82
  sorted_keys = params.keys.map(&:to_s).sort
@@ -0,0 +1,79 @@
1
require "date"

# Builds the list of dates to fetch, as strings produced by Date#to_s.
#
# The list starts at +from_date_str+ and covers +fetch_days+ days, or runs up
# to yesterday when +fetch_days+ is nil. Dates from today onwards are never
# returned; they are only reported through a warning.
class RangeGenerator
  attr_reader :from_date_str, :fetch_days

  # @param from_date_str [String] first date of the range, in any format
  #   accepted by Date.parse
  # @param fetch_days [Integer, nil] number of days to cover; nil means
  #   "until yesterday"
  def initialize(from_date_str, fetch_days)
    @from_date_str = from_date_str
    @fetch_days = fetch_days
  end

  # Validates the inputs, logs warnings about dates that cannot be fetched and
  # returns the remaining dates.
  #
  # @return [Array<String>] dates strictly before today, oldest first; empty
  #   when from_date is later than yesterday
  # @raise [Embulk::ConfigError] when from_date cannot be parsed or
  #   fetch_days is smaller than 1
  def generate_range
    validate
    show_warnings
    range_only_past.map(&:to_s)
  end

  private

  # Parsed once and memoized; every other helper goes through this method.
  def from_date
    @from_date ||= Date.parse(from_date_str)
  end

  def validate
    begin
      from_date
    rescue ArgumentError, TypeError
      # ArgumentError: unparsable string. TypeError: non-String input such as nil.
      raise Embulk::ConfigError.new "from_date '#{from_date_str}' is invalid date"
    end

    if fetch_days && fetch_days < 1
      # `days` only allowed nil or positive number
      raise Embulk::ConfigError.new "fetch_days '#{fetch_days}' is invalid. Please specify bigger number than 0."
    end
  end

  def show_warnings
    if from_date_too_early?
      Embulk.logger.warn "Mixpanel allow 2 days before to from_date, so no data is input."
    elsif overdays?
      # The ignored dates are exactly today..range_end, so only the two
      # endpoints are needed and the span is never materialised.
      Embulk.logger.warn "These dates are too early access, ignored them: from #{today} to #{range_end}"
    end
  end

  # Last date requested by the configuration (may lie in the future).
  def range_end
    fetch_days ? from_date + fetch_days - 1 : yesterday
  end

  # Requested dates that are strictly before today.
  def range_only_past
    return [] if from_date_too_early?

    (from_date..[range_end, yesterday].min).to_a
  end

  # True when part of the requested range falls on today or later.
  def overdays?
    !from_date_too_early? && range_end > yesterday
  end

  def from_date_too_early?
    from_date > yesterday
  end

  def yesterday
    today - 1
  end

  # Memoized so that one generator instance uses a single notion of "today".
  def today
    @today ||= Date.today
  end
end
@@ -0,0 +1,15 @@
1
# Checks that a timezone identifier can be resolved by TZInfo.
class TimezoneValidator
  def initialize(timezone)
    @timezone = timezone
  end

  # Looks the identifier up via TZInfo::Timezone.get. Any error raised by the
  # lookup is logged and re-raised as Embulk::ConfigError carrying the
  # original message.
  def validate
    # raises exception if timezone is invalid string
    TZInfo::Timezone.get(@timezone)
  rescue => error
    Embulk.logger.error "'#{@timezone}' is invalid timezone"
    raise Embulk::ConfigError.new error.message
  end
end
@@ -89,6 +89,71 @@ module Embulk
89
89
  end
90
90
  end
91
91
 
92
+ class ExportRetryTest < self
93
+ def setup
94
+ @httpclient = HTTPClient.new
95
+ @client = Client.new(API_KEY, API_SECRET)
96
+ @retry_initial_wait_sec = 1
97
+ @retry_limit = 3
98
+ stub_client
99
+ end
100
+
101
+ def test_retry_with_500
102
+ stub_response(failure_response(500))
103
+
104
+ @retry_limit.times do |n|
105
+ mock(@client).sleep(@retry_initial_wait_sec * (2**n))
106
+ end
107
+ mock(Embulk.logger).warn(/retry/i).times(@retry_limit)
108
+ mock(Embulk.logger).error(/retry/i).once
109
+
110
+ assert_raise do
111
+ @client.export_with_retry(params, @retry_initial_wait_sec, @retry_limit)
112
+ end
113
+ end
114
+
115
+ def test_retry_with_timeout
116
+ @httpclient.connect_timeout = 0.000000000000000000001
117
+
118
+ @retry_limit.times do |n|
119
+ mock(@client).sleep(@retry_initial_wait_sec * (2**n))
120
+ end
121
+ mock(Embulk.logger).warn(/retry/i).times(@retry_limit)
122
+ mock(Embulk.logger).error(/retry/i).once
123
+
124
+ assert_raise(HTTPClient::TimeoutError) do
125
+ @client.export_with_retry(params, @retry_initial_wait_sec, @retry_limit)
126
+ end
127
+ end
128
+
129
+ def test_not_retry_with_401
130
+ @httpclient.test_loopback_http_response << "HTTP/1.1 401\r\n\r\n"
131
+ mock(Embulk.logger).warn(/retry/i).never
132
+ mock(Embulk.logger).error(/retry/i).never
133
+
134
+ assert_raise(Embulk::ConfigError) do
135
+ @client.export_with_retry(params, 0, 1).each do |record|
136
+ record
137
+ end
138
+ end
139
+ end
140
+
141
+ def test_not_retry_with_invalid_json
142
+ omit "Embulk 0.6 or earlier has no DataError (enable to uncomment below code)"
143
+
144
+ # @httpclient.test_loopback_http_response << "HTTP/1.1 200\r\n\r\ninvalid json"
145
+ # mock(Embulk.logger).warn(/retry/i).never
146
+ # mock(Embulk.logger).error(/retry/i).never
147
+
148
+ # assert_raise(Embulk::DataError) do
149
+ # @client.export_with_retry(params, 0, 1).each do |record|
150
+ # # DataError will raised in each block
151
+ # record
152
+ # end
153
+ # end
154
+ end
155
+ end
156
+
92
157
  private
93
158
 
94
159
  def stub_client
@@ -50,6 +50,11 @@ module Embulk
50
50
  end
51
51
 
52
52
  class GuessTest < self
53
+ def setup
54
+ # Do nothing from parent
55
+ mute_warn
56
+ end
57
+
53
58
  def test_from_date_old_date
54
59
  config = {
55
60
  type: "mixpanel",
@@ -58,9 +63,8 @@ module Embulk
58
63
  from_date: FROM_DATE,
59
64
  }
60
65
 
61
- from_date = config[:from_date]
62
- to_date = Date.parse(from_date) + Mixpanel::SLICE_DAYS_COUNT
63
- stub_export(from_date, to_date)
66
+ stub_export_all
67
+ mock(Embulk.logger).info(/^Guessing.*#{Regexp.escape FROM_DATE}\.\./)
64
68
 
65
69
  actual = Mixpanel.guess(embulk_config(config))
66
70
  assert_equal(expected, actual)
@@ -74,25 +78,25 @@ module Embulk
74
78
  from_date: Date.today.to_s,
75
79
  }
76
80
 
77
- assert_raise(ConfigError) do
78
- Mixpanel.guess(embulk_config(config))
79
- end
81
+ stub_export_all
82
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date.to_s}/)
83
+
84
+ Mixpanel.guess(embulk_config(config))
80
85
  end
81
86
 
82
87
  def test_from_date_yesterday
88
+ from_date = (Date.today - 1).to_s
83
89
  config = {
84
90
  type: "mixpanel",
85
91
  api_key: API_KEY,
86
92
  api_secret: API_SECRET,
87
- from_date: (Date.today - 1).to_s,
93
+ from_date: from_date,
88
94
  }
89
95
 
90
- from_date = config[:from_date]
91
- to_date = from_date
92
- stub_export(from_date, to_date)
96
+ stub_export_all
97
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape from_date}/)
93
98
 
94
- actual = Mixpanel.guess(embulk_config(config))
95
- assert_equal(expected, actual)
99
+ Mixpanel.guess(embulk_config(config))
96
100
  end
97
101
 
98
102
  def test_no_from_date
@@ -102,31 +106,24 @@ module Embulk
102
106
  api_secret: API_SECRET,
103
107
  }
104
108
 
105
- from_date = Date.today - 1 - Mixpanel::SLICE_DAYS_COUNT
106
- to_date = Date.today - 1
107
- stub_export(from_date, to_date)
109
+ stub_export_all
110
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date.to_s}/)
108
111
 
109
- actual = Mixpanel.guess(embulk_config(config))
110
- assert_equal(expected, actual)
112
+ Mixpanel.guess(embulk_config(config))
111
113
  end
112
114
 
113
115
  private
114
116
 
115
- def stub_export(from_date, to_date)
116
- params = {
117
- api_key: API_KEY,
118
- event: nil,
119
- where: nil,
120
- bucket: nil,
121
- from_date: from_date.to_s,
122
- to_date: to_date.to_s,
123
- }
124
-
117
+ def stub_export_all
125
118
  any_instance_of(MixpanelApi::Client) do |klass|
126
- stub(klass).export(params) { records }
119
+ stub(klass).export(anything) { records }
127
120
  end
128
121
  end
129
122
 
123
+ def mute_warn
124
+ stub(Embulk.logger).warn(anything) {}
125
+ end
126
+
130
127
  def embulk_config(config)
131
128
  DataSource[*config.to_a.flatten(1)]
132
129
  end
@@ -336,9 +333,12 @@ module Embulk
336
333
  end
337
334
 
338
335
  def columns
339
- schema.map do |col|
336
+ unknown_columns = [Column.new(nil, "unknown_columns", :string)]
337
+ configured_columns = schema.map do |col|
340
338
  Column.new(nil, col["name"], col["type"].to_sym)
341
339
  end
340
+
341
+ configured_columns + unknown_columns
342
342
  end
343
343
  end
344
344
 
@@ -371,7 +371,7 @@ module Embulk
371
371
  def setup_client
372
372
 
373
373
  any_instance_of(MixpanelApi::Client) do |klass|
374
- stub(klass).export(anything) { records }
374
+ stub(klass).request { records_raw_response }
375
375
  end
376
376
  end
377
377
 
@@ -415,6 +415,52 @@ module Embulk
415
415
  @plugin.run
416
416
  end
417
417
 
418
+ class UnknownColumnsTest < self
419
+ def setup
420
+ super
421
+ @page_builder = Object.new
422
+ @plugin = Mixpanel.new(task, nil, nil, @page_builder)
423
+ end
424
+
425
+ def test_run
426
+ Embulk.logger.warn(anything)
427
+ stub(@plugin).preview? { false }
428
+
429
+ # NOTE: Expect records are contained same record
430
+ record = records.first
431
+ properties = record["properties"]
432
+
433
+ time = properties["time"]
434
+ tz = TZInfo::Timezone.get(TIMEZONE)
435
+ offset = tz.period_for_local(time, true).offset.utc_offset
436
+ adjusted_time = time - offset
437
+
438
+ added = [
439
+ properties["foo"],
440
+ adjusted_time,
441
+ {"int" => properties["int"], "event" => record["event"]}.to_json
442
+ ]
443
+
444
+ mock(@page_builder).add(added).times(records.length * 2)
445
+ mock(@page_builder).finish
446
+
447
+ @plugin.run
448
+ end
449
+
450
+ private
451
+
452
+ def task
453
+ super.merge(schema: schema, fetch_unknown_columns: true)
454
+ end
455
+
456
+ def schema
457
+ [
458
+ {"name" => "foo", "type" => "long"},
459
+ {"name" => "time", "type" => "long"},
460
+ ]
461
+ end
462
+ end
463
+
418
464
  private
419
465
 
420
466
  def timezone_offset_seconds
@@ -440,6 +486,9 @@ module Embulk
440
486
  schema: schema,
441
487
  dates: DATES.to_a.map(&:to_s),
442
488
  params: Mixpanel.export_params(embulk_config),
489
+ fetch_unknown_columns: false,
490
+ retry_initial_wait_sec: 2,
491
+ retry_limit: 3,
443
492
  }
444
493
  end
445
494
 
@@ -456,6 +505,10 @@ module Embulk
456
505
  ] * 30
457
506
  end
458
507
 
508
+ def records_raw_response
509
+ records.map(&:to_json).join("\n")
510
+ end
511
+
459
512
  def record_epoch
460
513
  1234567890
461
514
  end
@@ -467,6 +520,9 @@ module Embulk
467
520
  api_secret: API_SECRET,
468
521
  from_date: FROM_DATE,
469
522
  fetch_days: DAYS,
523
+ fetch_unknown_columns: false,
524
+ retry_initial_wait_sec: 2,
525
+ retry_limit: 3,
470
526
  }
471
527
  end
472
528
 
@@ -0,0 +1,94 @@
1
+ require "range_generator"
2
+ require "override_assert_raise"
3
+
4
# Tests for RangeGenerator#generate_range: invalid arguments, a range lying
# entirely in the past, a range running past yesterday, and a from_date that
# is later than yesterday.
class RangeGeneratorTest < Test::Unit::TestCase
  include OverrideAssertRaise

  class GenerateRangeTest < self
    # Each entry is [from_date_str, fetch_days]; the key names the bad argument.
    data do
      {
        from_date: ["aaaaaaaaa", 1],
        fetch_days: ["2010-01-01", -9],
      }
    end
    def test_invalid(args)
      assert_raise(Embulk::ConfigError) { generate_range(*args) }
    end

    def test_all_days_past
      first_day = "2010-01-01"
      last_day = "2010-01-05"
      expected = (Date.parse(first_day)..Date.parse(last_day)).map(&:to_s)

      assert_equal(expected, RangeGenerator.new(first_day, 5).generate_range)
    end

    # from_date is five days ago and ten days are requested, so the range
    # runs past yesterday.
    class OverDaysTest < self
      def setup
        @from = Date.today - 5
        @days = 10
        @warn_message_regexp = /ignored them/
      end

      def test_range_only_past
        yesterday = Date.today - 1
        stub(Embulk.logger).warn(@warn_message_regexp)

        assert_equal((@from..yesterday).map(&:to_s), generate_range)
      end

      def test_warn
        mock(Embulk.logger).warn(@warn_message_regexp)

        generate_range
      end

      private

      # Supplies this fixture's arguments to the parent helper.
      def generate_range
        super(@from.to_s, @days)
      end
    end

    # from_date lies in the future, so an empty range is expected.
    class FromDateEarlyTest < self
      def setup
        @from = Date.today + 5
        @days = 10
        @warn_message_regexp = /allow 2 days/
      end

      def test_empty_range
        stub(Embulk.logger).warn(@warn_message_regexp)

        assert_equal([], generate_range)
      end

      def test_warn
        mock(Embulk.logger).warn(@warn_message_regexp)

        generate_range
      end

      private

      # Supplies this fixture's arguments to the parent helper.
      def generate_range
        super(@from.to_s, @days)
      end
    end

    private

    def generate_range(from_date_str, fetch_days)
      generator = RangeGenerator.new(from_date_str, fetch_days)
      generator.generate_range
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require "timezone_validator"
2
+ require "override_assert_raise"
3
+
4
# Tests for TimezoneValidator#validate.
class TimezoneValidatorTest < Test::Unit::TestCase
  include OverrideAssertRaise

  # A known identifier must pass without raising.
  def test_valid
    valid_timezone = "Asia/Tokyo"

    assert_nothing_raised do
      TimezoneValidator.new(valid_timezone).validate
    end
  end

  # An unknown identifier must be logged as an error and raise ConfigError.
  def test_invalid
    invalid_timezone = "Asia/Tokyoooooooooooooo"

    # Escape the identifier so it is matched literally in the log message,
    # even if it ever contains regexp metacharacters.
    mock(Embulk.logger).error(/#{Regexp.escape(invalid_timezone)}/)

    assert_raise(Embulk::ConfigError) do
      TimezoneValidator.new(invalid_timezone).validate
    end
  end
end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-mixpanel
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshihara
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-10-06 00:00:00.000000000 Z
12
+ date: 2015-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
@@ -199,11 +199,15 @@ files:
199
199
  - gemfiles/template.erb
200
200
  - lib/embulk/input/mixpanel.rb
201
201
  - lib/embulk/input/mixpanel_api/client.rb
202
+ - lib/range_generator.rb
203
+ - lib/timezone_validator.rb
202
204
  - test/embulk/input/mixpanel_api/test_client.rb
203
205
  - test/embulk/input/test_mixpanel.rb
204
206
  - test/override_assert_raise.rb
205
207
  - test/prepare_embulk.rb
206
208
  - test/run-test.rb
209
+ - test/test_range_generator.rb
210
+ - test/test_timezone_validator.rb
207
211
  homepage: https://github.com/treasure-data/embulk-input-mixpanel
208
212
  licenses:
209
213
  - Apache2
@@ -234,3 +238,5 @@ test_files:
234
238
  - test/override_assert_raise.rb
235
239
  - test/prepare_embulk.rb
236
240
  - test/run-test.rb
241
+ - test/test_range_generator.rb
242
+ - test/test_timezone_validator.rb