embulk-input-mixpanel 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a9b930812eda7043923b65baf311b686f6a75955
4
- data.tar.gz: 9e7c02c4232d7cfa94a0e2fe051d6cc18122a33b
3
+ metadata.gz: 48c99d14bae13fa0257e070bc5b99d444679392a
4
+ data.tar.gz: 8327dd45de51b2dab42754f212e5c6861621de7f
5
5
  SHA512:
6
- metadata.gz: e77beb053f17adedbcb86a87004871721ed08bcca888e7d3b0754e673c3b0ec181231432145732846b005d6ebc84429ab5457aa4bbee5a9e403ce5984c915382
7
- data.tar.gz: 8b23002ec143aef8aa686e67f2c58eb7eb2b62fd2d7bc05aa913b773eda93d845dc349543a5d0f9e010784049ca4de7bd58a3056ec8d63343498dbcd0ed4a743
6
+ metadata.gz: ecb93dd7bade9b667d94a5f5cea247d6f46b1e1ad87d6046f6377f9495456e40f91b97a44a1440cbd04b8d4bbb15a2a223fbed7612469e08e427e32bc4e94db9
7
+ data.tar.gz: 94e3728eed0ce17d7178557556077f1d37049881567715523955c9888560d4886d774440206570cbf463822f2f82f19b2d216aa293bd8be1d1f0b27d3ea566a1
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.3.3 - 2015-10-29
2
+
3
+ * [enhancement] Exponential backoff retry [#31](https://github.com/treasure-data/embulk-input-mixpanel/pull/31)
4
+ * [enhancement] Treat unguessed columns [#30](https://github.com/treasure-data/embulk-input-mixpanel/pull/30)
5
+ * [enhancement] Loosely guess [#27](https://github.com/treasure-data/embulk-input-mixpanel/pull/27)
6
+ * [maintenance] Refactor [#26](https://github.com/treasure-data/embulk-input-mixpanel/pull/26)
7
+
1
8
  ## 0.3.2 - 2015-10-06
2
9
 
3
10
  * [enhancement] Support embulk 0.7 [#25](https://github.com/treasure-data/embulk-input-mixpanel/pull/25)
data/README.md CHANGED
@@ -38,9 +38,13 @@ To get it, you should log in mixpanel website, and click gear icon at the lower
38
38
  - NOTE: Mixpanel API supports to export data from at least 2 days before to at most the previous day.
39
39
  - **fetch_days**: Count of days range for exporting (integer, optional, default: from_date - (today - 1))
40
40
  - NOTE: Mixpanel doesn't support to from_date > today - 2
41
+ - **fetch_unknown_columns**: Whether this plugin fetches unknown columns, i.e. columns not configured in `columns` (boolean, optional, default: true)
42
+ - NOTE: If true, the data of unknown columns is stored as JSON in the `unknown_columns` column.
41
43
  - **event**: The event or events to filter data (array, optional, default: nil)
42
44
  - **where**: Expression to filter data (c.f. https://mixpanel.com/docs/api-documentation/data-export-api#segmentation-expressions) (string, optional, default: nil)
43
45
  - **bucket**: The data bucket to filter data (string, optional, default: nil)
46
+ - **retry_initial_wait_sec**: Initial wait time in seconds for exponential backoff (integer, default: 1)
47
+ - **retry_limit**: Maximum number of retries (integer, default: 5)
44
48
 
45
49
  ## Example
46
50
 
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-input-mixpanel"
4
- spec.version = "0.3.2"
4
+ spec.version = "0.3.3"
5
5
  spec.authors = ["yoshihara", "uu59"]
6
6
  spec.summary = "Mixpanel input plugin for Embulk"
7
7
  spec.description = "Loads records from Mixpanel."
@@ -1,5 +1,7 @@
1
1
  require "tzinfo"
2
2
  require "embulk/input/mixpanel_api/client"
3
+ require "range_generator"
4
+ require "timezone_validator"
3
5
 
4
6
  module Embulk
5
7
  module Input
@@ -7,6 +9,7 @@ module Embulk
7
9
  Plugin.register_input("mixpanel", self)
8
10
 
9
11
  GUESS_RECORDS_COUNT = 10
12
+ NOT_PROPERTY_COLUMN = "event".freeze
10
13
 
11
14
  # NOTE: It takes long time to fetch data between from_date to
12
15
  # to_date by one API request. So this plugin fetches data
@@ -14,68 +17,36 @@ module Embulk
14
17
  SLICE_DAYS_COUNT = 7
15
18
 
16
19
  def self.transaction(config, &control)
17
- task = {}
18
-
19
- task[:params] = export_params(config)
20
-
21
- begin
22
- from_date_str = config.param(:from_date, :string, default: (Date.today - 2).to_s)
23
- from_date = Date.parse(from_date_str)
24
- rescue ArgumentError # invalid date
25
- raise ConfigError.new "from_date '#{from_date_str}' is invalid date"
26
- end
27
-
28
- if from_date > Date.today - 1
29
- Embulk.logger.warn "Mixpanel allow 2 days before to from_date, so no data is input."
30
- target_dates = []
31
- else
32
- days = config.param(:fetch_days, :integer, default: nil)
33
-
34
- if days.nil?
35
- # When no 'days' is specified in config file, so dates is
36
- # generated by from_date and yeasterday.
37
- dates = from_date..(Date.today - 1)
38
- elsif days < 1
39
- raise ConfigError.new "days '#{days}' is invalid. Please specify bigger number than 0."
40
- else
41
- # When 'days' is specified in config file and it is satisfied,
42
- # so it is used for dates.
43
- dates = from_date..(from_date + days - 1)
44
- end
45
-
46
- target_dates = dates.find_all {|date| date < Date.today}
47
-
48
- Embulk.logger.info "Try to fetch data from #{target_dates.first} to #{target_dates.last}"
49
-
50
- overtimes = dates.to_a - target_dates
51
- unless overtimes.empty?
52
- Embulk.logger.warn "These dates are too early access, ignored them: from #{overtimes.first} to #{overtimes.last}"
53
- end
54
- end
55
-
56
- task[:dates] = target_dates.map {|date| date.to_s}
57
-
58
- task[:api_key] = config.param(:api_key, :string)
59
- task[:api_secret] = config.param(:api_secret, :string)
60
- task[:timezone] = config.param(:timezone, :string)
61
-
62
- begin
63
- # raises exception if timezone is invalid string
64
- TZInfo::Timezone.get(task[:timezone])
65
- rescue => e
66
- Embulk.logger.error "'#{task[:timezone]}' is invalid timezone"
67
- raise ConfigError.new e.message
68
- end
20
+ timezone = config.param(:timezone, :string)
21
+ TimezoneValidator.new(timezone).validate
22
+
23
+ from_date = config.param(:from_date, :string, default: (Date.today - 2).to_s)
24
+ fetch_days = config.param(:fetch_days, :integer, default: nil)
25
+ range = RangeGenerator.new(from_date, fetch_days).generate_range
26
+ Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
27
+
28
+ task = {
29
+ params: export_params(config),
30
+ dates: range,
31
+ timezone: timezone,
32
+ api_key: config.param(:api_key, :string),
33
+ api_secret: config.param(:api_secret, :string),
34
+ schema: config.param(:columns, :array),
35
+ fetch_unknown_columns: config.param(:fetch_unknown_columns, :bool, default: true),
36
+ retry_initial_wait_sec: config.param(:retry_initial_wait_sec, :integer, default: 1),
37
+ retry_limit: config.param(:retry_limit, :integer, default: 5),
38
+ }
69
39
 
70
- columns = []
71
- task[:schema] = config.param(:columns, :array)
72
- task[:schema].each do |column|
40
+ columns = task[:schema].map do |column|
73
41
  name = column["name"]
74
42
  type = column["type"].to_sym
75
43
 
76
- columns << Column.new(nil, name, type, column["format"])
44
+ Column.new(nil, name, type, column["format"])
77
45
  end
78
46
 
47
+ # for unknown columns
48
+ columns << Column.new(nil, "unknown_columns", :string)
49
+
79
50
  resume(task, columns, 1, &control)
80
51
  end
81
52
 
@@ -94,36 +65,15 @@ module Embulk
94
65
  def self.guess(config)
95
66
  client = MixpanelApi::Client.new(config.param(:api_key, :string), config.param(:api_secret, :string))
96
67
 
97
- from_date_str = config.param(:from_date, :string, default: (Date.today - 1 - SLICE_DAYS_COUNT).to_s)
98
-
99
- from_date = Date.parse(from_date_str)
68
+ range = guess_range(config)
69
+ Embulk.logger.info "Guessing schema using #{range.first}..#{range.last} records"
100
70
 
101
- if from_date > Date.today - 1
102
- raise ConfigError.new "Please specify date later than yesterday (inclusive) as 'from_date'"
103
- end
104
-
105
- # NOTE: to_date is yeasterday if from_date..Date.Today doesn't have
106
- # more SLICE_DAYS_COUNT days.
107
- to_date = [from_date + SLICE_DAYS_COUNT, Date.today - 1].min
108
-
109
- params = export_params(config)
110
- params = params.merge(
111
- from_date: from_date.to_s,
112
- to_date: to_date.to_s,
71
+ params = export_params(config).merge(
72
+ from_date: range.first,
73
+ to_date: range.last,
113
74
  )
114
75
 
115
- records = client.export(params)
116
- sample_records = records.first(GUESS_RECORDS_COUNT)
117
- properties = Guess::SchemaGuess.from_hash_records(sample_records.map{|r| r["properties"]})
118
- columns = properties.map do |col|
119
- result = {
120
- name: col.name,
121
- type: col.type,
122
- }
123
- result[:format] = col.format if col.format
124
- result
125
- end
126
- columns.unshift(name: "event", type: :string)
76
+ columns = guess_from_records(client.export(params))
127
77
  return {"columns" => columns}
128
78
  end
129
79
 
@@ -134,33 +84,18 @@ module Embulk
134
84
  @timezone = task[:timezone]
135
85
  @schema = task[:schema]
136
86
  @dates = task[:dates]
87
+ @fetch_unknown_columns = task[:fetch_unknown_columns]
137
88
  end
138
89
 
139
90
  def run
140
- client = MixpanelApi::Client.new(@api_key, @api_secret)
141
91
  @dates.each_slice(SLICE_DAYS_COUNT) do |dates|
142
- from_date = dates.first
143
- to_date = dates.last
144
- Embulk.logger.info "Fetching data from #{from_date} to #{to_date} ..."
145
-
146
- params = @params.merge(
147
- "from_date" => from_date,
148
- "to_date" => to_date
149
- )
150
-
151
- records = client.export(params)
152
-
153
- records.each do |record|
154
- values = @schema.map do |column|
155
- case column["name"]
156
- when "event"
157
- record["event"]
158
- when "time"
159
- time = record["properties"]["time"]
160
- adjust_timezone(time)
161
- else
162
- record["properties"][column["name"]]
163
- end
92
+ Embulk.logger.info "Fetching data from #{dates.first} to #{dates.last} ..."
93
+
94
+ fetch(dates).each do |record|
95
+ values = extract_values(record)
96
+ if @fetch_unknown_columns
97
+ unknown_values = extract_unknown_values(record)
98
+ values << unknown_values.to_json
164
99
  end
165
100
  page_builder.add(values)
166
101
  end
@@ -176,6 +111,50 @@ module Embulk
176
111
 
177
112
  private
178
113
 
114
+ def extract_values(record)
115
+ @schema.map do |column|
116
+ extract_value(record, column["name"])
117
+ end
118
+ end
119
+
120
+ def extract_value(record, name)
121
+ case name
122
+ when NOT_PROPERTY_COLUMN
123
+ record[NOT_PROPERTY_COLUMN]
124
+ when "time"
125
+ time = record["properties"]["time"]
126
+ adjust_timezone(time)
127
+ else
128
+ record["properties"][name]
129
+ end
130
+ end
131
+
132
+ def extract_unknown_values(record)
133
+ record_keys = record["properties"].keys + [NOT_PROPERTY_COLUMN]
134
+ schema_keys = @schema.map {|column| column["name"]}
135
+ unknown_keys = record_keys - schema_keys
136
+
137
+ unless unknown_keys.empty?
138
+ Embulk.logger.warn("Unknown columns exists in record: #{unknown_keys.join(', ')}")
139
+ end
140
+
141
+ unknown_keys.inject({}) do |result, key|
142
+ result[key] = extract_value(record, key)
143
+ result
144
+ end
145
+ end
146
+
147
+ def fetch(dates)
148
+ from_date = dates.first
149
+ to_date = dates.last
150
+ params = @params.merge(
151
+ "from_date" => from_date,
152
+ "to_date" => to_date,
153
+ )
154
+ client = MixpanelApi::Client.new(@api_key, @api_secret)
155
+ client.export_with_retry(params, task[:retry_initial_wait_sec], task[:retry_limit])
156
+ end
157
+
179
158
  def adjust_timezone(epoch)
180
159
  # Adjust timezone offset to get UTC time
181
160
  # c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
@@ -203,6 +182,34 @@ module Embulk
203
182
  bucket: config.param(:bucket, :string, default: nil),
204
183
  }
205
184
  end
185
+
186
+ def self.default_guess_start_date
187
+ Date.today - SLICE_DAYS_COUNT - 1
188
+ end
189
+
190
+ def self.guess_range(config)
191
+ from_date = config.param(:from_date, :string, default: default_guess_start_date.to_s)
192
+ fetch_days = config.param(:fetch_days, :integer, default: SLICE_DAYS_COUNT)
193
+ range = RangeGenerator.new(from_date, fetch_days).generate_range
194
+ if range.empty?
195
+ return default_guess_start_date..(Date.today - 1)
196
+ end
197
+ range
198
+ end
199
+
200
+ def self.guess_from_records(records)
201
+ sample_props = records.first(GUESS_RECORDS_COUNT).map{|r| r["properties"]}
202
+ schema = Guess::SchemaGuess.from_hash_records(sample_props)
203
+ columns = schema.map do |col|
204
+ result = {
205
+ name: col.name,
206
+ type: col.type,
207
+ }
208
+ result[:format] = col.format if col.format
209
+ result
210
+ end
211
+ columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
212
+ end
206
213
  end
207
214
 
208
215
  end
@@ -15,32 +15,68 @@ module Embulk
15
15
  @api_secret = api_secret
16
16
  end
17
17
 
18
+ def export_with_retry(params = {}, retry_initial_wait_sec, retry_limit)
19
+ body = with_retry(retry_initial_wait_sec, retry_limit) do
20
+ request(params)
21
+ end
22
+
23
+ response_to_enum(body)
24
+ end
25
+
18
26
  def export(params = {})
27
+ body = request(params)
28
+ response_to_enum(body)
29
+ end
30
+
31
+ private
32
+
33
+ def response_to_enum(response_body)
34
+ Enumerator.new do |y|
35
+ response_body.lines.each do |json|
36
+ # TODO: raise Embulk::DataError when invalid json given for Embulk 0.7+
37
+ y << JSON.parse(json)
38
+ end
39
+ end
40
+ end
41
+
42
+ def request(params)
19
43
  # https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel
20
44
  params[:expire] ||= Time.now.to_i + TIMEOUT_SECONDS
21
45
  params[:sig] = signature(params)
22
-
23
46
  Embulk.logger.debug "Export param: #{params.to_s}"
24
47
 
25
48
  response = httpclient.get(ENDPOINT_EXPORT, params)
26
-
27
49
  Embulk.logger.debug "response code: #{response.code}"
28
-
29
- if (400..499).include?(response.code)
50
+ case response.code
51
+ when 400..499
30
52
  raise ConfigError.new response.body
31
- elsif response.code >= 500
53
+ when 500..599
32
54
  raise RuntimeError, response.body
33
55
  end
56
+ response.body
57
+ end
34
58
 
35
- Enumerator.new do |y|
36
- response.body.lines.each do |json|
37
- y << JSON.parse(json)
59
+ def with_retry(initial_wait, retry_limit, &block)
60
+ retry_count = 0
61
+ wait_sec = initial_wait
62
+ begin
63
+ yield
64
+ rescue Embulk::ConfigError => e # TODO: rescue Embulk::DataError for Embulk 0.7+
65
+ # Don't retry
66
+ raise e
67
+ rescue => e
68
+ if retry_limit <= retry_count
69
+ Embulk.logger.error "'#{e}(#{e.class})' error occured and reached retry limit (#{retry_limit} times)"
70
+ raise e
38
71
  end
72
+ retry_count += 1
73
+ Embulk.logger.warn "Retrying after #{wait_sec} seconds [#{retry_count}/#{retry_limit}] '#{e}(#{e.class})' error occured"
74
+ sleep wait_sec
75
+ wait_sec *= 2
76
+ retry
39
77
  end
40
78
  end
41
79
 
42
- private
43
-
44
80
  def signature(params)
45
81
  # https://mixpanel.com/docs/api-documentation/data-export-api#auth-implementation
46
82
  sorted_keys = params.keys.map(&:to_s).sort
@@ -0,0 +1,79 @@
1
+ class RangeGenerator
2
+ attr_reader :from_date_str, :fetch_days
3
+
4
+ def initialize(from_date_str, fetch_days)
5
+ @from_date_str = from_date_str
6
+ @fetch_days = fetch_days
7
+ end
8
+
9
+ def generate_range
10
+ validate
11
+ show_warnings
12
+ range_only_past.map{|date| date.to_s}
13
+ end
14
+
15
+ private
16
+
17
+ def from_date
18
+ Date.parse(from_date_str)
19
+ end
20
+
21
+ def validate
22
+ begin
23
+ from_date
24
+ rescue ArgumentError # invalid date
25
+ raise Embulk::ConfigError.new "from_date '#{from_date_str}' is invalid date"
26
+ end
27
+
28
+ if fetch_days && fetch_days < 1
29
+ # `days` only allowed nil or positive number
30
+ raise Embulk::ConfigError.new "fetch_days '#{fetch_days}' is invalid. Please specify bigger number than 0."
31
+ end
32
+ end
33
+
34
+ def show_warnings
35
+ if from_date_too_early?
36
+ Embulk.logger.warn "Mixpanel allow 2 days before to from_date, so no data is input."
37
+ end
38
+
39
+ if overdays?
40
+ Embulk.logger.warn "These dates are too early access, ignored them: from #{overdays.first} to #{overdays.last}"
41
+ end
42
+ end
43
+
44
+ def range
45
+ if from_date_too_early?
46
+ return []
47
+ end
48
+
49
+ if fetch_days
50
+ from_date..(from_date + fetch_days - 1)
51
+ else
52
+ from_date..yesterday
53
+ end
54
+ end
55
+
56
+ def range_only_past
57
+ range.find_all{|date| date < today}
58
+ end
59
+
60
+ def overdays?
61
+ ! overdays.empty?
62
+ end
63
+
64
+ def overdays
65
+ range.to_a - range_only_past.to_a
66
+ end
67
+
68
+ def from_date_too_early?
69
+ from_date > yesterday
70
+ end
71
+
72
+ def yesterday
73
+ today - 1
74
+ end
75
+
76
+ def today
77
+ @today ||= Date.today
78
+ end
79
+ end
@@ -0,0 +1,15 @@
1
+ class TimezoneValidator
2
+ def initialize(timezone)
3
+ @timezone = timezone
4
+ end
5
+
6
+ def validate
7
+ begin
8
+ # raises exception if timezone is invalid string
9
+ TZInfo::Timezone.get(@timezone)
10
+ rescue => e
11
+ Embulk.logger.error "'#{@timezone}' is invalid timezone"
12
+ raise Embulk::ConfigError.new e.message
13
+ end
14
+ end
15
+ end
@@ -89,6 +89,71 @@ module Embulk
89
89
  end
90
90
  end
91
91
 
92
+ class ExportRetryTest < self
93
+ def setup
94
+ @httpclient = HTTPClient.new
95
+ @client = Client.new(API_KEY, API_SECRET)
96
+ @retry_initial_wait_sec = 1
97
+ @retry_limit = 3
98
+ stub_client
99
+ end
100
+
101
+ def test_retry_with_500
102
+ stub_response(failure_response(500))
103
+
104
+ @retry_limit.times do |n|
105
+ mock(@client).sleep(@retry_initial_wait_sec * (2**n))
106
+ end
107
+ mock(Embulk.logger).warn(/retry/i).times(@retry_limit)
108
+ mock(Embulk.logger).error(/retry/i).once
109
+
110
+ assert_raise do
111
+ @client.export_with_retry(params, @retry_initial_wait_sec, @retry_limit)
112
+ end
113
+ end
114
+
115
+ def test_retry_with_timeout
116
+ @httpclient.connect_timeout = 0.000000000000000000001
117
+
118
+ @retry_limit.times do |n|
119
+ mock(@client).sleep(@retry_initial_wait_sec * (2**n))
120
+ end
121
+ mock(Embulk.logger).warn(/retry/i).times(@retry_limit)
122
+ mock(Embulk.logger).error(/retry/i).once
123
+
124
+ assert_raise(HTTPClient::TimeoutError) do
125
+ @client.export_with_retry(params, @retry_initial_wait_sec, @retry_limit)
126
+ end
127
+ end
128
+
129
+ def test_not_retry_with_401
130
+ @httpclient.test_loopback_http_response << "HTTP/1.1 401\r\n\r\n"
131
+ mock(Embulk.logger).warn(/retry/i).never
132
+ mock(Embulk.logger).error(/retry/i).never
133
+
134
+ assert_raise(Embulk::ConfigError) do
135
+ @client.export_with_retry(params, 0, 1).each do |record|
136
+ record
137
+ end
138
+ end
139
+ end
140
+
141
+ def test_not_retry_with_invalid_json
142
+ omit "Embulk 0.6 or earlier has no DataError (enable to uncomment below code)"
143
+
144
+ # @httpclient.test_loopback_http_response << "HTTP/1.1 200\r\n\r\ninvalid json"
145
+ # mock(Embulk.logger).warn(/retry/i).never
146
+ # mock(Embulk.logger).error(/retry/i).never
147
+
148
+ # assert_raise(Embulk::DataError) do
149
+ # @client.export_with_retry(params, 0, 1).each do |record|
150
+ # # DataError will raised in each block
151
+ # record
152
+ # end
153
+ # end
154
+ end
155
+ end
156
+
92
157
  private
93
158
 
94
159
  def stub_client
@@ -50,6 +50,11 @@ module Embulk
50
50
  end
51
51
 
52
52
  class GuessTest < self
53
+ def setup
54
+ # Do nothing from parent
55
+ mute_warn
56
+ end
57
+
53
58
  def test_from_date_old_date
54
59
  config = {
55
60
  type: "mixpanel",
@@ -58,9 +63,8 @@ module Embulk
58
63
  from_date: FROM_DATE,
59
64
  }
60
65
 
61
- from_date = config[:from_date]
62
- to_date = Date.parse(from_date) + Mixpanel::SLICE_DAYS_COUNT
63
- stub_export(from_date, to_date)
66
+ stub_export_all
67
+ mock(Embulk.logger).info(/^Guessing.*#{Regexp.escape FROM_DATE}\.\./)
64
68
 
65
69
  actual = Mixpanel.guess(embulk_config(config))
66
70
  assert_equal(expected, actual)
@@ -74,25 +78,25 @@ module Embulk
74
78
  from_date: Date.today.to_s,
75
79
  }
76
80
 
77
- assert_raise(ConfigError) do
78
- Mixpanel.guess(embulk_config(config))
79
- end
81
+ stub_export_all
82
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date.to_s}/)
83
+
84
+ Mixpanel.guess(embulk_config(config))
80
85
  end
81
86
 
82
87
  def test_from_date_yesterday
88
+ from_date = (Date.today - 1).to_s
83
89
  config = {
84
90
  type: "mixpanel",
85
91
  api_key: API_KEY,
86
92
  api_secret: API_SECRET,
87
- from_date: (Date.today - 1).to_s,
93
+ from_date: from_date,
88
94
  }
89
95
 
90
- from_date = config[:from_date]
91
- to_date = from_date
92
- stub_export(from_date, to_date)
96
+ stub_export_all
97
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape from_date}/)
93
98
 
94
- actual = Mixpanel.guess(embulk_config(config))
95
- assert_equal(expected, actual)
99
+ Mixpanel.guess(embulk_config(config))
96
100
  end
97
101
 
98
102
  def test_no_from_date
@@ -102,31 +106,24 @@ module Embulk
102
106
  api_secret: API_SECRET,
103
107
  }
104
108
 
105
- from_date = Date.today - 1 - Mixpanel::SLICE_DAYS_COUNT
106
- to_date = Date.today - 1
107
- stub_export(from_date, to_date)
109
+ stub_export_all
110
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date.to_s}/)
108
111
 
109
- actual = Mixpanel.guess(embulk_config(config))
110
- assert_equal(expected, actual)
112
+ Mixpanel.guess(embulk_config(config))
111
113
  end
112
114
 
113
115
  private
114
116
 
115
- def stub_export(from_date, to_date)
116
- params = {
117
- api_key: API_KEY,
118
- event: nil,
119
- where: nil,
120
- bucket: nil,
121
- from_date: from_date.to_s,
122
- to_date: to_date.to_s,
123
- }
124
-
117
+ def stub_export_all
125
118
  any_instance_of(MixpanelApi::Client) do |klass|
126
- stub(klass).export(params) { records }
119
+ stub(klass).export(anything) { records }
127
120
  end
128
121
  end
129
122
 
123
+ def mute_warn
124
+ stub(Embulk.logger).warn(anything) {}
125
+ end
126
+
130
127
  def embulk_config(config)
131
128
  DataSource[*config.to_a.flatten(1)]
132
129
  end
@@ -336,9 +333,12 @@ module Embulk
336
333
  end
337
334
 
338
335
  def columns
339
- schema.map do |col|
336
+ unknown_columns = [Column.new(nil, "unknown_columns", :string)]
337
+ configured_columns = schema.map do |col|
340
338
  Column.new(nil, col["name"], col["type"].to_sym)
341
339
  end
340
+
341
+ configured_columns + unknown_columns
342
342
  end
343
343
  end
344
344
 
@@ -371,7 +371,7 @@ module Embulk
371
371
  def setup_client
372
372
 
373
373
  any_instance_of(MixpanelApi::Client) do |klass|
374
- stub(klass).export(anything) { records }
374
+ stub(klass).request { records_raw_response }
375
375
  end
376
376
  end
377
377
 
@@ -415,6 +415,52 @@ module Embulk
415
415
  @plugin.run
416
416
  end
417
417
 
418
+ class UnknownColumnsTest < self
419
+ def setup
420
+ super
421
+ @page_builder = Object.new
422
+ @plugin = Mixpanel.new(task, nil, nil, @page_builder)
423
+ end
424
+
425
+ def test_run
426
+ Embulk.logger.warn(anything)
427
+ stub(@plugin).preview? { false }
428
+
429
+ # NOTE: Expect records are contained same record
430
+ record = records.first
431
+ properties = record["properties"]
432
+
433
+ time = properties["time"]
434
+ tz = TZInfo::Timezone.get(TIMEZONE)
435
+ offset = tz.period_for_local(time, true).offset.utc_offset
436
+ adjusted_time = time - offset
437
+
438
+ added = [
439
+ properties["foo"],
440
+ adjusted_time,
441
+ {"int" => properties["int"], "event" => record["event"]}.to_json
442
+ ]
443
+
444
+ mock(@page_builder).add(added).times(records.length * 2)
445
+ mock(@page_builder).finish
446
+
447
+ @plugin.run
448
+ end
449
+
450
+ private
451
+
452
+ def task
453
+ super.merge(schema: schema, fetch_unknown_columns: true)
454
+ end
455
+
456
+ def schema
457
+ [
458
+ {"name" => "foo", "type" => "long"},
459
+ {"name" => "time", "type" => "long"},
460
+ ]
461
+ end
462
+ end
463
+
418
464
  private
419
465
 
420
466
  def timezone_offset_seconds
@@ -440,6 +486,9 @@ module Embulk
440
486
  schema: schema,
441
487
  dates: DATES.to_a.map(&:to_s),
442
488
  params: Mixpanel.export_params(embulk_config),
489
+ fetch_unknown_columns: false,
490
+ retry_initial_wait_sec: 2,
491
+ retry_limit: 3,
443
492
  }
444
493
  end
445
494
 
@@ -456,6 +505,10 @@ module Embulk
456
505
  ] * 30
457
506
  end
458
507
 
508
+ def records_raw_response
509
+ records.map(&:to_json).join("\n")
510
+ end
511
+
459
512
  def record_epoch
460
513
  1234567890
461
514
  end
@@ -467,6 +520,9 @@ module Embulk
467
520
  api_secret: API_SECRET,
468
521
  from_date: FROM_DATE,
469
522
  fetch_days: DAYS,
523
+ fetch_unknown_columns: false,
524
+ retry_initial_wait_sec: 2,
525
+ retry_limit: 3,
470
526
  }
471
527
  end
472
528
 
@@ -0,0 +1,94 @@
1
+ require "range_generator"
2
+ require "override_assert_raise"
3
+
4
+ class RangeGeneratorTest < Test::Unit::TestCase
5
+ include OverrideAssertRaise
6
+
7
+ class GenerateRangeTest < self
8
+ data do
9
+ {
10
+ from_date: ["aaaaaaaaa", 1],
11
+ fetch_days: ["2010-01-01", -9],
12
+ }
13
+ end
14
+ def test_invalid(args)
15
+ assert_raise(Embulk::ConfigError) do
16
+ generate_range(*args)
17
+ end
18
+ end
19
+
20
+ def test_all_days_past
21
+ days = 5
22
+ from = "2010-01-01"
23
+ expected_from = Date.parse(from)
24
+ expected_to = Date.parse("2010-01-05")
25
+
26
+ expected = (expected_from..expected_to).to_a.map{|date| date.to_s}
27
+
28
+ actual = RangeGenerator.new(from, days).generate_range
29
+
30
+ assert_equal(expected, actual)
31
+ end
32
+
33
+ class OverDaysTest < self
34
+ def setup
35
+ @from = Date.today - 5
36
+ @days = 10
37
+ @warn_message_regexp = /ignored them/
38
+ end
39
+
40
+ def test_range_only_past
41
+ expected_to = Date.today - 1
42
+ expected = (@from..expected_to).to_a.map{|date| date.to_s}
43
+
44
+ stub(Embulk.logger).warn(@warn_message_regexp)
45
+
46
+ assert_equal(expected, generate_range)
47
+ end
48
+
49
+ def test_warn
50
+ mock(Embulk.logger).warn(@warn_message_regexp)
51
+
52
+ generate_range
53
+ end
54
+
55
+ private
56
+
57
+ def generate_range
58
+ super(@from.to_s, @days)
59
+ end
60
+ end
61
+
62
+ class FromDateEarlyTest < self
63
+ def setup
64
+ @from = Date.today + 5
65
+ @days = 10
66
+ @warn_message_regexp = /allow 2 days/
67
+ end
68
+
69
+ def test_empty_range
70
+ stub(Embulk.logger).warn(@warn_message_regexp)
71
+
72
+ assert_equal([], generate_range)
73
+ end
74
+
75
+ def test_warn
76
+ mock(Embulk.logger).warn(@warn_message_regexp)
77
+
78
+ generate_range
79
+ end
80
+
81
+ private
82
+
83
+ def generate_range
84
+ super(@from.to_s, @days)
85
+ end
86
+ end
87
+
88
+ private
89
+
90
+ def generate_range(from_date_str, fetch_days)
91
+ RangeGenerator.new(from_date_str, fetch_days).generate_range
92
+ end
93
+ end
94
+ end
@@ -0,0 +1,24 @@
1
+ require "timezone_validator"
2
+ require "override_assert_raise"
3
+
4
+ class TimezoneValidatorTest < Test::Unit::TestCase
5
+ include OverrideAssertRaise
6
+
7
+ def test_valid
8
+ valid_timezone = "Asia/Tokyo"
9
+
10
+ assert_nothing_raised do
11
+ TimezoneValidator.new(valid_timezone).validate
12
+ end
13
+ end
14
+
15
+ def test_invalid
16
+ invalid_timezone = "Asia/Tokyoooooooooooooo"
17
+
18
+ mock(Embulk.logger).error(/#{Regexp.new(invalid_timezone)}/)
19
+
20
+ assert_raise(Embulk::ConfigError) do
21
+ TimezoneValidator.new(invalid_timezone).validate
22
+ end
23
+ end
24
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-mixpanel
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshihara
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2015-10-06 00:00:00.000000000 Z
12
+ date: 2015-10-29 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
@@ -199,11 +199,15 @@ files:
199
199
  - gemfiles/template.erb
200
200
  - lib/embulk/input/mixpanel.rb
201
201
  - lib/embulk/input/mixpanel_api/client.rb
202
+ - lib/range_generator.rb
203
+ - lib/timezone_validator.rb
202
204
  - test/embulk/input/mixpanel_api/test_client.rb
203
205
  - test/embulk/input/test_mixpanel.rb
204
206
  - test/override_assert_raise.rb
205
207
  - test/prepare_embulk.rb
206
208
  - test/run-test.rb
209
+ - test/test_range_generator.rb
210
+ - test/test_timezone_validator.rb
207
211
  homepage: https://github.com/treasure-data/embulk-input-mixpanel
208
212
  licenses:
209
213
  - Apache2
@@ -234,3 +238,5 @@ test_files:
234
238
  - test/override_assert_raise.rb
235
239
  - test/prepare_embulk.rb
236
240
  - test/run-test.rb
241
+ - test/test_range_generator.rb
242
+ - test/test_timezone_validator.rb