embulk-input-mixpanel 0.5.15 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 6c2a00332d19da50d63a726cac679e745206ccab
4
- data.tar.gz: f2810c25272ee6b001f206dd0ddd93895c247ab6
3
+ metadata.gz: 8fc75a11eeef6fc6e9831e11a5b3eeae414d7799
4
+ data.tar.gz: 8176825bc52368dd5abd8eea00f394dec3061d37
5
5
  SHA512:
6
- metadata.gz: 062a5d9d8948ba9efe802b477756484f03add6ce542d9bdbf40ee3988c2c943f5f6445fc32b15d05dfd0b909761fbf4f47f530e0e2d1357b1a57ca0c20074c55
7
- data.tar.gz: df58ccf3ab6acad568681d57c39e97258849dc6e3946f5367ccdf31d64265d8c6d6afbb724259b66d21003b670b1763b6e097a0e871ff9bdaeb9e396f5e1b0cc
6
+ metadata.gz: 2d13fa15449b9900e4179260b6609a536e058de90e96ae691a88ebd757a13b6ddd8972a2715802a8d18557e30157220d1703b933e97a32e2d7766c347fc09c05
7
+ data.tar.gz: 42c5a4bada8075467e6b1195c7a232f78dc77e71f65a9b573b5e17152af53405a11595f43b4f6456db6f45b54267668a8ed6a6dc6c6db5e59bceb8fb78fec6fb
@@ -1,3 +1,11 @@
1
+ ## 0.6.1 - 2020-04-06
2
+
3
+ * [enhancement] Support JQL script for Profile [#66](https://github.com/treasure-data/embulk-input-mixpanel/pull/66)
4
+
5
+ ## 0.6.0 - 2020-03-30
6
+
7
+ * [enhancement] Support JQL script [#65](https://github.com/treasure-data/embulk-input-mixpanel/pull/65)
8
+
1
9
  ## 0.5.15 - 2020-01-22
2
10
 
3
11
  * [enhancement] Update the authentication method to latest [#63](https://github.com/treasure-data/embulk-input-mixpanel/pull/63)
data/README.md CHANGED
@@ -33,13 +33,16 @@ To get it, you should log in mixpanel website, and click gear icon at the lower
33
33
 
34
34
  - **api_secret**: project API Secret (string, required)
35
35
  - **export_endpoint**: the Data Export API's endpoint (string, default to "http://data.mixpanel.com/api/2.0/export")
36
+ - **jql_endpoint**: the JQL API's endpoint (string, default to "https://mixpanel.com/api/2.0/jql/")
37
+ - **jql_mode**: using JQL or export endpoint (boolean, default to false)
38
+ - **jql_script**: JQL script sent to the JQL endpoint (string)
36
39
  - **timezone**: project timezone (string, required)
37
40
  - **from_date**: From date to export (string, optional, default: today - 2)
38
41
  - NOTE: Mixpanel API supports exporting data from at least 2 days before to at most the previous day.
39
42
  - **fetch_days**: Count of days range for exporting (integer, optional, default: from_date - (today - 1))
40
43
  - NOTE: Mixpanel doesn't support from_date > today - 2
41
44
  - **incremental**: Run incremental mode or not (boolean, optional, default: true)
42
- - **incremental_column**: Column to be added to the where query as a constraint for incremental time. Only data whose incremental_column timestamp is greater than the previous latest_fetched_time will be returned (string, optional, default: nil)
45
+ - **incremental_column**: Column to be added to the where query as a constraint for incremental time. Only data whose incremental_column timestamp is greater than the previous latest_fetched_time will be returned (string, optional, default: time)
43
46
  - **back_fill_time**: Amount of time that will be subtracted from `from_date` to calculate the final `from_date` that will be used for the API request. This is due to Mixpanel caching data on user devices before sending it to the Mixpanel server (integer, optional, default: 5)
44
47
  - NOTE: Only has effect when incremental is true and incremental_column is specified
45
48
  - **incremental_column_upper_limit_delay_in_seconds**: When querying with an incremental column, the plugin locks the upper limit of the incremental column query to the job start time, in order to avoid issues with data that is committed while the job is running
@@ -1,6 +1,6 @@
1
1
  Gem::Specification.new do |spec|
2
2
  spec.name = "embulk-input-mixpanel"
3
- spec.version = "0.5.15"
3
+ spec.version = "0.6.1"
4
4
  spec.authors = ["yoshihara", "uu59"]
5
5
  spec.summary = "Mixpanel input plugin for Embulk"
6
6
  spec.description = "Loads records from Mixpanel."
@@ -1,88 +1,16 @@
1
- require "tzinfo"
2
- require "perfect_retry"
3
- require "embulk/input/mixpanel_api/client"
4
- require "embulk/input/mixpanel_api/exceptions"
5
- require "range_generator"
6
- require "timezone_validator"
7
- require "active_support/core_ext/time"
1
+ require "embulk/input/service/jql_service"
2
+ require "embulk/input/service/export_service"
8
3
 
9
4
  module Embulk
10
5
  module Input
11
6
  class Mixpanel < InputPlugin
12
7
  Plugin.register_input("mixpanel", self)
13
-
14
- NOT_PROPERTY_COLUMN = "event".freeze
15
-
16
- # https://mixpanel.com/help/questions/articles/special-or-reserved-properties
17
- # https://mixpanel.com/help/questions/articles/what-properties-do-mixpanels-libraries-store-by-default
18
- #
19
- # JavaScript to extract key names from HTML: run it on Chrome Devtool when opening their document
20
- # > Array.from(document.querySelectorAll("strong")).map(function(s){ return s.textContent.match(/[A-Z]/) ? s.parentNode.textContent.match(/\((.*?)\)/)[1] : s.textContent.split(",").join(" ") }).join(" ")
21
- # > Array.from(document.querySelectorAll("li")).map(function(s){ m = s.textContent.match(/\((.*?)\)/); return m && m[1] }).filter(function(k) { return k && !k.match("utm") }).join(" ")
22
- KNOWN_KEYS = %W(
23
- #{NOT_PROPERTY_COLUMN}
24
- distinct_id ip mp_name_tag mp_note token time mp_country_code length campaign_id $email $phone $distinct_id $ios_devices $android_devices $first_name $last_name $name $city $region $country_code $timezone $unsubscribed
25
- $city $region mp_country_code $browser $browser_version $device $current_url $initial_referrer $initial_referring_domain $os $referrer $referring_domain $screen_height $screen_width $search_engine $city $region $mp_country_code $timezone $browser_version $browser $initial_referrer $initial_referring_domain $os $last_seen $city $region mp_country_code $app_release $app_version $carrier $ios_ifa $os_version $manufacturer $lib_version $model $os $screen_height $screen_width $wifi $city $region $mp_country_code $timezone $ios_app_release $ios_app_version $ios_device_model $ios_lib_version $ios_version $ios_ifa $last_seen $city $region mp_country_code $app_version $bluetooth_enabled $bluetooth_version $brand $carrier $has_nfc $has_telephone $lib_version $manufacturer $model $os $os_version $screen_dpi $screen_height $screen_width $wifi $google_play_services $city $region mp_country_code $timezone $android_app_version $android_app_version_code $android_lib_version $android_os $android_os_version $android_brand $android_model $android_manufacturer $last_seen
26
- ).uniq.freeze
27
-
28
-
29
- DEFAULT_FETCH_DAYS = 7
30
- DEFAULT_TIME_COLUMN = 'time'
31
-
8
+
32
9
  def self.transaction(config, &control)
33
- timezone = config.param(:timezone, :string)
34
- TimezoneValidator.new(timezone).validate
35
-
36
- from_date = config.param(:from_date, :string, default: (today(timezone) - 2).to_s)
37
- fetch_days = config.param(:fetch_days, :integer, default: nil)
38
-
39
-
40
- fetch_unknown_columns = config.param(:fetch_unknown_columns, :bool, default: false)
41
-
42
- incremental_column = config.param(:incremental_column, :string, default: nil)
43
- incremental = config.param(:incremental, :bool, default: true)
44
- latest_fetched_time = config.param(:latest_fetched_time, :integer, default: 0)
45
-
46
- # Backfill from date if incremental and an incremental field is set and we are in incremental run
47
- if incremental && incremental_column && latest_fetched_time !=0
48
- back_fill_days = config.param(:back_fill_days, :integer, default: 5)
49
- Embulk.logger.info "Backfill days #{back_fill_days}"
50
- from_date = (Date.parse(from_date) - back_fill_days).to_s
51
- fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
52
- end
53
-
54
- range = RangeGenerator.new(from_date, fetch_days, timezone).generate_range
55
- Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
56
- job_start_time = Time.now.to_i*1000
57
- upper_limit_delay = config.param(:incremental_column_upper_limit_delay_in_seconds, :integer, default: 0)
58
- incremental_column_upper_limit = job_start_time - (upper_limit_delay * 1000)
59
- task = {
60
- params: export_params(config),
61
- dates: range,
62
- timezone: timezone,
63
- export_endpoint: export_endpoint(config),
64
- api_secret: config.param(:api_secret, :string),
65
- schema: config.param(:columns, :array),
66
- fetch_unknown_columns: fetch_unknown_columns,
67
- fetch_custom_properties: config.param(:fetch_custom_properties, :bool, default: true),
68
- retry_initial_wait_sec: config.param(:retry_initial_wait_sec, :integer, default: 1),
69
- incremental_column: incremental_column,
70
- retry_limit: config.param(:retry_limit, :integer, default: 5),
71
- latest_fetched_time: latest_fetched_time,
72
- incremental: incremental,
73
- slice_range: config.param(:slice_range, :integer, default: 7),
74
- job_start_time: job_start_time,
75
- incremental_column_upper_limit: incremental_column_upper_limit,
76
- allow_partial_import: config.param(:allow_partial_import,:bool, default: true)
77
- }
78
-
79
- if !incremental_column.nil? && !latest_fetched_time.nil? && (incremental_column_upper_limit <= latest_fetched_time)
80
- raise Embulk::ConfigError.new("Incremental column upper limit (job_start_time - incremental_column_upper_limit_delay_in_seconds) can't be smaller or equal latest fetched time #{latest_fetched_time}")
81
- end
82
-
83
- if task[:fetch_unknown_columns] && task[:fetch_custom_properties]
84
- raise Embulk::ConfigError.new("Don't set true both `fetch_unknown_columns` and `fetch_custom_properties`.")
85
- end
10
+ service = service(config)
11
+ service.validate_config
12
+ task = service.create_task
13
+ Embulk.logger.info "Try to fetch data from #{task[:dates].first} to #{task[:dates].last}"
86
14
 
87
15
  columns = task[:schema].map do |column|
88
16
  name = column["name"]
@@ -91,15 +19,15 @@ module Embulk
91
19
  Column.new(nil, name, type, column["format"])
92
20
  end
93
21
 
94
- if fetch_unknown_columns
95
- Embulk.logger.warn "Deprecated `unknown_columns`. Use `fetch_custom_properties` instead."
96
- columns << Column.new(nil, "unknown_columns", :json)
97
- end
98
-
99
22
  if task[:fetch_custom_properties]
100
23
  columns << Column.new(nil, "custom_properties", :json)
101
24
  end
102
25
 
26
+ if task[:fetch_unknown_columns]
27
+ Embulk.logger.warn "Deprecated `unknown_columns`. Use `fetch_custom_properties` instead."
28
+ columns << Column.new(nil, "unknown_columns", :json)
29
+ end
30
+
103
31
  resume(task, columns, 1, &control)
104
32
  end
105
33
 
@@ -110,283 +38,37 @@ module Embulk
110
38
  # implementation is terrible.
111
39
  if task[:incremental]
112
40
  task_report = task_reports.first
113
- next_to_date = Date.parse(task_report[:to_date])
114
-
115
- next_config_diff = {
116
- from_date: next_to_date.to_s,
117
- latest_fetched_time: task_report[:latest_fetched_time],
118
- }
119
- return next_config_diff
41
+ service = service(task)
42
+ next_from_date = service.next_from_date(task_report)
43
+ return next_from_date
120
44
  end
121
45
  return {}
122
46
  end
123
47
 
124
48
  def self.guess(config)
125
- giveup_when_mixpanel_is_down(export_endpoint(config))
126
-
127
- retryer = perfect_retry({
128
- retry_initial_wait_sec: config.param(:retry_initial_wait_sec, :integer, default: 1),
129
- retry_limit: config.param(:retry_limit, :integer, default: 5),
130
- })
131
- client = MixpanelApi::Client.new(config.param(:api_secret, :string),
132
- retryer,
133
- export_endpoint(config))
134
-
135
- range = guess_range(config)
136
- Embulk.logger.info "Guessing schema using #{range.first}..#{range.last} records"
137
-
138
- params = export_params(config).merge(
139
- "from_date" => range.first,
140
- "to_date" => range.last,
141
- )
142
- columns = guess_from_records(client.export_for_small_dataset(params))
143
- return {"columns" => columns}
144
- end
145
-
146
- def self.perfect_retry(task)
147
- PerfectRetry.new do |config|
148
- config.limit = task[:retry_limit]
149
- config.sleep = proc{|n| task[:retry_initial_wait_sec] * (2 * (n - 1)) }
150
- config.dont_rescues = [Embulk::ConfigError,MixpanelApi::IncompleteExportResponseError]
151
- config.rescues = [RuntimeError]
152
- config.log_level = nil
153
- config.logger = Embulk.logger
154
- end
155
- end
156
-
157
- def self.export_endpoint(config)
158
- config.param(:export_endpoint, :string, default: Embulk::Input::MixpanelApi::Client::DEFAULT_EXPORT_ENDPOINT)
49
+ service = service(config)
50
+ service.validate_config
51
+ return {"columns"=>service.guess_columns}
159
52
  end
160
53
 
161
54
  def init
162
- @export_endpoint = task[:export_endpoint]
163
55
  @api_secret = task[:api_secret]
164
- @params = task[:params]
165
- @timezone = task[:timezone]
166
- @schema = task[:schema]
167
- @dates = task[:dates]
168
- @fetch_unknown_columns = task[:fetch_unknown_columns]
169
- @incremental_column = task[:incremental_column]
170
- @incremental = task[:incremental]
171
56
  end
172
57
 
173
58
  def run
174
- Embulk.logger.info "Job start time is #{task[:job_start_time]}"
175
- self.class.giveup_when_mixpanel_is_down(task[:export_endpoint])
176
- prev_latest_fetched_time = task[:latest_fetched_time] || 0
177
- prev_latest_fetched_time_format = Time.at(prev_latest_fetched_time).strftime("%F %T %z")
178
- current_latest_fetched_time = prev_latest_fetched_time
179
- @dates.each_slice(task[:slice_range]) do |slice_dates|
180
- ignored_fetched_record_count = 0
181
- # There is the issue with Mixpanel time field during the transition from standard to daylight saving time
182
- # in the US timezone i.e. 11 Mar 2018 2AM - 2:59AM, time within that period must not be existed,
183
- # due to daylight saving, time will be forwarded 1 hour from 2AM to 3AM.
184
- #
185
- # All of records with wrong timezone will be ignored instead of throw exception out
186
- ignored_wrong_daylight_tz_record_count = 0
187
- unless preview?
188
- Embulk.logger.info "Fetching data from #{slice_dates.first} to #{slice_dates.last} ..."
189
- end
190
- record_time_column=@incremental_column || DEFAULT_TIME_COLUMN
191
- begin
192
- fetch(slice_dates, prev_latest_fetched_time).each do |record|
193
- if @incremental
194
- if !record["properties"].include?(record_time_column)
195
- raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
196
- end
197
- record_time = record["properties"][record_time_column]
198
- if @incremental_column.nil?
199
- if record_time <= prev_latest_fetched_time
200
- ignored_fetched_record_count += 1
201
- next
202
- end
203
- end
204
-
205
- current_latest_fetched_time= [
206
- current_latest_fetched_time,
207
- record_time,
208
- ].max
209
- end
210
- begin
211
- values = extract_values(record)
212
- if @fetch_unknown_columns
213
- unknown_values = extract_unknown_values(record)
214
- values << unknown_values.to_json
215
- end
216
- if task[:fetch_custom_properties]
217
- values << collect_custom_properties(record)
218
- end
219
- page_builder.add(values)
220
- rescue TZInfo::PeriodNotFound
221
- ignored_wrong_daylight_tz_record_count += 1
222
- end
223
- end
224
- rescue MixpanelApi::IncompleteExportResponseError
225
- if !task[:allow_partial_import]
226
- # re raise the exception if we don't allow partial import
227
- raise
228
- end
229
- end
230
- if ignored_fetched_record_count > 0
231
- Embulk.logger.warn "Skipped already loaded #{ignored_fetched_record_count} records. These record times are older or equal than previous fetched record time (#{prev_latest_fetched_time} @ #{prev_latest_fetched_time_format})."
232
- end
233
- if ignored_wrong_daylight_tz_record_count > 0
234
- Embulk.logger.warn "Skipped #{ignored_wrong_daylight_tz_record_count} records due to corrupted Mixpanel time transition from standard to daylight saving"
235
- end
236
- break if preview?
237
- end
238
- page_builder.finish
239
- task_report = {
240
- latest_fetched_time: current_latest_fetched_time,
241
- to_date: @dates.last || today(@timezone) - 1,
242
- }
243
- task_report
59
+ Mixpanel::service(DataSource[task.to_a]).ingest(task, page_builder)
244
60
  end
245
61
 
246
62
  private
247
63
 
248
- def self.giveup_when_mixpanel_is_down(export_endpoint)
249
- unless MixpanelApi::Client.mixpanel_available?(export_endpoint)
250
- raise Embulk::DataError.new("Mixpanel service is down. Please retry later.")
251
- end
252
- end
253
-
254
- def extract_values(record)
255
- @schema.map do |column|
256
- extract_value(record, column["name"])
257
- end
258
- end
259
-
260
- def extract_value(record, name)
261
- case name
262
- when NOT_PROPERTY_COLUMN
263
- record[NOT_PROPERTY_COLUMN]
264
- when "time"
265
- time = record["properties"]["time"]
266
- adjust_timezone(time)
64
+ def self.service(config)
65
+ jql_mode = config[:jql_mode]
66
+ if jql_mode
67
+ Service::JqlService.new(config)
267
68
  else
268
- record["properties"][name]
269
- end
270
- end
271
-
272
- def collect_custom_properties(record)
273
- specified_columns = @schema.map{|col| col["name"]}
274
- custom_keys = record["properties"].keys.find_all{|key| !KNOWN_KEYS.include?(key.to_s) && !specified_columns.include?(key.to_s) }
275
- custom_keys.inject({}) do |result, key|
276
- result.merge({
277
- key => record["properties"][key]
278
- })
69
+ Service::ExportService.new(config)
279
70
  end
280
71
  end
281
-
282
- def extract_unknown_values(record)
283
- record_keys = record["properties"].keys + [NOT_PROPERTY_COLUMN]
284
- schema_keys = @schema.map {|column| column["name"]}
285
- unknown_keys = record_keys - schema_keys
286
-
287
- unless unknown_keys.empty?
288
- Embulk.logger.warn("Unknown columns exists in record: #{unknown_keys.join(', ')}")
289
- end
290
-
291
- unknown_keys.inject({}) do |result, key|
292
- result[key] = extract_value(record, key)
293
- result
294
- end
295
- end
296
-
297
- def fetch(dates, last_fetch_time, &block)
298
- from_date = dates.first
299
- to_date = dates.last
300
- params = @params.merge(
301
- "from_date" => from_date,
302
- "to_date" => to_date
303
- )
304
- if !@incremental_column.nil? # can't do filter on time column, time column need to be filter manually.
305
- params = params.merge(
306
- "where" => "#{params['where'].nil? ? '' : "(#{params['where']}) and " }properties[\"#{@incremental_column}\"] > #{last_fetch_time || 0} and properties[\"#{@incremental_column}\"] < #{task[:incremental_column_upper_limit]}"
307
- )
308
- end
309
- Embulk.logger.info "Where params is #{params["where"]}"
310
- client = MixpanelApi::Client.new(@api_secret, self.class.perfect_retry(task), @export_endpoint)
311
-
312
- if preview?
313
- client.export_for_small_dataset(params)
314
- else
315
- Enumerator.new do |y|
316
- client.export(params) do |record|
317
- y << record
318
- end
319
- end
320
- end
321
- end
322
-
323
- def adjust_timezone(epoch)
324
- # Adjust timezone offset to get UTC time
325
- # c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
326
- tz = TZInfo::Timezone.get(@timezone)
327
- offset = tz.period_for_local(epoch, true).offset.utc_total_offset
328
- epoch - offset
329
- end
330
-
331
- def preview?
332
- begin
333
- org.embulk.spi.Exec.isPreview()
334
- rescue java.lang.NullPointerException => e
335
- false
336
- end
337
- end
338
-
339
- def self.export_params(config)
340
- event = config.param(:event, :array, default: nil)
341
- event = event.nil? ? nil : event.to_json
342
- {
343
- event: event,
344
- where: config.param(:where, :string, default: nil),
345
- bucket: config.param(:bucket, :string, default: nil),
346
- }
347
- end
348
-
349
- def self.default_guess_start_date(timezone)
350
- today(timezone) - DEFAULT_FETCH_DAYS - 1
351
- end
352
-
353
- def self.guess_range(config)
354
- time_zone = config.param(:timezone, :string, default: "")
355
- from_date = config.param(:from_date, :string, default: default_guess_start_date(time_zone).to_s)
356
- fetch_days = config.param(:fetch_days, :integer, default: DEFAULT_FETCH_DAYS)
357
- range = RangeGenerator.new(from_date, fetch_days, time_zone).generate_range
358
- if range.empty?
359
- return default_guess_start_date(time_zone)..(today(time_zone) - 1)
360
- end
361
- range
362
- end
363
-
364
- def self.guess_from_records(records)
365
- sample_props = records.map {|r| r["properties"]}
366
- schema = Guess::SchemaGuess.from_hash_records(sample_props)
367
- columns = schema.map do |col|
368
- next if col.name == "time"
369
- result = {
370
- name: col.name,
371
- type: col.type,
372
- }
373
- result[:format] = col.format if col.format
374
- result
375
- end.compact
376
- columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
377
- # Shift incremental column to top
378
- columns.unshift(name: "time", type: :long)
379
- end
380
-
381
- def self.today(timezone)
382
- if timezone.nil?
383
- Date.today
384
- else
385
- zone = ActiveSupport::TimeZone[timezone]
386
- zone.nil? ? Date.today : zone.today
387
- end
388
- end
389
-
390
72
  end
391
73
  end
392
74
  end