embulk-input-mixpanel 0.5.15 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -14,6 +14,8 @@ module Embulk
14
14
  PING_RETRY_WAIT = 2
15
15
  SMALL_NUM_OF_RECORDS = 10
16
16
  DEFAULT_EXPORT_ENDPOINT = "https://data.mixpanel.com/api/2.0/export/".freeze
17
+ DEFAULT_JQL_ENDPOINT = "https://mixpanel.com/api/2.0/jql/".freeze
18
+ JQL_RATE_LIMIT = 60
17
19
 
18
20
  attr_reader :retryer
19
21
 
@@ -40,7 +42,7 @@ module Embulk
40
42
  end
41
43
  end
42
44
 
43
- def initialize(api_secret, retryer = nil, endpoint = DEFAULT_EXPORT_ENDPOINT)
45
+ def initialize(api_secret, endpoint, retryer = nil)
44
46
  @endpoint = endpoint
45
47
  @api_secret = api_secret
46
48
  @retryer = retryer || PerfectRetry.new do |config|
@@ -76,6 +78,30 @@ module Embulk
76
78
  raise ConfigError.new "#{params["from_date"]}..#{latest_tried_to_date} has no record."
77
79
  end
78
80
 
81
# Runs a JQL script against the configured endpoint and returns the parsed
# JSON result (an array of result rows).
#
# params - Hash with :script (JQL source) and :params (script parameters);
#          encoded for the POST body by #query_string via #request_jql.
#
# The whole request is wrapped in @retryer. Non-success responses are
# handled by #handle_error; a malformed JSON body raises Embulk::DataError.
def send_jql_script(params = {})
  retryer.with_retry do
    response = request_jql(params)
    handle_error(response, response.body)
    begin
      return JSON.parse(response.body)
    rescue JSON::ParserError => e
      # Only a malformed body should become a DataError. The original
      # rescued every StandardError here, which silently converted
      # unrelated bugs (e.g. NoMethodError) into data errors.
      raise Embulk::DataError.new(e)
    end
  end
end
92
+
93
# Runs a JQL script like #send_jql_script but returns only the first
# SMALL_NUM_OF_RECORDS rows — used for guess/preview, where a small
# sample is enough.
#
# params - Hash with :script and :params, same shape as #send_jql_script.
def send_jql_script_small_dataset(params = {})
  retryer.with_retry do
    response = request_jql(params)
    handle_error(response, response.body)
    begin
      return JSON.parse(response.body)[0..SMALL_NUM_OF_RECORDS - 1]
    rescue JSON::ParserError => e
      # Narrowed from a bare rescue: only JSON parse failures are data
      # errors; anything else should propagate unchanged.
      raise Embulk::DataError.new(e.message)
    end
  end
end
104
+
79
105
  def try_to_dates(from_date)
80
106
  try_to_dates = 5.times.map do |n|
81
107
  # from_date + 1, from_date + 10, from_date + 100, ... so on
@@ -107,18 +133,18 @@ module Embulk
107
133
  Embulk.logger.info "Sending request to #{@endpoint}"
108
134
  response = httpclient.get(@endpoint, params) do |response, chunk|
109
135
  # Only process data if response status is 200..299
110
- if response.status/100 == 2
136
+ if response.status / 100 == 2
111
137
  chunk.each_line do |line|
112
138
  begin
113
139
  record = JSON.parse(buf + line)
114
140
  block.call record
115
141
  buf = ""
116
- rescue JSON::ParserError => e
142
+ rescue JSON::ParserError=>e
117
143
  buf << line
118
144
  end
119
145
  end
120
146
  else
121
- error_response << chunk
147
+ error_response << chunk
122
148
  end
123
149
  end
124
150
  handle_error(response, error_response)
@@ -129,24 +155,37 @@ module Embulk
129
155
  end
130
156
  end
131
157
 
158
# Issues the JQL POST request and returns the raw HTTP response.
# The form-encoded body (script + params) is built by #query_string.
def request_jql(parameters)
  Embulk.logger.info("Sending request to #{@endpoint} params #{parameters}")
  encoded_body = query_string(parameters)
  httpclient.post(@endpoint, encoded_body)
end
162
+
163
# Builds the URL-encoded form body for a JQL request.
# prs[:params] is serialized to a JSON string; prs[:script] is sent as-is.
def query_string(prs)
  serialized_params = JSON.generate(prs[:params])
  form_fields = {params: serialized_params, script: prs[:script]}
  URI.encode_www_form(form_fields)
end
169
+
132
170
# Fetches only the first num_of_records records — used for guess/preview
# so we do not pull a full export just to sample the schema.
#
# params         - Hash of export query parameters (from_date, to_date, ...).
# num_of_records - maximum number of records the API should return.
#
# Returns an Enumerator over the parsed records (via #response_to_enum).
def request_small_dataset(params, num_of_records)
  # Merge instead of assigning into the hash: the original mutated the
  # caller's params in place, leaking the "limit" key back to the caller.
  params = params.merge("limit" => num_of_records)
  Embulk.logger.info "Sending request to #{@endpoint}"
  res = httpclient.get(@endpoint, params)
  handle_error(res, res.body)
  response_to_enum(res.body)
end
141
179
 
142
180
  def handle_error(response, error_response)
143
181
  Embulk.logger.debug "response code: #{response.code}"
144
182
  case response.code
183
+ when 429
184
+ # [429] {"error": "too many export requests in progress for this project"}
185
+ Embulk.logger.info "Hit rate limit sleep for 1 hour"
186
+ sleep(60 * 60)
187
+ raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
145
188
  when 400..499
146
- if response.code == 429
147
- # [429] {"error": "too many export requests in progress for this project"}
148
- raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
149
- end
150
189
  raise ConfigError.new("[#{response.code}] #{error_response}")
151
190
  when 500..599
152
191
  raise RuntimeError.new("[#{response.code}] #{error_response}")
@@ -0,0 +1,122 @@
1
+ require "perfect_retry"
2
+ require "range_generator"
3
+ require "timezone_validator"
4
+ require "active_support/core_ext/time"
5
+ require "tzinfo"
6
+ require "embulk/input/mixpanel_api/client"
7
+ require "embulk/input/mixpanel_api/exceptions"
8
+
9
module Embulk
  module Input
    module Service
      # Shared behavior for the Mixpanel input plugin services: config
      # validation, timezone helpers, API client construction with a retry
      # policy, and the date-range computation for incremental runs.
      class BaseService

        # "event" is the only top-level column in an export record; all other
        # columns live under the record's "properties" hash.
        NOT_PROPERTY_COLUMN = "event".freeze
        DEFAULT_FETCH_DAYS = 7
        DEFAULT_TIME_COLUMN = 'time'.freeze

        # config - Embulk config source; values are read lazily via
        #          @config.param throughout this class.
        def initialize(config)
          @config = config
        end

        # Start date used when guessing the schema: today minus
        # (DEFAULT_FETCH_DAYS + 1) in the given timezone, leaving a full
        # window of data to sample.
        def default_guess_start_date(timezone)
          today(timezone) - DEFAULT_FETCH_DAYS - 1
        end

        protected

        # Validates the required config; subclasses call super and add their
        # own service-specific checks.
        def validate_config
          timezone = @config.param(:timezone, :string)
          validate_timezone(timezone)
        end

        # Delegates to TimezoneValidator, which raises on an unknown name.
        def validate_timezone(timezone)
          TimezoneValidator.new(timezone).validate
        end

        # Fail fast with a DataError before starting a long run when the
        # Mixpanel endpoint is unreachable.
        def giveup_when_mixpanel_is_down
          unless MixpanelApi::Client.mixpanel_available?(endpoint)
            raise Embulk::DataError.new("Mixpanel service is down. Please retry later.")
          end
        end

        # Converts a Mixpanel project-local epoch to UTC by subtracting the
        # project timezone's UTC offset. Returns nil when epoch is blank.
        # c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
        def adjust_timezone(epoch)
          # Adjust timezone offset to get UTC time
          if epoch.present?
            tz = TZInfo::Timezone.get(@timezone)
            offset = tz.period_for_local(epoch, true).offset.utc_total_offset
            epoch - offset
          end
        end

        # Today's date in the given timezone; falls back to the system date
        # when timezone is nil or not recognized by ActiveSupport.
        def today(timezone)
          if timezone.nil?
            Date.today
          else
            zone = ActiveSupport::TimeZone[timezone]
            zone.nil? ? Date.today : zone.today
          end
        end

        # Maps a fetched record onto the configured schema's column order.
        # Relies on the subclass's extract_value and on @schema being set.
        def extract_values(record)
          @schema.map do |column|
            extract_value(record, column["name"])
          end
        end

        # True when Embulk runs a preview (JRuby interop). Exec.isPreview
        # raises an NPE when called outside a transaction context; that is
        # treated as "not a preview".
        def preview?
          begin
            org.embulk.spi.Exec.isPreview()
          rescue java.lang.NullPointerException=>e
            false
          end
        end

        # Returns @client when one is already present (presumably injected,
        # e.g. by tests — not assigned in this class); otherwise builds a
        # MixpanelApi::Client with a retryer configured from the task config.
        def create_client
          if @client.present?
            @client
          else
            retryer = perfect_retry({
              retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
              retry_limit: @config.param(:retry_limit, :integer, default: 5),
            })
            MixpanelApi::Client.new(@config.param(:api_secret, :string), endpoint, retryer)
          end
        end

        # Builds a PerfectRetry instance: linear backoff of
        # retry_initial_wait_sec * 2 * (n - 1) seconds before attempt n,
        # retrying RuntimeError only; ConfigError and
        # IncompleteExportResponseError are never retried.
        def perfect_retry(task)
          PerfectRetry.new do |config|
            config.limit = task[:retry_limit]
            config.sleep = proc {|n| task[:retry_initial_wait_sec] * (2 * (n - 1))}
            config.dont_rescues = [Embulk::ConfigError, MixpanelApi::IncompleteExportResponseError]
            config.rescues = [RuntimeError]
            config.log_level = nil
            config.logger = Embulk.logger
          end
        end

        # Computes the list of date strings to fetch. On an incremental run
        # with an incremental column and a non-zero watermark, the window is
        # widened backwards by back_fill_days to re-capture late-arriving
        # records (fetch_days is widened to keep the same end date).
        def range
          timezone = @config.param(:timezone, :string, default: "")
          from_date = @config.param(:from_date, :string, default: (today(timezone) - 2).to_s)
          incremental = @config.param(:incremental, :bool, default: true)
          incremental_column = @config.param(:incremental_column, :string, default: nil)
          latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
          fetch_days = @config.param(:fetch_days, :integer, default: nil)

          # Backfill from date if incremental and an incremental field is set and we are in incremental run
          if incremental && incremental_column && latest_fetched_time !=0
            back_fill_days = @config.param(:back_fill_days, :integer, default: 5)
            Embulk.logger.info "Backfill days #{back_fill_days}"
            from_date = (Date.parse(from_date) - back_fill_days).to_s
            fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
          end

          RangeGenerator.new(from_date, fetch_days, timezone).generate_range
        end
      end
    end
  end
end
@@ -0,0 +1,284 @@
1
+ require 'embulk/input/service/base_service'
2
+
3
module Embulk
  module Input
    module Service
      # Service for Mixpanel's raw event Export API: builds the Embulk task,
      # streams records into the page builder, tracks the incremental
      # watermark, and guesses the schema from a small data sample.
      class ExportService < BaseService

        # Property keys Mixpanel sets by default; any other key on a record
        # counts as a "custom property" for collect_custom_properties.
        # https://mixpanel.com/help/questions/articles/special-or-reserved-properties
        # https://mixpanel.com/help/questions/articles/what-properties-do-mixpanels-libraries-store-by-default
        #
        # JavaScript to extract key names from HTML: run it on Chrome Devtool when opening their document
        # > Array.from(document.querySelectorAll("strong")).map(function(s){ return s.textContent.match(/[A-Z]/) ? s.parentNode.textContent.match(/\((.*?)\)/)[1] : s.textContent.split(",").join(" ") }).join(" ")
        # > Array.from(document.querySelectorAll("li")).map(function(s){ m = s.textContent.match(/\((.*?)\)/); return m && m[1] }).filter(function(k) { return k && !k.match("utm") }).join(" ")
        KNOWN_KEYS = %W(
          #{NOT_PROPERTY_COLUMN}
          distinct_id ip mp_name_tag mp_note token time mp_country_code length campaign_id $email $phone $distinct_id $ios_devices $android_devices $first_name $last_name $name $city $region $country_code $timezone $unsubscribed
          $city $region mp_country_code $browser $browser_version $device $current_url $initial_referrer $initial_referring_domain $os $referrer $referring_domain $screen_height $screen_width $search_engine $city $region $mp_country_code $timezone $browser_version $browser $initial_referrer $initial_referring_domain $os $last_seen $city $region mp_country_code $app_release $app_version $carrier $ios_ifa $os_version $manufacturer $lib_version $model $os $screen_height $screen_width $wifi $city $region $mp_country_code $timezone $ios_app_release $ios_app_version $ios_device_model $ios_lib_version $ios_version $ios_ifa $last_seen $city $region mp_country_code $app_version $bluetooth_enabled $bluetooth_version $brand $carrier $has_nfc $has_telephone $lib_version $manufacturer $model $os $os_version $screen_dpi $screen_height $screen_width $wifi $google_play_services $city $region mp_country_code $timezone $android_app_version $android_app_version_code $android_lib_version $android_os $android_os_version $android_brand $android_model $android_manufacturer $last_seen
        ).uniq.freeze

        # Validates export-specific options on top of the BaseService checks.
        # Raises ConfigError when the incremental upper limit does not exceed
        # the last fetched time, or when both unknown-column and
        # custom-property fetching are enabled (mutually exclusive outputs).
        def validate_config
          super

          incremental_column = @config.param(:incremental_column, :string, default: nil)
          latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
          fetch_custom_properties = @config.param(:fetch_custom_properties, :bool, default: true)
          fetch_unknown_columns = @config.param(:fetch_unknown_columns, :bool, default: false)

          if !incremental_column.nil? && !latest_fetched_time.nil? && (incremental_column_upper_limit <= latest_fetched_time)
            raise Embulk::ConfigError.new("Incremental column upper limit (job_start_time - incremental_column_upper_limit_delay_in_seconds) can't be smaller or equal latest fetched time #{latest_fetched_time}")
          end

          if fetch_unknown_columns && fetch_custom_properties
            raise Embulk::ConfigError.new("Don't set true both `fetch_unknown_columns` and `fetch_custom_properties`.")
          end
        end

        # Builds the task hash handed to each Embulk task runner.
        def create_task
          {
            params: export_params,
            dates: range,
            timezone: @config.param(:timezone, :string, default: ""),
            export_endpoint: endpoint,
            api_secret: @config.param(:api_secret, :string),
            schema: @config.param(:columns, :array),
            fetch_unknown_columns: @config.param(:fetch_unknown_columns, :bool, default: false),
            fetch_custom_properties: @config.param(:fetch_custom_properties, :bool, default: true),
            retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
            incremental_column: @config.param(:incremental_column, :string, default: nil),
            retry_limit: @config.param(:retry_limit, :integer, default: 5),
            latest_fetched_time: @config.param(:latest_fetched_time, :integer, default: 0),
            incremental: @config.param(:incremental, :bool, default: true),
            slice_range: @config.param(:slice_range, :integer, default: 7),
            job_start_time: Time.now.to_i * 1000, # epoch milliseconds
            incremental_column_upper_limit: incremental_column_upper_limit,
            allow_partial_import: @config.param(:allow_partial_import, :bool, default: true)
          }
        end

        # Config diff for the next run: resume from the last processed
        # to_date and carry the watermark forward.
        def next_from_date(task_report)
          next_to_date = Date.parse(task_report[:to_date])
          {
            from_date: next_to_date.to_s,
            latest_fetched_time: task_report[:latest_fetched_time],
          }
        end

        # Main ingestion loop. Fetches records slice_range days at a time,
        # skips records at or below the previous watermark when running
        # incrementally without an incremental column, pushes rows into
        # page_builder, and returns the task report with the new watermark.
        def ingest(task, page_builder)
          giveup_when_mixpanel_is_down

          @schema = task[:schema]
          @timezone = task[:timezone]

          Embulk.logger.info "Job start time is #{task[:job_start_time]}"

          dates = task[:dates]
          prev_latest_fetched_time = task[:latest_fetched_time] || 0
          prev_latest_fetched_time_format = Time.at(prev_latest_fetched_time).strftime("%F %T %z")
          current_latest_fetched_time = prev_latest_fetched_time
          incremental_column = task[:incremental_column]
          incremental = task[:incremental]
          fetch_unknown_columns = task[:fetch_unknown_columns]

          dates.each_slice(task[:slice_range]) do |slice_dates|
            # Count of records skipped because they were already loaded.
            ignored_fetched_record_count = 0
            # There is the issue with Mixpanel time field during the transition from standard to daylight saving time
            # in the US timezone i.e. 11 Mar 2018 2AM - 2:59AM, time within that period must not be existed,
            # due to daylight saving, time will be forwarded 1 hour from 2AM to 3AM.
            #
            # All of records with wrong timezone will be ignored instead of throw exception out
            ignored_wrong_daylight_tz_record_count = 0
            unless preview?
              Embulk.logger.info "Fetching data from #{slice_dates.first} to #{slice_dates.last} ..."
            end
            record_time_column = incremental_column || DEFAULT_TIME_COLUMN
            begin
              fetch(slice_dates, prev_latest_fetched_time, task).each do |record|
                if incremental
                  if !record["properties"].include?(record_time_column)
                    raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
                  end
                  record_time = record["properties"][record_time_column]
                  # Only the default time column is filtered client-side;
                  # a custom incremental column is filtered server-side in
                  # the "where" clause built by #fetch.
                  if incremental_column.nil?
                    if record_time <= prev_latest_fetched_time
                      ignored_fetched_record_count += 1
                      next
                    end
                  end

                  current_latest_fetched_time = [
                    current_latest_fetched_time,
                    record_time,
                  ].max
                end
                begin
                  values = extract_values(record)
                  if fetch_unknown_columns
                    unknown_values = extract_unknown_values(record)
                    values << unknown_values.to_json
                  end
                  if task[:fetch_custom_properties]
                    values << collect_custom_properties(record)
                  end
                  page_builder.add(values)
                rescue TZInfo::PeriodNotFound
                  # Record time falls inside a daylight-saving gap; skip it
                  # (see the comment above).
                  ignored_wrong_daylight_tz_record_count += 1
                end
              end
            rescue MixpanelApi::IncompleteExportResponseError
              if !task[:allow_partial_import]
                # re raise the exception if we don't allow partial import
                raise
              end
            end
            if ignored_fetched_record_count > 0
              Embulk.logger.warn "Skipped already loaded #{ignored_fetched_record_count} records. These record times are older or equal than previous fetched record time (#{prev_latest_fetched_time} @ #{prev_latest_fetched_time_format})."
            end
            if ignored_wrong_daylight_tz_record_count > 0
              Embulk.logger.warn "Skipped #{ignored_wrong_daylight_tz_record_count} records due to corrupted Mixpanel time transition from standard to daylight saving"
            end
            # Preview only needs the first slice.
            break if preview?
          end
          page_builder.finish
          create_task_report(current_latest_fetched_time, dates.last, task[:timezone])
        end

        # Task report: new watermark plus the last date processed (falling
        # back to yesterday in the configured timezone when dates was empty).
        def create_task_report(current_latest_fetched_time, to_date, timezone)
          {
            latest_fetched_time: current_latest_fetched_time,
            to_date: to_date || today(timezone) - 1,
          }
        end

        # Guesses the column schema from a small export sample.
        def guess_columns
          giveup_when_mixpanel_is_down
          range = guess_range
          Embulk.logger.info "Guessing schema using #{range.first}..#{range.last} records"

          params = export_params.merge(
            "from_date"=>range.first,
            "to_date"=>range.last,
          )

          client = create_client
          guess_from_records(client.export_for_small_dataset(params))
        end

        # Date window used for guessing. NOTE(review): returns an Array of
        # date strings normally, but a Range of Date objects in the empty
        # fallback — callers appear to use only first/last, so both work.
        def guess_range
          time_zone = @config.param(:timezone, :string, default: "")
          from_date = @config.param(:from_date, :string, default: default_guess_start_date(time_zone).to_s)
          fetch_days = @config.param(:fetch_days, :integer, default: DEFAULT_FETCH_DAYS)
          range = RangeGenerator.new(from_date, fetch_days, time_zone).generate_range
          if range.empty?
            return default_guess_start_date(time_zone)..(today(time_zone) - 1)
          end
          range
        end

        # Export API filter parameters from config; the event list is sent
        # as a JSON-encoded array.
        def export_params
          event = @config.param(:event, :array, default: nil)
          event = event.nil? ? nil : event.to_json
          {
            event: event,
            where: @config.param(:where, :string, default: nil),
            bucket: @config.param(:bucket, :string, default: nil),
          }
        end

        # Builds guessed columns from sample records: schema of the
        # "properties" hash, with "event" and "time" forced to the front.
        def guess_from_records(records)
          sample_props = records.map {|r| r["properties"]}
          schema = Guess::SchemaGuess.from_hash_records(sample_props)
          columns = schema.map do |col|
            next if col.name == "time"
            result = {
              name: col.name,
              type: col.type,
            }
            # NOTE(review): string key into a symbol-keyed hash — verify the
            # consumer treats "format" and :format alike.
            result["format"] = col.format if col.format
            result
          end.compact
          columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
          # Shift incremental column to top
          columns.unshift(name: "time", type: :long)
        end

        # Fetches records for the given dates. With an incremental column, a
        # server-side "where" filter bounds it to
        # (last_fetch_time, incremental_column_upper_limit); the default
        # time column cannot be filtered server-side and is filtered in
        # #ingest instead. Returns an Enumerator (small dataset in preview).
        # NOTE(review): the &block parameter is accepted but never used here.
        def fetch(dates, last_fetch_time, task, &block)
          from_date = dates.first
          to_date = dates.last
          params = task[:params].merge(
            "from_date"=>from_date,
            "to_date"=>to_date
          )
          incremental_column = task[:incremental_column]
          if !incremental_column.nil? # can't do filter on time column, time column need to be filter manually.
            params = params.merge(
              "where"=>"#{params['where'].nil? ? '' : "(#{params['where']}) and " }properties[\"#{incremental_column}\"] > #{last_fetch_time || 0} and properties[\"#{incremental_column}\"] < #{task[:incremental_column_upper_limit]}"
            )
          end
          Embulk.logger.info "Where params is #{params["where"]}"

          client = create_client

          if preview?
            client.export_for_small_dataset(params)
          else
            Enumerator.new do |y|
              client.export(params) do |record|
                y << record
              end
            end
          end
        end

        # Export endpoint, overridable via config.
        def endpoint
          @config.param(:export_endpoint, :string, default: Embulk::Input::MixpanelApi::Client::DEFAULT_EXPORT_ENDPOINT)
        end

        private

        # Upper bound (epoch ms) for incremental-column filtering:
        # job start time minus the configured delay.
        def incremental_column_upper_limit
          job_start_time = Time.now.to_i * 1000
          upper_limit_delay = @config.param(:incremental_column_upper_limit_delay_in_seconds, :integer, default: 0)
          job_start_time - (upper_limit_delay * 1000)
        end

        # Reads one column's value from a record: "event" from the top
        # level, "time" adjusted to UTC, everything else from "properties".
        def extract_value(record, name)
          case name
          when NOT_PROPERTY_COLUMN
            record[NOT_PROPERTY_COLUMN]
          when "time"
            time = record["properties"]["time"]
            adjust_timezone(time)
          else
            record["properties"][name]
          end
        end

        # Hash of properties that are neither Mixpanel defaults (KNOWN_KEYS)
        # nor explicitly configured schema columns.
        def collect_custom_properties(record)
          specified_columns = @schema.map {|col| col["name"]}
          custom_keys = record["properties"].keys.find_all {|key| !KNOWN_KEYS.include?(key.to_s) && !specified_columns.include?(key.to_s)}
          custom_keys.inject({}) do |result, key|
            result.merge({
              key=>record["properties"][key]
            })
          end
        end

        # Hash of record keys missing from the configured schema; logs a
        # warning when any are found.
        def extract_unknown_values(record)
          record_keys = record["properties"].keys + [NOT_PROPERTY_COLUMN]
          schema_keys = @schema.map {|column| column["name"]}
          unknown_keys = record_keys - schema_keys

          unless unknown_keys.empty?
            Embulk.logger.warn("Unknown columns exists in record: #{unknown_keys.join(', ')}")
          end

          unknown_keys.inject({}) do |result, key|
            result[key] = extract_value(record, key)
            result
          end
        end
      end
    end
  end
end