embulk-input-mixpanel 0.5.15 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -14,6 +14,8 @@ module Embulk
14
14
  PING_RETRY_WAIT = 2
15
15
  SMALL_NUM_OF_RECORDS = 10
16
16
  DEFAULT_EXPORT_ENDPOINT = "https://data.mixpanel.com/api/2.0/export/".freeze
17
+ DEFAULT_JQL_ENDPOINT = "https://mixpanel.com/api/2.0/jql/".freeze
18
+ JQL_RATE_LIMIT = 60
17
19
 
18
20
  attr_reader :retryer
19
21
 
@@ -40,7 +42,7 @@ module Embulk
40
42
  end
41
43
  end
42
44
 
43
- def initialize(api_secret, retryer = nil, endpoint = DEFAULT_EXPORT_ENDPOINT)
45
+ def initialize(api_secret, endpoint, retryer = nil)
44
46
  @endpoint = endpoint
45
47
  @api_secret = api_secret
46
48
  @retryer = retryer || PerfectRetry.new do |config|
@@ -76,6 +78,30 @@ module Embulk
76
78
  raise ConfigError.new "#{params["from_date"]}..#{latest_tried_to_date} has no record."
77
79
  end
78
80
 
81
# Runs a JQL script against the Mixpanel JQL endpoint and returns the
# parsed JSON result, retrying transient failures via the configured retryer.
#
# @param params [Hash] expects :script (JQL source) and :params (script parameters)
# @return [Object] parsed JSON response body (typically an Array of rows)
# @raise [Embulk::DataError] when the response body is not valid JSON
def send_jql_script(params = {})
  retryer.with_retry do
    response = request_jql(params)
    handle_error(response, response.body)
    begin
      return JSON.parse(response.body)
    rescue JSON::ParserError => e
      # Only parse failures become DataError; other errors propagate to the retryer.
      raise Embulk::DataError.new(e.message)
    end
  end
end
92
+
93
# Runs a JQL script and returns only the first SMALL_NUM_OF_RECORDS rows.
# Used by guess/preview where a small sample of the result is enough.
#
# @param params [Hash] expects :script and :params, as for #send_jql_script
# @return [Array] at most SMALL_NUM_OF_RECORDS parsed rows
# @raise [Embulk::DataError] when the response body is not valid JSON
def send_jql_script_small_dataset(params = {})
  retryer.with_retry do
    response = request_jql(params)
    handle_error(response, response.body)
    begin
      return JSON.parse(response.body)[0..SMALL_NUM_OF_RECORDS - 1]
    rescue JSON::ParserError => e
      # Only parse failures become DataError; other errors propagate to the retryer.
      raise Embulk::DataError.new(e.message)
    end
  end
end
104
+
79
105
  def try_to_dates(from_date)
80
106
  try_to_dates = 5.times.map do |n|
81
107
  # from_date + 1, from_date + 10, from_date + 100, ... so on
@@ -107,18 +133,18 @@ module Embulk
107
133
  Embulk.logger.info "Sending request to #{@endpoint}"
108
134
  response = httpclient.get(@endpoint, params) do |response, chunk|
109
135
  # Only process data if response status is 200..299
110
- if response.status/100 == 2
136
+ if response.status / 100 == 2
111
137
  chunk.each_line do |line|
112
138
  begin
113
139
  record = JSON.parse(buf + line)
114
140
  block.call record
115
141
  buf = ""
116
- rescue JSON::ParserError => e
142
+ rescue JSON::ParserError=>e
117
143
  buf << line
118
144
  end
119
145
  end
120
146
  else
121
- error_response << chunk
147
+ error_response << chunk
122
148
  end
123
149
  end
124
150
  handle_error(response, error_response)
@@ -129,24 +155,37 @@ module Embulk
129
155
  end
130
156
  end
131
157
 
158
# POST a JQL request to the configured endpoint.
# Returns the raw HTTP response; error handling is left to the caller.
#
# @param parameters [Hash] :script and :params for the JQL call
# @return [HTTP::Message] raw httpclient response
def request_jql(parameters)
  Embulk.logger.info "Sending request to #{@endpoint} params #{parameters}"
  body = query_string(parameters)
  httpclient.post(@endpoint, body)
end
162
+
163
# Build the form-encoded request body for a JQL call: the :params hash is
# serialized to JSON, while the :script source is sent verbatim.
#
# @param prs [Hash] :params and :script
# @return [String] application/x-www-form-urlencoded body
def query_string(prs)
  form_fields = {
    params: JSON.generate(prs[:params]),
    script: prs[:script],
  }
  URI.encode_www_form(form_fields)
end
169
+
132
170
# Fetch only the first +num_of_records+ records (guess/preview path).
#
# @param params [Hash] export API query parameters
# @param num_of_records [Integer] maximum number of records to request
# @return [Enumerator] parsed records from the response body
def request_small_dataset(params, num_of_records)
  # guess/preview
  # Try to fetch first number of records.
  # Merge instead of assigning so the caller's params hash is not mutated.
  params = params.merge("limit" => num_of_records)
  Embulk.logger.info "Sending request to #{@endpoint}"
  res = httpclient.get(@endpoint, params)
  handle_error(res, res.body)
  response_to_enum(res.body)
end
141
179
 
142
180
  def handle_error(response, error_response)
143
181
  Embulk.logger.debug "response code: #{response.code}"
144
182
  case response.code
183
+ when 429
184
+ # [429] {"error": "too many export requests in progress for this project"}
185
+ Embulk.logger.info "Hit rate limit sleep for 1 hour"
186
+ sleep(60 * 60)
187
+ raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
145
188
  when 400..499
146
- if response.code == 429
147
- # [429] {"error": "too many export requests in progress for this project"}
148
- raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
149
- end
150
189
  raise ConfigError.new("[#{response.code}] #{error_response}")
151
190
  when 500..599
152
191
  raise RuntimeError.new("[#{response.code}] #{error_response}")
@@ -0,0 +1,122 @@
1
+ require "perfect_retry"
2
+ require "range_generator"
3
+ require "timezone_validator"
4
+ require "active_support/core_ext/time"
5
+ require "tzinfo"
6
+ require "embulk/input/mixpanel_api/client"
7
+ require "embulk/input/mixpanel_api/exceptions"
8
+
9
+ module Embulk
10
+ module Input
11
+ module Service
12
# Shared behavior for Mixpanel input services (export / JQL):
# config validation, timezone handling, API-client construction and
# date-range computation.
class BaseService

  # This column lives at the top level of a record rather than under "properties".
  NOT_PROPERTY_COLUMN = "event".freeze
  DEFAULT_FETCH_DAYS = 7
  DEFAULT_TIME_COLUMN = 'time'.freeze

  def initialize(config)
    @config = config
  end

  # Earliest date used when guessing a schema: one day before the
  # default fetch window, evaluated in the given timezone.
  def default_guess_start_date(timezone)
    today(timezone) - DEFAULT_FETCH_DAYS - 1
  end

  protected

  def validate_config
    timezone = @config.param(:timezone, :string)
    validate_timezone(timezone)
  end

  def validate_timezone(timezone)
    TimezoneValidator.new(timezone).validate
  end

  # Fail fast with a retriable DataError when the Mixpanel endpoint
  # does not respond to a ping.
  def giveup_when_mixpanel_is_down
    unless MixpanelApi::Client.mixpanel_available?(endpoint)
      raise Embulk::DataError.new("Mixpanel service is down. Please retry later.")
    end
  end

  # Adjust timezone offset to get UTC time. Returns nil when epoch is blank.
  # c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
  def adjust_timezone(epoch)
    if epoch.present?
      tz = TZInfo::Timezone.get(@timezone)
      offset = tz.period_for_local(epoch, true).offset.utc_total_offset
      epoch - offset
    end
  end

  # Current date in the given timezone; falls back to the system date
  # when the timezone is nil or unknown to ActiveSupport.
  def today(timezone)
    if timezone.nil?
      Date.today
    else
      zone = ActiveSupport::TimeZone[timezone]
      zone.nil? ? Date.today : zone.today
    end
  end

  # Map a raw record onto the configured schema order.
  def extract_values(record)
    @schema.map do |column|
      extract_value(record, column["name"])
    end
  end

  # True while Embulk runs in preview mode. Exec.isPreview raises a Java
  # NullPointerException outside of a transaction (JRuby), which we treat
  # as "not preview".
  def preview?
    org.embulk.spi.Exec.isPreview()
  rescue java.lang.NullPointerException
    false
  end

  # Return the memoized client when present, otherwise build a new one
  # with retry settings taken from the plugin config.
  def create_client
    if @client.present?
      @client
    else
      retryer = perfect_retry(
        retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
        retry_limit: @config.param(:retry_limit, :integer, default: 5),
      )
      MixpanelApi::Client.new(@config.param(:api_secret, :string), endpoint, retryer)
    end
  end

  def perfect_retry(task)
    PerfectRetry.new do |config|
      config.limit = task[:retry_limit]
      # NOTE(review): this is a *linear* back-off (wait * 2 * (n - 1); the first
      # retry sleeps 0). If exponential back-off was intended it would be
      # `2 ** (n - 1)` -- confirm before changing.
      config.sleep = proc {|n| task[:retry_initial_wait_sec] * (2 * (n - 1))}
      config.dont_rescues = [Embulk::ConfigError, MixpanelApi::IncompleteExportResponseError]
      config.rescues = [RuntimeError]
      config.log_level = nil
      config.logger = Embulk.logger
    end
  end

  # Compute the list of dates to fetch, applying back-fill when running
  # incrementally with an incremental column and a previous fetched time.
  def range
    timezone = @config.param(:timezone, :string, default: "")
    from_date = @config.param(:from_date, :string, default: (today(timezone) - 2).to_s)
    incremental = @config.param(:incremental, :bool, default: true)
    incremental_column = @config.param(:incremental_column, :string, default: nil)
    latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
    fetch_days = @config.param(:fetch_days, :integer, default: nil)

    # Backfill from date if incremental and an incremental field is set and we are in incremental run
    if incremental && incremental_column && latest_fetched_time != 0
      back_fill_days = @config.param(:back_fill_days, :integer, default: 5)
      Embulk.logger.info "Backfill days #{back_fill_days}"
      from_date = (Date.parse(from_date) - back_fill_days).to_s
      fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
    end

    RangeGenerator.new(from_date, fetch_days, timezone).generate_range
  end
end
120
+ end
121
+ end
122
+ end
@@ -0,0 +1,284 @@
1
+ require 'embulk/input/service/base_service'
2
+
3
+ module Embulk
4
+ module Input
5
+ module Service
6
# Service implementing the Mixpanel raw "export" API flow: config
# validation, schema guessing and (incremental) record ingestion.
class ExportService < BaseService

  # https://mixpanel.com/help/questions/articles/special-or-reserved-properties
  # https://mixpanel.com/help/questions/articles/what-properties-do-mixpanels-libraries-store-by-default
  #
  # JavaScript to extract key names from HTML: run it on Chrome Devtool when opening their document
  # > Array.from(document.querySelectorAll("strong")).map(function(s){ return s.textContent.match(/[A-Z]/) ? s.parentNode.textContent.match(/\((.*?)\)/)[1] : s.textContent.split(",").join(" ") }).join(" ")
  # > Array.from(document.querySelectorAll("li")).map(function(s){ m = s.textContent.match(/\((.*?)\)/); return m && m[1] }).filter(function(k) { return k && !k.match("utm") }).join(" ")
  KNOWN_KEYS = %W(
    #{NOT_PROPERTY_COLUMN}
    distinct_id ip mp_name_tag mp_note token time mp_country_code length campaign_id $email $phone $distinct_id $ios_devices $android_devices $first_name $last_name $name $city $region $country_code $timezone $unsubscribed
    $city $region mp_country_code $browser $browser_version $device $current_url $initial_referrer $initial_referring_domain $os $referrer $referring_domain $screen_height $screen_width $search_engine $city $region $mp_country_code $timezone $browser_version $browser $initial_referrer $initial_referring_domain $os $last_seen $city $region mp_country_code $app_release $app_version $carrier $ios_ifa $os_version $manufacturer $lib_version $model $os $screen_height $screen_width $wifi $city $region $mp_country_code $timezone $ios_app_release $ios_app_version $ios_device_model $ios_lib_version $ios_version $ios_ifa $last_seen $city $region mp_country_code $app_version $bluetooth_enabled $bluetooth_version $brand $carrier $has_nfc $has_telephone $lib_version $manufacturer $model $os $os_version $screen_dpi $screen_height $screen_width $wifi $google_play_services $city $region mp_country_code $timezone $android_app_version $android_app_version_code $android_lib_version $android_os $android_os_version $android_brand $android_model $android_manufacturer $last_seen
  ).uniq.freeze

  # Validate export-specific settings on top of the base checks.
  # Raises ConfigError on contradictory incremental / column settings.
  def validate_config
    super

    incremental_column = @config.param(:incremental_column, :string, default: nil)
    latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
    fetch_custom_properties = @config.param(:fetch_custom_properties, :bool, default: true)
    fetch_unknown_columns = @config.param(:fetch_unknown_columns, :bool, default: false)

    if !incremental_column.nil? && !latest_fetched_time.nil? && (incremental_column_upper_limit <= latest_fetched_time)
      raise Embulk::ConfigError.new("Incremental column upper limit (job_start_time - incremental_column_upper_limit_delay_in_seconds) can't be smaller or equal latest fetched time #{latest_fetched_time}")
    end

    if fetch_unknown_columns && fetch_custom_properties
      raise Embulk::ConfigError.new("Don't set true both `fetch_unknown_columns` and `fetch_custom_properties`.")
    end
  end

  # Build the task hash handed to each Embulk worker.
  def create_task
    {
      params: export_params,
      dates: range,
      timezone: @config.param(:timezone, :string, default: ""),
      export_endpoint: endpoint,
      api_secret: @config.param(:api_secret, :string),
      schema: @config.param(:columns, :array),
      fetch_unknown_columns: @config.param(:fetch_unknown_columns, :bool, default: false),
      fetch_custom_properties: @config.param(:fetch_custom_properties, :bool, default: true),
      retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
      incremental_column: @config.param(:incremental_column, :string, default: nil),
      retry_limit: @config.param(:retry_limit, :integer, default: 5),
      latest_fetched_time: @config.param(:latest_fetched_time, :integer, default: 0),
      incremental: @config.param(:incremental, :bool, default: true),
      slice_range: @config.param(:slice_range, :integer, default: 7),
      job_start_time: Time.now.to_i * 1000,
      # NOTE(review): computed via a second Time.now call inside
      # #incremental_column_upper_limit, so it can differ slightly from
      # job_start_time above -- confirm this is acceptable.
      incremental_column_upper_limit: incremental_column_upper_limit,
      allow_partial_import: @config.param(:allow_partial_import, :bool, default: true)
    }
  end

  # Config diff for the next incremental run.
  def next_from_date(task_report)
    next_to_date = Date.parse(task_report[:to_date])
    {
      from_date: next_to_date.to_s,
      latest_fetched_time: task_report[:latest_fetched_time],
    }
  end

  # Fetch records slice by slice, filter already-loaded rows when running
  # incrementally, and push values into the page builder.
  # Returns the task report for the next run.
  def ingest(task, page_builder)
    giveup_when_mixpanel_is_down

    @schema = task[:schema]
    @timezone = task[:timezone]

    Embulk.logger.info "Job start time is #{task[:job_start_time]}"

    dates = task[:dates]
    prev_latest_fetched_time = task[:latest_fetched_time] || 0
    prev_latest_fetched_time_format = Time.at(prev_latest_fetched_time).strftime("%F %T %z")
    current_latest_fetched_time = prev_latest_fetched_time
    incremental_column = task[:incremental_column]
    incremental = task[:incremental]
    fetch_unknown_columns = task[:fetch_unknown_columns]

    dates.each_slice(task[:slice_range]) do |slice_dates|
      ignored_fetched_record_count = 0
      # There is the issue with Mixpanel time field during the transition from standard to daylight saving time
      # in the US timezone i.e. 11 Mar 2018 2AM - 2:59AM, time within that period must not be existed,
      # due to daylight saving, time will be forwarded 1 hour from 2AM to 3AM.
      #
      # All of records with wrong timezone will be ignored instead of throw exception out
      ignored_wrong_daylight_tz_record_count = 0
      unless preview?
        Embulk.logger.info "Fetching data from #{slice_dates.first} to #{slice_dates.last} ..."
      end
      record_time_column = incremental_column || DEFAULT_TIME_COLUMN
      begin
        fetch(slice_dates, prev_latest_fetched_time, task).each do |record|
          if incremental
            unless record["properties"].include?(record_time_column)
              raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
            end
            record_time = record["properties"][record_time_column]
            if incremental_column.nil?
              # Default time column: skip rows at or before the previous high-water mark.
              if record_time <= prev_latest_fetched_time
                ignored_fetched_record_count += 1
                next
              end
            end

            current_latest_fetched_time = [
              current_latest_fetched_time,
              record_time,
            ].max
          end
          begin
            values = extract_values(record)
            if fetch_unknown_columns
              unknown_values = extract_unknown_values(record)
              values << unknown_values.to_json
            end
            if task[:fetch_custom_properties]
              values << collect_custom_properties(record)
            end
            page_builder.add(values)
          rescue TZInfo::PeriodNotFound
            ignored_wrong_daylight_tz_record_count += 1
          end
        end
      rescue MixpanelApi::IncompleteExportResponseError
        # re raise the exception if we don't allow partial import
        raise unless task[:allow_partial_import]
      end
      if ignored_fetched_record_count > 0
        Embulk.logger.warn "Skipped already loaded #{ignored_fetched_record_count} records. These record times are older or equal than previous fetched record time (#{prev_latest_fetched_time} @ #{prev_latest_fetched_time_format})."
      end
      if ignored_wrong_daylight_tz_record_count > 0
        Embulk.logger.warn "Skipped #{ignored_wrong_daylight_tz_record_count} records due to corrupted Mixpanel time transition from standard to daylight saving"
      end
      break if preview?
    end
    page_builder.finish
    create_task_report(current_latest_fetched_time, dates.last, task[:timezone])
  end

  def create_task_report(current_latest_fetched_time, to_date, timezone)
    {
      latest_fetched_time: current_latest_fetched_time,
      to_date: to_date || today(timezone) - 1,
    }
  end

  # Guess the output schema from a small sample of exported records.
  def guess_columns
    giveup_when_mixpanel_is_down
    range = guess_range
    Embulk.logger.info "Guessing schema using #{range.first}..#{range.last} records"

    params = export_params.merge(
      "from_date" => range.first,
      "to_date" => range.last,
    )

    client = create_client
    guess_from_records(client.export_for_small_dataset(params))
  end

  # Date range used for guessing; falls back to the default window when
  # the configured range turns out empty.
  def guess_range
    time_zone = @config.param(:timezone, :string, default: "")
    from_date = @config.param(:from_date, :string, default: default_guess_start_date(time_zone).to_s)
    fetch_days = @config.param(:fetch_days, :integer, default: DEFAULT_FETCH_DAYS)
    range = RangeGenerator.new(from_date, fetch_days, time_zone).generate_range
    if range.empty?
      return default_guess_start_date(time_zone)..(today(time_zone) - 1)
    end
    range
  end

  # Query parameters common to every export call.
  def export_params
    event = @config.param(:event, :array, default: nil)
    event = event.nil? ? nil : event.to_json
    {
      event: event,
      where: @config.param(:where, :string, default: nil),
      bucket: @config.param(:bucket, :string, default: nil),
    }
  end

  # Derive column definitions from sampled records via Embulk's schema guess.
  def guess_from_records(records)
    sample_props = records.map {|r| r["properties"]}
    schema = Guess::SchemaGuess.from_hash_records(sample_props)
    columns = schema.map do |col|
      next if col.name == "time"
      result = {
        name: col.name,
        type: col.type,
      }
      # Symbol key for consistency with :name/:type above; the previous
      # string key "format" produced a mixed-key hash.
      result[:format] = col.format if col.format
      result
    end.compact
    columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
    # Shift incremental column to top
    columns.unshift(name: "time", type: :long)
  end

  # Fetch records for the given dates. In preview mode only a small
  # dataset is requested; otherwise records are streamed lazily.
  def fetch(dates, last_fetch_time, task, &block)
    from_date = dates.first
    to_date = dates.last
    params = task[:params].merge(
      "from_date" => from_date,
      "to_date" => to_date
    )
    incremental_column = task[:incremental_column]
    # Can't filter on the default time column server-side; it is filtered manually in #ingest.
    unless incremental_column.nil?
      params = params.merge(
        "where" => "#{params["where"].nil? ? '' : "(#{params["where"]}) and " }properties[\"#{incremental_column}\"] > #{last_fetch_time || 0} and properties[\"#{incremental_column}\"] < #{task[:incremental_column_upper_limit]}"
      )
    end
    Embulk.logger.info "Where params is #{params["where"]}"

    client = create_client

    if preview?
      client.export_for_small_dataset(params)
    else
      Enumerator.new do |y|
        client.export(params) do |record|
          y << record
        end
      end
    end
  end

  def endpoint
    @config.param(:export_endpoint, :string, default: Embulk::Input::MixpanelApi::Client::DEFAULT_EXPORT_ENDPOINT)
  end

  private

  # Upper bound (epoch millis) for the incremental column:
  # now minus the configured delay.
  def incremental_column_upper_limit
    job_start_time = Time.now.to_i * 1000
    upper_limit_delay = @config.param(:incremental_column_upper_limit_delay_in_seconds, :integer, default: 0)
    job_start_time - (upper_limit_delay * 1000)
  end

  # Resolve a single column value from a raw record.
  def extract_value(record, name)
    case name
    when NOT_PROPERTY_COLUMN
      record[NOT_PROPERTY_COLUMN]
    when "time"
      time = record["properties"]["time"]
      adjust_timezone(time)
    else
      record["properties"][name]
    end
  end

  # Collect properties that are neither Mixpanel-reserved keys nor
  # columns already present in the configured schema.
  def collect_custom_properties(record)
    specified_columns = @schema.map {|col| col["name"]}
    custom_keys = record["properties"].keys.find_all {|key| !KNOWN_KEYS.include?(key.to_s) && !specified_columns.include?(key.to_s)}
    custom_keys.inject({}) do |result, key|
      result.merge({
        key => record["properties"][key]
      })
    end
  end

  # Collect record keys missing from the schema (logged as a warning).
  def extract_unknown_values(record)
    record_keys = record["properties"].keys + [NOT_PROPERTY_COLUMN]
    schema_keys = @schema.map {|column| column["name"]}
    unknown_keys = record_keys - schema_keys

    unless unknown_keys.empty?
      Embulk.logger.warn("Unknown columns exists in record: #{unknown_keys.join(', ')}")
    end

    unknown_keys.inject({}) do |result, key|
      result[key] = extract_value(record, key)
      result
    end
  end
end
+ end
283
+ end
284
+ end