embulk-input-mixpanel 0.5.15 → 0.6.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +4 -1
- data/embulk-input-mixpanel.gemspec +1 -1
- data/lib/embulk/input/mixpanel.rb +24 -342
- data/lib/embulk/input/mixpanel_api/client.rb +48 -9
- data/lib/embulk/input/service/base_service.rb +122 -0
- data/lib/embulk/input/service/export_service.rb +284 -0
- data/lib/embulk/input/service/jql_service.rb +276 -0
- data/lib/timezone_validator.rb +1 -1
- data/test/embulk/input/mixpanel_api/test_client.rb +4 -22
- data/test/embulk/input/{test_mixpanel.rb → service/test_export_service.rb} +62 -25
- data/test/embulk/input/service/test_jql_service.rb +745 -0
- data/test/test_range_generator.rb +1 -1
- metadata +9 -4
@@ -14,6 +14,8 @@ module Embulk
|
|
14
14
|
PING_RETRY_WAIT = 2
|
15
15
|
SMALL_NUM_OF_RECORDS = 10
|
16
16
|
DEFAULT_EXPORT_ENDPOINT = "https://data.mixpanel.com/api/2.0/export/".freeze
|
17
|
+
DEFAULT_JQL_ENDPOINT = "https://mixpanel.com/api/2.0/jql/".freeze
|
18
|
+
JQL_RATE_LIMIT = 60
|
17
19
|
|
18
20
|
attr_reader :retryer
|
19
21
|
|
@@ -40,7 +42,7 @@ module Embulk
|
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
43
|
-
def initialize(api_secret, retryer = nil
|
45
|
+
def initialize(api_secret, endpoint, retryer = nil)
|
44
46
|
@endpoint = endpoint
|
45
47
|
@api_secret = api_secret
|
46
48
|
@retryer = retryer || PerfectRetry.new do |config|
|
@@ -76,6 +78,30 @@ module Embulk
|
|
76
78
|
raise ConfigError.new "#{params["from_date"]}..#{latest_tried_to_date} has no record."
|
77
79
|
end
|
78
80
|
|
81
|
+
# Sends a JQL script to the configured endpoint and returns the parsed JSON body.
#
# The request, the status check and the parsing all run inside `retryer.with_retry`.
#
# @param params [Hash] passed straight to `request_jql` (which reads `:params` and `:script`)
# @return [Object] whatever `JSON.parse` produces for the response body
# @raise [Embulk::DataError] when the response body cannot be parsed
def send_jql_script(params = {})
  retryer.with_retry do
    response = request_jql(params)
    handle_error(response, response.body)
    begin
      return JSON.parse(response.body)
    rescue => e
      # Wrap the message text rather than the exception object, the same way
      # send_jql_script_small_dataset does, so both paths report errors identically.
      raise Embulk::DataError.new(e.message)
    end
  end
end
|
92
|
+
|
93
|
+
# Runs a JQL script like `send_jql_script`, but keeps only the leading
# SMALL_NUM_OF_RECORDS entries of the parsed response.
#
# @param params [Hash] passed straight to `request_jql`
# @return [Object] the parsed response sliced to its first SMALL_NUM_OF_RECORDS entries
# @raise [Embulk::DataError] when parsing or slicing the body fails
def send_jql_script_small_dataset(params = {})
  retryer.with_retry do
    res = request_jql(params)
    handle_error(res, res.body)
    begin
      rows = JSON.parse(res.body)
      return rows[0...SMALL_NUM_OF_RECORDS]
    rescue => err
      raise Embulk::DataError.new(err.message)
    end
  end
end
|
104
|
+
|
79
105
|
def try_to_dates(from_date)
|
80
106
|
try_to_dates = 5.times.map do |n|
|
81
107
|
# from_date + 1, from_date + 10, from_date + 100, ... so on
|
@@ -107,18 +133,18 @@ module Embulk
|
|
107
133
|
Embulk.logger.info "Sending request to #{@endpoint}"
|
108
134
|
response = httpclient.get(@endpoint, params) do |response, chunk|
|
109
135
|
# Only process data if response status is 200..299
|
110
|
-
if response.status/100 == 2
|
136
|
+
if response.status / 100 == 2
|
111
137
|
chunk.each_line do |line|
|
112
138
|
begin
|
113
139
|
record = JSON.parse(buf + line)
|
114
140
|
block.call record
|
115
141
|
buf = ""
|
116
|
-
rescue JSON::ParserError
|
142
|
+
rescue JSON::ParserError=>e
|
117
143
|
buf << line
|
118
144
|
end
|
119
145
|
end
|
120
146
|
else
|
121
|
-
|
147
|
+
error_response << chunk
|
122
148
|
end
|
123
149
|
end
|
124
150
|
handle_error(response, error_response)
|
@@ -129,24 +155,37 @@ module Embulk
|
|
129
155
|
end
|
130
156
|
end
|
131
157
|
|
158
|
+
# POSTs a JQL request to @endpoint with a form-encoded body built by `query_string`.
#
# @param parameters [Hash] request description handed to `query_string`
# @return [Object] whatever `httpclient.post` returns
def request_jql(parameters)
  Embulk.logger.info "Sending request to #{@endpoint} params #{parameters}"
  client = httpclient
  form_body = query_string(parameters)
  client.post(@endpoint, form_body)
end
|
162
|
+
|
163
|
+
# Builds the form-encoded request body for a JQL call.
#
# @param prs [Hash] reads `:params` (serialised to JSON) and `:script` (sent as-is)
# @return [String] `application/x-www-form-urlencoded` text with `params` and `script` fields
def query_string(prs)
  form_fields = {
    params: JSON.generate(prs[:params]),
    script: prs[:script]
  }
  URI.encode_www_form(form_fields)
end
|
169
|
+
|
132
170
|
# Fetches a limited number of records, used for guess/preview.
#
# The limit is applied to a copy of `params`, so the caller's hash is left untouched.
#
# @param params [Hash] query parameters for the GET request
# @param num_of_records [Integer] value sent as the "limit" parameter
# @return [Object] the result of `response_to_enum` applied to the response body
def request_small_dataset(params, num_of_records)
  # guess/preview
  # Try to fetch first number of records
  limited_params = params.merge("limit" => num_of_records)
  Embulk.logger.info "Sending request to #{@endpoint}"
  res = httpclient.get(@endpoint, limited_params)
  handle_error(res, res.body)
  response_to_enum(res.body)
end
|
141
179
|
|
142
180
|
def handle_error(response, error_response)
|
143
181
|
Embulk.logger.debug "response code: #{response.code}"
|
144
182
|
case response.code
|
183
|
+
when 429
|
184
|
+
# [429] {"error": "too many export requests in progress for this project"}
|
185
|
+
Embulk.logger.info "Hit rate limit sleep for 1 hour"
|
186
|
+
sleep(60 * 60)
|
187
|
+
raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
|
145
188
|
when 400..499
|
146
|
-
if response.code == 429
|
147
|
-
# [429] {"error": "too many export requests in progress for this project"}
|
148
|
-
raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
|
149
|
-
end
|
150
189
|
raise ConfigError.new("[#{response.code}] #{error_response}")
|
151
190
|
when 500..599
|
152
191
|
raise RuntimeError.new("[#{response.code}] #{error_response}")
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require "perfect_retry"
|
2
|
+
require "range_generator"
|
3
|
+
require "timezone_validator"
|
4
|
+
require "active_support/core_ext/time"
|
5
|
+
require "tzinfo"
|
6
|
+
require "embulk/input/mixpanel_api/client"
|
7
|
+
require "embulk/input/mixpanel_api/exceptions"
|
8
|
+
|
9
|
+
module Embulk
  module Input
    module Service
      # Behaviour shared by the Mixpanel input services: config validation, date-range
      # calculation, timezone adjustment and construction of the API client with its retryer.
      #
      # `endpoint` and `extract_value(record, name)` are called from this class but are not
      # defined here; concrete services are expected to provide them.
      class BaseService

        NOT_PROPERTY_COLUMN = "event".freeze
        DEFAULT_FETCH_DAYS = 7
        DEFAULT_TIME_COLUMN = 'time'.freeze

        # @param config [#param] plugin configuration; every lookup in this class goes
        #   through `config.param(name, type, default: ...)`
        def initialize(config)
          @config = config
        end

        # First date used when guessing without an explicit from_date:
        # DEFAULT_FETCH_DAYS + 1 days before today in the given timezone.
        #
        # @param timezone [String, nil] timezone name passed to `today`
        # @return [Date]
        def default_guess_start_date(timezone)
          today(timezone) - DEFAULT_FETCH_DAYS - 1
        end

        protected

        # Validates the configuration; here only the (required) `timezone` setting.
        def validate_config
          timezone = @config.param(:timezone, :string)
          validate_timezone(timezone)
        end

        # Delegates to TimezoneValidator, which is responsible for rejecting bad names.
        def validate_timezone(timezone)
          TimezoneValidator.new(timezone).validate
        end

        # @raise [Embulk::DataError] when the availability check for `endpoint` fails
        def giveup_when_mixpanel_is_down
          unless MixpanelApi::Client.mixpanel_available?(endpoint)
            raise Embulk::DataError.new("Mixpanel service is down. Please retry later.")
          end
        end

        # Subtracts the UTC offset of @timezone from the given epoch value.
        #
        # @param epoch [Integer, nil] project-local timestamp
        # @return [Integer, nil] adjusted timestamp, or nil when `epoch` is blank
        def adjust_timezone(epoch)
          # Adjust timezone offset to get UTC time
          # c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
          if epoch.present?
            tz = TZInfo::Timezone.get(@timezone)
            offset = tz.period_for_local(epoch, true).offset.utc_total_offset
            epoch - offset
          end
        end

        # Current date in the given timezone; falls back to the system date when the
        # timezone is nil or ActiveSupport does not recognise it.
        #
        # @param timezone [String, nil]
        # @return [Date]
        def today(timezone)
          if timezone.nil?
            Date.today
          else
            zone = ActiveSupport::TimeZone[timezone]
            zone.nil? ? Date.today : zone.today
          end
        end

        # Maps every column of @schema to its value in `record`, in schema order.
        #
        # @param record [Hash] one fetched record
        # @return [Array] one value per schema column
        def extract_values(record)
          @schema.map do |column|
            extract_value(record, column["name"])
          end
        end

        # @return [Boolean] whether Embulk runs in preview mode; false when the
        #   Embulk call raises a Java NullPointerException
        def preview?
          begin
            org.embulk.spi.Exec.isPreview()
          rescue java.lang.NullPointerException
            false
          end
        end

        # Returns the API client for `endpoint`, building it on first use.
        #
        # The instance is cached in @client so repeated calls share one client and retryer;
        # a client already assigned to @client is returned untouched.
        #
        # @return [MixpanelApi::Client]
        def create_client
          @client ||= begin
            retryer = perfect_retry({
              retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
              retry_limit: @config.param(:retry_limit, :integer, default: 5),
            })
            MixpanelApi::Client.new(@config.param(:api_secret, :string), endpoint, retryer)
          end
        end

        # Builds a PerfectRetry with exponential backoff.
        #
        # The wait before retry n is retry_initial_wait_sec * 2**(n - 1), so the first
        # retry waits exactly the configured initial value.
        #
        # @param task [Hash] reads :retry_limit and :retry_initial_wait_sec
        # @return [PerfectRetry]
        def perfect_retry(task)
          PerfectRetry.new do |config|
            config.limit = task[:retry_limit]
            config.sleep = proc {|n| task[:retry_initial_wait_sec] * (2 ** (n - 1))}
            config.dont_rescues = [Embulk::ConfigError, MixpanelApi::IncompleteExportResponseError]
            config.rescues = [RuntimeError]
            config.log_level = nil
            config.logger = Embulk.logger
          end
        end

        # Computes the dates to fetch from the configuration.
        #
        # On an incremental run with an incremental column and a non-zero
        # latest_fetched_time, the start date is moved back by `back_fill_days`
        # (and fetch_days, when set, is extended by the same amount).
        #
        # @return [Object] the result of RangeGenerator#generate_range
        def range
          timezone = @config.param(:timezone, :string, default: "")
          from_date = @config.param(:from_date, :string, default: (today(timezone) - 2).to_s)
          incremental = @config.param(:incremental, :bool, default: true)
          incremental_column = @config.param(:incremental_column, :string, default: nil)
          latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
          fetch_days = @config.param(:fetch_days, :integer, default: nil)

          # Backfill from date if incremental and an incremental field is set and we are in incremental run
          if incremental && incremental_column && latest_fetched_time != 0
            back_fill_days = @config.param(:back_fill_days, :integer, default: 5)
            Embulk.logger.info "Backfill days #{back_fill_days}"
            from_date = (Date.parse(from_date) - back_fill_days).to_s
            fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
          end

          RangeGenerator.new(from_date, fetch_days, timezone).generate_range
        end
      end
    end
  end
end
|
@@ -0,0 +1,284 @@
|
|
1
|
+
require 'embulk/input/service/base_service'
|
2
|
+
|
3
|
+
module Embulk
  module Input
    module Service
      # Service that reads events through the Mixpanel export endpoint: validates the
      # export-specific configuration, builds the task, guesses the schema and feeds
      # fetched records into the page builder.
      class ExportService < BaseService

        # https://mixpanel.com/help/questions/articles/special-or-reserved-properties
        # https://mixpanel.com/help/questions/articles/what-properties-do-mixpanels-libraries-store-by-default
        #
        # JavaScript to extract key names from HTML: run it on Chrome Devtool when opening their document
        # > Array.from(document.querySelectorAll("strong")).map(function(s){ return s.textContent.match(/[A-Z]/) ? s.parentNode.textContent.match(/\((.*?)\)/)[1] : s.textContent.split(",").join(" ") }).join(" ")
        # > Array.from(document.querySelectorAll("li")).map(function(s){ m = s.textContent.match(/\((.*?)\)/); return m && m[1] }).filter(function(k) { return k && !k.match("utm") }).join(" ")
        KNOWN_KEYS = %W(
          #{NOT_PROPERTY_COLUMN}
          distinct_id ip mp_name_tag mp_note token time mp_country_code length campaign_id $email $phone $distinct_id $ios_devices $android_devices $first_name $last_name $name $city $region $country_code $timezone $unsubscribed
          $city $region mp_country_code $browser $browser_version $device $current_url $initial_referrer $initial_referring_domain $os $referrer $referring_domain $screen_height $screen_width $search_engine $city $region $mp_country_code $timezone $browser_version $browser $initial_referrer $initial_referring_domain $os $last_seen $city $region mp_country_code $app_release $app_version $carrier $ios_ifa $os_version $manufacturer $lib_version $model $os $screen_height $screen_width $wifi $city $region $mp_country_code $timezone $ios_app_release $ios_app_version $ios_device_model $ios_lib_version $ios_version $ios_ifa $last_seen $city $region mp_country_code $app_version $bluetooth_enabled $bluetooth_version $brand $carrier $has_nfc $has_telephone $lib_version $manufacturer $model $os $os_version $screen_dpi $screen_height $screen_width $wifi $google_play_services $city $region mp_country_code $timezone $android_app_version $android_app_version_code $android_lib_version $android_os $android_os_version $android_brand $android_model $android_manufacturer $last_seen
        ).uniq.freeze

        # Runs the base validation, then the export-specific checks.
        #
        # @raise [Embulk::ConfigError] when an incremental column is configured and the
        #   upper limit is not greater than latest_fetched_time, or when both
        #   fetch_unknown_columns and fetch_custom_properties are true
        def validate_config
          super

          incremental_column = @config.param(:incremental_column, :string, default: nil)
          latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
          fetch_custom_properties = @config.param(:fetch_custom_properties, :bool, default: true)
          fetch_unknown_columns = @config.param(:fetch_unknown_columns, :bool, default: false)

          if !incremental_column.nil? && !latest_fetched_time.nil? && (incremental_column_upper_limit <= latest_fetched_time)
            raise Embulk::ConfigError.new("Incremental column upper limit (job_start_time - incremental_column_upper_limit_delay_in_seconds) can't be smaller or equal latest fetched time #{latest_fetched_time}")
          end

          if fetch_unknown_columns && fetch_custom_properties
            raise Embulk::ConfigError.new("Don't set true both `fetch_unknown_columns` and `fetch_custom_properties`.")
          end
        end

        # Collects everything `ingest` and `fetch` read later into one task hash.
        #
        # @return [Hash] task description keyed by symbols
        def create_task
          {
            params: export_params,
            dates: range,
            timezone: @config.param(:timezone, :string, default: ""),
            export_endpoint: endpoint,
            api_secret: @config.param(:api_secret, :string),
            schema: @config.param(:columns, :array),
            fetch_unknown_columns: @config.param(:fetch_unknown_columns, :bool, default: false),
            fetch_custom_properties: @config.param(:fetch_custom_properties, :bool, default: true),
            retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
            incremental_column: @config.param(:incremental_column, :string, default: nil),
            retry_limit: @config.param(:retry_limit, :integer, default: 5),
            latest_fetched_time: @config.param(:latest_fetched_time, :integer, default: 0),
            incremental: @config.param(:incremental, :bool, default: true),
            slice_range: @config.param(:slice_range, :integer, default: 7),
            job_start_time: Time.now.to_i * 1000,
            incremental_column_upper_limit: incremental_column_upper_limit,
            allow_partial_import: @config.param(:allow_partial_import, :bool, default: true)
          }
        end

        # Derives the next run's starting point from a task report.
        #
        # @param task_report [Hash] reads :to_date and :latest_fetched_time
        # @return [Hash] :from_date (the report's to_date as a date string) and :latest_fetched_time
        def next_from_date(task_report)
          next_to_date = Date.parse(task_report[:to_date])
          {
            from_date: next_to_date.to_s,
            latest_fetched_time: task_report[:latest_fetched_time],
          }
        end

        # Fetches the task's dates slice by slice and adds every accepted record to
        # `page_builder`. In preview mode only the first slice is processed.
        #
        # @param task [Hash] task built by `create_task`
        # @param page_builder [#add, #finish] receives one value array per record
        # @return [Hash] task report from `create_task_report`
        # @raise [Embulk::ConfigError] when incremental is on and a record lacks the time column
        # @raise [MixpanelApi::IncompleteExportResponseError] re-raised when
        #   task[:allow_partial_import] is falsy
        def ingest(task, page_builder)
          giveup_when_mixpanel_is_down

          @schema = task[:schema]
          @timezone = task[:timezone]

          Embulk.logger.info "Job start time is #{task[:job_start_time]}"

          dates = task[:dates]
          prev_latest_fetched_time = task[:latest_fetched_time] || 0
          prev_latest_fetched_time_format = Time.at(prev_latest_fetched_time).strftime("%F %T %z")
          current_latest_fetched_time = prev_latest_fetched_time
          incremental_column = task[:incremental_column]
          incremental = task[:incremental]
          fetch_unknown_columns = task[:fetch_unknown_columns]

          dates.each_slice(task[:slice_range]) do |slice_dates|
            ignored_fetched_record_count = 0
            # There is the issue with Mixpanel time field during the transition from standard to daylight saving time
            # in the US timezone i.e. 11 Mar 2018 2AM - 2:59AM, time within that period must not be existed,
            # due to daylight saving, time will be forwarded 1 hour from 2AM to 3AM.
            #
            # All of records with wrong timezone will be ignored instead of throw exception out
            ignored_wrong_daylight_tz_record_count = 0
            unless preview?
              Embulk.logger.info "Fetching data from #{slice_dates.first} to #{slice_dates.last} ..."
            end
            record_time_column = incremental_column || DEFAULT_TIME_COLUMN
            begin
              fetch(slice_dates, prev_latest_fetched_time, task).each do |record|
                if incremental
                  if !record["properties"].include?(record_time_column)
                    raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
                  end
                  record_time = record["properties"][record_time_column]
                  # Without an incremental column the request cannot filter on time (see fetch),
                  # so already-loaded records are dropped here.
                  if incremental_column.nil?
                    if record_time <= prev_latest_fetched_time
                      ignored_fetched_record_count += 1
                      next
                    end
                  end

                  current_latest_fetched_time = [
                    current_latest_fetched_time,
                    record_time,
                  ].max
                end
                begin
                  values = extract_values(record)
                  if fetch_unknown_columns
                    unknown_values = extract_unknown_values(record)
                    values << unknown_values.to_json
                  end
                  if task[:fetch_custom_properties]
                    values << collect_custom_properties(record)
                  end
                  page_builder.add(values)
                rescue TZInfo::PeriodNotFound
                  ignored_wrong_daylight_tz_record_count += 1
                end
              end
            rescue MixpanelApi::IncompleteExportResponseError => e
              # re raise the exception if we don't allow partial import
              raise unless task[:allow_partial_import]
              # Partial import is allowed: keep what was received, but leave a trace in the log.
              Embulk.logger.warn "Export response was incomplete; continuing with the records received so far because allow_partial_import is enabled: #{e.message}"
            end
            if ignored_fetched_record_count > 0
              Embulk.logger.warn "Skipped already loaded #{ignored_fetched_record_count} records. These record times are older or equal than previous fetched record time (#{prev_latest_fetched_time} @ #{prev_latest_fetched_time_format})."
            end
            if ignored_wrong_daylight_tz_record_count > 0
              Embulk.logger.warn "Skipped #{ignored_wrong_daylight_tz_record_count} records due to corrupted Mixpanel time transition from standard to daylight saving"
            end
            break if preview?
          end
          page_builder.finish
          create_task_report(current_latest_fetched_time, dates.last, task[:timezone])
        end

        # @param current_latest_fetched_time [Integer] greatest record time seen so far
        # @param to_date [Object, nil] last fetched date; yesterday in `timezone` is used when nil
        # @param timezone [String] timezone name passed to `today`
        # @return [Hash] :latest_fetched_time and :to_date
        def create_task_report(current_latest_fetched_time, to_date, timezone)
          {
            latest_fetched_time: current_latest_fetched_time,
            to_date: to_date || today(timezone) - 1,
          }
        end

        # Guesses the column list from a small sample of exported records.
        #
        # @return [Array<Hash>] column definitions, see `guess_from_records`
        def guess_columns
          giveup_when_mixpanel_is_down
          range = guess_range
          Embulk.logger.info "Guessing schema using #{range.first}..#{range.last} records"

          params = export_params.merge(
            "from_date"=>range.first,
            "to_date"=>range.last,
          )

          client = create_client
          guess_from_records(client.export_for_small_dataset(params))
        end

        # Date range sampled for guessing; falls back to the default window
        # (default_guess_start_date..yesterday) when the generated range is empty.
        def guess_range
          time_zone = @config.param(:timezone, :string, default: "")
          from_date = @config.param(:from_date, :string, default: default_guess_start_date(time_zone).to_s)
          fetch_days = @config.param(:fetch_days, :integer, default: DEFAULT_FETCH_DAYS)
          range = RangeGenerator.new(from_date, fetch_days, time_zone).generate_range
          if range.empty?
            return default_guess_start_date(time_zone)..(today(time_zone) - 1)
          end
          range
        end

        # Request parameters taken from the configuration.
        #
        # @return [Hash] :event (JSON text of the configured array, or nil), :where and :bucket
        def export_params
          event = @config.param(:event, :array, default: nil)
          event = event.nil? ? nil : event.to_json
          {
            event: event,
            where: @config.param(:where, :string, default: nil),
            bucket: @config.param(:bucket, :string, default: nil),
          }
        end

        # Turns sample records into column definitions, with "time" and the event
        # column placed first.
        #
        # @param records [Enumerable<Hash>] sample records, each with a "properties" hash
        # @return [Array<Hash>] column definitions
        def guess_from_records(records)
          sample_props = records.map {|r| r["properties"]}
          schema = Guess::SchemaGuess.from_hash_records(sample_props)
          columns = schema.map do |col|
            next if col.name == "time"
            result = {
              name: col.name,
              type: col.type,
            }
            # NOTE(review): "format" is a String key while name/type are Symbols; kept as-is
            # because consumers may rely on it -- confirm before normalising.
            result["format"] = col.format if col.format
            result
          end.compact
          columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
          # Shift incremental column to top
          columns.unshift(name: "time", type: :long)
        end

        # Requests the records for one slice of dates.
        #
        # @param dates [Array] slice of dates; the first and last entries bound the request
        # @param last_fetch_time [Integer, nil] lower bound for the incremental column filter
        # @param task [Hash] reads :params, :incremental_column and :incremental_column_upper_limit
        # @param block unused; accepted for compatibility with existing callers
        # @return [Enumerable] small-dataset result in preview mode, otherwise an
        #   Enumerator over exported records
        def fetch(dates, last_fetch_time, task, &block)
          from_date = dates.first
          to_date = dates.last
          params = task[:params].merge(
            "from_date"=>from_date,
            "to_date"=>to_date
          )
          incremental_column = task[:incremental_column]
          if !incremental_column.nil? # can't do filter on time column, time column need to be filter manually.
            params = params.merge(
              "where"=>"#{params['where'].nil? ? '' : "(#{params['where']}) and " }properties[\"#{incremental_column}\"] > #{last_fetch_time || 0} and properties[\"#{incremental_column}\"] < #{task[:incremental_column_upper_limit]}"
            )
          end
          Embulk.logger.info "Where params is #{params["where"]}"

          client = create_client

          if preview?
            client.export_for_small_dataset(params)
          else
            Enumerator.new do |y|
              client.export(params) do |record|
                y << record
              end
            end
          end
        end

        # @return [String] configured export endpoint, or the client's default one
        def endpoint
          @config.param(:export_endpoint, :string, default: Embulk::Input::MixpanelApi::Client::DEFAULT_EXPORT_ENDPOINT)
        end

        private

        # Upper bound for the incremental column: the current time in milliseconds minus
        # the configured delay (given in seconds).
        def incremental_column_upper_limit
          job_start_time = Time.now.to_i * 1000
          upper_limit_delay = @config.param(:incremental_column_upper_limit_delay_in_seconds, :integer, default: 0)
          job_start_time - (upper_limit_delay * 1000)
        end

        # Reads one column value from a record: the event name from the record itself,
        # "time" from the properties with the timezone adjustment applied, anything else
        # straight from the properties.
        def extract_value(record, name)
          case name
          when NOT_PROPERTY_COLUMN
            record[NOT_PROPERTY_COLUMN]
          when "time"
            time = record["properties"]["time"]
            adjust_timezone(time)
          else
            record["properties"][name]
          end
        end

        # Properties that are neither listed in KNOWN_KEYS nor named in the schema.
        #
        # @param record [Hash] one fetched record
        # @return [Hash] property name => value for every custom property
        def collect_custom_properties(record)
          specified_columns = @schema.map {|col| col["name"]}
          # Single pass over the properties instead of rebuilding a hash per key.
          record["properties"].reject do |key, _value|
            KNOWN_KEYS.include?(key.to_s) || specified_columns.include?(key.to_s)
          end
        end

        # Values of every record key (properties plus the event column) that the schema
        # does not name; logs a warning when there are any.
        #
        # @param record [Hash] one fetched record
        # @return [Hash] key => extracted value
        def extract_unknown_values(record)
          record_keys = record["properties"].keys + [NOT_PROPERTY_COLUMN]
          schema_keys = @schema.map {|column| column["name"]}
          unknown_keys = record_keys - schema_keys

          unless unknown_keys.empty?
            Embulk.logger.warn("Unknown columns exists in record: #{unknown_keys.join(', ')}")
          end

          unknown_keys.each_with_object({}) do |key, result|
            result[key] = extract_value(record, key)
          end
        end
      end
    end
  end
end
|