embulk-input-mixpanel 0.5.15 → 0.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/README.md +4 -1
- data/embulk-input-mixpanel.gemspec +1 -1
- data/lib/embulk/input/mixpanel.rb +24 -342
- data/lib/embulk/input/mixpanel_api/client.rb +48 -9
- data/lib/embulk/input/service/base_service.rb +122 -0
- data/lib/embulk/input/service/export_service.rb +284 -0
- data/lib/embulk/input/service/jql_service.rb +276 -0
- data/lib/timezone_validator.rb +1 -1
- data/test/embulk/input/mixpanel_api/test_client.rb +4 -22
- data/test/embulk/input/{test_mixpanel.rb → service/test_export_service.rb} +62 -25
- data/test/embulk/input/service/test_jql_service.rb +745 -0
- data/test/test_range_generator.rb +1 -1
- metadata +9 -4
@@ -14,6 +14,8 @@ module Embulk
|
|
14
14
|
PING_RETRY_WAIT = 2
|
15
15
|
SMALL_NUM_OF_RECORDS = 10
|
16
16
|
DEFAULT_EXPORT_ENDPOINT = "https://data.mixpanel.com/api/2.0/export/".freeze
|
17
|
+
DEFAULT_JQL_ENDPOINT = "https://mixpanel.com/api/2.0/jql/".freeze
|
18
|
+
JQL_RATE_LIMIT = 60
|
17
19
|
|
18
20
|
attr_reader :retryer
|
19
21
|
|
@@ -40,7 +42,7 @@ module Embulk
|
|
40
42
|
end
|
41
43
|
end
|
42
44
|
|
43
|
-
def initialize(api_secret, retryer = nil
|
45
|
+
def initialize(api_secret, endpoint, retryer = nil)
|
44
46
|
@endpoint = endpoint
|
45
47
|
@api_secret = api_secret
|
46
48
|
@retryer = retryer || PerfectRetry.new do |config|
|
@@ -76,6 +78,30 @@ module Embulk
|
|
76
78
|
raise ConfigError.new "#{params["from_date"]}..#{latest_tried_to_date} has no record."
|
77
79
|
end
|
78
80
|
|
81
|
+
# Runs a JQL script against the Mixpanel JQL endpoint and returns the
# parsed JSON result (typically an Array of records).
#
# params - Hash with :script (JQL source string) and :params (Hash exposed
#          to the script); see #request_jql and #query_string.
#
# Retries via @retryer on retryable failures. handle_error raises
# Embulk::ConfigError for 4xx responses; a non-JSON body raises
# Embulk::DataError.
def send_jql_script(params = {})
  retryer.with_retry do
    response = request_jql(params)
    handle_error(response, response.body)
    begin
      return JSON.parse(response.body)
    rescue => e
      # Wrap the message (not the exception object) so the DataError
      # payload is consistent with #send_jql_script_small_dataset.
      raise Embulk::DataError.new(e.message)
    end
  end
end
|
92
|
+
|
93
|
+
# Runs a JQL script but returns only the first SMALL_NUM_OF_RECORDS parsed
# records — used for guess/preview, where fetching the full result set
# would be wasteful.
#
# params - same shape as #send_jql_script (:script and :params).
#
# Raises Embulk::ConfigError (via handle_error) on 4xx responses and
# Embulk::DataError when the response body is not valid JSON.
def send_jql_script_small_dataset(params = {})
  retryer.with_retry do
    response = request_jql(params)
    handle_error(response, response.body)
    begin
      return JSON.parse(response.body)[0..SMALL_NUM_OF_RECORDS - 1]
    rescue => e
      raise Embulk::DataError.new(e.message)
    end
  end
end
|
104
|
+
|
79
105
|
def try_to_dates(from_date)
|
80
106
|
try_to_dates = 5.times.map do |n|
|
81
107
|
# from_date + 1, from_date + 10, from_date + 100, ... so on
|
@@ -107,18 +133,18 @@ module Embulk
|
|
107
133
|
Embulk.logger.info "Sending request to #{@endpoint}"
|
108
134
|
response = httpclient.get(@endpoint, params) do |response, chunk|
|
109
135
|
# Only process data if response status is 200..299
|
110
|
-
if response.status/100 == 2
|
136
|
+
if response.status / 100 == 2
|
111
137
|
chunk.each_line do |line|
|
112
138
|
begin
|
113
139
|
record = JSON.parse(buf + line)
|
114
140
|
block.call record
|
115
141
|
buf = ""
|
116
|
-
rescue JSON::ParserError
|
142
|
+
rescue JSON::ParserError=>e
|
117
143
|
buf << line
|
118
144
|
end
|
119
145
|
end
|
120
146
|
else
|
121
|
-
|
147
|
+
error_response << chunk
|
122
148
|
end
|
123
149
|
end
|
124
150
|
handle_error(response, error_response)
|
@@ -129,24 +155,37 @@ module Embulk
|
|
129
155
|
end
|
130
156
|
end
|
131
157
|
|
158
|
+
# POSTs a JQL request to @endpoint and returns the raw HTTP response.
# The form-encoded body is built by #query_string.
def request_jql(parameters)
  Embulk.logger.info "Sending request to #{@endpoint} params #{parameters}"
  body = query_string(parameters)
  httpclient.post(@endpoint, body)
end
|
162
|
+
|
163
|
+
# Builds the application/x-www-form-urlencoded body for a JQL request:
# the :params hash is JSON-serialized, the :script source is sent verbatim.
def query_string(prs)
  form = {
    params: JSON.generate(prs[:params]),
    script: prs[:script],
  }
  URI.encode_www_form(form)
end
|
169
|
+
|
132
170
|
# Fetches at most num_of_records rows from the export endpoint — used by
# guess/preview, which never need the full export.
#
# NOTE: mutates the caller's params hash by setting "limit".
def request_small_dataset(params, num_of_records)
  params["limit"] = num_of_records
  Embulk.logger.info "Sending request to #{@endpoint}"
  response = httpclient.get(@endpoint, params)
  handle_error(response, response.body)
  response_to_enum(response.body)
end
|
141
179
|
|
142
180
|
def handle_error(response, error_response)
|
143
181
|
Embulk.logger.debug "response code: #{response.code}"
|
144
182
|
case response.code
|
183
|
+
when 429
|
184
|
+
# [429] {"error": "too many export requests in progress for this project"}
|
185
|
+
Embulk.logger.info "Hit rate limit sleep for 1 hour"
|
186
|
+
sleep(60 * 60)
|
187
|
+
raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
|
145
188
|
when 400..499
|
146
|
-
if response.code == 429
|
147
|
-
# [429] {"error": "too many export requests in progress for this project"}
|
148
|
-
raise RuntimeError.new("[#{response.code}] #{error_response} (will retry)")
|
149
|
-
end
|
150
189
|
raise ConfigError.new("[#{response.code}] #{error_response}")
|
151
190
|
when 500..599
|
152
191
|
raise RuntimeError.new("[#{response.code}] #{error_response}")
|
@@ -0,0 +1,122 @@
|
|
1
|
+
require "perfect_retry"
require "range_generator"
require "timezone_validator"
require "active_support/core_ext/time"
require "tzinfo"
require "embulk/input/mixpanel_api/client"
require "embulk/input/mixpanel_api/exceptions"

module Embulk
  module Input
    module Service
      # Shared behavior for the Mixpanel input services (export and JQL):
      # config validation, timezone handling, retry/client construction and
      # date-range computation. Subclasses are expected to provide
      # #endpoint, #extract_value and to populate @schema/@timezone/@client
      # before the helpers that read them are called.
      class BaseService

        # Column name that lives at the record top level rather than
        # inside the "properties" hash.
        NOT_PROPERTY_COLUMN = "event".freeze
        DEFAULT_FETCH_DAYS = 7
        DEFAULT_TIME_COLUMN = 'time'.freeze

        # config - Embulk config object; read throughout via @config.param.
        def initialize(config)
          @config = config
        end

        # Default start date for guess: DEFAULT_FETCH_DAYS + 1 days before
        # "today" in the given timezone.
        def default_guess_start_date(timezone)
          today(timezone) - DEFAULT_FETCH_DAYS - 1
        end

        protected

        # Validates the mandatory timezone setting. Subclasses extend this
        # via super with their own checks.
        def validate_config
          timezone = @config.param(:timezone, :string)
          validate_timezone(timezone)
        end

        # Raises (via TimezoneValidator) when the timezone string is unknown.
        def validate_timezone(timezone)
          TimezoneValidator.new(timezone).validate
        end

        # Fails fast with a DataError when the Mixpanel endpoint does not
        # respond, instead of burning retries later.
        def giveup_when_mixpanel_is_down
          unless MixpanelApi::Client.mixpanel_available?(endpoint)
            raise Embulk::DataError.new("Mixpanel service is down. Please retry later.")
          end
        end

        # Converts a project-local epoch to UTC by subtracting the zone
        # offset of @timezone. Returns nil when epoch is blank.
        def adjust_timezone(epoch)
          # Adjust timezone offset to get UTC time
          # c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
          if epoch.present?
            tz = TZInfo::Timezone.get(@timezone)
            offset = tz.period_for_local(epoch, true).offset.utc_total_offset
            epoch - offset
          end
        end

        # Today's date in the given timezone; falls back to the system
        # date when the zone is nil or unknown to ActiveSupport.
        def today(timezone)
          if timezone.nil?
            Date.today
          else
            zone = ActiveSupport::TimeZone[timezone]
            zone.nil? ? Date.today : zone.today
          end
        end

        # Maps the configured schema columns onto a row of values for one
        # record. Relies on subclass-provided #extract_value.
        def extract_values(record)
          @schema.map do |column|
            extract_value(record, column["name"])
          end
        end

        # True when running under Embulk's preview mode (JRuby). The JVM
        # call raises NullPointerException outside an Exec session, which
        # we treat as "not a preview".
        def preview?
          begin
            org.embulk.spi.Exec.isPreview()
          rescue java.lang.NullPointerException=>e
            false
          end
        end

        # Returns the memoized @client when present, otherwise builds a new
        # MixpanelApi::Client with a retryer configured from the task config.
        def create_client
          if @client.present?
            @client
          else
            retryer = perfect_retry({
              # retry_initial_wait_sec: @config[:retry_initial_wait_sec] ? @config[:retry_initial_wait_sec] : 1,
              # retry_limit: @config[:retry_limit] ? @config[:retry_limit] : 5,
              retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
              retry_limit: @config.param(:retry_limit, :integer, default: 5),
            })
            MixpanelApi::Client.new(@config.param(:api_secret, :string), endpoint, retryer)
          end
        end

        # Builds a PerfectRetry with exponential-ish backoff
        # (initial_wait * 2 * (n - 1)); ConfigError and incomplete-export
        # errors are never retried, RuntimeError is.
        def perfect_retry(task)
          PerfectRetry.new do |config|
            config.limit = task[:retry_limit]
            config.sleep = proc {|n| task[:retry_initial_wait_sec] * (2 * (n - 1))}
            config.dont_rescues = [Embulk::ConfigError, MixpanelApi::IncompleteExportResponseError]
            config.rescues = [RuntimeError]
            config.log_level = nil
            config.logger = Embulk.logger
          end
        end

        # Computes the list of dates to fetch. When doing an incremental run
        # with an incremental column and a previous fetched time, the start
        # date is pushed back by back_fill_days (and fetch_days widened to
        # match) to re-capture late-arriving events.
        def range
          timezone = @config.param(:timezone, :string, default: "")
          from_date = @config.param(:from_date, :string, default: (today(timezone) - 2).to_s)
          incremental = @config.param(:incremental, :bool, default: true)
          incremental_column = @config.param(:incremental_column, :string, default: nil)
          latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
          fetch_days = @config.param(:fetch_days, :integer, default: nil)

          # Backfill from date if incremental and an incremental field is set and we are in incremental run
          if incremental && incremental_column && latest_fetched_time !=0
            back_fill_days = @config.param(:back_fill_days, :integer, default: 5)
            Embulk.logger.info "Backfill days #{back_fill_days}"
            from_date = (Date.parse(from_date) - back_fill_days).to_s
            fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
          end

          RangeGenerator.new(from_date, fetch_days, timezone).generate_range
        end
      end
    end
  end
end
|
@@ -0,0 +1,284 @@
|
|
1
|
+
require 'embulk/input/service/base_service'

module Embulk
  module Input
    module Service
      # Input service for Mixpanel's raw-data export endpoint. Handles
      # schema guessing, task creation, incremental fetching (with optional
      # backfill via an incremental column) and page building.
      class ExportService < BaseService

        # https://mixpanel.com/help/questions/articles/special-or-reserved-properties
        # https://mixpanel.com/help/questions/articles/what-properties-do-mixpanels-libraries-store-by-default
        #
        # JavaScript to extract key names from HTML: run it on Chrome Devtool when opening their document
        # > Array.from(document.querySelectorAll("strong")).map(function(s){ return s.textContent.match(/[A-Z]/) ? s.parentNode.textContent.match(/\((.*?)\)/)[1] : s.textContent.split(",").join(" ") }).join(" ")
        # > Array.from(document.querySelectorAll("li")).map(function(s){ m = s.textContent.match(/\((.*?)\)/); return m && m[1] }).filter(function(k) { return k && !k.match("utm") }).join(" ")
        KNOWN_KEYS = %W(
          #{NOT_PROPERTY_COLUMN}
          distinct_id ip mp_name_tag mp_note token time mp_country_code length campaign_id $email $phone $distinct_id $ios_devices $android_devices $first_name $last_name $name $city $region $country_code $timezone $unsubscribed
          $city $region mp_country_code $browser $browser_version $device $current_url $initial_referrer $initial_referring_domain $os $referrer $referring_domain $screen_height $screen_width $search_engine $city $region $mp_country_code $timezone $browser_version $browser $initial_referrer $initial_referring_domain $os $last_seen $city $region mp_country_code $app_release $app_version $carrier $ios_ifa $os_version $manufacturer $lib_version $model $os $screen_height $screen_width $wifi $city $region $mp_country_code $timezone $ios_app_release $ios_app_version $ios_device_model $ios_lib_version $ios_version $ios_ifa $last_seen $city $region mp_country_code $app_version $bluetooth_enabled $bluetooth_version $brand $carrier $has_nfc $has_telephone $lib_version $manufacturer $model $os $os_version $screen_dpi $screen_height $screen_width $wifi $google_play_services $city $region mp_country_code $timezone $android_app_version $android_app_version_code $android_lib_version $android_os $android_os_version $android_brand $android_model $android_manufacturer $last_seen
        ).uniq.freeze

        # Validates export-specific settings on top of BaseService's checks:
        # the incremental upper limit must exceed the last fetched time, and
        # fetch_unknown_columns / fetch_custom_properties are mutually
        # exclusive (both append an extra column to each row).
        def validate_config
          super

          incremental_column = @config.param(:incremental_column, :string, default: nil)
          latest_fetched_time = @config.param(:latest_fetched_time, :integer, default: 0)
          fetch_custom_properties = @config.param(:fetch_custom_properties, :bool, default: true)
          fetch_unknown_columns = @config.param(:fetch_unknown_columns, :bool, default: false)

          if !incremental_column.nil? && !latest_fetched_time.nil? && (incremental_column_upper_limit <= latest_fetched_time)
            raise Embulk::ConfigError.new("Incremental column upper limit (job_start_time - incremental_column_upper_limit_delay_in_seconds) can't be smaller or equal latest fetched time #{latest_fetched_time}")
          end

          if fetch_unknown_columns && fetch_custom_properties
            raise Embulk::ConfigError.new("Don't set true both `fetch_unknown_columns` and `fetch_custom_properties`.")
          end
        end

        # Builds the task hash handed to each Embulk task. job_start_time
        # and incremental_column_upper_limit are in epoch milliseconds.
        def create_task
          {
            params: export_params,
            dates: range,
            timezone: @config.param(:timezone, :string, default: ""),
            export_endpoint: endpoint,
            api_secret: @config.param(:api_secret, :string),
            schema: @config.param(:columns, :array),
            fetch_unknown_columns: @config.param(:fetch_unknown_columns, :bool, default: false),
            fetch_custom_properties: @config.param(:fetch_custom_properties, :bool, default: true),
            retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
            incremental_column: @config.param(:incremental_column, :string, default: nil),
            retry_limit: @config.param(:retry_limit, :integer, default: 5),
            latest_fetched_time: @config.param(:latest_fetched_time, :integer, default: 0),
            incremental: @config.param(:incremental, :bool, default: true),
            slice_range: @config.param(:slice_range, :integer, default: 7),
            job_start_time: Time.now.to_i * 1000,
            incremental_column_upper_limit: incremental_column_upper_limit,
            allow_partial_import: @config.param(:allow_partial_import, :bool, default: true)
          }
        end

        # Computes the config diff for the next incremental run from this
        # run's task report (next from_date + carried latest_fetched_time).
        def next_from_date(task_report)
          next_to_date = Date.parse(task_report[:to_date])
          {
            from_date: next_to_date.to_s,
            latest_fetched_time: task_report[:latest_fetched_time],
          }
        end

        # Main fetch loop: pulls records date-slice by date-slice, applies
        # incremental filtering/dedup, adds rows to page_builder and returns
        # the task report. Records whose time falls into a nonexistent
        # daylight-saving gap (TZInfo::PeriodNotFound) are skipped, not fatal.
        def ingest(task, page_builder)
          giveup_when_mixpanel_is_down

          @schema = task[:schema]
          @timezone = task[:timezone]

          Embulk.logger.info "Job start time is #{task[:job_start_time]}"

          dates = task[:dates]
          prev_latest_fetched_time = task[:latest_fetched_time] || 0
          prev_latest_fetched_time_format = Time.at(prev_latest_fetched_time).strftime("%F %T %z")
          current_latest_fetched_time = prev_latest_fetched_time
          incremental_column = task[:incremental_column]
          incremental = task[:incremental]
          fetch_unknown_columns = task[:fetch_unknown_columns]

          dates.each_slice(task[:slice_range]) do |slice_dates|
            # Records already seen in a previous run (time <= previous
            # latest fetched time) are counted here and skipped.
            ignored_fetched_record_count = 0
            # There is the issue with Mixpanel time field during the transition from standard to daylight saving time
            # in the US timezone i.e. 11 Mar 2018 2AM - 2:59AM, time within that period must not be existed,
            # due to daylight saving, time will be forwarded 1 hour from 2AM to 3AM.
            #
            # All of records with wrong timezone will be ignored instead of throw exception out
            ignored_wrong_daylight_tz_record_count = 0
            unless preview?
              Embulk.logger.info "Fetching data from #{slice_dates.first} to #{slice_dates.last} ..."
            end
            record_time_column = incremental_column || DEFAULT_TIME_COLUMN
            begin
              fetch(slice_dates, prev_latest_fetched_time, task).each do |record|
                if incremental
                  if !record["properties"].include?(record_time_column)
                    raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
                  end
                  record_time = record["properties"][record_time_column]
                  # Without an explicit incremental column the "time" field
                  # cannot be filtered server-side, so dedup manually here.
                  if incremental_column.nil?
                    if record_time <= prev_latest_fetched_time
                      ignored_fetched_record_count += 1
                      next
                    end
                  end

                  current_latest_fetched_time = [
                    current_latest_fetched_time,
                    record_time,
                  ].max
                end
                begin
                  values = extract_values(record)
                  if fetch_unknown_columns
                    unknown_values = extract_unknown_values(record)
                    values << unknown_values.to_json
                  end
                  if task[:fetch_custom_properties]
                    values << collect_custom_properties(record)
                  end
                  page_builder.add(values)
                rescue TZInfo::PeriodNotFound
                  ignored_wrong_daylight_tz_record_count += 1
                end
              end
            rescue MixpanelApi::IncompleteExportResponseError
              if !task[:allow_partial_import]
                # re raise the exception if we don't allow partial import
                raise
              end
            end
            if ignored_fetched_record_count > 0
              Embulk.logger.warn "Skipped already loaded #{ignored_fetched_record_count} records. These record times are older or equal than previous fetched record time (#{prev_latest_fetched_time} @ #{prev_latest_fetched_time_format})."
            end
            if ignored_wrong_daylight_tz_record_count > 0
              Embulk.logger.warn "Skipped #{ignored_wrong_daylight_tz_record_count} records due to corrupted Mixpanel time transition from standard to daylight saving"
            end
            break if preview?
          end
          page_builder.finish
          create_task_report(current_latest_fetched_time, dates.last, task[:timezone])
        end

        # Task report for Embulk's resume/next-run machinery. to_date falls
        # back to yesterday (in the job timezone) when no dates were fetched.
        def create_task_report(current_latest_fetched_time, to_date, timezone)
          {
            latest_fetched_time: current_latest_fetched_time,
            to_date: to_date || today(timezone) - 1,
          }
        end

        # Guesses the column schema from a small sample of exported records.
        def guess_columns
          giveup_when_mixpanel_is_down
          range = guess_range
          Embulk.logger.info "Guessing schema using #{range.first}..#{range.last} records"

          params = export_params.merge(
            "from_date"=>range.first,
            "to_date"=>range.last,
          )

          client = create_client
          guess_from_records(client.export_for_small_dataset(params))
        end

        # Date range used for schema guessing; falls back to the default
        # guess window when the configured range turns out empty.
        def guess_range
          time_zone = @config.param(:timezone, :string, default: "")
          from_date = @config.param(:from_date, :string, default: default_guess_start_date(time_zone).to_s)
          fetch_days = @config.param(:fetch_days, :integer, default: DEFAULT_FETCH_DAYS)
          range = RangeGenerator.new(from_date, fetch_days, time_zone).generate_range
          if range.empty?
            return default_guess_start_date(time_zone)..(today(time_zone) - 1)
          end
          range
        end

        # Base query parameters for the export API (event filter is sent
        # as a JSON array per the Mixpanel export API).
        def export_params
          event = @config.param(:event, :array, default: nil)
          event = event.nil? ? nil : event.to_json
          {
            event: event,
            where: @config.param(:where, :string, default: nil),
            bucket: @config.param(:bucket, :string, default: nil),
          }
        end

        # Derives column definitions from sample records' "properties",
        # then pins the reserved "time" and "event" columns to the top.
        def guess_from_records(records)
          sample_props = records.map {|r| r["properties"]}
          schema = Guess::SchemaGuess.from_hash_records(sample_props)
          columns = schema.map do |col|
            next if col.name == "time"
            result = {
              name: col.name,
              type: col.type,
            }
            result["format"] = col.format if col.format
            result
          end.compact
          columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
          # Shift incremental column to top
          columns.unshift(name: "time", type: :long)
        end

        # Fetches records for a slice of dates. When an incremental column
        # is set, a server-side "where" filter bounds it between the last
        # fetched time and the computed upper limit. Returns an Enumerator
        # (lazy stream) in normal runs, a small dataset in preview.
        def fetch(dates, last_fetch_time, task, &block)
          from_date = dates.first
          to_date = dates.last
          params = task[:params].merge(
            "from_date"=>from_date,
            "to_date"=>to_date
          )
          incremental_column = task[:incremental_column]
          if !incremental_column.nil? # can't do filter on time column, time column need to be filter manually.
            params = params.merge(
              "where"=>"#{params['where'].nil? ? '' : "(#{params['where']}) and " }properties[\"#{incremental_column}\"] > #{last_fetch_time || 0} and properties[\"#{incremental_column}\"] < #{task[:incremental_column_upper_limit]}"
            )
          end
          Embulk.logger.info "Where params is #{params["where"]}"

          client = create_client

          if preview?
            client.export_for_small_dataset(params)
          else
            Enumerator.new do |y|
              client.export(params) do |record|
                y << record
              end
            end
          end
        end

        # Export API endpoint (configurable, defaults to the public one).
        def endpoint
          @config.param(:export_endpoint, :string, default: Embulk::Input::MixpanelApi::Client::DEFAULT_EXPORT_ENDPOINT)
        end

        private

        # Upper bound (epoch ms) for the incremental column filter:
        # now minus the configured delay.
        def incremental_column_upper_limit
          job_start_time = Time.now.to_i * 1000
          upper_limit_delay = @config.param(:incremental_column_upper_limit_delay_in_seconds, :integer, default: 0)
          job_start_time - (upper_limit_delay * 1000)
        end

        # Pulls one column's value out of a record: "event" lives at the
        # top level, "time" is timezone-adjusted, everything else comes
        # from the "properties" hash.
        def extract_value(record, name)
          case name
          when NOT_PROPERTY_COLUMN
            record[NOT_PROPERTY_COLUMN]
          when "time"
            time = record["properties"]["time"]
            adjust_timezone(time)
          else
            record["properties"][name]
          end
        end

        # Gathers properties that are neither Mixpanel-reserved (KNOWN_KEYS)
        # nor already covered by the configured schema, as one hash column.
        def collect_custom_properties(record)
          specified_columns = @schema.map {|col| col["name"]}
          custom_keys = record["properties"].keys.find_all {|key| !KNOWN_KEYS.include?(key.to_s) && !specified_columns.include?(key.to_s)}
          custom_keys.inject({}) do |result, key|
            result.merge({
              key=>record["properties"][key]
            })
          end
        end

        # Gathers record keys missing from the configured schema (logged as
        # a warning) into a hash, serialized to JSON by the caller.
        def extract_unknown_values(record)
          record_keys = record["properties"].keys + [NOT_PROPERTY_COLUMN]
          schema_keys = @schema.map {|column| column["name"]}
          unknown_keys = record_keys - schema_keys

          unless unknown_keys.empty?
            Embulk.logger.warn("Unknown columns exists in record: #{unknown_keys.join(', ')}")
          end

          unknown_keys.inject({}) do |result, key|
            result[key] = extract_value(record, key)
            result
          end
        end
      end
    end
  end
end
|