embulk-input-mixpanel 0.5.2 → 0.5.3.alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/embulk-input-mixpanel.gemspec +1 -2
- data/lib/embulk/input/mixpanel.rb +62 -31
- data/lib/range_generator.rb +1 -1
- data/test/embulk/input/test_mixpanel.rb +122 -1
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 28a4798aa8352ee4fbf06a23a57ff44aeb4cb902
|
4
|
+
data.tar.gz: 433333137ed26c3a2d62b822b767621885095cf4
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2d5fbba923cf5e52c169ed94001633001a2fcd8bb83cf0138541581d5a7768e4fd51bf7d690f7105c3fb31cf480a27a1af4fa9633636c8e4a237ba5786c99722
|
7
|
+
data.tar.gz: a147657608530a5f0ef513fdd150e93cf62bee74abeeadc3046ad30df42d19c1b94c4f42606260c827f1785dfc3c5c9fef6cf231efbc55ef4351d7be38cd4002
|
@@ -1,7 +1,6 @@
|
|
1
|
-
|
2
1
|
Gem::Specification.new do |spec|
|
3
2
|
spec.name = "embulk-input-mixpanel"
|
4
|
-
spec.version = "0.5.2"
|
3
|
+
spec.version = "0.5.3.alpha.1"
|
5
4
|
spec.authors = ["yoshihara", "uu59"]
|
6
5
|
spec.summary = "Mixpanel input plugin for Embulk"
|
7
6
|
spec.description = "Loads records from Mixpanel."
|
@@ -29,17 +29,32 @@ module Embulk
|
|
29
29
|
# between each 7 (SLICE_DAYS_COUNT) days.
|
30
30
|
SLICE_DAYS_COUNT = 7
|
31
31
|
|
32
|
+
DEFAULT_TIME_COLUMN = 'time'
|
33
|
+
|
32
34
|
def self.transaction(config, &control)
|
33
35
|
timezone = config.param(:timezone, :string)
|
34
36
|
TimezoneValidator.new(timezone).validate
|
35
37
|
|
36
38
|
from_date = config.param(:from_date, :string, default: (Date.today - 2).to_s)
|
37
39
|
fetch_days = config.param(:fetch_days, :integer, default: nil)
|
38
|
-
|
39
|
-
Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
|
40
|
+
|
40
41
|
|
41
42
|
fetch_unknown_columns = config.param(:fetch_unknown_columns, :bool, default: false)
|
42
43
|
|
44
|
+
incremental_column = config.param(:incremental_column, :string, default: nil)
|
45
|
+
incremental = config.param(:incremental, :bool, default: true)
|
46
|
+
latest_fetched_time = config.param(:latest_fetched_time, :integer, default: 0)
|
47
|
+
|
48
|
+
# Backfill from date if incremental and an incremental field is set and we are in incremental run
|
49
|
+
if incremental && !incremental_column.nil? && latest_fetched_time !=0
|
50
|
+
back_fill_days = config.param(:back_fill_days, :integer, default: 5)
|
51
|
+
puts "Backfill days #{back_fill_days}"
|
52
|
+
from_date = (Date.parse(from_date) - back_fill_days).to_s
|
53
|
+
fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
|
54
|
+
end
|
55
|
+
|
56
|
+
range = RangeGenerator.new(from_date, fetch_days).generate_range
|
57
|
+
Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
|
43
58
|
task = {
|
44
59
|
params: export_params(config),
|
45
60
|
dates: range,
|
@@ -50,8 +65,10 @@ module Embulk
|
|
50
65
|
fetch_unknown_columns: fetch_unknown_columns,
|
51
66
|
fetch_custom_properties: config.param(:fetch_custom_properties, :bool, default: true),
|
52
67
|
retry_initial_wait_sec: config.param(:retry_initial_wait_sec, :integer, default: 1),
|
68
|
+
incremental_column: incremental_column,
|
53
69
|
retry_limit: config.param(:retry_limit, :integer, default: 5),
|
54
|
-
latest_fetched_time:
|
70
|
+
latest_fetched_time: latest_fetched_time,
|
71
|
+
incremental: incremental
|
55
72
|
}
|
56
73
|
|
57
74
|
if task[:fetch_unknown_columns] && task[:fetch_custom_properties]
|
@@ -82,14 +99,16 @@ module Embulk
|
|
82
99
|
|
83
100
|
# NOTE: If this plugin supports to run by multi threads, this
|
84
101
|
# implementation is terrible.
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
102
|
+
if task[:incremental]
|
103
|
+
task_report = task_reports.first
|
104
|
+
next_to_date = Date.parse(task_report[:to_date])
|
105
|
+
next_config_diff = {
|
106
|
+
from_date: next_to_date.to_s,
|
107
|
+
latest_fetched_time: task_report[:latest_fetched_time],
|
108
|
+
}
|
109
|
+
return next_config_diff
|
110
|
+
end
|
111
|
+
return {}
|
93
112
|
end
|
94
113
|
|
95
114
|
def self.guess(config)
|
@@ -109,7 +128,6 @@ module Embulk
|
|
109
128
|
"from_date" => range.first,
|
110
129
|
"to_date" => range.last,
|
111
130
|
)
|
112
|
-
|
113
131
|
columns = guess_from_records(client.export_for_small_dataset(params))
|
114
132
|
return {"columns" => columns}
|
115
133
|
end
|
@@ -133,6 +151,8 @@ module Embulk
|
|
133
151
|
@schema = task[:schema]
|
134
152
|
@dates = task[:dates]
|
135
153
|
@fetch_unknown_columns = task[:fetch_unknown_columns]
|
154
|
+
@incremental_column = task[:incremental_column]
|
155
|
+
@incremental = task[:incremental]
|
136
156
|
end
|
137
157
|
|
138
158
|
def run
|
@@ -146,19 +166,25 @@ module Embulk
|
|
146
166
|
unless preview?
|
147
167
|
Embulk.logger.info "Fetching data from #{dates.first} to #{dates.last} ..."
|
148
168
|
end
|
149
|
-
|
150
|
-
fetch(dates).each do |record|
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
|
158
|
-
|
159
|
-
|
160
|
-
|
161
|
-
|
169
|
+
record_time_column=@incremental_column || DEFAULT_TIME_COLUMN
|
170
|
+
fetch(dates,prev_latest_fetched_time).each do |record|
|
171
|
+
if @incremental
|
172
|
+
if !record["properties"].include?(record_time_column)
|
173
|
+
raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
|
174
|
+
end
|
175
|
+
record_time = record["properties"][record_time_column]
|
176
|
+
if @incremental_column.nil?
|
177
|
+
if record_time <= prev_latest_fetched_time
|
178
|
+
ignored_record_count += 1
|
179
|
+
next
|
180
|
+
end
|
181
|
+
end
|
182
|
+
|
183
|
+
current_latest_fetched_time= [
|
184
|
+
current_latest_fetched_time,
|
185
|
+
record_time,
|
186
|
+
].max
|
187
|
+
end
|
162
188
|
values = extract_values(record)
|
163
189
|
if @fetch_unknown_columns
|
164
190
|
unknown_values = extract_unknown_values(record)
|
@@ -175,14 +201,12 @@ module Embulk
|
|
175
201
|
end
|
176
202
|
break if preview?
|
177
203
|
end
|
178
|
-
|
179
204
|
page_builder.finish
|
180
|
-
|
181
205
|
task_report = {
|
182
206
|
latest_fetched_time: current_latest_fetched_time,
|
183
207
|
to_date: @dates.last || Date.today - 1,
|
184
208
|
}
|
185
|
-
|
209
|
+
task_report
|
186
210
|
end
|
187
211
|
|
188
212
|
private
|
@@ -236,13 +260,19 @@ module Embulk
|
|
236
260
|
end
|
237
261
|
end
|
238
262
|
|
239
|
-
def fetch(dates, &block)
|
263
|
+
def fetch(dates,last_fetch_time, &block)
|
240
264
|
from_date = dates.first
|
241
265
|
to_date = dates.last
|
242
266
|
params = @params.merge(
|
243
267
|
"from_date" => from_date,
|
244
|
-
"to_date" => to_date
|
268
|
+
"to_date" => to_date
|
245
269
|
)
|
270
|
+
if !@incremental_column.nil? && !last_fetch_time.nil? && last_fetch_time!=0 # can't do filter on time column, time column need to be filter manually.
|
271
|
+
params = params.merge(
|
272
|
+
"where" => "#{params['where'].nil? ? '' : "(#{params['where']}) and " }properties[\"#{@incremental_column}\"] > #{last_fetch_time}"
|
273
|
+
)
|
274
|
+
end
|
275
|
+
puts "Where params is #{params["where"]}"
|
246
276
|
client = MixpanelApi::Client.new(@api_key, @api_secret, self.class.perfect_retry(task))
|
247
277
|
|
248
278
|
if preview?
|
@@ -299,7 +329,7 @@ module Embulk
|
|
299
329
|
end
|
300
330
|
|
301
331
|
def self.guess_from_records(records)
|
302
|
-
sample_props = records.first(GUESS_RECORDS_COUNT).map{|r| r["properties"]}
|
332
|
+
sample_props = records.first(GUESS_RECORDS_COUNT).map {|r| r["properties"]}
|
303
333
|
schema = Guess::SchemaGuess.from_hash_records(sample_props)
|
304
334
|
columns = schema.map do |col|
|
305
335
|
next if col.name == "time"
|
@@ -311,6 +341,7 @@ module Embulk
|
|
311
341
|
result
|
312
342
|
end.compact
|
313
343
|
columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
|
344
|
+
# Shift incremental column to top
|
314
345
|
columns.unshift(name: "time", type: :long)
|
315
346
|
end
|
316
347
|
end
|
data/lib/range_generator.rb
CHANGED
@@ -116,7 +116,7 @@ module Embulk
|
|
116
116
|
|
117
117
|
def test_json_type
|
118
118
|
sample_records = records.map do |r|
|
119
|
-
r.merge("properties" => {"array" => [1,2], "hash" => {foo: "FOO"}})
|
119
|
+
r.merge("properties" => {"time" => 1, "array" => [1, 2], "hash" => {foo: "FOO"}})
|
120
120
|
end
|
121
121
|
actual = Mixpanel.guess_from_records(sample_records)
|
122
122
|
assert actual.include?(name: "array", type: :json)
|
@@ -193,6 +193,14 @@ module Embulk
|
|
193
193
|
end
|
194
194
|
end
|
195
195
|
|
196
|
+
def test_default_configuration
|
197
|
+
stub(Mixpanel).resume {|task|
|
198
|
+
assert_nil(task[:incremental_column])
|
199
|
+
assert_true(task[:incremental])
|
200
|
+
}
|
201
|
+
Mixpanel.transaction(transaction_config(Date.today))
|
202
|
+
end
|
203
|
+
|
196
204
|
private
|
197
205
|
|
198
206
|
def transaction_config(from_date)
|
@@ -282,6 +290,8 @@ module Embulk
|
|
282
290
|
dates: DATES.map {|date| date.to_s},
|
283
291
|
api_key: API_KEY,
|
284
292
|
api_secret: API_SECRET,
|
293
|
+
incremental: true,
|
294
|
+
incremental_column: nil,
|
285
295
|
timezone: timezone,
|
286
296
|
schema: schema
|
287
297
|
)
|
@@ -304,6 +314,25 @@ module Embulk
|
|
304
314
|
Mixpanel.transaction(transaction_config(days), &control)
|
305
315
|
end
|
306
316
|
|
317
|
+
def test_valid_days_with_backfill
|
318
|
+
days = 5
|
319
|
+
|
320
|
+
stub(Mixpanel).resume() do |task|
|
321
|
+
assert_equal(["2015-02-17", "2015-02-18", "2015-02-19", "2015-02-20", "2015-02-21", "2015-02-22", "2015-02-23", "2015-02-24", "2015-02-25", "2015-02-26"], task[:dates])
|
322
|
+
end
|
323
|
+
config=transaction_config(days).merge("back_fill_days" => 5, "incremental_column" => "test_column", "latest_fetched_time" => 1501599491000)
|
324
|
+
Mixpanel.transaction(config, &control)
|
325
|
+
end
|
326
|
+
|
327
|
+
def test_valid_days_with_backfill_first_run
|
328
|
+
days = 5
|
329
|
+
stub(Mixpanel).resume() do |task|
|
330
|
+
assert_equal(transaction_task(days)[:dates], task[:dates])
|
331
|
+
end
|
332
|
+
config=transaction_config(days).merge("back_fill_days" => 5, "incremental_column" => "test_column")
|
333
|
+
Mixpanel.transaction(config, &control)
|
334
|
+
end
|
335
|
+
|
307
336
|
def test_invalid_days
|
308
337
|
days = 0
|
309
338
|
|
@@ -549,6 +578,96 @@ module Embulk
|
|
549
578
|
@plugin.run
|
550
579
|
end
|
551
580
|
|
581
|
+
class NonIncrementalRunTest < self
|
582
|
+
|
583
|
+
def test_non_incremental_run
|
584
|
+
|
585
|
+
mock(@page_builder).add(anything).times(records.length * 2)
|
586
|
+
mock(@page_builder).finish
|
587
|
+
task_report = @plugin.run
|
588
|
+
assert_equal(0, task_report[:latest_fetched_time])
|
589
|
+
end
|
590
|
+
|
591
|
+
def task
|
592
|
+
super.merge(incremental: false)
|
593
|
+
end
|
594
|
+
|
595
|
+
end
|
596
|
+
|
597
|
+
class IncrementalRunTest < self
|
598
|
+
|
599
|
+
def test_incremental_run
|
600
|
+
dont_allow(mock(@page_builder)).add(anything)
|
601
|
+
mock(@page_builder).finish
|
602
|
+
task_report = @plugin.run
|
603
|
+
assert_equal(record_epoch+1, task_report[:latest_fetched_time])
|
604
|
+
end
|
605
|
+
|
606
|
+
def task
|
607
|
+
super.merge(incremental: true, latest_fetched_time: record_epoch+1)
|
608
|
+
end
|
609
|
+
|
610
|
+
end
|
611
|
+
|
612
|
+
class IncrementalColumnTest < self
|
613
|
+
|
614
|
+
def setup
|
615
|
+
end
|
616
|
+
|
617
|
+
def test_incremental_column_with_where
|
618
|
+
page_builder = Object.new
|
619
|
+
plugin = Mixpanel.new(task.merge(params: task[:params].merge("where" => "abc==def")), nil, nil, page_builder)
|
620
|
+
stub(plugin).preview? {false}
|
621
|
+
adjusted = record_epoch - timezone_offset_seconds
|
622
|
+
mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
|
623
|
+
mock(page_builder).finish
|
624
|
+
any_instance_of(MixpanelApi::Client) do |klass|
|
625
|
+
stub(klass).export() do |params, block|
|
626
|
+
assert_equal('(abc==def) and properties["mp_processing_time_ms"] > 0',params["where"])
|
627
|
+
records.each{|record| block.call(record) }
|
628
|
+
end
|
629
|
+
end
|
630
|
+
task_report = plugin.run
|
631
|
+
assert_equal(1234567919, task_report[:latest_fetched_time])
|
632
|
+
end
|
633
|
+
|
634
|
+
def test_incremental_column
|
635
|
+
page_builder = Object.new
|
636
|
+
plugin = Mixpanel.new(task, nil, nil, page_builder)
|
637
|
+
stub(plugin).preview? {false}
|
638
|
+
adjusted = record_epoch - timezone_offset_seconds
|
639
|
+
mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
|
640
|
+
mock(page_builder).finish
|
641
|
+
any_instance_of(MixpanelApi::Client) do |klass|
|
642
|
+
stub(klass).export() do |params, block|
|
643
|
+
assert_equal('properties["mp_processing_time_ms"] > 0',params["where"])
|
644
|
+
records.each{|record| block.call(record) }
|
645
|
+
end
|
646
|
+
end
|
647
|
+
task_report = plugin.run
|
648
|
+
assert_equal(1234567919, task_report[:latest_fetched_time])
|
649
|
+
end
|
650
|
+
|
651
|
+
def records
|
652
|
+
super.each_with_index.map {|record, i|
|
653
|
+
record['properties']['mp_processing_time_ms'] = record_epoch+i
|
654
|
+
record
|
655
|
+
}
|
656
|
+
end
|
657
|
+
|
658
|
+
def schema
|
659
|
+
[
|
660
|
+
{"name" => "foo", "type" => "string"},
|
661
|
+
{"name" => "time", "type" => "integer"},
|
662
|
+
{"name" => "event", "type" => "string"},
|
663
|
+
]
|
664
|
+
end
|
665
|
+
|
666
|
+
def task
|
667
|
+
super.merge(incremental_column: 'mp_processing_time_ms')
|
668
|
+
end
|
669
|
+
end
|
670
|
+
|
552
671
|
class CustomPropertiesTest < self
|
553
672
|
def setup
|
554
673
|
super
|
@@ -671,6 +790,8 @@ module Embulk
|
|
671
790
|
api_key: API_KEY,
|
672
791
|
api_secret: API_SECRET,
|
673
792
|
timezone: TIMEZONE,
|
793
|
+
incremental: true,
|
794
|
+
incremental_column: nil,
|
674
795
|
schema: schema,
|
675
796
|
dates: DATES.to_a.map(&:to_s),
|
676
797
|
params: Mixpanel.export_params(embulk_config),
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-mixpanel
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.2
|
4
|
+
version: 0.5.3.alpha.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- yoshihara
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2017-
|
12
|
+
date: 2017-08-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -219,12 +219,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
219
219
|
version: '0'
|
220
220
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
221
221
|
requirements:
|
222
|
-
- - "
|
222
|
+
- - ">"
|
223
223
|
- !ruby/object:Gem::Version
|
224
|
-
version:
|
224
|
+
version: 1.3.1
|
225
225
|
requirements: []
|
226
226
|
rubyforge_project:
|
227
|
-
rubygems_version: 2.
|
227
|
+
rubygems_version: 2.6.12
|
228
228
|
signing_key:
|
229
229
|
specification_version: 4
|
230
230
|
summary: Mixpanel input plugin for Embulk
|