embulk-input-mixpanel 0.5.2 → 0.5.3.alpha.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8534316e5eae7127b70afc10d9b247e872643c32
4
- data.tar.gz: e629adeb4d42e6386564bd13cfba3bce107ecd9c
3
+ metadata.gz: 28a4798aa8352ee4fbf06a23a57ff44aeb4cb902
4
+ data.tar.gz: 433333137ed26c3a2d62b822b767621885095cf4
5
5
  SHA512:
6
- metadata.gz: 70287307438223546321af68362fc63fd56f592eb09d17d543d53986990109bb53b285a8467500617f7a8b99ab4991b6850a7b53c0df241282ee2259f5a8808d
7
- data.tar.gz: f8e370285d393fada617ccc2d4dcb9d5cd54396cf47757a9af6f425bdfb83ac77c6354127171a6859939367698cd9b23e0915b8c954e557533140dcc1ea1525c
6
+ metadata.gz: 2d5fbba923cf5e52c169ed94001633001a2fcd8bb83cf0138541581d5a7768e4fd51bf7d690f7105c3fb31cf480a27a1af4fa9633636c8e4a237ba5786c99722
7
+ data.tar.gz: a147657608530a5f0ef513fdd150e93cf62bee74abeeadc3046ad30df42d19c1b94c4f42606260c827f1785dfc3c5c9fef6cf231efbc55ef4351d7be38cd4002
@@ -1,7 +1,6 @@
1
-
2
1
  Gem::Specification.new do |spec|
3
2
  spec.name = "embulk-input-mixpanel"
4
- spec.version = "0.5.2"
3
+ spec.version = "0.5.3.alpha.1"
5
4
  spec.authors = ["yoshihara", "uu59"]
6
5
  spec.summary = "Mixpanel input plugin for Embulk"
7
6
  spec.description = "Loads records from Mixpanel."
@@ -29,17 +29,32 @@ module Embulk
29
29
  # between each 7 (SLICE_DAYS_COUNT) days.
30
30
  SLICE_DAYS_COUNT = 7
31
31
 
32
+ DEFAULT_TIME_COLUMN = 'time'
33
+
32
34
  def self.transaction(config, &control)
33
35
  timezone = config.param(:timezone, :string)
34
36
  TimezoneValidator.new(timezone).validate
35
37
 
36
38
  from_date = config.param(:from_date, :string, default: (Date.today - 2).to_s)
37
39
  fetch_days = config.param(:fetch_days, :integer, default: nil)
38
- range = RangeGenerator.new(from_date, fetch_days).generate_range
39
- Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
40
+
40
41
 
41
42
  fetch_unknown_columns = config.param(:fetch_unknown_columns, :bool, default: false)
42
43
 
44
+ incremental_column = config.param(:incremental_column, :string, default: nil)
45
+ incremental = config.param(:incremental, :bool, default: true)
46
+ latest_fetched_time = config.param(:latest_fetched_time, :integer, default: 0)
47
+
48
+ # Backfill from date if incremental and an incremental field is set and we are in incremental run
49
+ if incremental && !incremental_column.nil? && latest_fetched_time !=0
50
+ back_fill_days = config.param(:back_fill_days, :integer, default: 5)
51
+ puts "Backfill days #{back_fill_days}"
52
+ from_date = (Date.parse(from_date) - back_fill_days).to_s
53
+ fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
54
+ end
55
+
56
+ range = RangeGenerator.new(from_date, fetch_days).generate_range
57
+ Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
43
58
  task = {
44
59
  params: export_params(config),
45
60
  dates: range,
@@ -50,8 +65,10 @@ module Embulk
50
65
  fetch_unknown_columns: fetch_unknown_columns,
51
66
  fetch_custom_properties: config.param(:fetch_custom_properties, :bool, default: true),
52
67
  retry_initial_wait_sec: config.param(:retry_initial_wait_sec, :integer, default: 1),
68
+ incremental_column: incremental_column,
53
69
  retry_limit: config.param(:retry_limit, :integer, default: 5),
54
- latest_fetched_time: config.param(:latest_fetched_time, :integer, default: 0),
70
+ latest_fetched_time: latest_fetched_time,
71
+ incremental: incremental
55
72
  }
56
73
 
57
74
  if task[:fetch_unknown_columns] && task[:fetch_custom_properties]
@@ -82,14 +99,16 @@ module Embulk
82
99
 
83
100
  # NOTE: If this plugin supports to run by multi threads, this
84
101
  # implementation is terrible.
85
- task_report = task_reports.first
86
- next_to_date = Date.parse(task_report[:to_date])
87
-
88
- next_config_diff = {
89
- from_date: next_to_date.to_s,
90
- latest_fetched_time: task_report[:latest_fetched_time],
91
- }
92
- return next_config_diff
102
+ if task[:incremental]
103
+ task_report = task_reports.first
104
+ next_to_date = Date.parse(task_report[:to_date])
105
+ next_config_diff = {
106
+ from_date: next_to_date.to_s,
107
+ latest_fetched_time: task_report[:latest_fetched_time],
108
+ }
109
+ return next_config_diff
110
+ end
111
+ return {}
93
112
  end
94
113
 
95
114
  def self.guess(config)
@@ -109,7 +128,6 @@ module Embulk
109
128
  "from_date" => range.first,
110
129
  "to_date" => range.last,
111
130
  )
112
-
113
131
  columns = guess_from_records(client.export_for_small_dataset(params))
114
132
  return {"columns" => columns}
115
133
  end
@@ -133,6 +151,8 @@ module Embulk
133
151
  @schema = task[:schema]
134
152
  @dates = task[:dates]
135
153
  @fetch_unknown_columns = task[:fetch_unknown_columns]
154
+ @incremental_column = task[:incremental_column]
155
+ @incremental = task[:incremental]
136
156
  end
137
157
 
138
158
  def run
@@ -146,19 +166,25 @@ module Embulk
146
166
  unless preview?
147
167
  Embulk.logger.info "Fetching data from #{dates.first} to #{dates.last} ..."
148
168
  end
149
-
150
- fetch(dates).each do |record|
151
- record_time = record["properties"]["time"]
152
- if record_time <= prev_latest_fetched_time
153
- ignored_record_count += 1
154
- next
155
- end
156
-
157
- current_latest_fetched_time= [
158
- current_latest_fetched_time,
159
- record_time,
160
- ].max
161
-
169
+ record_time_column=@incremental_column || DEFAULT_TIME_COLUMN
170
+ fetch(dates,prev_latest_fetched_time).each do |record|
171
+ if @incremental
172
+ if !record["properties"].include?(record_time_column)
173
+ raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
174
+ end
175
+ record_time = record["properties"][record_time_column]
176
+ if @incremental_column.nil?
177
+ if record_time <= prev_latest_fetched_time
178
+ ignored_record_count += 1
179
+ next
180
+ end
181
+ end
182
+
183
+ current_latest_fetched_time= [
184
+ current_latest_fetched_time,
185
+ record_time,
186
+ ].max
187
+ end
162
188
  values = extract_values(record)
163
189
  if @fetch_unknown_columns
164
190
  unknown_values = extract_unknown_values(record)
@@ -175,14 +201,12 @@ module Embulk
175
201
  end
176
202
  break if preview?
177
203
  end
178
-
179
204
  page_builder.finish
180
-
181
205
  task_report = {
182
206
  latest_fetched_time: current_latest_fetched_time,
183
207
  to_date: @dates.last || Date.today - 1,
184
208
  }
185
- return task_report
209
+ task_report
186
210
  end
187
211
 
188
212
  private
@@ -236,13 +260,19 @@ module Embulk
236
260
  end
237
261
  end
238
262
 
239
- def fetch(dates, &block)
263
+ def fetch(dates,last_fetch_time, &block)
240
264
  from_date = dates.first
241
265
  to_date = dates.last
242
266
  params = @params.merge(
243
267
  "from_date" => from_date,
244
- "to_date" => to_date,
268
+ "to_date" => to_date
245
269
  )
270
+ if !@incremental_column.nil? && !last_fetch_time.nil? && last_fetch_time!=0 # can't do filter on time column, time column need to be filter manually.
271
+ params = params.merge(
272
+ "where" => "#{params['where'].nil? ? '' : "(#{params['where']}) and " }properties[\"#{@incremental_column}\"] > #{last_fetch_time}"
273
+ )
274
+ end
275
+ puts "Where params is #{params["where"]}"
246
276
  client = MixpanelApi::Client.new(@api_key, @api_secret, self.class.perfect_retry(task))
247
277
 
248
278
  if preview?
@@ -299,7 +329,7 @@ module Embulk
299
329
  end
300
330
 
301
331
  def self.guess_from_records(records)
302
- sample_props = records.first(GUESS_RECORDS_COUNT).map{|r| r["properties"]}
332
+ sample_props = records.first(GUESS_RECORDS_COUNT).map {|r| r["properties"]}
303
333
  schema = Guess::SchemaGuess.from_hash_records(sample_props)
304
334
  columns = schema.map do |col|
305
335
  next if col.name == "time"
@@ -311,6 +341,7 @@ module Embulk
311
341
  result
312
342
  end.compact
313
343
  columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
344
+ # Shift incremental column to top
314
345
  columns.unshift(name: "time", type: :long)
315
346
  end
316
347
  end
@@ -47,7 +47,7 @@ class RangeGenerator
47
47
  end
48
48
 
49
49
  if fetch_days
50
- from_date..(from_date + fetch_days - 1)
50
+ from_date..(from_date + (fetch_days > 1? fetch_days - 1 : fetch_days))
51
51
  else
52
52
  from_date..today
53
53
  end
@@ -116,7 +116,7 @@ module Embulk
116
116
 
117
117
  def test_json_type
118
118
  sample_records = records.map do |r|
119
- r.merge("properties" => {"array" => [1,2], "hash" => {foo: "FOO"}})
119
+ r.merge("properties" => {"time" => 1, "array" => [1, 2], "hash" => {foo: "FOO"}})
120
120
  end
121
121
  actual = Mixpanel.guess_from_records(sample_records)
122
122
  assert actual.include?(name: "array", type: :json)
@@ -193,6 +193,14 @@ module Embulk
193
193
  end
194
194
  end
195
195
 
196
+ def test_default_configuration
197
+ stub(Mixpanel).resume {|task|
198
+ assert_nil(task[:incremental_column])
199
+ assert_true(task[:incremental])
200
+ }
201
+ Mixpanel.transaction(transaction_config(Date.today))
202
+ end
203
+
196
204
  private
197
205
 
198
206
  def transaction_config(from_date)
@@ -282,6 +290,8 @@ module Embulk
282
290
  dates: DATES.map {|date| date.to_s},
283
291
  api_key: API_KEY,
284
292
  api_secret: API_SECRET,
293
+ incremental: true,
294
+ incremental_column: nil,
285
295
  timezone: timezone,
286
296
  schema: schema
287
297
  )
@@ -304,6 +314,25 @@ module Embulk
304
314
  Mixpanel.transaction(transaction_config(days), &control)
305
315
  end
306
316
 
317
+ def test_valid_days_with_backfill
318
+ days = 5
319
+
320
+ stub(Mixpanel).resume() do |task|
321
+ assert_equal(["2015-02-17", "2015-02-18", "2015-02-19", "2015-02-20", "2015-02-21", "2015-02-22", "2015-02-23", "2015-02-24", "2015-02-25", "2015-02-26"], task[:dates])
322
+ end
323
+ config=transaction_config(days).merge("back_fill_days" => 5, "incremental_column" => "test_column", "latest_fetched_time" => 1501599491000)
324
+ Mixpanel.transaction(config, &control)
325
+ end
326
+
327
+ def test_valid_days_with_backfill_first_run
328
+ days = 5
329
+ stub(Mixpanel).resume() do |task|
330
+ assert_equal(transaction_task(days)[:dates], task[:dates])
331
+ end
332
+ config=transaction_config(days).merge("back_fill_days" => 5, "incremental_column" => "test_column")
333
+ Mixpanel.transaction(config, &control)
334
+ end
335
+
307
336
  def test_invalid_days
308
337
  days = 0
309
338
 
@@ -549,6 +578,96 @@ module Embulk
549
578
  @plugin.run
550
579
  end
551
580
 
581
+ class NonIncrementalRunTest < self
582
+
583
+ def test_non_incremental_run
584
+
585
+ mock(@page_builder).add(anything).times(records.length * 2)
586
+ mock(@page_builder).finish
587
+ task_report = @plugin.run
588
+ assert_equal(0, task_report[:latest_fetched_time])
589
+ end
590
+
591
+ def task
592
+ super.merge(incremental: false)
593
+ end
594
+
595
+ end
596
+
597
+ class IncrementalRunTest < self
598
+
599
+ def test_incremental_run
600
+ dont_allow(mock(@page_builder)).add(anything)
601
+ mock(@page_builder).finish
602
+ task_report = @plugin.run
603
+ assert_equal(record_epoch+1, task_report[:latest_fetched_time])
604
+ end
605
+
606
+ def task
607
+ super.merge(incremental: true, latest_fetched_time: record_epoch+1)
608
+ end
609
+
610
+ end
611
+
612
+ class IncrementalColumnTest < self
613
+
614
+ def setup
615
+ end
616
+
617
+ def test_incremental_column_with_where
618
+ page_builder = Object.new
619
+ plugin = Mixpanel.new(task.merge(params: task[:params].merge("where" => "abc==def")), nil, nil, page_builder)
620
+ stub(plugin).preview? {false}
621
+ adjusted = record_epoch - timezone_offset_seconds
622
+ mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
623
+ mock(page_builder).finish
624
+ any_instance_of(MixpanelApi::Client) do |klass|
625
+ stub(klass).export() do |params, block|
626
+ assert_equal('(abc==def) and properties["mp_processing_time_ms"] > 0',params["where"])
627
+ records.each{|record| block.call(record) }
628
+ end
629
+ end
630
+ task_report = plugin.run
631
+ assert_equal(1234567919, task_report[:latest_fetched_time])
632
+ end
633
+
634
+ def test_incremental_column
635
+ page_builder = Object.new
636
+ plugin = Mixpanel.new(task, nil, nil, page_builder)
637
+ stub(plugin).preview? {false}
638
+ adjusted = record_epoch - timezone_offset_seconds
639
+ mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
640
+ mock(page_builder).finish
641
+ any_instance_of(MixpanelApi::Client) do |klass|
642
+ stub(klass).export() do |params, block|
643
+ assert_equal('properties["mp_processing_time_ms"] > 0',params["where"])
644
+ records.each{|record| block.call(record) }
645
+ end
646
+ end
647
+ task_report = plugin.run
648
+ assert_equal(1234567919, task_report[:latest_fetched_time])
649
+ end
650
+
651
+ def records
652
+ super.each_with_index.map {|record, i|
653
+ record['properties']['mp_processing_time_ms'] = record_epoch+i
654
+ record
655
+ }
656
+ end
657
+
658
+ def schema
659
+ [
660
+ {"name" => "foo", "type" => "string"},
661
+ {"name" => "time", "type" => "integer"},
662
+ {"name" => "event", "type" => "string"},
663
+ ]
664
+ end
665
+
666
+ def task
667
+ super.merge(incremental_column: 'mp_processing_time_ms')
668
+ end
669
+ end
670
+
552
671
  class CustomPropertiesTest < self
553
672
  def setup
554
673
  super
@@ -671,6 +790,8 @@ module Embulk
671
790
  api_key: API_KEY,
672
791
  api_secret: API_SECRET,
673
792
  timezone: TIMEZONE,
793
+ incremental: true,
794
+ incremental_column: nil,
674
795
  schema: schema,
675
796
  dates: DATES.to_a.map(&:to_s),
676
797
  params: Mixpanel.export_params(embulk_config),
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-mixpanel
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3.alpha.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshihara
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-07-26 00:00:00.000000000 Z
12
+ date: 2017-08-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
@@ -219,12 +219,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
219
219
  version: '0'
220
220
  required_rubygems_version: !ruby/object:Gem::Requirement
221
221
  requirements:
222
- - - ">="
222
+ - - ">"
223
223
  - !ruby/object:Gem::Version
224
- version: '0'
224
+ version: 1.3.1
225
225
  requirements: []
226
226
  rubyforge_project:
227
- rubygems_version: 2.4.8
227
+ rubygems_version: 2.6.12
228
228
  signing_key:
229
229
  specification_version: 4
230
230
  summary: Mixpanel input plugin for Embulk