embulk-input-mixpanel 0.5.2 → 0.5.3.alpha.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 8534316e5eae7127b70afc10d9b247e872643c32
4
- data.tar.gz: e629adeb4d42e6386564bd13cfba3bce107ecd9c
3
+ metadata.gz: 28a4798aa8352ee4fbf06a23a57ff44aeb4cb902
4
+ data.tar.gz: 433333137ed26c3a2d62b822b767621885095cf4
5
5
  SHA512:
6
- metadata.gz: 70287307438223546321af68362fc63fd56f592eb09d17d543d53986990109bb53b285a8467500617f7a8b99ab4991b6850a7b53c0df241282ee2259f5a8808d
7
- data.tar.gz: f8e370285d393fada617ccc2d4dcb9d5cd54396cf47757a9af6f425bdfb83ac77c6354127171a6859939367698cd9b23e0915b8c954e557533140dcc1ea1525c
6
+ metadata.gz: 2d5fbba923cf5e52c169ed94001633001a2fcd8bb83cf0138541581d5a7768e4fd51bf7d690f7105c3fb31cf480a27a1af4fa9633636c8e4a237ba5786c99722
7
+ data.tar.gz: a147657608530a5f0ef513fdd150e93cf62bee74abeeadc3046ad30df42d19c1b94c4f42606260c827f1785dfc3c5c9fef6cf231efbc55ef4351d7be38cd4002
@@ -1,7 +1,6 @@
1
-
2
1
  Gem::Specification.new do |spec|
3
2
  spec.name = "embulk-input-mixpanel"
4
- spec.version = "0.5.2"
3
+ spec.version = "0.5.3.alpha.1"
5
4
  spec.authors = ["yoshihara", "uu59"]
6
5
  spec.summary = "Mixpanel input plugin for Embulk"
7
6
  spec.description = "Loads records from Mixpanel."
@@ -29,17 +29,32 @@ module Embulk
29
29
  # between each 7 (SLICE_DAYS_COUNT) days.
30
30
  SLICE_DAYS_COUNT = 7
31
31
 
32
+ DEFAULT_TIME_COLUMN = 'time'
33
+
32
34
  def self.transaction(config, &control)
33
35
  timezone = config.param(:timezone, :string)
34
36
  TimezoneValidator.new(timezone).validate
35
37
 
36
38
  from_date = config.param(:from_date, :string, default: (Date.today - 2).to_s)
37
39
  fetch_days = config.param(:fetch_days, :integer, default: nil)
38
- range = RangeGenerator.new(from_date, fetch_days).generate_range
39
- Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
40
+
40
41
 
41
42
  fetch_unknown_columns = config.param(:fetch_unknown_columns, :bool, default: false)
42
43
 
44
+ incremental_column = config.param(:incremental_column, :string, default: nil)
45
+ incremental = config.param(:incremental, :bool, default: true)
46
+ latest_fetched_time = config.param(:latest_fetched_time, :integer, default: 0)
47
+
48
+ # Move from_date back by back_fill_days (and extend fetch_days to match) when running incrementally with an incremental column set and a previous run has already recorded latest_fetched_time
49
+ if incremental && !incremental_column.nil? && latest_fetched_time !=0
50
+ back_fill_days = config.param(:back_fill_days, :integer, default: 5)
51
+ puts "Backfill days #{back_fill_days}"
52
+ from_date = (Date.parse(from_date) - back_fill_days).to_s
53
+ fetch_days = fetch_days.nil? ? nil : fetch_days + back_fill_days
54
+ end
55
+
56
+ range = RangeGenerator.new(from_date, fetch_days).generate_range
57
+ Embulk.logger.info "Try to fetch data from #{range.first} to #{range.last}"
43
58
  task = {
44
59
  params: export_params(config),
45
60
  dates: range,
@@ -50,8 +65,10 @@ module Embulk
50
65
  fetch_unknown_columns: fetch_unknown_columns,
51
66
  fetch_custom_properties: config.param(:fetch_custom_properties, :bool, default: true),
52
67
  retry_initial_wait_sec: config.param(:retry_initial_wait_sec, :integer, default: 1),
68
+ incremental_column: incremental_column,
53
69
  retry_limit: config.param(:retry_limit, :integer, default: 5),
54
- latest_fetched_time: config.param(:latest_fetched_time, :integer, default: 0),
70
+ latest_fetched_time: latest_fetched_time,
71
+ incremental: incremental
55
72
  }
56
73
 
57
74
  if task[:fetch_unknown_columns] && task[:fetch_custom_properties]
@@ -82,14 +99,16 @@ module Embulk
82
99
 
83
100
  # NOTE: If this plugin supports to run by multi threads, this
84
101
  # implementation is terrible.
85
- task_report = task_reports.first
86
- next_to_date = Date.parse(task_report[:to_date])
87
-
88
- next_config_diff = {
89
- from_date: next_to_date.to_s,
90
- latest_fetched_time: task_report[:latest_fetched_time],
91
- }
92
- return next_config_diff
102
+ if task[:incremental]
103
+ task_report = task_reports.first
104
+ next_to_date = Date.parse(task_report[:to_date])
105
+ next_config_diff = {
106
+ from_date: next_to_date.to_s,
107
+ latest_fetched_time: task_report[:latest_fetched_time],
108
+ }
109
+ return next_config_diff
110
+ end
111
+ return {}
93
112
  end
94
113
 
95
114
  def self.guess(config)
@@ -109,7 +128,6 @@ module Embulk
109
128
  "from_date" => range.first,
110
129
  "to_date" => range.last,
111
130
  )
112
-
113
131
  columns = guess_from_records(client.export_for_small_dataset(params))
114
132
  return {"columns" => columns}
115
133
  end
@@ -133,6 +151,8 @@ module Embulk
133
151
  @schema = task[:schema]
134
152
  @dates = task[:dates]
135
153
  @fetch_unknown_columns = task[:fetch_unknown_columns]
154
+ @incremental_column = task[:incremental_column]
155
+ @incremental = task[:incremental]
136
156
  end
137
157
 
138
158
  def run
@@ -146,19 +166,25 @@ module Embulk
146
166
  unless preview?
147
167
  Embulk.logger.info "Fetching data from #{dates.first} to #{dates.last} ..."
148
168
  end
149
-
150
- fetch(dates).each do |record|
151
- record_time = record["properties"]["time"]
152
- if record_time <= prev_latest_fetched_time
153
- ignored_record_count += 1
154
- next
155
- end
156
-
157
- current_latest_fetched_time= [
158
- current_latest_fetched_time,
159
- record_time,
160
- ].max
161
-
169
+ record_time_column=@incremental_column || DEFAULT_TIME_COLUMN
170
+ fetch(dates,prev_latest_fetched_time).each do |record|
171
+ if @incremental
172
+ if !record["properties"].include?(record_time_column)
173
+ raise Embulk::ConfigError.new("Incremental column not exists in fetched data #{record_time_column}")
174
+ end
175
+ record_time = record["properties"][record_time_column]
176
+ if @incremental_column.nil?
177
+ if record_time <= prev_latest_fetched_time
178
+ ignored_record_count += 1
179
+ next
180
+ end
181
+ end
182
+
183
+ current_latest_fetched_time= [
184
+ current_latest_fetched_time,
185
+ record_time,
186
+ ].max
187
+ end
162
188
  values = extract_values(record)
163
189
  if @fetch_unknown_columns
164
190
  unknown_values = extract_unknown_values(record)
@@ -175,14 +201,12 @@ module Embulk
175
201
  end
176
202
  break if preview?
177
203
  end
178
-
179
204
  page_builder.finish
180
-
181
205
  task_report = {
182
206
  latest_fetched_time: current_latest_fetched_time,
183
207
  to_date: @dates.last || Date.today - 1,
184
208
  }
185
- return task_report
209
+ task_report
186
210
  end
187
211
 
188
212
  private
@@ -236,13 +260,19 @@ module Embulk
236
260
  end
237
261
  end
238
262
 
239
- def fetch(dates, &block)
263
+ def fetch(dates,last_fetch_time, &block)
240
264
  from_date = dates.first
241
265
  to_date = dates.last
242
266
  params = @params.merge(
243
267
  "from_date" => from_date,
244
- "to_date" => to_date,
268
+ "to_date" => to_date
245
269
  )
270
+ if !@incremental_column.nil? && !last_fetch_time.nil? && last_fetch_time!=0 # can't filter on the time column here; the time column needs to be filtered manually.
271
+ params = params.merge(
272
+ "where" => "#{params['where'].nil? ? '' : "(#{params['where']}) and " }properties[\"#{@incremental_column}\"] > #{last_fetch_time}"
273
+ )
274
+ end
275
+ puts "Where params is #{params["where"]}"
246
276
  client = MixpanelApi::Client.new(@api_key, @api_secret, self.class.perfect_retry(task))
247
277
 
248
278
  if preview?
@@ -299,7 +329,7 @@ module Embulk
299
329
  end
300
330
 
301
331
  def self.guess_from_records(records)
302
- sample_props = records.first(GUESS_RECORDS_COUNT).map{|r| r["properties"]}
332
+ sample_props = records.first(GUESS_RECORDS_COUNT).map {|r| r["properties"]}
303
333
  schema = Guess::SchemaGuess.from_hash_records(sample_props)
304
334
  columns = schema.map do |col|
305
335
  next if col.name == "time"
@@ -311,6 +341,7 @@ module Embulk
311
341
  result
312
342
  end.compact
313
343
  columns.unshift(name: NOT_PROPERTY_COLUMN, type: :string)
344
+ # Shift incremental column to top
314
345
  columns.unshift(name: "time", type: :long)
315
346
  end
316
347
  end
@@ -47,7 +47,7 @@ class RangeGenerator
47
47
  end
48
48
 
49
49
  if fetch_days
50
- from_date..(from_date + fetch_days - 1)
50
+ from_date..(from_date + (fetch_days > 1? fetch_days - 1 : fetch_days))
51
51
  else
52
52
  from_date..today
53
53
  end
@@ -116,7 +116,7 @@ module Embulk
116
116
 
117
117
  def test_json_type
118
118
  sample_records = records.map do |r|
119
- r.merge("properties" => {"array" => [1,2], "hash" => {foo: "FOO"}})
119
+ r.merge("properties" => {"time" => 1, "array" => [1, 2], "hash" => {foo: "FOO"}})
120
120
  end
121
121
  actual = Mixpanel.guess_from_records(sample_records)
122
122
  assert actual.include?(name: "array", type: :json)
@@ -193,6 +193,14 @@ module Embulk
193
193
  end
194
194
  end
195
195
 
196
+ def test_default_configuration
197
+ stub(Mixpanel).resume {|task|
198
+ assert_nil(task[:incremental_column])
199
+ assert_true(task[:incremental])
200
+ }
201
+ Mixpanel.transaction(transaction_config(Date.today))
202
+ end
203
+
196
204
  private
197
205
 
198
206
  def transaction_config(from_date)
@@ -282,6 +290,8 @@ module Embulk
282
290
  dates: DATES.map {|date| date.to_s},
283
291
  api_key: API_KEY,
284
292
  api_secret: API_SECRET,
293
+ incremental: true,
294
+ incremental_column: nil,
285
295
  timezone: timezone,
286
296
  schema: schema
287
297
  )
@@ -304,6 +314,25 @@ module Embulk
304
314
  Mixpanel.transaction(transaction_config(days), &control)
305
315
  end
306
316
 
317
+ def test_valid_days_with_backfill
318
+ days = 5
319
+
320
+ stub(Mixpanel).resume() do |task|
321
+ assert_equal(["2015-02-17", "2015-02-18", "2015-02-19", "2015-02-20", "2015-02-21", "2015-02-22", "2015-02-23", "2015-02-24", "2015-02-25", "2015-02-26"], task[:dates])
322
+ end
323
+ config=transaction_config(days).merge("back_fill_days" => 5, "incremental_column" => "test_column", "latest_fetched_time" => 1501599491000)
324
+ Mixpanel.transaction(config, &control)
325
+ end
326
+
327
+ def test_valid_days_with_backfill_first_run
328
+ days = 5
329
+ stub(Mixpanel).resume() do |task|
330
+ assert_equal(transaction_task(days)[:dates], task[:dates])
331
+ end
332
+ config=transaction_config(days).merge("back_fill_days" => 5, "incremental_column" => "test_column")
333
+ Mixpanel.transaction(config, &control)
334
+ end
335
+
307
336
  def test_invalid_days
308
337
  days = 0
309
338
 
@@ -549,6 +578,96 @@ module Embulk
549
578
  @plugin.run
550
579
  end
551
580
 
581
+ class NonIncrementalRunTest < self
582
+
583
+ def test_non_incremental_run
584
+
585
+ mock(@page_builder).add(anything).times(records.length * 2)
586
+ mock(@page_builder).finish
587
+ task_report = @plugin.run
588
+ assert_equal(0, task_report[:latest_fetched_time])
589
+ end
590
+
591
+ def task
592
+ super.merge(incremental: false)
593
+ end
594
+
595
+ end
596
+
597
+ class IncrementalRunTest < self
598
+
599
+ def test_incremental_run
600
+ dont_allow(mock(@page_builder)).add(anything)
601
+ mock(@page_builder).finish
602
+ task_report = @plugin.run
603
+ assert_equal(record_epoch+1, task_report[:latest_fetched_time])
604
+ end
605
+
606
+ def task
607
+ super.merge(incremental: true, latest_fetched_time: record_epoch+1)
608
+ end
609
+
610
+ end
611
+
612
+ class IncrementalColumnTest < self
613
+
614
+ def setup
615
+ end
616
+
617
+ def test_incremental_column_with_where
618
+ page_builder = Object.new
619
+ plugin = Mixpanel.new(task.merge(params: task[:params].merge("where" => "abc==def")), nil, nil, page_builder)
620
+ stub(plugin).preview? {false}
621
+ adjusted = record_epoch - timezone_offset_seconds
622
+ mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
623
+ mock(page_builder).finish
624
+ any_instance_of(MixpanelApi::Client) do |klass|
625
+ stub(klass).export() do |params, block|
626
+ assert_equal('(abc==def) and properties["mp_processing_time_ms"] > 0',params["where"])
627
+ records.each{|record| block.call(record) }
628
+ end
629
+ end
630
+ task_report = plugin.run
631
+ assert_equal(1234567919, task_report[:latest_fetched_time])
632
+ end
633
+
634
+ def test_incremental_column
635
+ page_builder = Object.new
636
+ plugin = Mixpanel.new(task, nil, nil, page_builder)
637
+ stub(plugin).preview? {false}
638
+ adjusted = record_epoch - timezone_offset_seconds
639
+ mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
640
+ mock(page_builder).finish
641
+ any_instance_of(MixpanelApi::Client) do |klass|
642
+ stub(klass).export() do |params, block|
643
+ assert_equal('properties["mp_processing_time_ms"] > 0',params["where"])
644
+ records.each{|record| block.call(record) }
645
+ end
646
+ end
647
+ task_report = plugin.run
648
+ assert_equal(1234567919, task_report[:latest_fetched_time])
649
+ end
650
+
651
+ def records
652
+ super.each_with_index.map {|record, i|
653
+ record['properties']['mp_processing_time_ms'] = record_epoch+i
654
+ record
655
+ }
656
+ end
657
+
658
+ def schema
659
+ [
660
+ {"name" => "foo", "type" => "string"},
661
+ {"name" => "time", "type" => "integer"},
662
+ {"name" => "event", "type" => "string"},
663
+ ]
664
+ end
665
+
666
+ def task
667
+ super.merge(incremental_column: 'mp_processing_time_ms')
668
+ end
669
+ end
670
+
552
671
  class CustomPropertiesTest < self
553
672
  def setup
554
673
  super
@@ -671,6 +790,8 @@ module Embulk
671
790
  api_key: API_KEY,
672
791
  api_secret: API_SECRET,
673
792
  timezone: TIMEZONE,
793
+ incremental: true,
794
+ incremental_column: nil,
674
795
  schema: schema,
675
796
  dates: DATES.to_a.map(&:to_s),
676
797
  params: Mixpanel.export_params(embulk_config),
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-mixpanel
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.2
4
+ version: 0.5.3.alpha.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshihara
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2017-07-26 00:00:00.000000000 Z
12
+ date: 2017-08-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
@@ -219,12 +219,12 @@ required_ruby_version: !ruby/object:Gem::Requirement
219
219
  version: '0'
220
220
  required_rubygems_version: !ruby/object:Gem::Requirement
221
221
  requirements:
222
- - - ">="
222
+ - - ">"
223
223
  - !ruby/object:Gem::Version
224
- version: '0'
224
+ version: 1.3.1
225
225
  requirements: []
226
226
  rubyforge_project:
227
- rubygems_version: 2.4.8
227
+ rubygems_version: 2.6.12
228
228
  signing_key:
229
229
  specification_version: 4
230
230
  summary: Mixpanel input plugin for Embulk