embulk-input-mixpanel 0.5.15 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,276 @@
1
+ require 'embulk/input/service/base_service'
2
+
3
+ module Embulk
4
+ module Input
5
+ module Service
6
+ class JqlService < BaseService
7
+
8
+ FROM_DATE_PARAM = "params.from_date"
9
+ TO_DATE_PARAM = "params.to_date"
10
+
11
# Validates the plugin configuration for JQL mode.
#
# Delegates to the parent implementation first, then runs the two
# JQL-specific checks defined in this class. Each check raises
# Embulk::ConfigError when its condition is violated.
def validate_config
  super

  validate_jql_script
  validate_fetch_days
end
17
+
18
# Builds the task hash for a JQL-mode run from the plugin configuration.
#
# Every value is read from @config except `jql_endpoint` (from #endpoint),
# `dates` (from `range`, which is not defined in this class -- presumably
# provided by BaseService; verify) and the constant `jql_mode: true`.
#
# @return [Hash] the task description
# @raise whatever @config.param raises for a missing required key
#   (`api_secret` and `columns` have no default)
def create_task
  {
    timezone: @config.param(:timezone, :string, default: ""),
    api_secret: @config.param(:api_secret, :string),
    jql_endpoint: endpoint,
    dates: range,
    incremental: @config.param(:incremental, :bool, default: true),
    slice_range: @config.param(:slice_range, :integer, default: 7),
    schema: @config.param(:columns, :array),
    retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
    retry_limit: @config.param(:retry_limit, :integer, default: 5),
    incremental_column: @config.param(:incremental_column, :string, default: nil),
    latest_fetched_time: @config.param(:latest_fetched_time, :integer, default: 0),
    jql_mode: true,
    # The default must be passed as the `default:` option like every other
    # lookup here; a bare positional nil is not an options hash.
    jql_script: @config.param(:jql_script, :string, default: nil)
  }
end
35
+
36
# Guesses the output schema by running the configured JQL script against a
# small sample window.
#
# Steps: bail out if Mixpanel is down, compute the sample range, fetch
# sample records through the small-dataset client call, validate them
# (including presence of the incremental column), then derive columns.
#
# @return [Array<Hash>] column definitions as built by #guess_from_records
# @raise [Embulk::ConfigError] when the result shape is unsupported or the
#   incremental column is missing from the sample
def guess_columns
  giveup_when_mixpanel_is_down
  sample_range = guess_range
  Embulk.logger.info "Guessing schema using #{sample_range.first}..#{sample_range.last}"

  client = create_client

  # The default must be passed as `default: nil`; a bare positional nil is
  # not an options hash.
  jql_script = @config.param(:jql_script, :string, default: nil)
  sample_records = client.send_jql_script_small_dataset(
    parameters(jql_script, sample_range.first, sample_range.last)
  )

  validate_result(sample_records)

  # These instance variables are read by the incremental-column validation.
  @incremental = @config.param(:incremental, :bool, default: true)
  @incremental_column = @config.param(:incremental_column, :string, default: nil)
  validate_result_contain_incremental_column(sample_records)

  guess_from_records(sample_records)
end
53
+
54
# Fetches JQL results slice by slice and feeds them to the page builder.
#
# Dates from the task are processed in groups of `task[:slice_range]`.
# In incremental mode, records whose incremental column value is less than
# or equal to `task[:latest_fetched_time]` are skipped and counted, and the
# largest value seen is carried into the task report. In preview mode only
# the first slice is fetched, through the small-dataset client call.
#
# @param task [Hash] task description (see #create_task for the keys)
# @param page_builder [#add, #finish] receiver of extracted rows
# @return [Hash] the task report when incremental and not previewing,
#   otherwise an empty hash
# @raise [Embulk::ConfigError] when incremental mode is on and the schema
#   has no column named after the incremental column, or when the result
#   shape is rejected by #validate_result
def ingest(task, page_builder)
  @dates = task[:dates]
  @schema = task[:schema]
  @timezone = task[:timezone]
  @incremental_column = task[:incremental_column]
  unless @incremental_column
    Embulk.logger.warn "incremental_column should be specified when running in incremental mode to avoid duplicated"
    Embulk.logger.warn "Use default value #{DEFAULT_TIME_COLUMN}"
    @incremental_column = DEFAULT_TIME_COLUMN
  end

  @incremental = task[:incremental]
  latest_fetched_time = task[:latest_fetched_time]

  client = create_client

  # Neither the schema nor the incremental column changes while records are
  # processed, so the membership check is done once instead of per record.
  # It is only evaluated in incremental mode, as before.
  incremental_column_in_schema =
    @incremental && @schema.map { |col| col["name"] }.include?(@incremental_column)

  ignored_fetched_record_count = 0
  next_fetched_time = latest_fetched_time
  @dates.each_slice(task[:slice_range]) do |slice_dates|
    Embulk.logger.info "Fetching date from #{slice_dates.first}..#{slice_dates.last}"
    if preview?
      records = client.send_jql_script_small_dataset(parameters(@config.param(:jql_script, :string, default: nil), slice_dates.first, slice_dates.last))
    else
      records = client.send_jql_script(parameters(task[:jql_script], slice_dates.first, slice_dates.last))
    end
    validate_result(records)
    records.each do |record|
      if @incremental
        unless incremental_column_in_schema
          raise Embulk::ConfigError.new("Missing Incremental Field (<incremental_column>) in the returned dataset. Specify the correct Incremental Field value.")
        end

        record_incremental_column = record[@incremental_column]
        if record_incremental_column
          if record_incremental_column <= latest_fetched_time
            # Already imported by a previous run.
            ignored_fetched_record_count += 1
            next
          end
          next_fetched_time = [record_incremental_column, next_fetched_time].max
        end
      end
      page_builder.add(extract_values(record))
    end
    # A preview only needs the first slice.
    break if preview?
  end
  Embulk.logger.info "Skip #{ignored_fetched_record_count} rows"
  page_builder.finish

  return create_task_report(next_fetched_time) if task[:incremental] && !preview?

  {}
end
109
+
110
# Computes the date range used to sample records for schema guessing.
#
# The configured `fetch_days` is capped at DEFAULT_FETCH_DAYS. When the
# generated range is empty, falls back to the default guess start date up
# to the day before today in the configured timezone.
#
# @return the generated range, or the fallback Range described above
def guess_range
  tz_name = @config.param(:timezone, :string, default: "")
  start_date = @config.param(:from_date, :string, default: default_guess_start_date(tz_name).to_s)
  requested_days = @config.param(:fetch_days, :integer, default: DEFAULT_FETCH_DAYS)
  capped_days = [requested_days, DEFAULT_FETCH_DAYS].min

  generated = RangeGenerator.new(start_date, capped_days, tz_name).generate_range
  return generated unless generated.empty?

  default_guess_start_date(tz_name)..(today(tz_name) - 1)
end
123
+
124
# Derives column definitions from sample JQL records.
#
# Each guessed column becomes a hash with `:name` and `:type`. For the
# "time" and "last_seen" columns the guessed format, when there is one, is
# added under the string key "format".
#
# @param sample_props [Array<Hash>] sample records returned by the script
# @return [Array<Hash>] one definition per guessed column
# @raise [Embulk::ConfigError] when the sample is rejected by
#   #validate_result or schema guessing raises DataError
def guess_from_records(sample_props)
  validate_result(sample_props)

  begin
    Guess::SchemaGuess.from_hash_records(sample_props).map do |col|
      result = {
        name: col.name,
        type: col.type,
      }
      # Compare the column *name*; comparing the column object itself to a
      # string never matches, which silently dropped the last_seen format.
      if %w[time last_seen].include?(col.name) && col.format
        result["format"] = col.format
      end
      result
    end
  rescue DataError
    raise Embulk::ConfigError.new("Non-supported result #{sample_props}. Revise your JQL.")
  end
end
143
+
144
# Assembles the argument hash passed to the client's JQL calls.
#
# @param script [String] the JQL script
# @param from_date first date of the window
# @param to_date last date of the window
# @return [Hash] `{ params: {from_date:, to_date:}, script: }`
def parameters(script, from_date, to_date)
  date_window = params(from_date, to_date)
  { params: date_window, script: script }
end
150
+
151
# Subtracts the configured timezone's UTC offset from a local epoch value.
#
# c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
#
# TZInfo::PeriodNotFound signals that the local time has no UTC equivalent
# (for example 02:30 on the day clocks jump from 02:00 to 03:00). In that
# case the value is moved forward by one hour and resolved again.
# Previously the shifted value was computed and thrown away, so the retry
# hit the same nonexistent time and the exception escaped.
#
# @param epoch [Integer, nil] epoch seconds expressed in @timezone local time
# @return [Integer, nil] the offset-adjusted value, or nil when `epoch` is
#   not present
# @raise whatever TZInfo::Timezone.get raises for an unknown @timezone
def adjust_timezone(epoch)
  tz = TZInfo::Timezone.get(@timezone)
  return nil unless epoch.present?

  begin
    epoch - tz.period_for_local(epoch, true).offset.utc_total_offset
  rescue TZInfo::PeriodNotFound
    # Skip over the gap: one hour later is a valid local time.
    shifted = epoch + 3600
    shifted - tz.period_for_local(shifted, true).offset.utc_total_offset
  end
end
170
+
171
# Builds the configuration values for the next incremental run from a
# finished run's task report.
#
# @param task_report [Hash] must contain `:to_date` (a date string) and
#   `:latest_fetched_time`
# @return [Hash] `:from_date` set to the parsed `:to_date` in ISO 8601
#   form, and `:latest_fetched_time` passed through unchanged
def next_from_date(task_report)
  resume_date = Date.parse(task_report[:to_date]).to_s
  { from_date: resume_date, latest_fetched_time: task_report[:latest_fetched_time] }
end
178
+
179
# Returns the JQL endpoint URL: the configured `jql_endpoint`, or the
# client's DEFAULT_JQL_ENDPOINT when none is configured.
def endpoint
  fallback = Embulk::Input::MixpanelApi::Client::DEFAULT_JQL_ENDPOINT
  @config.param(:jql_endpoint, :string, default: fallback)
end
182
+
183
+ private
184
+
185
# Builds the task report returned at the end of an incremental run.
#
# @param next_fetched_time the largest incremental column value seen
# @return [Hash] `:to_date` is the last entry of @dates, or the day before
#   today in @timezone when @dates has no last entry;
#   `:latest_fetched_time` is `next_fetched_time` as a string
def create_task_report(next_fetched_time)
  last_date = @dates.last || today(@timezone) - 1
  { to_date: last_date, latest_fetched_time: next_fetched_time.to_s }
end
191
+
192
# Wraps a date window in the hash shape sent alongside the JQL script.
#
# @param from_date first date of the window
# @param to_date last date of the window
# @return [Hash] `{ from_date:, to_date: }`
def params(from_date, to_date)
  { from_date: from_date, to_date: to_date }
end
198
+
199
# Extracts one column value from a JQL record.
#
# The "time", "last_seen" and incremental columns are treated as epoch
# milliseconds: a positive value is divided by 1000 (integer division for
# Integer input) and passed through #adjust_timezone; a non-positive value
# is returned untouched; a missing value yields nil. NOT_PROPERTY_COLUMN
# and every other column are returned as stored in the record.
#
# The three timestamp columns previously had three identical copies of this
# logic; they now share a single branch.
#
# @param record [Hash] one record returned by the JQL script
# @param name [String] column name to extract
# @return the extracted value, or nil when it is absent
def extract_value(record, name)
  case name
  when NOT_PROPERTY_COLUMN
    record[NOT_PROPERTY_COLUMN]
  when "time", "last_seen", @incremental_column
    millis = record[name]
    return nil unless millis.present?

    millis > 0 ? adjust_timezone(millis / 1000) : millis
  else
    record[name]
  end
end
239
+
240
# Rejects JQL results that are an array of integers.
#
# A script ending in a reduce step returns only counts, which cannot be
# mapped to rows.
#
# @param records the value returned by the JQL call
# @return [nil] when the result is acceptable
# @raise [Embulk::ConfigError] when `records` is an Array whose first
#   element is an Integer
def validate_result(records)
  return unless records.is_a?(Array) && records.first.is_a?(Integer)

  raise Embulk::ConfigError.new("Non-supported result. Revise your JQL.")
end
246
+
247
# Checks that sample records carry the incremental column.
#
# When no incremental column is set, logs two warnings and falls back to
# DEFAULT_TIME_COLUMN. Only the first record is inspected, and only when
# incremental mode is on and at least one record exists.
#
# @param records sample records returned by the JQL call
# @return [nil] when the check passes or does not apply
# @raise [Embulk::ConfigError] when the first record lacks the column
def validate_result_contain_incremental_column(records)
  unless @incremental_column
    Embulk.logger.warn "incremental_column should be specified when running in incremental mode to avoid duplicated"
    Embulk.logger.warn "Use default value #{DEFAULT_TIME_COLUMN}"
    @incremental_column = DEFAULT_TIME_COLUMN
  end

  return unless @incremental
  return unless records.length > 0
  return if records[0].include?(@incremental_column)

  raise Embulk::ConfigError.new("Missing Incremental Field (<incremental_column>) in the returned dataset. Specify the correct Incremental Field value.")
end
258
+
259
# Ensures a JQL script has been configured.
#
# @return [nil] when the script is present
# @raise [Embulk::ConfigError] when `jql_script` is blank
def validate_jql_script
  script = @config.param(:jql_script, :string, default: nil)
  raise Embulk::ConfigError.new("JQL script shouldn't be empty or null") if script.blank?
end
265
+
266
# Ensures `fetch_days`, when configured, is a positive number.
#
# @return [nil] when `fetch_days` is unset or greater than zero
# @raise [Embulk::ConfigError] when `fetch_days` is zero or negative
def validate_fetch_days
  days = @config.param(:fetch_days, :integer, default: nil)
  return unless days

  raise Embulk::ConfigError.new("fetch_days should be larger than 0") if days <= 0
end
272
+
273
+ end
274
+ end
275
+ end
276
+ end
@@ -9,7 +9,7 @@ class TimezoneValidator
9
9
  TZInfo::Timezone.get(@timezone)
10
10
  rescue => e
11
11
  Embulk.logger.error "'#{@timezone}' is invalid timezone"
12
- raise Embulk::ConfigError.new e.message
12
+ raise Embulk::ConfigError.new ("Fail to identify timezone from '#{@timezone}':#{e.message}.")
13
13
  end
14
14
  end
15
15
  end
@@ -9,22 +9,23 @@ module Embulk
9
9
  include OverrideAssertRaise
10
10
 
11
11
  API_SECRET = "api_secret".freeze
12
+ EXPORT_ENDPOINT = Embulk::Input::MixpanelApi::Client::DEFAULT_EXPORT_ENDPOINT
12
13
 
13
14
  def setup
14
- @client = Client.new(API_SECRET)
15
+ @client = Client.new(API_SECRET, EXPORT_ENDPOINT)
15
16
  stub(Embulk).logger { ::Logger.new(IO::NULL) }
16
17
  end
17
18
 
18
19
  class TestKeepAlive < self
19
20
  def test_tcp_keepalive_enabled
20
- client = Client.new(API_SECRET)
21
+ client = Client.new(API_SECRET, EXPORT_ENDPOINT)
21
22
  assert client.send(:httpclient).tcp_keepalive
22
23
  end
23
24
  end
24
25
 
25
26
  class TryToDatesTest < self
26
27
  def setup
27
- @client = Client.new(API_SECRET)
28
+ @client = Client.new(API_SECRET, EXPORT_ENDPOINT)
28
29
  end
29
30
 
30
31
 
@@ -101,7 +102,6 @@ module Embulk
101
102
 
102
103
  def test_export_partial_with_error_json
103
104
  stub_client
104
- # stub(@client).set_signatures(anything) {}
105
105
  stub_response(Struct.new(:code, :body).new(200, jsonl_dummy_responses+"\n{\"error\":"))
106
106
  records = []
107
107
  assert_raise MixpanelApi::IncompleteExportResponseError do
@@ -130,15 +130,6 @@ module Embulk
130
130
  end
131
131
  end
132
132
 
133
- def test_retry_for_429_temporary_fail
134
- stub_client
135
- stub_response(failure_response(429))
136
-
137
- assert_raise(RuntimeError) do
138
- @client.export(params)
139
- end
140
- end
141
-
142
133
  class ExportSmallDataset < self
143
134
  def test_to_date_after_1_day
144
135
  to = (Date.parse(params["from_date"]) + 1).to_s
@@ -147,15 +138,6 @@ module Embulk
147
138
  @client.export_for_small_dataset(params)
148
139
  end
149
140
 
150
- def test_retry_for_429_temporary_fail
151
- stub_client
152
- stub_response(failure_response(429))
153
-
154
- assert_raise(RuntimeError) do
155
- @client.export_for_small_dataset(params)
156
- end
157
- end
158
-
159
141
  def test_to_date_after_1_day_after_10_days_if_empty
160
142
  stub_client
161
143
  to1 = (Date.parse(params["from_date"]) + 1).to_s
@@ -1,12 +1,14 @@
1
1
  require "prepare_embulk"
2
2
  require "override_assert_raise"
3
3
  require "embulk/input/mixpanel"
4
+ require "embulk/input/service/base_service"
5
+ require "embulk/input/service/export_service"
4
6
  require "active_support/core_ext/time"
5
7
  require "json"
6
8
 
7
9
  module Embulk
8
10
  module Input
9
- class MixpanelTest < Test::Unit::TestCase
11
+ class ExportServiceTest < Test::Unit::TestCase
10
12
  include OverrideAssertRaise
11
13
 
12
14
  API_SECRET = "api_secret".freeze
@@ -72,6 +74,7 @@ module Embulk
72
74
  type: "mixpanel",
73
75
  api_secret: API_SECRET,
74
76
  from_date: FROM_DATE,
77
+ timezone: TIMEZONE,
75
78
  }
76
79
 
77
80
  stub_export_all
@@ -90,7 +93,7 @@ module Embulk
90
93
  }
91
94
 
92
95
  stub_export_all
93
- mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date(TIMEZONE).to_s}/)
96
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Embulk::Input::Service::ExportService.new(config).default_guess_start_date(TIMEZONE).to_s}/)
94
97
 
95
98
  Mixpanel.guess(embulk_config(config))
96
99
  end
@@ -101,6 +104,7 @@ module Embulk
101
104
  type: "mixpanel",
102
105
  api_secret: API_SECRET,
103
106
  from_date: from_date,
107
+ timezone: TIMEZONE,
104
108
  }
105
109
 
106
110
  stub_export_all
@@ -113,11 +117,11 @@ module Embulk
113
117
  config = {
114
118
  type: "mixpanel",
115
119
  api_secret: API_SECRET,
116
- timezone: TIMEZONE
120
+ timezone: TIMEZONE,
117
121
  }
118
122
 
119
123
  stub_export_all
120
- mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date(TIMEZONE).to_s}/)
124
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Embulk::Input::Service::ExportService.new(config).default_guess_start_date(TIMEZONE).to_s}/)
121
125
 
122
126
  Mixpanel.guess(embulk_config(config))
123
127
  end
@@ -126,7 +130,14 @@ module Embulk
126
130
  sample_records = records.map do |r|
127
131
  r.merge("properties" => {"time" => 1, "array" => [1, 2], "hash" => {foo: "FOO"}})
128
132
  end
129
- actual = Mixpanel.guess_from_records(sample_records)
133
+
134
+ config = {
135
+ type: "mixpanel",
136
+ api_secret: API_SECRET,
137
+ timezone: TIMEZONE,
138
+ }
139
+
140
+ actual = Embulk::Input::Service::ExportService.new(config).guess_from_records(sample_records)
130
141
  assert actual.include?(name: "array", type: :json)
131
142
  assert actual.include?(name: "hash", type: :json)
132
143
  end
@@ -136,6 +147,7 @@ module Embulk
136
147
  config = {
137
148
  type: "mixpanel",
138
149
  api_secret: API_SECRET,
150
+ timezone: TIMEZONE,
139
151
  }
140
152
 
141
153
  assert_raise(Embulk::DataError) do
@@ -456,7 +468,7 @@ module Embulk
456
468
  where: 'properties["$os"] == "Windows"',
457
469
  bucket: "987",
458
470
  }
459
- actual = Mixpanel.export_params(config)
471
+ actual = Embulk::Input::Service::ExportService.new(config).export_params
460
472
 
461
473
  assert_equal(expected, actual)
462
474
  end
@@ -534,7 +546,7 @@ module Embulk
534
546
  timezone: TIMEZONE,
535
547
  schema: schema,
536
548
  dates: DATES.to_a.map(&:to_s),
537
- params: Mixpanel.export_params(embulk_config),
549
+ params: Mixpanel.service(embulk_config).export_params,
538
550
  fetch_unknown_columns: false,
539
551
  fetch_custom_properties: false,
540
552
  retry_initial_wait_sec: 0,
@@ -558,11 +570,16 @@ module Embulk
558
570
  super
559
571
  @page_builder = Object.new
560
572
  @plugin = Mixpanel.new(task, nil, nil, @page_builder)
561
- stub(@plugin).fetch { records }
573
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
574
+ stub(klass).fetch { records }
575
+ end
576
+ # Embulk::Input::Service::ExportService.(:fetch => :records)
562
577
  end
563
578
 
564
579
  def test_preview
565
- stub(@plugin).preview? { true }
580
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
581
+ stub(klass).preview? { true }
582
+ end
566
583
  mock(@page_builder).add(anything).times(records.length)
567
584
  mock(@page_builder).finish
568
585
 
@@ -570,7 +587,9 @@ module Embulk
570
587
  end
571
588
 
572
589
  def test_run
573
- stub(@plugin).preview? { false }
590
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
591
+ stub(klass).preview? { false }
592
+ end
574
593
  mock(@page_builder).add(anything).times(records.length * 2)
575
594
  mock(@page_builder).finish
576
595
 
@@ -578,13 +597,16 @@ module Embulk
578
597
  end
579
598
 
580
599
  def test_timezone
581
- stub(@plugin).preview? { false }
600
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
601
+ stub(klass).preview? { false }
602
+ end
582
603
  adjusted = record_epoch - timezone_offset_seconds
583
604
  mock(@page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
584
605
  mock(@page_builder).finish
585
606
 
586
607
  @plugin.run
587
608
  end
609
+
588
610
  class PartialRunTest < self
589
611
  def setup_client
590
612
  any_instance_of(MixpanelApi::Client) do |klass|
@@ -598,7 +620,9 @@ module Embulk
598
620
 
599
621
  def test_run_with_allow_partial_false
600
622
  @plugin = Mixpanel.new(task.merge(allow_partial_import: false), nil, nil, @page_builder)
601
- stub(@plugin).preview? {false}
623
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
624
+ stub(klass).preview? { false }
625
+ end
602
626
  assert_raise MixpanelApi::IncompleteExportResponseError do
603
627
  @plugin.run
604
628
  end
@@ -607,19 +631,25 @@ module Embulk
607
631
  def test_run_with_allow_partial_true
608
632
  @plugin = Mixpanel.new(task.merge(allow_partial_import: true), nil, nil, @page_builder)
609
633
  mock(@page_builder).finish
610
- stub(@plugin).preview? {false}
634
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
635
+ stub(klass).preview? { false }
636
+ end
611
637
  @plugin.run
612
638
  end
613
639
  end
640
+
614
641
  class SliceRangeRunTest < self
615
642
 
616
643
  def test_default_slice_range
617
644
  plugin = Mixpanel.new(task.merge(slice_range: 2), nil, nil, @page_builder)
618
- stub(plugin).preview? {false}
619
- stub(plugin).fetch(["2015-02-22", "2015-02-23"],0){[]}
620
- stub(plugin).fetch(["2015-02-24", "2015-02-25"],0){[]}
621
- stub(plugin).fetch(["2015-02-26", "2015-02-27"],0){[]}
622
- stub(plugin).fetch(["2015-02-28", "2015-03-01"],0){[]}
645
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
646
+ stub(klass).preview? { false }
647
+ stub(klass).fetch(["2015-02-22", "2015-02-23"],0,anything){[]}
648
+ stub(klass).fetch(["2015-02-24", "2015-02-25"],0,anything){[]}
649
+ stub(klass).fetch(["2015-02-26", "2015-02-27"],0,anything){[]}
650
+ stub(klass).fetch(["2015-02-28", "2015-03-01"],0,anything){[]}
651
+ end
652
+
623
653
  mock(@page_builder).finish
624
654
  plugin.run
625
655
  end
@@ -663,8 +693,10 @@ module Embulk
663
693
 
664
694
  def test_incremental_column_with_where
665
695
  page_builder = Object.new
666
- plugin = Mixpanel.new(task.merge(params: task[:params].merge("where" => "abc==def"),latest_fetched_time: 1), nil, nil, page_builder)
667
- stub(plugin).preview? {false}
696
+ plugin = Mixpanel.new(task.merge(params: task[:params].merge(where: "abc==def"),latest_fetched_time: 1), nil, nil, page_builder)
697
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
698
+ stub(klass).preview? { false }
699
+ end
668
700
  adjusted = record_epoch - timezone_offset_seconds
669
701
  mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
670
702
  mock(page_builder).finish
@@ -719,8 +751,10 @@ module Embulk
719
751
  def setup
720
752
  super
721
753
  @page_builder = Object.new
722
- @plugin = Mixpanel.new(task, nil, nil, @page_builder)
723
- stub(@plugin).fetch { [record] }
754
+ @plugin = Mixpanel.new(DataSource[task.to_a], nil, nil, @page_builder)
755
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
756
+ stub(klass).fetch { [record] }
757
+ end
724
758
  end
725
759
 
726
760
  def test_run
@@ -770,8 +804,10 @@ module Embulk
770
804
  class UnknownColumnsTest < self
771
805
  def setup
772
806
  @page_builder = Object.new
773
- @plugin = Mixpanel.new(task, nil, nil, @page_builder)
774
- stub(@plugin).fetch { records }
807
+ @plugin = Mixpanel.new(DataSource[task.to_a], nil, nil, @page_builder)
808
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
809
+ stub(klass).fetch { records }
810
+ end
775
811
  end
776
812
 
777
813
  def test_run
@@ -840,7 +876,7 @@ module Embulk
840
876
  incremental_column: nil,
841
877
  schema: schema,
842
878
  dates: DATES.to_a.map(&:to_s),
843
- params: Mixpanel.export_params(embulk_config),
879
+ params: Mixpanel.service(embulk_config).export_params,
844
880
  fetch_unknown_columns: false,
845
881
  fetch_custom_properties: false,
846
882
  retry_initial_wait_sec: 2,
@@ -880,6 +916,7 @@ module Embulk
880
916
  api_secret: API_SECRET,
881
917
  from_date: FROM_DATE,
882
918
  fetch_days: DAYS,
919
+ timezone: TIMEZONE,
883
920
  fetch_unknown_columns: false,
884
921
  fetch_custom_properties: false,
885
922
  retry_initial_wait_sec: 2,