embulk-input-mixpanel 0.5.15 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,276 @@
1
+ require 'embulk/input/service/base_service'
2
+
3
+ module Embulk
4
+ module Input
5
+ module Service
6
+ class JqlService < BaseService
7
+
8
# String constants holding the dotted names "params.from_date" and "params.to_date".
# No use of either constant is visible in this class -- NOTE(review): confirm they are still needed.
+ FROM_DATE_PARAM = "params.from_date"
9
+ TO_DATE_PARAM = "params.to_date"
10
+
11
# Validates the plugin configuration.
#
# Runs the inherited validation first (via +super+), then the two checks
# defined in this class: the JQL script and the fetch_days value.
# Each check raises on failure.
def validate_config
  super
  validate_jql_script
  validate_fetch_days
end
17
+
18
# Builds the task hash for a JQL run from the plugin configuration.
#
# Only +api_secret+ and +columns+ are read without a default; every other
# key falls back to the default shown below. +jql_endpoint+ and +dates+
# come from the +endpoint+ and +range+ helpers.
#
# @return [Hash] task settings keyed by symbol
def create_task
  {
    timezone: @config.param(:timezone, :string, default: ""),
    api_secret: @config.param(:api_secret, :string),
    jql_endpoint: endpoint,
    dates: range,
    incremental: @config.param(:incremental, :bool, default: true),
    slice_range: @config.param(:slice_range, :integer, default: 7),
    schema: @config.param(:columns, :array),
    retry_initial_wait_sec: @config.param(:retry_initial_wait_sec, :integer, default: 1),
    retry_limit: @config.param(:retry_limit, :integer, default: 5),
    incremental_column: @config.param(:incremental_column, :string, default: nil),
    latest_fetched_time: @config.param(:latest_fetched_time, :integer, default: 0),
    jql_mode: true,
    # The third argument is an options hash in every other call here; a bare
    # positional nil is not a default, so pass it as `default: nil`.
    jql_script: @config.param(:jql_script, :string, default: nil)
  }
end
35
+
36
# Guesses the output schema by running the configured JQL script over a
# small sample date range.
#
# Steps: bail out if Mixpanel is down, pick the range with +guess_range+,
# fetch sample records through the client's small-dataset call, validate
# them (shape and presence of the incremental column), then hand them to
# +guess_from_records+.
#
# Side effects: sets @incremental and @incremental_column from the config.
#
# @return whatever +guess_from_records+ returns for the sample
def guess_columns
  giveup_when_mixpanel_is_down
  range = guess_range
  Embulk.logger.info "Guessing schema using #{range.first}..#{range.last}"

  client = create_client

  # Pass the default as an options hash (`default: nil`), matching every
  # other @config.param call; the original passed a bare positional nil.
  jql_script = @config.param(:jql_script, :string, default: nil)
  sample_records = client.send_jql_script_small_dataset(parameters(jql_script, range.first, range.last))

  validate_result(sample_records)

  @incremental = @config.param(:incremental, :bool, default: true)
  @incremental_column = @config.param(:incremental_column, :string, default: nil)
  validate_result_contain_incremental_column(sample_records)

  guess_from_records(sample_records)
end
53
+
54
# Runs the JQL script slice by slice over the task's dates and feeds the
# resulting records to the page builder.
#
# In incremental mode a record whose incremental-column value is less than
# or equal to task[:latest_fetched_time] is skipped; otherwise the largest
# value seen is tracked for the task report. Records without a value in
# that column are always emitted. In preview mode only the first slice is
# fetched, through the small-dataset client call.
#
# @param task [Hash] task settings as produced by create_task
# @param page_builder [#add, #finish] receives one value array per record
# @return [Hash] the task report when incremental and not previewing,
#   otherwise an empty hash
# @raise [Embulk::ConfigError] when incremental mode is on and the schema
#   does not list the incremental column (raised on the first record)
def ingest(task, page_builder)
  @dates = task[:dates]
  @schema = task[:schema]
  @timezone = task[:timezone]
  @incremental_column = task[:incremental_column]
  unless @incremental_column
    Embulk.logger.warn "incremental_column should be specified when running in incremental mode to avoid duplicated"
    Embulk.logger.warn "Use default value #{DEFAULT_TIME_COLUMN}"
    @incremental_column = DEFAULT_TIME_COLUMN
  end

  @incremental = task[:incremental]
  latest_fetched_time = task[:latest_fetched_time]

  client = create_client

  # The schema and the incremental column are fixed for the whole run, so
  # decide once whether the column is part of the schema instead of
  # rebuilding the list of column names for every record.
  incremental_column_in_schema =
    @incremental && @schema.map { |col| col["name"] }.include?(@incremental_column)

  ignored_fetched_record_count = 0
  next_fetched_time = latest_fetched_time
  @dates.each_slice(task[:slice_range]) do |slice_dates|
    Embulk.logger.info "Fetching date from #{slice_dates.first}..#{slice_dates.last}"
    records =
      if preview?
        client.send_jql_script_small_dataset(parameters(@config.param(:jql_script, :string, default: nil), slice_dates.first, slice_dates.last))
      else
        client.send_jql_script(parameters(task[:jql_script], slice_dates.first, slice_dates.last))
      end
    validate_result(records)

    records.each do |record|
      if @incremental
        unless incremental_column_in_schema
          raise Embulk::ConfigError.new("Missing Incremental Field (<incremental_column>) in the returned dataset. Specify the correct Incremental Field value.")
        end

        record_incremental_column = record[@incremental_column]
        if record_incremental_column
          if record_incremental_column <= latest_fetched_time
            # Already imported by an earlier run.
            ignored_fetched_record_count += 1
            next
          end
          next_fetched_time = [record_incremental_column, next_fetched_time].max
        end
      end

      page_builder.add(extract_values(record))
    end

    # A preview only needs the first slice.
    break if preview?
  end
  Embulk.logger.info "Skip #{ignored_fetched_record_count} rows"
  page_builder.finish

  return create_task_report(next_fetched_time) if task[:incremental] && !preview?

  {}
end
109
+
110
# Picks the date range used for schema guessing.
#
# Reads timezone, from_date and fetch_days from the config, caps
# fetch_days at DEFAULT_FETCH_DAYS and asks RangeGenerator for the range.
# When that range is empty, falls back to the default guess start date up
# to yesterday (today - 1) in the configured timezone.
#
# @return the generated range, or a Range of dates for the fallback
def guess_range
  tz = @config.param(:timezone, :string, default: "")
  start_date = @config.param(:from_date, :string, default: default_guess_start_date(tz).to_s)
  requested_days = @config.param(:fetch_days, :integer, default: DEFAULT_FETCH_DAYS)
  capped_days = [requested_days, DEFAULT_FETCH_DAYS].min

  generated = RangeGenerator.new(start_date, capped_days, tz).generate_range
  return generated unless generated.empty?

  default_guess_start_date(tz)..(today(tz) - 1)
end
123
+
124
# Turns sample JQL records into a list of column definitions.
#
# Each guessed column becomes a hash with :name and :type. The "time" and
# "last_seen" columns additionally carry their guessed "format" when one
# is available.
#
# @param sample_props [Array<Hash>] sample records returned by the script
# @return [Array<Hash>] one column definition per guessed column
# @raise [Embulk::ConfigError] when the result shape is not supported
def guess_from_records(sample_props)
  validate_result(sample_props)

  begin
    schema = Guess::SchemaGuess.from_hash_records(sample_props)
    schema.map do |col|
      result = {
        name: col.name,
        type: col.type,
      }
      # Compare the column *name* for both timestamp columns; comparing the
      # column object itself to "last_seen" never matches.
      if (col.name.eql? "time") || (col.name.eql? "last_seen")
        result["format"] = col.format if col.format
      end
      result
    end
  rescue DataError
    raise Embulk::ConfigError.new("Non-supported result #{sample_props}. Revise your JQL.")
  end
end
143
+
144
# Builds the request payload for a JQL call.
#
# @param script the JQL script, passed through unchanged
# @param from_date first date of the slice
# @param to_date last date of the slice
# @return [Hash] with :params (the date hash from +params+) and :script
def parameters(script, from_date, to_date)
  date_params = params(from_date, to_date)
  { params: date_params, script: script }
end
150
+
151
# Converts an epoch expressed in the project's local timezone (@timezone)
# into a UTC epoch by subtracting the zone's UTC offset for that instant.
# c.f. https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel#export
#
# @param epoch [Integer, nil] local-time epoch in seconds
# @return [Integer, nil] the adjusted epoch, or nil when epoch is not present
def adjust_timezone(epoch)
  tz = TZInfo::Timezone.get(@timezone)

  begin
    if epoch.present?
      offset = tz.period_for_local(epoch, true).offset.utc_total_offset
      epoch - offset
    end
  # TZInfo::PeriodNotFound signals that there is no equivalent UTC time (for
  # example, during the transition from standard time to daylight savings
  # time when the clocks are moved forward and an hour is skipped),
  # e.g. tz.local_time(2018, 3, 11, 2, 30, 0, 0).
  rescue TZInfo::PeriodNotFound
    # Step one hour (3600 s) forward, out of the skipped period, and retry
    # with the shifted value. The shift has to be assigned: a bare
    # `epoch + 1.hour` leaves epoch untouched and the retry fails again.
    epoch += 3600
    offset = tz.period_for_local(epoch, true).offset.utc_total_offset
    epoch - offset
  end
end
170
+
171
# Derives the settings for the next run from a finished run's task report.
#
# @param task_report [Hash] must hold :to_date (parsed with Date.parse)
#   and may hold :latest_fetched_time
# @return [Hash] :from_date as an ISO date string and the report's
#   :latest_fetched_time passed through unchanged
def next_from_date(task_report)
  resume_date = Date.parse(task_report[:to_date])
  { from_date: resume_date.to_s, latest_fetched_time: task_report[:latest_fetched_time] }
end
178
+
179
# Returns the JQL endpoint: the configured :jql_endpoint, or the client's
# DEFAULT_JQL_ENDPOINT when none is configured.
def endpoint
  fallback = Embulk::Input::MixpanelApi::Client::DEFAULT_JQL_ENDPOINT
  @config.param(:jql_endpoint, :string, default: fallback)
end
182
+
183
+ private
184
+
185
# Builds the task report returned after an incremental run.
#
# @param next_fetched_time the largest incremental value seen; stored as
#   a string via to_s
# @return [Hash] :to_date is the last entry of @dates, or yesterday
#   (today - 1 in @timezone) when @dates has no last entry
def create_task_report(next_fetched_time)
  last_date = @dates.last
  last_date = today(@timezone) - 1 unless last_date
  { to_date: last_date, latest_fetched_time: next_fetched_time.to_s }
end
191
+
192
# Wraps a slice's boundaries into the date hash sent along with the script.
#
# @return [Hash] { from_date:, to_date: } with both values unchanged
def params(from_date, to_date)
  { from_date: from_date, to_date: to_date }
end
198
+
199
# Reads one column's value out of a record.
#
# - NOT_PROPERTY_COLUMN is returned as stored.
# - "time", "last_seen" and the incremental column are treated alike:
#   nil when the value is not present, the raw value when it is not
#   positive, otherwise the value divided by 1000 (the source comment says
#   these are in ms) and passed through adjust_timezone.
# - Any other column is returned as stored.
#
# @param record [Hash] one record returned by the JQL script
# @param name the column name to extract
def extract_value(record, name)
  case name
  when NOT_PROPERTY_COLUMN
    record[NOT_PROPERTY_COLUMN]
  when "time", "last_seen", @incremental_column
    raw = record[name]
    return nil unless raw.present?
    return raw unless raw > 0

    adjust_timezone(raw / 1000)
  else
    record[name]
  end
end
239
+
240
# Rejects script results that are an Array whose first element is an
# Integer (per the source comment, what a reduce returns: only a count).
#
# @param records the value returned by the JQL call
# @return [nil] when the result is acceptable
# @raise [Embulk::ConfigError] for an unsupported result
def validate_result(records)
  return unless records.is_a?(Array)
  return unless records.first.is_a?(Integer)

  raise Embulk::ConfigError.new("Non-supported result. Revise your JQL.")
end
246
+
247
# Checks that the sample result carries the incremental column.
#
# When @incremental_column is unset, two warnings are logged and it is set
# to DEFAULT_TIME_COLUMN. The check itself only applies when @incremental
# is truthy and there is at least one record; only the first record is
# inspected.
#
# @param records sample records returned by the JQL script
# @return [nil] when the check passes or does not apply
# @raise [Embulk::ConfigError] when the first record lacks the column
def validate_result_contain_incremental_column(records)
  if !@incremental_column
    Embulk.logger.warn "incremental_column should be specified when running in incremental mode to avoid duplicated"
    Embulk.logger.warn "Use default value #{DEFAULT_TIME_COLUMN}"
    @incremental_column = DEFAULT_TIME_COLUMN
  end

  return unless @incremental
  return unless records.length > 0
  return if records[0].include?(@incremental_column)

  raise Embulk::ConfigError.new("Missing Incremental Field (<incremental_column>) in the returned dataset. Specify the correct Incremental Field value.")
end
258
+
259
# Ensures a JQL script is configured.
#
# @return [nil] when the script is not blank
# @raise [Embulk::ConfigError] when :jql_script is missing or blank
def validate_jql_script
  script = @config.param(:jql_script, :string, default: nil)
  return unless script.blank?

  raise Embulk::ConfigError.new("JQL script shouldn't be empty or null")
end
265
+
266
# Ensures fetch_days, when configured, is a positive number.
#
# @return [nil] when fetch_days is absent or greater than 0
# @raise [Embulk::ConfigError] when fetch_days is 0 or negative
def validate_fetch_days
  days = @config.param(:fetch_days, :integer, default: nil)
  return unless days

  raise Embulk::ConfigError.new("fetch_days should be larger than 0") if days <= 0
end
272
+
273
+ end
274
+ end
275
+ end
276
+ end
@@ -9,7 +9,7 @@ class TimezoneValidator
9
9
  TZInfo::Timezone.get(@timezone)
10
10
  rescue => e
11
11
  Embulk.logger.error "'#{@timezone}' is invalid timezone"
12
- raise Embulk::ConfigError.new e.message
12
+ raise Embulk::ConfigError.new ("Fail to identify timezone from '#{@timezone}':#{e.message}.")
13
13
  end
14
14
  end
15
15
  end
@@ -9,22 +9,23 @@ module Embulk
9
9
  include OverrideAssertRaise
10
10
 
11
11
  API_SECRET = "api_secret".freeze
12
+ EXPORT_ENDPOINT = Embulk::Input::MixpanelApi::Client::DEFAULT_EXPORT_ENDPOINT
12
13
 
13
14
  def setup
14
- @client = Client.new(API_SECRET)
15
+ @client = Client.new(API_SECRET, EXPORT_ENDPOINT)
15
16
  stub(Embulk).logger { ::Logger.new(IO::NULL) }
16
17
  end
17
18
 
18
19
  class TestKeepAlive < self
19
20
  def test_tcp_keepalive_enabled
20
- client = Client.new(API_SECRET)
21
+ client = Client.new(API_SECRET, EXPORT_ENDPOINT)
21
22
  assert client.send(:httpclient).tcp_keepalive
22
23
  end
23
24
  end
24
25
 
25
26
  class TryToDatesTest < self
26
27
  def setup
27
- @client = Client.new(API_SECRET)
28
+ @client = Client.new(API_SECRET, EXPORT_ENDPOINT)
28
29
  end
29
30
 
30
31
 
@@ -101,7 +102,6 @@ module Embulk
101
102
 
102
103
  def test_export_partial_with_error_json
103
104
  stub_client
104
- # stub(@client).set_signatures(anything) {}
105
105
  stub_response(Struct.new(:code, :body).new(200, jsonl_dummy_responses+"\n{\"error\":"))
106
106
  records = []
107
107
  assert_raise MixpanelApi::IncompleteExportResponseError do
@@ -130,15 +130,6 @@ module Embulk
130
130
  end
131
131
  end
132
132
 
133
- def test_retry_for_429_temporary_fail
134
- stub_client
135
- stub_response(failure_response(429))
136
-
137
- assert_raise(RuntimeError) do
138
- @client.export(params)
139
- end
140
- end
141
-
142
133
  class ExportSmallDataset < self
143
134
  def test_to_date_after_1_day
144
135
  to = (Date.parse(params["from_date"]) + 1).to_s
@@ -147,15 +138,6 @@ module Embulk
147
138
  @client.export_for_small_dataset(params)
148
139
  end
149
140
 
150
- def test_retry_for_429_temporary_fail
151
- stub_client
152
- stub_response(failure_response(429))
153
-
154
- assert_raise(RuntimeError) do
155
- @client.export_for_small_dataset(params)
156
- end
157
- end
158
-
159
141
  def test_to_date_after_1_day_after_10_days_if_empty
160
142
  stub_client
161
143
  to1 = (Date.parse(params["from_date"]) + 1).to_s
@@ -1,12 +1,14 @@
1
1
  require "prepare_embulk"
2
2
  require "override_assert_raise"
3
3
  require "embulk/input/mixpanel"
4
+ require "embulk/input/service/base_service"
5
+ require "embulk/input/service/export_service"
4
6
  require "active_support/core_ext/time"
5
7
  require "json"
6
8
 
7
9
  module Embulk
8
10
  module Input
9
- class MixpanelTest < Test::Unit::TestCase
11
+ class ExportServiceTest < Test::Unit::TestCase
10
12
  include OverrideAssertRaise
11
13
 
12
14
  API_SECRET = "api_secret".freeze
@@ -72,6 +74,7 @@ module Embulk
72
74
  type: "mixpanel",
73
75
  api_secret: API_SECRET,
74
76
  from_date: FROM_DATE,
77
+ timezone: TIMEZONE,
75
78
  }
76
79
 
77
80
  stub_export_all
@@ -90,7 +93,7 @@ module Embulk
90
93
  }
91
94
 
92
95
  stub_export_all
93
- mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date(TIMEZONE).to_s}/)
96
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Embulk::Input::Service::ExportService.new(config).default_guess_start_date(TIMEZONE).to_s}/)
94
97
 
95
98
  Mixpanel.guess(embulk_config(config))
96
99
  end
@@ -101,6 +104,7 @@ module Embulk
101
104
  type: "mixpanel",
102
105
  api_secret: API_SECRET,
103
106
  from_date: from_date,
107
+ timezone: TIMEZONE,
104
108
  }
105
109
 
106
110
  stub_export_all
@@ -113,11 +117,11 @@ module Embulk
113
117
  config = {
114
118
  type: "mixpanel",
115
119
  api_secret: API_SECRET,
116
- timezone: TIMEZONE
120
+ timezone: TIMEZONE,
117
121
  }
118
122
 
119
123
  stub_export_all
120
- mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Mixpanel.default_guess_start_date(TIMEZONE).to_s}/)
124
+ mock(Embulk.logger).info(/Guessing.*#{Regexp.escape Embulk::Input::Service::ExportService.new(config).default_guess_start_date(TIMEZONE).to_s}/)
121
125
 
122
126
  Mixpanel.guess(embulk_config(config))
123
127
  end
@@ -126,7 +130,14 @@ module Embulk
126
130
  sample_records = records.map do |r|
127
131
  r.merge("properties" => {"time" => 1, "array" => [1, 2], "hash" => {foo: "FOO"}})
128
132
  end
129
- actual = Mixpanel.guess_from_records(sample_records)
133
+
134
+ config = {
135
+ type: "mixpanel",
136
+ api_secret: API_SECRET,
137
+ timezone: TIMEZONE,
138
+ }
139
+
140
+ actual = Embulk::Input::Service::ExportService.new(config).guess_from_records(sample_records)
130
141
  assert actual.include?(name: "array", type: :json)
131
142
  assert actual.include?(name: "hash", type: :json)
132
143
  end
@@ -136,6 +147,7 @@ module Embulk
136
147
  config = {
137
148
  type: "mixpanel",
138
149
  api_secret: API_SECRET,
150
+ timezone: TIMEZONE,
139
151
  }
140
152
 
141
153
  assert_raise(Embulk::DataError) do
@@ -456,7 +468,7 @@ module Embulk
456
468
  where: 'properties["$os"] == "Windows"',
457
469
  bucket: "987",
458
470
  }
459
- actual = Mixpanel.export_params(config)
471
+ actual = Embulk::Input::Service::ExportService.new(config).export_params
460
472
 
461
473
  assert_equal(expected, actual)
462
474
  end
@@ -534,7 +546,7 @@ module Embulk
534
546
  timezone: TIMEZONE,
535
547
  schema: schema,
536
548
  dates: DATES.to_a.map(&:to_s),
537
- params: Mixpanel.export_params(embulk_config),
549
+ params: Mixpanel.service(embulk_config).export_params,
538
550
  fetch_unknown_columns: false,
539
551
  fetch_custom_properties: false,
540
552
  retry_initial_wait_sec: 0,
@@ -558,11 +570,16 @@ module Embulk
558
570
  super
559
571
  @page_builder = Object.new
560
572
  @plugin = Mixpanel.new(task, nil, nil, @page_builder)
561
- stub(@plugin).fetch { records }
573
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
574
+ stub(klass).fetch { records }
575
+ end
576
+ # Embulk::Input::Service::ExportService.(:fetch => :records)
562
577
  end
563
578
 
564
579
  def test_preview
565
- stub(@plugin).preview? { true }
580
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
581
+ stub(klass).preview? { true }
582
+ end
566
583
  mock(@page_builder).add(anything).times(records.length)
567
584
  mock(@page_builder).finish
568
585
 
@@ -570,7 +587,9 @@ module Embulk
570
587
  end
571
588
 
572
589
  def test_run
573
- stub(@plugin).preview? { false }
590
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
591
+ stub(klass).preview? { false }
592
+ end
574
593
  mock(@page_builder).add(anything).times(records.length * 2)
575
594
  mock(@page_builder).finish
576
595
 
@@ -578,13 +597,16 @@ module Embulk
578
597
  end
579
598
 
580
599
  def test_timezone
581
- stub(@plugin).preview? { false }
600
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
601
+ stub(klass).preview? { false }
602
+ end
582
603
  adjusted = record_epoch - timezone_offset_seconds
583
604
  mock(@page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
584
605
  mock(@page_builder).finish
585
606
 
586
607
  @plugin.run
587
608
  end
609
+
588
610
  class PartialRunTest < self
589
611
  def setup_client
590
612
  any_instance_of(MixpanelApi::Client) do |klass|
@@ -598,7 +620,9 @@ module Embulk
598
620
 
599
621
  def test_run_with_allow_partial_false
600
622
  @plugin = Mixpanel.new(task.merge(allow_partial_import: false), nil, nil, @page_builder)
601
- stub(@plugin).preview? {false}
623
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
624
+ stub(klass).preview? { false }
625
+ end
602
626
  assert_raise MixpanelApi::IncompleteExportResponseError do
603
627
  @plugin.run
604
628
  end
@@ -607,19 +631,25 @@ module Embulk
607
631
  def test_run_with_allow_partial_true
608
632
  @plugin = Mixpanel.new(task.merge(allow_partial_import: true), nil, nil, @page_builder)
609
633
  mock(@page_builder).finish
610
- stub(@plugin).preview? {false}
634
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
635
+ stub(klass).preview? { false }
636
+ end
611
637
  @plugin.run
612
638
  end
613
639
  end
640
+
614
641
  class SliceRangeRunTest < self
615
642
 
616
643
  def test_default_slice_range
617
644
  plugin = Mixpanel.new(task.merge(slice_range: 2), nil, nil, @page_builder)
618
- stub(plugin).preview? {false}
619
- stub(plugin).fetch(["2015-02-22", "2015-02-23"],0){[]}
620
- stub(plugin).fetch(["2015-02-24", "2015-02-25"],0){[]}
621
- stub(plugin).fetch(["2015-02-26", "2015-02-27"],0){[]}
622
- stub(plugin).fetch(["2015-02-28", "2015-03-01"],0){[]}
645
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
646
+ stub(klass).preview? { false }
647
+ stub(klass).fetch(["2015-02-22", "2015-02-23"],0,anything){[]}
648
+ stub(klass).fetch(["2015-02-24", "2015-02-25"],0,anything){[]}
649
+ stub(klass).fetch(["2015-02-26", "2015-02-27"],0,anything){[]}
650
+ stub(klass).fetch(["2015-02-28", "2015-03-01"],0,anything){[]}
651
+ end
652
+
623
653
  mock(@page_builder).finish
624
654
  plugin.run
625
655
  end
@@ -663,8 +693,10 @@ module Embulk
663
693
 
664
694
  def test_incremental_column_with_where
665
695
  page_builder = Object.new
666
- plugin = Mixpanel.new(task.merge(params: task[:params].merge("where" => "abc==def"),latest_fetched_time: 1), nil, nil, page_builder)
667
- stub(plugin).preview? {false}
696
+ plugin = Mixpanel.new(task.merge(params: task[:params].merge(where: "abc==def"),latest_fetched_time: 1), nil, nil, page_builder)
697
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
698
+ stub(klass).preview? { false }
699
+ end
668
700
  adjusted = record_epoch - timezone_offset_seconds
669
701
  mock(page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
670
702
  mock(page_builder).finish
@@ -719,8 +751,10 @@ module Embulk
719
751
  def setup
720
752
  super
721
753
  @page_builder = Object.new
722
- @plugin = Mixpanel.new(task, nil, nil, @page_builder)
723
- stub(@plugin).fetch { [record] }
754
+ @plugin = Mixpanel.new(DataSource[task.to_a], nil, nil, @page_builder)
755
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
756
+ stub(klass).fetch { [record] }
757
+ end
724
758
  end
725
759
 
726
760
  def test_run
@@ -770,8 +804,10 @@ module Embulk
770
804
  class UnknownColumnsTest < self
771
805
  def setup
772
806
  @page_builder = Object.new
773
- @plugin = Mixpanel.new(task, nil, nil, @page_builder)
774
- stub(@plugin).fetch { records }
807
+ @plugin = Mixpanel.new(DataSource[task.to_a], nil, nil, @page_builder)
808
+ any_instance_of(Embulk::Input::Service::ExportService) do |klass|
809
+ stub(klass).fetch { records }
810
+ end
775
811
  end
776
812
 
777
813
  def test_run
@@ -840,7 +876,7 @@ module Embulk
840
876
  incremental_column: nil,
841
877
  schema: schema,
842
878
  dates: DATES.to_a.map(&:to_s),
843
- params: Mixpanel.export_params(embulk_config),
879
+ params: Mixpanel.service(embulk_config).export_params,
844
880
  fetch_unknown_columns: false,
845
881
  fetch_custom_properties: false,
846
882
  retry_initial_wait_sec: 2,
@@ -880,6 +916,7 @@ module Embulk
880
916
  api_secret: API_SECRET,
881
917
  from_date: FROM_DATE,
882
918
  fetch_days: DAYS,
919
+ timezone: TIMEZONE,
883
920
  fetch_unknown_columns: false,
884
921
  fetch_custom_properties: false,
885
922
  retry_initial_wait_sec: 2,