embulk-input-mixpanel 0.4.3 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 82bac0d9e5e93a4fc606b3d43f1e9cd35debf4b3
4
- data.tar.gz: 82e5c6323b3d9db15ff54589b572b642cb6d93e1
3
+ metadata.gz: f5cb5156064ae0192f6d1d2321a173cd12ef5fd3
4
+ data.tar.gz: 2a2f3a63e55035bb8dd96a8edf27eb45e546866d
5
5
  SHA512:
6
- metadata.gz: e606300ab082a98aa443432a0b4f4246c40a942ca7724bb68679d66a889fce60cb36eb95b7dcf59e7a35181bb3175efc785cb3710d7764b58d800209589a140d
7
- data.tar.gz: abf1f2528faf56e7c6e65d72d4f8e34a6beebda7f93de1eb16bd93130de2c4b647cf716053ec039a7ddd8b1fdbfa9c62864bb280334630b53af1f15db6137cd3
6
+ metadata.gz: 48d7e7af0b5ad28fc030e7c1baaa4c34fd236b60fb34df7e8cc1b764825c143aadcab7725d4d78afe3b1ba0ffb05067418217872d79cc0a1d00af7440e7abd90
7
+ data.tar.gz: b001c794683c39d69ae6b29782988ee3c64ea17a9449d29fb7d86342b144854116116b1928946f33691eaaf0f3ed6d711f985fbe7e4a414c7579a3fd6baf38ec
data/CHANGELOG.md CHANGED
@@ -1,3 +1,6 @@
1
+ ## 0.4.4 - 2016-09-02
2
+ * [enhancement] Reduce memory usage by streaming processing [#42](https://github.com/treasure-data/embulk-input-mixpanel/pull/42)
3
+
1
4
  ## 0.4.3 - 2016-03-16
2
5
  * [enhancement] Custom properties json [#40](https://github.com/treasure-data/embulk-input-mixpanel/pull/40)
3
6
 
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-input-mixpanel"
4
- spec.version = "0.4.3"
4
+ spec.version = "0.4.4"
5
5
  spec.authors = ["yoshihara", "uu59"]
6
6
  spec.summary = "Mixpanel input plugin for Embulk"
7
7
  spec.description = "Loads records from Mixpanel."
@@ -15,7 +15,7 @@ Gem::Specification.new do |spec|
15
15
 
16
16
  spec.add_dependency 'httpclient'
17
17
  spec.add_dependency 'tzinfo'
18
- spec.add_dependency 'perfect_retry', ["~> 0.3"]
18
+ spec.add_dependency 'perfect_retry', ["~> 0.5"]
19
19
  spec.add_development_dependency 'bundler', ['~> 1.0']
20
20
  spec.add_development_dependency 'rake', ['>= 10.0']
21
21
  spec.add_development_dependency 'embulk', ['>= 0.8.6', '< 1.0']
@@ -203,20 +203,22 @@ module Embulk
203
203
  end
204
204
  end
205
205
 
206
- def fetch(dates)
206
+ def fetch(dates, &block)
207
207
  from_date = dates.first
208
208
  to_date = dates.last
209
209
  params = @params.merge(
210
210
  "from_date" => from_date,
211
211
  "to_date" => to_date,
212
212
  )
213
- client = MixpanelApi::Client.new(@api_key, @api_secret)
213
+ client = MixpanelApi::Client.new(@api_key, @api_secret, @retryer)
214
214
 
215
- @retryer.with_retry do
216
- if preview?
217
- client.export_for_small_dataset(params)
218
- else
219
- client.export(params)
215
+ if preview?
216
+ client.export_for_small_dataset(params)
217
+ else
218
+ Enumerator.new do |y|
219
+ client.export(params) do |record|
220
+ y << record
221
+ end
220
222
  end
221
223
  end
222
224
  end
@@ -14,6 +14,8 @@ module Embulk
14
14
  PING_RETRY_WAIT = 2
15
15
  SMALLSET_BYTE_RANGE = "0-#{5 * 1024 * 1024}"
16
16
 
17
+ attr_reader :retryer
18
+
17
19
  def self.mixpanel_available?
18
20
  retryer = PerfectRetry.new do |config|
19
21
  config.limit = PING_RETRY_LIMIT
@@ -34,31 +36,42 @@ module Embulk
34
36
  end
35
37
  end
36
38
 
37
- def initialize(api_key, api_secret)
39
+ def initialize(api_key, api_secret, retryer = nil)
38
40
  @api_key = api_key
39
41
  @api_secret = api_secret
42
+ @retryer = retryer || PerfectRetry.new do |config|
43
+ # for test
44
+ config.limit = 0
45
+ config.dont_rescues = [RuntimeError]
46
+ config.log_level = nil
47
+ config.logger = Embulk.logger
48
+ config.raise_original_error = true
49
+ end
40
50
  end
41
51
 
42
- def export(params = {})
43
- body = request(params)
44
- response_to_enum(body)
52
+ def export(params = {}, &block)
53
+ retryer.with_retry do
54
+ request(params, &block)
55
+ end
45
56
  end
46
57
 
47
- def export_for_small_dataset(params = {}, times = 0)
48
- days = (1 * (10 ** times))
49
- to_date = Date.parse(params["from_date"].to_s) + days
50
- params["to_date"] = to_date.strftime("%Y-%m-%d")
58
+ def export_for_small_dataset(params = {})
59
+ try_to_dates = 5.times.map do |n|
60
+ # from_date + 1, from_date + 10, from_date + 100, ... so on
61
+ days = 1 * (10 ** n)
62
+ Date.parse(params["from_date"].to_s) + days
63
+ end
51
64
 
52
- body = request(params, SMALLSET_BYTE_RANGE)
53
- result = response_to_enum(body)
54
- if result.first.nil?
55
- if times >= 5
56
- raise ConfigError.new "#{params["from_date"]} + #{days} days has no record. too old date?"
65
+ try_to_dates.each do |to_date|
66
+ params["to_date"] = to_date.strftime("%Y-%m-%d")
67
+ records = retryer.with_retry do
68
+ request_small_dataset(params, SMALLSET_BYTE_RANGE)
57
69
  end
58
- export_for_small_dataset(params, times + 1)
59
- else
60
- result
70
+ next if records.first.nil?
71
+ return records
61
72
  end
73
+
74
+ raise ConfigError.new "#{params["from_date"]}..#{try_to_dates.last} has no record. too old date?"
62
75
  end
63
76
 
64
77
  private
@@ -72,34 +85,53 @@ module Embulk
72
85
  end
73
86
  end
74
87
 
75
- def request(params, range = nil)
88
+ def request(params, &block)
76
89
  # https://mixpanel.com/docs/api-documentation/exporting-raw-data-you-inserted-into-mixpanel
77
- params[:expire] ||= Time.now.to_i + TIMEOUT_SECONDS
78
- params[:sig] = signature(params)
79
90
  Embulk.logger.debug "Export param: #{params.to_s}"
91
+ set_signatures(params)
80
92
 
81
- headers = {}
82
- response =
83
- if range
84
- # guess/preview
85
- res = httpclient.get(ENDPOINT_EXPORT, params, {"Range" => "bytes=#{range}"})
86
- if res.code == 416
87
- # cannot satisfied requested Range, get full body
88
- httpclient.get(ENDPOINT_EXPORT, params)
89
- else
90
- res
93
+ buf = ""
94
+ response = httpclient.get(ENDPOINT_EXPORT, params) do |chunk|
95
+ chunk.each_line do |line|
96
+ begin
97
+ record = JSON.parse(buf + line)
98
+ block.call record
99
+ buf = ""
100
+ rescue JSON::ParserError => e
101
+ buf << line
91
102
  end
92
- else
93
- httpclient.get(ENDPOINT_EXPORT, params)
94
103
  end
104
+ end
105
+ handle_error(response)
106
+ end
107
+
108
+ def request_small_dataset(params, range)
109
+ # guess/preview
110
+ # Try to fetch first `range` bytes
111
+ set_signatures(params)
112
+ res = httpclient.get(ENDPOINT_EXPORT, params, {"Range" => "bytes=#{range}"})
113
+ if res.code == 416
114
+ # cannot satisfied requested Range, get full body
115
+ res = httpclient.get(ENDPOINT_EXPORT, params)
116
+ end
117
+ handle_error(res)
118
+ response_to_enum(res.body)
119
+ end
120
+
121
+ def handle_error(response)
95
122
  Embulk.logger.debug "response code: #{response.code}"
96
123
  case response.code
97
124
  when 400..499
98
- raise ConfigError.new response.body
125
+ raise ConfigError.new("[#{response.code}] #{response.body}")
99
126
  when 500..599
100
- raise RuntimeError, response.body
127
+ raise RuntimeError.new("[#{response.code}] #{response.body}")
101
128
  end
102
- response.body
129
+ end
130
+
131
+ def set_signatures(params)
132
+ params[:expire] ||= Time.now.to_i + TIMEOUT_SECONDS
133
+ params[:sig] = signature(params)
134
+ params
103
135
  end
104
136
 
105
137
  def signature(params)
@@ -121,6 +153,7 @@ module Embulk
121
153
  client = HTTPClient.new
122
154
  client.receive_timeout = TIMEOUT_SECONDS
123
155
  client.default_header = {Accept: "application/json; charset=UTF-8"}
156
+ # client.debug_dev = STDERR
124
157
  client
125
158
  end
126
159
  end
@@ -37,38 +37,17 @@ module Embulk
37
37
  @httpclient = HTTPClient.new
38
38
  end
39
39
 
40
- def test_httpclient
41
- stub_response(success_response)
42
- mock(@client).httpclient { @httpclient }
43
-
44
- @client.export(params)
45
- end
46
-
47
- def test_response_class
48
- stub_client
49
- stub_response(success_response)
50
-
51
- actual = @client.export(params)
52
-
53
- assert_equal(Enumerator, actual.class)
54
- end
55
-
56
- def test_http_request
57
- stub_client
58
- mock(@httpclient).get(Client::ENDPOINT_EXPORT, params) do
59
- success_response
60
- end
61
-
62
- @client.export(params)
63
- end
64
-
65
40
  def test_success
66
41
  stub_client
42
+ stub(@client).set_signatures(anything) {}
67
43
  stub_response(success_response)
68
44
 
69
- actual = @client.export(params)
45
+ records = []
46
+ @client.export(params) do |record|
47
+ records << record
48
+ end
70
49
 
71
- assert_equal(dummy_responses, actual.to_a)
50
+ assert_equal(dummy_responses, records)
72
51
  end
73
52
 
74
53
  def test_failure_with_400
@@ -92,16 +71,17 @@ module Embulk
92
71
  class ExportSmallDataset < self
93
72
  def test_to_date_after_1_day
94
73
  to = (Date.parse(params["from_date"]) + 1).to_s
95
- mock(@client).request(params.merge("to_date" => to), Client::SMALLSET_BYTE_RANGE) { jsonl_dummy_responses }
74
+ mock(@client).request_small_dataset(params.merge("to_date" => to), Client::SMALLSET_BYTE_RANGE) { [:foo] }
96
75
 
97
76
  @client.export_for_small_dataset(params)
98
77
  end
99
78
 
100
79
  def test_to_date_after_1_day_after_10_days_if_empty
80
+ stub_client
101
81
  to1 = (Date.parse(params["from_date"]) + 1).to_s
102
82
  to2 = (Date.parse(params["from_date"]) + 10).to_s
103
- mock(@client).request(params.merge("to_date" => to1), Client::SMALLSET_BYTE_RANGE) { "" }
104
- mock(@client).request(params.merge("to_date" => to2), Client::SMALLSET_BYTE_RANGE) { jsonl_dummy_responses }
83
+ mock(@client).request_small_dataset(params.merge("to_date" => to1), Client::SMALLSET_BYTE_RANGE) { [] }
84
+ mock(@client).request_small_dataset(params.merge("to_date" => to2), Client::SMALLSET_BYTE_RANGE) { [:foo] }
105
85
 
106
86
  @client.export_for_small_dataset(params)
107
87
  end
@@ -122,9 +102,12 @@ module Embulk
122
102
  end
123
103
 
124
104
  def stub_response(response)
125
- stub(@httpclient).get(Client::ENDPOINT_EXPORT, params) do
126
- response
127
- end
105
+ @httpclient.test_loopback_http_response << [
106
+ "HTTP/1.1 #{response.code}",
107
+ "Content-Type: application/json",
108
+ "",
109
+ response.body
110
+ ].join("\r\n")
128
111
  end
129
112
 
130
113
  def success_response
@@ -510,17 +510,17 @@ module Embulk
510
510
 
511
511
  class RunTest < self
512
512
  def setup_client
513
-
514
513
  any_instance_of(MixpanelApi::Client) do |klass|
515
- stub(klass).request { records_raw_response }
514
+ stub(klass).request_small_dataset { records_raw_response }
515
+ stub(klass).request { records }
516
516
  end
517
517
  end
518
518
 
519
519
  def setup
520
520
  super
521
-
522
521
  @page_builder = Object.new
523
522
  @plugin = Mixpanel.new(task, nil, nil, @page_builder)
523
+ stub(@plugin).fetch { records }
524
524
  end
525
525
 
526
526
  def test_preview
@@ -542,7 +542,7 @@ module Embulk
542
542
  def test_timezone
543
543
  stub(@plugin).preview? { false }
544
544
  adjusted = record_epoch - timezone_offset_seconds
545
- mock(@page_builder).add(["FOO", adjusted]).times(records.length * 2)
545
+ mock(@page_builder).add(["FOO", adjusted, "event"]).times(records.length * 2)
546
546
  mock(@page_builder).finish
547
547
 
548
548
  @plugin.run
@@ -600,13 +600,14 @@ module Embulk
600
600
 
601
601
  class UnknownColumnsTest < self
602
602
  def setup
603
- super
604
603
  @page_builder = Object.new
605
604
  @plugin = Mixpanel.new(task, nil, nil, @page_builder)
605
+ stub(@plugin).fetch { records }
606
606
  end
607
607
 
608
608
  def test_run
609
- Embulk.logger.warn(anything)
609
+ stub(Embulk.logger).warn
610
+ stub(Embulk.logger).info
610
611
  stub(@plugin).preview? { false }
611
612
 
612
613
  # NOTE: Expect records are contained same record
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-mixpanel
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.3
4
+ version: 0.4.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - yoshihara
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-03-16 00:00:00.000000000 Z
12
+ date: 2016-09-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
@@ -44,7 +44,7 @@ dependencies:
44
44
  requirements:
45
45
  - - "~>"
46
46
  - !ruby/object:Gem::Version
47
- version: '0.3'
47
+ version: '0.5'
48
48
  name: perfect_retry
49
49
  prerelease: false
50
50
  type: :runtime
@@ -52,7 +52,7 @@ dependencies:
52
52
  requirements:
53
53
  - - "~>"
54
54
  - !ruby/object:Gem::Version
55
- version: '0.3'
55
+ version: '0.5'
56
56
  - !ruby/object:Gem::Dependency
57
57
  requirement: !ruby/object:Gem::Requirement
58
58
  requirements: