embulk-input-elasticsearch 0.3.2 → 0.3.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 209e87c44e92108a80a402616ef1d7ad7db9cbd7
4
- data.tar.gz: ec7efbe789ee0e2b17d922750172ba0e9f745ef2
3
+ metadata.gz: 6e50e12402a605f80964ad84cf29c555d6526671
4
+ data.tar.gz: a69bf91656bd9a569e68ee437efecf69cd03db57
5
5
  SHA512:
6
- metadata.gz: 337eac65409f489536ba0e4107cdca94115530b3781b2e2e8eb90b55e3df7e46e4f79530d9776b2ee341887bde394387fe3f08850b0aa2a1c0a50b9d9ba6c613
7
- data.tar.gz: 612f003dccdab158bd094b34d2853c3cd368b955b860b5a58f71ceaa3cae416c492e556acf02550ddc28aec9920485ae15fac0a8b3b78419e4bc78f31f00cd43
6
+ metadata.gz: 2400cf1b6273097d67d2a71f65f2984bbf3b1a79c03b1070c02aff0e1913d99be37af80e2a6d3eba0a65ef9462cee6b53d3df23ce70c6579694f4a3c278eda3e
7
+ data.tar.gz: a21f701e1b9695e170858ebdb16c3042cb38f6557573d03f1142b1764b2771b9c75d27056acb47e78f2a469953cecff65d2c73a2b8addd69834dc5c51b6d1eee
data/.travis.yml CHANGED
@@ -2,6 +2,7 @@ language: ruby
2
2
  cache: bundler
3
3
  rvm:
4
4
  - jruby-9.0.5.0
5
+ - jruby-9.1.5.0
5
6
  - jruby-head
6
7
  jdk:
7
8
  - openjdk7
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-input-elasticsearch"
4
- spec.version = "0.3.2"
4
+ spec.version = "0.3.3"
5
5
  spec.authors = ["toyama0919"]
6
6
  spec.summary = "Elasticsearch input plugin for Embulk"
7
7
  spec.description = "Loads records from Elasticsearch. parallel query support."
@@ -1,7 +1,6 @@
1
- require 'excon'
2
- require 'elasticsearch'
3
1
  require_relative 'elasticsearch/connection'
4
2
  require_relative 'elasticsearch/input_thread'
3
+ require_relative 'elasticsearch/converter'
5
4
 
6
5
  module Embulk
7
6
  module Input
@@ -52,135 +51,24 @@ module Embulk
52
51
  def init
53
52
  @queries = task['slice_queries'][@index]
54
53
  Embulk.logger.info("this thread queries => #{@queries}")
55
- @client = Connection.create_client(task)
56
- @index_name = task['index']
57
- @index_type = task['index_type']
58
- @per_size = task['per_size']
59
- @limit_size = task['limit_size']
60
- @fields = task['fields']
61
- @sort = task['sort']
62
54
  @add_query_to_record = task['add_query_to_record']
63
- @scroll = task['scroll']
64
- @retry_on_failure = task['retry_on_failure']
55
+ @connection = Connection.new(task)
65
56
  end
66
57
 
67
58
  def run
68
- search(@index_type, @per_size, @routing, @fields, @sort)
59
+ @queries.each do |query|
60
+ @connection.search_with_query(query) { |result|
61
+ if @add_query_to_record
62
+ result << query
63
+ end
64
+ page_builder.add(result)
65
+ }
66
+ end
69
67
  page_builder.finish
70
68
 
71
69
  task_report = {}
72
70
  return task_report
73
71
  end
74
-
75
- private
76
-
77
- def search(type, size, routing, fields, sort)
78
- @queries.each do |query|
79
- search_with_query(query, type, size, routing, fields, sort)
80
- end
81
- end
82
-
83
- def search_with_query(query, type, size, routing, fields, sort)
84
- search_option = get_search_option(type, query, size, fields, sort)
85
- Embulk.logger.info("#{search_option}")
86
- r = search_with_retry { @client.search(search_option) }
87
- i = 0
88
- get_sources(r, fields).each do |result|
89
- result_proc(result, query)
90
- return if @limit_size == (i += 1)
91
- end
92
-
93
- while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
94
- get_sources(r, fields).each do |result|
95
- result_proc(result, query)
96
- return if @limit_size == (i += 1)
97
- end
98
- end
99
- end
100
-
101
- def search_with_retry
102
- retries = 0
103
- begin
104
- yield if block_given?
105
- rescue => e
106
- if retries < @retry_on_failure
107
- retries += 1
108
- Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
109
- sleep 2**retries
110
- retry
111
- end
112
- Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
113
- raise
114
- end
115
- end
116
-
117
- def result_proc(result, query)
118
- if @add_query_to_record
119
- result << query
120
- end
121
- page_builder.add(result)
122
- end
123
-
124
- def get_search_option(type, query, size, fields, sort)
125
- body = { }
126
- body[:query] = { query_string: { query: query } } unless query.nil?
127
- if sort
128
- sorts = []
129
- sort.each do |k, v|
130
- sorts << { k => v }
131
- end
132
- body[:sort] = sorts
133
- else
134
- body[:sort] = ["_doc"]
135
- end
136
- search_option = { index: @index_name, type: type, scroll: @scroll, body: body, size: size }
137
- search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
138
- search_option
139
- end
140
-
141
- def get_sources(results, fields)
142
- hits = results['hits']['hits']
143
- hits.map { |hit|
144
- result = hit['_source']
145
- fields.select{ |field| field['metadata'] }.each { |field|
146
- result[field['name']] = hit[field['name']]
147
- }
148
- @fields.map { |field|
149
- convert_value(result[field['name']], field)
150
- }
151
- }
152
- end
153
-
154
- def convert_value(value, field)
155
- return nil if value.nil?
156
- case field["type"]
157
- when "string"
158
- value
159
- when "long"
160
- value.to_i
161
- when "double"
162
- value.to_f
163
- when "boolean"
164
- if value.is_a?(TrueClass) || value.is_a?(FalseClass)
165
- value
166
- else
167
- downcased_val = value.downcase
168
- case downcased_val
169
- when 'true' then true
170
- when 'false' then false
171
- when '1' then true
172
- when '0' then false
173
- else nil
174
- end
175
- end
176
- when "timestamp"
177
- Time.parse(value)
178
- when "json"
179
- value
180
- else
181
- raise "Unsupported type #{field['type']}"
182
- end
183
- end
184
72
  end
185
73
  end
186
74
  end
@@ -1,17 +1,38 @@
1
+ require 'excon'
2
+ require 'elasticsearch'
3
+
1
4
  module Embulk
2
5
  module Input
3
6
  class Elasticsearch < InputPlugin
4
7
  class Connection
5
- def self.create_client(task)
8
+ def initialize(task)
9
+ @scroll = task['scroll']
10
+ @index = task['index']
11
+ @index_type = task['index_type']
12
+ @size = task['per_size']
13
+ @fields = task['fields']
14
+ @sort = task['sort']
15
+ @limit_size = task['limit_size']
16
+ @retry_on_failure = task['retry_on_failure']
17
+ @client = create_client(
18
+ nodes: task['nodes'],
19
+ reload_connections: task['reload_connections'],
20
+ reload_on_failure: task['reload_on_failure'],
21
+ retry_on_failure: task['retry_on_failure'],
22
+ request_timeout: task['request_timeout']
23
+ )
24
+ end
25
+
26
+ def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
6
27
  transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
7
28
  {
8
- hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
29
+ hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
9
30
  options: {
10
- reload_connections: task['reload_connections'],
11
- reload_on_failure: task['reload_on_failure'],
12
- retry_on_failure: task['retry_on_failure'],
31
+ reload_connections: reload_connections,
32
+ reload_on_failure: reload_on_failure,
33
+ retry_on_failure: retry_on_failure,
13
34
  transport_options: {
14
- request: { timeout: task['request_timeout'] }
35
+ request: { timeout: request_timeout }
15
36
  }
16
37
  }
17
38
  }
@@ -19,6 +40,57 @@ module Embulk
19
40
 
20
41
  ::Elasticsearch::Client.new transport: transport
21
42
  end
43
+
44
+ def search_with_query(query)
45
+ search_option = get_search_option(query)
46
+ Embulk.logger.info("#{search_option}")
47
+ r = search_with_retry { @client.search(search_option) }
48
+ i = 0
49
+ Converter.get_sources(r, @fields).each do |result|
50
+ yield(result) if block_given?
51
+ return if @limit_size == (i += 1)
52
+ end
53
+
54
+ while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
55
+ Converter.get_sources(r, @fields).each do |result|
56
+ yield(result) if block_given?
57
+ return if @limit_size == (i += 1)
58
+ end
59
+ end
60
+ end
61
+
62
+ def search_with_retry
63
+ retries = 0
64
+ begin
65
+ yield if block_given?
66
+ rescue => e
67
+ if retries < @retry_on_failure
68
+ retries += 1
69
+ Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
70
+ sleep 2**retries
71
+ retry
72
+ end
73
+ Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
74
+ raise
75
+ end
76
+ end
77
+
78
+ def get_search_option(query)
79
+ body = { }
80
+ body[:query] = { query_string: { query: query } } unless query.nil?
81
+ if @sort
82
+ sorts = []
83
+ @sort.each do |k, v|
84
+ sorts << { k => v }
85
+ end
86
+ body[:sort] = sorts
87
+ else
88
+ body[:sort] = ["_doc"]
89
+ end
90
+ search_option = { index: @index, type: @index_type, scroll: @scroll, body: body, size: @size }
91
+ search_option[:_source] = @fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
92
+ search_option
93
+ end
22
94
  end
23
95
  end
24
96
  end
@@ -0,0 +1,49 @@
1
+ module Embulk
2
+ module Input
3
+ class Elasticsearch < InputPlugin
4
+ class Converter
5
+ def self.get_sources(results, fields)
6
+ hits = results['hits']['hits']
7
+ hits.map { |hit|
8
+ result = hit['_source']
9
+ fields.map { |field|
10
+ value = field['metadata'] ? hit[field['name']] : result[field['name']]
11
+ convert_value(value, field)
12
+ }
13
+ }
14
+ end
15
+
16
+ def self.convert_value(value, field)
17
+ return nil if value.nil?
18
+ case field["type"]
19
+ when "string"
20
+ value
21
+ when "long"
22
+ value.to_i
23
+ when "double"
24
+ value.to_f
25
+ when "boolean"
26
+ if value.is_a?(TrueClass) || value.is_a?(FalseClass)
27
+ value
28
+ else
29
+ downcased_val = value.downcase
30
+ case downcased_val
31
+ when 'true' then true
32
+ when 'false' then false
33
+ when '1' then true
34
+ when '0' then false
35
+ else nil
36
+ end
37
+ end
38
+ when "timestamp"
39
+ Time.parse(value)
40
+ when "json"
41
+ value
42
+ else
43
+ raise "Unsupported type #{field['type']}"
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
data/test/helper.rb CHANGED
@@ -9,3 +9,5 @@ Embulk.logger = Embulk::Logger.new('/dev/null')
9
9
 
10
10
  APP_ROOT = File.expand_path('../', __dir__)
11
11
  TEST_ROOT = File.expand_path(File.dirname(__FILE__))
12
+
13
+ require 'embulk/input/elasticsearch'
@@ -0,0 +1,51 @@
1
+ require_relative './helper'
2
+
3
+ Elasticsearch = Embulk::Input::Elasticsearch
4
+
5
+ module Embulk
6
+ class Input::Elasticsearch
7
+ class TestConverter < Test::Unit::TestCase
8
+
9
+ def startup
10
+ end
11
+
12
+ def shutdown
13
+ end
14
+
15
+ sub_test_case "get_sources" do
16
+ def test_normal
17
+ fields = [
18
+ {"name"=>"_id", "type"=>"string", "metadata"=>true},
19
+ {"name"=>"product_id", "type"=>"long"},
20
+ {"name"=>"title", "type"=>"string"}
21
+ ]
22
+
23
+ results = {
24
+ "_scroll_id"=>"cXVlcnlUaGVuRmV0Y2g7NTsxNzg3MjE6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjI6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjM6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjU6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjQ6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTswOw==",
25
+ "took"=>41,
26
+ "timed_out"=>false,
27
+ "_shards"=>{"total"=>5, "successful"=>5, "failed"=>0},
28
+ "hits"=>{
29
+ "total"=>1,
30
+ "max_score"=>nil,
31
+ "hits"=>[
32
+ {
33
+ "_index"=>"test_index",
34
+ "_type"=>"test_type",
35
+ "_id"=>"AVTCxiCuNR-BVKOgUB7R",
36
+ "_score"=>nil,
37
+ "_source"=>{
38
+ "title"=>"dummy title",
39
+ "product_id"=>1
40
+ },
41
+ "sort"=>[12534]
42
+ }
43
+ ]
44
+ }
45
+ }
46
+ assert_equal Converter.get_sources(results, fields), [["AVTCxiCuNR-BVKOgUB7R", 1, "dummy title"]]
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,36 @@
1
+ require_relative './helper'
2
+
3
+ Elasticsearch = Embulk::Input::Elasticsearch
4
+
5
+ module Embulk
6
+ class Input::Elasticsearch
7
+ class TestTransaction < Test::Unit::TestCase
8
+ sub_test_case "get_slice_from_num_threads" do
9
+ def test_normal
10
+ slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
11
+ assert_equal slice.size, 5
12
+ assert_equal slice.first.size, 2
13
+ end
14
+
15
+ def test_normal_same
16
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
17
+ assert_equal slice.size, 3
18
+ assert_equal slice.first.size, 1
19
+ end
20
+
21
+ def test_num_threads_over_array_size
22
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
23
+ assert_equal slice.size, 3
24
+ assert_equal slice.first.size, 1
25
+ end
26
+
27
+ def test_rest
28
+ slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
29
+ assert_equal slice.size, 7
30
+ assert_equal slice.first.size, 3
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+
@@ -1,5 +1,4 @@
1
1
  require_relative './helper'
2
- require 'embulk/input/elasticsearch'
3
2
  require 'yaml'
4
3
 
5
4
  Elasticsearch = Embulk::Input::Elasticsearch
@@ -10,33 +9,6 @@ module Embulk
10
9
  def control
11
10
  Proc.new {|task| task_reports = [] }
12
11
  end
13
-
14
- sub_test_case "get_slice_from_num_threads" do
15
- def test_normal
16
- slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
17
- assert_equal slice.size, 5
18
- assert_equal slice.first.size, 2
19
- end
20
-
21
- def test_normal_same
22
- slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
23
- assert_equal slice.size, 3
24
- assert_equal slice.first.size, 1
25
- end
26
-
27
- def test_num_threads_over_array_size
28
- slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
29
- assert_equal slice.size, 3
30
- assert_equal slice.first.size, 1
31
- end
32
-
33
- def test_rest
34
- slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
35
- assert_equal slice.size, 7
36
- assert_equal slice.first.size, 3
37
- end
38
- end
39
-
40
12
  sub_test_case "transaction" do
41
13
  def test_normal
42
14
  yaml = YAML.load(%(
@@ -75,4 +47,4 @@ module Embulk
75
47
  end
76
48
  end
77
49
  end
78
- end
50
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-elasticsearch
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-16 00:00:00.000000000 Z
11
+ date: 2017-03-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -111,8 +111,11 @@ files:
111
111
  - embulk-input-elasticsearch.gemspec
112
112
  - lib/embulk/input/elasticsearch.rb
113
113
  - lib/embulk/input/elasticsearch/connection.rb
114
+ - lib/embulk/input/elasticsearch/converter.rb
114
115
  - lib/embulk/input/elasticsearch/input_thread.rb
115
116
  - test/helper.rb
117
+ - test/test_converter.rb
118
+ - test/test_input_thread.rb
116
119
  - test/test_transaction.rb
117
120
  homepage: https://github.com/toyama0919/embulk-input-elasticsearch
118
121
  licenses:
@@ -140,4 +143,6 @@ specification_version: 4
140
143
  summary: Elasticsearch input plugin for Embulk
141
144
  test_files:
142
145
  - test/helper.rb
146
+ - test/test_converter.rb
147
+ - test/test_input_thread.rb
143
148
  - test/test_transaction.rb