embulk-input-elasticsearch 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/embulk-input-elasticsearch.gemspec +1 -1
- data/lib/embulk/input/elasticsearch.rb +10 -122
- data/lib/embulk/input/elasticsearch/connection.rb +78 -6
- data/lib/embulk/input/elasticsearch/converter.rb +49 -0
- data/test/helper.rb +2 -0
- data/test/test_converter.rb +51 -0
- data/test/test_input_thread.rb +36 -0
- data/test/test_transaction.rb +1 -29
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e50e12402a605f80964ad84cf29c555d6526671
|
4
|
+
data.tar.gz: a69bf91656bd9a569e68ee437efecf69cd03db57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2400cf1b6273097d67d2a71f65f2984bbf3b1a79c03b1070c02aff0e1913d99be37af80e2a6d3eba0a65ef9462cee6b53d3df23ce70c6579694f4a3c278eda3e
|
7
|
+
data.tar.gz: a21f701e1b9695e170858ebdb16c3042cb38f6557573d03f1142b1764b2771b9c75d27056acb47e78f2a469953cecff65d2c73a2b8addd69834dc5c51b6d1eee
|
data/.travis.yml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
Gem::Specification.new do |spec|
|
3
3
|
spec.name = "embulk-input-elasticsearch"
|
4
|
-
spec.version = "0.3.2"
|
4
|
+
spec.version = "0.3.3"
|
5
5
|
spec.authors = ["toyama0919"]
|
6
6
|
spec.summary = "Elasticsearch input plugin for Embulk"
|
7
7
|
spec.description = "Loads records from Elasticsearch. parallel query support."
|
@@ -1,7 +1,6 @@
|
|
1
|
-
require 'excon'
|
2
|
-
require 'elasticsearch'
|
3
1
|
require_relative 'elasticsearch/connection'
|
4
2
|
require_relative 'elasticsearch/input_thread'
|
3
|
+
require_relative 'elasticsearch/converter'
|
5
4
|
|
6
5
|
module Embulk
|
7
6
|
module Input
|
@@ -52,135 +51,24 @@ module Embulk
|
|
52
51
|
def init
|
53
52
|
@queries = task['slice_queries'][@index]
|
54
53
|
Embulk.logger.info("this thread queries => #{@queries}")
|
55
|
-
@client = Connection.create_client(task)
|
56
|
-
@index_name = task['index']
|
57
|
-
@index_type = task['index_type']
|
58
|
-
@per_size = task['per_size']
|
59
|
-
@limit_size = task['limit_size']
|
60
|
-
@fields = task['fields']
|
61
|
-
@sort = task['sort']
|
62
54
|
@add_query_to_record = task['add_query_to_record']
|
63
|
-
@scroll = task['scroll']
|
64
|
-
@retry_on_failure = task['retry_on_failure']
|
55
|
+
@connection = Connection.new(task)
|
65
56
|
end
|
66
57
|
|
67
58
|
def run
|
68
|
-
|
59
|
+
@queries.each do |query|
|
60
|
+
@connection.search_with_query(query) { |result|
|
61
|
+
if @add_query_to_record
|
62
|
+
result << query
|
63
|
+
end
|
64
|
+
page_builder.add(result)
|
65
|
+
}
|
66
|
+
end
|
69
67
|
page_builder.finish
|
70
68
|
|
71
69
|
task_report = {}
|
72
70
|
return task_report
|
73
71
|
end
|
74
|
-
|
75
|
-
private
|
76
|
-
|
77
|
-
def search(type, size, routing, fields, sort)
|
78
|
-
@queries.each do |query|
|
79
|
-
search_with_query(query, type, size, routing, fields, sort)
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
def search_with_query(query, type, size, routing, fields, sort)
|
84
|
-
search_option = get_search_option(type, query, size, fields, sort)
|
85
|
-
Embulk.logger.info("#{search_option}")
|
86
|
-
r = search_with_retry { @client.search(search_option) }
|
87
|
-
i = 0
|
88
|
-
get_sources(r, fields).each do |result|
|
89
|
-
result_proc(result, query)
|
90
|
-
return if @limit_size == (i += 1)
|
91
|
-
end
|
92
|
-
|
93
|
-
while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
|
94
|
-
get_sources(r, fields).each do |result|
|
95
|
-
result_proc(result, query)
|
96
|
-
return if @limit_size == (i += 1)
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
def search_with_retry
|
102
|
-
retries = 0
|
103
|
-
begin
|
104
|
-
yield if block_given?
|
105
|
-
rescue => e
|
106
|
-
if retries < @retry_on_failure
|
107
|
-
retries += 1
|
108
|
-
Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
|
109
|
-
sleep 2**retries
|
110
|
-
retry
|
111
|
-
end
|
112
|
-
Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
|
113
|
-
raise
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
def result_proc(result, query)
|
118
|
-
if @add_query_to_record
|
119
|
-
result << query
|
120
|
-
end
|
121
|
-
page_builder.add(result)
|
122
|
-
end
|
123
|
-
|
124
|
-
def get_search_option(type, query, size, fields, sort)
|
125
|
-
body = { }
|
126
|
-
body[:query] = { query_string: { query: query } } unless query.nil?
|
127
|
-
if sort
|
128
|
-
sorts = []
|
129
|
-
sort.each do |k, v|
|
130
|
-
sorts << { k => v }
|
131
|
-
end
|
132
|
-
body[:sort] = sorts
|
133
|
-
else
|
134
|
-
body[:sort] = ["_doc"]
|
135
|
-
end
|
136
|
-
search_option = { index: @index_name, type: type, scroll: @scroll, body: body, size: size }
|
137
|
-
search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
|
138
|
-
search_option
|
139
|
-
end
|
140
|
-
|
141
|
-
def get_sources(results, fields)
|
142
|
-
hits = results['hits']['hits']
|
143
|
-
hits.map { |hit|
|
144
|
-
result = hit['_source']
|
145
|
-
fields.select{ |field| field['metadata'] }.each { |field|
|
146
|
-
result[field['name']] = hit[field['name']]
|
147
|
-
}
|
148
|
-
@fields.map { |field|
|
149
|
-
convert_value(result[field['name']], field)
|
150
|
-
}
|
151
|
-
}
|
152
|
-
end
|
153
|
-
|
154
|
-
def convert_value(value, field)
|
155
|
-
return nil if value.nil?
|
156
|
-
case field["type"]
|
157
|
-
when "string"
|
158
|
-
value
|
159
|
-
when "long"
|
160
|
-
value.to_i
|
161
|
-
when "double"
|
162
|
-
value.to_f
|
163
|
-
when "boolean"
|
164
|
-
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
165
|
-
value
|
166
|
-
else
|
167
|
-
downcased_val = value.downcase
|
168
|
-
case downcased_val
|
169
|
-
when 'true' then true
|
170
|
-
when 'false' then false
|
171
|
-
when '1' then true
|
172
|
-
when '0' then false
|
173
|
-
else nil
|
174
|
-
end
|
175
|
-
end
|
176
|
-
when "timestamp"
|
177
|
-
Time.parse(value)
|
178
|
-
when "json"
|
179
|
-
value
|
180
|
-
else
|
181
|
-
raise "Unsupported type #{field['type']}"
|
182
|
-
end
|
183
|
-
end
|
184
72
|
end
|
185
73
|
end
|
186
74
|
end
|
@@ -1,17 +1,38 @@
|
|
1
|
+
require 'excon'
|
2
|
+
require 'elasticsearch'
|
3
|
+
|
1
4
|
module Embulk
|
2
5
|
module Input
|
3
6
|
class Elasticsearch < InputPlugin
|
4
7
|
class Connection
|
5
|
-
def self.create_client(task)
|
8
|
+
def initialize(task)
|
9
|
+
@scroll = task['scroll']
|
10
|
+
@index = task['index']
|
11
|
+
@index_type = task['index_type']
|
12
|
+
@size = task['per_size']
|
13
|
+
@fields = task['fields']
|
14
|
+
@sort = task['sort']
|
15
|
+
@limit_size = task['limit_size']
|
16
|
+
@retry_on_failure = task['retry_on_failure']
|
17
|
+
@client = create_client(
|
18
|
+
nodes: task['nodes'],
|
19
|
+
reload_connections: task['reload_connections'],
|
20
|
+
reload_on_failure: task['reload_on_failure'],
|
21
|
+
retry_on_failure: task['retry_on_failure'],
|
22
|
+
request_timeout: task['request_timeout']
|
23
|
+
)
|
24
|
+
end
|
25
|
+
|
26
|
+
def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
|
6
27
|
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
7
28
|
{
|
8
|
-
hosts:
|
29
|
+
hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
9
30
|
options: {
|
10
|
-
reload_connections:
|
11
|
-
reload_on_failure:
|
12
|
-
retry_on_failure:
|
31
|
+
reload_connections: reload_connections,
|
32
|
+
reload_on_failure: reload_on_failure,
|
33
|
+
retry_on_failure: retry_on_failure,
|
13
34
|
transport_options: {
|
14
|
-
request: { timeout:
|
35
|
+
request: { timeout: request_timeout }
|
15
36
|
}
|
16
37
|
}
|
17
38
|
}
|
@@ -19,6 +40,57 @@ module Embulk
|
|
19
40
|
|
20
41
|
::Elasticsearch::Client.new transport: transport
|
21
42
|
end
|
43
|
+
|
44
|
+
def search_with_query(query)
|
45
|
+
search_option = get_search_option(query)
|
46
|
+
Embulk.logger.info("#{search_option}")
|
47
|
+
r = search_with_retry { @client.search(search_option) }
|
48
|
+
i = 0
|
49
|
+
Converter.get_sources(r, @fields).each do |result|
|
50
|
+
yield(result) if block_given?
|
51
|
+
return if @limit_size == (i += 1)
|
52
|
+
end
|
53
|
+
|
54
|
+
while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
|
55
|
+
Converter.get_sources(r, @fields).each do |result|
|
56
|
+
yield(result) if block_given?
|
57
|
+
return if @limit_size == (i += 1)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def search_with_retry
|
63
|
+
retries = 0
|
64
|
+
begin
|
65
|
+
yield if block_given?
|
66
|
+
rescue => e
|
67
|
+
if retries < @retry_on_failure
|
68
|
+
retries += 1
|
69
|
+
Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
|
70
|
+
sleep 2**retries
|
71
|
+
retry
|
72
|
+
end
|
73
|
+
Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
|
74
|
+
raise
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def get_search_option(query)
|
79
|
+
body = { }
|
80
|
+
body[:query] = { query_string: { query: query } } unless query.nil?
|
81
|
+
if @sort
|
82
|
+
sorts = []
|
83
|
+
@sort.each do |k, v|
|
84
|
+
sorts << { k => v }
|
85
|
+
end
|
86
|
+
body[:sort] = sorts
|
87
|
+
else
|
88
|
+
body[:sort] = ["_doc"]
|
89
|
+
end
|
90
|
+
search_option = { index: @index, type: @index_type, scroll: @scroll, body: body, size: @size }
|
91
|
+
search_option[:_source] = @fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
|
92
|
+
search_option
|
93
|
+
end
|
22
94
|
end
|
23
95
|
end
|
24
96
|
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class Elasticsearch < InputPlugin
|
4
|
+
class Converter
|
5
|
+
def self.get_sources(results, fields)
|
6
|
+
hits = results['hits']['hits']
|
7
|
+
hits.map { |hit|
|
8
|
+
result = hit['_source']
|
9
|
+
fields.map { |field|
|
10
|
+
value = field['metadata'] ? hit[field['name']] : result[field['name']]
|
11
|
+
convert_value(value, field)
|
12
|
+
}
|
13
|
+
}
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.convert_value(value, field)
|
17
|
+
return nil if value.nil?
|
18
|
+
case field["type"]
|
19
|
+
when "string"
|
20
|
+
value
|
21
|
+
when "long"
|
22
|
+
value.to_i
|
23
|
+
when "double"
|
24
|
+
value.to_f
|
25
|
+
when "boolean"
|
26
|
+
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
27
|
+
value
|
28
|
+
else
|
29
|
+
downcased_val = value.downcase
|
30
|
+
case downcased_val
|
31
|
+
when 'true' then true
|
32
|
+
when 'false' then false
|
33
|
+
when '1' then true
|
34
|
+
when '0' then false
|
35
|
+
else nil
|
36
|
+
end
|
37
|
+
end
|
38
|
+
when "timestamp"
|
39
|
+
Time.parse(value)
|
40
|
+
when "json"
|
41
|
+
value
|
42
|
+
else
|
43
|
+
raise "Unsupported type #{field['type']}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/test/helper.rb
CHANGED
@@ -0,0 +1,51 @@
|
|
1
|
+
require_relative './helper'
|
2
|
+
|
3
|
+
Elasticsearch = Embulk::Input::Elasticsearch
|
4
|
+
|
5
|
+
module Embulk
|
6
|
+
class Input::Elasticsearch
|
7
|
+
class TestConverter < Test::Unit::TestCase
|
8
|
+
|
9
|
+
def startup
|
10
|
+
end
|
11
|
+
|
12
|
+
def shutdown
|
13
|
+
end
|
14
|
+
|
15
|
+
sub_test_case "get_sources" do
|
16
|
+
def test_normal
|
17
|
+
fields = [
|
18
|
+
{"name"=>"_id", "type"=>"string", "metadata"=>true},
|
19
|
+
{"name"=>"product_id", "type"=>"long"},
|
20
|
+
{"name"=>"title", "type"=>"string"}
|
21
|
+
]
|
22
|
+
|
23
|
+
results = {
|
24
|
+
"_scroll_id"=>"cXVlcnlUaGVuRmV0Y2g7NTsxNzg3MjE6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjI6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjM6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjU6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjQ6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTswOw==",
|
25
|
+
"took"=>41,
|
26
|
+
"timed_out"=>false,
|
27
|
+
"_shards"=>{"total"=>5, "successful"=>5, "failed"=>0},
|
28
|
+
"hits"=>{
|
29
|
+
"total"=>1,
|
30
|
+
"max_score"=>nil,
|
31
|
+
"hits"=>[
|
32
|
+
{
|
33
|
+
"_index"=>"test_index",
|
34
|
+
"_type"=>"test_type",
|
35
|
+
"_id"=>"AVTCxiCuNR-BVKOgUB7R",
|
36
|
+
"_score"=>nil,
|
37
|
+
"_source"=>{
|
38
|
+
"title"=>"dummy title",
|
39
|
+
"product_id"=>1
|
40
|
+
},
|
41
|
+
"sort"=>[12534]
|
42
|
+
}
|
43
|
+
]
|
44
|
+
}
|
45
|
+
}
|
46
|
+
assert_equal Converter.get_sources(results, fields), [["AVTCxiCuNR-BVKOgUB7R", 1, "dummy title"]]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require_relative './helper'
|
2
|
+
|
3
|
+
Elasticsearch = Embulk::Input::Elasticsearch
|
4
|
+
|
5
|
+
module Embulk
|
6
|
+
class Input::Elasticsearch
|
7
|
+
class TestTransaction < Test::Unit::TestCase
|
8
|
+
sub_test_case "get_slice_from_num_threads" do
|
9
|
+
def test_normal
|
10
|
+
slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
|
11
|
+
assert_equal slice.size, 5
|
12
|
+
assert_equal slice.first.size, 2
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_normal_same
|
16
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
|
17
|
+
assert_equal slice.size, 3
|
18
|
+
assert_equal slice.first.size, 1
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_num_threads_over_array_size
|
22
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
|
23
|
+
assert_equal slice.size, 3
|
24
|
+
assert_equal slice.first.size, 1
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_rest
|
28
|
+
slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
|
29
|
+
assert_equal slice.size, 7
|
30
|
+
assert_equal slice.first.size, 3
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
data/test/test_transaction.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require_relative './helper'
|
2
|
-
require 'embulk/input/elasticsearch'
|
3
2
|
require 'yaml'
|
4
3
|
|
5
4
|
Elasticsearch = Embulk::Input::Elasticsearch
|
@@ -10,33 +9,6 @@ module Embulk
|
|
10
9
|
def control
|
11
10
|
Proc.new {|task| task_reports = [] }
|
12
11
|
end
|
13
|
-
|
14
|
-
sub_test_case "get_slice_from_num_threads" do
|
15
|
-
def test_normal
|
16
|
-
slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
|
17
|
-
assert_equal slice.size, 5
|
18
|
-
assert_equal slice.first.size, 2
|
19
|
-
end
|
20
|
-
|
21
|
-
def test_normal_same
|
22
|
-
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
|
23
|
-
assert_equal slice.size, 3
|
24
|
-
assert_equal slice.first.size, 1
|
25
|
-
end
|
26
|
-
|
27
|
-
def test_num_threads_over_array_size
|
28
|
-
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
|
29
|
-
assert_equal slice.size, 3
|
30
|
-
assert_equal slice.first.size, 1
|
31
|
-
end
|
32
|
-
|
33
|
-
def test_rest
|
34
|
-
slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
|
35
|
-
assert_equal slice.size, 7
|
36
|
-
assert_equal slice.first.size, 3
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
12
|
sub_test_case "transaction" do
|
41
13
|
def test_normal
|
42
14
|
yaml = YAML.load(%(
|
@@ -75,4 +47,4 @@ module Embulk
|
|
75
47
|
end
|
76
48
|
end
|
77
49
|
end
|
78
|
-
end
|
50
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-elasticsearch
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -111,8 +111,11 @@ files:
|
|
111
111
|
- embulk-input-elasticsearch.gemspec
|
112
112
|
- lib/embulk/input/elasticsearch.rb
|
113
113
|
- lib/embulk/input/elasticsearch/connection.rb
|
114
|
+
- lib/embulk/input/elasticsearch/converter.rb
|
114
115
|
- lib/embulk/input/elasticsearch/input_thread.rb
|
115
116
|
- test/helper.rb
|
117
|
+
- test/test_converter.rb
|
118
|
+
- test/test_input_thread.rb
|
116
119
|
- test/test_transaction.rb
|
117
120
|
homepage: https://github.com/toyama0919/embulk-input-elasticsearch
|
118
121
|
licenses:
|
@@ -140,4 +143,6 @@ specification_version: 4
|
|
140
143
|
summary: Elasticsearch input plugin for Embulk
|
141
144
|
test_files:
|
142
145
|
- test/helper.rb
|
146
|
+
- test/test_converter.rb
|
147
|
+
- test/test_input_thread.rb
|
143
148
|
- test/test_transaction.rb
|