embulk-input-elasticsearch 0.3.2 → 0.3.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.travis.yml +1 -0
- data/embulk-input-elasticsearch.gemspec +1 -1
- data/lib/embulk/input/elasticsearch.rb +10 -122
- data/lib/embulk/input/elasticsearch/connection.rb +78 -6
- data/lib/embulk/input/elasticsearch/converter.rb +49 -0
- data/test/helper.rb +2 -0
- data/test/test_converter.rb +51 -0
- data/test/test_input_thread.rb +36 -0
- data/test/test_transaction.rb +1 -29
- metadata +7 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6e50e12402a605f80964ad84cf29c555d6526671
|
4
|
+
data.tar.gz: a69bf91656bd9a569e68ee437efecf69cd03db57
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2400cf1b6273097d67d2a71f65f2984bbf3b1a79c03b1070c02aff0e1913d99be37af80e2a6d3eba0a65ef9462cee6b53d3df23ce70c6579694f4a3c278eda3e
|
7
|
+
data.tar.gz: a21f701e1b9695e170858ebdb16c3042cb38f6557573d03f1142b1764b2771b9c75d27056acb47e78f2a469953cecff65d2c73a2b8addd69834dc5c51b6d1eee
|
data/.travis.yml
CHANGED
[hunk not captured in this extract: +1 -0]
data/embulk-input-elasticsearch.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
Gem::Specification.new do |spec|
|
3
3
|
spec.name = "embulk-input-elasticsearch"
|
4
|
-
spec.version = "0.3.2"
|
4
|
+
spec.version = "0.3.3"
|
5
5
|
spec.authors = ["toyama0919"]
|
6
6
|
spec.summary = "Elasticsearch input plugin for Embulk"
|
7
7
|
spec.description = "Loads records from Elasticsearch. parallel query support."
|
data/lib/embulk/input/elasticsearch.rb
CHANGED
@@ -1,7 +1,6 @@
|
|
1
|
-
require 'excon'
|
2
|
-
require 'elasticsearch'
|
3
1
|
require_relative 'elasticsearch/connection'
|
4
2
|
require_relative 'elasticsearch/input_thread'
|
3
|
+
require_relative 'elasticsearch/converter'
|
5
4
|
|
6
5
|
module Embulk
|
7
6
|
module Input
|
@@ -52,135 +51,24 @@ module Embulk
|
|
52
51
|
def init
|
53
52
|
@queries = task['slice_queries'][@index]
|
54
53
|
Embulk.logger.info("this thread queries => #{@queries}")
|
55
|
-
@client = Connection.create_client(task)
|
56
|
-
@index_name = task['index']
|
57
|
-
@index_type = task['index_type']
|
58
|
-
@per_size = task['per_size']
|
59
|
-
@limit_size = task['limit_size']
|
60
|
-
@fields = task['fields']
|
61
|
-
@sort = task['sort']
|
62
54
|
@add_query_to_record = task['add_query_to_record']
|
63
|
-
@scroll = task['scroll']
|
64
|
-
@retry_on_failure = task['retry_on_failure']
|
55
|
+
@connection = Connection.new(task)
|
65
56
|
end
|
66
57
|
|
67
58
|
def run
|
68
|
-
|
59
|
+
@queries.each do |query|
|
60
|
+
@connection.search_with_query(query) { |result|
|
61
|
+
if @add_query_to_record
|
62
|
+
result << query
|
63
|
+
end
|
64
|
+
page_builder.add(result)
|
65
|
+
}
|
66
|
+
end
|
69
67
|
page_builder.finish
|
70
68
|
|
71
69
|
task_report = {}
|
72
70
|
return task_report
|
73
71
|
end
|
74
|
-
|
75
|
-
private
|
76
|
-
|
77
|
-
def search(type, size, routing, fields, sort)
|
78
|
-
@queries.each do |query|
|
79
|
-
search_with_query(query, type, size, routing, fields, sort)
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
def search_with_query(query, type, size, routing, fields, sort)
|
84
|
-
search_option = get_search_option(type, query, size, fields, sort)
|
85
|
-
Embulk.logger.info("#{search_option}")
|
86
|
-
r = search_with_retry { @client.search(search_option) }
|
87
|
-
i = 0
|
88
|
-
get_sources(r, fields).each do |result|
|
89
|
-
result_proc(result, query)
|
90
|
-
return if @limit_size == (i += 1)
|
91
|
-
end
|
92
|
-
|
93
|
-
while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
|
94
|
-
get_sources(r, fields).each do |result|
|
95
|
-
result_proc(result, query)
|
96
|
-
return if @limit_size == (i += 1)
|
97
|
-
end
|
98
|
-
end
|
99
|
-
end
|
100
|
-
|
101
|
-
def search_with_retry
|
102
|
-
retries = 0
|
103
|
-
begin
|
104
|
-
yield if block_given?
|
105
|
-
rescue => e
|
106
|
-
if retries < @retry_on_failure
|
107
|
-
retries += 1
|
108
|
-
Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
|
109
|
-
sleep 2**retries
|
110
|
-
retry
|
111
|
-
end
|
112
|
-
Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
|
113
|
-
raise
|
114
|
-
end
|
115
|
-
end
|
116
|
-
|
117
|
-
def result_proc(result, query)
|
118
|
-
if @add_query_to_record
|
119
|
-
result << query
|
120
|
-
end
|
121
|
-
page_builder.add(result)
|
122
|
-
end
|
123
|
-
|
124
|
-
def get_search_option(type, query, size, fields, sort)
|
125
|
-
body = { }
|
126
|
-
body[:query] = { query_string: { query: query } } unless query.nil?
|
127
|
-
if sort
|
128
|
-
sorts = []
|
129
|
-
sort.each do |k, v|
|
130
|
-
sorts << { k => v }
|
131
|
-
end
|
132
|
-
body[:sort] = sorts
|
133
|
-
else
|
134
|
-
body[:sort] = ["_doc"]
|
135
|
-
end
|
136
|
-
search_option = { index: @index_name, type: type, scroll: @scroll, body: body, size: size }
|
137
|
-
search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
|
138
|
-
search_option
|
139
|
-
end
|
140
|
-
|
141
|
-
def get_sources(results, fields)
|
142
|
-
hits = results['hits']['hits']
|
143
|
-
hits.map { |hit|
|
144
|
-
result = hit['_source']
|
145
|
-
fields.select{ |field| field['metadata'] }.each { |field|
|
146
|
-
result[field['name']] = hit[field['name']]
|
147
|
-
}
|
148
|
-
@fields.map { |field|
|
149
|
-
convert_value(result[field['name']], field)
|
150
|
-
}
|
151
|
-
}
|
152
|
-
end
|
153
|
-
|
154
|
-
def convert_value(value, field)
|
155
|
-
return nil if value.nil?
|
156
|
-
case field["type"]
|
157
|
-
when "string"
|
158
|
-
value
|
159
|
-
when "long"
|
160
|
-
value.to_i
|
161
|
-
when "double"
|
162
|
-
value.to_f
|
163
|
-
when "boolean"
|
164
|
-
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
165
|
-
value
|
166
|
-
else
|
167
|
-
downcased_val = value.downcase
|
168
|
-
case downcased_val
|
169
|
-
when 'true' then true
|
170
|
-
when 'false' then false
|
171
|
-
when '1' then true
|
172
|
-
when '0' then false
|
173
|
-
else nil
|
174
|
-
end
|
175
|
-
end
|
176
|
-
when "timestamp"
|
177
|
-
Time.parse(value)
|
178
|
-
when "json"
|
179
|
-
value
|
180
|
-
else
|
181
|
-
raise "Unsupported type #{field['type']}"
|
182
|
-
end
|
183
|
-
end
|
184
72
|
end
|
185
73
|
end
|
186
74
|
end
|
data/lib/embulk/input/elasticsearch/connection.rb
CHANGED
@@ -1,17 +1,38 @@
|
|
1
|
+
require 'excon'
|
2
|
+
require 'elasticsearch'
|
3
|
+
|
1
4
|
module Embulk
|
2
5
|
module Input
|
3
6
|
class Elasticsearch < InputPlugin
|
4
7
|
class Connection
|
5
|
-
def self.create_client(task)
|
8
|
+
def initialize(task)
|
9
|
+
@scroll = task['scroll']
|
10
|
+
@index = task['index']
|
11
|
+
@index_type = task['index_type']
|
12
|
+
@size = task['per_size']
|
13
|
+
@fields = task['fields']
|
14
|
+
@sort = task['sort']
|
15
|
+
@limit_size = task['limit_size']
|
16
|
+
@retry_on_failure = task['retry_on_failure']
|
17
|
+
@client = create_client(
|
18
|
+
nodes: task['nodes'],
|
19
|
+
reload_connections: task['reload_connections'],
|
20
|
+
reload_on_failure: task['reload_on_failure'],
|
21
|
+
retry_on_failure: task['retry_on_failure'],
|
22
|
+
request_timeout: task['request_timeout']
|
23
|
+
)
|
24
|
+
end
|
25
|
+
|
26
|
+
def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
|
6
27
|
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
7
28
|
{
|
8
|
-
hosts:
|
29
|
+
hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
9
30
|
options: {
|
10
|
-
reload_connections:
|
11
|
-
reload_on_failure:
|
12
|
-
retry_on_failure:
|
31
|
+
reload_connections: reload_connections,
|
32
|
+
reload_on_failure: reload_on_failure,
|
33
|
+
retry_on_failure: retry_on_failure,
|
13
34
|
transport_options: {
|
14
|
-
request: { timeout:
|
35
|
+
request: { timeout: request_timeout }
|
15
36
|
}
|
16
37
|
}
|
17
38
|
}
|
@@ -19,6 +40,57 @@ module Embulk
|
|
19
40
|
|
20
41
|
::Elasticsearch::Client.new transport: transport
|
21
42
|
end
|
43
|
+
|
44
|
+
def search_with_query(query)
|
45
|
+
search_option = get_search_option(query)
|
46
|
+
Embulk.logger.info("#{search_option}")
|
47
|
+
r = search_with_retry { @client.search(search_option) }
|
48
|
+
i = 0
|
49
|
+
Converter.get_sources(r, @fields).each do |result|
|
50
|
+
yield(result) if block_given?
|
51
|
+
return if @limit_size == (i += 1)
|
52
|
+
end
|
53
|
+
|
54
|
+
while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
|
55
|
+
Converter.get_sources(r, @fields).each do |result|
|
56
|
+
yield(result) if block_given?
|
57
|
+
return if @limit_size == (i += 1)
|
58
|
+
end
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def search_with_retry
|
63
|
+
retries = 0
|
64
|
+
begin
|
65
|
+
yield if block_given?
|
66
|
+
rescue => e
|
67
|
+
if retries < @retry_on_failure
|
68
|
+
retries += 1
|
69
|
+
Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
|
70
|
+
sleep 2**retries
|
71
|
+
retry
|
72
|
+
end
|
73
|
+
Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
|
74
|
+
raise
|
75
|
+
end
|
76
|
+
end
|
77
|
+
|
78
|
+
def get_search_option(query)
|
79
|
+
body = { }
|
80
|
+
body[:query] = { query_string: { query: query } } unless query.nil?
|
81
|
+
if @sort
|
82
|
+
sorts = []
|
83
|
+
@sort.each do |k, v|
|
84
|
+
sorts << { k => v }
|
85
|
+
end
|
86
|
+
body[:sort] = sorts
|
87
|
+
else
|
88
|
+
body[:sort] = ["_doc"]
|
89
|
+
end
|
90
|
+
search_option = { index: @index, type: @index_type, scroll: @scroll, body: body, size: @size }
|
91
|
+
search_option[:_source] = @fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
|
92
|
+
search_option
|
93
|
+
end
|
22
94
|
end
|
23
95
|
end
|
24
96
|
end
|
data/lib/embulk/input/elasticsearch/converter.rb
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class Elasticsearch < InputPlugin
|
4
|
+
class Converter
|
5
|
+
def self.get_sources(results, fields)
|
6
|
+
hits = results['hits']['hits']
|
7
|
+
hits.map { |hit|
|
8
|
+
result = hit['_source']
|
9
|
+
fields.map { |field|
|
10
|
+
value = field['metadata'] ? hit[field['name']] : result[field['name']]
|
11
|
+
convert_value(value, field)
|
12
|
+
}
|
13
|
+
}
|
14
|
+
end
|
15
|
+
|
16
|
+
def self.convert_value(value, field)
|
17
|
+
return nil if value.nil?
|
18
|
+
case field["type"]
|
19
|
+
when "string"
|
20
|
+
value
|
21
|
+
when "long"
|
22
|
+
value.to_i
|
23
|
+
when "double"
|
24
|
+
value.to_f
|
25
|
+
when "boolean"
|
26
|
+
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
27
|
+
value
|
28
|
+
else
|
29
|
+
downcased_val = value.downcase
|
30
|
+
case downcased_val
|
31
|
+
when 'true' then true
|
32
|
+
when 'false' then false
|
33
|
+
when '1' then true
|
34
|
+
when '0' then false
|
35
|
+
else nil
|
36
|
+
end
|
37
|
+
end
|
38
|
+
when "timestamp"
|
39
|
+
Time.parse(value)
|
40
|
+
when "json"
|
41
|
+
value
|
42
|
+
else
|
43
|
+
raise "Unsupported type #{field['type']}"
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
data/test/helper.rb
CHANGED
[hunk not captured in this extract: +2 -0]
data/test/test_converter.rb
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
require_relative './helper'
|
2
|
+
|
3
|
+
Elasticsearch = Embulk::Input::Elasticsearch
|
4
|
+
|
5
|
+
module Embulk
|
6
|
+
class Input::Elasticsearch
|
7
|
+
class TestConverter < Test::Unit::TestCase
|
8
|
+
|
9
|
+
def startup
|
10
|
+
end
|
11
|
+
|
12
|
+
def shutdown
|
13
|
+
end
|
14
|
+
|
15
|
+
sub_test_case "get_sources" do
|
16
|
+
def test_normal
|
17
|
+
fields = [
|
18
|
+
{"name"=>"_id", "type"=>"string", "metadata"=>true},
|
19
|
+
{"name"=>"product_id", "type"=>"long"},
|
20
|
+
{"name"=>"title", "type"=>"string"}
|
21
|
+
]
|
22
|
+
|
23
|
+
results = {
|
24
|
+
"_scroll_id"=>"cXVlcnlUaGVuRmV0Y2g7NTsxNzg3MjE6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjI6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjM6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjU6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjQ6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTswOw==",
|
25
|
+
"took"=>41,
|
26
|
+
"timed_out"=>false,
|
27
|
+
"_shards"=>{"total"=>5, "successful"=>5, "failed"=>0},
|
28
|
+
"hits"=>{
|
29
|
+
"total"=>1,
|
30
|
+
"max_score"=>nil,
|
31
|
+
"hits"=>[
|
32
|
+
{
|
33
|
+
"_index"=>"test_index",
|
34
|
+
"_type"=>"test_type",
|
35
|
+
"_id"=>"AVTCxiCuNR-BVKOgUB7R",
|
36
|
+
"_score"=>nil,
|
37
|
+
"_source"=>{
|
38
|
+
"title"=>"dummy title",
|
39
|
+
"product_id"=>1
|
40
|
+
},
|
41
|
+
"sort"=>[12534]
|
42
|
+
}
|
43
|
+
]
|
44
|
+
}
|
45
|
+
}
|
46
|
+
assert_equal Converter.get_sources(results, fields), [["AVTCxiCuNR-BVKOgUB7R", 1, "dummy title"]]
|
47
|
+
end
|
48
|
+
end
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
data/test/test_input_thread.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
require_relative './helper'
|
2
|
+
|
3
|
+
Elasticsearch = Embulk::Input::Elasticsearch
|
4
|
+
|
5
|
+
module Embulk
|
6
|
+
class Input::Elasticsearch
|
7
|
+
class TestTransaction < Test::Unit::TestCase
|
8
|
+
sub_test_case "get_slice_from_num_threads" do
|
9
|
+
def test_normal
|
10
|
+
slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
|
11
|
+
assert_equal slice.size, 5
|
12
|
+
assert_equal slice.first.size, 2
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_normal_same
|
16
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
|
17
|
+
assert_equal slice.size, 3
|
18
|
+
assert_equal slice.first.size, 1
|
19
|
+
end
|
20
|
+
|
21
|
+
def test_num_threads_over_array_size
|
22
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
|
23
|
+
assert_equal slice.size, 3
|
24
|
+
assert_equal slice.first.size, 1
|
25
|
+
end
|
26
|
+
|
27
|
+
def test_rest
|
28
|
+
slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
|
29
|
+
assert_equal slice.size, 7
|
30
|
+
assert_equal slice.first.size, 3
|
31
|
+
end
|
32
|
+
end
|
33
|
+
end
|
34
|
+
end
|
35
|
+
end
|
36
|
+
|
data/test/test_transaction.rb
CHANGED
@@ -1,5 +1,4 @@
|
|
1
1
|
require_relative './helper'
|
2
|
-
require 'embulk/input/elasticsearch'
|
3
2
|
require 'yaml'
|
4
3
|
|
5
4
|
Elasticsearch = Embulk::Input::Elasticsearch
|
@@ -10,33 +9,6 @@ module Embulk
|
|
10
9
|
def control
|
11
10
|
Proc.new {|task| task_reports = [] }
|
12
11
|
end
|
13
|
-
|
14
|
-
sub_test_case "get_slice_from_num_threads" do
|
15
|
-
def test_normal
|
16
|
-
slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
|
17
|
-
assert_equal slice.size, 5
|
18
|
-
assert_equal slice.first.size, 2
|
19
|
-
end
|
20
|
-
|
21
|
-
def test_normal_same
|
22
|
-
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
|
23
|
-
assert_equal slice.size, 3
|
24
|
-
assert_equal slice.first.size, 1
|
25
|
-
end
|
26
|
-
|
27
|
-
def test_num_threads_over_array_size
|
28
|
-
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
|
29
|
-
assert_equal slice.size, 3
|
30
|
-
assert_equal slice.first.size, 1
|
31
|
-
end
|
32
|
-
|
33
|
-
def test_rest
|
34
|
-
slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
|
35
|
-
assert_equal slice.size, 7
|
36
|
-
assert_equal slice.first.size, 3
|
37
|
-
end
|
38
|
-
end
|
39
|
-
|
40
12
|
sub_test_case "transaction" do
|
41
13
|
def test_normal
|
42
14
|
yaml = YAML.load(%(
|
@@ -75,4 +47,4 @@ module Embulk
|
|
75
47
|
end
|
76
48
|
end
|
77
49
|
end
|
78
|
-
end
|
50
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-elasticsearch
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.3.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-03-
|
11
|
+
date: 2017-03-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -111,8 +111,11 @@ files:
|
|
111
111
|
- embulk-input-elasticsearch.gemspec
|
112
112
|
- lib/embulk/input/elasticsearch.rb
|
113
113
|
- lib/embulk/input/elasticsearch/connection.rb
|
114
|
+
- lib/embulk/input/elasticsearch/converter.rb
|
114
115
|
- lib/embulk/input/elasticsearch/input_thread.rb
|
115
116
|
- test/helper.rb
|
117
|
+
- test/test_converter.rb
|
118
|
+
- test/test_input_thread.rb
|
116
119
|
- test/test_transaction.rb
|
117
120
|
homepage: https://github.com/toyama0919/embulk-input-elasticsearch
|
118
121
|
licenses:
|
@@ -140,4 +143,6 @@ specification_version: 4
|
|
140
143
|
summary: Elasticsearch input plugin for Embulk
|
141
144
|
test_files:
|
142
145
|
- test/helper.rb
|
146
|
+
- test/test_converter.rb
|
147
|
+
- test/test_input_thread.rb
|
143
148
|
- test/test_transaction.rb
|