embulk-input-elasticsearch 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0991bd544cf235c70290b6d307f164112280d1ea
|
4
|
+
data.tar.gz: 4e578ad476510b38eaa09ef9a20437f1548cc70c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b47b9034846fc515fda638e231b5c62f13b0c8fecf1fba8ef16ad8f084f967518b2089df5a95e7681bb6e9af2dd918da452b843d1072dd04e22f01f504af674e
|
7
|
+
data.tar.gz: ddc4879a8a6c0524499f1b10f3b1e871c1920024aefa81e1c4bf562c44533cc161ea6d8d202bf1dd89af30688f35790c3f2f9b0a9a637a9af9ebdd86e32b6e09
|
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
Gem::Specification.new do |spec|
|
3
3
|
spec.name = "embulk-input-elasticsearch"
|
4
|
-
spec.version = "0.2.1"
|
4
|
+
spec.version = "0.3.0"
|
5
5
|
spec.authors = ["toyama0919"]
|
6
6
|
spec.summary = "Elasticsearch input plugin for Embulk"
|
7
7
|
spec.description = "Loads records from Elasticsearch. parallel query support."
|
@@ -1,11 +1,14 @@
|
|
1
1
|
require 'excon'
|
2
2
|
require 'elasticsearch'
|
3
|
+
require_relative 'elasticsearch/connection'
|
4
|
+
require_relative 'elasticsearch/input_thread'
|
3
5
|
|
4
6
|
module Embulk
|
5
7
|
module Input
|
6
8
|
|
7
9
|
class Elasticsearch < InputPlugin
|
8
10
|
Plugin.register_input("elasticsearch", self)
|
11
|
+
ADD_QUERY_TO_RECORD_KEY = 'query'
|
9
12
|
|
10
13
|
def self.transaction(config, &control)
|
11
14
|
task = {
|
@@ -25,31 +28,19 @@ module Embulk
|
|
25
28
|
}
|
26
29
|
# TODO: want max_threads
|
27
30
|
define_num_threads = config.param("num_threads", :integer, default: 1)
|
28
|
-
task['slice_queries'] = get_slice_from_num_threads(task['queries'], define_num_threads)
|
31
|
+
task['slice_queries'] = InputThread.get_slice_from_num_threads(task['queries'], define_num_threads)
|
29
32
|
|
30
33
|
columns = []
|
31
34
|
task['fields'].each_with_index{ |field, i|
|
32
35
|
columns << Column.new(i, field['name'], field['type'].to_sym)
|
33
36
|
}
|
34
37
|
if task['add_query_to_record']
|
35
|
-
columns << Column.new(task['fields'].size,
|
38
|
+
columns << Column.new(task['fields'].size, ADD_QUERY_TO_RECORD_KEY, :string)
|
36
39
|
end
|
37
40
|
|
38
41
|
resume(task, columns, task['slice_queries'].size, &control)
|
39
42
|
end
|
40
43
|
|
41
|
-
def self.get_slice_from_num_threads(array, define_num_threads)
|
42
|
-
num_threads = array.size < define_num_threads ? array.size : define_num_threads
|
43
|
-
per_queries = if (array.size % num_threads) == 0
|
44
|
-
(array.size / num_threads)
|
45
|
-
else
|
46
|
-
(array.size / num_threads) + 1
|
47
|
-
end
|
48
|
-
sliced = array.each_slice(per_queries).to_a
|
49
|
-
Embulk.logger.info("calculate num threads => #{sliced.size}")
|
50
|
-
return sliced
|
51
|
-
end
|
52
|
-
|
53
44
|
def self.resume(task, columns, count, &control)
|
54
45
|
task_reports = yield(task, columns, count)
|
55
46
|
|
@@ -57,28 +48,10 @@ module Embulk
|
|
57
48
|
return next_config_diff
|
58
49
|
end
|
59
50
|
|
60
|
-
def self.create_client(task)
|
61
|
-
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
62
|
-
{
|
63
|
-
hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
64
|
-
options: {
|
65
|
-
reload_connections: task['reload_connections'],
|
66
|
-
reload_on_failure: task['reload_on_failure'],
|
67
|
-
retry_on_failure: task['retry_on_failure'],
|
68
|
-
transport_options: {
|
69
|
-
request: { timeout: task['request_timeout'] }
|
70
|
-
}
|
71
|
-
}
|
72
|
-
}
|
73
|
-
)
|
74
|
-
|
75
|
-
::Elasticsearch::Client.new transport: transport
|
76
|
-
end
|
77
|
-
|
78
51
|
def init
|
79
52
|
@queries = task['slice_queries'][@index]
|
80
53
|
Embulk.logger.info("this thread queries => #{@queries}")
|
81
|
-
@client =
|
54
|
+
@client = Connection.create_client(task)
|
82
55
|
@index_name = task['index']
|
83
56
|
@index_type = task['index_type']
|
84
57
|
@per_size = task['per_size']
|
@@ -89,27 +62,7 @@ module Embulk
|
|
89
62
|
end
|
90
63
|
|
91
64
|
def run
|
92
|
-
@
|
93
|
-
query_count = 0
|
94
|
-
no_source_results = search(@index_type, query, 0, 0, @routing, @fields, @sort)
|
95
|
-
total_count = [no_source_results['hits']['total'], @limit_size].compact.min
|
96
|
-
while true
|
97
|
-
now_results_size = query_count * @per_size
|
98
|
-
next_results_size = (query_count + 1) * @per_size
|
99
|
-
size = get_size(next_results_size, now_results_size ,total_count)
|
100
|
-
break if size == 0
|
101
|
-
|
102
|
-
results = get_sources(search(@index_type, query, size, now_results_size, @routing, @fields, @sort), @fields)
|
103
|
-
results.each do |record|
|
104
|
-
if @add_query_to_record
|
105
|
-
record << query
|
106
|
-
end
|
107
|
-
page_builder.add(record)
|
108
|
-
end
|
109
|
-
break if last_query?(next_results_size ,total_count)
|
110
|
-
query_count += 1
|
111
|
-
end
|
112
|
-
end
|
65
|
+
search(@index_type, @per_size, @routing, @fields, @sort)
|
113
66
|
page_builder.finish
|
114
67
|
|
115
68
|
task_report = {}
|
@@ -118,65 +71,52 @@ module Embulk
|
|
118
71
|
|
119
72
|
private
|
120
73
|
|
121
|
-
def
|
122
|
-
|
123
|
-
|
124
|
-
when "string"
|
125
|
-
value
|
126
|
-
when "long"
|
127
|
-
value.to_i
|
128
|
-
when "double"
|
129
|
-
value.to_f
|
130
|
-
when "boolean"
|
131
|
-
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
132
|
-
value
|
133
|
-
else
|
134
|
-
downcased_val = value.downcase
|
135
|
-
case downcased_val
|
136
|
-
when 'true' then true
|
137
|
-
when 'false' then false
|
138
|
-
when '1' then true
|
139
|
-
when '0' then false
|
140
|
-
else nil
|
141
|
-
end
|
142
|
-
end
|
143
|
-
when "timestamp"
|
144
|
-
Time.parse(value)
|
145
|
-
when "json"
|
146
|
-
value
|
147
|
-
else
|
148
|
-
raise "Unsupported type #{field['type']}"
|
74
|
+
def search(type, size, routing, fields, sort)
|
75
|
+
@queries.each do |query|
|
76
|
+
search_with_query(query, type, size, routing, fields, sort)
|
149
77
|
end
|
150
78
|
end
|
151
79
|
|
152
|
-
def
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
80
|
+
def search_with_query(query, type, size, routing, fields, sort)
|
81
|
+
search_option = get_search_option(type, query, size, fields, sort)
|
82
|
+
Embulk.logger.info("#{search_option}")
|
83
|
+
r = @client.search(search_option)
|
84
|
+
i = 0
|
85
|
+
get_sources(r, fields).each do |result|
|
86
|
+
result_proc(result, query)
|
87
|
+
return if @limit_size == (i += 1)
|
88
|
+
end
|
89
|
+
|
90
|
+
while r = @client.scroll(scroll_id: r['_scroll_id'], scroll: '1m') and (not r['hits']['hits'].empty?) do
|
91
|
+
get_sources(r, fields).each do |result|
|
92
|
+
result_proc(result, query)
|
93
|
+
return if @limit_size == (i += 1)
|
94
|
+
end
|
157
95
|
end
|
158
96
|
end
|
159
97
|
|
160
|
-
def
|
161
|
-
|
98
|
+
def result_proc(result, query)
|
99
|
+
if @add_query_to_record
|
100
|
+
result << query
|
101
|
+
end
|
102
|
+
page_builder.add(result)
|
162
103
|
end
|
163
104
|
|
164
|
-
def
|
165
|
-
body = {
|
166
|
-
body[:
|
105
|
+
def get_search_option(type, query, size, fields, sort)
|
106
|
+
body = { }
|
107
|
+
body[:query] = { query_string: { query: query } } unless query.nil?
|
167
108
|
if sort
|
168
109
|
sorts = []
|
169
110
|
sort.each do |k, v|
|
170
111
|
sorts << { k => v }
|
171
112
|
end
|
172
113
|
body[:sort] = sorts
|
114
|
+
else
|
115
|
+
body[:sort] = ["_doc"]
|
173
116
|
end
|
174
|
-
|
175
|
-
search_option = { index: @index_name, type: type, body: body }
|
176
|
-
search_option[:routing] = routing unless routing.nil?
|
117
|
+
search_option = { index: @index_name, type: type, scroll: '1m', body: body, size: size }
|
177
118
|
search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
|
178
|
-
|
179
|
-
@client.search(search_option)
|
119
|
+
search_option
|
180
120
|
end
|
181
121
|
|
182
122
|
def get_sources(results, fields)
|
@@ -191,6 +131,37 @@ module Embulk
|
|
191
131
|
}
|
192
132
|
}
|
193
133
|
end
|
134
|
+
|
135
|
+
def convert_value(value, field)
|
136
|
+
return nil if value.nil?
|
137
|
+
case field["type"]
|
138
|
+
when "string"
|
139
|
+
value
|
140
|
+
when "long"
|
141
|
+
value.to_i
|
142
|
+
when "double"
|
143
|
+
value.to_f
|
144
|
+
when "boolean"
|
145
|
+
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
146
|
+
value
|
147
|
+
else
|
148
|
+
downcased_val = value.downcase
|
149
|
+
case downcased_val
|
150
|
+
when 'true' then true
|
151
|
+
when 'false' then false
|
152
|
+
when '1' then true
|
153
|
+
when '0' then false
|
154
|
+
else nil
|
155
|
+
end
|
156
|
+
end
|
157
|
+
when "timestamp"
|
158
|
+
Time.parse(value)
|
159
|
+
when "json"
|
160
|
+
value
|
161
|
+
else
|
162
|
+
raise "Unsupported type #{field['type']}"
|
163
|
+
end
|
164
|
+
end
|
194
165
|
end
|
195
166
|
end
|
196
167
|
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class Elasticsearch < InputPlugin
|
4
|
+
class Connection
|
5
|
+
def self.create_client(task)
|
6
|
+
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
7
|
+
{
|
8
|
+
hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
9
|
+
options: {
|
10
|
+
reload_connections: task['reload_connections'],
|
11
|
+
reload_on_failure: task['reload_on_failure'],
|
12
|
+
retry_on_failure: task['retry_on_failure'],
|
13
|
+
transport_options: {
|
14
|
+
request: { timeout: task['request_timeout'] }
|
15
|
+
}
|
16
|
+
}
|
17
|
+
}
|
18
|
+
)
|
19
|
+
|
20
|
+
::Elasticsearch::Client.new transport: transport
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class Elasticsearch < InputPlugin
|
4
|
+
class InputThread
|
5
|
+
def self.get_slice_from_num_threads(array, define_num_threads)
|
6
|
+
num_threads = array.size < define_num_threads ? array.size : define_num_threads
|
7
|
+
per_queries = if (array.size % num_threads) == 0
|
8
|
+
(array.size / num_threads)
|
9
|
+
else
|
10
|
+
(array.size / num_threads) + 1
|
11
|
+
end
|
12
|
+
sliced = array.each_slice(per_queries).to_a
|
13
|
+
Embulk.logger.info("calculate num threads => #{sliced.size}")
|
14
|
+
return sliced
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/test/test_transaction.rb
CHANGED
@@ -13,25 +13,25 @@ module Embulk
|
|
13
13
|
|
14
14
|
sub_test_case "get_slice_from_num_threads" do
|
15
15
|
def test_normal
|
16
|
-
slice =
|
16
|
+
slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
|
17
17
|
assert_equal slice.size, 5
|
18
18
|
assert_equal slice.first.size, 2
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_normal_same
|
22
|
-
slice =
|
22
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
|
23
23
|
assert_equal slice.size, 3
|
24
24
|
assert_equal slice.first.size, 1
|
25
25
|
end
|
26
26
|
|
27
27
|
def test_num_threads_over_array_size
|
28
|
-
slice =
|
28
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
|
29
29
|
assert_equal slice.size, 3
|
30
30
|
assert_equal slice.first.size, 1
|
31
31
|
end
|
32
32
|
|
33
33
|
def test_rest
|
34
|
-
slice =
|
34
|
+
slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
|
35
35
|
assert_equal slice.size, 7
|
36
36
|
assert_equal slice.first.size, 3
|
37
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-elasticsearch
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -110,6 +110,8 @@ files:
|
|
110
110
|
- Rakefile
|
111
111
|
- embulk-input-elasticsearch.gemspec
|
112
112
|
- lib/embulk/input/elasticsearch.rb
|
113
|
+
- lib/embulk/input/elasticsearch/connection.rb
|
114
|
+
- lib/embulk/input/elasticsearch/input_thread.rb
|
113
115
|
- test/helper.rb
|
114
116
|
- test/test_transaction.rb
|
115
117
|
homepage: https://github.com/toyama0919/embulk-input-elasticsearch
|