embulk-input-elasticsearch 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 0991bd544cf235c70290b6d307f164112280d1ea
|
4
|
+
data.tar.gz: 4e578ad476510b38eaa09ef9a20437f1548cc70c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b47b9034846fc515fda638e231b5c62f13b0c8fecf1fba8ef16ad8f084f967518b2089df5a95e7681bb6e9af2dd918da452b843d1072dd04e22f01f504af674e
|
7
|
+
data.tar.gz: ddc4879a8a6c0524499f1b10f3b1e871c1920024aefa81e1c4bf562c44533cc161ea6d8d202bf1dd89af30688f35790c3f2f9b0a9a637a9af9ebdd86e32b6e09
|
data/embulk-input-elasticsearch.gemspec
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
Gem::Specification.new do |spec|
|
3
3
|
spec.name = "embulk-input-elasticsearch"
|
4
|
-
spec.version = "0.2.1"
|
4
|
+
spec.version = "0.3.0"
|
5
5
|
spec.authors = ["toyama0919"]
|
6
6
|
spec.summary = "Elasticsearch input plugin for Embulk"
|
7
7
|
spec.description = "Loads records from Elasticsearch. parallel query support."
|
data/lib/embulk/input/elasticsearch.rb
CHANGED
@@ -1,11 +1,14 @@
|
|
1
1
|
require 'excon'
|
2
2
|
require 'elasticsearch'
|
3
|
+
require_relative 'elasticsearch/connection'
|
4
|
+
require_relative 'elasticsearch/input_thread'
|
3
5
|
|
4
6
|
module Embulk
|
5
7
|
module Input
|
6
8
|
|
7
9
|
class Elasticsearch < InputPlugin
|
8
10
|
Plugin.register_input("elasticsearch", self)
|
11
|
+
ADD_QUERY_TO_RECORD_KEY = 'query'
|
9
12
|
|
10
13
|
def self.transaction(config, &control)
|
11
14
|
task = {
|
@@ -25,31 +28,19 @@ module Embulk
|
|
25
28
|
}
|
26
29
|
# TODO: want max_threads
|
27
30
|
define_num_threads = config.param("num_threads", :integer, default: 1)
|
28
|
-
task['slice_queries'] = get_slice_from_num_threads(task['queries'], define_num_threads)
|
31
|
+
task['slice_queries'] = InputThread.get_slice_from_num_threads(task['queries'], define_num_threads)
|
29
32
|
|
30
33
|
columns = []
|
31
34
|
task['fields'].each_with_index{ |field, i|
|
32
35
|
columns << Column.new(i, field['name'], field['type'].to_sym)
|
33
36
|
}
|
34
37
|
if task['add_query_to_record']
|
35
|
-
columns << Column.new(task['fields'].size,
|
38
|
+
columns << Column.new(task['fields'].size, ADD_QUERY_TO_RECORD_KEY, :string)
|
36
39
|
end
|
37
40
|
|
38
41
|
resume(task, columns, task['slice_queries'].size, &control)
|
39
42
|
end
|
40
43
|
|
41
|
-
def self.get_slice_from_num_threads(array, define_num_threads)
|
42
|
-
num_threads = array.size < define_num_threads ? array.size : define_num_threads
|
43
|
-
per_queries = if (array.size % num_threads) == 0
|
44
|
-
(array.size / num_threads)
|
45
|
-
else
|
46
|
-
(array.size / num_threads) + 1
|
47
|
-
end
|
48
|
-
sliced = array.each_slice(per_queries).to_a
|
49
|
-
Embulk.logger.info("calculate num threads => #{sliced.size}")
|
50
|
-
return sliced
|
51
|
-
end
|
52
|
-
|
53
44
|
def self.resume(task, columns, count, &control)
|
54
45
|
task_reports = yield(task, columns, count)
|
55
46
|
|
@@ -57,28 +48,10 @@ module Embulk
|
|
57
48
|
return next_config_diff
|
58
49
|
end
|
59
50
|
|
60
|
-
def self.create_client(task)
|
61
|
-
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
62
|
-
{
|
63
|
-
hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
64
|
-
options: {
|
65
|
-
reload_connections: task['reload_connections'],
|
66
|
-
reload_on_failure: task['reload_on_failure'],
|
67
|
-
retry_on_failure: task['retry_on_failure'],
|
68
|
-
transport_options: {
|
69
|
-
request: { timeout: task['request_timeout'] }
|
70
|
-
}
|
71
|
-
}
|
72
|
-
}
|
73
|
-
)
|
74
|
-
|
75
|
-
::Elasticsearch::Client.new transport: transport
|
76
|
-
end
|
77
|
-
|
78
51
|
def init
|
79
52
|
@queries = task['slice_queries'][@index]
|
80
53
|
Embulk.logger.info("this thread queries => #{@queries}")
|
81
|
-
@client =
|
54
|
+
@client = Connection.create_client(task)
|
82
55
|
@index_name = task['index']
|
83
56
|
@index_type = task['index_type']
|
84
57
|
@per_size = task['per_size']
|
@@ -89,27 +62,7 @@ module Embulk
|
|
89
62
|
end
|
90
63
|
|
91
64
|
def run
|
92
|
-
@
|
93
|
-
query_count = 0
|
94
|
-
no_source_results = search(@index_type, query, 0, 0, @routing, @fields, @sort)
|
95
|
-
total_count = [no_source_results['hits']['total'], @limit_size].compact.min
|
96
|
-
while true
|
97
|
-
now_results_size = query_count * @per_size
|
98
|
-
next_results_size = (query_count + 1) * @per_size
|
99
|
-
size = get_size(next_results_size, now_results_size ,total_count)
|
100
|
-
break if size == 0
|
101
|
-
|
102
|
-
results = get_sources(search(@index_type, query, size, now_results_size, @routing, @fields, @sort), @fields)
|
103
|
-
results.each do |record|
|
104
|
-
if @add_query_to_record
|
105
|
-
record << query
|
106
|
-
end
|
107
|
-
page_builder.add(record)
|
108
|
-
end
|
109
|
-
break if last_query?(next_results_size ,total_count)
|
110
|
-
query_count += 1
|
111
|
-
end
|
112
|
-
end
|
65
|
+
search(@index_type, @per_size, @routing, @fields, @sort)
|
113
66
|
page_builder.finish
|
114
67
|
|
115
68
|
task_report = {}
|
@@ -118,65 +71,52 @@ module Embulk
|
|
118
71
|
|
119
72
|
private
|
120
73
|
|
121
|
-
def
|
122
|
-
|
123
|
-
|
124
|
-
when "string"
|
125
|
-
value
|
126
|
-
when "long"
|
127
|
-
value.to_i
|
128
|
-
when "double"
|
129
|
-
value.to_f
|
130
|
-
when "boolean"
|
131
|
-
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
132
|
-
value
|
133
|
-
else
|
134
|
-
downcased_val = value.downcase
|
135
|
-
case downcased_val
|
136
|
-
when 'true' then true
|
137
|
-
when 'false' then false
|
138
|
-
when '1' then true
|
139
|
-
when '0' then false
|
140
|
-
else nil
|
141
|
-
end
|
142
|
-
end
|
143
|
-
when "timestamp"
|
144
|
-
Time.parse(value)
|
145
|
-
when "json"
|
146
|
-
value
|
147
|
-
else
|
148
|
-
raise "Unsupported type #{field['type']}"
|
74
|
+
def search(type, size, routing, fields, sort)
|
75
|
+
@queries.each do |query|
|
76
|
+
search_with_query(query, type, size, routing, fields, sort)
|
149
77
|
end
|
150
78
|
end
|
151
79
|
|
152
|
-
def
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
80
|
+
def search_with_query(query, type, size, routing, fields, sort)
|
81
|
+
search_option = get_search_option(type, query, size, fields, sort)
|
82
|
+
Embulk.logger.info("#{search_option}")
|
83
|
+
r = @client.search(search_option)
|
84
|
+
i = 0
|
85
|
+
get_sources(r, fields).each do |result|
|
86
|
+
result_proc(result, query)
|
87
|
+
return if @limit_size == (i += 1)
|
88
|
+
end
|
89
|
+
|
90
|
+
while r = @client.scroll(scroll_id: r['_scroll_id'], scroll: '1m') and (not r['hits']['hits'].empty?) do
|
91
|
+
get_sources(r, fields).each do |result|
|
92
|
+
result_proc(result, query)
|
93
|
+
return if @limit_size == (i += 1)
|
94
|
+
end
|
157
95
|
end
|
158
96
|
end
|
159
97
|
|
160
|
-
def
|
161
|
-
|
98
|
+
def result_proc(result, query)
|
99
|
+
if @add_query_to_record
|
100
|
+
result << query
|
101
|
+
end
|
102
|
+
page_builder.add(result)
|
162
103
|
end
|
163
104
|
|
164
|
-
def
|
165
|
-
body = {
|
166
|
-
body[:
|
105
|
+
def get_search_option(type, query, size, fields, sort)
|
106
|
+
body = { }
|
107
|
+
body[:query] = { query_string: { query: query } } unless query.nil?
|
167
108
|
if sort
|
168
109
|
sorts = []
|
169
110
|
sort.each do |k, v|
|
170
111
|
sorts << { k => v }
|
171
112
|
end
|
172
113
|
body[:sort] = sorts
|
114
|
+
else
|
115
|
+
body[:sort] = ["_doc"]
|
173
116
|
end
|
174
|
-
|
175
|
-
search_option = { index: @index_name, type: type, body: body }
|
176
|
-
search_option[:routing] = routing unless routing.nil?
|
117
|
+
search_option = { index: @index_name, type: type, scroll: '1m', body: body, size: size }
|
177
118
|
search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
|
178
|
-
|
179
|
-
@client.search(search_option)
|
119
|
+
search_option
|
180
120
|
end
|
181
121
|
|
182
122
|
def get_sources(results, fields)
|
@@ -191,6 +131,37 @@ module Embulk
|
|
191
131
|
}
|
192
132
|
}
|
193
133
|
end
|
134
|
+
|
135
|
+
def convert_value(value, field)
|
136
|
+
return nil if value.nil?
|
137
|
+
case field["type"]
|
138
|
+
when "string"
|
139
|
+
value
|
140
|
+
when "long"
|
141
|
+
value.to_i
|
142
|
+
when "double"
|
143
|
+
value.to_f
|
144
|
+
when "boolean"
|
145
|
+
if value.is_a?(TrueClass) || value.is_a?(FalseClass)
|
146
|
+
value
|
147
|
+
else
|
148
|
+
downcased_val = value.downcase
|
149
|
+
case downcased_val
|
150
|
+
when 'true' then true
|
151
|
+
when 'false' then false
|
152
|
+
when '1' then true
|
153
|
+
when '0' then false
|
154
|
+
else nil
|
155
|
+
end
|
156
|
+
end
|
157
|
+
when "timestamp"
|
158
|
+
Time.parse(value)
|
159
|
+
when "json"
|
160
|
+
value
|
161
|
+
else
|
162
|
+
raise "Unsupported type #{field['type']}"
|
163
|
+
end
|
164
|
+
end
|
194
165
|
end
|
195
166
|
end
|
196
167
|
end
|
data/lib/embulk/input/elasticsearch/connection.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class Elasticsearch < InputPlugin
|
4
|
+
class Connection
|
5
|
+
def self.create_client(task)
|
6
|
+
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
7
|
+
{
|
8
|
+
hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
9
|
+
options: {
|
10
|
+
reload_connections: task['reload_connections'],
|
11
|
+
reload_on_failure: task['reload_on_failure'],
|
12
|
+
retry_on_failure: task['retry_on_failure'],
|
13
|
+
transport_options: {
|
14
|
+
request: { timeout: task['request_timeout'] }
|
15
|
+
}
|
16
|
+
}
|
17
|
+
}
|
18
|
+
)
|
19
|
+
|
20
|
+
::Elasticsearch::Client.new transport: transport
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
data/lib/embulk/input/elasticsearch/input_thread.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module Embulk
|
2
|
+
module Input
|
3
|
+
class Elasticsearch < InputPlugin
|
4
|
+
class InputThread
|
5
|
+
def self.get_slice_from_num_threads(array, define_num_threads)
|
6
|
+
num_threads = array.size < define_num_threads ? array.size : define_num_threads
|
7
|
+
per_queries = if (array.size % num_threads) == 0
|
8
|
+
(array.size / num_threads)
|
9
|
+
else
|
10
|
+
(array.size / num_threads) + 1
|
11
|
+
end
|
12
|
+
sliced = array.each_slice(per_queries).to_a
|
13
|
+
Embulk.logger.info("calculate num threads => #{sliced.size}")
|
14
|
+
return sliced
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
data/test/test_transaction.rb
CHANGED
@@ -13,25 +13,25 @@ module Embulk
|
|
13
13
|
|
14
14
|
sub_test_case "get_slice_from_num_threads" do
|
15
15
|
def test_normal
|
16
|
-
slice =
|
16
|
+
slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
|
17
17
|
assert_equal slice.size, 5
|
18
18
|
assert_equal slice.first.size, 2
|
19
19
|
end
|
20
20
|
|
21
21
|
def test_normal_same
|
22
|
-
slice =
|
22
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
|
23
23
|
assert_equal slice.size, 3
|
24
24
|
assert_equal slice.first.size, 1
|
25
25
|
end
|
26
26
|
|
27
27
|
def test_num_threads_over_array_size
|
28
|
-
slice =
|
28
|
+
slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
|
29
29
|
assert_equal slice.size, 3
|
30
30
|
assert_equal slice.first.size, 1
|
31
31
|
end
|
32
32
|
|
33
33
|
def test_rest
|
34
|
-
slice =
|
34
|
+
slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
|
35
35
|
assert_equal slice.size, 7
|
36
36
|
assert_equal slice.first.size, 3
|
37
37
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-input-elasticsearch
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-06-
|
11
|
+
date: 2016-06-24 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -110,6 +110,8 @@ files:
|
|
110
110
|
- Rakefile
|
111
111
|
- embulk-input-elasticsearch.gemspec
|
112
112
|
- lib/embulk/input/elasticsearch.rb
|
113
|
+
- lib/embulk/input/elasticsearch/connection.rb
|
114
|
+
- lib/embulk/input/elasticsearch/input_thread.rb
|
113
115
|
- test/helper.rb
|
114
116
|
- test/test_transaction.rb
|
115
117
|
homepage: https://github.com/toyama0919/embulk-input-elasticsearch
|