embulk-output-elasticsearch_ruby 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +40 -21
- data/embulk-output-elasticsearch_ruby.gemspec +1 -1
- data/lib/embulk/output/elasticsearch/connection.rb +129 -0
- data/lib/embulk/output/elasticsearch_ruby.rb +38 -124
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 6824687f4de2bdcc12467725ea585462b9b0aa21
+  data.tar.gz: 337d8df78354516360a8c922b7dda2fe49abb12f
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: ab542815971e42add2330522e9ea867c3e8542c5003155ec9df69ea76d55b35f8289853326eee52e855b72ff8fa2e171cb2a4ee8a1f8bc9dcc572987a341de22
+  data.tar.gz: a734d8594b15f0be35b12cdf64d3b51d3c87427e7e2d5fe0865b55e281cf77d63d588b1f3b16c4a1ceaee38a2a7e0620f3f7bd85d6f618a71f0b041f3cbcb9ff
data/README.md
CHANGED
@@ -1,4 +1,4 @@
-# Elasticsearch Ruby output plugin for Embulk
+# Elasticsearch Ruby output plugin for Embulk [](http://badge.fury.io/rb/embulk-output-elasticsearch_ruby)
 
 Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible.
 
@@ -10,26 +10,28 @@ Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatibl
 * **Cleanup supported**: yes
 
 ## Configuration
-
-- **host
-- **port
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+- **nodes** nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
+- **host** host (string)
+- **port** port (string)
+- **request_timeout** request timeout (integer, default: 60)
+- **index_type** index type (string)
+- **mode** mode (string, default: 'normal')
+- **reload_connections** reload connections (bool, default: true)
+- **reload_on_failure** reload on failure (bool, default: false)
+- **delete_old_index** delete old index (bool, default: false)
+- **delete_old_alias** delete old alias (bool, default: true)
+- **id_keys** id keys (array, default: nil)
+- **id_format** id format (string, default: nil)
+- **array_columns** array columns (array, default: nil)
+- **bulk_actions** bulk actions (integer, default: 1000)
+- **retry_on_failure** retry on failure (integer, default: 5)
+- **current_index_name** current index name (string, default: nil)
+- **index** index (string, default: 'logstash-%Y.%m.%d')
+- **before_delete_index** before delete index (bool, default: false)
+- **before_template_name** before template name (string, default: nil)
+- **before_template** before template (hash, default: nil)
 
-## Example
+## Example(minimum settings)
 
 ```yaml
 out:
@@ -39,7 +41,7 @@ out:
   index_type: page
 ```
 
-## Example(update)
+## Example(update mode)
 
 ```yaml
 out:
@@ -56,6 +58,23 @@ out:
     - _id
 ```
 
+## Example(replace mode)
+
+```yaml
+out:
+  type: elasticsearch_ruby
+  nodes:
+  - {host: localhost, port: 9200}
+  index: test_alias
+  index_type: crawl_companies
+  mode: replace
+  delete_old_index: true
+  before_delete_index: true
+  bulk_actions: 1000
+  request_timeout: 60
+```
+
+* create alias
 
 ## Build
 
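For orientation, the replace-mode flow documented above amounts to: load into a fresh, timestamped index, point the alias at it, then drop the alias entries (and optionally the indices) left over from the previous run. The sketch below condenses that swap with the elasticsearch-ruby client; the index and alias names come from the example above, the timestamp suffix is made up, and it mirrors what the new Connection#create_aliases / #delete_aliases (added later in this diff) do during cleanup, not a public plugin API.

```ruby
require 'elasticsearch'

# Hypothetical names: `test_alias` is the `index` option (used as the alias in
# replace mode); the timestamped index is the one the plugin just loaded into.
client     = Elasticsearch::Client.new(hosts: [{ host: 'localhost', port: 9200 }])
alias_name = 'test_alias'
new_index  = 'test_alias-crawl_companies-2017.03.27.12.00.00'

# Point the alias at the freshly loaded index.
client.indices.update_aliases body: {
  actions: [{ add: { index: new_index, alias: alias_name } }]
}

# Detach every other index still behind the alias, and delete it as well
# when the delete_old_index option is on.
client.indices.get_alias(name: alias_name).keys.each do |index|
  next if index == new_index
  client.indices.delete_alias index: index, name: alias_name  # delete_old_alias: true
  client.indices.delete index: index                          # delete_old_index: true
end
```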
data/embulk-output-elasticsearch_ruby.gemspec
CHANGED
@@ -1,7 +1,7 @@
 
 Gem::Specification.new do |spec|
   spec.name = "embulk-output-elasticsearch_ruby"
-  spec.version = "0.1.4"
+  spec.version = "0.1.5"
   spec.authors = ["toyama0919"]
   spec.summary = "Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible."
   spec.description = "Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible."
data/lib/embulk/output/elasticsearch/connection.rb
ADDED
@@ -0,0 +1,129 @@
+require 'excon'
+require 'elasticsearch'
+
+module Embulk
+  module Output
+    class Elasticsearch < OutputPlugin
+      class Connection
+        def initialize(task)
+          @nodes = task["nodes"]
+          @index_type = task["index_type"]
+          @id_keys = task["id_keys"]
+          @id_format = task["id_format"]
+          @array_columns = task["array_columns"]
+          @retry_on_failure = task["retry_on_failure"]
+          @mode = task["mode"]
+          @delete_old_index = task['delete_old_index']
+          @delete_old_alias = task['delete_old_alias']
+          @index = task['index']
+          @alias = task['alias']
+          @action = (@mode == 'update') ? :update : :index
+
+          @client = create_client(
+            nodes: task['nodes'],
+            reload_connections: task['reload_connections'],
+            reload_on_failure: task['reload_on_failure'],
+            retry_on_failure: task['retry_on_failure'],
+            request_timeout: task['request_timeout']
+          )
+        end
+
+        def create_client(nodes:, reload_connections:, reload_on_failure:, retry_on_failure:, request_timeout:)
+          transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
+            {
+              hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
+              options: {
+                reload_connections: reload_connections,
+                reload_on_failure: reload_on_failure,
+                retry_on_failure: retry_on_failure,
+                transport_options: {
+                  request: { timeout: request_timeout }
+                }
+              }
+            }
+          )
+          ::Elasticsearch::Client.new transport: transport
+        end
+
+        def put_template(before_template_name, before_template)
+          Embulk.logger.info("put template => #{before_template_name}")
+          @client.indices.put_template name: before_template_name, body: before_template
+        end
+
+        def create_aliases
+          @client.indices.update_aliases body: {
+            actions: [{ add: { index: @index, alias: @alias } }]
+          }
+          Embulk.logger.info "created alias: #{@alias}, index: #{@index}"
+        end
+
+        def delete_aliases
+          indices = @client.indices.get_alias(name: @alias).keys
+          indices.each do |index|
+            if index != @index
+              if @delete_old_alias
+                @client.indices.delete_alias index: index, name: @alias
+                Embulk.logger.info "deleted alias: #{@alias}, index: #{index}"
+              end
+              if @delete_old_index
+                delete_index(index)
+              end
+            end
+          end
+        end
+
+        def delete_index(index)
+          indices = @client.cat.indices(format: 'json')
+          if indices.any? { |i| i['index'] == index }
+            @client.indices.delete index: index
+            Embulk.logger.info "deleted index: #{index}"
+          end
+        end
+
+        def send(bulk_message)
+          retries = 0
+          begin
+            @client.bulk body: bulk_message
+            Embulk.logger.info "bulk: #{bulk_message.size/2} success."
+          rescue => e
+            if retries < @retry_on_failure
+              retries += 1
+              Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
+              sleep 2**retries
+              retry
+            end
+            raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
+          end
+        end
+
+        def generate_source(record)
+          result = {}
+
+          record.each { |key, value|
+            result[key] = value
+            next if (value.nil? || !@array_columns)
+            @array_columns.each do |array_column|
+              if array_column['name'] == key
+                array_value = value.split(array_column['delimiter']).reject(&:empty?)
+                array_value = array_value.map(&:to_i) if array_column['is_integer']
+                result[key] = array_value
+              end
+            end
+          }
+          (@mode == 'update') ? {doc: result} : result
+        end
+
+        def generate_id(template, record, id_keys)
+          template % id_keys.map { |key| record[key] }
+        end
+
+        def generate_meta(record)
+          meta = {}
+          meta[@action] = { _index: @index, _type: @index_type }
+          meta[@action][:_id] = generate_id(@id_format, record, @id_keys) unless @id_keys.nil?
+          meta
+        end
+      end
+    end
+  end
+end
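The id and source helpers above are plain Ruby formatting: generate_id fills the sprintf-style id_format template with the record values named by id_keys, and generate_source copies the record while splitting any configured array_columns on their delimiter. A self-contained sketch with hypothetical option values and a made-up record (it mirrors the logic rather than calling the Connection methods):

```ruby
# Hypothetical plugin settings and record.
id_format     = '%s-%s'
id_keys       = ['user', 'date']
array_columns = [{ 'name' => 'tags', 'delimiter' => ',' }]
record        = { 'user' => 'alice', 'date' => '2017.03.27', 'tags' => 'a,b,,c' }

# generate_id: sprintf template filled from the values picked out by id_keys.
id = id_format % id_keys.map { |key| record[key] }
# => "alice-2017.03.27"

# generate_source: copy the record, splitting array columns and dropping blanks;
# a column with 'is_integer' => true would additionally be mapped through to_i.
source = record.dup
array_columns.each do |column|
  value = source[column['name']]
  next if value.nil?
  parts = value.split(column['delimiter']).reject(&:empty?)
  parts = parts.map(&:to_i) if column['is_integer']
  source[column['name']] = parts
end

puts id    # alice-2017.03.27
p source   # {"user"=>"alice", "date"=>"2017.03.27", "tags"=>["a", "b", "c"]}
```

In update mode the plugin wraps this source in { doc: ... } and switches the bulk action from :index to :update, which is what the @action flag and generate_meta above encode.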
data/lib/embulk/output/elasticsearch_ruby.rb
CHANGED
@@ -1,5 +1,4 @@
-
-require 'elasticsearch'
+require_relative 'elasticsearch/connection'
 
 module Embulk
   module Output
@@ -12,40 +11,47 @@ module Embulk
         task = {
           "nodes" => config.param("nodes", :array, default: [{ 'host' => 'localhost', 'port' => 9200 }]),
           "request_timeout" => config.param("request_timeout", :integer, default: 60),
-
+          "index_type" => config.param("index_type", :string),
           "mode" => config.param("mode", :string, default: 'normal'),
           "reload_connections" => config.param("reload_connections", :bool, default: true),
           "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
           "delete_old_index" => config.param("delete_old_index", :bool, default: false),
           "delete_old_alias" => config.param("delete_old_alias", :bool, default: true),
-          "index_type" => config.param("index_type", :string),
           "id_keys" => config.param("id_keys", :array, default: nil),
           "id_format" => config.param("id_format", :string, default: nil),
           "array_columns" => config.param("array_columns", :array, default: nil),
           "bulk_actions" => config.param("bulk_actions", :integer, default: 1000),
           "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
-          "before_template_name" => config.param("before_template_name", :string, default: nil),
-          "before_template" => config.param("before_template", :hash, default: nil),
-          "current_index_name" => config.param("current_index_name", :string, default: nil),
         }
-        task['time_value'] = Time.now.strftime('%Y.%m.%d.%H.%M.%S')
-        task['index'] = Time.now.strftime(task['index'])
-
-        task['current_index_name'] = if task['current_index_name']
-          task['current_index_name']
-        else
-          "#{task['index']}-#{task['index_type']}-#{task['time_value']}"
-        end
 
         unless ENABLE_MODE.include?(task['mode'])
          raise ConfigError.new "`mode` must be one of #{ENABLE_MODE.join(', ')}"
         end
         Embulk.logger.info("mode => #{task['mode']}")
 
-
-
-
-
+        current_index_name = config.param("current_index_name", :string, default: nil)
+        index = config.param("index", :string, default: 'logstash-%Y.%m.%d')
+        if task['mode'] == 'replace'
+          task['alias'] = index
+          task['index'] = if current_index_name
+            current_index_name
+          else
+            "#{index}-#{task['index_type']}-#{Time.now.strftime('%Y.%m.%d.%H.%M.%S')}"
+          end
+        else
+          task['index'] = Time.now.strftime(index)
+        end
+
+        connection = Connection.new(task)
+        before_delete_index = config.param("before_delete_index", :bool, default: false)
+        if before_delete_index
+          connection.delete_index(task['index'])
+        end
+
+        before_template_name = config.param("before_template_name", :string, default: nil)
+        before_template = config.param("before_template", :hash, default: nil)
+        if before_template_name && before_template
+          connection.put_template(before_template_name, before_template)
        end
 
        task_reports = yield(task)
@@ -55,57 +61,12 @@ module Embulk
 
      def self.cleanup(task, schema, count, task_reports)
        if task['mode'] == 'replace'
-
-          create_aliases
-          delete_aliases
+          connection = Connection.new(task)
+          connection.create_aliases
+          connection.delete_aliases
        end
      end
 
-      def self.create_client(task)
-        transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
-          {
-            hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
-            options: {
-              reload_connections: task['reload_connections'],
-              reload_on_failure: task['reload_on_failure'],
-              retry_on_failure: task['retry_on_failure'],
-              transport_options: {
-                request: { timeout: task['request_timeout'] }
-              }
-            }
-          }
-        )
-
-        ::Elasticsearch::Client.new transport: transport
-      end
-
-      def self.create_aliases(client, als, index)
-        client.indices.update_aliases body: {
-          actions: [{ add: { index: index, alias: als } }]
-        }
-        Embulk.logger.info "created alias: #{als}, index: #{index}"
-      end
-
-      def self.delete_aliases(client, task)
-        indices = client.indices.get_alias(name: task['index']).keys
-        indices.each { |index|
-          if index != get_index(task)
-            if task['delete_old_alias']
-              client.indices.delete_alias index: index, name: task['index']
-              Embulk.logger.info "deleted alias: #{task['index']}, index: #{index}"
-            end
-            if task['delete_old_index']
-              client.indices.delete index: index
-              Embulk.logger.info "deleted index: #{index}"
-            end
-          end
-        }
-      end
-
-      def self.get_index(task)
-        task['mode'] == 'replace' ? task['current_index_name'] : task['index']
-      end
-
      #def self.resume(task, schema, count, &control)
      #  task_reports = yield(task)
      #
@@ -114,17 +75,8 @@ module Embulk
      #end
 
      def init
-        @
-        @index_type = task["index_type"]
-        @id_keys = task["id_keys"]
-        @id_format = task["id_format"]
+        @connection = Connection.new(task)
        @bulk_actions = task["bulk_actions"]
-        @array_columns = task["array_columns"]
-        @retry_on_failure = task["retry_on_failure"]
-        @mode = task["mode"]
-        @index = self.class.get_index(task)
-
-        @client = self.class.create_client(task)
        @bulk_message = []
      end
 
@@ -134,22 +86,24 @@ module Embulk
      def add(page)
        page.each do |record|
          hash = Hash[schema.names.zip(record)]
-
-
-
-          meta
-          source
+          meta = @connection.generate_meta(hash)
+          source = @connection.generate_source(hash)
+
+          Embulk.logger.debug("meta => #{meta}")
+          Embulk.logger.debug("source => #{source}")
+
          @bulk_message << meta
          @bulk_message << source
          if @bulk_actions * 2 <= @bulk_message.size
-            send
+            @connection.send(@bulk_message)
+            @bulk_message.clear
          end
        end
      end
 
      def finish
        if @bulk_message.size > 0
-          send
+          @connection.send(@bulk_message)
        end
      end
 
@@ -160,46 +114,6 @@ module Embulk
        task_report = {}
        return task_report
      end
-
-      private
-
-      def generate_array(record)
-        result = {}
-
-        record.each { |key, value|
-          result[key] = value
-          next if (value.nil? || !@array_columns)
-          @array_columns.each do |array_column|
-            if array_column['name'] == key
-              array_value = value.split(array_column['delimiter']).reject(&:empty?)
-              array_value = array_value.map(&:to_i) if array_column['is_integer']
-              result[key] = array_value
-            end
-          end
-        }
-        (@mode == 'update') ? {doc: result} : result
-      end
-
-      def generate_id(template, record, id_keys)
-        template % id_keys.map { |key| record[key] }
-      end
-
-      def send
-        retries = 0
-        begin
-          @client.bulk body: @bulk_message
-          Embulk.logger.info "bulk: #{@bulk_message.size/2} success."
-        rescue => e
-          if retries < @retry_on_failure
-            retries += 1
-            Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
-            sleep 2**retries
-            retry
-          end
-          raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
-        end
-        @bulk_message.clear
-      end
    end
  end
 end
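One behavioural consequence of the refactor above is that transaction now derives the physical index name itself: in replace mode it appends index_type and a timestamp to the configured index (which then doubles as the alias), while in normal/update mode index is treated as a strftime pattern. A minimal sketch of that naming, using hypothetical option values taken from the replace-mode README example:

```ruby
# Hypothetical option values.
index              = 'test_alias'        # `index` option
index_type         = 'crawl_companies'   # `index_type` option
mode               = 'replace'
current_index_name = nil                 # optional explicit override

alias_name = nil
if mode == 'replace'
  alias_name = index                     # the `index` option becomes the alias
  real_index = current_index_name ||
               "#{index}-#{index_type}-#{Time.now.strftime('%Y.%m.%d.%H.%M.%S')}"
else
  # normal / update mode: `index` is a strftime pattern such as logstash-%Y.%m.%d
  real_index = Time.now.strftime(index)
end

puts "index: #{real_index}"              # e.g. test_alias-crawl_companies-2017.03.27.12.00.00
puts "alias: #{alias_name || '(none)'}"
```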
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: embulk-output-elasticsearch_ruby
 version: !ruby/object:Gem::Version
-  version: 0.1.4
+  version: 0.1.5
 platform: ruby
 authors:
 - toyama0919
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2017-
+date: 2017-03-27 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   requirement: !ruby/object:Gem::Requirement
@@ -121,6 +121,7 @@ files:
 - README.md
 - Rakefile
 - embulk-output-elasticsearch_ruby.gemspec
+- lib/embulk/output/elasticsearch/connection.rb
 - lib/embulk/output/elasticsearch_ruby.rb
 - test/helper.rb
 - test/test_transaction.rb