embulk-output-elasticsearch_ruby 0.1.4 → 0.1.5
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +40 -21
- data/embulk-output-elasticsearch_ruby.gemspec +1 -1
- data/lib/embulk/output/elasticsearch/connection.rb +129 -0
- data/lib/embulk/output/elasticsearch_ruby.rb +38 -124
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6824687f4de2bdcc12467725ea585462b9b0aa21
|
4
|
+
data.tar.gz: 337d8df78354516360a8c922b7dda2fe49abb12f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: ab542815971e42add2330522e9ea867c3e8542c5003155ec9df69ea76d55b35f8289853326eee52e855b72ff8fa2e171cb2a4ee8a1f8bc9dcc572987a341de22
|
7
|
+
data.tar.gz: a734d8594b15f0be35b12cdf64d3b51d3c87427e7e2d5fe0865b55e281cf77d63d588b1f3b16c4a1ceaee38a2a7e0620f3f7bd85d6f618a71f0b041f3cbcb9ff
|
data/README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
# Elasticsearch Ruby output plugin for Embulk
|
1
|
+
# Elasticsearch Ruby output plugin for Embulk [![Gem Version](https://badge.fury.io/rb/embulk-output-elasticsearch_ruby.svg)](http://badge.fury.io/rb/embulk-output-elasticsearch_ruby)
|
2
2
|
|
3
3
|
Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible.
|
4
4
|
|
@@ -10,26 +10,28 @@ Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatibl
|
|
10
10
|
* **Cleanup supported**: yes
|
11
11
|
|
12
12
|
## Configuration
|
13
|
-
|
14
|
-
- **host
|
15
|
-
- **port
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
13
|
+
- **nodes** nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
|
14
|
+
- **host** host (string)
|
15
|
+
- **port** port (string)
|
16
|
+
- **request_timeout** request timeout (integer, default: 60)
|
17
|
+
- **index_type** index type (string)
|
18
|
+
- **mode** mode (string, default: 'normal')
|
19
|
+
- **reload_connections** reload connections (bool, default: true)
|
20
|
+
- **reload_on_failure** reload on failure (bool, default: false)
|
21
|
+
- **delete_old_index** delete old index (bool, default: false)
|
22
|
+
- **delete_old_alias** delete old alias (bool, default: true)
|
23
|
+
- **id_keys** id keys (array, default: nil)
|
24
|
+
- **id_format** id format (string, default: nil)
|
25
|
+
- **array_columns** array columns (array, default: nil)
|
26
|
+
- **bulk_actions** bulk actions (integer, default: 1000)
|
27
|
+
- **retry_on_failure** retry on failure (integer, default: 5)
|
28
|
+
- **current_index_name** current index name (string, default: nil)
|
29
|
+
- **index** index (string, default: 'logstash-%Y.%m.%d')
|
30
|
+
- **before_delete_index** before delete index (bool, default: false)
|
31
|
+
- **before_template_name** before template name (string, default: nil)
|
32
|
+
- **before_template** before template (hash, default: nil)
|
31
33
|
|
32
|
-
## Example
|
34
|
+
## Example(minimum settings)
|
33
35
|
|
34
36
|
```yaml
|
35
37
|
out:
|
@@ -39,7 +41,7 @@ out:
|
|
39
41
|
index_type: page
|
40
42
|
```
|
41
43
|
|
42
|
-
## Example(update)
|
44
|
+
## Example(update mode)
|
43
45
|
|
44
46
|
```yaml
|
45
47
|
out:
|
@@ -56,6 +58,23 @@ out:
|
|
56
58
|
- _id
|
57
59
|
```
|
58
60
|
|
61
|
+
## Example(replace mode)
|
62
|
+
|
63
|
+
```yaml
|
64
|
+
out:
|
65
|
+
type: elasticsearch_ruby
|
66
|
+
nodes:
|
67
|
+
- {host: localhost, port: 9200}
|
68
|
+
index: test_alias
|
69
|
+
index_type: crawl_companies
|
70
|
+
mode: replace
|
71
|
+
delete_old_index: true
|
72
|
+
before_delete_index: true
|
73
|
+
bulk_actions: 1000
|
74
|
+
request_timeout: 60
|
75
|
+
```
|
76
|
+
|
77
|
+
* create alias
|
59
78
|
|
60
79
|
## Build
|
61
80
|
|
@@ -1,7 +1,7 @@
|
|
1
1
|
|
2
2
|
Gem::Specification.new do |spec|
|
3
3
|
spec.name = "embulk-output-elasticsearch_ruby"
|
4
|
-
spec.version = "0.1.4"
|
4
|
+
spec.version = "0.1.5"
|
5
5
|
spec.authors = ["toyama0919"]
|
6
6
|
spec.summary = "Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible."
|
7
7
|
spec.description = "Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible."
|
@@ -0,0 +1,129 @@
|
|
1
|
+
require 'excon'
|
2
|
+
require 'elasticsearch'
|
3
|
+
|
4
|
+
module Embulk
|
5
|
+
module Output
|
6
|
+
class Elasticsearch < OutputPlugin
|
7
|
+
class Connection
|
8
|
+
def initialize(task)
|
9
|
+
@nodes = task["nodes"]
|
10
|
+
@index_type = task["index_type"]
|
11
|
+
@id_keys = task["id_keys"]
|
12
|
+
@id_format = task["id_format"]
|
13
|
+
@array_columns = task["array_columns"]
|
14
|
+
@retry_on_failure = task["retry_on_failure"]
|
15
|
+
@mode = task["mode"]
|
16
|
+
@delete_old_index = task['delete_old_index']
|
17
|
+
@delete_old_alias = task['delete_old_alias']
|
18
|
+
@index = task['index']
|
19
|
+
@alias = task['alias']
|
20
|
+
@action = (@mode == 'update') ? :update : :index
|
21
|
+
|
22
|
+
@client = create_client(
|
23
|
+
nodes: task['nodes'],
|
24
|
+
reload_connections: task['reload_connections'],
|
25
|
+
reload_on_failure: task['reload_on_failure'],
|
26
|
+
retry_on_failure: task['retry_on_failure'],
|
27
|
+
request_timeout: task['request_timeout']
|
28
|
+
)
|
29
|
+
end
|
30
|
+
|
31
|
+
def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
|
32
|
+
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
33
|
+
{
|
34
|
+
hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
35
|
+
options: {
|
36
|
+
reload_connections: reload_connections,
|
37
|
+
reload_on_failure: reload_on_failure,
|
38
|
+
retry_on_failure: retry_on_failure,
|
39
|
+
transport_options: {
|
40
|
+
request: { timeout: request_timeout }
|
41
|
+
}
|
42
|
+
}
|
43
|
+
}
|
44
|
+
)
|
45
|
+
::Elasticsearch::Client.new transport: transport
|
46
|
+
end
|
47
|
+
|
48
|
+
def put_template(before_template_name, before_template)
|
49
|
+
Embulk.logger.info("put template => #{before_template_name}")
|
50
|
+
@client.indices.put_template name: before_template_name, body: before_template
|
51
|
+
end
|
52
|
+
|
53
|
+
def create_aliases
|
54
|
+
@client.indices.update_aliases body: {
|
55
|
+
actions: [{ add: { index: @index, alias: @alias } }]
|
56
|
+
}
|
57
|
+
Embulk.logger.info "created alias: #{@alias}, index: #{@index}"
|
58
|
+
end
|
59
|
+
|
60
|
+
def delete_aliases
|
61
|
+
indices = @client.indices.get_alias(name: @alias).keys
|
62
|
+
indices.each do |index|
|
63
|
+
if index != @index
|
64
|
+
if @delete_old_alias
|
65
|
+
@client.indices.delete_alias index: index, name: @alias
|
66
|
+
Embulk.logger.info "deleted alias: #{@alias}, index: #{index}"
|
67
|
+
end
|
68
|
+
if @delete_old_index
|
69
|
+
delete_index(index)
|
70
|
+
end
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
def delete_index(index)
|
76
|
+
indices = @client.cat.indices(format: 'json')
|
77
|
+
if indices.any? { |i| i['index'] == index }
|
78
|
+
@client.indices.delete index: index
|
79
|
+
Embulk.logger.info "deleted index: #{index}"
|
80
|
+
end
|
81
|
+
end
|
82
|
+
|
83
|
+
def send(bulk_message)
|
84
|
+
retries = 0
|
85
|
+
begin
|
86
|
+
@client.bulk body: bulk_message
|
87
|
+
Embulk.logger.info "bulk: #{bulk_message.size/2} success."
|
88
|
+
rescue => e
|
89
|
+
if retries < @retry_on_failure
|
90
|
+
retries += 1
|
91
|
+
Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
|
92
|
+
sleep 2**retries
|
93
|
+
retry
|
94
|
+
end
|
95
|
+
raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
|
96
|
+
end
|
97
|
+
end
|
98
|
+
|
99
|
+
def generate_source(record)
|
100
|
+
result = {}
|
101
|
+
|
102
|
+
record.each { |key, value|
|
103
|
+
result[key] = value
|
104
|
+
next if (value.nil? || !@array_columns)
|
105
|
+
@array_columns.each do |array_column|
|
106
|
+
if array_column['name'] == key
|
107
|
+
array_value = value.split(array_column['delimiter']).reject(&:empty?)
|
108
|
+
array_value = array_value.map(&:to_i) if array_column['is_integer']
|
109
|
+
result[key] = array_value
|
110
|
+
end
|
111
|
+
end
|
112
|
+
}
|
113
|
+
(@mode == 'update') ? {doc: result} : result
|
114
|
+
end
|
115
|
+
|
116
|
+
def generate_id(template, record, id_keys)
|
117
|
+
template % id_keys.map { |key| record[key] }
|
118
|
+
end
|
119
|
+
|
120
|
+
def generate_meta(record)
|
121
|
+
meta = {}
|
122
|
+
meta[@action] = { _index: @index, _type: @index_type }
|
123
|
+
meta[@action][:_id] = generate_id(@id_format, record, @id_keys) unless @id_keys.nil?
|
124
|
+
meta
|
125
|
+
end
|
126
|
+
end
|
127
|
+
end
|
128
|
+
end
|
129
|
+
end
|
@@ -1,5 +1,4 @@
|
|
1
|
-
|
2
|
-
require 'elasticsearch'
|
1
|
+
require_relative 'elasticsearch/connection'
|
3
2
|
|
4
3
|
module Embulk
|
5
4
|
module Output
|
@@ -12,40 +11,47 @@ module Embulk
|
|
12
11
|
task = {
|
13
12
|
"nodes" => config.param("nodes", :array, default: [{ 'host' => 'localhost', 'port' => 9200 }]),
|
14
13
|
"request_timeout" => config.param("request_timeout", :integer, default: 60),
|
15
|
-
"index" => config.param("index", :string, default: 'logstash-%Y.%m.%d'),
|
14
|
+
"index_type" => config.param("index_type", :string),
|
16
15
|
"mode" => config.param("mode", :string, default: 'normal'),
|
17
16
|
"reload_connections" => config.param("reload_connections", :bool, default: true),
|
18
17
|
"reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
|
19
18
|
"delete_old_index" => config.param("delete_old_index", :bool, default: false),
|
20
19
|
"delete_old_alias" => config.param("delete_old_alias", :bool, default: true),
|
21
|
-
"index_type" => config.param("index_type", :string),
|
22
20
|
"id_keys" => config.param("id_keys", :array, default: nil),
|
23
21
|
"id_format" => config.param("id_format", :string, default: nil),
|
24
22
|
"array_columns" => config.param("array_columns", :array, default: nil),
|
25
23
|
"bulk_actions" => config.param("bulk_actions", :integer, default: 1000),
|
26
24
|
"retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
|
27
|
-
"before_template_name" => config.param("before_template_name", :string, default: nil),
|
28
|
-
"before_template" => config.param("before_template", :hash, default: nil),
|
29
|
-
"current_index_name" => config.param("current_index_name", :string, default: nil),
|
30
25
|
}
|
31
|
-
task['time_value'] = Time.now.strftime('%Y.%m.%d.%H.%M.%S')
|
32
|
-
task['index'] = Time.now.strftime(task['index'])
|
33
|
-
|
34
|
-
task['current_index_name'] = if task['current_index_name']
|
35
|
-
task['current_index_name']
|
36
|
-
else
|
37
|
-
"#{task['index']}-#{task['index_type']}-#{task['time_value']}"
|
38
|
-
end
|
39
26
|
|
40
27
|
unless ENABLE_MODE.include?(task['mode'])
|
41
28
|
raise ConfigError.new "`mode` must be one of #{ENABLE_MODE.join(', ')}"
|
42
29
|
end
|
43
30
|
Embulk.logger.info("mode => #{task['mode']}")
|
44
31
|
|
45
|
-
|
46
|
-
|
47
|
-
|
48
|
-
|
32
|
+
current_index_name = config.param("current_index_name", :string, default: nil)
|
33
|
+
index = config.param("index", :string, default: 'logstash-%Y.%m.%d')
|
34
|
+
if task['mode'] == 'replace'
|
35
|
+
task['alias'] = index
|
36
|
+
task['index'] = if current_index_name
|
37
|
+
current_index_name
|
38
|
+
else
|
39
|
+
"#{index}-#{task['index_type']}-#{Time.now.strftime('%Y.%m.%d.%H.%M.%S')}"
|
40
|
+
end
|
41
|
+
else
|
42
|
+
task['index'] = Time.now.strftime(index)
|
43
|
+
end
|
44
|
+
|
45
|
+
connection = Connection.new(task)
|
46
|
+
before_delete_index = config.param("before_delete_index", :bool, default: false)
|
47
|
+
if before_delete_index
|
48
|
+
connection.delete_index(task['index'])
|
49
|
+
end
|
50
|
+
|
51
|
+
before_template_name = config.param("before_template_name", :string, default: nil)
|
52
|
+
before_template = config.param("before_template", :hash, default: nil)
|
53
|
+
if before_template_name && before_template
|
54
|
+
connection.put_template(before_template_name, before_template)
|
49
55
|
end
|
50
56
|
|
51
57
|
task_reports = yield(task)
|
@@ -55,57 +61,12 @@ module Embulk
|
|
55
61
|
|
56
62
|
def self.cleanup(task, schema, count, task_reports)
|
57
63
|
if task['mode'] == 'replace'
|
58
|
-
|
59
|
-
create_aliases
|
60
|
-
delete_aliases
|
64
|
+
connection = Connection.new(task)
|
65
|
+
connection.create_aliases
|
66
|
+
connection.delete_aliases
|
61
67
|
end
|
62
68
|
end
|
63
69
|
|
64
|
-
def self.create_client(task)
|
65
|
-
transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
|
66
|
-
{
|
67
|
-
hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
|
68
|
-
options: {
|
69
|
-
reload_connections: task['reload_connections'],
|
70
|
-
reload_on_failure: task['reload_on_failure'],
|
71
|
-
retry_on_failure: task['retry_on_failure'],
|
72
|
-
transport_options: {
|
73
|
-
request: { timeout: task['request_timeout'] }
|
74
|
-
}
|
75
|
-
}
|
76
|
-
}
|
77
|
-
)
|
78
|
-
|
79
|
-
::Elasticsearch::Client.new transport: transport
|
80
|
-
end
|
81
|
-
|
82
|
-
def self.create_aliases(client, als, index)
|
83
|
-
client.indices.update_aliases body: {
|
84
|
-
actions: [{ add: { index: index, alias: als } }]
|
85
|
-
}
|
86
|
-
Embulk.logger.info "created alias: #{als}, index: #{index}"
|
87
|
-
end
|
88
|
-
|
89
|
-
def self.delete_aliases(client, task)
|
90
|
-
indices = client.indices.get_alias(name: task['index']).keys
|
91
|
-
indices.each { |index|
|
92
|
-
if index != get_index(task)
|
93
|
-
if task['delete_old_alias']
|
94
|
-
client.indices.delete_alias index: index, name: task['index']
|
95
|
-
Embulk.logger.info "deleted alias: #{task['index']}, index: #{index}"
|
96
|
-
end
|
97
|
-
if task['delete_old_index']
|
98
|
-
client.indices.delete index: index
|
99
|
-
Embulk.logger.info "deleted index: #{index}"
|
100
|
-
end
|
101
|
-
end
|
102
|
-
}
|
103
|
-
end
|
104
|
-
|
105
|
-
def self.get_index(task)
|
106
|
-
task['mode'] == 'replace' ? task['current_index_name'] : task['index']
|
107
|
-
end
|
108
|
-
|
109
70
|
#def self.resume(task, schema, count, &control)
|
110
71
|
# task_reports = yield(task)
|
111
72
|
#
|
@@ -114,17 +75,8 @@ module Embulk
|
|
114
75
|
#end
|
115
76
|
|
116
77
|
def init
|
117
|
-
@nodes = task["nodes"]
|
118
|
-
@index_type = task["index_type"]
|
119
|
-
@id_keys = task["id_keys"]
|
120
|
-
@id_format = task["id_format"]
|
78
|
+
@connection = Connection.new(task)
|
121
79
|
@bulk_actions = task["bulk_actions"]
|
122
|
-
@array_columns = task["array_columns"]
|
123
|
-
@retry_on_failure = task["retry_on_failure"]
|
124
|
-
@mode = task["mode"]
|
125
|
-
@index = self.class.get_index(task)
|
126
|
-
|
127
|
-
@client = self.class.create_client(task)
|
128
80
|
@bulk_message = []
|
129
81
|
end
|
130
82
|
|
@@ -134,22 +86,24 @@ module Embulk
|
|
134
86
|
def add(page)
|
135
87
|
page.each do |record|
|
136
88
|
hash = Hash[schema.names.zip(record)]
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
meta
|
141
|
-
source
|
89
|
+
meta = @connection.generate_meta(hash)
|
90
|
+
source = @connection.generate_source(hash)
|
91
|
+
|
92
|
+
Embulk.logger.debug("meta => #{meta}")
|
93
|
+
Embulk.logger.debug("source => #{source}")
|
94
|
+
|
142
95
|
@bulk_message << meta
|
143
96
|
@bulk_message << source
|
144
97
|
if @bulk_actions * 2 <= @bulk_message.size
|
145
|
-
send
|
98
|
+
@connection.send(@bulk_message)
|
99
|
+
@bulk_message.clear
|
146
100
|
end
|
147
101
|
end
|
148
102
|
end
|
149
103
|
|
150
104
|
def finish
|
151
105
|
if @bulk_message.size > 0
|
152
|
-
send
|
106
|
+
@connection.send(@bulk_message)
|
153
107
|
end
|
154
108
|
end
|
155
109
|
|
@@ -160,46 +114,6 @@ module Embulk
|
|
160
114
|
task_report = {}
|
161
115
|
return task_report
|
162
116
|
end
|
163
|
-
|
164
|
-
private
|
165
|
-
|
166
|
-
def generate_array(record)
|
167
|
-
result = {}
|
168
|
-
|
169
|
-
record.each { |key, value|
|
170
|
-
result[key] = value
|
171
|
-
next if (value.nil? || !@array_columns)
|
172
|
-
@array_columns.each do |array_column|
|
173
|
-
if array_column['name'] == key
|
174
|
-
array_value = value.split(array_column['delimiter']).reject(&:empty?)
|
175
|
-
array_value = array_value.map(&:to_i) if array_column['is_integer']
|
176
|
-
result[key] = array_value
|
177
|
-
end
|
178
|
-
end
|
179
|
-
}
|
180
|
-
(@mode == 'update') ? {doc: result} : result
|
181
|
-
end
|
182
|
-
|
183
|
-
def generate_id(template, record, id_keys)
|
184
|
-
template % id_keys.map { |key| record[key] }
|
185
|
-
end
|
186
|
-
|
187
|
-
def send
|
188
|
-
retries = 0
|
189
|
-
begin
|
190
|
-
@client.bulk body: @bulk_message
|
191
|
-
Embulk.logger.info "bulk: #{@bulk_message.size/2} success."
|
192
|
-
rescue => e
|
193
|
-
if retries < @retry_on_failure
|
194
|
-
retries += 1
|
195
|
-
Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
|
196
|
-
sleep 2**retries
|
197
|
-
retry
|
198
|
-
end
|
199
|
-
raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
|
200
|
-
end
|
201
|
-
@bulk_message.clear
|
202
|
-
end
|
203
117
|
end
|
204
118
|
end
|
205
119
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: embulk-output-elasticsearch_ruby
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.4
|
4
|
+
version: 0.1.5
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- toyama0919
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-03-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
requirement: !ruby/object:Gem::Requirement
|
@@ -121,6 +121,7 @@ files:
|
|
121
121
|
- README.md
|
122
122
|
- Rakefile
|
123
123
|
- embulk-output-elasticsearch_ruby.gemspec
|
124
|
+
- lib/embulk/output/elasticsearch/connection.rb
|
124
125
|
- lib/embulk/output/elasticsearch_ruby.rb
|
125
126
|
- test/helper.rb
|
126
127
|
- test/test_transaction.rb
|