embulk-output-elasticsearch_ruby 0.1.4 → 0.1.5

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: be42f1b9b4d5953efa82af4f11eff2d1871e28d7
4
- data.tar.gz: 994268d29c57b5f1145e1f4ac79922da18282c9c
3
+ metadata.gz: 6824687f4de2bdcc12467725ea585462b9b0aa21
4
+ data.tar.gz: 337d8df78354516360a8c922b7dda2fe49abb12f
5
5
  SHA512:
6
- metadata.gz: ffd1752f6ae7f52fb1bc8a07d90ff59633f05799c31cf05f207f1607a31ab298aab48c031e7c1b6634dd9751c757a449b4ad94cce930f36195f0c22c415aaa52
7
- data.tar.gz: 8ea0f41e0de01e06f3916b4b1c2377260097536d9f9b68abc31a8962655ef0b74ee750d4de1b11bcd601c8c38b3b25f062b9a66aff5409d4ea67debf99f53d2b
6
+ metadata.gz: ab542815971e42add2330522e9ea867c3e8542c5003155ec9df69ea76d55b35f8289853326eee52e855b72ff8fa2e171cb2a4ee8a1f8bc9dcc572987a341de22
7
+ data.tar.gz: a734d8594b15f0be35b12cdf64d3b51d3c87427e7e2d5fe0865b55e281cf77d63d588b1f3b16c4a1ceaee38a2a7e0620f3f7bd85d6f618a71f0b041f3cbcb9ff
data/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # Elasticsearch Ruby output plugin for Embulk
1
+ # Elasticsearch Ruby output plugin for Embulk [![Gem Version](https://badge.fury.io/rb/embulk-output-elasticsearch_ruby.svg)](http://badge.fury.io/rb/embulk-output-elasticsearch_ruby)
2
2
 
3
3
  Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible.
4
4
 
@@ -10,26 +10,28 @@ Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatibl
10
10
  * **Cleanup supported**: yes
11
11
 
12
12
  ## Configuration
13
- - **nodes**: nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
14
- - **host**: index (string)
15
- - **port**: index (integer)
16
- - **request_timeout**: request_timeout (integer, default: 60)
17
- - **index**: index (string, , default: 'logstash-%Y.%m.%d')
18
- - **mode**: mode, normal or update or replace (string, default: normal)
19
- - **reload_connections**: reload_connections (bool, default: true)
20
- - **reload_on_failure**: reload_on_failure (bool, default: false)
21
- - **delete_old_index**: delete_old_index (bool, default: false)
22
- - **index_type**: index_type (string)
23
- - **id_keys**: id_keys (array, default: nil)
24
- - **id_format**: id_format (string, default: nil)
25
- - **array_columns**: array_columns (array, default: nil)
26
- - **name**: Array convert column. (string)
27
- - **delimiter**: delimiter for split. (string)
28
- - **is_integer**: to integer. (bool)
29
- - **bulk_actions**: bulk_actions (integer, default: 1000)
30
- - **retry_on_failure**: retry_on_failure (integer, default: 5)
13
+ - **nodes** nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
14
+ - **host** host (string)
15
+ - **port** port (integer)
16
+ - **request_timeout** request timeout (integer, default: 60)
17
+ - **index_type** index type (string)
18
+ - **mode** mode, normal or update or replace (string, default: 'normal')
19
+ - **reload_connections** reload connections (bool, default: true)
20
+ - **reload_on_failure** reload on failure (bool, default: false)
21
+ - **delete_old_index** delete old index (bool, default: false)
22
+ - **delete_old_alias** delete old alias (bool, default: true)
23
+ - **id_keys** id keys (array, default: nil)
24
+ - **id_format** id format (string, default: nil)
25
+ - **array_columns** array columns (array, default: nil)
26
+ - **bulk_actions** bulk actions (integer, default: 1000)
27
+ - **retry_on_failure** retry on failure (integer, default: 5)
28
+ - **current_index_name** current index name (string, default: nil)
29
+ - **index** index (string, default: 'logstash-%Y.%m.%d')
30
+ - **before_delete_index** before delete index (bool, default: false)
31
+ - **before_template_name** before template name (string, default: nil)
32
+ - **before_template** before template (hash, default: nil)
31
33
 
32
- ## Example
34
+ ## Example(minimum settings)
33
35
 
34
36
  ```yaml
35
37
  out:
@@ -39,7 +41,7 @@ out:
39
41
  index_type: page
40
42
  ```
41
43
 
42
- ## Example(update)
44
+ ## Example(update mode)
43
45
 
44
46
  ```yaml
45
47
  out:
@@ -56,6 +58,23 @@ out:
56
58
  - _id
57
59
  ```
58
60
 
61
+ ## Example(replace mode)
62
+
63
+ ```yaml
64
+ out:
65
+ type: elasticsearch_ruby
66
+ nodes:
67
+ - {host: localhost, port: 9200}
68
+ index: test_alias
69
+ index_type: crawl_companies
70
+ mode: replace
71
+ delete_old_index: true
72
+ before_delete_index: true
73
+ bulk_actions: 1000
74
+ request_timeout: 60
75
+ ```
76
+
77
+ * In replace mode, `index` is treated as an alias name: records are loaded into a timestamped index, and on cleanup the alias is switched to the new index (old aliases/indices are removed according to `delete_old_alias`/`delete_old_index`).
59
78
 
60
79
  ## Build
61
80
 
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-output-elasticsearch_ruby"
4
- spec.version = "0.1.4"
4
+ spec.version = "0.1.5"
5
5
  spec.authors = ["toyama0919"]
6
6
  spec.summary = "Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible."
7
7
  spec.description = "Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible."
@@ -0,0 +1,129 @@
1
+ require 'excon'
2
+ require 'elasticsearch'
3
+
4
+ module Embulk
5
+ module Output
6
+ class Elasticsearch < OutputPlugin
7
+ class Connection
8
+ def initialize(task)
9
+ @nodes = task["nodes"]
10
+ @index_type = task["index_type"]
11
+ @id_keys = task["id_keys"]
12
+ @id_format = task["id_format"]
13
+ @array_columns = task["array_columns"]
14
+ @retry_on_failure = task["retry_on_failure"]
15
+ @mode = task["mode"]
16
+ @delete_old_index = task['delete_old_index']
17
+ @delete_old_alias = task['delete_old_alias']
18
+ @index = task['index']
19
+ @alias = task['alias']
20
+ @action = (@mode == 'update') ? :update : :index
21
+
22
+ @client = create_client(
23
+ nodes: task['nodes'],
24
+ reload_connections: task['reload_connections'],
25
+ reload_on_failure: task['reload_on_failure'],
26
+ retry_on_failure: task['retry_on_failure'],
27
+ request_timeout: task['request_timeout']
28
+ )
29
+ end
30
+
31
+ def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
32
+ transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
33
+ {
34
+ hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
35
+ options: {
36
+ reload_connections: reload_connections,
37
+ reload_on_failure: reload_on_failure,
38
+ retry_on_failure: retry_on_failure,
39
+ transport_options: {
40
+ request: { timeout: request_timeout }
41
+ }
42
+ }
43
+ }
44
+ )
45
+ ::Elasticsearch::Client.new transport: transport
46
+ end
47
+
48
+ def put_template(before_template_name, before_template)
49
+ Embulk.logger.info("put template => #{before_template_name}")
50
+ @client.indices.put_template name: before_template_name, body: before_template
51
+ end
52
+
53
+ def create_aliases
54
+ @client.indices.update_aliases body: {
55
+ actions: [{ add: { index: @index, alias: @alias } }]
56
+ }
57
+ Embulk.logger.info "created alias: #{@alias}, index: #{@index}"
58
+ end
59
+
60
+ def delete_aliases
61
+ indices = @client.indices.get_alias(name: @alias).keys
62
+ indices.each do |index|
63
+ if index != @index
64
+ if @delete_old_alias
65
+ @client.indices.delete_alias index: index, name: @alias
66
+ Embulk.logger.info "deleted alias: #{@alias}, index: #{index}"
67
+ end
68
+ if @delete_old_index
69
+ delete_index(index)
70
+ end
71
+ end
72
+ end
73
+ end
74
+
75
+ def delete_index(index)
76
+ indices = @client.cat.indices(format: 'json')
77
+ if indices.any? { |i| i['index'] == index }
78
+ @client.indices.delete index: index
79
+ Embulk.logger.info "deleted index: #{index}"
80
+ end
81
+ end
82
+
83
+ def send(bulk_message)
84
+ retries = 0
85
+ begin
86
+ @client.bulk body: bulk_message
87
+ Embulk.logger.info "bulk: #{bulk_message.size/2} success."
88
+ rescue => e
89
+ if retries < @retry_on_failure
90
+ retries += 1
91
+ Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
92
+ sleep 2**retries
93
+ retry
94
+ end
95
+ raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
96
+ end
97
+ end
98
+
99
+ def generate_source(record)
100
+ result = {}
101
+
102
+ record.each { |key, value|
103
+ result[key] = value
104
+ next if (value.nil? || !@array_columns)
105
+ @array_columns.each do |array_column|
106
+ if array_column['name'] == key
107
+ array_value = value.split(array_column['delimiter']).reject(&:empty?)
108
+ array_value = array_value.map(&:to_i) if array_column['is_integer']
109
+ result[key] = array_value
110
+ end
111
+ end
112
+ }
113
+ (@mode == 'update') ? {doc: result} : result
114
+ end
115
+
116
+ def generate_id(template, record, id_keys)
117
+ template % id_keys.map { |key| record[key] }
118
+ end
119
+
120
+ def generate_meta(record)
121
+ meta = {}
122
+ meta[@action] = { _index: @index, _type: @index_type }
123
+ meta[@action][:_id] = generate_id(@id_format, record, @id_keys) unless @id_keys.nil?
124
+ meta
125
+ end
126
+ end
127
+ end
128
+ end
129
+ end
@@ -1,5 +1,4 @@
1
- require 'excon'
2
- require 'elasticsearch'
1
+ require_relative 'elasticsearch/connection'
3
2
 
4
3
  module Embulk
5
4
  module Output
@@ -12,40 +11,47 @@ module Embulk
12
11
  task = {
13
12
  "nodes" => config.param("nodes", :array, default: [{ 'host' => 'localhost', 'port' => 9200 }]),
14
13
  "request_timeout" => config.param("request_timeout", :integer, default: 60),
15
- "index" => config.param("index", :string, default: 'logstash-%Y.%m.%d'),
14
+ "index_type" => config.param("index_type", :string),
16
15
  "mode" => config.param("mode", :string, default: 'normal'),
17
16
  "reload_connections" => config.param("reload_connections", :bool, default: true),
18
17
  "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
19
18
  "delete_old_index" => config.param("delete_old_index", :bool, default: false),
20
19
  "delete_old_alias" => config.param("delete_old_alias", :bool, default: true),
21
- "index_type" => config.param("index_type", :string),
22
20
  "id_keys" => config.param("id_keys", :array, default: nil),
23
21
  "id_format" => config.param("id_format", :string, default: nil),
24
22
  "array_columns" => config.param("array_columns", :array, default: nil),
25
23
  "bulk_actions" => config.param("bulk_actions", :integer, default: 1000),
26
24
  "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
27
- "before_template_name" => config.param("before_template_name", :string, default: nil),
28
- "before_template" => config.param("before_template", :hash, default: nil),
29
- "current_index_name" => config.param("current_index_name", :string, default: nil),
30
25
  }
31
- task['time_value'] = Time.now.strftime('%Y.%m.%d.%H.%M.%S')
32
- task['index'] = Time.now.strftime(task['index'])
33
-
34
- task['current_index_name'] = if task['current_index_name']
35
- task['current_index_name']
36
- else
37
- "#{task['index']}-#{task['index_type']}-#{task['time_value']}"
38
- end
39
26
 
40
27
  unless ENABLE_MODE.include?(task['mode'])
41
28
  raise ConfigError.new "`mode` must be one of #{ENABLE_MODE.join(', ')}"
42
29
  end
43
30
  Embulk.logger.info("mode => #{task['mode']}")
44
31
 
45
- if task['before_template_name'] && task['before_template']
46
- client = create_client(task)
47
- Embulk.logger.info("put template => #{task['before_template_name']}")
48
- client.indices.put_template name: task['before_template_name'], body: task['before_template']
32
+ current_index_name = config.param("current_index_name", :string, default: nil)
33
+ index = config.param("index", :string, default: 'logstash-%Y.%m.%d')
34
+ if task['mode'] == 'replace'
35
+ task['alias'] = index
36
+ task['index'] = if current_index_name
37
+ current_index_name
38
+ else
39
+ "#{index}-#{task['index_type']}-#{Time.now.strftime('%Y.%m.%d.%H.%M.%S')}"
40
+ end
41
+ else
42
+ task['index'] = Time.now.strftime(index)
43
+ end
44
+
45
+ connection = Connection.new(task)
46
+ before_delete_index = config.param("before_delete_index", :bool, default: false)
47
+ if before_delete_index
48
+ connection.delete_index(task['index'])
49
+ end
50
+
51
+ before_template_name = config.param("before_template_name", :string, default: nil)
52
+ before_template = config.param("before_template", :hash, default: nil)
53
+ if before_template_name && before_template
54
+ connection.put_template(before_template_name, before_template)
49
55
  end
50
56
 
51
57
  task_reports = yield(task)
@@ -55,57 +61,12 @@ module Embulk
55
61
 
56
62
  def self.cleanup(task, schema, count, task_reports)
57
63
  if task['mode'] == 'replace'
58
- client = create_client(task)
59
- create_aliases(client, task['index'], get_index(task))
60
- delete_aliases(client, task)
64
+ connection = Connection.new(task)
65
+ connection.create_aliases
66
+ connection.delete_aliases
61
67
  end
62
68
  end
63
69
 
64
- def self.create_client(task)
65
- transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
66
- {
67
- hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
68
- options: {
69
- reload_connections: task['reload_connections'],
70
- reload_on_failure: task['reload_on_failure'],
71
- retry_on_failure: task['retry_on_failure'],
72
- transport_options: {
73
- request: { timeout: task['request_timeout'] }
74
- }
75
- }
76
- }
77
- )
78
-
79
- ::Elasticsearch::Client.new transport: transport
80
- end
81
-
82
- def self.create_aliases(client, als, index)
83
- client.indices.update_aliases body: {
84
- actions: [{ add: { index: index, alias: als } }]
85
- }
86
- Embulk.logger.info "created alias: #{als}, index: #{index}"
87
- end
88
-
89
- def self.delete_aliases(client, task)
90
- indices = client.indices.get_alias(name: task['index']).keys
91
- indices.each { |index|
92
- if index != get_index(task)
93
- if task['delete_old_alias']
94
- client.indices.delete_alias index: index, name: task['index']
95
- Embulk.logger.info "deleted alias: #{task['index']}, index: #{index}"
96
- end
97
- if task['delete_old_index']
98
- client.indices.delete index: index
99
- Embulk.logger.info "deleted index: #{index}"
100
- end
101
- end
102
- }
103
- end
104
-
105
- def self.get_index(task)
106
- task['mode'] == 'replace' ? task['current_index_name'] : task['index']
107
- end
108
-
109
70
  #def self.resume(task, schema, count, &control)
110
71
  # task_reports = yield(task)
111
72
  #
@@ -114,17 +75,8 @@ module Embulk
114
75
  #end
115
76
 
116
77
  def init
117
- @nodes = task["nodes"]
118
- @index_type = task["index_type"]
119
- @id_keys = task["id_keys"]
120
- @id_format = task["id_format"]
78
+ @connection = Connection.new(task)
121
79
  @bulk_actions = task["bulk_actions"]
122
- @array_columns = task["array_columns"]
123
- @retry_on_failure = task["retry_on_failure"]
124
- @mode = task["mode"]
125
- @index = self.class.get_index(task)
126
-
127
- @client = self.class.create_client(task)
128
80
  @bulk_message = []
129
81
  end
130
82
 
@@ -134,22 +86,24 @@ module Embulk
134
86
  def add(page)
135
87
  page.each do |record|
136
88
  hash = Hash[schema.names.zip(record)]
137
- action = (@mode == 'update') ? :update : :index
138
- meta = {}
139
- meta[action] = { _index: @index, _type: @index_type }
140
- meta[action][:_id] = generate_id(@id_format, hash, @id_keys) unless @id_keys.nil?
141
- source = generate_array(hash)
89
+ meta = @connection.generate_meta(hash)
90
+ source = @connection.generate_source(hash)
91
+
92
+ Embulk.logger.debug("meta => #{meta}")
93
+ Embulk.logger.debug("source => #{source}")
94
+
142
95
  @bulk_message << meta
143
96
  @bulk_message << source
144
97
  if @bulk_actions * 2 <= @bulk_message.size
145
- send
98
+ @connection.send(@bulk_message)
99
+ @bulk_message.clear
146
100
  end
147
101
  end
148
102
  end
149
103
 
150
104
  def finish
151
105
  if @bulk_message.size > 0
152
- send
106
+ @connection.send(@bulk_message)
153
107
  end
154
108
  end
155
109
 
@@ -160,46 +114,6 @@ module Embulk
160
114
  task_report = {}
161
115
  return task_report
162
116
  end
163
-
164
- private
165
-
166
- def generate_array(record)
167
- result = {}
168
-
169
- record.each { |key, value|
170
- result[key] = value
171
- next if (value.nil? || !@array_columns)
172
- @array_columns.each do |array_column|
173
- if array_column['name'] == key
174
- array_value = value.split(array_column['delimiter']).reject(&:empty?)
175
- array_value = array_value.map(&:to_i) if array_column['is_integer']
176
- result[key] = array_value
177
- end
178
- end
179
- }
180
- (@mode == 'update') ? {doc: result} : result
181
- end
182
-
183
- def generate_id(template, record, id_keys)
184
- template % id_keys.map { |key| record[key] }
185
- end
186
-
187
- def send
188
- retries = 0
189
- begin
190
- @client.bulk body: @bulk_message
191
- Embulk.logger.info "bulk: #{@bulk_message.size/2} success."
192
- rescue => e
193
- if retries < @retry_on_failure
194
- retries += 1
195
- Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
196
- sleep 2**retries
197
- retry
198
- end
199
- raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
200
- end
201
- @bulk_message.clear
202
- end
203
117
  end
204
118
  end
205
119
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-output-elasticsearch_ruby
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.4
4
+ version: 0.1.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-02-27 00:00:00.000000000 Z
11
+ date: 2017-03-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -121,6 +121,7 @@ files:
121
121
  - README.md
122
122
  - Rakefile
123
123
  - embulk-output-elasticsearch_ruby.gemspec
124
+ - lib/embulk/output/elasticsearch/connection.rb
124
125
  - lib/embulk/output/elasticsearch_ruby.rb
125
126
  - test/helper.rb
126
127
  - test/test_transaction.rb