embulk-output-elasticsearch_ruby 0.1.4 → 0.1.5

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two package versions as they appear in their public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA1:
-   metadata.gz: be42f1b9b4d5953efa82af4f11eff2d1871e28d7
-   data.tar.gz: 994268d29c57b5f1145e1f4ac79922da18282c9c
+   metadata.gz: 6824687f4de2bdcc12467725ea585462b9b0aa21
+   data.tar.gz: 337d8df78354516360a8c922b7dda2fe49abb12f
  SHA512:
-   metadata.gz: ffd1752f6ae7f52fb1bc8a07d90ff59633f05799c31cf05f207f1607a31ab298aab48c031e7c1b6634dd9751c757a449b4ad94cce930f36195f0c22c415aaa52
-   data.tar.gz: 8ea0f41e0de01e06f3916b4b1c2377260097536d9f9b68abc31a8962655ef0b74ee750d4de1b11bcd601c8c38b3b25f062b9a66aff5409d4ea67debf99f53d2b
+   metadata.gz: ab542815971e42add2330522e9ea867c3e8542c5003155ec9df69ea76d55b35f8289853326eee52e855b72ff8fa2e171cb2a4ee8a1f8bc9dcc572987a341de22
+   data.tar.gz: a734d8594b15f0be35b12cdf64d3b51d3c87427e7e2d5fe0865b55e281cf77d63d588b1f3b16c4a1ceaee38a2a7e0620f3f7bd85d6f618a71f0b041f3cbcb9ff
data/README.md CHANGED
@@ -1,4 +1,4 @@
- # Elasticsearch Ruby output plugin for Embulk
+ # Elasticsearch Ruby output plugin for Embulk [![Gem Version](https://badge.fury.io/rb/embulk-output-elasticsearch_ruby.svg)](http://badge.fury.io/rb/embulk-output-elasticsearch_ruby)
  
  Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible.
  
@@ -10,26 +10,28 @@ Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatibl
  * **Cleanup supported**: yes
  
  ## Configuration
- - **nodes**: nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
- - **host**: index (string)
- - **port**: index (integer)
- - **request_timeout**: request_timeout (integer, default: 60)
- - **index**: index (string, , default: 'logstash-%Y.%m.%d')
- - **mode**: mode, normal or update or replace (string, default: normal)
- - **reload_connections**: reload_connections (bool, default: true)
- - **reload_on_failure**: reload_on_failure (bool, default: false)
- - **delete_old_index**: delete_old_index (bool, default: false)
- - **index_type**: index_type (string)
- - **id_keys**: id_keys (array, default: nil)
- - **id_format**: id_format (string, default: nil)
- - **array_columns**: array_columns (array, default: nil)
- - **name**: Array convert column. (string)
- - **delimiter**: delimiter for split. (string)
- - **is_integer**: to integer. (bool)
- - **bulk_actions**: bulk_actions (integer, default: 1000)
- - **retry_on_failure**: retry_on_failure (integer, default: 5)
+ - **nodes** nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
+ - **host** host (string)
+ - **port** port (string)
+ - **request_timeout** request timeout (integer, default: 60)
+ - **index_type** index type (string)
+ - **mode** mode (string, default: 'normal')
+ - **reload_connections** reload connections (bool, default: true)
+ - **reload_on_failure** reload on failure (bool, default: false)
+ - **delete_old_index** delete old index (bool, default: false)
+ - **delete_old_alias** delete old alias (bool, default: true)
+ - **id_keys** id keys (array, default: nil)
+ - **id_format** id format (string, default: nil)
+ - **array_columns** array columns (array, default: nil)
+ - **bulk_actions** bulk actions (integer, default: 1000)
+ - **retry_on_failure** retry on failure (integer, default: 5)
+ - **current_index_name** current index name (string, default: nil)
+ - **index** index (string, default: 'logstash-%Y.%m.%d')
+ - **before_delete_index** before delete index (bool, default: false)
+ - **before_template_name** before template name (string, default: nil)
+ - **before_template** before template (hash, default: nil)
  
- ## Example
+ ## Example(minimum settings)
  
  ```yaml
  out:
@@ -39,7 +41,7 @@ out:
    index_type: page
  ```
  
- ## Example(update)
+ ## Example(update mode)
  
  ```yaml
  out:
@@ -56,6 +58,23 @@ out:
    - _id
  ```
  
+ ## Example(replace mode)
+ 
+ ```yaml
+ out:
+   type: elasticsearch_ruby
+   nodes:
+   - {host: localhost, port: 9200}
+   index: test_alias
+   index_type: crawl_companies
+   mode: replace
+   delete_old_index: true
+   before_delete_index: true
+   bulk_actions: 1000
+   request_timeout: 60
+ ```
+ 
+ * create alias
  
  ## Build
  
embulk-output-elasticsearch_ruby.gemspec CHANGED
@@ -1,7 +1,7 @@
  
  Gem::Specification.new do |spec|
    spec.name = "embulk-output-elasticsearch_ruby"
-   spec.version = "0.1.4"
+   spec.version = "0.1.5"
    spec.authors = ["toyama0919"]
    spec.summary = "Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible."
    spec.description = "Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible."
lib/embulk/output/elasticsearch/connection.rb ADDED
@@ -0,0 +1,129 @@
+ require 'excon'
+ require 'elasticsearch'
+ 
+ module Embulk
+   module Output
+     class Elasticsearch < OutputPlugin
+       class Connection
+         def initialize(task)
+           @nodes = task["nodes"]
+           @index_type = task["index_type"]
+           @id_keys = task["id_keys"]
+           @id_format = task["id_format"]
+           @array_columns = task["array_columns"]
+           @retry_on_failure = task["retry_on_failure"]
+           @mode = task["mode"]
+           @delete_old_index = task['delete_old_index']
+           @delete_old_alias = task['delete_old_alias']
+           @index = task['index']
+           @alias = task['alias']
+           @action = (@mode == 'update') ? :update : :index
+ 
+           @client = create_client(
+             nodes: task['nodes'],
+             reload_connections: task['reload_connections'],
+             reload_on_failure: task['reload_on_failure'],
+             retry_on_failure: task['retry_on_failure'],
+             request_timeout: task['request_timeout']
+           )
+         end
+ 
+         def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
+           transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
+             {
+               hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
+               options: {
+                 reload_connections: reload_connections,
+                 reload_on_failure: reload_on_failure,
+                 retry_on_failure: retry_on_failure,
+                 transport_options: {
+                   request: { timeout: request_timeout }
+                 }
+               }
+             }
+           )
+           ::Elasticsearch::Client.new transport: transport
+         end
+ 
+         def put_template(before_template_name, before_template)
+           Embulk.logger.info("put template => #{before_template_name}")
+           @client.indices.put_template name: before_template_name, body: before_template
+         end
+ 
+         def create_aliases
+           @client.indices.update_aliases body: {
+             actions: [{ add: { index: @index, alias: @alias } }]
+           }
+           Embulk.logger.info "created alias: #{@alias}, index: #{@index}"
+         end
+ 
+         def delete_aliases
+           indices = @client.indices.get_alias(name: @alias).keys
+           indices.each do |index|
+             if index != @index
+               if @delete_old_alias
+                 @client.indices.delete_alias index: index, name: @alias
+                 Embulk.logger.info "deleted alias: #{@alias}, index: #{index}"
+               end
+               if @delete_old_index
+                 delete_index(index)
+               end
+             end
+           end
+         end
+ 
+         def delete_index(index)
+           indices = @client.cat.indices(format: 'json')
+           if indices.any? { |i| i['index'] == index }
+             @client.indices.delete index: index
+             Embulk.logger.info "deleted index: #{index}"
+           end
+         end
+ 
+         def send(bulk_message)
+           retries = 0
+           begin
+             @client.bulk body: bulk_message
+             Embulk.logger.info "bulk: #{bulk_message.size/2} success."
+           rescue => e
+             if retries < @retry_on_failure
+               retries += 1
+               Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
+               sleep 2**retries
+               retry
+             end
+             raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
+           end
+         end
+ 
+         def generate_source(record)
+           result = {}
+ 
+           record.each { |key, value|
+             result[key] = value
+             next if (value.nil? || !@array_columns)
+             @array_columns.each do |array_column|
+               if array_column['name'] == key
+                 array_value = value.split(array_column['delimiter']).reject(&:empty?)
+                 array_value = array_value.map(&:to_i) if array_column['is_integer']
+                 result[key] = array_value
+               end
+             end
+           }
+           (@mode == 'update') ? {doc: result} : result
+         end
+ 
+         def generate_id(template, record, id_keys)
+           template % id_keys.map { |key| record[key] }
+         end
+ 
+         def generate_meta(record)
+           meta = {}
+           meta[@action] = { _index: @index, _type: @index_type }
+           meta[@action][:_id] = generate_id(@id_format, record, @id_keys) unless @id_keys.nil?
+           meta
+         end
+       end
+     end
+   end
+ end
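In replace mode, `Connection#create_aliases` and `Connection#delete_aliases` above move an alias from the previous run's index to the newly loaded one, so queries can keep targeting a stable name while each Embulk run loads into a fresh index behind it. A minimal standalone sketch of the equivalent elasticsearch-ruby calls (index and alias names here are illustrative, not taken from the gem):

```ruby
require 'elasticsearch'

client = Elasticsearch::Client.new(hosts: [{ host: 'localhost', port: 9200 }])

# Illustrative names only: with `index: test_alias` in replace mode, the plugin writes to a
# timestamped index and treats the configured name as the alias.
new_index  = 'test_alias-crawl_companies-2017.03.27.12.00.00'
alias_name = 'test_alias'

# Point the alias at the freshly loaded index (mirrors Connection#create_aliases).
client.indices.update_aliases body: {
  actions: [{ add: { index: new_index, alias: alias_name } }]
}

# Detach the alias from older generations and optionally drop them
# (mirrors Connection#delete_aliases with delete_old_alias / delete_old_index enabled).
client.indices.get_alias(name: alias_name).keys.each do |old_index|
  next if old_index == new_index
  client.indices.delete_alias index: old_index, name: alias_name
  client.indices.delete index: old_index
end
```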
lib/embulk/output/elasticsearch_ruby.rb CHANGED
@@ -1,5 +1,4 @@
- require 'excon'
- require 'elasticsearch'
+ require_relative 'elasticsearch/connection'
  
  module Embulk
    module Output
@@ -12,40 +11,47 @@ module Embulk
          task = {
            "nodes" => config.param("nodes", :array, default: [{ 'host' => 'localhost', 'port' => 9200 }]),
            "request_timeout" => config.param("request_timeout", :integer, default: 60),
-           "index" => config.param("index", :string, default: 'logstash-%Y.%m.%d'),
+           "index_type" => config.param("index_type", :string),
            "mode" => config.param("mode", :string, default: 'normal'),
            "reload_connections" => config.param("reload_connections", :bool, default: true),
            "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
            "delete_old_index" => config.param("delete_old_index", :bool, default: false),
            "delete_old_alias" => config.param("delete_old_alias", :bool, default: true),
-           "index_type" => config.param("index_type", :string),
            "id_keys" => config.param("id_keys", :array, default: nil),
            "id_format" => config.param("id_format", :string, default: nil),
            "array_columns" => config.param("array_columns", :array, default: nil),
            "bulk_actions" => config.param("bulk_actions", :integer, default: 1000),
            "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
-           "before_template_name" => config.param("before_template_name", :string, default: nil),
-           "before_template" => config.param("before_template", :hash, default: nil),
-           "current_index_name" => config.param("current_index_name", :string, default: nil),
          }
-         task['time_value'] = Time.now.strftime('%Y.%m.%d.%H.%M.%S')
-         task['index'] = Time.now.strftime(task['index'])
- 
-         task['current_index_name'] = if task['current_index_name']
-           task['current_index_name']
-         else
-           "#{task['index']}-#{task['index_type']}-#{task['time_value']}"
-         end
  
          unless ENABLE_MODE.include?(task['mode'])
            raise ConfigError.new "`mode` must be one of #{ENABLE_MODE.join(', ')}"
          end
          Embulk.logger.info("mode => #{task['mode']}")
  
-         if task['before_template_name'] && task['before_template']
-           client = create_client(task)
-           Embulk.logger.info("put template => #{task['before_template_name']}")
-           client.indices.put_template name: task['before_template_name'], body: task['before_template']
+         current_index_name = config.param("current_index_name", :string, default: nil)
+         index = config.param("index", :string, default: 'logstash-%Y.%m.%d')
+         if task['mode'] == 'replace'
+           task['alias'] = index
+           task['index'] = if current_index_name
+             current_index_name
+           else
+             "#{index}-#{task['index_type']}-#{Time.now.strftime('%Y.%m.%d.%H.%M.%S')}"
+           end
+         else
+           task['index'] = Time.now.strftime(index)
+         end
+ 
+         connection = Connection.new(task)
+         before_delete_index = config.param("before_delete_index", :bool, default: false)
+         if before_delete_index
+           connection.delete_index(task['index'])
+         end
+ 
+         before_template_name = config.param("before_template_name", :string, default: nil)
+         before_template = config.param("before_template", :hash, default: nil)
+         if before_template_name && before_template
+           connection.put_template(before_template_name, before_template)
          end
  
          task_reports = yield(task)
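The index-name handling above is the main behavioral change in this hunk: `index` is strftime-expanded only in normal/update mode, while replace mode treats the configured `index` as the alias and generates a timestamped index name unless `current_index_name` is set. A small illustration (dates and names are examples only):

```ruby
# normal mode: the configured `index` pattern is passed through strftime
Time.now.strftime('logstash-%Y.%m.%d')
# => e.g. "logstash-2017.03.27"

# replace mode: the configured `index` becomes the alias, and records are written to a
# generated index name unless current_index_name is given
index      = 'test_alias'
index_type = 'crawl_companies'
"#{index}-#{index_type}-#{Time.now.strftime('%Y.%m.%d.%H.%M.%S')}"
# => e.g. "test_alias-crawl_companies-2017.03.27.12.34.56"
```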
@@ -55,57 +61,12 @@ module Embulk
  
        def self.cleanup(task, schema, count, task_reports)
          if task['mode'] == 'replace'
-           client = create_client(task)
-           create_aliases(client, task['index'], get_index(task))
-           delete_aliases(client, task)
+           connection = Connection.new(task)
+           connection.create_aliases
+           connection.delete_aliases
          end
        end
  
-       def self.create_client(task)
-         transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
-           {
-             hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
-             options: {
-               reload_connections: task['reload_connections'],
-               reload_on_failure: task['reload_on_failure'],
-               retry_on_failure: task['retry_on_failure'],
-               transport_options: {
-                 request: { timeout: task['request_timeout'] }
-               }
-             }
-           }
-         )
- 
-         ::Elasticsearch::Client.new transport: transport
-       end
- 
-       def self.create_aliases(client, als, index)
-         client.indices.update_aliases body: {
-           actions: [{ add: { index: index, alias: als } }]
-         }
-         Embulk.logger.info "created alias: #{als}, index: #{index}"
-       end
- 
-       def self.delete_aliases(client, task)
-         indices = client.indices.get_alias(name: task['index']).keys
-         indices.each { |index|
-           if index != get_index(task)
-             if task['delete_old_alias']
-               client.indices.delete_alias index: index, name: task['index']
-               Embulk.logger.info "deleted alias: #{task['index']}, index: #{index}"
-             end
-             if task['delete_old_index']
-               client.indices.delete index: index
-               Embulk.logger.info "deleted index: #{index}"
-             end
-           end
-         }
-       end
- 
-       def self.get_index(task)
-         task['mode'] == 'replace' ? task['current_index_name'] : task['index']
-       end
- 
        #def self.resume(task, schema, count, &control)
        # task_reports = yield(task)
        #
@@ -114,17 +75,8 @@ module Embulk
        #end
  
        def init
-         @nodes = task["nodes"]
-         @index_type = task["index_type"]
-         @id_keys = task["id_keys"]
-         @id_format = task["id_format"]
+         @connection = Connection.new(task)
          @bulk_actions = task["bulk_actions"]
-         @array_columns = task["array_columns"]
-         @retry_on_failure = task["retry_on_failure"]
-         @mode = task["mode"]
-         @index = self.class.get_index(task)
- 
-         @client = self.class.create_client(task)
          @bulk_message = []
        end
  
@@ -134,22 +86,24 @@ module Embulk
        def add(page)
          page.each do |record|
            hash = Hash[schema.names.zip(record)]
-           action = (@mode == 'update') ? :update : :index
-           meta = {}
-           meta[action] = { _index: @index, _type: @index_type }
-           meta[action][:_id] = generate_id(@id_format, hash, @id_keys) unless @id_keys.nil?
-           source = generate_array(hash)
+           meta = @connection.generate_meta(hash)
+           source = @connection.generate_source(hash)
+ 
+           Embulk.logger.debug("meta => #{meta}")
+           Embulk.logger.debug("source => #{source}")
+ 
            @bulk_message << meta
            @bulk_message << source
            if @bulk_actions * 2 <= @bulk_message.size
-             send
+             @connection.send(@bulk_message)
+             @bulk_message.clear
            end
          end
        end
  
        def finish
          if @bulk_message.size > 0
-           send
+           @connection.send(@bulk_message)
          end
        end
  
@@ -160,46 +114,6 @@ module Embulk
          task_report = {}
          return task_report
        end
- 
-       private
- 
-       def generate_array(record)
-         result = {}
- 
-         record.each { |key, value|
-           result[key] = value
-           next if (value.nil? || !@array_columns)
-           @array_columns.each do |array_column|
-             if array_column['name'] == key
-               array_value = value.split(array_column['delimiter']).reject(&:empty?)
-               array_value = array_value.map(&:to_i) if array_column['is_integer']
-               result[key] = array_value
-             end
-           end
-         }
-         (@mode == 'update') ? {doc: result} : result
-       end
- 
-       def generate_id(template, record, id_keys)
-         template % id_keys.map { |key| record[key] }
-       end
- 
-       def send
-         retries = 0
-         begin
-           @client.bulk body: @bulk_message
-           Embulk.logger.info "bulk: #{@bulk_message.size/2} success."
-         rescue => e
-           if retries < @retry_on_failure
-             retries += 1
-             Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
-             sleep 2**retries
-             retry
-           end
-           raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
-         end
-         @bulk_message.clear
-       end
- 
      end
    end
  end
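Pulling the bulk-assembly pieces together: `generate_meta` and `generate_source` (now on `Connection`) build one meta/source pair per record, and `Connection#send` posts the accumulated pairs via the Bulk API. A rough sketch of the shapes involved, with made-up values and illustrative settings (`id_keys: [id]`, `id_format: '%s'`, `array_columns: [{name: tags, delimiter: ','}]`):

```ruby
# Illustrative data only; shapes mirror Connection#generate_meta / #generate_source above.
record = { 'id' => 7, 'title' => 'foo', 'tags' => 'a,b,c' }

# normal mode: action :index, _id built from id_format % id_keys
meta   = { index: { _index: 'logstash-2017.03.27', _type: 'page', _id: '7' } }
source = { 'id' => 7, 'title' => 'foo', 'tags' => ['a', 'b', 'c'] } # tags split via array_columns

# update mode: action :update and the document wrapped in {doc: ...}
update_meta   = { update: { _index: 'logstash-2017.03.27', _type: 'page', _id: '7' } }
update_source = { doc: { 'id' => 7, 'title' => 'foo', 'tags' => ['a', 'b', 'c'] } }

# Connection#send then issues: client.bulk body: [meta, source, meta, source, ...]
```

Because each record contributes two elements to `@bulk_message`, the `@bulk_actions * 2 <= @bulk_message.size` check in `add` flushes after `bulk_actions` records, and the buffer is now cleared by the caller after each `Connection#send`.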
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: embulk-output-elasticsearch_ruby
  version: !ruby/object:Gem::Version
-   version: 0.1.4
+   version: 0.1.5
  platform: ruby
  authors:
  - toyama0919
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2017-02-27 00:00:00.000000000 Z
+ date: 2017-03-27 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    requirement: !ruby/object:Gem::Requirement
@@ -121,6 +121,7 @@ files:
  - README.md
  - Rakefile
  - embulk-output-elasticsearch_ruby.gemspec
+ - lib/embulk/output/elasticsearch/connection.rb
  - lib/embulk/output/elasticsearch_ruby.rb
  - test/helper.rb
  - test/test_transaction.rb