embulk-output-elasticsearch_ruby 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7ca9bce994b986ee593e702147af3d22a8fb356e
4
+ data.tar.gz: aacbb3bd967e9f93dc27dac24506a1c060b9aef3
5
+ SHA512:
6
+ metadata.gz: 05762c410db87e80d0ffe68970ed62437a8273c3daa177b530e865ab768c62f15d6940feb92e4499f529623063f09f23a6c349b253a4234f65797703e35e7ce1
7
+ data.tar.gz: 248d53aed9f7dbd9a4ba0c4375f50d171fc4ec37848d4e065f418df9670f9d2183eb4105ccf3c748f10a4aee3985fd59270cef85fba22abaf27386a23ba3459e
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /vendor/
6
+ /Gemfile.lock
7
+ .ruby-version
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # Elasticsearch Ruby output plugin for Embulk
2
+
3
+ Dumps records to Elasticsearch using the Ruby client. Compatible with Elasticsearch 1.x, 2.x, and 5.x.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: output
8
+ * **Load all or nothing**: no
9
+ * **Resume supported**: no
10
+ * **Cleanup supported**: yes
11
+
12
+ ## Configuration
13
+ - **nodes**: nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
14
+ - **host**: host (string)
15
+ - **port**: port (integer)
16
+ - **request_timeout**: request_timeout (integer, default: 60)
17
+ - **index**: index (string, default: 'logstash-%Y.%m.%d')
18
+ - **mode**: mode; one of normal, update, or replace (string, default: normal)
19
+ - **reload_connections**: reload_connections (bool, default: true)
20
+ - **reload_on_failure**: reload_on_failure (bool, default: false)
21
+ - **delete_old_index**: delete_old_index (bool, default: false)
22
+ - **index_type**: index_type (string)
23
+ - **id_keys**: id_keys (array, default: nil)
24
+ - **id_format**: id_format (string, default: nil)
25
+ - **array_columns**: array_columns (array, default: nil)
26
+ - **name**: Array convert column. (string)
27
+ - **delimiter**: delimiter for split. (string)
28
+ - **is_integer**: to integer. (bool)
29
+ - **bulk_actions**: bulk_actions (integer, default: 1000)
30
+ - **retry_on_failure**: retry_on_failure (integer, default: 5)
31
+
32
+ ## Example
33
+
34
+ ```yaml
35
+ out:
36
+ type: elasticsearch_ruby
37
+ nodes:
38
+ - {host: localhost, port: 9200}
39
+ index_type: page
40
+ ```
41
+
42
+ ## Example (update)
43
+
44
+ ```yaml
45
+ out:
46
+ type: elasticsearch_ruby
47
+ nodes:
48
+ - {host: {{ env.ES_HOST }}, port: 9200}
49
+ index: crawl
50
+ index_type: page
51
+ bulk_actions: 1000
52
+ request_timeout: 60
53
+ mode: update
54
+ id_format: "%s"
55
+ id_keys:
56
+ - _id
57
+ ```
58
+
59
+
60
+ ## Build
61
+
62
+ ```
63
+ $ rake
64
+ ```
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
# Defines the `test` task, which runs every test/**/test_*.rb file in
# sorted order, and makes it the default task.
desc 'Run test_unit based test'
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test")
  test_task.test_files = Dir.glob("test/**/test_*.rb").sort
  test_task.verbose = true
  test_task.warning = false
end

task default: :test
@@ -0,0 +1,23 @@
1
+
2
# Gem specification for the plugin: identity, packaged files, and
# runtime/development dependencies.
Gem::Specification.new do |gem|
  gem.name          = "embulk-output-elasticsearch_ruby"
  gem.version       = "0.1.1"
  gem.authors       = ["toyama0919"]
  gem.email         = ["toyama0919@gmail.com"]
  gem.summary       = "Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible."
  gem.description   = "Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible."
  gem.licenses      = ["MIT"]
  gem.homepage      = "https://github.com/toyama0919/embulk-output-elasticsearch_ruby"

  # Package everything tracked by git plus any jars under classpath/.
  tracked_files = `git ls-files`.split("\n")
  gem.files         = tracked_files + Dir["classpath/*.jar"]
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies.
  %w[elasticsearch excon].each { |runtime_gem| gem.add_dependency runtime_gem }

  # Development dependencies.
  gem.add_development_dependency 'bundler', ['~> 1.0']
  gem.add_development_dependency 'embulk', ['>= 0.8.15']
  gem.add_development_dependency 'rake', ['>= 10.0']
  %w[test-unit test-unit-rr].each { |test_gem| gem.add_development_dependency test_gem }
end
@@ -0,0 +1,192 @@
1
+ require 'excon'
2
+ require 'elasticsearch'
3
+
4
+ module Embulk
5
+ module Output
6
+
7
+ class Elasticsearch < OutputPlugin
8
+ Plugin.register_output("elasticsearch_ruby", self)
9
+ ENABLE_MODE = %w[normal update replace]
10
+
11
# Reads the plugin options from +config+, validates them, and hands the
# resulting task hash to the block that runs the output tasks.
#
# @param config [#param] option source; every option is read through
#   `config.param(name, type, default: ...)`
# @param schema [Object] unused by this method
# @param count [Integer] unused by this method
# @yield [task] runs the output tasks with the resolved settings
# @yieldparam task [Hash] option values keyed by option name, plus
#   `time_value` (timestamp of this run) and the strftime-expanded `index`
# @return [Hash] always an empty hash
# @raise [ConfigError] when `mode` is not one of ENABLE_MODE, or when
#   `id_keys` is set without an `id_format` to render the ids with
def self.transaction(config, schema, count, &control)
  task = {
    "nodes" => config.param("nodes", :array, default: [{ 'host' => 'localhost', 'port' => 9200 }]),
    "request_timeout" => config.param("request_timeout", :integer, default: 60),
    "index" => config.param("index", :string, default: 'logstash-%Y.%m.%d'),
    "mode" => config.param("mode", :string, default: 'normal'),
    "reload_connections" => config.param("reload_connections", :bool, default: true),
    "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
    "delete_old_index" => config.param("delete_old_index", :bool, default: false),
    "index_type" => config.param("index_type", :string),
    "id_keys" => config.param("id_keys", :array, default: nil),
    "id_format" => config.param("id_format", :string, default: nil),
    "array_columns" => config.param("array_columns", :array, default: nil),
    "bulk_actions" => config.param("bulk_actions", :integer, default: 1000),
    "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
  }

  # Read the clock once so the expanded index name and the timestamp
  # suffix always describe the same instant (two separate reads could
  # straddle a date change).
  now = Time.now
  task['time_value'] = now.strftime('%Y.%m.%d.%H.%M.%S')
  task['index'] = now.strftime(task['index'])

  unless ENABLE_MODE.include?(task['mode'])
    raise ConfigError.new("`mode` must be one of #{ENABLE_MODE.join(', ')}")
  end

  # Document ids are rendered as `id_format % values`; without a format
  # the first record would fail with NoMethodError, so reject it up front.
  if task['id_keys'] && task['id_format'].nil?
    raise ConfigError.new("`id_format` is required when `id_keys` is set")
  end

  Embulk.logger.info("mode => #{task['mode']}")

  yield(task)
  {}
end
39
+
40
# Post-run hook. Does nothing unless the task ran in `replace` mode; in
# that mode it adds the alias named by task['index'] to the index
# returned by get_index, then calls delete_aliases.
#
# @param task [Hash] task settings produced by `transaction`
# @param schema [Object] unused
# @param count [Integer] unused
# @param task_reports [Array] unused
# @return [Object, nil] nil outside `replace` mode, otherwise whatever
#   delete_aliases returns
def self.cleanup(task, schema, count, task_reports)
  return unless task['mode'] == 'replace'

  es = create_client(task)
  create_aliases(es, task['index'], get_index(task))
  delete_aliases(es, task)
end
47
+
48
# Builds an Elasticsearch client on a Faraday HTTP transport configured
# from the task settings.
#
# @param task [Hash] reads `nodes` (array of hashes whose keys are
#   symbolized), `reload_connections`, `reload_on_failure`,
#   `retry_on_failure` and `request_timeout`
# @return [::Elasticsearch::Client]
def self.create_client(task)
  hosts = task['nodes'].map do |node|
    # The transport is given symbol keys (:host, :port, ...).
    node.each_with_object({}) { |(key, value), symbolized| symbolized[key.to_sym] = value }
  end

  options = {
    reload_connections: task['reload_connections'],
    reload_on_failure: task['reload_on_failure'],
    retry_on_failure: task['retry_on_failure'],
    transport_options: { request: { timeout: task['request_timeout'] } }
  }

  transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new({ hosts: hosts, options: options })
  ::Elasticsearch::Client.new transport: transport
end
65
+
66
# Adds the alias +als+ to +index+ through the update-aliases API and
# logs the result.
#
# @param client [Object] client exposing `indices.update_aliases`
# @param als [String] alias name
# @param index [String] index the alias is added to
def self.create_aliases(client, als, index)
  add_action = { add: { index: index, alias: als } }
  client.indices.update_aliases body: { actions: [add_action] }
  Embulk.logger.info "created alias: #{als}, index: #{index}"
end
72
+
73
# Removes the alias task['index'] from every earlier index of this
# plugin (names of the form "<index prefix>-<digits...>") except the
# index of the current run, and deletes those indices as well when
# task['delete_old_index'] is set.
#
# @param client [Object] client exposing `indices.get_aliases`,
#   `indices.delete_alias` and `indices.delete`
# @param task [Hash] task settings; reads `index` and `delete_old_index`
# @return [Array<String>] the aliased index names that matched the
#   prefix pattern (including the current index)
def self.delete_aliases(client, task)
  alias_name = task['index']
  current_index = get_index(task)

  # The prefix is escaped because index names routinely contain `.`,
  # which would otherwise match any character, and at least one digit is
  # required after the dash so that only timestamp-suffixed names are
  # eligible for alias removal or deletion.
  managed_name = /\A#{Regexp.escape(get_index_prefix(task))}-\d/

  aliased = client.indices.get_aliases.select { |_name, info| info['aliases'].include?(alias_name) }.keys
  candidates = aliased.select { |name| name =~ managed_name }

  candidates.each do |name|
    next if name == current_index

    client.indices.delete_alias index: name, name: alias_name
    Embulk.logger.info "deleted alias: #{alias_name}, index: #{name}"
    next unless task['delete_old_index']

    client.indices.delete index: name
    Embulk.logger.info "deleted index: #{name}"
  end
end
87
+
88
# Name of the index that records are written to.
#
# @param task [Hash] reads `mode`, `index` and `time_value`
# @return [String] task['index'] as is, except in `replace` mode where
#   it is "<index prefix>-<time_value>"
def self.get_index(task)
  return task['index'] unless task['mode'] == 'replace'

  "#{get_index_prefix(task)}-#{task['time_value']}"
end
91
+
92
# Joins the configured index name and index type with a dash.
#
# @param task [Hash] reads `index` and `index_type`
# @return [String] "<index>-<index_type>"
def self.get_index_prefix(task)
  [task['index'], task['index_type']].map(&:to_s).join('-')
end
95
+
96
+ #def self.resume(task, schema, count, &control)
97
+ # task_reports = yield(task)
98
+ #
99
+ # next_config_diff = {}
100
+ # return next_config_diff
101
+ #end
102
+
103
# Copies the task settings this instance uses into instance variables,
# resolves the target index, creates the client and starts with an
# empty bulk buffer.
#
# @return [Array] the new, empty bulk buffer
def init
  %w[nodes index_type id_keys id_format bulk_actions array_columns retry_on_failure mode].each do |option|
    instance_variable_set("@#{option}", task[option])
  end

  plugin = self.class
  @index = plugin.get_index(task)
  @client = plugin.create_client(task)
  @bulk_message = []
end
117
+
118
# Intentionally a no-op: this method performs no teardown.
#
# @return [nil]
def close
  nil
end
120
+
121
# Buffers one bulk action per record of +page+ and flushes the buffer
# whenever it holds at least `bulk_actions` actions.
#
# Each record is zipped with `schema.names` into a column-name => value
# hash. Every action occupies two buffer entries: a header
# ({ index|update => { _index, _type[, _id] } }) and the document body
# produced by generate_array.
#
# @param page [#each] records to write; each record is zipped against
#   the schema column names
# @return [Object] whatever `page.each` returns
def add(page)
  # These do not change while the page is processed, so compute them
  # once instead of once per record.
  column_names = schema.names
  action = @mode == 'update' ? :update : :index
  flush_size = @bulk_actions * 2 # two buffer entries per action

  page.each do |record|
    row = Hash[column_names.zip(record)]
    header = { _index: @index, _type: @index_type }
    header[:_id] = generate_id(@id_format, row, @id_keys) unless @id_keys.nil?
    # Build the body before touching the buffer so a failure here never
    # leaves a header without its document.
    body = generate_array(row)
    @bulk_message << { action => header }
    @bulk_message << body
    send if flush_size <= @bulk_message.size
  end
end
136
+
137
# Flushes whatever is still buffered; does nothing when the buffer is
# empty.
def finish
  send unless @bulk_message.empty?
end
142
+
143
# Intentionally a no-op: nothing is rolled back here.
#
# @return [nil]
def abort
  nil
end
145
+
146
# Returns the report for this task, which is always empty.
#
# @return [Hash] an empty hash
def commit
  {}
end
150
+
151
+ private
152
+
153
# Builds the document body for one record, turning the columns listed
# in @array_columns from delimited strings into arrays.
#
# For a configured column, the value is split on the entry's
# `delimiter`, empty pieces are dropped, and the pieces are converted
# with `to_i` when `is_integer` is set. nil values and values that
# cannot be split (for example a value that is already an array, or a
# number) are passed through unchanged.
#
# @param record [Hash] column name => value
# @return [Hash] the converted record; wrapped as `{ doc: ... }` when
#   @mode is 'update'
def generate_array(record)
  result = {}

  record.each do |key, value|
    result[key] = value
    next if value.nil? || !@array_columns

    @array_columns.each do |array_column|
      next unless array_column['name'] == key
      # Only split values that support it; calling split on anything
      # else would raise NoMethodError.
      next unless value.respond_to?(:split)

      pieces = value.split(array_column['delimiter']).reject(&:empty?)
      pieces = pieces.map(&:to_i) if array_column['is_integer']
      result[key] = pieces
    end
  end

  @mode == 'update' ? { doc: result } : result
end
169
+
170
# Renders a document id by applying the format string +template+ to the
# record values found under +id_keys+, in that order.
#
# @param template [String] format string applied with `%`
# @param record [Hash] column name => value
# @param id_keys [Array<String>] columns whose values fill the template
# @return [String] the formatted id
def generate_id(template, record, id_keys)
  template % record.values_at(*id_keys)
end
173
+
174
# Sends the buffered actions to Elasticsearch with one bulk request and
# empties the buffer.
#
# A request that raises is retried up to @retry_on_failure times, with
# a pause of 2**attempt seconds before each retry. The response is then
# inspected: the Bulk API can answer successfully while individual
# actions failed, which it flags in the response body, so such batches
# are logged as a warning instead of being reported as a success.
#
# NOTE(review): this method shadows Object#send for instances of this
# class; it keeps its name because other methods call it by that name.
#
# @return [Array, nil] the emptied buffer, or nil when there was
#   nothing to send
# @raise [RuntimeError] when the request still fails after the retries
def send
  return if @bulk_message.empty?

  retries = 0
  begin
    response = @client.bulk body: @bulk_message
  rescue => e
    if retries < @retry_on_failure
      retries += 1
      Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
      sleep 2**retries
      retry
    end
    raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
  end

  # Logging happens outside the retried region so a logging problem can
  # never cause the same batch to be sent twice.
  total = @bulk_message.size / 2
  if response.is_a?(Hash) && response['errors']
    failed = Array(response['items']).count do |item|
      item.is_a?(Hash) && item.values.any? { |outcome| outcome.is_a?(Hash) && outcome['error'] }
    end
    Embulk.logger.warn "bulk: #{failed} of #{total} actions failed."
  else
    Embulk.logger.info "bulk: #{total} success."
  end

  @bulk_message.clear
end
190
+ end
191
+ end
192
+ end
data/test/helper.rb ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require 'test/unit/rr'
5
+
6
+ # require 'embulk/java/bootstrap'
7
+ require 'embulk'
8
Embulk.setup
# Send log output to the platform's null device. File::NULL is used
# instead of a hard-coded '/dev/null', which does not exist on every
# platform.
Embulk.logger = Embulk::Logger.new(File::NULL)

# Directory one level above the one containing this helper.
APP_ROOT = File.expand_path('../', __dir__)
@@ -0,0 +1,64 @@
1
+ require_relative './helper'
2
+ require 'embulk/output/elasticsearch_ruby'
3
+
4
+ OUTPUT_ELASTICSEARCH = Embulk::Output::Elasticsearch
5
+
6
module Embulk
  class Output::Elasticsearch
    # Tests for Elasticsearch.transaction: the task handed to the control
    # block, the returned config diff, and rejection of an unknown mode.
    class TestTransaction < Test::Unit::TestCase
      # Smallest configuration used by these tests: one node and an
      # index type.
      def least_config
        DataSource.new({
          'nodes' => [{ 'host' => 'localhost', 'port' => 9200 }],
          'index_type' => 'page'
        })
      end

      # Schema with one column of each type listed below.
      def schema
        Schema.new([
          Column.new({index: 0, name: 'boolean', type: :boolean}),
          Column.new({index: 1, name: 'long', type: :long}),
          Column.new({index: 2, name: 'double', type: :double}),
          Column.new({index: 3, name: 'string', type: :string}),
          Column.new({index: 4, name: 'timestamp', type: :timestamp}),
          Column.new({index: 5, name: 'json', type: :json}),
        ])
      end

      def processor_count
        1
      end

      # Control block handed to `transaction`. It remembers the task it
      # was given so tests can inspect it, and returns no task reports.
      def control
        Proc.new do |task|
          @yielded_task = task
          []
        end
      end

      def setup
        # NOTE(review): `transaction_report` is not called anywhere in
        # the code under test that is visible here; confirm this stub is
        # still needed.
        stub(OUTPUT_ELASTICSEARCH).transaction_report { {} }
      end

      sub_test_case "normal" do
        def test_minimum
          config = least_config
          result = OUTPUT_ELASTICSEARCH.transaction(config, schema, processor_count, &control)
          assert_equal({}, result)
          assert_equal('normal', @yielded_task['mode'])
          assert_equal('page', @yielded_task['index_type'])
        end

        def test_mode
          %w[update replace].each do |mode|
            config = least_config.merge('mode' => mode)
            result = OUTPUT_ELASTICSEARCH.transaction(config, schema, processor_count, &control)
            assert_equal({}, result)
            assert_equal(mode, @yielded_task['mode'])
          end
        end
      end

      sub_test_case "error" do
        def test_mode
          config = least_config.merge('mode' => 'hoge')
          assert_raise ConfigError do
            OUTPUT_ELASTICSEARCH.transaction(config, schema, processor_count, &control)
          end
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,153 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-output-elasticsearch_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-11-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: elasticsearch
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ name: excon
34
+ prerelease: false
35
+ type: :runtime
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.0'
47
+ name: bundler
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 0.8.15
61
+ name: embulk
62
+ prerelease: false
63
+ type: :development
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 0.8.15
69
+ - !ruby/object:Gem::Dependency
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '10.0'
75
+ name: rake
76
+ prerelease: false
77
+ type: :development
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
83
+ - !ruby/object:Gem::Dependency
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ name: test-unit
90
+ prerelease: false
91
+ type: :development
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ name: test-unit-rr
104
+ prerelease: false
105
+ type: :development
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible.
112
+ email:
113
+ - toyama0919@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".gitignore"
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - embulk-output-elasticsearch_ruby.gemspec
124
+ - lib/embulk/output/elasticsearch_ruby.rb
125
+ - test/helper.rb
126
+ - test/test_transaction.rb
127
+ homepage: https://github.com/toyama0919/embulk-output-elasticsearch_ruby
128
+ licenses:
129
+ - MIT
130
+ metadata: {}
131
+ post_install_message:
132
+ rdoc_options: []
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ requirements: []
146
+ rubyforge_project:
147
+ rubygems_version: 2.6.6
148
+ signing_key:
149
+ specification_version: 4
150
+ summary: Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible.
151
+ test_files:
152
+ - test/helper.rb
153
+ - test/test_transaction.rb