embulk-output-elasticsearch_ruby 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 7ca9bce994b986ee593e702147af3d22a8fb356e
4
+ data.tar.gz: aacbb3bd967e9f93dc27dac24506a1c060b9aef3
5
+ SHA512:
6
+ metadata.gz: 05762c410db87e80d0ffe68970ed62437a8273c3daa177b530e865ab768c62f15d6940feb92e4499f529623063f09f23a6c349b253a4234f65797703e35e7ce1
7
+ data.tar.gz: 248d53aed9f7dbd9a4ba0c4375f50d171fc4ec37848d4e065f418df9670f9d2183eb4105ccf3c748f10a4aee3985fd59270cef85fba22abaf27386a23ba3459e
data/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /vendor/
6
+ /Gemfile.lock
7
+ .ruby-version
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# Resolve gems from rubygems.org; the dependency list itself is declared in
# the gemspec next to this file.
source('https://rubygems.org/')

gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,64 @@
1
+ # Elasticsearch Ruby output plugin for Embulk
2
+
3
+ Dumps records to Elasticsearch using the Ruby client. Compatible with Elasticsearch 1.x, 2.x, and 5.x.
4
+
5
+ ## Overview
6
+
7
+ * **Plugin type**: output
8
+ * **Load all or nothing**: no
9
+ * **Resume supported**: no
10
+ * **Cleanup supported**: yes
11
+
12
+ ## Configuration
13
+ - **nodes**: nodes (array, default: [{ 'host' => 'localhost', 'port' => 9200 }])
14
+ - **host**: host name of the Elasticsearch node (string)
15
+ - **port**: port of the Elasticsearch node (integer)
16
+ - **request_timeout**: request_timeout (integer, default: 60)
17
+ - **index**: index name; `strftime` placeholders are expanded with the current time (string, default: 'logstash-%Y.%m.%d')
18
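+ - **mode**: write mode. `normal` indexes documents, `update` sends partial-document updates, and `replace` writes to a new timestamped index and then points the `index` alias at it (string, default: normal)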
19
+ - **reload_connections**: reload_connections (bool, default: true)
20
+ - **reload_on_failure**: reload_on_failure (bool, default: false)
21
+ - **delete_old_index**: in `replace` mode, also delete the old indices whose alias was removed (bool, default: false)
22
+ - **index_type**: document type to write to (string, required)
23
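+ - **id_keys**: columns whose values are used to build the document `_id` (array, default: nil)
24
+ - **id_format**: format string applied to the `id_keys` values to build the `_id`, e.g. `"%s_%s"` (string, default: nil)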
25
+ - **array_columns**: array_columns (array, default: nil)
26
+ - **name**: name of the column to convert into an array. (string)
27
+ - **delimiter**: delimiter used to split the column value. (string)
28
+ - **is_integer**: convert each element to an integer. (bool)
29
+ - **bulk_actions**: bulk_actions (integer, default: 1000)
30
+ - **retry_on_failure**: retry_on_failure (integer, default: 5)
31
+
32
+ ## Example
33
+
34
+ ```yaml
35
+ out:
36
+ type: elasticsearch_ruby
37
+ nodes:
38
+ - {host: localhost, port: 9200}
39
+ index_type: page
40
+ ```
41
+
42
+ ## Example (update)
43
+
44
+ ```yaml
45
+ out:
46
+ type: elasticsearch_ruby
47
+ nodes:
48
+ - {host: {{ env.ES_HOST }}, port: 9200}
49
+ index: crawl
50
+ index_type: page
51
+ bulk_actions: 1000
52
+ request_timeout: 60
53
+ mode: update
54
+ id_format: "%s"
55
+ id_keys:
56
+ - _id
57
+ ```
58
+
59
+
60
+ ## Build
61
+
62
+ ```
63
+ $ rake
64
+ ```
data/Rakefile ADDED
@@ -0,0 +1,11 @@
1
+ require "bundler/gem_tasks"
2
+ require 'rake/testtask'
3
+
4
# Defines the `test` task, which runs every test/**/test_*.rb file, and makes
# it the default task.
desc 'Run test_unit based test'
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test")
  test_task.test_files = Dir.glob("test/**/test_*.rb").sort
  test_task.verbose = true
  test_task.warning = false
end

task default: :test
@@ -0,0 +1,23 @@
1
+
2
# Gem specification for the Elasticsearch (Ruby client) output plugin.
Gem::Specification.new do |gem|
  # Identity
  gem.name     = "embulk-output-elasticsearch_ruby"
  gem.version  = "0.1.1"
  gem.authors  = ["toyama0919"]
  gem.email    = ["toyama0919@gmail.com"]
  gem.licenses = ["MIT"]
  gem.homepage = "https://github.com/toyama0919/embulk-output-elasticsearch_ruby"

  gem.summary     = "Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible."
  gem.description = "Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible."

  # Packaged files: everything tracked by git plus any bundled jars.
  tracked_files = `git ls-files`.split("\n")
  gem.files         = tracked_files + Dir["classpath/*.jar"]
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # Runtime dependencies
  gem.add_dependency 'elasticsearch'
  gem.add_dependency 'excon'

  # Development dependencies
  gem.add_development_dependency 'bundler', '~> 1.0'
  gem.add_development_dependency 'embulk', '>= 0.8.15'
  gem.add_development_dependency 'rake', '>= 10.0'
  gem.add_development_dependency 'test-unit'
  gem.add_development_dependency 'test-unit-rr'
end
@@ -0,0 +1,192 @@
1
+ require 'excon'
2
+ require 'elasticsearch'
3
+
4
module Embulk
  module Output

    # Embulk output plugin, registered as "elasticsearch_ruby", that sends
    # records to Elasticsearch through the bulk API of the `elasticsearch` gem.
    #
    # The class-level methods build the task and manage aliases for `replace`
    # mode; the instance methods buffer records and flush them in bulk requests.
    class Elasticsearch < OutputPlugin
      Plugin.register_output("elasticsearch_ruby", self)

      # Accepted values of the `mode` option.
      ENABLE_MODE = %w[normal update replace].freeze

      # Builds the task hash from the plugin configuration, validates it and
      # yields it to the given block.
      #
      # @param config [#param] plugin configuration
      # @param schema [Object] not used by this method
      # @param count [Integer] not used by this method
      # @yieldparam task [Hash] resolved task settings
      # @return [Hash] next config diff (always empty)
      # @raise [ConfigError] if `mode` is not one of ENABLE_MODE, or if
      #   `id_keys` is given without `id_format`
      def self.transaction(config, schema, count, &control)
        task = {
          "nodes" => config.param("nodes", :array, default: [{ 'host' => 'localhost', 'port' => 9200 }]),
          "request_timeout" => config.param("request_timeout", :integer, default: 60),
          "index" => config.param("index", :string, default: 'logstash-%Y.%m.%d'),
          "mode" => config.param("mode", :string, default: 'normal'),
          "reload_connections" => config.param("reload_connections", :bool, default: true),
          "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
          "delete_old_index" => config.param("delete_old_index", :bool, default: false),
          "index_type" => config.param("index_type", :string),
          "id_keys" => config.param("id_keys", :array, default: nil),
          "id_format" => config.param("id_format", :string, default: nil),
          "array_columns" => config.param("array_columns", :array, default: nil),
          "bulk_actions" => config.param("bulk_actions", :integer, default: 1000),
          "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
        }

        # Sample the clock once so the expanded index name and the replace-mode
        # timestamp suffix always refer to the same instant.
        now = Time.now
        task['time_value'] = now.strftime('%Y.%m.%d.%H.%M.%S')
        task['index'] = now.strftime(task['index'])

        unless ENABLE_MODE.include?(task['mode'])
          raise ConfigError.new("`mode` must be one of #{ENABLE_MODE.join(', ')}")
        end
        # Without a format string the document id cannot be built
        # (see #generate_id), so fail here instead of on the first record.
        if task['id_keys'] && task['id_format'].nil?
          raise ConfigError.new("`id_format` is required when `id_keys` is set")
        end
        Embulk.logger.info("mode => #{task['mode']}")

        yield(task)
        {}
      end

      # In `replace` mode, points the alias named after the configured index at
      # the index written by this run and detaches it from older indices.
      # Does nothing in the other modes.
      #
      # @param task [Hash] task built by .transaction
      def self.cleanup(task, schema, count, task_reports)
        return unless task['mode'] == 'replace'

        client = create_client(task)
        create_aliases(client, task['index'], get_index(task))
        delete_aliases(client, task)
      end

      # Creates an Elasticsearch client on a Faraday transport configured from
      # the task (nodes, reload/retry flags and request timeout).
      #
      # @param task [Hash] task built by .transaction
      # @return [::Elasticsearch::Client]
      def self.create_client(task)
        # The node hashes arrive with string keys; symbolize them before
        # handing them to the transport.
        hosts = task['nodes'].map do |node|
          node.each_with_object({}) { |(key, value), host| host[key.to_sym] = value }
        end

        transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
          hosts: hosts,
          options: {
            reload_connections: task['reload_connections'],
            reload_on_failure: task['reload_on_failure'],
            retry_on_failure: task['retry_on_failure'],
            transport_options: {
              request: { timeout: task['request_timeout'] }
            }
          }
        )

        ::Elasticsearch::Client.new transport: transport
      end

      # Adds alias `als` to `index`.
      #
      # @param client [::Elasticsearch::Client]
      # @param als [String] alias name
      # @param index [String] index the alias should point to
      def self.create_aliases(client, als, index)
        client.indices.update_aliases body: {
          actions: [{ add: { index: index, alias: als } }]
        }
        Embulk.logger.info "created alias: #{als}, index: #{index}"
      end

      # Removes the alias `task['index']` from every index that carries it,
      # whose name starts with "<index>-<index_type>-", and that is not the
      # index of the current run. With `delete_old_index` those indices are
      # deleted as well.
      #
      # NOTE(review): `indices.get_aliases` is the call the original code used;
      # confirm it is available in the client version you deploy with.
      #
      # @param client [::Elasticsearch::Client]
      # @param task [Hash] task built by .transaction
      def self.delete_aliases(client, task)
        current_index = get_index(task)
        # Escape the prefix: index names usually contain dots, which would
        # otherwise match any character and could select unrelated indices.
        old_index_pattern = /\A#{Regexp.escape(get_index_prefix(task))}-/

        aliased = client.indices.get_aliases.select { |_name, info| info['aliases'].include? task['index'] }.keys
        stale = aliased.select { |name| name =~ old_index_pattern && name != current_index }

        stale.each do |index|
          client.indices.delete_alias index: index, name: task['index']
          Embulk.logger.info "deleted alias: #{task['index']}, index: #{index}"
          next unless task['delete_old_index']

          client.indices.delete index: index
          Embulk.logger.info "deleted index: #{index}"
        end
      end

      # Name of the index records are written to: the configured index, or in
      # `replace` mode "<index>-<index_type>-<time_value>".
      #
      # @param task [Hash] task built by .transaction
      # @return [String]
      def self.get_index(task)
        task['mode'] == 'replace' ? "#{get_index_prefix(task)}-#{task['time_value']}" : task['index']
      end

      # Common prefix ("<index>-<index_type>") of the replace-mode indices.
      #
      # @param task [Hash] task built by .transaction
      # @return [String]
      def self.get_index_prefix(task)
        "#{task['index']}-#{task['index_type']}"
      end

      #def self.resume(task, schema, count, &control)
      #  task_reports = yield(task)
      #
      #  next_config_diff = {}
      #  return next_config_diff
      #end

      # Copies the task settings into instance variables, creates the client
      # and starts with an empty bulk buffer.
      def init
        @nodes = task["nodes"]
        @index_type = task["index_type"]
        @id_keys = task["id_keys"]
        @id_format = task["id_format"]
        @bulk_actions = task["bulk_actions"]
        @array_columns = task["array_columns"]
        @retry_on_failure = task["retry_on_failure"]
        @mode = task["mode"]
        @index = self.class.get_index(task)

        @client = self.class.create_client(task)
        @bulk_message = []
      end

      def close
      end

      # Buffers one action line and one source line per record and flushes the
      # buffer once it holds `bulk_actions` records.
      #
      # @param page [#each] records; each record is zipped with `schema.names`
      def add(page)
        action = (@mode == 'update') ? :update : :index

        page.each do |record|
          row = Hash[schema.names.zip(record)]
          meta = { _index: @index, _type: @index_type }
          meta[:_id] = generate_id(@id_format, row, @id_keys) unless @id_keys.nil?

          @bulk_message << { action => meta }
          @bulk_message << generate_array(row)
          # Two buffer entries (action + source) are stored per record.
          flush_bulk if @bulk_message.size >= @bulk_actions * 2
        end
      end

      # Sends whatever is still buffered.
      def finish
        flush_bulk unless @bulk_message.empty?
      end

      def abort
      end

      # @return [Hash] task report (always empty)
      def commit
        {}
      end

      private

      # Builds the document body for one record, splitting the values of the
      # configured `array_columns` into arrays. In `update` mode the body is
      # wrapped as `{doc: ...}`.
      #
      # nil values and values that cannot be split (anything without #split,
      # e.g. numbers) are passed through unchanged instead of raising.
      #
      # @param record [Hash] column name => value
      # @return [Hash]
      def generate_array(record)
        result = {}
        record.each do |key, value|
          column = array_column_for(key)
          result[key] = (column && value.respond_to?(:split)) ? split_value(value, column) : value
        end
        (@mode == 'update') ? { doc: result } : result
      end

      # Returns the `array_columns` entry for `key`, or nil. When several
      # entries name the same column the last one wins.
      def array_column_for(key)
        return nil unless @array_columns

        @array_columns.reverse_each.find { |column| column['name'] == key }
      end

      # Splits `value` on the column's delimiter, drops empty parts and, when
      # `is_integer` is set, converts every part with #to_i.
      def split_value(value, column)
        parts = value.split(column['delimiter']).reject(&:empty?)
        column['is_integer'] ? parts.map(&:to_i) : parts
      end

      # Builds the document id by applying the format string to the values of
      # the id columns.
      #
      # @param template [String] format string (`id_format`)
      # @param record [Hash] column name => value
      # @param id_keys [Array<String>] columns whose values fill the template
      # @return [String]
      def generate_id(template, record, id_keys)
        template % id_keys.map { |key| record[key] }
      end

      # Sends the buffered actions in one bulk request and clears the buffer.
      # A failing request is retried up to `retry_on_failure` times, sleeping
      # 2**attempt seconds between attempts.
      #
      # This method is deliberately not called `send`: a method of that name
      # would shadow Object#send for every instance of the plugin.
      #
      # @raise [RuntimeError] when the request still fails after all retries
      def flush_bulk
        retries = 0
        begin
          response = @client.bulk body: @bulk_message
        rescue => e
          if retries < @retry_on_failure
            retries += 1
            Embulk.logger.warn "Could not push logs to Elasticsearch, resetting connection and trying again. #{e.message}"
            sleep 2**retries
            retry
          end
          raise "Could not push logs to Elasticsearch after #{retries} retries. #{e.message}"
        else
          # Runs outside the rescue scope, so a problem while reporting can
          # never trigger a re-send of the same batch.
          log_bulk_result(response)
        end
        @bulk_message.clear
      end

      # Logs the outcome of a bulk request. A bulk request can succeed at the
      # HTTP level while individual actions are rejected; the response flags
      # this with "errors" and lists per-item results under "items".
      #
      # NOTE(review): assumes the client returns the parsed response as a Hash;
      # any other response shape is reported as success, as before.
      def log_bulk_result(response)
        total = @bulk_message.size / 2
        unless response.is_a?(Hash) && response['errors']
          Embulk.logger.info "bulk: #{total} success."
          return
        end

        failed = Array(response['items']).count do |item|
          item.is_a?(Hash) && item.values.any? { |result| result.is_a?(Hash) && result['error'] }
        end
        Embulk.logger.warn "bulk: #{failed} of #{total} actions were rejected by Elasticsearch."
      end
    end
  end
end
data/test/helper.rb ADDED
@@ -0,0 +1,11 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require 'test/unit'
4
+ require 'test/unit/rr'
5
+
6
+ # require 'embulk/java/bootstrap'
7
+ require 'embulk'
8
# Boot Embulk for the test run and discard its log output.
Embulk.setup
# File::NULL is the platform's null device ("/dev/null" on Unix), so the
# tests do not depend on a Unix-only path.
Embulk.logger = Embulk::Logger.new(File::NULL)

# Root directory of the plugin (the parent of test/).
APP_ROOT = File.expand_path('../', __dir__)
@@ -0,0 +1,64 @@
1
+ require_relative './helper'
2
+ require 'embulk/output/elasticsearch_ruby'
3
+
4
OUTPUT_ELASTICSEARCH = Embulk::Output::Elasticsearch

module Embulk
  class Output::Elasticsearch
    # Tests for Elasticsearch.transaction: accepted and rejected `mode` values.
    class TestTransaction < Test::Unit::TestCase
      # Smallest configuration the plugin accepts.
      def least_config
        DataSource.new({
          'nodes' => [{ 'host' => 'localhost', 'port' => 9200 }],
          'index_type' => 'page'
        })
      end

      def schema
        Schema.new([
          Column.new({index: 0, name: 'boolean', type: :boolean}),
          Column.new({index: 1, name: 'long', type: :long}),
          Column.new({index: 2, name: 'double', type: :double}),
          Column.new({index: 3, name: 'string', type: :string}),
          Column.new({index: 4, name: 'timestamp', type: :timestamp}),
          Column.new({index: 5, name: 'json', type: :json}),
        ])
      end

      def processor_count
        1
      end

      # Block handed to transaction; returns an empty list of task reports.
      def control
        Proc.new { |_task| [] }
      end

      # Runs transaction with `config`.
      #
      # @return [Array] the value returned by transaction and the task it yielded
      def run_transaction(config)
        yielded_task = nil
        result = OUTPUT_ELASTICSEARCH.transaction(config, schema, processor_count) do |task|
          yielded_task = task
          []
        end
        [result, yielded_task]
      end

      def setup
        stub(OUTPUT_ELASTICSEARCH).transaction_report { {} }
      end

      sub_test_case "normal" do
        def test_minimum
          result, task = run_transaction(least_config)
          assert_equal({}, result)
          assert_equal('normal', task['mode'])
        end

        def test_mode
          %w[update replace].each do |mode|
            result, task = run_transaction(least_config.merge('mode' => mode))
            assert_equal({}, result)
            assert_equal(mode, task['mode'])
          end
        end
      end

      sub_test_case "error" do
        def test_mode
          config = least_config.merge('mode' => 'hoge')
          assert_raise ConfigError do
            OUTPUT_ELASTICSEARCH.transaction(config, schema, processor_count, &control)
          end
        end
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,153 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-output-elasticsearch_ruby
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-11-16 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: elasticsearch
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ name: excon
34
+ prerelease: false
35
+ type: :runtime
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - "~>"
45
+ - !ruby/object:Gem::Version
46
+ version: '1.0'
47
+ name: bundler
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - "~>"
53
+ - !ruby/object:Gem::Version
54
+ version: '1.0'
55
+ - !ruby/object:Gem::Dependency
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 0.8.15
61
+ name: embulk
62
+ prerelease: false
63
+ type: :development
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 0.8.15
69
+ - !ruby/object:Gem::Dependency
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '10.0'
75
+ name: rake
76
+ prerelease: false
77
+ type: :development
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
83
+ - !ruby/object:Gem::Dependency
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ name: test-unit
90
+ prerelease: false
91
+ type: :development
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ requirement: !ruby/object:Gem::Requirement
99
+ requirements:
100
+ - - ">="
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ name: test-unit-rr
104
+ prerelease: false
105
+ type: :development
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
111
+ description: Dumps records to Elasticsearch Ruby. Elasticsearch 1.X AND 2.X AND 5.X compatible.
112
+ email:
113
+ - toyama0919@gmail.com
114
+ executables: []
115
+ extensions: []
116
+ extra_rdoc_files: []
117
+ files:
118
+ - ".gitignore"
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - embulk-output-elasticsearch_ruby.gemspec
124
+ - lib/embulk/output/elasticsearch_ruby.rb
125
+ - test/helper.rb
126
+ - test/test_transaction.rb
127
+ homepage: https://github.com/toyama0919/embulk-output-elasticsearch_ruby
128
+ licenses:
129
+ - MIT
130
+ metadata: {}
131
+ post_install_message:
132
+ rdoc_options: []
133
+ require_paths:
134
+ - lib
135
+ required_ruby_version: !ruby/object:Gem::Requirement
136
+ requirements:
137
+ - - ">="
138
+ - !ruby/object:Gem::Version
139
+ version: '0'
140
+ required_rubygems_version: !ruby/object:Gem::Requirement
141
+ requirements:
142
+ - - ">="
143
+ - !ruby/object:Gem::Version
144
+ version: '0'
145
+ requirements: []
146
+ rubyforge_project:
147
+ rubygems_version: 2.6.6
148
+ signing_key:
149
+ specification_version: 4
150
+ summary: Elasticsearch Ruby output plugin for Embulk. Elasticsearch 1.X AND 2.X AND 5.X compatible.
151
+ test_files:
152
+ - test/helper.rb
153
+ - test/test_transaction.rb