embulk-input-elasticsearch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9e3e0bcff8c7de3a07cab81b1689f6f2f6b5dfe5
4
+ data.tar.gz: a2997eb7f98cdea800403211b75342a0ff360483
5
+ SHA512:
6
+ metadata.gz: bb618f0f0c4af1cdce787ce20cc38b009a4b48eba7e678f0f198c0e0195038242d8609ef42e1744077b0a51e99a8c3d62662c42e9482c4e33802d7b7ac8adeab
7
+ data.tar.gz: 8b43fc2e6c9afa87bb0851e2a813c9a3633104df341db9b73c74b472e3c701ce1e04e5e287f11d94b7ea2b1e11a6a0b7e2ea9c25bc70978b627765bf87ba2abf
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /vendor/
6
+ /Gemfile.lock
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.0.5.0
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
# All dependencies are declared in the gemspec; this file only names the
# gem source and delegates to it.
source "https://rubygems.org/"
gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # Elasticsearch input plugin for Embulk
2
+
3
+ ## Overview
4
+
5
+ * **Plugin type**: input
6
+ * **Resume supported**: yes
7
+ * **Cleanup supported**: yes
8
+ * **Guess supported**: no
9
+
10
+ ## Configuration
11
+ - **nodes**: nodes (array, required)
12
+ - **host**: host (string, required, default: ``)
13
+ - **port**: port (integer, required, default: ``)
14
+ - **queries**: query strings, each searched in turn (array, required)
15
+ - **index**: index (string, required, default: ``)
16
+ - **index_type**: index_type (string, default: ``)
17
+ - **request_timeout**: request_timeout (integer, default: `60`)
18
+ - **per_size**: number of documents fetched per search request (integer, default: `1000`)
19
+ - **limit_size**: maximum number of documents to read per query (integer, default: unlimited)
20
+ - **fields**: fields (array, required)
21
+ - **name**: name (string, required, default: ``)
22
+ - **type**: type (string, required, default: ``)
23
+ - **metadata**: read the value from the hit itself (e.g. `_id`, `_score`) instead of `_source` (boolean, default: `false`)
24
+ - **time_format**: time_format (string, required, default: ``)
25
+
26
+ ## Example
27
+
28
+ ```yaml
29
+ in:
30
+ type: elasticsearch
31
+ nodes:
32
+ - {host: localhost, port: 9200}
33
+ queries:
34
+ - 'page_type: HP'
35
+ - 'page_type: GP'
36
+ index: crawl
37
+ index_type: m_corporation_page
38
+ request_timeout: 60
39
+ per_size: 1000
40
+ limit_size: 200000
41
+ fields:
42
+ - { name: _id, type: string, metadata: true }
43
+ - { name: _type, type: string, metadata: true }
44
+ - { name: _index, type: string, metadata: true }
45
+ - { name: _score, type: double, metadata: true }
46
+ - { name: page_type, type: string }
47
+ - { name: corp_name, type: string }
48
+ - { name: corp_key, type: string }
49
+ - { name: title, type: string }
50
+ - { name: body, type: string }
51
+ - { name: url, type: string }
52
+ - { name: employee_range, type: long }
53
+ - { name: m_corporation_id, type: long }
54
+ - { name: cg_lv1, type: json }
55
+ - { name: cg_lv2, type: json }
56
+ - { name: cg_lv3, type: json }
57
+ ```
58
+
59
+ ## Supported types
60
+ * string
61
+ * long
62
+ * double
63
+ * timestamp
64
+ * json
65
+ * boolean
66
+
67
+ ## Build
68
+
69
+ ```
70
+ $ rake
71
+ ```
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
require "bundler/gem_tasks"

# Running `rake` without arguments builds the gem.
task :default => :build

desc "Run tests"
task(:test) do
  # Loaded lazily so that building the gem does not need test-unit.
  require "test-unit"

  # Arguments are passed through unchanged: presumably "force standalone"
  # and the directory to collect tests from (TODO confirm against test-unit).
  Test::Unit::AutoRunner.run(true, './')
end
@@ -0,0 +1,22 @@
1
+
2
# Gem specification for the Elasticsearch input plugin.
Gem::Specification.new do |spec|
  spec.name          = "embulk-input-elasticsearch"
  spec.version       = "0.1.0"
  spec.authors       = ["toyama0919"]
  spec.summary       = "Elasticsearch input plugin for Embulk"
  spec.description   = "Loads records from Elasticsearch."
  spec.email         = ["toyama0919@gmail.com"]
  spec.licenses      = ["MIT"]
  spec.homepage      = "https://github.com/toyama0919/embulk-input-elasticsearch"

  # Everything tracked by git, plus any bundled jars.
  spec.files         = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
  spec.test_files    = spec.files.grep(%r{^(test|spec)/})
  spec.require_paths = ["lib"]

  # The plugin instantiates ::Elasticsearch::Transport::Transport::HTTP::Faraday
  # directly. NOTE(review): the 8.x client moved its transport to the
  # elastic-transport gem (Elastic::Transport namespace), so an unbounded
  # dependency would resolve to a client this code cannot load - confirm the
  # upper bound against the client's release notes.
  spec.add_dependency 'elasticsearch', ['< 8']
  spec.add_dependency 'excon'
  spec.add_development_dependency 'embulk', ['>= 0.8.9']
  spec.add_development_dependency 'bundler', ['>= 1.10.6']
  spec.add_development_dependency 'rake', ['>= 10.0']
  spec.add_development_dependency 'test-unit'
end
@@ -0,0 +1,172 @@
1
require 'time'

require 'excon'
require 'elasticsearch'
3
+
4
+ module Embulk
5
+ module Input
6
+
7
+ class Elasticsearch < InputPlugin
8
+ Plugin.register_input("elasticsearch", self)
9
+
10
# Builds the task hash and the output schema from the plugin configuration,
# then hands both to +resume+.
#
# @param config [#param] configuration accessor for the plugin's `in:` section
# @param control [Proc] block forwarded to +resume+
# @return [Object] whatever +resume+ returns (the next config diff)
def self.transaction(config, &control)
  task = {
    "nodes" => config.param("nodes", :array),
    "request_timeout" => config.param("request_timeout", :integer, default: 60),
    "index" => config.param("index", :string),
    "reload_connections" => config.param("reload_connections", :bool, default: true),
    "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
    "index_type" => config.param("index_type", :string, default: nil),
    "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
    "per_size" => config.param("per_size", :integer, default: 1000),
    "limit_size" => config.param("limit_size", :integer, default: nil),
    # "fields" has no default: the schema below iterates over it, so a nil
    # default only turned a missing setting into a NoMethodError on nil.
    # Without a default, reporting the missing key is left to config.param.
    "fields" => config.param("fields", :array),
    "queries" => config.param("queries", :array),
    "sort" => config.param("sort", :hash, default: nil)
  }

  # One output column per configured field, in configuration order.
  columns = task['fields'].each_with_index.map do |field, i|
    Column.new(i, field['name'], field['type'].to_sym)
  end

  # A single task reads all queries.
  resume(task, columns, 1, &control)
end
33
+
34
# Runs the input through the given block and returns the next config diff.
#
# @param task [Hash] task definition built by +transaction+
# @param columns [Array] output schema
# @param count [Integer] number of tasks to run
# @yield [task, columns, count] executes the tasks
# @return [Hash] always empty: nothing is carried over to the next run
def self.resume(task, columns, count, &control)
  # The value returned by the block is not used, so it is not kept.
  yield(task, columns, count)

  {}
end
40
+
41
# Creates an Elasticsearch client backed by a Faraday HTTP transport that is
# configured from the task settings.
#
# @param task [Hash] reads 'nodes', 'reload_connections', 'reload_on_failure',
#   'retry_on_failure' and 'request_timeout'
# @return [::Elasticsearch::Client]
def self.create_client(task)
  # Node entries are keyed by strings; the transport is given symbol keys.
  hosts = task['nodes'].map do |node|
    Hash[node.map { |key, value| [key.to_sym, value] }]
  end

  transport_settings = {
    reload_connections: task['reload_connections'],
    reload_on_failure: task['reload_on_failure'],
    retry_on_failure: task['retry_on_failure'],
    transport_options: { request: { timeout: task['request_timeout'] } }
  }

  transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
    { hosts: hosts, options: transport_settings }
  )

  ::Elasticsearch::Client.new(transport: transport)
end
58
+
59
# Copies the task settings into instance variables and builds the client.
# `task` is not defined in this file; presumably it is provided by the
# plugin base class (TODO confirm).
def init
  @client = self.class.create_client(task)
  @index = task['index']
  @index_type = task['index_type']
  @queries = task['queries']
  @per_size = task['per_size']
  @limit_size = task['limit_size']
  @fields = task['fields']
  @sort = task['sort']
  # The task defines no routing option, yet the search calls pass @routing
  # along. Set it explicitly so it is a deliberate nil rather than an
  # instance variable that was never assigned.
  @routing = nil
end
69
+
70
# Reads every configured query page by page and adds each record to the
# page builder.
#
# @return [Hash] the task report (always empty)
def run
  @queries.each do |query|
    # A size-0 search is issued first, only to learn how many documents match.
    count_results = search(@index_type, query, 0, 0, @routing, @fields, @sort)
    matched = count_results['hits']['total']
    # Elasticsearch 7+ reports the total as { "value" => n, "relation" => ... }
    # instead of a bare integer; unwrap it so it can be compared with the limit.
    matched = matched['value'] if matched.is_a?(Hash)
    # limit_size is optional; compact drops it when it is nil.
    total_count = [matched, @limit_size].compact.min

    query_count = 0
    loop do
      now_results_size = query_count * @per_size
      next_results_size = (query_count + 1) * @per_size
      size = get_size(next_results_size, now_results_size, total_count)
      # Nothing left to fetch (no matches, or the previous page ended exactly
      # on the total).
      break if size == 0

      response = search(@index_type, query, size, now_results_size, @routing, @fields, @sort)
      get_sources(response, @fields).each do |record|
        page_builder.add(record)
      end
      break if last_query?(next_results_size, total_count)
      query_count += 1
    end
  end
  page_builder.finish

  {}
end
94
+
95
+ private
96
+
97
# Converts a raw value taken from a search hit to the configured column type.
#
# @param value [Object, nil] raw value; nil is passed through untouched
# @param field [Hash] field definition; only field["type"] is read
# @return [Object, nil] the converted value; for "boolean", nil when the
#   value is not one of true/false/1/0 (case-insensitive)
# @raise [RuntimeError] when field["type"] is not a supported type
def convert_value(value, field)
  return nil if value.nil?

  case field["type"]
  when "string", "json"
    value
  when "long"
    value.to_i
  when "double"
    value.to_f
  when "boolean"
    return value if value == true || value == false

    # to_s first: a numeric 1/0 has no #downcase and used to raise
    # NoMethodError here instead of being converted.
    case value.to_s.downcase
    when 'true', '1' then true
    when 'false', '0' then false
    end
  when "timestamp"
    # Time.parse comes from the 'time' standard library.
    Time.parse(value)
  else
    raise "Unsupported type #{field['type']}"
  end
end
127
+
128
# Number of documents to request for the next page.
#
# @param next_results_size [Integer] offset just past the page being requested
# @param now_results_size [Integer] offset at which the page starts
# @param total_count [Integer] number of documents to read for the query
# @return [Integer] @per_size for a full page; the remainder on the last page
def get_size(next_results_size, now_results_size, total_count)
  # Every page is full-sized except the last, which asks only for what is left.
  return @per_size unless last_query?(next_results_size, total_count)

  total_count - now_results_size
end
135
+
136
# Tells whether the page ending at +next_results_size+ runs past the result set.
#
# @param next_results_size [Integer] offset just past the page being considered
# @param total_count [Integer] number of documents to read for the query
# @return [Boolean] true when next_results_size is strictly greater than total_count
def last_query?(next_results_size, total_count)
  overshoots = next_results_size > total_count
  overshoots
end
139
+
140
# Builds the search request, logs it and sends it through @client.
#
# @param type [String, nil] document type, passed as the :type option
# @param query [String, nil] query_string query; no query clause when nil
# @param size [Integer, nil] page size; omitted from the body when nil
# @param from [Integer] offset of the first hit
# @param routing [String, nil] routing value; omitted when nil
# @param fields [Array<Hash>] field definitions ('name', optional 'metadata')
# @param sort [Hash, nil] field => order pairs; no sort clause when nil
# @return [Object] whatever @client.search returns
def search(type, query, size, from, routing, fields, sort)
  body = { from: from }
  body[:size] = size unless size.nil?
  # One single-key clause per entry, in the configured order.
  body[:sort] = sort.map { |key, order| { key => order } } if sort
  body[:query] = { query_string: { query: query } } unless query.nil?

  search_option = { index: @index, type: type, body: body }
  search_option[:routing] = routing unless routing.nil?
  # Only non-metadata fields are requested from _source.
  source_names = fields.reject { |field| field['metadata'] }.map { |field| field['name'] }
  search_option[:_source] = source_names.join(',')

  Embulk.logger.info("search_option => #{search_option}")
  @client.search(search_option)
end
157
+
158
# Turns a search response into rows, one per hit, with one converted value
# per entry of +fields+ (in that order).
#
# @param results [Hash] search response; results['hits']['hits'] is read
# @param fields [Array<Hash>] field definitions ('name', 'type', optional 'metadata')
# @return [Array<Array>] converted values, one inner array per hit
def get_sources(results, fields)
  metadata_fields = fields.select { |field| field['metadata'] }

  results['hits']['hits'].map do |hit|
    # Work on a copy so the response is not modified, and start from an empty
    # record when the hit carries no _source at all.
    record = (hit['_source'] || {}).dup
    # Metadata values come from the hit itself and override _source.
    metadata_fields.each { |field| record[field['name']] = hit[field['name']] }
    # Build the row from the +fields+ argument (not @fields) so the schema
    # used for the metadata lookup and for the row is the same one.
    fields.map { |field| convert_value(record[field['name']], field) }
  end
end
170
+ end
171
+ end
172
+ end
@@ -0,0 +1,11 @@
1
+ require "embulk/command/embulk_run"
2
+ require "embulk"
3
+ Embulk.setup
4
+
5
+ require "embulk/input/elasticsearch"
6
module Embulk
  module Input
    # Smoke tests for the Elasticsearch input plugin.
    class ElasticsearchInputPluginTest < Test::Unit::TestCase
      # The test case used to contain no tests at all.
      # NOTE(review): test-unit is believed to report an empty test case as a
      # failure ("No tests were specified") - confirm. This check keeps the
      # suite non-empty and fails if the plugin file does not load or is not
      # an input plugin.
      def test_plugin_is_an_input_plugin
        assert(Embulk::Input::Elasticsearch < Embulk::InputPlugin)
      end
    end
  end
end
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-input-elasticsearch
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-06-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: elasticsearch
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ name: excon
34
+ prerelease: false
35
+ type: :runtime
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.8.9
47
+ name: embulk
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 0.8.9
55
+ - !ruby/object:Gem::Dependency
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.10.6
61
+ name: bundler
62
+ prerelease: false
63
+ type: :development
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 1.10.6
69
+ - !ruby/object:Gem::Dependency
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '10.0'
75
+ name: rake
76
+ prerelease: false
77
+ type: :development
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
83
+ - !ruby/object:Gem::Dependency
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ name: test-unit
90
+ prerelease: false
91
+ type: :development
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Loads records from Elasticsearch.
98
+ email:
99
+ - toyama0919@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".ruby-version"
106
+ - Gemfile
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - embulk-input-elasticsearch.gemspec
111
+ - lib/embulk/input/elasticsearch.rb
112
+ - test/embulk/input/test_elasticsearch.rb
113
+ homepage: https://github.com/toyama0919/embulk-input-elasticsearch
114
+ licenses:
115
+ - MIT
116
+ metadata: {}
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ required_rubygems_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ requirements: []
132
+ rubyforge_project:
133
+ rubygems_version: 2.4.8
134
+ signing_key:
135
+ specification_version: 4
136
+ summary: Elasticsearch input plugin for Embulk
137
+ test_files:
138
+ - test/embulk/input/test_elasticsearch.rb