embulk-input-elasticsearch 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9e3e0bcff8c7de3a07cab81b1689f6f2f6b5dfe5
4
+ data.tar.gz: a2997eb7f98cdea800403211b75342a0ff360483
5
+ SHA512:
6
+ metadata.gz: bb618f0f0c4af1cdce787ce20cc38b009a4b48eba7e678f0f198c0e0195038242d8609ef42e1744077b0a51e99a8c3d62662c42e9482c4e33802d7b7ac8adeab
7
+ data.tar.gz: 8b43fc2e6c9afa87bb0851e2a813c9a3633104df341db9b73c74b472e3c701ce1e04e5e287f11d94b7ea2b1e11a6a0b7e2ea9c25bc70978b627765bf87ba2abf
data/.gitignore ADDED
@@ -0,0 +1,6 @@
1
+ *~
2
+ /pkg/
3
+ /tmp/
4
+ /.bundle/
5
+ /vendor/
6
+ /Gemfile.lock
data/.ruby-version ADDED
@@ -0,0 +1 @@
1
+ jruby-9.0.5.0
data/Gemfile ADDED
@@ -0,0 +1,2 @@
1
+ source 'https://rubygems.org/'
2
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+
2
+ MIT License
3
+
4
+ Permission is hereby granted, free of charge, to any person obtaining
5
+ a copy of this software and associated documentation files (the
6
+ "Software"), to deal in the Software without restriction, including
7
+ without limitation the rights to use, copy, modify, merge, publish,
8
+ distribute, sublicense, and/or sell copies of the Software, and to
9
+ permit persons to whom the Software is furnished to do so, subject to
10
+ the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be
13
+ included in all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
19
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
20
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
21
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,71 @@
1
+ # Elasticsearch input plugin for Embulk
2
+
3
+ ## Overview
4
+
5
+ * **Plugin type**: input
6
+ * **Resume supported**: yes
7
+ * **Cleanup supported**: yes
8
+ * **Guess supported**: no
9
+
10
+ ## Configuration
11
+ - **nodes**: nodes (array, required)
12
+ - **host**: host (string, required, default: ``)
13
+ - **port**: port (integer, required, default: ``)
14
+ - **queries**: list of query strings; each one is run as a separate Elasticsearch `query_string` search (array, required)
15
+ - **index**: index (string, required, default: ``)
16
+ - **index_type**: index_type (string, default: ``)
17
+ - **request_timeout**: request_timeout (integer, default: `60`)
18
+ - **per_size**: number of documents fetched per search request (integer, default: `1000`)
19
+ - **limit_size**: maximum number of documents to read per query (integer, default: unlimited)
20
+ - **fields**: list of output columns (array, required)
21
+ - **name**: name (string, required, default: ``)
22
+ - **type**: type (string, required, default: ``)
23
+ - **metadata**: when true, the value is read from the hit itself (e.g. `_id`, `_score`) instead of from `_source` (boolean, default: `false`)
24
+ - **time_format**: time_format (string, optional)
25
+
26
+ ## Example
27
+
28
+ ```yaml
29
+ in:
30
+ type: elasticsearch
31
+ nodes:
32
+ - {host: localhost, port: 9200}
33
+ queries:
34
+ - 'page_type: HP'
35
+ - 'page_type: GP'
36
+ index: crawl
37
+ index_type: m_corporation_page
38
+ request_timeout: 60
39
+ per_size: 1000
40
+ limit_size: 200000
41
+ fields:
42
+ - { name: _id, type: string, metadata: true }
43
+ - { name: _type, type: string, metadata: true }
44
+ - { name: _index, type: string, metadata: true }
45
+ - { name: _score, type: double, metadata: true }
46
+ - { name: page_type, type: string }
47
+ - { name: corp_name, type: string }
48
+ - { name: corp_key, type: string }
49
+ - { name: title, type: string }
50
+ - { name: body, type: string }
51
+ - { name: url, type: string }
52
+ - { name: employee_range, type: long }
53
+ - { name: m_corporation_id, type: long }
54
+ - { name: cg_lv1, type: json }
55
+ - { name: cg_lv2, type: json }
56
+ - { name: cg_lv3, type: json }
57
+ ```
58
+
59
+ ## Supported Types
60
+ * string
61
+ * long
62
+ * double
63
+ * timestamp
64
+ * json
65
+ * boolean
66
+
67
+ ## Build
68
+
69
+ ```
70
+ $ rake
71
+ ```
data/Rakefile ADDED
@@ -0,0 +1,10 @@
1
require "bundler/gem_tasks"

# Running bare `rake` invokes the :build task
# (presumably the one defined by bundler/gem_tasks -- confirm).
task :default => :build

desc "Run tests"
task :test do
  # Loaded lazily so that tasks other than :test do not need test-unit.
  require "test-unit"

  force_standalone = true
  Test::Unit::AutoRunner.run(force_standalone, './')
end
@@ -0,0 +1,22 @@
1
+
2
# Gem specification for the Embulk Elasticsearch input plugin.
Gem::Specification.new do |gem|
  # --- identity ---
  gem.name        = "embulk-input-elasticsearch"
  gem.version     = "0.1.0"
  gem.summary     = "Elasticsearch input plugin for Embulk"
  gem.description = "Loads records from Elasticsearch."
  gem.homepage    = "https://github.com/toyama0919/embulk-input-elasticsearch"
  gem.licenses    = ["MIT"]
  gem.authors     = ["toyama0919"]
  gem.email       = ["toyama0919@gmail.com"]

  # --- packaged files: everything tracked by git plus any bundled jars ---
  tracked_files     = `git ls-files`.split("\n")
  gem.files         = tracked_files + Dir["classpath/*.jar"]
  gem.test_files    = gem.files.grep(%r{^(test|spec)/})
  gem.require_paths = ["lib"]

  # --- runtime dependencies ---
  gem.add_dependency 'elasticsearch'
  gem.add_dependency 'excon'

  # --- development dependencies ---
  gem.add_development_dependency 'embulk', ['>= 0.8.9']
  gem.add_development_dependency 'bundler', ['>= 1.10.6']
  gem.add_development_dependency 'rake', ['>= 10.0']
  gem.add_development_dependency 'test-unit'
end
@@ -0,0 +1,172 @@
1
require 'time'

require 'excon'
require 'elasticsearch'
3
+
4
module Embulk
  module Input

    # Embulk input plugin that runs one or more `query_string` searches
    # against an Elasticsearch index and emits the hits as records.
    #
    # Results are read page by page with `from`/`size`.
    # NOTE(review): Elasticsearch rejects from + size beyond
    # `index.max_result_window`; very large reads may need the scroll API.
    class Elasticsearch < InputPlugin
      Plugin.register_input("elasticsearch", self)

      # Reads the plugin configuration, builds the task hash and the output
      # schema, and hands both to {resume}.
      #
      # @param config [#param] plugin configuration
      # @yield [task, columns, count] forwarded to {resume}
      # @return [Hash] next config diff (always empty)
      def self.transaction(config, &control)
        task = {
          "nodes" => config.param("nodes", :array),
          "request_timeout" => config.param("request_timeout", :integer, default: 60),
          "index" => config.param("index", :string),
          "reload_connections" => config.param("reload_connections", :bool, default: true),
          "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
          "index_type" => config.param("index_type", :string, default: nil),
          "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
          "per_size" => config.param("per_size", :integer, default: 1000),
          "limit_size" => config.param("limit_size", :integer, default: nil),
          # Required: the schema below is derived from it, so a nil default
          # could only ever end in a NoMethodError.
          "fields" => config.param("fields", :array),
          "queries" => config.param("queries", :array),
          "sort" => config.param("sort", :hash, default: nil),
          # Optional routing value passed through to every search request.
          "routing" => config.param("routing", :string, default: nil)
        }

        columns = task['fields'].each_with_index.map do |field, i|
          Column.new(i, field['name'], field['type'].to_sym)
        end

        resume(task, columns, 1, &control)
      end

      # Runs the input with the given task and schema.
      #
      # @param task [Hash] task hash built by {transaction}
      # @param columns [Array<Column>] output schema
      # @param count [Integer] number of tasks to run
      # @yield [task, columns, count] executes the tasks
      # @return [Hash] next config diff (always empty)
      def self.resume(task, columns, count, &control)
        yield(task, columns, count)

        {}
      end

      # Builds an Elasticsearch client on a Faraday transport from the
      # connection settings in the task.
      #
      # @param task [Hash] task hash built by {transaction}
      # @return [::Elasticsearch::Client]
      def self.create_client(task)
        transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
          {
            # The transport is given symbol keys (:host, :port).
            hosts: task['nodes'].map { |node| Hash[node.map { |k, v| [k.to_sym, v] }] },
            options: {
              reload_connections: task['reload_connections'],
              reload_on_failure: task['reload_on_failure'],
              retry_on_failure: task['retry_on_failure'],
              transport_options: {
                request: { timeout: task['request_timeout'] }
              }
            }
          }
        )

        ::Elasticsearch::Client.new transport: transport
      end

      # Copies the task settings into instance state and creates the client.
      def init
        @client = self.class.create_client(task)
        @index = task['index']
        @index_type = task['index_type']
        @queries = task['queries']
        @per_size = task['per_size']
        @limit_size = task['limit_size']
        @fields = task['fields']
        @sort = task['sort']
        # Was read in #run but never assigned; nil when not configured.
        @routing = task['routing']
      end

      # Executes every configured query, paging through the hits and adding
      # one record per hit to the page builder.
      #
      # @return [Hash] task report (always empty)
      def run
        @queries.each do |query|
          # A size-0 search is issued only to learn how many documents match.
          count_only = search(@index_type, query, 0, 0, @routing, @fields, @sort)
          total_count = [total_hits(count_only), @limit_size].compact.min

          query_count = 0
          loop do
            now_results_size = query_count * @per_size
            next_results_size = (query_count + 1) * @per_size
            size = get_size(next_results_size, now_results_size, total_count)
            break if size == 0

            response = search(@index_type, query, size, now_results_size, @routing, @fields, @sort)
            get_sources(response, @fields).each { |record| page_builder.add(record) }

            break if last_query?(next_results_size, total_count)
            query_count += 1
          end
        end
        page_builder.finish

        {}
      end

      private

      # Extracts the total hit count from a search response.
      #
      # @param results [Hash] search response
      # @return [Integer, nil]
      def total_hits(results)
        total = results['hits']['total']
        # Newer Elasticsearch versions report {"value" => n, "relation" => ...}
        # instead of a bare integer.
        total.is_a?(Hash) ? total['value'] : total
      end

      # Converts a raw Elasticsearch value to the type declared for the field.
      #
      # @param value [Object, nil] raw value from the hit
      # @param field [Hash] field definition ("type", optional "time_format")
      # @return [Object, nil] converted value; nil stays nil
      # @raise [RuntimeError] when the field type is not supported
      def convert_value(value, field)
        return nil if value.nil?

        case field["type"]
        when "string", "json"
          value
        when "long"
          value.to_i
        when "double"
          value.to_f
        when "boolean"
          convert_boolean(value)
        when "timestamp"
          convert_timestamp(value, field["time_format"])
        else
          raise "Unsupported type #{field['type']}"
        end
      end

      # Maps true/false, "true"/"false" and 1/0 (any letter case, String or
      # not) to a boolean. Anything else becomes nil.
      def convert_boolean(value)
        # to_s first so that Integer 1/0 no longer raises on #downcase.
        case value.to_s.downcase
        when 'true', '1' then true
        when 'false', '0' then false
        end
      end

      # Parses a timestamp with the field's strptime-style time_format when
      # one is configured, otherwise with Time.parse.
      def convert_timestamp(value, time_format)
        return value if value.is_a?(Time)

        time_format ? Time.strptime(value, time_format) : Time.parse(value)
      end

      # Number of documents to request for the current page.
      #
      # @return [Integer] the remainder on the last page, otherwise @per_size
      def get_size(next_results_size, now_results_size, total_count)
        if last_query?(next_results_size, total_count)
          total_count - now_results_size
        else
          @per_size
        end
      end

      # @return [Boolean] true when the current page reaches past total_count
      def last_query?(next_results_size, total_count)
        next_results_size > total_count
      end

      # Issues one search request.
      #
      # @param type [String, nil] document type; omitted when nil
      # @param query [String, nil] query_string query; omitted when nil
      # @param size [Integer, nil] page size; omitted when nil
      # @param from [Integer] offset of the first hit
      # @param routing [String, nil] routing value; omitted when nil
      # @param fields [Array<Hash>] field definitions; non-metadata names are
      #   requested via _source
      # @param sort [Hash, nil] field => order pairs
      # @return [Hash] search response
      def search(type, query, size, from, routing, fields, sort)
        body = { from: from }
        body[:size] = size unless size.nil?
        body[:sort] = sort.map { |k, v| { k => v } } if sort
        body[:query] = { query_string: { query: query } } unless query.nil?

        search_option = { index: @index, body: body }
        search_option[:type] = type unless type.nil?
        search_option[:routing] = routing unless routing.nil?
        search_option[:_source] = fields.reject { |field| field['metadata'] }.map { |field| field['name'] }.join(',')
        Embulk.logger.info(%Q{search_option => #{search_option}})
        @client.search(search_option)
      end

      # Turns the hits of a search response into records, one array per hit,
      # with values in the order of `fields`.
      #
      # @param results [Hash] search response
      # @param fields [Array<Hash>] field definitions
      # @return [Array<Array>] converted records
      def get_sources(results, fields)
        metadata_names = fields.select { |field| field['metadata'] }.map { |field| field['name'] }

        results['hits']['hits'].map do |hit|
          # A hit may carry no _source; copy so the response is not mutated.
          record = (hit['_source'] || {}).dup
          metadata_names.each { |name| record[name] = hit[name] }
          fields.map { |field| convert_value(record[field['name']], field) }
        end
      end
    end
  end
end
@@ -0,0 +1,11 @@
1
+ require "embulk/command/embulk_run"
2
+ require "embulk"
3
+ Embulk.setup
4
+
5
+ require "embulk/input/elasticsearch"
6
module Embulk
  module Input
    # Placeholder test case for the Elasticsearch input plugin; it defines
    # no test methods yet.
    class ElasticsearchInputPluginTest < Test::Unit::TestCase; end
  end
end
metadata ADDED
@@ -0,0 +1,138 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: embulk-input-elasticsearch
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - toyama0919
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2016-06-13 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: elasticsearch
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ name: excon
34
+ prerelease: false
35
+ type: :runtime
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ - !ruby/object:Gem::Dependency
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: 0.8.9
47
+ name: embulk
48
+ prerelease: false
49
+ type: :development
50
+ version_requirements: !ruby/object:Gem::Requirement
51
+ requirements:
52
+ - - ">="
53
+ - !ruby/object:Gem::Version
54
+ version: 0.8.9
55
+ - !ruby/object:Gem::Dependency
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: 1.10.6
61
+ name: bundler
62
+ prerelease: false
63
+ type: :development
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: 1.10.6
69
+ - !ruby/object:Gem::Dependency
70
+ requirement: !ruby/object:Gem::Requirement
71
+ requirements:
72
+ - - ">="
73
+ - !ruby/object:Gem::Version
74
+ version: '10.0'
75
+ name: rake
76
+ prerelease: false
77
+ type: :development
78
+ version_requirements: !ruby/object:Gem::Requirement
79
+ requirements:
80
+ - - ">="
81
+ - !ruby/object:Gem::Version
82
+ version: '10.0'
83
+ - !ruby/object:Gem::Dependency
84
+ requirement: !ruby/object:Gem::Requirement
85
+ requirements:
86
+ - - ">="
87
+ - !ruby/object:Gem::Version
88
+ version: '0'
89
+ name: test-unit
90
+ prerelease: false
91
+ type: :development
92
+ version_requirements: !ruby/object:Gem::Requirement
93
+ requirements:
94
+ - - ">="
95
+ - !ruby/object:Gem::Version
96
+ version: '0'
97
+ description: Loads records from Elasticsearch.
98
+ email:
99
+ - toyama0919@gmail.com
100
+ executables: []
101
+ extensions: []
102
+ extra_rdoc_files: []
103
+ files:
104
+ - ".gitignore"
105
+ - ".ruby-version"
106
+ - Gemfile
107
+ - LICENSE.txt
108
+ - README.md
109
+ - Rakefile
110
+ - embulk-input-elasticsearch.gemspec
111
+ - lib/embulk/input/elasticsearch.rb
112
+ - test/embulk/input/test_elasticsearch.rb
113
+ homepage: https://github.com/toyama0919/embulk-input-elasticsearch
114
+ licenses:
115
+ - MIT
116
+ metadata: {}
117
+ post_install_message:
118
+ rdoc_options: []
119
+ require_paths:
120
+ - lib
121
+ required_ruby_version: !ruby/object:Gem::Requirement
122
+ requirements:
123
+ - - ">="
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
126
+ required_rubygems_version: !ruby/object:Gem::Requirement
127
+ requirements:
128
+ - - ">="
129
+ - !ruby/object:Gem::Version
130
+ version: '0'
131
+ requirements: []
132
+ rubyforge_project:
133
+ rubygems_version: 2.4.8
134
+ signing_key:
135
+ specification_version: 4
136
+ summary: Elasticsearch input plugin for Embulk
137
+ test_files:
138
+ - test/embulk/input/test_elasticsearch.rb