embulk-input-elasticsearch-nosslverify 0.3.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +7 -0
- data/.ruby-version +1 -0
- data/.travis.yml +17 -0
- data/Gemfile +2 -0
- data/LICENSE.txt +21 -0
- data/README.md +96 -0
- data/Rakefile +11 -0
- data/embulk-input-elasticsearch.gemspec +22 -0
- data/lib/embulk/input/elasticsearch.rb +75 -0
- data/lib/embulk/input/elasticsearch/connection.rb +101 -0
- data/lib/embulk/input/elasticsearch/converter.rb +49 -0
- data/lib/embulk/input/elasticsearch/error.rb +39 -0
- data/lib/embulk/input/elasticsearch/input_thread.rb +19 -0
- data/test/helper.rb +19 -0
- data/test/test_converter.rb +51 -0
- data/test/test_input_thread.rb +36 -0
- data/test/test_transaction.rb +50 -0
- metadata +148 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: cce837140ec443edf4892a250708a8c1c1961d08fc8fb61c4137ff06198c67fd
|
4
|
+
data.tar.gz: 23463d69c4349aa8bd528d035c05eec0a3643c0b5f6f271949c091166a981821
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 2bdd8090f15fb4fee727d041ab410173939080fe73f1d3af6bf3e4edf9aaf617b90677c246c8b9eab69c42c9c81f3ae69bfe3e3a7fc0a3218390e695b600fcf7
|
7
|
+
data.tar.gz: 2ced32de36e5830f1ec0468fccad5f60a507b2033a231b4a3bc331d071632da236b86b625d756ef37d8151af4e44afbb8539ae3c6e1ab55ff22af30ef9097f29
|
data/.gitignore
ADDED
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
jruby-9.1.5.0
|
data/.travis.yml
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
---
# Travis CI configuration: runs the test-unit suite through the Embulk
# executable jar, which bundles its own JRuby.
language: ruby
install: ./embulk.jar bundle install --path vendor/bundle
matrix:
  include:
    - env: EMBULK_VERSION=0.9.15
      rvm: jruby-9.1.5.0  # bundled jruby version
      jdk: openjdk8  # embulk 0.9.x uses jdk8
    - env: EMBULK_VERSION=latest
      rvm: jruby-9.1.5.0  # NOTE(review): copied from the pinned build; confirm for "latest"
      jdk: openjdk8  # NOTE(review): copied from the pinned build; confirm for "latest"
  # The "latest" build is informational only and may fail without failing CI.
  allow_failures:
    - env: EMBULK_VERSION=latest
before_install:
  - curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-${EMBULK_VERSION}.jar"
  - chmod +x embulk.jar
  - ./embulk.jar gem install bundler
script: ./embulk.jar bundle exec rake test
|
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
|
2
|
+
MIT License
|
3
|
+
|
4
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
5
|
+
a copy of this software and associated documentation files (the
|
6
|
+
"Software"), to deal in the Software without restriction, including
|
7
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
8
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
9
|
+
permit persons to whom the Software is furnished to do so, subject to
|
10
|
+
the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be
|
13
|
+
included in all copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
16
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
17
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
18
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
19
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
20
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
21
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,96 @@
|
|
1
|
+
# Elasticsearch input plugin for Embulk [](http://travis-ci.org/toyama0919/embulk-input-elasticsearch) [](http://badge.fury.io/rb/embulk-input-elasticsearch)
|
2
|
+
|
3
|
+
## Overview
|
4
|
+
|
5
|
+
* **Plugin type**: input
|
6
|
+
* **Resume supported**: yes
|
7
|
+
* **Cleanup supported**: yes
|
8
|
+
* **Guess supported**: no
|
9
|
+
|
10
|
+
## Configuration
|
11
|
+
- **nodes**: nodes (array, required)
|
12
|
+
- **host**: host (string, required)
|
13
|
+
- **port**: port (integer, required)
|
14
|
+
- **queries**: lucene query array. (array, required)
|
15
|
+
- **index**: index (string, required)
|
16
|
+
- **index_type**: index_type (string)
|
17
|
+
- **request_timeout**: request timeout (integer)
|
18
|
+
- **per_size**: number of records requested per search/scroll call. (integer, default: `1000`)
|
19
|
+
- **limit_size**: maximum number of records to read per query. (integer, default: unlimited)
|
20
|
+
- **num_threads**: number of threads for queries. (integer, default: 1)
|
21
|
+
- **retry_on_failure**: number of retries on failure. Set to 0 to retry forever. (integer, default: 5)
|
22
|
+
- **sort**: sort order. (hash, default: nil)
|
23
|
+
- **scroll**: scroll. to keep the search context. (string, default: '1m')
|
24
|
+
- **fields**: fields (array, required)
|
25
|
+
- **name**: name (string, required)
|
26
|
+
- **type**: type (string, required)
|
27
|
+
- **metadata**: metadata (boolean, default: false)
|
28
|
+
- **time_format**: time_format (string)
|
29
|
+
|
30
|
+
## Example
|
31
|
+
|
32
|
+
```yaml
|
33
|
+
in:
|
34
|
+
type: elasticsearch
|
35
|
+
nodes:
|
36
|
+
- {host: localhost, port: 9200}
|
37
|
+
queries:
|
38
|
+
- 'page_type: HP'
|
39
|
+
- 'page_type: GP'
|
40
|
+
index: crawl
|
41
|
+
index_type: m_corporation_page
|
42
|
+
request_timeout: 60
|
43
|
+
per_size: 1000
|
44
|
+
limit_size: 200000
|
45
|
+
num_threads: 2
|
46
|
+
sort:
|
47
|
+
m_corporation_id: desc
|
48
|
+
employee_range: asc
|
49
|
+
fields:
|
50
|
+
- { name: _id, type: string, metadata: true }
|
51
|
+
- { name: _type, type: string, metadata: true }
|
52
|
+
- { name: _index, type: string, metadata: true }
|
53
|
+
- { name: _score, type: double, metadata: true }
|
54
|
+
- { name: page_type, type: string }
|
55
|
+
- { name: corp_name, type: string }
|
56
|
+
- { name: corp_key, type: string }
|
57
|
+
- { name: title, type: string }
|
58
|
+
- { name: body, type: string }
|
59
|
+
- { name: url, type: string }
|
60
|
+
- { name: employee_range, type: long }
|
61
|
+
- { name: m_corporation_id, type: long }
|
62
|
+
- { name: cg_lv1, type: json }
|
63
|
+
- { name: cg_lv2, type: json }
|
64
|
+
- { name: cg_lv3, type: json }
|
65
|
+
```
|
66
|
+
|
67
|
+
## Support Type
|
68
|
+
* string
|
69
|
+
* long
|
70
|
+
* double
|
71
|
+
* timestamp
|
72
|
+
* json
|
73
|
+
* boolean
|
74
|
+
|
75
|
+
## test
|
76
|
+
|
77
|
+
### setup
|
78
|
+
|
79
|
+
```
|
80
|
+
curl -o embulk.jar --create-dirs -L "http://dl.embulk.org/embulk-latest.jar"
|
81
|
+
chmod +x embulk.jar
|
82
|
+
./embulk.jar gem install bundler
|
83
|
+
./embulk.jar bundle install --path vendor/bundle
|
84
|
+
```
|
85
|
+
|
86
|
+
### run test
|
87
|
+
|
88
|
+
```
|
89
|
+
./embulk.jar bundle exec rake test
|
90
|
+
```
|
91
|
+
|
92
|
+
## Build
|
93
|
+
|
94
|
+
```
|
95
|
+
$ rake
|
96
|
+
```
|
data/Rakefile
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
require "bundler/gem_tasks"
require 'rake/testtask'

# `rake test` runs every test/**/test_*.rb file, in sorted order, with the
# "test" directory added to the load path.
desc 'Run test_unit based test'
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push("test")
  test_task.test_files = Dir.glob("test/**/test_*.rb").sort
  test_task.verbose = true
  test_task.warning = false
end

# A bare `rake` runs the test task.
task default: :test
|
@@ -0,0 +1,22 @@
|
|
1
|
+
|
2
|
+
# Gem specification for the Elasticsearch input plugin for Embulk
# (the "-nosslverify" packaging).
Gem::Specification.new do |spec|
  spec.name          = "embulk-input-elasticsearch-nosslverify"
  spec.version       = "0.3.6"
  spec.authors       = ["toyama0919"]
  spec.summary       = "Elasticsearch input plugin for Embulk"
  spec.description   = "Loads records from Elasticsearch. parallel query support."
  spec.email         = ["toyama0919@gmail.com"]
  spec.licenses      = ["MIT"]
  spec.homepage      = "https://github.com/toyama0919/embulk-input-elasticsearch"

  # Package everything tracked by git plus any bundled jars.
  spec.files         = `git ls-files`.split("\n") + Dir["classpath/*.jar"]
  spec.test_files    = spec.files.grep(%r{^(test|spec)/})
  spec.require_paths = ["lib"]

  # lib/embulk/input/elasticsearch/connection.rb instantiates
  # ::Elasticsearch::Transport::Transport::HTTP::Faraday directly. The 8.x
  # client moved its transport layer to the elastic-transport gem
  # (Elastic::Transport), so an unconstrained dependency can resolve to a
  # version this plugin cannot load.
  spec.add_dependency 'elasticsearch', ['< 8']
  spec.add_dependency 'excon'
  spec.add_development_dependency 'embulk', ['>= 0.8.18']
  spec.add_development_dependency 'bundler', ['>= 1.10.6']
  spec.add_development_dependency 'rake', ['>= 10.0']
  spec.add_development_dependency 'test-unit'
end
|
@@ -0,0 +1,75 @@
|
|
1
|
+
require_relative 'elasticsearch/connection'
|
2
|
+
require_relative 'elasticsearch/input_thread'
|
3
|
+
require_relative 'elasticsearch/converter'
|
4
|
+
require_relative 'elasticsearch/error'
|
5
|
+
|
6
|
+
module Embulk
  module Input

    # Embulk input plugin that runs a list of query strings against
    # Elasticsearch and emits one record per hit. The queries are split into
    # slices and each slice is handled by one Embulk task.
    class Elasticsearch < InputPlugin
      Plugin.register_input("elasticsearch", self)

      # Name of the extra column appended when `add_query_to_record` is true.
      ADD_QUERY_TO_RECORD_KEY = 'query'

      # Reads the plugin configuration, builds the task hash and the output
      # schema, then hands over to +resume+.
      #
      # `nodes`, `index`, `queries` and `fields` are required; `fields` is
      # required because the schema below is derived from it (it previously
      # defaulted to nil, which failed with NoMethodError on the first use).
      #
      # @param config [DataSource] plugin configuration
      # @param control [Proc] Embulk callback that runs the tasks
      # @return [Hash] the config diff returned by +resume+
      def self.transaction(config, &control)
        task = {
          "nodes" => config.param("nodes", :array),
          "request_timeout" => config.param("request_timeout", :integer, default: 60),
          "index" => config.param("index", :string),
          "reload_connections" => config.param("reload_connections", :bool, default: true),
          "reload_on_failure" => config.param("reload_on_failure", :bool, default: false),
          "index_type" => config.param("index_type", :string, default: nil),
          "retry_on_failure" => config.param("retry_on_failure", :integer, default: 5),
          "per_size" => config.param("per_size", :integer, default: 1000),
          "limit_size" => config.param("limit_size", :integer, default: nil),
          "fields" => config.param("fields", :array),
          "queries" => config.param("queries", :array),
          "sort" => config.param("sort", :hash, default: nil),
          "add_query_to_record" => config.param("add_query_to_record", :bool, default: false),
          "scroll" => config.param("scroll", :string, default: '1m')
        }
        # TODO: want max_threads
        define_num_threads = config.param("num_threads", :integer, default: 1)
        task['slice_queries'] = InputThread.get_slice_from_num_threads(task['queries'], define_num_threads)

        # One output column per configured field, in configuration order.
        columns = task['fields'].each_with_index.map { |field, i|
          Column.new(i, field['name'], field['type'].to_sym)
        }
        if task['add_query_to_record']
          # The originating query string is appended as the last column.
          columns << Column.new(task['fields'].size, ADD_QUERY_TO_RECORD_KEY, :string)
        end

        # One task per slice of queries.
        resume(task, columns, task['slice_queries'].size, &control)
      end

      # Runs the tasks through the Embulk callback.
      #
      # @return [Hash] always an empty config diff
      def self.resume(task, columns, count, &control)
        # The task reports returned by the callback are not used.
        yield(task, columns, count)

        next_config_diff = {}
        return next_config_diff
      end

      # Per-task setup: picks this task's slice of queries (by task index)
      # and opens the connection.
      def init
        @queries = task['slice_queries'][@index]
        Embulk.logger.info("this thread queries => #{@queries}")
        @add_query_to_record = task['add_query_to_record']
        @connection = Connection.new(task)
      end

      # Runs every query of this task's slice and adds each resulting row to
      # the page builder.
      #
      # @return [Hash] always an empty task report
      def run
        @queries.each do |query|
          @connection.search_with_query(query) { |result|
            if @add_query_to_record
              # Fills the extra "query" column declared in +transaction+.
              result << query
            end
            page_builder.add(result)
          }
        end
        page_builder.finish

        task_report = {}
        return task_report
      end
    end
  end
end
|
@@ -0,0 +1,101 @@
|
|
1
|
+
require 'excon'
|
2
|
+
require 'elasticsearch'
|
3
|
+
|
4
|
+
module Embulk
  module Input
    class Elasticsearch < InputPlugin
      # Wraps an Elasticsearch client and exposes a scrolling search that
      # yields converted rows one at a time.
      class Connection
        # Upper bound, in seconds, for the exponential back-off between
        # retries. With `retry_on_failure: 0` (retry forever) the wait would
        # otherwise keep doubling without limit. The default of 5 retries
        # (2, 4, 8, 16, 32 seconds) is unaffected.
        MAX_RETRY_WAIT = 64

        # @param task [Hash] task hash built by Elasticsearch.transaction
        def initialize(task)
          @scroll = task['scroll']
          @index = task['index']
          @index_type = task['index_type']
          @size = task['per_size']
          @fields = task['fields']
          @sort = task['sort']
          @limit_size = task['limit_size']
          @retry_on_failure = task['retry_on_failure']
          @client = create_client(
            nodes: task['nodes'],
            reload_connections: task['reload_connections'],
            reload_on_failure: task['reload_on_failure'],
            retry_on_failure: task['retry_on_failure'],
            request_timeout: task['request_timeout']
          )
        end

        # Builds an Elasticsearch client on top of the Faraday transport.
        #
        # @param nodes [Array<Hash>] host definitions; keys are symbolized
        # @return [::Elasticsearch::Client]
        def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
          transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
            {
              hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
              options: {
                reload_connections: reload_connections,
                reload_on_failure: reload_on_failure,
                retry_on_failure: retry_on_failure,
                transport_options: {
                  request: { timeout: request_timeout },
                  # NOTE(review): certificate verification is switched off
                  # unconditionally. The package name (-nosslverify) suggests
                  # this is intentional, but HTTPS peers are not authenticated.
                  ssl: { verify: false }
                }
              }
            }
          )

          ::Elasticsearch::Client.new transport: transport
        end

        # Runs +query+ as a scrolling search and yields every converted row
        # to the given block, stopping after `limit_size` rows when a limit
        # is configured.
        #
        # The scroll context is released when iteration ends for any reason
        # (exhausted, limit reached, or an error), instead of being left to
        # expire on the server.
        #
        # @param query [String, nil] query_string query; nil matches without a query clause
        # @return [nil]
        def search_with_query(query)
          search_option = get_search_option(query)
          Embulk.logger.info("#{search_option}")
          response = search_with_retry { @client.search(search_option) }
          return if response.nil?

          scroll_id = response['_scroll_id']
          emitted = 0
          begin
            while response
              Converter.get_sources(response, @fields).each do |result|
                yield(result) if block_given?
                return if @limit_size == (emitted += 1)
              end

              response = search_with_retry { @client.scroll(scroll_id: scroll_id, scroll: @scroll) }
              break if response.nil?
              # Follow the scroll id returned with each page.
              scroll_id = response['_scroll_id'] || scroll_id
              break if response['hits']['hits'].empty?
            end
          ensure
            clear_scroll(scroll_id)
          end
          nil
        end

        private

        # Runs the block, retrying on any StandardError with exponential
        # back-off. `retry_on_failure == 0` means retry forever.
        #
        # @raise [Elasticsearch::ConnectionError] once the retries are used up
        def search_with_retry
          retries = 0
          begin
            yield if block_given?
          rescue => e
            if (@retry_on_failure == 0 || retries < @retry_on_failure)
              retries += 1
              Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
              sleep [2**retries, MAX_RETRY_WAIT].min
              retry
            end
            msg = "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
            raise Elasticsearch::ConnectionError.new e, msg
          end
        end

        # Best-effort release of a scroll context. Failures are logged and
        # swallowed so that cleanup never masks the outcome of the search.
        def clear_scroll(scroll_id)
          return if scroll_id.nil?
          @client.clear_scroll(scroll_id: scroll_id)
        rescue => e
          Embulk.logger.warn "Could not clear Elasticsearch scroll context. #{e.message}"
        end

        # Builds the argument hash for the initial search request.
        #
        # Sorts by the configured `sort` hash when present, otherwise by
        # "_doc". Only non-metadata field names are requested via `_source`.
        def get_search_option(query)
          body = { }
          body[:query] = { query_string: { query: query } } unless query.nil?
          if @sort
            body[:sort] = @sort.map { |k, v| { k => v } }
          else
            body[:sort] = ["_doc"]
          end
          search_option = { index: @index, type: @index_type, scroll: @scroll, body: body, size: @size }
          search_option[:_source] = @fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
          search_option
        end
      end
    end
  end
end
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Time.parse lives in the 'time' standard library, which this file did not load.
require 'time'

module Embulk
  module Input
    class Elasticsearch < InputPlugin
      # Turns Elasticsearch search responses into rows of typed values.
      class Converter
        # Builds one row per hit, with one value per configured field.
        #
        # Fields flagged `metadata` are read from the hit itself (e.g. `_id`);
        # all others are read from the hit's `_source`.
        #
        # @param results [Hash] search or scroll response
        # @param fields [Array<Hash>] field definitions ("name", "type", optional "metadata")
        # @return [Array<Array>] rows in hit order, values in field order
        def self.get_sources(results, fields)
          hits = results['hits']['hits']
          hits.map { |hit|
            # A hit without `_source` yields nil for every source field
            # instead of raising NoMethodError.
            result = hit['_source'] || {}
            fields.map { |field|
              value = field['metadata'] ? hit[field['name']] : result[field['name']]
              convert_value(value, field)
            }
          }
        end

        # Casts a raw value to the configured field type.
        #
        # @param value [Object, nil] raw value from the hit
        # @param field [Hash] field definition; only "type" is read
        # @return [Object, nil] the cast value; nil stays nil, and an
        #   unrecognised boolean spelling becomes nil
        # @raise [Elasticsearch::TypecastError] for an unsupported type
        def self.convert_value(value, field)
          return nil if value.nil?
          case field["type"]
          when "string"
            value
          when "long"
            value.to_i
          when "double"
            value.to_f
          when "boolean"
            if value.is_a?(TrueClass) || value.is_a?(FalseClass)
              value
            else
              # to_s first: numeric 1/0 have no #downcase and used to raise.
              downcased_val = value.to_s.downcase
              case downcased_val
              when 'true' then true
              when 'false' then false
              when '1' then true
              when '0' then false
              else nil
              end
            end
          when "timestamp"
            Time.parse(value)
          when "json"
            value
          else
            raise Elasticsearch::TypecastError.new "Unsupported type #{field['type']}"
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
module Embulk
  module Input

    class Elasticsearch < InputPlugin

      # Mixin for the plugin's error classes: builds a message that contains
      # the original error, an optional extra message, and the backtraces of
      # the error and of its chain of causes.
      module Traceable
        # @param e [Exception, String] the wrapped error, or a plain message
        # @param more_msg [String, nil] extra text appended after the error text
        def initialize(e, more_msg = nil)
          # ''.dup gives a mutable buffer even under frozen string literals.
          message = ''.dup
          message << "(#{e.class}) " unless e.is_a?(String)
          message << e.to_s
          # Separate the two parts; they used to be run together.
          message << " #{more_msg}" unless more_msg.nil? || more_msg.to_s.empty?
          message << "\n"
          append_backtrace(message, e)

          while e.respond_to?(:cause) and e.cause
            # Java Exception cannot follow the JRuby causes.
            message << "Caused by (#{e.cause.class}) #{e.cause}\n"
            append_backtrace(message, e.cause)
            e = e.cause
          end

          super(message)
        end

        private

        # Appends the backtrace of +error+ to +message+ when it has one.
        # An exception that was never raised has a nil backtrace.
        def append_backtrace(message, error)
          return unless error.respond_to?(:backtrace)
          backtrace = error.backtrace
          return if backtrace.nil?
          message << "\tat #{backtrace.join("\n\tat ")}\n"
        end
      end

      # Configuration-level failure, reported through Embulk's ConfigError.
      class ConfigError < ::Embulk::ConfigError
        include Traceable
      end

      # Raised when a search still fails after the configured retries.
      class ConnectionError < ConfigError
      end

      # Data-level failure, reported through Embulk's DataError.
      class DataError < ::Embulk::DataError
        include Traceable
      end

      # Raised when a field is configured with an unsupported type.
      class TypecastError < DataError
      end

    end
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Embulk
  module Input
    class Elasticsearch < InputPlugin
      # Splits the configured queries into per-task slices.
      class InputThread
        # Splits +array+ into at most +define_num_threads+ slices of equal
        # size (the last slice may be shorter).
        #
        # The slice size is rounded up, so fewer slices than requested may be
        # returned: 20 items over 8 threads gives 7 slices of up to 3 items.
        #
        # @param array [Array] items to distribute
        # @param define_num_threads [Integer] requested number of slices;
        #   values below 1 are treated as 1
        # @return [Array<Array>] the slices; empty when +array+ is empty
        def self.get_slice_from_num_threads(array, define_num_threads)
          # An empty list used to give num_threads == 0 and raise
          # ZeroDivisionError in the modulo below.
          if array.empty?
            Embulk.logger.info("calculate num threads => 0")
            return []
          end

          # Never more slices than items, and never fewer than one.
          num_threads = [[array.size, define_num_threads].min, 1].max
          # Ceiling division.
          per_queries = (array.size + num_threads - 1) / num_threads
          sliced = array.each_slice(per_queries).to_a
          Embulk.logger.info("calculate num threads => #{sliced.size}")
          return sliced
        end
      end
    end
  end
end
|
data/test/helper.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
#!/usr/bin/env ruby

# Shared bootstrap for the test suite: loads test-unit and Embulk, silences
# the Embulk logger, and loads the plugin under test.

require 'test/unit'

require 'embulk'
begin
  # Embulk ~> 0.8.x
  Embulk.setup
rescue NotImplementedError
  # Embulk ~> 0.9.x
  require 'embulk/java/bootstrap'
end
# File::NULL is the platform's null device ("/dev/null" on Unix), so the
# suite does not depend on a Unix-only path.
Embulk.logger = Embulk::Logger.new(File::NULL)

APP_ROOT = File.expand_path('../', __dir__)
TEST_ROOT = File.expand_path(File.dirname(__FILE__))

require 'embulk/input/elasticsearch'
|
@@ -0,0 +1,51 @@
|
|
1
|
+
require_relative './helper'

# The former top-level `Elasticsearch = Embulk::Input::Elasticsearch` alias was
# removed: it reassigned the ::Elasticsearch constant of the elasticsearch
# gem, and nothing in this file used it (Converter resolves lexically).

module Embulk
  class Input::Elasticsearch
    # Tests for Converter.
    class TestConverter < Test::Unit::TestCase

      sub_test_case "get_sources" do
        # A metadata field is read from the hit and the other fields from
        # `_source`, in the order the fields are configured.
        def test_normal
          fields = [
            {"name"=>"_id", "type"=>"string", "metadata"=>true},
            {"name"=>"product_id", "type"=>"long"},
            {"name"=>"title", "type"=>"string"}
          ]

          results = {
            "_scroll_id"=>"cXVlcnlUaGVuRmV0Y2g7NTsxNzg3MjE6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjI6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjM6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjU6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjQ6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTswOw==",
            "took"=>41,
            "timed_out"=>false,
            "_shards"=>{"total"=>5, "successful"=>5, "failed"=>0},
            "hits"=>{
              "total"=>1,
              "max_score"=>nil,
              "hits"=>[
                {
                  "_index"=>"test_index",
                  "_type"=>"test_type",
                  "_id"=>"AVTCxiCuNR-BVKOgUB7R",
                  "_score"=>nil,
                  "_source"=>{
                    "title"=>"dummy title",
                    "product_id"=>1
                  },
                  "sort"=>[12534]
                }
              ]
            }
          }
          # assert_equal takes (expected, actual).
          assert_equal [["AVTCxiCuNR-BVKOgUB7R", 1, "dummy title"]], Converter.get_sources(results, fields)
        end
      end
    end
  end
end
|
@@ -0,0 +1,36 @@
|
|
1
|
+
require_relative './helper'

# The former top-level `Elasticsearch = Embulk::Input::Elasticsearch` alias was
# removed: it reassigned the ::Elasticsearch constant of the elasticsearch
# gem, and nothing in this file used it (InputThread resolves lexically).

module Embulk
  class Input::Elasticsearch
    # Tests for InputThread. Previously named TestTransaction, which reopened
    # the class of the same name in test_transaction.rb.
    class TestInputThread < Test::Unit::TestCase
      sub_test_case "get_slice_from_num_threads" do
        # Items divide evenly across the requested threads.
        def test_normal
          slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
          assert_equal 5, slice.size
          assert_equal 2, slice.first.size
        end

        # As many threads as items: one item per slice.
        def test_normal_same
          slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
          assert_equal 3, slice.size
          assert_equal 1, slice.first.size
        end

        # More threads than items: capped at one slice per item.
        def test_num_threads_over_array_size
          slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
          assert_equal 3, slice.size
          assert_equal 1, slice.first.size
        end

        # Uneven split: the slice size is rounded up, so fewer slices than
        # requested are produced.
        def test_rest
          slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
          assert_equal 7, slice.size
          assert_equal 3, slice.first.size
        end
      end
    end
  end
end
|
35
|
+
end
|
36
|
+
|
@@ -0,0 +1,50 @@
|
|
1
|
+
require_relative './helper'
require 'yaml'

# The former top-level `Elasticsearch = Embulk::Input::Elasticsearch` alias was
# removed because it reassigned the ::Elasticsearch constant of the
# elasticsearch gem; the plugin class is referenced by its full name instead.

module Embulk
  class Input::Elasticsearch
    # Tests for Elasticsearch.transaction.
    class TestTransaction < Test::Unit::TestCase
      # Stand-in for the Embulk control callback: runs nothing and returns an
      # empty list of task reports.
      def control
        Proc.new { |task| [] }
      end

      sub_test_case "transaction" do
        # Full configuration: transaction completes and returns the empty
        # config diff.
        def test_normal
          yaml = YAML.safe_load(%(
            nodes:
              - {host: localhost, port: 9200}
            queries:
              - 'title: 製函機'
            index: crawl
            index_type: m_corporation_page
            request_timeout: 60
            per_size: 1000
            limit_size: 2000
            num_threads: 20
            fields:
              - { name: title, type: string }
            )
          )
          config = DataSource.new(yaml)
          assert_equal({}, Embulk::Input::Elasticsearch.transaction(config, &control))
        end

        # Required options only: the defaults are enough for transaction to
        # complete.
        def test_minimum
          yaml = YAML.safe_load(%(
            nodes:
              - {host: localhost, port: 9200}
            queries:
              - 'title: 製函機'
            index: crawl
            fields:
              - { name: title, type: string }
            )
          )
          config = DataSource.new(yaml)
          assert_equal({}, Embulk::Input::Elasticsearch.transaction(config, &control))
        end
      end
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,148 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: embulk-input-elasticsearch-nosslverify
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.3.6
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- toyama0919
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-11-07 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: elasticsearch
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: excon
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: embulk
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - ">="
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: 0.8.18
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - ">="
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 0.8.18
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: bundler
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: 1.10.6
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: 1.10.6
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: rake
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '10.0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '10.0'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: test-unit
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - ">="
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - ">="
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '0'
|
97
|
+
description: Loads records from Elasticsearch. parallel query support.
|
98
|
+
email:
|
99
|
+
- toyama0919@gmail.com
|
100
|
+
executables: []
|
101
|
+
extensions: []
|
102
|
+
extra_rdoc_files: []
|
103
|
+
files:
|
104
|
+
- ".gitignore"
|
105
|
+
- ".ruby-version"
|
106
|
+
- ".travis.yml"
|
107
|
+
- Gemfile
|
108
|
+
- LICENSE.txt
|
109
|
+
- README.md
|
110
|
+
- Rakefile
|
111
|
+
- embulk-input-elasticsearch.gemspec
|
112
|
+
- lib/embulk/input/elasticsearch.rb
|
113
|
+
- lib/embulk/input/elasticsearch/connection.rb
|
114
|
+
- lib/embulk/input/elasticsearch/converter.rb
|
115
|
+
- lib/embulk/input/elasticsearch/error.rb
|
116
|
+
- lib/embulk/input/elasticsearch/input_thread.rb
|
117
|
+
- test/helper.rb
|
118
|
+
- test/test_converter.rb
|
119
|
+
- test/test_input_thread.rb
|
120
|
+
- test/test_transaction.rb
|
121
|
+
homepage: https://github.com/toyama0919/embulk-input-elasticsearch
|
122
|
+
licenses:
|
123
|
+
- MIT
|
124
|
+
metadata: {}
|
125
|
+
post_install_message:
|
126
|
+
rdoc_options: []
|
127
|
+
require_paths:
|
128
|
+
- lib
|
129
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
130
|
+
requirements:
|
131
|
+
- - ">="
|
132
|
+
- !ruby/object:Gem::Version
|
133
|
+
version: '0'
|
134
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
135
|
+
requirements:
|
136
|
+
- - ">="
|
137
|
+
- !ruby/object:Gem::Version
|
138
|
+
version: '0'
|
139
|
+
requirements: []
|
140
|
+
rubygems_version: 3.0.3
|
141
|
+
signing_key:
|
142
|
+
specification_version: 4
|
143
|
+
summary: Elasticsearch input plugin for Embulk
|
144
|
+
test_files:
|
145
|
+
- test/helper.rb
|
146
|
+
- test/test_converter.rb
|
147
|
+
- test/test_input_thread.rb
|
148
|
+
- test/test_transaction.rb
|