embulk-input-elasticsearch 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 209e87c44e92108a80a402616ef1d7ad7db9cbd7
4
- data.tar.gz: ec7efbe789ee0e2b17d922750172ba0e9f745ef2
3
+ metadata.gz: 6e50e12402a605f80964ad84cf29c555d6526671
4
+ data.tar.gz: a69bf91656bd9a569e68ee437efecf69cd03db57
5
5
  SHA512:
6
- metadata.gz: 337eac65409f489536ba0e4107cdca94115530b3781b2e2e8eb90b55e3df7e46e4f79530d9776b2ee341887bde394387fe3f08850b0aa2a1c0a50b9d9ba6c613
7
- data.tar.gz: 612f003dccdab158bd094b34d2853c3cd368b955b860b5a58f71ceaa3cae416c492e556acf02550ddc28aec9920485ae15fac0a8b3b78419e4bc78f31f00cd43
6
+ metadata.gz: 2400cf1b6273097d67d2a71f65f2984bbf3b1a79c03b1070c02aff0e1913d99be37af80e2a6d3eba0a65ef9462cee6b53d3df23ce70c6579694f4a3c278eda3e
7
+ data.tar.gz: a21f701e1b9695e170858ebdb16c3042cb38f6557573d03f1142b1764b2771b9c75d27056acb47e78f2a469953cecff65d2c73a2b8addd69834dc5c51b6d1eee
data/.travis.yml CHANGED
@@ -2,6 +2,7 @@ language: ruby
2
2
  cache: bundler
3
3
  rvm:
4
4
  - jruby-9.0.5.0
5
+ - jruby-9.1.5.0
5
6
  - jruby-head
6
7
  jdk:
7
8
  - openjdk7
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-input-elasticsearch"
4
- spec.version = "0.3.2"
4
+ spec.version = "0.3.3"
5
5
  spec.authors = ["toyama0919"]
6
6
  spec.summary = "Elasticsearch input plugin for Embulk"
7
7
  spec.description = "Loads records from Elasticsearch. parallel query support."
@@ -1,7 +1,6 @@
1
- require 'excon'
2
- require 'elasticsearch'
3
1
  require_relative 'elasticsearch/connection'
4
2
  require_relative 'elasticsearch/input_thread'
3
+ require_relative 'elasticsearch/converter'
5
4
 
6
5
  module Embulk
7
6
  module Input
@@ -52,135 +51,24 @@ module Embulk
52
51
  def init
53
52
  @queries = task['slice_queries'][@index]
54
53
  Embulk.logger.info("this thread queries => #{@queries}")
55
- @client = Connection.create_client(task)
56
- @index_name = task['index']
57
- @index_type = task['index_type']
58
- @per_size = task['per_size']
59
- @limit_size = task['limit_size']
60
- @fields = task['fields']
61
- @sort = task['sort']
62
54
  @add_query_to_record = task['add_query_to_record']
63
- @scroll = task['scroll']
64
- @retry_on_failure = task['retry_on_failure']
55
+ @connection = Connection.new(task)
65
56
  end
66
57
 
67
58
  def run
68
- search(@index_type, @per_size, @routing, @fields, @sort)
59
+ @queries.each do |query|
60
+ @connection.search_with_query(query) { |result|
61
+ if @add_query_to_record
62
+ result << query
63
+ end
64
+ page_builder.add(result)
65
+ }
66
+ end
69
67
  page_builder.finish
70
68
 
71
69
  task_report = {}
72
70
  return task_report
73
71
  end
74
-
75
- private
76
-
77
- def search(type, size, routing, fields, sort)
78
- @queries.each do |query|
79
- search_with_query(query, type, size, routing, fields, sort)
80
- end
81
- end
82
-
83
- def search_with_query(query, type, size, routing, fields, sort)
84
- search_option = get_search_option(type, query, size, fields, sort)
85
- Embulk.logger.info("#{search_option}")
86
- r = search_with_retry { @client.search(search_option) }
87
- i = 0
88
- get_sources(r, fields).each do |result|
89
- result_proc(result, query)
90
- return if @limit_size == (i += 1)
91
- end
92
-
93
- while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
94
- get_sources(r, fields).each do |result|
95
- result_proc(result, query)
96
- return if @limit_size == (i += 1)
97
- end
98
- end
99
- end
100
-
101
- def search_with_retry
102
- retries = 0
103
- begin
104
- yield if block_given?
105
- rescue => e
106
- if retries < @retry_on_failure
107
- retries += 1
108
- Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
109
- sleep 2**retries
110
- retry
111
- end
112
- Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
113
- raise
114
- end
115
- end
116
-
117
- def result_proc(result, query)
118
- if @add_query_to_record
119
- result << query
120
- end
121
- page_builder.add(result)
122
- end
123
-
124
- def get_search_option(type, query, size, fields, sort)
125
- body = { }
126
- body[:query] = { query_string: { query: query } } unless query.nil?
127
- if sort
128
- sorts = []
129
- sort.each do |k, v|
130
- sorts << { k => v }
131
- end
132
- body[:sort] = sorts
133
- else
134
- body[:sort] = ["_doc"]
135
- end
136
- search_option = { index: @index_name, type: type, scroll: @scroll, body: body, size: size }
137
- search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
138
- search_option
139
- end
140
-
141
- def get_sources(results, fields)
142
- hits = results['hits']['hits']
143
- hits.map { |hit|
144
- result = hit['_source']
145
- fields.select{ |field| field['metadata'] }.each { |field|
146
- result[field['name']] = hit[field['name']]
147
- }
148
- @fields.map { |field|
149
- convert_value(result[field['name']], field)
150
- }
151
- }
152
- end
153
-
154
- def convert_value(value, field)
155
- return nil if value.nil?
156
- case field["type"]
157
- when "string"
158
- value
159
- when "long"
160
- value.to_i
161
- when "double"
162
- value.to_f
163
- when "boolean"
164
- if value.is_a?(TrueClass) || value.is_a?(FalseClass)
165
- value
166
- else
167
- downcased_val = value.downcase
168
- case downcased_val
169
- when 'true' then true
170
- when 'false' then false
171
- when '1' then true
172
- when '0' then false
173
- else nil
174
- end
175
- end
176
- when "timestamp"
177
- Time.parse(value)
178
- when "json"
179
- value
180
- else
181
- raise "Unsupported type #{field['type']}"
182
- end
183
- end
184
72
  end
185
73
  end
186
74
  end
@@ -1,17 +1,38 @@
1
+ require 'excon'
2
+ require 'elasticsearch'
3
+
1
4
  module Embulk
2
5
  module Input
3
6
  class Elasticsearch < InputPlugin
4
7
  class Connection
5
- def self.create_client(task)
8
+ def initialize(task)
9
+ @scroll = task['scroll']
10
+ @index = task['index']
11
+ @index_type = task['index_type']
12
+ @size = task['per_size']
13
+ @fields = task['fields']
14
+ @sort = task['sort']
15
+ @limit_size = task['limit_size']
16
+ @retry_on_failure = task['retry_on_failure']
17
+ @client = create_client(
18
+ nodes: task['nodes'],
19
+ reload_connections: task['reload_connections'],
20
+ reload_on_failure: task['reload_on_failure'],
21
+ retry_on_failure: task['retry_on_failure'],
22
+ request_timeout: task['request_timeout']
23
+ )
24
+ end
25
+
26
+ def create_client(nodes: ,reload_connections: ,reload_on_failure: ,retry_on_failure: ,request_timeout:)
6
27
  transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
7
28
  {
8
- hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
29
+ hosts: nodes.map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
9
30
  options: {
10
- reload_connections: task['reload_connections'],
11
- reload_on_failure: task['reload_on_failure'],
12
- retry_on_failure: task['retry_on_failure'],
31
+ reload_connections: reload_connections,
32
+ reload_on_failure: reload_on_failure,
33
+ retry_on_failure: retry_on_failure,
13
34
  transport_options: {
14
- request: { timeout: task['request_timeout'] }
35
+ request: { timeout: request_timeout }
15
36
  }
16
37
  }
17
38
  }
@@ -19,6 +40,57 @@ module Embulk
19
40
 
20
41
  ::Elasticsearch::Client.new transport: transport
21
42
  end
43
+
44
+ def search_with_query(query)
45
+ search_option = get_search_option(query)
46
+ Embulk.logger.info("#{search_option}")
47
+ r = search_with_retry { @client.search(search_option) }
48
+ i = 0
49
+ Converter.get_sources(r, @fields).each do |result|
50
+ yield(result) if block_given?
51
+ return if @limit_size == (i += 1)
52
+ end
53
+
54
+ while r = (search_with_retry { @client.scroll(scroll_id: r['_scroll_id'], scroll: @scroll) }) and (not r['hits']['hits'].empty?) do
55
+ Converter.get_sources(r, @fields).each do |result|
56
+ yield(result) if block_given?
57
+ return if @limit_size == (i += 1)
58
+ end
59
+ end
60
+ end
61
+
62
+ def search_with_retry
63
+ retries = 0
64
+ begin
65
+ yield if block_given?
66
+ rescue => e
67
+ if retries < @retry_on_failure
68
+ retries += 1
69
+ Embulk.logger.warn "Could not search to Elasticsearch, resetting connection and trying again. #{e.message}"
70
+ sleep 2**retries
71
+ retry
72
+ end
73
+ Embulk.logger.error "Could not search to Elasticsearch after #{retries} retries. #{e.message}"
74
+ raise
75
+ end
76
+ end
77
+
78
+ def get_search_option(query)
79
+ body = { }
80
+ body[:query] = { query_string: { query: query } } unless query.nil?
81
+ if @sort
82
+ sorts = []
83
+ @sort.each do |k, v|
84
+ sorts << { k => v }
85
+ end
86
+ body[:sort] = sorts
87
+ else
88
+ body[:sort] = ["_doc"]
89
+ end
90
+ search_option = { index: @index, type: @index_type, scroll: @scroll, body: body, size: @size }
91
+ search_option[:_source] = @fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
92
+ search_option
93
+ end
22
94
  end
23
95
  end
24
96
  end
@@ -0,0 +1,49 @@
1
+ module Embulk
2
+ module Input
3
+ class Elasticsearch < InputPlugin
4
+ class Converter
5
+ def self.get_sources(results, fields)
6
+ hits = results['hits']['hits']
7
+ hits.map { |hit|
8
+ result = hit['_source']
9
+ fields.map { |field|
10
+ value = field['metadata'] ? hit[field['name']] : result[field['name']]
11
+ convert_value(value, field)
12
+ }
13
+ }
14
+ end
15
+
16
+ def self.convert_value(value, field)
17
+ return nil if value.nil?
18
+ case field["type"]
19
+ when "string"
20
+ value
21
+ when "long"
22
+ value.to_i
23
+ when "double"
24
+ value.to_f
25
+ when "boolean"
26
+ if value.is_a?(TrueClass) || value.is_a?(FalseClass)
27
+ value
28
+ else
29
+ downcased_val = value.downcase
30
+ case downcased_val
31
+ when 'true' then true
32
+ when 'false' then false
33
+ when '1' then true
34
+ when '0' then false
35
+ else nil
36
+ end
37
+ end
38
+ when "timestamp"
39
+ Time.parse(value)
40
+ when "json"
41
+ value
42
+ else
43
+ raise "Unsupported type #{field['type']}"
44
+ end
45
+ end
46
+ end
47
+ end
48
+ end
49
+ end
data/test/helper.rb CHANGED
@@ -9,3 +9,5 @@ Embulk.logger = Embulk::Logger.new('/dev/null')
9
9
 
10
10
  APP_ROOT = File.expand_path('../', __dir__)
11
11
  TEST_ROOT = File.expand_path(File.dirname(__FILE__))
12
+
13
+ require 'embulk/input/elasticsearch'
@@ -0,0 +1,51 @@
1
+ require_relative './helper'
2
+
3
+ Elasticsearch = Embulk::Input::Elasticsearch
4
+
5
+ module Embulk
6
+ class Input::Elasticsearch
7
+ class TestConverter < Test::Unit::TestCase
8
+
9
+ def startup
10
+ end
11
+
12
+ def shutdown
13
+ end
14
+
15
+ sub_test_case "get_sources" do
16
+ def test_normal
17
+ fields = [
18
+ {"name"=>"_id", "type"=>"string", "metadata"=>true},
19
+ {"name"=>"product_id", "type"=>"long"},
20
+ {"name"=>"title", "type"=>"string"}
21
+ ]
22
+
23
+ results = {
24
+ "_scroll_id"=>"cXVlcnlUaGVuRmV0Y2g7NTsxNzg3MjE6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjI6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjM6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjU6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTsxNzg3MjQ6WlphQ3V0WDNRYmFRcS1QQ3dCb2s5UTswOw==",
25
+ "took"=>41,
26
+ "timed_out"=>false,
27
+ "_shards"=>{"total"=>5, "successful"=>5, "failed"=>0},
28
+ "hits"=>{
29
+ "total"=>1,
30
+ "max_score"=>nil,
31
+ "hits"=>[
32
+ {
33
+ "_index"=>"test_index",
34
+ "_type"=>"test_type",
35
+ "_id"=>"AVTCxiCuNR-BVKOgUB7R",
36
+ "_score"=>nil,
37
+ "_source"=>{
38
+ "title"=>"dummy title",
39
+ "product_id"=>1
40
+ },
41
+ "sort"=>[12534]
42
+ }
43
+ ]
44
+ }
45
+ }
46
+ assert_equal Converter.get_sources(results, fields), [["AVTCxiCuNR-BVKOgUB7R", 1, "dummy title"]]
47
+ end
48
+ end
49
+ end
50
+ end
51
+ end
@@ -0,0 +1,36 @@
1
+ require_relative './helper'
2
+
3
+ Elasticsearch = Embulk::Input::Elasticsearch
4
+
5
+ module Embulk
6
+ class Input::Elasticsearch
7
+ class TestTransaction < Test::Unit::TestCase
8
+ sub_test_case "get_slice_from_num_threads" do
9
+ def test_normal
10
+ slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
11
+ assert_equal slice.size, 5
12
+ assert_equal slice.first.size, 2
13
+ end
14
+
15
+ def test_normal_same
16
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
17
+ assert_equal slice.size, 3
18
+ assert_equal slice.first.size, 1
19
+ end
20
+
21
+ def test_num_threads_over_array_size
22
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
23
+ assert_equal slice.size, 3
24
+ assert_equal slice.first.size, 1
25
+ end
26
+
27
+ def test_rest
28
+ slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
29
+ assert_equal slice.size, 7
30
+ assert_equal slice.first.size, 3
31
+ end
32
+ end
33
+ end
34
+ end
35
+ end
36
+
@@ -1,5 +1,4 @@
1
1
  require_relative './helper'
2
- require 'embulk/input/elasticsearch'
3
2
  require 'yaml'
4
3
 
5
4
  Elasticsearch = Embulk::Input::Elasticsearch
@@ -10,33 +9,6 @@ module Embulk
10
9
  def control
11
10
  Proc.new {|task| task_reports = [] }
12
11
  end
13
-
14
- sub_test_case "get_slice_from_num_threads" do
15
- def test_normal
16
- slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
17
- assert_equal slice.size, 5
18
- assert_equal slice.first.size, 2
19
- end
20
-
21
- def test_normal_same
22
- slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
23
- assert_equal slice.size, 3
24
- assert_equal slice.first.size, 1
25
- end
26
-
27
- def test_num_threads_over_array_size
28
- slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
29
- assert_equal slice.size, 3
30
- assert_equal slice.first.size, 1
31
- end
32
-
33
- def test_rest
34
- slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
35
- assert_equal slice.size, 7
36
- assert_equal slice.first.size, 3
37
- end
38
- end
39
-
40
12
  sub_test_case "transaction" do
41
13
  def test_normal
42
14
  yaml = YAML.load(%(
@@ -75,4 +47,4 @@ module Embulk
75
47
  end
76
48
  end
77
49
  end
78
- end
50
+ end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-elasticsearch
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.2
4
+ version: 0.3.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-03-16 00:00:00.000000000 Z
11
+ date: 2017-03-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -111,8 +111,11 @@ files:
111
111
  - embulk-input-elasticsearch.gemspec
112
112
  - lib/embulk/input/elasticsearch.rb
113
113
  - lib/embulk/input/elasticsearch/connection.rb
114
+ - lib/embulk/input/elasticsearch/converter.rb
114
115
  - lib/embulk/input/elasticsearch/input_thread.rb
115
116
  - test/helper.rb
117
+ - test/test_converter.rb
118
+ - test/test_input_thread.rb
116
119
  - test/test_transaction.rb
117
120
  homepage: https://github.com/toyama0919/embulk-input-elasticsearch
118
121
  licenses:
@@ -140,4 +143,6 @@ specification_version: 4
140
143
  summary: Elasticsearch input plugin for Embulk
141
144
  test_files:
142
145
  - test/helper.rb
146
+ - test/test_converter.rb
147
+ - test/test_input_thread.rb
143
148
  - test/test_transaction.rb