embulk-input-elasticsearch 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24e8943311e14d4c3ba8fbbdb490bed67a2974ba
4
- data.tar.gz: 4f795401dfb26d3e5b58e39d4921a99eb1530445
3
+ metadata.gz: 0991bd544cf235c70290b6d307f164112280d1ea
4
+ data.tar.gz: 4e578ad476510b38eaa09ef9a20437f1548cc70c
5
5
  SHA512:
6
- metadata.gz: 0b48169a99f7d055271aeecf213704bd1798d9b4f0bac14b28a56933c5ca3ce0bce48b849690abe8a133b7f2e06d4bd869c2cdb01351904fa9992bc7f429efa8
7
- data.tar.gz: 1d2dafc4528c3d306f7bbbd08de973b3bdc2ec0ab4dc6ae0997aeb1adbe83e940d71e9becf0c9dcafeebf8169a76275fb9ce3bb4c0e3a6d9dd1a9d3ad4612a1d
6
+ metadata.gz: b47b9034846fc515fda638e231b5c62f13b0c8fecf1fba8ef16ad8f084f967518b2089df5a95e7681bb6e9af2dd918da452b843d1072dd04e22f01f504af674e
7
+ data.tar.gz: ddc4879a8a6c0524499f1b10f3b1e871c1920024aefa81e1c4bf562c44533cc161ea6d8d202bf1dd89af30688f35790c3f2f9b0a9a637a9af9ebdd86e32b6e09
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-input-elasticsearch"
4
- spec.version = "0.2.1"
4
+ spec.version = "0.3.0"
5
5
  spec.authors = ["toyama0919"]
6
6
  spec.summary = "Elasticsearch input plugin for Embulk"
7
7
  spec.description = "Loads records from Elasticsearch. parallel query support."
@@ -1,11 +1,14 @@
1
1
  require 'excon'
2
2
  require 'elasticsearch'
3
+ require_relative 'elasticsearch/connection'
4
+ require_relative 'elasticsearch/input_thread'
3
5
 
4
6
  module Embulk
5
7
  module Input
6
8
 
7
9
  class Elasticsearch < InputPlugin
8
10
  Plugin.register_input("elasticsearch", self)
11
+ ADD_QUERY_TO_RECORD_KEY = 'query'
9
12
 
10
13
  def self.transaction(config, &control)
11
14
  task = {
@@ -25,31 +28,19 @@ module Embulk
25
28
  }
26
29
  # TODO: want max_threads
27
30
  define_num_threads = config.param("num_threads", :integer, default: 1)
28
- task['slice_queries'] = get_slice_from_num_threads(task['queries'], define_num_threads)
31
+ task['slice_queries'] = InputThread.get_slice_from_num_threads(task['queries'], define_num_threads)
29
32
 
30
33
  columns = []
31
34
  task['fields'].each_with_index{ |field, i|
32
35
  columns << Column.new(i, field['name'], field['type'].to_sym)
33
36
  }
34
37
  if task['add_query_to_record']
35
- columns << Column.new(task['fields'].size, "query", :string)
38
+ columns << Column.new(task['fields'].size, ADD_QUERY_TO_RECORD_KEY, :string)
36
39
  end
37
40
 
38
41
  resume(task, columns, task['slice_queries'].size, &control)
39
42
  end
40
43
 
41
- def self.get_slice_from_num_threads(array, define_num_threads)
42
- num_threads = array.size < define_num_threads ? array.size : define_num_threads
43
- per_queries = if (array.size % num_threads) == 0
44
- (array.size / num_threads)
45
- else
46
- (array.size / num_threads) + 1
47
- end
48
- sliced = array.each_slice(per_queries).to_a
49
- Embulk.logger.info("calculate num threads => #{sliced.size}")
50
- return sliced
51
- end
52
-
53
44
  def self.resume(task, columns, count, &control)
54
45
  task_reports = yield(task, columns, count)
55
46
 
@@ -57,28 +48,10 @@ module Embulk
57
48
  return next_config_diff
58
49
  end
59
50
 
60
- def self.create_client(task)
61
- transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
62
- {
63
- hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
64
- options: {
65
- reload_connections: task['reload_connections'],
66
- reload_on_failure: task['reload_on_failure'],
67
- retry_on_failure: task['retry_on_failure'],
68
- transport_options: {
69
- request: { timeout: task['request_timeout'] }
70
- }
71
- }
72
- }
73
- )
74
-
75
- ::Elasticsearch::Client.new transport: transport
76
- end
77
-
78
51
  def init
79
52
  @queries = task['slice_queries'][@index]
80
53
  Embulk.logger.info("this thread queries => #{@queries}")
81
- @client = self.class.create_client(task)
54
+ @client = Connection.create_client(task)
82
55
  @index_name = task['index']
83
56
  @index_type = task['index_type']
84
57
  @per_size = task['per_size']
@@ -89,27 +62,7 @@ module Embulk
89
62
  end
90
63
 
91
64
  def run
92
- @queries.each do |query|
93
- query_count = 0
94
- no_source_results = search(@index_type, query, 0, 0, @routing, @fields, @sort)
95
- total_count = [no_source_results['hits']['total'], @limit_size].compact.min
96
- while true
97
- now_results_size = query_count * @per_size
98
- next_results_size = (query_count + 1) * @per_size
99
- size = get_size(next_results_size, now_results_size ,total_count)
100
- break if size == 0
101
-
102
- results = get_sources(search(@index_type, query, size, now_results_size, @routing, @fields, @sort), @fields)
103
- results.each do |record|
104
- if @add_query_to_record
105
- record << query
106
- end
107
- page_builder.add(record)
108
- end
109
- break if last_query?(next_results_size ,total_count)
110
- query_count += 1
111
- end
112
- end
65
+ search(@index_type, @per_size, @routing, @fields, @sort)
113
66
  page_builder.finish
114
67
 
115
68
  task_report = {}
@@ -118,65 +71,52 @@ module Embulk
118
71
 
119
72
  private
120
73
 
121
- def convert_value(value, field)
122
- return nil if value.nil?
123
- case field["type"]
124
- when "string"
125
- value
126
- when "long"
127
- value.to_i
128
- when "double"
129
- value.to_f
130
- when "boolean"
131
- if value.is_a?(TrueClass) || value.is_a?(FalseClass)
132
- value
133
- else
134
- downcased_val = value.downcase
135
- case downcased_val
136
- when 'true' then true
137
- when 'false' then false
138
- when '1' then true
139
- when '0' then false
140
- else nil
141
- end
142
- end
143
- when "timestamp"
144
- Time.parse(value)
145
- when "json"
146
- value
147
- else
148
- raise "Unsupported type #{field['type']}"
74
+ def search(type, size, routing, fields, sort)
75
+ @queries.each do |query|
76
+ search_with_query(query, type, size, routing, fields, sort)
149
77
  end
150
78
  end
151
79
 
152
- def get_size(next_results_size, now_results_size ,total_count)
153
- if last_query?(next_results_size ,total_count)
154
- (total_count - now_results_size)
155
- else
156
- @per_size
80
+ def search_with_query(query, type, size, routing, fields, sort)
81
+ search_option = get_search_option(type, query, size, fields, sort)
82
+ Embulk.logger.info("#{search_option}")
83
+ r = @client.search(search_option)
84
+ i = 0
85
+ get_sources(r, fields).each do |result|
86
+ result_proc(result, query)
87
+ return if @limit_size == (i += 1)
88
+ end
89
+
90
+ while r = @client.scroll(scroll_id: r['_scroll_id'], scroll: '1m') and (not r['hits']['hits'].empty?) do
91
+ get_sources(r, fields).each do |result|
92
+ result_proc(result, query)
93
+ return if @limit_size == (i += 1)
94
+ end
157
95
  end
158
96
  end
159
97
 
160
- def last_query?(next_results_size ,total_count)
161
- next_results_size > total_count
98
+ def result_proc(result, query)
99
+ if @add_query_to_record
100
+ result << query
101
+ end
102
+ page_builder.add(result)
162
103
  end
163
104
 
164
- def search(type, query, size, from, routing, fields, sort)
165
- body = { from: from }
166
- body[:size] = size unless size.nil?
105
+ def get_search_option(type, query, size, fields, sort)
106
+ body = { }
107
+ body[:query] = { query_string: { query: query } } unless query.nil?
167
108
  if sort
168
109
  sorts = []
169
110
  sort.each do |k, v|
170
111
  sorts << { k => v }
171
112
  end
172
113
  body[:sort] = sorts
114
+ else
115
+ body[:sort] = ["_doc"]
173
116
  end
174
- body[:query] = { query_string: { query: query } } unless query.nil?
175
- search_option = { index: @index_name, type: type, body: body }
176
- search_option[:routing] = routing unless routing.nil?
117
+ search_option = { index: @index_name, type: type, scroll: '1m', body: body, size: size }
177
118
  search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
178
- Embulk.logger.info(%Q{search_option => #{search_option}})
179
- @client.search(search_option)
119
+ search_option
180
120
  end
181
121
 
182
122
  def get_sources(results, fields)
@@ -191,6 +131,37 @@ module Embulk
191
131
  }
192
132
  }
193
133
  end
134
+
135
+ def convert_value(value, field)
136
+ return nil if value.nil?
137
+ case field["type"]
138
+ when "string"
139
+ value
140
+ when "long"
141
+ value.to_i
142
+ when "double"
143
+ value.to_f
144
+ when "boolean"
145
+ if value.is_a?(TrueClass) || value.is_a?(FalseClass)
146
+ value
147
+ else
148
+ downcased_val = value.downcase
149
+ case downcased_val
150
+ when 'true' then true
151
+ when 'false' then false
152
+ when '1' then true
153
+ when '0' then false
154
+ else nil
155
+ end
156
+ end
157
+ when "timestamp"
158
+ Time.parse(value)
159
+ when "json"
160
+ value
161
+ else
162
+ raise "Unsupported type #{field['type']}"
163
+ end
164
+ end
194
165
  end
195
166
  end
196
167
  end
@@ -0,0 +1,25 @@
1
+ module Embulk
2
+ module Input
3
+ class Elasticsearch < InputPlugin
4
+ class Connection
5
+ def self.create_client(task)
6
+ transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
7
+ {
8
+ hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
9
+ options: {
10
+ reload_connections: task['reload_connections'],
11
+ reload_on_failure: task['reload_on_failure'],
12
+ retry_on_failure: task['retry_on_failure'],
13
+ transport_options: {
14
+ request: { timeout: task['request_timeout'] }
15
+ }
16
+ }
17
+ }
18
+ )
19
+
20
+ ::Elasticsearch::Client.new transport: transport
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
@@ -0,0 +1,19 @@
1
+ module Embulk
2
+ module Input
3
+ class Elasticsearch < InputPlugin
4
+ class InputThread
5
+ def self.get_slice_from_num_threads(array, define_num_threads)
6
+ num_threads = array.size < define_num_threads ? array.size : define_num_threads
7
+ per_queries = if (array.size % num_threads) == 0
8
+ (array.size / num_threads)
9
+ else
10
+ (array.size / num_threads) + 1
11
+ end
12
+ sliced = array.each_slice(per_queries).to_a
13
+ Embulk.logger.info("calculate num threads => #{sliced.size}")
14
+ return sliced
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -13,25 +13,25 @@ module Embulk
13
13
 
14
14
  sub_test_case "get_slice_from_num_threads" do
15
15
  def test_normal
16
- slice = Elasticsearch.get_slice_from_num_threads((1..10).to_a, 5)
16
+ slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
17
17
  assert_equal slice.size, 5
18
18
  assert_equal slice.first.size, 2
19
19
  end
20
20
 
21
21
  def test_normal_same
22
- slice = Elasticsearch.get_slice_from_num_threads((1..3).to_a, 3)
22
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
23
23
  assert_equal slice.size, 3
24
24
  assert_equal slice.first.size, 1
25
25
  end
26
26
 
27
27
  def test_num_threads_over_array_size
28
- slice = Elasticsearch.get_slice_from_num_threads((1..3).to_a, 10)
28
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
29
29
  assert_equal slice.size, 3
30
30
  assert_equal slice.first.size, 1
31
31
  end
32
32
 
33
33
  def test_rest
34
- slice = Elasticsearch.get_slice_from_num_threads((1..20).to_a, 8)
34
+ slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
35
35
  assert_equal slice.size, 7
36
36
  assert_equal slice.first.size, 3
37
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-elasticsearch
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-21 00:00:00.000000000 Z
11
+ date: 2016-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -110,6 +110,8 @@ files:
110
110
  - Rakefile
111
111
  - embulk-input-elasticsearch.gemspec
112
112
  - lib/embulk/input/elasticsearch.rb
113
+ - lib/embulk/input/elasticsearch/connection.rb
114
+ - lib/embulk/input/elasticsearch/input_thread.rb
113
115
  - test/helper.rb
114
116
  - test/test_transaction.rb
115
117
  homepage: https://github.com/toyama0919/embulk-input-elasticsearch