embulk-input-elasticsearch 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 24e8943311e14d4c3ba8fbbdb490bed67a2974ba
4
- data.tar.gz: 4f795401dfb26d3e5b58e39d4921a99eb1530445
3
+ metadata.gz: 0991bd544cf235c70290b6d307f164112280d1ea
4
+ data.tar.gz: 4e578ad476510b38eaa09ef9a20437f1548cc70c
5
5
  SHA512:
6
- metadata.gz: 0b48169a99f7d055271aeecf213704bd1798d9b4f0bac14b28a56933c5ca3ce0bce48b849690abe8a133b7f2e06d4bd869c2cdb01351904fa9992bc7f429efa8
7
- data.tar.gz: 1d2dafc4528c3d306f7bbbd08de973b3bdc2ec0ab4dc6ae0997aeb1adbe83e940d71e9becf0c9dcafeebf8169a76275fb9ce3bb4c0e3a6d9dd1a9d3ad4612a1d
6
+ metadata.gz: b47b9034846fc515fda638e231b5c62f13b0c8fecf1fba8ef16ad8f084f967518b2089df5a95e7681bb6e9af2dd918da452b843d1072dd04e22f01f504af674e
7
+ data.tar.gz: ddc4879a8a6c0524499f1b10f3b1e871c1920024aefa81e1c4bf562c44533cc161ea6d8d202bf1dd89af30688f35790c3f2f9b0a9a637a9af9ebdd86e32b6e09
embulk-input-elasticsearch.gemspec CHANGED
@@ -1,7 +1,7 @@
1
1
 
2
2
  Gem::Specification.new do |spec|
3
3
  spec.name = "embulk-input-elasticsearch"
4
- spec.version = "0.2.1"
4
+ spec.version = "0.3.0"
5
5
  spec.authors = ["toyama0919"]
6
6
  spec.summary = "Elasticsearch input plugin for Embulk"
7
7
  spec.description = "Loads records from Elasticsearch. parallel query support."
lib/embulk/input/elasticsearch.rb CHANGED
@@ -1,11 +1,14 @@
1
1
  require 'excon'
2
2
  require 'elasticsearch'
3
+ require_relative 'elasticsearch/connection'
4
+ require_relative 'elasticsearch/input_thread'
3
5
 
4
6
  module Embulk
5
7
  module Input
6
8
 
7
9
  class Elasticsearch < InputPlugin
8
10
  Plugin.register_input("elasticsearch", self)
11
+ ADD_QUERY_TO_RECORD_KEY = 'query'
9
12
 
10
13
  def self.transaction(config, &control)
11
14
  task = {
@@ -25,31 +28,19 @@ module Embulk
25
28
  }
26
29
  # TODO: want max_threads
27
30
  define_num_threads = config.param("num_threads", :integer, default: 1)
28
- task['slice_queries'] = get_slice_from_num_threads(task['queries'], define_num_threads)
31
+ task['slice_queries'] = InputThread.get_slice_from_num_threads(task['queries'], define_num_threads)
29
32
 
30
33
  columns = []
31
34
  task['fields'].each_with_index{ |field, i|
32
35
  columns << Column.new(i, field['name'], field['type'].to_sym)
33
36
  }
34
37
  if task['add_query_to_record']
35
- columns << Column.new(task['fields'].size, "query", :string)
38
+ columns << Column.new(task['fields'].size, ADD_QUERY_TO_RECORD_KEY, :string)
36
39
  end
37
40
 
38
41
  resume(task, columns, task['slice_queries'].size, &control)
39
42
  end
40
43
 
41
- def self.get_slice_from_num_threads(array, define_num_threads)
42
- num_threads = array.size < define_num_threads ? array.size : define_num_threads
43
- per_queries = if (array.size % num_threads) == 0
44
- (array.size / num_threads)
45
- else
46
- (array.size / num_threads) + 1
47
- end
48
- sliced = array.each_slice(per_queries).to_a
49
- Embulk.logger.info("calculate num threads => #{sliced.size}")
50
- return sliced
51
- end
52
-
53
44
  def self.resume(task, columns, count, &control)
54
45
  task_reports = yield(task, columns, count)
55
46
 
@@ -57,28 +48,10 @@ module Embulk
57
48
  return next_config_diff
58
49
  end
59
50
 
60
- def self.create_client(task)
61
- transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
62
- {
63
- hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
64
- options: {
65
- reload_connections: task['reload_connections'],
66
- reload_on_failure: task['reload_on_failure'],
67
- retry_on_failure: task['retry_on_failure'],
68
- transport_options: {
69
- request: { timeout: task['request_timeout'] }
70
- }
71
- }
72
- }
73
- )
74
-
75
- ::Elasticsearch::Client.new transport: transport
76
- end
77
-
78
51
  def init
79
52
  @queries = task['slice_queries'][@index]
80
53
  Embulk.logger.info("this thread queries => #{@queries}")
81
- @client = self.class.create_client(task)
54
+ @client = Connection.create_client(task)
82
55
  @index_name = task['index']
83
56
  @index_type = task['index_type']
84
57
  @per_size = task['per_size']
@@ -89,27 +62,7 @@ module Embulk
89
62
  end
90
63
 
91
64
  def run
92
- @queries.each do |query|
93
- query_count = 0
94
- no_source_results = search(@index_type, query, 0, 0, @routing, @fields, @sort)
95
- total_count = [no_source_results['hits']['total'], @limit_size].compact.min
96
- while true
97
- now_results_size = query_count * @per_size
98
- next_results_size = (query_count + 1) * @per_size
99
- size = get_size(next_results_size, now_results_size ,total_count)
100
- break if size == 0
101
-
102
- results = get_sources(search(@index_type, query, size, now_results_size, @routing, @fields, @sort), @fields)
103
- results.each do |record|
104
- if @add_query_to_record
105
- record << query
106
- end
107
- page_builder.add(record)
108
- end
109
- break if last_query?(next_results_size ,total_count)
110
- query_count += 1
111
- end
112
- end
65
+ search(@index_type, @per_size, @routing, @fields, @sort)
113
66
  page_builder.finish
114
67
 
115
68
  task_report = {}
@@ -118,65 +71,52 @@ module Embulk
118
71
 
119
72
  private
120
73
 
121
- def convert_value(value, field)
122
- return nil if value.nil?
123
- case field["type"]
124
- when "string"
125
- value
126
- when "long"
127
- value.to_i
128
- when "double"
129
- value.to_f
130
- when "boolean"
131
- if value.is_a?(TrueClass) || value.is_a?(FalseClass)
132
- value
133
- else
134
- downcased_val = value.downcase
135
- case downcased_val
136
- when 'true' then true
137
- when 'false' then false
138
- when '1' then true
139
- when '0' then false
140
- else nil
141
- end
142
- end
143
- when "timestamp"
144
- Time.parse(value)
145
- when "json"
146
- value
147
- else
148
- raise "Unsupported type #{field['type']}"
74
+ def search(type, size, routing, fields, sort)
75
+ @queries.each do |query|
76
+ search_with_query(query, type, size, routing, fields, sort)
149
77
  end
150
78
  end
151
79
 
152
- def get_size(next_results_size, now_results_size ,total_count)
153
- if last_query?(next_results_size ,total_count)
154
- (total_count - now_results_size)
155
- else
156
- @per_size
80
+ def search_with_query(query, type, size, routing, fields, sort)
81
+ search_option = get_search_option(type, query, size, fields, sort)
82
+ Embulk.logger.info("#{search_option}")
83
+ r = @client.search(search_option)
84
+ i = 0
85
+ get_sources(r, fields).each do |result|
86
+ result_proc(result, query)
87
+ return if @limit_size == (i += 1)
88
+ end
89
+
90
+ while r = @client.scroll(scroll_id: r['_scroll_id'], scroll: '1m') and (not r['hits']['hits'].empty?) do
91
+ get_sources(r, fields).each do |result|
92
+ result_proc(result, query)
93
+ return if @limit_size == (i += 1)
94
+ end
157
95
  end
158
96
  end
159
97
 
160
- def last_query?(next_results_size ,total_count)
161
- next_results_size > total_count
98
+ def result_proc(result, query)
99
+ if @add_query_to_record
100
+ result << query
101
+ end
102
+ page_builder.add(result)
162
103
  end
163
104
 
164
- def search(type, query, size, from, routing, fields, sort)
165
- body = { from: from }
166
- body[:size] = size unless size.nil?
105
+ def get_search_option(type, query, size, fields, sort)
106
+ body = { }
107
+ body[:query] = { query_string: { query: query } } unless query.nil?
167
108
  if sort
168
109
  sorts = []
169
110
  sort.each do |k, v|
170
111
  sorts << { k => v }
171
112
  end
172
113
  body[:sort] = sorts
114
+ else
115
+ body[:sort] = ["_doc"]
173
116
  end
174
- body[:query] = { query_string: { query: query } } unless query.nil?
175
- search_option = { index: @index_name, type: type, body: body }
176
- search_option[:routing] = routing unless routing.nil?
117
+ search_option = { index: @index_name, type: type, scroll: '1m', body: body, size: size }
177
118
  search_option[:_source] = fields.select{ |field| !field['metadata'] }.map { |field| field['name'] }.join(',')
178
- Embulk.logger.info(%Q{search_option => #{search_option}})
179
- @client.search(search_option)
119
+ search_option
180
120
  end
181
121
 
182
122
  def get_sources(results, fields)
@@ -191,6 +131,37 @@ module Embulk
191
131
  }
192
132
  }
193
133
  end
134
+
135
+ def convert_value(value, field)
136
+ return nil if value.nil?
137
+ case field["type"]
138
+ when "string"
139
+ value
140
+ when "long"
141
+ value.to_i
142
+ when "double"
143
+ value.to_f
144
+ when "boolean"
145
+ if value.is_a?(TrueClass) || value.is_a?(FalseClass)
146
+ value
147
+ else
148
+ downcased_val = value.downcase
149
+ case downcased_val
150
+ when 'true' then true
151
+ when 'false' then false
152
+ when '1' then true
153
+ when '0' then false
154
+ else nil
155
+ end
156
+ end
157
+ when "timestamp"
158
+ Time.parse(value)
159
+ when "json"
160
+ value
161
+ else
162
+ raise "Unsupported type #{field['type']}"
163
+ end
164
+ end
194
165
  end
195
166
  end
196
167
  end
lib/embulk/input/elasticsearch/connection.rb ADDED
@@ -0,0 +1,25 @@
1
+ module Embulk
2
+ module Input
3
+ class Elasticsearch < InputPlugin
4
+ class Connection
5
+ def self.create_client(task)
6
+ transport = ::Elasticsearch::Transport::Transport::HTTP::Faraday.new(
7
+ {
8
+ hosts: task['nodes'].map{ |node| Hash[node.map{ |k, v| [k.to_sym, v] }] },
9
+ options: {
10
+ reload_connections: task['reload_connections'],
11
+ reload_on_failure: task['reload_on_failure'],
12
+ retry_on_failure: task['retry_on_failure'],
13
+ transport_options: {
14
+ request: { timeout: task['request_timeout'] }
15
+ }
16
+ }
17
+ }
18
+ )
19
+
20
+ ::Elasticsearch::Client.new transport: transport
21
+ end
22
+ end
23
+ end
24
+ end
25
+ end
lib/embulk/input/elasticsearch/input_thread.rb ADDED
@@ -0,0 +1,19 @@
1
+ module Embulk
2
+ module Input
3
+ class Elasticsearch < InputPlugin
4
+ class InputThread
5
+ def self.get_slice_from_num_threads(array, define_num_threads)
6
+ num_threads = array.size < define_num_threads ? array.size : define_num_threads
7
+ per_queries = if (array.size % num_threads) == 0
8
+ (array.size / num_threads)
9
+ else
10
+ (array.size / num_threads) + 1
11
+ end
12
+ sliced = array.each_slice(per_queries).to_a
13
+ Embulk.logger.info("calculate num threads => #{sliced.size}")
14
+ return sliced
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
test/test_transaction.rb CHANGED
@@ -13,25 +13,25 @@ module Embulk
13
13
 
14
14
  sub_test_case "get_slice_from_num_threads" do
15
15
  def test_normal
16
- slice = Elasticsearch.get_slice_from_num_threads((1..10).to_a, 5)
16
+ slice = InputThread.get_slice_from_num_threads((1..10).to_a, 5)
17
17
  assert_equal slice.size, 5
18
18
  assert_equal slice.first.size, 2
19
19
  end
20
20
 
21
21
  def test_normal_same
22
- slice = Elasticsearch.get_slice_from_num_threads((1..3).to_a, 3)
22
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 3)
23
23
  assert_equal slice.size, 3
24
24
  assert_equal slice.first.size, 1
25
25
  end
26
26
 
27
27
  def test_num_threads_over_array_size
28
- slice = Elasticsearch.get_slice_from_num_threads((1..3).to_a, 10)
28
+ slice = InputThread.get_slice_from_num_threads((1..3).to_a, 10)
29
29
  assert_equal slice.size, 3
30
30
  assert_equal slice.first.size, 1
31
31
  end
32
32
 
33
33
  def test_rest
34
- slice = Elasticsearch.get_slice_from_num_threads((1..20).to_a, 8)
34
+ slice = InputThread.get_slice_from_num_threads((1..20).to_a, 8)
35
35
  assert_equal slice.size, 7
36
36
  assert_equal slice.first.size, 3
37
37
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: embulk-input-elasticsearch
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - toyama0919
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-06-21 00:00:00.000000000 Z
11
+ date: 2016-06-24 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  requirement: !ruby/object:Gem::Requirement
@@ -110,6 +110,8 @@ files:
110
110
  - Rakefile
111
111
  - embulk-input-elasticsearch.gemspec
112
112
  - lib/embulk/input/elasticsearch.rb
113
+ - lib/embulk/input/elasticsearch/connection.rb
114
+ - lib/embulk/input/elasticsearch/input_thread.rb
113
115
  - test/helper.rb
114
116
  - test/test_transaction.rb
115
117
  homepage: https://github.com/toyama0919/embulk-input-elasticsearch