esse 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/esse/import/bulk.rb +65 -34
- data/lib/esse/index/settings.rb +12 -20
- data/lib/esse/index_setting.rb +43 -0
- data/lib/esse/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 15b531b7c6876d665c12fd773d25fa68acda30c4c92cf7999790315f4fe35b2d
|
|
4
|
+
data.tar.gz: 9520b76a128e752bed3db6d2a1a96e38e2107ca790c6d0c0327c2690d91b21d7
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 865aa67f404a82e573009e73186dd60ee9c3f978ba3c93b39337184c2b8c9c9415bc116573c5eb4f21cb7e4ffc7e19aae4957e54fe7b8aa99a8804e86469b13f
|
|
7
|
+
data.tar.gz: bcacb23ae7b1b1a073298a59090455e30d6aa9531b93a815871762ed9e3364de873795c47ca0b2d495d8d9394110fd623011d3e4e3054a34ddf89b5e35947ded
|
data/lib/esse/import/bulk.rb
CHANGED
|
@@ -13,13 +13,16 @@ module Esse
|
|
|
13
13
|
# In case of timeout error, will retry with an exponential backoff using the following formula:
|
|
14
14
|
# wait_interval = (retry_count**4) + 15 + (rand(10) * (retry_count + 1)) seconds. It will retry up to max_retries times that is default 4.
|
|
15
15
|
#
|
|
16
|
-
# Too large bulk requests will be split into multiple requests
|
|
16
|
+
# Too large bulk requests will first be split into multiple size-balanced requests; if that still
|
|
17
|
+
# returns 413, the bulk is retried one document per request as a last resort. Only after a single
|
|
18
|
+
# document still returns 413 does the error bubble up.
|
|
17
19
|
#
|
|
18
20
|
# @yield [RequestBody] A request body instance
|
|
19
|
-
def each_request(max_retries: 4, last_retry_in_small_chunks: true)
|
|
21
|
+
def each_request(max_retries: 4, last_retry_in_small_chunks: true, last_retry_per_document: true)
|
|
20
22
|
# @TODO create indexes when by checking all the index suffixes (if mapping is not empty)
|
|
21
23
|
requests = [optimistic_request]
|
|
22
24
|
retry_count = 0
|
|
25
|
+
too_large_retry_count = 0
|
|
23
26
|
|
|
24
27
|
begin
|
|
25
28
|
requests.each do |request|
|
|
@@ -37,12 +40,28 @@ module Esse
|
|
|
37
40
|
sleep(wait_interval)
|
|
38
41
|
retry
|
|
39
42
|
rescue Esse::Transport::RequestEntityTooLargeError => e
|
|
40
|
-
|
|
41
|
-
raise e if
|
|
42
|
-
|
|
43
|
+
too_large_retry_count += 1
|
|
44
|
+
raise e if too_large_retry_count > 2
|
|
45
|
+
|
|
46
|
+
if too_large_retry_count == 1
|
|
47
|
+
balanced = balance_requests_size(e)
|
|
48
|
+
if balanced && !balanced.empty?
|
|
49
|
+
requests = balanced
|
|
50
|
+
Esse.logger.warn <<~MSG
|
|
51
|
+
Request entity too large, retrying with a bulk with: #{requests.map(&:bytesize).join(' + ')}.
|
|
52
|
+
Note that this cause performance degradation, consider adjusting the batch_size of the index or increasing the bulk size.
|
|
53
|
+
MSG
|
|
54
|
+
retry
|
|
55
|
+
end
|
|
56
|
+
raise e unless last_retry_per_document
|
|
57
|
+
too_large_retry_count = 2
|
|
58
|
+
end
|
|
59
|
+
|
|
60
|
+
raise e unless last_retry_per_document
|
|
61
|
+
requests = requests_per_document
|
|
43
62
|
Esse.logger.warn <<~MSG
|
|
44
|
-
Request entity too large, retrying
|
|
45
|
-
|
|
63
|
+
Request entity too large after balancing, retrying one document per request as a last resort.
|
|
64
|
+
If a single document still exceeds the bulk size, the error will be raised.
|
|
46
65
|
MSG
|
|
47
66
|
retry
|
|
48
67
|
end
|
|
@@ -60,45 +79,57 @@ module Esse
|
|
|
60
79
|
end
|
|
61
80
|
|
|
62
81
|
def requests_in_small_chunks(chunk_size: 1)
|
|
82
|
+
arr = build_per_document_requests(chunk_size: chunk_size)
|
|
83
|
+
Esse.logger.warn <<~MSG
|
|
84
|
+
Retrying the last request in small chunks of #{chunk_size} documents.
|
|
85
|
+
This is a last resort to avoid timeout errors, consider increasing the bulk size or reducing the batch size.
|
|
86
|
+
MSG
|
|
87
|
+
arr
|
|
88
|
+
end
|
|
89
|
+
|
|
90
|
+
def requests_per_document
|
|
91
|
+
build_per_document_requests(chunk_size: 1)
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
def build_per_document_requests(chunk_size: 1)
|
|
63
95
|
arr = []
|
|
64
96
|
@create.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.create = slice } }
|
|
65
97
|
@index.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.index = slice } }
|
|
66
98
|
@update.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.update = slice } }
|
|
67
99
|
@delete.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.delete = slice } }
|
|
68
|
-
Esse.logger.warn <<~MSG
|
|
69
|
-
Retrying the last request in small chunks of #{chunk_size} documents.
|
|
70
|
-
This is a last resort to avoid timeout errors, consider increasing the bulk size or reducing the batch size.
|
|
71
|
-
MSG
|
|
72
100
|
arr
|
|
73
101
|
end
|
|
74
102
|
|
|
75
|
-
# @return [Array<RequestBody
|
|
103
|
+
# @return [Array<RequestBody>, nil] balanced requests, or nil when the error message has no parseable byte limit
|
|
76
104
|
def balance_requests_size(err)
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
operation, meta = as_json.to_a.first
|
|
80
|
-
meta = meta.dup
|
|
81
|
-
data = meta.delete(:data)
|
|
82
|
-
piece = MultiJson.dump(operation => meta)
|
|
83
|
-
piece << "\n" << MultiJson.dump(data) if data
|
|
84
|
-
if piece.bytesize > bulk_size
|
|
85
|
-
Esse.logger.warn <<~MSG
|
|
86
|
-
The document #{meta.inspect} size is #{piece.bytesize} bytes, which exceeds the maximum bulk size of #{bulk_size} bytes.
|
|
87
|
-
Consider increasing the bulk size or reducing the document size. The document will be ignored during this import.
|
|
88
|
-
MSG
|
|
89
|
-
next
|
|
90
|
-
end
|
|
105
|
+
bulk_size = err.message.scan(/exceeded.(\d+).bytes/).dig(0, 0).to_i
|
|
106
|
+
return nil unless bulk_size > 0
|
|
91
107
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
108
|
+
requests = (@create + @index + @update + @delete).each_with_object([Import::RequestBodyRaw.new]) do |as_json, result|
|
|
109
|
+
operation, meta = as_json.to_a.first
|
|
110
|
+
meta = meta.dup
|
|
111
|
+
data = meta.delete(:data)
|
|
112
|
+
piece = MultiJson.dump(operation => meta)
|
|
113
|
+
piece << "\n" << MultiJson.dump(data) if data
|
|
114
|
+
|
|
115
|
+
if piece.bytesize > bulk_size
|
|
116
|
+
Esse.logger.warn <<~MSG
|
|
117
|
+
The document #{meta.inspect} size is #{piece.bytesize} bytes, which exceeds the maximum bulk size of #{bulk_size} bytes.
|
|
118
|
+
It will be sent in its own request; if the cluster rejects it, the error will be raised.
|
|
119
|
+
MSG
|
|
120
|
+
result.push(Import::RequestBodyRaw.new.tap { |r| r.add(operation, piece) })
|
|
121
|
+
result.push(Import::RequestBodyRaw.new)
|
|
122
|
+
next
|
|
123
|
+
end
|
|
124
|
+
|
|
125
|
+
if result.last.body.bytesize + piece.bytesize > bulk_size
|
|
126
|
+
result.push(Import::RequestBodyRaw.new.tap { |r| r.add(operation, piece) })
|
|
127
|
+
else
|
|
128
|
+
result[-1].add(operation, piece)
|
|
97
129
|
end
|
|
98
|
-
requests.each(&:finalize)
|
|
99
|
-
else
|
|
100
|
-
raise err
|
|
101
130
|
end
|
|
131
|
+
requests.reject! { |r| r.body.empty? }
|
|
132
|
+
requests.each(&:finalize)
|
|
102
133
|
end
|
|
103
134
|
end
|
|
104
135
|
end
|
data/lib/esse/index/settings.rb
CHANGED
|
@@ -4,29 +4,21 @@ module Esse
|
|
|
4
4
|
# https://github.com/elastic/elasticsearch-ruby/blob/master/elasticsearch-api/lib/elasticsearch/api/actions/indices/put_settings.rb
|
|
5
5
|
class Index
|
|
6
6
|
module ClassMethods
|
|
7
|
-
#
|
|
8
|
-
#
|
|
9
|
-
#
|
|
10
|
-
INDEX_SIMPLIFIED_SETTINGS = %i[
|
|
11
|
-
number_of_shards
|
|
12
|
-
number_of_replicas
|
|
13
|
-
refresh_interval
|
|
14
|
-
mapping
|
|
15
|
-
].freeze
|
|
7
|
+
# Backwards-compatible alias. The canonical list now lives on
|
|
8
|
+
# +Esse::IndexSetting::INDEX_SIMPLIFIED_SETTINGS+ so that the merge
|
|
9
|
+
# logic and the simplified-key promotion stay in sync.
|
|
10
|
+
INDEX_SIMPLIFIED_SETTINGS = Esse::IndexSetting::INDEX_SIMPLIFIED_SETTINGS
|
|
16
11
|
|
|
17
12
|
def settings_hash(settings: nil)
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
INDEX_SIMPLIFIED_SETTINGS.each do |key|
|
|
25
|
-
next unless values.key?(key)
|
|
26
|
-
value = values.delete(key)
|
|
27
|
-
next if value.nil?
|
|
13
|
+
# Normalize each side (global vs local) separately before merging so
|
|
14
|
+
# a flat global key (e.g. top-level :number_of_shards) cannot clobber
|
|
15
|
+
# an explicit nested local value (e.g. :index => { :number_of_shards => 8 }).
|
|
16
|
+
global = Esse::IndexSetting.normalize(setting.globals)
|
|
17
|
+
local = Esse::IndexSetting.normalize(setting.to_h)
|
|
18
|
+
values = HashUtils.deep_merge(global, local)
|
|
28
19
|
|
|
29
|
-
|
|
20
|
+
if settings.is_a?(Hash)
|
|
21
|
+
values = HashUtils.deep_merge(values, Esse::IndexSetting.normalize(settings))
|
|
30
22
|
end
|
|
31
23
|
|
|
32
24
|
if values[:index].is_a?(Hash)
|
data/lib/esse/index_setting.rb
CHANGED
|
@@ -3,6 +3,17 @@
|
|
|
3
3
|
module Esse
|
|
4
4
|
# https://www.elastic.co/guide/en/elasticsearch/reference/1.7/indices.html
|
|
5
5
|
class IndexSetting
|
|
6
|
+
# Top-level keys that Elasticsearch/OpenSearch accept either flat or nested
|
|
7
|
+
# under `index:`. We always promote them to the nested form so that values
|
|
8
|
+
# from different sources (cluster globals vs per-index template) merge
|
|
9
|
+
# predictably regardless of which form each side was authored in.
|
|
10
|
+
INDEX_SIMPLIFIED_SETTINGS = %i[
|
|
11
|
+
number_of_shards
|
|
12
|
+
number_of_replicas
|
|
13
|
+
refresh_interval
|
|
14
|
+
mapping
|
|
15
|
+
].freeze
|
|
16
|
+
|
|
6
17
|
# @param [Hash] options
|
|
7
18
|
# @option options [Proc] :globals A proc that will be called to load global settings
|
|
8
19
|
# @option options [Array] :paths A list of paths to load settings from
|
|
@@ -35,6 +46,38 @@ module Esse
|
|
|
35
46
|
HashUtils.deep_merge(global, local)
|
|
36
47
|
end
|
|
37
48
|
|
|
49
|
+
# Returns the raw (unsymbolized) global settings as supplied by the
|
|
50
|
+
# +globals+ proc. Public so that callers like
|
|
51
|
+
# +Esse::Index.settings_hash+ can normalize it independently before
|
|
52
|
+
# merging it with the local template — preventing a flat global value
|
|
53
|
+
# from clobbering a nested local value once both are merged.
|
|
54
|
+
def globals
|
|
55
|
+
@globals.call || {}
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Normalize a settings hash by:
|
|
59
|
+
# * symbolizing keys
|
|
60
|
+
# * stripping the `:settings` root if present
|
|
61
|
+
# * exploding dotted keys ('index.number_of_replicas' -> { index: { number_of_replicas: ... } })
|
|
62
|
+
# * promoting simplified flat keys (number_of_shards, etc.) into the
|
|
63
|
+
# nested `:index` form, preserving any value already present under
|
|
64
|
+
# `:index` (we never overwrite an explicit nested setting with a
|
|
65
|
+
# flat value from the same source).
|
|
66
|
+
def self.normalize(hash)
|
|
67
|
+
values = HashUtils.deep_transform_keys(hash || {}, &:to_sym)
|
|
68
|
+
values = values[Esse::SETTING_ROOT_KEY] if values.key?(Esse::SETTING_ROOT_KEY)
|
|
69
|
+
values = HashUtils.explode_keys(values)
|
|
70
|
+
INDEX_SIMPLIFIED_SETTINGS.each do |key|
|
|
71
|
+
next unless values.key?(key)
|
|
72
|
+
value = values.delete(key)
|
|
73
|
+
next if value.nil?
|
|
74
|
+
|
|
75
|
+
values[:index] ||= {}
|
|
76
|
+
values[:index][key] = value unless values[:index].key?(key)
|
|
77
|
+
end
|
|
78
|
+
values
|
|
79
|
+
end
|
|
80
|
+
|
|
38
81
|
protected
|
|
39
82
|
|
|
40
83
|
def from_template
|
data/lib/esse/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: esse
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.0
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Marcos G. Zimmermann
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: exec
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2026-
|
|
11
|
+
date: 2026-05-15 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: multi_json
|