esse 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: dc6a554b787bfb51e5568bd590790171cc88ad8c5ce432e1de2c3ef68a5d1def
4
- data.tar.gz: cac82a25d6e8dd6dad3667a734d58be098f4b7020db76472359f7bdb4fe2067c
3
+ metadata.gz: 15b531b7c6876d665c12fd773d25fa68acda30c4c92cf7999790315f4fe35b2d
4
+ data.tar.gz: 9520b76a128e752bed3db6d2a1a96e38e2107ca790c6d0c0327c2690d91b21d7
5
5
  SHA512:
6
- metadata.gz: f26afad34715ed37cec3f1d1cad11ca6e05d2921a6f31884d9ea9f1136ef239ba7661e29796c37abf69ea19aa5f5407884cf88b108bbfa211e5639e371d492ed
7
- data.tar.gz: 13b1921d85723d3c33bacc54c7bcf95885a196c3a62c095a5ea3be5a5d5061b8cd4eb8fbb1bdbf1bb91c141ed4457f9a1baab5ec121dd7e4f4dc1c1c423277d2
6
+ metadata.gz: 865aa67f404a82e573009e73186dd60ee9c3f978ba3c93b39337184c2b8c9c9415bc116573c5eb4f21cb7e4ffc7e19aae4957e54fe7b8aa99a8804e86469b13f
7
+ data.tar.gz: bcacb23ae7b1b1a073298a59090455e30d6aa9531b93a815871762ed9e3364de873795c47ca0b2d495d8d9394110fd623011d3e4e3054a34ddf89b5e35947ded
@@ -13,13 +13,16 @@ module Esse
13
13
  # In case of timeout error, will retry with an exponential backoff using the following formula:
14
14
  # wait_interval = (retry_count**4) + 15 + (rand(10) * (retry_count + 1)) seconds. It will retry up to max_retries times that is default 4.
15
15
  #
16
- # Too large bulk requests will be split into multiple requests with only one attempt.
16
+ # Too large bulk requests will first be split into multiple size-balanced requests; if that still
17
+ # returns 413, the bulk is retried one document per request as a last resort. Only after a single
18
+ # document still returns 413 does the error bubble up.
17
19
  #
18
20
  # @yield [RequestBody] A request body instance
19
- def each_request(max_retries: 4, last_retry_in_small_chunks: true)
21
+ def each_request(max_retries: 4, last_retry_in_small_chunks: true, last_retry_per_document: true)
20
22
  # @TODO create indexes when by checking all the index suffixes (if mapping is not empty)
21
23
  requests = [optimistic_request]
22
24
  retry_count = 0
25
+ too_large_retry_count = 0
23
26
 
24
27
  begin
25
28
  requests.each do |request|
@@ -37,12 +40,28 @@ module Esse
37
40
  sleep(wait_interval)
38
41
  retry
39
42
  rescue Esse::Transport::RequestEntityTooLargeError => e
40
- retry_count += 1
41
- raise e if retry_count > 1 # only retry once on this error
42
- requests = balance_requests_size(e)
43
+ too_large_retry_count += 1
44
+ raise e if too_large_retry_count > 2
45
+
46
+ if too_large_retry_count == 1
47
+ balanced = balance_requests_size(e)
48
+ if balanced && !balanced.empty?
49
+ requests = balanced
50
+ Esse.logger.warn <<~MSG
51
+ Request entity too large, retrying with a bulk with: #{requests.map(&:bytesize).join(' + ')}.
52
+ Note that this cause performance degradation, consider adjusting the batch_size of the index or increasing the bulk size.
53
+ MSG
54
+ retry
55
+ end
56
+ raise e unless last_retry_per_document
57
+ too_large_retry_count = 2
58
+ end
59
+
60
+ raise e unless last_retry_per_document
61
+ requests = requests_per_document
43
62
  Esse.logger.warn <<~MSG
44
- Request entity too large, retrying with a bulk with: #{requests.map(&:bytesize).join(' + ')}.
45
- Note that this cause performance degradation, consider adjusting the batch_size of the index or increasing the bulk size.
63
+ Request entity too large after balancing, retrying one document per request as a last resort.
64
+ If a single document still exceeds the bulk size, the error will be raised.
46
65
  MSG
47
66
  retry
48
67
  end
@@ -60,45 +79,57 @@ module Esse
60
79
  end
61
80
 
62
81
  def requests_in_small_chunks(chunk_size: 1)
82
+ arr = build_per_document_requests(chunk_size: chunk_size)
83
+ Esse.logger.warn <<~MSG
84
+ Retrying the last request in small chunks of #{chunk_size} documents.
85
+ This is a last resort to avoid timeout errors, consider increasing the bulk size or reducing the batch size.
86
+ MSG
87
+ arr
88
+ end
89
+
90
+ def requests_per_document
91
+ build_per_document_requests(chunk_size: 1)
92
+ end
93
+
94
+ def build_per_document_requests(chunk_size: 1)
63
95
  arr = []
64
96
  @create.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.create = slice } }
65
97
  @index.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.index = slice } }
66
98
  @update.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.update = slice } }
67
99
  @delete.each_slice(chunk_size) { |slice| arr << Import::RequestBodyAsJson.new.tap { |r| r.delete = slice } }
68
- Esse.logger.warn <<~MSG
69
- Retrying the last request in small chunks of #{chunk_size} documents.
70
- This is a last resort to avoid timeout errors, consider increasing the bulk size or reducing the batch size.
71
- MSG
72
100
  arr
73
101
  end
74
102
 
75
- # @return [Array<RequestBody>]
103
+ # @return [Array<RequestBody>, nil] balanced requests, or nil when the error message has no parseable byte limit
76
104
  def balance_requests_size(err)
77
- if (bulk_size = err.message.scan(/exceeded.(\d+).bytes/).dig(0, 0).to_i) > 0
78
- requests = (@create + @index + @update + @delete).each_with_object([Import::RequestBodyRaw.new]) do |as_json, result|
79
- operation, meta = as_json.to_a.first
80
- meta = meta.dup
81
- data = meta.delete(:data)
82
- piece = MultiJson.dump(operation => meta)
83
- piece << "\n" << MultiJson.dump(data) if data
84
- if piece.bytesize > bulk_size
85
- Esse.logger.warn <<~MSG
86
- The document #{meta.inspect} size is #{piece.bytesize} bytes, which exceeds the maximum bulk size of #{bulk_size} bytes.
87
- Consider increasing the bulk size or reducing the document size. The document will be ignored during this import.
88
- MSG
89
- next
90
- end
105
+ bulk_size = err.message.scan(/exceeded.(\d+).bytes/).dig(0, 0).to_i
106
+ return nil unless bulk_size > 0
91
107
 
92
- if result.last.body.bytesize + piece.bytesize > bulk_size
93
- result.push(Import::RequestBodyRaw.new.tap { |r| r.add(operation, piece) })
94
- else
95
- result[-1].add(operation, piece)
96
- end
108
+ requests = (@create + @index + @update + @delete).each_with_object([Import::RequestBodyRaw.new]) do |as_json, result|
109
+ operation, meta = as_json.to_a.first
110
+ meta = meta.dup
111
+ data = meta.delete(:data)
112
+ piece = MultiJson.dump(operation => meta)
113
+ piece << "\n" << MultiJson.dump(data) if data
114
+
115
+ if piece.bytesize > bulk_size
116
+ Esse.logger.warn <<~MSG
117
+ The document #{meta.inspect} size is #{piece.bytesize} bytes, which exceeds the maximum bulk size of #{bulk_size} bytes.
118
+ It will be sent in its own request; if the cluster rejects it, the error will be raised.
119
+ MSG
120
+ result.push(Import::RequestBodyRaw.new.tap { |r| r.add(operation, piece) })
121
+ result.push(Import::RequestBodyRaw.new)
122
+ next
123
+ end
124
+
125
+ if result.last.body.bytesize + piece.bytesize > bulk_size
126
+ result.push(Import::RequestBodyRaw.new.tap { |r| r.add(operation, piece) })
127
+ else
128
+ result[-1].add(operation, piece)
97
129
  end
98
- requests.each(&:finalize)
99
- else
100
- raise err
101
130
  end
131
+ requests.reject! { |r| r.body.empty? }
132
+ requests.each(&:finalize)
102
133
  end
103
134
  end
104
135
  end
@@ -4,29 +4,21 @@ module Esse
4
4
  # https://github.com/elastic/elasticsearch-ruby/blob/master/elasticsearch-api/lib/elasticsearch/api/actions/indices/put_settings.rb
5
5
  class Index
6
6
  module ClassMethods
7
- # Elasticsearch supports passing index.* related settings directly in the body of the request.
8
- # We are moving it to the index key to make it more explicit and to be the source-of-truth when merging settings.
9
- # So the settings `{ number_of_shards: 1 }` will be transformed to `{ index: { number_of_shards: 1 } }`
10
- INDEX_SIMPLIFIED_SETTINGS = %i[
11
- number_of_shards
12
- number_of_replicas
13
- refresh_interval
14
- mapping
15
- ].freeze
7
+ # Backwards-compatible alias. The canonical list now lives on
8
+ # +Esse::IndexSetting::INDEX_SIMPLIFIED_SETTINGS+ so that the merge
9
+ # logic and the simplified-key promotion stay in sync.
10
+ INDEX_SIMPLIFIED_SETTINGS = Esse::IndexSetting::INDEX_SIMPLIFIED_SETTINGS
16
11
 
17
12
  def settings_hash(settings: nil)
18
- hash = setting.body
19
- values = (hash.key?(Esse::SETTING_ROOT_KEY) ? hash[Esse::SETTING_ROOT_KEY] : hash)
20
- values = HashUtils.explode_keys(values)
21
- if settings.is_a?(Hash)
22
- values = HashUtils.deep_merge(values, HashUtils.explode_keys(settings))
23
- end
24
- INDEX_SIMPLIFIED_SETTINGS.each do |key|
25
- next unless values.key?(key)
26
- value = values.delete(key)
27
- next if value.nil?
13
+ # Normalize each side (global vs local) separately before merging so
14
+ # a flat global key (e.g. top-level :number_of_shards) cannot clobber
15
+ # an explicit nested local value (e.g. :index => { :number_of_shards => 8 }).
16
+ global = Esse::IndexSetting.normalize(setting.globals)
17
+ local = Esse::IndexSetting.normalize(setting.to_h)
18
+ values = HashUtils.deep_merge(global, local)
28
19
 
29
- (values[:index] ||= {}).merge!(key => value)
20
+ if settings.is_a?(Hash)
21
+ values = HashUtils.deep_merge(values, Esse::IndexSetting.normalize(settings))
30
22
  end
31
23
 
32
24
  if values[:index].is_a?(Hash)
@@ -3,6 +3,17 @@
3
3
  module Esse
4
4
  # https://www.elastic.co/guide/en/elasticsearch/reference/1.7/indices.html
5
5
  class IndexSetting
6
+ # Top-level keys that Elasticsearch/OpenSearch accept either flat or nested
7
+ # under `index:`. We always promote them to the nested form so that values
8
+ # from different sources (cluster globals vs per-index template) merge
9
+ # predictably regardless of which form each side was authored in.
10
+ INDEX_SIMPLIFIED_SETTINGS = %i[
11
+ number_of_shards
12
+ number_of_replicas
13
+ refresh_interval
14
+ mapping
15
+ ].freeze
16
+
6
17
  # @param [Hash] options
7
18
  # @option options [Proc] :globals A proc that will be called to load global settings
8
19
  # @option options [Array] :paths A list of paths to load settings from
@@ -35,6 +46,38 @@ module Esse
35
46
  HashUtils.deep_merge(global, local)
36
47
  end
37
48
 
49
+ # Returns the raw (unsymbolized) global settings as supplied by the
50
+ # +globals+ proc. Public so that callers like
51
+ # +Esse::Index.settings_hash+ can normalize it independently before
52
+ # merging it with the local template — preventing a flat global value
53
+ # from clobbering a nested local value once both are merged.
54
+ def globals
55
+ @globals.call || {}
56
+ end
57
+
58
+ # Normalize a settings hash by:
59
+ # * symbolizing keys
60
+ # * stripping the `:settings` root if present
61
+ # * exploding dotted keys ('index.number_of_replicas' -> { index: { number_of_replicas: ... } })
62
+ # * promoting simplified flat keys (number_of_shards, etc.) into the
63
+ # nested `:index` form, preserving any value already present under
64
+ # `:index` (we never overwrite an explicit nested setting with a
65
+ # flat value from the same source).
66
+ def self.normalize(hash)
67
+ values = HashUtils.deep_transform_keys(hash || {}, &:to_sym)
68
+ values = values[Esse::SETTING_ROOT_KEY] if values.key?(Esse::SETTING_ROOT_KEY)
69
+ values = HashUtils.explode_keys(values)
70
+ INDEX_SIMPLIFIED_SETTINGS.each do |key|
71
+ next unless values.key?(key)
72
+ value = values.delete(key)
73
+ next if value.nil?
74
+
75
+ values[:index] ||= {}
76
+ values[:index][key] = value unless values[:index].key?(key)
77
+ end
78
+ values
79
+ end
80
+
38
81
  protected
39
82
 
40
83
  def from_template
data/lib/esse/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Esse
4
- VERSION = '0.4.0'
4
+ VERSION = '0.5.0'
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: esse
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.5.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Marcos G. Zimmermann
8
8
  autorequire:
9
9
  bindir: exec
10
10
  cert_chain: []
11
- date: 2026-03-18 00:00:00.000000000 Z
11
+ date: 2026-05-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: multi_json