search-engine-for-typesense 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139)
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +148 -0
  4. data/app/search_engine/search_engine/app_info.rb +11 -0
  5. data/app/search_engine/search_engine/index_partition_job.rb +170 -0
  6. data/lib/generators/search_engine/install/install_generator.rb +20 -0
  7. data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
  8. data/lib/generators/search_engine/model/model_generator.rb +86 -0
  9. data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
  10. data/lib/search-engine-for-typesense.rb +12 -0
  11. data/lib/search_engine/active_record_syncable.rb +247 -0
  12. data/lib/search_engine/admin/stopwords.rb +125 -0
  13. data/lib/search_engine/admin/synonyms.rb +125 -0
  14. data/lib/search_engine/admin.rb +12 -0
  15. data/lib/search_engine/ast/and.rb +52 -0
  16. data/lib/search_engine/ast/binary_op.rb +75 -0
  17. data/lib/search_engine/ast/eq.rb +19 -0
  18. data/lib/search_engine/ast/group.rb +18 -0
  19. data/lib/search_engine/ast/gt.rb +12 -0
  20. data/lib/search_engine/ast/gte.rb +12 -0
  21. data/lib/search_engine/ast/in.rb +28 -0
  22. data/lib/search_engine/ast/lt.rb +12 -0
  23. data/lib/search_engine/ast/lte.rb +12 -0
  24. data/lib/search_engine/ast/matches.rb +55 -0
  25. data/lib/search_engine/ast/node.rb +176 -0
  26. data/lib/search_engine/ast/not_eq.rb +13 -0
  27. data/lib/search_engine/ast/not_in.rb +24 -0
  28. data/lib/search_engine/ast/or.rb +52 -0
  29. data/lib/search_engine/ast/prefix.rb +51 -0
  30. data/lib/search_engine/ast/raw.rb +41 -0
  31. data/lib/search_engine/ast/unary_op.rb +43 -0
  32. data/lib/search_engine/ast.rb +101 -0
  33. data/lib/search_engine/base/creation.rb +727 -0
  34. data/lib/search_engine/base/deletion.rb +80 -0
  35. data/lib/search_engine/base/display_coercions.rb +36 -0
  36. data/lib/search_engine/base/hydration.rb +312 -0
  37. data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
  38. data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
  39. data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
  40. data/lib/search_engine/base/index_maintenance.rb +459 -0
  41. data/lib/search_engine/base/indexing_dsl.rb +255 -0
  42. data/lib/search_engine/base/joins.rb +479 -0
  43. data/lib/search_engine/base/model_dsl.rb +472 -0
  44. data/lib/search_engine/base/presets.rb +43 -0
  45. data/lib/search_engine/base/pretty_printer.rb +315 -0
  46. data/lib/search_engine/base/relation_delegation.rb +42 -0
  47. data/lib/search_engine/base/scopes.rb +113 -0
  48. data/lib/search_engine/base/updating.rb +92 -0
  49. data/lib/search_engine/base.rb +38 -0
  50. data/lib/search_engine/bulk.rb +284 -0
  51. data/lib/search_engine/cache.rb +33 -0
  52. data/lib/search_engine/cascade.rb +531 -0
  53. data/lib/search_engine/cli/doctor.rb +631 -0
  54. data/lib/search_engine/cli/support.rb +217 -0
  55. data/lib/search_engine/cli.rb +222 -0
  56. data/lib/search_engine/client/http_adapter.rb +63 -0
  57. data/lib/search_engine/client/request_builder.rb +92 -0
  58. data/lib/search_engine/client/services/base.rb +74 -0
  59. data/lib/search_engine/client/services/collections.rb +161 -0
  60. data/lib/search_engine/client/services/documents.rb +214 -0
  61. data/lib/search_engine/client/services/operations.rb +152 -0
  62. data/lib/search_engine/client/services/search.rb +190 -0
  63. data/lib/search_engine/client/services.rb +29 -0
  64. data/lib/search_engine/client.rb +765 -0
  65. data/lib/search_engine/client_options.rb +20 -0
  66. data/lib/search_engine/collection_resolver.rb +191 -0
  67. data/lib/search_engine/collections_graph.rb +330 -0
  68. data/lib/search_engine/compiled_params.rb +143 -0
  69. data/lib/search_engine/compiler.rb +383 -0
  70. data/lib/search_engine/config/observability.rb +27 -0
  71. data/lib/search_engine/config/presets.rb +92 -0
  72. data/lib/search_engine/config/selection.rb +16 -0
  73. data/lib/search_engine/config/typesense.rb +48 -0
  74. data/lib/search_engine/config/validators.rb +97 -0
  75. data/lib/search_engine/config.rb +917 -0
  76. data/lib/search_engine/console_helpers.rb +130 -0
  77. data/lib/search_engine/deletion.rb +103 -0
  78. data/lib/search_engine/dispatcher.rb +125 -0
  79. data/lib/search_engine/dsl/parser.rb +582 -0
  80. data/lib/search_engine/engine.rb +167 -0
  81. data/lib/search_engine/errors.rb +290 -0
  82. data/lib/search_engine/filters/sanitizer.rb +189 -0
  83. data/lib/search_engine/hydration/materializers.rb +808 -0
  84. data/lib/search_engine/hydration/selection_context.rb +96 -0
  85. data/lib/search_engine/indexer/batch_planner.rb +76 -0
  86. data/lib/search_engine/indexer/bulk_import.rb +626 -0
  87. data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
  88. data/lib/search_engine/indexer/retry_policy.rb +103 -0
  89. data/lib/search_engine/indexer.rb +747 -0
  90. data/lib/search_engine/instrumentation.rb +308 -0
  91. data/lib/search_engine/joins/guard.rb +202 -0
  92. data/lib/search_engine/joins/resolver.rb +95 -0
  93. data/lib/search_engine/logging/color.rb +78 -0
  94. data/lib/search_engine/logging/format_helpers.rb +92 -0
  95. data/lib/search_engine/logging/partition_progress.rb +53 -0
  96. data/lib/search_engine/logging_subscriber.rb +388 -0
  97. data/lib/search_engine/mapper.rb +785 -0
  98. data/lib/search_engine/multi.rb +286 -0
  99. data/lib/search_engine/multi_result.rb +186 -0
  100. data/lib/search_engine/notifications/compact_logger.rb +675 -0
  101. data/lib/search_engine/observability.rb +162 -0
  102. data/lib/search_engine/operations.rb +58 -0
  103. data/lib/search_engine/otel.rb +227 -0
  104. data/lib/search_engine/partitioner.rb +128 -0
  105. data/lib/search_engine/ranking_plan.rb +118 -0
  106. data/lib/search_engine/registry.rb +158 -0
  107. data/lib/search_engine/relation/compiler.rb +711 -0
  108. data/lib/search_engine/relation/deletion.rb +37 -0
  109. data/lib/search_engine/relation/dsl/filters.rb +624 -0
  110. data/lib/search_engine/relation/dsl/selection.rb +240 -0
  111. data/lib/search_engine/relation/dsl.rb +903 -0
  112. data/lib/search_engine/relation/dx/dry_run.rb +59 -0
  113. data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
  114. data/lib/search_engine/relation/dx.rb +231 -0
  115. data/lib/search_engine/relation/materializers.rb +118 -0
  116. data/lib/search_engine/relation/options.rb +138 -0
  117. data/lib/search_engine/relation/state.rb +274 -0
  118. data/lib/search_engine/relation/updating.rb +44 -0
  119. data/lib/search_engine/relation.rb +623 -0
  120. data/lib/search_engine/result.rb +664 -0
  121. data/lib/search_engine/schema.rb +1083 -0
  122. data/lib/search_engine/sources/active_record_source.rb +185 -0
  123. data/lib/search_engine/sources/base.rb +62 -0
  124. data/lib/search_engine/sources/lambda_source.rb +55 -0
  125. data/lib/search_engine/sources/sql_source.rb +196 -0
  126. data/lib/search_engine/sources.rb +71 -0
  127. data/lib/search_engine/stale_rules.rb +160 -0
  128. data/lib/search_engine/test/minitest_assertions.rb +57 -0
  129. data/lib/search_engine/test/offline_client.rb +134 -0
  130. data/lib/search_engine/test/rspec_matchers.rb +77 -0
  131. data/lib/search_engine/test/stub_client.rb +201 -0
  132. data/lib/search_engine/test.rb +66 -0
  133. data/lib/search_engine/test_autoload.rb +8 -0
  134. data/lib/search_engine/update.rb +35 -0
  135. data/lib/search_engine/version.rb +7 -0
  136. data/lib/search_engine.rb +332 -0
  137. data/lib/tasks/search_engine.rake +501 -0
  138. data/lib/tasks/search_engine_doctor.rake +16 -0
  139. metadata +225 -0
data/lib/search_engine/indexer/bulk_import.rb
@@ -0,0 +1,626 @@
+ # frozen_string_literal: true
+
+ require 'search_engine/logging/color'
+
+ module SearchEngine
+   class Indexer
+     # Orchestrates streaming JSONL bulk imports for partition rebuilds.
+     #
+     # Keeps the legacy semantics from {SearchEngine::Indexer#import!} while delegating
+     # encoding and retry logic to dedicated helpers. Splits overlarge batches when
+     # Typesense responds with HTTP 413, aggregates metrics, and returns a
+     # {SearchEngine::Indexer::Summary} compatible with existing logging helpers.
+     #
+     # @since M8
+     class BulkImport
+       DEFAULT_ACTION = :upsert
+       class << self
+         # Execute a bulk import for the provided batches enumerable.
+         #
+         # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
+         # @param into [String] physical collection name
+         # @param enum [Enumerable] yields batch-like objects convertible to Array
+         # @param batch_size [Integer, nil] soft guard for logging when exceeded
+         # @param action [Symbol] :upsert (default), :create, or :update
+         # @param log_batches [Boolean] whether to log each batch as it completes (default: true)
+         # @param max_parallel [Integer] maximum parallel threads for batch processing (default: 1)
+         # @return [SearchEngine::Indexer::Summary]
+         # @raise [SearchEngine::Errors::InvalidParams]
+         def call(klass:, into:, enum:, batch_size:, action: DEFAULT_ACTION, log_batches: true, max_parallel: 1)
+           validate_args!(klass, into, enum, action)
+           mp = max_parallel.to_i
+           mp = 1 unless mp.positive?
+
+           if mp > 1
+             call_parallel(
+               klass: klass, into: into, enum: enum, batch_size: batch_size, action: action,
+               log_batches: log_batches, max_parallel: mp
+             )
+           else
+             call_sequential(
+               klass: klass, into: into, enum: enum, batch_size: batch_size, action: action,
+               log_batches: log_batches
+             )
+           end
+         end
+
+         private
+
+         # Process batches sequentially (one at a time).
+         #
+         # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
+         # @param into [String] physical collection name
+         # @param enum [Enumerable] yields batch-like objects convertible to Array
+         # @param batch_size [Integer, nil] soft guard for logging when exceeded
+         # @param action [Symbol] :upsert, :create, or :update
+         # @param log_batches [Boolean] whether to log each batch as it completes
+         # @return [SearchEngine::Indexer::Summary]
+         def call_sequential(klass:, into:, enum:, batch_size:, action:, log_batches:)
+           docs_enum = normalize_enum(enum)
+           retry_policy = RetryPolicy.from_config(SearchEngine.config.indexer&.retries)
+           client = SearchEngine.client
+           buffer = +''
+           next_index = sequence_generator
+
+           batches = []
+           docs_total = 0
+           success_total = 0
+           failed_total = 0
+           failed_batches_total = 0
+           batches_total = 0
+           # Capture start time before processing any batches to measure total wall-clock duration
+           started_at = monotonic_ms
+
+           docs_enum.each do |raw_batch|
+             stats_list = import_batch_with_handling(
+               client: client,
+               collection: into,
+               raw_batch: raw_batch,
+               action: action,
+               retry_policy: retry_policy,
+               buffer: buffer,
+               next_index: next_index
+             )
+
+             stats_list.each do |stats|
+               docs_total += stats[:docs_count].to_i
+               success_total += stats[:success_count].to_i
+               failed_total += stats[:failure_count].to_i
+               failed_batches_total += 1 if stats[:failure_count].to_i.positive?
+               batches_total += 1
+               batches << stats
+               validate_soft_batch_size!(batch_size, stats[:docs_count])
+               log_batch(stats, batches_total) if log_batches
+             end
+           end
+
+           # Calculate total duration as wall-clock time from start to finish (not sum of batch durations)
+           total_duration_ms = (monotonic_ms - started_at).round(1)
+
+           Summary.new(
+             collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+             status: status_from_counts(success_total, failed_total),
+             batches_total: batches_total,
+             docs_total: docs_total,
+             success_total: success_total,
+             failed_total: failed_total,
+             failed_batches_total: failed_batches_total,
+             duration_ms_total: total_duration_ms,
+             batches: batches
+           )
+         end
+
+         # Process batches in parallel using a thread pool.
+         #
+         # Streams batches through a bounded queue (producer-consumer) and processes
+         # them concurrently on a fixed-size thread pool. Each worker gets its own
+         # Client instance and buffer to avoid thread-safety issues; stats are
+         # aggregated under a mutex.
+         #
+         # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
+         # @param into [String] physical collection name
+         # @param enum [Enumerable] yields batch-like objects convertible to Array
+         # @param batch_size [Integer, nil] soft guard for logging when exceeded
+         # @param action [Symbol] :upsert, :create, or :update
+         # @param log_batches [Boolean] whether to log each batch as it completes
+         # @param max_parallel [Integer] maximum number of parallel threads
+         # @return [SearchEngine::Indexer::Summary]
+         def call_parallel(klass:, into:, enum:, batch_size:, action:, log_batches:, max_parallel:)
+           require 'concurrent'
+
+           # Use producer-consumer pattern with bounded queue to avoid full materialization.
+           # Queue capacity = max_parallel * 2 to keep workers busy while the producer fetches.
+           docs_enum = normalize_enum(enum)
+           total_batches_estimate = estimate_total_batches(klass)
+           queue_capacity = max_parallel * 2
+           batch_queue = SizedQueue.new(queue_capacity)
+           sentinel = Object.new # Unique object to signal completion
+
+           retry_policy = RetryPolicy.from_config(SearchEngine.config.indexer&.retries)
+           pool = Concurrent::FixedThreadPool.new(max_parallel)
+           shared_state = initialize_shared_state
+           producer_error = nil
+
+           puts(' Starting parallel batch processing...') if log_batches
+           started_at = monotonic_ms
+
+           # Producer thread: fetch batches lazily and push them onto the queue
+           producer_thread = Thread.new do
+             batch_count = 0
+             docs_enum.each do |batch|
+               batch_queue.push(batch)
+               batch_count += 1
+               # Log progress every 10 batches
+               next unless log_batches && (batch_count % 10).zero?
+
+               elapsed = (monotonic_ms - started_at).round(1)
+               if total_batches_estimate
+                 puts(" Enqueued #{batch_count}/#{total_batches_estimate} batches... (#{elapsed}ms)")
+               else
+                 puts(" Enqueued #{batch_count} batches... (#{elapsed}ms)")
+               end
+             end
+           rescue StandardError => error
+             producer_error = error
+             warn(" Producer failed at batch #{batch_count}: #{error.class}: #{error.message.to_s[0, 200]}")
+           ensure
+             # Signal completion to all workers
+             max_parallel.times { batch_queue.push(sentinel) }
+           end
+
+           # Worker threads: consume batches from the queue
+           begin
+             process_batches_from_queue(
+               batch_queue: batch_queue,
+               sentinel: sentinel,
+               into: into,
+               action: action,
+               retry_policy: retry_policy,
+               batch_size: batch_size,
+               log_batches: log_batches,
+               pool: pool,
+               shared_state: shared_state,
+               max_parallel: max_parallel
+             )
+           ensure
+             shutdown_pool(pool)
+             producer_thread.join
+           end
+
+           raise producer_error if producer_error
+
+           build_summary(klass, shared_state)
+         end
+
+         # Initialize shared state hash for parallel batch processing.
+         #
+         # Creates a hash containing counters, batches array, mutex, and timing
+         # information that will be shared across threads and synchronized via mutex.
+         #
+         # @return [Hash] shared state hash with keys: :batches, :docs_total, :success_total,
+         #   :failed_total, :failed_batches_total, :batches_total, :idx_counter, :started_at, :mtx
+         def initialize_shared_state
+           {
+             batches: [],
+             docs_total: 0,
+             success_total: 0,
+             failed_total: 0,
+             failed_batches_total: 0,
+             batches_total: 0,
+             idx_counter: -1,
+             started_at: monotonic_ms,
+             mtx: Mutex.new
+           }
+         end
+
+         # Process batches from a queue using worker threads.
+         #
+         # Workers pull batches from the queue and process them concurrently
+         # on the thread pool; each worker stops when it pops the sentinel.
+         #
+         # @param batch_queue [SizedQueue] thread-safe queue containing batches
+         # @param sentinel [Object] unique object signaling queue completion
+         # @param into [String] physical collection name
+         # @param action [Symbol] :upsert, :create, or :update
+         # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
+         # @param batch_size [Integer, nil] soft guard for logging when exceeded
+         # @param log_batches [Boolean] whether to log each batch as it completes
+         # @param pool [Concurrent::FixedThreadPool] thread pool instance
+         # @param shared_state [Hash] shared state hash for thread-safe aggregation
+         # @param max_parallel [Integer] number of worker threads to start
+         # @return [void]
+         def process_batches_from_queue(batch_queue:, sentinel:, into:, action:, retry_policy:, batch_size:,
+                                        log_batches:, pool:, shared_state:, max_parallel:)
+           max_parallel.times do
+             pool.post do
+               loop do
+                 batch = batch_queue.pop
+                 break if batch.equal?(sentinel)
+
+                 process_single_batch_parallel(
+                   raw_batch: batch,
+                   into: into,
+                   action: action,
+                   retry_policy: retry_policy,
+                   batch_size: batch_size,
+                   log_batches: log_batches,
+                   shared_state: shared_state
+                 )
+               end
+             end
+           end
+         end
+
+         # Process a single batch in a parallel thread.
+         #
+         # Executed within a thread pool worker thread. Each thread gets its own
+         # Client instance and buffer to avoid thread-safety issues. The batch index
+         # is assigned atomically via mutex synchronization. Stats are aggregated
+         # thread-safely after processing.
+         #
+         # @param raw_batch [Object] batch object convertible to Array
+         # @param into [String] physical collection name
+         # @param action [Symbol] :upsert, :create, or :update
+         # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
+         # @param batch_size [Integer, nil] soft guard for logging when exceeded
+         # @param log_batches [Boolean] whether to log each batch as it completes
+         # @param shared_state [Hash] shared state hash for thread-safe aggregation
+         # @return [void]
+         def process_single_batch_parallel(raw_batch:, into:, action:, retry_policy:, batch_size:, log_batches:,
+                                           shared_state:)
+           # Each thread gets its own resources
+           thread_client = SearchEngine.client
+           thread_buffer = +''
+           thread_idx = shared_state[:mtx].synchronize { shared_state[:idx_counter] += 1 }
+
+           begin
+             stats_list = import_batch_with_handling(
+               client: thread_client,
+               collection: into,
+               raw_batch: raw_batch,
+               action: action,
+               retry_policy: retry_policy,
+               buffer: thread_buffer,
+               next_index: -> { thread_idx }
+             )
+
+             shared_state[:mtx].synchronize do
+               aggregate_stats(stats_list, shared_state, batch_size, log_batches)
+             end
+           rescue StandardError => error
+             # Best-effort document count for the failure stats (conversion itself may fail)
+             docs_count = begin
+               BatchPlanner.to_array(raw_batch).size
+             rescue StandardError
+               0
+             end
+
+             # Create failure stats similar to import_batch_with_handling_internal error path
+             failure_stat = failure_stats(thread_idx, docs_count, 0, error)
+
+             shared_state[:mtx].synchronize do
+               warn(" batch_index=#{thread_idx} → error=#{error.class}: #{error.message.to_s[0, 200]}")
+               aggregate_stats([failure_stat], shared_state, batch_size, log_batches)
+             end
+           end
+         end
+
+         # Aggregate batch statistics thread-safely into shared state.
+         #
+         # Must be called within a mutex synchronization block. Updates counters,
+         # appends to the batches array, validates batch size, and optionally logs.
+         #
+         # @param stats_list [Array<Hash>] array of stats hashes from batch processing
+         # @param shared_state [Hash] shared state hash to update (must be mutex-protected)
+         # @param batch_size [Integer, nil] soft guard for logging when exceeded
+         # @param log_batches [Boolean] whether to log each batch as it completes
+         # @return [void]
+         def aggregate_stats(stats_list, shared_state, batch_size, log_batches)
+           stats_list.each do |stats|
+             shared_state[:docs_total] += stats[:docs_count].to_i
+             shared_state[:success_total] += stats[:success_count].to_i
+             shared_state[:failed_total] += stats[:failure_count].to_i
+             shared_state[:failed_batches_total] += 1 if stats[:failure_count].to_i.positive?
+             shared_state[:batches_total] += 1
+             shared_state[:batches] << stats
+             validate_soft_batch_size!(batch_size, stats[:docs_count])
+             log_batch(stats, shared_state[:batches_total]) if log_batches
+           end
+         end
+
+         # Shut down the thread pool gracefully, with timeouts.
+         #
+         # Shuts down the pool, waits up to 1 hour for completion, then force-kills
+         # if necessary and waits an additional minute for cleanup.
+         #
+         # @param pool [Concurrent::FixedThreadPool] thread pool instance to shut down
+         # @return [void]
+         def shutdown_pool(pool)
+           pool.shutdown
+           # Wait up to 1 hour, then force-kill and wait a bit more to ensure cleanup
+           pool.wait_for_termination(3600) || pool.kill
+           pool.wait_for_termination(60)
+         end
+
+         # Build a Summary object from aggregated shared state.
+         #
+         # Calculates total duration and constructs a Summary with all aggregated
+         # statistics from parallel batch processing.
+         #
+         # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
+         # @param shared_state [Hash] shared state hash containing aggregated statistics
+         # @return [SearchEngine::Indexer::Summary]
+         def build_summary(klass, shared_state)
+           total_duration_ms = (monotonic_ms - shared_state[:started_at]).round(1)
+
+           Summary.new(
+             collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+             status: status_from_counts(shared_state[:success_total], shared_state[:failed_total]),
+             batches_total: shared_state[:batches_total],
+             docs_total: shared_state[:docs_total],
+             success_total: shared_state[:success_total],
+             failed_total: shared_state[:failed_total],
+             failed_batches_total: shared_state[:failed_batches_total],
+             duration_ms_total: total_duration_ms,
+             batches: shared_state[:batches]
+           )
+         end
+
+         def validate_args!(klass, into, enum, action)
+           unless klass.is_a?(Class) && klass.ancestors.include?(SearchEngine::Base)
+             raise Errors::InvalidParams, 'klass must inherit from SearchEngine::Base'
+           end
+
+           raise Errors::InvalidParams, 'into must be a non-empty String' if into.nil? || into.to_s.strip.empty?
+
+           raise Errors::InvalidParams, 'enum must be enumerable' unless enum.respond_to?(:each)
+
+           valid_actions = %i[upsert create update]
+           return if valid_actions.include?(action.to_sym)
+
+           raise Errors::InvalidParams,
+                 "action must be one of :upsert, :create, :update (received #{action.inspect})"
+         end
+
+         def normalize_enum(enum)
+           enum.is_a?(Enumerator) ? enum : enum.each
+         end
+
+         # Estimate total batch count for progress logging.
+         #
+         # Attempts to estimate batch count for ActiveRecord sources by counting records
+         # and dividing by batch_size. Returns nil for other source types or when estimation fails.
+         #
+         # @param klass [Class] a {SearchEngine::Base} subclass
+         # @return [Integer, nil] estimated total batch count or nil if not estimable
+         def estimate_total_batches(klass)
+           return nil unless klass.is_a?(Class)
+
+           dsl = mapper_dsl_for_klass(klass)
+           return nil unless dsl
+
+           source_def = dsl[:source]
+           return nil unless source_def
+           return nil unless source_def[:type] == :active_record
+
+           model = source_def.dig(:options, :model)
+           return nil unless model.respond_to?(:count)
+
+           batch_size = source_def.dig(:options, :batch_size)
+           batch_size ||= SearchEngine.config.sources.active_record.batch_size
+           batch_size = batch_size.to_i
+           return nil unless batch_size.positive?
+
+           begin
+             total_records = model.count
+             return nil unless total_records.positive?
+
+             (total_records.to_f / batch_size).ceil
+           rescue StandardError
+             nil
+           end
+         end
+
+         def mapper_dsl_for_klass(klass)
+           return nil unless klass.instance_variable_defined?(:@__mapper_dsl__)
+
+           klass.instance_variable_get(:@__mapper_dsl__)
+         end
+
+         # Import a single batch with error handling and recursive 413 splitting.
+         #
+         # Convenience wrapper that delegates to the internal method with batch_index set to nil,
+         # indicating the index should be computed from next_index.
+         #
+         # @param client [SearchEngine::Client] client instance
+         # @param collection [String] physical collection name
+         # @param raw_batch [Object] batch object convertible to Array
+         # @param action [Symbol] :upsert, :create, or :update
+         # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
+         # @param buffer [String] mutable string buffer for JSONL encoding
+         # @param next_index [Proc, Integer] proc that returns the next index, or an integer index
+         # @return [Array<Hash>] array of stats hashes (may contain multiple entries on 413 split)
+         def import_batch_with_handling(client:, collection:, raw_batch:, action:, retry_policy:, buffer:, next_index:)
+           import_batch_with_handling_internal(
+             client: client,
+             collection: collection,
+             raw_batch: raw_batch,
+             action: action,
+             retry_policy: retry_policy,
+             buffer: buffer,
+             next_index: next_index,
+             batch_index: nil
+           )
+         end
+
+         # Internal method for importing a batch with optional batch_index preservation.
+         #
+         # When batch_index is provided (non-nil), it is used directly, preserving the
+         # original batch index for recursive splits on 413 errors. When batch_index is nil,
+         # the index is computed from next_index (either by calling the proc or using the integer).
+         #
+         # @param client [SearchEngine::Client] client instance
+         # @param collection [String] physical collection name
+         # @param raw_batch [Object] batch object convertible to Array
+         # @param action [Symbol] :upsert, :create, or :update
+         # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
+         # @param buffer [String] mutable string buffer for JSONL encoding
+         # @param next_index [Proc, Integer] proc that returns next index, or integer index
+         # @param batch_index [Integer, nil] optional pre-computed batch index (for recursive splits)
+         # @return [Array<Hash>] array of stats hashes (may contain multiple entries on 413 split)
+         def import_batch_with_handling_internal(client:, collection:, raw_batch:, action:, retry_policy:, buffer:,
+                                                 next_index:, batch_index:)
+           docs = BatchPlanner.to_array(raw_batch)
+           return [] if docs.empty?
+
+           docs_count, bytes_sent = BatchPlanner.encode_jsonl!(docs, buffer)
+           jsonl = buffer.dup
+           # Use provided batch_index if available (for recursive splits), otherwise compute from next_index
+           idx = batch_index || (next_index.is_a?(Proc) ? next_index.call : next_index)
+
+           started_at = monotonic_ms
+
+           stats = ImportDispatcher.import_batch(
+             client: client,
+             collection: collection,
+             action: action,
+             jsonl: jsonl,
+             docs_count: docs_count,
+             bytes_sent: bytes_sent,
+             batch_index: idx,
+             retry_policy: retry_policy,
+             dry_run: false
+           )
+           stats[:duration_ms] = (monotonic_ms - started_at).round(1)
+           stats[:index] = idx
+           [stats]
+         rescue Errors::Api => error
+           if error.status.to_i == 413 && docs.size > 1
+             mid = docs.size / 2
+             left = docs[0...mid]
+             right = docs[mid..]
+             # Preserve the original batch index for both recursive splits
+             return import_batch_with_handling_internal(
+               client: client,
+               collection: collection,
+               raw_batch: left,
+               action: action,
+               retry_policy: retry_policy,
+               buffer: buffer,
+               next_index: next_index,
+               batch_index: idx
+             ) + import_batch_with_handling_internal(
+               client: client,
+               collection: collection,
+               raw_batch: right,
+               action: action,
+               retry_policy: retry_policy,
+               buffer: buffer,
+               next_index: next_index,
+               batch_index: idx
+             )
+           end
+
+           [failure_stats(idx, docs_count, bytes_sent, error)]
+         end
+
+         def failure_stats(idx, docs_count, bytes_sent, error)
+           {
+             index: idx,
+             docs_count: docs_count,
+             success_count: 0,
+             failure_count: docs_count,
+             attempts: 1,
+             http_status: error.respond_to?(:status) ? error.status.to_i : 0,
+             duration_ms: 0.0,
+             bytes_sent: bytes_sent,
+             errors_sample: [safe_error_excerpt(error)]
+           }
+         end
+
+         def safe_error_excerpt(error)
+           cls = error&.class&.name
+           msg = error&.message.to_s
+           "#{cls}: #{msg[0, 200]}"
+         end
+
+         def sequence_generator
+           idx = -1
+           -> { idx += 1 }
+         end
+
+         def validate_soft_batch_size!(batch_size, docs_count)
+           limit = batch_size&.to_i
+           return unless limit&.positive?
+           return if docs_count.to_i <= limit
+
+           Kernel.warn("[search_engine] BulkImport batch exceeded soft limit: size=#{docs_count}, limit=#{limit}")
+         end
+
+         def status_from_counts(success_total, failed_total)
+           if failed_total.positive? && success_total.positive?
+             :partial
+           elsif failed_total.positive?
+             :failed
+           else
+             :ok
+           end
+         end
+
+         def monotonic_ms
+           SearchEngine::Instrumentation.monotonic_ms
+         end
574
+ def log_batch(stats, batch_number)
575
+ batch_status = batch_status_from_stats(stats)
576
+ status_color = SearchEngine::Logging::Color.for_status(batch_status)
577
+
578
+ prefix = batch_number == 1 ? ' single → ' : ' '
579
+ line = +prefix
580
+ line << SearchEngine::Logging::Color.apply("status=#{batch_status}", status_color) << ' '
581
+ line << "docs=#{stats[:docs_count]}" << ' '
582
+ success_count = stats[:success_count].to_i
583
+ success_str = "success=#{success_count}"
584
+ line << (
585
+ success_count.positive? ? SearchEngine::Logging::Color.bold(success_str) : success_str
586
+ ) << ' '
587
+ failed_count = stats[:failure_count].to_i
588
+ failed_str = "failed=#{failed_count}"
589
+ line << (failed_count.positive? ? SearchEngine::Logging::Color.apply(failed_str, :red) : failed_str) << ' '
590
+ line << "batch=#{batch_number} "
591
+ line << "duration_ms=#{stats[:duration_ms]}"
592
+
593
+ # Extract sample error from batch stats
594
+ sample_err = extract_batch_sample_error(stats)
595
+ line << " sample_error=#{sample_err.inspect}" if sample_err
596
+
597
+ puts(line)
598
+ end
+
+         def extract_batch_sample_error(stats)
+           samples = stats[:errors_sample] || stats['errors_sample']
+           return nil unless samples.is_a?(Array) && samples.any?
+
+           samples.each do |msg|
+             s = msg.to_s
+             return s unless s.strip.empty?
+           end
+           nil
+         end
+
+         def batch_status_from_stats(stats)
+           success_count = stats[:success_count].to_i
+           failure_count = stats[:failure_count].to_i
+
+           if failure_count.positive? && success_count.positive?
+             :partial
+           elsif failure_count.positive?
+             :failed
+           else
+             :ok
+           end
+         end
+       end
+     end
+   end
+ end
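
For orientation, a minimal usage sketch of the BulkImport.call entrypoint shown above. This is a sketch under assumptions, not part of the gem: it assumes an ActiveRecord-backed application model `Product < SearchEngine::Base`, a physical collection name produced elsewhere (e.g. by the partition lifecycle), and that `SearchEngine::Indexer::Summary` exposes readers for the fields passed to `Summary.new`; the batching pipeline is illustrative rather than the gem's own source plumbing.

    # Hypothetical invocation; BulkImport is internal plumbing that the Indexer
    # and partition rebuild jobs normally drive for you.
    docs = Product.find_each.lazy.map { |p| { id: p.id.to_s, name: p.name } }
    batches = docs.each_slice(1_000) # batch-like objects convertible to Array

    summary = SearchEngine::Indexer::BulkImport.call(
      klass: Product,
      into: 'products_20240101',  # physical collection name (assumed)
      enum: batches,
      batch_size: 1_000,  # soft guard only: oversized batches warn, not fail
      action: :upsert,    # or :create / :update
      max_parallel: 4     # > 1 selects the producer-consumer thread-pool path
    )

    # Summary aggregates per-batch stats; status is :ok, :partial, or :failed,
    # and HTTP 413 responses transparently split a batch into smaller retries.
    puts summary.status
    puts "#{summary.docs_total} docs, #{summary.failed_total} failed, #{summary.batches_total} batches"

With max_parallel: 1 (the default) the sequential path runs instead, preserving the legacy single-threaded semantics described in the class comment.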