search-engine-for-typesense 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +148 -0
- data/app/search_engine/search_engine/app_info.rb +11 -0
- data/app/search_engine/search_engine/index_partition_job.rb +170 -0
- data/lib/generators/search_engine/install/install_generator.rb +20 -0
- data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
- data/lib/generators/search_engine/model/model_generator.rb +86 -0
- data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
- data/lib/search-engine-for-typesense.rb +12 -0
- data/lib/search_engine/active_record_syncable.rb +247 -0
- data/lib/search_engine/admin/stopwords.rb +125 -0
- data/lib/search_engine/admin/synonyms.rb +125 -0
- data/lib/search_engine/admin.rb +12 -0
- data/lib/search_engine/ast/and.rb +52 -0
- data/lib/search_engine/ast/binary_op.rb +75 -0
- data/lib/search_engine/ast/eq.rb +19 -0
- data/lib/search_engine/ast/group.rb +18 -0
- data/lib/search_engine/ast/gt.rb +12 -0
- data/lib/search_engine/ast/gte.rb +12 -0
- data/lib/search_engine/ast/in.rb +28 -0
- data/lib/search_engine/ast/lt.rb +12 -0
- data/lib/search_engine/ast/lte.rb +12 -0
- data/lib/search_engine/ast/matches.rb +55 -0
- data/lib/search_engine/ast/node.rb +176 -0
- data/lib/search_engine/ast/not_eq.rb +13 -0
- data/lib/search_engine/ast/not_in.rb +24 -0
- data/lib/search_engine/ast/or.rb +52 -0
- data/lib/search_engine/ast/prefix.rb +51 -0
- data/lib/search_engine/ast/raw.rb +41 -0
- data/lib/search_engine/ast/unary_op.rb +43 -0
- data/lib/search_engine/ast.rb +101 -0
- data/lib/search_engine/base/creation.rb +727 -0
- data/lib/search_engine/base/deletion.rb +80 -0
- data/lib/search_engine/base/display_coercions.rb +36 -0
- data/lib/search_engine/base/hydration.rb +312 -0
- data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
- data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
- data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
- data/lib/search_engine/base/index_maintenance.rb +459 -0
- data/lib/search_engine/base/indexing_dsl.rb +255 -0
- data/lib/search_engine/base/joins.rb +479 -0
- data/lib/search_engine/base/model_dsl.rb +472 -0
- data/lib/search_engine/base/presets.rb +43 -0
- data/lib/search_engine/base/pretty_printer.rb +315 -0
- data/lib/search_engine/base/relation_delegation.rb +42 -0
- data/lib/search_engine/base/scopes.rb +113 -0
- data/lib/search_engine/base/updating.rb +92 -0
- data/lib/search_engine/base.rb +38 -0
- data/lib/search_engine/bulk.rb +284 -0
- data/lib/search_engine/cache.rb +33 -0
- data/lib/search_engine/cascade.rb +531 -0
- data/lib/search_engine/cli/doctor.rb +631 -0
- data/lib/search_engine/cli/support.rb +217 -0
- data/lib/search_engine/cli.rb +222 -0
- data/lib/search_engine/client/http_adapter.rb +63 -0
- data/lib/search_engine/client/request_builder.rb +92 -0
- data/lib/search_engine/client/services/base.rb +74 -0
- data/lib/search_engine/client/services/collections.rb +161 -0
- data/lib/search_engine/client/services/documents.rb +214 -0
- data/lib/search_engine/client/services/operations.rb +152 -0
- data/lib/search_engine/client/services/search.rb +190 -0
- data/lib/search_engine/client/services.rb +29 -0
- data/lib/search_engine/client.rb +765 -0
- data/lib/search_engine/client_options.rb +20 -0
- data/lib/search_engine/collection_resolver.rb +191 -0
- data/lib/search_engine/collections_graph.rb +330 -0
- data/lib/search_engine/compiled_params.rb +143 -0
- data/lib/search_engine/compiler.rb +383 -0
- data/lib/search_engine/config/observability.rb +27 -0
- data/lib/search_engine/config/presets.rb +92 -0
- data/lib/search_engine/config/selection.rb +16 -0
- data/lib/search_engine/config/typesense.rb +48 -0
- data/lib/search_engine/config/validators.rb +97 -0
- data/lib/search_engine/config.rb +917 -0
- data/lib/search_engine/console_helpers.rb +130 -0
- data/lib/search_engine/deletion.rb +103 -0
- data/lib/search_engine/dispatcher.rb +125 -0
- data/lib/search_engine/dsl/parser.rb +582 -0
- data/lib/search_engine/engine.rb +167 -0
- data/lib/search_engine/errors.rb +290 -0
- data/lib/search_engine/filters/sanitizer.rb +189 -0
- data/lib/search_engine/hydration/materializers.rb +808 -0
- data/lib/search_engine/hydration/selection_context.rb +96 -0
- data/lib/search_engine/indexer/batch_planner.rb +76 -0
- data/lib/search_engine/indexer/bulk_import.rb +626 -0
- data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
- data/lib/search_engine/indexer/retry_policy.rb +103 -0
- data/lib/search_engine/indexer.rb +747 -0
- data/lib/search_engine/instrumentation.rb +308 -0
- data/lib/search_engine/joins/guard.rb +202 -0
- data/lib/search_engine/joins/resolver.rb +95 -0
- data/lib/search_engine/logging/color.rb +78 -0
- data/lib/search_engine/logging/format_helpers.rb +92 -0
- data/lib/search_engine/logging/partition_progress.rb +53 -0
- data/lib/search_engine/logging_subscriber.rb +388 -0
- data/lib/search_engine/mapper.rb +785 -0
- data/lib/search_engine/multi.rb +286 -0
- data/lib/search_engine/multi_result.rb +186 -0
- data/lib/search_engine/notifications/compact_logger.rb +675 -0
- data/lib/search_engine/observability.rb +162 -0
- data/lib/search_engine/operations.rb +58 -0
- data/lib/search_engine/otel.rb +227 -0
- data/lib/search_engine/partitioner.rb +128 -0
- data/lib/search_engine/ranking_plan.rb +118 -0
- data/lib/search_engine/registry.rb +158 -0
- data/lib/search_engine/relation/compiler.rb +711 -0
- data/lib/search_engine/relation/deletion.rb +37 -0
- data/lib/search_engine/relation/dsl/filters.rb +624 -0
- data/lib/search_engine/relation/dsl/selection.rb +240 -0
- data/lib/search_engine/relation/dsl.rb +903 -0
- data/lib/search_engine/relation/dx/dry_run.rb +59 -0
- data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
- data/lib/search_engine/relation/dx.rb +231 -0
- data/lib/search_engine/relation/materializers.rb +118 -0
- data/lib/search_engine/relation/options.rb +138 -0
- data/lib/search_engine/relation/state.rb +274 -0
- data/lib/search_engine/relation/updating.rb +44 -0
- data/lib/search_engine/relation.rb +623 -0
- data/lib/search_engine/result.rb +664 -0
- data/lib/search_engine/schema.rb +1083 -0
- data/lib/search_engine/sources/active_record_source.rb +185 -0
- data/lib/search_engine/sources/base.rb +62 -0
- data/lib/search_engine/sources/lambda_source.rb +55 -0
- data/lib/search_engine/sources/sql_source.rb +196 -0
- data/lib/search_engine/sources.rb +71 -0
- data/lib/search_engine/stale_rules.rb +160 -0
- data/lib/search_engine/test/minitest_assertions.rb +57 -0
- data/lib/search_engine/test/offline_client.rb +134 -0
- data/lib/search_engine/test/rspec_matchers.rb +77 -0
- data/lib/search_engine/test/stub_client.rb +201 -0
- data/lib/search_engine/test.rb +66 -0
- data/lib/search_engine/test_autoload.rb +8 -0
- data/lib/search_engine/update.rb +35 -0
- data/lib/search_engine/version.rb +7 -0
- data/lib/search_engine.rb +332 -0
- data/lib/tasks/search_engine.rake +501 -0
- data/lib/tasks/search_engine_doctor.rake +16 -0
- metadata +225 -0
data/lib/search_engine/indexer/bulk_import.rb
@@ -0,0 +1,626 @@
# frozen_string_literal: true

require 'search_engine/logging/color'

module SearchEngine
  class Indexer
    # Orchestrates streaming JSONL bulk imports for partition rebuilds.
    #
    # Keeps the legacy semantics from {SearchEngine::Indexer#import!} while delegating
    # encoding and retry logic to dedicated helpers. Splits overlarge batches when
    # Typesense responds with HTTP 413, aggregates metrics, and returns a
    # {SearchEngine::Indexer::Summary} compatible with existing logging helpers.
    #
    # @since M8
    class BulkImport
      DEFAULT_ACTION = :upsert

      class << self
        # Execute a bulk import for the provided batches enumerable.
        #
        # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
        # @param into [String] physical collection name
        # @param enum [Enumerable] yields batch-like objects convertible to Array
        # @param batch_size [Integer, nil] soft guard for logging when exceeded
        # @param action [Symbol] :upsert (default), :create, or :update
        # @param log_batches [Boolean] whether to log each batch as it completes (default: true)
        # @param max_parallel [Integer] maximum parallel threads for batch processing (default: 1)
        # @return [SearchEngine::Indexer::Summary]
        # @raise [SearchEngine::Errors::InvalidParams]
        def call(klass:, into:, enum:, batch_size:, action: DEFAULT_ACTION, log_batches: true, max_parallel: 1)
          validate_args!(klass, into, enum, action)
          mp = max_parallel.to_i
          mp = 1 unless mp.positive?

          if mp > 1
            call_parallel(
              klass: klass, into: into, enum: enum, batch_size: batch_size, action: action,
              log_batches: log_batches, max_parallel: mp
            )
          else
            call_sequential(
              klass: klass, into: into, enum: enum, batch_size: batch_size, action: action,
              log_batches: log_batches
            )
          end
        end
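
        # Illustrative usage sketch: the Product model, collection name, and
        # batch payloads below are hypothetical, and it assumes Summary exposes
        # its fields as readers.
        #
        #   batches = [[{ id: '1', name: 'Widget' }], [{ id: '2', name: 'Gadget' }]]
        #   summary = SearchEngine::Indexer::BulkImport.call(
        #     klass: Product,
        #     into: 'products_20240101',
        #     enum: batches,
        #     batch_size: 100,
        #     action: :upsert,
        #     max_parallel: 4
        #   )
        #   summary.status # => :ok, :partial, or :failed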

        private

        # Process batches sequentially (one at a time).
        #
        # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
        # @param into [String] physical collection name
        # @param enum [Enumerable] yields batch-like objects convertible to Array
        # @param batch_size [Integer, nil] soft guard for logging when exceeded
        # @param action [Symbol] :upsert, :create, or :update
        # @param log_batches [Boolean] whether to log each batch as it completes
        # @return [SearchEngine::Indexer::Summary]
        def call_sequential(klass:, into:, enum:, batch_size:, action:, log_batches:)
          docs_enum = normalize_enum(enum)
          retry_policy = RetryPolicy.from_config(SearchEngine.config.indexer&.retries)
          client = SearchEngine.client
          buffer = +''
          next_index = sequence_generator

          batches = []
          docs_total = 0
          success_total = 0
          failed_total = 0
          failed_batches_total = 0
          batches_total = 0
          # Capture start time before processing any batches to measure total wall-clock duration
          started_at = monotonic_ms

          docs_enum.each do |raw_batch|
            stats_list = import_batch_with_handling(
              client: client,
              collection: into,
              raw_batch: raw_batch,
              action: action,
              retry_policy: retry_policy,
              buffer: buffer,
              next_index: next_index
            )

            stats_list.each do |stats|
              docs_total += stats[:docs_count].to_i
              success_total += stats[:success_count].to_i
              failed_total += stats[:failure_count].to_i
              failed_batches_total += 1 if stats[:failure_count].to_i.positive?
              batches_total += 1
              batches << stats
              validate_soft_batch_size!(batch_size, stats[:docs_count])
              log_batch(stats, batches_total) if log_batches
            end
          end

          # Calculate total duration as wall-clock time from start to finish (not sum of batch durations)
          total_duration_ms = (monotonic_ms - started_at).round(1)

          Summary.new(
            collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
            status: status_from_counts(success_total, failed_total),
            batches_total: batches_total,
            docs_total: docs_total,
            success_total: success_total,
            failed_total: failed_total,
            failed_batches_total: failed_batches_total,
            duration_ms_total: total_duration_ms,
            batches: batches
          )
        end

        # Process batches in parallel using a thread pool.
        #
        # Streams batches through a bounded queue and processes them concurrently
        # using a fixed thread pool. Each thread gets its own Client instance and
        # buffer to avoid thread-safety issues. Thread-safe aggregation of stats
        # is handled via mutex synchronization.
        #
        # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
        # @param into [String] physical collection name
        # @param enum [Enumerable] yields batch-like objects convertible to Array
        # @param batch_size [Integer, nil] soft guard for logging when exceeded
        # @param action [Symbol] :upsert, :create, or :update
        # @param log_batches [Boolean] whether to log each batch as it completes
        # @param max_parallel [Integer] maximum number of parallel threads
        # @return [SearchEngine::Indexer::Summary]
        def call_parallel(klass:, into:, enum:, batch_size:, action:, log_batches:, max_parallel:)
          require 'concurrent' # the concurrent-ruby gem is loaded via 'concurrent'

          # Use producer-consumer pattern with bounded queue to avoid full materialization.
          # Queue capacity = max_parallel * 2 to keep workers busy while producer fetches.
          docs_enum = normalize_enum(enum)
          total_batches_estimate = estimate_total_batches(klass)
          queue_capacity = max_parallel * 2
          batch_queue = SizedQueue.new(queue_capacity)
          sentinel = Object.new # Unique object to signal completion

          retry_policy = RetryPolicy.from_config(SearchEngine.config.indexer&.retries)
          pool = Concurrent::FixedThreadPool.new(max_parallel)
          shared_state = initialize_shared_state
          producer_error = nil

          puts(' Starting parallel batch processing...') if log_batches
          started_at = monotonic_ms

          # Producer thread: fetch batches lazily and push to queue
          producer_thread = Thread.new do
            batch_count = 0
            docs_enum.each do |batch|
              batch_queue.push(batch)
              batch_count += 1
              # Log progress every 10 batches
              next unless log_batches && (batch_count % 10).zero?

              elapsed = (monotonic_ms - started_at).round(1)
              if total_batches_estimate
                puts(" Processed #{batch_count}/#{total_batches_estimate} batches... (#{elapsed}ms)")
              else
                puts(" Processed #{batch_count} batches... (#{elapsed}ms)")
              end
            end
          rescue StandardError => error
            producer_error = error
            warn(" Producer failed at batch #{batch_count}: #{error.class}: #{error.message.to_s[0, 200]}")
          ensure
            # Signal completion to all workers
            max_parallel.times { batch_queue.push(sentinel) }
          end

          # Worker threads: consume batches from queue
          begin
            process_batches_from_queue(
              batch_queue: batch_queue,
              sentinel: sentinel,
              into: into,
              action: action,
              retry_policy: retry_policy,
              batch_size: batch_size,
              log_batches: log_batches,
              pool: pool,
              shared_state: shared_state,
              max_parallel: max_parallel
            )
          ensure
            shutdown_pool(pool)
            producer_thread.join if producer_thread.alive?
          end

          raise producer_error if producer_error

          build_summary(klass, shared_state)
        end
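
        # The producer/consumer mechanics above, reduced to a self-contained
        # sketch using only the Ruby standard library (names illustrative):
        #
        #   queue    = SizedQueue.new(4)           # bounded: producer blocks when full
        #   sentinel = Object.new                  # identity-compared end marker
        #   workers  = Array.new(2) do
        #     Thread.new do
        #       loop do
        #         item = queue.pop
        #         break if item.equal?(sentinel)   # each worker consumes one sentinel
        #         puts "processing #{item}"
        #       end
        #     end
        #   end
        #   (1..10).each { |i| queue.push(i) }
        #   2.times { queue.push(sentinel) }       # one sentinel per worker
        #   workers.each(&:join)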

        # Initialize shared state hash for parallel batch processing.
        #
        # Creates a hash containing counters, batches array, mutex, and timing
        # information that will be shared across threads and synchronized via mutex.
        #
        # @return [Hash] shared state hash with keys: :batches, :docs_total, :success_total,
        #   :failed_total, :failed_batches_total, :batches_total, :idx_counter, :started_at, :mtx
        def initialize_shared_state
          {
            batches: [],
            docs_total: 0,
            success_total: 0,
            failed_total: 0,
            failed_batches_total: 0,
            batches_total: 0,
            idx_counter: -1,
            started_at: monotonic_ms,
            mtx: Mutex.new
          }
        end

        # Process batches from a queue using worker threads.
        #
        # Workers pull batches from the queue and process them concurrently.
        # Stops when the sentinel is received. Uses a thread pool for concurrent processing.
        #
        # @param batch_queue [SizedQueue] thread-safe queue containing batches
        # @param sentinel [Object] unique object signaling queue completion
        # @param into [String] physical collection name
        # @param action [Symbol] :upsert, :create, or :update
        # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
        # @param batch_size [Integer, nil] soft guard for logging when exceeded
        # @param log_batches [Boolean] whether to log each batch as it completes
        # @param pool [Concurrent::FixedThreadPool] thread pool instance
        # @param shared_state [Hash] shared state hash for thread-safe aggregation
        # @param max_parallel [Integer] number of worker threads to start
        # @return [void]
        def process_batches_from_queue(batch_queue:, sentinel:, into:, action:, retry_policy:, batch_size:,
                                       log_batches:, pool:, shared_state:, max_parallel:)
          max_parallel.times do
            pool.post do
              loop do
                batch = batch_queue.pop
                break if batch.equal?(sentinel)

                process_single_batch_parallel(
                  raw_batch: batch,
                  into: into,
                  action: action,
                  retry_policy: retry_policy,
                  batch_size: batch_size,
                  log_batches: log_batches,
                  shared_state: shared_state
                )
              end
            end
          end
        end

        # Process a single batch in a parallel thread.
        #
        # Executed within a thread pool worker thread. Each thread gets its own
        # Client instance and buffer to avoid thread-safety issues. The batch index
        # is assigned atomically via mutex synchronization. Stats are aggregated
        # thread-safely after processing.
        #
        # @param raw_batch [Object] batch object convertible to Array
        # @param into [String] physical collection name
        # @param action [Symbol] :upsert, :create, or :update
        # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
        # @param batch_size [Integer, nil] soft guard for logging when exceeded
        # @param log_batches [Boolean] whether to log each batch as it completes
        # @param shared_state [Hash] shared state hash for thread-safe aggregation
        # @return [void]
        def process_single_batch_parallel(raw_batch:, into:, action:, retry_policy:, batch_size:, log_batches:,
                                          shared_state:)
          # Each thread gets its own resources
          thread_client = SearchEngine.client
          thread_buffer = +''
          thread_idx = shared_state[:mtx].synchronize { shared_state[:idx_counter] += 1 }

          begin
            stats_list = import_batch_with_handling(
              client: thread_client,
              collection: into,
              raw_batch: raw_batch,
              action: action,
              retry_policy: retry_policy,
              buffer: thread_buffer,
              next_index: -> { thread_idx }
            )

            shared_state[:mtx].synchronize do
              aggregate_stats(stats_list, shared_state, batch_size, log_batches)
            end
          rescue StandardError => error
            # Calculate document count for error stats (before any potential encoding errors)
            docs_count = begin
              BatchPlanner.to_array(raw_batch).size
            rescue StandardError
              0
            end

            # Create failure stats similar to import_batch_with_handling_internal error path
            failure_stat = failure_stats(thread_idx, docs_count, 0, error)

            shared_state[:mtx].synchronize do
              warn(" batch_index=#{thread_idx} → error=#{error.class}: #{error.message.to_s[0, 200]}")
              aggregate_stats([failure_stat], shared_state, batch_size, log_batches)
            end
          end
        end

        # Aggregate batch statistics thread-safely into shared state.
        #
        # Must be called within a mutex synchronization block. Updates counters,
        # appends to batches array, validates batch size, and optionally logs.
        #
        # @param stats_list [Array<Hash>] array of stats hashes from batch processing
        # @param shared_state [Hash] shared state hash to update (must be mutex-protected)
        # @param batch_size [Integer, nil] soft guard for logging when exceeded
        # @param log_batches [Boolean] whether to log each batch as it completes
        # @return [void]
        def aggregate_stats(stats_list, shared_state, batch_size, log_batches)
          stats_list.each do |stats|
            shared_state[:docs_total] += stats[:docs_count].to_i
            shared_state[:success_total] += stats[:success_count].to_i
            shared_state[:failed_total] += stats[:failure_count].to_i
            shared_state[:failed_batches_total] += 1 if stats[:failure_count].to_i.positive?
            shared_state[:batches_total] += 1
            shared_state[:batches] << stats
            validate_soft_batch_size!(batch_size, stats[:docs_count])
            log_batch(stats, shared_state[:batches_total]) if log_batches
          end
        end

        # Shutdown the thread pool gracefully with timeout.
        #
        # Shuts down the pool, waits up to 1 hour for completion, then force-kills
        # if necessary and waits an additional minute for cleanup.
        #
        # @param pool [Concurrent::FixedThreadPool] thread pool instance to shutdown
        # @return [void]
        def shutdown_pool(pool)
          pool.shutdown
          # Wait up to 1 hour, then force-kill and wait a bit more to ensure cleanup
          pool.wait_for_termination(3600) || pool.kill
          pool.wait_for_termination(60)
        end

        # Build a Summary object from aggregated shared state.
        #
        # Calculates total duration and constructs a Summary with all aggregated
        # statistics from parallel batch processing.
        #
        # @param klass [Class] a {SearchEngine::Base} subclass (used for metadata only)
        # @param shared_state [Hash] shared state hash containing aggregated statistics
        # @return [SearchEngine::Indexer::Summary]
        def build_summary(klass, shared_state)
          total_duration_ms = (monotonic_ms - shared_state[:started_at]).round(1)

          Summary.new(
            collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
            status: status_from_counts(shared_state[:success_total], shared_state[:failed_total]),
            batches_total: shared_state[:batches_total],
            docs_total: shared_state[:docs_total],
            success_total: shared_state[:success_total],
            failed_total: shared_state[:failed_total],
            failed_batches_total: shared_state[:failed_batches_total],
            duration_ms_total: total_duration_ms,
            batches: shared_state[:batches]
          )
        end

        def validate_args!(klass, into, enum, action)
          unless klass.is_a?(Class) && klass.ancestors.include?(SearchEngine::Base)
            raise Errors::InvalidParams, 'klass must inherit from SearchEngine::Base'
          end

          raise Errors::InvalidParams, 'into must be a non-empty String' if into.nil? || into.to_s.strip.empty?

          raise Errors::InvalidParams, 'enum must be enumerable' unless enum.respond_to?(:each)

          valid_actions = %i[upsert create update]
          return if valid_actions.include?(action.to_sym)

          raise Errors::InvalidParams,
                "action must be one of :upsert, :create, :update (received #{action.inspect})"
        end

        def normalize_enum(enum)
          enum.is_a?(Enumerator) ? enum : enum.each
        end

        # Estimate total batch count for progress logging.
        #
        # Attempts to estimate batch count for ActiveRecord sources by counting records
        # and dividing by batch_size. Returns nil for other source types or when estimation fails.
        #
        # @param klass [Class] a {SearchEngine::Base} subclass
        # @return [Integer, nil] estimated total batch count or nil if not estimable
        def estimate_total_batches(klass)
          return nil unless klass.is_a?(Class)

          dsl = mapper_dsl_for_klass(klass)
          return nil unless dsl

          source_def = dsl[:source]
          return nil unless source_def
          return nil unless source_def[:type] == :active_record

          model = source_def.dig(:options, :model)
          return nil unless model.respond_to?(:count)

          batch_size = source_def.dig(:options, :batch_size)
          batch_size ||= SearchEngine.config.sources.active_record.batch_size
          batch_size = batch_size.to_i
          return nil unless batch_size.positive?

          begin
            total_records = model.count
            return nil unless total_records.positive?

            (total_records.to_f / batch_size).ceil
          rescue StandardError
            nil
          end
        end
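
        # Worked example for the estimate above: 10_500 records with a batch_size
        # of 2_000 gives (10_500.0 / 2_000).ceil => 6 batches, the last holding
        # the remaining 500 records.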

        def mapper_dsl_for_klass(klass)
          return nil unless klass.instance_variable_defined?(:@__mapper_dsl__)

          klass.instance_variable_get(:@__mapper_dsl__)
        end

        # Import a single batch with error handling and recursive 413 splitting.
        #
        # Public wrapper that delegates to the internal method with batch_index set to nil,
        # indicating the index should be computed from next_index.
        #
        # @param client [SearchEngine::Client] client instance
        # @param collection [String] physical collection name
        # @param raw_batch [Object] batch object convertible to Array
        # @param action [Symbol] :upsert, :create, or :update
        # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
        # @param buffer [String] mutable string buffer for JSONL encoding
        # @param next_index [Proc, Integer] proc that returns next index, or integer index
        # @return [Array<Hash>] array of stats hashes (may contain multiple entries on 413 split)
        def import_batch_with_handling(client:, collection:, raw_batch:, action:, retry_policy:, buffer:, next_index:)
          import_batch_with_handling_internal(
            client: client,
            collection: collection,
            raw_batch: raw_batch,
            action: action,
            retry_policy: retry_policy,
            buffer: buffer,
            next_index: next_index,
            batch_index: nil
          )
        end

        # Internal method for importing a batch with optional batch_index preservation.
        #
        # When batch_index is provided (non-nil), it is used directly, preserving the
        # original batch index for recursive splits on 413 errors. When batch_index is nil,
        # the index is computed from next_index (either by calling the proc or using the integer).
        #
        # @param client [SearchEngine::Client] client instance
        # @param collection [String] physical collection name
        # @param raw_batch [Object] batch object convertible to Array
        # @param action [Symbol] :upsert, :create, or :update
        # @param retry_policy [SearchEngine::Indexer::RetryPolicy] retry policy instance
        # @param buffer [String] mutable string buffer for JSONL encoding
        # @param next_index [Proc, Integer] proc that returns next index, or integer index
        # @param batch_index [Integer, nil] optional pre-computed batch index (for recursive splits)
        # @return [Array<Hash>] array of stats hashes (may contain multiple entries on 413 split)
        def import_batch_with_handling_internal(client:, collection:, raw_batch:, action:, retry_policy:, buffer:,
                                                next_index:, batch_index:)
          docs = BatchPlanner.to_array(raw_batch)
          return [] if docs.empty?

          docs_count, bytes_sent = BatchPlanner.encode_jsonl!(docs, buffer)
          jsonl = buffer.dup
          # Use provided batch_index if available (for recursive splits), otherwise compute from next_index
          idx = batch_index || (next_index.is_a?(Proc) ? next_index.call : next_index)

          started_at = monotonic_ms

          stats = ImportDispatcher.import_batch(
            client: client,
            collection: collection,
            action: action,
            jsonl: jsonl,
            docs_count: docs_count,
            bytes_sent: bytes_sent,
            batch_index: idx,
            retry_policy: retry_policy,
            dry_run: false
          )
          stats[:duration_ms] = (monotonic_ms - started_at).round(1)
          stats[:index] = idx
          [stats]
        rescue Errors::Api => error
          if error.status.to_i == 413 && docs.size > 1
            mid = docs.size / 2
            left = docs[0...mid]
            right = docs[mid..]
            # Preserve the original batch index for both recursive splits
            return import_batch_with_handling_internal(
              client: client,
              collection: collection,
              raw_batch: left,
              action: action,
              retry_policy: retry_policy,
              buffer: buffer,
              next_index: next_index,
              batch_index: idx
            ) + import_batch_with_handling_internal(
              client: client,
              collection: collection,
              raw_batch: right,
              action: action,
              retry_policy: retry_policy,
              buffer: buffer,
              next_index: next_index,
              batch_index: idx
            )
          end

          [failure_stats(idx, docs_count, bytes_sent, error)]
        end
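
        # How a 413 split unwinds (illustrative): a 4_000-document payload rejected
        # with HTTP 413 is halved and re-imported, and every resulting stats Hash
        # keeps the original batch index:
        #
        #   [4_000 docs, index: 7]          -> 413
        #   +- [2_000 docs, index: 7]       -> 413
        #   |  +- [1_000 docs, index: 7]    -> 200
        #   |  +- [1_000 docs, index: 7]    -> 200
        #   +- [2_000 docs, index: 7]       -> 200
        #
        # Splitting stops once a slice succeeds or shrinks to a single document;
        # a lone document that still draws a 413 is recorded via #failure_stats.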

        def failure_stats(idx, docs_count, bytes_sent, error)
          {
            index: idx,
            docs_count: docs_count,
            success_count: 0,
            failure_count: docs_count,
            attempts: 1,
            # Guard: generic StandardErrors reaching this path have no #status
            http_status: error.respond_to?(:status) ? error.status.to_i : 0,
            duration_ms: 0.0,
            bytes_sent: bytes_sent,
            errors_sample: [safe_error_excerpt(error)]
          }
        end

        def safe_error_excerpt(error)
          cls = error&.class&.name
          msg = error&.message.to_s
          "#{cls}: #{msg[0, 200]}"
        end

        def sequence_generator
          idx = -1
          -> { idx += 1 }
        end

        def validate_soft_batch_size!(batch_size, docs_count)
          limit = batch_size&.to_i
          return unless limit&.positive?
          return if docs_count.to_i <= limit

          Kernel.warn("[search_engine] BulkImport batch exceeded soft limit: size=#{docs_count}, limit=#{limit}")
        end

        def status_from_counts(success_total, failed_total)
          if failed_total.positive? && success_total.positive?
            :partial
          elsif failed_total.positive?
            :failed
          else
            :ok
          end
        end

        def monotonic_ms
          SearchEngine::Instrumentation.monotonic_ms
        end

        def log_batch(stats, batch_number)
          batch_status = batch_status_from_stats(stats)
          status_color = SearchEngine::Logging::Color.for_status(batch_status)

          prefix = batch_number == 1 ? ' single → ' : ' '
          line = +prefix
          line << SearchEngine::Logging::Color.apply("status=#{batch_status}", status_color) << ' '
          line << "docs=#{stats[:docs_count]}" << ' '
          success_count = stats[:success_count].to_i
          success_str = "success=#{success_count}"
          line << (
            success_count.positive? ? SearchEngine::Logging::Color.bold(success_str) : success_str
          ) << ' '
          failed_count = stats[:failure_count].to_i
          failed_str = "failed=#{failed_count}"
          line << (failed_count.positive? ? SearchEngine::Logging::Color.apply(failed_str, :red) : failed_str) << ' '
          line << "batch=#{batch_number} "
          line << "duration_ms=#{stats[:duration_ms]}"

          # Extract sample error from batch stats
          sample_err = extract_batch_sample_error(stats)
          line << " sample_error=#{sample_err.inspect}" if sample_err

          puts(line)
        end
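
        # A fully successful batch therefore logs a line like (colors omitted,
        # values illustrative):
        #
        #    status=ok docs=2000 success=2000 failed=0 batch=3 duration_ms=152.4
        #
        # and a failing batch appends sample_error= with the first non-empty
        # excerpt from errors_sample.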

        def extract_batch_sample_error(stats)
          samples = stats[:errors_sample] || stats['errors_sample']
          return nil unless samples.is_a?(Array) && samples.any?

          samples.each do |msg|
            s = msg.to_s
            return s unless s.strip.empty?
          end
          nil
        end

        def batch_status_from_stats(stats)
          success_count = stats[:success_count].to_i
          failure_count = stats[:failure_count].to_i

          if failure_count.positive? && success_count.positive?
            :partial
          elsif failure_count.positive?
            :failed
          else
            :ok
          end
        end
      end
    end
  end
end