search-engine-for-typesense 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (139) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE +21 -0
  3. data/README.md +148 -0
  4. data/app/search_engine/search_engine/app_info.rb +11 -0
  5. data/app/search_engine/search_engine/index_partition_job.rb +170 -0
  6. data/lib/generators/search_engine/install/install_generator.rb +20 -0
  7. data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
  8. data/lib/generators/search_engine/model/model_generator.rb +86 -0
  9. data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
  10. data/lib/search-engine-for-typesense.rb +12 -0
  11. data/lib/search_engine/active_record_syncable.rb +247 -0
  12. data/lib/search_engine/admin/stopwords.rb +125 -0
  13. data/lib/search_engine/admin/synonyms.rb +125 -0
  14. data/lib/search_engine/admin.rb +12 -0
  15. data/lib/search_engine/ast/and.rb +52 -0
  16. data/lib/search_engine/ast/binary_op.rb +75 -0
  17. data/lib/search_engine/ast/eq.rb +19 -0
  18. data/lib/search_engine/ast/group.rb +18 -0
  19. data/lib/search_engine/ast/gt.rb +12 -0
  20. data/lib/search_engine/ast/gte.rb +12 -0
  21. data/lib/search_engine/ast/in.rb +28 -0
  22. data/lib/search_engine/ast/lt.rb +12 -0
  23. data/lib/search_engine/ast/lte.rb +12 -0
  24. data/lib/search_engine/ast/matches.rb +55 -0
  25. data/lib/search_engine/ast/node.rb +176 -0
  26. data/lib/search_engine/ast/not_eq.rb +13 -0
  27. data/lib/search_engine/ast/not_in.rb +24 -0
  28. data/lib/search_engine/ast/or.rb +52 -0
  29. data/lib/search_engine/ast/prefix.rb +51 -0
  30. data/lib/search_engine/ast/raw.rb +41 -0
  31. data/lib/search_engine/ast/unary_op.rb +43 -0
  32. data/lib/search_engine/ast.rb +101 -0
  33. data/lib/search_engine/base/creation.rb +727 -0
  34. data/lib/search_engine/base/deletion.rb +80 -0
  35. data/lib/search_engine/base/display_coercions.rb +36 -0
  36. data/lib/search_engine/base/hydration.rb +312 -0
  37. data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
  38. data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
  39. data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
  40. data/lib/search_engine/base/index_maintenance.rb +459 -0
  41. data/lib/search_engine/base/indexing_dsl.rb +255 -0
  42. data/lib/search_engine/base/joins.rb +479 -0
  43. data/lib/search_engine/base/model_dsl.rb +472 -0
  44. data/lib/search_engine/base/presets.rb +43 -0
  45. data/lib/search_engine/base/pretty_printer.rb +315 -0
  46. data/lib/search_engine/base/relation_delegation.rb +42 -0
  47. data/lib/search_engine/base/scopes.rb +113 -0
  48. data/lib/search_engine/base/updating.rb +92 -0
  49. data/lib/search_engine/base.rb +38 -0
  50. data/lib/search_engine/bulk.rb +284 -0
  51. data/lib/search_engine/cache.rb +33 -0
  52. data/lib/search_engine/cascade.rb +531 -0
  53. data/lib/search_engine/cli/doctor.rb +631 -0
  54. data/lib/search_engine/cli/support.rb +217 -0
  55. data/lib/search_engine/cli.rb +222 -0
  56. data/lib/search_engine/client/http_adapter.rb +63 -0
  57. data/lib/search_engine/client/request_builder.rb +92 -0
  58. data/lib/search_engine/client/services/base.rb +74 -0
  59. data/lib/search_engine/client/services/collections.rb +161 -0
  60. data/lib/search_engine/client/services/documents.rb +214 -0
  61. data/lib/search_engine/client/services/operations.rb +152 -0
  62. data/lib/search_engine/client/services/search.rb +190 -0
  63. data/lib/search_engine/client/services.rb +29 -0
  64. data/lib/search_engine/client.rb +765 -0
  65. data/lib/search_engine/client_options.rb +20 -0
  66. data/lib/search_engine/collection_resolver.rb +191 -0
  67. data/lib/search_engine/collections_graph.rb +330 -0
  68. data/lib/search_engine/compiled_params.rb +143 -0
  69. data/lib/search_engine/compiler.rb +383 -0
  70. data/lib/search_engine/config/observability.rb +27 -0
  71. data/lib/search_engine/config/presets.rb +92 -0
  72. data/lib/search_engine/config/selection.rb +16 -0
  73. data/lib/search_engine/config/typesense.rb +48 -0
  74. data/lib/search_engine/config/validators.rb +97 -0
  75. data/lib/search_engine/config.rb +917 -0
  76. data/lib/search_engine/console_helpers.rb +130 -0
  77. data/lib/search_engine/deletion.rb +103 -0
  78. data/lib/search_engine/dispatcher.rb +125 -0
  79. data/lib/search_engine/dsl/parser.rb +582 -0
  80. data/lib/search_engine/engine.rb +167 -0
  81. data/lib/search_engine/errors.rb +290 -0
  82. data/lib/search_engine/filters/sanitizer.rb +189 -0
  83. data/lib/search_engine/hydration/materializers.rb +808 -0
  84. data/lib/search_engine/hydration/selection_context.rb +96 -0
  85. data/lib/search_engine/indexer/batch_planner.rb +76 -0
  86. data/lib/search_engine/indexer/bulk_import.rb +626 -0
  87. data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
  88. data/lib/search_engine/indexer/retry_policy.rb +103 -0
  89. data/lib/search_engine/indexer.rb +747 -0
  90. data/lib/search_engine/instrumentation.rb +308 -0
  91. data/lib/search_engine/joins/guard.rb +202 -0
  92. data/lib/search_engine/joins/resolver.rb +95 -0
  93. data/lib/search_engine/logging/color.rb +78 -0
  94. data/lib/search_engine/logging/format_helpers.rb +92 -0
  95. data/lib/search_engine/logging/partition_progress.rb +53 -0
  96. data/lib/search_engine/logging_subscriber.rb +388 -0
  97. data/lib/search_engine/mapper.rb +785 -0
  98. data/lib/search_engine/multi.rb +286 -0
  99. data/lib/search_engine/multi_result.rb +186 -0
  100. data/lib/search_engine/notifications/compact_logger.rb +675 -0
  101. data/lib/search_engine/observability.rb +162 -0
  102. data/lib/search_engine/operations.rb +58 -0
  103. data/lib/search_engine/otel.rb +227 -0
  104. data/lib/search_engine/partitioner.rb +128 -0
  105. data/lib/search_engine/ranking_plan.rb +118 -0
  106. data/lib/search_engine/registry.rb +158 -0
  107. data/lib/search_engine/relation/compiler.rb +711 -0
  108. data/lib/search_engine/relation/deletion.rb +37 -0
  109. data/lib/search_engine/relation/dsl/filters.rb +624 -0
  110. data/lib/search_engine/relation/dsl/selection.rb +240 -0
  111. data/lib/search_engine/relation/dsl.rb +903 -0
  112. data/lib/search_engine/relation/dx/dry_run.rb +59 -0
  113. data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
  114. data/lib/search_engine/relation/dx.rb +231 -0
  115. data/lib/search_engine/relation/materializers.rb +118 -0
  116. data/lib/search_engine/relation/options.rb +138 -0
  117. data/lib/search_engine/relation/state.rb +274 -0
  118. data/lib/search_engine/relation/updating.rb +44 -0
  119. data/lib/search_engine/relation.rb +623 -0
  120. data/lib/search_engine/result.rb +664 -0
  121. data/lib/search_engine/schema.rb +1083 -0
  122. data/lib/search_engine/sources/active_record_source.rb +185 -0
  123. data/lib/search_engine/sources/base.rb +62 -0
  124. data/lib/search_engine/sources/lambda_source.rb +55 -0
  125. data/lib/search_engine/sources/sql_source.rb +196 -0
  126. data/lib/search_engine/sources.rb +71 -0
  127. data/lib/search_engine/stale_rules.rb +160 -0
  128. data/lib/search_engine/test/minitest_assertions.rb +57 -0
  129. data/lib/search_engine/test/offline_client.rb +134 -0
  130. data/lib/search_engine/test/rspec_matchers.rb +77 -0
  131. data/lib/search_engine/test/stub_client.rb +201 -0
  132. data/lib/search_engine/test.rb +66 -0
  133. data/lib/search_engine/test_autoload.rb +8 -0
  134. data/lib/search_engine/update.rb +35 -0
  135. data/lib/search_engine/version.rb +7 -0
  136. data/lib/search_engine.rb +332 -0
  137. data/lib/tasks/search_engine.rake +501 -0
  138. data/lib/tasks/search_engine_doctor.rake +16 -0
  139. metadata +225 -0
@@ -0,0 +1,747 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'json'
4
+ require 'timeout'
5
+ require 'digest'
6
+ require 'time'
7
+
8
+ module SearchEngine
9
+ # Batch importer for streaming JSONL documents into a physical collection.
10
+ #
11
+ # Emits one AS::Notifications event per attempt: "search_engine.indexer.batch_import".
12
+ # Works strictly batch-by-batch to avoid memory growth and retries transient
13
+ # failures with exponential backoff and jitter.
14
+ class Indexer
15
    # Aggregated summary of an import run.
    #
    # Built keyword-first (keyword_init: true), so all members are passed by name.
    #
    # @!attribute [r] collection
    #   @return [String] target collection name for the run
    # @!attribute [r] status
    #   @return [Symbol] overall run status
    # @!attribute [r] batches_total
    #   @return [Integer] number of batches attempted
    # @!attribute [r] docs_total
    #   @return [Integer] number of documents submitted across all batches
    # @!attribute [r] success_total
    #   @return [Integer] documents imported successfully
    # @!attribute [r] failed_total
    #   @return [Integer] documents that failed to import
    # @!attribute [r] failed_batches_total
    #   @return [Integer] count of batch stats with failures
    # @!attribute [r] duration_ms_total
    #   @return [Float] total wall time in milliseconds
    # @!attribute [r] batches
    #   @return [Array<Hash>] per-batch stats (index, counts, status, timing)
    Summary = Struct.new(
      :collection,
      :status,
      :batches_total,
      :docs_total,
      :success_total,
      :failed_total,
      :failed_batches_total,
      :duration_ms_total,
      :batches,
      keyword_init: true
    )
+
32
    # Rebuild a single partition end-to-end using the model's partitioning + mapper.
    #
    # The flow is:
    # - Resolve a partition fetch enumerator from the partitioning DSL (or fall back to source adapter)
    # - Optionally run before/after hooks with configured timeouts
    # - Map each batch to documents and stream-import them into the target collection
    #
    # @param klass [Class] a {SearchEngine::Base} subclass
    # @param partition [Object] opaque partition key as defined by the DSL/source
    # @param into [String, nil] target collection; defaults to resolver or the logical collection alias
    # @return [Summary]
    # @raise [SearchEngine::Errors::InvalidParams]
    # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning`
    def self.rebuild_partition!(klass, partition:, into: nil)
      raise Errors::InvalidParams, 'klass must be a Class' unless klass.is_a?(Class)
      unless klass.ancestors.include?(SearchEngine::Base)
        raise Errors::InvalidParams, 'klass must inherit from SearchEngine::Base'
      end

      compiled_partitioner = SearchEngine::Partitioner.for(klass)
      mapper = SearchEngine::Mapper.for(klass)
      unless mapper
        raise Errors::InvalidParams,
              "mapper is not defined for #{klass.name}. Define it via `index do ... map { ... } end`."
      end

      target_into = resolve_into!(klass, partition: partition, into: into)
      rows_enum = rows_enumerator_for(klass, partition: partition, compiled_partitioner: compiled_partitioner)

      # Hooks live as instance variables on the compiled partitioner; both are optional.
      before_hook = compiled_partitioner&.instance_variable_get(:@before_hook_proc)
      after_hook = compiled_partitioner&.instance_variable_get(:@after_hook_proc)

      started_at = monotonic_ms
      pfields = SearchEngine::Observability.partition_fields(partition)
      dispatch_ctx = SearchEngine::Instrumentation.context
      instrument_partition_start(klass, target_into, pfields, dispatch_ctx)

      # Lazy enumerator: rows are mapped batch-by-batch to avoid memory growth.
      docs_enum = build_docs_enum(rows_enum, mapper)

      dsl = mapper_dsl_for(klass)
      max_parallel = dsl&.dig(:max_parallel) || 1

      summary = nil
      SearchEngine::Instrumentation.with_context(into: target_into) do
        run_before_hook_if_present(before_hook, partition, klass)

        # Per-batch logging is suppressed for partitioned runs (partition present)
        # to keep logs compact; full-collection runs keep it on.
        summary = import!(
          klass,
          into: target_into,
          enum: docs_enum,
          batch_size: nil,
          action: :upsert,
          log_batches: partition.nil?,
          max_parallel: max_parallel
        )

        run_after_hook_if_present(after_hook, partition)
      end

      instrument_partition_finish(klass, target_into, pfields, summary, started_at)

      summary
    end
95
+
96
    # Delete stale documents from a physical collection using model stale rules.
    #
    # Safety guards run in order and short-circuit with a skip summary:
    # feature disabled, no stale rules defined, empty compiled filter, or a
    # suspicious filter under strict mode.
    #
    # @param klass [Class] a {SearchEngine::Base} subclass
    # @param partition [Object, nil]
    # @param into [String, nil]
    # @param dry_run [Boolean] when true, only estimates matches; deletes nothing
    # @return [Hash]
    # @raise [SearchEngine::Errors::InvalidParams]
    # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#stale-deletes`
    def self.delete_stale!(klass, partition: nil, into: nil, dry_run: false)
      validate_stale_args!(klass)

      cfg = SearchEngine.config
      sd_cfg = cfg.stale_deletes
      target_into = resolve_into!(klass, partition: partition, into: into)

      skipped = skip_if_disabled(klass, sd_cfg, target_into, partition)
      return skipped if skipped

      defined = SearchEngine::StaleRules.defined_for?(klass)
      filters = defined ? SearchEngine::StaleRules.compile_filters(klass, partition: partition) : []
      # Drop nil and blank filter fragments before merging.
      filters.compact!
      filters.reject! { |f| f.to_s.strip.empty? }
      filter = SearchEngine::StaleRules.merge_filters(filters)

      skipped = skip_if_no_filter_defined(defined, klass, target_into, partition)
      return skipped if skipped

      skipped = skip_if_empty_filter(filter, klass, target_into, partition)
      return skipped if skipped

      skipped = skip_if_strict_blocked(filter, sd_cfg, klass, target_into, partition)
      return skipped if skipped

      # Hash of the filter is logged instead of the raw filter text.
      fhash = Digest::SHA1.hexdigest(filter)
      started = monotonic_ms
      instrument_started(klass: klass, into: target_into, partition: partition, filter_hash: fhash)

      if dry_run
        estimated = estimate_found_if_enabled(cfg, sd_cfg, target_into, filter)
        return dry_run_summary(klass, target_into, partition, filter, fhash, started, estimated)
      end

      deleted_count = perform_delete_and_count(target_into, filter, sd_cfg.timeout_ms)
      duration = monotonic_ms - started
      instrument_finished(
        klass: klass,
        into: target_into,
        partition: partition,
        duration_ms: duration,
        deleted_count: deleted_count
      )
      ok_summary(klass, target_into, partition, filter, fhash, duration, deleted_count)
    rescue Errors::Error => error
      # `started` may still be nil if the failure happened before instrumentation.
      duration = monotonic_ms - (started || monotonic_ms)
      instrument_error(error, klass: klass, into: target_into, partition: partition)
      failed_summary(klass, target_into, partition, filter, fhash, duration, error)
    end
154
+
155
+ # Import pre-batched documents using JSONL bulk import.
156
+ #
157
+ # @param klass [Class] a SearchEngine::Base subclass (reserved for future mappers)
158
+ # @param into [String] target physical collection name
159
+ # @param enum [Enumerable] yields batches (Array-like) of Hash documents
160
+ # @param batch_size [Integer, nil] soft guard only; not used unless 413 handling
161
+ # @param action [Symbol] :upsert (default), :create, or :update
162
+ # @param log_batches [Boolean] whether to log each batch as it completes (default: true)
163
+ # @param max_parallel [Integer] maximum parallel threads for batch processing (default: 1)
164
+ # @return [Summary]
165
+ # @raise [SearchEngine::Errors::InvalidParams]
166
+ # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer`
167
+ # @see `https://typesense.org/docs/latest/api/documents.html#import-documents`
168
+ def self.import!(klass, into:, enum:, batch_size: nil, action: :upsert, log_batches: true, max_parallel: 1)
169
+ SearchEngine::Indexer::BulkImport.call(
170
+ klass: klass,
171
+ into: into,
172
+ enum: enum,
173
+ batch_size: batch_size,
174
+ action: action,
175
+ log_batches: log_batches,
176
+ max_parallel: max_parallel
177
+ )
178
+ end
179
+
180
+ class << self
181
+ private
182
+
183
+ def validate_stale_args!(klass)
184
+ raise Errors::InvalidParams, 'klass must be a Class' unless klass.is_a?(Class)
185
+ return if klass.ancestors.include?(SearchEngine::Base)
186
+
187
+ raise Errors::InvalidParams, 'klass must inherit from SearchEngine::Base'
188
+ end
189
+
190
+ def skip_if_disabled(klass, sd_cfg, into, partition)
191
+ return nil if sd_cfg&.enabled
192
+
193
+ instrument_stale(:skipped, reason: :disabled, klass: klass, into: into, partition: partition)
194
+ skip_summary(klass, into, partition)
195
+ end
196
+
197
+ def skip_if_no_filter_defined(defined, klass, into, partition)
198
+ return nil if defined
199
+
200
+ instrument_stale(:skipped, reason: :no_filter_defined, klass: klass, into: into, partition: partition)
201
+ skip_summary(klass, into, partition)
202
+ end
203
+
204
+ def skip_if_empty_filter(filter, klass, into, partition)
205
+ return nil if filter && !filter.to_s.strip.empty?
206
+
207
+ instrument_stale(:skipped, reason: :empty_filter, klass: klass, into: into, partition: partition)
208
+ skip_summary(klass, into, partition)
209
+ end
210
+
211
+ def skip_if_strict_blocked(filter, sd_cfg, klass, into, partition)
212
+ return nil unless sd_cfg.strict_mode && suspicious_filter?(filter)
213
+
214
+ instrument_stale(:skipped, reason: :strict_blocked, klass: klass, into: into, partition: partition)
215
+ {
216
+ status: :skipped,
217
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
218
+ into: into,
219
+ partition: partition,
220
+ filter_by: filter,
221
+ filter_hash: Digest::SHA1.hexdigest(filter),
222
+ duration_ms: 0.0,
223
+ deleted_count: 0,
224
+ estimated_found: nil
225
+ }
226
+ end
227
+
228
      # Best-effort estimate of how many documents the stale filter matches.
      #
      # Runs a zero-page wildcard search with the filter applied and reads the
      # `found` counter. Returns nil when estimation is disabled, when no
      # default_query_by is configured, or on any error (best-effort only).
      #
      # @param cfg [Object] global config (provides default_query_by)
      # @param sd_cfg [Object] stale-delete config (provides estimation_enabled)
      # @param into [String] physical collection to query
      # @param filter [String] compiled filter_by expression
      # @return [Integer, nil]
      def estimate_found_if_enabled(cfg, sd_cfg, into, filter)
        return nil unless sd_cfg.estimation_enabled && cfg.default_query_by && !cfg.default_query_by.to_s.strip.empty?

        client = SearchEngine.client
        # per_page: 0 — we only need the match count, not the hits themselves.
        payload = { q: '*', query_by: cfg.default_query_by, per_page: 0, filter_by: filter }
        params = SearchEngine::CompiledParams.new(payload)
        res = client.search(collection: into, params: params, url_opts: {})
        res&.found
      rescue StandardError
        # Estimation is advisory; swallow failures and report "unknown".
        nil
      end
239
+
240
+ def perform_delete_and_count(into, filter, timeout_ms)
241
+ client = SearchEngine.client
242
+ resp = client.delete_documents_by_filter(
243
+ collection: into,
244
+ filter_by: filter,
245
+ timeout_ms: timeout_ms
246
+ )
247
+ (resp && (resp[:num_deleted] || resp[:deleted] || resp[:numDeleted])).to_i
248
+ end
249
+
250
+ def dry_run_summary(klass, into, partition, filter, filter_hash, started, estimated)
251
+ duration = monotonic_ms - started
252
+ {
253
+ status: :ok,
254
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
255
+ into: into,
256
+ partition: partition,
257
+ filter_by: filter,
258
+ filter_hash: filter_hash,
259
+ duration_ms: duration.round(1),
260
+ deleted_count: 0,
261
+ estimated_found: estimated,
262
+ will_delete: true
263
+ }
264
+ end
265
+
266
+ def ok_summary(klass, into, partition, filter, filter_hash, duration, deleted_count)
267
+ {
268
+ status: :ok,
269
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
270
+ into: into,
271
+ partition: partition,
272
+ filter_by: filter,
273
+ filter_hash: filter_hash,
274
+ duration_ms: duration.round(1),
275
+ deleted_count: deleted_count,
276
+ estimated_found: nil
277
+ }
278
+ end
279
+
280
+ def failed_summary(klass, into, partition, filter, filter_hash, duration, error)
281
+ {
282
+ status: :failed,
283
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
284
+ into: into,
285
+ partition: partition,
286
+ filter_by: filter,
287
+ filter_hash: filter_hash,
288
+ duration_ms: duration.round(1),
289
+ deleted_count: 0,
290
+ estimated_found: nil,
291
+ error_class: error.class.name,
292
+ message_truncated: error.message.to_s[0, 200]
293
+ }
294
+ end
295
+
296
+ def skip_summary(klass, into, partition)
297
+ {
298
+ status: :skipped,
299
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
300
+ into: into,
301
+ partition: partition,
302
+ filter_by: nil,
303
+ filter_hash: nil,
304
+ duration_ms: 0.0,
305
+ deleted_count: 0,
306
+ estimated_found: nil
307
+ }
308
+ end
309
+
310
      # Resolve the row-batch enumerator for a partition.
      #
      # Prefers the compiled partitioner's fetch enumerator; otherwise falls
      # back to a source adapter declared in the mapper DSL. Raises when neither
      # is available.
      #
      # @param klass [Class] a SearchEngine::Base subclass
      # @param partition [Object] opaque partition key
      # @param compiled_partitioner [Object, nil]
      # @return [Enumerable] yields batches of rows
      # @raise [Errors::InvalidParams] when no fetch strategy is configured
      def rows_enumerator_for(klass, partition:, compiled_partitioner:)
        if compiled_partitioner
          compiled_partitioner.partition_fetch_enum(partition)
        else
          dsl = mapper_dsl_for(klass)
          source_def = dsl && dsl[:source]
          unless source_def
            raise Errors::InvalidParams,
                  'No partition_fetch defined and no source adapter provided. Define one in the DSL.'
          end
          # Build the adapter fresh per call from the declared type/options/block.
          adapter = SearchEngine::Sources.build(source_def[:type], **(source_def[:options] || {}), &source_def[:block])
          adapter.each_batch(partition: partition)
        end
      end
324
+
325
+ def resolve_into!(klass, partition:, into:)
326
+ return into if into && !into.to_s.strip.empty?
327
+
328
+ resolver = SearchEngine.config.partitioning&.default_into_resolver
329
+ if resolver.respond_to?(:arity)
330
+ case resolver.arity
331
+ when 1
332
+ val = resolver.call(klass)
333
+ return val if val && !val.to_s.strip.empty?
334
+ when 2, -1
335
+ val = resolver.call(klass, partition)
336
+ return val if val && !val.to_s.strip.empty?
337
+ end
338
+ elsif resolver
339
+ val = resolver.call(klass)
340
+ return val if val && !val.to_s.strip.empty?
341
+ end
342
+
343
+ name = if klass.respond_to?(:collection)
344
+ klass.collection
345
+ else
346
+ klass.name.to_s
347
+ end
348
+ name.to_s
349
+ end
350
+
351
+ def run_hook_with_timeout(proc_obj, partition, timeout_ms:)
352
+ return proc_obj.call(partition) unless timeout_ms&.to_i&.positive?
353
+
354
+ Timeout.timeout(timeout_ms.to_f / 1000.0) do
355
+ proc_obj.call(partition)
356
+ end
357
+ end
358
+
359
      # Import one batch with retry + payload-too-large handling.
      #
      # On HTTP 413 the batch is split in half and each half is re-imported
      # recursively (until single-document batches). Any other terminal API
      # error yields a single all-failed stats entry rather than raising.
      #
      # @param client [Object] Typesense client wrapper
      # @param collection [String] physical collection
      # @param docs [Array<Hash>] documents for this batch
      # @param action [Symbol] :upsert/:create/:update
      # @param next_index [Proc] returns a fresh monotonically increasing batch index
      # @return [Array<Hash>] one stats hash per attempted (sub-)batch
      def import_batch_with_handling(client, collection, docs, action, next_index)
        # Mutable buffer is reused by encode_jsonl! to avoid reallocating.
        buffer = +''
        docs_count = encode_jsonl!(docs, buffer)
        bytes_sent = buffer.bytesize
        idx = next_index.call

        begin
          attempt_stats = with_retries do |attempt|
            perform_attempt(client, collection, action, buffer, docs_count, bytes_sent, idx, attempt)
          end
          [attempt_stats]
        rescue Errors::Api => error
          if error.status.to_i == 413 && docs.size > 1
            # Payload too large: halve and recurse; each half re-encodes its own JSONL.
            mid = docs.size / 2
            left = docs[0...mid]
            right = docs[mid..]
            import_batch_with_handling(client, collection, left, action, next_index) +
              import_batch_with_handling(client, collection, right, action, next_index)
          else
            # Terminal failure for this batch: report all docs failed, keep going.
            [
              {
                index: idx,
                docs_count: docs_count,
                success_count: 0,
                failure_count: docs_count,
                attempts: 1,
                http_status: error.status.to_i,
                duration_ms: 0.0,
                bytes_sent: bytes_sent,
                errors_sample: [safe_error_excerpt(error)]
              }
            ]
          end
        end
      end
394
+
395
      # Execute a single import attempt and return its stats hash.
      #
      # When ActiveSupport::Notifications is available the request is wrapped in
      # a "search_engine.indexer.batch_import" event whose payload is filled in
      # after the response is parsed; otherwise the import runs bare.
      #
      # @param client [Object] Typesense client wrapper
      # @param jsonl [String] pre-encoded JSONL body
      # @param idx [Integer] batch index for reporting
      # @param attempt [Integer] 1-based attempt number
      # @return [Hash] per-attempt stats (counts, status, duration, samples)
      def perform_attempt(client, collection, action, jsonl, docs_count, bytes_sent, idx, attempt)
        start = monotonic_ms
        success_count = 0
        failure_count = 0
        http_status = 200
        error_sample = []

        if defined?(ActiveSupport::Notifications)
          # Counts are nil until the response is parsed inside the block.
          se_payload = {
            collection: SearchEngine::Instrumentation.context[:collection] || collection,
            into: collection,
            batch_index: idx,
            docs_count: docs_count,
            success_count: nil,
            failure_count: nil,
            attempts: attempt,
            http_status: nil,
            bytes_sent: bytes_sent,
            transient_retry: attempt > 1,
            retry_after_s: nil,
            error_sample: nil
          }
          SearchEngine::Instrumentation.instrument('search_engine.indexer.batch_import', se_payload) do |ctx|
            raw = client.import_documents(collection: collection, jsonl: jsonl, action: action)
            success_count, failure_count, error_sample = parse_import_response(raw)
            http_status = 200
            ctx[:success_count] = success_count
            ctx[:failure_count] = failure_count
            ctx[:http_status] = http_status
          end
        else
          raw = client.import_documents(collection: collection, jsonl: jsonl, action: action)
          success_count, failure_count, error_sample = parse_import_response(raw)
        end

        duration = monotonic_ms - start
        {
          index: idx,
          docs_count: docs_count,
          success_count: success_count,
          failure_count: failure_count,
          attempts: attempt,
          http_status: http_status,
          duration_ms: duration.round(1),
          bytes_sent: bytes_sent,
          errors_sample: error_sample
        }
      end
443
+
444
      # Run the given block up to the configured number of attempts.
      #
      # Retries only transient failures: timeouts, connection errors, and API
      # errors with a transient HTTP status (429/5xx). Waits between attempts
      # with exponential backoff plus jitter. The last attempt re-raises.
      #
      # @yieldparam attempt [Integer] 1-based attempt number
      # @return [Object] the block's value from the first successful attempt
      def with_retries
        cfg = SearchEngine.config.indexer
        # Each setting falls back to a default when absent or non-positive.
        attempts = cfg&.retries && cfg.retries[:attempts].to_i.positive? ? cfg.retries[:attempts].to_i : 3
        base = cfg&.retries && cfg.retries[:base].to_f.positive? ? cfg.retries[:base].to_f : 0.5
        max = cfg&.retries && cfg.retries[:max].to_f.positive? ? cfg.retries[:max].to_f : 5.0
        jitter = cfg&.retries && cfg.retries[:jitter_fraction].to_f >= 0 ? cfg.retries[:jitter_fraction].to_f : 0.2

        (1..attempts).each do |i|
          return yield(i)
        rescue Errors::Timeout, Errors::Connection
          raise if i >= attempts

          sleep_with_backoff(i, base: base, max: max, jitter_fraction: jitter)
        rescue Errors::Api => error
          code = error.status.to_i
          # Non-transient API errors (4xx other than 429) propagate immediately.
          raise unless transient_status?(code)
          raise if i >= attempts

          sleep_with_backoff(i, base: base, max: max, jitter_fraction: jitter)
        end
      end
465
+
466
+ def sleep_with_backoff(attempt, base:, max:, jitter_fraction:)
467
+ exp = [base * (2 ** (attempt - 1)), max].min
468
+ jitter = exp * jitter_fraction
469
+ delta = rand(-jitter..jitter)
470
+ sleep_time = exp + delta
471
+ sleep(sleep_time) if sleep_time.positive?
472
+ end
473
+
474
+ def transient_status?(code)
475
+ return true if code == 429
476
+ return true if code >= 500 && code <= 599
477
+
478
+ false
479
+ end
480
+
481
+ def to_array(batch)
482
+ return batch if batch.is_a?(Array)
483
+
484
+ batch.respond_to?(:to_a) ? batch.to_a : Array(batch)
485
+ end
486
+
487
      # Serialize +docs+ into +buffer+ as JSONL (one JSON object per line).
      #
      # Side effects: clears and refills +buffer+ in place, and mutates each
      # Hash document by writing :doc_updated_at (epoch seconds). The final
      # line carries no trailing newline.
      #
      # @param docs [Array<Hash>] documents; each must have an :id/'id' key
      # @param buffer [String] mutable string reused across batches
      # @return [Integer] number of documents encoded
      # @raise [Errors::InvalidParams] for non-Hash docs or docs missing an id
      def encode_jsonl!(docs, buffer)
        count = 0
        buffer.clear
        docs.each do |raw|
          doc = ensure_hash_document(raw)
          ensure_id!(doc)
          # Force system timestamp field just before serialization; developers cannot override.
          now_i = if defined?(Time) && defined?(Time.zone) && Time.zone
                    Time.zone.now.to_i
                  else
                    Time.now.to_i
                  end
          doc[:doc_updated_at] = now_i if doc.is_a?(Hash)
          buffer << JSON.generate(doc)
          # Newline between documents only — not after the last one.
          buffer << "\n" if count < (docs.size - 1)
          count += 1
        end
        count
      end
506
+
507
+ def ensure_hash_document(obj)
508
+ if obj.is_a?(Hash)
509
+ obj
510
+ else
511
+ raise Errors::InvalidParams,
512
+ 'Indexer requires batches of Hash-like documents with at least an :id key. ' \
513
+ 'Mapping DSL is not available yet. See https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
514
+ end
515
+ end
516
+
517
+ def ensure_id!(doc)
518
+ has_id = doc.key?(:id) || doc.key?('id')
519
+ raise Errors::InvalidParams, 'document is missing required id' unless has_id
520
+ end
521
+
522
+ def parse_import_response(raw)
523
+ return parse_from_string(raw) if raw.is_a?(String)
524
+ return parse_from_array(raw) if raw.is_a?(Array)
525
+
526
+ [0, 0, []]
527
+ end
528
+
529
+ def parse_from_string(str)
530
+ success = 0
531
+ failure = 0
532
+ samples = []
533
+
534
+ str.each_line do |line|
535
+ line = line.strip
536
+ next if line.empty?
537
+
538
+ h = safe_parse_json(line)
539
+ unless h
540
+ failure += 1
541
+ samples << 'invalid-json-line'
542
+ next
543
+ end
544
+
545
+ if truthy?(h['success'] || h[:success])
546
+ success += 1
547
+ else
548
+ failure += 1
549
+ msg = h['error'] || h[:error] || h['message'] || h[:message]
550
+ samples << msg.to_s[0, 200] if msg
551
+ end
552
+ end
553
+
554
+ [success, failure, samples[0, 5]]
555
+ end
556
+
557
+ def parse_from_array(arr)
558
+ success = 0
559
+ failure = 0
560
+ samples = []
561
+
562
+ arr.each do |h|
563
+ if h.is_a?(Hash) && truthy?(h['success'] || h[:success])
564
+ success += 1
565
+ else
566
+ failure += 1
567
+ msg = h.is_a?(Hash) ? (h['error'] || h[:error] || h['message'] || h[:message]) : nil
568
+ samples << msg.to_s[0, 200] if msg
569
+ end
570
+ end
571
+
572
+ [success, failure, samples[0, 5]]
573
+ end
574
+
575
+ def safe_parse_json(line)
576
+ JSON.parse(line)
577
+ rescue StandardError
578
+ nil
579
+ end
580
+
581
+ def truthy?(val)
582
+ val == true || val.to_s.downcase == 'true'
583
+ end
584
+
585
+ def safe_error_excerpt(error)
586
+ cls = error.class.name
587
+ msg = error.message.to_s
588
+ "#{cls}: #{msg[0, 200]}"
589
+ end
590
+
591
      # Current monotonic clock reading in milliseconds, delegated to the
      # shared Instrumentation helper so all timings use the same source.
      def monotonic_ms
        SearchEngine::Instrumentation.monotonic_ms
      end
594
+
595
+ def mapper_dsl_for(klass)
596
+ return unless klass.instance_variable_defined?(:@__mapper_dsl__)
597
+
598
+ klass.instance_variable_get(:@__mapper_dsl__)
599
+ end
600
+
601
+ def instrument_started(klass:, into:, partition:, filter_hash:)
602
+ return unless defined?(ActiveSupport::Notifications)
603
+
604
+ payload = {
605
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
606
+ into: into,
607
+ partition: partition,
608
+ filter_hash: filter_hash
609
+ }
610
+ ActiveSupport::Notifications.instrument('search_engine.stale_deletes.started', payload) {}
611
+ end
612
+
613
+ def instrument_finished(klass:, into:, partition:, duration_ms:, deleted_count:)
614
+ return unless defined?(ActiveSupport::Notifications)
615
+
616
+ payload = {
617
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
618
+ into: into,
619
+ partition: partition,
620
+ duration_ms: duration_ms.round(1),
621
+ deleted_count: deleted_count
622
+ }
623
+ ActiveSupport::Notifications.instrument('search_engine.stale_deletes.finished', payload) {}
624
+ pf = SearchEngine::Observability.partition_fields(partition)
625
+ SearchEngine::Instrumentation.instrument('search_engine.indexer.delete_stale', payload.merge(partition_hash: pf[:partition_hash], status: 'ok')) {}
626
+ end
627
+
628
+ def instrument_error(error, klass:, into:, partition:)
629
+ return unless defined?(ActiveSupport::Notifications)
630
+
631
+ payload = {
632
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
633
+ into: into,
634
+ partition: partition,
635
+ error_class: error.class.name,
636
+ message_truncated: error.message.to_s[0, 200]
637
+ }
638
+ ActiveSupport::Notifications.instrument('search_engine.stale_deletes.error', payload) {}
639
+ pf = SearchEngine::Observability.partition_fields(partition)
640
+ SearchEngine::Instrumentation.instrument('search_engine.indexer.delete_stale', payload.merge(partition_hash: pf[:partition_hash], status: 'failed')) {}
641
+ end
642
+
643
+ def instrument_stale(_type, reason:, klass:, into:, partition:)
644
+ return unless defined?(ActiveSupport::Notifications)
645
+
646
+ payload = {
647
+ reason: reason,
648
+ collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
649
+ into: into,
650
+ partition: partition
651
+ }
652
+ ActiveSupport::Notifications.instrument('search_engine.stale_deletes.skipped', payload) {}
653
+ pf = SearchEngine::Observability.partition_fields(partition)
654
+ SearchEngine::Instrumentation.instrument('search_engine.indexer.delete_stale', payload.merge(partition_hash: pf[:partition_hash], status: 'skipped')) {}
655
+ end
656
+
657
+ def suspicious_filter?(filter)
658
+ s = filter.to_s
659
+ return true unless s.include?('=')
660
+
661
+ # Contains wildcard star without any field comparator context
662
+ return true if s.include?('*') && !s.match?(/[a-zA-Z0-9_]+\s*[:><=!]/)
663
+
664
+ false
665
+ end
666
+
667
      # Conditionally execute the before_partition hook with a timeout.
      #
      # Skipped entirely when: no hook is defined, the collection's schema is
      # not yet visible (avoids 404s pre-alias-swap), or the partition is nil
      # (protects against empty-filter deletes inside developer hooks).
      #
      # @param before_hook [Proc, nil]
      # @param partition [Object, nil]
      # @param klass [Class] model class used for the schema presence check
      # @return [void]
      def run_before_hook_if_present(before_hook, partition, klass)
        return unless before_hook

        # Guard: skip executing before_partition when the logical collection (alias or
        # same-named physical) is missing. This avoids 404s during the initial schema
        # apply before the alias swap has occurred.
        present = begin
          klass.respond_to?(:current_schema) && klass.current_schema
        rescue StandardError
          false
        end
        return unless present

        # Safety: do not execute before_partition hooks for nil partitions.
        # This prevents developers from accidentally issuing dangerous deletes
        # with empty filter values (e.g., "store_id:=").
        return if partition.nil?

        run_hook_with_timeout(
          before_hook,
          partition,
          timeout_ms: SearchEngine.config.partitioning.before_hook_timeout_ms
        )
      end
691
+
692
+ def run_after_hook_if_present(after_hook, partition)
693
+ return unless after_hook
694
+
695
+ run_hook_with_timeout(
696
+ after_hook,
697
+ partition,
698
+ timeout_ms: SearchEngine.config.partitioning.after_hook_timeout_ms
699
+ )
700
+ end
701
+
702
+ def instrument_partition_start(klass, target_into, pfields, dispatch_ctx)
703
+ SearchEngine::Instrumentation.instrument(
704
+ 'search_engine.indexer.partition_start',
705
+ {
706
+ collection: (klass.respond_to?(:collection) ? klass.collection : klass.name.to_s),
707
+ into: target_into,
708
+ partition: pfields[:partition],
709
+ partition_hash: pfields[:partition_hash],
710
+ dispatch_mode: dispatch_ctx[:dispatch_mode],
711
+ job_id: dispatch_ctx[:job_id],
712
+ timestamp: Time.now.utc.iso8601
713
+ }
714
+ ) {}
715
+ end
716
+
717
+ def instrument_partition_finish(klass, target_into, pfields, summary, started_at)
718
+ SearchEngine::Instrumentation.instrument(
719
+ 'search_engine.indexer.partition_finish',
720
+ {
721
+ collection: (klass.respond_to?(:collection) ? klass.collection : klass.name.to_s),
722
+ into: target_into,
723
+ partition: pfields[:partition],
724
+ partition_hash: pfields[:partition_hash],
725
+ batches_total: summary.batches_total,
726
+ docs_total: summary.docs_total,
727
+ success_total: summary.success_total,
728
+ failed_total: summary.failed_total,
729
+ status: summary.status,
730
+ duration_ms: (monotonic_ms - started_at).round(1)
731
+ }
732
+ ) {}
733
+ end
734
+
735
+ def build_docs_enum(rows_enum, mapper)
736
+ Enumerator.new do |y|
737
+ idx = 0
738
+ rows_enum.each do |rows|
739
+ docs, _report = mapper.map_batch!(rows, batch_index: idx)
740
+ y << docs
741
+ idx += 1
742
+ end
743
+ end
744
+ end
745
+ end
746
+ end
747
+ end