search-engine-for-typesense 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +148 -0
- data/app/search_engine/search_engine/app_info.rb +11 -0
- data/app/search_engine/search_engine/index_partition_job.rb +170 -0
- data/lib/generators/search_engine/install/install_generator.rb +20 -0
- data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
- data/lib/generators/search_engine/model/model_generator.rb +86 -0
- data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
- data/lib/search-engine-for-typesense.rb +12 -0
- data/lib/search_engine/active_record_syncable.rb +247 -0
- data/lib/search_engine/admin/stopwords.rb +125 -0
- data/lib/search_engine/admin/synonyms.rb +125 -0
- data/lib/search_engine/admin.rb +12 -0
- data/lib/search_engine/ast/and.rb +52 -0
- data/lib/search_engine/ast/binary_op.rb +75 -0
- data/lib/search_engine/ast/eq.rb +19 -0
- data/lib/search_engine/ast/group.rb +18 -0
- data/lib/search_engine/ast/gt.rb +12 -0
- data/lib/search_engine/ast/gte.rb +12 -0
- data/lib/search_engine/ast/in.rb +28 -0
- data/lib/search_engine/ast/lt.rb +12 -0
- data/lib/search_engine/ast/lte.rb +12 -0
- data/lib/search_engine/ast/matches.rb +55 -0
- data/lib/search_engine/ast/node.rb +176 -0
- data/lib/search_engine/ast/not_eq.rb +13 -0
- data/lib/search_engine/ast/not_in.rb +24 -0
- data/lib/search_engine/ast/or.rb +52 -0
- data/lib/search_engine/ast/prefix.rb +51 -0
- data/lib/search_engine/ast/raw.rb +41 -0
- data/lib/search_engine/ast/unary_op.rb +43 -0
- data/lib/search_engine/ast.rb +101 -0
- data/lib/search_engine/base/creation.rb +727 -0
- data/lib/search_engine/base/deletion.rb +80 -0
- data/lib/search_engine/base/display_coercions.rb +36 -0
- data/lib/search_engine/base/hydration.rb +312 -0
- data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
- data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
- data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
- data/lib/search_engine/base/index_maintenance.rb +459 -0
- data/lib/search_engine/base/indexing_dsl.rb +255 -0
- data/lib/search_engine/base/joins.rb +479 -0
- data/lib/search_engine/base/model_dsl.rb +472 -0
- data/lib/search_engine/base/presets.rb +43 -0
- data/lib/search_engine/base/pretty_printer.rb +315 -0
- data/lib/search_engine/base/relation_delegation.rb +42 -0
- data/lib/search_engine/base/scopes.rb +113 -0
- data/lib/search_engine/base/updating.rb +92 -0
- data/lib/search_engine/base.rb +38 -0
- data/lib/search_engine/bulk.rb +284 -0
- data/lib/search_engine/cache.rb +33 -0
- data/lib/search_engine/cascade.rb +531 -0
- data/lib/search_engine/cli/doctor.rb +631 -0
- data/lib/search_engine/cli/support.rb +217 -0
- data/lib/search_engine/cli.rb +222 -0
- data/lib/search_engine/client/http_adapter.rb +63 -0
- data/lib/search_engine/client/request_builder.rb +92 -0
- data/lib/search_engine/client/services/base.rb +74 -0
- data/lib/search_engine/client/services/collections.rb +161 -0
- data/lib/search_engine/client/services/documents.rb +214 -0
- data/lib/search_engine/client/services/operations.rb +152 -0
- data/lib/search_engine/client/services/search.rb +190 -0
- data/lib/search_engine/client/services.rb +29 -0
- data/lib/search_engine/client.rb +765 -0
- data/lib/search_engine/client_options.rb +20 -0
- data/lib/search_engine/collection_resolver.rb +191 -0
- data/lib/search_engine/collections_graph.rb +330 -0
- data/lib/search_engine/compiled_params.rb +143 -0
- data/lib/search_engine/compiler.rb +383 -0
- data/lib/search_engine/config/observability.rb +27 -0
- data/lib/search_engine/config/presets.rb +92 -0
- data/lib/search_engine/config/selection.rb +16 -0
- data/lib/search_engine/config/typesense.rb +48 -0
- data/lib/search_engine/config/validators.rb +97 -0
- data/lib/search_engine/config.rb +917 -0
- data/lib/search_engine/console_helpers.rb +130 -0
- data/lib/search_engine/deletion.rb +103 -0
- data/lib/search_engine/dispatcher.rb +125 -0
- data/lib/search_engine/dsl/parser.rb +582 -0
- data/lib/search_engine/engine.rb +167 -0
- data/lib/search_engine/errors.rb +290 -0
- data/lib/search_engine/filters/sanitizer.rb +189 -0
- data/lib/search_engine/hydration/materializers.rb +808 -0
- data/lib/search_engine/hydration/selection_context.rb +96 -0
- data/lib/search_engine/indexer/batch_planner.rb +76 -0
- data/lib/search_engine/indexer/bulk_import.rb +626 -0
- data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
- data/lib/search_engine/indexer/retry_policy.rb +103 -0
- data/lib/search_engine/indexer.rb +747 -0
- data/lib/search_engine/instrumentation.rb +308 -0
- data/lib/search_engine/joins/guard.rb +202 -0
- data/lib/search_engine/joins/resolver.rb +95 -0
- data/lib/search_engine/logging/color.rb +78 -0
- data/lib/search_engine/logging/format_helpers.rb +92 -0
- data/lib/search_engine/logging/partition_progress.rb +53 -0
- data/lib/search_engine/logging_subscriber.rb +388 -0
- data/lib/search_engine/mapper.rb +785 -0
- data/lib/search_engine/multi.rb +286 -0
- data/lib/search_engine/multi_result.rb +186 -0
- data/lib/search_engine/notifications/compact_logger.rb +675 -0
- data/lib/search_engine/observability.rb +162 -0
- data/lib/search_engine/operations.rb +58 -0
- data/lib/search_engine/otel.rb +227 -0
- data/lib/search_engine/partitioner.rb +128 -0
- data/lib/search_engine/ranking_plan.rb +118 -0
- data/lib/search_engine/registry.rb +158 -0
- data/lib/search_engine/relation/compiler.rb +711 -0
- data/lib/search_engine/relation/deletion.rb +37 -0
- data/lib/search_engine/relation/dsl/filters.rb +624 -0
- data/lib/search_engine/relation/dsl/selection.rb +240 -0
- data/lib/search_engine/relation/dsl.rb +903 -0
- data/lib/search_engine/relation/dx/dry_run.rb +59 -0
- data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
- data/lib/search_engine/relation/dx.rb +231 -0
- data/lib/search_engine/relation/materializers.rb +118 -0
- data/lib/search_engine/relation/options.rb +138 -0
- data/lib/search_engine/relation/state.rb +274 -0
- data/lib/search_engine/relation/updating.rb +44 -0
- data/lib/search_engine/relation.rb +623 -0
- data/lib/search_engine/result.rb +664 -0
- data/lib/search_engine/schema.rb +1083 -0
- data/lib/search_engine/sources/active_record_source.rb +185 -0
- data/lib/search_engine/sources/base.rb +62 -0
- data/lib/search_engine/sources/lambda_source.rb +55 -0
- data/lib/search_engine/sources/sql_source.rb +196 -0
- data/lib/search_engine/sources.rb +71 -0
- data/lib/search_engine/stale_rules.rb +160 -0
- data/lib/search_engine/test/minitest_assertions.rb +57 -0
- data/lib/search_engine/test/offline_client.rb +134 -0
- data/lib/search_engine/test/rspec_matchers.rb +77 -0
- data/lib/search_engine/test/stub_client.rb +201 -0
- data/lib/search_engine/test.rb +66 -0
- data/lib/search_engine/test_autoload.rb +8 -0
- data/lib/search_engine/update.rb +35 -0
- data/lib/search_engine/version.rb +7 -0
- data/lib/search_engine.rb +332 -0
- data/lib/tasks/search_engine.rake +501 -0
- data/lib/tasks/search_engine_doctor.rake +16 -0
- metadata +225 -0
data/lib/search_engine/indexer.rb

@@ -0,0 +1,747 @@
+# frozen_string_literal: true
+
+require 'json'
+require 'timeout'
+require 'digest'
+require 'time'
+
+module SearchEngine
+  # Batch importer for streaming JSONL documents into a physical collection.
+  #
+  # Emits one AS::Notifications event per attempt: "search_engine.indexer.batch_import".
+  # Works strictly batch-by-batch to avoid memory growth and retries transient
+  # failures with exponential backoff and jitter.
+  class Indexer
+    # Aggregated summary of an import run.
+    #
+    # @!attribute [r] failed_batches_total
+    #   @return [Integer] count of batch stats with failures
+    Summary = Struct.new(
+      :collection,
+      :status,
+      :batches_total,
+      :docs_total,
+      :success_total,
+      :failed_total,
+      :failed_batches_total,
+      :duration_ms_total,
+      :batches,
+      keyword_init: true
+    )
+
+    # Rebuild a single partition end-to-end using the model's partitioning + mapper.
+    #
+    # The flow is:
+    # - Resolve a partition fetch enumerator from the partitioning DSL (or fall back to source adapter)
+    # - Optionally run before/after hooks with configured timeouts
+    # - Map each batch to documents and stream-import them into the target collection
+    #
+    # @param klass [Class] a {SearchEngine::Base} subclass
+    # @param partition [Object] opaque partition key as defined by the DSL/source
+    # @param into [String, nil] target collection; defaults to resolver or the logical collection alias
+    # @return [Summary]
+    # @raise [SearchEngine::Errors::InvalidParams]
+    # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#partitioning`
+    def self.rebuild_partition!(klass, partition:, into: nil)
+      raise Errors::InvalidParams, 'klass must be a Class' unless klass.is_a?(Class)
+      unless klass.ancestors.include?(SearchEngine::Base)
+        raise Errors::InvalidParams, 'klass must inherit from SearchEngine::Base'
+      end
+
+      compiled_partitioner = SearchEngine::Partitioner.for(klass)
+      mapper = SearchEngine::Mapper.for(klass)
+      unless mapper
+        raise Errors::InvalidParams,
+              "mapper is not defined for #{klass.name}. Define it via `index do ... map { ... } end`."
+      end
+
+      target_into = resolve_into!(klass, partition: partition, into: into)
+      rows_enum = rows_enumerator_for(klass, partition: partition, compiled_partitioner: compiled_partitioner)
+
+      before_hook = compiled_partitioner&.instance_variable_get(:@before_hook_proc)
+      after_hook = compiled_partitioner&.instance_variable_get(:@after_hook_proc)
+
+      started_at = monotonic_ms
+      pfields = SearchEngine::Observability.partition_fields(partition)
+      dispatch_ctx = SearchEngine::Instrumentation.context
+      instrument_partition_start(klass, target_into, pfields, dispatch_ctx)
+
+      docs_enum = build_docs_enum(rows_enum, mapper)
+
+      dsl = mapper_dsl_for(klass)
+      max_parallel = dsl&.dig(:max_parallel) || 1
+
+      summary = nil
+      SearchEngine::Instrumentation.with_context(into: target_into) do
+        run_before_hook_if_present(before_hook, partition, klass)
+
+        summary = import!(
+          klass,
+          into: target_into,
+          enum: docs_enum,
+          batch_size: nil,
+          action: :upsert,
+          log_batches: partition.nil?,
+          max_parallel: max_parallel
+        )
+
+        run_after_hook_if_present(after_hook, partition)
+      end
+
+      instrument_partition_finish(klass, target_into, pfields, summary, started_at)
+
+      summary
+    end
+
+    # Delete stale documents from a physical collection using model stale rules.
+    #
+    # @param klass [Class] a {SearchEngine::Base} subclass
+    # @param partition [Object, nil]
+    # @param into [String, nil]
+    # @param dry_run [Boolean]
+    # @return [Hash]
+    # @raise [SearchEngine::Errors::InvalidParams]
+    # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer#stale-deletes`
+    def self.delete_stale!(klass, partition: nil, into: nil, dry_run: false)
+      validate_stale_args!(klass)
+
+      cfg = SearchEngine.config
+      sd_cfg = cfg.stale_deletes
+      target_into = resolve_into!(klass, partition: partition, into: into)
+
+      skipped = skip_if_disabled(klass, sd_cfg, target_into, partition)
+      return skipped if skipped
+
+      defined = SearchEngine::StaleRules.defined_for?(klass)
+      filters = defined ? SearchEngine::StaleRules.compile_filters(klass, partition: partition) : []
+      filters.compact!
+      filters.reject! { |f| f.to_s.strip.empty? }
+      filter = SearchEngine::StaleRules.merge_filters(filters)
+
+      skipped = skip_if_no_filter_defined(defined, klass, target_into, partition)
+      return skipped if skipped
+
+      skipped = skip_if_empty_filter(filter, klass, target_into, partition)
+      return skipped if skipped
+
+      skipped = skip_if_strict_blocked(filter, sd_cfg, klass, target_into, partition)
+      return skipped if skipped
+
+      fhash = Digest::SHA1.hexdigest(filter)
+      started = monotonic_ms
+      instrument_started(klass: klass, into: target_into, partition: partition, filter_hash: fhash)
+
+      if dry_run
+        estimated = estimate_found_if_enabled(cfg, sd_cfg, target_into, filter)
+        return dry_run_summary(klass, target_into, partition, filter, fhash, started, estimated)
+      end
+
+      deleted_count = perform_delete_and_count(target_into, filter, sd_cfg.timeout_ms)
+      duration = monotonic_ms - started
+      instrument_finished(
+        klass: klass,
+        into: target_into,
+        partition: partition,
+        duration_ms: duration,
+        deleted_count: deleted_count
+      )
+      ok_summary(klass, target_into, partition, filter, fhash, duration, deleted_count)
+    rescue Errors::Error => error
+      duration = monotonic_ms - (started || monotonic_ms)
+      instrument_error(error, klass: klass, into: target_into, partition: partition)
+      failed_summary(klass, target_into, partition, filter, fhash, duration, error)
+    end
+
+    # Import pre-batched documents using JSONL bulk import.
+    #
+    # @param klass [Class] a SearchEngine::Base subclass (reserved for future mappers)
+    # @param into [String] target physical collection name
+    # @param enum [Enumerable] yields batches (Array-like) of Hash documents
+    # @param batch_size [Integer, nil] soft guard only; not used unless 413 handling
+    # @param action [Symbol] :upsert (default), :create, or :update
+    # @param log_batches [Boolean] whether to log each batch as it completes (default: true)
+    # @param max_parallel [Integer] maximum parallel threads for batch processing (default: 1)
+    # @return [Summary]
+    # @raise [SearchEngine::Errors::InvalidParams]
+    # @see `https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer`
+    # @see `https://typesense.org/docs/latest/api/documents.html#import-documents`
+    def self.import!(klass, into:, enum:, batch_size: nil, action: :upsert, log_batches: true, max_parallel: 1)
+      SearchEngine::Indexer::BulkImport.call(
+        klass: klass,
+        into: into,
+        enum: enum,
+        batch_size: batch_size,
+        action: action,
+        log_batches: log_batches,
+        max_parallel: max_parallel
+      )
+    end
+
+    class << self
+      private
+
+      def validate_stale_args!(klass)
+        raise Errors::InvalidParams, 'klass must be a Class' unless klass.is_a?(Class)
+        return if klass.ancestors.include?(SearchEngine::Base)
+
+        raise Errors::InvalidParams, 'klass must inherit from SearchEngine::Base'
+      end
+
+      def skip_if_disabled(klass, sd_cfg, into, partition)
+        return nil if sd_cfg&.enabled
+
+        instrument_stale(:skipped, reason: :disabled, klass: klass, into: into, partition: partition)
+        skip_summary(klass, into, partition)
+      end
+
+      def skip_if_no_filter_defined(defined, klass, into, partition)
+        return nil if defined
+
+        instrument_stale(:skipped, reason: :no_filter_defined, klass: klass, into: into, partition: partition)
+        skip_summary(klass, into, partition)
+      end
+
+      def skip_if_empty_filter(filter, klass, into, partition)
+        return nil if filter && !filter.to_s.strip.empty?
+
+        instrument_stale(:skipped, reason: :empty_filter, klass: klass, into: into, partition: partition)
+        skip_summary(klass, into, partition)
+      end
+
+      def skip_if_strict_blocked(filter, sd_cfg, klass, into, partition)
+        return nil unless sd_cfg.strict_mode && suspicious_filter?(filter)
+
+        instrument_stale(:skipped, reason: :strict_blocked, klass: klass, into: into, partition: partition)
+        {
+          status: :skipped,
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          filter_by: filter,
+          filter_hash: Digest::SHA1.hexdigest(filter),
+          duration_ms: 0.0,
+          deleted_count: 0,
+          estimated_found: nil
+        }
+      end
+
+      def estimate_found_if_enabled(cfg, sd_cfg, into, filter)
+        return nil unless sd_cfg.estimation_enabled && cfg.default_query_by && !cfg.default_query_by.to_s.strip.empty?
+
+        client = SearchEngine.client
+        payload = { q: '*', query_by: cfg.default_query_by, per_page: 0, filter_by: filter }
+        params = SearchEngine::CompiledParams.new(payload)
+        res = client.search(collection: into, params: params, url_opts: {})
+        res&.found
+      rescue StandardError
+        nil
+      end
+
+      def perform_delete_and_count(into, filter, timeout_ms)
+        client = SearchEngine.client
+        resp = client.delete_documents_by_filter(
+          collection: into,
+          filter_by: filter,
+          timeout_ms: timeout_ms
+        )
+        (resp && (resp[:num_deleted] || resp[:deleted] || resp[:numDeleted])).to_i
+      end
+
+      def dry_run_summary(klass, into, partition, filter, filter_hash, started, estimated)
+        duration = monotonic_ms - started
+        {
+          status: :ok,
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          filter_by: filter,
+          filter_hash: filter_hash,
+          duration_ms: duration.round(1),
+          deleted_count: 0,
+          estimated_found: estimated,
+          will_delete: true
+        }
+      end
+
+      def ok_summary(klass, into, partition, filter, filter_hash, duration, deleted_count)
+        {
+          status: :ok,
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          filter_by: filter,
+          filter_hash: filter_hash,
+          duration_ms: duration.round(1),
+          deleted_count: deleted_count,
+          estimated_found: nil
+        }
+      end
+
+      def failed_summary(klass, into, partition, filter, filter_hash, duration, error)
+        {
+          status: :failed,
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          filter_by: filter,
+          filter_hash: filter_hash,
+          duration_ms: duration.round(1),
+          deleted_count: 0,
+          estimated_found: nil,
+          error_class: error.class.name,
+          message_truncated: error.message.to_s[0, 200]
+        }
+      end
+
+      def skip_summary(klass, into, partition)
+        {
+          status: :skipped,
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          filter_by: nil,
+          filter_hash: nil,
+          duration_ms: 0.0,
+          deleted_count: 0,
+          estimated_found: nil
+        }
+      end
+
+      def rows_enumerator_for(klass, partition:, compiled_partitioner:)
+        if compiled_partitioner
+          compiled_partitioner.partition_fetch_enum(partition)
+        else
+          dsl = mapper_dsl_for(klass)
+          source_def = dsl && dsl[:source]
+          unless source_def
+            raise Errors::InvalidParams,
+                  'No partition_fetch defined and no source adapter provided. Define one in the DSL.'
+          end
+          adapter = SearchEngine::Sources.build(source_def[:type], **(source_def[:options] || {}), &source_def[:block])
+          adapter.each_batch(partition: partition)
+        end
+      end
+
+      def resolve_into!(klass, partition:, into:)
+        return into if into && !into.to_s.strip.empty?
+
+        resolver = SearchEngine.config.partitioning&.default_into_resolver
+        if resolver.respond_to?(:arity)
+          case resolver.arity
+          when 1
+            val = resolver.call(klass)
+            return val if val && !val.to_s.strip.empty?
+          when 2, -1
+            val = resolver.call(klass, partition)
+            return val if val && !val.to_s.strip.empty?
+          end
+        elsif resolver
+          val = resolver.call(klass)
+          return val if val && !val.to_s.strip.empty?
+        end
+
+        name = if klass.respond_to?(:collection)
+                 klass.collection
+               else
+                 klass.name.to_s
+               end
+        name.to_s
+      end
+
+      def run_hook_with_timeout(proc_obj, partition, timeout_ms:)
+        return proc_obj.call(partition) unless timeout_ms&.to_i&.positive?
+
+        Timeout.timeout(timeout_ms.to_f / 1000.0) do
+          proc_obj.call(partition)
+        end
+      end
+
+      def import_batch_with_handling(client, collection, docs, action, next_index)
+        buffer = +''
+        docs_count = encode_jsonl!(docs, buffer)
+        bytes_sent = buffer.bytesize
+        idx = next_index.call
+
+        begin
+          attempt_stats = with_retries do |attempt|
+            perform_attempt(client, collection, action, buffer, docs_count, bytes_sent, idx, attempt)
+          end
+          [attempt_stats]
+        rescue Errors::Api => error
+          if error.status.to_i == 413 && docs.size > 1
+            mid = docs.size / 2
+            left = docs[0...mid]
+            right = docs[mid..]
+            import_batch_with_handling(client, collection, left, action, next_index) +
+              import_batch_with_handling(client, collection, right, action, next_index)
+          else
+            [
+              {
+                index: idx,
+                docs_count: docs_count,
+                success_count: 0,
+                failure_count: docs_count,
+                attempts: 1,
+                http_status: error.status.to_i,
+                duration_ms: 0.0,
+                bytes_sent: bytes_sent,
+                errors_sample: [safe_error_excerpt(error)]
+              }
+            ]
+          end
+        end
+      end
+
+      def perform_attempt(client, collection, action, jsonl, docs_count, bytes_sent, idx, attempt)
+        start = monotonic_ms
+        success_count = 0
+        failure_count = 0
+        http_status = 200
+        error_sample = []
+
+        if defined?(ActiveSupport::Notifications)
+          se_payload = {
+            collection: SearchEngine::Instrumentation.context[:collection] || collection,
+            into: collection,
+            batch_index: idx,
+            docs_count: docs_count,
+            success_count: nil,
+            failure_count: nil,
+            attempts: attempt,
+            http_status: nil,
+            bytes_sent: bytes_sent,
+            transient_retry: attempt > 1,
+            retry_after_s: nil,
+            error_sample: nil
+          }
+          SearchEngine::Instrumentation.instrument('search_engine.indexer.batch_import', se_payload) do |ctx|
+            raw = client.import_documents(collection: collection, jsonl: jsonl, action: action)
+            success_count, failure_count, error_sample = parse_import_response(raw)
+            http_status = 200
+            ctx[:success_count] = success_count
+            ctx[:failure_count] = failure_count
+            ctx[:http_status] = http_status
+          end
+        else
+          raw = client.import_documents(collection: collection, jsonl: jsonl, action: action)
+          success_count, failure_count, error_sample = parse_import_response(raw)
+        end
+
+        duration = monotonic_ms - start
+        {
+          index: idx,
+          docs_count: docs_count,
+          success_count: success_count,
+          failure_count: failure_count,
+          attempts: attempt,
+          http_status: http_status,
+          duration_ms: duration.round(1),
+          bytes_sent: bytes_sent,
+          errors_sample: error_sample
+        }
+      end
+
+      def with_retries
+        cfg = SearchEngine.config.indexer
+        attempts = cfg&.retries && cfg.retries[:attempts].to_i.positive? ? cfg.retries[:attempts].to_i : 3
+        base = cfg&.retries && cfg.retries[:base].to_f.positive? ? cfg.retries[:base].to_f : 0.5
+        max = cfg&.retries && cfg.retries[:max].to_f.positive? ? cfg.retries[:max].to_f : 5.0
+        jitter = cfg&.retries && cfg.retries[:jitter_fraction].to_f >= 0 ? cfg.retries[:jitter_fraction].to_f : 0.2
+
+        (1..attempts).each do |i|
+          return yield(i)
+        rescue Errors::Timeout, Errors::Connection
+          raise if i >= attempts
+
+          sleep_with_backoff(i, base: base, max: max, jitter_fraction: jitter)
+        rescue Errors::Api => error
+          code = error.status.to_i
+          raise unless transient_status?(code)
+          raise if i >= attempts
+
+          sleep_with_backoff(i, base: base, max: max, jitter_fraction: jitter)
+        end
+      end
+
+      def sleep_with_backoff(attempt, base:, max:, jitter_fraction:)
+        exp = [base * (2 ** (attempt - 1)), max].min
+        jitter = exp * jitter_fraction
+        delta = rand(-jitter..jitter)
+        sleep_time = exp + delta
+        sleep(sleep_time) if sleep_time.positive?
+      end
+
+      def transient_status?(code)
+        return true if code == 429
+        return true if code >= 500 && code <= 599
+
+        false
+      end
+
+      def to_array(batch)
+        return batch if batch.is_a?(Array)
+
+        batch.respond_to?(:to_a) ? batch.to_a : Array(batch)
+      end
+
+      def encode_jsonl!(docs, buffer)
+        count = 0
+        buffer.clear
+        docs.each do |raw|
+          doc = ensure_hash_document(raw)
+          ensure_id!(doc)
+          # Force system timestamp field just before serialization; developers cannot override.
+          now_i = if defined?(Time) && defined?(Time.zone) && Time.zone
+                    Time.zone.now.to_i
+                  else
+                    Time.now.to_i
+                  end
+          doc[:doc_updated_at] = now_i if doc.is_a?(Hash)
+          buffer << JSON.generate(doc)
+          buffer << "\n" if count < (docs.size - 1)
+          count += 1
+        end
+        count
+      end
+
+      def ensure_hash_document(obj)
+        if obj.is_a?(Hash)
+          obj
+        else
+          raise Errors::InvalidParams,
+                'Indexer requires batches of Hash-like documents with at least an :id key. ' \
+                'Mapping DSL is not available yet. See https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer.'
+        end
+      end
+
+      def ensure_id!(doc)
+        has_id = doc.key?(:id) || doc.key?('id')
+        raise Errors::InvalidParams, 'document is missing required id' unless has_id
+      end
+
+      def parse_import_response(raw)
+        return parse_from_string(raw) if raw.is_a?(String)
+        return parse_from_array(raw) if raw.is_a?(Array)
+
+        [0, 0, []]
+      end
+
+      def parse_from_string(str)
+        success = 0
+        failure = 0
+        samples = []
+
+        str.each_line do |line|
+          line = line.strip
+          next if line.empty?
+
+          h = safe_parse_json(line)
+          unless h
+            failure += 1
+            samples << 'invalid-json-line'
+            next
+          end
+
+          if truthy?(h['success'] || h[:success])
+            success += 1
+          else
+            failure += 1
+            msg = h['error'] || h[:error] || h['message'] || h[:message]
+            samples << msg.to_s[0, 200] if msg
+          end
+        end
+
+        [success, failure, samples[0, 5]]
+      end
+
+      def parse_from_array(arr)
+        success = 0
+        failure = 0
+        samples = []
+
+        arr.each do |h|
+          if h.is_a?(Hash) && truthy?(h['success'] || h[:success])
+            success += 1
+          else
+            failure += 1
+            msg = h.is_a?(Hash) ? (h['error'] || h[:error] || h['message'] || h[:message]) : nil
+            samples << msg.to_s[0, 200] if msg
+          end
+        end
+
+        [success, failure, samples[0, 5]]
+      end
+
+      def safe_parse_json(line)
+        JSON.parse(line)
+      rescue StandardError
+        nil
+      end
+
+      def truthy?(val)
+        val == true || val.to_s.downcase == 'true'
+      end
+
+      def safe_error_excerpt(error)
+        cls = error.class.name
+        msg = error.message.to_s
+        "#{cls}: #{msg[0, 200]}"
+      end
+
+      def monotonic_ms
+        SearchEngine::Instrumentation.monotonic_ms
+      end
+
+      def mapper_dsl_for(klass)
+        return unless klass.instance_variable_defined?(:@__mapper_dsl__)
+
+        klass.instance_variable_get(:@__mapper_dsl__)
+      end
+
+      def instrument_started(klass:, into:, partition:, filter_hash:)
+        return unless defined?(ActiveSupport::Notifications)
+
+        payload = {
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          filter_hash: filter_hash
+        }
+        ActiveSupport::Notifications.instrument('search_engine.stale_deletes.started', payload) {}
+      end
+
+      def instrument_finished(klass:, into:, partition:, duration_ms:, deleted_count:)
+        return unless defined?(ActiveSupport::Notifications)
+
+        payload = {
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          duration_ms: duration_ms.round(1),
+          deleted_count: deleted_count
+        }
+        ActiveSupport::Notifications.instrument('search_engine.stale_deletes.finished', payload) {}
+        pf = SearchEngine::Observability.partition_fields(partition)
+        SearchEngine::Instrumentation.instrument('search_engine.indexer.delete_stale', payload.merge(partition_hash: pf[:partition_hash], status: 'ok')) {}
+      end
+
+      def instrument_error(error, klass:, into:, partition:)
+        return unless defined?(ActiveSupport::Notifications)
+
+        payload = {
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition,
+          error_class: error.class.name,
+          message_truncated: error.message.to_s[0, 200]
+        }
+        ActiveSupport::Notifications.instrument('search_engine.stale_deletes.error', payload) {}
+        pf = SearchEngine::Observability.partition_fields(partition)
+        SearchEngine::Instrumentation.instrument('search_engine.indexer.delete_stale', payload.merge(partition_hash: pf[:partition_hash], status: 'failed')) {}
+      end
+
+      def instrument_stale(_type, reason:, klass:, into:, partition:)
+        return unless defined?(ActiveSupport::Notifications)
+
+        payload = {
+          reason: reason,
+          collection: klass.respond_to?(:collection) ? klass.collection : klass.name.to_s,
+          into: into,
+          partition: partition
+        }
+        ActiveSupport::Notifications.instrument('search_engine.stale_deletes.skipped', payload) {}
+        pf = SearchEngine::Observability.partition_fields(partition)
+        SearchEngine::Instrumentation.instrument('search_engine.indexer.delete_stale', payload.merge(partition_hash: pf[:partition_hash], status: 'skipped')) {}
+      end
+
+      def suspicious_filter?(filter)
+        s = filter.to_s
+        return true unless s.include?('=')
+
+        # Contains wildcard star without any field comparator context
+        return true if s.include?('*') && !s.match?(/[a-zA-Z0-9_]+\s*[:><=!]/)
+
+        false
+      end
+
+      def run_before_hook_if_present(before_hook, partition, klass)
+        return unless before_hook
+
+        # Guard: skip executing before_partition when the logical collection (alias or
+        # same-named physical) is missing. This avoids 404s during the initial schema
+        # apply before the alias swap has occurred.
+        present = begin
+          klass.respond_to?(:current_schema) && klass.current_schema
+        rescue StandardError
+          false
+        end
+        return unless present
+
+        # Safety: do not execute before_partition hooks for nil partitions.
+        # This prevents developers from accidentally issuing dangerous deletes
+        # with empty filter values (e.g., "store_id:=").
+        return if partition.nil?
+
+        run_hook_with_timeout(
+          before_hook,
+          partition,
+          timeout_ms: SearchEngine.config.partitioning.before_hook_timeout_ms
+        )
+      end
+
+      def run_after_hook_if_present(after_hook, partition)
+        return unless after_hook
+
+        run_hook_with_timeout(
+          after_hook,
+          partition,
+          timeout_ms: SearchEngine.config.partitioning.after_hook_timeout_ms
+        )
+      end
+
+      def instrument_partition_start(klass, target_into, pfields, dispatch_ctx)
+        SearchEngine::Instrumentation.instrument(
+          'search_engine.indexer.partition_start',
+          {
+            collection: (klass.respond_to?(:collection) ? klass.collection : klass.name.to_s),
+            into: target_into,
+            partition: pfields[:partition],
+            partition_hash: pfields[:partition_hash],
+            dispatch_mode: dispatch_ctx[:dispatch_mode],
+            job_id: dispatch_ctx[:job_id],
+            timestamp: Time.now.utc.iso8601
+          }
+        ) {}
+      end
+
+      def instrument_partition_finish(klass, target_into, pfields, summary, started_at)
+        SearchEngine::Instrumentation.instrument(
+          'search_engine.indexer.partition_finish',
+          {
+            collection: (klass.respond_to?(:collection) ? klass.collection : klass.name.to_s),
+            into: target_into,
+            partition: pfields[:partition],
+            partition_hash: pfields[:partition_hash],
+            batches_total: summary.batches_total,
+            docs_total: summary.docs_total,
+            success_total: summary.success_total,
+            failed_total: summary.failed_total,
+            status: summary.status,
+            duration_ms: (monotonic_ms - started_at).round(1)
+          }
+        ) {}
+      end
+
+      def build_docs_enum(rows_enum, mapper)
+        Enumerator.new do |y|
+          idx = 0
+          rows_enum.each do |rows|
+            docs, _report = mapper.map_batch!(rows, batch_index: idx)
+            y << docs
+            idx += 1
+          end
+        end
+      end
+    end
+  end
+end