search-engine-for-typesense 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +21 -0
- data/README.md +148 -0
- data/app/search_engine/search_engine/app_info.rb +11 -0
- data/app/search_engine/search_engine/index_partition_job.rb +170 -0
- data/lib/generators/search_engine/install/install_generator.rb +20 -0
- data/lib/generators/search_engine/install/templates/initializer.rb.tt +230 -0
- data/lib/generators/search_engine/model/model_generator.rb +86 -0
- data/lib/generators/search_engine/model/templates/model.rb.tt +12 -0
- data/lib/search-engine-for-typesense.rb +12 -0
- data/lib/search_engine/active_record_syncable.rb +247 -0
- data/lib/search_engine/admin/stopwords.rb +125 -0
- data/lib/search_engine/admin/synonyms.rb +125 -0
- data/lib/search_engine/admin.rb +12 -0
- data/lib/search_engine/ast/and.rb +52 -0
- data/lib/search_engine/ast/binary_op.rb +75 -0
- data/lib/search_engine/ast/eq.rb +19 -0
- data/lib/search_engine/ast/group.rb +18 -0
- data/lib/search_engine/ast/gt.rb +12 -0
- data/lib/search_engine/ast/gte.rb +12 -0
- data/lib/search_engine/ast/in.rb +28 -0
- data/lib/search_engine/ast/lt.rb +12 -0
- data/lib/search_engine/ast/lte.rb +12 -0
- data/lib/search_engine/ast/matches.rb +55 -0
- data/lib/search_engine/ast/node.rb +176 -0
- data/lib/search_engine/ast/not_eq.rb +13 -0
- data/lib/search_engine/ast/not_in.rb +24 -0
- data/lib/search_engine/ast/or.rb +52 -0
- data/lib/search_engine/ast/prefix.rb +51 -0
- data/lib/search_engine/ast/raw.rb +41 -0
- data/lib/search_engine/ast/unary_op.rb +43 -0
- data/lib/search_engine/ast.rb +101 -0
- data/lib/search_engine/base/creation.rb +727 -0
- data/lib/search_engine/base/deletion.rb +80 -0
- data/lib/search_engine/base/display_coercions.rb +36 -0
- data/lib/search_engine/base/hydration.rb +312 -0
- data/lib/search_engine/base/index_maintenance/cleanup.rb +202 -0
- data/lib/search_engine/base/index_maintenance/lifecycle.rb +251 -0
- data/lib/search_engine/base/index_maintenance/schema.rb +117 -0
- data/lib/search_engine/base/index_maintenance.rb +459 -0
- data/lib/search_engine/base/indexing_dsl.rb +255 -0
- data/lib/search_engine/base/joins.rb +479 -0
- data/lib/search_engine/base/model_dsl.rb +472 -0
- data/lib/search_engine/base/presets.rb +43 -0
- data/lib/search_engine/base/pretty_printer.rb +315 -0
- data/lib/search_engine/base/relation_delegation.rb +42 -0
- data/lib/search_engine/base/scopes.rb +113 -0
- data/lib/search_engine/base/updating.rb +92 -0
- data/lib/search_engine/base.rb +38 -0
- data/lib/search_engine/bulk.rb +284 -0
- data/lib/search_engine/cache.rb +33 -0
- data/lib/search_engine/cascade.rb +531 -0
- data/lib/search_engine/cli/doctor.rb +631 -0
- data/lib/search_engine/cli/support.rb +217 -0
- data/lib/search_engine/cli.rb +222 -0
- data/lib/search_engine/client/http_adapter.rb +63 -0
- data/lib/search_engine/client/request_builder.rb +92 -0
- data/lib/search_engine/client/services/base.rb +74 -0
- data/lib/search_engine/client/services/collections.rb +161 -0
- data/lib/search_engine/client/services/documents.rb +214 -0
- data/lib/search_engine/client/services/operations.rb +152 -0
- data/lib/search_engine/client/services/search.rb +190 -0
- data/lib/search_engine/client/services.rb +29 -0
- data/lib/search_engine/client.rb +765 -0
- data/lib/search_engine/client_options.rb +20 -0
- data/lib/search_engine/collection_resolver.rb +191 -0
- data/lib/search_engine/collections_graph.rb +330 -0
- data/lib/search_engine/compiled_params.rb +143 -0
- data/lib/search_engine/compiler.rb +383 -0
- data/lib/search_engine/config/observability.rb +27 -0
- data/lib/search_engine/config/presets.rb +92 -0
- data/lib/search_engine/config/selection.rb +16 -0
- data/lib/search_engine/config/typesense.rb +48 -0
- data/lib/search_engine/config/validators.rb +97 -0
- data/lib/search_engine/config.rb +917 -0
- data/lib/search_engine/console_helpers.rb +130 -0
- data/lib/search_engine/deletion.rb +103 -0
- data/lib/search_engine/dispatcher.rb +125 -0
- data/lib/search_engine/dsl/parser.rb +582 -0
- data/lib/search_engine/engine.rb +167 -0
- data/lib/search_engine/errors.rb +290 -0
- data/lib/search_engine/filters/sanitizer.rb +189 -0
- data/lib/search_engine/hydration/materializers.rb +808 -0
- data/lib/search_engine/hydration/selection_context.rb +96 -0
- data/lib/search_engine/indexer/batch_planner.rb +76 -0
- data/lib/search_engine/indexer/bulk_import.rb +626 -0
- data/lib/search_engine/indexer/import_dispatcher.rb +198 -0
- data/lib/search_engine/indexer/retry_policy.rb +103 -0
- data/lib/search_engine/indexer.rb +747 -0
- data/lib/search_engine/instrumentation.rb +308 -0
- data/lib/search_engine/joins/guard.rb +202 -0
- data/lib/search_engine/joins/resolver.rb +95 -0
- data/lib/search_engine/logging/color.rb +78 -0
- data/lib/search_engine/logging/format_helpers.rb +92 -0
- data/lib/search_engine/logging/partition_progress.rb +53 -0
- data/lib/search_engine/logging_subscriber.rb +388 -0
- data/lib/search_engine/mapper.rb +785 -0
- data/lib/search_engine/multi.rb +286 -0
- data/lib/search_engine/multi_result.rb +186 -0
- data/lib/search_engine/notifications/compact_logger.rb +675 -0
- data/lib/search_engine/observability.rb +162 -0
- data/lib/search_engine/operations.rb +58 -0
- data/lib/search_engine/otel.rb +227 -0
- data/lib/search_engine/partitioner.rb +128 -0
- data/lib/search_engine/ranking_plan.rb +118 -0
- data/lib/search_engine/registry.rb +158 -0
- data/lib/search_engine/relation/compiler.rb +711 -0
- data/lib/search_engine/relation/deletion.rb +37 -0
- data/lib/search_engine/relation/dsl/filters.rb +624 -0
- data/lib/search_engine/relation/dsl/selection.rb +240 -0
- data/lib/search_engine/relation/dsl.rb +903 -0
- data/lib/search_engine/relation/dx/dry_run.rb +59 -0
- data/lib/search_engine/relation/dx/friendly_where.rb +24 -0
- data/lib/search_engine/relation/dx.rb +231 -0
- data/lib/search_engine/relation/materializers.rb +118 -0
- data/lib/search_engine/relation/options.rb +138 -0
- data/lib/search_engine/relation/state.rb +274 -0
- data/lib/search_engine/relation/updating.rb +44 -0
- data/lib/search_engine/relation.rb +623 -0
- data/lib/search_engine/result.rb +664 -0
- data/lib/search_engine/schema.rb +1083 -0
- data/lib/search_engine/sources/active_record_source.rb +185 -0
- data/lib/search_engine/sources/base.rb +62 -0
- data/lib/search_engine/sources/lambda_source.rb +55 -0
- data/lib/search_engine/sources/sql_source.rb +196 -0
- data/lib/search_engine/sources.rb +71 -0
- data/lib/search_engine/stale_rules.rb +160 -0
- data/lib/search_engine/test/minitest_assertions.rb +57 -0
- data/lib/search_engine/test/offline_client.rb +134 -0
- data/lib/search_engine/test/rspec_matchers.rb +77 -0
- data/lib/search_engine/test/stub_client.rb +201 -0
- data/lib/search_engine/test.rb +66 -0
- data/lib/search_engine/test_autoload.rb +8 -0
- data/lib/search_engine/update.rb +35 -0
- data/lib/search_engine/version.rb +7 -0
- data/lib/search_engine.rb +332 -0
- data/lib/tasks/search_engine.rake +501 -0
- data/lib/tasks/search_engine_doctor.rake +16 -0
- metadata +225 -0
|
@@ -0,0 +1,198 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'json'
|
|
4
|
+
|
|
5
|
+
module SearchEngine
  class Indexer
    # Orchestrates a single batch import attempt and retry loop.
    #
    # Emits the same instrumentation as the monolithic Indexer, preserves
    # :upsert default action, and supports dry-run mode (serialize only).
    #
    # @since M8
    class ImportDispatcher
      # Instrumentation event emitted once per import attempt (and once per dry run).
      BATCH_IMPORT_EVENT = 'search_engine.indexer.batch_import'
      # Maximum number of error messages kept in a stats payload.
      MAX_ERROR_SAMPLES = 5
      # Maximum length (in characters) of each sampled error message.
      MAX_ERROR_LENGTH = 200
      # Sample recorded for a response line that is not parseable JSON.
      INVALID_JSON_SAMPLE = 'invalid-json-line'

      # Import one JSONL batch, retrying according to +retry_policy+.
      #
      # The returned Hash may be used as an element of a batches list; its keys
      # mirror the existing payload structure.
      #
      # @param client [SearchEngine::Client]
      # @param collection [String] physical collection name to import into
      # @param action [Symbol, String] one of :upsert, :create, :update
      # @param jsonl [String] JSONL body
      # @param docs_count [Integer] number of docs encoded in jsonl
      # @param bytes_sent [Integer] payload size in bytes
      # @param batch_index [Integer, nil] batch sequence number
      # @param retry_policy [SearchEngine::Indexer::RetryPolicy]
      # @param dry_run [Boolean] when true, do not perform network call
      # @return [Hash] stats payload: { index:, docs_count:, success_count:, failure_count:, attempts:, http_status:, duration_ms:, bytes_sent:, errors_sample: [] }
      # @raise [SearchEngine::Errors::Api] when the underlying client raises an API error (propagated)
      # @raise [StandardError] any other client error the retry policy declines to retry
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
      def self.import_batch(client:, collection:, action:, jsonl:, docs_count:, bytes_sent:, batch_index:,
                            retry_policy:, dry_run: false)
        return dry_run_stats(collection, batch_index, docs_count, bytes_sent) if dry_run

        attempt = 1
        begin
          perform_attempt(client, collection, action, jsonl, docs_count, bytes_sent, batch_index, attempt)
        rescue StandardError => error
          # Errors the policy does not retry (including API errors such as 413)
          # propagate unchanged so the caller can own splitting/mapping.
          raise unless retry_policy.retry?(attempt, error)

          delay = retry_policy.next_delay(attempt, error)
          sleep(delay) if delay.positive?
          attempt += 1
          retry # bounded: the policy's retry? decides when to stop
        end
      end

      class << self
        private

        # Build the stats for a dry run: no network call, every document is
        # reported as successful, and the instrumentation event is still emitted
        # when ActiveSupport::Notifications is loaded.
        def dry_run_stats(collection, idx, docs_count, bytes_sent)
          http_status = 200
          if defined?(ActiveSupport::Notifications)
            payload = base_payload(collection, idx, docs_count, bytes_sent)
            SearchEngine::Instrumentation.instrument(BATCH_IMPORT_EVENT, payload) do |ctx|
              ctx[:success_count] = docs_count
              ctx[:failure_count] = 0
              ctx[:http_status] = http_status
            end
          end
          build_stats(idx, docs_count, docs_count, 0, 1, http_status, 0.0, bytes_sent, [])
        end

        # Run one import call and return its stats Hash. Exceptions raised by
        # the client are not rescued here; the caller's retry loop handles them.
        def perform_attempt(client, collection, action, jsonl, docs_count, bytes_sent, idx, attempt)
          started_at = monotonic_ms
          success_count = 0
          failure_count = 0
          error_sample = []
          # A call that returns without raising is reported as HTTP 200.
          http_status = 200

          if defined?(ActiveSupport::Notifications)
            payload = base_payload(collection, idx, docs_count, bytes_sent).merge(attempts: attempt)
            SearchEngine::Instrumentation.instrument(BATCH_IMPORT_EVENT, payload) do |ctx|
              success_count, failure_count, error_sample = import_and_parse(client, collection, action, jsonl)
              ctx[:success_count] = success_count
              ctx[:failure_count] = failure_count
              ctx[:http_status] = http_status
            end
          else
            success_count, failure_count, error_sample = import_and_parse(client, collection, action, jsonl)
          end

          build_stats(idx, docs_count, success_count, failure_count, attempt, http_status,
                      monotonic_ms - started_at, bytes_sent, error_sample)
        end

        # Call the client and reduce its raw response to
        # [success_count, failure_count, error_samples].
        def import_and_parse(client, collection, action, jsonl)
          parse_import_response(client.import_documents(collection: collection, jsonl: jsonl, action: action))
        end

        # Initial instrumentation payload; per-attempt fields start as nil and
        # are filled in by the caller or inside the instrumented block.
        def base_payload(collection, idx, docs_count, bytes_sent)
          {
            collection: SearchEngine::Instrumentation.context[:collection] || collection,
            into: collection,
            batch_index: idx,
            docs_count: docs_count,
            success_count: nil,
            failure_count: nil,
            attempts: nil,
            http_status: nil,
            bytes_sent: bytes_sent,
            transient_retry: false,
            retry_after_s: nil,
            error_sample: nil
          }
        end

        # Assemble the stats Hash returned by import_batch.
        def build_stats(idx, docs_count, success_count, failure_count, attempts, http_status, duration_ms, bytes_sent,
                        error_sample)
          {
            index: idx,
            docs_count: docs_count,
            success_count: success_count,
            failure_count: failure_count,
            attempts: attempts,
            http_status: http_status,
            duration_ms: duration_ms.round(1),
            bytes_sent: bytes_sent,
            errors_sample: Array(error_sample)[0, MAX_ERROR_SAMPLES]
          }
        end

        # Accepts a JSONL String or an Array of per-document results; any other
        # response shape yields zero counts and no samples.
        def parse_import_response(raw)
          case raw
          when String then parse_from_string(raw)
          when Array then parse_from_array(raw)
          else [0, 0, []]
          end
        end

        # Tally a JSONL response, one result object per non-blank line.
        # Lines that are not valid JSON are failures sampled as
        # 'invalid-json-line'; valid JSON that is not an object (e.g. an array)
        # is a failure without a sample, matching parse_from_array.
        def parse_from_string(str)
          success = 0
          failure = 0
          samples = []
          str.each_line do |line|
            line = line.strip
            next if line.empty?

            entry = safe_parse_json(line)
            if success_entry?(entry)
              success += 1
              next
            end

            failure += 1
            # nil/false means the line failed to parse (or was a bare null/false).
            add_sample(samples, entry ? error_message(entry) : INVALID_JSON_SAMPLE)
          end
          [success, failure, samples]
        end

        # Tally an Array response; non-Hash elements count as failures.
        def parse_from_array(arr)
          success = 0
          failure = 0
          samples = []
          arr.each do |entry|
            if success_entry?(entry)
              success += 1
            else
              failure += 1
              add_sample(samples, error_message(entry))
            end
          end
          [success, failure, samples]
        end

        # True when the entry is a Hash whose "success"/:success value is truthy.
        def success_entry?(entry)
          entry.is_a?(Hash) && truthy?(entry['success'] || entry[:success])
        end

        # Error text carried by a result Hash (string or symbol keys), or nil.
        def error_message(entry)
          return nil unless entry.is_a?(Hash)

          entry['error'] || entry[:error] || entry['message'] || entry[:message]
        end

        # Append a truncated message unless it is nil or the sample cap is reached.
        def add_sample(samples, message)
          return if message.nil? || samples.size >= MAX_ERROR_SAMPLES

          samples << message.to_s[0, MAX_ERROR_LENGTH]
        end

        # Best-effort parse: a malformed line must not abort the whole tally.
        def safe_parse_json(line)
          JSON.parse(line)
        rescue StandardError
          nil
        end

        # Accepts boolean true as well as the string "true" in any letter case.
        def truthy?(val)
          val == true || val.to_s.downcase == 'true'
        end

        def monotonic_ms
          SearchEngine::Instrumentation.monotonic_ms
        end
      end
    end
  end
end
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module SearchEngine
  class Indexer
    # Encapsulates retryability rules and exponential backoff with jitter
    # for import attempts. Pure and deterministic except for jitter.
    #
    # Usage:
    #   policy = SearchEngine::Indexer::RetryPolicy.from_config(SearchEngine.config.indexer.retries)
    #   attempt = 1
    #   begin
    #     # perform
    #   rescue => error
    #     if policy.retry?(attempt, error)
    #       sleep(policy.next_delay(attempt, error))
    #       attempt += 1
    #       retry
    #     else
    #       raise
    #     end
    #   end
    #
    # @since M8
    class RetryPolicy
      # Fallbacks used by {.from_config} when a setting is missing or invalid.
      DEFAULT_ATTEMPTS = 3
      DEFAULT_BASE = 0.5
      DEFAULT_MAX = 5.0
      DEFAULT_JITTER_FRACTION = 0.2

      # @return [Integer] maximum number of attempts (first try included)
      attr_reader :attempts

      # @param attempts [Integer] maximum number of attempts; must be Integer()-coercible
      # @param base [Float] delay in seconds before the second attempt
      # @param max [Float] cap in seconds applied to the exponential term (before jitter)
      # @param jitter_fraction [Float] jitter amplitude as a fraction of the capped delay
      # @raise [ArgumentError, TypeError] when attempts cannot be coerced by Integer()
      def initialize(attempts:, base:, max:, jitter_fraction:)
        @attempts = Integer(attempts)
        @base = base.to_f
        @max = max.to_f
        @jitter_fraction = jitter_fraction.to_f
      end

      # Build a policy from a config-like Hash with symbol keys
      # (:attempts, :base, :max, :jitter_fraction).
      #
      # Missing, zero or negative :attempts/:base/:max fall back to the
      # defaults. A missing or negative :jitter_fraction falls back to the
      # default, while an explicit 0 is honored and disables jitter.
      #
      # @param cfg [Hash, nil]
      # @return [RetryPolicy]
      # @see https://nikita-shkoda.mintlify.app/projects/search-engine-for-typesense/indexer
      def self.from_config(cfg)
        c = cfg || {}
        attempts = c[:attempts].to_i
        base = c[:base].to_f
        max = c[:max].to_f
        raw_jitter = c[:jitter_fraction]
        # nil.to_f is 0.0, so absence has to be checked before coercing;
        # otherwise a missing key would silently turn jitter off.
        jitter = raw_jitter.nil? || raw_jitter.to_f.negative? ? DEFAULT_JITTER_FRACTION : raw_jitter.to_f

        new(
          attempts: attempts.positive? ? attempts : DEFAULT_ATTEMPTS,
          base: base.positive? ? base : DEFAULT_BASE,
          max: max.positive? ? max : DEFAULT_MAX,
          jitter_fraction: jitter
        )
      end

      # Whether we should retry for a given attempt and error.
      #
      # Timeouts and connection errors are always retryable; API errors are
      # retryable only for HTTP 429 and 5xx; every other error is not.
      #
      # @param attempt [Integer] 1-based attempt index
      # @param error [Exception]
      # @return [Boolean]
      def retry?(attempt, error)
        return false if attempt >= @attempts

        case error
        when SearchEngine::Errors::Timeout, SearchEngine::Errors::Connection
          true
        when SearchEngine::Errors::Api
          transient_status?(error.status.to_i)
        else
          false
        end
      end

      # Compute the delay in seconds before the next attempt.
      #
      # The exponential term base * 2^(attempt - 1) is capped at max, then a
      # random offset of up to +/- jitter_fraction of that value is added, so
      # the result can exceed max by at most the jitter amount. Never negative.
      #
      # @param attempt [Integer] 1-based attempt index
      # @param _error [Exception] unused; accepted for interface symmetry with {#retry?}
      # @return [Float]
      def next_delay(attempt, _error)
        capped = [@base * (2**(attempt - 1)), @max].min
        spread = capped * @jitter_fraction
        delay = capped + random_in_range(-spread..spread)
        delay.positive? ? delay : 0.0
      end

      private

      # HTTP 429 and any 5xx status are treated as transient.
      def transient_status?(code)
        code == 429 || code.between?(500, 599)
      end

      # Random float in [range.begin, range.end).
      def random_in_range(range)
        # One RNG per thread, stored under a thread-local key.
        rng = (Thread.current[:__se_retry_rng__] ||= Random.new)
        low = range.begin.to_f
        high = range.end.to_f
        low + (rng.rand * (high - low))
      end
    end
  end
end
|