parse-stack-next 5.3.0 → 5.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. checksums.yaml +4 -4
  2. data/.gitignore +2 -0
  3. data/CHANGELOG.md +461 -0
  4. data/Gemfile +7 -0
  5. data/Gemfile.lock +12 -4
  6. data/README.md +160 -3
  7. data/Rakefile +52 -3
  8. data/docs/atlas_vector_search_guide.md +86 -2
  9. data/docs/client_sdk_guide.md +5 -0
  10. data/docs/mcp_guide.md +59 -4
  11. data/docs/mongodb_direct_guide.md +93 -1
  12. data/docs/usage_guide.md +11 -1
  13. data/docs/webhooks_guide.md +418 -0
  14. data/examples/README.md +46 -0
  15. data/examples/basic_client.rb +93 -0
  16. data/examples/basic_server.rb +109 -0
  17. data/examples/live_query_listener.rb +98 -0
  18. data/examples/rag_chatbot.rb +221 -0
  19. data/examples/webhook_server.rb +111 -0
  20. data/lib/parse/agent/mcp_rack_app.rb +285 -62
  21. data/lib/parse/agent/tools.rb +45 -5
  22. data/lib/parse/api/aggregate.rb +7 -1
  23. data/lib/parse/api/cloud_functions.rb +12 -4
  24. data/lib/parse/api/hooks.rb +46 -9
  25. data/lib/parse/api/objects.rb +16 -2
  26. data/lib/parse/api/path_segment.rb +33 -0
  27. data/lib/parse/api/server.rb +94 -0
  28. data/lib/parse/api/users.rb +58 -2
  29. data/lib/parse/atlas_search.rb +7 -7
  30. data/lib/parse/client/body_builder.rb +5 -0
  31. data/lib/parse/client/protocol.rb +4 -0
  32. data/lib/parse/client.rb +55 -2
  33. data/lib/parse/embeddings/spend_cap.rb +255 -0
  34. data/lib/parse/embeddings.rb +1 -0
  35. data/lib/parse/live_query/client.rb +3 -1
  36. data/lib/parse/live_query/subscription.rb +32 -5
  37. data/lib/parse/model/acl.rb +4 -2
  38. data/lib/parse/model/classes/audience.rb +52 -4
  39. data/lib/parse/model/classes/user.rb +180 -3
  40. data/lib/parse/model/core/embed_managed.rb +113 -0
  41. data/lib/parse/model/core/querying.rb +3 -1
  42. data/lib/parse/model/core/vector_searchable.rb +161 -0
  43. data/lib/parse/model/object.rb +28 -5
  44. data/lib/parse/mongodb.rb +7 -1
  45. data/lib/parse/pipeline_security.rb +5 -3
  46. data/lib/parse/query/constraints.rb +29 -0
  47. data/lib/parse/query.rb +265 -27
  48. data/lib/parse/retrieval/agent_tool.rb +49 -0
  49. data/lib/parse/retrieval/reranker/cohere.rb +218 -0
  50. data/lib/parse/retrieval/reranker.rb +157 -0
  51. data/lib/parse/retrieval/retriever.rb +110 -23
  52. data/lib/parse/stack/version.rb +1 -1
  53. data/lib/parse/stack.rb +17 -0
  54. data/lib/parse/two_factor_auth/user_extension.rb +123 -31
  55. data/lib/parse/vector_search/hybrid.rb +578 -0
  56. data/lib/parse/webhooks/payload.rb +252 -7
  57. data/lib/parse/webhooks/trigger_audit.rb +502 -0
  58. data/lib/parse/webhooks.rb +215 -3
  59. data/scripts/docker/Dockerfile.parse +5 -1
  60. data/scripts/docker/docker-compose.test.yml +31 -0
  61. data/scripts/docker/docker-compose.verifyemail.yml +4 -0
  62. data/scripts/docker/preflight.sh +76 -0
  63. data/scripts/start-parse.sh +52 -4
  64. metadata +15 -1
@@ -0,0 +1,578 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "../vector_search"
5
+
6
+ module Parse
7
+ module VectorSearch
8
+ # Hybrid (lexical + vector) search with reciprocal-rank fusion.
9
+ #
10
+ # Lexical search (`Parse::AtlasSearch`, BM25/`$search`) nails
11
+ # exact-token matches — proper nouns, SKU codes, "OAuth 2.0". Vector
12
+ # search (`Parse::VectorSearch`, `$vectorSearch`) nails paraphrase —
13
+ # "login token spec". Fusing the two beats either alone on most real
14
+ # workloads.
15
+ #
16
+ # == Why two aggregations (and not one `$facet`)
17
+ #
18
+ # `$vectorSearch` is explicitly prohibited inside `$facet`,
19
+ # `$lookup`, `$unionWith`, or any compound stage on every Atlas
20
+ # version, and it must be the FIRST stage of its pipeline. So on
21
+ # pre-Atlas-8.0 clusters the only correct shape is two independent
22
+ # aggregations followed by client-side reciprocal-rank fusion (RRF).
23
+ # On Atlas 8.0+ the native `$rankFusion` stage performs the same
24
+ # fusion server-side in a single round-trip; {.rank_fusion_supported?}
25
+ # detects it (probe-and-cache, not version-string parsing).
26
+ #
27
+ # == ACL / CLP enforcement
28
+ #
29
+ # The client-side path delegates each branch to an entry point that
30
+ # already enforces the full SDK-side chain — {Parse::AtlasSearch.search}
31
+ # (lexical) and {Parse::VectorSearch.search} (vector). Both apply the
32
+ # CLP `find` boundary, the post-stage `_rperm` `$match`, pointerFields
33
+ # filtering, `protectedFields` redaction, and the internal-fields
34
+ # denylist BEFORE returning rows. Fusion therefore operates only on
35
+ # rows the caller is already allowed to read; there is no separate
36
+ # hydration fetch to re-secure. The native `$rankFusion` path
37
+ # reproduces the same enforcement inline (CLP `find`, post-stage ACL
38
+ # `$match`, post-fetch redaction), mirroring {Parse::VectorSearch.search}.
39
+ #
40
+ # == Scores
41
+ #
42
+ # The vector branch projects `_vscore` (Atlas `vectorSearchScore`),
43
+ # the lexical branch `_score` (Atlas `searchScore`). The fused row
44
+ # carries `_hybrid_score` (the summed RRF weight) and `_hybrid_ranks`
45
+ # (`{ lexical: <rank>, vector: <rank> }`, 1-based, absent for a branch
46
+ # the row did not appear in). The raw branch scores are preserved on
47
+ # the row for callers that want them.
48
+ module Hybrid
49
+ # Raised on malformed fusion input (bad weights, non-positive
50
+ # `k_constant`, empty branch set). Inherits {ArgumentError} so it
51
+ # joins the other bad-input raises in a single rescue boundary.
52
+ class FusionError < ArgumentError; end
53
+
54
+ # Standard RRF rank constant. Larger values flatten the
55
+ # contribution curve (later ranks matter more); 60 is the value
56
+ # from the original Cormack et al. RRF paper and the Atlas
57
+ # `$rankFusion` default.
58
+ DEFAULT_K_CONSTANT = 60
59
+
60
+ # Default number of fused hits returned.
61
+ DEFAULT_K = 20
62
+
63
+ # Per-branch oversample multiplier. Each branch fetches
64
+ # `k * this` candidates so a row ranked low in one branch but high
65
+ # in the other still has a rank to fuse. Atlas's own `$rankFusion`
66
+ # uses a comparable internal oversample.
67
+ DEFAULT_OVERSAMPLE_MULTIPLIER = 5
68
+
69
+ # Hard ceiling on the fused result count, matching
70
+ # {Parse::VectorSearch::MAX_K}.
71
+ MAX_K = Parse::VectorSearch::MAX_K
72
+
73
+ # TTL (seconds) for the {.rank_fusion_supported?} probe cache. A
74
+ # cluster gaining or losing `$rankFusion` support is a rare,
75
+ # operator-driven event (an Atlas major-version upgrade), so a
76
+ # 1-hour cache keeps the extra probe round-trip off the hot path.
77
+ PROBE_CACHE_TTL = 3600
78
+
79
+ class << self
80
# Pure reciprocal-rank fusion. Operates on already-fetched,
# already-ranked branch result lists — no I/O and no ACL work is done
# here; callers are expected to pass rows that were enforced upstream.
#
#   fused_score(d) = Σ_b weight_b / (k_constant + rank_b(d))
#
# A document contributes at most ONE rank per branch: if the same id
# shows up more than once inside a single branch, only its first
# (best) position is scored. Later duplicates are still merged into
# the row's fields but do not add to the score or overwrite the rank.
#
# @param branches [Hash{Symbol=>Array<Hash>}] each value is a
#   branch's result rows in descending relevance order (best first).
#   Keys name the branch (`:lexical`, `:vector`).
# @param k_constant [Integer] RRF rank constant (> 0).
# @param weights [Hash{Symbol=>Numeric}, nil] per-branch weight.
#   Missing branches default to weight 1.0; nil weights the whole set
#   at 1.0. A branch whose weight is zero is skipped entirely.
# @return [Array<Hash>] fused rows, descending by `_hybrid_score`,
#   each carrying `_hybrid_score` and `_hybrid_ranks`. Ties are broken
#   deterministically by the row id string (stable for snapshots).
# @raise [FusionError] when `branches` is not a non-empty Hash, when
#   `k_constant` is not convertible to a positive integer, or when
#   `weights` fails validation.
def rrf(branches, k_constant: DEFAULT_K_CONSTANT, weights: nil)
  unless branches.is_a?(Hash) && !branches.empty?
    raise FusionError, "rrf: branches must be a non-empty Hash of ranked result lists."
  end
  kc = begin
    Integer(k_constant)
  rescue ArgumentError, TypeError
    # Keep unconvertible input (nil, "abc", ...) inside the FusionError
    # boundary instead of leaking a bare TypeError/ArgumentError.
    raise FusionError, "rrf: k_constant must be a positive integer (got #{k_constant.inspect})."
  end
  raise FusionError, "rrf: k_constant must be a positive integer (got #{kc})." if kc <= 0
  validate_weights!(weights)

  acc = {}
  order = 0
  branches.each do |branch_name, rows|
    weight = weight_for(weights, branch_name)
    next if weight.zero?
    Array(rows).each_with_index do |row, i|
      next if row.nil? # a nil row has no id to fuse on
      id = row_id(row)
      next if id.nil?
      entry = (acc[id] ||= { doc: row, score: 0.0, ranks: {}, seq: (order += 1) })
      entry[:doc] = merge_rows(entry[:doc], row)
      # Only the first (best) position of an id within a branch is
      # scored; repeated ids must not double-count.
      next if entry[:ranks].key?(branch_name)
      rank = i + 1
      entry[:score] += weight.to_f / (kc + rank)
      entry[:ranks][branch_name] = rank
    end
  end

  acc.values
     .sort_by { |e| [-e[:score], row_id(e[:doc]).to_s, e[:seq]] }
     .map do |e|
       # dup so the caller's branch rows are never mutated
       row = e[:doc].dup
       row["_hybrid_score"] = e[:score]
       row["_hybrid_ranks"] = e[:ranks]
       row
     end
end
129
+
130
# Report whether the cluster backing `collection` accepts the native
# `$rankFusion` aggregation stage.
#
# The answer comes from a behavioural probe ({#run_probe}) rather than
# from parsing a version string, and is memoised per collection name
# for {PROBE_CACHE_TTL} seconds. The cache timestamp is the monotonic
# reading taken BEFORE the probe runs. Both `true` and `false`
# verdicts are cached; only a cache miss (nil) triggers a new probe.
#
# @param collection [String] Parse class / Mongo collection name
#   (converted with `to_s` to form the cache key).
# @return [Boolean]
def rank_fusion_supported?(collection)
  cache_key = collection.to_s
  started_at = monotonic
  hit = probe_cache_get(cache_key, started_at)
  return hit unless hit.nil?

  run_probe(cache_key).tap do |verdict|
    probe_cache_put(cache_key, verdict, started_at)
  end
end
154
+
155
# Drop memoised {.rank_fusion_supported?} verdicts, either for one
# collection or (when no argument / a falsy argument is given) for
# all of them. Runs under the probe mutex. Mainly useful for tests
# that change cluster behaviour between cases.
#
# @param collection [String, nil] collection whose entry should be
#   removed; converted with `to_s` before lookup.
def clear_probe_cache(collection = nil)
  probe_mutex.synchronize do
    collection ? probe_cache.delete(collection.to_s) : (@probe_cache = {})
  end
end
169
+
170
# Execute a hybrid (lexical + vector) search and return the fused raw
# rows, truncated to `k`.
#
# @param collection_name [String] Parse class / collection.
# @param lexical [Hash] lexical branch config:
#   * `:query` [String] (required) the `$search` query text.
#   * `:index` [String, nil] Atlas Search (lexical) index name.
#   * `:fields` [Array<String>, String, nil] fields to search;
#     defaults to a wildcard path.
#   * `:filter` [Hash, nil] post-`$search` `$match`.
#   * `:fuzzy` [Hash, nil] forwarded to the text operator.
# @param vector [Hash] vector branch config:
#   * `:query_vector` [Array<Float>] (required) the query embedding.
#   * `:field` [String, Symbol] (required) vector field path.
#   * `:index` [String, nil] vectorSearch index name.
#   * `:num_candidates` [Integer, nil] Atlas HNSW search width.
#   * `:filter` [Hash, nil] post-`$vectorSearch` `$match`.
#   * `:vector_filter` [Hash, nil] Atlas-native pre-search filter.
# @param k [Integer] number of fused hits to return (1..{MAX_K}).
# @param fusion [Hash, nil] fusion config:
#   * `:method` [Symbol] `:rrf` (default) and `:rrf_client` fuse
#     client-side. `:rrf_native` tries the server-side `$rankFusion`
#     stage when {.rank_fusion_supported?} says so, and uses the
#     client path when it is unsupported or {#run_native} returns nil.
#   * `:k_constant` [Integer] RRF rank constant.
#   * `:weights` [Hash] `{ lexical:, vector: }` branch weights.
# @param scope_opts [Hash] ACL/CLP scope kwargs forwarded to both
#   branch entry points (`session_token:` / `master:` / `acl_user:` /
#   `acl_role:`).
# @return [Array<Hash>] fused rows (see {.rrf}).
# @raise [ArgumentError] on an out-of-range `k`, a blank lexical
#   query, a missing vector query/field, or an unknown fusion method.
def search(collection_name, lexical:, vector:, k: DEFAULT_K, fusion: nil, **scope_opts)
  require_available!
  fusion_cfg = symbolize(fusion || {})
  lex_cfg = symbolize(lexical || {})
  vec_cfg = symbolize(vector || {})

  limit = Integer(k)
  unless limit.positive? && limit <= MAX_K
    raise ArgumentError, "k must be in 1..#{MAX_K} (got #{limit})."
  end

  query_text = lex_cfg[:query]
  if !query_text.is_a?(String) || query_text.strip.empty?
    raise ArgumentError, "hybrid search: lexical[:query] must be a non-empty String."
  end
  if vec_cfg[:query_vector].nil? || vec_cfg[:field].nil?
    raise ArgumentError, "hybrid search: vector[:query_vector] and vector[:field] are required."
  end

  fusion_method = (fusion_cfg[:method] || :rrf).to_sym
  unless %i[rrf rrf_client rrf_native].include?(fusion_method)
    raise ArgumentError,
          "hybrid search: fusion[:method] must be :rrf, :rrf_client, or :rrf_native (got #{fusion_method.inspect})."
  end
  rank_constant = fusion_cfg[:k_constant] || DEFAULT_K_CONSTANT
  branch_weights = fusion_cfg[:weights]
  # Each branch fetches more than `k` rows so a row ranked low in one
  # branch but high in the other still has a rank to fuse.
  per_branch = [limit * DEFAULT_OVERSAMPLE_MULTIPLIER, limit].max

  # Client-side fusion is the default. The native single-roundtrip
  # `$rankFusion` path is opt-in (`fusion: { method: :rrf_native }`)
  # and is only attempted when the probe reports support; a nil return
  # from the native runner (its failure signal) falls through to the
  # two-aggregate client path below.
  if fusion_method == :rrf_native && rank_fusion_supported?(collection_name)
    native_rows = run_native(collection_name, lex_cfg, vec_cfg, per_branch,
                             k_constant: rank_constant, weights: branch_weights,
                             scope_opts: scope_opts)
    return native_rows.first(limit) if native_rows
  end

  branch_rows = {
    lexical: run_lexical(collection_name, lex_cfg, per_branch, scope_opts),
    vector: run_vector(collection_name, vec_cfg, per_branch, scope_opts),
  }
  rrf(branch_rows, k_constant: rank_constant, weights: branch_weights).first(limit)
end
247
+
248
+ private
249
+
250
+ # -- client-side branch execution --------------------------------
251
+
252
# Run the lexical branch through {Parse::AtlasSearch.search}, asking
# for `oversample` raw rows and forwarding the caller's scope options.
# `atlas_search` is loaded lazily on first use.
#
# @param collection_name [String]
# @param lex [Hash] symbol-keyed lexical config (`:query`, `:index`,
#   `:fields`, `:filter`, `:fuzzy`).
# @param oversample [Integer] per-branch row limit.
# @param scope_opts [Hash] ACL/CLP scope kwargs (not mutated here).
# @return whatever {Parse::AtlasSearch.search} returns with `raw: true`.
def run_lexical(collection_name, lex, oversample, scope_opts)
  require_relative "../atlas_search"
  search_opts = {
    index: lex[:index],
    fields: lex[:fields],
    filter: lex[:filter],
    fuzzy: lex[:fuzzy],
    limit: oversample,
    raw: true,
  }.merge(scope_opts)
  Parse::AtlasSearch.search(collection_name, lex[:query], **search_opts)
end
265
+
266
# Run the vector branch through {Parse::VectorSearch.search}, asking
# for `oversample` hits and forwarding the caller's scope options.
#
# @param collection_name [String]
# @param vec [Hash] symbol-keyed vector config (`:field`,
#   `:query_vector`, `:num_candidates`, `:filter`, `:vector_filter`,
#   `:index`).
# @param oversample [Integer] per-branch `k`.
# @param scope_opts [Hash] ACL/CLP scope kwargs (not mutated here).
# @return whatever {Parse::VectorSearch.search} returns.
def run_vector(collection_name, vec, oversample, scope_opts)
  search_opts = {
    field: vec[:field],
    query_vector: vec[:query_vector],
    k: oversample,
    num_candidates: vec[:num_candidates],
    filter: vec[:filter],
    vector_filter: vec[:vector_filter],
    index: vec[:index],
  }.merge(scope_opts)
  Parse::VectorSearch.search(collection_name, **search_opts)
end
279
+
280
+ # -- native $rankFusion path -------------------------------------
281
+
282
# Build the single `$rankFusion` stage (no ACL/CLP stages) from the
# two branch sub-pipelines. The ACL `$match` is added later by
# {#native_pipeline_for}.
#
# NOTE: `k_constant` is part of the signature but is not read by this
# method; the only fusion tunable emitted is per-input `weights`
# under `combination`.
#
# @param lex [Hash] symbol-keyed lexical config.
# @param vec [Hash] symbol-keyed vector config.
# @param oversample [Integer] per-branch limit passed to both
#   sub-pipeline builders.
# @param k_constant [Integer] accepted for signature parity; unused.
# @param weights [Hash, nil] optional branch weights; when present a
#   `combination.weights` entry is emitted for both branches (a
#   branch without an explicit weight gets the {#weight_for} default).
# @return [Hash] `{ "$rankFusion" => {...} }`.
def build_rank_fusion_stage(lex, vec, oversample, k_constant:, weights:)
  vector_pipeline = vector_search_stage(vec, oversample)
  lexical_pipeline = lexical_search_stage(lex, oversample)
  fusion_spec = {
    "input" => {
      "pipelines" => { "vector" => vector_pipeline, "lexical" => lexical_pipeline },
    },
    "scoreDetails" => false,
  }
  return { "$rankFusion" => fusion_spec } unless weights

  by_branch = symbolize(weights)
  fusion_spec["combination"] = {
    "weights" => {
      "vector" => weight_for(by_branch, :vector),
      "lexical" => weight_for(by_branch, :lexical),
    },
  }
  { "$rankFusion" => fusion_spec }
end
304
+
305
# Assemble (without executing) the full native pipeline, including
# the ACL `$match` for a non-master resolution, so the
# security-relevant shape can be pinned by snapshot tests.
#
# NOTE(review): this method sits below the `private` marker of the
# singleton class, so external tests presumably reach it via `send` —
# confirm that is intended. `collection_name` is not read here.
#
# @param collection_name [String] unused by this method.
# @param lexical [Hash] lexical branch config (see {.search}).
# @param vector [Hash] vector branch config (see {.search}).
# @param k [Integer] final `$limit`; also drives the oversample size.
# @param fusion [Hash, nil] `:k_constant` / `:weights`.
# @param scope_opts [Hash] ACL scope kwargs resolved through
#   {Parse::ACLScope.resolve!}.
# @return [Array<Hash>] the aggregation pipeline.
def native_pipeline(collection_name, lexical:, vector:, k: DEFAULT_K, fusion: nil, **scope_opts)
  fusion_cfg = symbolize(fusion || {})
  lex_cfg = symbolize(lexical || {})
  vec_cfg = symbolize(vector || {})
  limit = Integer(k)
  per_branch = [limit * DEFAULT_OVERSAMPLE_MULTIPLIER, limit].max
  resolution = Parse::ACLScope.resolve!(scope_opts.dup, method_name: :"VectorSearch::Hybrid.search")
  native_pipeline_for(
    lex_cfg, vec_cfg, per_branch, resolution,
    k_constant: fusion_cfg[:k_constant] || DEFAULT_K_CONSTANT,
    weights: fusion_cfg[:weights],
    limit: limit,
  )
end
321
+
322
# Compose the native pipeline: `$rankFusion`, then `$addFields` to
# surface the fused score, then (for a non-nil, non-master
# resolution) the ACL `$match`, then `$sort` and `$limit`.
#
# @param lex [Hash] symbol-keyed lexical config.
# @param vec [Hash] symbol-keyed vector config.
# @param oversample [Integer] per-branch limit.
# @param resolution [Object, nil] ACL resolution; must respond to
#   `master?` when not nil.
# @param k_constant [Integer] forwarded to the stage builder.
# @param weights [Hash, nil] forwarded to the stage builder.
# @param limit [Integer] value of the trailing `$limit`.
# @return [Array<Hash>] the aggregation pipeline.
def native_pipeline_for(lex, vec, oversample, resolution, k_constant:, weights:, limit:)
  stages = [
    build_rank_fusion_stage(lex, vec, oversample, k_constant: k_constant, weights: weights),
    # The fused RRF score is read from `{ $meta: "score" }` (a
    # numeric), not from "scoreDetails" (a breakdown document).
    { "$addFields" => { "_hybrid_score" => { "$meta" => "score" } } },
  ]
  if !resolution.nil? && !resolution.master?
    acl_stage = Parse::ACLScope.match_stage_for(resolution)
    stages << acl_stage if acl_stage
  end
  stages.push({ "$sort" => { "_hybrid_score" => -1 } }, { "$limit" => limit })
end
335
+
336
# Execute the native `$rankFusion` pipeline with the enforcement
# steps applied inline: CLP `find` check, pointer-field resolution,
# input validation, the in-pipeline ACL `$match`, then post-fetch row
# gating and redaction.
#
# @param collection_name [String]
# @param lex [Hash] symbol-keyed lexical config.
# @param vec [Hash] symbol-keyed vector config.
# @param oversample [Integer] per-branch limit, also used as the
#   pipeline's trailing `$limit`.
# @param k_constant [Integer] forwarded to the pipeline builder.
# @param weights [Hash, nil] forwarded to the pipeline builder.
# @param scope_opts [Hash] ACL scope kwargs (a copy is resolved).
# @return [Array<Hash>, nil] rows, or nil when any StandardError
#   other than {Parse::CLPScope::Denied} occurred (the caller then
#   falls back to the client-side path).
# @raise [Parse::CLPScope::Denied] re-raised untouched.
def run_native(collection_name, lex, vec, oversample, k_constant:, weights:, scope_opts:)
  scope = Parse::ACLScope.resolve!(scope_opts.dup, method_name: :"VectorSearch::Hybrid.search")
  assert_clp_find!(collection_name, scope)
  owner_fields = resolve_pointer_fields!(collection_name, scope)
  hidden_fields = Parse::CLPScope.protected_fields_for(collection_name, scope.permission_strings)
  Parse::VectorSearch.validate_query_vector!(vec[:query_vector])
  [vec[:vector_filter], vec[:filter], lex[:filter]].each do |candidate|
    Parse::PipelineSecurity.validate_filter!(candidate) if candidate
  end

  stages = native_pipeline_for(lex, vec, oversample, scope,
                               k_constant: k_constant, weights: weights, limit: oversample)
  docs = run_pipeline!(collection_name, stages)

  unless scope.master?
    # Defense-in-depth top-level row gate. The in-pipeline ACL `$match`
    # is the primary filter, but it runs AFTER `$rankFusion` and treats
    # a missing `_rperm` as public. If the fusion stage did not carry
    # `_rperm` through to its output, every row would fail OPEN. So
    # each row is re-verified here and the check FAILS CLOSED: a
    # non-master row must carry an `_rperm` array that explicitly
    # matches the scope. The tradeoff is that rows with no `_rperm` at
    # all are dropped on this opt-in path.
    allowed = Array(scope.permission_strings).to_set
    docs.select! { |doc| native_row_visible?(doc, allowed) }
    Parse::ACLScope.redact_results!(docs, scope)
    Parse::CLPScope.redact_protected_fields!(docs, hidden_fields) if hidden_fields.any?
    docs = Parse::CLPScope.filter_by_pointer_fields(docs, owner_fields, scope.user_id) if owner_fields
  end
  docs.map! { |doc| Parse::PipelineSecurity.strip_internal_fields(doc) }
  docs
rescue Parse::CLPScope::Denied
  raise
rescue StandardError
  # Native execution failed (for example a cluster that probed as
  # supported but rejects this exact shape, or a transient error).
  # Signal the caller to use the client-side path instead of failing
  # the whole search.
  nil
end
387
+
388
# Build the vector sub-pipeline for `$rankFusion`: a `$vectorSearch`
# stage, optionally followed by a `$match` on `vec[:filter]`.
#
# `numCandidates` is taken from `vec[:num_candidates]` (or, when
# absent, `oversample * DEFAULT_NUM_CANDIDATES_MULTIPLIER`), raised to
# at least `oversample`, then capped at 10_000.
# NOTE(review): if `oversample` could ever exceed 10_000 the cap would
# yield `numCandidates < limit`; the original comment states the
# caller keeps `oversample` below the cap — confirm against MAX_K.
#
# @param vec [Hash] symbol-keyed vector config.
# @param oversample [Integer] `$vectorSearch` limit.
# @return [Array<Hash>] the sub-pipeline stages.
def vector_search_stage(vec, oversample)
  requested = vec[:num_candidates] || oversample * Parse::VectorSearch::DEFAULT_NUM_CANDIDATES_MULTIPLIER
  width = requested.to_i
  width = oversample if width < oversample
  width = 10_000 if width > 10_000

  search_spec = {
    "index" => vec[:index].to_s,
    "path" => vec[:field].to_s,
    "queryVector" => vec[:query_vector],
    "numCandidates" => width,
    "limit" => oversample,
  }
  pre_filter = vec[:vector_filter]
  search_spec["filter"] = pre_filter if pre_filter && !pre_filter.empty?

  stages = [{ "$vectorSearch" => search_spec }]
  stages << { "$match" => vec[:filter] } if vec[:filter]
  stages
end
410
+
411
# Build the lexical sub-pipeline for `$rankFusion`: the stage produced
# by {Parse::AtlasSearch::SearchBuilder}, a `$limit`, and optionally a
# trailing `$match` on `lex[:filter]` (applied AFTER the limit).
#
# With no `:fields` (nil or empty) one text clause on a wildcard path
# is added; otherwise one text clause per field.
#
# @param lex [Hash] symbol-keyed lexical config.
# @param oversample [Integer] `$limit` for the sub-pipeline.
# @return [Array<Hash>] the sub-pipeline stages.
def lexical_search_stage(lex, oversample)
  require_relative "../atlas_search" if defined?(Parse::AtlasSearch::SearchBuilder).nil?
  builder = Parse::AtlasSearch::SearchBuilder.new(index_name: lex[:index])
  add_clause = ->(path) { builder.text(query: lex[:query], path: path, fuzzy: lex[:fuzzy]) }

  paths = lex[:fields]
  no_paths = paths.nil? || (paths.respond_to?(:empty?) && paths.empty?)
  if no_paths
    add_clause.call({ "wildcard" => "*" })
  else
    Array(paths).each { |field| add_clause.call(field.to_s) }
  end

  stages = [builder.build, { "$limit" => oversample }]
  stages << { "$match" => lex[:filter] } if lex[:filter]
  stages
end
424
+
425
+ # -- the $rankFusion support probe -------------------------------
426
+
427
# Send the behavioural `$rankFusion` probe to `collection_name`.
#
# Success, or any failure that {#unsupported_stage_error?} does NOT
# classify as an unrecognized-stage error, counts as "supported"; the
# real query is then left to surface the real error. Only an
# unrecognized-stage failure yields false.
#
# @param collection_name [String]
# @return [Boolean]
def run_probe(collection_name)
  probe_stages = [{ "$rankFusion" => { "input" => {} } }, { "$limit" => 0 }]
  Parse::MongoDB.collection(collection_name).aggregate(probe_stages).to_a
  true
rescue StandardError => e
  !unsupported_stage_error?(e)
end
439
+
440
+ # Message fragments Mongo emits for an UNRECOGNIZED pipeline stage.
441
+ # We only treat the probe failure as "unsupported" when BOTH the
442
+ # stage name AND an unrecognized-stage phrase appear, so a
443
+ # recognized-but-misused `$rankFusion` (or an unrelated auth/parse
444
+ # error) is treated as supported and surfaces its real error on the
445
+ # actual query rather than silently disabling native fusion.
446
+ UNSUPPORTED_STAGE_FRAGMENTS = [
447
+ "unrecognized pipeline stage name",
448
+ "unknown aggregation stage",
449
+ "is not allowed",
450
+ ].freeze
451
+ private_constant :UNSUPPORTED_STAGE_FRAGMENTS
452
+
453
# True when `err`'s message (case-insensitively) names `rankfusion`
# AND contains one of {UNSUPPORTED_STAGE_FRAGMENTS}.
#
# @param err [Exception]
# @return [Boolean]
def unsupported_stage_error?(err)
  text = err.message.to_s.downcase
  return false unless text.include?("rankfusion")

  UNSUPPORTED_STAGE_FRAGMENTS.any? { |fragment| text.include?(fragment) }
end
457
+
458
+ # -- probe cache -------------------------------------------------
459
+
460
+ PROBE_MUTEX_INIT = Mutex.new
461
+ private_constant :PROBE_MUTEX_INIT
462
+
463
# Lazily create and memoise the mutex that guards the probe cache.
# Creation itself is serialised through `PROBE_MUTEX_INIT`.
#
# @return [Mutex]
def probe_mutex
  return @probe_mutex if @probe_mutex

  PROBE_MUTEX_INIT.synchronize { @probe_mutex ||= Mutex.new }
end
466
+
467
# Lazily created Hash holding the per-collection probe verdicts.
# Callers in this file access it while holding {#probe_mutex}.
#
# @return [Hash]
def probe_cache
  @probe_cache || (@probe_cache = {})
end
470
+
471
# Look up a cached probe verdict under the probe mutex.
#
# @param key [String] cache key (collection name).
# @param now [Numeric] current monotonic reading.
# @return [Boolean, nil] the cached verdict, or nil when there is no
#   entry or the entry is at least {PROBE_CACHE_TTL} seconds old.
def probe_cache_get(key, now)
  probe_mutex.synchronize do
    hit = probe_cache[key]
    next nil if hit.nil?

    expired = (now - hit[:at]) >= PROBE_CACHE_TTL
    expired ? nil : hit[:supported]
  end
end
479
+
480
# Store a probe verdict with its timestamp, under the probe mutex.
#
# @param key [String] cache key (collection name).
# @param supported [Boolean] probe verdict.
# @param now [Numeric] monotonic reading recorded as `:at`.
# @return [Hash] the stored `{ supported:, at: }` record.
def probe_cache_put(key, supported, now)
  record = { supported: supported, at: now }
  probe_mutex.synchronize { probe_cache.store(key, record) }
end
483
+
484
# Current reading of the monotonic clock in (float) seconds. Used for
# the probe-cache TTL so it is immune to wall-clock jumps.
#
# @return [Float]
def monotonic
  Process.clock_gettime(Process::CLOCK_MONOTONIC, :float_second)
end
488
+
489
+ # -- shared helpers ----------------------------------------------
490
+
491
# Guard: make sure the Mongo gem is loadable and the direct-MongoDB
# integration reports itself available.
#
# @return [nil]
# @raise [Parse::VectorSearch::NotAvailable] when
#   `Parse::MongoDB.available?` is false.
def require_available!
  Parse::MongoDB.require_gem!
  return if Parse::MongoDB.available?

  raise Parse::VectorSearch::NotAvailable,
        "Parse::VectorSearch::Hybrid requires Parse::MongoDB.configure(enabled: true)."
end
498
+
499
# Run `pipeline` as an aggregation on `collection_name` and
# materialise the result with `to_a`.
#
# @param collection_name [String]
# @param pipeline [Array<Hash>]
# @return [Array] the aggregation result rows.
def run_pipeline!(collection_name, pipeline)
  cursor = Parse::MongoDB.collection(collection_name).aggregate(pipeline)
  cursor.to_a
end
502
+
503
# Enforce the CLP `find` boundary for a non-master scope. A nil or
# master resolution skips the check.
#
# @param collection_name [String]
# @param resolution [Object, nil] ACL resolution (responds to
#   `master?` and `permission_strings`).
# @return [nil]
# @raise [Parse::CLPScope::Denied] when CLP does not permit `find`.
def assert_clp_find!(collection_name, resolution)
  return if resolution.nil? || resolution.master?
  return if Parse::CLPScope.permits?(collection_name, :find, resolution.permission_strings)

  raise Parse::CLPScope::Denied.new(
    collection_name, :find,
    "CLP refuses find on '#{collection_name}' for the current hybrid-search scope.",
  )
end
512
+
513
# Resolve the CLP pointerFields that apply to `find` for a non-master
# scope. Returns nil when the scope is nil/master or when the class
# declares no pointerFields.
#
# @param collection_name [String]
# @param resolution [Object, nil] ACL resolution (responds to
#   `master?` and `user_id`).
# @return [Object, nil] the pointerFields value from
#   {Parse::CLPScope.pointer_fields_for}, or nil.
# @raise [Parse::CLPScope::Denied] when pointerFields exist but the
#   scope carries no `user_id`.
def resolve_pointer_fields!(collection_name, resolution)
  return nil if resolution.nil? || resolution.master?

  owner_fields = Parse::CLPScope.pointer_fields_for(collection_name, :find)
  return nil if owner_fields.nil?
  return owner_fields unless resolution.user_id.nil?

  raise Parse::CLPScope::Denied.new(
    collection_name, :find,
    "CLP requires user identity (pointerFields=#{owner_fields.inspect}) " \
    "but the current hybrid-search scope has no user_id.",
  )
end
526
+
527
# Validate the optional per-branch weight Hash used by {.rrf}.
#
# Every value must be a real, finite, non-negative Numeric. Complex
# values and non-finite floats (Infinity, NaN) are rejected with
# {FusionError}: a Complex would otherwise fail on `>=` with a
# NoMethodError, and an infinite weight would produce infinite fused
# scores.
#
# @param weights [Hash, nil] branch => weight; nil skips validation.
# @return [Hash, nil] `weights` when valid, nil when `weights` is nil.
# @raise [FusionError] when `weights` is not a Hash or any value is
#   not an acceptable number.
def validate_weights!(weights)
  return if weights.nil?
  unless weights.is_a?(Hash)
    raise FusionError, "rrf: weights must be a Hash of branch => weight (got #{weights.class})."
  end

  weights.each_value do |w|
    # `real?` is checked before `>=` because Complex does not define
    # ordering comparisons.
    next if w.is_a?(Numeric) && w.real? && w.finite? && w >= 0

    raise FusionError, "rrf: weights must be non-negative numbers (got #{w.inspect})."
  end
end
538
+
539
# Look up the weight for one branch, tolerating either Symbol or
# String keys. Falls back to 1.0 when `weights` is nil or carries no
# entry for the branch.
#
# @param weights [Hash, nil]
# @param branch_name [Symbol, String]
# @return [Float]
def weight_for(weights, branch_name)
  return 1.0 if weights.nil?

  raw = weights[branch_name]
  raw ||= weights[branch_name.to_s]
  raw ||= weights[branch_name.to_sym]
  return 1.0 if raw.nil?

  raw.to_f
end
544
+
545
# Extract a row's identifier as a String, checking `_id` then
# `objectId`, each under String and Symbol keys.
#
# @param row [Hash]
# @return [String, nil] nil when no identifier key holds a value.
def row_id(row)
  raw = row["_id"] || row[:_id] || row["objectId"] || row[:objectId]
  raw&.to_s
end
549
+
550
# Fail-closed top-level row gate for the native fusion path. A row is
# visible only when it carries an `_rperm` Array with at least one
# entry contained in `perms_set`. Rows whose `_rperm` is missing,
# empty, or not an Array are rejected — on the native path a missing
# `_rperm` may mean the fusion stage dropped it rather than the row
# being genuinely public.
#
# @param doc [Hash] result row (String or Symbol `_rperm` key).
# @param perms_set [#include?] permission strings of the scope.
# @return [Boolean]
def native_row_visible?(doc, perms_set)
  readers = doc["_rperm"] || doc[:_rperm]
  return false unless readers.is_a?(Array)

  readers.any? { |principal| perms_set.include?(principal) }
end
561
+
562
# Combine two rows for the same id across branches. All keys are
# kept; on a key collision the value from `b` wins unless it is nil,
# so the fused row can carry both branch scores (`_score` and
# `_vscore`).
#
# @param a [Hash, nil]
# @param b [Hash, nil]
# @return [Hash, nil] the other row when one side is nil, otherwise a
#   new merged Hash.
def merge_rows(a, b)
  if a.nil?
    b
  elsif b.nil?
    a
  else
    a.merge(b) { |_key, kept, incoming| incoming.nil? ? kept : incoming }
  end
end
570
+
571
# Shallow copy of `hash` with every top-level key converted via
# `to_sym`. Values are left untouched (nested hashes are NOT
# converted).
#
# @param hash [Hash, nil]
# @return [Hash] a new Hash; empty when `hash` is nil.
def symbolize(hash)
  return {} if hash.nil?

  converted = {}
  hash.each { |key, value| converted[key.to_sym] = value }
  converted
end
575
+ end
576
+ end
577
+ end
578
+ end