parse-stack-next 5.4.1 → 5.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. checksums.yaml +4 -4
  2. data/CHANGELOG.md +489 -0
  3. data/Gemfile.lock +1 -1
  4. data/README.md +61 -9
  5. data/docs/atlas_vector_search_guide.md +318 -19
  6. data/lib/parse/acl_scope.rb +11 -0
  7. data/lib/parse/agent/mcp_rack_app.rb +53 -14
  8. data/lib/parse/agent/mcp_server.rb +19 -0
  9. data/lib/parse/api/path_segment.rb +31 -0
  10. data/lib/parse/api/users.rb +13 -0
  11. data/lib/parse/cache/redis.rb +55 -11
  12. data/lib/parse/client/caching.rb +12 -3
  13. data/lib/parse/client/logging.rb +9 -0
  14. data/lib/parse/client.rb +37 -3
  15. data/lib/parse/embeddings/batch_embedder.rb +188 -0
  16. data/lib/parse/embeddings/cache.rb +374 -0
  17. data/lib/parse/embeddings/cohere.rb +31 -18
  18. data/lib/parse/embeddings/image_fetch.rb +347 -0
  19. data/lib/parse/embeddings/provider.rb +17 -11
  20. data/lib/parse/embeddings/spend_cap.rb +117 -3
  21. data/lib/parse/embeddings/voyage.rb +34 -25
  22. data/lib/parse/embeddings.rb +40 -3
  23. data/lib/parse/model/acl.rb +15 -11
  24. data/lib/parse/model/core/embed_managed.rb +243 -14
  25. data/lib/parse/model/core/properties.rb +42 -5
  26. data/lib/parse/model/core/vector_searchable.rb +157 -8
  27. data/lib/parse/mongodb.rb +12 -0
  28. data/lib/parse/pipeline_security.rb +81 -15
  29. data/lib/parse/query/constraint.rb +22 -0
  30. data/lib/parse/query/constraints.rb +271 -250
  31. data/lib/parse/query.rb +284 -43
  32. data/lib/parse/retrieval/agent_tool.rb +21 -14
  33. data/lib/parse/retrieval/retriever.rb +84 -0
  34. data/lib/parse/schema/search_index_migrator.rb +48 -1
  35. data/lib/parse/stack/version.rb +1 -1
  36. data/lib/parse/stack.rb +12 -1
  37. data/lib/parse/vector_search/hybrid.rb +39 -1
  38. data/lib/parse/vector_search.rb +34 -0
  39. data/lib/parse/webhooks/payload.rb +7 -1
  40. data/lib/parse/webhooks.rb +107 -21
  41. metadata +4 -1
data/lib/parse/model/core/vector_searchable.rb CHANGED
@@ -55,6 +55,22 @@ module Parse
55
55
  # field and the caller didn't pass an explicit `index:` kwarg.
56
56
  class IndexNotResolved < ArgumentError; end
57
57
 
58
+ # Raised (under `Parse::VectorSearch.index_drift_policy = :raise`)
59
+ # when first-query verification finds the deployed vectorSearch
60
+ # index disagreeing with the model declaration — wrong
61
+ # `numDimensions`, wrong `similarity`, or a registered
62
+ # tenant-scope field missing from the index's `filter` paths.
63
+ # Under the default `:warn` policy the same findings emit a
64
+ # single `[Parse::VectorSearch:DRIFT]` warning instead.
65
+ class IndexDriftError < StandardError
66
+ # @return [Array<String>] human-readable drift findings.
67
+ attr_reader :findings
68
+ def initialize(message, findings: [])
69
+ @findings = findings
70
+ super(message)
71
+ end
72
+ end
73
+
58
74
  # Raised by the `find_similar(text:)` overload when the resolved
59
75
  # `:vector` property has no `provider:` (and therefore no way to
60
76
  # turn `text:` into a query vector). Distinct from
@@ -367,13 +383,23 @@ module Parse
367
383
  "on the property, or pass an explicit `vector:`."
368
384
  end
369
385
  provider = Parse::Embeddings.provider(provider_name)
370
- vectors = provider.embed_text([text], input_type: :search_query)
371
- unless vectors.is_a?(Array) && vectors.length == 1 && vectors.first.is_a?(Array)
372
- raise Parse::Embeddings::InvalidResponseError,
373
- "#{self}.find_similar: provider #{provider_name.inspect} did not return " \
374
- "a single vector for `text:` (got #{vectors.inspect[0, 80]})."
375
- end
376
- vectors.first
386
+ # Spend cap: every query-embed path (find_similar(text:),
387
+ # hybrid_search(text:), Retrieval.retrieve) funnels through this
388
+ # method, so charging here closes the "direct callers bypass the
389
+ # cap" gap. No-op when no limit is configured, or when an
390
+ # upstream caller (the semantic_search agent tool) has already
391
+ # charged with per-tenant identity (SpendCap.with_precharged).
392
+ #
393
+ # Deliberate: the charge runs BEFORE the cache lookup, so cache
394
+ # hits bill at full price. The cap bounds query *volume* (an
395
+ # abuse/probing control), not just provider spend — a caller
396
+ # replaying one cached query must not get unlimited throughput.
397
+ Parse::Embeddings::SpendCap.charge_query!(text)
398
+ # Query-embed cache: repeated identical queries skip the
399
+ # provider round-trip when Parse::Embeddings::Cache.enable! has
400
+ # been called; pass-through (with the provider's own response
401
+ # validation preserved) when disabled.
402
+ Parse::Embeddings::Cache.fetch_vector(provider, text, input_type: :search_query)
377
403
  end
378
404
 
379
405
  def coerce_query_vector(vector)
@@ -387,7 +413,10 @@ module Parse
387
413
  end
388
414
 
389
415
  def resolve_vector_index!(field, explicit_index)
390
- return explicit_index if explicit_index && !explicit_index.to_s.empty?
416
+ if explicit_index && !explicit_index.to_s.empty?
417
+ verify_explicit_vector_index(field, explicit_index.to_s)
418
+ return explicit_index
419
+ end
391
420
  begin
392
421
  require_relative "../../atlas_search"
393
422
  rescue LoadError
@@ -402,9 +431,129 @@ module Parse
402
431
  "#{parse_class}.#{field}; pass index: explicitly or create one " \
403
432
  "via Parse::AtlasSearch::IndexCatalog.create_index."
404
433
  end
434
+ verify_vector_index!(field, idx)
405
435
  (idx["name"] || idx[:name]).to_s
406
436
  end
407
437
 
438
+ # Best-effort drift verification for an explicitly named `index:`.
439
+ # The auto-discovery path verifies the index it resolves; an
440
+ # explicit kwarg would otherwise skip verification entirely. Look
441
+ # the field's covering index up in the catalog and verify it when
442
+ # its name matches the explicit one. Lookup failures (catalog
443
+ # unavailable, index not discoverable, name targeting a different
444
+ # index) skip verification rather than failing the query — the
445
+ # explicit kwarg is an override, not a discovery request.
446
+ def verify_explicit_vector_index(field, index_name)
447
+ return if Parse::VectorSearch.index_drift_policy == :ignore
448
+ begin
449
+ require_relative "../../atlas_search"
450
+ idx = Parse::AtlasSearch::IndexCatalog.find_vector_index(parse_class, field: field)
451
+ rescue StandardError, LoadError
452
+ return
453
+ end
454
+ return if idx.nil?
455
+ return unless (idx["name"] || idx[:name]).to_s == index_name
456
+ verify_vector_index!(field, idx)
457
+ end
458
+
459
+ # First-query drift verification: compare the deployed index's
460
+ # `latestDefinition` against the model declaration. The drift
461
+ # findings are computed once per (field, index name) per class per
462
+ # process and cached; the policy check runs on EVERY query, so
463
+ # under `:raise` a drifted index keeps failing instead of failing
464
+ # once and then silently serving results. Under `:warn` the
465
+ # warning is emitted only on the first check to avoid log spam.
466
+ # Honors {Parse::VectorSearch.index_drift_policy} (`:warn` default
467
+ # / `:raise` / `:ignore`).
468
+ #
469
+ # Checks:
470
+ # 1. `numDimensions` on the covering `type: "vector"` entry vs the
471
+ # property's declared `dimensions:`.
472
+ # 2. `similarity` vs the property's declared `similarity:` (only
473
+ # when both sides declare one).
474
+ # 3. When the class registers an `agent_tenant_scope`, the scope
475
+ # field must appear among the index's `type: "filter"` paths —
476
+ # otherwise the tenant pre-filter that
477
+ # {Parse::Retrieval.retrieve} folds into `$vectorSearch.filter`
478
+ # fails Atlas-side at query time.
479
+ def verify_vector_index!(field, idx)
480
+ return if Parse::VectorSearch.index_drift_policy == :ignore
481
+ index_name = (idx["name"] || idx[:name]).to_s
482
+ @_verified_vector_indexes ||= {}
483
+ cache_key = "#{field}|#{index_name}"
484
+ findings = @_verified_vector_indexes[cache_key]
485
+ first_check = findings.nil?
486
+ if first_check
487
+ findings = vector_index_drift_findings(field, idx).freeze
488
+ @_verified_vector_indexes[cache_key] = findings
489
+ end
490
+ return if findings.empty?
491
+
492
+ message = "#{self} vectorSearch index #{index_name.inspect} drifts from the " \
493
+ "model declaration for :#{field}: #{findings.join("; ")}"
494
+ if Parse::VectorSearch.index_drift_policy == :raise
495
+ # Raise on every query, not just the first: strict mode means a
496
+ # drifted index must never serve results.
497
+ raise IndexDriftError.new(message, findings: findings)
498
+ end
499
+ warn "[Parse::VectorSearch:DRIFT] #{message}" if first_check
500
+ end
501
+
502
+ # @!visibility private
503
+ # @return [Array<String>] drift findings (empty when in sync).
504
+ def vector_index_drift_findings(field, idx)
505
+ defn = idx["latestDefinition"] || idx[:latestDefinition] || {}
506
+ entries = defn["fields"] || defn[:fields] || []
507
+ field_str = field.to_s
508
+ vector_entry = entries.find do |f|
509
+ (f["type"] || f[:type]).to_s == "vector" && (f["path"] || f[:path]).to_s == field_str
510
+ end
511
+ findings = []
512
+ return findings if vector_entry.nil? # find_vector_index matched on it; defensive
513
+
514
+ declared_dims = vector_properties.dig(field.to_sym, :dimensions)
515
+ index_dims = vector_entry["numDimensions"] || vector_entry[:numDimensions]
516
+ if declared_dims && index_dims && Integer(index_dims) != Integer(declared_dims)
517
+ findings << "index numDimensions=#{index_dims} but property declares " \
518
+ "dimensions: #{declared_dims} (every query will mismatch — " \
519
+ "rebuild the index or run #{self}.reembed! after fixing the declaration)"
520
+ end
521
+
522
+ declared_sim = vector_properties.dig(field.to_sym, :similarity)
523
+ index_sim = vector_entry["similarity"] || vector_entry[:similarity]
524
+ if declared_sim && index_sim && index_sim.to_s != declared_sim.to_s
525
+ findings << "index similarity=#{index_sim.inspect} but property declares " \
526
+ "similarity: #{declared_sim.inspect}"
527
+ end
528
+
529
+ scope_field = registered_tenant_scope_field
530
+ if scope_field
531
+ filter_paths = entries.select { |f| (f["type"] || f[:type]).to_s == "filter" }
532
+ .map { |f| (f["path"] || f[:path]).to_s }
533
+ unless filter_paths.include?(scope_field)
534
+ findings << "agent_tenant_scope field #{scope_field.inspect} is not declared " \
535
+ "as a type: \"filter\" path in the index — tenant-scoped " \
536
+ "$vectorSearch.filter will fail Atlas-side"
537
+ end
538
+ end
539
+ findings
540
+ end
541
+
542
+ # @!visibility private
543
+ # Wire/storage name of the class's registered tenant-scope field,
544
+ # or nil. Mirrors the resolution Parse::Retrieval#wire_name uses
545
+ # when folding the scope into $vectorSearch.filter.
546
+ def registered_tenant_scope_field
547
+ return nil unless defined?(Parse::Agent::MetadataRegistry)
548
+ rule = Parse::Agent::MetadataRegistry.tenant_scope_rule(parse_class)
549
+ return nil unless rule
550
+ sym = rule[:field].to_sym
551
+ fmap = respond_to?(:field_map) ? field_map : {}
552
+ (fmap[sym] || sym.to_s.columnize).to_s
553
+ rescue StandardError
554
+ nil
555
+ end
556
+
408
557
  def build_vector_hits(raw_hits)
409
558
  return [] if raw_hits.nil? || raw_hits.empty?
410
559
  converted = Parse::MongoDB.convert_documents_to_parse(raw_hits, parse_class)
data/lib/parse/mongodb.rb CHANGED
@@ -1651,6 +1651,18 @@ module Parse
1651
1651
  collection_name, perms_for_clp,
1652
1652
  )
1653
1653
  Parse::CLPScope.redact_protected_fields!(results, strip_set) if strip_set.any?
1654
+
1655
+ # Process-level floor: recursively strip Parse-internal credential
1656
+ # columns (_hashed_password, _session_token, _auth_data_*, _rperm,
1657
+ # ...) from every row AND every embedded sub-document. The
1658
+ # protectedFields strip above is keyed on the OUTER class, and the
1659
+ # ACL sub-doc walk only DROPS ACL-failing sub-docs — neither covers
1660
+ # a foreign class (e.g. _User / _Session) pulled in via $lookup /
1661
+ # $graphLookup / $unionWith under an arbitrary alias. Runs last, for
1662
+ # scoped (non-master) callers only; master is unredacted by design.
1663
+ results.each do |row|
1664
+ Parse::PipelineSecurity.redact_internal_fields_deep!(row)
1665
+ end
1654
1666
  end
1655
1667
 
1656
1668
  payload[:result_count] = results.size
data/lib/parse/pipeline_security.rb CHANGED
@@ -105,6 +105,7 @@ module Parse
105
105
  DENIED_FIELD_REFS = %w[
106
106
  $_hashed_password $_password_history
107
107
  $_session_token $_sessionToken
108
+ $sessionToken $session_token
108
109
  $_email_verify_token $_perishable_token
109
110
  $_failed_login_count $_account_lockout_expires_at
110
111
  $_rperm $_wperm
@@ -161,6 +162,19 @@ module Parse
161
162
  # walk_for_denied! field-name screen.
162
163
  INTERNAL_FIELDS_PREFIX_DENYLIST = %w[_auth_data_].freeze
163
164
 
165
+ # The credential / sensitive subset of {INTERNAL_FIELDS_DENYLIST}. These
166
+ # columns must NEVER appear as a user-influenced `$match` field name —
167
+ # even on a pipeline that runs with `allow_internal_fields: true` (which
168
+ # exists to permit SDK-emitted `_rperm`/`_wperm` references from
169
+ # `readable_by_role` / `publicly_readable`). A `$match`/`$count` on a
170
+ # password hash, session/reset token, or auth-data column is a credential-
171
+ # exfiltration oracle (bisect the value char-by-char), and these columns
172
+ # have NO legitimate SDK query use — so the `allow_internal_fields` escape
173
+ # hatch must not relax them. Derived from {INTERNAL_FIELDS_DENYLIST} minus
174
+ # the ACL/bookkeeping columns (`_rperm`/`_wperm`/`_tombstone`) the ACL DSL
175
+ # legitimately emits, so the two lists never drift.
176
+ CREDENTIAL_FIELDS_DENYLIST = (INTERNAL_FIELDS_DENYLIST - %w[_rperm _wperm _tombstone]).freeze
177
+
164
178
  # Forensic string-introspection operators. When any of these
165
179
  # appears INSIDE `$expr` with a field-reference input string, the
166
180
  # query becomes a per-character oracle even though the operator
@@ -336,6 +350,48 @@ module Parse
336
350
  end
337
351
  end
338
352
 
353
+ # Depth bound for {redact_internal_fields_deep!}. `$lookup`/`$graphLookup`/
354
+ # `$unionWith` embed foreign documents at shallow alias depth, so this is
355
+ # generous; the bound exists only to fail safe on cyclic/pathological docs.
356
+ INTERNAL_REDACT_MAX_DEPTH = 32
357
+
358
+ # Recursively delete {INTERNAL_FIELDS_DENYLIST} / {INTERNAL_FIELDS_PREFIX_DENYLIST}
359
+ # keys from `node` AND every embedded sub-document/array element, in place.
360
+ #
361
+ # This is the process-level floor that stops Parse-Server-internal
362
+ # credential columns (`_hashed_password`, `_session_token`, `_auth_data_*`,
363
+ # `_rperm`/`_wperm`, ...) from reaching a scoped caller through ANY result
364
+ # shape — most importantly a foreign-class document pulled in via
365
+ # `$lookup`/`$graphLookup`/`$unionWith` under an arbitrary alias. Neither
366
+ # the per-class protectedFields strip (keyed on the OUTER class) nor the
367
+ # ACL sub-document walk (which only DROPS ACL-failing sub-docs, never
368
+ # strips field names) covers that alias. Unlike {strip_internal_fields}
369
+ # (one level, non-mutating), this walks the whole tree and mutates in
370
+ # place so it can run as the last step over a result set.
371
+ #
372
+ # Structural columns (`_id`, `_p_*`, `_created_at`, `_updated_at`, `_acl`)
373
+ # are intentionally NOT in the denylist, so object/ACL reconstruction is
374
+ # unaffected.
375
+ #
376
+ # @param node [Object] a result row (Hash), array, or scalar.
377
+ # @return [Object] the same node, mutated.
378
+ def redact_internal_fields_deep!(node, depth: INTERNAL_REDACT_MAX_DEPTH)
379
+ case node
380
+ when Hash
381
+ # Always clean the current level (even at the depth floor) so an
382
+ # embedded document sitting exactly at the bound is still scrubbed.
383
+ node.delete_if do |key, _value|
384
+ ks = key.to_s
385
+ INTERNAL_FIELDS_DENYLIST.include?(ks) ||
386
+ INTERNAL_FIELDS_PREFIX_DENYLIST.any? { |prefix| ks.start_with?(prefix) }
387
+ end
388
+ node.each_value { |v| redact_internal_fields_deep!(v, depth: depth - 1) } if depth > 0
389
+ when Array
390
+ node.each { |el| redact_internal_fields_deep!(el, depth: depth - 1) } if depth > 0
391
+ end
392
+ node
393
+ end
394
+
339
395
  # Wave-3 TRACK-CLP-4: refuse caller-supplied pipelines that
340
396
  # reference a protected field via `$<field>` on the RHS of a
341
397
  # `$project` / `$addFields` / `$set` / `$group` / `$bucket` /
@@ -510,21 +566,31 @@ module Parse
510
566
  # oracle as the where:-constraint path in ConstraintTranslator.
511
567
  # Operators ($-prefixed) are excluded because they are validated
512
568
  # separately by DENIED_OPERATORS.
513
- if !allow_internal_fields &&
514
- !key_str.start_with?("$") &&
515
- (INTERNAL_FIELDS_DENYLIST.include?(key_str) ||
516
- INTERNAL_FIELDS_PREFIX_DENYLIST.any? { |prefix| key_str.start_with?(prefix) })
517
- raise Error.new(
518
- "SECURITY: Pipeline references internal Parse Server field " \
519
- "'#{key_str}' at nesting depth #{depth}" \
520
- "#{stage_idx ? " inside stage #{stage_idx}" : ""}. " \
521
- "This column (password hash, session token, auth data, or ACL " \
522
- "pointer) must not appear in a user-influenced pipeline — " \
523
- "it enables credential exfiltration via count/match oracles.",
524
- stage: stage_idx,
525
- operator: key_str,
526
- reason: :denied_internal_field,
527
- )
569
+ #
570
+ # CREDENTIAL columns (password hash, session/reset token, auth data)
571
+ # are refused UNCONDITIONALLY — `allow_internal_fields` (which exists
572
+ # so SDK-emitted `_rperm`/`_wperm` references survive on the mongo-
573
+ # direct path) must NOT relax them, or a `*_direct` terminal becomes
574
+ # a credential-bisection oracle. The remaining internal columns
575
+ # (`_rperm`/`_wperm`/`_tombstone`) stay gated by allow_internal_fields.
576
+ if !key_str.start_with?("$")
577
+ is_credential = CREDENTIAL_FIELDS_DENYLIST.include?(key_str) ||
578
+ INTERNAL_FIELDS_PREFIX_DENYLIST.any? { |prefix| key_str.start_with?(prefix) }
579
+ is_internal = INTERNAL_FIELDS_DENYLIST.include?(key_str) ||
580
+ INTERNAL_FIELDS_PREFIX_DENYLIST.any? { |prefix| key_str.start_with?(prefix) }
581
+ if is_credential || (is_internal && !allow_internal_fields)
582
+ raise Error.new(
583
+ "SECURITY: Pipeline references internal Parse Server field " \
584
+ "'#{key_str}' at nesting depth #{depth}" \
585
+ "#{stage_idx ? " inside stage #{stage_idx}" : ""}. " \
586
+ "This column (password hash, session token, auth data, or ACL " \
587
+ "pointer) must not appear in a user-influenced pipeline — " \
588
+ "it enables credential exfiltration via count/match oracles.",
589
+ stage: stage_idx,
590
+ operator: key_str,
591
+ reason: :denied_internal_field,
592
+ )
593
+ end
528
594
  end
529
595
  # Cap caller-supplied regex pattern length. Catches the two
530
596
  # shapes Mongo accepts: the find-form `{ field: { $regex: "..." } }`
data/lib/parse/query/constraint.rb CHANGED
@@ -191,6 +191,28 @@ module Parse
191
191
  self.class.formatted_value(@value)
192
192
  end
193
193
 
194
+ # Supports the opt-in `{ value:, unicode: true }` form accepted by the
195
+ # regex-based constraints ({RegularExpressionConstraint},
196
+ # {StartsWithConstraint}, {ContainsConstraint}, {EndsWithConstraint}).
197
+ # When the `unicode` flag is set, the constraint adds the `u` flag to the
198
+ # compiled `$options`, asking the backend to treat the pattern and subject
199
+ # as UTF-8 for correct multibyte (e.g. accented or CJK) case-insensitive
200
+ # matching.
201
+ #
202
+ # The `u` flag is only honored by Parse Server 8.3.0+ over REST (older
203
+ # servers reject it) and by MongoDB 6.1+ on the mongo-direct path; it is
204
+ # therefore strictly opt-in and never emitted for the bare-value form.
205
+ #
206
+ # @param raw [Object] the raw constraint value (`@value`).
207
+ # @return [Array(Object, Boolean)] the unwrapped value and the unicode flag.
208
+ # @api private
209
+ def regex_unicode_option(raw)
210
+ return [raw, false] unless raw.is_a?(Hash)
211
+
212
+ opts = raw.symbolize_keys
213
+ [opts[:value], opts[:unicode] ? true : false]
214
+ end
215
+
194
216
  # Registers the default constraint of equality
195
217
  register :eq, Constraint
196
218
  precedence 100