parse-stack-next 5.4.1 → 5.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +344 -0
- data/Gemfile.lock +1 -1
- data/README.md +45 -6
- data/docs/atlas_vector_search_guide.md +314 -19
- data/lib/parse/api/users.rb +10 -0
- data/lib/parse/client.rb +19 -1
- data/lib/parse/embeddings/batch_embedder.rb +188 -0
- data/lib/parse/embeddings/cache.rb +322 -0
- data/lib/parse/embeddings/cohere.rb +31 -18
- data/lib/parse/embeddings/image_fetch.rb +347 -0
- data/lib/parse/embeddings/provider.rb +17 -11
- data/lib/parse/embeddings/spend_cap.rb +117 -3
- data/lib/parse/embeddings/voyage.rb +34 -25
- data/lib/parse/embeddings.rb +40 -3
- data/lib/parse/model/acl.rb +15 -11
- data/lib/parse/model/core/embed_managed.rb +243 -14
- data/lib/parse/model/core/vector_searchable.rb +157 -8
- data/lib/parse/query/constraint.rb +22 -0
- data/lib/parse/query/constraints.rb +271 -250
- data/lib/parse/query.rb +233 -42
- data/lib/parse/retrieval/agent_tool.rb +21 -14
- data/lib/parse/retrieval/retriever.rb +84 -0
- data/lib/parse/schema/search_index_migrator.rb +48 -1
- data/lib/parse/stack/version.rb +1 -1
- data/lib/parse/vector_search/hybrid.rb +39 -1
- data/lib/parse/vector_search.rb +34 -0
- data/lib/parse/webhooks/payload.rb +7 -1
- data/lib/parse/webhooks.rb +107 -21
- metadata +4 -1
|
@@ -288,6 +288,85 @@ declared `dimensions:` before sending the pipeline. A mismatch raises
|
|
|
288
288
|
it — callers get "expected 1536, got 768" instead of a server-side
|
|
289
289
|
error after a round-trip.
|
|
290
290
|
|
|
291
|
+
### Index drift verification (v5.5)
|
|
292
|
+
|
|
293
|
+
On the first auto-discovered use of a vectorSearch index per
|
|
294
|
+
(class, field, index) per process, the SDK compares the deployed
|
|
295
|
+
index's `latestDefinition` against the model declaration:
|
|
296
|
+
|
|
297
|
+
* `numDimensions` vs the property's declared `dimensions:` — a
|
|
298
|
+
mismatch means every query will be rejected or return nonsense
|
|
299
|
+
(usually an index that predates a model change).
|
|
300
|
+
* `similarity` vs the property's declared `similarity:` (checked only
|
|
301
|
+
when both sides declare one).
|
|
302
|
+
* When the class registers an `agent_tenant_scope`, the scope field
|
|
303
|
+
must appear among the index's `type: "filter"` paths — without it,
|
|
304
|
+
every tenant-scoped `$vectorSearch.filter` fails Atlas-side at
|
|
305
|
+
query time.
|
|
306
|
+
|
|
307
|
+
Findings are computed once per (class, field, index) per process and
|
|
308
|
+
governed by `Parse::VectorSearch.index_drift_policy`:
|
|
309
|
+
|
|
310
|
+
```ruby
|
|
311
|
+
Parse::VectorSearch.index_drift_policy = :warn # default — [Parse::VectorSearch:DRIFT] warning on first check
|
|
312
|
+
Parse::VectorSearch.index_drift_policy = :raise # IndexDriftError on EVERY query against a drifted index
|
|
313
|
+
Parse::VectorSearch.index_drift_policy = :ignore # skip verification
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
Under `:raise` the cached findings keep raising — strict mode means a
|
|
317
|
+
drifted index never serves results, not "fails once, then passes".
|
|
318
|
+
Auto-discovery verification costs no extra round-trip (the definition
|
|
319
|
+
is already in hand from index discovery). An explicit `index:` kwarg
|
|
320
|
+
is verified best-effort: when the catalog's covering index for the
|
|
321
|
+
field carries the same name, its definition is checked too; catalog
|
|
322
|
+
lookup failures never fail the query.
|
|
323
|
+
|
|
324
|
+
### Query-embed caching and spend caps (v5.5)
|
|
325
|
+
|
|
326
|
+
Every `text:`-overload query funnels through one embed path
|
|
327
|
+
(`find_similar(text:)`, `hybrid_search(text:)`,
|
|
328
|
+
`Parse::Retrieval.retrieve` all share it), which gives two controls:
|
|
329
|
+
|
|
330
|
+
```ruby
|
|
331
|
+
# Opt-in query-embed cache: repeated identical queries skip the
|
|
332
|
+
# provider round-trip. Keyed by (provider, model, dimensions,
|
|
333
|
+
# input_type, SHA-256(input)) — plaintext never lands in the store.
|
|
334
|
+
Parse::Embeddings::Cache.enable!(max_entries: 2048, ttl: 600)
|
|
335
|
+
Parse::Embeddings::Cache.stats # => { enabled:, hits:, misses:, size: }
|
|
336
|
+
|
|
337
|
+
# Per-tenant spend cap now covers DIRECT callers too, not just the
|
|
338
|
+
# semantic_search agent tool. Tenant identity resolves to the ambient
|
|
339
|
+
# Parse.with_cache_tenant scope when set, else a shared default bucket.
|
|
340
|
+
# warn_at: adds a soft cap — crossing that fraction of the limit (80% here) emits a
|
|
341
|
+
# parse.embeddings.spend_cap_warning AS::N event (alert, never refuse).
|
|
342
|
+
Parse::Embeddings::SpendCap.configure(limit_tokens: 1_000_000, window: 3600,
|
|
343
|
+
warn_at: 0.8)
|
|
344
|
+
Parse.with_cache_tenant("tenant_abc") do
|
|
345
|
+
Document.find_similar(text: query) # charged against tenant_abc
|
|
346
|
+
end
|
|
347
|
+
```
|
|
348
|
+
|
|
349
|
+
Cache hits emit the standard `parse.embeddings.embed` notification
|
|
350
|
+
with `cached: true`, so existing spend subscribers see hits and misses
|
|
351
|
+
on one stream. The cache is in-process by default; for a persistent
|
|
352
|
+
layer shared across processes, wrap any Moneta-compatible backend in
|
|
353
|
+
the bundled adapter:
|
|
354
|
+
|
|
355
|
+
```ruby
|
|
356
|
+
moneta = Moneta.new(:Redis, url: ENV["REDIS_URL"])
|
|
357
|
+
Parse::Embeddings::Cache.enable!(
|
|
358
|
+
store: Parse::Embeddings::Cache::MonetaStore.new(moneta, ttl: 30 * 24 * 3600),
|
|
359
|
+
)
|
|
360
|
+
```
|
|
361
|
+
|
|
362
|
+
`MonetaStore` namespaces keys, forwards TTL via Moneta's `expires:`,
|
|
363
|
+
and fails open (a backend error is a cache miss, never a failed
|
|
364
|
+
embed). Keys are input hashes — plaintext queries never land in the
|
|
365
|
+
shared store; the VALUES are embeddings, so give the store the same
|
|
366
|
+
access controls as the database. A query the agent tool already
|
|
367
|
+
charged per-tenant is not double-billed (`SpendCap.with_precharged`
|
|
368
|
+
wraps the tool's retrieval).
|
|
369
|
+
|
|
291
370
|
### ACL/CLP inheritance
|
|
292
371
|
|
|
293
372
|
Vector search routes through `Parse::MongoDB.aggregate`. Every layer
|
|
@@ -405,6 +484,18 @@ branch — see [Hybrid search](#hybrid-search-vector--lexical) below) and
|
|
|
405
484
|
chunking — see [Reranking](#reranking)). Both were reserved in earlier
|
|
406
485
|
releases and now ship in 5.4.0.
|
|
407
486
|
|
|
487
|
+
**Pointer values in filters translate automatically (v5.5).** A filter
|
|
488
|
+
like `{ owner: some_user }` (a `Parse::Pointer` / `Parse::Object`, or a
|
|
489
|
+
wire-form `{"__type" => "Pointer", ...}` hash — including inside `$in`
|
|
490
|
+
/ `$eq` / `$ne` operator hashes) is rewritten to its MongoDB storage
|
|
491
|
+
form `{ "_p_owner" => "_User$abc123" }` before the `$match` /
|
|
492
|
+
`$vectorSearch.filter` is built, so pointer filters match rows instead
|
|
493
|
+
of silently matching nothing. Translation runs after the
|
|
494
|
+
underscore-key gate (callers still cannot name `_p_*` columns
|
|
495
|
+
directly) and before the tenant-scope fold; the `semantic_search`
|
|
496
|
+
agent tool inherits it. For `vector_filter:` use, the pointer column
|
|
497
|
+
(`_p_owner`) must be declared `type: "filter"` in the index.
|
|
498
|
+
|
|
408
499
|
### Hybrid search (vector + lexical)
|
|
409
500
|
|
|
410
501
|
`Class.hybrid_search` runs a lexical Atlas Search (`$search`) branch and a
|
|
@@ -556,13 +647,26 @@ envelope. See the [MCP guide's Token Economy section](./mcp_guide.md#token-econo
|
|
|
556
647
|
|
|
557
648
|
---
|
|
558
649
|
|
|
559
|
-
## Image embedding: `embed_image` macro (v5.1)
|
|
650
|
+
## Image embedding: `embed_image` macro (v5.1 URL mode, v5.5 bytes mode)
|
|
560
651
|
|
|
561
652
|
`embed_image` is the image-source counterpart to `embed`. The source
|
|
562
653
|
property must be `:file`-typed; the target must be a `:vector` property
|
|
563
654
|
whose declared `provider:` supports multimodal input (currently
|
|
564
655
|
`:voyage` with `voyage-multimodal-3`, or `:cohere` with `embed-v4.0`).
|
|
565
656
|
|
|
657
|
+
Two fetch modes, selected per declaration with `source:`:
|
|
658
|
+
|
|
659
|
+
* **`source: :url`** (default) — the SDK validates the file's URL and
|
|
660
|
+
forwards it; the **provider** performs the fetch from its own
|
|
661
|
+
network. Requires the `trust_provider_url_fetch` sentinel (see
|
|
662
|
+
operator setup below).
|
|
663
|
+
* **`source: :bytes`** (v5.5) — the **SDK** downloads the image
|
|
664
|
+
through `Parse::File.safe_open_url`, verifies the content by
|
|
665
|
+
magic-byte sniff, strips EXIF/XMP metadata, and forwards the bytes
|
|
666
|
+
to the provider as a base64 data URI. No provider-side URL fetch
|
|
667
|
+
occurs, so the sentinel is NOT required — the
|
|
668
|
+
`allowed_image_hosts` allowlist still is.
|
|
669
|
+
|
|
566
670
|
```ruby
|
|
567
671
|
class Post < Parse::Object
|
|
568
672
|
property :cover_image, :file
|
|
@@ -621,6 +725,57 @@ with `Parse::File`, not parallelized). Failures raise
|
|
|
621
725
|
(`:scheme`, `:port`, `:userinfo`, `:host_blocked`,
|
|
622
726
|
`:host_not_allowlisted`, `:parse`).
|
|
623
727
|
|
|
728
|
+
### Bytes mode (`source: :bytes`, v5.5)
|
|
729
|
+
|
|
730
|
+
```ruby
|
|
731
|
+
# Operator setup — only the host allowlist is required (the sentinel
|
|
732
|
+
# applies to URL forwarding, not SDK-side fetches):
|
|
733
|
+
Parse::Embeddings.allowed_image_hosts = [".cloudfront.net"]
|
|
734
|
+
|
|
735
|
+
class Post < Parse::Object
|
|
736
|
+
property :cover_image, :file
|
|
737
|
+
property :cover_image_embedding, :vector,
|
|
738
|
+
dimensions: 1024, provider: :voyage, model: "voyage-multimodal-3"
|
|
739
|
+
|
|
740
|
+
embed_image :cover_image, into: :cover_image_embedding,
|
|
741
|
+
source: :bytes # exif_strip: true is the default
|
|
742
|
+
end
|
|
743
|
+
```
|
|
744
|
+
|
|
745
|
+
What happens on each (digest-miss) save:
|
|
746
|
+
|
|
747
|
+
1. The file URL is validated through
|
|
748
|
+
`Parse::Embeddings.validate_image_url!(url, mode: :fetch)` — the
|
|
749
|
+
same host allowlist (deny-all when empty), obfuscated-IP screen,
|
|
750
|
+
port allowlist, and CIDR resolution check as URL mode, minus the
|
|
751
|
+
provider-egress sentinel.
|
|
752
|
+
2. `Parse::File.safe_open_url` downloads the bytes — CIDR blocks,
|
|
753
|
+
DNS-rebinding re-check, port allowlist, `max_remote_size` cap,
|
|
754
|
+
timeouts. No parallel fetch mechanism exists.
|
|
755
|
+
3. **Magic-byte verification** (`Parse::Embeddings::ImageFetch`):
|
|
756
|
+
the MIME type is determined exclusively from the leading bytes
|
|
757
|
+
(JPEG / PNG / GIF / WebP). The HTTP `Content-Type` header is never
|
|
758
|
+
consulted. The sniffed type must be in
|
|
759
|
+
`Parse::Embeddings.allowed_image_types` (default those four; SVG is
|
|
760
|
+
deliberately excluded as script-capable active content), and when
|
|
761
|
+
the URL carries a recognized image extension, the extension must
|
|
762
|
+
AGREE with the magic bytes — a `.jpg` URL serving PNG bytes (or
|
|
763
|
+
HTML) is refused as MIME laundering
|
|
764
|
+
(`ImageFetch::InvalidImageType`, with a `:reason` tag).
|
|
765
|
+
4. **EXIF/XMP stripping, default ON.** JPEG APP1 segments (Exif and
|
|
766
|
+
XMP), PNG `eXIf` chunks, and WebP `EXIF`/`XMP ` RIFF chunks (with
|
|
767
|
+
the VP8X flag bits cleared) are removed before the bytes leave the
|
|
768
|
+
process — user photos commonly carry GPS coordinates and device
|
|
769
|
+
serials. Opt out per declaration with `exif_strip: false` when
|
|
770
|
+
orientation metadata must survive.
|
|
771
|
+
5. The verified bytes ride to the provider as a base64 data URI
|
|
772
|
+
(Voyage `image_base64` content row; Cohere `image_url` data-URI
|
|
773
|
+
form).
|
|
774
|
+
|
|
775
|
+
Direct provider calls accept the same shape:
|
|
776
|
+
`provider.embed_image([Parse::Embeddings::ImageFetch.fetch!(url)])` —
|
|
777
|
+
`FetchedImage` sources and URL Strings may be mixed in one batch.
|
|
778
|
+
|
|
624
779
|
### Save-side semantics
|
|
625
780
|
|
|
626
781
|
* Digest is the **SHA-256 of the URL String**, not the file bytes.
|
|
@@ -641,24 +796,102 @@ with `Parse::File`, not parallelized). Failures raise
|
|
|
641
796
|
|
|
642
797
|
## Re-embedding existing rows
|
|
643
798
|
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
799
|
+
### Provenance: the `<into>_meta` sibling (v5.5)
|
|
800
|
+
|
|
801
|
+
Every `embed` / `embed_image` declaration auto-declares an
|
|
802
|
+
`<into>_meta` `:object` sibling (override with `meta_field:`) stamped
|
|
803
|
+
on each recompute and cleared with the vector:
|
|
804
|
+
|
|
805
|
+
```ruby
|
|
806
|
+
doc.body_embedding_meta
|
|
807
|
+
# => { "provider" => "openai",
|
|
808
|
+
# "model" => "text-embedding-3-small",
|
|
809
|
+
# "dimensions" => 1536,
|
|
810
|
+
# "modality" => "text",
|
|
811
|
+
# "embedded_at" => "2026-06-09T17:32:11Z" }
|
|
812
|
+
```
|
|
813
|
+
|
|
814
|
+
This is the record migration tooling reads to know which model
|
|
815
|
+
produced any stored vector.
|
|
816
|
+
|
|
817
|
+
### Same-shape migrations: `Class.reembed!` (v5.5)
|
|
818
|
+
|
|
819
|
+
When the new model has the **same dimensions** (e.g. swapping
|
|
820
|
+
`text-embedding-3-small` for a same-width replacement, or a provider
|
|
821
|
+
change at equal width), re-embed in place:
|
|
822
|
+
|
|
823
|
+
```ruby
|
|
824
|
+
# Re-embed every row through the CURRENT provider/model declaration.
|
|
825
|
+
Document.reembed!(batch_size: 100)
|
|
826
|
+
|
|
827
|
+
# Resumable: skip rows whose <into>_meta already matches the current
|
|
828
|
+
# provider + model + dimensions (rows with no meta count as stale).
|
|
829
|
+
Document.reembed!(only_stale: true)
|
|
830
|
+
|
|
831
|
+
# Scope it
|
|
832
|
+
Document.reembed!(field: :body_embedding, where: { published: true }, limit: 10_000)
|
|
833
|
+
```
|
|
834
|
+
|
|
835
|
+
`reembed!` walks the class with objectId-cursor pagination, clears
|
|
836
|
+
each row's digest sibling (so the save-path recompute cannot elide the
|
|
837
|
+
provider call), and saves. Unlike `embed_pending!` — which only fills
|
|
838
|
+
NULL vectors — `reembed!` recomputes populated rows too. Run it with a
|
|
839
|
+
master-key client (or pass `save_opts:` with a session token that can
|
|
840
|
+
write every row). Each row's save makes one provider call; pace bulk
|
|
841
|
+
runs against provider rate limits (see `BatchEmbedder` below for the
|
|
842
|
+
pattern, or just throttle the loop).
|
|
843
|
+
|
|
844
|
+
### Changed-width migrations: dual-field workflow
|
|
845
|
+
|
|
846
|
+
Changing `dimensions:` is a different beast — the existing
|
|
847
|
+
vectorSearch index can't serve the new width. Use the shadow-field
|
|
848
|
+
workflow:
|
|
647
849
|
|
|
648
850
|
1. Add the new property alongside the old one
|
|
649
851
|
(`property :body_embedding_v2, :vector, ...`) and an `embed` or
|
|
650
852
|
`embed_image` block targeting it.
|
|
651
|
-
2. Backfill
|
|
652
|
-
|
|
653
|
-
3.
|
|
654
|
-
|
|
655
|
-
4. Drop the old property.
|
|
656
|
-
|
|
657
|
-
Do NOT mutate
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
853
|
+
2. Backfill with `embed_pending!(field: :body_embedding_v2)` — the new
|
|
854
|
+
field is null everywhere, so the null-filling walk is exactly right.
|
|
855
|
+
3. Deploy a new vectorSearch index covering the new field and migrate
|
|
856
|
+
`find_similar` callers.
|
|
857
|
+
4. Drop the old property and index.
|
|
858
|
+
|
|
859
|
+
Do NOT mutate a model's `dimensions:` in place — the digest mechanism
|
|
860
|
+
will see unchanged source text and skip recompute, leaving stale
|
|
861
|
+
vectors, and the drift verifier will flag every query against the old
|
|
862
|
+
index (`index numDimensions=1536 but property declares ...`). For
|
|
863
|
+
`embed_image`, also remember the digest is over the URL String: if you
|
|
864
|
+
replace bytes at the same URL (PUT-replace on S3 without renaming),
|
|
865
|
+
null the digest field — or run `reembed!` — to force re-embed.
|
|
866
|
+
|
|
867
|
+
---
|
|
868
|
+
|
|
869
|
+
## Bulk embedding: `BatchEmbedder` (v5.5)
|
|
870
|
+
|
|
871
|
+
`Provider#embed_text_batched` only slices input into provider-sized
|
|
872
|
+
chunks; retry lives inside each provider's single HTTP call. For bulk
|
|
873
|
+
jobs (ingest pipelines, chunk-corpus embedding) use
|
|
874
|
+
`Parse::Embeddings::BatchEmbedder`, which adds batch-level pacing and
|
|
875
|
+
backoff:
|
|
876
|
+
|
|
877
|
+
```ruby
|
|
878
|
+
embedder = Parse::Embeddings::BatchEmbedder.new(
|
|
879
|
+
Parse::Embeddings.provider(:openai),
|
|
880
|
+
requests_per_minute: 60, # inter-batch pacing
|
|
881
|
+
max_attempts: 5, # per-batch tries (exponential backoff + jitter)
|
|
882
|
+
on_progress: ->(done:, total:, batch_index:, batch_count:) {
|
|
883
|
+
puts "#{done}/#{total}"
|
|
884
|
+
},
|
|
885
|
+
)
|
|
886
|
+
vectors = embedder.embed_text(texts, input_type: :search_document)
|
|
887
|
+
```
|
|
888
|
+
|
|
889
|
+
Rate-limit and transient errors (any provider error class ending in
|
|
890
|
+
`RateLimitError` / `TransientError`; override with `retry_on:`) retry
|
|
891
|
+
with exponential backoff; other errors propagate immediately. A batch
|
|
892
|
+
that exhausts its attempts raises `BatchEmbedder::BatchFailed`
|
|
893
|
+
carrying `batch_index` and `completed_count`, so a resumable job knows
|
|
894
|
+
exactly where to pick up.
|
|
662
895
|
|
|
663
896
|
---
|
|
664
897
|
|
|
@@ -728,6 +961,54 @@ floats out). Vectors only flow through the Parse↔Mongo path, where
|
|
|
728
961
|
the body builder's `<vector dims=N>` compaction prevents them from
|
|
729
962
|
landing in stdout / error trackers.
|
|
730
963
|
|
|
964
|
+
### When the embedded source is PII: deployment checklist
|
|
965
|
+
|
|
966
|
+
An embedding of PII is PII-equivalent. Inversion attacks reconstruct
|
|
967
|
+
substantial source text from dense embeddings, and a vector's nearest
|
|
968
|
+
neighbors leak the source's meaning even without reconstruction. If
|
|
969
|
+
the fields you `embed` contain personal data (names, addresses, health
|
|
970
|
+
or financial details, free-text user messages), treat the vector
|
|
971
|
+
column with the same handling as the source column:
|
|
972
|
+
|
|
973
|
+
1. **Provider contract.** You are sending the raw source text (and in
|
|
974
|
+
bytes mode, image content) to the embedding provider on every
|
|
975
|
+
recompute. Confirm the provider's data-retention and training-use
|
|
976
|
+
terms cover PII, and that a DPA is in place where required.
|
|
977
|
+
Self-hosting via `LocalHTTP` (Ollama / vLLM / TEI) keeps the text
|
|
978
|
+
in your network.
|
|
979
|
+
2. **Keep vectors off the wire.** Leave `vector_visibility` at its
|
|
980
|
+
`:owner_only` default so vectors are omitted from `as_json` and
|
|
981
|
+
webhook payloads. Do not flip a PII class to `:public`.
|
|
982
|
+
3. **Row ACL still governs.** Vector hits route mongo-direct with
|
|
983
|
+
`_rperm` enforcement — verify your rows carry real ACLs and that
|
|
984
|
+
callers use scoped credentials (`session_token:` / `acl_user:`),
|
|
985
|
+
not blanket master key.
|
|
986
|
+
4. **Tenant isolation.** Multi-tenant deployments must declare
|
|
987
|
+
`agent_tenant_scope` on searchable classes; the scope folds into
|
|
988
|
+
`$vectorSearch.filter` (and v5.5's drift verification confirms the
|
|
989
|
+
index covers it). Without it, similarity scores leak cross-tenant
|
|
990
|
+
document existence.
|
|
991
|
+
5. **Score exposure.** Keep score quantization on for non-admin agent
|
|
992
|
+
contexts (the default) — full-precision scores enable
|
|
993
|
+
membership-inference probing.
|
|
994
|
+
6. **EXIF stays stripped.** For image embedding, keep the bytes-mode
|
|
995
|
+
default `exif_strip: true`; user photos carry GPS coordinates and
|
|
996
|
+
device serials that would otherwise reach the provider.
|
|
997
|
+
7. **Log and cache hygiene.** Redact query text at the Faraday layer
|
|
998
|
+
(above); if you enable the persistent L2 cache, note that cache
|
|
999
|
+
KEYS are hashes (no plaintext) but cache VALUES are the embeddings
|
|
1000
|
+
themselves — point `MonetaStore` at a store with the same access
|
|
1001
|
+
controls as the database.
|
|
1002
|
+
8. **Deletion propagation.** When a user exercises erasure rights,
|
|
1003
|
+
the vector, its `<field>_digest`, and its `<field>_meta` siblings
|
|
1004
|
+
live on the same row and delete with it — but check external
|
|
1005
|
+
copies: provider-side logs (their retention policy), your L2
|
|
1006
|
+
embedding cache (TTL or explicit flush), and any analytics sink
|
|
1007
|
+
subscribed to embedding events.
|
|
1008
|
+
9. **Migration hygiene.** `reembed!` re-sends every row's source text
|
|
1009
|
+
to the provider — schedule PII-class migrations under the same
|
|
1010
|
+
approvals as a data export.
|
|
1011
|
+
|
|
731
1012
|
---
|
|
732
1013
|
|
|
733
1014
|
## Troubleshooting
|
|
@@ -775,10 +1056,20 @@ on every poll) rather than a `until index_ready?; sleep` loop.
|
|
|
775
1056
|
Key files:
|
|
776
1057
|
|
|
777
1058
|
* `lib/parse/embeddings.rb` — registry, `Configuration`, `register`,
|
|
778
|
-
`provider`, `configure`, `validate_image_url
|
|
779
|
-
`trust_provider_url_fetch=`, `allowed_image_hosts
|
|
1059
|
+
`provider`, `configure`, `validate_image_url!` (`mode: :forward | :fetch`),
|
|
1060
|
+
`trust_provider_url_fetch=`, `allowed_image_hosts=`,
|
|
1061
|
+
`allowed_image_types=`.
|
|
780
1062
|
* `lib/parse/embeddings/provider.rb` — abstract base, `validate_response!`,
|
|
781
1063
|
`instrument_embed`, AS::N payload contract.
|
|
1064
|
+
* `lib/parse/embeddings/image_fetch.rb` — bytes-fetch path:
|
|
1065
|
+
`ImageFetch.fetch!`, magic-byte `sniff_mime`/`verify!`, EXIF/XMP
|
|
1066
|
+
stripping, `FetchedImage`.
|
|
1067
|
+
* `lib/parse/embeddings/batch_embedder.rb` — `BatchEmbedder` bulk
|
|
1068
|
+
orchestration (pacing, batch-level backoff, `BatchFailed`).
|
|
1069
|
+
* `lib/parse/embeddings/cache.rb` — opt-in query-embed cache
|
|
1070
|
+
(`Cache.enable!` / `fetch_vector` / `stats`).
|
|
1071
|
+
* `lib/parse/embeddings/spend_cap.rb` — per-tenant token cap
|
|
1072
|
+
(`charge!`, `charge_query!`, `with_precharged`).
|
|
782
1073
|
* `lib/parse/embeddings/openai.rb` — OpenAI provider.
|
|
783
1074
|
* `lib/parse/embeddings/cohere.rb` — Cohere v3 + v4.0 text-mode provider.
|
|
784
1075
|
* `lib/parse/embeddings/voyage.rb` — Voyage text + multimodal-3
|
|
@@ -788,9 +1079,13 @@ Key files:
|
|
|
788
1079
|
* `lib/parse/embeddings/local_http.rb` — generic OpenAI-compatible
|
|
789
1080
|
local-gateway client.
|
|
790
1081
|
* `lib/parse/embeddings/fixture.rb` — deterministic test provider.
|
|
791
|
-
* `lib/parse/model/core/vector_searchable.rb` — `find_similar
|
|
1082
|
+
* `lib/parse/model/core/vector_searchable.rb` — `find_similar`,
|
|
1083
|
+
`hybrid_search`, index drift verification
|
|
1084
|
+
(`Parse::VectorSearch.index_drift_policy`).
|
|
792
1085
|
* `lib/parse/model/core/embed_managed.rb` — `embed` and `embed_image`
|
|
793
|
-
macros, `EmbedDirective` (carries `modality:`, `allow_insecure
|
|
1086
|
+
macros, `EmbedDirective` (carries `modality:`, `allow_insecure:`,
|
|
1087
|
+
`source_mode:`, `exif_strip:`, `meta_field:`), `embed_pending!`,
|
|
1088
|
+
`reembed!`.
|
|
794
1089
|
* `lib/parse/vector_search.rb` — low-level `Parse::VectorSearch.search`.
|
|
795
1090
|
* `lib/parse/atlas_search/index_manager.rb` — `IndexCatalog.create_index`,
|
|
796
1091
|
`find_vector_index`, `wait_for_ready`.
|
data/lib/parse/api/users.rb
CHANGED
|
@@ -223,15 +223,25 @@ module Parse
|
|
|
223
223
|
# - code 205 (+ERROR_EMAIL_NOT_FOUND+) when +preventLoginWithUnverifiedEmail+
|
|
224
224
|
# is enabled and the account's email has not been verified.
|
|
225
225
|
#
|
|
226
|
+
# Client-side rate-limited per username using the SAME bucket as {#login}
|
|
227
|
+
# (bare username, no namespace) — failures across both credential oracles
|
|
228
|
+
# accumulate, so an attacker cannot bypass a +login+ lockout by pivoting to
|
|
229
|
+
# this endpoint. The trade-off: a run of failed step-up re-auth calls counts
|
|
230
|
+
# toward (and can trigger) the primary login lockout for that username.
|
|
231
|
+
# Client-side limiting is a convenience, not a boundary — the server is the
|
|
232
|
+
# real control.
|
|
233
|
+
#
|
|
226
234
|
# @param username [String] the Parse user username.
|
|
227
235
|
# @param password [String] the Parse user's associated password.
|
|
228
236
|
# @param headers [Hash] additional HTTP headers to send with the request.
|
|
229
237
|
# @param opts [Hash] additional options to pass to the {Parse::Client} request.
|
|
230
238
|
# @return [Parse::Response]
|
|
231
239
|
def verify_password(username, password, headers: {}, **opts)
|
|
240
|
+
check_login_rate_limit!(username)
|
|
232
241
|
body = { username: username, password: password }
|
|
233
242
|
response = request :post, VERIFY_PASSWORD_PATH, body: body, headers: headers, opts: opts
|
|
234
243
|
response.parse_class = Parse::Model::CLASS_USER
|
|
244
|
+
track_login_attempt(username, response.success?)
|
|
235
245
|
response
|
|
236
246
|
end
|
|
237
247
|
|
data/lib/parse/client.rb
CHANGED
|
@@ -1425,6 +1425,22 @@ module Parse
|
|
|
1425
1425
|
# Object/Pointer envelope is converted, and an Object of an UNregistered class
|
|
1426
1426
|
# is left as a raw Hash (building it would degrade to a field-less Pointer).
|
|
1427
1427
|
# Plain Hashes and arbitrary `__type` app data pass through untouched.
|
|
1428
|
+
#
|
|
1429
|
+
# SECURITY — cloud results are treated as server-authoritative. The
|
|
1430
|
+
# `__type:"Object"` decode in {._decode_cloud_value} routes through
|
|
1431
|
+
# +Parse::Object.build+, which hydrates with trusted-init — the SAME path
|
|
1432
|
+
# used to decode every query / +.fetch+ result. Trusted-init skips the
|
|
1433
|
+
# +PROTECTED_INITIALIZE_KEYS+ filter, so credential-shaped keys
|
|
1434
|
+
# (+sessionToken+, +authData+, +_rperm+, +_wperm+, +roles+, …) present in a
|
|
1435
|
+
# cloud function's return value populate the in-memory object, exactly as they
|
|
1436
|
+
# do for any other server response. This is by design: the payload is authored
|
|
1437
|
+
# by your Cloud Code and the request is caller-authenticated, and making cloud
|
|
1438
|
+
# results filter these keys would make them inconsistent with (and stricter
|
|
1439
|
+
# than) query/+.fetch+ hydration — e.g. a cloud function returning
|
|
1440
|
+
# +request.user+ would come back missing its +sessionToken+. If a cloud
|
|
1441
|
+
# function is expected to echo back third-party-influenced data, call it with
|
|
1442
|
+
# +raw: true+ (+Parse.call_function(name, body, raw: true)+) to receive the
|
|
1443
|
+
# undecoded response and sanitize it yourself before building objects.
|
|
1428
1444
|
def self._extract_cloud_result(response)
|
|
1429
1445
|
r = response.result
|
|
1430
1446
|
value = r.is_a?(Hash) ? r["result"] : r
|
|
@@ -1568,7 +1584,9 @@ module Parse
|
|
|
1568
1584
|
# specific {Parse::Error} subclasses as the underlying client does.
|
|
1569
1585
|
# @param name (see Parse.call_function)
|
|
1570
1586
|
# @param body (see Parse.call_function)
|
|
1571
|
-
# @param opts (see Parse.call_function) —
|
|
1587
|
+
# @param opts (see Parse.call_function) — +:raw+ has no effect; this method
|
|
1588
|
+
# always decodes the result. Use {Parse.call_function} with +raw: true+ if
|
|
1589
|
+
# you need the undecoded response.
|
|
1572
1590
|
# @raise [Parse::Error::CloudCodeError] when the response indicates a cloud-code error.
|
|
1573
1591
|
# @return [Object] the result data of the response.
|
|
1574
1592
|
def self.call_function!(name, body = {}, **opts)
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
module Parse
|
|
5
|
+
module Embeddings
|
|
6
|
+
# Batch-level orchestration for bulk embedding jobs.
|
|
7
|
+
#
|
|
8
|
+
# {Provider#embed_text_batched} only slices input into
|
|
9
|
+
# provider-sized chunks; any retry/backoff lives inside each
|
|
10
|
+
# provider's single HTTP call. That is the wrong layer for bulk
|
|
11
|
+
# work: a 50k-document backfill needs *batch-level* pacing (stay
|
|
12
|
+
# under the provider's requests-per-minute budget across calls) and
|
|
13
|
+
# *batch-level* backoff (a 429 after the provider's internal retries
|
|
14
|
+
# are exhausted should pause the whole job, not kill it).
|
|
15
|
+
# {BatchEmbedder} wraps any registered provider with both.
|
|
16
|
+
#
|
|
17
|
+
# @example Backfill with pacing and backoff
|
|
18
|
+
# embedder = Parse::Embeddings::BatchEmbedder.new(
|
|
19
|
+
# Parse::Embeddings.provider(:openai),
|
|
20
|
+
# requests_per_minute: 60,
|
|
21
|
+
# max_attempts: 5,
|
|
22
|
+
# )
|
|
23
|
+
# vectors = embedder.embed_text(texts, input_type: :search_document)
|
|
24
|
+
#
|
|
25
|
+
# @example Progress reporting
|
|
26
|
+
# embedder = Parse::Embeddings::BatchEmbedder.new(provider,
|
|
27
|
+
# on_progress: ->(done:, total:, batch_index:, batch_count:) {
|
|
28
|
+
# puts "#{done}/#{total}"
|
|
29
|
+
# })
|
|
30
|
+
#
|
|
31
|
+
# == Retry classification
|
|
32
|
+
#
|
|
33
|
+
# By default a batch is retried when the provider raises a
|
|
34
|
+
# {Parse::Embeddings::Error} subclass whose class name ends in
|
|
35
|
+
# `RateLimitError` or `TransientError` — the convention every
|
|
36
|
+
# bundled provider follows (`OpenAI::RateLimitError`,
|
|
37
|
+
# `Voyage::TransientError`, …). Pass `retry_on:` with explicit
|
|
38
|
+
# exception classes to override. Non-retryable errors (auth,
|
|
39
|
+
# bad-request, response-contract violations) propagate immediately.
|
|
40
|
+
#
|
|
41
|
+
# Vectors are returned aligned 1:1 with the input, identical to
|
|
42
|
+
# `embed_text` on the wrapped provider.
|
|
43
|
+
class BatchEmbedder
|
|
44
|
+
# Raised when a batch still fails after `max_attempts` retryable
|
|
45
|
+
# failures. Wraps the final provider error in `#cause` and carries
|
|
46
|
+
# the index of the failing batch so a resumable job knows where to
|
|
47
|
+
# pick up.
|
|
48
|
+
class BatchFailed < Parse::Embeddings::Error
|
|
49
|
+
# @return [Integer] zero-based index of the failing batch.
|
|
50
|
+
attr_reader :batch_index
|
|
51
|
+
# @return [Integer] number of inputs successfully embedded before the failure.
|
|
52
|
+
attr_reader :completed_count
|
|
53
|
+
|
|
54
|
+
def initialize(message, batch_index:, completed_count:)
|
|
55
|
+
@batch_index = batch_index
|
|
56
|
+
@completed_count = completed_count
|
|
57
|
+
super(message)
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
RETRYABLE_NAME_SUFFIXES = %w[RateLimitError TransientError].freeze
|
|
62
|
+
|
|
63
|
+
# @return [Provider] the wrapped provider.
|
|
64
|
+
attr_reader :provider
|
|
65
|
+
|
|
66
|
+
# @param provider [Provider] any registered embedding provider.
|
|
67
|
+
# @param batch_size [Integer, nil] inputs per provider call.
|
|
68
|
+
# Defaults to the provider's own {Provider#embed_batch_size}
|
|
69
|
+
# hint, falling back to 64 when the provider has none.
|
|
70
|
+
# @param requests_per_minute [Numeric, nil] batch-level pacing
|
|
71
|
+
# budget. When set, consecutive provider calls are spaced at
|
|
72
|
+
# least `60.0 / requests_per_minute` seconds apart. nil disables
|
|
73
|
+
# pacing.
|
|
74
|
+
# @param max_attempts [Integer] attempts per batch (1 = no retry).
|
|
75
|
+
# @param base_delay [Numeric] first backoff delay in seconds;
|
|
76
|
+
# doubles per attempt.
|
|
77
|
+
# @param max_delay [Numeric] backoff ceiling in seconds.
|
|
78
|
+
# @param jitter [Numeric] maximum random fractional increase applied to each
|
|
79
|
+
# delay (`delay * (1 + rand * jitter)`); spreads thundering
|
|
80
|
+
# herds when several workers back off together.
|
|
81
|
+
# @param retry_on [Array<Class>, nil] explicit retryable exception
|
|
82
|
+
# classes; nil uses the name-suffix convention described above.
|
|
83
|
+
# @param on_progress [#call, nil] callable invoked after each
|
|
84
|
+
# successful batch with `done:, total:, batch_index:, batch_count:`.
|
|
85
|
+
def initialize(provider, batch_size: nil, requests_per_minute: nil,
|
|
86
|
+
max_attempts: 5, base_delay: 2.0, max_delay: 60.0,
|
|
87
|
+
jitter: 0.25, retry_on: nil, on_progress: nil)
|
|
88
|
+
unless provider.is_a?(Provider)
|
|
89
|
+
raise ArgumentError,
|
|
90
|
+
"Parse::Embeddings::BatchEmbedder expects a Parse::Embeddings::Provider " \
|
|
91
|
+
"(got #{provider.class})."
|
|
92
|
+
end
|
|
93
|
+
@provider = provider
|
|
94
|
+
@batch_size = batch_size ? Integer(batch_size) : nil
|
|
95
|
+
raise ArgumentError, "batch_size must be positive" if @batch_size && @batch_size <= 0
|
|
96
|
+
@min_interval = requests_per_minute ? (60.0 / Float(requests_per_minute)) : nil
|
|
97
|
+
@max_attempts = Integer(max_attempts)
|
|
98
|
+
raise ArgumentError, "max_attempts must be >= 1" if @max_attempts < 1
|
|
99
|
+
@base_delay = Float(base_delay)
|
|
100
|
+
@max_delay = Float(max_delay)
|
|
101
|
+
@jitter = Float(jitter)
|
|
102
|
+
@retry_on = retry_on && Array(retry_on)
|
|
103
|
+
@on_progress = on_progress
|
|
104
|
+
@last_call_at = nil
|
|
105
|
+
end
|
|
106
|
+
|
|
107
|
+
# Embed `strings` through the wrapped provider, one slice at a time,
# with pacing and batch-level backoff applied to every provider call.
#
# The slice size is the configured batch size, else the provider's
# `embed_batch_size`, else 64. After each successful slice the
# optional progress callback is invoked.
#
# @param strings [Array<String>]
# @param input_type [Symbol]
# @return [Array<Array<Float>>] aligned 1:1 with `strings`.
# @raise [ArgumentError] when `strings` is not an Array.
# @raise [BatchFailed] when a batch exhausts its attempts.
def embed_text(strings, input_type: :search_document)
  unless strings.is_a?(Array)
    raise ArgumentError,
          "Parse::Embeddings::BatchEmbedder#embed_text expects Array<String> (got #{strings.class})."
  end
  return [] if strings.empty?

  slice_size = @batch_size || @provider.embed_batch_size || 64
  slices = strings.each_slice(slice_size).to_a

  # `vectors` is the accumulator and the return value; its length before
  # each slice doubles as the "completed so far" count handed to run_batch.
  slices.each_with_index.with_object([]) do |(slice, position), vectors|
    vectors.concat(run_batch(slice, input_type, position, vectors.length))
    next unless @on_progress

    @on_progress.call(done: vectors.length, total: strings.length,
                      batch_index: position, batch_count: slices.length)
  end
end
|
|
134
|
+
|
|
135
|
+
private
|
|
136
|
+
|
|
137
|
+
# Send one batch to the provider, retrying retryable failures with
# backoff until the attempt budget is used up.
#
# @param batch [Array<String>] the slice to embed.
# @param input_type [Symbol] forwarded to the provider.
# @param batch_index [Integer] zero-based position, reported on failure.
# @param completed_count [Integer] inputs embedded so far, reported on failure.
# @return [Object] whatever the provider's `embed_text` returns.
# @raise [BatchFailed] when a retryable error persists past `@max_attempts`.
def run_batch(batch, input_type, batch_index, completed_count)
  # Counts up without bound; the loop exits by returning or raising.
  1.step do |attempt|
    begin
      pace!
      return @provider.embed_text(batch, input_type: input_type)
    rescue StandardError => failure
      # Non-retryable errors pass through untouched.
      raise unless retryable?(failure)

      if attempt >= @max_attempts
        raise BatchFailed.new(
          "Parse::Embeddings::BatchEmbedder: batch #{batch_index} failed after " \
          "#{attempt} attempt(s) — #{failure.class}: #{failure.message}",
          batch_index: batch_index, completed_count: completed_count,
        )
      end
      sleep(backoff_delay(attempt))
    end
  end
end
|
|
156
|
+
|
|
157
|
+
# Decide whether `error` should trigger another attempt.
#
# With an explicit `retry_on` list, only instances of those classes
# qualify. Otherwise the error must be a Parse::Embeddings::Error whose
# class name ends with one of RETRYABLE_NAME_SUFFIXES.
#
# @param error [Exception]
# @return [Boolean]
def retryable?(error)
  return @retry_on.any? { |klass| error.is_a?(klass) } if @retry_on
  return false unless error.is_a?(Parse::Embeddings::Error)

  # `name` is nil for anonymous classes, hence the `to_s`.
  error.class.name.to_s.end_with?(*RETRYABLE_NAME_SUFFIXES)
end
|
|
165
|
+
|
|
166
|
+
# Compute the sleep before the next retry: exponential growth from
# `@base_delay`, capped at `@max_delay`, then stretched by a random
# factor in `1.0 ... 1.0 + @jitter`.
#
# @param attempt [Integer] one-based number of the attempt that just failed.
# @return [Float] seconds to wait.
def backoff_delay(attempt)
  exponential = @base_delay * (2**(attempt - 1))
  capped = [exponential, @max_delay].min
  spread = 1.0 + rand * @jitter
  capped * spread
end
|
|
170
|
+
|
|
171
|
+
# Enforce the minimum spacing between provider calls. The interval is
# measured from the START of the previous call, so time spent waiting
# on a slow provider response counts toward the interval instead of
# being added on top of it. Does nothing when pacing is disabled.
def pace!
  return if @min_interval.nil?

  monotonic = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }
  stamp = monotonic.call
  if @last_call_at
    remaining = (@last_call_at + @min_interval) - stamp
    if remaining > 0
      sleep(remaining)
      # Re-read the clock so the recorded start reflects the time after the wait.
      stamp = monotonic.call
    end
  end
  @last_call_at = stamp
end
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
end
|