parse-stack-next 5.4.1 → 5.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +344 -0
- data/Gemfile.lock +1 -1
- data/README.md +45 -6
- data/docs/atlas_vector_search_guide.md +314 -19
- data/lib/parse/api/users.rb +10 -0
- data/lib/parse/client.rb +19 -1
- data/lib/parse/embeddings/batch_embedder.rb +188 -0
- data/lib/parse/embeddings/cache.rb +322 -0
- data/lib/parse/embeddings/cohere.rb +31 -18
- data/lib/parse/embeddings/image_fetch.rb +347 -0
- data/lib/parse/embeddings/provider.rb +17 -11
- data/lib/parse/embeddings/spend_cap.rb +117 -3
- data/lib/parse/embeddings/voyage.rb +34 -25
- data/lib/parse/embeddings.rb +40 -3
- data/lib/parse/model/acl.rb +15 -11
- data/lib/parse/model/core/embed_managed.rb +243 -14
- data/lib/parse/model/core/vector_searchable.rb +157 -8
- data/lib/parse/query/constraint.rb +22 -0
- data/lib/parse/query/constraints.rb +271 -250
- data/lib/parse/query.rb +233 -42
- data/lib/parse/retrieval/agent_tool.rb +21 -14
- data/lib/parse/retrieval/retriever.rb +84 -0
- data/lib/parse/schema/search_index_migrator.rb +48 -1
- data/lib/parse/stack/version.rb +1 -1
- data/lib/parse/vector_search/hybrid.rb +39 -1
- data/lib/parse/vector_search.rb +34 -0
- data/lib/parse/webhooks/payload.rb +7 -1
- data/lib/parse/webhooks.rb +107 -21
- metadata +4 -1
data/lib/parse/embeddings.rb
CHANGED
|
@@ -246,6 +246,7 @@ module Parse
|
|
|
246
246
|
CONFIG_MUTEX.synchronize do
|
|
247
247
|
@configuration = nil
|
|
248
248
|
@allowed_image_hosts = nil
|
|
249
|
+
@allowed_image_types = nil
|
|
249
250
|
@trust_provider_url_fetch = nil
|
|
250
251
|
end
|
|
251
252
|
end
|
|
@@ -298,6 +299,30 @@ module Parse
|
|
|
298
299
|
@allowed_image_hosts ||= [].freeze
|
|
299
300
|
end
|
|
300
301
|
|
|
302
|
+
# Configure the MIME types the bytes-fetch path accepts after
|
|
303
|
+
# magic-byte sniffing (see {ImageFetch.verify!}). Defaults to
|
|
304
|
+
# {ImageFetch::DEFAULT_ALLOWED_IMAGE_TYPES} (JPEG / PNG / GIF /
|
|
305
|
+
# WebP). The sniffed type — never the `Content-Type` header — is
|
|
306
|
+
# checked against this list, so adding a type here only matters
|
|
307
|
+
# when {ImageFetch.sniff_mime} can recognize its magic bytes.
|
|
308
|
+
#
|
|
309
|
+
# @param types [Array<String>] MIME type strings.
|
|
310
|
+
# @return [Array<String>]
|
|
311
|
+
def allowed_image_types=(types)
|
|
312
|
+
unless types.is_a?(Array) && !types.empty? &&
|
|
313
|
+
types.all? { |t| t.is_a?(String) && t.include?("/") }
|
|
314
|
+
raise ArgumentError,
|
|
315
|
+
"Parse::Embeddings.allowed_image_types= expects a non-empty Array of " \
|
|
316
|
+
"MIME type Strings (got #{types.inspect})."
|
|
317
|
+
end
|
|
318
|
+
CONFIG_MUTEX.synchronize { @allowed_image_types = types.dup.freeze }
|
|
319
|
+
end
|
|
320
|
+
|
|
321
|
+
# @return [Array<String>] MIME allowlist for the bytes-fetch path (frozen).
|
|
322
|
+
def allowed_image_types
|
|
323
|
+
@allowed_image_types ||= ImageFetch::DEFAULT_ALLOWED_IMAGE_TYPES
|
|
324
|
+
end
|
|
325
|
+
|
|
301
326
|
# Sentinel-gated opt-in for forwarding image URLs to embedding
|
|
302
327
|
# providers. Assign the exact {TRUST_PROVIDER_URL_FETCH_SENTINEL}
|
|
303
328
|
# String to unlock; any other value (including `true`, `1`,
|
|
@@ -357,11 +382,20 @@ module Parse
|
|
|
357
382
|
# @param allow_insecure [Boolean] permit `http://` (default
|
|
358
383
|
# false). Only meaningful for local development / container-
|
|
359
384
|
# internal CDN proxies.
|
|
385
|
+
# @param mode [Symbol] `:forward` (default) validates for
|
|
386
|
+
# URL-forwarding to a provider and requires the
|
|
387
|
+
# {.trust_provider_url_fetch=} sentinel. `:fetch` validates for
|
|
388
|
+
# the SDK's OWN download through {Parse::File.safe_open_url}
|
|
389
|
+
# (the v5.5 bytes path) and skips the sentinel — no URL is
|
|
390
|
+
# forwarded to a third party, so the provider-egress
|
|
391
|
+
# acknowledgment doesn't apply. Every other layer (host
|
|
392
|
+
# allowlist deny-by-default, obfuscated-IP screen, port
|
|
393
|
+
# allowlist, CIDR resolution check) is identical in both modes.
|
|
360
394
|
# @return [String] canonicalized URL (`URI.parse(url).to_s`).
|
|
361
|
-
# @raise [ConfirmationRequired] when the sentinel is unset.
|
|
395
|
+
# @raise [ConfirmationRequired] when the sentinel is unset (`:forward` mode).
|
|
362
396
|
# @raise [InvalidImageURL] on any other validation failure.
|
|
363
|
-
def validate_image_url!(url, allow_insecure: false)
|
|
364
|
-
unless trust_provider_url_fetch?
|
|
397
|
+
def validate_image_url!(url, allow_insecure: false, mode: :forward)
|
|
398
|
+
unless mode == :fetch || trust_provider_url_fetch?
|
|
365
399
|
hint =
|
|
366
400
|
if allowed_image_hosts.empty?
|
|
367
401
|
" First populate Parse::Embeddings.allowed_image_hosts with the CDN " \
|
|
@@ -555,3 +589,6 @@ require_relative "embeddings/jina"
|
|
|
555
589
|
require_relative "embeddings/qwen"
|
|
556
590
|
require_relative "embeddings/local_http"
|
|
557
591
|
require_relative "embeddings/spend_cap"
|
|
592
|
+
require_relative "embeddings/image_fetch"
|
|
593
|
+
require_relative "embeddings/cache"
|
|
594
|
+
require_relative "embeddings/batch_embedder"
|
data/lib/parse/model/acl.rb
CHANGED
|
@@ -211,8 +211,8 @@ module Parse
|
|
|
211
211
|
# @example
|
|
212
212
|
# permissions = ["*", user.id] + user.acl_roles.to_a.map { |n| "role:#{n}" }
|
|
213
213
|
# pipeline << { "$match" => Parse::ACL.read_predicate(permissions) }
|
|
214
|
-
def self.read_predicate(permissions, include_public: true)
|
|
215
|
-
permission_predicate("_rperm", permissions, include_public: include_public)
|
|
214
|
+
def self.read_predicate(permissions, include_public: true, include_missing: true)
|
|
215
|
+
permission_predicate("_rperm", permissions, include_public: include_public, include_missing: include_missing)
|
|
216
216
|
end
|
|
217
217
|
|
|
218
218
|
# Build a MongoDB +$match+-shaped predicate that matches documents
|
|
@@ -222,8 +222,8 @@ module Parse
|
|
|
222
222
|
# @param permissions [Array<String>] permission strings.
|
|
223
223
|
# @param include_public [Boolean] whether to append +"*"+.
|
|
224
224
|
# @return [Hash] a MongoDB +$or+ subexpression.
|
|
225
|
-
def self.write_predicate(permissions, include_public: true)
|
|
226
|
-
permission_predicate("_wperm", permissions, include_public: include_public)
|
|
225
|
+
def self.write_predicate(permissions, include_public: true, include_missing: true)
|
|
226
|
+
permission_predicate("_wperm", permissions, include_public: include_public, include_missing: include_missing)
|
|
227
227
|
end
|
|
228
228
|
|
|
229
229
|
# @!visibility private
|
|
@@ -231,15 +231,19 @@ module Parse
|
|
|
231
231
|
# Normalizes the permissions array (string-coerced, deduplicated,
|
|
232
232
|
# +"*"+ appended when +include_public+) and returns the +$or+
|
|
233
233
|
# subexpression.
|
|
234
|
-
|
|
234
|
+
# @param include_missing [Boolean] when true (default), append the
|
|
235
|
+
# +{ field => { "$exists" => false } }+ branch so a missing
|
|
236
|
+
# +_rperm+/+_wperm+ (treated as public by Parse Server) also matches.
|
|
237
|
+
# Set false for an EXACT match that requires the column to be present
|
|
238
|
+
# and to contain one of +permissions+ (the strict/`readable_by_exact`
|
|
239
|
+
# surface). When false and only the +$in+ branch remains, the +$or+
|
|
240
|
+
# wrapper is dropped for a cleaner +{ field => { "$in" => perms } }+.
|
|
241
|
+
def self.permission_predicate(field, permissions, include_public: true, include_missing: true)
|
|
235
242
|
perms = Array(permissions).map(&:to_s).reject(&:empty?).uniq
|
|
236
243
|
perms << "*" if include_public && !perms.include?("*")
|
|
237
|
-
{
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
{ field => { "$exists" => false } },
|
|
241
|
-
],
|
|
242
|
-
}
|
|
244
|
+
branches = [{ field => { "$in" => perms } }]
|
|
245
|
+
branches << { field => { "$exists" => false } } if include_missing
|
|
246
|
+
branches.length == 1 ? branches.first : { "$or" => branches }
|
|
243
247
|
end
|
|
244
248
|
# Determines whether two ACLs or a Parse-ACL hash is equivalent to this object.
|
|
245
249
|
# @example
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# frozen_string_literal: true
|
|
3
3
|
|
|
4
4
|
require "digest"
|
|
5
|
+
require "time"
|
|
5
6
|
require_relative "../../embeddings"
|
|
6
7
|
require_relative "../vector"
|
|
7
8
|
|
|
@@ -108,9 +109,21 @@ module Parse
|
|
|
108
109
|
#
|
|
109
110
|
# `allow_insecure` is forwarded to {.validate_image_url!} for
|
|
110
111
|
# image directives only; ignored for text.
|
|
112
|
+
#
|
|
113
|
+
# `source_mode` (image directives only) is `:url` (forward the
|
|
114
|
+
# validated URL to the provider — v5.1 behavior, requires the
|
|
115
|
+
# `trust_provider_url_fetch` sentinel) or `:bytes` (the SDK
|
|
116
|
+
# downloads via {Parse::File.safe_open_url}, magic-byte-verifies,
|
|
117
|
+
# EXIF-strips, and forwards base64 — v5.5). `exif_strip`
|
|
118
|
+
# (default true) applies to `:bytes` mode only.
|
|
119
|
+
#
|
|
120
|
+
# `meta_field` names the `:object` sibling that records
|
|
121
|
+
# provider/model/dimensions provenance on every recompute — the
|
|
122
|
+
# input {ClassMethods#reembed!} uses to find stale rows after a
|
|
123
|
+
# model migration.
|
|
111
124
|
EmbedDirective = Struct.new(
|
|
112
125
|
:sources, :into, :digest_field, :input_type, :provider_name,
|
|
113
|
-
:modality, :allow_insecure,
|
|
126
|
+
:modality, :allow_insecure, :source_mode, :exif_strip, :meta_field,
|
|
114
127
|
keyword_init: true,
|
|
115
128
|
) do
|
|
116
129
|
def freeze
|
|
@@ -121,6 +134,10 @@ module Parse
|
|
|
121
134
|
def image?
|
|
122
135
|
modality == :image
|
|
123
136
|
end
|
|
137
|
+
|
|
138
|
+
def bytes_mode?
|
|
139
|
+
source_mode == :bytes
|
|
140
|
+
end
|
|
124
141
|
end
|
|
125
142
|
|
|
126
143
|
# @!visibility private
|
|
@@ -184,9 +201,17 @@ module Parse
|
|
|
184
201
|
# @param digest_field [Symbol, nil] override for the digest
|
|
185
202
|
# sibling property. Defaults to `:"#{into}_digest"`. Auto-
|
|
186
203
|
# declared as `:string` if not already declared.
|
|
204
|
+
# @param meta_field [Symbol, nil] override for the provenance
|
|
205
|
+
# sibling property. Defaults to `:"#{into}_meta"`. Auto-
|
|
206
|
+
# declared as `:object` if not already declared; populated
|
|
207
|
+
# with `{ provider:, model:, dimensions:, modality:,
|
|
208
|
+
# embedded_at: }` on every recompute. Read by
|
|
209
|
+
# {ClassMethods#reembed!} to skip rows already embedded by
|
|
210
|
+
# the current provider/model.
|
|
187
211
|
# @return [Symbol] the target vector field name.
|
|
188
212
|
# @raise [InvalidEmbedDeclaration] on declaration-time misuse.
|
|
189
|
-
def embed(*source_fields, into:, input_type: :search_document, digest_field: nil
|
|
213
|
+
def embed(*source_fields, into:, input_type: :search_document, digest_field: nil,
|
|
214
|
+
meta_field: nil)
|
|
190
215
|
if source_fields.empty?
|
|
191
216
|
raise InvalidEmbedDeclaration,
|
|
192
217
|
"#{self}.embed: at least one source field is required."
|
|
@@ -215,6 +240,10 @@ module Parse
|
|
|
215
240
|
unless fields.key?(digest_field)
|
|
216
241
|
property digest_field, :string
|
|
217
242
|
end
|
|
243
|
+
meta_field = (meta_field || :"#{into}_meta").to_sym
|
|
244
|
+
unless fields.key?(meta_field)
|
|
245
|
+
property meta_field, :object
|
|
246
|
+
end
|
|
218
247
|
|
|
219
248
|
directive = EmbedDirective.new(
|
|
220
249
|
sources: sources,
|
|
@@ -222,6 +251,7 @@ module Parse
|
|
|
222
251
|
digest_field: digest_field,
|
|
223
252
|
input_type: input_type,
|
|
224
253
|
provider_name: provider_name,
|
|
254
|
+
meta_field: meta_field,
|
|
225
255
|
).freeze
|
|
226
256
|
embed_directives[into] = directive
|
|
227
257
|
|
|
@@ -243,12 +273,25 @@ module Parse
|
|
|
243
273
|
# Declare a managed image embedding. Mirrors {.embed} but the
|
|
244
274
|
# source field is a `:file` property (Parse::File) and the
|
|
245
275
|
# provider call routes through {Parse::Embeddings::Provider#embed_image}
|
|
246
|
-
# rather than `#embed_text`.
|
|
247
|
-
#
|
|
248
|
-
#
|
|
249
|
-
#
|
|
250
|
-
#
|
|
251
|
-
#
|
|
276
|
+
# rather than `#embed_text`. Two fetch modes (`source:`):
|
|
277
|
+
#
|
|
278
|
+
# * `:url` (default, v5.1 behavior) — the SDK extracts the
|
|
279
|
+
# file's URL, validates it through
|
|
280
|
+
# {Parse::Embeddings.validate_image_url!} (sentinel-gated
|
|
281
|
+
# egress opt-in, CIDR / port / host allowlist), and forwards
|
|
282
|
+
# the canonicalized URL to the provider, which performs its
|
|
283
|
+
# own fetch. The SDK does NOT download image bytes.
|
|
284
|
+
# * `:bytes` (v5.5) — the SDK downloads the image itself via
|
|
285
|
+
# {Parse::File.safe_open_url} (through
|
|
286
|
+
# {Parse::Embeddings::ImageFetch.fetch!}), verifies the
|
|
287
|
+
# content by magic-byte sniff against
|
|
288
|
+
# {Parse::Embeddings.allowed_image_types} (the Content-Type
|
|
289
|
+
# header is never trusted), strips EXIF/XMP metadata by
|
|
290
|
+
# default, and forwards the bytes to the provider as a
|
|
291
|
+
# base64 data URI. Does NOT require the
|
|
292
|
+
# `trust_provider_url_fetch` sentinel (no third-party URL
|
|
293
|
+
# egress), but the file's host must still be in
|
|
294
|
+
# {Parse::Embeddings.allowed_image_hosts}.
|
|
252
295
|
#
|
|
253
296
|
# **Digest is the URL string, not the file contents.** Replacing
|
|
254
297
|
# the Parse::File with one pointing to a different URL re-embeds;
|
|
@@ -272,10 +315,21 @@ module Parse
|
|
|
272
315
|
# @param allow_insecure [Boolean] forwarded to
|
|
273
316
|
# {Parse::Embeddings.validate_image_url!}; permit `http://`
|
|
274
317
|
# for local-dev CDN proxies. Default false.
|
|
318
|
+
# @param source [Symbol] `:url` (provider fetches; default) or
|
|
319
|
+
# `:bytes` (SDK fetches, verifies, strips, forwards base64).
|
|
320
|
+
# @param exif_strip [Boolean] strip EXIF/XMP metadata before
|
|
321
|
+
# forwarding bytes (default true; `:bytes` mode only —
|
|
322
|
+
# ignored for `:url`, where the SDK never sees the bytes).
|
|
323
|
+
# @param meta_field [Symbol, nil] override for the provenance
|
|
324
|
+
# sibling property. Defaults to `:"#{into}_meta"`; see {.embed}.
|
|
275
325
|
# @return [Symbol] the target vector field name.
|
|
276
326
|
# @raise [InvalidEmbedDeclaration] on declaration-time misuse.
|
|
277
327
|
def embed_image(source_field, into:, input_type: :search_document,
|
|
278
|
-
digest_field: nil, allow_insecure: false
|
|
328
|
+
digest_field: nil, allow_insecure: false,
|
|
329
|
+
source: :url, exif_strip: true, meta_field: nil)
|
|
330
|
+
# Capture the fetch mode immediately — the legacy local
|
|
331
|
+
# `source = source_field.to_sym` below shadows the kwarg.
|
|
332
|
+
source_mode = source_mode_for_embed_image!(source)
|
|
279
333
|
into = into.to_sym
|
|
280
334
|
unless vector_properties.key?(into)
|
|
281
335
|
raise InvalidEmbedDeclaration,
|
|
@@ -306,6 +360,10 @@ module Parse
|
|
|
306
360
|
unless fields.key?(digest_field)
|
|
307
361
|
property digest_field, :string
|
|
308
362
|
end
|
|
363
|
+
meta_field = (meta_field || :"#{into}_meta").to_sym
|
|
364
|
+
unless fields.key?(meta_field)
|
|
365
|
+
property meta_field, :object
|
|
366
|
+
end
|
|
309
367
|
|
|
310
368
|
directive = EmbedDirective.new(
|
|
311
369
|
sources: [source],
|
|
@@ -315,6 +373,9 @@ module Parse
|
|
|
315
373
|
provider_name: provider_name,
|
|
316
374
|
modality: :image,
|
|
317
375
|
allow_insecure: allow_insecure,
|
|
376
|
+
source_mode: source_mode,
|
|
377
|
+
exif_strip: exif_strip ? true : false,
|
|
378
|
+
meta_field: meta_field,
|
|
318
379
|
).freeze
|
|
319
380
|
embed_directives[into] = directive
|
|
320
381
|
|
|
@@ -333,6 +394,126 @@ module Parse
|
|
|
333
394
|
into
|
|
334
395
|
end
|
|
335
396
|
|
|
397
|
+
# @!visibility private
|
|
398
|
+
# Validate the `source:` kwarg of {.embed_image}.
|
|
399
|
+
def source_mode_for_embed_image!(source)
|
|
400
|
+
mode = source.to_sym
|
|
401
|
+
unless %i[url bytes].include?(mode)
|
|
402
|
+
raise InvalidEmbedDeclaration,
|
|
403
|
+
"#{self}.embed_image: source: must be :url or :bytes (got #{source.inspect})."
|
|
404
|
+
end
|
|
405
|
+
mode
|
|
406
|
+
end
|
|
407
|
+
|
|
408
|
+
# Re-embed records through the CURRENT provider/model — the bulk
|
|
409
|
+
# migration counterpart to {#embed_pending!} (which only fills
|
|
410
|
+
# null vectors). Use after changing a `:vector` property's
|
|
411
|
+
# `provider:` / `model:` / `dimensions:` declaration: walks the
|
|
412
|
+
# class with objectId-cursor pagination, clears each record's
|
|
413
|
+
# digest sibling so the `before_save` recompute cannot elide the
|
|
414
|
+
# provider call, and saves.
|
|
415
|
+
#
|
|
416
|
+
# With `only_stale: true`, rows whose `<into>_meta` provenance
|
|
417
|
+
# already matches the current provider name, model, and declared
|
|
418
|
+
# dimensions are skipped without a provider call — making the
|
|
419
|
+
# operation resumable: re-running after a partial failure only
|
|
420
|
+
# touches rows still carrying old-model vectors. Rows with no
|
|
421
|
+
# meta record (embedded before v5.5) always count as stale.
|
|
422
|
+
#
|
|
423
|
+
# Intended as an admin / maintenance operation: run it with a
|
|
424
|
+
# master-key client (or pass `save_opts:` carrying a
|
|
425
|
+
# `session_token:` that can write every row). Combine with
|
|
426
|
+
# {Parse::Embeddings::BatchEmbedder}-style pacing externally if
|
|
427
|
+
# the provider rate-limits — each record's save makes one
|
|
428
|
+
# provider call.
|
|
429
|
+
#
|
|
430
|
+
# @param field [Symbol, nil] limit to one embed target; nil
|
|
431
|
+
# processes every declared directive.
|
|
432
|
+
# @param batch_size [Integer] rows fetched per round (default 100).
|
|
433
|
+
# @param limit [Integer, nil] stop after re-embedding at most
|
|
434
|
+
# this many records across all directives; nil = no cap.
|
|
435
|
+
# @param where [Hash, nil] extra query constraints (e.g.
|
|
436
|
+
# `{ published: true }`).
|
|
437
|
+
# @param only_stale [Boolean] skip rows whose meta provenance
|
|
438
|
+
# matches the current provider/model/dimensions (default false
|
|
439
|
+
# — re-embed everything).
|
|
440
|
+
# @param save_opts [Hash] options forwarded to each `record.save`.
|
|
441
|
+
# @return [Integer] number of records re-embedded (saved).
|
|
442
|
+
# @raise [ArgumentError] when `field:` names no embed target, or
|
|
443
|
+
# the class declares no `embed` directives.
|
|
444
|
+
def reembed!(field: nil, batch_size: 100, limit: nil, where: nil,
|
|
445
|
+
only_stale: false, save_opts: {})
|
|
446
|
+
bs = Integer(batch_size)
|
|
447
|
+
raise ArgumentError, "#{self}.reembed!: batch_size must be positive." if bs <= 0
|
|
448
|
+
directives = resolve_embed_directives_for_backfill(field, caller_label: "reembed!")
|
|
449
|
+
|
|
450
|
+
processed = 0
|
|
451
|
+
directives.each do |directive|
|
|
452
|
+
remaining = limit ? (limit - processed) : nil
|
|
453
|
+
break if remaining && remaining <= 0
|
|
454
|
+
processed += reembed_directive!(directive, bs, where, remaining, only_stale, save_opts)
|
|
455
|
+
end
|
|
456
|
+
processed
|
|
457
|
+
end
|
|
458
|
+
|
|
459
|
+
# @!visibility private
|
|
460
|
+
# objectId-cursor walk over ALL rows (subject to `where:`),
|
|
461
|
+
# clearing the digest so the save-path recompute re-embeds.
|
|
462
|
+
def reembed_directive!(directive, batch_size, where, remaining, only_stale, save_opts)
|
|
463
|
+
count = 0
|
|
464
|
+
cursor = nil
|
|
465
|
+
current = only_stale ? current_embed_identity(directive) : nil
|
|
466
|
+
loop do
|
|
467
|
+
q = query
|
|
468
|
+
q = q.where(where) if where.is_a?(Hash) && !where.empty?
|
|
469
|
+
q = q.where(:objectId.gt => cursor) if cursor
|
|
470
|
+
q.order(:objectId.asc)
|
|
471
|
+
q.limit(batch_size)
|
|
472
|
+
batch = q.results
|
|
473
|
+
break if batch.nil? || batch.empty?
|
|
474
|
+
|
|
475
|
+
batch.each do |record|
|
|
476
|
+
cursor = record.id
|
|
477
|
+
next if current && embed_meta_current?(record, directive, current)
|
|
478
|
+
record.public_send(:"#{directive.digest_field}=", nil)
|
|
479
|
+
record.save(**save_opts)
|
|
480
|
+
count += 1
|
|
481
|
+
return count if remaining && count >= remaining
|
|
482
|
+
end
|
|
483
|
+
break if batch.length < batch_size
|
|
484
|
+
end
|
|
485
|
+
count
|
|
486
|
+
end
|
|
487
|
+
|
|
488
|
+
# @!visibility private
|
|
489
|
+
# The provenance tuple a freshly-embedded row would carry today.
|
|
490
|
+
def current_embed_identity(directive)
|
|
491
|
+
model = begin
|
|
492
|
+
Parse::Embeddings.provider(directive.provider_name).model_name
|
|
493
|
+
rescue Parse::Embeddings::ProviderNotRegistered
|
|
494
|
+
raise
|
|
495
|
+
rescue NotImplementedError
|
|
496
|
+
nil
|
|
497
|
+
end
|
|
498
|
+
{
|
|
499
|
+
"provider" => directive.provider_name.to_s,
|
|
500
|
+
"model" => model,
|
|
501
|
+
"dimensions" => vector_properties.dig(directive.into, :dimensions),
|
|
502
|
+
}
|
|
503
|
+
end
|
|
504
|
+
|
|
505
|
+
# @!visibility private
|
|
506
|
+
# True when the record's meta sibling matches `current` (so
|
|
507
|
+
# `only_stale: true` can skip it). Missing/foreign-shaped meta
|
|
508
|
+
# counts as stale.
|
|
509
|
+
def embed_meta_current?(record, directive, current)
|
|
510
|
+
meta = directive.meta_field && record.public_send(directive.meta_field)
|
|
511
|
+
return false unless meta.is_a?(Hash)
|
|
512
|
+
%w[provider model dimensions].all? do |key|
|
|
513
|
+
current[key].nil? || meta[key] == current[key] || meta[key.to_sym] == current[key]
|
|
514
|
+
end
|
|
515
|
+
end
|
|
516
|
+
|
|
336
517
|
# Backfill embeddings for records whose managed vector field is
|
|
337
518
|
# still null — the bulk counterpart to the per-save embed path.
|
|
338
519
|
# Walks the class with objectId-cursor pagination (robust to the
|
|
@@ -372,18 +553,20 @@ module Parse
|
|
|
372
553
|
end
|
|
373
554
|
|
|
374
555
|
# @!visibility private
|
|
375
|
-
|
|
556
|
+
# `caller_label` names the public entry point in error messages so
|
|
557
|
+
# a reembed! misuse is not reported as an embed_pending! one.
|
|
558
|
+
def resolve_embed_directives_for_backfill(field, caller_label: "embed_pending!")
|
|
376
559
|
if field
|
|
377
560
|
d = embed_directives[field.to_sym]
|
|
378
561
|
unless d
|
|
379
562
|
raise ArgumentError,
|
|
380
|
-
"#{self}
|
|
563
|
+
"#{self}.#{caller_label}: :#{field} is not an embed target " \
|
|
381
564
|
"(have #{embed_directives.keys.inspect})."
|
|
382
565
|
end
|
|
383
566
|
[d]
|
|
384
567
|
else
|
|
385
568
|
ds = embed_directives.values
|
|
386
|
-
raise ArgumentError, "#{self}
|
|
569
|
+
raise ArgumentError, "#{self}.#{caller_label}: no `embed` directives declared." if ds.empty?
|
|
387
570
|
ds
|
|
388
571
|
end
|
|
389
572
|
end
|
|
@@ -470,6 +653,7 @@ module Parse
|
|
|
470
653
|
record.public_send(:"#{directive.into}=", nil)
|
|
471
654
|
end
|
|
472
655
|
record.public_send(:"#{directive.digest_field}=", nil)
|
|
656
|
+
clear_embed_meta(record, directive)
|
|
473
657
|
end
|
|
474
658
|
return
|
|
475
659
|
end
|
|
@@ -498,6 +682,36 @@ module Parse
|
|
|
498
682
|
record.public_send(:"#{directive.into}=", vector)
|
|
499
683
|
end
|
|
500
684
|
record.public_send(:"#{directive.digest_field}=", digest)
|
|
685
|
+
stamp_embed_meta(record, directive, provider, vector)
|
|
686
|
+
end
|
|
687
|
+
|
|
688
|
+
# @!visibility private
|
|
689
|
+
# Record provider/model provenance on the `<into>_meta` sibling so
|
|
690
|
+
# migration tooling ({ClassMethods#reembed!} `only_stale:`) can
|
|
691
|
+
# tell which model produced the stored vector. String keys —
|
|
692
|
+
# `:object` properties round-trip through JSON.
|
|
693
|
+
def self.stamp_embed_meta(record, directive, provider, vector)
|
|
694
|
+
return if directive.meta_field.nil?
|
|
695
|
+
return unless record.respond_to?(:"#{directive.meta_field}=")
|
|
696
|
+
model = begin
|
|
697
|
+
provider.model_name
|
|
698
|
+
rescue NotImplementedError
|
|
699
|
+
nil
|
|
700
|
+
end
|
|
701
|
+
record.public_send(:"#{directive.meta_field}=", {
|
|
702
|
+
"provider" => directive.provider_name.to_s,
|
|
703
|
+
"model" => model,
|
|
704
|
+
"dimensions" => vector.dimensions,
|
|
705
|
+
"modality" => directive.image? ? "image" : "text",
|
|
706
|
+
"embedded_at" => Time.now.utc.iso8601,
|
|
707
|
+
})
|
|
708
|
+
end
|
|
709
|
+
|
|
710
|
+
# @!visibility private
|
|
711
|
+
def self.clear_embed_meta(record, directive)
|
|
712
|
+
return if directive.meta_field.nil?
|
|
713
|
+
return unless record.respond_to?(:"#{directive.meta_field}=")
|
|
714
|
+
record.public_send(:"#{directive.meta_field}=", nil)
|
|
501
715
|
end
|
|
502
716
|
|
|
503
717
|
# @!visibility private
|
|
@@ -529,10 +743,25 @@ module Parse
|
|
|
529
743
|
end
|
|
530
744
|
|
|
531
745
|
# @!visibility private
|
|
532
|
-
# Dispatch the provider call based on directive modality
|
|
746
|
+
# Dispatch the provider call based on directive modality and (for
|
|
747
|
+
# images) fetch mode. `:bytes` mode downloads + verifies + strips
|
|
748
|
+
# through {Parse::Embeddings::ImageFetch.fetch!} and hands the
|
|
749
|
+
# provider a {Parse::Embeddings::ImageFetch::FetchedImage}; `:url`
|
|
750
|
+
# mode forwards the raw URL String (the provider validates and
|
|
751
|
+
# fetches it itself).
|
|
533
752
|
def self.call_provider(provider, directive, input)
|
|
534
753
|
if directive.image?
|
|
535
|
-
|
|
754
|
+
source =
|
|
755
|
+
if directive.bytes_mode?
|
|
756
|
+
Parse::Embeddings::ImageFetch.fetch!(
|
|
757
|
+
input,
|
|
758
|
+
allow_insecure: directive.allow_insecure ? true : false,
|
|
759
|
+
exif_strip: directive.exif_strip != false,
|
|
760
|
+
)
|
|
761
|
+
else
|
|
762
|
+
input
|
|
763
|
+
end
|
|
764
|
+
provider.embed_image([source],
|
|
536
765
|
input_type: directive.input_type,
|
|
537
766
|
allow_insecure: directive.allow_insecure ? true : false)
|
|
538
767
|
else
|