parse-stack-next 5.4.1 → 5.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -246,6 +246,7 @@ module Parse
246
246
  CONFIG_MUTEX.synchronize do
247
247
  @configuration = nil
248
248
  @allowed_image_hosts = nil
249
+ @allowed_image_types = nil
249
250
  @trust_provider_url_fetch = nil
250
251
  end
251
252
  end
@@ -298,6 +299,30 @@ module Parse
298
299
  @allowed_image_hosts ||= [].freeze
299
300
  end
300
301
 
302
+ # Configure the MIME types the bytes-fetch path accepts after
303
+ # magic-byte sniffing (see {ImageFetch.verify!}). Defaults to
304
+ # {ImageFetch::DEFAULT_ALLOWED_IMAGE_TYPES} (JPEG / PNG / GIF /
305
+ # WebP). The sniffed type — never the `Content-Type` header — is
306
+ # checked against this list, so adding a type here only matters
307
+ # when {ImageFetch.sniff_mime} can recognize its magic bytes.
308
+ #
309
+ # @param types [Array<String>] MIME type strings.
310
+ # @return [Array<String>]
311
# @raise [ArgumentError] unless `types` is a non-empty Array whose
#   members are all Strings containing a "/".
# @note The stored list is a frozen copy whose entries are themselves
#   frozen copies, so Strings the caller still holds cannot be mutated
#   into values that never went through the validation below.
def allowed_image_types=(types)
  well_formed = types.is_a?(Array) && !types.empty? &&
                types.all? { |t| t.is_a?(String) && t.include?("/") }
  unless well_formed
    raise ArgumentError,
          "Parse::Embeddings.allowed_image_types= expects a non-empty Array of " \
          "MIME type Strings (got #{types.inspect})."
  end
  # Snapshot every entry, not just the container: `types.dup.freeze`
  # alone would keep sharing the caller's (possibly mutable) Strings.
  snapshot = types.map { |t| t.dup.freeze }.freeze
  CONFIG_MUTEX.synchronize { @allowed_image_types = snapshot }
end
320
+
321
+ # @return [Array<String>] MIME allowlist for the bytes-fetch path (frozen).
322
# Returns the assigned allowlist when one is set; otherwise memoizes and
# returns ImageFetch::DEFAULT_ALLOWED_IMAGE_TYPES.
def allowed_image_types
  return @allowed_image_types if @allowed_image_types

  @allowed_image_types = ImageFetch::DEFAULT_ALLOWED_IMAGE_TYPES
end
325
+
301
326
  # Sentinel-gated opt-in for forwarding image URLs to embedding
302
327
  # providers. Assign the exact {TRUST_PROVIDER_URL_FETCH_SENTINEL}
303
328
  # String to unlock; any other value (including `true`, `1`,
@@ -357,11 +382,20 @@ module Parse
357
382
  # @param allow_insecure [Boolean] permit `http://` (default
358
383
  # false). Only meaningful for local development / container-
359
384
  # internal CDN proxies.
385
+ # @param mode [Symbol] `:forward` (default) validates for
386
+ # URL-forwarding to a provider and requires the
387
+ # {.trust_provider_url_fetch=} sentinel. `:fetch` validates for
388
+ # the SDK's OWN download through {Parse::File.safe_open_url}
389
+ # (the v5.5 bytes path) and skips the sentinel — no URL is
390
+ # forwarded to a third party, so the provider-egress
391
+ # acknowledgment doesn't apply. Every other layer (host
392
+ # allowlist deny-by-default, obfuscated-IP screen, port
393
+ # allowlist, CIDR resolution check) is identical in both modes.
360
394
  # @return [String] canonicalized URL (`URI.parse(url).to_s`).
361
- # @raise [ConfirmationRequired] when the sentinel is unset.
395
+ # @raise [ConfirmationRequired] when the sentinel is unset (`:forward` mode).
362
396
  # @raise [InvalidImageURL] on any other validation failure.
363
- def validate_image_url!(url, allow_insecure: false)
364
- unless trust_provider_url_fetch?
397
+ def validate_image_url!(url, allow_insecure: false, mode: :forward)
398
+ unless mode == :fetch || trust_provider_url_fetch?
365
399
  hint =
366
400
  if allowed_image_hosts.empty?
367
401
  " First populate Parse::Embeddings.allowed_image_hosts with the CDN " \
@@ -555,3 +589,6 @@ require_relative "embeddings/jina"
555
589
  require_relative "embeddings/qwen"
556
590
  require_relative "embeddings/local_http"
557
591
  require_relative "embeddings/spend_cap"
592
+ require_relative "embeddings/image_fetch"
593
+ require_relative "embeddings/cache"
594
+ require_relative "embeddings/batch_embedder"
@@ -211,8 +211,8 @@ module Parse
211
211
  # @example
212
212
  # permissions = ["*", user.id] + user.acl_roles.to_a.map { |n| "role:#{n}" }
213
213
  # pipeline << { "$match" => Parse::ACL.read_predicate(permissions) }
214
# Thin wrapper: builds the predicate for the "_rperm" column, passing
# both flags through to permission_predicate unchanged.
def self.read_predicate(permissions, include_public: true, include_missing: true)
  permission_predicate(
    "_rperm",
    permissions,
    include_public: include_public,
    include_missing: include_missing,
  )
end
217
217
 
218
218
  # Build a MongoDB +$match+-shaped predicate that matches documents
@@ -222,8 +222,8 @@ module Parse
222
222
  # @param permissions [Array<String>] permission strings.
223
223
  # @param include_public [Boolean] whether to append +"*"+.
224
224
  # @return [Hash] a MongoDB +$or+ subexpression.
225
# Thin wrapper: builds the predicate for the "_wperm" column, passing
# both flags through to permission_predicate unchanged.
def self.write_predicate(permissions, include_public: true, include_missing: true)
  permission_predicate(
    "_wperm",
    permissions,
    include_public: include_public,
    include_missing: include_missing,
  )
end
228
228
 
229
229
  # @!visibility private
@@ -231,15 +231,19 @@ module Parse
231
231
  # Normalizes the permissions array (string-coerced, deduplicated,
232
232
  # +"*"+ appended when +include_public+) and returns the +$or+
233
233
  # subexpression.
234
# @param include_missing [Boolean] when true (default), the result also
#   matches documents where +field+ does not exist, via an extra
#   +{ field => { "$exists" => false } }+ branch under +$or+. When false,
#   only the membership test remains and it is returned bare as
#   +{ field => { "$in" => perms } }+, without the +$or+ wrapper.
def self.permission_predicate(field, permissions, include_public: true, include_missing: true)
  # Normalize: stringify, drop blanks, dedupe; then add "*" at most once.
  allowed = Array(permissions).map(&:to_s).reject(&:empty?).uniq
  allowed |= ["*"] if include_public

  membership = { field => { "$in" => allowed } }
  return membership unless include_missing

  { "$or" => [membership, { field => { "$exists" => false } }] }
end
244
248
  # Determines whether two ACLs or a Parse-ACL hash is equivalent to this object.
245
249
  # @example
@@ -2,6 +2,7 @@
2
2
  # frozen_string_literal: true
3
3
 
4
4
  require "digest"
5
+ require "time"
5
6
  require_relative "../../embeddings"
6
7
  require_relative "../vector"
7
8
 
@@ -108,9 +109,21 @@ module Parse
108
109
  #
109
110
  # `allow_insecure` is forwarded to {.validate_image_url!} for
110
111
  # image directives only; ignored for text.
112
+ #
113
+ # `source_mode` (image directives only) is `:url` (forward the
114
+ # validated URL to the provider — v5.1 behavior, requires the
115
+ # `trust_provider_url_fetch` sentinel) or `:bytes` (the SDK
116
+ # downloads via {Parse::File.safe_open_url}, magic-byte-verifies,
117
+ # EXIF-strips, and forwards base64 — v5.5). `exif_strip`
118
+ # (default true) applies to `:bytes` mode only.
119
+ #
120
+ # `meta_field` names the `:object` sibling that records
121
+ # provider/model/dimensions provenance on every recompute — the
122
+ # input {ClassMethods#reembed!} uses to find stale rows after a
123
+ # model migration.
111
124
  EmbedDirective = Struct.new(
112
125
  :sources, :into, :digest_field, :input_type, :provider_name,
113
- :modality, :allow_insecure,
126
+ :modality, :allow_insecure, :source_mode, :exif_strip, :meta_field,
114
127
  keyword_init: true,
115
128
  ) do
116
129
  def freeze
@@ -121,6 +134,10 @@ module Parse
121
134
  def image?
122
135
  modality == :image
123
136
  end
137
+
138
# True only when this directive's `source_mode` is `:bytes`.
def bytes_mode?
  source_mode.eql?(:bytes)
end
124
141
  end
125
142
 
126
143
  # @!visibility private
@@ -184,9 +201,17 @@ module Parse
184
201
  # @param digest_field [Symbol, nil] override for the digest
185
202
  # sibling property. Defaults to `:"#{into}_digest"`. Auto-
186
203
  # declared as `:string` if not already declared.
204
+ # @param meta_field [Symbol, nil] override for the provenance
205
+ # sibling property. Defaults to `:"#{into}_meta"`. Auto-
206
+ # declared as `:object` if not already declared; populated
207
+ # with `{ provider:, model:, dimensions:, modality:,
208
+ # embedded_at: }` on every recompute. Read by
209
+ # {ClassMethods#reembed!} to skip rows already embedded by
210
+ # the current provider/model.
187
211
  # @return [Symbol] the target vector field name.
188
212
  # @raise [InvalidEmbedDeclaration] on declaration-time misuse.
189
- def embed(*source_fields, into:, input_type: :search_document, digest_field: nil)
213
+ def embed(*source_fields, into:, input_type: :search_document, digest_field: nil,
214
+ meta_field: nil)
190
215
  if source_fields.empty?
191
216
  raise InvalidEmbedDeclaration,
192
217
  "#{self}.embed: at least one source field is required."
@@ -215,6 +240,10 @@ module Parse
215
240
  unless fields.key?(digest_field)
216
241
  property digest_field, :string
217
242
  end
243
+ meta_field = (meta_field || :"#{into}_meta").to_sym
244
+ unless fields.key?(meta_field)
245
+ property meta_field, :object
246
+ end
218
247
 
219
248
  directive = EmbedDirective.new(
220
249
  sources: sources,
@@ -222,6 +251,7 @@ module Parse
222
251
  digest_field: digest_field,
223
252
  input_type: input_type,
224
253
  provider_name: provider_name,
254
+ meta_field: meta_field,
225
255
  ).freeze
226
256
  embed_directives[into] = directive
227
257
 
@@ -243,12 +273,25 @@ module Parse
243
273
  # Declare a managed image embedding. Mirrors {.embed} but the
244
274
  # source field is a `:file` property (Parse::File) and the
245
275
  # provider call routes through {Parse::Embeddings::Provider#embed_image}
246
- # rather than `#embed_text`. v5.1 ships URL-only: the SDK
247
- # extracts the file's URL, validates it through
248
- # {Parse::Embeddings.validate_image_url!} (sentinel-gated egress
249
- # opt-in, CIDR / port / host allowlist), and forwards the
250
- # canonicalized URL to the provider. The SDK does NOT download
251
- # image bytes — bytes-fetch is the v5.3 path.
276
+ # rather than `#embed_text`. Two fetch modes (`source:`):
277
+ #
278
+ # * `:url` (default, v5.1 behavior) — the SDK extracts the
279
+ # file's URL, validates it through
280
+ # {Parse::Embeddings.validate_image_url!} (sentinel-gated
281
+ # egress opt-in, CIDR / port / host allowlist), and forwards
282
+ # the canonicalized URL to the provider, which performs its
283
+ # own fetch. The SDK does NOT download image bytes.
284
+ # * `:bytes` (v5.5) — the SDK downloads the image itself via
285
+ # {Parse::File.safe_open_url} (through
286
+ # {Parse::Embeddings::ImageFetch.fetch!}), verifies the
287
+ # content by magic-byte sniff against
288
+ # {Parse::Embeddings.allowed_image_types} (the Content-Type
289
+ # header is never trusted), strips EXIF/XMP metadata by
290
+ # default, and forwards the bytes to the provider as a
291
+ # base64 data URI. Does NOT require the
292
+ # `trust_provider_url_fetch` sentinel (no third-party URL
293
+ # egress), but the file's host must still be in
294
+ # {Parse::Embeddings.allowed_image_hosts}.
252
295
  #
253
296
  # **Digest is the URL string, not the file contents.** Replacing
254
297
  # the Parse::File with one pointing to a different URL re-embeds;
@@ -272,10 +315,21 @@ module Parse
272
315
  # @param allow_insecure [Boolean] forwarded to
273
316
  # {Parse::Embeddings.validate_image_url!}; permit `http://`
274
317
  # for local-dev CDN proxies. Default false.
318
+ # @param source [Symbol] `:url` (provider fetches; default) or
319
+ # `:bytes` (SDK fetches, verifies, strips, forwards base64).
320
+ # @param exif_strip [Boolean] strip EXIF/XMP metadata before
321
+ # forwarding bytes (default true; `:bytes` mode only —
322
+ # ignored for `:url`, where the SDK never sees the bytes).
323
+ # @param meta_field [Symbol, nil] override for the provenance
324
+ # sibling property. Defaults to `:"#{into}_meta"`; see {.embed}.
275
325
  # @return [Symbol] the target vector field name.
276
326
  # @raise [InvalidEmbedDeclaration] on declaration-time misuse.
277
327
  def embed_image(source_field, into:, input_type: :search_document,
278
- digest_field: nil, allow_insecure: false)
328
+ digest_field: nil, allow_insecure: false,
329
+ source: :url, exif_strip: true, meta_field: nil)
330
+ # Capture the fetch mode immediately — the legacy local
331
+ # `source = source_field.to_sym` below shadows the kwarg.
332
+ source_mode = source_mode_for_embed_image!(source)
279
333
  into = into.to_sym
280
334
  unless vector_properties.key?(into)
281
335
  raise InvalidEmbedDeclaration,
@@ -306,6 +360,10 @@ module Parse
306
360
  unless fields.key?(digest_field)
307
361
  property digest_field, :string
308
362
  end
363
+ meta_field = (meta_field || :"#{into}_meta").to_sym
364
+ unless fields.key?(meta_field)
365
+ property meta_field, :object
366
+ end
309
367
 
310
368
  directive = EmbedDirective.new(
311
369
  sources: [source],
@@ -315,6 +373,9 @@ module Parse
315
373
  provider_name: provider_name,
316
374
  modality: :image,
317
375
  allow_insecure: allow_insecure,
376
+ source_mode: source_mode,
377
+ exif_strip: exif_strip ? true : false,
378
+ meta_field: meta_field,
318
379
  ).freeze
319
380
  embed_directives[into] = directive
320
381
 
@@ -333,6 +394,126 @@ module Parse
333
394
  into
334
395
  end
335
396
 
397
+ # @!visibility private
398
+ # Validate the `source:` kwarg of {.embed_image}.
399
# @param source [Symbol, String] requested fetch mode.
# @return [Symbol] `:url` or `:bytes`.
# @raise [InvalidEmbedDeclaration] for any other value, including ones
#   that cannot be converted to a Symbol (nil, Integer, ...).
def source_mode_for_embed_image!(source)
  # Guard the conversion: a bare `source.to_sym` raises NoMethodError for
  # nil / numeric input, which would mask the declaration error below.
  mode = source.respond_to?(:to_sym) ? source.to_sym : nil
  return mode if %i[url bytes].include?(mode)

  raise InvalidEmbedDeclaration,
        "#{self}.embed_image: source: must be :url or :bytes (got #{source.inspect})."
end
407
+
408
+ # Re-embed records through the CURRENT provider/model — the bulk
409
+ # migration counterpart to {#embed_pending!} (which only fills
410
+ # null vectors). Use after changing a `:vector` property's
411
+ # `provider:` / `model:` / `dimensions:` declaration: walks the
412
+ # class with objectId-cursor pagination, clears each record's
413
+ # digest sibling so the `before_save` recompute cannot elide the
414
+ # provider call, and saves.
415
+ #
416
+ # With `only_stale: true`, rows whose `<into>_meta` provenance
417
+ # already matches the current provider name, model, and declared
418
+ # dimensions are skipped without a provider call — making the
419
+ # operation resumable: re-running after a partial failure only
420
+ # touches rows still carrying old-model vectors. Rows with no
421
+ # meta record (embedded before v5.5) always count as stale.
422
+ #
423
+ # Intended as an admin / maintenance operation: run it with a
424
+ # master-key client (or pass `save_opts:` carrying a
425
+ # `session_token:` that can write every row). Combine with
426
+ # {Parse::Embeddings::BatchEmbedder}-style pacing externally if
427
+ # the provider rate-limits — each record's save makes one
428
+ # provider call.
429
+ #
430
+ # @param field [Symbol, nil] limit to one embed target; nil
431
+ # processes every declared directive.
432
+ # @param batch_size [Integer] rows fetched per round (default 100).
433
+ # @param limit [Integer, nil] stop after re-embedding at most
434
+ # this many records across all directives; nil = no cap.
435
+ # @param where [Hash, nil] extra query constraints (e.g.
436
+ # `{ published: true }`).
437
+ # @param only_stale [Boolean] skip rows whose meta provenance
438
+ # matches the current provider/model/dimensions (default false
439
+ # — re-embed everything).
440
+ # @param save_opts [Hash] options forwarded to each `record.save`.
441
+ # @return [Integer] number of records re-embedded (saved).
442
+ # @raise [ArgumentError] when `field:` names no embed target, or
443
+ # the class declares no `embed` directives.
444
def reembed!(field: nil, batch_size: 100, limit: nil, where: nil,
             only_stale: false, save_opts: {})
  per_page = Integer(batch_size)
  raise ArgumentError, "#{self}.reembed!: batch_size must be positive." if per_page <= 0

  targets = resolve_embed_directives_for_backfill(field, caller_label: "reembed!")

  # Fold over the directives, carrying the running total; the remaining
  # budget handed to each directive shrinks by what earlier ones used.
  targets.reduce(0) do |done, directive|
    budget = limit ? (limit - done) : nil
    break done if budget && budget <= 0

    done + reembed_directive!(directive, per_page, where, budget, only_stale, save_opts)
  end
end
458
+
459
+ # @!visibility private
460
+ # objectId-cursor walk over ALL rows (subject to `where:`),
461
+ # clearing the digest so the save-path recompute re-embeds.
462
# @param remaining [Integer, nil] stop once this many records have been
#   saved successfully; nil means no cap.
# @return [Integer] number of records whose save returned a truthy
#   result. Records skipped as current, or whose save failed, are not
#   counted and do not consume the `remaining` budget.
def reembed_directive!(directive, batch_size, where, remaining, only_stale, save_opts)
  count = 0
  cursor = nil
  # Only computed when needed: the identity lookup resolves the provider.
  current = only_stale ? current_embed_identity(directive) : nil
  digest_setter = :"#{directive.digest_field}="
  loop do
    q = query
    q = q.where(where) if where.is_a?(Hash) && !where.empty?
    q = q.where(:objectId.gt => cursor) if cursor
    q.order(:objectId.asc)
    q.limit(batch_size)
    batch = q.results
    break if batch.nil? || batch.empty?

    batch.each do |record|
      # Advance the cursor first so skipped / failed rows are not refetched.
      cursor = record.id
      next if current && embed_meta_current?(record, directive, current)

      record.public_send(digest_setter, nil)
      # A failed save did not re-embed anything: counting it would
      # overstate the result and burn the caller's `limit` budget.
      next unless record.save(**save_opts)

      count += 1
      return count if remaining && count >= remaining
    end
    # A short page means the walk reached the end of the class.
    break if batch.length < batch_size
  end
  count
end
487
+
488
+ # @!visibility private
489
+ # The provenance tuple a freshly-embedded row would carry today.
490
def current_embed_identity(directive)
  provider_key = directive.provider_name
  model_name =
    begin
      Parse::Embeddings.provider(provider_key).model_name
    rescue Parse::Embeddings::ProviderNotRegistered
      # Listed ahead of the NotImplementedError clause and re-raised as-is,
      # so an unregistered provider is never turned into a nil model.
      raise
    rescue NotImplementedError
      # The provider does not report a model name; record it as unknown.
      nil
    end

  {
    "provider" => provider_key.to_s,
    "model" => model_name,
    "dimensions" => vector_properties.dig(directive.into, :dimensions),
  }
end
504
+
505
+ # @!visibility private
506
+ # True when the record's meta sibling matches `current` (so
507
+ # `only_stale: true` can skip it). Missing/foreign-shaped meta
508
+ # counts as stale.
509
# @param record [Object] row being inspected.
# @param directive [#meta_field] embed directive naming the meta sibling.
# @param current [Hash{String=>Object}] expected "provider" / "model" /
#   "dimensions" values; a nil expectation matches anything.
# @return [Boolean] false whenever the meta cannot be read as a Hash.
def embed_meta_current?(record, directive, current)
  reader = directive.meta_field
  # Mirror the respond_to? guard used when stamping/clearing meta: a
  # record without the accessor is simply stale, not a NoMethodError
  # that aborts the whole walk.
  return false unless reader && record.respond_to?(reader)

  meta = record.public_send(reader)
  return false unless meta.is_a?(Hash)

  %w[provider model dimensions].all? do |key|
    expected = current[key]
    # Accept String or Symbol keys in the stored hash.
    expected.nil? || meta[key] == expected || meta[key.to_sym] == expected
  end
end
516
+
336
517
  # Backfill embeddings for records whose managed vector field is
337
518
  # still null — the bulk counterpart to the per-save embed path.
338
519
  # Walks the class with objectId-cursor pagination (robust to the
@@ -372,18 +553,20 @@ module Parse
372
553
  end
373
554
 
374
555
  # @!visibility private
375
# `caller_label` is the public method name interpolated into the
# ArgumentError messages, so misuse of reembed! is not reported as
# misuse of embed_pending!. Returns an Array of directives: the single
# one registered under `field`, or every declared one when `field` is nil.
def resolve_embed_directives_for_backfill(field, caller_label: "embed_pending!")
  unless field
    declared = embed_directives.values
    raise ArgumentError, "#{self}.#{caller_label}: no `embed` directives declared." if declared.empty?

    return declared
  end

  match = embed_directives[field.to_sym]
  return [match] if match

  raise ArgumentError,
        "#{self}.#{caller_label}: :#{field} is not an embed target " \
        "(have #{embed_directives.keys.inspect})."
end
@@ -470,6 +653,7 @@ module Parse
470
653
  record.public_send(:"#{directive.into}=", nil)
471
654
  end
472
655
  record.public_send(:"#{directive.digest_field}=", nil)
656
+ clear_embed_meta(record, directive)
473
657
  end
474
658
  return
475
659
  end
@@ -498,6 +682,36 @@ module Parse
498
682
  record.public_send(:"#{directive.into}=", vector)
499
683
  end
500
684
  record.public_send(:"#{directive.digest_field}=", digest)
685
+ stamp_embed_meta(record, directive, provider, vector)
686
+ end
687
+
688
+ # @!visibility private
689
+ # Record provider/model provenance on the `<into>_meta` sibling so
690
+ # migration tooling ({ClassMethods#reembed!} `only_stale:`) can
691
+ # tell which model produced the stored vector. String keys —
692
+ # `:object` properties round-trip through JSON.
693
def self.stamp_embed_meta(record, directive, provider, vector)
  field = directive.meta_field
  return if field.nil?

  writer = :"#{field}="
  # Nothing to stamp when the record exposes no setter for the meta field.
  return unless record.respond_to?(writer)

  model_name =
    begin
      provider.model_name
    rescue NotImplementedError
      # The provider does not report a model name; store nil.
      nil
    end

  provenance = {
    "provider" => directive.provider_name.to_s,
    "model" => model_name,
    "dimensions" => vector.dimensions,
    "modality" => directive.image? ? "image" : "text",
    "embedded_at" => Time.now.utc.iso8601,
  }
  record.public_send(writer, provenance)
end
709
+
710
+ # @!visibility private
711
# Sets the directive's meta field on the record to nil. Does nothing when
# the directive names no meta field or the record has no setter for it.
def self.clear_embed_meta(record, directive)
  field = directive.meta_field
  writer = field && :"#{field}="
  record.public_send(writer, nil) if writer && record.respond_to?(writer)
end
502
716
 
503
717
  # @!visibility private
@@ -529,10 +743,25 @@ module Parse
529
743
  end
530
744
 
531
745
  # @!visibility private
532
- # Dispatch the provider call based on directive modality.
746
+ # Dispatch the provider call based on directive modality and (for
747
+ # images) fetch mode. `:bytes` mode downloads + verifies + strips
748
+ # through {Parse::Embeddings::ImageFetch.fetch!} and hands the
749
+ # provider a {Parse::Embeddings::ImageFetch::FetchedImage}; `:url`
750
+ # mode forwards the raw URL String (the provider validates and
751
+ # fetches it itself).
533
752
  def self.call_provider(provider, directive, input)
534
753
  if directive.image?
535
- provider.embed_image([input],
754
+ source =
755
+ if directive.bytes_mode?
756
+ Parse::Embeddings::ImageFetch.fetch!(
757
+ input,
758
+ allow_insecure: directive.allow_insecure ? true : false,
759
+ exif_strip: directive.exif_strip != false,
760
+ )
761
+ else
762
+ input
763
+ end
764
+ provider.embed_image([source],
536
765
  input_type: directive.input_type,
537
766
  allow_insecure: directive.allow_insecure ? true : false)
538
767
  else