parse-stack-next 5.4.1 → 5.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +489 -0
- data/Gemfile.lock +1 -1
- data/README.md +61 -9
- data/docs/atlas_vector_search_guide.md +318 -19
- data/lib/parse/acl_scope.rb +11 -0
- data/lib/parse/agent/mcp_rack_app.rb +53 -14
- data/lib/parse/agent/mcp_server.rb +19 -0
- data/lib/parse/api/path_segment.rb +31 -0
- data/lib/parse/api/users.rb +13 -0
- data/lib/parse/cache/redis.rb +55 -11
- data/lib/parse/client/caching.rb +12 -3
- data/lib/parse/client/logging.rb +9 -0
- data/lib/parse/client.rb +37 -3
- data/lib/parse/embeddings/batch_embedder.rb +188 -0
- data/lib/parse/embeddings/cache.rb +374 -0
- data/lib/parse/embeddings/cohere.rb +31 -18
- data/lib/parse/embeddings/image_fetch.rb +347 -0
- data/lib/parse/embeddings/provider.rb +17 -11
- data/lib/parse/embeddings/spend_cap.rb +117 -3
- data/lib/parse/embeddings/voyage.rb +34 -25
- data/lib/parse/embeddings.rb +40 -3
- data/lib/parse/model/acl.rb +15 -11
- data/lib/parse/model/core/embed_managed.rb +243 -14
- data/lib/parse/model/core/properties.rb +42 -5
- data/lib/parse/model/core/vector_searchable.rb +157 -8
- data/lib/parse/mongodb.rb +12 -0
- data/lib/parse/pipeline_security.rb +81 -15
- data/lib/parse/query/constraint.rb +22 -0
- data/lib/parse/query/constraints.rb +271 -250
- data/lib/parse/query.rb +284 -43
- data/lib/parse/retrieval/agent_tool.rb +21 -14
- data/lib/parse/retrieval/retriever.rb +84 -0
- data/lib/parse/schema/search_index_migrator.rb +48 -1
- data/lib/parse/stack/version.rb +1 -1
- data/lib/parse/stack.rb +12 -1
- data/lib/parse/vector_search/hybrid.rb +39 -1
- data/lib/parse/vector_search.rb +34 -0
- data/lib/parse/webhooks/payload.rb +7 -1
- data/lib/parse/webhooks.rb +107 -21
- metadata +4 -1
|
@@ -4,6 +4,7 @@
|
|
|
4
4
|
require "json"
|
|
5
5
|
require "securerandom"
|
|
6
6
|
require "digest"
|
|
7
|
+
require "uri"
|
|
7
8
|
require_relative "errors"
|
|
8
9
|
require_relative "mcp_dispatcher"
|
|
9
10
|
require_relative "mcp_subscriptions"
|
|
@@ -320,6 +321,7 @@ module Parse
|
|
|
320
321
|
pre_auth_rate_limiter: nil,
|
|
321
322
|
allowed_origins: nil,
|
|
322
323
|
require_custom_header: nil,
|
|
324
|
+
loopback_csrf_default: false,
|
|
323
325
|
resource_subscriptions: false,
|
|
324
326
|
subscription_manager: nil,
|
|
325
327
|
notifications: nil,
|
|
@@ -376,6 +378,16 @@ module Parse
|
|
|
376
378
|
@pre_auth_rate_limiter = pre_auth_rate_limiter
|
|
377
379
|
@allowed_origins = normalize_allowed_origins(allowed_origins)
|
|
378
380
|
@required_custom_header = normalize_required_custom_header(require_custom_header)
|
|
381
|
+
# NEW-9: when no explicit allowed_origins / require_custom_header CSRF
|
|
382
|
+
# gate is configured but the server was started on an unauthenticated
|
|
383
|
+
# loopback bind, default to a loopback-only Origin policy. A browser
|
|
384
|
+
# DNS-rebinding attack against 127.0.0.1 always carries an `Origin`
|
|
385
|
+
# header (the attacker page's origin), so refusing any present
|
|
386
|
+
# non-loopback Origin closes that vector — while native clients (curl,
|
|
387
|
+
# SDK-to-SDK) send NO Origin and stay allowed, and a legitimate local
|
|
388
|
+
# browser UI sends a loopback Origin and is allowed. Ignored when an
|
|
389
|
+
# explicit allowlist is configured (operator owns the policy then).
|
|
390
|
+
@loopback_csrf_default = loopback_csrf_default && @allowed_origins.nil?
|
|
379
391
|
@health_path = health_path.is_a?(String) && !health_path.empty? ? health_path : nil
|
|
380
392
|
# Per-app registry of in-flight cancellable requests. Keyed by
|
|
381
393
|
# [correlation_id, request_id]. A `notifications/cancelled` POST
|
|
@@ -660,12 +672,9 @@ module Parse
|
|
|
660
672
|
# Missing/empty `Origin` is allowed regardless — native
|
|
661
673
|
# clients (curl, SDK-to-SDK) shouldn't be broken by a
|
|
662
674
|
# CSRF defense aimed at browsers.
|
|
663
|
-
if
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
@logger&.warn("[Parse::Agent::MCPRackApp] Origin refused: #{origin.inspect}")
|
|
667
|
-
return [403, json_headers, [json_rpc_error(-32_700, "Origin not allowed")]]
|
|
668
|
-
end
|
|
675
|
+
if origin_refused?(env)
|
|
676
|
+
@logger&.warn("[Parse::Agent::MCPRackApp] Origin refused: #{env["HTTP_ORIGIN"].to_s.strip.inspect}")
|
|
677
|
+
return [403, json_headers, [json_rpc_error(-32_700, "Origin not allowed")]]
|
|
669
678
|
end
|
|
670
679
|
|
|
671
680
|
# 2c. Required custom header (CSRF defense-in-depth). A header
|
|
@@ -1051,14 +1060,11 @@ module Parse
|
|
|
1051
1060
|
return [400, json_headers, [json_rpc_error(-32_600, "Missing or invalid Mcp-Session-Id")]]
|
|
1052
1061
|
end
|
|
1053
1062
|
|
|
1054
|
-
# The origin
|
|
1055
|
-
# the same way it guards POST — a browser-driven
|
|
1056
|
-
# an SSE endpoint is the analogous CSRF surface.
|
|
1057
|
-
if
|
|
1058
|
-
|
|
1059
|
-
unless origin.empty? || origin_allowed?(origin)
|
|
1060
|
-
return [403, json_headers, [json_rpc_error(-32_700, "Origin not allowed")]]
|
|
1061
|
-
end
|
|
1063
|
+
# The origin policy (when configured, or the loopback default) guards
|
|
1064
|
+
# the listening stream the same way it guards POST — a browser-driven
|
|
1065
|
+
# cross-origin GET to an SSE endpoint is the analogous CSRF surface.
|
|
1066
|
+
if origin_refused?(env)
|
|
1067
|
+
return [403, json_headers, [json_rpc_error(-32_700, "Origin not allowed")]]
|
|
1062
1068
|
end
|
|
1063
1069
|
|
|
1064
1070
|
# Owner-binding: only the principal that established this session (or,
|
|
@@ -2119,6 +2125,39 @@ module Parse
|
|
|
2119
2125
|
# `@allowed_origins`. Comparison is case-insensitive on host and
|
|
2120
2126
|
# scheme. Wildcard via leading `.` matches subdomains:
|
|
2121
2127
|
# `.example.com` matches `app.example.com` and `example.com`.
|
|
2128
|
+
# Single chokepoint for the Origin CSRF gate, shared by the POST and
|
|
2129
|
+
# listening-stream paths. A missing/empty Origin (native clients: curl,
|
|
2130
|
+
# SDK-to-SDK) is always allowed — the CSRF surface is browser-only, and
|
|
2131
|
+
# browsers always send an Origin on cross-origin requests. When an
|
|
2132
|
+
# explicit allowlist is configured it wins; otherwise the loopback
|
|
2133
|
+
# default (NEW-9) refuses any present non-loopback Origin.
|
|
2134
|
+
def origin_refused?(env)
|
|
2135
|
+
origin = env["HTTP_ORIGIN"].to_s.strip
|
|
2136
|
+
return false if origin.empty?
|
|
2137
|
+
if @allowed_origins
|
|
2138
|
+
!origin_allowed?(origin)
|
|
2139
|
+
elsif @loopback_csrf_default
|
|
2140
|
+
!origin_is_loopback?(origin)
|
|
2141
|
+
else
|
|
2142
|
+
false
|
|
2143
|
+
end
|
|
2144
|
+
end
|
|
2145
|
+
|
|
2146
|
+
# True when `origin`'s host is a loopback address (any scheme/port).
|
|
2147
|
+
# Closes browser DNS-rebinding on an unauthenticated loopback bind: the
|
|
2148
|
+
# attacker page's Origin (e.g. http://evil.example) is not loopback and
|
|
2149
|
+
# is refused, while a real local UI on http://localhost:<port> passes.
|
|
2150
|
+
def origin_is_loopback?(origin)
|
|
2151
|
+
host = begin
|
|
2152
|
+
URI.parse(origin).host
|
|
2153
|
+
rescue URI::InvalidURIError, StandardError
|
|
2154
|
+
nil
|
|
2155
|
+
end
|
|
2156
|
+
return false if host.nil?
|
|
2157
|
+
host = host.downcase.delete_prefix("[").delete_suffix("]") # unwrap IPv6 brackets
|
|
2158
|
+
host == "localhost" || host == "127.0.0.1" || host == "::1"
|
|
2159
|
+
end
|
|
2160
|
+
|
|
2122
2161
|
def origin_allowed?(origin)
|
|
2123
2162
|
return false unless @allowed_origins
|
|
2124
2163
|
normalized = origin.downcase
|
|
@@ -162,11 +162,30 @@ module Parse
|
|
|
162
162
|
# pre_auth_rate_limiter: closes NEW-MCP-6 — runs before the factory
|
|
163
163
|
# is invoked so an empty or malformed body can't amplify into a
|
|
164
164
|
# Parse Server round-trip.
|
|
165
|
+
# NEW-9: on an unauthenticated loopback dev bind with no explicit CSRF
|
|
166
|
+
# gate configured, enable a loopback-only Origin policy by default to
|
|
167
|
+
# mitigate browser DNS-rebinding (a malicious page resolving a hostname
|
|
168
|
+
# to 127.0.0.1 and POSTing to the agent). The attacker page always
|
|
169
|
+
# carries a non-loopback Origin and is refused; native (no-Origin)
|
|
170
|
+
# clients and real local browser UIs are unaffected. Skipped when an
|
|
171
|
+
# API key is set (auth already gates) or the operator configured the
|
|
172
|
+
# Origin/custom-header gates themselves.
|
|
173
|
+
loopback_csrf_default =
|
|
174
|
+
LOOPBACK_HOSTS.include?(host.to_s) && @api_key.to_s.empty? &&
|
|
175
|
+
allowed_origins.nil? && require_custom_header.nil?
|
|
176
|
+
if loopback_csrf_default
|
|
177
|
+
warn "[Parse::Agent::MCPServer] Binding #{host}:#{port} without an API key. " \
|
|
178
|
+
"Enabling a loopback-only Origin policy to mitigate browser DNS-rebinding. " \
|
|
179
|
+
"For anything beyond local single-user dev set MCP_API_KEY (or pass api_key:), " \
|
|
180
|
+
"and/or configure allowed_origins:/require_custom_header:."
|
|
181
|
+
end
|
|
182
|
+
|
|
165
183
|
@rack_app = MCPRackApp.new(
|
|
166
184
|
agent_factory: method(:agent_factory),
|
|
167
185
|
pre_auth_rate_limiter: pre_auth_rate_limiter,
|
|
168
186
|
allowed_origins: allowed_origins,
|
|
169
187
|
require_custom_header: require_custom_header,
|
|
188
|
+
loopback_csrf_default: loopback_csrf_default,
|
|
170
189
|
)
|
|
171
190
|
end
|
|
172
191
|
|
|
@@ -45,6 +45,37 @@ module Parse
|
|
|
45
45
|
s
|
|
46
46
|
end
|
|
47
47
|
|
|
48
|
+
# Parse objectId pattern: 1–40 alphanumerics. Parse Server generates
|
|
49
|
+
# 10-char alphanumeric ids; the cap is generous to allow custom ids
|
|
50
|
+
# while still refusing path-traversal (`/`, `.`, `..`) and query
|
|
51
|
+
# injection (`?`, `&`, `=`). Mirrors Parse::API::Objects::OBJECT_ID_PATTERN.
|
|
52
|
+
OBJECT_ID_PATTERN = /\A[A-Za-z0-9]{1,40}\z/.freeze
|
|
53
|
+
|
|
54
|
+
# Validate a Parse objectId used in a REST path (`users/<id>`,
|
|
55
|
+
# `classes/<Class>/<id>`) and return it unchanged. Refuses anything that
|
|
56
|
+
# could traverse to a different endpoint or smuggle a query string when
|
|
57
|
+
# interpolated raw — e.g. a hostile/compromised Parse Server returning a
|
|
58
|
+
# crafted `objectId` like `../classes/_User?where=...` on a prior
|
|
59
|
+
# response that then rides the next fetch/update/delete with whatever
|
|
60
|
+
# credentials the call is authorized to send.
|
|
61
|
+
#
|
|
62
|
+
# @param value the objectId to validate (anything responding to `to_s`).
|
|
63
|
+
# @param kind [String] human-readable name for error messages.
|
|
64
|
+
# @return [String] the validated objectId.
|
|
65
|
+
# @raise [ArgumentError] if blank or it fails the pattern.
|
|
66
|
+
def object_id!(value, kind: "objectId")
|
|
67
|
+
s = value.to_s
|
|
68
|
+
if s.empty?
|
|
69
|
+
raise ArgumentError, "#{kind} must not be empty"
|
|
70
|
+
end
|
|
71
|
+
unless OBJECT_ID_PATTERN.match?(s)
|
|
72
|
+
raise ArgumentError,
|
|
73
|
+
"#{kind} #{s.inspect} contains characters not allowed in a Parse " \
|
|
74
|
+
"objectId. Must match /\\A[A-Za-z0-9]{1,40}\\z/."
|
|
75
|
+
end
|
|
76
|
+
s
|
|
77
|
+
end
|
|
78
|
+
|
|
48
79
|
# Parse trigger className pattern: a normal identifier, OR one of Parse
|
|
49
80
|
# Server's `@`-prefixed pseudo-classes (`@File` for file triggers,
|
|
50
81
|
# `@Connect` for the connection-global LiveQuery trigger). The optional
|
data/lib/parse/api/users.rb
CHANGED
|
@@ -26,6 +26,7 @@ module Parse
|
|
|
26
26
|
# @param headers [Hash] additional HTTP headers to send with the request.
|
|
27
27
|
# @return [Parse::Response]
|
|
28
28
|
def fetch_user(id, headers: {}, **opts)
|
|
29
|
+
id = Parse::API::PathSegment.object_id!(id)
|
|
29
30
|
request :get, "#{USER_PATH_PREFIX}/#{id}", headers: headers, opts: opts
|
|
30
31
|
end
|
|
31
32
|
|
|
@@ -74,6 +75,7 @@ module Parse
|
|
|
74
75
|
# @param headers [Hash] additional HTTP headers to send with the request.
|
|
75
76
|
# @return [Parse::Response]
|
|
76
77
|
def update_user(id, body = {}, headers: {}, **opts)
|
|
78
|
+
id = Parse::API::PathSegment.object_id!(id)
|
|
77
79
|
response = request :put, "#{USER_PATH_PREFIX}/#{id}", body: body, headers: headers, opts: opts
|
|
78
80
|
response.parse_class = Parse::Model::CLASS_USER
|
|
79
81
|
response
|
|
@@ -98,6 +100,7 @@ module Parse
|
|
|
98
100
|
# @param headers [Hash] additional HTTP headers to send with the request.
|
|
99
101
|
# @return [Parse::Response]
|
|
100
102
|
def delete_user(id, headers: {}, **opts)
|
|
103
|
+
id = Parse::API::PathSegment.object_id!(id)
|
|
101
104
|
request :delete, "#{USER_PATH_PREFIX}/#{id}", headers: headers, opts: opts
|
|
102
105
|
end
|
|
103
106
|
|
|
@@ -223,15 +226,25 @@ module Parse
|
|
|
223
226
|
# - code 205 (+ERROR_EMAIL_NOT_FOUND+) when +preventLoginWithUnverifiedEmail+
|
|
224
227
|
# is enabled and the account's email has not been verified.
|
|
225
228
|
#
|
|
229
|
+
# Client-side rate limited per username using the SAME bucket as {#login}
|
|
230
|
+
# (bare username, no namespace) — failures across both credential oracles
|
|
231
|
+
# accumulate, so an attacker cannot bypass a +login+ lockout by pivoting to
|
|
232
|
+
# this endpoint. The trade-off: a run of failed step-up re-auth calls counts
|
|
233
|
+
# toward (and can trigger) the primary login lockout for that username.
|
|
234
|
+
# Client-side limiting is a convenience, not a boundary — the server is the
|
|
235
|
+
# real control.
|
|
236
|
+
#
|
|
226
237
|
# @param username [String] the Parse user username.
|
|
227
238
|
# @param password [String] the Parse user's associated password.
|
|
228
239
|
# @param headers [Hash] additional HTTP headers to send with the request.
|
|
229
240
|
# @param opts [Hash] additional options to pass to the {Parse::Client} request.
|
|
230
241
|
# @return [Parse::Response]
|
|
231
242
|
def verify_password(username, password, headers: {}, **opts)
|
|
243
|
+
check_login_rate_limit!(username)
|
|
232
244
|
body = { username: username, password: password }
|
|
233
245
|
response = request :post, VERIFY_PASSWORD_PATH, body: body, headers: headers, opts: opts
|
|
234
246
|
response.parse_class = Parse::Model::CLASS_USER
|
|
247
|
+
track_login_attempt(username, response.success?)
|
|
235
248
|
response
|
|
236
249
|
end
|
|
237
250
|
|
data/lib/parse/cache/redis.rb
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# frozen_string_literal: true
|
|
3
3
|
|
|
4
4
|
require "moneta"
|
|
5
|
+
require "json"
|
|
5
6
|
require_relative "pool"
|
|
6
7
|
|
|
7
8
|
module Parse
|
|
@@ -82,6 +83,20 @@ module Parse
|
|
|
82
83
|
# session-scoped REST responses outlive their token's
|
|
83
84
|
# validity. Callers can still pass `expires: false` to opt out.
|
|
84
85
|
merged_options = { expires: true }.merge(moneta_options)
|
|
86
|
+
# SECURITY: disable Moneta's value serializer so cached values are NOT
|
|
87
|
+
# Marshal-encoded. We JSON-(de)serialize values ourselves in #store /
|
|
88
|
+
# #[] (see #encode_value / #decode_value). The default Moneta-Redis
|
|
89
|
+
# value serializer is Marshal, which would `Marshal.load` whatever
|
|
90
|
+
# bytes come back from Redis on every cache hit — an arbitrary-code-
|
|
91
|
+
# execution primitive if the Redis cache is shared, unauthenticated,
|
|
92
|
+
# or reachable through a plaintext `redis://` MITM. Forcing nil here
|
|
93
|
+
# (overriding any caller-supplied `value_serializer:`/`serializer:`)
|
|
94
|
+
# keeps that gadget-deserialization vector closed regardless of how
|
|
95
|
+
# the wrapper is configured. Keys keep the default (:marshal) encoding:
|
|
96
|
+
# they are only ever written and SCAN/DEL-compared as opaque strings,
|
|
97
|
+
# never `Marshal.load`ed from Redis content, so they are not a
|
|
98
|
+
# deserialization vector.
|
|
99
|
+
merged_options = merged_options.merge(value_serializer: nil)
|
|
85
100
|
@moneta_options = merged_options
|
|
86
101
|
@closed = false
|
|
87
102
|
@pool = Pool.new(size: pool_size, timeout: pool_timeout) do
|
|
@@ -90,7 +105,7 @@ module Parse
|
|
|
90
105
|
end
|
|
91
106
|
|
|
92
107
|
def [](key)
|
|
93
|
-
@pool[key]
|
|
108
|
+
decode_value(@pool[key])
|
|
94
109
|
end
|
|
95
110
|
|
|
96
111
|
def key?(key)
|
|
@@ -102,15 +117,18 @@ module Parse
|
|
|
102
117
|
end
|
|
103
118
|
|
|
104
119
|
def store(key, value, options = {})
|
|
105
|
-
@pool.store(key, value, options)
|
|
120
|
+
@pool.store(key, encode_value(value), options)
|
|
106
121
|
end
|
|
107
122
|
|
|
108
123
|
# Atomic SETNX. Required so `Parse::CreateLock` can acquire
|
|
109
124
|
# cross-process locks when this wrapper is the configured cache /
|
|
110
125
|
# `synchronize_create_store`. Returns `true` only when the key did
|
|
111
|
-
# not already exist.
|
|
126
|
+
# not already exist. The value goes through the same JSON encoding
|
|
127
|
+
# as {#store} so a later {#[]} read round-trips instead of decoding
|
|
128
|
+
# to nil. (Parse::LockBackend never hits this path on this wrapper —
|
|
129
|
+
# it prefers the raw-Redis {#lock_acquire}/{#lock_release} pair.)
|
|
112
130
|
def create(key, value, options = {})
|
|
113
|
-
@pool.create(key, value, options)
|
|
131
|
+
@pool.create(key, encode_value(value), options)
|
|
114
132
|
end
|
|
115
133
|
|
|
116
134
|
# Atomic counter increment. Forwarded for Moneta surface parity.
|
|
@@ -135,14 +153,14 @@ module Parse
|
|
|
135
153
|
# Atomically acquire a lock: SET key=owner only if absent, with a
|
|
136
154
|
# native expiry. Used by {Parse::LockBackend} for {Parse::Lock} and
|
|
137
155
|
# {Parse::CreateLock}. Deliberately bypasses Moneta's `create` —
|
|
138
|
-
# `Moneta.new(:Redis)` marshals
|
|
139
|
-
# compare-and-delete on
|
|
140
|
-
# coupled to Moneta's serializer config. Routing acquire
|
|
141
|
-
# through plain-string raw Redis here keeps one consistent
|
|
142
|
-
# across both ends of the lock and makes the keys human-
|
|
143
|
-
# in Redis (`parse-stack:lock:v1:<digest>`). Lock keys are
|
|
156
|
+
# `Moneta.new(:Redis)` marshals keys (and, by default, values), so a
|
|
157
|
+
# raw-Redis compare-and-delete on a Moneta-encoded blob would be
|
|
158
|
+
# fragile and coupled to Moneta's serializer config. Routing acquire
|
|
159
|
+
# AND release through plain-string raw Redis here keeps one consistent
|
|
160
|
+
# encoding across both ends of the lock and makes the keys human-
|
|
161
|
+
# inspectable in Redis (`parse-stack:lock:v1:<digest>`). Lock keys are
|
|
144
162
|
# short-lived (TTL ≤ 30s) so there is no migration concern when a
|
|
145
|
-
# deploy flips
|
|
163
|
+
# deploy flips encodings.
|
|
146
164
|
#
|
|
147
165
|
# @param key [String] plain-string lock key.
|
|
148
166
|
# @param owner [String] unique-per-acquisition owner token.
|
|
@@ -222,6 +240,32 @@ module Parse
|
|
|
222
240
|
|
|
223
241
|
private
|
|
224
242
|
|
|
243
|
+
# Serialize a cache value to a JSON String before handing it to Moneta
|
|
244
|
+
# (which stores it raw, since the value serializer is disabled — see the
|
|
245
|
+
# constructor). JSON is used instead of Marshal so the read side never
|
|
246
|
+
# `Marshal.load`s attacker-influenced Redis bytes. Cache values written
|
|
247
|
+
# by the caching middleware are `{ "headers" => ..., "body" => ... }`
|
|
248
|
+
# hashes of strings, which round-trip losslessly through JSON.
|
|
249
|
+
def encode_value(value)
|
|
250
|
+
JSON.generate(value)
|
|
251
|
+
end
|
|
252
|
+
|
|
253
|
+
# Decode a JSON String read back from Moneta/Redis. Returns nil on a
|
|
254
|
+
# miss or on any value that is not valid JSON — most importantly, legacy
|
|
255
|
+
# Marshal-encoded entries written before this wrapper switched to JSON.
|
|
256
|
+
# Treating an undecodable value as a miss makes the caller refetch and
|
|
257
|
+
# re-store it in the JSON format, and ensures a hostile non-JSON blob can
|
|
258
|
+
# at worst yield a cache miss, never a deserialized Ruby object graph.
|
|
259
|
+
def decode_value(raw)
|
|
260
|
+
return nil if raw.nil?
|
|
261
|
+
JSON.parse(raw)
|
|
262
|
+
rescue JSON::ParserError, EncodingError, TypeError
|
|
263
|
+
# ParserError covers malformed and hostile-depth JSON
|
|
264
|
+
# (JSON::NestingError subclasses it); TypeError covers a
|
|
265
|
+
# non-String blob from a misconfigured store. All are misses.
|
|
266
|
+
nil
|
|
267
|
+
end
|
|
268
|
+
|
|
225
269
|
def delete_keys_matching!(pattern)
|
|
226
270
|
@pool.pool.with do |store|
|
|
227
271
|
redis = backend_client(store)
|
data/lib/parse/client/caching.rb
CHANGED
|
@@ -190,8 +190,13 @@ module Parse
|
|
|
190
190
|
body = cache_data.respond_to?(:body) ? cache_data.body : nil
|
|
191
191
|
response_headers = cache_data.response_headers || {}
|
|
192
192
|
elsif cache_data.is_a?(Hash)
|
|
193
|
-
|
|
194
|
-
|
|
193
|
+
# New entries are stored with string keys so they survive a
|
|
194
|
+
# JSON round-trip (the Redis cache wrapper serializes values as
|
|
195
|
+
# JSON, not Marshal — see Parse::Cache::Redis). Fall back to
|
|
196
|
+
# symbol keys for legacy in-memory / Marshal-backed entries
|
|
197
|
+
# written before that switch.
|
|
198
|
+
body = cache_data["body"] || cache_data[:body]
|
|
199
|
+
response_headers = cache_data["headers"] || cache_data[:headers] || {}
|
|
195
200
|
end
|
|
196
201
|
|
|
197
202
|
if cache_data.present? && body.present?
|
|
@@ -244,8 +249,12 @@ module Parse
|
|
|
244
249
|
response_env.body.present? && response_env.response_headers[CONTENT_LENGTH_KEY].to_i.between?(20, 1_250_000)
|
|
245
250
|
store_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
|
|
246
251
|
begin
|
|
252
|
+
# Store with string keys (and a plain Hash of headers) so the
|
|
253
|
+
# value round-trips losslessly through the Redis cache wrapper's
|
|
254
|
+
# JSON serialization. The read path above reads string keys first
|
|
255
|
+
# with a symbol-key fallback for legacy entries.
|
|
247
256
|
@store.store(@cache_key,
|
|
248
|
-
{ headers
|
|
257
|
+
{ "headers" => response_env.response_headers.to_h, "body" => response_env.body },
|
|
249
258
|
expires: @expires)
|
|
250
259
|
duration_ms = ((Process.clock_gettime(Process::CLOCK_MONOTONIC) - store_start) * 1000.0).round(3)
|
|
251
260
|
instrument_cache(:store, method: method, url_path: url_path, duration_ms: duration_ms)
|
data/lib/parse/client/logging.rb
CHANGED
|
@@ -186,6 +186,15 @@ module Parse
|
|
|
186
186
|
end
|
|
187
187
|
end
|
|
188
188
|
|
|
189
|
+
# Scrub credentials before logging. At :debug level this method emits
|
|
190
|
+
# both the request body (login/signup carries a cleartext `password`)
|
|
191
|
+
# and the response body (auth responses carry a fresh `sessionToken`,
|
|
192
|
+
# `authData`, and MFA secrets). `log_headers` already redacts headers;
|
|
193
|
+
# the body path must use the same canonical scrubber or it leaks live
|
|
194
|
+
# credentials to anyone with log access. Redact BEFORE the length cap
|
|
195
|
+
# so truncation can't split a token across the boundary and slip past.
|
|
196
|
+
content = Parse::Middleware::BodyBuilder.redact(content)
|
|
197
|
+
|
|
189
198
|
if content.length > max_length
|
|
190
199
|
logger.debug " [#{prefix} Body] #{content[0...max_length]}... (truncated, #{content.length} total)"
|
|
191
200
|
elsif content.length > 0
|
data/lib/parse/client.rb
CHANGED
|
@@ -716,10 +716,26 @@ module Parse
|
|
|
716
716
|
warn "[Parse::Client] Cache store provided but :expires is not set or is 0. " \
|
|
717
717
|
"Caching will be disabled. Set :expires to enable caching (e.g., expires: 10)."
|
|
718
718
|
else
|
|
719
|
-
# advanced: provide a REDIS url, we'll configure a
|
|
719
|
+
# advanced: provide a REDIS url, we'll configure a Redis store.
|
|
720
720
|
if opts[:cache].is_a?(String) && opts[:cache].starts_with?("redis://")
|
|
721
721
|
begin
|
|
722
|
-
|
|
722
|
+
# Eagerly load the redis adapter so a missing `redis` gem
|
|
723
|
+
# fails fast here (at setup) with the friendly hint below,
|
|
724
|
+
# rather than deferring to the first cache access — the
|
|
725
|
+
# Parse::Cache::Redis pool builds its Moneta-Redis backends
|
|
726
|
+
# lazily, so without this the LoadError would surface later.
|
|
727
|
+
require "moneta/adapters/redis"
|
|
728
|
+
# Route through Parse::Cache::Redis rather than a bare
|
|
729
|
+
# `Moneta.new(:Redis, ...)`. SECURITY: the Moneta-Redis store
|
|
730
|
+
# Marshals values by default, so every cache hit would
|
|
731
|
+
# `Marshal.load` whatever bytes come back from Redis — an
|
|
732
|
+
# arbitrary-code-execution primitive if the cache is shared,
|
|
733
|
+
# unauthenticated, or reachable over a plaintext `redis://`
|
|
734
|
+
# MITM. The wrapper forces `value_serializer: nil` and
|
|
735
|
+
# JSON-(de)serializes cached values itself, closing that
|
|
736
|
+
# deserialization vector on this shorthand the same way an
|
|
737
|
+
# explicitly-constructed wrapper does.
|
|
738
|
+
opts[:cache] = Parse::Cache::Redis.new(url: opts[:cache])
|
|
723
739
|
rescue LoadError
|
|
724
740
|
puts "[Parse::Middleware::Caching] Did you forget to load the redis gem (Gemfile)?"
|
|
725
741
|
raise
|
|
@@ -1425,6 +1441,22 @@ module Parse
|
|
|
1425
1441
|
# Object/Pointer envelope is converted, and an Object of an UNregistered class
|
|
1426
1442
|
# is left as a raw Hash (building it would degrade to a field-less Pointer).
|
|
1427
1443
|
# Plain Hashes and arbitrary `__type` app data pass through untouched.
|
|
1444
|
+
#
|
|
1445
|
+
# SECURITY — cloud results are treated as server-authoritative. The
|
|
1446
|
+
# `__type:"Object"` decode in {._decode_cloud_value} routes through
|
|
1447
|
+
# +Parse::Object.build+, which hydrates with trusted-init — the SAME path
|
|
1448
|
+
# used to decode every query / +.fetch+ result. Trusted-init skips the
|
|
1449
|
+
# +PROTECTED_INITIALIZE_KEYS+ filter, so credential-shaped keys
|
|
1450
|
+
# (+sessionToken+, +authData+, +_rperm+, +_wperm+, +roles+, …) present in a
|
|
1451
|
+
# cloud function's return value populate the in-memory object, exactly as they
|
|
1452
|
+
# do for any other server response. This is by design: the payload is authored
|
|
1453
|
+
# by your Cloud Code and the request is caller-authenticated, and making cloud
|
|
1454
|
+
# results filter these keys would make them inconsistent with (and stricter
|
|
1455
|
+
# than) query/+.fetch+ hydration — e.g. a cloud function returning
|
|
1456
|
+
# +request.user+ would come back missing its +sessionToken+. If a cloud
|
|
1457
|
+
# function is expected to echo back third-party-influenced data, call it with
|
|
1458
|
+
# +raw: true+ (+Parse.call_function(name, body, raw: true)+) to receive the
|
|
1459
|
+
# undecoded response and sanitize it yourself before building objects.
|
|
1428
1460
|
def self._extract_cloud_result(response)
|
|
1429
1461
|
r = response.result
|
|
1430
1462
|
value = r.is_a?(Hash) ? r["result"] : r
|
|
@@ -1568,7 +1600,9 @@ module Parse
|
|
|
1568
1600
|
# specific {Parse::Error} subclasses as the underlying client does.
|
|
1569
1601
|
# @param name (see Parse.call_function)
|
|
1570
1602
|
# @param body (see Parse.call_function)
|
|
1571
|
-
# @param opts (see Parse.call_function) —
|
|
1603
|
+
# @param opts (see Parse.call_function) — +:raw+ has no effect; this method
|
|
1604
|
+
# always decodes the result. Use {Parse.call_function} with +raw: true+ if
|
|
1605
|
+
# you need the undecoded response.
|
|
1572
1606
|
# @raise [Parse::Error::CloudCodeError] when the response indicates a cloud-code error.
|
|
1573
1607
|
# @return [Object] the result data of the response.
|
|
1574
1608
|
def self.call_function!(name, body = {}, **opts)
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
module Parse
|
|
5
|
+
module Embeddings
|
|
6
|
+
# Batch-level orchestration for bulk embedding jobs.
|
|
7
|
+
#
|
|
8
|
+
# {Provider#embed_text_batched} only slices input into
|
|
9
|
+
# provider-sized chunks; any retry/backoff lives inside each
|
|
10
|
+
# provider's single HTTP call. That is the wrong layer for bulk
|
|
11
|
+
# work: a 50k-document backfill needs *batch-level* pacing (stay
|
|
12
|
+
# under the provider's requests-per-minute budget across calls) and
|
|
13
|
+
# *batch-level* backoff (a 429 after the provider's internal retries
|
|
14
|
+
# are exhausted should pause the whole job, not kill it).
|
|
15
|
+
# {BatchEmbedder} wraps any registered provider with both.
|
|
16
|
+
#
|
|
17
|
+
# @example Backfill with pacing and backoff
|
|
18
|
+
# embedder = Parse::Embeddings::BatchEmbedder.new(
|
|
19
|
+
# Parse::Embeddings.provider(:openai),
|
|
20
|
+
# requests_per_minute: 60,
|
|
21
|
+
# max_attempts: 5,
|
|
22
|
+
# )
|
|
23
|
+
# vectors = embedder.embed_text(texts, input_type: :search_document)
|
|
24
|
+
#
|
|
25
|
+
# @example Progress reporting
|
|
26
|
+
# embedder = Parse::Embeddings::BatchEmbedder.new(provider,
|
|
27
|
+
# on_progress: ->(done:, total:, batch_index:, batch_count:) {
|
|
28
|
+
# puts "#{done}/#{total}"
|
|
29
|
+
# })
|
|
30
|
+
#
|
|
31
|
+
# == Retry classification
|
|
32
|
+
#
|
|
33
|
+
# By default a batch is retried when the provider raises a
|
|
34
|
+
# {Parse::Embeddings::Error} subclass whose class name ends in
|
|
35
|
+
# `RateLimitError` or `TransientError` — the convention every
|
|
36
|
+
# bundled provider follows (`OpenAI::RateLimitError`,
|
|
37
|
+
# `Voyage::TransientError`, …). Pass `retry_on:` with explicit
|
|
38
|
+
# exception classes to override. Non-retryable errors (auth,
|
|
39
|
+
# bad-request, response-contract violations) propagate immediately.
|
|
40
|
+
#
|
|
41
|
+
# Vectors are returned aligned 1:1 with the input, identical to
|
|
42
|
+
# `embed_text` on the wrapped provider.
|
|
43
|
+
class BatchEmbedder
|
|
44
|
+
# Raised when a batch still fails after `max_attempts` retryable
|
|
45
|
+
# failures. Wraps the final provider error in `#cause` and carries
|
|
46
|
+
# the index of the failing batch so a resumable job knows where to
|
|
47
|
+
# pick up.
|
|
48
|
+
class BatchFailed < Parse::Embeddings::Error
|
|
49
|
+
# @return [Integer] zero-based index of the failing batch.
|
|
50
|
+
attr_reader :batch_index
|
|
51
|
+
# @return [Integer] number of inputs successfully embedded before the failure.
|
|
52
|
+
attr_reader :completed_count
|
|
53
|
+
|
|
54
|
+
def initialize(message, batch_index:, completed_count:)
|
|
55
|
+
@batch_index = batch_index
|
|
56
|
+
@completed_count = completed_count
|
|
57
|
+
super(message)
|
|
58
|
+
end
|
|
59
|
+
end
|
|
60
|
+
|
|
61
|
+
RETRYABLE_NAME_SUFFIXES = %w[RateLimitError TransientError].freeze
|
|
62
|
+
|
|
63
|
+
# @return [Provider] the wrapped provider.
|
|
64
|
+
attr_reader :provider
|
|
65
|
+
|
|
66
|
+
# @param provider [Provider] any registered embedding provider.
|
|
67
|
+
# @param batch_size [Integer, nil] inputs per provider call.
|
|
68
|
+
# Defaults to the provider's own {Provider#embed_batch_size}
|
|
69
|
+
# hint, falling back to 64 when the provider has none.
|
|
70
|
+
# @param requests_per_minute [Numeric, nil] batch-level pacing
|
|
71
|
+
# budget. When set, consecutive provider calls are spaced at
|
|
72
|
+
# least `60.0 / requests_per_minute` seconds apart. nil disables
|
|
73
|
+
# pacing.
|
|
74
|
+
# @param max_attempts [Integer] attempts per batch (1 = no retry).
|
|
75
|
+
# @param base_delay [Numeric] first backoff delay in seconds;
|
|
76
|
+
# doubles per attempt.
|
|
77
|
+
# @param max_delay [Numeric] backoff ceiling in seconds.
|
|
78
|
+
# @param jitter [Numeric] random multiplier range added to each
|
|
79
|
+
# delay (`delay * (1 + rand * jitter)`); spreads thundering
|
|
80
|
+
# herds when several workers back off together.
|
|
81
|
+
# @param retry_on [Array<Class>, nil] explicit retryable exception
|
|
82
|
+
# classes; nil uses the name-suffix convention described above.
|
|
83
|
+
# @param on_progress [#call, nil] callable invoked after each
|
|
84
|
+
# successful batch with `done:, total:, batch_index:, batch_count:`.
|
|
85
|
+
def initialize(provider, batch_size: nil, requests_per_minute: nil,
               max_attempts: 5, base_delay: 2.0, max_delay: 60.0,
               jitter: 0.25, retry_on: nil, on_progress: nil)
  # Wraps +provider+ and normalizes every tuning knob once, so invalid
  # settings fail here with ArgumentError instead of part-way through a run.
  unless provider.is_a?(Provider)
    raise ArgumentError,
          "Parse::Embeddings::BatchEmbedder expects a Parse::Embeddings::Provider " \
          "(got #{provider.class})."
  end
  @provider = provider

  # nil keeps the size undecided until embed_text resolves it.
  @batch_size = batch_size ? Integer(batch_size) : nil
  raise ArgumentError, "batch_size must be positive" if @batch_size && @batch_size <= 0

  # Pacing interval in seconds between provider calls; nil disables pacing.
  # A zero rate would divide to an infinite interval and a negative rate to
  # a negative one (which silently disables pacing), so both are rejected.
  @min_interval = nil
  if requests_per_minute
    rate = Float(requests_per_minute)
    raise ArgumentError, "requests_per_minute must be positive" unless rate > 0
    @min_interval = 60.0 / rate
  end

  @max_attempts = Integer(max_attempts)
  raise ArgumentError, "max_attempts must be >= 1" if @max_attempts < 1

  # Negative delays would only surface later, as an ArgumentError raised by
  # Kernel#sleep in the middle of a retry; reject them up front.
  @base_delay = Float(base_delay)
  raise ArgumentError, "base_delay must be >= 0" unless @base_delay >= 0
  @max_delay = Float(max_delay)
  raise ArgumentError, "max_delay must be >= 0" unless @max_delay >= 0
  @jitter = Float(jitter)

  # nil means "use the class-name suffix convention"; anything else is
  # coerced to a list of exception classes.
  @retry_on = retry_on && Array(retry_on)
  @on_progress = on_progress

  # Monotonic timestamp of the most recent provider call; nil until the
  # first call is paced.
  @last_call_at = nil
end
|
|
106
|
+
|
|
107
|
+
# Embed `strings` through the wrapped provider with pacing and
|
|
108
|
+
# batch-level backoff.
|
|
109
|
+
#
|
|
110
|
+
# @param strings [Array<String>]
|
|
111
|
+
# @param input_type [Symbol]
|
|
112
|
+
# @return [Array<Array<Float>>] aligned 1:1 with `strings`.
|
|
113
|
+
# @raise [BatchFailed] when a batch exhausts its attempts.
|
|
114
|
+
def embed_text(strings, input_type: :search_document)
  # Only Arrays are accepted; an empty Array short-circuits without
  # touching the provider.
  if strings.is_a?(Array)
    return [] if strings.empty?
  else
    raise ArgumentError,
          "Parse::Embeddings::BatchEmbedder#embed_text expects Array<String> " \
          "(got #{strings.class})."
  end

  # Explicit size wins, then the provider's hint, then 64.
  slice_size = @batch_size || @provider.embed_batch_size || 64
  chunks = strings.each_slice(slice_size).to_a
  chunk_total = chunks.length

  # Accumulate vectors chunk by chunk; the accumulator's length doubles as
  # the "completed so far" count handed to run_batch.
  chunks.each_with_index.with_object([]) do |(chunk, position), vectors|
    vectors.concat(run_batch(chunk, input_type, position, vectors.length))
    next unless @on_progress

    @on_progress.call(done: vectors.length, total: strings.length,
                      batch_index: position, batch_count: chunk_total)
  end
end
|
|
134
|
+
|
|
135
|
+
private
|
|
136
|
+
|
|
137
|
+
def run_batch(batch, input_type, batch_index, completed_count)
  # Counts attempts upward from 1 with no upper bound of its own; each pass
  # ends by returning the provider's result, re-raising a non-retryable
  # error, raising BatchFailed once the attempt budget is spent, or
  # sleeping before the next pass.
  1.step do |attempt|
    begin
      pace!
      return @provider.embed_text(batch, input_type: input_type)
    rescue StandardError => failure
      # Errors outside the retry policy propagate untouched.
      raise unless retryable?(failure)

      if attempt >= @max_attempts
        raise BatchFailed.new(
          "Parse::Embeddings::BatchEmbedder: batch #{batch_index} failed after " \
          "#{attempt} attempt(s) — #{failure.class}: #{failure.message}",
          batch_index: batch_index, completed_count: completed_count,
        )
      end

      sleep(backoff_delay(attempt))
    end
  end
end
|
|
156
|
+
|
|
157
|
+
def retryable?(error)
  # An explicit class list, when configured, is the sole authority.
  allow_list = @retry_on
  return allow_list.any? { |candidate| error.is_a?(candidate) } if allow_list

  # Otherwise only embeddings errors qualify, and only when their class
  # name ends with one of the retryable suffixes.
  if error.is_a?(Parse::Embeddings::Error)
    class_name = error.class.name.to_s
    class_name.end_with?(*RETRYABLE_NAME_SUFFIXES)
  else
    false
  end
end
|
|
165
|
+
|
|
166
|
+
def backoff_delay(attempt)
  # The delay doubles with every attempt after the first and is capped at
  # @max_delay before jitter is applied.
  doublings = attempt - 1
  capped = [@base_delay * (2**doublings), @max_delay].min

  # Jitter scales the capped delay by 1 plus a random fraction of @jitter,
  # so the result can exceed @max_delay.
  jitter_factor = 1.0 + rand * @jitter
  capped * jitter_factor
end
|
|
170
|
+
|
|
171
|
+
# Enforce the inter-call interval. Measured from the START of the
|
|
172
|
+
# previous call so a slow provider response counts toward the
|
|
173
|
+
# interval rather than stacking on top of it.
|
|
174
|
+
def pace!
  # No interval configured means pacing is off.
  return if @min_interval.nil?

  monotonic_now = -> { Process.clock_gettime(Process::CLOCK_MONOTONIC) }
  stamp = monotonic_now.call

  if @last_call_at
    # Time still owed before the next call may start.
    remaining = (@last_call_at + @min_interval) - stamp
    if remaining > 0
      sleep(remaining)
      # Re-read the clock so the recorded start reflects the wait.
      stamp = monotonic_now.call
    end
  end

  @last_call_at = stamp
end
|
|
186
|
+
end
|
|
187
|
+
end
|
|
188
|
+
end
|