parse-stack-next 5.1.1 → 5.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.env.sample +12 -0
- data/.env.test +4 -4
- data/CHANGELOG.md +545 -0
- data/Gemfile +3 -0
- data/Gemfile.lock +6 -1
- data/README.md +167 -38
- data/Rakefile +56 -10
- data/docs/atlas_vector_search_guide.md +110 -9
- data/docs/mcp_guide.md +433 -0
- data/docs/mongodb_direct_guide.md +66 -1
- data/docs/mongodb_index_optimization_guide.md +22 -1
- data/docs/usage_guide.md +15 -0
- data/lib/parse/agent/approval_gate.rb +0 -0
- data/lib/parse/agent/constraint_translator.rb +90 -19
- data/lib/parse/agent/describe.rb +1 -0
- data/lib/parse/agent/errors.rb +16 -0
- data/lib/parse/agent/mcp_client.rb +9 -0
- data/lib/parse/agent/mcp_dispatcher.rb +139 -7
- data/lib/parse/agent/mcp_rack_app.rb +621 -17
- data/lib/parse/agent/mcp_subscriptions.rb +607 -0
- data/lib/parse/agent/metadata_dsl.rb +58 -0
- data/lib/parse/agent/metadata_registry.rb +141 -1
- data/lib/parse/agent/prompt_hardening.rb +213 -0
- data/lib/parse/agent/result_formatter.rb +18 -3
- data/lib/parse/agent/tools.rb +167 -24
- data/lib/parse/agent.rb +692 -21
- data/lib/parse/client/request.rb +55 -4
- data/lib/parse/client/response.rb +4 -0
- data/lib/parse/client.rb +205 -7
- data/lib/parse/model/classes/installation.rb +27 -10
- data/lib/parse/model/classes/user.rb +8 -0
- data/lib/parse/model/core/actions.rb +58 -4
- data/lib/parse/model/core/embed_managed.rb +19 -14
- data/lib/parse/model/core/indexing.rb +108 -16
- data/lib/parse/model/core/querying.rb +29 -0
- data/lib/parse/model/model.rb +34 -3
- data/lib/parse/model/object.rb +1 -0
- data/lib/parse/query.rb +90 -24
- data/lib/parse/retrieval/agent_tool.rb +369 -0
- data/lib/parse/retrieval/chunk.rb +74 -0
- data/lib/parse/retrieval/chunker.rb +208 -0
- data/lib/parse/retrieval/retriever.rb +274 -0
- data/lib/parse/retrieval.rb +10 -0
- data/lib/parse/schema.rb +69 -20
- data/lib/parse/stack/version.rb +2 -2
- data/parse-stack-next.gemspec +1 -1
- data/scripts/docker/docker-compose.atlas.yml +14 -10
- data/scripts/docker/docker-compose.test.yml +24 -20
- data/scripts/docker/mongo-init.js +3 -3
- data/scripts/start-parse.sh +10 -0
- data/scripts/start_mcp_server.rb +1 -1
- data/scripts/test_server_connection.rb +1 -1
- data/scripts/vector_prototype/create_vector_index.js +1 -1
- data/scripts/vector_prototype/fetch_embeddings.py +2 -2
- data/scripts/vector_prototype/query_prototype.rb +1 -1
- data/scripts/vector_prototype/run.sh +4 -4
- metadata +10 -2
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
require_relative "../retrieval"
|
|
5
|
+
|
|
6
|
+
module Parse
|
|
7
|
+
module Retrieval
|
|
8
|
+
# The `semantic_search` agent tool: the agent-aware wrapper around
|
|
9
|
+
# {Parse::Retrieval.retrieve}. It applies the agent security
|
|
10
|
+
# envelope that {Parse::Retrieval.retrieve} (a model-layer method) is
|
|
11
|
+
# deliberately kept free of:
|
|
12
|
+
#
|
|
13
|
+
# * Class allowlist via {Parse::Agent::MetadataRegistry.resolve_searchable!}
|
|
14
|
+
# (`agent_searchable` opt-in, hidden-class refusal, tenant-scope gate).
|
|
15
|
+
# * Recursive underscore-key refusal + filter-field allowlist on
|
|
16
|
+
# caller-supplied `filter:` / `vector_filter:`.
|
|
17
|
+
# * Tenant scope merged into the Atlas pre-filter AND re-asserted on
|
|
18
|
+
# every returned source record (NEW-TOOLS-3 guard).
|
|
19
|
+
# * `field_allowlist` projection of each source record on the way out.
|
|
20
|
+
# * Score quantization in non-admin contexts.
|
|
21
|
+
#
|
|
22
|
+
# ACL is enforced mongo-direct inside `find_similar` via the agent's
|
|
23
|
+
# `acl_scope_kwargs` (`session_token:` / `acl_user:` / `acl_role:` /
|
|
24
|
+
# `master:`), which is why the tool is `client_safe: true`: a
|
|
25
|
+
# session-token client routes through the one path with first-class
|
|
26
|
+
# SDK-side `_rperm` enforcement.
|
|
27
|
+
module AgentTool
|
|
28
|
+
module_function
|
|
29
|
+
|
|
30
|
+
# Upper bound on `k` (mirrors the registered parameter schema).
|
|
31
|
+
MAX_K = 20
|
|
32
|
+
# Default neighbour count for the agent tool. Intentionally lower than
|
|
33
|
+
# Parse::Retrieval.retrieve's library default of 10: an LLM tool result
|
|
34
|
+
# is paid for in context tokens, so the agent surface defaults
|
|
35
|
+
# conservatively. Callers/LLMs can raise it up to MAX_K per call.
|
|
36
|
+
DEFAULT_K = 5
|
|
37
|
+
|
|
38
|
+
# Default ceiling on total returned chunk-content tokens (estimated as
|
|
39
|
+
# chars/4). The retrieve count caps (k * max_chunks_per_document) bound
|
|
40
|
+
# the NUMBER of chunks but not their total size, so a few long documents
|
|
41
|
+
# could silently blow the context window. This budget trims the
|
|
42
|
+
# (score-ordered) chunk list and reports `budget_truncated` so the
|
|
43
|
+
# truncation is never silent. Pass `max_total_tokens: 0` to disable.
|
|
44
|
+
DEFAULT_MAX_TOTAL_TOKENS = 20_000
|
|
45
|
+
|
|
46
|
+
# Agent-facing semantic search. Resolves the target class through the
# searchable registry, validates the caller-supplied arguments, delegates
# to {Parse::Retrieval.retrieve}, trims the result to a token budget, and
# returns an envelope in which each parent record appears once.
#
# Steps, in the order the body performs them:
# 1. fold the alias keywords (`klass:`/`class:`, `size:`, `overlap:`,
#    `by:`) into their canonical names;
# 2. resolve `class_name` via `MetadataRegistry.resolve_searchable!`;
# 3. require `query` to be a non-blank String;
# 4. validate `text_field`, then both filters (underscore keys, then the
#    per-class filter-field allowlist);
# 5. resolve tenant scope and decide score quantization;
# 6. call `Parse::Retrieval.retrieve` with the agent's `acl_scope_kwargs`;
# 7. apply the token budget, hoist sources into `documents`, optionally
#    stamp provenance, and build the envelope.
#
# @param agent [Parse::Agent] supplies `permissions` and
#   `acl_scope_kwargs`, and is handed to the tenant-scope / redaction helpers.
# @param class_name [String, nil] class to search; passed to
#   `resolve_searchable!` (NOTE(review): presumably raises for classes that
#   are not `agent_searchable` — defined outside this file).
# @param query [String] natural-language query; must be non-blank.
# @param k [Integer] neighbour count; normalized by `clamp_k`.
# @param filter [Hash, nil] caller filter, forwarded as `filter:`.
# @param vector_filter [Hash, nil] caller filter, forwarded as `vector_filter:`.
# @param text_field [String, Symbol, nil] which embedded text source to
#   chunk and return as `content`. Required only for models with more
#   than one `embed` text source (otherwise inferred). Must name one of
#   the class's declared embed sources — an arbitrary field is refused so
#   the chunk `content` can't disclose a non-embedded field.
# @param chunk_size [Integer, nil] chunker window size override.
# @param chunk_overlap [Integer, nil] chunker overlap override.
# @param chunk_by [String, Symbol, nil] chunker unit override.
# @param max_chunks_per_document [Integer, nil] cap on chunks emitted per
#   matched document (forwarded to the chunker).
# @param max_total_tokens [Integer, nil] ceiling on total returned
#   chunk-content tokens (estimated chars/4). nil uses
#   {DEFAULT_MAX_TOTAL_TOKENS}; 0 disables the budget.
# @param klass [Object, nil] alias for `class_name`.
# @param size [Integer, nil] alias for `chunk_size`.
# @param overlap [Integer, nil] alias for `chunk_overlap`.
# @param by [String, Symbol, nil] alias for `chunk_by`.
# @param rest [Hash] any other keywords; only `:class` is read (as a
#   further alias for `class_name`), the remainder is ignored here.
# @return [Hash] `{ chunks: Array<Hash>, documents: Hash, count: Integer }`
#   — each chunk's parent record is hoisted once into `documents` (keyed
#   by objectId) instead of being duplicated on every chunk. When the
#   token budget trims the result, `budget_truncated: true` and
#   `budget_dropped: <n>` are added.
# @raise [Parse::Agent::ValidationError] when `query` is not a non-blank
#   String, or when one of the validation helpers called below refuses
#   `text_field`, a filter, or the chunker options.
def semantic_search(agent, class_name: nil, query: nil, k: DEFAULT_K,
                    filter: nil, vector_filter: nil, text_field: nil,
                    chunk_size: nil, chunk_overlap: nil, chunk_by: nil,
                    max_chunks_per_document: nil, max_total_tokens: nil,
                    # Back-compat / ergonomic aliases for direct callers:
                    # `klass:`/`class:` for class_name, and the chunker's
                    # own `size:`/`overlap:`/`by:` names.
                    klass: nil, size: nil, overlap: nil, by: nil,
                    **rest)
  # Canonical names win; aliases only fill in what the caller left unset.
  class_name ||= klass || rest.delete(:class)
  chunk_size ||= size
  chunk_overlap ||= overlap
  chunk_by ||= by

  # From here on `klass` is the resolved class object, not the alias keyword.
  klass = Parse::Agent::MetadataRegistry.resolve_searchable!(class_name)
  cname = klass.parse_class

  unless query.is_a?(String) && !query.strip.empty?
    raise Parse::Agent::ValidationError, "semantic_search: `query` must be a non-empty String."
  end

  resolved_text_field = normalize_text_field!(text_field, klass)

  # Reject reserved underscore keys at any depth, then enforce the
  # per-class filter-field allowlist on top-level keys.
  Parse::Retrieval.assert_no_underscore_keys!(filter) unless filter.nil?
  Parse::Retrieval.assert_no_underscore_keys!(vector_filter) unless vector_filter.nil?
  allowed = Parse::Agent::MetadataRegistry.searchable_filter_fields(cname).map(&:to_s)
  assert_filter_fields_allowed!(filter, allowed)
  assert_filter_fields_allowed!(vector_filter, allowed)

  # Tenant scope (nil for unscoped classes / bypassed admins; raises
  # AccessDenied for an un-bound agent on a scoped class).
  scope = Parse::Agent::Tools.resolve_tenant_scope!(agent, cname)

  # Non-admin agents get quantized scores (membership-inference
  # defense); admin agents get full precision. Keyed on the
  # permission tier, not master-key posture.
  score_quantize = (agent.permissions != :admin)
  vector_field = Parse::Agent::MetadataRegistry.searchable_field(cname)

  chunks = Parse::Retrieval.retrieve(
    query: query,
    klass: klass,
    field: vector_field,
    text_field: resolved_text_field,
    k: clamp_k(k),
    filter: filter,
    vector_filter: vector_filter,
    chunker: build_chunker(chunk_size, chunk_overlap, chunk_by, max_chunks_per_document),
    tenant_scope: scope,
    score_quantize: score_quantize,
    source_transform: source_projector(agent, cname, scope),
    **agent.acl_scope_kwargs,
  )

  # Token budget (B4): trim the score-ordered chunk list before
  # building the envelope so `documents` only carries parents whose
  # chunks survived.
  kept, dropped = apply_token_budget(chunks, resolve_token_budget(max_total_tokens))

  # Source dedup (A3): a document's (projected) source record is
  # identical across all its chunks. Hoist it into a `documents` map
  # keyed by objectId and drop the inline `source` from each chunk —
  # ~46 tok/chunk saved for every chunk past the first of a document.
  documents = {}
  chunk_hashes = kept.map do |chunk|
    h = chunk.to_h
    oid = h.dig(:metadata, :object_id)
    # A chunk without a usable object id keeps its inline `source`,
    # since there is no key to hoist it under.
    if oid && !oid.to_s.empty?
      documents[oid] ||= h[:source]
      h = h.reject { |key, _| key == :source }
    end
    h
  end
  stamp_chunk_provenance!(chunk_hashes, cname) if Parse::Agent.include_source_provenance?

  envelope = { chunks: chunk_hashes, documents: documents, count: chunk_hashes.length }
  # The budget keys are only present when something was actually dropped.
  if dropped > 0
    envelope[:budget_truncated] = true
    envelope[:budget_dropped] = dropped
  end
  envelope
end
|
|
146
|
+
|
|
147
|
+
# @!visibility private
# Translate the caller's `max_total_tokens` into the budget handed to
# {apply_token_budget}.
#
# @param max_total_tokens [#to_i, nil] requested ceiling.
# @return [Integer, nil] {DEFAULT_MAX_TOTAL_TOKENS} when the argument is
#   nil; nil (meaning "no budget") when it coerces to zero or less;
#   otherwise the coerced Integer.
def resolve_token_budget(max_total_tokens)
  if max_total_tokens.nil?
    DEFAULT_MAX_TOTAL_TOKENS
  else
    ceiling = max_total_tokens.to_i
    ceiling.positive? ? ceiling : nil
  end
end
|
|
154
|
+
|
|
155
|
+
# @!visibility private
# Keep the leading run of (score-ordered) chunks whose cumulative content
# cost fits within `budget`. Cost per chunk is `ceil(content.length / 4)`.
# The first chunk is admitted unconditionally, so a single oversize chunk
# is still returned; in that case nothing is counted as dropped.
#
# @param chunks [Array<#content>] chunks in ranking order.
# @param budget [Integer, nil] token ceiling; nil means unlimited.
# @return [Array(Array, Integer)] `[kept, dropped_count]`. When `budget`
#   is nil or `chunks` is empty the input collection itself is returned
#   as `kept`.
def apply_token_budget(chunks, budget)
  return [chunks, 0] if budget.nil? || chunks.empty?

  spent = 0
  leading = true
  survivors = chunks.take_while do |chunk|
    cost = (chunk.content.to_s.length / 4.0).ceil
    admit = leading || spent + cost <= budget
    leading = false
    spent += cost if admit
    admit
  end
  [survivors, chunks.length - survivors.length]
end
|
|
173
|
+
|
|
174
|
+
# @!visibility private
# Add a `:_source` provenance entry to every chunk Hash that does not
# already have one. The chunk's own `:source` key holds the projected
# parent record, hence the distinct `:_source` name. The object id is
# taken from `metadata[:object_id]`, falling back to the `"objectId"` /
# `:objectId` entry of a Hash `:source`.
#
# @param chunk_hashes [Array] chunk hashes; non-Hash entries are skipped.
# @param cname [#to_s] class name recorded under `"class"`.
# @return [Array] `chunk_hashes`, mutated in place.
def stamp_chunk_provenance!(chunk_hashes, cname)
  chunk_hashes.each do |entry|
    next if !entry.is_a?(Hash) || entry.key?(:_source)

    record_id = entry.dig(:metadata, :object_id)
    parent = entry[:source]
    record_id = parent["objectId"] || parent[:objectId] if !record_id && parent.is_a?(Hash)

    entry[:_source] = { "class" => cname.to_s, "tool" => "semantic_search", "object_id" => record_id }
  end
end
|
|
188
|
+
|
|
189
|
+
# @!visibility private
# Build the per-record output transform handed to the retriever as
# `source_transform:`. For each raw hit the returned lambda, in order:
#
# 1. converts it via {convert_to_parse_form};
# 2. when `scope` is set, re-asserts tenant scope through
#    `Tools.assert_record_in_tenant_scope!`
#    (NOTE(review): expected to raise AccessDenied and fail the whole call —
#    defined outside this file);
# 3. projects it through `Tools.project_object_to_allowlist`;
# 4. passes the projection to `Tools.redact_hidden_classes!`;
# 5. normalizes with `ResultFormatter.simplify_object`, so the record has
#    the same simplified shape even when step 3 is a pass-through.
#
# @param agent [Parse::Agent] forwarded to the redaction step.
# @param cname [String] Parse class name of the hits.
# @param scope [Object, nil] tenant scope; falsy skips the assertion.
# @return [Proc] lambda taking one raw document.
def source_projector(agent, cname, scope)
  lambda do |raw_doc|
    record = convert_to_parse_form(raw_doc, cname)
    Parse::Agent::Tools.assert_record_in_tenant_scope!(record, scope, cname) if scope

    visible = Parse::Agent::Tools.redact_hidden_classes!(
      Parse::Agent::Tools.project_object_to_allowlist(cname, record),
      agent: agent,
    )
    Parse::Agent::ResultFormatter.simplify_object(visible)
  end
end
|
|
207
|
+
|
|
208
|
+
# @!visibility private
# Convert one raw storage-form Mongo hit to Parse/wire form.
#
# If conversion raises OR yields nothing for this hit, the raw document is
# NOT surfaced: it carries internal metadata (_acl, _rperm/_wperm,
# storage-form _p_* pointers, _id, _created_at/_updated_at) that the
# success path strips. For a searchable class with NO agent_fields
# allowlist, project_object_to_allowlist downstream is a pass-through, so
# this fallback is the only thing standing between those keys and the
# LLM. Every storage-internal (underscore-prefixed) top-level key is
# dropped instead. NOTE: reusing
# Parse::PipelineSecurity.strip_internal_fields is NOT enough — its
# denylist EXCLUDES _acl, which is exactly the field that discloses other
# principals' object ids and roles. The chunk's object_id is read from the
# raw doc before this transform runs, so dropping _id is harmless.
#
# Previously only the `rescue` path sanitized; an empty/nil conversion
# result fell through `|| raw_doc` and returned the unsanitized document.
#
# @param raw_doc [Hash, Object] raw hit as returned by the vector search.
# @param cname [String] Parse class name used for conversion.
# @return [Hash] the converted record, or the raw document minus its
#   underscore-prefixed keys; `{}` when the raw hit is not a Hash.
def convert_to_parse_form(raw_doc, cname)
  converted =
    begin
      Parse::MongoDB.convert_documents_to_parse([raw_doc], cname).first
    rescue StandardError
      nil
    end
  return converted if converted

  # Shared sanitized fallback for both "conversion raised" and
  # "conversion produced no record".
  raw_doc.is_a?(Hash) ? raw_doc.reject { |key, _| key.to_s.start_with?("_") } : {}
end
|
|
226
|
+
|
|
227
|
+
# @!visibility private
# Normalize the requested neighbour count.
#
# @param k [#to_i] requested count.
# @return [Integer] {DEFAULT_K} when `k` coerces to zero or less,
#   otherwise the coerced value; in both cases capped at {MAX_K}.
def clamp_k(k)
  requested = k.to_i
  requested = DEFAULT_K unless requested.positive?
  requested > MAX_K ? MAX_K : requested
end
|
|
233
|
+
|
|
234
|
+
# @!visibility private
# Build the chunker override for a search call.
#
# @param size [#to_i, nil] window size; 800 when omitted.
# @param overlap [#to_i, nil] window overlap; 100 when omitted.
# @param by [#to_sym, nil] chunk unit; `:chars` when omitted.
# @param max_chunks_per_document [#to_i, nil] per-document cap. Only
#   passed on when the caller supplied it, so an unset cap keeps the
#   chunker's own default rather than forcing one here.
# @return [Parse::Retrieval::Chunker::FixedSizeOverlap, nil] nil when no
#   option was supplied at all.
# @raise [Parse::Agent::ValidationError] when the chunker rejects the
#   options with an ArgumentError.
def build_chunker(size, overlap, by, max_chunks_per_document = nil)
  return nil if [size, overlap, by, max_chunks_per_document].all?(&:nil?)

  settings = {
    size: (size || 800).to_i,
    overlap: (overlap || 100).to_i,
    by: (by || :chars).to_sym,
    max_chunks_per_document: max_chunks_per_document&.to_i,
  }.compact
  Parse::Retrieval::Chunker::FixedSizeOverlap.new(**settings)
rescue ArgumentError => e
  raise Parse::Agent::ValidationError, "semantic_search: invalid chunker options — #{e.message}"
end
|
|
249
|
+
|
|
250
|
+
# @!visibility private
# List the class's declared embed TEXT sources — the only names an agent
# may pass as `text_field:`. Chunk `content` is the text_field's value, so
# limiting it to embedded sources keeps the tool from surfacing a field
# the model never opted into embedding.
#
# @param klass [Object] model class; anything without `embed_directives`
#   yields `[]`.
# @return [Array<String>] unique source names, as Strings, taken from
#   every directive that does not report `image?`.
def searchable_text_fields(klass)
  return [] unless klass.respond_to?(:embed_directives)

  text_directives = klass.embed_directives.values.select do |directive|
    !(directive.respond_to?(:image?) && directive.image?)
  end
  text_directives.flat_map { |directive| directive.sources }.map(&:to_s).uniq
end
|
|
261
|
+
|
|
262
|
+
# @!visibility private
# Check a caller-supplied `text_field` against the embedded-source
# allowlist from {searchable_text_fields}.
#
# @param text_field [String, Symbol, nil] requested field.
# @param klass [Object] resolved model class.
# @return [Symbol, nil] the field as a Symbol; nil when the argument is
#   nil or blank (the retriever then infers the field itself — per the
#   original note this works for single-source models and raises
#   AmbiguousTextField for multi-source ones).
# @raise [Parse::Agent::ValidationError] when the name is not one of the
#   class's embedded text sources.
def normalize_text_field!(text_field, klass)
  requested = text_field.to_s
  return nil if text_field.nil? || requested.strip.empty?

  permitted = searchable_text_fields(klass)
  return text_field.to_sym if permitted.include?(requested)

  raise Parse::Agent::ValidationError,
        "semantic_search: text_field #{requested.inspect} is not an embedded " \
        "text source for this class (allowed: #{permitted.inspect})."
end
|
|
277
|
+
|
|
278
|
+
# @!visibility private
# Refuse a filter whose top-level keys are not all in the class's declared
# `filter_fields` allowlist. Compound operators get no exemption: the
# allowlist is the complete set of keys the agent may use.
#
# @param filter [Hash, nil] caller-supplied filter; nil and anything that
#   reports `empty?` pass without further checks.
# @param allowed [Array<String>] permitted top-level key names.
# @return [nil]
# @raise [Parse::Agent::ValidationError] when `filter` is non-empty and
#   not a Hash, or when it names a key outside `allowed`.
def assert_filter_fields_allowed!(filter, allowed)
  return if filter.nil?
  return if filter.respond_to?(:empty?) && filter.empty?
  raise Parse::Agent::ValidationError, "semantic_search: filter must be an object." unless filter.is_a?(Hash)

  offending = filter.each_key.map(&:to_s).reject { |name| allowed.include?(name) }
  return if offending.empty?

  raise Parse::Agent::ValidationError,
        "semantic_search: filter field(s) #{offending.inspect} are not in the " \
        "agent_searchable filter_fields allowlist (#{allowed.inspect})."
end
|
|
294
|
+
|
|
295
|
+
# JSON Schema for the registered tool's parameters. Property names match
# the canonical keyword arguments of `semantic_search`; `k` advertises
# DEFAULT_K as its default and MAX_K as its maximum. Only `class_name`
# and `query` are required.
#
# `.freeze` applies to the outermost Hash only; the nested property
# hashes are not frozen by this expression.
PARAMETERS = {
  "type" => "object",
  "properties" => {
    "class_name" => { "type" => "string", "description" => "Parse class name (must be agent_searchable)." },
    "query" => { "type" => "string", "description" => "Natural-language query." },
    "k" => { "type" => "integer", "default" => DEFAULT_K, "minimum" => 1, "maximum" => MAX_K },
    "filter" => { "type" => "object", "description" => "Post-search field filter (allowlisted fields only)." },
    "vector_filter" => { "type" => "object", "description" => "Atlas pre-search filter (allowlisted fields only)." },
    "text_field" => { "type" => "string", "description" => "Which embedded text source to chunk and return as content. Required only when the class embeds more than one text field; must name one of those sources." },
    "chunk_size" => { "type" => "integer", "description" => "Override chunk window size." },
    "chunk_overlap" => { "type" => "integer", "description" => "Override chunk overlap." },
    "chunk_by" => { "type" => "string", "enum" => %w[chars tokens], "description" => "Chunk unit." },
    "max_chunks_per_document" => { "type" => "integer", "minimum" => 1, "description" => "Cap on chunks emitted per matched document." },
    "max_total_tokens" => { "type" => "integer", "minimum" => 0, "description" => "Ceiling on total returned chunk-content tokens (approx chars/4). Trims lowest-ranked chunks first and sets budget_truncated. 0 disables." },
  },
  "required" => %w[class_name query],
}.freeze
|
|
313
|
+
|
|
314
|
+
# MCP outputSchema → mirrored as structuredContent on results.
# Describes the envelope `semantic_search` returns: `chunks`, `documents`,
# `count`, plus the two `budget_*` keys that are only present when the
# token budget dropped chunks.
# The parent record of each chunk is hoisted into `documents` (keyed
# by objectId) rather than duplicated inline on every chunk; map a
# chunk to its source via `metadata.object_id`.
#
# `.freeze` applies to the outermost Hash only.
OUTPUT_SCHEMA = {
  "type" => "object",
  "properties" => {
    "chunks" => {
      "type" => "array",
      "items" => {
        "type" => "object",
        "properties" => {
          "id" => { "type" => "string" },
          # Score may be null as well as a number.
          "score" => { "type" => %w[number null] },
          "content" => { "type" => "string" },
          "metadata" => { "type" => "object" },
        },
      },
    },
    "documents" => {
      "type" => "object",
      "description" => "objectId => projected source record (sent once per matched document).",
    },
    "count" => { "type" => "integer" },
    "budget_truncated" => { "type" => "boolean", "description" => "Present when the token budget dropped lowest-ranked chunks." },
    "budget_dropped" => { "type" => "integer", "description" => "Number of chunks dropped by the token budget." },
  },
}.freeze
|
|
342
|
+
|
|
343
|
+
# Register the `semantic_search` tool with {Parse::Agent::Tools}.
# Idempotent-ish: re-requiring is a no-op because
# require caches; an explicit re-register after reset_registry! is
# supported via {.register!}.
#
# The tool is registered as `:readonly`, `client_safe: true`, with a
# 30 timeout value (NOTE(review): unit is defined by
# `Parse::Agent::Tools.register` — presumably seconds; confirm there).
#
# @return [Object] whatever `Parse::Agent::Tools.register` returns.
def register!
  Parse::Agent::Tools.register(
    name: :semantic_search,
    description: "Find documents semantically similar to a natural-language query and " \
                 "return scored text chunks. Use when keyword matching is unlikely to " \
                 "work or the question needs synthesizing across documents. The target " \
                 "class must be declared `agent_searchable`.",
    parameters: PARAMETERS,
    permission: :readonly,
    timeout: 30,
    output_schema: OUTPUT_SCHEMA,
    client_safe: true,
    # The handler forwards the tool arguments as keywords to
    # semantic_search, referenced by its fully qualified module name.
    handler: ->(agent, **args) { Parse::Retrieval::AgentTool.semantic_search(agent, **args) },
  )
end
|
|
361
|
+
end
|
|
362
|
+
end
|
|
363
|
+
end
|
|
364
|
+
|
|
365
|
+
# Register at load. Requires Parse::Agent::Tools (TOOL_DEFINITIONS for the
|
|
366
|
+
# collision check), Parse::Retrieval (loaded with the model layer), and
|
|
367
|
+
# Parse::Object + MetadataDSL — all present by the time agent.rb requires
|
|
368
|
+
# this file at its tail.
|
|
369
|
+
Parse::Retrieval::AgentTool.register!
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
module Parse
|
|
5
|
+
module Retrieval
|
|
6
|
+
# A single retrieved passage: one chunk of one source document,
# carrying the document's vector-search score and (optionally
# projected) source record.
#
# Produced by {Parse::Retrieval.retrieve}. Because embedding is
# one-vector-per-record (see {Parse::Core::EmbedManaged}), every
# chunk split from a document shares that document's single score —
# the chunking is presentation-only, applied after retrieval.
#
# Instances are frozen at the end of `initialize`. The freeze is
# shallow: the `source` and `metadata` objects themselves are stored
# as given.
#
# @!attribute [r] id
#   @return [String] stable synthetic chunk id, `"<objectId>#<index>"`.
# @!attribute [r] score
#   @return [Float, nil] the parent document's Atlas vectorSearchScore,
#     already quantized when the caller requested it.
# @!attribute [r] content
#   @return [String] the chunk text.
# @!attribute [r] source
#   @return [Hash] the parent document record. When the producer
#     supplied a `source_transform:` (the agent tool does, projecting
#     through `field_allowlist`), this is the projected/redacted form.
# @!attribute [r] metadata
#   @return [Hash] presentation metadata: `:chunk_index`,
#     `:chunk_count`, `:chunks_truncated`, and any producer-supplied
#     signals (e.g. `:token_chunking_degraded`).
class Chunk
  attr_reader :id, :score, :content, :source, :metadata

  # @param id [#to_s] chunk id; stored as a String.
  # @param score [Float, nil]
  # @param content [String]
  # @param source [Hash]
  # @param metadata [Hash]
  def initialize(id:, content:, source:, score: nil, metadata: {})
    @id = id.to_s
    @score = score
    @content = content
    @source = source
    @metadata = metadata
    freeze
  end

  # @return [Hash] plain-Hash form for tool output / JSON. `source` and
  #   `metadata` are the stored objects, not copies.
  def to_h
    {
      id: @id,
      score: @score,
      content: @content,
      source: @source,
      metadata: @metadata,
    }
  end

  # Value equality on the identifying triple (id, score, content) —
  # convenient for tests and de-duplication. `source`/`metadata` are
  # intentionally not part of identity. Scores compare with `==`, so
  # `1` and `1.0` are equal here.
  #
  # @param other [Object]
  # @return [Boolean]
  def ==(other)
    other.is_a?(Chunk) &&
      other.id == @id &&
      other.score == @score &&
      other.content == @content
  end

  # Hash-key equality over the same triple, compared member-wise with
  # `eql?` so that it always agrees with {#hash}. (Aliasing this to `==`
  # made `1`-scored and `1.0`-scored chunks `eql?` while hashing
  # differently, which breaks Hash/Set/uniq lookups.)
  #
  # @param other [Object]
  # @return [Boolean]
  def eql?(other)
    other.is_a?(Chunk) && [other.id, other.score, other.content].eql?([@id, @score, @content])
  end

  # @return [Integer] hash of the identifying triple; consistent with {#eql?}.
  def hash
    [@id, @score, @content].hash
  end
end
|
|
73
|
+
end
|
|
74
|
+
end
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
# encoding: UTF-8
|
|
2
|
+
# frozen_string_literal: true
|
|
3
|
+
|
|
4
|
+
module Parse
|
|
5
|
+
module Retrieval
|
|
6
|
+
# Pluggable text-chunking strategies for the retrieval layer.
|
|
7
|
+
#
|
|
8
|
+
# A chunker splits a source document's text into smaller, overlapping
|
|
9
|
+
# windows for presentation. {Parse::Retrieval.retrieve} fetches the
|
|
10
|
+
# top-k whole records via Atlas `$vectorSearch`, then runs each
|
|
11
|
+
# record's text field through a chunker so callers get focused,
|
|
12
|
+
# citable passages rather than whole documents.
|
|
13
|
+
#
|
|
14
|
+
# == Presentation chunking, not embedding chunking
|
|
15
|
+
#
|
|
16
|
+
# Embedding remains one-vector-per-record (see
|
|
17
|
+
# {Parse::Core::EmbedManaged}). Chunking here is purely a
|
|
18
|
+
# *presentation* step applied after retrieval: every chunk produced
|
|
19
|
+
# from a document inherits that document's single vector-search
|
|
20
|
+
# score. The chunker never calls an embedding provider.
|
|
21
|
+
#
|
|
22
|
+
# == Extending
|
|
23
|
+
#
|
|
24
|
+
# {FixedSizeOverlap} is the default and the only strategy shipped.
|
|
25
|
+
# Subclass {Base} for semantic, sentence-aware, or true
|
|
26
|
+
# token-aware chunking:
|
|
27
|
+
#
|
|
28
|
+
# class SentenceChunker < Parse::Retrieval::Chunker::Base
|
|
29
|
+
# def chunk(text)
|
|
30
|
+
# normalize(text).split(/(?<=[.!?])\s+/)
|
|
31
|
+
# end
|
|
32
|
+
# end
|
|
33
|
+
#
|
|
34
|
+
# Parse::Retrieval.retrieve(
|
|
35
|
+
# query: "onboarding steps",
|
|
36
|
+
# klass: KnowledgeArticle,
|
|
37
|
+
# chunker: SentenceChunker.new,
|
|
38
|
+
# )
|
|
39
|
+
module Chunker
  # Abstract base. Subclasses MUST implement {#chunk}.
  #
  # Subclasses get one free behavior from {Base}: {#chunk_with_meta},
  # which wraps {#chunk} and reports whether the result was capped.
  # {Parse::Retrieval.retrieve} calls {#chunk_with_meta} so it can
  # stamp a truncation signal onto each emitted chunk's metadata.
  class Base
    # @param text [String, nil] source document text.
    # @return [Array<String>] zero or more chunks. MUST return `[]`
    #   for blank/`nil` input.
    # @raise [NotImplementedError] unless overridden.
    def chunk(text)
      raise NotImplementedError, "#{self.class}#chunk must return Array<String>."
    end

    # Wrap {#chunk} with truncation metadata. The default
    # implementation here does NOT cap — it reports the chunk list as
    # produced. {FixedSizeOverlap} overrides this to enforce its
    # `max_chunks_per_document` cap and report the pre-cap count.
    #
    # @param text [String, nil]
    # @return [Hash] `{ chunks: Array<String>, truncated: Boolean,
    #   total_before_truncation: Integer }`.
    def chunk_with_meta(text)
      chunks = Array(chunk(text))
      { chunks: chunks, truncated: false, total_before_truncation: chunks.length }
    end

    # @!visibility private
    # Shared input normalization. Returns `nil` for `nil`,
    # non-String, empty, or whitespace-only input — every concrete
    # `#chunk` treats a `nil` return as "no chunks". A non-String is
    # treated as blank rather than raised so a document with an
    # unexpected non-text value in the chunked field is skipped, not
    # fatal, during retrieval. Non-blank text is returned unmodified
    # (not stripped).
    def normalize(text)
      return nil unless text.is_a?(String)

      text.strip.empty? ? nil : text
    end
  end

  # Fixed-size sliding-window chunker with overlap.
  #
  # Splits text into windows of `size` units, advancing by
  # `size - overlap` each step so consecutive chunks share `overlap`
  # units of context. `by: :chars` (default) counts characters;
  # `by: :tokens` counts whitespace-delimited tokens (a cheap
  # approximation — there is no model tokenizer here; see the
  # `:tokens` note below).
  #
  #   c = Parse::Retrieval::Chunker::FixedSizeOverlap.new(size: 800, overlap: 100)
  #   c.chunk(long_text) #=> ["…800 chars…", "…overlap+800…", …]
  #
  # Windowing stops at the first window that reaches the end of the
  # text. A further window would start inside the overlap region and
  # contain nothing the previous chunk did not already carry, so it is
  # not emitted (e.g. a 750-char text at size 800 / overlap 100 yields
  # one chunk, not two).
  #
  # == Amplification cap
  #
  # `max_chunks_per_document` (default 200) bounds how many chunks a
  # single document can yield. Beyond the cap the chunker
  # *truncates* — it returns the first `max_chunks_per_document`
  # chunks rather than raising — and {#chunk_with_meta} reports
  # `truncated: true`. This is the DoS guard: a multi-megabyte field
  # would otherwise yield thousands of chunks. The pre-cap total is
  # computed arithmetically and only the kept windows are sliced, so
  # the discarded windows are never allocated.
  #
  # == `:tokens`
  #
  # `by: :tokens` treats `size`/`overlap` as literal whitespace-token
  # counts supplied by the caller. The chunker does NOT consult an
  # embedding provider's `max_input_tokens`; that hint is the
  # caller's concern (see {Parse::Retrieval.retrieve}). The chunker
  # always does exactly what it was constructed with and never
  # silently switches modes.
  class FixedSizeOverlap < Base
    # @return [Integer] window width in `by:` units.
    attr_reader :size
    # @return [Integer] units shared between consecutive windows.
    attr_reader :overlap
    # @return [Symbol] `:chars` or `:tokens`.
    attr_reader :by
    # @return [Integer] hard cap on chunks emitted per document.
    attr_reader :max_chunks_per_document

    # @param size [Integer] window width (> 0).
    # @param overlap [Integer] shared units between windows
    #   (`0 <= overlap < size`).
    # @param by [Symbol] `:chars` (default) or `:tokens`.
    # @param max_chunks_per_document [Integer] cap (> 0, default 200).
    # @raise [ArgumentError] on any out-of-range argument. In
    #   particular `overlap >= size` is refused: a non-shrinking
    #   stride would never advance.
    def initialize(size: 800, overlap: 100, by: :chars, max_chunks_per_document: 200)
      unless size.is_a?(Integer) && size > 0
        raise ArgumentError, "size must be a positive Integer (got #{size.inspect})."
      end
      unless overlap.is_a?(Integer) && overlap >= 0
        raise ArgumentError, "overlap must be a non-negative Integer (got #{overlap.inspect})."
      end
      if overlap >= size
        raise ArgumentError,
              "overlap (#{overlap}) must be strictly less than size (#{size}); " \
              "a stride of size - overlap <= 0 would never advance."
      end
      unless %i[chars tokens].include?(by)
        raise ArgumentError, "by must be :chars or :tokens (got #{by.inspect})."
      end
      unless max_chunks_per_document.is_a?(Integer) && max_chunks_per_document > 0
        raise ArgumentError,
              "max_chunks_per_document must be a positive Integer " \
              "(got #{max_chunks_per_document.inspect})."
      end
      @size = size
      @overlap = overlap
      @by = by
      @max_chunks_per_document = max_chunks_per_document
      @stride = size - overlap
    end

    # @param text [String, nil]
    # @return [Array<String>] chunks (capped at
    #   {#max_chunks_per_document}). `[]` for blank input.
    def chunk(text)
      chunk_with_meta(text)[:chunks]
    end

    # Chunk `text` and report whether the cap cut the result short.
    #
    # @param text [String, nil]
    # @return [Hash] `{ chunks: Array<String>, truncated: Boolean,
    #   total_before_truncation: Integer }` — `total_before_truncation`
    #   is the number of windows the text spans before the cap.
    def chunk_with_meta(text)
      source = normalize(text)
      return { chunks: [], truncated: false, total_before_truncation: 0 } if source.nil?

      tokenized = (@by == :tokens)
      # In token mode the unit sequence is the token Array; in char mode
      # it is the String itself. Both support `length` and `[start, len]`.
      units = tokenized ? source.split(/\s+/).reject(&:empty?) : source
      total = window_count(units.length)
      emitted = [total, @max_chunks_per_document].min
      chunks = Array.new(emitted) do |index|
        window = units[index * @stride, @size]
        tokenized ? window.join(" ") : window
      end
      { chunks: chunks, truncated: total > @max_chunks_per_document, total_before_truncation: total }
    end

    private

    # Number of windows needed to cover `length` units: one window if
    # everything fits, otherwise one plus as many strides as it takes
    # for a window to reach the end.
    def window_count(length)
      return 0 if length <= 0
      return 1 if length <= @size

      1 + (length - @size + @stride - 1) / @stride
    end
  end
end
|
|
207
|
+
end
|
|
208
|
+
end
|