parse-stack-next 5.1.1 → 5.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. checksums.yaml +4 -4
  2. data/.env.sample +12 -0
  3. data/.env.test +4 -4
  4. data/CHANGELOG.md +545 -0
  5. data/Gemfile +3 -0
  6. data/Gemfile.lock +6 -1
  7. data/README.md +167 -38
  8. data/Rakefile +56 -10
  9. data/docs/atlas_vector_search_guide.md +110 -9
  10. data/docs/mcp_guide.md +433 -0
  11. data/docs/mongodb_direct_guide.md +66 -1
  12. data/docs/mongodb_index_optimization_guide.md +22 -1
  13. data/docs/usage_guide.md +15 -0
  14. data/lib/parse/agent/approval_gate.rb +0 -0
  15. data/lib/parse/agent/constraint_translator.rb +90 -19
  16. data/lib/parse/agent/describe.rb +1 -0
  17. data/lib/parse/agent/errors.rb +16 -0
  18. data/lib/parse/agent/mcp_client.rb +9 -0
  19. data/lib/parse/agent/mcp_dispatcher.rb +139 -7
  20. data/lib/parse/agent/mcp_rack_app.rb +621 -17
  21. data/lib/parse/agent/mcp_subscriptions.rb +607 -0
  22. data/lib/parse/agent/metadata_dsl.rb +58 -0
  23. data/lib/parse/agent/metadata_registry.rb +141 -1
  24. data/lib/parse/agent/prompt_hardening.rb +213 -0
  25. data/lib/parse/agent/result_formatter.rb +18 -3
  26. data/lib/parse/agent/tools.rb +167 -24
  27. data/lib/parse/agent.rb +692 -21
  28. data/lib/parse/client/request.rb +55 -4
  29. data/lib/parse/client/response.rb +4 -0
  30. data/lib/parse/client.rb +205 -7
  31. data/lib/parse/model/classes/installation.rb +27 -10
  32. data/lib/parse/model/classes/user.rb +8 -0
  33. data/lib/parse/model/core/actions.rb +58 -4
  34. data/lib/parse/model/core/embed_managed.rb +19 -14
  35. data/lib/parse/model/core/indexing.rb +108 -16
  36. data/lib/parse/model/core/querying.rb +29 -0
  37. data/lib/parse/model/model.rb +34 -3
  38. data/lib/parse/model/object.rb +1 -0
  39. data/lib/parse/query.rb +90 -24
  40. data/lib/parse/retrieval/agent_tool.rb +369 -0
  41. data/lib/parse/retrieval/chunk.rb +74 -0
  42. data/lib/parse/retrieval/chunker.rb +208 -0
  43. data/lib/parse/retrieval/retriever.rb +274 -0
  44. data/lib/parse/retrieval.rb +10 -0
  45. data/lib/parse/schema.rb +69 -20
  46. data/lib/parse/stack/version.rb +2 -2
  47. data/parse-stack-next.gemspec +1 -1
  48. data/scripts/docker/docker-compose.atlas.yml +14 -10
  49. data/scripts/docker/docker-compose.test.yml +24 -20
  50. data/scripts/docker/mongo-init.js +3 -3
  51. data/scripts/start-parse.sh +10 -0
  52. data/scripts/start_mcp_server.rb +1 -1
  53. data/scripts/test_server_connection.rb +1 -1
  54. data/scripts/vector_prototype/create_vector_index.js +1 -1
  55. data/scripts/vector_prototype/fetch_embeddings.py +2 -2
  56. data/scripts/vector_prototype/query_prototype.rb +1 -1
  57. data/scripts/vector_prototype/run.sh +4 -4
  58. metadata +10 -2
@@ -0,0 +1,369 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
+ require_relative "../retrieval"
5
+
6
+ module Parse
7
+ module Retrieval
8
+ # The `semantic_search` agent tool: the agent-aware wrapper around
9
+ # {Parse::Retrieval.retrieve}. It applies the agent security
10
+ # envelope that {Parse::Retrieval.retrieve} (a model-layer method) is
11
+ # deliberately kept free of:
12
+ #
13
+ # * Class allowlist via {Parse::Agent::MetadataRegistry.resolve_searchable!}
14
+ # (`agent_searchable` opt-in, hidden-class refusal, tenant-scope gate).
15
+ # * Recursive underscore-key refusal + filter-field allowlist on
16
+ # caller-supplied `filter:` / `vector_filter:`.
17
+ # * Tenant scope merged into the Atlas pre-filter AND re-asserted on
18
+ # every returned source record (NEW-TOOLS-3 guard).
19
+ # * `field_allowlist` projection of each source record on the way out.
20
+ # * Score quantization in non-admin contexts.
21
+ #
22
+ # ACL is enforced mongo-direct inside `find_similar` via the agent's
23
+ # `acl_scope_kwargs` (`session_token:` / `acl_user:` / `acl_role:` /
24
+ # `master:`), which is why the tool is `client_safe: true`: a
25
+ # session-token client routes through the one path with first-class
26
+ # SDK-side `_rperm` enforcement.
27
+ module AgentTool
28
+ module_function
29
+
30
+ # Upper bound on `k` (mirrors the registered parameter schema).
31
+ MAX_K = 20
32
+ # Default neighbour count for the agent tool. Intentionally lower than
33
+ # Parse::Retrieval.retrieve's library default of 10: an LLM tool result
34
+ # is paid for in context tokens, so the agent surface defaults
35
+ # conservatively. Callers/LLMs can raise it up to MAX_K per call.
36
+ DEFAULT_K = 5
37
+
38
+ # Default ceiling on total returned chunk-content tokens (estimated as
39
+ # chars/4). The retrieve count caps (k * max_chunks_per_document) bound
40
+ # the NUMBER of chunks but not their total size, so a few long documents
41
+ # could silently blow the context window. This budget trims the
42
+ # (score-ordered) chunk list and reports `budget_truncated` so the
43
+ # truncation is never silent. Pass `max_total_tokens: 0` to disable.
44
+ DEFAULT_MAX_TOTAL_TOKENS = 20_000
45
+
46
# Agent-facing semantic search over a single `agent_searchable` class.
#
# Validates the request, delegates the lookup to
# {Parse::Retrieval.retrieve}, trims the result to the token budget and
# hoists each chunk's parent record into a shared `documents` map.
#
# @param agent [Parse::Agent] supplies `permissions` (score quantization)
#   and `acl_scope_kwargs` (forwarded to the retriever).
# @param class_name [String, nil] target class; `klass:` and `class:` are
#   accepted as aliases.
# @param query [String] natural-language query; must be a non-blank String.
# @param k [Integer] neighbour count, normalized through `clamp_k`.
# @param filter [Hash, nil] field filter (allowlisted top-level keys only).
# @param vector_filter [Hash, nil] vector-search filter (same allowlist).
# @param text_field [String, Symbol, nil] which embedded text source to
#   chunk and return as `content`. Required only for models with more
#   than one `embed` text source (otherwise inferred). Must name one of
#   the class's declared embed sources — an arbitrary field is refused so
#   the chunk `content` can't disclose a non-embedded field.
# @param chunk_size [Integer, nil] chunk window size (alias: `size:`).
# @param chunk_overlap [Integer, nil] chunk overlap (alias: `overlap:`).
# @param chunk_by [String, Symbol, nil] chunk unit (alias: `by:`).
# @param max_chunks_per_document [Integer, nil] cap on chunks emitted per
#   matched document (forwarded to the chunker).
# @param max_total_tokens [Integer, nil] ceiling on total returned
#   chunk-content tokens (estimated chars/4). nil uses
#   {DEFAULT_MAX_TOTAL_TOKENS}; 0 disables the budget.
# @param rest [Hash] only `:class` is read from it; any other extra
#   keyword is accepted and ignored.
# @return [Hash] `{ chunks: Array<Hash>, documents: Hash, count: Integer }`
#   — each chunk's parent record is hoisted once into `documents` (keyed
#   by objectId) instead of being duplicated on every chunk. When the
#   token budget trims the result, `budget_truncated: true` and
#   `budget_dropped: <n>` are added.
# @raise [Parse::Agent::ValidationError] when `query` is not a non-blank
#   String (helpers called from here raise it for their own inputs too).
def semantic_search(agent, class_name: nil, query: nil, k: DEFAULT_K,
                    filter: nil, vector_filter: nil, text_field: nil,
                    chunk_size: nil, chunk_overlap: nil, chunk_by: nil,
                    max_chunks_per_document: nil, max_total_tokens: nil,
                    # Back-compat / ergonomic aliases for direct callers:
                    # `klass:`/`class:` for class_name, and the chunker's
                    # own `size:`/`overlap:`/`by:` names.
                    klass: nil, size: nil, overlap: nil, by: nil,
                    **rest)
  # Fold the aliases into the canonical names; canonical values win.
  class_name ||= klass || rest.delete(:class)
  chunk_size ||= size
  chunk_overlap ||= overlap
  chunk_by ||= by

  model = Parse::Agent::MetadataRegistry.resolve_searchable!(class_name)
  parse_class = model.parse_class

  if !query.is_a?(String) || query.strip.empty?
    raise Parse::Agent::ValidationError, "semantic_search: `query` must be a non-empty String."
  end

  chosen_text_field = normalize_text_field!(text_field, model)

  # Two passes over both filters: first refuse reserved underscore keys
  # (at any depth), then enforce the per-class top-level key allowlist.
  supplied_filters = [filter, vector_filter]
  supplied_filters.each do |candidate|
    Parse::Retrieval.assert_no_underscore_keys!(candidate) unless candidate.nil?
  end
  permitted = Parse::Agent::MetadataRegistry.searchable_filter_fields(parse_class).map(&:to_s)
  supplied_filters.each { |candidate| assert_filter_fields_allowed!(candidate, permitted) }

  # Tenant scope (nil for unscoped classes / bypassed admins; raises
  # AccessDenied for an un-bound agent on a scoped class).
  tenant_scope = Parse::Agent::Tools.resolve_tenant_scope!(agent, parse_class)

  # Only the :admin permission tier sees unquantized scores.
  quantize = agent.permissions != :admin
  embedding_field = Parse::Agent::MetadataRegistry.searchable_field(parse_class)

  retrieved = Parse::Retrieval.retrieve(
    query: query,
    klass: model,
    field: embedding_field,
    text_field: chosen_text_field,
    k: clamp_k(k),
    filter: filter,
    vector_filter: vector_filter,
    chunker: build_chunker(chunk_size, chunk_overlap, chunk_by, max_chunks_per_document),
    tenant_scope: tenant_scope,
    score_quantize: quantize,
    source_transform: source_projector(agent, parse_class, tenant_scope),
    **agent.acl_scope_kwargs,
  )

  # Trim before building the envelope so `documents` only carries the
  # parents of chunks that survived the budget.
  survivors, dropped_count = apply_token_budget(retrieved, resolve_token_budget(max_total_tokens))

  # Hoist each parent record into `documents` (first occurrence wins) and
  # drop the inline `source` from chunks that carry an object id.
  documents = {}
  rows = survivors.map do |chunk|
    row = chunk.to_h
    doc_id = row.dig(:metadata, :object_id)
    next row if doc_id.nil? || doc_id == false || doc_id.to_s.empty?

    documents[doc_id] = row[:source] unless documents[doc_id]
    row.reject { |key, _| key == :source }
  end
  stamp_chunk_provenance!(rows, parse_class) if Parse::Agent.include_source_provenance?

  envelope = { chunks: rows, documents: documents, count: rows.length }
  envelope.merge!(budget_truncated: true, budget_dropped: dropped_count) if dropped_count > 0
  envelope
end
146
+
147
# @!visibility private
# Translate the caller's `max_total_tokens` into an effective budget.
#
# @param max_total_tokens [#to_i, nil] raw caller value.
# @return [Integer, nil] DEFAULT_MAX_TOTAL_TOKENS when the value is nil;
#   nil (meaning "no budget") when it coerces to zero or less; otherwise
#   the coerced Integer.
def resolve_token_budget(max_total_tokens)
  if max_total_tokens.nil?
    DEFAULT_MAX_TOTAL_TOKENS
  else
    limit = max_total_tokens.to_i
    limit.positive? ? limit : nil
  end
end
154
+
155
# @!visibility private
# Keep the leading run of chunks whose cumulative estimated token cost
# (content length / 4, rounded up) stays within `budget`. The first chunk
# is always kept, even when it alone exceeds the budget, so a call never
# comes back empty purely because of the budget.
#
# @param chunks [Array<#content>] chunks in the order they should be kept.
# @param budget [Integer, nil] token ceiling; nil means "no trimming".
# @return [Array(Array, Integer)] `[kept, dropped_count]`. With a nil
#   budget or no chunks the input list itself is returned with 0 dropped.
def apply_token_budget(chunks, budget)
  return [chunks, 0] if budget.nil? || chunks.empty?

  spent = 0
  survivors = chunks.each_with_index.take_while do |chunk, position|
    cost = (chunk.content.to_s.length / 4.0).ceil
    fits = position.zero? || spent + cost <= budget
    spent += cost if fits
    fits
  end.map(&:first)

  [survivors, chunks.length - survivors.length]
end
173
+
174
# @!visibility private
# Add a `:_source` provenance entry to every chunk hash, in place.
#
# Non-Hash entries and hashes that already have `:_source` are left
# alone. The object id is read from `metadata[:object_id]`; when that is
# missing and the chunk still carries a Hash under `:source`, the
# record's `"objectId"` / `:objectId` is used instead.
#
# @param chunk_hashes [Array] chunk hashes (mutated).
# @param cname [#to_s] class name recorded under `"class"`.
# @return [Array] the same `chunk_hashes` collection.
def stamp_chunk_provenance!(chunk_hashes, cname)
  chunk_hashes.each do |entry|
    next if !entry.is_a?(Hash) || entry.key?(:_source)

    record_id = entry.dig(:metadata, :object_id)
    parent = entry[:source]
    if !record_id && parent.is_a?(Hash)
      record_id = parent["objectId"] || parent[:objectId]
    end

    entry[:_source] = {
      "class" => cname.to_s,
      "tool" => "semantic_search",
      "object_id" => record_id,
    }
  end
end
188
+
189
# @!visibility private
# Build the per-record OUTPUT transform handed to the retriever.
#
# The returned lambda runs these steps, in this order, on one raw hit:
#   1. convert the storage-form document to Parse/wire form;
#   2. when a tenant scope is present, re-assert it on the converted
#      record (the assertion raising aborts the whole call);
#   3. project the record through the class's field allowlist;
#   4. redact hidden nested classes on the projected record;
#   5. simplify to the form the result formatter emits, so `documents`
#      entries are normalized even for a class with no field allowlist.
#
# @param agent [Parse::Agent] forwarded to the hidden-class redaction.
# @param cname [String] Parse class name of the records.
# @param scope [Object, nil] tenant scope; nil skips the re-assertion.
# @return [Proc] lambda taking one raw document and returning the
#   simplified record.
def source_projector(agent, cname, scope)
  lambda do |raw_doc|
    record = convert_to_parse_form(raw_doc, cname)
    if scope
      Parse::Agent::Tools.assert_record_in_tenant_scope!(record, scope, cname)
    end
    visible = Parse::Agent::Tools.project_object_to_allowlist(cname, record)
    safe = Parse::Agent::Tools.redact_hidden_classes!(visible, agent: agent)
    Parse::Agent::ResultFormatter.simplify_object(safe)
  end
end
207
+
208
# @!visibility private
# Convert one raw storage-form Mongo hit into Parse/wire form.
#
# When conversion fails — it raises, OR it returns no record for this
# hit — the raw storage-form document must NOT be surfaced: it carries
# internal metadata (_acl, _rperm/_wperm, storage-form _p_* pointers,
# _id, _created_at/_updated_at) that the success path strips. For a
# searchable class with NO agent_fields allowlist the downstream
# allowlist projection is a pass-through, so this fallback is the only
# thing standing between those keys and the LLM. Both failure modes
# therefore return the document with every underscore-prefixed key
# dropped. NOTE: reusing Parse::PipelineSecurity.strip_internal_fields
# is NOT enough — its denylist EXCLUDES _acl, which is exactly the field
# that discloses other principals' object ids and roles. The chunk's
# object_id is read from the raw doc before this transform runs, so
# dropping _id is harmless.
#
# @param raw_doc [Hash, Object] raw document as returned by the search.
# @param cname [String] Parse class name used for the conversion.
# @return [Hash] the converted record, or the sanitized raw document
#   (`{}` when `raw_doc` is not a Hash) if conversion did not succeed.
def convert_to_parse_form(raw_doc, cname)
  converted =
    begin
      Parse::MongoDB.convert_documents_to_parse([raw_doc], cname).first
    rescue StandardError
      # Deliberate best-effort: a hit that cannot be converted degrades to
      # its sanitized form instead of failing the whole search.
      nil
    end
  return converted if converted

  raw_doc.is_a?(Hash) ? raw_doc.reject { |key, _| key.to_s.start_with?("_") } : {}
end
226
+
227
# @!visibility private
# Normalize the requested neighbour count.
#
# @param k [#to_i] caller-supplied count.
# @return [Integer] DEFAULT_K when `k` coerces to zero or less, otherwise
#   the coerced value; in both cases never more than MAX_K.
def clamp_k(k)
  requested = k.to_i
  requested = DEFAULT_K unless requested.positive?
  requested > MAX_K ? MAX_K : requested
end
233
+
234
# @!visibility private
# Build a FixedSizeOverlap chunker from caller-supplied overrides.
#
# Values are coerced (`to_i` / `to_sym`) only when they support the
# coercion. A value that does not (e.g. `true`, an Array, or an Integer
# for `by`) is handed to the chunker unchanged so that the chunker's own
# argument validation rejects it; that rejection is re-raised as a
# ValidationError instead of escaping as a NoMethodError.
#
# @param size [#to_i, nil] window size; 800 when nil.
# @param overlap [#to_i, nil] window overlap; 100 when nil.
# @param by [#to_sym, nil] chunk unit; `:chars` when nil.
# @param max_chunks_per_document [#to_i, nil] per-document cap. Only
#   passed on when the caller asked, so an unset cap keeps the chunker's
#   own default rather than forcing one here.
# @return [Parse::Retrieval::Chunker::FixedSizeOverlap, nil] nil when
#   every argument is nil (the retriever then uses its own chunker).
# @raise [Parse::Agent::ValidationError] when the chunker refuses the
#   options with an ArgumentError.
def build_chunker(size, overlap, by, max_chunks_per_document = nil)
  return nil if size.nil? && overlap.nil? && by.nil? && max_chunks_per_document.nil?

  as_integer = ->(value) { value.respond_to?(:to_i) ? value.to_i : value }
  unit = by || :chars
  unit = unit.to_sym if unit.respond_to?(:to_sym)

  opts = {
    size: as_integer.call(size || 800),
    overlap: as_integer.call(overlap || 100),
    by: unit,
  }
  unless max_chunks_per_document.nil?
    opts[:max_chunks_per_document] = as_integer.call(max_chunks_per_document)
  end
  Parse::Retrieval::Chunker::FixedSizeOverlap.new(**opts)
rescue ArgumentError => e
  raise Parse::Agent::ValidationError, "semantic_search: invalid chunker options — #{e.message}"
end
249
+
250
# @!visibility private
# Names of the class's declared embed TEXT sources — the only fields an
# agent may pass as `text_field:`. Chunk `content` is the text_field's
# value, so limiting it to embedded sources keeps the tool from surfacing
# a field the model never opted into embedding.
#
# @param klass [Object] model class; anything not responding to
#   `embed_directives` yields no fields.
# @return [Array<String>] unique source names from every directive that
#   does not report itself as an image directive.
def searchable_text_fields(klass)
  return [] unless klass.respond_to?(:embed_directives)

  text_directives = klass.embed_directives.values.reject do |directive|
    directive.respond_to?(:image?) && directive.image?
  end
  source_names = text_directives.flat_map { |directive| directive.sources }
  source_names.map { |name| name.to_s }.uniq
end
261
+
262
# @!visibility private
# Check a caller-supplied `text_field` against the class's embedded text
# sources.
#
# @param text_field [String, Symbol, nil] requested field.
# @param klass [Object] model class passed to `searchable_text_fields`.
# @return [Symbol, nil] nil for a nil/blank request (the retriever then
#   infers the field, which works for single-source models and raises
#   AmbiguousTextField for multi-source ones so the agent knows to pass
#   one); otherwise the field as a Symbol.
# @raise [Parse::Agent::ValidationError] when the field is not one of the
#   embedded text sources.
def normalize_text_field!(text_field, klass)
  requested = text_field.to_s
  return nil if text_field.nil? || requested.strip.empty?

  permitted = searchable_text_fields(klass)
  return text_field.to_sym if permitted.include?(requested)

  raise Parse::Agent::ValidationError,
        "semantic_search: text_field #{requested.inspect} is not an embedded " \
        "text source for this class (allowed: #{permitted.inspect})."
end
277
+
278
# @!visibility private
# Refuse any top-level filter key that is not in the class's declared
# `filter_fields` allowlist. Compound operators are not exempt — the
# allowlist is the complete set of top-level keys the agent may use.
#
# A nil filter, or any value that reports itself empty, passes without
# further checks.
#
# @param filter [Hash, nil] caller-supplied filter.
# @param allowed [Array<String>] permitted top-level key names.
# @return [nil]
# @raise [Parse::Agent::ValidationError] when a non-empty filter is not a
#   Hash, or when it uses a key outside `allowed`.
def assert_filter_fields_allowed!(filter, allowed)
  blank = filter.nil? || (filter.respond_to?(:empty?) && filter.empty?)
  return if blank

  raise Parse::Agent::ValidationError, "semantic_search: filter must be an object." unless filter.is_a?(Hash)

  rejected = filter.keys.map { |key| key.to_s }.select { |name| !allowed.include?(name) }
  return nil if rejected.empty?

  raise Parse::Agent::ValidationError,
        "semantic_search: filter field(s) #{rejected.inspect} are not in the " \
        "agent_searchable filter_fields allowlist (#{allowed.inspect})."
end
294
+
295
+ # JSON Schema for the registered tool's parameters.
296
+ PARAMETERS = {
297
+ "type" => "object",
298
+ "properties" => {
299
+ "class_name" => { "type" => "string", "description" => "Parse class name (must be agent_searchable)." },
300
+ "query" => { "type" => "string", "description" => "Natural-language query." },
301
+ "k" => { "type" => "integer", "default" => DEFAULT_K, "minimum" => 1, "maximum" => MAX_K },
302
+ "filter" => { "type" => "object", "description" => "Post-search field filter (allowlisted fields only)." },
303
+ "vector_filter" => { "type" => "object", "description" => "Atlas pre-search filter (allowlisted fields only)." },
304
+ "text_field" => { "type" => "string", "description" => "Which embedded text source to chunk and return as content. Required only when the class embeds more than one text field; must name one of those sources." },
305
+ "chunk_size" => { "type" => "integer", "description" => "Override chunk window size." },
306
+ "chunk_overlap" => { "type" => "integer", "description" => "Override chunk overlap." },
307
+ "chunk_by" => { "type" => "string", "enum" => %w[chars tokens], "description" => "Chunk unit." },
308
+ "max_chunks_per_document" => { "type" => "integer", "minimum" => 1, "description" => "Cap on chunks emitted per matched document." },
309
+ "max_total_tokens" => { "type" => "integer", "minimum" => 0, "description" => "Ceiling on total returned chunk-content tokens (approx chars/4). Trims lowest-ranked chunks first and sets budget_truncated. 0 disables." },
310
+ },
311
+ "required" => %w[class_name query],
312
+ }.freeze
313
+
314
+ # MCP outputSchema → mirrored as structuredContent on results.
315
+ # The parent record of each chunk is hoisted into `documents` (keyed
316
+ # by objectId) rather than duplicated inline on every chunk; map a
317
+ # chunk to its source via `metadata.object_id`.
318
+ OUTPUT_SCHEMA = {
319
+ "type" => "object",
320
+ "properties" => {
321
+ "chunks" => {
322
+ "type" => "array",
323
+ "items" => {
324
+ "type" => "object",
325
+ "properties" => {
326
+ "id" => { "type" => "string" },
327
+ "score" => { "type" => %w[number null] },
328
+ "content" => { "type" => "string" },
329
+ "metadata" => { "type" => "object" },
330
+ },
331
+ },
332
+ },
333
+ "documents" => {
334
+ "type" => "object",
335
+ "description" => "objectId => projected source record (sent once per matched document).",
336
+ },
337
+ "count" => { "type" => "integer" },
338
+ "budget_truncated" => { "type" => "boolean", "description" => "Present when the token budget dropped lowest-ranked chunks." },
339
+ "budget_dropped" => { "type" => "integer", "description" => "Number of chunks dropped by the token budget." },
340
+ },
341
+ }.freeze
342
+
343
# Register `semantic_search` with {Parse::Agent::Tools}.
#
# The file registers itself once at load time (re-requiring is a no-op
# because `require` caches); call this explicitly to register again after
# the tool registry has been reset.
#
# @return [Object] whatever `Parse::Agent::Tools.register` returns.
def register!
  summary = "Find documents semantically similar to a natural-language query and " \
            "return scored text chunks. Use when keyword matching is unlikely to " \
            "work or the question needs synthesizing across documents. The target " \
            "class must be declared `agent_searchable`."
  # The handler goes through the module's public entry point rather than
  # capturing `self`.
  entry_point = ->(agent, **args) { Parse::Retrieval::AgentTool.semantic_search(agent, **args) }

  definition = {
    name: :semantic_search,
    description: summary,
    parameters: PARAMETERS,
    permission: :readonly,
    timeout: 30,
    output_schema: OUTPUT_SCHEMA,
    client_safe: true,
    handler: entry_point,
  }
  Parse::Agent::Tools.register(**definition)
end
361
+ end
362
+ end
363
+ end
364
+
365
+ # Register at load. Requires Parse::Agent::Tools (TOOL_DEFINITIONS for the
366
+ # collision check), Parse::Retrieval (loaded with the model layer), and
367
+ # Parse::Object + MetadataDSL — all present by the time agent.rb requires
368
+ # this file at its tail.
369
+ Parse::Retrieval::AgentTool.register!
@@ -0,0 +1,74 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
module Parse
  module Retrieval
    # A single retrieved passage: one chunk of one source document,
    # carrying the document's vector-search score and (optionally
    # projected) source record.
    #
    # Produced by {Parse::Retrieval.retrieve}. Because embedding is
    # one-vector-per-record (see {Parse::Core::EmbedManaged}), every
    # chunk split from a document shares that document's single score —
    # the chunking is presentation-only, applied after retrieval.
    #
    # Instances are frozen on construction. The freeze is shallow: the
    # `content`, `source` and `metadata` objects themselves are stored as
    # given and are not frozen here.
    #
    # @!attribute [r] id
    #   @return [String] stable synthetic chunk id, `"<objectId>#<index>"`.
    # @!attribute [r] score
    #   @return [Float, nil] the parent document's Atlas vectorSearchScore,
    #     already quantized when the caller requested it.
    # @!attribute [r] content
    #   @return [String] the chunk text.
    # @!attribute [r] source
    #   @return [Hash] the parent document record. When the producer
    #     supplied a `source_transform:` (the agent tool does, projecting
    #     through `field_allowlist`), this is the projected/redacted form.
    # @!attribute [r] metadata
    #   @return [Hash] presentation metadata: `:chunk_index`,
    #     `:chunk_count`, `:chunks_truncated`, and any producer-supplied
    #     signals (e.g. `:token_chunking_degraded`).
    class Chunk
      attr_reader :id, :score, :content, :source, :metadata

      # @param id [#to_s] chunk id; stored as a String.
      # @param score [Float, nil]
      # @param content [String]
      # @param source [Hash]
      # @param metadata [Hash]
      def initialize(id:, content:, source:, score: nil, metadata: {})
        @id = id.to_s
        @score = score
        @content = content
        @source = source
        @metadata = metadata
        freeze
      end

      # @return [Hash] plain-Hash form for tool output / JSON. The `source`
      #   and `metadata` values are the stored objects, not copies.
      def to_h
        {
          id: @id,
          score: @score,
          content: @content,
          source: @source,
          metadata: @metadata,
        }
      end

      # Value equality on the identifying triple (id, score, content) —
      # convenient for tests. `source`/`metadata` are intentionally not
      # part of identity. Uses `==` on each member, so scores `1` and
      # `1.0` compare equal here.
      #
      # @param other [Object]
      # @return [Boolean]
      def ==(other)
        other.is_a?(Chunk) &&
          other.id == @id &&
          other.score == @score &&
          other.content == @content
      end

      # Hash-key equality on the same triple. Unlike {#==} this compares
      # members with `eql?`, matching what {#hash} is computed from, so two
      # chunks that are `eql?` always have the same hash (scores `1` and
      # `1.0` are `==` but not `eql?`).
      #
      # @param other [Object]
      # @return [Boolean]
      def eql?(other)
        other.is_a?(Chunk) && identity_key.eql?(other.identity_key)
      end

      # @return [Integer] hash of the identifying triple; consistent with
      #   {#eql?}.
      def hash
        identity_key.hash
      end

      protected

      # @return [Array] the (id, score, content) triple that defines identity.
      def identity_key
        [@id, @score, @content]
      end
    end
  end
end
@@ -0,0 +1,208 @@
1
+ # encoding: UTF-8
2
+ # frozen_string_literal: true
3
+
4
module Parse
  module Retrieval
    # Pluggable text-chunking strategies for the retrieval layer.
    #
    # A chunker splits a source document's text into smaller, overlapping
    # windows for presentation. {Parse::Retrieval.retrieve} fetches the
    # top-k whole records via Atlas `$vectorSearch`, then runs each
    # record's text field through a chunker so callers get focused,
    # citable passages rather than whole documents.
    #
    # == Presentation chunking, not embedding chunking
    #
    # Embedding remains one-vector-per-record (see
    # {Parse::Core::EmbedManaged}). Chunking here is purely a
    # *presentation* step applied after retrieval: every chunk produced
    # from a document inherits that document's single vector-search
    # score. The chunker never calls an embedding provider.
    #
    # == Extending
    #
    # {FixedSizeOverlap} is the default and the only strategy shipped.
    # Subclass {Base} for semantic, sentence-aware, or true
    # token-aware chunking. `normalize` returns `nil` for blank input, so
    # guard on it before splitting:
    #
    #   class SentenceChunker < Parse::Retrieval::Chunker::Base
    #     def chunk(text)
    #       source = normalize(text)
    #       return [] if source.nil?
    #       source.split(/(?<=[.!?])\s+/)
    #     end
    #   end
    #
    #   Parse::Retrieval.retrieve(
    #     query: "onboarding steps",
    #     klass: KnowledgeArticle,
    #     chunker: SentenceChunker.new,
    #   )
    module Chunker
      # Abstract base. Subclasses MUST implement {#chunk}.
      #
      # Subclasses get one free behavior from {Base}: {#chunk_with_meta},
      # which wraps {#chunk} and reports whether the result was capped.
      # {Parse::Retrieval.retrieve} calls {#chunk_with_meta} so it can
      # stamp a truncation signal onto each emitted chunk's metadata.
      class Base
        # @param text [String, nil] source document text.
        # @return [Array<String>] zero or more chunks. MUST return `[]`
        #   for blank/`nil` input.
        # @raise [NotImplementedError] unless overridden.
        def chunk(text)
          raise NotImplementedError, "#{self.class}#chunk must return Array<String>."
        end

        # Wrap {#chunk} with truncation metadata. The default
        # implementation here does NOT cap — it reports the chunk list as
        # produced. {FixedSizeOverlap} overrides this to enforce its
        # `max_chunks_per_document` cap and report the pre-cap count.
        #
        # @param text [String, nil]
        # @return [Hash] `{ chunks: Array<String>, truncated: Boolean,
        #   total_before_truncation: Integer }`.
        def chunk_with_meta(text)
          chunks = Array(chunk(text))
          { chunks: chunks, truncated: false, total_before_truncation: chunks.length }
        end

        # @!visibility private
        # Shared input normalization. Returns `nil` for `nil`,
        # non-String, empty, or whitespace-only input — every concrete
        # `#chunk` treats a `nil` return as "no chunks". A non-String is
        # treated as blank rather than raised so a document with an
        # unexpected non-text value in the chunked field is skipped, not
        # fatal, during retrieval. Non-blank text is returned unchanged
        # (it is NOT stripped).
        #
        # @param text [Object]
        # @return [String, nil]
        def normalize(text)
          return nil unless text.is_a?(String)

          text.strip.empty? ? nil : text
        end
      end

      # Fixed-size sliding-window chunker with overlap.
      #
      # Splits text into windows of `size` units, advancing by
      # `size - overlap` each step so consecutive chunks share `overlap`
      # units of context. `by: :chars` (default) counts characters;
      # `by: :tokens` counts whitespace-delimited tokens (a cheap
      # approximation — there is no model tokenizer here; see the
      # `:tokens` note below).
      #
      #   c = Parse::Retrieval::Chunker::FixedSizeOverlap.new(size: 800, overlap: 100)
      #   c.chunk(long_text) #=> ["…800 chars…", "…overlap+800…", …]
      #
      # == Amplification cap
      #
      # `max_chunks_per_document` (default 200) bounds how many chunks a
      # single document can yield. Beyond the cap the chunker
      # *truncates* — it returns the first `max_chunks_per_document`
      # chunks rather than raising — and {#chunk_with_meta} reports
      # `truncated: true`. This is the DoS guard: a 10 MB field at
      # 800-char windows would otherwise yield ~12,500 chunks. The
      # pre-cap window count is computed arithmetically and only the kept
      # windows are ever built, so the cap bounds allocation as well as
      # output size.
      #
      # == `:tokens`
      #
      # `by: :tokens` treats `size`/`overlap` as literal whitespace-token
      # counts supplied by the caller. The chunker does NOT consult an
      # embedding provider's `max_input_tokens`; that hint is the
      # caller's concern (see {Parse::Retrieval.retrieve}). The chunker
      # always does exactly what it was constructed with and never
      # silently switches modes.
      class FixedSizeOverlap < Base
        # @return [Integer] window width in `by:` units.
        attr_reader :size
        # @return [Integer] units shared between consecutive windows.
        attr_reader :overlap
        # @return [Symbol] `:chars` or `:tokens`.
        attr_reader :by
        # @return [Integer] hard cap on chunks emitted per document.
        attr_reader :max_chunks_per_document

        # @param size [Integer] window width (> 0).
        # @param overlap [Integer] shared units between windows
        #   (`0 <= overlap < size`).
        # @param by [Symbol] `:chars` (default) or `:tokens`.
        # @param max_chunks_per_document [Integer] cap (> 0, default 200).
        # @raise [ArgumentError] on any out-of-range argument. In
        #   particular `overlap >= size` is refused: a non-shrinking
        #   stride would never advance.
        def initialize(size: 800, overlap: 100, by: :chars, max_chunks_per_document: 200)
          unless size.is_a?(Integer) && size > 0
            raise ArgumentError, "size must be a positive Integer (got #{size.inspect})."
          end
          unless overlap.is_a?(Integer) && overlap >= 0
            raise ArgumentError, "overlap must be a non-negative Integer (got #{overlap.inspect})."
          end
          if overlap >= size
            raise ArgumentError,
                  "overlap (#{overlap}) must be strictly less than size (#{size}); " \
                  "a stride of size - overlap <= 0 would never advance."
          end
          unless %i[chars tokens].include?(by)
            raise ArgumentError, "by must be :chars or :tokens (got #{by.inspect})."
          end
          unless max_chunks_per_document.is_a?(Integer) && max_chunks_per_document > 0
            raise ArgumentError,
                  "max_chunks_per_document must be a positive Integer " \
                  "(got #{max_chunks_per_document.inspect})."
          end
          @size = size
          @overlap = overlap
          @by = by
          @max_chunks_per_document = max_chunks_per_document
          @stride = size - overlap
        end

        # @param text [String, nil]
        # @return [Array<String>] chunks (capped at
        #   {#max_chunks_per_document}). `[]` for blank input.
        def chunk(text)
          chunk_with_meta(text)[:chunks]
        end

        # Chunk `text` and report whether the cap cut the result short.
        #
        # @param text [String, nil]
        # @return [Hash] `{ chunks: Array<String>, truncated: Boolean,
        #   total_before_truncation: Integer }`, where
        #   `total_before_truncation` is the number of windows the text
        #   spans before the cap is applied.
        def chunk_with_meta(text)
          source = normalize(text)
          return { chunks: [], truncated: false, total_before_truncation: 0 } if source.nil?

          # In :tokens mode the windows slide over the token list; in
          # :chars mode they slide over the string itself.
          units = @by == :tokens ? source.split(/\s+/).reject(&:empty?) : source
          total = window_count(units.length)
          kept = total < @max_chunks_per_document ? total : @max_chunks_per_document

          # Build only the windows that survive the cap.
          chunks = Array.new(kept) { |index| window_at(units, index) }
          { chunks: chunks,
            truncated: total > @max_chunks_per_document,
            total_before_truncation: total }
        end

        private

        # Number of windows needed to cover `length` units: one window per
        # stride step whose start offset is still inside the input.
        def window_count(length)
          (length + @stride - 1) / @stride
        end

        # The `index`-th window: up to `size` units starting at
        # `index * stride`. Token windows are re-joined with single spaces.
        def window_at(units, index)
          slice = units[index * @stride, @size]
          @by == :tokens ? slice.join(" ") : slice
        end
      end
    end
  end
end