@rubytech/create-realagent 1.0.829 → 1.0.830
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/payload/platform/config/brand.json +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.d.ts +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.d.ts.map +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.js +21 -0
- package/payload/platform/lib/oauth-llm/dist/index.js.map +1 -1
- package/payload/platform/lib/oauth-llm/src/index.ts +24 -0
- package/payload/platform/neo4j/migrations/007-conversation-archive-source.ts +116 -0
- package/payload/platform/neo4j/schema.cypher +12 -3
- package/payload/platform/plugins/admin/hooks/__tests__/archive-ingest-surface-gate.test.sh +54 -39
- package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +64 -26
- package/payload/platform/plugins/contacts/mcp/dist/index.js +5 -5
- package/payload/platform/plugins/contacts/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts.map +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js +29 -23
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js.map +1 -1
- package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
- package/payload/platform/plugins/memory/PLUGIN.md +2 -1
- package/payload/platform/plugins/memory/bin/conversation-archive-ingest.mjs +541 -0
- package/payload/platform/plugins/memory/bin/conversation-archive-ingest.sh +106 -0
- package/payload/platform/plugins/memory/mcp/dist/index.js +30 -16
- package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +4 -3
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js +11 -6
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts +5 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js +30 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts +48 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js +23 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts +3 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js +237 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts +11 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js +21 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts +16 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js +39 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts +17 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js +90 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts +9 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js +32 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts +3 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js +27 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts +45 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js +125 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +24 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +266 -16
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js +9 -2
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js +75 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js +67 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +34 -3
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts +17 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js +34 -13
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +18 -7
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +24 -8
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
- package/payload/platform/plugins/memory/references/schema-base.md +2 -2
- package/payload/platform/plugins/memory/skills/conversation-archive/SKILL.md +133 -0
- package/payload/platform/plugins/memory/skills/document-ingest/SKILL.md +5 -2
- package/payload/platform/plugins/whatsapp/PLUGIN.md +1 -1
- package/payload/platform/scripts/seed-neo4j.sh +15 -15
- package/payload/platform/templates/specialists/agents/database-operator.md +8 -9
- package/payload/server/chunk-7BO5HDJC.js +10093 -0
- package/payload/server/chunk-EL4DZ56X.js +1116 -0
- package/payload/server/chunk-QOJ2D26Z.js +654 -0
- package/payload/server/chunk-RC46ZYGT.js +2305 -0
- package/payload/server/client-pool-7NTEFNVQ.js +32 -0
- package/payload/server/cloudflare-task-tracker-WE77WXSI.js +19 -0
- package/payload/server/maxy-edge.js +3 -3
- package/payload/server/neo4j-migrations-4XPNJNM6.js +490 -0
- package/payload/server/server.js +6 -6
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts
@@ -0,0 +1,45 @@
+/**
+ * Deterministic prose chunker for oversize document classification (Task 896).
+ *
+ * Splits a document into overlapping fixed-size chunks so each chunk fits
+ * inside Haiku's input context window. The chunker is purely mechanical —
+ * it makes no semantic claim about where chunk boundaries should fall.
+ * Ontological boundaries remain Haiku's job per Task 737 (the document
+ * chunker that *did* try to be semantic was deleted because it leaked
+ * sections at the boundaries it picked).
+ *
+ * Overlap exists so a section straddling a chunk boundary appears in BOTH
+ * surrounding chunks; the merge step then unions the same-kind ranges so
+ * the boundary section isn't double-counted in the writer.
+ *
+ * Char counts are estimated from token counts via a fixed 3.5 chars/token
+ * ratio (English prose average). The estimate is conservative — Haiku
+ * tokenises slightly differently per script, but 3.5 leaves ~10% headroom
+ * for non-English content before bumping into the model's hard ceiling.
+ */
+export interface RangedSection {
+    /** Section kind from the classifier's closed enumeration. */
+    kind: string;
+    /** Inclusive whole-document start offset. */
+    sourceStart: number;
+    /** Exclusive whole-document end offset. */
+    sourceEnd: number;
+    /** Per-section summary; longer wins on merge tie-break. */
+    summary: string;
+}
+export interface DocumentChunk {
+    /** Substring of the source document covered by this chunk. */
+    chunkText: string;
+    /** Whole-document offset where this chunk's text begins. */
+    baseOffset: number;
+}
+export interface ChunkOptions {
+    /** Maximum chunk length in characters (already token→char converted). */
+    chunkSize: number;
+    /** Overlap in characters between consecutive chunks. */
+    overlap: number;
+}
+export declare function chunkDocument(text: string, opts: ChunkOptions): DocumentChunk[];
+export declare const MERGE_OVERLAP_THRESHOLD = 0.5;
+export declare function mergeOverlappingSections<T extends RangedSection>(input: T[]): T[];
+//# sourceMappingURL=document-chunker.d.ts.map
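A toy walk-through of the declared `chunkDocument` contract (not from the package; the input and numbers are invented for illustration). The stride is `chunkSize - overlap`, and the final window is allowed to be short:

```ts
// Hypothetical illustration of the chunkDocument contract declared above.
declare function chunkDocument(
  text: string,
  opts: { chunkSize: number; overlap: number },
): { chunkText: string; baseOffset: number }[];

const text = "abcdefghijklmnopqrstuvwx"; // 24 chars
const chunks = chunkDocument(text, { chunkSize: 10, overlap: 3 });
// stride = 10 - 3 = 7, so the windows are:
//   { baseOffset: 0,  chunkText: "abcdefghij" }  // [0, 10)
//   { baseOffset: 7,  chunkText: "hijklmnopq" }  // [7, 17)
//   { baseOffset: 14, chunkText: "opqrstuvwx" }  // [14, 24)
// Consecutive windows share exactly `overlap` characters ("hij", "opq"),
// which is what lets a boundary-straddling section appear in both.
```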
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"document-chunker.d.ts","sourceRoot":"","sources":["../../src/lib/document-chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAQH,MAAM,WAAW,aAAa;IAC5B,6DAA6D;IAC7D,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,WAAW,EAAE,MAAM,CAAC;IACpB,2CAA2C;IAC3C,SAAS,EAAE,MAAM,CAAC;IAClB,2DAA2D;IAC3D,OAAO,EAAE,MAAM,CAAC;CACjB;AAMD,MAAM,WAAW,aAAa;IAC5B,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAC;IAClB,4DAA4D;IAC5D,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,YAAY;IAC3B,yEAAyE;IACzE,SAAS,EAAE,MAAM,CAAC;IAClB,wDAAwD;IACxD,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,GAAG,aAAa,EAAE,CA4B/E;AAqBD,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C,wBAAgB,wBAAwB,CAAC,CAAC,SAAS,aAAa,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAqDjF"}
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js
@@ -0,0 +1,125 @@
+/**
+ * Deterministic prose chunker for oversize document classification (Task 896).
+ *
+ * Splits a document into overlapping fixed-size chunks so each chunk fits
+ * inside Haiku's input context window. The chunker is purely mechanical —
+ * it makes no semantic claim about where chunk boundaries should fall.
+ * Ontological boundaries remain Haiku's job per Task 737 (the document
+ * chunker that *did* try to be semantic was deleted because it leaked
+ * sections at the boundaries it picked).
+ *
+ * Overlap exists so a section straddling a chunk boundary appears in BOTH
+ * surrounding chunks; the merge step then unions the same-kind ranges so
+ * the boundary section isn't double-counted in the writer.
+ *
+ * Char counts are estimated from token counts via a fixed 3.5 chars/token
+ * ratio (English prose average). The estimate is conservative — Haiku
+ * tokenises slightly differently per script, but 3.5 leaves ~10% headroom
+ * for non-English content before bumping into the model's hard ceiling.
+ */
+export function chunkDocument(text, opts) {
+    const { chunkSize, overlap } = opts;
+    if (chunkSize <= 0) {
+        throw new Error(`chunkDocument: chunkSize must be positive, got ${chunkSize}`);
+    }
+    if (overlap < 0) {
+        throw new Error(`chunkDocument: overlap must be non-negative, got ${overlap}`);
+    }
+    if (overlap >= chunkSize) {
+        throw new Error(`chunkDocument: overlap (${overlap}) must be less than chunkSize (${chunkSize})`);
+    }
+    if (text.length === 0)
+        return [];
+    if (text.length <= chunkSize) {
+        // One-chunk fast path so callers that always-chunk don't pay the
+        // window-stepping arithmetic for inputs that already fit.
+        return [{ chunkText: text, baseOffset: 0 }];
+    }
+    const chunks = [];
+    const stride = chunkSize - overlap;
+    let start = 0;
+    while (start < text.length) {
+        const end = Math.min(start + chunkSize, text.length);
+        chunks.push({ chunkText: text.slice(start, end), baseOffset: start });
+        if (end >= text.length)
+            break;
+        start += stride;
+    }
+    return chunks;
+}
+// ---------------------------------------------------------------------------
+// mergeOverlappingSections — collates per-chunk classifier results.
+//
+// Algorithm: group sections by `kind`, sort by `sourceStart`, then walk and
+// union consecutive same-kind ranges whose intersection covers more than
+// MERGE_OVERLAP_THRESHOLD of the smaller range. The longer summary wins on
+// merge — empirically Haiku's longer summary on a chunk that saw more
+// surrounding context tends to be the better one.
+//
+// Cross-kind overlap is preserved: chunk A's `Position` and chunk B's
+// `Other` covering the same range are kept as two distinct sections (per
+// eng review). The classifier disagreed about kind; the writer's downstream
+// :Section:Other surfacing will let the operator decide which one wins
+// during ontology growth review.
+//
+// Disjoint same-kind sections are also preserved — only adjacent ranges
+// with material overlap are merged.
+// ---------------------------------------------------------------------------
+export const MERGE_OVERLAP_THRESHOLD = 0.5;
+export function mergeOverlappingSections(input) {
+    if (input.length <= 1)
+        return input.slice();
+    // Group by kind so we never accidentally merge across kinds.
+    const byKind = new Map();
+    for (const s of input) {
+        const arr = byKind.get(s.kind);
+        if (arr)
+            arr.push(s);
+        else
+            byKind.set(s.kind, [s]);
+    }
+    const merged = [];
+    for (const group of byKind.values()) {
+        group.sort((a, b) => a.sourceStart - b.sourceStart || a.sourceEnd - b.sourceEnd);
+        let current = null;
+        for (const s of group) {
+            if (current === null) {
+                current = { ...s };
+                continue;
+            }
+            const intersection = Math.max(0, Math.min(current.sourceEnd, s.sourceEnd) - Math.max(current.sourceStart, s.sourceStart));
+            if (intersection === 0) {
+                merged.push(current);
+                current = { ...s };
+                continue;
+            }
+            const currentLen = current.sourceEnd - current.sourceStart;
+            const sLen = s.sourceEnd - s.sourceStart;
+            const overlapFraction = intersection / Math.min(currentLen, sLen);
+            if (overlapFraction > MERGE_OVERLAP_THRESHOLD) {
+                // Union the range; the section with the longer body contributes its
+                // non-range fields (title, properties, anchorEdge, related, etc.) on
+                // the assumption that a wider classification window grounded its
+                // properties more reliably. Summary always picks the longer of the two.
+                const fieldsWinner = sLen > currentLen ? s : current;
+                current = {
+                    ...fieldsWinner,
+                    sourceStart: Math.min(current.sourceStart, s.sourceStart),
+                    sourceEnd: Math.max(current.sourceEnd, s.sourceEnd),
+                    summary: s.summary.length > current.summary.length ? s.summary : current.summary,
+                };
+            }
+            else {
+                merged.push(current);
+                current = { ...s };
+            }
+        }
+        if (current !== null)
+            merged.push(current);
+    }
+    // Return in whole-document reading order so the writer's :NEXT chain
+    // maps to source order.
+    merged.sort((a, b) => a.sourceStart - b.sourceStart);
+    return merged;
+}
+//# sourceMappingURL=document-chunker.js.map
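A usage sketch of the merge behaviour described in the comments above (illustrative values, not from the package's tests): two same-kind ranges whose intersection exceeds half of the smaller range collapse into one unioned section, while a different-kind section covering the same span survives:

```ts
// Hypothetical input, importing the merge helper added in this version.
import { mergeOverlappingSections } from "./document-chunker.js";

const sections = [
  { kind: "Position", sourceStart: 100, sourceEnd: 300, summary: "Role at Acme, 2019-2022, led the platform team." },
  { kind: "Position", sourceStart: 220, sourceEnd: 340, summary: "Role at Acme." },
  { kind: "Other",    sourceStart: 100, sourceEnd: 340, summary: "Unclassified block." },
];
const merged = mergeOverlappingSections(sections);
// Intersection [220, 300) = 80 chars; the smaller range is 120 chars, so the
// overlap fraction is 80/120 = 0.67 > 0.5 and the two Position ranges union
// to [100, 340). The longer summary wins. The Other section is untouched,
// since cross-kind overlap is preserved. Result (in sourceStart order):
//   { kind: "Position", sourceStart: 100, sourceEnd: 340, summary: "Role at Acme, 2019-2022, led the platform team." }
//   { kind: "Other",    sourceStart: 100, sourceEnd: 340, summary: "Unclassified block." }
```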
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"document-chunker.js","sourceRoot":"","sources":["../../src/lib/document-chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAqCH,MAAM,UAAU,aAAa,CAAC,IAAY,EAAE,IAAkB;IAC5D,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,GAAG,IAAI,CAAC;IACpC,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,kDAAkD,SAAS,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;QAChB,MAAM,IAAI,KAAK,CAAC,oDAAoD,OAAO,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,IAAI,OAAO,IAAI,SAAS,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,2BAA2B,OAAO,kCAAkC,SAAS,GAAG,CAAC,CAAC;IACpG,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC7B,iEAAiE;QACjE,0DAA0D;QAC1D,OAAO,CAAC,EAAE,SAAS,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC;IACnC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,UAAU,EAAE,KAAK,EAAE,CAAC,CAAC;QACtE,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM;QAC9B,KAAK,IAAI,MAAM,CAAC;IAClB,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,8EAA8E;AAC9E,oEAAoE;AACpE,EAAE;AACF,4EAA4E;AAC5E,yEAAyE;AACzE,2EAA2E;AAC3E,sEAAsE;AACtE,kDAAkD;AAClD,EAAE;AACF,sEAAsE;AACtE,yEAAyE;AACzE,4EAA4E;AAC5E,uEAAuE;AACvE,iCAAiC;AACjC,EAAE;AACF,wEAAwE;AACxE,oCAAoC;AACpC,8EAA8E;AAE9E,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C,MAAM,UAAU,wBAAwB,CAA0B,KAAU;IAC1E,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;IAE5C,6DAA6D;IAC7D,MAAM,MAAM,GAAG,IAAI,GAAG,EAAe,CAAC;IACtC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,GAAG;YAAE,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;;YAChB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/B,CAAC;IAED,MAAM,MAAM,GAAQ,EAAE,CAAC;IACvB,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,IAAI,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;QACjF,IAAI,OAAO,GAAa,IAAI,CAAC;QAC7B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACtB,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;gBACnB,SAAS;YACX,CAAC;YACD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;YAC1H,IAAI,YAAY,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;gBACnB,SAAS;YACX,CAAC;YACD,MAAM,UAAU,GAAW,OAAO,CAAC,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;YACnE,MAAM,IAAI,GAAW,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,WAAW,CAAC;YACjD,MAAM,eAAe,GAAG,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;YAClE,IAAI,eAAe,GAAG,uBAAuB,EAAE,CAAC;gBAC9C,oEAAoE;gBACpE,qEAAqE;gBACrE,iEAAiE;gBACjE,wEAAwE;gBACxE,MAAM,YAAY,GAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;gBACxD,OAAO,GAAG;oBACR,GAAG,YAAY;oBACf,WAAW,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,WAAW,CAAC;oBACzD,SAAS,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC;oBACnD,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO;iBACjF,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;YACrB,CAAC;QACH,CAAC;QACD,IAAI,OAAO,KAAK,IAAI;YAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC7C,CAAC;IAED,qEAAqE;IACrE,wBAAwB;IACxB,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,CAAC,CAAC;IACrD,OAAO,MAAM,CAAC;AAChB,CAAC"}
package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts
@@ -70,8 +70,31 @@ export interface ClassifiedSection {
     kind: string;
     /** Short human-readable title for the section. */
     title: string;
-    /**
+    /**
+     * The section's body text — embedded and stored on the section node.
+     *
+     * Task 896: server-reconstructed via `documentText.slice(sourceStart, sourceEnd)`.
+     * The LLM emits offsets, never the body text — output size becomes O(sections),
+     * not O(input chars). Callers consume the same `body: string` shape as before.
+     */
     body: string;
+    /**
+     * 1-3 sentence summary of the section, ≤500 chars (server-validated).
+     * The LLM emits this; the server truncates if oversize. Stored as
+     * `properties.summary` on the section node so adjacency search can
+     * surface it without rehydrating the body.
+     */
+    summary: string;
+    /**
+     * Whole-document character offsets — inclusive start, exclusive end.
+     * The LLM emits these; the server validates bounds and reconstructs
+     * `body` via `documentText.slice(sourceStart, sourceEnd)`. In the
+     * chunked-classify path these are translated from chunk-local to
+     * whole-document coordinates so the merge step can detect boundary
+     * straddlers across chunks.
+     */
+    sourceStart: number;
+    sourceEnd: number;
     /** Properties on the section node (excluding accountId/embedding/provenance). */
     properties: Record<string, unknown>;
     /**
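The offset contract above is easy to exercise in isolation. A minimal sketch of the server-side reconstruction the doc comments describe (the document text and offsets here are made up):

```ts
// Hypothetical illustration of the offsets-not-body contract: the model
// returns only sourceStart/sourceEnd, and the server recovers the body
// from the text it already holds.
const documentText = "SUMMARY\nSenior engineer.\n\nEXPERIENCE\nAcme Corp, 2019-2022.\n";
const llmSection = { kind: "Position", title: "Acme Corp", sourceStart: 26, sourceEnd: 59 };
const body = documentText.slice(llmSection.sourceStart, llmSection.sourceEnd);
// body === "EXPERIENCE\nAcme Corp, 2019-2022.\n": byte-equal recovery with
// no body bytes in the model's output, so output stays O(sections).
```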
package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map
@@ -1 +1 @@
-{"version":3,"file":"llm-classifier.d.ts","sourceRoot":"","sources":["../../src/lib/llm-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;
+{"version":3,"file":"llm-classifier.d.ts","sourceRoot":"","sources":["../../src/lib/llm-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAUH,+DAA+D;AAC/D,MAAM,MAAM,mBAAmB,GAAG,aAAa,GAAG,WAAW,CAAC;AAE9D,mEAAmE;AACnE,MAAM,MAAM,oBAAoB,GAAG,UAAU,GAAG,UAAU,CAAC;AAE3D,kFAAkF;AAClF,MAAM,WAAW,iBAAiB;IAChC,8DAA8D;IAC9D,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC,oDAAoD;IACpD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,oBAAoB,CAAC;QAChC,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,CAAC;IACF;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED,oGAAoG;AACpG,MAAM,WAAW,iBAAiB;IAChC;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,IAAI,EAAE,MAAM,CAAC;IACb,kDAAkD;IAClD,KAAK,EAAE,MAAM,CAAC;IACd;;;;;;OAMG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,OAAO,EAAE,MAAM,CAAC;IAChB;;;;;;;OAOG;IACH,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,iFAAiF;IACjF,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC;;;;OAIG;IACH,UAAU,EAAE;QACV,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,mBAAmB,CAAC;QAC/B,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,GAAG,IAAI,CAAC;IACT,oFAAoF;IACpF,OAAO,CAAC,EAAE,iBAAiB,EAAE,CAAC;IAC9B;;;;;;OAMG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,2CAA2C;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,6EAA6E;IAC7E,KAAK,EAAE,MAAM,CAAC;IACd,sEAAsE;IACtE,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,oCAAoC;AACpC,MAAM,WAAW,gBAAgB;IAC/B,kDAAkD;IAClD,eAAe,EAAE,MAAM,CAAC;IACxB,kEAAkE;IAClE,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,6BAA6B;IAC7B,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,iFAAiF;IACjF,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC;;+CAE2C;IAC3C,aAAa,CAAC,EAAE,KAAK,CAAC;QACpB,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,UAAU,GAAG,UAAU,CAAC;QACnC,UAAU,EAAE,MAAM,CAAC;QACnB,gBAAgB,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,wEAAwE;QACxE,KAAK,CAAC,EAAE,OAAO,CAAC;KACjB,CAAC,CAAC;IACH,mFAAmF;IACnF,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,MAAM,cAAc,GACtB;IAAE,IAAI,EAAE,IAAI,CAAC;IAAC,MAAM,EAAE,gBAAgB,CAAA;CAAE,GACxC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC;AA8CzC;;;;;;;;GAQG;AACH,eAAO,MAAM,kBAAkB,UAAU,CAAC;AAE1C,eAAO,MAAM,sBAAsB,wEAMzB,CAAC;AAEX,eAAO,MAAM,wBAAwB,yKAa3B,CAAC;AAEX,eAAO,MAAM,sBAAsB,4SAqBzB,CAAC;AAEX,8EAA8E;AAC9E,eAAO,MAAM,qBAAqB,sBAAuB,CAAC;AAE1D,eAAO,MAAM,iBAAiB,EAAE,WAAW,CAAC,MAAM,CAMhD,CAAC;AAoJH,MAAM,WAAW,cAAc;IAC7B,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;;;;OAOG;IACH,IAAI,CAAC,EAAE,UAAU,GAAG,MAAM,CAAC;IAC3B;;;;;;;;OAQG;IACH,iBAAiB,EAAE,MAAM,CAAC;IAC1B;;;;;;OAMG;IACH,cAAc,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;IACpC;;;;;;OAMG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB,8EAA8E;IAC9E,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,gBAAgB,CACpC,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC,cAAc,CAAC,CA0UzB"}
package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js
@@ -23,10 +23,45 @@
  */
 import { callOauthLlm } from "../../../../../lib/oauth-llm/dist/index.js";
 import { HAIKU_MODEL } from "../../../../../lib/models/dist/index.js";
+import { chunkDocument, mergeOverlappingSections } from "./document-chunker.js";
 // ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
 const MAX_OUTPUT_TOKENS = 8192;
+/**
+ * Per-section summary cap (Task 896 clause 1). The classifier prompt asks
+ * for ≤500 chars; the server truncates anything longer with an ellipsis
+ * marker so a single overlong summary never inflates the output JSON
+ * unbounded. Truncation is observable (logged once per oversize section)
+ * but not fatal — Haiku usually respects the cap.
+ */
+const SUMMARY_MAX_CHARS = 500;
+/**
+ * Output budget the prompt advertises to the model. ≈6000 tokens leaves
+ * headroom under MAX_OUTPUT_TOKENS=8192 for a few hundred sections of
+ * offsets + short summaries without re-emitting body text. Pre-Task-896
+ * the verbatim-body schema made output ≈ input — a 251K-char Adam Mackay
+ * archive truncated mid-word at 8K.
+ */
+const PROMPT_OUTPUT_TOKEN_BUDGET = 6000;
+// ---------------------------------------------------------------------------
+// Task 896 clause 3 — chunker constants for oversize prose.
+//
+// Haiku 4.5: 200K input tokens. Reserve ~5K for prompt + system overhead
+// → ~195K usable tokens × ~3.5 chars/token = ~682K char input ceiling per
+// Haiku call. The chunker emits chunks of ~150K tokens (~525K chars) with
+// ~5K-token (~17.5K-char) overlap so a section straddling the boundary
+// surfaces in both surrounding chunks for the merge step.
+// ---------------------------------------------------------------------------
+const CHARS_PER_TOKEN_ESTIMATE = 3.5;
+const HAIKU_INPUT_TOKEN_BUDGET = 195_000;
+/** Per-Haiku-call hard ceiling on `documentText` characters — enforced regardless of mode. */
+const INPUT_CHAR_CEILING = Math.floor(HAIKU_INPUT_TOKEN_BUDGET * CHARS_PER_TOKEN_ESTIMATE);
+const CHUNK_TOKEN_SIZE = 150_000;
+const CHUNK_OVERLAP_TOKENS = 5_000;
+/** Target chunk char size for the prose chunker (Task 896 clause 3). */
+const CHUNK_CHAR_SIZE = Math.floor(CHUNK_TOKEN_SIZE * CHARS_PER_TOKEN_ESTIMATE);
+const CHUNK_OVERLAP_CHARS = Math.floor(CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN_ESTIMATE);
 /**
  * Closed enumeration of section `kind` values. Each becomes a secondary
  * label on the `:Section` node (e.g. `:Section:Position`). Anything outside
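Worked out, the token-to-char arithmetic in the new constants gives the following derived values (computed directly from the constants in this hunk; the 3.5 ratio is the file's own stated estimate):

```ts
// Derived values for the chunker constants introduced above:
const CHARS_PER_TOKEN_ESTIMATE = 3.5;
const HAIKU_INPUT_TOKEN_BUDGET = 195_000;
const CHUNK_TOKEN_SIZE = 150_000;
const CHUNK_OVERLAP_TOKENS = 5_000;

Math.floor(HAIKU_INPUT_TOKEN_BUDGET * CHARS_PER_TOKEN_ESTIMATE); // 682_500 (INPUT_CHAR_CEILING)
Math.floor(CHUNK_TOKEN_SIZE * CHARS_PER_TOKEN_ESTIMATE);         // 525_000 (CHUNK_CHAR_SIZE)
Math.floor(CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN_ESTIMATE);     // 17_500  (CHUNK_OVERLAP_CHARS)
// CHUNK_CHAR_SIZE (525,000) < INPUT_CHAR_CEILING (682,500), so every chunk
// fits a single Haiku call with ~157K chars of headroom, and documents up
// to 525K chars still take the cheaper single-shot path.
```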
@@ -55,10 +90,7 @@ export const STRUCTURAL_SECTION_KINDS = [
     "Bibliography",
     "Glossary",
     "Acknowledgments",
-    // Task 891
-    // The chat-mode system prompt restricts output to this single kind; the
-    // document-mode prompt never names it. Listed here so the validator's closed
-    // enumeration accepts it without a per-mode dictionary split.
+    // Task 891: chat-mode kind. Emitted only when mode==='chat'; listed here so the validator's closed enumeration accepts it.
     "Conversation",
 ];
 export const CONTRACT_SECTION_KINDS = [
@@ -115,14 +147,17 @@ const CHAT_SYSTEM_PROMPT = [
     " [DD/MM/YYYY, HH:MM:SS] <Sender>: <body>",
     " [DD/MM/YY, HH:MM:SS] <Sender>: <body>",
     " [YYYY-MM-DD HH:MM:SS] <Sender>: <body>",
-    "Body lines without a leading bracketed timestamp belong to the previous message (multi-line bodies). System messages (no sender) and media-only lines (e.g. '<Media omitted>')
+    "Body lines without a leading bracketed timestamp belong to the previous message (multi-line bodies). System messages (no sender) and media-only lines (e.g. '<Media omitted>') belong inside the chunk that covers their position.",
+    "",
+    `OUTPUT BUDGET — your JSON response must fit within ~${PROMPT_OUTPUT_TOKEN_BUDGET} output tokens. Use offsets — NEVER re-emit body text. The server reconstructs each chunk's body from your offsets via documentText.slice(sourceStart, sourceEnd) and stores it on the node, so byte-equal recovery works without you transmitting the bytes.`,
     "",
     "Each chunk is a JSON object with:",
     "- 'kind': MUST be exactly 'Conversation'. No other kinds are legal in chat mode.",
     "- 'title': short human-readable topic label for the chunk (max 120 chars).",
-
+    `- 'summary': 1-3 sentences describing what this chunk is about. Hard ceiling ${SUMMARY_MAX_CHARS} characters — the server truncates anything longer.`,
+    "- 'sourceStart': INTEGER character offset into the supplied archive text where this chunk's first message begins (0-indexed, inclusive). MUST point at the opening '[' of the bracketed timestamp prefix.",
+    "- 'sourceEnd': INTEGER character offset where this chunk ends (exclusive). MUST be > sourceStart and ≤ total length of the supplied text.",
     "- 'properties': required typed properties on the chunk node:",
-    "  summary : 1-3 sentences describing what this chunk is about (this is your one chance to summarise; the body stays verbatim).",
     "  keywords : array of 3-10 lowercase topic keywords for retrieval.",
     "  firstMessageAt : timestamp of the first message in the chunk, copied verbatim from the line prefix (preserve the file's native format and any offset).",
     "  lastMessageAt : timestamp of the last message in the chunk, copied verbatim from the line prefix.",
@@ -142,7 +177,7 @@ const CHAT_SYSTEM_PROMPT = [
     "- Split at topic transitions, not at message count or arbitrary intervals. A coherent exchange ('let's discuss the deck') is one chunk; a separate exchange ('what time tomorrow?') is another.",
     "- An archive of fewer than ~10 messages is usually one chunk.",
     "- Even a one-message archive must produce one chunk — never return zero chunks for non-empty input.",
-    "-
+    "- Offset coverage: chunks MUST cover every message in chronological order. Adjacent chunks should be contiguous (chunk N's sourceEnd equals chunk N+1's sourceStart) so no message is skipped. messageCount summed across chunks equals total archive messages.",
     "",
     "Respond with ONLY the JSON object, no prose, no markdown fences.",
 ].join("\n");
@@ -154,6 +189,8 @@ const SYSTEM_PROMPT = [
     "2. The natural-edge map naming the anchor edge for identity-kind sections.",
     "3. The full document text.",
     "",
+    `OUTPUT BUDGET — your JSON response must fit within ~${PROMPT_OUTPUT_TOKEN_BUDGET} output tokens. Use offsets — NEVER re-emit body text. The server reconstructs each section's body from your offsets via documentText.slice(sourceStart, sourceEnd) and stores it on the node. Per-section 'summary' is hard-capped at ${SUMMARY_MAX_CHARS} chars.`,
+    "",
     "Closed enumeration of section `kind` values:",
     ` Identity (anchor edge to subject): ${IDENTITY_SECTION_KINDS.join(", ")}`,
     ` Document-structural (no anchor edge; HAS_SECTION + NEXT only): ${STRUCTURAL_SECTION_KINDS.join(", ")}`,
@@ -164,7 +201,9 @@ const SYSTEM_PROMPT = [
     "For each meaningful section, return a JSON object with:",
     "- 'kind': one of the closed-enumeration values above. Never invent new kinds; use 'Other' with a 'classifierReason' if nothing fits.",
     "- 'title': short human-readable title (max 120 chars).",
-
+    `- 'summary': 1-3 sentences describing the section. Hard ceiling ${SUMMARY_MAX_CHARS} characters — the server truncates anything longer.`,
+    "- 'sourceStart': INTEGER character offset into the supplied document text where this section begins (0-indexed, inclusive).",
+    "- 'sourceEnd': INTEGER character offset where this section ends (exclusive). MUST be > sourceStart and ≤ total length of the supplied text.",
     "- 'properties': any typed properties for the section node (e.g. for Position: jobTitle, startDate, endDate; for Education: degree, fieldOfStudy; do NOT include accountId, embedding, createdAt, or other system fields — the writer adds them).",
     "- 'anchorEdge': for identity-kind sections (Position, Education, Credential, Skill, Biography) and for standalone Project, an object { type, direction, properties } naming the natural edge to the document subject (e.g. UserProfile -[HAS_POSITION]-> the Section). 'direction' is 'from-anchor' if the subject points at the section, 'to-anchor' if the section points at the subject. Set to null for structural + contract-clause kinds and for 'Other'.",
     "- 'related': optional array of additional entity nodes this section references (e.g. a Position section's employer Organization via AT, an Education section's school Organization via ATTENDED). Each entry: { kind, properties, edge: { type, direction, properties }, merge: true|false }. Direction is 'outgoing' (section -> related) or 'incoming' (section <- related). Use 'merge': true for entities reused across documents (Organization by name, Person by email/telephone).",
@@ -186,7 +225,7 @@ const SYSTEM_PROMPT = [
     "- 'kind' values are restricted to the closed enumeration above. If a section truly fits no listed kind, use 'Other' with a 'classifierReason'. Never emit a kind not on the list.",
     "- Never invent edge names. Use the natural-edge map exactly as given. The graph validator rejects writes with unknown edge types.",
     "- Be conservative with 'related' entities — only include them when the section explicitly names them.",
-    "-
+    "- Offsets cover the source: sourceStart and sourceEnd are integer character positions in the supplied document text. Do not re-emit body text — the server reconstructs it from your offsets.",
     "- Respond with ONLY the JSON object, no prose, no markdown fences.",
 ].join("\n");
 // ---------------------------------------------------------------------------
@@ -206,6 +245,22 @@ function asString(v) {
 function asObject(v) {
     return v && typeof v === "object" && !Array.isArray(v) ? v : null;
 }
+/**
+ * Coerce a JSON value into a non-negative integer character offset, or null
+ * if it isn't one. Floats, NaN, negatives, and non-numbers all return null —
+ * Haiku has been observed emitting `null` and stringly-typed offsets when
+ * stressed; we drop the section silently and let the missing-offsets
+ * diagnostic surface the rate.
+ */
+function asNonNegativeInt(v) {
+    if (typeof v !== "number")
+        return null;
+    if (!Number.isFinite(v) || !Number.isInteger(v))
+        return null;
+    if (v < 0)
+        return null;
+    return v;
+}
 /**
  * Classify a document into typed sections via Haiku (Task 740).
 *
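Spelled out, the new guard accepts and rejects as follows (illustrative calls; the signature is restated for the sketch):

```ts
// Behaviour of the asNonNegativeInt helper added above:
declare function asNonNegativeInt(v: unknown): number | null;

asNonNegativeInt(0);      // 0     (zero is a valid offset)
asNonNegativeInt(1342);   // 1342
asNonNegativeInt(3.5);    // null  (not an integer)
asNonNegativeInt(-1);     // null  (negative)
asNonNegativeInt(NaN);    // null  (not finite)
asNonNegativeInt("1342"); // null  (stringly-typed offsets are dropped, not coerced)
asNonNegativeInt(null);   // null  (an observed Haiku failure mode)
```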
@@ -223,6 +278,28 @@ function asObject(v) {
 export async function classifyDocument(params) {
     const { accountId, anchorDescription, ontologyLabels, naturalEdgeMap, documentText } = params;
     const mode = params.mode ?? "document";
+    // Task 896 clause 3 dispatch — oversize document mode goes to the chunked
+    // path; oversize chat mode loud-fails (sessionize must keep sessions under
+    // the ceiling, per eng review). Single-shot path stays unchanged below.
+    if (mode === "document" && documentText.length > CHUNK_CHAR_SIZE) {
+        return classifyDocumentChunked(params);
+    }
+    if (documentText.length > INPUT_CHAR_CEILING) {
+        const overage = `chars=${documentText.length}, ceiling=${INPUT_CHAR_CEILING}`;
+        if (mode === "chat") {
+            logFallback(accountId, `input-too-large: chat session exceeds Haiku input ceiling (${overage}). Sessionize must split sessions before classify (Task 894).`);
+        }
+        else {
+            // Document mode > INPUT_CHAR_CEILING but ≤ CHUNK_CHAR_SIZE shouldn't
+            // happen since CHUNK_CHAR_SIZE < INPUT_CHAR_CEILING — kept as
+            // defence-in-depth in case constants drift.
+            logFallback(accountId, `input-too-large: document exceeds Haiku input ceiling without chunking (${overage}). Constants drift between CHUNK_CHAR_SIZE and INPUT_CHAR_CEILING.`);
+        }
+        return {
+            kind: "fallback",
+            reason: `Input is ${documentText.length} chars; classifier ceiling is ${INPUT_CHAR_CEILING}.`,
+        };
+    }
     // System prompt + user message branch on mode. Chat mode strips the
     // natural-edge map and reframes the input as a session of turn-attributed
     // text; document mode is unchanged from Task 740.
@@ -276,8 +353,17 @@ export async function classifyDocument(params) {
     try {
         parsed = JSON.parse(jsonText);
     }
-    catch {
-
+    catch (err) {
+        // Task 896 clause 5: surface diagnostics so a malformed-JSON fallback
+        // distinguishes truncation (output budget exceeded), fence drift, and
+        // genuine model junk. Pre-Task-896 the fallback discarded the parser
+        // error and 120 chars from the post-strip text — Adam Mackay's 251K-char
+        // ingest bottomed out here with no visible cause.
+        const message = err instanceof Error ? err.message : String(err);
+        const fenceStripped = jsonText !== responseText;
+        logFallback(accountId, `malformed JSON: parse-error=${JSON.stringify(message)} len=${responseText.length} fence-stripped=${fenceStripped} ` +
+            `pre-strip-head=${JSON.stringify(responseText.slice(0, 200))} ` +
+            `pre-strip-tail=${JSON.stringify(responseText.slice(-200))}`);
         return { kind: "fallback", reason: "Haiku returned malformed JSON" };
     }
     const root = asObject(parsed);
@@ -296,15 +382,44 @@ export async function classifyDocument(params) {
     }
     const sections = [];
     let hallucinatedRelated = 0;
+    // Task 896 clause 1 diagnostics — counters for offset/summary post-validation
+    // failures so the haiku-ok log line names the rate of model misbehaviour.
+    // Per-section drops are silent; the aggregate count tells the operator
+    // whether the prompt is degrading.
+    let droppedForOffsets = 0;
+    let summaryTruncated = 0;
     for (const raw of rawSections) {
         const obj = asObject(raw);
         if (!obj)
             continue;
+        // Task 896 clause 1: read offsets and reconstruct body server-side.
+        // Pre-Task-896 the LLM emitted body verbatim, making output ≈ input
+        // and causing 8K-token truncation on >80K-char inputs.
+        const sourceStart = asNonNegativeInt(obj.sourceStart);
+        const sourceEnd = asNonNegativeInt(obj.sourceEnd);
+        if (sourceStart === null || sourceEnd === null) {
+            droppedForOffsets += 1;
+            continue;
+        }
+        if (sourceEnd <= sourceStart || sourceEnd > documentText.length) {
+            droppedForOffsets += 1;
+            continue;
+        }
+        const body = documentText.slice(sourceStart, sourceEnd);
+        if (body.length === 0) {
+            droppedForOffsets += 1;
+            continue;
+        }
         const title = asString(obj.title) ?? "";
-        const body = asString(obj.body) ?? "";
         const properties = asObject(obj.properties) ?? {};
-
-
+        let summary = asString(obj.summary) ?? "";
+        if (summary.length > SUMMARY_MAX_CHARS) {
+            summary = summary.slice(0, SUMMARY_MAX_CHARS - 1) + "…";
+            summaryTruncated += 1;
+        }
+        // Mirror summary into properties so the Neo4j section node carries it
+        // (chat-mode parity — pre-Task-896 chunks stored summary as properties.summary).
+        properties.summary = summary;
         if (mode === "chat") {
             // Chat mode: only `Conversation` is legal. Haiku is instructed to emit
             // exactly that kind; force it here so a misfire still produces a valid
@@ -315,6 +430,9 @@ export async function classifyDocument(params) {
                 kind: "Conversation",
                 title: title.slice(0, 200),
                 body,
+                summary,
+                sourceStart,
+                sourceEnd,
                 properties,
                 anchorEdge: null,
             });
@@ -376,6 +494,9 @@ export async function classifyDocument(params) {
             kind,
             title: title.slice(0, 200),
             body,
+            summary,
+            sourceStart,
+            sourceEnd,
             properties,
             anchorEdge: kind === SECTION_KIND_OTHER ? null : anchorEdge,
             related: related.length > 0 ? related : undefined,
@@ -384,6 +505,19 @@ export async function classifyDocument(params) {
                 : {}),
         });
     }
+    // Missing-offsets fallback (Task 896 clause 1, surfaced by CEO review):
+    // if Haiku emitted sections but every one failed offset validation, we'd
+    // otherwise return an empty `sections` array silently and the writer would
+    // happily produce zero `:Section` nodes. Loud-fail instead so the operator
+    // sees the regression — typically caused by a model that ignored the new
+    // offset contract and reverted to emitting `body`.
+    if (rawSections.length > 0 && sections.length === 0) {
+        logFallback(accountId, `missing-offsets: every section failed offset validation (rawSections=${rawSections.length}, droppedForOffsets=${droppedForOffsets}). Likely cause: Haiku emitted body text instead of sourceStart/sourceEnd offsets, or the prompt update didn't reach the model.`);
+        return {
+            kind: "fallback",
+            reason: "Haiku response had no parseable section offsets",
+        };
+    }
     // Top-level orphan candidates and document-level edges are document-mode
     // concepts. In chat mode the operator confirms participants up front and
     // attaches them as :PARTICIPANT_IN edges off the :ConversationArchive
@@ -432,7 +566,7 @@ export async function classifyDocument(params) {
             }
         }
     }
-    process.stderr.write(`[memory-classify] [${accountId}] haiku ok (mode=${mode}, sections=${sections.length}, orphanCandidates=${orphanCandidates.length}, hallucinatedRelated=${hallucinatedRelated}, elapsedMs=${haikuMs})\n`);
+    process.stderr.write(`[memory-classify] [${accountId}] haiku ok (mode=${mode}, sections=${sections.length}, orphanCandidates=${orphanCandidates.length}, hallucinatedRelated=${hallucinatedRelated}, droppedForOffsets=${droppedForOffsets}, summaryTruncated=${summaryTruncated}, elapsedMs=${haikuMs})\n`);
     return {
         kind: "ok",
         output: {
@@ -445,4 +579,120 @@ export async function classifyDocument(params) {
         },
     };
 }
+// ---------------------------------------------------------------------------
+// Chunked classification path (Task 896 clause 3).
+//
+// Used only for document mode when the input exceeds CHUNK_CHAR_SIZE. Each
+// chunk is classified independently via the same single-shot path; the
+// per-chunk results are stitched back together with offset translation and
+// a same-kind merge to fix sections that straddled a chunk boundary.
+//
+// documentSummary is dropped in chunked mode (Haiku only sees one chunk at
+// a time, so no per-chunk summary describes the whole document) — see
+// 896-followup if a downstream consumer needs a synthesised whole-doc
+// summary later.
+// ---------------------------------------------------------------------------
+async function classifyDocumentChunked(params) {
+    const { accountId, documentText } = params;
+    const chunks = chunkDocument(documentText, {
+        chunkSize: CHUNK_CHAR_SIZE,
+        overlap: CHUNK_OVERLAP_CHARS,
+    });
+    process.stderr.write(`[memory-classify] [${accountId}] chunked path: chunks=${chunks.length} chars=${documentText.length} chunkSize=${CHUNK_CHAR_SIZE} overlap=${CHUNK_OVERLAP_CHARS}\n`);
+    // Defence-in-depth: chunkSize < INPUT_CHAR_CEILING by construction, so
+    // no chunk should exceed the per-call ceiling. If one does, that's a
+    // chunker bug or constants-drift — loud-fail instead of pretending.
+    for (const c of chunks) {
+        if (c.chunkText.length > INPUT_CHAR_CEILING) {
+            logFallback(accountId, `input-too-large: chunker emitted oversize chunk (chars=${c.chunkText.length}, ceiling=${INPUT_CHAR_CEILING}). Chunker invariant violated.`);
+            return {
+                kind: "fallback",
+                reason: `Chunker produced an oversize chunk (${c.chunkText.length} > ${INPUT_CHAR_CEILING})`,
+            };
+        }
+    }
+    const allSections = [];
+    const allKeywords = new Set();
+    const allOrphans = [];
+    const allDocumentEdges = [];
+    let totalHallucinatedRelated = 0;
+    for (let i = 0; i < chunks.length; i++) {
+        const c = chunks[i];
+        process.stderr.write(`[memory-classify] [${accountId}] classify-chunk ${i + 1}/${chunks.length} (chars=${c.chunkText.length}, baseOffset=${c.baseOffset})\n`);
+        // Recurse into the single-shot path — chunkSize < CHUNK_CHAR_SIZE is the
+        // dispatch threshold so the recursive call lands in the existing logic.
+        const chunkResult = await classifyDocument({ ...params, documentText: c.chunkText });
+        if (chunkResult.kind === "fallback") {
+            // One chunk failure aborts the whole ingest (loud-failure doctrine).
+            return chunkResult;
+        }
+        for (const s of chunkResult.output.sections) {
+            const wholeStart = s.sourceStart + c.baseOffset;
+            const wholeEnd = s.sourceEnd + c.baseOffset;
+            allSections.push({
+                ...s,
+                sourceStart: wholeStart,
+                sourceEnd: wholeEnd,
+                body: documentText.slice(wholeStart, wholeEnd),
+            });
+        }
+        chunkResult.output.documentKeywords.forEach((k) => allKeywords.add(k));
+        allOrphans.push(...chunkResult.output.orphanCandidates);
+        if (chunkResult.output.documentEdges) {
+            allDocumentEdges.push(...chunkResult.output.documentEdges);
+        }
+        totalHallucinatedRelated += chunkResult.output.hallucinatedRelated;
+    }
+    // Same-kind boundary-straddler merge. Cross-kind overlap is preserved as
+    // distinct sections per eng review — disagreement about kind is operator-
+    // visible signal, not noise to collapse.
+    const mergedSections = mergeOverlappingSections(allSections);
+    // After the merge, any merged section whose range was unioned needs its
+    // body re-sliced from the whole document so it covers the union, not just
+    // one of the contributing chunks. Walk the result and re-slice — cheap.
+    for (const s of mergedSections) {
+        s.body = documentText.slice(s.sourceStart, s.sourceEnd);
+    }
+    // documentEdges dedupe — a Parties / PARTICIPANT / FROM-TO target named
+    // across multiple chunks would otherwise be appended N times and the
+    // writer would attempt N edge writes against the same MERGEd target.
+    // Stable key = (type, targetKind, JSON.stringify(targetProperties))
+    // since two chunks emitting "PARTY of Person {givenName, familyName}"
+    // for the same party will produce identical targetProperties shapes.
+    const dedupedDocumentEdges = [];
+    const seenEdgeKeys = new Set();
+    for (const edge of allDocumentEdges) {
+        const key = `${edge.type}|${edge.direction}|${edge.targetKind}|${JSON.stringify(edge.targetProperties)}`;
+        if (seenEdgeKeys.has(key))
+            continue;
+        seenEdgeKeys.add(key);
+        dedupedDocumentEdges.push(edge);
+    }
+    // Orphan candidates similarly may repeat across chunks (same hallucinated
+    // node mentioned in two adjacent windows). Dedupe on (kind, label).
+    const dedupedOrphans = [];
+    const seenOrphanKeys = new Set();
+    for (const o of allOrphans) {
+        const key = `${o.kind}|${o.label}`;
+        if (seenOrphanKeys.has(key))
+            continue;
+        seenOrphanKeys.add(key);
+        dedupedOrphans.push(o);
+    }
+    process.stderr.write(`[memory-classify] [${accountId}] chunked merge: rawSections=${allSections.length} mergedSections=${mergedSections.length} rawEdges=${allDocumentEdges.length} mergedEdges=${dedupedDocumentEdges.length} rawOrphans=${allOrphans.length} mergedOrphans=${dedupedOrphans.length} hallucinatedRelated=${totalHallucinatedRelated}\n`);
+    return {
+        kind: "ok",
+        output: {
+            // documentSummary is dropped in chunked mode — Haiku never saw the
+            // whole document. Downstream consumers that need a whole-doc summary
+            // should call a separate reduce step (out of scope for this change).
+            documentSummary: "",
+            documentKeywords: Array.from(allKeywords),
+            sections: mergedSections,
+            orphanCandidates: dedupedOrphans,
+            ...(dedupedDocumentEdges.length > 0 ? { documentEdges: dedupedDocumentEdges } : {}),
+            hallucinatedRelated: totalHallucinatedRelated,
+        },
+    };
+}
 //# sourceMappingURL=llm-classifier.js.map