@rubytech/create-realagent 1.0.829 → 1.0.830

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (103) hide show
  1. package/package.json +1 -1
  2. package/payload/platform/config/brand.json +1 -1
  3. package/payload/platform/lib/oauth-llm/dist/index.d.ts +1 -1
  4. package/payload/platform/lib/oauth-llm/dist/index.d.ts.map +1 -1
  5. package/payload/platform/lib/oauth-llm/dist/index.js +21 -0
  6. package/payload/platform/lib/oauth-llm/dist/index.js.map +1 -1
  7. package/payload/platform/lib/oauth-llm/src/index.ts +24 -0
  8. package/payload/platform/neo4j/migrations/007-conversation-archive-source.ts +116 -0
  9. package/payload/platform/neo4j/schema.cypher +12 -3
  10. package/payload/platform/plugins/admin/hooks/__tests__/archive-ingest-surface-gate.test.sh +54 -39
  11. package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +64 -26
  12. package/payload/platform/plugins/contacts/mcp/dist/index.js +5 -5
  13. package/payload/platform/plugins/contacts/mcp/dist/index.js.map +1 -1
  14. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts +1 -1
  15. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts.map +1 -1
  16. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js +29 -23
  17. package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js.map +1 -1
  18. package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
  19. package/payload/platform/plugins/memory/PLUGIN.md +2 -1
  20. package/payload/platform/plugins/memory/bin/conversation-archive-ingest.mjs +541 -0
  21. package/payload/platform/plugins/memory/bin/conversation-archive-ingest.sh +106 -0
  22. package/payload/platform/plugins/memory/mcp/dist/index.js +30 -16
  23. package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
  24. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +4 -3
  25. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
  26. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js +11 -6
  27. package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js.map +1 -1
  28. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts +5 -0
  29. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts.map +1 -0
  30. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js +30 -0
  31. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js.map +1 -0
  32. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts +48 -0
  33. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts.map +1 -0
  34. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js +23 -0
  35. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js.map +1 -0
  36. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts +3 -0
  37. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts.map +1 -0
  38. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js +237 -0
  39. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js.map +1 -0
  40. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts +11 -0
  41. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts.map +1 -0
  42. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js +21 -0
  43. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js.map +1 -0
  44. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts +16 -0
  45. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts.map +1 -0
  46. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js +39 -0
  47. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js.map +1 -0
  48. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts +17 -0
  49. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts.map +1 -0
  50. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js +90 -0
  51. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js.map +1 -0
  52. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts +9 -0
  53. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts.map +1 -0
  54. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js +32 -0
  55. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js.map +1 -0
  56. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts +3 -0
  57. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts.map +1 -0
  58. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js +27 -0
  59. package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js.map +1 -0
  60. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts +45 -0
  61. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts.map +1 -0
  62. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js +125 -0
  63. package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js.map +1 -0
  64. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +24 -1
  65. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
  66. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +266 -16
  67. package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
  68. package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.d.ts.map +1 -1
  69. package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js +9 -2
  70. package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js.map +1 -1
  71. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts +2 -0
  72. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts.map +1 -0
  73. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js +75 -0
  74. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js.map +1 -0
  75. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts +2 -0
  76. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts.map +1 -0
  77. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js +67 -0
  78. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js.map +1 -0
  79. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +34 -3
  80. package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -1
  81. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts +17 -0
  82. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts.map +1 -1
  83. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js +34 -13
  84. package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js.map +1 -1
  85. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +18 -7
  86. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
  87. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +24 -8
  88. package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
  89. package/payload/platform/plugins/memory/references/schema-base.md +2 -2
  90. package/payload/platform/plugins/memory/skills/conversation-archive/SKILL.md +133 -0
  91. package/payload/platform/plugins/memory/skills/document-ingest/SKILL.md +5 -2
  92. package/payload/platform/plugins/whatsapp/PLUGIN.md +1 -1
  93. package/payload/platform/scripts/seed-neo4j.sh +15 -15
  94. package/payload/platform/templates/specialists/agents/database-operator.md +8 -9
  95. package/payload/server/chunk-7BO5HDJC.js +10093 -0
  96. package/payload/server/chunk-EL4DZ56X.js +1116 -0
  97. package/payload/server/chunk-QOJ2D26Z.js +654 -0
  98. package/payload/server/chunk-RC46ZYGT.js +2305 -0
  99. package/payload/server/client-pool-7NTEFNVQ.js +32 -0
  100. package/payload/server/cloudflare-task-tracker-WE77WXSI.js +19 -0
  101. package/payload/server/maxy-edge.js +3 -3
  102. package/payload/server/neo4j-migrations-4XPNJNM6.js +490 -0
  103. package/payload/server/server.js +6 -6
@@ -0,0 +1,45 @@
1
+ /**
2
+ * Deterministic prose chunker for oversize document classification (Task 896).
3
+ *
4
+ * Splits a document into overlapping fixed-size chunks so each chunk fits
5
+ * inside Haiku's input context window. The chunker is purely mechanical —
6
+ * it makes no semantic claim about where chunk boundaries should fall.
7
+ * Ontological boundaries remain Haiku's job per Task 737 (the document
8
+ * chunker that *did* try to be semantic was deleted because it leaked
9
+ * sections at the boundaries it picked).
10
+ *
11
+ * Overlap exists so a section straddling a chunk boundary appears in BOTH
12
+ * surrounding chunks; the merge step then unions the same-kind ranges so
13
+ * the boundary section isn't double-counted in the writer.
14
+ *
15
+ * Char counts are estimated from token counts via a fixed 3.5 chars/token
16
+ * ratio (English prose average). The estimate is conservative — Haiku
17
+ * tokenises slightly differently per script, but 3.5 leaves ~10% headroom
18
+ * for non-English content before bumping into the model's hard ceiling.
19
+ */
20
+ export interface RangedSection {
21
+ /** Section kind from the classifier's closed enumeration. */
22
+ kind: string;
23
+ /** Inclusive whole-document start offset. */
24
+ sourceStart: number;
25
+ /** Exclusive whole-document end offset. */
26
+ sourceEnd: number;
27
+ /** Per-section summary; longer wins on merge tie-break. */
28
+ summary: string;
29
+ }
30
+ export interface DocumentChunk {
31
+ /** Substring of the source document covered by this chunk. */
32
+ chunkText: string;
33
+ /** Whole-document offset where this chunk's text begins. */
34
+ baseOffset: number;
35
+ }
36
+ export interface ChunkOptions {
37
+ /** Maximum chunk length in characters (already token→char converted). */
38
+ chunkSize: number;
39
+ /** Overlap in characters between consecutive chunks. */
40
+ overlap: number;
41
+ }
42
+ export declare function chunkDocument(text: string, opts: ChunkOptions): DocumentChunk[];
43
+ export declare const MERGE_OVERLAP_THRESHOLD = 0.5;
44
+ export declare function mergeOverlappingSections<T extends RangedSection>(input: T[]): T[];
45
+ //# sourceMappingURL=document-chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"document-chunker.d.ts","sourceRoot":"","sources":["../../src/lib/document-chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAQH,MAAM,WAAW,aAAa;IAC5B,6DAA6D;IAC7D,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,WAAW,EAAE,MAAM,CAAC;IACpB,2CAA2C;IAC3C,SAAS,EAAE,MAAM,CAAC;IAClB,2DAA2D;IAC3D,OAAO,EAAE,MAAM,CAAC;CACjB;AAMD,MAAM,WAAW,aAAa;IAC5B,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAC;IAClB,4DAA4D;IAC5D,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,YAAY;IAC3B,yEAAyE;IACzE,SAAS,EAAE,MAAM,CAAC;IAClB,wDAAwD;IACxD,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,GAAG,aAAa,EAAE,CA4B/E;AAqBD,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C,wBAAgB,wBAAwB,CAAC,CAAC,SAAS,aAAa,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAqDjF"}
@@ -0,0 +1,125 @@
1
+ /**
2
+ * Deterministic prose chunker for oversize document classification (Task 896).
3
+ *
4
+ * Splits a document into overlapping fixed-size chunks so each chunk fits
5
+ * inside Haiku's input context window. The chunker is purely mechanical —
6
+ * it makes no semantic claim about where chunk boundaries should fall.
7
+ * Ontological boundaries remain Haiku's job per Task 737 (the document
8
+ * chunker that *did* try to be semantic was deleted because it leaked
9
+ * sections at the boundaries it picked).
10
+ *
11
+ * Overlap exists so a section straddling a chunk boundary appears in BOTH
12
+ * surrounding chunks; the merge step then unions the same-kind ranges so
13
+ * the boundary section isn't double-counted in the writer.
14
+ *
15
+ * Char counts are estimated from token counts via a fixed 3.5 chars/token
16
+ * ratio (English prose average). The estimate is conservative — Haiku
17
+ * tokenises slightly differently per script, but 3.5 leaves ~10% headroom
18
+ * for non-English content before bumping into the model's hard ceiling.
19
+ */
20
/**
 * Split `text` into fixed-size chunks that overlap by `opts.overlap`
 * characters. Purely mechanical: chunk boundaries are arithmetic offsets,
 * never semantic ones (semantic boundaries are the classifier's job).
 *
 * @param {string} text - Whole document to split.
 * @param {{chunkSize: number, overlap: number}} opts - Sizes in characters.
 * @returns {{chunkText: string, baseOffset: number}[]} Chunks in source order;
 *   `baseOffset` is the whole-document offset where `chunkText` begins.
 * @throws {Error} When chunkSize <= 0, overlap < 0, or overlap >= chunkSize.
 */
export function chunkDocument(text, opts) {
    const { chunkSize, overlap } = opts;
    // Validate the geometry up front — a non-positive stride would never
    // advance and the window loop below would spin forever.
    if (chunkSize <= 0) {
        throw new Error(`chunkDocument: chunkSize must be positive, got ${chunkSize}`);
    }
    if (overlap < 0) {
        throw new Error(`chunkDocument: overlap must be non-negative, got ${overlap}`);
    }
    if (overlap >= chunkSize) {
        throw new Error(`chunkDocument: overlap (${overlap}) must be less than chunkSize (${chunkSize})`);
    }
    if (text.length === 0) {
        return [];
    }
    // Fast path: the whole input fits in one chunk, so skip the
    // window-stepping arithmetic entirely.
    if (text.length <= chunkSize) {
        return [{ chunkText: text, baseOffset: 0 }];
    }
    const stride = chunkSize - overlap;
    const result = [];
    // Each iteration emits one window; the loop exits as soon as a window
    // reaches the end of the text, so the final chunk may be short.
    for (let offset = 0; ; offset += stride) {
        const sliceEnd = Math.min(offset + chunkSize, text.length);
        result.push({ chunkText: text.slice(offset, sliceEnd), baseOffset: offset });
        if (sliceEnd >= text.length) {
            break;
        }
    }
    return result;
}
50
// ---------------------------------------------------------------------------
// mergeOverlappingSections — collates per-chunk classifier results.
//
// Sections are bucketed by `kind`, sorted by range, and adjacent same-kind
// ranges are unioned when their intersection covers more than
// MERGE_OVERLAP_THRESHOLD of the smaller range. On a merge the section
// spanning more text contributes its non-range fields, and the longer of the
// two summaries always wins.
//
// Cross-kind overlap is deliberately preserved: two sections of different
// kinds over the same range stay distinct so the downstream review surface
// can decide which classification wins. Disjoint same-kind sections are
// also preserved — only materially-overlapping neighbours merge.
// ---------------------------------------------------------------------------
export const MERGE_OVERLAP_THRESHOLD = 0.5;
export function mergeOverlappingSections(input) {
    // Zero or one section: nothing can merge; return a defensive copy.
    if (input.length <= 1) {
        return input.slice();
    }
    // Bucket by kind — merging is only ever legal within a single kind.
    const byKind = new Map();
    for (const section of input) {
        const bucket = byKind.get(section.kind);
        if (bucket === undefined) {
            byKind.set(section.kind, [section]);
        } else {
            bucket.push(section);
        }
    }
    const collated = [];
    for (const group of byKind.values()) {
        group.sort((a, b) => a.sourceStart - b.sourceStart || a.sourceEnd - b.sourceEnd);
        let acc = null;
        for (const section of group) {
            if (acc === null) {
                acc = { ...section };
                continue;
            }
            // Intersection of the accumulator's range with this section's range.
            const overlapStart = Math.max(acc.sourceStart, section.sourceStart);
            const overlapEnd = Math.min(acc.sourceEnd, section.sourceEnd);
            const intersection = Math.max(0, overlapEnd - overlapStart);
            if (intersection === 0) {
                collated.push(acc);
                acc = { ...section };
                continue;
            }
            const accLen = acc.sourceEnd - acc.sourceStart;
            const secLen = section.sourceEnd - section.sourceStart;
            if (intersection / Math.min(accLen, secLen) > MERGE_OVERLAP_THRESHOLD) {
                // Union the range. The wider section supplies the non-range
                // fields (title, properties, anchorEdge, related, ...); the
                // summary is always the longer of the two.
                const fieldsWinner = secLen > accLen ? section : acc;
                acc = {
                    ...fieldsWinner,
                    sourceStart: Math.min(acc.sourceStart, section.sourceStart),
                    sourceEnd: Math.max(acc.sourceEnd, section.sourceEnd),
                    summary: section.summary.length > acc.summary.length ? section.summary : acc.summary,
                };
            } else {
                collated.push(acc);
                acc = { ...section };
            }
        }
        if (acc !== null) {
            collated.push(acc);
        }
    }
    // Whole-document reading order so the writer's :NEXT chain follows
    // source order.
    collated.sort((a, b) => a.sourceStart - b.sourceStart);
    return collated;
}
125
+ //# sourceMappingURL=document-chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"document-chunker.js","sourceRoot":"","sources":["../../src/lib/document-chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAqCH,MAAM,UAAU,aAAa,CAAC,IAAY,EAAE,IAAkB;IAC5D,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,GAAG,IAAI,CAAC;IACpC,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,kDAAkD,SAAS,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;QAChB,MAAM,IAAI,KAAK,CAAC,oDAAoD,OAAO,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,IAAI,OAAO,IAAI,SAAS,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,2BAA2B,OAAO,kCAAkC,SAAS,GAAG,CAAC,CAAC;IACpG,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC7B,iEAAiE;QACjE,0DAA0D;QAC1D,OAAO,CAAC,EAAE,SAAS,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC;IACnC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,UAAU,EAAE,KAAK,EAAE,CAAC,CAAC;QACtE,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM;QAC9B,KAAK,IAAI,MAAM,CAAC;IAClB,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,8EAA8E;AAC9E,oEAAoE;AACpE,EAAE;AACF,4EAA4E;AAC5E,yEAAyE;AACzE,2EAA2E;AAC3E,sEAAsE;AACtE,kDAAkD;AAClD,EAAE;AACF,sEAAsE;AACtE,yEAAyE;AACzE,4EAA4E;AAC5E,uEAAuE;AACvE,iCAAiC;AACjC,EAAE;AACF,wEAAwE;AACxE,oCAAoC;AACpC,8EAA8E;AAE9E,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C,MAAM,UAAU,wBAAwB,CAA0B,KAAU;IAC1E,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;IAE5C,6DAA6D;IAC7D,MAAM,MAAM,GAAG,IAAI,GAAG,EAAe,CAAC;IACtC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,GAAG;YAAE,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;;YAChB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/B,CAAC;IAED,MAAM,MAAM,GAAQ,EAAE,CAAC;IACvB,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC;QACpC,K
AAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,IAAI,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;QACjF,IAAI,OAAO,GAAa,IAAI,CAAC;QAC7B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACtB,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;gBACnB,SAAS;YACX,CAAC;YACD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;YAC1H,IAAI,YAAY,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;gBACnB,SAAS;YACX,CAAC;YACD,MAAM,UAAU,GAAW,OAAO,CAAC,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;YACnE,MAAM,IAAI,GAAW,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,WAAW,CAAC;YACjD,MAAM,eAAe,GAAG,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;YAClE,IAAI,eAAe,GAAG,uBAAuB,EAAE,CAAC;gBAC9C,oEAAoE;gBACpE,qEAAqE;gBACrE,iEAAiE;gBACjE,wEAAwE;gBACxE,MAAM,YAAY,GAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;gBACxD,OAAO,GAAG;oBACR,GAAG,YAAY;oBACf,WAAW,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,WAAW,CAAC;oBACzD,SAAS,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC;oBACnD,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO;iBACjF,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;YACrB,CAAC;QACH,CAAC;QACD,IAAI,OAAO,KAAK,IAAI;YAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC7C,CAAC;IAED,qEAAqE;IACrE,wBAAwB;IACxB,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,CAAC,CAAC;IACrD,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -70,8 +70,31 @@ export interface ClassifiedSection {
70
70
  kind: string;
71
71
  /** Short human-readable title for the section. */
72
72
  title: string;
73
- /** The section's body text — embedded and stored on the section node. */
73
+ /**
74
+ * The section's body text — embedded and stored on the section node.
75
+ *
76
+ * Task 896: server-reconstructed via `documentText.slice(sourceStart, sourceEnd)`.
77
+ * The LLM emits offsets, never the body text — output size becomes O(sections),
78
+ * not O(input chars). Callers consume the same `body: string` shape as before.
79
+ */
74
80
  body: string;
81
+ /**
82
+ * 1-3 sentence summary of the section, ≤500 chars (server-validated).
83
+ * The LLM emits this; the server truncates if oversize. Stored as
84
+ * `properties.summary` on the section node so adjacency search can
85
+ * surface it without rehydrating the body.
86
+ */
87
+ summary: string;
88
+ /**
89
+ * Whole-document character offsets — inclusive start, exclusive end.
90
+ * The LLM emits these; the server validates bounds and reconstructs
91
+ * `body` via `documentText.slice(sourceStart, sourceEnd)`. In the
92
+ * chunked-classify path these are translated from chunk-local to
93
+ * whole-document coordinates so the merge step can detect boundary
94
+ * straddlers across chunks.
95
+ */
96
+ sourceStart: number;
97
+ sourceEnd: number;
75
98
  /** Properties on the section node (excluding accountId/embedding/provenance). */
76
99
  properties: Record<string, unknown>;
77
100
  /**
@@ -1 +1 @@
1
- {"version":3,"file":"llm-classifier.d.ts","sourceRoot":"","sources":["../../src/lib/llm-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AASH,+DAA+D;AAC/D,MAAM,MAAM,mBAAmB,GAAG,aAAa,GAAG,WAAW,CAAC;AAE9D,mEAAmE;AACnE,MAAM,MAAM,oBAAoB,GAAG,UAAU,GAAG,UAAU,CAAC;AAE3D,kFAAkF;AAClF,MAAM,WAAW,iBAAiB;IAChC,8DAA8D;IAC9D,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC,oDAAoD;IACpD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,oBAAoB,CAAC;QAChC,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,CAAC;IACF;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED,oGAAoG;AACpG,MAAM,WAAW,iBAAiB;IAChC;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,IAAI,EAAE,MAAM,CAAC;IACb,kDAAkD;IAClD,KAAK,EAAE,MAAM,CAAC;IACd,yEAAyE;IACzE,IAAI,EAAE,MAAM,CAAC;IACb,iFAAiF;IACjF,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC;;;;OAIG;IACH,UAAU,EAAE;QACV,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,mBAAmB,CAAC;QAC/B,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,GAAG,IAAI,CAAC;IACT,oFAAoF;IACpF,OAAO,CAAC,EAAE,iBAAiB,EAAE,CAAC;IAC9B;;;;;;OAMG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,2CAA2C;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,6EAA6E;IAC7E,KAAK,EAAE,MAAM,CAAC;IACd,sEAAsE;IACtE,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,oCAAoC;AACpC,MAAM,WAAW,gBAAgB;IAC/B,kDAAkD;IAClD,eAAe,EAAE,MAAM,CAAC;IACxB,kEAAkE;IAClE,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,6BAA6B;IAC7B,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,iFAAiF;IACjF,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC;;+CAE2C;IAC3C,aAAa,CAAC,EAAE,KAAK,CAAC;QACpB,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,UAAU,GAAG,UAAU,CAAC;QACnC,UAAU,EAAE,MAAM,CAAC;QACnB,gBAAgB,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,wEAAwE;QACxE,KAAK,CAAC,EAAE,OAAO,CAAC;KACjB,CAAC,CAAC;IACH,mFAAmF;IACnF,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,MAAM,cAAc,GACtB;IAAE,IAAI,EAAE,IAAI,CAAC;IAAC,MAAM,EAAE,gBAAgB,CAAA;CAAE,GACxC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC;AAQzC;;;;;;;;GAQG;AACH,eAAO,MAAM,kBAAkB,UAAU,CAAC;AAE1C,eAAO,MAAM,sBAAsB,wEAMzB,CAAC
;AAEX,eAAO,MAAM,wBAAwB,yKAgB3B,CAAC;AAEX,eAAO,MAAM,sBAAsB,4SAqBzB,CAAC;AAEX,8EAA8E;AAC9E,eAAO,MAAM,qBAAqB,sBAAuB,CAAC;AAE1D,eAAO,MAAM,iBAAiB,EAAE,WAAW,CAAC,MAAM,CAMhD,CAAC;AA+HH,MAAM,WAAW,cAAc;IAC7B,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;;;;OAOG;IACH,IAAI,CAAC,EAAE,UAAU,GAAG,MAAM,CAAC;IAC3B;;;;;;;;OAQG;IACH,iBAAiB,EAAE,MAAM,CAAC;IAC1B;;;;;;OAMG;IACH,cAAc,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;IACpC;;;;;;OAMG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB,8EAA8E;IAC9E,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,gBAAgB,CACpC,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC,cAAc,CAAC,CA2OzB"}
1
+ {"version":3,"file":"llm-classifier.d.ts","sourceRoot":"","sources":["../../src/lib/llm-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAUH,+DAA+D;AAC/D,MAAM,MAAM,mBAAmB,GAAG,aAAa,GAAG,WAAW,CAAC;AAE9D,mEAAmE;AACnE,MAAM,MAAM,oBAAoB,GAAG,UAAU,GAAG,UAAU,CAAC;AAE3D,kFAAkF;AAClF,MAAM,WAAW,iBAAiB;IAChC,8DAA8D;IAC9D,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC,oDAAoD;IACpD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,oBAAoB,CAAC;QAChC,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,CAAC;IACF;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED,oGAAoG;AACpG,MAAM,WAAW,iBAAiB;IAChC;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,IAAI,EAAE,MAAM,CAAC;IACb,kDAAkD;IAClD,KAAK,EAAE,MAAM,CAAC;IACd;;;;;;OAMG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,OAAO,EAAE,MAAM,CAAC;IAChB;;;;;;;OAOG;IACH,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,iFAAiF;IACjF,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC;;;;OAIG;IACH,UAAU,EAAE;QACV,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,mBAAmB,CAAC;QAC/B,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,GAAG,IAAI,CAAC;IACT,oFAAoF;IACpF,OAAO,CAAC,EAAE,iBAAiB,EAAE,CAAC;IAC9B;;;;;;OAMG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,2CAA2C;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,6EAA6E;IAC7E,KAAK,EAAE,MAAM,CAAC;IACd,sEAAsE;IACtE,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,oCAAoC;AACpC,MAAM,WAAW,gBAAgB;IAC/B,kDAAkD;IAClD,eAAe,EAAE,MAAM,CAAC;IACxB,kEAAkE;IAClE,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,6BAA6B;IAC7B,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,iFAAiF;IACjF,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC;;+CAE2C;IAC3C,aAAa,CAAC,EAAE,KAAK,CAAC;QACpB,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,UAAU,GAAG,UAAU,CAAC;QACnC,UAAU,EAAE,MAAM,CAAC;QACnB,gBAAgB,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,wEAAwE;QACxE,KAAK,CAAC,EAAE,OAAO,CAAC;KACjB,CAAC,CAAC;IACH,mFAAmF;IACnF,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,MAAM,cAAc,GACtB;IAAE,IAAI,EAAE,IAAI,CAAC;IAAC,MAAM,EAAE,gBAAgB,CAAA;CAAE,GACxC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,MAAM
,EAAE,MAAM,CAAA;CAAE,CAAC;AA8CzC;;;;;;;;GAQG;AACH,eAAO,MAAM,kBAAkB,UAAU,CAAC;AAE1C,eAAO,MAAM,sBAAsB,wEAMzB,CAAC;AAEX,eAAO,MAAM,wBAAwB,yKAa3B,CAAC;AAEX,eAAO,MAAM,sBAAsB,4SAqBzB,CAAC;AAEX,8EAA8E;AAC9E,eAAO,MAAM,qBAAqB,sBAAuB,CAAC;AAE1D,eAAO,MAAM,iBAAiB,EAAE,WAAW,CAAC,MAAM,CAMhD,CAAC;AAoJH,MAAM,WAAW,cAAc;IAC7B,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;;;;OAOG;IACH,IAAI,CAAC,EAAE,UAAU,GAAG,MAAM,CAAC;IAC3B;;;;;;;;OAQG;IACH,iBAAiB,EAAE,MAAM,CAAC;IAC1B;;;;;;OAMG;IACH,cAAc,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;IACpC;;;;;;OAMG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB,8EAA8E;IAC9E,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,gBAAgB,CACpC,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC,cAAc,CAAC,CA0UzB"}
@@ -23,10 +23,45 @@
23
23
  */
24
24
  import { callOauthLlm } from "../../../../../lib/oauth-llm/dist/index.js";
25
25
  import { HAIKU_MODEL } from "../../../../../lib/models/dist/index.js";
26
+ import { chunkDocument, mergeOverlappingSections } from "./document-chunker.js";
26
27
  // ---------------------------------------------------------------------------
27
28
  // Constants
28
29
  // ---------------------------------------------------------------------------
29
30
  const MAX_OUTPUT_TOKENS = 8192;
31
+ /**
32
+ * Per-section summary cap (Task 896 clause 1). The classifier prompt asks
33
+ * for ≤500 chars; the server truncates anything longer with an ellipsis
34
+ * marker so a single overlong summary never inflates the output JSON
35
+ * unbounded. Truncation is observable (logged once per oversize section)
36
+ * but not fatal — Haiku usually respects the cap.
37
+ */
38
+ const SUMMARY_MAX_CHARS = 500;
39
+ /**
40
+ * Output budget the prompt advertises to the model. ≈6000 tokens leaves
41
+ * headroom under MAX_OUTPUT_TOKENS=8192 for a few hundred sections of
42
+ * offsets + short summaries without re-emitting body text. Pre-Task-896
43
+ * the verbatim-body schema made output ≈ input — a 251K-char Adam Mackay
44
+ * archive truncated mid-word at 8K.
45
+ */
46
+ const PROMPT_OUTPUT_TOKEN_BUDGET = 6000;
47
+ // ---------------------------------------------------------------------------
48
+ // Task 896 clause 3 — chunker constants for oversize prose.
49
+ //
50
+ // Haiku 4.5: 200K input tokens. Reserve ~5K for prompt + system overhead
51
+ // → ~195K usable tokens × ~3.5 chars/token = ~682K char input ceiling per
52
+ // Haiku call. The chunker emits chunks of ~150K tokens (~525K chars) with
53
+ // ~5K-token (~17.5K-char) overlap so a section straddling the boundary
54
+ // surfaces in both surrounding chunks for the merge step.
55
+ // ---------------------------------------------------------------------------
56
+ const CHARS_PER_TOKEN_ESTIMATE = 3.5;
57
+ const HAIKU_INPUT_TOKEN_BUDGET = 195_000;
58
+ /** Per-Haiku-call hard ceiling on `documentText` characters — enforced regardless of mode. */
59
+ const INPUT_CHAR_CEILING = Math.floor(HAIKU_INPUT_TOKEN_BUDGET * CHARS_PER_TOKEN_ESTIMATE);
60
+ const CHUNK_TOKEN_SIZE = 150_000;
61
+ const CHUNK_OVERLAP_TOKENS = 5_000;
62
+ /** Target chunk char size for the prose chunker (Task 896 clause 3). */
63
+ const CHUNK_CHAR_SIZE = Math.floor(CHUNK_TOKEN_SIZE * CHARS_PER_TOKEN_ESTIMATE);
64
+ const CHUNK_OVERLAP_CHARS = Math.floor(CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN_ESTIMATE);
30
65
  /**
31
66
  * Closed enumeration of section `kind` values. Each becomes a secondary
32
67
  * label on the `:Section` node (e.g. `:Section:Position`). Anything outside
@@ -55,10 +90,7 @@ export const STRUCTURAL_SECTION_KINDS = [
55
90
  "Bibliography",
56
91
  "Glossary",
57
92
  "Acknowledgments",
58
- // Task 891 chat-mode kind. Emitted only when ClassifyParams.mode === 'chat'.
59
- // The chat-mode system prompt restricts output to this single kind; the
60
- // document-mode prompt never names it. Listed here so the validator's closed
61
- // enumeration accepts it without a per-mode dictionary split.
93
+ // Task 891: chat-mode kind. Emitted only when mode==='chat'; listed here so the validator's closed enumeration accepts it.
62
94
  "Conversation",
63
95
  ];
64
96
  export const CONTRACT_SECTION_KINDS = [
@@ -115,14 +147,17 @@ const CHAT_SYSTEM_PROMPT = [
115
147
  " [DD/MM/YYYY, HH:MM:SS] <Sender>: <body>",
116
148
  " [DD/MM/YY, HH:MM:SS] <Sender>: <body>",
117
149
  " [YYYY-MM-DD HH:MM:SS] <Sender>: <body>",
118
- "Body lines without a leading bracketed timestamp belong to the previous message (multi-line bodies). System messages (no sender) and media-only lines (e.g. '<Media omitted>') are kept verbatim in the chunk body.",
150
+ "Body lines without a leading bracketed timestamp belong to the previous message (multi-line bodies). System messages (no sender) and media-only lines (e.g. '<Media omitted>') belong inside the chunk that covers their position.",
151
+ "",
152
+ `OUTPUT BUDGET — your JSON response must fit within ~${PROMPT_OUTPUT_TOKEN_BUDGET} output tokens. Use offsets — NEVER re-emit body text. The server reconstructs each chunk's body from your offsets via documentText.slice(sourceStart, sourceEnd) and stores it on the node, so byte-equal recovery works without you transmitting the bytes.`,
119
153
  "",
120
154
  "Each chunk is a JSON object with:",
121
155
  "- 'kind': MUST be exactly 'Conversation'. No other kinds are legal in chat mode.",
122
156
  "- 'title': short human-readable topic label for the chunk (max 120 chars).",
123
- "- 'body': the verbatim text of the messages in this chunk exactly as supplied, including timestamp prefixes, sender names, internal newlines, and multi-line continuations. NEVER summarise, NEVER strip the prefixes — downstream provenance recovery depends on byte equality.",
157
+ `- 'summary': 1-3 sentences describing what this chunk is about. Hard ceiling ${SUMMARY_MAX_CHARS} characters — the server truncates anything longer.`,
158
+ "- 'sourceStart': INTEGER character offset into the supplied archive text where this chunk's first message begins (0-indexed, inclusive). MUST point at the opening '[' of the bracketed timestamp prefix.",
159
+ "- 'sourceEnd': INTEGER character offset where this chunk ends (exclusive). MUST be > sourceStart and ≤ total length of the supplied text.",
124
160
  "- 'properties': required typed properties on the chunk node:",
125
- " summary : 1-3 sentences describing what this chunk is about (this is your one chance to summarise; the body stays verbatim).",
126
161
  " keywords : array of 3-10 lowercase topic keywords for retrieval.",
127
162
  " firstMessageAt : timestamp of the first message in the chunk, copied verbatim from the line prefix (preserve the file's native format and any offset).",
128
163
  " lastMessageAt : timestamp of the last message in the chunk, copied verbatim from the line prefix.",
@@ -142,7 +177,7 @@ const CHAT_SYSTEM_PROMPT = [
142
177
  "- Split at topic transitions, not at message count or arbitrary intervals. A coherent exchange ('let's discuss the deck') is one chunk; a separate exchange ('what time tomorrow?') is another.",
143
178
  "- An archive of fewer than ~10 messages is usually one chunk.",
144
179
  "- Even a one-message archive must produce one chunk — never return zero chunks for non-empty input.",
145
- "- Chunks MUST cover every message in the archive in chronological order with no gaps and no overlap. messageCount summed across chunks equals total archive messages.",
180
+ "- Offset coverage: chunks MUST cover every message in chronological order. Adjacent chunks should be contiguous (chunk N's sourceEnd equals chunk N+1's sourceStart) so no message is skipped. messageCount summed across chunks equals total archive messages.",
146
181
  "",
147
182
  "Respond with ONLY the JSON object, no prose, no markdown fences.",
148
183
  ].join("\n");
@@ -154,6 +189,8 @@ const SYSTEM_PROMPT = [
154
189
  "2. The natural-edge map naming the anchor edge for identity-kind sections.",
155
190
  "3. The full document text.",
156
191
  "",
192
+ `OUTPUT BUDGET — your JSON response must fit within ~${PROMPT_OUTPUT_TOKEN_BUDGET} output tokens. Use offsets — NEVER re-emit body text. The server reconstructs each section's body from your offsets via documentText.slice(sourceStart, sourceEnd) and stores it on the node. Per-section 'summary' is hard-capped at ${SUMMARY_MAX_CHARS} chars.`,
193
+ "",
157
194
  "Closed enumeration of section `kind` values:",
158
195
  ` Identity (anchor edge to subject): ${IDENTITY_SECTION_KINDS.join(", ")}`,
159
196
  ` Document-structural (no anchor edge; HAS_SECTION + NEXT only): ${STRUCTURAL_SECTION_KINDS.join(", ")}`,
@@ -164,7 +201,9 @@ const SYSTEM_PROMPT = [
164
201
  "For each meaningful section, return a JSON object with:",
165
202
  "- 'kind': one of the closed-enumeration values above. Never invent new kinds; use 'Other' with a 'classifierReason' if nothing fits.",
166
203
  "- 'title': short human-readable title (max 120 chars).",
167
- "- 'body': the section's text, exactly as it appears (verbatim no summarising).",
204
+ `- 'summary': 1-3 sentences describing the section. Hard ceiling ${SUMMARY_MAX_CHARS} characters — the server truncates anything longer.`,
205
+ "- 'sourceStart': INTEGER character offset into the supplied document text where this section begins (0-indexed, inclusive).",
206
+ "- 'sourceEnd': INTEGER character offset where this section ends (exclusive). MUST be > sourceStart and ≤ total length of the supplied text.",
168
207
  "- 'properties': any typed properties for the section node (e.g. for Position: jobTitle, startDate, endDate; for Education: degree, fieldOfStudy; do NOT include accountId, embedding, createdAt, or other system fields — the writer adds them).",
169
208
  "- 'anchorEdge': for identity-kind sections (Position, Education, Credential, Skill, Biography) and for standalone Project, an object { type, direction, properties } naming the natural edge to the document subject (e.g. UserProfile -[HAS_POSITION]-> the Section). 'direction' is 'from-anchor' if the subject points at the section, 'to-anchor' if the section points at the subject. Set to null for structural + contract-clause kinds and for 'Other'.",
170
209
  "- 'related': optional array of additional entity nodes this section references (e.g. a Position section's employer Organization via AT, an Education section's school Organization via ATTENDED). Each entry: { kind, properties, edge: { type, direction, properties }, merge: true|false }. Direction is 'outgoing' (section -> related) or 'incoming' (section <- related). Use 'merge': true for entities reused across documents (Organization by name, Person by email/telephone).",
@@ -186,7 +225,7 @@ const SYSTEM_PROMPT = [
186
225
  "- 'kind' values are restricted to the closed enumeration above. If a section truly fits no listed kind, use 'Other' with a 'classifierReason'. Never emit a kind not on the list.",
187
226
  "- Never invent edge names. Use the natural-edge map exactly as given. The graph validator rejects writes with unknown edge types.",
188
227
  "- Be conservative with 'related' entities — only include them when the section explicitly names them.",
189
- "- Keep 'body' verbatim from the document. Summaries belong only in 'documentSummary'.",
228
+ "- Offsets cover the source: sourceStart and sourceEnd are integer character positions in the supplied document text. Do not re-emit body text — the server reconstructs it from your offsets.",
190
229
  "- Respond with ONLY the JSON object, no prose, no markdown fences.",
191
230
  ].join("\n");
192
231
  // ---------------------------------------------------------------------------
@@ -206,6 +245,22 @@ function asString(v) {
206
245
  function asObject(v) {
207
246
  return v && typeof v === "object" && !Array.isArray(v) ? v : null;
208
247
  }
248
+ /**
249
+ * Coerce a JSON value into a non-negative integer character offset, or null
250
+ * if it isn't one. Floats, NaN, negatives, and non-numbers all return null —
251
+ * Haiku has been observed emitting `null` and stringly-typed offsets when
252
+ * stressed; we drop the section silently and let the missing-offsets
253
+ * diagnostic surface the rate.
254
+ */
255
+ function asNonNegativeInt(v) {
256
+ if (typeof v !== "number")
257
+ return null;
258
+ if (!Number.isFinite(v) || !Number.isInteger(v))
259
+ return null;
260
+ if (v < 0)
261
+ return null;
262
+ return v;
263
+ }
209
264
  /**
210
265
  * Classify a document into typed sections via Haiku (Task 740).
211
266
  *
@@ -223,6 +278,28 @@ function asObject(v) {
223
278
  export async function classifyDocument(params) {
224
279
  const { accountId, anchorDescription, ontologyLabels, naturalEdgeMap, documentText } = params;
225
280
  const mode = params.mode ?? "document";
281
+ // Task 896 clause 3 dispatch — oversize document mode goes to the chunked
282
+ // path; oversize chat mode loud-fails (sessionize must keep sessions under
283
+ // the ceiling, per eng review). Single-shot path stays unchanged below.
284
+ if (mode === "document" && documentText.length > CHUNK_CHAR_SIZE) {
285
+ return classifyDocumentChunked(params);
286
+ }
287
+ if (documentText.length > INPUT_CHAR_CEILING) {
288
+ const overage = `chars=${documentText.length}, ceiling=${INPUT_CHAR_CEILING}`;
289
+ if (mode === "chat") {
290
+ logFallback(accountId, `input-too-large: chat session exceeds Haiku input ceiling (${overage}). Sessionize must split sessions before classify (Task 894).`);
291
+ }
292
+ else {
293
+ // Document mode > INPUT_CHAR_CEILING but ≤ CHUNK_CHAR_SIZE shouldn't
294
+ // happen since CHUNK_CHAR_SIZE < INPUT_CHAR_CEILING — kept as
295
+ // defence-in-depth in case constants drift.
296
+ logFallback(accountId, `input-too-large: document exceeds Haiku input ceiling without chunking (${overage}). Constants drift between CHUNK_CHAR_SIZE and INPUT_CHAR_CEILING.`);
297
+ }
298
+ return {
299
+ kind: "fallback",
300
+ reason: `Input is ${documentText.length} chars; classifier ceiling is ${INPUT_CHAR_CEILING}.`,
301
+ };
302
+ }
226
303
  // System prompt + user message branch on mode. Chat mode strips the
227
304
  // natural-edge map and reframes the input as a session of turn-attributed
228
305
  // text; document mode is unchanged from Task 740.
@@ -276,8 +353,17 @@ export async function classifyDocument(params) {
276
353
  try {
277
354
  parsed = JSON.parse(jsonText);
278
355
  }
279
- catch {
280
- logFallback(accountId, `malformed JSON: ${jsonText.slice(0, 120)}`);
356
+ catch (err) {
357
+ // Task 896 clause 5: surface diagnostics so a malformed-JSON fallback
358
+ // distinguishes truncation (output budget exceeded), fence drift, and
359
+ // genuine model junk. Pre-Task-896 the fallback discarded the parser
360
+ // error and 120 chars from the post-strip text — Adam Mackay's 251K-char
361
+ // ingest bottomed out here with no visible cause.
362
+ const message = err instanceof Error ? err.message : String(err);
363
+ const fenceStripped = jsonText !== responseText;
364
+ logFallback(accountId, `malformed JSON: parse-error=${JSON.stringify(message)} len=${responseText.length} fence-stripped=${fenceStripped} ` +
365
+ `pre-strip-head=${JSON.stringify(responseText.slice(0, 200))} ` +
366
+ `pre-strip-tail=${JSON.stringify(responseText.slice(-200))}`);
281
367
  return { kind: "fallback", reason: "Haiku returned malformed JSON" };
282
368
  }
283
369
  const root = asObject(parsed);
@@ -296,15 +382,44 @@ export async function classifyDocument(params) {
296
382
  }
297
383
  const sections = [];
298
384
  let hallucinatedRelated = 0;
385
+ // Task 896 clause 1 diagnostics — counters for offset/summary post-validation
386
+ // failures so the haiku-ok log line names the rate of model misbehaviour.
387
+ // Per-section drops are silent; the aggregate count tells the operator
388
+ // whether the prompt is degrading.
389
+ let droppedForOffsets = 0;
390
+ let summaryTruncated = 0;
299
391
  for (const raw of rawSections) {
300
392
  const obj = asObject(raw);
301
393
  if (!obj)
302
394
  continue;
395
+ // Task 896 clause 1: read offsets and reconstruct body server-side.
396
+ // Pre-Task-896 the LLM emitted body verbatim, making output ≈ input
397
+ // and causing 8K-token truncation on >80K-char inputs.
398
+ const sourceStart = asNonNegativeInt(obj.sourceStart);
399
+ const sourceEnd = asNonNegativeInt(obj.sourceEnd);
400
+ if (sourceStart === null || sourceEnd === null) {
401
+ droppedForOffsets += 1;
402
+ continue;
403
+ }
404
+ if (sourceEnd <= sourceStart || sourceEnd > documentText.length) {
405
+ droppedForOffsets += 1;
406
+ continue;
407
+ }
408
+ const body = documentText.slice(sourceStart, sourceEnd);
409
+ if (body.length === 0) {
410
+ droppedForOffsets += 1;
411
+ continue;
412
+ }
303
413
  const title = asString(obj.title) ?? "";
304
- const body = asString(obj.body) ?? "";
305
414
  const properties = asObject(obj.properties) ?? {};
306
- if (!body.trim())
307
- continue; // skip empty sections
415
+ let summary = asString(obj.summary) ?? "";
416
+ if (summary.length > SUMMARY_MAX_CHARS) {
417
+ summary = summary.slice(0, SUMMARY_MAX_CHARS - 1) + "…";
418
+ summaryTruncated += 1;
419
+ }
420
+ // Mirror summary into properties so the Neo4j section node carries it
421
+ // (chat-mode parity — pre-Task-896 chunks stored summary as properties.summary).
422
+ properties.summary = summary;
308
423
  if (mode === "chat") {
309
424
  // Chat mode: only `Conversation` is legal. Haiku is instructed to emit
310
425
  // exactly that kind; force it here so a misfire still produces a valid
@@ -315,6 +430,9 @@ export async function classifyDocument(params) {
315
430
  kind: "Conversation",
316
431
  title: title.slice(0, 200),
317
432
  body,
433
+ summary,
434
+ sourceStart,
435
+ sourceEnd,
318
436
  properties,
319
437
  anchorEdge: null,
320
438
  });
@@ -376,6 +494,9 @@ export async function classifyDocument(params) {
376
494
  kind,
377
495
  title: title.slice(0, 200),
378
496
  body,
497
+ summary,
498
+ sourceStart,
499
+ sourceEnd,
379
500
  properties,
380
501
  anchorEdge: kind === SECTION_KIND_OTHER ? null : anchorEdge,
381
502
  related: related.length > 0 ? related : undefined,
@@ -384,6 +505,19 @@ export async function classifyDocument(params) {
384
505
  : {}),
385
506
  });
386
507
  }
508
+ // Missing-offsets fallback (Task 896 clause 1, surfaced by CEO review):
509
+ // if Haiku emitted sections but every one failed offset validation, we'd
510
+ // otherwise return an empty `sections` array silently and the writer would
511
+ // happily produce zero `:Section` nodes. Loud-fail instead so the operator
512
+ // sees the regression — typically caused by a model that ignored the new
513
+ // offset contract and reverted to emitting `body`.
514
+ if (rawSections.length > 0 && sections.length === 0) {
515
+ logFallback(accountId, `missing-offsets: every section failed offset validation (rawSections=${rawSections.length}, droppedForOffsets=${droppedForOffsets}). Likely cause: Haiku emitted body text instead of sourceStart/sourceEnd offsets, or the prompt update didn't reach the model.`);
516
+ return {
517
+ kind: "fallback",
518
+ reason: "Haiku response had no parseable section offsets",
519
+ };
520
+ }
387
521
  // Top-level orphan candidates and document-level edges are document-mode
388
522
  // concepts. In chat mode the operator confirms participants up front and
389
523
  // attaches them as :PARTICIPANT_IN edges off the :ConversationArchive
@@ -432,7 +566,7 @@ export async function classifyDocument(params) {
432
566
  }
433
567
  }
434
568
  }
435
- process.stderr.write(`[memory-classify] [${accountId}] haiku ok (mode=${mode}, sections=${sections.length}, orphanCandidates=${orphanCandidates.length}, hallucinatedRelated=${hallucinatedRelated}, elapsedMs=${haikuMs})\n`);
569
+ process.stderr.write(`[memory-classify] [${accountId}] haiku ok (mode=${mode}, sections=${sections.length}, orphanCandidates=${orphanCandidates.length}, hallucinatedRelated=${hallucinatedRelated}, droppedForOffsets=${droppedForOffsets}, summaryTruncated=${summaryTruncated}, elapsedMs=${haikuMs})\n`);
436
570
  return {
437
571
  kind: "ok",
438
572
  output: {
@@ -445,4 +579,120 @@ export async function classifyDocument(params) {
445
579
  },
446
580
  };
447
581
  }
582
+ // ---------------------------------------------------------------------------
583
+ // Chunked classification path (Task 896 clause 3).
584
+ //
585
+ // Used only for document mode when the input exceeds CHUNK_CHAR_SIZE. Each
586
+ // chunk is classified independently via the same single-shot path; the
587
+ // per-chunk results are stitched back together with offset translation and
588
+ // a same-kind merge to fix sections that straddled a chunk boundary.
589
+ //
590
+ // documentSummary is dropped in chunked mode (Haiku only sees one chunk at
591
+ // a time, so no per-chunk summary describes the whole document) — see
592
+ // 896-followup if a downstream consumer needs a synthesised whole-doc
593
+ // summary later.
594
+ // ---------------------------------------------------------------------------
595
+ async function classifyDocumentChunked(params) {
596
+ const { accountId, documentText } = params;
597
+ const chunks = chunkDocument(documentText, {
598
+ chunkSize: CHUNK_CHAR_SIZE,
599
+ overlap: CHUNK_OVERLAP_CHARS,
600
+ });
601
+ process.stderr.write(`[memory-classify] [${accountId}] chunked path: chunks=${chunks.length} chars=${documentText.length} chunkSize=${CHUNK_CHAR_SIZE} overlap=${CHUNK_OVERLAP_CHARS}\n`);
602
+ // Defence-in-depth: chunkSize < INPUT_CHAR_CEILING by construction, so
603
+ // no chunk should exceed the per-call ceiling. If one does, that's a
604
+ // chunker bug or constants-drift — loud-fail instead of pretending.
605
+ for (const c of chunks) {
606
+ if (c.chunkText.length > INPUT_CHAR_CEILING) {
607
+ logFallback(accountId, `input-too-large: chunker emitted oversize chunk (chars=${c.chunkText.length}, ceiling=${INPUT_CHAR_CEILING}). Chunker invariant violated.`);
608
+ return {
609
+ kind: "fallback",
610
+ reason: `Chunker produced an oversize chunk (${c.chunkText.length} > ${INPUT_CHAR_CEILING})`,
611
+ };
612
+ }
613
+ }
614
+ const allSections = [];
615
+ const allKeywords = new Set();
616
+ const allOrphans = [];
617
+ const allDocumentEdges = [];
618
+ let totalHallucinatedRelated = 0;
619
+ for (let i = 0; i < chunks.length; i++) {
620
+ const c = chunks[i];
621
+ process.stderr.write(`[memory-classify] [${accountId}] classify-chunk ${i + 1}/${chunks.length} (chars=${c.chunkText.length}, baseOffset=${c.baseOffset})\n`);
622
+ // Recurse into the single-shot path — chunkSize < CHUNK_CHAR_SIZE is the
623
+ // dispatch threshold so the recursive call lands in the existing logic.
624
+ const chunkResult = await classifyDocument({ ...params, documentText: c.chunkText });
625
+ if (chunkResult.kind === "fallback") {
626
+ // One chunk failure aborts the whole ingest (loud-failure doctrine).
627
+ return chunkResult;
628
+ }
629
+ for (const s of chunkResult.output.sections) {
630
+ const wholeStart = s.sourceStart + c.baseOffset;
631
+ const wholeEnd = s.sourceEnd + c.baseOffset;
632
+ allSections.push({
633
+ ...s,
634
+ sourceStart: wholeStart,
635
+ sourceEnd: wholeEnd,
636
+ body: documentText.slice(wholeStart, wholeEnd),
637
+ });
638
+ }
639
+ chunkResult.output.documentKeywords.forEach((k) => allKeywords.add(k));
640
+ allOrphans.push(...chunkResult.output.orphanCandidates);
641
+ if (chunkResult.output.documentEdges) {
642
+ allDocumentEdges.push(...chunkResult.output.documentEdges);
643
+ }
644
+ totalHallucinatedRelated += chunkResult.output.hallucinatedRelated;
645
+ }
646
+ // Same-kind boundary-straddler merge. Cross-kind overlap is preserved as
647
+ // distinct sections per eng review — disagreement about kind is operator-
648
+ // visible signal, not noise to collapse.
649
+ const mergedSections = mergeOverlappingSections(allSections);
650
+ // After the merge, any merged section whose range was unioned needs its
651
+ // body re-sliced from the whole document so it covers the union, not just
652
+ // one of the contributing chunks. Walk the result and re-slice — cheap.
653
+ for (const s of mergedSections) {
654
+ s.body = documentText.slice(s.sourceStart, s.sourceEnd);
655
+ }
656
+ // documentEdges dedupe — a Parties / PARTICIPANT / FROM-TO target named
657
+ // across multiple chunks would otherwise be appended N times and the
658
+ // writer would attempt N edge writes against the same MERGEd target.
659
+ // Stable key = (type, targetKind, JSON.stringify(targetProperties))
660
+ // since two chunks emitting "PARTY of Person {givenName, familyName}"
661
+ // for the same party will produce identical targetProperties shapes.
662
+ const dedupedDocumentEdges = [];
663
+ const seenEdgeKeys = new Set();
664
+ for (const edge of allDocumentEdges) {
665
+ const key = `${edge.type}|${edge.direction}|${edge.targetKind}|${JSON.stringify(edge.targetProperties)}`;
666
+ if (seenEdgeKeys.has(key))
667
+ continue;
668
+ seenEdgeKeys.add(key);
669
+ dedupedDocumentEdges.push(edge);
670
+ }
671
+ // Orphan candidates similarly may repeat across chunks (same hallucinated
672
+ // node mentioned in two adjacent windows). Dedupe on (kind, label).
673
+ const dedupedOrphans = [];
674
+ const seenOrphanKeys = new Set();
675
+ for (const o of allOrphans) {
676
+ const key = `${o.kind}|${o.label}`;
677
+ if (seenOrphanKeys.has(key))
678
+ continue;
679
+ seenOrphanKeys.add(key);
680
+ dedupedOrphans.push(o);
681
+ }
682
+ process.stderr.write(`[memory-classify] [${accountId}] chunked merge: rawSections=${allSections.length} mergedSections=${mergedSections.length} rawEdges=${allDocumentEdges.length} mergedEdges=${dedupedDocumentEdges.length} rawOrphans=${allOrphans.length} mergedOrphans=${dedupedOrphans.length} hallucinatedRelated=${totalHallucinatedRelated}\n`);
683
+ return {
684
+ kind: "ok",
685
+ output: {
686
+ // documentSummary is dropped in chunked mode — Haiku never saw the
687
+ // whole document. Downstream consumers that need a whole-doc summary
688
+ // should call a separate reduce step (out of scope for this change).
689
+ documentSummary: "",
690
+ documentKeywords: Array.from(allKeywords),
691
+ sections: mergedSections,
692
+ orphanCandidates: dedupedOrphans,
693
+ ...(dedupedDocumentEdges.length > 0 ? { documentEdges: dedupedDocumentEdges } : {}),
694
+ hallucinatedRelated: totalHallucinatedRelated,
695
+ },
696
+ };
697
+ }
448
698
  //# sourceMappingURL=llm-classifier.js.map