@rubytech/create-realagent 1.0.829 → 1.0.830
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/payload/platform/config/brand.json +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.d.ts +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.d.ts.map +1 -1
- package/payload/platform/lib/oauth-llm/dist/index.js +21 -0
- package/payload/platform/lib/oauth-llm/dist/index.js.map +1 -1
- package/payload/platform/lib/oauth-llm/src/index.ts +24 -0
- package/payload/platform/neo4j/migrations/007-conversation-archive-source.ts +116 -0
- package/payload/platform/neo4j/schema.cypher +12 -3
- package/payload/platform/plugins/admin/hooks/__tests__/archive-ingest-surface-gate.test.sh +54 -39
- package/payload/platform/plugins/admin/hooks/archive-ingest-surface-gate.sh +64 -26
- package/payload/platform/plugins/contacts/mcp/dist/index.js +5 -5
- package/payload/platform/plugins/contacts/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.d.ts.map +1 -1
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js +29 -23
- package/payload/platform/plugins/contacts/mcp/dist/tools/contact-create.js.map +1 -1
- package/payload/platform/plugins/docs/references/plugins-guide.md +1 -1
- package/payload/platform/plugins/memory/PLUGIN.md +2 -1
- package/payload/platform/plugins/memory/bin/conversation-archive-ingest.mjs +541 -0
- package/payload/platform/plugins/memory/bin/conversation-archive-ingest.sh +106 -0
- package/payload/platform/plugins/memory/mcp/dist/index.js +30 -16
- package/payload/platform/plugins/memory/mcp/dist/index.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js +4 -3
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/llm-classifier.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js +11 -6
- package/payload/platform/plugins/memory/mcp/dist/lib/__tests__/schema-loader.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts +5 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js +30 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/index.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts +48 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js +23 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/types.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts +3 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js +237 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-normalisers/whatsapp-text.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts +11 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js +21 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/delta-cursor.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts +16 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js +39 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/derive-keys.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts +17 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js +90 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sender-bind.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts +9 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js +32 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/sessionize.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts +3 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js +27 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/conversation-pipeline/to-turn-text.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts +45 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js +125 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts +24 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js +266 -16
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js +9 -2
- package/payload/platform/plugins/memory/mcp/dist/lib/llm-ranker.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js +75 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-source-agnosticism.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts +2 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.d.ts.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js +67 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/conversation-normalisers-whatsapp-text.test.js.map +1 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js +34 -3
- package/payload/platform/plugins/memory/mcp/dist/tools/__tests__/memory-ingest.test.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts +17 -0
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js +34 -13
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-archive-write.js.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts +18 -7
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.d.ts.map +1 -1
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js +24 -8
- package/payload/platform/plugins/memory/mcp/dist/tools/memory-ingest.js.map +1 -1
- package/payload/platform/plugins/memory/references/schema-base.md +2 -2
- package/payload/platform/plugins/memory/skills/conversation-archive/SKILL.md +133 -0
- package/payload/platform/plugins/memory/skills/document-ingest/SKILL.md +5 -2
- package/payload/platform/plugins/whatsapp/PLUGIN.md +1 -1
- package/payload/platform/scripts/seed-neo4j.sh +15 -15
- package/payload/platform/templates/specialists/agents/database-operator.md +8 -9
- package/payload/server/chunk-7BO5HDJC.js +10093 -0
- package/payload/server/chunk-EL4DZ56X.js +1116 -0
- package/payload/server/chunk-QOJ2D26Z.js +654 -0
- package/payload/server/chunk-RC46ZYGT.js +2305 -0
- package/payload/server/client-pool-7NTEFNVQ.js +32 -0
- package/payload/server/cloudflare-task-tracker-WE77WXSI.js +19 -0
- package/payload/server/maxy-edge.js +3 -3
- package/payload/server/neo4j-migrations-4XPNJNM6.js +490 -0
- package/payload/server/server.js +6 -6
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts
@@ -0,0 +1,45 @@
+/**
+ * Deterministic prose chunker for oversize document classification (Task 896).
+ *
+ * Splits a document into overlapping fixed-size chunks so each chunk fits
+ * inside Haiku's input context window. The chunker is purely mechanical —
+ * it makes no semantic claim about where chunk boundaries should fall.
+ * Ontological boundaries remain Haiku's job per Task 737 (the document
+ * chunker that *did* try to be semantic was deleted because it leaked
+ * sections at the boundaries it picked).
+ *
+ * Overlap exists so a section straddling a chunk boundary appears in BOTH
+ * surrounding chunks; the merge step then unions the same-kind ranges so
+ * the boundary section isn't double-counted in the writer.
+ *
+ * Char counts are estimated from token counts via a fixed 3.5 chars/token
+ * ratio (English prose average). The estimate is conservative — Haiku
+ * tokenises slightly differently per script, but 3.5 leaves ~10% headroom
+ * for non-English content before bumping into the model's hard ceiling.
+ */
+export interface RangedSection {
+    /** Section kind from the classifier's closed enumeration. */
+    kind: string;
+    /** Inclusive whole-document start offset. */
+    sourceStart: number;
+    /** Exclusive whole-document end offset. */
+    sourceEnd: number;
+    /** Per-section summary; longer wins on merge tie-break. */
+    summary: string;
+}
+export interface DocumentChunk {
+    /** Substring of the source document covered by this chunk. */
+    chunkText: string;
+    /** Whole-document offset where this chunk's text begins. */
+    baseOffset: number;
+}
+export interface ChunkOptions {
+    /** Maximum chunk length in characters (already token→char converted). */
+    chunkSize: number;
+    /** Overlap in characters between consecutive chunks. */
+    overlap: number;
+}
+export declare function chunkDocument(text: string, opts: ChunkOptions): DocumentChunk[];
+export declare const MERGE_OVERLAP_THRESHOLD = 0.5;
+export declare function mergeOverlappingSections<T extends RangedSection>(input: T[]): T[];
+//# sourceMappingURL=document-chunker.d.ts.map
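A toy walk-through of the declared `chunkDocument` contract (not from the package; the input and numbers are invented for illustration). The stride is `chunkSize - overlap`, and the final window is allowed to be short:

```ts
// Hypothetical illustration of the chunkDocument contract declared above.
declare function chunkDocument(
  text: string,
  opts: { chunkSize: number; overlap: number },
): { chunkText: string; baseOffset: number }[];

const text = "abcdefghijklmnopqrstuvwx"; // 24 chars
const chunks = chunkDocument(text, { chunkSize: 10, overlap: 3 });
// stride = 10 - 3 = 7, so the windows are:
//   { baseOffset: 0,  chunkText: "abcdefghij" }  // [0, 10)
//   { baseOffset: 7,  chunkText: "hijklmnopq" }  // [7, 17)
//   { baseOffset: 14, chunkText: "opqrstuvwx" }  // [14, 24)
// Consecutive windows share exactly `overlap` characters ("hij", "opq"),
// which is what lets a boundary-straddling section appear in both.
```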
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.d.ts.map
@@ -0,0 +1 @@
+{"version":3,"file":"document-chunker.d.ts","sourceRoot":"","sources":["../../src/lib/document-chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAQH,MAAM,WAAW,aAAa;IAC5B,6DAA6D;IAC7D,IAAI,EAAE,MAAM,CAAC;IACb,6CAA6C;IAC7C,WAAW,EAAE,MAAM,CAAC;IACpB,2CAA2C;IAC3C,SAAS,EAAE,MAAM,CAAC;IAClB,2DAA2D;IAC3D,OAAO,EAAE,MAAM,CAAC;CACjB;AAMD,MAAM,WAAW,aAAa;IAC5B,8DAA8D;IAC9D,SAAS,EAAE,MAAM,CAAC;IAClB,4DAA4D;IAC5D,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,YAAY;IAC3B,yEAAyE;IACzE,SAAS,EAAE,MAAM,CAAC;IAClB,wDAAwD;IACxD,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,YAAY,GAAG,aAAa,EAAE,CA4B/E;AAqBD,eAAO,MAAM,uBAAuB,MAAM,CAAC;AAE3C,wBAAgB,wBAAwB,CAAC,CAAC,SAAS,aAAa,EAAE,KAAK,EAAE,CAAC,EAAE,GAAG,CAAC,EAAE,CAqDjF"}
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js
@@ -0,0 +1,125 @@
+/**
+ * Deterministic prose chunker for oversize document classification (Task 896).
+ *
+ * Splits a document into overlapping fixed-size chunks so each chunk fits
+ * inside Haiku's input context window. The chunker is purely mechanical —
+ * it makes no semantic claim about where chunk boundaries should fall.
+ * Ontological boundaries remain Haiku's job per Task 737 (the document
+ * chunker that *did* try to be semantic was deleted because it leaked
+ * sections at the boundaries it picked).
+ *
+ * Overlap exists so a section straddling a chunk boundary appears in BOTH
+ * surrounding chunks; the merge step then unions the same-kind ranges so
+ * the boundary section isn't double-counted in the writer.
+ *
+ * Char counts are estimated from token counts via a fixed 3.5 chars/token
+ * ratio (English prose average). The estimate is conservative — Haiku
+ * tokenises slightly differently per script, but 3.5 leaves ~10% headroom
+ * for non-English content before bumping into the model's hard ceiling.
+ */
+export function chunkDocument(text, opts) {
+    const { chunkSize, overlap } = opts;
+    if (chunkSize <= 0) {
+        throw new Error(`chunkDocument: chunkSize must be positive, got ${chunkSize}`);
+    }
+    if (overlap < 0) {
+        throw new Error(`chunkDocument: overlap must be non-negative, got ${overlap}`);
+    }
+    if (overlap >= chunkSize) {
+        throw new Error(`chunkDocument: overlap (${overlap}) must be less than chunkSize (${chunkSize})`);
+    }
+    if (text.length === 0)
+        return [];
+    if (text.length <= chunkSize) {
+        // One-chunk fast path so callers that always-chunk don't pay the
+        // window-stepping arithmetic for inputs that already fit.
+        return [{ chunkText: text, baseOffset: 0 }];
+    }
+    const chunks = [];
+    const stride = chunkSize - overlap;
+    let start = 0;
+    while (start < text.length) {
+        const end = Math.min(start + chunkSize, text.length);
+        chunks.push({ chunkText: text.slice(start, end), baseOffset: start });
+        if (end >= text.length)
+            break;
+        start += stride;
+    }
+    return chunks;
+}
+// ---------------------------------------------------------------------------
+// mergeOverlappingSections — collates per-chunk classifier results.
+//
+// Algorithm: group sections by `kind`, sort by `sourceStart`, then walk and
+// union consecutive same-kind ranges whose intersection covers more than
+// MERGE_OVERLAP_THRESHOLD of the smaller range. The longer summary wins on
+// merge — empirically Haiku's longer summary on a chunk that saw more
+// surrounding context tends to be the better one.
+//
+// Cross-kind overlap is preserved: chunk A's `Position` and chunk B's
+// `Other` covering the same range are kept as two distinct sections (per
+// eng review). The classifier disagreed about kind; the writer's downstream
+// :Section:Other surfacing will let the operator decide which one wins
+// during ontology growth review.
+//
+// Disjoint same-kind sections are also preserved — only adjacent ranges
+// with material overlap are merged.
+// ---------------------------------------------------------------------------
+export const MERGE_OVERLAP_THRESHOLD = 0.5;
+export function mergeOverlappingSections(input) {
+    if (input.length <= 1)
+        return input.slice();
+    // Group by kind so we never accidentally merge across kinds.
+    const byKind = new Map();
+    for (const s of input) {
+        const arr = byKind.get(s.kind);
+        if (arr)
+            arr.push(s);
+        else
+            byKind.set(s.kind, [s]);
+    }
+    const merged = [];
+    for (const group of byKind.values()) {
+        group.sort((a, b) => a.sourceStart - b.sourceStart || a.sourceEnd - b.sourceEnd);
+        let current = null;
+        for (const s of group) {
+            if (current === null) {
+                current = { ...s };
+                continue;
+            }
+            const intersection = Math.max(0, Math.min(current.sourceEnd, s.sourceEnd) - Math.max(current.sourceStart, s.sourceStart));
+            if (intersection === 0) {
+                merged.push(current);
+                current = { ...s };
+                continue;
+            }
+            const currentLen = current.sourceEnd - current.sourceStart;
+            const sLen = s.sourceEnd - s.sourceStart;
+            const overlapFraction = intersection / Math.min(currentLen, sLen);
+            if (overlapFraction > MERGE_OVERLAP_THRESHOLD) {
+                // Union the range; the section with the longer body contributes its
+                // non-range fields (title, properties, anchorEdge, related, etc.) on
+                // the assumption that a wider classification window grounded its
+                // properties more reliably. Summary always picks the longer of the two.
+                const fieldsWinner = sLen > currentLen ? s : current;
+                current = {
+                    ...fieldsWinner,
+                    sourceStart: Math.min(current.sourceStart, s.sourceStart),
+                    sourceEnd: Math.max(current.sourceEnd, s.sourceEnd),
+                    summary: s.summary.length > current.summary.length ? s.summary : current.summary,
+                };
+            }
+            else {
+                merged.push(current);
+                current = { ...s };
+            }
+        }
+        if (current !== null)
+            merged.push(current);
+    }
+    // Return in whole-document reading order so the writer's :NEXT chain
+    // maps to source order.
+    merged.sort((a, b) => a.sourceStart - b.sourceStart);
+    return merged;
+}
+//# sourceMappingURL=document-chunker.js.map
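A usage sketch of the merge behaviour described in the comments above (illustrative values, not from the package's tests): two same-kind ranges whose intersection exceeds half of the smaller range collapse into one unioned section, while a different-kind section covering the same span survives:

```ts
// Hypothetical input, importing the merge helper added in this version.
import { mergeOverlappingSections } from "./document-chunker.js";

const sections = [
  { kind: "Position", sourceStart: 100, sourceEnd: 300, summary: "Role at Acme, 2019-2022, led the platform team." },
  { kind: "Position", sourceStart: 220, sourceEnd: 340, summary: "Role at Acme." },
  { kind: "Other",    sourceStart: 100, sourceEnd: 340, summary: "Unclassified block." },
];
const merged = mergeOverlappingSections(sections);
// Intersection [220, 300) = 80 chars; the smaller range is 120 chars, so the
// overlap fraction is 80/120 = 0.67 > 0.5 and the two Position ranges union
// to [100, 340). The longer summary wins. The Other section is untouched,
// since cross-kind overlap is preserved. Result (in sourceStart order):
//   { kind: "Position", sourceStart: 100, sourceEnd: 340, summary: "Role at Acme, 2019-2022, led the platform team." }
//   { kind: "Other",    sourceStart: 100, sourceEnd: 340, summary: "Unclassified block." }
```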
package/payload/platform/plugins/memory/mcp/dist/lib/document-chunker.js.map
@@ -0,0 +1 @@
+{"version":3,"file":"document-chunker.js","sourceRoot":"","sources":["../../src/lib/document-chunker.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAqCH,MAAM,UAAU,aAAa,CAAC,IAAY,EAAE,IAAkB;IAC5D,MAAM,EAAE,SAAS,EAAE,OAAO,EAAE,GAAG,IAAI,CAAC;IACpC,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;QACnB,MAAM,IAAI,KAAK,CAAC,kDAAkD,SAAS,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;QAChB,MAAM,IAAI,KAAK,CAAC,oDAAoD,OAAO,EAAE,CAAC,CAAC;IACjF,CAAC;IACD,IAAI,OAAO,IAAI,SAAS,EAAE,CAAC;QACzB,MAAM,IAAI,KAAK,CAAC,2BAA2B,OAAO,kCAAkC,SAAS,GAAG,CAAC,CAAC;IACpG,CAAC;IACD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACjC,IAAI,IAAI,CAAC,MAAM,IAAI,SAAS,EAAE,CAAC;QAC7B,iEAAiE;QACjE,0DAA0D;QAC1D,OAAO,CAAC,EAAE,SAAS,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,MAAM,MAAM,GAAoB,EAAE,CAAC;IACnC,MAAM,MAAM,GAAG,SAAS,GAAG,OAAO,CAAC;IACnC,IAAI,KAAK,GAAG,CAAC,CAAC;IACd,OAAO,KAAK,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC3B,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,SAAS,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC;QACrD,MAAM,CAAC,IAAI,CAAC,EAAE,SAAS,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,GAAG,CAAC,EAAE,UAAU,EAAE,KAAK,EAAE,CAAC,CAAC;QACtE,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM;YAAE,MAAM;QAC9B,KAAK,IAAI,MAAM,CAAC;IAClB,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,8EAA8E;AAC9E,oEAAoE;AACpE,EAAE;AACF,4EAA4E;AAC5E,yEAAyE;AACzE,2EAA2E;AAC3E,sEAAsE;AACtE,kDAAkD;AAClD,EAAE;AACF,sEAAsE;AACtE,yEAAyE;AACzE,4EAA4E;AAC5E,uEAAuE;AACvE,iCAAiC;AACjC,EAAE;AACF,wEAAwE;AACxE,oCAAoC;AACpC,8EAA8E;AAE9E,MAAM,CAAC,MAAM,uBAAuB,GAAG,GAAG,CAAC;AAE3C,MAAM,UAAU,wBAAwB,CAA0B,KAAU;IAC1E,IAAI,KAAK,CAAC,MAAM,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,KAAK,EAAE,CAAC;IAE5C,6DAA6D;IAC7D,MAAM,MAAM,GAAG,IAAI,GAAG,EAAe,CAAC;IACtC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QAC/B,IAAI,GAAG;YAAE,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;;YAChB,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAC/B,CAAC;IAED,MAAM,MAAM,GAAQ,EAAE,CAAC;IACvB,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,EAAE,EAAE,CAAC;QACpC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,IAAI,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAC;QACjF,IAAI,OAAO,GAAa,IAAI,CAAC;QAC7B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;YACtB,IAAI,OAAO,KAAK,IAAI,EAAE,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;gBACnB,SAAS;YACX,CAAC;YACD,MAAM,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC;YAC1H,IAAI,YAAY,KAAK,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;gBACnB,SAAS;YACX,CAAC;YACD,MAAM,UAAU,GAAW,OAAO,CAAC,SAAS,GAAG,OAAO,CAAC,WAAW,CAAC;YACnE,MAAM,IAAI,GAAW,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,WAAW,CAAC;YACjD,MAAM,eAAe,GAAG,YAAY,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,IAAI,CAAC,CAAC;YAClE,IAAI,eAAe,GAAG,uBAAuB,EAAE,CAAC;gBAC9C,oEAAoE;gBACpE,qEAAqE;gBACrE,iEAAiE;gBACjE,wEAAwE;gBACxE,MAAM,YAAY,GAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC;gBACxD,OAAO,GAAG;oBACR,GAAG,YAAY;oBACf,WAAW,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,CAAC,WAAW,CAAC;oBACzD,SAAS,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC,CAAC,SAAS,CAAC;oBACnD,OAAO,EAAE,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,OAAO;iBACjF,CAAC;YACJ,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;gBACrB,OAAO,GAAG,EAAE,GAAG,CAAC,EAAE,CAAC;YACrB,CAAC;QACH,CAAC;QACD,IAAI,OAAO,KAAK,IAAI;YAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAC7C,CAAC;IAED,qEAAqE;IACrE,wBAAwB;IACxB,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,GAAG,CAAC,CAAC,WAAW,CAAC,CAAC;IACrD,OAAO,MAAM,CAAC;AAChB,CAAC"}
package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts
@@ -70,8 +70,31 @@ export interface ClassifiedSection {
     kind: string;
     /** Short human-readable title for the section. */
     title: string;
-    /**
+    /**
+     * The section's body text — embedded and stored on the section node.
+     *
+     * Task 896: server-reconstructed via `documentText.slice(sourceStart, sourceEnd)`.
+     * The LLM emits offsets, never the body text — output size becomes O(sections),
+     * not O(input chars). Callers consume the same `body: string` shape as before.
+     */
     body: string;
+    /**
+     * 1-3 sentence summary of the section, ≤500 chars (server-validated).
+     * The LLM emits this; the server truncates if oversize. Stored as
+     * `properties.summary` on the section node so adjacency search can
+     * surface it without rehydrating the body.
+     */
+    summary: string;
+    /**
+     * Whole-document character offsets — inclusive start, exclusive end.
+     * The LLM emits these; the server validates bounds and reconstructs
+     * `body` via `documentText.slice(sourceStart, sourceEnd)`. In the
+     * chunked-classify path these are translated from chunk-local to
+     * whole-document coordinates so the merge step can detect boundary
+     * straddlers across chunks.
+     */
+    sourceStart: number;
+    sourceEnd: number;
     /** Properties on the section node (excluding accountId/embedding/provenance). */
     properties: Record<string, unknown>;
     /**
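The offset contract above is easy to exercise in isolation. A minimal sketch of the server-side reconstruction the doc comments describe (the document text and offsets here are made up):

```ts
// Hypothetical illustration of the offsets-not-body contract: the model
// returns only sourceStart/sourceEnd, and the server recovers the body
// from the text it already holds.
const documentText = "SUMMARY\nSenior engineer.\n\nEXPERIENCE\nAcme Corp, 2019-2022.\n";
const llmSection = { kind: "Position", title: "Acme Corp", sourceStart: 26, sourceEnd: 59 };
const body = documentText.slice(llmSection.sourceStart, llmSection.sourceEnd);
// body === "EXPERIENCE\nAcme Corp, 2019-2022.\n": byte-equal recovery with
// no body bytes in the model's output, so output stays O(sections).
```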
package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.d.ts.map
@@ -1 +1 @@
-{"version":3,"file":"llm-classifier.d.ts","sourceRoot":"","sources":["../../src/lib/llm-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;
+{"version":3,"file":"llm-classifier.d.ts","sourceRoot":"","sources":["../../src/lib/llm-classifier.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAUH,+DAA+D;AAC/D,MAAM,MAAM,mBAAmB,GAAG,aAAa,GAAG,WAAW,CAAC;AAE9D,mEAAmE;AACnE,MAAM,MAAM,oBAAoB,GAAG,UAAU,GAAG,UAAU,CAAC;AAE3D,kFAAkF;AAClF,MAAM,WAAW,iBAAiB;IAChC,8DAA8D;IAC9D,IAAI,EAAE,MAAM,CAAC;IACb,sCAAsC;IACtC,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC,oDAAoD;IACpD,IAAI,EAAE;QACJ,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,oBAAoB,CAAC;QAChC,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,CAAC;IACF;;;;OAIG;IACH,KAAK,CAAC,EAAE,OAAO,CAAC;CACjB;AAED,oGAAoG;AACpG,MAAM,WAAW,iBAAiB;IAChC;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,IAAI,EAAE,MAAM,CAAC;IACb,kDAAkD;IAClD,KAAK,EAAE,MAAM,CAAC;IACd;;;;;;OAMG;IACH,IAAI,EAAE,MAAM,CAAC;IACb;;;;;OAKG;IACH,OAAO,EAAE,MAAM,CAAC;IAChB;;;;;;;OAOG;IACH,WAAW,EAAE,MAAM,CAAC;IACpB,SAAS,EAAE,MAAM,CAAC;IAClB,iFAAiF;IACjF,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;IACpC;;;;OAIG;IACH,UAAU,EAAE;QACV,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,mBAAmB,CAAC;QAC/B,UAAU,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;KACtC,GAAG,IAAI,CAAC;IACT,oFAAoF;IACpF,OAAO,CAAC,EAAE,iBAAiB,EAAE,CAAC;IAC9B;;;;;;OAMG;IACH,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED;;;;;GAKG;AACH,MAAM,WAAW,eAAe;IAC9B,2CAA2C;IAC3C,IAAI,EAAE,MAAM,CAAC;IACb,6EAA6E;IAC7E,KAAK,EAAE,MAAM,CAAC;IACd,sEAAsE;IACtE,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,oCAAoC;AACpC,MAAM,WAAW,gBAAgB;IAC/B,kDAAkD;IAClD,eAAe,EAAE,MAAM,CAAC;IACxB,kEAAkE;IAClE,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,6BAA6B;IAC7B,QAAQ,EAAE,iBAAiB,EAAE,CAAC;IAC9B,iFAAiF;IACjF,gBAAgB,EAAE,eAAe,EAAE,CAAC;IACpC;;+CAE2C;IAC3C,aAAa,CAAC,EAAE,KAAK,CAAC;QACpB,IAAI,EAAE,MAAM,CAAC;QACb,SAAS,EAAE,UAAU,GAAG,UAAU,CAAC;QACnC,UAAU,EAAE,MAAM,CAAC;QACnB,gBAAgB,EAAE,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC,CAAC;QAC1C,wEAAwE;QACxE,KAAK,CAAC,EAAE,OAAO,CAAC;KACjB,CAAC,CAAC;IACH,mFAAmF;IACnF,mBAAmB,EAAE,MAAM,CAAC;CAC7B;AAED,MAAM,MAAM,cAAc,GACtB;IAAE,IAAI,EAAE,IAAI,CAAC;IAAC,MAAM,EAAE,gBAAgB,CAAA;CAAE,GACxC;IAAE,IAAI,EAAE,UAAU,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC;AA8CzC;;;;;;;;GAQG;AACH,eAAO,MAAM,kBAAkB,UAAU,CAAC;AAE1C,eAAO,MAAM,sBAAsB,wEAMzB,CAAC;AAEX,eAAO,MAAM,wBAAwB,yKAa3B,CAAC;AAEX,eAAO,MAAM,sBAAsB,4SAqBzB,CAAC;AAEX,8EAA8E;AAC9E,eAAO,MAAM,qBAAqB,sBAAuB,CAAC;AAE1D,eAAO,MAAM,iBAAiB,EAAE,WAAW,CAAC,MAAM,CAMhD,CAAC;AAoJH,MAAM,WAAW,cAAc;IAC7B,wCAAwC;IACxC,SAAS,EAAE,MAAM,CAAC;IAClB;;;;;;;OAOG;IACH,IAAI,CAAC,EAAE,UAAU,GAAG,MAAM,CAAC;IAC3B;;;;;;;;OAQG;IACH,iBAAiB,EAAE,MAAM,CAAC;IAC1B;;;;;;OAMG;IACH,cAAc,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC;IACpC;;;;;;OAMG;IACH,cAAc,EAAE,MAAM,CAAC;IACvB,8EAA8E;IAC9E,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;;;;;;;;;;GAaG;AACH,wBAAsB,gBAAgB,CACpC,MAAM,EAAE,cAAc,GACrB,OAAO,CAAC,cAAc,CAAC,CA0UzB"}
package/payload/platform/plugins/memory/mcp/dist/lib/llm-classifier.js
@@ -23,10 +23,45 @@
  */
 import { callOauthLlm } from "../../../../../lib/oauth-llm/dist/index.js";
 import { HAIKU_MODEL } from "../../../../../lib/models/dist/index.js";
+import { chunkDocument, mergeOverlappingSections } from "./document-chunker.js";
 // ---------------------------------------------------------------------------
 // Constants
 // ---------------------------------------------------------------------------
 const MAX_OUTPUT_TOKENS = 8192;
+/**
+ * Per-section summary cap (Task 896 clause 1). The classifier prompt asks
+ * for ≤500 chars; the server truncates anything longer with an ellipsis
+ * marker so a single overlong summary never inflates the output JSON
+ * unbounded. Truncation is observable (logged once per oversize section)
+ * but not fatal — Haiku usually respects the cap.
+ */
+const SUMMARY_MAX_CHARS = 500;
+/**
+ * Output budget the prompt advertises to the model. ≈6000 tokens leaves
+ * headroom under MAX_OUTPUT_TOKENS=8192 for a few hundred sections of
+ * offsets + short summaries without re-emitting body text. Pre-Task-896
+ * the verbatim-body schema made output ≈ input — a 251K-char Adam Mackay
+ * archive truncated mid-word at 8K.
+ */
+const PROMPT_OUTPUT_TOKEN_BUDGET = 6000;
+// ---------------------------------------------------------------------------
+// Task 896 clause 3 — chunker constants for oversize prose.
+//
+// Haiku 4.5: 200K input tokens. Reserve ~5K for prompt + system overhead
+// → ~195K usable tokens × ~3.5 chars/token = ~682K char input ceiling per
+// Haiku call. The chunker emits chunks of ~150K tokens (~525K chars) with
+// ~5K-token (~17.5K-char) overlap so a section straddling the boundary
+// surfaces in both surrounding chunks for the merge step.
+// ---------------------------------------------------------------------------
+const CHARS_PER_TOKEN_ESTIMATE = 3.5;
+const HAIKU_INPUT_TOKEN_BUDGET = 195_000;
+/** Per-Haiku-call hard ceiling on `documentText` characters — enforced regardless of mode. */
+const INPUT_CHAR_CEILING = Math.floor(HAIKU_INPUT_TOKEN_BUDGET * CHARS_PER_TOKEN_ESTIMATE);
+const CHUNK_TOKEN_SIZE = 150_000;
+const CHUNK_OVERLAP_TOKENS = 5_000;
+/** Target chunk char size for the prose chunker (Task 896 clause 3). */
+const CHUNK_CHAR_SIZE = Math.floor(CHUNK_TOKEN_SIZE * CHARS_PER_TOKEN_ESTIMATE);
+const CHUNK_OVERLAP_CHARS = Math.floor(CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN_ESTIMATE);
 /**
  * Closed enumeration of section `kind` values. Each becomes a secondary
  * label on the `:Section` node (e.g. `:Section:Position`). Anything outside
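Worked out, the token-to-char arithmetic in the new constants gives the following derived values (computed directly from the constants in this hunk; the 3.5 ratio is the file's own stated estimate):

```ts
// Derived values for the chunker constants introduced above:
const CHARS_PER_TOKEN_ESTIMATE = 3.5;
const HAIKU_INPUT_TOKEN_BUDGET = 195_000;
const CHUNK_TOKEN_SIZE = 150_000;
const CHUNK_OVERLAP_TOKENS = 5_000;

Math.floor(HAIKU_INPUT_TOKEN_BUDGET * CHARS_PER_TOKEN_ESTIMATE); // 682_500 (INPUT_CHAR_CEILING)
Math.floor(CHUNK_TOKEN_SIZE * CHARS_PER_TOKEN_ESTIMATE);         // 525_000 (CHUNK_CHAR_SIZE)
Math.floor(CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN_ESTIMATE);     // 17_500  (CHUNK_OVERLAP_CHARS)
// CHUNK_CHAR_SIZE (525,000) < INPUT_CHAR_CEILING (682,500), so every chunk
// fits a single Haiku call with ~157K chars of headroom, and documents up
// to 525K chars still take the cheaper single-shot path.
```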
@@ -55,10 +90,7 @@ export const STRUCTURAL_SECTION_KINDS = [
     "Bibliography",
     "Glossary",
     "Acknowledgments",
-    // Task 891
-    // The chat-mode system prompt restricts output to this single kind; the
-    // document-mode prompt never names it. Listed here so the validator's closed
-    // enumeration accepts it without a per-mode dictionary split.
+    // Task 891: chat-mode kind. Emitted only when mode==='chat'; listed here so the validator's closed enumeration accepts it.
     "Conversation",
 ];
 export const CONTRACT_SECTION_KINDS = [
@@ -115,14 +147,17 @@ const CHAT_SYSTEM_PROMPT = [
     " [DD/MM/YYYY, HH:MM:SS] <Sender>: <body>",
     " [DD/MM/YY, HH:MM:SS] <Sender>: <body>",
     " [YYYY-MM-DD HH:MM:SS] <Sender>: <body>",
-    "Body lines without a leading bracketed timestamp belong to the previous message (multi-line bodies). System messages (no sender) and media-only lines (e.g. '<Media omitted>')
+    "Body lines without a leading bracketed timestamp belong to the previous message (multi-line bodies). System messages (no sender) and media-only lines (e.g. '<Media omitted>') belong inside the chunk that covers their position.",
+    "",
+    `OUTPUT BUDGET — your JSON response must fit within ~${PROMPT_OUTPUT_TOKEN_BUDGET} output tokens. Use offsets — NEVER re-emit body text. The server reconstructs each chunk's body from your offsets via documentText.slice(sourceStart, sourceEnd) and stores it on the node, so byte-equal recovery works without you transmitting the bytes.`,
     "",
     "Each chunk is a JSON object with:",
     "- 'kind': MUST be exactly 'Conversation'. No other kinds are legal in chat mode.",
     "- 'title': short human-readable topic label for the chunk (max 120 chars).",
-
+    `- 'summary': 1-3 sentences describing what this chunk is about. Hard ceiling ${SUMMARY_MAX_CHARS} characters — the server truncates anything longer.`,
+    "- 'sourceStart': INTEGER character offset into the supplied archive text where this chunk's first message begins (0-indexed, inclusive). MUST point at the opening '[' of the bracketed timestamp prefix.",
+    "- 'sourceEnd': INTEGER character offset where this chunk ends (exclusive). MUST be > sourceStart and ≤ total length of the supplied text.",
     "- 'properties': required typed properties on the chunk node:",
-    "  summary : 1-3 sentences describing what this chunk is about (this is your one chance to summarise; the body stays verbatim).",
     "  keywords : array of 3-10 lowercase topic keywords for retrieval.",
     "  firstMessageAt : timestamp of the first message in the chunk, copied verbatim from the line prefix (preserve the file's native format and any offset).",
     "  lastMessageAt : timestamp of the last message in the chunk, copied verbatim from the line prefix.",
@@ -142,7 +177,7 @@ const CHAT_SYSTEM_PROMPT = [
     "- Split at topic transitions, not at message count or arbitrary intervals. A coherent exchange ('let's discuss the deck') is one chunk; a separate exchange ('what time tomorrow?') is another.",
     "- An archive of fewer than ~10 messages is usually one chunk.",
     "- Even a one-message archive must produce one chunk — never return zero chunks for non-empty input.",
-    "-
+    "- Offset coverage: chunks MUST cover every message in chronological order. Adjacent chunks should be contiguous (chunk N's sourceEnd equals chunk N+1's sourceStart) so no message is skipped. messageCount summed across chunks equals total archive messages.",
     "",
     "Respond with ONLY the JSON object, no prose, no markdown fences.",
 ].join("\n");
@@ -154,6 +189,8 @@ const SYSTEM_PROMPT = [
     "2. The natural-edge map naming the anchor edge for identity-kind sections.",
     "3. The full document text.",
     "",
+    `OUTPUT BUDGET — your JSON response must fit within ~${PROMPT_OUTPUT_TOKEN_BUDGET} output tokens. Use offsets — NEVER re-emit body text. The server reconstructs each section's body from your offsets via documentText.slice(sourceStart, sourceEnd) and stores it on the node. Per-section 'summary' is hard-capped at ${SUMMARY_MAX_CHARS} chars.`,
+    "",
     "Closed enumeration of section `kind` values:",
     ` Identity (anchor edge to subject): ${IDENTITY_SECTION_KINDS.join(", ")}`,
     ` Document-structural (no anchor edge; HAS_SECTION + NEXT only): ${STRUCTURAL_SECTION_KINDS.join(", ")}`,
@@ -164,7 +201,9 @@ const SYSTEM_PROMPT = [
     "For each meaningful section, return a JSON object with:",
     "- 'kind': one of the closed-enumeration values above. Never invent new kinds; use 'Other' with a 'classifierReason' if nothing fits.",
     "- 'title': short human-readable title (max 120 chars).",
-
+    `- 'summary': 1-3 sentences describing the section. Hard ceiling ${SUMMARY_MAX_CHARS} characters — the server truncates anything longer.`,
+    "- 'sourceStart': INTEGER character offset into the supplied document text where this section begins (0-indexed, inclusive).",
+    "- 'sourceEnd': INTEGER character offset where this section ends (exclusive). MUST be > sourceStart and ≤ total length of the supplied text.",
     "- 'properties': any typed properties for the section node (e.g. for Position: jobTitle, startDate, endDate; for Education: degree, fieldOfStudy; do NOT include accountId, embedding, createdAt, or other system fields — the writer adds them).",
     "- 'anchorEdge': for identity-kind sections (Position, Education, Credential, Skill, Biography) and for standalone Project, an object { type, direction, properties } naming the natural edge to the document subject (e.g. UserProfile -[HAS_POSITION]-> the Section). 'direction' is 'from-anchor' if the subject points at the section, 'to-anchor' if the section points at the subject. Set to null for structural + contract-clause kinds and for 'Other'.",
     "- 'related': optional array of additional entity nodes this section references (e.g. a Position section's employer Organization via AT, an Education section's school Organization via ATTENDED). Each entry: { kind, properties, edge: { type, direction, properties }, merge: true|false }. Direction is 'outgoing' (section -> related) or 'incoming' (section <- related). Use 'merge': true for entities reused across documents (Organization by name, Person by email/telephone).",
@@ -186,7 +225,7 @@ const SYSTEM_PROMPT = [
     "- 'kind' values are restricted to the closed enumeration above. If a section truly fits no listed kind, use 'Other' with a 'classifierReason'. Never emit a kind not on the list.",
     "- Never invent edge names. Use the natural-edge map exactly as given. The graph validator rejects writes with unknown edge types.",
     "- Be conservative with 'related' entities — only include them when the section explicitly names them.",
-    "-
+    "- Offsets cover the source: sourceStart and sourceEnd are integer character positions in the supplied document text. Do not re-emit body text — the server reconstructs it from your offsets.",
     "- Respond with ONLY the JSON object, no prose, no markdown fences.",
 ].join("\n");
 // ---------------------------------------------------------------------------
@@ -206,6 +245,22 @@ function asString(v) {
 function asObject(v) {
     return v && typeof v === "object" && !Array.isArray(v) ? v : null;
 }
+/**
+ * Coerce a JSON value into a non-negative integer character offset, or null
+ * if it isn't one. Floats, NaN, negatives, and non-numbers all return null —
+ * Haiku has been observed emitting `null` and stringly-typed offsets when
+ * stressed; we drop the section silently and let the missing-offsets
+ * diagnostic surface the rate.
+ */
+function asNonNegativeInt(v) {
+    if (typeof v !== "number")
+        return null;
+    if (!Number.isFinite(v) || !Number.isInteger(v))
+        return null;
+    if (v < 0)
+        return null;
+    return v;
+}
 /**
  * Classify a document into typed sections via Haiku (Task 740).
 *
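Spelled out, the new guard accepts and rejects as follows (illustrative calls; the signature is restated for the sketch):

```ts
// Behaviour of the asNonNegativeInt helper added above:
declare function asNonNegativeInt(v: unknown): number | null;

asNonNegativeInt(0);      // 0     (zero is a valid offset)
asNonNegativeInt(1342);   // 1342
asNonNegativeInt(3.5);    // null  (not an integer)
asNonNegativeInt(-1);     // null  (negative)
asNonNegativeInt(NaN);    // null  (not finite)
asNonNegativeInt("1342"); // null  (stringly-typed offsets are dropped, not coerced)
asNonNegativeInt(null);   // null  (an observed Haiku failure mode)
```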
@@ -223,6 +278,28 @@ function asObject(v) {
 export async function classifyDocument(params) {
     const { accountId, anchorDescription, ontologyLabels, naturalEdgeMap, documentText } = params;
     const mode = params.mode ?? "document";
+    // Task 896 clause 3 dispatch — oversize document mode goes to the chunked
+    // path; oversize chat mode loud-fails (sessionize must keep sessions under
+    // the ceiling, per eng review). Single-shot path stays unchanged below.
+    if (mode === "document" && documentText.length > CHUNK_CHAR_SIZE) {
+        return classifyDocumentChunked(params);
+    }
+    if (documentText.length > INPUT_CHAR_CEILING) {
+        const overage = `chars=${documentText.length}, ceiling=${INPUT_CHAR_CEILING}`;
+        if (mode === "chat") {
+            logFallback(accountId, `input-too-large: chat session exceeds Haiku input ceiling (${overage}). Sessionize must split sessions before classify (Task 894).`);
+        }
+        else {
+            // Document mode > INPUT_CHAR_CEILING but ≤ CHUNK_CHAR_SIZE shouldn't
+            // happen since CHUNK_CHAR_SIZE < INPUT_CHAR_CEILING — kept as
+            // defence-in-depth in case constants drift.
+            logFallback(accountId, `input-too-large: document exceeds Haiku input ceiling without chunking (${overage}). Constants drift between CHUNK_CHAR_SIZE and INPUT_CHAR_CEILING.`);
+        }
+        return {
+            kind: "fallback",
+            reason: `Input is ${documentText.length} chars; classifier ceiling is ${INPUT_CHAR_CEILING}.`,
+        };
+    }
     // System prompt + user message branch on mode. Chat mode strips the
     // natural-edge map and reframes the input as a session of turn-attributed
     // text; document mode is unchanged from Task 740.
@@ -276,8 +353,17 @@ export async function classifyDocument(params) {
     try {
         parsed = JSON.parse(jsonText);
     }
-    catch {
-
+    catch (err) {
+        // Task 896 clause 5: surface diagnostics so a malformed-JSON fallback
+        // distinguishes truncation (output budget exceeded), fence drift, and
+        // genuine model junk. Pre-Task-896 the fallback discarded the parser
+        // error and 120 chars from the post-strip text — Adam Mackay's 251K-char
+        // ingest bottomed out here with no visible cause.
+        const message = err instanceof Error ? err.message : String(err);
+        const fenceStripped = jsonText !== responseText;
+        logFallback(accountId, `malformed JSON: parse-error=${JSON.stringify(message)} len=${responseText.length} fence-stripped=${fenceStripped} ` +
+            `pre-strip-head=${JSON.stringify(responseText.slice(0, 200))} ` +
+            `pre-strip-tail=${JSON.stringify(responseText.slice(-200))}`);
         return { kind: "fallback", reason: "Haiku returned malformed JSON" };
     }
     const root = asObject(parsed);
@@ -296,15 +382,44 @@ export async function classifyDocument(params) {
     }
     const sections = [];
     let hallucinatedRelated = 0;
+    // Task 896 clause 1 diagnostics — counters for offset/summary post-validation
+    // failures so the haiku-ok log line names the rate of model misbehaviour.
+    // Per-section drops are silent; the aggregate count tells the operator
+    // whether the prompt is degrading.
+    let droppedForOffsets = 0;
+    let summaryTruncated = 0;
     for (const raw of rawSections) {
         const obj = asObject(raw);
         if (!obj)
             continue;
+        // Task 896 clause 1: read offsets and reconstruct body server-side.
+        // Pre-Task-896 the LLM emitted body verbatim, making output ≈ input
+        // and causing 8K-token truncation on >80K-char inputs.
+        const sourceStart = asNonNegativeInt(obj.sourceStart);
+        const sourceEnd = asNonNegativeInt(obj.sourceEnd);
+        if (sourceStart === null || sourceEnd === null) {
+            droppedForOffsets += 1;
+            continue;
+        }
+        if (sourceEnd <= sourceStart || sourceEnd > documentText.length) {
+            droppedForOffsets += 1;
+            continue;
+        }
+        const body = documentText.slice(sourceStart, sourceEnd);
+        if (body.length === 0) {
+            droppedForOffsets += 1;
+            continue;
+        }
         const title = asString(obj.title) ?? "";
-        const body = asString(obj.body) ?? "";
         const properties = asObject(obj.properties) ?? {};
-
-
+        let summary = asString(obj.summary) ?? "";
+        if (summary.length > SUMMARY_MAX_CHARS) {
+            summary = summary.slice(0, SUMMARY_MAX_CHARS - 1) + "…";
+            summaryTruncated += 1;
+        }
+        // Mirror summary into properties so the Neo4j section node carries it
+        // (chat-mode parity — pre-Task-896 chunks stored summary as properties.summary).
+        properties.summary = summary;
         if (mode === "chat") {
             // Chat mode: only `Conversation` is legal. Haiku is instructed to emit
             // exactly that kind; force it here so a misfire still produces a valid
@@ -315,6 +430,9 @@ export async function classifyDocument(params) {
                 kind: "Conversation",
                 title: title.slice(0, 200),
                 body,
+                summary,
+                sourceStart,
+                sourceEnd,
                 properties,
                 anchorEdge: null,
             });
@@ -376,6 +494,9 @@ export async function classifyDocument(params) {
             kind,
             title: title.slice(0, 200),
             body,
+            summary,
+            sourceStart,
+            sourceEnd,
             properties,
             anchorEdge: kind === SECTION_KIND_OTHER ? null : anchorEdge,
             related: related.length > 0 ? related : undefined,
@@ -384,6 +505,19 @@ export async function classifyDocument(params) {
                 : {}),
         });
     }
+    // Missing-offsets fallback (Task 896 clause 1, surfaced by CEO review):
+    // if Haiku emitted sections but every one failed offset validation, we'd
+    // otherwise return an empty `sections` array silently and the writer would
+    // happily produce zero `:Section` nodes. Loud-fail instead so the operator
+    // sees the regression — typically caused by a model that ignored the new
+    // offset contract and reverted to emitting `body`.
+    if (rawSections.length > 0 && sections.length === 0) {
+        logFallback(accountId, `missing-offsets: every section failed offset validation (rawSections=${rawSections.length}, droppedForOffsets=${droppedForOffsets}). Likely cause: Haiku emitted body text instead of sourceStart/sourceEnd offsets, or the prompt update didn't reach the model.`);
+        return {
+            kind: "fallback",
+            reason: "Haiku response had no parseable section offsets",
+        };
+    }
     // Top-level orphan candidates and document-level edges are document-mode
     // concepts. In chat mode the operator confirms participants up front and
     // attaches them as :PARTICIPANT_IN edges off the :ConversationArchive
@@ -432,7 +566,7 @@ export async function classifyDocument(params) {
             }
         }
     }
-    process.stderr.write(`[memory-classify] [${accountId}] haiku ok (mode=${mode}, sections=${sections.length}, orphanCandidates=${orphanCandidates.length}, hallucinatedRelated=${hallucinatedRelated}, elapsedMs=${haikuMs})\n`);
+    process.stderr.write(`[memory-classify] [${accountId}] haiku ok (mode=${mode}, sections=${sections.length}, orphanCandidates=${orphanCandidates.length}, hallucinatedRelated=${hallucinatedRelated}, droppedForOffsets=${droppedForOffsets}, summaryTruncated=${summaryTruncated}, elapsedMs=${haikuMs})\n`);
     return {
         kind: "ok",
         output: {
@@ -445,4 +579,120 @@ export async function classifyDocument(params) {
         },
     };
 }
+// ---------------------------------------------------------------------------
+// Chunked classification path (Task 896 clause 3).
+//
+// Used only for document mode when the input exceeds CHUNK_CHAR_SIZE. Each
+// chunk is classified independently via the same single-shot path; the
+// per-chunk results are stitched back together with offset translation and
+// a same-kind merge to fix sections that straddled a chunk boundary.
+//
+// documentSummary is dropped in chunked mode (Haiku only sees one chunk at
+// a time, so no per-chunk summary describes the whole document) — see
+// 896-followup if a downstream consumer needs a synthesised whole-doc
+// summary later.
+// ---------------------------------------------------------------------------
+async function classifyDocumentChunked(params) {
+    const { accountId, documentText } = params;
+    const chunks = chunkDocument(documentText, {
+        chunkSize: CHUNK_CHAR_SIZE,
+        overlap: CHUNK_OVERLAP_CHARS,
+    });
+    process.stderr.write(`[memory-classify] [${accountId}] chunked path: chunks=${chunks.length} chars=${documentText.length} chunkSize=${CHUNK_CHAR_SIZE} overlap=${CHUNK_OVERLAP_CHARS}\n`);
+    // Defence-in-depth: chunkSize < INPUT_CHAR_CEILING by construction, so
+    // no chunk should exceed the per-call ceiling. If one does, that's a
+    // chunker bug or constants-drift — loud-fail instead of pretending.
+    for (const c of chunks) {
+        if (c.chunkText.length > INPUT_CHAR_CEILING) {
+            logFallback(accountId, `input-too-large: chunker emitted oversize chunk (chars=${c.chunkText.length}, ceiling=${INPUT_CHAR_CEILING}). Chunker invariant violated.`);
+            return {
+                kind: "fallback",
+                reason: `Chunker produced an oversize chunk (${c.chunkText.length} > ${INPUT_CHAR_CEILING})`,
+            };
+        }
+    }
+    const allSections = [];
+    const allKeywords = new Set();
+    const allOrphans = [];
+    const allDocumentEdges = [];
+    let totalHallucinatedRelated = 0;
+    for (let i = 0; i < chunks.length; i++) {
+        const c = chunks[i];
+        process.stderr.write(`[memory-classify] [${accountId}] classify-chunk ${i + 1}/${chunks.length} (chars=${c.chunkText.length}, baseOffset=${c.baseOffset})\n`);
+        // Recurse into the single-shot path — chunkSize < CHUNK_CHAR_SIZE is the
+        // dispatch threshold so the recursive call lands in the existing logic.
+        const chunkResult = await classifyDocument({ ...params, documentText: c.chunkText });
+        if (chunkResult.kind === "fallback") {
+            // One chunk failure aborts the whole ingest (loud-failure doctrine).
+            return chunkResult;
+        }
+        for (const s of chunkResult.output.sections) {
+            const wholeStart = s.sourceStart + c.baseOffset;
+            const wholeEnd = s.sourceEnd + c.baseOffset;
+            allSections.push({
+                ...s,
+                sourceStart: wholeStart,
+                sourceEnd: wholeEnd,
+                body: documentText.slice(wholeStart, wholeEnd),
+            });
+        }
+        chunkResult.output.documentKeywords.forEach((k) => allKeywords.add(k));
+        allOrphans.push(...chunkResult.output.orphanCandidates);
+        if (chunkResult.output.documentEdges) {
+            allDocumentEdges.push(...chunkResult.output.documentEdges);
+        }
+        totalHallucinatedRelated += chunkResult.output.hallucinatedRelated;
+    }
+    // Same-kind boundary-straddler merge. Cross-kind overlap is preserved as
+    // distinct sections per eng review — disagreement about kind is operator-
+    // visible signal, not noise to collapse.
+    const mergedSections = mergeOverlappingSections(allSections);
+    // After the merge, any merged section whose range was unioned needs its
+    // body re-sliced from the whole document so it covers the union, not just
+    // one of the contributing chunks. Walk the result and re-slice — cheap.
+    for (const s of mergedSections) {
+        s.body = documentText.slice(s.sourceStart, s.sourceEnd);
+    }
+    // documentEdges dedupe — a Parties / PARTICIPANT / FROM-TO target named
+    // across multiple chunks would otherwise be appended N times and the
+    // writer would attempt N edge writes against the same MERGEd target.
+    // Stable key = (type, targetKind, JSON.stringify(targetProperties))
+    // since two chunks emitting "PARTY of Person {givenName, familyName}"
+    // for the same party will produce identical targetProperties shapes.
+    const dedupedDocumentEdges = [];
+    const seenEdgeKeys = new Set();
+    for (const edge of allDocumentEdges) {
+        const key = `${edge.type}|${edge.direction}|${edge.targetKind}|${JSON.stringify(edge.targetProperties)}`;
+        if (seenEdgeKeys.has(key))
+            continue;
+        seenEdgeKeys.add(key);
+        dedupedDocumentEdges.push(edge);
+    }
+    // Orphan candidates similarly may repeat across chunks (same hallucinated
+    // node mentioned in two adjacent windows). Dedupe on (kind, label).
+    const dedupedOrphans = [];
+    const seenOrphanKeys = new Set();
+    for (const o of allOrphans) {
+        const key = `${o.kind}|${o.label}`;
+        if (seenOrphanKeys.has(key))
+            continue;
+        seenOrphanKeys.add(key);
+        dedupedOrphans.push(o);
+    }
+    process.stderr.write(`[memory-classify] [${accountId}] chunked merge: rawSections=${allSections.length} mergedSections=${mergedSections.length} rawEdges=${allDocumentEdges.length} mergedEdges=${dedupedDocumentEdges.length} rawOrphans=${allOrphans.length} mergedOrphans=${dedupedOrphans.length} hallucinatedRelated=${totalHallucinatedRelated}\n`);
+    return {
+        kind: "ok",
+        output: {
+            // documentSummary is dropped in chunked mode — Haiku never saw the
+            // whole document. Downstream consumers that need a whole-doc summary
+            // should call a separate reduce step (out of scope for this change).
+            documentSummary: "",
+            documentKeywords: Array.from(allKeywords),
+            sections: mergedSections,
+            orphanCandidates: dedupedOrphans,
+            ...(dedupedDocumentEdges.length > 0 ? { documentEdges: dedupedDocumentEdges } : {}),
+            hallucinatedRelated: totalHallucinatedRelated,
+        },
+    };
+}
 //# sourceMappingURL=llm-classifier.js.map