@vivantel/virage-chunker-ce-ast 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ast-walker.d.ts +13 -0
- package/dist/ast-walker.d.ts.map +1 -0
- package/dist/ast-walker.js +33 -0
- package/dist/ast-walker.js.map +1 -0
- package/dist/chunker.d.ts +27 -0
- package/dist/chunker.d.ts.map +1 -0
- package/dist/chunker.js +127 -0
- package/dist/chunker.js.map +1 -0
- package/dist/index.d.ts +7 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -0
- package/dist/outline.d.ts +4 -0
- package/dist/outline.d.ts.map +1 -0
- package/dist/outline.js +16 -0
- package/dist/outline.js.map +1 -0
- package/dist/types.d.ts +51 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/package.json +43 -0
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { DocNode, DocNodeAttrs } from "./types.js";
|
|
2
|
+
/** One piece of text emitted by `walkDocNode`, together with the context of the node it came from. */
export interface TextSegment {
|
|
3
|
+
/** The emitting node's `text`; `walkDocNode` only emits nodes whose text is non-empty. */
text: string;
|
|
4
|
+
/** The emitting node's `attrs` object, passed through by reference (not copied). */
attrs: DocNodeAttrs;
|
|
5
|
+
/** Ancestor heading texts at the time this segment was emitted. */
|
|
6
|
+
/* Each segment receives its own copy of the walker's breadcrumb array. */
breadcrumb: string[];
|
|
7
|
+
}
|
|
8
|
+
/**
|
|
9
|
+
* Depth-first (pre-order) walk of a DocNode tree collecting leaf text segments.
|
|
10
|
+
* Maintains a breadcrumb of heading texts that is updated whenever a heading node
* with non-empty text is visited (there is no separate "exit" step).
*
* A segment is emitted for every node with non-empty `text` whose type is not
* "heading", "image" or "link". Segments are returned in visit order; the result
* is an empty array when no node qualifies.
|
|
11
|
+
*/
|
|
12
|
+
export declare function walkDocNode(root: DocNode): TextSegment[];
|
|
13
|
+
//# sourceMappingURL=ast-walker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ast-walker.d.ts","sourceRoot":"","sources":["../src/ast-walker.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAExD,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,YAAY,CAAC;IACpB,mEAAmE;IACnE,UAAU,EAAE,MAAM,EAAE,CAAC;CACtB;AAED;;;GAGG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,OAAO,GAAG,WAAW,EAAE,CAiCxD"}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Depth-first walk of a DocNode tree collecting leaf text segments.
|
|
3
|
+
* Maintains a breadcrumb stack updated on entry/exit of heading nodes.
|
|
4
|
+
*/
|
|
5
|
+
export function walkDocNode(root) {
|
|
6
|
+
const segments = [];
|
|
7
|
+
const breadcrumb = [];
|
|
8
|
+
function visit(node) {
|
|
9
|
+
if (node.type === "heading" && node.text) {
|
|
10
|
+
const level = (node.attrs.headingLevel ?? 1) - 1;
|
|
11
|
+
// Truncate breadcrumb at this heading level and push the new heading.
|
|
12
|
+
breadcrumb.splice(level, breadcrumb.length - level, node.text);
|
|
13
|
+
}
|
|
14
|
+
// Emit leaf text (non-structural nodes that carry text content).
|
|
15
|
+
const isLeafText = node.text != null &&
|
|
16
|
+
node.type !== "heading" &&
|
|
17
|
+
node.type !== "image" &&
|
|
18
|
+
node.type !== "link";
|
|
19
|
+
if (isLeafText && node.text) {
|
|
20
|
+
segments.push({
|
|
21
|
+
text: node.text,
|
|
22
|
+
attrs: node.attrs,
|
|
23
|
+
breadcrumb: [...breadcrumb],
|
|
24
|
+
});
|
|
25
|
+
}
|
|
26
|
+
for (const child of node.children ?? []) {
|
|
27
|
+
visit(child);
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
visit(root);
|
|
31
|
+
return segments;
|
|
32
|
+
}
|
|
33
|
+
//# sourceMappingURL=ast-walker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ast-walker.js","sourceRoot":"","sources":["../src/ast-walker.ts"],"names":[],"mappings":"AASA;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,IAAa;IACvC,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,SAAS,KAAK,CAAC,IAAa;QAC1B,IAAI,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YACzC,MAAM,KAAK,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YACjD,sEAAsE;YACtE,UAAU,CAAC,MAAM,CAAC,KAAK,EAAE,UAAU,CAAC,MAAM,GAAG,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACjE,CAAC;QAED,iEAAiE;QACjE,MAAM,UAAU,GACd,IAAI,CAAC,IAAI,IAAI,IAAI;YACjB,IAAI,CAAC,IAAI,KAAK,SAAS;YACvB,IAAI,CAAC,IAAI,KAAK,OAAO;YACrB,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC;QAEvB,IAAI,UAAU,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YAC5B,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,UAAU,EAAE,CAAC,GAAG,UAAU,CAAC;aAC5B,CAAC,CAAC;QACL,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ,IAAI,EAAE,EAAE,CAAC;YACxC,KAAK,CAAC,KAAK,CAAC,CAAC;QACf,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,CAAC;IACZ,OAAO,QAAQ,CAAC;AAClB,CAAC"}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
import type { DocNode, ChunkMeta } from "./types.js";
|
|
2
|
+
/** Options accepted by `walkToChunks`. */
export interface WalkOptions {
|
|
3
|
+
/** Copied verbatim into every chunk's `sourceFile` and `metadata.sourceFile`. */
sourceFile: string;
|
|
4
|
+
/** Copied verbatim into every chunk's `metadata.sourceFormat`. */
sourceFormat: string;
|
|
5
|
+
/** Copied verbatim into every chunk's `commitHash`. */
commitHash: string;
|
|
6
|
+
/** Copied verbatim into every chunk's `metadata.strategy`. */
strategy: string;
|
|
7
|
+
/** Upper bound on a chunk's estimated tokens (about 4 characters per token). Defaults to 512. */
maxTokens?: number;
|
|
8
|
+
/** A final chunk estimated below this size is merged into its predecessor. Defaults to floor(maxTokens / 4). */
minTokens?: number;
|
|
9
|
+
/** Added to chunk metadata only when truthy. */
fileHash?: string;
|
|
10
|
+
/** Added to chunk metadata when not null/undefined (0 is kept). */
fileSizeBytes?: number;
|
|
11
|
+
/** Added to chunk metadata only when truthy. */
fileModifiedAt?: string;
|
|
12
|
+
}
|
|
13
|
+
/** One chunk produced by `walkToChunks`. */
export interface ChunkResult {
|
|
14
|
+
/** The chunk's segment texts joined with a blank line ("\n\n"). */
content: string;
|
|
15
|
+
metadata: ChunkMeta;
|
|
16
|
+
/** Same value as `WalkOptions.sourceFile`. */
sourceFile: string;
|
|
17
|
+
/** Same value as `WalkOptions.commitHash`. */
commitHash: string;
|
|
18
|
+
/** Not populated by `walkToChunks`. NOTE(review): presumably filled in by a later pipeline stage; confirm. */
contentHash?: string;
|
|
19
|
+
}
|
|
20
|
+
/**
|
|
21
|
+
* Walk a ViDoc AST and produce Chunk[] with full ChunkMeta.
|
|
22
|
+
*
|
|
23
|
+
* Splits at text-segment boundaries: a new chunk is started when adding the next
* segment would push the estimated size (about 4 characters per token) above maxTokens.
* A single segment that alone exceeds maxTokens is hard-cut to maxTokens * 4
* characters, emitted as its own chunk and flagged `truncated`.
|
|
24
|
+
* Merges the final window into its predecessor when it is shorter than minTokens
* (only the last window is considered).
* Returns an empty array when the tree yields no text segments.
|
|
25
|
+
*/
|
|
26
|
+
export declare function walkToChunks(root: DocNode, opts: WalkOptions): ChunkResult[];
|
|
27
|
+
//# sourceMappingURL=chunker.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAIrD,MAAM,WAAW,WAAW;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,SAAS,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAQD;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,GAAG,WAAW,EAAE,CAoI5E"}
|
package/dist/chunker.js
ADDED
|
@@ -0,0 +1,127 @@
|
|
|
1
|
+
import { walkDocNode } from "./ast-walker.js";
|
|
2
|
+
import { extractOutline } from "./outline.js";
|
|
3
|
+
/** Characters assumed per token by the size heuristic below. */
const CHARS_PER_TOKEN = 4;
/**
 * Estimate the token count of a string from its length alone.
 *
 * @param text String to measure (its `length` is used).
 * @returns The length divided by CHARS_PER_TOKEN, rounded up; 0 for "".
 */
function estimateTokens(text) {
    const charCount = text.length;
    const fractionalTokens = charCount / CHARS_PER_TOKEN;
    return Math.ceil(fractionalTokens);
}
|
|
7
|
+
/**
 * Walk a ViDoc AST and produce Chunk[] with full ChunkMeta.
 *
 * Text segments from `walkDocNode` are packed greedily into windows:
 *  - a window is closed when adding the next segment would push its estimated
 *    size above `maxTokens`;
 *  - a single segment that alone exceeds `maxTokens` is hard-cut to
 *    `maxTokens * CHARS_PER_TOKEN` characters, emitted as its own window and
 *    flagged `truncated` (the remainder of that segment is discarded);
 *  - if the final window is shorter than `minTokens` it is merged into its
 *    predecessor (only the last window is considered).
 *
 * Every window takes its breadcrumb and byte start from the first segment it
 * actually contains.
 *
 * @param root Root DocNode of the document.
 * @param opts Source identification plus optional limits; `maxTokens` defaults
 *             to 512 and `minTokens` to floor(maxTokens / 4).
 * @returns One result per window, in document order; an empty array when the
 *          tree yields no text segments.
 */
export function walkToChunks(root, opts) {
    const maxTokens = opts.maxTokens ?? 512;
    const minTokens = opts.minTokens ?? Math.floor(maxTokens / 4);
    const documentOutline = extractOutline(root);
    const segments = walkDocNode(root);
    if (segments.length === 0)
        return [];
    const windows = [];
    // `current` is null whenever no window is open, so a new window is always
    // seeded from the first segment that really goes into it.
    let current = null;
    let currentTokens = 0;
    for (const seg of segments) {
        const segTokens = estimateTokens(seg.text);
        // If adding this segment would overflow the open window, flush it first.
        if (current !== null && currentTokens + segTokens > maxTokens) {
            windows.push(current);
            current = null;
            currentTokens = 0;
        }
        // A single segment larger than the limit becomes its own truncated window.
        // Any open window was flushed above, since segTokens alone exceeds maxTokens.
        if (segTokens > maxTokens) {
            const cut = openWindow(seg);
            absorbSegment(cut, seg, seg.text.slice(0, maxTokens * CHARS_PER_TOKEN));
            cut.truncated = true;
            windows.push(cut);
            continue;
        }
        if (current === null)
            current = openWindow(seg);
        absorbSegment(current, seg, seg.text);
        currentTokens += segTokens;
    }
    if (current !== null) {
        windows.push(current);
    }
    // Merge trailing window into predecessor if it is below minTokens.
    if (windows.length > 1) {
        const last = windows[windows.length - 1];
        if (estimateTokens(last.texts.join("\n\n")) < minTokens) {
            const prev = windows[windows.length - 2];
            prev.texts.push(...last.texts);
            prev.byteEnd = last.byteEnd;
            // Keep the predecessor's values when the merged window has none,
            // instead of overwriting them with undefined.
            prev.lineStart = prev.lineStart ?? last.lineStart;
            prev.lineEnd = last.lineEnd ?? prev.lineEnd;
            prev.pageStart = prev.pageStart ?? last.pageStart;
            prev.pageEnd = last.pageEnd ?? prev.pageEnd;
            prev.lang = prev.lang || last.lang;
            prev.codeLanguage = prev.codeLanguage || last.codeLanguage;
            prev.truncated = prev.truncated || last.truncated;
            windows.pop();
        }
    }
    // Computed after the merge so it matches the number of results returned.
    const totalChunks = windows.length;
    return windows.map((win, i) => {
        const content = win.texts.join("\n\n");
        const meta = {
            sourceFile: opts.sourceFile,
            sourceFormat: opts.sourceFormat,
            byteStart: win.byteStart,
            byteEnd: win.byteEnd,
            lineStart: win.lineStart,
            lineEnd: win.lineEnd,
            pageStart: win.pageStart,
            pageEnd: win.pageEnd,
            breadcrumb: win.breadcrumb,
            sectionTitle: win.breadcrumb.at(-1),
            // Depth of the heading path, or undefined when the chunk is under no heading.
            headingLevel: win.breadcrumb.length > 0 ? win.breadcrumb.length : undefined,
            documentOutline,
            lang: win.lang,
            codeLanguage: win.codeLanguage,
            strategy: opts.strategy,
            chunkIndex: i,
            totalChunks,
            estimatedTokens: estimateTokens(content),
            truncated: win.truncated,
            // Optional file facts are added only when the caller supplied them.
            ...(opts.fileHash ? { fileHash: opts.fileHash } : {}),
            ...(opts.fileSizeBytes != null ? { fileSizeBytes: opts.fileSizeBytes } : {}),
            ...(opts.fileModifiedAt ? { fileModifiedAt: opts.fileModifiedAt } : {}),
        };
        return {
            content,
            metadata: meta,
            sourceFile: opts.sourceFile,
            commitHash: opts.commitHash,
        };
    });
}
/** Create an empty window whose breadcrumb and byte range start at `seg`. */
function openWindow(seg) {
    return {
        texts: [],
        byteStart: seg.attrs.byteStart,
        byteEnd: seg.attrs.byteEnd,
        breadcrumb: seg.breadcrumb,
        truncated: false,
    };
}
/** Append `text` (taken from `seg`) to `win` and extend the window's position and language metadata. */
function absorbSegment(win, seg, text) {
    const attrs = seg.attrs;
    win.texts.push(text);
    win.byteEnd = attrs.byteEnd;
    // Start values keep the first one seen; end values track the latest one seen.
    if (win.lineStart == null && attrs.lineStart != null)
        win.lineStart = attrs.lineStart;
    if (attrs.lineEnd != null)
        win.lineEnd = attrs.lineEnd;
    if (win.pageStart == null && attrs.pageNumber != null)
        win.pageStart = attrs.pageNumber;
    if (attrs.pageNumber != null)
        win.pageEnd = attrs.pageNumber;
    if (!win.lang && attrs.lang)
        win.lang = attrs.lang;
    if (!win.codeLanguage && attrs.codeLanguage)
        win.codeLanguage = attrs.codeLanguage;
}
|
|
127
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAsB9C,MAAM,eAAe,GAAG,CAAC,CAAC;AAE1B,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,eAAe,CAAC,CAAC;AAClD,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,IAAa,EAAE,IAAiB;IAC3D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAC;IACxC,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC;IAC9D,MAAM,eAAe,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;IAEnC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAiBrC,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,OAAO,GAAW;QACpB,KAAK,EAAE,EAAE;QACT,SAAS,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,SAAS;QACvC,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,OAAO;QACnC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,UAAU;QACnC,SAAS,EAAE,KAAK;KACjB,CAAC;IACF,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC3B,MAAM,SAAS,GAAG,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAE3C,4EAA4E;QAC5E,IAAI,aAAa,GAAG,CAAC,IAAI,aAAa,GAAG,SAAS,GAAG,SAAS,EAAE,CAAC;YAC/D,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,OAAO,GAAG;gBACR,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,GAAG,CAAC,KAAK,CAAC,SAAS;gBAC9B,OAAO,EAAE,GAAG,CAAC,KAAK,CAAC,OAAO;gBAC1B,UAAU,EAAE,GAAG,CAAC,UAAU;gBAC1B,SAAS,EAAE,KAAK;aACjB,CAAC;YACF,aAAa,GAAG,CAAC,CAAC;QACpB,CAAC;QAED,sDAAsD;QACtD,IAAI,SAAS,GAAG,SAAS,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,SAAS,GAAG,eAAe,CAAC;YAC7C,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC;YAChD,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC;YACpC,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC;YACzB,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,OAAO,GAAG;gBACR,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,GAAG,CAAC,KAAK,CAAC,OAAO;gBAC5B,OAAO,EAAE,GAAG,CAAC,KAAK,CAAC,OAAO;gBAC1B,UAAU,EAAE,GAAG,CAAC,UAAU;gBAC1B,SAAS,EAAE,KAAK;aACjB,CAAC;YACF,aAAa,GAAG,CAAC,CAAC;YAClB,SAAS;QACX,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,CAA
C;QAC7B,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC;QACpC,IAAI,OAAO,CAAC,SAAS,IAAI,IAAI,IAAI,GAAG,CAAC,KAAK,CAAC,SAAS,IAAI,IAAI;YAAE,OAAO,CAAC,SAAS,GAAG,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC;QACtG,IAAI,GAAG,CAAC,KAAK,CAAC,OAAO,IAAI,IAAI;YAAE,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC;QACnE,IAAI,OAAO,CAAC,SAAS,IAAI,IAAI,IAAI,GAAG,CAAC,KAAK,CAAC,UAAU,IAAI,IAAI;YAAE,OAAO,CAAC,SAAS,GAAG,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC;QACxG,IAAI,GAAG,CAAC,KAAK,CAAC,UAAU,IAAI,IAAI;YAAE,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC;QACzE,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,GAAG,CAAC,KAAK,CAAC,IAAI;YAAE,OAAO,CAAC,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC;QACnE,IAAI,CAAC,OAAO,CAAC,YAAY,IAAI,GAAG,CAAC,KAAK,CAAC,YAAY;YAAE,OAAO,CAAC,YAAY,GAAG,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC;QACnG,aAAa,IAAI,SAAS,CAAC;IAC7B,CAAC;IAED,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACxB,CAAC;IAED,mEAAmE;IACnE,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;QAC3D,IAAI,UAAU,GAAG,SAAS,EAAE,CAAC;YAC3B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;YAC1C,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;YAC/B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;YAC5B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;YAC5B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;YAC5B,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAED,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC;IAEnC,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;QAC5B,MAAM,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,MAAM,IAAI,GAAc;YACtB,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,YAAY,EAAE,IAAI,CAAC,YAAY;YAC/B,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,UAAU,EAAE,GAAG,CAAC,UAAU;YAC1B,YAAY,EAAE,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YACnC,YAAY,
EAAE,GAAG,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS;YAC3E,eAAe;YACf,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,YAAY,EAAE,GAAG,CAAC,YAAY;YAC9B,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,UAAU,EAAE,CAAC;YACb,WAAW;YACX,eAAe,EAAE,cAAc,CAAC,OAAO,CAAC;YACxC,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACrD,GAAG,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,aAAa,EAAE,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5E,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,cAAc,EAAE,IAAI,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACxE,CAAC;QAEF,OAAO;YACL,OAAO;YACP,QAAQ,EAAE,IAAI;YACd,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
export type { DocNode, DocNodeType, DocNodeAttrs, ChunkMeta } from "./types.js";
|
|
2
|
+
export { walkDocNode } from "./ast-walker.js";
|
|
3
|
+
export type { TextSegment } from "./ast-walker.js";
|
|
4
|
+
export { extractOutline } from "./outline.js";
|
|
5
|
+
export { walkToChunks } from "./chunker.js";
|
|
6
|
+
export type { WalkOptions, ChunkResult } from "./chunker.js";
|
|
7
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,YAAY,EAAE,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAChF,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,YAAY,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AACnD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"outline.d.ts","sourceRoot":"","sources":["../src/outline.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAE1C,mFAAmF;AACnF,wBAAgB,cAAc,CAAC,IAAI,EAAE,OAAO,GAAG,MAAM,EAAE,CAatD"}
|
package/dist/outline.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/** Extract the top-level section titles from a DocNode tree (document outline). */
|
|
2
|
+
export function extractOutline(root) {
|
|
3
|
+
const titles = [];
|
|
4
|
+
for (const child of root.children ?? []) {
|
|
5
|
+
if (child.type === "heading" && child.attrs.headingLevel === 1 && child.text) {
|
|
6
|
+
titles.push(child.text);
|
|
7
|
+
}
|
|
8
|
+
else if (child.type === "section") {
|
|
9
|
+
const h = (child.children ?? []).find((n) => n.type === "heading" && n.text);
|
|
10
|
+
if (h?.text)
|
|
11
|
+
titles.push(h.text);
|
|
12
|
+
}
|
|
13
|
+
}
|
|
14
|
+
return titles;
|
|
15
|
+
}
|
|
16
|
+
//# sourceMappingURL=outline.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"outline.js","sourceRoot":"","sources":["../src/outline.ts"],"names":[],"mappings":"AAEA,mFAAmF;AACnF,MAAM,UAAU,cAAc,CAAC,IAAa;IAC1C,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ,IAAI,EAAE,EAAE,CAAC;QACxC,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC7E,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;aAAM,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;YACpC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,IAAI,CACnC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,SAAS,IAAI,CAAC,CAAC,IAAI,CACtC,CAAC;YACF,IAAI,CAAC,EAAE,IAAI;gBAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
 * Kinds of node in a document tree. In this package `walkDocNode` uses "heading"
 * text only for breadcrumbs and never emits "image" or "link" text;
 * `extractOutline` additionally looks inside "section" children of the root.
 */
export type DocNodeType = "document" | "section" | "heading" | "paragraph" | "table" | "table-row" | "table-cell" | "list" | "list-item" | "code" | "formula" | "image" | "link" | "footnote" | "caption" | "abstract" | "metadata";
|
|
2
|
+
/** Attributes attached to every `DocNode`; only the two byte offsets are required. */
export interface DocNodeAttrs {
|
|
3
|
+
/** Heading depth. `walkDocNode` treats a missing value as 1. */
headingLevel?: 1 | 2 | 3 | 4 | 5 | 6;
|
|
4
|
+
role?: "caption" | "footnote" | "abstract" | "toc-entry" | "header" | "footer";
|
|
5
|
+
/* NOTE(review): not read by the walker or chunker in this package, which build their own breadcrumb; presumably set upstream. Confirm. */
breadcrumb?: string[];
|
|
6
|
+
/** Copied into chunk metadata as the start of the chunk's range. Presumably a byte offset into the source file; confirm against the parser. */
byteStart: number;
|
|
7
|
+
/** Copied into chunk metadata as the end of the chunk's range (taken from the chunk's last segment). */
byteEnd: number;
|
|
8
|
+
/** The first non-null value among a chunk's segments becomes the chunk's `lineStart`. */
lineStart?: number;
|
|
9
|
+
/** The last non-null value among a chunk's segments becomes the chunk's `lineEnd`. */
lineEnd?: number;
|
|
10
|
+
/** Feeds the chunk's `pageStart` (first non-null value) and `pageEnd` (last non-null value). */
pageNumber?: number;
|
|
11
|
+
/** The first truthy value among a chunk's segments becomes the chunk's `lang`. */
lang?: string;
|
|
12
|
+
/** The first truthy value among a chunk's segments becomes the chunk's `codeLanguage`. */
codeLanguage?: string;
|
|
13
|
+
tableRow?: number;
|
|
14
|
+
tableCol?: number;
|
|
15
|
+
isHeader?: boolean;
|
|
16
|
+
listDepth?: number;
|
|
17
|
+
ordered?: boolean;
|
|
18
|
+
sourceFormat?: string;
|
|
19
|
+
}
|
|
20
|
+
/** A node of the document tree consumed by `walkDocNode`, `extractOutline` and `walkToChunks`. */
export interface DocNode {
|
|
21
|
+
type: DocNodeType;
|
|
22
|
+
/** Child nodes, visited in array order; a missing array is treated as empty. */
children?: DocNode[];
|
|
23
|
+
/**
 * Text carried by this node. Non-empty text is emitted as a segment by
 * `walkDocNode` unless the node is a heading (text goes to the breadcrumb),
 * an image or a link.
 */
text?: string;
|
|
24
|
+
attrs: DocNodeAttrs;
|
|
25
|
+
}
|
|
26
|
+
/**
 * Metadata attached to each chunk. Extends `Record<string, unknown>`, so
 * additional keys are permitted by the type.
 */
export interface ChunkMeta extends Record<string, unknown> {
|
|
27
|
+
/** Copied from `WalkOptions.sourceFile`. */
sourceFile: string;
|
|
28
|
+
/** Copied from `WalkOptions.sourceFormat`. */
sourceFormat: string;
|
|
29
|
+
/** `byteStart` recorded when the chunk's window was opened. */
byteStart: number;
|
|
30
|
+
/** `byteEnd` of the last segment added to the chunk. */
byteEnd: number;
|
|
31
|
+
/** First non-null `lineStart` among the chunk's segments, if any. */
lineStart?: number;
|
|
32
|
+
/** Last non-null `lineEnd` among the chunk's segments, if any. */
lineEnd?: number;
|
|
33
|
+
/** First non-null `pageNumber` among the chunk's segments, if any. */
pageStart?: number;
|
|
34
|
+
/** Last non-null `pageNumber` among the chunk's segments, if any. */
pageEnd?: number;
|
|
35
|
+
/** Present only when `WalkOptions.fileSizeBytes` was supplied. */
fileSizeBytes?: number;
|
|
36
|
+
/** Present only when `WalkOptions.fileModifiedAt` was truthy. */
fileModifiedAt?: string;
|
|
37
|
+
/** Present only when `WalkOptions.fileHash` was truthy. */
fileHash?: string;
|
|
38
|
+
/** Heading path recorded when the chunk's window was opened. */
breadcrumb: string[];
|
|
39
|
+
/** Last entry of `breadcrumb`; undefined when the breadcrumb is empty. */
sectionTitle?: string;
|
|
40
|
+
/** Number of entries in `breadcrumb` (path depth, not a heading's declared level); undefined when empty. */
headingLevel?: number;
|
|
41
|
+
/** Result of `extractOutline(root)`; the same array is attached to every chunk of a document. */
documentOutline?: string[];
|
|
42
|
+
/** First truthy `lang` among the chunk's segments, if any. */
lang?: string;
|
|
43
|
+
/** First truthy `codeLanguage` among the chunk's segments, if any. */
codeLanguage?: string;
|
|
44
|
+
/** Copied from `WalkOptions.strategy`. */
strategy: string;
|
|
45
|
+
/** Zero-based position of the chunk within the returned array. */
chunkIndex: number;
|
|
46
|
+
/** Number of chunks returned for the document, counted after the trailing merge. */
totalChunks: number;
|
|
47
|
+
/** ceil(content.length / 4), using the same heuristic that drives splitting. */
estimatedTokens: number;
|
|
48
|
+
/** Not set by `walkToChunks`. NOTE(review): presumably assigned by a later stage; confirm. */
qualityScore?: number;
|
|
49
|
+
/** True when the chunk holds a segment that was hard-cut to fit `maxTokens`. */
truncated?: boolean;
|
|
50
|
+
}
|
|
51
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAIA,MAAM,MAAM,WAAW,GACnB,UAAU,GACV,SAAS,GACT,SAAS,GACT,WAAW,GACX,OAAO,GACP,WAAW,GACX,YAAY,GACZ,MAAM,GACN,WAAW,GACX,MAAM,GACN,SAAS,GACT,OAAO,GACP,MAAM,GACN,UAAU,GACV,SAAS,GACT,UAAU,GACV,UAAU,CAAC;AAEf,MAAM,WAAW,YAAY;IAC3B,YAAY,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IACrC,IAAI,CAAC,EAAE,SAAS,GAAG,UAAU,GAAG,UAAU,GAAG,WAAW,GAAG,QAAQ,GAAG,QAAQ,CAAC;IAC/E,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,OAAO;IACtB,IAAI,EAAE,WAAW,CAAC;IAClB,QAAQ,CAAC,EAAE,OAAO,EAAE,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,YAAY,CAAC;CACrB;AAID,MAAM,WAAW,SAAU,SAAQ,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC;IAExD,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IAGjB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAGlB,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAG3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IAGpB,eAAe,EAAE,MAAM,CAAC;IACxB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,mBAAmB;AACnB,mEAAmE;AACnE,uEAAuE"}
|
package/package.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@vivantel/virage-chunker-ce-ast",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Generalized ViDoc AST walker — shared chunking strategy for all structured document formats",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": {
|
|
9
|
+
"import": "./dist/index.js",
|
|
10
|
+
"types": "./dist/index.d.ts"
|
|
11
|
+
}
|
|
12
|
+
},
|
|
13
|
+
"files": ["dist"],
|
|
14
|
+
"scripts": {
|
|
15
|
+
"build": "tsc --build",
|
|
16
|
+
"type-check": "tsc --build --noEmit",
|
|
17
|
+
"test": "vitest run"
|
|
18
|
+
},
|
|
19
|
+
"peerDependencies": {
|
|
20
|
+
"@vivantel/virage-core": ">=0.2"
|
|
21
|
+
},
|
|
22
|
+
"devDependencies": {
|
|
23
|
+
"@vivantel/virage-core": ">=0.2"
|
|
24
|
+
},
|
|
25
|
+
"rag-plugin": {
|
|
26
|
+
"type": "chunker",
|
|
27
|
+
"label": "ViDoc AST walker (generic)",
|
|
28
|
+
"key": "ast",
|
|
29
|
+
"defaultConfig": {
|
|
30
|
+
"maxTokens": 512,
|
|
31
|
+
"minTokens": 128
|
|
32
|
+
}
|
|
33
|
+
},
|
|
34
|
+
"publishConfig": {
|
|
35
|
+
"access": "public"
|
|
36
|
+
},
|
|
37
|
+
"license": "MIT OR Apache-2.0",
|
|
38
|
+
"repository": {
|
|
39
|
+
"type": "git",
|
|
40
|
+
"url": "https://github.com/vivantel/virage-chunkers-ce.git",
|
|
41
|
+
"directory": "packages/virage-chunker-ce-ast"
|
|
42
|
+
}
|
|
43
|
+
}
|