@vivantel/virage-chunker-ce-ast 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,13 @@
1
+ import type { DocNode, DocNodeAttrs } from "./types.js";
2
+ export interface TextSegment { // One text fragment produced by walkDocNode, plus the heading context it was found under.
3
+ text: string; // Text of the emitting node; walkDocNode never emits an empty string.
4
+ attrs: DocNodeAttrs; // The emitting node's attrs object, passed through by reference (not copied).
5
+ /** Copy of the heading breadcrumb, outermost heading first, taken at the moment this segment was emitted. */
6
+ breadcrumb: string[];
7
+ }
8
+ /**
9
+ * Depth-first (pre-order) walk of a DocNode tree that collects one TextSegment per node carrying non-empty text; heading, image and link nodes emit nothing, but their children are still visited.
10
+ * Heading text updates a breadcrumb that is copied into every segment emitted afterwards; nothing is undone when a heading's subtree ends, so a heading stays in effect until a later heading replaces it.
11
+ */
12
+ export declare function walkDocNode(root: DocNode): TextSegment[];
13
+ //# sourceMappingURL=ast-walker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ast-walker.d.ts","sourceRoot":"","sources":["../src/ast-walker.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,YAAY,EAAE,MAAM,YAAY,CAAC;AAExD,MAAM,WAAW,WAAW;IAC1B,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,YAAY,CAAC;IACpB,mEAAmE;IACnE,UAAU,EAAE,MAAM,EAAE,CAAC;CACtB;AAED;;;GAGG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,OAAO,GAAG,WAAW,EAAE,CAiCxD"}
@@ -0,0 +1,33 @@
1
/**
 * Depth-first (pre-order) walk of a DocNode tree collecting text segments.
 *
 * Every node that carries non-empty text is emitted as a segment, except
 * heading, image and link nodes. Heading text is not emitted; instead it
 * updates a breadcrumb of open headings, and a copy of that breadcrumb
 * (outermost heading first) is attached to each segment emitted afterwards.
 * Children of every node, including headings, images and links, are visited.
 *
 * A heading stays open until a later heading of the same or a shallower
 * level is met; there is no exit handling when a subtree ends.
 *
 * @param root Root node of the tree to walk.
 * @returns Segments in visit order. Each segment's `attrs` is the node's own
 *          attrs object (same reference); `breadcrumb` is a fresh array.
 */
export function walkDocNode(root) {
    const segments = [];
    // Open headings, outermost first. Each entry remembers its level so that
    // documents which skip levels (h1 -> h3) or start below h1 still nest
    // correctly; indexing the breadcrumb by `level - 1` cannot express that.
    const openHeadings = [];
    function visit(node) {
        if (node.type === "heading" && node.text) {
            // A heading without an explicit level is treated as level 1.
            const level = node.attrs.headingLevel ?? 1;
            // Close every open heading at the same or a deeper level, then
            // open this one beneath whatever shallower headings remain.
            while (openHeadings.length > 0 &&
                openHeadings[openHeadings.length - 1].level >= level) {
                openHeadings.pop();
            }
            openHeadings.push({ level, text: node.text });
        }
        // Headings feed the breadcrumb only; image and link text is not
        // treated as body text. Empty strings are never emitted.
        const emitsText = Boolean(node.text) &&
            node.type !== "heading" &&
            node.type !== "image" &&
            node.type !== "link";
        if (emitsText) {
            segments.push({
                text: node.text,
                attrs: node.attrs,
                // Snapshot, so later heading changes do not alter this segment.
                breadcrumb: openHeadings.map((heading) => heading.text),
            });
        }
        for (const child of node.children ?? []) {
            visit(child);
        }
    }
    visit(root);
    return segments;
}
33
+ //# sourceMappingURL=ast-walker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ast-walker.js","sourceRoot":"","sources":["../src/ast-walker.ts"],"names":[],"mappings":"AASA;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,IAAa;IACvC,MAAM,QAAQ,GAAkB,EAAE,CAAC;IACnC,MAAM,UAAU,GAAa,EAAE,CAAC;IAEhC,SAAS,KAAK,CAAC,IAAa;QAC1B,IAAI,IAAI,CAAC,IAAI,KAAK,SAAS,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YACzC,MAAM,KAAK,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,YAAY,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC;YACjD,sEAAsE;YACtE,UAAU,CAAC,MAAM,CAAC,KAAK,EAAE,UAAU,CAAC,MAAM,GAAG,KAAK,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC;QACjE,CAAC;QAED,iEAAiE;QACjE,MAAM,UAAU,GACd,IAAI,CAAC,IAAI,IAAI,IAAI;YACjB,IAAI,CAAC,IAAI,KAAK,SAAS;YACvB,IAAI,CAAC,IAAI,KAAK,OAAO;YACrB,IAAI,CAAC,IAAI,KAAK,MAAM,CAAC;QAEvB,IAAI,UAAU,IAAI,IAAI,CAAC,IAAI,EAAE,CAAC;YAC5B,QAAQ,CAAC,IAAI,CAAC;gBACZ,IAAI,EAAE,IAAI,CAAC,IAAI;gBACf,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,UAAU,EAAE,CAAC,GAAG,UAAU,CAAC;aAC5B,CAAC,CAAC;QACL,CAAC;QAED,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ,IAAI,EAAE,EAAE,CAAC;YACxC,KAAK,CAAC,KAAK,CAAC,CAAC;QACf,CAAC;IACH,CAAC;IAED,KAAK,CAAC,IAAI,CAAC,CAAC;IACZ,OAAO,QAAQ,CAAC;AAClB,CAAC"}
@@ -0,0 +1,27 @@
1
+ import type { DocNode, ChunkMeta } from "./types.js";
2
+ export interface WalkOptions { // Per-file inputs and size limits for walkToChunks.
3
+ sourceFile: string; // Copied to every chunk's sourceFile and metadata.sourceFile.
4
+ sourceFormat: string; // Copied to metadata.sourceFormat.
5
+ commitHash: string; // Copied to every ChunkResult.commitHash.
6
+ strategy: string; // Copied to metadata.strategy.
7
+ maxTokens?: number; // Window budget in estimated tokens (4 characters per token); defaults to 512.
8
+ minTokens?: number; // A final window below this size is merged into its predecessor; defaults to floor(maxTokens / 4).
9
+ fileHash?: string; // Added to metadata only when non-empty.
10
+ fileSizeBytes?: number; // Added to metadata when not null/undefined (0 is kept).
11
+ fileModifiedAt?: string; // Added to metadata only when non-empty.
12
+ }
13
+ export interface ChunkResult { // One chunk returned by walkToChunks.
14
+ content: string; // The chunk's segment texts joined with a blank line ("\n\n").
15
+ metadata: ChunkMeta;
16
+ sourceFile: string; // Same value as WalkOptions.sourceFile.
17
+ commitHash: string; // Same value as WalkOptions.commitHash.
18
+ contentHash?: string; // Not set by walkToChunks in this version.
19
+ }
20
+ /**
21
+ * Walk a ViDoc AST and produce one ChunkResult per window of text segments, each with full ChunkMeta.
22
+ *
23
+ * Starts a new window at a segment boundary when adding the next segment would exceed maxTokens; a single segment larger than maxTokens is cut to fit and its chunk is marked truncated.
24
+ * Merges only the final window into its predecessor when it is shorter than minTokens. Returns [] when the tree yields no text segments.
25
+ */
26
+ export declare function walkToChunks(root: DocNode, opts: WalkOptions): ChunkResult[];
27
+ //# sourceMappingURL=chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAIrD,MAAM,WAAW,WAAW;IAC1B,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,WAAW;IAC1B,OAAO,EAAE,MAAM,CAAC;IAChB,QAAQ,EAAE,SAAS,CAAC;IACpB,UAAU,EAAE,MAAM,CAAC;IACnB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAQD;;;;;GAKG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,WAAW,GAAG,WAAW,EAAE,CAoI5E"}
@@ -0,0 +1,127 @@
1
+ import { walkDocNode } from "./ast-walker.js";
2
+ import { extractOutline } from "./outline.js";
3
/** Rough characters-per-token ratio used for every size estimate in this module. */
const CHARS_PER_TOKEN = 4;

/** Estimate the token count of `text` as ceil(length / CHARS_PER_TOKEN). */
function estimateTokens(text) {
    return Math.ceil(text.length / CHARS_PER_TOKEN);
}

/**
 * Cut `text` to at most `maxChars` UTF-16 code units without splitting a
 * surrogate pair (which would leave a lone, invalid half character).
 */
function hardCut(text, maxChars) {
    let end = Math.max(0, Math.floor(maxChars));
    if (end >= text.length)
        return text;
    if (end > 0) {
        const lastKept = text.charCodeAt(end - 1);
        const firstDropped = text.charCodeAt(end);
        const splitsPair = lastKept >= 0xd800 && lastKept <= 0xdbff &&
            firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
        if (splitsPair)
            end -= 1;
    }
    return text.slice(0, end);
}

/** Create an empty window whose span and breadcrumb come from its first segment. */
function openWindow(seg) {
    return {
        texts: [],
        byteStart: seg.attrs.byteStart,
        byteEnd: seg.attrs.byteEnd,
        breadcrumb: seg.breadcrumb,
        truncated: false,
        lineStart: undefined,
        lineEnd: undefined,
        pageStart: undefined,
        pageEnd: undefined,
        lang: undefined,
        codeLanguage: undefined,
    };
}

/** Add `text` (the segment's text, possibly cut) to `win` and fold in the segment's positional metadata. */
function appendToWindow(win, seg, text) {
    win.texts.push(text);
    win.byteEnd = seg.attrs.byteEnd;
    // Start markers keep the first value seen; end markers follow the latest.
    if (win.lineStart == null && seg.attrs.lineStart != null)
        win.lineStart = seg.attrs.lineStart;
    if (seg.attrs.lineEnd != null)
        win.lineEnd = seg.attrs.lineEnd;
    if (win.pageStart == null && seg.attrs.pageNumber != null)
        win.pageStart = seg.attrs.pageNumber;
    if (seg.attrs.pageNumber != null)
        win.pageEnd = seg.attrs.pageNumber;
    if (!win.lang && seg.attrs.lang)
        win.lang = seg.attrs.lang;
    if (!win.codeLanguage && seg.attrs.codeLanguage)
        win.codeLanguage = seg.attrs.codeLanguage;
}

/**
 * Walk a ViDoc AST and produce one chunk per window of text segments, each
 * with full ChunkMeta.
 *
 * Segments are accumulated into a window until adding the next one would
 * exceed `maxTokens`; the window is then closed and a new one starts at that
 * segment. A single segment larger than `maxTokens` becomes its own chunk,
 * cut to `maxTokens * CHARS_PER_TOKEN` characters and flagged `truncated`
 * (the remainder of its text is dropped, while `byteEnd` still covers the
 * whole segment). Finally, if the last window is shorter than `minTokens`
 * it is merged into its predecessor, which may then exceed `maxTokens`.
 *
 * @param root Root DocNode of the document.
 * @param opts Source identification plus optional limits: `maxTokens`
 *             defaults to 512 and `minTokens` to floor(maxTokens / 4).
 * @returns Chunks in document order; an empty array when the tree yields no
 *          text segments.
 */
export function walkToChunks(root, opts) {
    const maxTokens = opts.maxTokens ?? 512;
    const minTokens = opts.minTokens ?? Math.floor(maxTokens / 4);
    const documentOutline = extractOutline(root);
    const segments = walkDocNode(root);
    if (segments.length === 0)
        return [];
    const windows = [];
    // The window being filled. It is opened lazily by the first segment that
    // goes into it, so byteStart and breadcrumb always describe the window's
    // own content rather than whatever segment closed the previous window.
    let current = null;
    let currentTokens = 0;
    const flush = () => {
        if (current !== null)
            windows.push(current);
        current = null;
        currentTokens = 0;
    };
    for (const seg of segments) {
        const segTokens = estimateTokens(seg.text);
        // If adding this segment would overflow and we already have content, flush.
        if (currentTokens > 0 && currentTokens + segTokens > maxTokens)
            flush();
        if (current === null)
            current = openWindow(seg);
        // A single segment larger than the budget is hard-cut into its own chunk.
        if (segTokens > maxTokens) {
            appendToWindow(current, seg, hardCut(seg.text, maxTokens * CHARS_PER_TOKEN));
            current.truncated = true;
            flush();
            continue;
        }
        appendToWindow(current, seg, seg.text);
        currentTokens += segTokens;
    }
    flush();
    // Merge the trailing window into its predecessor if it is below minTokens.
    if (windows.length > 1) {
        const last = windows[windows.length - 1];
        if (estimateTokens(last.texts.join("\n\n")) < minTokens) {
            const prev = windows[windows.length - 2];
            prev.texts.push(...last.texts);
            prev.byteEnd = last.byteEnd;
            // Only take values the tail actually has; never erase known ones.
            prev.lineStart = prev.lineStart ?? last.lineStart;
            prev.lineEnd = last.lineEnd ?? prev.lineEnd;
            prev.pageStart = prev.pageStart ?? last.pageStart;
            prev.pageEnd = last.pageEnd ?? prev.pageEnd;
            prev.lang = prev.lang || last.lang;
            prev.codeLanguage = prev.codeLanguage || last.codeLanguage;
            prev.truncated = prev.truncated || last.truncated;
            windows.pop();
        }
    }
    const totalChunks = windows.length;
    return windows.map((win, i) => {
        const content = win.texts.join("\n\n");
        const meta = {
            sourceFile: opts.sourceFile,
            sourceFormat: opts.sourceFormat,
            byteStart: win.byteStart,
            byteEnd: win.byteEnd,
            lineStart: win.lineStart,
            lineEnd: win.lineEnd,
            pageStart: win.pageStart,
            pageEnd: win.pageEnd,
            breadcrumb: win.breadcrumb,
            sectionTitle: win.breadcrumb.at(-1),
            // Nesting depth of the breadcrumb, not the heading's declared level.
            headingLevel: win.breadcrumb.length > 0 ? win.breadcrumb.length : undefined,
            documentOutline,
            lang: win.lang,
            codeLanguage: win.codeLanguage,
            strategy: opts.strategy,
            chunkIndex: i,
            totalChunks,
            estimatedTokens: estimateTokens(content),
            truncated: win.truncated,
            // Optional file facts are only present when the caller supplied them.
            ...(opts.fileHash ? { fileHash: opts.fileHash } : {}),
            ...(opts.fileSizeBytes != null ? { fileSizeBytes: opts.fileSizeBytes } : {}),
            ...(opts.fileModifiedAt ? { fileModifiedAt: opts.fileModifiedAt } : {}),
        };
        return {
            content,
            metadata: meta,
            sourceFile: opts.sourceFile,
            commitHash: opts.commitHash,
        };
    });
}
127
+ //# sourceMappingURL=chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAsB9C,MAAM,eAAe,GAAG,CAAC,CAAC;AAE1B,SAAS,cAAc,CAAC,IAAY;IAClC,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,eAAe,CAAC,CAAC;AAClD,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,YAAY,CAAC,IAAa,EAAE,IAAiB;IAC3D,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,GAAG,CAAC;IACxC,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,CAAC,CAAC,CAAC;IAC9D,MAAM,eAAe,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;IAC7C,MAAM,QAAQ,GAAG,WAAW,CAAC,IAAI,CAAC,CAAC;IAEnC,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAiBrC,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,OAAO,GAAW;QACpB,KAAK,EAAE,EAAE;QACT,SAAS,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,SAAS;QACvC,OAAO,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,KAAK,CAAC,OAAO;QACnC,UAAU,EAAE,QAAQ,CAAC,CAAC,CAAE,CAAC,UAAU;QACnC,SAAS,EAAE,KAAK;KACjB,CAAC;IACF,IAAI,aAAa,GAAG,CAAC,CAAC;IAEtB,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC3B,MAAM,SAAS,GAAG,cAAc,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC;QAE3C,4EAA4E;QAC5E,IAAI,aAAa,GAAG,CAAC,IAAI,aAAa,GAAG,SAAS,GAAG,SAAS,EAAE,CAAC;YAC/D,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,OAAO,GAAG;gBACR,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,GAAG,CAAC,KAAK,CAAC,SAAS;gBAC9B,OAAO,EAAE,GAAG,CAAC,KAAK,CAAC,OAAO;gBAC1B,UAAU,EAAE,GAAG,CAAC,UAAU;gBAC1B,SAAS,EAAE,KAAK;aACjB,CAAC;YACF,aAAa,GAAG,CAAC,CAAC;QACpB,CAAC;QAED,sDAAsD;QACtD,IAAI,SAAS,GAAG,SAAS,EAAE,CAAC;YAC1B,MAAM,QAAQ,GAAG,SAAS,GAAG,eAAe,CAAC;YAC7C,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,QAAQ,CAAC,CAAC,CAAC;YAChD,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC;YACpC,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC;YACzB,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;YACtB,OAAO,GAAG;gBACR,KAAK,EAAE,EAAE;gBACT,SAAS,EAAE,GAAG,CAAC,KAAK,CAAC,OAAO;gBAC5B,OAAO,EAAE,GAAG,CAAC,KAAK,CAAC,OAAO;gBAC1B,UAAU,EAAE,GAAG,CAAC,UAAU;gBAC1B,SAAS,EAAE,KAAK;aACjB,CAAC;YACF,aAAa,GAAG,CAAC,CAAC;YAClB,SAAS;QACX,CAAC;QAED,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,C
AAC;QAC7B,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC;QACpC,IAAI,OAAO,CAAC,SAAS,IAAI,IAAI,IAAI,GAAG,CAAC,KAAK,CAAC,SAAS,IAAI,IAAI;YAAE,OAAO,CAAC,SAAS,GAAG,GAAG,CAAC,KAAK,CAAC,SAAS,CAAC;QACtG,IAAI,GAAG,CAAC,KAAK,CAAC,OAAO,IAAI,IAAI;YAAE,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,OAAO,CAAC;QACnE,IAAI,OAAO,CAAC,SAAS,IAAI,IAAI,IAAI,GAAG,CAAC,KAAK,CAAC,UAAU,IAAI,IAAI;YAAE,OAAO,CAAC,SAAS,GAAG,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC;QACxG,IAAI,GAAG,CAAC,KAAK,CAAC,UAAU,IAAI,IAAI;YAAE,OAAO,CAAC,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,UAAU,CAAC;QACzE,IAAI,CAAC,OAAO,CAAC,IAAI,IAAI,GAAG,CAAC,KAAK,CAAC,IAAI;YAAE,OAAO,CAAC,IAAI,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC;QACnE,IAAI,CAAC,OAAO,CAAC,YAAY,IAAI,GAAG,CAAC,KAAK,CAAC,YAAY;YAAE,OAAO,CAAC,YAAY,GAAG,GAAG,CAAC,KAAK,CAAC,YAAY,CAAC;QACnG,aAAa,IAAI,SAAS,CAAC;IAC7B,CAAC;IAED,IAAI,OAAO,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC7B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IACxB,CAAC;IAED,mEAAmE;IACnE,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;QAC1C,MAAM,UAAU,GAAG,cAAc,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC;QAC3D,IAAI,UAAU,GAAG,SAAS,EAAE,CAAC;YAC3B,MAAM,IAAI,GAAG,OAAO,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAE,CAAC;YAC1C,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;YAC/B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;YAC5B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;YAC5B,IAAI,CAAC,OAAO,GAAG,IAAI,CAAC,OAAO,CAAC;YAC5B,OAAO,CAAC,GAAG,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAED,MAAM,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC;IAEnC,OAAO,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;QAC5B,MAAM,OAAO,GAAG,GAAG,CAAC,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,MAAM,IAAI,GAAc;YACtB,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,YAAY,EAAE,IAAI,CAAC,YAAY;YAC/B,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,OAAO,EAAE,GAAG,CAAC,OAAO;YACpB,UAAU,EAAE,GAAG,CAAC,UAAU;YAC1B,YAAY,EAAE,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC;YACnC,YAA
Y,EAAE,GAAG,CAAC,UAAU,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS;YAC3E,eAAe;YACf,IAAI,EAAE,GAAG,CAAC,IAAI;YACd,YAAY,EAAE,GAAG,CAAC,YAAY;YAC9B,QAAQ,EAAE,IAAI,CAAC,QAAQ;YACvB,UAAU,EAAE,CAAC;YACb,WAAW;YACX,eAAe,EAAE,cAAc,CAAC,OAAO,CAAC;YACxC,SAAS,EAAE,GAAG,CAAC,SAAS;YACxB,GAAG,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YACrD,GAAG,CAAC,IAAI,CAAC,aAAa,IAAI,IAAI,CAAC,CAAC,CAAC,EAAE,aAAa,EAAE,IAAI,CAAC,aAAa,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC5E,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,CAAC,CAAC,EAAE,cAAc,EAAE,IAAI,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;SACxE,CAAC;QAEF,OAAO;YACL,OAAO;YACP,QAAQ,EAAE,IAAI;YACd,UAAU,EAAE,IAAI,CAAC,UAAU;YAC3B,UAAU,EAAE,IAAI,CAAC,UAAU;SAC5B,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC"}
@@ -0,0 +1,7 @@
1
+ export type { DocNode, DocNodeType, DocNodeAttrs, ChunkMeta } from "./types.js";
2
+ export { walkDocNode } from "./ast-walker.js";
3
+ export type { TextSegment } from "./ast-walker.js";
4
+ export { extractOutline } from "./outline.js";
5
+ export { walkToChunks } from "./chunker.js";
6
+ export type { WalkOptions, ChunkResult } from "./chunker.js";
7
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,YAAY,EAAE,OAAO,EAAE,WAAW,EAAE,YAAY,EAAE,SAAS,EAAE,MAAM,YAAY,CAAC;AAChF,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAC9C,YAAY,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AACnD,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5C,YAAY,EAAE,WAAW,EAAE,WAAW,EAAE,MAAM,cAAc,CAAC"}
package/dist/index.js ADDED
@@ -0,0 +1,4 @@
1
+ export { walkDocNode } from "./ast-walker.js";
2
+ export { extractOutline } from "./outline.js";
3
+ export { walkToChunks } from "./chunker.js";
4
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAE,MAAM,iBAAiB,CAAC;AAE9C,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC"}
@@ -0,0 +1,4 @@
1
+ import type { DocNode } from "./types.js";
2
+ /** Extract the document outline from the root's direct children: the text of each level-1 heading, and for each section child the text of its first heading that has text. Deeper nesting is not searched. */
3
+ export declare function extractOutline(root: DocNode): string[];
4
+ //# sourceMappingURL=outline.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"outline.d.ts","sourceRoot":"","sources":["../src/outline.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,YAAY,CAAC;AAE1C,mFAAmF;AACnF,wBAAgB,cAAc,CAAC,IAAI,EAAE,OAAO,GAAG,MAAM,EAAE,CAatD"}
@@ -0,0 +1,16 @@
1
/**
 * Extract the document outline (top-level section titles) from a DocNode tree.
 *
 * Only the root's direct children are inspected:
 *  - a heading child contributes its text when it is level 1; a heading with
 *    no `headingLevel` counts as level 1, the same default the AST walker
 *    applies when it builds breadcrumbs;
 *  - a section child contributes the text of its first direct child heading
 *    that has text, whatever that heading's level.
 *
 * @param root Root node of the document.
 * @returns Titles in child order; empty when the root has no children.
 */
export function extractOutline(root) {
    const titles = [];
    for (const child of root.children ?? []) {
        if (child.type === "heading") {
            const level = child.attrs.headingLevel ?? 1;
            if (level === 1 && child.text) {
                titles.push(child.text);
            }
        }
        else if (child.type === "section") {
            const heading = (child.children ?? []).find((n) => n.type === "heading" && n.text);
            if (heading?.text)
                titles.push(heading.text);
        }
    }
    return titles;
}
16
+ //# sourceMappingURL=outline.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"outline.js","sourceRoot":"","sources":["../src/outline.ts"],"names":[],"mappings":"AAEA,mFAAmF;AACnF,MAAM,UAAU,cAAc,CAAC,IAAa;IAC1C,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,KAAK,MAAM,KAAK,IAAI,IAAI,CAAC,QAAQ,IAAI,EAAE,EAAE,CAAC;QACxC,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS,IAAI,KAAK,CAAC,KAAK,CAAC,YAAY,KAAK,CAAC,IAAI,KAAK,CAAC,IAAI,EAAE,CAAC;YAC7E,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;QAC1B,CAAC;aAAM,IAAI,KAAK,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;YACpC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,IAAI,CACnC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,KAAK,SAAS,IAAI,CAAC,CAAC,IAAI,CACtC,CAAC;YACF,IAAI,CAAC,EAAE,IAAI;gBAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACnC,CAAC;IACH,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,51 @@
1
+ export type DocNodeType = "document" | "section" | "heading" | "paragraph" | "table" | "table-row" | "table-cell" | "list" | "list-item" | "code" | "formula" | "image" | "link" | "footnote" | "caption" | "abstract" | "metadata"; // Every node kind a DocNode's `type` field can hold.
2
+ export interface DocNodeAttrs { // Per-node attributes; only byteStart and byteEnd are required.
3
+ headingLevel?: 1 | 2 | 3 | 4 | 5 | 6; // Heading depth; walkDocNode treats a missing value as 1.
4
+ role?: "caption" | "footnote" | "abstract" | "toc-entry" | "header" | "footer";
5
+ breadcrumb?: string[]; // Not read by walkDocNode, which builds its own breadcrumb from heading nodes.
6
+ byteStart: number; // Copied into chunk metadata as the chunk span; presumably an offset into the source file — TODO confirm units and whether byteEnd is exclusive.
7
+ byteEnd: number;
8
+ lineStart?: number; // When present, walkToChunks folds lineStart/lineEnd into the chunk's line range.
9
+ lineEnd?: number;
10
+ pageNumber?: number; // When present, walkToChunks folds it into the chunk's pageStart/pageEnd.
11
+ lang?: string; // walkToChunks uses the first non-empty value seen in a chunk as the chunk's lang.
12
+ codeLanguage?: string; // walkToChunks uses the first non-empty value seen in a chunk as the chunk's codeLanguage.
13
+ tableRow?: number;
14
+ tableCol?: number;
15
+ isHeader?: boolean;
16
+ listDepth?: number;
17
+ ordered?: boolean;
18
+ sourceFormat?: string;
19
+ }
20
+ export interface DocNode { // One node of the ViDoc AST.
21
+ type: DocNodeType;
22
+ children?: DocNode[]; // A missing array is treated as empty by walkDocNode and extractOutline.
23
+ text?: string; // Text carried by the node itself; a node with missing or empty text emits no segment.
24
+ attrs: DocNodeAttrs;
25
+ }
26
+ export interface ChunkMeta extends Record<string, unknown> { // Metadata attached to each chunk; the Record base permits additional keys.
27
+ sourceFile: string;
28
+ sourceFormat: string;
29
+ byteStart: number; // Start of the chunk's span, taken from segment attrs.
30
+ byteEnd: number; // byteEnd of the last segment placed in the chunk.
31
+ lineStart?: number;
32
+ lineEnd?: number;
33
+ pageStart?: number; // Derived from segment attrs.pageNumber.
34
+ pageEnd?: number; // Derived from segment attrs.pageNumber.
35
+ fileSizeBytes?: number;
36
+ fileModifiedAt?: string;
37
+ fileHash?: string;
38
+ breadcrumb: string[]; // Heading path for the chunk, outermost heading first.
39
+ sectionTitle?: string; // Last breadcrumb entry; undefined when the breadcrumb is empty.
40
+ headingLevel?: number; // Set by walkToChunks to the breadcrumb length (nesting depth), not a heading's declared level; undefined when the breadcrumb is empty.
41
+ documentOutline?: string[]; // extractOutline result for the whole document; the same array on every chunk.
42
+ lang?: string;
43
+ codeLanguage?: string;
44
+ strategy: string;
45
+ chunkIndex: number; // Zero-based position of the chunk within the document.
46
+ totalChunks: number; // Number of chunks produced for the document.
47
+ estimatedTokens: number; // walkToChunks sets this to ceil(content.length / 4).
48
+ qualityScore?: number; // Not set by walkToChunks.
49
+ truncated?: boolean; // True when an oversized segment was cut to fit maxTokens.
50
+ }
51
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAIA,MAAM,MAAM,WAAW,GACnB,UAAU,GACV,SAAS,GACT,SAAS,GACT,WAAW,GACX,OAAO,GACP,WAAW,GACX,YAAY,GACZ,MAAM,GACN,WAAW,GACX,MAAM,GACN,SAAS,GACT,OAAO,GACP,MAAM,GACN,UAAU,GACV,SAAS,GACT,UAAU,GACV,UAAU,CAAC;AAEf,MAAM,WAAW,YAAY;IAC3B,YAAY,CAAC,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IACrC,IAAI,CAAC,EAAE,SAAS,GAAG,UAAU,GAAG,UAAU,GAAG,WAAW,GAAG,QAAQ,GAAG,QAAQ,CAAC;IAC/E,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,QAAQ,CAAC,EAAE,OAAO,CAAC;IACnB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,OAAO,CAAC;IAClB,YAAY,CAAC,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,OAAO;IACtB,IAAI,EAAE,WAAW,CAAC;IAClB,QAAQ,CAAC,EAAE,OAAO,EAAE,CAAC;IACrB,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,KAAK,EAAE,YAAY,CAAC;CACrB;AAID,MAAM,WAAW,SAAU,SAAQ,MAAM,CAAC,MAAM,EAAE,OAAO,CAAC;IAExD,UAAU,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,OAAO,CAAC,EAAE,MAAM,CAAC;IAGjB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAGlB,UAAU,EAAE,MAAM,EAAE,CAAC;IACrB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,eAAe,CAAC,EAAE,MAAM,EAAE,CAAC;IAG3B,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,YAAY,CAAC,EAAE,MAAM,CAAC;IAGtB,QAAQ,EAAE,MAAM,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IAGpB,eAAe,EAAE,MAAM,CAAC;IACxB,YAAY,CAAC,EAAE,MAAM,CAAC;IACtB,SAAS,CAAC,EAAE,OAAO,CAAC;CACrB"}
package/dist/types.js ADDED
@@ -0,0 +1,5 @@
1
+ // ViDoc AST types.
2
+ // These mirror the Rust structs in crates/virage-vidoc/src/lib.rs.
3
+ // TODO(phase-1): move to @vivantel/virage-core once the core PR lands.
4
+ export {};
5
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,mBAAmB;AACnB,mEAAmE;AACnE,uEAAuE"}
package/package.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "name": "@vivantel/virage-chunker-ce-ast",
3
+ "version": "0.1.0",
4
+ "description": "Generalized ViDoc AST walker — shared chunking strategy for all structured document formats",
5
+ "type": "module",
6
+ "main": "./dist/index.js",
7
+ "exports": {
8
+ ".": {
9
+ "import": "./dist/index.js",
10
+ "types": "./dist/index.d.ts"
11
+ }
12
+ },
13
+ "files": ["dist"],
14
+ "scripts": {
15
+ "build": "tsc --build",
16
+ "type-check": "tsc --build --noEmit",
17
+ "test": "vitest run"
18
+ },
19
+ "peerDependencies": {
20
+ "@vivantel/virage-core": ">=0.2"
21
+ },
22
+ "devDependencies": {
23
+ "@vivantel/virage-core": ">=0.2"
24
+ },
25
+ "rag-plugin": {
26
+ "type": "chunker",
27
+ "label": "ViDoc AST walker (generic)",
28
+ "key": "ast",
29
+ "defaultConfig": {
30
+ "maxTokens": 512,
31
+ "minTokens": 128
32
+ }
33
+ },
34
+ "publishConfig": {
35
+ "access": "public"
36
+ },
37
+ "license": "MIT OR Apache-2.0",
38
+ "repository": {
39
+ "type": "git",
40
+ "url": "https://github.com/vivantel/virage-chunkers-ce.git",
41
+ "directory": "packages/virage-chunker-ce-ast"
42
+ }
43
+ }