chunk-smart 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,77 @@
1
+ # chunk-smart
2
+
3
+ Structure-aware text chunker for RAG pipelines. Detects content type (markdown, code, JSON, HTML, YAML, plain text) and splits at natural boundaries — headings, functions, top-level keys, paragraphs, sentences — rather than blindly by character count.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ npm install chunk-smart
9
+ ```
10
+
11
+ ## Quick Start
12
+
13
+ ```typescript
14
+ import { chunk } from 'chunk-smart'
15
+
16
+ const chunks = chunk(myText)
17
+ // Auto-detects content type and splits at natural boundaries
18
+
19
+ chunks.forEach(c => {
20
+ console.log(c.metadata.contentType, c.metadata.tokenCount, c.content.slice(0, 80))
21
+ })
22
+ ```
23
+
24
+ ## createChunker
25
+
26
+ ```typescript
27
+ import { createChunker } from 'chunk-smart'
28
+
29
+ const chunker = createChunker({ maxTokens: 256, overlap: 20 })
30
+
31
+ const mdChunks = chunker.chunkMarkdown(markdownText)
32
+ const codeChunks = chunker.chunkCode(sourceCode)
33
+ const jsonChunks = chunker.chunkJSON(jsonString)
34
+ const detected = chunker.detectContentType(someText)
35
+ ```
36
+
37
+ ## Content Types
38
+
39
+ | Type | Splits at |
40
+ |------------|------------------------------------------------|
41
+ | `markdown` | `#` heading boundaries, then paragraphs |
42
+ | `code` | `function`/`class`/`def` boundaries, then blank lines |
43
+ | `json` | top-level object keys or array element groups |
44
+ | `html` | paragraph/sentence boundaries |
45
+ | `yaml` | paragraph/sentence boundaries |
46
+ | `text` | paragraph boundaries (`\n\n`), then sentences |
47
+
48
+ ## ChunkOptions
49
+
50
+ | Option | Type | Default | Description |
51
+ |--------------------|---------------|---------|--------------------------------------|
52
+ | `maxTokens` | `number` | `512` | Max tokens per chunk (1 token ≈ 4 chars) |
53
+ | `minTokens` | `number` | `50` | Minimum tokens per chunk (informational only) |
54
+ | `overlap` | `number` | `0` | Overlap in tokens between chunks |
55
+ | `contentType` | `ContentType` | `'auto'`| Force a specific content type |
56
+ | `preserveStructure`| `boolean` | `true` | Use boundary-aware splitting |
57
+
58
+ ## ChunkMetadata
59
+
60
+ ```typescript
61
+ interface ChunkMetadata {
62
+ index: number // position in result array
63
+ startOffset: number // char offset in original text where the chunk starts
64
+ endOffset: number // char offset in original text where the chunk ends
65
+ tokenCount: number // Math.ceil(content.length / 4)
66
+ charCount: number // content.length
67
+ contentType: ContentType
68
+ headings: string[] // ancestor headings (markdown only)
69
+ codeLanguage?: string // detected from ``` fence or shebang
70
+ overlapBefore: number // overlap chars with previous chunk
71
+ overlapAfter: number // overlap chars with next chunk
72
+ }
73
+ ```
74
+
75
+ ## License
76
+
77
+ MIT
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=chunk.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunk.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/chunk.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,129 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ const vitest_1 = require("vitest");
4
+ const chunker_1 = require("../chunker");
5
+ (0, vitest_1.describe)('chunk()', () => {
6
+ (0, vitest_1.it)('returns an array of Chunk objects', () => {
7
+ const text = `# Title\n\nSome paragraph text here.\n\n## Section\n\nMore text in section.`;
8
+ const result = (0, chunker_1.chunk)(text);
9
+ (0, vitest_1.expect)(Array.isArray(result)).toBe(true);
10
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(0);
11
+ for (const c of result) {
12
+ (0, vitest_1.expect)(typeof c.content).toBe('string');
13
+ (0, vitest_1.expect)(c.metadata).toBeDefined();
14
+ (0, vitest_1.expect)(typeof c.metadata.index).toBe('number');
15
+ (0, vitest_1.expect)(typeof c.metadata.tokenCount).toBe('number');
16
+ (0, vitest_1.expect)(typeof c.metadata.charCount).toBe('number');
17
+ (0, vitest_1.expect)(typeof c.metadata.startOffset).toBe('number');
18
+ (0, vitest_1.expect)(typeof c.metadata.endOffset).toBe('number');
19
+ }
20
+ });
21
+ (0, vitest_1.it)('respects maxTokens option', () => {
22
+ const text = 'Word '.repeat(500);
23
+ const result = (0, chunker_1.chunk)(text, { maxTokens: 50, contentType: 'text' });
24
+ for (const c of result) {
25
+ (0, vitest_1.expect)(c.metadata.tokenCount).toBeLessThanOrEqual(55); // small tolerance
26
+ }
27
+ });
28
+ (0, vitest_1.it)('assigns correct contentType in metadata for markdown', () => {
29
+ const text = `# Heading\n\nParagraph text here.\n\n## Another heading\n\nMore text.`;
30
+ const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
31
+ for (const c of result) {
32
+ (0, vitest_1.expect)(c.metadata.contentType).toBe('markdown');
33
+ }
34
+ });
35
+ (0, vitest_1.it)('assigns correct contentType in metadata for code', () => {
36
+ const code = `function foo() { return 1 }\nfunction bar() { return 2 }`;
37
+ const result = (0, chunker_1.chunk)(code, { contentType: 'code' });
38
+ for (const c of result) {
39
+ (0, vitest_1.expect)(c.metadata.contentType).toBe('code');
40
+ }
41
+ });
42
+ (0, vitest_1.it)('assigns sequential indices to chunks', () => {
43
+ const text = 'First paragraph.\n\nSecond paragraph.\n\nThird paragraph.';
44
+ const result = (0, chunker_1.chunk)(text, { maxTokens: 5, contentType: 'text' });
45
+ result.forEach((c, i) => {
46
+ (0, vitest_1.expect)(c.metadata.index).toBe(i);
47
+ });
48
+ });
49
+ (0, vitest_1.it)('extracts headings from markdown chunks', () => {
50
+ const text = `# Main Heading\n\nSome content under main heading.\n\n## Sub Heading\n\nSome content under sub.`;
51
+ const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
52
+ const withHeadings = result.filter(c => c.metadata.headings.length > 0);
53
+ (0, vitest_1.expect)(withHeadings.length).toBeGreaterThan(0);
54
+ });
55
+ (0, vitest_1.it)('detects JSON content type automatically', () => {
56
+ const json = JSON.stringify({ foo: 'bar', baz: 42 });
57
+ const result = (0, chunker_1.chunk)(json);
58
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(0);
59
+ (0, vitest_1.expect)(result[0].metadata.contentType).toBe('json');
60
+ });
61
+ (0, vitest_1.it)('handles empty string gracefully', () => {
62
+ const result = (0, chunker_1.chunk)('');
63
+ (0, vitest_1.expect)(Array.isArray(result)).toBe(true);
64
+ (0, vitest_1.expect)(result.length).toBe(0);
65
+ });
66
+ (0, vitest_1.it)('charCount matches content length', () => {
67
+ const text = `# Title\n\nA paragraph of reasonable length here.\n\n## Section\n\nAnother paragraph.`;
68
+ const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
69
+ for (const c of result) {
70
+ (0, vitest_1.expect)(c.metadata.charCount).toBe(c.content.length);
71
+ }
72
+ });
73
+ (0, vitest_1.it)('tokenCount is ceil(charCount / 4)', () => {
74
+ const text = `# Title\n\nA paragraph of reasonable length here.`;
75
+ const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
76
+ for (const c of result) {
77
+ (0, vitest_1.expect)(c.metadata.tokenCount).toBe(Math.ceil(c.content.length / 4));
78
+ }
79
+ });
80
+ });
81
+ (0, vitest_1.describe)('createChunker()', () => {
82
+ (0, vitest_1.it)('returns a Chunker object with all methods', () => {
83
+ const chunker = (0, chunker_1.createChunker)();
84
+ (0, vitest_1.expect)(typeof chunker.chunk).toBe('function');
85
+ (0, vitest_1.expect)(typeof chunker.chunkMarkdown).toBe('function');
86
+ (0, vitest_1.expect)(typeof chunker.chunkCode).toBe('function');
87
+ (0, vitest_1.expect)(typeof chunker.chunkJSON).toBe('function');
88
+ (0, vitest_1.expect)(typeof chunker.detectContentType).toBe('function');
89
+ });
90
+ (0, vitest_1.it)('applies default options to all chunk calls', () => {
91
+ const chunker = (0, chunker_1.createChunker)({ maxTokens: 20 });
92
+ const text = 'A'.repeat(400);
93
+ const result = chunker.chunk(text, { contentType: 'text' });
94
+ for (const c of result) {
95
+ (0, vitest_1.expect)(c.metadata.tokenCount).toBeLessThanOrEqual(25);
96
+ }
97
+ });
98
+ (0, vitest_1.it)('chunkMarkdown forces markdown content type', () => {
99
+ const text = `# Heading\n\nText here.\n\n## Other\n\nMore text.`;
100
+ const chunker = (0, chunker_1.createChunker)();
101
+ const result = chunker.chunkMarkdown(text);
102
+ for (const c of result) {
103
+ (0, vitest_1.expect)(c.metadata.contentType).toBe('markdown');
104
+ }
105
+ });
106
+ (0, vitest_1.it)('chunkCode forces code content type', () => {
107
+ const code = `function a() {}\nfunction b() {}`;
108
+ const chunker = (0, chunker_1.createChunker)();
109
+ const result = chunker.chunkCode(code);
110
+ for (const c of result) {
111
+ (0, vitest_1.expect)(c.metadata.contentType).toBe('code');
112
+ }
113
+ });
114
+ (0, vitest_1.it)('chunkJSON forces json content type', () => {
115
+ const json = '{"a":1,"b":2}';
116
+ const chunker = (0, chunker_1.createChunker)();
117
+ const result = chunker.chunkJSON(json);
118
+ for (const c of result) {
119
+ (0, vitest_1.expect)(c.metadata.contentType).toBe('json');
120
+ }
121
+ });
122
+ (0, vitest_1.it)('detectContentType delegates to detect module', () => {
123
+ const chunker = (0, chunker_1.createChunker)();
124
+ const result = chunker.detectContentType('{"x": 1}');
125
+ (0, vitest_1.expect)(result.type).toBe('json');
126
+ (0, vitest_1.expect)(result.confidence).toBeGreaterThan(0);
127
+ });
128
+ });
129
+ //# sourceMappingURL=chunk.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunk.test.js","sourceRoot":"","sources":["../../src/__tests__/chunk.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,wCAAiD;AAEjD,IAAA,iBAAQ,EAAC,SAAS,EAAE,GAAG,EAAE;IACvB,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,6EAA6E,CAAA;QAC1F,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,CAAC,CAAA;QAC1B,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YACvC,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAA;YAChC,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YAC9C,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YACnD,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YAClD,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YACpD,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;QACpD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAChC,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QAClE,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,mBAAmB,CAAC,EAAE,CAAC,CAAA,CAAC,kBAAkB;QAC1E,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,MAAM,IAAI,GAAG,uEAAuE,CAAA;QACpF,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,kDAAkD,EAAE,GAAG,EAAE;QAC1D,MAAM,IAAI,GAAG,0DAA0D,CAAA;QACvE,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QACnD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,
eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,2DAA2D,CAAA;QACxE,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QACjE,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YACtB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAClC,CAAC,CAAC,CAAA;IACJ,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,IAAI,GAAG,iGAAiG,CAAA;QAC9G,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;QACvE,IAAA,eAAM,EAAC,YAAY,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAChD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAA;QACpD,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,CAAC,CAAA;QAC1B,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACrD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,EAAE,CAAC,CAAA;QACxB,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IAC/B,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,IAAI,GAAG,uFAAuF,CAAA;QACpG,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAA;QACrD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,mDAAmD,CAAA;QAChE,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,I
AAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAA;QACrE,CAAC;IACH,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,IAAA,WAAE,EAAC,2CAA2C,EAAE,GAAG,EAAE;QACnD,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACrD,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;IAC3D,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,OAAO,GAAG,IAAA,uBAAa,EAAC,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC,CAAA;QAChD,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC5B,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QAC3D,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,mBAAmB,CAAC,EAAE,CAAC,CAAA;QACvD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,IAAI,GAAG,mDAAmD,CAAA;QAChE,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,CAAA;QAC1C,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,kCAAkC,CAAA;QAC/C,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;QACtC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,eAAe,CAAA;QAC5B,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;QACtC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAA
M,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC,UAAU,CAAC,CAAA;QACpD,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC9C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=detect.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/detect.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,72 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ const vitest_1 = require("vitest");
4
+ const detect_1 = require("../detect");
5
+ (0, vitest_1.describe)('detectContentType', () => {
6
+ (0, vitest_1.it)('detects valid JSON object with high confidence', () => {
7
+ const result = (0, detect_1.detectContentType)('{"key": "value", "num": 42}');
8
+ (0, vitest_1.expect)(result.type).toBe('json');
9
+ (0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.9);
10
+ });
11
+ (0, vitest_1.it)('detects valid JSON array with high confidence', () => {
12
+ const result = (0, detect_1.detectContentType)('[1, 2, 3]');
13
+ (0, vitest_1.expect)(result.type).toBe('json');
14
+ (0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.9);
15
+ });
16
+ (0, vitest_1.it)('detects malformed JSON starting with { at lower confidence', () => {
17
+ const result = (0, detect_1.detectContentType)('{ not valid json here }');
18
+ (0, vitest_1.expect)(result.type).toBe('json');
19
+ (0, vitest_1.expect)(result.confidence).toBeLessThan(0.9);
20
+ });
21
+ (0, vitest_1.it)('detects markdown with headings', () => {
22
+ const md = `# Title\n\nSome paragraph text here.\n\n## Section Two\n\nMore text.`;
23
+ const result = (0, detect_1.detectContentType)(md);
24
+ (0, vitest_1.expect)(result.type).toBe('markdown');
25
+ (0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.8);
26
+ });
27
+ (0, vitest_1.it)('detects markdown with code fences', () => {
28
+ const md = `Some text\n\n\`\`\`js\nconsole.log('hi')\n\`\`\``;
29
+ const result = (0, detect_1.detectContentType)(md);
30
+ (0, vitest_1.expect)(result.type).toBe('markdown');
31
+ });
32
+ (0, vitest_1.it)('detects HTML with DOCTYPE', () => {
33
+ const html = `<!DOCTYPE html><html><body><p>Hello</p></body></html>`;
34
+ const result = (0, detect_1.detectContentType)(html);
35
+ (0, vitest_1.expect)(result.type).toBe('html');
36
+ (0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.9);
37
+ });
38
+ (0, vitest_1.it)('detects HTML with div tags', () => {
39
+ const html = `<div class="container"><p>Hello world</p></div>`;
40
+ const result = (0, detect_1.detectContentType)(html);
41
+ (0, vitest_1.expect)(result.type).toBe('html');
42
+ });
43
+ (0, vitest_1.it)('detects code with function keyword', () => {
44
+ const code = `function greet(name) {\n return 'Hello ' + name\n}`;
45
+ const result = (0, detect_1.detectContentType)(code);
46
+ (0, vitest_1.expect)(result.type).toBe('code');
47
+ (0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.7);
48
+ });
49
+ (0, vitest_1.it)('detects code with class keyword', () => {
50
+ const code = `class MyClass {\n constructor() {}\n}`;
51
+ const result = (0, detect_1.detectContentType)(code);
52
+ (0, vitest_1.expect)(result.type).toBe('code');
53
+ });
54
+ (0, vitest_1.it)('detects code with Python def keyword', () => {
55
+ const code = `def compute(x, y):\n return x + y\n`;
56
+ const result = (0, detect_1.detectContentType)(code);
57
+ (0, vitest_1.expect)(result.type).toBe('code');
58
+ });
59
+ (0, vitest_1.it)('falls back to text for plain prose', () => {
60
+ const text = `This is just a plain text paragraph without any special syntax. It has multiple sentences and no special markers.`;
61
+ const result = (0, detect_1.detectContentType)(text);
62
+ (0, vitest_1.expect)(result.type).toBe('text');
63
+ (0, vitest_1.expect)(result.confidence).toBe(0.5);
64
+ });
65
+ (0, vitest_1.it)('detects YAML with key-value lines', () => {
66
+ const yaml = `name: my-project\nversion: 1.0.0\ndescription: a project\nauthor: someone`;
67
+ const result = (0, detect_1.detectContentType)(yaml);
68
+ (0, vitest_1.expect)(result.type).toBe('yaml');
69
+ (0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.8);
70
+ });
71
+ });
72
+ //# sourceMappingURL=detect.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect.test.js","sourceRoot":"","sources":["../../src/__tests__/detect.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,sCAA6C;AAE7C,IAAA,iBAAQ,EAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,IAAA,WAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,6BAA6B,CAAC,CAAA;QAC/D,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,WAAW,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4DAA4D,EAAE,GAAG,EAAE;QACpE,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,yBAAyB,CAAC,CAAA;QAC3D,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAA;IAC7C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,EAAE,GAAG,sEAAsE,CAAA;QACjF,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,EAAE,CAAC,CAAA;QACpC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACpC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,EAAE,GAAG,kDAAkD,CAAA;QAC7D,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,EAAE,CAAC,CAAA;QACpC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;IACtC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,IAAI,GAAG,uDAAuD,CAAA;QACpE,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,MAAM,IAAI,GAAG,iDAAiD,CAAA;QAC9D,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAA
C,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,qDAAqD,CAAA;QAClE,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,IAAI,GAAG,wCAAwC,CAAA;QACrD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,wCAAwC,CAAA;QACrD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,mHAAmH,CAAA;QAChI,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACrC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,2EAA2E,CAAA;QACxF,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
@@ -0,0 +1,2 @@
1
+ export {};
2
+ //# sourceMappingURL=split.test.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"split.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/split.test.ts"],"names":[],"mappings":""}
@@ -0,0 +1,115 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ const vitest_1 = require("vitest");
4
+ const split_1 = require("../split");
5
+ function opts(maxTokens, overlap = 0) { // test helper: builds the options object handed to the split* calls in this file; overlap defaults to 0
6
+ return { maxTokens, overlap, minTokens: 0, preserveStructure: true }; // minTokens and preserveStructure are held constant for every test
7
+ }
8
+ (0, vitest_1.describe)('splitByTokenCount', () => {
9
+ (0, vitest_1.it)('returns single chunk when text fits within maxTokens', () => {
10
+ const text = 'Short text';
11
+ const result = (0, split_1.splitByTokenCount)(text, 512, 0);
12
+ (0, vitest_1.expect)(result).toHaveLength(1);
13
+ (0, vitest_1.expect)(result[0]).toBe(text);
14
+ });
15
+ (0, vitest_1.it)('splits long text into multiple chunks respecting maxTokens', () => {
16
+ // 200 chars -> 50 tokens; max is 10 tokens = 40 chars
17
+ const text = 'A'.repeat(200);
18
+ const result = (0, split_1.splitByTokenCount)(text, 10, 0);
19
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
20
+ for (const chunk of result) {
21
+ (0, vitest_1.expect)(chunk.length).toBeLessThanOrEqual(40 + 5); // small tolerance for boundary snapping
22
+ }
23
+ });
24
+ (0, vitest_1.it)('adds overlap between chunks', () => {
25
+ const text = 'Hello world. This is a sentence. And another one follows here. Final words.';
26
+ const result = (0, split_1.splitByTokenCount)(text, 5, 1); // 5 tokens = 20 chars, 1 token overlap = 4 chars
27
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
28
+ });
29
+ });
30
+ (0, vitest_1.describe)('splitMarkdown', () => {
31
+ (0, vitest_1.it)('splits at heading boundaries', () => {
32
+ const md = `# Heading One\n\nParagraph under heading one.\n\n# Heading Two\n\nParagraph under heading two.`;
33
+ const result = (0, split_1.splitMarkdown)(md, opts(512));
34
+ (0, vitest_1.expect)(result.length).toBeGreaterThanOrEqual(2);
35
+ (0, vitest_1.expect)(result.some(r => r.includes('# Heading One'))).toBe(true);
36
+ (0, vitest_1.expect)(result.some(r => r.includes('# Heading Two'))).toBe(true);
37
+ });
38
+ (0, vitest_1.it)('splits a large section at paragraph boundaries', () => {
39
+ // Each paragraph is 38 chars ("Para N: " + 30 x's); maxTokens=10 means 40 chars, so two paragraphs together (78 chars with the blank line) exceed that budget
40
+ const para = (n) => `Para ${n}: ` + 'x'.repeat(30);
41
+ const md = `# Title\n\n${para(1)}\n\n${para(2)}\n\n${para(3)}`;
42
+ const result = (0, split_1.splitMarkdown)(md, opts(10));
43
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
44
+ });
45
+ (0, vitest_1.it)('handles text with no headings by falling back to paragraph split', () => {
46
+ const text = 'First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here.';
47
+ const result = (0, split_1.splitMarkdown)(text, opts(512));
48
+ (0, vitest_1.expect)(result.length).toBeGreaterThanOrEqual(1);
49
+ });
50
+ (0, vitest_1.it)('returns entire text as one chunk when it fits', () => {
51
+ const text = '# Small doc\n\nJust a small paragraph.';
52
+ const result = (0, split_1.splitMarkdown)(text, opts(512));
53
+ (0, vitest_1.expect)(result).toHaveLength(1);
54
+ });
55
+ });
56
+ (0, vitest_1.describe)('splitCode', () => {
57
+ (0, vitest_1.it)('returns single chunk for small code', () => {
58
+ const code = `function foo() { return 1 }`;
59
+ const result = (0, split_1.splitCode)(code, opts(512));
60
+ (0, vitest_1.expect)(result).toHaveLength(1);
61
+ });
62
+ (0, vitest_1.it)('splits at function boundaries', () => {
63
+ const code = `function alpha() {\n return 1\n}\n\nfunction beta() {\n return 2\n}\n\nfunction gamma() {\n return 3\n}`;
64
+ const result = (0, split_1.splitCode)(code, opts(5)); // tiny maxTokens to force splits
65
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
66
+ });
67
+ (0, vitest_1.it)('splits at blank lines when no top-level patterns found', () => {
68
+ const code = `x = 1\ny = 2\n\nz = x + y\n\nprint(z)\n`.repeat(20);
69
+ const result = (0, split_1.splitCode)(code, opts(10));
70
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
71
+ });
72
+ });
73
+ (0, vitest_1.describe)('splitJSON', () => {
74
+ (0, vitest_1.it)('splits object by top-level keys', () => {
75
+ const obj = {};
76
+ for (let i = 0; i < 20; i++)
77
+ obj[`key${i}`] = i;
78
+ const json = JSON.stringify(obj, null, 2);
79
+ const result = (0, split_1.splitJSON)(json, opts(10)); // force small chunk size
80
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
81
+ // Each chunk should be valid JSON
82
+ for (const c of result) {
83
+ (0, vitest_1.expect)(() => JSON.parse(c)).not.toThrow();
84
+ }
85
+ });
86
+ (0, vitest_1.it)('splits array into groups', () => {
87
+ const arr = Array.from({ length: 50 }, (_, i) => ({ id: i, value: `item${i}` }));
88
+ const json = JSON.stringify(arr, null, 2);
89
+ const result = (0, split_1.splitJSON)(json, opts(10));
90
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
91
+ });
92
+ (0, vitest_1.it)('falls back to token split for invalid JSON', () => {
93
+ const text = 'not json at all'.repeat(50);
94
+ const result = (0, split_1.splitJSON)(text, opts(10));
95
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(0);
96
+ });
97
+ });
98
+ (0, vitest_1.describe)('splitText', () => {
99
+ (0, vitest_1.it)('returns single chunk when text fits', () => {
100
+ const text = 'Short text that fits easily.';
101
+ const result = (0, split_1.splitText)(text, opts(512));
102
+ (0, vitest_1.expect)(result).toHaveLength(1);
103
+ });
104
+ (0, vitest_1.it)('splits at paragraph boundaries', () => {
105
+ const text = 'First paragraph.\n\nSecond paragraph.\n\nThird paragraph.';
106
+ const result = (0, split_1.splitText)(text, opts(3)); // 3 tokens = 12 chars — forces splits
107
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
108
+ });
109
+ (0, vitest_1.it)('splits at sentence boundaries when paragraph is too large', () => {
110
+ const longPara = 'This is sentence one. This is sentence two. This is sentence three. This is sentence four.';
111
+ const result = (0, split_1.splitText)(longPara, opts(5));
112
+ (0, vitest_1.expect)(result.length).toBeGreaterThan(1);
113
+ });
114
+ });
115
+ //# sourceMappingURL=split.test.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"split.test.js","sourceRoot":"","sources":["../../src/__tests__/split.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,oCAA4F;AAG5F,SAAS,IAAI,CAAC,SAAiB,EAAE,OAAO,GAAG,CAAC;IAC1C,OAAO,EAAE,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,iBAAiB,EAAE,IAAI,EAAE,CAAA;AACtE,CAAC;AAED,IAAA,iBAAQ,EAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,IAAA,WAAE,EAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,MAAM,IAAI,GAAG,YAAY,CAAA;QACzB,MAAM,MAAM,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC,CAAA;QAC9C,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;QAC9B,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC9B,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4DAA4D,EAAE,GAAG,EAAE;QACpE,sDAAsD;QACtD,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC5B,MAAM,MAAM,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAA,eAAM,EAAC,KAAK,CAAC,MAAM,CAAC,CAAC,mBAAmB,CAAC,EAAE,GAAG,CAAC,CAAC,CAAA,CAAC,wCAAwC;QAC3F,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,IAAI,GAAG,6EAA6E,CAAA;QAC1F,MAAM,MAAM,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC,CAAA,CAAC,iDAAiD;QAC9F,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,IAAA,WAAE,EAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,EAAE,GAAG,gGAAgG,CAAA;QAC3G,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,EAAE,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAC3C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAA;QAC/C,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAChE,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAClE,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,4DAA4D;QAC5D,MAAM,IAAI,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI
,GAAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;QAC1D,MAAM,EAAE,GAAG,cAAc,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,EAAE,CAAA;QAC9D,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,EAAE,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QAC1C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,kEAAkE,EAAE,GAAG,EAAE;QAC1E,MAAM,IAAI,GAAG,0EAA0E,CAAA;QACvF,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAA;IACjD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,IAAI,GAAG,wCAAwC,CAAA;QACrD,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,WAAW,EAAE,GAAG,EAAE;IACzB,IAAA,WAAE,EAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,IAAI,GAAG,6BAA6B,CAAA;QAC1C,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QACzC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,MAAM,IAAI,GAAG,4GAA4G,CAAA;QACzH,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA,CAAC,iCAAiC;QACzE,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,wDAAwD,EAAE,GAAG,EAAE;QAChE,MAAM,IAAI,GAAG,yCAAyC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;QACjE,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,WAAW,EAAE,GAAG,EAAE;IACzB,IAAA,WAAE,EAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,GAAG,GAA2B,EAAE,CAAA;QACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,CAAA;QAC/C,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAA
C,EAAE,CAAC,CAAC,CAAA,CAAC,yBAAyB;QAClE,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,kCAAkC;QAClC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,GAAG,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,EAAE,CAAA;QAC3C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,0BAA0B,EAAE,GAAG,EAAE;QAClC,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,EAAE,EAAE,CAAC,CAAC,CAAA;QAChF,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,IAAI,GAAG,iBAAiB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,WAAW,EAAE,GAAG,EAAE;IACzB,IAAA,WAAE,EAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,IAAI,GAAG,8BAA8B,CAAA;QAC3C,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QACzC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,IAAI,GAAG,2DAA2D,CAAA;QACxE,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA,CAAC,sCAAsC;QAC9E,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,2DAA2D,EAAE,GAAG,EAAE;QACnE,MAAM,QAAQ,GAAG,4FAA4F,CAAA;QAC7G,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QAC3C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
@@ -0,0 +1,4 @@
1
+ import type { Chunk, ChunkOptions, Chunker } from './types';
2
+ export declare function chunk(text: string, options?: ChunkOptions): Chunk[];
3
+ export declare function createChunker(defaultOptions?: ChunkOptions): Chunker;
4
+ //# sourceMappingURL=chunker.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAiB,YAAY,EAAE,OAAO,EAA6B,MAAM,SAAS,CAAA;AAuFrG,wBAAgB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,KAAK,EAAE,CAsCnE;AAED,wBAAgB,aAAa,CAAC,cAAc,CAAC,EAAE,YAAY,GAAG,OAAO,CAwBpE"}
@@ -0,0 +1,136 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.chunk = chunk;
4
+ exports.createChunker = createChunker;
5
+ const detect_1 = require("./detect");
6
+ const split_1 = require("./split");
7
/** Fallback option values applied when the caller does not supply one. */
const DEFAULTS = {
    maxTokens: 512,
    minTokens: 50,
    overlap: 0,
    contentType: 'text',
    preserveStructure: true,
};
/**
 * Merge `overrides` on top of `defaults` into a new object.
 *
 * Keys whose override value is `undefined` are ignored, so passing
 * `{ maxTokens: undefined }` keeps the default instead of replacing it with
 * `undefined` (which would turn the character budgets computed from it into NaN).
 *
 * @param {object} defaults - Base option values; not mutated.
 * @param {object} [overrides] - Caller-supplied values; may be null/undefined.
 * @returns {object} A fresh object containing the merged options.
 */
function resolveOptions(defaults, overrides) {
    const merged = { ...defaults };
    for (const [key, value] of Object.entries(overrides ?? {})) {
        if (value !== undefined) {
            merged[key] = value;
        }
    }
    return merged;
}
17
/**
 * Estimate the token count of `text` as one token per four characters,
 * rounded up.
 *
 * @param {string} text
 * @returns {number}
 */
function tokensFor(text) {
    const CHARS_PER_TOKEN = 4;
    const estimate = text.length / CHARS_PER_TOKEN;
    return Math.ceil(estimate);
}
20
/**
 * Collect the text of every ATX heading (levels 1-6) found in a markdown chunk.
 *
 * A heading is a line that starts with one to six `#` characters followed by
 * whitespace. An optional closing run of `#` (as in `## Title ##`) is dropped
 * so only the title text is reported.
 *
 * @param {string} content - Markdown text.
 * @returns {string[]} Heading titles in document order.
 */
function extractHeadings(content) {
    const headings = [];
    for (const line of content.split('\n')) {
        // Lazy capture + optional " ###" suffix strips the ATX closing sequence;
        // the trailing \s* also absorbs a stray "\r" from CRLF input.
        const m = line.match(/^#{1,6}\s+(.+?)(?:\s+#+)?\s*$/);
        if (m) {
            headings.push(m[1].trim());
        }
    }
    return headings;
}
30
/**
 * Guess the language of a code chunk from its first non-empty line.
 *
 * Two hints are recognised:
 *  - a code fence with an info string (```` ```ts ````, `~~~c++`), and
 *  - a shebang line (`#!/bin/bash`, `#!/usr/bin/env python3`).
 *
 * For `env` shebangs the interpreter is the first argument that is neither a
 * flag (`-S`) nor a `NAME=value` assignment, so `#!/usr/bin/env python3`
 * reports `python3` rather than `env`.
 *
 * @param {string} content - Code text.
 * @returns {string | undefined} The detected language, or `undefined`.
 */
function detectCodeLanguage(content) {
    const first = (content.trimStart().split('\n')[0] ?? '').trimEnd();
    // Allow +, # and - so names like "c++", "c#" and "objective-c" survive intact.
    const fenceMatch = first.match(/^(?:`{3,}|~{3,})\s*([\w+#-]+)/);
    if (fenceMatch) {
        return fenceMatch[1];
    }
    const shebangMatch = first.match(/^#!\s*(\S+)(.*)$/);
    if (!shebangMatch) {
        return undefined;
    }
    const baseName = (path) => path.slice(path.lastIndexOf('/') + 1);
    let program = baseName(shebangMatch[1]);
    if (program === 'env') {
        const interpreter = shebangMatch[2]
            .split(/\s+/)
            .find((token) => token.length > 0 && !token.startsWith('-') && !token.includes('='));
        program = interpreter === undefined ? '' : baseName(interpreter);
    }
    // Keep only the leading word characters, e.g. "python3.11" -> "python3".
    const name = program.match(/^\w+/);
    return name ? name[0] : undefined;
}
41
/**
 * Turn the string parts produced by a splitter into chunk objects with metadata.
 *
 * Each non-blank part is located in `originalText` by searching forward from
 * just past the previous chunk's start offset; parts that cannot be found
 * (for example re-serialized JSON) fall back to that search position.
 *
 * @param {string[]} parts - Splitter output, in document order.
 * @param {string} originalText - The text the parts were derived from.
 * @param {string} contentType - Content type recorded on every chunk.
 * @param {object} options - Resolved options; only `overlap` (in tokens) is read.
 * @returns {Array<{content: string, metadata: object}>} One entry per non-blank part.
 */
function buildChunks(parts, originalText, contentType, options) {
    // A missing, non-numeric or negative overlap would otherwise surface as NaN
    // or negative overlapBefore/overlapAfter values.
    const overlapTokens = Number.isFinite(options.overlap) ? Math.max(0, options.overlap) : 0;
    const overlapChars = overlapTokens * 4;
    const chunks = [];
    let searchFrom = 0;
    for (let i = 0; i < parts.length; i++) {
        const content = parts[i];
        if (!content || content.trim().length === 0) {
            continue;
        }
        // Length of the span of originalText this chunk is taken to cover.
        let matchedLength = content.length;
        let startOffset = originalText.indexOf(content.trimEnd(), searchFrom);
        if (startOffset === -1) {
            // Fallback: the part carries leading whitespace the original lacks here,
            // so only the trimmed text is actually matched.
            const trimmed = content.trim();
            startOffset = originalText.indexOf(trimmed, searchFrom);
            if (startOffset !== -1) {
                matchedLength = trimmed.length;
            }
        }
        if (startOffset === -1) {
            startOffset = Math.min(searchFrom, originalText.length);
        }
        // Never report an end position beyond the source text.
        const endOffset = Math.min(originalText.length, startOffset + matchedLength);
        const overlapBefore = i > 0 ? Math.min(overlapChars, (parts[i - 1] ?? '').length) : 0;
        const overlapAfter = i < parts.length - 1 ? Math.min(overlapChars, (parts[i + 1] ?? '').length) : 0;
        const metadata = {
            index: chunks.length,
            startOffset,
            endOffset,
            tokenCount: tokensFor(content),
            charCount: content.length,
            contentType,
            headings: contentType === 'markdown' ? extractHeadings(content) : [],
            codeLanguage: contentType === 'code' ? detectCodeLanguage(content) : undefined,
            overlapBefore,
            overlapAfter,
        };
        chunks.push({ content, metadata });
        // Advance by one character only: consecutive parts may legitimately overlap.
        searchFrom = Math.max(searchFrom, startOffset + 1);
    }
    return chunks;
}
78
/**
 * Split `text` into chunks with metadata.
 *
 * The content type is the caller's `options.contentType` when that is set to a
 * truthy value; otherwise it is auto-detected from the text. Markdown, code and
 * JSON go to their dedicated splitters. Every other type uses paragraph/sentence
 * splitting when `preserveStructure` is on, and plain token-count splitting
 * when it is off.
 *
 * @param {string} text - Input text.
 * @param {object} [options] - Chunking options; missing values come from DEFAULTS.
 * @returns {Array<{content: string, metadata: object}>}
 */
function chunk(text, options) {
    const opts = resolveOptions(DEFAULTS, options);
    // Only a type supplied by the caller counts as explicit; the 'text' default
    // merged into `opts` must not suppress detection.
    const kind = options?.contentType || (0, detect_1.detectContentType)(text).type;
    let pieces;
    if (kind === 'markdown') {
        pieces = (0, split_1.splitMarkdown)(text, opts);
    }
    else if (kind === 'code') {
        pieces = (0, split_1.splitCode)(text, opts);
    }
    else if (kind === 'json') {
        pieces = (0, split_1.splitJSON)(text, opts);
    }
    else if (opts.preserveStructure) {
        // html, yaml, text and any unrecognised type
        pieces = (0, split_1.splitText)(text, opts);
    }
    else {
        pieces = (0, split_1.splitByTokenCount)(text, opts.maxTokens, opts.overlap);
    }
    return buildChunks(pieces, text, kind, opts);
}
116
/**
 * Create a chunker with preset options.
 *
 * Only the options the caller actually supplied are stored as presets; the
 * library defaults are applied later by `chunk()`. Merging the library defaults
 * here would always hand `chunk()` a `contentType` of 'text', which it treats
 * as an explicit choice, so `chunker.chunk(text)` would never auto-detect the
 * content type.
 *
 * @param {object} [defaultOptions] - Options applied to every call unless overridden.
 * @returns {object} An object with `chunk`, `chunkMarkdown`, `chunkCode`,
 *   `chunkJSON` and `detectContentType` methods.
 */
function createChunker(defaultOptions) {
    // Copy so later mutation of the caller's object does not affect the chunker.
    const defaults = resolveOptions({}, defaultOptions);
    // `forcedType`, when given, wins over both presets and per-call overrides.
    const run = (text, overrides, forcedType) => {
        const merged = resolveOptions(defaults, overrides);
        return chunk(text, forcedType === undefined ? merged : { ...merged, contentType: forcedType });
    };
    return {
        chunk(text, overrides) {
            return run(text, overrides);
        },
        chunkMarkdown(text, overrides) {
            return run(text, overrides, 'markdown');
        },
        chunkCode(text, overrides) {
            return run(text, overrides, 'code');
        },
        chunkJSON(text, overrides) {
            return run(text, overrides, 'json');
        },
        detectContentType(text) {
            return (0, detect_1.detectContentType)(text);
        },
    };
}
136
+ //# sourceMappingURL=chunker.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":";;AAyFA,sBAsCC;AAED,sCAwBC;AAzJD,qCAA4C;AAC5C,mCAA2F;AAG3F,MAAM,QAAQ,GAA2B;IACvC,SAAS,EAAE,GAAG;IACd,SAAS,EAAE,EAAE;IACb,OAAO,EAAE,CAAC;IACV,WAAW,EAAE,MAAM;IACnB,iBAAiB,EAAE,IAAI;CACxB,CAAA;AAED,SAAS,cAAc,CAAC,QAAgC,EAAE,SAAiC;IACzF,OAAO,EAAE,GAAG,QAAQ,EAAE,GAAG,CAAC,SAAS,IAAI,EAAE,CAAC,EAAE,CAAA;AAC9C,CAAC;AAED,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AACnC,CAAC;AAED,uEAAuE;AACvE,SAAS,eAAe,CAAC,OAAe;IACtC,MAAM,QAAQ,GAAa,EAAE,CAAA;IAC7B,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAA;QACxC,IAAI,CAAC;YAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;IACnC,CAAC;IACD,OAAO,QAAQ,CAAA;AACjB,CAAC;AAED,0EAA0E;AAC1E,SAAS,kBAAkB,CAAC,OAAe;IACzC,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;IACtD,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;IAC3C,IAAI,UAAU;QAAE,OAAO,UAAU,CAAC,CAAC,CAAC,CAAA;IACpC,MAAM,YAAY,GAAG,KAAK,CAAC,KAAK,CAAC,cAAc,CAAC,CAAA;IAChD,IAAI,YAAY;QAAE,OAAO,YAAY,CAAC,CAAC,CAAC,CAAA;IACxC,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,SAAS,WAAW,CAClB,KAAe,EACf,YAAoB,EACpB,WAAwB,EACxB,OAA+B;IAE/B,MAAM,YAAY,GAAG,OAAO,CAAC,OAAO,GAAG,CAAC,CAAA;IACxC,MAAM,MAAM,GAAY,EAAE,CAAA;IAE1B,oEAAoE;IACpE,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;QACxB,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QAErD,oDAAoD;QACpD,IAAI,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,UAAU,CAAC,CAAA;QACrE,IAAI,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;YACvB,2BAA2B;YAC3B,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,UAAU,CAAC,CAAA;QAChE,CAAC;QACD,IAAI,WAAW,KAAK,CAAC,CAAC;YAAE,WAAW,GAAG,UAAU,CAAA;QAEhD,MAAM,SAAS,GAAG,WAAW,GAAG,OAAO,CAAC,MAAM,CAAA;QAE9C,MAAM,aAAa,GAAG,CAAC,GAAG,CAAC,CAAC,CAA
C,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACrF,MAAM,YAAY,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAEnG,MAAM,QAAQ,GAAkB;YAC9B,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,WAAW;YACX,SAAS;YACT,UAAU,EAAE,SAAS,CAAC,OAAO,CAAC;YAC9B,SAAS,EAAE,OAAO,CAAC,MAAM;YACzB,WAAW;YACX,QAAQ,EAAE,WAAW,KAAK,UAAU,CAAC,CAAC,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE;YACpE,YAAY,EAAE,WAAW,KAAK,MAAM,CAAC,CAAC,CAAC,kBAAkB,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS;YAC9E,aAAa;YACb,YAAY;SACb,CAAA;QAED,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAA;QAClC,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,WAAW,GAAG,CAAC,CAAC,CAAA;IACpD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAgB,KAAK,CAAC,IAAY,EAAE,OAAsB;IACxD,MAAM,IAAI,GAAG,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;IAE9C,IAAI,WAAwB,CAAA;IAC5B,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC,WAAW,KAAK,MAAM,EAAE,CAAC;QACpD,WAAW,GAAG,IAAI,CAAC,WAAW,CAAA;IAChC,CAAC;SAAM,IAAI,OAAO,EAAE,WAAW,EAAE,CAAC;QAChC,WAAW,GAAG,OAAO,CAAC,WAAW,CAAA;IACnC,CAAC;SAAM,CAAC;QACN,MAAM,QAAQ,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACxC,WAAW,GAAG,QAAQ,CAAC,IAAI,CAAA;IAC7B,CAAC;IAED,IAAI,KAAe,CAAA;IAEnB,QAAQ,WAAW,EAAE,CAAC;QACpB,KAAK,UAAU;YACb,KAAK,GAAG,IAAA,qBAAa,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YACjC,MAAK;QACP,KAAK,MAAM;YACT,KAAK,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YAC7B,MAAK;QACP,KAAK,MAAM;YACT,KAAK,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YAC7B,MAAK;QACP,KAAK,MAAM,CAAC;QACZ,KAAK,MAAM,CAAC;QACZ,KAAK,MAAM,CAAC;QACZ;YACE,IAAI,IAAI,CAAC,iBAAiB,EAAE,CAAC;gBAC3B,KAAK,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YAC/B,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,CAAA;YAC/D,CAAC;YACD,MAAK;IACT,CAAC;IAED,OAAO,WAAW,CAAC,KAAK,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,CAAC,CAAA;AACpD,CAAC;AAED,SAAgB,aAAa,CAAC,cAA6B;IACzD,MAAM,QAAQ,GAAG,cAAc,CAAC,QAAQ,EAA
E,cAAc,CAAC,CAAA;IAEzD,OAAO;QACL,KAAK,CAAC,IAAY,EAAE,SAAiC;YACnD,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC,CAAA;QACzD,CAAC;QAED,aAAa,CAAC,IAAY,EAAE,SAAiC;YAC3D,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,EAAE,GAAG,SAAS,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAC,CAAA;QACzF,CAAC;QAED,SAAS,CAAC,IAAY,EAAE,SAAiC;YACvD,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,EAAE,GAAG,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC,CAAA;QACrF,CAAC;QAED,SAAS,CAAC,IAAY,EAAE,SAAiC;YACvD,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,EAAE,GAAG,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC,CAAA;QACrF,CAAC;QAED,iBAAiB,CAAC,IAAY;YAC5B,OAAO,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QAChC,CAAC;KACF,CAAA;AACH,CAAC"}
@@ -0,0 +1,3 @@
1
+ import type { DetectResult } from './types';
2
+ export declare function detectContentType(text: string): DetectResult;
3
+ //# sourceMappingURL=detect.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect.d.ts","sourceRoot":"","sources":["../src/detect.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAe,YAAY,EAAE,MAAM,SAAS,CAAA;AAExD,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CA0D5D"}
package/dist/detect.js ADDED
@@ -0,0 +1,69 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.detectContentType = detectContentType;
4
/**
 * Classify `text` as json, html, yaml, markdown, code or plain text.
 *
 * Checks run in that order and the first one that fires wins:
 *  - json: the trimmed text starts with `{` or `[` (0.95 if it parses, else 0.7);
 *  - html: a doctype, an `<html` prefix, or a common HTML tag anywhere;
 *  - yaml: no `<` at all, and either a leading `---` or at least two
 *    `key: value` lines among the first 20 lines;
 *  - markdown / code: weighted pattern scores reaching 2 or more;
 *  - otherwise text.
 *
 * @param {string} text
 * @returns {{type: string, confidence: number}}
 */
function detectContentType(text) {
    const body = text.trim();
    const verdict = (type, confidence) => ({ type, confidence });
    // Sum the weights of every [pattern, weight] rule that matches the text.
    const score = (rules) => rules.reduce((total, [pattern, weight]) => (pattern.test(body) ? total + weight : total), 0);

    const lead = body.charAt(0);
    if (lead === '{' || lead === '[') {
        let parses = true;
        try {
            JSON.parse(body);
        }
        catch {
            parses = false;
        }
        return verdict('json', parses ? 0.95 : 0.7);
    }

    const htmlSignals = [
        /^<!doctype\s+html/i,
        /^<html/i,
        /<(div|p|span|body|head|h[1-6]|ul|ol|li|table|form|input|a\s|img\s)[^>]*>/i,
    ];
    if (htmlSignals.some((pattern) => pattern.test(body))) {
        return verdict('html', 0.9);
    }

    if (!body.includes('<')) {
        const keyValueCount = body
            .split('\n')
            .slice(0, 20)
            .filter((line) => /^\s*[\w-]+\s*:\s*.+/.test(line)).length;
        if (body.startsWith('---') || keyValueCount >= 2) {
            return verdict('yaml', 0.8);
        }
    }

    const markdownRules = [
        [/^#{1,6}\s+\S/m, 2],
        [/```/, 2],
        [/\*\*[^*]+\*\*/, 1],
        [/^[-*+]\s+\S/m, 1],
        [/^\[.+\]\(.+\)/m, 1],
    ];
    if (score(markdownRules) >= 2) {
        return verdict('markdown', 0.8);
    }

    const codeRules = [
        [/\bfunction\s+\w+\s*\(/, 2],
        [/\bclass\s+\w+/, 2],
        [/\bdef\s+\w+\s*\(/, 2],
        [/\bimport\s+[\w{*]/, 1],
        [/\bconst\s+\w+\s*=/, 1],
        [/\blet\s+\w+\s*=/, 1],
        [/\bvar\s+\w+\s*=/, 1],
        [/^\s{2,}.*[{};]$/m, 1],
    ];
    if (score(codeRules) >= 2) {
        return verdict('code', 0.7);
    }

    return verdict('text', 0.5);
}
69
+ //# sourceMappingURL=detect.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"detect.js","sourceRoot":"","sources":["../src/detect.ts"],"names":[],"mappings":";;AAEA,8CA0DC;AA1DD,SAAgB,iBAAiB,CAAC,IAAY;IAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;IAE3B,2BAA2B;IAC3B,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACvD,IAAI,CAAC;YACH,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;YACnB,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,IAAI,EAAE,CAAA;QAC1D,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;QACzD,CAAC;IACH,CAAC;IAED,gEAAgE;IAChE,IACE,oBAAoB,CAAC,IAAI,CAAC,OAAO,CAAC;QAClC,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC;QACvB,2EAA2E,CAAC,IAAI,CAAC,OAAO,CAAC,EACzF,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;IACzD,CAAC;IAED,8DAA8D;IAC9D,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QAC9C,MAAM,iBAAiB,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QAC1E,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAA;QAC3C,IAAI,SAAS,IAAI,iBAAiB,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YAC/C,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;QACzD,CAAC;IACH,CAAC;IAED,+DAA+D;IAC/D,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,IAAI,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAC/C,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IACrC,IAAI,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAC/C,IAAI,cAAc,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAC9C,IAAI,gBAAgB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAChD,IAAI,OAAO,IAAI,CAAC,EAAE,CAAC;QACjB,OAAO,EAAE,IAAI,EAAE,UAAyB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;IAC7D,CAAC;IAED,6BAA6B;IAC7B,IAAI,SAAS,GAAG,CAAC,CAAA;IACjB,IAAI,uBAAuB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACzD,IAAI,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACjD,IAAI,kBAAkB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACpD,IAAI,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACrD,IAAI,mB
AAmB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACrD,IAAI,iBAAiB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACnD,IAAI,iBAAiB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACnD,IAAI,kBAAkB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACpD,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;QACnB,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;IACzD,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;AACzD,CAAC"}
@@ -0,0 +1,4 @@
1
+ export { chunk, createChunker } from './chunker';
2
+ export { detectContentType } from './detect';
3
+ export type { ContentType, DetectResult, ChunkMetadata, Chunk, ChunkOptions, Chunker, } from './types';
4
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,aAAa,EAAE,MAAM,WAAW,CAAA;AAChD,OAAO,EAAE,iBAAiB,EAAE,MAAM,UAAU,CAAA;AAC5C,YAAY,EACV,WAAW,EACX,YAAY,EACZ,aAAa,EACb,KAAK,EACL,YAAY,EACZ,OAAO,GACR,MAAM,SAAS,CAAA"}
package/dist/index.js ADDED
@@ -0,0 +1,10 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.detectContentType = exports.createChunker = exports.chunk = void 0;
4
+ // chunk-smart - Structure-aware text chunker for RAG pipelines
5
+ var chunker_1 = require("./chunker");
6
+ Object.defineProperty(exports, "chunk", { enumerable: true, get: function () { return chunker_1.chunk; } });
7
+ Object.defineProperty(exports, "createChunker", { enumerable: true, get: function () { return chunker_1.createChunker; } });
8
+ var detect_1 = require("./detect");
9
+ Object.defineProperty(exports, "detectContentType", { enumerable: true, get: function () { return detect_1.detectContentType; } });
10
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,+DAA+D;AAC/D,qCAAgD;AAAvC,gGAAA,KAAK,OAAA;AAAE,wGAAA,aAAa,OAAA;AAC7B,mCAA4C;AAAnC,2GAAA,iBAAiB,OAAA"}
@@ -0,0 +1,7 @@
1
+ import type { ChunkOptions } from './types';
2
+ export declare function splitByTokenCount(text: string, maxTokens: number, overlap: number): string[];
3
+ export declare function splitMarkdown(text: string, options: ChunkOptions): string[];
4
+ export declare function splitCode(text: string, options: ChunkOptions): string[];
5
+ export declare function splitJSON(text: string, options: ChunkOptions): string[];
6
+ export declare function splitText(text: string, options: ChunkOptions): string[];
7
+ //# sourceMappingURL=split.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"split.d.ts","sourceRoot":"","sources":["../src/split.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,SAAS,CAAA;AAwD3C,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAiC5F;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CA0C3E;AAED,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CAkCvE;AA6BD,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CA2CvE;AAED,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CA8BvE"}
package/dist/split.js ADDED
@@ -0,0 +1,269 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.splitByTokenCount = splitByTokenCount;
4
+ exports.splitMarkdown = splitMarkdown;
5
+ exports.splitCode = splitCode;
6
+ exports.splitJSON = splitJSON;
7
+ exports.splitText = splitText;
8
/**
 * Convert a token budget into a character budget at four characters per token.
 *
 * @param {number} tokens
 * @returns {number}
 */
function tokensToChars(tokens) {
    const CHARS_PER_TOKEN = 4;
    return tokens * CHARS_PER_TOKEN;
}
11
/**
 * Break text into sentences.
 *
 * A boundary is any run of whitespace that directly follows `.`, `!` or `?`;
 * the punctuation stays with its sentence and the whitespace is discarded.
 * Empty pieces are dropped.
 *
 * @param {string} text
 * @returns {string[]}
 */
function splitSentences(text) {
    const sentences = [];
    for (const piece of text.split(/(?<=[.!?])\s+/)) {
        if (piece !== '') {
            sentences.push(piece);
        }
    }
    return sentences;
}
19
/**
 * Greedily pack sentences into chunks of at most `maxChars` characters.
 *
 * Sentences are joined with a single space. A sentence longer than the limit
 * is packed word by word instead, and a single word longer than the limit is
 * cut into fixed-size slices. Whatever remains of an oversized sentence after
 * word packing becomes the start of the next chunk.
 *
 * @param {string[]} sentences - Sentences in document order.
 * @param {number} maxChars - Character budget per chunk. Values below 1 are
 *   treated as 1, because the slicing loop advances by this amount and would
 *   otherwise never terminate.
 * @returns {string[]} Packed chunks, each right-trimmed.
 */
function packSentences(sentences, maxChars) {
    const limit = maxChars < 1 ? 1 : maxChars;
    const chunks = [];
    let current = '';
    for (const sentence of sentences) {
        if (sentence.length > limit) {
            // Single sentence exceeds the limit: flush, then pack at word boundaries.
            if (current.length > 0) {
                chunks.push(current.trimEnd());
                current = '';
            }
            let wordChunk = '';
            for (const word of sentence.split(/\s+/)) {
                if (wordChunk.length + word.length + 1 > limit && wordChunk.length > 0) {
                    chunks.push(wordChunk.trimEnd());
                    wordChunk = '';
                }
                // Single word exceeds the limit: hard split it.
                if (word.length > limit) {
                    for (let i = 0; i < word.length; i += limit) {
                        chunks.push(word.slice(i, i + limit));
                    }
                    continue;
                }
                wordChunk += (wordChunk.length > 0 ? ' ' : '') + word;
            }
            if (wordChunk.length > 0) {
                current = wordChunk;
            }
        }
        else if (current.length + sentence.length + 1 > limit && current.length > 0) {
            chunks.push(current.trimEnd());
            current = sentence;
        }
        else {
            current += (current.length > 0 ? ' ' : '') + sentence;
        }
    }
    if (current.length > 0) {
        chunks.push(current.trimEnd());
    }
    return chunks;
}
63
/**
 * Split text into windows of at most `maxTokens` tokens, overlapping
 * consecutive windows by `overlap` tokens.
 *
 * Budgets use the same four-characters-per-token estimate as `tokensToChars`.
 * A window is cut at its last sentence boundary when that boundary lies in
 * the second half of the window, otherwise at the character limit.
 *
 * The scan always moves forward. If the requested overlap would place the next
 * window at or before the start of the current one, the overlap is skipped for
 * that step. Without this, a large overlap either looped forever (sentence
 * snap) or silently dropped the rest of the text (hard cut).
 *
 * @param {string} text - Text to split.
 * @param {number} maxTokens - Token budget per chunk; budgets below one
 *   character are raised to one character. If it is not a number the text is
 *   returned unsplit.
 * @param {number} overlap - Tokens shared between consecutive chunks;
 *   negative or non-numeric values count as 0.
 * @returns {string[]} Non-empty chunks in document order.
 */
function splitByTokenCount(text, maxTokens, overlap) {
    const CHARS_PER_TOKEN = 4;
    const requestedChars = maxTokens * CHARS_PER_TOKEN;
    if (Number.isNaN(requestedChars) || text.length <= requestedChars) {
        return [text];
    }
    const maxChars = Math.max(1, Math.floor(requestedChars));
    const overlapChars = Math.max(0, Math.floor(overlap * CHARS_PER_TOKEN) || 0);
    const chunks = [];
    let pos = 0;
    while (pos < text.length) {
        const end = pos + maxChars;
        if (end >= text.length) {
            chunks.push(text.slice(pos));
            break;
        }
        // Index (within the window) of the punctuation ending the last sentence.
        const window = text.slice(pos, end);
        const sentenceMatch = window.search(/[.!?]\s+[^\s](?=[^.!?]*$)/);
        const boundary = sentenceMatch > maxChars / 2 ? pos + sentenceMatch + 1 : end;
        chunks.push(text.slice(pos, boundary).trimEnd());
        const next = boundary - overlapChars;
        // Guarantee progress: fall back to no overlap when it would stall the scan.
        pos = next > pos ? next : boundary;
    }
    return chunks.filter((c) => c.length > 0);
}
96
/**
 * Split markdown into sections at heading lines.
 *
 * A section starts at every line beginning with one to six `#` followed by
 * whitespace. Lines inside fenced code blocks (``` or ~~~) are never treated
 * as headings, so a `# comment` in a shell or Python snippet does not tear the
 * snippet apart. Sections that fit the budget are emitted whole; larger ones
 * are handed to `splitText`, which packs paragraphs and then sentences. Text
 * with fewer than two sections is handed to `splitText` as a whole.
 *
 * @param {string} text - Markdown source.
 * @param {object} options - Chunk options; `maxTokens` defaults to 512.
 * @returns {string[]} Non-blank chunks in document order.
 */
function splitMarkdown(text, options) {
    const maxChars = tokensToChars(options.maxTokens ?? 512);
    const sections = [];
    let current = '';
    // Fence marker that opened the code block we are inside, or null.
    let openFence = null;
    // Split after each line terminator, keeping it, so sections concatenate
    // back to the exact input.
    for (const line of text.split(/(?<=\n|\r(?!\n))/)) {
        const fence = /^ {0,3}(`{3,}|~{3,})(.*)/.exec(line);
        if (fence) {
            const run = fence[1];
            const rest = fence[2];
            if (openFence === null) {
                // A backtick fence cannot have backticks in its info string;
                // that form is inline code, not a fence.
                if (run[0] !== '`' || !rest.includes('`')) {
                    openFence = run;
                }
            }
            else if (run[0] === openFence[0] && run.length >= openFence.length && rest.trim() === '') {
                openFence = null;
            }
        }
        else if (openFence === null && /^#{1,6}\s/.test(line) && current.length > 0) {
            sections.push(current);
            current = '';
        }
        current += line;
    }
    if (current.length > 0) {
        sections.push(current);
    }
    const nonBlank = sections.filter((s) => s.trim().length > 0);
    if (nonBlank.length <= 1) {
        // No heading boundaries: split at paragraphs
        return splitText(text, options);
    }
    const result = [];
    for (const section of nonBlank) {
        if (section.length <= maxChars) {
            result.push(section);
        }
        else {
            // Section too big: paragraph packing, then sentence packing
            result.push(...splitText(section, options));
        }
    }
    return result.filter((c) => c.trim().length > 0);
}
138
/**
 * Split source code at declarations that start in column zero.
 *
 * Text within the budget is returned unchanged. Otherwise it is cut in front
 * of every line that begins (optionally after `export` and/or `async`) with
 * `function`, `class`, `def`, or a `const`/`let`/`var` assignment. Consecutive
 * units are concatenated while they fit the budget; a unit that is too large
 * on its own is split at blank lines. If no such declaration boundary exists,
 * the whole text is split at blank lines.
 *
 * @param {string} text - Source code.
 * @param {object} options - Chunk options; `maxTokens` defaults to 512.
 * @returns {string[]} Non-blank chunks in document order.
 */
function splitCode(text, options) {
    const limit = tokensToChars(options.maxTokens ?? 512);
    if (text.length <= limit) {
        return [text];
    }
    const declarationStart = /(?=^(?:export\s+)?(?:async\s+)?(?:function\s+\w|class\s+\w|def\s+\w|const\s+\w+\s*=|let\s+\w+\s*=|var\s+\w+\s*=))/m;
    const units = text.split(declarationStart).filter((unit) => unit.trim().length > 0);
    if (units.length <= 1) {
        return splitAtBlankLines(text, limit);
    }
    const pieces = [];
    let pending = '';
    // Emit the accumulated units (right-trimmed) and start a fresh accumulator.
    const flush = () => {
        if (pending.length > 0) {
            pieces.push(pending.trimEnd());
            pending = '';
        }
    };
    for (const unit of units) {
        if (unit.length > limit) {
            flush();
            pieces.push(...splitAtBlankLines(unit, limit));
            continue;
        }
        if (pending.length > 0 && pending.length + unit.length > limit) {
            flush();
        }
        pending += unit;
    }
    flush();
    return pieces.filter((piece) => piece.trim().length > 0);
}
171
/**
 * Cut one oversized block into pieces of at most `limit` characters,
 * preferring line breaks. Only a single line longer than the limit is sliced
 * mid-line.
 */
function splitOversizedBlock(block, limit) {
    const pieces = [];
    let pending = '';
    for (const line of block.split('\n')) {
        if (line.length > limit) {
            if (pending.length > 0) {
                pieces.push(pending);
                pending = '';
            }
            for (let i = 0; i < line.length; i += limit) {
                pieces.push(line.slice(i, i + limit));
            }
        }
        else if (pending.length > 0 && pending.length + 1 + line.length > limit) {
            pieces.push(pending);
            pending = line;
        }
        else {
            pending += (pending.length > 0 ? '\n' : '') + line;
        }
    }
    if (pending.length > 0) {
        pieces.push(pending);
    }
    return pieces;
}
/**
 * Split text at blank lines and pack the resulting blocks into chunks of at
 * most `maxChars` characters, joining packed blocks with one blank line.
 *
 * A blank line is a line that is empty or holds only spaces/tabs, with either
 * LF or CRLF endings. A block larger than the budget is split at line breaks
 * where possible, so code is not cut in the middle of a line.
 *
 * @param {string} text - Text to split.
 * @param {number} maxChars - Character budget per chunk. Values below 1 are
 *   treated as 1 so the slicing loop always advances.
 * @returns {string[]} Chunks in document order.
 */
function splitAtBlankLines(text, maxChars) {
    const limit = maxChars < 1 ? 1 : maxChars;
    const paragraphs = text.split(/\r?\n(?:[ \t]*\r?\n)+/).filter((p) => p.trim().length > 0);
    const result = [];
    let current = '';
    for (const para of paragraphs) {
        if (para.length > limit) {
            if (current.length > 0) {
                result.push(current.trimEnd());
                current = '';
            }
            result.push(...splitOversizedBlock(para, limit));
        }
        else if (current.length + para.length + 2 > limit && current.length > 0) {
            result.push(current.trimEnd());
            current = para;
        }
        else {
            current += (current.length > 0 ? '\n\n' : '') + para;
        }
    }
    if (current.length > 0) {
        result.push(current.trimEnd());
    }
    return result;
}
198
/**
 * Split a JSON document into chunks that are each valid JSON.
 *
 * - Text that already fits the budget is returned unchanged.
 * - Text that does not parse falls back to `splitByTokenCount`.
 * - Arrays are split into groups of consecutive elements; objects into groups
 *   of consecutive top-level keys. Each group is serialized with two-space
 *   indentation and closed as soon as adding the next entry would exceed the
 *   budget. A single entry larger than the budget is emitted on its own.
 * - Scalars are returned as the original text; empty containers are returned
 *   in compact form (`[]` / `{}`) rather than producing no chunk at all.
 *
 * @param {string} text - JSON source.
 * @param {object} options - Chunk options; `maxTokens` defaults to 512 and
 *   `overlap` to 0 (overlap is only used by the non-JSON fallback).
 * @returns {string[]} Non-blank chunks in document order.
 */
function splitJSON(text, options) {
    const maxChars = tokensToChars(options.maxTokens ?? 512);
    if (text.length <= maxChars) {
        return [text];
    }
    let parsed;
    try {
        parsed = JSON.parse(text);
    }
    catch {
        return splitByTokenCount(text, options.maxTokens ?? 512, options.overlap ?? 0);
    }
    if (parsed === null || typeof parsed !== 'object') {
        return [text];
    }
    const chunks = [];
    if (Array.isArray(parsed)) {
        if (parsed.length === 0) {
            return [JSON.stringify(parsed)];
        }
        // Pack by each element's real serialized size rather than an average,
        // so one large element cannot push a whole group far over budget.
        let group = [];
        let groupSize = 0;
        for (const item of parsed) {
            const itemSize = JSON.stringify([item], null, 2).length;
            if (groupSize + itemSize > maxChars && groupSize > 0) {
                chunks.push(JSON.stringify(group, null, 2));
                group = [];
                groupSize = 0;
            }
            group.push(item);
            groupSize += itemSize;
        }
        if (group.length > 0) {
            chunks.push(JSON.stringify(group, null, 2));
        }
    }
    else {
        const keys = Object.keys(parsed);
        if (keys.length === 0) {
            return [JSON.stringify(parsed)];
        }
        // Prototype-less accumulator: a "__proto__" key in the input is then
        // stored as ordinary data instead of replacing the object's prototype.
        let currentObj = Object.create(null);
        let currentSize = 0;
        for (const key of keys) {
            const entry = JSON.stringify({ [key]: parsed[key] }, null, 2);
            if (currentSize + entry.length > maxChars && currentSize > 0) {
                chunks.push(JSON.stringify(currentObj, null, 2));
                currentObj = Object.create(null);
                currentSize = 0;
            }
            currentObj[key] = parsed[key];
            currentSize += entry.length;
        }
        if (Object.keys(currentObj).length > 0) {
            chunks.push(JSON.stringify(currentObj, null, 2));
        }
    }
    return chunks.filter((c) => c.trim().length > 0);
}
239
/**
 * Split plain text at paragraph boundaries, then at sentences.
 *
 * Text within the budget is returned unchanged. Otherwise paragraphs
 * (separated by one or more blank lines) are packed greedily and re-joined
 * with a single blank line; a paragraph that is too large on its own is
 * packed sentence by sentence.
 *
 * A blank line is a line that is empty or holds only spaces/tabs, with either
 * LF or CRLF endings. Matching only consecutive "\n" would treat Windows-style
 * text as one giant paragraph.
 *
 * @param {string} text - Text to split.
 * @param {object} options - Chunk options; `maxTokens` defaults to 512.
 * @returns {string[]} Non-blank chunks in document order.
 */
function splitText(text, options) {
    const maxChars = tokensToChars(options.maxTokens ?? 512);
    if (text.length <= maxChars) {
        return [text];
    }
    const paragraphs = text.split(/\r?\n(?:[ \t]*\r?\n)+/).filter((p) => p.trim().length > 0);
    const result = [];
    let current = '';
    for (const para of paragraphs) {
        if (para.length > maxChars) {
            if (current.length > 0) {
                result.push(current.trimEnd());
                current = '';
            }
            // Paragraph too big: split at sentence boundaries
            result.push(...packSentences(splitSentences(para), maxChars));
        }
        else if (current.length + para.length + 2 > maxChars && current.length > 0) {
            result.push(current.trimEnd());
            current = para;
        }
        else {
            current += (current.length > 0 ? '\n\n' : '') + para;
        }
    }
    if (current.length > 0) {
        result.push(current.trimEnd());
    }
    return result.filter((c) => c.trim().length > 0);
}
269
+ //# sourceMappingURL=split.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"split.js","sourceRoot":"","sources":["../src/split.ts"],"names":[],"mappings":";;AAwDA,8CAiCC;AAED,sCA0CC;AAED,8BAkCC;AA6BD,8BA2CC;AAED,8BA8BC;AA/QD,SAAS,aAAa,CAAC,MAAc;IACnC,OAAO,MAAM,GAAG,CAAC,CAAA;AACnB,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CAAC,IAAY;IAClC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,CAAA;IACzC,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AACxC,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,SAAmB,EAAE,QAAgB;IAC1D,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAChB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,QAAQ,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC/B,0DAA0D;YAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;YACnC,IAAI,SAAS,GAAG,EAAE,CAAA;YAClB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,SAAS,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC1E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC,CAAA;oBAChC,SAAS,GAAG,EAAE,CAAA;gBAChB,CAAC;gBACD,8CAA8C;gBAC9C,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;oBAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,IAAI,QAAQ,EAAE,CAAC;wBAC/C,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAA;oBAC1C,CAAC;oBACD,SAAQ;gBACV,CAAC;gBACD,SAAS,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;YACvD,CAAC;YACD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC;gBAAE,OAAO,GAAG,SAAS,CAAA;QAC/C,CAAC;aAAM,IAAI,OAAO,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjF,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,QAAQ,CAAA;QACpB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAA;QACvD,CAAC;IACH,CAAC;IACD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO
,MAAM,CAAA;AACf,CAAC;AAED,SAAgB,iBAAiB,CAAC,IAAY,EAAE,SAAiB,EAAE,OAAe;IAChF,MAAM,QAAQ,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACzC,MAAM,YAAY,GAAG,aAAa,CAAC,OAAO,CAAC,CAAA;IAE3C,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAE1C,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,GAAG,GAAG,CAAC,CAAA;IAEX,OAAO,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACzB,IAAI,GAAG,GAAG,GAAG,GAAG,QAAQ,CAAA;QACxB,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;YAC5B,MAAK;QACP,CAAC;QAED,2CAA2C;QAC3C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,CAAC,CAAA;QACnC,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,2BAA2B,CAAC,CAAA;QAChE,IAAI,aAAa,GAAG,QAAQ,GAAG,CAAC,EAAE,CAAC;YACjC,4BAA4B;YAC5B,MAAM,OAAO,GAAG,GAAG,GAAG,aAAa,GAAG,CAAC,CAAA;YACvC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC,CAAA;YAC/C,GAAG,GAAG,OAAO,GAAG,YAAY,CAAA;YAC5B,IAAI,GAAG,GAAG,CAAC;gBAAE,GAAG,GAAG,CAAC,CAAA;QACtB,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAA;YAC3C,GAAG,GAAG,GAAG,GAAG,YAAY,CAAA;YACxB,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM;gBAAE,MAAK;QAC3C,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AACzC,CAAC;AAED,SAAgB,aAAa,CAAC,IAAY,EAAE,OAAqB;IAC/D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,8BAA8B;IAC9B,MAAM,YAAY,GAAG,iBAAiB,CAAA;IACtC,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAE1E,IAAI,QAAQ,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACzB,mCAAmC;QACnC,OAAO,SAAS,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IACjC,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAE3B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,IAAI,OAAO,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;QACtB,CAAC;aAAM,CAAC;YACN,iDAAiD;YACjD,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;YAC3E,I
AAI,OAAO,GAAG,EAAE,CAAA;YAChB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;gBAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;oBAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;wBAC9B,OAAO,GAAG,EAAE,CAAA;oBACd,CAAC;oBACD,wCAAwC;oBACxC,MAAM,cAAc,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,QAAQ,CAAC,CAAA;oBACpE,MAAM,CAAC,IAAI,CAAC,GAAG,cAAc,CAAC,CAAA;gBAChC,CAAC;qBAAM,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC7E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;oBAC9B,OAAO,GAAG,IAAI,CAAA;gBAChB,CAAC;qBAAM,CAAC;oBACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;gBACtD,CAAC;YACH,CAAC;YACD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;gBAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;QACxD,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC;AAED,SAAgB,SAAS,CAAC,IAAY,EAAE,OAAqB;IAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAE1C,6DAA6D;IAC7D,MAAM,aAAa,GAAG,oHAAoH,CAAA;IAC1I,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAEzE,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACvB,gDAAgD;QAChD,OAAO,iBAAiB,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAA;IAC1C,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAEhB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC5B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,GAAG,iBAAiB,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAA;QACpD,CAAC;aAAM,IAAI,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,KAAK,CAAA;QACjB,CAAC;aAAM,CAAC;
YACN,OAAO,IAAI,KAAK,CAAA;QAClB,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,QAAgB;IACvD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IACxE,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAEhB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,sBAAsB;YACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,IAAI,QAAQ,EAAE,CAAC;gBAC/C,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAA;YAC1C,CAAC;QACH,CAAC;aAAM,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,IAAI,CAAA;QAChB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;QACtD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAgB,SAAS,CAAC,IAAY,EAAE,OAAqB;IAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,IAAI,MAAe,CAAA;IACnB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,iBAAiB,CAAC,IAAI,EAAE,OAAO,CAAC,SAAS,IAAI,GAAG,EAAE,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAA;IAChF,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAE3B,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC1B,mCAAmC;QACnC,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA;QAClG,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAA
C,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,aAAa,EAAE,CAAC;YACtD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;QAC1E,CAAC;IACH,CAAC;SAAM,IAAI,MAAM,KAAK,IAAI,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QACzD,MAAM,GAAG,GAAG,MAAiC,CAAA;QAC7C,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAC7B,IAAI,UAAU,GAA4B,EAAE,CAAA;QAC5C,IAAI,WAAW,GAAG,CAAC,CAAA;QAEnB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,CAAA;YAC1D,IAAI,WAAW,GAAG,KAAK,CAAC,MAAM,GAAG,QAAQ,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;gBAC7D,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;gBAChD,UAAU,GAAG,EAAE,CAAA;gBACf,WAAW,GAAG,CAAC,CAAA;YACjB,CAAC;YACD,UAAU,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,CAAA;YAC1B,WAAW,IAAI,KAAK,CAAC,MAAM,CAAA;QAC7B,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;QAClD,CAAC;IACH,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,IAAI,CAAC,CAAA;IACf,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC;AAED,SAAgB,SAAS,CAAC,IAAY,EAAE,OAAqB;IAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAE1C,iDAAiD;IACjD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAExE,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAEhB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,+BAA+B;YAC/B,MAAM,cAAc,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,QAAQ,CAAC,CAAA;YACpE,MAAM,CAAC,IAAI,CAAC,GAAG,cAAc,CAAC,CAAA;QAChC,CAAC;aAAM,IAAI
,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,IAAI,CAAA;QAChB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;QACtD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC"}
@@ -0,0 +1,36 @@
1
/**
 * Content-type label: exactly one of the six string literals below.
 * Used by `DetectResult.type`, `ChunkMetadata.contentType` and
 * `ChunkOptions.contentType`.
 */
+ export type ContentType = 'markdown' | 'code' | 'html' | 'json' | 'yaml' | 'text';
2
/**
 * Return type of `Chunker.detectContentType`.
 *
 * - `type`: the content-type label.
 * - `confidence`: a number attached to that label.
 *   NOTE(review): its scale/range is not visible in this declaration — confirm
 *   against the detector implementation.
 */
+ export interface DetectResult {
3
+ type: ContentType;
4
+ confidence: number;
5
+ }
6
/**
 * Metadata carried by every `Chunk` (see `Chunk.metadata`).
 *
 * All fields are required except `codeLanguage`. The declaration only fixes
 * the field types; the notes below are reviewer assumptions to confirm against
 * the implementation:
 * - `index`: presumably the chunk's position in the returned array.
 * - `startOffset` / `endOffset`: presumably the chunk's span in the input
 *   text; unit and inclusivity are not visible here — TODO confirm.
 * - `tokenCount` / `charCount`: sizes of the chunk; how tokens are counted is
 *   not shown in this file.
 * - `contentType`: the content-type label associated with the chunk.
 * - `headings`: list of strings; presumably the enclosing heading trail for
 *   markdown input — confirm.
 * - `codeLanguage`: optional string; when it is set is not visible here.
 * - `overlapBefore` / `overlapAfter`: presumably the amount of content shared
 *   with the neighbouring chunks; unit not visible here — confirm.
 */
+ export interface ChunkMetadata {
7
+ index: number;
8
+ startOffset: number;
9
+ endOffset: number;
10
+ tokenCount: number;
11
+ charCount: number;
12
+ contentType: ContentType;
13
+ headings: string[];
14
+ codeLanguage?: string;
15
+ overlapBefore: number;
16
+ overlapAfter: number;
17
+ }
18
/**
 * One element of the arrays returned by the `Chunker` methods: the chunk's
 * text (`content`) together with its `ChunkMetadata` (`metadata`).
 */
+ export interface Chunk {
19
+ content: string;
20
+ metadata: ChunkMetadata;
21
+ }
22
/**
 * Options accepted by the chunking functions. Every field is optional.
 *
 * - `maxTokens`: token budget per chunk; `splitText` in split.js uses 512
 *   when this is absent.
 * - `minTokens`: NOTE(review): presumably a lower bound on chunk size — its
 *   effect is not visible in this declaration; confirm.
 * - `overlap`: NOTE(review): presumably the amount of content repeated between
 *   adjacent chunks; unit not visible here — confirm.
 * - `contentType`: NOTE(review): presumably forces a content type instead of
 *   relying on detection — confirm.
 * - `preserveStructure`: boolean flag; its effect is not visible here.
 */
+ export interface ChunkOptions {
23
+ maxTokens?: number;
24
+ minTokens?: number;
25
+ overlap?: number;
26
+ contentType?: ContentType;
27
+ preserveStructure?: boolean;
28
+ }
29
/**
 * Shape of a chunker object.
 *
 * The four chunking methods take the input text plus an optional
 * `Partial<ChunkOptions>` and return an array of `Chunk`. Note that `chunk`
 * names its second parameter `overrides` while the others name it `options`.
 * NOTE(review): presumably `chunk` picks the splitting strategy itself while
 * `chunkMarkdown` / `chunkCode` / `chunkJSON` force one — confirm against the
 * implementation.
 *
 * `detectContentType` takes the text only and returns a `DetectResult`.
 */
+ export interface Chunker {
30
+ chunk(text: string, overrides?: Partial<ChunkOptions>): Chunk[];
31
+ chunkMarkdown(text: string, options?: Partial<ChunkOptions>): Chunk[];
32
+ chunkCode(text: string, options?: Partial<ChunkOptions>): Chunk[];
33
+ chunkJSON(text: string, options?: Partial<ChunkOptions>): Chunk[];
34
+ detectContentType(text: string): DetectResult;
35
+ }
36
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,WAAW,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,CAAA;AAEjF,MAAM,WAAW,YAAY;IAAG,IAAI,EAAE,WAAW,CAAC;IAAC,UAAU,EAAE,MAAM,CAAA;CAAE;AAEvE,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,MAAM,CAAA;IACb,WAAW,EAAE,MAAM,CAAA;IACnB,SAAS,EAAE,MAAM,CAAA;IACjB,UAAU,EAAE,MAAM,CAAA;IAClB,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,EAAE,WAAW,CAAA;IACxB,QAAQ,EAAE,MAAM,EAAE,CAAA;IAClB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,YAAY,EAAE,MAAM,CAAA;CACrB;AAED,MAAM,WAAW,KAAK;IAAG,OAAO,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,aAAa,CAAA;CAAE;AAEnE,MAAM,WAAW,YAAY;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,WAAW,CAAC,EAAE,WAAW,CAAA;IACzB,iBAAiB,CAAC,EAAE,OAAO,CAAA;CAC5B;AAED,MAAM,WAAW,OAAO;IACtB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IAC/D,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IACrE,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IACjE,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IACjE,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAAA;CAC9C"}
package/dist/types.js ADDED
@@ -0,0 +1,3 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
package/package.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "name": "chunk-smart",
3
+ "version": "0.1.0",
4
+ "description": "Structure-aware text chunker for RAG pipelines",
5
+ "main": "dist/index.js",
6
+ "types": "dist/index.d.ts",
7
+ "files": [
8
+ "dist"
9
+ ],
10
+ "scripts": {
11
+ "build": "tsc",
12
+ "test": "vitest run",
13
+ "lint": "eslint src/",
14
+ "prepublishOnly": "npm run build"
15
+ },
16
+ "keywords": [
17
+ "chunk",
18
+ "rag",
19
+ "llm",
20
+ "text-splitting",
21
+ "markdown",
22
+ "tokenizer"
23
+ ],
24
+ "author": "",
25
+ "license": "MIT",
26
+ "engines": {
27
+ "node": ">=18"
28
+ },
29
+ "publishConfig": {
30
+ "access": "public"
31
+ },
32
+ "devDependencies": {
33
+ "@types/node": "^25.5.0",
34
+ "@typescript-eslint/eslint-plugin": "^8.57.1",
35
+ "@typescript-eslint/parser": "^8.57.1",
36
+ "eslint": "^10.1.0",
37
+ "typescript": "^5.9.3",
38
+ "vitest": "^4.1.0"
39
+ }
40
+ }