chunk-smart 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +77 -0
- package/dist/__tests__/chunk.test.d.ts +2 -0
- package/dist/__tests__/chunk.test.d.ts.map +1 -0
- package/dist/__tests__/chunk.test.js +129 -0
- package/dist/__tests__/chunk.test.js.map +1 -0
- package/dist/__tests__/detect.test.d.ts +2 -0
- package/dist/__tests__/detect.test.d.ts.map +1 -0
- package/dist/__tests__/detect.test.js +72 -0
- package/dist/__tests__/detect.test.js.map +1 -0
- package/dist/__tests__/split.test.d.ts +2 -0
- package/dist/__tests__/split.test.d.ts.map +1 -0
- package/dist/__tests__/split.test.js +115 -0
- package/dist/__tests__/split.test.js.map +1 -0
- package/dist/chunker.d.ts +4 -0
- package/dist/chunker.d.ts.map +1 -0
- package/dist/chunker.js +136 -0
- package/dist/chunker.js.map +1 -0
- package/dist/detect.d.ts +3 -0
- package/dist/detect.d.ts.map +1 -0
- package/dist/detect.js +69 -0
- package/dist/detect.js.map +1 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +10 -0
- package/dist/index.js.map +1 -0
- package/dist/split.d.ts +7 -0
- package/dist/split.d.ts.map +1 -0
- package/dist/split.js +269 -0
- package/dist/split.js.map +1 -0
- package/dist/types.d.ts +36 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/package.json +40 -0
package/README.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
# chunk-smart
|
|
2
|
+
|
|
3
|
+
Structure-aware text chunker for RAG pipelines. Detects content type (markdown, code, JSON, HTML, YAML, plain text) and splits at natural boundaries — headings, functions, top-level keys, paragraphs, sentences — rather than blindly by character count.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install chunk-smart
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Quick Start
|
|
12
|
+
|
|
13
|
+
```typescript
|
|
14
|
+
import { chunk } from 'chunk-smart'
|
|
15
|
+
|
|
16
|
+
const chunks = chunk(myText)
|
|
17
|
+
// Auto-detects content type and splits at natural boundaries
|
|
18
|
+
|
|
19
|
+
chunks.forEach(c => {
|
|
20
|
+
console.log(c.metadata.contentType, c.metadata.tokenCount, c.content.slice(0, 80))
|
|
21
|
+
})
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
## createChunker
|
|
25
|
+
|
|
26
|
+
```typescript
|
|
27
|
+
import { createChunker } from 'chunk-smart'
|
|
28
|
+
|
|
29
|
+
const chunker = createChunker({ maxTokens: 256, overlap: 20 })
|
|
30
|
+
|
|
31
|
+
const mdChunks = chunker.chunkMarkdown(markdownText)
|
|
32
|
+
const codeChunks = chunker.chunkCode(sourceCode)
|
|
33
|
+
const jsonChunks = chunker.chunkJSON(jsonString)
|
|
34
|
+
const detected = chunker.detectContentType(someText)
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## Content Types
|
|
38
|
+
|
|
39
|
+
| Type | Splits at |
|
|
40
|
+
|------------|------------------------------------------------|
|
|
41
|
+
| `markdown` | `#` heading boundaries, then paragraphs |
|
|
42
|
+
| `code` | `function`/`class`/`def` boundaries, then blank lines |
|
|
43
|
+
| `json` | top-level object keys or array element groups |
|
|
44
|
+
| `html` | paragraph/sentence boundaries |
|
|
45
|
+
| `yaml` | paragraph/sentence boundaries |
|
|
46
|
+
| `text` | paragraph boundaries (`\n\n`), then sentences |
|
|
47
|
+
|
|
48
|
+
## ChunkOptions
|
|
49
|
+
|
|
50
|
+
| Option | Type | Default | Description |
|
|
51
|
+
|--------------------|---------------|---------|--------------------------------------|
|
|
52
|
+
| `maxTokens` | `number` | `512` | Max tokens per chunk (1 token ≈ 4 chars) |
|
|
53
|
+
| `minTokens` | `number` | `50` | Min tokens per chunk (informational) |
|
|
54
|
+
| `overlap` | `number` | `0` | Overlap in tokens between chunks |
|
|
55
|
+
| `contentType` | `ContentType` | `'auto'`| Force a specific content type |
|
|
56
|
+
| `preserveStructure`| `boolean` | `true` | Use boundary-aware splitting |
|
|
57
|
+
|
|
58
|
+
## ChunkMetadata
|
|
59
|
+
|
|
60
|
+
```typescript
|
|
61
|
+
interface ChunkMetadata {
|
|
62
|
+
index: number // position in result array
|
|
63
|
+
startOffset: number // char offset in original text
|
|
64
|
+
endOffset: number // char offset in original text
|
|
65
|
+
tokenCount: number // Math.ceil(content.length / 4)
|
|
66
|
+
charCount: number // content.length
|
|
67
|
+
contentType: ContentType
|
|
68
|
+
headings: string[] // ancestor headings (markdown only)
|
|
69
|
+
codeLanguage?: string // detected from ``` fence or shebang
|
|
70
|
+
overlapBefore: number // overlap chars with previous chunk
|
|
71
|
+
overlapAfter: number // overlap chars with next chunk
|
|
72
|
+
}
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunk.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/chunk.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
const vitest_1 = require("vitest");
|
|
4
|
+
const chunker_1 = require("../chunker");
|
|
5
|
+
(0, vitest_1.describe)('chunk()', () => {
|
|
6
|
+
(0, vitest_1.it)('returns an array of Chunk objects', () => {
|
|
7
|
+
const text = `# Title\n\nSome paragraph text here.\n\n## Section\n\nMore text in section.`;
|
|
8
|
+
const result = (0, chunker_1.chunk)(text);
|
|
9
|
+
(0, vitest_1.expect)(Array.isArray(result)).toBe(true);
|
|
10
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThan(0);
|
|
11
|
+
for (const c of result) {
|
|
12
|
+
(0, vitest_1.expect)(typeof c.content).toBe('string');
|
|
13
|
+
(0, vitest_1.expect)(c.metadata).toBeDefined();
|
|
14
|
+
(0, vitest_1.expect)(typeof c.metadata.index).toBe('number');
|
|
15
|
+
(0, vitest_1.expect)(typeof c.metadata.tokenCount).toBe('number');
|
|
16
|
+
(0, vitest_1.expect)(typeof c.metadata.charCount).toBe('number');
|
|
17
|
+
(0, vitest_1.expect)(typeof c.metadata.startOffset).toBe('number');
|
|
18
|
+
(0, vitest_1.expect)(typeof c.metadata.endOffset).toBe('number');
|
|
19
|
+
}
|
|
20
|
+
});
|
|
21
|
+
(0, vitest_1.it)('respects maxTokens option', () => {
|
|
22
|
+
const text = 'Word '.repeat(500);
|
|
23
|
+
const result = (0, chunker_1.chunk)(text, { maxTokens: 50, contentType: 'text' });
|
|
24
|
+
for (const c of result) {
|
|
25
|
+
(0, vitest_1.expect)(c.metadata.tokenCount).toBeLessThanOrEqual(55); // small tolerance
|
|
26
|
+
}
|
|
27
|
+
});
|
|
28
|
+
(0, vitest_1.it)('assigns correct contentType in metadata for markdown', () => {
|
|
29
|
+
const text = `# Heading\n\nParagraph text here.\n\n## Another heading\n\nMore text.`;
|
|
30
|
+
const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
|
|
31
|
+
for (const c of result) {
|
|
32
|
+
(0, vitest_1.expect)(c.metadata.contentType).toBe('markdown');
|
|
33
|
+
}
|
|
34
|
+
});
|
|
35
|
+
(0, vitest_1.it)('assigns correct contentType in metadata for code', () => {
|
|
36
|
+
const code = `function foo() { return 1 }\nfunction bar() { return 2 }`;
|
|
37
|
+
const result = (0, chunker_1.chunk)(code, { contentType: 'code' });
|
|
38
|
+
for (const c of result) {
|
|
39
|
+
(0, vitest_1.expect)(c.metadata.contentType).toBe('code');
|
|
40
|
+
}
|
|
41
|
+
});
|
|
42
|
+
(0, vitest_1.it)('assigns sequential indices to chunks', () => {
|
|
43
|
+
const text = 'First paragraph.\n\nSecond paragraph.\n\nThird paragraph.';
|
|
44
|
+
const result = (0, chunker_1.chunk)(text, { maxTokens: 5, contentType: 'text' });
|
|
45
|
+
result.forEach((c, i) => {
|
|
46
|
+
(0, vitest_1.expect)(c.metadata.index).toBe(i);
|
|
47
|
+
});
|
|
48
|
+
});
|
|
49
|
+
(0, vitest_1.it)('extracts headings from markdown chunks', () => {
|
|
50
|
+
const text = `# Main Heading\n\nSome content under main heading.\n\n## Sub Heading\n\nSome content under sub.`;
|
|
51
|
+
const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
|
|
52
|
+
const withHeadings = result.filter(c => c.metadata.headings.length > 0);
|
|
53
|
+
(0, vitest_1.expect)(withHeadings.length).toBeGreaterThan(0);
|
|
54
|
+
});
|
|
55
|
+
(0, vitest_1.it)('detects JSON content type automatically', () => {
|
|
56
|
+
const json = JSON.stringify({ foo: 'bar', baz: 42 });
|
|
57
|
+
const result = (0, chunker_1.chunk)(json);
|
|
58
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThan(0);
|
|
59
|
+
(0, vitest_1.expect)(result[0].metadata.contentType).toBe('json');
|
|
60
|
+
});
|
|
61
|
+
(0, vitest_1.it)('handles empty string gracefully', () => {
|
|
62
|
+
const result = (0, chunker_1.chunk)('');
|
|
63
|
+
(0, vitest_1.expect)(Array.isArray(result)).toBe(true);
|
|
64
|
+
(0, vitest_1.expect)(result.length).toBe(0);
|
|
65
|
+
});
|
|
66
|
+
(0, vitest_1.it)('charCount matches content length', () => {
|
|
67
|
+
const text = `# Title\n\nA paragraph of reasonable length here.\n\n## Section\n\nAnother paragraph.`;
|
|
68
|
+
const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
|
|
69
|
+
for (const c of result) {
|
|
70
|
+
(0, vitest_1.expect)(c.metadata.charCount).toBe(c.content.length);
|
|
71
|
+
}
|
|
72
|
+
});
|
|
73
|
+
(0, vitest_1.it)('tokenCount is ceil(charCount / 4)', () => {
|
|
74
|
+
const text = `# Title\n\nA paragraph of reasonable length here.`;
|
|
75
|
+
const result = (0, chunker_1.chunk)(text, { contentType: 'markdown' });
|
|
76
|
+
for (const c of result) {
|
|
77
|
+
(0, vitest_1.expect)(c.metadata.tokenCount).toBe(Math.ceil(c.content.length / 4));
|
|
78
|
+
}
|
|
79
|
+
});
|
|
80
|
+
});
|
|
81
|
+
(0, vitest_1.describe)('createChunker()', () => {
|
|
82
|
+
(0, vitest_1.it)('returns a Chunker object with all methods', () => {
|
|
83
|
+
const chunker = (0, chunker_1.createChunker)();
|
|
84
|
+
(0, vitest_1.expect)(typeof chunker.chunk).toBe('function');
|
|
85
|
+
(0, vitest_1.expect)(typeof chunker.chunkMarkdown).toBe('function');
|
|
86
|
+
(0, vitest_1.expect)(typeof chunker.chunkCode).toBe('function');
|
|
87
|
+
(0, vitest_1.expect)(typeof chunker.chunkJSON).toBe('function');
|
|
88
|
+
(0, vitest_1.expect)(typeof chunker.detectContentType).toBe('function');
|
|
89
|
+
});
|
|
90
|
+
(0, vitest_1.it)('applies default options to all chunk calls', () => {
|
|
91
|
+
const chunker = (0, chunker_1.createChunker)({ maxTokens: 20 });
|
|
92
|
+
const text = 'A'.repeat(400);
|
|
93
|
+
const result = chunker.chunk(text, { contentType: 'text' });
|
|
94
|
+
for (const c of result) {
|
|
95
|
+
(0, vitest_1.expect)(c.metadata.tokenCount).toBeLessThanOrEqual(25);
|
|
96
|
+
}
|
|
97
|
+
});
|
|
98
|
+
(0, vitest_1.it)('chunkMarkdown forces markdown content type', () => {
|
|
99
|
+
const text = `# Heading\n\nText here.\n\n## Other\n\nMore text.`;
|
|
100
|
+
const chunker = (0, chunker_1.createChunker)();
|
|
101
|
+
const result = chunker.chunkMarkdown(text);
|
|
102
|
+
for (const c of result) {
|
|
103
|
+
(0, vitest_1.expect)(c.metadata.contentType).toBe('markdown');
|
|
104
|
+
}
|
|
105
|
+
});
|
|
106
|
+
(0, vitest_1.it)('chunkCode forces code content type', () => {
|
|
107
|
+
const code = `function a() {}\nfunction b() {}`;
|
|
108
|
+
const chunker = (0, chunker_1.createChunker)();
|
|
109
|
+
const result = chunker.chunkCode(code);
|
|
110
|
+
for (const c of result) {
|
|
111
|
+
(0, vitest_1.expect)(c.metadata.contentType).toBe('code');
|
|
112
|
+
}
|
|
113
|
+
});
|
|
114
|
+
(0, vitest_1.it)('chunkJSON forces json content type', () => {
|
|
115
|
+
const json = '{"a":1,"b":2}';
|
|
116
|
+
const chunker = (0, chunker_1.createChunker)();
|
|
117
|
+
const result = chunker.chunkJSON(json);
|
|
118
|
+
for (const c of result) {
|
|
119
|
+
(0, vitest_1.expect)(c.metadata.contentType).toBe('json');
|
|
120
|
+
}
|
|
121
|
+
});
|
|
122
|
+
(0, vitest_1.it)('detectContentType delegates to detect module', () => {
|
|
123
|
+
const chunker = (0, chunker_1.createChunker)();
|
|
124
|
+
const result = chunker.detectContentType('{"x": 1}');
|
|
125
|
+
(0, vitest_1.expect)(result.type).toBe('json');
|
|
126
|
+
(0, vitest_1.expect)(result.confidence).toBeGreaterThan(0);
|
|
127
|
+
});
|
|
128
|
+
});
|
|
129
|
+
//# sourceMappingURL=chunk.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunk.test.js","sourceRoot":"","sources":["../../src/__tests__/chunk.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,wCAAiD;AAEjD,IAAA,iBAAQ,EAAC,SAAS,EAAE,GAAG,EAAE;IACvB,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,6EAA6E,CAAA;QAC1F,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,CAAC,CAAA;QAC1B,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YACvC,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,WAAW,EAAE,CAAA;YAChC,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YAC9C,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YACnD,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YAClD,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;YACpD,IAAA,eAAM,EAAC,OAAO,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAA;QACpD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,IAAI,GAAG,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAChC,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,SAAS,EAAE,EAAE,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QAClE,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,mBAAmB,CAAC,EAAE,CAAC,CAAA,CAAC,kBAAkB;QAC1E,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,MAAM,IAAI,GAAG,uEAAuE,CAAA;QACpF,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,kDAAkD,EAAE,GAAG,EAAE;QAC1D,MAAM,IAAI,GAAG,0DAA0D,CAAA;QACvE,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QACnD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eA
AM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,2DAA2D,CAAA;QACxE,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QACjE,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;YACtB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAClC,CAAC,CAAC,CAAA;IACJ,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,wCAAwC,EAAE,GAAG,EAAE;QAChD,MAAM,IAAI,GAAG,iGAAiG,CAAA;QAC9G,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,MAAM,YAAY,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;QACvE,IAAA,eAAM,EAAC,YAAY,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAChD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,yCAAyC,EAAE,GAAG,EAAE;QACjD,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,EAAE,GAAG,EAAE,KAAK,EAAE,GAAG,EAAE,EAAE,EAAE,CAAC,CAAA;QACpD,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,CAAC,CAAA;QAC1B,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IACrD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,EAAE,CAAC,CAAA;QACxB,IAAA,eAAM,EAAC,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IAC/B,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,kCAAkC,EAAE,GAAG,EAAE;QAC1C,MAAM,IAAI,GAAG,uFAAuF,CAAA;QACpG,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAA;QACrD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,mDAAmD,CAAA;QAChE,MAAM,MAAM,GAAG,IAAA,eAAK,EAAC,IAAI,EAAE,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAA;QACvD,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAA
A,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAA;QACrE,CAAC;IACH,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,iBAAiB,EAAE,GAAG,EAAE;IAC/B,IAAA,WAAE,EAAC,2CAA2C,EAAE,GAAG,EAAE;QACnD,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,aAAa,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACrD,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,IAAA,eAAM,EAAC,OAAO,OAAO,CAAC,iBAAiB,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;IAC3D,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,OAAO,GAAG,IAAA,uBAAa,EAAC,EAAE,SAAS,EAAE,EAAE,EAAE,CAAC,CAAA;QAChD,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC5B,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,EAAE,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAA;QAC3D,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,UAAU,CAAC,CAAC,mBAAmB,CAAC,EAAE,CAAC,CAAA;QACvD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,IAAI,GAAG,mDAAmD,CAAA;QAChE,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,CAAA;QAC1C,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACjD,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,kCAAkC,CAAA;QAC/C,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;QACtC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,eAAe,CAAA;QAC5B,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,SAAS,CAAC,IAAI,CAAC,CAAA;QACtC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,
EAAC,CAAC,CAAC,QAAQ,CAAC,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAC7C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,8CAA8C,EAAE,GAAG,EAAE;QACtD,MAAM,OAAO,GAAG,IAAA,uBAAa,GAAE,CAAA;QAC/B,MAAM,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC,UAAU,CAAC,CAAA;QACpD,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC9C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/detect.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
const vitest_1 = require("vitest");
|
|
4
|
+
const detect_1 = require("../detect");
|
|
5
|
+
(0, vitest_1.describe)('detectContentType', () => {
|
|
6
|
+
(0, vitest_1.it)('detects valid JSON object with high confidence', () => {
|
|
7
|
+
const result = (0, detect_1.detectContentType)('{"key": "value", "num": 42}');
|
|
8
|
+
(0, vitest_1.expect)(result.type).toBe('json');
|
|
9
|
+
(0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.9);
|
|
10
|
+
});
|
|
11
|
+
(0, vitest_1.it)('detects valid JSON array with high confidence', () => {
|
|
12
|
+
const result = (0, detect_1.detectContentType)('[1, 2, 3]');
|
|
13
|
+
(0, vitest_1.expect)(result.type).toBe('json');
|
|
14
|
+
(0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.9);
|
|
15
|
+
});
|
|
16
|
+
(0, vitest_1.it)('detects malformed JSON starting with { at lower confidence', () => {
|
|
17
|
+
const result = (0, detect_1.detectContentType)('{ not valid json here }');
|
|
18
|
+
(0, vitest_1.expect)(result.type).toBe('json');
|
|
19
|
+
(0, vitest_1.expect)(result.confidence).toBeLessThan(0.9);
|
|
20
|
+
});
|
|
21
|
+
(0, vitest_1.it)('detects markdown with headings', () => {
|
|
22
|
+
const md = `# Title\n\nSome paragraph text here.\n\n## Section Two\n\nMore text.`;
|
|
23
|
+
const result = (0, detect_1.detectContentType)(md);
|
|
24
|
+
(0, vitest_1.expect)(result.type).toBe('markdown');
|
|
25
|
+
(0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.8);
|
|
26
|
+
});
|
|
27
|
+
(0, vitest_1.it)('detects markdown with code fences', () => {
|
|
28
|
+
const md = `Some text\n\n\`\`\`js\nconsole.log('hi')\n\`\`\``;
|
|
29
|
+
const result = (0, detect_1.detectContentType)(md);
|
|
30
|
+
(0, vitest_1.expect)(result.type).toBe('markdown');
|
|
31
|
+
});
|
|
32
|
+
(0, vitest_1.it)('detects HTML with DOCTYPE', () => {
|
|
33
|
+
const html = `<!DOCTYPE html><html><body><p>Hello</p></body></html>`;
|
|
34
|
+
const result = (0, detect_1.detectContentType)(html);
|
|
35
|
+
(0, vitest_1.expect)(result.type).toBe('html');
|
|
36
|
+
(0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.9);
|
|
37
|
+
});
|
|
38
|
+
(0, vitest_1.it)('detects HTML with div tags', () => {
|
|
39
|
+
const html = `<div class="container"><p>Hello world</p></div>`;
|
|
40
|
+
const result = (0, detect_1.detectContentType)(html);
|
|
41
|
+
(0, vitest_1.expect)(result.type).toBe('html');
|
|
42
|
+
});
|
|
43
|
+
(0, vitest_1.it)('detects code with function keyword', () => {
|
|
44
|
+
const code = `function greet(name) {\n return 'Hello ' + name\n}`;
|
|
45
|
+
const result = (0, detect_1.detectContentType)(code);
|
|
46
|
+
(0, vitest_1.expect)(result.type).toBe('code');
|
|
47
|
+
(0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.7);
|
|
48
|
+
});
|
|
49
|
+
(0, vitest_1.it)('detects code with class keyword', () => {
|
|
50
|
+
const code = `class MyClass {\n constructor() {}\n}`;
|
|
51
|
+
const result = (0, detect_1.detectContentType)(code);
|
|
52
|
+
(0, vitest_1.expect)(result.type).toBe('code');
|
|
53
|
+
});
|
|
54
|
+
(0, vitest_1.it)('detects code with Python def keyword', () => {
|
|
55
|
+
const code = `def compute(x, y):\n return x + y\n`;
|
|
56
|
+
const result = (0, detect_1.detectContentType)(code);
|
|
57
|
+
(0, vitest_1.expect)(result.type).toBe('code');
|
|
58
|
+
});
|
|
59
|
+
(0, vitest_1.it)('falls back to text for plain prose', () => {
|
|
60
|
+
const text = `This is just a plain text paragraph without any special syntax. It has multiple sentences and no special markers.`;
|
|
61
|
+
const result = (0, detect_1.detectContentType)(text);
|
|
62
|
+
(0, vitest_1.expect)(result.type).toBe('text');
|
|
63
|
+
(0, vitest_1.expect)(result.confidence).toBe(0.5);
|
|
64
|
+
});
|
|
65
|
+
(0, vitest_1.it)('detects YAML with key-value lines', () => {
|
|
66
|
+
const yaml = `name: my-project\nversion: 1.0.0\ndescription: a project\nauthor: someone`;
|
|
67
|
+
const result = (0, detect_1.detectContentType)(yaml);
|
|
68
|
+
(0, vitest_1.expect)(result.type).toBe('yaml');
|
|
69
|
+
(0, vitest_1.expect)(result.confidence).toBeGreaterThanOrEqual(0.8);
|
|
70
|
+
});
|
|
71
|
+
});
|
|
72
|
+
//# sourceMappingURL=detect.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.test.js","sourceRoot":"","sources":["../../src/__tests__/detect.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,sCAA6C;AAE7C,IAAA,iBAAQ,EAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,IAAA,WAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,6BAA6B,CAAC,CAAA;QAC/D,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,WAAW,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4DAA4D,EAAE,GAAG,EAAE;QACpE,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,yBAAyB,CAAC,CAAA;QAC3D,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAA;IAC7C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,EAAE,GAAG,sEAAsE,CAAA;QACjF,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,EAAE,CAAC,CAAA;QACpC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;QACpC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,EAAE,GAAG,kDAAkD,CAAA;QAC7D,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,EAAE,CAAC,CAAA;QACpC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,UAAU,CAAC,CAAA;IACtC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,2BAA2B,EAAE,GAAG,EAAE;QACnC,MAAM,IAAI,GAAG,uDAAuD,CAAA;QACpE,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4BAA4B,EAAE,GAAG,EAAE;QACpC,MAAM,IAAI,GAAG,iDAAiD,CAAA;QAC9D,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAAC,
CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,qDAAqD,CAAA;QAClE,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,IAAI,GAAG,wCAAwC,CAAA;QACrD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,sCAAsC,EAAE,GAAG,EAAE;QAC9C,MAAM,IAAI,GAAG,wCAAwC,CAAA;QACrD,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;IAClC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,oCAAoC,EAAE,GAAG,EAAE;QAC5C,MAAM,IAAI,GAAG,mHAAmH,CAAA;QAChI,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;IACrC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,mCAAmC,EAAE,GAAG,EAAE;QAC3C,MAAM,IAAI,GAAG,2EAA2E,CAAA;QACxF,MAAM,MAAM,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACtC,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;QAChC,IAAA,eAAM,EAAC,MAAM,CAAC,UAAU,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAA;IACvD,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"split.test.d.ts","sourceRoot":"","sources":["../../src/__tests__/split.test.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
const vitest_1 = require("vitest");
|
|
4
|
+
const split_1 = require("../split");
|
|
5
|
+
function opts(maxTokens, overlap = 0) {
|
|
6
|
+
return { maxTokens, overlap, minTokens: 0, preserveStructure: true };
|
|
7
|
+
}
|
|
8
|
+
(0, vitest_1.describe)('splitByTokenCount', () => {
|
|
9
|
+
(0, vitest_1.it)('returns single chunk when text fits within maxTokens', () => {
|
|
10
|
+
const text = 'Short text';
|
|
11
|
+
const result = (0, split_1.splitByTokenCount)(text, 512, 0);
|
|
12
|
+
(0, vitest_1.expect)(result).toHaveLength(1);
|
|
13
|
+
(0, vitest_1.expect)(result[0]).toBe(text);
|
|
14
|
+
});
|
|
15
|
+
(0, vitest_1.it)('splits long text into multiple chunks respecting maxTokens', () => {
|
|
16
|
+
// 200 chars -> 50 tokens; max is 10 tokens = 40 chars
|
|
17
|
+
const text = 'A'.repeat(200);
|
|
18
|
+
const result = (0, split_1.splitByTokenCount)(text, 10, 0);
|
|
19
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThan(1);
|
|
20
|
+
for (const chunk of result) {
|
|
21
|
+
(0, vitest_1.expect)(chunk.length).toBeLessThanOrEqual(40 + 5); // small tolerance for boundary snapping
|
|
22
|
+
}
|
|
23
|
+
});
|
|
24
|
+
(0, vitest_1.it)('adds overlap between chunks', () => {
|
|
25
|
+
const text = 'Hello world. This is a sentence. And another one follows here. Final words.';
|
|
26
|
+
const result = (0, split_1.splitByTokenCount)(text, 5, 1); // 5 tokens = 20 chars, 1 token overlap = 4 chars
|
|
27
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThan(1);
|
|
28
|
+
});
|
|
29
|
+
});
|
|
30
|
+
(0, vitest_1.describe)('splitMarkdown', () => {
|
|
31
|
+
(0, vitest_1.it)('splits at heading boundaries', () => {
|
|
32
|
+
const md = `# Heading One\n\nParagraph under heading one.\n\n# Heading Two\n\nParagraph under heading two.`;
|
|
33
|
+
const result = (0, split_1.splitMarkdown)(md, opts(512));
|
|
34
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThanOrEqual(2);
|
|
35
|
+
(0, vitest_1.expect)(result.some(r => r.includes('# Heading One'))).toBe(true);
|
|
36
|
+
(0, vitest_1.expect)(result.some(r => r.includes('# Heading Two'))).toBe(true);
|
|
37
|
+
});
|
|
38
|
+
(0, vitest_1.it)('splits a large section at paragraph boundaries', () => {
|
|
39
|
+
// Each paragraph is ~38 chars; maxTokens=10 means 40 chars, so two paragraphs never fit in one chunk
|
|
40
|
+
const para = (n) => `Para ${n}: ` + 'x'.repeat(30);
|
|
41
|
+
const md = `# Title\n\n${para(1)}\n\n${para(2)}\n\n${para(3)}`;
|
|
42
|
+
const result = (0, split_1.splitMarkdown)(md, opts(10));
|
|
43
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThan(1);
|
|
44
|
+
});
|
|
45
|
+
(0, vitest_1.it)('handles text with no headings by falling back to paragraph split', () => {
|
|
46
|
+
const text = 'First paragraph here.\n\nSecond paragraph here.\n\nThird paragraph here.';
|
|
47
|
+
const result = (0, split_1.splitMarkdown)(text, opts(512));
|
|
48
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThanOrEqual(1);
|
|
49
|
+
});
|
|
50
|
+
(0, vitest_1.it)('returns entire text as one chunk when it fits', () => {
|
|
51
|
+
const text = '# Small doc\n\nJust a small paragraph.';
|
|
52
|
+
const result = (0, split_1.splitMarkdown)(text, opts(512));
|
|
53
|
+
(0, vitest_1.expect)(result).toHaveLength(1);
|
|
54
|
+
});
|
|
55
|
+
});
|
|
56
|
+
(0, vitest_1.describe)('splitCode', () => {
|
|
57
|
+
(0, vitest_1.it)('returns single chunk for small code', () => {
|
|
58
|
+
const code = `function foo() { return 1 }`;
|
|
59
|
+
const result = (0, split_1.splitCode)(code, opts(512));
|
|
60
|
+
(0, vitest_1.expect)(result).toHaveLength(1);
|
|
61
|
+
});
|
|
62
|
+
(0, vitest_1.it)('splits at function boundaries', () => {
|
|
63
|
+
const code = `function alpha() {\n return 1\n}\n\nfunction beta() {\n return 2\n}\n\nfunction gamma() {\n return 3\n}`;
|
|
64
|
+
const result = (0, split_1.splitCode)(code, opts(5)); // tiny maxTokens to force splits
|
|
65
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThan(1);
|
|
66
|
+
});
|
|
67
|
+
(0, vitest_1.it)('splits at blank lines when no top-level patterns found', () => {
|
|
68
|
+
const code = `x = 1\ny = 2\n\nz = x + y\n\nprint(z)\n`.repeat(20);
|
|
69
|
+
const result = (0, split_1.splitCode)(code, opts(10));
|
|
70
|
+
(0, vitest_1.expect)(result.length).toBeGreaterThan(1);
|
|
71
|
+
});
|
|
72
|
+
});
|
|
73
|
+
// Suite for splitJSON: objects are split by key, arrays by element groups,
// and unparsable input still produces output.
(0, vitest_1.describe)('splitJSON', () => {
    // Runs splitJSON with a given token budget (opts() builds the options object).
    const runSplit = (source, maxTokens) => (0, split_1.splitJSON)(source, opts(maxTokens));

    (0, vitest_1.it)('splits object by top-level keys', () => {
        // { key0: 0, key1: 1, ... key19: 19 }
        const record = Object.fromEntries(Array.from({ length: 20 }, (_, i) => [`key${i}`, i]));
        const serialized = JSON.stringify(record, null, 2);
        // Small budget so the object cannot fit in a single chunk.
        const pieces = runSplit(serialized, 10);
        (0, vitest_1.expect)(pieces.length).toBeGreaterThan(1);
        // Every emitted chunk must itself parse as JSON.
        pieces.forEach((piece) => {
            (0, vitest_1.expect)(() => JSON.parse(piece)).not.toThrow();
        });
    });

    (0, vitest_1.it)('splits array into groups', () => {
        const items = [];
        for (let i = 0; i < 50; i++) {
            items.push({ id: i, value: `item${i}` });
        }
        const pieces = runSplit(JSON.stringify(items, null, 2), 10);
        (0, vitest_1.expect)(pieces.length).toBeGreaterThan(1);
    });

    (0, vitest_1.it)('falls back to token split for invalid JSON', () => {
        const notJson = 'not json at all'.repeat(50);
        const pieces = runSplit(notJson, 10);
        (0, vitest_1.expect)(pieces.length).toBeGreaterThan(0);
    });
});
|
|
98
|
+
// Suite for splitText: short text stays whole; longer text is cut at
// paragraph boundaries, then at sentence boundaries.
(0, vitest_1.describe)('splitText', () => {
    // Runs splitText with a given token budget (opts() builds the options object).
    const runSplit = (source, maxTokens) => (0, split_1.splitText)(source, opts(maxTokens));

    (0, vitest_1.it)('returns single chunk when text fits', () => {
        const pieces = runSplit('Short text that fits easily.', 512);
        (0, vitest_1.expect)(pieces).toHaveLength(1);
    });

    (0, vitest_1.it)('splits at paragraph boundaries', () => {
        const threeParagraphs = 'First paragraph.\n\nSecond paragraph.\n\nThird paragraph.';
        // A 3-token budget is about 12 characters, shorter than any paragraph pair.
        const pieces = runSplit(threeParagraphs, 3);
        (0, vitest_1.expect)(pieces.length).toBeGreaterThan(1);
    });

    (0, vitest_1.it)('splits at sentence boundaries when paragraph is too large', () => {
        const singleParagraph = 'This is sentence one. This is sentence two. This is sentence three. This is sentence four.';
        const pieces = runSplit(singleParagraph, 5);
        (0, vitest_1.expect)(pieces.length).toBeGreaterThan(1);
    });
});
|
|
115
|
+
//# sourceMappingURL=split.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"split.test.js","sourceRoot":"","sources":["../../src/__tests__/split.test.ts"],"names":[],"mappings":";;AAAA,mCAA6C;AAC7C,oCAA4F;AAG5F,SAAS,IAAI,CAAC,SAAiB,EAAE,OAAO,GAAG,CAAC;IAC1C,OAAO,EAAE,SAAS,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC,EAAE,iBAAiB,EAAE,IAAI,EAAE,CAAA;AACtE,CAAC;AAED,IAAA,iBAAQ,EAAC,mBAAmB,EAAE,GAAG,EAAE;IACjC,IAAA,WAAE,EAAC,sDAAsD,EAAE,GAAG,EAAE;QAC9D,MAAM,IAAI,GAAG,YAAY,CAAA;QACzB,MAAM,MAAM,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,GAAG,EAAE,CAAC,CAAC,CAAA;QAC9C,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;QAC9B,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAC9B,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4DAA4D,EAAE,GAAG,EAAE;QACpE,sDAAsD;QACtD,MAAM,IAAI,GAAG,GAAG,CAAC,MAAM,CAAC,GAAG,CAAC,CAAA;QAC5B,MAAM,MAAM,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,EAAE,EAAE,CAAC,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;YAC3B,IAAA,eAAM,EAAC,KAAK,CAAC,MAAM,CAAC,CAAC,mBAAmB,CAAC,EAAE,GAAG,CAAC,CAAC,CAAA,CAAC,wCAAwC;QAC3F,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,6BAA6B,EAAE,GAAG,EAAE;QACrC,MAAM,IAAI,GAAG,6EAA6E,CAAA;QAC1F,MAAM,MAAM,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC,CAAA,CAAC,iDAAiD;QAC9F,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,eAAe,EAAE,GAAG,EAAE;IAC7B,IAAA,WAAE,EAAC,8BAA8B,EAAE,GAAG,EAAE;QACtC,MAAM,EAAE,GAAG,gGAAgG,CAAA;QAC3G,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,EAAE,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAC3C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAA;QAC/C,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;QAChE,IAAA,eAAM,EAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAA;IAClE,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gDAAgD,EAAE,GAAG,EAAE;QACxD,4DAA4D;QAC5D,MAAM,IAAI,GAAG,CAAC,CAAS,EAAE,EAAE,CAAC,QAAQ,CAAC,IAAI,G
AAG,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;QAC1D,MAAM,EAAE,GAAG,cAAc,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,OAAO,IAAI,CAAC,CAAC,CAAC,EAAE,CAAA;QAC9D,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,EAAE,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QAC1C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,kEAAkE,EAAE,GAAG,EAAE;QAC1E,MAAM,IAAI,GAAG,0EAA0E,CAAA;QACvF,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,sBAAsB,CAAC,CAAC,CAAC,CAAA;IACjD,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,+CAA+C,EAAE,GAAG,EAAE;QACvD,MAAM,IAAI,GAAG,wCAAwC,CAAA;QACrD,MAAM,MAAM,GAAG,IAAA,qBAAa,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QAC7C,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,WAAW,EAAE,GAAG,EAAE;IACzB,IAAA,WAAE,EAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,IAAI,GAAG,6BAA6B,CAAA;QAC1C,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QACzC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,+BAA+B,EAAE,GAAG,EAAE;QACvC,MAAM,IAAI,GAAG,4GAA4G,CAAA;QACzH,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA,CAAC,iCAAiC;QACzE,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,wDAAwD,EAAE,GAAG,EAAE;QAChE,MAAM,IAAI,GAAG,yCAAyC,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;QACjE,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,WAAW,EAAE,GAAG,EAAE;IACzB,IAAA,WAAE,EAAC,iCAAiC,EAAE,GAAG,EAAE;QACzC,MAAM,GAAG,GAA2B,EAAE,CAAA;QACtC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE;YAAE,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,GAAG,CAAC,CAAA;QAC/C,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,
EAAE,CAAC,CAAC,CAAA,CAAC,yBAAyB;QAClE,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;QACxC,kCAAkC;QAClC,KAAK,MAAM,CAAC,IAAI,MAAM,EAAE,CAAC;YACvB,IAAA,eAAM,EAAC,GAAG,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,EAAE,CAAA;QAC3C,CAAC;IACH,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,0BAA0B,EAAE,GAAG,EAAE;QAClC,MAAM,GAAG,GAAG,KAAK,CAAC,IAAI,CAAC,EAAE,MAAM,EAAE,EAAE,EAAE,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,EAAE,EAAE,CAAC,CAAC,CAAA;QAChF,MAAM,IAAI,GAAG,IAAI,CAAC,SAAS,CAAC,GAAG,EAAE,IAAI,EAAE,CAAC,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,4CAA4C,EAAE,GAAG,EAAE;QACpD,MAAM,IAAI,GAAG,iBAAiB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAA;QACzC,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,CAAC,CAAA;QACxC,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA;AAEF,IAAA,iBAAQ,EAAC,WAAW,EAAE,GAAG,EAAE;IACzB,IAAA,WAAE,EAAC,qCAAqC,EAAE,GAAG,EAAE;QAC7C,MAAM,IAAI,GAAG,8BAA8B,CAAA;QAC3C,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,CAAA;QACzC,IAAA,eAAM,EAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,gCAAgC,EAAE,GAAG,EAAE;QACxC,MAAM,IAAI,GAAG,2DAA2D,CAAA;QACxE,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA,CAAC,sCAAsC;QAC9E,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;IAEF,IAAA,WAAE,EAAC,2DAA2D,EAAE,GAAG,EAAE;QACnE,MAAM,QAAQ,GAAG,4FAA4F,CAAA;QAC7G,MAAM,MAAM,GAAG,IAAA,iBAAS,EAAC,QAAQ,EAAE,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QAC3C,IAAA,eAAM,EAAC,MAAM,CAAC,MAAM,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAA;IAC1C,CAAC,CAAC,CAAA;AACJ,CAAC,CAAC,CAAA"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.d.ts","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,KAAK,EAAiB,YAAY,EAAE,OAAO,EAA6B,MAAM,SAAS,CAAA;AAuFrG,wBAAgB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,YAAY,GAAG,KAAK,EAAE,CAsCnE;AAED,wBAAgB,aAAa,CAAC,cAAc,CAAC,EAAE,YAAY,GAAG,OAAO,CAwBpE"}
|
package/dist/chunker.js
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.chunk = chunk;
|
|
4
|
+
exports.createChunker = createChunker;
|
|
5
|
+
const detect_1 = require("./detect");
|
|
6
|
+
const split_1 = require("./split");
|
|
7
|
+
/**
 * Baseline chunking options used when the caller supplies no override.
 *
 * Frozen so this shared module-level object cannot be mutated by accident;
 * it is only ever copied (object spread) before overrides are merged in.
 */
const DEFAULTS = Object.freeze({
    // Upper bound for one chunk, in estimated tokens (4 characters per token).
    maxTokens: 512,
    // NOTE(review): not read by any code visible in this file — confirm where it is enforced.
    minTokens: 50,
    // Overlap between neighbouring chunks, in estimated tokens.
    overlap: 0,
    // 'text' doubles as "not specified": chunk() auto-detects unless a type is passed explicitly.
    contentType: 'text',
    // When false, text-like content is cut purely by size instead of at paragraphs/sentences.
    preserveStructure: true,
});
|
|
14
|
+
/**
 * Merge option overrides on top of a set of defaults without mutating either.
 *
 * Override keys whose value is `undefined` are ignored, so passing
 * `{ maxTokens: undefined }` keeps the default instead of replacing it with
 * `undefined` (which would turn into NaN in later size arithmetic).
 *
 * @param {object} defaults - Base option values.
 * @param {object} [overrides] - Caller-supplied values; may be null/undefined.
 * @returns {object} A new object holding the merged options.
 */
function resolveOptions(defaults, overrides) {
    const provided = Object.entries(overrides ?? {}).filter(([, value]) => value !== undefined);
    // Object.fromEntries + spread define plain data properties, so an own
    // "__proto__" key in overrides can never replace the result's prototype.
    return { ...defaults, ...Object.fromEntries(provided) };
}
|
|
17
|
+
/**
 * Estimate how many tokens a string occupies, assuming four characters per
 * token and rounding up.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (0 for an empty string).
 */
function tokensFor(text) {
    const charsPerToken = 4;
    const { length } = text;
    return Math.ceil(length / charsPerToken);
}
|
|
20
|
+
/**
 * Collect the text of every ATX heading (levels 1-6, i.e. `#` through `######`)
 * found in a markdown chunk, in document order.
 *
 * A heading line must start with the hashes followed by whitespace. An optional
 * closing hash sequence (`## Title ##`) is stripped so that only the title
 * text is reported.
 *
 * @param {string} content - Markdown text of one chunk.
 * @returns {string[]} Heading titles; empty when the chunk has no headings.
 */
function extractHeadings(content) {
    const headings = [];
    for (const line of content.split('\n')) {
        const m = line.match(/^(#{1,6})\s+(.+)/);
        if (!m)
            continue;
        // Drop a trailing " ###" closing sequence; a '#' glued to a word (e.g. "C#") is kept.
        const title = m[2].trim().replace(/\s+#+$/, '');
        headings.push(title);
    }
    return headings;
}
|
|
30
|
+
/**
 * Guess the language of a code chunk from its first non-empty line.
 *
 * Two hints are recognised:
 *  - a markdown fence with an info string, e.g. "```ts" -> "ts";
 *  - a shebang, e.g. "#!/bin/bash" -> "bash". When the interpreter is `env`
 *    ("#!/usr/bin/env python3", "#!/usr/bin/env -S node --flag") the program
 *    that env launches is reported instead of "env" itself.
 *
 * @param {string} content - Code text of one chunk.
 * @returns {string|undefined} The detected language/interpreter name, or
 *   undefined when the first line carries no hint.
 */
function detectCodeLanguage(content) {
    // trimEnd() also removes a trailing "\r" left by CRLF line endings.
    const first = (content.trimStart().split('\n')[0] ?? '').trimEnd();
    const fenceMatch = first.match(/^```(\w+)/);
    if (fenceMatch)
        return fenceMatch[1];
    // Interpreter path must contain a '/', mirroring what a real shebang looks like.
    const shebangMatch = first.match(/^#!\s*(\S*\/\S*)\s*(.*)$/);
    if (!shebangMatch)
        return undefined;
    const baseName = (path) => path.slice(path.lastIndexOf('/') + 1);
    let interpreter = baseName(shebangMatch[1]);
    if (interpreter === 'env') {
        // Skip env's own flags (-S, -i, ...) and VAR=value assignments.
        const target = shebangMatch[2]
            .split(/\s+/)
            .find((arg) => arg !== '' && !arg.startsWith('-') && !arg.includes('='));
        interpreter = target === undefined ? '' : baseName(target);
    }
    const name = interpreter.match(/^\w+/);
    return name ? name[0] : undefined;
}
|
|
41
|
+
/**
 * Turn the raw string parts produced by a splitter into chunk objects with
 * metadata (index, offsets, sizes, headings, language, overlap figures).
 *
 * Offsets are recovered by searching for each part in the original text,
 * starting just after the previous part's start; if a part cannot be found
 * (even after trimming) the current search position is used instead.
 *
 * @param {string[]} parts - Split pieces, in document order.
 * @param {string} originalText - The text the parts were cut from.
 * @param {string} contentType - Content type recorded on every chunk.
 * @param {object} options - Resolved options; only `overlap` (tokens) is read.
 * @returns {Array<{content: string, metadata: object}>} One entry per non-blank part.
 */
function buildChunks(parts, originalText, contentType, options) {
    // Overlap is configured in tokens; metadata reports it in characters.
    const overlapBudget = options.overlap * 4;
    const lastPosition = parts.length - 1;
    const built = [];
    let cursor = 0;
    parts.forEach((piece, position) => {
        // Blank or missing parts produce no chunk.
        if (!piece || piece.trim().length === 0) {
            return;
        }
        // Try the part without trailing whitespace first, then fully trimmed.
        let begin = -1;
        for (const needle of [piece.trimEnd(), piece.trim()]) {
            begin = originalText.indexOf(needle, cursor);
            if (begin !== -1) {
                break;
            }
        }
        if (begin === -1) {
            begin = cursor;
        }
        const previousLength = (parts[position - 1] ?? '').length;
        const nextLength = (parts[position + 1] ?? '').length;
        built.push({
            content: piece,
            metadata: {
                index: built.length,
                startOffset: begin,
                endOffset: begin + piece.length,
                tokenCount: tokensFor(piece),
                charCount: piece.length,
                contentType,
                headings: contentType === 'markdown' ? extractHeadings(piece) : [],
                codeLanguage: contentType === 'code' ? detectCodeLanguage(piece) : undefined,
                overlapBefore: position > 0 ? Math.min(overlapBudget, previousLength) : 0,
                overlapAfter: position < lastPosition ? Math.min(overlapBudget, nextLength) : 0,
            },
        });
        // Always move forward so repeated content maps to successive occurrences.
        cursor = Math.max(cursor, begin + 1);
    });
    return built;
}
|
|
78
|
+
/**
 * Split text into chunks with metadata.
 *
 * The content type is taken from the options when one is given; otherwise it
 * is detected from the text. Markdown, code and JSON each use their dedicated
 * splitter. Every other type is split as text: at paragraph/sentence
 * boundaries when `preserveStructure` is set, purely by size otherwise.
 *
 * @param {string} text - Text to split.
 * @param {object} [options] - Overrides for the module defaults.
 * @returns {Array<{content: string, metadata: object}>} The chunks, in order.
 */
function chunk(text, options) {
    const opts = resolveOptions(DEFAULTS, options);
    // A resolved type other than the 'text' default wins; after that, whatever
    // the caller passed explicitly; only then fall back to detection.
    const requested = opts.contentType && opts.contentType !== 'text'
        ? opts.contentType
        : options?.contentType;
    const contentType = requested || (0, detect_1.detectContentType)(text).type;
    const structuredSplitters = new Map([
        ['markdown', () => (0, split_1.splitMarkdown)(text, opts)],
        ['code', () => (0, split_1.splitCode)(text, opts)],
        ['json', () => (0, split_1.splitJSON)(text, opts)],
    ]);
    const splitStructured = structuredSplitters.get(contentType);
    let parts;
    if (splitStructured) {
        parts = splitStructured();
    }
    else if (opts.preserveStructure) {
        // html, yaml, text and anything unrecognised.
        parts = (0, split_1.splitText)(text, opts);
    }
    else {
        parts = (0, split_1.splitByTokenCount)(text, opts.maxTokens, opts.overlap);
    }
    return buildChunks(parts, text, contentType, opts);
}
|
|
116
|
+
/**
 * Create a chunker with its own default options.
 *
 * Only the options the caller actually supplied are stored and forwarded;
 * the module-wide defaults are applied later by chunk() itself. Forwarding a
 * fully resolved option set would hand chunk() an explicit
 * `contentType: 'text'`, which it treats as a caller request and which
 * disables content-type auto-detection for every call made through the
 * chunker.
 *
 * @param {object} [defaultOptions] - Options applied to every call of the
 *   returned chunker unless overridden per call.
 * @returns {{chunk: Function, chunkMarkdown: Function, chunkCode: Function,
 *   chunkJSON: Function, detectContentType: Function}} The chunker API.
 */
function createChunker(defaultOptions) {
    // Snapshot so later mutation of the caller's object has no effect.
    const defaults = resolveOptions({}, defaultOptions);
    const run = (text, overrides) => chunk(text, resolveOptions(defaults, overrides));
    return {
        /** Chunk with auto-detected (or explicitly configured) content type. */
        chunk(text, overrides) {
            return run(text, overrides);
        },
        /** Chunk as markdown regardless of configured/detected type. */
        chunkMarkdown(text, overrides) {
            return run(text, { ...overrides, contentType: 'markdown' });
        },
        /** Chunk as source code regardless of configured/detected type. */
        chunkCode(text, overrides) {
            return run(text, { ...overrides, contentType: 'code' });
        },
        /** Chunk as JSON regardless of configured/detected type. */
        chunkJSON(text, overrides) {
            return run(text, { ...overrides, contentType: 'json' });
        },
        /** Expose the content-type detector unchanged. */
        detectContentType(text) {
            return (0, detect_1.detectContentType)(text);
        },
    };
}
|
|
136
|
+
//# sourceMappingURL=chunker.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"chunker.js","sourceRoot":"","sources":["../src/chunker.ts"],"names":[],"mappings":";;AAyFA,sBAsCC;AAED,sCAwBC;AAzJD,qCAA4C;AAC5C,mCAA2F;AAG3F,MAAM,QAAQ,GAA2B;IACvC,SAAS,EAAE,GAAG;IACd,SAAS,EAAE,EAAE;IACb,OAAO,EAAE,CAAC;IACV,WAAW,EAAE,MAAM;IACnB,iBAAiB,EAAE,IAAI;CACxB,CAAA;AAED,SAAS,cAAc,CAAC,QAAgC,EAAE,SAAiC;IACzF,OAAO,EAAE,GAAG,QAAQ,EAAE,GAAG,CAAC,SAAS,IAAI,EAAE,CAAC,EAAE,CAAA;AAC9C,CAAC;AAED,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AACnC,CAAC;AAED,uEAAuE;AACvE,SAAS,eAAe,CAAC,OAAe;IACtC,MAAM,QAAQ,GAAa,EAAE,CAAA;IAC7B,KAAK,MAAM,IAAI,IAAI,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC;QACvC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,kBAAkB,CAAC,CAAA;QACxC,IAAI,CAAC;YAAE,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAA;IACnC,CAAC;IACD,OAAO,QAAQ,CAAA;AACjB,CAAC;AAED,0EAA0E;AAC1E,SAAS,kBAAkB,CAAC,OAAe;IACzC,MAAM,KAAK,GAAG,OAAO,CAAC,SAAS,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAA;IACtD,MAAM,UAAU,GAAG,KAAK,CAAC,KAAK,CAAC,WAAW,CAAC,CAAA;IAC3C,IAAI,UAAU;QAAE,OAAO,UAAU,CAAC,CAAC,CAAC,CAAA;IACpC,MAAM,YAAY,GAAG,KAAK,CAAC,KAAK,CAAC,cAAc,CAAC,CAAA;IAChD,IAAI,YAAY;QAAE,OAAO,YAAY,CAAC,CAAC,CAAC,CAAA;IACxC,OAAO,SAAS,CAAA;AAClB,CAAC;AAED,SAAS,WAAW,CAClB,KAAe,EACf,YAAoB,EACpB,WAAwB,EACxB,OAA+B;IAE/B,MAAM,YAAY,GAAG,OAAO,CAAC,OAAO,GAAG,CAAC,CAAA;IACxC,MAAM,MAAM,GAAY,EAAE,CAAA;IAE1B,oEAAoE;IACpE,IAAI,UAAU,GAAG,CAAC,CAAA;IAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;QACxB,IAAI,CAAC,OAAO,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC;YAAE,SAAQ;QAErD,oDAAoD;QACpD,IAAI,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,OAAO,EAAE,EAAE,UAAU,CAAC,CAAA;QACrE,IAAI,WAAW,KAAK,CAAC,CAAC,EAAE,CAAC;YACvB,2BAA2B;YAC3B,WAAW,GAAG,YAAY,CAAC,OAAO,CAAC,OAAO,CAAC,IAAI,EAAE,EAAE,UAAU,CAAC,CAAA;QAChE,CAAC;QACD,IAAI,WAAW,KAAK,CAAC,CAAC;YAAE,WAAW,GAAG,UAAU,CAAA;QAEhD,MAAM,SAAS,GAAG,WAAW,GAAG,OAAO,CAAC,MAAM,CAAA;QAE9C,MAAM,aAAa,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,
CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QACrF,MAAM,YAAY,GAAG,CAAC,GAAG,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,YAAY,EAAE,CAAC,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;QAEnG,MAAM,QAAQ,GAAkB;YAC9B,KAAK,EAAE,MAAM,CAAC,MAAM;YACpB,WAAW;YACX,SAAS;YACT,UAAU,EAAE,SAAS,CAAC,OAAO,CAAC;YAC9B,SAAS,EAAE,OAAO,CAAC,MAAM;YACzB,WAAW;YACX,QAAQ,EAAE,WAAW,KAAK,UAAU,CAAC,CAAC,CAAC,eAAe,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE;YACpE,YAAY,EAAE,WAAW,KAAK,MAAM,CAAC,CAAC,CAAC,kBAAkB,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,SAAS;YAC9E,aAAa;YACb,YAAY;SACb,CAAA;QAED,MAAM,CAAC,IAAI,CAAC,EAAE,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAA;QAClC,UAAU,GAAG,IAAI,CAAC,GAAG,CAAC,UAAU,EAAE,WAAW,GAAG,CAAC,CAAC,CAAA;IACpD,CAAC;IAED,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAgB,KAAK,CAAC,IAAY,EAAE,OAAsB;IACxD,MAAM,IAAI,GAAG,cAAc,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAA;IAE9C,IAAI,WAAwB,CAAA;IAC5B,IAAI,IAAI,CAAC,WAAW,IAAI,IAAI,CAAC,WAAW,KAAK,MAAM,EAAE,CAAC;QACpD,WAAW,GAAG,IAAI,CAAC,WAAW,CAAA;IAChC,CAAC;SAAM,IAAI,OAAO,EAAE,WAAW,EAAE,CAAC;QAChC,WAAW,GAAG,OAAO,CAAC,WAAW,CAAA;IACnC,CAAC;SAAM,CAAC;QACN,MAAM,QAAQ,GAAG,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QACxC,WAAW,GAAG,QAAQ,CAAC,IAAI,CAAA;IAC7B,CAAC;IAED,IAAI,KAAe,CAAA;IAEnB,QAAQ,WAAW,EAAE,CAAC;QACpB,KAAK,UAAU;YACb,KAAK,GAAG,IAAA,qBAAa,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YACjC,MAAK;QACP,KAAK,MAAM;YACT,KAAK,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YAC7B,MAAK;QACP,KAAK,MAAM;YACT,KAAK,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YAC7B,MAAK;QACP,KAAK,MAAM,CAAC;QACZ,KAAK,MAAM,CAAC;QACZ,KAAK,MAAM,CAAC;QACZ;YACE,IAAI,IAAI,CAAC,iBAAiB,EAAE,CAAC;gBAC3B,KAAK,GAAG,IAAA,iBAAS,EAAC,IAAI,EAAE,IAAI,CAAC,CAAA;YAC/B,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,IAAA,yBAAiB,EAAC,IAAI,EAAE,IAAI,CAAC,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,CAAA;YAC/D,CAAC;YACD,MAAK;IACT,CAAC;IAED,OAAO,WAAW,CAAC,KAAK,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,CAAC,CAAA;AACpD,CAAC;AAED,SAAgB,aAAa,CAAC,cAA6B;IACzD,MAAM,QAAQ,GAAG,cAAc,CAAC,QAAQ,EAAE,
cAAc,CAAC,CAAA;IAEzD,OAAO;QACL,KAAK,CAAC,IAAY,EAAE,SAAiC;YACnD,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC,CAAA;QACzD,CAAC;QAED,aAAa,CAAC,IAAY,EAAE,SAAiC;YAC3D,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,EAAE,GAAG,SAAS,EAAE,WAAW,EAAE,UAAU,EAAE,CAAC,CAAC,CAAA;QACzF,CAAC;QAED,SAAS,CAAC,IAAY,EAAE,SAAiC;YACvD,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,EAAE,GAAG,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC,CAAA;QACrF,CAAC;QAED,SAAS,CAAC,IAAY,EAAE,SAAiC;YACvD,OAAO,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,QAAQ,EAAE,EAAE,GAAG,SAAS,EAAE,WAAW,EAAE,MAAM,EAAE,CAAC,CAAC,CAAA;QACrF,CAAC;QAED,iBAAiB,CAAC,IAAY;YAC5B,OAAO,IAAA,0BAAiB,EAAC,IAAI,CAAC,CAAA;QAChC,CAAC;KACF,CAAA;AACH,CAAC"}
|
package/dist/detect.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.d.ts","sourceRoot":"","sources":["../src/detect.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAe,YAAY,EAAE,MAAM,SAAS,CAAA;AAExD,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CA0D5D"}
|
package/dist/detect.js
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.detectContentType = detectContentType;
|
|
4
|
+
/**
 * Classify text as json, html, yaml, markdown, code or plain text using
 * cheap heuristics, checked in that order; the first match wins.
 *
 * @param {string} text - Text to classify (leading/trailing whitespace is ignored).
 * @returns {{type: string, confidence: number}} The guessed type and a fixed
 *   confidence value associated with the rule that matched.
 */
function detectContentType(text) {
    const body = text.trim();
    const verdict = (type, confidence) => ({ type, confidence });
    // Sum the weights of every [pattern, weight] rule that matches the text.
    const score = (rules) => rules.reduce((total, [pattern, weight]) => (pattern.test(body) ? total + weight : total), 0);

    // JSON: opens with { or [. Parsable input gets the higher confidence.
    if (body.startsWith('{') || body.startsWith('[')) {
        let parses = true;
        try {
            JSON.parse(body);
        }
        catch {
            parses = false;
        }
        return verdict('json', parses ? 0.95 : 0.7);
    }

    // HTML: doctype / <html> at the start, or any common tag anywhere.
    const htmlSignals = [
        /^<!doctype\s+html/i,
        /^<html/i,
        /<(div|p|span|body|head|h[1-6]|ul|ol|li|table|form|input|a\s|img\s)[^>]*>/i,
    ];
    if (htmlSignals.some((pattern) => pattern.test(body))) {
        return verdict('html', 0.9);
    }

    // YAML: no '<' at all, and either a leading --- marker or at least two
    // "key: value" lines among the first twenty.
    if (!body.includes('<')) {
        const keyValueCount = body
            .split('\n')
            .slice(0, 20)
            .filter((line) => /^\s*[\w-]+\s*:\s*.+/.test(line)).length;
        if (body.startsWith('---') || keyValueCount >= 2) {
            return verdict('yaml', 0.8);
        }
    }

    // Markdown: headings and fences weigh 2; bold, list items and links weigh 1.
    const markdownRules = [
        [/^#{1,6}\s+\S/m, 2],
        [/```/, 2],
        [/\*\*[^*]+\*\*/, 1],
        [/^[-*+]\s+\S/m, 1],
        [/^\[.+\]\(.+\)/m, 1],
    ];
    if (score(markdownRules) >= 2) {
        return verdict('markdown', 0.8);
    }

    // Code: declarations weigh 2; imports, bindings and indented
    // brace/semicolon-terminated lines weigh 1.
    const codeRules = [
        [/\bfunction\s+\w+\s*\(/, 2],
        [/\bclass\s+\w+/, 2],
        [/\bdef\s+\w+\s*\(/, 2],
        [/\bimport\s+[\w{*]/, 1],
        [/\bconst\s+\w+\s*=/, 1],
        [/\blet\s+\w+\s*=/, 1],
        [/\bvar\s+\w+\s*=/, 1],
        [/^\s{2,}.*[{};]$/m, 1],
    ];
    if (score(codeRules) >= 2) {
        return verdict('code', 0.7);
    }

    return verdict('text', 0.5);
}
|
|
69
|
+
//# sourceMappingURL=detect.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"detect.js","sourceRoot":"","sources":["../src/detect.ts"],"names":[],"mappings":";;AAEA,8CA0DC;AA1DD,SAAgB,iBAAiB,CAAC,IAAY;IAC5C,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,EAAE,CAAA;IAE3B,2BAA2B;IAC3B,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,IAAI,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,EAAE,CAAC;QACvD,IAAI,CAAC;YACH,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,CAAA;YACnB,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,IAAI,EAAE,CAAA;QAC1D,CAAC;QAAC,MAAM,CAAC;YACP,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;QACzD,CAAC;IACH,CAAC;IAED,gEAAgE;IAChE,IACE,oBAAoB,CAAC,IAAI,CAAC,OAAO,CAAC;QAClC,SAAS,CAAC,IAAI,CAAC,OAAO,CAAC;QACvB,2EAA2E,CAAC,IAAI,CAAC,OAAO,CAAC,EACzF,CAAC;QACD,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;IACzD,CAAC;IAED,8DAA8D;IAC9D,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,OAAO,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAA;QAC9C,MAAM,iBAAiB,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,qBAAqB,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QAC1E,MAAM,SAAS,GAAG,OAAO,CAAC,UAAU,CAAC,KAAK,CAAC,CAAA;QAC3C,IAAI,SAAS,IAAI,iBAAiB,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;YAC/C,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;QACzD,CAAC;IACH,CAAC;IAED,+DAA+D;IAC/D,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,IAAI,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAC/C,IAAI,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IACrC,IAAI,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAC/C,IAAI,cAAc,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAC9C,IAAI,gBAAgB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,OAAO,IAAI,CAAC,CAAA;IAChD,IAAI,OAAO,IAAI,CAAC,EAAE,CAAC;QACjB,OAAO,EAAE,IAAI,EAAE,UAAyB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;IAC7D,CAAC;IAED,6BAA6B;IAC7B,IAAI,SAAS,GAAG,CAAC,CAAA;IACjB,IAAI,uBAAuB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACzD,IAAI,eAAe,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACjD,IAAI,kBAAkB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACpD,IAAI,mBAAmB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACrD,IAAI,mBAA
mB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACrD,IAAI,iBAAiB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACnD,IAAI,iBAAiB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACnD,IAAI,kBAAkB,CAAC,IAAI,CAAC,OAAO,CAAC;QAAE,SAAS,IAAI,CAAC,CAAA;IACpD,IAAI,SAAS,IAAI,CAAC,EAAE,CAAC;QACnB,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;IACzD,CAAC;IAED,OAAO,EAAE,IAAI,EAAE,MAAqB,EAAE,UAAU,EAAE,GAAG,EAAE,CAAA;AACzD,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,KAAK,EAAE,aAAa,EAAE,MAAM,WAAW,CAAA;AAChD,OAAO,EAAE,iBAAiB,EAAE,MAAM,UAAU,CAAA;AAC5C,YAAY,EACV,WAAW,EACX,YAAY,EACZ,aAAa,EACb,KAAK,EACL,YAAY,EACZ,OAAO,GACR,MAAM,SAAS,CAAA"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.detectContentType = exports.createChunker = exports.chunk = void 0;
|
|
4
|
+
// chunk-smart - Structure-aware text chunker for RAG pipelines
|
|
5
|
+
var chunker_1 = require("./chunker");
|
|
6
|
+
Object.defineProperty(exports, "chunk", { enumerable: true, get: function () { return chunker_1.chunk; } });
|
|
7
|
+
Object.defineProperty(exports, "createChunker", { enumerable: true, get: function () { return chunker_1.createChunker; } });
|
|
8
|
+
var detect_1 = require("./detect");
|
|
9
|
+
Object.defineProperty(exports, "detectContentType", { enumerable: true, get: function () { return detect_1.detectContentType; } });
|
|
10
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":";;;AAAA,+DAA+D;AAC/D,qCAAgD;AAAvC,gGAAA,KAAK,OAAA;AAAE,wGAAA,aAAa,OAAA;AAC7B,mCAA4C;AAAnC,2GAAA,iBAAiB,OAAA"}
|
package/dist/split.d.ts
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { ChunkOptions } from './types';
|
|
2
|
+
export declare function splitByTokenCount(text: string, maxTokens: number, overlap: number): string[];
|
|
3
|
+
export declare function splitMarkdown(text: string, options: ChunkOptions): string[];
|
|
4
|
+
export declare function splitCode(text: string, options: ChunkOptions): string[];
|
|
5
|
+
export declare function splitJSON(text: string, options: ChunkOptions): string[];
|
|
6
|
+
export declare function splitText(text: string, options: ChunkOptions): string[];
|
|
7
|
+
//# sourceMappingURL=split.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"split.d.ts","sourceRoot":"","sources":["../src/split.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,MAAM,SAAS,CAAA;AAwD3C,wBAAgB,iBAAiB,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,EAAE,CAiC5F;AAED,wBAAgB,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CA0C3E;AAED,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CAkCvE;AA6BD,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CA2CvE;AAED,wBAAgB,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,YAAY,GAAG,MAAM,EAAE,CA8BvE"}
|
package/dist/split.js
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.splitByTokenCount = splitByTokenCount;
|
|
4
|
+
exports.splitMarkdown = splitMarkdown;
|
|
5
|
+
exports.splitCode = splitCode;
|
|
6
|
+
exports.splitJSON = splitJSON;
|
|
7
|
+
exports.splitText = splitText;
|
|
8
|
+
/**
 * Convert a token budget into a character budget using the fixed estimate of
 * four characters per token.
 *
 * @param {number} tokens - Number of tokens.
 * @returns {number} The equivalent number of characters.
 */
function tokensToChars(tokens) {
    const charsPerToken = 4;
    return charsPerToken * tokens;
}
|
|
11
|
+
/**
 * Break text into sentences. A boundary is any run of whitespace that
 * directly follows '.', '!' or '?'; the whitespace itself is discarded and
 * the punctuation stays with the sentence it ends.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Non-empty sentence strings, in order.
 */
function splitSentences(text) {
    const sentences = [];
    for (const piece of text.split(/(?<=[.!?])\s+/)) {
        // Trailing whitespace after the final sentence yields an empty piece.
        if (piece.length > 0) {
            sentences.push(piece);
        }
    }
    return sentences;
}
|
|
19
|
+
/**
 * Greedily pack sentences into chunks no longer than maxChars, joining
 * sentences with a single space.
 *
 * A sentence longer than maxChars is packed word by word instead; a single
 * word longer than maxChars is cut into maxChars-sized slices. The leftover
 * words of such a sentence stay open so following sentences can join them.
 *
 * @param {string[]} sentences - Sentences in document order.
 * @param {number} maxChars - Maximum chunk length in characters; must be > 0.
 * @returns {string[]} The packed chunks.
 * @throws {RangeError} If maxChars is zero or negative. The slicing loop
 *   advances by maxChars and would otherwise never terminate.
 */
function packSentences(sentences, maxChars) {
    if (maxChars <= 0) {
        throw new RangeError(`maxChars must be greater than 0, received ${maxChars}`);
    }
    const chunks = [];
    let current = '';
    for (const sentence of sentences) {
        if (sentence.length > maxChars) {
            // Oversized sentence: close the open chunk, then pack by words.
            if (current.length > 0) {
                chunks.push(current.trimEnd());
                current = '';
            }
            let wordChunk = '';
            for (const word of sentence.split(/\s+/)) {
                if (wordChunk.length > 0 && wordChunk.length + word.length + 1 > maxChars) {
                    chunks.push(wordChunk.trimEnd());
                    wordChunk = '';
                }
                if (word.length > maxChars) {
                    // Even one word is too long: hard-cut it.
                    for (let i = 0; i < word.length; i += maxChars) {
                        chunks.push(word.slice(i, i + maxChars));
                    }
                    continue;
                }
                wordChunk += (wordChunk.length > 0 ? ' ' : '') + word;
            }
            // Remaining words become the open chunk for the next sentence.
            current = wordChunk;
        }
        else if (current.length > 0 && current.length + sentence.length + 1 > maxChars) {
            chunks.push(current.trimEnd());
            current = sentence;
        }
        else {
            current += (current.length > 0 ? ' ' : '') + sentence;
        }
    }
    if (current.length > 0) {
        chunks.push(current.trimEnd());
    }
    return chunks;
}
|
|
63
|
+
/**
 * Split text purely by size, preferring to end a chunk at a sentence
 * boundary when one lies in the second half of the window.
 *
 * Each chunk after the first starts `overlap` tokens before the end of the
 * previous one. The start position always advances by at least one character,
 * so an overlap as large as (or larger than) the emitted chunk can neither
 * stall the loop nor cut the output short.
 *
 * @param {string} text - Text to split.
 * @param {number} maxTokens - Maximum chunk size in estimated tokens.
 * @param {number} overlap - Tokens shared between neighbouring chunks;
 *   missing, NaN or negative values are treated as 0.
 * @returns {string[]} Non-empty chunks in document order.
 * @throws {RangeError} If the text does not fit in one chunk and maxTokens
 *   does not amount to at least one character.
 */
function splitByTokenCount(text, maxTokens, overlap) {
    const maxChars = tokensToChars(maxTokens);
    if (text.length <= maxChars)
        return [text];
    if (!(maxChars >= 1)) {
        throw new RangeError(`maxTokens must be a positive number, received ${maxTokens}`);
    }
    const requestedOverlap = tokensToChars(overlap);
    // NaN (e.g. overlap omitted) and negative values would skip or drop text.
    const overlapChars = requestedOverlap > 0 ? requestedOverlap : 0;
    const chunks = [];
    let pos = 0;
    while (pos < text.length) {
        const end = pos + maxChars;
        if (end >= text.length) {
            chunks.push(text.slice(pos));
            break;
        }
        // Look for the last sentence boundary inside the window.
        const window = text.slice(pos, end);
        const sentenceMatch = window.search(/[.!?]\s+[^\s](?=[^.!?]*$)/);
        // Snap just past the punctuation, but only if that keeps the chunk
        // more than half full; otherwise cut at the window edge.
        const cut = sentenceMatch > maxChars / 2 ? pos + sentenceMatch + 1 : end;
        chunks.push(text.slice(pos, cut).trimEnd());
        // Step back by the overlap, but never to or before the current start.
        pos = Math.max(pos + 1, cut - overlapChars);
    }
    return chunks.filter((c) => c.length > 0);
}
|
|
96
|
+
/**
 * Split markdown at heading boundaries.
 *
 * Text with at most one heading-delimited section is handed to splitText.
 * Otherwise each section that fits the budget is emitted as is; a larger
 * section is packed paragraph by paragraph (paragraphs are separated by
 * blank lines), and a paragraph that is itself too large is packed sentence
 * by sentence.
 *
 * @param {string} text - Markdown text.
 * @param {object} options - Chunk options; `maxTokens` defaults to 512.
 * @returns {string[]} Non-blank chunks in document order.
 */
function splitMarkdown(text, options) {
    const budget = tokensToChars(options.maxTokens ?? 512);
    // Zero-width split: every heading line starts a new section.
    const sections = text
        .split(/(?=^#{1,6}\s+)/m)
        .filter((section) => section.trim().length > 0);
    if (sections.length <= 1) {
        return splitText(text, options);
    }
    const out = [];
    for (const section of sections) {
        if (section.length <= budget) {
            out.push(section);
            continue;
        }
        const paragraphs = section
            .split(/\n{2,}/)
            .filter((paragraph) => paragraph.trim().length > 0);
        let pending = '';
        for (const paragraph of paragraphs) {
            if (paragraph.length > budget) {
                // Close whatever is open, then fall back to sentence packing.
                if (pending.length > 0) {
                    out.push(pending.trimEnd());
                    pending = '';
                }
                out.push(...packSentences(splitSentences(paragraph), budget));
                continue;
            }
            // +2 accounts for the blank-line separator re-inserted on join.
            const overflows = pending.length > 0 && pending.length + paragraph.length + 2 > budget;
            if (overflows) {
                out.push(pending.trimEnd());
                pending = paragraph;
            }
            else {
                pending = pending.length > 0 ? `${pending}\n\n${paragraph}` : paragraph;
            }
        }
        if (pending.length > 0) {
            out.push(pending.trimEnd());
        }
    }
    return out.filter((piece) => piece.trim().length > 0);
}
|
|
138
|
+
/**
 * Split source code at top-level declaration boundaries.
 *
 * Code that fits the budget is returned whole. Otherwise it is cut in front
 * of every line that starts a function/class/def or a const/let/var
 * assignment (optionally prefixed by export/async), and neighbouring blocks
 * are merged while they fit. Code with no such boundary, and any single
 * block that is too large, is split at blank lines instead.
 *
 * @param {string} text - Source code.
 * @param {object} options - Chunk options; `maxTokens` defaults to 512.
 * @returns {string[]} Non-blank chunks in document order.
 */
function splitCode(text, options) {
    const budget = tokensToChars(options.maxTokens ?? 512);
    if (text.length <= budget) {
        return [text];
    }
    const declarationStart = /(?=^(?:export\s+)?(?:async\s+)?(?:function\s+\w|class\s+\w|def\s+\w|const\s+\w+\s*=|let\s+\w+\s*=|var\s+\w+\s*=))/m;
    const blocks = text
        .split(declarationStart)
        .filter((block) => block.trim().length > 0);
    if (blocks.length <= 1) {
        return splitAtBlankLines(text, budget);
    }
    const out = [];
    let pending = '';
    for (const block of blocks) {
        if (block.length > budget) {
            // Close whatever is open; the big block is split on its own.
            if (pending.length > 0) {
                out.push(pending.trimEnd());
                pending = '';
            }
            out.push(...splitAtBlankLines(block, budget));
            continue;
        }
        const overflows = pending.length > 0 && pending.length + block.length > budget;
        if (overflows) {
            out.push(pending.trimEnd());
            pending = block;
        }
        else {
            // Blocks keep their own trailing newlines, so no separator is added.
            pending += block;
        }
    }
    if (pending.length > 0) {
        out.push(pending.trimEnd());
    }
    return out.filter((piece) => piece.trim().length > 0);
}
|
|
171
|
+
/**
 * Pack blank-line-separated paragraphs into chunks of at most `maxChars`
 * characters, hard-splitting any single paragraph that is larger than that.
 *
 * Paragraphs that share a chunk are re-joined with a blank line. Hard-split
 * pieces are emitted as raw slices (not trimmed).
 *
 * @param {string} text - Text to split.
 * @param {number} maxChars - Character budget per chunk.
 * @returns {string[]} Chunks in source order.
 */
function splitAtBlankLines(text, maxChars) {
    const paragraphs = text.split(/\n{2,}/).filter(p => p.trim().length > 0);
    // The hard split must advance by a whole number of characters >= 1.
    // A zero/negative limit would otherwise loop forever, and a fractional
    // one would produce empty slices.
    const hardStep = Math.max(1, Math.floor(maxChars) || 0);
    // True when s[i], s[i + 1] form a UTF-16 surrogate pair.
    const isPairAt = (s, i) => {
        const hi = s.charCodeAt(i);
        const lo = s.charCodeAt(i + 1);
        return hi >= 0xd800 && hi <= 0xdbff && lo >= 0xdc00 && lo <= 0xdfff;
    };
    const result = [];
    let current = '';
    for (const para of paragraphs) {
        if (para.length > maxChars) {
            if (current.length > 0) {
                result.push(current.trimEnd());
                current = '';
            }
            // Hard split by chars
            let start = 0;
            while (start < para.length) {
                let end = Math.min(start + hardStep, para.length);
                if (end < para.length && isPairAt(para, end - 1)) {
                    // Never cut a surrogate pair in half: back off by one when
                    // the slice stays non-empty, otherwise take the whole pair.
                    end = end - start > 1 ? end - 1 : end + 1;
                }
                result.push(para.slice(start, end));
                start = end;
            }
        }
        else if (current.length + para.length + 2 > maxChars && current.length > 0) {
            // "+ 2" accounts for the blank-line separator that would be inserted
            result.push(current.trimEnd());
            current = para;
        }
        else {
            current += (current.length > 0 ? '\n\n' : '') + para;
        }
    }
    if (current.length > 0)
        result.push(current.trimEnd());
    return result;
}
|
|
198
|
+
/**
 * Split a JSON document into chunks that are each valid, pretty-printed JSON.
 *
 * - Arrays are split into sub-arrays of consecutive elements.
 * - Objects are split into sub-objects of consecutive top-level keys.
 * - Scalars (string, number, boolean, null) are returned unchanged.
 * - Text that does not parse as JSON falls back to splitByTokenCount.
 *
 * Entries are packed greedily using their real serialized size, so a chunk
 * only exceeds the budget when a single entry is itself larger than the
 * budget. Empty arrays/objects produce no chunks.
 *
 * @param {string} text - JSON source text.
 * @param {{ maxTokens?: number, overlap?: number }} options - maxTokens defaults
 *   to 512; overlap (default 0) is only used by the non-JSON fallback.
 * @returns {string[]} Chunks in document order.
 */
function splitJSON(text, options) {
    const maxChars = tokensToChars(options.maxTokens ?? 512);
    let parsed;
    try {
        parsed = JSON.parse(text);
    }
    catch {
        // Deliberate best-effort: unparseable input is chunked as plain text.
        return splitByTokenCount(text, options.maxTokens ?? 512, options.overlap ?? 0);
    }
    const isArray = Array.isArray(parsed);
    if (!isArray && (parsed === null || typeof parsed !== 'object')) {
        return [text];
    }
    // Rebuild a container from [key, value] pairs. Object.fromEntries defines
    // own properties, so a "__proto__" key survives instead of being swallowed
    // by the prototype setter as with plain `obj[key] = value` assignment.
    const serialize = (entries) => JSON.stringify(isArray ? entries.map(([, value]) => value) : Object.fromEntries(entries), null, 2);
    const chunks = [];
    let group = [];
    let groupSize = 0;
    // Object.entries yields [index, value] for arrays and [key, value] for objects.
    for (const entry of Object.entries(parsed)) {
        // Size of the entry wrapped in its own container: an upper bound on
        // what it adds to a shared chunk (brackets cost more than separators).
        const entrySize = serialize([entry]).length;
        if (groupSize + entrySize > maxChars && group.length > 0) {
            chunks.push(serialize(group));
            group = [];
            groupSize = 0;
        }
        group.push(entry);
        groupSize += entrySize;
    }
    if (group.length > 0) {
        chunks.push(serialize(group));
    }
    return chunks.filter(c => c.trim().length > 0);
}
|
|
239
|
+
/**
 * Chunk plain text at paragraph boundaries.
 *
 * Text within the budget is returned as one chunk. Otherwise paragraphs
 * (separated by two or more newlines) are accumulated, re-joined with a blank
 * line, until the next one would overflow the budget. A paragraph that is too
 * large by itself is broken up via splitSentences + packSentences.
 *
 * @param {string} text - Text to chunk.
 * @param {{ maxTokens?: number }} options - Chunking options; maxTokens defaults to 512.
 * @returns {string[]} Chunks in document order, whitespace-only chunks removed.
 */
function splitText(text, options) {
    const budget = tokensToChars(options.maxTokens ?? 512);
    if (text.length <= budget) {
        return [text];
    }
    const chunks = [];
    let pending = '';
    // Emit the accumulated paragraphs (if any) and start a fresh accumulator.
    const flushPending = () => {
        if (pending.length > 0) {
            chunks.push(pending.trimEnd());
            pending = '';
        }
    };
    for (const para of text.split(/\n{2,}/)) {
        if (para.trim().length === 0) {
            continue;
        }
        if (para.length > budget) {
            flushPending();
            // Oversized paragraph: drop down to sentence granularity
            chunks.push(...packSentences(splitSentences(para), budget));
            continue;
        }
        if (pending.length === 0) {
            pending = para;
            continue;
        }
        // The extra 2 is the '\n\n' separator that joining would add
        if (pending.length + para.length + 2 > budget) {
            flushPending();
            pending = para;
        }
        else {
            pending += '\n\n' + para;
        }
    }
    flushPending();
    return chunks.filter(chunk => chunk.trim().length > 0);
}
|
|
269
|
+
//# sourceMappingURL=split.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"split.js","sourceRoot":"","sources":["../src/split.ts"],"names":[],"mappings":";;AAwDA,8CAiCC;AAED,sCA0CC;AAED,8BAkCC;AA6BD,8BA2CC;AAED,8BA8BC;AA/QD,SAAS,aAAa,CAAC,MAAc;IACnC,OAAO,MAAM,GAAG,CAAC,CAAA;AACnB,CAAC;AAED;;;GAGG;AACH,SAAS,cAAc,CAAC,IAAY;IAClC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,eAAe,CAAC,CAAA;IACzC,OAAO,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AACxC,CAAC;AAED;;GAEG;AACH,SAAS,aAAa,CAAC,SAAmB,EAAE,QAAgB;IAC1D,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAChB,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;QACjC,IAAI,QAAQ,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC/B,0DAA0D;YAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,KAAK,CAAC,CAAA;YACnC,IAAI,SAAS,GAAG,EAAE,CAAA;YAClB,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;gBACzB,IAAI,SAAS,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC1E,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,EAAE,CAAC,CAAA;oBAChC,SAAS,GAAG,EAAE,CAAA;gBAChB,CAAC;gBACD,8CAA8C;gBAC9C,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;oBAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,IAAI,QAAQ,EAAE,CAAC;wBAC/C,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAA;oBAC1C,CAAC;oBACD,SAAQ;gBACV,CAAC;gBACD,SAAS,IAAI,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;YACvD,CAAC;YACD,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC;gBAAE,OAAO,GAAG,SAAS,CAAA;QAC/C,CAAC;aAAM,IAAI,OAAO,CAAC,MAAM,GAAG,QAAQ,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACjF,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,QAAQ,CAAA;QACpB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAA;QACvD,CAAC;IACH,CAAC;IACD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO,M
AAM,CAAA;AACf,CAAC;AAED,SAAgB,iBAAiB,CAAC,IAAY,EAAE,SAAiB,EAAE,OAAe;IAChF,MAAM,QAAQ,GAAG,aAAa,CAAC,SAAS,CAAC,CAAA;IACzC,MAAM,YAAY,GAAG,aAAa,CAAC,OAAO,CAAC,CAAA;IAE3C,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAE1C,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,GAAG,GAAG,CAAC,CAAA;IAEX,OAAO,GAAG,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QACzB,IAAI,GAAG,GAAG,GAAG,GAAG,QAAQ,CAAA;QACxB,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM,EAAE,CAAC;YACvB,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;YAC5B,MAAK;QACP,CAAC;QAED,2CAA2C;QAC3C,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,CAAC,CAAA;QACnC,MAAM,aAAa,GAAG,MAAM,CAAC,MAAM,CAAC,2BAA2B,CAAC,CAAA;QAChE,IAAI,aAAa,GAAG,QAAQ,GAAG,CAAC,EAAE,CAAC;YACjC,4BAA4B;YAC5B,MAAM,OAAO,GAAG,GAAG,GAAG,aAAa,GAAG,CAAC,CAAA;YACvC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC,OAAO,EAAE,CAAC,CAAA;YAC/C,GAAG,GAAG,OAAO,GAAG,YAAY,CAAA;YAC5B,IAAI,GAAG,GAAG,CAAC;gBAAE,GAAG,GAAG,CAAC,CAAA;QACtB,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,OAAO,EAAE,CAAC,CAAA;YAC3C,GAAG,GAAG,GAAG,GAAG,YAAY,CAAA;YACxB,IAAI,GAAG,IAAI,CAAC,IAAI,GAAG,IAAI,IAAI,CAAC,MAAM;gBAAE,MAAK;QAC3C,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AACzC,CAAC;AAED,SAAgB,aAAa,CAAC,IAAY,EAAE,OAAqB;IAC/D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,8BAA8B;IAC9B,MAAM,YAAY,GAAG,iBAAiB,CAAA;IACtC,MAAM,QAAQ,GAAG,IAAI,CAAC,KAAK,CAAC,YAAY,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAE1E,IAAI,QAAQ,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACzB,mCAAmC;QACnC,OAAO,SAAS,CAAC,IAAI,EAAE,OAAO,CAAC,CAAA;IACjC,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAE3B,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,IAAI,OAAO,CAAC,MAAM,IAAI,QAAQ,EAAE,CAAC;YAC/B,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,CAAA;QACtB,CAAC;aAAM,CAAC;YACN,iDAAiD;YACjD,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;YAC3E,IAA
I,OAAO,GAAG,EAAE,CAAA;YAChB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;gBAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;oBAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;wBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;wBAC9B,OAAO,GAAG,EAAE,CAAA;oBACd,CAAC;oBACD,wCAAwC;oBACxC,MAAM,cAAc,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,QAAQ,CAAC,CAAA;oBACpE,MAAM,CAAC,IAAI,CAAC,GAAG,cAAc,CAAC,CAAA;gBAChC,CAAC;qBAAM,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;oBAC7E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;oBAC9B,OAAO,GAAG,IAAI,CAAA;gBAChB,CAAC;qBAAM,CAAC;oBACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;gBACtD,CAAC;YACH,CAAC;YACD,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;gBAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;QACxD,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC;AAED,SAAgB,SAAS,CAAC,IAAY,EAAE,OAAqB;IAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAE1C,6DAA6D;IAC7D,MAAM,aAAa,GAAG,oHAAoH,CAAA;IAC1I,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAEzE,IAAI,MAAM,CAAC,MAAM,IAAI,CAAC,EAAE,CAAC;QACvB,gDAAgD;QAChD,OAAO,iBAAiB,CAAC,IAAI,EAAE,QAAQ,CAAC,CAAA;IAC1C,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAEhB,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,KAAK,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC5B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,MAAM,CAAC,IAAI,CAAC,GAAG,iBAAiB,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC,CAAA;QACpD,CAAC;aAAM,IAAI,OAAO,CAAC,MAAM,GAAG,KAAK,CAAC,MAAM,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC1E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,KAAK,CAAA;QACjB,CAAC;aAAM,CAAC;YA
CN,OAAO,IAAI,KAAK,CAAA;QAClB,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAY,EAAE,QAAgB;IACvD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IACxE,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAEhB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,sBAAsB;YACtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,IAAI,QAAQ,EAAE,CAAC;gBAC/C,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,QAAQ,CAAC,CAAC,CAAA;YAC1C,CAAC;QACH,CAAC;aAAM,IAAI,OAAO,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,IAAI,CAAA;QAChB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;QACtD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO,MAAM,CAAA;AACf,CAAC;AAED,SAAgB,SAAS,CAAC,IAAY,EAAE,OAAqB;IAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,IAAI,MAAe,CAAA;IACnB,IAAI,CAAC;QACH,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAA;IAC3B,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,iBAAiB,CAAC,IAAI,EAAE,OAAO,CAAC,SAAS,IAAI,GAAG,EAAE,OAAO,CAAC,OAAO,IAAI,CAAC,CAAC,CAAA;IAChF,CAAC;IAED,MAAM,MAAM,GAAa,EAAE,CAAA;IAE3B,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC1B,mCAAmC;QACnC,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,GAAG,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,CAAA;QAClG,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,
GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,IAAI,aAAa,EAAE,CAAC;YACtD,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,GAAG,aAAa,CAAC,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;QAC1E,CAAC;IACH,CAAC;SAAM,IAAI,MAAM,KAAK,IAAI,IAAI,OAAO,MAAM,KAAK,QAAQ,EAAE,CAAC;QACzD,MAAM,GAAG,GAAG,MAAiC,CAAA;QAC7C,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAA;QAC7B,IAAI,UAAU,GAA4B,EAAE,CAAA;QAC5C,IAAI,WAAW,GAAG,CAAC,CAAA;QAEnB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACvB,MAAM,KAAK,GAAG,IAAI,CAAC,SAAS,CAAC,EAAE,CAAC,GAAG,CAAC,EAAE,GAAG,CAAC,GAAG,CAAC,EAAE,EAAE,IAAI,EAAE,CAAC,CAAC,CAAA;YAC1D,IAAI,WAAW,GAAG,KAAK,CAAC,MAAM,GAAG,QAAQ,IAAI,WAAW,GAAG,CAAC,EAAE,CAAC;gBAC7D,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;gBAChD,UAAU,GAAG,EAAE,CAAA;gBACf,WAAW,GAAG,CAAC,CAAA;YACjB,CAAC;YACD,UAAU,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,CAAA;YAC1B,WAAW,IAAI,KAAK,CAAC,MAAM,CAAA;QAC7B,CAAC;QAED,IAAI,MAAM,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvC,MAAM,CAAC,IAAI,CAAC,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC,CAAA;QAClD,CAAC;IACH,CAAC;SAAM,CAAC;QACN,OAAO,CAAC,IAAI,CAAC,CAAA;IACf,CAAC;IAED,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC;AAED,SAAgB,SAAS,CAAC,IAAY,EAAE,OAAqB;IAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,OAAO,CAAC,SAAS,IAAI,GAAG,CAAC,CAAA;IAExD,IAAI,IAAI,CAAC,MAAM,IAAI,QAAQ;QAAE,OAAO,CAAC,IAAI,CAAC,CAAA;IAE1C,iDAAiD;IACjD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;IAExE,MAAM,MAAM,GAAa,EAAE,CAAA;IAC3B,IAAI,OAAO,GAAG,EAAE,CAAA;IAEhB,KAAK,MAAM,IAAI,IAAI,UAAU,EAAE,CAAC;QAC9B,IAAI,IAAI,CAAC,MAAM,GAAG,QAAQ,EAAE,CAAC;YAC3B,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBACvB,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;gBAC9B,OAAO,GAAG,EAAE,CAAA;YACd,CAAC;YACD,+BAA+B;YAC/B,MAAM,cAAc,GAAG,aAAa,CAAC,cAAc,CAAC,IAAI,CAAC,EAAE,QAAQ,CAAC,CAAA;YACpE,MAAM,CAAC,IAAI,CAAC,GAAG,cAAc,CAAC,CAAA;QAChC,CAAC;aAAM,IAAI,O
AAO,CAAC,MAAM,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,GAAG,QAAQ,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC7E,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;YAC9B,OAAO,GAAG,IAAI,CAAA;QAChB,CAAC;aAAM,CAAC;YACN,OAAO,IAAI,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,GAAG,IAAI,CAAA;QACtD,CAAC;IACH,CAAC;IAED,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,MAAM,CAAC,IAAI,CAAC,OAAO,CAAC,OAAO,EAAE,CAAC,CAAA;IACtD,OAAO,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,CAAA;AAChD,CAAC"}
|
package/dist/types.d.ts
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/** The content categories a piece of text can be classified as. */
export type ContentType = 'markdown' | 'code' | 'html' | 'json' | 'yaml' | 'text';
|
|
2
|
+
/** Outcome of content-type detection. */
export interface DetectResult {
    /** The detected content category. */
    type: ContentType;
    /** Numeric confidence in `type`. NOTE(review): scale is not visible here — confirm against detect.js. */
    confidence: number;
}
|
|
6
|
+
/**
 * Descriptive data attached to every chunk.
 * NOTE(review): exact semantics of each field are set by the chunker, which
 * is not visible from this declaration — the hints below are to be confirmed.
 */
export interface ChunkMetadata {
    /** Presumably the chunk's position in the output list — confirm. */
    index: number;
    /** Presumably where the chunk starts in the source text — confirm unit and base. */
    startOffset: number;
    /** Presumably where the chunk ends in the source text — confirm whether exclusive. */
    endOffset: number;
    /** Token count for the chunk; how tokens are counted is not visible here. */
    tokenCount: number;
    /** Character count for the chunk. */
    charCount: number;
    /** Content category associated with the chunk. */
    contentType: ContentType;
    /** Heading strings associated with the chunk — TODO confirm ordering and scope. */
    headings: string[];
    /** Optional language of the code in the chunk, when known. */
    codeLanguage?: string;
    /** Overlap with the preceding chunk — TODO confirm unit (tokens vs. characters). */
    overlapBefore: number;
    /** Overlap with the following chunk — TODO confirm unit (tokens vs. characters). */
    overlapAfter: number;
}
|
|
18
|
+
/** One piece of chunked text together with its metadata. */
export interface Chunk {
    /** The chunk's text. */
    content: string;
    /** Descriptive data about the chunk. */
    metadata: ChunkMetadata;
}
|
|
22
|
+
/** Options accepted by the chunking functions; every field is optional. */
export interface ChunkOptions {
    /** Upper size limit per chunk, in tokens. The splitters in split.js fall back to 512 when omitted. */
    maxTokens?: number;
    /** Lower size limit per chunk, in tokens. NOTE(review): how it is enforced is not visible here. */
    minTokens?: number;
    /** Overlap between consecutive chunks. splitJSON's non-JSON fallback treats a missing value as 0. */
    overlap?: number;
    /** Content category to use. NOTE(review): presumably bypasses auto-detection — confirm. */
    contentType?: ContentType;
    /** NOTE(review): presumably toggles structure-aware splitting — confirm against the chunker. */
    preserveStructure?: boolean;
}
|
|
29
|
+
/** Public surface of a chunker instance. */
export interface Chunker {
    /**
     * Chunk `text`. `overrides` are per-call options.
     * NOTE(review): presumably merged over the instance defaults — confirm.
     */
    chunk(text: string, overrides?: Partial<ChunkOptions>): Chunk[];
    /** Chunk `text` as markdown. */
    chunkMarkdown(text: string, options?: Partial<ChunkOptions>): Chunk[];
    /** Chunk `text` as source code. */
    chunkCode(text: string, options?: Partial<ChunkOptions>): Chunk[];
    /** Chunk `text` as JSON. */
    chunkJSON(text: string, options?: Partial<ChunkOptions>): Chunk[];
    /** Classify `text` and report the detected content type with a confidence value. */
    detectContentType(text: string): DetectResult;
}
|
|
36
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,WAAW,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,GAAG,MAAM,CAAA;AAEjF,MAAM,WAAW,YAAY;IAAG,IAAI,EAAE,WAAW,CAAC;IAAC,UAAU,EAAE,MAAM,CAAA;CAAE;AAEvE,MAAM,WAAW,aAAa;IAC5B,KAAK,EAAE,MAAM,CAAA;IACb,WAAW,EAAE,MAAM,CAAA;IACnB,SAAS,EAAE,MAAM,CAAA;IACjB,UAAU,EAAE,MAAM,CAAA;IAClB,SAAS,EAAE,MAAM,CAAA;IACjB,WAAW,EAAE,WAAW,CAAA;IACxB,QAAQ,EAAE,MAAM,EAAE,CAAA;IAClB,YAAY,CAAC,EAAE,MAAM,CAAA;IACrB,aAAa,EAAE,MAAM,CAAA;IACrB,YAAY,EAAE,MAAM,CAAA;CACrB;AAED,MAAM,WAAW,KAAK;IAAG,OAAO,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,aAAa,CAAA;CAAE;AAEnE,MAAM,WAAW,YAAY;IAC3B,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,OAAO,CAAC,EAAE,MAAM,CAAA;IAChB,WAAW,CAAC,EAAE,WAAW,CAAA;IACzB,iBAAiB,CAAC,EAAE,OAAO,CAAA;CAC5B;AAED,MAAM,WAAW,OAAO;IACtB,KAAK,CAAC,IAAI,EAAE,MAAM,EAAE,SAAS,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IAC/D,aAAa,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IACrE,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IACjE,SAAS,CAAC,IAAI,EAAE,MAAM,EAAE,OAAO,CAAC,EAAE,OAAO,CAAC,YAAY,CAAC,GAAG,KAAK,EAAE,CAAA;IACjE,iBAAiB,CAAC,IAAI,EAAE,MAAM,GAAG,YAAY,CAAA;CAC9C"}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../src/types.ts"],"names":[],"mappings":""}
|
package/package.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "chunk-smart",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Structure-aware text chunker for RAG pipelines",
|
|
5
|
+
"main": "dist/index.js",
|
|
6
|
+
"types": "dist/index.d.ts",
|
|
7
|
+
"files": [
|
|
8
|
+
"dist"
|
|
9
|
+
],
|
|
10
|
+
"scripts": {
|
|
11
|
+
"build": "tsc",
|
|
12
|
+
"test": "vitest run",
|
|
13
|
+
"lint": "eslint src/",
|
|
14
|
+
"prepublishOnly": "npm run build"
|
|
15
|
+
},
|
|
16
|
+
"keywords": [
|
|
17
|
+
"chunk",
|
|
18
|
+
"rag",
|
|
19
|
+
"llm",
|
|
20
|
+
"text-splitting",
|
|
21
|
+
"markdown",
|
|
22
|
+
"tokenizer"
|
|
23
|
+
],
|
|
24
|
+
"author": "",
|
|
25
|
+
"license": "MIT",
|
|
26
|
+
"engines": {
|
|
27
|
+
"node": ">=18"
|
|
28
|
+
},
|
|
29
|
+
"publishConfig": {
|
|
30
|
+
"access": "public"
|
|
31
|
+
},
|
|
32
|
+
"devDependencies": {
|
|
33
|
+
"@types/node": "^25.5.0",
|
|
34
|
+
"@typescript-eslint/eslint-plugin": "^8.57.1",
|
|
35
|
+
"@typescript-eslint/parser": "^8.57.1",
|
|
36
|
+
"eslint": "^10.1.0",
|
|
37
|
+
"typescript": "^5.9.3",
|
|
38
|
+
"vitest": "^4.1.0"
|
|
39
|
+
}
|
|
40
|
+
}
|