@storepress/llm-md-text-splitter 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +816 -0
- package/package.json +79 -0
- package/src/MarkdownTextSplitter.d.ts +304 -0
- package/src/MarkdownTextSplitter.js +1432 -0
package/package.json
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@storepress/llm-md-text-splitter",
|
|
3
|
+
"version": "0.0.1",
|
|
4
|
+
"description": "High-performance streaming Markdown text splitter for LLM pipelines and RAG systems. Zero sequence loss for code blocks, tables, links, and videos. 5 built-in strategies + custom. Zero dependencies.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./src/MarkdownTextSplitter.js",
|
|
7
|
+
"module": "./src/MarkdownTextSplitter.js",
|
|
8
|
+
"publishConfig": {
|
|
9
|
+
"access": "public"
|
|
10
|
+
},
|
|
11
|
+
"exports": {
|
|
12
|
+
".": {
|
|
13
|
+
"import": "./src/MarkdownTextSplitter.js",
|
|
14
|
+
"default": "./src/MarkdownTextSplitter.js"
|
|
15
|
+
}
|
|
16
|
+
},
|
|
17
|
+
"browser": "./src/MarkdownTextSplitter.js",
|
|
18
|
+
"types": "./src/MarkdownTextSplitter.d.ts",
|
|
19
|
+
"files": [
|
|
20
|
+
"src/**",
|
|
21
|
+
"README.md",
|
|
22
|
+
"LICENSE"
|
|
23
|
+
],
|
|
24
|
+
"scripts": {
|
|
25
|
+
"test": "node --test test/test.mjs",
|
|
26
|
+
"test:watch": "node --test --watch test/test.mjs",
|
|
27
|
+
"demo": "npx serve .",
|
|
28
|
+
"lint": "eslint src/",
|
|
29
|
+
"prepublishOnly": "npm test"
|
|
30
|
+
},
|
|
31
|
+
"keywords": [
|
|
32
|
+
"markdown",
|
|
33
|
+
"text-splitter",
|
|
34
|
+
"llm",
|
|
35
|
+
"rag",
|
|
36
|
+
"chunking",
|
|
37
|
+
"streaming",
|
|
38
|
+
"zero-loss",
|
|
39
|
+
"code-blocks",
|
|
40
|
+
"semantic",
|
|
41
|
+
"tokenizer",
|
|
42
|
+
"langchain",
|
|
43
|
+
"vector-database",
|
|
44
|
+
"embeddings",
|
|
45
|
+
"ai",
|
|
46
|
+
"openai",
|
|
47
|
+
"anthropic",
|
|
48
|
+
"gpt",
|
|
49
|
+
"claude",
|
|
50
|
+
"pinecone",
|
|
51
|
+
"weaviate",
|
|
52
|
+
"chromadb",
|
|
53
|
+
"text-processing",
|
|
54
|
+
"document-processing",
|
|
55
|
+
"markdown-parser",
|
|
56
|
+
"context-window",
|
|
57
|
+
"chunk",
|
|
58
|
+
"splitter",
|
|
59
|
+
"delimiter",
|
|
60
|
+
"word-limit",
|
|
61
|
+
"char-limit",
|
|
62
|
+
"token-limit",
|
|
63
|
+
"browser",
|
|
64
|
+
"esm"
|
|
65
|
+
],
|
|
66
|
+
"author": "StorePress",
|
|
67
|
+
"license": "MIT",
|
|
68
|
+
"repository": {
|
|
69
|
+
"type": "git",
|
|
70
|
+
"url": "https://github.com/EmranAhmed/storepress-llm-md-text-splitter.git"
|
|
71
|
+
},
|
|
72
|
+
"bugs": {
|
|
73
|
+
"url": "https://github.com/EmranAhmed/storepress-llm-md-text-splitter/issues"
|
|
74
|
+
},
|
|
75
|
+
"homepage": "https://emranahmed.github.io/storepress-llm-md-text-splitter/",
|
|
76
|
+
"sideEffects": false,
|
|
77
|
+
"dependencies": {},
|
|
78
|
+
"devDependencies": {}
|
|
79
|
+
}
|
|
package/src/MarkdownTextSplitter.d.ts
ADDED
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
/**
|
|
2
|
+
 * @storepress/llm-md-text-splitter — TypeScript Declarations
|
|
3
|
+
 * @module @storepress/llm-md-text-splitter
|
|
4
|
+
 * @version 0.0.1
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
// ── Block Types ──
|
|
8
|
+
|
|
9
|
+
/**
 * Read-only lookup of the string tags used to classify markdown blocks.
 *
 * Every key maps to the exact literal that appears in `BlockTypeName`,
 * so `BlockType.CODE_BLOCK` has the type `"code_block"`, not `string`.
 */
export declare const BlockType: Readonly<{
  HEADING: "heading";
  PARAGRAPH: "paragraph";
  CODE_BLOCK: "code_block";
  LIST: "list";
  BLOCKQUOTE: "blockquote";
  TABLE: "table";
  LINK_REF: "link_reference";
  VIDEO_EMBED: "video_embed";
  HR: "hr";
  EMPTY: "empty";
  FRONTMATTER: "frontmatter";
  HTML_BLOCK: "html_block";
  IMAGE: "image";
}>;
|
|
24
|
+
|
|
25
|
+
/** Union of every string literal held by `BlockType` (e.g. `"heading" | "paragraph" | …`). */
export type BlockTypeName = typeof BlockType[keyof typeof BlockType];
|
|
26
|
+
|
|
27
|
+
// ── Configuration ──
|
|
28
|
+
|
|
29
|
+
/**
 * Per-strategy tuning knobs, passed as `SplitterConfig.strategyOptions` or
 * as the second argument of `MarkdownTextSplitter.setStrategy`.
 *
 * Each named option is read by the strategy mentioned in its comment.
 * The index signature lets custom strategies carry arbitrary extra keys.
 */
export interface StrategyOptions {
  /** DelimiterStrategy: string to split on. Default: `'---'`. */
  delimiter?: string;
  /** DelimiterStrategy: keep the delimiter in the output. Default: `false`. */
  keepDelimiter?: boolean;
  /** DelimiterStrategy: trim whitespace from chunk edges. Default: `true`. */
  trimChunks?: boolean;
  /** CharLimitStrategy: max characters per chunk. Default: `maxChunkTokens × charsPerToken`. */
  charLimit?: number;
  /** CharLimitStrategy / WordLimitStrategy: overlap amount. */
  overlap?: number;
  /** WordLimitStrategy: max words per chunk. Default: `1000`. */
  wordLimit?: number;
  /** TokenLimitStrategy: max tokens per chunk. Default: `maxChunkTokens`. */
  tokenLimit?: number;

  /** Custom strategy options (any key-value pairs). Values are `unknown`; narrow before use. */
  [key: string]: unknown;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Options accepted by the `MarkdownTextSplitter` constructor.
 *
 * Every field is optional. The documented defaults are the values the
 * comments in this declaration file state.
 */
export interface SplitterConfig {
  /** Target max tokens per chunk (~4 chars/token). Default: `1500`. */
  maxChunkTokens?: number;
  /** Tokens of overlap between consecutive chunks. Default: `150`. */
  overlapTokens?: number;
  /** Characters-per-token ratio used for estimation. Default: `4`. */
  charsPerToken?: number;
  /** HTTP fetch timeout in milliseconds. Default: `60000`. */
  fetchTimeoutMs?: number;
  /** Group code blocks with surrounding text (semantic strategy). Default: `true`. */
  preserveCodeContext?: boolean;
  /** Group reference links with their sections (semantic strategy). Default: `true`. */
  preserveLinks?: boolean;
  /** Group video embeds with their context (semantic strategy). Default: `true`. */
  preserveVideos?: boolean;
  /** Prefix for generated chunk IDs. Default: `'chunk'`. */
  chunkIdPrefix?: string;
  /** Regex used to detect video embed lines. */
  videoPattern?: RegExp;
  /** Regex used to detect reference-style link definitions. */
  linkRefPattern?: RegExp;
  /**
   * Active strategy name. Default: `'semantic'`.
   *
   * Any string is accepted, so custom strategies registered under their own
   * name still type-check. The `string & {}` arm (instead of bare `string`)
   * stops the union collapsing to `string`, so editors keep suggesting the
   * built-in names.
   */
  // eslint-disable-next-line @typescript-eslint/ban-types -- intentional `string & {}` idiom
  strategy?: "semantic" | "delimiter" | "char" | "word" | "token" | (string & {});
  /** Strategy-specific options. */
  strategyOptions?: StrategyOptions;
}
|
|
75
|
+
|
|
76
|
+
/** Read-only configuration object in which every `SplitterConfig` option is present. */
export declare const DEFAULT_CONFIG: Readonly<Required<SplitterConfig>>;
|
|
77
|
+
|
|
78
|
+
// ── Chunk Output ──
|
|
79
|
+
|
|
80
|
+
/** One link extracted from markdown text, as returned by `extractLinks`. */
export interface ExtractedLink {
  /** Text associated with the link (presumably the visible label — confirm against the implementation). */
  text: string;
  /** Link target. */
  url: string;
}
|
|
84
|
+
|
|
85
|
+
/** Metadata for one video embed, as returned by `extractVideos`. */
export interface ExtractedVideo {
  /** Hosting platform; only these two values are representable. */
  platform: "youtube" | "vimeo";
  /** URL of the embed. */
  url: string;
  /** Platform-specific video identifier (presumably parsed out of `url` — confirm). */
  videoId: string;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Line span in the original document covered by a chunk or semantic block.
 *
 * NOTE(review): whether the numbers are 0- or 1-based, and whether `end` is
 * inclusive, is not stated in these declarations — confirm against the
 * implementation.
 */
export interface ChunkLines {
  /** First line of the span. */
  start: number;
  /** Last line of the span. */
  end: number;
}
|
|
95
|
+
|
|
96
|
+
/**
 * One output unit of the splitter: the chunk text plus the metadata computed
 * for it. Produced by every `SplittingStrategy.process` implementation and
 * returned by the `split*` / `stream*` methods of `MarkdownTextSplitter`.
 */
export interface Chunk {
  /** Deterministic FNV-1a hash-based ID. */
  id: string;
  /** Sequential position (0-based). */
  index: number;
  /** The actual text content. */
  content: string;
  /** Approximate LLM token count. */
  tokenEstimate: number;
  /** Tokens repeated from the previous chunk. */
  overlapTokens: number;
  /** Exact character count. */
  charCount: number;
  /** Word count. */
  wordCount: number;
  /** Original line numbers. */
  lines: ChunkLines;
  /** Nearest heading text (semantic strategy); `null` when there is none. */
  heading: string | null;
  /** Full heading breadcrumb trail. */
  headingPath: string[];
  /** Heading depth (1–6); `null` when there is no heading. */
  headingLevel: number | null;
  /** Contains fenced code blocks. */
  hasCode: boolean;
  /** Contains markdown tables. */
  hasTable: boolean;
  /** Contains video embeds. */
  hasVideo: boolean;
  /** Programming languages detected in code blocks. */
  languages: string[];
  /** Extracted inline and reference links. */
  links: ExtractedLink[];
  /** Extracted video embed metadata. */
  videos: ExtractedVideo[];
  /** Exceeds 1.5× the target chunk size. */
  isOversized: boolean;
  /** Contains atomic blocks (code/table/video). */
  containsAtomicBlock: boolean;
  /** Semantic block types present. */
  blockTypes: BlockTypeName[];
  /** Which strategy produced this chunk. */
  strategy: string;
  /** Extensible metadata; values are `unknown` and must be narrowed before use. */
  metadata: Record<string, unknown>;
}
|
|
142
|
+
|
|
143
|
+
// ── Statistics ──
|
|
144
|
+
|
|
145
|
+
/**
 * Aggregate figures reported by `MarkdownTextSplitter.getStats()` for the
 * last split.
 */
export interface SplitterStats {
  /** Number of chunks. */
  totalChunks: number;
  /** Token total across chunks. */
  totalTokens: number;
  /** Character total across chunks. */
  totalChars: number;
  /** Word total across chunks. */
  totalWords: number;
  /** Count of oversized chunks (presumably those with `isOversized` set — confirm). */
  oversizedChunks: number;
  /** Count of chunks involving code blocks. */
  codeBlockChunks: number;
  /** Count of chunks involving tables. */
  tableChunks: number;
  /** Count of chunks involving video embeds. */
  videoChunks: number;
  /** Processing time in milliseconds. */
  processingTimeMs: number;
  /** Label for the input source. NOTE(review): the set of possible values is not declared here. */
  source: string;
}
|
|
157
|
+
|
|
158
|
+
// ── Line Iterator ──
|
|
159
|
+
|
|
160
|
+
/**
 * One line of input as yielded by `streamToLines` / `stringToLines` and
 * consumed by `SplittingStrategy.process` and `SemanticParser.parse`.
 */
export interface LineObject {
  /** Position of the line in the input. NOTE(review): 0- vs 1-based is not stated here — confirm. */
  lineNumber: number;
  /** Text of the line. */
  text: string;
}
|
|
164
|
+
|
|
165
|
+
// ── Semantic Block ──
|
|
166
|
+
|
|
167
|
+
/** One classified markdown block, as yielded by `SemanticParser.parse`. */
export interface SemanticBlock {
  /** Classification tag; one of the `BlockType` literals. */
  type: BlockTypeName;
  /** Text of the block. */
  content: string;
  /** Line span of the block in the input. */
  lines: ChunkLines;
  /** Heading text associated with the block, or `null`. */
  heading: string | null;
  /** Depth of that heading, or `null`. */
  headingLevel: number | null;
  /** Heading breadcrumb trail for the block. */
  headingPath: string[];
  /** Language associated with the block (presumably the code-fence language — confirm), or `null`. */
  language: string | null;
  /** Whether the block is atomic (presumably must not be split across chunks — confirm). */
  isAtomic: boolean;
  /** Links found in the block. */
  links: ExtractedLink[];
  /** Video embeds found in the block. */
  videos: ExtractedVideo[];
  /** Extensible metadata; values are `unknown` and must be narrowed before use. */
  metadata: Record<string, unknown>;
}
|
|
180
|
+
|
|
181
|
+
// ── Strategy Interface ──
|
|
182
|
+
|
|
183
|
+
/**
 * Contract every splitting strategy fulfils, whether built-in or custom.
 */
export interface SplittingStrategy {
  /** Name of the strategy (the built-in classes use `"semantic"`, `"delimiter"`, `"char"`, `"word"`, `"token"`). */
  name: string;

  /**
   * Consume input lines and produce chunks lazily.
   *
   * @param lineIterator Async source of `LineObject`s.
   * @returns Async generator yielding one `Chunk` at a time.
   */
  process(lineIterator: AsyncIterable<LineObject>): AsyncGenerator<Chunk>;
}
|
|
188
|
+
|
|
189
|
+
/**
 * Shape of a strategy class accepted by
 * `MarkdownTextSplitter.registerStrategy`: it must be constructible from a
 * configuration in which every `SplitterConfig` option is present.
 */
export interface SplittingStrategyConstructor {
  new (config: Required<SplitterConfig>): SplittingStrategy;
}
|
|
192
|
+
|
|
193
|
+
// ── Strategy Classes ──
|
|
194
|
+
|
|
195
|
+
/** Built-in strategy whose `name` is the literal `"semantic"`. */
export declare class SemanticStrategy implements SplittingStrategy {
  name: "semantic";

  /** @param config Configuration with every `SplitterConfig` option present. */
  constructor(config: Required<SplitterConfig>);

  /**
   * @param lineIterator Async source of input lines.
   * @returns Async generator yielding chunks.
   */
  process(lineIterator: AsyncIterable<LineObject>): AsyncGenerator<Chunk>;
}
|
|
202
|
+
|
|
203
|
+
/**
 * Built-in strategy whose `name` is the literal `"delimiter"`.
 * Its options (`delimiter`, `keepDelimiter`, `trimChunks`) are declared on
 * `StrategyOptions`.
 */
export declare class DelimiterStrategy implements SplittingStrategy {
  name: "delimiter";

  /** @param config Configuration with every `SplitterConfig` option present. */
  constructor(config: Required<SplitterConfig>);

  /**
   * @param lineIterator Async source of input lines.
   * @returns Async generator yielding chunks.
   */
  process(lineIterator: AsyncIterable<LineObject>): AsyncGenerator<Chunk>;
}
|
|
210
|
+
|
|
211
|
+
/**
 * Built-in character-limit strategy. Its options (`charLimit`, `overlap`)
 * are declared on `StrategyOptions`.
 */
export declare class CharLimitStrategy implements SplittingStrategy {
  /**
   * `"char"` for this class itself. The type also admits `"token"` because
   * `TokenLimitStrategy` extends this class and declares `name: "token"`.
   * With a bare `"char"` here that subclass declaration is rejected
   * (TS2416: property not assignable to the same property in the base type).
   */
  name: "char" | "token";

  /** @param config Configuration with every `SplitterConfig` option present. */
  constructor(config: Required<SplitterConfig>);

  /**
   * @param lineIterator Async source of input lines.
   * @returns Async generator yielding chunks.
   */
  process(lineIterator: AsyncIterable<LineObject>): AsyncGenerator<Chunk>;
}
|
|
218
|
+
|
|
219
|
+
/**
 * Built-in strategy whose `name` is the literal `"word"`.
 * Its options (`wordLimit`, `overlap`) are declared on `StrategyOptions`.
 */
export declare class WordLimitStrategy implements SplittingStrategy {
  name: "word";

  /** @param config Configuration with every `SplitterConfig` option present. */
  constructor(config: Required<SplitterConfig>);

  /**
   * @param lineIterator Async source of input lines.
   * @returns Async generator yielding chunks.
   */
  process(lineIterator: AsyncIterable<LineObject>): AsyncGenerator<Chunk>;
}
|
|
226
|
+
|
|
227
|
+
/**
 * Built-in strategy whose `name` is the literal `"token"`.
 * It inherits `process` from `CharLimitStrategy`; its `tokenLimit` option is
 * declared on `StrategyOptions`.
 */
export declare class TokenLimitStrategy extends CharLimitStrategy {
  name: "token";

  /** @param config Configuration with every `SplitterConfig` option present. */
  constructor(config: Required<SplitterConfig>);
}
|
|
232
|
+
|
|
233
|
+
// ── Parser ──
|
|
234
|
+
|
|
235
|
+
/** Parser that turns a stream of input lines into classified `SemanticBlock`s. */
export declare class SemanticParser {
  /** @param config Configuration with every `SplitterConfig` option present. */
  constructor(config: Required<SplitterConfig>);

  /**
   * @param lineIterator Async source of input lines.
   * @returns Async generator yielding one `SemanticBlock` at a time.
   */
  parse(lineIterator: AsyncIterable<LineObject>): AsyncGenerator<SemanticBlock>;

  /**
   * @returns String-to-string map of link definitions.
   * NOTE(review): presumably reference label → URL, collected while parsing — confirm.
   */
  getLinkDefinitions(): Map<string, string>;
}
|
|
242
|
+
|
|
243
|
+
// ── Main Class ──
|
|
244
|
+
|
|
245
|
+
/**
 * Main entry point of the package, and also its default export.
 *
 * Input can be a string, a URL or a browser `File`/`Blob`. The `split*`
 * methods resolve to the full chunk array; the `stream*` methods yield
 * chunks one at a time.
 */
export declare class MarkdownTextSplitter {
  /** Active configuration, with every `SplitterConfig` option present. */
  config: Required<SplitterConfig>;
  /** Currently active strategy instance. */
  strategy: SplittingStrategy;
  /** Statistics object; also available through `getStats()`. */
  stats: SplitterStats;

  /** @param config Optional overrides; every field of `SplitterConfig` may be omitted. */
  constructor(config?: SplitterConfig);

  /** Split a markdown string into chunks. */
  splitFromString(markdown: string): Promise<Chunk[]>;

  /** Split a remote markdown file via streaming HTTP. */
  splitFromUrl(url: string, fetchOptions?: RequestInit): Promise<Chunk[]>;

  /** Split a browser File or Blob. */
  splitFromFile(file: File | Blob): Promise<Chunk[]>;

  /** Stream chunks one-at-a-time from a string. */
  streamFromString(markdown: string): AsyncGenerator<Chunk>;

  /** Stream chunks one-at-a-time from a URL. */
  streamFromUrl(url: string, fetchOptions?: RequestInit): AsyncGenerator<Chunk>;

  /** Stream chunks one-at-a-time from a File/Blob. */
  streamFromFile(file: File | Blob): AsyncGenerator<Chunk>;

  /**
   * Switch the active strategy at runtime.
   *
   * @param name Strategy name (built-in or registered via `registerStrategy`).
   * @param options Optional strategy-specific options.
   */
  setStrategy(name: string, options?: StrategyOptions): void;

  /** Get processing statistics from the last split. */
  getStats(): SplitterStats;

  /** Reset internal state for reuse. */
  reset(): void;

  /**
   * Register a custom splitting strategy globally.
   *
   * @param name Name under which the strategy becomes selectable.
   * @param strategyClass Class constructible from a fully populated config.
   */
  static registerStrategy(name: string, strategyClass: SplittingStrategyConstructor): void;

  /** List all available strategy names (built-in + custom). */
  static getAvailableStrategies(): string[];
}
|
|
285
|
+
|
|
286
|
+
// ── Utility Functions ──
|
|
287
|
+
|
|
288
|
+
/**
 * Estimate the token count of `text`.
 *
 * @param text Text to measure.
 * @param charsPerToken Optional characters-per-token ratio (presumably falls back to the configured default of 4 — confirm).
 * @returns Estimated number of tokens.
 */
export declare function estimateTokens(
  text: string,
  charsPerToken?: number,
): number;
|
|
289
|
+
|
|
290
|
+
/**
 * Count the words in `text`.
 *
 * @param text Text to measure.
 * @returns Number of words.
 */
export declare function countWords(
  text: string,
): number;
|
|
291
|
+
|
|
292
|
+
/**
 * Build the identifier for a chunk.
 *
 * @param content Chunk text.
 * @param index Position of the chunk.
 * @param prefix Optional ID prefix (presumably defaults to the configured `chunkIdPrefix` — confirm).
 * @returns The chunk ID string.
 */
export declare function generateChunkId(
  content: string,
  index: number,
  prefix?: string,
): string;
|
|
293
|
+
|
|
294
|
+
/**
 * Extract links from `text`.
 *
 * @param text Text to scan.
 * @returns One `ExtractedLink` per link found.
 */
export declare function extractLinks(
  text: string,
): ExtractedLink[];
|
|
295
|
+
|
|
296
|
+
/**
 * Extract video embeds from `text`.
 *
 * @param text Text to scan.
 * @returns One `ExtractedVideo` per embed found.
 */
export declare function extractVideos(
  text: string,
): ExtractedVideo[];
|
|
297
|
+
|
|
298
|
+
/**
 * Turn a byte stream into an async sequence of lines.
 *
 * @param byteStream Readable stream of `Uint8Array` chunks.
 * @returns Async generator yielding one `LineObject` per line.
 */
export declare function streamToLines(
  byteStream: ReadableStream<Uint8Array>,
): AsyncGenerator<LineObject>;
|
|
299
|
+
|
|
300
|
+
/**
 * Turn a string into an async sequence of lines.
 *
 * @param text Full input text.
 * @returns Async generator yielding one `LineObject` per line.
 */
export declare function stringToLines(
  text: string,
): AsyncGenerator<LineObject>;
|
|
301
|
+
|
|
302
|
+
// ── Default Export ──
|
|
303
|
+
|
|
304
|
+
export default MarkdownTextSplitter;
|