@mastra/rag 1.0.7 → 1.0.8-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +1 -1
- package/CHANGELOG.md +11 -0
- package/dist/document/document.d.ts +4 -1
- package/dist/document/document.d.ts.map +1 -1
- package/dist/document/transformers/semantic-markdown.d.ts +25 -0
- package/dist/document/transformers/semantic-markdown.d.ts.map +1 -0
- package/dist/document/types.d.ts +13 -1
- package/dist/document/types.d.ts.map +1 -1
- package/dist/document/validation.d.ts.map +1 -1
- package/dist/index.cjs +197 -11
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +197 -11
- package/dist/index.js.map +1 -1
- package/package.json +4 -4
- package/src/document/document.test.ts +644 -1
- package/src/document/document.ts +32 -12
- package/src/document/transformers/semantic-markdown.ts +227 -0
- package/src/document/types.ts +21 -2
- package/src/document/validation.ts +11 -0
package/.turbo/turbo-build.log
CHANGED
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,16 @@
|
|
|
1
1
|
# @mastra/rag
|
|
2
2
|
|
|
3
|
+
## 1.0.8-alpha.0
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- 1be6004: Added semantic markdown chunking strategy.
|
|
8
|
+
- Updated dependencies [8388649]
|
|
9
|
+
- Updated dependencies [dd94a26]
|
|
10
|
+
- Updated dependencies [3ba6772]
|
|
11
|
+
- Updated dependencies [2fff911]
|
|
12
|
+
- @mastra/core@0.13.2-alpha.0
|
|
13
|
+
|
|
3
14
|
## 1.0.7
|
|
4
15
|
|
|
5
16
|
### Patch Changes
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { Document as Chunk } from './schema/index.js';
|
|
2
|
-
import type { ChunkParams, ExtractParams, HTMLChunkOptions, RecursiveChunkOptions, CharacterChunkOptions, TokenChunkOptions, MarkdownChunkOptions, JsonChunkOptions, LatexChunkOptions, SentenceChunkOptions } from './types.js';
|
|
2
|
+
import type { ChunkParams, ExtractParams, HTMLChunkOptions, RecursiveChunkOptions, CharacterChunkOptions, TokenChunkOptions, MarkdownChunkOptions, SemanticMarkdownChunkOptions, JsonChunkOptions, LatexChunkOptions, SentenceChunkOptions } from './types.js';
|
|
3
3
|
export declare class MDocument {
|
|
4
4
|
private chunks;
|
|
5
5
|
private type;
|
|
@@ -16,6 +16,8 @@ export declare class MDocument {
|
|
|
16
16
|
static fromMarkdown(markdown: string, metadata?: Record<string, any>): MDocument;
|
|
17
17
|
static fromJSON(jsonString: string, metadata?: Record<string, any>): MDocument;
|
|
18
18
|
private defaultStrategy;
|
|
19
|
+
private _strategyMap?;
|
|
20
|
+
private get strategyMap();
|
|
19
21
|
private chunkBy;
|
|
20
22
|
chunkRecursive(options?: RecursiveChunkOptions): Promise<void>;
|
|
21
23
|
chunkCharacter(options?: CharacterChunkOptions): Promise<void>;
|
|
@@ -25,6 +27,7 @@ export declare class MDocument {
|
|
|
25
27
|
chunkToken(options?: TokenChunkOptions): Promise<void>;
|
|
26
28
|
chunkMarkdown(options?: MarkdownChunkOptions): Promise<void>;
|
|
27
29
|
chunkSentence(options?: SentenceChunkOptions): Promise<void>;
|
|
30
|
+
chunkSemanticMarkdown(options?: SemanticMarkdownChunkOptions): Promise<void>;
|
|
28
31
|
chunk(params?: ChunkParams): Promise<Chunk[]>;
|
|
29
32
|
getDocs(): Chunk[];
|
|
30
33
|
getText(): string[];
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;
|
|
1
|
+
{"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;AAU3E,OAAO,KAAK,EACV,WAAW,EAEX,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EACrB,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,EACpB,4BAA4B,EAC5B,gBAAgB,EAChB,iBAAiB,EACjB,oBAAoB,EAErB,MAAM,SAAS,CAAC;AAGjB,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAU;IACxB,OAAO,CAAC,IAAI,CAAS;gBAET,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;SAAE,EAAE,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAOhG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC;IAmDjG,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYhF,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAY9E,OAAO,CAAC,eAAe;IAevB,OAAO,CAAC,YAAY,CAAC,CAA4E;IAEjG,OAAO,KAAK,WAAW,GAetB;YAEa,OAAO;IASf,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa9D,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAoBpD,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAmBpD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAMtD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAUtD,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa5D,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAuB5D,qBAAqB,CAAC,OAAO,CAAC,EAAE,4BAA4B,GAAG,OAAO,CAAC,IAAI,CAAC;IAU5E,KAAK,CAAC,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAiBnD,OAAO,IAAI,KAAK,EAAE;IAIlB,OAAO,IAAI,MAAM,EAAE;IAInB,WAAW,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE;CAGrC"}
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import type { TiktokenModel, TiktokenEncoding } from 'js-tiktoken';
|
|
2
|
+
import { Document } from '../schema/index.js';
|
|
3
|
+
import type { SemanticMarkdownChunkOptions } from '../types.js';
|
|
4
|
+
import { TextTransformer } from './text.js';
|
|
5
|
+
export declare class SemanticMarkdownTransformer extends TextTransformer {
|
|
6
|
+
private tokenizer;
|
|
7
|
+
private joinThreshold;
|
|
8
|
+
private allowedSpecial;
|
|
9
|
+
private disallowedSpecial;
|
|
10
|
+
constructor({ joinThreshold, encodingName, modelName, allowedSpecial, disallowedSpecial, ...baseOptions }?: SemanticMarkdownChunkOptions);
|
|
11
|
+
private countTokens;
|
|
12
|
+
private splitMarkdownByHeaders;
|
|
13
|
+
private mergeSemanticSections;
|
|
14
|
+
splitText({ text }: {
|
|
15
|
+
text: string;
|
|
16
|
+
}): string[];
|
|
17
|
+
createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[];
|
|
18
|
+
transformDocuments(documents: Document[]): Document[];
|
|
19
|
+
static fromTikToken({ encodingName, modelName, options, }: {
|
|
20
|
+
encodingName?: TiktokenEncoding;
|
|
21
|
+
modelName?: TiktokenModel;
|
|
22
|
+
options?: SemanticMarkdownChunkOptions;
|
|
23
|
+
}): SemanticMarkdownTransformer;
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=semantic-markdown.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"semantic-markdown.d.ts","sourceRoot":"","sources":["../../../src/document/transformers/semantic-markdown.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,gBAAgB,EAAY,MAAM,aAAa,CAAC;AAE7E,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,KAAK,EAAE,4BAA4B,EAAE,MAAM,UAAU,CAAC;AAE7D,OAAO,EAAE,eAAe,EAAE,MAAM,QAAQ,CAAC;AASzC,qBAAa,2BAA4B,SAAQ,eAAe;IAC9D,OAAO,CAAC,SAAS,CAAW;IAC5B,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,cAAc,CAAsB;IAC5C,OAAO,CAAC,iBAAiB,CAAsB;gBAEnC,EACV,aAAmB,EACnB,YAA4B,EAC5B,SAAS,EACT,cAA0B,EAC1B,iBAAyB,EACzB,GAAG,WAAW,EACf,GAAE,4BAAiC;IAcpC,OAAO,CAAC,WAAW;IAQnB,OAAO,CAAC,sBAAsB;IA2D9B,OAAO,CAAC,qBAAqB;IA+B7B,SAAS,CAAC,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,GAAG,MAAM,EAAE;IAgB/C,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IAuB/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;IAYrD,MAAM,CAAC,YAAY,CAAC,EAClB,YAA4B,EAC5B,SAAS,EACT,OAAY,GACb,EAAE;QACD,YAAY,CAAC,EAAE,gBAAgB,CAAC;QAChC,SAAS,CAAC,EAAE,aAAa,CAAC;QAC1B,OAAO,CAAC,EAAE,4BAA4B,CAAC;KACxC,GAAG,2BAA2B;CA4BhC"}
|
package/dist/document/types.d.ts
CHANGED
|
@@ -66,6 +66,13 @@ export type MarkdownChunkOptions = BaseChunkOptions & {
|
|
|
66
66
|
returnEachLine?: boolean;
|
|
67
67
|
stripHeaders?: boolean;
|
|
68
68
|
};
|
|
69
|
+
export type SemanticMarkdownChunkOptions = BaseChunkOptions & {
|
|
70
|
+
joinThreshold?: number;
|
|
71
|
+
encodingName?: TiktokenEncoding;
|
|
72
|
+
modelName?: TiktokenModel;
|
|
73
|
+
allowedSpecial?: Set<string> | 'all';
|
|
74
|
+
disallowedSpecial?: Set<string> | 'all';
|
|
75
|
+
};
|
|
69
76
|
export type HTMLChunkOptions = BaseChunkOptions & ({
|
|
70
77
|
headers: [string, string][];
|
|
71
78
|
sections?: never;
|
|
@@ -99,8 +106,9 @@ export type StrategyOptions = {
|
|
|
99
106
|
json: JsonChunkOptions;
|
|
100
107
|
latex: LatexChunkOptions;
|
|
101
108
|
sentence: SentenceChunkOptions;
|
|
109
|
+
'semantic-markdown': SemanticMarkdownChunkOptions;
|
|
102
110
|
};
|
|
103
|
-
export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex' | 'sentence';
|
|
111
|
+
export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex' | 'sentence' | 'semantic-markdown';
|
|
104
112
|
export type ChunkParams = ({
|
|
105
113
|
strategy?: 'character';
|
|
106
114
|
} & CharacterChunkOptions & {
|
|
@@ -133,5 +141,9 @@ export type ChunkParams = ({
|
|
|
133
141
|
strategy: 'sentence';
|
|
134
142
|
} & SentenceChunkOptions & {
|
|
135
143
|
extract?: ExtractParams;
|
|
144
|
+
}) | ({
|
|
145
|
+
strategy: 'semantic-markdown';
|
|
146
|
+
} & SemanticMarkdownChunkOptions & {
|
|
147
|
+
extract?: ExtractParams;
|
|
136
148
|
});
|
|
137
149
|
//# sourceMappingURL=types.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/document/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACnE,OAAO,KAAK,EACV,mBAAmB,EACnB,kBAAkB,EAClB,yBAAyB,EACzB,kBAAkB,EACnB,MAAM,cAAc,CAAC;AAEtB,oBAAY,QAAQ;IAClB,GAAG,QAAQ;IACX,EAAE,OAAO;IACT,IAAI,SAAS;IACb,MAAM,WAAW;IACjB,EAAE,OAAO;IACT,EAAE,OAAO;IACT,GAAG,QAAQ;IACX,KAAK,UAAU;IACf,MAAM,WAAW;IACjB,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,IAAI,SAAS;IACb,KAAK,UAAU;IACf,KAAK,UAAU;IACf,QAAQ,aAAa;IACrB,KAAK,UAAU;IACf,IAAI,SAAS;IACb,GAAG,QAAQ;IACX,MAAM,WAAW;IACjB,KAAK,UAAU;IACf,CAAC,MAAM;IACP,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,OAAO,YAAY;IACnB,MAAM,WAAW;IACjB,UAAU,eAAe;CAC1B;AAED,MAAM,MAAM,aAAa,GAAG;IAC1B,KAAK,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC;IACtC,OAAO,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;IACvC,SAAS,CAAC,EAAE,yBAAyB,GAAG,OAAO,CAAC;IAChD,QAAQ,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,GAAG,OAAO,GAAG,KAAK,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,QAAQ,CAAC,EAAE,QAAQ,CAAC;CACrB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG;IACjD,YAAY,CAAC,EAAE,gBAAgB,CAAC;IAChC,SAAS,CAAC,EAAE,aAAa,CAAC;IAC1B,cAAc,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAC7B,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAC7C,CACI;IAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,QAAQ,CAAC,EAAE,KAAK,CAAC;IAAC,cAAc,CAAC,EAA
E,OAAO,CAAA;CAAE,GAC3E;IAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,OAAO,CAAC,EAAE,KAAK,CAAA;CAAE,CACpD,GAAG;IAAE,cAAc,CAAC,EAAE,OAAO,CAAA;CAAE,CAAC;AAEnC,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAAG;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG,EAAE,CAAC;AAEtD,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,oBAAoB,CAAC,EAAE,OAAO,CAAC;CAChC,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,SAAS,EAAE,qBAAqB,CAAC;IACjC,SAAS,EAAE,qBAAqB,CAAC;IACjC,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,IAAI,EAAE,gBAAgB,CAAC;IACvB,IAAI,EAAE,gBAAgB,CAAC;IACvB,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/document/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACnE,OAAO,KAAK,EACV,mBAAmB,EACnB,kBAAkB,EAClB,yBAAyB,EACzB,kBAAkB,EACnB,MAAM,cAAc,CAAC;AAEtB,oBAAY,QAAQ;IAClB,GAAG,QAAQ;IACX,EAAE,OAAO;IACT,IAAI,SAAS;IACb,MAAM,WAAW;IACjB,EAAE,OAAO;IACT,EAAE,OAAO;IACT,GAAG,QAAQ;IACX,KAAK,UAAU;IACf,MAAM,WAAW;IACjB,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,IAAI,SAAS;IACb,KAAK,UAAU;IACf,KAAK,UAAU;IACf,QAAQ,aAAa;IACrB,KAAK,UAAU;IACf,IAAI,SAAS;IACb,GAAG,QAAQ;IACX,MAAM,WAAW;IACjB,KAAK,UAAU;IACf,CAAC,MAAM;IACP,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,OAAO,YAAY;IACnB,MAAM,WAAW;IACjB,UAAU,eAAe;CAC1B;AAED,MAAM,MAAM,aAAa,GAAG;IAC1B,KAAK,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC;IACtC,OAAO,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;IACvC,SAAS,CAAC,EAAE,yBAAyB,GAAG,OAAO,CAAC;IAChD,QAAQ,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,GAAG,OAAO,GAAG,KAAK,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,QAAQ,CAAC,EAAE,QAAQ,CAAC;CACrB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG;IACjD,YAAY,CAAC,EAAE,gBAAgB,CAAC;IAChC,SAAS,CAAC,EAAE,aAAa,CAAC;IAC1B,cAAc,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAC7B,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,4BAA4B,GAAG,gBAAgB,GAAG;IAC5D,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,CAAC,EAAE,gBAAgB,CAAC;IAChC,SAAS,CAAC,EAAE,aAAa,CAAC;IAC1B,cAA
c,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAC7C,CACI;IAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,QAAQ,CAAC,EAAE,KAAK,CAAC;IAAC,cAAc,CAAC,EAAE,OAAO,CAAA;CAAE,GAC3E;IAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,OAAO,CAAC,EAAE,KAAK,CAAA;CAAE,CACpD,GAAG;IAAE,cAAc,CAAC,EAAE,OAAO,CAAA;CAAE,CAAC;AAEnC,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAAG;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG,EAAE,CAAC;AAEtD,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,oBAAoB,CAAC,EAAE,OAAO,CAAC;CAChC,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,SAAS,EAAE,qBAAqB,CAAC;IACjC,SAAS,EAAE,qBAAqB,CAAC;IACjC,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,IAAI,EAAE,gBAAgB,CAAC;IACvB,IAAI,EAAE,gBAAgB,CAAC;IACvB,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,mBAAmB,EAAE,4BAA4B,CAAC;CACnD,CAAC;AAEF,MAAM,MAAM,aAAa,GACrB,WAAW,GACX,WAAW,GACX,OAAO,GACP,UAAU,GACV,MAAM,GACN,MAAM,GACN,OAAO,GACP,UAAU,GACV,mBAAmB,CAAC;AAExB,MAAM,MAAM,WAAW,GACnB,CAAC;IAAE,QAAQ,CAAC,EAAE,WAAW,CAAA;CAAE,GAAG,qBAAqB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAClF,CAAC;IAAE,QAAQ,EAAE,WAAW,CAAA;CAAE,GAAG,qBAAqB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACjF,CAAC;IAAE,QAAQ,EAAE,OAAO,CAAA;CAAE,GAAG,iBAAiB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACzE,CAAC;IAAE,QAAQ,EAAE,UAAU,CAAA;CAAE,GAAG,oBAAoB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAC/E,CAAC;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,GAAG,gBAAgB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACvE,CAAC;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,GAAG,gBAAgB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACvE,CAAC;IAAE,QAAQ,EAAE,OAAO,CAAA;CAAE,GAAG,iBAAiB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACzE,CAAC;IAAE,QAAQ,EAAE,UAAU
,CAAA;CAAE,GAAG,oBAAoB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAC/E,CAAC;IAAE,QAAQ,EAAE,mBAAmB,CAAA;CAAE,GAAG,4BAA4B,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"validation.d.ts","sourceRoot":"","sources":["../../src/document/validation.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;
|
|
1
|
+
{"version":3,"file":"validation.d.ts","sourceRoot":"","sources":["../../src/document/validation.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AAsI7C,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,aAAa,EAAE,MAAM,EAAE,GAAG,GAAG,IAAI,CAsB9E"}
|
package/dist/index.cjs
CHANGED
|
@@ -5650,6 +5650,168 @@ var MarkdownHeaderTransformer = class {
|
|
|
5650
5650
|
return this.createDocuments(texts, metadatas);
|
|
5651
5651
|
}
|
|
5652
5652
|
};
|
|
5653
|
+
var SemanticMarkdownTransformer = class _SemanticMarkdownTransformer extends TextTransformer {
|
|
5654
|
+
tokenizer;
|
|
5655
|
+
joinThreshold;
|
|
5656
|
+
allowedSpecial;
|
|
5657
|
+
disallowedSpecial;
|
|
5658
|
+
constructor({
|
|
5659
|
+
joinThreshold = 500,
|
|
5660
|
+
encodingName = "cl100k_base",
|
|
5661
|
+
modelName,
|
|
5662
|
+
allowedSpecial = /* @__PURE__ */ new Set(),
|
|
5663
|
+
disallowedSpecial = "all",
|
|
5664
|
+
...baseOptions
|
|
5665
|
+
} = {}) {
|
|
5666
|
+
super(baseOptions);
|
|
5667
|
+
this.joinThreshold = joinThreshold;
|
|
5668
|
+
this.allowedSpecial = allowedSpecial;
|
|
5669
|
+
this.disallowedSpecial = disallowedSpecial;
|
|
5670
|
+
try {
|
|
5671
|
+
this.tokenizer = modelName ? jsTiktoken.encodingForModel(modelName) : jsTiktoken.getEncoding(encodingName);
|
|
5672
|
+
} catch {
|
|
5673
|
+
throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
|
|
5674
|
+
}
|
|
5675
|
+
}
|
|
5676
|
+
countTokens(text) {
|
|
5677
|
+
const allowed = this.allowedSpecial === "all" ? "all" : Array.from(this.allowedSpecial);
|
|
5678
|
+
const disallowed = this.disallowedSpecial === "all" ? "all" : Array.from(this.disallowedSpecial);
|
|
5679
|
+
const processedText = this.stripWhitespace ? text.trim() : text;
|
|
5680
|
+
return this.tokenizer.encode(processedText, allowed, disallowed).length;
|
|
5681
|
+
}
|
|
5682
|
+
splitMarkdownByHeaders(markdown) {
|
|
5683
|
+
const sections = [];
|
|
5684
|
+
const lines = markdown.split("\n");
|
|
5685
|
+
let currentContent = "";
|
|
5686
|
+
let currentTitle = "";
|
|
5687
|
+
let currentDepth = 0;
|
|
5688
|
+
let inCodeBlock = false;
|
|
5689
|
+
const headerRegex = /^(#+)\s+(.+)$/;
|
|
5690
|
+
for (let i = 0; i < lines.length; i++) {
|
|
5691
|
+
const line = lines[i];
|
|
5692
|
+
const headerMatch = line.match(headerRegex);
|
|
5693
|
+
if (line.startsWith("```") || line.startsWith("~~~")) {
|
|
5694
|
+
inCodeBlock = !inCodeBlock;
|
|
5695
|
+
}
|
|
5696
|
+
if (headerMatch && !inCodeBlock) {
|
|
5697
|
+
if (currentContent.trim() !== "" || currentTitle && currentDepth > 0) {
|
|
5698
|
+
sections.push({
|
|
5699
|
+
title: currentTitle,
|
|
5700
|
+
content: currentContent.trim(),
|
|
5701
|
+
depth: currentDepth,
|
|
5702
|
+
length: this.countTokens(currentContent.trim())
|
|
5703
|
+
});
|
|
5704
|
+
}
|
|
5705
|
+
currentContent = "";
|
|
5706
|
+
currentDepth = headerMatch[1].length;
|
|
5707
|
+
currentTitle = headerMatch[2];
|
|
5708
|
+
} else {
|
|
5709
|
+
currentContent += line + "\n";
|
|
5710
|
+
}
|
|
5711
|
+
}
|
|
5712
|
+
if (currentContent.trim() !== "") {
|
|
5713
|
+
sections.push({
|
|
5714
|
+
title: currentTitle,
|
|
5715
|
+
content: currentContent.trim(),
|
|
5716
|
+
depth: currentDepth,
|
|
5717
|
+
length: this.countTokens(currentContent.trim())
|
|
5718
|
+
});
|
|
5719
|
+
}
|
|
5720
|
+
if (sections.length > 1 && sections[0].title === "" && sections[0].content.trim() === "") {
|
|
5721
|
+
sections.shift();
|
|
5722
|
+
}
|
|
5723
|
+
return sections;
|
|
5724
|
+
}
|
|
5725
|
+
mergeSemanticSections(sections) {
|
|
5726
|
+
if (sections.length === 0) return sections;
|
|
5727
|
+
const workingSections = [...sections];
|
|
5728
|
+
const deepest = Math.max(...workingSections.map((s) => s.depth));
|
|
5729
|
+
for (let depth = deepest; depth > 0; depth--) {
|
|
5730
|
+
for (let j = 1; j < workingSections.length; j++) {
|
|
5731
|
+
const current = workingSections[j];
|
|
5732
|
+
if (current.depth === depth) {
|
|
5733
|
+
const prev = workingSections[j - 1];
|
|
5734
|
+
if (prev.length + current.length < this.joinThreshold && prev.depth <= current.depth) {
|
|
5735
|
+
const title = `${"#".repeat(current.depth)} ${current.title}`;
|
|
5736
|
+
const formattedTitle = `
|
|
5737
|
+
|
|
5738
|
+
${title}`;
|
|
5739
|
+
prev.content += `${formattedTitle}
|
|
5740
|
+
${current.content}`;
|
|
5741
|
+
prev.length = this.countTokens(prev.content);
|
|
5742
|
+
workingSections.splice(j, 1);
|
|
5743
|
+
j--;
|
|
5744
|
+
}
|
|
5745
|
+
}
|
|
5746
|
+
}
|
|
5747
|
+
}
|
|
5748
|
+
return workingSections;
|
|
5749
|
+
}
|
|
5750
|
+
splitText({ text }) {
|
|
5751
|
+
if (!text.trim()) return [];
|
|
5752
|
+
const initialSections = this.splitMarkdownByHeaders(text);
|
|
5753
|
+
const mergedSections = this.mergeSemanticSections(initialSections);
|
|
5754
|
+
return mergedSections.map((section) => {
|
|
5755
|
+
if (section.title) {
|
|
5756
|
+
const header = `${"#".repeat(section.depth)} ${section.title}`;
|
|
5757
|
+
return `${header}
|
|
5758
|
+
${section.content}`;
|
|
5759
|
+
}
|
|
5760
|
+
return section.content;
|
|
5761
|
+
});
|
|
5762
|
+
}
|
|
5763
|
+
createDocuments(texts, metadatas) {
|
|
5764
|
+
const _metadatas = metadatas || Array(texts.length).fill({});
|
|
5765
|
+
const documents = [];
|
|
5766
|
+
texts.forEach((text, i) => {
|
|
5767
|
+
this.splitText({ text }).forEach((chunk) => {
|
|
5768
|
+
const metadata = {
|
|
5769
|
+
..._metadatas[i],
|
|
5770
|
+
tokenCount: this.countTokens(chunk)
|
|
5771
|
+
};
|
|
5772
|
+
documents.push(
|
|
5773
|
+
new Document({
|
|
5774
|
+
text: chunk,
|
|
5775
|
+
metadata
|
|
5776
|
+
})
|
|
5777
|
+
);
|
|
5778
|
+
});
|
|
5779
|
+
});
|
|
5780
|
+
return documents;
|
|
5781
|
+
}
|
|
5782
|
+
transformDocuments(documents) {
|
|
5783
|
+
const texts = [];
|
|
5784
|
+
const metadatas = [];
|
|
5785
|
+
for (const doc of documents) {
|
|
5786
|
+
texts.push(doc.text);
|
|
5787
|
+
metadatas.push(doc.metadata);
|
|
5788
|
+
}
|
|
5789
|
+
return this.createDocuments(texts, metadatas);
|
|
5790
|
+
}
|
|
5791
|
+
static fromTikToken({
|
|
5792
|
+
encodingName = "cl100k_base",
|
|
5793
|
+
modelName,
|
|
5794
|
+
options = {}
|
|
5795
|
+
}) {
|
|
5796
|
+
let tokenizer;
|
|
5797
|
+
try {
|
|
5798
|
+
tokenizer = modelName ? jsTiktoken.encodingForModel(modelName) : jsTiktoken.getEncoding(encodingName);
|
|
5799
|
+
} catch {
|
|
5800
|
+
throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
|
|
5801
|
+
}
|
|
5802
|
+
const tikTokenCounter = (text) => {
|
|
5803
|
+
const allowed = options.allowedSpecial === "all" ? "all" : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];
|
|
5804
|
+
const disallowed = options.disallowedSpecial === "all" ? "all" : options.disallowedSpecial ? Array.from(options.disallowedSpecial) : [];
|
|
5805
|
+
return tokenizer.encode(text, allowed, disallowed).length;
|
|
5806
|
+
};
|
|
5807
|
+
return new _SemanticMarkdownTransformer({
|
|
5808
|
+
...options,
|
|
5809
|
+
encodingName,
|
|
5810
|
+
modelName,
|
|
5811
|
+
lengthFunction: tikTokenCounter
|
|
5812
|
+
});
|
|
5813
|
+
}
|
|
5814
|
+
};
|
|
5653
5815
|
|
|
5654
5816
|
// src/document/transformers/sentence.ts
|
|
5655
5817
|
var SentenceTransformer = class extends TextTransformer {
|
|
@@ -6037,6 +6199,13 @@ var markdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
|
6037
6199
|
returnEachLine: zod.z.boolean().optional(),
|
|
6038
6200
|
stripHeaders: zod.z.boolean().optional()
|
|
6039
6201
|
}).strict();
|
|
6202
|
+
var semanticMarkdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
|
|
6203
|
+
joinThreshold: zod.z.number().positive().optional(),
|
|
6204
|
+
encodingName: zod.z.string().optional(),
|
|
6205
|
+
modelName: zod.z.string().optional(),
|
|
6206
|
+
allowedSpecial: setOrAllSchema,
|
|
6207
|
+
disallowedSpecial: setOrAllSchema
|
|
6208
|
+
}).strict();
|
|
6040
6209
|
var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
|
|
6041
6210
|
var validationSchemas = {
|
|
6042
6211
|
character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
@@ -6046,6 +6215,7 @@ var validationSchemas = {
|
|
|
6046
6215
|
json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6047
6216
|
html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6048
6217
|
markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6218
|
+
"semantic-markdown": semanticMarkdownChunkOptionsSchema.transform(handleDeprecatedSize),
|
|
6049
6219
|
latex: latexChunkOptionsSchema.transform(handleDeprecatedSize)
|
|
6050
6220
|
};
|
|
6051
6221
|
function validateChunkParams(strategy, params) {
|
|
@@ -6175,18 +6345,25 @@ var MDocument = class _MDocument {
|
|
|
6175
6345
|
return "recursive";
|
|
6176
6346
|
}
|
|
6177
6347
|
}
|
|
6348
|
+
_strategyMap;
|
|
6349
|
+
get strategyMap() {
|
|
6350
|
+
if (!this._strategyMap) {
|
|
6351
|
+
this._strategyMap = {
|
|
6352
|
+
recursive: (options) => this.chunkRecursive(options),
|
|
6353
|
+
character: (options) => this.chunkCharacter(options),
|
|
6354
|
+
token: (options) => this.chunkToken(options),
|
|
6355
|
+
markdown: (options) => this.chunkMarkdown(options),
|
|
6356
|
+
html: (options) => this.chunkHTML(options),
|
|
6357
|
+
json: (options) => this.chunkJSON(options),
|
|
6358
|
+
latex: (options) => this.chunkLatex(options),
|
|
6359
|
+
sentence: (options) => this.chunkSentence(options),
|
|
6360
|
+
"semantic-markdown": (options) => this.chunkSemanticMarkdown(options)
|
|
6361
|
+
};
|
|
6362
|
+
}
|
|
6363
|
+
return this._strategyMap;
|
|
6364
|
+
}
|
|
6178
6365
|
async chunkBy(strategy, options) {
|
|
6179
|
-
const strategyMap = {
|
|
6180
|
-
recursive: (options2) => this.chunkRecursive(options2),
|
|
6181
|
-
character: (options2) => this.chunkCharacter(options2),
|
|
6182
|
-
token: (options2) => this.chunkToken(options2),
|
|
6183
|
-
markdown: (options2) => this.chunkMarkdown(options2),
|
|
6184
|
-
html: (options2) => this.chunkHTML(options2),
|
|
6185
|
-
json: (options2) => this.chunkJSON(options2),
|
|
6186
|
-
latex: (options2) => this.chunkLatex(options2),
|
|
6187
|
-
sentence: (options2) => this.chunkSentence(options2)
|
|
6188
|
-
};
|
|
6189
|
-
const chunkingFunc = strategyMap[strategy];
|
|
6366
|
+
const chunkingFunc = this.strategyMap[strategy];
|
|
6190
6367
|
if (chunkingFunc) {
|
|
6191
6368
|
await chunkingFunc(options);
|
|
6192
6369
|
} else {
|
|
@@ -6288,6 +6465,15 @@ var MDocument = class _MDocument {
|
|
|
6288
6465
|
const textSplit = rt.transformDocuments(this.chunks);
|
|
6289
6466
|
this.chunks = textSplit;
|
|
6290
6467
|
}
|
|
6468
|
+
async chunkSemanticMarkdown(options) {
|
|
6469
|
+
const rt = SemanticMarkdownTransformer.fromTikToken({
|
|
6470
|
+
options,
|
|
6471
|
+
encodingName: options?.encodingName,
|
|
6472
|
+
modelName: options?.modelName
|
|
6473
|
+
});
|
|
6474
|
+
const textSplit = rt.transformDocuments(this.chunks);
|
|
6475
|
+
this.chunks = textSplit;
|
|
6476
|
+
}
|
|
6291
6477
|
async chunk(params) {
|
|
6292
6478
|
const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
|
|
6293
6479
|
const strategy = passedStrategy || this.defaultStrategy();
|