@mastra/rag 1.0.7 → 1.0.8-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,4 +1,4 @@
1
1
 
2
- > @mastra/rag@1.0.7-alpha.1 build /home/runner/work/mastra/mastra/packages/rag
2
+ > @mastra/rag@1.0.8-alpha.0 build /home/runner/work/mastra/mastra/packages/rag
3
3
  > tsup --silent --config tsup.config.ts
4
4
 
package/CHANGELOG.md CHANGED
@@ -1,5 +1,16 @@
1
1
  # @mastra/rag
2
2
 
3
+ ## 1.0.8-alpha.0
4
+
5
+ ### Patch Changes
6
+
7
+ - 1be6004: Added semantic markdown chunking strategy.
8
+ - Updated dependencies [8388649]
9
+ - Updated dependencies [dd94a26]
10
+ - Updated dependencies [3ba6772]
11
+ - Updated dependencies [2fff911]
12
+ - @mastra/core@0.13.2-alpha.0
13
+
3
14
  ## 1.0.7
4
15
 
5
16
  ### Patch Changes
@@ -1,5 +1,5 @@
1
1
  import { Document as Chunk } from './schema/index.js';
2
- import type { ChunkParams, ExtractParams, HTMLChunkOptions, RecursiveChunkOptions, CharacterChunkOptions, TokenChunkOptions, MarkdownChunkOptions, JsonChunkOptions, LatexChunkOptions, SentenceChunkOptions } from './types.js';
2
+ import type { ChunkParams, ExtractParams, HTMLChunkOptions, RecursiveChunkOptions, CharacterChunkOptions, TokenChunkOptions, MarkdownChunkOptions, SemanticMarkdownChunkOptions, JsonChunkOptions, LatexChunkOptions, SentenceChunkOptions } from './types.js';
3
3
  export declare class MDocument {
4
4
  private chunks;
5
5
  private type;
@@ -16,6 +16,8 @@ export declare class MDocument {
16
16
  static fromMarkdown(markdown: string, metadata?: Record<string, any>): MDocument;
17
17
  static fromJSON(jsonString: string, metadata?: Record<string, any>): MDocument;
18
18
  private defaultStrategy;
19
+ private _strategyMap?;
20
+ private get strategyMap();
19
21
  private chunkBy;
20
22
  chunkRecursive(options?: RecursiveChunkOptions): Promise<void>;
21
23
  chunkCharacter(options?: CharacterChunkOptions): Promise<void>;
@@ -25,6 +27,7 @@ export declare class MDocument {
25
27
  chunkToken(options?: TokenChunkOptions): Promise<void>;
26
28
  chunkMarkdown(options?: MarkdownChunkOptions): Promise<void>;
27
29
  chunkSentence(options?: SentenceChunkOptions): Promise<void>;
30
+ chunkSemanticMarkdown(options?: SemanticMarkdownChunkOptions): Promise<void>;
28
31
  chunk(params?: ChunkParams): Promise<Chunk[]>;
29
32
  getDocs(): Chunk[];
30
33
  getText(): string[];
@@ -1 +1 @@
1
- {"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;AAS3E,OAAO,KAAK,EACV,WAAW,EAEX,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EACrB,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,EACpB,gBAAgB,EAChB,iBAAiB,EACjB,oBAAoB,EAErB,MAAM,SAAS,CAAC;AAGjB,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAU;IACxB,OAAO,CAAC,IAAI,CAAS;gBAET,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;SAAE,EAAE,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAOhG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC;IAmDjG,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYhF,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAY9E,OAAO,CAAC,eAAe;YAeT,OAAO;IAoBf,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa9D,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAoBpD,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAmBpD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAMtD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAUtD,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa5D,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAuB5D,KAAK,CAAC,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAiBnD,OAAO,IAAI,KAAK,EAAE;IAIlB,OAAO,IAAI,MAAM,EAAE;IAInB,WAAW,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE;CAGrC"}
1
+ {"version":3,"file":"document.d.ts","sourceRoot":"","sources":["../../src/document/document.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,QAAQ,IAAI,KAAK,EAAgC,MAAM,UAAU,CAAC;AAU3E,OAAO,KAAK,EACV,WAAW,EAEX,aAAa,EACb,gBAAgB,EAChB,qBAAqB,EACrB,qBAAqB,EACrB,iBAAiB,EACjB,oBAAoB,EACpB,4BAA4B,EAC5B,gBAAgB,EAChB,iBAAiB,EACjB,oBAAoB,EAErB,MAAM,SAAS,CAAC;AAGjB,qBAAa,SAAS;IACpB,OAAO,CAAC,MAAM,CAAU;IACxB,OAAO,CAAC,IAAI,CAAS;gBAET,EAAE,IAAI,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE;YAAE,IAAI,EAAE,MAAM,CAAC;YAAC,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;SAAE,EAAE,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE;IAOhG,eAAe,CAAC,EAAE,KAAK,EAAE,OAAO,EAAE,SAAS,EAAE,QAAQ,EAAE,EAAE,aAAa,GAAG,OAAO,CAAC,SAAS,CAAC;IAmDjG,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,QAAQ,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYxE,MAAM,CAAC,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAYhF,MAAM,CAAC,QAAQ,CAAC,UAAU,EAAE,MAAM,EAAE,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,GAAG,SAAS;IAY9E,OAAO,CAAC,eAAe;IAevB,OAAO,CAAC,YAAY,CAAC,CAA4E;IAEjG,OAAO,KAAK,WAAW,GAetB;YAEa,OAAO;IASf,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa9D,cAAc,CAAC,OAAO,CAAC,EAAE,qBAAqB,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAoBpD,SAAS,CAAC,OAAO,CAAC,EAAE,gBAAgB,GAAG,OAAO,CAAC,IAAI,CAAC;IAmBpD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAMtD,UAAU,CAAC,OAAO,CAAC,EAAE,iBAAiB,GAAG,OAAO,CAAC,IAAI,CAAC;IAUtD,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAa5D,aAAa,CAAC,OAAO,CAAC,EAAE,oBAAoB,GAAG,OAAO,CAAC,IAAI,CAAC;IAuB5D,qBAAqB,CAAC,OAAO,CAAC,EAAE,4BAA4B,GAAG,OAAO,CAAC,IAAI,CAAC;IAU5E,KAAK,CAAC,MAAM,CAAC,EAAE,WAAW,GAAG,OAAO,CAAC,KAAK,EAAE,CAAC;IAiBnD,OAAO,IAAI,KAAK,EAAE;IAIlB,OAAO,IAAI,MAAM,EAAE;IAInB,WAAW,IAAI,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE;CAGrC"}
@@ -0,0 +1,25 @@
1
+ import type { TiktokenModel, TiktokenEncoding } from 'js-tiktoken';
2
+ import { Document } from '../schema/index.js';
3
+ import type { SemanticMarkdownChunkOptions } from '../types.js';
4
+ import { TextTransformer } from './text.js';
5
+ export declare class SemanticMarkdownTransformer extends TextTransformer {
6
+ private tokenizer;
7
+ private joinThreshold;
8
+ private allowedSpecial;
9
+ private disallowedSpecial;
10
+ constructor({ joinThreshold, encodingName, modelName, allowedSpecial, disallowedSpecial, ...baseOptions }?: SemanticMarkdownChunkOptions);
11
+ private countTokens;
12
+ private splitMarkdownByHeaders;
13
+ private mergeSemanticSections;
14
+ splitText({ text }: {
15
+ text: string;
16
+ }): string[];
17
+ createDocuments(texts: string[], metadatas?: Record<string, any>[]): Document[];
18
+ transformDocuments(documents: Document[]): Document[];
19
+ static fromTikToken({ encodingName, modelName, options, }: {
20
+ encodingName?: TiktokenEncoding;
21
+ modelName?: TiktokenModel;
22
+ options?: SemanticMarkdownChunkOptions;
23
+ }): SemanticMarkdownTransformer;
24
+ }
25
+ //# sourceMappingURL=semantic-markdown.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"semantic-markdown.d.ts","sourceRoot":"","sources":["../../../src/document/transformers/semantic-markdown.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,gBAAgB,EAAY,MAAM,aAAa,CAAC;AAE7E,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AACrC,OAAO,KAAK,EAAE,4BAA4B,EAAE,MAAM,UAAU,CAAC;AAE7D,OAAO,EAAE,eAAe,EAAE,MAAM,QAAQ,CAAC;AASzC,qBAAa,2BAA4B,SAAQ,eAAe;IAC9D,OAAO,CAAC,SAAS,CAAW;IAC5B,OAAO,CAAC,aAAa,CAAS;IAC9B,OAAO,CAAC,cAAc,CAAsB;IAC5C,OAAO,CAAC,iBAAiB,CAAsB;gBAEnC,EACV,aAAmB,EACnB,YAA4B,EAC5B,SAAS,EACT,cAA0B,EAC1B,iBAAyB,EACzB,GAAG,WAAW,EACf,GAAE,4BAAiC;IAcpC,OAAO,CAAC,WAAW;IAQnB,OAAO,CAAC,sBAAsB;IA2D9B,OAAO,CAAC,qBAAqB;IA+B7B,SAAS,CAAC,EAAE,IAAI,EAAE,EAAE;QAAE,IAAI,EAAE,MAAM,CAAA;KAAE,GAAG,MAAM,EAAE;IAgB/C,eAAe,CAAC,KAAK,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,EAAE,GAAG,QAAQ,EAAE;IAuB/E,kBAAkB,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,QAAQ,EAAE;IAYrD,MAAM,CAAC,YAAY,CAAC,EAClB,YAA4B,EAC5B,SAAS,EACT,OAAY,GACb,EAAE;QACD,YAAY,CAAC,EAAE,gBAAgB,CAAC;QAChC,SAAS,CAAC,EAAE,aAAa,CAAC;QAC1B,OAAO,CAAC,EAAE,4BAA4B,CAAC;KACxC,GAAG,2BAA2B;CA4BhC"}
@@ -66,6 +66,13 @@ export type MarkdownChunkOptions = BaseChunkOptions & {
66
66
  returnEachLine?: boolean;
67
67
  stripHeaders?: boolean;
68
68
  };
69
+ export type SemanticMarkdownChunkOptions = BaseChunkOptions & {
70
+ joinThreshold?: number;
71
+ encodingName?: TiktokenEncoding;
72
+ modelName?: TiktokenModel;
73
+ allowedSpecial?: Set<string> | 'all';
74
+ disallowedSpecial?: Set<string> | 'all';
75
+ };
69
76
  export type HTMLChunkOptions = BaseChunkOptions & ({
70
77
  headers: [string, string][];
71
78
  sections?: never;
@@ -99,8 +106,9 @@ export type StrategyOptions = {
99
106
  json: JsonChunkOptions;
100
107
  latex: LatexChunkOptions;
101
108
  sentence: SentenceChunkOptions;
109
+ 'semantic-markdown': SemanticMarkdownChunkOptions;
102
110
  };
103
- export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex' | 'sentence';
111
+ export type ChunkStrategy = 'recursive' | 'character' | 'token' | 'markdown' | 'html' | 'json' | 'latex' | 'sentence' | 'semantic-markdown';
104
112
  export type ChunkParams = ({
105
113
  strategy?: 'character';
106
114
  } & CharacterChunkOptions & {
@@ -133,5 +141,9 @@ export type ChunkParams = ({
133
141
  strategy: 'sentence';
134
142
  } & SentenceChunkOptions & {
135
143
  extract?: ExtractParams;
144
+ }) | ({
145
+ strategy: 'semantic-markdown';
146
+ } & SemanticMarkdownChunkOptions & {
147
+ extract?: ExtractParams;
136
148
  });
137
149
  //# sourceMappingURL=types.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/document/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACnE,OAAO,KAAK,EACV,mBAAmB,EACnB,kBAAkB,EAClB,yBAAyB,EACzB,kBAAkB,EACnB,MAAM,cAAc,CAAC;AAEtB,oBAAY,QAAQ;IAClB,GAAG,QAAQ;IACX,EAAE,OAAO;IACT,IAAI,SAAS;IACb,MAAM,WAAW;IACjB,EAAE,OAAO;IACT,EAAE,OAAO;IACT,GAAG,QAAQ;IACX,KAAK,UAAU;IACf,MAAM,WAAW;IACjB,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,IAAI,SAAS;IACb,KAAK,UAAU;IACf,KAAK,UAAU;IACf,QAAQ,aAAa;IACrB,KAAK,UAAU;IACf,IAAI,SAAS;IACb,GAAG,QAAQ;IACX,MAAM,WAAW;IACjB,KAAK,UAAU;IACf,CAAC,MAAM;IACP,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,OAAO,YAAY;IACnB,MAAM,WAAW;IACjB,UAAU,eAAe;CAC1B;AAED,MAAM,MAAM,aAAa,GAAG;IAC1B,KAAK,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC;IACtC,OAAO,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;IACvC,SAAS,CAAC,EAAE,yBAAyB,GAAG,OAAO,CAAC;IAChD,QAAQ,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,GAAG,OAAO,GAAG,KAAK,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,QAAQ,CAAC,EAAE,QAAQ,CAAC;CACrB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG;IACjD,YAAY,CAAC,EAAE,gBAAgB,CAAC;IAChC,SAAS,CAAC,EAAE,aAAa,CAAC;IAC1B,cAAc,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAC7B,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAC7C,CACI;IAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,QAAQ,CAAC,EAAE,KAAK,CAAC;IAAC,cAAc,CAAC,EAAE,OAAO,CAAA;CAAE,GAC3E;IAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,OAAO,CAAC,EAAE,KAAK,CAAA;CAAE,CACpD,GAAG;IAAE,cAAc,CAAC,EAAE,OAAO,CAAA;CAAE,CAAC;AAEnC,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAAG;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG,EAAE,CAAC;AAEtD,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,oBAAoB,CAAC,EAAE,OAAO,CAAC;CAChC,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,SAAS,EAAE,qBAAqB,CAAC;IACjC,SAAS,EAAE,qBAAqB,CAAC;IACjC,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,IAAI,EAAE,gBAAgB,CAAC;IACvB,IAAI,EAAE,gBAAgB,CAAC;IACvB,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;CAChC,CAAC;AAEF,MAAM,MAAM,aAAa,GAAG,WAAW,GAAG,WAAW,GAAG,OAAO,GAAG,UAAU,GAAG,MAAM,GAAG,MAAM,GAAG,OAAO,GAAG,UAAU,CAAC;AAEtH,MAAM,MAAM,WAAW,GACnB,CAAC;IAAE,QAAQ,CAAC,EAAE,WAAW,CAAA;CAAE,GAAG,qBAAqB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAClF,CAAC;IAAE,QAAQ,EAAE,WAAW,CAAA;CAAE,GAAG,qBAAqB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACjF,CAAC;IAAE,QAAQ,EAAE,OAAO,CAAA;CAAE,GAAG,iBAAiB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACzE,CAAC;IAAE,QAAQ,EAAE,UAAU,CAAA;CAAE,GAAG,oBAAoB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAC/E,CAAC;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,GAAG,gBAAgB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACvE,CAAC;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,GAAG,gBAAgB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACvE,CAAC;IAAE,QAAQ,EAAE,OAAO,CAAA;CAAE,GAAG,iBAAiB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACzE,CAAC;IAAE,QAAQ,EAAE,UAAU,CAAA;CAAE,GAAG,oBAAoB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,CAAC"}
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/document/types.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,gBAAgB,EAAE,aAAa,EAAE,MAAM,aAAa,CAAC;AACnE,OAAO,KAAK,EACV,mBAAmB,EACnB,kBAAkB,EAClB,yBAAyB,EACzB,kBAAkB,EACnB,MAAM,cAAc,CAAC;AAEtB,oBAAY,QAAQ;IAClB,GAAG,QAAQ;IACX,EAAE,OAAO;IACT,IAAI,SAAS;IACb,MAAM,WAAW;IACjB,EAAE,OAAO;IACT,EAAE,OAAO;IACT,GAAG,QAAQ;IACX,KAAK,UAAU;IACf,MAAM,WAAW;IACjB,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,IAAI,SAAS;IACb,KAAK,UAAU;IACf,KAAK,UAAU;IACf,QAAQ,aAAa;IACrB,KAAK,UAAU;IACf,IAAI,SAAS;IACb,GAAG,QAAQ;IACX,MAAM,WAAW;IACjB,KAAK,UAAU;IACf,CAAC,MAAM;IACP,GAAG,QAAQ;IACX,IAAI,SAAS;IACb,OAAO,YAAY;IACnB,MAAM,WAAW;IACjB,UAAU,eAAe;CAC1B;AAED,MAAM,MAAM,aAAa,GAAG;IAC1B,KAAK,CAAC,EAAE,mBAAmB,GAAG,OAAO,CAAC;IACtC,OAAO,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;IACvC,SAAS,CAAC,EAAE,yBAAyB,GAAG,OAAO,CAAC;IAChD,QAAQ,CAAC,EAAE,kBAAkB,GAAG,OAAO,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG;IAC7B;;OAEG;IACH,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,cAAc,CAAC,EAAE,CAAC,IAAI,EAAE,MAAM,KAAK,MAAM,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,GAAG,OAAO,GAAG,KAAK,CAAC;IAC1C,aAAa,CAAC,EAAE,OAAO,CAAC;IACxB,eAAe,CAAC,EAAE,OAAO,CAAC;CAC3B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,gBAAgB,CAAC,EAAE,OAAO,CAAC;CAC5B,CAAC;AAEF,MAAM,MAAM,qBAAqB,GAAG,gBAAgB,GAAG;IACrD,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC;IACtB,gBAAgB,CAAC,EAAE,OAAO,CAAC;IAC3B,QAAQ,CAAC,EAAE,QAAQ,CAAC;CACrB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG;IACjD,YAAY,CAAC,EAAE,gBAAgB,CAAC;IAChC,SAAS,CAAC,EAAE,aAAa,CAAC;IAC1B,cAAc,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,CAAC,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAC7B,cAAc,CAAC,EAAE,OAAO,CAAC;IACzB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,4BAA4B,GAAG,gBAAgB,GAAG;IAC5D,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,YAAY,CAAC,EAAE,gBAAgB,CAAC;IAChC,SAAS,CAAC,EAAE,aAAa,CAAC;IAC1B,cAAc,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;IACrC,iBAAiB,CAAC,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,KAAK,CAAC;CACzC,CAAC;AAEF,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAC7C,CACI;IAAE,OAAO,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,QAAQ,CAAC,EAAE,KAAK,CAAC;IAAC,cAAc,CAAC,EAAE,OAAO,CAAA;CAAE,GAC3E;IAAE,QAAQ,EAAE,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC;IAAC,OAAO,CAAC,EAAE,KAAK,CAAA;CAAE,CACpD,GAAG;IAAE,cAAc,CAAC,EAAE,OAAO,CAAA;CAAE,CAAC;AAEnC,MAAM,MAAM,gBAAgB,GAAG,gBAAgB,GAAG;IAChD,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,OAAO,CAAC;IACtB,YAAY,CAAC,EAAE,OAAO,CAAC;CACxB,CAAC;AAEF,MAAM,MAAM,iBAAiB,GAAG,gBAAgB,GAAG,EAAE,CAAC;AAEtD,MAAM,MAAM,oBAAoB,GAAG,gBAAgB,GAAG;IACpD,OAAO,EAAE,MAAM,CAAC;IAChB,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;IAC1B,eAAe,CAAC,EAAE,OAAO,CAAC;IAC1B,oBAAoB,CAAC,EAAE,OAAO,CAAC;CAChC,CAAC;AAEF,MAAM,MAAM,eAAe,GAAG;IAC5B,SAAS,EAAE,qBAAqB,CAAC;IACjC,SAAS,EAAE,qBAAqB,CAAC;IACjC,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,IAAI,EAAE,gBAAgB,CAAC;IACvB,IAAI,EAAE,gBAAgB,CAAC;IACvB,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,EAAE,oBAAoB,CAAC;IAC/B,mBAAmB,EAAE,4BAA4B,CAAC;CACnD,CAAC;AAEF,MAAM,MAAM,aAAa,GACrB,WAAW,GACX,WAAW,GACX,OAAO,GACP,UAAU,GACV,MAAM,GACN,MAAM,GACN,OAAO,GACP,UAAU,GACV,mBAAmB,CAAC;AAExB,MAAM,MAAM,WAAW,GACnB,CAAC;IAAE,QAAQ,CAAC,EAAE,WAAW,CAAA;CAAE,GAAG,qBAAqB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAClF,CAAC;IAAE,QAAQ,EAAE,WAAW,CAAA;CAAE,GAAG,qBAAqB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACjF,CAAC;IAAE,QAAQ,EAAE,OAAO,CAAA;CAAE,GAAG,iBAAiB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACzE,CAAC;IAAE,QAAQ,EAAE,UAAU,CAAA;CAAE,GAAG,oBAAoB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAC/E,CAAC;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,GAAG,gBAAgB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACvE,CAAC;IAAE,QAAQ,EAAE,MAAM,CAAA;CAAE,GAAG,gBAAgB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACvE,CAAC;IAAE,QAAQ,EAAE,OAAO,CAAA;CAAE,GAAG,iBAAiB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GACzE,CAAC;IAAE,QAAQ,EAAE,UAAU,CAAA;CAAE,GAAG,oBAAoB,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,GAC/E,CAAC;IAAE,QAAQ,EAAE,mBAAmB,CAAA;CAAE,GAAG,4BAA4B,GAAG;IAAE,OAAO,CAAC,EAAE,aAAa,CAAA;CAAE,CAAC,CAAC"}
@@ -1 +1 @@
1
- {"version":3,"file":"validation.d.ts","sourceRoot":"","sources":["../../src/document/validation.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AA2H7C,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,aAAa,EAAE,MAAM,EAAE,GAAG,GAAG,IAAI,CAsB9E"}
1
+ {"version":3,"file":"validation.d.ts","sourceRoot":"","sources":["../../src/document/validation.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,SAAS,CAAC;AAsI7C,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,aAAa,EAAE,MAAM,EAAE,GAAG,GAAG,IAAI,CAsB9E"}
package/dist/index.cjs CHANGED
@@ -5650,6 +5650,168 @@ var MarkdownHeaderTransformer = class {
5650
5650
  return this.createDocuments(texts, metadatas);
5651
5651
  }
5652
5652
  };
5653
+ var SemanticMarkdownTransformer = class _SemanticMarkdownTransformer extends TextTransformer {
5654
+ tokenizer;
5655
+ joinThreshold;
5656
+ allowedSpecial;
5657
+ disallowedSpecial;
5658
+ constructor({
5659
+ joinThreshold = 500,
5660
+ encodingName = "cl100k_base",
5661
+ modelName,
5662
+ allowedSpecial = /* @__PURE__ */ new Set(),
5663
+ disallowedSpecial = "all",
5664
+ ...baseOptions
5665
+ } = {}) {
5666
+ super(baseOptions);
5667
+ this.joinThreshold = joinThreshold;
5668
+ this.allowedSpecial = allowedSpecial;
5669
+ this.disallowedSpecial = disallowedSpecial;
5670
+ try {
5671
+ this.tokenizer = modelName ? jsTiktoken.encodingForModel(modelName) : jsTiktoken.getEncoding(encodingName);
5672
+ } catch {
5673
+ throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
5674
+ }
5675
+ }
5676
+ countTokens(text) {
5677
+ const allowed = this.allowedSpecial === "all" ? "all" : Array.from(this.allowedSpecial);
5678
+ const disallowed = this.disallowedSpecial === "all" ? "all" : Array.from(this.disallowedSpecial);
5679
+ const processedText = this.stripWhitespace ? text.trim() : text;
5680
+ return this.tokenizer.encode(processedText, allowed, disallowed).length;
5681
+ }
5682
+ splitMarkdownByHeaders(markdown) {
5683
+ const sections = [];
5684
+ const lines = markdown.split("\n");
5685
+ let currentContent = "";
5686
+ let currentTitle = "";
5687
+ let currentDepth = 0;
5688
+ let inCodeBlock = false;
5689
+ const headerRegex = /^(#+)\s+(.+)$/;
5690
+ for (let i = 0; i < lines.length; i++) {
5691
+ const line = lines[i];
5692
+ const headerMatch = line.match(headerRegex);
5693
+ if (line.startsWith("```") || line.startsWith("~~~")) {
5694
+ inCodeBlock = !inCodeBlock;
5695
+ }
5696
+ if (headerMatch && !inCodeBlock) {
5697
+ if (currentContent.trim() !== "" || currentTitle && currentDepth > 0) {
5698
+ sections.push({
5699
+ title: currentTitle,
5700
+ content: currentContent.trim(),
5701
+ depth: currentDepth,
5702
+ length: this.countTokens(currentContent.trim())
5703
+ });
5704
+ }
5705
+ currentContent = "";
5706
+ currentDepth = headerMatch[1].length;
5707
+ currentTitle = headerMatch[2];
5708
+ } else {
5709
+ currentContent += line + "\n";
5710
+ }
5711
+ }
5712
+ if (currentContent.trim() !== "") {
5713
+ sections.push({
5714
+ title: currentTitle,
5715
+ content: currentContent.trim(),
5716
+ depth: currentDepth,
5717
+ length: this.countTokens(currentContent.trim())
5718
+ });
5719
+ }
5720
+ if (sections.length > 1 && sections[0].title === "" && sections[0].content.trim() === "") {
5721
+ sections.shift();
5722
+ }
5723
+ return sections;
5724
+ }
5725
+ mergeSemanticSections(sections) {
5726
+ if (sections.length === 0) return sections;
5727
+ const workingSections = [...sections];
5728
+ const deepest = Math.max(...workingSections.map((s) => s.depth));
5729
+ for (let depth = deepest; depth > 0; depth--) {
5730
+ for (let j = 1; j < workingSections.length; j++) {
5731
+ const current = workingSections[j];
5732
+ if (current.depth === depth) {
5733
+ const prev = workingSections[j - 1];
5734
+ if (prev.length + current.length < this.joinThreshold && prev.depth <= current.depth) {
5735
+ const title = `${"#".repeat(current.depth)} ${current.title}`;
5736
+ const formattedTitle = `
5737
+
5738
+ ${title}`;
5739
+ prev.content += `${formattedTitle}
5740
+ ${current.content}`;
5741
+ prev.length = this.countTokens(prev.content);
5742
+ workingSections.splice(j, 1);
5743
+ j--;
5744
+ }
5745
+ }
5746
+ }
5747
+ }
5748
+ return workingSections;
5749
+ }
5750
+ splitText({ text }) {
5751
+ if (!text.trim()) return [];
5752
+ const initialSections = this.splitMarkdownByHeaders(text);
5753
+ const mergedSections = this.mergeSemanticSections(initialSections);
5754
+ return mergedSections.map((section) => {
5755
+ if (section.title) {
5756
+ const header = `${"#".repeat(section.depth)} ${section.title}`;
5757
+ return `${header}
5758
+ ${section.content}`;
5759
+ }
5760
+ return section.content;
5761
+ });
5762
+ }
5763
+ createDocuments(texts, metadatas) {
5764
+ const _metadatas = metadatas || Array(texts.length).fill({});
5765
+ const documents = [];
5766
+ texts.forEach((text, i) => {
5767
+ this.splitText({ text }).forEach((chunk) => {
5768
+ const metadata = {
5769
+ ..._metadatas[i],
5770
+ tokenCount: this.countTokens(chunk)
5771
+ };
5772
+ documents.push(
5773
+ new Document({
5774
+ text: chunk,
5775
+ metadata
5776
+ })
5777
+ );
5778
+ });
5779
+ });
5780
+ return documents;
5781
+ }
5782
+ transformDocuments(documents) {
5783
+ const texts = [];
5784
+ const metadatas = [];
5785
+ for (const doc of documents) {
5786
+ texts.push(doc.text);
5787
+ metadatas.push(doc.metadata);
5788
+ }
5789
+ return this.createDocuments(texts, metadatas);
5790
+ }
5791
+ static fromTikToken({
5792
+ encodingName = "cl100k_base",
5793
+ modelName,
5794
+ options = {}
5795
+ }) {
5796
+ let tokenizer;
5797
+ try {
5798
+ tokenizer = modelName ? jsTiktoken.encodingForModel(modelName) : jsTiktoken.getEncoding(encodingName);
5799
+ } catch {
5800
+ throw new Error("Could not load tiktoken encoding. Please install it with `npm install js-tiktoken`.");
5801
+ }
5802
+ const tikTokenCounter = (text) => {
5803
+ const allowed = options.allowedSpecial === "all" ? "all" : options.allowedSpecial ? Array.from(options.allowedSpecial) : [];
5804
+ const disallowed = options.disallowedSpecial === "all" ? "all" : options.disallowedSpecial ? Array.from(options.disallowedSpecial) : [];
5805
+ return tokenizer.encode(text, allowed, disallowed).length;
5806
+ };
5807
+ return new _SemanticMarkdownTransformer({
5808
+ ...options,
5809
+ encodingName,
5810
+ modelName,
5811
+ lengthFunction: tikTokenCounter
5812
+ });
5813
+ }
5814
+ };
5653
5815
 
5654
5816
  // src/document/transformers/sentence.ts
5655
5817
  var SentenceTransformer = class extends TextTransformer {
@@ -6037,6 +6199,13 @@ var markdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
6037
6199
  returnEachLine: zod.z.boolean().optional(),
6038
6200
  stripHeaders: zod.z.boolean().optional()
6039
6201
  }).strict();
6202
+ var semanticMarkdownChunkOptionsSchema = baseChunkOptionsSchema.extend({
6203
+ joinThreshold: zod.z.number().positive().optional(),
6204
+ encodingName: zod.z.string().optional(),
6205
+ modelName: zod.z.string().optional(),
6206
+ allowedSpecial: setOrAllSchema,
6207
+ disallowedSpecial: setOrAllSchema
6208
+ }).strict();
6040
6209
  var latexChunkOptionsSchema = baseChunkOptionsSchema.strict();
6041
6210
  var validationSchemas = {
6042
6211
  character: characterChunkOptionsSchema.transform(handleDeprecatedSize),
@@ -6046,6 +6215,7 @@ var validationSchemas = {
6046
6215
  json: jsonChunkOptionsSchema.transform(handleDeprecatedSize),
6047
6216
  html: htmlChunkOptionsSchema.transform(handleDeprecatedSize),
6048
6217
  markdown: markdownChunkOptionsSchema.transform(handleDeprecatedSize),
6218
+ "semantic-markdown": semanticMarkdownChunkOptionsSchema.transform(handleDeprecatedSize),
6049
6219
  latex: latexChunkOptionsSchema.transform(handleDeprecatedSize)
6050
6220
  };
6051
6221
  function validateChunkParams(strategy, params) {
@@ -6175,18 +6345,25 @@ var MDocument = class _MDocument {
6175
6345
  return "recursive";
6176
6346
  }
6177
6347
  }
6348
+ _strategyMap;
6349
+ get strategyMap() {
6350
+ if (!this._strategyMap) {
6351
+ this._strategyMap = {
6352
+ recursive: (options) => this.chunkRecursive(options),
6353
+ character: (options) => this.chunkCharacter(options),
6354
+ token: (options) => this.chunkToken(options),
6355
+ markdown: (options) => this.chunkMarkdown(options),
6356
+ html: (options) => this.chunkHTML(options),
6357
+ json: (options) => this.chunkJSON(options),
6358
+ latex: (options) => this.chunkLatex(options),
6359
+ sentence: (options) => this.chunkSentence(options),
6360
+ "semantic-markdown": (options) => this.chunkSemanticMarkdown(options)
6361
+ };
6362
+ }
6363
+ return this._strategyMap;
6364
+ }
6178
6365
  async chunkBy(strategy, options) {
6179
- const strategyMap = {
6180
- recursive: (options2) => this.chunkRecursive(options2),
6181
- character: (options2) => this.chunkCharacter(options2),
6182
- token: (options2) => this.chunkToken(options2),
6183
- markdown: (options2) => this.chunkMarkdown(options2),
6184
- html: (options2) => this.chunkHTML(options2),
6185
- json: (options2) => this.chunkJSON(options2),
6186
- latex: (options2) => this.chunkLatex(options2),
6187
- sentence: (options2) => this.chunkSentence(options2)
6188
- };
6189
- const chunkingFunc = strategyMap[strategy];
6366
+ const chunkingFunc = this.strategyMap[strategy];
6190
6367
  if (chunkingFunc) {
6191
6368
  await chunkingFunc(options);
6192
6369
  } else {
@@ -6288,6 +6465,15 @@ var MDocument = class _MDocument {
6288
6465
  const textSplit = rt.transformDocuments(this.chunks);
6289
6466
  this.chunks = textSplit;
6290
6467
  }
6468
+ async chunkSemanticMarkdown(options) {
6469
+ const rt = SemanticMarkdownTransformer.fromTikToken({
6470
+ options,
6471
+ encodingName: options?.encodingName,
6472
+ modelName: options?.modelName
6473
+ });
6474
+ const textSplit = rt.transformDocuments(this.chunks);
6475
+ this.chunks = textSplit;
6476
+ }
6291
6477
  async chunk(params) {
6292
6478
  const { strategy: passedStrategy, extract, ...chunkOptions } = params || {};
6293
6479
  const strategy = passedStrategy || this.defaultStrategy();