@mastra/rag 2.1.1 → 2.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +18 -0
- package/dist/docs/SKILL.md +38 -0
- package/dist/docs/assets/SOURCE_MAP.json +6 -0
- package/dist/docs/references/docs-rag-chunking-and-embedding.md +183 -0
- package/dist/docs/references/docs-rag-graph-rag.md +215 -0
- package/dist/docs/references/docs-rag-overview.md +72 -0
- package/dist/docs/references/docs-rag-retrieval.md +515 -0
- package/dist/docs/references/reference-rag-chunk.md +221 -0
- package/dist/docs/references/reference-rag-database-config.md +261 -0
- package/dist/docs/references/reference-rag-document.md +114 -0
- package/dist/docs/references/reference-rag-extract-params.md +168 -0
- package/dist/docs/references/reference-rag-graph-rag.md +111 -0
- package/dist/docs/references/reference-rag-rerank.md +75 -0
- package/dist/docs/references/reference-rag-rerankWithScorer.md +80 -0
- package/dist/docs/references/reference-tools-document-chunker-tool.md +89 -0
- package/dist/docs/references/reference-tools-graph-rag-tool.md +182 -0
- package/dist/docs/references/reference-tools-vector-query-tool.md +459 -0
- package/dist/document/transformers/semantic-markdown.d.ts +6 -4
- package/dist/document/transformers/semantic-markdown.d.ts.map +1 -1
- package/dist/document/transformers/token.d.ts +5 -4
- package/dist/document/transformers/token.d.ts.map +1 -1
- package/dist/index.cjs +41 -26
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +41 -26
- package/dist/index.js.map +1 -1
- package/package.json +5 -5
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
# .chunk()
|
|
2
|
+
|
|
3
|
+
The `.chunk()` function splits documents into smaller segments using various strategies and options.
|
|
4
|
+
|
|
5
|
+
## Example
|
|
6
|
+
|
|
7
|
+
```typescript
|
|
8
|
+
import { MDocument } from '@mastra/rag'
|
|
9
|
+
|
|
10
|
+
const doc = MDocument.fromMarkdown(`
|
|
11
|
+
# Introduction
|
|
12
|
+
This is a sample document that we want to split into chunks.
|
|
13
|
+
|
|
14
|
+
## Section 1
|
|
15
|
+
Here is the first section with some content.
|
|
16
|
+
|
|
17
|
+
## Section 2
|
|
18
|
+
Here is another section with different content.
|
|
19
|
+
`)
|
|
20
|
+
|
|
21
|
+
// Basic chunking with defaults
|
|
22
|
+
const chunks = await doc.chunk()
|
|
23
|
+
|
|
24
|
+
// Markdown-specific chunking with header extraction
|
|
25
|
+
const chunksWithMetadata = await doc.chunk({
|
|
26
|
+
strategy: 'markdown',
|
|
27
|
+
headers: [
|
|
28
|
+
['#', 'title'],
|
|
29
|
+
['##', 'section'],
|
|
30
|
+
],
|
|
31
|
+
extract: {
|
|
32
|
+
summary: true, // Extract summaries with default settings
|
|
33
|
+
keywords: true, // Extract keywords with default settings
|
|
34
|
+
},
|
|
35
|
+
})
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Parameters
|
|
39
|
+
|
|
40
|
+
The following parameters are available for all chunking strategies. **Important:** Each strategy will only utilize a subset of these parameters relevant to its specific use case.
|
|
41
|
+
|
|
42
|
+
**strategy?:** (`'recursive' | 'character' | 'token' | 'markdown' | 'semantic-markdown' | 'html' | 'json' | 'latex' | 'sentence'`): The chunking strategy to use. If not specified, a default is chosen based on the document type. Each strategy accepts additional strategy-specific options. Defaults: .md files → 'markdown', .html/.htm → 'html', .json → 'json', .tex → 'latex', others → 'recursive'
|
|
43
|
+
|
|
44
|
+
**maxSize?:** (`number`): Maximum size of each chunk. **Note:** Some strategy configurations (markdown with headers, HTML with headers) ignore this parameter. (Default: `4000`)
|
|
45
|
+
|
|
46
|
+
**overlap?:** (`number`): Number of characters/tokens that overlap between chunks. (Default: `50`)
|
|
47
|
+
|
|
48
|
+
**lengthFunction?:** (`(text: string) => number`): Function to calculate text length. Defaults to character count.
|
|
49
|
+
|
|
50
|
+
**separatorPosition?:** (`'start' | 'end'`): Where to position the separator in chunks. 'start' attaches to beginning of next chunk, 'end' attaches to end of current chunk. If not specified, separators are discarded.
|
|
51
|
+
|
|
52
|
+
**addStartIndex?:** (`boolean`): Whether to add start index metadata to chunks. (Default: `false`)
|
|
53
|
+
|
|
54
|
+
**stripWhitespace?:** (`boolean`): Whether to strip whitespace from chunks. (Default: `true`)
|
|
55
|
+
|
|
56
|
+
**extract?:** (`ExtractParams`): Metadata extraction configuration.
|
|
57
|
+
|
|
58
|
+
See [ExtractParams reference](https://mastra.ai/reference/rag/extract-params) for details on the `extract` parameter.
|
|
59
|
+
|
|
60
|
+
## Strategy-Specific Options
|
|
61
|
+
|
|
62
|
+
Strategy-specific options are passed as top-level parameters alongside the strategy parameter. For example:
|
|
63
|
+
|
|
64
|
+
```typescript
|
|
65
|
+
// Character strategy example
|
|
66
|
+
const chunks = await doc.chunk({
|
|
67
|
+
strategy: 'character',
|
|
68
|
+
separator: '.', // Character-specific option
|
|
69
|
+
isSeparatorRegex: false, // Character-specific option
|
|
70
|
+
maxSize: 300, // general option
|
|
71
|
+
})
|
|
72
|
+
|
|
73
|
+
// Recursive strategy example
|
|
74
|
+
const chunks = await doc.chunk({
|
|
75
|
+
strategy: 'recursive',
|
|
76
|
+
separators: ['\n\n', '\n', ' '], // Recursive-specific option
|
|
77
|
+
language: 'markdown', // Recursive-specific option
|
|
78
|
+
maxSize: 500, // general option
|
|
79
|
+
})
|
|
80
|
+
|
|
81
|
+
// Sentence strategy example
|
|
82
|
+
const chunks = await doc.chunk({
|
|
83
|
+
strategy: 'sentence',
|
|
84
|
+
maxSize: 450, // Required for sentence strategy
|
|
85
|
+
minSize: 50, // Sentence-specific option
|
|
86
|
+
sentenceEnders: ['.'], // Sentence-specific option
|
|
87
|
+
fallbackToCharacters: false, // Sentence-specific option
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
// HTML strategy example
|
|
91
|
+
const chunks = await doc.chunk({
|
|
92
|
+
strategy: 'html',
|
|
93
|
+
headers: [
|
|
94
|
+
['h1', 'title'],
|
|
95
|
+
['h2', 'subtitle'],
|
|
96
|
+
], // HTML-specific option
|
|
97
|
+
})
|
|
98
|
+
|
|
99
|
+
// Markdown strategy example
|
|
100
|
+
const chunks = await doc.chunk({
|
|
101
|
+
strategy: 'markdown',
|
|
102
|
+
headers: [
|
|
103
|
+
['#', 'title'],
|
|
104
|
+
['##', 'section'],
|
|
105
|
+
], // Markdown-specific option
|
|
106
|
+
stripHeaders: true, // Markdown-specific option
|
|
107
|
+
})
|
|
108
|
+
|
|
109
|
+
// Semantic Markdown strategy example
|
|
110
|
+
const chunks = await doc.chunk({
|
|
111
|
+
strategy: 'semantic-markdown',
|
|
112
|
+
joinThreshold: 500, // Semantic Markdown-specific option
|
|
113
|
+
modelName: 'gpt-3.5-turbo', // Semantic Markdown-specific option
|
|
114
|
+
})
|
|
115
|
+
|
|
116
|
+
// Token strategy example
|
|
117
|
+
const chunks = await doc.chunk({
|
|
118
|
+
strategy: 'token',
|
|
119
|
+
encodingName: 'gpt2', // Token-specific option
|
|
120
|
+
modelName: 'gpt-3.5-turbo', // Token-specific option
|
|
121
|
+
maxSize: 1000, // general option
|
|
122
|
+
})
|
|
123
|
+
```
|
|
124
|
+
|
|
125
|
+
The options documented below are passed directly at the top level of the configuration object, not nested within a separate options object.
|
|
126
|
+
|
|
127
|
+
### Character
|
|
128
|
+
|
|
129
|
+
**separator?:** (`string`): The separator to split the text on. Interpreted as a regular expression when `isSeparatorRegex` is `true`.
|
|
130
|
+
|
|
131
|
+
**isSeparatorRegex?:** (`boolean`): Whether the separator is a regex pattern (Default: `false`)
|
|
132
|
+
|
|
133
|
+
### Recursive
|
|
134
|
+
|
|
135
|
+
**separators?:** (`string[]`): Array of separators to try in order of preference. The strategy will attempt to split on the first separator, then fall back to subsequent ones.
|
|
136
|
+
|
|
137
|
+
**isSeparatorRegex?:** (`boolean`): Whether the separators are regex patterns (Default: `false`)
|
|
138
|
+
|
|
139
|
+
**language?:** (`Language`): Programming or markup language for language-specific splitting behavior. See Language enum for supported values.
|
|
140
|
+
|
|
141
|
+
### Sentence
|
|
142
|
+
|
|
143
|
+
**maxSize:** (`number`): Maximum size of each chunk (required for sentence strategy)
|
|
144
|
+
|
|
145
|
+
**minSize?:** (`number`): Minimum size of each chunk. Chunks smaller than this will be merged with adjacent chunks when possible. (Default: `50`)
|
|
146
|
+
|
|
147
|
+
**targetSize?:** (`number`): Preferred target size for chunks. Defaults to 80% of maxSize. The strategy will try to create chunks close to this size.
|
|
148
|
+
|
|
149
|
+
**sentenceEnders?:** (`string[]`): Array of characters that mark sentence endings for splitting boundaries. (Default: `['.', '!', '?']`)
|
|
150
|
+
|
|
151
|
+
**fallbackToWords?:** (`boolean`): Whether to fall back to word-level splitting for sentences that exceed maxSize. (Default: `true`)
|
|
152
|
+
|
|
153
|
+
**fallbackToCharacters?:** (`boolean`): Whether to fall back to character-level splitting for words that exceed maxSize. Only applies if fallbackToWords is enabled. (Default: `true`)
|
|
154
|
+
|
|
155
|
+
### HTML
|
|
156
|
+
|
|
157
|
+
**headers:** (`Array<[string, string]>`): Array of \[selector, metadata key] pairs for header-based splitting
|
|
158
|
+
|
|
159
|
+
**sections:** (`Array<[string, string]>`): Array of \[selector, metadata key] pairs for section-based splitting
|
|
160
|
+
|
|
161
|
+
**returnEachLine?:** (`boolean`): Whether to return each line as a separate chunk
|
|
162
|
+
|
|
163
|
+
**Important:** When using the HTML strategy, all general options are ignored. Use `headers` for header-based splitting or `sections` for section-based splitting. If used together, `sections` will be ignored.
|
|
164
|
+
|
|
165
|
+
### Markdown
|
|
166
|
+
|
|
167
|
+
**headers?:** (`Array<[string, string]>`): Array of \[header level, metadata key] pairs
|
|
168
|
+
|
|
169
|
+
**stripHeaders?:** (`boolean`): Whether to remove headers from the output
|
|
170
|
+
|
|
171
|
+
**returnEachLine?:** (`boolean`): Whether to return each line as a separate chunk
|
|
172
|
+
|
|
173
|
+
**Important:** When using the `headers` option, the markdown strategy ignores all general options and content is split based on the markdown header structure. To use size-based chunking with markdown, omit the `headers` parameter.
|
|
174
|
+
|
|
175
|
+
### Semantic Markdown
|
|
176
|
+
|
|
177
|
+
**joinThreshold?:** (`number`): Maximum token count for merging related sections. Sections exceeding this limit individually are left intact, but smaller sections are merged with siblings or parents if the combined size stays under this threshold. (Default: `500`)
|
|
178
|
+
|
|
179
|
+
**modelName?:** (`string`): Name of the model for tokenization. If provided, the model's underlying tokenization `encodingName` will be used.
|
|
180
|
+
|
|
181
|
+
**encodingName?:** (`string`): Name of the token encoding to use. Derived from `modelName` if available. (Default: `cl100k_base`)
|
|
182
|
+
|
|
183
|
+
**allowedSpecial?:** (`Set<string> | 'all'`): Set of special tokens allowed during tokenization, or 'all' to allow all special tokens
|
|
184
|
+
|
|
185
|
+
**disallowedSpecial?:** (`Set<string> | 'all'`): Set of special tokens to disallow during tokenization, or 'all' to disallow all special tokens (Default: `all`)
|
|
186
|
+
|
|
187
|
+
### Token
|
|
188
|
+
|
|
189
|
+
**encodingName?:** (`string`): Name of the token encoding to use
|
|
190
|
+
|
|
191
|
+
**modelName?:** (`string`): Name of the model for tokenization
|
|
192
|
+
|
|
193
|
+
**allowedSpecial?:** (`Set<string> | 'all'`): Set of special tokens allowed during tokenization, or 'all' to allow all special tokens
|
|
194
|
+
|
|
195
|
+
**disallowedSpecial?:** (`Set<string> | 'all'`): Set of special tokens to disallow during tokenization, or 'all' to disallow all special tokens
|
|
196
|
+
|
|
197
|
+
### JSON
|
|
198
|
+
|
|
199
|
+
**maxSize:** (`number`): Maximum size of each chunk
|
|
200
|
+
|
|
201
|
+
**minSize?:** (`number`): Minimum size of each chunk
|
|
202
|
+
|
|
203
|
+
**ensureAscii?:** (`boolean`): Whether to ensure ASCII encoding
|
|
204
|
+
|
|
205
|
+
**convertLists?:** (`boolean`): Whether to convert lists in the JSON
|
|
206
|
+
|
|
207
|
+
### Latex
|
|
208
|
+
|
|
209
|
+
The Latex strategy uses only the general chunking options listed above. It provides LaTeX-aware splitting optimized for mathematical and academic documents.
|
|
210
|
+
|
|
211
|
+
## Return Value
|
|
212
|
+
|
|
213
|
+
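Returns a promise that resolves to an array of chunks (`Promise<Chunk[]>`). The chunks are also stored on the `MDocument` instance and can be retrieved with `getDocs()`. Each chunk includes: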
|
|
214
|
+
|
|
215
|
+
```typescript
|
|
216
|
+
interface DocumentNode {
|
|
217
|
+
text: string
|
|
218
|
+
metadata: Record<string, any>
|
|
219
|
+
embedding?: number[]
|
|
220
|
+
}
|
|
221
|
+
```
|
|
@@ -0,0 +1,261 @@
|
|
|
1
|
+
# DatabaseConfig
|
|
2
|
+
|
|
3
|
+
The `DatabaseConfig` type allows you to specify database-specific configurations when using vector query tools. These configurations enable you to leverage unique features and optimizations offered by different vector stores.
|
|
4
|
+
|
|
5
|
+
## Type Definition
|
|
6
|
+
|
|
7
|
+
```typescript
|
|
8
|
+
export type DatabaseConfig = {
|
|
9
|
+
pinecone?: PineconeConfig
|
|
10
|
+
pgvector?: PgVectorConfig
|
|
11
|
+
chroma?: ChromaConfig
|
|
12
|
+
[key: string]: any // Extensible for future databases
|
|
13
|
+
}
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Database-Specific Types
|
|
17
|
+
|
|
18
|
+
### PineconeConfig
|
|
19
|
+
|
|
20
|
+
Configuration options specific to Pinecone vector store.
|
|
21
|
+
|
|
22
|
+
**namespace?:** (`string`): Pinecone namespace for organizing and isolating vectors within the same index. Useful for multi-tenancy or environment separation.
|
|
23
|
+
|
|
24
|
+
**sparseVector?:** (`{ indices: number[]; values: number[]; }`): Sparse vector object with two fields: `indices` (`number[]`) — array of indices for sparse vector components; `values` (`number[]`) — array of values corresponding to the indices.
|
|
25
|
+
|
|
26
|
+
**Use Cases:**
|
|
27
|
+
|
|
28
|
+
- Multi-tenant applications (separate namespaces per tenant)
|
|
29
|
+
- Environment isolation (dev/staging/prod namespaces)
|
|
30
|
+
- Hybrid search combining semantic and keyword matching
|
|
31
|
+
|
|
32
|
+
### PgVectorConfig
|
|
33
|
+
|
|
34
|
+
Configuration options specific to PostgreSQL with pgvector extension.
|
|
35
|
+
|
|
36
|
+
**minScore?:** (`number`): Minimum similarity score threshold for results. Only vectors with similarity scores above this value will be returned.
|
|
37
|
+
|
|
38
|
+
**ef?:** (`number`): HNSW search parameter that controls the size of the dynamic candidate list during search. Higher values improve accuracy at the cost of speed. Typically set between topK and 200.
|
|
39
|
+
|
|
40
|
+
**probes?:** (`number`): IVFFlat probe parameter that specifies the number of index cells to visit during search. Higher values improve recall at the cost of speed.
|
|
41
|
+
|
|
42
|
+
**Performance Guidelines:**
|
|
43
|
+
|
|
44
|
+
- **ef**: Start with 2-4x your topK value, increase for better accuracy
|
|
45
|
+
- **probes**: Start with 1-10, increase for better recall
|
|
46
|
+
- **minScore**: Use values between 0.5-0.9 depending on your quality requirements
|
|
47
|
+
|
|
48
|
+
**Use Cases:**
|
|
49
|
+
|
|
50
|
+
- Performance optimization for high-load scenarios
|
|
51
|
+
- Quality filtering to remove irrelevant results
|
|
52
|
+
- Fine-tuning search accuracy vs speed tradeoffs
|
|
53
|
+
|
|
54
|
+
### ChromaConfig
|
|
55
|
+
|
|
56
|
+
Configuration options specific to Chroma vector store.
|
|
57
|
+
|
|
58
|
+
**where?:** (`Record<string, any>`): Metadata filtering conditions using MongoDB-style query syntax. Filters results based on metadata fields.
|
|
59
|
+
|
|
60
|
+
**whereDocument?:** (`Record<string, any>`): Document content filtering conditions. Allows filtering based on the actual document text content.
|
|
61
|
+
|
|
62
|
+
**Filter Syntax Examples:**
|
|
63
|
+
|
|
64
|
+
```typescript
|
|
65
|
+
// Simple equality
|
|
66
|
+
where: { "category": "technical" }
|
|
67
|
+
|
|
68
|
+
// Operators
|
|
69
|
+
where: { "price": { "$gt": 100 } }
|
|
70
|
+
|
|
71
|
+
// Multiple conditions
|
|
72
|
+
where: {
|
|
73
|
+
"category": "electronics",
|
|
74
|
+
"inStock": true
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
// Document content filtering
|
|
78
|
+
whereDocument: { "$contains": "API documentation" }
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Use Cases:**
|
|
82
|
+
|
|
83
|
+
- Advanced metadata filtering
|
|
84
|
+
- Content-based document filtering
|
|
85
|
+
- Complex query combinations
|
|
86
|
+
|
|
87
|
+
## Usage Examples
|
|
88
|
+
|
|
89
|
+
**Basic Usage**:
|
|
90
|
+
|
|
91
|
+
### Basic Database Configuration
|
|
92
|
+
|
|
93
|
+
```typescript
|
|
94
|
+
import { createVectorQueryTool } from '@mastra/rag'
|
|
95
|
+
|
|
96
|
+
const vectorTool = createVectorQueryTool({
|
|
97
|
+
vectorStoreName: 'pinecone',
|
|
98
|
+
indexName: 'documents',
|
|
99
|
+
model: embedModel,
|
|
100
|
+
databaseConfig: {
|
|
101
|
+
pinecone: {
|
|
102
|
+
namespace: 'production',
|
|
103
|
+
},
|
|
104
|
+
},
|
|
105
|
+
})
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**Runtime Override**:
|
|
109
|
+
|
|
110
|
+
### Runtime Configuration Override
|
|
111
|
+
|
|
112
|
+
```typescript
|
|
113
|
+
import { RequestContext } from '@mastra/core/request-context'
|
|
114
|
+
|
|
115
|
+
// Initial configuration
|
|
116
|
+
const vectorTool = createVectorQueryTool({
|
|
117
|
+
vectorStoreName: 'pinecone',
|
|
118
|
+
indexName: 'documents',
|
|
119
|
+
model: embedModel,
|
|
120
|
+
databaseConfig: {
|
|
121
|
+
pinecone: {
|
|
122
|
+
namespace: 'development',
|
|
123
|
+
},
|
|
124
|
+
},
|
|
125
|
+
})
|
|
126
|
+
|
|
127
|
+
// Override at runtime
|
|
128
|
+
const requestContext = new RequestContext()
|
|
129
|
+
requestContext.set('databaseConfig', {
|
|
130
|
+
pinecone: {
|
|
131
|
+
namespace: 'production',
|
|
132
|
+
},
|
|
133
|
+
})
|
|
134
|
+
|
|
135
|
+
await vectorTool.execute({ queryText: 'search query' }, { mastra, requestContext })
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
**Multi-Database**:
|
|
139
|
+
|
|
140
|
+
### Multi-Database Configuration
|
|
141
|
+
|
|
142
|
+
```typescript
|
|
143
|
+
const vectorTool = createVectorQueryTool({
|
|
144
|
+
vectorStoreName: 'dynamic', // Will be determined at runtime
|
|
145
|
+
indexName: 'documents',
|
|
146
|
+
model: embedModel,
|
|
147
|
+
databaseConfig: {
|
|
148
|
+
pinecone: {
|
|
149
|
+
namespace: 'default',
|
|
150
|
+
},
|
|
151
|
+
pgvector: {
|
|
152
|
+
minScore: 0.8,
|
|
153
|
+
ef: 150,
|
|
154
|
+
},
|
|
155
|
+
chroma: {
|
|
156
|
+
where: { type: 'documentation' },
|
|
157
|
+
},
|
|
158
|
+
},
|
|
159
|
+
})
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
> **Note:** **Multi-Database Support**: When you configure multiple databases, only the configuration matching the actual vector store being used will be applied.
|
|
163
|
+
|
|
164
|
+
**Performance Tuning**:
|
|
165
|
+
|
|
166
|
+
### Performance Tuning
|
|
167
|
+
|
|
168
|
+
```typescript
|
|
169
|
+
// High accuracy configuration
|
|
170
|
+
const highAccuracyTool = createVectorQueryTool({
|
|
171
|
+
vectorStoreName: 'postgres',
|
|
172
|
+
indexName: 'embeddings',
|
|
173
|
+
model: embedModel,
|
|
174
|
+
databaseConfig: {
|
|
175
|
+
pgvector: {
|
|
176
|
+
ef: 400, // High accuracy
|
|
177
|
+
probes: 20, // High recall
|
|
178
|
+
minScore: 0.85, // High quality threshold
|
|
179
|
+
},
|
|
180
|
+
},
|
|
181
|
+
})
|
|
182
|
+
|
|
183
|
+
// High speed configuration
|
|
184
|
+
const highSpeedTool = createVectorQueryTool({
|
|
185
|
+
vectorStoreName: 'postgres',
|
|
186
|
+
indexName: 'embeddings',
|
|
187
|
+
model: embedModel,
|
|
188
|
+
databaseConfig: {
|
|
189
|
+
pgvector: {
|
|
190
|
+
ef: 50, // Lower accuracy, faster
|
|
191
|
+
probes: 3, // Lower recall, faster
|
|
192
|
+
minScore: 0.6, // Lower quality threshold
|
|
193
|
+
},
|
|
194
|
+
},
|
|
195
|
+
})
|
|
196
|
+
```
|
|
197
|
+
|
|
198
|
+
## Extensibility
|
|
199
|
+
|
|
200
|
+
The `DatabaseConfig` type is designed to be extensible. To add support for a new vector database:
|
|
201
|
+
|
|
202
|
+
```typescript
|
|
203
|
+
// 1. Define the configuration interface
|
|
204
|
+
export interface NewDatabaseConfig {
|
|
205
|
+
customParam1?: string
|
|
206
|
+
customParam2?: number
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
// 2. Extend DatabaseConfig type
|
|
210
|
+
export type DatabaseConfig = {
|
|
211
|
+
pinecone?: PineconeConfig
|
|
212
|
+
pgvector?: PgVectorConfig
|
|
213
|
+
chroma?: ChromaConfig
|
|
214
|
+
newdatabase?: NewDatabaseConfig
|
|
215
|
+
[key: string]: any
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
// 3. Use in vector query tool
|
|
219
|
+
const vectorTool = createVectorQueryTool({
|
|
220
|
+
vectorStoreName: 'newdatabase',
|
|
221
|
+
indexName: 'documents',
|
|
222
|
+
model: embedModel,
|
|
223
|
+
databaseConfig: {
|
|
224
|
+
newdatabase: {
|
|
225
|
+
customParam1: 'value',
|
|
226
|
+
customParam2: 42,
|
|
227
|
+
},
|
|
228
|
+
},
|
|
229
|
+
})
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
## Best Practices
|
|
233
|
+
|
|
234
|
+
1. **Environment Configuration**: Use different namespaces or configurations for different environments
|
|
235
|
+
2. **Performance Tuning**: Start with default values and adjust based on your specific needs
|
|
236
|
+
3. **Quality Filtering**: Use minScore to filter out low-quality results
|
|
237
|
+
4. **Runtime Flexibility**: Override configurations at runtime for dynamic scenarios
|
|
238
|
+
5. **Documentation**: Document your specific configuration choices for team members
|
|
239
|
+
|
|
240
|
+
## Migration Guide
|
|
241
|
+
|
|
242
|
+
Existing vector query tools continue to work without changes. To add database configurations:
|
|
243
|
+
|
|
244
|
+
```diff
|
|
245
|
+
const vectorTool = createVectorQueryTool({
|
|
246
|
+
vectorStoreName: 'pinecone',
|
|
247
|
+
indexName: 'documents',
|
|
248
|
+
model: embedModel,
|
|
249
|
+
+ databaseConfig: {
|
|
250
|
+
+ pinecone: {
|
|
251
|
+
+ namespace: 'production'
|
|
252
|
+
+ }
|
|
253
|
+
+ }
|
|
254
|
+
});
|
|
255
|
+
```
|
|
256
|
+
|
|
257
|
+
## Related
|
|
258
|
+
|
|
259
|
+
- [createVectorQueryTool()](https://mastra.ai/reference/tools/vector-query-tool)
|
|
260
|
+
- [Hybrid Vector Search](https://mastra.ai/docs/rag/retrieval)
|
|
261
|
+
- [Metadata Filters](https://mastra.ai/reference/rag/metadata-filters)
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# MDocument
|
|
2
|
+
|
|
3
|
+
The MDocument class processes documents for RAG applications. The main methods are `.chunk()` and `.extractMetadata()`.
|
|
4
|
+
|
|
5
|
+
## Constructor
|
|
6
|
+
|
|
7
|
+
**docs:** (`Array<{ text: string, metadata?: Record<string, any> }>`): Array of document chunks with their text content and optional metadata
|
|
8
|
+
|
|
9
|
+
**type:** (`'text' | 'html' | 'markdown' | 'json' | 'latex'`): Type of document content
|
|
10
|
+
|
|
11
|
+
## Static Methods
|
|
12
|
+
|
|
13
|
+
### fromText()
|
|
14
|
+
|
|
15
|
+
Creates a document from plain text content.
|
|
16
|
+
|
|
17
|
+
```typescript
|
|
18
|
+
static fromText(text: string, metadata?: Record<string, any>): MDocument
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### fromHTML()
|
|
22
|
+
|
|
23
|
+
Creates a document from HTML content.
|
|
24
|
+
|
|
25
|
+
```typescript
|
|
26
|
+
static fromHTML(html: string, metadata?: Record<string, any>): MDocument
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
### fromMarkdown()
|
|
30
|
+
|
|
31
|
+
Creates a document from Markdown content.
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
static fromMarkdown(markdown: string, metadata?: Record<string, any>): MDocument
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### fromJSON()
|
|
38
|
+
|
|
39
|
+
Creates a document from JSON content.
|
|
40
|
+
|
|
41
|
+
```typescript
|
|
42
|
+
static fromJSON(json: string, metadata?: Record<string, any>): MDocument
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## Instance Methods
|
|
46
|
+
|
|
47
|
+
### chunk()
|
|
48
|
+
|
|
49
|
+
Splits document into chunks and optionally extracts metadata.
|
|
50
|
+
|
|
51
|
+
```typescript
|
|
52
|
+
async chunk(params?: ChunkParams): Promise<Chunk[]>
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
See [chunk() reference](https://mastra.ai/reference/rag/chunk) for detailed options.
|
|
56
|
+
|
|
57
|
+
### getDocs()
|
|
58
|
+
|
|
59
|
+
Returns array of processed document chunks.
|
|
60
|
+
|
|
61
|
+
```typescript
|
|
62
|
+
getDocs(): Chunk[]
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
### getText()
|
|
66
|
+
|
|
67
|
+
Returns array of text strings from chunks.
|
|
68
|
+
|
|
69
|
+
```typescript
|
|
70
|
+
getText(): string[]
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
### getMetadata()
|
|
74
|
+
|
|
75
|
+
Returns array of metadata objects from chunks.
|
|
76
|
+
|
|
77
|
+
```typescript
|
|
78
|
+
getMetadata(): Record<string, any>[]
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
### extractMetadata()
|
|
82
|
+
|
|
83
|
+
Extracts metadata using specified extractors. See [ExtractParams reference](https://mastra.ai/reference/rag/extract-params) for details.
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
async extractMetadata(params: ExtractParams): Promise<MDocument>
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## Examples
|
|
90
|
+
|
|
91
|
+
```typescript
|
|
92
|
+
import { MDocument } from '@mastra/rag'
|
|
93
|
+
|
|
94
|
+
// Create document from text
|
|
95
|
+
const doc = MDocument.fromText('Your content here')
|
|
96
|
+
|
|
97
|
+
// Split into chunks with metadata extraction
|
|
98
|
+
const chunks = await doc.chunk({
|
|
99
|
+
strategy: 'markdown',
|
|
100
|
+
headers: [
|
|
101
|
+
['#', 'title'],
|
|
102
|
+
['##', 'section'],
|
|
103
|
+
],
|
|
104
|
+
extract: {
|
|
105
|
+
summary: true, // Extract summaries with default settings
|
|
106
|
+
keywords: true, // Extract keywords with default settings
|
|
107
|
+
},
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
// Get processed chunks
|
|
111
|
+
const docs = doc.getDocs()
|
|
112
|
+
const texts = doc.getText()
|
|
113
|
+
const metadata = doc.getMetadata()
|
|
114
|
+
```
|