@mastra/rag 0.1.19-alpha.3 → 0.1.19-alpha.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.turbo/turbo-build.log +7 -7
- package/CHANGELOG.md +15 -0
- package/dist/_tsup-dts-rollup.d.cts +237 -5
- package/dist/_tsup-dts-rollup.d.ts +237 -5
- package/dist/index.cjs +4118 -14
- package/dist/index.js +4115 -11
- package/package.json +2 -4
- package/src/document/document.test.ts +123 -2
- package/src/document/document.ts +15 -21
- package/src/document/extractors/index.ts +5 -0
- package/src/document/extractors/keywords.test.ts +119 -0
- package/src/document/extractors/keywords.ts +123 -0
- package/src/document/extractors/questions.test.ts +120 -0
- package/src/document/extractors/questions.ts +126 -0
- package/src/document/extractors/summary.test.ts +107 -0
- package/src/document/extractors/summary.ts +130 -0
- package/src/document/extractors/title.test.ts +121 -0
- package/src/document/extractors/title.ts +210 -0
- package/src/document/extractors/types.ts +40 -0
- package/src/document/types.ts +5 -33
- package/vitest.config.ts +0 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@mastra/rag",
|
|
3
|
-
"version": "0.1.19-alpha.3",
|
|
3
|
+
"version": "0.1.19-alpha.5",
|
|
4
4
|
"description": "",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "dist/index.js",
|
|
@@ -21,15 +21,13 @@
|
|
|
21
21
|
"author": "",
|
|
22
22
|
"license": "Elastic-2.0",
|
|
23
23
|
"dependencies": {
|
|
24
|
-
"@llamaindex/core": "^0.6.2",
|
|
25
|
-
"@llamaindex/env": "^0.1.29",
|
|
26
24
|
"@paralleldrive/cuid2": "^2.2.2",
|
|
27
25
|
"js-tiktoken": "^1.0.19",
|
|
28
26
|
"llamaindex": "^0.9.17",
|
|
29
27
|
"node-html-better-parser": "^1.4.7",
|
|
30
28
|
"pathe": "^2.0.3",
|
|
31
29
|
"zod": "^3.24.2",
|
|
32
|
-
"@mastra/core": "^0.9.0-alpha.
|
|
30
|
+
"@mastra/core": "^0.9.0-alpha.5"
|
|
33
31
|
},
|
|
34
32
|
"peerDependencies": {
|
|
35
33
|
"ai": "^4.0.0"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
2
|
import { embedMany } from 'ai';
|
|
3
|
-
import { describe, it, expect } from 'vitest';
|
|
3
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
4
4
|
|
|
5
5
|
import { MDocument } from './document';
|
|
6
6
|
import { Language } from './types';
|
|
@@ -20,6 +20,8 @@ const openai = createOpenAI({
|
|
|
20
20
|
apiKey: process.env.OPENAI_API_KEY,
|
|
21
21
|
});
|
|
22
22
|
|
|
23
|
+
vi.setConfig({ testTimeout: 10_000, hookTimeout: 10_000 });
|
|
24
|
+
|
|
23
25
|
describe('MDocument', () => {
|
|
24
26
|
describe('basics', () => {
|
|
25
27
|
let chunks: MDocument['chunks'];
|
|
@@ -1693,7 +1695,10 @@ describe('MDocument', () => {
|
|
|
1693
1695
|
expect(metadata).toBeDefined();
|
|
1694
1696
|
expect(metadata.documentTitle).toBeDefined();
|
|
1695
1697
|
expect(metadata.sectionSummary).toBeDefined();
|
|
1696
|
-
|
|
1698
|
+
const qStr = metadata.questionsThisExcerptCanAnswer;
|
|
1699
|
+
expect(qStr).toMatch(/1\..*\?/s);
|
|
1700
|
+
expect(qStr).toMatch(/2\..*\?/s);
|
|
1701
|
+
expect((qStr.match(/\?/g) || []).length).toBeGreaterThanOrEqual(2);
|
|
1697
1702
|
expect(metadata.excerptKeywords).toMatch(/^1\. .*\n2\. .*\n3\. .*$/);
|
|
1698
1703
|
}, 15000);
|
|
1699
1704
|
|
|
@@ -1711,6 +1716,122 @@ describe('MDocument', () => {
|
|
|
1711
1716
|
).rejects.toThrow("Summaries must be one of 'self', 'prev', 'next'");
|
|
1712
1717
|
}, 15000);
|
|
1713
1718
|
});
|
|
1719
|
+
|
|
1720
|
+
describe('metadata preservation', () => {
|
|
1721
|
+
const baseText = 'This is a test document for metadata extraction.';
|
|
1722
|
+
const baseMetadata = { source: 'unit-test', customField: 123 };
|
|
1723
|
+
|
|
1724
|
+
it('preserves metadata with KeywordExtractor', async () => {
|
|
1725
|
+
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1726
|
+
const chunks = await doc.chunk({ extract: { keywords: true } });
|
|
1727
|
+
const metadata = chunks[0].metadata;
|
|
1728
|
+
expect(metadata.source).toBe('unit-test');
|
|
1729
|
+
expect(metadata.customField).toBe(123);
|
|
1730
|
+
expect(metadata.excerptKeywords).toBeDefined();
|
|
1731
|
+
});
|
|
1732
|
+
|
|
1733
|
+
it('preserves metadata with SummaryExtractor', async () => {
|
|
1734
|
+
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1735
|
+
const chunks = await doc.chunk({ extract: { summary: true } });
|
|
1736
|
+
const metadata = chunks[0].metadata;
|
|
1737
|
+
expect(metadata.source).toBe('unit-test');
|
|
1738
|
+
expect(metadata.customField).toBe(123);
|
|
1739
|
+
expect(metadata.sectionSummary).toBeDefined();
|
|
1740
|
+
});
|
|
1741
|
+
|
|
1742
|
+
it('preserves metadata with QuestionsAnsweredExtractor', async () => {
|
|
1743
|
+
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1744
|
+
const chunks = await doc.chunk({ extract: { questions: true } });
|
|
1745
|
+
const metadata = chunks[0].metadata;
|
|
1746
|
+
expect(metadata.source).toBe('unit-test');
|
|
1747
|
+
expect(metadata.customField).toBe(123);
|
|
1748
|
+
expect(metadata.questionsThisExcerptCanAnswer).toBeDefined();
|
|
1749
|
+
});
|
|
1750
|
+
|
|
1751
|
+
it('preserves metadata with TitleExtractor', async () => {
|
|
1752
|
+
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1753
|
+
const chunks = await doc.chunk({ extract: { title: true } });
|
|
1754
|
+
const metadata = chunks[0].metadata;
|
|
1755
|
+
expect(metadata.source).toBe('unit-test');
|
|
1756
|
+
expect(metadata.customField).toBe(123);
|
|
1757
|
+
expect(metadata.documentTitle).toBeDefined();
|
|
1758
|
+
});
|
|
1759
|
+
|
|
1760
|
+
it('preserves metadata with multiple extractors', async () => {
|
|
1761
|
+
const doc = MDocument.fromText(baseText, { ...baseMetadata });
|
|
1762
|
+
const chunks = await doc.chunk({
|
|
1763
|
+
extract: {
|
|
1764
|
+
keywords: true,
|
|
1765
|
+
summary: true,
|
|
1766
|
+
questions: true,
|
|
1767
|
+
title: true,
|
|
1768
|
+
},
|
|
1769
|
+
});
|
|
1770
|
+
const metadata = chunks[0].metadata;
|
|
1771
|
+
expect(metadata.source).toBe('unit-test');
|
|
1772
|
+
expect(metadata.customField).toBe(123);
|
|
1773
|
+
expect(metadata.excerptKeywords).toBeDefined();
|
|
1774
|
+
expect(metadata.sectionSummary).toBeDefined();
|
|
1775
|
+
expect(metadata.questionsThisExcerptCanAnswer).toBeDefined();
|
|
1776
|
+
expect(metadata.documentTitle).toBeDefined();
|
|
1777
|
+
});
|
|
1778
|
+
it('preserves metadata on all chunks when multiple are created', async () => {
|
|
1779
|
+
const text = 'Chunk one.\n\nChunk two.\n\nChunk three.';
|
|
1780
|
+
const doc = MDocument.fromText(text, { source: 'multi-chunk', customField: 42 });
|
|
1781
|
+
const chunks = await doc.chunk({
|
|
1782
|
+
strategy: 'character',
|
|
1783
|
+
separator: '\n\n',
|
|
1784
|
+
size: 20,
|
|
1785
|
+
overlap: 0,
|
|
1786
|
+
extract: { keywords: true },
|
|
1787
|
+
});
|
|
1788
|
+
expect(chunks.length).toBeGreaterThan(1);
|
|
1789
|
+
for (const chunk of chunks) {
|
|
1790
|
+
const metadata = chunk.metadata;
|
|
1791
|
+
expect(metadata.source).toBe('multi-chunk');
|
|
1792
|
+
expect(metadata.customField).toBe(42);
|
|
1793
|
+
expect(metadata.excerptKeywords).toBeDefined();
|
|
1794
|
+
}
|
|
1795
|
+
});
|
|
1796
|
+
|
|
1797
|
+
it('overwrites only the matching metadata field with extractor output', async () => {
|
|
1798
|
+
const doc = MDocument.fromText('Test for overwrite', {
|
|
1799
|
+
excerptKeywords: 'original,keywords',
|
|
1800
|
+
unrelatedField: 'should stay',
|
|
1801
|
+
source: 'unit-test',
|
|
1802
|
+
});
|
|
1803
|
+
const chunks = await doc.chunk({ extract: { keywords: true } });
|
|
1804
|
+
const metadata = chunks[0].metadata;
|
|
1805
|
+
expect(metadata.source).toBe('unit-test');
|
|
1806
|
+
expect(metadata.unrelatedField).toBe('should stay');
|
|
1807
|
+
expect(metadata.excerptKeywords).not.toBe('original,keywords'); // Should be new keywords
|
|
1808
|
+
});
|
|
1809
|
+
});
|
|
1810
|
+
describe('MDocument TitleExtractor document grouping integration', () => {
|
|
1811
|
+
it('groups chunks by docId for title extraction (integration)', async () => {
|
|
1812
|
+
const doc = new MDocument({
|
|
1813
|
+
docs: [
|
|
1814
|
+
{ text: 'Alpha chunk 1', metadata: { docId: 'docA' } },
|
|
1815
|
+
{ text: 'Alpha chunk 2', metadata: { docId: 'docA' } },
|
|
1816
|
+
{ text: 'Beta chunk 1', metadata: { docId: 'docB' } },
|
|
1817
|
+
],
|
|
1818
|
+
type: 'text',
|
|
1819
|
+
});
|
|
1820
|
+
|
|
1821
|
+
await doc.extractMetadata({ title: true });
|
|
1822
|
+
const chunks = doc.getDocs();
|
|
1823
|
+
|
|
1824
|
+
const titleA1 = chunks[0].metadata.documentTitle;
|
|
1825
|
+
const titleA2 = chunks[1].metadata.documentTitle;
|
|
1826
|
+
const titleB = chunks[2].metadata.documentTitle;
|
|
1827
|
+
|
|
1828
|
+
expect(titleA1).toBeDefined();
|
|
1829
|
+
expect(titleA2).toBeDefined();
|
|
1830
|
+
expect(titleB).toBeDefined();
|
|
1831
|
+
expect(titleA1).toBe(titleA2);
|
|
1832
|
+
expect(titleA1).not.toBe(titleB);
|
|
1833
|
+
});
|
|
1834
|
+
});
|
|
1714
1835
|
});
|
|
1715
1836
|
|
|
1716
1837
|
// Helper function to find the longest common substring between two strings
|
package/src/document/document.ts
CHANGED
|
@@ -1,13 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
KeywordExtractor,
|
|
5
|
-
QuestionsAnsweredExtractor,
|
|
6
|
-
SummaryExtractor,
|
|
7
|
-
TitleExtractor,
|
|
8
|
-
ObjectType,
|
|
9
|
-
NodeRelationship,
|
|
10
|
-
} from 'llamaindex';
|
|
1
|
+
import { Document as Chunk, IngestionPipeline, NodeRelationship, ObjectType } from 'llamaindex';
|
|
2
|
+
|
|
3
|
+
import { TitleExtractor, SummaryExtractor, QuestionsAnsweredExtractor, KeywordExtractor } from './extractors';
|
|
11
4
|
|
|
12
5
|
import { CharacterTransformer, RecursiveCharacterTransformer } from './transformers/character';
|
|
13
6
|
import { HTMLHeaderTransformer, HTMLSectionTransformer } from './transformers/html';
|
|
@@ -45,18 +38,19 @@ export class MDocument {
|
|
|
45
38
|
|
|
46
39
|
if (typeof title !== 'undefined') {
|
|
47
40
|
transformations.push(new TitleExtractor(typeof title === 'boolean' ? {} : title));
|
|
48
|
-
this.chunks = this.chunks.map(
|
|
49
|
-
|
|
50
|
-
new Chunk({
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
41
|
+
this.chunks = this.chunks.map(doc =>
|
|
42
|
+
doc?.metadata?.docId
|
|
43
|
+
? new Chunk({
|
|
44
|
+
...doc,
|
|
45
|
+
relationships: {
|
|
46
|
+
[NodeRelationship.SOURCE]: {
|
|
47
|
+
nodeId: doc.metadata.docId,
|
|
48
|
+
nodeType: ObjectType.DOCUMENT,
|
|
49
|
+
metadata: doc.metadata,
|
|
50
|
+
},
|
|
57
51
|
},
|
|
58
|
-
}
|
|
59
|
-
|
|
52
|
+
})
|
|
53
|
+
: doc,
|
|
60
54
|
);
|
|
61
55
|
}
|
|
62
56
|
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
|
+
import { TextNode } from 'llamaindex';
|
|
3
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
4
|
+
import { KeywordExtractor } from './keywords';
|
|
5
|
+
|
|
6
|
+
const openai = createOpenAI({
|
|
7
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
8
|
+
});
|
|
9
|
+
|
|
10
|
+
const model = openai('gpt-4o');
|
|
11
|
+
|
|
12
|
+
vi.setConfig({ testTimeout: 10_000, hookTimeout: 10_000 });
|
|
13
|
+
|
|
14
|
+
describe('KeywordExtractor', () => {
|
|
15
|
+
it('can use a custom model for keywords extraction', async () => {
|
|
16
|
+
const extractor = new KeywordExtractor({ llm: model });
|
|
17
|
+
const node = new TextNode({ text: 'The quick brown fox jumps over the lazy dog.' });
|
|
18
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
19
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
20
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
it('handles empty input gracefully', async () => {
|
|
24
|
+
const extractor = new KeywordExtractor();
|
|
25
|
+
const node = new TextNode({ text: '' });
|
|
26
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
27
|
+
expect(result.excerptKeywords).toBe('');
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
it('supports prompt customization', async () => {
|
|
31
|
+
const extractor = new KeywordExtractor({
|
|
32
|
+
promptTemplate: 'List keywords in: {context}. Limit to {maxKeywords}.',
|
|
33
|
+
});
|
|
34
|
+
const node = new TextNode({ text: 'Test document for prompt customization.' });
|
|
35
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
36
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
37
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
38
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
39
|
+
});
|
|
40
|
+
|
|
41
|
+
it('extracts keywords from text', async () => {
|
|
42
|
+
const extractor = new KeywordExtractor();
|
|
43
|
+
const node = new TextNode({ text: 'The quick brown fox jumps over the lazy dog.' });
|
|
44
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
45
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
46
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
47
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
48
|
+
});
|
|
49
|
+
it('handles very long input', async () => {
|
|
50
|
+
const extractor = new KeywordExtractor();
|
|
51
|
+
const longText = 'A'.repeat(1000);
|
|
52
|
+
const node = new TextNode({ text: longText });
|
|
53
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
54
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
55
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
56
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
57
|
+
});
|
|
58
|
+
|
|
59
|
+
it('handles whitespace only input', async () => {
|
|
60
|
+
const extractor = new KeywordExtractor();
|
|
61
|
+
const node = new TextNode({ text: ' ' });
|
|
62
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
63
|
+
expect(result.excerptKeywords).toBe('');
|
|
64
|
+
});
|
|
65
|
+
|
|
66
|
+
it('handles special characters and emojis', async () => {
|
|
67
|
+
const extractor = new KeywordExtractor();
|
|
68
|
+
const node = new TextNode({ text: '🚀✨🔥' });
|
|
69
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
70
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
71
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
72
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
73
|
+
});
|
|
74
|
+
|
|
75
|
+
it('handles numbers only', async () => {
|
|
76
|
+
const extractor = new KeywordExtractor();
|
|
77
|
+
const node = new TextNode({ text: '1234567890' });
|
|
78
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
79
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
80
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
81
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
82
|
+
});
|
|
83
|
+
|
|
84
|
+
it('handles HTML tags', async () => {
|
|
85
|
+
const extractor = new KeywordExtractor();
|
|
86
|
+
const node = new TextNode({ text: '<h1>Test</h1>' });
|
|
87
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
88
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
89
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
90
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
it('handles non-English text', async () => {
|
|
94
|
+
const extractor = new KeywordExtractor();
|
|
95
|
+
const node = new TextNode({ text: '这是一个测试文档。' });
|
|
96
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
97
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
98
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
99
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
100
|
+
});
|
|
101
|
+
|
|
102
|
+
it('handles duplicate/repeated text', async () => {
|
|
103
|
+
const extractor = new KeywordExtractor();
|
|
104
|
+
const node = new TextNode({ text: 'repeat repeat repeat' });
|
|
105
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
106
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
107
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
108
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
109
|
+
});
|
|
110
|
+
|
|
111
|
+
it('handles only punctuation', async () => {
|
|
112
|
+
const extractor = new KeywordExtractor();
|
|
113
|
+
const node = new TextNode({ text: '!!!???...' });
|
|
114
|
+
const result = await extractor.extractKeywordsFromNodes(node);
|
|
115
|
+
expect(result).toHaveProperty('excerptKeywords');
|
|
116
|
+
expect(typeof result.excerptKeywords).toBe('string');
|
|
117
|
+
expect(result.excerptKeywords.length).toBeGreaterThan(0);
|
|
118
|
+
});
|
|
119
|
+
});
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import type { MastraLanguageModel } from '@mastra/core/agent';
|
|
2
|
+
import { PromptTemplate, defaultKeywordExtractPrompt, MetadataMode, TextNode, BaseExtractor } from 'llamaindex';
|
|
3
|
+
import type { KeywordExtractPrompt, BaseNode } from 'llamaindex';
|
|
4
|
+
import { baseLLM } from './types';
|
|
5
|
+
import type { KeywordExtractArgs } from './types';
|
|
6
|
+
|
|
7
|
+
type ExtractKeyword = {
|
|
8
|
+
/**
|
|
9
|
+
* Comma-separated keywords extracted from the node. May be empty if extraction fails.
|
|
10
|
+
*/
|
|
11
|
+
excerptKeywords: string;
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* Extract keywords from a list of nodes.
|
|
16
|
+
*/
|
|
17
|
+
export class KeywordExtractor extends BaseExtractor {
|
|
18
|
+
/**
|
|
19
|
+
* MastraLanguageModel instance.
|
|
20
|
+
* @type {MastraLanguageModel}
|
|
21
|
+
*/
|
|
22
|
+
llm: MastraLanguageModel;
|
|
23
|
+
|
|
24
|
+
/**
|
|
25
|
+
* Number of keywords to extract.
|
|
26
|
+
* @type {number}
|
|
27
|
+
* @default 5
|
|
28
|
+
*/
|
|
29
|
+
keywords: number = 5;
|
|
30
|
+
|
|
31
|
+
/**
|
|
32
|
+
* The prompt template to use for the keyword extractor.
|
|
33
|
+
* @type {string}
|
|
34
|
+
*/
|
|
35
|
+
promptTemplate: KeywordExtractPrompt;
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Constructor for the KeywordExtractor class.
|
|
39
|
+
* @param {MastraLanguageModel} llm MastraLanguageModel instance.
|
|
40
|
+
* @param {number} keywords Number of keywords to extract.
|
|
41
|
+
* @param {string} [promptTemplate] Optional custom prompt template (must include {context})
|
|
42
|
+
* @throws {Error} If keywords is less than 1.
|
|
43
|
+
*/
|
|
44
|
+
constructor(options?: KeywordExtractArgs) {
|
|
45
|
+
if (options?.keywords && options.keywords < 1) throw new Error('Keywords must be greater than 0');
|
|
46
|
+
|
|
47
|
+
super();
|
|
48
|
+
|
|
49
|
+
this.llm = options?.llm ?? baseLLM;
|
|
50
|
+
this.keywords = options?.keywords ?? 5;
|
|
51
|
+
this.promptTemplate = options?.promptTemplate
|
|
52
|
+
? new PromptTemplate({
|
|
53
|
+
templateVars: ['context', 'maxKeywords'],
|
|
54
|
+
template: options.promptTemplate,
|
|
55
|
+
})
|
|
56
|
+
: defaultKeywordExtractPrompt;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
*
|
|
61
|
+
* @param node Node to extract keywords from.
|
|
62
|
+
* @returns Keywords extracted from the node.
|
|
63
|
+
*/
|
|
64
|
+
/**
|
|
65
|
+
* Extract keywords from a node. Returns an object with a comma-separated string of keywords, or an empty string if extraction fails.
|
|
66
|
+
* Adds error handling for malformed/empty LLM output.
|
|
67
|
+
*/
|
|
68
|
+
async extractKeywordsFromNodes(node: BaseNode): Promise<ExtractKeyword> {
|
|
69
|
+
const text = node.getContent(this.metadataMode);
|
|
70
|
+
if (!text || text.trim() === '') {
|
|
71
|
+
return { excerptKeywords: '' };
|
|
72
|
+
}
|
|
73
|
+
if (this.isTextNodeOnly && !(node instanceof TextNode)) {
|
|
74
|
+
return { excerptKeywords: '' };
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
let keywords = '';
|
|
78
|
+
try {
|
|
79
|
+
const completion = await this.llm.doGenerate({
|
|
80
|
+
inputFormat: 'messages',
|
|
81
|
+
mode: { type: 'regular' },
|
|
82
|
+
prompt: [
|
|
83
|
+
{
|
|
84
|
+
role: 'user',
|
|
85
|
+
content: [
|
|
86
|
+
{
|
|
87
|
+
type: 'text',
|
|
88
|
+
text: this.promptTemplate.format({
|
|
89
|
+
context: node.getContent(MetadataMode.ALL),
|
|
90
|
+
maxKeywords: this.keywords.toString(),
|
|
91
|
+
}),
|
|
92
|
+
},
|
|
93
|
+
],
|
|
94
|
+
},
|
|
95
|
+
],
|
|
96
|
+
});
|
|
97
|
+
if (typeof completion.text === 'string') {
|
|
98
|
+
keywords = completion.text.trim();
|
|
99
|
+
} else {
|
|
100
|
+
console.warn('Keyword extraction LLM output was not a string:', completion.text);
|
|
101
|
+
}
|
|
102
|
+
} catch (err) {
|
|
103
|
+
console.warn('Keyword extraction failed:', err);
|
|
104
|
+
}
|
|
105
|
+
return { excerptKeywords: keywords };
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/**
|
|
109
|
+
*
|
|
110
|
+
* @param nodes Nodes to extract keywords from.
|
|
111
|
+
* @returns Keywords extracted from the nodes.
|
|
112
|
+
*/
|
|
113
|
+
/**
|
|
114
|
+
* Extract keywords from an array of nodes. Always returns an array (may be empty).
|
|
115
|
+
* @param nodes Nodes to extract keywords from.
|
|
116
|
+
* @returns Array of keyword extraction results.
|
|
117
|
+
*/
|
|
118
|
+
async extract(nodes: BaseNode[]): Promise<Array<ExtractKeyword>> {
|
|
119
|
+
if (!Array.isArray(nodes) || nodes.length === 0) return [];
|
|
120
|
+
const results = await Promise.all(nodes.map(node => this.extractKeywordsFromNodes(node)));
|
|
121
|
+
return results;
|
|
122
|
+
}
|
|
123
|
+
}
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import { createOpenAI } from '@ai-sdk/openai';
|
|
2
|
+
import { TextNode } from 'llamaindex';
|
|
3
|
+
import { describe, it, expect, vi } from 'vitest';
|
|
4
|
+
import { QuestionsAnsweredExtractor } from './questions';
|
|
5
|
+
|
|
6
|
+
const openai = createOpenAI({
|
|
7
|
+
apiKey: process.env.OPENAI_API_KEY,
|
|
8
|
+
});
|
|
9
|
+
|
|
10
|
+
const model = openai('gpt-4o');
|
|
11
|
+
|
|
12
|
+
vi.setConfig({ testTimeout: 10_000, hookTimeout: 10_000 });
|
|
13
|
+
|
|
14
|
+
describe('QuestionsAnsweredExtractor', () => {
|
|
15
|
+
it('can use a custom model for questions extraction', async () => {
|
|
16
|
+
const extractor = new QuestionsAnsweredExtractor({ llm: model });
|
|
17
|
+
const node = new TextNode({ text: 'What is the capital of Spain?' });
|
|
18
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
19
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
20
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
21
|
+
});
|
|
22
|
+
|
|
23
|
+
it('extracts questions', async () => {
|
|
24
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
25
|
+
const node = new TextNode({ text: 'What is the capital of France? What is the color of the sky?' });
|
|
26
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
27
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
28
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
29
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
30
|
+
});
|
|
31
|
+
|
|
32
|
+
it('handles empty input gracefully', async () => {
|
|
33
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
34
|
+
const node = new TextNode({ text: '' });
|
|
35
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
36
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
37
|
+
expect(result.questionsThisExcerptCanAnswer).toBe('');
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
it('supports prompt customization', async () => {
|
|
41
|
+
const extractor = new QuestionsAnsweredExtractor({
|
|
42
|
+
promptTemplate: 'List questions in: {context}. Limit to {numQuestions}.',
|
|
43
|
+
});
|
|
44
|
+
const node = new TextNode({ text: 'Test document for prompt customization.' });
|
|
45
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
46
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
47
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
48
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
49
|
+
});
|
|
50
|
+
it('handles very long input', async () => {
|
|
51
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
52
|
+
const longText = 'A'.repeat(1000);
|
|
53
|
+
const node = new TextNode({ text: longText });
|
|
54
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
55
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
56
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
57
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
58
|
+
});
|
|
59
|
+
|
|
60
|
+
it('handles whitespace only input', async () => {
|
|
61
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
62
|
+
const node = new TextNode({ text: ' ' });
|
|
63
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
64
|
+
expect(result.questionsThisExcerptCanAnswer).toBe('');
|
|
65
|
+
});
|
|
66
|
+
|
|
67
|
+
it('handles special characters and emojis', async () => {
|
|
68
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
69
|
+
const node = new TextNode({ text: '🚀✨🔥' });
|
|
70
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
71
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
72
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
73
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
it('handles numbers only', async () => {
|
|
77
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
78
|
+
const node = new TextNode({ text: '1234567890' });
|
|
79
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
80
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
81
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
82
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
83
|
+
});
|
|
84
|
+
|
|
85
|
+
it('handles HTML tags', async () => {
|
|
86
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
87
|
+
const node = new TextNode({ text: '<h1>Test</h1>' });
|
|
88
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
89
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
90
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
91
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
92
|
+
});
|
|
93
|
+
|
|
94
|
+
it('handles non-English text', async () => {
|
|
95
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
96
|
+
const node = new TextNode({ text: '这是一个测试文档。' });
|
|
97
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
98
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
99
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
100
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
101
|
+
});
|
|
102
|
+
|
|
103
|
+
it('handles duplicate/repeated text', async () => {
|
|
104
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
105
|
+
const node = new TextNode({ text: 'repeat repeat repeat' });
|
|
106
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
107
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
108
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
109
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
110
|
+
});
|
|
111
|
+
|
|
112
|
+
it('handles only punctuation', async () => {
|
|
113
|
+
const extractor = new QuestionsAnsweredExtractor();
|
|
114
|
+
const node = new TextNode({ text: '!!!???...' });
|
|
115
|
+
const result = await extractor.extractQuestionsFromNode(node);
|
|
116
|
+
expect(result).toHaveProperty('questionsThisExcerptCanAnswer');
|
|
117
|
+
expect(typeof result.questionsThisExcerptCanAnswer).toBe('string');
|
|
118
|
+
expect(result.questionsThisExcerptCanAnswer.length).toBeGreaterThan(0);
|
|
119
|
+
});
|
|
120
|
+
});
|