@lobehub/chat 1.81.4 → 1.81.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.js +1 -0
- package/.github/workflows/release.yml +5 -0
- package/.github/workflows/test.yml +5 -0
- package/CHANGELOG.md +50 -0
- package/changelog/v1.json +18 -0
- package/locales/ar/auth.json +1 -1
- package/locales/ar/hotkey.json +4 -0
- package/locales/ar/models.json +3 -0
- package/locales/bg-BG/auth.json +1 -1
- package/locales/bg-BG/hotkey.json +4 -0
- package/locales/bg-BG/models.json +3 -0
- package/locales/de-DE/auth.json +1 -1
- package/locales/de-DE/hotkey.json +4 -0
- package/locales/de-DE/models.json +3 -0
- package/locales/en-US/auth.json +1 -1
- package/locales/en-US/hotkey.json +4 -0
- package/locales/en-US/models.json +3 -0
- package/locales/es-ES/auth.json +1 -1
- package/locales/es-ES/hotkey.json +4 -0
- package/locales/es-ES/models.json +3 -0
- package/locales/fa-IR/auth.json +1 -1
- package/locales/fa-IR/hotkey.json +4 -0
- package/locales/fa-IR/models.json +3 -0
- package/locales/fr-FR/auth.json +1 -1
- package/locales/fr-FR/hotkey.json +4 -0
- package/locales/fr-FR/models.json +3 -0
- package/locales/it-IT/auth.json +1 -1
- package/locales/it-IT/hotkey.json +4 -0
- package/locales/it-IT/models.json +3 -0
- package/locales/ja-JP/auth.json +1 -1
- package/locales/ja-JP/hotkey.json +4 -0
- package/locales/ja-JP/models.json +3 -0
- package/locales/ko-KR/auth.json +1 -1
- package/locales/ko-KR/hotkey.json +4 -0
- package/locales/ko-KR/models.json +3 -0
- package/locales/nl-NL/auth.json +1 -1
- package/locales/nl-NL/hotkey.json +4 -0
- package/locales/nl-NL/models.json +3 -0
- package/locales/pl-PL/auth.json +1 -1
- package/locales/pl-PL/hotkey.json +4 -0
- package/locales/pl-PL/models.json +3 -0
- package/locales/pt-BR/auth.json +1 -1
- package/locales/pt-BR/hotkey.json +4 -0
- package/locales/pt-BR/models.json +3 -0
- package/locales/ru-RU/auth.json +1 -1
- package/locales/ru-RU/hotkey.json +4 -0
- package/locales/ru-RU/models.json +3 -0
- package/locales/tr-TR/auth.json +1 -1
- package/locales/tr-TR/hotkey.json +4 -0
- package/locales/tr-TR/models.json +3 -0
- package/locales/vi-VN/auth.json +1 -1
- package/locales/vi-VN/hotkey.json +4 -0
- package/locales/vi-VN/models.json +3 -0
- package/locales/zh-CN/auth.json +1 -1
- package/locales/zh-CN/changelog.json +1 -1
- package/locales/zh-CN/clerk.json +1 -1
- package/locales/zh-CN/discover.json +1 -1
- package/locales/zh-CN/file.json +1 -1
- package/locales/zh-CN/hotkey.json +4 -0
- package/locales/zh-CN/knowledgeBase.json +1 -1
- package/locales/zh-CN/metadata.json +1 -1
- package/locales/zh-CN/migration.json +1 -1
- package/locales/zh-CN/models.json +3 -0
- package/locales/zh-CN/ragEval.json +1 -1
- package/locales/zh-CN/thread.json +1 -1
- package/locales/zh-CN/welcome.json +1 -1
- package/locales/zh-TW/auth.json +1 -1
- package/locales/zh-TW/hotkey.json +4 -0
- package/locales/zh-TW/models.json +3 -0
- package/package.json +6 -4
- package/packages/file-loaders/README.md +63 -0
- package/packages/file-loaders/package.json +42 -0
- package/packages/file-loaders/src/index.ts +2 -0
- package/packages/file-loaders/src/loadFile.ts +206 -0
- package/packages/file-loaders/src/loaders/docx/__snapshots__/index.test.ts.snap +74 -0
- package/packages/file-loaders/src/loaders/docx/fixtures/test.docx +0 -0
- package/packages/file-loaders/src/loaders/docx/index.test.ts +41 -0
- package/packages/file-loaders/src/loaders/docx/index.ts +73 -0
- package/packages/file-loaders/src/loaders/excel/__snapshots__/index.test.ts.snap +58 -0
- package/packages/file-loaders/src/loaders/excel/fixtures/test.xlsx +0 -0
- package/packages/file-loaders/src/loaders/excel/index.test.ts +47 -0
- package/packages/file-loaders/src/loaders/excel/index.ts +121 -0
- package/packages/file-loaders/src/loaders/index.ts +19 -0
- package/packages/file-loaders/src/loaders/pdf/__snapshots__/index.test.ts.snap +98 -0
- package/packages/file-loaders/src/loaders/pdf/index.test.ts +49 -0
- package/packages/file-loaders/src/loaders/pdf/index.ts +133 -0
- package/packages/file-loaders/src/loaders/pptx/__snapshots__/index.test.ts.snap +40 -0
- package/packages/file-loaders/src/loaders/pptx/fixtures/test.pptx +0 -0
- package/packages/file-loaders/src/loaders/pptx/index.test.ts +47 -0
- package/packages/file-loaders/src/loaders/pptx/index.ts +186 -0
- package/packages/file-loaders/src/loaders/text/__snapshots__/index.test.ts.snap +15 -0
- package/packages/file-loaders/src/loaders/text/fixtures/test.txt +2 -0
- package/packages/file-loaders/src/loaders/text/index.test.ts +38 -0
- package/packages/file-loaders/src/loaders/text/index.ts +53 -0
- package/packages/file-loaders/src/types.ts +200 -0
- package/packages/file-loaders/src/utils/isTextReadableFile.ts +68 -0
- package/packages/file-loaders/src/utils/parser-utils.ts +112 -0
- package/packages/file-loaders/test/__snapshots__/loaders.test.ts.snap +93 -0
- package/packages/file-loaders/test/fixtures/test.csv +4 -0
- package/packages/file-loaders/test/fixtures/test.docx +0 -0
- package/packages/file-loaders/test/fixtures/test.epub +0 -0
- package/packages/file-loaders/test/fixtures/test.md +3 -0
- package/packages/file-loaders/test/fixtures/test.pptx +0 -0
- package/packages/file-loaders/test/fixtures/test.txt +3 -0
- package/packages/file-loaders/test/loaders.test.ts +39 -0
- package/src/config/aiModels/github.ts +2 -4
- package/src/config/aiModels/google.ts +3 -4
- package/src/config/aiModels/sensenova.ts +4 -5
- package/src/const/hotkeys.ts +6 -0
- package/src/features/ChatInput/ActionBar/Clear.tsx +18 -8
- package/src/hooks/useHotkeys/chatScope.ts +7 -0
- package/src/libs/agent-runtime/google/index.ts +1 -1
- package/src/libs/agent-runtime/sensenova/index.ts +20 -27
- package/src/libs/agent-runtime/utils/sensenovaHelpers.test.ts +24 -33
- package/src/libs/agent-runtime/utils/sensenovaHelpers.ts +2 -3
- package/src/locales/default/hotkey.ts +4 -0
- package/src/server/modules/MCPClient/__tests__/__snapshots__/index.test.ts.snap +113 -0
- package/src/server/modules/MCPClient/__tests__/index.test.ts +81 -0
- package/src/server/modules/MCPClient/index.ts +80 -0
- package/src/types/hotkey.ts +1 -0
@@ -0,0 +1,47 @@
|
|
1
|
+
// @vitest-environment node
|
2
|
+
import path from 'node:path';
|
3
|
+
import { beforeEach, describe, expect, it } from 'vitest';
|
4
|
+
|
5
|
+
import type { FileLoaderInterface } from '../../types';
|
6
|
+
import { PptxLoader } from './index';
|
7
|
+
|
8
|
+
// Import PptxLoader
|
9
|
+
|
10
|
+
// Requires a `test.pptx` fixture inside the sibling `fixtures` directory.
// The deck should contain several slides so that per-slide paging is exercised.
const fixturePath = (filename: string) => path.join(__dirname, `./fixtures/${filename}`);

const testFile = fixturePath('test.pptx');
const nonExistentFile = fixturePath('nonexistent.pptx');

let loader: FileLoaderInterface;

describe('PptxLoader', () => {
  // A fresh loader per test keeps the cases independent of each other.
  beforeEach(() => {
    loader = new PptxLoader();
  });

  it('should load pages correctly from a PPTX file (one page per slide)', async () => {
    const pages = await loader.loadPages(testFile);

    // One page is produced per slide, and the fixture has more than one slide.
    expect(pages.length).toBeGreaterThan(1);

    // Snapshot the whole page array (this includes each page's slideNumber).
    expect(pages).toMatchSnapshot();
  });

  it('should aggregate content correctly (joining slides)', async () => {
    const content = await loader.aggregateContent(await loader.loadPages(testFile));

    // The aggregated string is pinned by a named snapshot.
    expect(content).toMatchSnapshot('aggregated_content');
  });

  it('should handle file read errors in loadPages', async () => {
    const pages = await loader.loadPages(nonExistentFile);

    // A failed load still yields exactly one page, which carries the error.
    expect(pages).toHaveLength(1);

    const [errorPage] = pages;
    expect(errorPage.pageContent).toBe('');
    expect(errorPage.metadata.error).toContain('Failed to load or process PPTX file:');
  });
});
|
@@ -0,0 +1,186 @@
|
|
1
|
+
import path from 'node:path';
|
2
|
+
|
3
|
+
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
4
|
+
import { type ExtractedFile, extractFiles, parseString } from '../../utils/parser-utils';
|
5
|
+
|
6
|
+
/**
 * Loader for PPTX files built on the shared parser utilities.
 *
 * Every `ppt/slides/slideN.xml` entry returned by `extractFiles` is converted into one
 * `DocumentPage` that holds the text of that slide.
 */
export class PptxLoader implements FileLoaderInterface {
  /**
   * Loads one page per slide from the PPTX file at `filePath`.
   *
   * @param filePath Path to the PPTX file.
   * @returns The slide pages ordered by slide number. When extraction fails, no slide is
   *   found, or every slide fails to parse, the result is a single error page whose
   *   `metadata.error` describes the problem. A slide that fails on its own is returned
   *   as an error page in place of that slide.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    const sourceFileName = path.basename(filePath);

    try {
      // Deliberately NOT a global (`g`) regex: `.test()` on a global regex advances
      // `lastIndex` after a match, so the next entry name would be searched from that
      // offset and a slide tested right after another slide would be rejected.
      const slidesRegex = /ppt\/slides\/slide\d+\.xml/;
      const slideNumberRegex = /slide(\d+)\.xml/;

      // Keep only the slide XML entries.
      const slideFiles: ExtractedFile[] = await extractFiles(filePath, (fileName) =>
        slidesRegex.test(fileName),
      );

      if (slideFiles.length === 0) {
        console.warn(`No slide XML files found in ${sourceFileName}. May be corrupted or empty.`);
        return [
          this.createErrorPage(
            'No slides found. The PPTX file might be empty, corrupted, or does not contain standard slide XMLs.',
            sourceFileName,
          ),
        ];
      }

      // Order by the numeric slide index taken from the entry path, so that
      // slide10 sorts after slide2. Entries without a number sort last.
      slideFiles.sort((a, b) => {
        const matchA = a.path.match(slideNumberRegex);
        const matchB = b.path.match(slideNumberRegex);
        const numA = matchA ? parseInt(matchA[1], 10) : Infinity;
        const numB = matchB ? parseInt(matchB[1], 10) : Infinity;
        return numA - numB;
      });

      // Every slide yields exactly one page (content or error), so `pages` is never
      // empty here because `slideFiles` is non-empty.
      const pages: DocumentPage[] = slideFiles.map((slideFile, index) => {
        try {
          const xmlDoc = parseString(slideFile.content);
          const paragraphNodes = xmlDoc.getElementsByTagName('a:p');

          const slideText = Array.from(paragraphNodes)
            .map((pNode) =>
              // Text runs inside one paragraph are concatenated without a separator.
              Array.from(pNode.getElementsByTagName('a:t'))
                .map((tNode) => tNode.childNodes[0]?.nodeValue ?? '')
                .join(''),
            )
            .filter((text) => text.length > 0) // drop empty paragraphs
            .join('\n'); // one line per paragraph

          const slideNumberMatch = slideFile.path.match(slideNumberRegex);
          // Fall back to the position in the sorted list if the path has no number.
          const slideNumber = slideNumberMatch ? parseInt(slideNumberMatch[1], 10) : index + 1;

          return {
            // NOTE: counts are taken from the untrimmed text, while `pageContent` is trimmed.
            charCount: slideText.length,
            lineCount: slideText.split('\n').length,
            metadata: {
              pageCount: slideFiles.length, // total number of slides found
              slideNumber,
              sourceFileName,
            },
            pageContent: slideText.trim(),
          };
        } catch (parseError) {
          const reason = this.describeError(parseError);
          console.error(
            `Failed to parse XML for slide ${slideFile.path} in ${sourceFileName}: ${reason}`,
          );
          // One broken slide must not discard the others: emit an error page for it.
          return this.createErrorPage(
            `Error parsing slide ${slideFile.path}: ${reason}`,
            sourceFileName,
            slideFile.path,
          );
        }
      });

      // If nothing could be parsed, collapse the per-slide errors into one summary page.
      const allErrored = pages.every((page) => page.metadata?.error);
      if (allErrored) {
        console.warn(`All slides failed to parse for ${sourceFileName}`);
        return [this.createErrorPage('All slides failed to parse correctly.', sourceFileName)];
      }

      return pages;
    } catch (error) {
      // Reached when `extractFiles` rejects or anything unexpected is thrown above.
      const errorMessage = `Failed to load or process PPTX file: ${this.describeError(error)}`;
      console.error(errorMessage, { filePath });
      return [this.createErrorPage(errorMessage, sourceFileName)];
    }
  }

  /**
   * Aggregates slide pages into one string.
   *
   * Pages carrying `metadata.error` are skipped. Each remaining page is wrapped in a
   * `<slide_page pageNumber="N">…</slide_page>` element (without the attribute when the
   * page has no slide number) and the elements are separated by a blank line.
   *
   * @param pages Pages obtained from `loadPages`.
   * @returns The aggregated content. When no error-free page exists, the first page's
   *   content is returned, or an empty string if there is none.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    const validPages = pages.filter((page) => !page.metadata?.error);

    if (validPages.length === 0) {
      return pages[0]?.pageContent || '';
    }

    return validPages
      .map((page) => {
        const slideNumber = page.metadata?.slideNumber;
        const header = slideNumber ? `<slide_page pageNumber="${slideNumber}">` : '<slide_page>';
        return `${header}\n${page.pageContent}\n</slide_page>`;
      })
      .join('\n\n');
  }

  /**
   * Turns an arbitrary thrown value into a printable message.
   *
   * @param error The caught value.
   * @returns `error.message` for `Error` instances, otherwise `String(error)`.
   */
  private describeError(error: unknown): string {
    return error instanceof Error ? error.message : String(error);
  }

  /**
   * Builds a standardized, content-less page that records an error.
   *
   * @param errorInfo Description of the error.
   * @param sourceFileName Name of the file that caused the error.
   * @param sourceFilePath Optional path inside the archive (e.g. the slide entry) that failed.
   * @returns A `DocumentPage` with zero counts, empty content and the error in its metadata.
   */
  private createErrorPage(
    errorInfo: string,
    sourceFileName: string,
    sourceFilePath?: string,
  ): DocumentPage {
    return {
      charCount: 0,
      lineCount: 0,
      metadata: {
        error: errorInfo,
        pageCount: 0,
        sourceFileName: sourceFileName,
        ...(sourceFilePath && { sourceFilePath }), // only present when a path was given
      },
      pageContent: '',
    };
  }
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
2
|
+
|
3
|
+
exports[`TextLoader > should load pages correctly 1`] = `
|
4
|
+
{
|
5
|
+
"charCount": 25,
|
6
|
+
"lineCount": 3,
|
7
|
+
"metadata": {
|
8
|
+
"lineNumberEnd": 3,
|
9
|
+
"lineNumberStart": 1,
|
10
|
+
},
|
11
|
+
"pageContent": "Hello Text.
|
12
|
+
Second Line.
|
13
|
+
",
|
14
|
+
}
|
15
|
+
`;
|
@@ -0,0 +1,38 @@
|
|
1
|
+
import path from 'node:path';
|
2
|
+
import { beforeEach } from 'vitest';
|
3
|
+
|
4
|
+
import type { FileLoaderInterface } from '../../types';
|
5
|
+
import { TextLoader } from './index';
|
6
|
+
|
7
|
+
// Resolves a file name inside this spec's `fixtures` directory.
const fixturePath = (filename: string) => path.join(__dirname, `./fixtures/${filename}`);

const testFile = fixturePath('test.txt');

let loader: FileLoaderInterface;

describe('TextLoader', () => {
  // A fresh loader per test keeps the cases independent of each other.
  beforeEach(() => {
    loader = new TextLoader();
  });

  it('should load pages correctly', async () => {
    const pages = await loader.loadPages(testFile);
    expect(pages).toHaveLength(1);

    // The single page is pinned by the snapshot.
    const [page] = pages;
    expect(page).toMatchSnapshot();
  });

  it('should aggregate content correctly', async () => {
    const content = await loader.aggregateContent(await loader.loadPages(testFile));

    // With a single page the aggregated content equals the page content.
    expect(content).toBe('Hello Text.\nSecond Line.\n');
  });

  it('should handle file read errors in loadPages', async () => {
    const pages = await loader.loadPages(fixturePath('nonexistent.txt'));
    expect(pages).toHaveLength(1);

    const [errorPage] = pages;
    expect(errorPage.metadata.error).toContain('Failed to load text file');
    expect(errorPage.pageContent).toBe('');
  });
});
|
@@ -0,0 +1,53 @@
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
2
|
+
|
3
|
+
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
4
|
+
|
5
|
+
/**
 * Loader for plain-text files.
 *
 * The whole file is read as UTF-8 and returned as a single `DocumentPage`.
 */
export class TextLoader implements FileLoaderInterface {
  /**
   * Reads the file at `filePath` and wraps its content in one page.
   *
   * @param filePath Path of the text file to read.
   * @returns A one-element array. On success the page holds the full file content with
   *   its character count and `\n`-separated line count. On failure it is an empty page
   *   whose `metadata.error` carries the reason.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    try {
      const fileContent = await readFile(filePath, 'utf8');
      // A trailing newline counts as one extra (empty) line, as `split` reports it.
      const lineCount = fileContent.split('\n').length;

      const page: DocumentPage = {
        charCount: fileContent.length,
        lineCount,
        metadata: {
          lineNumberEnd: lineCount,
          lineNumberStart: 1,
        },
        pageContent: fileContent,
      };

      return [page];
    } catch (e: unknown) {
      // Narrow instead of casting: a thrown non-Error value has no `.message`.
      const message = e instanceof Error ? e.message : String(e);
      console.error(`Error loading text file ${filePath}: ${message}`);

      // A failed read still yields one page so callers can inspect the error.
      const errorPage: DocumentPage = {
        charCount: 0,
        lineCount: 0,
        metadata: {
          error: `Failed to load text file: ${message}`,
        },
        pageContent: '',
      };
      return [errorPage];
    }
  }

  /**
   * Concatenates the content of all pages, separated by a newline.
   *
   * `loadPages` of this loader returns a single page; joining keeps the method
   * consistent with the shared loader interface.
   *
   * @param pages Pages to aggregate.
   * @returns The joined page contents.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    return pages.map((page) => page.pageContent).join('\n');
  }
}
|
@@ -0,0 +1,200 @@
|
|
1
|
+
// File types that have a dedicated loader.
// Candidates not yet part of the union: 'latex' | 'epub' | 'code' | 'markdown'.
export type SupportedFileType =
  | 'pdf'
  | 'docx'
  | 'txt'
  | 'excel'
  | 'pptx';
|
3
|
+
|
4
|
+
/**
 * A fully loaded file: file-level information plus all of its pages/chunks.
 */
export interface FileDocument {
  /** The file content. */
  content: string;

  /** Creation timestamp of the file. */
  createdTime: Date;

  /** File type or extension. */
  fileType: string;

  /** Original file name. */
  filename: string;

  /**
   * File-level metadata, e.g. title/author taken from the file's properties,
   * or the error recorded when loading the whole file failed.
   */
  metadata: {
    /** Any additional file-level metadata. */
    [key: string]: any;
    /** Document author, if available. */
    author?: string;
    /** Error message recorded when loading the whole file failed. */
    error?: string;
    /** Document title, if available. */
    title?: string;
  };

  /** Last-modified timestamp of the file. */
  modifiedTime: Date;

  /**
   * All logical pages/chunks of the document,
   * usually in the natural order of the file.
   */
  pages?: DocumentPage[];

  /** Full path of the original file. */
  source: string;

  /**
   * Total character count of the document (sum of every page's `charCount`).
   * Can only be derived once all pages are loaded and counted.
   */
  totalCharCount: number;

  /**
   * Total line count of the document (sum of every page's `lineCount`).
   * Can only be derived once all pages are loaded and counted.
   */
  totalLineCount: number;
}
|
79
|
+
|
80
|
+
/**
 * One logical unit (page / chunk) of a file.
 */
export interface DocumentPage {
  /** Number of characters in this page/chunk. */
  charCount: number;

  /** Number of lines in this page/chunk. */
  lineCount: number;

  /** Metadata attached to this page/chunk. */
  metadata: {
    /** Any additional page/chunk-specific metadata. */
    [key: string]: any;

    /** Index of this chunk when the original unit was split further into chunks. */
    chunkIndex?: number;

    /** Error raised while processing this page/chunk. */
    error?: string;

    /** Last line number of this page/chunk in the original file. */
    lineNumberEnd?: number;

    /** First line number of this page/chunk in the original file. */
    lineNumberStart?: number;

    /** Page number (applies to PDF, DOCX). */
    pageNumber?: number;

    /** Section title associated with this page/chunk. */
    sectionTitle?: string;

    /** Worksheet name (applies to XLSX). */
    sheetName?: string;

    /** Slide number (applies to PPTX). */
    slideNumber?: number;

    /** Total number of chunks when the original unit was split further into chunks. */
    totalChunks?: number;
  };

  /** The core text content of this page/chunk. */
  pageContent: string;
}
|
154
|
+
|
155
|
+
/**
 * Optional file metadata used to override the information read from the file system.
 */
export interface FileMetadata {
  /** Creation timestamp of the file. */
  createdTime?: Date;
  /** File type or extension. */
  fileType?: string;
  /** File name. */
  filename?: string;
  /** Last-modified timestamp of the file. */
  modifiedTime?: Date;
  /** Identifier of where the file came from (e.g. an S3 URL or the original path). */
  source?: string;
}
|
180
|
+
|
181
|
+
/**
 * Contract that every file loader class must implement.
 */
export interface FileLoaderInterface {
  /**
   * Aggregates the pages obtained from `loadPages` into a single string.
   * @param pages Array of `DocumentPage` objects.
   * @returns A Promise resolving to the aggregated text content.
   */
  aggregateContent(pages: DocumentPage[]): Promise<string>;

  /**
   * Optional hook taking the file path and resolving to a metadata record.
   * NOTE(review): intended use is not visible here — presumably document-level
   * metadata merged into the loaded file; confirm against the caller.
   */
  attachDocumentMetadata?(filePath: string): Promise<Record<string, any>>;

  /**
   * Loads the file at the given path and splits it into logical pages/chunks.
   * @param filePath Full path of the file.
   * @returns A Promise resolving to an array of `DocumentPage` objects.
   */
  loadPages(filePath: string): Promise<DocumentPage[]>;
}
|
@@ -0,0 +1,68 @@
|
|
1
|
+
/**
 * Lower-case file extensions (without the leading dot) that are treated as text-readable.
 */
export const TEXT_READABLE_FILE_TYPES = [
  // Plain text & markup
  'txt',
  'md',
  'markdown',
  'mdx',

  // Configuration & data
  'json',
  'xml',
  'yaml',
  'yml',
  'toml',
  'ini',
  'cfg',
  'conf',
  'csv',

  // Web development
  'html',
  'htm',
  'css',
  'scss',
  'less',
  'js',
  'jsx',
  'ts',
  'tsx',
  'mjs',
  'vue',
  'svelte',
  'svg',

  // Scripting & programming
  'php',
  'py',
  'rb',
  'java',
  'c',
  'cpp',
  'h',
  'hpp',
  'cs',
  'go',
  'rs',
  'swift',
  'kt',
  'sh',
  'bash',
  'bat',
  'ps1',

  // Other
  'log',
  'sql',
  'patch',
  'diff',
  'db', // NOTE(review): listed as "often text-based" by the author; many .db files are binary — confirm
];

/**
 * Tells whether a file is likely readable as text, judging only by its extension.
 *
 * The comparison is case-insensitive.
 *
 * @param fileType File extension without the leading dot.
 * @returns `true` when the lower-cased extension is listed in `TEXT_READABLE_FILE_TYPES`.
 */
export function isTextReadableFile(fileType: string): boolean {
  const normalized = fileType.toLowerCase();
  return TEXT_READABLE_FILE_TYPES.some((extension) => extension === normalized);
}
|