@lobehub/chat 1.81.4 → 1.81.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.js +1 -0
- package/.github/workflows/release.yml +5 -0
- package/.github/workflows/test.yml +5 -0
- package/CHANGELOG.md +25 -0
- package/changelog/v1.json +9 -0
- package/locales/ar/models.json +3 -0
- package/locales/bg-BG/models.json +3 -0
- package/locales/de-DE/models.json +3 -0
- package/locales/en-US/models.json +3 -0
- package/locales/es-ES/models.json +3 -0
- package/locales/fa-IR/models.json +3 -0
- package/locales/fr-FR/models.json +3 -0
- package/locales/it-IT/models.json +3 -0
- package/locales/ja-JP/models.json +3 -0
- package/locales/ko-KR/models.json +3 -0
- package/locales/nl-NL/models.json +3 -0
- package/locales/pl-PL/models.json +3 -0
- package/locales/pt-BR/models.json +3 -0
- package/locales/ru-RU/models.json +3 -0
- package/locales/tr-TR/models.json +3 -0
- package/locales/vi-VN/models.json +3 -0
- package/locales/zh-CN/models.json +3 -0
- package/locales/zh-TW/models.json +3 -0
- package/package.json +2 -2
- package/packages/file-loaders/README.md +63 -0
- package/packages/file-loaders/package.json +42 -0
- package/packages/file-loaders/src/index.ts +2 -0
- package/packages/file-loaders/src/loadFile.ts +206 -0
- package/packages/file-loaders/src/loaders/docx/__snapshots__/index.test.ts.snap +74 -0
- package/packages/file-loaders/src/loaders/docx/fixtures/test.docx +0 -0
- package/packages/file-loaders/src/loaders/docx/index.test.ts +41 -0
- package/packages/file-loaders/src/loaders/docx/index.ts +73 -0
- package/packages/file-loaders/src/loaders/excel/__snapshots__/index.test.ts.snap +58 -0
- package/packages/file-loaders/src/loaders/excel/fixtures/test.xlsx +0 -0
- package/packages/file-loaders/src/loaders/excel/index.test.ts +47 -0
- package/packages/file-loaders/src/loaders/excel/index.ts +121 -0
- package/packages/file-loaders/src/loaders/index.ts +19 -0
- package/packages/file-loaders/src/loaders/pdf/__snapshots__/index.test.ts.snap +98 -0
- package/packages/file-loaders/src/loaders/pdf/index.test.ts +49 -0
- package/packages/file-loaders/src/loaders/pdf/index.ts +133 -0
- package/packages/file-loaders/src/loaders/pptx/__snapshots__/index.test.ts.snap +40 -0
- package/packages/file-loaders/src/loaders/pptx/fixtures/test.pptx +0 -0
- package/packages/file-loaders/src/loaders/pptx/index.test.ts +47 -0
- package/packages/file-loaders/src/loaders/pptx/index.ts +186 -0
- package/packages/file-loaders/src/loaders/text/__snapshots__/index.test.ts.snap +15 -0
- package/packages/file-loaders/src/loaders/text/fixtures/test.txt +2 -0
- package/packages/file-loaders/src/loaders/text/index.test.ts +38 -0
- package/packages/file-loaders/src/loaders/text/index.ts +53 -0
- package/packages/file-loaders/src/types.ts +200 -0
- package/packages/file-loaders/src/utils/isTextReadableFile.ts +68 -0
- package/packages/file-loaders/src/utils/parser-utils.ts +112 -0
- package/packages/file-loaders/test/__snapshots__/loaders.test.ts.snap +93 -0
- package/packages/file-loaders/test/fixtures/test.csv +4 -0
- package/packages/file-loaders/test/fixtures/test.docx +0 -0
- package/packages/file-loaders/test/fixtures/test.epub +0 -0
- package/packages/file-loaders/test/fixtures/test.md +3 -0
- package/packages/file-loaders/test/fixtures/test.pptx +0 -0
- package/packages/file-loaders/test/fixtures/test.txt +3 -0
- package/packages/file-loaders/test/loaders.test.ts +39 -0
@@ -0,0 +1,133 @@
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
2
|
+
import * as pdfjsLib from 'pdfjs-dist';
|
3
|
+
import type { PDFDocumentProxy, PDFPageProxy, TextContent } from 'pdfjs-dist/types/src/display/api';
|
4
|
+
|
5
|
+
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
6
|
+
|
7
|
+
/**
 * Loads PDF files page by page using the official pdfjs-dist library.
 *
 * `loadPages` never throws: any failure is reported as a single error page
 * whose `metadata.error` describes the problem.
 */
export class PdfLoader implements FileLoaderInterface {
  /** Document opened by the last `getPDFFile` call, or `null` when nothing is open. */
  private pdfInstance: PDFDocumentProxy | null = null;

  /** Path `pdfInstance` was opened from, so a different path never gets a stale document. */
  private pdfInstancePath: string | null = null;

  /**
   * Joins the text items of one page into a single string.
   *
   * Items without a `str` property are skipped. A newline is inserted before an
   * item whenever its `transform[5]` value (used here as the item's Y position)
   * differs from the previous text item's value.
   *
   * @param items The `items` array of a pdfjs `TextContent`.
   * @returns The page text, with `\n` wherever the Y position changes.
   */
  private static joinTextItems(items: TextContent['items']): string {
    let lastY: number | undefined;
    const parts: string[] = [];

    for (const item of items) {
      if (!('str' in item)) continue;

      const y: number = item.transform[5];
      // Compare against `undefined` explicitly: a truthiness check would treat a
      // previous Y of exactly 0 as "no previous item" and drop the line break.
      parts.push(lastY === undefined || lastY === y ? item.str : `\n${item.str}`);
      lastY = y;
    }

    return parts.join('');
  }

  /**
   * Opens the PDF at `filePath`, reusing the cached document when it was opened
   * from the same path and has not been released yet.
   *
   * @param filePath Path of the PDF file to read.
   * @returns The pdfjs document proxy.
   */
  private async getPDFFile(filePath: string): Promise<PDFDocumentProxy> {
    if (this.pdfInstance !== null && this.pdfInstancePath === filePath) return this.pdfInstance;

    // A document cached for another path must not be handed out for this one.
    await this.releasePDFFile();

    const dataBuffer = await readFile(filePath);

    const loadingTask = pdfjsLib.getDocument({
      // View over exactly the bytes of this Buffer (it may share a larger ArrayBuffer).
      data: new Uint8Array(dataBuffer.buffer, dataBuffer.byteOffset, dataBuffer.length),
      // Attempt to use system fonts
      useSystemFonts: true,
      // No explicit worker instance is supplied.
      // NOTE(review): the original comment says this disables the worker thread — confirm against pdfjs-dist docs.
      worker: undefined,
    });

    const pdf: PDFDocumentProxy = await loadingTask.promise;

    this.pdfInstance = pdf;
    this.pdfInstancePath = filePath;

    return pdf;
  }

  /**
   * Destroys the cached document, if any, and clears the cache so that a
   * destroyed document is never reused by a later call.
   */
  private async releasePDFFile(): Promise<void> {
    const pdf = this.pdfInstance;
    this.pdfInstance = null;
    this.pdfInstancePath = null;

    if (pdf !== null) await pdf.destroy();
  }

  /**
   * Extracts the text of every page of the PDF.
   *
   * @param filePath Path of the PDF file to read.
   * @returns One `DocumentPage` per PDF page (1-based `metadata.pageNumber`), or a
   *          single error page when the file cannot be loaded or parsed.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    try {
      const pdf = await this.getPDFFile(filePath);

      const pages: DocumentPage[] = [];

      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber += 1) {
        const page: PDFPageProxy = await pdf.getPage(pageNumber);
        const content: TextContent = await page.getTextContent();

        // Strip null characters from the joined text.
        const pageContent = PdfLoader.joinTextItems(content.items).replaceAll('\0', '');

        // Stats are calculated from the final, cleaned content.
        pages.push({
          charCount: pageContent.length,
          lineCount: pageContent.split('\n').length,
          metadata: { pageNumber },
          pageContent,
        });

        // Clean up page resources
        page.cleanup();
      }

      // Clean up document resources and drop the cached reference with them.
      await this.releasePDFFile();

      return pages;
    } catch (e) {
      // Best effort: do not leave a half-read document cached after a failure.
      await this.releasePDFFile().catch(() => undefined);

      const message = e instanceof Error ? e.message : String(e);
      const stack = e instanceof Error ? e.stack : undefined;
      console.error(`Error loading PDF file ${filePath} using pdfjs-dist: ${message}`, stack);

      const errorPage: DocumentPage = {
        charCount: 0,
        lineCount: 0,
        metadata: {
          error: `Failed to load or parse PDF file: ${message}`,
          filePath: filePath,
        },
        pageContent: '',
      };
      return [errorPage];
    }
  }

  /**
   * Aggregates content from PDF pages.
   * Pages carrying `metadata.error` are skipped; the rest are joined with a
   * double newline.
   *
   * @param pages Array of DocumentPage objects.
   * @returns Aggregated content as a string.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    return pages
      .filter((page) => !page.metadata.error)
      .map((page) => page.pageContent)
      .join('\n\n');
  }

  /**
   * Reads document-level metadata from the PDF.
   *
   * A failure of pdfjs `getMetadata()` is swallowed and reported as empty
   * `pdfInfo` / `null` `pdfMetadata`. Failures to open the file still reject.
   * The opened document stays cached until `loadPages` releases it.
   *
   * @param filePath Path of the PDF file to read.
   * @returns An object with `pdfInfo`, `pdfMetadata` and the pdfjs `pdfVersion`.
   */
  async attachDocumentMetadata(filePath: string): Promise<any> {
    const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);

    const pdfMetadata = (await pdf.getMetadata().catch(() => null)) ?? null;
    const pdfInfo = pdfMetadata?.info ?? {};
    const metadata = pdfMetadata?.metadata ?? null;

    return {
      // PDF info (Author, Title, etc.)
      pdfInfo: pdfInfo,
      // PDF metadata
      pdfMetadata: metadata,
      pdfVersion: pdfjsLib.version,
    };
  }
}
|
@@ -0,0 +1,40 @@
|
|
1
|
+
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
2
|
+
|
3
|
+
exports[`PptxLoader > should aggregate content correctly (joining slides) > aggregated_content 1`] = `
|
4
|
+
"<slide_page pageNumber="1">
|
5
|
+
Hello
|
6
|
+
Page1
|
7
|
+
</slide_page>
|
8
|
+
|
9
|
+
<slide_page pageNumber="2">
|
10
|
+
Word
|
11
|
+
Page2
|
12
|
+
</slide_page>"
|
13
|
+
`;
|
14
|
+
|
15
|
+
exports[`PptxLoader > should load pages correctly from a PPTX file (one page per slide) 1`] = `
|
16
|
+
[
|
17
|
+
{
|
18
|
+
"charCount": 11,
|
19
|
+
"lineCount": 2,
|
20
|
+
"metadata": {
|
21
|
+
"pageCount": 2,
|
22
|
+
"slideNumber": 1,
|
23
|
+
"sourceFileName": "test.pptx",
|
24
|
+
},
|
25
|
+
"pageContent": "Hello
|
26
|
+
Page1",
|
27
|
+
},
|
28
|
+
{
|
29
|
+
"charCount": 10,
|
30
|
+
"lineCount": 2,
|
31
|
+
"metadata": {
|
32
|
+
"pageCount": 2,
|
33
|
+
"slideNumber": 2,
|
34
|
+
"sourceFileName": "test.pptx",
|
35
|
+
},
|
36
|
+
"pageContent": "Word
|
37
|
+
Page2",
|
38
|
+
},
|
39
|
+
]
|
40
|
+
`;
|
Binary file
|
@@ -0,0 +1,47 @@
|
|
1
|
+
// @vitest-environment node
|
2
|
+
import path from 'node:path';
|
3
|
+
import { beforeEach, describe, expect, it } from 'vitest';
|
4
|
+
|
5
|
+
import type { FileLoaderInterface } from '../../types';
|
6
|
+
import { PptxLoader } from './index';
|
7
|
+
|
8
|
+
// Import PptxLoader
|
9
|
+
|
10
|
+
// Make sure a test.pptx file has been placed in the fixtures directory.
// Ideally that PPTX contains several slides so multi-page behaviour is exercised.
const resolveFixture = (filename: string) => path.join(__dirname, `./fixtures/${filename}`);

const samplePptxPath = resolveFixture('test.pptx');
const missingPptxPath = resolveFixture('nonexistent.pptx');

let loader: FileLoaderInterface;

beforeEach(() => {
  // A fresh loader for every test case.
  loader = new PptxLoader();
});

describe('PptxLoader', () => {
  it('should load pages correctly from a PPTX file (one page per slide)', async () => {
    const slidePages = await loader.loadPages(samplePptxPath);

    // There should be exactly one page per slide in the PPTX file.
    expect(slidePages.length).toBeGreaterThan(1);

    // Snapshot the whole pages array (this includes slideNumber).
    expect(slidePages).toMatchSnapshot();
  });

  it('should aggregate content correctly (joining slides)', async () => {
    const slidePages = await loader.loadPages(samplePptxPath);
    const aggregated = await loader.aggregateContent(slidePages);

    // The aggregated text of all slides is pinned by a named snapshot.
    expect(aggregated).toMatchSnapshot('aggregated_content');
  });

  it('should handle file read errors in loadPages', async () => {
    const result = await loader.loadPages(missingPptxPath);

    // Even on failure, a single page carrying the error information is returned.
    expect(result).toHaveLength(1);

    const [errorPage] = result;
    expect(errorPage.pageContent).toBe('');
    expect(errorPage.metadata.error).toContain('Failed to load or process PPTX file:');
  });
});
|
@@ -0,0 +1,186 @@
|
|
1
|
+
import path from 'node:path';
|
2
|
+
|
3
|
+
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
4
|
+
import { type ExtractedFile, extractFiles, parseString } from '../../utils/parser-utils';
|
5
|
+
|
6
|
+
/**
 * Represents a loader for PPTX files using extracted utility functions.
 *
 * This loader reads a PPTX file, extracts text content from each slide,
 * and represents each slide as a `DocumentPage`.
 */
export class PptxLoader implements FileLoaderInterface {
  /**
   * Matches archive entries that are slide XML files.
   * Deliberately NOT global: a `g` regex keeps `lastIndex` between `.test()`
   * calls, which makes consecutive tests on different strings unreliable.
   */
  private static readonly SLIDE_PATH_PATTERN = /ppt\/slides\/slide\d+\.xml/;

  /** Captures the numeric part of a slide file name. */
  private static readonly SLIDE_NUMBER_PATTERN = /slide(\d+)\.xml/;

  /** Returns whether an archive entry name looks like a slide XML file. */
  private static isSlidePath(fileName: string): boolean {
    return PptxLoader.SLIDE_PATH_PATTERN.test(fileName);
  }

  /** Returns the slide number encoded in a slide path, or `undefined` if there is none. */
  private static slideNumberOf(slidePath: string): number | undefined {
    const match = PptxLoader.SLIDE_NUMBER_PATTERN.exec(slidePath);
    return match ? parseInt(match[1], 10) : undefined;
  }

  /**
   * Loads pages from the specified PPTX file path.
   *
   * @param filePath The absolute path to the PPTX file.
   * @returns A Promise resolving to an array of `DocumentPage` objects, one per
   *          slide, ordered by slide number. If loading fails, no slides are
   *          found, or every slide fails to parse, it returns an array containing
   *          a single `DocumentPage` with error information in its metadata. If
   *          only some slides fail, their positions hold per-slide error pages.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    const sourceFileName = path.basename(filePath);

    try {
      // --- File Extraction Step ---
      // Extract only slide XML files
      const slideFiles: ExtractedFile[] = await extractFiles(filePath, (fileName) =>
        PptxLoader.isSlidePath(fileName),
      );

      // --- Validation Step ---
      if (slideFiles.length === 0) {
        console.warn(`No slide XML files found in ${sourceFileName}. May be corrupted or empty.`);
        return [
          this.createErrorPage(
            'No slides found. The PPTX file might be empty, corrupted, or does not contain standard slide XMLs.',
            sourceFileName,
          ),
        ];
      }

      // --- Sorting Step ---
      // Sort by slide number; entries without a number go last.
      slideFiles.sort(
        (a, b) =>
          (PptxLoader.slideNumberOf(a.path) ?? Infinity) -
          (PptxLoader.slideNumberOf(b.path) ?? Infinity),
      );

      // --- Page Creation Step ---
      const pages: DocumentPage[] = slideFiles.map((slideFile, index) =>
        this.createSlidePage(slideFile, index, slideFiles.length, sourceFileName),
      );

      // If every slide produced an error page, return a single summary error instead.
      const allErrored = pages.every((page) => page.metadata?.error);
      if (allErrored) {
        console.warn(`All slides failed to parse for ${sourceFileName}`);
        return [this.createErrorPage('All slides failed to parse correctly.', sourceFileName)];
      }

      return pages;
    } catch (error) {
      // --- Error Handling Step ---
      // This catches errors from extractFiles or other unexpected issues
      const errorMessage = `Failed to load or process PPTX file: ${error instanceof Error ? error.message : String(error)}`;
      console.error(errorMessage, { filePath });
      return [this.createErrorPage(errorMessage, sourceFileName)];
    }
  }

  /**
   * Aggregates the content from all DocumentPages (slides).
   *
   * Pages carrying `metadata.error` are skipped. Each remaining slide is wrapped
   * in a `<slide_page pageNumber="N">…</slide_page>` element (a bare
   * `<slide_page>` when no slide number is available), and the wrapped slides
   * are joined with a blank line.
   *
   * @param pages An array of `DocumentPage` objects obtained from `loadPages`.
   * @returns A Promise resolving to the aggregated content string. When no valid
   *          page exists, the content of the first page (or `''`) is returned.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    const validPages = pages.filter((page) => !page.metadata?.error);

    if (validPages.length === 0) {
      // Only error pages (or nothing) were supplied; error pages have empty content.
      return pages[0]?.pageContent || '';
    }

    return validPages
      .map((page) => {
        const slideNumber = page.metadata?.slideNumber;
        const header = slideNumber ? `<slide_page pageNumber="${slideNumber}">` : '<slide_page>'; // Fallback header
        return `${header}\n${page.pageContent}\n</slide_page>`;
      })
      .join('\n\n');
  }

  /**
   * Builds the `DocumentPage` for one slide, or a per-slide error page when its
   * XML cannot be parsed.
   *
   * @param slideFile The extracted slide entry (archive path and XML content).
   * @param index Zero-based position of the slide after sorting; used as a
   *              fallback slide number when the path carries none.
   * @param slideCount Total number of slide files found in the archive.
   * @param sourceFileName Base name of the PPTX file, recorded in the metadata.
   */
  private createSlidePage(
    slideFile: ExtractedFile,
    index: number,
    slideCount: number,
    sourceFileName: string,
  ): DocumentPage {
    try {
      // Trim first so the stats below describe exactly the stored content.
      const pageContent = this.extractSlideText(slideFile.content).trim();

      return {
        charCount: pageContent.length,
        lineCount: pageContent.split('\n').length,
        metadata: {
          pageCount: slideCount, // Total number of slides found
          slideNumber: PptxLoader.slideNumberOf(slideFile.path) ?? index + 1,
          sourceFileName,
        },
        pageContent,
      };
    } catch (parseError) {
      const reason = parseError instanceof Error ? parseError.message : String(parseError);
      console.error(
        `Failed to parse XML for slide ${slideFile.path} in ${sourceFileName}: ${reason}`,
      );
      // One failing slide should not discard the others: report it as its own error page.
      return this.createErrorPage(
        `Error parsing slide ${slideFile.path}: ${reason}`,
        sourceFileName,
        slideFile.path,
      );
    }
  }

  /**
   * Extracts the text of one slide from its XML.
   *
   * The `a:t` text runs inside each `a:p` paragraph are concatenated without a
   * separator; empty paragraphs are dropped; paragraphs are joined with `\n`.
   */
  private extractSlideText(slideXml: ExtractedFile['content']): string {
    const xmlDoc = parseString(slideXml);
    const paragraphNodes = xmlDoc.getElementsByTagName('a:p');

    return Array.from(paragraphNodes)
      .map((pNode) => {
        const textNodes = pNode.getElementsByTagName('a:t');
        return Array.from(textNodes)
          .map((tNode) => (tNode.childNodes[0] ? tNode.childNodes[0].nodeValue : ''))
          .join(''); // Join text within a paragraph without spaces
      })
      .filter((text) => text.length > 0) // Filter out empty paragraphs
      .join('\n'); // Join paragraphs with newline
  }

  /**
   * Helper method to create a standardized error page object.
   *
   * @param errorInfo A string describing the error.
   * @param sourceFileName The name of the file that caused the error.
   * @param sourceFilePath Optional: Specific path within the archive that caused the error (e.g., slide path)
   * @returns A `DocumentPage` object representing the error state.
   */
  private createErrorPage(
    errorInfo: string,
    sourceFileName: string,
    sourceFilePath?: string,
  ): DocumentPage {
    return {
      charCount: 0,
      lineCount: 0,
      metadata: {
        error: errorInfo,
        pageCount: 0,
        sourceFileName: sourceFileName,
        ...(sourceFilePath && { sourceFilePath }), // Add specific path if available
      },
      pageContent: '', // Error pages have no content
    };
  }
}
|
@@ -0,0 +1,15 @@
|
|
1
|
+
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
2
|
+
|
3
|
+
exports[`TextLoader > should load pages correctly 1`] = `
|
4
|
+
{
|
5
|
+
"charCount": 25,
|
6
|
+
"lineCount": 3,
|
7
|
+
"metadata": {
|
8
|
+
"lineNumberEnd": 3,
|
9
|
+
"lineNumberStart": 1,
|
10
|
+
},
|
11
|
+
"pageContent": "Hello Text.
|
12
|
+
Second Line.
|
13
|
+
",
|
14
|
+
}
|
15
|
+
`;
|
@@ -0,0 +1,38 @@
|
|
1
|
+
import path from 'node:path';
// Import every vitest API used below explicitly (as the sibling PPTX test does),
// so this file does not depend on vitest's `globals` option being enabled.
import { beforeEach, describe, expect, it } from 'vitest';

import type { FileLoaderInterface } from '../../types';
import { TextLoader } from './index';

// Resolves a file name inside this test's fixtures directory.
const fixturePath = (filename: string) => path.join(__dirname, `./fixtures/${filename}`);

let loader: FileLoaderInterface;

const testFile = fixturePath('test.txt');

beforeEach(() => {
  // A fresh loader for every test case.
  loader = new TextLoader();
});

describe('TextLoader', () => {
  it('should load pages correctly', async () => {
    const pages = await loader.loadPages(testFile);
    expect(pages).toHaveLength(1);
    const page = pages[0];
    expect(page).toMatchSnapshot();
  });

  it('should aggregate content correctly', async () => {
    const pages = await loader.loadPages(testFile);
    const content = await loader.aggregateContent(pages);
    // Default aggregation joins with newline
    expect(content).toBe('Hello Text.\nSecond Line.\n');
  });

  it('should handle file read errors in loadPages', async () => {
    const pages = await loader.loadPages(fixturePath('nonexistent.txt'));
    expect(pages).toHaveLength(1);
    expect(pages[0].metadata.error).toContain('Failed to load text file');
    expect(pages[0].pageContent).toBe('');
  });
});
|
@@ -0,0 +1,53 @@
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
2
|
+
|
3
|
+
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
4
|
+
|
5
|
+
/**
 * Loader for plain-text files.
 */
export class TextLoader implements FileLoaderInterface {
  /**
   * Reads the whole file as UTF-8 and returns it as a single page.
   *
   * @param filePath Path of the text file to read.
   * @returns An array with exactly one page: the file content, or, when the
   *          read fails, an empty page whose `metadata.error` holds the reason.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    let text: string;

    try {
      text = await readFile(filePath, 'utf8');
    } catch (e) {
      const { message } = e as Error;
      console.error(`Error loading text file ${filePath}: ${message}`);

      // Reading failed: return a page that carries the error information.
      return [
        {
          charCount: 0,
          lineCount: 0,
          metadata: {
            error: `Failed to load text file: ${message}`,
          },
          pageContent: '',
        },
      ];
    }

    // Number of '\n'-separated segments; a trailing newline counts as one more.
    const totalLines = text.split('\n').length;

    return [
      {
        charCount: text.length,
        lineCount: totalLines,
        metadata: {
          lineNumberEnd: totalLines,
          lineNumberStart: 1,
        },
        pageContent: text,
      },
    ];
  }

  /**
   * For plain text, simply concatenates the content of all pages.
   * (A TextLoader normally yields a single page; this keeps the interface uniform.)
   *
   * @param pages Array of pages.
   * @returns The page contents joined with a newline.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    const contents: string[] = [];
    for (const { pageContent } of pages) {
      contents.push(pageContent);
    }
    return contents.join('\n');
  }
}
|