@lobehub/chat 1.81.4 → 1.81.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. package/.eslintrc.js +1 -0
  2. package/.github/workflows/release.yml +5 -0
  3. package/.github/workflows/test.yml +5 -0
  4. package/CHANGELOG.md +50 -0
  5. package/changelog/v1.json +18 -0
  6. package/locales/ar/auth.json +1 -1
  7. package/locales/ar/hotkey.json +4 -0
  8. package/locales/ar/models.json +3 -0
  9. package/locales/bg-BG/auth.json +1 -1
  10. package/locales/bg-BG/hotkey.json +4 -0
  11. package/locales/bg-BG/models.json +3 -0
  12. package/locales/de-DE/auth.json +1 -1
  13. package/locales/de-DE/hotkey.json +4 -0
  14. package/locales/de-DE/models.json +3 -0
  15. package/locales/en-US/auth.json +1 -1
  16. package/locales/en-US/hotkey.json +4 -0
  17. package/locales/en-US/models.json +3 -0
  18. package/locales/es-ES/auth.json +1 -1
  19. package/locales/es-ES/hotkey.json +4 -0
  20. package/locales/es-ES/models.json +3 -0
  21. package/locales/fa-IR/auth.json +1 -1
  22. package/locales/fa-IR/hotkey.json +4 -0
  23. package/locales/fa-IR/models.json +3 -0
  24. package/locales/fr-FR/auth.json +1 -1
  25. package/locales/fr-FR/hotkey.json +4 -0
  26. package/locales/fr-FR/models.json +3 -0
  27. package/locales/it-IT/auth.json +1 -1
  28. package/locales/it-IT/hotkey.json +4 -0
  29. package/locales/it-IT/models.json +3 -0
  30. package/locales/ja-JP/auth.json +1 -1
  31. package/locales/ja-JP/hotkey.json +4 -0
  32. package/locales/ja-JP/models.json +3 -0
  33. package/locales/ko-KR/auth.json +1 -1
  34. package/locales/ko-KR/hotkey.json +4 -0
  35. package/locales/ko-KR/models.json +3 -0
  36. package/locales/nl-NL/auth.json +1 -1
  37. package/locales/nl-NL/hotkey.json +4 -0
  38. package/locales/nl-NL/models.json +3 -0
  39. package/locales/pl-PL/auth.json +1 -1
  40. package/locales/pl-PL/hotkey.json +4 -0
  41. package/locales/pl-PL/models.json +3 -0
  42. package/locales/pt-BR/auth.json +1 -1
  43. package/locales/pt-BR/hotkey.json +4 -0
  44. package/locales/pt-BR/models.json +3 -0
  45. package/locales/ru-RU/auth.json +1 -1
  46. package/locales/ru-RU/hotkey.json +4 -0
  47. package/locales/ru-RU/models.json +3 -0
  48. package/locales/tr-TR/auth.json +1 -1
  49. package/locales/tr-TR/hotkey.json +4 -0
  50. package/locales/tr-TR/models.json +3 -0
  51. package/locales/vi-VN/auth.json +1 -1
  52. package/locales/vi-VN/hotkey.json +4 -0
  53. package/locales/vi-VN/models.json +3 -0
  54. package/locales/zh-CN/auth.json +1 -1
  55. package/locales/zh-CN/changelog.json +1 -1
  56. package/locales/zh-CN/clerk.json +1 -1
  57. package/locales/zh-CN/discover.json +1 -1
  58. package/locales/zh-CN/file.json +1 -1
  59. package/locales/zh-CN/hotkey.json +4 -0
  60. package/locales/zh-CN/knowledgeBase.json +1 -1
  61. package/locales/zh-CN/metadata.json +1 -1
  62. package/locales/zh-CN/migration.json +1 -1
  63. package/locales/zh-CN/models.json +3 -0
  64. package/locales/zh-CN/ragEval.json +1 -1
  65. package/locales/zh-CN/thread.json +1 -1
  66. package/locales/zh-CN/welcome.json +1 -1
  67. package/locales/zh-TW/auth.json +1 -1
  68. package/locales/zh-TW/hotkey.json +4 -0
  69. package/locales/zh-TW/models.json +3 -0
  70. package/package.json +6 -4
  71. package/packages/file-loaders/README.md +63 -0
  72. package/packages/file-loaders/package.json +42 -0
  73. package/packages/file-loaders/src/index.ts +2 -0
  74. package/packages/file-loaders/src/loadFile.ts +206 -0
  75. package/packages/file-loaders/src/loaders/docx/__snapshots__/index.test.ts.snap +74 -0
  76. package/packages/file-loaders/src/loaders/docx/fixtures/test.docx +0 -0
  77. package/packages/file-loaders/src/loaders/docx/index.test.ts +41 -0
  78. package/packages/file-loaders/src/loaders/docx/index.ts +73 -0
  79. package/packages/file-loaders/src/loaders/excel/__snapshots__/index.test.ts.snap +58 -0
  80. package/packages/file-loaders/src/loaders/excel/fixtures/test.xlsx +0 -0
  81. package/packages/file-loaders/src/loaders/excel/index.test.ts +47 -0
  82. package/packages/file-loaders/src/loaders/excel/index.ts +121 -0
  83. package/packages/file-loaders/src/loaders/index.ts +19 -0
  84. package/packages/file-loaders/src/loaders/pdf/__snapshots__/index.test.ts.snap +98 -0
  85. package/packages/file-loaders/src/loaders/pdf/index.test.ts +49 -0
  86. package/packages/file-loaders/src/loaders/pdf/index.ts +133 -0
  87. package/packages/file-loaders/src/loaders/pptx/__snapshots__/index.test.ts.snap +40 -0
  88. package/packages/file-loaders/src/loaders/pptx/fixtures/test.pptx +0 -0
  89. package/packages/file-loaders/src/loaders/pptx/index.test.ts +47 -0
  90. package/packages/file-loaders/src/loaders/pptx/index.ts +186 -0
  91. package/packages/file-loaders/src/loaders/text/__snapshots__/index.test.ts.snap +15 -0
  92. package/packages/file-loaders/src/loaders/text/fixtures/test.txt +2 -0
  93. package/packages/file-loaders/src/loaders/text/index.test.ts +38 -0
  94. package/packages/file-loaders/src/loaders/text/index.ts +53 -0
  95. package/packages/file-loaders/src/types.ts +200 -0
  96. package/packages/file-loaders/src/utils/isTextReadableFile.ts +68 -0
  97. package/packages/file-loaders/src/utils/parser-utils.ts +112 -0
  98. package/packages/file-loaders/test/__snapshots__/loaders.test.ts.snap +93 -0
  99. package/packages/file-loaders/test/fixtures/test.csv +4 -0
  100. package/packages/file-loaders/test/fixtures/test.docx +0 -0
  101. package/packages/file-loaders/test/fixtures/test.epub +0 -0
  102. package/packages/file-loaders/test/fixtures/test.md +3 -0
  103. package/packages/file-loaders/test/fixtures/test.pptx +0 -0
  104. package/packages/file-loaders/test/fixtures/test.txt +3 -0
  105. package/packages/file-loaders/test/loaders.test.ts +39 -0
  106. package/src/config/aiModels/github.ts +2 -4
  107. package/src/config/aiModels/google.ts +3 -4
  108. package/src/config/aiModels/sensenova.ts +4 -5
  109. package/src/const/hotkeys.ts +6 -0
  110. package/src/features/ChatInput/ActionBar/Clear.tsx +18 -8
  111. package/src/hooks/useHotkeys/chatScope.ts +7 -0
  112. package/src/libs/agent-runtime/google/index.ts +1 -1
  113. package/src/libs/agent-runtime/sensenova/index.ts +20 -27
  114. package/src/libs/agent-runtime/utils/sensenovaHelpers.test.ts +24 -33
  115. package/src/libs/agent-runtime/utils/sensenovaHelpers.ts +2 -3
  116. package/src/locales/default/hotkey.ts +4 -0
  117. package/src/server/modules/MCPClient/__tests__/__snapshots__/index.test.ts.snap +113 -0
  118. package/src/server/modules/MCPClient/__tests__/index.test.ts +81 -0
  119. package/src/server/modules/MCPClient/index.ts +80 -0
  120. package/src/types/hotkey.ts +1 -0
@@ -0,0 +1,73 @@
1
+ import { DocxLoader as LangchainDocxLoader } from '@langchain/community/document_loaders/fs/docx';
2
+
3
+ import type { DocumentPage, FileLoaderInterface } from '../../types';
4
+
5
/**
 * Loads Word documents (.docx) using the LangChain Community DocxLoader.
 *
 * Every document returned by LangChain becomes one DocumentPage tagged with
 * `pageNumber: 1`. Failures are not thrown: they are reported as a single page
 * whose metadata carries an `error` message.
 */
export class DocxLoader implements FileLoaderInterface {
  /**
   * Reads the DOCX file at `filePath` and converts it to pages.
   * @param filePath Path of the .docx file to load.
   * @returns One page per LangChain document, a single empty page when LangChain
   * returns no documents, or a single error page when loading fails.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    try {
      const loader = new LangchainDocxLoader(filePath);
      // Langchain DocxLoader typically loads the whole doc as one document
      const docs = await loader.load();

      const pages: DocumentPage[] = docs.map((doc) => {
        const pageContent = doc.pageContent || '';

        // Keep whatever metadata Langchain provides except `source`, which is handled
        // at the FileDocument level. Filtering the entries avoids deleting a property
        // the compiler does not know about (previously silenced with @ts-expect-error).
        const langchainMetadata = Object.fromEntries(
          Object.entries(doc.metadata ?? {}).filter(([key]) => key !== 'source'),
        );

        return {
          charCount: pageContent.length,
          lineCount: pageContent.split('\n').length,
          // Langchain DocxLoader doesn't usually provide page numbers in metadata,
          // so the content is treated as a single page.
          metadata: { ...langchainMetadata, pageNumber: 1 },
          pageContent,
        };
      });

      // If docs array is empty (e.g., empty file), create an empty page
      if (pages.length === 0) {
        pages.push({
          charCount: 0,
          lineCount: 0,
          metadata: { pageNumber: 1 },
          pageContent: '',
        });
      }

      return pages;
    } catch (e) {
      // A thrown value is not guaranteed to be an Error, so narrow before reading `.message`.
      const message = e instanceof Error ? e.message : String(e);
      console.error(`Error loading DOCX file ${filePath} using LangChain loader: ${message}`);
      const errorPage: DocumentPage = {
        charCount: 0,
        lineCount: 0,
        metadata: {
          error: `Failed to load DOCX file: ${message}`,
        },
        pageContent: '',
      };
      return [errorPage];
    }
  }

  /**
   * Aggregates content from DOCX pages.
   * Uses double newline as a separator.
   * @param pages Array of DocumentPage objects.
   * @returns Aggregated content as a string.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    return pages.map((page) => page.pageContent).join('\n\n');
  }
}
@@ -0,0 +1,58 @@
1
+ // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
2
+
3
+ exports[`ExcelLoader > should aggregate content correctly (joining sheets) > aggregated_content 1`] = `
4
+ "## Sheet: 表1
5
+
6
+ | __EMPTY | 类别 A | 类别 B | __EMPTY_1 | __EMPTY_2 |
7
+ | --- | --- | --- | --- | --- |
8
+ | 项目 1 | 5 | 7 | | |
9
+ | 项目 2 | 10 | 8 | | |
10
+ | 项目 3 | 9 | 15 | | |
11
+ | 项目 4 | 7 | 12 | | |
12
+ | 项目 5 | 16 | 21 | | |
13
+
14
+ ---
15
+
16
+ ## Sheet: 表2 - 表格 2
17
+
18
+ | __EMPTY | 类别 A | 类别 B | __EMPTY_1 | __EMPTY_2 |
19
+ | --- | --- | --- | --- | --- |
20
+ | 项目 1 | 5 | 7 | | |
21
+ | 项目 2 | 10 | 8 | | |
22
+ | 项目 3 | 9 | 15 | | |
23
+ | 项目 4 | 7 | 12 | | |
24
+ | 项目 5 | 16 | 21 | | |"
25
+ `;
26
+
27
+ exports[`ExcelLoader > should load pages correctly from an Excel file (one page per sheet) 1`] = `
28
+ [
29
+ {
30
+ "charCount": 201,
31
+ "lineCount": 7,
32
+ "metadata": {
33
+ "sheetName": "表1",
34
+ },
35
+ "pageContent": "| __EMPTY | 类别 A | 类别 B | __EMPTY_1 | __EMPTY_2 |
36
+ | --- | --- | --- | --- | --- |
37
+ | 项目 1 | 5 | 7 | | |
38
+ | 项目 2 | 10 | 8 | | |
39
+ | 项目 3 | 9 | 15 | | |
40
+ | 项目 4 | 7 | 12 | | |
41
+ | 项目 5 | 16 | 21 | | |",
42
+ },
43
+ {
44
+ "charCount": 201,
45
+ "lineCount": 7,
46
+ "metadata": {
47
+ "sheetName": "表2 - 表格 2",
48
+ },
49
+ "pageContent": "| __EMPTY | 类别 A | 类别 B | __EMPTY_1 | __EMPTY_2 |
50
+ | --- | --- | --- | --- | --- |
51
+ | 项目 1 | 5 | 7 | | |
52
+ | 项目 2 | 10 | 8 | | |
53
+ | 项目 3 | 9 | 15 | | |
54
+ | 项目 4 | 7 | 12 | | |
55
+ | 项目 5 | 16 | 21 | | |",
56
+ },
57
+ ]
58
+ `;
@@ -0,0 +1,47 @@
1
+ import path from 'node:path';
2
+ import { beforeEach, describe, expect, it } from 'vitest';
3
+
4
+ import type { FileLoaderInterface } from '../../types';
5
+ import { ExcelLoader } from './index';
6
+
7
// Make sure a test.xlsx file has been placed in the fixtures directory.
// Ideally this Excel file contains several worksheets (sheets) so they can be tested.
const fixturePath = (filename: string) => path.join(__dirname, `./fixtures/${filename}`);

let loader: FileLoaderInterface;

const testFile = fixturePath('test.xlsx');
const nonExistentFile = fixturePath('nonexistent.xlsx');

beforeEach(() => {
  loader = new ExcelLoader();
});

describe('ExcelLoader', () => {
  it('should load pages correctly from an Excel file (one page per sheet)', async () => {
    const pages = await loader.loadPages(testFile);
    // There should be as many pages as the Excel file has sheets
    expect(pages.length).toBeGreaterThan(0);

    // Snapshot-test the whole pages array directly
    expect(pages).toMatchSnapshot();

    // If your test.xlsx has several sheets, more assertions can be added,
    // for example checking the sheetName in a specific sheet's metadata
    // expect(pages[1].metadata.sheetName).toBe('Sheet2');
  });

  it('should aggregate content correctly (joining sheets)', async () => {
    const pages = await loader.loadPages(testFile);
    const content = await loader.aggregateContent(pages);
    // The recorded snapshot shows each sheet prefixed with a "## Sheet: <name>" header
    // and sheets separated by a "---" line
    expect(content).toMatchSnapshot('aggregated_content');
  });

  it('should handle file read errors in loadPages', async () => {
    const pages = await loader.loadPages(nonExistentFile);
    expect(pages).toHaveLength(1); // Even on failure, one page carrying the error information is returned
    expect(pages[0].pageContent).toBe('');
    expect(pages[0].metadata.error).toContain('Failed to load Excel file');
  });
});
@@ -0,0 +1,121 @@
1
+ import { readFile } from 'node:fs/promises';
2
+ import * as xlsx from 'xlsx';
3
+
4
+ import type { DocumentPage, FileLoaderInterface } from '../../types';
5
+
6
/**
 * Makes a piece of text safe to place inside a single Markdown table cell.
 * Pipes would end the cell early and line breaks would end the table row, so pipes are
 * backslash-escaped and line breaks are replaced with `<br>`.
 */
function toMarkdownCellText(text: string): string {
  return text.replace(/\|/g, '\\|').replace(/\r\n|\r|\n/g, '<br>');
}

/**
 * Converts sheet data (array of objects) to a Markdown table string.
 * Handles empty sheets and escapes pipe characters and line breaks in both
 * header and data cells, so every data row stays on exactly one line.
 * @param jsonData Rows of the sheet, keyed by column header. The columns are taken
 * from the keys of the first row.
 * @returns The Markdown table, or an italic placeholder message when there is nothing to render.
 */
function sheetToMarkdownTable(jsonData: Record<string, unknown>[]): string {
  if (!jsonData || jsonData.length === 0) {
    return '*Sheet is empty or contains no data.*';
  }

  // Ensure all rows have the same keys based on the first row, handle potentially sparse data
  const headers = Object.keys(jsonData[0] || {});
  if (headers.length === 0) {
    return '*Sheet has headers but no data.*';
  }

  // Headers are escaped for display only; the raw key is still used to look up cell values.
  const headerRow = `| ${headers.map((header) => toMarkdownCellText(header)).join(' | ')} |`;
  const separatorRow = `| ${headers.map(() => '---').join(' | ')} |`;

  const dataRows = jsonData
    .map((row) => {
      const cells = headers.map((header) => {
        const value = row[header];
        // Null/undefined become empty cells; everything else is stringified, trimmed and escaped
        if (value === null || value === undefined) return '';
        return toMarkdownCellText(String(value).trim());
      });
      return `| ${cells.join(' | ')} |`;
    })
    .join('\n');

  return `${headerRow}\n${separatorRow}\n${dataRows}`;
}
39
+
40
/**
 * Loads Excel files (.xlsx, .xls) using the 'xlsx' library.
 * Each sheet becomes a DocumentPage containing a Markdown table generated by sheetToMarkdownTable.
 * Failures are not thrown: they are reported as a single page whose metadata carries an `error`.
 */
export class ExcelLoader implements FileLoaderInterface {
  /**
   * Reads the workbook at `filePath` and renders every sheet as one page.
   * @param filePath Path of the Excel file to load.
   * @returns One page per sheet, or a single error page when the workbook has no
   * sheets or cannot be read/parsed.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    try {
      // Use readFile for async operation compatible with other loaders
      const dataBuffer = await readFile(filePath);
      const workbook = xlsx.read(dataBuffer, { type: 'buffer' });

      const pages: DocumentPage[] = workbook.SheetNames.map((sheetName) => {
        // Use sheet_to_json to get array of objects for the custom markdown function
        const rows = xlsx.utils.sheet_to_json<Record<string, unknown>>(
          workbook.Sheets[sheetName],
          {
            // Use empty string for blank cells
            defval: '',
            // Get formatted strings, not raw values
            raw: false,
          },
        );

        // Trim first so that the stats below describe exactly the content that is stored.
        const pageContent = sheetToMarkdownTable(rows).trim();

        return {
          charCount: pageContent.length,
          lineCount: pageContent.split('\n').length,
          metadata: { sheetName },
          pageContent,
        };
      });

      if (pages.length === 0) {
        pages.push({
          charCount: 0,
          lineCount: 0,
          metadata: {
            error: 'Excel file contains no sheets.',
          },
          pageContent: '',
        });
      }

      return pages;
    } catch (e) {
      // A thrown value is not guaranteed to be an Error, so narrow before reading `.message`.
      const message = e instanceof Error ? e.message : String(e);
      console.error(`Error loading Excel file ${filePath}: ${message}`);
      const errorPage: DocumentPage = {
        charCount: 0,
        lineCount: 0,
        metadata: {
          error: `Failed to load Excel file: ${message}`,
        },
        pageContent: '',
      };
      return [errorPage];
    }
  }

  /**
   * Aggregates content from Excel sheets (Markdown tables).
   * Adds the sheet name as a header before each table; pages without a sheet name
   * (such as error pages) are emitted without a header.
   * @param pages Array of DocumentPage objects from loadPages.
   * @returns Aggregated content as a string.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    return pages
      .map((page) => {
        const sheetName = page.metadata.sheetName;
        const header = sheetName ? `## Sheet: ${sheetName}\n\n` : '';
        return header + page.pageContent;
      })
      .join('\n\n---\n\n'); // Separator between sheets
  }
}
@@ -0,0 +1,19 @@
1
+ import { FileLoaderInterface, SupportedFileType } from '../types';
2
+ import { DocxLoader } from './docx';
3
+ // import { EpubLoader } from './epub';
4
+ import { ExcelLoader } from './excel';
5
+ import { PdfLoader } from './pdf';
6
+ import { PptxLoader } from './pptx';
7
+ import { TextLoader } from './text';
8
+
9
// Loader configuration map
// Key: file extension (lowercase, without leading dot) or specific type name
// Value: Loader Class implementing FileLoaderInterface (a zero-argument constructor)
// NOTE: the epub entry is commented out, matching the commented-out EpubLoader import.
export const fileLoaders: Record<SupportedFileType, new () => FileLoaderInterface> = {
  docx: DocxLoader,
  // epub: EpubLoader,
  excel: ExcelLoader,
  pdf: PdfLoader,
  pptx: PptxLoader,
  txt: TextLoader,
};
@@ -0,0 +1,98 @@
1
+ // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
2
+
3
+ exports[`PdfLoader > should aggregate content correctly 1`] = `
4
+ "简单报告
5
+ 副标题
6
+ 轻点或点按此占位符⽂本并开始键⼊即可开始。你可以在 Mac、iPad、iPhone 或
7
+ iCloud.com 上查看和编辑此⽂稿。
8
+ 轻松编辑⽂本、更改字体以及添加精美的图形。使⽤段落样式来使整篇⽂稿保持⼀
9
+ 致的⻛格。例如,此段落使⽤“正⽂”样式。你可以在“格式”控制的“⽂本”标签⻚中
10
+ 更改样式。
11
+ 若要添加照⽚、图像画廊、⾳频⽚段、视频、图表或任意 700 多种可⾃定义形状,
12
+ 请在⼯具栏中轻点或点按其中⼀个插⼊按钮,或者将对象拖放到⻚⾯中。你可以分
13
+ 层放置对象、调整其⼤⼩以及将其放在⻚⾯中的任意位置。若要更改对象随⽂本移
14
+ 动的⽅式,请选择对象并随后轻点或点按“格式”控制中的“排列”标签⻚。
15
+ ⼩标题
16
+ Pages ⽂稿可⽤于⽂字处理和⻚⾯布局。此“简单报告”模板为⽂字处理⽽设置,如
17
+ 此⼀来,⽂本便会随着你的键⼊⽽从某⼀⻚流向下⼀⻚,到达⻚⾯末尾时会⾃动创
18
+ 建新的⻚⾯。
19
+ 在⻚⾯布局⽂稿中,你可以⼿动重新排列⻚⾯并随意调整⻚⾯中的⽂本框、图像和
20
+ 其他对象的位置。若要创建⻚⾯布局⽂稿,请在模板选取器中选取⼀种⻚⾯布局模
21
+ 板。你也可以在 Mac、iPad 或 iPhone 上将此⽂稿改为⻚⾯布局,⽅法是在“⽂稿”
22
+ 控制中关闭“⽂稿正⽂”。
23
+ “这是⼀个引⽤(报告中的关键短语)的例⼦。轻点或点按此
24
+ ⽂本添加你⾃⼰的内容。”
25
+ ⻚脚
26
+ 1
27
+
28
+ 这是第⼆⻚的内容
29
+ ⻚脚
30
+ 2"
31
+ `;
32
+
33
+ exports[`PdfLoader > should attach document metadata correctly 1`] = `
34
+ {
35
+ "pdfInfo": {
36
+ "CreationDate": "D:20250419155028Z00'00'",
37
+ "Creator": "Pages文稿",
38
+ "EncryptFilterName": null,
39
+ "IsAcroFormPresent": false,
40
+ "IsCollectionPresent": false,
41
+ "IsLinearized": false,
42
+ "IsSignaturesPresent": false,
43
+ "IsXFAPresent": false,
44
+ "Language": null,
45
+ "ModDate": "D:20250419155028Z00'00'",
46
+ "PDFFormatVersion": "1.3",
47
+ "Producer": "macOS 版本15.3.2(版号24D81) Quartz PDFContext",
48
+ "Title": "test",
49
+ },
50
+ "pdfMetadata": null,
51
+ "pdfVersion": "4.8.69",
52
+ }
53
+ `;
54
+
55
+ exports[`PdfLoader > should load pages correctly from a PDF file 1`] = `
56
+ [
57
+ {
58
+ "charCount": 576,
59
+ "lineCount": 23,
60
+ "metadata": {
61
+ "pageNumber": 1,
62
+ },
63
+ "pageContent": "简单报告
64
+ 副标题
65
+ 轻点或点按此占位符⽂本并开始键⼊即可开始。你可以在 Mac、iPad、iPhone 或
66
+ iCloud.com 上查看和编辑此⽂稿。
67
+ 轻松编辑⽂本、更改字体以及添加精美的图形。使⽤段落样式来使整篇⽂稿保持⼀
68
+ 致的⻛格。例如,此段落使⽤“正⽂”样式。你可以在“格式”控制的“⽂本”标签⻚中
69
+ 更改样式。
70
+ 若要添加照⽚、图像画廊、⾳频⽚段、视频、图表或任意 700 多种可⾃定义形状,
71
+ 请在⼯具栏中轻点或点按其中⼀个插⼊按钮,或者将对象拖放到⻚⾯中。你可以分
72
+ 层放置对象、调整其⼤⼩以及将其放在⻚⾯中的任意位置。若要更改对象随⽂本移
73
+ 动的⽅式,请选择对象并随后轻点或点按“格式”控制中的“排列”标签⻚。
74
+ ⼩标题
75
+ Pages ⽂稿可⽤于⽂字处理和⻚⾯布局。此“简单报告”模板为⽂字处理⽽设置,如
76
+ 此⼀来,⽂本便会随着你的键⼊⽽从某⼀⻚流向下⼀⻚,到达⻚⾯末尾时会⾃动创
77
+ 建新的⻚⾯。
78
+ 在⻚⾯布局⽂稿中,你可以⼿动重新排列⻚⾯并随意调整⻚⾯中的⽂本框、图像和
79
+ 其他对象的位置。若要创建⻚⾯布局⽂稿,请在模板选取器中选取⼀种⻚⾯布局模
80
+ 板。你也可以在 Mac、iPad 或 iPhone 上将此⽂稿改为⻚⾯布局,⽅法是在“⽂稿”
81
+ 控制中关闭“⽂稿正⽂”。
82
+ “这是⼀个引⽤(报告中的关键短语)的例⼦。轻点或点按此
83
+ ⽂本添加你⾃⼰的内容。”
84
+ ⻚脚
85
+ 1",
86
+ },
87
+ {
88
+ "charCount": 14,
89
+ "lineCount": 3,
90
+ "metadata": {
91
+ "pageNumber": 2,
92
+ },
93
+ "pageContent": "这是第⼆⻚的内容
94
+ ⻚脚
95
+ 2",
96
+ },
97
+ ]
98
+ `;
@@ -0,0 +1,49 @@
1
+ // @vitest-environment node
2
+ import path from 'node:path';
3
+ import { beforeEach, describe, expect, it } from 'vitest';
4
+
5
+ import type { FileLoaderInterface } from '../../types';
6
+ import { PdfLoader } from './index';
7
+
8
// Make sure a test.pdf file has been placed in the fixtures directory.
const fixturePath = (filename: string) => path.join(__dirname, `./fixtures/${filename}`);

let loader: FileLoaderInterface;

const testFile = fixturePath('test.pdf');
const nonExistentFile = fixturePath('nonexistent.pdf');

beforeEach(() => {
  loader = new PdfLoader();
});

describe('PdfLoader', () => {
  it('should load pages correctly from a PDF file', async () => {
    const pages = await loader.loadPages(testFile);

    expect(pages.length).toBeGreaterThan(0);

    expect(pages).toMatchSnapshot();
  });

  it('should aggregate content correctly', async () => {
    const pages = await loader.loadPages(testFile);
    const content = await loader.aggregateContent(pages);
    // The recorded snapshot shows the page contents joined with a blank line between pages
    expect(content).toMatchSnapshot();
  });

  it('should handle file read errors in loadPages', async () => {
    const pages = await loader.loadPages(nonExistentFile);
    expect(pages).toHaveLength(1); // Even on failure, one page carrying the error information is returned
    expect(pages[0].pageContent).toBe('');
    expect(pages[0].metadata.error).toContain('Failed to load or parse PDF file:');
  });

  it('should attach document metadata correctly', async () => {
    // NOTE(review): the original comment said pages are loaded first to initialise the
    // loader's PDF instance, but this test calls attachDocumentMetadata directly on a fresh loader.
    const metadata = await loader.attachDocumentMetadata!(testFile);

    expect(metadata).toMatchSnapshot();
  });
});
@@ -0,0 +1,133 @@
1
+ import { readFile } from 'node:fs/promises';
2
+ import * as pdfjsLib from 'pdfjs-dist';
3
+ import type { PDFDocumentProxy, PDFPageProxy, TextContent } from 'pdfjs-dist/types/src/display/api';
4
+
5
+ import type { DocumentPage, FileLoaderInterface } from '../../types';
6
+
7
/**
 * Loads PDF files page by page using the official pdfjs-dist library.
 *
 * Each public method opens its own document and releases it when done. No parsed
 * document is cached on the instance: a cached proxy would be unusable after
 * `destroy()` and would be returned for the wrong file if a different path were passed.
 */
export class PdfLoader implements FileLoaderInterface {
  /**
   * Reads `filePath` from disk and parses it with pdfjs-dist.
   * The caller owns the returned document and is responsible for destroying it.
   */
  private async getPDFFile(filePath: string): Promise<PDFDocumentProxy> {
    const dataBuffer = await readFile(filePath);

    const loadingTask = pdfjsLib.getDocument({
      // View over the bytes read from disk, without copying them
      data: new Uint8Array(dataBuffer.buffer, dataBuffer.byteOffset, dataBuffer.length),
      // Attempt to use system fonts
      useSystemFonts: true,
      // NOTE(review): originally annotated "Explicitly disable worker thread"; the value
      // passed is `undefined` — confirm against the pdfjs-dist docs that this has that effect.
      worker: undefined,
    });

    return loadingTask.promise;
  }

  /**
   * Destroys a document opened by getPDFFile. A failure to release is logged rather than
   * thrown so that it cannot replace the result (or error) of the operation being finished.
   */
  private async releasePDFFile(
    pdf: PDFDocumentProxy | undefined,
    filePath: string,
  ): Promise<void> {
    if (!pdf) return;

    try {
      await pdf.destroy();
    } catch (e) {
      const message = e instanceof Error ? e.message : String(e);
      console.warn(`Failed to release PDF file ${filePath}: ${message}`);
    }
  }

  /**
   * Joins the text items of one page, following the simple Langchain PDFLoader logic:
   * a line break is inserted whenever the vertical position of an item changes.
   */
  private extractPageText(content: TextContent): string {
    let lastY: number | undefined;
    const textItems: string[] = [];

    for (const item of content.items) {
      // Only items carrying a `str` contribute text
      if (!('str' in item)) continue;

      const currentY: number = item.transform[5];
      // `!lastY` is true for the first item, and also when the previous position was
      // exactly 0; this mirrors the Langchain check and is kept unchanged on purpose.
      if (lastY === currentY || !lastY) {
        textItems.push(item.str);
      } else {
        textItems.push(`\n${item.str}`);
      }
      lastY = currentY;
    }

    // Join with empty separator, then strip null characters from the final text
    return textItems.join('').replaceAll('\0', '');
  }

  /**
   * Extracts the text of every page of the PDF at `filePath`.
   * @param filePath Path of the PDF file to load.
   * @returns One page per PDF page (`metadata.pageNumber` is 1-based), or a single
   * error page when the file cannot be read or parsed.
   */
  async loadPages(filePath: string): Promise<DocumentPage[]> {
    let pdf: PDFDocumentProxy | undefined;

    try {
      pdf = await this.getPDFFile(filePath);

      const pages: DocumentPage[] = [];

      for (let i = 1; i <= pdf.numPages; i += 1) {
        const page: PDFPageProxy = await pdf.getPage(i);
        const content: TextContent = await page.getTextContent();

        const cleanedPageContent = this.extractPageText(content);

        // Stats are calculated from the final, cleaned content
        pages.push({
          charCount: cleanedPageContent.length,
          lineCount: cleanedPageContent.split('\n').length,
          metadata: { pageNumber: i },
          pageContent: cleanedPageContent,
        });

        // Clean up page resources
        page.cleanup();
      }

      return pages;
    } catch (e) {
      // A thrown value is not guaranteed to be an Error, so narrow before reading its fields.
      const message = e instanceof Error ? e.message : String(e);
      const stack = e instanceof Error ? e.stack : undefined;
      console.error(`Error loading PDF file ${filePath} using pdfjs-dist: ${message}`, stack);
      const errorPage: DocumentPage = {
        charCount: 0,
        lineCount: 0,
        metadata: {
          error: `Failed to load or parse PDF file: ${message}`,
          filePath: filePath,
        },
        pageContent: '',
      };
      return [errorPage];
    } finally {
      // Clean up document resources on success and on failure alike
      await this.releasePDFFile(pdf, filePath);
    }
  }

  /**
   * Aggregates content from PDF pages.
   * Uses double newline as a separator and skips pages that carry an error.
   * @param pages Array of DocumentPage objects.
   * @returns Aggregated content as a string.
   */
  async aggregateContent(pages: DocumentPage[]): Promise<string> {
    return pages
      .filter((page) => !page.metadata.error)
      .map((page) => page.pageContent)
      .join('\n\n');
  }

  /**
   * Collects document-level metadata for the PDF at `filePath`.
   * Can be called on its own or after loadPages; it opens and releases its own document.
   * @param filePath Path of the PDF file to inspect.
   * @returns `pdfInfo` (PDF info such as Author, Title; `{}` when unavailable),
   * `pdfMetadata` (PDF metadata or `null`) and `pdfVersion` (the pdfjs-dist version).
   * @throws When the file cannot be read or parsed.
   */
  async attachDocumentMetadata(filePath: string): Promise<any> {
    let pdf: PDFDocumentProxy | undefined;

    try {
      pdf = await this.getPDFFile(filePath);

      // A failure to read the metadata is tolerated and reported as "no metadata"
      const pdfMetadata = (await pdf.getMetadata().catch(() => null)) ?? null;

      return {
        // PDF info (Author, Title, etc.)
        pdfInfo: pdfMetadata?.info ?? {},
        // PDF metadata
        pdfMetadata: pdfMetadata?.metadata ?? null,
        pdfVersion: pdfjsLib.version,
      };
    } finally {
      await this.releasePDFFile(pdf, filePath);
    }
  }
}
@@ -0,0 +1,40 @@
1
+ // Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
2
+
3
+ exports[`PptxLoader > should aggregate content correctly (joining slides) > aggregated_content 1`] = `
4
+ "<slide_page pageNumber="1">
5
+ Hello
6
+ Page1
7
+ </slide_page>
8
+
9
+ <slide_page pageNumber="2">
10
+ Word
11
+ Page2
12
+ </slide_page>"
13
+ `;
14
+
15
+ exports[`PptxLoader > should load pages correctly from a PPTX file (one page per slide) 1`] = `
16
+ [
17
+ {
18
+ "charCount": 11,
19
+ "lineCount": 2,
20
+ "metadata": {
21
+ "pageCount": 2,
22
+ "slideNumber": 1,
23
+ "sourceFileName": "test.pptx",
24
+ },
25
+ "pageContent": "Hello
26
+ Page1",
27
+ },
28
+ {
29
+ "charCount": 10,
30
+ "lineCount": 2,
31
+ "metadata": {
32
+ "pageCount": 2,
33
+ "slideNumber": 2,
34
+ "sourceFileName": "test.pptx",
35
+ },
36
+ "pageContent": "Word
37
+ Page2",
38
+ },
39
+ ]
40
+ `;