@lobehub/chat 1.84.27 → 1.85.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/changelog/v1.json +9 -0
- package/docs/development/database-schema.dbml +59 -1
- package/package.json +2 -1
- package/packages/file-loaders/package.json +5 -1
- package/packages/file-loaders/src/loadFile.ts +51 -1
- package/packages/file-loaders/src/loaders/docx/index.ts +16 -1
- package/packages/file-loaders/src/loaders/excel/index.ts +30 -2
- package/packages/file-loaders/src/loaders/pdf/__snapshots__/index.test.ts.snap +1 -1
- package/packages/file-loaders/src/loaders/pdf/index.ts +52 -12
- package/packages/file-loaders/src/loaders/pptx/index.ts +32 -1
- package/packages/file-loaders/src/loaders/text/index.test.ts +1 -1
- package/packages/file-loaders/src/loaders/text/index.ts +13 -1
- package/packages/file-loaders/test/__snapshots__/loaders.test.ts.snap +41 -0
- package/packages/file-loaders/test/loaders.test.ts +20 -0
- package/packages/file-loaders/test/setup.ts +17 -0
- package/packages/file-loaders/vitest.config.ts +14 -0
- package/src/const/file.ts +8 -1
- package/src/database/client/migrations.json +23 -1
- package/src/database/migrations/0022_add_documents.sql +49 -0
- package/src/database/migrations/meta/0022_snapshot.json +5340 -0
- package/src/database/migrations/meta/_journal.json +7 -0
- package/src/database/models/_template.ts +1 -1
- package/src/database/models/document.ts +54 -0
- package/src/database/models/message.ts +25 -0
- package/src/database/repositories/tableViewer/index.test.ts +1 -1
- package/src/database/schemas/document.ts +104 -0
- package/src/database/schemas/index.ts +1 -0
- package/src/database/schemas/relations.ts +34 -2
- package/src/database/schemas/topic.ts +31 -8
- package/src/database/utils/idGenerator.ts +1 -0
- package/src/features/ChatInput/Desktop/FilePreview/FileItem/Content.tsx +1 -1
- package/src/features/ChatInput/Desktop/FilePreview/FileItem/index.tsx +10 -10
- package/src/features/ChatInput/components/UploadDetail/UploadStatus.tsx +2 -2
- package/src/features/Conversation/Actions/Error.tsx +2 -2
- package/src/libs/trpc/lambda/context.ts +7 -0
- package/src/prompts/files/file.ts +6 -4
- package/src/server/routers/lambda/document.ts +36 -0
- package/src/server/routers/lambda/index.ts +2 -0
- package/src/server/services/document/index.ts +66 -0
- package/src/server/services/mcp/index.ts +0 -4
- package/src/services/rag.ts +4 -0
- package/src/store/chat/slices/aiChat/actions/__tests__/rag.test.ts +2 -2
- package/src/store/chat/slices/aiChat/actions/rag.ts +2 -3
- package/src/store/file/slices/chat/action.ts +3 -51
- package/src/types/document/index.ts +172 -0
- package/src/types/message/chat.ts +1 -0
- package/src/features/ChatInput/Desktop/FilePreview/FileItem/style.ts +0 -4
package/CHANGELOG.md
CHANGED
@@ -2,6 +2,31 @@
|
|
2
2
|
|
3
3
|
# Changelog
|
4
4
|
|
5
|
+
## [Version 1.85.0](https://github.com/lobehub/lobe-chat/compare/v1.84.27...v1.85.0)
|
6
|
+
|
7
|
+
<sup>Released on **2025-05-09**</sup>
|
8
|
+
|
9
|
+
#### ✨ Features
|
10
|
+
|
11
|
+
- **misc**: Support upload files direct into chat context.
|
12
|
+
|
13
|
+
<br/>
|
14
|
+
|
15
|
+
<details>
|
16
|
+
<summary><kbd>Improvements and Fixes</kbd></summary>
|
17
|
+
|
18
|
+
#### What's improved
|
19
|
+
|
20
|
+
- **misc**: Support upload files direct into chat context, closes [#7751](https://github.com/lobehub/lobe-chat/issues/7751) ([39b790e](https://github.com/lobehub/lobe-chat/commit/39b790e))
|
21
|
+
|
22
|
+
</details>
|
23
|
+
|
24
|
+
<div align="right">
|
25
|
+
|
26
|
+
[](#readme-top)
|
27
|
+
|
28
|
+
</div>
|
29
|
+
|
5
30
|
### [Version 1.84.27](https://github.com/lobehub/lobe-chat/compare/v1.84.26...v1.84.27)
|
6
31
|
|
7
32
|
<sup>Released on **2025-05-09**</sup>
|
package/changelog/v1.json
CHANGED
package/docs/development/database-schema.dbml
CHANGED
@@ -115,6 +115,45 @@ table async_tasks {
|
|
115
115
|
updated_at "timestamp with time zone" [not null, default: `now()`]
|
116
116
|
}
|
117
117
|
|
118
|
+
table document_chunks {
|
119
|
+
document_id varchar(30) [not null]
|
120
|
+
chunk_id uuid [not null]
|
121
|
+
page_index integer
|
122
|
+
user_id text [not null]
|
123
|
+
created_at "timestamp with time zone" [not null, default: `now()`]
|
124
|
+
|
125
|
+
indexes {
|
126
|
+
(document_id, chunk_id) [pk]
|
127
|
+
}
|
128
|
+
}
|
129
|
+
|
130
|
+
table documents {
|
131
|
+
id varchar(30) [pk, not null]
|
132
|
+
title text
|
133
|
+
content text
|
134
|
+
file_type varchar(255) [not null]
|
135
|
+
filename text
|
136
|
+
total_char_count integer [not null]
|
137
|
+
total_line_count integer [not null]
|
138
|
+
metadata jsonb
|
139
|
+
pages jsonb
|
140
|
+
source_type text [not null]
|
141
|
+
source text [not null]
|
142
|
+
file_id text
|
143
|
+
user_id text [not null]
|
144
|
+
client_id text
|
145
|
+
accessed_at "timestamp with time zone" [not null, default: `now()`]
|
146
|
+
created_at "timestamp with time zone" [not null, default: `now()`]
|
147
|
+
updated_at "timestamp with time zone" [not null, default: `now()`]
|
148
|
+
|
149
|
+
indexes {
|
150
|
+
source [name: 'documents_source_idx']
|
151
|
+
file_type [name: 'documents_file_type_idx']
|
152
|
+
file_id [name: 'documents_file_id_idx']
|
153
|
+
(client_id, user_id) [name: 'documents_client_id_user_id_unique', unique]
|
154
|
+
}
|
155
|
+
}
|
156
|
+
|
118
157
|
table files {
|
119
158
|
id text [pk, not null]
|
120
159
|
user_id text [not null]
|
@@ -670,6 +709,17 @@ table threads {
|
|
670
709
|
}
|
671
710
|
}
|
672
711
|
|
712
|
+
table topic_documents {
|
713
|
+
document_id text [not null]
|
714
|
+
topic_id text [not null]
|
715
|
+
user_id text [not null]
|
716
|
+
created_at "timestamp with time zone" [not null, default: `now()`]
|
717
|
+
|
718
|
+
indexes {
|
719
|
+
(document_id, topic_id) [pk]
|
720
|
+
}
|
721
|
+
}
|
722
|
+
|
673
723
|
table topics {
|
674
724
|
id text [pk, not null]
|
675
725
|
title text
|
@@ -744,6 +794,10 @@ ref: agents_to_sessions.agent_id > agents.id
|
|
744
794
|
|
745
795
|
ref: unstructured_chunks.file_id - files.id
|
746
796
|
|
797
|
+
ref: document_chunks.document_id > documents.id
|
798
|
+
|
799
|
+
ref: documents.file_id > files.id
|
800
|
+
|
747
801
|
ref: files.embedding_task_id - async_tasks.id
|
748
802
|
|
749
803
|
ref: messages.session_id - sessions.id
|
@@ -756,4 +810,8 @@ ref: threads.source_message_id - messages.id
|
|
756
810
|
|
757
811
|
ref: sessions.group_id - session_groups.id
|
758
812
|
|
759
|
-
ref:
|
813
|
+
ref: topic_documents.document_id > documents.id
|
814
|
+
|
815
|
+
ref: topic_documents.topic_id > topics.id
|
816
|
+
|
817
|
+
ref: topics.session_id - sessions.id
|
package/package.json
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
{
|
2
2
|
"name": "@lobehub/chat",
|
3
|
-
"version": "1.84.27",
|
3
|
+
"version": "1.85.0",
|
4
4
|
"description": "Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.",
|
5
5
|
"keywords": [
|
6
6
|
"framework",
|
@@ -143,6 +143,7 @@
|
|
143
143
|
"@langchain/community": "^0.3.38",
|
144
144
|
"@lobechat/electron-client-ipc": "workspace:*",
|
145
145
|
"@lobechat/electron-server-ipc": "workspace:*",
|
146
|
+
"@lobechat/file-loaders": "workspace:*",
|
146
147
|
"@lobechat/web-crawler": "workspace:*",
|
147
148
|
"@lobehub/charts": "^2.0.0",
|
148
149
|
"@lobehub/chat-plugin-sdk": "^1.32.4",
|
package/packages/file-loaders/package.json
CHANGED
@@ -20,6 +20,9 @@
|
|
20
20
|
"author": "LobeHub <i@lobehub.com>",
|
21
21
|
"sideEffects": false,
|
22
22
|
"main": "./src/index.ts",
|
23
|
+
"scripts": {
|
24
|
+
"test": "vitest"
|
25
|
+
},
|
23
26
|
"dependencies": {
|
24
27
|
"@langchain/community": "^0.3.41",
|
25
28
|
"@langchain/core": "^0.3.45",
|
@@ -27,13 +30,14 @@
|
|
27
30
|
"concat-stream": "^2.0.0",
|
28
31
|
"mammoth": "^1.8.0",
|
29
32
|
"officeparser": "^5.1.1",
|
30
|
-
"pdfjs-dist": "4.
|
33
|
+
"pdfjs-dist": "4.10.38",
|
31
34
|
"xlsx": "^0.18.5",
|
32
35
|
"yauzl": "^3.2.0"
|
33
36
|
},
|
34
37
|
"devDependencies": {
|
35
38
|
"@types/concat-stream": "^2.0.3",
|
36
39
|
"@types/yauzl": "^2.10.3",
|
40
|
+
"canvas": "^3.1.0",
|
37
41
|
"typescript": "^5"
|
38
42
|
},
|
39
43
|
"peerDependencies": {
|
package/packages/file-loaders/src/loadFile.ts
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
import debug from 'debug';
|
1
2
|
import { stat } from 'node:fs/promises';
|
2
3
|
import * as path from 'node:path';
|
3
4
|
|
@@ -7,37 +8,52 @@ import { FileDocument, FileMetadata, SupportedFileType } from './types';
|
|
7
8
|
import type { DocumentPage, FileLoaderInterface } from './types';
|
8
9
|
import { isTextReadableFile } from './utils/isTextReadableFile';
|
9
10
|
|
11
|
+
const log = debug('file-loaders:loadFile');
|
12
|
+
|
10
13
|
/**
|
11
14
|
* Determines the file type based on the filename extension.
|
12
15
|
* @param filePath The path to the file.
|
13
16
|
* @returns The determined file type or 'txt' if text-readable, undefined otherwise.
|
14
17
|
*/
|
15
18
|
const getFileType = (filePath: string): SupportedFileType | undefined => {
|
19
|
+
log('Determining file type for:', filePath);
|
16
20
|
const extension = path.extname(filePath).toLowerCase().replace('.', '');
|
17
21
|
|
18
|
-
if (!extension)
|
22
|
+
if (!extension) {
|
23
|
+
log('No extension found, treating as txt');
|
24
|
+
return 'txt'; // Treat files without extension as text?
|
25
|
+
}
|
19
26
|
|
20
27
|
// Prioritize checking if it's a generally text-readable type
|
21
28
|
if (isTextReadableFile(extension)) {
|
29
|
+
log(`Extension '${extension}' is text-readable, treating as txt`);
|
22
30
|
return 'txt';
|
23
31
|
}
|
24
32
|
|
25
33
|
// Handle specific non-text or complex types
|
34
|
+
log(`Checking specific types for extension: '${extension}'`);
|
26
35
|
switch (extension) {
|
27
36
|
case 'pdf': {
|
37
|
+
log('File type identified as pdf');
|
28
38
|
return 'pdf';
|
29
39
|
}
|
30
40
|
case 'docx': {
|
41
|
+
log('File type identified as docx');
|
31
42
|
return 'docx';
|
32
43
|
}
|
33
44
|
case 'xlsx':
|
34
45
|
case 'xls': {
|
46
|
+
log('File type identified as excel');
|
35
47
|
return 'excel';
|
36
48
|
}
|
37
49
|
case 'pptx': {
|
50
|
+
log('File type identified as pptx');
|
38
51
|
return 'pptx';
|
39
52
|
}
|
40
53
|
default: {
|
54
|
+
log(
|
55
|
+
`Extension '${extension}' is not a specifically handled type and not text-readable. Unsupported.`,
|
56
|
+
);
|
41
57
|
// If not text-readable and not a specific known type, it's unsupported
|
42
58
|
return undefined;
|
43
59
|
}
|
@@ -59,18 +75,23 @@ export const loadFile = async (
|
|
59
75
|
filePath: string,
|
60
76
|
fileMetadata?: FileMetadata,
|
61
77
|
): Promise<FileDocument> => {
|
78
|
+
log('Starting to load file:', filePath, 'with metadata:', fileMetadata);
|
62
79
|
let stats;
|
63
80
|
let fsError: string | undefined;
|
64
81
|
|
65
82
|
try {
|
83
|
+
log('Attempting to get file stats for:', filePath);
|
66
84
|
stats = await stat(filePath);
|
85
|
+
log('Successfully retrieved file stats:', stats);
|
67
86
|
} catch (e) {
|
68
87
|
const error = e as Error;
|
88
|
+
log('Error getting file stats for %s: %s', filePath, error.message);
|
69
89
|
console.error(`Error getting file stats for ${filePath}: ${error.message}`);
|
70
90
|
fsError = `Failed to access file stats: ${error.message}`;
|
71
91
|
}
|
72
92
|
|
73
93
|
// Determine base file info from path and stats (if available)
|
94
|
+
log('Determining base file info');
|
74
95
|
const fileExtension = path.extname(filePath).slice(1).toLowerCase();
|
75
96
|
const baseFilename = path.basename(filePath);
|
76
97
|
|
@@ -80,13 +101,22 @@ export const loadFile = async (
|
|
80
101
|
const fileType = fileMetadata?.fileType ?? fileExtension;
|
81
102
|
const createdTime = fileMetadata?.createdTime ?? stats?.ctime ?? new Date();
|
82
103
|
const modifiedTime = fileMetadata?.modifiedTime ?? stats?.mtime ?? new Date();
|
104
|
+
log('File info determined/overridden: %O', {
|
105
|
+
createdTime,
|
106
|
+
fileType,
|
107
|
+
filename,
|
108
|
+
modifiedTime,
|
109
|
+
source,
|
110
|
+
});
|
83
111
|
|
84
112
|
const paserType = getFileType(filePath);
|
113
|
+
log('Parser type determined as:', paserType);
|
85
114
|
|
86
115
|
// Select the loader CLASS based on the determined fileType, fallback to DefaultLoader
|
87
116
|
const LoaderClass: new () => FileLoaderInterface = paserType
|
88
117
|
? fileLoaders[paserType]
|
89
118
|
: DefaultLoader;
|
119
|
+
log('Selected loader class:', LoaderClass.name);
|
90
120
|
|
91
121
|
if (!paserType) {
|
92
122
|
console.warn(
|
@@ -102,17 +132,23 @@ export const loadFile = async (
|
|
102
132
|
let loaderSpecificMetadata: any | undefined;
|
103
133
|
|
104
134
|
// Instantiate the loader
|
135
|
+
log('Instantiating loader:', LoaderClass.name);
|
105
136
|
const loaderInstance = new LoaderClass();
|
106
137
|
|
107
138
|
// If we couldn't even get stats, skip loader execution
|
108
139
|
if (!fsError) {
|
140
|
+
log('File stats available, proceeding with loader execution.');
|
109
141
|
try {
|
110
142
|
// 1. Load pages using the instance
|
143
|
+
log('Loading pages with loader:', LoaderClass.name, 'for file:', filePath);
|
111
144
|
pages = await loaderInstance.loadPages(filePath);
|
145
|
+
log('Pages loaded successfully, count:', pages.length);
|
112
146
|
|
113
147
|
try {
|
114
148
|
// 2. Aggregate content using the instance
|
149
|
+
log('Aggregating content with loader:', LoaderClass.name);
|
115
150
|
aggregatedContent = await loaderInstance.aggregateContent(pages);
|
151
|
+
log('Content aggregated successfully, length:', aggregatedContent.length);
|
116
152
|
} catch (aggError) {
|
117
153
|
const error = aggError as Error;
|
118
154
|
console.error(
|
@@ -124,8 +160,10 @@ export const loadFile = async (
|
|
124
160
|
|
125
161
|
// 3. Attach document-specific metadata if loader supports it
|
126
162
|
if (typeof loaderInstance.attachDocumentMetadata === 'function') {
|
163
|
+
log('Loader supports attachDocumentMetadata. Attaching...');
|
127
164
|
try {
|
128
165
|
loaderSpecificMetadata = await loaderInstance.attachDocumentMetadata(filePath);
|
166
|
+
log('Document-specific metadata attached:', loaderSpecificMetadata);
|
129
167
|
} catch (metaErr) {
|
130
168
|
const error = metaErr as Error;
|
131
169
|
console.error(
|
@@ -133,6 +171,8 @@ export const loadFile = async (
|
|
133
171
|
);
|
134
172
|
metadataError = `Metadata attachment failed: ${error.message}`;
|
135
173
|
}
|
174
|
+
} else {
|
175
|
+
log('Loader does not support attachDocumentMetadata.');
|
136
176
|
}
|
137
177
|
} catch (loadErr) {
|
138
178
|
const error = loadErr as Error;
|
@@ -152,6 +192,7 @@ export const loadFile = async (
|
|
152
192
|
// Aggregated content remains empty
|
153
193
|
}
|
154
194
|
} else {
|
195
|
+
log('File stats access failed (fsError: %s). Creating minimal error page.', fsError);
|
155
196
|
// If stats failed, create a minimal error page
|
156
197
|
pages = [
|
157
198
|
{
|
@@ -167,16 +208,20 @@ export const loadFile = async (
|
|
167
208
|
// Calculate totals from the loaded pages
|
168
209
|
let totalCharCount = 0;
|
169
210
|
let totalLineCount = 0;
|
211
|
+
log('Calculating total char and line counts from pages.');
|
170
212
|
for (const page of pages) {
|
171
213
|
totalCharCount += page.charCount;
|
172
214
|
totalLineCount += page.lineCount;
|
173
215
|
}
|
216
|
+
log('Totals calculated:', { totalCharCount, totalLineCount });
|
174
217
|
|
175
218
|
// Combine all potential errors
|
176
219
|
const combinedError =
|
177
220
|
[fsError, loaderError, aggregationError, metadataError].filter(Boolean).join('; ') || undefined;
|
221
|
+
if (combinedError) log('Combined errors:', combinedError);
|
178
222
|
|
179
223
|
// Construct the final FileDocument
|
224
|
+
log('Constructing final FileDocument.');
|
180
225
|
const fileDocument: FileDocument = {
|
181
226
|
content: aggregatedContent, // Use content from aggregateContent
|
182
227
|
createdTime,
|
@@ -202,5 +247,10 @@ export const loadFile = async (
|
|
202
247
|
delete fileDocument.metadata.error;
|
203
248
|
}
|
204
249
|
|
250
|
+
log('File loading process completed for:', filePath, 'Returning document:', {
|
251
|
+
fileType: fileDocument.fileType,
|
252
|
+
filename: fileDocument.filename,
|
253
|
+
pages: fileDocument.pages?.length,
|
254
|
+
});
|
205
255
|
return fileDocument;
|
206
256
|
};
|
package/packages/file-loaders/src/loaders/docx/index.ts
CHANGED
@@ -1,15 +1,21 @@
|
|
1
1
|
import { DocxLoader as LangchainDocxLoader } from '@langchain/community/document_loaders/fs/docx';
|
2
|
+
import debug from 'debug';
|
2
3
|
|
3
4
|
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
4
5
|
|
6
|
+
const log = debug('file-loaders:docx');
|
7
|
+
|
5
8
|
/**
|
6
9
|
* Loads Word documents (.docx) using the LangChain Community DocxLoader.
|
7
10
|
*/
|
8
11
|
export class DocxLoader implements FileLoaderInterface {
|
9
12
|
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
13
|
+
log('Loading DOCX file:', filePath);
|
10
14
|
try {
|
11
15
|
const loader = new LangchainDocxLoader(filePath);
|
16
|
+
log('LangChain DocxLoader created');
|
12
17
|
const docs = await loader.load(); // Langchain DocxLoader typically loads the whole doc as one
|
18
|
+
log('DOCX document loaded, parts:', docs.length);
|
13
19
|
|
14
20
|
const pages: DocumentPage[] = docs.map((doc) => {
|
15
21
|
const pageContent = doc.pageContent || '';
|
@@ -27,6 +33,8 @@ export class DocxLoader implements FileLoaderInterface {
|
|
27
33
|
// @ts-expect-error Remove source if present, as it's handled at the FileDocument level
|
28
34
|
delete metadata.source;
|
29
35
|
|
36
|
+
log('DOCX document processed, lines:', lineCount, 'chars:', charCount);
|
37
|
+
|
30
38
|
return {
|
31
39
|
charCount,
|
32
40
|
lineCount,
|
@@ -37,6 +45,7 @@ export class DocxLoader implements FileLoaderInterface {
|
|
37
45
|
|
38
46
|
// If docs array is empty (e.g., empty file), create an empty page
|
39
47
|
if (pages.length === 0) {
|
48
|
+
log('No content in DOCX document, creating empty page');
|
40
49
|
pages.push({
|
41
50
|
charCount: 0,
|
42
51
|
lineCount: 0,
|
@@ -45,9 +54,11 @@ export class DocxLoader implements FileLoaderInterface {
|
|
45
54
|
});
|
46
55
|
}
|
47
56
|
|
57
|
+
log('DOCX loading completed, total pages:', pages.length);
|
48
58
|
return pages;
|
49
59
|
} catch (e) {
|
50
60
|
const error = e as Error;
|
61
|
+
log('Error encountered while loading DOCX file');
|
51
62
|
console.error(`Error loading DOCX file ${filePath} using LangChain loader: ${error.message}`);
|
52
63
|
const errorPage: DocumentPage = {
|
53
64
|
charCount: 0,
|
@@ -57,6 +68,7 @@ export class DocxLoader implements FileLoaderInterface {
|
|
57
68
|
},
|
58
69
|
pageContent: '',
|
59
70
|
};
|
71
|
+
log('Created error page for failed DOCX loading');
|
60
72
|
return [errorPage];
|
61
73
|
}
|
62
74
|
}
|
@@ -68,6 +80,9 @@ export class DocxLoader implements FileLoaderInterface {
|
|
68
80
|
* @returns Aggregated content as a string.
|
69
81
|
*/
|
70
82
|
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
71
|
-
|
83
|
+
log('Aggregating content from', pages.length, 'DOCX pages');
|
84
|
+
const result = pages.map((page) => page.pageContent).join('\n\n');
|
85
|
+
log('DOCX content aggregated successfully, length:', result.length);
|
86
|
+
return result;
|
72
87
|
}
|
73
88
|
}
|
package/packages/file-loaders/src/loaders/excel/index.ts
CHANGED
@@ -1,26 +1,34 @@
|
|
1
|
+
import debug from 'debug';
|
1
2
|
import { readFile } from 'node:fs/promises';
|
2
3
|
import * as xlsx from 'xlsx';
|
3
4
|
|
4
5
|
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
5
6
|
|
7
|
+
const log = debug('file-loaders:excel');
|
8
|
+
|
6
9
|
/**
|
7
10
|
* Converts sheet data (array of objects) to a Markdown table string.
|
8
11
|
* Handles empty sheets and escapes pipe characters.
|
9
12
|
*/
|
10
13
|
function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
|
14
|
+
log('Converting sheet data to Markdown table, rows:', jsonData?.length || 0);
|
11
15
|
if (!jsonData || jsonData.length === 0) {
|
16
|
+
log('Sheet is empty, returning placeholder message');
|
12
17
|
return '*Sheet is empty or contains no data.*';
|
13
18
|
}
|
14
19
|
|
15
20
|
// Ensure all rows have the same keys based on the first row, handle potentially sparse data
|
16
21
|
const headers = Object.keys(jsonData[0] || {});
|
22
|
+
log('Sheet headers:', headers);
|
17
23
|
if (headers.length === 0) {
|
24
|
+
log('Sheet has no headers, returning placeholder message');
|
18
25
|
return '*Sheet has headers but no data.*';
|
19
26
|
}
|
20
27
|
|
21
28
|
const headerRow = `| ${headers.join(' | ')} |`;
|
22
29
|
const separatorRow = `| ${headers.map(() => '---').join(' | ')} |`;
|
23
30
|
|
31
|
+
log('Building data rows for Markdown table');
|
24
32
|
const dataRows = jsonData
|
25
33
|
.map((row) => {
|
26
34
|
const cells = headers.map((header) => {
|
@@ -34,7 +42,9 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
|
|
34
42
|
})
|
35
43
|
.join('\n');
|
36
44
|
|
37
|
-
|
45
|
+
const result = `${headerRow}\n${separatorRow}\n${dataRows}`;
|
46
|
+
log('Markdown table created, length:', result.length);
|
47
|
+
return result;
|
38
48
|
}
|
39
49
|
|
40
50
|
/**
|
@@ -43,13 +53,20 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
|
|
43
53
|
*/
|
44
54
|
export class ExcelLoader implements FileLoaderInterface {
|
45
55
|
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
56
|
+
log('Loading Excel file:', filePath);
|
46
57
|
const pages: DocumentPage[] = [];
|
47
58
|
try {
|
48
59
|
// Use readFile for async operation compatible with other loaders
|
60
|
+
log('Reading Excel file as buffer');
|
49
61
|
const dataBuffer = await readFile(filePath);
|
62
|
+
log('Excel file read successfully, size:', dataBuffer.length, 'bytes');
|
63
|
+
|
64
|
+
log('Parsing Excel workbook');
|
50
65
|
const workbook = xlsx.read(dataBuffer, { type: 'buffer' });
|
66
|
+
log('Excel workbook parsed successfully, sheets:', workbook.SheetNames.length);
|
51
67
|
|
52
68
|
for (const sheetName of workbook.SheetNames) {
|
69
|
+
log(`Processing sheet: ${sheetName}`);
|
53
70
|
const worksheet = workbook.Sheets[sheetName];
|
54
71
|
// Use sheet_to_json to get array of objects for our custom markdown function
|
55
72
|
const jsonData = xlsx.utils.sheet_to_json<Record<string, any>>(worksheet, {
|
@@ -57,6 +74,7 @@ export class ExcelLoader implements FileLoaderInterface {
|
|
57
74
|
defval: '',
|
58
75
|
raw: false, // Use empty string for blank cells
|
59
76
|
});
|
77
|
+
log(`Sheet ${sheetName} converted to JSON, rows:`, jsonData.length);
|
60
78
|
|
61
79
|
// Convert to markdown using YOUR helper function
|
62
80
|
const tableMarkdown = sheetToMarkdownTable(jsonData);
|
@@ -64,6 +82,7 @@ export class ExcelLoader implements FileLoaderInterface {
|
|
64
82
|
const lines = tableMarkdown.split('\n');
|
65
83
|
const lineCount = lines.length;
|
66
84
|
const charCount = tableMarkdown.length;
|
85
|
+
log(`Sheet ${sheetName} converted to Markdown, lines: ${lineCount}, chars: ${charCount}`);
|
67
86
|
|
68
87
|
pages.push({
|
69
88
|
// Trim whitespace
|
@@ -74,9 +93,11 @@ export class ExcelLoader implements FileLoaderInterface {
|
|
74
93
|
},
|
75
94
|
pageContent: tableMarkdown.trim(),
|
76
95
|
});
|
96
|
+
log(`Added sheet ${sheetName} as page`);
|
77
97
|
}
|
78
98
|
|
79
99
|
if (pages.length === 0) {
|
100
|
+
log('Excel file contains no sheets, creating empty page with error');
|
80
101
|
pages.push({
|
81
102
|
charCount: 0,
|
82
103
|
lineCount: 0,
|
@@ -87,9 +108,11 @@ export class ExcelLoader implements FileLoaderInterface {
|
|
87
108
|
});
|
88
109
|
}
|
89
110
|
|
111
|
+
log('Excel loading completed, total pages:', pages.length);
|
90
112
|
return pages;
|
91
113
|
} catch (e) {
|
92
114
|
const error = e as Error;
|
115
|
+
log('Error encountered while loading Excel file');
|
93
116
|
console.error(`Error loading Excel file ${filePath}: ${error.message}`);
|
94
117
|
const errorPage: DocumentPage = {
|
95
118
|
charCount: 0,
|
@@ -99,6 +122,7 @@ export class ExcelLoader implements FileLoaderInterface {
|
|
99
122
|
},
|
100
123
|
pageContent: '',
|
101
124
|
};
|
125
|
+
log('Created error page for failed Excel loading');
|
102
126
|
return [errorPage];
|
103
127
|
}
|
104
128
|
}
|
@@ -110,12 +134,16 @@ export class ExcelLoader implements FileLoaderInterface {
|
|
110
134
|
* @returns Aggregated content as a string.
|
111
135
|
*/
|
112
136
|
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
113
|
-
|
137
|
+
log('Aggregating content from', pages.length, 'Excel pages');
|
138
|
+
const result = pages
|
114
139
|
.map((page) => {
|
115
140
|
const sheetName = page.metadata.sheetName;
|
116
141
|
const header = sheetName ? `## Sheet: ${sheetName}\n\n` : '';
|
117
142
|
return header + page.pageContent;
|
118
143
|
})
|
119
144
|
.join('\n\n---\n\n'); // Separator between sheets
|
145
|
+
|
146
|
+
log('Excel content aggregated successfully, length:', result.length);
|
147
|
+
return result;
|
120
148
|
}
|
121
149
|
}
|