@lobehub/chat 1.84.26 → 1.85.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. package/CHANGELOG.md +50 -0
  2. package/changelog/v1.json +18 -0
  3. package/docs/development/database-schema.dbml +59 -1
  4. package/package.json +3 -2
  5. package/packages/file-loaders/package.json +5 -1
  6. package/packages/file-loaders/src/loadFile.ts +51 -1
  7. package/packages/file-loaders/src/loaders/docx/index.ts +16 -1
  8. package/packages/file-loaders/src/loaders/excel/index.ts +30 -2
  9. package/packages/file-loaders/src/loaders/pdf/__snapshots__/index.test.ts.snap +1 -1
  10. package/packages/file-loaders/src/loaders/pdf/index.ts +52 -12
  11. package/packages/file-loaders/src/loaders/pptx/index.ts +32 -1
  12. package/packages/file-loaders/src/loaders/text/index.test.ts +1 -1
  13. package/packages/file-loaders/src/loaders/text/index.ts +13 -1
  14. package/packages/file-loaders/test/__snapshots__/loaders.test.ts.snap +41 -0
  15. package/packages/file-loaders/test/loaders.test.ts +20 -0
  16. package/packages/file-loaders/test/setup.ts +17 -0
  17. package/packages/file-loaders/vitest.config.ts +14 -0
  18. package/src/config/aiModels/vertexai.ts +6 -6
  19. package/src/const/file.ts +8 -1
  20. package/src/database/client/migrations.json +23 -1
  21. package/src/database/migrations/0022_add_documents.sql +49 -0
  22. package/src/database/migrations/meta/0022_snapshot.json +5340 -0
  23. package/src/database/migrations/meta/_journal.json +7 -0
  24. package/src/database/models/_template.ts +1 -1
  25. package/src/database/models/document.ts +54 -0
  26. package/src/database/models/message.ts +25 -0
  27. package/src/database/repositories/tableViewer/index.test.ts +1 -1
  28. package/src/database/schemas/document.ts +104 -0
  29. package/src/database/schemas/index.ts +1 -0
  30. package/src/database/schemas/relations.ts +34 -2
  31. package/src/database/schemas/topic.ts +31 -8
  32. package/src/database/utils/idGenerator.ts +1 -0
  33. package/src/features/ChatInput/Desktop/FilePreview/FileItem/Content.tsx +1 -1
  34. package/src/features/ChatInput/Desktop/FilePreview/FileItem/index.tsx +10 -10
  35. package/src/features/ChatInput/components/UploadDetail/UploadStatus.tsx +2 -2
  36. package/src/features/Conversation/Actions/Error.tsx +2 -2
  37. package/src/libs/agent-runtime/google/index.ts +2 -1
  38. package/src/libs/agent-runtime/utils/streams/google-ai.test.ts +101 -6
  39. package/src/libs/agent-runtime/utils/streams/google-ai.ts +62 -38
  40. package/src/libs/agent-runtime/utils/streams/protocol.ts +24 -4
  41. package/src/libs/agent-runtime/utils/streams/vertex-ai.test.ts +109 -8
  42. package/src/libs/agent-runtime/utils/streams/vertex-ai.ts +68 -23
  43. package/src/libs/trpc/lambda/context.ts +7 -0
  44. package/src/prompts/files/file.ts +6 -4
  45. package/src/server/routers/lambda/__tests__/file.test.ts +213 -0
  46. package/src/server/routers/lambda/document.ts +36 -0
  47. package/src/server/routers/lambda/index.ts +2 -0
  48. package/src/server/services/document/index.ts +66 -0
  49. package/src/server/services/mcp/index.ts +0 -4
  50. package/src/services/rag.ts +4 -0
  51. package/src/store/chat/slices/aiChat/actions/__tests__/rag.test.ts +2 -2
  52. package/src/store/chat/slices/aiChat/actions/rag.ts +2 -3
  53. package/src/store/file/slices/chat/action.ts +3 -51
  54. package/src/types/document/index.ts +172 -0
  55. package/src/types/message/chat.ts +1 -0
  56. package/src/features/ChatInput/Desktop/FilePreview/FileItem/style.ts +0 -4
package/CHANGELOG.md CHANGED
@@ -2,6 +2,56 @@
 
 # Changelog
 
+## [Version 1.85.0](https://github.com/lobehub/lobe-chat/compare/v1.84.27...v1.85.0)
+
+<sup>Released on **2025-05-09**</sup>
+
+#### ✨ Features
+
+- **misc**: Support upload files direct into chat context.
+
+<br/>
+
+<details>
+<summary><kbd>Improvements and Fixes</kbd></summary>
+
+#### What's improved
+
+- **misc**: Support upload files direct into chat context, closes [#7751](https://github.com/lobehub/lobe-chat/issues/7751) ([39b790e](https://github.com/lobehub/lobe-chat/commit/39b790e))
+
+</details>
+
+<div align="right">
+
+[![](https://img.shields.io/badge/-BACK_TO_TOP-151515?style=flat-square)](#readme-top)
+
+</div>
+
+### [Version 1.84.27](https://github.com/lobehub/lobe-chat/compare/v1.84.26...v1.84.27)
+
+<sup>Released on **2025-05-09**</sup>
+
+#### 💄 Styles
+
+- **misc**: Add reasoning tokens and token usage statistics for Google Gemini.
+
+<br/>
+
+<details>
+<summary><kbd>Improvements and Fixes</kbd></summary>
+
+#### Styles
+
+- **misc**: Add reasoning tokens and token usage statistics for Google Gemini, closes [#7501](https://github.com/lobehub/lobe-chat/issues/7501) ([b466b42](https://github.com/lobehub/lobe-chat/commit/b466b42))
+
+</details>
+
+<div align="right">
+
+[![](https://img.shields.io/badge/-BACK_TO_TOP-151515?style=flat-square)](#readme-top)
+
+</div>
+
 ### [Version 1.84.26](https://github.com/lobehub/lobe-chat/compare/v1.84.25...v1.84.26)
 
 <sup>Released on **2025-05-08**</sup>
package/changelog/v1.json CHANGED
@@ -1,4 +1,22 @@
 [
+  {
+    "children": {
+      "features": [
+        "Support upload files direct into chat context."
+      ]
+    },
+    "date": "2025-05-09",
+    "version": "1.85.0"
+  },
+  {
+    "children": {
+      "improvements": [
+        "Add reasoning tokens and token usage statistics for Google Gemini."
+      ]
+    },
+    "date": "2025-05-09",
+    "version": "1.84.27"
+  },
   {
     "children": {
       "improvements": [
package/docs/development/database-schema.dbml CHANGED
@@ -115,6 +115,45 @@ table async_tasks {
   updated_at "timestamp with time zone" [not null, default: `now()`]
 }
 
+table document_chunks {
+  document_id varchar(30) [not null]
+  chunk_id uuid [not null]
+  page_index integer
+  user_id text [not null]
+  created_at "timestamp with time zone" [not null, default: `now()`]
+
+  indexes {
+    (document_id, chunk_id) [pk]
+  }
+}
+
+table documents {
+  id varchar(30) [pk, not null]
+  title text
+  content text
+  file_type varchar(255) [not null]
+  filename text
+  total_char_count integer [not null]
+  total_line_count integer [not null]
+  metadata jsonb
+  pages jsonb
+  source_type text [not null]
+  source text [not null]
+  file_id text
+  user_id text [not null]
+  client_id text
+  accessed_at "timestamp with time zone" [not null, default: `now()`]
+  created_at "timestamp with time zone" [not null, default: `now()`]
+  updated_at "timestamp with time zone" [not null, default: `now()`]
+
+  indexes {
+    source [name: 'documents_source_idx']
+    file_type [name: 'documents_file_type_idx']
+    file_id [name: 'documents_file_id_idx']
+    (client_id, user_id) [name: 'documents_client_id_user_id_unique', unique]
+  }
+}
+
 table files {
   id text [pk, not null]
   user_id text [not null]
@@ -670,6 +709,17 @@ table threads {
   }
 }
 
+table topic_documents {
+  document_id text [not null]
+  topic_id text [not null]
+  user_id text [not null]
+  created_at "timestamp with time zone" [not null, default: `now()`]
+
+  indexes {
+    (document_id, topic_id) [pk]
+  }
+}
+
 table topics {
   id text [pk, not null]
   title text
@@ -744,6 +794,10 @@ ref: agents_to_sessions.agent_id > agents.id
 
 ref: unstructured_chunks.file_id - files.id
 
+ref: document_chunks.document_id > documents.id
+
+ref: documents.file_id > files.id
+
 ref: files.embedding_task_id - async_tasks.id
 
 ref: messages.session_id - sessions.id
@@ -756,4 +810,8 @@ ref: threads.source_message_id - messages.id
 
 ref: sessions.group_id - session_groups.id
 
-ref: topics.session_id - sessions.id
+ref: topic_documents.document_id > documents.id
+
+ref: topic_documents.topic_id > topics.id
+
+ref: topics.session_id - sessions.id
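
The `documents`, `document_chunks`, and `topic_documents` tables above back the new chat-context file uploads. The authoritative Drizzle definitions live in `package/src/database/schemas/document.ts` (added in this release, not expanded on this page); as a rough orientation only, a Drizzle-style sketch of the `documents` table inferred from the DBML above might look like the following — the column helpers and property names are assumptions, not copied from the source:

```ts
// Illustrative sketch inferred from the DBML above — the real schema lives in
// package/src/database/schemas/document.ts, which this diff does not expand.
import {
  index,
  integer,
  jsonb,
  pgTable,
  text,
  timestamp,
  uniqueIndex,
  varchar,
} from 'drizzle-orm/pg-core';

export const documents = pgTable(
  'documents',
  {
    id: varchar('id', { length: 30 }).primaryKey(),
    title: text('title'),
    content: text('content'),
    fileType: varchar('file_type', { length: 255 }).notNull(),
    filename: text('filename'),
    totalCharCount: integer('total_char_count').notNull(),
    totalLineCount: integer('total_line_count').notNull(),
    metadata: jsonb('metadata'),
    pages: jsonb('pages'),
    sourceType: text('source_type').notNull(),
    source: text('source').notNull(),
    fileId: text('file_id'),
    userId: text('user_id').notNull(),
    clientId: text('client_id'),
    accessedAt: timestamp('accessed_at', { withTimezone: true }).notNull().defaultNow(),
    createdAt: timestamp('created_at', { withTimezone: true }).notNull().defaultNow(),
    updatedAt: timestamp('updated_at', { withTimezone: true }).notNull().defaultNow(),
  },
  (t) => ({
    // Index names taken from the DBML above.
    clientIdUserIdUnique: uniqueIndex('documents_client_id_user_id_unique').on(t.clientId, t.userId),
    fileIdIdx: index('documents_file_id_idx').on(t.fileId),
    fileTypeIdx: index('documents_file_type_idx').on(t.fileType),
    sourceIdx: index('documents_source_idx').on(t.source),
  }),
);
```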
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@lobehub/chat",
-  "version": "1.84.26",
+  "version": "1.85.0",
   "description": "Lobe Chat - an open-source, high-performance chatbot framework that supports speech synthesis, multimodal, and extensible Function Call plugin system. Supports one-click free deployment of your private ChatGPT/LLM web application.",
   "keywords": [
     "framework",
@@ -121,7 +121,7 @@
   "dependencies": {
     "@ant-design/icons": "^5.6.1",
     "@ant-design/pro-components": "^2.8.7",
-    "@anthropic-ai/sdk": "^0.40.1",
+    "@anthropic-ai/sdk": "^0.41.0",
     "@auth/core": "^0.38.0",
     "@aws-sdk/client-bedrock-runtime": "^3.779.0",
     "@aws-sdk/client-s3": "^3.779.0",
@@ -143,6 +143,7 @@
     "@langchain/community": "^0.3.38",
     "@lobechat/electron-client-ipc": "workspace:*",
     "@lobechat/electron-server-ipc": "workspace:*",
+    "@lobechat/file-loaders": "workspace:*",
     "@lobechat/web-crawler": "workspace:*",
     "@lobehub/charts": "^2.0.0",
     "@lobehub/chat-plugin-sdk": "^1.32.4",
package/packages/file-loaders/package.json CHANGED
@@ -20,6 +20,9 @@
   "author": "LobeHub <i@lobehub.com>",
   "sideEffects": false,
   "main": "./src/index.ts",
+  "scripts": {
+    "test": "vitest"
+  },
   "dependencies": {
     "@langchain/community": "^0.3.41",
     "@langchain/core": "^0.3.45",
@@ -27,13 +30,14 @@
     "concat-stream": "^2.0.0",
     "mammoth": "^1.8.0",
     "officeparser": "^5.1.1",
-    "pdfjs-dist": "4.8.69",
+    "pdfjs-dist": "4.10.38",
     "xlsx": "^0.18.5",
     "yauzl": "^3.2.0"
   },
   "devDependencies": {
     "@types/concat-stream": "^2.0.3",
     "@types/yauzl": "^2.10.3",
+    "canvas": "^3.1.0",
     "typescript": "^5"
   },
   "peerDependencies": {
package/packages/file-loaders/src/loadFile.ts CHANGED
@@ -1,3 +1,4 @@
+import debug from 'debug';
 import { stat } from 'node:fs/promises';
 import * as path from 'node:path';
 
@@ -7,37 +8,52 @@ import { FileDocument, FileMetadata, SupportedFileType } from './types';
 import type { DocumentPage, FileLoaderInterface } from './types';
 import { isTextReadableFile } from './utils/isTextReadableFile';
 
+const log = debug('file-loaders:loadFile');
+
 /**
  * Determines the file type based on the filename extension.
  * @param filePath The path to the file.
  * @returns The determined file type or 'txt' if text-readable, undefined otherwise.
  */
 const getFileType = (filePath: string): SupportedFileType | undefined => {
+  log('Determining file type for:', filePath);
   const extension = path.extname(filePath).toLowerCase().replace('.', '');
 
-  if (!extension) return 'txt'; // Treat files without extension as text?
+  if (!extension) {
+    log('No extension found, treating as txt');
+    return 'txt'; // Treat files without extension as text?
+  }
 
   // Prioritize checking if it's a generally text-readable type
   if (isTextReadableFile(extension)) {
+    log(`Extension '${extension}' is text-readable, treating as txt`);
     return 'txt';
   }
 
   // Handle specific non-text or complex types
+  log(`Checking specific types for extension: '${extension}'`);
   switch (extension) {
     case 'pdf': {
+      log('File type identified as pdf');
       return 'pdf';
     }
     case 'docx': {
+      log('File type identified as docx');
       return 'docx';
     }
     case 'xlsx':
     case 'xls': {
+      log('File type identified as excel');
       return 'excel';
     }
     case 'pptx': {
+      log('File type identified as pptx');
       return 'pptx';
     }
     default: {
+      log(
+        `Extension '${extension}' is not a specifically handled type and not text-readable. Unsupported.`,
+      );
       // If not text-readable and not a specific known type, it's unsupported
       return undefined;
     }
@@ -59,18 +75,23 @@ export const loadFile = async (
   filePath: string,
   fileMetadata?: FileMetadata,
 ): Promise<FileDocument> => {
+  log('Starting to load file:', filePath, 'with metadata:', fileMetadata);
   let stats;
   let fsError: string | undefined;
 
   try {
+    log('Attempting to get file stats for:', filePath);
     stats = await stat(filePath);
+    log('Successfully retrieved file stats:', stats);
   } catch (e) {
     const error = e as Error;
+    log('Error getting file stats for %s: %s', filePath, error.message);
     console.error(`Error getting file stats for ${filePath}: ${error.message}`);
     fsError = `Failed to access file stats: ${error.message}`;
   }
 
   // Determine base file info from path and stats (if available)
+  log('Determining base file info');
   const fileExtension = path.extname(filePath).slice(1).toLowerCase();
   const baseFilename = path.basename(filePath);
 
@@ -80,13 +101,22 @@
   const fileType = fileMetadata?.fileType ?? fileExtension;
   const createdTime = fileMetadata?.createdTime ?? stats?.ctime ?? new Date();
   const modifiedTime = fileMetadata?.modifiedTime ?? stats?.mtime ?? new Date();
+  log('File info determined/overridden: %O', {
+    createdTime,
+    fileType,
+    filename,
+    modifiedTime,
+    source,
+  });
 
   const paserType = getFileType(filePath);
+  log('Parser type determined as:', paserType);
 
   // Select the loader CLASS based on the determined fileType, fallback to DefaultLoader
   const LoaderClass: new () => FileLoaderInterface = paserType
     ? fileLoaders[paserType]
     : DefaultLoader;
+  log('Selected loader class:', LoaderClass.name);
 
   if (!paserType) {
     console.warn(
@@ -102,17 +132,23 @@
   let loaderSpecificMetadata: any | undefined;
 
   // Instantiate the loader
+  log('Instantiating loader:', LoaderClass.name);
   const loaderInstance = new LoaderClass();
 
   // If we couldn't even get stats, skip loader execution
   if (!fsError) {
+    log('File stats available, proceeding with loader execution.');
     try {
       // 1. Load pages using the instance
+      log('Loading pages with loader:', LoaderClass.name, 'for file:', filePath);
       pages = await loaderInstance.loadPages(filePath);
+      log('Pages loaded successfully, count:', pages.length);
 
       try {
         // 2. Aggregate content using the instance
+        log('Aggregating content with loader:', LoaderClass.name);
         aggregatedContent = await loaderInstance.aggregateContent(pages);
+        log('Content aggregated successfully, length:', aggregatedContent.length);
       } catch (aggError) {
         const error = aggError as Error;
         console.error(
@@ -124,8 +160,10 @@
 
       // 3. Attach document-specific metadata if loader supports it
       if (typeof loaderInstance.attachDocumentMetadata === 'function') {
+        log('Loader supports attachDocumentMetadata. Attaching...');
         try {
           loaderSpecificMetadata = await loaderInstance.attachDocumentMetadata(filePath);
+          log('Document-specific metadata attached:', loaderSpecificMetadata);
         } catch (metaErr) {
           const error = metaErr as Error;
           console.error(
@@ -133,6 +171,8 @@
           );
           metadataError = `Metadata attachment failed: ${error.message}`;
         }
+      } else {
+        log('Loader does not support attachDocumentMetadata.');
       }
     } catch (loadErr) {
       const error = loadErr as Error;
@@ -152,6 +192,7 @@
       // Aggregated content remains empty
     }
   } else {
+    log('File stats access failed (fsError: %s). Creating minimal error page.', fsError);
     // If stats failed, create a minimal error page
     pages = [
       {
@@ -167,16 +208,20 @@
   // Calculate totals from the loaded pages
   let totalCharCount = 0;
   let totalLineCount = 0;
+  log('Calculating total char and line counts from pages.');
   for (const page of pages) {
     totalCharCount += page.charCount;
     totalLineCount += page.lineCount;
   }
+  log('Totals calculated:', { totalCharCount, totalLineCount });
 
   // Combine all potential errors
   const combinedError =
     [fsError, loaderError, aggregationError, metadataError].filter(Boolean).join('; ') || undefined;
+  if (combinedError) log('Combined errors:', combinedError);
 
   // Construct the final FileDocument
+  log('Constructing final FileDocument.');
   const fileDocument: FileDocument = {
     content: aggregatedContent, // Use content from aggregateContent
     createdTime,
@@ -202,5 +247,10 @@
     delete fileDocument.metadata.error;
   }
 
+  log('File loading process completed for:', filePath, 'Returning document:', {
+    fileType: fileDocument.fileType,
+    filename: fileDocument.filename,
+    pages: fileDocument.pages?.length,
+  });
   return fileDocument;
 };
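
All of the logging added across the file-loaders package goes through `debug` under `file-loaders:*` namespaces (`file-loaders:loadFile` here, `file-loaders:docx` and `file-loaders:excel` below). A minimal usage sketch — assuming `loadFile` is re-exported from the `@lobechat/file-loaders` entry point and the script is run with `DEBUG=file-loaders:*` set in the environment; the file path is a placeholder:

```ts
// Sketch only: run with DEBUG=file-loaders:* to surface the new log output.
import { loadFile } from '@lobechat/file-loaders';

const main = async () => {
  // The returned FileDocument carries the aggregated content plus per-page stats.
  const doc = await loadFile('./example.docx');
  console.log(doc.filename, doc.fileType, doc.pages?.length);
};

void main();
```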
package/packages/file-loaders/src/loaders/docx/index.ts CHANGED
@@ -1,15 +1,21 @@
 import { DocxLoader as LangchainDocxLoader } from '@langchain/community/document_loaders/fs/docx';
+import debug from 'debug';
 
 import type { DocumentPage, FileLoaderInterface } from '../../types';
 
+const log = debug('file-loaders:docx');
+
 /**
  * Loads Word documents (.docx) using the LangChain Community DocxLoader.
  */
 export class DocxLoader implements FileLoaderInterface {
   async loadPages(filePath: string): Promise<DocumentPage[]> {
+    log('Loading DOCX file:', filePath);
     try {
       const loader = new LangchainDocxLoader(filePath);
+      log('LangChain DocxLoader created');
       const docs = await loader.load(); // Langchain DocxLoader typically loads the whole doc as one
+      log('DOCX document loaded, parts:', docs.length);
 
       const pages: DocumentPage[] = docs.map((doc) => {
         const pageContent = doc.pageContent || '';
@@ -27,6 +33,8 @@ export class DocxLoader implements FileLoaderInterface {
         // @ts-expect-error Remove source if present, as it's handled at the FileDocument level
         delete metadata.source;
 
+        log('DOCX document processed, lines:', lineCount, 'chars:', charCount);
+
         return {
           charCount,
           lineCount,
@@ -37,6 +45,7 @@ export class DocxLoader implements FileLoaderInterface {
 
       // If docs array is empty (e.g., empty file), create an empty page
       if (pages.length === 0) {
+        log('No content in DOCX document, creating empty page');
         pages.push({
           charCount: 0,
           lineCount: 0,
@@ -45,9 +54,11 @@ export class DocxLoader implements FileLoaderInterface {
         });
       }
 
+      log('DOCX loading completed, total pages:', pages.length);
       return pages;
     } catch (e) {
       const error = e as Error;
+      log('Error encountered while loading DOCX file');
       console.error(`Error loading DOCX file ${filePath} using LangChain loader: ${error.message}`);
       const errorPage: DocumentPage = {
         charCount: 0,
@@ -57,6 +68,7 @@ export class DocxLoader implements FileLoaderInterface {
         },
         pageContent: '',
       };
+      log('Created error page for failed DOCX loading');
       return [errorPage];
     }
   }
@@ -68,6 +80,9 @@ export class DocxLoader implements FileLoaderInterface {
    * @returns Aggregated content as a string.
    */
   async aggregateContent(pages: DocumentPage[]): Promise<string> {
-    return pages.map((page) => page.pageContent).join('\n\n');
+    log('Aggregating content from', pages.length, 'DOCX pages');
+    const result = pages.map((page) => page.pageContent).join('\n\n');
+    log('DOCX content aggregated successfully, length:', result.length);
+    return result;
   }
 }
package/packages/file-loaders/src/loaders/excel/index.ts CHANGED
@@ -1,26 +1,34 @@
+import debug from 'debug';
 import { readFile } from 'node:fs/promises';
 import * as xlsx from 'xlsx';
 
 import type { DocumentPage, FileLoaderInterface } from '../../types';
 
+const log = debug('file-loaders:excel');
+
 /**
  * Converts sheet data (array of objects) to a Markdown table string.
  * Handles empty sheets and escapes pipe characters.
  */
 function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
+  log('Converting sheet data to Markdown table, rows:', jsonData?.length || 0);
   if (!jsonData || jsonData.length === 0) {
+    log('Sheet is empty, returning placeholder message');
     return '*Sheet is empty or contains no data.*';
   }
 
   // Ensure all rows have the same keys based on the first row, handle potentially sparse data
   const headers = Object.keys(jsonData[0] || {});
+  log('Sheet headers:', headers);
   if (headers.length === 0) {
+    log('Sheet has no headers, returning placeholder message');
     return '*Sheet has headers but no data.*';
   }
 
   const headerRow = `| ${headers.join(' | ')} |`;
   const separatorRow = `| ${headers.map(() => '---').join(' | ')} |`;
 
+  log('Building data rows for Markdown table');
   const dataRows = jsonData
     .map((row) => {
       const cells = headers.map((header) => {
@@ -34,7 +42,9 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
     })
     .join('\n');
 
-  return `${headerRow}\n${separatorRow}\n${dataRows}`;
+  const result = `${headerRow}\n${separatorRow}\n${dataRows}`;
+  log('Markdown table created, length:', result.length);
+  return result;
 }
 
 /**
@@ -43,13 +53,20 @@ function sheetToMarkdownTable(jsonData: Record<string, any>[]): string {
  */
 export class ExcelLoader implements FileLoaderInterface {
   async loadPages(filePath: string): Promise<DocumentPage[]> {
+    log('Loading Excel file:', filePath);
     const pages: DocumentPage[] = [];
     try {
       // Use readFile for async operation compatible with other loaders
+      log('Reading Excel file as buffer');
       const dataBuffer = await readFile(filePath);
+      log('Excel file read successfully, size:', dataBuffer.length, 'bytes');
+
+      log('Parsing Excel workbook');
       const workbook = xlsx.read(dataBuffer, { type: 'buffer' });
+      log('Excel workbook parsed successfully, sheets:', workbook.SheetNames.length);
 
       for (const sheetName of workbook.SheetNames) {
+        log(`Processing sheet: ${sheetName}`);
         const worksheet = workbook.Sheets[sheetName];
         // Use sheet_to_json to get array of objects for our custom markdown function
         const jsonData = xlsx.utils.sheet_to_json<Record<string, any>>(worksheet, {
@@ -57,6 +74,7 @@ export class ExcelLoader implements FileLoaderInterface {
           defval: '',
           raw: false, // Use empty string for blank cells
         });
+        log(`Sheet ${sheetName} converted to JSON, rows:`, jsonData.length);
 
         // Convert to markdown using YOUR helper function
         const tableMarkdown = sheetToMarkdownTable(jsonData);
@@ -64,6 +82,7 @@ export class ExcelLoader implements FileLoaderInterface {
         const lines = tableMarkdown.split('\n');
         const lineCount = lines.length;
         const charCount = tableMarkdown.length;
+        log(`Sheet ${sheetName} converted to Markdown, lines: ${lineCount}, chars: ${charCount}`);
 
         pages.push({
           // Trim whitespace
@@ -74,9 +93,11 @@ export class ExcelLoader implements FileLoaderInterface {
           },
           pageContent: tableMarkdown.trim(),
         });
+        log(`Added sheet ${sheetName} as page`);
       }
 
       if (pages.length === 0) {
+        log('Excel file contains no sheets, creating empty page with error');
         pages.push({
           charCount: 0,
           lineCount: 0,
@@ -87,9 +108,11 @@ export class ExcelLoader implements FileLoaderInterface {
         });
       }
 
+      log('Excel loading completed, total pages:', pages.length);
       return pages;
     } catch (e) {
       const error = e as Error;
+      log('Error encountered while loading Excel file');
       console.error(`Error loading Excel file ${filePath}: ${error.message}`);
       const errorPage: DocumentPage = {
         charCount: 0,
@@ -99,6 +122,7 @@ export class ExcelLoader implements FileLoaderInterface {
         },
         pageContent: '',
       };
+      log('Created error page for failed Excel loading');
       return [errorPage];
     }
   }
@@ -110,12 +134,16 @@ export class ExcelLoader implements FileLoaderInterface {
    * @returns Aggregated content as a string.
    */
   async aggregateContent(pages: DocumentPage[]): Promise<string> {
-    return pages
+    log('Aggregating content from', pages.length, 'Excel pages');
+    const result = pages
       .map((page) => {
        const sheetName = page.metadata.sheetName;
        const header = sheetName ? `## Sheet: ${sheetName}\n\n` : '';
        return header + page.pageContent;
      })
      .join('\n\n---\n\n'); // Separator between sheets
+
+    log('Excel content aggregated successfully, length:', result.length);
+    return result;
   }
 }
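
For reference, the conversion above emits one Markdown table per worksheet: a header row, a separator row, then one row per record. A small self-contained demonstration of that shape, using hypothetical data and a simplified re-implementation (the original helper is not exported, and pipe escaping is omitted here for brevity):

```ts
// Demonstration only: mirrors the shape produced by the (non-exported) sheetToMarkdownTable helper.
const toMarkdownTable = (rows: Record<string, unknown>[]): string => {
  const headers = Object.keys(rows[0] ?? {});
  const headerRow = `| ${headers.join(' | ')} |`;
  const separatorRow = `| ${headers.map(() => '---').join(' | ')} |`;
  const dataRows = rows
    .map((row) => `| ${headers.map((h) => String(row[h] ?? '')).join(' | ')} |`)
    .join('\n');
  return `${headerRow}\n${separatorRow}\n${dataRows}`;
};

console.log(
  toMarkdownTable([
    { Name: 'Ada', Role: 'Engineer' },
    { Name: 'Grace', Role: 'Admiral' },
  ]),
);
// | Name | Role |
// | --- | --- |
// | Ada | Engineer |
// | Grace | Admiral |
```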
package/packages/file-loaders/src/loaders/pdf/__snapshots__/index.test.ts.snap CHANGED
@@ -48,7 +48,7 @@ exports[`PdfLoader > should attach document metadata correctly 1`] = `
     "Title": "test",
   },
   "pdfMetadata": null,
-  "pdfVersion": "4.8.69",
+  "pdfVersion": "4.10.38",
 }
 `;
 