@lobehub/chat 1.84.27 → 1.85.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +25 -0
- package/changelog/v1.json +9 -0
- package/docs/development/database-schema.dbml +59 -1
- package/package.json +2 -1
- package/packages/file-loaders/package.json +5 -1
- package/packages/file-loaders/src/loadFile.ts +51 -1
- package/packages/file-loaders/src/loaders/docx/index.ts +16 -1
- package/packages/file-loaders/src/loaders/excel/index.ts +30 -2
- package/packages/file-loaders/src/loaders/pdf/__snapshots__/index.test.ts.snap +1 -1
- package/packages/file-loaders/src/loaders/pdf/index.ts +52 -12
- package/packages/file-loaders/src/loaders/pptx/index.ts +32 -1
- package/packages/file-loaders/src/loaders/text/index.test.ts +1 -1
- package/packages/file-loaders/src/loaders/text/index.ts +13 -1
- package/packages/file-loaders/test/__snapshots__/loaders.test.ts.snap +41 -0
- package/packages/file-loaders/test/loaders.test.ts +20 -0
- package/packages/file-loaders/test/setup.ts +17 -0
- package/packages/file-loaders/vitest.config.ts +14 -0
- package/src/const/file.ts +8 -1
- package/src/database/client/migrations.json +23 -1
- package/src/database/migrations/0022_add_documents.sql +49 -0
- package/src/database/migrations/meta/0022_snapshot.json +5340 -0
- package/src/database/migrations/meta/_journal.json +7 -0
- package/src/database/models/_template.ts +1 -1
- package/src/database/models/document.ts +54 -0
- package/src/database/models/message.ts +25 -0
- package/src/database/repositories/tableViewer/index.test.ts +1 -1
- package/src/database/schemas/document.ts +104 -0
- package/src/database/schemas/index.ts +1 -0
- package/src/database/schemas/relations.ts +34 -2
- package/src/database/schemas/topic.ts +31 -8
- package/src/database/utils/idGenerator.ts +1 -0
- package/src/features/ChatInput/Desktop/FilePreview/FileItem/Content.tsx +1 -1
- package/src/features/ChatInput/Desktop/FilePreview/FileItem/index.tsx +10 -10
- package/src/features/ChatInput/components/UploadDetail/UploadStatus.tsx +2 -2
- package/src/features/Conversation/Actions/Error.tsx +2 -2
- package/src/libs/trpc/lambda/context.ts +7 -0
- package/src/prompts/files/file.ts +6 -4
- package/src/server/routers/lambda/document.ts +36 -0
- package/src/server/routers/lambda/index.ts +2 -0
- package/src/server/services/document/index.ts +66 -0
- package/src/server/services/mcp/index.ts +0 -4
- package/src/services/rag.ts +4 -0
- package/src/store/chat/slices/aiChat/actions/__tests__/rag.test.ts +2 -2
- package/src/store/chat/slices/aiChat/actions/rag.ts +2 -3
- package/src/store/file/slices/chat/action.ts +3 -51
- package/src/types/document/index.ts +172 -0
- package/src/types/message/chat.ts +1 -0
- package/src/features/ChatInput/Desktop/FilePreview/FileItem/style.ts +0 -4
@@ -1,37 +1,54 @@
|
|
1
|
+
import debug from 'debug';
|
1
2
|
import { readFile } from 'node:fs/promises';
|
2
|
-
import
|
3
|
-
import
|
3
|
+
import type { PDFDocumentProxy, PDFPageProxy } from 'pdfjs-dist';
|
4
|
+
import { getDocument, version } from 'pdfjs-dist/legacy/build/pdf.mjs';
|
5
|
+
// @ts-ignore
|
6
|
+
import * as _pdfjsWorker from 'pdfjs-dist/legacy/build/pdf.worker.mjs';
|
7
|
+
import type { TextContent } from 'pdfjs-dist/types/src/display/api';
|
4
8
|
|
5
9
|
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
6
10
|
|
11
|
+
const log = debug('file-loaders:pdf');
|
12
|
+
|
7
13
|
/**
|
8
14
|
* Loads PDF files page by page using the official pdfjs-dist library.
|
9
15
|
*/
|
10
16
|
export class PdfLoader implements FileLoaderInterface {
|
11
17
|
private pdfInstance: PDFDocumentProxy | null = null;
|
18
|
+
private pdfjsWorker = _pdfjsWorker;
|
12
19
|
|
13
20
|
private async getPDFFile(filePath: string) {
|
21
|
+
// The pdf.js worker module is imported at module level (`_pdfjsWorker`); GlobalWorkerOptions.workerSrc is not set in this file.
|
22
|
+
// NOTE(review): presumably pdfjs-dist picks up that imported worker when it creates one — confirm.
|
23
|
+
|
24
|
+
log('Reading PDF file:', filePath);
|
14
25
|
const dataBuffer = await readFile(filePath);
|
26
|
+
log('PDF file read successfully, size:', dataBuffer.length, 'bytes');
|
15
27
|
|
16
|
-
const loadingTask =
|
28
|
+
const loadingTask = getDocument({
|
17
29
|
data: new Uint8Array(dataBuffer.buffer, dataBuffer.byteOffset, dataBuffer.length),
|
18
30
|
useSystemFonts: true,
|
19
|
-
// Explicitly disable worker thread
|
20
|
-
worker: undefined, // Attempt to use system fonts
|
21
31
|
});
|
22
32
|
|
23
|
-
|
33
|
+
log('PDF document loading task created');
|
34
|
+
const pdf = await loadingTask.promise;
|
35
|
+
log('PDF document loaded successfully, pages:', pdf.numPages);
|
36
|
+
return pdf;
|
24
37
|
}
|
25
38
|
|
26
39
|
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
40
|
+
log('Starting to load PDF pages from:', filePath);
|
27
41
|
try {
|
28
42
|
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
|
29
43
|
|
30
44
|
const pages: DocumentPage[] = [];
|
45
|
+
log(`Processing ${pdf.numPages} PDF pages`);
|
31
46
|
|
32
47
|
for (let i = 1; i <= pdf.numPages; i += 1) {
|
48
|
+
log(`Loading page ${i}/${pdf.numPages}`);
|
33
49
|
const page: PDFPageProxy = await pdf.getPage(i);
|
34
50
|
const content: TextContent = await page.getTextContent();
|
51
|
+
log(`Page ${i} text content retrieved, items:`, content.items.length);
|
35
52
|
|
36
53
|
// --- Revert to EXACT Simple Langchain PDFLoader Logic ---
|
37
54
|
let lastY;
|
@@ -61,6 +78,7 @@ export class PdfLoader implements FileLoaderInterface {
|
|
61
78
|
const pageLines = cleanedPageContent.split('\n');
|
62
79
|
const lineCount = pageLines.length;
|
63
80
|
const charCount = cleanedPageContent.length;
|
81
|
+
log(`Page ${i} processed, lines: ${lineCount}, chars: ${charCount}`);
|
64
82
|
|
65
83
|
pages.push({
|
66
84
|
charCount,
|
@@ -70,15 +88,19 @@ export class PdfLoader implements FileLoaderInterface {
|
|
70
88
|
});
|
71
89
|
|
72
90
|
// Clean up page resources
|
91
|
+
log(`Cleaning up page ${i} resources`);
|
73
92
|
page.cleanup();
|
74
93
|
}
|
75
94
|
|
76
95
|
// Clean up document resources
|
96
|
+
log('Cleaning up PDF document resources');
|
77
97
|
await pdf.destroy();
|
78
98
|
|
99
|
+
log(`PDF loading completed for ${filePath}, total pages:`, pages.length);
|
79
100
|
return pages;
|
80
101
|
} catch (e) {
|
81
102
|
const error = e as Error;
|
103
|
+
log('Error encountered while loading PDF file');
|
82
104
|
console.error(
|
83
105
|
`Error loading PDF file ${filePath} using pdfjs-dist: ${error.message}`,
|
84
106
|
error.stack,
|
@@ -92,6 +114,7 @@ export class PdfLoader implements FileLoaderInterface {
|
|
92
114
|
},
|
93
115
|
pageContent: '',
|
94
116
|
};
|
117
|
+
log('Created error page for failed PDF loading');
|
95
118
|
return [errorPage];
|
96
119
|
}
|
97
120
|
}
|
@@ -103,25 +126,42 @@ export class PdfLoader implements FileLoaderInterface {
|
|
103
126
|
* @returns Aggregated content as a string.
|
104
127
|
*/
|
105
128
|
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
.
|
129
|
+
log('Aggregating content from', pages.length, 'PDF pages');
|
130
|
+
const validPages = pages.filter((page) => !page.metadata.error);
|
131
|
+
log(
|
132
|
+
`Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} pages with errors filtered out)`,
|
133
|
+
);
|
134
|
+
|
135
|
+
const result = validPages.map((page) => page.pageContent).join('\n\n');
|
136
|
+
log('PDF content aggregated successfully, length:', result.length);
|
137
|
+
return result;
|
110
138
|
}
|
111
139
|
|
112
140
|
async attachDocumentMetadata(filePath: string): Promise<any> {
|
141
|
+
log('Attaching document metadata for PDF:', filePath);
|
113
142
|
const pdf: PDFDocumentProxy = await this.getPDFFile(filePath);
|
114
143
|
|
115
|
-
|
144
|
+
log('Getting PDF metadata');
|
145
|
+
const pdfMetadata =
|
146
|
+
(await pdf.getMetadata().catch((err) => {
|
147
|
+
log('Error retrieving PDF metadata');
|
148
|
+
console.error(`Error getting PDF metadata: ${err.message}`);
|
149
|
+
return null;
|
150
|
+
})) ?? null;
|
151
|
+
|
116
152
|
const pdfInfo = pdfMetadata?.info ?? {};
|
117
153
|
const metadata = pdfMetadata?.metadata ?? null;
|
154
|
+
log('PDF metadata retrieved:', {
|
155
|
+
hasInfo: !!Object.keys(pdfInfo).length,
|
156
|
+
hasMetadata: !!metadata,
|
157
|
+
});
|
118
158
|
|
119
159
|
return {
|
120
160
|
pdfInfo: pdfInfo,
|
121
161
|
// PDF info (Author, Title, etc.)
|
122
162
|
pdfMetadata: metadata,
|
123
163
|
// PDF metadata
|
124
|
-
pdfVersion:
|
164
|
+
pdfVersion: version,
|
125
165
|
};
|
126
166
|
}
|
127
167
|
}
|
@@ -1,8 +1,11 @@
|
|
1
|
+
import debug from 'debug';
|
1
2
|
import path from 'node:path';
|
2
3
|
|
3
4
|
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
4
5
|
import { type ExtractedFile, extractFiles, parseString } from '../../utils/parser-utils';
|
5
6
|
|
7
|
+
const log = debug('file-loaders:pptx');
|
8
|
+
|
6
9
|
/**
|
7
10
|
* Represents a loader for PPTX files using extracted utility functions.
|
8
11
|
*
|
@@ -19,20 +22,25 @@ export class PptxLoader implements FileLoaderInterface {
|
|
19
22
|
* `DocumentPage` object with error information in its metadata.
|
20
23
|
*/
|
21
24
|
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
25
|
+
log('Loading PPTX file:', filePath);
|
22
26
|
const sourceFileName = path.basename(filePath);
|
27
|
+
log('Source file name:', sourceFileName);
|
23
28
|
|
24
29
|
try {
|
25
30
|
// --- File Extraction Step ---
|
26
31
|
const slidesRegex = /ppt\/slides\/slide\d+\.xml/g;
|
27
32
|
const slideNumberRegex = /slide(\d+)\.xml/;
|
28
33
|
|
34
|
+
log('Extracting slide XML files from PPTX');
|
29
35
|
// Extract only slide XML files
|
30
36
|
const slideFiles: ExtractedFile[] = await extractFiles(filePath, (fileName) =>
|
31
37
|
slidesRegex.test(fileName),
|
32
38
|
);
|
39
|
+
log('Extracted slide files:', slideFiles.length);
|
33
40
|
|
34
41
|
// --- Validation Step ---
|
35
42
|
if (slideFiles.length === 0) {
|
43
|
+
log('No slide XML files found in the PPTX file');
|
36
44
|
console.warn(`No slide XML files found in ${sourceFileName}. May be corrupted or empty.`);
|
37
45
|
return [
|
38
46
|
this.createErrorPage(
|
@@ -43,6 +51,7 @@ export class PptxLoader implements FileLoaderInterface {
|
|
43
51
|
}
|
44
52
|
|
45
53
|
// --- Sorting Step ---
|
54
|
+
log('Sorting slide files by slide number');
|
46
55
|
// Sort files based on the slide number extracted from the path
|
47
56
|
slideFiles.sort((a, b) => {
|
48
57
|
const matchA = a.path.match(slideNumberRegex);
|
@@ -51,13 +60,17 @@ export class PptxLoader implements FileLoaderInterface {
|
|
51
60
|
const numB = matchB ? parseInt(matchB[1], 10) : Infinity;
|
52
61
|
return numA - numB;
|
53
62
|
});
|
63
|
+
log('Slide files sorted');
|
54
64
|
|
55
65
|
// --- Page Creation Step ---
|
66
|
+
log('Creating document pages from slide files');
|
56
67
|
const pages: DocumentPage[] = slideFiles
|
57
68
|
.map((slideFile, index) => {
|
58
69
|
try {
|
70
|
+
log(`Processing slide ${index + 1}/${slideFiles.length}, path: ${slideFile.path}`);
|
59
71
|
const xmlDoc = parseString(slideFile.content);
|
60
72
|
const paragraphNodes = xmlDoc.getElementsByTagName('a:p');
|
73
|
+
log(`Found ${paragraphNodes.length} paragraph nodes in slide ${index + 1}`);
|
61
74
|
|
62
75
|
const slideText = Array.from(paragraphNodes)
|
63
76
|
.map((pNode) => {
|
@@ -72,6 +85,9 @@ export class PptxLoader implements FileLoaderInterface {
|
|
72
85
|
const lines = slideText.split('\n');
|
73
86
|
const slideNumberMatch = slideFile.path.match(slideNumberRegex);
|
74
87
|
const slideNumber = slideNumberMatch ? parseInt(slideNumberMatch[1], 10) : index + 1; // Fallback to index if regex fails
|
88
|
+
log(
|
89
|
+
`Slide ${index + 1} text extracted, lines: ${lines.length}, characters: ${slideText.length}`,
|
90
|
+
);
|
75
91
|
|
76
92
|
const metadata = {
|
77
93
|
pageCount: slideFiles.length, // Total number of slides found
|
@@ -86,6 +102,7 @@ export class PptxLoader implements FileLoaderInterface {
|
|
86
102
|
pageContent: slideText.trim(), // Trim final content
|
87
103
|
};
|
88
104
|
} catch (parseError) {
|
105
|
+
log(`Error parsing slide ${slideFile.path}`);
|
89
106
|
console.error(
|
90
107
|
`Failed to parse XML for slide ${slideFile.path} in ${sourceFileName}: ${parseError instanceof Error ? parseError.message : String(parseError)}`,
|
91
108
|
);
|
@@ -101,9 +118,11 @@ export class PptxLoader implements FileLoaderInterface {
|
|
101
118
|
})
|
102
119
|
// Filter out any potential nulls if we change the error handling above
|
103
120
|
.filter((page): page is DocumentPage => page !== null);
|
121
|
+
log(`Created ${pages.length} document pages from slides`);
|
104
122
|
|
105
123
|
if (pages.length === 0) {
|
106
124
|
// This case might happen if all slides failed to parse
|
125
|
+
log('Parsing resulted in zero valid pages');
|
107
126
|
console.warn(`Parsing resulted in zero valid pages for ${sourceFileName}`);
|
108
127
|
return [this.createErrorPage('Parsing resulted in zero valid pages.', sourceFileName)];
|
109
128
|
}
|
@@ -112,15 +131,18 @@ export class PptxLoader implements FileLoaderInterface {
|
|
112
131
|
const allErrored = pages.every((page) => page.metadata?.error);
|
113
132
|
if (allErrored) {
|
114
133
|
// If all pages resulted in errors, perhaps return a single summary error
|
134
|
+
log('All slides failed to parse');
|
115
135
|
console.warn(`All slides failed to parse for ${sourceFileName}`);
|
116
136
|
return [this.createErrorPage('All slides failed to parse correctly.', sourceFileName)];
|
117
137
|
// Or return all the individual error pages: return pages;
|
118
138
|
}
|
119
139
|
|
140
|
+
log('PPTX loading completed successfully');
|
120
141
|
return pages;
|
121
142
|
} catch (error) {
|
122
143
|
// --- Error Handling Step ---
|
123
144
|
// This catches errors from extractFiles or other unexpected issues
|
145
|
+
log('Error loading or processing PPTX file');
|
124
146
|
const errorMessage = `Failed to load or process PPTX file: ${error instanceof Error ? error.message : String(error)}`;
|
125
147
|
console.error(errorMessage, { filePath });
|
126
148
|
return [this.createErrorPage(errorMessage, sourceFileName)];
|
@@ -137,16 +159,21 @@ export class PptxLoader implements FileLoaderInterface {
|
|
137
159
|
* @returns A Promise resolving to the aggregated content string.
|
138
160
|
*/
|
139
161
|
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
162
|
+
log('Aggregating content from', pages.length, 'PPTX pages');
|
140
163
|
// Ensure pages array is valid and non-empty before proceeding
|
141
164
|
// Filter out error pages before aggregation unless we want to include error messages
|
142
165
|
const validPages = pages.filter((page) => !page.metadata?.error);
|
166
|
+
log(
|
167
|
+
`Found ${validPages.length} valid pages for aggregation (${pages.length - validPages.length} error pages filtered out)`,
|
168
|
+
);
|
143
169
|
|
144
170
|
if (validPages.length === 0) {
|
145
171
|
// If only error pages existed, return empty or a summary error message
|
172
|
+
log('No valid pages found, returning content of first page (may be error page)');
|
146
173
|
return pages[0]?.pageContent || ''; // Return content of the first page (might be an error page)
|
147
174
|
}
|
148
175
|
|
149
|
-
|
176
|
+
const result = validPages
|
150
177
|
.map((page) => {
|
151
178
|
const slideNumber = page.metadata?.slideNumber;
|
152
179
|
// Use Markdown H2 for slide headers
|
@@ -156,6 +183,9 @@ ${page.pageContent}
|
|
156
183
|
</slide_page>`;
|
157
184
|
})
|
158
185
|
.join('\n\n'); // Separate slides with a blank line
|
186
|
+
|
187
|
+
log('PPTX content aggregated successfully, length:', result.length);
|
188
|
+
return result;
|
159
189
|
}
|
160
190
|
|
161
191
|
/**
|
@@ -171,6 +201,7 @@ ${page.pageContent}
|
|
171
201
|
sourceFileName: string,
|
172
202
|
sourceFilePath?: string,
|
173
203
|
): DocumentPage {
|
204
|
+
log('Creating error page:', errorInfo);
|
174
205
|
return {
|
175
206
|
charCount: 0,
|
176
207
|
lineCount: 0,
|
@@ -1,17 +1,23 @@
|
|
1
|
+
import debug from 'debug';
|
1
2
|
import { readFile } from 'node:fs/promises';
|
2
3
|
|
3
4
|
import type { DocumentPage, FileLoaderInterface } from '../../types';
|
4
5
|
|
6
|
+
const log = debug('file-loaders:text');
|
7
|
+
|
5
8
|
/**
|
6
9
|
* 用于加载纯文本文件的加载器。
|
7
10
|
*/
|
8
11
|
export class TextLoader implements FileLoaderInterface {
|
9
12
|
async loadPages(filePath: string): Promise<DocumentPage[]> {
|
13
|
+
log('Loading text file:', filePath);
|
10
14
|
try {
|
11
15
|
const fileContent = await readFile(filePath, 'utf8');
|
16
|
+
log('Text file loaded successfully, size:', fileContent.length, 'bytes');
|
12
17
|
const lines = fileContent.split('\n');
|
13
18
|
const lineCount = lines.length;
|
14
19
|
const charCount = fileContent.length;
|
20
|
+
log('Text file stats:', { charCount, lineCount });
|
15
21
|
|
16
22
|
const page: DocumentPage = {
|
17
23
|
charCount,
|
@@ -23,9 +29,11 @@ export class TextLoader implements FileLoaderInterface {
|
|
23
29
|
pageContent: fileContent,
|
24
30
|
};
|
25
31
|
|
32
|
+
log('Text page created successfully');
|
26
33
|
return [page];
|
27
34
|
} catch (e) {
|
28
35
|
const error = e as Error;
|
36
|
+
log('Error encountered while loading text file');
|
29
37
|
console.error(`Error loading text file ${filePath}: ${error.message}`);
|
30
38
|
// 如果读取失败,返回一个包含错误信息的 Page
|
31
39
|
const errorPage: DocumentPage = {
|
@@ -36,6 +44,7 @@ export class TextLoader implements FileLoaderInterface {
|
|
36
44
|
},
|
37
45
|
pageContent: '',
|
38
46
|
};
|
47
|
+
log('Created error page for failed text file loading');
|
39
48
|
return [errorPage];
|
40
49
|
}
|
41
50
|
}
|
@@ -47,7 +56,10 @@ export class TextLoader implements FileLoaderInterface {
|
|
47
56
|
* @returns 聚合后的内容
|
48
57
|
*/
|
49
58
|
async aggregateContent(pages: DocumentPage[]): Promise<string> {
|
59
|
+
log('Aggregating content from', pages.length, 'text pages');
|
50
60
|
// 默认使用换行符连接,可以根据需要调整或使其可配置
|
51
|
-
|
61
|
+
const result = pages.map((page) => page.pageContent).join('\n');
|
62
|
+
log('Content aggregated successfully, length:', result.length);
|
63
|
+
return result;
|
52
64
|
}
|
53
65
|
}
|
@@ -1,5 +1,46 @@
|
|
1
1
|
// Vitest Snapshot v1, https://vitest.dev/guide/snapshot.html
|
2
2
|
|
3
|
+
exports[`loadFile Integration Tests > PDF Handling > should load content from a pdf file using filePath 1`] = `
|
4
|
+
{
|
5
|
+
"content": "123",
|
6
|
+
"fileType": "pdf",
|
7
|
+
"filename": "test.pdf",
|
8
|
+
"metadata": {
|
9
|
+
"loaderSpecific": {
|
10
|
+
"pdfInfo": {
|
11
|
+
"CreationDate": "D:20250419143655Z00'00'",
|
12
|
+
"Creator": "Pages文稿",
|
13
|
+
"EncryptFilterName": null,
|
14
|
+
"IsAcroFormPresent": false,
|
15
|
+
"IsCollectionPresent": false,
|
16
|
+
"IsLinearized": false,
|
17
|
+
"IsSignaturesPresent": false,
|
18
|
+
"IsXFAPresent": false,
|
19
|
+
"Language": null,
|
20
|
+
"ModDate": "D:20250419143655Z00'00'",
|
21
|
+
"PDFFormatVersion": "1.3",
|
22
|
+
"Producer": "macOS 版本15.3.2(版号24D81) Quartz PDFContext",
|
23
|
+
"Title": "test",
|
24
|
+
},
|
25
|
+
"pdfMetadata": null,
|
26
|
+
"pdfVersion": "4.10.38",
|
27
|
+
},
|
28
|
+
},
|
29
|
+
"pages": [
|
30
|
+
{
|
31
|
+
"charCount": 3,
|
32
|
+
"lineCount": 1,
|
33
|
+
"metadata": {
|
34
|
+
"pageNumber": 1,
|
35
|
+
},
|
36
|
+
"pageContent": "123",
|
37
|
+
},
|
38
|
+
],
|
39
|
+
"totalCharCount": 3,
|
40
|
+
"totalLineCount": 1,
|
41
|
+
}
|
42
|
+
`;
|
43
|
+
|
3
44
|
exports[`loadFile Integration Tests > Text Handling (.txt, .csv, .md, etc.) > should load content from a test.csv file using filePath 1`] = `
|
4
45
|
{
|
5
46
|
"content": "ID,Name,Value
|
@@ -36,4 +36,24 @@ describe('loadFile Integration Tests', () => {
|
|
36
36
|
testPureTextFile(file);
|
37
37
|
});
|
38
38
|
});
|
39
|
+
|
40
|
+
describe('PDF Handling', () => {
|
41
|
+
it(`should load content from a pdf file using filePath`, async () => {
|
42
|
+
const filePath = getFixturePath('test.pdf');
|
43
|
+
|
44
|
+
// Pass filePath directly to loadFile
|
45
|
+
const docs = await loadFile(filePath);
|
46
|
+
|
47
|
+
expect(docs.content).toEqual('123');
|
48
|
+
expect(docs.source).toEqual(filePath);
|
49
|
+
|
50
|
+
// @ts-expect-error
|
51
|
+
delete docs.source;
|
52
|
+
// @ts-expect-error
|
53
|
+
delete docs.createdTime;
|
54
|
+
// @ts-expect-error
|
55
|
+
delete docs.modifiedTime;
|
56
|
+
expect(docs).toMatchSnapshot();
|
57
|
+
});
|
58
|
+
});
|
39
59
|
});
|
@@ -0,0 +1,17 @@
|
|
1
|
+
// Polyfill DOMMatrix for pdfjs-dist in Node.js environment
|
2
|
+
import { DOMMatrix } from 'canvas';
|
3
|
+
|
4
|
+
if (typeof global.DOMMatrix === 'undefined') {
|
5
|
+
// @ts-ignore
|
6
|
+
global.DOMMatrix = DOMMatrix;
|
7
|
+
}
|
8
|
+
|
9
|
+
// Polyfill URL.createObjectURL and URL.revokeObjectURL for pdfjs-dist
|
10
|
+
if (typeof global.URL.createObjectURL === 'undefined') {
|
11
|
+
global.URL.createObjectURL = () => 'blob:http://localhost/fake-blob-url';
|
12
|
+
}
|
13
|
+
if (typeof global.URL.revokeObjectURL === 'undefined') {
|
14
|
+
global.URL.revokeObjectURL = () => {
|
15
|
+
/* no-op */
|
16
|
+
};
|
17
|
+
}
|
@@ -0,0 +1,14 @@
|
|
1
|
+
import { defineConfig } from 'vitest/config';
|
2
|
+
|
3
|
+
export default defineConfig({
|
4
|
+
test: {
|
5
|
+
// coverage: {
|
6
|
+
// all: false,
|
7
|
+
// provider: 'v8',
|
8
|
+
// reporter: ['text', 'json', 'lcov', 'text-summary'],
|
9
|
+
// reportsDirectory: './coverage/app',
|
10
|
+
// },
|
11
|
+
environment: 'happy-dom',
|
12
|
+
// setupFiles: join(__dirname, './test/setup.ts'),
|
13
|
+
},
|
14
|
+
});
|
package/src/const/file.ts
CHANGED
@@ -456,6 +456,28 @@
|
|
456
456
|
],
|
457
457
|
"bps": true,
|
458
458
|
"folderMillis": 1744602998656,
|
459
|
-
"hash": "
|
459
|
+
"hash": "fdbac49ffdbe759234e760d0d48cdc1854028ea70d756a12b72f24305b4f3072"
|
460
|
+
},
|
461
|
+
{
|
462
|
+
"sql": [
|
463
|
+
"CREATE TABLE IF NOT EXISTS \"document_chunks\" (\n\t\"document_id\" varchar(30) NOT NULL,\n\t\"chunk_id\" uuid NOT NULL,\n\t\"page_index\" integer,\n\t\"user_id\" text NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\tCONSTRAINT \"document_chunks_document_id_chunk_id_pk\" PRIMARY KEY(\"document_id\",\"chunk_id\")\n);\n",
|
464
|
+
"\nCREATE TABLE IF NOT EXISTS \"documents\" (\n\t\"id\" varchar(30) PRIMARY KEY NOT NULL,\n\t\"title\" text,\n\t\"content\" text,\n\t\"file_type\" varchar(255) NOT NULL,\n\t\"filename\" text,\n\t\"total_char_count\" integer NOT NULL,\n\t\"total_line_count\" integer NOT NULL,\n\t\"metadata\" jsonb,\n\t\"pages\" jsonb,\n\t\"source_type\" text NOT NULL,\n\t\"source\" text NOT NULL,\n\t\"file_id\" text,\n\t\"user_id\" text NOT NULL,\n\t\"client_id\" text,\n\t\"accessed_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\t\"updated_at\" timestamp with time zone DEFAULT now() NOT NULL\n);\n",
|
465
|
+
"\nCREATE TABLE IF NOT EXISTS \"topic_documents\" (\n\t\"document_id\" text NOT NULL,\n\t\"topic_id\" text NOT NULL,\n\t\"user_id\" text NOT NULL,\n\t\"created_at\" timestamp with time zone DEFAULT now() NOT NULL,\n\tCONSTRAINT \"topic_documents_document_id_topic_id_pk\" PRIMARY KEY(\"document_id\",\"topic_id\")\n);\n",
|
466
|
+
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_document_id_documents_id_fk\" FOREIGN KEY (\"document_id\") REFERENCES \"public\".\"documents\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
467
|
+
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_chunk_id_chunks_id_fk\" FOREIGN KEY (\"chunk_id\") REFERENCES \"public\".\"chunks\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
468
|
+
"\nALTER TABLE \"document_chunks\" ADD CONSTRAINT \"document_chunks_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
469
|
+
"\nALTER TABLE \"documents\" ADD CONSTRAINT \"documents_file_id_files_id_fk\" FOREIGN KEY (\"file_id\") REFERENCES \"public\".\"files\"(\"id\") ON DELETE set null ON UPDATE no action;",
|
470
|
+
"\nALTER TABLE \"documents\" ADD CONSTRAINT \"documents_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
471
|
+
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_document_id_documents_id_fk\" FOREIGN KEY (\"document_id\") REFERENCES \"public\".\"documents\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
472
|
+
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_topic_id_topics_id_fk\" FOREIGN KEY (\"topic_id\") REFERENCES \"public\".\"topics\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
473
|
+
"\nALTER TABLE \"topic_documents\" ADD CONSTRAINT \"topic_documents_user_id_users_id_fk\" FOREIGN KEY (\"user_id\") REFERENCES \"public\".\"users\"(\"id\") ON DELETE cascade ON UPDATE no action;",
|
474
|
+
"\nCREATE INDEX \"documents_source_idx\" ON \"documents\" USING btree (\"source\");",
|
475
|
+
"\nCREATE INDEX \"documents_file_type_idx\" ON \"documents\" USING btree (\"file_type\");",
|
476
|
+
"\nCREATE INDEX \"documents_file_id_idx\" ON \"documents\" USING btree (\"file_id\");",
|
477
|
+
"\nCREATE UNIQUE INDEX \"documents_client_id_user_id_unique\" ON \"documents\" USING btree (\"client_id\",\"user_id\");\n"
|
478
|
+
],
|
479
|
+
"bps": true,
|
480
|
+
"folderMillis": 1746724476380,
|
481
|
+
"hash": "0518cd9882f7ea38eb498b31c8dda73fb56bbc3aa55445ecbc7a9e716631d047"
|
460
482
|
}
|
461
483
|
]
|
@@ -0,0 +1,49 @@
|
|
1
|
+
CREATE TABLE IF NOT EXISTS "document_chunks" (
|
2
|
+
"document_id" varchar(30) NOT NULL,
|
3
|
+
"chunk_id" uuid NOT NULL,
|
4
|
+
"page_index" integer,
|
5
|
+
"user_id" text NOT NULL,
|
6
|
+
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
7
|
+
CONSTRAINT "document_chunks_document_id_chunk_id_pk" PRIMARY KEY("document_id","chunk_id")
|
8
|
+
);
|
9
|
+
--> statement-breakpoint
|
10
|
+
CREATE TABLE IF NOT EXISTS "documents" (
|
11
|
+
"id" varchar(30) PRIMARY KEY NOT NULL,
|
12
|
+
"title" text,
|
13
|
+
"content" text,
|
14
|
+
"file_type" varchar(255) NOT NULL,
|
15
|
+
"filename" text,
|
16
|
+
"total_char_count" integer NOT NULL,
|
17
|
+
"total_line_count" integer NOT NULL,
|
18
|
+
"metadata" jsonb,
|
19
|
+
"pages" jsonb,
|
20
|
+
"source_type" text NOT NULL,
|
21
|
+
"source" text NOT NULL,
|
22
|
+
"file_id" text,
|
23
|
+
"user_id" text NOT NULL,
|
24
|
+
"client_id" text,
|
25
|
+
"accessed_at" timestamp with time zone DEFAULT now() NOT NULL,
|
26
|
+
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
27
|
+
"updated_at" timestamp with time zone DEFAULT now() NOT NULL
|
28
|
+
);
|
29
|
+
--> statement-breakpoint
|
30
|
+
CREATE TABLE IF NOT EXISTS "topic_documents" (
|
31
|
+
"document_id" text NOT NULL,
|
32
|
+
"topic_id" text NOT NULL,
|
33
|
+
"user_id" text NOT NULL,
|
34
|
+
"created_at" timestamp with time zone DEFAULT now() NOT NULL,
|
35
|
+
CONSTRAINT "topic_documents_document_id_topic_id_pk" PRIMARY KEY("document_id","topic_id")
|
36
|
+
);
|
37
|
+
--> statement-breakpoint
|
38
|
+
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_document_id_documents_id_fk" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
39
|
+
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_chunk_id_chunks_id_fk" FOREIGN KEY ("chunk_id") REFERENCES "public"."chunks"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
40
|
+
ALTER TABLE "document_chunks" ADD CONSTRAINT "document_chunks_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
41
|
+
ALTER TABLE "documents" ADD CONSTRAINT "documents_file_id_files_id_fk" FOREIGN KEY ("file_id") REFERENCES "public"."files"("id") ON DELETE set null ON UPDATE no action;--> statement-breakpoint
|
42
|
+
ALTER TABLE "documents" ADD CONSTRAINT "documents_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
43
|
+
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_document_id_documents_id_fk" FOREIGN KEY ("document_id") REFERENCES "public"."documents"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
44
|
+
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_topic_id_topics_id_fk" FOREIGN KEY ("topic_id") REFERENCES "public"."topics"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
45
|
+
ALTER TABLE "topic_documents" ADD CONSTRAINT "topic_documents_user_id_users_id_fk" FOREIGN KEY ("user_id") REFERENCES "public"."users"("id") ON DELETE cascade ON UPDATE no action;--> statement-breakpoint
|
46
|
+
CREATE INDEX "documents_source_idx" ON "documents" USING btree ("source");--> statement-breakpoint
|
47
|
+
CREATE INDEX "documents_file_type_idx" ON "documents" USING btree ("file_type");--> statement-breakpoint
|
48
|
+
CREATE INDEX "documents_file_id_idx" ON "documents" USING btree ("file_id");--> statement-breakpoint
|
49
|
+
CREATE UNIQUE INDEX "documents_client_id_user_id_unique" ON "documents" USING btree ("client_id","user_id");
|