@minded-ai/mindedjs 3.0.7 → 3.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +2 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +6 -3
- package/dist/index.js.map +1 -1
- package/dist/internalTools/documentExtraction/documentExtraction.d.ts +112 -102
- package/dist/internalTools/documentExtraction/documentExtraction.d.ts.map +1 -1
- package/dist/internalTools/documentExtraction/documentExtraction.js +146 -705
- package/dist/internalTools/documentExtraction/documentExtraction.js.map +1 -1
- package/dist/internalTools/documentExtraction/extractStructuredData.d.ts +57 -0
- package/dist/internalTools/documentExtraction/extractStructuredData.d.ts.map +1 -0
- package/dist/internalTools/documentExtraction/extractStructuredData.js +121 -0
- package/dist/internalTools/documentExtraction/extractStructuredData.js.map +1 -0
- package/dist/internalTools/documentExtraction/parseDocumentLocal.d.ts +16 -0
- package/dist/internalTools/documentExtraction/parseDocumentLocal.d.ts.map +1 -0
- package/dist/internalTools/documentExtraction/parseDocumentLocal.js +547 -0
- package/dist/internalTools/documentExtraction/parseDocumentLocal.js.map +1 -0
- package/dist/internalTools/documentExtraction/parseDocumentManaged.d.ts +13 -0
- package/dist/internalTools/documentExtraction/parseDocumentManaged.d.ts.map +1 -0
- package/dist/internalTools/documentExtraction/parseDocumentManaged.js +150 -0
- package/dist/internalTools/documentExtraction/parseDocumentManaged.js.map +1 -0
- package/dist/nodes/addAppToolNode.d.ts.map +1 -1
- package/dist/nodes/addAppToolNode.js +20 -1
- package/dist/nodes/addAppToolNode.js.map +1 -1
- package/dist/toolsLibrary/classifier.d.ts +2 -2
- package/dist/toolsLibrary/parseDocument.d.ts +11 -10
- package/dist/toolsLibrary/parseDocument.d.ts.map +1 -1
- package/dist/toolsLibrary/parseDocument.js +33 -189
- package/dist/toolsLibrary/parseDocument.js.map +1 -1
- package/dist/types/Flows.types.d.ts +1 -0
- package/dist/types/Flows.types.d.ts.map +1 -1
- package/dist/types/Flows.types.js.map +1 -1
- package/dist/utils/schemaUtils.js +1 -1
- package/dist/utils/schemaUtils.js.map +1 -1
- package/docs/tooling/document-processing.md +235 -174
- package/package.json +3 -2
- package/src/index.ts +2 -1
- package/src/internalTools/documentExtraction/documentExtraction.ts +184 -767
- package/src/internalTools/documentExtraction/extractStructuredData.ts +140 -0
- package/src/internalTools/documentExtraction/parseDocumentLocal.ts +660 -0
- package/src/internalTools/documentExtraction/parseDocumentManaged.ts +152 -0
- package/src/nodes/addAppToolNode.ts +30 -7
- package/src/toolsLibrary/parseDocument.ts +38 -206
- package/src/types/Flows.types.ts +1 -0
- package/src/utils/schemaUtils.ts +1 -1
|
@@ -1,804 +1,221 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { ZodType } from 'zod';
|
|
2
2
|
import { BaseLanguageModel } from '@langchain/core/language_models/base';
|
|
3
|
-
import
|
|
4
|
-
import
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
7
|
-
import { DocumentExtractionOptions, DocumentProcessingResult, DocumentProcessorConfig, SUPPORTED_DOCUMENT_TYPES } from './types';
|
|
3
|
+
import { extractStructuredDataFromString } from './extractStructuredData';
|
|
4
|
+
import { DocumentProcessResponse } from '../../platform/mindedConnectionTypes';
|
|
5
|
+
import { parseDocumentWithManagedService } from './parseDocumentManaged';
|
|
6
|
+
import { parseDocumentWithLocalService } from './parseDocumentLocal';
|
|
8
7
|
|
|
9
8
|
/**
|
|
10
|
-
*
|
|
9
|
+
* Document processing mode
|
|
10
|
+
*/
|
|
11
|
+
export enum DocumentProcessingMode {
|
|
12
|
+
/** Process documents using Minded cloud service (default) */
|
|
13
|
+
MANAGED = 'managed',
|
|
14
|
+
/** Process documents locally using LlamaCloud */
|
|
15
|
+
LOCAL = 'local',
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Parse document and extract structured data using AI.
|
|
20
|
+
*
|
|
21
|
+
* This function provides a flexible way to process documents with optional AI-powered extraction:
|
|
22
|
+
* - Raw text extraction: Parse document without LLM processing
|
|
23
|
+
* - Structured extraction with schema: Extract data matching a Zod schema
|
|
24
|
+
* - Structured extraction with prompt: Guide extraction using custom prompts
|
|
25
|
+
* - Processing modes: Use DocumentProcessingMode.LOCAL (requires LlamaCloud API key) or DocumentProcessingMode.MANAGED (backend service)
|
|
11
26
|
*
|
|
12
|
-
*
|
|
13
|
-
*
|
|
14
|
-
* -
|
|
15
|
-
*
|
|
16
|
-
* -
|
|
27
|
+
* @param options - Document processing options
|
|
28
|
+
* @param options.documentSource - URL or file path to the document
|
|
29
|
+
* @param options.processingMode - Document parsing mode: DocumentProcessingMode.MANAGED (default, backend service) or DocumentProcessingMode.LOCAL (requires llamaCloudApiKey)
|
|
30
|
+
* @param options.sessionId - Unique session identifier for logging and tracking
|
|
31
|
+
* @param options.llamaCloudApiKey - LlamaCloud API key for local processing. Required when processingMode is DocumentProcessingMode.LOCAL. Can be provided as parameter or via LLAMA_CLOUD_API_KEY environment variable
|
|
32
|
+
* @param options.returnStructuredOutput - Whether to extract structured data using LLM (true) or return raw text only (false). Defaults to false.
|
|
33
|
+
* @param options.llm - Language model instance for AI-powered extraction. Required when returnStructuredOutput is true
|
|
34
|
+
* @param options.outputSchema - Optional Zod schema defining the structure of extracted data
|
|
35
|
+
* @param options.outputSchemaPrompt - Optional prompt to guide the llm how to extract the data
|
|
17
36
|
*
|
|
18
|
-
* @
|
|
19
|
-
*
|
|
20
|
-
*
|
|
21
|
-
*
|
|
22
|
-
* @param options.documentUrl - URL to fetch the document from
|
|
23
|
-
* @param options.schema - Optional Zod schema for structured data extraction
|
|
24
|
-
* @param options.systemPrompt - Optional prompt for guiding extraction
|
|
25
|
-
* @param options.config - Optional document processor configuration
|
|
37
|
+
* @returns Promise resolving to an object containing:
|
|
38
|
+
* - rawContent: The raw extracted text from the document
|
|
39
|
+
* - structuredContent: AI-extracted structured data (if returnStructuredOutput is true)
|
|
40
|
+
* - metadata: Document metadata from processing
|
|
26
41
|
*
|
|
27
|
-
* @
|
|
42
|
+
* @throws {Error} If documentSource is not provided
|
|
43
|
+
* @throws {Error} If returnStructuredOutput is true but llm is not provided
|
|
44
|
+
* @throws {Error} If document parsing or extraction fails
|
|
28
45
|
*
|
|
29
46
|
* @example
|
|
30
47
|
* ```typescript
|
|
31
|
-
* import {
|
|
48
|
+
* import { parseDocumentAndExtractStructuredData, DocumentProcessingMode } from '@minded-ai/mindedjs';
|
|
32
49
|
* import { z } from 'zod';
|
|
33
50
|
*
|
|
34
|
-
* //
|
|
35
|
-
* const result1 = await
|
|
51
|
+
* // Parse document and extract structured data using a schema
|
|
52
|
+
* const result1 = await parseDocumentAndExtractStructuredData({
|
|
53
|
+
* documentSource: './invoice.pdf',
|
|
54
|
+
* processingMode: DocumentProcessingMode.MANAGED,
|
|
55
|
+
* sessionId: state.sessionId,
|
|
56
|
+
* returnStructuredOutput: true,
|
|
36
57
|
* llm: agent.llm,
|
|
37
|
-
*
|
|
38
|
-
* schema: z.object({
|
|
58
|
+
* outputSchema: z.object({
|
|
39
59
|
* invoiceNumber: z.string(),
|
|
40
|
-
*
|
|
41
|
-
*
|
|
60
|
+
* totalAmount: z.number(),
|
|
61
|
+
* date: z.string(),
|
|
62
|
+
* }),
|
|
42
63
|
* });
|
|
43
64
|
*
|
|
44
|
-
* //
|
|
45
|
-
* const
|
|
46
|
-
*
|
|
47
|
-
*
|
|
48
|
-
*
|
|
49
|
-
*
|
|
50
|
-
*
|
|
51
|
-
* // Extract raw text without LLM
|
|
52
|
-
* const result3 = await extractFromDocument({
|
|
53
|
-
* documentPath: './document.pdf'
|
|
65
|
+
* // Parse document only
|
|
66
|
+
* const result3 = await parseDocumentAndExtractStructuredData({
|
|
67
|
+
* documentSource: './document.pdf',
|
|
68
|
+
* processingMode: DocumentProcessingMode.MANAGED,
|
|
69
|
+
* sessionId: state.sessionId,
|
|
70
|
+
* returnStructuredOutput: false,
|
|
54
71
|
* });
|
|
55
72
|
* ```
|
|
56
73
|
*/
|
|
57
|
-
export async function
|
|
74
|
+
export async function parseDocumentAndExtractStructuredData<T extends Record<string, any>>({
|
|
75
|
+
documentSource,
|
|
76
|
+
processingMode,
|
|
77
|
+
sessionId,
|
|
78
|
+
llamaCloudApiKey,
|
|
79
|
+
returnStructuredOutput = false,
|
|
80
|
+
llm,
|
|
81
|
+
outputSchema,
|
|
82
|
+
outputSchemaPrompt,
|
|
83
|
+
}: {
|
|
84
|
+
documentSource: string;
|
|
85
|
+
processingMode?: DocumentProcessingMode;
|
|
86
|
+
sessionId: string;
|
|
87
|
+
llamaCloudApiKey?: string;
|
|
88
|
+
returnStructuredOutput: boolean;
|
|
58
89
|
llm?: BaseLanguageModel;
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
documentUrl?: string;
|
|
62
|
-
schema?: ZodSchema<T>;
|
|
63
|
-
systemPrompt?: string;
|
|
64
|
-
config?: DocumentProcessorConfig;
|
|
90
|
+
outputSchema?: ZodType<T>;
|
|
91
|
+
outputSchemaPrompt?: string;
|
|
65
92
|
}): Promise<{
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
fileType: string;
|
|
70
|
-
processingTime: number;
|
|
71
|
-
contentLength: number;
|
|
72
|
-
};
|
|
93
|
+
rawContent?: string;
|
|
94
|
+
structuredContent?: T | string;
|
|
95
|
+
metadata?: DocumentProcessResponse['metadata'];
|
|
73
96
|
}> {
|
|
74
|
-
//
|
|
75
|
-
const
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
documentContent: options.documentContent,
|
|
81
|
-
documentUrl: options.documentUrl,
|
|
82
|
-
schema: options.schema,
|
|
83
|
-
systemPrompt: options.systemPrompt,
|
|
97
|
+
// Parse document
|
|
98
|
+
const result = await parseDocument({
|
|
99
|
+
documentSource,
|
|
100
|
+
processingMode,
|
|
101
|
+
sessionId,
|
|
102
|
+
llamaCloudApiKey,
|
|
84
103
|
});
|
|
85
|
-
}
|
|
86
|
-
|
|
87
|
-
/**
|
|
88
|
-
* Generic document processor that can extract structured data from various document types
|
|
89
|
-
* including images, PDFs, Word documents, spreadsheets, and more.
|
|
90
|
-
*/
|
|
91
|
-
export class DocumentProcessor {
|
|
92
|
-
private config: DocumentProcessorConfig;
|
|
93
|
-
private llm: BaseLanguageModel | null = null;
|
|
94
|
-
private llamaCloudApiKey: string | null = null;
|
|
95
|
-
private sharpModule: any = null;
|
|
96
|
-
private sharpLoadAttempted = false;
|
|
97
|
-
|
|
98
|
-
constructor(config: DocumentProcessorConfig = {}, llm?: BaseLanguageModel) {
|
|
99
|
-
this.config = {
|
|
100
|
-
maxImageWidth: 1200,
|
|
101
|
-
imageQuality: 85,
|
|
102
|
-
useBase64: false,
|
|
103
|
-
...config,
|
|
104
|
-
};
|
|
105
|
-
|
|
106
|
-
this.llm = llm || null;
|
|
107
|
-
this.llamaCloudApiKey = this.config.llamaCloudApiKey || process.env.LLAMA_CLOUD_API_KEY || null;
|
|
108
|
-
}
|
|
109
|
-
|
|
110
|
-
/**
|
|
111
|
-
* Parse document using LlamaCloud REST API
|
|
112
|
-
*/
|
|
113
|
-
private async parseWithLlamaCloud(filePath: string): Promise<string | null> {
|
|
114
|
-
if (!this.llamaCloudApiKey) {
|
|
115
|
-
return null;
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
try {
|
|
119
|
-
// Step 1: Upload file and start parsing
|
|
120
|
-
const fileContent = fs.readFileSync(filePath);
|
|
121
|
-
const fileName = path.basename(filePath);
|
|
122
|
-
const mimeType = this.getMimeType(path.extname(filePath));
|
|
123
|
-
|
|
124
|
-
const formData = new FormData();
|
|
125
|
-
const blob = new Blob([fileContent], { type: mimeType });
|
|
126
|
-
formData.append('file', blob, fileName);
|
|
127
|
-
formData.append('premium_mode', 'true');
|
|
128
|
-
|
|
129
|
-
const uploadResponse = await fetch('https://api.cloud.llamaindex.ai/api/v1/parsing/upload', {
|
|
130
|
-
method: 'POST',
|
|
131
|
-
headers: {
|
|
132
|
-
Accept: 'application/json',
|
|
133
|
-
Authorization: `Bearer ${this.llamaCloudApiKey}`,
|
|
134
|
-
},
|
|
135
|
-
body: formData,
|
|
136
|
-
});
|
|
137
|
-
|
|
138
|
-
if (!uploadResponse.ok) {
|
|
139
|
-
const errorText = await uploadResponse.text();
|
|
140
|
-
throw new Error(`Failed to upload file: ${uploadResponse.status} - ${errorText}`);
|
|
141
|
-
}
|
|
142
|
-
|
|
143
|
-
const uploadResult = await uploadResponse.json();
|
|
144
|
-
const jobId = uploadResult.id || uploadResult.job_id;
|
|
145
|
-
|
|
146
|
-
if (!jobId) {
|
|
147
|
-
throw new Error('No job ID returned from upload');
|
|
148
|
-
}
|
|
149
|
-
|
|
150
|
-
logger.info({
|
|
151
|
-
msg: '[DocumentProcessor] File uploaded to LlamaCloud',
|
|
152
|
-
jobId,
|
|
153
|
-
fileName,
|
|
154
|
-
});
|
|
155
|
-
|
|
156
|
-
// Step 2: Poll for job completion
|
|
157
|
-
let attempts = 0;
|
|
158
|
-
const maxAttempts = 60; // 60 attempts with 2 second delay = 2 minutes max
|
|
159
|
-
const pollDelay = 2000; // 2 seconds
|
|
160
|
-
|
|
161
|
-
while (attempts < maxAttempts) {
|
|
162
|
-
const statusResponse = await fetch(`https://api.cloud.llamaindex.ai/api/v1/parsing/job/${jobId}`, {
|
|
163
|
-
method: 'GET',
|
|
164
|
-
headers: {
|
|
165
|
-
Accept: 'application/json',
|
|
166
|
-
Authorization: `Bearer ${this.llamaCloudApiKey}`,
|
|
167
|
-
},
|
|
168
|
-
});
|
|
169
|
-
|
|
170
|
-
if (!statusResponse.ok) {
|
|
171
|
-
throw new Error(`Failed to check job status: ${statusResponse.status}`);
|
|
172
|
-
}
|
|
173
|
-
|
|
174
|
-
const statusResult = await statusResponse.json();
|
|
175
|
-
const status = statusResult.status || statusResult.job_status;
|
|
176
|
-
|
|
177
|
-
if (status === 'SUCCESS' || status === 'COMPLETED' || status === 'completed') {
|
|
178
|
-
// Step 3: Retrieve results in Markdown
|
|
179
|
-
|
|
180
|
-
// Create an AbortController for timeout
|
|
181
|
-
const controller = new AbortController();
|
|
182
|
-
const timeout = setTimeout(() => controller.abort(), 20000); // 20 second timeout
|
|
183
|
-
|
|
184
|
-
let resultResponse;
|
|
185
|
-
try {
|
|
186
|
-
resultResponse = await fetch(`https://api.cloud.llamaindex.ai/api/v1/parsing/job/${jobId}/result/markdown`, {
|
|
187
|
-
method: 'GET',
|
|
188
|
-
headers: {
|
|
189
|
-
Accept: 'application/json',
|
|
190
|
-
Authorization: `Bearer ${this.llamaCloudApiKey}`,
|
|
191
|
-
},
|
|
192
|
-
signal: controller.signal,
|
|
193
|
-
});
|
|
194
|
-
} catch (fetchError) {
|
|
195
|
-
clearTimeout(timeout);
|
|
196
|
-
if (fetchError instanceof Error && fetchError.name === 'AbortError') {
|
|
197
|
-
throw new Error('Timeout fetching results from LlamaCloud after 20 seconds');
|
|
198
|
-
}
|
|
199
|
-
throw fetchError;
|
|
200
|
-
}
|
|
201
104
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
if (!resultResponse.ok) {
|
|
205
|
-
const errorText = await resultResponse.text();
|
|
206
|
-
throw new Error(`Failed to retrieve results: ${resultResponse.status} - ${errorText}`);
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
let resultData: any;
|
|
210
|
-
try {
|
|
211
|
-
// Read response using manual stream reading (more reliable than text())
|
|
212
|
-
let responseText;
|
|
213
|
-
if (resultResponse.body) {
|
|
214
|
-
const reader = resultResponse.body.getReader();
|
|
215
|
-
const chunks: Uint8Array[] = [];
|
|
216
|
-
let totalLength = 0;
|
|
217
|
-
|
|
218
|
-
try {
|
|
219
|
-
while (true) {
|
|
220
|
-
const { done, value } = await reader.read();
|
|
221
|
-
if (done) break;
|
|
222
|
-
if (value) {
|
|
223
|
-
chunks.push(value);
|
|
224
|
-
totalLength += value.length;
|
|
225
|
-
}
|
|
226
|
-
}
|
|
227
|
-
|
|
228
|
-
// Combine chunks
|
|
229
|
-
const combined = new Uint8Array(totalLength);
|
|
230
|
-
let offset = 0;
|
|
231
|
-
for (const chunk of chunks) {
|
|
232
|
-
combined.set(chunk, offset);
|
|
233
|
-
offset += chunk.length;
|
|
234
|
-
}
|
|
235
|
-
|
|
236
|
-
responseText = new TextDecoder().decode(combined);
|
|
237
|
-
} finally {
|
|
238
|
-
reader.releaseLock();
|
|
239
|
-
}
|
|
240
|
-
} else {
|
|
241
|
-
responseText = await resultResponse.text();
|
|
242
|
-
}
|
|
243
|
-
|
|
244
|
-
// Try to parse as JSON, but if it fails, use the text directly
|
|
245
|
-
try {
|
|
246
|
-
resultData = JSON.parse(responseText);
|
|
247
|
-
} catch {
|
|
248
|
-
// If it's not JSON, assume it's the markdown content directly
|
|
249
|
-
resultData = responseText;
|
|
250
|
-
}
|
|
251
|
-
} catch (textError) {
|
|
252
|
-
logger.error({
|
|
253
|
-
msg: '[DocumentProcessor] Failed to read response text',
|
|
254
|
-
jobId,
|
|
255
|
-
error: textError instanceof Error ? textError.message : String(textError),
|
|
256
|
-
stack: textError instanceof Error ? textError.stack : undefined,
|
|
257
|
-
});
|
|
258
|
-
throw new Error('Failed to read response from LlamaCloud');
|
|
259
|
-
}
|
|
260
|
-
|
|
261
|
-
logger.debug({
|
|
262
|
-
msg: '[DocumentProcessor] Result data structure',
|
|
263
|
-
jobId,
|
|
264
|
-
dataType: typeof resultData,
|
|
265
|
-
keys: typeof resultData === 'object' && resultData !== null ? Object.keys(resultData) : [],
|
|
266
|
-
hasMarkdown: typeof resultData === 'object' && 'markdown' in resultData,
|
|
267
|
-
hasContent: typeof resultData === 'object' && 'content' in resultData,
|
|
268
|
-
hasText: typeof resultData === 'object' && 'text' in resultData,
|
|
269
|
-
});
|
|
270
|
-
|
|
271
|
-
// The API might return the markdown directly as a string or nested in an object
|
|
272
|
-
let markdownContent: string;
|
|
273
|
-
if (typeof resultData === 'string') {
|
|
274
|
-
markdownContent = resultData;
|
|
275
|
-
} else {
|
|
276
|
-
markdownContent = resultData.markdown || resultData.content || resultData.text || '';
|
|
277
|
-
}
|
|
278
|
-
|
|
279
|
-
if (!markdownContent) {
|
|
280
|
-
logger.error({
|
|
281
|
-
msg: '[DocumentProcessor] No content in result',
|
|
282
|
-
jobId,
|
|
283
|
-
resultData: JSON.stringify(resultData).substring(0, 500),
|
|
284
|
-
});
|
|
285
|
-
throw new Error('No content returned from parsing');
|
|
286
|
-
}
|
|
287
|
-
|
|
288
|
-
logger.info({
|
|
289
|
-
msg: '[DocumentProcessor] Successfully parsed document with LlamaCloud',
|
|
290
|
-
jobId,
|
|
291
|
-
contentLength: markdownContent.length,
|
|
292
|
-
preview: markdownContent.substring(0, 100),
|
|
293
|
-
});
|
|
294
|
-
|
|
295
|
-
logger.debug({
|
|
296
|
-
msg: '[DocumentProcessor] About to return markdown content',
|
|
297
|
-
jobId,
|
|
298
|
-
});
|
|
299
|
-
|
|
300
|
-
return markdownContent;
|
|
301
|
-
} else if (status === 'FAILED' || status === 'ERROR' || status === 'failed') {
|
|
302
|
-
throw new Error(`Parsing job failed: ${statusResult.error || 'Unknown error'}`);
|
|
303
|
-
}
|
|
304
|
-
|
|
305
|
-
// Wait before next attempt
|
|
306
|
-
await new Promise((resolve) => setTimeout(resolve, pollDelay));
|
|
307
|
-
attempts++;
|
|
308
|
-
}
|
|
309
|
-
|
|
310
|
-
throw new Error('Parsing job timed out after 2 minutes');
|
|
311
|
-
} catch (err) {
|
|
312
|
-
logger.warn({
|
|
313
|
-
message: '[DocumentProcessor] LlamaCloud parsing failed',
|
|
314
|
-
err,
|
|
315
|
-
});
|
|
316
|
-
return null;
|
|
317
|
-
} finally {
|
|
318
|
-
logger.debug({
|
|
319
|
-
msg: '[DocumentProcessor] parseWithLlamaCloud finished',
|
|
320
|
-
filePath,
|
|
321
|
-
});
|
|
322
|
-
}
|
|
105
|
+
if (!returnStructuredOutput || !result.rawContent) {
|
|
106
|
+
return result;
|
|
323
107
|
}
|
|
324
108
|
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
*/
|
|
328
|
-
private getMimeType(fileExtension: string): string {
|
|
329
|
-
const mimeTypes: { [key: string]: string } = {
|
|
330
|
-
'.pdf': 'application/pdf',
|
|
331
|
-
'.doc': 'application/msword',
|
|
332
|
-
'.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
|
|
333
|
-
'.txt': 'text/plain',
|
|
334
|
-
'.rtf': 'application/rtf',
|
|
335
|
-
'.jpg': 'image/jpeg',
|
|
336
|
-
'.jpeg': 'image/jpeg',
|
|
337
|
-
'.png': 'image/png',
|
|
338
|
-
'.gif': 'image/gif',
|
|
339
|
-
'.bmp': 'image/bmp',
|
|
340
|
-
'.webp': 'image/webp',
|
|
341
|
-
'.tiff': 'image/tiff',
|
|
342
|
-
'.xls': 'application/vnd.ms-excel',
|
|
343
|
-
'.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
|
344
|
-
'.csv': 'text/csv',
|
|
345
|
-
'.html': 'text/html',
|
|
346
|
-
'.htm': 'text/html',
|
|
347
|
-
'.xml': 'application/xml',
|
|
348
|
-
'.md': 'text/markdown',
|
|
349
|
-
};
|
|
350
|
-
|
|
351
|
-
return mimeTypes[fileExtension.toLowerCase()] || 'application/octet-stream';
|
|
109
|
+
if (!llm) {
|
|
110
|
+
throw new Error('LLM instance is required when returnStructuredOutput is true. Please provide an LLM instance.');
|
|
352
111
|
}
|
|
353
112
|
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
hasDocumentContent: !!options.documentContent,
|
|
363
|
-
hasLLM: !!this.llm,
|
|
364
|
-
hasSchema: !!options.schema,
|
|
365
|
-
});
|
|
366
|
-
|
|
367
|
-
const startTime = Date.now();
|
|
368
|
-
|
|
369
|
-
try {
|
|
370
|
-
logger.debug({
|
|
371
|
-
msg: '[DocumentProcessor] Getting document content',
|
|
372
|
-
});
|
|
373
|
-
|
|
374
|
-
// Determine document source and content
|
|
375
|
-
const { content, fileType, fileSize } = await this.getDocumentContent(options);
|
|
376
|
-
|
|
377
|
-
// Process document content based on type
|
|
378
|
-
let processedContent: string;
|
|
379
|
-
|
|
380
|
-
if (this.isImageFile(fileType)) {
|
|
381
|
-
processedContent = await this.processImageDocument(content as Buffer, fileType, options.documentPath);
|
|
382
|
-
} else {
|
|
383
|
-
processedContent = await this.processTextDocument(content, options.documentPath, fileType);
|
|
384
|
-
}
|
|
385
|
-
|
|
386
|
-
logger.info({
|
|
387
|
-
msg: '[DocumentProcessor] Document content processed',
|
|
388
|
-
fileType,
|
|
389
|
-
contentLength: processedContent.length,
|
|
390
|
-
});
|
|
391
|
-
|
|
392
|
-
const processingTime = Date.now() - startTime;
|
|
393
|
-
|
|
394
|
-
// If no LLM is provided, return the raw text content
|
|
395
|
-
if (!this.llm) {
|
|
396
|
-
return {
|
|
397
|
-
data: processedContent as T,
|
|
398
|
-
metadata: {
|
|
399
|
-
fileSize,
|
|
400
|
-
fileType,
|
|
401
|
-
processingTime,
|
|
402
|
-
contentLength: processedContent.length,
|
|
403
|
-
},
|
|
404
|
-
};
|
|
405
|
-
}
|
|
406
|
-
|
|
407
|
-
// If LLM is provided, extract data (structured or unstructured)
|
|
408
|
-
const extractedData = await this.extractStructuredData<T>(processedContent, options.schema, options.llmConfig, options.systemPrompt);
|
|
113
|
+
// Extract structured data from the parsed document
|
|
114
|
+
const structuredContent = await extractStructuredDataFromString<T>({
|
|
115
|
+
content: result.rawContent,
|
|
116
|
+
llm,
|
|
117
|
+
schema: outputSchema,
|
|
118
|
+
prompt: outputSchemaPrompt,
|
|
119
|
+
sessionId,
|
|
120
|
+
});
|
|
409
121
|
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
processingTime,
|
|
416
|
-
contentLength: processedContent.length,
|
|
417
|
-
},
|
|
418
|
-
};
|
|
419
|
-
} catch (err) {
|
|
420
|
-
logger.error({
|
|
421
|
-
message: '[DocumentProcessor] Document processing failed',
|
|
422
|
-
err,
|
|
423
|
-
});
|
|
424
|
-
throw new Error(`Document processing failed: ${err instanceof Error ? err.message : String(err)}`);
|
|
425
|
-
}
|
|
426
|
-
}
|
|
122
|
+
return {
|
|
123
|
+
...result,
|
|
124
|
+
structuredContent,
|
|
125
|
+
};
|
|
126
|
+
}
|
|
427
127
|
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
128
|
+
/**
|
|
129
|
+
* Parse document and extract raw text content.
|
|
130
|
+
*
|
|
131
|
+
* This function processes various document types (PDFs, images, Word docs, etc.) and extracts
|
|
132
|
+
* raw text content using either local processing with LlamaCloud or managed backend service.
|
|
133
|
+
* Use this for raw text extraction without AI-powered data extraction.
|
|
134
|
+
*
|
|
135
|
+
* @param options - Document parsing options
|
|
136
|
+
* @param options.documentSource - URL or file path to the document
|
|
137
|
+
* @param options.processingMode - Parsing mode: DocumentProcessingMode.LOCAL (requires llamaCloudApiKey) or DocumentProcessingMode.MANAGED (backend service, default)
|
|
138
|
+
* @param options.sessionId - Unique session identifier for logging and tracking
|
|
139
|
+
* @param options.llamaCloudApiKey - LlamaCloud API key for local processing. Required when processingMode is DocumentProcessingMode.LOCAL. Can be provided as parameter or via LLAMA_CLOUD_API_KEY environment variable
|
|
140
|
+
*
|
|
141
|
+
* @returns Promise resolving to an object containing:
|
|
142
|
+
* - rawContent: The raw extracted text from the document
|
|
143
|
+
* - metadata: Document processing metadata (file size, type, processing time, content length)
|
|
144
|
+
*
|
|
145
|
+
* @throws {Error} If documentSource is not provided
|
|
146
|
+
* @throws {Error} If document processing fails
|
|
147
|
+
*
|
|
148
|
+
* @example
|
|
149
|
+
* ```typescript
|
|
150
|
+
* import { parseDocument, DocumentProcessingMode } from '@minded-ai/mindedjs';
|
|
151
|
+
*
|
|
152
|
+
* // Parse document using managed service
|
|
153
|
+
* const result1 = await parseDocument({
|
|
154
|
+
* documentSource: 'https://example.com/invoice.pdf',
|
|
155
|
+
* processingMode: DocumentProcessingMode.MANAGED,
|
|
156
|
+
* sessionId: state.sessionId,
|
|
157
|
+
* });
|
|
158
|
+
* // result1: { rawContent: "Invoice text...", metadata: {...} }
|
|
159
|
+
*
|
|
160
|
+
* // Parse local document using LlamaCloud
|
|
161
|
+
* const result2 = await parseDocument({
|
|
162
|
+
* documentSource: './contract.pdf',
|
|
163
|
+
* processingMode: DocumentProcessingMode.LOCAL,
|
|
164
|
+
* sessionId: state.sessionId,
|
|
165
|
+
* llamaCloudApiKey: process.env.LLAMA_CLOUD_API_KEY,
|
|
166
|
+
* });
|
|
167
|
+
* // result2: { rawContent: "Contract text...", metadata: {...} }
|
|
168
|
+
* ```
|
|
169
|
+
*/
|
|
170
|
+
export async function parseDocument({
|
|
171
|
+
documentSource,
|
|
172
|
+
processingMode = DocumentProcessingMode.MANAGED,
|
|
173
|
+
sessionId,
|
|
174
|
+
llamaCloudApiKey,
|
|
175
|
+
}: {
|
|
176
|
+
documentSource: string;
|
|
177
|
+
processingMode?: DocumentProcessingMode;
|
|
178
|
+
sessionId: string;
|
|
179
|
+
llamaCloudApiKey?: string;
|
|
180
|
+
}): Promise<{
|
|
181
|
+
rawContent?: string;
|
|
182
|
+
metadata?: {
|
|
434
183
|
fileSize?: number;
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
// From file path
|
|
444
|
-
if (options.documentPath) {
|
|
445
|
-
if (!fs.existsSync(options.documentPath)) {
|
|
446
|
-
throw new Error(`Document not found: ${options.documentPath}`);
|
|
447
|
-
}
|
|
448
|
-
|
|
449
|
-
const content = fs.readFileSync(options.documentPath);
|
|
450
|
-
const fileType = path.extname(options.documentPath).toLowerCase();
|
|
451
|
-
|
|
452
|
-
return {
|
|
453
|
-
content,
|
|
454
|
-
fileType,
|
|
455
|
-
fileSize: content.length,
|
|
456
|
-
};
|
|
457
|
-
}
|
|
458
|
-
|
|
459
|
-
// From provided content
|
|
460
|
-
if (options.documentContent) {
|
|
461
|
-
// Try to infer file type from content if it's a buffer
|
|
462
|
-
let fileType = '.unknown';
|
|
463
|
-
if (Buffer.isBuffer(options.documentContent)) {
|
|
464
|
-
fileType = this.inferFileTypeFromBuffer(options.documentContent);
|
|
465
|
-
} else if (typeof options.documentContent === 'string') {
|
|
466
|
-
fileType = '.txt'; // Assume text content
|
|
467
|
-
}
|
|
468
|
-
|
|
469
|
-
return {
|
|
470
|
-
content: options.documentContent,
|
|
471
|
-
fileType,
|
|
472
|
-
fileSize: Buffer.isBuffer(options.documentContent) ? options.documentContent.length : Buffer.byteLength(options.documentContent),
|
|
473
|
-
};
|
|
474
|
-
}
|
|
475
|
-
|
|
476
|
-
// From URL
|
|
477
|
-
if (options.documentUrl) {
|
|
478
|
-
logger.debug({
|
|
479
|
-
msg: '[DocumentProcessor] Fetching document from URL',
|
|
480
|
-
url: options.documentUrl,
|
|
481
|
-
});
|
|
482
|
-
|
|
483
|
-
const response = await fetch(options.documentUrl);
|
|
484
|
-
|
|
485
|
-
logger.debug({
|
|
486
|
-
msg: '[DocumentProcessor] URL fetch response',
|
|
487
|
-
status: response.status,
|
|
488
|
-
ok: response.ok,
|
|
489
|
-
});
|
|
490
|
-
|
|
491
|
-
if (!response.ok) {
|
|
492
|
-
throw new Error(`Failed to fetch document from URL: ${response.statusText}`);
|
|
493
|
-
}
|
|
494
|
-
|
|
495
|
-
const arrayBuffer = await response.arrayBuffer();
|
|
496
|
-
const content = Buffer.from(arrayBuffer);
|
|
497
|
-
const fileType = this.inferFileTypeFromUrl(options.documentUrl) || this.inferFileTypeFromBuffer(content);
|
|
498
|
-
|
|
499
|
-
logger.debug({
|
|
500
|
-
msg: '[DocumentProcessor] Document fetched from URL',
|
|
501
|
-
contentSize: content.length,
|
|
502
|
-
fileType,
|
|
503
|
-
});
|
|
504
|
-
|
|
505
|
-
return {
|
|
506
|
-
content,
|
|
507
|
-
fileType,
|
|
508
|
-
fileSize: content.length,
|
|
509
|
-
};
|
|
510
|
-
}
|
|
511
|
-
|
|
512
|
-
throw new Error('No document source provided. Specify documentPath, documentContent, or documentUrl.');
|
|
513
|
-
}
|
|
514
|
-
|
|
515
|
-
/**
|
|
516
|
-
* Process image documents by converting them to a standardized format
|
|
517
|
-
*/
|
|
518
|
-
private async processImageDocument(content: Buffer, fileType: string, filePath?: string): Promise<string> {
|
|
519
|
-
try {
|
|
520
|
-
// First, try to use LlamaParser if available for text extraction
|
|
521
|
-
if (filePath && this.llamaCloudApiKey) {
|
|
522
|
-
logger.debug({
|
|
523
|
-
msg: '[DocumentProcessor] Calling parseWithLlamaCloud for image',
|
|
524
|
-
filePath,
|
|
525
|
-
});
|
|
526
|
-
const parsedContent = await this.parseWithLlamaCloud(filePath);
|
|
527
|
-
logger.debug({
|
|
528
|
-
msg: '[DocumentProcessor] parseWithLlamaCloud returned for image',
|
|
529
|
-
hasContent: !!parsedContent,
|
|
530
|
-
contentLength: parsedContent?.length,
|
|
531
|
-
});
|
|
532
|
-
if (parsedContent) {
|
|
533
|
-
return parsedContent;
|
|
534
|
-
}
|
|
535
|
-
}
|
|
536
|
-
|
|
537
|
-
// If no file path, create a temporary file for LlamaCloud parsing
|
|
538
|
-
if (!filePath && this.llamaCloudApiKey) {
|
|
539
|
-
const tempDir = os.tmpdir();
|
|
540
|
-
const tempFileName = `temp_${Date.now()}${fileType}`;
|
|
541
|
-
const tempFilePath = path.join(tempDir, tempFileName);
|
|
542
|
-
|
|
543
|
-
logger.debug({
|
|
544
|
-
msg: '[DocumentProcessor] Creating temp file for image',
|
|
545
|
-
tempFilePath,
|
|
546
|
-
contentSize: content.length,
|
|
547
|
-
});
|
|
548
|
-
|
|
549
|
-
try {
|
|
550
|
-
fs.writeFileSync(tempFilePath, content);
|
|
551
|
-
logger.debug({
|
|
552
|
-
msg: '[DocumentProcessor] Calling parseWithLlamaCloud for temp image',
|
|
553
|
-
tempFilePath,
|
|
554
|
-
});
|
|
555
|
-
const parsedContent = await this.parseWithLlamaCloud(tempFilePath);
|
|
556
|
-
logger.debug({
|
|
557
|
-
msg: '[DocumentProcessor] parseWithLlamaCloud returned for temp image',
|
|
558
|
-
hasContent: !!parsedContent,
|
|
559
|
-
contentLength: parsedContent?.length,
|
|
560
|
-
});
|
|
561
|
-
fs.unlinkSync(tempFilePath);
|
|
562
|
-
|
|
563
|
-
if (parsedContent) {
|
|
564
|
-
return parsedContent;
|
|
565
|
-
}
|
|
566
|
-
} catch (err) {
|
|
567
|
-
// Clean up temp file on error
|
|
568
|
-
if (fs.existsSync(tempFilePath)) {
|
|
569
|
-
fs.unlinkSync(tempFilePath);
|
|
570
|
-
}
|
|
571
|
-
logger.warn({ msg: '[DocumentProcessor] Failed to parse image with LlamaCloud', err });
|
|
572
|
-
}
|
|
573
|
-
}
|
|
574
|
-
|
|
575
|
-
// Fallback: Convert to image format for LLM processing
|
|
576
|
-
// For PDFs, convert first page to image
|
|
577
|
-
if (fileType === '.pdf') {
|
|
578
|
-
throw new Error('Failed to convert PDF to image');
|
|
579
|
-
}
|
|
580
|
-
|
|
581
|
-
logger.warn({
|
|
582
|
-
msg: '[DocumentProcessor] Sharp module not available. Using original image without optimization.',
|
|
583
|
-
fileType,
|
|
584
|
-
contentSize: content.length,
|
|
585
|
-
});
|
|
586
|
-
|
|
587
|
-
// If sharp is not available, use the original image
|
|
588
|
-
if (this.config.useBase64) {
|
|
589
|
-
// Return original image as base64
|
|
590
|
-
const base64Image = content.toString('base64');
|
|
591
|
-
const mimeType = this.getMimeType(fileType);
|
|
592
|
-
return `data:${mimeType};base64,${base64Image}`;
|
|
593
|
-
} else {
|
|
594
|
-
// Without sharp and without base64, we cannot process the image
|
|
595
|
-
return `[IMAGE CONTENT - ${fileType.toUpperCase()} file. Size: ${
|
|
596
|
-
content.length
|
|
597
|
-
} bytes. Consider using LLAMA_CLOUD_API_KEY for text extraction or set useBase64: true]`;
|
|
598
|
-
}
|
|
599
|
-
} catch (err) {
|
|
600
|
-
throw new Error(`Failed to process image document: ${err instanceof Error ? err.message : String(err)}`);
|
|
601
|
-
}
|
|
602
|
-
}
|
|
603
|
-
|
|
604
|
-
/**
 * Convert a text-based document into plain text.
 *
 * Strategy, in order:
 *  1. With a LlamaCloud API key and a `filePath`, parse the file via `parseWithLlamaCloud`.
 *     Errors from this call propagate to the caller.
 *  2. With a key, no `filePath` and Buffer content, write the buffer into a private temporary
 *     directory and parse that file. Failures here are logged and fall through to step 3.
 *  3. Fallback: string content is returned unchanged; simple text formats are decoded as UTF-8.
 *
 * @param content - Raw document content.
 * @param filePath - Optional path of the document on disk.
 * @param fileType - Optional file extension including the leading dot (e.g. `.pdf`).
 * @returns The extracted text.
 * @throws Error when binary content cannot be handled by the fallback path.
 */
private async processTextDocument(content: Buffer | string, filePath?: string, fileType?: string): Promise<string> {
  // Try LlamaCloud parsing if we have a file path
  if (filePath && this.llamaCloudApiKey) {
    const parsedContent = await this.parseWithLlamaCloud(filePath);
    if (parsedContent) {
      return parsedContent;
    }
  }

  // If no file path but we have content and LlamaCloud API key, create a temp file
  if (!filePath && this.llamaCloudApiKey && Buffer.isBuffer(content)) {
    let tempDir: string | undefined;

    try {
      // mkdtempSync appends random characters, so concurrent calls (even within the same
      // millisecond) never share a path and cannot delete each other's file.
      tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'temp_'));
      const tempFilePath = path.join(tempDir, `document${fileType || '.txt'}`);

      fs.writeFileSync(tempFilePath, content);
      const parsedContent = await this.parseWithLlamaCloud(tempFilePath);

      if (parsedContent) {
        return parsedContent;
      }
    } catch (err) {
      logger.warn({ msg: '[DocumentProcessor] Failed to parse text document with LlamaCloud', err });
    } finally {
      // Always remove the temp directory, on success and on failure alike.
      if (tempDir) {
        try {
          fs.rmSync(tempDir, { recursive: true, force: true });
        } catch (cleanupErr) {
          // Cleanup is best-effort; it must not mask the parse result.
          logger.warn({ msg: '[DocumentProcessor] Failed to remove temp directory', tempDir, err: cleanupErr });
        }
      }
    }
  }

  // Fallback: handle based on file type
  if (typeof content === 'string') {
    return content;
  }

  // For binary content, convert to text
  if (fileType === '.pdf' && !this.llamaCloudApiKey) {
    throw new Error('PDF processing requires LLAMA_CLOUD_API_KEY environment variable for LlamaParser');
  }

  // Basic text extraction for simple formats
  if (['.txt', '.md', '.html', '.htm', '.xml', '.csv'].includes(fileType || '')) {
    return content.toString('utf-8');
  }

  // For unsupported binary formats without LlamaParser
  throw new Error(`Unsupported document type ${fileType}. Please provide LLAMA_CLOUD_API_KEY for advanced document processing.`);
}
|
|
657
|
-
|
|
658
|
-
/**
 * Run the configured LLM over document text.
 *
 * With a `schema`, the LLM's `withStructuredOutput` is used and its result is returned.
 * Without one, the LLM's plain response is flattened to a single string and returned.
 *
 * @param content - Document text to hand to the LLM.
 * @param schema - Optional Zod schema describing the desired structured output.
 * @param llmConfig - Accepted for interface compatibility; not read by this method.
 * @param systemPrompt - Optional replacement for the built-in system prompt.
 * @returns The structured result, or the response text when no schema is given.
 * @throws Error when no LLM is configured, or (wrapped as "LLM extraction failed: ...")
 *         when the LLM call or the structured-output capability check fails.
 */
private async extractStructuredData<T>(
  content: string,
  schema?: ZodSchema<T> | ZodTypeAny,
  llmConfig?: { model?: string; temperature?: number },
  systemPrompt?: string,
): Promise<T> {
  if (!this.llm) {
    throw new Error('LLM instance is required for data extraction. Please provide an LLM when creating the DocumentProcessor.');
  }

  // llmConfig is intentionally unused: the supplied LLM instance is expected to
  // carry its own model and temperature settings.

  const fallbackPrompt = [
    'You are an expert data-extraction assistant.',
    'Extract the requested information from the provided document content.',
    'If you cannot find a value for a required field, use "N/A" or a descriptive placeholder.',
    'Be accurate and thorough in your extraction.',
  ].join(' ');
  const finalSystemPrompt = systemPrompt || fallbackPrompt;

  try {
    // No schema: ask for free text and flatten whatever shape the response content has.
    if (!schema) {
      const reply = await this.llm.invoke([
        { role: 'system', content: finalSystemPrompt },
        { role: 'user', content: `Please analyze and extract information from this document:\n\n${content}` },
      ]);

      const raw = reply.content;
      const textContent: string =
        typeof raw === 'string'
          ? raw
          : Array.isArray(raw) && raw.length > 0
            ? // Content blocks: keep strings as-is, take `.text` from objects, drop the rest.
              raw.map((block: any) => (typeof block === 'string' ? block : block.text || '')).join('\n')
            : String(raw);

      logger.debug({
        msg: '[DocumentProcessor] Unstructured data extraction completed',
        contentLength: textContent.length,
      });

      return textContent as T;
    }

    // Schema given: the LLM must expose withStructuredOutput.
    if (!('withStructuredOutput' in this.llm)) {
      throw new Error('The provided LLM does not support structured output. Please use a compatible LLM instance.');
    }

    const schemaBoundLlm = (this.llm as any).withStructuredOutput(schema as any);
    const extracted = await schemaBoundLlm.invoke([
      { role: 'system', content: finalSystemPrompt },
      { role: 'user', content: `Please extract the following information from this document:\n\n${content}` },
    ]);

    logger.debug({
      msg: '[DocumentProcessor] Structured data extraction completed',
      extractedData: JSON.stringify(extracted, null, 2),
    });

    return extracted as T;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`LLM extraction failed: ${reason}`);
  }
}
|
|
744
|
-
|
|
745
|
-
/**
 * Tell whether a file extension denotes one of the recognised image formats.
 *
 * @param fileType - File extension including the leading dot; compared case-insensitively.
 * @returns `true` for .jpg, .jpeg, .png, .gif, .bmp, .webp and .tiff, otherwise `false`.
 */
private isImageFile(fileType: string): boolean {
  switch (fileType.toLowerCase()) {
    case '.jpg':
    case '.jpeg':
    case '.png':
    case '.gif':
    case '.bmp':
    case '.webp':
    case '.tiff':
      return true;
    default:
      return false;
  }
}
|
|
752
|
-
|
|
753
|
-
/**
|
|
754
|
-
* Infer file type from buffer content
|
|
755
|
-
*/
|
|
756
|
-
private inferFileTypeFromBuffer(buffer: Buffer): string {
|
|
757
|
-
// Check common file signatures
|
|
758
|
-
const signatures: { [key: string]: string } = {
|
|
759
|
-
'89504E47': '.png',
|
|
760
|
-
FFD8FF: '.jpg',
|
|
761
|
-
'47494638': '.gif',
|
|
762
|
-
'25504446': '.pdf',
|
|
763
|
-
'504B0304': '.zip', // Also used by docx, xlsx, pptx
|
|
764
|
-
D0CF11E0: '.doc', // Also xls, ppt
|
|
765
|
-
};
|
|
766
|
-
|
|
767
|
-
const hex = buffer.toString('hex', 0, 4).toUpperCase();
|
|
768
|
-
|
|
769
|
-
for (const [signature, type] of Object.entries(signatures)) {
|
|
770
|
-
if (hex.startsWith(signature)) {
|
|
771
|
-
return type;
|
|
772
|
-
}
|
|
773
|
-
}
|
|
774
|
-
|
|
775
|
-
return '.unknown';
|
|
184
|
+
fileType: string;
|
|
185
|
+
processingTime: number;
|
|
186
|
+
contentLength: number;
|
|
187
|
+
};
|
|
188
|
+
}> {
|
|
189
|
+
if (!documentSource) {
|
|
190
|
+
throw new Error('documentSource is required - provide a URL or file path');
|
|
776
191
|
}
|
|
777
192
|
|
|
778
|
-
|
|
779
|
-
/**
 * Derive a file extension from the path component of a URL.
 *
 * @param url - Absolute URL string.
 * @returns The lower-cased extension including the leading dot, or `null` when the
 *          URL cannot be parsed or its path has no extension.
 */
private inferFileTypeFromUrl(url: string): string | null {
  let parsedUrl: URL;
  try {
    parsedUrl = new URL(url);
  } catch {
    // Not a parseable absolute URL.
    return null;
  }

  const ext = path.extname(parsedUrl.pathname).toLowerCase();
  return ext === '' ? null : ext;
}
|
|
193
|
+
const isDocumentUrl = isUrl(documentSource);
|
|
790
194
|
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
195
|
+
if (processingMode === DocumentProcessingMode.MANAGED) {
|
|
196
|
+
return parseDocumentWithManagedService({
|
|
197
|
+
documentSource,
|
|
198
|
+
isDocumentUrl,
|
|
199
|
+
sessionId,
|
|
200
|
+
});
|
|
201
|
+
} else {
|
|
202
|
+
return parseDocumentWithLocalService({
|
|
203
|
+
documentSource,
|
|
204
|
+
isDocumentUrl,
|
|
205
|
+
sessionId,
|
|
206
|
+
llamaCloudApiKey: llamaCloudApiKey ?? process.env.LLAMA_CLOUD_API_KEY,
|
|
207
|
+
});
|
|
796
208
|
}
|
|
209
|
+
}
|
|
797
210
|
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
211
|
+
/**
 * Decide whether a document source string is an HTTP(S) URL.
 *
 * @param source - Candidate string, e.g. a URL or a file path.
 * @returns `true` only when `source` parses as a URL whose protocol is `http:` or `https:`.
 */
function isUrl(source: string): boolean {
  let parsed: URL;
  try {
    parsed = new URL(source);
  } catch {
    // Relative paths and other non-URL strings fail to parse.
    return false;
  }
  return ['http:', 'https:'].includes(parsed.protocol);
}
|