@minded-ai/mindedjs 3.0.7 → 3.1.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. package/dist/index.d.ts +2 -1
  2. package/dist/index.d.ts.map +1 -1
  3. package/dist/index.js +6 -3
  4. package/dist/index.js.map +1 -1
  5. package/dist/internalTools/documentExtraction/documentExtraction.d.ts +112 -102
  6. package/dist/internalTools/documentExtraction/documentExtraction.d.ts.map +1 -1
  7. package/dist/internalTools/documentExtraction/documentExtraction.js +146 -705
  8. package/dist/internalTools/documentExtraction/documentExtraction.js.map +1 -1
  9. package/dist/internalTools/documentExtraction/extractStructuredData.d.ts +57 -0
  10. package/dist/internalTools/documentExtraction/extractStructuredData.d.ts.map +1 -0
  11. package/dist/internalTools/documentExtraction/extractStructuredData.js +121 -0
  12. package/dist/internalTools/documentExtraction/extractStructuredData.js.map +1 -0
  13. package/dist/internalTools/documentExtraction/parseDocumentLocal.d.ts +16 -0
  14. package/dist/internalTools/documentExtraction/parseDocumentLocal.d.ts.map +1 -0
  15. package/dist/internalTools/documentExtraction/parseDocumentLocal.js +547 -0
  16. package/dist/internalTools/documentExtraction/parseDocumentLocal.js.map +1 -0
  17. package/dist/internalTools/documentExtraction/parseDocumentManaged.d.ts +13 -0
  18. package/dist/internalTools/documentExtraction/parseDocumentManaged.d.ts.map +1 -0
  19. package/dist/internalTools/documentExtraction/parseDocumentManaged.js +150 -0
  20. package/dist/internalTools/documentExtraction/parseDocumentManaged.js.map +1 -0
  21. package/dist/nodes/addAppToolNode.d.ts.map +1 -1
  22. package/dist/nodes/addAppToolNode.js +20 -1
  23. package/dist/nodes/addAppToolNode.js.map +1 -1
  24. package/dist/toolsLibrary/classifier.d.ts +2 -2
  25. package/dist/toolsLibrary/parseDocument.d.ts +11 -10
  26. package/dist/toolsLibrary/parseDocument.d.ts.map +1 -1
  27. package/dist/toolsLibrary/parseDocument.js +33 -189
  28. package/dist/toolsLibrary/parseDocument.js.map +1 -1
  29. package/dist/types/Flows.types.d.ts +1 -0
  30. package/dist/types/Flows.types.d.ts.map +1 -1
  31. package/dist/types/Flows.types.js.map +1 -1
  32. package/dist/utils/schemaUtils.js +1 -1
  33. package/dist/utils/schemaUtils.js.map +1 -1
  34. package/docs/tooling/document-processing.md +235 -174
  35. package/package.json +3 -2
  36. package/src/index.ts +2 -1
  37. package/src/internalTools/documentExtraction/documentExtraction.ts +184 -767
  38. package/src/internalTools/documentExtraction/extractStructuredData.ts +140 -0
  39. package/src/internalTools/documentExtraction/parseDocumentLocal.ts +660 -0
  40. package/src/internalTools/documentExtraction/parseDocumentManaged.ts +152 -0
  41. package/src/nodes/addAppToolNode.ts +30 -7
  42. package/src/toolsLibrary/parseDocument.ts +38 -206
  43. package/src/types/Flows.types.ts +1 -0
  44. package/src/utils/schemaUtils.ts +1 -1
@@ -1,736 +1,177 @@
1
1
  "use strict";
2
- var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
- if (k2 === undefined) k2 = k;
4
- var desc = Object.getOwnPropertyDescriptor(m, k);
5
- if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
- desc = { enumerable: true, get: function() { return m[k]; } };
7
- }
8
- Object.defineProperty(o, k2, desc);
9
- }) : (function(o, m, k, k2) {
10
- if (k2 === undefined) k2 = k;
11
- o[k2] = m[k];
12
- }));
13
- var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
- Object.defineProperty(o, "default", { enumerable: true, value: v });
15
- }) : function(o, v) {
16
- o["default"] = v;
17
- });
18
- var __importStar = (this && this.__importStar) || (function () {
19
- var ownKeys = function(o) {
20
- ownKeys = Object.getOwnPropertyNames || function (o) {
21
- var ar = [];
22
- for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
- return ar;
24
- };
25
- return ownKeys(o);
26
- };
27
- return function (mod) {
28
- if (mod && mod.__esModule) return mod;
29
- var result = {};
30
- if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
- __setModuleDefault(result, mod);
32
- return result;
33
- };
34
- })();
35
2
  Object.defineProperty(exports, "__esModule", { value: true });
36
- exports.DocumentProcessor = void 0;
37
- exports.extractFromDocument = extractFromDocument;
38
- const fs = __importStar(require("fs"));
39
- const path = __importStar(require("path"));
40
- const logger_1 = require("../../utils/logger");
41
- const os = __importStar(require("os"));
42
- const types_1 = require("./types");
3
+ exports.DocumentProcessingMode = void 0;
4
+ exports.parseDocumentAndExtractStructuredData = parseDocumentAndExtractStructuredData;
5
+ exports.parseDocument = parseDocument;
6
+ const extractStructuredData_1 = require("./extractStructuredData");
7
+ const parseDocumentManaged_1 = require("./parseDocumentManaged");
8
+ const parseDocumentLocal_1 = require("./parseDocumentLocal");
9
+ /**
10
+ * Document processing mode
11
+ */
12
+ var DocumentProcessingMode;
13
+ (function (DocumentProcessingMode) {
14
+ /** Process documents using Minded cloud service (default) */
15
+ DocumentProcessingMode["MANAGED"] = "managed";
16
+ /** Process documents locally using LlamaCloud */
17
+ DocumentProcessingMode["LOCAL"] = "local";
18
+ })(DocumentProcessingMode || (exports.DocumentProcessingMode = DocumentProcessingMode = {}));
43
19
  /**
44
- * Extract data from documents using AI or return raw text.
20
+ * Parse document and extract structured data using AI.
21
+ *
22
+ * This function provides a flexible way to process documents with optional AI-powered extraction:
23
+ * - Raw text extraction: Parse document without LLM processing
24
+ * - Structured extraction with schema: Extract data matching a Zod schema
25
+ * - Structured extraction with prompt: Guide extraction using custom prompts
26
+ * - Processing modes: Use DocumentProcessingMode.LOCAL (requires LlamaCloud API key) or DocumentProcessingMode.MANAGED (backend service)
45
27
  *
46
- * This function allows you to process various document types (PDFs, images, Word docs, etc.)
47
- * in multiple ways:
48
- * - With LLM + Schema: Extract structured data according to a Zod schema
49
- * - With LLM + System Prompt: Extract unstructured data based on prompt instructions
50
- * - Without LLM: Extract raw text content using LlamaParse
28
+ * @param options - Document processing options
29
+ * @param options.documentSource - URL or file path to the document
30
+ * @param options.processingMode - Document parsing mode: DocumentProcessingMode.MANAGED (default, backend service) or DocumentProcessingMode.LOCAL (requires llamaCloudApiKey)
31
+ * @param options.sessionId - Unique session identifier for logging and tracking
32
+ * @param options.llamaCloudApiKey - LlamaCloud API key for local processing. Required when processingMode is DocumentProcessingMode.LOCAL. Can be provided as parameter or via LLAMA_CLOUD_API_KEY environment variable
33
+ * @param options.returnStructuredOutput - Whether to extract structured data using LLM (true) or return raw text only (false). Defaults to false.
34
+ * @param options.llm - Language model instance for AI-powered extraction. Required when returnStructuredOutput is true
35
+ * @param options.outputSchema - Optional Zod schema defining the structure of extracted data
36
+ * @param options.outputSchemaPrompt - Optional prompt to guide the llm how to extract the data
51
37
  *
52
- * @param options - Document extraction options
53
- * @param options.llm - Optional language model for AI-powered extraction
54
- * @param options.documentPath - Path to the document file
55
- * @param options.documentContent - Document content as Buffer or string
56
- * @param options.documentUrl - URL to fetch the document from
57
- * @param options.schema - Optional Zod schema for structured data extraction
58
- * @param options.systemPrompt - Optional prompt for guiding extraction
59
- * @param options.config - Optional document processor configuration
38
+ * @returns Promise resolving to an object containing:
39
+ * - rawContent: The raw extracted text from the document
40
+ * - structuredContent: AI-extracted structured data (if returnStructuredOutput is true)
41
+ * - metadata: Document metadata from processing
60
42
  *
61
- * @returns Promise resolving to extracted data/text and metadata
43
+ * @throws {Error} If documentSource is not provided
44
+ * @throws {Error} If returnStructuredOutput is true but llm is not provided
45
+ * @throws {Error} If document parsing or extraction fails
62
46
  *
63
47
  * @example
64
48
  * ```typescript
65
- * import { extractFromDocument } from '@minded-ai/mindedjs';
49
+ * import { parseDocumentAndExtractStructuredData, DocumentProcessingMode } from '@minded-ai/mindedjs';
66
50
  * import { z } from 'zod';
67
51
  *
68
- * // Extract structured data with schema
69
- * const result1 = await extractFromDocument({
52
+ * // Parse document and extract structured data using a schema
53
+ * const result1 = await parseDocumentAndExtractStructuredData({
54
+ * documentSource: './invoice.pdf',
55
+ * processingMode: DocumentProcessingMode.MANAGED,
56
+ * sessionId: state.sessionId,
57
+ * returnStructuredOutput: true,
70
58
  * llm: agent.llm,
71
- * documentPath: './invoice.pdf',
72
- * schema: z.object({
59
+ * outputSchema: z.object({
73
60
  * invoiceNumber: z.string(),
74
- * amount: z.number(),
75
- * })
76
- * });
77
- *
78
- * // Extract unstructured data with prompt
79
- * const result2 = await extractFromDocument({
80
- * llm: agent.llm,
81
- * documentPath: './contract.pdf',
82
- * systemPrompt: 'Extract all payment terms and conditions'
61
+ * totalAmount: z.number(),
62
+ * date: z.string(),
63
+ * }),
83
64
  * });
84
65
  *
85
- * // Extract raw text without LLM
86
- * const result3 = await extractFromDocument({
87
- * documentPath: './document.pdf'
66
+ * // Parse document only
67
+ * const result3 = await parseDocumentAndExtractStructuredData({
68
+ * documentSource: './document.pdf',
69
+ * processingMode: DocumentProcessingMode.MANAGED,
70
+ * sessionId: state.sessionId,
71
+ * returnStructuredOutput: false,
88
72
  * });
89
73
  * ```
90
74
  */
91
- async function extractFromDocument(options) {
92
- // Create a document processor
93
- const processor = new DocumentProcessor(options.config, options.llm);
94
- // Extract from document using the processor
95
- return processor.extractFromDocument({
96
- documentPath: options.documentPath,
97
- documentContent: options.documentContent,
98
- documentUrl: options.documentUrl,
99
- schema: options.schema,
100
- systemPrompt: options.systemPrompt,
75
+ async function parseDocumentAndExtractStructuredData({ documentSource, processingMode, sessionId, llamaCloudApiKey, returnStructuredOutput = false, llm, outputSchema, outputSchemaPrompt, }) {
76
+ // Parse document
77
+ const result = await parseDocument({
78
+ documentSource,
79
+ processingMode,
80
+ sessionId,
81
+ llamaCloudApiKey,
101
82
  });
83
+ if (!returnStructuredOutput || !result.rawContent) {
84
+ return result;
85
+ }
86
+ if (!llm) {
87
+ throw new Error('LLM instance is required when returnStructuredOutput is true. Please provide an LLM instance.');
88
+ }
89
+ // Extract structured data from the parsed document
90
+ const structuredContent = await (0, extractStructuredData_1.extractStructuredDataFromString)({
91
+ content: result.rawContent,
92
+ llm,
93
+ schema: outputSchema,
94
+ prompt: outputSchemaPrompt,
95
+ sessionId,
96
+ });
97
+ return {
98
+ ...result,
99
+ structuredContent,
100
+ };
102
101
  }
103
102
  /**
104
- * Generic document processor that can extract structured data from various document types
105
- * including images, PDFs, Word documents, spreadsheets, and more.
103
+ * Parse document and extract raw text content.
104
+ *
105
+ * This function processes various document types (PDFs, images, Word docs, etc.) and extracts
106
+ * raw text content using either local processing with LlamaCloud or managed backend service.
107
+ * Use this for raw text extraction without AI-powered data extraction.
108
+ *
109
+ * @param options - Document parsing options
110
+ * @param options.documentSource - URL or file path to the document
111
+ * @param options.processingMode - Parsing mode: DocumentProcessingMode.LOCAL (requires llamaCloudApiKey) or DocumentProcessingMode.MANAGED (backend service, default)
112
+ * @param options.sessionId - Unique session identifier for logging and tracking
113
+ * @param options.llamaCloudApiKey - LlamaCloud API key for local processing. Required when processingMode is DocumentProcessingMode.LOCAL. Can be provided as parameter or via LLAMA_CLOUD_API_KEY environment variable
114
+ *
115
+ * @returns Promise resolving to an object containing:
116
+ * - rawContent: The raw extracted text from the document
117
+ * - metadata: Document processing metadata (file size, type, processing time, content length)
118
+ *
119
+ * @throws {Error} If documentSource is not provided
120
+ * @throws {Error} If document processing fails
121
+ *
122
+ * @example
123
+ * ```typescript
124
+ * import { parseDocument, DocumentProcessingMode } from '@minded-ai/mindedjs';
125
+ *
126
+ * // Parse document using managed service
127
+ * const result1 = await parseDocument({
128
+ * documentSource: 'https://example.com/invoice.pdf',
129
+ * processingMode: DocumentProcessingMode.MANAGED,
130
+ * sessionId: state.sessionId,
131
+ * });
132
+ * // result1: { rawContent: "Invoice text...", metadata: {...} }
133
+ *
134
+ * // Parse local document using LlamaCloud
135
+ * const result2 = await parseDocument({
136
+ * documentSource: './contract.pdf',
137
+ * processingMode: DocumentProcessingMode.LOCAL,
138
+ * sessionId: state.sessionId,
139
+ * llamaCloudApiKey: process.env.LLAMA_CLOUD_API_KEY,
140
+ * });
141
+ * // result2: { rawContent: "Contract text...", metadata: {...} }
142
+ * ```
106
143
  */
107
- class DocumentProcessor {
108
- constructor(config = {}, llm) {
109
- this.llm = null;
110
- this.llamaCloudApiKey = null;
111
- this.sharpModule = null;
112
- this.sharpLoadAttempted = false;
113
- this.config = {
114
- maxImageWidth: 1200,
115
- imageQuality: 85,
116
- useBase64: false,
117
- ...config,
118
- };
119
- this.llm = llm || null;
120
- this.llamaCloudApiKey = this.config.llamaCloudApiKey || process.env.LLAMA_CLOUD_API_KEY || null;
121
- }
122
- /**
123
- * Parse document using LlamaCloud REST API
124
- */
125
- async parseWithLlamaCloud(filePath) {
126
- if (!this.llamaCloudApiKey) {
127
- return null;
128
- }
129
- try {
130
- // Step 1: Upload file and start parsing
131
- const fileContent = fs.readFileSync(filePath);
132
- const fileName = path.basename(filePath);
133
- const mimeType = this.getMimeType(path.extname(filePath));
134
- const formData = new FormData();
135
- const blob = new Blob([fileContent], { type: mimeType });
136
- formData.append('file', blob, fileName);
137
- formData.append('premium_mode', 'true');
138
- const uploadResponse = await fetch('https://api.cloud.llamaindex.ai/api/v1/parsing/upload', {
139
- method: 'POST',
140
- headers: {
141
- Accept: 'application/json',
142
- Authorization: `Bearer ${this.llamaCloudApiKey}`,
143
- },
144
- body: formData,
145
- });
146
- if (!uploadResponse.ok) {
147
- const errorText = await uploadResponse.text();
148
- throw new Error(`Failed to upload file: ${uploadResponse.status} - ${errorText}`);
149
- }
150
- const uploadResult = await uploadResponse.json();
151
- const jobId = uploadResult.id || uploadResult.job_id;
152
- if (!jobId) {
153
- throw new Error('No job ID returned from upload');
154
- }
155
- logger_1.logger.info({
156
- msg: '[DocumentProcessor] File uploaded to LlamaCloud',
157
- jobId,
158
- fileName,
159
- });
160
- // Step 2: Poll for job completion
161
- let attempts = 0;
162
- const maxAttempts = 60; // 60 attempts with 2 second delay = 2 minutes max
163
- const pollDelay = 2000; // 2 seconds
164
- while (attempts < maxAttempts) {
165
- const statusResponse = await fetch(`https://api.cloud.llamaindex.ai/api/v1/parsing/job/${jobId}`, {
166
- method: 'GET',
167
- headers: {
168
- Accept: 'application/json',
169
- Authorization: `Bearer ${this.llamaCloudApiKey}`,
170
- },
171
- });
172
- if (!statusResponse.ok) {
173
- throw new Error(`Failed to check job status: ${statusResponse.status}`);
174
- }
175
- const statusResult = await statusResponse.json();
176
- const status = statusResult.status || statusResult.job_status;
177
- if (status === 'SUCCESS' || status === 'COMPLETED' || status === 'completed') {
178
- // Step 3: Retrieve results in Markdown
179
- // Create an AbortController for timeout
180
- const controller = new AbortController();
181
- const timeout = setTimeout(() => controller.abort(), 20000); // 20 second timeout
182
- let resultResponse;
183
- try {
184
- resultResponse = await fetch(`https://api.cloud.llamaindex.ai/api/v1/parsing/job/${jobId}/result/markdown`, {
185
- method: 'GET',
186
- headers: {
187
- Accept: 'application/json',
188
- Authorization: `Bearer ${this.llamaCloudApiKey}`,
189
- },
190
- signal: controller.signal,
191
- });
192
- }
193
- catch (fetchError) {
194
- clearTimeout(timeout);
195
- if (fetchError instanceof Error && fetchError.name === 'AbortError') {
196
- throw new Error('Timeout fetching results from LlamaCloud after 20 seconds');
197
- }
198
- throw fetchError;
199
- }
200
- clearTimeout(timeout);
201
- if (!resultResponse.ok) {
202
- const errorText = await resultResponse.text();
203
- throw new Error(`Failed to retrieve results: ${resultResponse.status} - ${errorText}`);
204
- }
205
- let resultData;
206
- try {
207
- // Read response using manual stream reading (more reliable than text())
208
- let responseText;
209
- if (resultResponse.body) {
210
- const reader = resultResponse.body.getReader();
211
- const chunks = [];
212
- let totalLength = 0;
213
- try {
214
- while (true) {
215
- const { done, value } = await reader.read();
216
- if (done)
217
- break;
218
- if (value) {
219
- chunks.push(value);
220
- totalLength += value.length;
221
- }
222
- }
223
- // Combine chunks
224
- const combined = new Uint8Array(totalLength);
225
- let offset = 0;
226
- for (const chunk of chunks) {
227
- combined.set(chunk, offset);
228
- offset += chunk.length;
229
- }
230
- responseText = new TextDecoder().decode(combined);
231
- }
232
- finally {
233
- reader.releaseLock();
234
- }
235
- }
236
- else {
237
- responseText = await resultResponse.text();
238
- }
239
- // Try to parse as JSON, but if it fails, use the text directly
240
- try {
241
- resultData = JSON.parse(responseText);
242
- }
243
- catch (_a) {
244
- // If it's not JSON, assume it's the markdown content directly
245
- resultData = responseText;
246
- }
247
- }
248
- catch (textError) {
249
- logger_1.logger.error({
250
- msg: '[DocumentProcessor] Failed to read response text',
251
- jobId,
252
- error: textError instanceof Error ? textError.message : String(textError),
253
- stack: textError instanceof Error ? textError.stack : undefined,
254
- });
255
- throw new Error('Failed to read response from LlamaCloud');
256
- }
257
- logger_1.logger.debug({
258
- msg: '[DocumentProcessor] Result data structure',
259
- jobId,
260
- dataType: typeof resultData,
261
- keys: typeof resultData === 'object' && resultData !== null ? Object.keys(resultData) : [],
262
- hasMarkdown: typeof resultData === 'object' && 'markdown' in resultData,
263
- hasContent: typeof resultData === 'object' && 'content' in resultData,
264
- hasText: typeof resultData === 'object' && 'text' in resultData,
265
- });
266
- // The API might return the markdown directly as a string or nested in an object
267
- let markdownContent;
268
- if (typeof resultData === 'string') {
269
- markdownContent = resultData;
270
- }
271
- else {
272
- markdownContent = resultData.markdown || resultData.content || resultData.text || '';
273
- }
274
- if (!markdownContent) {
275
- logger_1.logger.error({
276
- msg: '[DocumentProcessor] No content in result',
277
- jobId,
278
- resultData: JSON.stringify(resultData).substring(0, 500),
279
- });
280
- throw new Error('No content returned from parsing');
281
- }
282
- logger_1.logger.info({
283
- msg: '[DocumentProcessor] Successfully parsed document with LlamaCloud',
284
- jobId,
285
- contentLength: markdownContent.length,
286
- preview: markdownContent.substring(0, 100),
287
- });
288
- logger_1.logger.debug({
289
- msg: '[DocumentProcessor] About to return markdown content',
290
- jobId,
291
- });
292
- return markdownContent;
293
- }
294
- else if (status === 'FAILED' || status === 'ERROR' || status === 'failed') {
295
- throw new Error(`Parsing job failed: ${statusResult.error || 'Unknown error'}`);
296
- }
297
- // Wait before next attempt
298
- await new Promise((resolve) => setTimeout(resolve, pollDelay));
299
- attempts++;
300
- }
301
- throw new Error('Parsing job timed out after 2 minutes');
302
- }
303
- catch (err) {
304
- logger_1.logger.warn({
305
- message: '[DocumentProcessor] LlamaCloud parsing failed',
306
- err,
307
- });
308
- return null;
309
- }
310
- finally {
311
- logger_1.logger.debug({
312
- msg: '[DocumentProcessor] parseWithLlamaCloud finished',
313
- filePath,
314
- });
315
- }
316
- }
317
- /**
318
- * Get MIME type for file extension
319
- */
320
- getMimeType(fileExtension) {
321
- const mimeTypes = {
322
- '.pdf': 'application/pdf',
323
- '.doc': 'application/msword',
324
- '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
325
- '.txt': 'text/plain',
326
- '.rtf': 'application/rtf',
327
- '.jpg': 'image/jpeg',
328
- '.jpeg': 'image/jpeg',
329
- '.png': 'image/png',
330
- '.gif': 'image/gif',
331
- '.bmp': 'image/bmp',
332
- '.webp': 'image/webp',
333
- '.tiff': 'image/tiff',
334
- '.xls': 'application/vnd.ms-excel',
335
- '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
336
- '.csv': 'text/csv',
337
- '.html': 'text/html',
338
- '.htm': 'text/html',
339
- '.xml': 'application/xml',
340
- '.md': 'text/markdown',
341
- };
342
- return mimeTypes[fileExtension.toLowerCase()] || 'application/octet-stream';
144
+ async function parseDocument({ documentSource, processingMode = DocumentProcessingMode.MANAGED, sessionId, llamaCloudApiKey, }) {
145
+ if (!documentSource) {
146
+ throw new Error('documentSource is required - provide a URL or file path');
343
147
  }
344
- /**
345
- * Extract data from a document - either structured data using AI or raw text
346
- */
347
- async extractFromDocument(options) {
348
- logger_1.logger.debug({
349
- msg: '[DocumentProcessor] extractFromDocument called',
350
- hasDocumentPath: !!options.documentPath,
351
- hasDocumentUrl: !!options.documentUrl,
352
- hasDocumentContent: !!options.documentContent,
353
- hasLLM: !!this.llm,
354
- hasSchema: !!options.schema,
148
+ const isDocumentUrl = isUrl(documentSource);
149
+ if (processingMode === DocumentProcessingMode.MANAGED) {
150
+ return (0, parseDocumentManaged_1.parseDocumentWithManagedService)({
151
+ documentSource,
152
+ isDocumentUrl,
153
+ sessionId,
355
154
  });
356
- const startTime = Date.now();
357
- try {
358
- logger_1.logger.debug({
359
- msg: '[DocumentProcessor] Getting document content',
360
- });
361
- // Determine document source and content
362
- const { content, fileType, fileSize } = await this.getDocumentContent(options);
363
- // Process document content based on type
364
- let processedContent;
365
- if (this.isImageFile(fileType)) {
366
- processedContent = await this.processImageDocument(content, fileType, options.documentPath);
367
- }
368
- else {
369
- processedContent = await this.processTextDocument(content, options.documentPath, fileType);
370
- }
371
- logger_1.logger.info({
372
- msg: '[DocumentProcessor] Document content processed',
373
- fileType,
374
- contentLength: processedContent.length,
375
- });
376
- const processingTime = Date.now() - startTime;
377
- // If no LLM is provided, return the raw text content
378
- if (!this.llm) {
379
- return {
380
- data: processedContent,
381
- metadata: {
382
- fileSize,
383
- fileType,
384
- processingTime,
385
- contentLength: processedContent.length,
386
- },
387
- };
388
- }
389
- // If LLM is provided, extract data (structured or unstructured)
390
- const extractedData = await this.extractStructuredData(processedContent, options.schema, options.llmConfig, options.systemPrompt);
391
- return {
392
- data: extractedData,
393
- metadata: {
394
- fileSize,
395
- fileType,
396
- processingTime,
397
- contentLength: processedContent.length,
398
- },
399
- };
400
- }
401
- catch (err) {
402
- logger_1.logger.error({
403
- message: '[DocumentProcessor] Document processing failed',
404
- err,
405
- });
406
- throw new Error(`Document processing failed: ${err instanceof Error ? err.message : String(err)}`);
407
- }
408
155
  }
409
- /**
410
- * Get document content from various sources
411
- */
412
- async getDocumentContent(options) {
413
- logger_1.logger.debug({
414
- msg: '[DocumentProcessor] getDocumentContent called',
415
- hasPath: !!options.documentPath,
416
- hasContent: !!options.documentContent,
417
- hasUrl: !!options.documentUrl,
156
+ else {
157
+ return (0, parseDocumentLocal_1.parseDocumentWithLocalService)({
158
+ documentSource,
159
+ isDocumentUrl,
160
+ sessionId,
161
+ llamaCloudApiKey: llamaCloudApiKey !== null && llamaCloudApiKey !== void 0 ? llamaCloudApiKey : process.env.LLAMA_CLOUD_API_KEY,
418
162
  });
419
- // From file path
420
- if (options.documentPath) {
421
- if (!fs.existsSync(options.documentPath)) {
422
- throw new Error(`Document not found: ${options.documentPath}`);
423
- }
424
- const content = fs.readFileSync(options.documentPath);
425
- const fileType = path.extname(options.documentPath).toLowerCase();
426
- return {
427
- content,
428
- fileType,
429
- fileSize: content.length,
430
- };
431
- }
432
- // From provided content
433
- if (options.documentContent) {
434
- // Try to infer file type from content if it's a buffer
435
- let fileType = '.unknown';
436
- if (Buffer.isBuffer(options.documentContent)) {
437
- fileType = this.inferFileTypeFromBuffer(options.documentContent);
438
- }
439
- else if (typeof options.documentContent === 'string') {
440
- fileType = '.txt'; // Assume text content
441
- }
442
- return {
443
- content: options.documentContent,
444
- fileType,
445
- fileSize: Buffer.isBuffer(options.documentContent) ? options.documentContent.length : Buffer.byteLength(options.documentContent),
446
- };
447
- }
448
- // From URL
449
- if (options.documentUrl) {
450
- logger_1.logger.debug({
451
- msg: '[DocumentProcessor] Fetching document from URL',
452
- url: options.documentUrl,
453
- });
454
- const response = await fetch(options.documentUrl);
455
- logger_1.logger.debug({
456
- msg: '[DocumentProcessor] URL fetch response',
457
- status: response.status,
458
- ok: response.ok,
459
- });
460
- if (!response.ok) {
461
- throw new Error(`Failed to fetch document from URL: ${response.statusText}`);
462
- }
463
- const arrayBuffer = await response.arrayBuffer();
464
- const content = Buffer.from(arrayBuffer);
465
- const fileType = this.inferFileTypeFromUrl(options.documentUrl) || this.inferFileTypeFromBuffer(content);
466
- logger_1.logger.debug({
467
- msg: '[DocumentProcessor] Document fetched from URL',
468
- contentSize: content.length,
469
- fileType,
470
- });
471
- return {
472
- content,
473
- fileType,
474
- fileSize: content.length,
475
- };
476
- }
477
- throw new Error('No document source provided. Specify documentPath, documentContent, or documentUrl.');
478
163
  }
479
- /**
480
- * Process image documents by converting them to a standardized format
481
- */
482
- async processImageDocument(content, fileType, filePath) {
483
- try {
484
- // First, try to use LlamaParser if available for text extraction
485
- if (filePath && this.llamaCloudApiKey) {
486
- logger_1.logger.debug({
487
- msg: '[DocumentProcessor] Calling parseWithLlamaCloud for image',
488
- filePath,
489
- });
490
- const parsedContent = await this.parseWithLlamaCloud(filePath);
491
- logger_1.logger.debug({
492
- msg: '[DocumentProcessor] parseWithLlamaCloud returned for image',
493
- hasContent: !!parsedContent,
494
- contentLength: parsedContent === null || parsedContent === void 0 ? void 0 : parsedContent.length,
495
- });
496
- if (parsedContent) {
497
- return parsedContent;
498
- }
499
- }
500
- // If no file path, create a temporary file for LlamaCloud parsing
501
- if (!filePath && this.llamaCloudApiKey) {
502
- const tempDir = os.tmpdir();
503
- const tempFileName = `temp_${Date.now()}${fileType}`;
504
- const tempFilePath = path.join(tempDir, tempFileName);
505
- logger_1.logger.debug({
506
- msg: '[DocumentProcessor] Creating temp file for image',
507
- tempFilePath,
508
- contentSize: content.length,
509
- });
510
- try {
511
- fs.writeFileSync(tempFilePath, content);
512
- logger_1.logger.debug({
513
- msg: '[DocumentProcessor] Calling parseWithLlamaCloud for temp image',
514
- tempFilePath,
515
- });
516
- const parsedContent = await this.parseWithLlamaCloud(tempFilePath);
517
- logger_1.logger.debug({
518
- msg: '[DocumentProcessor] parseWithLlamaCloud returned for temp image',
519
- hasContent: !!parsedContent,
520
- contentLength: parsedContent === null || parsedContent === void 0 ? void 0 : parsedContent.length,
521
- });
522
- fs.unlinkSync(tempFilePath);
523
- if (parsedContent) {
524
- return parsedContent;
525
- }
526
- }
527
- catch (err) {
528
- // Clean up temp file on error
529
- if (fs.existsSync(tempFilePath)) {
530
- fs.unlinkSync(tempFilePath);
531
- }
532
- logger_1.logger.warn({ msg: '[DocumentProcessor] Failed to parse image with LlamaCloud', err });
533
- }
534
- }
535
- // Fallback: Convert to image format for LLM processing
536
- // For PDFs, convert first page to image
537
- if (fileType === '.pdf') {
538
- throw new Error('Failed to convert PDF to image');
539
- }
540
- logger_1.logger.warn({
541
- msg: '[DocumentProcessor] Sharp module not available. Using original image without optimization.',
542
- fileType,
543
- contentSize: content.length,
544
- });
545
- // If sharp is not available, use the original image
546
- if (this.config.useBase64) {
547
- // Return original image as base64
548
- const base64Image = content.toString('base64');
549
- const mimeType = this.getMimeType(fileType);
550
- return `data:${mimeType};base64,${base64Image}`;
551
- }
552
- else {
553
- // Without sharp and without base64, we cannot process the image
554
- return `[IMAGE CONTENT - ${fileType.toUpperCase()} file. Size: ${content.length} bytes. Consider using LLAMA_CLOUD_API_KEY for text extraction or set useBase64: true]`;
555
- }
556
- }
557
- catch (err) {
558
- throw new Error(`Failed to process image document: ${err instanceof Error ? err.message : String(err)}`);
559
- }
560
- }
561
- /**
562
- * Process text-based documents using LlamaParser or fallback methods
563
- */
564
- async processTextDocument(content, filePath, fileType) {
565
- // Try LlamaCloud parsing if we have a file path
566
- if (filePath && this.llamaCloudApiKey) {
567
- const parsedContent = await this.parseWithLlamaCloud(filePath);
568
- if (parsedContent) {
569
- return parsedContent;
570
- }
571
- }
572
- // If no file path but we have content and LlamaCloud API key, create a temp file
573
- if (!filePath && this.llamaCloudApiKey && Buffer.isBuffer(content)) {
574
- const tempDir = os.tmpdir();
575
- const tempFileName = `temp_${Date.now()}${fileType || '.txt'}`;
576
- const tempFilePath = path.join(tempDir, tempFileName);
577
- try {
578
- fs.writeFileSync(tempFilePath, content);
579
- const parsedContent = await this.parseWithLlamaCloud(tempFilePath);
580
- fs.unlinkSync(tempFilePath);
581
- if (parsedContent) {
582
- return parsedContent;
583
- }
584
- }
585
- catch (err) {
586
- // Clean up temp file on error
587
- if (fs.existsSync(tempFilePath)) {
588
- fs.unlinkSync(tempFilePath);
589
- }
590
- logger_1.logger.warn({ msg: '[DocumentProcessor] Failed to parse text document with LlamaCloud', err });
591
- }
592
- }
593
- // Fallback: handle based on file type
594
- if (typeof content === 'string') {
595
- return content;
596
- }
597
- // For binary content, convert to text
598
- if (fileType === '.pdf' && !this.llamaCloudApiKey) {
599
- throw new Error('PDF processing requires LLAMA_CLOUD_API_KEY environment variable for LlamaParser');
600
- }
601
- // Basic text extraction for simple formats
602
- if (['.txt', '.md', '.html', '.htm', '.xml', '.csv'].includes(fileType || '')) {
603
- return content.toString('utf-8');
604
- }
605
- // For unsupported binary formats without LlamaParser
606
- throw new Error(`Unsupported document type ${fileType}. Please provide LLAMA_CLOUD_API_KEY for advanced document processing.`);
607
- }
608
- /**
609
- * Extract data using LLM - either structured with schema or unstructured with prompt
610
- */
611
- async extractStructuredData(content, schema, llmConfig, systemPrompt) {
612
- if (!this.llm) {
613
- throw new Error('LLM instance is required for data extraction. Please provide an LLM when creating the DocumentProcessor.');
614
- }
615
- // Note: llmConfig is ignored when using the provided LLM instance
616
- // The LLM should already be configured with the desired model and temperature
617
- const defaultSystemPrompt = 'You are an expert data-extraction assistant. ' +
618
- 'Extract the requested information from the provided document content. ' +
619
- 'If you cannot find a value for a required field, use "N/A" or a descriptive placeholder. ' +
620
- 'Be accurate and thorough in your extraction.';
621
- const finalSystemPrompt = systemPrompt || defaultSystemPrompt;
622
- try {
623
- // If schema is provided, use structured output
624
- if (schema) {
625
- // Check if the LLM supports withStructuredOutput
626
- if (!('withStructuredOutput' in this.llm)) {
627
- throw new Error('The provided LLM does not support structured output. Please use a compatible LLM instance.');
628
- }
629
- const structuredLlm = this.llm.withStructuredOutput(schema);
630
- const result = await structuredLlm.invoke([
631
- {
632
- role: 'system',
633
- content: finalSystemPrompt,
634
- },
635
- {
636
- role: 'user',
637
- content: `Please extract the following information from this document:\n\n${content}`,
638
- },
639
- ]);
640
- logger_1.logger.debug({
641
- msg: '[DocumentProcessor] Structured data extraction completed',
642
- extractedData: JSON.stringify(result, null, 2),
643
- });
644
- return result;
645
- }
646
- else {
647
- // Without schema, return the LLM's text response
648
- const response = await this.llm.invoke([
649
- {
650
- role: 'system',
651
- content: finalSystemPrompt,
652
- },
653
- {
654
- role: 'user',
655
- content: `Please analyze and extract information from this document:\n\n${content}`,
656
- },
657
- ]);
658
- // Extract the text content from the response
659
- let textContent;
660
- if (typeof response.content === 'string') {
661
- textContent = response.content;
662
- }
663
- else if (Array.isArray(response.content) && response.content.length > 0) {
664
- // Handle array of content blocks
665
- textContent = response.content.map((block) => (typeof block === 'string' ? block : block.text || '')).join('\n');
666
- }
667
- else {
668
- textContent = String(response.content);
669
- }
670
- logger_1.logger.debug({
671
- msg: '[DocumentProcessor] Unstructured data extraction completed',
672
- contentLength: textContent.length,
673
- });
674
- return textContent;
675
- }
676
- }
677
- catch (err) {
678
- throw new Error(`LLM extraction failed: ${err instanceof Error ? err.message : String(err)}`);
679
- }
680
- }
681
- /**
682
- * Check if file is an image type
683
- */
684
- isImageFile(fileType) {
685
- const imageTypes = ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp', '.tiff'];
686
- return imageTypes.includes(fileType.toLowerCase());
687
- }
688
- /**
689
- * Infer file type from buffer content
690
- */
691
- inferFileTypeFromBuffer(buffer) {
692
- // Check common file signatures
693
- const signatures = {
694
- '89504E47': '.png',
695
- FFD8FF: '.jpg',
696
- '47494638': '.gif',
697
- '25504446': '.pdf',
698
- '504B0304': '.zip', // Also used by docx, xlsx, pptx
699
- D0CF11E0: '.doc', // Also xls, ppt
700
- };
701
- const hex = buffer.toString('hex', 0, 4).toUpperCase();
702
- for (const [signature, type] of Object.entries(signatures)) {
703
- if (hex.startsWith(signature)) {
704
- return type;
705
- }
706
- }
707
- return '.unknown';
708
- }
709
- /**
710
- * Infer file type from URL
711
- */
712
- inferFileTypeFromUrl(url) {
713
- try {
714
- const pathname = new URL(url).pathname;
715
- const extension = path.extname(pathname).toLowerCase();
716
- return extension || null;
717
- }
718
- catch (_a) {
719
- return null;
720
- }
721
- }
722
- /**
723
- * Get list of supported document types
724
- */
725
- static getSupportedDocumentTypes() {
726
- return [...types_1.SUPPORTED_DOCUMENT_TYPES];
164
+ }
165
+ /**
166
+ * Check if a string is a URL
167
+ */
168
+ function isUrl(source) {
169
+ try {
170
+ const url = new URL(source);
171
+ return url.protocol === 'http:' || url.protocol === 'https:';
727
172
  }
728
- /**
729
- * Check if a file type is supported
730
- */
731
- static isDocumentTypeSupported(fileType) {
732
- return types_1.SUPPORTED_DOCUMENT_TYPES.includes(fileType.toLowerCase());
173
+ catch (_a) {
174
+ return false;
733
175
  }
734
176
  }
735
- exports.DocumentProcessor = DocumentProcessor;
736
177
  //# sourceMappingURL=documentExtraction.js.map