@juspay/neurolink 9.1.0 → 9.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +54 -7
- package/dist/agent/directTools.d.ts +3 -3
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/lib/agent/directTools.d.ts +3 -3
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/server/utils/validation.d.ts +6 -6
- package/dist/lib/types/fileTypes.d.ts +51 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +18 -18
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/csvProcessor.js +442 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/server/utils/validation.d.ts +6 -6
- package/dist/types/fileTypes.d.ts +51 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/csvProcessor.js +442 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,519 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Excel Processor
|
|
3
|
+
*
|
|
4
|
+
* Handles downloading, validating, and processing Excel files (.xlsx, .xls).
|
|
5
|
+
* Uses the exceljs library for parsing; the workbook is loaded in full from an in-memory buffer.
|
|
6
|
+
*
|
|
7
|
+
* Key features:
|
|
8
|
+
* - Accepts .xlsx workbooks; legacy .xls files are matched by MIME type/extension but must still pass the ZIP (PK) signature check
|
|
9
|
+
* - Extracts worksheet data with headers
|
|
10
|
+
* - Handles complex cell types (formulas, rich text, dates)
|
|
11
|
+
* - Respects configurable row and sheet limits
|
|
12
|
+
* - Provides truncation metadata when limits are exceeded
|
|
13
|
+
*
|
|
14
|
+
* @module processors/document/ExcelProcessor
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```typescript
|
|
18
|
+
* import { excelProcessor, processExcel, isExcelFile } from "./ExcelProcessor.js";
|
|
19
|
+
*
|
|
20
|
+
* // Check if a file is an Excel file
|
|
21
|
+
* if (isExcelFile(fileInfo.mimetype, fileInfo.name)) {
|
|
22
|
+
* // Process the Excel file
|
|
23
|
+
* const result = await processExcel(fileInfo, {
|
|
24
|
+
* authHeaders: { Authorization: "Bearer token" },
|
|
25
|
+
* });
|
|
26
|
+
*
|
|
27
|
+
* if (result.success) {
|
|
28
|
+
* console.log(`Processed ${result.data.sheetCount} sheets`);
|
|
29
|
+
* console.log(`Total rows: ${result.data.totalRows}`);
|
|
30
|
+
*
|
|
31
|
+
* for (const sheet of result.data.worksheets) {
|
|
32
|
+
* console.log(`Sheet: ${sheet.name}, Rows: ${sheet.rowCount}`);
|
|
33
|
+
* }
|
|
34
|
+
* }
|
|
35
|
+
* }
|
|
36
|
+
* ```
|
|
37
|
+
*/
|
|
38
|
+
import { Workbook } from "exceljs";
|
|
39
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
40
|
+
import { SIZE_LIMITS } from "../config/index.js";
|
|
41
|
+
import { FileErrorCode } from "../errors/index.js";
|
|
42
|
+
// =============================================================================
|
|
43
|
+
// CONSTANTS
|
|
44
|
+
// =============================================================================
|
|
45
|
+
/**
 * MIME types that identify a file as an Excel workbook.
 * Order matters only for readability: modern format first, legacy second.
 */
const SUPPORTED_EXCEL_TYPES = [
    // Office Open XML workbook (.xlsx)
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    // Legacy binary workbook (.xls)
    "application/vnd.ms-excel",
];
/**
 * File extensions (with leading dot) that identify a file as an Excel workbook.
 */
const SUPPORTED_EXCEL_EXTENSIONS = ["xlsx", "xls"].map((ext) => `.${ext}`);
|
|
52
|
+
// =============================================================================
|
|
53
|
+
// EXCEL PROCESSOR CLASS
|
|
54
|
+
// =============================================================================
|
|
55
|
+
/**
 * Excel Processor - parses Excel workbooks with the exceljs library.
 *
 * The .xls MIME type and extension are registered with the base processor so
 * such files are routed here, but only ZIP-based .xlsx content passes
 * validation. Legacy binary (.xls, OLE2 compound file) content is rejected
 * with a dedicated message instead of a generic "missing PK signature" one.
 *
 * Features:
 * - ZIP format validation (XLSX files are ZIP archives)
 * - Sheet count limiting (SIZE_LIMITS.EXCEL_MAX_SHEETS)
 * - Row count limiting per sheet (SIZE_LIMITS.EXCEL_MAX_ROWS)
 * - Cell type handling (text, numbers, formulas, dates, rich text, errors)
 *
 * @example
 * ```typescript
 * const processor = new ExcelProcessor();
 *
 * // Process a file
 * const result = await processor.processFile(fileInfo, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   console.log(`Sheets: ${result.data.sheetCount}`);
 *   console.log(`Truncated: ${result.data.truncated}`);
 * }
 * ```
 */
export class ExcelProcessor extends BaseFileProcessor {
    constructor() {
        super({
            maxSizeMB: SIZE_LIMITS.EXCEL_MAX_MB,
            timeoutMs: 60000, // Excel parsing can take longer than text files
            supportedMimeTypes: [...SUPPORTED_EXCEL_TYPES],
            supportedExtensions: [...SUPPORTED_EXCEL_EXTENSIONS],
            fileTypeName: "Excel",
            defaultFilename: "spreadsheet.xlsx",
        });
    }
    // ===========================================================================
    // VALIDATION
    // ===========================================================================
    /**
     * Validate that the downloaded content looks like an XLSX workbook.
     * XLSX files are ZIP archives and therefore start with the bytes 0x50 0x4B ("PK").
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information (unused but required by interface)
     * @returns null if valid, error message if invalid
     */
    async validateDownloadedFile(buffer, _fileInfo) {
        // Check minimum size
        if (buffer.length < 4) {
            return "Invalid Excel file - file too small";
        }
        // Compare raw bytes rather than an "ascii"-decoded string: Node's ascii
        // decoding clears the high bit, so e.g. 0xD0 0xCB would also read as "PK".
        if (buffer[0] === 0x50 && buffer[1] === 0x4b) {
            return null;
        }
        // Legacy .xls workbooks are OLE2 compound files with a fixed 8-byte header.
        const ole2Signature = [0xd0, 0xcf, 0x11, 0xe0, 0xa1, 0xb1, 0x1a, 0xe1];
        if (buffer.length >= ole2Signature.length &&
            ole2Signature.every((byte, index) => buffer[index] === byte)) {
            return "Invalid Excel file - legacy binary .xls format is not supported, please convert the file to .xlsx";
        }
        // Provide a helpful error for the common case of an HTML error/login page.
        // Lower-cased so "<!doctype html>" and "<HTML>" are recognised as well.
        const preview = buffer
            .subarray(0, 100)
            .toString("utf8")
            .substring(0, 100)
            .toLowerCase();
        if (preview.includes("<!doctype") || preview.includes("<html")) {
            return "Invalid Excel file - received HTML response instead of file content";
        }
        return "Invalid Excel file - not a valid XLSX format (missing PK signature)";
    }
    // ===========================================================================
    // PROCESSING
    // ===========================================================================
    /**
     * Build processed result stub.
     * Note: This is a synchronous stub - actual parsing happens in the processFile override.
     *
     * @param buffer - Downloaded file content
     * @param fileInfo - Original file information
     * @returns Empty ProcessedExcel structure (no worksheets, zero counts)
     */
    buildProcessedResult(buffer, fileInfo) {
        return {
            worksheets: [],
            buffer,
            mimetype: fileInfo.mimetype ||
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
            // Fall back to the actual byte count when the caller supplied no size
            size: fileInfo.size ?? buffer.length,
            filename: this.getFilename(fileInfo),
            sheetCount: 0,
            totalRows: 0,
            truncated: false,
            truncatedSheets: [],
        };
    }
    /**
     * Override processFile for async Excel parsing with exceljs.
     * Never throws: every failure is reported as `{ success: false, error }`.
     *
     * @param fileInfo - File information (can include URL or buffer)
     * @param options - Optional processing options, forwarded to the download step
     * @returns Processing result with parsed Excel data or error
     */
    async processFile(fileInfo, options) {
        try {
            // Step 1: Validate file type and size
            const validationResult = this.validateFileWithResult(fileInfo);
            if (!validationResult.success) {
                return {
                    success: false,
                    error: validationResult.error,
                };
            }
            // Step 2: Get file buffer (from direct buffer or download from URL)
            let buffer;
            if (fileInfo.buffer) {
                buffer = fileInfo.buffer;
            }
            else if (fileInfo.url) {
                const downloadResult = await this.downloadFileWithRetry(fileInfo, options);
                if (!downloadResult.success) {
                    return {
                        success: false,
                        error: downloadResult.error,
                    };
                }
                if (!downloadResult.data) {
                    return {
                        success: false,
                        error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
                            reason: "Download succeeded but returned no data",
                        }),
                    };
                }
                buffer = downloadResult.data;
            }
            else {
                return {
                    success: false,
                    error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
                        reason: "No buffer or URL provided for file",
                    }),
                };
            }
            // Step 3: Validate downloaded file (magic bytes check)
            const postValidationResult = await this.validateDownloadedFileWithResult(buffer, fileInfo);
            if (!postValidationResult.success) {
                return {
                    success: false,
                    error: postValidationResult.error,
                };
            }
            // Step 4: Parse Excel file asynchronously using exceljs
            const workbook = await this.parseWorkbook(buffer);
            // Step 5: Extract worksheet data with limits
            const { worksheets, truncated, truncatedSheets } = this.extractWorksheets(workbook);
            // Calculate total rows across all worksheets
            const totalRows = worksheets.reduce((sum, sheet) => sum + sheet.rowCount, 0);
            // Build final result
            return {
                success: true,
                data: {
                    worksheets,
                    buffer,
                    mimetype: fileInfo.mimetype ||
                        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
                    // Fall back to the actual byte count when the caller supplied no size
                    size: fileInfo.size ?? buffer.length,
                    filename: this.getFilename(fileInfo),
                    sheetCount: worksheets.length,
                    totalRows,
                    truncated,
                    truncatedSheets,
                },
            };
        }
        catch (error) {
            // Parsing or extraction threw: wrap it instead of propagating
            return {
                success: false,
                error: this.createError(FileErrorCode.PROCESSING_FAILED, {
                    fileType: "Excel",
                    error: error instanceof Error ? error.message : String(error),
                }, error instanceof Error ? error : undefined),
            };
        }
    }
    // ===========================================================================
    // PRIVATE HELPER METHODS
    // ===========================================================================
    /**
     * Parse Excel buffer into workbook using exceljs.
     *
     * @param buffer - Excel file content
     * @returns Parsed ExcelJS Workbook
     */
    async parseWorkbook(buffer) {
        const workbook = new Workbook();
        // A Buffer may be a view onto a larger ArrayBuffer, so copy out exactly
        // the byte range that belongs to this file before handing it to exceljs.
        const start = buffer.byteOffset;
        const end = start + buffer.byteLength;
        await workbook.xlsx.load(buffer.buffer.slice(start, end));
        return workbook;
    }
    /**
     * Extract worksheet data from workbook with row and sheet limits.
     *
     * At most SIZE_LIMITS.EXCEL_MAX_SHEETS sheets are read, and at most
     * SIZE_LIMITS.EXCEL_MAX_ROWS rows per sheet. Headers come from spreadsheet
     * row number 1.
     *
     * @param workbook - Parsed ExcelJS Workbook
     * @returns Extracted worksheets with truncation metadata
     */
    extractWorksheets(workbook) {
        const worksheets = [];
        const truncatedSheets = [];
        let truncated = false;
        const maxRows = SIZE_LIMITS.EXCEL_MAX_ROWS;
        const maxSheets = SIZE_LIMITS.EXCEL_MAX_SHEETS;
        let sheetIndex = 0;
        for (const worksheet of workbook.worksheets) {
            // Check sheet limit
            if (sheetIndex >= maxSheets) {
                truncated = true;
                break;
            }
            const rows = [];
            let headers = [];
            let hitLimit = false;
            worksheet.eachRow((row, rowNumber) => {
                // eachRow offers no early exit, so remaining rows are skipped here
                if (hitLimit) {
                    return;
                }
                // Check row limit
                if (rows.length >= maxRows) {
                    if (!truncatedSheets.includes(worksheet.name)) {
                        truncatedSheets.push(worksheet.name);
                    }
                    truncated = true;
                    hitLimit = true;
                    return;
                }
                // row.values is 1-indexed (slot 0 unused) and sparse for empty cells.
                // Array.from visits holes as undefined, so every empty cell becomes
                // an explicit null instead of staying a hole as it would with map().
                const cleanRow = Array.from(row.values.slice(1), (cell) => this.getCellValue(cell));
                // Extract headers from first row
                if (rowNumber === 1) {
                    headers = cleanRow.map((v) => String(v ?? ""));
                }
                rows.push(cleanRow);
            });
            worksheets.push({
                name: worksheet.name,
                rows,
                headers,
                rowCount: rows.length,
                columnCount: headers.length || (rows[0]?.length ?? 0),
            });
            sheetIndex++;
        }
        return { worksheets, truncated, truncatedSheets };
    }
    /**
     * Convert an Excel cell value to a primitive type.
     * Handles formulas, error values, rich text, hyperlinks and dates.
     *
     * @param cell - ExcelJS cell value (can be various types)
     * @returns Primitive value (string, number, boolean, or null)
     */
    getCellValue(cell) {
        if (cell === null || cell === undefined) {
            return null;
        }
        // Handle primitive types directly
        if (typeof cell === "string" ||
            typeof cell === "number" ||
            typeof cell === "boolean") {
            return cell;
        }
        // Handle Date objects; toISOString() throws on an invalid date, which
        // would otherwise fail the whole workbook because of a single cell
        if (cell instanceof Date) {
            return Number.isNaN(cell.getTime()) ? null : cell.toISOString();
        }
        // Handle ExcelJS cell objects
        if (typeof cell === "object") {
            // Formula result (prioritize result over formula string). Recursing
            // gives Date, error and null results the same treatment as plain cells.
            if ("result" in cell && cell.result !== undefined) {
                return this.getCellValue(cell.result);
            }
            // Error values like { error: '#VALUE!' }
            if ("error" in cell && cell.error !== undefined && cell.error !== null) {
                return String(cell.error);
            }
            // Rich text
            if ("richText" in cell && Array.isArray(cell.richText)) {
                return this.extractRichText(cell.richText);
            }
            // Text value; recursing flattens display text that is itself rich text
            if ("text" in cell && cell.text !== undefined) {
                return this.getCellValue(cell.text);
            }
            // Hyperlink without display text: return the URL
            if ("hyperlink" in cell && cell.hyperlink) {
                return String(cell.hyperlink);
            }
            // Formula without a cached result: expose the formula text
            if ("formula" in cell && typeof cell.formula === "string") {
                return `=${cell.formula}`;
            }
            // Shared-formula reference without a cached result: no value available
            if ("sharedFormula" in cell) {
                return null;
            }
            // Unknown object shape: serialise it rather than emit "[object Object]"
            try {
                return JSON.stringify(cell) ?? String(cell);
            }
            catch {
                return String(cell);
            }
        }
        // Fallback for remaining types such as bigint or symbol
        return String(cell);
    }
    /**
     * Extract text from rich text cell format.
     * Rich text cells contain an array of text fragments with formatting.
     *
     * @param richText - Array of rich text fragments
     * @returns Concatenated plain text ("" when the input is not an array)
     */
    extractRichText(richText) {
        if (!Array.isArray(richText)) {
            return "";
        }
        return richText
            .map((rt) => {
            if (typeof rt === "object" && rt !== null && "text" in rt) {
                return rt.text || "";
            }
            return "";
        })
            .join("");
    }
}
|
|
389
|
+
// =============================================================================
|
|
390
|
+
// SINGLETON INSTANCE
|
|
391
|
+
// =============================================================================
|
|
392
|
+
/**
 * Singleton Excel processor instance.
 * Use this for standard Excel processing operations.
 *
 * The instance is constructed once, when this module is first evaluated, and
 * is the one used by the module-level helpers `isExcelFile` and `processExcel`.
 *
 * @example
 * ```typescript
 * import { excelProcessor } from "./ExcelProcessor.js";
 *
 * const result = await excelProcessor.processFile(fileInfo);
 * ```
 */
export const excelProcessor = new ExcelProcessor();
|
|
404
|
+
// =============================================================================
|
|
405
|
+
// HELPER FUNCTIONS
|
|
406
|
+
// =============================================================================
|
|
407
|
+
/**
|
|
408
|
+
* Check if a file is an Excel file.
|
|
409
|
+
* Matches by MIME type or file extension.
|
|
410
|
+
*
|
|
411
|
+
* @param mimetype - MIME type of the file
|
|
412
|
+
* @param filename - Filename (for extension-based detection)
|
|
413
|
+
* @returns true if the file is an Excel file
|
|
414
|
+
*
|
|
415
|
+
* @example
|
|
416
|
+
* ```typescript
|
|
417
|
+
* if (isExcelFile("application/vnd.ms-excel", "data.xls")) {
|
|
418
|
+
* // Process as Excel
|
|
419
|
+
* }
|
|
420
|
+
*
|
|
421
|
+
* if (isExcelFile("", "report.xlsx")) {
|
|
422
|
+
* // Also matches by extension
|
|
423
|
+
* }
|
|
424
|
+
* ```
|
|
425
|
+
*/
|
|
426
|
+
export function isExcelFile(mimetype, filename) {
|
|
427
|
+
return excelProcessor.isFileSupported(mimetype, filename);
|
|
428
|
+
}
|
|
429
|
+
/**
 * Check a byte count against the configured Excel size ceiling.
 * The ceiling is `SIZE_LIMITS.EXCEL_MAX_MB` converted to bytes; only the
 * upper bound is tested.
 *
 * @param sizeBytes - File size in bytes
 * @returns true when `sizeBytes` does not exceed the Excel limit
 *
 * @example
 * ```typescript
 * if (!validateExcelSize(fileInfo.size)) {
 *   console.error(`File too large: max ${SIZE_LIMITS.EXCEL_MAX_MB}MB`);
 * }
 * ```
 */
export function validateExcelSize(sizeBytes) {
    const BYTES_PER_MB = 1024 * 1024;
    const limitBytes = SIZE_LIMITS.EXCEL_MAX_MB * BYTES_PER_MB;
    return sizeBytes <= limitBytes;
}
|
|
446
|
+
/**
 * Process one Excel file through the shared `excelProcessor` instance.
 * Both arguments are forwarded unchanged to `excelProcessor.processFile`.
 *
 * @param fileInfo - File information (can include URL or buffer)
 * @param options - Optional processing options (auth headers, timeout, etc.)
 * @returns Promise settling with whatever `excelProcessor.processFile` yields
 *
 * @example
 * ```typescript
 * import { processExcel } from "./ExcelProcessor.js";
 *
 * const result = await processExcel(fileInfo, {
 *   authHeaders: { Authorization: "Bearer token" },
 *   timeout: 120000, // 2 minutes for large files
 * });
 *
 * if (result.success) {
 *   const { worksheets, totalRows, truncated } = result.data;
 *   console.log(`Extracted ${totalRows} rows from ${worksheets.length} sheets`);
 *
 *   if (truncated) {
 *     console.warn("Some data was truncated due to size limits");
 *   }
 * } else {
 *   console.error(`Processing failed: ${result.error?.userMessage}`);
 * }
 * ```
 */
export async function processExcel(fileInfo, options) {
    // Hand the delegate's promise straight back; the async wrapper adopts it.
    const pendingResult = excelProcessor.processFile(fileInfo, options);
    return pendingResult;
}
|
|
478
|
+
/**
 * Read the configured Excel file-size ceiling.
 *
 * @returns The value of `SIZE_LIMITS.EXCEL_MAX_MB` (megabytes)
 *
 * @example
 * ```typescript
 * const maxSize = getExcelMaxSizeMB();
 * console.log(`Maximum Excel file size: ${maxSize}MB`);
 * ```
 */
export function getExcelMaxSizeMB() {
    const { EXCEL_MAX_MB: maxSizeMb } = SIZE_LIMITS;
    return maxSizeMb;
}
|
|
492
|
+
/**
 * Read the configured per-worksheet row ceiling.
 *
 * @returns The value of `SIZE_LIMITS.EXCEL_MAX_ROWS`
 *
 * @example
 * ```typescript
 * const maxRows = getExcelMaxRows();
 * console.log(`Maximum rows per sheet: ${maxRows}`);
 * ```
 */
export function getExcelMaxRows() {
    const { EXCEL_MAX_ROWS: maxRows } = SIZE_LIMITS;
    return maxRows;
}
|
|
506
|
+
/**
 * Read the configured ceiling on worksheets processed per workbook.
 *
 * @returns The value of `SIZE_LIMITS.EXCEL_MAX_SHEETS`
 *
 * @example
 * ```typescript
 * const maxSheets = getExcelMaxSheets();
 * console.log(`Maximum sheets to process: ${maxSheets}`);
 * ```
 */
export function getExcelMaxSheets() {
    const { EXCEL_MAX_SHEETS: maxSheets } = SIZE_LIMITS;
    return maxSheets;
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* OpenDocument Processor
|
|
3
|
+
*
|
|
4
|
+
* Processes OpenDocument format files (.odt, .ods, .odp) by extracting
|
|
5
|
+
* text content from the internal XML structure.
|
|
6
|
+
*
|
|
7
|
+
* @module processors/document/OpenDocumentProcessor
|
|
8
|
+
*/
|
|
9
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
10
|
+
import type { FileInfo, FileProcessingResult, ProcessOptions } from "../base/types.js";
|
|
11
|
+
export type { ProcessedOpenDocument } from "../base/types.js";
|
|
12
|
+
import type { ProcessedOpenDocument } from "../base/types.js";
|
|
13
|
+
/**
 * OpenDocument Processor - handles .odt, .ods, .odp files
 *
 * OpenDocument files are ZIP archives containing XML content.
 * The main content is in content.xml within the archive.
 *
 * Priority: ~105 (between Word and Text)
 */
export declare class OpenDocumentProcessor extends BaseFileProcessor<ProcessedOpenDocument> {
    constructor();
    /**
     * Validate that the file is a valid ZIP archive (OpenDocument format).
     *
     * @param buffer - Downloaded file contents to inspect
     * @param _fileInfo - File metadata; the underscore prefix suggests it is
     *   unused by this override (confirm in the implementation)
     * @returns Promise of `string | null`; presumably an error message when
     *   validation fails and `null` when it passes (confirm against
     *   BaseFileProcessor)
     */
    protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
    /**
     * Build the processed result by extracting content from the OpenDocument.
     *
     * @param buffer - Raw file contents
     * @param fileInfo - Metadata for the file being processed
     * @returns The processed OpenDocument payload
     */
    protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedOpenDocument;
    /**
     * Decode HTML entities in a single pass to prevent double-unescaping.
     * Sequential replacement is vulnerable: "&amp;lt;" → "&lt;" → "<"
     * Single-pass avoids this by replacing each entity exactly once.
     */
    private decodeHtmlEntities;
    /**
     * Extract text content from OpenDocument XML
     */
    private extractTextFromXml;
    /**
     * Detect the OpenDocument format from file extension
     */
    private detectFormat;
    /**
     * Get file extension from filename
     */
    private getExtension;
}
|
|
50
|
+
/**
 * Module-level OpenDocumentProcessor instance exported for shared use.
 */
export declare const openDocumentProcessor: OpenDocumentProcessor;
|
|
54
|
+
/**
 * Report whether a file is an OpenDocument file, judged by MIME type or extension.
 *
 * @param mimetype - MIME type reported for the file
 * @param filename - File name whose extension may be inspected
 * @returns `true` when the file is recognised as OpenDocument
 */
export declare function isOpenDocumentFile(mimetype: string, filename: string): boolean;
|
|
58
|
+
/**
 * Check an OpenDocument file size against the configured limits.
 *
 * @param sizeBytes - File size in bytes
 * @returns Verdict of the size check (presumably `true` when the size is
 *   within the limit; confirm against the implementation)
 */
export declare function validateOpenDocumentSize(sizeBytes: number): boolean;
|
|
62
|
+
/**
 * Process a single OpenDocument file.
 *
 * @param fileInfo - Description of the file to process
 * @param options - Optional processing options
 * @returns Promise of the processing result wrapping a ProcessedOpenDocument
 */
export declare function processOpenDocument(fileInfo: FileInfo, options?: ProcessOptions): Promise<FileProcessingResult<ProcessedOpenDocument>>;
|
|
66
|
+
/**
 * Read the maximum OpenDocument file size that is allowed.
 *
 * @returns The size ceiling, in megabytes
 */
export declare function getOpenDocumentMaxSizeMB(): number;
|