@juspay/neurolink 9.1.1 → 9.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +54 -7
- package/dist/agent/directTools.d.ts +3 -3
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/lib/agent/directTools.d.ts +3 -3
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/server/utils/validation.d.ts +6 -6
- package/dist/lib/types/fileTypes.d.ts +1 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +18 -18
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/server/utils/validation.d.ts +6 -6
- package/dist/types/fileTypes.d.ts +1 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* RTF Document Processor
|
|
3
|
+
*
|
|
4
|
+
* Processes Rich Text Format (.rtf) files by extracting plain text content
|
|
5
|
+
* from RTF control codes. Uses a lightweight text extraction approach
|
|
6
|
+
* without requiring external dependencies.
|
|
7
|
+
*
|
|
8
|
+
* Key features:
|
|
9
|
+
* - RTF control code stripping
|
|
10
|
+
* - Text content extraction
|
|
11
|
+
* - Raw content preservation for debugging
|
|
12
|
+
* - No external dependencies required
|
|
13
|
+
*
|
|
14
|
+
* Priority: ~110 (document format, processed after binary formats)
|
|
15
|
+
*
|
|
16
|
+
* @module processors/document/RtfProcessor
|
|
17
|
+
*
|
|
18
|
+
* @example
|
|
19
|
+
* ```typescript
|
|
20
|
+
* import { rtfProcessor, processRtf, isRtfFile } from "./document/index.js";
|
|
21
|
+
*
|
|
22
|
+
* // Check if a file is an RTF file
|
|
23
|
+
* if (isRtfFile("application/rtf", "document.rtf")) {
|
|
24
|
+
* const result = await processRtf({
|
|
25
|
+
* id: "file-123",
|
|
26
|
+
* name: "document.rtf",
|
|
27
|
+
* mimetype: "application/rtf",
|
|
28
|
+
* size: 10240,
|
|
29
|
+
* buffer: rtfBuffer,
|
|
30
|
+
* });
|
|
31
|
+
*
|
|
32
|
+
* if (result.success) {
|
|
33
|
+
* console.log(`Text content: ${result.data.textContent}`);
|
|
34
|
+
* }
|
|
35
|
+
* }
|
|
36
|
+
* ```
|
|
37
|
+
*/
|
|
38
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
39
|
+
import { SIZE_LIMITS } from "../config/index.js";
|
|
40
|
+
// =============================================================================
|
|
41
|
+
// CONSTANTS
|
|
42
|
+
// =============================================================================
|
|
43
|
+
/** MIME types under which a file is accepted as an RTF document. */
const SUPPORTED_RTF_MIME_TYPES = ["application/rtf", "text/rtf", "text/richtext"];
/** File extensions (with the leading dot) recognised as RTF. */
const SUPPORTED_RTF_EXTENSIONS = [".rtf"];
/** Time budget for processing one RTF file, in milliseconds (30 seconds). */
const RTF_TIMEOUT_MS = 30 * 1000;
|
|
59
|
+
// =============================================================================
|
|
60
|
+
// RTF PROCESSOR CLASS
|
|
61
|
+
// =============================================================================
|
|
62
|
+
/** Header signature every RTF document starts with. */
const RTF_SIGNATURE = "{\\rtf";
/**
 * Destination groups whose content is metadata or page furniture rather than
 * body text. Includes the left/right/first-page header and footer variants.
 */
const RTF_SKIPPED_DESTINATIONS = new Set([
    "fonttbl",
    "colortbl",
    "stylesheet",
    "info",
    "pict",
    "object",
    "header",
    "headerl",
    "headerr",
    "headerf",
    "footer",
    "footerl",
    "footerr",
    "footerf",
]);
/** Plain-text stand-ins for control words that represent a character or a break. */
const RTF_CONTROL_WORD_TEXT = new Map([
    ["par", "\n"],
    ["line", "\n"],
    ["row", "\n"], // end of a table row
    ["tab", "\t"],
    ["cell", "\t"], // end of a table cell; keeps neighbouring cells apart
    ["emdash", "\u2014"],
    ["endash", "\u2013"],
    ["bullet", "\u2022"],
    ["lquote", "'"],
    ["rquote", "'"],
    ["ldblquote", '"'],
    ["rdblquote", '"'],
]);
/**
 * Windows-1252 code points for bytes 0x80-0x9F (index = byte - 0x80).
 * Bytes that Windows-1252 leaves undefined map to themselves.
 * NOTE(review): assumes the document's ANSI code page is 1252; \ansicpg is
 * not inspected.
 */
const RTF_CP1252_HIGH = [
    0x20ac, 0x0081, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
    0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008d, 0x017d, 0x008f,
    0x0090, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
    0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x009d, 0x017e, 0x0178,
];
// Sticky patterns: matched in place at a given offset, so the scanner never
// has to copy the remainder of the document for each control sequence.
const RTF_DESTINATION_RE = /\\([a-z]+)/y;
const RTF_HEX_ESCAPE_RE = /\\'([0-9a-f]{2})/iy;
const RTF_UNICODE_ESCAPE_RE = /\\u(-?\d+) ?/y;
const RTF_CONTROL_WORD_RE = /\\([a-z]+)(-?\d+)? ?/iy;
/** Run a sticky pattern against `text` starting exactly at `index`. */
function rtfMatchAt(pattern, text, index) {
    pattern.lastIndex = index;
    return pattern.exec(text);
}
/** Decode one byte from a \'hh escape, treating 0x80-0x9F as Windows-1252. */
function rtfDecodeAnsiByte(byte) {
    if (byte >= 0x80 && byte <= 0x9f) {
        return String.fromCharCode(RTF_CP1252_HIGH[byte - 0x80]);
    }
    return String.fromCharCode(byte);
}
/** Tell whether the group opening just before `index` should be dropped entirely. */
function rtfIsSkippedDestination(text, index) {
    // "\*" marks a destination that readers may ignore when they do not
    // implement it (generator, list tables, theme data, ...).
    if (text.startsWith("\\*", index)) {
        return true;
    }
    const destination = rtfMatchAt(RTF_DESTINATION_RE, text, index);
    return destination !== null && RTF_SKIPPED_DESTINATIONS.has(destination[1]);
}
/**
 * Step over the ANSI fallback that follows a \uN escape.
 * Each fallback unit is one plain character or one \'hh escape; line breaks
 * in the RTF source are not counted, and a brace or any other control
 * sequence ends the fallback early so real content is never swallowed.
 */
function rtfSkipUnicodeFallback(text, index, count) {
    let cursor = index;
    let remaining = count;
    while (remaining > 0 && cursor < text.length) {
        const ch = text[cursor];
        if (ch === "\r" || ch === "\n") {
            cursor++;
            continue;
        }
        if (ch === "{" || ch === "}") {
            break;
        }
        if (ch === "\\") {
            if (rtfMatchAt(RTF_HEX_ESCAPE_RE, text, cursor) === null) {
                break;
            }
            cursor += 4;
        }
        else {
            cursor++;
        }
        remaining--;
    }
    return cursor;
}
/**
 * RTF Processor - handles Rich Text Format files.
 *
 * Extracts plain text from RTF documents by stripping RTF control codes.
 * This is a lightweight implementation that doesn't require external
 * RTF parsing libraries.
 *
 * Priority: ~110 (document format)
 *
 * @example
 * ```typescript
 * const processor = new RtfProcessor();
 *
 * const result = await processor.processFile({
 *   id: "file-123",
 *   name: "report.rtf",
 *   mimetype: "application/rtf",
 *   size: 5120,
 *   buffer: rtfBuffer,
 * });
 *
 * if (result.success) {
 *   console.log("Extracted text:", result.data.textContent);
 * }
 * ```
 */
export class RtfProcessor extends BaseFileProcessor {
    constructor() {
        super({
            maxSizeMB: SIZE_LIMITS.DOCUMENT_MAX_MB,
            timeoutMs: RTF_TIMEOUT_MS,
            supportedMimeTypes: SUPPORTED_RTF_MIME_TYPES,
            supportedExtensions: SUPPORTED_RTF_EXTENSIONS,
            fileTypeName: "RTF",
            defaultFilename: "document.rtf",
        });
    }
    /**
     * Validate downloaded RTF document.
     * Checks for the RTF header signature "{\rtf" and, when it is missing,
     * distinguishes an HTML page (typically an error response) from other
     * non-RTF content.
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information (unused)
     * @returns null if valid, error message if invalid
     */
    async validateDownloadedFile(buffer, _fileInfo) {
        if (buffer.length < RTF_SIGNATURE.length) {
            return "Invalid RTF document - file too small";
        }
        // latin1 maps each byte to one code unit; "ascii" would clear the high
        // bit and let bytes such as 0xFB pass for "{".
        const header = buffer.subarray(0, RTF_SIGNATURE.length).toString("latin1");
        if (header === RTF_SIGNATURE) {
            return null;
        }
        // Case-insensitive so "<!doctype html>" and "<HTML>" are recognised too.
        const preview = buffer.subarray(0, 100).toString("utf8").toLowerCase();
        if (preview.includes("<!doctype") || preview.includes("<html")) {
            return "Invalid RTF document - received HTML response instead of file content";
        }
        return "Invalid RTF document - missing RTF header signature";
    }
    /**
     * Build the processed RTF result.
     * Extracts plain text by stripping RTF control codes.
     *
     * @param buffer - Raw file content
     * @param fileInfo - Original file information
     * @returns Extracted text, the raw RTF source, the original buffer, and
     *   the file's MIME type (defaulting to "application/rtf"), size and name
     */
    buildProcessedResult(buffer, fileInfo) {
        const rawContent = buffer.toString("utf-8");
        const textContent = this.extractText(rawContent);
        return {
            textContent,
            rawContent,
            buffer,
            mimetype: fileInfo.mimetype || "application/rtf",
            size: fileInfo.size,
            filename: this.getFilename(fileInfo),
        };
    }
    /**
     * Extract plain text from RTF content.
     *
     * Single pass over the source that handles:
     * - Metadata destinations ({\fonttbl...}, {\info...}, headers, footers,
     *   pictures, objects) and ignorable {\*\...} destinations, which are
     *   dropped together with everything nested inside them
     * - Escaped literals \\, \{ and \}
     * - \'hh hex escapes (decoded as Windows-1252)
     * - \uN Unicode escapes, including removal of the ANSI fallback that
     *   follows them (length controlled by \ucN, default 1)
     * - \par, \line, \row and backslash-newline as line breaks; \tab and
     *   \cell as tabs; dash, bullet and quote control words
     * - Control symbols \~ (non-breaking space), \_ (non-breaking hyphen)
     *   and \- (optional hyphen)
     *
     * Raw CR/LF in the source carry no meaning in RTF and are discarded.
     * In the result, runs of spaces/tabs collapse to one space, spaces next
     * to a line break are removed, and three or more consecutive line
     * breaks collapse to two.
     *
     * @param rtf - Raw RTF content
     * @returns Extracted plain text
     */
    extractText(rtf) {
        let output = "";
        let depth = 0;
        // Depth of the group currently being skipped; 0 means "not skipping".
        let skipDepth = 0;
        // Number of fallback units following each \uN (\ucN); scoped per group.
        let fallbackCount = 1;
        const fallbackStack = [];
        let pos = 0;
        while (pos < rtf.length) {
            const ch = rtf[pos];
            if (ch === "\\") {
                const next = rtf[pos + 1];
                // Escaped literals are resolved before brace counting so that
                // "\{" or "\}" inside a skipped group cannot unbalance depth.
                if (next === "\\" || next === "{" || next === "}") {
                    if (skipDepth === 0) {
                        output += next;
                    }
                    pos += 2;
                    continue;
                }
            }
            if (ch === "{") {
                depth++;
                fallbackStack.push(fallbackCount);
                if (skipDepth === 0 && rtfIsSkippedDestination(rtf, pos + 1)) {
                    skipDepth = depth;
                }
                pos++;
                continue;
            }
            if (ch === "}") {
                depth--;
                if (fallbackStack.length > 0) {
                    fallbackCount = fallbackStack.pop();
                }
                if (skipDepth !== 0 && depth < skipDepth) {
                    skipDepth = 0;
                }
                pos++;
                continue;
            }
            if (skipDepth !== 0) {
                pos++;
                continue;
            }
            if (ch === "\\") {
                // Hex escape like \'e9
                const hex = rtfMatchAt(RTF_HEX_ESCAPE_RE, rtf, pos);
                if (hex) {
                    output += rtfDecodeAnsiByte(parseInt(hex[1], 16));
                    pos += hex[0].length;
                    continue;
                }
                // Unicode escape like \u233? or \u8217\'92
                const unicode = rtfMatchAt(RTF_UNICODE_ESCAPE_RE, rtf, pos);
                if (unicode) {
                    const signed = parseInt(unicode[1], 10);
                    // \uN is a signed 16-bit value; negatives wrap into 0x8000-0xFFFF.
                    output += String.fromCharCode(signed < 0 ? signed + 65536 : signed);
                    pos = rtfSkipUnicodeFallback(rtf, pos + unicode[0].length, fallbackCount);
                    continue;
                }
                // Control word, optional numeric parameter, optional delimiter space
                const control = rtfMatchAt(RTF_CONTROL_WORD_RE, rtf, pos);
                if (control) {
                    const word = control[1].toLowerCase();
                    if (word === "uc" && control[2] !== undefined) {
                        fallbackCount = Math.max(0, parseInt(control[2], 10));
                    }
                    else {
                        output += RTF_CONTROL_WORD_TEXT.get(word) || "";
                    }
                    pos += control[0].length;
                    continue;
                }
                // Control symbol: backslash plus one non-letter character.
                const symbol = rtf[pos + 1];
                if (symbol === "~") {
                    output += " ";
                }
                else if (symbol === "_") {
                    output += "-";
                }
                else if (symbol === "\r" || symbol === "\n") {
                    // Backslash-newline is equivalent to \par.
                    output += "\n";
                }
                // "\-" (optional hyphen) and unknown symbols produce no text.
                pos += 2;
                continue;
            }
            // Regular character
            if (ch !== "\r" && ch !== "\n") {
                output += ch;
            }
            pos++;
        }
        return output
            .replace(/[^\S\n]+/g, " ") // Collapse horizontal whitespace, keep line breaks
            .replace(/ +\n/g, "\n") // Remove trailing spaces before newlines
            .replace(/\n +/g, "\n") // Remove leading spaces after newlines
            .replace(/\n{3,}/g, "\n\n") // Collapse multiple newlines
            .trim();
    }
}
|
|
299
|
+
// =============================================================================
|
|
300
|
+
// SINGLETON INSTANCE
|
|
301
|
+
// =============================================================================
|
|
302
|
+
/**
|
|
303
|
+
* Singleton instance of the RtfProcessor.
|
|
304
|
+
* Use this for all RTF document processing to share configuration.
|
|
305
|
+
*/
|
|
306
|
+
// Instantiated once at module load; isRtfFile() and processRtf() below delegate to this instance.
export const rtfProcessor = new RtfProcessor();
|
|
307
|
+
// =============================================================================
|
|
308
|
+
// HELPER FUNCTIONS
|
|
309
|
+
// =============================================================================
|
|
310
|
+
/**
|
|
311
|
+
* Check if a file is an RTF document.
|
|
312
|
+
*
|
|
313
|
+
* @param mimetype - MIME type of the file
|
|
314
|
+
* @param filename - Filename for detection
|
|
315
|
+
* @returns true if the file is a supported RTF document
|
|
316
|
+
*
|
|
317
|
+
* @example
|
|
318
|
+
* ```typescript
|
|
319
|
+
* if (isRtfFile("application/rtf", "document.rtf")) {
|
|
320
|
+
* console.log("This is an RTF document");
|
|
321
|
+
* }
|
|
322
|
+
* ```
|
|
323
|
+
*/
|
|
324
|
+
export function isRtfFile(mimetype, filename) {
|
|
325
|
+
return rtfProcessor.isFileSupported(mimetype, filename);
|
|
326
|
+
}
|
|
327
|
+
/**
|
|
328
|
+
* Validate RTF document size against configured limit.
|
|
329
|
+
*
|
|
330
|
+
* @param sizeBytes - File size in bytes
|
|
331
|
+
* @returns true if size is within the allowed limit
|
|
332
|
+
*/
|
|
333
|
+
export function validateRtfSize(sizeBytes) {
|
|
334
|
+
const maxBytes = SIZE_LIMITS.DOCUMENT_MAX_MB * 1024 * 1024;
|
|
335
|
+
return sizeBytes <= maxBytes;
|
|
336
|
+
}
|
|
337
|
+
/**
|
|
338
|
+
* Process an RTF document.
|
|
339
|
+
*
|
|
340
|
+
* @param fileInfo - File information (can include URL or buffer)
|
|
341
|
+
* @param options - Optional processing options
|
|
342
|
+
* @returns Processing result with success flag and either data or error
|
|
343
|
+
*
|
|
344
|
+
* @example
|
|
345
|
+
* ```typescript
|
|
346
|
+
* const result = await processRtf({
|
|
347
|
+
* id: "file-123",
|
|
348
|
+
* name: "report.rtf",
|
|
349
|
+
* mimetype: "application/rtf",
|
|
350
|
+
* size: 10240,
|
|
351
|
+
* buffer: rtfBuffer,
|
|
352
|
+
* });
|
|
353
|
+
*
|
|
354
|
+
* if (result.success) {
|
|
355
|
+
* console.log("Extracted text:", result.data.textContent);
|
|
356
|
+
* }
|
|
357
|
+
* ```
|
|
358
|
+
*/
|
|
359
|
+
export async function processRtf(fileInfo, options) {
|
|
360
|
+
return rtfProcessor.processFile(fileInfo, options);
|
|
361
|
+
}
|
|
362
|
+
//# sourceMappingURL=RtfProcessor.js.map
|
|
@@ -0,0 +1,168 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Word Document Processing Utility
|
|
3
|
+
*
|
|
4
|
+
* Handles downloading, validating, and processing Word (.docx, .doc) files.
|
|
5
|
+
* Uses mammoth library to extract text and HTML content from Word documents.
|
|
6
|
+
*
|
|
7
|
+
* Features:
|
|
8
|
+
* - DOCX format validation via ZIP/PK signature check
|
|
9
|
+
* - Text extraction using mammoth.extractRawText()
|
|
10
|
+
* - HTML conversion using mammoth.convertToHtml()
|
|
11
|
+
* - Warning collection from mammoth processing
|
|
12
|
+
* - Support for both URL downloads and direct buffer input
|
|
13
|
+
*
|
|
14
|
+
* @module processors/document/WordProcessor
|
|
15
|
+
*
|
|
16
|
+
* @example
|
|
17
|
+
* ```typescript
|
|
18
|
+
* import { wordProcessor, processWord, isWordFile } from "./WordProcessor.js";
|
|
19
|
+
*
|
|
20
|
+
* // Check if file is supported
|
|
21
|
+
* if (isWordFile(file.mimetype, file.name)) {
|
|
22
|
+
* const result = await processWord(fileInfo, {
|
|
23
|
+
* authHeaders: { Authorization: "Bearer token" },
|
|
24
|
+
* });
|
|
25
|
+
*
|
|
26
|
+
* if (result.success) {
|
|
27
|
+
* console.log("Text:", result.data.textContent);
|
|
28
|
+
* console.log("HTML:", result.data.htmlContent);
|
|
29
|
+
* console.log("Warnings:", result.data.warnings);
|
|
30
|
+
* }
|
|
31
|
+
* }
|
|
32
|
+
* ```
|
|
33
|
+
*/
|
|
34
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
35
|
+
import type { FileInfo, FileProcessingResult, ProcessOptions } from "../base/types.js";
|
|
36
|
+
export type { ProcessedWord } from "../base/types.js";
|
|
37
|
+
import type { ProcessedWord } from "../base/types.js";
|
|
38
|
+
/**
 * Processor for Word documents (.docx and .doc).
 *
 * Text and HTML are both produced with the mammoth library. Because a DOCX
 * file is a ZIP archive, validation looks for the ZIP "PK" signature at the
 * start of the content.
 *
 * @example
 * ```typescript
 * const processor = new WordProcessor();
 *
 * // Check if file is supported
 * if (processor.isFileSupported("application/msword", "report.doc")) {
 *   const result = await processor.processFile(fileInfo);
 *   if (result.success) {
 *     console.log("Extracted text:", result.data.textContent);
 *   }
 * }
 * ```
 */
export declare class WordProcessor extends BaseFileProcessor<ProcessedWord> {
    constructor();
    /**
     * Check that downloaded content carries the expected magic bytes.
     * DOCX files are ZIP archives, which begin with the PK signature
     * (0x50 0x4B).
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information
     * @returns null if valid, error message if invalid
     */
    protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
    /**
     * Synchronous result builder required by the base class.
     *
     * Only a placeholder for this processor: mammoth's extraction is
     * asynchronous, so the real work is done in the overridden
     * {@link WordProcessor.processFile} and this method returns an empty
     * result.
     *
     * @param buffer - Downloaded file content
     * @param fileInfo - Original file information
     * @returns Processed Word result (placeholder)
     */
    protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedWord;
    /**
     * Process a Word document, overriding the base implementation so the
     * asynchronous mammoth calls can be awaited.
     *
     * Processing steps:
     * 1. Validate file type and size
     * 2. Get buffer (download from URL or use provided buffer)
     * 3. Validate downloaded file (check PK signature)
     * 4. Extract text with mammoth.extractRawText()
     * 5. Convert to HTML with mammoth.convertToHtml()
     * 6. Collect any warnings from mammoth
     * 7. Return structured result
     *
     * @param fileInfo - File information with URL or buffer
     * @param options - Optional processing options
     * @returns Processing result with text, HTML, and warnings
     */
    processFile(fileInfo: FileInfo, options?: ProcessOptions): Promise<FileProcessingResult<ProcessedWord>>;
}
|
|
102
|
+
/**
 * Shared WordProcessor instance.
 * Prefer this over constructing new processors for ordinary use.
 */
export declare const wordProcessor: WordProcessor;
|
|
107
|
+
/**
 * Report whether a file is a Word document (.docx or .doc).
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is a supported Word document
 *
 * @example
 * ```typescript
 * if (isWordFile(file.mimetype, file.name)) {
 *   const result = await processWord(file);
 * }
 * ```
 */
export declare function isWordFile(mimetype: string, filename: string): boolean;
|
|
122
|
+
/**
 * Check a Word document's size against the configured limit.
 *
 * @param sizeBytes - File size in bytes
 * @returns true if size is within the allowed limit
 *
 * @example
 * ```typescript
 * if (!validateWordSize(file.size)) {
 *   throw new Error("Word document exceeds the configured size limit");
 * }
 * ```
 */
export declare function validateWordSize(sizeBytes: number): boolean;
|
|
136
|
+
/**
 * Process one Word document using the shared `wordProcessor` instance.
 *
 * @param fileInfo - File information with URL or buffer
 * @param options - Optional processing options (auth headers, timeout, retry config)
 * @returns Processing result with extracted text, HTML, and warnings
 *
 * @example
 * ```typescript
 * const result = await processWord({
 *   id: "doc-123",
 *   name: "report.docx",
 *   mimetype: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 *   size: 12345,
 *   url: "https://example.com/files/report.docx",
 * }, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   console.log("Text content:", result.data.textContent);
 *   console.log("HTML content:", result.data.htmlContent);
 *   if (result.data.warnings.length > 0) {
 *     console.warn("Warnings:", result.data.warnings);
 *   }
 * } else {
 *   console.error("Failed:", result.error.userMessage);
 * }
 * ```
 */
export declare function processWord(fileInfo: FileInfo, options?: ProcessOptions): Promise<FileProcessingResult<ProcessedWord>>;
|