@juspay/neurolink 9.1.1 → 9.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237) hide show
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +54 -7
  3. package/dist/agent/directTools.d.ts +3 -3
  4. package/dist/cli/commands/config.d.ts +6 -6
  5. package/dist/image-gen/ImageGenService.d.ts +143 -0
  6. package/dist/image-gen/ImageGenService.js +345 -0
  7. package/dist/image-gen/imageGenTools.d.ts +126 -0
  8. package/dist/image-gen/imageGenTools.js +304 -0
  9. package/dist/image-gen/index.d.ts +46 -0
  10. package/dist/image-gen/index.js +48 -0
  11. package/dist/image-gen/types.d.ts +237 -0
  12. package/dist/image-gen/types.js +24 -0
  13. package/dist/lib/agent/directTools.d.ts +3 -3
  14. package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
  15. package/dist/lib/image-gen/ImageGenService.js +346 -0
  16. package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
  17. package/dist/lib/image-gen/imageGenTools.js +305 -0
  18. package/dist/lib/image-gen/index.d.ts +46 -0
  19. package/dist/lib/image-gen/index.js +49 -0
  20. package/dist/lib/image-gen/types.d.ts +237 -0
  21. package/dist/lib/image-gen/types.js +25 -0
  22. package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
  23. package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
  24. package/dist/lib/processors/base/index.d.ts +14 -0
  25. package/dist/lib/processors/base/index.js +20 -0
  26. package/dist/lib/processors/base/types.d.ts +593 -0
  27. package/dist/lib/processors/base/types.js +77 -0
  28. package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
  29. package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
  30. package/dist/lib/processors/cli/index.d.ts +37 -0
  31. package/dist/lib/processors/cli/index.js +50 -0
  32. package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
  33. package/dist/lib/processors/code/ConfigProcessor.js +401 -0
  34. package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
  35. package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
  36. package/dist/lib/processors/code/index.d.ts +44 -0
  37. package/dist/lib/processors/code/index.js +61 -0
  38. package/dist/lib/processors/config/fileTypes.d.ts +283 -0
  39. package/dist/lib/processors/config/fileTypes.js +521 -0
  40. package/dist/lib/processors/config/index.d.ts +32 -0
  41. package/dist/lib/processors/config/index.js +93 -0
  42. package/dist/lib/processors/config/languageMap.d.ts +66 -0
  43. package/dist/lib/processors/config/languageMap.js +411 -0
  44. package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
  45. package/dist/lib/processors/config/mimeTypes.js +339 -0
  46. package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
  47. package/dist/lib/processors/config/sizeLimits.js +247 -0
  48. package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
  49. package/dist/lib/processors/data/JsonProcessor.js +204 -0
  50. package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
  51. package/dist/lib/processors/data/XmlProcessor.js +284 -0
  52. package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
  53. package/dist/lib/processors/data/YamlProcessor.js +295 -0
  54. package/dist/lib/processors/data/index.d.ts +49 -0
  55. package/dist/lib/processors/data/index.js +77 -0
  56. package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
  57. package/dist/lib/processors/document/ExcelProcessor.js +520 -0
  58. package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
  59. package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
  60. package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
  61. package/dist/lib/processors/document/RtfProcessor.js +362 -0
  62. package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
  63. package/dist/lib/processors/document/WordProcessor.js +354 -0
  64. package/dist/lib/processors/document/index.d.ts +54 -0
  65. package/dist/lib/processors/document/index.js +91 -0
  66. package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
  67. package/dist/lib/processors/errors/FileErrorCode.js +256 -0
  68. package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
  69. package/dist/lib/processors/errors/errorHelpers.js +379 -0
  70. package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
  71. package/dist/lib/processors/errors/errorSerializer.js +508 -0
  72. package/dist/lib/processors/errors/index.d.ts +46 -0
  73. package/dist/lib/processors/errors/index.js +50 -0
  74. package/dist/lib/processors/index.d.ts +76 -0
  75. package/dist/lib/processors/index.js +113 -0
  76. package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
  77. package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
  78. package/dist/lib/processors/integration/index.d.ts +42 -0
  79. package/dist/lib/processors/integration/index.js +45 -0
  80. package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
  81. package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
  82. package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
  83. package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
  84. package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
  85. package/dist/lib/processors/markup/SvgProcessor.js +241 -0
  86. package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
  87. package/dist/lib/processors/markup/TextProcessor.js +189 -0
  88. package/dist/lib/processors/markup/index.d.ts +66 -0
  89. package/dist/lib/processors/markup/index.js +103 -0
  90. package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
  91. package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
  92. package/dist/lib/processors/registry/index.d.ts +12 -0
  93. package/dist/lib/processors/registry/index.js +17 -0
  94. package/dist/lib/processors/registry/types.d.ts +53 -0
  95. package/dist/lib/processors/registry/types.js +11 -0
  96. package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
  97. package/dist/lib/server/utils/validation.d.ts +6 -6
  98. package/dist/lib/types/fileTypes.d.ts +1 -1
  99. package/dist/lib/types/index.d.ts +25 -24
  100. package/dist/lib/types/index.js +21 -20
  101. package/dist/lib/types/modelTypes.d.ts +18 -18
  102. package/dist/lib/types/pptTypes.d.ts +14 -2
  103. package/dist/lib/types/pptTypes.js +16 -0
  104. package/dist/lib/utils/async/delay.d.ts +40 -0
  105. package/dist/lib/utils/async/delay.js +43 -0
  106. package/dist/lib/utils/async/index.d.ts +23 -0
  107. package/dist/lib/utils/async/index.js +24 -0
  108. package/dist/lib/utils/async/retry.d.ts +141 -0
  109. package/dist/lib/utils/async/retry.js +172 -0
  110. package/dist/lib/utils/async/withTimeout.d.ts +73 -0
  111. package/dist/lib/utils/async/withTimeout.js +97 -0
  112. package/dist/lib/utils/fileDetector.d.ts +7 -1
  113. package/dist/lib/utils/fileDetector.js +91 -18
  114. package/dist/lib/utils/json/extract.d.ts +103 -0
  115. package/dist/lib/utils/json/extract.js +249 -0
  116. package/dist/lib/utils/json/index.d.ts +36 -0
  117. package/dist/lib/utils/json/index.js +37 -0
  118. package/dist/lib/utils/json/safeParse.d.ts +137 -0
  119. package/dist/lib/utils/json/safeParse.js +191 -0
  120. package/dist/lib/utils/messageBuilder.d.ts +2 -2
  121. package/dist/lib/utils/messageBuilder.js +15 -7
  122. package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
  123. package/dist/lib/utils/sanitizers/filename.js +366 -0
  124. package/dist/lib/utils/sanitizers/html.d.ts +170 -0
  125. package/dist/lib/utils/sanitizers/html.js +326 -0
  126. package/dist/lib/utils/sanitizers/index.d.ts +26 -0
  127. package/dist/lib/utils/sanitizers/index.js +30 -0
  128. package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
  129. package/dist/lib/utils/sanitizers/svg.js +483 -0
  130. package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
  131. package/dist/processors/base/BaseFileProcessor.js +613 -0
  132. package/dist/processors/base/index.d.ts +14 -0
  133. package/dist/processors/base/index.js +19 -0
  134. package/dist/processors/base/types.d.ts +593 -0
  135. package/dist/processors/base/types.js +76 -0
  136. package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
  137. package/dist/processors/cli/fileProcessorCli.js +388 -0
  138. package/dist/processors/cli/index.d.ts +37 -0
  139. package/dist/processors/cli/index.js +49 -0
  140. package/dist/processors/code/ConfigProcessor.d.ts +171 -0
  141. package/dist/processors/code/ConfigProcessor.js +400 -0
  142. package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
  143. package/dist/processors/code/SourceCodeProcessor.js +304 -0
  144. package/dist/processors/code/index.d.ts +44 -0
  145. package/dist/processors/code/index.js +60 -0
  146. package/dist/processors/config/fileTypes.d.ts +283 -0
  147. package/dist/processors/config/fileTypes.js +520 -0
  148. package/dist/processors/config/index.d.ts +32 -0
  149. package/dist/processors/config/index.js +92 -0
  150. package/dist/processors/config/languageMap.d.ts +66 -0
  151. package/dist/processors/config/languageMap.js +410 -0
  152. package/dist/processors/config/mimeTypes.d.ts +376 -0
  153. package/dist/processors/config/mimeTypes.js +338 -0
  154. package/dist/processors/config/sizeLimits.d.ts +194 -0
  155. package/dist/processors/config/sizeLimits.js +246 -0
  156. package/dist/processors/data/JsonProcessor.d.ts +122 -0
  157. package/dist/processors/data/JsonProcessor.js +203 -0
  158. package/dist/processors/data/XmlProcessor.d.ts +160 -0
  159. package/dist/processors/data/XmlProcessor.js +283 -0
  160. package/dist/processors/data/YamlProcessor.d.ts +163 -0
  161. package/dist/processors/data/YamlProcessor.js +294 -0
  162. package/dist/processors/data/index.d.ts +49 -0
  163. package/dist/processors/data/index.js +76 -0
  164. package/dist/processors/document/ExcelProcessor.d.ts +238 -0
  165. package/dist/processors/document/ExcelProcessor.js +519 -0
  166. package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
  167. package/dist/processors/document/OpenDocumentProcessor.js +210 -0
  168. package/dist/processors/document/RtfProcessor.d.ts +152 -0
  169. package/dist/processors/document/RtfProcessor.js +361 -0
  170. package/dist/processors/document/WordProcessor.d.ts +168 -0
  171. package/dist/processors/document/WordProcessor.js +353 -0
  172. package/dist/processors/document/index.d.ts +54 -0
  173. package/dist/processors/document/index.js +90 -0
  174. package/dist/processors/errors/FileErrorCode.d.ts +98 -0
  175. package/dist/processors/errors/FileErrorCode.js +255 -0
  176. package/dist/processors/errors/errorHelpers.d.ts +151 -0
  177. package/dist/processors/errors/errorHelpers.js +378 -0
  178. package/dist/processors/errors/errorSerializer.d.ts +139 -0
  179. package/dist/processors/errors/errorSerializer.js +507 -0
  180. package/dist/processors/errors/index.d.ts +46 -0
  181. package/dist/processors/errors/index.js +49 -0
  182. package/dist/processors/index.d.ts +76 -0
  183. package/dist/processors/index.js +112 -0
  184. package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
  185. package/dist/processors/integration/FileProcessorIntegration.js +272 -0
  186. package/dist/processors/integration/index.d.ts +42 -0
  187. package/dist/processors/integration/index.js +44 -0
  188. package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
  189. package/dist/processors/markup/HtmlProcessor.js +249 -0
  190. package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
  191. package/dist/processors/markup/MarkdownProcessor.js +244 -0
  192. package/dist/processors/markup/SvgProcessor.d.ts +156 -0
  193. package/dist/processors/markup/SvgProcessor.js +240 -0
  194. package/dist/processors/markup/TextProcessor.d.ts +135 -0
  195. package/dist/processors/markup/TextProcessor.js +188 -0
  196. package/dist/processors/markup/index.d.ts +66 -0
  197. package/dist/processors/markup/index.js +102 -0
  198. package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
  199. package/dist/processors/registry/ProcessorRegistry.js +608 -0
  200. package/dist/processors/registry/index.d.ts +12 -0
  201. package/dist/processors/registry/index.js +16 -0
  202. package/dist/processors/registry/types.d.ts +53 -0
  203. package/dist/processors/registry/types.js +10 -0
  204. package/dist/server/utils/validation.d.ts +6 -6
  205. package/dist/types/fileTypes.d.ts +1 -1
  206. package/dist/types/index.d.ts +25 -24
  207. package/dist/types/index.js +21 -20
  208. package/dist/types/modelTypes.d.ts +10 -10
  209. package/dist/types/pptTypes.d.ts +14 -2
  210. package/dist/types/pptTypes.js +16 -0
  211. package/dist/utils/async/delay.d.ts +40 -0
  212. package/dist/utils/async/delay.js +42 -0
  213. package/dist/utils/async/index.d.ts +23 -0
  214. package/dist/utils/async/index.js +23 -0
  215. package/dist/utils/async/retry.d.ts +141 -0
  216. package/dist/utils/async/retry.js +171 -0
  217. package/dist/utils/async/withTimeout.d.ts +73 -0
  218. package/dist/utils/async/withTimeout.js +96 -0
  219. package/dist/utils/fileDetector.d.ts +7 -1
  220. package/dist/utils/fileDetector.js +91 -18
  221. package/dist/utils/json/extract.d.ts +103 -0
  222. package/dist/utils/json/extract.js +248 -0
  223. package/dist/utils/json/index.d.ts +36 -0
  224. package/dist/utils/json/index.js +36 -0
  225. package/dist/utils/json/safeParse.d.ts +137 -0
  226. package/dist/utils/json/safeParse.js +190 -0
  227. package/dist/utils/messageBuilder.d.ts +2 -2
  228. package/dist/utils/messageBuilder.js +15 -7
  229. package/dist/utils/sanitizers/filename.d.ts +137 -0
  230. package/dist/utils/sanitizers/filename.js +365 -0
  231. package/dist/utils/sanitizers/html.d.ts +170 -0
  232. package/dist/utils/sanitizers/html.js +325 -0
  233. package/dist/utils/sanitizers/index.d.ts +26 -0
  234. package/dist/utils/sanitizers/index.js +29 -0
  235. package/dist/utils/sanitizers/svg.d.ts +81 -0
  236. package/dist/utils/sanitizers/svg.js +482 -0
  237. package/package.json +2 -2
@@ -0,0 +1,362 @@
1
+ /**
2
+ * RTF Document Processor
3
+ *
4
+ * Processes Rich Text Format (.rtf) files by extracting plain text content
5
+ * from RTF control codes. Uses a lightweight text extraction approach
6
+ * without requiring external dependencies.
7
+ *
8
+ * Key features:
9
+ * - RTF control code stripping
10
+ * - Text content extraction
11
+ * - Raw content preservation for debugging
12
+ * - No external dependencies required
13
+ *
14
+ * Priority: ~110 (document format, processed after binary formats)
15
+ *
16
+ * @module processors/document/RtfProcessor
17
+ *
18
+ * @example
19
+ * ```typescript
20
+ * import { rtfProcessor, processRtf, isRtfFile } from "./document/index.js";
21
+ *
22
+ * // Check if a file is an RTF file
23
+ * if (isRtfFile("application/rtf", "document.rtf")) {
24
+ * const result = await processRtf({
25
+ * id: "file-123",
26
+ * name: "document.rtf",
27
+ * mimetype: "application/rtf",
28
+ * size: 10240,
29
+ * buffer: rtfBuffer,
30
+ * });
31
+ *
32
+ * if (result.success) {
33
+ * console.log(`Text content: ${result.data.textContent}`);
34
+ * }
35
+ * }
36
+ * ```
37
+ */
38
+ import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
39
+ import { SIZE_LIMITS } from "../config/index.js";
40
+ // =============================================================================
41
+ // CONSTANTS
42
+ // =============================================================================
/**
 * MIME types under which an RTF document may be delivered.
 */
const SUPPORTED_RTF_MIME_TYPES = ["application/rtf", "text/rtf", "text/richtext"];
/**
 * Filename extensions recognised as RTF.
 */
const SUPPORTED_RTF_EXTENSIONS = [".rtf"];
/**
 * Processing timeout for RTF files, in milliseconds (30 seconds).
 */
const RTF_TIMEOUT_MS = 30 * 1000;
59
+ // =============================================================================
60
+ // RTF PROCESSOR CLASS
61
+ // =============================================================================
62
+ /**
63
+ * RTF Processor - handles Rich Text Format files.
64
+ *
65
+ * Extracts plain text from RTF documents by stripping RTF control codes.
66
+ * This is a lightweight implementation that doesn't require external
67
+ * RTF parsing libraries.
68
+ *
69
+ * Priority: ~110 (document format)
70
+ *
71
+ * @example
72
+ * ```typescript
73
+ * const processor = new RtfProcessor();
74
+ *
75
+ * const result = await processor.processFile({
76
+ * id: "file-123",
77
+ * name: "report.rtf",
78
+ * mimetype: "application/rtf",
79
+ * size: 5120,
80
+ * buffer: rtfBuffer,
81
+ * });
82
+ *
83
+ * if (result.success) {
84
+ * console.log("Extracted text:", result.data.textContent);
85
+ * }
86
+ * ```
87
+ */
88
+ export class RtfProcessor extends BaseFileProcessor {
89
+ constructor() {
90
+ super({
91
+ maxSizeMB: SIZE_LIMITS.DOCUMENT_MAX_MB,
92
+ timeoutMs: RTF_TIMEOUT_MS,
93
+ supportedMimeTypes: SUPPORTED_RTF_MIME_TYPES,
94
+ supportedExtensions: SUPPORTED_RTF_EXTENSIONS,
95
+ fileTypeName: "RTF",
96
+ defaultFilename: "document.rtf",
97
+ });
98
+ }
99
+ /**
100
+ * Validate downloaded RTF document.
101
+ * Checks for RTF header signature "{\\rtf".
102
+ *
103
+ * @param buffer - Downloaded file content
104
+ * @param fileInfo - Original file information
105
+ * @returns null if valid, error message if invalid
106
+ */
107
+ async validateDownloadedFile(buffer, _fileInfo) {
108
+ if (buffer.length < 5) {
109
+ return "Invalid RTF document - file too small";
110
+ }
111
+ // RTF files should start with "{\rtf"
112
+ const header = buffer.subarray(0, 10).toString("ascii");
113
+ if (!header.startsWith("{\\rtf")) {
114
+ // Check if it might be HTML error page
115
+ const preview = buffer
116
+ .subarray(0, 100)
117
+ .toString("utf8")
118
+ .substring(0, 100);
119
+ if (preview.includes("<!DOCTYPE") || preview.includes("<html")) {
120
+ return "Invalid RTF document - received HTML response instead of file content";
121
+ }
122
+ return "Invalid RTF document - missing RTF header signature";
123
+ }
124
+ return null;
125
+ }
126
+ /**
127
+ * Build the processed RTF result.
128
+ * Extracts plain text by stripping RTF control codes.
129
+ *
130
+ * @param buffer - Raw file content
131
+ * @param fileInfo - Original file information
132
+ * @returns Processed RTF with extracted text content
133
+ */
134
+ buildProcessedResult(buffer, fileInfo) {
135
+ const rawContent = buffer.toString("utf-8");
136
+ const textContent = this.extractText(rawContent);
137
+ return {
138
+ textContent,
139
+ rawContent,
140
+ buffer,
141
+ mimetype: fileInfo.mimetype || "application/rtf",
142
+ size: fileInfo.size,
143
+ filename: this.getFilename(fileInfo),
144
+ };
145
+ }
146
+ /**
147
+ * Extract plain text from RTF content.
148
+ * Strips RTF control codes, groups, and formatting commands.
149
+ *
150
+ * This is a basic RTF parser that handles common RTF constructs:
151
+ * - Control groups like {\fonttbl...}
152
+ * - Control words like \par, \b, \i
153
+ * - Special characters like \' hex escapes
154
+ * - Newlines from \par and \line commands
155
+ *
156
+ * @param rtf - Raw RTF content
157
+ * @returns Extracted plain text
158
+ */
159
+ extractText(rtf) {
160
+ const text = rtf;
161
+ let result = "";
162
+ let depth = 0;
163
+ let skipGroup = false;
164
+ let skipGroupDepth = 0;
165
+ let i = 0;
166
+ // Groups that should be skipped entirely (metadata, not content)
167
+ const skipGroupNames = [
168
+ "fonttbl",
169
+ "colortbl",
170
+ "stylesheet",
171
+ "info",
172
+ "pict",
173
+ "object",
174
+ "header",
175
+ "footer",
176
+ ];
177
+ while (i < text.length) {
178
+ const char = text[i];
179
+ if (char === "{") {
180
+ depth++;
181
+ // Check if this is a group we should skip
182
+ const nextChars = text.substring(i + 1, i + 20);
183
+ const groupMatch = nextChars.match(/^\\([a-z]+)/);
184
+ if (groupMatch &&
185
+ skipGroupNames.includes(groupMatch[1]) &&
186
+ !skipGroup) {
187
+ skipGroup = true;
188
+ skipGroupDepth = depth;
189
+ }
190
+ i++;
191
+ continue;
192
+ }
193
+ if (char === "}") {
194
+ depth--;
195
+ if (skipGroup && depth < skipGroupDepth) {
196
+ skipGroup = false;
197
+ skipGroupDepth = 0;
198
+ }
199
+ i++;
200
+ continue;
201
+ }
202
+ if (skipGroup) {
203
+ i++;
204
+ continue;
205
+ }
206
+ if (char === "\\") {
207
+ // Control word or symbol
208
+ const remaining = text.substring(i);
209
+ // Handle special escapes
210
+ if (remaining.startsWith("\\\\")) {
211
+ result += "\\";
212
+ i += 2;
213
+ continue;
214
+ }
215
+ if (remaining.startsWith("\\{")) {
216
+ result += "{";
217
+ i += 2;
218
+ continue;
219
+ }
220
+ if (remaining.startsWith("\\}")) {
221
+ result += "}";
222
+ i += 2;
223
+ continue;
224
+ }
225
+ // Handle hex escapes like \'e9 (é)
226
+ const hexMatch = remaining.match(/^\\'([0-9a-f]{2})/i);
227
+ if (hexMatch) {
228
+ const charCode = parseInt(hexMatch[1], 16);
229
+ result += String.fromCharCode(charCode);
230
+ i += 4;
231
+ continue;
232
+ }
233
+ // Handle Unicode escapes like \u233? (é)
234
+ const unicodeMatch = remaining.match(/^\\u(-?\d+)\??/);
235
+ if (unicodeMatch) {
236
+ let charCode = parseInt(unicodeMatch[1], 10);
237
+ if (charCode < 0) {
238
+ charCode += 65536; // Convert negative to positive
239
+ }
240
+ result += String.fromCharCode(charCode);
241
+ i += unicodeMatch[0].length;
242
+ continue;
243
+ }
244
+ // Handle control words
245
+ const controlMatch = remaining.match(/^\\([a-z]+)(-?\d*)[ ]?/i);
246
+ if (controlMatch) {
247
+ const controlWord = controlMatch[1].toLowerCase();
248
+ // Convert some control words to text
249
+ if (controlWord === "par" || controlWord === "line") {
250
+ result += "\n";
251
+ }
252
+ else if (controlWord === "tab") {
253
+ result += "\t";
254
+ }
255
+ else if (controlWord === "emdash") {
256
+ result += "—";
257
+ }
258
+ else if (controlWord === "endash") {
259
+ result += "–";
260
+ }
261
+ else if (controlWord === "bullet") {
262
+ result += "•";
263
+ }
264
+ else if (controlWord === "lquote") {
265
+ result += "'";
266
+ }
267
+ else if (controlWord === "rquote") {
268
+ result += "'";
269
+ }
270
+ else if (controlWord === "ldblquote") {
271
+ result += '"';
272
+ }
273
+ else if (controlWord === "rdblquote") {
274
+ result += '"';
275
+ }
276
+ i += controlMatch[0].length;
277
+ continue;
278
+ }
279
+ // Unknown control sequence, skip the backslash and control word
280
+ i++;
281
+ continue;
282
+ }
283
+ // Regular character
284
+ if (char !== "\r" && char !== "\n") {
285
+ result += char;
286
+ }
287
+ i++;
288
+ }
289
+ // Clean up the result
290
+ result = result
291
+ .replace(/\s+/g, " ") // Normalize whitespace
292
+ .replace(/ +\n/g, "\n") // Remove trailing spaces before newlines
293
+ .replace(/\n +/g, "\n") // Remove leading spaces after newlines
294
+ .replace(/\n{3,}/g, "\n\n") // Collapse multiple newlines
295
+ .trim();
296
+ return result;
297
+ }
298
+ }
299
+ // =============================================================================
300
+ // SINGLETON INSTANCE
301
+ // =============================================================================
302
+ /**
303
+ * Singleton instance of the RtfProcessor.
304
+ * Use this for all RTF document processing to share configuration.
305
+ */
306
+ export const rtfProcessor = new RtfProcessor();
307
+ // =============================================================================
308
+ // HELPER FUNCTIONS
309
+ // =============================================================================
310
+ /**
311
+ * Check if a file is an RTF document.
312
+ *
313
+ * @param mimetype - MIME type of the file
314
+ * @param filename - Filename for detection
315
+ * @returns true if the file is a supported RTF document
316
+ *
317
+ * @example
318
+ * ```typescript
319
+ * if (isRtfFile("application/rtf", "document.rtf")) {
320
+ * console.log("This is an RTF document");
321
+ * }
322
+ * ```
323
+ */
324
+ export function isRtfFile(mimetype, filename) {
325
+ return rtfProcessor.isFileSupported(mimetype, filename);
326
+ }
327
+ /**
328
+ * Validate RTF document size against configured limit.
329
+ *
330
+ * @param sizeBytes - File size in bytes
331
+ * @returns true if size is within the allowed limit
332
+ */
333
+ export function validateRtfSize(sizeBytes) {
334
+ const maxBytes = SIZE_LIMITS.DOCUMENT_MAX_MB * 1024 * 1024;
335
+ return sizeBytes <= maxBytes;
336
+ }
337
+ /**
338
+ * Process an RTF document.
339
+ *
340
+ * @param fileInfo - File information (can include URL or buffer)
341
+ * @param options - Optional processing options
342
+ * @returns Processing result with success flag and either data or error
343
+ *
344
+ * @example
345
+ * ```typescript
346
+ * const result = await processRtf({
347
+ * id: "file-123",
348
+ * name: "report.rtf",
349
+ * mimetype: "application/rtf",
350
+ * size: 10240,
351
+ * buffer: rtfBuffer,
352
+ * });
353
+ *
354
+ * if (result.success) {
355
+ * console.log("Extracted text:", result.data.textContent);
356
+ * }
357
+ * ```
358
+ */
359
+ export async function processRtf(fileInfo, options) {
360
+ return rtfProcessor.processFile(fileInfo, options);
361
+ }
362
+ //# sourceMappingURL=RtfProcessor.js.map
@@ -0,0 +1,168 @@
1
+ /**
2
+ * Word Document Processing Utility
3
+ *
4
+ * Handles downloading, validating, and processing Word (.docx, .doc) files.
5
+ * Uses mammoth library to extract text and HTML content from Word documents.
6
+ *
7
+ * Features:
8
+ * - DOCX format validation via ZIP/PK signature check
9
+ * - Text extraction using mammoth.extractRawText()
10
+ * - HTML conversion using mammoth.convertToHtml()
11
+ * - Warning collection from mammoth processing
12
+ * - Support for both URL downloads and direct buffer input
13
+ *
14
+ * @module processors/document/WordProcessor
15
+ *
16
+ * @example
17
+ * ```typescript
18
+ * import { wordProcessor, processWord, isWordFile } from "./WordProcessor.js";
19
+ *
20
+ * // Check if file is supported
21
+ * if (isWordFile(file.mimetype, file.name)) {
22
+ * const result = await processWord(fileInfo, {
23
+ * authHeaders: { Authorization: "Bearer token" },
24
+ * });
25
+ *
26
+ * if (result.success) {
27
+ * console.log("Text:", result.data.textContent);
28
+ * console.log("HTML:", result.data.htmlContent);
29
+ * console.log("Warnings:", result.data.warnings);
30
+ * }
31
+ * }
32
+ * ```
33
+ */
34
+ import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
35
+ import type { FileInfo, FileProcessingResult, ProcessOptions } from "../base/types.js";
36
+ export type { ProcessedWord } from "../base/types.js";
37
+ import type { ProcessedWord } from "../base/types.js";
/**
 * File processor for Word documents (.docx and .doc).
 *
 * Text and HTML are both extracted with the mammoth library. DOCX files are
 * ZIP archives, so a downloaded file is validated by checking for the ZIP/PK
 * signature.
 *
 * @example
 * ```typescript
 * const processor = new WordProcessor();
 *
 * // Check if file is supported
 * if (processor.isFileSupported("application/msword", "report.doc")) {
 *   const result = await processor.processFile(fileInfo);
 *   if (result.success) {
 *     console.log("Extracted text:", result.data.textContent);
 *   }
 * }
 * ```
 */
export declare class WordProcessor extends BaseFileProcessor<ProcessedWord> {
    constructor();

    /**
     * Check that a downloaded Word document starts with the expected magic
     * bytes. DOCX files are ZIP archives, which begin with the PK signature
     * (0x50 0x4B).
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information
     * @returns null if valid, error message if invalid
     */
    protected validateDownloadedFile(
        buffer: Buffer,
        _fileInfo: FileInfo,
    ): Promise<string | null>;

    /**
     * Synchronous result builder required by the base class.
     *
     * This is a stub that returns a placeholder: mammoth's operations are
     * asynchronous, so the real extraction happens in the overridden
     * `processFile` method.
     *
     * @param buffer - Downloaded file content
     * @param fileInfo - Original file information
     * @returns Processed Word result (placeholder)
     */
    protected buildProcessedResult(
        buffer: Buffer,
        fileInfo: FileInfo,
    ): ProcessedWord;

    /**
     * Process a Word document, overriding the base implementation so the
     * asynchronous mammoth calls can be awaited.
     *
     * Processing steps:
     * 1. Validate file type and size
     * 2. Get buffer (download from URL or use provided buffer)
     * 3. Validate downloaded file (check PK signature)
     * 4. Extract text with mammoth.extractRawText()
     * 5. Convert to HTML with mammoth.convertToHtml()
     * 6. Collect any warnings from mammoth
     * 7. Return structured result
     *
     * @param fileInfo - File information with URL or buffer
     * @param options - Optional processing options
     * @returns Processing result with text, HTML, and warnings
     */
    processFile(
        fileInfo: FileInfo,
        options?: ProcessOptions,
    ): Promise<FileProcessingResult<ProcessedWord>>;
}
/**
 * Shared WordProcessor instance.
 * Prefer this over constructing new processors; it covers most use cases
 * and avoids creating multiple instances.
 */
export declare const wordProcessor: WordProcessor;
/**
 * Report whether a file is a Word document (.docx or .doc).
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is a supported Word document
 *
 * @example
 * ```typescript
 * if (isWordFile(file.mimetype, file.name)) {
 *   const result = await processWord(file);
 * }
 * ```
 */
export declare function isWordFile(
    mimetype: string,
    filename: string,
): boolean;
/**
 * Check a Word document's size against the configured limit.
 *
 * @param sizeBytes - File size in bytes
 * @returns true if size is within the allowed limit
 *
 * @example
 * ```typescript
 * if (!validateWordSize(file.size)) {
 *   throw new Error("File exceeds the configured Word document size limit");
 * }
 * ```
 */
export declare function validateWordSize(sizeBytes: number): boolean;
/**
 * Process a single Word document with the singleton `wordProcessor`.
 *
 * @param fileInfo - File information with URL or buffer
 * @param options - Optional processing options (auth headers, timeout, retry config)
 * @returns Processing result with extracted text, HTML, and warnings
 *
 * @example
 * ```typescript
 * const result = await processWord({
 *   id: "doc-123",
 *   name: "report.docx",
 *   mimetype: "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
 *   size: 12345,
 *   url: "https://example.com/files/report.docx",
 * }, {
 *   authHeaders: { Authorization: "Bearer token" },
 * });
 *
 * if (result.success) {
 *   console.log("Text content:", result.data.textContent);
 *   console.log("HTML content:", result.data.htmlContent);
 *   if (result.data.warnings.length > 0) {
 *     console.warn("Warnings:", result.data.warnings);
 *   }
 * } else {
 *   console.error("Failed:", result.error.userMessage);
 * }
 * ```
 */
export declare function processWord(
    fileInfo: FileInfo,
    options?: ProcessOptions,
): Promise<FileProcessingResult<ProcessedWord>>;