@juspay/neurolink 9.1.1 → 9.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (237)
  1. package/CHANGELOG.md +6 -0
  2. package/README.md +54 -7
  3. package/dist/agent/directTools.d.ts +3 -3
  4. package/dist/cli/commands/config.d.ts +6 -6
  5. package/dist/image-gen/ImageGenService.d.ts +143 -0
  6. package/dist/image-gen/ImageGenService.js +345 -0
  7. package/dist/image-gen/imageGenTools.d.ts +126 -0
  8. package/dist/image-gen/imageGenTools.js +304 -0
  9. package/dist/image-gen/index.d.ts +46 -0
  10. package/dist/image-gen/index.js +48 -0
  11. package/dist/image-gen/types.d.ts +237 -0
  12. package/dist/image-gen/types.js +24 -0
  13. package/dist/lib/agent/directTools.d.ts +3 -3
  14. package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
  15. package/dist/lib/image-gen/ImageGenService.js +346 -0
  16. package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
  17. package/dist/lib/image-gen/imageGenTools.js +305 -0
  18. package/dist/lib/image-gen/index.d.ts +46 -0
  19. package/dist/lib/image-gen/index.js +49 -0
  20. package/dist/lib/image-gen/types.d.ts +237 -0
  21. package/dist/lib/image-gen/types.js +25 -0
  22. package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
  23. package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
  24. package/dist/lib/processors/base/index.d.ts +14 -0
  25. package/dist/lib/processors/base/index.js +20 -0
  26. package/dist/lib/processors/base/types.d.ts +593 -0
  27. package/dist/lib/processors/base/types.js +77 -0
  28. package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
  29. package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
  30. package/dist/lib/processors/cli/index.d.ts +37 -0
  31. package/dist/lib/processors/cli/index.js +50 -0
  32. package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
  33. package/dist/lib/processors/code/ConfigProcessor.js +401 -0
  34. package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
  35. package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
  36. package/dist/lib/processors/code/index.d.ts +44 -0
  37. package/dist/lib/processors/code/index.js +61 -0
  38. package/dist/lib/processors/config/fileTypes.d.ts +283 -0
  39. package/dist/lib/processors/config/fileTypes.js +521 -0
  40. package/dist/lib/processors/config/index.d.ts +32 -0
  41. package/dist/lib/processors/config/index.js +93 -0
  42. package/dist/lib/processors/config/languageMap.d.ts +66 -0
  43. package/dist/lib/processors/config/languageMap.js +411 -0
  44. package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
  45. package/dist/lib/processors/config/mimeTypes.js +339 -0
  46. package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
  47. package/dist/lib/processors/config/sizeLimits.js +247 -0
  48. package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
  49. package/dist/lib/processors/data/JsonProcessor.js +204 -0
  50. package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
  51. package/dist/lib/processors/data/XmlProcessor.js +284 -0
  52. package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
  53. package/dist/lib/processors/data/YamlProcessor.js +295 -0
  54. package/dist/lib/processors/data/index.d.ts +49 -0
  55. package/dist/lib/processors/data/index.js +77 -0
  56. package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
  57. package/dist/lib/processors/document/ExcelProcessor.js +520 -0
  58. package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
  59. package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
  60. package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
  61. package/dist/lib/processors/document/RtfProcessor.js +362 -0
  62. package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
  63. package/dist/lib/processors/document/WordProcessor.js +354 -0
  64. package/dist/lib/processors/document/index.d.ts +54 -0
  65. package/dist/lib/processors/document/index.js +91 -0
  66. package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
  67. package/dist/lib/processors/errors/FileErrorCode.js +256 -0
  68. package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
  69. package/dist/lib/processors/errors/errorHelpers.js +379 -0
  70. package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
  71. package/dist/lib/processors/errors/errorSerializer.js +508 -0
  72. package/dist/lib/processors/errors/index.d.ts +46 -0
  73. package/dist/lib/processors/errors/index.js +50 -0
  74. package/dist/lib/processors/index.d.ts +76 -0
  75. package/dist/lib/processors/index.js +113 -0
  76. package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
  77. package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
  78. package/dist/lib/processors/integration/index.d.ts +42 -0
  79. package/dist/lib/processors/integration/index.js +45 -0
  80. package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
  81. package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
  82. package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
  83. package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
  84. package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
  85. package/dist/lib/processors/markup/SvgProcessor.js +241 -0
  86. package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
  87. package/dist/lib/processors/markup/TextProcessor.js +189 -0
  88. package/dist/lib/processors/markup/index.d.ts +66 -0
  89. package/dist/lib/processors/markup/index.js +103 -0
  90. package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
  91. package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
  92. package/dist/lib/processors/registry/index.d.ts +12 -0
  93. package/dist/lib/processors/registry/index.js +17 -0
  94. package/dist/lib/processors/registry/types.d.ts +53 -0
  95. package/dist/lib/processors/registry/types.js +11 -0
  96. package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
  97. package/dist/lib/server/utils/validation.d.ts +6 -6
  98. package/dist/lib/types/fileTypes.d.ts +1 -1
  99. package/dist/lib/types/index.d.ts +25 -24
  100. package/dist/lib/types/index.js +21 -20
  101. package/dist/lib/types/modelTypes.d.ts +18 -18
  102. package/dist/lib/types/pptTypes.d.ts +14 -2
  103. package/dist/lib/types/pptTypes.js +16 -0
  104. package/dist/lib/utils/async/delay.d.ts +40 -0
  105. package/dist/lib/utils/async/delay.js +43 -0
  106. package/dist/lib/utils/async/index.d.ts +23 -0
  107. package/dist/lib/utils/async/index.js +24 -0
  108. package/dist/lib/utils/async/retry.d.ts +141 -0
  109. package/dist/lib/utils/async/retry.js +172 -0
  110. package/dist/lib/utils/async/withTimeout.d.ts +73 -0
  111. package/dist/lib/utils/async/withTimeout.js +97 -0
  112. package/dist/lib/utils/fileDetector.d.ts +7 -1
  113. package/dist/lib/utils/fileDetector.js +91 -18
  114. package/dist/lib/utils/json/extract.d.ts +103 -0
  115. package/dist/lib/utils/json/extract.js +249 -0
  116. package/dist/lib/utils/json/index.d.ts +36 -0
  117. package/dist/lib/utils/json/index.js +37 -0
  118. package/dist/lib/utils/json/safeParse.d.ts +137 -0
  119. package/dist/lib/utils/json/safeParse.js +191 -0
  120. package/dist/lib/utils/messageBuilder.d.ts +2 -2
  121. package/dist/lib/utils/messageBuilder.js +15 -7
  122. package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
  123. package/dist/lib/utils/sanitizers/filename.js +366 -0
  124. package/dist/lib/utils/sanitizers/html.d.ts +170 -0
  125. package/dist/lib/utils/sanitizers/html.js +326 -0
  126. package/dist/lib/utils/sanitizers/index.d.ts +26 -0
  127. package/dist/lib/utils/sanitizers/index.js +30 -0
  128. package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
  129. package/dist/lib/utils/sanitizers/svg.js +483 -0
  130. package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
  131. package/dist/processors/base/BaseFileProcessor.js +613 -0
  132. package/dist/processors/base/index.d.ts +14 -0
  133. package/dist/processors/base/index.js +19 -0
  134. package/dist/processors/base/types.d.ts +593 -0
  135. package/dist/processors/base/types.js +76 -0
  136. package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
  137. package/dist/processors/cli/fileProcessorCli.js +388 -0
  138. package/dist/processors/cli/index.d.ts +37 -0
  139. package/dist/processors/cli/index.js +49 -0
  140. package/dist/processors/code/ConfigProcessor.d.ts +171 -0
  141. package/dist/processors/code/ConfigProcessor.js +400 -0
  142. package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
  143. package/dist/processors/code/SourceCodeProcessor.js +304 -0
  144. package/dist/processors/code/index.d.ts +44 -0
  145. package/dist/processors/code/index.js +60 -0
  146. package/dist/processors/config/fileTypes.d.ts +283 -0
  147. package/dist/processors/config/fileTypes.js +520 -0
  148. package/dist/processors/config/index.d.ts +32 -0
  149. package/dist/processors/config/index.js +92 -0
  150. package/dist/processors/config/languageMap.d.ts +66 -0
  151. package/dist/processors/config/languageMap.js +410 -0
  152. package/dist/processors/config/mimeTypes.d.ts +376 -0
  153. package/dist/processors/config/mimeTypes.js +338 -0
  154. package/dist/processors/config/sizeLimits.d.ts +194 -0
  155. package/dist/processors/config/sizeLimits.js +246 -0
  156. package/dist/processors/data/JsonProcessor.d.ts +122 -0
  157. package/dist/processors/data/JsonProcessor.js +203 -0
  158. package/dist/processors/data/XmlProcessor.d.ts +160 -0
  159. package/dist/processors/data/XmlProcessor.js +283 -0
  160. package/dist/processors/data/YamlProcessor.d.ts +163 -0
  161. package/dist/processors/data/YamlProcessor.js +294 -0
  162. package/dist/processors/data/index.d.ts +49 -0
  163. package/dist/processors/data/index.js +76 -0
  164. package/dist/processors/document/ExcelProcessor.d.ts +238 -0
  165. package/dist/processors/document/ExcelProcessor.js +519 -0
  166. package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
  167. package/dist/processors/document/OpenDocumentProcessor.js +210 -0
  168. package/dist/processors/document/RtfProcessor.d.ts +152 -0
  169. package/dist/processors/document/RtfProcessor.js +361 -0
  170. package/dist/processors/document/WordProcessor.d.ts +168 -0
  171. package/dist/processors/document/WordProcessor.js +353 -0
  172. package/dist/processors/document/index.d.ts +54 -0
  173. package/dist/processors/document/index.js +90 -0
  174. package/dist/processors/errors/FileErrorCode.d.ts +98 -0
  175. package/dist/processors/errors/FileErrorCode.js +255 -0
  176. package/dist/processors/errors/errorHelpers.d.ts +151 -0
  177. package/dist/processors/errors/errorHelpers.js +378 -0
  178. package/dist/processors/errors/errorSerializer.d.ts +139 -0
  179. package/dist/processors/errors/errorSerializer.js +507 -0
  180. package/dist/processors/errors/index.d.ts +46 -0
  181. package/dist/processors/errors/index.js +49 -0
  182. package/dist/processors/index.d.ts +76 -0
  183. package/dist/processors/index.js +112 -0
  184. package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
  185. package/dist/processors/integration/FileProcessorIntegration.js +272 -0
  186. package/dist/processors/integration/index.d.ts +42 -0
  187. package/dist/processors/integration/index.js +44 -0
  188. package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
  189. package/dist/processors/markup/HtmlProcessor.js +249 -0
  190. package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
  191. package/dist/processors/markup/MarkdownProcessor.js +244 -0
  192. package/dist/processors/markup/SvgProcessor.d.ts +156 -0
  193. package/dist/processors/markup/SvgProcessor.js +240 -0
  194. package/dist/processors/markup/TextProcessor.d.ts +135 -0
  195. package/dist/processors/markup/TextProcessor.js +188 -0
  196. package/dist/processors/markup/index.d.ts +66 -0
  197. package/dist/processors/markup/index.js +102 -0
  198. package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
  199. package/dist/processors/registry/ProcessorRegistry.js +608 -0
  200. package/dist/processors/registry/index.d.ts +12 -0
  201. package/dist/processors/registry/index.js +16 -0
  202. package/dist/processors/registry/types.d.ts +53 -0
  203. package/dist/processors/registry/types.js +10 -0
  204. package/dist/server/utils/validation.d.ts +6 -6
  205. package/dist/types/fileTypes.d.ts +1 -1
  206. package/dist/types/index.d.ts +25 -24
  207. package/dist/types/index.js +21 -20
  208. package/dist/types/modelTypes.d.ts +10 -10
  209. package/dist/types/pptTypes.d.ts +14 -2
  210. package/dist/types/pptTypes.js +16 -0
  211. package/dist/utils/async/delay.d.ts +40 -0
  212. package/dist/utils/async/delay.js +42 -0
  213. package/dist/utils/async/index.d.ts +23 -0
  214. package/dist/utils/async/index.js +23 -0
  215. package/dist/utils/async/retry.d.ts +141 -0
  216. package/dist/utils/async/retry.js +171 -0
  217. package/dist/utils/async/withTimeout.d.ts +73 -0
  218. package/dist/utils/async/withTimeout.js +96 -0
  219. package/dist/utils/fileDetector.d.ts +7 -1
  220. package/dist/utils/fileDetector.js +91 -18
  221. package/dist/utils/json/extract.d.ts +103 -0
  222. package/dist/utils/json/extract.js +248 -0
  223. package/dist/utils/json/index.d.ts +36 -0
  224. package/dist/utils/json/index.js +36 -0
  225. package/dist/utils/json/safeParse.d.ts +137 -0
  226. package/dist/utils/json/safeParse.js +190 -0
  227. package/dist/utils/messageBuilder.d.ts +2 -2
  228. package/dist/utils/messageBuilder.js +15 -7
  229. package/dist/utils/sanitizers/filename.d.ts +137 -0
  230. package/dist/utils/sanitizers/filename.js +365 -0
  231. package/dist/utils/sanitizers/html.d.ts +170 -0
  232. package/dist/utils/sanitizers/html.js +325 -0
  233. package/dist/utils/sanitizers/index.d.ts +26 -0
  234. package/dist/utils/sanitizers/index.js +29 -0
  235. package/dist/utils/sanitizers/svg.d.ts +81 -0
  236. package/dist/utils/sanitizers/svg.js +482 -0
  237. package/package.json +2 -2
@@ -0,0 +1,45 @@
1
+ /**
2
+ * File Processor Integration Module
3
+ *
4
+ * Provides integration between the ProcessorRegistry and message building.
5
+ * Exports utilities for processing files through registered processors
6
+ * with automatic type detection and batch processing support.
7
+ *
8
+ * @module processors/integration
9
+ *
10
+ * @example
11
+ * ```typescript
12
+ * import {
13
+ * // Single file processing
14
+ * processFileWithRegistry,
15
+ *
16
+ * // Batch processing
17
+ * processBatchWithRegistry,
18
+ *
19
+ * // Discovery utilities
20
+ * getSupportedFileTypes,
21
+ * isFileTypeSupported,
22
+ * getProcessorForFile,
23
+ *
24
+ * // Types
25
+ * type FileProcessingOptions,
26
+ * type BatchFileProcessingResult,
27
+ * } from "./integration/index.js";
28
+ *
29
+ * // Process a single file with auto-detection
30
+ * const { processorName, result } = await processFileWithRegistry(fileInfo);
31
+ *
32
+ * // Process multiple files
33
+ * const batchResult = await processBatchWithRegistry(files, { maxFiles: 50 });
34
+ *
35
+ * // Check supported types
36
+ * const supported = getSupportedFileTypes();
37
+ * const isSupported = isFileTypeSupported("application/pdf", "doc.pdf");
38
+ * const match = getProcessorForFile("image/jpeg", "photo.jpg");
39
+ * ```
40
+ */
41
+ // =============================================================================
42
+ // FUNCTION EXPORTS
43
+ // =============================================================================
44
+ export { getProcessorForFile, getSupportedFileTypes, isFileTypeSupported, processBatchWithRegistry, processFileWithRegistry, } from "./FileProcessorIntegration.js";
45
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1,169 @@
1
+ /**
2
+ * HTML File Processor
3
+ *
4
+ * Processes HTML files with text extraction and security analysis.
5
+ * HTML files are processed as text content for AI analysis, with
6
+ * extraction of plain text content (tags stripped) for easier processing.
7
+ *
8
+ * Features:
9
+ * - Original HTML content preservation
10
+ * - Text extraction (all tags stripped)
11
+ * - Script and style tag detection
12
+ * - Title extraction
13
+ * - Security warnings for dangerous content
14
+ *
15
+ * Security: Uses OWASP-compliant HTML sanitization utilities
16
+ *
17
+ * @module processors/markup/HtmlProcessor
18
+ *
19
+ * @example
20
+ * ```typescript
21
+ * import { htmlProcessor, processHtml, isHtmlFile } from "./markup/HtmlProcessor.js";
22
+ *
23
+ * // Check if file is HTML
24
+ * if (isHtmlFile(mimetype, filename)) {
25
+ * const result = await processHtml(fileInfo);
26
+ * if (result.success) {
27
+ * console.log('Text content:', result.data.textContent);
28
+ * console.log('Has scripts:', result.data.hasScripts);
29
+ * if (result.data.title) {
30
+ * console.log('Page title:', result.data.title);
31
+ * }
32
+ * }
33
+ * }
34
+ * ```
35
+ */
36
+ import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
37
+ import type { FileInfo, FileProcessingResult, ProcessOptions } from "../base/types.js";
38
+ export type { ProcessedHtml } from "../base/types.js";
39
+ import type { ProcessedHtml } from "../base/types.js";
40
+ /**
41
+ * HTML Processor - processes HTML files with text extraction.
42
+ *
43
+ * This processor extracts both the original HTML content and a plain text
44
+ * version with all tags stripped. It also performs security analysis to
45
+ * detect potentially dangerous content.
46
+ *
47
+ * Priority: 20 (after SVG at priority 5, before generic text)
48
+ *
49
+ * @example
50
+ * ```typescript
51
+ * const processor = new HtmlProcessor();
52
+ *
53
+ * const result = await processor.processFile({
54
+ * id: 'html-123',
55
+ * name: 'page.html',
56
+ * mimetype: 'text/html',
57
+ * size: 8192,
58
+ * url: 'https://example.com/page.html',
59
+ * });
60
+ *
61
+ * if (result.success) {
62
+ * console.log('Title:', result.data.title);
63
+ * console.log('Text content:', result.data.textContent);
64
+ * }
65
+ * ```
66
+ */
67
+ export declare class HtmlProcessor extends BaseFileProcessor<ProcessedHtml> {
68
+ constructor();
69
+ /**
70
+ * Validate downloaded HTML file.
71
+ * Performs basic validation to ensure content appears to be HTML.
72
+ *
73
+ * @param buffer - Downloaded file content
74
+ * @param _fileInfo - Original file information
75
+ * @returns null if valid, error message if invalid
76
+ */
77
+ protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
78
+ /**
79
+ * Build processed HTML result with text extraction.
80
+ *
81
+ * Processing steps:
82
+ * 1. Preserve original HTML content
83
+ * 2. Extract plain text (strip all tags)
84
+ * 3. Detect script and style tags
85
+ * 4. Extract page title if present
86
+ * 5. Check for dangerous content
87
+ *
88
+ * @param buffer - Downloaded file content
89
+ * @param fileInfo - Original file information
90
+ * @returns Processed HTML result
91
+ */
92
+ protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedHtml;
93
+ }
94
+ /**
95
+ * Singleton HTML processor instance.
96
+ * Use this for most processing needs.
97
+ *
98
+ * @example
99
+ * ```typescript
100
+ * import { htmlProcessor } from "./markup/HtmlProcessor.js";
101
+ *
102
+ * const result = await htmlProcessor.processFile(fileInfo);
103
+ * ```
104
+ */
105
+ export declare const htmlProcessor: HtmlProcessor;
106
+ /**
107
+ * Check if a file is an HTML file.
108
+ *
109
+ * @param mimetype - MIME type of the file
110
+ * @param filename - Filename (for extension-based detection)
111
+ * @returns true if the file is an HTML file
112
+ *
113
+ * @example
114
+ * ```typescript
115
+ * if (isHtmlFile('text/html', 'page.html')) {
116
+ * // Handle as HTML
117
+ * }
118
+ *
119
+ * // Also works with just filename
120
+ * if (isHtmlFile('', 'index.htm')) {
121
+ * // Handle as HTML based on extension
122
+ * }
123
+ * ```
124
+ */
125
+ export declare function isHtmlFile(mimetype: string, filename: string): boolean;
126
+ /**
127
+ * Validate HTML file size against configured limit.
128
+ *
129
+ * @param sizeBytes - File size in bytes
130
+ * @returns true if size is within the allowed limit
131
+ *
132
+ * @example
133
+ * ```typescript
134
+ * if (!validateHtmlSize(fileInfo.size)) {
135
+ * console.error('HTML file is too large');
136
+ * }
137
+ * ```
138
+ */
139
+ export declare function validateHtmlSize(sizeBytes: number): boolean;
140
+ /**
141
+ * Process a single HTML file.
142
+ * Convenience function that uses the singleton processor.
143
+ *
144
+ * @param fileInfo - File information (can include URL or buffer)
145
+ * @param options - Optional processing options (auth headers, timeout, retry config)
146
+ * @returns Processing result with HTML content and extracted text
147
+ *
148
+ * @example
149
+ * ```typescript
150
+ * const result = await processHtml({
151
+ * id: 'html-123',
152
+ * name: 'page.html',
153
+ * mimetype: 'text/html',
154
+ * size: 8192,
155
+ * buffer: htmlBuffer,
156
+ * });
157
+ *
158
+ * if (result.success) {
159
+ * console.log('Page title:', result.data.title);
160
+ * console.log('Text content:', result.data.textContent);
161
+ * if (result.data.hasDangerousContent) {
162
+ * console.warn('HTML contains potentially dangerous content');
163
+ * }
164
+ * } else {
165
+ * console.error('Processing failed:', result.error.userMessage);
166
+ * }
167
+ * ```
168
+ */
169
+ export declare function processHtml(fileInfo: FileInfo, options?: ProcessOptions): Promise<FileProcessingResult<ProcessedHtml>>;
@@ -0,0 +1,250 @@
1
+ /**
2
+ * HTML File Processor
3
+ *
4
+ * Processes HTML files with text extraction and security analysis.
5
+ * HTML files are processed as text content for AI analysis, with
6
+ * extraction of plain text content (tags stripped) for easier processing.
7
+ *
8
+ * Features:
9
+ * - Original HTML content preservation
10
+ * - Text extraction (all tags stripped)
11
+ * - Script and style tag detection
12
+ * - Title extraction
13
+ * - Security warnings for dangerous content
14
+ *
15
+ * Security: Uses OWASP-compliant HTML sanitization utilities
16
+ *
17
+ * @module processors/markup/HtmlProcessor
18
+ *
19
+ * @example
20
+ * ```typescript
21
+ * import { htmlProcessor, processHtml, isHtmlFile } from "./markup/HtmlProcessor.js";
22
+ *
23
+ * // Check if file is HTML
24
+ * if (isHtmlFile(mimetype, filename)) {
25
+ * const result = await processHtml(fileInfo);
26
+ * if (result.success) {
27
+ * console.log('Text content:', result.data.textContent);
28
+ * console.log('Has scripts:', result.data.hasScripts);
29
+ * if (result.data.title) {
30
+ * console.log('Page title:', result.data.title);
31
+ * }
32
+ * }
33
+ * }
34
+ * ```
35
+ */
36
+ import { containsDangerousHtml, stripHtmlTags, } from "../../utils/sanitizers/html.js";
37
+ import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
38
+ import { SIZE_LIMITS } from "../config/index.js";
39
+ // =============================================================================
40
+ // CONSTANTS
41
+ // =============================================================================
42
+ /** Supported HTML MIME types */
43
+ const SUPPORTED_HTML_TYPES = ["text/html", "application/xhtml+xml"];
44
+ /** Supported HTML file extensions */
45
+ const SUPPORTED_HTML_EXTENSIONS = [".html", ".htm", ".xhtml"];
46
+ /** Default timeout for HTML processing (30 seconds) */
47
+ const HTML_TIMEOUT_MS = 30000;
48
+ // =============================================================================
49
+ // HTML PROCESSOR
50
+ // =============================================================================
51
+ /**
52
+ * HTML Processor - processes HTML files with text extraction.
53
+ *
54
+ * This processor extracts both the original HTML content and a plain text
55
+ * version with all tags stripped. It also performs security analysis to
56
+ * detect potentially dangerous content.
57
+ *
58
+ * Priority: 20 (after SVG at priority 5, before generic text)
59
+ *
60
+ * @example
61
+ * ```typescript
62
+ * const processor = new HtmlProcessor();
63
+ *
64
+ * const result = await processor.processFile({
65
+ * id: 'html-123',
66
+ * name: 'page.html',
67
+ * mimetype: 'text/html',
68
+ * size: 8192,
69
+ * url: 'https://example.com/page.html',
70
+ * });
71
+ *
72
+ * if (result.success) {
73
+ * console.log('Title:', result.data.title);
74
+ * console.log('Text content:', result.data.textContent);
75
+ * }
76
+ * ```
77
+ */
78
+ export class HtmlProcessor extends BaseFileProcessor {
79
+ constructor() {
80
+ super({
81
+ maxSizeMB: SIZE_LIMITS.TEXT_MAX_MB,
82
+ timeoutMs: HTML_TIMEOUT_MS,
83
+ supportedMimeTypes: [...SUPPORTED_HTML_TYPES],
84
+ supportedExtensions: [...SUPPORTED_HTML_EXTENSIONS],
85
+ fileTypeName: "HTML",
86
+ defaultFilename: "page.html",
87
+ });
88
+ }
89
+ /**
90
+ * Validate downloaded HTML file.
91
+ * Performs basic validation to ensure content appears to be HTML.
92
+ *
93
+ * @param buffer - Downloaded file content
94
+ * @param _fileInfo - Original file information
95
+ * @returns null if valid, error message if invalid
96
+ */
97
+ async validateDownloadedFile(buffer, _fileInfo) {
98
+ const content = buffer.toString("utf-8").trim();
99
+ // Reject content that is empty (or whitespace-only) after trimming
100
+ if (content.length === 0) {
101
+ return "Invalid HTML - file is empty";
102
+ }
103
+ // Very basic HTML detection - must contain at least one "<" (every more specific check below already implies this)
104
+ // We're lenient here because HTML can be quite varied
105
+ const hasHtmlContent = content.includes("<") ||
106
+ content.toLowerCase().includes("<!doctype") ||
107
+ content.toLowerCase().includes("<html") ||
108
+ content.toLowerCase().includes("<body") ||
109
+ content.toLowerCase().includes("<head");
110
+ if (!hasHtmlContent) {
111
+ return "Invalid HTML - no HTML content detected";
112
+ }
113
+ return null;
114
+ }
115
+ /**
116
+ * Build processed HTML result with text extraction.
117
+ *
118
+ * Processing steps:
119
+ * 1. Preserve original HTML content
120
+ * 2. Extract plain text (strip all tags)
121
+ * 3. Detect script and style tags
122
+ * 4. Extract page title if present
123
+ * 5. Check for dangerous content
124
+ *
125
+ * @param buffer - Downloaded file content
126
+ * @param fileInfo - Original file information
127
+ * @returns Processed HTML result
128
+ */
129
+ buildProcessedResult(buffer, fileInfo) {
130
+ const content = buffer.toString("utf-8");
131
+ const filename = this.getFilename(fileInfo);
132
+ // Extract text content (strip all tags)
133
+ const textContent = stripHtmlTags(content);
134
+ // Check for script and style tags
135
+ const hasScripts = /<script[\s>]/i.test(content);
136
+ const hasStyles = /<style[\s>]/i.test(content);
137
+ // Extract title if present
138
+ const titleMatch = content.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
139
+ const title = titleMatch ? titleMatch[1].trim() : undefined;
140
+ // Check for dangerous content (XSS vectors)
141
+ const hasDangerousContent = containsDangerousHtml(content);
142
+ // Build base result
143
+ const result = {
144
+ content,
145
+ textContent,
146
+ hasScripts,
147
+ hasStyles,
148
+ hasDangerousContent,
149
+ buffer,
150
+ mimetype: fileInfo.mimetype || "text/html",
151
+ size: fileInfo.size,
152
+ filename,
153
+ };
154
+ // Only include title if a non-empty one was found (avoid undefined property with exactOptionalPropertyTypes)
155
+ if (title) {
156
+ result.title = title;
157
+ }
158
+ return result;
159
+ }
160
+ }
161
+ // =============================================================================
162
+ // SINGLETON INSTANCE
163
+ // =============================================================================
164
+ /**
165
+ * Singleton HTML processor instance.
166
+ * Use this for most processing needs.
167
+ *
168
+ * @example
169
+ * ```typescript
170
+ * import { htmlProcessor } from "./markup/HtmlProcessor.js";
171
+ *
172
+ * const result = await htmlProcessor.processFile(fileInfo);
173
+ * ```
174
+ */
175
+ export const htmlProcessor = new HtmlProcessor();
176
+ // =============================================================================
177
+ // HELPER FUNCTIONS
178
+ // =============================================================================
179
+ /**
180
+ * Check if a file is an HTML file.
181
+ *
182
+ * @param mimetype - MIME type of the file
183
+ * @param filename - Filename (for extension-based detection)
184
+ * @returns true if the file is an HTML file
185
+ *
186
+ * @example
187
+ * ```typescript
188
+ * if (isHtmlFile('text/html', 'page.html')) {
189
+ * // Handle as HTML
190
+ * }
191
+ *
192
+ * // Also works with just filename
193
+ * if (isHtmlFile('', 'index.htm')) {
194
+ * // Handle as HTML based on extension
195
+ * }
196
+ * ```
197
+ */
198
+ export function isHtmlFile(mimetype, filename) {
199
+ return htmlProcessor.isFileSupported(mimetype, filename);
200
+ }
201
+ /**
202
+ * Validate HTML file size against configured limit.
203
+ *
204
+ * @param sizeBytes - File size in bytes
205
+ * @returns true if size is within the allowed limit
206
+ *
207
+ * @example
208
+ * ```typescript
209
+ * if (!validateHtmlSize(fileInfo.size)) {
210
+ * console.error('HTML file is too large');
211
+ * }
212
+ * ```
213
+ */
214
+ export function validateHtmlSize(sizeBytes) {
215
+ const maxBytes = SIZE_LIMITS.TEXT_MAX_MB * 1024 * 1024;
216
+ return sizeBytes <= maxBytes;
217
+ }
218
+ /**
219
+ * Process a single HTML file.
220
+ * Convenience function that uses the singleton processor.
221
+ *
222
+ * @param fileInfo - File information (can include URL or buffer)
223
+ * @param options - Optional processing options (auth headers, timeout, retry config)
224
+ * @returns Processing result with HTML content and extracted text
225
+ *
226
+ * @example
227
+ * ```typescript
228
+ * const result = await processHtml({
229
+ * id: 'html-123',
230
+ * name: 'page.html',
231
+ * mimetype: 'text/html',
232
+ * size: 8192,
233
+ * buffer: htmlBuffer,
234
+ * });
235
+ *
236
+ * if (result.success) {
237
+ * console.log('Page title:', result.data.title);
238
+ * console.log('Text content:', result.data.textContent);
239
+ * if (result.data.hasDangerousContent) {
240
+ * console.warn('HTML contains potentially dangerous content');
241
+ * }
242
+ * } else {
243
+ * console.error('Processing failed:', result.error.userMessage);
244
+ * }
245
+ * ```
246
+ */
247
+ export async function processHtml(fileInfo, options) {
248
+ return htmlProcessor.processFile(fileInfo, options);
249
+ }
250
+ //# sourceMappingURL=HtmlProcessor.js.map
@@ -0,0 +1,165 @@
1
+ /**
2
+ * Markdown File Processor
3
+ *
4
+ * Processes Markdown files with structure extraction and analysis.
5
+ * Markdown files are analyzed to extract metadata about their structure
6
+ * including headings, code blocks, and tables.
7
+ *
8
+ * Features:
9
+ * - Original content preservation
10
+ * - Line count calculation
11
+ * - Code block detection
12
+ * - Table detection
13
+ * - Heading extraction (all levels)
14
+ *
15
+ * @module processors/markup/MarkdownProcessor
16
+ *
17
+ * @example
18
+ * ```typescript
19
+ * import { markdownProcessor, processMarkdown, isMarkdownFile } from "./markup/MarkdownProcessor.js";
20
+ *
21
+ * // Check if file is Markdown
22
+ * if (isMarkdownFile(mimetype, filename)) {
23
+ * const result = await processMarkdown(fileInfo);
24
+ * if (result.success) {
25
+ * console.log('Line count:', result.data.lineCount);
26
+ * console.log('Headings:', result.data.headings);
27
+ * console.log('Has code blocks:', result.data.hasCodeBlocks);
28
+ * }
29
+ * }
30
+ * ```
31
+ */
32
+ import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
33
+ import type { FileInfo, FileProcessingResult, ProcessOptions } from "../base/types.js";
34
+ export type { ProcessedMarkdown } from "../base/types.js";
35
+ import type { ProcessedMarkdown } from "../base/types.js";
36
+ /**
37
+ * Markdown Processor - a `BaseFileProcessor<ProcessedMarkdown>` subclass that processes Markdown files with structure analysis.
38
+ *
39
+ * This processor analyzes Markdown documents to extract structural metadata
40
+ * including headings, code blocks, and tables. The original content is
41
+ * preserved for AI processing.
42
+ *
43
+ * Priority: 40 (before JSON at 50, before generic text at 110)
44
+ *
45
+ * @example
46
+ * ```typescript
47
+ * const processor = new MarkdownProcessor();
48
+ *
49
+ * const result = await processor.processFile({
50
+ * id: 'md-123',
51
+ * name: 'README.md',
52
+ * mimetype: 'text/markdown',
53
+ * size: 4096,
54
+ * url: 'https://example.com/README.md',
55
+ * });
56
+ *
57
+ * if (result.success) {
58
+ * console.log('Headings:', result.data.headings);
59
+ * console.log('Has code blocks:', result.data.hasCodeBlocks);
60
+ * }
61
+ * ```
62
+ */
63
+ export declare class MarkdownProcessor extends BaseFileProcessor<ProcessedMarkdown> {
64
+ constructor();
65
+ /**
66
+ * Validate downloaded Markdown file (protected; not part of the public API).
67
+ * Markdown is very permissive - almost any text is valid.
68
+ *
69
+ * @param buffer - Downloaded file content
70
+ * @param _fileInfo - Original file information (the underscore prefix suggests it is unused - confirm in the implementation)
71
+ * @returns Promise resolving to null if valid, or to an error message string if invalid
72
+ */
73
+ protected validateDownloadedFile(buffer: Buffer, _fileInfo: FileInfo): Promise<string | null>;
74
+ /**
75
+ * Build processed Markdown result with structure analysis (protected; not part of the public API).
76
+ *
77
+ * Processing steps:
78
+ * 1. Preserve original content
79
+ * 2. Count lines
80
+ * 3. Detect fenced code blocks
81
+ * 4. Detect tables
82
+ * 5. Extract headings
83
+ *
84
+ * @param buffer - Downloaded file content
85
+ * @param fileInfo - Original file information
86
+ * @returns Processed Markdown result, returned synchronously (the declared return type is not a Promise)
87
+ */
88
+ protected buildProcessedResult(buffer: Buffer, fileInfo: FileInfo): ProcessedMarkdown;
89
+ }
90
+ /**
91
+ * Singleton Markdown processor: a module-level `MarkdownProcessor` instance exported as a constant.
92
+ * Use this for most processing needs instead of constructing a new `MarkdownProcessor`.
93
+ *
94
+ * @example
95
+ * ```typescript
96
+ * import { markdownProcessor } from "./markup/MarkdownProcessor.js";
97
+ *
98
+ * const result = await markdownProcessor.processFile(fileInfo);
99
+ * ```
100
+ */
101
+ export declare const markdownProcessor: MarkdownProcessor;
102
+ /**
103
+ * Check if a file is a Markdown file.
104
+ *
105
+ * @param mimetype - MIME type of the file (the second example passes an empty string and relies on the filename alone)
106
+ * @param filename - Filename (for extension-based detection, e.g. `CHANGELOG.markdown` in the example below)
107
+ * @returns true if the file is a Markdown file, false otherwise
108
+ *
109
+ * @example
110
+ * ```typescript
111
+ * if (isMarkdownFile('text/markdown', 'README.md')) {
112
+ * // Handle as Markdown
113
+ * }
114
+ *
115
+ * // Also works with just filename
116
+ * if (isMarkdownFile('', 'CHANGELOG.markdown')) {
117
+ * // Handle as Markdown based on extension
118
+ * }
119
+ * ```
120
+ */
121
+ export declare function isMarkdownFile(mimetype: string, filename: string): boolean;
122
+ /**
123
+ * Validate Markdown file size against configured limit.
124
+ * NOTE(review): the limit is not visible in this declaration file - confirm its value, and whether the bound is inclusive, in the implementation.
125
+ * @param sizeBytes - File size in bytes
126
+ * @returns true if size is within the allowed limit, false otherwise
127
+ *
128
+ * @example
129
+ * ```typescript
130
+ * if (!validateMarkdownSize(fileInfo.size)) {
131
+ * console.error('Markdown file is too large');
132
+ * }
133
+ * ```
134
+ */
135
+ export declare function validateMarkdownSize(sizeBytes: number): boolean;
136
+ /**
137
+ * Process a single Markdown file.
138
+ * Convenience function that uses the singleton processor.
139
+ *
140
+ * @param fileInfo - File information (can include URL or buffer)
141
+ * @param options - Processing options (auth headers, timeout, retry config); may be omitted
142
+ * @returns Promise resolving to the processing result with Markdown content and structure analysis
143
+ *
144
+ * @example
145
+ * ```typescript
146
+ * const result = await processMarkdown({
147
+ * id: 'md-123',
148
+ * name: 'README.md',
149
+ * mimetype: 'text/markdown',
150
+ * size: 4096,
151
+ * buffer: markdownBuffer,
152
+ * });
153
+ *
154
+ * if (result.success) {
155
+ * console.log('Line count:', result.data.lineCount);
156
+ * console.log('Headings:', result.data.headings);
157
+ * if (result.data.hasCodeBlocks) {
158
+ * console.log('Document contains code examples');
159
+ * }
160
+ * } else {
161
+ * console.error('Processing failed:', result.error.userMessage);
162
+ * }
163
+ * ```
164
+ */
165
+ export declare function processMarkdown(fileInfo: FileInfo, options?: ProcessOptions): Promise<FileProcessingResult<ProcessedMarkdown>>;