@juspay/neurolink 9.1.1 → 9.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/README.md +54 -7
- package/dist/agent/directTools.d.ts +3 -3
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/lib/agent/directTools.d.ts +3 -3
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/server/utils/validation.d.ts +6 -6
- package/dist/lib/types/fileTypes.d.ts +1 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +18 -18
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/server/utils/validation.d.ts +6 -6
- package/dist/types/fileTypes.d.ts +1 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* File Processor Integration Module
|
|
3
|
+
*
|
|
4
|
+
* Provides integration between the ProcessorRegistry and message building.
|
|
5
|
+
* Exports utilities for processing files through registered processors
|
|
6
|
+
* with automatic type detection and batch processing support.
|
|
7
|
+
*
|
|
8
|
+
* @module processors/integration
|
|
9
|
+
*
|
|
10
|
+
* @example
|
|
11
|
+
* ```typescript
|
|
12
|
+
* import {
|
|
13
|
+
* // Single file processing
|
|
14
|
+
* processFileWithRegistry,
|
|
15
|
+
*
|
|
16
|
+
* // Batch processing
|
|
17
|
+
* processBatchWithRegistry,
|
|
18
|
+
*
|
|
19
|
+
* // Discovery utilities
|
|
20
|
+
* getSupportedFileTypes,
|
|
21
|
+
* isFileTypeSupported,
|
|
22
|
+
* getProcessorForFile,
|
|
23
|
+
*
|
|
24
|
+
* // Types
|
|
25
|
+
* type FileProcessingOptions,
|
|
26
|
+
* type BatchFileProcessingResult,
|
|
27
|
+
* } from "./integration/index.js";
|
|
28
|
+
*
|
|
29
|
+
* // Process a single file with auto-detection
|
|
30
|
+
* const { processorName, result } = await processFileWithRegistry(fileInfo);
|
|
31
|
+
*
|
|
32
|
+
* // Process multiple files
|
|
33
|
+
* const batchResult = await processBatchWithRegistry(files, { maxFiles: 50 });
|
|
34
|
+
*
|
|
35
|
+
* // Check supported types
|
|
36
|
+
* const supported = getSupportedFileTypes();
|
|
37
|
+
* const isSupported = isFileTypeSupported("application/pdf", "doc.pdf");
|
|
38
|
+
* const match = getProcessorForFile("image/jpeg", "photo.jpg");
|
|
39
|
+
* ```
|
|
40
|
+
*/
|
|
41
|
+
// =============================================================================
|
|
42
|
+
// FUNCTION EXPORTS
|
|
43
|
+
// =============================================================================
|
|
44
|
+
export { getProcessorForFile, getSupportedFileTypes, isFileTypeSupported, processBatchWithRegistry, processFileWithRegistry, } from "./FileProcessorIntegration.js";
|
|
45
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML File Processor
|
|
3
|
+
*
|
|
4
|
+
* Processes HTML files with text extraction and security analysis.
|
|
5
|
+
* HTML files are processed as text content for AI analysis, with
|
|
6
|
+
* extraction of plain text content (tags stripped) for easier processing.
|
|
7
|
+
*
|
|
8
|
+
* Features:
|
|
9
|
+
* - Original HTML content preservation
|
|
10
|
+
* - Text extraction (all tags stripped)
|
|
11
|
+
* - Script and style tag detection
|
|
12
|
+
* - Title extraction
|
|
13
|
+
* - Security warnings for dangerous content
|
|
14
|
+
*
|
|
15
|
+
* Security: Uses OWASP-compliant HTML sanitization utilities
|
|
16
|
+
*
|
|
17
|
+
* @module processors/markup/HtmlProcessor
|
|
18
|
+
*
|
|
19
|
+
* @example
|
|
20
|
+
* ```typescript
|
|
21
|
+
* import { htmlProcessor, processHtml, isHtmlFile } from "./markup/HtmlProcessor.js";
|
|
22
|
+
*
|
|
23
|
+
* // Check if file is HTML
|
|
24
|
+
* if (isHtmlFile(mimetype, filename)) {
|
|
25
|
+
* const result = await processHtml(fileInfo);
|
|
26
|
+
* if (result.success) {
|
|
27
|
+
* console.log('Text content:', result.data.textContent);
|
|
28
|
+
* console.log('Has scripts:', result.data.hasScripts);
|
|
29
|
+
* if (result.data.title) {
|
|
30
|
+
* console.log('Page title:', result.data.title);
|
|
31
|
+
* }
|
|
32
|
+
* }
|
|
33
|
+
* }
|
|
34
|
+
* ```
|
|
35
|
+
*/
|
|
36
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
37
|
+
import type { FileInfo, FileProcessingResult, ProcessOptions } from "../base/types.js";
|
|
38
|
+
export type { ProcessedHtml } from "../base/types.js";
|
|
39
|
+
import type { ProcessedHtml } from "../base/types.js";
|
|
40
|
+
/**
 * Processor for HTML documents.
 *
 * Produces a {@link ProcessedHtml} that carries the untouched markup next to
 * a tag-free text rendering, together with flags describing script tags,
 * style tags and potentially dangerous content.
 *
 * Documented registry priority: 20 (after SVG at priority 5, before generic
 * text).
 *
 * @example
 * ```typescript
 * const processor = new HtmlProcessor();
 *
 * const result = await processor.processFile({
 *   id: 'html-123',
 *   name: 'page.html',
 *   mimetype: 'text/html',
 *   size: 8192,
 *   url: 'https://example.com/page.html',
 * });
 *
 * if (result.success) {
 *   console.log('Title:', result.data.title);
 *   console.log('Text content:', result.data.textContent);
 * }
 * ```
 */
export declare class HtmlProcessor extends BaseFileProcessor<ProcessedHtml> {
    constructor();
    /**
     * Sanity-check a downloaded buffer before it is processed as HTML.
     *
     * @param buffer - Raw bytes of the downloaded file
     * @param _fileInfo - Metadata of the file being validated (unused)
     * @returns A promise resolving to `null` when the content is acceptable,
     *          or to an error message describing why it is not
     */
    protected validateDownloadedFile(
        buffer: Buffer,
        _fileInfo: FileInfo,
    ): Promise<string | null>;
    /**
     * Turn the downloaded buffer into a {@link ProcessedHtml}.
     *
     * The result keeps the original markup, adds the tag-stripped text,
     * reports whether script/style tags are present, exposes the page title
     * when one is found and flags dangerous content.
     *
     * @param buffer - Raw bytes of the downloaded file
     * @param fileInfo - Metadata of the file being processed
     * @returns The processed HTML result
     */
    protected buildProcessedResult(
        buffer: Buffer,
        fileInfo: FileInfo,
    ): ProcessedHtml;
}
|
|
94
|
+
/**
 * Shared, module-level {@link HtmlProcessor} instance.
 * Prefer this over constructing a new processor for ordinary use.
 *
 * @example
 * ```typescript
 * import { htmlProcessor } from "./markup/HtmlProcessor.js";
 *
 * const result = await htmlProcessor.processFile(fileInfo);
 * ```
 */
export declare const htmlProcessor: HtmlProcessor;
|
|
106
|
+
/**
 * Tell whether a file should be treated as HTML.
 *
 * @param mimetype - MIME type reported for the file (may be empty)
 * @param filename - File name, used for extension-based detection
 * @returns `true` when the file is recognised as HTML
 *
 * @example
 * ```typescript
 * if (isHtmlFile('text/html', 'page.html')) {
 *   // Handle as HTML
 * }
 *
 * // Detection by extension alone
 * if (isHtmlFile('', 'index.htm')) {
 *   // Handle as HTML based on extension
 * }
 * ```
 */
export declare function isHtmlFile(
    mimetype: string,
    filename: string,
): boolean;
|
|
126
|
+
/**
 * Check an HTML file's size against the configured limit.
 *
 * @param sizeBytes - Size of the file, in bytes
 * @returns `true` when the size does not exceed the allowed limit
 *
 * @example
 * ```typescript
 * if (!validateHtmlSize(fileInfo.size)) {
 *   console.error('HTML file is too large');
 * }
 * ```
 */
export declare function validateHtmlSize(
    sizeBytes: number,
): boolean;
|
|
140
|
+
/**
 * Process one HTML file through the shared processor instance.
 *
 * @param fileInfo - Description of the file to process (the examples supply
 *                   either a `url` or a `buffer`)
 * @param options - Optional processing options forwarded to the processor
 * @returns A promise for the processing result; on success its `data` holds
 *          the HTML content and the extracted text
 *
 * @example
 * ```typescript
 * const result = await processHtml({
 *   id: 'html-123',
 *   name: 'page.html',
 *   mimetype: 'text/html',
 *   size: 8192,
 *   buffer: htmlBuffer,
 * });
 *
 * if (result.success) {
 *   console.log('Page title:', result.data.title);
 *   console.log('Text content:', result.data.textContent);
 *   if (result.data.hasDangerousContent) {
 *     console.warn('HTML contains potentially dangerous content');
 *   }
 * } else {
 *   console.error('Processing failed:', result.error.userMessage);
 * }
 * ```
 */
export declare function processHtml(
    fileInfo: FileInfo,
    options?: ProcessOptions,
): Promise<FileProcessingResult<ProcessedHtml>>;
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* HTML File Processor
|
|
3
|
+
*
|
|
4
|
+
* Processes HTML files with text extraction and security analysis.
|
|
5
|
+
* HTML files are processed as text content for AI analysis, with
|
|
6
|
+
* extraction of plain text content (tags stripped) for easier processing.
|
|
7
|
+
*
|
|
8
|
+
* Features:
|
|
9
|
+
* - Original HTML content preservation
|
|
10
|
+
* - Text extraction (all tags stripped)
|
|
11
|
+
* - Script and style tag detection
|
|
12
|
+
* - Title extraction
|
|
13
|
+
* - Security warnings for dangerous content
|
|
14
|
+
*
|
|
15
|
+
* Security: Uses OWASP-compliant HTML sanitization utilities
|
|
16
|
+
*
|
|
17
|
+
* @module processors/markup/HtmlProcessor
|
|
18
|
+
*
|
|
19
|
+
* @example
|
|
20
|
+
* ```typescript
|
|
21
|
+
* import { htmlProcessor, processHtml, isHtmlFile } from "./markup/HtmlProcessor.js";
|
|
22
|
+
*
|
|
23
|
+
* // Check if file is HTML
|
|
24
|
+
* if (isHtmlFile(mimetype, filename)) {
|
|
25
|
+
* const result = await processHtml(fileInfo);
|
|
26
|
+
* if (result.success) {
|
|
27
|
+
* console.log('Text content:', result.data.textContent);
|
|
28
|
+
* console.log('Has scripts:', result.data.hasScripts);
|
|
29
|
+
* if (result.data.title) {
|
|
30
|
+
* console.log('Page title:', result.data.title);
|
|
31
|
+
* }
|
|
32
|
+
* }
|
|
33
|
+
* }
|
|
34
|
+
* ```
|
|
35
|
+
*/
|
|
36
|
+
import { containsDangerousHtml, stripHtmlTags, } from "../../utils/sanitizers/html.js";
|
|
37
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
38
|
+
import { SIZE_LIMITS } from "../config/index.js";
|
|
39
|
+
// =============================================================================
|
|
40
|
+
// CONSTANTS
|
|
41
|
+
// =============================================================================
|
|
42
|
+
/** MIME types this module recognises as HTML. */
const SUPPORTED_HTML_TYPES = [
    "text/html",
    "application/xhtml+xml",
];
/** File extensions this module recognises as HTML. */
const SUPPORTED_HTML_EXTENSIONS = [
    ".html",
    ".htm",
    ".xhtml",
];
/** Timeout applied to HTML processing, in milliseconds (30 seconds). */
const HTML_TIMEOUT_MS = 30 * 1000;
|
|
48
|
+
// =============================================================================
|
|
49
|
+
// HTML PROCESSOR
|
|
50
|
+
// =============================================================================
|
|
51
|
+
/**
 * HTML Processor - processes HTML files with text extraction.
 *
 * Keeps the original markup, derives a plain-text version with all tags
 * stripped, and records whether the document contains script tags, style
 * tags, a title, or content flagged as dangerous.
 *
 * The constructor configures the base processor with the text size limit
 * (`SIZE_LIMITS.TEXT_MAX_MB`), the module's HTML timeout, and the supported
 * HTML MIME types and extensions.
 *
 * Documented registry priority: 20 (after SVG at priority 5, before generic
 * text); the priority itself is not set in this class.
 *
 * @example
 * ```typescript
 * const processor = new HtmlProcessor();
 *
 * const result = await processor.processFile({
 *   id: 'html-123',
 *   name: 'page.html',
 *   mimetype: 'text/html',
 *   size: 8192,
 *   url: 'https://example.com/page.html',
 * });
 *
 * if (result.success) {
 *   console.log('Title:', result.data.title);
 *   console.log('Text content:', result.data.textContent);
 * }
 * ```
 */
export class HtmlProcessor extends BaseFileProcessor {
    constructor() {
        super({
            maxSizeMB: SIZE_LIMITS.TEXT_MAX_MB,
            timeoutMs: HTML_TIMEOUT_MS,
            // Copies, so the base class never holds the module-level arrays.
            supportedMimeTypes: [...SUPPORTED_HTML_TYPES],
            supportedExtensions: [...SUPPORTED_HTML_EXTENSIONS],
            fileTypeName: "HTML",
            defaultFilename: "page.html",
        });
    }
    /**
     * Validate downloaded HTML file.
     * Performs basic validation to ensure content appears to be HTML.
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information (unused)
     * @returns null if valid, error message if invalid
     */
    async validateDownloadedFile(buffer, _fileInfo) {
        const content = buffer.toString("utf-8").trim();
        // Whitespace-only files count as empty because of the trim above.
        if (content.length === 0) {
            return "Invalid HTML - file is empty";
        }
        // Deliberately lenient: a single "<" is accepted as evidence of markup.
        // The former "<!doctype" / "<html" / "<body" / "<head" checks were
        // subsumed by this one (each needle contains "<"), so they could never
        // change the outcome and only cost extra lowercase copies of the input.
        if (!content.includes("<")) {
            return "Invalid HTML - no HTML content detected";
        }
        return null;
    }
    /**
     * Build processed HTML result with text extraction.
     *
     * Processing steps:
     * 1. Preserve original HTML content
     * 2. Extract plain text (strip all tags)
     * 3. Detect script and style tags
     * 4. Extract page title if present
     * 5. Check for dangerous content
     *
     * @param buffer - Downloaded file content
     * @param fileInfo - Original file information
     * @returns Processed HTML result; `title` is only present when a
     *          non-empty `<title>` was found
     */
    buildProcessedResult(buffer, fileInfo) {
        const content = buffer.toString("utf-8");
        const filename = this.getFilename(fileInfo);
        // Extract text content (strip all tags)
        const textContent = stripHtmlTags(content);
        // A tag name ends at whitespace, ">" or "/", so "<script/src=x>" is
        // still a script tag; accept "/" as a terminator as well.
        const hasScripts = /<script[\s/>]/i.test(content);
        const hasStyles = /<style[\s/>]/i.test(content);
        // First <title>…</title> pair, matched lazily and across newlines.
        const titleMatch = content.match(/<title[^>]*>([\s\S]*?)<\/title>/i);
        const title = titleMatch ? titleMatch[1].trim() : undefined;
        // Check for dangerous content (XSS vectors)
        const hasDangerousContent = containsDangerousHtml(content);
        const result = {
            content,
            textContent,
            hasScripts,
            hasStyles,
            hasDangerousContent,
            buffer,
            mimetype: fileInfo.mimetype || "text/html",
            size: fileInfo.size,
            filename,
        };
        // Only include title if it was found (avoid undefined property with exactOptionalPropertyTypes)
        if (title) {
            result.title = title;
        }
        return result;
    }
}
|
|
161
|
+
// =============================================================================
|
|
162
|
+
// SINGLETON INSTANCE
|
|
163
|
+
// =============================================================================
|
|
164
|
+
/**
 * Shared, module-level HtmlProcessor instance, created once when this module
 * is loaded. The helper functions in this module delegate to it.
 *
 * @example
 * ```typescript
 * import { htmlProcessor } from "./markup/HtmlProcessor.js";
 *
 * const result = await htmlProcessor.processFile(fileInfo);
 * ```
 */
export const htmlProcessor = new HtmlProcessor();
|
|
176
|
+
// =============================================================================
|
|
177
|
+
// HELPER FUNCTIONS
|
|
178
|
+
// =============================================================================
|
|
179
|
+
/**
 * Tell whether a file should be treated as HTML.
 * Delegates to `isFileSupported` on the shared processor instance.
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is an HTML file
 *
 * @example
 * ```typescript
 * if (isHtmlFile('text/html', 'page.html')) {
 *   // Handle as HTML
 * }
 *
 * // Detection by extension alone
 * if (isHtmlFile('', 'index.htm')) {
 *   // Handle as HTML based on extension
 * }
 * ```
 */
export function isHtmlFile(mimetype, filename) {
    const supported = htmlProcessor.isFileSupported(mimetype, filename);
    return supported;
}
|
|
201
|
+
/**
 * Check an HTML file's size against the configured text size limit
 * (`SIZE_LIMITS.TEXT_MAX_MB`, converted from megabytes to bytes).
 *
 * @param sizeBytes - File size in bytes
 * @returns true if the size does not exceed the limit (the limit itself is
 *          allowed)
 *
 * @example
 * ```typescript
 * if (!validateHtmlSize(fileInfo.size)) {
 *   console.error('HTML file is too large');
 * }
 * ```
 */
export function validateHtmlSize(sizeBytes) {
    const limitInBytes = SIZE_LIMITS.TEXT_MAX_MB * 1024 * 1024;
    if (sizeBytes <= limitInBytes) {
        return true;
    }
    return false;
}
|
|
218
|
+
/**
 * Process one HTML file.
 * Convenience wrapper that forwards to `processFile` on the shared
 * processor instance.
 *
 * @param fileInfo - File information (the examples supply either a URL or a
 *                   buffer)
 * @param options - Optional processing options, passed through unchanged
 * @returns Processing result with HTML content and extracted text
 *
 * @example
 * ```typescript
 * const result = await processHtml({
 *   id: 'html-123',
 *   name: 'page.html',
 *   mimetype: 'text/html',
 *   size: 8192,
 *   buffer: htmlBuffer,
 * });
 *
 * if (result.success) {
 *   console.log('Page title:', result.data.title);
 *   console.log('Text content:', result.data.textContent);
 *   if (result.data.hasDangerousContent) {
 *     console.warn('HTML contains potentially dangerous content');
 *   }
 * } else {
 *   console.error('Processing failed:', result.error.userMessage);
 * }
 * ```
 */
export async function processHtml(fileInfo, options) {
    const pendingResult = htmlProcessor.processFile(fileInfo, options);
    return pendingResult;
}
|
|
250
|
+
//# sourceMappingURL=HtmlProcessor.js.map
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Markdown File Processor
|
|
3
|
+
*
|
|
4
|
+
* Processes Markdown files with structure extraction and analysis.
|
|
5
|
+
* Markdown files are analyzed to extract metadata about their structure
|
|
6
|
+
* including headings, code blocks, and tables.
|
|
7
|
+
*
|
|
8
|
+
* Features:
|
|
9
|
+
* - Original content preservation
|
|
10
|
+
* - Line count calculation
|
|
11
|
+
* - Code block detection
|
|
12
|
+
* - Table detection
|
|
13
|
+
* - Heading extraction (all levels)
|
|
14
|
+
*
|
|
15
|
+
* @module processors/markup/MarkdownProcessor
|
|
16
|
+
*
|
|
17
|
+
* @example
|
|
18
|
+
* ```typescript
|
|
19
|
+
* import { markdownProcessor, processMarkdown, isMarkdownFile } from "./markup/MarkdownProcessor.js";
|
|
20
|
+
*
|
|
21
|
+
* // Check if file is Markdown
|
|
22
|
+
* if (isMarkdownFile(mimetype, filename)) {
|
|
23
|
+
* const result = await processMarkdown(fileInfo);
|
|
24
|
+
* if (result.success) {
|
|
25
|
+
* console.log('Line count:', result.data.lineCount);
|
|
26
|
+
* console.log('Headings:', result.data.headings);
|
|
27
|
+
* console.log('Has code blocks:', result.data.hasCodeBlocks);
|
|
28
|
+
* }
|
|
29
|
+
* }
|
|
30
|
+
* ```
|
|
31
|
+
*/
|
|
32
|
+
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
|
|
33
|
+
import type { FileInfo, FileProcessingResult, ProcessOptions } from "../base/types.js";
|
|
34
|
+
export type { ProcessedMarkdown } from "../base/types.js";
|
|
35
|
+
import type { ProcessedMarkdown } from "../base/types.js";
|
|
36
|
+
/**
 * File processor specialised for Markdown documents.
 *
 * The original text is kept intact for AI processing, and the document is
 * additionally analysed for structural metadata: headings, code blocks,
 * and tables.
 *
 * Priority: 40 (before JSON at 50, before generic text at 110)
 *
 * @example
 * ```typescript
 * const processor = new MarkdownProcessor();
 *
 * const result = await processor.processFile({
 *   id: 'md-123',
 *   name: 'README.md',
 *   mimetype: 'text/markdown',
 *   size: 4096,
 *   url: 'https://example.com/README.md',
 * });
 *
 * if (result.success) {
 *   console.log('Headings:', result.data.headings);
 *   console.log('Has code blocks:', result.data.hasCodeBlocks);
 * }
 * ```
 */
export declare class MarkdownProcessor extends BaseFileProcessor<ProcessedMarkdown> {
    /** Create a Markdown processor; the constructor takes no arguments. */
    constructor();
    /**
     * Validate a downloaded Markdown file.
     * Markdown is very permissive, so almost any text is accepted.
     *
     * @param buffer - Downloaded file content
     * @param _fileInfo - Original file information
     * @returns Promise resolving to null if the file is valid, or to an
     *   error message if it is invalid
     */
    protected validateDownloadedFile(
        buffer: Buffer,
        _fileInfo: FileInfo
    ): Promise<string | null>;
    /**
     * Build the processed Markdown result, including structure analysis.
     *
     * Processing steps:
     * 1. Preserve original content
     * 2. Count lines
     * 3. Detect fenced code blocks
     * 4. Detect tables
     * 5. Extract headings
     *
     * @param buffer - Downloaded file content
     * @param fileInfo - Original file information
     * @returns Processed Markdown result (returned synchronously)
     */
    protected buildProcessedResult(
        buffer: Buffer,
        fileInfo: FileInfo
    ): ProcessedMarkdown;
}
|
|
90
|
+
/**
 * Shared (singleton) `MarkdownProcessor` instance.
 * Prefer this over constructing a new processor for most processing needs.
 *
 * @example
 * ```typescript
 * import { markdownProcessor } from "./markup/MarkdownProcessor.js";
 *
 * const result = await markdownProcessor.processFile(fileInfo);
 * ```
 */
export declare const markdownProcessor: MarkdownProcessor;
|
|
102
|
+
/**
 * Decide whether a file should be treated as Markdown.
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is a Markdown file
 *
 * @example
 * ```typescript
 * if (isMarkdownFile('text/markdown', 'README.md')) {
 *   // Handle as Markdown
 * }
 *
 * // Also works with just filename
 * if (isMarkdownFile('', 'CHANGELOG.markdown')) {
 *   // Handle as Markdown based on extension
 * }
 * ```
 */
export declare function isMarkdownFile(
    mimetype: string,
    filename: string
): boolean;
|
|
122
|
+
/**
 * Check a Markdown file's size against the configured limit.
 *
 * @param sizeBytes - File size in bytes
 * @returns true if size is within the allowed limit
 *
 * @example
 * ```typescript
 * if (!validateMarkdownSize(fileInfo.size)) {
 *   console.error('Markdown file is too large');
 * }
 * ```
 */
export declare function validateMarkdownSize(
    sizeBytes: number
): boolean;
|
|
136
|
+
/**
 * Process one Markdown file using the singleton processor.
 *
 * @param fileInfo - File information (can include URL or buffer)
 * @param options - Optional processing options (auth headers, timeout, retry config)
 * @returns Promise for the processing result with Markdown content and
 *   structure analysis
 *
 * @example
 * ```typescript
 * const result = await processMarkdown({
 *   id: 'md-123',
 *   name: 'README.md',
 *   mimetype: 'text/markdown',
 *   size: 4096,
 *   buffer: markdownBuffer,
 * });
 *
 * if (result.success) {
 *   console.log('Line count:', result.data.lineCount);
 *   console.log('Headings:', result.data.headings);
 *   if (result.data.hasCodeBlocks) {
 *     console.log('Document contains code examples');
 *   }
 * } else {
 *   console.error('Processing failed:', result.error.userMessage);
 * }
 * ```
 */
export declare function processMarkdown(
    fileInfo: FileInfo,
    options?: ProcessOptions
): Promise<FileProcessingResult<ProcessedMarkdown>>;
|