@juspay/neurolink 9.1.0 → 9.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +54 -7
- package/dist/agent/directTools.d.ts +3 -3
- package/dist/cli/commands/config.d.ts +6 -6
- package/dist/image-gen/ImageGenService.d.ts +143 -0
- package/dist/image-gen/ImageGenService.js +345 -0
- package/dist/image-gen/imageGenTools.d.ts +126 -0
- package/dist/image-gen/imageGenTools.js +304 -0
- package/dist/image-gen/index.d.ts +46 -0
- package/dist/image-gen/index.js +48 -0
- package/dist/image-gen/types.d.ts +237 -0
- package/dist/image-gen/types.js +24 -0
- package/dist/lib/agent/directTools.d.ts +3 -3
- package/dist/lib/image-gen/ImageGenService.d.ts +143 -0
- package/dist/lib/image-gen/ImageGenService.js +346 -0
- package/dist/lib/image-gen/imageGenTools.d.ts +126 -0
- package/dist/lib/image-gen/imageGenTools.js +305 -0
- package/dist/lib/image-gen/index.d.ts +46 -0
- package/dist/lib/image-gen/index.js +49 -0
- package/dist/lib/image-gen/types.d.ts +237 -0
- package/dist/lib/image-gen/types.js +25 -0
- package/dist/lib/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/lib/processors/base/BaseFileProcessor.js +614 -0
- package/dist/lib/processors/base/index.d.ts +14 -0
- package/dist/lib/processors/base/index.js +20 -0
- package/dist/lib/processors/base/types.d.ts +593 -0
- package/dist/lib/processors/base/types.js +77 -0
- package/dist/lib/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/lib/processors/cli/fileProcessorCli.js +389 -0
- package/dist/lib/processors/cli/index.d.ts +37 -0
- package/dist/lib/processors/cli/index.js +50 -0
- package/dist/lib/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/lib/processors/code/ConfigProcessor.js +401 -0
- package/dist/lib/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/lib/processors/code/SourceCodeProcessor.js +305 -0
- package/dist/lib/processors/code/index.d.ts +44 -0
- package/dist/lib/processors/code/index.js +61 -0
- package/dist/lib/processors/config/fileTypes.d.ts +283 -0
- package/dist/lib/processors/config/fileTypes.js +521 -0
- package/dist/lib/processors/config/index.d.ts +32 -0
- package/dist/lib/processors/config/index.js +93 -0
- package/dist/lib/processors/config/languageMap.d.ts +66 -0
- package/dist/lib/processors/config/languageMap.js +411 -0
- package/dist/lib/processors/config/mimeTypes.d.ts +376 -0
- package/dist/lib/processors/config/mimeTypes.js +339 -0
- package/dist/lib/processors/config/sizeLimits.d.ts +194 -0
- package/dist/lib/processors/config/sizeLimits.js +247 -0
- package/dist/lib/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/lib/processors/data/JsonProcessor.js +204 -0
- package/dist/lib/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/lib/processors/data/XmlProcessor.js +284 -0
- package/dist/lib/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/lib/processors/data/YamlProcessor.js +295 -0
- package/dist/lib/processors/data/index.d.ts +49 -0
- package/dist/lib/processors/data/index.js +77 -0
- package/dist/lib/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/lib/processors/document/ExcelProcessor.js +520 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/lib/processors/document/OpenDocumentProcessor.js +211 -0
- package/dist/lib/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/lib/processors/document/RtfProcessor.js +362 -0
- package/dist/lib/processors/document/WordProcessor.d.ts +168 -0
- package/dist/lib/processors/document/WordProcessor.js +354 -0
- package/dist/lib/processors/document/index.d.ts +54 -0
- package/dist/lib/processors/document/index.js +91 -0
- package/dist/lib/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/lib/processors/errors/FileErrorCode.js +256 -0
- package/dist/lib/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/lib/processors/errors/errorHelpers.js +379 -0
- package/dist/lib/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/lib/processors/errors/errorSerializer.js +508 -0
- package/dist/lib/processors/errors/index.d.ts +46 -0
- package/dist/lib/processors/errors/index.js +50 -0
- package/dist/lib/processors/index.d.ts +76 -0
- package/dist/lib/processors/index.js +113 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/lib/processors/integration/FileProcessorIntegration.js +273 -0
- package/dist/lib/processors/integration/index.d.ts +42 -0
- package/dist/lib/processors/integration/index.js +45 -0
- package/dist/lib/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/lib/processors/markup/HtmlProcessor.js +250 -0
- package/dist/lib/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/lib/processors/markup/MarkdownProcessor.js +245 -0
- package/dist/lib/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/lib/processors/markup/SvgProcessor.js +241 -0
- package/dist/lib/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/lib/processors/markup/TextProcessor.js +189 -0
- package/dist/lib/processors/markup/index.d.ts +66 -0
- package/dist/lib/processors/markup/index.js +103 -0
- package/dist/lib/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/lib/processors/registry/ProcessorRegistry.js +609 -0
- package/dist/lib/processors/registry/index.d.ts +12 -0
- package/dist/lib/processors/registry/index.js +17 -0
- package/dist/lib/processors/registry/types.d.ts +53 -0
- package/dist/lib/processors/registry/types.js +11 -0
- package/dist/lib/providers/sagemaker/language-model.d.ts +2 -2
- package/dist/lib/server/utils/validation.d.ts +6 -6
- package/dist/lib/types/fileTypes.d.ts +51 -1
- package/dist/lib/types/index.d.ts +25 -24
- package/dist/lib/types/index.js +21 -20
- package/dist/lib/types/modelTypes.d.ts +18 -18
- package/dist/lib/types/pptTypes.d.ts +14 -2
- package/dist/lib/types/pptTypes.js +16 -0
- package/dist/lib/utils/async/delay.d.ts +40 -0
- package/dist/lib/utils/async/delay.js +43 -0
- package/dist/lib/utils/async/index.d.ts +23 -0
- package/dist/lib/utils/async/index.js +24 -0
- package/dist/lib/utils/async/retry.d.ts +141 -0
- package/dist/lib/utils/async/retry.js +172 -0
- package/dist/lib/utils/async/withTimeout.d.ts +73 -0
- package/dist/lib/utils/async/withTimeout.js +97 -0
- package/dist/lib/utils/csvProcessor.js +442 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -1
- package/dist/lib/utils/fileDetector.js +91 -18
- package/dist/lib/utils/json/extract.d.ts +103 -0
- package/dist/lib/utils/json/extract.js +249 -0
- package/dist/lib/utils/json/index.d.ts +36 -0
- package/dist/lib/utils/json/index.js +37 -0
- package/dist/lib/utils/json/safeParse.d.ts +137 -0
- package/dist/lib/utils/json/safeParse.js +191 -0
- package/dist/lib/utils/messageBuilder.d.ts +2 -2
- package/dist/lib/utils/messageBuilder.js +15 -7
- package/dist/lib/utils/sanitizers/filename.d.ts +137 -0
- package/dist/lib/utils/sanitizers/filename.js +366 -0
- package/dist/lib/utils/sanitizers/html.d.ts +170 -0
- package/dist/lib/utils/sanitizers/html.js +326 -0
- package/dist/lib/utils/sanitizers/index.d.ts +26 -0
- package/dist/lib/utils/sanitizers/index.js +30 -0
- package/dist/lib/utils/sanitizers/svg.d.ts +81 -0
- package/dist/lib/utils/sanitizers/svg.js +483 -0
- package/dist/processors/base/BaseFileProcessor.d.ts +273 -0
- package/dist/processors/base/BaseFileProcessor.js +613 -0
- package/dist/processors/base/index.d.ts +14 -0
- package/dist/processors/base/index.js +19 -0
- package/dist/processors/base/types.d.ts +593 -0
- package/dist/processors/base/types.js +76 -0
- package/dist/processors/cli/fileProcessorCli.d.ts +163 -0
- package/dist/processors/cli/fileProcessorCli.js +388 -0
- package/dist/processors/cli/index.d.ts +37 -0
- package/dist/processors/cli/index.js +49 -0
- package/dist/processors/code/ConfigProcessor.d.ts +171 -0
- package/dist/processors/code/ConfigProcessor.js +400 -0
- package/dist/processors/code/SourceCodeProcessor.d.ts +174 -0
- package/dist/processors/code/SourceCodeProcessor.js +304 -0
- package/dist/processors/code/index.d.ts +44 -0
- package/dist/processors/code/index.js +60 -0
- package/dist/processors/config/fileTypes.d.ts +283 -0
- package/dist/processors/config/fileTypes.js +520 -0
- package/dist/processors/config/index.d.ts +32 -0
- package/dist/processors/config/index.js +92 -0
- package/dist/processors/config/languageMap.d.ts +66 -0
- package/dist/processors/config/languageMap.js +410 -0
- package/dist/processors/config/mimeTypes.d.ts +376 -0
- package/dist/processors/config/mimeTypes.js +338 -0
- package/dist/processors/config/sizeLimits.d.ts +194 -0
- package/dist/processors/config/sizeLimits.js +246 -0
- package/dist/processors/data/JsonProcessor.d.ts +122 -0
- package/dist/processors/data/JsonProcessor.js +203 -0
- package/dist/processors/data/XmlProcessor.d.ts +160 -0
- package/dist/processors/data/XmlProcessor.js +283 -0
- package/dist/processors/data/YamlProcessor.d.ts +163 -0
- package/dist/processors/data/YamlProcessor.js +294 -0
- package/dist/processors/data/index.d.ts +49 -0
- package/dist/processors/data/index.js +76 -0
- package/dist/processors/document/ExcelProcessor.d.ts +238 -0
- package/dist/processors/document/ExcelProcessor.js +519 -0
- package/dist/processors/document/OpenDocumentProcessor.d.ts +69 -0
- package/dist/processors/document/OpenDocumentProcessor.js +210 -0
- package/dist/processors/document/RtfProcessor.d.ts +152 -0
- package/dist/processors/document/RtfProcessor.js +361 -0
- package/dist/processors/document/WordProcessor.d.ts +168 -0
- package/dist/processors/document/WordProcessor.js +353 -0
- package/dist/processors/document/index.d.ts +54 -0
- package/dist/processors/document/index.js +90 -0
- package/dist/processors/errors/FileErrorCode.d.ts +98 -0
- package/dist/processors/errors/FileErrorCode.js +255 -0
- package/dist/processors/errors/errorHelpers.d.ts +151 -0
- package/dist/processors/errors/errorHelpers.js +378 -0
- package/dist/processors/errors/errorSerializer.d.ts +139 -0
- package/dist/processors/errors/errorSerializer.js +507 -0
- package/dist/processors/errors/index.d.ts +46 -0
- package/dist/processors/errors/index.js +49 -0
- package/dist/processors/index.d.ts +76 -0
- package/dist/processors/index.js +112 -0
- package/dist/processors/integration/FileProcessorIntegration.d.ts +244 -0
- package/dist/processors/integration/FileProcessorIntegration.js +272 -0
- package/dist/processors/integration/index.d.ts +42 -0
- package/dist/processors/integration/index.js +44 -0
- package/dist/processors/markup/HtmlProcessor.d.ts +169 -0
- package/dist/processors/markup/HtmlProcessor.js +249 -0
- package/dist/processors/markup/MarkdownProcessor.d.ts +165 -0
- package/dist/processors/markup/MarkdownProcessor.js +244 -0
- package/dist/processors/markup/SvgProcessor.d.ts +156 -0
- package/dist/processors/markup/SvgProcessor.js +240 -0
- package/dist/processors/markup/TextProcessor.d.ts +135 -0
- package/dist/processors/markup/TextProcessor.js +188 -0
- package/dist/processors/markup/index.d.ts +66 -0
- package/dist/processors/markup/index.js +102 -0
- package/dist/processors/registry/ProcessorRegistry.d.ts +334 -0
- package/dist/processors/registry/ProcessorRegistry.js +608 -0
- package/dist/processors/registry/index.d.ts +12 -0
- package/dist/processors/registry/index.js +16 -0
- package/dist/processors/registry/types.d.ts +53 -0
- package/dist/processors/registry/types.js +10 -0
- package/dist/server/utils/validation.d.ts +6 -6
- package/dist/types/fileTypes.d.ts +51 -1
- package/dist/types/index.d.ts +25 -24
- package/dist/types/index.js +21 -20
- package/dist/types/modelTypes.d.ts +10 -10
- package/dist/types/pptTypes.d.ts +14 -2
- package/dist/types/pptTypes.js +16 -0
- package/dist/utils/async/delay.d.ts +40 -0
- package/dist/utils/async/delay.js +42 -0
- package/dist/utils/async/index.d.ts +23 -0
- package/dist/utils/async/index.js +23 -0
- package/dist/utils/async/retry.d.ts +141 -0
- package/dist/utils/async/retry.js +171 -0
- package/dist/utils/async/withTimeout.d.ts +73 -0
- package/dist/utils/async/withTimeout.js +96 -0
- package/dist/utils/csvProcessor.js +442 -0
- package/dist/utils/fileDetector.d.ts +7 -1
- package/dist/utils/fileDetector.js +91 -18
- package/dist/utils/json/extract.d.ts +103 -0
- package/dist/utils/json/extract.js +248 -0
- package/dist/utils/json/index.d.ts +36 -0
- package/dist/utils/json/index.js +36 -0
- package/dist/utils/json/safeParse.d.ts +137 -0
- package/dist/utils/json/safeParse.js +190 -0
- package/dist/utils/messageBuilder.d.ts +2 -2
- package/dist/utils/messageBuilder.js +15 -7
- package/dist/utils/sanitizers/filename.d.ts +137 -0
- package/dist/utils/sanitizers/filename.js +365 -0
- package/dist/utils/sanitizers/html.d.ts +170 -0
- package/dist/utils/sanitizers/html.js +325 -0
- package/dist/utils/sanitizers/index.d.ts +26 -0
- package/dist/utils/sanitizers/index.js +29 -0
- package/dist/utils/sanitizers/svg.d.ts +81 -0
- package/dist/utils/sanitizers/svg.js +482 -0
- package/package.json +2 -2
|
@@ -6,6 +6,418 @@
|
|
|
6
6
|
import csvParser from "csv-parser";
|
|
7
7
|
import { Readable } from "stream";
|
|
8
8
|
import { logger } from "./logger.js";
|
|
9
|
+
// ============================================================================
// Data Type Detection Patterns
// ============================================================================

// Date-only layouts. Each regex is anchored at both ends and checks digit
// counts and separators only; it does not validate calendar ranges.
const DATE_PATTERNS = [
  { format: "YYYY-MM-DD", regex: /^\d{4}-\d{2}-\d{2}$/ },
  { format: "MM/DD/YYYY", regex: /^\d{2}\/\d{2}\/\d{4}$/ },
  { format: "DD-MM-YYYY", regex: /^\d{2}-\d{2}-\d{4}$/ },
  { format: "DD.MM.YYYY", regex: /^\d{2}\.\d{2}\.\d{4}$/ },
  { format: "YYYY/MM/DD", regex: /^\d{4}\/\d{2}\/\d{2}$/ },
];

// Date-plus-time layouts. These are anchored at the start only, so any
// trailing text (fractional seconds, zone suffix, ...) still matches.
const DATETIME_PATTERNS = [
  { format: "ISO8601", regex: /^\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}/ },
  { format: "MM/DD/YYYY HH:mm", regex: /^\d{2}\/\d{2}\/\d{4} \d{2}:\d{2}/ },
];

// Scalar recognisers applied to already-trimmed cell text.
const EMAIL_REGEX = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;
const URL_REGEX = /^(https?:\/\/|www\.)[^\s]+$/i;
const INTEGER_REGEX = /^-?\d+$/;
const FLOAT_REGEX = /^-?\d+\.\d+$/;

// Lower-case spellings treated as booleans. "1" and "0" are included, so a
// lookup against this set will also accept those two digit strings.
const BOOLEAN_VALUES = new Set([
  "true", "false",
  "yes", "no",
  "1", "0",
  "t", "f",
  "y", "n",
]);
|
|
39
|
+
// ============================================================================
|
|
40
|
+
// Column Name Validation
|
|
41
|
+
// ============================================================================
|
|
42
|
+
/**
 * Inspect a column header and list everything that looks wrong with it.
 *
 * A missing or whitespace-only name short-circuits with a single issue; any
 * other name is run through every rule, and the matching messages are
 * returned in rule order.
 *
 * @param name - The raw column header text
 * @returns Array of human-readable issue descriptions (empty when the name is clean)
 */
function validateColumnName(name) {
  if (!name || name.trim() === "") {
    return ["Empty or blank column name"];
  }

  // Each rule pairs "did the name violate this?" with the message to report.
  const rules = [
    [name !== name.trim(), "Leading or trailing whitespace"],
    [/^\d/.test(name), "Starts with a number"],
    [/[^a-zA-Z0-9_\- ]/.test(name), "Contains special characters"],
    [name.length > 64, "Name exceeds 64 characters"],
    [/\s{2,}/.test(name), "Contains multiple consecutive spaces"],
  ];

  return rules.filter(([violated]) => violated).map(([, message]) => message);
}
|
|
68
|
+
// ============================================================================
|
|
69
|
+
// Data Type Detection
|
|
70
|
+
// ============================================================================
|
|
71
|
+
/**
 * Classify one cell of text into a coarse data type.
 *
 * Classifiers run in a fixed order and the first match wins:
 * boolean, integer, float, email, url, datetime, date. Anything that matches
 * none of them is reported as "string". Booleans are tested before numbers
 * because BOOLEAN_VALUES contains "1" and "0".
 *
 * @param value - The cell text (empty string, null and undefined are accepted)
 * @returns One of "empty", "boolean", "integer", "float", "email", "url",
 *          "datetime", "date" or "string"
 */
function detectValueType(value) {
  if (value === "" || value === null || value === undefined) {
    return "empty";
  }
  const text = value.trim();
  if (text === "") {
    return "empty";
  }

  const matchesAnyOf = (patterns) => patterns.some(({ regex }) => regex.test(text));

  // Ordered [label, predicate] pairs; predicates are lazy so later checks
  // only run when earlier ones fail.
  const classifiers = [
    ["boolean", () => BOOLEAN_VALUES.has(text.toLowerCase())],
    ["integer", () => INTEGER_REGEX.test(text)],
    ["float", () => FLOAT_REGEX.test(text)],
    ["email", () => EMAIL_REGEX.test(text)],
    ["url", () => URL_REGEX.test(text)],
    // datetime goes before date because it is the more specific shape
    ["datetime", () => matchesAnyOf(DATETIME_PATTERNS)],
    ["date", () => matchesAnyOf(DATE_PATTERNS)],
  ];

  const winner = classifiers.find(([, applies]) => applies());
  return winner ? winner[0] : "string";
}
|
|
116
|
+
/**
 * Look up the format label of the first date/datetime pattern a value matches.
 *
 * Datetime patterns are consulted before date-only patterns.
 *
 * @param value - The cell text to inspect (trimmed before matching)
 * @returns The matching pattern's `format` label, or undefined when nothing matches
 */
function detectDateFormat(value) {
  const text = value.trim();
  const candidates = [...DATETIME_PATTERNS, ...DATE_PATTERNS];
  const hit = candidates.find(({ regex }) => regex.test(text));
  return hit?.format;
}
|
|
133
|
+
/**
 * Reduce a list of per-cell type labels to one column-level verdict.
 *
 * "empty" labels are ignored. The outcome is, in order of precedence:
 *  - "empty" (confidence 100) when no non-empty label remains;
 *  - "number" when the only labels present are "integer" and "float";
 *  - "mixed" when the most frequent label covers under 70% of the non-empty
 *    cells and more than one label is present;
 *  - otherwise the most frequent label (the earliest-seen label wins ties).
 *
 * @param types - Per-cell labels as produced by the value classifier
 * @returns `{ type, confidence }` where confidence is a whole-number percentage
 */
function determineColumnType(types) {
  // Tally every non-empty label in a single pass.
  const tally = new Map();
  let populated = 0;
  for (const label of types) {
    if (label === "empty") {
      continue;
    }
    populated += 1;
    tally.set(label, (tally.get(label) ?? 0) + 1);
  }

  if (populated === 0) {
    return { type: "empty", confidence: 100 };
  }

  const asPercent = (count) => Math.round((count / populated) * 100);

  // A column made up solely of integers and floats is reported as "number".
  // This runs ahead of the mixed-type rule so such columns are never "mixed".
  const numericOnly = tally.size === 2 && tally.has("integer") && tally.has("float");
  if (numericOnly) {
    return { type: "number", confidence: asPercent(tally.get("integer") + tally.get("float")) };
  }

  // Strict ">" keeps the first label that reached the highest count.
  const [leadingType, leadingCount] = [...tally].reduce(
    (best, entry) => (entry[1] > best[1] ? entry : best),
    ["string", 0],
  );
  const confidence = asPercent(leadingCount);

  const isMixed = tally.size > 1 && confidence < 70;
  return { type: isMixed ? "mixed" : leadingType, confidence };
}
|
|
173
|
+
/**
 * Analyze a single column and return rich metadata.
 *
 * Every value is trimmed; blank values are counted as nulls and typed
 * "empty". Non-blank values are classified with detectValueType, added to
 * the set of unique values, and - when classified "integer" or "float" -
 * folded into running min / max / sum statistics.
 *
 * The statistics are accumulated inside the loop instead of with
 * `Math.min(...values)` / `Math.max(...values)`: spreading a whole column
 * into a call's argument list throws a RangeError once the column is large
 * enough to exceed the engine's argument limit.
 *
 * @param columnName - Header text of the column
 * @param columnIndex - Zero-based position of the column
 * @param values - Cell values for the column (strings; null/undefined count as blank)
 * @returns Metadata object with name, index, detectedType, typeConfidence,
 *          nullCount, uniqueCount and sampleValues, plus minValue / maxValue /
 *          avgValue when numeric cells were seen, dateFormat when a date or
 *          datetime cell was seen, and nameIssues when the header has problems
 */
function analyzeColumn(columnName, columnIndex, values) {
  const types = [];
  const uniqueValues = new Set();
  let nullCount = 0;
  let dateFormat;

  // Running numeric statistics (see the note in the doc comment above).
  let numericCount = 0;
  let numericSum = 0;
  let runningMin = Infinity;
  let runningMax = -Infinity;

  for (const value of values) {
    const trimmed = value?.trim() ?? "";
    if (trimmed === "") {
      nullCount++;
      types.push("empty");
      continue;
    }
    uniqueValues.add(trimmed);
    const type = detectValueType(trimmed);
    types.push(type);

    // Collect numeric values for statistics
    if (type === "integer" || type === "float") {
      const num = Number.parseFloat(trimmed);
      if (!Number.isNaN(num)) {
        numericCount++;
        numericSum += num;
        runningMin = Math.min(runningMin, num);
        runningMax = Math.max(runningMax, num);
      }
    }

    // Remember the format of the first date-like value only
    if ((type === "date" || type === "datetime") && !dateFormat) {
      dateFormat = detectDateFormat(trimmed);
    }
  }

  const { type: detectedType, confidence } = determineColumnType(types);

  // Sample values: up to 5 unique non-empty values, in first-seen order
  const sampleValues = Array.from(uniqueValues).slice(0, 5);

  // Validate column name
  const nameIssues = validateColumnName(columnName);

  const metadata = {
    name: columnName,
    index: columnIndex,
    detectedType,
    typeConfidence: confidence,
    nullCount,
    uniqueCount: uniqueValues.size,
    sampleValues,
  };

  if (numericCount > 0) {
    metadata.minValue = runningMin;
    metadata.maxValue = runningMax;
    // Average rounded to two decimal places
    metadata.avgValue = Math.round((numericSum / numericCount) * 100) / 100;
  }
  if (dateFormat) {
    metadata.dateFormat = dateFormat;
  }
  if (nameIssues.length > 0) {
    metadata.nameIssues = nameIssues;
  }
  return metadata;
}
|
|
245
|
+
/**
 * Turn per-column metadata into a flat list of data-quality warnings.
 *
 * Up to five checks are applied to each column, in this order:
 *  - high_null_rate: more than 20% of rows blank ("warning" above 50%, else "info")
 *  - invalid_name:   the column carries name issues ("error" when the name is blank)
 *  - mixed_types:    detected type is "mixed" or type confidence is below 70
 *  - duplicates:     more than 10 rows, exactly one unique value and no blanks
 *  - empty_values:   detected type is "empty"
 *
 * @param columns - Column metadata objects (name, nullCount, uniqueCount,
 *                  detectedType, typeConfidence and optional nameIssues are read)
 * @param totalRows - Number of rows the metadata was computed from
 * @returns Warning objects in column order, each column's warnings in check order
 */
function generateDataQualityWarnings(columns, totalRows) {
  const inspect = (col) => {
    const blankShare = totalRows > 0 ? col.nullCount / totalRows : 0;

    // Each slot is either `false`/falsy (check passed) or the warning to emit.
    const findings = [
      blankShare > 0.2 && {
        column: col.name,
        type: "high_null_rate",
        message: `Column has ${Math.round(blankShare * 100)}% empty/null values (${col.nullCount} of ${totalRows} rows)`,
        severity: blankShare > 0.5 ? "warning" : "info",
        affectedRows: col.nullCount,
      },
      col.nameIssues?.length > 0 && {
        column: col.name,
        type: "invalid_name",
        message: `Column name issues: ${col.nameIssues.join(", ")}`,
        severity: col.name.trim() === "" ? "error" : "warning",
      },
      (col.detectedType === "mixed" || col.typeConfidence < 70) && {
        column: col.name,
        type: "mixed_types",
        message: `Column has inconsistent data types (${col.typeConfidence}% confidence for ${col.detectedType})`,
        severity: "warning",
      },
      totalRows > 10 && col.uniqueCount === 1 && col.nullCount === 0 && {
        column: col.name,
        type: "duplicates",
        message: `All ${totalRows} rows have the same value`,
        severity: "info",
        affectedRows: totalRows,
      },
      col.detectedType === "empty" && {
        column: col.name,
        type: "empty_values",
        message: "Column is entirely empty",
        severity: "warning",
        affectedRows: totalRows,
      },
    ];

    return findings.filter(Boolean);
  };

  const warnings = [];
  for (const col of columns) {
    warnings.push(...inspect(col));
  }
  return warnings;
}
|
|
303
|
+
/**
 * Compute an overall 0-100 data-quality score.
 *
 * Starts from 100 and subtracts three penalties:
 *  - per warning: 15 for "error", 8 for "warning", 3 for "info"
 *    (any other severity costs nothing);
 *  - blank cells: the overall blank-cell share scaled to at most 30 points;
 *  - type confidence: half of the shortfall when the average column
 *    confidence is below 80.
 * The result is clamped to the 0-100 range. With no columns or no rows the
 * score is 0.
 *
 * @param columns - Column metadata objects (nullCount and typeConfidence are read)
 * @param warnings - Warning objects (severity is read)
 * @param totalRows - Number of rows the metadata was computed from
 * @returns The clamped score
 */
function calculateDataQualityScore(columns, warnings, totalRows) {
  if (columns.length === 0 || totalRows === 0) {
    return 0;
  }

  const severityCost = new Map([
    ["error", 15],
    ["warning", 8],
    ["info", 3],
  ]);
  let warningPenalty = 0;
  for (const { severity } of warnings) {
    warningPenalty += severityCost.get(severity) ?? 0;
  }

  // Share of blank cells across the whole table, worth up to 30 points.
  const cellCount = columns.length * totalRows;
  const blankCells = columns.reduce((running, { nullCount }) => running + nullCount, 0);
  const blankShare = cellCount > 0 ? blankCells / cellCount : 0;
  const blankPenalty = Math.round(blankShare * 30);

  // Half a point per percentage point of average confidence below 80.
  const meanConfidence =
    columns.reduce((running, { typeConfidence }) => running + typeConfidence, 0) / columns.length;
  const confidencePenalty = meanConfidence < 80 ? Math.round((80 - meanConfidence) / 2) : 0;

  const unclamped = 100 - warningPenalty - blankPenalty - confidencePenalty;
  return Math.max(0, Math.min(100, unclamped));
}
|
|
337
|
+
/**
 * Run the column analysis over every column of a parsed CSV.
 *
 * The column list is taken from the keys of the first row. Each cell is
 * converted with String(), with null/undefined becoming "", before it is
 * handed to analyzeColumn. Warnings and the overall score are then derived
 * from the collected metadata.
 *
 * @param rows - Parsed rows as objects keyed by column name
 * @returns `{ columnMetadata, dataQualityWarnings, dataQualityScore }`;
 *          all-empty (score 0) when there are no rows
 */
function analyzeColumns(rows) {
  const rowCount = rows.length;
  if (rowCount === 0) {
    return { columnMetadata: [], dataQualityWarnings: [], dataQualityScore: 0 };
  }

  const columnMetadata = Object.keys(rows[0]).map((header, position) => {
    const cells = rows.map((record) => String(record[header] ?? ""));
    return analyzeColumn(header, position, cells);
  });

  const dataQualityWarnings = generateDataQualityWarnings(columnMetadata, rowCount);
  const dataQualityScore = calculateDataQualityScore(columnMetadata, dataQualityWarnings, rowCount);

  return { columnMetadata, dataQualityWarnings, dataQualityScore };
}
|
|
363
|
+
/**
 * Guess whether the first row of a CSV is a header row.
 *
 * Signals used:
 * 1. Text ratio - the share of non-blank first-row values that classify as
 *    plain "string" (not numbers, dates, emails, URLs or booleans).
 * 2. Uniqueness - whether all first-row values are distinct after trimming
 *    and lower-casing.
 * 3. Contrast - when data rows are supplied, whether the first data row is
 *    noticeably less text-like than the first row.
 *
 * A strong contrast (text ratio above 0.7 and the data row at least 0.2
 * lower) is accepted on its own; otherwise the row counts as headers when the
 * text ratio is at least 0.7 and the values are unique.
 *
 * @param headerValues - The values from the first row (potential headers)
 * @param dataRows - Sample of data rows for comparison (optional)
 * @returns true if the first row appears to be headers
 */
function detectHasHeaders(headerValues, dataRows) {
  if (headerValues.length === 0) {
    return false;
  }

  const looksLikeText = (cell) => detectValueType(cell) === "string";

  // Signal 1: blank cells are tolerated but excluded from the ratio.
  const filledCells = headerValues
    .map((cell) => cell?.trim() ?? "")
    .filter((cell) => cell !== "");
  if (filledCells.length === 0) {
    return false;
  }
  const textRatio = filledCells.filter(looksLikeText).length / filledCells.length;

  // Signal 3: compare against the first data row when one is available.
  if (dataRows?.length > 0) {
    const sampleCells = Object.values(dataRows[0] || {}).map((cell) => String(cell ?? "").trim());
    const sampleTextRatio =
      sampleCells.length > 0 ? sampleCells.filter(looksLikeText).length / sampleCells.length : 0;
    if (textRatio > 0.7 && sampleTextRatio < textRatio - 0.2) {
      return true;
    }
  }

  // Signal 2: every first-row value must be distinct (case-insensitive).
  const distinctNames = new Set(headerValues.map((cell) => cell?.trim().toLowerCase()));
  const allDistinct = distinctNames.size === headerValues.length;

  return textRatio >= 0.7 && allDistinct;
}
|
|
9
421
|
/**
|
|
10
422
|
* Detect if first line is CSV metadata (not actual data/headers)
|
|
11
423
|
* Common patterns:
|
|
@@ -106,6 +518,16 @@ export class CSVProcessor {
|
|
|
106
518
|
columnCount: (limitedLines[0] || "").split(",").length,
|
|
107
519
|
truncated: wasTruncated,
|
|
108
520
|
});
|
|
521
|
+
// Parse a sample for enhanced metadata analysis (raw format still benefits from column analysis)
|
|
522
|
+
const sampleForAnalysis = await this.parseCSVString(limitedCSV, Math.min(rowCount, 500));
|
|
523
|
+
const { columnMetadata, dataQualityWarnings, dataQualityScore } = analyzeColumns(sampleForAnalysis);
|
|
524
|
+
// Log data quality summary
|
|
525
|
+
if (dataQualityWarnings.length > 0) {
|
|
526
|
+
logger.debug("[CSVProcessor] Data quality warnings detected", {
|
|
527
|
+
warningCount: dataQualityWarnings.length,
|
|
528
|
+
score: dataQualityScore,
|
|
529
|
+
});
|
|
530
|
+
}
|
|
109
531
|
return {
|
|
110
532
|
type: "csv",
|
|
111
533
|
content: limitedCSV,
|
|
@@ -117,6 +539,11 @@ export class CSVProcessor {
|
|
|
117
539
|
totalLines: limitedLines.length,
|
|
118
540
|
columnCount: (limitedLines[0] || "").split(",").length,
|
|
119
541
|
extension,
|
|
542
|
+
columnMetadata,
|
|
543
|
+
dataQualityWarnings,
|
|
544
|
+
dataQualityScore,
|
|
545
|
+
hasHeaders: detectHasHeaders((limitedLines[0] || "").split(","), undefined),
|
|
546
|
+
detectedDelimiter: ",",
|
|
120
547
|
},
|
|
121
548
|
};
|
|
122
549
|
}
|
|
@@ -155,6 +582,15 @@ export class CSVProcessor {
|
|
|
155
582
|
if (rowCount === 0) {
|
|
156
583
|
logger.warn("[CSVProcessor] CSV file contains no data rows");
|
|
157
584
|
}
|
|
585
|
+
// Perform enhanced column analysis
|
|
586
|
+
const { columnMetadata, dataQualityWarnings, dataQualityScore } = analyzeColumns(nonEmptyRows);
|
|
587
|
+
// Log data quality summary
|
|
588
|
+
if (dataQualityWarnings.length > 0) {
|
|
589
|
+
logger.debug("[CSVProcessor] Data quality warnings detected", {
|
|
590
|
+
warningCount: dataQualityWarnings.length,
|
|
591
|
+
score: dataQualityScore,
|
|
592
|
+
});
|
|
593
|
+
}
|
|
158
594
|
// Format parsed data
|
|
159
595
|
logger.debug(`[CSVProcessor] Converting ${rowCount} rows to ${formatStyle} format`);
|
|
160
596
|
const formatted = this.formatForLLM(nonEmptyRows, formatStyle, includeHeaders);
|
|
@@ -164,6 +600,7 @@ export class CSVProcessor {
|
|
|
164
600
|
columnCount,
|
|
165
601
|
outputLength: formatted.length,
|
|
166
602
|
hasEmptyColumns,
|
|
603
|
+
dataQualityScore,
|
|
167
604
|
});
|
|
168
605
|
return {
|
|
169
606
|
type: "csv",
|
|
@@ -178,6 +615,11 @@ export class CSVProcessor {
|
|
|
178
615
|
sampleData,
|
|
179
616
|
hasEmptyColumns,
|
|
180
617
|
extension,
|
|
618
|
+
columnMetadata,
|
|
619
|
+
dataQualityWarnings,
|
|
620
|
+
dataQualityScore,
|
|
621
|
+
hasHeaders: detectHasHeaders(columnNames, nonEmptyRows),
|
|
622
|
+
detectedDelimiter: ",",
|
|
181
623
|
},
|
|
182
624
|
};
|
|
183
625
|
}
|
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* Centralized file detection for all multimodal file types
|
|
4
4
|
* Uses multi-strategy approach for reliable type identification
|
|
5
5
|
*/
|
|
6
|
-
import type { FileInput, FileProcessingResult
|
|
6
|
+
import type { FileDetectorOptions, FileInput, FileProcessingResult } from "../types/fileTypes.js";
|
|
7
7
|
/**
|
|
8
8
|
* Centralized file type detection and processing
|
|
9
9
|
*
|
|
@@ -69,6 +69,12 @@ export declare class FileDetector {
|
|
|
69
69
|
* Route to appropriate processor
|
|
70
70
|
*/
|
|
71
71
|
private static processFile;
|
|
72
|
+
/**
|
|
73
|
+
* Process SVG file as text content
|
|
74
|
+
* Uses SvgProcessor for security sanitization (removes XSS vectors)
|
|
75
|
+
* Returns sanitized SVG markup as text for AI analysis
|
|
76
|
+
*/
|
|
77
|
+
private static processSvgAsText;
|
|
72
78
|
/**
|
|
73
79
|
* Load file from URL with automatic retry on transient network errors
|
|
74
80
|
*/
|