@juspay/neurolink 9.56.1 → 9.56.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +6 -0
- package/dist/browser/neurolink.min.js +306 -306
- package/dist/files/fileReferenceRegistry.js +25 -10
- package/dist/lib/files/fileReferenceRegistry.js +25 -10
- package/dist/lib/types/file.d.ts +10 -0
- package/dist/lib/types/fileReference.d.ts +9 -0
- package/dist/lib/utils/fileDetector.d.ts +7 -0
- package/dist/lib/utils/fileDetector.js +47 -0
- package/dist/lib/utils/messageBuilder.js +15 -1
- package/dist/lib/utils/mimeTypeHints.d.ts +40 -0
- package/dist/lib/utils/mimeTypeHints.js +122 -0
- package/dist/types/file.d.ts +10 -0
- package/dist/types/fileReference.d.ts +9 -0
- package/dist/utils/fileDetector.d.ts +7 -0
- package/dist/utils/fileDetector.js +47 -0
- package/dist/utils/messageBuilder.js +15 -1
- package/dist/utils/mimeTypeHints.d.ts +40 -0
- package/dist/utils/mimeTypeHints.js +121 -0
- package/package.json +1 -1
|
package/dist/files/fileReferenceRegistry.js
CHANGED
|
@@ -17,6 +17,7 @@ import { tmpdir } from "node:os";
|
|
|
17
17
|
import { basename, extname, join } from "node:path";
|
|
18
18
|
import { estimatePostProcessingTokens } from "../context/fileTokenBudget.js";
|
|
19
19
|
import { logger } from "../utils/logger.js";
|
|
20
|
+
import { mimeHintToExtension, mimeHintToFileType, normalizeMimeHint, } from "../utils/mimeTypeHints.js";
|
|
20
21
|
import { StreamingReader } from "./streamingReader.js";
|
|
21
22
|
import { SIZE_TIER_THRESHOLDS } from "../types/index.js";
|
|
22
23
|
/** Default maximum files in registry before LRU eviction */
|
|
@@ -89,19 +90,33 @@ export class FileReferenceRegistry {
|
|
|
89
90
|
const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(1);
|
|
90
91
|
throw new Error(`File too large (${sizeMB} MB). Maximum accepted size is 2 GB.`);
|
|
91
92
|
}
|
|
93
|
+
// Normalize the caller-provided mimetype hint — shared helper drops
|
|
94
|
+
// `application/octet-stream` because that opaque sentinel would
|
|
95
|
+
// otherwise be trusted verbatim for the output mimeType and mask a
|
|
96
|
+
// better magic-byte-derived classification (e.g. PNG bytes hinted as
|
|
97
|
+
// octet-stream would record mimeType=octet-stream, not image/png).
|
|
98
|
+
const hintMime = normalizeMimeHint(options.mimetype);
|
|
99
|
+
const hintExt = hintMime ? mimeHintToExtension(hintMime) : "";
|
|
92
100
|
// Detect file type from magic bytes and extension.
|
|
93
|
-
// If the provided filename has no extension, append one guessed from
|
|
94
|
-
//
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
+
// If the provided filename has no extension, append one guessed from the
|
|
102
|
+
// mimetype hint first (more reliable for text formats than magic bytes),
|
|
103
|
+
// then fall back to magic bytes — so downstream processors (e.g.,
|
|
104
|
+
// VideoProcessor) can validate by extension. Compute once, reuse.
|
|
105
|
+
const synthDefaultExt = hintExt
|
|
106
|
+
? `.${hintExt}`
|
|
107
|
+
: this.guessExtension(buffer);
|
|
108
|
+
let filename = options.filename || `file-${Date.now()}${synthDefaultExt}`;
|
|
109
|
+
if (!extname(filename) && synthDefaultExt) {
|
|
110
|
+
filename = `${filename}${synthDefaultExt}`;
|
|
101
111
|
}
|
|
102
112
|
const ext = extname(filename).toLowerCase().replace(".", "");
|
|
103
|
-
const detectedType = options.fileType ||
|
|
104
|
-
|
|
113
|
+
const detectedType = options.fileType ||
|
|
114
|
+
(hintMime && mimeHintToFileType(hintMime)) ||
|
|
115
|
+
this.detectType(buffer, ext);
|
|
116
|
+
// Prefer the caller's hint verbatim for the output mimeType, but only
|
|
117
|
+
// when normalizeMimeHint accepted it (i.e. it is not the opaque
|
|
118
|
+
// octet-stream sentinel). Otherwise derive from the detected type.
|
|
119
|
+
const mimeType = hintMime || this.guessMimeType(detectedType, ext);
|
|
105
120
|
const sizeTier = FileReferenceRegistry.classifySizeTier(sizeBytes);
|
|
106
121
|
// Generate preview (fast — only reads first N chars)
|
|
107
122
|
const preview = this.extractPreview(buffer, detectedType, options.maxPreviewChars ?? this.defaultPreviewChars);
|
|
package/dist/lib/files/fileReferenceRegistry.js
CHANGED
|
@@ -17,6 +17,7 @@ import { tmpdir } from "node:os";
|
|
|
17
17
|
import { basename, extname, join } from "node:path";
|
|
18
18
|
import { estimatePostProcessingTokens } from "../context/fileTokenBudget.js";
|
|
19
19
|
import { logger } from "../utils/logger.js";
|
|
20
|
+
import { mimeHintToExtension, mimeHintToFileType, normalizeMimeHint, } from "../utils/mimeTypeHints.js";
|
|
20
21
|
import { StreamingReader } from "./streamingReader.js";
|
|
21
22
|
import { SIZE_TIER_THRESHOLDS } from "../types/index.js";
|
|
22
23
|
/** Default maximum files in registry before LRU eviction */
|
|
@@ -89,19 +90,33 @@ export class FileReferenceRegistry {
|
|
|
89
90
|
const sizeMB = (sizeBytes / (1024 * 1024)).toFixed(1);
|
|
90
91
|
throw new Error(`File too large (${sizeMB} MB). Maximum accepted size is 2 GB.`);
|
|
91
92
|
}
|
|
93
|
+
// Normalize the caller-provided mimetype hint — shared helper drops
|
|
94
|
+
// `application/octet-stream` because that opaque sentinel would
|
|
95
|
+
// otherwise be trusted verbatim for the output mimeType and mask a
|
|
96
|
+
// better magic-byte-derived classification (e.g. PNG bytes hinted as
|
|
97
|
+
// octet-stream would record mimeType=octet-stream, not image/png).
|
|
98
|
+
const hintMime = normalizeMimeHint(options.mimetype);
|
|
99
|
+
const hintExt = hintMime ? mimeHintToExtension(hintMime) : "";
|
|
92
100
|
// Detect file type from magic bytes and extension.
|
|
93
|
-
// If the provided filename has no extension, append one guessed from
|
|
94
|
-
//
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
+
// If the provided filename has no extension, append one guessed from the
|
|
102
|
+
// mimetype hint first (more reliable for text formats than magic bytes),
|
|
103
|
+
// then fall back to magic bytes — so downstream processors (e.g.,
|
|
104
|
+
// VideoProcessor) can validate by extension. Compute once, reuse.
|
|
105
|
+
const synthDefaultExt = hintExt
|
|
106
|
+
? `.${hintExt}`
|
|
107
|
+
: this.guessExtension(buffer);
|
|
108
|
+
let filename = options.filename || `file-${Date.now()}${synthDefaultExt}`;
|
|
109
|
+
if (!extname(filename) && synthDefaultExt) {
|
|
110
|
+
filename = `${filename}${synthDefaultExt}`;
|
|
101
111
|
}
|
|
102
112
|
const ext = extname(filename).toLowerCase().replace(".", "");
|
|
103
|
-
const detectedType = options.fileType ||
|
|
104
|
-
|
|
113
|
+
const detectedType = options.fileType ||
|
|
114
|
+
(hintMime && mimeHintToFileType(hintMime)) ||
|
|
115
|
+
this.detectType(buffer, ext);
|
|
116
|
+
// Prefer the caller's hint verbatim for the output mimeType, but only
|
|
117
|
+
// when normalizeMimeHint accepted it (i.e. it is not the opaque
|
|
118
|
+
// octet-stream sentinel). Otherwise derive from the detected type.
|
|
119
|
+
const mimeType = hintMime || this.guessMimeType(detectedType, ext);
|
|
105
120
|
const sizeTier = FileReferenceRegistry.classifySizeTier(sizeBytes);
|
|
106
121
|
// Generate preview (fast — only reads first N chars)
|
|
107
122
|
const preview = this.extractPreview(buffer, detectedType, options.maxPreviewChars ?? this.defaultPreviewChars);
|
package/dist/lib/types/file.d.ts
CHANGED
|
@@ -307,6 +307,16 @@ export type FileDetectorOptions = {
|
|
|
307
307
|
maxRetries?: number;
|
|
308
308
|
/** Initial retry delay in milliseconds with exponential backoff (default: 1000) */
|
|
309
309
|
retryDelay?: number;
|
|
310
|
+
/**
|
|
311
|
+
* Caller-provided MIME type hint (e.g. "text/plain", "application/json").
|
|
312
|
+
* Used when the filename has no extension and magic-byte detection cannot
|
|
313
|
+
* identify the content — the common Slack/Curator extension-less-buffer
|
|
314
|
+
* case. When set to a trustworthy mimetype (not "application/octet-stream"),
|
|
315
|
+
* it short-circuits the detection strategy loop with a high-confidence
|
|
316
|
+
* result so small files on the eager file-processing path still honor the
|
|
317
|
+
* hint (the lazy FileReferenceRegistry path has its own hint-handling).
|
|
318
|
+
*/
|
|
319
|
+
mimetypeHint?: string;
|
|
310
320
|
};
|
|
311
321
|
/**
|
|
312
322
|
* Google AI Studio Files API types
|
|
package/dist/lib/types/fileReference.d.ts
CHANGED
|
@@ -96,6 +96,15 @@ export type FileRegistrationOptions = {
|
|
|
96
96
|
filename?: string;
|
|
97
97
|
/** Override file type detection */
|
|
98
98
|
fileType?: FileType;
|
|
99
|
+
/**
|
|
100
|
+
* Caller-provided MIME type hint (e.g. "text/plain", "application/json").
|
|
101
|
+
* Used when the filename has no extension and magic-byte detection cannot
|
|
102
|
+
* identify the content (common for Slack/Curator-style buffers where the
|
|
103
|
+
* original extension was stripped). Honored during type detection, mimeType
|
|
104
|
+
* assignment, and filename-extension synthesis. An explicit `fileType`
|
|
105
|
+
* override still wins over this hint.
|
|
106
|
+
*/
|
|
107
|
+
mimetype?: string;
|
|
99
108
|
/** Maximum preview length in characters */
|
|
100
109
|
maxPreviewChars?: number;
|
|
101
110
|
/** Skip persisting buffer to temp directory */
|
|
package/dist/lib/utils/fileDetector.d.ts
CHANGED
|
@@ -43,6 +43,13 @@ export declare class FileDetector {
|
|
|
43
43
|
* Derive byte size from FileInput for tracing.
|
|
44
44
|
*/
|
|
45
45
|
private static deriveInputSize;
|
|
46
|
+
/**
|
|
47
|
+
* Classify a FileInput into the FileSource enum used by downstream
|
|
48
|
+
* loaders. Keeps the mimetype-hint short-circuit in detect() able to
|
|
49
|
+
* produce a valid FileDetectionResult without re-implementing the
|
|
50
|
+
* source-inference rules scattered across loadContent().
|
|
51
|
+
*/
|
|
52
|
+
private static deriveInputSource;
|
|
46
53
|
/**
|
|
47
54
|
* Try fallback parsing for a specific file type
|
|
48
55
|
* Used when file detection returns "unknown" but we want to try parsing anyway
|
|
package/dist/lib/utils/fileDetector.js
CHANGED
|
@@ -23,6 +23,7 @@ import { tracers, ATTR, withSpan } from "../telemetry/index.js";
|
|
|
23
23
|
import { CSVProcessor } from "./csvProcessor.js";
|
|
24
24
|
import { ImageProcessor } from "./imageProcessor.js";
|
|
25
25
|
import { logger } from "./logger.js";
|
|
26
|
+
import { mimeHintToExtension, mimeHintToFileType, normalizeMimeHint, } from "./mimeTypeHints.js";
|
|
26
27
|
import { PDFProcessor } from "./pdfProcessor.js";
|
|
27
28
|
/**
|
|
28
29
|
* Default retry configuration constants
|
|
@@ -320,6 +321,27 @@ export class FileDetector {
|
|
|
320
321
|
}
|
|
321
322
|
return 0;
|
|
322
323
|
}
|
|
324
|
+
/**
|
|
325
|
+
* Classify a FileInput into the FileSource enum used by downstream
|
|
326
|
+
* loaders. Keeps the mimetype-hint short-circuit in detect() able to
|
|
327
|
+
* produce a valid FileDetectionResult without re-implementing the
|
|
328
|
+
* source-inference rules scattered across loadContent().
|
|
329
|
+
*/
|
|
330
|
+
static deriveInputSource(input) {
|
|
331
|
+
if (Buffer.isBuffer(input)) {
|
|
332
|
+
return "buffer";
|
|
333
|
+
}
|
|
334
|
+
if (typeof input === "string") {
|
|
335
|
+
if (input.startsWith("data:")) {
|
|
336
|
+
return "datauri";
|
|
337
|
+
}
|
|
338
|
+
if (input.startsWith("http://") || input.startsWith("https://")) {
|
|
339
|
+
return "url";
|
|
340
|
+
}
|
|
341
|
+
return "path";
|
|
342
|
+
}
|
|
343
|
+
return "buffer";
|
|
344
|
+
}
|
|
323
345
|
/**
|
|
324
346
|
* Try fallback parsing for a specific file type
|
|
325
347
|
* Used when file detection returns "unknown" but we want to try parsing anyway
|
|
@@ -520,6 +542,31 @@ export class FileDetector {
|
|
|
520
542
|
* Stops at first strategy with confidence >= threshold (default: 80%)
|
|
521
543
|
*/
|
|
522
544
|
static async detect(input, options) {
|
|
545
|
+
// Short-circuit on a trustworthy caller-provided mimetype hint. This is
|
|
546
|
+
// the eager-path counterpart to FileReferenceRegistry.register()'s hint
|
|
547
|
+
// handling — necessary for tiny files (<= TINY_MAX) that skip the lazy
|
|
548
|
+
// registry path. normalizeMimeHint drops "application/octet-stream" so a
|
|
549
|
+
// caller cannot hide real content behind the opaque sentinel.
|
|
550
|
+
const hintMime = normalizeMimeHint(options?.mimetypeHint);
|
|
551
|
+
if (hintMime) {
|
|
552
|
+
const type = mimeHintToFileType(hintMime);
|
|
553
|
+
if (type) {
|
|
554
|
+
const ext = mimeHintToExtension(hintMime);
|
|
555
|
+
const result = {
|
|
556
|
+
type,
|
|
557
|
+
mimeType: hintMime,
|
|
558
|
+
extension: ext || null,
|
|
559
|
+
source: FileDetector.deriveInputSource(input),
|
|
560
|
+
metadata: {
|
|
561
|
+
confidence: 95,
|
|
562
|
+
filename: FileDetector.deriveInputFilename(input),
|
|
563
|
+
size: FileDetector.deriveInputSize(input),
|
|
564
|
+
},
|
|
565
|
+
};
|
|
566
|
+
logger.info(`[FileDetector] Type: ${type} (95%, from mimetype hint: ${hintMime})`);
|
|
567
|
+
return result;
|
|
568
|
+
}
|
|
569
|
+
}
|
|
523
570
|
const confidenceThreshold = options?.confidenceThreshold ?? 80;
|
|
524
571
|
const strategies = [
|
|
525
572
|
new MagicBytesStrategy(),
|
|
package/dist/lib/utils/messageBuilder.js
CHANGED
|
@@ -554,6 +554,7 @@ export async function buildMessagesArray(options) {
|
|
|
554
554
|
maxSize: 50 * 1024 * 1024,
|
|
555
555
|
allowedTypes: ["csv"],
|
|
556
556
|
csvOptions: csvOptions,
|
|
557
|
+
mimetypeHint: isFileWithMetadata(file) ? file.mimetype : undefined,
|
|
557
558
|
});
|
|
558
559
|
if (result.type === "csv") {
|
|
559
560
|
let csvSection = `\n\n## CSV Data from "${filename}":\n`;
|
|
@@ -806,6 +807,12 @@ async function processUnifiedFilesArray(options, maxSize, provider) {
|
|
|
806
807
|
// ─── Full processing path (current behavior) ──────────────────
|
|
807
808
|
const genericFileMaxSize = Math.max(maxSize, 100 * 1024 * 1024);
|
|
808
809
|
const rawFileInput = isFileWithMetadata(file) ? file.buffer : file;
|
|
810
|
+
// Forward the caller's mimetype hint (Slack/Curator-style
|
|
811
|
+
// extension-less buffers) so the eager path classifies correctly
|
|
812
|
+
// for tiny files — the lazy registry path has its own hint wiring.
|
|
813
|
+
const fileMimetypeHint = isFileWithMetadata(file)
|
|
814
|
+
? file.mimetype
|
|
815
|
+
: undefined;
|
|
809
816
|
const result = await FileDetector.detectAndProcess(rawFileInput, {
|
|
810
817
|
maxSize: genericFileMaxSize,
|
|
811
818
|
allowedTypes: [
|
|
@@ -824,6 +831,7 @@ async function processUnifiedFilesArray(options, maxSize, provider) {
|
|
|
824
831
|
],
|
|
825
832
|
csvOptions: options.csvOptions,
|
|
826
833
|
provider: provider,
|
|
834
|
+
mimetypeHint: fileMimetypeHint,
|
|
827
835
|
});
|
|
828
836
|
appendDetectedFileResult(result, file, options);
|
|
829
837
|
includedCount++;
|
|
@@ -1658,7 +1666,13 @@ async function tryRegisterFileReference(file, fileSize, registry, index = 0) {
|
|
|
1658
1666
|
return false;
|
|
1659
1667
|
}
|
|
1660
1668
|
const filename = extractFilename(file, index);
|
|
1661
|
-
|
|
1669
|
+
const mimetype = typeof file === "object" && !Buffer.isBuffer(file)
|
|
1670
|
+
? file.mimetype
|
|
1671
|
+
: undefined;
|
|
1672
|
+
await registry.register(buffer, getFileSource(file), {
|
|
1673
|
+
filename,
|
|
1674
|
+
mimetype,
|
|
1675
|
+
});
|
|
1662
1676
|
logger.info(`[FileDetector] Registered "${filename}" (${(fileSize / 1024).toFixed(0)} KB) ` +
|
|
1663
1677
|
`as lazy reference — skipping upfront processing`);
|
|
1664
1678
|
return true;
|
|
package/dist/lib/utils/mimeTypeHints.d.ts
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared helpers for caller-provided MIME type hints.
|
|
3
|
+
*
|
|
4
|
+
* A "MIME hint" is a mimetype string the SDK receives alongside a raw Buffer
|
|
5
|
+
* whose original filename is missing (e.g. Slack/Curator file-uploads that
|
|
6
|
+
* arrive as { buffer, filename: "Untitled", mimetype: "text/plain" }). When
|
|
7
|
+
* the filename has no extension and magic-byte detection cannot identify the
|
|
8
|
+
* content, the hint is the only signal we have.
|
|
9
|
+
*
|
|
10
|
+
* Both FileReferenceRegistry.register() and FileDetector.detect() consume
|
|
11
|
+
* these helpers so the trust/normalization rules stay in one place:
|
|
12
|
+
*
|
|
13
|
+
* - `application/octet-stream` is never trusted — it is the opaque
|
|
14
|
+
* "I don't know" sentinel and would let a caller hide real content
|
|
15
|
+
* behind a generic label (a PNG hinted as octet-stream would otherwise
|
|
16
|
+
* record mimeType="application/octet-stream" instead of "image/png").
|
|
17
|
+
* - Empty/undefined hints pass through as `undefined`.
|
|
18
|
+
* - A hint that cannot be classified maps to `null` so the caller falls
|
|
19
|
+
* back to magic-byte / extension detection instead of synthesising a
|
|
20
|
+
* wrong type.
|
|
21
|
+
*/
|
|
22
|
+
import type { FileType } from "../types/index.js";
|
|
23
|
+
/**
|
|
24
|
+
* Normalize a caller-provided mimetype hint: strip any `;charset=...`
|
|
25
|
+
* parameter, lowercase, trim. Returns undefined for empty strings or for
|
|
26
|
+
* the opaque `application/octet-stream` sentinel so downstream code can
|
|
27
|
+
* treat the hint as absent instead of trusting it verbatim.
|
|
28
|
+
*/
|
|
29
|
+
export declare function normalizeMimeHint(raw?: string): string | undefined;
|
|
30
|
+
/**
|
|
31
|
+
* Map a normalized mimetype hint to a NeuroLink FileType. Returns null when
|
|
32
|
+
* the mimetype is unknown or too generic to classify confidently.
|
|
33
|
+
*/
|
|
34
|
+
export declare function mimeHintToFileType(mimetype: string): FileType | null;
|
|
35
|
+
/**
|
|
36
|
+
* Map a normalized mimetype hint to the canonical file extension (without
|
|
37
|
+
* leading dot). Returns "" when the mimetype is unknown — caller should
|
|
38
|
+
* then fall back to magic-byte detection.
|
|
39
|
+
*/
|
|
40
|
+
export declare function mimeHintToExtension(mimetype: string): string;
|
|
package/dist/lib/utils/mimeTypeHints.js
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
const OPAQUE_MIMETYPE = "application/octet-stream";
|
|
2
|
+
/**
|
|
3
|
+
* Normalize a caller-provided mimetype hint: strip any `;charset=...`
|
|
4
|
+
* parameter, lowercase, trim. Returns undefined for empty strings or for
|
|
5
|
+
* the opaque `application/octet-stream` sentinel so downstream code can
|
|
6
|
+
* treat the hint as absent instead of trusting it verbatim.
|
|
7
|
+
*/
|
|
8
|
+
export function normalizeMimeHint(raw) {
|
|
9
|
+
if (!raw) {
|
|
10
|
+
return undefined;
|
|
11
|
+
}
|
|
12
|
+
const cleaned = raw.split(";")[0].trim().toLowerCase();
|
|
13
|
+
if (!cleaned || cleaned === OPAQUE_MIMETYPE) {
|
|
14
|
+
return undefined;
|
|
15
|
+
}
|
|
16
|
+
return cleaned;
|
|
17
|
+
}
|
|
18
|
+
/**
|
|
19
|
+
* Map a normalized mimetype hint to a NeuroLink FileType. Returns null when
|
|
20
|
+
* the mimetype is unknown or too generic to classify confidently.
|
|
21
|
+
*/
|
|
22
|
+
export function mimeHintToFileType(mimetype) {
|
|
23
|
+
const exact = {
|
|
24
|
+
"text/csv": "csv",
|
|
25
|
+
"application/csv": "csv",
|
|
26
|
+
"image/svg+xml": "svg",
|
|
27
|
+
"application/pdf": "pdf",
|
|
28
|
+
"application/json": "text",
|
|
29
|
+
"application/xml": "text",
|
|
30
|
+
"text/xml": "text",
|
|
31
|
+
"application/yaml": "text",
|
|
32
|
+
"application/x-yaml": "text",
|
|
33
|
+
"text/yaml": "text",
|
|
34
|
+
"application/javascript": "text",
|
|
35
|
+
"application/typescript": "text",
|
|
36
|
+
"application/zip": "archive",
|
|
37
|
+
"application/x-tar": "archive",
|
|
38
|
+
"application/gzip": "archive",
|
|
39
|
+
"application/x-gzip": "archive",
|
|
40
|
+
"application/x-7z-compressed": "archive",
|
|
41
|
+
"application/vnd.rar": "archive",
|
|
42
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
|
43
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
|
|
44
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
|
45
|
+
};
|
|
46
|
+
if (exact[mimetype]) {
|
|
47
|
+
return exact[mimetype];
|
|
48
|
+
}
|
|
49
|
+
if (mimetype.startsWith("text/")) {
|
|
50
|
+
return "text";
|
|
51
|
+
}
|
|
52
|
+
if (mimetype.startsWith("image/")) {
|
|
53
|
+
return "image";
|
|
54
|
+
}
|
|
55
|
+
if (mimetype.startsWith("audio/")) {
|
|
56
|
+
return "audio";
|
|
57
|
+
}
|
|
58
|
+
if (mimetype.startsWith("video/")) {
|
|
59
|
+
return "video";
|
|
60
|
+
}
|
|
61
|
+
return null;
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Map a normalized mimetype hint to the canonical file extension (without
|
|
65
|
+
* leading dot). Returns "" when the mimetype is unknown — caller should
|
|
66
|
+
* then fall back to magic-byte detection.
|
|
67
|
+
*/
|
|
68
|
+
export function mimeHintToExtension(mimetype) {
|
|
69
|
+
const table = {
|
|
70
|
+
// Text
|
|
71
|
+
"text/plain": "txt",
|
|
72
|
+
"text/html": "html",
|
|
73
|
+
"text/css": "css",
|
|
74
|
+
"text/javascript": "js",
|
|
75
|
+
"application/javascript": "js",
|
|
76
|
+
"application/typescript": "ts",
|
|
77
|
+
"text/markdown": "md",
|
|
78
|
+
"text/csv": "csv",
|
|
79
|
+
"application/csv": "csv",
|
|
80
|
+
"application/json": "json",
|
|
81
|
+
"application/xml": "xml",
|
|
82
|
+
"text/xml": "xml",
|
|
83
|
+
"application/yaml": "yaml",
|
|
84
|
+
"application/x-yaml": "yaml",
|
|
85
|
+
"text/yaml": "yaml",
|
|
86
|
+
// Documents
|
|
87
|
+
"application/pdf": "pdf",
|
|
88
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
|
89
|
+
"application/vnd.openxmlformats-officedocument.presentationml.presentation": "pptx",
|
|
90
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "xlsx",
|
|
91
|
+
// Images
|
|
92
|
+
"image/png": "png",
|
|
93
|
+
"image/jpeg": "jpg",
|
|
94
|
+
"image/gif": "gif",
|
|
95
|
+
"image/webp": "webp",
|
|
96
|
+
"image/bmp": "bmp",
|
|
97
|
+
"image/tiff": "tiff",
|
|
98
|
+
"image/svg+xml": "svg",
|
|
99
|
+
// Video
|
|
100
|
+
"video/mp4": "mp4",
|
|
101
|
+
"video/webm": "webm",
|
|
102
|
+
"video/quicktime": "mov",
|
|
103
|
+
"video/x-matroska": "mkv",
|
|
104
|
+
"video/x-msvideo": "avi",
|
|
105
|
+
// Audio
|
|
106
|
+
"audio/mpeg": "mp3",
|
|
107
|
+
"audio/wav": "wav",
|
|
108
|
+
"audio/ogg": "ogg",
|
|
109
|
+
"audio/flac": "flac",
|
|
110
|
+
"audio/mp4": "m4a",
|
|
111
|
+
"audio/aac": "aac",
|
|
112
|
+
// Archives
|
|
113
|
+
"application/zip": "zip",
|
|
114
|
+
"application/x-tar": "tar",
|
|
115
|
+
"application/gzip": "gz",
|
|
116
|
+
"application/x-gzip": "gz",
|
|
117
|
+
"application/x-7z-compressed": "7z",
|
|
118
|
+
"application/vnd.rar": "rar",
|
|
119
|
+
};
|
|
120
|
+
return table[mimetype] || "";
|
|
121
|
+
}
|
|
122
|
+
//# sourceMappingURL=mimeTypeHints.js.map
|
package/dist/types/file.d.ts
CHANGED
|
@@ -307,6 +307,16 @@ export type FileDetectorOptions = {
|
|
|
307
307
|
maxRetries?: number;
|
|
308
308
|
/** Initial retry delay in milliseconds with exponential backoff (default: 1000) */
|
|
309
309
|
retryDelay?: number;
|
|
310
|
+
/**
|
|
311
|
+
* Caller-provided MIME type hint (e.g. "text/plain", "application/json").
|
|
312
|
+
* Used when the filename has no extension and magic-byte detection cannot
|
|
313
|
+
* identify the content — the common Slack/Curator extension-less-buffer
|
|
314
|
+
* case. When set to a trustworthy mimetype (not "application/octet-stream"),
|
|
315
|
+
* it short-circuits the detection strategy loop with a high-confidence
|
|
316
|
+
* result so small files on the eager file-processing path still honor the
|
|
317
|
+
* hint (the lazy FileReferenceRegistry path has its own hint-handling).
|
|
318
|
+
*/
|
|
319
|
+
mimetypeHint?: string;
|
|
310
320
|
};
|
|
311
321
|
/**
|
|
312
322
|
* Google AI Studio Files API types
|
|
package/dist/types/fileReference.d.ts
CHANGED
|
@@ -96,6 +96,15 @@ export type FileRegistrationOptions = {
|
|
|
96
96
|
filename?: string;
|
|
97
97
|
/** Override file type detection */
|
|
98
98
|
fileType?: FileType;
|
|
99
|
+
/**
|
|
100
|
+
* Caller-provided MIME type hint (e.g. "text/plain", "application/json").
|
|
101
|
+
* Used when the filename has no extension and magic-byte detection cannot
|
|
102
|
+
* identify the content (common for Slack/Curator-style buffers where the
|
|
103
|
+
* original extension was stripped). Honored during type detection, mimeType
|
|
104
|
+
* assignment, and filename-extension synthesis. An explicit `fileType`
|
|
105
|
+
* override still wins over this hint.
|
|
106
|
+
*/
|
|
107
|
+
mimetype?: string;
|
|
99
108
|
/** Maximum preview length in characters */
|
|
100
109
|
maxPreviewChars?: number;
|
|
101
110
|
/** Skip persisting buffer to temp directory */
|
|
package/dist/utils/fileDetector.d.ts
CHANGED
|
@@ -43,6 +43,13 @@ export declare class FileDetector {
|
|
|
43
43
|
* Derive byte size from FileInput for tracing.
|
|
44
44
|
*/
|
|
45
45
|
private static deriveInputSize;
|
|
46
|
+
/**
|
|
47
|
+
* Classify a FileInput into the FileSource enum used by downstream
|
|
48
|
+
* loaders. Keeps the mimetype-hint short-circuit in detect() able to
|
|
49
|
+
* produce a valid FileDetectionResult without re-implementing the
|
|
50
|
+
* source-inference rules scattered across loadContent().
|
|
51
|
+
*/
|
|
52
|
+
private static deriveInputSource;
|
|
46
53
|
/**
|
|
47
54
|
* Try fallback parsing for a specific file type
|
|
48
55
|
* Used when file detection returns "unknown" but we want to try parsing anyway
|
|
package/dist/utils/fileDetector.js
CHANGED
|
@@ -23,6 +23,7 @@ import { tracers, ATTR, withSpan } from "../telemetry/index.js";
|
|
|
23
23
|
import { CSVProcessor } from "./csvProcessor.js";
|
|
24
24
|
import { ImageProcessor } from "./imageProcessor.js";
|
|
25
25
|
import { logger } from "./logger.js";
|
|
26
|
+
import { mimeHintToExtension, mimeHintToFileType, normalizeMimeHint, } from "./mimeTypeHints.js";
|
|
26
27
|
import { PDFProcessor } from "./pdfProcessor.js";
|
|
27
28
|
/**
|
|
28
29
|
* Default retry configuration constants
|
|
@@ -320,6 +321,27 @@ export class FileDetector {
|
|
|
320
321
|
}
|
|
321
322
|
return 0;
|
|
322
323
|
}
|
|
324
|
+
/**
|
|
325
|
+
* Classify a FileInput into the FileSource enum used by downstream
|
|
326
|
+
* loaders. Keeps the mimetype-hint short-circuit in detect() able to
|
|
327
|
+
* produce a valid FileDetectionResult without re-implementing the
|
|
328
|
+
* source-inference rules scattered across loadContent().
|
|
329
|
+
*/
|
|
330
|
+
static deriveInputSource(input) {
|
|
331
|
+
if (Buffer.isBuffer(input)) {
|
|
332
|
+
return "buffer";
|
|
333
|
+
}
|
|
334
|
+
if (typeof input === "string") {
|
|
335
|
+
if (input.startsWith("data:")) {
|
|
336
|
+
return "datauri";
|
|
337
|
+
}
|
|
338
|
+
if (input.startsWith("http://") || input.startsWith("https://")) {
|
|
339
|
+
return "url";
|
|
340
|
+
}
|
|
341
|
+
return "path";
|
|
342
|
+
}
|
|
343
|
+
return "buffer";
|
|
344
|
+
}
|
|
323
345
|
/**
|
|
324
346
|
* Try fallback parsing for a specific file type
|
|
325
347
|
* Used when file detection returns "unknown" but we want to try parsing anyway
|
|
@@ -520,6 +542,31 @@ export class FileDetector {
|
|
|
520
542
|
* Stops at first strategy with confidence >= threshold (default: 80%)
|
|
521
543
|
*/
|
|
522
544
|
static async detect(input, options) {
|
|
545
|
+
// Short-circuit on a trustworthy caller-provided mimetype hint. This is
|
|
546
|
+
// the eager-path counterpart to FileReferenceRegistry.register()'s hint
|
|
547
|
+
// handling — necessary for tiny files (<= TINY_MAX) that skip the lazy
|
|
548
|
+
// registry path. normalizeMimeHint drops "application/octet-stream" so a
|
|
549
|
+
// caller cannot hide real content behind the opaque sentinel.
|
|
550
|
+
const hintMime = normalizeMimeHint(options?.mimetypeHint);
|
|
551
|
+
if (hintMime) {
|
|
552
|
+
const type = mimeHintToFileType(hintMime);
|
|
553
|
+
if (type) {
|
|
554
|
+
const ext = mimeHintToExtension(hintMime);
|
|
555
|
+
const result = {
|
|
556
|
+
type,
|
|
557
|
+
mimeType: hintMime,
|
|
558
|
+
extension: ext || null,
|
|
559
|
+
source: FileDetector.deriveInputSource(input),
|
|
560
|
+
metadata: {
|
|
561
|
+
confidence: 95,
|
|
562
|
+
filename: FileDetector.deriveInputFilename(input),
|
|
563
|
+
size: FileDetector.deriveInputSize(input),
|
|
564
|
+
},
|
|
565
|
+
};
|
|
566
|
+
logger.info(`[FileDetector] Type: ${type} (95%, from mimetype hint: ${hintMime})`);
|
|
567
|
+
return result;
|
|
568
|
+
}
|
|
569
|
+
}
|
|
523
570
|
const confidenceThreshold = options?.confidenceThreshold ?? 80;
|
|
524
571
|
const strategies = [
|
|
525
572
|
new MagicBytesStrategy(),
|
|
package/dist/utils/messageBuilder.js
CHANGED
|
@@ -554,6 +554,7 @@ export async function buildMessagesArray(options) {
|
|
|
554
554
|
maxSize: 50 * 1024 * 1024,
|
|
555
555
|
allowedTypes: ["csv"],
|
|
556
556
|
csvOptions: csvOptions,
|
|
557
|
+
mimetypeHint: isFileWithMetadata(file) ? file.mimetype : undefined,
|
|
557
558
|
});
|
|
558
559
|
if (result.type === "csv") {
|
|
559
560
|
let csvSection = `\n\n## CSV Data from "${filename}":\n`;
|
|
@@ -806,6 +807,12 @@ async function processUnifiedFilesArray(options, maxSize, provider) {
|
|
|
806
807
|
// ─── Full processing path (current behavior) ──────────────────
|
|
807
808
|
const genericFileMaxSize = Math.max(maxSize, 100 * 1024 * 1024);
|
|
808
809
|
const rawFileInput = isFileWithMetadata(file) ? file.buffer : file;
|
|
810
|
+
// Forward the caller's mimetype hint (Slack/Curator-style
|
|
811
|
+
// extension-less buffers) so the eager path classifies correctly
|
|
812
|
+
// for tiny files — the lazy registry path has its own hint wiring.
|
|
813
|
+
const fileMimetypeHint = isFileWithMetadata(file)
|
|
814
|
+
? file.mimetype
|
|
815
|
+
: undefined;
|
|
809
816
|
const result = await FileDetector.detectAndProcess(rawFileInput, {
|
|
810
817
|
maxSize: genericFileMaxSize,
|
|
811
818
|
allowedTypes: [
|
|
@@ -824,6 +831,7 @@ async function processUnifiedFilesArray(options, maxSize, provider) {
|
|
|
824
831
|
],
|
|
825
832
|
csvOptions: options.csvOptions,
|
|
826
833
|
provider: provider,
|
|
834
|
+
mimetypeHint: fileMimetypeHint,
|
|
827
835
|
});
|
|
828
836
|
appendDetectedFileResult(result, file, options);
|
|
829
837
|
includedCount++;
|
|
@@ -1658,7 +1666,13 @@ async function tryRegisterFileReference(file, fileSize, registry, index = 0) {
|
|
|
1658
1666
|
return false;
|
|
1659
1667
|
}
|
|
1660
1668
|
const filename = extractFilename(file, index);
|
|
1661
|
-
|
|
1669
|
+
const mimetype = typeof file === "object" && !Buffer.isBuffer(file)
|
|
1670
|
+
? file.mimetype
|
|
1671
|
+
: undefined;
|
|
1672
|
+
await registry.register(buffer, getFileSource(file), {
|
|
1673
|
+
filename,
|
|
1674
|
+
mimetype,
|
|
1675
|
+
});
|
|
1662
1676
|
logger.info(`[FileDetector] Registered "${filename}" (${(fileSize / 1024).toFixed(0)} KB) ` +
|
|
1663
1677
|
`as lazy reference — skipping upfront processing`);
|
|
1664
1678
|
return true;
|
|
package/dist/utils/mimeTypeHints.d.ts
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Shared helpers for caller-provided MIME type hints.
|
|
3
|
+
*
|
|
4
|
+
* A "MIME hint" is a mimetype string the SDK receives alongside a raw Buffer
|
|
5
|
+
* whose original filename is missing (e.g. Slack/Curator file-uploads that
|
|
6
|
+
* arrive as { buffer, filename: "Untitled", mimetype: "text/plain" }). When
|
|
7
|
+
* the filename has no extension and magic-byte detection cannot identify the
|
|
8
|
+
* content, the hint is the only signal we have.
|
|
9
|
+
*
|
|
10
|
+
* Both FileReferenceRegistry.register() and FileDetector.detect() consume
|
|
11
|
+
* these helpers so the trust/normalization rules stay in one place:
|
|
12
|
+
*
|
|
13
|
+
* - `application/octet-stream` is never trusted — it is the opaque
|
|
14
|
+
* "I don't know" sentinel and would let a caller hide real content
|
|
15
|
+
* behind a generic label (a PNG hinted as octet-stream would otherwise
|
|
16
|
+
* record mimeType="application/octet-stream" instead of "image/png").
|
|
17
|
+
* - Empty/undefined hints pass through as `undefined`.
|
|
18
|
+
* - A hint that cannot be classified maps to `null` so the caller falls
|
|
19
|
+
* back to magic-byte / extension detection instead of synthesising a
|
|
20
|
+
* wrong type.
|
|
21
|
+
*/
|
|
22
|
+
import type { FileType } from "../types/index.js";
|
|
23
|
+
/**
|
|
24
|
+
* Normalize a caller-provided mimetype hint: strip any `;charset=...`
|
|
25
|
+
* parameter, lowercase, trim. Returns undefined for empty strings or for
|
|
26
|
+
* the opaque `application/octet-stream` sentinel so downstream code can
|
|
27
|
+
* treat the hint as absent instead of trusting it verbatim.
|
|
28
|
+
*/
|
|
29
|
+
export declare function normalizeMimeHint(raw?: string): string | undefined;
|
|
30
|
+
/**
|
|
31
|
+
* Map a normalized mimetype hint to a NeuroLink FileType. Returns null when
|
|
32
|
+
* the mimetype is unknown or too generic to classify confidently.
|
|
33
|
+
*/
|
|
34
|
+
export declare function mimeHintToFileType(mimetype: string): FileType | null;
|
|
35
|
+
/**
|
|
36
|
+
* Map a normalized mimetype hint to the canonical file extension (without
|
|
37
|
+
* leading dot). Returns "" when the mimetype is unknown — caller should
|
|
38
|
+
* then fall back to magic-byte detection.
|
|
39
|
+
*/
|
|
40
|
+
export declare function mimeHintToExtension(mimetype: string): string;
|