mime-bytes 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -0
- package/README.md +398 -0
- package/esm/file-type-detector.js +374 -0
- package/esm/file-types-registry.js +1208 -0
- package/esm/index.js +11 -0
- package/esm/peak.js +90 -0
- package/esm/utils/extensions.js +114 -0
- package/esm/utils/magic-bytes.js +61 -0
- package/esm/utils/mime-types.js +90 -0
- package/file-type-detector.d.ts +101 -0
- package/file-type-detector.js +381 -0
- package/file-types-registry.d.ts +28 -0
- package/file-types-registry.js +1217 -0
- package/index.d.ts +6 -0
- package/index.js +42 -0
- package/package.json +38 -0
- package/peak.d.ts +20 -0
- package/peak.js +95 -0
- package/utils/extensions.d.ts +9 -0
- package/utils/extensions.js +125 -0
- package/utils/magic-bytes.d.ts +9 -0
- package/utils/magic-bytes.js +69 -0
- package/utils/mime-types.d.ts +14 -0
- package/utils/mime-types.js +98 -0
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
// Main file type detector class with stream-focused API
|
|
2
|
+
import { peek } from './peak';
|
|
3
|
+
import { FILE_TYPES, getFileTypeByExtension, getFileTypesByCategory, getContentTypeByExtension, detectCharset, getContentTypeForExtension } from './file-types-registry';
|
|
4
|
+
import { compareBytes } from './utils/magic-bytes';
|
|
5
|
+
import { normalizeExtension } from './utils/extensions';
|
|
6
|
+
import { resolveMimeAlias } from './utils/mime-types';
|
|
7
|
+
/**
 * Detects file types from streams, buffers and file extensions.
 *
 * Each instance works on its own copy of the FILE_TYPES list, so
 * addFileType()/removeFileType() never touch the shared registry array.
 */
export class FileTypeDetector {
    /** Per-instance list of file type definitions used for magic-byte matching. */
    fileTypes;
    /** Resolved options: peekBytes, checkMultipleOffsets, maxOffset. */
    options;
    /**
     * Emptied by clearCache(). Nothing in this class stores entries in it.
     * NOTE(review): looks unused — confirm before relying on it.
     */
    magicBytesCache;
    /** Maps a normalized extension to the registry lookup result for it. */
    extensionCache;
    /**
     * @param options - Optional settings:
     *   - peekBytes: how many leading bytes are read from a stream (default 32;
     *     any falsy value, including 0, falls back to the default)
     *   - checkMultipleOffsets: probe offsets other than 0 (default true; only
     *     an explicit `false` disables it)
     *   - maxOffset: largest offset that is probed (default 12; an explicit 0
     *     is honored and restricts probing to offset 0)
     */
    constructor(options = {}) {
        // Create a copy of FILE_TYPES to avoid modifying the global registry
        this.fileTypes = [...FILE_TYPES];
        this.options = {
            peekBytes: options.peekBytes || 32,
            checkMultipleOffsets: options.checkMultipleOffsets !== false,
            // `??` rather than `||`: 0 is a meaningful value here.
            maxOffset: options.maxOffset ?? 12
        };
        this.magicBytesCache = new Map();
        this.extensionCache = new Map();
    }
    /**
     * Detect file type from a stream (PRIMARY METHOD - memory efficient)
     *
     * Peeks `options.peekBytes` bytes via peek.promise() and runs buffer
     * detection on them. On success the stream returned by peek.promise() is
     * attached to the result as `_stream`; per the original author's note the
     * caller should continue with that stream instead of the one passed in.
     *
     * NOTE(review): when nothing is detected, or an error is caught, null is
     * returned and the peek stream is not exposed to the caller.
     *
     * @param stream - Readable stream to detect from
     * @returns Detection result or null if not detected
     */
    async detectFromStream(stream) {
        try {
            const [buffer, peekStream] = await peek.promise(stream, this.options.peekBytes);
            const result = await this.detectFromBuffer(buffer);
            if (result) {
                result._stream = peekStream;
            }
            return result;
        } catch (error) {
            // Handle stream errors gracefully
            // Only log errors in non-test environments
            if (process.env.NODE_ENV !== 'test') {
                console.error('Error detecting file type from stream:', error);
            }
            return null;
        }
    }
    /**
     * Detect file type from an already-read buffer
     *
     * Tries magic-byte matching at each candidate offset. If nothing matches
     * and detectCharset() reports anything other than 'binary', a generic
     * text/plain result with confidence 0.5 is returned.
     *
     * @param buffer - Buffer to detect from
     * @returns Detection result or null if not detected
     */
    async detectFromBuffer(buffer) {
        if (!buffer || buffer.length === 0) {
            return null;
        }
        // Check multiple offsets if enabled
        const offsets = this.options.checkMultipleOffsets
            ? this.generateOffsets(buffer.length)
            : [0];
        for (const offset of offsets) {
            const fileType = this.checkMagicBytesAtOffset(buffer, offset);
            if (fileType) {
                return this.enhanceDetectionResult(fileType, buffer);
            }
        }
        // No magic bytes matched, but we can still detect charset for unknown files
        const charset = detectCharset(buffer);
        if (charset === 'binary') {
            return null;
        }
        // Return a generic text file result
        return {
            name: 'text',
            mimeType: 'text/plain',
            extensions: ['txt'],
            charset,
            contentType: 'text/plain',
            confidence: 0.5 // Lower confidence since we only detected charset
        };
    }
    /**
     * Detect file type from extension only
     *
     * The lookup result for each normalized extension is cached per instance.
     *
     * NOTE(review): the lookup goes through getFileTypeByExtension(), not
     * `this.fileTypes`, so types added or removed on this instance are
     * presumably not reflected here — confirm against the registry module.
     *
     * @param extension - File extension (with or without dot)
     * @returns Array of possible detection results with lower confidence
     */
    detectFromExtension(extension) {
        const cleanExt = normalizeExtension(extension);
        if (!this.extensionCache.has(cleanExt)) {
            this.extensionCache.set(cleanExt, getFileTypeByExtension(cleanExt));
        }
        const matches = this.extensionCache.get(cleanExt);
        return matches.map(fileType => ({
            name: fileType.name,
            mimeType: resolveMimeAlias(fileType.mimeType),
            extensions: fileType.extensions,
            charset: 'unknown', // Can't determine charset from extension alone
            contentType: getContentTypeByExtension(cleanExt) || fileType.mimeType,
            confidence: 0.8 // Lower confidence for extension-only detection
        }));
    }
    /**
     * Get all file types by category
     *
     * Delegates to getFileTypesByCategory(); `this.fileTypes` is not consulted.
     *
     * @param category - Category name (e.g., 'image', 'video', 'archive')
     * @returns Array of file type definitions
     */
    getByCategory(category) {
        return getFileTypesByCategory(category);
    }
    /**
     * Add a new file type dynamically
     *
     * The definition is appended, so existing types are checked before it.
     *
     * @param fileType - File type definition to add
     */
    addFileType(fileType) {
        this.fileTypes.push(fileType);
        // Clear caches when file types change
        this.clearCache();
    }
    /**
     * Remove a file type by name
     *
     * Only the first definition with a matching name is removed.
     *
     * @param name - Name of the file type to remove
     * @returns True if a definition was removed, false if none matched
     */
    removeFileType(name) {
        const index = this.fileTypes.findIndex(ft => ft.name === name);
        if (index === -1) {
            return false;
        }
        this.fileTypes.splice(index, 1);
        // Clear caches when file types change
        this.clearCache();
        return true;
    }
    /**
     * Clear all caches
     */
    clearCache() {
        this.magicBytesCache.clear();
        this.extensionCache.clear();
    }
    /**
     * Get all registered file types
     * @returns Shallow copy of this instance's file type definitions
     */
    getAllFileTypes() {
        return [...this.fileTypes];
    }
    /**
     * Check magic bytes at a specific offset
     *
     * Only definitions whose declared offset equals `offset` are compared; a
     * definition without an offset is treated as declaring offset 0 (the same
     * defaulting isFileType() applies).
     *
     * @private
     * @returns The first matching definition in registration order, or null
     */
    checkMagicBytesAtOffset(buffer, offset) {
        for (const fileType of this.fileTypes) {
            if ((fileType.offset ?? 0) !== offset) {
                continue;
            }
            if (compareBytes(buffer, fileType.magicBytes, offset)) {
                return fileType;
            }
        }
        return null;
    }
    /**
     * Generate offsets to check based on buffer size
     *
     * Produces 0, 4, 8, ... up to `options.maxOffset`, followed by any other
     * offset that a registered file type declares explicitly. Without the
     * second part a type registered at, say, offset 2 could never match,
     * because checkMagicBytesAtOffset() only compares a type at its own offset.
     * Declared offsets come last so the stride offsets keep their precedence.
     * Every returned offset is within `maxOffset` and inside the buffer.
     *
     * @private
     */
    generateOffsets(bufferLength) {
        const { maxOffset } = this.options;
        const offsets = [];
        for (let i = 0; i <= maxOffset && i < bufferLength; i += 4) {
            offsets.push(i);
        }
        const seen = new Set(offsets);
        for (const { offset } of this.fileTypes) {
            if (!Number.isInteger(offset) || offset < 0 || seen.has(offset)) {
                continue;
            }
            if (offset <= maxOffset && offset < bufferLength) {
                seen.add(offset);
                offsets.push(offset);
            }
        }
        return offsets;
    }
    /**
     * Enhance detection result with additional information
     *
     * Content type precedence: the charset-aware lookup for the primary
     * extension, then the definition's own contentType, then the plain
     * extension lookup, then the definition's mimeType.
     *
     * @private
     */
    enhanceDetectionResult(fileType, buffer) {
        // Use charset from file type definition if available, otherwise detect it
        const charset = fileType.charset || detectCharset(buffer);
        let contentType = fileType.contentType || fileType.mimeType;
        if (fileType.extensions.length > 0) {
            const [primaryExt] = fileType.extensions;
            const charsetAwareContentType = getContentTypeForExtension(primaryExt, charset);
            if (charsetAwareContentType) {
                contentType = charsetAwareContentType;
            } else if (!fileType.contentType) {
                // Fall back to regular content type lookup if no charset-specific match
                contentType = getContentTypeByExtension(primaryExt) || contentType;
            }
        }
        return {
            name: fileType.name,
            mimeType: resolveMimeAlias(fileType.mimeType),
            extensions: fileType.extensions,
            charset,
            contentType: resolveMimeAlias(contentType),
            confidence: 1.0 // High confidence for magic bytes detection
        };
    }
    /**
     * Detect file type with fallback to extension
     *
     * Content detection runs first. When it succeeds and the filename has an
     * extension, generic 'text' results and ZIP results are refined with the
     * content type registered for that extension. When it fails, the result is
     * built from the extension alone. Errors are logged (outside the 'test'
     * environment) and reported as null.
     *
     * @param input - Readable stream or Buffer
     * @param filename - Optional filename for extension fallback
     * @returns Detection result with attached stream for reuse (if input was stream)
     */
    async detectWithFallback(input, filename) {
        try {
            let buffer;
            let peekStream;
            // Handle both Buffer and Readable inputs
            if (Buffer.isBuffer(input)) {
                buffer = input;
            } else {
                [buffer, peekStream] = await peek.promise(input, this.options.peekBytes);
            }
            // Attach the peek stream for reuse if available
            const withStream = (result) => (peekStream ? { ...result, _stream: peekStream } : result);
            // Text after the last '.', or undefined when there is no filename or no dot.
            const lastDot = filename ? filename.lastIndexOf('.') : -1;
            const extension = lastDot === -1 ? undefined : filename.substring(lastDot + 1);
            // Try magic bytes detection first
            const magicResult = await this.detectFromBuffer(buffer);
            if (magicResult) {
                if (extension !== undefined) {
                    // Check for generic text files that might have specific content types
                    if (magicResult.name === 'text' && magicResult.charset) {
                        const contentType = getContentTypeForExtension(extension, magicResult.charset);
                        if (contentType) {
                            return withStream({
                                ...magicResult,
                                contentType,
                                confidence: 0.8 // Charset detection backed by the extension
                            });
                        }
                    }
                    // Check for ZIP files that might be Office Open XML or other specific formats
                    if (magicResult.name === 'zip' || magicResult.mimeType === 'application/zip') {
                        const contentType = getContentTypeForExtension(extension, magicResult.charset || 'binary');
                        if (contentType && contentType !== 'application/zip') {
                            return withStream({
                                ...magicResult,
                                contentType,
                                confidence: 0.9 // Container is certain, the specific format comes from the extension
                            });
                        }
                    }
                }
                return withStream(magicResult);
            }
            // Fallback to extension if filename provided
            if (extension === undefined) {
                return null;
            }
            const charset = detectCharset(buffer);
            // Try charset-aware content type lookup
            const contentType = getContentTypeForExtension(extension, charset);
            if (contentType) {
                const name = extension.toLowerCase();
                return withStream({
                    name,
                    mimeType: contentType,
                    extensions: [name],
                    charset,
                    contentType,
                    confidence: 0.7 // Higher confidence when charset matches
                });
            }
            // Fall back to regular extension detection. The charset-aware lookup
            // above already came back empty, so it is not repeated here.
            const [bestGuess] = this.detectFromExtension(extension);
            if (bestGuess) {
                return withStream({
                    ...bestGuess,
                    charset, // Use detected charset
                    confidence: 0.6 // Lower confidence for fallback
                });
            }
            // No detection possible
            return null;
        } catch (error) {
            // Only log errors in non-test environments
            if (process.env.NODE_ENV !== 'test') {
                console.error('Error in detectWithFallback:', error);
            }
            return null;
        }
    }
    /**
     * Check if a buffer matches a specific file type
     *
     * Looks the type up by name in this instance's list and compares its magic
     * bytes at the type's declared offset (0 when none is declared).
     *
     * @param buffer - Buffer to check
     * @param fileTypeName - Name of the file type to check against
     * @returns True if matches, false otherwise (including unknown type names)
     */
    isFileType(buffer, fileTypeName) {
        const fileType = this.fileTypes.find(ft => ft.name === fileTypeName);
        if (!fileType) {
            return false;
        }
        return compareBytes(buffer, fileType.magicBytes, fileType.offset ?? 0);
    }
    /**
     * Get statistics about registered file types
     *
     * Definitions without a category are counted under 'other'. The MIME
     * prefix is the part of mimeType before the first '/'.
     *
     * @returns Statistics object with totalTypes, byCategory and byMimePrefix
     */
    getStatistics() {
        const stats = {
            totalTypes: this.fileTypes.length,
            byCategory: {},
            byMimePrefix: {}
        };
        for (const fileType of this.fileTypes) {
            const category = fileType.category || 'other';
            stats.byCategory[category] = (stats.byCategory[category] || 0) + 1;
            const [mimePrefix] = fileType.mimeType.split('/');
            stats.byMimePrefix[mimePrefix] = (stats.byMimePrefix[mimePrefix] || 0) + 1;
        }
        return stats;
    }
}
|
|
363
|
+
/**
 * Shared detector instance, constructed with default options when this module
 * is loaded. The module-level convenience functions delegate to it.
 */
export const defaultDetector = new FileTypeDetector();
|
|
365
|
+
// Convenience functions using the default detector
/**
 * Run stream detection on the shared default detector.
 * @param stream - Readable stream to detect from
 * @returns Whatever defaultDetector.detectFromStream() resolves to
 */
export async function detectFromStream(stream) {
    const detection = await defaultDetector.detectFromStream(stream);
    return detection;
}
|
|
369
|
+
/**
 * Run buffer detection on the shared default detector.
 * @param buffer - Buffer to detect from
 * @returns Whatever defaultDetector.detectFromBuffer() resolves to
 */
export async function detectFromBuffer(buffer) {
    const detection = await defaultDetector.detectFromBuffer(buffer);
    return detection;
}
|
|
372
|
+
/**
 * Run extension-only detection on the shared default detector.
 * @param extension - File extension to look up
 * @returns Whatever defaultDetector.detectFromExtension() returns
 */
export function detectFromExtension(extension) {
    const candidates = defaultDetector.detectFromExtension(extension);
    return candidates;
}
|