npm - @file-type/pdf - Versions diffs - 0.1.1 → 0.2.0 - Mend

@file-type/pdf 0.1.1 → 0.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Files changed (4)

package/LICENSE.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 The MIT License (MIT)
-Copyright © 2025 Borewit
+Copyright © 2026 Borewit
 Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

package/README.md CHANGED Viewed

@@ -11,11 +11,10 @@ This plugin goes beyond simple magic-number detection and can inspect the intern
 structure to distinguish between generic PDF files and specific producer formats such as
 **Adobe Illustrator (.ai)**.
-The detector is designed to be:
-- tokenizer-based (using `strtok3`)
-- streaming-friendly
-- safe to chain with other `file-type` detectors
-- compatible with both Node.js and browser environments
+## Scope
+This detector is designed for well-formed PDF files and established PDF-based subtypes.
+Support for corrupted or non-conforming PDFs is intentionally limited and only considered when a deviation is both common and widely accepted.
 ## Installation

package/lib/index.js CHANGED Viewed

@@ -1,53 +1,16 @@
-import sax from "sax";
-import { PdfTokenizerReader } from "./PdfTokenizerReader.js";
+import sax from 'sax';
+import { PdfTokenizerReader } from './PdfTokenizerReader.js';
+import { textDecode } from '@borewit/text-codec';
 const OBJ_REGEX = /^\s*(\d+)\s+(\d+)\s+obj\b/;
 const PDF_TYPE = Object.freeze({ ext: "pdf", mime: "application/pdf" });
 const AI_TYPE = Object.freeze({ ext: "ai", mime: "application/illustrator" });
-const encoder = new TextEncoder();
-const utf8Decoder = new TextDecoder("utf-8");
-function indexOfBytes(hay, needle) {
-    if (needle.length === 0)
-        return 0;
-    outer: for (let i = 0; i <= hay.length - needle.length; i++) {
-        for (let j = 0; j < needle.length; j++) {
-            if (hay[i + j] !== needle[j])
-                continue outer;
-        }
-        return i;
-    }
-    return -1;
-}
-async function peekPdfHeader(tokenizer) {
-    const buf = new Uint8Array(1024);
-    let n = 0;
-    try {
-        const opts = { length: buf.length, mayBeLess: true };
-        n = await tokenizer.peekBuffer(buf, opts);
-    }
-    catch {
-        return { isPdf: false, headerOffset: -1 };
-    }
-    if (!n)
-        return { isPdf: false, headerOffset: -1 };
-    const hay = buf.subarray(0, n);
-    const idx = indexOfBytes(hay, encoder.encode("%PDF-"));
-    if (idx === -1)
-        return { isPdf: false, headerOffset: -1 };
-    return { isPdf: true, headerOffset: idx };
-}
-async function skipBytes(tokenizer, n) {
-    if (n <= 0)
-        return;
-    const tmp = new Uint8Array(Math.min(64 * 1024, n));
-    let left = n;
-    while (left > 0) {
-        const len = Math.min(tmp.length, left);
-        const opts = { length: len };
-        const read = await tokenizer.readBuffer(tmp, opts);
-        if (!read)
-            throw new Error("Unexpected EOF while skipping bytes");
-        left -= read;
-    }
+/**
+ * Peeks the tokenizer, and returns true if magic signature is found.
+ */
+async function peekIsPdfHeader(tokenizer) {
+    const rawSignature = new Uint8Array(5);
+    return await tokenizer.peekBuffer(rawSignature, { mayBeLess: true }) === 5
+        && textDecode(rawSignature, 'ascii') === '%PDF-';
 }
 function parseDictFromRaw(raw) {
     const dictRegex = /\/(\w+)(?:\s+([^/>\n\r]+))?/g;
@@ -55,8 +18,7 @@ function parseDictFromRaw(raw) {
     let match = dictRegex.exec(raw);
     while (match !== null) {
         const key = match[1];
-        const value = match[2] ? match[2].trim() : true;
-        info[key] = value;
+        info[key] = match[2] ? match[2].trim() : true;
         match = dictRegex.exec(raw);
     }
     return info;
@@ -210,13 +172,10 @@ async function _detectPdf(tokenizer, opts = {}) {
     };
     const ctx = { debug, log };
     // NOT PDF => PEEK ONLY, do not advance
-    const { isPdf, headerOffset } = await peekPdfHeader(tokenizer);
-    if (!isPdf)
+    if (!await peekIsPdfHeader(tokenizer))
         return undefined;
     // Confirmed PDF => ok to advance
-    log(`[PDF] Detected %PDF- header at +${headerOffset} (abs=${tokenizer.position + headerOffset})`);
-    if (headerOffset > 0)
-        await skipBytes(tokenizer, headerOffset);
+    log(`[PDF] Detected %PDF- header at abs=${tokenizer.position}`);
     const reader = new PdfTokenizerReader(tokenizer, { debug });
     // pushback so we don't lose a line when probing for "stream"
     let pendingLine = null;
@@ -296,7 +255,7 @@ async function _detectPdf(tokenizer, opts = {}) {
             if (!rawBytes)
                 break;
             const decodedBytes = await decodeStreamBytes(objectInfo, rawBytes);
-            const streamText = utf8Decoder.decode(decodedBytes);
+            const streamText = textDecode(decodedBytes, 'utf-8');
             // Stream probes
             for (const probe of subtypeProbes) {
                 if (!probe.onStreamText)

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@file-type/pdf",
-	"version": "0.1.1",
+	"version": "0.2.0",
 	"description": "file-type plugin to parse PDF files",
 	"type": "module",
 	"exports": {
@@ -45,6 +45,7 @@
 	},
 	"homepage": "https://github.com/Borewit/file-type-pdf#readme",
 	"dependencies": {
+		"@borewit/text-codec": "^0.2.1",
 		"read-next-line": "^0.5.0",
 		"sax": "^1.4.1"
 	},