@file-type/pdf 0.1.1 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE.txt CHANGED
@@ -1,6 +1,6 @@
1
1
  The MIT License (MIT)
2
2
 
3
- Copyright © 2025 Borewit
3
+ Copyright © 2026 Borewit
4
4
 
5
5
  Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
6
 
package/README.md CHANGED
@@ -11,11 +11,10 @@ This plugin goes beyond simple magic-number detection and can inspect the intern
11
11
  structure to distinguish between generic PDF files and specific producer formats such as
12
12
  **Adobe Illustrator (.ai)**.
13
13
 
14
- The detector is designed to be:
15
- - tokenizer-based (using `strtok3`)
16
- - streaming-friendly
17
- - safe to chain with other `file-type` detectors
18
- - compatible with both Node.js and browser environments
14
+ ## Scope
15
+
16
+ This detector is designed for well-formed PDF files and established PDF-based subtypes.
17
+ Support for corrupted or non-conforming PDFs is intentionally limited and is only considered when a deviation is both common and widely accepted.
19
18
 
20
19
  ## Installation
21
20
 
package/lib/index.js CHANGED
@@ -1,53 +1,16 @@
1
- import sax from "sax";
2
- import { PdfTokenizerReader } from "./PdfTokenizerReader.js";
1
+ import sax from 'sax';
2
+ import { PdfTokenizerReader } from './PdfTokenizerReader.js';
3
+ import { textDecode } from '@borewit/text-codec';
3
4
  const OBJ_REGEX = /^\s*(\d+)\s+(\d+)\s+obj\b/;
4
5
  const PDF_TYPE = Object.freeze({ ext: "pdf", mime: "application/pdf" });
5
6
  const AI_TYPE = Object.freeze({ ext: "ai", mime: "application/illustrator" });
6
- const encoder = new TextEncoder();
7
- const utf8Decoder = new TextDecoder("utf-8");
8
- function indexOfBytes(hay, needle) {
9
- if (needle.length === 0)
10
- return 0;
11
- outer: for (let i = 0; i <= hay.length - needle.length; i++) {
12
- for (let j = 0; j < needle.length; j++) {
13
- if (hay[i + j] !== needle[j])
14
- continue outer;
15
- }
16
- return i;
17
- }
18
- return -1;
19
- }
20
- async function peekPdfHeader(tokenizer) {
21
- const buf = new Uint8Array(1024);
22
- let n = 0;
23
- try {
24
- const opts = { length: buf.length, mayBeLess: true };
25
- n = await tokenizer.peekBuffer(buf, opts);
26
- }
27
- catch {
28
- return { isPdf: false, headerOffset: -1 };
29
- }
30
- if (!n)
31
- return { isPdf: false, headerOffset: -1 };
32
- const hay = buf.subarray(0, n);
33
- const idx = indexOfBytes(hay, encoder.encode("%PDF-"));
34
- if (idx === -1)
35
- return { isPdf: false, headerOffset: -1 };
36
- return { isPdf: true, headerOffset: idx };
37
- }
38
- async function skipBytes(tokenizer, n) {
39
- if (n <= 0)
40
- return;
41
- const tmp = new Uint8Array(Math.min(64 * 1024, n));
42
- let left = n;
43
- while (left > 0) {
44
- const len = Math.min(tmp.length, left);
45
- const opts = { length: len };
46
- const read = await tokenizer.readBuffer(tmp, opts);
47
- if (!read)
48
- throw new Error("Unexpected EOF while skipping bytes");
49
- left -= read;
50
- }
7
+ /**
8
+ * Peeks the tokenizer without advancing it and returns true if the next 5 bytes are the `%PDF-` magic signature.
9
+ */
10
+ async function peekIsPdfHeader(tokenizer) {
11
+ const rawSignature = new Uint8Array(5);
12
+ return await tokenizer.peekBuffer(rawSignature, { mayBeLess: true }) === 5
13
+ && textDecode(rawSignature, 'ascii') === '%PDF-';
51
14
  }
52
15
  function parseDictFromRaw(raw) {
53
16
  const dictRegex = /\/(\w+)(?:\s+([^/>\n\r]+))?/g;
@@ -55,8 +18,7 @@ function parseDictFromRaw(raw) {
55
18
  let match = dictRegex.exec(raw);
56
19
  while (match !== null) {
57
20
  const key = match[1];
58
- const value = match[2] ? match[2].trim() : true;
59
- info[key] = value;
21
+ info[key] = match[2] ? match[2].trim() : true;
60
22
  match = dictRegex.exec(raw);
61
23
  }
62
24
  return info;
@@ -210,13 +172,10 @@ async function _detectPdf(tokenizer, opts = {}) {
210
172
  };
211
173
  const ctx = { debug, log };
212
174
  // NOT PDF => PEEK ONLY, do not advance
213
- const { isPdf, headerOffset } = await peekPdfHeader(tokenizer);
214
- if (!isPdf)
175
+ if (!await peekIsPdfHeader(tokenizer))
215
176
  return undefined;
216
177
  // Confirmed PDF => ok to advance
217
- log(`[PDF] Detected %PDF- header at +${headerOffset} (abs=${tokenizer.position + headerOffset})`);
218
- if (headerOffset > 0)
219
- await skipBytes(tokenizer, headerOffset);
178
+ log(`[PDF] Detected %PDF- header at abs=${tokenizer.position}`);
220
179
  const reader = new PdfTokenizerReader(tokenizer, { debug });
221
180
  // pushback so we don't lose a line when probing for "stream"
222
181
  let pendingLine = null;
@@ -296,7 +255,7 @@ async function _detectPdf(tokenizer, opts = {}) {
296
255
  if (!rawBytes)
297
256
  break;
298
257
  const decodedBytes = await decodeStreamBytes(objectInfo, rawBytes);
299
- const streamText = utf8Decoder.decode(decodedBytes);
258
+ const streamText = textDecode(decodedBytes, 'utf-8');
300
259
  // Stream probes
301
260
  for (const probe of subtypeProbes) {
302
261
  if (!probe.onStreamText)
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@file-type/pdf",
3
- "version": "0.1.1",
3
+ "version": "0.2.0",
4
4
  "description": "file-type plugin to parse PDF files",
5
5
  "type": "module",
6
6
  "exports": {
@@ -45,6 +45,7 @@
45
45
  },
46
46
  "homepage": "https://github.com/Borewit/file-type-pdf#readme",
47
47
  "dependencies": {
48
+ "@borewit/text-codec": "^0.2.1",
48
49
  "read-next-line": "^0.5.0",
49
50
  "sax": "^1.4.1"
50
51
  },