@file-type/pdf 0.1.1 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.txt +1 -1
- package/README.md +4 -5
- package/lib/index.js +14 -55
- package/package.json +2 -1
package/LICENSE.txt
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
The MIT License (MIT)
|
|
2
2
|
|
|
3
|
-
Copyright ©
|
|
3
|
+
Copyright © 2026 Borewit
|
|
4
4
|
|
|
5
5
|
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
6
|
|
package/README.md
CHANGED
|
@@ -11,11 +11,10 @@ This plugin goes beyond simple magic-number detection and can inspect the intern
|
|
|
11
11
|
structure to distinguish between generic PDF files and specific producer formats such as
|
|
12
12
|
**Adobe Illustrator (.ai)**.
|
|
13
13
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
-
|
|
17
|
-
-
|
|
18
|
-
- compatible with both Node.js and browser environments
|
|
14
|
+
## Scope
|
|
15
|
+
|
|
16
|
+
This detector is designed for well-formed PDF files and established PDF-based subtypes.
|
|
17
|
+
Support for corrupted or non-conforming PDFs is intentionally limited and only considered when a deviation is both common and widely accepted.
|
|
19
18
|
|
|
20
19
|
## Installation
|
|
21
20
|
|
package/lib/index.js
CHANGED
|
@@ -1,53 +1,16 @@
|
|
|
1
|
-
import sax from "sax";
|
|
2
|
-
import { PdfTokenizerReader } from "./PdfTokenizerReader.js";
|
|
1
|
+
import sax from 'sax';
|
|
2
|
+
import { PdfTokenizerReader } from './PdfTokenizerReader.js';
|
|
3
|
+
import { textDecode } from '@borewit/text-codec';
|
|
3
4
|
const OBJ_REGEX = /^\s*(\d+)\s+(\d+)\s+obj\b/;
|
|
4
5
|
const PDF_TYPE = Object.freeze({ ext: "pdf", mime: "application/pdf" });
|
|
5
6
|
const AI_TYPE = Object.freeze({ ext: "ai", mime: "application/illustrator" });
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
if (hay[i + j] !== needle[j])
|
|
14
|
-
continue outer;
|
|
15
|
-
}
|
|
16
|
-
return i;
|
|
17
|
-
}
|
|
18
|
-
return -1;
|
|
19
|
-
}
|
|
20
|
-
async function peekPdfHeader(tokenizer) {
|
|
21
|
-
const buf = new Uint8Array(1024);
|
|
22
|
-
let n = 0;
|
|
23
|
-
try {
|
|
24
|
-
const opts = { length: buf.length, mayBeLess: true };
|
|
25
|
-
n = await tokenizer.peekBuffer(buf, opts);
|
|
26
|
-
}
|
|
27
|
-
catch {
|
|
28
|
-
return { isPdf: false, headerOffset: -1 };
|
|
29
|
-
}
|
|
30
|
-
if (!n)
|
|
31
|
-
return { isPdf: false, headerOffset: -1 };
|
|
32
|
-
const hay = buf.subarray(0, n);
|
|
33
|
-
const idx = indexOfBytes(hay, encoder.encode("%PDF-"));
|
|
34
|
-
if (idx === -1)
|
|
35
|
-
return { isPdf: false, headerOffset: -1 };
|
|
36
|
-
return { isPdf: true, headerOffset: idx };
|
|
37
|
-
}
|
|
38
|
-
async function skipBytes(tokenizer, n) {
|
|
39
|
-
if (n <= 0)
|
|
40
|
-
return;
|
|
41
|
-
const tmp = new Uint8Array(Math.min(64 * 1024, n));
|
|
42
|
-
let left = n;
|
|
43
|
-
while (left > 0) {
|
|
44
|
-
const len = Math.min(tmp.length, left);
|
|
45
|
-
const opts = { length: len };
|
|
46
|
-
const read = await tokenizer.readBuffer(tmp, opts);
|
|
47
|
-
if (!read)
|
|
48
|
-
throw new Error("Unexpected EOF while skipping bytes");
|
|
49
|
-
left -= read;
|
|
50
|
-
}
|
|
7
|
+
/**
|
|
8
|
+
* Peeks the tokenizer, and returns true if magic signature is found.
|
|
9
|
+
*/
|
|
10
|
+
async function peekIsPdfHeader(tokenizer) {
|
|
11
|
+
const rawSignature = new Uint8Array(5);
|
|
12
|
+
return await tokenizer.peekBuffer(rawSignature, { mayBeLess: true }) === 5
|
|
13
|
+
&& textDecode(rawSignature, 'ascii') === '%PDF-';
|
|
51
14
|
}
|
|
52
15
|
function parseDictFromRaw(raw) {
|
|
53
16
|
const dictRegex = /\/(\w+)(?:\s+([^/>\n\r]+))?/g;
|
|
@@ -55,8 +18,7 @@ function parseDictFromRaw(raw) {
|
|
|
55
18
|
let match = dictRegex.exec(raw);
|
|
56
19
|
while (match !== null) {
|
|
57
20
|
const key = match[1];
|
|
58
|
-
|
|
59
|
-
info[key] = value;
|
|
21
|
+
info[key] = match[2] ? match[2].trim() : true;
|
|
60
22
|
match = dictRegex.exec(raw);
|
|
61
23
|
}
|
|
62
24
|
return info;
|
|
@@ -210,13 +172,10 @@ async function _detectPdf(tokenizer, opts = {}) {
|
|
|
210
172
|
};
|
|
211
173
|
const ctx = { debug, log };
|
|
212
174
|
// NOT PDF => PEEK ONLY, do not advance
|
|
213
|
-
|
|
214
|
-
if (!isPdf)
|
|
175
|
+
if (!await peekIsPdfHeader(tokenizer))
|
|
215
176
|
return undefined;
|
|
216
177
|
// Confirmed PDF => ok to advance
|
|
217
|
-
log(`[PDF] Detected %PDF- header at
|
|
218
|
-
if (headerOffset > 0)
|
|
219
|
-
await skipBytes(tokenizer, headerOffset);
|
|
178
|
+
log(`[PDF] Detected %PDF- header at abs=${tokenizer.position}`);
|
|
220
179
|
const reader = new PdfTokenizerReader(tokenizer, { debug });
|
|
221
180
|
// pushback so we don't lose a line when probing for "stream"
|
|
222
181
|
let pendingLine = null;
|
|
@@ -296,7 +255,7 @@ async function _detectPdf(tokenizer, opts = {}) {
|
|
|
296
255
|
if (!rawBytes)
|
|
297
256
|
break;
|
|
298
257
|
const decodedBytes = await decodeStreamBytes(objectInfo, rawBytes);
|
|
299
|
-
const streamText =
|
|
258
|
+
const streamText = textDecode(decodedBytes, 'utf-8');
|
|
300
259
|
// Stream probes
|
|
301
260
|
for (const probe of subtypeProbes) {
|
|
302
261
|
if (!probe.onStreamText)
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@file-type/pdf",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.2.0",
|
|
4
4
|
"description": "file-type plugin to parse PDF files",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
@@ -45,6 +45,7 @@
|
|
|
45
45
|
},
|
|
46
46
|
"homepage": "https://github.com/Borewit/file-type-pdf#readme",
|
|
47
47
|
"dependencies": {
|
|
48
|
+
"@borewit/text-codec": "^0.2.1",
|
|
48
49
|
"read-next-line": "^0.5.0",
|
|
49
50
|
"sax": "^1.4.1"
|
|
50
51
|
},
|