@file-type/pdf 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.txt +9 -0
- package/README.md +50 -0
- package/lib/PdfTokenizerReader.d.ts +36 -0
- package/lib/PdfTokenizerReader.js +145 -0
- package/lib/index.d.ts +2 -0
- package/lib/index.js +348 -0
- package/package.json +52 -0
package/LICENSE.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
The MIT License (MIT)
|
|
2
|
+
|
|
3
|
+
Copyright © 2025 Borewit
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[](https://npmjs.org/package/@file-type/pdf)
|
|
2
|
+
[](https://github.com/Borewit/file-type-pdf/actions/workflows/nodejs-ci.yml)
|
|
3
|
+
[](https://npmcharts.com/compare/@file-type/pdf?start=365)
|
|
4
|
+
|
|
5
|
+
# @file-type/pdf
|
|
6
|
+
|
|
7
|
+
Detector plugin for [file-type](https://github.com/sindresorhus/file-type) that identifies
|
|
8
|
+
[PDF (Portable Document Format)](https://en.wikipedia.org/wiki/PDF) files and selected PDF-based subtypes.
|
|
9
|
+
|
|
10
|
+
This plugin goes beyond simple magic-number detection and can inspect the internal PDF
|
|
11
|
+
structure to distinguish between generic PDF files and specific producer formats such as
|
|
12
|
+
**Adobe Illustrator (.ai)**.
|
|
13
|
+
|
|
14
|
+
The detector is designed to be:
|
|
15
|
+
- tokenizer-based (using `strtok3`)
|
|
16
|
+
- streaming-friendly
|
|
17
|
+
- safe to chain with other `file-type` detectors
|
|
18
|
+
- compatible with both Node.js and browser environments
|
|
19
|
+
|
|
20
|
+
## Installation
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
npm install @file-type/pdf
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
## Usage
|
|
27
|
+
|
|
28
|
+
The following example shows how to add the PDF detector to [file-type](https://github.com/sindresorhus/file-type):
|
|
29
|
+
|
|
30
|
+
```js
|
|
31
|
+
import { FileTypeParser } from 'file-type';
|
|
32
|
+
import { detectPdf } from '@file-type/pdf';
|
|
33
|
+
|
|
34
|
+
const parser = new FileTypeParser({
|
|
35
|
+
customDetectors: [detectPdf],
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
const fileType = await parser.fromFile('example.pdf');
|
|
39
|
+
console.log(fileType);
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## Supported file formats
|
|
43
|
+
|
|
44
|
+
- `.ai` / `application/illustrator`: Adobe Illustrator
|
|
45
|
+
- `.pdf` / `application/pdf`: Generic Portable Document Format files
|
|
46
|
+
|
|
47
|
+
## License
|
|
48
|
+
|
|
49
|
+
This project is licensed under the [MIT License](LICENSE.txt).
|
|
50
|
+
Feel free to use, modify, and distribute it as needed.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import type { ITokenizer } from "strtok3";
|
|
2
|
+
export type PdfTokenizerReaderOptions = {
|
|
3
|
+
chunkSize?: number;
|
|
4
|
+
debug?: boolean;
|
|
5
|
+
};
|
|
6
|
+
export declare class PdfTokenizerReader {
|
|
7
|
+
private tokenizer;
|
|
8
|
+
private buf;
|
|
9
|
+
private pos;
|
|
10
|
+
private chunkSize;
|
|
11
|
+
private eof;
|
|
12
|
+
private debug;
|
|
13
|
+
constructor(tokenizer: ITokenizer, opts?: PdfTokenizerReaderOptions);
|
|
14
|
+
private log;
|
|
15
|
+
/**
|
|
16
|
+
* Logical file position of the next byte that will be consumed by the reader.
|
|
17
|
+
*/
|
|
18
|
+
getPosition(): number;
|
|
19
|
+
private peekMayBeLess;
|
|
20
|
+
private readMayBeLess;
|
|
21
|
+
private compactBuffer;
|
|
22
|
+
private fill;
|
|
23
|
+
/**
|
|
24
|
+
* Reads a line terminated by '\n' (supports '\r\n').
|
|
25
|
+
* Returns the line (latin1) without line ending, or null at EOF.
|
|
26
|
+
*/
|
|
27
|
+
readLine(): Promise<string | null>;
|
|
28
|
+
/**
|
|
29
|
+
* Reads exactly n bytes, or returns null if EOF occurs before n bytes are available.
|
|
30
|
+
*/
|
|
31
|
+
readBytes(n: number): Promise<Buffer | null>;
|
|
32
|
+
/**
|
|
33
|
+
* Consume exactly one EOL after the 'stream' keyword if present.
|
|
34
|
+
*/
|
|
35
|
+
consumeStreamEol(): Promise<void>;
|
|
36
|
+
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
/**
 * Buffered line/byte reader layered on top of a strtok3 tokenizer.
 *
 * Maintains an internal Buffer (`buf`) plus a cursor (`pos`) into it.
 * Data is pulled from the tokenizer in chunks of `chunkSize` bytes using a
 * peek-then-read pattern, and End-Of-Stream errors from the tokenizer are
 * converted into a sticky `eof` flag rather than thrown to callers.
 */
export class PdfTokenizerReader {
    /**
     * @param tokenizer strtok3 ITokenizer supplying the underlying bytes.
     * @param opts.chunkSize bytes fetched per refill (default 64 KiB).
     * @param opts.debug when true, logs reader events to the console.
     */
    constructor(tokenizer, opts = {}) {
        this.buf = Buffer.alloc(0);
        this.pos = 0;
        this.eof = false;
        this.tokenizer = tokenizer;
        this.chunkSize = opts.chunkSize ?? 64 * 1024;
        this.debug = !!opts.debug;
    }
    // Debug-gated console logger.
    log(msg) {
        if (this.debug)
            console.log(msg);
    }
    /**
     * Logical file position of the next byte that will be consumed by the reader.
     */
    getPosition() {
        // The tokenizer is ahead of the consumer by however many buffered
        // bytes have not been handed out yet.
        const bufferedRemaining = this.buf.length - this.pos;
        return this.tokenizer.position - bufferedRemaining;
    }
    // peekBuffer wrapper: returns 0 instead of throwing on End-Of-Stream.
    async peekMayBeLess(target, length) {
        const opts = { length, mayBeLess: true };
        try {
            return await this.tokenizer.peekBuffer(target, opts);
        }
        catch (e) {
            if (isEndOfStreamError(e))
                return 0;
            throw e;
        }
    }
    // readBuffer wrapper: returns 0 instead of throwing on End-Of-Stream.
    async readMayBeLess(target, length) {
        const opts = { length, mayBeLess: true };
        try {
            return await this.tokenizer.readBuffer(target, opts);
        }
        catch (e) {
            if (isEndOfStreamError(e))
                return 0;
            throw e;
        }
    }
    // Drop already-consumed bytes so the buffer does not grow unboundedly.
    compactBuffer() {
        if (this.pos > 0) {
            this.buf = this.buf.subarray(this.pos);
            this.pos = 0;
        }
    }
    // Refill the buffer until at least `minBytes` unconsumed bytes are
    // available, or EOF is reached (sets `this.eof`).
    async fill(minBytes = 1) {
        if (this.eof)
            return;
        while (!this.eof && (this.buf.length - this.pos) < minBytes) {
            this.compactBuffer();
            // Peek first, then read exactly what we peeked
            const peekBuf = Buffer.alloc(this.chunkSize);
            const peeked = await this.peekMayBeLess(peekBuf, peekBuf.length);
            if (!peeked) {
                this.eof = true;
                this.log(`[READER] EOF @${this.getPosition()} (peekBuffer returned 0)`);
                break;
            }
            const readBuf = Buffer.alloc(peeked);
            const read = await this.readMayBeLess(readBuf, readBuf.length);
            if (!read) {
                this.eof = true;
                this.log(`[READER] EOF @${this.getPosition()} (readBuffer returned 0)`);
                break;
            }
            const slice = readBuf.subarray(0, read);
            // Avoid a concat when the buffer was fully drained.
            this.buf = this.buf.length ? Buffer.concat([this.buf, slice]) : slice;
        }
    }
    /**
     * Reads a line terminated by '\n' (supports '\r\n').
     * Returns the line (latin1) without line ending, or null at EOF.
     */
    async readLine() {
        while (true) {
            const idx = this.buf.indexOf(0x0a, this.pos); // '\n'
            if (idx !== -1) {
                let lineBuf = this.buf.subarray(this.pos, idx);
                if (lineBuf.length && lineBuf[lineBuf.length - 1] === 0x0d) {
                    lineBuf = lineBuf.subarray(0, lineBuf.length - 1); // drop '\r'
                }
                this.pos = idx + 1;
                return lineBuf.toString("latin1");
            }
            // No terminator buffered yet: try to pull at least one more byte.
            const before = this.buf.length - this.pos;
            await this.fill(before + 1);
            const after = this.buf.length - this.pos;
            if (after === before && this.eof) {
                // Nothing more will arrive: emit the unterminated tail (if any).
                if (before === 0)
                    return null;
                const tail = this.buf.subarray(this.pos);
                this.pos = this.buf.length;
                return tail.toString("latin1");
            }
        }
    }
    /**
     * Reads exactly n bytes, or returns null if EOF occurs before n bytes are available.
     * Note: the returned Buffer is a view into the internal buffer, not a copy.
     */
    async readBytes(n) {
        if (n < 0)
            throw new Error("readBytes(n): n must be >= 0");
        if (n === 0)
            return Buffer.alloc(0);
        await this.fill(n);
        const avail = this.buf.length - this.pos;
        if (avail < n)
            return null;
        const out = this.buf.subarray(this.pos, this.pos + n);
        this.pos += n;
        return out;
    }
    /**
     * Consume exactly one EOL after the 'stream' keyword if present.
     */
    async consumeStreamEol() {
        await this.fill(1);
        const avail = this.buf.length - this.pos;
        if (avail <= 0)
            return;
        const b0 = this.buf[this.pos];
        if (b0 === 0x0d) {
            // A '\r' may be followed by '\n'; make sure both bytes are buffered
            // before deciding how much to consume.
            await this.fill(2);
            const avail2 = this.buf.length - this.pos;
            if (avail2 >= 2 && this.buf[this.pos + 1] === 0x0a)
                this.pos += 2; // \r\n
            else
                this.pos += 1; // \r
        }
        else if (b0 === 0x0a) {
            this.pos += 1; // \n
        }
    }
}
|
|
138
|
+
/**
 * Heuristically recognizes a strtok3 End-Of-Stream error without importing
 * the error class: matches either the error name "EndOfStreamError" or a
 * message containing "End-Of-Stream".
 */
function isEndOfStreamError(e) {
    if (e === null || typeof e !== "object") {
        return false;
    }
    const { name, message } = e;
    if (typeof name === "string" && name === "EndOfStreamError") {
        return true;
    }
    return typeof message === "string" && message.includes("End-Of-Stream");
}
|
package/lib/index.d.ts
ADDED
package/lib/index.js
ADDED
|
@@ -0,0 +1,348 @@
|
|
|
1
|
+
import sax from "sax";
|
|
2
|
+
import { PdfTokenizerReader } from "./PdfTokenizerReader.js";
|
|
3
|
+
// Matches the header of an indirect PDF object: "<objNum> <genNum> obj".
// Capture groups: [1] object number, [2] generation number.
const OBJ_REGEX = /^\s*(\d+)\s+(\d+)\s+obj\b/;
// Detection results handed back to file-type; frozen so probe callbacks
// cannot accidentally mutate the shared result objects.
const PDF_TYPE = Object.freeze({ ext: "pdf", mime: "application/pdf" });
const AI_TYPE = Object.freeze({ ext: "ai", mime: "application/illustrator" });
// Shared text codecs, created once and reused across detection runs.
const encoder = new TextEncoder();
const utf8Decoder = new TextDecoder("utf-8");
|
|
8
|
+
/**
 * Returns the index of the first occurrence of `needle` inside `hay`
 * (byte-wise comparison over Uint8Array-like inputs), or -1 when absent.
 * An empty needle matches at index 0.
 */
function indexOfBytes(hay, needle) {
    if (needle.length === 0) {
        return 0;
    }
    const lastStart = hay.length - needle.length;
    for (let start = 0; start <= lastStart; start++) {
        let matched = true;
        for (let k = 0; k < needle.length; k++) {
            if (hay[start + k] !== needle[k]) {
                matched = false;
                break;
            }
        }
        if (matched) {
            return start;
        }
    }
    return -1;
}
|
|
20
|
+
/**
 * Peeks (never consumes) up to 1 KiB from the tokenizer and scans for the
 * "%PDF-" magic marker.
 *
 * @returns `{ isPdf, headerOffset }` where `headerOffset` is the offset of
 *          the marker within the peeked window, or -1 when not found / on
 *          any peek failure (including an empty stream).
 */
async function peekPdfHeader(tokenizer) {
    const NOT_PDF = () => ({ isPdf: false, headerOffset: -1 });
    const window = new Uint8Array(1024);
    let peeked;
    try {
        peeked = await tokenizer.peekBuffer(window, { length: window.length, mayBeLess: true });
    }
    catch {
        // Any tokenizer failure here is treated as "not a PDF".
        return NOT_PDF();
    }
    if (!peeked) {
        return NOT_PDF();
    }
    const offset = indexOfBytes(window.subarray(0, peeked), encoder.encode("%PDF-"));
    return offset === -1 ? NOT_PDF() : { isPdf: true, headerOffset: offset };
}
|
|
38
|
+
/**
 * Consumes and discards exactly `n` bytes from the tokenizer, reading in
 * chunks of at most 64 KiB. No-op for n <= 0.
 *
 * @throws Error when the stream ends before `n` bytes could be skipped.
 */
async function skipBytes(tokenizer, n) {
    if (n <= 0) {
        return;
    }
    const scratch = new Uint8Array(Math.min(64 * 1024, n));
    for (let remaining = n; remaining > 0;) {
        const want = Math.min(scratch.length, remaining);
        const got = await tokenizer.readBuffer(scratch, { length: want });
        if (!got) {
            throw new Error("Unexpected EOF while skipping bytes");
        }
        remaining -= got;
    }
}
|
|
52
|
+
/**
 * Extracts key/value pairs from the textual body of a PDF dictionary.
 *
 * Each `/Name` token becomes a key; when followed by inline text (up to the
 * next '/', '>' or line break) that text (trimmed) is the value, otherwise
 * the value is the boolean `true` (bare flag / name-only entry).
 */
function parseDictFromRaw(raw) {
    const entryPattern = /\/(\w+)(?:\s+([^/>\n\r]+))?/g;
    const info = {};
    for (const m of raw.matchAll(entryPattern)) {
        info[m[1]] = m[2] ? m[2].trim() : true;
    }
    return info;
}
|
|
64
|
+
/**
 * Normalizes a dictionary /Filter value into a de-duplicated list of the
 * known stream filter names. Returns [] for missing values or bare flags
 * (value `true`), and ignores any unrecognized filter names.
 */
function normalizeFilters(filterValue) {
    if (!filterValue || filterValue === true) {
        return [];
    }
    const found = String(filterValue).match(/FlateDecode|ASCII85Decode|LZWDecode|RunLengthDecode/g) ?? [];
    return Array.from(new Set(found));
}
|
|
70
|
+
/**
 * Inflates FlateDecode stream data. Tries the zlib-wrapped "deflate"
 * format first; on failure retries as headerless "deflate-raw", since some
 * PDF producers omit the zlib wrapper.
 */
async function inflateFlateDecode(bytes) {
    try {
        return await inflateWithFormat("deflate", bytes);
    }
    catch {
        return await inflateWithFormat("deflate-raw", bytes);
    }
}
|
|
78
|
+
/**
 * Decompresses `data` with the Web Streams DecompressionStream using the
 * given format ("deflate" or "deflate-raw") and returns the inflated bytes.
 *
 * The input is first copied into a fresh ArrayBuffer-backed Uint8Array so
 * views over SharedArrayBuffer (or offset views) are handled uniformly.
 */
async function inflateWithFormat(format, data) {
    const chunk = new Uint8Array(data.byteLength);
    chunk.set(data);
    const decompressor = new DecompressionStream(format);
    // Single-chunk source stream feeding the decompressor.
    const source = new ReadableStream({
        start(controller) {
            controller.enqueue(chunk);
            controller.close();
        },
    });
    // Response provides a convenient way to collect the whole output.
    const inflated = await new Response(source.pipeThrough(decompressor)).arrayBuffer();
    return new Uint8Array(inflated);
}
|
|
94
|
+
/**
 * Applies the object's declared filters to raw stream bytes.
 *
 * Only FlateDecode is actually implemented; when no filters are declared,
 * or any unsupported filter appears in the chain, the raw bytes are
 * returned unchanged.
 */
async function decodeStreamBytes(objectInfo, rawBytes) {
    const filters = normalizeFilters(objectInfo.Filter);
    if (filters.length === 0) {
        return rawBytes;
    }
    let decoded = rawBytes;
    for (const filter of filters) {
        if (filter !== "FlateDecode") {
            // Unsupported filter: bail out with the undecoded stream.
            return rawBytes;
        }
        decoded = await inflateFlateDecode(decoded);
    }
    return decoded;
}
|
|
110
|
+
/**
 * Collects the textual body of a PDF dictionary, starting from `firstLine`
 * and pulling further lines from `reader` until the '>>' terminator (or EOF).
 *
 * @returns `{ dictText, streamInline }` — `dictText` is the trimmed text
 *          between '<<' and '>>' (or null if no complete dictionary was
 *          found); `streamInline` is true when the 'stream' keyword follows
 *          '>>' on the same collected text.
 */
async function readDictionaryBlock(reader, firstLine) {
    let collected = firstLine;
    while (!collected.includes(">>")) {
        const line = await reader.readLine();
        if (line === null) {
            break;
        }
        collected = `${collected}\n${line}`;
    }
    const open = collected.indexOf("<<");
    const close = collected.indexOf(">>", open + 2);
    if (open === -1 || close === -1) {
        return { dictText: null, streamInline: false };
    }
    const dictText = collected.slice(open + 2, close).trim();
    const trailing = collected.slice(close + 2).trim();
    return {
        dictText,
        streamInline: trailing === "stream" || trailing.startsWith("stream "),
    };
}
|
|
127
|
+
/**
 * Thin wrapper around a strict sax parser that extracts the XMP
 * CreatorTool value from embedded XML metadata.
 *
 * Option `onCreatorTool` is invoked with the text content of a
 * CreatorTool element when one is encountered.
 */
class XmlHandler {
    constructor(opts = {}) {
        // Set while positioned inside a CreatorTool element so the next
        // text node is routed to onCreatorTool.
        this.readingCreatorTool = false;
        this.onCreatorTool = opts.onCreatorTool;
        this.saxParser = sax.parser(true, { xmlns: true });
        this.saxParser.onerror = (e) => {
            // XMP payloads can contain entities strict sax rejects; clear the
            // parser error and resume instead of aborting the whole scan.
            if (e.message.startsWith("Invalid character entity")) {
                this.saxParser.error = null;
                this.saxParser.resume();
                return;
            }
            throw e;
        };
        this.saxParser.onopentag = (node) => {
            const tag = node;
            // Preferred match: namespace-aware check against the XMP namespace.
            const isCreatorTool = tag.uri === "http://ns.adobe.com/xap/1.0/" && tag.local === "CreatorTool";
            // Fallback by name, in case xmlns typing/runtime differs
            const nameMatch = typeof tag.name === "string" &&
                (tag.name === "xap:CreatorTool" ||
                    tag.name.endsWith(":CreatorTool") ||
                    tag.name === "CreatorTool");
            this.readingCreatorTool = isCreatorTool || nameMatch;
        };
        this.saxParser.ontext = (text) => {
            if (!this.readingCreatorTool)
                return;
            this.onCreatorTool?.(text);
            // Report only the first text chunk of the element.
            this.readingCreatorTool = false;
        };
        this.saxParser.onclosetag = () => {
            this.readingCreatorTool = false;
        };
    }
    // Feed a chunk of XML text to the underlying sax parser.
    write(text) {
        this.saxParser.write(text);
    }
    // Signal end of input; flushes any buffered parser state.
    close() {
        this.saxParser.close();
    }
}
|
|
167
|
+
/**
 * Builds the subtype probe that recognizes Adobe Illustrator files saved
 * as PDF. The probe inspects object dictionaries, the XMP CreatorTool
 * value, and decoded stream text, returning AI_TYPE on a match and
 * undefined otherwise.
 */
function createIllustratorProbe() {
    // True when a dictionary value is actual text mentioning "Illustrator"
    // (i.e. not a bare boolean flag).
    const mentionsIllustrator = (value) =>
        Boolean(value) && value !== true && String(value).includes("Illustrator");
    return {
        name: "adobe-illustrator",
        onDict: (_ctx, dictText, dict) => {
            // Checks are ordered cheapest-first; all are Illustrator markers.
            const matched =
                dict.Illustrator === true ||
                dictText.includes("/Illustrator") ||
                mentionsIllustrator(dict.Creator) ||
                mentionsIllustrator(dict.Producer) ||
                dictText.includes("Adobe Illustrator");
            return matched ? AI_TYPE : undefined;
        },
        onCreatorTool: (_ctx, creatorTool) =>
            creatorTool.toLowerCase().includes("illustrator") ? AI_TYPE : undefined,
        onStreamText: (_ctx, streamText) =>
            streamText.includes("Adobe Illustrator") ? AI_TYPE : undefined,
    };
}
|
|
197
|
+
// Registered subtype probes; each may short-circuit detection with a
// specific result (currently only Adobe Illustrator).
const subtypeProbes = [createIllustratorProbe()];
/**
 * File-type detector plugin:
 * - returns undefined if NOT a PDF (and does not advance tokenizer.position in that case)
 * - returns PDF_TYPE for PDF
 * - returns subtype result when a probe matches (e.g. AI_TYPE)
 */
async function _detectPdf(tokenizer, opts = {}) {
    const debug = !!opts.debug;
    // Cap on scanned lines so pathological files cannot stall detection.
    const maxScanLines = opts.maxScanLines ?? 50000;
    const log = (...args) => {
        if (debug)
            console.log(...args);
    };
    const ctx = { debug, log };
    // NOT PDF => PEEK ONLY, do not advance
    const { isPdf, headerOffset } = await peekPdfHeader(tokenizer);
    if (!isPdf)
        return undefined;
    // Confirmed PDF => ok to advance
    log(`[PDF] Detected %PDF- header at +${headerOffset} (abs=${tokenizer.position + headerOffset})`);
    if (headerOffset > 0)
        await skipBytes(tokenizer, headerOffset);
    const reader = new PdfTokenizerReader(tokenizer, { debug });
    // pushback so we don't lose a line when probing for "stream"
    let pendingLine = null;
    const readLine = async () => {
        if (pendingLine !== null) {
            const l = pendingLine;
            pendingLine = null;
            return l;
        }
        return await reader.readLine();
    };
    // Only probes that implement onCreatorTool take part in XMP handling.
    const creatorToolListeners = subtypeProbes
        .map(p => p.onCreatorTool)
        .filter((fn) => typeof fn === "function");
    log("[ROOT] Start parsing (PDF)");
    let state = 0; // ROOT=0, OBJ=10
    let scannedLines = 0;
    // Line-oriented state machine: ROOT scans for "N G obj" headers,
    // OBJ inspects dictionaries and optional stream payloads.
    while (scannedLines++ < maxScanLines) {
        const line = await readLine();
        if (line === null)
            break;
        if (state === 0) {
            const m = OBJ_REGEX.exec(line);
            if (m) {
                log(`Found object: ${m[1]} Generation: ${m[2]}`);
                state = 10;
            }
            continue;
        }
        if (state === 10) {
            if (line.trim() === "endobj") {
                log("[OBJ] => [ROOT]");
                state = 0;
                continue;
            }
            if (!line.includes("<<"))
                continue;
            const { dictText, streamInline } = await readDictionaryBlock(reader, line);
            if (!dictText)
                continue;
            log(`[OBJ] Dictionary content: ${dictText.replace(/\s+/g, " ")}`);
            log(streamInline ? "[OBJ] Stream keyword detected: stream" : "[OBJ] No stream keyword present on this line.");
            const objectInfo = parseDictFromRaw(dictText);
            // Dict probes
            for (const probe of subtypeProbes) {
                if (!probe.onDict)
                    continue;
                const hit = probe.onDict(ctx, dictText, objectInfo);
                if (hit)
                    return hit;
            }
            // Stream check with pushback
            let hasStream = streamInline;
            if (!hasStream) {
                const nextLine = await readLine();
                if (nextLine === null)
                    break;
                if (nextLine.trim() === "stream") {
                    hasStream = true;
                }
                else {
                    pendingLine = nextLine;
                }
            }
            if (!hasStream)
                continue;
            // Length may be indirect like "12 0 R", skip if not numeric
            const lenVal = objectInfo.Length;
            if (!lenVal || lenVal === true)
                continue;
            const streamLength = parseInt(lenVal, 10);
            if (!Number.isFinite(streamLength) || streamLength < 0)
                continue;
            log(`[OBJ] => [STREAM] Start read stream of ${streamLength} bytes`);
            await reader.consumeStreamEol();
            const rawBytes = await reader.readBytes(streamLength);
            if (!rawBytes)
                break;
            const decodedBytes = await decodeStreamBytes(objectInfo, rawBytes);
            const streamText = utf8Decoder.decode(decodedBytes);
            // Stream probes
            for (const probe of subtypeProbes) {
                if (!probe.onStreamText)
                    continue;
                const hit = probe.onStreamText(ctx, streamText, objectInfo);
                if (hit)
                    return hit;
            }
            // XMP CreatorTool
            const looksLikeXmp = objectInfo.Type === "Metadata" ||
                objectInfo.Type === "/Metadata" ||
                objectInfo.Subtype === "XML" ||
                objectInfo.Subtype === "/XML" ||
                objectInfo.XML === true;
            if (looksLikeXmp && creatorToolListeners.length) {
                log("[STREAM] XML metadata detected, feeding SAX");
                const xml = new XmlHandler({
                    onCreatorTool: (v) => {
                        log(`CreatorTool=${v}`);
                        // A matching probe result is thrown to abort SAX parsing
                        // early; it is caught and returned just below.
                        for (const fn of creatorToolListeners) {
                            const hit = fn(ctx, v);
                            if (hit)
                                throw hit;
                        }
                    },
                });
                try {
                    xml.write(streamText);
                    xml.close();
                }
                catch (e) {
                    // Distinguish a thrown detection result from a real error.
                    if (e && typeof e === "object" && "ext" in e && "mime" in e) {
                        return e;
                    }
                    throw e;
                }
            }
            log("[STREAM] => [OBJ]");
        }
    }
    log("[ROOT] Done parsing (PDF)");
    return PDF_TYPE;
}
|
|
343
|
+
/**
 * file-type custom detector for PDF files and PDF-based subtypes.
 *
 * Register via `new FileTypeParser({ customDetectors: [detectPdf] })`.
 * `detect` resolves to undefined when the input is not a PDF (leaving the
 * tokenizer position untouched), to the generic PDF result for plain PDFs,
 * or to a subtype result (e.g. Adobe Illustrator) when a probe matches.
 */
export const detectPdf = {
    // Detector id used by file-type to identify this custom detector.
    // Fixed from 'cfbf' (Compound File Binary Format — a copy/paste slip
    // from another detector) to 'pdf', which is what this detector handles.
    id: 'pdf',
    detect: async (tokenizer) => {
        return _detectPdf(tokenizer);
    }
};
|
package/package.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@file-type/pdf",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "file-type plugin to parse PDF files",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"exports": {
|
|
7
|
+
"default": "./lib/index.js"
|
|
8
|
+
},
|
|
9
|
+
"files": [
|
|
10
|
+
"lib/**/*.js",
|
|
11
|
+
"lib/**/*.d.ts"
|
|
12
|
+
],
|
|
13
|
+
"scripts": {
|
|
14
|
+
"clean": "del-cli lib/**/*.js lib/**/*.js.map lib/**/*.d.ts src/**/*.d.ts",
|
|
15
|
+
"compile": "tsc -p lib",
|
|
16
|
+
"build": "npm run clean && npm run compile",
|
|
17
|
+
"lint:ts": "biome check",
|
|
18
|
+
"test": "mocha",
|
|
19
|
+
"prepublishOnly": "npm run build",
|
|
20
|
+
"update-biome": "npm add -D --exact @biomejs/biome && npx @biomejs/biome migrate --write"
|
|
21
|
+
},
|
|
22
|
+
"repository": {
|
|
23
|
+
"type": "git",
|
|
24
|
+
"url": "git+https://github.com/Borewit/file-type-pdf.git"
|
|
25
|
+
},
|
|
26
|
+
"keywords": [
|
|
27
|
+
"file-type",
|
|
28
|
+
"PDF",
|
|
29
|
+
"XMP"
|
|
30
|
+
],
|
|
31
|
+
"author": "Borewit",
|
|
32
|
+
"license": "MIT",
|
|
33
|
+
"bugs": {
|
|
34
|
+
"url": "https://github.com/Borewit/file-type-pdf/issues"
|
|
35
|
+
},
|
|
36
|
+
"homepage": "https://github.com/Borewit/file-type-pdf#readme",
|
|
37
|
+
"dependencies": {
|
|
38
|
+
"read-next-line": "^0.5.0",
|
|
39
|
+
"sax": "^1.4.1"
|
|
40
|
+
},
|
|
41
|
+
"devDependencies": {
|
|
42
|
+
"@biomejs/biome": "^2.3.10",
|
|
43
|
+
"@types/node": "^25.0.3",
|
|
44
|
+
"@types/sax": "^1.2.7",
|
|
45
|
+
"chai": "^6.2.2",
|
|
46
|
+
"del-cli": "^7.0.0",
|
|
47
|
+
"file-type": "^21.2.0",
|
|
48
|
+
"mocha": "^11.7.5",
|
|
49
|
+
"strtok3": "^10.3.4",
|
|
50
|
+
"typescript": "^5.9.3"
|
|
51
|
+
}
|
|
52
|
+
}
|