pdf-metadata-extractor 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +91 -0
- package/LICENSE +21 -0
- package/README.md +427 -0
- package/dist/core/extractor.d.ts +3 -0
- package/dist/core/extractor.d.ts.map +1 -0
- package/dist/core/extractor.js +87 -0
- package/dist/core/extractor.js.map +1 -0
- package/dist/core/pageProcessor.d.ts +30 -0
- package/dist/core/pageProcessor.d.ts.map +1 -0
- package/dist/core/pageProcessor.js +480 -0
- package/dist/core/pageProcessor.js.map +1 -0
- package/dist/core/sourceDetector.d.ts +4 -0
- package/dist/core/sourceDetector.d.ts.map +1 -0
- package/dist/core/sourceDetector.js +33 -0
- package/dist/core/sourceDetector.js.map +1 -0
- package/dist/fetch/fetchBuffer.d.ts +2 -0
- package/dist/fetch/fetchBuffer.d.ts.map +1 -0
- package/dist/fetch/fetchBuffer.js +12 -0
- package/dist/fetch/fetchBuffer.js.map +1 -0
- package/dist/index.d.ts +8 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +42 -0
- package/dist/index.js.map +1 -0
- package/dist/parser/streamParser.d.ts +34 -0
- package/dist/parser/streamParser.d.ts.map +1 -0
- package/dist/parser/streamParser.js +191 -0
- package/dist/parser/streamParser.js.map +1 -0
- package/dist/parser/textParser.d.ts +56 -0
- package/dist/parser/textParser.d.ts.map +1 -0
- package/dist/parser/textParser.js +175 -0
- package/dist/parser/textParser.js.map +1 -0
- package/dist/pdf/fonts.d.ts +4 -0
- package/dist/pdf/fonts.d.ts.map +1 -0
- package/dist/pdf/fonts.js +113 -0
- package/dist/pdf/fonts.js.map +1 -0
- package/dist/pdf/loader.d.ts +2 -0
- package/dist/pdf/loader.d.ts.map +1 -0
- package/dist/pdf/loader.js +18 -0
- package/dist/pdf/loader.js.map +1 -0
- package/dist/pdf/metadata.d.ts +13 -0
- package/dist/pdf/metadata.d.ts.map +1 -0
- package/dist/pdf/metadata.js +9 -0
- package/dist/pdf/metadata.js.map +1 -0
- package/dist/pdf/xobjects.d.ts +12 -0
- package/dist/pdf/xobjects.d.ts.map +1 -0
- package/dist/pdf/xobjects.js +107 -0
- package/dist/pdf/xobjects.js.map +1 -0
- package/dist/types.d.ts +136 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +3 -0
- package/dist/types.js.map +1 -0
- package/dist/utils/buffer.d.ts +3 -0
- package/dist/utils/buffer.d.ts.map +1 -0
- package/dist/utils/buffer.js +11 -0
- package/dist/utils/buffer.js.map +1 -0
- package/dist/utils/color.d.ts +6 -0
- package/dist/utils/color.d.ts.map +1 -0
- package/dist/utils/color.js +21 -0
- package/dist/utils/color.js.map +1 -0
- package/dist/utils/matrix.d.ts +11 -0
- package/dist/utils/matrix.d.ts.map +1 -0
- package/dist/utils/matrix.js +22 -0
- package/dist/utils/matrix.js.map +1 -0
- package/package.json +61 -0
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import { PDFDocument } from "pdf-lib";
|
|
2
|
+
import { TextElement } from "../types";
|
|
3
|
+
/**
 * Axis-aligned rectangle.
 *
 * As produced by `getBoundingBox`, `x`/`y` are the smallest X and Y found
 * among the elements and `width`/`height` are the distances from that corner
 * to the largest X and Y extents. `filterByRegion` treats the rectangle as
 * closed (points on its edges are inside).
 */
export interface BoundingBox {
|
|
4
|
+
x: number;
|
|
5
|
+
y: number;
|
|
6
|
+
width: number;
|
|
7
|
+
height: number;
|
|
8
|
+
}
|
|
9
|
+
/**
 * Compute the smallest rectangle enclosing every element.
 *
 * An element without `width` contributes zero width; an element without
 * `height` uses its `fontSize` as height.
 *
 * @param elements - text elements to enclose
 * @returns the enclosing box, or `null` when `elements` is empty
 */
export declare function getBoundingBox(elements: TextElement[]): BoundingBox | null;
|
|
10
|
+
/**
 * Keep the elements whose `(x, y)` position lies inside `box`, edges included.
 *
 * Only the element's own `x`/`y` point is tested; its width and height are
 * not taken into account.
 *
 * @param elements - candidate text elements
 * @param box - region to test against
 * @returns a new array with the matching elements, in their original order
 */
export declare function filterByRegion(elements: TextElement[], box: BoundingBox): TextElement[];
|
|
11
|
+
/**
|
|
12
|
+
* Decompress and return a page's full content text — including any Form XObject
|
|
13
|
+
* sub-streams that pdfjs would process when extracting text.
|
|
14
|
+
* `pageIndex` is 0-based (pdf-lib convention).
|
|
15
|
+
*/
|
|
16
|
+
export declare function getContentStreamText(pdfDoc: PDFDocument, pageIndex: number): string;
|
|
17
|
+
/**
|
|
18
|
+
* Extract the ordered list of unique PDF resource font keys from combined stream text.
|
|
19
|
+
* PDF names can contain hyphens/plus signs (e.g. "f-0-0", "F4+sub").
|
|
20
|
+
* e.g. "/F4 42 Tf … /f-0-0 10 Tf … /F4 8 Tf" → ["F4", "f-0-0"]
|
|
21
|
+
*/
|
|
22
|
+
export declare function streamFontOrder(streamText: string): string[];
|
|
23
|
+
/**
|
|
24
|
+
* Build a bridge map: pdfjsKey → pdfResourceKey
|
|
25
|
+
*
|
|
26
|
+
* pdfjs-dist assigns keys like g_d1_f2 in first-appearance order matching the
|
|
27
|
+
* content stream Tf commands. The docId (d0, d1 …) is global across all
|
|
28
|
+
* documents in the same process, so we match by position rather than by name.
|
|
29
|
+
*
|
|
30
|
+
* @param streamText — full content text (main + XObject streams)
|
|
31
|
+
* @param pdfjsOrderedFonts — fontName values in first-appearance order from pdfjs
|
|
32
|
+
*/
|
|
33
|
+
export declare function buildFontBridge(streamText: string, pdfjsOrderedFonts: string[]): Record<string, string>;
|
|
34
|
+
//# sourceMappingURL=streamParser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"streamParser.d.ts","sourceRoot":"","sources":["../../src/parser/streamParser.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAoB,MAAM,SAAS,CAAC;AACxD,OAAO,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAIvC,MAAM,WAAW,WAAW;IAC1B,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,GAAG,WAAW,GAAG,IAAI,CAe1E;AAED,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,WAAW,GAAG,WAAW,EAAE,CAQvF;AAgID;;;;GAIG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM,CAcnF;AAED;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,EAAE,CAS5D;AAED;;;;;;;;;GASG;AACH,wBAAgB,eAAe,CAC7B,UAAU,EAAE,MAAM,EAClB,iBAAiB,EAAE,MAAM,EAAE,GAC1B,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAOxB"}
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
var __importDefault = (this && this.__importDefault) || function (mod) {
|
|
3
|
+
return (mod && mod.__esModule) ? mod : { "default": mod };
|
|
4
|
+
};
|
|
5
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
+
exports.getBoundingBox = getBoundingBox;
|
|
7
|
+
exports.filterByRegion = filterByRegion;
|
|
8
|
+
exports.getContentStreamText = getContentStreamText;
|
|
9
|
+
exports.streamFontOrder = streamFontOrder;
|
|
10
|
+
exports.buildFontBridge = buildFontBridge;
|
|
11
|
+
const zlib_1 = __importDefault(require("zlib"));
|
|
12
|
+
const pdf_lib_1 = require("pdf-lib");
|
|
13
|
+
/**
 * Compute the smallest rectangle enclosing every element.
 *
 * A missing `width` counts as 0 and a missing `height` falls back to the
 * element's `fontSize`.
 *
 * @param elements — text elements to enclose
 * @returns `{ x, y, width, height }`, or `null` when the list is empty
 */
function getBoundingBox(elements) {
    if (elements.length === 0) {
        return null;
    }
    const start = { left: Infinity, bottom: Infinity, right: -Infinity, top: -Infinity };
    const extent = elements.reduce((acc, item) => {
        const itemWidth = item.width ?? 0;
        const itemHeight = item.height ?? item.fontSize;
        return {
            left: Math.min(acc.left, item.x),
            bottom: Math.min(acc.bottom, item.y),
            right: Math.max(acc.right, item.x + itemWidth),
            top: Math.max(acc.top, item.y + itemHeight),
        };
    }, start);
    return {
        x: extent.left,
        y: extent.bottom,
        width: extent.right - extent.left,
        height: extent.top - extent.bottom,
    };
}
|
|
27
|
+
/**
 * Keep the elements whose `(x, y)` position lies inside `box`, edges included.
 * Only the element's own point is tested, not its width or height.
 *
 * @param elements — candidate text elements
 * @param box — region with `x`, `y`, `width`, `height`
 * @returns a new array with the matching elements in their original order
 */
function filterByRegion(elements, box) {
    return elements.filter((candidate) => {
        const { x: px, y: py } = candidate;
        // Each guard negates a ">=" / "<=" test so NaN coordinates are rejected.
        if (!(px >= box.x)) {
            return false;
        }
        if (!(py >= box.y)) {
            return false;
        }
        if (!(px <= box.x + box.width)) {
            return false;
        }
        return py <= box.y + box.height;
    });
}
|
|
33
|
+
// ─── Content-stream helpers ──────────────────────────────────────────────────
|
|
34
|
+
/**
 * Copy the `contents` of a stream-like object into a Buffer.
 *
 * @param obj — any value; only objects exposing a `contents` property qualify
 * @returns a Buffer built from `obj.contents`, or `null` when `obj` is falsy
 *          or has no `contents`
 */
function streamContents(obj) {
    const hasContents = Boolean(obj) && typeof obj.contents !== "undefined";
    return hasContents ? Buffer.from(obj.contents) : null;
}
|
|
43
|
+
/**
 * Decode raw stream bytes to a latin1 string.
 *
 * Tries zlib inflate first, then raw inflate; when both throw, the bytes are
 * returned undecoded (as latin1).
 *
 * @param raw — Buffer holding the stream bytes
 * @returns the decoded text
 */
function decompress(raw) {
    const inflaters = [
        (bytes) => zlib_1.default.inflateSync(bytes),
        (bytes) => zlib_1.default.inflateRawSync(bytes),
    ];
    for (const inflate of inflaters) {
        try {
            return inflate(raw).toString("latin1");
        }
        catch { /* fall through to the next strategy */ }
    }
    return raw.toString("latin1");
}
|
|
57
|
+
/**
 * Look up a page's Contents entry and return its decompressed text.
 *
 * The looked-up object is either a single stream or an array-like object
 * (exposing `size()` and `get(i)`) of stream references; array members are
 * decompressed individually and joined with a newline.
 *
 * @param pdfDoc — pdf-lib document whose `context` resolves references
 * @param contentsRef — the Contents entry; falsy yields ""
 * @returns the decompressed text, or "" when nothing usable is found
 */
function resolveStreams(pdfDoc, contentsRef) {
    if (!contentsRef) {
        return "";
    }
    const resolved = pdfDoc.context.lookup(contentsRef);
    // Case 1: the entry is itself a stream.
    const direct = streamContents(resolved);
    if (direct) {
        return decompress(direct);
    }
    // Case 2: the entry is an array of stream references.
    if (!resolved || typeof resolved.size !== "function") {
        return "";
    }
    const chunks = [];
    let index = 0;
    while (index < resolved.size()) {
        const bytes = streamContents(pdfDoc.context.lookup(resolved.get(index)));
        if (bytes) {
            chunks.push(decompress(bytes));
        }
        index += 1;
    }
    return chunks.join("\n");
}
|
|
83
|
+
/**
 * Collect the decompressed text of every Form XObject invoked by a stream,
 * descending into the Form XObjects those streams invoke in turn.
 * This handles PDFs where text lives inside /Form XObjects rather than the page stream.
 *
 * Collection is best-effort: if any lookup throws, whatever was gathered up to
 * that point is returned and the error is discarded.
 *
 * @param pdfDoc — pdf-lib document
 * @param resourcesRef — the Resources dict for the current stream scope
 * @param streamText — already-decompressed text of the current stream
 * @param visited — set of stream contents already collected; shared across
 *                  the whole recursion to guard against circular references
 *                  and to avoid emitting the same stream twice
 * @param depth — remaining recursion budget; nothing is collected when <= 0
 * @returns decompressed stream texts in depth-first encounter order
 */
function collectXObjectStreams(pdfDoc, resourcesRef, streamText, visited, depth) {
    if (depth <= 0)
        return [];
    const extra = [];
    try {
        const resDict = pdfDoc.context.lookupMaybe(resourcesRef, pdf_lib_1.PDFDict);
        if (!resDict)
            return extra;
        const xobjDictRef = resDict.get(pdf_lib_1.PDFName.of("XObject"));
        if (!xobjDictRef)
            return extra;
        const xobjDict = pdfDoc.context.lookupMaybe(xobjDictRef, pdf_lib_1.PDFDict);
        if (!xobjDict)
            return extra;
        // Find all /Name Do operators in the stream
        const doNames = new Set([...streamText.matchAll(/\/([^\s/\[\]<>(){}]+)\s+Do/g)].map((m) => m[1]));
        for (const name of doNames) {
            const xobjRef = xobjDict.get(pdf_lib_1.PDFName.of(name));
            if (!xobjRef)
                continue;
            const xobj = pdfDoc.context.lookup(xobjRef);
            // Only process Form XObjects (Subtype = Form)
            const subtype = xobj?.dict?.get(pdf_lib_1.PDFName.of("Subtype"));
            const subtypeStr = subtype?.toString?.()?.replace(/^\//, "");
            if (subtypeStr !== "Form")
                continue;
            const raw = streamContents(xobj);
            if (!raw)
                continue;
            // Key on the complete raw bytes. A short prefix is not unique:
            // distinct streams frequently begin with identical bytes (same
            // compression header, same leading operators), and a prefix key
            // would make the second one look already visited and drop its text.
            const key = raw.toString("latin1");
            if (visited.has(key))
                continue;
            visited.add(key);
            const text = decompress(raw);
            extra.push(text);
            // Recurse: XObjects can reference their own sub-XObjects
            const innerResRef = xobj?.dict?.get(pdf_lib_1.PDFName.of("Resources"));
            const nested = collectXObjectStreams(pdfDoc, innerResRef, text, visited, depth - 1);
            extra.push(...nested);
        }
    }
    catch { /* best-effort: keep what was collected so far */ }
    return extra;
}
|
|
136
|
+
/**
 * Decompress and return a page's full content text — including any Form XObject
 * sub-streams that pdfjs would process when extracting text.
 * `pageIndex` is 0-based (pdf-lib convention).
 *
 * The page's own stream text comes first, followed by the Form XObject texts,
 * all joined with newlines. XObject recursion is limited to 4 levels.
 *
 * @param pdfDoc — pdf-lib document
 * @param pageIndex — 0-based page index
 * @returns the combined text, or "" when anything throws (best-effort)
 */
function getContentStreamText(pdfDoc, pageIndex) {
    try {
        const page = pdfDoc.getPage(pageIndex);
        const contentsRef = page.node.get(pdf_lib_1.PDFName.of("Contents"));
        const mainText = resolveStreams(pdfDoc, contentsRef);
        // /Resources is an inheritable page attribute: a page leaf may omit it
        // and take it from an ancestor Pages node. Fall back to pdf-lib's
        // inherited lookup when the leaf has no entry of its own; the typeof
        // guard keeps this safe if the page node lacks that accessor.
        const resourcesRef = page.node.get(pdf_lib_1.PDFName.of("Resources")) ??
            (typeof page.node.Resources === "function" ? page.node.Resources() : undefined);
        const visited = new Set();
        const xobjTexts = collectXObjectStreams(pdfDoc, resourcesRef, mainText, visited, 4);
        return [mainText, ...xobjTexts].join("\n");
    }
    catch { /* best-effort: fall through to the empty result */ }
    return "";
}
|
|
154
|
+
/**
 * Extract the ordered list of unique PDF resource font keys from combined stream text.
 * PDF names can contain hyphens/plus signs (e.g. "f-0-0", "F4+sub").
 * e.g. "/F4 42 Tf … /f-0-0 10 Tf … /F4 8 Tf" → ["F4", "f-0-0"]
 *
 * The size operand may carry a sign ("/F1 -12 Tf"); PDF numbers allow one and
 * skipping such commands would shift every later font in the returned order.
 *
 * This is a plain text scan: it does not tokenize the stream, so a matching
 * sequence inside a string literal would also be picked up.
 *
 * @param streamText — decompressed content-stream text
 * @returns font keys (without the leading "/") in first-appearance order
 */
function streamFontOrder(streamText) {
    const seen = new Set();
    const ordered = [];
    // Match /FontName size Tf — FontName is any PDF name (no whitespace or
    // delimiters), size is an optionally signed run of digits and dots.
    for (const m of streamText.matchAll(/\/([^\s/\[\]<>(){}]+)\s+[-+]?[\d.]+\s+Tf/g)) {
        const key = m[1];
        if (!seen.has(key)) {
            seen.add(key);
            ordered.push(key);
        }
    }
    return ordered;
}
|
|
172
|
+
/**
 * Map each pdfjs font key to the PDF resource font key at the same position.
 *
 * pdfjs-dist assigns keys like g_d1_f2 in first-appearance order matching the
 * content stream Tf commands. Because the docId part (d0, d1 …) is global
 * across all documents in the same process, the two lists are paired by
 * index rather than by name.
 *
 * @param streamText — full content text (main + XObject streams)
 * @param pdfjsOrderedFonts — fontName values in first-appearance order from pdfjs
 * @returns object keyed by pdfjs font name; entries with no stream font at
 *          the same index are omitted
 */
function buildFontBridge(streamText, pdfjsOrderedFonts) {
    const resourceKeys = streamFontOrder(streamText);
    return pdfjsOrderedFonts.reduce((mapping, pdfjsName, position) => {
        const resourceKey = resourceKeys[position];
        if (resourceKey) {
            mapping[pdfjsName] = resourceKey;
        }
        return mapping;
    }, {});
}
|
|
191
|
+
//# sourceMappingURL=streamParser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"streamParser.js","sourceRoot":"","sources":["../../src/parser/streamParser.ts"],"names":[],"mappings":";;;;;AAaA,wCAeC;AAED,wCAQC;AAqID,oDAcC;AAOD,0CASC;AAYD,0CAUC;AA/ND,gDAAwB;AACxB,qCAAwD;AAYxD,SAAgB,cAAc,CAAC,QAAuB;IACpD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,IAAI,IAAI,GAAG,QAAQ,EAAE,IAAI,GAAG,QAAQ,EAAE,IAAI,GAAG,CAAC,QAAQ,EAAE,IAAI,GAAG,CAAC,QAAQ,CAAC;IAEzE,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,MAAM,CAAC,GAAG,EAAE,CAAC,KAAK,IAAI,CAAC,CAAC;QACxB,MAAM,CAAC,GAAG,EAAE,CAAC,MAAM,IAAI,EAAE,CAAC,QAAQ,CAAC;QACnC,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAChC,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAClC,CAAC;IAED,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,GAAG,IAAI,EAAE,MAAM,EAAE,IAAI,GAAG,IAAI,EAAE,CAAC;AACvE,CAAC;AAED,SAAgB,cAAc,CAAC,QAAuB,EAAE,GAAgB;IACtE,OAAO,QAAQ,CAAC,MAAM,CACpB,CAAC,EAAE,EAAE,EAAE,CACL,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACb,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACb,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,KAAK;QACzB,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,MAAM,CAC7B,CAAC;AACJ,CAAC;AAED,gFAAgF;AAEhF;;GAEG;AACH,SAAS,cAAc,CAAC,GAAY;IAClC,IAAI,GAAG,IAAI,OAAQ,GAA8B,CAAC,QAAQ,KAAK,WAAW,EAAE,CAAC;QAC3E,OAAO,MAAM,CAAC,IAAI,CAAE,GAAgC,CAAC,QAAQ,CAAC,CAAC;IACjE,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,UAAU,CAAC,GAAW;IAC7B,IAAI,CAAC;QAAC,OAAO,cAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC;IACxE,IAAI,CAAC;QAAC,OAAO,cAAI,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC;IAC3E,OAAO,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CACrB,MAAmB,EACnB,WAAoB;IAEpB,IAAI,CAAC,WAAW;QAAE,OAAO,EAAE,CAAC;IAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CA
AC,MAAM,CACpC,WAA0D,CAC3D,CAAC;IAEF,gBAAgB;IAChB,MAAM,MAAM,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;IACxC,IAAI,MAAM;QAAE,OAAO,UAAU,CAAC,MAAM,CAAC,CAAC;IAEtC,sBAAsB;IACtB,MAAM,GAAG,GAAG,QAAqE,CAAC;IAClF,IAAI,GAAG,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,MAAM,GAAG,GAAG,GAAG,CAAC,GAAI,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM,CAAC,GAAK,MAAM,CAAC,OAAO,CAAC,MAAM,CAC/B,GAAkD,CACnD,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC;YAC9B,IAAI,GAAG;gBAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QACvC,CAAC;QACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,qBAAqB,CAC5B,MAAmB,EACnB,YAAqB,EACrB,UAAkB,EAClB,OAAoB,EACpB,KAAa;IAEb,IAAI,KAAK,IAAI,CAAC;QAAE,OAAO,EAAE,CAAC;IAE1B,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACxC,YAAgE,EAChE,iBAAO,CACR,CAAC;QACF,IAAI,CAAC,OAAO;YAAE,OAAO,KAAK,CAAC;QAE3B,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,WAAW;YAAE,OAAO,KAAK,CAAC;QAE/B,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,WAAW,EAAE,iBAAO,CAAC,CAAC;QAClE,IAAI,CAAC,QAAQ;YAAE,OAAO,KAAK,CAAC;QAE5B,4CAA4C;QAC5C,MAAM,OAAO,GAAG,IAAI,GAAG,CACrB,CAAC,GAAG,UAAU,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CACzE,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;YAC3B,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC;YAC/C,IAAI,CAAC,OAAO;gBAAE,SAAS;YAEvB,MAAM,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAChC,OAAsD,CACvD,CAAC;YAEF,8CAA8C;YAC9C,MAAM,OAAO,GAAI,IAA2B,EAAE,IAAI,EAAE,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC;YAC/E,MAAM,UAAU,GAAG,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAC7D,IAAI,UAAU,KAAK,MAAM;gBAAE,SAAS;YAEpC,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,IAAI,CAAC,GAAG;gBAAE,SAAS;YAEnB,MAAM,GAAG,GAAG,GAAG,CAAC,QAAQ,CAAC,KAAK,
CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC7C,IAAI,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,SAAS;YAC/B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAEjB,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;YAC7B,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAEjB,yDAAyD;YACzD,MAAM,WAAW,GAAI,IAA2B,EAAE,IAAI,EAAE,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC;YACrF,MAAM,MAAM,GAAG,qBAAqB,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;YACpF,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAExB,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,SAAgB,oBAAoB,CAAC,MAAmB,EAAE,SAAiB;IACzE,IAAI,CAAC;QACH,MAAM,IAAI,GAAU,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QAC9C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC;QAC1D,MAAM,QAAQ,GAAM,cAAc,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QAExD,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC;QAC5D,MAAM,OAAO,GAAQ,IAAI,GAAG,EAAU,CAAC;QACvC,MAAM,SAAS,GAAM,qBAAqB,CAAC,MAAM,EAAE,YAAY,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC;QAEvF,OAAO,CAAC,QAAQ,EAAE,GAAG,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7C,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAExB,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;GAIG;AACH,SAAgB,eAAe,CAAC,UAAkB;IAChD,MAAM,IAAI,GAAM,IAAI,GAAG,EAAU,CAAC;IAClC,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,mFAAmF;IACnF,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,QAAQ,CAAC,sCAAsC,CAAC,EAAE,CAAC;QAC5E,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACjB,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAAC,CAAC;IAC3D,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;;;;;;;GASG;AACH,SAAgB,eAAe,CAC7B,UAAkB,EAClB,iBAA2B;IAE3B,MAAM,WAAW,GAAG,eAAe,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,iBAAiB,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE;QACxC,IAAI,WAAW,CAAC,CAAC,CAAC;YAAE,MAAM,CAAC,QAAQ,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IACH,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { TextElement, TextLine, TextLineWithWords, TextWord } from "../types";
|
|
2
|
+
/**
|
|
3
|
+
* Group text elements into visual lines.
|
|
4
|
+
*
|
|
5
|
+
* Elements whose Y values fall within `tolerance` points of each other are
|
|
6
|
+
* considered the same line. Within each line, elements are sorted left-to-right
|
|
7
|
+
* by X. Empty-string elements are discarded; whitespace-only elements (spaces)
|
|
8
|
+
* are kept so that word gaps are preserved when joining.
|
|
9
|
+
*
|
|
10
|
+
* @param elements - flat list of TextElements from a page
|
|
11
|
+
* @param tolerance - max Y delta to treat two elements as the same line (default 2)
|
|
12
|
+
*/
|
|
13
|
+
export declare function groupIntoLines(elements: TextElement[], tolerance?: number): TextLine[];
|
|
14
|
+
/**
|
|
15
|
+
* Group text elements into words by detecting inter-character gaps.
|
|
16
|
+
*
|
|
17
|
+
* A new word starts when:
|
|
18
|
+
* - the current element is a whitespace-only element (explicit space), OR
|
|
19
|
+
* - the X gap between consecutive elements exceeds `fontSize × gapFactor`
|
|
20
|
+
*
|
|
21
|
+
* Whitespace elements are used as separators but are NOT included in the
|
|
22
|
+
* resulting words, so the output contains only printable content.
|
|
23
|
+
*
|
|
24
|
+
* Works on elements that share the same visual line (i.e. the output of
|
|
25
|
+
* `groupIntoLines`), but can also be called on any sorted element list.
|
|
26
|
+
*
|
|
27
|
+
* @param elements - TextElements sorted left-to-right (same line)
|
|
28
|
+
* @param gapFactor - gap threshold as a fraction of fontSize (default 0.4)
|
|
29
|
+
*/
|
|
30
|
+
export declare function groupIntoWords(elements: TextElement[], gapFactor?: number): TextWord[];
|
|
31
|
+
/**
|
|
32
|
+
* Convenience: extract all words from a flat list of TextElements in a single call.
|
|
33
|
+
* Equivalent to `groupIntoLines(elements).flatMap(l => groupIntoWords(l.elements))`.
|
|
34
|
+
* Words are returned in reading order (top-to-bottom, left-to-right).
|
|
35
|
+
*
|
|
36
|
+
* @param elements - flat list of TextElements from a page
|
|
37
|
+
* @param lineTolerance - Y tolerance for line grouping (default 2)
|
|
38
|
+
* @param gapFactor - word gap threshold as a fraction of fontSize (default 0.4)
|
|
39
|
+
*/
|
|
40
|
+
export declare function extractWords(elements: TextElement[], lineTolerance?: number, gapFactor?: number): TextWord[];
|
|
41
|
+
/**
|
|
42
|
+
* Full structured text extraction: returns lines with their words already grouped.
|
|
43
|
+
* Use this to get a ready-to-serialize JSON hierarchy:
|
|
44
|
+
* line.text — full line string
|
|
45
|
+
* line.words — words with position, size, font, color, and raw elements
|
|
46
|
+
*
|
|
47
|
+
* @param elements - flat list of TextElements from a page
|
|
48
|
+
* @param lineTolerance - Y tolerance for line grouping (default 2)
|
|
49
|
+
* @param gapFactor - word gap threshold as fraction of fontSize (default 0.4)
|
|
50
|
+
*/
|
|
51
|
+
export declare function extractTextStructure(elements: TextElement[], lineTolerance?: number, gapFactor?: number): TextLineWithWords[];
|
|
52
|
+
/**
|
|
53
|
+
* Concatenate text elements in a line into a single string (legacy helper).
|
|
54
|
+
*/
|
|
55
|
+
export declare function lineToString(line: TextElement[]): string;
|
|
56
|
+
//# sourceMappingURL=textParser.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"textParser.d.ts","sourceRoot":"","sources":["../../src/parser/textParser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,QAAQ,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAI9E;;;;;;;;;;GAUG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,EAAE,SAAS,SAAI,GAAG,QAAQ,EAAE,CAqBjF;AA0BD;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,EAAE,SAAS,SAAM,GAAG,QAAQ,EAAE,CA4DnF;AAED;;;;;;;;GAQG;AACH,wBAAgB,YAAY,CAC1B,QAAQ,EAAE,WAAW,EAAE,EACvB,aAAa,SAAI,EACjB,SAAS,SAAM,GACd,QAAQ,EAAE,CAIZ;AAED;;;;;;;;;GASG;AACH,wBAAgB,oBAAoB,CAClC,QAAQ,EAAE,WAAW,EAAE,EACvB,aAAa,SAAI,EACjB,SAAS,SAAM,GACd,iBAAiB,EAAE,CAKrB;AAID;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,WAAW,EAAE,GAAG,MAAM,CAMxD"}
|
|
@@ -0,0 +1,175 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.groupIntoLines = groupIntoLines;
|
|
4
|
+
exports.groupIntoWords = groupIntoWords;
|
|
5
|
+
exports.extractWords = extractWords;
|
|
6
|
+
exports.extractTextStructure = extractTextStructure;
|
|
7
|
+
exports.lineToString = lineToString;
|
|
8
|
+
// ─── Lines ────────────────────────────────────────────────────────────────────
|
|
9
|
+
/**
 * Group text elements into visual lines.
 *
 * Each element's Y is snapped to the nearest multiple of `tolerance`, and
 * elements sharing a snapped value form one line. Note this is fixed-bucket
 * rounding: two elements closer than `tolerance` can still land in adjacent
 * buckets when they straddle a bucket boundary. A `tolerance` of 0 (or
 * null/NaN) groups by exact Y instead of dividing by zero.
 *
 * Within each line, elements are sorted left-to-right by X. Empty-string
 * elements are discarded; whitespace-only elements (spaces) are kept so that
 * word gaps are preserved when joining. Lines whose joined, trimmed text is
 * empty are dropped. Lines are returned in ascending order of their key.
 *
 * @param elements - flat list of TextElements from a page
 * @param tolerance - bucket size used to snap Y values (default 2)
 * @returns lines as `{ y, text, elements }`, where `y` is the snapped value
 */
function groupIntoLines(elements, tolerance = 2) {
    // Dividing by 0 yields Infinity/NaN keys that would merge every element
    // into a single line with y = NaN, so those tolerances mean "exact Y".
    const exactY = tolerance == null || tolerance === 0 || Number.isNaN(tolerance);
    const buckets = new Map();
    for (const el of elements) {
        if (el.text === "")
            continue;
        const key = exactY ? el.y : Math.round(el.y / tolerance) * tolerance;
        const bucket = buckets.get(key);
        if (bucket)
            bucket.push(el);
        else
            buckets.set(key, [el]);
    }
    return [...buckets.entries()]
        .sort(([a], [b]) => a - b)
        .map(([y, els]) => {
        // Copy before sorting so the bucket array is not reordered in place.
        const sorted = [...els].sort((a, b) => a.x - b.x);
        return {
            y,
            text: sorted.map((e) => e.text).join("").trim(),
            elements: sorted,
        };
    })
        .filter((line) => line.text !== "");
}
|
|
41
|
+
// ─── Words ────────────────────────────────────────────────────────────────────
|
|
42
|
+
/**
 * Build a TextWord from a non-empty group of elements (already sorted by X).
 *
 * Position and font/color attributes are taken from the first element. The
 * width runs from the first element's X to the right edge of the last
 * element, and the height is the tallest element's height.
 *
 * Elements without `width` contribute zero width and elements without
 * `height` use their `fontSize`, matching how `getBoundingBox` in this
 * package treats those fields; without the fallbacks the result would be NaN.
 *
 * @param els - non-empty list of elements, sorted left-to-right
 * @returns the assembled word, with `els` as its `elements`
 */
function buildWord(els) {
    const first = els[0];
    const last = els[els.length - 1];
    return {
        text: els.map((e) => e.text).join(""),
        x: first.x,
        y: first.y,
        width: last.x + (last.width ?? 0) - first.x,
        height: els.reduce((tallest, e) => Math.max(tallest, e.height ?? e.fontSize), -Infinity),
        fontSize: first.fontSize,
        fontRealName: first.fontRealName,
        fontFamily: first.fontFamily,
        fontStyle: first.fontStyle,
        fontWeight: first.fontWeight,
        color: first.color,
        elements: els,
    };
}
|
|
63
|
+
/**
 * Split a run of text elements into words.
 *
 * Empty-string elements are dropped and the rest are sorted by X. A word ends
 * when either
 *   - a whitespace-only element is met that is not decorative letter-spacing, or
 *   - the horizontal gap to the previous element in the current group exceeds
 *     `fontSize × gapFactor` (using the previous element's fontSize).
 *
 * A whitespace-only element counts as letter-spacing when the element before
 * it is a single character and the element after it trims to a single
 * character; it is then kept in the group so the next gap is measured from
 * the space, not from the preceding letter.
 *
 * Whitespace-only elements never appear in the emitted words: they are
 * filtered out of each group before the word is built.
 *
 * @param elements - TextElements belonging to one visual line
 * @param gapFactor - gap threshold as a fraction of fontSize (default 0.4)
 * @returns the words, left to right
 */
function groupIntoWords(elements, gapFactor = 0.4) {
    const ordered = [...elements]
        .filter((el) => el.text !== "")
        .sort((a, b) => a.x - b.x);
    const isBlank = (el) => el.text.trim() === "";
    const words = [];
    let pending = [];
    // Emit the printable part of the pending group (if any) and start over
    // with `nextGroup`.
    const flush = (nextGroup) => {
        const printable = pending.filter((el) => !isBlank(el));
        if (printable.length > 0) {
            words.push(buildWord(printable));
        }
        pending = nextGroup;
    };
    ordered.forEach((curr, i) => {
        const blank = isBlank(curr);
        if (pending.length === 0) {
            // A group never starts with whitespace.
            if (!blank) {
                pending.push(curr);
            }
            return;
        }
        // Last element of the group; may itself be a kept letter-spacing space.
        const prev = pending[pending.length - 1];
        if (blank) {
            const next = ordered[i + 1];
            const letterSpaced = prev.text.length === 1 && next != null && next.text.trim().length === 1;
            if (letterSpaced) {
                pending.push(curr);
            }
            else {
                flush([]);
            }
            return;
        }
        const gap = curr.x - (prev.x + prev.width);
        if (gap > prev.fontSize * gapFactor) {
            flush([curr]);
        }
        else {
            pending.push(curr);
        }
    });
    flush([]);
    return words;
}
|
|
136
|
+
/**
 * One-call word extraction from a flat list of TextElements.
 *
 * Groups the elements into lines, splits every line into words, and returns
 * all words as one flat list: lines in the order `groupIntoLines` yields
 * them, words left-to-right within each line.
 *
 * @param elements - flat list of TextElements from a page
 * @param lineTolerance - Y tolerance for line grouping (default 2)
 * @param gapFactor - word gap threshold as a fraction of fontSize (default 0.4)
 * @returns every word on the page
 */
function extractWords(elements, lineTolerance = 2, gapFactor = 0.4) {
    const collected = [];
    for (const line of groupIntoLines(elements, lineTolerance)) {
        for (const word of groupIntoWords(line.elements, gapFactor)) {
            collected.push(word);
        }
    }
    return collected;
}
|
|
148
|
+
/**
 * Structured extraction: every line together with its words.
 *
 * Each returned entry carries all fields of the line produced by
 * `groupIntoLines` plus a `words` array produced by `groupIntoWords` from
 * that line's elements.
 *
 * @param elements - flat list of TextElements from a page
 * @param lineTolerance - Y tolerance for line grouping (default 2)
 * @param gapFactor - word gap threshold as fraction of fontSize (default 0.4)
 * @returns the lines, each extended with `words`
 */
function extractTextStructure(elements, lineTolerance = 2, gapFactor = 0.4) {
    const lines = groupIntoLines(elements, lineTolerance);
    return lines.map((line) => {
        const words = groupIntoWords(line.elements, gapFactor);
        return { ...line, words };
    });
}
|
|
164
|
+
// ─── Legacy helper ────────────────────────────────────────────────────────────
|
|
165
|
+
/**
 * Concatenate text elements in a line into a single string (legacy helper).
 *
 * Elements are ordered left-to-right by X, their texts joined without a
 * separator, and the result trimmed. The input array is left untouched.
 *
 * @param line - text elements belonging to one line, in any order
 * @returns the joined, trimmed text
 */
function lineToString(line) {
    // Sort a copy: Array.prototype.sort works in place and would otherwise
    // reorder the caller's array as a side effect.
    return [...line]
        .sort((a, b) => a.x - b.x)
        .map((el) => el.text)
        .join("")
        .trim();
}
|
|
175
|
+
//# sourceMappingURL=textParser.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"textParser.js","sourceRoot":"","sources":["../../src/parser/textParser.ts"],"names":[],"mappings":";;AAeA,wCAqBC;AA0CD,wCA4DC;AAWD,oCAQC;AAYD,oDASC;AAOD,oCAMC;AA7LD,iFAAiF;AAEjF;;;;;;;;;;GAUG;AACH,SAAgB,cAAc,CAAC,QAAuB,EAAE,SAAS,GAAG,CAAC;IACnE,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC,CAAC;IAExD,MAAM,OAAO,GAAG,IAAI,GAAG,EAAyB,CAAC;IACjD,KAAK,MAAM,EAAE,IAAI,OAAO,EAAE,CAAC;QACzB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,GAAG,SAAS,CAAC;QACrD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC7B,CAAC;IAED,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,EAAE;QAChB,MAAM,MAAM,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,OAAO;YACL,CAAC;YACD,IAAI,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE;YAC/C,QAAQ,EAAE,MAAM;SACjB,CAAC;IACJ,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,KAAK,EAAE,CAAC,CAAC;AACxC,CAAC;AAED,iFAAiF;AAEjF;;GAEG;AACH,SAAS,SAAS,CAAC,GAAkB;IACnC,MAAM,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;IACrB,MAAM,IAAI,GAAI,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAClC,OAAO;QACL,IAAI,EAAU,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;QAC7C,CAAC,EAAa,KAAK,CAAC,CAAC;QACrB,CAAC,EAAa,KAAK,CAAC,CAAC;QACrB,KAAK,EAAS,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC;QAC3C,MAAM,EAAQ,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACnD,QAAQ,EAAM,KAAK,CAAC,QAAQ;QAC5B,YAAY,EAAE,KAAK,CAAC,YAAY;QAChC,UAAU,EAAI,KAAK,CAAC,UAAU;QAC9B,SAAS,EAAK,KAAK,CAAC,SAAS;QAC7B,UAAU,EAAI,KAAK,CAAC,UAAU;QAC9B,KAAK,EAAS,KA
AK,CAAC,KAAK;QACzB,QAAQ,EAAM,GAAG;KAClB,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;;;GAeG;AACH,SAAgB,cAAc,CAAC,QAAuB,EAAE,SAAS,GAAG,GAAG;IACrE,MAAM,GAAG,GAAG,CAAC,GAAG,QAAQ,CAAC;SACtB,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;SAC9B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7B,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEhC,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,IAAI,KAAK,GAAkB,EAAE,CAAC;IAE9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,IAAI,GAAM,GAAG,CAAC,CAAC,CAAC,CAAC;QACvB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC;QAExC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,IAAI,CAAC,OAAO;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC/B,SAAS;QACX,CAAC;QAED,IAAI,OAAO,EAAE,CAAC;YACZ,2DAA2D;YAC3D,EAAE;YACF,iEAAiE;YACjE,sEAAsE;YACtE,0DAA0D;YAC1D,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACrC,MAAM,IAAI,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACxB,MAAM,eAAe,GACnB,IAAI,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,CAAC;YAE1E,IAAI,eAAe,EAAE,CAAC;gBACpB,2DAA2D;gBAC3D,0DAA0D;gBAC1D,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACnB,CAAC;iBAAM,CAAC;gBACN,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;gBAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;gBACvD,KAAK,GAAG,EAAE,CAAC;YACb,CAAC;YACD,SAAS;QACX,CAAC;QAED,yEAAyE;QACzE,MAAM,IAAI,GAAO,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACzC,MAAM,GAAG,GAAQ,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;QAChD,MAAM,QAAQ,GAAG,GAAG,GAAG,IAAI,CAAC,QAAQ,GAAG,SAAS,CAAC;QAEjD,IAAI,QAAQ,EAAE,CAAC;YACb,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;YAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;gBAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;YACvD,KAAK,GAAG,CAAC,IAAI,CA
AC,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;IAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;;;GAQG;AACH,SAAgB,YAAY,CAC1B,QAAuB,EACvB,aAAa,GAAG,CAAC,EACjB,SAAS,GAAG,GAAG;IAEf,OAAO,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE,CAC9D,cAAc,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,CACzC,CAAC;AACJ,CAAC;AAED;;;;;;;;;GASG;AACH,SAAgB,oBAAoB,CAClC,QAAuB,EACvB,aAAa,GAAG,CAAC,EACjB,SAAS,GAAG,GAAG;IAEf,OAAO,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC5D,GAAG,IAAI;QACP,KAAK,EAAE,cAAc,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC;KAChD,CAAC,CAAC,CAAC;AACN,CAAC;AAED,iFAAiF;AAEjF;;GAEG;AACH,SAAgB,YAAY,CAAC,IAAmB;IAC9C,OAAO,IAAI;SACR,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,CAAC;SACpB,IAAI,CAAC,EAAE,CAAC;SACR,IAAI,EAAE,CAAC;AACZ,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fonts.d.ts","sourceRoot":"","sources":["../../src/pdf/fonts.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAoB,MAAM,SAAS,CAAC;AACxD,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAoBpC,wBAAsB,YAAY,CAAC,MAAM,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC,CAsGzF"}
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.extractFonts = extractFonts;
|
|
4
|
+
const pdf_lib_1 = require("pdf-lib");
|
|
5
|
+
/**
 * Turn a PDF value into a plain string, or `null` when that is not possible.
 *
 * The value's `toString()` output has one leading "/" removed, then a
 * leading "(" and a trailing ")" removed, and is finally trimmed.
 * Falsy values, and values whose conversion throws, yield `null`.
 */
function pdfValToString(val) {
    if (!val) {
        return null;
    }
    try {
        const raw = val.toString();
        const withoutSlash = raw.replace(/^\//, "");
        const withoutParens = withoutSlash.replace(/^\(|\)$/g, "");
        return withoutParens.trim();
    }
    catch {
        // Best effort: anything that cannot be stringified is reported as absent.
        return null;
    }
}
|
|
19
|
+
/**
 * Remove a leading "/" and a subset tag (six uppercase letters followed by
 * "+") from a font name. Falsy names yield `null`.
 */
function stripSubsetPrefix(name) {
    return name
        ? name.replace(/^\//, "").replace(/^[A-Z]{6}\+/, "")
        : null;
}
|
|
24
|
+
/**
 * Build a map of the fonts declared in each page's `/Resources` -> `/Font`
 * dictionary.
 *
 * Entries are keyed by the font's resource name with the leading "/" removed.
 * The first page that declares a key wins; later pages reusing the same key
 * are skipped. Lookups that throw are treated as "not available" so that one
 * malformed page or font does not abort the whole scan.
 *
 * @param pdfDoc - loaded pdf-lib document (uses `getPages()` and `context`)
 * @returns promise resolving to a plain object mapping resource key to
 *   `{ key, realName, baseFontRaw, isSubset, subtype, encoding, fontFamily,
 *   fontStyle, fontWeight, italicAngle }`
 */
async function extractFonts(pdfDoc) {
    const fontMap = {};
    const hasOwn = (obj, prop) => Object.prototype.hasOwnProperty.call(obj, prop);
    /**
     * Read a numeric PDF value, or null when absent / not numeric.
     * pdf-lib's PDFNumber exposes its value through `asNumber()`; the
     * `numberValue()` call is kept only as a fallback for objects that
     * happen to provide it as a method.
     */
    const readNumber = (val) => {
        if (!val)
            return null;
        try {
            // lookup() resolves indirect references and returns direct objects as-is.
            const obj = pdfDoc.context.lookup(val);
            let num = null;
            if (obj && typeof obj.asNumber === "function") {
                num = obj.asNumber();
            }
            else if (obj && typeof obj.numberValue === "function") {
                num = obj.numberValue();
            }
            return typeof num === "number" && Number.isFinite(num) ? num : null;
        }
        catch {
            return null;
        }
    };
    for (const page of pdfDoc.getPages()) {
        let resourceDict;
        try {
            const res = page.node.get(pdf_lib_1.PDFName.of("Resources"));
            resourceDict = pdfDoc.context.lookupMaybe(res, pdf_lib_1.PDFDict);
        }
        catch {
            continue;
        }
        if (!resourceDict)
            continue;
        let fontDict;
        try {
            fontDict = pdfDoc.context.lookupMaybe(resourceDict.get(pdf_lib_1.PDFName.of("Font")), pdf_lib_1.PDFDict);
        }
        catch {
            continue;
        }
        if (!fontDict)
            continue;
        for (const [nameKey, fontRef] of fontDict.entries()) {
            const key = pdfValToString(nameKey) ?? nameKey.toString().replace(/^\//, "");
            // Own-property check: a truthiness test on a plain object would also
            // match inherited names such as "constructor" or "toString".
            if (hasOwn(fontMap, key))
                continue;
            let fontObj;
            try {
                fontObj = pdfDoc.context.lookupMaybe(fontRef, pdf_lib_1.PDFDict);
            }
            catch {
                continue;
            }
            if (!fontObj) {
                // The key exists but resolves to nothing: record a placeholder entry.
                fontMap[key] = {
                    key, realName: null, baseFontRaw: null, isSubset: false,
                    subtype: null, encoding: null, fontFamily: null, fontStyle: null,
                    fontWeight: null, italicAngle: null,
                };
                continue;
            }
            const baseFontRaw = pdfValToString(fontObj.get(pdf_lib_1.PDFName.of("BaseFont")));
            const subtype = pdfValToString(fontObj.get(pdf_lib_1.PDFName.of("Subtype")));
            const encoding = pdfValToString(fontObj.get(pdf_lib_1.PDFName.of("Encoding")));
            // FontDescriptor: first try direct, then through DescendantFonts (Type0)
            let descriptor;
            try {
                descriptor = pdfDoc.context.lookupMaybe(fontObj.get(pdf_lib_1.PDFName.of("FontDescriptor")), pdf_lib_1.PDFDict);
            }
            catch { /* ignore */ }
            if (!descriptor && subtype === "Type0") {
                try {
                    const descArr = pdfDoc.context.lookup(fontObj.get(pdf_lib_1.PDFName.of("DescendantFonts")));
                    const cidRef = descArr?.array?.[0]
                        ?? descArr?.get?.(0);
                    const cidFont = pdfDoc.context.lookupMaybe(cidRef, pdf_lib_1.PDFDict);
                    descriptor = pdfDoc.context.lookupMaybe(cidFont?.get(pdf_lib_1.PDFName.of("FontDescriptor")), pdf_lib_1.PDFDict);
                }
                catch { /* ignore */ }
            }
            let fontFamily = null;
            let fontWeight = null;
            let italicAngle = null;
            if (descriptor) {
                fontFamily = pdfValToString(descriptor.get(pdf_lib_1.PDFName.of("FontFamily")));
                fontWeight = readNumber(descriptor.get(pdf_lib_1.PDFName.of("FontWeight")));
                italicAngle = readNumber(descriptor.get(pdf_lib_1.PDFName.of("ItalicAngle")));
            }
            const realName = stripSubsetPrefix(baseFontRaw);
            const isSubset = /^[A-Z]{6}\+/.test((baseFontRaw ?? "").replace(/^\//, ""));
            // Names of the form "Family-Style" are split at the first dash.
            const dashIdx = realName?.indexOf("-") ?? -1;
            if (!fontFamily && realName) {
                fontFamily = dashIdx > -1 ? realName.slice(0, dashIdx) : realName;
            }
            const fontStyle = dashIdx > -1 ? realName.slice(dashIdx + 1) : null;
            fontMap[key] = {
                key, realName, baseFontRaw, isSubset, subtype, encoding,
                fontFamily, fontStyle, fontWeight, italicAngle,
            };
        }
    }
    return fontMap;
}
|
|
113
|
+
//# sourceMappingURL=fonts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"fonts.js","sourceRoot":"","sources":["../../src/pdf/fonts.ts"],"names":[],"mappings":";;AAqBA,oCAsGC;AA3HD,qCAAwD;AAGxD,SAAS,cAAc,CAAC,GAAY;IAClC,IAAI,CAAC,GAAG;QAAE,OAAO,IAAI,CAAC;IACtB,IAAI,CAAC;QACH,OAAQ,GAA8B;aACnC,QAAQ,EAAE;aACV,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;aAClB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;aACvB,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAmB;IAC5C,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC;IACvB,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC;AAC5D,CAAC;AAEM,KAAK,UAAU,YAAY,CAAC,MAAmB;IACpD,MAAM,OAAO,GAA6B,EAAE,CAAC;IAE7C,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,QAAQ,EAAE,EAAE,CAAC;QACrC,IAAI,YAAiC,CAAC;QAEtC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC;YACnD,YAAY,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,GAAG,EAAE,iBAAO,CAAC,CAAC;QAC1D,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QAED,IAAI,CAAC,YAAY;YAAE,SAAS;QAE5B,IAAI,QAA6B,CAAC;QAElC,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACnC,YAAY,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,EACpC,iBAAO,CACR,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QAED,IAAI,CAAC,QAAQ;YAAE,SAAS;QAExB,KAAK,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,IAAI,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC;YACpD,MAAM,GAAG,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAE7E,IAAI,OAAO,CAAC,GAAG,CAAC;gBAAE,SAAS;YAE3B,IAAI,OAA4B,CAAC;YACjC,IAAI,CAAC;gBACH,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,OAAO,EAAE,iBAAO,CAAC,CAAC;YACzD,CAAC;YAAC,MAAM,CAAC;gBACP,SAAS;YACX,CAAC;YAED,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,OAAO,CAAC,GAAG,CAAC,GAAG;oBACb,GAAG,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK;oBACvD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI;oBAChE,UAAU,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI;iBACpC,CAAC;gBACF,SAAS;YACX,CAAC;YAED,MAAM,WAAW,GAAK,cAAc,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAC1E,MAAM,OAAO,GAAS,cA
Ac,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;YACzE,MAAM,QAAQ,GAAQ,cAAc,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAE1E,yEAAyE;YACzE,IAAI,UAA+B,CAAC;YACpC,IAAI,CAAC;gBACH,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACrC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,gBAAgB,CAAC,CAAC,EACzC,iBAAO,CACR,CAAC;YACJ,CAAC;YAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;YAExB,IAAI,CAAC,UAAU,IAAI,OAAO,KAAK,OAAO,EAAE,CAAC;gBACvC,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC;oBAClF,MAAM,MAAM,GAAK,OAA2D,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC;2BAClF,OAAwC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;oBACzD,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,MAA0D,EAAE,iBAAO,CAAC,CAAC;oBAChH,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACrC,OAAO,EAAE,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,gBAAgB,CAAC,CAAC,EAC1C,iBAAO,CACR,CAAC;gBACJ,CAAC;gBAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;YAC1B,CAAC;YAED,IAAI,UAAU,GAAmB,IAAI,CAAC;YACtC,IAAI,UAAU,GAAmB,IAAI,CAAC;YACtC,IAAI,WAAW,GAAkB,IAAI,CAAC;YAEtC,IAAI,UAAU,EAAE,CAAC;gBACf,UAAU,GAAI,cAAc,CAAC,UAAU,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;gBACvE,IAAI,CAAC;oBAAC,UAAU,GAAK,UAAU,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,YAAY,CAAC,CAA4C,EAAE,WAAW,EAAE,EAAE,IAAI,IAAI,CAAC;gBAAC,CAAC;gBAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;gBAC3J,IAAI,CAAC;oBAAC,WAAW,GAAI,UAAU,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,aAAa,CAAC,CAA4C,EAAE,WAAW,EAAE,EAAE,IAAI,IAAI,CAAC;gBAAC,CAAC;gBAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;YAC9J,CAAC;YAED,MAAM,QAAQ,GAAI,iBAAiB,CAAC,WAAW,CAAC,CAAC;YACjD,MAAM,QAAQ,GAAI,aAAa,CAAC,IAAI,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,CAAC;YAC7E,MAAM,OAAO,GAAK,QAAQ,EAAE,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC;YAE/C,IAAI,CAAC,UAAU,IAAI,QAAQ,EAAE,CAAC;gBAC5B,UAAU,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC;YACpE,CAAC;YACD,MAAM,SAAS,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAS,CAAC,KAAK,CAAC,OAAO,GA
AG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YAErE,OAAO,CAAC,GAAG,CAAC,GAAG;gBACb,GAAG,EAAE,QAAQ,EAAE,WAAW,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ;gBACvD,UAAU,EAAE,SAAS,EAAE,UAAU,EAAE,WAAW;aAC/C,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../src/pdf/loader.ts"],"names":[],"mappings":"AAGA,wBAAsB,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAcvE"}
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.loadInput = loadInput;
|
|
4
|
+
const fs = require("fs");
|
|
5
|
+
const fetchBuffer_1 = require("../fetch/fetchBuffer");
|
|
6
|
+
/**
 * Resolve a PDF input to its raw bytes.
 *
 * - A Buffer is returned unchanged.
 * - A string starting with "http://" or "https://" is passed to `fetchBuffer`.
 * - Any other string is treated as a file path and read from disk.
 *
 * The file is read asynchronously in a single step; a missing path is
 * detected from the read error rather than a separate existence check, which
 * avoids blocking the event loop and the check-then-read race.
 *
 * @param input - Buffer, URL string, or file path string
 * @returns promise resolving to the bytes
 * @throws {TypeError} when `input` is neither a Buffer nor a string
 * @throws {Error} `File not found: "<path>"` when the path does not exist
 */
async function loadInput(input) {
    if (Buffer.isBuffer(input)) {
        return input;
    }
    if (typeof input !== "string") {
        throw new TypeError("loadInput expects a Buffer, a URL string, or a file path string");
    }
    if (input.startsWith("http://") || input.startsWith("https://")) {
        return (0, fetchBuffer_1.fetchBuffer)(input);
    }
    try {
        return await fs.promises.readFile(input);
    }
    catch (err) {
        // Keep the original, friendlier message for paths that do not exist.
        if (err && (err.code === "ENOENT" || err.code === "ENOTDIR")) {
            throw new Error(`File not found: "${input}"`);
        }
        throw err;
    }
}
|
|
18
|
+
//# sourceMappingURL=loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/pdf/loader.ts"],"names":[],"mappings":";;AAGA,8BAcC;AAjBD,yBAA0B;AAC1B,sDAAmD;AAE5C,KAAK,UAAU,SAAS,CAAC,KAAsB;IACpD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,KAAK,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAChE,OAAO,IAAA,yBAAW,EAAC,KAAK,CAAC,CAAC;IAC5B,CAAC;IAED,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,oBAAoB,KAAK,GAAG,CAAC,CAAC;IAChD,CAAC;IAED,OAAO,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;AAChC,CAAC"}
|