pdf-metadata-extractor 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +91 -0
  2. package/LICENSE +21 -0
  3. package/README.md +427 -0
  4. package/dist/core/extractor.d.ts +3 -0
  5. package/dist/core/extractor.d.ts.map +1 -0
  6. package/dist/core/extractor.js +87 -0
  7. package/dist/core/extractor.js.map +1 -0
  8. package/dist/core/pageProcessor.d.ts +30 -0
  9. package/dist/core/pageProcessor.d.ts.map +1 -0
  10. package/dist/core/pageProcessor.js +480 -0
  11. package/dist/core/pageProcessor.js.map +1 -0
  12. package/dist/core/sourceDetector.d.ts +4 -0
  13. package/dist/core/sourceDetector.d.ts.map +1 -0
  14. package/dist/core/sourceDetector.js +33 -0
  15. package/dist/core/sourceDetector.js.map +1 -0
  16. package/dist/fetch/fetchBuffer.d.ts +2 -0
  17. package/dist/fetch/fetchBuffer.d.ts.map +1 -0
  18. package/dist/fetch/fetchBuffer.js +12 -0
  19. package/dist/fetch/fetchBuffer.js.map +1 -0
  20. package/dist/index.d.ts +8 -0
  21. package/dist/index.d.ts.map +1 -0
  22. package/dist/index.js +42 -0
  23. package/dist/index.js.map +1 -0
  24. package/dist/parser/streamParser.d.ts +34 -0
  25. package/dist/parser/streamParser.d.ts.map +1 -0
  26. package/dist/parser/streamParser.js +191 -0
  27. package/dist/parser/streamParser.js.map +1 -0
  28. package/dist/parser/textParser.d.ts +56 -0
  29. package/dist/parser/textParser.d.ts.map +1 -0
  30. package/dist/parser/textParser.js +175 -0
  31. package/dist/parser/textParser.js.map +1 -0
  32. package/dist/pdf/fonts.d.ts +4 -0
  33. package/dist/pdf/fonts.d.ts.map +1 -0
  34. package/dist/pdf/fonts.js +113 -0
  35. package/dist/pdf/fonts.js.map +1 -0
  36. package/dist/pdf/loader.d.ts +2 -0
  37. package/dist/pdf/loader.d.ts.map +1 -0
  38. package/dist/pdf/loader.js +18 -0
  39. package/dist/pdf/loader.js.map +1 -0
  40. package/dist/pdf/metadata.d.ts +13 -0
  41. package/dist/pdf/metadata.d.ts.map +1 -0
  42. package/dist/pdf/metadata.js +9 -0
  43. package/dist/pdf/metadata.js.map +1 -0
  44. package/dist/pdf/xobjects.d.ts +12 -0
  45. package/dist/pdf/xobjects.d.ts.map +1 -0
  46. package/dist/pdf/xobjects.js +107 -0
  47. package/dist/pdf/xobjects.js.map +1 -0
  48. package/dist/types.d.ts +136 -0
  49. package/dist/types.d.ts.map +1 -0
  50. package/dist/types.js +3 -0
  51. package/dist/types.js.map +1 -0
  52. package/dist/utils/buffer.d.ts +3 -0
  53. package/dist/utils/buffer.d.ts.map +1 -0
  54. package/dist/utils/buffer.js +11 -0
  55. package/dist/utils/buffer.js.map +1 -0
  56. package/dist/utils/color.d.ts +6 -0
  57. package/dist/utils/color.d.ts.map +1 -0
  58. package/dist/utils/color.js +21 -0
  59. package/dist/utils/color.js.map +1 -0
  60. package/dist/utils/matrix.d.ts +11 -0
  61. package/dist/utils/matrix.d.ts.map +1 -0
  62. package/dist/utils/matrix.js +22 -0
  63. package/dist/utils/matrix.js.map +1 -0
  64. package/package.json +61 -0
@@ -0,0 +1,34 @@
1
+ import { PDFDocument } from "pdf-lib";
2
+ import { TextElement } from "../types";
3
+ export interface BoundingBox {
4
+ x: number;
5
+ y: number;
6
+ width: number;
7
+ height: number;
8
+ }
9
+ export declare function getBoundingBox(elements: TextElement[]): BoundingBox | null;
10
+ export declare function filterByRegion(elements: TextElement[], box: BoundingBox): TextElement[];
11
+ /**
12
+ * Decompress and return a page's full content text — including any Form XObject
13
+ * sub-streams that pdfjs would process when extracting text.
14
+ * `pageIndex` is 0-based (pdf-lib convention).
15
+ */
16
+ export declare function getContentStreamText(pdfDoc: PDFDocument, pageIndex: number): string;
17
+ /**
18
+ * Extract the ordered list of unique PDF resource font keys from combined stream text.
19
+ * PDF names can contain hyphens/plus signs (e.g. "f-0-0", "F4+sub").
20
+ * e.g. "/F4 42 Tf … /f-0-0 10 Tf … /F4 8 Tf" → ["F4", "f-0-0"]
21
+ */
22
+ export declare function streamFontOrder(streamText: string): string[];
23
+ /**
24
+ * Build a bridge map: pdfjsKey → pdfResourceKey
25
+ *
26
+ * pdfjs-dist assigns keys like g_d1_f2 in first-appearance order matching the
27
+ * content stream Tf commands. The docId (d0, d1 …) is global across all
28
+ * documents in the same process, so we match by position rather than by name.
29
+ *
30
+ * @param streamText — full content text (main + XObject streams)
31
+ * @param pdfjsOrderedFonts — fontName values in first-appearance order from pdfjs
32
+ */
33
+ export declare function buildFontBridge(streamText: string, pdfjsOrderedFonts: string[]): Record<string, string>;
34
+ //# sourceMappingURL=streamParser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"streamParser.d.ts","sourceRoot":"","sources":["../../src/parser/streamParser.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,WAAW,EAAoB,MAAM,SAAS,CAAC;AACxD,OAAO,EAAE,WAAW,EAAE,MAAM,UAAU,CAAC;AAIvC,MAAM,WAAW,WAAW;IAC1B,CAAC,EAAE,MAAM,CAAC;IACV,CAAC,EAAE,MAAM,CAAC;IACV,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;CAChB;AAED,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,GAAG,WAAW,GAAG,IAAI,CAe1E;AAED,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,EAAE,GAAG,EAAE,WAAW,GAAG,WAAW,EAAE,CAQvF;AAgID;;;;GAIG;AACH,wBAAgB,oBAAoB,CAAC,MAAM,EAAE,WAAW,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM,CAcnF;AAED;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,EAAE,CAS5D;AAED;;;;;;;;;GASG;AACH,wBAAgB,eAAe,CAC7B,UAAU,EAAE,MAAM,EAClB,iBAAiB,EAAE,MAAM,EAAE,GAC1B,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAOxB"}
@@ -0,0 +1,191 @@
1
+ "use strict";
2
+ var __importDefault = (this && this.__importDefault) || function (mod) {
3
+ return (mod && mod.__esModule) ? mod : { "default": mod };
4
+ };
5
+ Object.defineProperty(exports, "__esModule", { value: true });
6
+ exports.getBoundingBox = getBoundingBox;
7
+ exports.filterByRegion = filterByRegion;
8
+ exports.getContentStreamText = getContentStreamText;
9
+ exports.streamFontOrder = streamFontOrder;
10
+ exports.buildFontBridge = buildFontBridge;
11
+ const zlib_1 = __importDefault(require("zlib"));
12
+ const pdf_lib_1 = require("pdf-lib");
13
+ function getBoundingBox(elements) {
14
+ if (elements.length === 0)
15
+ return null;
16
+ let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
17
+ for (const el of elements) {
18
+ const w = el.width ?? 0;
19
+ const h = el.height ?? el.fontSize;
20
+ minX = Math.min(minX, el.x);
21
+ minY = Math.min(minY, el.y);
22
+ maxX = Math.max(maxX, el.x + w);
23
+ maxY = Math.max(maxY, el.y + h);
24
+ }
25
+ return { x: minX, y: minY, width: maxX - minX, height: maxY - minY };
26
+ }
27
+ function filterByRegion(elements, box) {
28
+ return elements.filter((el) => el.x >= box.x &&
29
+ el.y >= box.y &&
30
+ el.x <= box.x + box.width &&
31
+ el.y <= box.y + box.height);
32
+ }
33
+ // ─── Content-stream helpers ──────────────────────────────────────────────────
34
+ /**
35
+ * Read the raw bytes of a pdf-lib object that represents a content stream.
36
+ */
37
+ function streamContents(obj) {
38
+ if (obj && typeof obj.contents !== "undefined") {
39
+ return Buffer.from(obj.contents);
40
+ }
41
+ return null;
42
+ }
43
+ /**
44
+ * Try zlib inflate, inflate-raw, then return raw bytes as latin1.
45
+ */
46
+ function decompress(raw) {
47
+ try {
48
+ return zlib_1.default.inflateSync(raw).toString("latin1");
49
+ }
50
+ catch { /* */ }
51
+ try {
52
+ return zlib_1.default.inflateRawSync(raw).toString("latin1");
53
+ }
54
+ catch { /* */ }
55
+ return raw.toString("latin1");
56
+ }
57
+ /**
58
+ * Resolve and decompress one or more content stream references into a string.
59
+ */
60
+ function resolveStreams(pdfDoc, contentsRef) {
61
+ if (!contentsRef)
62
+ return "";
63
+ const contents = pdfDoc.context.lookup(contentsRef);
64
+ // Single stream
65
+ const single = streamContents(contents);
66
+ if (single)
67
+ return decompress(single);
68
+ // PDFArray of streams
69
+ const arr = contents;
70
+ if (arr && typeof arr.size === "function") {
71
+ const parts = [];
72
+ for (let i = 0; i < arr.size(); i++) {
73
+ const ref = arr.get(i);
74
+ const s = pdfDoc.context.lookup(ref);
75
+ const raw = streamContents(s);
76
+ if (raw)
77
+ parts.push(decompress(raw));
78
+ }
79
+ return parts.join("\n");
80
+ }
81
+ return "";
82
+ }
83
/**
 * Collect the decompressed content of every Form XObject that a stream invokes
 * with a `/Name Do` operator, following nested Form XObjects recursively.
 * This handles PDFs where text lives inside /Form XObjects rather than the page stream.
 *
 * Failures while walking the resource tree are swallowed on purpose: the
 * function is best-effort and returns whatever it managed to collect.
 *
 * @param pdfDoc       — pdf-lib document
 * @param resourcesRef — the Resources dict (or a reference to it) for the current stream scope
 * @param streamText   — already-decompressed text of the current stream
 * @param visited      — set of XObject identity keys already emitted; guards against
 *                       circular references and repeated inclusion
 * @param depth        — remaining recursion budget; nothing is collected once it reaches 0
 * @returns decompressed stream texts, in discovery order
 */
function collectXObjectStreams(pdfDoc, resourcesRef, streamText, visited, depth) {
    if (depth <= 0)
        return [];
    const extra = [];
    try {
        const resDict = pdfDoc.context.lookupMaybe(resourcesRef, pdf_lib_1.PDFDict);
        if (!resDict)
            return extra;
        const xobjDictRef = resDict.get(pdf_lib_1.PDFName.of("XObject"));
        if (!xobjDictRef)
            return extra;
        const xobjDict = pdfDoc.context.lookupMaybe(xobjDictRef, pdf_lib_1.PDFDict);
        if (!xobjDict)
            return extra;
        // Find all /Name Do operators in the stream
        const doNames = new Set([...streamText.matchAll(/\/([^\s/\[\]<>(){}]+)\s+Do/g)].map((m) => m[1]));
        for (const name of doNames) {
            const xobjRef = xobjDict.get(pdf_lib_1.PDFName.of(name));
            if (!xobjRef)
                continue;
            const xobj = pdfDoc.context.lookup(xobjRef);
            // Only process Form XObjects (Subtype = Form)
            const subtype = xobj?.dict?.get(pdf_lib_1.PDFName.of("Subtype"));
            const subtypeStr = subtype?.toString?.()?.replace(/^\//, "");
            if (subtypeStr !== "Form")
                continue;
            const raw = streamContents(xobj);
            if (!raw)
                continue;
            // Identify the XObject by its indirect reference ("<obj> <gen> R").
            // A short prefix of the stream bytes is not a safe identity: distinct
            // Form XObjects frequently start with the same bytes and would be
            // dropped as "already visited". Direct (non-reference) objects fall
            // back to their complete byte content.
            const key = xobjRef instanceof pdf_lib_1.PDFRef
                ? `ref:${xobjRef.toString()}`
                : `raw:${raw.toString("latin1")}`;
            if (visited.has(key))
                continue;
            visited.add(key);
            const text = decompress(raw);
            extra.push(text);
            // Recurse: XObjects can reference their own sub-XObjects
            const innerResRef = xobj?.dict?.get(pdf_lib_1.PDFName.of("Resources"));
            const nested = collectXObjectStreams(pdfDoc, innerResRef, text, visited, depth - 1);
            extra.push(...nested);
        }
    }
    catch { /* best-effort: ignore malformed resource trees */ }
    return extra;
}
136
+ /**
137
+ * Decompress and return a page's full content text — including any Form XObject
138
+ * sub-streams that pdfjs would process when extracting text.
139
+ * `pageIndex` is 0-based (pdf-lib convention).
140
+ */
141
+ function getContentStreamText(pdfDoc, pageIndex) {
142
+ try {
143
+ const page = pdfDoc.getPage(pageIndex);
144
+ const contentsRef = page.node.get(pdf_lib_1.PDFName.of("Contents"));
145
+ const mainText = resolveStreams(pdfDoc, contentsRef);
146
+ const resourcesRef = page.node.get(pdf_lib_1.PDFName.of("Resources"));
147
+ const visited = new Set();
148
+ const xobjTexts = collectXObjectStreams(pdfDoc, resourcesRef, mainText, visited, 4);
149
+ return [mainText, ...xobjTexts].join("\n");
150
+ }
151
+ catch { /* ignore */ }
152
+ return "";
153
+ }
154
/**
 * Extract the ordered list of unique PDF resource font keys from combined stream text.
 *
 * Every `/<name> <size> Tf` operator is matched; each name is reported once,
 * at the position of its first appearance.
 * PDF names can contain hyphens/plus signs (e.g. "f-0-0", "F4+sub").
 * The size operand may carry a sign (e.g. "/F1 -12 Tf"), so signed sizes are
 * accepted as well.
 *
 * e.g. "/F4 42 Tf … /f-0-0 10 Tf … /F4 8 Tf" → ["F4", "f-0-0"]
 *
 * @param streamText — decompressed content-stream text
 * @returns font resource keys (without the leading "/") in first-appearance order
 */
function streamFontOrder(streamText) {
    // /FontName size Tf — FontName is any PDF name (no whitespace or delimiters),
    // size is an optionally signed run of digits and dots.
    const tfOperator = /\/([^\s/\[\]<>(){}]+)\s+[-+]?[\d.]+\s+Tf/g;
    // A Set keeps insertion order, which gives first-appearance de-duplication.
    const fontKeys = new Set();
    for (const match of streamText.matchAll(tfOperator)) {
        fontKeys.add(match[1]);
    }
    return [...fontKeys];
}
172
+ /**
173
+ * Build a bridge map: pdfjsKey → pdfResourceKey
174
+ *
175
+ * pdfjs-dist assigns keys like g_d1_f2 in first-appearance order matching the
176
+ * content stream Tf commands. The docId (d0, d1 …) is global across all
177
+ * documents in the same process, so we match by position rather than by name.
178
+ *
179
+ * @param streamText — full content text (main + XObject streams)
180
+ * @param pdfjsOrderedFonts — fontName values in first-appearance order from pdfjs
181
+ */
182
+ function buildFontBridge(streamText, pdfjsOrderedFonts) {
183
+ const streamOrder = streamFontOrder(streamText);
184
+ const bridge = {};
185
+ pdfjsOrderedFonts.forEach((pdfjsKey, i) => {
186
+ if (streamOrder[i])
187
+ bridge[pdfjsKey] = streamOrder[i];
188
+ });
189
+ return bridge;
190
+ }
191
+ //# sourceMappingURL=streamParser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"streamParser.js","sourceRoot":"","sources":["../../src/parser/streamParser.ts"],"names":[],"mappings":";;;;;AAaA,wCAeC;AAED,wCAQC;AAqID,oDAcC;AAOD,0CASC;AAYD,0CAUC;AA/ND,gDAAwB;AACxB,qCAAwD;AAYxD,SAAgB,cAAc,CAAC,QAAuB;IACpD,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,IAAI,CAAC;IAEvC,IAAI,IAAI,GAAG,QAAQ,EAAE,IAAI,GAAG,QAAQ,EAAE,IAAI,GAAG,CAAC,QAAQ,EAAE,IAAI,GAAG,CAAC,QAAQ,CAAC;IAEzE,KAAK,MAAM,EAAE,IAAI,QAAQ,EAAE,CAAC;QAC1B,MAAM,CAAC,GAAG,EAAE,CAAC,KAAK,IAAI,CAAC,CAAC;QACxB,MAAM,CAAC,GAAG,EAAE,CAAC,MAAM,IAAI,EAAE,CAAC,QAAQ,CAAC;QACnC,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;QAC5B,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QAChC,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAClC,CAAC;IAED,OAAO,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,GAAG,IAAI,EAAE,MAAM,EAAE,IAAI,GAAG,IAAI,EAAE,CAAC;AACvE,CAAC;AAED,SAAgB,cAAc,CAAC,QAAuB,EAAE,GAAgB;IACtE,OAAO,QAAQ,CAAC,MAAM,CACpB,CAAC,EAAE,EAAE,EAAE,CACL,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACb,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC;QACb,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,KAAK;QACzB,EAAE,CAAC,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,GAAG,CAAC,MAAM,CAC7B,CAAC;AACJ,CAAC;AAED,gFAAgF;AAEhF;;GAEG;AACH,SAAS,cAAc,CAAC,GAAY;IAClC,IAAI,GAAG,IAAI,OAAQ,GAA8B,CAAC,QAAQ,KAAK,WAAW,EAAE,CAAC;QAC3E,OAAO,MAAM,CAAC,IAAI,CAAE,GAAgC,CAAC,QAAQ,CAAC,CAAC;IACjE,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED;;GAEG;AACH,SAAS,UAAU,CAAC,GAAW;IAC7B,IAAI,CAAC;QAAC,OAAO,cAAI,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC;IACxE,IAAI,CAAC;QAAC,OAAO,cAAI,CAAC,cAAc,CAAC,GAAG,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAAC,CAAC;IAAC,MAAM,CAAC,CAAC,KAAK,CAAC,CAAC;IAC3E,OAAO,GAAG,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CACrB,MAAmB,EACnB,WAAoB;IAEpB,IAAI,CAAC,WAAW;QAAE,OAAO,EAAE,CAAC;IAC5B,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,
CAAC,MAAM,CACpC,WAA0D,CAC3D,CAAC;IAEF,gBAAgB;IAChB,MAAM,MAAM,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;IACxC,IAAI,MAAM;QAAE,OAAO,UAAU,CAAC,MAAM,CAAC,CAAC;IAEtC,sBAAsB;IACtB,MAAM,GAAG,GAAG,QAAqE,CAAC;IAClF,IAAI,GAAG,IAAI,OAAO,GAAG,CAAC,IAAI,KAAK,UAAU,EAAE,CAAC;QAC1C,MAAM,KAAK,GAAa,EAAE,CAAC;QAC3B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,IAAI,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,MAAM,GAAG,GAAG,GAAG,CAAC,GAAI,CAAC,CAAC,CAAC,CAAC;YACxB,MAAM,CAAC,GAAK,MAAM,CAAC,OAAO,CAAC,MAAM,CAC/B,GAAkD,CACnD,CAAC;YACF,MAAM,GAAG,GAAG,cAAc,CAAC,CAAC,CAAC,CAAC;YAC9B,IAAI,GAAG;gBAAE,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;QACvC,CAAC;QACD,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IAED,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;;;;;GAQG;AACH,SAAS,qBAAqB,CAC5B,MAAmB,EACnB,YAAqB,EACrB,UAAkB,EAClB,OAAoB,EACpB,KAAa;IAEb,IAAI,KAAK,IAAI,CAAC;QAAE,OAAO,EAAE,CAAC;IAE1B,MAAM,KAAK,GAAa,EAAE,CAAC;IAE3B,IAAI,CAAC;QACH,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACxC,YAAgE,EAChE,iBAAO,CACR,CAAC;QACF,IAAI,CAAC,OAAO;YAAE,OAAO,KAAK,CAAC;QAE3B,MAAM,WAAW,GAAG,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC;QACvD,IAAI,CAAC,WAAW;YAAE,OAAO,KAAK,CAAC;QAE/B,MAAM,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,WAAW,EAAE,iBAAO,CAAC,CAAC;QAClE,IAAI,CAAC,QAAQ;YAAE,OAAO,KAAK,CAAC;QAE5B,4CAA4C;QAC5C,MAAM,OAAO,GAAG,IAAI,GAAG,CACrB,CAAC,GAAG,UAAU,CAAC,QAAQ,CAAC,6BAA6B,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CACzE,CAAC;QAEF,KAAK,MAAM,IAAI,IAAI,OAAO,EAAE,CAAC;YAC3B,MAAM,OAAO,GAAG,QAAQ,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,IAAI,CAAC,CAAC,CAAC;YAC/C,IAAI,CAAC,OAAO;gBAAE,SAAS;YAEvB,MAAM,IAAI,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAChC,OAAsD,CACvD,CAAC;YAEF,8CAA8C;YAC9C,MAAM,OAAO,GAAI,IAA2B,EAAE,IAAI,EAAE,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC;YAC/E,MAAM,UAAU,GAAG,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAC7D,IAAI,UAAU,KAAK,MAAM;gBAAE,SAAS;YAEpC,MAAM,GAAG,GAAG,cAAc,CAAC,IAAI,CAAC,CAAC;YACjC,IAAI,CAAC,GAAG;gBAAE,SAAS;YAEnB,MAAM,GAAG,GAAG,GAAG,CAAC,QAAQ,CAAC,KAA
K,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YAC7C,IAAI,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;gBAAE,SAAS;YAC/B,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAEjB,MAAM,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;YAC7B,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAEjB,yDAAyD;YACzD,MAAM,WAAW,GAAI,IAA2B,EAAE,IAAI,EAAE,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC;YACrF,MAAM,MAAM,GAAG,qBAAqB,CAAC,MAAM,EAAE,WAAW,EAAE,IAAI,EAAE,OAAO,EAAE,KAAK,GAAG,CAAC,CAAC,CAAC;YACpF,KAAK,CAAC,IAAI,CAAC,GAAG,MAAM,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAExB,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,SAAgB,oBAAoB,CAAC,MAAmB,EAAE,SAAiB;IACzE,IAAI,CAAC;QACH,MAAM,IAAI,GAAU,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,CAAC;QAC9C,MAAM,WAAW,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC;QAC1D,MAAM,QAAQ,GAAM,cAAc,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;QAExD,MAAM,YAAY,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC;QAC5D,MAAM,OAAO,GAAQ,IAAI,GAAG,EAAU,CAAC;QACvC,MAAM,SAAS,GAAM,qBAAqB,CAAC,MAAM,EAAE,YAAY,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC,CAAC;QAEvF,OAAO,CAAC,QAAQ,EAAE,GAAG,SAAS,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7C,CAAC;IAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;IAExB,OAAO,EAAE,CAAC;AACZ,CAAC;AAED;;;;GAIG;AACH,SAAgB,eAAe,CAAC,UAAkB;IAChD,MAAM,IAAI,GAAM,IAAI,GAAG,EAAU,CAAC;IAClC,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,mFAAmF;IACnF,KAAK,MAAM,CAAC,IAAI,UAAU,CAAC,QAAQ,CAAC,sCAAsC,CAAC,EAAE,CAAC;QAC5E,MAAM,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACjB,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC;YAAC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;YAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAAC,CAAC;IAC3D,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED;;;;;;;;;GASG;AACH,SAAgB,eAAe,CAC7B,UAAkB,EAClB,iBAA2B;IAE3B,MAAM,WAAW,GAAG,eAAe,CAAC,UAAU,CAAC,CAAC;IAChD,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,iBAAiB,CAAC,OAAO,CAAC,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE;QACxC,IAAI,WAAW,CAAC,CAAC,CAAC;YAAE,MAAM,CAAC,QAAQ,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC;IACxD,CAAC,CAAC,CAAC;IACH,OAAO,MAAM,CAAC;AAChB,CAAC"}
@@ -0,0 +1,56 @@
1
+ import { TextElement, TextLine, TextLineWithWords, TextWord } from "../types";
2
+ /**
3
+ * Group text elements into visual lines.
4
+ *
5
+ * Elements whose Y values fall within `tolerance` points of each other are
6
+ * considered the same line. Within each line, elements are sorted left-to-right
7
+ * by X. Empty-string elements are discarded; whitespace-only elements (spaces)
8
+ * are kept so that word gaps are preserved when joining.
9
+ *
10
+ * @param elements - flat list of TextElements from a page
11
+ * @param tolerance - max Y delta to treat two elements as the same line (default 2)
12
+ */
13
+ export declare function groupIntoLines(elements: TextElement[], tolerance?: number): TextLine[];
14
+ /**
15
+ * Group text elements into words by detecting inter-character gaps.
16
+ *
17
+ * A new word starts when:
18
+ * - the current element is a whitespace-only element (explicit space), OR
19
+ * - the X gap between consecutive elements exceeds `fontSize × gapFactor`
20
+ *
21
+ * Whitespace elements are used as separators but are NOT included in the
22
+ * resulting words, so the output contains only printable content.
23
+ *
24
+ * Works on elements that share the same visual line (i.e. the output of
25
+ * `groupIntoLines`), but can also be called on any sorted element list.
26
+ *
27
+ * @param elements - TextElements sorted left-to-right (same line)
28
+ * @param gapFactor - gap threshold as a fraction of fontSize (default 0.4)
29
+ */
30
+ export declare function groupIntoWords(elements: TextElement[], gapFactor?: number): TextWord[];
31
+ /**
32
+ * Convenience: extract all words from a flat list of TextElements in a single call.
33
+ * Equivalent to `groupIntoLines(elements).flatMap(l => groupIntoWords(l.elements))`.
34
+ * Words are returned in reading order (top-to-bottom, left-to-right).
35
+ *
36
+ * @param elements - flat list of TextElements from a page
37
+ * @param lineTolerance - Y tolerance for line grouping (default 2)
38
+ * @param gapFactor - word gap threshold as a fraction of fontSize (default 0.4)
39
+ */
40
+ export declare function extractWords(elements: TextElement[], lineTolerance?: number, gapFactor?: number): TextWord[];
41
+ /**
42
+ * Full structured text extraction: returns lines with their words already grouped.
43
+ * Use this to get a ready-to-serialize JSON hierarchy:
44
+ * line.text — full line string
45
+ * line.words — words with position, size, font, color, and raw elements
46
+ *
47
+ * @param elements - flat list of TextElements from a page
48
+ * @param lineTolerance - Y tolerance for line grouping (default 2)
49
+ * @param gapFactor - word gap threshold as fraction of fontSize (default 0.4)
50
+ */
51
+ export declare function extractTextStructure(elements: TextElement[], lineTolerance?: number, gapFactor?: number): TextLineWithWords[];
52
+ /**
53
+ * Concatenate text elements in a line into a single string (legacy helper).
54
+ */
55
+ export declare function lineToString(line: TextElement[]): string;
56
+ //# sourceMappingURL=textParser.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"textParser.d.ts","sourceRoot":"","sources":["../../src/parser/textParser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAE,QAAQ,EAAE,iBAAiB,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAI9E;;;;;;;;;;GAUG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,EAAE,SAAS,SAAI,GAAG,QAAQ,EAAE,CAqBjF;AA0BD;;;;;;;;;;;;;;;GAeG;AACH,wBAAgB,cAAc,CAAC,QAAQ,EAAE,WAAW,EAAE,EAAE,SAAS,SAAM,GAAG,QAAQ,EAAE,CA4DnF;AAED;;;;;;;;GAQG;AACH,wBAAgB,YAAY,CAC1B,QAAQ,EAAE,WAAW,EAAE,EACvB,aAAa,SAAI,EACjB,SAAS,SAAM,GACd,QAAQ,EAAE,CAIZ;AAED;;;;;;;;;GASG;AACH,wBAAgB,oBAAoB,CAClC,QAAQ,EAAE,WAAW,EAAE,EACvB,aAAa,SAAI,EACjB,SAAS,SAAM,GACd,iBAAiB,EAAE,CAKrB;AAID;;GAEG;AACH,wBAAgB,YAAY,CAAC,IAAI,EAAE,WAAW,EAAE,GAAG,MAAM,CAMxD"}
@@ -0,0 +1,175 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.groupIntoLines = groupIntoLines;
4
+ exports.groupIntoWords = groupIntoWords;
5
+ exports.extractWords = extractWords;
6
+ exports.extractTextStructure = extractTextStructure;
7
+ exports.lineToString = lineToString;
8
+ // ─── Lines ────────────────────────────────────────────────────────────────────
9
/**
 * Group text elements into visual lines.
 *
 * Each element is assigned to a bucket keyed by its Y value rounded to the
 * nearest multiple of `tolerance`; elements sharing a bucket form one line and
 * the bucket key becomes the line's `y`. A `tolerance` of 0 groups by exact Y
 * (without this, the rounding would produce NaN keys and merge every element
 * into a single line).
 *
 * Lines are returned in ascending `y` order. Within each line, elements are
 * sorted left-to-right by X. Empty-string elements are discarded;
 * whitespace-only elements (spaces) are kept so that word gaps are preserved
 * when joining. Lines whose joined text is blank after trimming are dropped.
 *
 * @param elements  - flat list of TextElements from a page
 * @param tolerance - bucket size used to merge nearby Y values (default 2)
 */
function groupIntoLines(elements, tolerance = 2) {
    const buckets = new Map();
    for (const el of elements) {
        if (el.text === "")
            continue;
        // y / 0 is ±Infinity or NaN, and multiplying that by 0 gives NaN, so a zero
        // tolerance has to bypass the rounding and use the exact coordinate.
        const key = tolerance === 0 ? el.y : Math.round(el.y / tolerance) * tolerance;
        const bucket = buckets.get(key);
        if (bucket) {
            bucket.push(el);
        }
        else {
            buckets.set(key, [el]);
        }
    }
    return [...buckets.entries()]
        .sort(([a], [b]) => a - b)
        .map(([y, els]) => {
        // Copy before sorting so the bucket (and the caller's objects' order) is untouched.
        const sorted = [...els].sort((a, b) => a.x - b.x);
        return {
            y,
            text: sorted.map((e) => e.text).join("").trim(),
            elements: sorted,
        };
    })
        .filter((line) => line.text !== "");
}
41
+ // ─── Words ────────────────────────────────────────────────────────────────────
42
/**
 * Build a TextWord from a non-empty group of elements (already sorted by X).
 *
 * Position and font/colour attributes are taken from the first element; the
 * width spans from the first element's left edge to the last element's right
 * edge; the height is the tallest element in the group.
 *
 * A missing `width` is treated as 0 and a missing `height` falls back to the
 * element's `fontSize` — the same fallbacks `getBoundingBox` applies — so the
 * word's geometry never becomes NaN.
 * NOTE(review): whether `width`/`height` can actually be absent on TextElement
 * is not visible from this file; confirm against the type definition.
 *
 * @param els - non-empty list of elements making up the word, sorted by X
 */
function buildWord(els) {
    const [first] = els;
    const last = els[els.length - 1];
    const rightEdge = last.x + (last.width ?? 0);
    const tallest = els.reduce((max, e) => Math.max(max, e.height ?? e.fontSize), -Infinity);
    return {
        text: els.map((e) => e.text).join(""),
        x: first.x,
        y: first.y,
        width: rightEdge - first.x,
        height: tallest,
        fontSize: first.fontSize,
        fontRealName: first.fontRealName,
        fontFamily: first.fontFamily,
        fontStyle: first.fontStyle,
        fontWeight: first.fontWeight,
        color: first.color,
        elements: els,
    };
}
63
+ /**
64
+ * Group text elements into words by detecting inter-character gaps.
65
+ *
66
+ * A new word starts when:
67
+ * - the current element is a whitespace-only element (explicit space), OR
68
+ * - the X gap between consecutive elements exceeds `fontSize × gapFactor`
69
+ *
70
+ * Whitespace elements are used as separators but are NOT included in the
71
+ * resulting words, so the output contains only printable content.
72
+ *
73
+ * Works on elements that share the same visual line (i.e. the output of
74
+ * `groupIntoLines`), but can also be called on any sorted element list.
75
+ *
76
+ * @param elements - TextElements sorted left-to-right (same line)
77
+ * @param gapFactor - gap threshold as a fraction of fontSize (default 0.4)
78
+ */
79
+ function groupIntoWords(elements, gapFactor = 0.4) {
80
+ const els = [...elements]
81
+ .filter((el) => el.text !== "")
82
+ .sort((a, b) => a.x - b.x);
83
+ if (els.length === 0)
84
+ return [];
85
+ const words = [];
86
+ let group = [];
87
+ for (let i = 0; i < els.length; i++) {
88
+ const curr = els[i];
89
+ const isSpace = curr.text.trim() === "";
90
+ if (group.length === 0) {
91
+ if (!isSpace)
92
+ group.push(curr);
93
+ continue;
94
+ }
95
+ if (isSpace) {
96
+ // Decide: letter-spacing (merge) or word boundary (split)?
97
+ //
98
+ // Heuristic: if the element before AND after this space are both
99
+ // single-character elements, this is decorative letter-spacing — keep
100
+ // the group together. Otherwise it is a real word break.
101
+ const prev = group[group.length - 1];
102
+ const next = els[i + 1];
103
+ const isLetterSpacing = prev.text.length === 1 && next != null && next.text.trim().length === 1;
104
+ if (isLetterSpacing) {
105
+ // Include the space in the group so X layout is preserved;
106
+ // whitespace elements are filtered out of the group again before buildWord is called.
107
+ group.push(curr);
108
+ }
109
+ else {
110
+ const content = group.filter((e) => e.text.trim() !== "");
111
+ if (content.length > 0)
112
+ words.push(buildWord(content));
113
+ group = [];
114
+ }
115
+ continue;
116
+ }
117
+ // Non-space element: check X gap against the last element in the group (which may be a kept letter-spacing space)
118
+ const prev = group[group.length - 1];
119
+ const gap = curr.x - (prev.x + prev.width);
120
+ const largeGap = gap > prev.fontSize * gapFactor;
121
+ if (largeGap) {
122
+ const content = group.filter((e) => e.text.trim() !== "");
123
+ if (content.length > 0)
124
+ words.push(buildWord(content));
125
+ group = [curr];
126
+ }
127
+ else {
128
+ group.push(curr);
129
+ }
130
+ }
131
+ const content = group.filter((e) => e.text.trim() !== "");
132
+ if (content.length > 0)
133
+ words.push(buildWord(content));
134
+ return words;
135
+ }
136
+ /**
137
+ * Convenience: extract all words from a flat list of TextElements in a single call.
138
+ * Equivalent to `groupIntoLines(elements).flatMap(l => groupIntoWords(l.elements))`.
139
+ * Words are returned in reading order (top-to-bottom, left-to-right).
140
+ *
141
+ * @param elements - flat list of TextElements from a page
142
+ * @param lineTolerance - Y tolerance for line grouping (default 2)
143
+ * @param gapFactor - word gap threshold as a fraction of fontSize (default 0.4)
144
+ */
145
+ function extractWords(elements, lineTolerance = 2, gapFactor = 0.4) {
146
+ return groupIntoLines(elements, lineTolerance).flatMap((line) => groupIntoWords(line.elements, gapFactor));
147
+ }
148
+ /**
149
+ * Full structured text extraction: returns lines with their words already grouped.
150
+ * Use this to get a ready-to-serialize JSON hierarchy:
151
+ * line.text — full line string
152
+ * line.words — words with position, size, font, color, and raw elements
153
+ *
154
+ * @param elements - flat list of TextElements from a page
155
+ * @param lineTolerance - Y tolerance for line grouping (default 2)
156
+ * @param gapFactor - word gap threshold as fraction of fontSize (default 0.4)
157
+ */
158
+ function extractTextStructure(elements, lineTolerance = 2, gapFactor = 0.4) {
159
+ return groupIntoLines(elements, lineTolerance).map((line) => ({
160
+ ...line,
161
+ words: groupIntoWords(line.elements, gapFactor),
162
+ }));
163
+ }
164
+ // ─── Legacy helper ────────────────────────────────────────────────────────────
165
/**
 * Concatenate text elements in a line into a single string (legacy helper).
 *
 * Elements are ordered left-to-right by X, their text is joined without a
 * separator, and the result is trimmed. The input array is left untouched:
 * sorting happens on a copy so callers do not see their list reordered.
 *
 * @param line - text elements belonging to one visual line
 * @returns the joined, trimmed line text
 */
function lineToString(line) {
    const leftToRight = [...line].sort((a, b) => a.x - b.x);
    return leftToRight
        .map((el) => el.text)
        .join("")
        .trim();
}
175
+ //# sourceMappingURL=textParser.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"textParser.js","sourceRoot":"","sources":["../../src/parser/textParser.ts"],"names":[],"mappings":";;AAeA,wCAqBC;AA0CD,wCA4DC;AAWD,oCAQC;AAYD,oDASC;AAOD,oCAMC;AA7LD,iFAAiF;AAEjF;;;;;;;;;;GAUG;AACH,SAAgB,cAAc,CAAC,QAAuB,EAAE,SAAS,GAAG,CAAC;IACnE,MAAM,OAAO,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC,CAAC;IAExD,MAAM,OAAO,GAAG,IAAI,GAAG,EAAyB,CAAC;IACjD,KAAK,MAAM,EAAE,IAAI,OAAO,EAAE,CAAC;QACzB,MAAM,GAAG,GAAG,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,GAAG,SAAS,CAAC,GAAG,SAAS,CAAC;QACrD,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,GAAG,CAAC;YAAE,OAAO,CAAC,GAAG,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,GAAG,CAAE,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;IAC7B,CAAC;IAED,OAAO,CAAC,GAAG,OAAO,CAAC,OAAO,EAAE,CAAC;SAC1B,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,EAAE;QAChB,MAAM,MAAM,GAAG,CAAC,GAAG,GAAG,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAClD,OAAO;YACL,CAAC;YACD,IAAI,EAAE,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,IAAI,EAAE;YAC/C,QAAQ,EAAE,MAAM;SACjB,CAAC;IACJ,CAAC,CAAC;SACD,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,IAAI,KAAK,EAAE,CAAC,CAAC;AACxC,CAAC;AAED,iFAAiF;AAEjF;;GAEG;AACH,SAAS,SAAS,CAAC,GAAkB;IACnC,MAAM,KAAK,GAAG,GAAG,CAAC,CAAC,CAAC,CAAC;IACrB,MAAM,IAAI,GAAI,GAAG,CAAC,GAAG,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAClC,OAAO;QACL,IAAI,EAAU,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,IAAI,CAAC,EAAE,CAAC;QAC7C,CAAC,EAAa,KAAK,CAAC,CAAC;QACrB,CAAC,EAAa,KAAK,CAAC,CAAC;QACrB,KAAK,EAAS,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC;QAC3C,MAAM,EAAQ,IAAI,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;QACnD,QAAQ,EAAM,KAAK,CAAC,QAAQ;QAC5B,YAAY,EAAE,KAAK,CAAC,YAAY;QAChC,UAAU,EAAI,KAAK,CAAC,UAAU;QAC9B,SAAS,EAAK,KAAK,CAAC,SAAS;QAC7B,UAAU,EAAI,KAAK,CAAC,UAAU;QAC9B,KAAK,EAAS,
KAAK,CAAC,KAAK;QACzB,QAAQ,EAAM,GAAG;KAClB,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;;;GAeG;AACH,SAAgB,cAAc,CAAC,QAAuB,EAAE,SAAS,GAAG,GAAG;IACrE,MAAM,GAAG,GAAG,CAAC,GAAG,QAAQ,CAAC;SACtB,MAAM,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,KAAK,EAAE,CAAC;SAC9B,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAE7B,IAAI,GAAG,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAEhC,MAAM,KAAK,GAAe,EAAE,CAAC;IAC7B,IAAI,KAAK,GAAkB,EAAE,CAAC;IAE9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,IAAI,GAAM,GAAG,CAAC,CAAC,CAAC,CAAC;QACvB,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC;QAExC,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;YACvB,IAAI,CAAC,OAAO;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC/B,SAAS;QACX,CAAC;QAED,IAAI,OAAO,EAAE,CAAC;YACZ,2DAA2D;YAC3D,EAAE;YACF,iEAAiE;YACjE,sEAAsE;YACtE,0DAA0D;YAC1D,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;YACrC,MAAM,IAAI,GAAG,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;YACxB,MAAM,eAAe,GACnB,IAAI,CAAC,IAAI,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,IAAI,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,CAAC;YAE1E,IAAI,eAAe,EAAE,CAAC;gBACpB,2DAA2D;gBAC3D,0DAA0D;gBAC1D,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACnB,CAAC;iBAAM,CAAC;gBACN,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;gBAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;oBAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;gBACvD,KAAK,GAAG,EAAE,CAAC;YACb,CAAC;YACD,SAAS;QACX,CAAC;QAED,yEAAyE;QACzE,MAAM,IAAI,GAAO,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACzC,MAAM,GAAG,GAAQ,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC;QAChD,MAAM,QAAQ,GAAG,GAAG,GAAG,IAAI,CAAC,QAAQ,GAAG,SAAS,CAAC;QAEjD,IAAI,QAAQ,EAAE,CAAC;YACb,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;YAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;gBAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;YACvD,KAAK,GAAG,CAAC,IAAI,
CAAC,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,EAAE,KAAK,EAAE,CAAC,CAAC;IAC1D,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC;QAAE,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC;IAEvD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;;;;;GAQG;AACH,SAAgB,YAAY,CAC1B,QAAuB,EACvB,aAAa,GAAG,CAAC,EACjB,SAAS,GAAG,GAAG;IAEf,OAAO,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,EAAE,EAAE,CAC9D,cAAc,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC,CACzC,CAAC;AACJ,CAAC;AAED;;;;;;;;;GASG;AACH,SAAgB,oBAAoB,CAClC,QAAuB,EACvB,aAAa,GAAG,CAAC,EACjB,SAAS,GAAG,GAAG;IAEf,OAAO,cAAc,CAAC,QAAQ,EAAE,aAAa,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC5D,GAAG,IAAI;QACP,KAAK,EAAE,cAAc,CAAC,IAAI,CAAC,QAAQ,EAAE,SAAS,CAAC;KAChD,CAAC,CAAC,CAAC;AACN,CAAC;AAED,iFAAiF;AAEjF;;GAEG;AACH,SAAgB,YAAY,CAAC,IAAmB;IAC9C,OAAO,IAAI;SACR,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC;SACzB,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,IAAI,CAAC;SACpB,IAAI,CAAC,EAAE,CAAC;SACR,IAAI,EAAE,CAAC;AACZ,CAAC"}
@@ -0,0 +1,4 @@
1
+ import { PDFDocument } from "pdf-lib";
2
+ import { FontInfo } from "../types";
3
+ export declare function extractFonts(pdfDoc: PDFDocument): Promise<Record<string, FontInfo>>;
4
+ //# sourceMappingURL=fonts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fonts.d.ts","sourceRoot":"","sources":["../../src/pdf/fonts.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,WAAW,EAAoB,MAAM,SAAS,CAAC;AACxD,OAAO,EAAE,QAAQ,EAAE,MAAM,UAAU,CAAC;AAoBpC,wBAAsB,YAAY,CAAC,MAAM,EAAE,WAAW,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC,CAsGzF"}
@@ -0,0 +1,113 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.extractFonts = extractFonts;
4
+ const pdf_lib_1 = require("pdf-lib");
5
/**
 * Render a PDF value as a bare string.
 *
 * The value's `toString()` output has one leading "/" removed, then a leading
 * "(" and a trailing ")" removed, and is finally trimmed of surrounding
 * whitespace (in that order).
 *
 * @param val Any value exposing `toString()`; falsy values yield `null`.
 * @returns The cleaned text, or `null` when `val` is falsy or the conversion throws.
 */
function pdfValToString(val) {
    if (val) {
        try {
            const rendered = val.toString();
            const withoutSlash = rendered.replace(/^\//, "");
            const withoutParens = withoutSlash.replace(/^\(|\)$/g, "");
            return withoutParens.trim();
        }
        catch {
            // Best-effort conversion: any failure is reported as "no value" below.
        }
    }
    return null;
}
19
/**
 * Remove the decoration from a font name: one leading "/" and, after that,
 * a prefix made of exactly six uppercase ASCII letters followed by "+".
 *
 * @param name Font name text, or a falsy value.
 * @returns The undecorated name, or `null` when `name` is falsy (including "").
 */
function stripSubsetPrefix(name) {
    if (name) {
        const bare = name.replace(/^\//, "");
        return bare.replace(/^[A-Z]{6}\+/, "");
    }
    return null;
}
24
/**
 * Collect descriptive information about the fonts declared in the resource
 * dictionaries of a document's pages.
 *
 * The result is keyed by the font's resource name (the key inside the page's
 * /Font dictionary). When the same resource name appears on several pages,
 * only the first occurrence is recorded.
 *
 * Every lookup is best-effort: a page or font whose structure cannot be
 * resolved is skipped rather than failing the whole extraction.
 *
 * @param pdfDoc A loaded pdf-lib document.
 * @returns A promise for a map from resource name to a record with the fields
 *   key, realName, baseFontRaw, isSubset, subtype, encoding, fontFamily,
 *   fontStyle, fontWeight and italicAngle (each `null` when unavailable,
 *   `isSubset` being a boolean).
 */
async function extractFonts(pdfDoc) {
    const { PDFName, PDFDict } = pdf_lib_1;
    const context = pdfDoc.context;
    const fontMap = {};
    // Reads a numeric dictionary entry, resolving indirect references first.
    // pdf-lib numbers expose their value through `asNumber()`; anything else
    // (missing entry, wrong object kind, lookup failure) is reported as null.
    const readNumber = (dict, entryName) => {
        try {
            const entry = context.lookup(dict.get(PDFName.of(entryName)));
            if (entry && typeof entry.asNumber === "function") {
                const value = entry.asNumber();
                return typeof value === "number" ? value : null;
            }
        }
        catch { /* malformed entry: treat as absent */ }
        return null;
    };
    for (const page of pdfDoc.getPages()) {
        let resourceDict;
        try {
            // Prefer the page node's own accessor when present so that resources
            // inherited from the page tree are found; otherwise fall back to the
            // entry stored directly on the page dictionary.
            resourceDict = typeof page.node.Resources === "function"
                ? page.node.Resources()
                : context.lookupMaybe(page.node.get(PDFName.of("Resources")), PDFDict);
        }
        catch {
            continue;
        }
        if (!resourceDict)
            continue;
        let fontDict;
        try {
            fontDict = context.lookupMaybe(resourceDict.get(PDFName.of("Font")), PDFDict);
        }
        catch {
            continue;
        }
        if (!fontDict)
            continue;
        for (const [nameKey, fontRef] of fontDict.entries()) {
            const key = pdfValToString(nameKey) ?? nameKey.toString().replace(/^\//, "");
            // Own-property check: a resource named like an Object.prototype member
            // (e.g. "constructor") must not be mistaken for an existing entry.
            if (Object.prototype.hasOwnProperty.call(fontMap, key))
                continue;
            let fontObj;
            try {
                fontObj = context.lookupMaybe(fontRef, PDFDict);
            }
            catch {
                continue;
            }
            if (!fontObj) {
                // The resource exists but does not resolve to a dictionary:
                // record the key with every detail empty.
                fontMap[key] = {
                    key, realName: null, baseFontRaw: null, isSubset: false,
                    subtype: null, encoding: null, fontFamily: null, fontStyle: null,
                    fontWeight: null, italicAngle: null,
                };
                continue;
            }
            const baseFontRaw = pdfValToString(fontObj.get(PDFName.of("BaseFont")));
            const subtype = pdfValToString(fontObj.get(PDFName.of("Subtype")));
            const encoding = pdfValToString(fontObj.get(PDFName.of("Encoding")));
            // FontDescriptor: first try direct, then through DescendantFonts (Type0)
            let descriptor;
            try {
                descriptor = context.lookupMaybe(fontObj.get(PDFName.of("FontDescriptor")), PDFDict);
            }
            catch { /* ignore */ }
            if (!descriptor && subtype === "Type0") {
                try {
                    const descArr = context.lookup(fontObj.get(PDFName.of("DescendantFonts")));
                    const cidRef = descArr?.array?.[0]
                        ?? descArr?.get?.(0);
                    const cidFont = context.lookupMaybe(cidRef, PDFDict);
                    descriptor = context.lookupMaybe(cidFont?.get(PDFName.of("FontDescriptor")), PDFDict);
                }
                catch { /* ignore */ }
            }
            let fontFamily = null;
            let fontWeight = null;
            let italicAngle = null;
            if (descriptor) {
                fontFamily = pdfValToString(descriptor.get(PDFName.of("FontFamily")));
                fontWeight = readNumber(descriptor, "FontWeight");
                italicAngle = readNumber(descriptor, "ItalicAngle");
            }
            const realName = stripSubsetPrefix(baseFontRaw);
            const isSubset = /^[A-Z]{6}\+/.test((baseFontRaw ?? "").replace(/^\//, ""));
            // A "Family-Style" base font name is split at its first dash.
            const dashIdx = realName?.indexOf("-") ?? -1;
            if (!fontFamily && realName) {
                fontFamily = dashIdx > -1 ? realName.slice(0, dashIdx) : realName;
            }
            const fontStyle = dashIdx > -1 ? realName.slice(dashIdx + 1) : null;
            fontMap[key] = {
                key, realName, baseFontRaw, isSubset, subtype, encoding,
                fontFamily, fontStyle, fontWeight, italicAngle,
            };
        }
    }
    return fontMap;
}
113
+ //# sourceMappingURL=fonts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fonts.js","sourceRoot":"","sources":["../../src/pdf/fonts.ts"],"names":[],"mappings":";;AAqBA,oCAsGC;AA3HD,qCAAwD;AAGxD,SAAS,cAAc,CAAC,GAAY;IAClC,IAAI,CAAC,GAAG;QAAE,OAAO,IAAI,CAAC;IACtB,IAAI,CAAC;QACH,OAAQ,GAA8B;aACnC,QAAQ,EAAE;aACV,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC;aAClB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;aACvB,IAAI,EAAE,CAAC;IACZ,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,IAAI,CAAC;IACd,CAAC;AACH,CAAC;AAED,SAAS,iBAAiB,CAAC,IAAmB;IAC5C,IAAI,CAAC,IAAI;QAAE,OAAO,IAAI,CAAC;IACvB,OAAO,IAAI,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC;AAC5D,CAAC;AAEM,KAAK,UAAU,YAAY,CAAC,MAAmB;IACpD,MAAM,OAAO,GAA6B,EAAE,CAAC;IAE7C,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,QAAQ,EAAE,EAAE,CAAC;QACrC,IAAI,YAAiC,CAAC;QAEtC,IAAI,CAAC;YACH,MAAM,GAAG,GAAG,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,WAAW,CAAC,CAAC,CAAC;YACnD,YAAY,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,GAAG,EAAE,iBAAO,CAAC,CAAC;QAC1D,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QAED,IAAI,CAAC,YAAY;YAAE,SAAS;QAE5B,IAAI,QAA6B,CAAC;QAElC,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACnC,YAAY,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,MAAM,CAAC,CAAC,EACpC,iBAAO,CACR,CAAC;QACJ,CAAC;QAAC,MAAM,CAAC;YACP,SAAS;QACX,CAAC;QAED,IAAI,CAAC,QAAQ;YAAE,SAAS;QAExB,KAAK,MAAM,CAAC,OAAO,EAAE,OAAO,CAAC,IAAI,QAAQ,CAAC,OAAO,EAAE,EAAE,CAAC;YACpD,MAAM,GAAG,GAAG,cAAc,CAAC,OAAO,CAAC,IAAI,OAAO,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;YAE7E,IAAI,OAAO,CAAC,GAAG,CAAC;gBAAE,SAAS;YAE3B,IAAI,OAA4B,CAAC;YACjC,IAAI,CAAC;gBACH,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,OAAO,EAAE,iBAAO,CAAC,CAAC;YACzD,CAAC;YAAC,MAAM,CAAC;gBACP,SAAS;YACX,CAAC;YAED,IAAI,CAAC,OAAO,EAAE,CAAC;gBACb,OAAO,CAAC,GAAG,CAAC,GAAG;oBACb,GAAG,EAAE,QAAQ,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI,EAAE,QAAQ,EAAE,KAAK;oBACvD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,UAAU,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI;oBAChE,UAAU,EAAE,IAAI,EAAE,WAAW,EAAE,IAAI;iBACpC,CAAC;gBACF,SAAS;YACX,CAAC;YAED,MAAM,WAAW,GAAK,cAAc,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAC1E,MAAM,OAAO,GAAS,
cAAc,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,SAAS,CAAC,CAAC,CAAC,CAAC;YACzE,MAAM,QAAQ,GAAQ,cAAc,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAE1E,yEAAyE;YACzE,IAAI,UAA+B,CAAC;YACpC,IAAI,CAAC;gBACH,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACrC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,gBAAgB,CAAC,CAAC,EACzC,iBAAO,CACR,CAAC;YACJ,CAAC;YAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;YAExB,IAAI,CAAC,UAAU,IAAI,OAAO,KAAK,OAAO,EAAE,CAAC;gBACvC,IAAI,CAAC;oBACH,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,iBAAiB,CAAC,CAAC,CAAC,CAAC;oBAClF,MAAM,MAAM,GAAK,OAA2D,EAAE,KAAK,EAAE,CAAC,CAAC,CAAC;2BAClF,OAAwC,EAAE,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC;oBACzD,MAAM,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CAAC,MAA0D,EAAE,iBAAO,CAAC,CAAC;oBAChH,UAAU,GAAG,MAAM,CAAC,OAAO,CAAC,WAAW,CACrC,OAAO,EAAE,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,gBAAgB,CAAC,CAAC,EAC1C,iBAAO,CACR,CAAC;gBACJ,CAAC;gBAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;YAC1B,CAAC;YAED,IAAI,UAAU,GAAmB,IAAI,CAAC;YACtC,IAAI,UAAU,GAAmB,IAAI,CAAC;YACtC,IAAI,WAAW,GAAkB,IAAI,CAAC;YAEtC,IAAI,UAAU,EAAE,CAAC;gBACf,UAAU,GAAI,cAAc,CAAC,UAAU,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;gBACvE,IAAI,CAAC;oBAAC,UAAU,GAAK,UAAU,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,YAAY,CAAC,CAA4C,EAAE,WAAW,EAAE,EAAE,IAAI,IAAI,CAAC;gBAAC,CAAC;gBAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;gBAC3J,IAAI,CAAC;oBAAC,WAAW,GAAI,UAAU,CAAC,GAAG,CAAC,iBAAO,CAAC,EAAE,CAAC,aAAa,CAAC,CAA4C,EAAE,WAAW,EAAE,EAAE,IAAI,IAAI,CAAC;gBAAC,CAAC;gBAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC;YAC9J,CAAC;YAED,MAAM,QAAQ,GAAI,iBAAiB,CAAC,WAAW,CAAC,CAAC;YACjD,MAAM,QAAQ,GAAI,aAAa,CAAC,IAAI,CAAC,CAAC,WAAW,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC,CAAC;YAC7E,MAAM,OAAO,GAAK,QAAQ,EAAE,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC;YAE/C,IAAI,CAAC,UAAU,IAAI,QAAQ,EAAE,CAAC;gBAC5B,UAAU,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC;YACpE,CAAC;YACD,MAAM,SAAS,GAAG,OAAO,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAS,CAAC,KAAK,CAAC,OAAO,
GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;YAErE,OAAO,CAAC,GAAG,CAAC,GAAG;gBACb,GAAG,EAAE,QAAQ,EAAE,WAAW,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ;gBACvD,UAAU,EAAE,SAAS,EAAE,UAAU,EAAE,WAAW;aAC/C,CAAC;QACJ,CAAC;IACH,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
@@ -0,0 +1,2 @@
1
+ export declare function loadInput(input: string | Buffer): Promise<Buffer>;
2
+ //# sourceMappingURL=loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../src/pdf/loader.ts"],"names":[],"mappings":"AAGA,wBAAsB,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAcvE"}
@@ -0,0 +1,18 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.loadInput = loadInput;
4
+ const fs = require("fs");
5
+ const fetchBuffer_1 = require("../fetch/fetchBuffer");
6
/**
 * Resolve a PDF input to its raw bytes.
 *
 * - A Buffer is returned as-is (same instance, no copy).
 * - A string starting with "http://" or "https://" is downloaded via `fetchBuffer`.
 * - Any other string is treated as a file-system path and read asynchronously.
 *
 * @param input Buffer, http(s) URL, or file path.
 * @returns A promise for the bytes of the input.
 * @throws {TypeError} When `input` is neither a Buffer nor a string.
 * @throws {Error} `File not found: "<path>"` when the path does not exist;
 *   other read failures (e.g. the path is a directory) surface as the
 *   underlying file-system error.
 */
async function loadInput(input) {
    if (Buffer.isBuffer(input)) {
        return input;
    }
    if (typeof input !== "string") {
        throw new TypeError("loadInput expects a Buffer, an http(s) URL, or a file path");
    }
    if (input.startsWith("http://") || input.startsWith("https://")) {
        return (0, fetchBuffer_1.fetchBuffer)(input);
    }
    // Read in one step instead of exists-then-read: this keeps the event loop
    // free and avoids a race if the file disappears between the two calls.
    return fs.promises.readFile(input).catch((err) => {
        if (err && (err.code === "ENOENT" || err.code === "ENOTDIR")) {
            throw new Error(`File not found: "${input}"`);
        }
        throw err;
    });
}
18
+ //# sourceMappingURL=loader.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"loader.js","sourceRoot":"","sources":["../../src/pdf/loader.ts"],"names":[],"mappings":";;AAGA,8BAcC;AAjBD,yBAA0B;AAC1B,sDAAmD;AAE5C,KAAK,UAAU,SAAS,CAAC,KAAsB;IACpD,IAAI,MAAM,CAAC,QAAQ,CAAC,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,KAAK,CAAC;IACf,CAAC;IAED,IAAI,KAAK,CAAC,UAAU,CAAC,SAAS,CAAC,IAAI,KAAK,CAAC,UAAU,CAAC,UAAU,CAAC,EAAE,CAAC;QAChE,OAAO,IAAA,yBAAW,EAAC,KAAK,CAAC,CAAC;IAC5B,CAAC;IAED,IAAI,CAAC,EAAE,CAAC,UAAU,CAAC,KAAK,CAAC,EAAE,CAAC;QAC1B,MAAM,IAAI,KAAK,CAAC,oBAAoB,KAAK,GAAG,CAAC,CAAC;IAChD,CAAC;IAED,OAAO,EAAE,CAAC,YAAY,CAAC,KAAK,CAAC,CAAC;AAChC,CAAC"}