pdf-metadata-extractor 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. package/CHANGELOG.md +91 -0
  2. package/LICENSE +21 -0
  3. package/README.md +427 -0
  4. package/dist/core/extractor.d.ts +3 -0
  5. package/dist/core/extractor.d.ts.map +1 -0
  6. package/dist/core/extractor.js +87 -0
  7. package/dist/core/extractor.js.map +1 -0
  8. package/dist/core/pageProcessor.d.ts +30 -0
  9. package/dist/core/pageProcessor.d.ts.map +1 -0
  10. package/dist/core/pageProcessor.js +480 -0
  11. package/dist/core/pageProcessor.js.map +1 -0
  12. package/dist/core/sourceDetector.d.ts +4 -0
  13. package/dist/core/sourceDetector.d.ts.map +1 -0
  14. package/dist/core/sourceDetector.js +33 -0
  15. package/dist/core/sourceDetector.js.map +1 -0
  16. package/dist/fetch/fetchBuffer.d.ts +2 -0
  17. package/dist/fetch/fetchBuffer.d.ts.map +1 -0
  18. package/dist/fetch/fetchBuffer.js +12 -0
  19. package/dist/fetch/fetchBuffer.js.map +1 -0
  20. package/dist/index.d.ts +8 -0
  21. package/dist/index.d.ts.map +1 -0
  22. package/dist/index.js +42 -0
  23. package/dist/index.js.map +1 -0
  24. package/dist/parser/streamParser.d.ts +34 -0
  25. package/dist/parser/streamParser.d.ts.map +1 -0
  26. package/dist/parser/streamParser.js +191 -0
  27. package/dist/parser/streamParser.js.map +1 -0
  28. package/dist/parser/textParser.d.ts +56 -0
  29. package/dist/parser/textParser.d.ts.map +1 -0
  30. package/dist/parser/textParser.js +175 -0
  31. package/dist/parser/textParser.js.map +1 -0
  32. package/dist/pdf/fonts.d.ts +4 -0
  33. package/dist/pdf/fonts.d.ts.map +1 -0
  34. package/dist/pdf/fonts.js +113 -0
  35. package/dist/pdf/fonts.js.map +1 -0
  36. package/dist/pdf/loader.d.ts +2 -0
  37. package/dist/pdf/loader.d.ts.map +1 -0
  38. package/dist/pdf/loader.js +18 -0
  39. package/dist/pdf/loader.js.map +1 -0
  40. package/dist/pdf/metadata.d.ts +13 -0
  41. package/dist/pdf/metadata.d.ts.map +1 -0
  42. package/dist/pdf/metadata.js +9 -0
  43. package/dist/pdf/metadata.js.map +1 -0
  44. package/dist/pdf/xobjects.d.ts +12 -0
  45. package/dist/pdf/xobjects.d.ts.map +1 -0
  46. package/dist/pdf/xobjects.js +107 -0
  47. package/dist/pdf/xobjects.js.map +1 -0
  48. package/dist/types.d.ts +136 -0
  49. package/dist/types.d.ts.map +1 -0
  50. package/dist/types.js +3 -0
  51. package/dist/types.js.map +1 -0
  52. package/dist/utils/buffer.d.ts +3 -0
  53. package/dist/utils/buffer.d.ts.map +1 -0
  54. package/dist/utils/buffer.js +11 -0
  55. package/dist/utils/buffer.js.map +1 -0
  56. package/dist/utils/color.d.ts +6 -0
  57. package/dist/utils/color.d.ts.map +1 -0
  58. package/dist/utils/color.js +21 -0
  59. package/dist/utils/color.js.map +1 -0
  60. package/dist/utils/matrix.d.ts +11 -0
  61. package/dist/utils/matrix.d.ts.map +1 -0
  62. package/dist/utils/matrix.js +22 -0
  63. package/dist/utils/matrix.js.map +1 -0
  64. package/package.json +61 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,91 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ ## [1.1.0] - 2026-02-26
6
+
7
+ ### Added
8
+
9
+ #### Graphics extraction (new)
10
+ - `ImageElement` — extracts embedded images with display bounding box (x, y, width, height in pts) and source metadata (imageWidth, imageHeight, colorSpace, bitsPerComponent, filter, imageMask)
11
+ - `RectElement` — extracts rectangle paths with fillColor, strokeColor, strokeWidth; CTM (current transformation matrix) applied so coordinates are in page display space
12
+ - `PathElement` — extracts non-rectangular paths (curves, polylines) as axis-aligned bounding boxes with fill/stroke color
13
+ - `PageResult.rectElements`, `PageResult.pathElements`, `PageResult.imageElements` arrays
14
+ - `PageResult.graphicSummary.vectorCount` and `imageCount` counters
15
+ - `PageResult.elements` now includes all element types combined (text + rect + path + image)
16
+ - Page types `"vector"` and `"hybrid"` are now correctly classified when vector/image elements are present
17
+
18
+ #### CTM tracking
19
+ - Full current transformation matrix (save/restore/transform) tracked throughout operator list
20
+ - Image bounding box derived from unit-square corners transformed through CTM
21
+ - Rect/path corners transformed through CTM before computing axis-aligned bounding box
22
+
23
+ ### Changed
24
+
25
+ #### pdfjs-dist upgrade: v3.11.174 → v5.4.624
26
+ - Updated import to `pdfjs-dist/legacy/build/pdf.mjs` (legacy build still present in v5)
27
+ - **Worker setup**: v5 fake-worker mode uses `await import(workerSrc)` internally; empty string no longer works. Fix: `pathToFileURL(require.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs")).href`
28
+ - **Color format**: v5 worker normalises all color ops (gray, RGB, CMYK) into `setFillRGBColor` (OPS 59) / `setStrokeRGBColor` (OPS 58) with a single `"#rrggbb"` hex string argument. The old `[r, g, b]` integer triple format (v3) is supported as a fallback
29
+ - **`constructPath` (OPS 91) argument format**: v5 changed from `[opsArray, coordsArray]` to `[renderFn, [Float32Array], [minX, minY, maxX, maxY]]`. The rendering op (fill/stroke/both) and pre-computed bounding box are now embedded in the single op. Detection: `typeof args[0] === "number"` distinguishes v5 from v3
30
+
31
+ #### `processPage` signature change
32
+ - Old: `processPage(pdfjsPage, pageNumber, bridgeMap, realFontMap)`
33
+ - New: `processPage(pdfjsPage, pageNumber, pdfDoc, pageIndex0, realFontMap)` — `pdfDoc` and `pageIndex0` passed through so the function can call `extractXObjectInfo` and `buildFontBridge` internally
34
+
35
+ ### Fixed
36
+ - Image XObject name mismatch: pdfjs renames XObjects internally (`img_p0_1` ≠ PDF key `X5`). Resolved with positional fallback — Nth image paint op maps to Nth Image XObject in dict insertion order
37
+ - `PDFRawStream` vs `PDFDict`: image XObjects are streams not plain dicts; `lookupMaybe(ref, PDFDict)` threw for them. Fixed: `ctx.lookup(ref)` then `.dict` property
38
+ - `PDFNumber.asNumber()` used instead of the private `.numberValue` field for XObject dimension extraction
39
+
40
+ ---
41
+
42
+ ## [1.0.0] - 2026-02-26
43
+
44
+ ### Added
45
+
46
+ #### Core extraction
47
+ - `extractPDF(input, options?)` — main entry point; accepts file path, https URL, or `Buffer`
48
+ - Parallel loading via **pdfjs-dist** (text/operator extraction) and **pdf-lib** (font dict, content streams)
49
+ - Page type classification: `"text"` | `"image"` | `"hybrid"` | `"vector"` | `"unknown"`
50
+ - PDF metadata normalization (`Title`, `Author`, `Creator`, `Producer`, `CreationDate`, …)
51
+ - Source/creator detection: `detectSource()` identifies Canva, Inkscape, Word, LibreOffice, etc.
52
+ - Print PDF detection: `detectPrintPDF()` based on `Producer` field heuristics
53
+
54
+ #### Font resolution (bridge map)
55
+ - `extractFonts(pdfDoc)` — walks the PDF font dictionary to collect `FontInfo` for every resource key
56
+ - `getContentStreamText(pdfDoc, pageIndex)` — decompresses page content streams (single or array); recursively follows Form XObject sub-streams via `/Name Do` operators (depth limit 4) to handle Inkscape-style PDFs where text lives inside XObjects
57
+ - `streamFontOrder(streamText)` — extracts ordered unique font resource keys from `/FontKey size Tf` operators; regex `[^\s/\[\]<>(){}]+` correctly handles names with hyphens (e.g. `f-0-0`)
58
+ - `buildFontBridge(streamText, pdfjsOrderedFonts)` — positional matching of pdfjs internal keys (`g_dN_fK`) to PDF resource keys (`F4`, `f-0-0`); robust against pdfjs global document counter incrementing across multiple extractions in the same process
59
+ - Full font metadata on every `TextElement`: `fontFamily`, `fontStyle`, `fontWeight`, `fontRealName`, `fontSubtype`, `isSubsetFont`
60
+
61
+ #### Color extraction
62
+ - Color extracted from the pdfjs operator list and carried on every `TextElement` as `RGB`
63
+ - Utilities: `rgbFromArray()`, `rgbToHex()`, `BLACK`, `WHITE` constants
64
+
65
+ #### Text grouping (parser)
66
+ - `groupIntoLines(elements, tolerance?)` — bucket elements by Y coordinate within `tolerance` pts; produces `TextLine[]` sorted top-to-bottom
67
+ - `groupIntoWords(elements, gapFactor?)` — reconstruct words from glyph-level elements; handles two split triggers: explicit whitespace elements and large X gaps (`gap > fontSize × gapFactor`)
68
+ - Letter-spacing heuristic: a whitespace element between two single-character elements is treated as decorative letter-spacing (Canva-style) and merged rather than used as a word boundary
69
+ - `extractWords(elements, lineTolerance?, gapFactor?)` — convenience wrapper; equivalent to `groupIntoLines(elements, lineTolerance).flatMap(line => groupIntoWords(line.elements, gapFactor))`; returns `TextWord[]` in reading order
70
+ - `extractTextStructure(elements, lineTolerance?, gapFactor?)` — returns `TextLineWithWords[]` (full hierarchy: lines → words → raw elements)
71
+ - `lineToString(elements)` — legacy helper; joins elements in X order
72
+
73
+ #### Bounding box / region helpers
74
+ - `getBoundingBox(elements)` — tight bounding box around a set of `TextElement[]`
75
+ - `filterByRegion(elements, box)` — filter elements whose top-left falls inside a `BoundingBox`
76
+
77
+ #### Matrix utilities
78
+ - `getFontSizeFromMatrix(matrix)`, `getXFromMatrix(matrix)`, `getYFromMatrix(matrix)`
79
+
80
+ #### Types
81
+ - `RGB`, `FontInfo`
82
+ - `TextElement`, `RectElement`, `PathElement`, `ImageElement`, `XObjectElement`, `PageElement`
83
+ - `TextWord`, `TextLine`, `TextLineWithWords`
84
+ - `GraphicSummary`, `PageResult`, `PDFResult`, `ExtractOptions`
85
+ - `BoundingBox` (from `parser/streamParser`)
86
+
87
+ ### Technical notes
88
+ - Uses **pdfjs-dist 3.11.174** legacy build in fake-worker mode (no separate worker thread) for Node.js compatibility
89
+ - Uses **pdf-lib 1.17.x** for font dictionary traversal and raw content stream access
90
+ - Content stream decompression: `zlib.inflateSync` → `zlib.inflateRawSync` → raw latin1 fallback
91
+ - Requires Node.js ≥ 18 (native `fetch` for URL loading)
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Kent Phung
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,427 @@
1
+ # pdf-metadata-extractor
2
+
3
+ Extract text elements, fonts, colors, images, and vector graphics metadata from PDF files. Supports file paths, URLs, and Buffers. Includes text-grouping helpers to reconstruct lines and words from raw PDF glyph streams.
4
+
5
+ ## Requirements
6
+
7
+ - Node.js **≥ 18** (uses native `fetch` and modern `zlib`)
8
+ - pnpm (recommended) or npm
9
+
10
+ ## Installation
11
+
12
+ ```bash
13
+ pnpm add pdf-metadata-extractor
14
+ ```
15
+
16
+ ## Quick start
17
+
18
+ ```typescript
19
+ import { extractPDF } from "pdf-metadata-extractor";
20
+
21
+ const result = await extractPDF("./document.pdf");
22
+ console.log(result.totalPages); // number of pages
23
+ console.log(result.fonts); // font list with real names, family, style, weight
24
+ console.log(result.pages[0].textElements); // raw TextElement[] for page 1
25
+ console.log(result.pages[0].rectElements); // colored rectangles / shapes
26
+ console.log(result.pages[0].imageElements); // embedded images with position + metadata
27
+ ```
28
+
29
+ Input can be a **file path**, a **URL** (https), or a **Buffer**:
30
+
31
+ ```typescript
32
+ await extractPDF("./local.pdf");
33
+ await extractPDF("https://example.com/file.pdf");
34
+ await extractPDF(fs.readFileSync("./local.pdf"));
35
+ ```
36
+
37
+ ## Text grouping
38
+
39
+ Raw `TextElement[]` contains individual glyphs or characters. Use the grouping helpers to reconstruct human-readable text:
40
+
41
+ ### Lines + words in one call
42
+
43
+ ```typescript
44
+ import { extractPDF, extractTextStructure } from "pdf-metadata-extractor";
45
+
46
+ const result = await extractPDF("./document.pdf");
47
+ for (const page of result.pages) {
48
+ const lines = extractTextStructure(page.textElements);
49
+ for (const line of lines) {
50
+ console.log(line.text); // full line string
51
+ for (const word of line.words) {
52
+ console.log(word.text, word.x, word.y, word.fontSize, word.fontFamily);
53
+ }
54
+ }
55
+ }
56
+ ```
57
+
58
+ ### Words only (flat list)
59
+
60
+ ```typescript
61
+ import { extractWords } from "pdf-metadata-extractor";
62
+
63
+ const words = extractWords(page.textElements);
64
+ // returns TextWord[] in reading order (top-to-bottom, left-to-right)
65
+ ```
66
+
67
+ ### Step by step
68
+
69
+ ```typescript
70
+ import { groupIntoLines, groupIntoWords } from "pdf-metadata-extractor";
71
+
72
+ const lines = groupIntoLines(page.textElements); // TextLine[]
73
+ const words = groupIntoWords(lines[0].elements); // TextWord[]
74
+ ```
75
+
76
+ ## Working with graphics
77
+
78
+ ### Colored rectangles and paths
79
+
80
+ ```typescript
81
+ for (const rect of page.rectElements) {
82
+ console.log(rect.x, rect.y, rect.width, rect.height);
83
+ console.log(rect.fillColor); // RGB | null e.g. { r: 244, g: 233, b: 215 }
84
+ console.log(rect.strokeColor); // RGB | null
85
+ }
86
+ ```
87
+
88
+ ### Images
89
+
90
+ ```typescript
91
+ for (const img of page.imageElements) {
92
+ console.log(img.name); // pdfjs internal XObject name
93
+ console.log(img.x, img.y, img.width, img.height); // display bounding box (pts)
94
+ console.log(img.imageWidth, img.imageHeight); // source pixel dimensions
95
+ console.log(img.colorSpace, img.filter); // e.g. "ICCBased", "DCTDecode"
96
+ }
97
+ ```
98
+
99
+ ### Graphic summary
100
+
101
+ ```typescript
102
+ const { imageCount, vectorCount } = page.graphicSummary;
103
+ ```
104
+
105
+ ---
106
+
107
+ ## API
108
+
109
+ ### `extractPDF(input, options?)`
110
+
111
+ | Parameter | Type | Description |
112
+ |-----------|------|-------------|
113
+ | `input` | `string \| Buffer` | File path, https URL, or raw Buffer |
114
+ | `options.loadExif` | `boolean` | (reserved, not yet active) |
115
+
116
+ Returns `Promise<PDFResult>`.
117
+
118
+ ---
119
+
120
+ ### `extractTextStructure(elements, lineTolerance?, gapFactor?)`
121
+
122
+ Groups raw `TextElement[]` into lines with words nested inside.
123
+
124
+ | Parameter | Type | Default | Description |
125
+ |-----------|------|---------|-------------|
126
+ | `elements` | `TextElement[]` | — | Raw elements from `page.textElements` |
127
+ | `lineTolerance` | `number` | `2` | Max Y-delta (pts) to treat two elements as the same line |
128
+ | `gapFactor` | `number` | `0.4` | Word-gap threshold as a fraction of `fontSize` |
129
+
130
+ Returns `TextLineWithWords[]`.
131
+
132
+ ---
133
+
134
+ ### `extractWords(elements, lineTolerance?, gapFactor?)`
135
+
136
+ Convenience wrapper: `groupIntoLines` → flatMap `groupIntoWords`.
137
+
138
+ Returns `TextWord[]` in reading order.
139
+
140
+ ---
141
+
142
+ ### `groupIntoLines(elements, tolerance?)`
143
+
144
+ Bucket elements by Y coordinate (within `tolerance` pts), sort top-to-bottom, left-to-right.
145
+
146
+ Returns `TextLine[]`.
147
+
148
+ ---
149
+
150
+ ### `groupIntoWords(elements, gapFactor?)`
151
+
152
+ Split a single line's elements into words by detecting:
153
+ - Explicit whitespace elements (word boundary unless letter-spacing heuristic applies)
154
+ - Large X gaps (`gap > fontSize × gapFactor`)
155
+
156
+ **Letter-spacing heuristic**: a space element sandwiched between two single-character elements is treated as decorative letter-spacing (e.g. Canva-generated PDFs) and merged into the current word rather than creating a word boundary.
157
+
158
+ Returns `TextWord[]`.
159
+
160
+ ---
161
+
162
+ ### `getBoundingBox(elements)`
163
+
164
+ Returns the tight `BoundingBox` (x, y, width, height) that encloses all elements, or `null` if the list is empty.
165
+
166
+ ---
167
+
168
+ ### `filterByRegion(elements, box)`
169
+
170
+ Filter elements whose top-left point falls inside the given `BoundingBox`.
171
+
172
+ ---
173
+
174
+ ### Color utilities
175
+
176
+ ```typescript
177
+ import { rgbFromArray, rgbToHex, BLACK, WHITE } from "pdf-metadata-extractor";
178
+
179
+ rgbFromArray([0.2, 0.4, 0.6]); // { r: 51, g: 102, b: 153 }
180
+ rgbToHex({ r: 255, g: 0, b: 0 }); // "#ff0000"
181
+ ```
182
+
183
+ ---
184
+
185
+ ### Matrix utilities
186
+
187
+ ```typescript
188
+ import { getFontSizeFromMatrix, getXFromMatrix, getYFromMatrix } from "pdf-metadata-extractor";
189
+ ```
190
+
191
+ ---
192
+
193
+ ## Types
194
+
195
+ ### `PDFResult`
196
+
197
+ ```typescript
198
+ interface PDFResult {
199
+ file?: string; // basename of the input when it is a string (file path or URL); undefined for Buffer input
200
+ totalPages: number;
201
+ source: string; // detected creator app ("Word", "Canva", "Inkscape", …)
202
+ isPrintPDF: boolean; // true if produced by a print driver
203
+ info: Record<string, unknown>; // normalized PDF metadata (Title, Author, Creator, …)
204
+ fonts: FontInfo[]; // deduplicated font list for the whole document
205
+ pages: PageResult[];
206
+ }
207
+ ```
208
+
209
+ ### `PageResult`
210
+
211
+ ```typescript
212
+ interface PageResult {
213
+ pageNumber: number; // 1-based
214
+ width: number; // pts
215
+ height: number; // pts
216
+ pageType: "text" | "image" | "hybrid" | "vector" | "unknown";
217
+ elements: PageElement[]; // all elements combined (text + rect + path + image)
218
+ textElements: TextElement[];
219
+ imageElements: ImageElement[];
220
+ rectElements: RectElement[];
221
+ pathElements: PathElement[];
222
+ xobjectElements: XObjectElement[];
223
+ graphicSummary: GraphicSummary;
224
+ }
225
+ ```
226
+
227
+ ### `TextElement`
228
+
229
+ ```typescript
230
+ interface TextElement {
231
+ type: "text";
232
+ text: string;
233
+ x: number;
234
+ y: number;
235
+ width: number;
236
+ height: number;
237
+ fontSize: number;
238
+ fontFamily: string | null;
239
+ fontStyle: string | null; // "italic" | "normal" | null
240
+ fontWeight: number | null; // 400 | 700 | null
241
+ fontRealName: string | null; // e.g. "OpenSans-Regular"
242
+ fontSubtype: string | null; // "Type1" | "TrueType" | "CIDFontType2" | …
243
+ isSubsetFont: boolean | null;
244
+ color: RGB;
245
+ }
246
+ ```
247
+
248
+ ### `RectElement`
249
+
250
+ ```typescript
251
+ interface RectElement {
252
+ type: "rect";
253
+ x: number;
254
+ y: number;
255
+ width: number;
256
+ height: number;
257
+ fillColor: RGB | null;
258
+ strokeColor: RGB | null;
259
+ strokeWidth: number | null;
260
+ }
261
+ ```
262
+
263
+ ### `PathElement`
264
+
265
+ ```typescript
266
+ interface PathElement {
267
+ type: "path";
268
+ x: number;
269
+ y: number;
270
+ width: number;
271
+ height: number;
272
+ fillColor: RGB | null;
273
+ strokeColor: RGB | null;
274
+ strokeWidth: number | null;
275
+ }
276
+ ```
277
+
278
+ ### `ImageElement`
279
+
280
+ ```typescript
281
+ interface ImageElement {
282
+ type: "image";
283
+ name: string; // XObject resource name from the PDF
284
+ x: number; // display position (pts, top-left origin)
285
+ y: number;
286
+ width: number; // display size (pts)
287
+ height: number;
288
+ imageWidth: number | undefined; // source pixel width
289
+ imageHeight: number | undefined; // source pixel height
290
+ colorSpace: string | null | undefined; // e.g. "ICCBased", "DeviceRGB"
291
+ bitsPerComponent: number | undefined;
292
+ filter: string | null | undefined; // e.g. "DCTDecode", "FlateDecode"
293
+ imageMask: boolean | undefined;
294
+ }
295
+ ```
296
+
297
+ ### `TextWord`
298
+
299
+ ```typescript
300
+ interface TextWord {
301
+ text: string;
302
+ x: number;
303
+ y: number;
304
+ width: number;
305
+ height: number;
306
+ fontSize: number;
307
+ fontRealName: string | null;
308
+ fontFamily: string | null;
309
+ fontStyle: string | null;
310
+ fontWeight: number | null;
311
+ color: RGB;
312
+ elements: TextElement[]; // constituent raw elements
313
+ }
314
+ ```
315
+
316
+ ### `TextLine` / `TextLineWithWords`
317
+
318
+ ```typescript
319
+ interface TextLine {
320
+ y: number; // representative Y coordinate of the line
321
+ text: string; // full line text (joined elements)
322
+ elements: TextElement[];
323
+ }
324
+
325
+ interface TextLineWithWords extends TextLine {
326
+ words: TextWord[];
327
+ }
328
+ ```
329
+
330
+ ### `FontInfo`
331
+
332
+ ```typescript
333
+ interface FontInfo {
334
+ key: string; // PDF resource key ("F4", "f-0-0", …)
335
+ realName: string | null; // "OpenSans-Regular"
336
+ baseFontRaw: string | null; // raw /BaseFont value (may include subset prefix)
337
+ isSubset: boolean; // true if baseFontRaw starts with "XXXXXX+"
338
+ subtype: string | null; // "TrueType" | "Type1" | "CIDFontType2" | …
339
+ encoding: string | null;
340
+ fontFamily: string | null;
341
+ fontStyle: string | null;
342
+ fontWeight: number | null;
343
+ italicAngle: number | null;
344
+ }
345
+ ```
346
+
347
+ ### `GraphicSummary`
348
+
349
+ ```typescript
350
+ interface GraphicSummary {
351
+ vectorCount: number; // total rect + path elements on the page
352
+ imageCount: number; // total image elements on the page
353
+ }
354
+ ```
355
+
356
+ ---
357
+
358
+ ## JSON output example
359
+
360
+ ```json
361
+ {
362
+ "file": "document.pdf",
363
+ "totalPages": 1,
364
+ "source": "Canva",
365
+ "isPrintPDF": false,
366
+ "fonts": [
367
+ {
368
+ "key": "F4",
369
+ "realName": "OpenSans-Regular",
370
+ "isSubset": true,
371
+ "subtype": "TrueType",
372
+ "fontFamily": "OpenSans",
373
+ "fontStyle": "normal",
374
+ "fontWeight": 400
375
+ }
376
+ ],
377
+ "pages": [
378
+ {
379
+ "pageNumber": 1,
380
+ "width": 595.28,
381
+ "height": 841.89,
382
+ "pageType": "hybrid",
383
+ "graphicSummary": { "vectorCount": 22, "imageCount": 1 },
384
+ "lines": [
385
+ {
386
+ "y": 740,
387
+ "text": "Hello World",
388
+ "words": [
389
+ {
390
+ "text": "Hello",
391
+ "x": 72, "y": 740,
392
+ "width": 42.5, "height": 14,
393
+ "fontSize": 14,
394
+ "fontFamily": "OpenSans",
395
+ "fontStyle": "normal",
396
+ "fontWeight": 400,
397
+ "color": { "r": 0, "g": 0, "b": 0 }
398
+ }
399
+ ]
400
+ }
401
+ ]
402
+ }
403
+ ]
404
+ }
405
+ ```
406
+
407
+ ---
408
+
409
+ ## Development
410
+
411
+ ```bash
412
+ # Install dependencies
413
+ pnpm install
414
+
415
+ # Build
416
+ pnpm run build
417
+
418
+ # Run tests
419
+ pnpm run test
420
+
421
+ # Run example (requires Node 18+)
422
+ pnpm run example
423
+ ```
424
+
425
+ ## License
426
+
427
+ MIT
package/dist/core/extractor.d.ts ADDED
@@ -0,0 +1,3 @@
1
+ import { PDFResult, ExtractOptions } from "../types";
2
+ export declare function extractPDF(input: string | Buffer, options?: ExtractOptions): Promise<PDFResult>;
3
+ //# sourceMappingURL=extractor.d.ts.map
package/dist/core/extractor.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":"AAUA,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAWrD,wBAAsB,UAAU,CAC9B,KAAK,EAAE,MAAM,GAAG,MAAM,EACtB,OAAO,GAAE,cAAmB,GAC3B,OAAO,CAAC,SAAS,CAAC,CAmDpB"}
package/dist/core/extractor.js ADDED
@@ -0,0 +1,87 @@
1
+ "use strict";
2
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
3
+ if (k2 === undefined) k2 = k;
4
+ var desc = Object.getOwnPropertyDescriptor(m, k);
5
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
6
+ desc = { enumerable: true, get: function() { return m[k]; } };
7
+ }
8
+ Object.defineProperty(o, k2, desc);
9
+ }) : (function(o, m, k, k2) {
10
+ if (k2 === undefined) k2 = k;
11
+ o[k2] = m[k];
12
+ }));
13
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
14
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
15
+ }) : function(o, v) {
16
+ o["default"] = v;
17
+ });
18
+ var __importStar = (this && this.__importStar) || (function () {
19
+ var ownKeys = function(o) {
20
+ ownKeys = Object.getOwnPropertyNames || function (o) {
21
+ var ar = [];
22
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
23
+ return ar;
24
+ };
25
+ return ownKeys(o);
26
+ };
27
+ return function (mod) {
28
+ if (mod && mod.__esModule) return mod;
29
+ var result = {};
30
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
31
+ __setModuleDefault(result, mod);
32
+ return result;
33
+ };
34
+ })();
35
+ var __importDefault = (this && this.__importDefault) || function (mod) {
36
+ return (mod && mod.__esModule) ? mod : { "default": mod };
37
+ };
38
+ Object.defineProperty(exports, "__esModule", { value: true });
39
+ exports.extractPDF = extractPDF;
40
+ const pdfjsLib = __importStar(require("pdfjs-dist/legacy/build/pdf.mjs"));
41
+ const pdf_lib_1 = require("pdf-lib");
42
+ const path_1 = __importDefault(require("path"));
43
+ const url_1 = require("url");
44
+ const loader_1 = require("../pdf/loader");
45
+ const fonts_1 = require("../pdf/fonts");
46
+ const metadata_1 = require("../pdf/metadata");
47
+ const pageProcessor_1 = require("./pageProcessor");
48
+ const sourceDetector_1 = require("./sourceDetector");
49
+ // pdfjs v5 fake-worker mode: the library dynamically imports workerSrc on the
50
+ // same thread when no real Web Worker is available. An empty string no longer
51
+ // works — we must supply a resolvable file:// URL so Node's dynamic import()
52
+ // can load the worker module.
53
+ // eslint-disable-next-line @typescript-eslint/no-require-imports
54
+ const _workerPath = require.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
55
+ pdfjsLib.GlobalWorkerOptions.workerSrc =
56
+ (0, url_1.pathToFileURL)(_workerPath).href;
57
+ async function extractPDF(input, options = {}) { // Loads a PDF from `input`, parses it with pdfjs and pdf-lib, and resolves to the result object built at the end.
58
+ void options; // options is accepted but not otherwise used in this function
59
+ const buffer = await (0, loader_1.loadInput)(input); // loadInput comes from ../pdf/loader; its result feeds both parsers below
60
+ const loadingTask = pdfjsLib.getDocument({
61
+ data: new Uint8Array(buffer), // pdfjs receives a Uint8Array constructed from buffer
62
+ useSystemFonts: true,
63
+ });
64
+ const pdf = await loadingTask.promise; // pdfjs document: used for metadata, numPages and getPage
65
+ const pdfDoc = await pdf_lib_1.PDFDocument.load(buffer, { ignoreEncryption: true }); // second parse of the same buffer with pdf-lib
66
+ const rawMeta = await pdf.getMetadata().catch(() => ({})); // a getMetadata() rejection is replaced with {}
67
+ const meta = rawMeta;
68
+ const metaInfo = (0, metadata_1.normalizeMetaInfo)(meta?.info ?? {}); // a missing info field falls back to {}
69
+ // realFontMap: { "F4": FontInfo, "F7": FontInfo, … }
70
+ const realFontMap = await (0, fonts_1.extractFonts)(pdfDoc);
71
+ const pages = [];
72
+ for (let i = 1; i <= pdf.numPages; i++) { // pages are processed one at a time, in order, i = 1..numPages
73
+ const page = await pdf.getPage(i);
74
+ const pageData = await (0, pageProcessor_1.processPage)(page, i, pdfDoc, i - 1, realFontMap); // i is passed as pageNumber, i - 1 as pageIndex0
75
+ pages.push(pageData);
76
+ }
77
+ return {
78
+ file: typeof input === "string" ? path_1.default.basename(input) : undefined, // basename for any string input; undefined otherwise
79
+ totalPages: pdf.numPages,
80
+ source: (0, sourceDetector_1.detectSource)(metaInfo, null), // the second argument is always null here
81
+ isPrintPDF: (0, sourceDetector_1.detectPrintPDF)(metaInfo),
82
+ info: metaInfo, // the normalized metadata, not rawMeta
83
+ fonts: Object.values(realFontMap), // only the map's values; resource keys are dropped
84
+ pages,
85
+ };
86
+ }
87
+ //# sourceMappingURL=extractor.js.map
package/dist/core/extractor.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAqBA,gCAsDC;AA3ED,0EAA4D;AAC5D,qCAAsC;AACtC,gDAAwB;AACxB,6BAAoC;AAEpC,0CAA0C;AAC1C,wCAA4C;AAC5C,8CAAoD;AACpD,mDAA8C;AAC9C,qDAAgE;AAGhE,8EAA8E;AAC9E,+EAA+E;AAC/E,6EAA6E;AAC7E,8BAA8B;AAC9B,iEAAiE;AACjE,MAAM,WAAW,GAAI,OAAuB,CAAC,OAAO,CAAC,wCAAwC,CAAC,CAAC;AAC9F,QAAsE,CAAC,mBAAmB,CAAC,SAAS;IACnG,IAAA,mBAAa,EAAC,WAAW,CAAC,CAAC,IAAI,CAAC;AAE3B,KAAK,UAAU,UAAU,CAC9B,KAAsB,EACtB,UAA0B,EAAE;IAE5B,KAAK,OAAO,CAAC;IAEb,MAAM,MAAM,GAAG,MAAM,IAAA,kBAAS,EAAC,KAAK,CAAC,CAAC;IAEtC,MAAM,WAAW,GAAI,QAEnB,CAAC,WAAW,CAAC;QACb,IAAI,EAAE,IAAI,UAAU,CAAC,MAAM,CAAC;QAC5B,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;IAEH,MAAM,GAAG,GAAG,MAAM,WAAW,CAAC,OAI7B,CAAC;IAEF,MAAM,MAAM,GAAG,MAAM,qBAAW,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,gBAAgB,EAAE,IAAI,EAAE,CAAC,CAAC;IAE1E,MAAM,OAAO,GAAI,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC3D,MAAM,IAAI,GAAO,OAAiD,CAAC;IACnE,MAAM,QAAQ,GAAG,IAAA,4BAAiB,EAAC,IAAI,EAAE,IAAI,IAAI,EAAE,CAAC,CAAC;IAErD,qDAAqD;IACrD,MAAM,WAAW,GAAG,MAAM,IAAA,oBAAY,EAAC,MAAM,CAAC,CAAC;IAE/C,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAElC,MAAM,QAAQ,GAAG,MAAM,IAAA,2BAAW,EAChC,IAAyC,EACzC,CAAC,EACD,MAAM,EACN,CAAC,GAAG,CAAC,EACL,WAAW,CACZ,CAAC;QACF,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;IAED,OAAO;QACL,IAAI,EAAQ,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,cAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS;QACxE,UAAU,EAAE,GAAG,CAAC,QAAQ;QACxB,MAAM,EAAM,IAAA,6BAAY,EAAC,QAAQ,EAAE,IAAI,CAAC;QACxC,UAAU,EAAE,IAAA,+BAAc,EAAC,QAAQ,CAAC;QACpC,IAAI,EAAQ,QAAmC;QAC/C,KAAK,EAAO,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC;QACtC,KAAK;KACN,CAAC;AACJ,CAAC"}
package/dist/core/pageProcessor.d.ts ADDED
@@ -0,0 +1,30 @@
1
+ import { PageResult, FontInfo } from "../types";
2
+ import { PDFDocument } from "pdf-lib";
3
+ interface PDFOperatorList { // shape of the value resolved by PDFJSPage.getOperatorList()
4
+ fnArray: number[]; // presumably operator codes, index-aligned with argsArray — TODO confirm against pageProcessor.js
5
+ argsArray: unknown[][]; // untyped argument lists; callers must narrow before use
6
+ }
7
+ interface PDFTextItem { // element type of PDFTextContent.items
8
+ str: string;
9
+ transform: number[];
10
+ fontName?: string;
11
+ width?: number;
12
+ height?: number;
13
+ }
14
+ interface PDFTextContent { // shape of the value resolved by PDFJSPage.getTextContent()
15
+ items: PDFTextItem[];
16
+ }
17
+ interface PDFViewport { // return type of PDFJSPage.getViewport()
18
+ width: number;
19
+ height: number;
20
+ }
21
+ interface PDFJSPage { // structural type required of processPage's pdfjsPage argument
22
+ getViewport(options: {
23
+ scale: number;
24
+ }): PDFViewport;
25
+ getTextContent(): Promise<PDFTextContent>;
26
+ getOperatorList(): Promise<PDFOperatorList>;
27
+ }
28
+ export declare function processPage(pdfjsPage: PDFJSPage, pageNumber: number, pdfDoc: PDFDocument, pageIndex0: number, realFontMap?: Record<string, FontInfo>): Promise<PageResult>; // resolves to the PageResult for one page; realFontMap may be omitted
29
+ export {};
30
+ //# sourceMappingURL=pageProcessor.d.ts.map
package/dist/core/pageProcessor.d.ts.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"pageProcessor.d.ts","sourceRoot":"","sources":["../../src/core/pageProcessor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAA4D,QAAQ,EAAE,MAAM,UAAU,CAAC;AAG1G,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAuJtC,UAAU,eAAe;IAAG,OAAO,EAAE,MAAM,EAAE,CAAC;IAAC,SAAS,EAAE,OAAO,EAAE,EAAE,CAAA;CAAE;AACvE,UAAU,WAAW;IAAO,GAAG,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,EAAE,CAAC;IAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE;AAClH,UAAU,cAAc;IAAI,KAAK,EAAE,WAAW,EAAE,CAAA;CAAE;AAClD,UAAU,WAAW;IAAO,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE;AAC3D,UAAU,SAAS;IACjB,WAAW,CAAC,OAAO,EAAE;QAAE,KAAK,EAAE,MAAM,CAAA;KAAE,GAAG,WAAW,CAAC;IACrD,cAAc,IAAI,OAAO,CAAC,cAAc,CAAC,CAAC;IAC1C,eAAe,IAAI,OAAO,CAAC,eAAe,CAAC,CAAC;CAC7C;AAID,wBAAsB,WAAW,CAC/B,SAAS,EAAE,SAAS,EACpB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAO,WAAW,EACxB,UAAU,EAAG,MAAM,EACnB,WAAW,GAAE,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAM,GACzC,OAAO,CAAC,UAAU,CAAC,CA4WrB"}