npm - pdf-metadata-extractor - Versions diffs - 1.1.0 - Mend

pdf-metadata-extractor 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/CHANGELOG.md +91 -0
package/LICENSE +21 -0
package/README.md +427 -0
package/dist/core/extractor.d.ts +3 -0
package/dist/core/extractor.d.ts.map +1 -0
package/dist/core/extractor.js +87 -0
package/dist/core/extractor.js.map +1 -0
package/dist/core/pageProcessor.d.ts +30 -0
package/dist/core/pageProcessor.d.ts.map +1 -0
package/dist/core/pageProcessor.js +480 -0
package/dist/core/pageProcessor.js.map +1 -0
package/dist/core/sourceDetector.d.ts +4 -0
package/dist/core/sourceDetector.d.ts.map +1 -0
package/dist/core/sourceDetector.js +33 -0
package/dist/core/sourceDetector.js.map +1 -0
package/dist/fetch/fetchBuffer.d.ts +2 -0
package/dist/fetch/fetchBuffer.d.ts.map +1 -0
package/dist/fetch/fetchBuffer.js +12 -0
package/dist/fetch/fetchBuffer.js.map +1 -0
package/dist/index.d.ts +8 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +42 -0
package/dist/index.js.map +1 -0
package/dist/parser/streamParser.d.ts +34 -0
package/dist/parser/streamParser.d.ts.map +1 -0
package/dist/parser/streamParser.js +191 -0
package/dist/parser/streamParser.js.map +1 -0
package/dist/parser/textParser.d.ts +56 -0
package/dist/parser/textParser.d.ts.map +1 -0
package/dist/parser/textParser.js +175 -0
package/dist/parser/textParser.js.map +1 -0
package/dist/pdf/fonts.d.ts +4 -0
package/dist/pdf/fonts.d.ts.map +1 -0
package/dist/pdf/fonts.js +113 -0
package/dist/pdf/fonts.js.map +1 -0
package/dist/pdf/loader.d.ts +2 -0
package/dist/pdf/loader.d.ts.map +1 -0
package/dist/pdf/loader.js +18 -0
package/dist/pdf/loader.js.map +1 -0
package/dist/pdf/metadata.d.ts +13 -0
package/dist/pdf/metadata.d.ts.map +1 -0
package/dist/pdf/metadata.js +9 -0
package/dist/pdf/metadata.js.map +1 -0
package/dist/pdf/xobjects.d.ts +12 -0
package/dist/pdf/xobjects.d.ts.map +1 -0
package/dist/pdf/xobjects.js +107 -0
package/dist/pdf/xobjects.js.map +1 -0
package/dist/types.d.ts +136 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +3 -0
package/dist/types.js.map +1 -0
package/dist/utils/buffer.d.ts +3 -0
package/dist/utils/buffer.d.ts.map +1 -0
package/dist/utils/buffer.js +11 -0
package/dist/utils/buffer.js.map +1 -0
package/dist/utils/color.d.ts +6 -0
package/dist/utils/color.d.ts.map +1 -0
package/dist/utils/color.js +21 -0
package/dist/utils/color.js.map +1 -0
package/dist/utils/matrix.d.ts +11 -0
package/dist/utils/matrix.d.ts.map +1 -0
package/dist/utils/matrix.js +22 -0
package/dist/utils/matrix.js.map +1 -0
package/package.json +61 -0

package/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,91 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+## [1.1.0] - 2026-02-26
+### Added
+#### Graphics extraction (new)
+- `ImageElement` — extracts embedded images with display bounding box (x, y, width, height in pts) and source metadata (imageWidth, imageHeight, colorSpace, bitsPerComponent, filter, imageMask)
+- `RectElement` — extracts rectangle paths with fillColor, strokeColor, strokeWidth; CTM (current transformation matrix) applied so coordinates are in page display space
+- `PathElement` — extracts non-rectangular paths (curves, polylines) as axis-aligned bounding boxes with fill/stroke color
+- `PageResult.rectElements`, `PageResult.pathElements`, `PageResult.imageElements` arrays
+- `PageResult.graphicSummary.vectorCount` and `imageCount` counters
+- `PageResult.elements` now includes all element types combined (text + rect + path + image)
+- Page type `"vector"` and `"hybrid"` now correctly classified when vector/image elements are present
+#### CTM tracking
+- Full current transformation matrix (save/restore/transform) tracked throughout operator list
+- Image bounding box derived from unit-square corners transformed through CTM
+- Rect/path corners transformed through CTM before computing axis-aligned bounding box
+### Changed
+#### pdfjs-dist upgrade: v3.11.174 → v5.4.624
+- Updated import to `pdfjs-dist/legacy/build/pdf.mjs` (legacy build still present in v5)
+- **Worker setup**: v5 fake-worker mode uses `await import(workerSrc)` internally; empty string no longer works. Fix: `pathToFileURL(require.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs")).href`
+- **Color format**: v5 worker normalises all color ops (gray, RGB, CMYK) into `setFillRGBColor` (OPS 59) / `setStrokeRGBColor` (OPS 58) with a single `"#rrggbb"` hex string argument. The old `[r, g, b]` integer triple format (v3) is supported as a fallback
+- **`constructPath` (OPS 91) argument format**: v5 changed from `[opsArray, coordsArray]` to `[renderFn, [Float32Array], [minX, minY, maxX, maxY]]`. The rendering op (fill/stroke/both) and pre-computed bounding box are now embedded in the single op. Detection: `typeof args[0] === "number"` distinguishes v5 from v3
+#### `processPage` signature change
+- Old: `processPage(pdfjsPage, pageNumber, bridgeMap, realFontMap)`
+- New: `processPage(pdfjsPage, pageNumber, pdfDoc, pageIndex0, realFontMap)` — `pdfDoc` and `pageIndex0` passed through so the function can call `extractXObjectInfo` and `buildFontBridge` internally
+### Fixed
+- Image XObject name mismatch: pdfjs renames XObjects internally (`img_p0_1` ≠ PDF key `X5`). Resolved with positional fallback — Nth image paint op maps to Nth Image XObject in dict insertion order
+- `PDFRawStream` vs `PDFDict`: image XObjects are streams not plain dicts; `lookupMaybe(ref, PDFDict)` threw for them. Fixed: `ctx.lookup(ref)` then `.dict` property
+- `PDFNumber.asNumber()` used instead of the private `.numberValue` field for XObject dimension extraction
+---
+## [1.0.0] - 2026-02-26
+### Added
+#### Core extraction
+- `extractPDF(input, options?)` — main entry point; accepts file path, https URL, or `Buffer`
+- Parallel loading via **pdfjs-dist** (text/operator extraction) and **pdf-lib** (font dict, content streams)
+- Page type classification: `"text"` | `"image"` | `"hybrid"` | `"vector"` | `"unknown"`
+- PDF metadata normalization (`Title`, `Author`, `Creator`, `Producer`, `CreationDate`, …)
+- Source/creator detection: `detectSource()` identifies Canva, Inkscape, Word, LibreOffice, etc.
+- Print PDF detection: `detectPrintPDF()` based on `Producer` field heuristics
+#### Font resolution (bridge map)
+- `extractFonts(pdfDoc)` — walks the PDF font dictionary to collect `FontInfo` for every resource key
+- `getContentStreamText(pdfDoc, pageIndex)` — decompresses page content streams (single or array); recursively follows Form XObject sub-streams via `/Name Do` operators (depth limit 4) to handle Inkscape-style PDFs where text lives inside XObjects
+- `streamFontOrder(streamText)` — extracts ordered unique font resource keys from `/FontKey size Tf` operators; regex `[^\s/\[\]<>(){}]+` correctly handles names with hyphens (e.g. `f-0-0`)
+- `buildFontBridge(streamText, pdfjsOrderedFonts)` — positional matching of pdfjs internal keys (`g_dN_fK`) to PDF resource keys (`F4`, `f-0-0`); robust against pdfjs global document counter incrementing across multiple extractions in the same process
+- Full font metadata on every `TextElement`: `fontFamily`, `fontStyle`, `fontWeight`, `fontRealName`, `fontSubtype`, `isSubsetFont`
+#### Color extraction
+- Color extracted from the pdfjs operator list and carried on every `TextElement` as `RGB`
+- Utilities: `rgbFromArray()`, `rgbToHex()`, `BLACK`, `WHITE` constants
+#### Text grouping (parser)
+- `groupIntoLines(elements, tolerance?)` — bucket elements by Y coordinate within `tolerance` pts; produces `TextLine[]` sorted top-to-bottom
+- `groupIntoWords(elements, gapFactor?)` — reconstruct words from glyph-level elements; handles two split triggers: explicit whitespace elements and large X gaps (`gap > fontSize × gapFactor`)
+  - Letter-spacing heuristic: a whitespace element between two single-character elements is treated as decorative letter-spacing (Canva-style) and merged rather than used as a word boundary
+- `extractWords(elements, lineTolerance?, gapFactor?)` — convenience wrapper; equivalent to `groupIntoLines(elements, lineTolerance).flatMap(line => groupIntoWords(line.elements, gapFactor))`; returns `TextWord[]` in reading order
+- `extractTextStructure(elements, lineTolerance?, gapFactor?)` — returns `TextLineWithWords[]` (full hierarchy: lines → words → raw elements)
+- `lineToString(elements)` — legacy helper; joins elements in X order
+#### Bounding box / region helpers
+- `getBoundingBox(elements)` — tight bounding box around a set of `TextElement[]`
+- `filterByRegion(elements, box)` — filter elements whose top-left falls inside a `BoundingBox`
+#### Matrix utilities
+- `getFontSizeFromMatrix(matrix)`, `getXFromMatrix(matrix)`, `getYFromMatrix(matrix)`
+#### Types
+- `RGB`, `FontInfo`
+- `TextElement`, `RectElement`, `PathElement`, `ImageElement`, `XObjectElement`, `PageElement`
+- `TextWord`, `TextLine`, `TextLineWithWords`
+- `GraphicSummary`, `PageResult`, `PDFResult`, `ExtractOptions`
+- `BoundingBox` (from `parser/streamParser`)
+### Technical notes
+- Uses **pdfjs-dist 3.11.174** legacy build with fake worker disabled for Node.js compatibility
+- Uses **pdf-lib 1.17.x** for font dictionary traversal and raw content stream access
+- Content stream decompression: `zlib.inflateSync` → `zlib.inflateRawSync` → raw latin1 fallback
+- Requires Node.js ≥ 18 (native `fetch` for URL loading)

package/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Kent Phung
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

package/README.md ADDED Viewed

@@ -0,0 +1,427 @@
+# pdf-metadata-extractor
+Extract text elements, fonts, colors, images, and vector graphics metadata from PDF files. Supports file paths, URLs, and Buffers. Includes text-grouping helpers to reconstruct lines and words from raw PDF glyph streams.
+## Requirements
+- Node.js **≥ 18** (uses native `fetch` and modern `zlib`)
+- pnpm (recommended) or npm
+## Installation
+```bash
+pnpm add pdf-metadata-extractor
+```
+## Quick start
+```typescript
+import { extractPDF } from "pdf-metadata-extractor";
+const result = await extractPDF("./document.pdf");
+console.log(result.totalPages);            // number of pages
+console.log(result.fonts);                 // font list with real names, family, style, weight
+console.log(result.pages[0].textElements); // raw TextElement[] for page 1
+console.log(result.pages[0].rectElements); // colored rectangles / shapes
+console.log(result.pages[0].imageElements); // embedded images with position + metadata
+```
+Input can be a **file path**, a **URL** (https), or a **Buffer**:
+```typescript
+await extractPDF("./local.pdf");
+await extractPDF("https://example.com/file.pdf");
+await extractPDF(fs.readFileSync("./local.pdf"));
+```
+## Text grouping
+Raw `TextElement[]` contains individual glyphs or characters. Use the grouping helpers to reconstruct human-readable text:
+### Lines + words in one call
+```typescript
+import { extractPDF, extractTextStructure } from "pdf-metadata-extractor";
+const result = await extractPDF("./document.pdf");
+for (const page of result.pages) {
+  const lines = extractTextStructure(page.textElements);
+  for (const line of lines) {
+    console.log(line.text);           // full line string
+    for (const word of line.words) {
+      console.log(word.text, word.x, word.y, word.fontSize, word.fontFamily);
+    }
+  }
+}
+```
+### Words only (flat list)
+```typescript
+import { extractWords } from "pdf-metadata-extractor";
+const words = extractWords(page.textElements);
+// returns TextWord[] in reading order (top-to-bottom, left-to-right)
+```
+### Step by step
+```typescript
+import { groupIntoLines, groupIntoWords } from "pdf-metadata-extractor";
+const lines = groupIntoLines(page.textElements);        // TextLine[]
+const words = groupIntoWords(lines[0].elements);        // TextWord[]
+```
+## Working with graphics
+### Colored rectangles and paths
+```typescript
+for (const rect of page.rectElements) {
+  console.log(rect.x, rect.y, rect.width, rect.height);
+  console.log(rect.fillColor);   // RGB | null  e.g. { r: 244, g: 233, b: 215 }
+  console.log(rect.strokeColor); // RGB | null
+}
+```
+### Images
+```typescript
+for (const img of page.imageElements) {
+  console.log(img.name);                          // pdfjs internal XObject name
+  console.log(img.x, img.y, img.width, img.height); // display bounding box (pts)
+  console.log(img.imageWidth, img.imageHeight);   // source pixel dimensions
+  console.log(img.colorSpace, img.filter);        // e.g. "ICCBased", "DCTDecode"
+}
+```
+### Graphic summary
+```typescript
+const { imageCount, vectorCount } = page.graphicSummary;
+```
+---
+## API
+### `extractPDF(input, options?)`
+| Parameter | Type | Description |
+|-----------|------|-------------|
+| `input` | `string \| Buffer` | File path, https URL, or raw Buffer |
+| `options.loadExif` | `boolean` | (reserved, not yet active) |
+Returns `Promise<PDFResult>`.
+---
+### `extractTextStructure(elements, lineTolerance?, gapFactor?)`
+Groups raw `TextElement[]` into lines with words nested inside.
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `elements` | `TextElement[]` | — | Raw elements from `page.textElements` |
+| `lineTolerance` | `number` | `2` | Max Y-delta (pts) to treat two elements as the same line |
+| `gapFactor` | `number` | `0.4` | Word-gap threshold as a fraction of `fontSize` |
+Returns `TextLineWithWords[]`.
+---
+### `extractWords(elements, lineTolerance?, gapFactor?)`
+Convenience wrapper: `groupIntoLines` → flatMap `groupIntoWords`.
+Returns `TextWord[]` in reading order.
+---
+### `groupIntoLines(elements, tolerance?)`
+Bucket elements by Y coordinate (within `tolerance` pts), sort top-to-bottom, left-to-right.
+Returns `TextLine[]`.
+---
+### `groupIntoWords(elements, gapFactor?)`
+Split a single line's elements into words by detecting:
+- Explicit whitespace elements (word boundary unless letter-spacing heuristic applies)
+- Large X gaps (`gap > fontSize × gapFactor`)
+**Letter-spacing heuristic**: a space element sandwiched between two single-character elements is treated as decorative letter-spacing (e.g. Canva-generated PDFs) and merged into the current word rather than creating a word boundary.
+Returns `TextWord[]`.
+---
+### `getBoundingBox(elements)`
+Returns the tight `BoundingBox` (x, y, width, height) that encloses all elements, or `null` if the list is empty.
+---
+### `filterByRegion(elements, box)`
+Filter elements whose top-left point falls inside the given `BoundingBox`.
+---
+### Color utilities
+```typescript
+import { rgbFromArray, rgbToHex, BLACK, WHITE } from "pdf-metadata-extractor";
+rgbFromArray([0.2, 0.4, 0.6]);   // { r: 51, g: 102, b: 153 }
+rgbToHex({ r: 255, g: 0, b: 0 }); // "#ff0000"
+```
+---
+### Matrix utilities
+```typescript
+import { getFontSizeFromMatrix, getXFromMatrix, getYFromMatrix } from "pdf-metadata-extractor";
+```
+---
+## Types
+### `PDFResult`
+```typescript
+interface PDFResult {
+  file?: string;            // basename of the source file (if path was given)
+  totalPages: number;
+  source: string;           // detected creator app ("Word", "Canva", "Inkscape", …)
+  isPrintPDF: boolean;      // true if produced by a print driver
+  info: Record<string, unknown>;  // raw PDF metadata (Title, Author, Creator, …)
+  fonts: FontInfo[];        // deduplicated font list for the whole document
+  pages: PageResult[];
+}
+```
+### `PageResult`
+```typescript
+interface PageResult {
+  pageNumber: number;       // 1-based
+  width: number;            // pts
+  height: number;           // pts
+  pageType: "text" | "image" | "hybrid" | "vector" | "unknown";
+  elements: PageElement[];  // all elements combined (text + rect + path + image)
+  textElements: TextElement[];
+  imageElements: ImageElement[];
+  rectElements: RectElement[];
+  pathElements: PathElement[];
+  xobjectElements: XObjectElement[];
+  graphicSummary: GraphicSummary;
+}
+```
+### `TextElement`
+```typescript
+interface TextElement {
+  type: "text";
+  text: string;
+  x: number;
+  y: number;
+  width: number;
+  height: number;
+  fontSize: number;
+  fontFamily: string | null;
+  fontStyle: string | null;      // "italic" | "normal" | null
+  fontWeight: number | null;     // 400 | 700 | null
+  fontRealName: string | null;   // e.g. "OpenSans-Regular"
+  fontSubtype: string | null;    // "Type1" | "TrueType" | "CIDFontType2" | …
+  isSubsetFont: boolean | null;
+  color: RGB;
+}
+```
+### `RectElement`
+```typescript
+interface RectElement {
+  type: "rect";
+  x: number;
+  y: number;
+  width: number;
+  height: number;
+  fillColor: RGB | null;
+  strokeColor: RGB | null;
+  strokeWidth: number | null;
+}
+```
+### `PathElement`
+```typescript
+interface PathElement {
+  type: "path";
+  x: number;
+  y: number;
+  width: number;
+  height: number;
+  fillColor: RGB | null;
+  strokeColor: RGB | null;
+  strokeWidth: number | null;
+}
+```
+### `ImageElement`
+```typescript
+interface ImageElement {
+  type: "image";
+  name: string;              // XObject resource name from the PDF
+  x: number;                 // display position (pts, top-left origin)
+  y: number;
+  width: number;             // display size (pts)
+  height: number;
+  imageWidth: number | undefined;      // source pixel width
+  imageHeight: number | undefined;     // source pixel height
+  colorSpace: string | null | undefined;  // e.g. "ICCBased", "DeviceRGB"
+  bitsPerComponent: number | undefined;
+  filter: string | null | undefined;  // e.g. "DCTDecode", "FlateDecode"
+  imageMask: boolean | undefined;
+}
+```
+### `TextWord`
+```typescript
+interface TextWord {
+  text: string;
+  x: number;
+  y: number;
+  width: number;
+  height: number;
+  fontSize: number;
+  fontRealName: string | null;
+  fontFamily: string | null;
+  fontStyle: string | null;
+  fontWeight: number | null;
+  color: RGB;
+  elements: TextElement[];   // constituent raw elements
+}
+```
+### `TextLine` / `TextLineWithWords`
+```typescript
+interface TextLine {
+  y: number;            // representative Y coordinate of the line
+  text: string;         // full line text (joined elements)
+  elements: TextElement[];
+}
+interface TextLineWithWords extends TextLine {
+  words: TextWord[];
+}
+```
+### `FontInfo`
+```typescript
+interface FontInfo {
+  key: string;           // PDF resource key ("F4", "f-0-0", …)
+  realName: string | null;     // "OpenSans-Regular"
+  baseFontRaw: string | null;  // raw /BaseFont value (may include subset prefix)
+  isSubset: boolean;     // true if baseFontRaw starts with "XXXXXX+"
+  subtype: string | null;      // "TrueType" | "Type1" | "CIDFontType2" | …
+  encoding: string | null;
+  fontFamily: string | null;
+  fontStyle: string | null;
+  fontWeight: number | null;
+  italicAngle: number | null;
+}
+```
+### `GraphicSummary`
+```typescript
+interface GraphicSummary {
+  vectorCount: number;   // total rect + path elements on the page
+  imageCount: number;    // total image elements on the page
+}
+```
+---
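+## JSON output example
+Abridged for brevity. The per-page `lines` key shown here is not a `PageResult` field; it has the shape returned by `extractTextStructure(page.textElements)`.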
+```json
+{
+  "file": "document.pdf",
+  "totalPages": 1,
+  "source": "Canva",
+  "isPrintPDF": false,
+  "fonts": [
+    {
+      "key": "F4",
+      "realName": "OpenSans-Regular",
+      "isSubset": true,
+      "subtype": "TrueType",
+      "fontFamily": "OpenSans",
+      "fontStyle": "normal",
+      "fontWeight": 400
+    }
+  ],
+  "pages": [
+    {
+      "pageNumber": 1,
+      "width": 595.28,
+      "height": 841.89,
+      "pageType": "hybrid",
+      "graphicSummary": { "vectorCount": 22, "imageCount": 1 },
+      "lines": [
+        {
+          "y": 740,
+          "text": "Hello World",
+          "words": [
+            {
+              "text": "Hello",
+              "x": 72, "y": 740,
+              "width": 42.5, "height": 14,
+              "fontSize": 14,
+              "fontFamily": "OpenSans",
+              "fontStyle": "normal",
+              "fontWeight": 400,
+              "color": { "r": 0, "g": 0, "b": 0 }
+            }
+          ]
+        }
+      ]
+    }
+  ]
+}
+```
+---
+## Development
+```bash
+# Install dependencies
+pnpm install
+# Build
+pnpm run build
+# Run tests
+pnpm run test
+# Run example (requires Node 18+)
+pnpm run example
+```
+## License
+MIT

package/dist/core/extractor.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+import { PDFResult, ExtractOptions } from "../types";
+/**
+ * Main entry point of the package.
+ *
+ * @param input   File path, https URL, or raw `Buffer` containing the PDF.
+ * @param options Reserved; the current implementation discards it.
+ * @returns The extracted document: metadata, detected source, fonts and pages.
+ */
+export declare function extractPDF(input: string | Buffer, options?: ExtractOptions): Promise<PDFResult>;
+//# sourceMappingURL=extractor.d.ts.map

package/dist/core/extractor.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"extractor.d.ts","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":"AAUA,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAWrD,wBAAsB,UAAU,CAC9B,KAAK,EAAE,MAAM,GAAG,MAAM,EACtB,OAAO,GAAE,cAAmB,GAC3B,OAAO,CAAC,SAAS,CAAC,CAmDpB"}

package/dist/core/extractor.js ADDED Viewed

@@ -0,0 +1,87 @@
+"use strict";
+// Compiler-emitted CommonJS interop helpers. Each one reuses an existing
+// `this.<helper>` when one is already defined and otherwise defines it here.
+//
+// __createBinding(o, m, k, k2): exposes m[k] on o under the name k2 (k2
+// defaults to k). Where Object.create exists it defines a property, swapping
+// in an enumerable forwarding getter when m has no own descriptor for k, when
+// the descriptor is an accessor and m is not flagged __esModule, or when it is
+// a writable/configurable data descriptor. Otherwise it copies the value.
+var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    var desc = Object.getOwnPropertyDescriptor(m, k);
+    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
+      desc = { enumerable: true, get: function() { return m[k]; } };
+    }
+    Object.defineProperty(o, k2, desc);
+}) : (function(o, m, k, k2) {
+    if (k2 === undefined) k2 = k;
+    o[k2] = m[k];
+}));
+// __setModuleDefault(o, v): stores v as the enumerable "default" property of o
+// (plain assignment when Object.create is unavailable).
+var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
+    Object.defineProperty(o, "default", { enumerable: true, value: v });
+}) : function(o, v) {
+    o["default"] = v;
+});
+// __importStar(mod): returns mod untouched when it is flagged __esModule.
+// Otherwise builds a new object that binds every own property name of mod
+// except "default", then sets its "default" to mod itself.
+var __importStar = (this && this.__importStar) || (function () {
+    // ownKeys replaces itself on first call with Object.getOwnPropertyNames,
+    // or with a for-in/hasOwnProperty fallback when that is missing.
+    var ownKeys = function(o) {
+        ownKeys = Object.getOwnPropertyNames || function (o) {
+            var ar = [];
+            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
+            return ar;
+        };
+        return ownKeys(o);
+    };
+    return function (mod) {
+        if (mod && mod.__esModule) return mod;
+        var result = {};
+        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
+        __setModuleDefault(result, mod);
+        return result;
+    };
+})();
+// __importDefault(mod): returns mod when it is flagged __esModule, otherwise
+// wraps it as { "default": mod }.
+var __importDefault = (this && this.__importDefault) || function (mod) {
+    return (mod && mod.__esModule) ? mod : { "default": mod };
+};
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.extractPDF = extractPDF;
+const pdfjsLib = __importStar(require("pdfjs-dist/legacy/build/pdf.mjs"));
+const pdf_lib_1 = require("pdf-lib");
+const path_1 = __importDefault(require("path"));
+const url_1 = require("url");
+const loader_1 = require("../pdf/loader");
+const fonts_1 = require("../pdf/fonts");
+const metadata_1 = require("../pdf/metadata");
+const pageProcessor_1 = require("./pageProcessor");
+const sourceDetector_1 = require("./sourceDetector");
+// pdfjs v5 fake-worker mode: the library dynamically imports workerSrc on the
+// same thread when no real Web Worker is available.  An empty string no longer
+// works — we must supply a resolvable file:// URL so Node's dynamic import()
+// can load the worker module.
+// eslint-disable-next-line @typescript-eslint/no-require-imports
+const _workerPath = require.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
+pdfjsLib.GlobalWorkerOptions.workerSrc =
+    (0, url_1.pathToFileURL)(_workerPath).href;
+async function extractPDF(input, options = {}) {
+    void options;
+    const buffer = await (0, loader_1.loadInput)(input);
+    const loadingTask = pdfjsLib.getDocument({
+        data: new Uint8Array(buffer),
+        useSystemFonts: true,
+    });
+    const pdf = await loadingTask.promise;
+    const pdfDoc = await pdf_lib_1.PDFDocument.load(buffer, { ignoreEncryption: true });
+    const rawMeta = await pdf.getMetadata().catch(() => ({}));
+    const meta = rawMeta;
+    const metaInfo = (0, metadata_1.normalizeMetaInfo)(meta?.info ?? {});
+    // realFontMap: { "F4": FontInfo, "F7": FontInfo, … }
+    const realFontMap = await (0, fonts_1.extractFonts)(pdfDoc);
+    const pages = [];
+    for (let i = 1; i <= pdf.numPages; i++) {
+        const page = await pdf.getPage(i);
+        const pageData = await (0, pageProcessor_1.processPage)(page, i, pdfDoc, i - 1, realFontMap);
+        pages.push(pageData);
+    }
+    return {
+        file: typeof input === "string" ? path_1.default.basename(input) : undefined,
+        totalPages: pdf.numPages,
+        source: (0, sourceDetector_1.detectSource)(metaInfo, null),
+        isPrintPDF: (0, sourceDetector_1.detectPrintPDF)(metaInfo),
+        info: metaInfo,
+        fonts: Object.values(realFontMap),
+        pages,
+    };
+}
+//# sourceMappingURL=extractor.js.map

package/dist/core/extractor.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"extractor.js","sourceRoot":"","sources":["../../src/core/extractor.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAqBA,gCAsDC;AA3ED,0EAA4D;AAC5D,qCAAsC;AACtC,gDAAwB;AACxB,6BAAoC;AAEpC,0CAA0C;AAC1C,wCAA4C;AAC5C,8CAAoD;AACpD,mDAA8C;AAC9C,qDAAgE;AAGhE,8EAA8E;AAC9E,+EAA+E;AAC/E,6EAA6E;AAC7E,8BAA8B;AAC9B,iEAAiE;AACjE,MAAM,WAAW,GAAI,OAAuB,CAAC,OAAO,CAAC,wCAAwC,CAAC,CAAC;AAC9F,QAAsE,CAAC,mBAAmB,CAAC,SAAS;IACnG,IAAA,mBAAa,EAAC,WAAW,CAAC,CAAC,IAAI,CAAC;AAE3B,KAAK,UAAU,UAAU,CAC9B,KAAsB,EACtB,UAA0B,EAAE;IAE5B,KAAK,OAAO,CAAC;IAEb,MAAM,MAAM,GAAG,MAAM,IAAA,kBAAS,EAAC,KAAK,CAAC,CAAC;IAEtC,MAAM,WAAW,GAAI,QAEnB,CAAC,WAAW,CAAC;QACb,IAAI,EAAE,IAAI,UAAU,CAAC,MAAM,CAAC;QAC5B,cAAc,EAAE,IAAI;KACrB,CAAC,CAAC;IAEH,MAAM,GAAG,GAAG,MAAM,WAAW,CAAC,OAI7B,CAAC;IAEF,MAAM,MAAM,GAAG,MAAM,qBAAW,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,gBAAgB,EAAE,IAAI,EAAE,CAAC,CAAC;IAE1E,MAAM,OAAO,GAAI,MAAM,GAAG,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC3D,MAAM,IAAI,GAAO,OAAiD,CAAC;IACnE,MAAM,QAAQ,GAAG,IAAA,4BAAiB,EAAC,IAAI,EAAE,IAAI,IAAI,EAAE,CAAC,CAAC;IAErD,qDAAqD;IACrD,MAAM,WAAW,GAAG,MAAM,IAAA,oBAAY,EAAC,MAAM,CAAC,CAAC;IAE/C,MAAM,KAAK,GAAG,EAAE,CAAC;IAEjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,GAAG,CAAC,QAAQ,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,IAAI,GAAG,MAAM,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC;QAElC,MAAM,QAAQ,GAAG,MAAM,IAAA,2BAAW,EAChC,IAAyC,EACzC,CAAC,EACD,MAAM,EACN,CAAC,GAAG,CAAC,EACL,WAAW,CACZ,CAAC;QACF,KAAK,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACvB,CAAC;IAED,OAAO;QACL,IAAI,EAAQ,OAAO,KAAK,KAAK,QAAQ,CAAC,CAAC,CAAC,cAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,SAAS;QACxE,UAAU,EAAE,GAAG,CAAC,QAAQ;QACxB,MAAM,EAAM,IAAA,6BAAY,EAAC,QAAQ,EAAE,IAAI,CAAC;QACxC,UAAU,EAAE,IAAA,+BAAc,EAAC,QAAQ,CAAC;QACpC,IAAI,EAAQ,QAAmC;QAC/C,KAAK,EAAO,MAAM,CAAC,MAAM,CAAC,WAAW,CAAC;QACtC,KAAK;KACN,CAAC;AACJ,CAAC"}

package/dist/core/pageProcessor.d.ts ADDED Viewed

@@ -0,0 +1,30 @@
+import { PageResult, FontInfo } from "../types";
+import { PDFDocument } from "pdf-lib";
+/**
+ * Module-local structural types (not exported) describing the parts of a
+ * pdfjs page object that `processPage` accepts.
+ */
+/** Result of `getOperatorList()`: parallel arrays of operator codes and their arguments. */
+interface PDFOperatorList {
+    fnArray: number[];
+    argsArray: unknown[][];
+}
+/** One item of text content. NOTE(review): `transform` is presumably the pdfjs text matrix — confirm its layout. */
+interface PDFTextItem {
+    str: string;
+    transform: number[];
+    fontName?: string;
+    width?: number;
+    height?: number;
+}
+/** Result of `getTextContent()`. */
+interface PDFTextContent {
+    items: PDFTextItem[];
+}
+/** Page dimensions as returned by `getViewport()`. */
+interface PDFViewport {
+    width: number;
+    height: number;
+}
+/** Minimal page surface required by `processPage`. */
+interface PDFJSPage {
+    getViewport(options: {
+        scale: number;
+    }): PDFViewport;
+    getTextContent(): Promise<PDFTextContent>;
+    getOperatorList(): Promise<PDFOperatorList>;
+}
+/**
+ * Processes a single page into a `PageResult`.
+ *
+ * @param pdfjsPage   Page object obtained from pdfjs.
+ * @param pageNumber  Page number; the extractor passes 1-based values.
+ * @param pdfDoc      The same document loaded through pdf-lib.
+ * @param pageIndex0  Zero-based index of the page (the extractor passes `pageNumber - 1`).
+ * @param realFontMap Optional map from PDF font resource key (e.g. "F4") to `FontInfo`.
+ */
+export declare function processPage(pdfjsPage: PDFJSPage, pageNumber: number, pdfDoc: PDFDocument, pageIndex0: number, realFontMap?: Record<string, FontInfo>): Promise<PageResult>;
+export {};
+//# sourceMappingURL=pageProcessor.d.ts.map

package/dist/core/pageProcessor.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"pageProcessor.d.ts","sourceRoot":"","sources":["../../src/core/pageProcessor.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,UAAU,EAA4D,QAAQ,EAAE,MAAM,UAAU,CAAC;AAG1G,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AAuJtC,UAAU,eAAe;IAAG,OAAO,EAAE,MAAM,EAAE,CAAC;IAAC,SAAS,EAAE,OAAO,EAAE,EAAE,CAAA;CAAE;AACvE,UAAU,WAAW;IAAO,GAAG,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,EAAE,CAAC;IAAC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAAC,KAAK,CAAC,EAAE,MAAM,CAAC;IAAC,MAAM,CAAC,EAAE,MAAM,CAAA;CAAE;AAClH,UAAU,cAAc;IAAI,KAAK,EAAE,WAAW,EAAE,CAAA;CAAE;AAClD,UAAU,WAAW;IAAO,KAAK,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE;AAC3D,UAAU,SAAS;IACjB,WAAW,CAAC,OAAO,EAAE;QAAE,KAAK,EAAE,MAAM,CAAA;KAAE,GAAG,WAAW,CAAC;IACrD,cAAc,IAAI,OAAO,CAAC,cAAc,CAAC,CAAC;IAC1C,eAAe,IAAI,OAAO,CAAC,eAAe,CAAC,CAAC;CAC7C;AAID,wBAAsB,WAAW,CAC/B,SAAS,EAAE,SAAS,EACpB,UAAU,EAAE,MAAM,EAClB,MAAM,EAAO,WAAW,EACxB,UAAU,EAAG,MAAM,EACnB,WAAW,GAAE,MAAM,CAAC,MAAM,EAAE,QAAQ,CAAM,GACzC,OAAO,CAAC,UAAU,CAAC,CA4WrB"}