firecrawl-pdf-inspector 0.2.2 → 0.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/README.md +99 -0
  2. package/package.json +16 -2
package/README.md ADDED
@@ -0,0 +1,99 @@
1
+ # firecrawl-pdf-inspector
2
+
3
+ Fast PDF classification and region-based text extraction for Node.js/Bun. Native Rust performance via [napi-rs](https://napi.rs).
4
+
5
+ Built by [Firecrawl](https://firecrawl.dev) for hybrid OCR pipelines — extract text from PDF structure where possible, fall back to OCR only when needed.
6
+
7
+ ## Install
8
+
9
+ ```bash
10
+ npm install firecrawl-pdf-inspector
11
+ # or
12
+ bun add firecrawl-pdf-inspector
13
+ ```
14
+
15
+ Prebuilt binaries are included for **linux-x64** and **macOS ARM64**. No Rust toolchain is needed.
16
+
17
+ ## API
18
+
19
+ ### `classifyPdf(buffer: Buffer): PdfClassification`
20
+
21
+ Classify a PDF as TextBased, Scanned, Mixed, or ImageBased (~10-50ms). Returns which pages need OCR.
22
+
23
+ ```typescript
24
+ import { classifyPdf } from 'firecrawl-pdf-inspector'
25
+ import { readFileSync } from 'fs'
26
+
27
+ const pdf = readFileSync('document.pdf')
28
+ const result = classifyPdf(pdf)
29
+
30
+ console.log(result.pdfType) // "TextBased" | "Scanned" | "Mixed" | "ImageBased"
31
+ console.log(result.pageCount) // 42
32
+ console.log(result.pagesNeedingOcr) // [5, 12, 15] (0-indexed)
33
+ console.log(result.confidence) // 0.875
34
+ ```
35
+
36
+ ### `extractTextInRegions(buffer: Buffer, pageRegions: PageRegions[]): PageRegionTexts[]`
37
+
38
+ Extract text within bounding-box regions from a PDF. It is designed for hybrid OCR pipelines in which a layout model detects regions in rendered page images and this function extracts text from the PDF structure for text-based pages, so those pages can skip GPU OCR.
39
+
40
+ Each region result includes a `needsOcr` flag that signals unreliable extraction (empty text, GID-encoded fonts, garbage text, encoding issues).
41
+
42
+ ```typescript
43
+ import { extractTextInRegions } from 'firecrawl-pdf-inspector'
44
+
45
+ const result = extractTextInRegions(pdf, [
46
+ {
47
+ page: 0, // 0-indexed
48
+ regions: [
49
+ [0, 0, 300, 400], // [x1, y1, x2, y2] in PDF points, top-left origin
50
+ [300, 0, 612, 400],
51
+ ]
52
+ }
53
+ ])
54
+
55
+ for (const region of result[0].regions) {
56
+ if (region.needsOcr) {
57
+ // Unreliable text — send this region to OCR instead
58
+ } else {
59
+ console.log(region.text) // Extracted text in reading order
60
+ }
61
+ }
62
+ ```
63
+
64
+ ## Types
65
+
66
+ ```typescript
67
+ interface PdfClassification {
68
+ pdfType: string // "TextBased" | "Scanned" | "Mixed" | "ImageBased"
69
+ pageCount: number
70
+ pagesNeedingOcr: number[] // 0-indexed page numbers
71
+ confidence: number // 0.0 - 1.0
72
+ }
73
+
74
+ interface PageRegions {
75
+ page: number // 0-indexed
76
+ regions: number[][] // [[x1, y1, x2, y2], ...] in PDF points, top-left origin
77
+ }
78
+
79
+ interface PageRegionTexts {
80
+ page: number
81
+ regions: RegionText[]
82
+ }
83
+
84
+ interface RegionText {
85
+ text: string
86
+ needsOcr: boolean // true when text is unreliable
87
+ }
88
+ ```
89
+
90
+ ## Platforms
91
+
92
+ | Platform | Architecture | Supported |
93
+ |----------|-------------|-----------|
94
+ | Linux | x64 | Yes |
95
+ | macOS | ARM64 | Yes |
96
+
97
+ ## License
98
+
99
+ MIT
package/package.json CHANGED
@@ -1,18 +1,32 @@
1
1
  {
2
2
  "name": "firecrawl-pdf-inspector",
3
- "version": "0.2.2",
3
+ "version": "0.2.3",
4
+ "description": "Fast PDF classification and text extraction. Detect text-based vs scanned PDFs, extract text by region with quality checks. Native Rust performance via napi-rs.",
4
5
  "main": "index.js",
5
6
  "types": "index.d.ts",
6
7
  "license": "MIT",
8
+ "keywords": [
9
+ "pdf",
10
+ "pdf-extraction",
11
+ "pdf-parser",
12
+ "text-extraction",
13
+ "ocr",
14
+ "pdf-classification",
15
+ "napi",
16
+ "rust",
17
+ "firecrawl"
18
+ ],
7
19
  "files": [
8
20
  "index.js",
9
21
  "index.d.ts",
10
- "*.node"
22
+ "*.node",
23
+ "README.md"
11
24
  ],
12
25
  "repository": {
13
26
  "type": "git",
14
27
  "url": "https://github.com/firecrawl/pdf-inspector"
15
28
  },
29
+ "homepage": "https://github.com/firecrawl/pdf-inspector",
16
30
  "publishConfig": {
17
31
  "access": "public"
18
32
  },