firecrawl-pdf-inspector 0.2.0 → 0.2.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +99 -0
- package/package.json +16 -2
- package/pdf-inspector.darwin-arm64.node +0 -0
- package/pdf-inspector.linux-x64-gnu.node +0 -0
package/README.md
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
# firecrawl-pdf-inspector
|
|
2
|
+
|
|
3
|
+
Fast PDF classification and region-based text extraction for Node.js/Bun. Native Rust performance via [napi-rs](https://napi.rs).
|
|
4
|
+
|
|
5
|
+
Built by [Firecrawl](https://firecrawl.dev) for hybrid OCR pipelines — extract text from PDF structure where possible, fall back to OCR only when needed.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
npm install firecrawl-pdf-inspector
|
|
11
|
+
# or
|
|
12
|
+
bun add firecrawl-pdf-inspector
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Prebuilt binaries are included for **linux-x64** and **macOS ARM64**. No Rust toolchain needed.
|
|
16
|
+
|
|
17
|
+
## API
|
|
18
|
+
|
|
19
|
+
### `classifyPdf(buffer: Buffer): PdfClassification`
|
|
20
|
+
|
|
21
|
+
Classify a PDF as TextBased, Scanned, Mixed, or ImageBased (~10-50ms). Returns which pages need OCR.
|
|
22
|
+
|
|
23
|
+
```typescript
|
|
24
|
+
import { classifyPdf } from 'firecrawl-pdf-inspector'
|
|
25
|
+
import { readFileSync } from 'fs'
|
|
26
|
+
|
|
27
|
+
const pdf = readFileSync('document.pdf')
|
|
28
|
+
const result = classifyPdf(pdf)
|
|
29
|
+
|
|
30
|
+
console.log(result.pdfType) // "TextBased" | "Scanned" | "Mixed" | "ImageBased"
|
|
31
|
+
console.log(result.pageCount) // 42
|
|
32
|
+
console.log(result.pagesNeedingOcr) // [5, 12, 15] (0-indexed)
|
|
33
|
+
console.log(result.confidence) // 0.875
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
### `extractTextInRegions(buffer: Buffer, pageRegions: PageRegions[]): PageRegionTexts[]`
|
|
37
|
+
|
|
38
|
+
Extract text within bounding-box regions from a PDF. Designed for hybrid OCR pipelines where a layout model detects regions in rendered page images, and this function extracts text from the PDF structure for text-based pages — skipping GPU OCR.
|
|
39
|
+
|
|
40
|
+
Each region result includes a `needsOcr` flag that signals unreliable extraction (empty text, GID-encoded fonts, garbage text, encoding issues).
|
|
41
|
+
|
|
42
|
+
```typescript
|
|
43
|
+
import { extractTextInRegions } from 'firecrawl-pdf-inspector'
|
|
44
|
+
|
|
45
|
+
const result = extractTextInRegions(pdf, [
|
|
46
|
+
{
|
|
47
|
+
page: 0, // 0-indexed
|
|
48
|
+
regions: [
|
|
49
|
+
[0, 0, 300, 400], // [x1, y1, x2, y2] in PDF points, top-left origin
|
|
50
|
+
[300, 0, 612, 400],
|
|
51
|
+
]
|
|
52
|
+
}
|
|
53
|
+
])
|
|
54
|
+
|
|
55
|
+
for (const region of result[0].regions) {
|
|
56
|
+
if (region.needsOcr) {
|
|
57
|
+
// Unreliable text — send this region to OCR instead
|
|
58
|
+
} else {
|
|
59
|
+
console.log(region.text) // Extracted text in reading order
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
## Types
|
|
65
|
+
|
|
66
|
+
```typescript
|
|
67
|
+
interface PdfClassification {
|
|
68
|
+
pdfType: string // "TextBased" | "Scanned" | "Mixed" | "ImageBased"
|
|
69
|
+
pageCount: number
|
|
70
|
+
pagesNeedingOcr: number[] // 0-indexed page numbers
|
|
71
|
+
confidence: number // 0.0 - 1.0
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
interface PageRegions {
|
|
75
|
+
page: number // 0-indexed
|
|
76
|
+
regions: number[][] // [[x1, y1, x2, y2], ...] in PDF points, top-left origin
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
interface PageRegionTexts {
|
|
80
|
+
page: number
|
|
81
|
+
regions: RegionText[]
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
interface RegionText {
|
|
85
|
+
text: string
|
|
86
|
+
needsOcr: boolean // true when text is unreliable
|
|
87
|
+
}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
## Platforms
|
|
91
|
+
|
|
92
|
+
| Platform | Architecture | Supported |
|
|
93
|
+
|----------|-------------|-----------|
|
|
94
|
+
| Linux | x64 | Yes |
|
|
95
|
+
| macOS | ARM64 | Yes |
|
|
96
|
+
|
|
97
|
+
## License
|
|
98
|
+
|
|
99
|
+
MIT
|
package/package.json
CHANGED
|
@@ -1,18 +1,32 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "firecrawl-pdf-inspector",
|
|
3
|
-
"version": "0.2.0",
|
|
3
|
+
"version": "0.2.3",
|
|
4
|
+
"description": "Fast PDF classification and text extraction. Detect text-based vs scanned PDFs, extract text by region with quality checks. Native Rust performance via napi-rs.",
|
|
4
5
|
"main": "index.js",
|
|
5
6
|
"types": "index.d.ts",
|
|
6
7
|
"license": "MIT",
|
|
8
|
+
"keywords": [
|
|
9
|
+
"pdf",
|
|
10
|
+
"pdf-extraction",
|
|
11
|
+
"pdf-parser",
|
|
12
|
+
"text-extraction",
|
|
13
|
+
"ocr",
|
|
14
|
+
"pdf-classification",
|
|
15
|
+
"napi",
|
|
16
|
+
"rust",
|
|
17
|
+
"firecrawl"
|
|
18
|
+
],
|
|
7
19
|
"files": [
|
|
8
20
|
"index.js",
|
|
9
21
|
"index.d.ts",
|
|
10
|
-
"*.node"
|
|
22
|
+
"*.node",
|
|
23
|
+
"README.md"
|
|
11
24
|
],
|
|
12
25
|
"repository": {
|
|
13
26
|
"type": "git",
|
|
14
27
|
"url": "https://github.com/firecrawl/pdf-inspector"
|
|
15
28
|
},
|
|
29
|
+
"homepage": "https://github.com/firecrawl/pdf-inspector",
|
|
16
30
|
"publishConfig": {
|
|
17
31
|
"access": "public"
|
|
18
32
|
},
|
|
Binary file
|
|
Binary file
|