preocr 1.2.1__tar.gz → 1.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {preocr-1.2.1 → preocr-1.2.2}/PKG-INFO +1 -1
- {preocr-1.2.1 → preocr-1.2.2}/preocr/analysis/page_detection.py +2 -2
- {preocr-1.2.1 → preocr-1.2.2}/preocr/core/decision.py +44 -37
- {preocr-1.2.1 → preocr-1.2.2}/preocr/core/detector.py +7 -7
- {preocr-1.2.1 → preocr-1.2.2}/preocr/core/extractor.py +3 -3
- {preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/base.py +1 -1
- {preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/formatters.py +10 -10
- {preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/pdf_extractor.py +390 -343
- {preocr-1.2.1 → preocr-1.2.2}/preocr/probes/pdf_probe.py +3 -3
- {preocr-1.2.1 → preocr-1.2.2}/preocr/utils/logger.py +11 -9
- {preocr-1.2.1 → preocr-1.2.2}/preocr/version.py +1 -1
- {preocr-1.2.1 → preocr-1.2.2}/preocr.egg-info/PKG-INFO +1 -1
- {preocr-1.2.1 → preocr-1.2.2}/preocr.egg-info/SOURCES.txt +2 -0
- preocr-1.2.2/tests/test_config_thresholds.py +94 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_hybrid_pipeline.py +1 -1
- preocr-1.2.2/tests/test_layout_aware_needs_ocr.py +109 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_pdf_probe.py +0 -1
- {preocr-1.2.1 → preocr-1.2.2}/LICENSE +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/README.md +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/__init__.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/analysis/__init__.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/analysis/layout_analyzer.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/analysis/opencv_layout.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/constants.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/core/__init__.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/core/signals.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/exceptions.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/__init__.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/office_extractor.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/schemas.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/extraction/text_extractor.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/probes/__init__.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/probes/image_probe.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/probes/office_probe.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/probes/text_probe.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/py.typed +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/reason_codes.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/utils/__init__.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/utils/batch.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/utils/cache.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr/utils/filetype.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr.egg-info/dependency_links.txt +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr.egg-info/requires.txt +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/preocr.egg-info/top_level.txt +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/pyproject.toml +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/setup.cfg +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_batch.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_decision.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_detector.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_filetype.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_image_probe.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_integration.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_layout_analyzer.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_office_probe.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_opencv_layout.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_page_detection.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_reason_codes.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_signals.py +0 -0
- {preocr-1.2.1 → preocr-1.2.2}/tests/test_text_probe.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: preocr
|
|
3
|
-
Version: 1.2.1
|
|
3
|
+
Version: 1.2.2
|
|
4
4
|
Summary: A fast, CPU-only library that intelligently detects whether files need OCR processing before expensive OCR operations. Uses hybrid adaptive pipeline for 92-95% accuracy.
|
|
5
5
|
Author: PreOCR Contributors
|
|
6
6
|
License-Expression: Apache-2.0
|
|
@@ -42,11 +42,11 @@ def analyze_pdf_pages(
|
|
|
42
42
|
2. Adjusted based on consistency:
|
|
43
43
|
- If all pages are consistent (all need OCR or all don't), confidence +0.1
|
|
44
44
|
- If pages are mixed (some need OCR, some don't), confidence -0.1
|
|
45
|
-
|
|
45
|
+
|
|
46
46
|
This means:
|
|
47
47
|
- Uniform documents (all scanned or all digital) get higher confidence
|
|
48
48
|
- Mixed documents get lower confidence, reflecting the uncertainty
|
|
49
|
-
|
|
49
|
+
|
|
50
50
|
Per-page confidence:
|
|
51
51
|
- Pages with text: 0.95 (high confidence)
|
|
52
52
|
- Pages without text: 0.80 if completely empty, 0.60 if sparse text
|
|
@@ -22,44 +22,44 @@ def calculate_ocr_score(
|
|
|
22
22
|
) -> float:
|
|
23
23
|
"""
|
|
24
24
|
Calculate OCR_SCORE using pixel-aware scoring model.
|
|
25
|
-
|
|
26
|
-
OCR_SCORE = 0.35 * image_ratio + 0.25 * (1 - alphabet_ratio) +
|
|
25
|
+
|
|
26
|
+
OCR_SCORE = 0.35 * image_ratio + 0.25 * (1 - alphabet_ratio) +
|
|
27
27
|
0.2 * low_text_density + 0.2 * font_suspicion
|
|
28
|
-
|
|
28
|
+
|
|
29
29
|
Args:
|
|
30
30
|
text_length: Length of extracted text
|
|
31
31
|
image_coverage: Image coverage percentage (0-100)
|
|
32
32
|
text_coverage: Text coverage percentage (0-100)
|
|
33
33
|
config: Optional Config object
|
|
34
|
-
|
|
34
|
+
|
|
35
35
|
Returns:
|
|
36
36
|
OCR_SCORE (0.0-1.0) where higher score indicates more likely to need OCR
|
|
37
37
|
"""
|
|
38
38
|
if config is None:
|
|
39
39
|
config = _DEFAULT_CONFIG
|
|
40
|
-
|
|
40
|
+
|
|
41
41
|
# Calculate image_ratio from image_coverage (convert percentage to ratio)
|
|
42
42
|
image_ratio = image_coverage / 100.0 if image_coverage > 0 else 0.0
|
|
43
|
-
|
|
43
|
+
|
|
44
44
|
# Approximate alphabet_ratio (normalized text length factor)
|
|
45
45
|
max_expected_text = 10000 # Reasonable max for a page
|
|
46
46
|
alphabet_ratio = min(text_length / max_expected_text, 1.0) if text_length > 0 else 0.0
|
|
47
|
-
|
|
47
|
+
|
|
48
48
|
# Calculate low_text_density (inverse of text_coverage, normalized)
|
|
49
49
|
text_density = text_coverage / 100.0 if text_coverage > 0 else 0.0
|
|
50
50
|
low_text_density = 1.0 - min(text_density, 1.0)
|
|
51
|
-
|
|
51
|
+
|
|
52
52
|
# Font suspicion: higher when text_length is very low
|
|
53
53
|
font_suspicion = 1.0 - min(text_length / 50.0, 1.0) if text_length < 50 else 0.0
|
|
54
|
-
|
|
54
|
+
|
|
55
55
|
# Calculate OCR score
|
|
56
56
|
ocr_score = (
|
|
57
|
-
0.35 * image_ratio
|
|
58
|
-
0.25 * (1.0 - alphabet_ratio)
|
|
59
|
-
0.20 * low_text_density
|
|
60
|
-
0.20 * font_suspicion
|
|
57
|
+
0.35 * image_ratio
|
|
58
|
+
+ 0.25 * (1.0 - alphabet_ratio)
|
|
59
|
+
+ 0.20 * low_text_density
|
|
60
|
+
+ 0.20 * font_suspicion
|
|
61
61
|
)
|
|
62
|
-
|
|
62
|
+
|
|
63
63
|
return round(ocr_score, 3)
|
|
64
64
|
|
|
65
65
|
|
|
@@ -71,24 +71,24 @@ def calculate_confidence_from_signals(
|
|
|
71
71
|
) -> float:
|
|
72
72
|
"""
|
|
73
73
|
Calculate confidence score from signals using unified approach.
|
|
74
|
-
|
|
74
|
+
|
|
75
75
|
Priority:
|
|
76
76
|
1. Use OCR_SCORE if available (most accurate)
|
|
77
77
|
2. Use layout-based calculation
|
|
78
78
|
3. Fallback to text-length based
|
|
79
|
-
|
|
79
|
+
|
|
80
80
|
Args:
|
|
81
81
|
signals: Dictionary of signals from signals.collect_signals()
|
|
82
82
|
needs_ocr: Boolean indicating if OCR is needed
|
|
83
83
|
ocr_score: Optional OCR_SCORE (0.0-1.0) if already calculated
|
|
84
84
|
config: Optional Config object
|
|
85
|
-
|
|
85
|
+
|
|
86
86
|
Returns:
|
|
87
87
|
Confidence score (0.0-1.0)
|
|
88
88
|
"""
|
|
89
89
|
if config is None:
|
|
90
90
|
config = _DEFAULT_CONFIG
|
|
91
|
-
|
|
91
|
+
|
|
92
92
|
# Priority 1: Use OCR_SCORE if available (most accurate)
|
|
93
93
|
if ocr_score is not None and config.use_ocr_score_confidence:
|
|
94
94
|
# Calibrate OCR_SCORE to confidence range (0.50-0.95)
|
|
@@ -99,13 +99,13 @@ def calculate_confidence_from_signals(
|
|
|
99
99
|
# Lower OCR_SCORE = higher confidence for "no OCR"
|
|
100
100
|
confidence = 0.50 + ((1.0 - ocr_score) * 0.45) # Range: 0.50-0.95
|
|
101
101
|
return round(confidence, 2)
|
|
102
|
-
|
|
102
|
+
|
|
103
103
|
# Priority 2: Layout-based (if layout data available)
|
|
104
104
|
layout_type = signals.get("layout_type")
|
|
105
105
|
if layout_type and layout_type != "unknown":
|
|
106
106
|
text_coverage = float(signals.get("text_coverage", 0.0))
|
|
107
107
|
image_coverage = float(signals.get("image_coverage", 0.0))
|
|
108
|
-
|
|
108
|
+
|
|
109
109
|
if needs_ocr:
|
|
110
110
|
# More images = higher confidence
|
|
111
111
|
image_factor = min(image_coverage / 100.0, 1.0)
|
|
@@ -115,7 +115,7 @@ def calculate_confidence_from_signals(
|
|
|
115
115
|
text_factor = min(text_coverage / 100.0, 1.0)
|
|
116
116
|
confidence = 0.70 + (text_factor * 0.25) # Range: 0.70-0.95
|
|
117
117
|
return round(confidence, 2)
|
|
118
|
-
|
|
118
|
+
|
|
119
119
|
# Priority 3: Text-length based fallback
|
|
120
120
|
text_length = signals.get("text_length", 0)
|
|
121
121
|
if needs_ocr:
|
|
@@ -129,7 +129,7 @@ def calculate_confidence_from_signals(
|
|
|
129
129
|
# More text = higher confidence (digital)
|
|
130
130
|
text_factor = min(text_length / 1000.0, 1.0)
|
|
131
131
|
confidence = 0.75 + (text_factor * 0.20) # Range: 0.75-0.95
|
|
132
|
-
|
|
132
|
+
|
|
133
133
|
return round(confidence, 2)
|
|
134
134
|
|
|
135
135
|
|
|
@@ -205,21 +205,25 @@ def decide(
|
|
|
205
205
|
is_mixed_content = signals.get("is_mixed_content", False)
|
|
206
206
|
text_coverage = signals.get("text_coverage", 0.0)
|
|
207
207
|
image_coverage = signals.get("image_coverage", 0.0)
|
|
208
|
-
|
|
208
|
+
|
|
209
209
|
# Calculate image_ratio from image_coverage (convert percentage to ratio)
|
|
210
210
|
# Also check OpenCV results if available (more accurate for scanned PDFs)
|
|
211
211
|
opencv_layout = signals.get("opencv_layout", {})
|
|
212
212
|
image_coverage_opencv = opencv_layout.get("image_coverage", 0.0) if opencv_layout else 0.0
|
|
213
|
-
|
|
213
|
+
|
|
214
214
|
# Use OpenCV image_coverage if available (more accurate), otherwise use layout image_coverage
|
|
215
|
-
effective_image_coverage = image_coverage_opencv if image_coverage_opencv > 0 else image_coverage
|
|
215
|
+
effective_image_coverage = (
|
|
216
|
+
image_coverage_opencv if image_coverage_opencv > 0 else image_coverage
|
|
217
|
+
)
|
|
216
218
|
image_ratio = effective_image_coverage / 100.0 if effective_image_coverage > 0 else 0.0
|
|
217
|
-
|
|
219
|
+
|
|
218
220
|
# Calculate OCR_SCORE for unified confidence calculation
|
|
219
221
|
ocr_score = None
|
|
220
222
|
if layout_type and layout_type != "unknown":
|
|
221
|
-
ocr_score = calculate_ocr_score(text_length, effective_image_coverage, text_coverage, config)
|
|
222
|
-
|
|
223
|
+
ocr_score = calculate_ocr_score(
|
|
224
|
+
text_length, effective_image_coverage, text_coverage, config
|
|
225
|
+
)
|
|
226
|
+
|
|
223
227
|
# 🔥 Hybrid Rule: Sweet spot for OCR detection
|
|
224
228
|
# If image_ratio > 0.75 AND extracted_text_length < 30 → OCR
|
|
225
229
|
# This catches scanned PDFs that are image-heavy with minimal extractable text
|
|
@@ -237,7 +241,7 @@ def decide(
|
|
|
237
241
|
CATEGORY_UNSTRUCTURED,
|
|
238
242
|
ReasonCode.PDF_SCANNED,
|
|
239
243
|
)
|
|
240
|
-
|
|
244
|
+
|
|
241
245
|
# Alternative: If text_length is very low (< 30) and we have layout data suggesting images
|
|
242
246
|
# This handles cases where scanned PDFs aren't detected as images but have no text
|
|
243
247
|
if text_length < 30 and layout_type and layout_type != "unknown":
|
|
@@ -276,7 +280,7 @@ def decide(
|
|
|
276
280
|
CATEGORY_UNSTRUCTURED,
|
|
277
281
|
ReasonCode.PDF_MIXED,
|
|
278
282
|
)
|
|
279
|
-
|
|
283
|
+
|
|
280
284
|
# If text coverage is significant, might not need full OCR
|
|
281
285
|
if text_length >= config.min_text_length and text_coverage > 10:
|
|
282
286
|
confidence = calculate_confidence_from_signals(
|
|
@@ -359,7 +363,7 @@ def decide(
|
|
|
359
363
|
CATEGORY_UNSTRUCTURED,
|
|
360
364
|
ReasonCode.PDF_SCANNED,
|
|
361
365
|
)
|
|
362
|
-
|
|
366
|
+
|
|
363
367
|
# Fallback to text-length based decision (when layout analysis not available)
|
|
364
368
|
if text_length >= config.min_text_length:
|
|
365
369
|
# Use unified confidence calculation (fallback mode)
|
|
@@ -472,21 +476,21 @@ def refine_with_opencv(
|
|
|
472
476
|
image_coverage_opencv = opencv_result.get("image_coverage", 0.0)
|
|
473
477
|
has_text_regions = opencv_result.get("has_text_regions", False)
|
|
474
478
|
layout_type = opencv_result.get("layout_type", "unknown")
|
|
475
|
-
|
|
479
|
+
|
|
476
480
|
# Calculate OCR_SCORE from OpenCV results for unified confidence
|
|
477
481
|
ocr_score_opencv = calculate_ocr_score(
|
|
478
482
|
text_length, image_coverage_opencv, text_coverage_opencv, config
|
|
479
483
|
)
|
|
480
|
-
|
|
484
|
+
|
|
481
485
|
# Update signals with OpenCV layout data for confidence calculation
|
|
482
486
|
signals_with_opencv = signals.copy()
|
|
483
487
|
signals_with_opencv["layout_type"] = layout_type
|
|
484
488
|
signals_with_opencv["text_coverage"] = text_coverage_opencv
|
|
485
489
|
signals_with_opencv["image_coverage"] = image_coverage_opencv
|
|
486
|
-
|
|
490
|
+
|
|
487
491
|
# Calculate image_ratio from image_coverage (convert percentage to ratio)
|
|
488
492
|
image_ratio = image_coverage_opencv / 100.0 if image_coverage_opencv > 0 else 0.0
|
|
489
|
-
|
|
493
|
+
|
|
490
494
|
# 🔥 Hybrid Rule: Sweet spot for OCR detection (applied in OpenCV refinement too)
|
|
491
495
|
# If image_ratio > 0.75 AND extracted_text_length < 30 → OCR
|
|
492
496
|
if image_ratio > 0.75 and text_length < 30:
|
|
@@ -526,7 +530,7 @@ def refine_with_opencv(
|
|
|
526
530
|
CATEGORY_UNSTRUCTURED,
|
|
527
531
|
ReasonCode.PDF_MIXED,
|
|
528
532
|
)
|
|
529
|
-
|
|
533
|
+
|
|
530
534
|
if text_length >= config.min_text_length and text_coverage_opencv > 15:
|
|
531
535
|
# Digital text document - use unified confidence calculation
|
|
532
536
|
confidence = calculate_confidence_from_signals(
|
|
@@ -607,7 +611,10 @@ def refine_with_opencv(
|
|
|
607
611
|
if (initial_needs_ocr and not has_text_regions) or (not initial_needs_ocr and has_text_regions):
|
|
608
612
|
# Calculate OCR_SCORE-based confidence
|
|
609
613
|
ocr_confidence = calculate_confidence_from_signals(
|
|
610
|
-
signals_with_opencv, needs_ocr=initial_needs_ocr, ocr_score=ocr_score_opencv, config=config
|
|
614
|
+
signals_with_opencv,
|
|
615
|
+
needs_ocr=initial_needs_ocr,
|
|
616
|
+
ocr_score=ocr_score_opencv,
|
|
617
|
+
config=config,
|
|
611
618
|
)
|
|
612
619
|
# Weighted combination: 30% initial, 70% OCR_SCORE-based (OpenCV is more accurate)
|
|
613
620
|
confidence = (initial_confidence * 0.3) + (ocr_confidence * 0.7)
|
|
@@ -66,20 +66,20 @@ def needs_ocr(
|
|
|
66
66
|
|
|
67
67
|
Note on Confidence Scores:
|
|
68
68
|
Confidence scores may vary between page_level=True and page_level=False modes:
|
|
69
|
-
|
|
69
|
+
|
|
70
70
|
- **Without page_level**: Confidence is calculated based on document-level heuristics
|
|
71
71
|
and OpenCV analysis (if triggered). Typical range: 0.60-0.95.
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
- **With page_level=True**: Confidence is calculated as the average of per-page
|
|
74
74
|
confidence scores, adjusted for consistency. For mixed documents (some pages
|
|
75
75
|
need OCR, some don't), confidence may be lower due to the averaging effect.
|
|
76
76
|
Typical range: 0.60-0.95, but may be lower for mixed documents.
|
|
77
|
-
|
|
77
|
+
|
|
78
78
|
- **Why the difference**: Page-level analysis provides more granular information
|
|
79
79
|
but averages confidence across pages. Document-level analysis uses overall
|
|
80
80
|
text extraction and layout analysis, which can be more confident for uniform
|
|
81
81
|
documents.
|
|
82
|
-
|
|
82
|
+
|
|
83
83
|
Both modes are accurate; the difference reflects the analysis granularity.
|
|
84
84
|
Use page_level=True when you need per-page decisions, otherwise use the
|
|
85
85
|
default (page_level=False) for faster, document-level decisions.
|
|
@@ -187,7 +187,7 @@ def needs_ocr(
|
|
|
187
187
|
if opencv_result:
|
|
188
188
|
# Add OpenCV results to signals BEFORE refining (so hybrid rule can use it)
|
|
189
189
|
collected_signals["opencv_layout"] = opencv_result
|
|
190
|
-
|
|
190
|
+
|
|
191
191
|
# Refine decision based on OpenCV analysis
|
|
192
192
|
needs_ocr_flag, reason, confidence, category, reason_code = decision.refine_with_opencv(
|
|
193
193
|
collected_signals,
|
|
@@ -218,14 +218,14 @@ def needs_ocr(
|
|
|
218
218
|
if page_analysis and "pages" in page_analysis:
|
|
219
219
|
page_count = page_analysis.get("page_count", 0)
|
|
220
220
|
pages_list = page_analysis.get("pages", [])
|
|
221
|
-
|
|
221
|
+
|
|
222
222
|
# Only add page-level data if it's valid
|
|
223
223
|
if page_count > 0 and len(pages_list) > 0:
|
|
224
224
|
result["pages"] = pages_list
|
|
225
225
|
result["page_count"] = page_count
|
|
226
226
|
result["pages_needing_ocr"] = page_analysis.get("pages_needing_ocr", 0)
|
|
227
227
|
result["pages_with_text"] = page_analysis.get("pages_with_text", 0)
|
|
228
|
-
|
|
228
|
+
|
|
229
229
|
# Override overall decision with page-level analysis only if data is valid
|
|
230
230
|
if page_analysis.get("overall_needs_ocr") is not None:
|
|
231
231
|
# Validate that page-level analysis is complete and consistent
|
|
@@ -134,8 +134,8 @@ def extract_native_data(
|
|
|
134
134
|
|
|
135
135
|
# Format output
|
|
136
136
|
return format_result(
|
|
137
|
-
result,
|
|
138
|
-
output_format=output_format,
|
|
137
|
+
result,
|
|
138
|
+
output_format=output_format,
|
|
139
139
|
markdown_clean=markdown_clean,
|
|
140
|
-
include_metadata=include_metadata
|
|
140
|
+
include_metadata=include_metadata,
|
|
141
141
|
)
|
|
@@ -75,7 +75,7 @@ def calculate_confidence(
|
|
|
75
75
|
text_quality: Quality of text (0.0-1.0), based on font size and clarity
|
|
76
76
|
extraction_method: Method used ("pdfplumber" = 0.9, "pymupdf" = 0.8)
|
|
77
77
|
element_type_certainty: How certain we are about classification (0.0-1.0)
|
|
78
|
-
bbox_accuracy: How well-defined the bbox is (0.0-1.0)
|
|
78
|
+
bbox_accuracy: How well-defined the bbox is (0.0-1.0)
|
|
79
79
|
|
|
80
80
|
Returns:
|
|
81
81
|
Confidence score between 0.0 and 1.0
|
|
@@ -46,17 +46,17 @@ def format_as_json(result: ExtractionResult) -> Dict[str, Any]:
|
|
|
46
46
|
def format_as_markdown(result: ExtractionResult, clean: bool = False) -> str:
|
|
47
47
|
"""
|
|
48
48
|
Format result as LLM-ready markdown.
|
|
49
|
-
|
|
49
|
+
|
|
50
50
|
Args:
|
|
51
51
|
result: ExtractionResult to format
|
|
52
52
|
clean: If True, output only content without metadata (file paths, confidence scores, etc.)
|
|
53
53
|
If False, include all metadata (default: False for backward compatibility)
|
|
54
|
-
|
|
54
|
+
|
|
55
55
|
Returns:
|
|
56
56
|
Markdown string
|
|
57
57
|
"""
|
|
58
58
|
lines = []
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
# If clean mode, skip all metadata and just output content
|
|
61
61
|
if clean:
|
|
62
62
|
return _format_as_clean_markdown(result)
|
|
@@ -191,14 +191,14 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
|
|
|
191
191
|
Perfect for LLM consumption - just the text content.
|
|
192
192
|
"""
|
|
193
193
|
lines = []
|
|
194
|
-
|
|
194
|
+
|
|
195
195
|
# Tables - just the table content
|
|
196
196
|
if result.tables:
|
|
197
197
|
for table in result.tables:
|
|
198
198
|
table_md = _format_table_as_markdown(table)
|
|
199
199
|
lines.append(table_md)
|
|
200
200
|
lines.append("")
|
|
201
|
-
|
|
201
|
+
|
|
202
202
|
# Forms - just field names and values
|
|
203
203
|
if result.forms:
|
|
204
204
|
for form in result.forms:
|
|
@@ -207,7 +207,7 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
|
|
|
207
207
|
elif form.value:
|
|
208
208
|
lines.append(form.value)
|
|
209
209
|
lines.append("")
|
|
210
|
-
|
|
210
|
+
|
|
211
211
|
# Elements (text content) - main content
|
|
212
212
|
if result.elements:
|
|
213
213
|
# Group by page
|
|
@@ -217,11 +217,11 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
|
|
|
217
217
|
if page_num not in elements_by_page:
|
|
218
218
|
elements_by_page[page_num] = []
|
|
219
219
|
elements_by_page[page_num].append(elem)
|
|
220
|
-
|
|
220
|
+
|
|
221
221
|
# Sort pages
|
|
222
222
|
for page_num in sorted(elements_by_page.keys()):
|
|
223
223
|
page_elements = elements_by_page[page_num]
|
|
224
|
-
|
|
224
|
+
|
|
225
225
|
# Sort by reading order if available
|
|
226
226
|
if result.reading_order:
|
|
227
227
|
page_elements.sort(
|
|
@@ -231,7 +231,7 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
|
|
|
231
231
|
else 9999
|
|
232
232
|
)
|
|
233
233
|
)
|
|
234
|
-
|
|
234
|
+
|
|
235
235
|
for elem in page_elements:
|
|
236
236
|
if elem.element_type == ElementType.TITLE:
|
|
237
237
|
lines.append(f"# {elem.text}")
|
|
@@ -249,7 +249,7 @@ def _format_as_clean_markdown(result: ExtractionResult) -> str:
|
|
|
249
249
|
elif elem.text:
|
|
250
250
|
lines.append(elem.text)
|
|
251
251
|
lines.append("")
|
|
252
|
-
|
|
252
|
+
|
|
253
253
|
return "\n".join(lines).strip()
|
|
254
254
|
|
|
255
255
|
|