natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +131 -45
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +113 -22
- natural_pdf/core/pdf.py +477 -75
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +222 -108
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.33.dist-info/RECORD +0 -118
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
tools/bad_pdf_eval/evaluate_quality.py
ADDED
@@ -0,0 +1,198 @@
+"""Evaluate the quality of LLM enrichment suggestions.
+
+This script analyzes the code suggestions to identify:
+- Use of modern features (Guides API, extract_table)
+- Avoidance of anti-patterns (placeholder text, unnecessary TATR)
+- Practical, working code
+"""
+
+import json
+import re
+from collections import defaultdict
+from pathlib import Path
+from typing import Any, Dict, List
+
+
+def analyze_code_quality(code: str) -> Dict[str, Any]:
+    """Analyze a code suggestion for quality indicators."""
+
+    quality = {
+        "uses_guides": bool(
+            re.search(r"from natural_pdf\.analyzers import Guides|Guides\(|Guides\.", code)
+        ),
+        "uses_tatr": bool(re.search(r'analyze_layout\([\'"]tatr[\'"]?\)', code)),
+        "uses_extract_table": bool(re.search(r"\.extract_table\(\)", code)),
+        "has_placeholder_text": bool(
+            re.search(r'[\'"](?:AnchorText|Texts|Also|HeaderText|TableHeader)[\'"]', code)
+        ),
+        "uses_real_text": bool(
+            re.search(
+                r'[\'"](?!AnchorText|Texts|Also|HeaderText|TableHeader)[^\'"{}\[\]]+[\'"]', code
+            )
+        )
+        and not bool(
+            re.search(r'[\'"](?:AnchorText|Texts|Also|HeaderText|TableHeader)[\'"]', code)
+        ),
+        "uses_snap_to_whitespace": bool(re.search(r"snap_to_whitespace", code)),
+        "uses_parent_navigation": bool(re.search(r"\.parent\(", code)),
+        "uses_until": bool(re.search(r"until\s*=", code)),
+        "has_ocr_call": bool(re.search(r"apply_ocr\(", code)),
+        "removes_text_layer": bool(re.search(r"remove_text_layer\(|text_layer\s*=\s*False", code)),
+    }
+
+    # Calculate score based on quality indicators
+    score = 0
+    if quality["uses_guides"]:
+        score += 3  # Major positive: modern approach
+    if quality["uses_tatr"]:
+        score += 1  # Minor positive: valid for complex layouts
+    if quality["uses_extract_table"]:
+        score += 2  # Positive: using singular method
+    if quality["uses_real_text"]:
+        score += 2  # Positive: using actual anchors
+    if quality["uses_snap_to_whitespace"]:
+        score += 2  # Positive: modern Guides feature
+    if quality["uses_parent_navigation"]:
+        score += 1  # Positive: robust navigation
+    if quality["uses_until"]:
+        score += 1  # Positive when appropriate: precise region selection
+
+    quality["score"] = score
+    quality["max_score"] = 12
+
+    return quality
+
+
+def analyze_difficult_elements(elements: List[str]) -> Dict[str, int]:
+    """Count types of difficult elements identified."""
+
+    patterns = {
+        "tiny_font": r"tiny.*font|small.*font|font.*size|<\s*\d+\s*pt",
+        "rtl_language": r"arabic|hebrew|rtl|right.*to.*left",
+        "scanned": r"scanned|image.*only|no.*text.*layer",
+        "complex_layout": r"column|multi.*column|layout",
+        "handwritten": r"handwritten|hand.*written",
+        "redacted": r"redact",
+    }
+
+    counts = defaultdict(int)
+    for element in elements:
+        element_lower = element.lower()
+        for category, pattern in patterns.items():
+            if re.search(pattern, element_lower):
+                counts[category] += 1
+
+    return dict(counts)
+
+
+def evaluate_submission(submission_path: Path) -> Dict[str, Any]:
+    """Evaluate a single submission's enrichment quality."""
+
+    summary_path = submission_path / "summary.json"
+    if not summary_path.exists():
+        return None
+
+    data = json.loads(summary_path.read_text())
+
+    result = {
+        "submission_id": data.get("submission_id", submission_path.name),
+        "has_doc_enrichment": bool(data.get("code_suggestion")),
+        "doc_code_quality": None,
+        "difficult_elements_analysis": None,
+        "page_code_quality": [],
+    }
+
+    # Analyze document-level code
+    if data.get("code_suggestion"):
+        result["doc_code_quality"] = analyze_code_quality(data["code_suggestion"])
+
+    # Analyze difficult elements
+    if data.get("difficult_elements"):
+        result["difficult_elements_analysis"] = analyze_difficult_elements(
+            data["difficult_elements"]
+        )
+
+    # Analyze page-level code
+    for page in data.get("pages", []):
+        if page.get("code_suggestion"):
+            page_quality = analyze_code_quality(page["code_suggestion"])
+            page_quality["page_number"] = page.get("page_number")
+            result["page_code_quality"].append(page_quality)
+
+    return result
+
+
+def main():
+    """Analyze all submissions and generate quality report."""
+
+    eval_dir = Path("eval_results")
+    results = []
+
+    for submission_dir in eval_dir.iterdir():
+        if submission_dir.is_dir() and (submission_dir / "summary.json").exists():
+            result = evaluate_submission(submission_dir)
+            if result:
+                results.append(result)
+
+    # Aggregate statistics
+    stats = {
+        "total_submissions": len(results),
+        "with_doc_enrichment": sum(1 for r in results if r["has_doc_enrichment"]),
+        "using_guides": 0,
+        "using_tatr": 0,
+        "using_placeholders": 0,
+        "avg_quality_score": 0,
+        "difficult_elements_breakdown": defaultdict(int),
+    }
+
+    all_scores = []
+    for result in results:
+        if result["doc_code_quality"]:
+            quality = result["doc_code_quality"]
+            if quality["uses_guides"]:
+                stats["using_guides"] += 1
+            if quality["uses_tatr"]:
+                stats["using_tatr"] += 1
+            if quality["has_placeholder_text"]:
+                stats["using_placeholders"] += 1
+            all_scores.append(quality["score"])
+
+        if result["difficult_elements_analysis"]:
+            for elem_type, count in result["difficult_elements_analysis"].items():
+                stats["difficult_elements_breakdown"][elem_type] += count
+
+    if all_scores:
+        stats["avg_quality_score"] = sum(all_scores) / len(all_scores)
+
+    # Generate report
+    print("\n=== Natural PDF Evaluation Quality Report ===\n")
+    print(f"Total submissions analyzed: {stats['total_submissions']}")
+    print(f"With document enrichment: {stats['with_doc_enrichment']}")
+    print(f"\nCode Quality Metrics:")
+    print(
+        f"  Using Guides API: {stats['using_guides']} ({stats['using_guides']/stats['with_doc_enrichment']*100:.1f}%)"
+    )
+    print(
+        f"  Using TATR: {stats['using_tatr']} ({stats['using_tatr']/stats['with_doc_enrichment']*100:.1f}%)"
+    )
+    print(
+        f"  Using placeholders: {stats['using_placeholders']} ({stats['using_placeholders']/stats['with_doc_enrichment']*100:.1f}%)"
+    )
+    print(f"  Average quality score: {stats['avg_quality_score']:.1f}/12")
+
+    print(f"\nDifficult Elements Identified:")
+    for elem_type, count in sorted(
+        stats["difficult_elements_breakdown"].items(), key=lambda x: x[1], reverse=True
+    ):
+        print(f"  {elem_type}: {count}")
+
+    # Save detailed results
+    output_path = eval_dir / "quality_analysis.json"
+    with open(output_path, "w") as f:
+        json.dump({"stats": stats, "detailed_results": results}, f, indent=2)
+
+    print(f"\nDetailed results saved to: {output_path}")
+
+
+if __name__ == "__main__":
+    main()
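
A quick sanity check of the scoring rubric above (a sketch, not part of the package). It assumes the repo root is on sys.path so the new module is importable; the `Guides.from_content` keyword arguments in the scored snippet are illustrative only, and the snippet is pattern-matched, never executed:

from tools.bad_pdf_eval.evaluate_quality import analyze_code_quality

# Hypothetical code suggestion to score.
snippet = '''
from natural_pdf.analyzers import Guides
guides = Guides.from_content(page, markers=["Name", "Amount"])  # illustrative signature
guides.snap_to_whitespace()
rows = page.find('text:contains("Violations")').below(until='text:bold').extract_table()
'''

quality = analyze_code_quality(snippet)
# uses_guides (+3) + uses_extract_table (+2) + uses_real_text (+2)
# + uses_snap_to_whitespace (+2) + uses_until (+1) = 10
print(quality["score"], "/", quality["max_score"])  # 10 / 12
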
tools/bad_pdf_eval/export_enrichment_csv.py
CHANGED
@@ -11,7 +11,7 @@ import argparse
 import csv
 import json
 from pathlib import Path
-from typing import
+from typing import Dict, List
 
 ROOT = Path(__file__).resolve().parent.parent.parent  # repo root
 EVAL_DIR = ROOT / "eval_results"
@@ -29,17 +29,21 @@ def collect_records() -> List[Dict[str, str]]:
         if not tp and not cs:
             # Skip summaries without enrichment at doc level
             continue
-        records.append(
-
-
-
-
+        records.append(
+            {
+                "id": data.get("submission_id", summary_path.parent.name),
+                "thought_process": tp.replace("\n", " ").strip(),
+                "code_suggestion": cs.replace("\n", " ").strip(),
+            }
+        )
     return records
 
 
 def main():
     ap = argparse.ArgumentParser(description="Export enriched summaries to CSV.")
-    ap.add_argument(
+    ap.add_argument(
+        "--out", default=str(EVAL_DIR / "enrichment_export.csv"), help="Output CSV path"
+    )
     args = ap.parse_args()
 
     records = collect_records()
@@ -59,4 +63,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
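
The reshaped `collect_records()` above also documents the export row format: one row per submission with `id`, `thought_process`, and `code_suggestion`, newlines collapsed so each record stays on a single CSV line. A minimal sketch of that flattening (the writer itself sits outside this hunk, so `csv.DictWriter` is an assumption):

import csv

# Hypothetical record, shaped like the dict built in collect_records().
record = {
    "id": "submission_001",
    "thought_process": "Anchor on real header text,\nthen extract the table.",
    "code_suggestion": "page.find('text:contains(\"Total\")').below().extract_table()",
}

# Collapse newlines exactly as the hunk does, keeping one CSV line per record.
flat = {key: value.replace("\n", " ").strip() for key, value in record.items()}

with open("enrichment_export.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["id", "thought_process", "code_suggestion"])
    writer.writeheader()
    writer.writerow(flat)
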
tools/bad_pdf_eval/llm_enrich.py
CHANGED
@@ -8,6 +8,7 @@ Environment
 -----------
 OPENAI_API_KEY must be set or passed via --api-key.
 """
+
 from __future__ import annotations
 
 import argparse
@@ -16,16 +17,17 @@ import json
 import os
 import textwrap
 from pathlib import Path
-from typing import
+from typing import Any, Dict, List
 
 from openai import OpenAI
-from pydantic import BaseModel, Field
 from PIL import Image
+from pydantic import BaseModel, Field
 
 ROOT = Path(__file__).resolve().parent.parent.parent  # repo root
 EVAL_DIR = ROOT / "eval_results"
 CHEATSHEET_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_CheatSheet.md"
 WORKFLOWS_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_Workflows.md"
+DECISION_TREE_PATH = ROOT / "tools" / "bad_pdf_eval" / "extraction_decision_tree.md"
 
 
 def read_md(path: Path) -> str:
@@ -43,6 +45,7 @@ def img_to_b64_jpeg(path: Path, max_px: int = 1024) -> str:
 
     from io import BytesIO
 
+
 def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
     """Return OpenAI chat prompt messages list."""
     cheatsheet = read_md(CHEATSHEET_PATH)
@@ -52,7 +55,10 @@ def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
     if page.get("image") and Path(page["image"]).exists():
         try:
             b64 = img_to_b64_jpeg(Path(page["image"]))
-            image_section = {
+            image_section = {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"},
+            }
         except Exception:
             pass
 
@@ -93,6 +99,7 @@ def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
 def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
     cheatsheet = read_md(CHEATSHEET_PATH)
     workflows = read_md(WORKFLOWS_PATH)
+    decision_tree = read_md(DECISION_TREE_PATH)
 
     pdf_overview = [
         f"PDF: {Path(summary['pdf']).name}",
@@ -106,7 +113,10 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
         if page.get("image") and Path(page["image"]).exists():
             try:
                 b64 = img_to_b64_jpeg(Path(page["image"]))
-                image_section = {
+                image_section = {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"},
+                }
             except Exception:
                 pass
         context_json = {
@@ -117,12 +127,14 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
             "blob_sample": page.get("blobs_sample", []),
             "ocr_sample": page.get("ocr_sample", []),
         }
-        per_page_sections.append(
-
-
-
-
-
+        per_page_sections.append(
+            {
+                "page_number": page["page_number"],
+                "goal_tag": page.get("goal_tag") or "generic_extraction",
+                "image": image_section,
+                "context": context_json,
+            }
+        )
 
     sys_msg = textwrap.dedent(
         """
@@ -134,15 +146,23 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
 
     Extraction strategy:
    1. Start with the text layer: `page.extract_text()`, `page.extract_table()`, or region selectors.
-    2.
+    2. For tables, strongly prefer the Guides API over TATR:
+       • Use `Guides.from_content()` with actual column headers as markers
+       • Apply `.snap_to_whitespace()` to auto-align to natural gaps
+       • Only fall back to TATR for genuinely complex multi-table pages
+    3. Use **anchor-based region selection**: locate a stable header/label/line/rect and select the area
     between anchors via `.find()`, `.below()`, `.above()`, `.until()`, `.expand()`, etc.
-    Example: `page.find('text:contains(Violations)').below(until='text:bold')`.
-
+    Example: `page.find('text:contains("Violations")').below(until='text:bold')`.
+    4. Strongly prefer until= to find a specific ending point as opposed to a pixel-based approach,
     as this allows your code to work on potentially other similar pages of the document.
-
-
-
-
+    5. Direct region extraction often works: `region.extract_table()` without any layout model.
+
+    Recent improvements to leverage:
+    • Tiny text (<7pt) is now extracted reliably - no need to flag as difficult
+    • RTL languages (Arabic, Hebrew) work automatically with proper BiDi
+    • Use `.extract_table()` (singular) which returns TableResult with .df property
+    • Guides API can detect lines from pixels directly - no vector lines needed
+    • Can discard corrupted text layers with `PDF(..., text_layer=False)` or `page.remove_text_layer()`
 
     Handle tables, key-value forms, and free-form paragraphs with the same anchor-driven approach. Key-value
     forms might be easily extracted with .ask(...) or .extract(), feel free to mention as an option
@@ -158,30 +178,31 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
     a fluent API, and for loops are discouraged.
 
     Return ONE JSON object with exactly these keys:
-    • thought_process – concise reasoning
+    • thought_process – concise reasoning about your approach, noting if Guides would work better than TATR
     • code_suggestion – executable Python snippet using natural_pdf
-    • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (tiny fonts
-    • test_case – short description of how this PDF/page could be turned into an automated regression test
+    • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (exclude tiny fonts unless <5pt, exclude RTL languages). If no difficult element is evident, return an empty list. Do *not* speculate.
+    • test_case – short description of how this PDF/page could be turned into an automated regression test
 
     Code-style expectations:
     • Use **real sample text** from the page as anchors — never placeholders such as
-    "AnchorText", "Texts", or "Also".
-    fact in the *thought_process* and leave a TODO rather than shipping a placeholder.
+    "AnchorText", "Texts", or "Also". Look in the inspect/describe data for actual text.
     • When a page is flagged as *scanned_image* (or no text layer exists) your code
     MUST call `page.apply_ocr()` *before* any `.find()` or `.extract_text()` calls.
+    • If text appears as "(cid:xxx)" in the evidence, use `page.remove_text_layer()` or
+    `PDF(..., text_layer=False)` before OCR to avoid corrupted text interference.
+    • For table extraction, show Guides-based approach first, TATR only as fallback
     • Prefer `header_el.parent('table')` (up-tree navigation) over a global
-    `page.find('table')[i]` positional index — this is more robust
-
-
-
-
-    • Explicitly name the extractor (`analyze_layout('tatr')`, `analyze_layout('detectron')`)
-    instead of vague comments like "YOLO fallback".
+    `page.find('table')[i]` positional index — this is more robust to layout changes.
+    • Use `.below()` or `.above()` to select regions. Add `until=` only when you need to
+    stop before reaching the page edge (e.g., before another section). Going to page edge
+    is fine without `until`.
+    • Keep page-level suggestions consistent with document-level patterns (same extraction approach)
     """
     )
 
     messages = [
         {"role": "system", "content": sys_msg},
+        {"role": "system", "content": "DECISION TREE:\n" + decision_tree},
         {"role": "system", "content": "CHEATSHEET:\n" + cheatsheet},
         {"role": "system", "content": "WORKFLOWS:\n" + workflows},
     ]
@@ -205,10 +226,21 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
 class DocOutput(BaseModel):
     """LLM enrichment for a whole PDF (single object)."""
 
-    thought_process: str = Field(
-
-
-
+    thought_process: str = Field(
+        ...,
+        description="Overall reasoning about the PDF and extraction plan, noting whether Guides API would be better than TATR for tables",
+    )
+    code_suggestion: str = Field(
+        ...,
+        description="Python snippet using natural_pdf, preferring Guides API over TATR for table extraction",
+    )
+    difficult_elements: List[str] = Field(
+        ...,
+        description="Bullet list of page features that are genuinely hard (not tiny fonts >5pt or RTL languages)",
+    )
+    test_case: str = Field(
+        ..., description="Specific assertion that could verify the extraction worked correctly"
+    )
 
 
 def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
@@ -225,9 +257,7 @@ def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
     msgs = build_pdf_prompt(summary)
 
     completion = client.beta.chat.completions.parse(
-        model=model,
-        messages=msgs,
-        response_format=DocOutput
+        model=model, messages=msgs, response_format=DocOutput
     )
 
     # Expect exactly one function call in the first choice
@@ -249,10 +279,12 @@ def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--submission", help="Submission ID to enrich (folder name)")
     ap.add_argument("--model", default="o3")
-    ap.add_argument(
+    ap.add_argument(
+        "--api-key", default=os.getenv("OPENAI_API_KEY"), help="OpenAI key if not in env"
+    )
     ap.add_argument("--force", action="store_true", help="overwrite existing enrichment")
     args = ap.parse_args()
-
+
     if not args.api_key:
         raise SystemExit("OPENAI_API_KEY not provided")
 
@@ -270,4 +302,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
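
Taken together, the new prompt text steers the model toward a single extraction pattern. A minimal sketch of that pattern, using only calls the prompt itself names (`extract_text`, anchor selection with `below(until=...)`, `extract_table().df`, and `apply_ocr` for scanned pages); the file name and selector text here are placeholders:

from natural_pdf import PDF

pdf = PDF("example.pdf")  # hypothetical input
page = pdf.pages[0]

# 1. Try the text layer first.
if not page.extract_text().strip():
    # Scanned page: OCR before any .find() or .extract_text() calls.
    page.apply_ocr()

# 2. Anchor-based region selection: from a real label down to the next bold text.
region = page.find('text:contains("Violations")').below(until="text:bold")

# 3. extract_table() (singular) returns a TableResult exposing .df.
df = region.extract_table().df
print(df.head())
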