natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (77) hide show
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +113 -22
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,198 @@
1
+ """Evaluate the quality of LLM enrichment suggestions.
2
+
3
+ This script analyzes the code suggestions to identify:
4
+ - Use of modern features (Guides API, extract_table)
5
+ - Avoidance of anti-patterns (placeholder text, unnecessary TATR)
6
+ - Practical, working code
7
+ """
8
+
9
+ import json
10
+ import re
11
+ from collections import defaultdict
12
+ from pathlib import Path
13
+ from typing import Any, Dict, List
14
+
15
+
16
def analyze_code_quality(code: str) -> Dict[str, Any]:
    """Score a code suggestion against a fixed set of regex-based indicators.

    Args:
        code: Source text of the suggestion to inspect.

    Returns:
        A dict with one boolean per indicator, plus ``score`` (the sum of the
        weights of the positive indicators that matched) and ``max_score``
        (12, which is what the weights below add up to).
    """

    def _found(pattern: str) -> bool:
        # True when the pattern occurs anywhere in the suggestion.
        return re.search(pattern, code) is not None

    # Quoted literals that are exactly one of the known placeholder words.
    placeholder_hit = _found(r'[\'"](?:AnchorText|Texts|Also|HeaderText|TableHeader)[\'"]')
    # Any other quoted literal that does not start with a placeholder word.
    literal_hit = _found(
        r'[\'"](?!AnchorText|Texts|Also|HeaderText|TableHeader)[^\'"{}\[\]]+[\'"]'
    )

    quality: Dict[str, Any] = {
        "uses_guides": _found(r"from natural_pdf\.analyzers import Guides|Guides\(|Guides\."),
        "uses_tatr": _found(r'analyze_layout\([\'"]tatr[\'"]?\)'),
        "uses_extract_table": _found(r"\.extract_table\(\)"),
        "has_placeholder_text": placeholder_hit,
        # Only counts as "real text" when no placeholder appears anywhere.
        "uses_real_text": literal_hit and not placeholder_hit,
        "uses_snap_to_whitespace": _found(r"snap_to_whitespace"),
        "uses_parent_navigation": _found(r"\.parent\("),
        "uses_until": _found(r"until\s*="),
        "has_ocr_call": _found(r"apply_ocr\("),
        "removes_text_layer": _found(r"remove_text_layer\(|text_layer\s*=\s*False"),
    }

    # Weight contributed by each indicator that is present; indicators not
    # listed here (placeholders, OCR, text-layer removal) do not affect the score.
    weights = (
        ("uses_guides", 3),  # major positive: modern approach
        ("uses_tatr", 1),  # minor positive: valid for complex layouts
        ("uses_extract_table", 2),  # singular extraction method
        ("uses_real_text", 2),  # actual anchors rather than placeholders
        ("uses_snap_to_whitespace", 2),  # modern Guides feature
        ("uses_parent_navigation", 1),  # robust navigation
        ("uses_until", 1),  # precise region selection
    )
    total = sum(points for indicator, points in weights if quality[indicator])

    quality["score"] = total
    quality["max_score"] = 12

    return quality
64
+
65
+
66
def analyze_difficult_elements(elements: List[str]) -> Dict[str, int]:
    """Tally how many element descriptions fall into each difficulty category.

    Each description is lower-cased and tested against every category regex;
    one description may count towards several categories.

    Args:
        elements: Free-text descriptions of difficult page features.

    Returns:
        Mapping of category name to number of matching descriptions. Categories
        with no match are omitted.
    """

    category_patterns = {
        "tiny_font": r"tiny.*font|small.*font|font.*size|<\s*\d+\s*pt",
        "rtl_language": r"arabic|hebrew|rtl|right.*to.*left",
        "scanned": r"scanned|image.*only|no.*text.*layer",
        "complex_layout": r"column|multi.*column|layout",
        "handwritten": r"handwritten|hand.*written",
        "redacted": r"redact",
    }

    tally: Dict[str, int] = {}
    for description in elements:
        lowered = description.lower()
        for name in category_patterns:
            if re.search(category_patterns[name], lowered) is None:
                continue
            tally[name] = tally.get(name, 0) + 1

    return tally
86
+
87
+
88
def evaluate_submission(submission_path: Path) -> Dict[str, Any]:
    """Build the quality report for one submission directory.

    Reads ``summary.json`` inside ``submission_path`` and analyses the
    document-level code suggestion, the list of difficult elements, and every
    page that carries its own code suggestion.

    Args:
        submission_path: Directory expected to contain ``summary.json``.

    Returns:
        The report dict, or ``None`` when ``summary.json`` does not exist.
    """

    summary_file = submission_path / "summary.json"
    if not summary_file.exists():
        return None

    summary = json.loads(summary_file.read_text())
    doc_code = summary.get("code_suggestion")
    hard_items = summary.get("difficult_elements")

    # Page-level analysis: pages without a suggestion are skipped entirely.
    page_reports = []
    for page in summary.get("pages", []):
        snippet = page.get("code_suggestion")
        if not snippet:
            continue
        report = analyze_code_quality(snippet)
        report["page_number"] = page.get("page_number")
        page_reports.append(report)

    return {
        # Fall back to the directory name when the summary carries no id.
        "submission_id": summary.get("submission_id", submission_path.name),
        "has_doc_enrichment": bool(doc_code),
        "doc_code_quality": analyze_code_quality(doc_code) if doc_code else None,
        "difficult_elements_analysis": (
            analyze_difficult_elements(hard_items) if hard_items else None
        ),
        "page_code_quality": page_reports,
    }
123
+
124
+
125
def main():
    """Analyze all submissions and generate a quality report.

    Scans every sub-directory of ``eval_results`` (relative to the current
    working directory) that contains a ``summary.json``, aggregates the
    document-level quality indicators, prints a summary to stdout and writes
    the full results to ``eval_results/quality_analysis.json``.
    """

    eval_dir = Path("eval_results")
    results = []

    # iterdir() yields entries in arbitrary order; sort so that the saved
    # detailed results are reproducible between runs.
    for submission_dir in sorted(eval_dir.iterdir()):
        if submission_dir.is_dir() and (submission_dir / "summary.json").exists():
            result = evaluate_submission(submission_dir)
            if result:
                results.append(result)

    # Aggregate statistics
    stats = {
        "total_submissions": len(results),
        "with_doc_enrichment": sum(1 for r in results if r["has_doc_enrichment"]),
        "using_guides": 0,
        "using_tatr": 0,
        "using_placeholders": 0,
        "avg_quality_score": 0,
        "difficult_elements_breakdown": defaultdict(int),
    }

    all_scores = []
    for result in results:
        quality = result["doc_code_quality"]
        if quality:
            if quality["uses_guides"]:
                stats["using_guides"] += 1
            if quality["uses_tatr"]:
                stats["using_tatr"] += 1
            if quality["has_placeholder_text"]:
                stats["using_placeholders"] += 1
            all_scores.append(quality["score"])

        if result["difficult_elements_analysis"]:
            for elem_type, count in result["difficult_elements_analysis"].items():
                stats["difficult_elements_breakdown"][elem_type] += count

    if all_scores:
        stats["avg_quality_score"] = sum(all_scores) / len(all_scores)

    enriched_total = stats["with_doc_enrichment"]

    def _pct(count):
        # Share of enriched submissions; 0.0 when there are none, so an empty
        # or unenriched results directory does not raise ZeroDivisionError.
        return count / enriched_total * 100 if enriched_total else 0.0

    # Generate report
    print("\n=== Natural PDF Evaluation Quality Report ===\n")
    print(f"Total submissions analyzed: {stats['total_submissions']}")
    print(f"With document enrichment: {stats['with_doc_enrichment']}")
    print("\nCode Quality Metrics:")
    print(f" Using Guides API: {stats['using_guides']} ({_pct(stats['using_guides']):.1f}%)")
    print(f" Using TATR: {stats['using_tatr']} ({_pct(stats['using_tatr']):.1f}%)")
    print(
        f" Using placeholders: {stats['using_placeholders']} "
        f"({_pct(stats['using_placeholders']):.1f}%)"
    )
    print(f" Average quality score: {stats['avg_quality_score']:.1f}/12")

    print("\nDifficult Elements Identified:")
    # Most frequently reported category first.
    for elem_type, count in sorted(
        stats["difficult_elements_breakdown"].items(), key=lambda x: x[1], reverse=True
    ):
        print(f" {elem_type}: {count}")

    # Save detailed results
    output_path = eval_dir / "quality_analysis.json"
    with open(output_path, "w") as f:
        json.dump({"stats": stats, "detailed_results": results}, f, indent=2)

    print(f"\nDetailed results saved to: {output_path}")
195
+
196
+
197
+ if __name__ == "__main__":
198
+ main()
@@ -11,7 +11,7 @@ import argparse
11
11
  import csv
12
12
  import json
13
13
  from pathlib import Path
14
- from typing import List, Dict
14
+ from typing import Dict, List
15
15
 
16
16
  ROOT = Path(__file__).resolve().parent.parent.parent # repo root
17
17
  EVAL_DIR = ROOT / "eval_results"
@@ -29,17 +29,21 @@ def collect_records() -> List[Dict[str, str]]:
29
29
  if not tp and not cs:
30
30
  # Skip summaries without enrichment at doc level
31
31
  continue
32
- records.append({
33
- "id": data.get("submission_id", summary_path.parent.name),
34
- "thought_process": tp.replace("\n", " ").strip(),
35
- "code_suggestion": cs.replace("\n", " ").strip(),
36
- })
32
+ records.append(
33
+ {
34
+ "id": data.get("submission_id", summary_path.parent.name),
35
+ "thought_process": tp.replace("\n", " ").strip(),
36
+ "code_suggestion": cs.replace("\n", " ").strip(),
37
+ }
38
+ )
37
39
  return records
38
40
 
39
41
 
40
42
  def main():
41
43
  ap = argparse.ArgumentParser(description="Export enriched summaries to CSV.")
42
- ap.add_argument("--out", default=str(EVAL_DIR / "enrichment_export.csv"), help="Output CSV path")
44
+ ap.add_argument(
45
+ "--out", default=str(EVAL_DIR / "enrichment_export.csv"), help="Output CSV path"
46
+ )
43
47
  args = ap.parse_args()
44
48
 
45
49
  records = collect_records()
@@ -59,4 +63,4 @@ def main():
59
63
 
60
64
 
61
65
  if __name__ == "__main__":
62
- main()
66
+ main()
@@ -8,6 +8,7 @@ Environment
8
8
  -----------
9
9
  OPENAI_API_KEY must be set or passed via --api-key.
10
10
  """
11
+
11
12
  from __future__ import annotations
12
13
 
13
14
  import argparse
@@ -16,16 +17,17 @@ import json
16
17
  import os
17
18
  import textwrap
18
19
  from pathlib import Path
19
- from typing import Dict, Any, List
20
+ from typing import Any, Dict, List
20
21
 
21
22
  from openai import OpenAI
22
- from pydantic import BaseModel, Field
23
23
  from PIL import Image
24
+ from pydantic import BaseModel, Field
24
25
 
25
26
  ROOT = Path(__file__).resolve().parent.parent.parent # repo root
26
27
  EVAL_DIR = ROOT / "eval_results"
27
28
  CHEATSHEET_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_CheatSheet.md"
28
29
  WORKFLOWS_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_Workflows.md"
30
+ DECISION_TREE_PATH = ROOT / "tools" / "bad_pdf_eval" / "extraction_decision_tree.md"
29
31
 
30
32
 
31
33
  def read_md(path: Path) -> str:
@@ -43,6 +45,7 @@ def img_to_b64_jpeg(path: Path, max_px: int = 1024) -> str:
43
45
 
44
46
  from io import BytesIO
45
47
 
48
+
46
49
  def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
47
50
  """Return OpenAI chat prompt messages list."""
48
51
  cheatsheet = read_md(CHEATSHEET_PATH)
@@ -52,7 +55,10 @@ def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
52
55
  if page.get("image") and Path(page["image"]).exists():
53
56
  try:
54
57
  b64 = img_to_b64_jpeg(Path(page["image"]))
55
- image_section = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"}}
58
+ image_section = {
59
+ "type": "image_url",
60
+ "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"},
61
+ }
56
62
  except Exception:
57
63
  pass
58
64
 
@@ -93,6 +99,7 @@ def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
93
99
  def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
94
100
  cheatsheet = read_md(CHEATSHEET_PATH)
95
101
  workflows = read_md(WORKFLOWS_PATH)
102
+ decision_tree = read_md(DECISION_TREE_PATH)
96
103
 
97
104
  pdf_overview = [
98
105
  f"PDF: {Path(summary['pdf']).name}",
@@ -106,7 +113,10 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
106
113
  if page.get("image") and Path(page["image"]).exists():
107
114
  try:
108
115
  b64 = img_to_b64_jpeg(Path(page["image"]))
109
- image_section = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"}}
116
+ image_section = {
117
+ "type": "image_url",
118
+ "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"},
119
+ }
110
120
  except Exception:
111
121
  pass
112
122
  context_json = {
@@ -117,12 +127,14 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
117
127
  "blob_sample": page.get("blobs_sample", []),
118
128
  "ocr_sample": page.get("ocr_sample", []),
119
129
  }
120
- per_page_sections.append({
121
- "page_number": page["page_number"],
122
- "goal_tag": page.get("goal_tag") or "generic_extraction",
123
- "image": image_section,
124
- "context": context_json,
125
- })
130
+ per_page_sections.append(
131
+ {
132
+ "page_number": page["page_number"],
133
+ "goal_tag": page.get("goal_tag") or "generic_extraction",
134
+ "image": image_section,
135
+ "context": context_json,
136
+ }
137
+ )
126
138
 
127
139
  sys_msg = textwrap.dedent(
128
140
  """
@@ -134,15 +146,23 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
134
146
 
135
147
  Extraction strategy:
136
148
  1. Start with the text layer: `page.extract_text()`, `page.extract_table()`, or region selectors.
137
- 2. Use **anchor-based region selection**: locate a stable header/label/line/rect and select the area
149
+ 2. For tables, strongly prefer the Guides API over TATR:
150
+ • Use `Guides.from_content()` with actual column headers as markers
151
+ • Apply `.snap_to_whitespace()` to auto-align to natural gaps
152
+ • Only fall back to TATR for genuinely complex multi-table pages
153
+ 3. Use **anchor-based region selection**: locate a stable header/label/line/rect and select the area
138
154
  between anchors via `.find()`, `.below()`, `.above()`, `.until()`, `.expand()`, etc.
139
- Example: `page.find('text:contains(Violations)').below(until='text:bold')`.
140
- 3. Strongly prefer until= to find a specific ending point as opposed to a pixel-based approach,
155
+ Example: `page.find('text:contains("Violations")').below(until='text:bold')`.
156
+ 4. Strongly prefer until= to find a specific ending point as opposed to a pixel-based approach,
141
157
  as this allows your code to work on potentially other similar pages of the document.
142
- 3. Fall back to TATR or other vision models only if the text layer is unusable.
143
- • Blanket advice like "run analyze_layout('tatr') on every page" is discouraged—
144
- only invoke a layout model when anchor-based text extraction genuinely fails
145
- or evidence shows complex unruled tables that require cell-grid inference.
158
+ 5. Direct region extraction often works: `region.extract_table()` without any layout model.
159
+
160
+ Recent improvements to leverage:
161
+ Tiny text (<7pt) is now extracted reliably - no need to flag as difficult
162
+ • RTL languages (Arabic, Hebrew) work automatically with proper BiDi
163
+ • Use `.extract_table()` (singular) which returns TableResult with .df property
164
+ • Guides API can detect lines from pixels directly - no vector lines needed
165
+ • Can discard corrupted text layers with `PDF(..., text_layer=False)` or `page.remove_text_layer()`
146
166
 
147
167
  Handle tables, key-value forms, and free-form paragraphs with the same anchor-driven approach. Key-value
148
168
  forms might be easily extracted with .ask(...) or .extract(), feel free to mention as an option
@@ -158,30 +178,31 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
158
178
  a fluent API, and for loops are discouraged.
159
179
 
160
180
  Return ONE JSON object with exactly these keys:
161
- • thought_process – concise reasoning and feature/enhancement requests (≤4 short paragraphs)
181
+ • thought_process – concise reasoning about your approach, noting if Guides would work better than TATR
162
182
  • code_suggestion – executable Python snippet using natural_pdf
163
- • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (tiny fonts in `describe`, scanned_image flag, missing text layer, no ruling lines inferred from `layout_*` arrays, etc.). If no difficult element is evident, return an empty list. Do *not* speculate.
164
- • test_case – short description of how this PDF/page could be turned into an automated regression test (e.g. "assert tbl.df.shape == (12, 5)")
183
+ • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (exclude tiny fonts unless <5pt, exclude RTL languages). If no difficult element is evident, return an empty list. Do *not* speculate.
184
+ • test_case – short description of how this PDF/page could be turned into an automated regression test
165
185
 
166
186
  Code-style expectations:
167
187
  • Use **real sample text** from the page as anchors — never placeholders such as
168
- "AnchorText", "Texts", or "Also". If no stable anchor is visible, state that
169
- fact in the *thought_process* and leave a TODO rather than shipping a placeholder.
188
+ "AnchorText", "Texts", or "Also". Look in the inspect/describe data for actual text.
170
189
  • When a page is flagged as *scanned_image* (or no text layer exists) your code
171
190
  MUST call `page.apply_ocr()` *before* any `.find()` or `.extract_text()` calls.
191
+ • If text appears as "(cid:xxx)" in the evidence, use `page.remove_text_layer()` or
192
+ `PDF(..., text_layer=False)` before OCR to avoid corrupted text interference.
193
+ • For table extraction, show Guides-based approach first, TATR only as fallback
172
194
  • Prefer `header_el.parent('table')` (up-tree navigation) over a global
173
- `page.find('table')[i]` positional index — this is more robust when multiple tables
174
- are present.
175
- For tables, assume Natural-PDF returns a `TableResult`; use `tbl.df` or
176
- `tbl.to_df(header='first')` instead of manually building a DataFrame unless you
177
- need custom header/skiprows logic.
178
- • Explicitly name the extractor (`analyze_layout('tatr')`, `analyze_layout('detectron')`)
179
- instead of vague comments like "YOLO fallback".
195
+ `page.find('table')[i]` positional index — this is more robust to layout changes.
196
+ Use `.below()` or `.above()` to select regions. Add `until=` only when you need to
197
+ stop before reaching the page edge (e.g., before another section). Going to page edge
198
+ is fine without `until`.
199
+ Keep page-level suggestions consistent with document-level patterns (same extraction approach)
180
200
  """
181
201
  )
182
202
 
183
203
  messages = [
184
204
  {"role": "system", "content": sys_msg},
205
+ {"role": "system", "content": "DECISION TREE:\n" + decision_tree},
185
206
  {"role": "system", "content": "CHEATSHEET:\n" + cheatsheet},
186
207
  {"role": "system", "content": "WORKFLOWS:\n" + workflows},
187
208
  ]
@@ -205,10 +226,21 @@ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
205
226
  class DocOutput(BaseModel):
206
227
  """LLM enrichment for a whole PDF (single object)."""
207
228
 
208
- thought_process: str = Field(..., description="Overall reasoning about the PDF and extraction plan")
209
- code_suggestion: str = Field(..., description="Python snippet using natural_pdf to achieve the user goal for this PDF")
210
- difficult_elements: List[str] = Field(..., description="Bullet list of page features that are *hard* for any extraction engine")
211
- test_case: str = Field(..., description="Short description of how this PDF/page could be turned into an automated regression test")
229
+ thought_process: str = Field(
230
+ ...,
231
+ description="Overall reasoning about the PDF and extraction plan, noting whether Guides API would be better than TATR for tables",
232
+ )
233
+ code_suggestion: str = Field(
234
+ ...,
235
+ description="Python snippet using natural_pdf, preferring Guides API over TATR for table extraction",
236
+ )
237
+ difficult_elements: List[str] = Field(
238
+ ...,
239
+ description="Bullet list of page features that are genuinely hard (not tiny fonts >5pt or RTL languages)",
240
+ )
241
+ test_case: str = Field(
242
+ ..., description="Specific assertion that could verify the extraction worked correctly"
243
+ )
212
244
 
213
245
 
214
246
  def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
@@ -225,9 +257,7 @@ def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
225
257
  msgs = build_pdf_prompt(summary)
226
258
 
227
259
  completion = client.beta.chat.completions.parse(
228
- model=model,
229
- messages=msgs,
230
- response_format=DocOutput
260
+ model=model, messages=msgs, response_format=DocOutput
231
261
  )
232
262
 
233
263
  # Expect exactly one function call in the first choice
@@ -249,10 +279,12 @@ def main():
249
279
  ap = argparse.ArgumentParser()
250
280
  ap.add_argument("--submission", help="Submission ID to enrich (folder name)")
251
281
  ap.add_argument("--model", default="o3")
252
- ap.add_argument("--api-key", default=os.getenv("OPENAI_API_KEY"), help="OpenAI key if not in env")
282
+ ap.add_argument(
283
+ "--api-key", default=os.getenv("OPENAI_API_KEY"), help="OpenAI key if not in env"
284
+ )
253
285
  ap.add_argument("--force", action="store_true", help="overwrite existing enrichment")
254
286
  args = ap.parse_args()
255
-
287
+
256
288
  if not args.api_key:
257
289
  raise SystemExit("OPENAI_API_KEY not provided")
258
290
 
@@ -270,4 +302,4 @@ def main():
270
302
 
271
303
 
272
304
  if __name__ == "__main__":
273
- main()
305
+ main()