natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
tools/bad_pdf_eval/eval_suite.py
ADDED
@@ -0,0 +1,116 @@

```python
import argparse
import re
from pathlib import Path
from typing import List, Dict

import pandas as pd
from rich.console import Console

from .utils import find_local_pdf, slugify
from .analyser import BadPDFAnalyzer, extract_page_hints
from .reporter import save_json

console = Console()


DEFAULT_PAGES = [1, 2, 3]


def build_pages_list(row: Dict[str, str]) -> List[int]:
    pages = DEFAULT_PAGES.copy()
    text_fields = [
        row.get("What are we trying to get out of the PDF?", ""),
        row.get("What do you think makes this PDF bad?", ""),
    ]
    for field in text_fields:
        # Guard against NaN/None or other non-string pandas dtypes
        if isinstance(field, str) and field:
            pages += extract_page_hints(field)
    # deduplicate and sort
    pages = sorted(set(pages))
    return pages


def main():
    parser = argparse.ArgumentParser(description="Run bad PDF evaluation suite")
    parser.add_argument(
        "--csv",
        default="bad-pdfs/Bad PDF Submission form_Submissions_2025-06-22.csv",
        help="Path to submissions CSV",
    )
    parser.add_argument(
        "--output-dir",
        default="eval_results",
        help="Directory to write results into (will be git-ignored)",
    )
    parser.add_argument("--max-row", type=int, default=None, help="debug: process only first n CSV rows")
    parser.add_argument("--limit", type=int, default=None, help="process at most N PDFs with local files")
    parser.add_argument("--overwrite", action="store_true", help="re-run analysis even if summary.json exists")
    args = parser.parse_args()

    csv_path = Path(args.csv)
    df = pd.read_csv(csv_path)
    if args.max_row:
        df = df.head(args.max_row)

    output_root = Path(args.output_dir)
    output_root.mkdir(exist_ok=True)

    master_records = []
    processed = 0

    try:
        for idx, row in df.iterrows():
            submission_id = row["Submission ID"]
            pdf_url = row.get("Your bad PDF (one per submission!)", "")
            pdf_path = find_local_pdf(submission_id, pdf_url)
            if not pdf_path or not pdf_path.exists():
                console.print(f"[red]PDF not found for {submission_id}. Skipping.")
                continue

            # Ignore files that are not .pdf (e.g. ZIPs mistakenly included)
            if pdf_path.suffix.lower() != ".pdf":
                console.print(f"[yellow]Not a PDF ({pdf_path.suffix}) for {submission_id}; skipping.")
                continue

            sub_output = output_root / submission_id
            summary_path = sub_output / "summary.json"

            # Ensure the original PDF is stored alongside the analysis artefacts
            try:
                from shutil import copy2

                sub_output.mkdir(parents=True, exist_ok=True)
                dest_pdf = sub_output / pdf_path.name
                if not dest_pdf.exists():
                    copy2(pdf_path, dest_pdf)
            except Exception as copy_err:
                console.print(f"[yellow]Could not copy PDF into results folder: {copy_err}")

            if summary_path.exists() and not args.overwrite:
                console.print(f"[yellow]Summary exists for {submission_id}; skipping (use --overwrite to refresh)")
                continue

            pages = build_pages_list(row)
            try:
                analyser = BadPDFAnalyzer(pdf_path=pdf_path, output_dir=sub_output, submission_meta=row, pages=pages)
                summary = analyser.run()
                master_records.append(summary)
            except Exception as e:
                console.print(f"[red]Error processing {submission_id}: {e}. Skipping.")
                continue
            processed += 1
            if args.limit and processed >= args.limit:
                break
    except KeyboardInterrupt:
        console.print("[bold yellow]\nInterrupted by user – saving progress made so far…")
    finally:
        # Save master index even on interrupt
        if master_records:
            save_json(master_records, output_root / "master_index.json")
            console.print(f"[bold green]Progress saved to {output_root / 'master_index.json'}")
        console.print(f"[bold green]Finished. Results in {output_root}")


if __name__ == "__main__":
    main()
```
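The suite is designed to be run as `python -m tools.bad_pdf_eval.eval_suite`, but the same pieces can be driven for a single submission. A minimal sketch, assuming the CSV columns and helper signatures used above; the submission ID is only an example borrowed from the llm_enrich docstring.

```python
# Sketch: analyse one submission without the CLI loop above.
from pathlib import Path

import pandas as pd

from tools.bad_pdf_eval.analyser import BadPDFAnalyzer
from tools.bad_pdf_eval.eval_suite import build_pages_list
from tools.bad_pdf_eval.utils import find_local_pdf

df = pd.read_csv("bad-pdfs/Bad PDF Submission form_Submissions_2025-06-22.csv")
row = df[df["Submission ID"] == "ja6EqV1"].iloc[0]  # example ID

pdf_path = find_local_pdf(row["Submission ID"], row.get("Your bad PDF (one per submission!)", ""))
if pdf_path and pdf_path.suffix.lower() == ".pdf":
    out_dir = Path("eval_results") / row["Submission ID"]
    summary = BadPDFAnalyzer(
        pdf_path=pdf_path,
        output_dir=out_dir,
        submission_meta=row,
        pages=build_pages_list(row),
    ).run()
    # "pdf" and "pages" are the summary keys consumed later by llm_enrich.py
    print(summary.get("pdf"), len(summary.get("pages", [])))
```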
tools/bad_pdf_eval/export_enrichment_csv.py
ADDED
@@ -0,0 +1,62 @@

```python
from __future__ import annotations

"""Export enrichment data (id, thought_process, code_suggestion) to a CSV.

Usage
-----
python -m tools.bad_pdf_eval.export_enrichment_csv --out eval_results/enrichment.csv
"""

import argparse
import csv
import json
from pathlib import Path
from typing import List, Dict

ROOT = Path(__file__).resolve().parent.parent.parent  # repo root
EVAL_DIR = ROOT / "eval_results"


def collect_records() -> List[Dict[str, str]]:
    records: List[Dict[str, str]] = []
    for summary_path in EVAL_DIR.glob("*/summary.json"):
        try:
            data = json.loads(summary_path.read_text())
        except Exception:
            continue
        tp = data.get("thought_process", "").strip()
        cs = data.get("code_suggestion", "").strip()
        if not tp and not cs:
            # Skip summaries without enrichment at doc level
            continue
        records.append({
            "id": data.get("submission_id", summary_path.parent.name),
            "thought_process": tp.replace("\n", " ").strip(),
            "code_suggestion": cs.replace("\n", " ").strip(),
        })
    return records


def main():
    ap = argparse.ArgumentParser(description="Export enriched summaries to CSV.")
    ap.add_argument("--out", default=str(EVAL_DIR / "enrichment_export.csv"), help="Output CSV path")
    args = ap.parse_args()

    records = collect_records()
    if not records:
        print("No enriched summaries found.")
        return

    out_path = Path(args.out)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["id", "thought_process", "code_suggestion"])
        writer.writeheader()
        for rec in records:
            writer.writerow(rec)

    print(f"Wrote {len(records)} records to {out_path}")


if __name__ == "__main__":
    main()
```
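Since the exporter flattens newlines into spaces, the quickest sanity check is to read the CSV back. A small sketch assuming the default output path used above:

```python
# Quick check of the exported CSV (default path from export_enrichment_csv.py).
import pandas as pd

df = pd.read_csv("eval_results/enrichment_export.csv")
print(df.shape)  # rows = enriched submissions, 3 columns
print(df[["id", "code_suggestion"]].head())
```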
tools/bad_pdf_eval/llm_enrich.py
ADDED
@@ -0,0 +1,273 @@

```python
"""Enrich evaluation summaries with LLM-generated thought_process and code_suggestion.

Usage
-----
python -m tools.bad_pdf_eval.llm_enrich --submission ja6EqV1 --model o3

Environment
-----------
OPENAI_API_KEY must be set or passed via --api-key.
"""
from __future__ import annotations

import argparse
import base64
import json
import os
import textwrap
from io import BytesIO
from pathlib import Path
from typing import Dict, Any, List

from openai import OpenAI
from pydantic import BaseModel, Field
from PIL import Image

ROOT = Path(__file__).resolve().parent.parent.parent  # repo root
EVAL_DIR = ROOT / "eval_results"
CHEATSHEET_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_CheatSheet.md"
WORKFLOWS_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_Workflows.md"

# Overridden by the --force CLI flag in main(); default keeps the "skip if already enriched" behaviour.
FORCE = False


def read_md(path: Path) -> str:
    return path.read_text(encoding="utf-8")


def img_to_b64_jpeg(path: Path, max_px: int = 1024) -> str:
    """Return base64-encoded tiny JPEG thumbnail."""
    with Image.open(path) as im:
        im.thumbnail((max_px, max_px))
        buffered = BytesIO()
        im.convert("RGB").save(buffered, format="JPEG", quality=40, optimize=True)
        return base64.b64encode(buffered.getvalue()).decode()


def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
    """Return OpenAI chat prompt messages list (per-page variant; enrich_summary uses build_pdf_prompt below)."""
    cheatsheet = read_md(CHEATSHEET_PATH)
    workflows = read_md(WORKFLOWS_PATH)

    image_section = None
    if page.get("image") and Path(page["image"]).exists():
        try:
            b64 = img_to_b64_jpeg(Path(page["image"]))
            image_section = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"}}
        except Exception:
            pass

    context_json = {
        "describe": page.get("describe", ""),
        "inspect": page.get("inspect", ""),
        "layout_yolo_regions": page.get("layout_yolo_regions", []),
        "layout_tatr_regions": page.get("layout_tatr_regions", []),
        "blob_sample": page.get("blobs_sample", []),
        "ocr_sample": page.get("ocr_sample", []),
    }

    sys_msg = textwrap.dedent(
        f"""
        You are an expert Natural-PDF engineer. Use the cheat-sheet and workflows to craft bespoke extraction code.
        Return JSON with two keys: thought_process (concise reasoning) and code_suggestion (Python code). Do not add extra keys.
        """
    )

    messages = [
        {"role": "system", "content": sys_msg},
        {"role": "system", "content": "CHEATSHEET:\n" + cheatsheet},
        {"role": "system", "content": "WORKFLOWS:\n" + workflows},
    ]

    user_parts = [
        f"Goal: {page.get('goal_tag') or 'generic_extraction'} — {page.get('goal', '') or 'Extract the most useful information (text, tables, key/value pairs) from the page.'}",
        f"Page number: {page['page_number']}",
        "Context JSON:" + json.dumps(context_json),
        "Provide your JSON response now.",
    ]
    if image_section:
        # NOTE: image_section is a dict, so the join below only works when no image is attached;
        # the document-level build_pdf_prompt sends content parts instead.
        user_parts.insert(2, image_section)
    messages.append({"role": "user", "content": "\n\n".join(user_parts)})
    return messages


def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
    cheatsheet = read_md(CHEATSHEET_PATH)
    workflows = read_md(WORKFLOWS_PATH)

    pdf_overview = [
        f"PDF: {Path(summary['pdf']).name}",
        f"Goal: {summary.get('goal') or 'Extract useful information from the document'}",
        f"Total pages analysed: {len(summary['pages'])}",
    ]

    per_page_sections = []
    for page in summary["pages"]:
        image_section = None
        if page.get("image") and Path(page["image"]).exists():
            try:
                b64 = img_to_b64_jpeg(Path(page["image"]))
                image_section = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"}}
            except Exception:
                pass
        context_json = {
            "describe": page.get("describe", ""),
            "inspect": page.get("inspect", ""),
            "layout_yolo_regions": page.get("layout_yolo_regions", []),
            "layout_tatr_regions": page.get("layout_tatr_regions", []),
            "blob_sample": page.get("blobs_sample", []),
            "ocr_sample": page.get("ocr_sample", []),
        }
        per_page_sections.append({
            "page_number": page["page_number"],
            "goal_tag": page.get("goal_tag") or "generic_extraction",
            "image": image_section,
            "context": context_json,
        })

    sys_msg = textwrap.dedent(
        """
        You are a Natural-PDF engineer with full access to the provided evidence
        (describe/inspect text, YOLO & TATR regions, blob samples, OCR snippets, images).

        Rely on these artefacts—not on generic heuristics. Avoid phrases like "try" or "usually this works".
        If the evidence is genuinely insufficient, state exactly what is missing.

        Extraction strategy:
        1. Start with the text layer: `page.extract_text()`, `page.extract_table()`, or region selectors.
        2. Use **anchor-based region selection**: locate a stable header/label/line/rect and select the area
           between anchors via `.find()`, `.below()`, `.above()`, `.until()`, `.expand()`, etc.
           Example: `page.find('text:contains(Violations)').below(until='text:bold')`.
        3. Strongly prefer until= to find a specific ending point as opposed to a pixel-based approach,
           as this allows your code to work on potentially other similar pages of the document.
        4. Fall back to TATR or other vision models only if the text layer is unusable.
           • Blanket advice like "run analyze_layout('tatr') on every page" is discouraged—
             only invoke a layout model when anchor-based text extraction genuinely fails
             or evidence shows complex unruled tables that require cell-grid inference.

        Handle tables, key-value forms, and free-form paragraphs with the same anchor-driven approach. Key-value
        forms might be easily extracted with .ask(...) or .extract(); feel free to mention these as an option,
        but try not to rely on them.

        Use Natural PDF Flows to access multi-page or columnar content, falling back on loops when necessary.

        If it seems like the approach is not ideal, or that additional features would be useful in
        this use case, outline the specifics of the issues and what additional information/approaches/code/etc
        would allow you to more easily extract the information.

        When working with pages or elements, try to use .apply and .filter. Natural PDF stresses
        a fluent API, and for loops are discouraged.

        Return ONE JSON object with exactly these keys:
        • thought_process – concise reasoning and feature/enhancement requests (≤4 short paragraphs)
        • code_suggestion – executable Python snippet using natural_pdf
        • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (tiny fonts in `describe`, scanned_image flag, missing text layer, no ruling lines inferred from `layout_*` arrays, etc.). If no difficult element is evident, return an empty list. Do *not* speculate.
        • test_case – short description of how this PDF/page could be turned into an automated regression test (e.g. "assert tbl.df.shape == (12, 5)")

        Code-style expectations:
        • Use **real sample text** from the page as anchors — never placeholders such as
          "AnchorText", "Texts", or "Also". If no stable anchor is visible, state that
          fact in the *thought_process* and leave a TODO rather than shipping a placeholder.
        • When a page is flagged as *scanned_image* (or no text layer exists) your code
          MUST call `page.apply_ocr()` *before* any `.find()` or `.extract_text()` calls.
        • Prefer `header_el.parent('table')` (up-tree navigation) over a global
          `page.find('table')[i]` positional index — this is more robust when multiple tables
          are present.
        • For tables, assume Natural-PDF returns a `TableResult`; use `tbl.df` or
          `tbl.to_df(header='first')` instead of manually building a DataFrame unless you
          need custom header/skiprows logic.
        • Explicitly name the extractor (`analyze_layout('tatr')`, `analyze_layout('detectron')`)
          instead of vague comments like "YOLO fallback".
        """
    )

    messages = [
        {"role": "system", "content": sys_msg},
        {"role": "system", "content": "CHEATSHEET:\n" + cheatsheet},
        {"role": "system", "content": "WORKFLOWS:\n" + workflows},
    ]

    user_content: List[Dict[str, Any]] = [{"type": "text", "text": "\n".join(pdf_overview)}]
    for sec in per_page_sections:
        txt = json.dumps({k: sec[k] for k in ("page_number", "goal_tag", "context")})
        user_content.append({"type": "text", "text": txt})
        if sec["image"]:
            user_content.append(sec["image"])

    messages.append({"role": "user", "content": user_content})
    return messages


# -------------------------------------------------
# Structured output via Pydantic model + function call
# -------------------------------------------------


class DocOutput(BaseModel):
    """LLM enrichment for a whole PDF (single object)."""

    thought_process: str = Field(..., description="Overall reasoning about the PDF and extraction plan")
    code_suggestion: str = Field(..., description="Python snippet using natural_pdf to achieve the user goal for this PDF")
    difficult_elements: List[str] = Field(..., description="Bullet list of page features that are *hard* for any extraction engine")
    test_case: str = Field(..., description="Short description of how this PDF/page could be turned into an automated regression test")


def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
    summary = json.loads(summary_path.read_text())

    # Decide whether to re-enrich
    if not FORCE and summary.get("thought_process") and summary.get("code_suggestion"):
        print(f"[skip] {summary_path.parent.name}: already enriched (use --force to overwrite)")
        return

    print(f"[send] {summary_path.parent.name}: requesting enrichment for entire document")

    client = OpenAI(api_key=api_key)
    msgs = build_pdf_prompt(summary)

    completion = client.beta.chat.completions.parse(
        model=model,
        messages=msgs,
        response_format=DocOutput,
    )

    # The parsed DocOutput is attached to the first choice
    doc_out = completion.choices[0].message.parsed

    summary["thought_process"] = doc_out.thought_process
    summary["code_suggestion"] = doc_out.code_suggestion
    summary["difficult_elements"] = doc_out.difficult_elements
    summary["test_case"] = doc_out.test_case

    print("** Code suggestion:\n", doc_out.code_suggestion)
    print("** Thought process:\n", doc_out.thought_process)

    summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False))
    print(f"[update] Wrote enriched data to {summary_path}")


def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--submission", help="Submission ID to enrich (folder name)")
    ap.add_argument("--model", default="o3")
    ap.add_argument("--api-key", default=os.getenv("OPENAI_API_KEY"), help="OpenAI key if not in env")
    ap.add_argument("--force", action="store_true", help="overwrite existing enrichment")
    args = ap.parse_args()

    if not args.api_key:
        raise SystemExit("OPENAI_API_KEY not provided")

    global FORCE
    FORCE = args.force

    if args.submission:
        summary_path = EVAL_DIR / args.submission / "summary.json"
        if not summary_path.exists():
            raise SystemExit("summary.json not found for submission")
        enrich_summary(summary_path, args.api_key, args.model)
    else:
        for summary_path in EVAL_DIR.glob("*/summary.json"):
            enrich_summary(summary_path, args.api_key, args.model)


if __name__ == "__main__":
    main()
```
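The structured-output call above relies on the OpenAI SDK's `client.beta.chat.completions.parse` attaching a parsed `DocOutput` instance to `message.parsed`. The helper can also be driven without the CLI; a minimal sketch, with the submission ID taken from the docstring example:

```python
# Sketch: enrich a single summary programmatically instead of via the CLI.
import os
from pathlib import Path

from tools.bad_pdf_eval import llm_enrich

llm_enrich.FORCE = False  # module-level flag normally set by --force in main()
llm_enrich.enrich_summary(
    Path("eval_results/ja6EqV1/summary.json"),  # example submission folder
    api_key=os.environ["OPENAI_API_KEY"],
    model="o3",
)
```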
tools/bad_pdf_eval/reporter.py
ADDED
@@ -0,0 +1,17 @@

```python
import json
from pathlib import Path
from typing import Any, Dict

from rich.console import Console

console = Console()


def save_json(data: Dict[str, Any], path: Path):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False)


def log_section(title: str):
    console.rule(f"[bold cyan]{title}")
```
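A minimal usage sketch of the two helpers (the output path is only an example):

```python
from pathlib import Path

from tools.bad_pdf_eval.reporter import log_section, save_json

log_section("Demo run")  # draws a cyan rule in the console
save_json({"submission_id": "demo", "pages": []}, Path("eval_results/demo/summary.json"))
```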
tools/bad_pdf_eval/utils.py
ADDED
@@ -0,0 +1,127 @@

```python
import re
from pathlib import Path
from typing import Optional
import urllib.request
import ssl
from rich.console import Console

ROOT_DIR = Path(__file__).resolve().parent.parent.parent  # project root
BAD_PDF_DIR = ROOT_DIR / "bad_pdf_analysis"
SUBMISSIONS_DIR = ROOT_DIR / "bad-pdfs" / "submissions"

console = Console()


def slugify(value: str, max_length: int = 50) -> str:
    """Make a filesystem-safe filename from arbitrary text."""
    value = re.sub(r"[^\w\-\. ]+", "_", value)
    value = value.strip().replace(" ", "_")
    return value[:max_length]


def _search_directory(directory: Path, pattern: str, predicate) -> Optional[Path]:
    """Utility: recursively search *directory* using glob *pattern*; return first match passing *predicate*."""
    for p in directory.glob(pattern):
        try:
            if predicate(p):
                return p
        except Exception:
            # Just in case of any weird permission/path errors – skip
            continue
    return None


def find_local_pdf(submission_id: str, pdf_url: Optional[str] = None) -> Optional[Path]:
    """Return the local path to the PDF for *submission_id*.

    Search strategy (in order):
    1. Inside ``bad_pdf_analysis`` where early analyses live – matching *submission_id* in filename.
    2. Inside ``bad-pdfs/submissions`` where raw downloads reside – matching *submission_id*.
    3. If *pdf_url* is supplied, also try the basename of the URL in ``bad-pdfs/submissions``.
    4. As a last resort – try to download the PDF to the submissions folder.
    """

    submission_id_lower = submission_id.lower()

    # 1) Search the processed-analysis folder (legacy path)
    path = _search_directory(
        BAD_PDF_DIR,
        f"**/{submission_id}*.pdf",
        lambda p: submission_id_lower in p.stem.lower(),
    )
    if path:
        return path

    # 2) Search the raw submissions folder by id substring
    path = _search_directory(
        SUBMISSIONS_DIR,
        f"**/*{submission_id}*.pdf",
        lambda p: submission_id_lower in p.stem.lower(),
    )
    if path:
        return path

    # 3) Use basename from URL, if provided
    if pdf_url:
        # Extract filename portion before any query string
        from urllib.parse import urlparse

        parsed = urlparse(pdf_url)
        filename = Path(parsed.path).name
        if filename:
            candidate = SUBMISSIONS_DIR / filename
            if candidate.exists():
                return candidate
            # fallback: case-insensitive glob match on stem
            stem = Path(filename).stem.lower()
            path = _search_directory(
                SUBMISSIONS_DIR,
                f"**/{stem}*.pdf",
                lambda p: stem in p.stem.lower(),
            )
            if path:
                return path

    # 4) As a last resort – try to download the PDF to the submissions folder
    if pdf_url:
        try:
            SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
            from urllib.parse import urlparse

            parsed = urlparse(pdf_url)
            filename = Path(parsed.path).name or f"{submission_id}.pdf"
            # Sanitise filename a bit – avoid query strings leaking in
            filename = slugify(filename, max_length=100)
            if not filename.lower().endswith(".pdf"):
                filename += ".pdf"

            dest_path = SUBMISSIONS_DIR / filename
            if not dest_path.exists():
                # Retrieve the file (no progress bar – keep it simple and robust)
                try:
                    # Some hosts reject the default Python user-agent; set one.
                    req = urllib.request.Request(pdf_url, headers={"User-Agent": "Mozilla/5.0"})
                    # Explicit default SSL context (certificate verification stays enabled)
                    ctx = ssl.create_default_context()
                    with urllib.request.urlopen(req, context=ctx, timeout=30) as resp, open(dest_path, "wb") as f:
                        f.write(resp.read())
                except Exception:
                    # Fallback: try requests if available (venv usually has it)
                    try:
                        import requests

                        r = requests.get(pdf_url, timeout=30)
                        r.raise_for_status()
                        with open(dest_path, "wb") as f:
                            f.write(r.content)
                    except Exception as e2:
                        console.print(f"[red]Download failed for {submission_id}: {e2}")
                        return None
            if dest_path.exists():
                return dest_path
        except Exception:
            # Networking problems, permissions, etc. – silently give up; caller will log
            return None

    # None found
    return None
```
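A short sketch of how the two public helpers behave; the submission ID is a placeholder, and per the final fallback above, `find_local_pdf` may attempt a download when given a URL:

```python
from tools.bad_pdf_eval.utils import find_local_pdf, slugify

# slugify keeps word chars, dashes, dots and spaces, then swaps spaces for underscores
print(slugify("Annual report (final) 2022.pdf"))

# Resolves a local copy from bad_pdf_analysis or bad-pdfs/submissions;
# returns None when nothing is found locally and no URL was supplied.
print(find_local_pdf("ja6EqV1", pdf_url=None))
```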
tools/rtl_smoke_test.py
ADDED
@@ -0,0 +1,80 @@

```python
#!/usr/bin/env python3
"""RTL pipeline smoke-test for natural-pdf.

Run it from the repository root:

    python tools/rtl_smoke_test.py

It loads *pdfs/arabic.pdf* and performs a handful of checks that cover the
most common break-points we identified for RTL handling:
1. char ingestion / word grouping
2. selector finds on logical Arabic tokens
3. bracket mirroring
4. number directionality inside RTL lines

Exit code is **0** when all checks pass, **1** otherwise.
"""
from __future__ import annotations

import sys
from pathlib import Path

from bidi.algorithm import get_display  # type: ignore

from natural_pdf import PDF
from natural_pdf.utils.bidi_mirror import mirror_brackets


PDF_PATH = Path("pdfs/arabic.pdf")

if not PDF_PATH.exists():
    print(f"❗ PDF not found: {PDF_PATH.resolve()}")
    sys.exit(1)

# ────────────────────────────────────────────────────────────────
# Helpers
# ────────────────────────────────────────────────────────────────

failures: list[str] = []


def check(cond: bool, msg: str):
    """Collect failures but keep running to show full report."""
    if cond:
        print(f"✓ {msg}")
    else:
        print(f"✗ {msg}")
        failures.append(msg)


# ────────────────────────────────────────────────────────────────
# Load page
# ────────────────────────────────────────────────────────────────

pdf = PDF(str(PDF_PATH))
page = pdf.pages[0]

# Basic char/word counts (should be non-zero)
check(len(page.chars) > 0, "chars were ingested")
check(len(page.words) > 0, "words were grouped")

# First line logical text
logical_first_line = page.extract_text().split("\n")[0]
print("First logical line:")
print("  ", logical_first_line)

# 1. Arabic keyword should be findable
check(page.find(text="مكرر") is not None, "page.find works for Arabic token 'مكرر'")

# 2. Reversed token should NOT match
check(page.find(text="مكرر"[::-1]) is None, "reverse token does not match (logical order stored)")

# 3. Extracted line should already show the bracket pair in correct orientation
check("(مكرر)" in logical_first_line, "parentheses orientation is correct in extract_text")

# 4. Western numbers must stay LTR inside RTL
# After visual re-order, the line should end with 2022 (year on the left visually → last in logical string)
check(logical_first_line.rstrip().endswith("2022"), "Western number '2022' kept logical placement")

# Six check() calls above in total
print("\nSummary: {} passed, {} failed".format(6 - len(failures), len(failures)))

sys.exit(0 if not failures else 1)
```
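The same anchors translate directly into an automated regression test, which is what the eval tooling's `test_case` field asks for. A hedged sketch using pytest (an assumption, not part of this diff), reusing the checks from the smoke test:

```python
# Hypothetical pytest wrapper around the RTL checks above.
from pathlib import Path

import pytest

from natural_pdf import PDF

ARABIC_PDF = Path("pdfs/arabic.pdf")


@pytest.mark.skipif(not ARABIC_PDF.exists(), reason="sample RTL PDF not checked out")
def test_rtl_logical_order():
    page = PDF(str(ARABIC_PDF)).pages[0]
    first_line = page.extract_text().split("\n")[0]

    assert page.find(text="مكرر") is not None    # logical-order token is findable
    assert page.find(text="مكرر"[::-1]) is None  # reversed token must not match
    assert "(مكرر)" in first_line                # bracket mirroring preserved
    assert first_line.rstrip().endswith("2022")  # western digits keep logical placement
```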