natural-pdf 0.1.28__py3-none-any.whl → 0.1.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +556 -25
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +89 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
tools/bad_pdf_eval/eval_suite.py
@@ -0,0 +1,116 @@
1
+ import argparse
2
+ import re
3
+ from pathlib import Path
4
+ from typing import List, Dict
5
+
6
+ import pandas as pd
7
+ from rich.console import Console
8
+
9
+ from .utils import find_local_pdf, slugify
10
+ from .analyser import BadPDFAnalyzer, extract_page_hints
11
+ from .reporter import save_json
12
+
13
+ console = Console()
14
+
15
+
16
+ DEFAULT_PAGES = [1, 2, 3]
17
+
18
+
19
+ def build_pages_list(row: Dict[str, str]) -> List[int]:
20
+ pages = DEFAULT_PAGES.copy()
21
+ text_fields = [
22
+ row.get("What are we trying to get out of the PDF?", ""),
23
+ row.get("What do you think makes this PDF bad?", ""),
24
+ ]
25
+ for field in text_fields:
26
+ # Guard against NaN/None or other non-string pandas dtypes
27
+ if isinstance(field, str) and field:
28
+ pages += extract_page_hints(field)
29
+ # deduplicate and sort
30
+ pages = sorted(set(pages))
31
+ return pages
32
+
33
+
34
+ def main():
35
+ parser = argparse.ArgumentParser(description="Run bad PDF evaluation suite")
36
+ parser.add_argument(
37
+ "--csv",
38
+ default="bad-pdfs/Bad PDF Submission form_Submissions_2025-06-22.csv",
39
+ help="Path to submissions CSV",
40
+ )
41
+ parser.add_argument(
42
+ "--output-dir",
43
+ default="eval_results",
44
+ help="Directory to write results into (will be git-ignored)",
45
+ )
46
+ parser.add_argument("--max-row", type=int, default=None, help="debug: process only first n CSV rows")
47
+ parser.add_argument("--limit", type=int, default=None, help="process at most N PDFs with local files")
48
+ parser.add_argument("--overwrite", action="store_true", help="re-run analysis even if summary.json exists")
49
+ args = parser.parse_args()
50
+
51
+ csv_path = Path(args.csv)
52
+ df = pd.read_csv(csv_path)
53
+ if args.max_row:
54
+ df = df.head(args.max_row)
55
+
56
+ output_root = Path(args.output_dir)
57
+ output_root.mkdir(exist_ok=True)
58
+
59
+ master_records = []
60
+ processed = 0
61
+
62
+ try:
63
+ for idx, row in df.iterrows():
64
+ submission_id = row["Submission ID"]
65
+ pdf_url = row.get("Your bad PDF (one per submission!)", "")
66
+ pdf_path = find_local_pdf(submission_id, pdf_url)
67
+ if not pdf_path or not pdf_path.exists():
68
+ console.print(f"[red]PDF not found for {submission_id}. Skipping.")
69
+ continue
70
+
71
+ # Ignore files that are not .pdf (e.g. ZIPs mistakenly included)
72
+ if pdf_path.suffix.lower() != ".pdf":
73
+ console.print(f"[yellow]Not a PDF ({pdf_path.suffix}) for {submission_id}; skipping.")
74
+ continue
75
+
76
+ sub_output = output_root / submission_id
77
+ summary_path = sub_output / "summary.json"
78
+
79
+ # Ensure the original PDF is stored alongside the analysis artefacts
80
+ try:
81
+ from shutil import copy2
82
+
83
+ sub_output.mkdir(parents=True, exist_ok=True)
84
+ dest_pdf = sub_output / pdf_path.name
85
+ if not dest_pdf.exists():
86
+ copy2(pdf_path, dest_pdf)
87
+ except Exception as copy_err:
88
+ console.print(f"[yellow]Could not copy PDF into results folder: {copy_err}")
89
+
90
+ if summary_path.exists() and not args.overwrite:
91
+ console.print(f"[yellow]Summary exists for {submission_id}; skipping (use --overwrite to refresh)")
92
+ continue
93
+
94
+ pages = build_pages_list(row)
95
+ try:
96
+ analyser = BadPDFAnalyzer(pdf_path=pdf_path, output_dir=sub_output, submission_meta=row, pages=pages)
97
+ summary = analyser.run()
98
+ master_records.append(summary)
99
+ except Exception as e:
100
+ console.print(f"[red]Error processing {submission_id}: {e}. Skipping.")
101
+ continue
102
+ processed += 1
103
+ if args.limit and processed >= args.limit:
104
+ break
105
+ except KeyboardInterrupt:
106
+ console.print("[bold yellow]\nInterrupted by user – saving progress made so far…")
107
+ finally:
108
+ # Save master index even on interrupt
109
+ if master_records:
110
+ save_json(master_records, output_root / "master_index.json")
111
+ console.print(f"[bold green]Progress saved to {output_root / 'master_index.json'}")
112
+ console.print(f"[bold green]Finished. Results in {output_root}")
113
+
114
+
115
+ if __name__ == "__main__":
116
+ main()
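For reference, eval_suite.py above is the CLI driver for the evaluation run. A typical invocation, assuming it is executed as a module from the repository root like the other tools in this package, would look like:

    python -m tools.bad_pdf_eval.eval_suite --csv "bad-pdfs/Bad PDF Submission form_Submissions_2025-06-22.csv" --limit 5 --overwrite

--limit caps how many PDFs with local files are processed in one run, and --overwrite forces re-analysis even when a summary.json already exists.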
tools/bad_pdf_eval/export_enrichment_csv.py
@@ -0,0 +1,62 @@
1
+ from __future__ import annotations
2
+
3
+ """Export enrichment data (id, thought_process, code_suggestion) to a CSV.
4
+
5
+ Usage
6
+ -----
7
+ python -m tools.bad_pdf_eval.export_enrichment_csv --out eval_results/enrichment.csv
8
+ """
9
+
10
+ import argparse
11
+ import csv
12
+ import json
13
+ from pathlib import Path
14
+ from typing import List, Dict
15
+
16
+ ROOT = Path(__file__).resolve().parent.parent.parent # repo root
17
+ EVAL_DIR = ROOT / "eval_results"
18
+
19
+
20
+ def collect_records() -> List[Dict[str, str]]:
21
+ records: List[Dict[str, str]] = []
22
+ for summary_path in EVAL_DIR.glob("*/summary.json"):
23
+ try:
24
+ data = json.loads(summary_path.read_text())
25
+ except Exception:
26
+ continue
27
+ tp = data.get("thought_process", "").strip()
28
+ cs = data.get("code_suggestion", "").strip()
29
+ if not tp and not cs:
30
+ # Skip summaries without enrichment at doc level
31
+ continue
32
+ records.append({
33
+ "id": data.get("submission_id", summary_path.parent.name),
34
+ "thought_process": tp.replace("\n", " ").strip(),
35
+ "code_suggestion": cs.replace("\n", " ").strip(),
36
+ })
37
+ return records
38
+
39
+
40
+ def main():
41
+ ap = argparse.ArgumentParser(description="Export enriched summaries to CSV.")
42
+ ap.add_argument("--out", default=str(EVAL_DIR / "enrichment_export.csv"), help="Output CSV path")
43
+ args = ap.parse_args()
44
+
45
+ records = collect_records()
46
+ if not records:
47
+ print("No enriched summaries found.")
48
+ return
49
+
50
+ out_path = Path(args.out)
51
+ out_path.parent.mkdir(parents=True, exist_ok=True)
52
+ with out_path.open("w", newline="", encoding="utf-8") as f:
53
+ writer = csv.DictWriter(f, fieldnames=["id", "thought_process", "code_suggestion"])
54
+ writer.writeheader()
55
+ for rec in records:
56
+ writer.writerow(rec)
57
+
58
+ print(f"Wrote {len(records)} records to {out_path}")
59
+
60
+
61
+ if __name__ == "__main__":
62
+ main()
tools/bad_pdf_eval/llm_enrich.py
@@ -0,0 +1,273 @@
1
+ """Enrich evaluation summaries with LLM-generated thought_process and code_suggestion.
2
+
3
+ Usage
4
+ -----
5
+ python -m tools.bad_pdf_eval.llm_enrich --submission ja6EqV1 --model o3
6
+
7
+ Environment
8
+ -----------
9
+ OPENAI_API_KEY must be set or passed via --api-key.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import base64
15
+ import json
16
+ import os
17
+ import textwrap
18
+ from io import BytesIO
+ from pathlib import Path
19
+ from typing import Dict, Any, List
20
+
21
+ from openai import OpenAI
22
+ from pydantic import BaseModel, Field
23
+ from PIL import Image
24
+
25
+ ROOT = Path(__file__).resolve().parent.parent.parent # repo root
26
+ EVAL_DIR = ROOT / "eval_results"
27
+ CHEATSHEET_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_CheatSheet.md"
28
+ WORKFLOWS_PATH = ROOT / "tools" / "bad_pdf_eval" / "LLM_NaturalPDF_Workflows.md"
29
+
30
+
31
+ def read_md(path: Path) -> str:
32
+ return path.read_text(encoding="utf-8")
33
+
34
+
35
+ def img_to_b64_jpeg(path: Path, max_px: int = 1024) -> str:
36
+ """Return base64-encoded tiny JPEG thumbnail."""
37
+ with Image.open(path) as im:
38
+ im.thumbnail((max_px, max_px))
39
+ buffered = BytesIO()
40
+ im.convert("RGB").save(buffered, format="JPEG", quality=40, optimize=True)
41
+ return base64.b64encode(buffered.getvalue()).decode()
42
+
43
+
46
+ def build_prompt(page: Dict[str, Any]) -> List[Dict[str, str]]:
47
+ """Return OpenAI chat prompt messages list."""
48
+ cheatsheet = read_md(CHEATSHEET_PATH)
49
+ workflows = read_md(WORKFLOWS_PATH)
50
+
51
+ image_section = None
52
+ if page.get("image") and Path(page["image"]).exists():
53
+ try:
54
+ b64 = img_to_b64_jpeg(Path(page["image"]))
55
+ image_section = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"}}
56
+ except Exception:
57
+ pass
58
+
59
+ context_json = {
60
+ "describe": page.get("describe", ""),
61
+ "inspect": page.get("inspect", ""),
62
+ "layout_yolo_regions": page.get("layout_yolo_regions", []),
63
+ "layout_tatr_regions": page.get("layout_tatr_regions", []),
64
+ "blob_sample": page.get("blobs_sample", []),
65
+ "ocr_sample": page.get("ocr_sample", []),
66
+ }
67
+
68
+ sys_msg = textwrap.dedent(
69
+ f"""
70
+ You are an expert Natural-PDF engineer. Use the cheat-sheet and workflows to craft bespoke extraction code.
71
+ Return JSON with two keys: thought_process (concise reasoning) and code_suggestion (Python code). Do not add extra keys.
72
+ """
73
+ )
74
+
75
+ messages = [
76
+ {"role": "system", "content": sys_msg},
77
+ {"role": "system", "content": "CHEATSHEET:\n" + cheatsheet},
78
+ {"role": "system", "content": "WORKFLOWS:\n" + workflows},
79
+ ]
80
+
81
+ user_parts = [
82
+ f"Goal: {page.get('goal_tag') or 'generic_extraction'} — {page.get('goal', '') or 'Extract the most useful information (text, tables, key/value pairs) from the page.'}",
83
+ f"Page number: {page['page_number']}",
84
+ "Context JSON:" + json.dumps(context_json),
85
+ "Provide your JSON response now.",
86
+ ]
87
+ user_content: List[Dict[str, Any]] = [{"type": "text", "text": "\n\n".join(user_parts)}]
+ if image_section:
+ user_content.insert(1, image_section)
+ messages.append({"role": "user", "content": user_content})
90
+ return messages
91
+
92
+
93
+ def build_pdf_prompt(summary: Dict[str, Any]) -> List[Dict[str, Any]]:
94
+ cheatsheet = read_md(CHEATSHEET_PATH)
95
+ workflows = read_md(WORKFLOWS_PATH)
96
+
97
+ pdf_overview = [
98
+ f"PDF: {Path(summary['pdf']).name}",
99
+ f"Goal: {summary.get('goal') or 'Extract useful information from the document'}",
100
+ f"Total pages analysed: {len(summary['pages'])}",
101
+ ]
102
+
103
+ per_page_sections = []
104
+ for page in summary["pages"]:
105
+ image_section = None
106
+ if page.get("image") and Path(page["image"]).exists():
107
+ try:
108
+ b64 = img_to_b64_jpeg(Path(page["image"]))
109
+ image_section = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}", "detail": "low"}}
110
+ except Exception:
111
+ pass
112
+ context_json = {
113
+ "describe": page.get("describe", ""),
114
+ "inspect": page.get("inspect", ""),
115
+ "layout_yolo_regions": page.get("layout_yolo_regions", []),
116
+ "layout_tatr_regions": page.get("layout_tatr_regions", []),
117
+ "blob_sample": page.get("blobs_sample", []),
118
+ "ocr_sample": page.get("ocr_sample", []),
119
+ }
120
+ per_page_sections.append({
121
+ "page_number": page["page_number"],
122
+ "goal_tag": page.get("goal_tag") or "generic_extraction",
123
+ "image": image_section,
124
+ "context": context_json,
125
+ })
126
+
127
+ sys_msg = textwrap.dedent(
128
+ """
129
+ You are a Natural-PDF engineer with full access to the provided evidence
130
+ (describe/inspect text, YOLO & TATR regions, blob samples, OCR snippets, images).
131
+
132
+ Rely on these artefacts—not on generic heuristics. Avoid phrases like "try" or "usually this works".
133
+ If the evidence is genuinely insufficient, state exactly what is missing.
134
+
135
+ Extraction strategy:
136
+ 1. Start with the text layer: `page.extract_text()`, `page.extract_table()`, or region selectors.
137
+ 2. Use **anchor-based region selection**: locate a stable header/label/line/rect and select the area
138
+ between anchors via `.find()`, `.below()`, `.above()`, `.until()`, `.expand()`, etc.
139
+ Example: `page.find('text:contains(Violations)').below(until='text:bold')`.
140
+ 3. Strongly prefer until= to find a specific ending point as opposed to a pixel-based approach,
141
+ as this allows your code to work on potentially other similar pages of the document.
142
+ 4. Fall back to TATR or other vision models only if the text layer is unusable.
143
+ • Blanket advice like "run analyze_layout('tatr') on every page" is discouraged—
144
+ only invoke a layout model when anchor-based text extraction genuinely fails
145
+ or evidence shows complex unruled tables that require cell-grid inference.
146
+
147
+ Handle tables, key-value forms, and free-form paragraphs with the same anchor-driven approach. Key-value
148
+ forms might be easily extracted with .ask(...) or .extract(), feel free to mention as an option
149
+ but try to not rely on it.
150
+
151
+ Use Natural PDF Flows to access multi-page or columnar content, falling back on loops when necessary.
152
+
153
+ If it seems like the approach is not ideal, or that additional features would be useful in
154
+ this use case, outline the specifics of the issues and what additional information/approaches/code/etc
155
+ would allow you to more easily extract the information.
156
+
157
+ When working with pages or elements, try to use .apply and .filter. Natural PDF stresses
158
+ a fluent API, and for loops are discouraged.
159
+
160
+ Return ONE JSON object with exactly these keys:
161
+ • thought_process – concise reasoning and feature/enhancement requests (≤4 short paragraphs)
162
+ • code_suggestion – executable Python snippet using natural_pdf
163
+ • difficult_elements – bullet list of page features that are *hard* for any extraction engine **and that you can _prove_ from the supplied evidence** (tiny fonts in `describe`, scanned_image flag, missing text layer, no ruling lines inferred from `layout_*` arrays, etc.). If no difficult element is evident, return an empty list. Do *not* speculate.
164
+ • test_case – short description of how this PDF/page could be turned into an automated regression test (e.g. "assert tbl.df.shape == (12, 5)")
165
+
166
+ Code-style expectations:
167
+ • Use **real sample text** from the page as anchors — never placeholders such as
168
+ "AnchorText", "Texts", or "Also". If no stable anchor is visible, state that
169
+ fact in the *thought_process* and leave a TODO rather than shipping a placeholder.
170
+ • When a page is flagged as *scanned_image* (or no text layer exists) your code
171
+ MUST call `page.apply_ocr()` *before* any `.find()` or `.extract_text()` calls.
172
+ • Prefer `header_el.parent('table')` (up-tree navigation) over a global
173
+ `page.find('table')[i]` positional index — this is more robust when multiple tables
174
+ are present.
175
+ • For tables, assume Natural-PDF returns a `TableResult`; use `tbl.df` or
176
+ `tbl.to_df(header='first')` instead of manually building a DataFrame unless you
177
+ need custom header/skiprows logic.
178
+ • Explicitly name the extractor (`analyze_layout('tatr')`, `analyze_layout('detectron')`)
179
+ instead of vague comments like "YOLO fallback".
180
+ """
181
+ )
182
+
183
+ messages = [
184
+ {"role": "system", "content": sys_msg},
185
+ {"role": "system", "content": "CHEATSHEET:\n" + cheatsheet},
186
+ {"role": "system", "content": "WORKFLOWS:\n" + workflows},
187
+ ]
188
+
189
+ user_content: List[Dict[str, Any]] = [{"type": "text", "text": "\n".join(pdf_overview)}]
190
+ for sec in per_page_sections:
191
+ txt = json.dumps({k: sec[k] for k in ("page_number", "goal_tag", "context")})
192
+ user_content.append({"type": "text", "text": txt})
193
+ if sec["image"]:
194
+ user_content.append(sec["image"])
195
+
196
+ messages.append({"role": "user", "content": user_content})
197
+ return messages
198
+
199
+
200
+ # -------------------------------------------------
201
+ # Structured output via Pydantic model + function call
202
+ # -------------------------------------------------
203
+
204
+
205
+ class DocOutput(BaseModel):
206
+ """LLM enrichment for a whole PDF (single object)."""
207
+
208
+ thought_process: str = Field(..., description="Overall reasoning about the PDF and extraction plan")
209
+ code_suggestion: str = Field(..., description="Python snippet using natural_pdf to achieve the user goal for this PDF")
210
+ difficult_elements: List[str] = Field(..., description="Bullet list of page features that are *hard* for any extraction engine")
211
+ test_case: str = Field(..., description="Short description of how this PDF/page could be turned into an automated regression test")
212
+
213
+
214
+ def enrich_summary(summary_path: Path, api_key: str, model: str = "o3"):
215
+ summary = json.loads(summary_path.read_text())
216
+
217
+ # Decide whether to re-enrich
218
+ if not FORCE and summary.get("thought_process") and summary.get("code_suggestion"):
219
+ print(f"[skip] {summary_path.parent.name}: already enriched (use --force to overwrite)")
220
+ return
221
+
222
+ print(f"[send] {summary_path.parent.name}: requesting enrichment for entire document")
223
+
224
+ client = OpenAI(api_key=api_key)
225
+ msgs = build_pdf_prompt(summary)
226
+
227
+ completion = client.beta.chat.completions.parse(
228
+ model=model,
229
+ messages=msgs,
230
+ response_format=DocOutput
231
+ )
232
+
233
+ # The structured output is parsed into a DocOutput instance on the first choice
234
+ doc_out = completion.choices[0].message.parsed
235
+
236
+ summary["thought_process"] = doc_out.thought_process
237
+ summary["code_suggestion"] = doc_out.code_suggestion
238
+ summary["difficult_elements"] = doc_out.difficult_elements
239
+ summary["test_case"] = doc_out.test_case
240
+
241
+ print("** Code suggestion:\n", doc_out.code_suggestion)
242
+ print("** Thought process:\n", doc_out.thought_process)
243
+
244
+ summary_path.write_text(json.dumps(summary, indent=2, ensure_ascii=False))
245
+ print(f"[update] Wrote enriched data to {summary_path}")
246
+
247
+
248
+ def main():
249
+ ap = argparse.ArgumentParser()
250
+ ap.add_argument("--submission", help="Submission ID to enrich (folder name)")
251
+ ap.add_argument("--model", default="o3")
252
+ ap.add_argument("--api-key", default=os.getenv("OPENAI_API_KEY"), help="OpenAI key if not in env")
253
+ ap.add_argument("--force", action="store_true", help="overwrite existing enrichment")
254
+ args = ap.parse_args()
255
+
256
+ if not args.api_key:
257
+ raise SystemExit("OPENAI_API_KEY not provided")
258
+
259
+ global FORCE
260
+ FORCE = args.force
261
+
262
+ if args.submission:
263
+ summary_path = EVAL_DIR / args.submission / "summary.json"
264
+ if not summary_path.exists():
265
+ raise SystemExit("summary.json not found for submission")
266
+ enrich_summary(summary_path, args.api_key, args.model)
267
+ else:
268
+ for summary_path in EVAL_DIR.glob("*/summary.json"):
269
+ enrich_summary(summary_path, args.api_key, args.model)
270
+
271
+
272
+ if __name__ == "__main__":
273
+ main()
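The system prompt embedded above lays out the anchor-first workflow: use the text layer, select regions between stable anchors, and fall back to layout models only when that fails. A minimal sketch of the kind of code_suggestion it asks the model to produce, using only calls the prompt itself names (find, below(until=...), extract_table, to_df, apply_ocr); the PDF path, anchor text, and selectors are illustrative, and it assumes the region returned by .below() supports extract_table():

    from natural_pdf import PDF

    pdf = PDF("eval_results/ja6EqV1/document.pdf")  # hypothetical path
    page = pdf.pages[0]

    # Scanned page with no text layer: OCR first, as the prompt requires
    if not page.extract_text().strip():
        page.apply_ocr()

    # Anchor-based region: everything below "Violations" until the next bold heading
    region = page.find("text:contains(Violations)").below(until="text:bold")
    tbl = region.extract_table()       # assumed to return a TableResult
    df = tbl.to_df(header="first")     # first row becomes the header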
tools/bad_pdf_eval/reporter.py
@@ -0,0 +1,17 @@
1
+ import json
2
+ from pathlib import Path
3
+ from typing import Any, Dict
4
+
5
+ from rich.console import Console
6
+
7
+ console = Console()
8
+
9
+
10
+ def save_json(data: Dict[str, Any], path: Path):
11
+ path.parent.mkdir(parents=True, exist_ok=True)
12
+ with open(path, "w", encoding="utf-8") as f:
13
+ json.dump(data, f, indent=2, ensure_ascii=False)
14
+
15
+
16
+ def log_section(title: str):
17
+ console.rule(f"[bold cyan]{title}")
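reporter.py is the small output helper shared by the suite (eval_suite.py calls save_json for the master index). A usage sketch with an illustrative path:

    from pathlib import Path
    from tools.bad_pdf_eval.reporter import save_json, log_section

    log_section("Saving results")  # prints a cyan rule with the title
    save_json({"submission_id": "ja6EqV1"}, Path("eval_results/ja6EqV1/summary.json"))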
tools/bad_pdf_eval/utils.py
@@ -0,0 +1,127 @@
1
+ import re
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ import urllib.request
5
+ import ssl
6
+ from rich.console import Console
7
+
8
+ ROOT_DIR = Path(__file__).resolve().parent.parent.parent # project root
9
+ BAD_PDF_DIR = ROOT_DIR / "bad_pdf_analysis"
10
+ SUBMISSIONS_DIR = ROOT_DIR / "bad-pdfs" / "submissions"
11
+
12
+ console = Console()
13
+
14
+
15
+ def slugify(value: str, max_length: int = 50) -> str:
16
+ """Make a filesystem-safe filename from arbitrary text."""
17
+ value = re.sub(r"[^\w\-\. ]+", "_", value)
18
+ value = value.strip().replace(" ", "_")
19
+ return value[:max_length]
20
+
21
+
22
+ def _search_directory(directory: Path, pattern: str, predicate) -> Optional[Path]:
23
+ """Utility: recursively search *directory* using glob *pattern*; return first match passing *predicate*."""
24
+ for p in directory.glob(pattern):
25
+ try:
26
+ if predicate(p):
27
+ return p
28
+ except Exception:
29
+ # Just in case of any weird permission/path errors – skip
30
+ continue
31
+ return None
32
+
33
+
34
+ def find_local_pdf(submission_id: str, pdf_url: Optional[str] = None) -> Optional[Path]:
35
+ """Return the local path to the PDF for *submission_id*.
36
+
37
+ Search strategy (in order):
38
+ 1. Inside ``bad_pdf_analysis`` where early analyses live – matching *submission_id* in filename.
39
+ 2. Inside ``bad-pdfs/submissions`` where raw downloads reside – matching *submission_id*.
40
+ 3. If *pdf_url* is supplied, also try the basename of the URL in ``bad-pdfs/submissions``.
41
+ 4. As a last resort – try to download the PDF to the submissions folder
42
+ """
43
+
44
+ submission_id_lower = submission_id.lower()
45
+
46
+ # 1) Search the processed-analysis folder (legacy path)
47
+ path = _search_directory(
48
+ BAD_PDF_DIR,
49
+ f"**/{submission_id}*.pdf",
50
+ lambda p: submission_id_lower in p.stem.lower(),
51
+ )
52
+ if path:
53
+ return path
54
+
55
+ # 2) Search the raw submissions folder by id substring
56
+ path = _search_directory(
57
+ SUBMISSIONS_DIR,
58
+ f"**/*{submission_id}*.pdf",
59
+ lambda p: submission_id_lower in p.stem.lower(),
60
+ )
61
+ if path:
62
+ return path
63
+
64
+ # 3) Use basename from URL, if provided
65
+ if pdf_url:
66
+ # Extract filename portion before any query string
67
+ from urllib.parse import urlparse
68
+
69
+ parsed = urlparse(pdf_url)
70
+ filename = Path(parsed.path).name
71
+ if filename:
72
+ candidate = SUBMISSIONS_DIR / filename
73
+ if candidate.exists():
74
+ return candidate
75
+ # fallback: case-insensitive glob match on stem
76
+ stem = Path(filename).stem.lower()
77
+ path = _search_directory(
78
+ SUBMISSIONS_DIR,
79
+ f"**/{stem}*.pdf",
80
+ lambda p: stem in p.stem.lower(),
81
+ )
82
+ if path:
83
+ return path
84
+
85
+ # 4) As a last resort – try to download the PDF to the submissions folder
86
+ if pdf_url:
87
+ try:
88
+ SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
89
+ from urllib.parse import urlparse
90
+
91
+ parsed = urlparse(pdf_url)
92
+ filename = Path(parsed.path).name or f"{submission_id}.pdf"
93
+ # Sanitise filename a bit – avoid query strings leaking in
94
+ filename = slugify(filename, max_length=100)
95
+ if not filename.lower().endswith(".pdf"):
96
+ filename += ".pdf"
97
+
98
+ dest_path = SUBMISSIONS_DIR / filename
99
+ if not dest_path.exists():
100
+ # Retrieve the file (no progress bar – keep it simple and robust)
101
+ try:
102
+ # Some hosts reject default Python user-agent; set one.
103
+ req = urllib.request.Request(pdf_url, headers={"User-Agent": "Mozilla/5.0"})
104
+ # Use a default SSL context; some storage hosts are picky about the TLS handshake
105
+ ctx = ssl.create_default_context()
106
+ with urllib.request.urlopen(req, context=ctx, timeout=30) as resp, open(dest_path, "wb") as f:
107
+ f.write(resp.read())
108
+ except Exception as e:
109
+ # Fallback: try requests if available (venv usually has it)
110
+ try:
111
+ import requests
112
+
113
+ r = requests.get(pdf_url, timeout=30)
114
+ r.raise_for_status()
115
+ with open(dest_path, "wb") as f:
116
+ f.write(r.content)
117
+ except Exception as e2:
118
+ console.print(f"[red]Download failed for {submission_id}: {e2}")
119
+ return None
120
+ if dest_path.exists():
121
+ return dest_path
122
+ except Exception:
123
+ # Networking problems, permissions, etc. – silently give up; caller will log
124
+ return None
125
+
126
+ # None found
127
+ return None
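A usage sketch for the resolver above; the URL is illustrative and the submission ID is the one used as an example in the llm_enrich docstring:

    from tools.bad_pdf_eval.utils import find_local_pdf

    # Checks bad_pdf_analysis/, then bad-pdfs/submissions/, then falls back to downloading
    path = find_local_pdf("ja6EqV1", pdf_url="https://example.com/uploads/report.pdf")
    if path is None:
        print("No local copy found and the download failed")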
tools/rtl_smoke_test.py
@@ -0,0 +1,80 @@
1
+ #!/usr/bin/env python3
2
+ """RTL pipeline smoke-test for natural-pdf.
3
+
4
+ Run it from the repository root:
5
+
6
+ python tools/rtl_smoke_test.py
7
+
8
+ It loads *pdfs/arabic.pdf* and performs a handful of checks that cover the
9
+ most common break-points we identified for RTL handling:
10
+ 1. char ingestion / word grouping
11
+ 2. selector finds on logical Arabic tokens
12
+ 3. bracket mirroring
13
+ 4. number directionality inside RTL lines
14
+
15
+ Exit code is **0** when all checks pass, **1** otherwise.
16
+ """
17
+ from __future__ import annotations
18
+
19
+ import sys
20
+ from pathlib import Path
21
+
22
+ from bidi.algorithm import get_display # type: ignore
23
+
24
+ from natural_pdf import PDF
25
+ from natural_pdf.utils.bidi_mirror import mirror_brackets
26
+
27
+
28
+ PDF_PATH = Path("pdfs/arabic.pdf")
29
+
30
+ if not PDF_PATH.exists():
31
+ print(f"❗ PDF not found: {PDF_PATH.resolve()}")
32
+ sys.exit(1)
33
+
34
+ # ────────────────────────────────────────────────────────────────
35
+ # Helpers
36
+ # ────────────────────────────────────────────────────────────────
37
+
38
+ failures: list[str] = []
39
+
40
+ def check(cond: bool, msg: str):
41
+ """Collect failures but keep running to show full report."""
42
+ if cond:
43
+ print(f"✓ {msg}")
44
+ else:
45
+ print(f"✗ {msg}")
46
+ failures.append(msg)
47
+
48
+
49
+ # ────────────────────────────────────────────────────────────────
50
+ # Load page
51
+ # ────────────────────────────────────────────────────────────────
52
+
53
+ pdf = PDF(str(PDF_PATH))
54
+ page = pdf.pages[0]
55
+
56
+ # Basic char/word counts (should be non-zero)
57
+ check(len(page.chars) > 0, "chars were ingested")
58
+ check(len(page.words) > 0, "words were grouped")
59
+
60
+ # First line logical text
61
+ logical_first_line = page.extract_text().split("\n")[0]
62
+ print("First logical line:")
63
+ print(" ", logical_first_line)
64
+
65
+ # 1. Arabic keyword should be findable
66
+ check(page.find(text="مكرر") is not None, "page.find works for Arabic token 'مكرر'")
67
+
68
+ # 2. Reversed token should NOT match
69
+ check(page.find(text="مكرر"[::-1]) is None, "reverse token does not match (logical order stored)")
70
+
71
+ # 3. Extracted line should already show the bracket pair in correct orientation
72
+ check("(مكرر)" in logical_first_line, "parentheses orientation is correct in extract_text")
73
+
74
+ # 4. Western numbers must stay LTR inside RTL
75
+ # After visual re-order, the line should end with 2022 (year on the left visually → last in logical string)
76
+ check(logical_first_line.rstrip().endswith("2022"), "Western number '2022' kept logical placement")
77
+
78
+ print("\nSummary: {} passed, {} failed".format(4 - len(failures), len(failures)))
79
+
80
+ sys.exit(0 if not failures else 1)
natural_pdf-0.1.28.dist-info/top_level.txt
@@ -1,2 +0,0 @@
1
- natural_pdf
2
- pdfs