natural-pdf 0.1.33__py3-none-any.whl → 0.1.34__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
Files changed (77)
  1. natural_pdf/analyzers/__init__.py +2 -2
  2. natural_pdf/analyzers/guides.py +670 -595
  3. natural_pdf/analyzers/layout/base.py +53 -6
  4. natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
  5. natural_pdf/analyzers/layout/layout_manager.py +18 -14
  6. natural_pdf/analyzers/layout/layout_options.py +1 -0
  7. natural_pdf/analyzers/layout/paddle.py +102 -64
  8. natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
  9. natural_pdf/analyzers/layout/yolo.py +2 -6
  10. natural_pdf/analyzers/shape_detection_mixin.py +15 -6
  11. natural_pdf/classification/manager.py +92 -77
  12. natural_pdf/classification/mixin.py +49 -5
  13. natural_pdf/classification/results.py +1 -1
  14. natural_pdf/cli.py +7 -3
  15. natural_pdf/collections/pdf_collection.py +96 -101
  16. natural_pdf/core/element_manager.py +131 -45
  17. natural_pdf/core/highlighting_service.py +5 -6
  18. natural_pdf/core/page.py +113 -22
  19. natural_pdf/core/pdf.py +477 -75
  20. natural_pdf/describe/__init__.py +18 -12
  21. natural_pdf/describe/base.py +179 -172
  22. natural_pdf/describe/elements.py +155 -155
  23. natural_pdf/describe/mixin.py +27 -19
  24. natural_pdf/describe/summary.py +44 -55
  25. natural_pdf/elements/base.py +134 -18
  26. natural_pdf/elements/collections.py +90 -18
  27. natural_pdf/elements/image.py +2 -1
  28. natural_pdf/elements/line.py +0 -31
  29. natural_pdf/elements/rect.py +0 -14
  30. natural_pdf/elements/region.py +222 -108
  31. natural_pdf/elements/text.py +18 -12
  32. natural_pdf/exporters/__init__.py +4 -1
  33. natural_pdf/exporters/original_pdf.py +12 -4
  34. natural_pdf/extraction/mixin.py +66 -10
  35. natural_pdf/extraction/result.py +1 -1
  36. natural_pdf/flows/flow.py +63 -4
  37. natural_pdf/flows/region.py +4 -4
  38. natural_pdf/ocr/engine.py +83 -2
  39. natural_pdf/ocr/engine_paddle.py +5 -5
  40. natural_pdf/ocr/ocr_factory.py +2 -1
  41. natural_pdf/ocr/ocr_manager.py +24 -13
  42. natural_pdf/ocr/ocr_options.py +3 -10
  43. natural_pdf/qa/document_qa.py +21 -8
  44. natural_pdf/qa/qa_result.py +3 -7
  45. natural_pdf/search/__init__.py +3 -2
  46. natural_pdf/search/lancedb_search_service.py +5 -6
  47. natural_pdf/search/numpy_search_service.py +5 -2
  48. natural_pdf/selectors/parser.py +51 -6
  49. natural_pdf/tables/__init__.py +2 -2
  50. natural_pdf/tables/result.py +7 -6
  51. natural_pdf/utils/bidi_mirror.py +2 -1
  52. natural_pdf/utils/reading_order.py +3 -2
  53. natural_pdf/utils/visualization.py +3 -3
  54. natural_pdf/widgets/viewer.py +0 -1
  55. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
  56. natural_pdf-0.1.34.dist-info/RECORD +121 -0
  57. optimization/memory_comparison.py +73 -58
  58. optimization/pdf_analyzer.py +141 -96
  59. optimization/performance_analysis.py +111 -110
  60. optimization/test_cleanup_methods.py +47 -36
  61. optimization/test_memory_fix.py +40 -39
  62. tools/bad_pdf_eval/__init__.py +0 -1
  63. tools/bad_pdf_eval/analyser.py +35 -18
  64. tools/bad_pdf_eval/collate_summaries.py +22 -18
  65. tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
  66. tools/bad_pdf_eval/eval_suite.py +21 -9
  67. tools/bad_pdf_eval/evaluate_quality.py +198 -0
  68. tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
  69. tools/bad_pdf_eval/llm_enrich.py +71 -39
  70. tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
  71. tools/bad_pdf_eval/reporter.py +1 -1
  72. tools/bad_pdf_eval/utils.py +7 -4
  73. natural_pdf-0.1.33.dist-info/RECORD +0 -118
  74. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
  75. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
  76. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
  77. {natural_pdf-0.1.33.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
optimization/test_memory_fix.py
@@ -10,9 +10,10 @@ This test verifies that:
 
 import gc
 import os
-import psutil
 import sys
 from pathlib import Path
+
+import psutil
 import pytest
 
 import natural_pdf as npdf
@@ -26,7 +27,7 @@ def get_memory_usage():
 
 class TestCharacterMemoryFix:
     """Test suite for character memory optimization"""
-
+
     @pytest.fixture
     def test_pdf_path(self):
         """Get path to a test PDF"""
@@ -35,128 +36,128 @@ class TestCharacterMemoryFix:
         if not test_path.exists():
             pytest.skip("Test PDF not found")
         return str(test_path)
-
+
     def test_character_access_still_works(self, test_pdf_path):
         """Test that character access through words still works after optimization"""
         pdf = npdf.PDF(test_pdf_path)
         page = pdf.pages[0]
-
-        # Force loading of elements
+
+        # Force loading of elements
         text_elements = page.find_all("text")
-
+
         # Test that we have text elements
         assert len(text_elements) > 0, "Should have text elements"
         print(f"Found {len(text_elements)} text elements")
-
-        # Test that words can access their constituent characters
+
+        # Test that words can access their constituent characters
         for word in text_elements[:5]:  # Test first 5 words
-            if hasattr(word, '_char_indices') and word._char_indices:
+            if hasattr(word, "_char_indices") and word._char_indices:
                 # New optimized approach
                 constituent_chars = word.chars
                 assert isinstance(constituent_chars, list), "word.chars should return a list"
                 assert len(constituent_chars) > 0, "Should have constituent characters"
-
+
                 # Test character properties
                 for char in constituent_chars[:3]:  # Test first 3 chars of each word
-                    assert hasattr(char, 'text'), "Character should have text attribute"
-                    assert hasattr(char, 'x0'), "Character should have x0 coordinate"
-
-            elif hasattr(word, '_char_dicts') and word._char_dicts:
+                    assert hasattr(char, "text"), "Character should have text attribute"
+                    assert hasattr(char, "x0"), "Character should have x0 coordinate"
+
+            elif hasattr(word, "_char_dicts") and word._char_dicts:
                 # Old approach - should still work for compatibility
                 char_dicts = word._char_dicts
                 assert isinstance(char_dicts, list), "word._char_dicts should be a list"
                 assert len(char_dicts) > 0, "Should have character dictionaries"
-
+
     def test_memory_usage_improvement(self, test_pdf_path):
         """Test that memory usage is improved with the optimization"""
         # This test will compare memory usage patterns
         # Note: Exact numbers will vary, but we should see improvement
-
+
         pdf = npdf.PDF(test_pdf_path)
         page = pdf.pages[0]
-
+
         # Measure memory before loading elements
         gc.collect()
         memory_before = get_memory_usage()
-
+
         # Load elements (this triggers the optimization)
         chars = page.find_all("text")
        words = page.find_all("words")
-
+
         # Measure memory after loading
         gc.collect()
         memory_after = get_memory_usage()
-
+
         memory_used = memory_after - memory_before
-
+
         # Log the memory usage for analysis
         print(f"\nMemory usage analysis:")
         print(f"Characters loaded: {len(chars)}")
         print(f"Words loaded: {len(words)}")
         print(f"Memory used: {memory_used:.2f} MB")
         print(f"Memory per character: {memory_used / len(chars) * 1024:.2f} KB" if chars else "N/A")
-
+
         # The memory usage should be reasonable (not exact test due to variability)
         # Main goal is to verify no crashes and reasonable memory usage
         assert memory_used < 100, f"Memory usage seems too high: {memory_used:.2f} MB"
-
+
     def test_word_text_extraction_works(self, test_pdf_path):
         """Test that text extraction from words still works correctly"""
         pdf = npdf.PDF(test_pdf_path)
         page = pdf.pages[0]
-
+
         words = page.find_all("text")  # All text elements are words in this PDF
-
+
         # Test text extraction from words
         for word in words[:10]:  # Test first 10 words
             word_text = word.text
             assert isinstance(word_text, str), "Word text should be a string"
-
+
             # Text should not be empty for actual words
             if word_text.strip():  # Skip empty/whitespace words
                 assert len(word_text) > 0, "Non-empty words should have text content"
-
+
     def test_backwards_compatibility(self, test_pdf_path):
         """Test that existing code patterns still work"""
         pdf = npdf.PDF(test_pdf_path)
         page = pdf.pages[0]
-
+
         # Test that existing element access patterns work
         all_elements = page.find_all("text")
         assert len(all_elements) > 0, "Should find text elements"
-
+
         # Test that element properties are accessible
         for element in all_elements[:5]:
-            assert hasattr(element, 'text'), "Element should have text attribute"
-            assert hasattr(element, 'x0'), "Element should have x0 coordinate"
-            assert hasattr(element, 'top'), "Element should have top coordinate"
-            assert hasattr(element, 'width'), "Element should have width"
-            assert hasattr(element, 'height'), "Element should have height"
+            assert hasattr(element, "text"), "Element should have text attribute"
+            assert hasattr(element, "x0"), "Element should have x0 coordinate"
+            assert hasattr(element, "top"), "Element should have top coordinate"
+            assert hasattr(element, "width"), "Element should have width"
+            assert hasattr(element, "height"), "Element should have height"
 
 
 def main():
     """Run the memory fix test"""
     print("Running character memory optimization test...")
-
+
     # Check if test PDF exists
     test_pdf = Path("pdfs/01-practice.pdf")
     if not test_pdf.exists():
         print(f"ERROR: Test PDF not found at {test_pdf}")
         print("Please ensure the test PDF exists before running this test.")
         return 1
-
+
     # Run pytest on just this file
     exit_code = pytest.main([__file__, "-v", "-s"])
-
+
     if exit_code == 0:
         print("\n✅ All memory optimization tests passed!")
         print("The character duplication fix is working correctly.")
     else:
         print("\n❌ Some tests failed!")
         print("The memory optimization needs investigation.")
-
+
     return exit_code
 
 
 if __name__ == "__main__":
-    exit(main())
+    exit(main())
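For orientation, a minimal sketch of the word-level natural_pdf API this test exercises (assumptions: the fixture PDF path referenced above, and `chars` is only populated on words that carry `_char_indices`, the optimized path the test checks for):

    import natural_pdf as npdf

    pdf = npdf.PDF("pdfs/01-practice.pdf")  # fixture PDF referenced by the test above
    page = pdf.pages[0]
    words = page.find_all("text")  # word-level text elements
    first = words[0]
    print(first.text, first.x0, first.top, first.width, first.height)  # attributes the test asserts on
    if hasattr(first, "_char_indices") and first._char_indices:
        chars = first.chars  # constituent characters without duplicated per-word storage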
tools/bad_pdf_eval/__init__.py
@@ -1 +0,0 @@
-
tools/bad_pdf_eval/analyser.py
@@ -3,15 +3,16 @@ from __future__ import annotations
 import re
 import time
 from pathlib import Path
-from typing import Dict, List, Optional, Any
+from typing import Any, Dict, List, Optional
 
-import natural_pdf as npdf
 from PIL import Image
-from rich.table import Table
 from rich.console import Console
+from rich.table import Table
 
+import natural_pdf as npdf
+
+from .reporter import log_section, save_json
 from .utils import slugify
-from .reporter import save_json, log_section
 
 console = Console()
 
@@ -201,7 +202,10 @@ class BadPDFAnalyzer:
         page_result["goal_tag"] = "unknown"
 
         # Difficulties determination
-        if page_result.get("text_len", 0) < 100 and page_result.get("ocr_text_elements", 0) > 20:
+        if (
+            page_result.get("text_len", 0) < 100
+            and page_result.get("ocr_text_elements", 0) > 20
+        ):
             difficulties.append("scanned_image")
 
         page_result["difficulties"] = difficulties
@@ -235,40 +239,48 @@ class BadPDFAnalyzer:
         import_lines.append("import pandas as pd")
 
         code_lines: List[str] = import_lines + [
-            f"pdf = PDF(\"{self.pdf_path}\")",
+            f'pdf = PDF("{self.pdf_path}")',
             f"page = pdf.pages[{page_idx_1based - 1}]  # page {page_idx_1based}",
         ]
 
         thought_lines: List[str] = []
         # build reasoning
-        thought_lines.append(f"Goal tag: {page_result['goal_tag']}. Detected difficulties: {', '.join(difficulties) or 'none'}.")
+        thought_lines.append(
+            f"Goal tag: {page_result['goal_tag']}. Detected difficulties: {', '.join(difficulties) or 'none'}."
+        )
 
         if page_result["goal_tag"] == "table_extraction":
-            thought_lines.append("Plan: rely on layout models to locate tables, then extract with Natural-PDF helper.")
+            thought_lines.append(
+                "Plan: rely on layout models to locate tables, then extract with Natural-PDF helper."
+            )
             if page_result.get("layout_tatr_count", 0) > 0:
                 code_lines.append("page.analyze_layout('tatr')  # adds 'table' regions")
             else:
                 code_lines.append("page.analyze_layout()  # YOLO fallback")
 
             if page_result.get("layout_tatr_count", 0) > 1:
-                thought_lines.append("Multiple tables detected, choose second as goal mentions 'second table'.")
+                thought_lines.append(
+                    "Multiple tables detected, choose second as goal mentions 'second table'."
+                )
                 code_lines.append("tables = page.find_all('table')")
                 code_lines.append("tbl = tables[1]")
             else:
                 code_lines.append("tbl = page.find('table')  # first table")
 
-            code_lines.extend([
-                "data = tbl.extract_table()",
-                "columns, rows = data[0], data[1:]",
-                "df = pd.DataFrame(rows, columns=columns)",
-            ])
+            code_lines.extend(
+                [
+                    "data = tbl.extract_table()",
+                    "columns, rows = data[0], data[1:]",
+                    "df = pd.DataFrame(rows, columns=columns)",
+                ]
+            )
         elif page_result["goal_tag"] == "text_extraction":
             anchor = _first_anchor_from_goal(goal_str)
             if "scanned_image" in difficulties:
                 thought_lines.append("No native text detected; need OCR before querying.")
                 code_lines.append("page.apply_ocr(engine='paddle')")
             thought_lines.append(f"Anchor on text '{anchor}' then read below region.")
-            code_lines.append(f"section = page.find(\"text:contains({anchor})\").below(0, 50)")
+            code_lines.append(f'section = page.find("text:contains({anchor})").below(0, 50)')
             code_lines.append("text = section.extract_text()")
         else:
             thought_lines.append("Goal unclear; placeholder snippet provided.")
@@ -282,8 +294,13 @@ class BadPDFAnalyzer:
         # Provide quick heuristic comment
         if page_result.get("text_len", 0) == 0 and page_result.get("ocr_text_elements", 0) > 20:
             page_result["auto_comment"] = "Likely scanned/needs OCR; no native text."
-        elif page_result.get("text_len", 0) > 1000 and page_result.get("layout_yolo_count", 0) == 0:
-            page_result["auto_comment"] = "Native dense text; YOLO found no regions – may be fine, fonts just small."
+        elif (
+            page_result.get("text_len", 0) > 1000
+            and page_result.get("layout_yolo_count", 0) == 0
+        ):
+            page_result["auto_comment"] = (
+                "Native dense text; YOLO found no regions – may be fine, fonts just small."
+            )
         else:
             page_result.setdefault("auto_comment", "")
 
@@ -299,4 +316,4 @@ PAGE_REGEX = re.compile(r"page\s*(\d{1,4})", re.IGNORECASE)
 
 
 def extract_page_hints(text: str) -> List[int]:
-    return [int(m.group(1)) for m in PAGE_REGEX.finditer(text)]
+    return [int(m.group(1)) for m in PAGE_REGEX.finditer(text)]
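Assembled, the table-extraction snippet that analyser.py generates for a page runs roughly as follows (a sketch only: the document path is a placeholder, and the `from natural_pdf import PDF` import is assumed from the generated `pdf = PDF(...)` line):

    import pandas as pd
    from natural_pdf import PDF  # assumed import behind the generated PDF(...) call

    pdf = PDF("example.pdf")  # placeholder path
    page = pdf.pages[0]
    page.analyze_layout("tatr")  # adds 'table' regions; plain analyze_layout() is the YOLO fallback
    tbl = page.find("table")  # first detected table
    data = tbl.extract_table()
    columns, rows = data[0], data[1:]
    df = pd.DataFrame(rows, columns=columns)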
tools/bad_pdf_eval/collate_summaries.py
@@ -36,11 +36,13 @@ def collect() -> List[dict]:
             tp = (p.get("thought_process") or "").strip()
             if not cs and not tp:
                 continue
-            page_snippets.append({
-                "page": p.get("page_number"),
-                "code": cs,
-                "thought": tp,
-            })
+            page_snippets.append(
+                {
+                    "page": p.get("page_number"),
+                    "code": cs,
+                    "thought": tp,
+                }
+            )
             # --- lightweight feature tagging --------------------------------
             gt = (p.get("goal_tag") or "").lower()
             if "table" in gt:
@@ -64,18 +66,20 @@ def collect() -> List[dict]:
         if language and language.lower() not in {"english", "en", "en-us"}:
             features.add("non_english")
 
-        rows.append({
-            "id": submission_id,
-            "language": language,
-            "issues": issues,
-            "description": description,
-            "doc_thought": doc_tp,
-            "doc_code": doc_cs,
-            "doc_difficult": doc_diff,
-            "doc_test": doc_test,
-            "snippets": page_snippets,
-            "features": sorted(features),
-        })
+        rows.append(
+            {
+                "id": submission_id,
+                "language": language,
+                "issues": issues,
+                "description": description,
+                "doc_thought": doc_tp,
+                "doc_code": doc_cs,
+                "doc_difficult": doc_diff,
+                "doc_test": doc_test,
+                "snippets": page_snippets,
+                "features": sorted(features),
+            }
+        )
     return rows
 
 
@@ -127,4 +131,4 @@ def export_markdown(rows: List[dict]):
 if __name__ == "__main__":
     rows = collect()
     export_markdown(rows)
-    print(f"Wrote {len(rows)} summaries to eval_results/collated_summary.md")
+    print(f"Wrote {len(rows)} summaries to eval_results/collated_summary.md")
tools/bad_pdf_eval/compile_attempts_markdown.py (new file)
@@ -0,0 +1,127 @@
+#!/usr/bin/env python3
+"""Compile multi-try enrichment attempts into a single Markdown report.
+
+For every `summary.json` produced by the retry enrichment pipeline, this script
+collects all attempts (initial + retries) and writes a human-readable markdown
+file that shows, *per PDF*, every attempt alongside its quality score.
+
+Example
+-------
+$ python -m tools.bad_pdf_eval.compile_attempts_markdown \
+    --output eval_results/attempts_progress.md
+
+The resulting markdown looks like::
+
+    # Attempts Progress Report
+    ## obe1Vq5 — obe1Vq5.pdf
+    ### Attempt 0 (Score: 3/12)
+    ...
+    ### Attempt 1 (Score: 6/12)
+    ...
+
+This file can then be fed into an LLM for meta-analysis of score improvements
+and guidance quality.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+from pathlib import Path
+from typing import Iterable, List
+
+# Re-use the same constants as other evaluation utilities ---------------------
+ROOT = Path(__file__).resolve().parent.parent.parent  # repo root
+EVAL_DIR = ROOT / "eval_results"
+
+# ---------------------------------------------------------------------------
+
+
+def iter_summary_paths(submission: str | None) -> Iterable[Path]:
+    """Yield all summary.json paths (optionally filtered by submission ID)."""
+    if submission:
+        p = EVAL_DIR / submission / "summary.json"
+        if not p.exists():
+            raise FileNotFoundError(
+                f"No summary.json found for submission '{submission}' – expected {p}"
+            )
+        yield p
+    else:
+        yield from EVAL_DIR.glob("*/summary.json")
+
+
+def load_summary(path: Path) -> dict:
+    """Return the parsed JSON for the given summary path."""
+    try:
+        return json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise ValueError(f"Invalid JSON in {path}: {exc}") from exc
+
+
+def build_markdown_for_summary(submission_id: str, summary: dict) -> str:
+    """Return a markdown string for a single submission (all attempts)."""
+    pdf_name = Path(summary.get("pdf", "")).name or "<unknown.pdf>"
+    header = f"## {submission_id} — {pdf_name}"
+
+    attempts: List[dict] = sorted(summary.get("attempts", []), key=lambda d: d.get("attempt", 0))
+    if not attempts:
+        return header + "\n\n_No attempts recorded – run the enrichment retry pipeline first._\n"
+
+    sections: List[str] = [header]
+
+    for att in attempts:
+        num = att.get("attempt", "?")
+        score = att.get("score", "?")
+        tp = att.get("thought_process", "").strip()
+        code = att.get("code_suggestion", "").rstrip()
+
+        sections.append(f"### Attempt {num} (Score: {score}/12)")
+
+        if tp:
+            sections.append("**Thought Process**")
+            # indent each line with > for blockquote formatting
+            quoted_tp = "\n".join(f"> {line}" for line in tp.splitlines())
+            sections.append(quoted_tp)
+
+        if code:
+            sections.append("```python")
+            sections.append(code)
+            sections.append("```")
+
+    return "\n\n".join(sections)
+
+
+def compile_report(paths: Iterable[Path]) -> str:
+    """Aggregate individual submission markdown into one report."""
+    pieces: List[str] = ["# Attempts Progress Report", ""]
+    for p in sorted(paths):
+        submission_id = p.parent.name
+        summary = load_summary(p)
+        pieces.append(build_markdown_for_summary(submission_id, summary))
+        pieces.append("---")  # horizontal rule between PDFs
+    return "\n\n".join(pieces).rstrip("-\n")
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Compile multi-retry attempts into markdown.")
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=EVAL_DIR / "attempts_progress.md",
+        help="Destination .md file (default: eval_results/attempts_progress.md).",
+    )
+    parser.add_argument("--submission", help="Only compile a single submission ID.")
+    args = parser.parse_args()
+
+    summary_paths = list(iter_summary_paths(args.submission))
+    if not summary_paths:
+        raise SystemExit("No summary.json files found.")
+
+    md = compile_report(summary_paths)
+    args.output.write_text(md, encoding="utf-8")
+    print(
+        f"[ok] Wrote markdown report to {args.output.relative_to(ROOT)} (covers {len(summary_paths)} PDFs)"
+    )
+
+
+if __name__ == "__main__":
+    main()
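The script expects each eval_results/<id>/summary.json to carry an "attempts" list with "attempt", "score", "thought_process", and "code_suggestion" keys; a hypothetical record and a direct call, for illustration only:

    from tools.bad_pdf_eval.compile_attempts_markdown import build_markdown_for_summary

    summary = {
        "pdf": "eval_results/obe1Vq5/obe1Vq5.pdf",
        "attempts": [
            {"attempt": 0, "score": 3,
             "thought_process": "Needs OCR before any text query.",
             "code_suggestion": "page.apply_ocr(engine='paddle')"},
            {"attempt": 1, "score": 6,
             "thought_process": "Anchor on the table region and extract it.",
             "code_suggestion": "page.find('table').extract_table()"},
        ],
    }
    print(build_markdown_for_summary("obe1Vq5", summary))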
tools/bad_pdf_eval/eval_suite.py
@@ -1,14 +1,14 @@
 import argparse
 import re
 from pathlib import Path
-from typing import List, Dict
+from typing import Dict, List
 
 import pandas as pd
 from rich.console import Console
 
-from .utils import find_local_pdf, slugify
 from .analyser import BadPDFAnalyzer, extract_page_hints
 from .reporter import save_json
+from .utils import find_local_pdf, slugify
 
 console = Console()
 
@@ -43,9 +43,15 @@ def main():
         default="eval_results",
         help="Directory to write results into (will be git-ignored)",
     )
-    parser.add_argument("--max-row", type=int, default=None, help="debug: process only first n CSV rows")
-    parser.add_argument("--limit", type=int, default=None, help="process at most N PDFs with local files")
-    parser.add_argument("--overwrite", action="store_true", help="re-run analysis even if summary.json exists")
+    parser.add_argument(
+        "--max-row", type=int, default=None, help="debug: process only first n CSV rows"
+    )
+    parser.add_argument(
+        "--limit", type=int, default=None, help="process at most N PDFs with local files"
+    )
+    parser.add_argument(
+        "--overwrite", action="store_true", help="re-run analysis even if summary.json exists"
+    )
     args = parser.parse_args()
 
     csv_path = Path(args.csv)
@@ -70,7 +76,9 @@ def main():
 
         # Ignore files that are not .pdf (e.g. ZIPs mistakenly included)
         if pdf_path.suffix.lower() != ".pdf":
-            console.print(f"[yellow]Not a PDF ({pdf_path.suffix}) for {submission_id}; skipping.")
+            console.print(
+                f"[yellow]Not a PDF ({pdf_path.suffix}) for {submission_id}; skipping."
+            )
             continue
 
         sub_output = output_root / submission_id
@@ -88,12 +96,16 @@ def main():
             console.print(f"[yellow]Could not copy PDF into results folder: {copy_err}")
 
         if summary_path.exists() and not args.overwrite:
-            console.print(f"[yellow]Summary exists for {submission_id}; skipping (use --overwrite to refresh)")
+            console.print(
+                f"[yellow]Summary exists for {submission_id}; skipping (use --overwrite to refresh)"
+            )
             continue
 
         pages = build_pages_list(row)
         try:
-            analyser = BadPDFAnalyzer(pdf_path=pdf_path, output_dir=sub_output, submission_meta=row, pages=pages)
+            analyser = BadPDFAnalyzer(
+                pdf_path=pdf_path, output_dir=sub_output, submission_meta=row, pages=pages
+            )
             summary = analyser.run()
             master_records.append(summary)
         except Exception as e:
@@ -113,4 +125,4 @@
 
 
 if __name__ == "__main__":
-    main()
+    main()