natural-pdf 0.1.32__py3-none-any.whl → 0.1.34__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- natural_pdf/analyzers/__init__.py +2 -2
- natural_pdf/analyzers/guides.py +670 -595
- natural_pdf/analyzers/layout/base.py +53 -6
- natural_pdf/analyzers/layout/layout_analyzer.py +3 -1
- natural_pdf/analyzers/layout/layout_manager.py +18 -14
- natural_pdf/analyzers/layout/layout_options.py +1 -0
- natural_pdf/analyzers/layout/paddle.py +102 -64
- natural_pdf/analyzers/layout/table_structure_utils.py +3 -1
- natural_pdf/analyzers/layout/yolo.py +2 -6
- natural_pdf/analyzers/shape_detection_mixin.py +15 -6
- natural_pdf/classification/manager.py +92 -77
- natural_pdf/classification/mixin.py +49 -5
- natural_pdf/classification/results.py +1 -1
- natural_pdf/cli.py +7 -3
- natural_pdf/collections/pdf_collection.py +96 -101
- natural_pdf/core/element_manager.py +188 -82
- natural_pdf/core/highlighting_service.py +5 -6
- natural_pdf/core/page.py +132 -16
- natural_pdf/core/pdf.py +486 -71
- natural_pdf/describe/__init__.py +18 -12
- natural_pdf/describe/base.py +179 -172
- natural_pdf/describe/elements.py +155 -155
- natural_pdf/describe/mixin.py +27 -19
- natural_pdf/describe/summary.py +44 -55
- natural_pdf/elements/base.py +134 -18
- natural_pdf/elements/collections.py +90 -18
- natural_pdf/elements/image.py +2 -1
- natural_pdf/elements/line.py +0 -31
- natural_pdf/elements/rect.py +0 -14
- natural_pdf/elements/region.py +238 -111
- natural_pdf/elements/text.py +18 -12
- natural_pdf/exporters/__init__.py +4 -1
- natural_pdf/exporters/original_pdf.py +12 -4
- natural_pdf/extraction/mixin.py +66 -10
- natural_pdf/extraction/result.py +1 -1
- natural_pdf/flows/flow.py +63 -4
- natural_pdf/flows/region.py +4 -4
- natural_pdf/ocr/engine.py +83 -2
- natural_pdf/ocr/engine_paddle.py +5 -5
- natural_pdf/ocr/ocr_factory.py +2 -1
- natural_pdf/ocr/ocr_manager.py +24 -13
- natural_pdf/ocr/ocr_options.py +3 -10
- natural_pdf/qa/document_qa.py +21 -8
- natural_pdf/qa/qa_result.py +3 -7
- natural_pdf/search/__init__.py +3 -2
- natural_pdf/search/lancedb_search_service.py +5 -6
- natural_pdf/search/numpy_search_service.py +5 -2
- natural_pdf/selectors/parser.py +51 -6
- natural_pdf/tables/__init__.py +2 -2
- natural_pdf/tables/result.py +7 -6
- natural_pdf/utils/bidi_mirror.py +2 -1
- natural_pdf/utils/reading_order.py +3 -2
- natural_pdf/utils/visualization.py +3 -3
- natural_pdf/widgets/viewer.py +0 -1
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/METADATA +1 -1
- natural_pdf-0.1.34.dist-info/RECORD +121 -0
- optimization/memory_comparison.py +73 -58
- optimization/pdf_analyzer.py +141 -96
- optimization/performance_analysis.py +111 -110
- optimization/test_cleanup_methods.py +47 -36
- optimization/test_memory_fix.py +40 -39
- tools/bad_pdf_eval/__init__.py +0 -1
- tools/bad_pdf_eval/analyser.py +35 -18
- tools/bad_pdf_eval/collate_summaries.py +22 -18
- tools/bad_pdf_eval/compile_attempts_markdown.py +127 -0
- tools/bad_pdf_eval/eval_suite.py +21 -9
- tools/bad_pdf_eval/evaluate_quality.py +198 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +12 -8
- tools/bad_pdf_eval/llm_enrich.py +71 -39
- tools/bad_pdf_eval/llm_enrich_with_retry.py +289 -0
- tools/bad_pdf_eval/reporter.py +1 -1
- tools/bad_pdf_eval/utils.py +7 -4
- natural_pdf-0.1.32.dist-info/RECORD +0 -118
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/licenses/LICENSE +0 -0
- {natural_pdf-0.1.32.dist-info → natural_pdf-0.1.34.dist-info}/top_level.txt +0 -0
optimization/test_memory_fix.py
CHANGED
@@ -10,9 +10,10 @@ This test verifies that:
|
|
10
10
|
|
11
11
|
import gc
|
12
12
|
import os
|
13
|
-
import psutil
|
14
13
|
import sys
|
15
14
|
from pathlib import Path
|
15
|
+
|
16
|
+
import psutil
|
16
17
|
import pytest
|
17
18
|
|
18
19
|
import natural_pdf as npdf
|
@@ -26,7 +27,7 @@ def get_memory_usage():
|
|
26
27
|
|
27
28
|
class TestCharacterMemoryFix:
|
28
29
|
"""Test suite for character memory optimization"""
|
29
|
-
|
30
|
+
|
30
31
|
@pytest.fixture
|
31
32
|
def test_pdf_path(self):
|
32
33
|
"""Get path to a test PDF"""
|
@@ -35,128 +36,128 @@ class TestCharacterMemoryFix:
|
|
35
36
|
if not test_path.exists():
|
36
37
|
pytest.skip("Test PDF not found")
|
37
38
|
return str(test_path)
|
38
|
-
|
39
|
+
|
39
40
|
def test_character_access_still_works(self, test_pdf_path):
|
40
41
|
"""Test that character access through words still works after optimization"""
|
41
42
|
pdf = npdf.PDF(test_pdf_path)
|
42
43
|
page = pdf.pages[0]
|
43
|
-
|
44
|
-
# Force loading of elements
|
44
|
+
|
45
|
+
# Force loading of elements
|
45
46
|
text_elements = page.find_all("text")
|
46
|
-
|
47
|
+
|
47
48
|
# Test that we have text elements
|
48
49
|
assert len(text_elements) > 0, "Should have text elements"
|
49
50
|
print(f"Found {len(text_elements)} text elements")
|
50
|
-
|
51
|
-
# Test that words can access their constituent characters
|
51
|
+
|
52
|
+
# Test that words can access their constituent characters
|
52
53
|
for word in text_elements[:5]: # Test first 5 words
|
53
|
-
if hasattr(word,
|
54
|
+
if hasattr(word, "_char_indices") and word._char_indices:
|
54
55
|
# New optimized approach
|
55
56
|
constituent_chars = word.chars
|
56
57
|
assert isinstance(constituent_chars, list), "word.chars should return a list"
|
57
58
|
assert len(constituent_chars) > 0, "Should have constituent characters"
|
58
|
-
|
59
|
+
|
59
60
|
# Test character properties
|
60
61
|
for char in constituent_chars[:3]: # Test first 3 chars of each word
|
61
|
-
assert hasattr(char,
|
62
|
-
assert hasattr(char,
|
63
|
-
|
64
|
-
elif hasattr(word,
|
62
|
+
assert hasattr(char, "text"), "Character should have text attribute"
|
63
|
+
assert hasattr(char, "x0"), "Character should have x0 coordinate"
|
64
|
+
|
65
|
+
elif hasattr(word, "_char_dicts") and word._char_dicts:
|
65
66
|
# Old approach - should still work for compatibility
|
66
67
|
char_dicts = word._char_dicts
|
67
68
|
assert isinstance(char_dicts, list), "word._char_dicts should be a list"
|
68
69
|
assert len(char_dicts) > 0, "Should have character dictionaries"
|
69
|
-
|
70
|
+
|
70
71
|
def test_memory_usage_improvement(self, test_pdf_path):
|
71
72
|
"""Test that memory usage is improved with the optimization"""
|
72
73
|
# This test will compare memory usage patterns
|
73
74
|
# Note: Exact numbers will vary, but we should see improvement
|
74
|
-
|
75
|
+
|
75
76
|
pdf = npdf.PDF(test_pdf_path)
|
76
77
|
page = pdf.pages[0]
|
77
|
-
|
78
|
+
|
78
79
|
# Measure memory before loading elements
|
79
80
|
gc.collect()
|
80
81
|
memory_before = get_memory_usage()
|
81
|
-
|
82
|
+
|
82
83
|
# Load elements (this triggers the optimization)
|
83
84
|
chars = page.find_all("text")
|
84
85
|
words = page.find_all("words")
|
85
|
-
|
86
|
+
|
86
87
|
# Measure memory after loading
|
87
88
|
gc.collect()
|
88
89
|
memory_after = get_memory_usage()
|
89
|
-
|
90
|
+
|
90
91
|
memory_used = memory_after - memory_before
|
91
|
-
|
92
|
+
|
92
93
|
# Log the memory usage for analysis
|
93
94
|
print(f"\nMemory usage analysis:")
|
94
95
|
print(f"Characters loaded: {len(chars)}")
|
95
96
|
print(f"Words loaded: {len(words)}")
|
96
97
|
print(f"Memory used: {memory_used:.2f} MB")
|
97
98
|
print(f"Memory per character: {memory_used / len(chars) * 1024:.2f} KB" if chars else "N/A")
|
98
|
-
|
99
|
+
|
99
100
|
# The memory usage should be reasonable (not exact test due to variability)
|
100
101
|
# Main goal is to verify no crashes and reasonable memory usage
|
101
102
|
assert memory_used < 100, f"Memory usage seems too high: {memory_used:.2f} MB"
|
102
|
-
|
103
|
+
|
103
104
|
def test_word_text_extraction_works(self, test_pdf_path):
|
104
105
|
"""Test that text extraction from words still works correctly"""
|
105
106
|
pdf = npdf.PDF(test_pdf_path)
|
106
107
|
page = pdf.pages[0]
|
107
|
-
|
108
|
+
|
108
109
|
words = page.find_all("text") # All text elements are words in this PDF
|
109
|
-
|
110
|
+
|
110
111
|
# Test text extraction from words
|
111
112
|
for word in words[:10]: # Test first 10 words
|
112
113
|
word_text = word.text
|
113
114
|
assert isinstance(word_text, str), "Word text should be a string"
|
114
|
-
|
115
|
+
|
115
116
|
# Text should not be empty for actual words
|
116
117
|
if word_text.strip(): # Skip empty/whitespace words
|
117
118
|
assert len(word_text) > 0, "Non-empty words should have text content"
|
118
|
-
|
119
|
+
|
119
120
|
def test_backwards_compatibility(self, test_pdf_path):
|
120
121
|
"""Test that existing code patterns still work"""
|
121
122
|
pdf = npdf.PDF(test_pdf_path)
|
122
123
|
page = pdf.pages[0]
|
123
|
-
|
124
|
+
|
124
125
|
# Test that existing element access patterns work
|
125
126
|
all_elements = page.find_all("text")
|
126
127
|
assert len(all_elements) > 0, "Should find text elements"
|
127
|
-
|
128
|
+
|
128
129
|
# Test that element properties are accessible
|
129
130
|
for element in all_elements[:5]:
|
130
|
-
assert hasattr(element,
|
131
|
-
assert hasattr(element,
|
132
|
-
assert hasattr(element,
|
133
|
-
assert hasattr(element,
|
134
|
-
assert hasattr(element,
|
131
|
+
assert hasattr(element, "text"), "Element should have text attribute"
|
132
|
+
assert hasattr(element, "x0"), "Element should have x0 coordinate"
|
133
|
+
assert hasattr(element, "top"), "Element should have top coordinate"
|
134
|
+
assert hasattr(element, "width"), "Element should have width"
|
135
|
+
assert hasattr(element, "height"), "Element should have height"
|
135
136
|
|
136
137
|
|
137
138
|
def main():
|
138
139
|
"""Run the memory fix test"""
|
139
140
|
print("Running character memory optimization test...")
|
140
|
-
|
141
|
+
|
141
142
|
# Check if test PDF exists
|
142
143
|
test_pdf = Path("pdfs/01-practice.pdf")
|
143
144
|
if not test_pdf.exists():
|
144
145
|
print(f"ERROR: Test PDF not found at {test_pdf}")
|
145
146
|
print("Please ensure the test PDF exists before running this test.")
|
146
147
|
return 1
|
147
|
-
|
148
|
+
|
148
149
|
# Run pytest on just this file
|
149
150
|
exit_code = pytest.main([__file__, "-v", "-s"])
|
150
|
-
|
151
|
+
|
151
152
|
if exit_code == 0:
|
152
153
|
print("\n✅ All memory optimization tests passed!")
|
153
154
|
print("The character duplication fix is working correctly.")
|
154
155
|
else:
|
155
156
|
print("\n❌ Some tests failed!")
|
156
157
|
print("The memory optimization needs investigation.")
|
157
|
-
|
158
|
+
|
158
159
|
return exit_code
|
159
160
|
|
160
161
|
|
161
162
|
if __name__ == "__main__":
|
162
|
-
exit(main())
|
163
|
+
exit(main())
|
tools/bad_pdf_eval/__init__.py
CHANGED
@@ -1 +0,0 @@
|
|
1
|
-
|
tools/bad_pdf_eval/analyser.py
CHANGED
@@ -3,15 +3,16 @@ from __future__ import annotations
|
|
3
3
|
import re
|
4
4
|
import time
|
5
5
|
from pathlib import Path
|
6
|
-
from typing import Dict, List, Optional
|
6
|
+
from typing import Any, Dict, List, Optional
|
7
7
|
|
8
|
-
import natural_pdf as npdf
|
9
8
|
from PIL import Image
|
10
|
-
from rich.table import Table
|
11
9
|
from rich.console import Console
|
10
|
+
from rich.table import Table
|
12
11
|
|
12
|
+
import natural_pdf as npdf
|
13
|
+
|
14
|
+
from .reporter import log_section, save_json
|
13
15
|
from .utils import slugify
|
14
|
-
from .reporter import save_json, log_section
|
15
16
|
|
16
17
|
console = Console()
|
17
18
|
|
@@ -201,7 +202,10 @@ class BadPDFAnalyzer:
|
|
201
202
|
page_result["goal_tag"] = "unknown"
|
202
203
|
|
203
204
|
# Difficulties determination
|
204
|
-
if
|
205
|
+
if (
|
206
|
+
page_result.get("text_len", 0) < 100
|
207
|
+
and page_result.get("ocr_text_elements", 0) > 20
|
208
|
+
):
|
205
209
|
difficulties.append("scanned_image")
|
206
210
|
|
207
211
|
page_result["difficulties"] = difficulties
|
@@ -235,40 +239,48 @@ class BadPDFAnalyzer:
|
|
235
239
|
import_lines.append("import pandas as pd")
|
236
240
|
|
237
241
|
code_lines: List[str] = import_lines + [
|
238
|
-
f
|
242
|
+
f'pdf = PDF("{self.pdf_path}")',
|
239
243
|
f"page = pdf.pages[{page_idx_1based - 1}] # page {page_idx_1based}",
|
240
244
|
]
|
241
245
|
|
242
246
|
thought_lines: List[str] = []
|
243
247
|
# build reasoning
|
244
|
-
thought_lines.append(
|
248
|
+
thought_lines.append(
|
249
|
+
f"Goal tag: {page_result['goal_tag']}. Detected difficulties: {', '.join(difficulties) or 'none'}."
|
250
|
+
)
|
245
251
|
|
246
252
|
if page_result["goal_tag"] == "table_extraction":
|
247
|
-
thought_lines.append(
|
253
|
+
thought_lines.append(
|
254
|
+
"Plan: rely on layout models to locate tables, then extract with Natural-PDF helper."
|
255
|
+
)
|
248
256
|
if page_result.get("layout_tatr_count", 0) > 0:
|
249
257
|
code_lines.append("page.analyze_layout('tatr') # adds 'table' regions")
|
250
258
|
else:
|
251
259
|
code_lines.append("page.analyze_layout() # YOLO fallback")
|
252
260
|
|
253
261
|
if page_result.get("layout_tatr_count", 0) > 1:
|
254
|
-
thought_lines.append(
|
262
|
+
thought_lines.append(
|
263
|
+
"Multiple tables detected, choose second as goal mentions 'second table'."
|
264
|
+
)
|
255
265
|
code_lines.append("tables = page.find_all('table')")
|
256
266
|
code_lines.append("tbl = tables[1]")
|
257
267
|
else:
|
258
268
|
code_lines.append("tbl = page.find('table') # first table")
|
259
269
|
|
260
|
-
code_lines.extend(
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
270
|
+
code_lines.extend(
|
271
|
+
[
|
272
|
+
"data = tbl.extract_table()",
|
273
|
+
"columns, rows = data[0], data[1:]",
|
274
|
+
"df = pd.DataFrame(rows, columns=columns)",
|
275
|
+
]
|
276
|
+
)
|
265
277
|
elif page_result["goal_tag"] == "text_extraction":
|
266
278
|
anchor = _first_anchor_from_goal(goal_str)
|
267
279
|
if "scanned_image" in difficulties:
|
268
280
|
thought_lines.append("No native text detected; need OCR before querying.")
|
269
281
|
code_lines.append("page.apply_ocr(engine='paddle')")
|
270
282
|
thought_lines.append(f"Anchor on text '{anchor}' then read below region.")
|
271
|
-
code_lines.append(f
|
283
|
+
code_lines.append(f'section = page.find("text:contains({anchor})").below(0, 50)')
|
272
284
|
code_lines.append("text = section.extract_text()")
|
273
285
|
else:
|
274
286
|
thought_lines.append("Goal unclear; placeholder snippet provided.")
|
@@ -282,8 +294,13 @@ class BadPDFAnalyzer:
|
|
282
294
|
# Provide quick heuristic comment
|
283
295
|
if page_result.get("text_len", 0) == 0 and page_result.get("ocr_text_elements", 0) > 20:
|
284
296
|
page_result["auto_comment"] = "Likely scanned/needs OCR; no native text."
|
285
|
-
elif
|
286
|
-
page_result
|
297
|
+
elif (
|
298
|
+
page_result.get("text_len", 0) > 1000
|
299
|
+
and page_result.get("layout_yolo_count", 0) == 0
|
300
|
+
):
|
301
|
+
page_result["auto_comment"] = (
|
302
|
+
"Native dense text; YOLO found no regions – may be fine, fonts just small."
|
303
|
+
)
|
287
304
|
else:
|
288
305
|
page_result.setdefault("auto_comment", "")
|
289
306
|
|
@@ -299,4 +316,4 @@ PAGE_REGEX = re.compile(r"page\s*(\d{1,4})", re.IGNORECASE)
|
|
299
316
|
|
300
317
|
|
301
318
|
def extract_page_hints(text: str) -> List[int]:
|
302
|
-
return [int(m.group(1)) for m in PAGE_REGEX.finditer(text)]
|
319
|
+
return [int(m.group(1)) for m in PAGE_REGEX.finditer(text)]
|
@@ -36,11 +36,13 @@ def collect() -> List[dict]:
|
|
36
36
|
tp = (p.get("thought_process") or "").strip()
|
37
37
|
if not cs and not tp:
|
38
38
|
continue
|
39
|
-
page_snippets.append(
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
39
|
+
page_snippets.append(
|
40
|
+
{
|
41
|
+
"page": p.get("page_number"),
|
42
|
+
"code": cs,
|
43
|
+
"thought": tp,
|
44
|
+
}
|
45
|
+
)
|
44
46
|
# --- lightweight feature tagging --------------------------------
|
45
47
|
gt = (p.get("goal_tag") or "").lower()
|
46
48
|
if "table" in gt:
|
@@ -64,18 +66,20 @@ def collect() -> List[dict]:
|
|
64
66
|
if language and language.lower() not in {"english", "en", "en-us"}:
|
65
67
|
features.add("non_english")
|
66
68
|
|
67
|
-
rows.append(
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
69
|
+
rows.append(
|
70
|
+
{
|
71
|
+
"id": submission_id,
|
72
|
+
"language": language,
|
73
|
+
"issues": issues,
|
74
|
+
"description": description,
|
75
|
+
"doc_thought": doc_tp,
|
76
|
+
"doc_code": doc_cs,
|
77
|
+
"doc_difficult": doc_diff,
|
78
|
+
"doc_test": doc_test,
|
79
|
+
"snippets": page_snippets,
|
80
|
+
"features": sorted(features),
|
81
|
+
}
|
82
|
+
)
|
79
83
|
return rows
|
80
84
|
|
81
85
|
|
@@ -127,4 +131,4 @@ def export_markdown(rows: List[dict]):
|
|
127
131
|
if __name__ == "__main__":
|
128
132
|
rows = collect()
|
129
133
|
export_markdown(rows)
|
130
|
-
print(f"Wrote {len(rows)} summaries to eval_results/collated_summary.md")
|
134
|
+
print(f"Wrote {len(rows)} summaries to eval_results/collated_summary.md")
|
@@ -0,0 +1,127 @@
|
|
1
|
+
#!/usr/bin/env python3
|
2
|
+
"""Compile multi-try enrichment attempts into a single Markdown report.
|
3
|
+
|
4
|
+
For every `summary.json` produced by the retry enrichment pipeline, this script
|
5
|
+
collects all attempts (initial + retries) and writes a human-readable markdown
|
6
|
+
file that shows, *per PDF*, every attempt alongside its quality score.
|
7
|
+
|
8
|
+
Example
|
9
|
+
-------
|
10
|
+
$ python -m tools.bad_pdf_eval.compile_attempts_markdown \
|
11
|
+
--output eval_results/attempts_progress.md
|
12
|
+
|
13
|
+
The resulting markdown looks like::
|
14
|
+
|
15
|
+
# Attempts Progress Report
|
16
|
+
## obe1Vq5 — obe1Vq5.pdf
|
17
|
+
### Attempt 0 (Score: 3/12)
|
18
|
+
...
|
19
|
+
### Attempt 1 (Score: 6/12)
|
20
|
+
...
|
21
|
+
|
22
|
+
This file can then be fed into an LLM for meta-analysis of score improvements
|
23
|
+
and guidance quality.
|
24
|
+
"""
|
25
|
+
from __future__ import annotations
|
26
|
+
|
27
|
+
import argparse
|
28
|
+
import json
|
29
|
+
from pathlib import Path
|
30
|
+
from typing import Iterable, List
|
31
|
+
|
32
|
+
# Re-use the same constants as other evaluation utilities ---------------------
|
33
|
+
ROOT = Path(__file__).resolve().parent.parent.parent # repo root
|
34
|
+
EVAL_DIR = ROOT / "eval_results"
|
35
|
+
|
36
|
+
# ---------------------------------------------------------------------------
|
37
|
+
|
38
|
+
|
39
|
+
def iter_summary_paths(submission: str | None) -> Iterable[Path]:
|
40
|
+
"""Yield all summary.json paths (optionally filtered by submission ID)."""
|
41
|
+
if submission:
|
42
|
+
p = EVAL_DIR / submission / "summary.json"
|
43
|
+
if not p.exists():
|
44
|
+
raise FileNotFoundError(
|
45
|
+
f"No summary.json found for submission '{submission}' – expected {p}"
|
46
|
+
)
|
47
|
+
yield p
|
48
|
+
else:
|
49
|
+
yield from EVAL_DIR.glob("*/summary.json")
|
50
|
+
|
51
|
+
|
52
|
+
def load_summary(path: Path) -> dict:
|
53
|
+
"""Return the parsed JSON for the given summary path."""
|
54
|
+
try:
|
55
|
+
return json.loads(path.read_text(encoding="utf-8"))
|
56
|
+
except json.JSONDecodeError as exc:
|
57
|
+
raise ValueError(f"Invalid JSON in {path}: {exc}") from exc
|
58
|
+
|
59
|
+
|
60
|
+
def build_markdown_for_summary(submission_id: str, summary: dict) -> str:
|
61
|
+
"""Return a markdown string for a single submission (all attempts)."""
|
62
|
+
pdf_name = Path(summary.get("pdf", "")).name or "<unknown.pdf>"
|
63
|
+
header = f"## {submission_id} — {pdf_name}"
|
64
|
+
|
65
|
+
attempts: List[dict] = sorted(summary.get("attempts", []), key=lambda d: d.get("attempt", 0))
|
66
|
+
if not attempts:
|
67
|
+
return header + "\n\n_No attempts recorded – run the enrichment retry pipeline first._\n"
|
68
|
+
|
69
|
+
sections: List[str] = [header]
|
70
|
+
|
71
|
+
for att in attempts:
|
72
|
+
num = att.get("attempt", "?")
|
73
|
+
score = att.get("score", "?")
|
74
|
+
tp = att.get("thought_process", "").strip()
|
75
|
+
code = att.get("code_suggestion", "").rstrip()
|
76
|
+
|
77
|
+
sections.append(f"### Attempt {num} (Score: {score}/12)")
|
78
|
+
|
79
|
+
if tp:
|
80
|
+
sections.append("**Thought Process**")
|
81
|
+
# indent each line with > for blockquote formatting
|
82
|
+
quoted_tp = "\n".join(f"> {line}" for line in tp.splitlines())
|
83
|
+
sections.append(quoted_tp)
|
84
|
+
|
85
|
+
if code:
|
86
|
+
sections.append("```python")
|
87
|
+
sections.append(code)
|
88
|
+
sections.append("```")
|
89
|
+
|
90
|
+
return "\n\n".join(sections)
|
91
|
+
|
92
|
+
|
93
|
+
def compile_report(paths: Iterable[Path]) -> str:
|
94
|
+
"""Aggregate individual submission markdown into one report."""
|
95
|
+
pieces: List[str] = ["# Attempts Progress Report", ""]
|
96
|
+
for p in sorted(paths):
|
97
|
+
submission_id = p.parent.name
|
98
|
+
summary = load_summary(p)
|
99
|
+
pieces.append(build_markdown_for_summary(submission_id, summary))
|
100
|
+
pieces.append("---") # horizontal rule between PDFs
|
101
|
+
return "\n\n".join(pieces).rstrip("-\n")
|
102
|
+
|
103
|
+
|
104
|
+
def main() -> None:
|
105
|
+
parser = argparse.ArgumentParser(description="Compile multi-retry attempts into markdown.")
|
106
|
+
parser.add_argument(
|
107
|
+
"--output",
|
108
|
+
type=Path,
|
109
|
+
default=EVAL_DIR / "attempts_progress.md",
|
110
|
+
help="Destination .md file (default: eval_results/attempts_progress.md).",
|
111
|
+
)
|
112
|
+
parser.add_argument("--submission", help="Only compile a single submission ID.")
|
113
|
+
args = parser.parse_args()
|
114
|
+
|
115
|
+
summary_paths = list(iter_summary_paths(args.submission))
|
116
|
+
if not summary_paths:
|
117
|
+
raise SystemExit("No summary.json files found.")
|
118
|
+
|
119
|
+
md = compile_report(summary_paths)
|
120
|
+
args.output.write_text(md, encoding="utf-8")
|
121
|
+
print(
|
122
|
+
f"[ok] Wrote markdown report to {args.output.relative_to(ROOT)} (covers {len(summary_paths)} PDFs)"
|
123
|
+
)
|
124
|
+
|
125
|
+
|
126
|
+
if __name__ == "__main__":
|
127
|
+
main()
|
tools/bad_pdf_eval/eval_suite.py
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
import argparse
|
2
2
|
import re
|
3
3
|
from pathlib import Path
|
4
|
-
from typing import
|
4
|
+
from typing import Dict, List
|
5
5
|
|
6
6
|
import pandas as pd
|
7
7
|
from rich.console import Console
|
8
8
|
|
9
|
-
from .utils import find_local_pdf, slugify
|
10
9
|
from .analyser import BadPDFAnalyzer, extract_page_hints
|
11
10
|
from .reporter import save_json
|
11
|
+
from .utils import find_local_pdf, slugify
|
12
12
|
|
13
13
|
console = Console()
|
14
14
|
|
@@ -43,9 +43,15 @@ def main():
|
|
43
43
|
default="eval_results",
|
44
44
|
help="Directory to write results into (will be git-ignored)",
|
45
45
|
)
|
46
|
-
parser.add_argument(
|
47
|
-
|
48
|
-
|
46
|
+
parser.add_argument(
|
47
|
+
"--max-row", type=int, default=None, help="debug: process only first n CSV rows"
|
48
|
+
)
|
49
|
+
parser.add_argument(
|
50
|
+
"--limit", type=int, default=None, help="process at most N PDFs with local files"
|
51
|
+
)
|
52
|
+
parser.add_argument(
|
53
|
+
"--overwrite", action="store_true", help="re-run analysis even if summary.json exists"
|
54
|
+
)
|
49
55
|
args = parser.parse_args()
|
50
56
|
|
51
57
|
csv_path = Path(args.csv)
|
@@ -70,7 +76,9 @@ def main():
|
|
70
76
|
|
71
77
|
# Ignore files that are not .pdf (e.g. ZIPs mistakenly included)
|
72
78
|
if pdf_path.suffix.lower() != ".pdf":
|
73
|
-
console.print(
|
79
|
+
console.print(
|
80
|
+
f"[yellow]Not a PDF ({pdf_path.suffix}) for {submission_id}; skipping."
|
81
|
+
)
|
74
82
|
continue
|
75
83
|
|
76
84
|
sub_output = output_root / submission_id
|
@@ -88,12 +96,16 @@ def main():
|
|
88
96
|
console.print(f"[yellow]Could not copy PDF into results folder: {copy_err}")
|
89
97
|
|
90
98
|
if summary_path.exists() and not args.overwrite:
|
91
|
-
console.print(
|
99
|
+
console.print(
|
100
|
+
f"[yellow]Summary exists for {submission_id}; skipping (use --overwrite to refresh)"
|
101
|
+
)
|
92
102
|
continue
|
93
103
|
|
94
104
|
pages = build_pages_list(row)
|
95
105
|
try:
|
96
|
-
analyser = BadPDFAnalyzer(
|
106
|
+
analyser = BadPDFAnalyzer(
|
107
|
+
pdf_path=pdf_path, output_dir=sub_output, submission_meta=row, pages=pages
|
108
|
+
)
|
97
109
|
summary = analyser.run()
|
98
110
|
master_records.append(summary)
|
99
111
|
except Exception as e:
|
@@ -113,4 +125,4 @@ def main():
|
|
113
125
|
|
114
126
|
|
115
127
|
if __name__ == "__main__":
|
116
|
-
main()
|
128
|
+
main()
|