natural-pdf 0.1.27__py3-none-any.whl → 0.1.30__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- bad_pdf_analysis/analyze_10_more.py +300 -0
- bad_pdf_analysis/analyze_final_10.py +552 -0
- bad_pdf_analysis/analyze_specific_pages.py +394 -0
- bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
- natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
- natural_pdf/analyzers/layout/layout_manager.py +45 -1
- natural_pdf/analyzers/layout/surya.py +1 -1
- natural_pdf/analyzers/layout/yolo.py +2 -2
- natural_pdf/analyzers/shape_detection_mixin.py +228 -0
- natural_pdf/classification/manager.py +67 -0
- natural_pdf/core/element_manager.py +556 -25
- natural_pdf/core/highlighting_service.py +98 -43
- natural_pdf/core/page.py +86 -20
- natural_pdf/core/pdf.py +0 -2
- natural_pdf/describe/base.py +40 -9
- natural_pdf/describe/elements.py +11 -6
- natural_pdf/elements/base.py +134 -20
- natural_pdf/elements/collections.py +43 -11
- natural_pdf/elements/image.py +43 -0
- natural_pdf/elements/region.py +64 -19
- natural_pdf/elements/text.py +89 -11
- natural_pdf/flows/collections.py +4 -4
- natural_pdf/flows/region.py +17 -2
- natural_pdf/ocr/engine_paddle.py +1 -1
- natural_pdf/ocr/ocr_factory.py +8 -8
- natural_pdf/ocr/ocr_manager.py +51 -1
- natural_pdf/selectors/parser.py +27 -7
- natural_pdf/tables/__init__.py +5 -0
- natural_pdf/tables/result.py +101 -0
- natural_pdf/utils/bidi_mirror.py +36 -0
- natural_pdf/utils/visualization.py +15 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/METADATA +2 -1
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/RECORD +51 -29
- natural_pdf-0.1.30.dist-info/top_level.txt +6 -0
- optimization/memory_comparison.py +172 -0
- optimization/pdf_analyzer.py +410 -0
- optimization/performance_analysis.py +397 -0
- optimization/test_cleanup_methods.py +155 -0
- optimization/test_memory_fix.py +162 -0
- tools/bad_pdf_eval/__init__.py +1 -0
- tools/bad_pdf_eval/analyser.py +302 -0
- tools/bad_pdf_eval/collate_summaries.py +130 -0
- tools/bad_pdf_eval/eval_suite.py +116 -0
- tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
- tools/bad_pdf_eval/llm_enrich.py +273 -0
- tools/bad_pdf_eval/reporter.py +17 -0
- tools/bad_pdf_eval/utils.py +127 -0
- tools/rtl_smoke_test.py +80 -0
- natural_pdf-0.1.27.dist-info/top_level.txt +0 -2
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/WHEEL +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/entry_points.txt +0 -0
- {natural_pdf-0.1.27.dist-info → natural_pdf-0.1.30.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+"""
+Test script to verify character duplication fix works correctly.
+
+This test verifies that:
+1. Memory usage is reduced by eliminating character duplication
+2. All existing functionality still works correctly
+3. Character access through words remains functional
+"""
+
+import gc
+import os
+import psutil
+import sys
+from pathlib import Path
+import pytest
+
+import natural_pdf as npdf
+
+
+def get_memory_usage():
+    """Get current memory usage in MB"""
+    process = psutil.Process()
+    return process.memory_info().rss / 1024 / 1024
+
+
+class TestCharacterMemoryFix:
+    """Test suite for character memory optimization"""
+
+    @pytest.fixture
+    def test_pdf_path(self):
+        """Get path to a test PDF"""
+        # Use the practice PDF for testing
+        test_path = Path("pdfs/01-practice.pdf")
+        if not test_path.exists():
+            pytest.skip("Test PDF not found")
+        return str(test_path)
+
+    def test_character_access_still_works(self, test_pdf_path):
+        """Test that character access through words still works after optimization"""
+        pdf = npdf.PDF(test_pdf_path)
+        page = pdf.pages[0]
+
+        # Force loading of elements
+        text_elements = page.find_all("text")
+
+        # Test that we have text elements
+        assert len(text_elements) > 0, "Should have text elements"
+        print(f"Found {len(text_elements)} text elements")
+
+        # Test that words can access their constituent characters
+        for word in text_elements[:5]: # Test first 5 words
+            if hasattr(word, '_char_indices') and word._char_indices:
+                # New optimized approach
+                constituent_chars = word.chars
+                assert isinstance(constituent_chars, list), "word.chars should return a list"
+                assert len(constituent_chars) > 0, "Should have constituent characters"
+
+                # Test character properties
+                for char in constituent_chars[:3]: # Test first 3 chars of each word
+                    assert hasattr(char, 'text'), "Character should have text attribute"
+                    assert hasattr(char, 'x0'), "Character should have x0 coordinate"
+
+            elif hasattr(word, '_char_dicts') and word._char_dicts:
+                # Old approach - should still work for compatibility
+                char_dicts = word._char_dicts
+                assert isinstance(char_dicts, list), "word._char_dicts should be a list"
+                assert len(char_dicts) > 0, "Should have character dictionaries"
+
+    def test_memory_usage_improvement(self, test_pdf_path):
+        """Test that memory usage is improved with the optimization"""
+        # This test will compare memory usage patterns
+        # Note: Exact numbers will vary, but we should see improvement
+
+        pdf = npdf.PDF(test_pdf_path)
+        page = pdf.pages[0]
+
+        # Measure memory before loading elements
+        gc.collect()
+        memory_before = get_memory_usage()
+
+        # Load elements (this triggers the optimization)
+        chars = page.find_all("text")
+        words = page.find_all("words")
+
+        # Measure memory after loading
+        gc.collect()
+        memory_after = get_memory_usage()
+
+        memory_used = memory_after - memory_before
+
+        # Log the memory usage for analysis
+        print(f"\nMemory usage analysis:")
+        print(f"Characters loaded: {len(chars)}")
+        print(f"Words loaded: {len(words)}")
+        print(f"Memory used: {memory_used:.2f} MB")
+        print(f"Memory per character: {memory_used / len(chars) * 1024:.2f} KB" if chars else "N/A")
+
+        # The memory usage should be reasonable (not exact test due to variability)
+        # Main goal is to verify no crashes and reasonable memory usage
+        assert memory_used < 100, f"Memory usage seems too high: {memory_used:.2f} MB"
+
+    def test_word_text_extraction_works(self, test_pdf_path):
+        """Test that text extraction from words still works correctly"""
+        pdf = npdf.PDF(test_pdf_path)
+        page = pdf.pages[0]
+
+        words = page.find_all("text") # All text elements are words in this PDF
+
+        # Test text extraction from words
+        for word in words[:10]: # Test first 10 words
+            word_text = word.text
+            assert isinstance(word_text, str), "Word text should be a string"
+
+            # Text should not be empty for actual words
+            if word_text.strip(): # Skip empty/whitespace words
+                assert len(word_text) > 0, "Non-empty words should have text content"
+
+    def test_backwards_compatibility(self, test_pdf_path):
+        """Test that existing code patterns still work"""
+        pdf = npdf.PDF(test_pdf_path)
+        page = pdf.pages[0]
+
+        # Test that existing element access patterns work
+        all_elements = page.find_all("text")
+        assert len(all_elements) > 0, "Should find text elements"
+
+        # Test that element properties are accessible
+        for element in all_elements[:5]:
+            assert hasattr(element, 'text'), "Element should have text attribute"
+            assert hasattr(element, 'x0'), "Element should have x0 coordinate"
+            assert hasattr(element, 'top'), "Element should have top coordinate"
+            assert hasattr(element, 'width'), "Element should have width"
+            assert hasattr(element, 'height'), "Element should have height"
+
+
+def main():
+    """Run the memory fix test"""
+    print("Running character memory optimization test...")
+
+    # Check if test PDF exists
+    test_pdf = Path("pdfs/01-practice.pdf")
+    if not test_pdf.exists():
+        print(f"ERROR: Test PDF not found at {test_pdf}")
+        print("Please ensure the test PDF exists before running this test.")
+        return 1
+
+    # Run pytest on just this file
+    exit_code = pytest.main([__file__, "-v", "-s"])
+
+    if exit_code == 0:
+        print("\n✅ All memory optimization tests passed!")
+        print("The character duplication fix is working correctly.")
+    else:
+        print("\n❌ Some tests failed!")
+        print("The memory optimization needs investigation.")
+
+    return exit_code
+
+
+if __name__ == "__main__":
+    exit(main())
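The 162-line hunk above corresponds to `optimization/test_memory_fix.py` from the file list. The tests check for a `_char_indices` attribute on word elements and fall back to the older `_char_dicts` path. As a reading aid only, here is a minimal sketch of the index-based sharing pattern those tests probe for; this is not the natural-pdf implementation (the real changes are in `natural_pdf/core/element_manager.py`, not expanded in this diff), and every name other than `_char_indices`, `_char_dicts`, and `.chars` is invented for the example.

```python
# Illustrative sketch only: words keep indices into one shared page-level
# char list instead of duplicating per-character dicts on every word.
from dataclasses import dataclass, field
from typing import List


@dataclass
class Char:
    text: str
    x0: float


@dataclass
class Word:
    text: str
    _page_chars: List[Char]                      # shared, page-level char list
    _char_indices: List[int] = field(default_factory=list)

    @property
    def chars(self) -> List[Char]:
        # Resolve the shared chars lazily by index, no duplicate storage.
        return [self._page_chars[i] for i in self._char_indices]


page_chars = [Char("H", 10.0), Char("i", 14.0)]
word = Word("Hi", page_chars, _char_indices=[0, 1])
assert [c.text for c in word.chars] == ["H", "i"]
```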
@@ -0,0 +1 @@
+
@@ -0,0 +1,302 @@
+from __future__ import annotations
+
+import re
+import time
+from pathlib import Path
+from typing import Dict, List, Optional, Any
+
+import natural_pdf as npdf
+from PIL import Image
+from rich.table import Table
+from rich.console import Console
+
+from .utils import slugify
+from .reporter import save_json, log_section
+
+console = Console()
+
+
+class BadPDFAnalyzer:
+    """Run a battery of Natural-PDF probes on a PDF and dump artefacts."""
+
+    def __init__(
+        self,
+        pdf_path: Path,
+        output_dir: Path,
+        submission_meta: Dict[str, Any],
+        pages: List[int],
+        resolution: int = 216,
+    ):
+        self.pdf_path = pdf_path
+        self.output_dir = output_dir
+        self.meta = submission_meta
+        self.pages_to_analyze = pages
+        self.resolution = resolution
+
+    # ---------------------------------------------------------------------
+    # Helpers
+    # ---------------------------------------------------------------------
+    def _save_page_image(self, page, page_num: int) -> Path:
+        """Render and save page image as high-quality JPG."""
+        img: Image.Image = page.to_image(resolution=self.resolution)
+        if img.mode != "RGB":
+            img = img.convert("RGB")
+        img_path = self.output_dir / f"page_{page_num:04d}.jpg"
+        img.save(img_path, "JPEG", quality=90, optimize=True, progressive=True)
+        return img_path
+
+    # ------------------------------------------------------------------
+    def run(self) -> Dict[str, Any]:
+        """Return master JSON summary (also persisted inside output_dir)."""
+        console.print(f"[green]Analyzing[/] {self.pdf_path.name} ↦ pages {self.pages_to_analyze}")
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        pdf = npdf.PDF(str(self.pdf_path))
+        summary: Dict[str, Any] = {
+            "submission_id": self.meta["Submission ID"],
+            "pdf": str(self.pdf_path),
+            "total_pages": len(pdf.pages),
+            "pages": [],
+            "goal": self.meta.get("What are we trying to get out of the PDF?", ""),
+            "language": self.meta.get("What language(s) or script is the content in?", ""),
+            "issues": self.meta.get("What do you think makes this PDF bad?", ""),
+            "description": self.meta.get("What is the PDF, and/or where did it come from?", ""),
+        }
+
+        for page_idx_1based in self.pages_to_analyze:
+            if page_idx_1based < 1 or page_idx_1based > len(pdf.pages):
+                console.print(f"[yellow]- skipping page {page_idx_1based} (out of range)")
+                continue
+            page = pdf.pages[page_idx_1based - 1]
+            page_result: Dict[str, Any] = {"page_number": page_idx_1based}
+            difficulties: List[str] = []
+
+            # ---------------- image
+            img_path = self._save_page_image(page, page_idx_1based)
+            page_result["image"] = str(img_path)
+
+            # ---------------- describe / inspect
+            try:
+                descr = page.describe()
+                page_result["describe"] = str(descr)
+            except Exception as e:
+                page_result["describe_error"] = str(e)
+
+            try:
+                page_result["inspect"] = str(page.inspect(limit=30))
+            except Exception as e:
+                page_result["inspect_error"] = str(e)
+
+            # ---------------- extract text
+            text = ""
+            try:
+                text = page.extract_text()
+                page_result["text_len"] = len(text or "")
+                if text:
+                    page_result["text_preview"] = text[:300]
+            except Exception as e:
+                page_result["extract_text_error"] = str(e)
+
+            # ---------------- tiny font detection
+            try:
+                words_sample = page.words[:5000] # not expensive
+                if words_sample:
+                    small_fonts = sum(1 for w in words_sample if getattr(w, "size", 10) < 4)
+                    ratio = small_fonts / len(words_sample)
+                    page_result["tiny_font_ratio"] = round(ratio, 3)
+                    if ratio >= 0.2:
+                        difficulties.append("tiny_font")
+            except Exception:
+                pass
+
+            # ---------------- extract table simple
+            try:
+                table_data = page.extract_table()
+                if table_data and table_data[0]:
+                    page_result["table_found"] = True
+                    page_result["table_dims"] = [len(table_data), len(table_data[0])]
+                else:
+                    page_result["table_found"] = False
+            except Exception as e:
+                page_result["table_error"] = str(e)
+
+            # ---------------- layout YOLO
+            try:
+                yolo_layout = page.analyze_layout()
+                page_result["layout_yolo_count"] = len(yolo_layout)
+                page_result["layout_yolo_regions"] = [
+                    {
+                        "type": getattr(r, "type", "unknown"),
+                        "bbox": [r.x0, r.top, r.x1, r.bottom],
+                        "confidence": getattr(r, "confidence", None),
+                    }
+                    for r in yolo_layout
+                ]
+            except Exception as e:
+                page_result["layout_yolo_error"] = str(e)
+
+            # ---------------- layout TATR for tables
+            try:
+                tatr_layout = page.analyze_layout("tatr", existing="append")
+                page_result["layout_tatr_count"] = len(tatr_layout)
+                page_result["layout_tatr_regions"] = [
+                    {
+                        "type": getattr(r, "type", "unknown"),
+                        "bbox": [r.x0, r.top, r.x1, r.bottom],
+                        "confidence": getattr(r, "confidence", None),
+                    }
+                    for r in tatr_layout
+                ]
+            except Exception as e:
+                page_result["layout_tatr_error"] = str(e)
+
+            # ---------------- color blob detection (rect fills / graphical anchors)
+            try:
+                blobs = page.detect_blobs()
+                page_result["blob_count"] = len(blobs)
+                page_result["blobs_sample"] = [
+                    {
+                        "color": getattr(b, "color", None),
+                        "bbox": [b.x0, b.top, b.x1, b.bottom],
+                    }
+                    for b in blobs[:20]
+                ]
+            except Exception as e:
+                page_result["blobs_error"] = str(e)
+
+            # ---------------- OCR pass (only if little native text)
+            ocr_elements = []
+            if page_result.get("text_len", 0) < 100:
+                start = time.time()
+                try:
+                    ocr_elements = page.extract_ocr_elements(engine="easyocr")
+                    page_result["ocr_text_elements"] = len(ocr_elements)
+                    page_result["ocr_runtime_sec"] = round(time.time() - start, 2)
+                    # Embed small OCR preview instead of separate file
+                    ocr_json = [
+                        {
+                            "text": el.text,
+                            "bbox": [el.x0, el.top, el.x1, el.bottom],
+                            "size": getattr(el, "size", None),
+                        }
+                        for el in ocr_elements[:500]
+                    ]
+                    page_result["ocr_sample"] = ocr_json[:30]
+                except Exception as e:
+                    page_result["ocr_error"] = str(e)
+            else:
+                page_result["ocr_text_elements"] = 0
+
+            # ---------------- tags – handle non-string entries (NaN etc.)
+            goal_raw = summary.get("goal", "")
+            # Convert to string to avoid attribute errors if the CSV cell is NaN/float
+            goal_str = str(goal_raw) if goal_raw is not None else ""
+            goal = goal_str.lower()
+
+            if "table" in goal:
+                page_result["goal_tag"] = "table_extraction"
+            elif any(word in goal for word in ["text", "content", "information"]):
+                page_result["goal_tag"] = "text_extraction"
+            else:
+                page_result["goal_tag"] = "unknown"
+
+            # Difficulties determination
+            if page_result.get("text_len", 0) < 100 and page_result.get("ocr_text_elements", 0) > 20:
+                difficulties.append("scanned_image")
+
+            page_result["difficulties"] = difficulties
+
+            # Suggested approach heuristic
+            approach = []
+            if "table" in goal:
+                if page_result.get("layout_tatr_count", 0) > 0:
+                    approach.append("Crop TATR regions → extract_table('tatr')")
+                else:
+                    approach.append("Anchor header text, .below(), extract_table(custom settings)")
+            if "text" in goal and "scanned_image" in difficulties:
+                approach.append("Apply OCR (paddle for non-Latin)")
+            if "tiny_font" in difficulties:
+                approach.append("Re-render at higher scale or adjust char_margin")
+            page_result["suggested_approach"] = "; ".join(approach)
+
+            # ---------------- code snippet suggestion
+            def _first_anchor_from_goal(g: str) -> str:
+                """Pick a plausible anchor token (capitalised word) from the free-form goal text."""
+                for tok in g.split():
+                    t = tok.strip().strip(".;:,()[]{}")
+                    if len(t) > 3 and t[0].isupper() and t.isalpha():
+                        return t
+                return "AnchorText"
+
+            import_lines = [
+                "from natural_pdf import PDF",
+            ]
+            if page_result["goal_tag"] == "table_extraction":
+                import_lines.append("import pandas as pd")
+
+            code_lines: List[str] = import_lines + [
+                f"pdf = PDF(\"{self.pdf_path}\")",
+                f"page = pdf.pages[{page_idx_1based - 1}] # page {page_idx_1based}",
+            ]
+
+            thought_lines: List[str] = []
+            # build reasoning
+            thought_lines.append(f"Goal tag: {page_result['goal_tag']}. Detected difficulties: {', '.join(difficulties) or 'none'}.")
+
+            if page_result["goal_tag"] == "table_extraction":
+                thought_lines.append("Plan: rely on layout models to locate tables, then extract with Natural-PDF helper.")
+                if page_result.get("layout_tatr_count", 0) > 0:
+                    code_lines.append("page.analyze_layout('tatr') # adds 'table' regions")
+                else:
+                    code_lines.append("page.analyze_layout() # YOLO fallback")
+
+                if page_result.get("layout_tatr_count", 0) > 1:
+                    thought_lines.append("Multiple tables detected, choose second as goal mentions 'second table'.")
+                    code_lines.append("tables = page.find_all('table')")
+                    code_lines.append("tbl = tables[1]")
+                else:
+                    code_lines.append("tbl = page.find('table') # first table")
+
+                code_lines.extend([
+                    "data = tbl.extract_table()",
+                    "columns, rows = data[0], data[1:]",
+                    "df = pd.DataFrame(rows, columns=columns)",
+                ])
+            elif page_result["goal_tag"] == "text_extraction":
+                anchor = _first_anchor_from_goal(goal_str)
+                if "scanned_image" in difficulties:
+                    thought_lines.append("No native text detected; need OCR before querying.")
+                    code_lines.append("page.apply_ocr(engine='paddle')")
+                thought_lines.append(f"Anchor on text '{anchor}' then read below region.")
+                code_lines.append(f"section = page.find(\"text:contains({anchor})\").below(0, 50)")
+                code_lines.append("text = section.extract_text()")
+            else:
+                thought_lines.append("Goal unclear; placeholder snippet provided.")
+                code_lines.append("# TODO: clarify extraction goal")
+
+            page_result["code_suggestion"] = "\n".join(code_lines)
+            page_result["thought_process"] = " ".join(thought_lines)
+
+            summary["pages"].append(page_result)
+
+            # Provide quick heuristic comment
+            if page_result.get("text_len", 0) == 0 and page_result.get("ocr_text_elements", 0) > 20:
+                page_result["auto_comment"] = "Likely scanned/needs OCR; no native text."
+            elif page_result.get("text_len", 0) > 1000 and page_result.get("layout_yolo_count", 0) == 0:
+                page_result["auto_comment"] = "Native dense text; YOLO found no regions – may be fine, fonts just small."
+            else:
+                page_result.setdefault("auto_comment", "")
+
+        # Save master summary
+        save_json(summary, self.output_dir / "summary.json")
+        return summary
+
+
+# -------------------------------------------------------------------------
+# Helper to parse specific pages mentioned in free text
+# -------------------------------------------------------------------------
+PAGE_REGEX = re.compile(r"page\s*(\d{1,4})", re.IGNORECASE)
+
+
+def extract_page_hints(text: str) -> List[int]:
+    return [int(m.group(1)) for m in PAGE_REGEX.finditer(text)]
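The 302-line hunk above corresponds to `tools/bad_pdf_eval/analyser.py` from the file list; it exposes the probe battery through `BadPDFAnalyzer.run()` plus the `extract_page_hints()` helper. The real driver is `tools/bad_pdf_eval/eval_suite.py` (also added in this release but not expanded here), so the following is only a hand-rolled usage sketch: the metadata keys mirror the ones read in `run()`, while the paths and values are hypothetical.

```python
# Usage sketch only; not taken from eval_suite.py. Paths and metadata values
# are made-up examples.
from pathlib import Path

from tools.bad_pdf_eval.analyser import BadPDFAnalyzer, extract_page_hints

meta = {
    "Submission ID": "example-001",
    "What are we trying to get out of the PDF?": "The table on page 3",
    "What language(s) or script is the content in?": "English",
    "What do you think makes this PDF bad?": "Scanned, tiny fonts",
    "What is the PDF, and/or where did it come from?": "Agency budget report",
}

# extract_page_hints() pulls "page N" mentions out of the free-form goal text.
pages = extract_page_hints(meta["What are we trying to get out of the PDF?"]) or [1]

analyzer = BadPDFAnalyzer(
    pdf_path=Path("pdfs/example.pdf"),
    output_dir=Path("eval_results/example-001"),
    submission_meta=meta,
    pages=pages,
    resolution=216,
)
summary = analyzer.run()  # writes page_NNNN.jpg artefacts and summary.json
print(summary["pages"][0]["goal_tag"])
```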
@@ -0,0 +1,130 @@
+import json
+from pathlib import Path
+from typing import List
+
+ROOT = Path(__file__).resolve().parent.parent.parent / "eval_results"
+
+
+def collect() -> List[dict]:
+    rows = []
+    for subdir in ROOT.iterdir():
+        if not subdir.is_dir():
+            continue
+        summary_path = subdir / "summary.json"
+        if not summary_path.exists():
+            continue
+        try:
+            data = json.loads(summary_path.read_text())
+        except Exception as e:
+            print(f"Failed to parse {summary_path}: {e}")
+            continue
+        submission_id = data.get("submission_id", subdir.name)
+        description = data.get("description", "")
+        language = data.get("language", "")
+        issues = data.get("issues", "")
+
+        # ---------------- document-level enrichment (added by llm_enrich.py) ----
+        doc_tp = (data.get("thought_process") or "").strip()
+        doc_cs = (data.get("code_suggestion") or "").strip()
+        doc_diff = data.get("difficult_elements", [])
+        doc_test = (data.get("test_case") or "").strip()
+
+        page_snippets = []
+        features = set()
+        for p in data.get("pages", [])[:5]: # first 5 pages enough for summary
+            cs = (p.get("code_suggestion") or "").strip()
+            tp = (p.get("thought_process") or "").strip()
+            if not cs and not tp:
+                continue
+            page_snippets.append({
+                "page": p.get("page_number"),
+                "code": cs,
+                "thought": tp,
+            })
+            # --- lightweight feature tagging --------------------------------
+            gt = (p.get("goal_tag") or "").lower()
+            if "table" in gt:
+                features.add("table")
+            if "text" in gt:
+                features.add("text")
+            # look into region labels for structural hints
+            for reg in p.get("layout_tatr_regions", []) + p.get("layout_yolo_regions", []):
+                label = (reg.get("label") or reg.get("type") or "").lower()
+                if label == "table":
+                    features.add("table")
+                if label in {"figure", "isolate_formula"}:
+                    features.add("figure")
+            # parse difficulties hints in thought_process
+            difficulties = tp.lower()
+            if "scanned_image" in difficulties:
+                features.add("scanned_image")
+            if "tiny_font" in difficulties or "small font" in difficulties:
+                features.add("small_font")
+            # language-based feature
+            if language and language.lower() not in {"english", "en", "en-us"}:
+                features.add("non_english")
+
+        rows.append({
+            "id": submission_id,
+            "language": language,
+            "issues": issues,
+            "description": description,
+            "doc_thought": doc_tp,
+            "doc_code": doc_cs,
+            "doc_difficult": doc_diff,
+            "doc_test": doc_test,
+            "snippets": page_snippets,
+            "features": sorted(features),
+        })
+    return rows
+
+
+def export_markdown(rows: List[dict]):
+    lines = ["# Evaluation Summaries\n"]
+    for r in sorted(rows, key=lambda x: x["id"]):
+        lines.append(f"## {r['id']}")
+        if r["description"]:
+            lines.append(f"*Description*: {r['description']}")
+        if r["issues"]:
+            lines.append(f"*Issues*: {r['issues']}")
+        if r["language"]:
+            lines.append(f"*Language*: {r['language']}")
+        if r.get("features"):
+            lines.append(f"*Features*: {', '.join(r['features'])}")
+
+        # ---- document-level enrichment -----------------------------------
+        if r.get("doc_thought") or r.get("doc_code"):
+            lines.append("\n### Document-level enrichment")
+            if r.get("doc_thought"):
+                lines.append("**Doc thought process:**")
+                lines.append(f"```")
+                lines.append(r["doc_thought"])
+                lines.append(f"```")
+            if r.get("doc_code"):
+                lines.append("**Doc code suggestion:**")
+                lines.append(f"```python")
+                lines.append(r["doc_code"])
+                lines.append(f"```")
+            if r.get("doc_difficult"):
+                lines.append("*Difficult elements*: " + ", ".join(r["doc_difficult"]))
+            if r.get("doc_test"):
+                lines.append("*Suggested test*: " + r["doc_test"])
+
+        lines.append("")
+        for s in r["snippets"]:
+            lines.append(f"### Page {s['page']}")
+            if s["thought"]:
+                lines.append("**Thoughts**:")
+                lines.append(f"```\n{s['thought']}\n```")
+            if s["code"]:
+                lines.append("**Code suggestion**:")
+                lines.append(f"```python\n{s['code']}\n```")
+            lines.append("")
+        lines.append("\n---\n")
+    Path("eval_results/collated_summary.md").write_text("\n".join(lines))
+
+
+if __name__ == "__main__":
+    rows = collect()
+    export_markdown(rows)
+    print(f"Wrote {len(rows)} summaries to eval_results/collated_summary.md")
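The 130-line hunk above corresponds to `tools/bad_pdf_eval/collate_summaries.py` from the file list. It walks `eval_results/<submission>/summary.json` files (the artefacts `BadPDFAnalyzer.run()` writes) and rebuilds `eval_results/collated_summary.md`. A short usage sketch, assuming the working directory is the repository root so that both the `ROOT` lookup (three levels above the module) and the relative output path resolve to the same `eval_results/` folder:

```python
# Sketch only: run from the repository root. Equivalent to invoking the
# module as a script, which calls these same two helpers.
from tools.bad_pdf_eval.collate_summaries import collect, export_markdown

rows = collect()        # one dict per eval_results/<submission>/summary.json
export_markdown(rows)   # rewrites eval_results/collated_summary.md
print(f"Collated {len(rows)} submissions")
```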