natural-pdf 0.1.28__py3-none-any.whl → 0.1.31__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. bad_pdf_analysis/analyze_10_more.py +300 -0
  2. bad_pdf_analysis/analyze_final_10.py +552 -0
  3. bad_pdf_analysis/analyze_specific_pages.py +394 -0
  4. bad_pdf_analysis/analyze_specific_pages_direct.py +382 -0
  5. natural_pdf/analyzers/layout/layout_analyzer.py +2 -3
  6. natural_pdf/analyzers/layout/layout_manager.py +44 -0
  7. natural_pdf/analyzers/layout/surya.py +1 -1
  8. natural_pdf/analyzers/shape_detection_mixin.py +228 -0
  9. natural_pdf/classification/manager.py +67 -0
  10. natural_pdf/core/element_manager.py +578 -27
  11. natural_pdf/core/highlighting_service.py +98 -43
  12. natural_pdf/core/page.py +86 -20
  13. natural_pdf/core/pdf.py +0 -2
  14. natural_pdf/describe/base.py +40 -9
  15. natural_pdf/describe/elements.py +11 -6
  16. natural_pdf/elements/base.py +134 -20
  17. natural_pdf/elements/collections.py +43 -11
  18. natural_pdf/elements/image.py +43 -0
  19. natural_pdf/elements/region.py +64 -19
  20. natural_pdf/elements/text.py +118 -11
  21. natural_pdf/flows/collections.py +4 -4
  22. natural_pdf/flows/region.py +17 -2
  23. natural_pdf/ocr/ocr_manager.py +50 -0
  24. natural_pdf/selectors/parser.py +27 -7
  25. natural_pdf/tables/__init__.py +5 -0
  26. natural_pdf/tables/result.py +101 -0
  27. natural_pdf/utils/bidi_mirror.py +36 -0
  28. natural_pdf/utils/visualization.py +15 -1
  29. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/METADATA +2 -1
  30. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/RECORD +48 -26
  31. natural_pdf-0.1.31.dist-info/top_level.txt +6 -0
  32. optimization/memory_comparison.py +172 -0
  33. optimization/pdf_analyzer.py +410 -0
  34. optimization/performance_analysis.py +397 -0
  35. optimization/test_cleanup_methods.py +155 -0
  36. optimization/test_memory_fix.py +162 -0
  37. tools/bad_pdf_eval/__init__.py +1 -0
  38. tools/bad_pdf_eval/analyser.py +302 -0
  39. tools/bad_pdf_eval/collate_summaries.py +130 -0
  40. tools/bad_pdf_eval/eval_suite.py +116 -0
  41. tools/bad_pdf_eval/export_enrichment_csv.py +62 -0
  42. tools/bad_pdf_eval/llm_enrich.py +273 -0
  43. tools/bad_pdf_eval/reporter.py +17 -0
  44. tools/bad_pdf_eval/utils.py +127 -0
  45. tools/rtl_smoke_test.py +80 -0
  46. natural_pdf-0.1.28.dist-info/top_level.txt +0 -2
  47. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/WHEEL +0 -0
  48. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/entry_points.txt +0 -0
  49. {natural_pdf-0.1.28.dist-info → natural_pdf-0.1.31.dist-info}/licenses/LICENSE +0 -0
--- /dev/null
+++ b/optimization/test_memory_fix.py
@@ -0,0 +1,162 @@
+ #!/usr/bin/env python3
+ """
+ Test script to verify character duplication fix works correctly.
+
+ This test verifies that:
+ 1. Memory usage is reduced by eliminating character duplication
+ 2. All existing functionality still works correctly
+ 3. Character access through words remains functional
+ """
+
+ import gc
+ import os
+ import psutil
+ import sys
+ from pathlib import Path
+ import pytest
+
+ import natural_pdf as npdf
+
+
+ def get_memory_usage():
+     """Get current memory usage in MB"""
+     process = psutil.Process()
+     return process.memory_info().rss / 1024 / 1024
+
+
+ class TestCharacterMemoryFix:
+     """Test suite for character memory optimization"""
+
+     @pytest.fixture
+     def test_pdf_path(self):
+         """Get path to a test PDF"""
+         # Use the practice PDF for testing
+         test_path = Path("pdfs/01-practice.pdf")
+         if not test_path.exists():
+             pytest.skip("Test PDF not found")
+         return str(test_path)
+
+     def test_character_access_still_works(self, test_pdf_path):
+         """Test that character access through words still works after optimization"""
+         pdf = npdf.PDF(test_pdf_path)
+         page = pdf.pages[0]
+
+         # Force loading of elements
+         text_elements = page.find_all("text")
+
+         # Test that we have text elements
+         assert len(text_elements) > 0, "Should have text elements"
+         print(f"Found {len(text_elements)} text elements")
+
+         # Test that words can access their constituent characters
+         for word in text_elements[:5]:  # Test first 5 words
+             if hasattr(word, '_char_indices') and word._char_indices:
+                 # New optimized approach
+                 constituent_chars = word.chars
+                 assert isinstance(constituent_chars, list), "word.chars should return a list"
+                 assert len(constituent_chars) > 0, "Should have constituent characters"
+
+                 # Test character properties
+                 for char in constituent_chars[:3]:  # Test first 3 chars of each word
+                     assert hasattr(char, 'text'), "Character should have text attribute"
+                     assert hasattr(char, 'x0'), "Character should have x0 coordinate"
+
+             elif hasattr(word, '_char_dicts') and word._char_dicts:
+                 # Old approach - should still work for compatibility
+                 char_dicts = word._char_dicts
+                 assert isinstance(char_dicts, list), "word._char_dicts should be a list"
+                 assert len(char_dicts) > 0, "Should have character dictionaries"
+
+     def test_memory_usage_improvement(self, test_pdf_path):
+         """Test that memory usage is improved with the optimization"""
+         # This test will compare memory usage patterns
+         # Note: Exact numbers will vary, but we should see improvement
+
+         pdf = npdf.PDF(test_pdf_path)
+         page = pdf.pages[0]
+
+         # Measure memory before loading elements
+         gc.collect()
+         memory_before = get_memory_usage()
+
+         # Load elements (this triggers the optimization)
+         chars = page.find_all("text")
+         words = page.find_all("words")
+
+         # Measure memory after loading
+         gc.collect()
+         memory_after = get_memory_usage()
+
+         memory_used = memory_after - memory_before
+
+         # Log the memory usage for analysis
+         print(f"\nMemory usage analysis:")
+         print(f"Characters loaded: {len(chars)}")
+         print(f"Words loaded: {len(words)}")
+         print(f"Memory used: {memory_used:.2f} MB")
+         print(f"Memory per character: {memory_used / len(chars) * 1024:.2f} KB" if chars else "N/A")
+
+         # The memory usage should be reasonable (not exact test due to variability)
+         # Main goal is to verify no crashes and reasonable memory usage
+         assert memory_used < 100, f"Memory usage seems too high: {memory_used:.2f} MB"
+
+     def test_word_text_extraction_works(self, test_pdf_path):
+         """Test that text extraction from words still works correctly"""
+         pdf = npdf.PDF(test_pdf_path)
+         page = pdf.pages[0]
+
+         words = page.find_all("text")  # All text elements are words in this PDF
+
+         # Test text extraction from words
+         for word in words[:10]:  # Test first 10 words
+             word_text = word.text
+             assert isinstance(word_text, str), "Word text should be a string"
+
+             # Text should not be empty for actual words
+             if word_text.strip():  # Skip empty/whitespace words
+                 assert len(word_text) > 0, "Non-empty words should have text content"
+
+     def test_backwards_compatibility(self, test_pdf_path):
+         """Test that existing code patterns still work"""
+         pdf = npdf.PDF(test_pdf_path)
+         page = pdf.pages[0]
+
+         # Test that existing element access patterns work
+         all_elements = page.find_all("text")
+         assert len(all_elements) > 0, "Should find text elements"
+
+         # Test that element properties are accessible
+         for element in all_elements[:5]:
+             assert hasattr(element, 'text'), "Element should have text attribute"
+             assert hasattr(element, 'x0'), "Element should have x0 coordinate"
+             assert hasattr(element, 'top'), "Element should have top coordinate"
+             assert hasattr(element, 'width'), "Element should have width"
+             assert hasattr(element, 'height'), "Element should have height"
+
+
+ def main():
+     """Run the memory fix test"""
+     print("Running character memory optimization test...")
+
+     # Check if test PDF exists
+     test_pdf = Path("pdfs/01-practice.pdf")
+     if not test_pdf.exists():
+         print(f"ERROR: Test PDF not found at {test_pdf}")
+         print("Please ensure the test PDF exists before running this test.")
+         return 1
+
+     # Run pytest on just this file
+     exit_code = pytest.main([__file__, "-v", "-s"])
+
+     if exit_code == 0:
+         print("\n✅ All memory optimization tests passed!")
+         print("The character duplication fix is working correctly.")
+     else:
+         print("\n❌ Some tests failed!")
+         print("The memory optimization needs investigation.")
+
+     return exit_code
+
+
+ if __name__ == "__main__":
+     exit(main())
--- /dev/null
+++ b/tools/bad_pdf_eval/__init__.py
@@ -0,0 +1 @@
+
--- /dev/null
+++ b/tools/bad_pdf_eval/analyser.py
@@ -0,0 +1,302 @@
+ from __future__ import annotations
+
+ import re
+ import time
+ from pathlib import Path
+ from typing import Dict, List, Optional, Any
+
+ import natural_pdf as npdf
+ from PIL import Image
+ from rich.table import Table
+ from rich.console import Console
+
+ from .utils import slugify
+ from .reporter import save_json, log_section
+
+ console = Console()
+
+
+ class BadPDFAnalyzer:
+     """Run a battery of Natural-PDF probes on a PDF and dump artefacts."""
+
+     def __init__(
+         self,
+         pdf_path: Path,
+         output_dir: Path,
+         submission_meta: Dict[str, Any],
+         pages: List[int],
+         resolution: int = 216,
+     ):
+         self.pdf_path = pdf_path
+         self.output_dir = output_dir
+         self.meta = submission_meta
+         self.pages_to_analyze = pages
+         self.resolution = resolution
+
+     # ---------------------------------------------------------------------
+     # Helpers
+     # ---------------------------------------------------------------------
+     def _save_page_image(self, page, page_num: int) -> Path:
+         """Render and save page image as high-quality JPG."""
+         img: Image.Image = page.to_image(resolution=self.resolution)
+         if img.mode != "RGB":
+             img = img.convert("RGB")
+         img_path = self.output_dir / f"page_{page_num:04d}.jpg"
+         img.save(img_path, "JPEG", quality=90, optimize=True, progressive=True)
+         return img_path
+
+     # ------------------------------------------------------------------
+     def run(self) -> Dict[str, Any]:
+         """Return master JSON summary (also persisted inside output_dir)."""
+         console.print(f"[green]Analyzing[/] {self.pdf_path.name} ↦ pages {self.pages_to_analyze}")
+         self.output_dir.mkdir(parents=True, exist_ok=True)
+
+         pdf = npdf.PDF(str(self.pdf_path))
+         summary: Dict[str, Any] = {
+             "submission_id": self.meta["Submission ID"],
+             "pdf": str(self.pdf_path),
+             "total_pages": len(pdf.pages),
+             "pages": [],
+             "goal": self.meta.get("What are we trying to get out of the PDF?", ""),
+             "language": self.meta.get("What language(s) or script is the content in?", ""),
+             "issues": self.meta.get("What do you think makes this PDF bad?", ""),
+             "description": self.meta.get("What is the PDF, and/or where did it come from?", ""),
+         }
+
+         for page_idx_1based in self.pages_to_analyze:
+             if page_idx_1based < 1 or page_idx_1based > len(pdf.pages):
+                 console.print(f"[yellow]- skipping page {page_idx_1based} (out of range)")
+                 continue
+             page = pdf.pages[page_idx_1based - 1]
+             page_result: Dict[str, Any] = {"page_number": page_idx_1based}
+             difficulties: List[str] = []
+
+             # ---------------- image
+             img_path = self._save_page_image(page, page_idx_1based)
+             page_result["image"] = str(img_path)
+
+             # ---------------- describe / inspect
+             try:
+                 descr = page.describe()
+                 page_result["describe"] = str(descr)
+             except Exception as e:
+                 page_result["describe_error"] = str(e)
+
+             try:
+                 page_result["inspect"] = str(page.inspect(limit=30))
+             except Exception as e:
+                 page_result["inspect_error"] = str(e)
+
+             # ---------------- extract text
+             text = ""
+             try:
+                 text = page.extract_text()
+                 page_result["text_len"] = len(text or "")
+                 if text:
+                     page_result["text_preview"] = text[:300]
+             except Exception as e:
+                 page_result["extract_text_error"] = str(e)
+
+             # ---------------- tiny font detection
+             try:
+                 words_sample = page.words[:5000]  # not expensive
+                 if words_sample:
+                     small_fonts = sum(1 for w in words_sample if getattr(w, "size", 10) < 4)
+                     ratio = small_fonts / len(words_sample)
+                     page_result["tiny_font_ratio"] = round(ratio, 3)
+                     if ratio >= 0.2:
+                         difficulties.append("tiny_font")
+             except Exception:
+                 pass
+
+             # ---------------- extract table simple
+             try:
+                 table_data = page.extract_table()
+                 if table_data and table_data[0]:
+                     page_result["table_found"] = True
+                     page_result["table_dims"] = [len(table_data), len(table_data[0])]
+                 else:
+                     page_result["table_found"] = False
+             except Exception as e:
+                 page_result["table_error"] = str(e)
+
+             # ---------------- layout YOLO
+             try:
+                 yolo_layout = page.analyze_layout()
+                 page_result["layout_yolo_count"] = len(yolo_layout)
+                 page_result["layout_yolo_regions"] = [
+                     {
+                         "type": getattr(r, "type", "unknown"),
+                         "bbox": [r.x0, r.top, r.x1, r.bottom],
+                         "confidence": getattr(r, "confidence", None),
+                     }
+                     for r in yolo_layout
+                 ]
+             except Exception as e:
+                 page_result["layout_yolo_error"] = str(e)
+
+             # ---------------- layout TATR for tables
+             try:
+                 tatr_layout = page.analyze_layout("tatr", existing="append")
+                 page_result["layout_tatr_count"] = len(tatr_layout)
+                 page_result["layout_tatr_regions"] = [
+                     {
+                         "type": getattr(r, "type", "unknown"),
+                         "bbox": [r.x0, r.top, r.x1, r.bottom],
+                         "confidence": getattr(r, "confidence", None),
+                     }
+                     for r in tatr_layout
+                 ]
+             except Exception as e:
+                 page_result["layout_tatr_error"] = str(e)
+
+             # ---------------- color blob detection (rect fills / graphical anchors)
+             try:
+                 blobs = page.detect_blobs()
+                 page_result["blob_count"] = len(blobs)
+                 page_result["blobs_sample"] = [
+                     {
+                         "color": getattr(b, "color", None),
+                         "bbox": [b.x0, b.top, b.x1, b.bottom],
+                     }
+                     for b in blobs[:20]
+                 ]
+             except Exception as e:
+                 page_result["blobs_error"] = str(e)
+
+             # ---------------- OCR pass (only if little native text)
+             ocr_elements = []
+             if page_result.get("text_len", 0) < 100:
+                 start = time.time()
+                 try:
+                     ocr_elements = page.extract_ocr_elements(engine="easyocr")
+                     page_result["ocr_text_elements"] = len(ocr_elements)
+                     page_result["ocr_runtime_sec"] = round(time.time() - start, 2)
+                     # Embed small OCR preview instead of separate file
+                     ocr_json = [
+                         {
+                             "text": el.text,
+                             "bbox": [el.x0, el.top, el.x1, el.bottom],
+                             "size": getattr(el, "size", None),
+                         }
+                         for el in ocr_elements[:500]
+                     ]
+                     page_result["ocr_sample"] = ocr_json[:30]
+                 except Exception as e:
+                     page_result["ocr_error"] = str(e)
+             else:
+                 page_result["ocr_text_elements"] = 0
+
+             # ---------------- tags – handle non-string entries (NaN etc.)
+             goal_raw = summary.get("goal", "")
+             # Convert to string to avoid attribute errors if the CSV cell is NaN/float
+             goal_str = str(goal_raw) if goal_raw is not None else ""
+             goal = goal_str.lower()
+
+             if "table" in goal:
+                 page_result["goal_tag"] = "table_extraction"
+             elif any(word in goal for word in ["text", "content", "information"]):
+                 page_result["goal_tag"] = "text_extraction"
+             else:
+                 page_result["goal_tag"] = "unknown"
+
+             # Difficulties determination
+             if page_result.get("text_len", 0) < 100 and page_result.get("ocr_text_elements", 0) > 20:
+                 difficulties.append("scanned_image")
+
+             page_result["difficulties"] = difficulties
+
+             # Suggested approach heuristic
+             approach = []
+             if "table" in goal:
+                 if page_result.get("layout_tatr_count", 0) > 0:
+                     approach.append("Crop TATR regions → extract_table('tatr')")
+                 else:
+                     approach.append("Anchor header text, .below(), extract_table(custom settings)")
+             if "text" in goal and "scanned_image" in difficulties:
+                 approach.append("Apply OCR (paddle for non-Latin)")
+             if "tiny_font" in difficulties:
+                 approach.append("Re-render at higher scale or adjust char_margin")
+             page_result["suggested_approach"] = "; ".join(approach)
+
+             # ---------------- code snippet suggestion
+             def _first_anchor_from_goal(g: str) -> str:
+                 """Pick a plausible anchor token (capitalised word) from the free-form goal text."""
+                 for tok in g.split():
+                     t = tok.strip().strip(".;:,()[]{}")
+                     if len(t) > 3 and t[0].isupper() and t.isalpha():
+                         return t
+                 return "AnchorText"
+
+             import_lines = [
+                 "from natural_pdf import PDF",
+             ]
+             if page_result["goal_tag"] == "table_extraction":
+                 import_lines.append("import pandas as pd")
+
+             code_lines: List[str] = import_lines + [
+                 f"pdf = PDF(\"{self.pdf_path}\")",
+                 f"page = pdf.pages[{page_idx_1based - 1}] # page {page_idx_1based}",
+             ]
+
+             thought_lines: List[str] = []
+             # build reasoning
+             thought_lines.append(f"Goal tag: {page_result['goal_tag']}. Detected difficulties: {', '.join(difficulties) or 'none'}.")
+
+             if page_result["goal_tag"] == "table_extraction":
+                 thought_lines.append("Plan: rely on layout models to locate tables, then extract with Natural-PDF helper.")
+                 if page_result.get("layout_tatr_count", 0) > 0:
+                     code_lines.append("page.analyze_layout('tatr') # adds 'table' regions")
+                 else:
+                     code_lines.append("page.analyze_layout() # YOLO fallback")
+
+                 if page_result.get("layout_tatr_count", 0) > 1:
+                     thought_lines.append("Multiple tables detected, choose second as goal mentions 'second table'.")
+                     code_lines.append("tables = page.find_all('table')")
+                     code_lines.append("tbl = tables[1]")
+                 else:
+                     code_lines.append("tbl = page.find('table') # first table")
+
+                 code_lines.extend([
+                     "data = tbl.extract_table()",
+                     "columns, rows = data[0], data[1:]",
+                     "df = pd.DataFrame(rows, columns=columns)",
+                 ])
+             elif page_result["goal_tag"] == "text_extraction":
+                 anchor = _first_anchor_from_goal(goal_str)
+                 if "scanned_image" in difficulties:
+                     thought_lines.append("No native text detected; need OCR before querying.")
+                     code_lines.append("page.apply_ocr(engine='paddle')")
+                 thought_lines.append(f"Anchor on text '{anchor}' then read below region.")
+                 code_lines.append(f"section = page.find(\"text:contains({anchor})\").below(0, 50)")
+                 code_lines.append("text = section.extract_text()")
+             else:
+                 thought_lines.append("Goal unclear; placeholder snippet provided.")
+                 code_lines.append("# TODO: clarify extraction goal")
+
+             page_result["code_suggestion"] = "\n".join(code_lines)
+             page_result["thought_process"] = " ".join(thought_lines)
+
+             summary["pages"].append(page_result)
+
+             # Provide quick heuristic comment
+             if page_result.get("text_len", 0) == 0 and page_result.get("ocr_text_elements", 0) > 20:
+                 page_result["auto_comment"] = "Likely scanned/needs OCR; no native text."
+             elif page_result.get("text_len", 0) > 1000 and page_result.get("layout_yolo_count", 0) == 0:
+                 page_result["auto_comment"] = "Native dense text; YOLO found no regions – may be fine, fonts just small."
+             else:
+                 page_result.setdefault("auto_comment", "")
+
+         # Save master summary
+         save_json(summary, self.output_dir / "summary.json")
+         return summary
+
+
+ # -------------------------------------------------------------------------
+ # Helper to parse specific pages mentioned in free text
+ # -------------------------------------------------------------------------
+ PAGE_REGEX = re.compile(r"page\s*(\d{1,4})", re.IGNORECASE)
+
+
+ def extract_page_hints(text: str) -> List[int]:
+     return [int(m.group(1)) for m in PAGE_REGEX.finditer(text)]
--- /dev/null
+++ b/tools/bad_pdf_eval/collate_summaries.py
@@ -0,0 +1,130 @@
+ import json
+ from pathlib import Path
+ from typing import List
+
+ ROOT = Path(__file__).resolve().parent.parent.parent / "eval_results"
+
+
+ def collect() -> List[dict]:
+     rows = []
+     for subdir in ROOT.iterdir():
+         if not subdir.is_dir():
+             continue
+         summary_path = subdir / "summary.json"
+         if not summary_path.exists():
+             continue
+         try:
+             data = json.loads(summary_path.read_text())
+         except Exception as e:
+             print(f"Failed to parse {summary_path}: {e}")
+             continue
+         submission_id = data.get("submission_id", subdir.name)
+         description = data.get("description", "")
+         language = data.get("language", "")
+         issues = data.get("issues", "")
+
+         # ---------------- document-level enrichment (added by llm_enrich.py) ----
+         doc_tp = (data.get("thought_process") or "").strip()
+         doc_cs = (data.get("code_suggestion") or "").strip()
+         doc_diff = data.get("difficult_elements", [])
+         doc_test = (data.get("test_case") or "").strip()
+
+         page_snippets = []
+         features = set()
+         for p in data.get("pages", [])[:5]:  # first 5 pages enough for summary
+             cs = (p.get("code_suggestion") or "").strip()
+             tp = (p.get("thought_process") or "").strip()
+             if not cs and not tp:
+                 continue
+             page_snippets.append({
+                 "page": p.get("page_number"),
+                 "code": cs,
+                 "thought": tp,
+             })
+             # --- lightweight feature tagging --------------------------------
+             gt = (p.get("goal_tag") or "").lower()
+             if "table" in gt:
+                 features.add("table")
+             if "text" in gt:
+                 features.add("text")
+             # look into region labels for structural hints
+             for reg in p.get("layout_tatr_regions", []) + p.get("layout_yolo_regions", []):
+                 label = (reg.get("label") or reg.get("type") or "").lower()
+                 if label == "table":
+                     features.add("table")
+                 if label in {"figure", "isolate_formula"}:
+                     features.add("figure")
+             # parse difficulties hints in thought_process
+             difficulties = tp.lower()
+             if "scanned_image" in difficulties:
+                 features.add("scanned_image")
+             if "tiny_font" in difficulties or "small font" in difficulties:
+                 features.add("small_font")
+         # language-based feature
+         if language and language.lower() not in {"english", "en", "en-us"}:
+             features.add("non_english")
+
+         rows.append({
+             "id": submission_id,
+             "language": language,
+             "issues": issues,
+             "description": description,
+             "doc_thought": doc_tp,
+             "doc_code": doc_cs,
+             "doc_difficult": doc_diff,
+             "doc_test": doc_test,
+             "snippets": page_snippets,
+             "features": sorted(features),
+         })
+     return rows
+
+
+ def export_markdown(rows: List[dict]):
+     lines = ["# Evaluation Summaries\n"]
+     for r in sorted(rows, key=lambda x: x["id"]):
+         lines.append(f"## {r['id']}")
+         if r["description"]:
+             lines.append(f"*Description*: {r['description']}")
+         if r["issues"]:
+             lines.append(f"*Issues*: {r['issues']}")
+         if r["language"]:
+             lines.append(f"*Language*: {r['language']}")
+         if r.get("features"):
+             lines.append(f"*Features*: {', '.join(r['features'])}")
+
+         # ---- document-level enrichment -----------------------------------
+         if r.get("doc_thought") or r.get("doc_code"):
+             lines.append("\n### Document-level enrichment")
+             if r.get("doc_thought"):
+                 lines.append("**Doc thought process:**")
+                 lines.append(f"```")
+                 lines.append(r["doc_thought"])
+                 lines.append(f"```")
+             if r.get("doc_code"):
+                 lines.append("**Doc code suggestion:**")
+                 lines.append(f"```python")
+                 lines.append(r["doc_code"])
+                 lines.append(f"```")
+             if r.get("doc_difficult"):
+                 lines.append("*Difficult elements*: " + ", ".join(r["doc_difficult"]))
+             if r.get("doc_test"):
+                 lines.append("*Suggested test*: " + r["doc_test"])
+
+         lines.append("")
+         for s in r["snippets"]:
+             lines.append(f"### Page {s['page']}")
+             if s["thought"]:
+                 lines.append("**Thoughts**:")
+                 lines.append(f"```\n{s['thought']}\n```")
+             if s["code"]:
+                 lines.append("**Code suggestion**:")
+                 lines.append(f"```python\n{s['code']}\n```")
+             lines.append("")
+         lines.append("\n---\n")
+     Path("eval_results/collated_summary.md").write_text("\n".join(lines))
+
+
+ if __name__ == "__main__":
+     rows = collect()
+     export_markdown(rows)
+     print(f"Wrote {len(rows)} summaries to eval_results/collated_summary.md")
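
The two new bad_pdf_eval modules above are designed to work together: analyser.py writes one eval_results/<submission id>/summary.json per PDF, and collate_summaries.py rolls those summaries into a single Markdown report. Below is a minimal, illustrative sketch of how they might be driven, assuming a repository checkout run from the repo root; the BadPDFAnalyzer constructor, extract_page_hints(), collect() and export_markdown() come from the code in this diff, while the submission values and the PDF path are invented placeholders.

from pathlib import Path

from tools.bad_pdf_eval.analyser import BadPDFAnalyzer, extract_page_hints
from tools.bad_pdf_eval import collate_summaries

# Submission metadata keyed by the CSV column names that analyser.py reads;
# the concrete values here are illustrative only.
submission = {
    "Submission ID": "example-001",
    "What are we trying to get out of the PDF?": "The table on page 2",
    "What language(s) or script is the content in?": "English",
    "What do you think makes this PDF bad?": "Scanned pages, tiny fonts",
    "What is the PDF, and/or where did it come from?": "Sample agency report",
}

pdf_path = Path("pdfs/01-practice.pdf")  # assumed local test PDF
pages = extract_page_hints(submission["What are we trying to get out of the PDF?"]) or [1]

analyzer = BadPDFAnalyzer(
    pdf_path=pdf_path,
    output_dir=Path("eval_results") / submission["Submission ID"],
    submission_meta=submission,
    pages=pages,
    resolution=216,
)
summary = analyzer.run()  # renders page_NNNN.jpg images and writes summary.json
print(summary["pages"][0].get("suggested_approach", ""))

# Collate every eval_results/<id>/summary.json into eval_results/collated_summary.md
rows = collate_summaries.collect()
collate_summaries.export_markdown(rows)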