mirage-benchmark 1.0.4 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mirage-benchmark might be problematic.
- mirage/__init__.py +83 -0
- mirage/cli.py +150 -0
- mirage/core/__init__.py +52 -0
- mirage/core/config.py +248 -0
- mirage/core/llm.py +1745 -0
- mirage/core/prompts.py +884 -0
- mirage/embeddings/__init__.py +31 -0
- mirage/embeddings/models.py +512 -0
- mirage/embeddings/rerankers_multimodal.py +766 -0
- mirage/embeddings/rerankers_text.py +149 -0
- mirage/evaluation/__init__.py +26 -0
- mirage/evaluation/metrics.py +2223 -0
- mirage/evaluation/metrics_optimized.py +2172 -0
- mirage/pipeline/__init__.py +45 -0
- mirage/pipeline/chunker.py +545 -0
- mirage/pipeline/context.py +1003 -0
- mirage/pipeline/deduplication.py +491 -0
- mirage/pipeline/domain.py +514 -0
- mirage/pipeline/pdf_processor.py +598 -0
- mirage/pipeline/qa_generator.py +798 -0
- mirage/utils/__init__.py +31 -0
- mirage/utils/ablation.py +360 -0
- mirage/utils/preflight.py +663 -0
- mirage/utils/stats.py +626 -0
- mirage_benchmark-1.0.4.dist-info/METADATA +490 -0
- mirage_benchmark-1.0.4.dist-info/RECORD +30 -0
- mirage_benchmark-1.0.4.dist-info/WHEEL +5 -0
- mirage_benchmark-1.0.4.dist-info/entry_points.txt +3 -0
- mirage_benchmark-1.0.4.dist-info/licenses/LICENSE +190 -0
- mirage_benchmark-1.0.4.dist-info/top_level.txt +1 -0
mirage/utils/stats.py
ADDED
@@ -0,0 +1,626 @@
#!/usr/bin/env python3
"""
Dataset Statistics Calculator
Measures pages, images, and tokens for each dataset using chunks.json files.
"""

import os
import re
import zipfile
import io
import json
from pathlib import Path
from collections import defaultdict
from typing import Dict, List, Any, Optional

import pypdfium2 as pdfium
from tqdm import tqdm


def get_tokenizer():
    """Get tokenizer - uses GPT2 if available, otherwise word-based estimate."""
    try:
        from transformers import GPT2TokenizerFast
        return GPT2TokenizerFast.from_pretrained("gpt2")
    except Exception:
        return None


def count_tokens(text: str, tokenizer=None) -> int:
    """Count tokens - uses tokenizer if available, otherwise word-based estimate."""
    if not text:
        return 0

    if tokenizer:
        try:
            return len(tokenizer.encode(text))
        except Exception:
            pass

    # Fallback: approximate tokens as ~1.3 * words (GPT average)
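    # e.g. "retrieval augmented generation systems" -> 4 words -> int(4 * 1.3) = 5 tokens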
    words = len(text.split())
    return int(words * 1.3)


def count_pages_from_zip(zip_path: Path) -> Dict[str, int]:
    """Count pages per document from zip file (PDFs or HTML files)."""
    page_counts = {}
    try:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            for name in zf.namelist():
                stem = Path(name).stem
                lower_name = name.lower()

                if lower_name.endswith('.pdf'):
                    try:
                        pdf_bytes = zf.read(name)
                        pdf = pdfium.PdfDocument(pdf_bytes)
                        page_counts[stem] = len(pdf)
                    except Exception:
                        page_counts[stem] = 0
                elif lower_name.endswith(('.html', '.htm')):
                    # HTML files count as 1 "page"
                    page_counts[stem] = 1
    except Exception as e:
        print(f"Error reading zip {zip_path}: {e}")
    return page_counts


def get_file_size_mb_from_zip(zip_path: Path) -> Dict[str, float]:
    """Get file sizes per document from zip file."""
    sizes = {}
    try:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            for info in zf.infolist():
                lower = info.filename.lower()
                if lower.endswith(('.pdf', '.html', '.htm')):
                    stem = Path(info.filename).stem
                    sizes[stem] = info.file_size / (1024 * 1024)
    except Exception:
        pass
    return sizes


def get_file_type_from_zip(zip_path: Path) -> str:
    """Detect primary file type in zip."""
    try:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            names = zf.namelist()
            pdfs = sum(1 for n in names if n.lower().endswith('.pdf'))
            htmls = sum(1 for n in names if n.lower().endswith(('.html', '.htm')))
            if pdfs > htmls:
                return "pdf"
            elif htmls > 0:
                return "html"
    except Exception:
        pass
    return "unknown"


def analyze_chunks_json(chunks_path: Path, tokenizer=None) -> Dict[str, Any]:
    """Analyze a chunks.json file to extract stats per PDF."""
    pdf_stats = defaultdict(lambda: {
        "chunks": 0,
        "text_chunks": 0,
        "image_chunks": 0,
        "table_chunks": 0,
        "tokens": 0,
        "chars": 0
    })

    try:
        with open(chunks_path, 'r') as f:
            chunks = json.load(f)

        for chunk in chunks:
            file_name = chunk.get("file_name", "unknown")
            chunk_type = chunk.get("chunk_type", "").lower()
            content = chunk.get("content", "")
            artifact = chunk.get("artifact", "None")

            stats = pdf_stats[file_name]
            stats["chunks"] += 1
            stats["chars"] += len(content)
            stats["tokens"] += count_tokens(content, tokenizer)

            # Classify chunk type
            if "image" in chunk_type or (artifact and artifact != "None"):
                stats["image_chunks"] += 1
            elif "table" in chunk_type:
                stats["table_chunks"] += 1
            else:
                stats["text_chunks"] += 1

    except Exception as e:
        print(f"Error reading {chunks_path}: {e}")

    return dict(pdf_stats)


def analyze_dataset(dataset_name: str, chunks_path: Optional[Path], zip_path: Optional[Path], tokenizer=None) -> Dict[str, Any]:
    """Analyze a single dataset using chunks.json and/or zip file."""
    stats = {
        "dataset_name": dataset_name,
        "source": "chunks.json" if chunks_path else "zip",
        "num_pdfs": 0,
        "total_pages": 0,
        "total_chunks": 0,
        "total_text_chunks": 0,
        "total_image_chunks": 0,
        "total_table_chunks": 0,
        "total_tokens": 0,
        "total_chars": 0,
        "total_size_mb": 0.0,
        "pdfs": []
    }

    # Get page counts and file sizes from zip
    page_counts = {}
    file_sizes = {}
    file_type = "unknown"
    if zip_path and zip_path.exists():
        page_counts = count_pages_from_zip(zip_path)
        file_sizes = get_file_size_mb_from_zip(zip_path)
        file_type = get_file_type_from_zip(zip_path)
        stats["total_size_mb"] = sum(file_sizes.values())
    stats["file_type"] = file_type

    # Get chunk stats from chunks.json
    chunk_stats = {}
    if chunks_path and chunks_path.exists():
        chunk_stats = analyze_chunks_json(chunks_path, tokenizer)

    # Merge stats - use chunk_stats keys as primary if available
    all_pdfs = set(chunk_stats.keys()) | set(page_counts.keys())
    stats["num_pdfs"] = len(all_pdfs)

    for pdf_name in sorted(all_pdfs):
        pdf_info = {
            "filename": pdf_name,
            "pages": page_counts.get(pdf_name, 0),
            "size_mb": round(file_sizes.get(pdf_name, 0), 2),
            "chunks": chunk_stats.get(pdf_name, {}).get("chunks", 0),
            "text_chunks": chunk_stats.get(pdf_name, {}).get("text_chunks", 0),
            "image_chunks": chunk_stats.get(pdf_name, {}).get("image_chunks", 0),
            "table_chunks": chunk_stats.get(pdf_name, {}).get("table_chunks", 0),
            "tokens": chunk_stats.get(pdf_name, {}).get("tokens", 0),
            "chars": chunk_stats.get(pdf_name, {}).get("chars", 0),
        }
        stats["pdfs"].append(pdf_info)

        stats["total_pages"] += pdf_info["pages"]
        stats["total_chunks"] += pdf_info["chunks"]
        stats["total_text_chunks"] += pdf_info["text_chunks"]
        stats["total_image_chunks"] += pdf_info["image_chunks"]
        stats["total_table_chunks"] += pdf_info["table_chunks"]
        stats["total_tokens"] += pdf_info["tokens"]
        stats["total_chars"] += pdf_info["chars"]

    stats["total_size_mb"] = round(stats["total_size_mb"], 2)

    # Compute averages
    if stats["num_pdfs"] > 0:
        stats["avg_pages_per_pdf"] = round(stats["total_pages"] / stats["num_pdfs"], 1)
        stats["avg_chunks_per_pdf"] = round(stats["total_chunks"] / stats["num_pdfs"], 1)
        stats["avg_images_per_pdf"] = round(stats["total_image_chunks"] / stats["num_pdfs"], 1)
        stats["avg_tokens_per_pdf"] = int(stats["total_tokens"] / stats["num_pdfs"])
    if stats["total_pages"] > 0:
        stats["avg_tokens_per_page"] = int(stats["total_tokens"] / stats["total_pages"])

    return stats
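# Example (hypothetical paths; a minimal sketch of calling analyze_dataset directly,
# mirroring how main() below wires it up):
#   stats = analyze_dataset("my_dataset",
#                           chunks_path=Path("trials/results/my_dataset/chunks.json"),
#                           zip_path=Path("data/my_dataset.zip"),
#                           tokenizer=get_tokenizer())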


def print_summary(all_stats: List[Dict]) -> None:
    """Print formatted summary of all datasets."""
    print("\n" + "=" * 110)
    print("DATASET STATISTICS SUMMARY (from chunks.json)")
    print("=" * 110)

    print(f"\n{'Dataset':<28} {'Docs':>5} {'Pages':>6} {'Chunks':>7} {'Images':>7} {'Tables':>7} {'Tokens':>12} {'Size(MB)':>10}")
    print("-" * 110)

    totals = defaultdict(int)
    totals["total_size_mb"] = 0.0

    for stats in all_stats:
        print(f"{stats['dataset_name']:<28} {stats['num_pdfs']:>5} {stats['total_pages']:>6} "
              f"{stats['total_chunks']:>7} {stats['total_image_chunks']:>7} {stats['total_table_chunks']:>7} "
              f"{stats['total_tokens']:>12,} {stats['total_size_mb']:>10.1f}")

        totals["num_pdfs"] += stats["num_pdfs"]
        totals["total_pages"] += stats["total_pages"]
        totals["total_chunks"] += stats["total_chunks"]
        totals["total_image_chunks"] += stats["total_image_chunks"]
        totals["total_table_chunks"] += stats["total_table_chunks"]
        totals["total_tokens"] += stats["total_tokens"]
        totals["total_size_mb"] += stats["total_size_mb"]

    print("-" * 110)
    print(f"{'TOTAL':<28} {totals['num_pdfs']:>5} {totals['total_pages']:>6} "
          f"{totals['total_chunks']:>7} {totals['total_image_chunks']:>7} {totals['total_table_chunks']:>7} "
          f"{totals['total_tokens']:>12,} {totals['total_size_mb']:>10.1f}")
    print("=" * 110)

    # Print averages
    print("\nAVERAGES PER DATASET:")
    print(f"{'Dataset':<28} {'Pg/Doc':>8} {'Chunks/Doc':>10} {'Img/Doc':>8} {'Tok/Doc':>12} {'Tok/Page':>10}")
    print("-" * 80)
    for stats in all_stats:
        print(f"{stats['dataset_name']:<28} {stats.get('avg_pages_per_pdf', 0):>8.1f} "
              f"{stats.get('avg_chunks_per_pdf', 0):>10.1f} {stats.get('avg_images_per_pdf', 0):>8.1f} "
              f"{stats.get('avg_tokens_per_pdf', 0):>12,} {stats.get('avg_tokens_per_page', 0):>10,}")


def find_datasets(data_dir: Path, results_dir: Path) -> List[Dict]:
    """Find all datasets and their corresponding chunks.json files."""
    datasets = []

    # Get all zip files in data directory
    zip_files = {z.stem: z for z in data_dir.glob("*.zip")}

    # Get all chunks.json files in results directory
    chunks_files = {}
    if results_dir.exists():
        for chunks_path in results_dir.glob("*/chunks.json"):
            dataset_name = chunks_path.parent.name
            chunks_files[dataset_name] = chunks_path

    # Merge: prefer chunks.json when available
    all_datasets = set(zip_files.keys()) | set(chunks_files.keys())

    for name in sorted(all_datasets):
        datasets.append({
            "name": name,
            "zip_path": zip_files.get(name),
            "chunks_path": chunks_files.get(name)
        })

    return datasets
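# Directory layout implied by the globs above:
#   <data_dir>/<dataset>.zip               (raw source documents)
#   <results_dir>/<dataset>/chunks.json    (processed chunk output)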


def compute_dataset_stats(
    output_dir: str,
    pdf_dir: str = None,
    chunks_file: str = None,
    tokenizer=None
) -> Dict[str, Any]:
    """
    Compute comprehensive dataset statistics from a trial results directory.

    Args:
        output_dir: Path to the output directory (e.g., output/results/my_dataset)
        pdf_dir: Path to source PDFs directory (for page counts)
        chunks_file: Path to chunks.json (defaults to output_dir/chunks.json)
        tokenizer: Optional tokenizer for accurate token counting

    Returns:
        Dict with keys: total_images, total_tables, total_pages, total_tokens,
        num_pdfs, per_pdf_stats, etc.
    """
    output_path = Path(output_dir)

    # Default chunks file path
    if chunks_file is None:
        chunks_file = output_path / "chunks.json"
    else:
        chunks_file = Path(chunks_file)

    stats = {
        "num_pdfs": 0,
        "total_images": 0,
        "total_tables": 0,
        "total_pages": 0,
        "total_tokens": 0,
        "total_chars": 0,
        "total_chunks": 0,
        "per_pdf_stats": []
    }

    # Get PDF names from chunks.json
    pdf_names = set()
    chunks_data = []
    if chunks_file.exists():
        try:
            with open(chunks_file, 'r') as f:
                chunks_data = json.load(f)
            for chunk in chunks_data:
                file_name = chunk.get("file_name", "")
                if file_name:
                    pdf_names.add(file_name)
            stats["total_chunks"] = len(chunks_data)
        except Exception as e:
            print(f"Warning: Could not read chunks.json: {e}")

    stats["num_pdfs"] = len(pdf_names)

    # Count images and tables from markdown directory structure
    markdown_dir = output_path / "markdown"
    all_image_files = set()
    all_table_files = set()
    per_pdf_images = defaultdict(set)
    per_pdf_tables = defaultdict(set)

    if markdown_dir.exists():
        for subdir in markdown_dir.iterdir():
            if subdir.is_dir():
                pdf_name = subdir.name

                # Count images from ref_artifacts/
                artifact_dir = subdir / "ref_artifacts"
                if artifact_dir.exists():
                    for img_file in artifact_dir.glob("image_*"):
                        if img_file.suffix.lower() in ['.png', '.jpg', '.jpeg', '.gif', '.webp']:
                            all_image_files.add(img_file.name)
                            per_pdf_images[pdf_name].add(img_file.name)

                # Count tables from tables/
                table_dir = subdir / "tables"
                if table_dir.exists():
                    for table_file in table_dir.glob("*"):
                        if table_file.suffix.lower() in ['.png', '.jpg', '.jpeg']:
                            all_table_files.add(table_file.name)
                            per_pdf_tables[pdf_name].add(table_file.name)

    stats["total_images"] = len(all_image_files)
    stats["total_tables"] = len(all_table_files)

    # Get page counts from source PDFs
    page_counts = {}
    if pdf_dir:
        pdf_path = Path(pdf_dir)
        if pdf_path.exists():
            for pdf_file in pdf_path.glob("*.pdf"):
                pdf_stem = pdf_file.stem
                # Match PDF names to chunk file names
                for chunk_name in pdf_names:
                    if chunk_name.lower() in pdf_stem.lower() or pdf_stem.lower() in chunk_name.lower():
                        try:
                            pdf = pdfium.PdfDocument(pdf_file)
                            page_counts[chunk_name] = len(pdf)
                        except Exception:
                            page_counts[chunk_name] = 0
                        break

    stats["total_pages"] = sum(page_counts.values())

    # Count tokens from chunks
    total_tokens = 0
    total_chars = 0
    per_pdf_tokens = defaultdict(int)
    per_pdf_chars = defaultdict(int)

    for chunk in chunks_data:
        content = chunk.get("content", "")
        file_name = chunk.get("file_name", "unknown")

        chars = len(content)
        tokens = count_tokens(content, tokenizer)

        total_chars += chars
        total_tokens += tokens
        per_pdf_chars[file_name] += chars
        per_pdf_tokens[file_name] += tokens

    stats["total_tokens"] = total_tokens
    stats["total_chars"] = total_chars

    # Build per-PDF stats
    for pdf_name in sorted(pdf_names):
        pdf_stats = {
            "filename": pdf_name,
            "pages": page_counts.get(pdf_name, 0),
            "images": len(per_pdf_images.get(pdf_name, set())),
            "tables": len(per_pdf_tables.get(pdf_name, set())),
            "tokens": per_pdf_tokens.get(pdf_name, 0),
            "chars": per_pdf_chars.get(pdf_name, 0)
        }
        stats["per_pdf_stats"].append(pdf_stats)

    # Compute averages
    if stats["num_pdfs"] > 0:
        stats["avg_pages_per_pdf"] = round(stats["total_pages"] / stats["num_pdfs"], 1)
        stats["avg_images_per_pdf"] = round(stats["total_images"] / stats["num_pdfs"], 1)
        stats["avg_tables_per_pdf"] = round(stats["total_tables"] / stats["num_pdfs"], 1)
        stats["avg_tokens_per_pdf"] = int(stats["total_tokens"] / stats["num_pdfs"])

    if stats["total_pages"] > 0:
        stats["avg_tokens_per_page"] = int(stats["total_tokens"] / stats["total_pages"])

    return stats
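# Example (hypothetical directories; a minimal sketch of using this module as a library):
#   ds_stats = compute_dataset_stats("output/results/my_dataset", pdf_dir="data/pdfs")
#   print_dataset_stats(ds_stats)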


def print_dataset_stats(stats: Dict[str, Any]) -> None:
    """Print formatted dataset statistics."""
    print("\n" + "=" * 60)
    print("DATASET STATISTICS")
    print("=" * 60)
    print(f"\nImages: {stats['total_images']}")
    print(f"Tables: {stats['total_tables']}")
    print(f"Pages: {stats['total_pages']}")
    print(f"Tokens: {stats['total_tokens']:,}")
    print(f"PDFs: {stats['num_pdfs']}")
    print(f"Chunks: {stats['total_chunks']}")
    print("=" * 60)


def compute_qa_category_stats(qa_data: List[Dict]) -> Dict[str, Any]:
    """
    Compute QA category statistics including multihop/multimodal intersection.

    Args:
        qa_data: List of QA pairs from deduplicated dataset

    Returns:
        Dict with category counts and percentages
    """
    total = len(qa_data)
    if total == 0:
        return {
            'total_qa_pairs': 0,
            'multihop_count': 0,
            'multimodal_count': 0,
            'multihop_multimodal_count': 0,
            'multihop_only_count': 0,
            'multimodal_only_count': 0,
            'text_only_count': 0,
            'avg_difficulty': 0.0,
            'avg_relevance': 0.0
        }

    multihop = 0
    multimodal = 0
    both = 0

    # Track difficulty and relevance scores
    difficulty_scores = []
    relevance_scores = []

    for qa in qa_data:
        # Multihop: hop_count > 1 or multiple chunks added
        hop_count = qa.get('hop_count', 0)
        chunks_added = qa.get('chunks_added', [])
        is_multihop = hop_count > 1 or (isinstance(chunks_added, list) and len(chunks_added) > 1)

        # Multimodal: has image_path in context chunks OR markdown images in content
        # Match logic from metrics_optimized.py for consistency
        context_chunks = qa.get('context_chunks', [])
        is_multimodal = False
        for chunk in context_chunks:
            if isinstance(chunk, dict):
                # Check image_path field
                image_path = chunk.get('image_path')
                if image_path and image_path not in ('None', 'null', None) and str(image_path).strip():
                    is_multimodal = True
                    break
                # Check content for markdown image references (e.g., ![alt](image.png))
                content = chunk.get('content', '')
                if content and re.search(r'!\[[^\]]*\]\([^)]+\)', content):
                    is_multimodal = True
                    break

        if is_multihop:
            multihop += 1
        if is_multimodal:
            multimodal += 1
        if is_multihop and is_multimodal:
            both += 1

        # Extract difficulty and relevance scores
        try:
            diff = qa.get('difficulty_score', qa.get('difficulty', 0))
            if diff is not None and str(diff).strip():
                difficulty_scores.append(float(diff))
        except (ValueError, TypeError):
            pass
        try:
            rel = qa.get('relevance_score', qa.get('relevance', 0))
            if rel is not None and str(rel).strip():
                relevance_scores.append(float(rel))
        except (ValueError, TypeError):
            pass

    # Exclusive counts
    multihop_only = multihop - both
    multimodal_only = multimodal - both
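    # Inclusion-exclusion: questions that are neither multihop nor multimodal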
    text_only = total - multihop - multimodal + both

    # Compute averages
    avg_difficulty = round(sum(difficulty_scores) / len(difficulty_scores), 2) if difficulty_scores else 0.0
    avg_relevance = round(sum(relevance_scores) / len(relevance_scores), 2) if relevance_scores else 0.0

    return {
        'total_qa_pairs': total,
        'multihop_count': multihop,
        'multimodal_count': multimodal,
        'multihop_multimodal_count': both,
        'multihop_only_count': multihop_only,
        'multimodal_only_count': multimodal_only,
        'text_only_count': text_only,
        'multihop_pct': round(100 * multihop / total, 1),
        'multimodal_pct': round(100 * multimodal / total, 1),
        'multihop_multimodal_pct': round(100 * both / total, 1),
        'multihop_only_pct': round(100 * multihop_only / total, 1),
        'multimodal_only_pct': round(100 * multimodal_only / total, 1),
        'text_only_pct': round(100 * text_only / total, 1),
        'avg_difficulty': avg_difficulty,
        'avg_relevance': avg_relevance
    }
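# Example (assuming qa_data is a list of QA dicts loaded from a deduplicated JSON file):
#   qa_stats = compute_qa_category_stats(qa_data)
#   print_qa_category_stats(qa_stats)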


def print_qa_category_stats(stats: Dict[str, Any]) -> None:
    """Print formatted QA category statistics."""
    total = stats.get('total_qa_pairs', 0)
    if total == 0:
        print("\n⚠️ No QA pairs to analyze")
        return

    print("\n" + "=" * 60)
    print("QA CATEGORY BREAKDOWN")
    print("=" * 60)
    print(f"\nTotal QA Pairs: {total}")
    print()
    print("Category Counts:")
    print(f" Multihop: {stats['multihop_count']:>3} ({stats['multihop_pct']:>5.1f}%)")
    print(f" Multimodal: {stats['multimodal_count']:>3} ({stats['multimodal_pct']:>5.1f}%)")
    print(f" Both (Multihop ∩ Multimodal): {stats['multihop_multimodal_count']:>3} ({stats['multihop_multimodal_pct']:>5.1f}%)")
    print()
    print("Exclusive Breakdown:")
    print(f" Multihop only (text): {stats['multihop_only_count']:>3} ({stats['multihop_only_pct']:>5.1f}%)")
    print(f" Multimodal only (single-hop): {stats['multimodal_only_count']:>3} ({stats['multimodal_only_pct']:>5.1f}%)")
    print(f" Both (multihop + multimodal): {stats['multihop_multimodal_count']:>3} ({stats['multihop_multimodal_pct']:>5.1f}%)")
    print(f" Neither (single-hop, text): {stats['text_only_count']:>3} ({stats['text_only_pct']:>5.1f}%)")
    print()
    print("Quality Scores (0-10 scale):")
    print(f" Avg Difficulty: {stats.get('avg_difficulty', 0.0):>5.2f}")
    print(f" Avg Relevance: {stats.get('avg_relevance', 0.0):>5.2f}")
    print("=" * 60)


def main():
    """Main entry point."""
    data_dir = Path(__file__).parent
    project_root = data_dir.parent
    results_dir = project_root / "trials" / "results"

    datasets = find_datasets(data_dir, results_dir)

    if not datasets:
        print("No datasets found.")
        return

    print(f"Found {len(datasets)} datasets to analyze...")

    # Show which have chunks.json
    with_chunks = sum(1 for d in datasets if d["chunks_path"])
    print(f" - {with_chunks} with chunks.json (processed)")
    print(f" - {len(datasets) - with_chunks} zip-only (not yet processed)")

    # Load tokenizer once
    tokenizer = get_tokenizer()
    if tokenizer:
        print("Using GPT2 tokenizer for token counting")
    else:
        print("Using word-based token estimation")

    all_stats = []
    for ds in tqdm(datasets, desc="Analyzing datasets"):
        stats = analyze_dataset(
            ds["name"],
            ds["chunks_path"],
            ds["zip_path"],
            tokenizer
        )
        all_stats.append(stats)

    # Print summary
    print_summary(all_stats)

    # Save detailed results
    output_file = data_dir / "dataset_stats.json"
    with open(output_file, 'w') as f:
        json.dump(all_stats, f, indent=2)
    print(f"\nDetailed stats saved to: {output_file}")


if __name__ == "__main__":
    main()