mirage-benchmark 1.0.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mirage-benchmark might be problematic. Click here for more details.

mirage/utils/stats.py ADDED
@@ -0,0 +1,626 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Dataset Statistics Calculator
4
+ Measures pages, images, and tokens for each dataset using chunks.json files.
5
+ """
6
+
7
+ import os
8
+ import re
9
+ import zipfile
10
+ import io
11
+ import json
12
+ from pathlib import Path
13
+ from collections import defaultdict
14
+ from typing import Dict, List, Any, Optional
15
+
16
+ import pypdfium2 as pdfium
17
+ from tqdm import tqdm
18
+
19
+
20
def get_tokenizer():
    """Return a GPT2 fast tokenizer, or ``None`` when one cannot be loaded.

    Any failure (transformers not installed, model files unavailable) falls
    back to ``None`` so callers use the word-based token estimate instead.
    """
    try:
        from transformers import GPT2TokenizerFast
    except Exception:
        return None
    try:
        return GPT2TokenizerFast.from_pretrained("gpt2")
    except Exception:
        return None
27
+
28
+
29
def count_tokens(text: str, tokenizer=None) -> int:
    """Count tokens in *text*.

    Uses the supplied tokenizer when it works; otherwise estimates
    ~1.3 tokens per whitespace-separated word (rough GPT average).
    Empty or falsy text counts as zero tokens.
    """
    if not text:
        return 0

    if tokenizer:
        try:
            return len(tokenizer.encode(text))
        except Exception:
            # Tokenizer failed on this text; fall through to the estimate.
            pass

    return int(len(text.split()) * 1.3)
43
+
44
+
45
def count_pages_from_zip(zip_path: Path) -> Dict[str, int]:
    """Count pages per document inside a zip archive.

    PDFs contribute their real page count (0 when unreadable); HTML files
    count as a single "page".  Keys are filename stems, so entries with the
    same stem overwrite each other.

    Args:
        zip_path: Path to the zip archive.

    Returns:
        Mapping of filename stem -> page count.
    """
    page_counts = {}
    try:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            for name in zf.namelist():
                stem = Path(name).stem
                lower_name = name.lower()

                if lower_name.endswith('.pdf'):
                    try:
                        pdf = pdfium.PdfDocument(zf.read(name))
                        try:
                            page_counts[stem] = len(pdf)
                        finally:
                            # Fix: release pdfium's native handles; the
                            # document was previously never closed.
                            pdf.close()
                    except Exception:
                        page_counts[stem] = 0
                elif lower_name.endswith(('.html', '.htm')):
                    # HTML files count as 1 "page"
                    page_counts[stem] = 1
    except Exception as e:
        print(f"Error reading zip {zip_path}: {e}")
    return page_counts
67
+
68
+
69
def get_file_size_mb_from_zip(zip_path: Path) -> Dict[str, float]:
    """Map document stems to their uncompressed size in megabytes.

    Only PDF/HTML entries are considered; any archive error yields an
    empty mapping.
    """
    sizes: Dict[str, float] = {}
    try:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            for entry in zf.infolist():
                if entry.filename.lower().endswith(('.pdf', '.html', '.htm')):
                    stem = Path(entry.filename).stem
                    sizes[stem] = entry.file_size / (1024 * 1024)
    except Exception:
        pass
    return sizes
82
+
83
+
84
def get_file_type_from_zip(zip_path: Path) -> str:
    """Classify the archive's dominant content as 'pdf', 'html', or 'unknown'.

    PDF wins on a strict majority over HTML; otherwise any HTML entry makes
    the archive 'html'.  Errors and empty archives yield 'unknown'.
    """
    try:
        with zipfile.ZipFile(zip_path, 'r') as zf:
            lowered = [n.lower() for n in zf.namelist()]
            pdf_count = sum(n.endswith('.pdf') for n in lowered)
            html_count = sum(n.endswith(('.html', '.htm')) for n in lowered)
            if pdf_count > html_count:
                return "pdf"
            if html_count > 0:
                return "html"
    except Exception:
        pass
    return "unknown"
98
+
99
+
100
def analyze_chunks_json(chunks_path: Path, tokenizer=None) -> Dict[str, Any]:
    """Aggregate per-document chunk statistics from a chunks.json file.

    Returns a plain dict keyed by source file name, each value holding
    chunk/text/image/table counts plus token and character totals.
    Read or parse errors are reported and yield whatever was accumulated.
    """
    per_doc = defaultdict(lambda: {
        "chunks": 0,
        "text_chunks": 0,
        "image_chunks": 0,
        "table_chunks": 0,
        "tokens": 0,
        "chars": 0
    })

    try:
        with open(chunks_path, 'r') as fh:
            records = json.load(fh)

        for record in records:
            doc = record.get("file_name", "unknown")
            kind = record.get("chunk_type", "").lower()
            text = record.get("content", "")
            artifact = record.get("artifact", "None")

            entry = per_doc[doc]
            entry["chunks"] += 1
            entry["chars"] += len(text)
            entry["tokens"] += count_tokens(text, tokenizer)

            # Image classification wins; an attached artifact also marks an
            # image chunk, then tables, then plain text.
            if "image" in kind or (artifact and artifact != "None"):
                entry["image_chunks"] += 1
            elif "table" in kind:
                entry["table_chunks"] += 1
            else:
                entry["text_chunks"] += 1

    except Exception as e:
        print(f"Error reading {chunks_path}: {e}")

    return dict(per_doc)
138
+
139
+
140
def analyze_dataset(dataset_name: str, chunks_path: Optional[Path], zip_path: Optional[Path], tokenizer=None) -> Dict[str, Any]:
    """Build combined statistics for one dataset from its chunks.json and zip.

    Page counts, per-file sizes, and the dominant file type come from the zip
    archive (when present); chunk/token statistics come from chunks.json
    (when present).  Per-document rows are merged over the union of both
    sources, and dataset-level totals and averages are derived from them.
    """
    result: Dict[str, Any] = {
        "dataset_name": dataset_name,
        "source": "chunks.json" if chunks_path else "zip",
        "num_pdfs": 0,
        "total_pages": 0,
        "total_chunks": 0,
        "total_text_chunks": 0,
        "total_image_chunks": 0,
        "total_table_chunks": 0,
        "total_tokens": 0,
        "total_chars": 0,
        "total_size_mb": 0.0,
        "pdfs": []
    }

    # Archive-derived data (pages, sizes, dominant type).
    page_counts: Dict[str, int] = {}
    file_sizes: Dict[str, float] = {}
    if zip_path and zip_path.exists():
        page_counts = count_pages_from_zip(zip_path)
        file_sizes = get_file_size_mb_from_zip(zip_path)
        result["total_size_mb"] = sum(file_sizes.values())
        result["file_type"] = get_file_type_from_zip(zip_path)

    # Chunk-derived data (counts, tokens, chars).
    chunk_stats: Dict[str, Any] = {}
    if chunks_path and chunks_path.exists():
        chunk_stats = analyze_chunks_json(chunks_path, tokenizer)

    documents = sorted(set(chunk_stats) | set(page_counts))
    result["num_pdfs"] = len(documents)

    accumulated_fields = ("chunks", "text_chunks", "image_chunks",
                          "table_chunks", "tokens", "chars")

    for doc in documents:
        doc_chunks = chunk_stats.get(doc, {})
        row = {
            "filename": doc,
            "pages": page_counts.get(doc, 0),
            "size_mb": round(file_sizes.get(doc, 0), 2),
        }
        for field in accumulated_fields:
            row[field] = doc_chunks.get(field, 0)
        result["pdfs"].append(row)

        result["total_pages"] += row["pages"]
        for field in accumulated_fields:
            result["total_" + field] += row[field]

    result["total_size_mb"] = round(result["total_size_mb"], 2)

    # Derived averages are only meaningful with at least one document.
    if result["num_pdfs"] > 0:
        count = result["num_pdfs"]
        result["avg_pages_per_pdf"] = round(result["total_pages"] / count, 1)
        result["avg_chunks_per_pdf"] = round(result["total_chunks"] / count, 1)
        result["avg_images_per_pdf"] = round(result["total_image_chunks"] / count, 1)
        result["avg_tokens_per_pdf"] = int(result["total_tokens"] / count)
        if result["total_pages"] > 0:
            result["avg_tokens_per_page"] = int(result["total_tokens"] / result["total_pages"])

    return result
211
+
212
+
213
def print_summary(all_stats: List[Dict]) -> None:
    """Print a formatted cross-dataset summary table plus per-dataset averages."""
    print("\n" + "=" * 110)
    print("DATASET STATISTICS SUMMARY (from chunks.json)")
    print("=" * 110)

    print(f"\n{'Dataset':<28} {'Docs':>5} {'Pages':>6} {'Chunks':>7} {'Images':>7} {'Tables':>7} {'Tokens':>12} {'Size(MB)':>10}")
    print("-" * 110)

    grand = defaultdict(int)
    grand["total_size_mb"] = 0.0

    summed_keys = ("num_pdfs", "total_pages", "total_chunks", "total_image_chunks",
                   "total_table_chunks", "total_tokens", "total_size_mb")

    for row in all_stats:
        print(f"{row['dataset_name']:<28} {row['num_pdfs']:>5} {row['total_pages']:>6} "
              f"{row['total_chunks']:>7} {row['total_image_chunks']:>7} {row['total_table_chunks']:>7} "
              f"{row['total_tokens']:>12,} {row['total_size_mb']:>10.1f}")

        for key in summed_keys:
            grand[key] += row[key]

    print("-" * 110)
    print(f"{'TOTAL':<28} {grand['num_pdfs']:>5} {grand['total_pages']:>6} "
          f"{grand['total_chunks']:>7} {grand['total_image_chunks']:>7} {grand['total_table_chunks']:>7} "
          f"{grand['total_tokens']:>12,} {grand['total_size_mb']:>10.1f}")
    print("=" * 110)

    # Per-dataset averages (missing keys fall back to 0)
    print("\nAVERAGES PER DATASET:")
    print(f"{'Dataset':<28} {'Pg/Doc':>8} {'Chunks/Doc':>10} {'Img/Doc':>8} {'Tok/Doc':>12} {'Tok/Page':>10}")
    print("-" * 80)
    for row in all_stats:
        print(f"{row['dataset_name']:<28} {row.get('avg_pages_per_pdf', 0):>8.1f} "
              f"{row.get('avg_chunks_per_pdf', 0):>10.1f} {row.get('avg_images_per_pdf', 0):>8.1f} "
              f"{row.get('avg_tokens_per_pdf', 0):>12,} {row.get('avg_tokens_per_page', 0):>10,}")
252
+
253
+
254
def find_datasets(data_dir: Path, results_dir: Path) -> List[Dict]:
    """Discover datasets by pairing zip archives with their chunks.json outputs.

    A dataset appears once per distinct name found either as ``<name>.zip``
    in *data_dir* or as ``<name>/chunks.json`` under *results_dir*; missing
    halves are ``None``.  Results are sorted by name.
    """
    archives = {path.stem: path for path in data_dir.glob("*.zip")}

    processed: Dict[str, Path] = {}
    if results_dir.exists():
        for chunks_json in results_dir.glob("*/chunks.json"):
            processed[chunks_json.parent.name] = chunks_json

    return [
        {
            "name": name,
            "zip_path": archives.get(name),
            "chunks_path": processed.get(name),
        }
        for name in sorted(set(archives) | set(processed))
    ]
279
+
280
+
281
def compute_dataset_stats(
    output_dir: str,
    pdf_dir: Optional[str] = None,
    chunks_file: Optional[str] = None,
    tokenizer=None
) -> Dict[str, Any]:
    """
    Compute comprehensive dataset statistics from a trial results directory.

    Args:
        output_dir: Path to the output directory (e.g., output/results/my_dataset)
        pdf_dir: Path to source PDFs directory (for page counts)
        chunks_file: Path to chunks.json (defaults to output_dir/chunks.json)
        tokenizer: Optional tokenizer for accurate token counting

    Returns:
        Dict with keys: total_images, total_tables, total_pages, total_tokens,
        num_pdfs, per_pdf_stats, etc.
    """
    output_path = Path(output_dir)

    # Default chunks file path
    if chunks_file is None:
        chunks_file = output_path / "chunks.json"
    else:
        chunks_file = Path(chunks_file)

    stats = {
        "num_pdfs": 0,
        "total_images": 0,
        "total_tables": 0,
        "total_pages": 0,
        "total_tokens": 0,
        "total_chars": 0,
        "total_chunks": 0,
        "per_pdf_stats": []
    }

    # Get PDF names from chunks.json
    pdf_names = set()
    chunks_data = []
    if chunks_file.exists():
        try:
            with open(chunks_file, 'r') as f:
                chunks_data = json.load(f)
            for chunk in chunks_data:
                file_name = chunk.get("file_name", "")
                if file_name:
                    pdf_names.add(file_name)
            stats["total_chunks"] = len(chunks_data)
        except Exception as e:
            print(f"Warning: Could not read chunks.json: {e}")

    stats["num_pdfs"] = len(pdf_names)

    # Count images and tables from markdown directory structure.
    # Fix: key the global sets by (document, name) tuples — artifact names
    # like "image_1.png" repeat across documents, so a plain name set
    # previously collapsed them and undercounted the totals.
    markdown_dir = output_path / "markdown"
    all_image_files = set()
    all_table_files = set()
    per_pdf_images = defaultdict(set)
    per_pdf_tables = defaultdict(set)

    if markdown_dir.exists():
        for subdir in markdown_dir.iterdir():
            if subdir.is_dir():
                pdf_name = subdir.name

                # Count images from ref_artifacts/
                artifact_dir = subdir / "ref_artifacts"
                if artifact_dir.exists():
                    for img_file in artifact_dir.glob("image_*"):
                        if img_file.suffix.lower() in ['.png', '.jpg', '.jpeg', '.gif', '.webp']:
                            all_image_files.add((pdf_name, img_file.name))
                            per_pdf_images[pdf_name].add(img_file.name)

                # Count tables from tables/
                table_dir = subdir / "tables"
                if table_dir.exists():
                    for table_file in table_dir.glob("*"):
                        if table_file.suffix.lower() in ['.png', '.jpg', '.jpeg']:
                            all_table_files.add((pdf_name, table_file.name))
                            per_pdf_tables[pdf_name].add(table_file.name)

    stats["total_images"] = len(all_image_files)
    stats["total_tables"] = len(all_table_files)

    # Get page counts from source PDFs, matching chunk file names to PDF
    # stems by bidirectional substring (first match wins per PDF).
    page_counts = {}
    if pdf_dir:
        pdf_path = Path(pdf_dir)
        if pdf_path.exists():
            for pdf_file in pdf_path.glob("*.pdf"):
                pdf_stem = pdf_file.stem
                for chunk_name in pdf_names:
                    if chunk_name.lower() in pdf_stem.lower() or pdf_stem.lower() in chunk_name.lower():
                        try:
                            pdf = pdfium.PdfDocument(pdf_file)
                            try:
                                page_counts[chunk_name] = len(pdf)
                            finally:
                                # Fix: release pdfium's native handles; the
                                # document was previously never closed.
                                pdf.close()
                        except Exception:
                            page_counts[chunk_name] = 0
                        break

    stats["total_pages"] = sum(page_counts.values())

    # Count tokens from chunks
    total_tokens = 0
    total_chars = 0
    per_pdf_tokens = defaultdict(int)
    per_pdf_chars = defaultdict(int)

    for chunk in chunks_data:
        content = chunk.get("content", "")
        file_name = chunk.get("file_name", "unknown")

        chars = len(content)
        tokens = count_tokens(content, tokenizer)

        total_chars += chars
        total_tokens += tokens
        per_pdf_chars[file_name] += chars
        per_pdf_tokens[file_name] += tokens

    stats["total_tokens"] = total_tokens
    stats["total_chars"] = total_chars

    # Build per-PDF stats
    for pdf_name in sorted(pdf_names):
        pdf_stats = {
            "filename": pdf_name,
            "pages": page_counts.get(pdf_name, 0),
            "images": len(per_pdf_images.get(pdf_name, set())),
            "tables": len(per_pdf_tables.get(pdf_name, set())),
            "tokens": per_pdf_tokens.get(pdf_name, 0),
            "chars": per_pdf_chars.get(pdf_name, 0)
        }
        stats["per_pdf_stats"].append(pdf_stats)

    # Compute averages (only when there is at least one document)
    if stats["num_pdfs"] > 0:
        stats["avg_pages_per_pdf"] = round(stats["total_pages"] / stats["num_pdfs"], 1)
        stats["avg_images_per_pdf"] = round(stats["total_images"] / stats["num_pdfs"], 1)
        stats["avg_tables_per_pdf"] = round(stats["total_tables"] / stats["num_pdfs"], 1)
        stats["avg_tokens_per_pdf"] = int(stats["total_tokens"] / stats["num_pdfs"])

    if stats["total_pages"] > 0:
        stats["avg_tokens_per_page"] = int(stats["total_tokens"] / stats["total_pages"])

    return stats
430
+
431
+
432
def print_dataset_stats(stats: Dict[str, Any]) -> None:
    """Print a short formatted overview of dataset-level statistics."""
    bar = "=" * 60
    print("\n" + bar)
    print("DATASET STATISTICS")
    print(bar)
    print(f"\nImages: {stats['total_images']}")
    print(f"Tables: {stats['total_tables']}")
    print(f"Pages: {stats['total_pages']}")
    print(f"Tokens: {stats['total_tokens']:,}")
    print(f"PDFs: {stats['num_pdfs']}")
    print(f"Chunks: {stats['total_chunks']}")
    print(bar)
444
+
445
+
446
def compute_qa_category_stats(qa_data: List[Dict]) -> Dict[str, Any]:
    """
    Compute QA category statistics including multihop/multimodal intersection.

    Args:
        qa_data: List of QA pairs from deduplicated dataset

    Returns:
        Dict with category counts and percentages
    """
    total = len(qa_data)
    if total == 0:
        return {
            'total_qa_pairs': 0,
            'multihop_count': 0,
            'multimodal_count': 0,
            'multihop_multimodal_count': 0,
            'multihop_only_count': 0,
            'multimodal_only_count': 0,
            'text_only_count': 0,
            'avg_difficulty': 0.0,
            'avg_relevance': 0.0
        }

    def _is_multihop(qa: Dict) -> bool:
        # Multihop: hop_count > 1 or more than one chunk added.
        hops = qa.get('hop_count', 0)
        added = qa.get('chunks_added', [])
        return hops > 1 or (isinstance(added, list) and len(added) > 1)

    def _is_multimodal(qa: Dict) -> bool:
        # Multimodal: a context chunk with a real image_path, or content
        # containing a markdown image reference (matches metrics_optimized.py).
        for chunk in qa.get('context_chunks', []):
            if not isinstance(chunk, dict):
                continue
            image_path = chunk.get('image_path')
            if image_path and image_path not in ('None', 'null', None) and str(image_path).strip():
                return True
            content = chunk.get('content', '')
            if content and re.search(r'!\[[^\]]*\]\([^)]+\)', content):
                return True
        return False

    def _collect_score(qa: Dict, primary: str, fallback: str, bucket: List[float]) -> None:
        # Append the score as float when present and non-blank; unparseable
        # values are silently skipped.
        try:
            value = qa.get(primary, qa.get(fallback, 0))
            if value is not None and str(value).strip():
                bucket.append(float(value))
        except (ValueError, TypeError):
            pass

    multihop = 0
    multimodal = 0
    both = 0
    difficulty_scores: List[float] = []
    relevance_scores: List[float] = []

    for qa in qa_data:
        hop_flag = _is_multihop(qa)
        modal_flag = _is_multimodal(qa)
        multihop += hop_flag
        multimodal += modal_flag
        both += hop_flag and modal_flag
        _collect_score(qa, 'difficulty_score', 'difficulty', difficulty_scores)
        _collect_score(qa, 'relevance_score', 'relevance', relevance_scores)

    # Exclusive counts via inclusion-exclusion
    multihop_only = multihop - both
    multimodal_only = multimodal - both
    text_only = total - multihop - multimodal + both

    avg_difficulty = round(sum(difficulty_scores) / len(difficulty_scores), 2) if difficulty_scores else 0.0
    avg_relevance = round(sum(relevance_scores) / len(relevance_scores), 2) if relevance_scores else 0.0

    return {
        'total_qa_pairs': total,
        'multihop_count': multihop,
        'multimodal_count': multimodal,
        'multihop_multimodal_count': both,
        'multihop_only_count': multihop_only,
        'multimodal_only_count': multimodal_only,
        'text_only_count': text_only,
        'multihop_pct': round(100 * multihop / total, 1),
        'multimodal_pct': round(100 * multimodal / total, 1),
        'multihop_multimodal_pct': round(100 * both / total, 1),
        'multihop_only_pct': round(100 * multihop_only / total, 1),
        'multimodal_only_pct': round(100 * multimodal_only / total, 1),
        'text_only_pct': round(100 * text_only / total, 1),
        'avg_difficulty': avg_difficulty,
        'avg_relevance': avg_relevance
    }
548
+
549
+
550
def print_qa_category_stats(stats: Dict[str, Any]) -> None:
    """Print the QA category breakdown produced by compute_qa_category_stats."""
    total = stats.get('total_qa_pairs', 0)
    if not total:
        print("\n⚠️ No QA pairs to analyze")
        return

    sep = "=" * 60
    print("\n" + sep)
    print("QA CATEGORY BREAKDOWN")
    print(sep)
    print(f"\nTotal QA Pairs: {total}")
    print()
    print("Category Counts:")
    print(f" Multihop: {stats['multihop_count']:>3} ({stats['multihop_pct']:>5.1f}%)")
    print(f" Multimodal: {stats['multimodal_count']:>3} ({stats['multimodal_pct']:>5.1f}%)")
    print(f" Both (Multihop ∩ Multimodal): {stats['multihop_multimodal_count']:>3} ({stats['multihop_multimodal_pct']:>5.1f}%)")
    print()
    print("Exclusive Breakdown:")
    print(f" Multihop only (text): {stats['multihop_only_count']:>3} ({stats['multihop_only_pct']:>5.1f}%)")
    print(f" Multimodal only (single-hop): {stats['multimodal_only_count']:>3} ({stats['multimodal_only_pct']:>5.1f}%)")
    print(f" Both (multihop + multimodal): {stats['multihop_multimodal_count']:>3} ({stats['multihop_multimodal_pct']:>5.1f}%)")
    print(f" Neither (single-hop, text): {stats['text_only_count']:>3} ({stats['text_only_pct']:>5.1f}%)")
    print()
    print("Quality Scores (0-10 scale):")
    print(f" Avg Difficulty: {stats.get('avg_difficulty', 0.0):>5.2f}")
    print(f" Avg Relevance: {stats.get('avg_relevance', 0.0):>5.2f}")
    print(sep)
577
+
578
+
579
def main():
    """CLI entry point: analyze every dataset and save aggregate statistics.

    Scans the directory containing this file for zip archives, pairs them
    with processed chunks.json outputs under ``../trials/results``, prints a
    summary table, and writes ``dataset_stats.json`` next to this file.
    """
    data_dir = Path(__file__).parent
    results_dir = data_dir.parent / "trials" / "results"

    datasets = find_datasets(data_dir, results_dir)
    if not datasets:
        print("No datasets found.")
        return

    print(f"Found {len(datasets)} datasets to analyze...")

    # Report how many datasets already have chunk output.
    processed = sum(1 for entry in datasets if entry["chunks_path"])
    print(f" - {processed} with chunks.json (processed)")
    print(f" - {len(datasets) - processed} zip-only (not yet processed)")

    # Load the tokenizer once and reuse it for every dataset.
    tokenizer = get_tokenizer()
    print("Using GPT2 tokenizer for token counting" if tokenizer
          else "Using word-based token estimation")

    all_stats = [
        analyze_dataset(entry["name"], entry["chunks_path"], entry["zip_path"], tokenizer)
        for entry in tqdm(datasets, desc="Analyzing datasets")
    ]

    print_summary(all_stats)

    # Persist the detailed per-dataset breakdown alongside the data.
    output_file = data_dir / "dataset_stats.json"
    with open(output_file, 'w') as f:
        json.dump(all_stats, f, indent=2)
    print(f"\nDetailed stats saved to: {output_file}")


if __name__ == "__main__":
    main()