isa-model 0.0.1__py3-none-any.whl → 0.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,582 @@
+ """
+ Unified Evaluation Factory for ISA Model Framework
+
+ This factory provides a single interface for all evaluation operations:
+ - LLM evaluation (perplexity, BLEU, ROUGE, custom metrics)
+ - Image model evaluation (FID, IS, LPIPS)
+ - Benchmark testing (MMLU, HellaSwag, ARC, etc.)
+ - Custom evaluation pipelines
+ """
+
+ import os
+ import json
+ import logging
+ from typing import Optional, Dict, Any, List, Union
+ from pathlib import Path
+ import datetime
+
+ from .metrics import LLMMetrics, ImageMetrics, BenchmarkRunner
+ from .benchmarks import MMLU, HellaSwag, ARC, GSM8K
+
+ logger = logging.getLogger(__name__)
+
+
+ class EvaluationFactory:
+     """
+     Unified factory for all AI model evaluation operations.
+
+     This class provides simplified interfaces for:
+     - LLM evaluation with various metrics
+     - Image model evaluation
+     - Benchmark testing on standard datasets
+     - Custom evaluation pipelines
+
+     Example usage:
+     ```python
+     from isa_model.eval import EvaluationFactory
+
+     evaluator = EvaluationFactory()
+
+     # Evaluate LLM on custom dataset
+     results = evaluator.evaluate_llm(
+         model_path="path/to/model",
+         dataset_path="test_data.json",
+         metrics=["perplexity", "bleu", "rouge"],
+         output_path="eval_results/llm_eval.json"
+     )
+
+     # Run MMLU benchmark
+     mmlu_results = evaluator.run_benchmark(
+         model_path="path/to/model",
+         benchmark="mmlu",
+         subjects=["math", "physics", "chemistry"]
+     )
+
+     # Compare multiple models
+     comparison = evaluator.compare_models([
+         "model1/path",
+         "model2/path"
+     ], benchmark="hellaswag")
+     ```
+     """
+
+     def __init__(self, output_dir: Optional[str] = None):
+         """
+         Initialize the evaluation factory.
+
+         Args:
+             output_dir: Base directory for evaluation outputs
+         """
+         self.output_dir = output_dir or os.path.join(os.getcwd(), "evaluation_results")
+         os.makedirs(self.output_dir, exist_ok=True)
+
+         # Initialize metrics calculators
+         self.llm_metrics = LLMMetrics()
+         self.image_metrics = ImageMetrics()
+         self.benchmark_runner = BenchmarkRunner()
+
+         logger.info(f"EvaluationFactory initialized with output dir: {self.output_dir}")
+
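A quick illustration of the constructor's fallback behaviour (added for clarity; the `/tmp/isa_evals` path is only an example):

```python
from isa_model.eval import EvaluationFactory

default_factory = EvaluationFactory()                            # writes under ./evaluation_results
custom_factory = EvaluationFactory(output_dir="/tmp/isa_evals")  # example custom location
print(default_factory.output_dir)
print(custom_factory.output_dir)
```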
+     def _get_output_path(self, model_name: str, eval_type: str) -> str:
+         """Generate timestamped output path for evaluation results."""
+         timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+         safe_model_name = os.path.basename(model_name).replace("/", "_").replace(":", "_")
+         filename = f"{safe_model_name}_{eval_type}_{timestamp}.json"
+         return os.path.join(self.output_dir, filename)
+
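To make the naming scheme concrete, here is a small sketch that mirrors `_get_output_path` (the model name and timestamp are illustrative; `os.path.basename` drops any directory prefix and `:`-style tags become underscores):

```python
import datetime
import os

model_name, eval_type = "ollama/llama3:8b", "llm_eval"   # example inputs
safe = os.path.basename(model_name).replace("/", "_").replace(":", "_")   # -> "llama3_8b"
stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"{safe}_{eval_type}_{stamp}.json")   # e.g. "llama3_8b_llm_eval_20240101_123000.json"
```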
+     # =================
+     # LLM Evaluation Methods
+     # =================
+
+     def evaluate_llm(
+         self,
+         model_path: str,
+         dataset_path: str,
+         metrics: Optional[List[str]] = None,
+         output_path: Optional[str] = None,
+         batch_size: int = 8,
+         max_samples: Optional[int] = None,
+         provider: str = "ollama",
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Evaluate an LLM model on a dataset with specified metrics.
+
+         Args:
+             model_path: Path to the model or model identifier
+             dataset_path: Path to evaluation dataset (JSON format)
+             metrics: List of metrics to compute ["perplexity", "bleu", "rouge", "accuracy"]
+             output_path: Path to save results
+             batch_size: Batch size for evaluation
+             max_samples: Maximum number of samples to evaluate
+             provider: Model provider ("ollama", "openai", "hf")
+             **kwargs: Additional parameters
+
+         Returns:
+             Dictionary containing evaluation results
+
+         Example:
+         ```python
+         results = evaluator.evaluate_llm(
+             model_path="google/gemma-2-4b-it",
+             dataset_path="test_data.json",
+             metrics=["perplexity", "bleu", "rouge"],
+             max_samples=1000
+         )
+         ```
+         """
+         if metrics is None:
+             metrics = ["perplexity", "bleu", "rouge"]
+
+         if not output_path:
+             output_path = self._get_output_path(model_path, "llm_eval")
+
+         logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
+
+         # Load dataset
+         with open(dataset_path, 'r') as f:
+             dataset = json.load(f)
+
+         if max_samples:
+             dataset = dataset[:max_samples]
+
+         # Run evaluation
+         results = self.llm_metrics.evaluate(
+             model_path=model_path,
+             dataset=dataset,
+             metrics=metrics,
+             batch_size=batch_size,
+             provider=provider,
+             **kwargs
+         )
+
+         # Add metadata
+         results["metadata"] = {
+             "model_path": model_path,
+             "dataset_path": dataset_path,
+             "metrics": metrics,
+             "num_samples": len(dataset),
+             "timestamp": datetime.datetime.now().isoformat(),
+             "provider": provider
+         }
+
+         # Save results
+         with open(output_path, 'w') as f:
+             json.dump(results, f, indent=2)
+
+         logger.info(f"Evaluation results saved to: {output_path}")
+         return results
+
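For context, a minimal end-to-end sketch of calling `evaluate_llm`. The dataset records are consumed by `LLMMetrics.evaluate`, which is not shown in this diff, so the prompt/reference field names below are assumptions rather than the package's documented format:

```python
import json
from isa_model.eval import EvaluationFactory

# Hypothetical dataset layout -- adjust to whatever LLMMetrics.evaluate expects.
samples = [
    {"prompt": "Translate 'bonjour' to English.", "reference": "hello"},
    {"prompt": "What is 2 + 2?", "reference": "4"},
]
with open("test_data.json", "w") as f:
    json.dump(samples, f)

evaluator = EvaluationFactory()
results = evaluator.evaluate_llm(
    model_path="google/gemma-2-4b-it",   # identifier taken from the docstring example
    dataset_path="test_data.json",
    metrics=["bleu", "rouge"],
    max_samples=2,
    provider="hf",                       # "ollama", "openai", or "hf" per the docstring
)
print(results["metadata"]["num_samples"])
```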
+     def evaluate_generation_quality(
+         self,
+         model_path: str,
+         prompts: List[str],
+         reference_texts: Optional[List[str]] = None,
+         metrics: Optional[List[str]] = None,
+         output_path: Optional[str] = None,
+         provider: str = "ollama",
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Evaluate text generation quality.
+
+         Args:
+             model_path: Path to the model
+             prompts: List of input prompts
+             reference_texts: Reference texts for comparison (optional)
+             metrics: Metrics to compute
+             output_path: Output path for results
+             provider: Model provider
+             **kwargs: Additional parameters
+
+         Returns:
+             Evaluation results dictionary
+         """
+         if metrics is None:
+             metrics = ["diversity", "coherence", "fluency"]
+
+         if not output_path:
+             output_path = self._get_output_path(model_path, "generation_eval")
+
+         results = self.llm_metrics.evaluate_generation(
+             model_path=model_path,
+             prompts=prompts,
+             reference_texts=reference_texts,
+             metrics=metrics,
+             provider=provider,
+             **kwargs
+         )
+
+         # Save results
+         with open(output_path, 'w') as f:
+             json.dump(results, f, indent=2)
+
+         return results
+
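A minimal usage sketch for `evaluate_generation_quality`, which has no example of its own above (illustrative only; it assumes the default provider can resolve `model_path` and that the listed metrics are implemented by `LLMMetrics.evaluate_generation`):

```python
from isa_model.eval import EvaluationFactory

evaluator = EvaluationFactory(output_dir="eval_results")
gen_results = evaluator.evaluate_generation_quality(
    model_path="path/to/model",        # placeholder, as in the class docstring
    prompts=["Explain what a unit test is."],
    reference_texts=["A unit test checks one small piece of code in isolation."],
    metrics=["coherence", "fluency"],  # subset of the defaults
    provider="ollama",
)
print(gen_results)
```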
+     # =================
+     # Benchmark Testing Methods
+     # =================
+
+     def run_benchmark(
+         self,
+         model_path: str,
+         benchmark: str,
+         output_path: Optional[str] = None,
+         num_shots: int = 0,
+         max_samples: Optional[int] = None,
+         provider: str = "ollama",
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Run a standard benchmark test.
+
+         Args:
+             model_path: Path to the model
+             benchmark: Benchmark name ("mmlu", "hellaswag", "arc", "gsm8k")
+             output_path: Output path for results
+             num_shots: Number of few-shot examples
+             max_samples: Maximum samples to evaluate
+             provider: Model provider
+             **kwargs: Additional parameters
+
+         Returns:
+             Benchmark results dictionary
+
+         Example:
+         ```python
+         mmlu_results = evaluator.run_benchmark(
+             model_path="google/gemma-2-4b-it",
+             benchmark="mmlu",
+             num_shots=5,
+             max_samples=1000
+         )
+         ```
+         """
+         if not output_path:
+             output_path = self._get_output_path(model_path, f"{benchmark}_benchmark")
+
+         logger.info(f"Running {benchmark} benchmark on {model_path}")
+
+         # Select benchmark
+         benchmark_map = {
+             "mmlu": MMLU(),
+             "hellaswag": HellaSwag(),
+             "arc": ARC(),
+             "gsm8k": GSM8K()
+         }
+
+         if benchmark.lower() not in benchmark_map:
+             raise ValueError(f"Unsupported benchmark: {benchmark}")
+
+         benchmark_instance = benchmark_map[benchmark.lower()]
+
+         # Run benchmark
+         results = self.benchmark_runner.run(
+             benchmark=benchmark_instance,
+             model_path=model_path,
+             num_shots=num_shots,
+             max_samples=max_samples,
+             provider=provider,
+             **kwargs
+         )
+
+         # Add metadata
+         results["metadata"] = {
+             "model_path": model_path,
+             "benchmark": benchmark,
+             "num_shots": num_shots,
+             "max_samples": max_samples,
+             "timestamp": datetime.datetime.now().isoformat(),
+             "provider": provider
+         }
+
+         # Save results
+         with open(output_path, 'w') as f:
+             json.dump(results, f, indent=2)
+
+         logger.info(f"Benchmark results saved to: {output_path}")
+         return results
+
+     def run_multiple_benchmarks(
+         self,
+         model_path: str,
+         benchmarks: Optional[List[str]] = None,
+         output_dir: Optional[str] = None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Run multiple benchmarks on a model.
+
+         Args:
+             model_path: Path to the model
+             benchmarks: List of benchmark names
+             output_dir: Directory to save results
+             **kwargs: Additional parameters
+
+         Returns:
+             Combined results dictionary
+         """
+         if benchmarks is None:
+             benchmarks = ["mmlu", "hellaswag", "arc"]
+
+         if not output_dir:
+             output_dir = os.path.join(self.output_dir, "multi_benchmark")
+         os.makedirs(output_dir, exist_ok=True)
+
+         all_results = {}
+
+         for benchmark in benchmarks:
+             try:
+                 output_path = os.path.join(output_dir, f"{benchmark}_results.json")
+                 results = self.run_benchmark(
+                     model_path=model_path,
+                     benchmark=benchmark,
+                     output_path=output_path,
+                     **kwargs
+                 )
+                 all_results[benchmark] = results
+             except Exception as e:
+                 logger.error(f"Failed to run benchmark {benchmark}: {e}")
+                 all_results[benchmark] = {"error": str(e)}
+
+         # Save combined results
+         combined_path = os.path.join(output_dir, "combined_results.json")
+         with open(combined_path, 'w') as f:
+             json.dump(all_results, f, indent=2)
+
+         return all_results
+
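A usage sketch for `run_multiple_benchmarks` (illustrative; extra keyword arguments such as `num_shots` are forwarded to `run_benchmark` via `**kwargs`):

```python
from isa_model.eval import EvaluationFactory

evaluator = EvaluationFactory()
all_results = evaluator.run_multiple_benchmarks(
    model_path="path/to/model",     # placeholder identifier
    benchmarks=["mmlu", "gsm8k"],   # any of "mmlu", "hellaswag", "arc", "gsm8k"
    num_shots=5,
    max_samples=200,
)
for name, result in all_results.items():
    print(name, "failed" if "error" in result else "ok")
```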
+     # =================
+     # Model Comparison Methods
+     # =================
+
+     def compare_models(
+         self,
+         model_paths: List[str],
+         dataset_path: Optional[str] = None,
+         benchmark: Optional[str] = None,
+         metrics: Optional[List[str]] = None,
+         output_path: Optional[str] = None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Compare multiple models on the same evaluation.
+
+         Args:
+             model_paths: List of model paths to compare
+             dataset_path: Path to evaluation dataset
+             benchmark: Benchmark name for comparison
+             metrics: Metrics to compute
+             output_path: Output path for comparison results
+             **kwargs: Additional parameters
+
+         Returns:
+             Comparison results dictionary
+         """
+         if not output_path:
+             output_path = self._get_output_path("model_comparison", "comparison")
+
+         comparison_results = {
+             "models": model_paths,
+             "results": {},
+             "summary": {}
+         }
+
+         # Run evaluation for each model
+         for model_path in model_paths:
+             model_name = os.path.basename(model_path)
+             logger.info(f"Evaluating model: {model_name}")
+
+             try:
+                 if dataset_path:
+                     # Custom dataset evaluation
+                     results = self.evaluate_llm(
+                         model_path=model_path,
+                         dataset_path=dataset_path,
+                         metrics=metrics,
+                         **kwargs
+                     )
+                 elif benchmark:
+                     # Benchmark evaluation
+                     results = self.run_benchmark(
+                         model_path=model_path,
+                         benchmark=benchmark,
+                         **kwargs
+                     )
+                 else:
+                     raise ValueError("Either dataset_path or benchmark must be provided")
+
+                 comparison_results["results"][model_name] = results
+
+             except Exception as e:
+                 logger.error(f"Failed to evaluate {model_name}: {e}")
+                 comparison_results["results"][model_name] = {"error": str(e)}
+
+         # Generate summary
+         comparison_results["summary"] = self._generate_comparison_summary(
+             comparison_results["results"]
+         )
+
+         # Save results
+         with open(output_path, 'w') as f:
+             json.dump(comparison_results, f, indent=2)
+
+         return comparison_results
+
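A sketch of `compare_models` in benchmark mode (illustrative; note that per-model failures are caught and recorded under an "error" key rather than aborting the whole comparison):

```python
from isa_model.eval import EvaluationFactory

evaluator = EvaluationFactory()
comparison = evaluator.compare_models(
    model_paths=["model1/path", "model2/path"],   # placeholders from the class docstring
    benchmark="arc",
    max_samples=100,
)
print(comparison["summary"].get("best_performing"))
```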
+     def _generate_comparison_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
+         """Generate summary statistics for model comparison."""
+         summary = {
+             "best_performing": {},
+             "rankings": {},
+             "average_scores": {}
+         }
+
+         # Extract key metrics and find best performing models
+         for model_name, model_results in results.items():
+             if "error" in model_results:
+                 continue
+
+             # Extract main scores (this is simplified - would need more sophisticated logic)
+             if "accuracy" in model_results:
+                 summary["average_scores"][model_name] = model_results["accuracy"]
+             elif "overall_score" in model_results:
+                 summary["average_scores"][model_name] = model_results["overall_score"]
+
+         # Rank models by performance
+         if summary["average_scores"]:
+             ranked = sorted(
+                 summary["average_scores"].items(),
+                 key=lambda x: x[1],
+                 reverse=True
+             )
+             summary["rankings"] = {i + 1: model for i, (model, score) in enumerate(ranked)}
+             summary["best_performing"]["model"] = ranked[0][0]
+             summary["best_performing"]["score"] = ranked[0][1]
+
+         return summary
+
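To show what `_generate_comparison_summary` produces, here is a toy input and the summary shape it yields (the method only looks for top-level `accuracy` or `overall_score` values, as its inline comment acknowledges):

```python
toy_results = {
    "model-a": {"accuracy": 0.71},
    "model-b": {"overall_score": 0.64},
    "model-c": {"error": "timeout"},   # skipped entirely by the summary
}
# Resulting summary:
# {
#     "average_scores": {"model-a": 0.71, "model-b": 0.64},
#     "rankings": {1: "model-a", 2: "model-b"},
#     "best_performing": {"model": "model-a", "score": 0.71},
# }
```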
+     # =================
+     # Image Model Evaluation Methods
+     # =================
+
+     def evaluate_image_model(
+         self,
+         model_path: str,
+         test_images_dir: str,
+         reference_images_dir: Optional[str] = None,
+         metrics: Optional[List[str]] = None,
+         output_path: Optional[str] = None,
+         **kwargs
+     ) -> Dict[str, Any]:
+         """
+         Evaluate image generation model.
+
+         Args:
+             model_path: Path to the image model
+             test_images_dir: Directory with test images
+             reference_images_dir: Directory with reference images
+             metrics: Metrics to compute ["fid", "is", "lpips"]
+             output_path: Output path for results
+             **kwargs: Additional parameters
+
+         Returns:
+             Image evaluation results
+         """
+         if metrics is None:
+             metrics = ["fid", "is"]
+
+         if not output_path:
+             output_path = self._get_output_path(model_path, "image_eval")
+
+         results = self.image_metrics.evaluate(
+             model_path=model_path,
+             test_images_dir=test_images_dir,
+             reference_images_dir=reference_images_dir,
+             metrics=metrics,
+             **kwargs
+         )
+
+         # Save results
+         with open(output_path, 'w') as f:
+             json.dump(results, f, indent=2)
+
+         return results
+
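A minimal sketch for `evaluate_image_model` (illustrative; the directory paths are placeholders, and a reference set is supplied because distribution/pairwise metrics such as FID and LPIPS need one):

```python
from isa_model.eval import EvaluationFactory

evaluator = EvaluationFactory()
image_results = evaluator.evaluate_image_model(
    model_path="path/to/image_model",          # placeholder
    test_images_dir="generated_samples/",
    reference_images_dir="reference_images/",  # required for FID/LPIPS-style comparisons
    metrics=["fid", "is", "lpips"],
)
print(image_results)
```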
+     # =================
+     # Utility Methods
+     # =================
+
+     def load_results(self, results_path: str) -> Dict[str, Any]:
+         """Load evaluation results from file."""
+         with open(results_path, 'r') as f:
+             return json.load(f)
+
+     def list_evaluation_results(self) -> List[Dict[str, Any]]:
+         """List all evaluation results in the output directory."""
+         results = []
+
+         if os.path.exists(self.output_dir):
+             for filename in os.listdir(self.output_dir):
+                 if filename.endswith('.json'):
+                     filepath = os.path.join(self.output_dir, filename)
+                     try:
+                         with open(filepath, 'r') as f:
+                             data = json.load(f)
+                         results.append({
+                             "filename": filename,
+                             "path": filepath,
+                             "metadata": data.get("metadata", {}),
+                             "created": datetime.datetime.fromtimestamp(
+                                 os.path.getctime(filepath)
+                             ).isoformat()
+                         })
+                     except Exception as e:
+                         logger.warning(f"Failed to load {filename}: {e}")
+
+         return sorted(results, key=lambda x: x["created"], reverse=True)
+
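For example, iterating over `list_evaluation_results` returns the newest runs first, each entry carrying the saved `metadata` block:

```python
from isa_model.eval import EvaluationFactory

evaluator = EvaluationFactory()
for entry in evaluator.list_evaluation_results():
    print(entry["created"], entry["filename"], entry["metadata"].get("provider"))
```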
+     def generate_report(
+         self,
+         results_paths: List[str],
+         output_path: Optional[str] = None,
+         format: str = "json"
+     ) -> str:
+         """
+         Generate evaluation report from multiple results.
+
+         Args:
+             results_paths: List of result file paths
+             output_path: Output path for report
+             format: Report format ("json", "html", "markdown")
+
+         Returns:
+             Path to generated report
+         """
+         if not output_path:
+             timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+             output_path = os.path.join(self.output_dir, f"evaluation_report_{timestamp}.{format}")
+
+         # Load all results
+         all_results = []
+         for path in results_paths:
+             try:
+                 results = self.load_results(path)
+                 all_results.append(results)
+             except Exception as e:
+                 logger.warning(f"Failed to load results from {path}: {e}")
+
+         # Generate report based on format
+         if format == "json":
+             report_data = {
+                 "report_generated": datetime.datetime.now().isoformat(),
+                 "num_evaluations": len(all_results),
+                 "results": all_results
+             }
+
+             with open(output_path, 'w') as f:
+                 json.dump(report_data, f, indent=2)
+
+         # TODO: Implement HTML and Markdown report generation
+
+         logger.info(f"Evaluation report generated: {output_path}")
+         return output_path
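Finally, a closing sketch that ties the utilities together (illustrative; only the JSON format is written out in this version, as the TODO above notes):

```python
from isa_model.eval import EvaluationFactory

evaluator = EvaluationFactory()
recent = evaluator.list_evaluation_results()[:3]   # three most recent result files
report_path = evaluator.generate_report(
    results_paths=[entry["path"] for entry in recent],
    format="json",                                 # "html" / "markdown" are still TODO
)
print(f"Report written to {report_path}")
```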