isa-model 0.0.1__py3-none-any.whl → 0.0.3__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
- isa_model/eval/__init__.py +56 -0
- isa_model/eval/benchmarks.py +469 -0
- isa_model/eval/factory.py +582 -0
- isa_model/eval/metrics.py +628 -0
- isa_model/training/__init__.py +44 -0
- isa_model/training/factory.py +393 -0
- {isa_model-0.0.1.dist-info → isa_model-0.0.3.dist-info}/METADATA +1 -1
- {isa_model-0.0.1.dist-info → isa_model-0.0.3.dist-info}/RECORD +11 -5
- {isa_model-0.0.1.dist-info → isa_model-0.0.3.dist-info}/WHEEL +0 -0
- {isa_model-0.0.1.dist-info → isa_model-0.0.3.dist-info}/licenses/LICENSE +0 -0
- {isa_model-0.0.1.dist-info → isa_model-0.0.3.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ isa_model/eval/factory.py
@@ -0,0 +1,582 @@
+"""
+Unified Evaluation Factory for ISA Model Framework
+
+This factory provides a single interface for all evaluation operations:
+- LLM evaluation (perplexity, BLEU, ROUGE, custom metrics)
+- Image model evaluation (FID, IS, LPIPS)
+- Benchmark testing (MMLU, HellaSwag, ARC, etc.)
+- Custom evaluation pipelines
+"""
+
+import os
+import json
+import logging
+from typing import Optional, Dict, Any, List, Union
+from pathlib import Path
+import datetime
+
+from .metrics import LLMMetrics, ImageMetrics, BenchmarkRunner
+from .benchmarks import MMLU, HellaSwag, ARC, GSM8K
+
+logger = logging.getLogger(__name__)
+
+
+class EvaluationFactory:
+    """
+    Unified factory for all AI model evaluation operations.
+
+    This class provides simplified interfaces for:
+    - LLM evaluation with various metrics
+    - Image model evaluation
+    - Benchmark testing on standard datasets
+    - Custom evaluation pipelines
+
+    Example usage:
+    ```python
+    from isa_model.eval import EvaluationFactory
+
+    evaluator = EvaluationFactory()
+
+    # Evaluate LLM on custom dataset
+    results = evaluator.evaluate_llm(
+        model_path="path/to/model",
+        dataset_path="test_data.json",
+        metrics=["perplexity", "bleu", "rouge"],
+        output_dir="eval_results"
+    )
+
+    # Run MMLU benchmark
+    mmlu_results = evaluator.run_benchmark(
+        model_path="path/to/model",
+        benchmark="mmlu",
+        subjects=["math", "physics", "chemistry"]
+    )
+
+    # Compare multiple models
+    comparison = evaluator.compare_models([
+        "model1/path",
+        "model2/path"
+    ], benchmark="hellaswag")
+    ```
+    """
+
+    def __init__(self, output_dir: Optional[str] = None):
+        """
+        Initialize the evaluation factory.
+
+        Args:
+            output_dir: Base directory for evaluation outputs
+        """
+        self.output_dir = output_dir or os.path.join(os.getcwd(), "evaluation_results")
+        os.makedirs(self.output_dir, exist_ok=True)
+
+        # Initialize metrics calculators
+        self.llm_metrics = LLMMetrics()
+        self.image_metrics = ImageMetrics()
+        self.benchmark_runner = BenchmarkRunner()
+
+        logger.info(f"EvaluationFactory initialized with output dir: {self.output_dir}")
+
+    def _get_output_path(self, model_name: str, eval_type: str) -> str:
+        """Generate timestamped output path for evaluation results."""
+        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        safe_model_name = os.path.basename(model_name).replace("/", "_").replace(":", "_")
+        filename = f"{safe_model_name}_{eval_type}_{timestamp}.json"
+        return os.path.join(self.output_dir, filename)
+
+    # =================
+    # LLM Evaluation Methods
+    # =================
+
+    def evaluate_llm(
+        self,
+        model_path: str,
+        dataset_path: str,
+        metrics: List[str] = None,
+        output_path: Optional[str] = None,
+        batch_size: int = 8,
+        max_samples: Optional[int] = None,
+        provider: str = "ollama",
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Evaluate an LLM model on a dataset with specified metrics.
+
+        Args:
+            model_path: Path to the model or model identifier
+            dataset_path: Path to evaluation dataset (JSON format)
+            metrics: List of metrics to compute ["perplexity", "bleu", "rouge", "accuracy"]
+            output_path: Path to save results
+            batch_size: Batch size for evaluation
+            max_samples: Maximum number of samples to evaluate
+            provider: Model provider ("ollama", "openai", "hf")
+            **kwargs: Additional parameters
+
+        Returns:
+            Dictionary containing evaluation results
+
+        Example:
+            ```python
+            results = evaluator.evaluate_llm(
+                model_path="google/gemma-2-4b-it",
+                dataset_path="test_data.json",
+                metrics=["perplexity", "bleu", "rouge"],
+                max_samples=1000
+            )
+            ```
+        """
+        if metrics is None:
+            metrics = ["perplexity", "bleu", "rouge"]
+
+        if not output_path:
+            output_path = self._get_output_path(model_path, "llm_eval")
+
+        logger.info(f"Evaluating LLM {model_path} with metrics: {metrics}")
+
+        # Load dataset
+        with open(dataset_path, 'r') as f:
+            dataset = json.load(f)
+
+        if max_samples:
+            dataset = dataset[:max_samples]
+
+        # Run evaluation
+        results = self.llm_metrics.evaluate(
+            model_path=model_path,
+            dataset=dataset,
+            metrics=metrics,
+            batch_size=batch_size,
+            provider=provider,
+            **kwargs
+        )
+
+        # Add metadata
+        results["metadata"] = {
+            "model_path": model_path,
+            "dataset_path": dataset_path,
+            "metrics": metrics,
+            "num_samples": len(dataset),
+            "timestamp": datetime.datetime.now().isoformat(),
+            "provider": provider
+        }
+
+        # Save results
+        with open(output_path, 'w') as f:
+            json.dump(results, f, indent=2)
+
+        logger.info(f"Evaluation results saved to: {output_path}")
+        return results
+
+    def evaluate_generation_quality(
+        self,
+        model_path: str,
+        prompts: List[str],
+        reference_texts: List[str] = None,
+        metrics: List[str] = None,
+        output_path: Optional[str] = None,
+        provider: str = "ollama",
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Evaluate text generation quality.
+
+        Args:
+            model_path: Path to the model
+            prompts: List of input prompts
+            reference_texts: Reference texts for comparison (optional)
+            metrics: Metrics to compute
+            output_path: Output path for results
+            provider: Model provider
+            **kwargs: Additional parameters
+
+        Returns:
+            Evaluation results dictionary
+        """
+        if metrics is None:
+            metrics = ["diversity", "coherence", "fluency"]
+
+        if not output_path:
+            output_path = self._get_output_path(model_path, "generation_eval")
+
+        results = self.llm_metrics.evaluate_generation(
+            model_path=model_path,
+            prompts=prompts,
+            reference_texts=reference_texts,
+            metrics=metrics,
+            provider=provider,
+            **kwargs
+        )
+
+        # Save results
+        with open(output_path, 'w') as f:
+            json.dump(results, f, indent=2)
+
+        return results
+
+    # =================
+    # Benchmark Testing Methods
+    # =================
+
+    def run_benchmark(
+        self,
+        model_path: str,
+        benchmark: str,
+        output_path: Optional[str] = None,
+        num_shots: int = 0,
+        max_samples: Optional[int] = None,
+        provider: str = "ollama",
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Run a standard benchmark test.
+
+        Args:
+            model_path: Path to the model
+            benchmark: Benchmark name ("mmlu", "hellaswag", "arc", "gsm8k")
+            output_path: Output path for results
+            num_shots: Number of few-shot examples
+            max_samples: Maximum samples to evaluate
+            provider: Model provider
+            **kwargs: Additional parameters
+
+        Returns:
+            Benchmark results dictionary
+
+        Example:
+            ```python
+            mmlu_results = evaluator.run_benchmark(
+                model_path="google/gemma-2-4b-it",
+                benchmark="mmlu",
+                num_shots=5,
+                max_samples=1000
+            )
+            ```
+        """
+        if not output_path:
+            output_path = self._get_output_path(model_path, f"{benchmark}_benchmark")
+
+        logger.info(f"Running {benchmark} benchmark on {model_path}")
+
+        # Select benchmark
+        benchmark_map = {
+            "mmlu": MMLU(),
+            "hellaswag": HellaSwag(),
+            "arc": ARC(),
+            "gsm8k": GSM8K()
+        }
+
+        if benchmark.lower() not in benchmark_map:
+            raise ValueError(f"Unsupported benchmark: {benchmark}")
+
+        benchmark_instance = benchmark_map[benchmark.lower()]
+
+        # Run benchmark
+        results = self.benchmark_runner.run(
+            benchmark=benchmark_instance,
+            model_path=model_path,
+            num_shots=num_shots,
+            max_samples=max_samples,
+            provider=provider,
+            **kwargs
+        )
+
+        # Add metadata
+        results["metadata"] = {
+            "model_path": model_path,
+            "benchmark": benchmark,
+            "num_shots": num_shots,
+            "max_samples": max_samples,
+            "timestamp": datetime.datetime.now().isoformat(),
+            "provider": provider
+        }
+
+        # Save results
+        with open(output_path, 'w') as f:
+            json.dump(results, f, indent=2)
+
+        logger.info(f"Benchmark results saved to: {output_path}")
+        return results
+
+    def run_multiple_benchmarks(
+        self,
+        model_path: str,
+        benchmarks: List[str] = None,
+        output_dir: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Run multiple benchmarks on a model.
+
+        Args:
+            model_path: Path to the model
+            benchmarks: List of benchmark names
+            output_dir: Directory to save results
+            **kwargs: Additional parameters
+
+        Returns:
+            Combined results dictionary
+        """
+        if benchmarks is None:
+            benchmarks = ["mmlu", "hellaswag", "arc"]
+
+        if not output_dir:
+            output_dir = os.path.join(self.output_dir, "multi_benchmark")
+        os.makedirs(output_dir, exist_ok=True)
+
+        all_results = {}
+
+        for benchmark in benchmarks:
+            try:
+                output_path = os.path.join(output_dir, f"{benchmark}_results.json")
+                results = self.run_benchmark(
+                    model_path=model_path,
+                    benchmark=benchmark,
+                    output_path=output_path,
+                    **kwargs
+                )
+                all_results[benchmark] = results
+            except Exception as e:
+                logger.error(f"Failed to run benchmark {benchmark}: {e}")
+                all_results[benchmark] = {"error": str(e)}
+
+        # Save combined results
+        combined_path = os.path.join(output_dir, "combined_results.json")
+        with open(combined_path, 'w') as f:
+            json.dump(all_results, f, indent=2)
+
+        return all_results
+
+    # =================
+    # Model Comparison Methods
+    # =================
+
+    def compare_models(
+        self,
+        model_paths: List[str],
+        dataset_path: Optional[str] = None,
+        benchmark: Optional[str] = None,
+        metrics: List[str] = None,
+        output_path: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Compare multiple models on the same evaluation.
+
+        Args:
+            model_paths: List of model paths to compare
+            dataset_path: Path to evaluation dataset
+            benchmark: Benchmark name for comparison
+            metrics: Metrics to compute
+            output_path: Output path for comparison results
+            **kwargs: Additional parameters
+
+        Returns:
+            Comparison results dictionary
+        """
+        if not output_path:
+            output_path = self._get_output_path("model_comparison", "comparison")
+
+        comparison_results = {
+            "models": model_paths,
+            "results": {},
+            "summary": {}
+        }
+
+        # Run evaluation for each model
+        for model_path in model_paths:
+            model_name = os.path.basename(model_path)
+            logger.info(f"Evaluating model: {model_name}")
+
+            try:
+                if dataset_path:
+                    # Custom dataset evaluation
+                    results = self.evaluate_llm(
+                        model_path=model_path,
+                        dataset_path=dataset_path,
+                        metrics=metrics,
+                        **kwargs
+                    )
+                elif benchmark:
+                    # Benchmark evaluation
+                    results = self.run_benchmark(
+                        model_path=model_path,
+                        benchmark=benchmark,
+                        **kwargs
+                    )
+                else:
+                    raise ValueError("Either dataset_path or benchmark must be provided")
+
+                comparison_results["results"][model_name] = results
+
+            except Exception as e:
+                logger.error(f"Failed to evaluate {model_name}: {e}")
+                comparison_results["results"][model_name] = {"error": str(e)}
+
+        # Generate summary
+        comparison_results["summary"] = self._generate_comparison_summary(
+            comparison_results["results"]
+        )
+
+        # Save results
+        with open(output_path, 'w') as f:
+            json.dump(comparison_results, f, indent=2)
+
+        return comparison_results
+
+    def _generate_comparison_summary(self, results: Dict[str, Any]) -> Dict[str, Any]:
+        """Generate summary statistics for model comparison."""
+        summary = {
+            "best_performing": {},
+            "rankings": {},
+            "average_scores": {}
+        }
+
+        # Extract key metrics and find best performing models
+        for model_name, model_results in results.items():
+            if "error" in model_results:
+                continue
+
+            # Extract main scores (this is simplified - would need more sophisticated logic)
+            if "accuracy" in model_results:
+                summary["average_scores"][model_name] = model_results["accuracy"]
+            elif "overall_score" in model_results:
+                summary["average_scores"][model_name] = model_results["overall_score"]
+
+        # Rank models by performance
+        if summary["average_scores"]:
+            ranked = sorted(
+                summary["average_scores"].items(),
+                key=lambda x: x[1],
+                reverse=True
+            )
+            summary["rankings"] = {i+1: model for i, (model, score) in enumerate(ranked)}
+            summary["best_performing"]["model"] = ranked[0][0]
+            summary["best_performing"]["score"] = ranked[0][1]
+
+        return summary
+
+    # =================
+    # Image Model Evaluation Methods
+    # =================
+
+    def evaluate_image_model(
+        self,
+        model_path: str,
+        test_images_dir: str,
+        reference_images_dir: Optional[str] = None,
+        metrics: List[str] = None,
+        output_path: Optional[str] = None,
+        **kwargs
+    ) -> Dict[str, Any]:
+        """
+        Evaluate image generation model.
+
+        Args:
+            model_path: Path to the image model
+            test_images_dir: Directory with test images
+            reference_images_dir: Directory with reference images
+            metrics: Metrics to compute ["fid", "is", "lpips"]
+            output_path: Output path for results
+            **kwargs: Additional parameters
+
+        Returns:
+            Image evaluation results
+        """
+        if metrics is None:
+            metrics = ["fid", "is"]
+
+        if not output_path:
+            output_path = self._get_output_path(model_path, "image_eval")
+
+        results = self.image_metrics.evaluate(
+            model_path=model_path,
+            test_images_dir=test_images_dir,
+            reference_images_dir=reference_images_dir,
+            metrics=metrics,
+            **kwargs
+        )
+
+        # Save results
+        with open(output_path, 'w') as f:
+            json.dump(results, f, indent=2)
+
+        return results
+
+    # =================
+    # Utility Methods
+    # =================
+
+    def load_results(self, results_path: str) -> Dict[str, Any]:
+        """Load evaluation results from file."""
+        with open(results_path, 'r') as f:
+            return json.load(f)
+
+    def list_evaluation_results(self) -> List[Dict[str, Any]]:
+        """List all evaluation results in the output directory."""
+        results = []
+
+        if os.path.exists(self.output_dir):
+            for filename in os.listdir(self.output_dir):
+                if filename.endswith('.json'):
+                    filepath = os.path.join(self.output_dir, filename)
+                    try:
+                        with open(filepath, 'r') as f:
+                            data = json.load(f)
+                        results.append({
+                            "filename": filename,
+                            "path": filepath,
+                            "metadata": data.get("metadata", {}),
+                            "created": datetime.datetime.fromtimestamp(
+                                os.path.getctime(filepath)
+                            ).isoformat()
+                        })
+                    except Exception as e:
+                        logger.warning(f"Failed to load {filename}: {e}")
+
+        return sorted(results, key=lambda x: x["created"], reverse=True)
+
+    def generate_report(
+        self,
+        results_paths: List[str],
+        output_path: Optional[str] = None,
+        format: str = "json"
+    ) -> str:
+        """
+        Generate evaluation report from multiple results.
+
+        Args:
+            results_paths: List of result file paths
+            output_path: Output path for report
+            format: Report format ("json", "html", "markdown")
+
+        Returns:
+            Path to generated report
+        """
+        if not output_path:
+            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+            output_path = os.path.join(self.output_dir, f"evaluation_report_{timestamp}.{format}")
+
+        # Load all results
+        all_results = []
+        for path in results_paths:
+            try:
+                results = self.load_results(path)
+                all_results.append(results)
+            except Exception as e:
+                logger.warning(f"Failed to load results from {path}: {e}")
+
+        # Generate report based on format
+        if format == "json":
+            report_data = {
+                "report_generated": datetime.datetime.now().isoformat(),
+                "num_evaluations": len(all_results),
+                "results": all_results
+            }
+
+            with open(output_path, 'w') as f:
+                json.dump(report_data, f, indent=2)
+
+        # TODO: Implement HTML and Markdown report generation
+
+        logger.info(f"Evaluation report generated: {output_path}")
+        return output_path