themis-eval 0.2.0__tar.gz → 0.2.2__tar.gz
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- {themis_eval-0.2.0/themis_eval.egg-info → themis_eval-0.2.2}/PKG-INFO +6 -5
- {themis_eval-0.2.0 → themis_eval-0.2.2}/README.md +4 -4
- {themis_eval-0.2.0 → themis_eval-0.2.2}/pyproject.toml +5 -1
- themis_eval-0.2.2/themis/__init__.py +47 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/_version.py +1 -1
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/api.py +156 -17
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/orchestrator.py +61 -5
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/storage.py +163 -19
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/providers/litellm_provider.py +46 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/runner.py +22 -6
- themis_eval-0.2.2/themis/presets/__init__.py +21 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/logging_utils.py +8 -3
- themis_eval-0.2.2/themis/utils/progress.py +77 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2/themis_eval.egg-info}/PKG-INFO +6 -5
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/requires.txt +1 -0
- themis_eval-0.2.0/themis/__init__.py +0 -25
- themis_eval-0.2.0/themis/presets/__init__.py +0 -10
- themis_eval-0.2.0/themis/utils/progress.py +0 -58
- {themis_eval-0.2.0 → themis_eval-0.2.2}/LICENSE +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/setup.cfg +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/tests/test_package_metadata.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/backends/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/backends/execution.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/backends/storage.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/__main__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/benchmarks.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/comparison.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/config_commands.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/cost.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/demo.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/info.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/leaderboard.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/math_benchmarks.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/mcq_benchmarks.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/results.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/sample_run.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/commands/visualize.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/main.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/new_project.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/cli/utils.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/engine.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/reports.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/comparison/statistics.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/loader.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/registry.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/runtime.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/config/schema.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/conversation.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/entities.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/serialization.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/tools.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/core/types.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/base.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/commonsense_qa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/competition_math.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/coqa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/gpqa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/gsm8k.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/gsm_symbolic.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/math500.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/med_qa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/medmcqa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/mmlu_pro.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/piqa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/registry.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/schema.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/sciq.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/social_i_qa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/datasets/super_gpqa.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/conditional.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/error_taxonomy_extractor.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/exceptions.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/identity_extractor.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/json_field_extractor.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/math_verify_extractor.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/extractors/regex_extractor.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/math_verify_utils.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/codebleu.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/execution.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/code/pass_at_k.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/composite_metric.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/consistency_metric.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/exact_match.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/length_difference_tolerance.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/math_verify_accuracy.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/bertscore.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/bleu.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/meteor.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/nlp/rouge.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/pairwise_judge_metric.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/response_length.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/metrics/rubric_judge_metric.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipeline.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipelines/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipelines/composable_pipeline.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/pipelines/standard_pipeline.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/reports.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/bootstrap.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/confidence_intervals.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/distributions.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/effect_sizes.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/hypothesis_tests.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/statistics/types.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/default_evaluation_strategy.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/evaluation_strategy.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/evaluation/strategies/judge_evaluation_strategy.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/builder.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/cache_manager.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/comparison.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/cost.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/definitions.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/export.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/export_csv.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/integration_manager.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/math.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/mcq.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/pricing.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/visualization.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/agentic_runner.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/batching.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/clients.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/conversation_runner.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/plan.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/providers/vllm_provider.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/router.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/strategies.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/templates.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/turn_strategies.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/generation/types.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/integrations/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/integrations/huggingface.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/integrations/wandb.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/interfaces/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/presets/benchmarks.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/presets/models.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/project/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/project/definitions.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/project/patterns.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/providers/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/providers/registry.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/py.typed +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/server/__init__.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/server/app.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/api_generator.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/cost_tracking.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/dashboard.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis/utils/tracing.py +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/SOURCES.txt +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/dependency_links.txt +0 -0
- {themis_eval-0.2.0 → themis_eval-0.2.2}/themis_eval.egg-info/top_level.txt +0 -0
{themis_eval-0.2.0/themis_eval.egg-info → themis_eval-0.2.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.0
+Version: 0.2.2
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -25,6 +25,7 @@ Requires-Dist: tabulate>=0.9.0
 Requires-Dist: tenacity>=9.1.2
 Requires-Dist: plotly>=6.5.0
 Requires-Dist: math-verify>=0.8.0
+Requires-Dist: rich>=14.2.0
 Provides-Extra: dev
 Requires-Dist: pytest>=8.0; extra == "dev"
 Requires-Dist: pytest-cov>=6.0.0; extra == "dev"
@@ -358,9 +359,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/
-- **[API Server](docs/
-- **[Comparison Engine](docs/
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -388,7 +389,7 @@ result = evaluate(
 )
 ```
 
-See [
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
{themis_eval-0.2.0 → themis_eval-0.2.2}/README.md

@@ -300,9 +300,9 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[Extending Backends](docs/
-- **[API Server](docs/
-- **[Comparison Engine](docs/
+- **[Extending Backends](docs/customization/backends.md)** - Custom storage and execution
+- **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
+- **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
 ---
 
@@ -330,7 +330,7 @@ result = evaluate(
 )
 ```
 
-See [
+See [docs/customization/backends.md](docs/customization/backends.md) for details.
 
 ### Distributed Execution
 
{themis_eval-0.2.0 → themis_eval-0.2.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "themis-eval"
-version = "0.2.0"
+version = "0.2.2"
 description = "Lightweight evaluation platform for LLM experiments"
 readme = "README.md"
 requires-python = ">=3.12"
@@ -32,6 +32,7 @@ dependencies = [
     "tenacity>=9.1.2",
     "plotly>=6.5.0",
     "math-verify>=0.8.0",
+    "rich>=14.2.0",
 ]
 
 [tool.setuptools.packages.find]
@@ -85,3 +86,6 @@ all = [
 [tool.pytest.ini_options]
 addopts = "-q"
 pythonpath = ["."]
+markers = [
+    "slow: marks tests as slow (deselect with '-m \"not slow\"')",
+]
themis_eval-0.2.2/themis/__init__.py (new file)

@@ -0,0 +1,47 @@
+"""Themis experiment platform - Dead simple LLM evaluation.
+
+The primary interface is the `evaluate()` function:
+
+    import themis
+    report = themis.evaluate("math500", model="gpt-4", limit=100)
+
+Extension APIs for registering custom components:
+- themis.register_metric() - Register custom metrics
+- themis.register_dataset() - Register custom datasets
+- themis.register_provider() - Register custom model providers
+- themis.register_benchmark() - Register custom benchmark presets
+"""
+
+from themis import config, core, evaluation, experiment, generation, project
+from themis._version import __version__
+from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.datasets import register_dataset, list_datasets, is_dataset_registered
+from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
+from themis.providers import register_provider
+
+__all__ = [
+    # Main API
+    "evaluate",
+    # Metrics
+    "register_metric",
+    "get_registered_metrics",
+    # Datasets
+    "register_dataset",
+    "list_datasets",
+    "is_dataset_registered",
+    # Benchmarks
+    "register_benchmark",
+    "list_benchmarks",
+    "get_benchmark_preset",
+    # Providers
+    "register_provider",
+    # Submodules
+    "config",
+    "core",
+    "evaluation",
+    "experiment",
+    "generation",
+    "project",
+    # Version
+    "__version__",
+]
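With this new `__init__.py`, the whole workflow can be driven from a single `import themis`: `evaluate()` plus the registration and listing helpers are all re-exported at the top level. A minimal sketch of that surface, assuming the package is installed and credentials for the chosen model string are configured (the benchmark name, model, and limit are illustrative, taken from the module docstring above):

```python
import themis

# Discover what ships with the installed package; the exact names depend on the presets.
print(themis.list_benchmarks())
print(themis.list_datasets())

# Run a small evaluation against a preset benchmark.
report = themis.evaluate("math500", model="gpt-4", limit=100)

# Aggregate metrics live on the evaluation report (see the api.py docstring below).
print(report.evaluation_report.metrics)
```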
{themis_eval-0.2.0 → themis_eval-0.2.2}/themis/_version.py

@@ -9,7 +9,7 @@ def _detect_version() -> str:
     try:
         return metadata.version("themis-eval")
     except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
-        return "0.2.0"
+        return "0.2.2"  # Fallback for development
 
 
 __version__ = _detect_version()
{themis_eval-0.2.0 → themis_eval-0.2.2}/themis/api.py

@@ -33,6 +33,7 @@ Example:
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime
 from pathlib import Path
 from typing import Any, Callable, Sequence
@@ -52,6 +53,67 @@ from themis.generation.runner import GenerationRunner
 from themis.generation.templates import PromptTemplate
 from themis.providers import create_provider
 
+# Import provider modules to ensure they register themselves
+try:
+    from themis.generation import clients  # noqa: F401 - registers fake provider
+    from themis.generation.providers import (
+        litellm_provider,  # noqa: F401
+        vllm_provider,  # noqa: F401
+    )
+except ImportError:
+    pass
+
+logger = logging.getLogger(__name__)
+
+
+# Module-level metrics registry for custom metrics
+_METRICS_REGISTRY: dict[str, type] = {}
+
+
+def register_metric(name: str, metric_cls: type) -> None:
+    """Register a custom metric for use in evaluate().
+
+    This allows users to add their own metrics to Themis without modifying
+    the source code. Registered metrics can be used by passing their names
+    to the `metrics` parameter in evaluate().
+
+    Args:
+        name: Metric name (used in evaluate(metrics=[name]))
+        metric_cls: Metric class implementing the Metric interface.
+            Must have a compute() method that takes prediction, references,
+            and metadata parameters.
+
+    Raises:
+        TypeError: If metric_cls is not a class
+        ValueError: If metric_cls doesn't implement the required interface
+
+    Example:
+        >>> from themis.evaluation.metrics import MyCustomMetric
+        >>> themis.register_metric("my_metric", MyCustomMetric)
+        >>> report = themis.evaluate("math500", model="gpt-4", metrics=["my_metric"])
+    """
+    if not isinstance(metric_cls, type):
+        raise TypeError(f"metric_cls must be a class, got {type(metric_cls)}")
+
+    # Validate that it implements the Metric interface
+    if not hasattr(metric_cls, "compute"):
+        raise ValueError(
+            f"{metric_cls.__name__} must implement compute() method. "
+            f"See themis.evaluation.metrics for examples."
+        )
+
+    _METRICS_REGISTRY[name] = metric_cls
+    logger.info(f"Registered custom metric: {name} -> {metric_cls.__name__}")
+
+
+def get_registered_metrics() -> dict[str, type]:
+    """Get all currently registered custom metrics.
+
+    Returns:
+        Dictionary mapping metric names to their classes
+    """
+    return _METRICS_REGISTRY.copy()
+
 
 def evaluate(
     benchmark_or_dataset: str | Sequence[dict[str, Any]],
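The new `register_metric()` hook only checks that the registered object is a class exposing a `compute()` method, so a user-defined metric can be very small. The sketch below is hypothetical: the class, its `name` attribute, and the exact `compute()` signature are assumptions based on the docstring above (prediction, references, metadata) and on the `m.name` logging shown later, not copied from the library:

```python
import themis


class ContainsAnswer:
    """Hypothetical metric: scores 1.0 when any reference string appears in the prediction."""

    name = "contains_answer"  # assumed attribute; evaluate() later logs metric names via m.name

    def compute(self, prediction, references, metadata=None):
        # Return a plain 0/1 score; the real Metric interface may expect a richer result object.
        return float(any(ref in prediction for ref in references))


themis.register_metric("contains_answer", ContainsAnswer)
print(themis.get_registered_metrics())  # {'contains_answer': <class '...ContainsAnswer'>}
```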
@@ -123,6 +185,19 @@ def evaluate(
         >>> print(f"Accuracy: {report.evaluation_report.metrics['accuracy']:.2%}")
         Accuracy: 85.00%
     """
+    logger.info("=" * 60)
+    logger.info("Starting Themis evaluation")
+    logger.info(f"Model: {model}")
+    logger.info(f"Workers: {workers}")
+    logger.info(f"Temperature: {temperature}, Max tokens: {max_tokens}")
+    if "api_base" in kwargs:
+        logger.info(f"Custom API base: {kwargs['api_base']}")
+    if "api_key" in kwargs:
+        logger.info("API key: <provided>")
+    else:
+        logger.warning("⚠️ No api_key provided - may fail for custom API endpoints")
+    logger.info("=" * 60)
+
     # Import presets system (lazy import to avoid circular dependencies)
     from themis.presets import get_benchmark_preset, parse_model_name
 
@@ -131,11 +206,23 @@
 
     if is_benchmark:
         benchmark_name = benchmark_or_dataset
+        logger.info(f"Loading benchmark: {benchmark_name}")
+
         # Get preset configuration
-
+        try:
+            preset = get_benchmark_preset(benchmark_name)
+        except Exception as e:
+            logger.error(f"❌ Failed to get benchmark preset '{benchmark_name}': {e}")
+            raise
 
         # Load dataset using preset loader
-        dataset
+        logger.info(f"Loading dataset (limit={limit})...")
+        try:
+            dataset = preset.load_dataset(limit=limit)
+            logger.info(f"✅ Loaded {len(dataset)} samples from {benchmark_name}")
+        except Exception as e:
+            logger.error(f"❌ Failed to load dataset: {e}")
+            raise
 
         # Use preset prompt if not overridden
         if prompt is None:
@@ -158,11 +245,14 @@
         dataset_id_field = preset.dataset_id_field
     else:
         # Custom dataset
+        logger.info("Using custom dataset")
         dataset = list(benchmark_or_dataset)
+        logger.info(f"Custom dataset has {len(dataset)} samples")
 
     # Limit dataset if requested
     if limit is not None:
         dataset = dataset[:limit]
+        logger.info(f"Limited to {len(dataset)} samples")
 
     # Use provided prompt or default
     if prompt is None:
@@ -188,7 +278,15 @@
         dataset_id_field = "id"
 
     # Parse model name to get provider and options
-
+    logger.info(f"Parsing model configuration...")
+    try:
+        provider_name, model_id, provider_options = parse_model_name(model, **kwargs)
+        logger.info(f"Provider: {provider_name}")
+        logger.info(f"Model ID: {model_id}")
+        logger.debug(f"Provider options: {provider_options}")
+    except Exception as e:
+        logger.error(f"❌ Failed to parse model name '{model}': {e}")
+        raise
 
     # Create model spec
     model_spec = ModelSpec(
@@ -214,17 +312,31 @@
     )
 
     # Create provider and router
-    provider
+    logger.info(f"Creating provider '{provider_name}'...")
+    try:
+        provider = create_provider(provider_name, **provider_options)
+        logger.info(f"✅ Provider created successfully")
+    except KeyError as e:
+        logger.error(f"❌ Provider '{provider_name}' not registered. Available providers: fake, litellm, openai, anthropic, azure, bedrock, gemini, cohere, vllm")
+        logger.error(f"   This usually means the provider module wasn't imported.")
+        raise
+    except Exception as e:
+        logger.error(f"❌ Failed to create provider: {e}")
+        raise
+
     router = ProviderRouter({model_id: provider})
+    logger.debug(f"Router configured for model: {model_id}")
 
     # Create runner
-    runner = GenerationRunner(provider=router)
+    runner = GenerationRunner(provider=router, max_parallel=workers)
+    logger.info(f"Runner configured with {workers} parallel workers")
 
     # Create evaluation pipeline
     pipeline = EvaluationPipeline(
         extractor=extractor,
         metrics=metrics_list,
     )
+    logger.info(f"Evaluation metrics: {[m.name for m in metrics_list]}")
 
     # Determine storage location
     if storage is None:
@@ -235,11 +347,15 @@
     # Generate run ID if not provided
     if run_id is None:
        run_id = f"run-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
+    logger.info(f"Run ID: {run_id}")
+    logger.info(f"Storage: {storage_dir}")
+    logger.info(f"Resume: {resume}")
 
     # Create storage backend
     if isinstance(storage_dir, Path):
         from themis.experiment.storage import ExperimentStorage
         storage_backend = ExperimentStorage(storage_dir)
+        logger.debug(f"Storage backend created at {storage_dir}")
     else:
         # Cloud storage (to be implemented in Phase 3)
         raise NotImplementedError(
@@ -264,15 +380,34 @@
     )
 
     # Run locally
-
-
-
-        run_id=run_id,
-        resume=resume,
-        on_result=on_result,
-    )
+    logger.info("=" * 60)
+    logger.info("🚀 Starting experiment execution...")
+    logger.info("=" * 60)
 
-
+    try:
+        report = orchestrator.run(
+            dataset=dataset,
+            max_samples=limit,
+            run_id=run_id,
+            resume=resume,
+            on_result=on_result,
+        )
+
+        logger.info("=" * 60)
+        logger.info("✅ Evaluation completed successfully!")
+        logger.info(f"   Total samples: {len(report.generation_results)}")
+        logger.info(f"   Successful: {report.metadata.get('successful_generations', 0)}")
+        logger.info(f"   Failed: {report.metadata.get('failed_generations', 0)}")
+        if report.evaluation_report.metrics:
+            logger.info(f"   Metrics: {list(report.evaluation_report.metrics.keys())}")
+        logger.info("=" * 60)
+
+        return report
+    except Exception as e:
+        logger.error("=" * 60)
+        logger.error(f"❌ Evaluation failed: {e}")
+        logger.error("=" * 60)
+        raise
 
 
 def _resolve_metrics(metric_names: list[str]) -> list:
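The reworked execution path wraps `orchestrator.run()` in logging and re-raises on failure, while still passing `run_id`, `resume`, and `on_result` straight through. A hedged sketch of how a caller might use those parameters to stream progress and resume an interrupted run; the callback argument is assumed from the `on_result(record)` call in the diff, and the benchmark/model strings are illustrative:

```python
import themis


def show(record):
    # Invoked once per generation record as results arrive (mirrors on_result(record) above).
    print("finished sample:", getattr(record, "task", None))


# First attempt; results are cached under run_id in local storage.
report = themis.evaluate("math500", model="gpt-4", limit=50,
                         run_id="run-demo", on_result=show)

# If the process is interrupted, rerunning with resume=True lets the orchestrator
# reuse cached generation and evaluation records instead of calling the model again.
report = themis.evaluate("math500", model="gpt-4", limit=50,
                         run_id="run-demo", resume=True, on_result=show)
```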
@@ -298,8 +433,8 @@ def _resolve_metrics(metric_names: list[str]) -> list:
     except ImportError:
         nlp_available = False
 
-    #
-
+    # Built-in metrics registry
+    BUILTIN_METRICS = {
         # Core metrics
         "exact_match": ExactMatch,
         "math_verify": MathVerifyAccuracy,
@@ -308,7 +443,7 @@ def _resolve_metrics(metric_names: list[str]) -> list:
 
     # Add NLP metrics if available
     if nlp_available:
-
+        BUILTIN_METRICS.update({
             "bleu": BLEU,
             "rouge1": lambda: ROUGE(variant=ROUGEVariant.ROUGE_1),
             "rouge2": lambda: ROUGE(variant=ROUGEVariant.ROUGE_2),
@@ -321,6 +456,10 @@ def _resolve_metrics(metric_names: list[str]) -> list:
         # "pass_at_k": PassAtK,
         # "codebleu": CodeBLEU,
 
+    # Merge built-in and custom metrics
+    # Custom metrics can override built-in metrics
+    METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
+
     metrics = []
     for name in metric_names:
         if name not in METRICS_REGISTRY:
@@ -340,4 +479,4 @@ def _resolve_metrics(metric_names: list[str]) -> list:
     return metrics
 
 
-__all__ = ["evaluate"]
+__all__ = ["evaluate", "register_metric", "get_registered_metrics"]
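Because the merged registry is built as `{**BUILTIN_METRICS, **_METRICS_REGISTRY}`, a custom metric registered under an existing name shadows the built-in one. A tiny illustration of that precedence rule, using placeholder strings instead of real metric classes (dictionary semantics only, independent of Themis):

```python
BUILTIN_METRICS = {"exact_match": "builtin-ExactMatch"}
_METRICS_REGISTRY = {"exact_match": "custom-ExactMatch"}

# Later keys win in a dict merge, so the custom registration overrides the built-in.
METRICS_REGISTRY = {**BUILTIN_METRICS, **_METRICS_REGISTRY}
print(METRICS_REGISTRY["exact_match"])  # custom-ExactMatch
```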
{themis_eval-0.2.0 → themis_eval-0.2.2}/themis/experiment/orchestrator.py

@@ -2,10 +2,13 @@
 
 from __future__ import annotations
 
+import logging
 from datetime import datetime, timezone
 from typing import Callable, Sequence
 
 from themis.config.schema import IntegrationsConfig
+
+logger = logging.getLogger(__name__)
 from themis.core.entities import (
     EvaluationRecord,
     ExperimentFailure,
@@ -102,6 +105,8 @@ class ExperimentOrchestrator:
         Returns:
             ExperimentReport with generation results, evaluation, and metadata
         """
+        logger.info("Orchestrator: Initializing experiment run")
+
         # Initialize integrations
         self._integrations.initialize_run(
             {
@@ -112,13 +117,23 @@ class ExperimentOrchestrator:
         )
 
         # Prepare dataset
-
-
-
+        logger.info("Orchestrator: Loading dataset...")
+        try:
+            dataset_list = self._resolve_dataset(
+                dataset=dataset, dataset_loader=dataset_loader, run_id=run_id
+            )
+            logger.info(f"Orchestrator: Dataset loaded ({len(dataset_list)} total samples)")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to load dataset: {e}")
+            raise
+
         selected_dataset = (
             dataset_list[:max_samples] if max_samples is not None else dataset_list
         )
         run_identifier = run_id or self._default_run_id()
+
+        logger.info(f"Orchestrator: Processing {len(selected_dataset)} samples")
+        logger.info(f"Orchestrator: Run ID = {run_identifier}")
 
         # Initialize run in storage (if storage exists and run doesn't exist)
         if self._cache.has_storage:
@@ -130,18 +145,30 @@ class ExperimentOrchestrator:
             self._cache.cache_dataset(run_identifier, dataset_list)
 
         # Expand dataset into generation tasks
-
+        logger.info("Orchestrator: Expanding dataset into generation tasks...")
+        try:
+            tasks = list(self._plan.expand(selected_dataset))
+            logger.info(f"Orchestrator: Created {len(tasks)} generation tasks")
+        except Exception as e:
+            logger.error(f"Orchestrator: ❌ Failed to expand dataset: {e}")
+            raise
 
         # Build evaluation configuration for cache invalidation
         evaluation_config = self._build_evaluation_config()
 
         # Load cached results if resuming
+        if resume:
+            logger.info("Orchestrator: Loading cached results...")
         cached_records = (
             self._cache.load_cached_records(run_identifier) if resume else {}
         )
         cached_evaluations = (
             self._cache.load_cached_evaluations(run_identifier, evaluation_config) if resume else {}
         )
+        if resume and cached_records:
+            logger.info(f"Orchestrator: Found {len(cached_records)} cached generation records")
+        if resume and cached_evaluations:
+            logger.info(f"Orchestrator: Found {len(cached_evaluations)} cached evaluation records")
 
         # Process tasks: use cached or run new generations
         generation_results: list[GenerationRecord] = []
@@ -178,9 +205,18 @@ class ExperimentOrchestrator:
 
         # Run pending generation tasks
         if pending_tasks:
+            logger.info(f"Orchestrator: Running {len(pending_tasks)} generation tasks...")
+            completed = 0
             for record in self._runner.run(pending_tasks):
+                logger.debug(f"Orchestrator: Received generation record")
                 generation_results.append(record)
+                completed += 1
+
+                # Log progress every 10 samples or at key milestones
+                if completed % 10 == 0 or completed == len(pending_tasks):
+                    logger.info(f"Orchestrator: Generation progress: {completed}/{len(pending_tasks)} ({100*completed//len(pending_tasks)}%)")
 
+                logger.debug(f"Orchestrator: Processing record (cost tracking...)")
                 # Track cost for successful generations
                 if record.output and record.output.usage:
                     usage = record.output.usage
@@ -197,6 +233,7 @@ class ExperimentOrchestrator:
                         cost=cost,
                     )
 
+                logger.debug(f"Orchestrator: Processing record (error handling...)")
                 if record.error:
                     failures.append(
                         ExperimentFailure(
@@ -204,20 +241,35 @@ class ExperimentOrchestrator:
                             message=record.error.message,
                         )
                     )
+
+                logger.debug(f"Orchestrator: Processing record (caching...)")
                 cache_key = experiment_storage.task_cache_key(record.task)
                 if cache_results:
                     self._cache.save_generation_record(
                         run_identifier, record, cache_key
                     )
+
+                logger.debug(f"Orchestrator: Processing record (adding to pending...)")
                 pending_records.append(record)
                 pending_keys.append(cache_key)
+
+                logger.debug(f"Orchestrator: Processing record (callback...)")
                 if on_result:
                     on_result(record)
+                logger.debug(f"Orchestrator: Record processing complete")
 
         # Evaluate pending records
+        logger.info(f"Orchestrator: Preparing to evaluate {len(pending_records)} pending records...")
         if pending_records:
-
+            logger.info(f"Orchestrator: Starting evaluation of {len(pending_records)} records...")
+            try:
+                new_evaluation_report = self._evaluation.evaluate(pending_records)
+                logger.info(f"Orchestrator: ✅ Evaluation complete - got {len(new_evaluation_report.records)} results")
+            except Exception as e:
+                logger.error(f"Orchestrator: ❌ Evaluation failed: {e}")
+                raise
         else:
+            logger.info("Orchestrator: No new records to evaluate (all cached)")
             new_evaluation_report = evaluation_pipeline.EvaluationReport(
                 metrics={}, failures=[], records=[]
             )
@@ -229,12 +281,16 @@ class ExperimentOrchestrator:
         )
 
         # Combine cached and new evaluations
+        logger.info("Orchestrator: Combining cached and new evaluations...")
        evaluation_report = self._combine_evaluations(
             cached_eval_records, new_evaluation_report
         )
+        logger.info(f"Orchestrator: Total evaluation records: {len(evaluation_report.records)}")
 
         # Get cost breakdown
         cost_breakdown = self._cost_tracker.get_breakdown()
+        if cost_breakdown.total_cost > 0:
+            logger.info(f"Orchestrator: Total cost: ${cost_breakdown.total_cost:.4f}")
 
         # Build metadata
         metadata = {