themis-eval 0.2.2__tar.gz → 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {themis_eval-0.2.2/themis_eval.egg-info → themis_eval-1.0.0}/PKG-INFO +47 -34
- {themis_eval-0.2.2 → themis_eval-1.0.0}/README.md +46 -33
- {themis_eval-0.2.2 → themis_eval-1.0.0}/pyproject.toml +1 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/__init__.py +5 -2
- themis_eval-1.0.0/themis/_version.py +30 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/api.py +83 -145
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/storage.py +5 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/info.py +2 -11
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/main.py +231 -40
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/engine.py +7 -13
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/entities.py +4 -0
- themis_eval-1.0.0/themis/evaluation/metric_pipeline.py +12 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipeline.py +22 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/__init__.py +4 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/composable_pipeline.py +55 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/pipelines/standard_pipeline.py +18 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +5 -2
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/judge_evaluation_strategy.py +6 -1
- themis_eval-1.0.0/themis/experiment/__init__.py +5 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/cache_manager.py +15 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/definitions.py +1 -1
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/orchestrator.py +21 -11
- themis_eval-1.0.0/themis/experiment/share.py +264 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/storage.py +345 -298
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/plan.py +28 -6
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/router.py +22 -4
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/runner.py +16 -1
- themis_eval-1.0.0/themis/presets/benchmarks.py +939 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/server/app.py +38 -26
- themis_eval-1.0.0/themis/session.py +125 -0
- themis_eval-1.0.0/themis/specs/__init__.py +7 -0
- themis_eval-1.0.0/themis/specs/execution.py +26 -0
- themis_eval-1.0.0/themis/specs/experiment.py +33 -0
- themis_eval-1.0.0/themis/specs/storage.py +18 -0
- themis_eval-1.0.0/themis/storage/__init__.py +6 -0
- themis_eval-1.0.0/themis/storage/experiment_storage.py +7 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0/themis_eval.egg-info}/PKG-INFO +47 -34
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/SOURCES.txt +9 -2
- themis_eval-0.2.2/themis/_version.py +0 -17
- themis_eval-0.2.2/themis/experiment/__init__.py +0 -5
- themis_eval-0.2.2/themis/experiment/builder.py +0 -151
- themis_eval-0.2.2/themis/experiment/export_csv.py +0 -159
- themis_eval-0.2.2/themis/presets/benchmarks.py +0 -354
- {themis_eval-0.2.2 → themis_eval-1.0.0}/LICENSE +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/setup.cfg +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/tests/test_package_metadata.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/backends/execution.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/__main__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/benchmarks.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/comparison.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/config_commands.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/cost.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/demo.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/leaderboard.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/math_benchmarks.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/mcq_benchmarks.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/results.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/sample_run.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/commands/visualize.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/new_project.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/cli/utils.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/reports.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/comparison/statistics.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/loader.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/registry.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/runtime.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/config/schema.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/conversation.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/serialization.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/tools.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/core/types.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/base.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/commonsense_qa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/competition_math.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/coqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gpqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gsm8k.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/gsm_symbolic.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/math500.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/med_qa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/medmcqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/mmlu_pro.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/piqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/registry.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/schema.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/sciq.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/social_i_qa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/datasets/super_gpqa.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/conditional.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/error_taxonomy_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/exceptions.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/identity_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/json_field_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/math_verify_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/extractors/regex_extractor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/math_verify_utils.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/codebleu.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/execution.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/code/pass_at_k.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/composite_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/consistency_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/exact_match.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/length_difference_tolerance.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/math_verify_accuracy.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/bertscore.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/bleu.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/meteor.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/nlp/rouge.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/pairwise_judge_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/response_length.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/metrics/rubric_judge_metric.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/reports.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/bootstrap.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/confidence_intervals.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/distributions.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/effect_sizes.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/hypothesis_tests.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/statistics/types.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/default_evaluation_strategy.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/evaluation/strategies/evaluation_strategy.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/comparison.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/cost.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/export.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/integration_manager.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/math.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/mcq.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/pricing.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/experiment/visualization.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/agentic_runner.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/batching.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/clients.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/conversation_runner.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/providers/litellm_provider.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/providers/vllm_provider.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/strategies.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/templates.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/turn_strategies.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/generation/types.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/huggingface.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/integrations/wandb.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/interfaces/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/presets/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/presets/models.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/definitions.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/project/patterns.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/providers/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/providers/registry.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/py.typed +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/server/__init__.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/api_generator.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/cost_tracking.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/dashboard.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/logging_utils.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/progress.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis/utils/tracing.py +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/dependency_links.txt +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/requires.txt +0 -0
- {themis_eval-0.2.2 → themis_eval-1.0.0}/themis_eval.egg-info/top_level.txt +0 -0
{themis_eval-0.2.2/themis_eval.egg-info → themis_eval-1.0.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: themis-eval
-Version: 0.2.2
+Version: 1.0.0
 Summary: Lightweight evaluation platform for LLM experiments
 Author: Pittawat Taveekitworachai
 License: MIT
@@ -100,13 +100,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-
-
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
@@ -122,6 +123,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -130,20 +134,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(
-evaluate(
-evaluate(
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(
-evaluate(
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -165,8 +177,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -192,7 +203,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -218,19 +229,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -239,8 +250,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -275,14 +285,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
    model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -335,7 +344,7 @@ Themis is built on a clean, modular architecture:
      │            │
 ┌────▼─────┐ ┌────▼─────┐
 │Benchmarks│ │Evaluation│
-│(
+│(19 built-│ │ Pipeline │
 │   in)    │ └────┬─────┘
 └──────────┘      │
              ┌────▼─────┐
@@ -359,7 +368,7 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -382,14 +391,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -401,8 +409,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -454,10 +461,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME     Restrict to one metric
 #   --storage PATH    Storage directory
-#   --test STR        Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT     Significance level (default: 0.05)
 #   --output FILE     Export report (.json, .html, .md)
+#   --show-diff       Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -539,6 +546,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---
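The hunks above capture the main API change in 1.0.0: `evaluate()` now takes the benchmark as its first positional argument instead of a `benchmark=` keyword, and reports expose per-metric aggregates under `evaluation_report.metrics`. A minimal before/after sketch (the 0.2.2 form is reconstructed from the truncated removed lines, so treat it as an assumption):

```python
from themis import evaluate

# 0.2.2 style (assumed; the removed lines are truncated in this diff):
# result = evaluate(benchmark="gsm8k", model="gpt-4", limit=100)

# 1.0.0 style, as shown in the updated README:
report = evaluate("gsm8k", model="gpt-4", limit=100)

# Aggregates are keyed by metric class name on the evaluation report.
accuracy = report.evaluation_report.metrics["ExactMatch"].mean
print(f"Accuracy: {accuracy:.2%}")
```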
{themis_eval-0.2.2 → themis_eval-1.0.0}/README.md

@@ -41,13 +41,14 @@ pip install themis-eval[math,nlp,code,server]
 from themis import evaluate
 
 # Evaluate any model on any benchmark
-
-
+report = evaluate(
+    "gsm8k",
     model="gpt-4",
-    limit=100
+    limit=100,
 )
 
-
+accuracy = report.evaluation_report.metrics["ExactMatch"].mean
+print(f"Accuracy: {accuracy:.2%}")
 ```
@@ -63,6 +64,9 @@ themis compare gpt4-run claude-run
 
 # Start web dashboard
 themis serve
+
+# Share a run
+themis share gpt4-run --output-dir share
 ```
 
 ---
@@ -71,20 +75,28 @@ themis serve
 
 ### 🎯 Built-in Benchmarks
 
-Themis includes
+Themis includes 19 built-in benchmarks out-of-the-box:
 
 ```python
 # Math reasoning
-evaluate(
-evaluate(
-evaluate(
+evaluate("gsm8k", model="gpt-4", limit=100)
+evaluate("math500", model="gpt-4", limit=50)
+evaluate("aime24", model="gpt-4")
 
 # General knowledge
-evaluate(
-evaluate(
+evaluate("mmlu-pro", model="gpt-4", limit=1000)
+evaluate("supergpqa", model="gpt-4")
+
+# Science & medical
+evaluate("gpqa", model="gpt-4", limit=200)
+evaluate("medmcqa", model="gpt-4", limit=200)
+
+# Commonsense & conversational
+evaluate("commonsense_qa", model="gpt-4", limit=200)
+evaluate("coqa", model="gpt-4", limit=200)
 
 # Quick testing
-evaluate(
+evaluate("demo", model="fake-math-llm", limit=10)
 ```
 
 **See all available benchmarks:**
@@ -106,8 +118,7 @@ themis list benchmarks
 
 ```python
 # Use specific metrics
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     metrics=["exact_match", "bleu", "rouge1"],
 )
@@ -133,7 +144,7 @@ print(report.summary())
 
 **CLI:**
 ```bash
-themis compare run-1 run-2 --
+themis compare run-1 run-2 --output comparison.html
 ```
 
 ### 🌐 Web Dashboard
@@ -159,19 +170,19 @@ Themis uses [LiteLLM](https://github.com/BerriAI/litellm) for broad provider sup
 
 ```python
 # OpenAI
-evaluate(
+evaluate("gsm8k", model="gpt-4")
 
 # Anthropic
-evaluate(
+evaluate("gsm8k", model="claude-3-opus-20240229")
 
 # Azure OpenAI
-evaluate(
+evaluate("gsm8k", model="azure/gpt-4")
 
 # Local models (vLLM, Ollama, etc.)
-evaluate(
+evaluate("gsm8k", model="ollama/llama3")
 
 # AWS Bedrock
-evaluate(
+evaluate("gsm8k", model="bedrock/anthropic.claude-3")
 ```
 
 ### 💾 Smart Caching
@@ -180,8 +191,7 @@ Themis automatically caches results and resumes failed runs:
 
 ```python
 # Run with caching
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     limit=1000,
     run_id="my-experiment",
@@ -216,14 +226,13 @@ result = evaluate(
     metrics=["exact_match"],
 )
 
-print(result.
+print(result.evaluation_report.metrics["ExactMatch"].mean)
 ```
 
 ### Advanced Configuration
 
 ```python
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     temperature=0.7,
     max_tokens=512,
@@ -276,7 +285,7 @@ Themis is built on a clean, modular architecture:
      │            │
 ┌────▼─────┐ ┌────▼─────┐
 │Benchmarks│ │Evaluation│
-│(
+│(19 built-│ │ Pipeline │
 │   in)    │ └────┬─────┘
 └──────────┘      │
              ┌────▼─────┐
@@ -300,7 +309,7 @@ Themis is built on a clean, modular architecture:
 
 - **[API Reference](docs/index.md)** - Detailed API documentation
 - **[Examples](examples-simple/)** - Runnable code examples
-- **[
+- **[Backends API](docs/api/backends.md)** - Custom storage and execution
 - **[API Server](docs/reference/api-server.md)** - Web dashboard and REST API
 - **[Comparison Engine](docs/guides/comparison.md)** - Statistical testing guide
 
@@ -323,14 +332,13 @@ class S3StorageBackend(StorageBackend):
     # ... implement other methods
 
 # Use custom backend
-result = evaluate(
-    benchmark="gsm8k",
+result = evaluate("gsm8k",
     model="gpt-4",
     storage_backend=S3StorageBackend(bucket="my-bucket")
 )
 ```
 
-See [docs/
+See [docs/api/backends.md](docs/api/backends.md) for details.
 
 ### Distributed Execution
 
@@ -342,8 +350,7 @@ class RayExecutionBackend(ExecutionBackend):
     """Distributed execution with Ray"""
     # ... implementation
 
-result = evaluate(
-    benchmark="math500",
+result = evaluate("math500",
     model="gpt-4",
     execution_backend=RayExecutionBackend(num_cpus=32)
 )
@@ -395,10 +402,10 @@ themis eval <benchmark> --model <model> [options]
 themis compare <run-id-1> <run-id-2> [run-id-3...] [options]
 
 # Options:
+#   --metric NAME     Restrict to one metric
 #   --storage PATH    Storage directory
-#   --test STR        Statistical test: t_test, bootstrap, permutation
-#   --alpha FLOAT     Significance level (default: 0.05)
 #   --output FILE     Export report (.json, .html, .md)
+#   --show-diff       Include detailed per-sample differences in summary
 ```
 
 ### Server
@@ -480,6 +487,12 @@ uv run python examples-simple/04_comparison.py
 
 # API server example
 uv run python examples-simple/05_api_server.py
+
+# Resume/cache example
+uv run python examples-simple/08_resume_cache.py
+
+# End-to-end research loop example
+uv run python examples-simple/09_research_loop.py
 ```
 
 ---
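The Smart Caching hunk keeps `run_id`-based runs; a short sketch of the resume flow it describes (exact cache-hit behavior beyond what the README states is an assumption):

```python
from themis import evaluate

# First call generates results and stores them under run_id.
report = evaluate("gsm8k", model="gpt-4", limit=1000, run_id="my-experiment")

# Repeating the call with the same run_id should reuse cached generations
# instead of re-querying the model, per the "Smart Caching" section.
report = evaluate("gsm8k", model="gpt-4", limit=1000, run_id="my-experiment")
```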
{themis_eval-0.2.2 → themis_eval-1.0.0}/themis/__init__.py

@@ -12,9 +12,10 @@ Extension APIs for registering custom components:
 - themis.register_benchmark() - Register custom benchmark presets
 """
 
-from themis import config, core, evaluation,
+from themis import config, core, evaluation, generation, project, session
 from themis._version import __version__
 from themis.api import evaluate, get_registered_metrics, register_metric
+from themis.session import ExperimentSession
 from themis.datasets import register_dataset, list_datasets, is_dataset_registered
 from themis.presets import register_benchmark, list_benchmarks, get_benchmark_preset
 from themis.providers import register_provider
@@ -39,9 +40,11 @@ __all__ = [
     "config",
     "core",
     "evaluation",
-    "experiment",
     "generation",
     "project",
+    "session",
+    # Session API
+    "ExperimentSession",
     # Version
     "__version__",
 ]
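After this change the package root re-exports the new session API alongside the existing registration helpers. A sketch of the resulting import surface; `ExperimentSession`'s constructor is not shown in this diff, so only the import is demonstrated:

```python
import themis
from themis import ExperimentSession  # new in 1.0.0, defined in themis/session.py

print(themis.__version__)                     # "1.0.0" for this release
print("ExperimentSession" in themis.__all__)  # True, per the updated __all__
```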
themis_eval-1.0.0/themis/_version.py (new file)

@@ -0,0 +1,30 @@
+"""Package version helpers."""
+
+from __future__ import annotations
+
+from importlib import metadata
+from pathlib import Path
+import tomllib
+
+
+def _read_local_pyproject_version() -> str:
+    """Return the version declared in pyproject.toml for local development."""
+    pyproject_path = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    try:
+        with pyproject_path.open("rb") as fh:
+            data = tomllib.load(fh)
+    except FileNotFoundError:
+        return "0.0.0"
+    return data.get("project", {}).get("version", "0.0.0")
+
+
+def _detect_version() -> str:
+    try:
+        return metadata.version("themis-eval")
+    except metadata.PackageNotFoundError:  # pragma: no cover - local dev only
+        return _read_local_pyproject_version()
+
+
+__version__ = _detect_version()
+
+__all__ = ["__version__"]
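The new `_version.py` prefers the installed distribution's metadata and falls back to `pyproject.toml` in a source checkout. The same resolution pattern in isolation (a generic sketch, not Themis code):

```python
from importlib import metadata


def package_version(dist_name: str) -> str:
    """Resolve a distribution's version, mirroring themis._version's approach."""
    try:
        # Installed case: read the version from the package metadata.
        return metadata.version(dist_name)
    except metadata.PackageNotFoundError:
        # Source-checkout case: themis parses pyproject.toml here; this
        # sketch returns a sentinel to stay self-contained.
        return "0.0.0"


print(package_version("themis-eval"))
```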