PyPI - themis-eval - Versions diffs - 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl - Mend

themis-eval 0.1.0 (py3-none-any.whl) → 0.2.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

themis/__init__.py +12 -1
themis/_version.py +2 -2
themis/api.py +343 -0
themis/backends/__init__.py +17 -0
themis/backends/execution.py +197 -0
themis/backends/storage.py +260 -0
themis/cli/__init__.py +5 -0
themis/cli/__main__.py +6 -0
themis/cli/commands/__init__.py +19 -0
themis/cli/commands/benchmarks.py +221 -0
themis/cli/commands/comparison.py +394 -0
themis/cli/commands/config_commands.py +244 -0
themis/cli/commands/cost.py +214 -0
themis/cli/commands/demo.py +68 -0
themis/cli/commands/info.py +90 -0
themis/cli/commands/leaderboard.py +362 -0
themis/cli/commands/math_benchmarks.py +318 -0
themis/cli/commands/mcq_benchmarks.py +207 -0
themis/cli/commands/results.py +252 -0
themis/cli/commands/sample_run.py +244 -0
themis/cli/commands/visualize.py +299 -0
themis/cli/main.py +463 -0
themis/cli/new_project.py +33 -0
themis/cli/utils.py +51 -0
themis/comparison/__init__.py +25 -0
themis/comparison/engine.py +348 -0
themis/comparison/reports.py +283 -0
themis/comparison/statistics.py +402 -0
themis/config/__init__.py +19 -0
themis/config/loader.py +27 -0
themis/config/registry.py +34 -0
themis/config/runtime.py +214 -0
themis/config/schema.py +112 -0
themis/core/__init__.py +5 -0
themis/core/conversation.py +354 -0
themis/core/entities.py +184 -0
themis/core/serialization.py +231 -0
themis/core/tools.py +393 -0
themis/core/types.py +141 -0
themis/datasets/__init__.py +273 -0
themis/datasets/base.py +264 -0
themis/datasets/commonsense_qa.py +174 -0
themis/datasets/competition_math.py +265 -0
themis/datasets/coqa.py +133 -0
themis/datasets/gpqa.py +190 -0
themis/datasets/gsm8k.py +123 -0
themis/datasets/gsm_symbolic.py +124 -0
themis/datasets/math500.py +122 -0
themis/datasets/med_qa.py +179 -0
themis/datasets/medmcqa.py +169 -0
themis/datasets/mmlu_pro.py +262 -0
themis/datasets/piqa.py +146 -0
themis/datasets/registry.py +201 -0
themis/datasets/schema.py +245 -0
themis/datasets/sciq.py +150 -0
themis/datasets/social_i_qa.py +151 -0
themis/datasets/super_gpqa.py +263 -0
themis/evaluation/__init__.py +1 -0
themis/evaluation/conditional.py +410 -0
themis/evaluation/extractors/__init__.py +19 -0
themis/evaluation/extractors/error_taxonomy_extractor.py +80 -0
themis/evaluation/extractors/exceptions.py +7 -0
themis/evaluation/extractors/identity_extractor.py +29 -0
themis/evaluation/extractors/json_field_extractor.py +45 -0
themis/evaluation/extractors/math_verify_extractor.py +37 -0
themis/evaluation/extractors/regex_extractor.py +43 -0
themis/evaluation/math_verify_utils.py +87 -0
themis/evaluation/metrics/__init__.py +21 -0
themis/evaluation/metrics/code/__init__.py +19 -0
themis/evaluation/metrics/code/codebleu.py +144 -0
themis/evaluation/metrics/code/execution.py +280 -0
themis/evaluation/metrics/code/pass_at_k.py +181 -0
themis/evaluation/metrics/composite_metric.py +47 -0
themis/evaluation/metrics/consistency_metric.py +80 -0
themis/evaluation/metrics/exact_match.py +51 -0
themis/evaluation/metrics/length_difference_tolerance.py +33 -0
themis/evaluation/metrics/math_verify_accuracy.py +40 -0
themis/evaluation/metrics/nlp/__init__.py +21 -0
themis/evaluation/metrics/nlp/bertscore.py +138 -0
themis/evaluation/metrics/nlp/bleu.py +129 -0
themis/evaluation/metrics/nlp/meteor.py +153 -0
themis/evaluation/metrics/nlp/rouge.py +136 -0
themis/evaluation/metrics/pairwise_judge_metric.py +141 -0
themis/evaluation/metrics/response_length.py +33 -0
themis/evaluation/metrics/rubric_judge_metric.py +134 -0
themis/evaluation/pipeline.py +49 -0
themis/evaluation/pipelines/__init__.py +15 -0
themis/evaluation/pipelines/composable_pipeline.py +357 -0
themis/evaluation/pipelines/standard_pipeline.py +348 -0
themis/evaluation/reports.py +293 -0
themis/evaluation/statistics/__init__.py +53 -0
themis/evaluation/statistics/bootstrap.py +79 -0
themis/evaluation/statistics/confidence_intervals.py +121 -0
themis/evaluation/statistics/distributions.py +207 -0
themis/evaluation/statistics/effect_sizes.py +124 -0
themis/evaluation/statistics/hypothesis_tests.py +305 -0
themis/evaluation/statistics/types.py +139 -0
themis/evaluation/strategies/__init__.py +13 -0
themis/evaluation/strategies/attempt_aware_evaluation_strategy.py +51 -0
themis/evaluation/strategies/default_evaluation_strategy.py +25 -0
themis/evaluation/strategies/evaluation_strategy.py +24 -0
themis/evaluation/strategies/judge_evaluation_strategy.py +64 -0
themis/experiment/__init__.py +5 -0
themis/experiment/builder.py +151 -0
themis/experiment/cache_manager.py +134 -0
themis/experiment/comparison.py +631 -0
themis/experiment/cost.py +310 -0
themis/experiment/definitions.py +62 -0
themis/experiment/export.py +798 -0
themis/experiment/export_csv.py +159 -0
themis/experiment/integration_manager.py +104 -0
themis/experiment/math.py +192 -0
themis/experiment/mcq.py +169 -0
themis/experiment/orchestrator.py +415 -0
themis/experiment/pricing.py +317 -0
themis/experiment/storage.py +1458 -0
themis/experiment/visualization.py +588 -0
themis/generation/__init__.py +1 -0
themis/generation/agentic_runner.py +420 -0
themis/generation/batching.py +254 -0
themis/generation/clients.py +143 -0
themis/generation/conversation_runner.py +236 -0
themis/generation/plan.py +456 -0
themis/generation/providers/litellm_provider.py +221 -0
themis/generation/providers/vllm_provider.py +135 -0
themis/generation/router.py +34 -0
themis/generation/runner.py +207 -0
themis/generation/strategies.py +98 -0
themis/generation/templates.py +71 -0
themis/generation/turn_strategies.py +393 -0
themis/generation/types.py +9 -0
themis/integrations/__init__.py +0 -0
themis/integrations/huggingface.py +72 -0
themis/integrations/wandb.py +77 -0
themis/interfaces/__init__.py +169 -0
themis/presets/__init__.py +10 -0
themis/presets/benchmarks.py +354 -0
themis/presets/models.py +190 -0
themis/project/__init__.py +20 -0
themis/project/definitions.py +98 -0
themis/project/patterns.py +230 -0
themis/providers/__init__.py +5 -0
themis/providers/registry.py +39 -0
themis/server/__init__.py +28 -0
themis/server/app.py +337 -0
themis/utils/api_generator.py +379 -0
themis/utils/cost_tracking.py +376 -0
themis/utils/dashboard.py +452 -0
themis/utils/logging_utils.py +41 -0
themis/utils/progress.py +58 -0
themis/utils/tracing.py +320 -0
themis_eval-0.2.0.dist-info/METADATA +596 -0
themis_eval-0.2.0.dist-info/RECORD +157 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/WHEEL +1 -1
themis_eval-0.1.0.dist-info/METADATA +0 -758
themis_eval-0.1.0.dist-info/RECORD +0 -8
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/licenses/LICENSE +0 -0
{themis_eval-0.1.0.dist-info → themis_eval-0.2.0.dist-info}/top_level.txt +0 -0

themis/utils/dashboard.py ADDED Viewed

@@ -0,0 +1,452 @@
+"""Dashboard generator for experiment results, costs, and statistics.
+This module provides HTML dashboard generation for visualizing experiment
+results, cost breakdowns, and statistical analysis.
+"""
+from __future__ import annotations
+from pathlib import Path
+from typing import Dict, List
+from themis.evaluation import reports as eval_reports
+from themis.evaluation import statistics as eval_stats
+from themis.utils import cost_tracking
+def generate_html_dashboard(
+    evaluation_report: eval_reports.EvaluationReport,
+    cost_summary: cost_tracking.CostSummary | None = None,
+    statistical_summaries: Dict[str, eval_stats.StatisticalSummary] | None = None,
+    output_path: str | Path = "dashboard.html",
+    title: str = "Themis Experiment Dashboard",
+) -> None:
+    """Generate an HTML dashboard with evaluation results, costs, and statistics.
+
+    The rendered page is written to ``output_path``; missing parent
+    directories are created and an existing file is overwritten.
+
+    Args:
+        evaluation_report: Evaluation report with metric results
+        cost_summary: Optional cost summary
+        statistical_summaries: Optional dictionary mapping metric names to statistical summaries
+        output_path: Path to output HTML file
+        title: Dashboard title
+    """
+    output_path = Path(output_path)
+    # Create missing parent directories so a nested output path works on first use.
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    html_content = _generate_html(
+        evaluation_report,
+        cost_summary,
+        statistical_summaries,
+        title,
+    )
+    # The page declares <meta charset="UTF-8"> and its headings contain
+    # non-ASCII characters, so encode explicitly instead of relying on the
+    # platform's locale default (which may not be UTF-8).
+    output_path.write_text(html_content, encoding="utf-8")
+def _generate_html(
+    evaluation_report: eval_reports.EvaluationReport,
+    cost_summary: cost_tracking.CostSummary | None,
+    statistical_summaries: Dict[str, eval_stats.StatisticalSummary] | None,
+    title: str,
+) -> str:
+    """Generate complete HTML dashboard content.
+
+    Args:
+        evaluation_report: Report whose ``metrics`` and ``failures`` are rendered
+        cost_summary: Optional cost summary; its section is omitted when falsy
+        statistical_summaries: Optional per-metric summaries; omitted when falsy
+        title: Plain-text dashboard title, HTML-escaped before insertion
+
+    Returns:
+        The full HTML document as a string.
+    """
+    from html import escape
+
+    # The title is plain text supplied by the caller; escape it so characters
+    # such as "<" or "&" cannot break the <title>/<h1> markup.
+    safe_title = escape(str(title))
+    # Build sections (optional ones collapse to an empty string)
+    metrics_section = _build_metrics_section(evaluation_report.metrics)
+    stats_section = ""
+    if statistical_summaries:
+        stats_section = _build_statistics_section(statistical_summaries)
+    cost_section = ""
+    if cost_summary:
+        cost_section = _build_cost_section(cost_summary)
+    failures_section = ""
+    if evaluation_report.failures:
+        failures_section = _build_failures_section(evaluation_report.failures)
+    # Compose full HTML; literal CSS braces are doubled because this is an f-string
+    return f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>{safe_title}</title>
+    <style>
+        * {{
+            margin: 0;
+            padding: 0;
+            box-sizing: border-box;
+        }}
+        body {{
+            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Helvetica', 'Arial', sans-serif;
+            line-height: 1.6;
+            color: #333;
+            background: #f5f5f5;
+            padding: 20px;
+        }}
+        .container {{
+            max-width: 1200px;
+            margin: 0 auto;
+            background: white;
+            padding: 30px;
+            border-radius: 8px;
+            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+        }}
+        h1 {{
+            color: #2c3e50;
+            margin-bottom: 30px;
+            padding-bottom: 10px;
+            border-bottom: 3px solid #3498db;
+        }}
+        h2 {{
+            color: #34495e;
+            margin-top: 30px;
+            margin-bottom: 15px;
+            padding-bottom: 8px;
+            border-bottom: 2px solid #ecf0f1;
+        }}
+        h3 {{
+            color: #7f8c8d;
+            margin-top: 20px;
+            margin-bottom: 10px;
+        }}
+        .metric-card {{
+            background: #f8f9fa;
+            border-left: 4px solid #3498db;
+            padding: 15px;
+            margin-bottom: 15px;
+            border-radius: 4px;
+        }}
+        .metric-name {{
+            font-size: 18px;
+            font-weight: 600;
+            color: #2c3e50;
+            margin-bottom: 8px;
+        }}
+        .metric-value {{
+            font-size: 32px;
+            font-weight: 700;
+            color: #3498db;
+            margin: 10px 0;
+        }}
+        .metric-details {{
+            display: grid;
+            grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
+            gap: 10px;
+            margin-top: 10px;
+        }}
+        .detail-item {{
+            background: white;
+            padding: 10px;
+            border-radius: 4px;
+        }}
+        .detail-label {{
+            font-size: 12px;
+            color: #7f8c8d;
+            text-transform: uppercase;
+            letter-spacing: 0.5px;
+        }}
+        .detail-value {{
+            font-size: 16px;
+            font-weight: 600;
+            color: #2c3e50;
+            margin-top: 4px;
+        }}
+        .cost-summary {{
+            background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+            color: white;
+            padding: 20px;
+            border-radius: 8px;
+            margin-bottom: 20px;
+        }}
+        .cost-total {{
+            font-size: 48px;
+            font-weight: 700;
+            margin: 10px 0;
+        }}
+        .cost-breakdown {{
+            display: grid;
+            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+            gap: 15px;
+            margin-top: 15px;
+        }}
+        .cost-item {{
+            background: rgba(255, 255, 255, 0.1);
+            padding: 12px;
+            border-radius: 4px;
+        }}
+        .cost-item-name {{
+            font-size: 14px;
+            opacity: 0.9;
+        }}
+        .cost-item-value {{
+            font-size: 20px;
+            font-weight: 600;
+            margin-top: 4px;
+        }}
+        .failures {{
+            background: #fee;
+            border-left: 4px solid #e74c3c;
+            padding: 15px;
+            border-radius: 4px;
+        }}
+        .failure-item {{
+            margin: 10px 0;
+            padding: 10px;
+            background: white;
+            border-radius: 4px;
+        }}
+        .confidence-interval {{
+            font-size: 14px;
+            color: #7f8c8d;
+            font-family: 'Courier New', monospace;
+        }}
+        .badge {{
+            display: inline-block;
+            padding: 4px 8px;
+            border-radius: 4px;
+            font-size: 12px;
+            font-weight: 600;
+        }}
+        .badge-success {{
+            background: #d4edda;
+            color: #155724;
+        }}
+        .badge-info {{
+            background: #d1ecf1;
+            color: #0c5460;
+        }}
+    </style>
+</head>
+<body>
+    <div class="container">
+        <h1>{safe_title}</h1>
+        {metrics_section}
+        {stats_section}
+        {cost_section}
+        {failures_section}
+    </div>
+</body>
+</html>"""
+def _build_metrics_section(metrics: Dict[str, eval_reports.MetricAggregate]) -> str:
+    """Build metrics overview section.
+
+    Args:
+        metrics: Mapping of metric name to an aggregate exposing ``mean`` and ``count``
+
+    Returns:
+        An HTML fragment with one card per metric, or a placeholder paragraph
+        when ``metrics`` is empty.
+    """
+    from html import escape
+
+    if not metrics:
+        return "<p>No metrics available.</p>"
+    cards = []
+    for metric_name, aggregate in metrics.items():
+        # Metric names are free-form text; escape them so "<", ">" or "&"
+        # are displayed literally instead of being parsed as markup.
+        safe_name = escape(str(metric_name))
+        card = f"""
+        <div class="metric-card">
+            <div class="metric-name">{safe_name}</div>
+            <div class="metric-value">{aggregate.mean:.4f}</div>
+            <div class="metric-details">
+                <div class="detail-item">
+                    <div class="detail-label">Samples</div>
+                    <div class="detail-value">{aggregate.count}</div>
+                </div>
+            </div>
+        </div>
+        """
+        cards.append(card)
+    return f"""
+    <h2>📊 Metrics Overview</h2>
+    {"".join(cards)}
+    """
+def _build_statistics_section(
+    statistical_summaries: Dict[str, eval_stats.StatisticalSummary],
+) -> str:
+    """Build detailed statistics section.
+
+    Args:
+        statistical_summaries: Mapping of metric name to a summary exposing
+            ``mean``, ``std``, ``median``, ``min_value``, ``max_value`` and
+            ``confidence_interval_95``
+
+    Returns:
+        An HTML fragment with one card per metric, or an empty string when
+        there are no summaries.
+    """
+    from html import escape
+
+    if not statistical_summaries:
+        return ""
+    cards = []
+    for metric_name, summary in statistical_summaries.items():
+        # Metric names are free-form text; escape before embedding in markup.
+        safe_name = escape(str(metric_name))
+        # The confidence-interval cell is only rendered when an interval exists.
+        ci_text = ""
+        if summary.confidence_interval_95:
+            ci = summary.confidence_interval_95
+            ci_text = f"""
+            <div class="detail-item">
+                <div class="detail-label">95% CI</div>
+                <div class="detail-value confidence-interval">
+                    [{ci.lower:.4f}, {ci.upper:.4f}]
+                </div>
+            </div>
+            """
+        card = f"""
+        <div class="metric-card">
+            <div class="metric-name">{safe_name} - Statistical Analysis</div>
+            <div class="metric-details">
+                <div class="detail-item">
+                    <div class="detail-label">Mean</div>
+                    <div class="detail-value">{summary.mean:.4f}</div>
+                </div>
+                <div class="detail-item">
+                    <div class="detail-label">Std Dev</div>
+                    <div class="detail-value">{summary.std:.4f}</div>
+                </div>
+                <div class="detail-item">
+                    <div class="detail-label">Median</div>
+                    <div class="detail-value">{summary.median:.4f}</div>
+                </div>
+                <div class="detail-item">
+                    <div class="detail-label">Min</div>
+                    <div class="detail-value">{summary.min_value:.4f}</div>
+                </div>
+                <div class="detail-item">
+                    <div class="detail-label">Max</div>
+                    <div class="detail-value">{summary.max_value:.4f}</div>
+                </div>
+                {ci_text}
+            </div>
+        </div>
+        """
+        cards.append(card)
+    return f"""
+    <h2>📈 Statistical Analysis</h2>
+    {"".join(cards)}
+    """
+def _build_cost_section(cost_summary: cost_tracking.CostSummary) -> str:
+    """Build cost tracking section.
+
+    Args:
+        cost_summary: Summary exposing ``total_cost``, the token and request
+            counters, and the ``cost_by_model`` / ``cost_by_provider`` mappings
+
+    Returns:
+        An HTML fragment with the totals followed by per-model and
+        per-provider breakdowns, each sorted by descending cost.
+    """
+    # Both breakdowns share the same card layout, so they share one renderer.
+    model_items = _render_cost_items(
+        cost_summary.cost_by_model, cost_summary.total_cost
+    )
+    provider_items = _render_cost_items(
+        cost_summary.cost_by_provider, cost_summary.total_cost
+    )
+    return f"""
+    <h2>💰 Cost Tracking</h2>
+    <div class="cost-summary">
+        <h3 style="color: white; margin-top: 0;">Total Cost</h3>
+        <div class="cost-total">${cost_summary.total_cost:.4f}</div>
+        <div class="cost-breakdown">
+            <div class="cost-item">
+                <div class="cost-item-name">Total Tokens</div>
+                <div class="cost-item-value">{cost_summary.total_tokens:,}</div>
+            </div>
+            <div class="cost-item">
+                <div class="cost-item-name">Input Tokens</div>
+                <div class="cost-item-value">{cost_summary.total_input_tokens:,}</div>
+            </div>
+            <div class="cost-item">
+                <div class="cost-item-name">Output Tokens</div>
+                <div class="cost-item-value">{cost_summary.total_output_tokens:,}</div>
+            </div>
+            <div class="cost-item">
+                <div class="cost-item-name">API Requests</div>
+                <div class="cost-item-value">{cost_summary.num_requests:,}</div>
+            </div>
+        </div>
+    </div>
+    <h3>Cost by Model</h3>
+    <div class="cost-breakdown">
+        {model_items}
+    </div>
+    <h3>Cost by Provider</h3>
+    <div class="cost-breakdown">
+        {provider_items}
+    </div>
+    """
+def _render_cost_items(costs, total_cost) -> str:
+    """Render one ``cost-item`` card per entry of ``costs``, most expensive first."""
+    from html import escape
+
+    items = []
+    for name, cost in sorted(
+        costs.items(),
+        key=lambda entry: entry[1],
+        reverse=True,
+    ):
+        # Guard the share computation against a zero (or negative) total.
+        pct = (cost / total_cost * 100) if total_cost > 0 else 0
+        # Model/provider names are free-form text; escape before embedding.
+        items.append(f"""
+        <div class="cost-item">
+            <div class="cost-item-name">{escape(str(name))}</div>
+            <div class="cost-item-value">${cost:.4f} ({pct:.1f}%)</div>
+        </div>
+        """)
+    return "".join(items)
+def _build_failures_section(failures: List[eval_reports.EvaluationFailure]) -> str:
+    """Build failures section.
+
+    Args:
+        failures: Failure records exposing ``sample_id`` and ``message``
+
+    Returns:
+        An HTML fragment listing at most the first 20 failures plus a count of
+        the remainder, or an empty string when there are no failures.
+    """
+    from html import escape
+
+    if not failures:
+        return ""
+    failure_items = []
+    for failure in failures[:20]:  # Limit to first 20 failures
+        # Failure text may contain "<", ">" or "&" (e.g. "<class 'int'>");
+        # escape it so it is shown verbatim rather than parsed as markup.
+        sample_id = escape(str(failure.sample_id or "Unknown"))
+        message = escape(str(failure.message))
+        failure_items.append(f"""
+        <div class="failure-item">
+            <strong>Sample: {sample_id}</strong><br>
+            {message}
+        </div>
+        """)
+    more_text = ""
+    if len(failures) > 20:
+        more_text = f"<p><em>...and {len(failures) - 20} more failures</em></p>"
+    return f"""
+    <h2>⚠️ Failures ({len(failures)})</h2>
+    <div class="failures">
+        {"".join(failure_items)}
+        {more_text}
+    </div>
+    """
+# Public API of this module; the underscore-prefixed builders are private helpers.
+__all__ = ["generate_html_dashboard"]

themis/utils/logging_utils.py ADDED Viewed

@@ -0,0 +1,41 @@
+"""Utility helpers for configuring package-wide logging."""
+from __future__ import annotations
+import logging
+from typing import Mapping
+# Custom severity that sits below logging.DEBUG (10).
+TRACE_LEVEL = 5
+logging.addLevelName(TRACE_LEVEL, "TRACE")
+def _trace(self, message, *args, **kwargs):
+    """Log ``message`` at TRACE level; installed as ``Logger.trace`` below."""
+    # Skip record creation entirely when TRACE is filtered out for this logger.
+    if not self.isEnabledFor(TRACE_LEVEL):
+        return
+    self._log(TRACE_LEVEL, message, args, **kwargs)
+# Attach the helper so every logger instance gains a ``trace`` method.
+setattr(logging.Logger, "trace", _trace)
+# Accepted level names (lower-case) and the numeric level each one selects.
+_LEVELS: Mapping[str, int] = dict(
+    critical=logging.CRITICAL,
+    error=logging.ERROR,
+    warning=logging.WARNING,
+    info=logging.INFO,
+    debug=logging.DEBUG,
+    trace=TRACE_LEVEL,
+)
+def configure_logging(level: str = "info") -> None:
+    """Configure root logging with human-friendly formatting.
+
+    Args:
+        level: Case-insensitive level name; names missing from ``_LEVELS``
+            fall back to INFO.
+    """
+    requested = level.lower()
+    if requested in _LEVELS:
+        numeric_level = _LEVELS[requested]
+    else:
+        numeric_level = logging.INFO
+    # force=True makes basicConfig replace any handlers already attached to
+    # the root logger, so repeated calls reconfigure instead of being ignored.
+    logging.basicConfig(
+        force=True,
+        level=numeric_level,
+        datefmt="%H:%M:%S",
+        format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
+    )
+# Names exported by a star-import of this module.
+__all__ = ["configure_logging", "TRACE_LEVEL"]

themis/utils/progress.py ADDED Viewed

@@ -0,0 +1,58 @@
+"""Simple CLI-friendly progress reporter."""
+from __future__ import annotations
+from contextlib import AbstractContextManager
+from typing import Any, Callable
+from tqdm import tqdm
+class ProgressReporter(AbstractContextManager["ProgressReporter"]):
+    """Context manager that owns a lazily created ``tqdm`` progress bar.
+
+    The bar is created by :meth:`start` (or on ``with`` entry) and released by
+    :meth:`close` (or on ``with`` exit). Updates made while no bar exists are
+    ignored.
+    """
+    def __init__(
+        self,
+        *,
+        total: int | None,
+        description: str = "Processing",
+        unit: str = "sample",
+        leave: bool = False,
+    ) -> None:
+        # No bar yet; the settings are kept until start() builds one.
+        self._pbar: tqdm | None = None
+        self._total, self._description = total, description
+        self._unit, self._leave = unit, leave
+    def __enter__(self) -> "ProgressReporter":
+        self.start()
+        return self
+    def __exit__(self, *_exc) -> None:
+        # Returns None, so exceptions raised in the ``with`` body propagate.
+        self.close()
+    def start(self) -> None:
+        """Create the progress bar unless one is already active."""
+        if self._pbar is not None:
+            return
+        self._pbar = tqdm(
+            desc=self._description,
+            total=self._total,
+            leave=self._leave,
+            unit=self._unit,
+        )
+    def close(self) -> None:
+        """Close and discard the active bar, if any."""
+        if self._pbar is None:
+            return
+        self._pbar.close()
+        self._pbar = None
+    def increment(self, step: int = 1) -> None:
+        """Advance the active bar by ``step``; do nothing when no bar exists."""
+        if self._pbar is None:
+            return
+        self._pbar.update(step)
+    def on_result(self, _record: Any) -> None:
+        """Advance by one for each result; the record itself is ignored."""
+        self.increment()
+    def as_callback(self) -> Callable[[Any], None]:
+        """Return :meth:`on_result` as a one-argument callback."""
+        return self.on_result
+# Names exported by a star-import of this module.
+__all__ = ["ProgressReporter"]

themis-eval 0.1.0__py3-none-any.whl → 0.2.0__py3-none-any.whl

themis-eval 0.1.0 (py3-none-any.whl) → 0.2.0 (py3-none-any.whl)