onetool-mcp 1.0.0b1 (onetool_mcp-1.0.0b1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
bench/harness/metrics.py
@@ -0,0 +1,283 @@
+"""Metrics collection and cost calculation for benchmark runs."""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+def _utc_now() -> datetime:
+    """Get current UTC datetime in a timezone-aware manner."""
+    return datetime.now(UTC)
+
+
+# Cached pricing from OpenRouter API: model_id -> (input_per_1M, output_per_1M)
+_openrouter_pricing: dict[str, tuple[float, float]] | None = None
+
+
+def get_openrouter_pricing() -> dict[str, tuple[float, float]]:
+    """Fetch model pricing from OpenRouter API and cache it.
+
+    Returns:
+        Dictionary mapping model IDs to (input_price, output_price) per 1M tokens.
+    """
+    global _openrouter_pricing
+    if _openrouter_pricing is not None:
+        return _openrouter_pricing
+
+    try:
+        response = httpx.get("https://openrouter.ai/api/v1/models", timeout=10.0)
+        response.raise_for_status()
+        data = response.json()
+
+        pricing = {}
+        for model in data.get("data", []):
+            model_id = model.get("id")
+            model_pricing = model.get("pricing", {})
+            prompt_price = model_pricing.get("prompt")
+            completion_price = model_pricing.get("completion")
+
+            if model_id and prompt_price and completion_price:
+                # API returns price per token as string, convert to per 1M tokens
+                pricing[model_id] = (
+                    float(prompt_price) * 1_000_000,
+                    float(completion_price) * 1_000_000,
+                )
+
+        _openrouter_pricing = pricing
+        logger.debug(f"Loaded pricing for {len(pricing)} models from OpenRouter")
+        return pricing
+    except Exception as e:
+        logger.warning(f"Failed to fetch OpenRouter pricing: {e}")
+        _openrouter_pricing = {}
+        return {}
+
+
+def calculate_cost(
+    model: str,
+    input_tokens: int,
+    output_tokens: int,
+) -> float:
+    """Calculate estimated cost in USD for a completion.
+
+    Args:
+        model: Model identifier.
+        input_tokens: Number of input tokens.
+        output_tokens: Number of output tokens.
+
+    Returns:
+        Estimated cost in USD, or 0 if model pricing is unknown.
+    """
+    pricing = get_openrouter_pricing().get(model)
+    if pricing is None:
+        logger.warning(f"No pricing found for model: {model}")
+        return 0.0
+
+    input_cost = (input_tokens / 1_000_000) * pricing[0]
+    output_cost = (output_tokens / 1_000_000) * pricing[1]
+    return round(input_cost + output_cost, 6)
+
+
+@dataclass
+class LLMCallMetrics:
+    """Metrics captured for a single LLM API call within a task.
+
+    Tracks token usage, latency, and tool call count for each individual
+    LLM call in an agentic loop.
+    """
+
+    call_number: int
+    input_tokens: int
+    output_tokens: int
+    tool_calls_made: int
+    cumulative_input: int
+    latency_ms: int
+
+
+@dataclass
+class EvaluationResult:
+    """Result from evaluation (pass/fail or scored).
+
+    Two evaluation modes:
+    - pass_fail: Binary outcome from deterministic checks (expected value matching)
+    - scored: Numeric 0-100 score from LLM-as-judge evaluation
+
+    Attributes:
+        score: Numeric score (100 for pass, 0 for fail in pass_fail mode; 0-100 in scored mode)
+        reason: Explanation of the evaluation result
+        eval_type: Type of evaluation ("pass_fail" or "scored")
+        passed: Whether the evaluation passed (only meaningful for pass_fail type)
+        expected: The expected value (for pass_fail evaluations)
+        actual: What was actually found/matched (for verbose logging)
+    """
+
+    score: int
+    reason: str
+    eval_type: str = "scored"  # "pass_fail" or "scored"
+    passed: bool | None = None  # True/False for pass_fail, None for scored
+    expected: Any = None  # Expected value for deterministic checks
+    actual: str | None = None  # Actual matched value for logging
+
+
+@dataclass
+class TaskResult:
+    """Result from running a single benchmark task."""
+
+    name: str
+    server: str | list[str] | None
+    model: str
+    prompt: str
+    response: str
+    input_tokens: int
+    output_tokens: int
+    llm_calls: int
+    tool_calls: int
+    tools_used: list[str]
+    duration_seconds: float
+    cost_usd: float
+    evaluation: EvaluationResult | None = None
+    error: str | None = None
+    timestamp: datetime = field(default_factory=_utc_now)
+    executor: str = "simple"
+    # Tool results for evaluation (actual output from tools)
+    tool_results: list[str] = field(default_factory=list)
+    # Tags from task config
+    tags: list[str] = field(default_factory=list)
+    # Per-LLM-call metrics for context growth analysis
+    llm_call_metrics: list[LLMCallMetrics] = field(default_factory=list)
+
+    @property
+    def base_context(self) -> int:
+        """Return first call's input tokens (base context size)."""
+        if self.llm_call_metrics:
+            return self.llm_call_metrics[0].input_tokens
+        return 0
+
+    @property
+    def context_growth_avg(self) -> float:
+        """Calculate average context growth per turn.
+
+        Returns average increase in input tokens between consecutive LLM calls.
+        Returns 0 if fewer than 2 calls.
+        """
+        if len(self.llm_call_metrics) < 2:
+            return 0.0
+        total_growth = 0
+        for i in range(1, len(self.llm_call_metrics)):
+            growth = (
+                self.llm_call_metrics[i].input_tokens
+                - self.llm_call_metrics[i - 1].input_tokens
+            )
+            total_growth += growth
+        return total_growth / (len(self.llm_call_metrics) - 1)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for YAML output."""
+        result: dict[str, Any] = {
+            "name": self.name,
+            "server": self.server,
+            "model": self.model,
+            "metrics": {
+                "input_tokens": self.input_tokens,
+                "output_tokens": self.output_tokens,
+                "llm_calls": self.llm_calls,
+                "tool_calls": self.tool_calls,
+                "tools_used": self.tools_used,
+                "duration_seconds": round(self.duration_seconds, 2),
+                "cost_usd": round(self.cost_usd, 6),
+                "executor": self.executor,
+            },
+            "response": self.response,
+        }
+        if self.evaluation:
+            eval_dict: dict[str, Any] = {
+                "type": self.evaluation.eval_type,
+                "reason": self.evaluation.reason,
+            }
+            if self.evaluation.eval_type == "pass_fail":
+                eval_dict["passed"] = self.evaluation.passed
+            else:
+                eval_dict["score"] = self.evaluation.score
+            if self.evaluation.expected is not None:
+                eval_dict["expected"] = self.evaluation.expected
+            if self.evaluation.actual is not None:
+                eval_dict["actual"] = self.evaluation.actual
+            result["evaluation"] = eval_dict
+        if self.error:
+            result["error"] = self.error
+        if self.llm_call_metrics:
+            result["llm_call_metrics"] = [
+                {
+                    "call_number": m.call_number,
+                    "input_tokens": m.input_tokens,
+                    "output_tokens": m.output_tokens,
+                    "tool_calls_made": m.tool_calls_made,
+                    "cumulative_input": m.cumulative_input,
+                    "latency_ms": m.latency_ms,
+                }
+                for m in self.llm_call_metrics
+            ]
+        return result
+
+
+@dataclass
+class ScenarioResult:
+    """Result from running a benchmark scenario."""
+
+    name: str
+    model: str
+    tasks: list[TaskResult]
+    timestamp: datetime = field(default_factory=_utc_now)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for YAML output."""
+        return {
+            "scenario": self.name,
+            "model": self.model,
+            "timestamp": self.timestamp.isoformat(),
+            "tasks": [task.to_dict() for task in self.tasks],
+        }
+
+    def calculate_totals(self) -> dict[str, Any]:
+        """Calculate total metrics across all tasks."""
+        # Basic metrics
+        totals: dict[str, Any] = {
+            "total_input_tokens": sum(t.input_tokens for t in self.tasks),
+            "total_output_tokens": sum(t.output_tokens for t in self.tasks),
+            "total_llm_calls": sum(t.llm_calls for t in self.tasks),
+            "total_tool_calls": sum(t.tool_calls for t in self.tasks),
+            "total_duration_seconds": sum(t.duration_seconds for t in self.tasks),
+            "total_cost_usd": sum(t.cost_usd for t in self.tasks),
+            "task_count": len(self.tasks),
+            "error_count": sum(1 for t in self.tasks if t.error),
+        }
+
+        # Evaluation aggregation
+        pass_fail_tasks = [
+            t
+            for t in self.tasks
+            if t.evaluation and t.evaluation.eval_type == "pass_fail"
+        ]
+        scored_tasks = [
+            t for t in self.tasks if t.evaluation and t.evaluation.eval_type == "scored"
+        ]
+
+        if pass_fail_tasks:
+            passed = sum(
+                1 for t in pass_fail_tasks if t.evaluation and t.evaluation.passed
+            )
+            failed = len(pass_fail_tasks) - passed
+            totals["pass_count"] = passed
+            totals["fail_count"] = failed
+
+        if scored_tasks:
+            scores = [t.evaluation.score for t in scored_tasks if t.evaluation]
+            totals["avg_score"] = round(sum(scores) / len(scores), 1) if scores else 0
+
+        return totals
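
The diff does not show how the harness consumes these types (that lives in bench/harness/runner.py and bench/reporter.py, which are listed above but not expanded here), so the following is only a minimal sketch of how the public pieces fit together. The task name, server name, tool name, token counts, and model id are hypothetical.

from bench.harness.metrics import (
    EvaluationResult,
    LLMCallMetrics,
    ScenarioResult,
    TaskResult,
    calculate_cost,
)

# Hypothetical per-call metrics from a two-turn agentic loop; input tokens grow
# between turns as tool results are appended to the context.
calls = [
    LLMCallMetrics(call_number=1, input_tokens=1_200, output_tokens=150,
                   tool_calls_made=1, cumulative_input=1_200, latency_ms=900),
    LLMCallMetrics(call_number=2, input_tokens=1_900, output_tokens=80,
                   tool_calls_made=0, cumulative_input=3_100, latency_ms=700),
]

task = TaskResult(
    name="list-files",                    # hypothetical task name
    server="filesystem",                  # hypothetical server name
    model="anthropic/claude-3.5-sonnet",  # any model id known to OpenRouter
    prompt="List the Python files in ./src",
    response="Found 3 files ...",
    input_tokens=sum(c.input_tokens for c in calls),
    output_tokens=sum(c.output_tokens for c in calls),
    llm_calls=len(calls),
    tool_calls=sum(c.tool_calls_made for c in calls),
    tools_used=["file.list"],             # hypothetical tool name
    duration_seconds=4.2,
    # Looks up live OpenRouter pricing; returns 0.0 if the model id is unknown.
    cost_usd=calculate_cost("anthropic/claude-3.5-sonnet", 3_100, 230),
    evaluation=EvaluationResult(score=100, reason="expected file found",
                                eval_type="pass_fail", passed=True),
    llm_call_metrics=calls,
)

print(task.base_context)        # 1200 (first call's input tokens)
print(task.context_growth_avg)  # 700.0 (average growth per turn)

scenario = ScenarioResult(name="filesystem-basics", model=task.model, tasks=[task])
print(scenario.calculate_totals())  # includes pass_count=1, fail_count=0

Cost follows the per-1M-token formula in calculate_cost: if the model were priced at $3.00 per 1M input tokens and $15.00 per 1M output tokens, the task above would cost (3_100 / 1_000_000) * 3.00 + (230 / 1_000_000) * 15.00 ≈ $0.01275, rounded to six decimal places.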