onetool-mcp 1.0.0b1 (onetool_mcp-1.0.0b1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
bench/harness/metrics.py
@@ -0,0 +1,283 @@
+"""Metrics collection and cost calculation for benchmark runs."""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass, field
+from datetime import UTC, datetime
+from typing import Any
+
+import httpx
+
+logger = logging.getLogger(__name__)
+
+
+def _utc_now() -> datetime:
+    """Get current UTC datetime in a timezone-aware manner."""
+    return datetime.now(UTC)
+
+
+# Cached pricing from OpenRouter API: model_id -> (input_per_1M, output_per_1M)
+_openrouter_pricing: dict[str, tuple[float, float]] | None = None
+
+
+def get_openrouter_pricing() -> dict[str, tuple[float, float]]:
+    """Fetch model pricing from OpenRouter API and cache it.
+
+    Returns:
+        Dictionary mapping model IDs to (input_price, output_price) per 1M tokens.
+    """
+    global _openrouter_pricing
+    if _openrouter_pricing is not None:
+        return _openrouter_pricing
+
+    try:
+        response = httpx.get("https://openrouter.ai/api/v1/models", timeout=10.0)
+        response.raise_for_status()
+        data = response.json()
+
+        pricing = {}
+        for model in data.get("data", []):
+            model_id = model.get("id")
+            model_pricing = model.get("pricing", {})
+            prompt_price = model_pricing.get("prompt")
+            completion_price = model_pricing.get("completion")
+
+            if model_id and prompt_price and completion_price:
+                # API returns price per token as string, convert to per 1M tokens
+                pricing[model_id] = (
+                    float(prompt_price) * 1_000_000,
+                    float(completion_price) * 1_000_000,
+                )
+
+        _openrouter_pricing = pricing
+        logger.debug(f"Loaded pricing for {len(pricing)} models from OpenRouter")
+        return pricing
+    except Exception as e:
+        logger.warning(f"Failed to fetch OpenRouter pricing: {e}")
+        _openrouter_pricing = {}
+        return {}
+
+
+def calculate_cost(
+    model: str,
+    input_tokens: int,
+    output_tokens: int,
+) -> float:
+    """Calculate estimated cost in USD for a completion.
+
+    Args:
+        model: Model identifier.
+        input_tokens: Number of input tokens.
+        output_tokens: Number of output tokens.
+
+    Returns:
+        Estimated cost in USD, or 0 if model pricing is unknown.
+    """
+    pricing = get_openrouter_pricing().get(model)
+    if pricing is None:
+        logger.warning(f"No pricing found for model: {model}")
+        return 0.0
+
+    input_cost = (input_tokens / 1_000_000) * pricing[0]
+    output_cost = (output_tokens / 1_000_000) * pricing[1]
+    return round(input_cost + output_cost, 6)
+
+
+@dataclass
+class LLMCallMetrics:
+    """Metrics captured for a single LLM API call within a task.
+
+    Tracks token usage, latency, and tool call count for each individual
+    LLM call in an agentic loop.
+    """
+
+    call_number: int
+    input_tokens: int
+    output_tokens: int
+    tool_calls_made: int
+    cumulative_input: int
+    latency_ms: int
+
+
+@dataclass
+class EvaluationResult:
+    """Result from evaluation (pass/fail or scored).
+
+    Two evaluation modes:
+    - pass_fail: Binary outcome from deterministic checks (expected value matching)
+    - scored: Numeric 0-100 score from LLM-as-judge evaluation
+
+    Attributes:
+        score: Numeric score (100 for pass, 0 for fail in pass_fail mode; 0-100 in scored mode)
+        reason: Explanation of the evaluation result
+        eval_type: Type of evaluation ("pass_fail" or "scored")
+        passed: Whether the evaluation passed (only meaningful for pass_fail type)
+        expected: The expected value (for pass_fail evaluations)
+        actual: What was actually found/matched (for verbose logging)
+    """
+
+    score: int
+    reason: str
+    eval_type: str = "scored"  # "pass_fail" or "scored"
+    passed: bool | None = None  # True/False for pass_fail, None for scored
+    expected: Any = None  # Expected value for deterministic checks
+    actual: str | None = None  # Actual matched value for logging
+
+
+@dataclass
+class TaskResult:
+    """Result from running a single benchmark task."""
+
+    name: str
+    server: str | list[str] | None
+    model: str
+    prompt: str
+    response: str
+    input_tokens: int
+    output_tokens: int
+    llm_calls: int
+    tool_calls: int
+    tools_used: list[str]
+    duration_seconds: float
+    cost_usd: float
+    evaluation: EvaluationResult | None = None
+    error: str | None = None
+    timestamp: datetime = field(default_factory=_utc_now)
+    executor: str = "simple"
+    # Tool results for evaluation (actual output from tools)
+    tool_results: list[str] = field(default_factory=list)
+    # Tags from task config
+    tags: list[str] = field(default_factory=list)
+    # Per-LLM-call metrics for context growth analysis
+    llm_call_metrics: list[LLMCallMetrics] = field(default_factory=list)
+
+    @property
+    def base_context(self) -> int:
+        """Return first call's input tokens (base context size)."""
+        if self.llm_call_metrics:
+            return self.llm_call_metrics[0].input_tokens
+        return 0
+
+    @property
+    def context_growth_avg(self) -> float:
+        """Calculate average context growth per turn.
+
+        Returns average increase in input tokens between consecutive LLM calls.
+        Returns 0 if fewer than 2 calls.
+        """
+        if len(self.llm_call_metrics) < 2:
+            return 0.0
+        total_growth = 0
+        for i in range(1, len(self.llm_call_metrics)):
+            growth = (
+                self.llm_call_metrics[i].input_tokens
+                - self.llm_call_metrics[i - 1].input_tokens
+            )
+            total_growth += growth
+        return total_growth / (len(self.llm_call_metrics) - 1)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for YAML output."""
+        result: dict[str, Any] = {
+            "name": self.name,
+            "server": self.server,
+            "model": self.model,
+            "metrics": {
+                "input_tokens": self.input_tokens,
+                "output_tokens": self.output_tokens,
+                "llm_calls": self.llm_calls,
+                "tool_calls": self.tool_calls,
+                "tools_used": self.tools_used,
+                "duration_seconds": round(self.duration_seconds, 2),
+                "cost_usd": round(self.cost_usd, 6),
+                "executor": self.executor,
+            },
+            "response": self.response,
+        }
+        if self.evaluation:
+            eval_dict: dict[str, Any] = {
+                "type": self.evaluation.eval_type,
+                "reason": self.evaluation.reason,
+            }
+            if self.evaluation.eval_type == "pass_fail":
+                eval_dict["passed"] = self.evaluation.passed
+            else:
+                eval_dict["score"] = self.evaluation.score
+            if self.evaluation.expected is not None:
+                eval_dict["expected"] = self.evaluation.expected
+            if self.evaluation.actual is not None:
+                eval_dict["actual"] = self.evaluation.actual
+            result["evaluation"] = eval_dict
+        if self.error:
+            result["error"] = self.error
+        if self.llm_call_metrics:
+            result["llm_call_metrics"] = [
+                {
+                    "call_number": m.call_number,
+                    "input_tokens": m.input_tokens,
+                    "output_tokens": m.output_tokens,
+                    "tool_calls_made": m.tool_calls_made,
+                    "cumulative_input": m.cumulative_input,
+                    "latency_ms": m.latency_ms,
+                }
+                for m in self.llm_call_metrics
+            ]
+        return result
+
+
+@dataclass
+class ScenarioResult:
+    """Result from running a benchmark scenario."""
+
+    name: str
+    model: str
+    tasks: list[TaskResult]
+    timestamp: datetime = field(default_factory=_utc_now)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to dictionary for YAML output."""
+        return {
+            "scenario": self.name,
+            "model": self.model,
+            "timestamp": self.timestamp.isoformat(),
+            "tasks": [task.to_dict() for task in self.tasks],
+        }
+
+    def calculate_totals(self) -> dict[str, Any]:
+        """Calculate total metrics across all tasks."""
+        # Basic metrics
+        totals: dict[str, Any] = {
+            "total_input_tokens": sum(t.input_tokens for t in self.tasks),
+            "total_output_tokens": sum(t.output_tokens for t in self.tasks),
+            "total_llm_calls": sum(t.llm_calls for t in self.tasks),
+            "total_tool_calls": sum(t.tool_calls for t in self.tasks),
+            "total_duration_seconds": sum(t.duration_seconds for t in self.tasks),
+            "total_cost_usd": sum(t.cost_usd for t in self.tasks),
+            "task_count": len(self.tasks),
+            "error_count": sum(1 for t in self.tasks if t.error),
+        }
+
+        # Evaluation aggregation
+        pass_fail_tasks = [
+            t
+            for t in self.tasks
+            if t.evaluation and t.evaluation.eval_type == "pass_fail"
+        ]
+        scored_tasks = [
+            t for t in self.tasks if t.evaluation and t.evaluation.eval_type == "scored"
+        ]
+
+        if pass_fail_tasks:
+            passed = sum(
+                1 for t in pass_fail_tasks if t.evaluation and t.evaluation.passed
+            )
+            failed = len(pass_fail_tasks) - passed
+            totals["pass_count"] = passed
+            totals["fail_count"] = failed
+
+        if scored_tasks:
+            scores = [t.evaluation.score for t in scored_tasks if t.evaluation]
+            totals["avg_score"] = round(sum(scores) / len(scores), 1) if scores else 0
+
+        return totals
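
The diff does not show how the harness consumes these types (that lives in bench/harness/runner.py and bench/reporter.py, which are listed above but not expanded here), so the following is only a minimal sketch of how the public pieces fit together. The task name, server name, tool name, token counts, and model id are hypothetical.

from bench.harness.metrics import (
    EvaluationResult,
    LLMCallMetrics,
    ScenarioResult,
    TaskResult,
    calculate_cost,
)

# Hypothetical per-call metrics from a two-turn agentic loop; input tokens grow
# between turns as tool results are appended to the context.
calls = [
    LLMCallMetrics(call_number=1, input_tokens=1_200, output_tokens=150,
                   tool_calls_made=1, cumulative_input=1_200, latency_ms=900),
    LLMCallMetrics(call_number=2, input_tokens=1_900, output_tokens=80,
                   tool_calls_made=0, cumulative_input=3_100, latency_ms=700),
]

task = TaskResult(
    name="list-files",                    # hypothetical task name
    server="filesystem",                  # hypothetical server name
    model="anthropic/claude-3.5-sonnet",  # any model id known to OpenRouter
    prompt="List the Python files in ./src",
    response="Found 3 files ...",
    input_tokens=sum(c.input_tokens for c in calls),
    output_tokens=sum(c.output_tokens for c in calls),
    llm_calls=len(calls),
    tool_calls=sum(c.tool_calls_made for c in calls),
    tools_used=["file.list"],             # hypothetical tool name
    duration_seconds=4.2,
    # Looks up live OpenRouter pricing; returns 0.0 if the model id is unknown.
    cost_usd=calculate_cost("anthropic/claude-3.5-sonnet", 3_100, 230),
    evaluation=EvaluationResult(score=100, reason="expected file found",
                                eval_type="pass_fail", passed=True),
    llm_call_metrics=calls,
)

print(task.base_context)        # 1200 (first call's input tokens)
print(task.context_growth_avg)  # 700.0 (average growth per turn)

scenario = ScenarioResult(name="filesystem-basics", model=task.model, tasks=[task])
print(scenario.calculate_totals())  # includes pass_count=1, fail_count=0

Cost follows the per-1M-token formula in calculate_cost: if the model were priced at $3.00 per 1M input tokens and $15.00 per 1M output tokens, the task above would cost (3_100 / 1_000_000) * 3.00 + (230 / 1_000_000) * 15.00 ≈ $0.01275, rounded to six decimal places.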