onetool-mcp 1.0.0b1 (onetool_mcp-1.0.0b1-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. bench/__init__.py +5 -0
  2. bench/cli.py +69 -0
  3. bench/harness/__init__.py +66 -0
  4. bench/harness/client.py +692 -0
  5. bench/harness/config.py +397 -0
  6. bench/harness/csv_writer.py +109 -0
  7. bench/harness/evaluate.py +512 -0
  8. bench/harness/metrics.py +283 -0
  9. bench/harness/runner.py +899 -0
  10. bench/py.typed +0 -0
  11. bench/reporter.py +629 -0
  12. bench/run.py +487 -0
  13. bench/secrets.py +101 -0
  14. bench/utils.py +16 -0
  15. onetool/__init__.py +4 -0
  16. onetool/cli.py +391 -0
  17. onetool/py.typed +0 -0
  18. onetool_mcp-1.0.0b1.dist-info/METADATA +163 -0
  19. onetool_mcp-1.0.0b1.dist-info/RECORD +132 -0
  20. onetool_mcp-1.0.0b1.dist-info/WHEEL +4 -0
  21. onetool_mcp-1.0.0b1.dist-info/entry_points.txt +3 -0
  22. onetool_mcp-1.0.0b1.dist-info/licenses/LICENSE.txt +687 -0
  23. onetool_mcp-1.0.0b1.dist-info/licenses/NOTICE.txt +64 -0
  24. ot/__init__.py +37 -0
  25. ot/__main__.py +6 -0
  26. ot/_cli.py +107 -0
  27. ot/_tui.py +53 -0
  28. ot/config/__init__.py +46 -0
  29. ot/config/defaults/bench.yaml +4 -0
  30. ot/config/defaults/diagram-templates/api-flow.mmd +33 -0
  31. ot/config/defaults/diagram-templates/c4-context.puml +30 -0
  32. ot/config/defaults/diagram-templates/class-diagram.mmd +87 -0
  33. ot/config/defaults/diagram-templates/feature-mindmap.mmd +70 -0
  34. ot/config/defaults/diagram-templates/microservices.d2 +81 -0
  35. ot/config/defaults/diagram-templates/project-gantt.mmd +37 -0
  36. ot/config/defaults/diagram-templates/state-machine.mmd +42 -0
  37. ot/config/defaults/onetool.yaml +25 -0
  38. ot/config/defaults/prompts.yaml +97 -0
  39. ot/config/defaults/servers.yaml +7 -0
  40. ot/config/defaults/snippets.yaml +4 -0
  41. ot/config/defaults/tool_templates/__init__.py +7 -0
  42. ot/config/defaults/tool_templates/extension.py +52 -0
  43. ot/config/defaults/tool_templates/isolated.py +61 -0
  44. ot/config/dynamic.py +121 -0
  45. ot/config/global_templates/__init__.py +2 -0
  46. ot/config/global_templates/bench-secrets-template.yaml +6 -0
  47. ot/config/global_templates/bench.yaml +9 -0
  48. ot/config/global_templates/onetool.yaml +27 -0
  49. ot/config/global_templates/secrets-template.yaml +44 -0
  50. ot/config/global_templates/servers.yaml +18 -0
  51. ot/config/global_templates/snippets.yaml +235 -0
  52. ot/config/loader.py +1087 -0
  53. ot/config/mcp.py +145 -0
  54. ot/config/secrets.py +190 -0
  55. ot/config/tool_config.py +125 -0
  56. ot/decorators.py +116 -0
  57. ot/executor/__init__.py +35 -0
  58. ot/executor/base.py +16 -0
  59. ot/executor/fence_processor.py +83 -0
  60. ot/executor/linter.py +142 -0
  61. ot/executor/pack_proxy.py +260 -0
  62. ot/executor/param_resolver.py +140 -0
  63. ot/executor/pep723.py +288 -0
  64. ot/executor/result_store.py +369 -0
  65. ot/executor/runner.py +496 -0
  66. ot/executor/simple.py +163 -0
  67. ot/executor/tool_loader.py +396 -0
  68. ot/executor/validator.py +398 -0
  69. ot/executor/worker_pool.py +388 -0
  70. ot/executor/worker_proxy.py +189 -0
  71. ot/http_client.py +145 -0
  72. ot/logging/__init__.py +37 -0
  73. ot/logging/config.py +315 -0
  74. ot/logging/entry.py +213 -0
  75. ot/logging/format.py +188 -0
  76. ot/logging/span.py +349 -0
  77. ot/meta.py +1555 -0
  78. ot/paths.py +453 -0
  79. ot/prompts.py +218 -0
  80. ot/proxy/__init__.py +21 -0
  81. ot/proxy/manager.py +396 -0
  82. ot/py.typed +0 -0
  83. ot/registry/__init__.py +189 -0
  84. ot/registry/models.py +57 -0
  85. ot/registry/parser.py +269 -0
  86. ot/registry/registry.py +413 -0
  87. ot/server.py +315 -0
  88. ot/shortcuts/__init__.py +15 -0
  89. ot/shortcuts/aliases.py +87 -0
  90. ot/shortcuts/snippets.py +258 -0
  91. ot/stats/__init__.py +35 -0
  92. ot/stats/html.py +250 -0
  93. ot/stats/jsonl_writer.py +283 -0
  94. ot/stats/reader.py +354 -0
  95. ot/stats/timing.py +57 -0
  96. ot/support.py +63 -0
  97. ot/tools.py +114 -0
  98. ot/utils/__init__.py +81 -0
  99. ot/utils/batch.py +161 -0
  100. ot/utils/cache.py +120 -0
  101. ot/utils/deps.py +403 -0
  102. ot/utils/exceptions.py +23 -0
  103. ot/utils/factory.py +179 -0
  104. ot/utils/format.py +65 -0
  105. ot/utils/http.py +202 -0
  106. ot/utils/platform.py +45 -0
  107. ot/utils/sanitize.py +130 -0
  108. ot/utils/truncate.py +69 -0
  109. ot_tools/__init__.py +4 -0
  110. ot_tools/_convert/__init__.py +12 -0
  111. ot_tools/_convert/excel.py +279 -0
  112. ot_tools/_convert/pdf.py +254 -0
  113. ot_tools/_convert/powerpoint.py +268 -0
  114. ot_tools/_convert/utils.py +358 -0
  115. ot_tools/_convert/word.py +283 -0
  116. ot_tools/brave_search.py +604 -0
  117. ot_tools/code_search.py +736 -0
  118. ot_tools/context7.py +495 -0
  119. ot_tools/convert.py +614 -0
  120. ot_tools/db.py +415 -0
  121. ot_tools/diagram.py +1604 -0
  122. ot_tools/diagram.yaml +167 -0
  123. ot_tools/excel.py +1372 -0
  124. ot_tools/file.py +1348 -0
  125. ot_tools/firecrawl.py +732 -0
  126. ot_tools/grounding_search.py +646 -0
  127. ot_tools/package.py +604 -0
  128. ot_tools/py.typed +0 -0
  129. ot_tools/ripgrep.py +544 -0
  130. ot_tools/scaffold.py +471 -0
  131. ot_tools/transform.py +213 -0
  132. ot_tools/web_fetch.py +384 -0
bench/harness/evaluate.py
@@ -0,0 +1,512 @@
+ """Evaluation module for benchmark responses.
+
+ Supports three evaluation methods:
+ 1. Regex - Pattern matching with expect_match flag
+ 2. Deterministic - Contains checks for strings, lists, dicts, scalars
+ 3. LLM-as-judge - AI-based evaluation with custom prompts
+
+ Usage:
+     The main entry point is `evaluate_task()` which routes to the appropriate
+     evaluation method based on the EvaluateConfig.
+
+     Evaluation is called AFTER task completion to ensure task duration excludes
+     evaluation time. The runner handles this in the task loop.
+ """
+
+ from __future__ import annotations
+
+ import json
+ import re
+ from typing import TYPE_CHECKING, Any
+
+ from loguru import logger
+ from openai import OpenAI
+
+ from bench.harness.metrics import EvaluationResult, TaskResult
+ from bench.secrets import get_bench_secret
+
+ if TYPE_CHECKING:
+     from bench.harness.config import EvaluateConfig, HarnessConfig, TaskConfig
+
+
+ # =============================================================================
+ # Helper Functions
+ # =============================================================================
+
+
+ def _normalize_value(value: Any) -> str:
+     """Convert a value to string for comparison."""
+     if isinstance(value, str):
+         return value
+     if isinstance(value, bool):
+         # Return both Python and JSON representations for matching
+         return str(value)  # "True" or "False"
+     return json.dumps(value, sort_keys=True)
+
+
+ def _check_pattern(pattern: Any, response: str) -> bool:
+     """Check if a pattern matches the response.
+
+     Args:
+         pattern: String, dict with 'regex' key, or other value
+         response: Response text to check
+
+     Returns:
+         True if pattern matches
+     """
+     if isinstance(pattern, dict) and "regex" in pattern:
+         return bool(re.search(pattern["regex"], response))
+     elif isinstance(pattern, str):
+         return pattern in response
+     else:
+         # For numbers, bools, etc - convert to string and check contains
+         return _normalize_value(pattern) in response
+
+
+ def _list_is_expected_output(lst: list[Any]) -> bool:
+     """Check if list represents an expected output (not patterns to check).
+
+     Returns True for lists like [97, 101] or [True, False, True] that should
+     be checked as serialized JSON. Returns False for lists with strings or
+     regex patterns that should be checked individually.
+     """
+     # Lists with any strings are treated as patterns to check
+     if any(isinstance(item, str) for item in lst):
+         return False
+     # Lists with regex dicts are patterns
+     if any(isinstance(item, dict) and "regex" in item for item in lst):
+         return False
+     # Pure numeric/boolean lists are expected outputs
+     return all(isinstance(item, (int, float, bool)) or item is None for item in lst)
+
+
+ def _truncate(s: str, max_len: int = 100) -> str:
+     """Truncate string for display."""
+     if len(s) <= max_len:
+         return s
+     return s[:max_len] + "..."
+
+
+ def _find_actual_match(
+     response: str, pattern: str, max_context: int = 50
+ ) -> str | None:
+     """Find where a pattern appears in response and extract context."""
+     idx = response.find(pattern)
+     if idx == -1:
+         return None
+     start = max(0, idx - 10)
+     end = min(len(response), idx + len(pattern) + max_context)
+     return response[start:end]
+
+
+ # =============================================================================
+ # Evaluation Methods
+ # =============================================================================
+
+
+ def evaluate_deterministic(
+     response: str,
+     expected: str | list[Any] | dict[str, Any] | int | float | bool,
+     expect_error: bool = False,
+ ) -> EvaluationResult:
+     """Evaluate response against expected value(s) deterministically.
+
+     Args:
+         response: The response text to evaluate
+         expected: Expected value(s) - string, list, dict, or scalar
+         expect_error: If True, test expects an error. Failure to match means LLM fixed the code.
+
+     Returns:
+         EvaluationResult with pass/fail status and expected/actual values
+     """
+     if isinstance(expected, list):
+         # For lists of pure numbers/booleans, check if the serialized list appears
+         if _list_is_expected_output(expected):
+             # Try multiple representations
+             representations = [
+                 json.dumps(expected),  # [true, false, ...]
+                 str(expected),  # [True, False, ...] - Python repr
+                 repr(expected),  # [True, False, ...]
+             ]
+             for rep in representations:
+                 if rep in response:
+                     actual = _find_actual_match(response, rep)
+                     return EvaluationResult(
+                         score=100,
+                         reason="Expected list found",
+                         eval_type="pass_fail",
+                         passed=True,
+                         expected=_truncate(rep),
+                         actual=actual,
+                     )
+             return EvaluationResult(
+                 score=0,
+                 reason="Expected list not found",
+                 eval_type="pass_fail",
+                 passed=False,
+                 expected=_truncate(representations[0]),
+                 actual=_truncate(response, 200),
+             )
+
+         # For lists with patterns (regex dicts), check each item
+         missing = []
+         found = []
+         for item in expected:
+             if _check_pattern(item, response):
+                 if isinstance(item, dict) and "regex" in item:
+                     found.append(f"regex:{item['regex'][:30]}")
+                 else:
+                     found.append(str(item)[:30])
+             else:
+                 if isinstance(item, dict) and "regex" in item:
+                     missing.append(f"regex:{item['regex']}")
+                 else:
+                     missing.append(str(item))
+
+         if missing:
+             # When expect_error=True and LLM fixed the code (no error patterns matched),
+             # this is a PASS - demonstrates LLM's ability to fix small errors
+             if expect_error:
+                 return EvaluationResult(
+                     score=100,
+                     reason="LLM fixed the error",
+                     eval_type="pass_fail",
+                     passed=True,
+                     expected="error or fix",
+                     actual="LLM fixed code",
+                 )
+             reason = (
+                 f"Missing: {', '.join(missing[:3])}{'...' if len(missing) > 3 else ''}"
+             )
+             return EvaluationResult(
+                 score=0,
+                 reason=reason,
+                 eval_type="pass_fail",
+                 passed=False,
+                 expected=_truncate(str(expected)),
+                 actual=f"Found: {', '.join(found[:3])}" if found else "None matched",
+             )
+         # When expect_error=True and error patterns matched, the error was returned
+         reason = (
+             "Error returned"
+             if expect_error
+             else f"All {len(expected)} expected items found"
+         )
+         actual = (
+             "Error in response"
+             if expect_error
+             else f"All {len(expected)} patterns matched"
+         )
+         return EvaluationResult(
+             score=100,
+             reason=reason,
+             eval_type="pass_fail",
+             passed=True,
+             expected="error or fix" if expect_error else _truncate(str(expected)),
+             actual=actual,
+         )
+
+     elif isinstance(expected, dict):
+         # Check if dict is in response (JSON serialized)
+         expected_str = _normalize_value(expected)
+         if expected_str in response:
+             actual = _find_actual_match(response, expected_str)
+             return EvaluationResult(
+                 score=100,
+                 reason="Expected dict found in response",
+                 eval_type="pass_fail",
+                 passed=True,
+                 expected=_truncate(expected_str),
+                 actual=actual,
+             )
+         # Try checking each key-value pair
+         missing = []
+         found_keys = []
+         for key, _value in expected.items():
+             pattern = f'"{key}"' if isinstance(key, str) else str(key)
+             if pattern not in response:
+                 missing.append(key)
+             else:
+                 found_keys.append(key)
+         if missing:
+             return EvaluationResult(
+                 score=0,
+                 reason=f"Missing keys: {', '.join(str(k) for k in missing[:3])}",
+                 eval_type="pass_fail",
+                 passed=False,
+                 expected=_truncate(expected_str),
+                 actual=f"Found keys: {', '.join(str(k) for k in found_keys[:3])}"
+                 if found_keys
+                 else "No keys found",
+             )
+         return EvaluationResult(
+             score=100,
+             reason="All expected keys found",
+             eval_type="pass_fail",
+             passed=True,
+             expected=_truncate(expected_str),
+             actual=f"All {len(expected)} keys present",
+         )
+
+     else:
+         # String or scalar - simple contains check
+         expected_str = _normalize_value(expected)
+         if expected_str in response:
+             actual = _find_actual_match(response, expected_str)
+             return EvaluationResult(
+                 score=100,
+                 reason="Expected value found",
+                 eval_type="pass_fail",
+                 passed=True,
+                 expected=_truncate(expected_str),
+                 actual=actual,
+             )
+         return EvaluationResult(
+             score=0,
+             reason="Expected value not found in response",
+             eval_type="pass_fail",
+             passed=False,
+             expected=_truncate(expected_str),
+             actual=_truncate(response, 200),
+         )
+
+
+ def evaluate_llm(
+     response: str,
+     config: EvaluateConfig,
+     expected: Any = None,
+ ) -> EvaluationResult:
+     """Evaluate response using LLM-as-judge.
+
+     Args:
+         response: The response text to evaluate
+         config: Evaluation config with prompt and model
+         expected: Optional expected value for substitution
+
+     Returns:
+         EvaluationResult with score (0-100) and reason
+     """
+     if not config.prompt:
+         return EvaluationResult(
+             score=0,
+             reason="No evaluation prompt configured",
+             eval_type="scored",
+         )
+
+     if not config.model:
+         return EvaluationResult(
+             score=0,
+             reason="No evaluation model configured - set evaluator.model in YAML",
+             eval_type="scored",
+         )
+
+     client = OpenAI(
+         api_key=get_bench_secret("OPENAI_API_KEY"),
+         base_url=get_bench_secret("OPENAI_BASE_URL"),
+     )
+
+     # Format the evaluation prompt
+     prompt = config.prompt.replace("{response}", response)
+     if expected is not None:
+         expected_str = _normalize_value(expected)
+         prompt = prompt.replace("{expected}", expected_str)
+
+     try:
+         llm_response = client.chat.completions.create(
+             model=config.model,
+             messages=[{"role": "user", "content": prompt}],
+         )
+
+         content = llm_response.choices[0].message.content or ""
+
+         # Strip markdown code blocks if present
+         content = re.sub(r"```json\s*", "", content)
+         content = re.sub(r"```\s*", "", content)
+
+         # Try to parse JSON response
+         brace_start = content.find("{")
+         if brace_start != -1:
+             depth = 0
+             brace_end = -1
+             for i, c in enumerate(content[brace_start:], brace_start):
+                 if c == "{":
+                     depth += 1
+                 elif c == "}":
+                     depth -= 1
+                     if depth == 0:
+                         brace_end = i + 1
+                         break
+
+             if brace_end > brace_start:
+                 json_str = content[brace_start:brace_end]
+                 try:
+                     data = json.loads(json_str)
+                     score = int(data.get("score") or 5)
+                     reason = data.get("reason", "No reason provided")
+                     if isinstance(reason, dict):
+                         reason = json.dumps(reason)
+                     return EvaluationResult(
+                         score=score,
+                         reason=str(reason),
+                         eval_type="scored",
+                     )
+                 except (json.JSONDecodeError, ValueError):
+                     pass
+
+         # Fallback: try to extract score from text (e.g., "7/10")
+         score_match = re.search(r"(\d+)\s*/?\s*10", content)
+         if score_match:
+             # Scale from 0-10 to 0-100
+             score = int(score_match.group(1)) * 10
+             return EvaluationResult(
+                 score=score,
+                 reason=_truncate(content, 200),
+                 eval_type="scored",
+             )
+
+         logger.warning(f"Could not parse evaluation response: {content[:100]}")
+         return EvaluationResult(
+             score=5,
+             reason="Could not parse evaluation response",
+             eval_type="scored",
+         )
+
+     except Exception as e:
+         logger.error(f"Evaluation failed: {e}")
+         return EvaluationResult(
+             score=0,
+             reason=f"Evaluation error: {e}",
+             eval_type="scored",
+         )
+
+
+ # =============================================================================
+ # Evaluator Resolution and Task Evaluation
+ # =============================================================================
+
+
+ def resolve_evaluator(
+     task: TaskConfig,
+     harness_config: HarnessConfig,
+ ) -> EvaluateConfig | None:
+     """Resolve the evaluator config for a task.
+
+     Args:
+         task: The task configuration
+         harness_config: The harness configuration with evaluators dict
+
+     Returns:
+         Resolved EvaluateConfig or None if no evaluation
+     """
+     if task.evaluate is None:
+         return None
+
+     if isinstance(task.evaluate, str):
+         # Reference to named evaluator
+         if task.evaluate in harness_config.evaluators:
+             return harness_config.evaluators[task.evaluate]
+         logger.warning(f"Unknown evaluator '{task.evaluate}', skipping evaluation")
+         return None
+
+     # Inline EvaluateConfig
+     return task.evaluate
+
+
+ def evaluate_regex(
+     response: str,
+     pattern: str,
+     expect_match: bool = True,
+ ) -> EvaluationResult:
+     """Evaluate response against a regex pattern.
+
+     Args:
+         response: The response text to evaluate
+         pattern: Regex pattern to match
+         expect_match: If True, pattern must match. If False, must NOT match.
+
+     Returns:
+         EvaluationResult with pass/fail status
+     """
+     match = re.search(pattern, response)
+
+     if expect_match:
+         if match:
+             return EvaluationResult(
+                 score=100,
+                 reason="Regex pattern matched",
+                 eval_type="pass_fail",
+                 passed=True,
+                 expected=f"match: {_truncate(pattern, 50)}",
+                 actual=_truncate(match.group(0), 100),
+             )
+         return EvaluationResult(
+             score=0,
+             reason="Regex pattern did not match",
+             eval_type="pass_fail",
+             passed=False,
+             expected=f"match: {_truncate(pattern, 50)}",
+             actual=_truncate(response, 200),
+         )
+     else:
+         # expect_match=False means pattern must NOT match
+         if not match:
+             return EvaluationResult(
+                 score=100,
+                 reason="Regex pattern correctly did not match",
+                 eval_type="pass_fail",
+                 passed=True,
+                 expected=f"no match: {_truncate(pattern, 50)}",
+                 actual="No match found",
+             )
+         return EvaluationResult(
+             score=0,
+             reason="Regex pattern matched when it should not",
+             eval_type="pass_fail",
+             passed=False,
+             expected=f"no match: {_truncate(pattern, 50)}",
+             actual=_truncate(match.group(0), 100),
+         )
+
+
+ def evaluate_task(
+     task_result: TaskResult,
+     task: TaskConfig,
+     harness_config: HarnessConfig,
+ ) -> EvaluationResult | None:
+     """Evaluate a task result.
+
+     Args:
+         task_result: The task result with response
+         task: The task configuration
+         harness_config: The harness configuration
+
+     Returns:
+         EvaluationResult or None if no evaluation configured
+     """
+     eval_config = resolve_evaluator(task, harness_config)
+
+     if eval_config is None:
+         return None
+
+     # Regex evaluation if regex pattern is set
+     if eval_config.regex is not None:
+         return evaluate_regex(
+             task_result.response,
+             eval_config.regex,
+             expect_match=eval_config.expect_match,
+         )
+
+     # Deterministic evaluation if expected is set
+     if eval_config.expected is not None:
+         return evaluate_deterministic(
+             task_result.response,
+             eval_config.expected,
+             expect_error=eval_config.expect_error,
+         )
+
+     # LLM evaluation if prompt is set
+     if eval_config.prompt:
+         return evaluate_llm(task_result.response, eval_config)
+
+     # No evaluation method configured
+     return None
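
For orientation, here is a minimal usage sketch (not part of the wheel) showing how the evaluation entry points above behave. It assumes onetool-mcp is installed so that bench.harness.evaluate is importable, and it substitutes types.SimpleNamespace objects for the real TaskConfig/HarnessConfig/TaskResult/EvaluateConfig models, supplying only the attributes the code above reads (response, evaluate, evaluators, regex, expect_match, expected, expect_error, prompt, model); the real config classes live in bench/harness/config.py and may differ.

# Illustrative sketch only - not shipped in the package.
# SimpleNamespace stands in for the real config/result models (hypothetical shapes).
from types import SimpleNamespace

from bench.harness.evaluate import (
    evaluate_deterministic,
    evaluate_regex,
    evaluate_task,
)

response = 'Computed primes: [97, 101] and wrote {"status": "ok"} to disk.'

# Regex evaluation: pass/fail on a single pattern.
r1 = evaluate_regex(response, pattern=r"primes: \[\d+, \d+\]", expect_match=True)
print(r1.passed, r1.score)      # True 100

# Deterministic evaluation: a pure numeric list is matched as a serialized literal.
r2 = evaluate_deterministic(response, expected=[97, 101])
print(r2.passed, r2.reason)     # True "Expected list found"

# evaluate_task() resolves the evaluator (named or inline) and routes on whichever
# field is set: regex first, then expected, then prompt.
eval_cfg = SimpleNamespace(
    regex=None,
    expect_match=True,
    expected={"status": "ok"},  # checked via JSON-serialized containment
    expect_error=False,
    prompt=None,
    model=None,
)
task = SimpleNamespace(evaluate="contains_status")  # reference to a named evaluator
harness = SimpleNamespace(evaluators={"contains_status": eval_cfg})
task_result = SimpleNamespace(response=response)

r3 = evaluate_task(task_result, task, harness)
print(r3.passed, r3.reason)     # True "Expected dict found in response"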