tsugite-cli 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tsugite/__init__.py +6 -0
- tsugite/agent_composition.py +163 -0
- tsugite/agent_inheritance.py +479 -0
- tsugite/agent_preparation.py +236 -0
- tsugite/agent_runner/__init__.py +45 -0
- tsugite/agent_runner/helpers.py +106 -0
- tsugite/agent_runner/history_integration.py +248 -0
- tsugite/agent_runner/metrics.py +100 -0
- tsugite/agent_runner/runner.py +1879 -0
- tsugite/agent_runner/validation.py +70 -0
- tsugite/agent_utils.py +167 -0
- tsugite/attachments/__init__.py +65 -0
- tsugite/attachments/auto_context.py +199 -0
- tsugite/attachments/base.py +34 -0
- tsugite/attachments/file.py +51 -0
- tsugite/attachments/inline.py +31 -0
- tsugite/attachments/storage.py +178 -0
- tsugite/attachments/url.py +59 -0
- tsugite/attachments/youtube.py +101 -0
- tsugite/benchmark/__init__.py +62 -0
- tsugite/benchmark/config.py +183 -0
- tsugite/benchmark/core.py +292 -0
- tsugite/benchmark/discovery.py +377 -0
- tsugite/benchmark/evaluators.py +671 -0
- tsugite/benchmark/execution.py +657 -0
- tsugite/benchmark/metrics.py +204 -0
- tsugite/benchmark/reports.py +420 -0
- tsugite/benchmark/utils.py +288 -0
- tsugite/builtin_agents/chat-assistant.md +53 -0
- tsugite/builtin_agents/default.md +140 -0
- tsugite/builtin_agents.py +5 -0
- tsugite/cache.py +195 -0
- tsugite/cli/__init__.py +1042 -0
- tsugite/cli/agents.py +148 -0
- tsugite/cli/attachments.py +193 -0
- tsugite/cli/benchmark.py +663 -0
- tsugite/cli/cache.py +113 -0
- tsugite/cli/config.py +272 -0
- tsugite/cli/helpers.py +534 -0
- tsugite/cli/history.py +193 -0
- tsugite/cli/init.py +387 -0
- tsugite/cli/mcp.py +193 -0
- tsugite/cli/tools.py +419 -0
- tsugite/config.py +204 -0
- tsugite/console.py +48 -0
- tsugite/constants.py +21 -0
- tsugite/core/__init__.py +19 -0
- tsugite/core/agent.py +774 -0
- tsugite/core/executor.py +300 -0
- tsugite/core/memory.py +67 -0
- tsugite/core/tools.py +271 -0
- tsugite/docker_cli.py +270 -0
- tsugite/events/__init__.py +55 -0
- tsugite/events/base.py +46 -0
- tsugite/events/bus.py +62 -0
- tsugite/events/events.py +224 -0
- tsugite/exceptions.py +40 -0
- tsugite/history/__init__.py +29 -0
- tsugite/history/index.py +210 -0
- tsugite/history/models.py +106 -0
- tsugite/history/storage.py +157 -0
- tsugite/mcp_client.py +219 -0
- tsugite/mcp_config.py +174 -0
- tsugite/md_agents.py +751 -0
- tsugite/models.py +257 -0
- tsugite/renderer.py +151 -0
- tsugite/shell_tool_config.py +265 -0
- tsugite/templates/assistant.md +14 -0
- tsugite/tools/__init__.py +265 -0
- tsugite/tools/agents.py +312 -0
- tsugite/tools/edit_strategies.py +393 -0
- tsugite/tools/fs.py +329 -0
- tsugite/tools/http.py +239 -0
- tsugite/tools/interactive.py +430 -0
- tsugite/tools/shell.py +129 -0
- tsugite/tools/shell_tools.py +214 -0
- tsugite/tools/tasks.py +339 -0
- tsugite/tsugite.py +7 -0
- tsugite/ui/__init__.py +46 -0
- tsugite/ui/base.py +638 -0
- tsugite/ui/chat.py +265 -0
- tsugite/ui/chat.tcss +92 -0
- tsugite/ui/chat_history.py +286 -0
- tsugite/ui/helpers.py +102 -0
- tsugite/ui/jsonl.py +125 -0
- tsugite/ui/live_template.py +529 -0
- tsugite/ui/plain.py +419 -0
- tsugite/ui/textual_chat.py +642 -0
- tsugite/ui/textual_handler.py +225 -0
- tsugite/ui/widgets/__init__.py +6 -0
- tsugite/ui/widgets/base_scroll_log.py +27 -0
- tsugite/ui/widgets/message_list.py +121 -0
- tsugite/ui/widgets/thought_log.py +80 -0
- tsugite/ui_context.py +90 -0
- tsugite/utils.py +367 -0
- tsugite/xdg.py +104 -0
- tsugite_cli-0.3.3.dist-info/METADATA +325 -0
- tsugite_cli-0.3.3.dist-info/RECORD +101 -0
- tsugite_cli-0.3.3.dist-info/WHEEL +4 -0
- tsugite_cli-0.3.3.dist-info/entry_points.txt +5 -0
- tsugite_cli-0.3.3.dist-info/licenses/LICENSE +235 -0
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
"""Attachment management for reusable context."""
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from datetime import datetime, timezone
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Dict, Optional, Tuple
|
|
7
|
+
|
|
8
|
+
from tsugite.xdg import get_xdg_config_path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def get_attachments_path() -> Path:
    """Locate the JSON file that persists attachment records.

    Returns:
        Whatever ``get_xdg_config_path`` resolves for "attachments.json"
    """
    attachments_file = get_xdg_config_path("attachments.json")
    return attachments_file
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_attachments() -> Dict[str, Dict[str, str]]:
    """Load attachments from JSON file.

    Returns:
        Dictionary of attachments keyed by alias; empty dict if the file
        doesn't exist or has no "attachments" key

    Raises:
        RuntimeError: If the file cannot be read or decoded, is not valid
            JSON, or does not contain a JSON object holding an
            "attachments" object
    """
    attachments_path = get_attachments_path()

    try:
        with open(attachments_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # Nothing has been saved yet; opening directly (instead of checking
        # exists() first) avoids a race with a concurrent delete.
        return {}
    except (ValueError, OSError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        raise RuntimeError(f"Failed to load attachments from {attachments_path}: {e}") from e

    if not isinstance(data, dict):
        raise RuntimeError(
            f"Failed to load attachments from {attachments_path}: "
            f"expected a JSON object, got {type(data).__name__}"
        )

    attachments = data.get("attachments", {})
    if not isinstance(attachments, dict):
        # Callers index and iterate the result as a mapping; fail here with a
        # clear message rather than later with an unrelated TypeError.
        raise RuntimeError(
            f"Failed to load attachments from {attachments_path}: "
            f"'attachments' must be a JSON object, got {type(attachments).__name__}"
        )
    return attachments
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def save_attachments(attachments: Dict[str, Dict[str, str]]) -> None:
    """Save attachments to JSON file.

    The data is written under a top-level "attachments" key as indented,
    non-ASCII-escaped UTF-8 JSON.

    Args:
        attachments: Dictionary of attachment data to save

    Raises:
        RuntimeError: If the data cannot be serialized, the directory cannot
            be created, or the file cannot be written
    """
    attachments_path = get_attachments_path()

    try:
        # Serialize before opening the file: opening with "w" truncates it, so
        # a serialization error afterwards would leave a corrupted store.
        payload = json.dumps({"attachments": attachments}, indent=2, ensure_ascii=False)

        # Ensure directory exists (inside the try so failures are reported
        # as the documented RuntimeError)
        attachments_path.parent.mkdir(parents=True, exist_ok=True)

        with open(attachments_path, "w", encoding="utf-8") as f:
            f.write(payload)
    except (OSError, TypeError, ValueError) as e:
        raise RuntimeError(f"Failed to save attachments to {attachments_path}: {e}") from e
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def add_attachment(alias: str, source: str, content: Optional[str] = None) -> None:
    """Create a new attachment record or overwrite an existing one.

    A source of "inline" or "text" (any letter case) marks an inline
    attachment whose text is stored in the record. Any other source is kept
    as a reference only and ``content`` is not stored.

    Args:
        alias: Unique identifier for the attachment
        source: Source reference (file path, URL, or "inline" for text)
        content: Text content (only for inline attachments)

    Raises:
        ValueError: If alias is empty/blank, or an inline attachment has no content
        RuntimeError: If loading or saving the attachment store fails
    """
    alias_is_blank = not alias or not alias.strip()
    if alias_is_blank:
        raise ValueError("Attachment alias cannot be empty")

    # Inline attachments carry their own text; everything else is a reference
    stores_text = source.lower() in ("inline", "text")
    if stores_text and not content:
        raise ValueError("Inline attachments require content")

    stored = load_attachments()
    timestamp = datetime.now(timezone.utc).isoformat()

    # An existing record keeps its original creation time
    if alias in stored:
        created_at = stored[alias].get("created", timestamp)
    else:
        created_at = timestamp

    record = {"source": source, "updated": timestamp}
    if stores_text:
        record["content"] = content
    record["created"] = created_at

    stored[alias] = record
    save_attachments(stored)
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def get_attachment(alias: str) -> Optional[Tuple[str, Optional[str]]]:
    """Look up a single attachment record.

    Args:
        alias: Attachment identifier

    Returns:
        ``(source, content)`` when the alias is stored, otherwise None.
        ``content`` is the record's "content" value, or None when the
        record has no such key (reference attachments).
    """
    stored = load_attachments()

    try:
        record = stored[alias]
    except KeyError:
        return None

    return record["source"], record.get("content")
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def list_attachments() -> Dict[str, Dict[str, str]]:
    """Return every stored attachment record keyed by alias.

    Returns:
        The mapping produced by ``load_attachments()``
    """
    everything = load_attachments()
    return everything
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def remove_attachment(alias: str) -> bool:
    """Delete an attachment record and persist the change.

    Args:
        alias: Attachment identifier to remove

    Returns:
        True if a record was deleted, False if the alias was not stored
        (in which case nothing is written)

    Raises:
        RuntimeError: If loading or saving the attachment store fails
    """
    stored = load_attachments()

    try:
        del stored[alias]
    except KeyError:
        return False

    save_attachments(stored)
    return True
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def search_attachments(query: str) -> Dict[str, Dict[str, str]]:
    """Find attachments whose alias or source contains the query.

    Args:
        query: Search term (case-insensitive substring)

    Returns:
        Dictionary of matching attachments, keyed by alias
    """
    stored = load_attachments()
    needle = query.lower()

    hits = {}
    for name, record in stored.items():
        if needle in name.lower():
            hits[name] = record
        elif needle in record.get("source", "").lower():
            hits[name] = record
    return hits
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Generic URL handler for HTTP/HTTPS attachments."""
|
|
2
|
+
|
|
3
|
+
import urllib.request
|
|
4
|
+
|
|
5
|
+
from tsugite.attachments.base import AttachmentHandler
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class GenericURLHandler(AttachmentHandler):
    """Handler for generic HTTP(S) URLs."""

    def can_handle(self, source: str) -> bool:
        """Check if source is an HTTP(S) URL.

        Args:
            source: Source string

        Returns:
            True if source starts with "http://" or "https://"
        """
        return source.startswith(("http://", "https://"))

    def fetch(self, source: str) -> str:
        """Fetch URL content and convert HTML to markdown if needed.

        The body is decoded with the charset declared in the response's
        Content-Type header, falling back to UTF-8 when none is declared or
        the declared name is unknown to Python.

        Args:
            source: URL to fetch

        Returns:
            Content as plain text (HTML converted to markdown when the
            optional html2text package is importable, raw HTML otherwise)

        Raises:
            ValueError: If source is not an HTTP(S) URL or the fetch fails
        """
        # urlopen also understands other schemes such as file://; refuse
        # anything this handler does not claim to support.
        if not self.can_handle(source):
            raise ValueError(f"Failed to fetch URL '{source}': only http:// and https:// URLs are supported")

        try:
            # Fetch URL - get both content and headers
            with urllib.request.urlopen(source, timeout=30) as response:
                content_type = response.headers.get("Content-Type", "").lower()
                charset = response.headers.get_content_charset() or "utf-8"
                raw = response.read()

            try:
                content = raw.decode(charset)
            except LookupError:
                # The server advertised an encoding name Python doesn't know
                content = raw.decode("utf-8")

            # Plain text, JSON, XML, etc. are returned unchanged
            if "text/html" not in content_type:
                return content

            # If HTML, convert to markdown
            try:
                import html2text
            except ImportError:
                # Fall back to raw HTML if html2text not available
                # Note: html2text is optional for better readability
                return content

            h = html2text.HTML2Text()
            h.ignore_links = False
            h.ignore_images = False
            h.body_width = 0  # Don't wrap lines
            return h.handle(content)

        except Exception as e:
            raise ValueError(f"Failed to fetch URL '{source}': {e}") from e
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
"""YouTube transcript handler for attachments."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import Optional
|
|
5
|
+
|
|
6
|
+
from tsugite.attachments.base import AttachmentHandler
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class YouTubeHandler(AttachmentHandler):
    """Handler for YouTube video transcripts."""

    def can_handle(self, source: str) -> bool:
        """Check if source is a YouTube URL.

        Args:
            source: Source string

        Returns:
            True if source contains "youtube.com/watch" or "youtu.be/",
            or starts with the "youtube:" prefix
        """
        patterns = [
            r"youtube\.com/watch",
            r"youtu\.be/",
            r"^youtube:",
        ]
        return any(re.search(pattern, source) for pattern in patterns)

    def fetch(self, source: str) -> str:
        """Fetch YouTube transcript.

        Works with both youtube-transcript-api generations: releases that
        expose the instance ``fetch()`` method and older ones that only
        provide the static ``get_transcript()``.

        Args:
            source: YouTube URL or youtube:VIDEO_ID

        Returns:
            Transcript as text, one "[MM:SS] text" line per transcript entry

        Raises:
            ValueError: If the library is missing, no video ID can be
                extracted, or the transcript cannot be fetched
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi
        except ImportError as e:
            raise ValueError(
                "youtube-transcript-api not installed. Install with: uv add youtube-transcript-api"
            ) from e

        # Extract video ID
        video_id = self._extract_video_id(source)
        if not video_id:
            raise ValueError(f"Could not extract video ID from: {source}")

        try:
            # Fetch transcript as a list of {"text", "start", ...} dicts
            if hasattr(YouTubeTranscriptApi, "fetch"):
                # Newer releases: instance API returning a FetchedTranscript
                transcript = YouTubeTranscriptApi().fetch(video_id).to_raw_data()
            else:
                # Older releases only offer the static helper
                transcript = YouTubeTranscriptApi.get_transcript(video_id)  # pylint: disable=no-member

            # Format as text
            lines = [f"[{self._format_timestamp(entry['start'])}] {entry['text']}" for entry in transcript]
            return "\n".join(lines)
        except Exception as e:
            raise ValueError(f"Failed to fetch YouTube transcript for {video_id}: {e}") from e

    def _extract_video_id(self, source: str) -> Optional[str]:
        """Extract video ID from YouTube URL or youtube: prefix.

        Args:
            source: YouTube URL or youtube:VIDEO_ID

        Returns:
            Video ID or None if not found. For the "youtube:" form the
            remainder of the string is returned as-is (possibly empty).
        """
        # Handle youtube:VIDEO_ID format
        prefix = "youtube:"
        if source.startswith(prefix):
            return source[len(prefix):]

        # Handle youtu.be/VIDEO_ID format
        match = re.search(r"youtu\.be/([a-zA-Z0-9_-]+)", source)
        if match:
            return match.group(1)

        # Handle youtube.com/watch?v=VIDEO_ID format
        match = re.search(r"[?&]v=([a-zA-Z0-9_-]+)", source)
        if match:
            return match.group(1)

        return None

    def _format_timestamp(self, seconds: float) -> str:
        """Format seconds as MM:SS timestamp.

        Minutes are not wrapped into hours, so offsets of an hour or more
        produce values such as "75:30".

        Args:
            seconds: Time in seconds

        Returns:
            Formatted timestamp string
        """
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"""Tsugite benchmark framework for evaluating agent performance across different models."""
|
|
2
|
+
|
|
3
|
+
from .config import (
|
|
4
|
+
COST_TIERS,
|
|
5
|
+
EVALUATION_WEIGHTS,
|
|
6
|
+
MODEL_COSTS,
|
|
7
|
+
PERFORMANCE_TIERS,
|
|
8
|
+
SIMILARITY_THRESHOLDS,
|
|
9
|
+
TEST_CATEGORIES,
|
|
10
|
+
get_cost_tier,
|
|
11
|
+
get_performance_tier,
|
|
12
|
+
)
|
|
13
|
+
from .core import BenchmarkConfig, BenchmarkResult, BenchmarkRunner
|
|
14
|
+
from .discovery import BenchmarkTest, TestCase, TestDiscovery
|
|
15
|
+
from .evaluators import (
|
|
16
|
+
CorrectnessEvaluator,
|
|
17
|
+
CostEvaluator,
|
|
18
|
+
LLMEvaluator,
|
|
19
|
+
PerformanceEvaluator,
|
|
20
|
+
QualityEvaluator,
|
|
21
|
+
)
|
|
22
|
+
from .execution import TestExecutor
|
|
23
|
+
from .metrics import (
|
|
24
|
+
BenchmarkMetrics,
|
|
25
|
+
BenchmarkTestResult,
|
|
26
|
+
ModelPerformance,
|
|
27
|
+
)
|
|
28
|
+
from .reports import ReportGenerator
|
|
29
|
+
|
|
30
|
+
# Names exported by `from <this package> import *`, grouped by the
# submodule each one is imported from above.
__all__ = [
    # Core
    "BenchmarkRunner",
    "BenchmarkResult",
    "BenchmarkConfig",
    # Discovery
    "TestDiscovery",
    "BenchmarkTest",
    "TestCase",
    # Execution
    "TestExecutor",
    # Evaluators
    "CorrectnessEvaluator",
    "PerformanceEvaluator",
    "QualityEvaluator",
    "CostEvaluator",
    "LLMEvaluator",
    # Metrics
    "BenchmarkMetrics",
    "BenchmarkTestResult",
    "ModelPerformance",
    # Reports
    "ReportGenerator",
    # Config
    "SIMILARITY_THRESHOLDS",
    "PERFORMANCE_TIERS",
    "EVALUATION_WEIGHTS",
    "MODEL_COSTS",
    "COST_TIERS",
    "TEST_CATEGORIES",
    "get_performance_tier",
    "get_cost_tier",
]
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""Benchmark configuration and shared constants."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Dict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass
class SimilarityThresholds:
    """Cut-off scores used by similarity-based evaluation."""

    # High similarity for string matching
    string_high_similarity: float = 0.9
    # Code similarity threshold
    code_similarity: float = 0.85
    # JSON structure similarity
    json_similarity: float = 0.8
    # Minimum score for behavior tests
    behavior_pass_threshold: float = 0.6
    # Minimum for custom criteria
    custom_criteria_threshold: float = 0.7
    # Minimum LLM judge score
    llm_evaluation_threshold: float = 0.7
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class PerformanceTiers:
    """Lower bounds for each named performance tier.

    Anything scoring below ``poor`` is labelled "Very Poor".
    """

    excellent: float = 0.9
    good: float = 0.75
    fair: float = 0.6
    poor: float = 0.4
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@dataclass
class EvaluationWeights:
    """Weighting factors applied when blending evaluation scores."""

    # Weight for LLM evaluation in blended score
    llm_weight: float = 0.4
    # Weight for base evaluation
    base_weight: float = 0.6
    # Weight for planning evaluation
    planning_weight: float = 0.3
    # Minimum planning score required
    planning_minimum: float = 0.4
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class BehaviorScores:
    """Score contributions used by behavior-based evaluation."""

    tool_usage: float = 0.3
    file_created: float = 0.3
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@dataclass
class ModelCosts:
    """Per-token prices (USD) for the model families the benchmark knows about."""

    # OpenAI
    gpt_4: float = 0.00003
    gpt_4_turbo: float = 0.00001
    gpt_3_5_turbo: float = 0.000002

    # Anthropic
    claude_3: float = 0.000015
    claude_3_5: float = 0.000015

    # Google
    gemini_pro: float = 0.0000005

    # Local models
    ollama: float = 0.0

    # Default fallback
    default: float = 0.00001

    def get_cost_for_model(self, model: str) -> float:
        """Resolve the per-token price for a model name.

        The name is lower-cased and tested for known substrings in a fixed
        order; the first rule that matches wins.

        Args:
            model: Model identifier string

        Returns:
            Price of the first matching family, or ``default`` if none match
        """
        name = model.lower()

        # Order matters: specific markers must precede the generic ones that
        # they contain (e.g. "gpt-4-turbo" before "gpt-4").
        rules = (
            (("gpt-4-turbo", "gpt-4o"), self.gpt_4_turbo),
            (("gpt-4",), self.gpt_4),
            (("gpt-3.5", "gpt-35"), self.gpt_3_5_turbo),
            (("claude-3-5", "claude-3.5"), self.claude_3_5),
            (("claude-3",), self.claude_3),
            (("gemini",), self.gemini_pro),
            (("ollama", "local"), self.ollama),
        )
        for markers, price in rules:
            if any(marker in name for marker in markers):
                return price

        return self.default
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@dataclass
class CostTiers:
    """Upper bounds (USD) for each named cost tier.

    A cost of exactly 0.0 is "Free"; anything at or above ``high`` is
    "Very High".
    """

    very_low: float = 0.001
    low: float = 0.01
    medium: float = 0.1
    high: float = 1.0
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@dataclass
class TestCategories:
    """Mapping of known test categories to their test-ID prefixes."""

    categories: Dict[str, str] = field(
        default_factory=lambda: dict(
            basic="basic_",
            tools="tools_",
            scenarios="scenarios_",
            performance="performance_",
            complex="complex_",
        )
    )

    def get_category(self, test_id: str) -> str:
        """Return the first category whose prefix starts the test ID.

        Args:
            test_id: Test identifier to classify

        Returns:
            Matching category name, or "unknown" when no prefix matches
        """
        matches = (name for name, prefix in self.categories.items() if test_id.startswith(prefix))
        return next(matches, "unknown")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
# Global configuration instances
# Each constant is one default-constructed instance of the dataclass of the
# matching name; the tier helper functions below read PERFORMANCE_TIERS and
# COST_TIERS.
SIMILARITY_THRESHOLDS = SimilarityThresholds()
PERFORMANCE_TIERS = PerformanceTiers()
EVALUATION_WEIGHTS = EvaluationWeights()
BEHAVIOR_SCORES = BehaviorScores()
MODEL_COSTS = ModelCosts()
COST_TIERS = CostTiers()
TEST_CATEGORIES = TestCategories()
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def get_performance_tier(accuracy: float) -> str:
    """Map an accuracy score to its performance tier label.

    Args:
        accuracy: Accuracy score from 0.0 to 1.0

    Returns:
        "Excellent", "Good", "Fair" or "Poor" for the highest tier in
        PERFORMANCE_TIERS whose threshold the score reaches, otherwise
        "Very Poor"
    """
    # Highest threshold first so the best reachable tier wins
    ladder = (
        (PERFORMANCE_TIERS.excellent, "Excellent"),
        (PERFORMANCE_TIERS.good, "Good"),
        (PERFORMANCE_TIERS.fair, "Fair"),
        (PERFORMANCE_TIERS.poor, "Poor"),
    )
    for floor, label in ladder:
        if accuracy >= floor:
            return label
    return "Very Poor"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def get_cost_tier(cost: float) -> str:
    """Map a cost in USD to its cost tier label.

    Args:
        cost: Cost in USD

    Returns:
        "Free" when the cost equals 0; otherwise the label of the first
        COST_TIERS bound the cost is strictly below ("Very Low", "Low",
        "Medium", "High"), or "Very High" when it is below none of them
    """
    if cost == 0:
        return "Free"

    # Lowest bound first so the cheapest applicable tier wins
    ceilings = (
        (COST_TIERS.very_low, "Very Low"),
        (COST_TIERS.low, "Low"),
        (COST_TIERS.medium, "Medium"),
        (COST_TIERS.high, "High"),
    )
    for ceiling, label in ceilings:
        if cost < ceiling:
            return label
    return "Very High"
|