hanuscode 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hanus/__init__.py +5 -0
- hanus/__main__.py +10 -0
- hanus/action_handlers.py +76 -0
- hanus/action_parser.py +82 -0
- hanus/agent_runner.py +1445 -0
- hanus/analysis/__init__.py +5 -0
- hanus/analysis/debt.py +702 -0
- hanus/analysis/dependencies.py +475 -0
- hanus/cache/__init__.py +5 -0
- hanus/cache/response_cache.py +560 -0
- hanus/config.py +401 -0
- hanus/connectors/__init__.py +19 -0
- hanus/connectors/base.py +114 -0
- hanus/connectors/claude_connector.py +146 -0
- hanus/connectors/gemini_connector.py +141 -0
- hanus/connectors/glm_connector.py +160 -0
- hanus/connectors/ollama_connector.py +174 -0
- hanus/connectors/openai_connector.py +122 -0
- hanus/connectors/registry.py +26 -0
- hanus/context/__init__.py +7 -0
- hanus/context/manager.py +837 -0
- hanus/context/selective.py +626 -0
- hanus/error_recovery/__init__.py +5 -0
- hanus/error_recovery/auto_fix.py +605 -0
- hanus/hooks/__init__.py +5 -0
- hanus/hooks/manager.py +247 -0
- hanus/instincts/__init__.py +44 -0
- hanus/instincts/cli.py +372 -0
- hanus/instincts/detector.py +281 -0
- hanus/instincts/evolver.py +361 -0
- hanus/instincts/manager.py +343 -0
- hanus/instincts/types.py +253 -0
- hanus/logger.py +81 -0
- hanus/memory/__init__.py +8 -0
- hanus/memory/manager.py +265 -0
- hanus/memory/types.py +119 -0
- hanus/monitor.py +341 -0
- hanus/parallel/__init__.py +5 -0
- hanus/parallel/executor.py +300 -0
- hanus/permissions.py +182 -0
- hanus/plan/__init__.py +8 -0
- hanus/plan/mode.py +267 -0
- hanus/plan/models.py +152 -0
- hanus/plugin_manager.py +754 -0
- hanus/plugin_registry.py +391 -0
- hanus/plugins/__init__.py +1 -0
- hanus/plugins/arena.py +630 -0
- hanus/plugins/code_review.py +123 -0
- hanus/plugins/cortex.py +1750 -0
- hanus/plugins/deps_check.py +27 -0
- hanus/plugins/git_ops.py +33 -0
- hanus/plugins/metasploit.py +530 -0
- hanus/plugins/notes.py +583 -0
- hanus/plugins/search_code.py +59 -0
- hanus/plugins/searchsploit.py +495 -0
- hanus/plugins/strategist.py +175 -0
- hanus/plugins/webui.py +5200 -0
- hanus/profiles.py +479 -0
- hanus/profiles_builtin/__init__.py +0 -0
- hanus/profiles_builtin/architect/profile.yaml +12 -0
- hanus/profiles_builtin/architect/system_prompt.txt +71 -0
- hanus/profiles_builtin/deep/profile.yaml +12 -0
- hanus/profiles_builtin/deep/system_prompt.txt +66 -0
- hanus/profiles_builtin/developer/__init__.py +0 -0
- hanus/profiles_builtin/developer/profile.yaml +9 -0
- hanus/profiles_builtin/developer/system_prompt.txt +176 -0
- hanus/profiles_builtin/speed/profile.yaml +12 -0
- hanus/profiles_builtin/speed/system_prompt.txt +51 -0
- hanus/project_tools.py +177 -0
- hanus/query_engine.py +1594 -0
- hanus/rules/__init__.py +237 -0
- hanus/search/__init__.py +5 -0
- hanus/search/semantic.py +596 -0
- hanus/session_manager.py +547 -0
- hanus/skill_manager.py +702 -0
- hanus/skills/__init__.py +4 -0
- hanus/subagent/__init__.py +8 -0
- hanus/subagent/agents/__init__.py +253 -0
- hanus/subagent/manager.py +309 -0
- hanus/subagent/types.py +266 -0
- hanus/suggestions/__init__.py +5 -0
- hanus/suggestions/proactive.py +451 -0
- hanus/tasks/__init__.py +8 -0
- hanus/tasks/manager.py +330 -0
- hanus/tasks/models.py +106 -0
- hanus/terminal_prompt.py +166 -0
- hanus/tools.py +1849 -0
- hanus/ui.py +939 -0
- hanuscode-1.0.0.dist-info/METADATA +1151 -0
- hanuscode-1.0.0.dist-info/RECORD +93 -0
- hanuscode-1.0.0.dist-info/WHEEL +5 -0
- hanuscode-1.0.0.dist-info/entry_points.txt +2 -0
- hanuscode-1.0.0.dist-info/top_level.txt +1 -0
hanus/plugins/arena.py
ADDED
|
@@ -0,0 +1,630 @@
|
|
|
1
|
+
# plugins/arena.py — Model Arena: Compare multiple models with a judge
|
|
2
|
+
"""
|
|
3
|
+
Plugin para comparar múltiples modelos en la misma tarea.
|
|
4
|
+
Un modelo "juez" evalúa las respuestas de todos los modelos participantes.
|
|
5
|
+
|
|
6
|
+
Uso:
|
|
7
|
+
/arena config — Ver configuración actual
|
|
8
|
+
/arena add <provider/model> [alias] — Añadir modelo participante
|
|
9
|
+
/arena remove <alias> — Eliminar modelo participante
|
|
10
|
+
/arena judge <provider/model> — Configurar modelo juez
|
|
11
|
+
/arena task <prompt> — Ejecutar tarea en todos los modelos
|
|
12
|
+
/arena run <prompt> — Ejecutar y evaluar con el juez
|
|
13
|
+
/arena compare <prompt> — Ejecutar y mostrar comparación
|
|
14
|
+
/arena list — Listar modelos configurados
|
|
15
|
+
/arena clear — Limpiar configuración
|
|
16
|
+
"""
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
import json
|
|
19
|
+
import time
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Dict, List, Optional, Any
|
|
22
|
+
from datetime import datetime
|
|
23
|
+
|
|
24
|
+
# Plugin metadata. USAGE and AGENT_DOC are also returned verbatim by run()
# when it is called without arguments.
# NOTE(review): presumably NAME/DESCRIPTION/USAGE/AGENT_DOC are read by the
# plugin loader — confirm against hanus/plugin_manager.py.
NAME = "arena"
DESCRIPTION = "Model Arena: Compara múltiples modelos con un juez evaluador"
USAGE = "<comando> [args...] | add <model> | judge <model> | run <prompt> | results"
# Long-form command reference with <run_plugin .../> invocation examples.
AGENT_DOC = """
Plugin para comparar respuestas de múltiples modelos de IA.

Comandos disponibles:
- config — Ver configuración actual
- add <provider/model> [alias] — Añadir modelo participante (ej: add openai/gpt-4 gpt4)
- remove <alias> — Eliminar modelo participante
- judge <provider/model> — Configurar modelo juez (evalúa todas las respuestas)
- task <prompt> — Ejecutar tarea en todos los modelos (sin juez)
- run <prompt> — Ejecutar tarea y evaluar con el juez
- compare <prompt> — Ejecutar y mostrar comparación detallada
- results — Ver últimos resultados guardados
- list — Listar modelos configurados
- clear — Limpiar toda la configuración

Ejemplos:
<run_plugin name="arena" args="add anthropic/claude-sonnet-4-6 claude"/>
<run_plugin name="arena" args="add openai/gpt-4o gpt4"/>
<run_plugin name="arena" args="judge anthropic/claude-opus-4-7"/>
<run_plugin name="arena" args="run Explain quantum computing in 3 sentences"/>
<run_plugin name="arena" args="results"/>
"""

# Saved configuration: JSON file under the user's home directory that
# ArenaConfig.save() writes and ArenaConfig.load() reads.
CONFIG_FILE = Path.home() / ".hanus" / "arena_config.json"

# System prompt for the judge: passed as the system message when run() asks the
# judge model to rank the participants; it requests a JSON object with the keys
# "rankings", "best" and "summary", which _format_judge_response() renders.
JUDGE_PROMPT = """You are an expert judge evaluating AI model responses.

You will receive:
1. A task/prompt that was given to multiple AI models
2. Responses from each model (identified by alias)

Your job is to:
1. Evaluate each response based on: accuracy, completeness, clarity, usefulness
2. Rank the responses from best to worst
3. Provide a brief justification for your ranking
4. Assign a score (1-10) to each response

Output your evaluation in this JSON format:
{
"rankings": [
{"alias": "model_alias", "rank": 1, "score": 9.5, "strengths": ["..."], "weaknesses": ["..."]},
...
],
"best": "model_alias",
"summary": "Brief overall comparison..."
}

Be fair, objective, and thorough in your evaluation."""
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
class ModelConfig:
    """One model entry of the arena: a provider, a model id and a short alias.

    When no alias is supplied (or it is empty) the alias defaults to
    ``"<provider>/<model_id>"``.
    """

    def __init__(self, provider: str, model_id: str, alias: str = ""):
        self.provider = provider
        self.model_id = model_id
        # An empty alias falls back to the fully qualified model name.
        self.alias = alias if alias else f"{provider}/{model_id}"

    def to_dict(self) -> dict:
        """Return the JSON-serialisable form of this entry."""
        return dict(
            provider=self.provider,
            model_id=self.model_id,
            alias=self.alias,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "ModelConfig":
        """Rebuild an entry from ``to_dict`` output; missing keys become ``""``."""
        provider, model_id, alias = (
            data.get(key, "") for key in ("provider", "model_id", "alias")
        )
        return cls(provider=provider, model_id=model_id, alias=alias)

    def __repr__(self) -> str:
        qualified_name = f"{self.provider}/{self.model_id}"
        return f"ModelConfig({self.alias}: {qualified_name})"
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class ArenaConfig:
    """Persistent arena state: participants, judge and the last run's results.

    The state is stored as JSON in ``CONFIG_FILE``.
    """

    def __init__(self):
        self.models: List[ModelConfig] = []       # participating models
        self.judge: Optional[ModelConfig] = None  # judge model, if configured
        self.last_results: List[dict] = []        # result records of the last run

    def to_dict(self) -> dict:
        """Return the JSON-serialisable form of the whole configuration."""
        return {
            "models": [m.to_dict() for m in self.models],
            "judge": self.judge.to_dict() if self.judge else None,
            "last_results": self.last_results,
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ArenaConfig":
        """Rebuild a configuration from ``to_dict`` output.

        Missing keys and explicit ``null`` values are treated as empty, so a
        partially written or hand-edited file still loads.
        """
        config = cls()
        config.models = [ModelConfig.from_dict(m) for m in data.get("models") or []]
        judge_data = data.get("judge")
        if judge_data:
            config.judge = ModelConfig.from_dict(judge_data)
        config.last_results = data.get("last_results") or []
        return config

    def save(self):
        """Write the configuration to ``CONFIG_FILE``.

        The JSON is written to a sibling temporary file which then replaces
        ``CONFIG_FILE``, so an interrupted write cannot leave a truncated
        configuration behind.
        """
        CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
        payload = json.dumps(self.to_dict(), indent=2)
        tmp_file = CONFIG_FILE.with_name(CONFIG_FILE.name + ".tmp")
        tmp_file.write_text(payload, encoding="utf-8")
        tmp_file.replace(CONFIG_FILE)

    @classmethod
    def load(cls) -> "ArenaConfig":
        """Load the configuration from ``CONFIG_FILE``.

        Returns an empty configuration when the file does not exist. Any other
        failure (unreadable file, invalid JSON, unexpected structure) also
        yields an empty configuration, but is reported on stderr so the user
        learns that the stored configuration was ignored.
        """
        import sys

        try:
            raw = CONFIG_FILE.read_text(encoding="utf-8")
            return cls.from_dict(json.loads(raw))
        except FileNotFoundError:
            # First use: nothing has been saved yet.
            return cls()
        except Exception as exc:
            # Best-effort boundary: never let a damaged file break the plugin.
            print(
                f"[Arena] ⚠️ No se pudo cargar {CONFIG_FILE}: {exc}",
                file=sys.stderr,
                flush=True,
            )
            return cls()
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
# Global state: module-level cache of the arena configuration. It starts as
# None and is populated by _get_config() on first use.
_config: Optional[ArenaConfig] = None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def _get_config() -> ArenaConfig:
    """Return the shared configuration, loading it from file on first access."""
    global _config
    if _config is not None:
        return _config
    # Nothing cached yet: load once and keep the instance for later calls.
    _config = ArenaConfig.load()
    return _config
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _get_connector(provider: str, model_id: str):
    """Create a connector for ``provider``/``model_id``.

    Loads the Hanus configuration, overrides its provider and model id with
    the given values and asks ``ConnectorRegistry`` for the connector.
    Progress and errors are reported on stderr; any exception raised while
    creating the connector is logged and re-raised.
    """
    import sys
    from hanus.connectors.registry import ConnectorRegistry
    from hanus.config import HanusConfig

    def _report(message):
        print(message, file=sys.stderr, flush=True)

    _report(f"[Arena] 🔧 Creando conector para {provider}/{model_id}...")

    hanus_config = HanusConfig.load()
    hanus_config.provider = provider
    hanus_config.model_id = model_id

    try:
        created = ConnectorRegistry.get(provider, hanus_config.get_connector_config())
        _report(f"[Arena] ✓ Conector creado: {created.__class__.__name__}")
    except Exception as exc:
        _report(f"[Arena] ❌ Error creando conector: {exc}")
        raise
    return created
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def _run_model(model: ModelConfig, prompt: str, system_prompt: str = "") -> dict:
|
|
180
|
+
"""Ejecuta un prompt en un modelo y retorna el resultado."""
|
|
181
|
+
import sys
|
|
182
|
+
start_time = time.time()
|
|
183
|
+
|
|
184
|
+
# Print para indicar que el modelo está trabajando
|
|
185
|
+
print(f"\n[Arena] 🔄 Modelo {model.alias} ({model.provider}/{model.model_id}) está procesando...", file=sys.stderr, flush=True)
|
|
186
|
+
|
|
187
|
+
try:
|
|
188
|
+
connector = _get_connector(model.provider, model.model_id)
|
|
189
|
+
|
|
190
|
+
messages = []
|
|
191
|
+
if system_prompt:
|
|
192
|
+
messages.append({"role": "system", "content": system_prompt})
|
|
193
|
+
messages.append({"role": "user", "content": prompt})
|
|
194
|
+
|
|
195
|
+
print(f"[Arena] 📤 Enviando prompt a {model.alias}...", file=sys.stderr, flush=True)
|
|
196
|
+
response = connector.chat(messages=messages)
|
|
197
|
+
|
|
198
|
+
elapsed = time.time() - start_time
|
|
199
|
+
print(f"[Arena] ✅ {model.alias} completado en {elapsed:.2f}s ({response.output_tokens} tokens)", file=sys.stderr, flush=True)
|
|
200
|
+
|
|
201
|
+
return {
|
|
202
|
+
"alias": model.alias,
|
|
203
|
+
"provider": model.provider,
|
|
204
|
+
"model_id": model.model_id,
|
|
205
|
+
"response": response.text,
|
|
206
|
+
"tokens_in": response.input_tokens,
|
|
207
|
+
"tokens_out": response.output_tokens,
|
|
208
|
+
"elapsed": elapsed,
|
|
209
|
+
"success": True,
|
|
210
|
+
"error": None
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
except Exception as e:
|
|
214
|
+
elapsed = time.time() - start_time
|
|
215
|
+
print(f"[Arena] ❌ Error en {model.alias}: {e}", file=sys.stderr, flush=True)
|
|
216
|
+
return {
|
|
217
|
+
"alias": model.alias,
|
|
218
|
+
"provider": model.provider,
|
|
219
|
+
"model_id": model.model_id,
|
|
220
|
+
"response": "",
|
|
221
|
+
"tokens_in": 0,
|
|
222
|
+
"tokens_out": 0,
|
|
223
|
+
"elapsed": elapsed,
|
|
224
|
+
"success": False,
|
|
225
|
+
"error": str(e)
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
def _format_response(result: dict, max_lines: int = 20) -> str:
|
|
230
|
+
"""Formatea una respuesta para mostrar."""
|
|
231
|
+
lines = []
|
|
232
|
+
lines.append(f"{'='*60}")
|
|
233
|
+
lines.append(f"Modelo: {result['alias']} ({result['provider']}/{result['model_id']})")
|
|
234
|
+
|
|
235
|
+
if not result['success']:
|
|
236
|
+
lines.append(f"ERROR: {result['error']}")
|
|
237
|
+
return "\n".join(lines)
|
|
238
|
+
|
|
239
|
+
lines.append(f"Tokens: {result['tokens_in']} in / {result['tokens_out']} out")
|
|
240
|
+
lines.append(f"Tiempo: {result['elapsed']:.2f}s")
|
|
241
|
+
lines.append(f"{'='*60}")
|
|
242
|
+
|
|
243
|
+
response_text = result['response']
|
|
244
|
+
response_lines = response_text.split('\n')
|
|
245
|
+
|
|
246
|
+
if len(response_lines) > max_lines:
|
|
247
|
+
shown = response_lines[:max_lines]
|
|
248
|
+
hidden = len(response_lines) - max_lines
|
|
249
|
+
lines.append('\n'.join(shown))
|
|
250
|
+
lines.append(f"\n... ({hidden} líneas omitidas)")
|
|
251
|
+
else:
|
|
252
|
+
lines.append(response_text)
|
|
253
|
+
|
|
254
|
+
return "\n".join(lines)
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _format_judge_response(response: str) -> str:
|
|
258
|
+
"""Formatea la respuesta del juez."""
|
|
259
|
+
import sys
|
|
260
|
+
lines = []
|
|
261
|
+
lines.append("=" * 60)
|
|
262
|
+
lines.append("JUICIO DEL MODELO JUEZ")
|
|
263
|
+
lines.append("=" * 60)
|
|
264
|
+
|
|
265
|
+
print(f"[Arena] 📋 Formateando respuesta del juez ({len(response)} chars)...", file=sys.stderr, flush=True)
|
|
266
|
+
|
|
267
|
+
# Si la respuesta está vacía
|
|
268
|
+
if not response or not response.strip():
|
|
269
|
+
lines.append("\n⚠️ El juez no devolvió respuesta.")
|
|
270
|
+
return "\n".join(lines)
|
|
271
|
+
|
|
272
|
+
# Limpiar respuesta: quitar bloques de código markdown
|
|
273
|
+
clean_response = response.strip()
|
|
274
|
+
if clean_response.startswith("```"):
|
|
275
|
+
# Quitar ```json o ``` del inicio
|
|
276
|
+
first_newline = clean_response.find('\n')
|
|
277
|
+
if first_newline != -1:
|
|
278
|
+
clean_response = clean_response[first_newline + 1:]
|
|
279
|
+
# Quitar ``` del final
|
|
280
|
+
if clean_response.endswith("```"):
|
|
281
|
+
clean_response = clean_response[:-3].strip()
|
|
282
|
+
|
|
283
|
+
# Intentar parsear JSON si es posible
|
|
284
|
+
try:
|
|
285
|
+
data = json.loads(clean_response)
|
|
286
|
+
print(f"[Arena] ✅ JSON parseado correctamente", file=sys.stderr, flush=True)
|
|
287
|
+
if "rankings" in data:
|
|
288
|
+
lines.append("\nRANKING FINAL:")
|
|
289
|
+
lines.append("-" * 40)
|
|
290
|
+
for r in data.get("rankings", []):
|
|
291
|
+
lines.append(f"\n{r.get('rank', '?')}. {r.get('alias', '?')} (Score: {r.get('score', '?')})")
|
|
292
|
+
if r.get("strengths"):
|
|
293
|
+
lines.append(f" Fortalezas: {', '.join(r['strengths'][:3])}")
|
|
294
|
+
if r.get("weaknesses"):
|
|
295
|
+
lines.append(f" Debilidades: {', '.join(r['weaknesses'][:3])}")
|
|
296
|
+
|
|
297
|
+
if data.get("best"):
|
|
298
|
+
lines.append(f"\n🏆 MEJOR: {data['best']}")
|
|
299
|
+
|
|
300
|
+
if data.get("summary"):
|
|
301
|
+
lines.append(f"\nRESUMEN: {data['summary']}")
|
|
302
|
+
|
|
303
|
+
return "\n".join(lines)
|
|
304
|
+
except (json.JSONDecodeError, TypeError) as e:
|
|
305
|
+
print(f"[Arena] ⚠️ JSON no válido, mostrando respuesta raw: {e}", file=sys.stderr, flush=True)
|
|
306
|
+
|
|
307
|
+
# Si no es JSON, mostrar respuesta completa sin truncar
|
|
308
|
+
lines.append("\nRespuesta del juez:")
|
|
309
|
+
lines.append("-" * 40)
|
|
310
|
+
lines.append(response)
|
|
311
|
+
|
|
312
|
+
return "\n".join(lines)
|
|
313
|
+
|
|
314
|
+
|
|
315
|
+
def run(args: str = "") -> str:
    """Plugin entry point: parse ``args`` and execute one arena sub-command.

    The first whitespace-separated word is the (case-insensitive) command; the
    remainder is passed to the command untouched, so prompts keep their
    original spacing and line breaks.

    Args:
        args: Command line such as ``"add openai/gpt-4o gpt4"`` or
            ``"run Explain quantum computing"``.

    Returns:
        The text to show to the user. Empty ``args`` returns the usage text;
        an unknown command returns the command summary. Unknown commands are
        rejected before the configuration is loaded.
    """
    stripped = args.strip()
    if not stripped:
        return f"Uso: {USAGE}\n\n{AGENT_DOC}"

    head, *tail = stripped.split(maxsplit=1)
    cmd = head.lower()
    rest = tail[0] if tail else ""

    handlers = {
        "config": _cmd_config,
        "add": _cmd_add,
        "remove": _cmd_remove,
        "judge": _cmd_judge,
        "list": _cmd_list,
        "results": _cmd_results,
        "judge_result": _cmd_judge_result,
        "juez": _cmd_judge_result,
        "clear": _cmd_clear,
        "task": _cmd_task,
        # NOTE(review): "compare" is documented as a detailed comparison but
        # has always behaved exactly like "run"; that behavior is kept.
        "run": _cmd_evaluate,
        "compare": _cmd_evaluate,
    }
    handler = handlers.get(cmd)
    if handler is None:
        return _unknown_command_help(cmd)
    return handler(_get_config(), cmd, rest)


def _arena_log(message: str) -> None:
    """Write a progress message to stderr immediately."""
    import sys

    print(message, file=sys.stderr, flush=True)


def _prompt_preview(prompt: str) -> str:
    """Return the first 100 characters of ``prompt``, with ``...`` if cut."""
    return f"{prompt[:100]}{'...' if len(prompt) > 100 else ''}"


def _parse_model_spec(spec: str):
    """Split ``provider/model`` into its two parts; None if either is missing."""
    provider, separator, model_id = spec.partition("/")
    if not (separator and provider and model_id):
        return None
    return provider, model_id


def _split_last_results(last_results):
    """Separate saved participant records from the judge's record.

    The ``{"judge": record}`` wrapper written by run/compare takes precedence;
    a bare record with alias ``"judge"`` is only treated as the judge when no
    wrapper exists, so a participant that happens to be aliased ``judge`` is
    not mistaken for the judge. Non-dict entries are ignored.
    """
    entries = [entry for entry in last_results if isinstance(entry, dict)]
    wrapped = [entry["judge"] for entry in entries if "judge" in entry]
    if wrapped:
        judge_record = wrapped[-1]
        model_records = [entry for entry in entries if "judge" not in entry]
    else:
        bare = [entry for entry in entries if entry.get("alias") == "judge"]
        judge_record = bare[-1] if bare else None
        model_records = [entry for entry in entries if entry.get("alias") != "judge"]
    if not isinstance(judge_record, dict):
        judge_record = None
    return model_records, judge_record


def _unknown_command_help(cmd: str) -> str:
    """Return the "unknown command" message followed by the command summary."""
    return f"""Comando desconocido: {cmd}

Comandos disponibles:
config — Ver configuración actual
add <provider/model> [alias] — Añadir modelo participante
remove <alias> — Eliminar modelo participante
judge <provider/model> — Configurar modelo juez
task <prompt> — Ejecutar tarea en todos los modelos (sin juez)
run <prompt> — Ejecutar y evaluar con el juez
compare <prompt> — Ejecutar y mostrar comparación detallada
results — Ver últimos resultados (modelos + juez)
judge_result — Ver solo la respuesta del juez
list — Listar modelos configurados
clear — Limpiar configuración

Ejemplo de uso:
arena add anthropic/claude-sonnet-4-6 claude
arena add openai/gpt-4o gpt4
arena judge anthropic/claude-opus-4-7
arena run Explain quantum computing in 3 sentences
arena judge_result — Ver respuesta del juez"""


def _cmd_config(config, cmd: str, rest: str) -> str:
    """config: show the judge, the participants and the config file path."""
    output = ["Configuración del Arena", "=" * 50]

    if config.judge:
        output.append(f"\nJuez: {config.judge.alias} ({config.judge.provider}/{config.judge.model_id})")
    else:
        output.append("\nJuez: No configurado")

    if config.models:
        output.append(f"\nModelos participantes ({len(config.models)}):")
        output.extend(f" • {m.alias}: {m.provider}/{m.model_id}" for m in config.models)
    else:
        output.append("\nModelos participantes: Ninguno")

    output.append(f"\nArchivo: {CONFIG_FILE}")
    return "\n".join(output)


def _cmd_add(config, cmd: str, rest: str) -> str:
    """add: register a participant from ``<provider/model> [alias]``."""
    tokens = rest.split()
    if not tokens:
        return "Uso: arena add <provider/model> [alias]\nEjemplo: arena add anthropic/claude-sonnet-4-6 claude"

    spec = _parse_model_spec(tokens[0])
    if spec is None:
        return "Formato inválido. Usa: provider/model (ej: anthropic/claude-sonnet-4-6)"
    provider, model_id = spec

    # The alias is a single word: "remove" matches one word, so an alias
    # containing whitespace could never be removed again.
    alias = tokens[1] if len(tokens) > 1 else ""
    if not alias:
        # Default alias: last path segment of the model id, shortened.
        alias = model_id.split("/")[-1].replace("-", "_")[:20]

    model = ModelConfig(provider=provider, model_id=model_id, alias=alias)
    alias = model.alias  # ModelConfig substitutes a default for an empty alias

    # Reject duplicates
    if any(m.alias == alias for m in config.models):
        return f"Ya existe un modelo con alias '{alias}'. Usa: arena remove {alias} primero."

    config.models.append(model)
    config.save()

    return f"✓ Modelo añadido: {alias} ({provider}/{model_id})\nTotal: {len(config.models)} modelos"


def _cmd_remove(config, cmd: str, rest: str) -> str:
    """remove: delete the participant whose alias equals the first argument."""
    tokens = rest.split()
    if not tokens:
        return "Uso: arena remove <alias>"

    alias = tokens[0]
    remaining = [m for m in config.models if m.alias != alias]

    if len(remaining) == len(config.models):
        return f"No se encontró modelo con alias '{alias}'"

    config.models = remaining
    config.save()
    return f"✓ Modelo eliminado: {alias}\nTotal: {len(config.models)} modelos"


def _cmd_judge(config, cmd: str, rest: str) -> str:
    """judge: show the current judge, or set it from ``<provider/model>``."""
    tokens = rest.split()
    if not tokens:
        if config.judge:
            return f"Juez actual: {config.judge.alias} ({config.judge.provider}/{config.judge.model_id})\n\nPara cambiar: arena judge <provider/model>"
        return "No hay juez configurado.\nUsa: arena judge <provider/model>"

    spec = _parse_model_spec(tokens[0])
    if spec is None:
        return "Formato inválido. Usa: provider/model (ej: anthropic/claude-opus-4-7)"
    provider, model_id = spec

    config.judge = ModelConfig(provider=provider, model_id=model_id, alias="judge")
    config.save()

    return f"✓ Juez configurado: {provider}/{model_id}"


def _cmd_list(config, cmd: str, rest: str) -> str:
    """list: enumerate the judge and the participants."""
    output = ["Modelos en el Arena", "=" * 50]

    if config.judge:
        output.append(f"\n[JUEZ] {config.judge.alias}: {config.judge.provider}/{config.judge.model_id}")

    if config.models:
        output.append("\nParticipantes:")
        output.extend(
            f" {i}. {m.alias}: {m.provider}/{m.model_id}"
            for i, m in enumerate(config.models, 1)
        )
    else:
        output.append("\nNo hay modelos participantes.")
        output.append("Añade con: arena add <provider/model> [alias]")

    return "\n".join(output)


def _cmd_results(config, cmd: str, rest: str) -> str:
    """results: summarise the saved participant records and the judge verdict."""
    if not config.last_results:
        return "No hay resultados anteriores.\nUsa: arena run <prompt> o arena task <prompt>"

    model_results, judge_result = _split_last_results(config.last_results)
    output = []

    # Participant results
    if model_results:
        output.append("=" * 60)
        output.append("RESULTADOS DE MODELOS PARTICIPANTES")
        output.append("=" * 60)
        for i, result in enumerate(model_results, 1):
            output.append(f"\n[{i}] {result.get('alias', 'Unknown')}")
            output.append(f" Status: {'✓ OK' if result.get('success') else '✗ Error'}")
            output.append(f" Tiempo: {result.get('elapsed', 0):.2f}s")
            output.append(f" Tokens: {result.get('tokens_out', 0)} out")

    # Judge verdict (the important part)
    if judge_result:
        output.append("\n" + "=" * 60)
        output.append("EVALUACIÓN DEL JUEZ")
        output.append("=" * 60)

        if judge_result.get("success"):
            output.append(_format_judge_response(judge_result.get("response", "")))
        else:
            output.append(f"\n❌ Error del juez: {judge_result.get('error', 'Unknown error')}")
    else:
        output.append("\n⚠️ No hay evaluación del juez guardada.")
        output.append("Usa: arena run <prompt> para ejecutar con evaluación")

    return "\n".join(output)


def _cmd_judge_result(config, cmd: str, rest: str) -> str:
    """judge_result / juez: show only the judge's saved reply."""
    if not config.last_results:
        return "No hay resultados guardados.\nUsa: arena run <prompt> para ejecutar una evaluación."

    _, judge_result = _split_last_results(config.last_results)
    if not judge_result:
        return "No hay resultado del juez.\nUsa: arena run <prompt> para ejecutar con evaluación."

    output = ["=" * 60, "RESPUESTA COMPLETA DEL JUEZ", "=" * 60]

    if judge_result.get("success"):
        output.append(f"\nModelo: {judge_result.get('provider', '?')}/{judge_result.get('model_id', '?')}")
        output.append(f"Tiempo: {judge_result.get('elapsed', 0):.2f}s")
        output.append(f"Tokens: {judge_result.get('tokens_out', 0)}")
        output.append("")
        output.append(_format_judge_response(judge_result.get("response", "")))
    else:
        output.append(f"\n❌ Error: {judge_result.get('error', 'Unknown error')}")

    return "\n".join(output)


def _cmd_clear(config, cmd: str, rest: str) -> str:
    """clear: drop participants, judge and saved results, then persist."""
    config.models = []
    config.judge = None
    config.last_results = []
    config.save()
    return "✓ Configuración limpiada.\nUsa 'arena add' y 'arena judge' para configurar."


def _cmd_task(config, cmd: str, rest: str) -> str:
    """task: run the prompt on every participant, without a judge."""
    if not rest:
        return "Uso: arena task <prompt>\nEjemplo: arena task Explain quantum computing"

    if not config.models:
        return "No hay modelos configurados.\nUsa: arena add <provider/model> [alias]"

    prompt = rest
    total = len(config.models)
    preview = _prompt_preview(prompt)

    _arena_log(f"\n[Arena] 🚀 Iniciando tarea en {total} modelo(s)...")
    _arena_log(f"[Arena] 📝 Prompt: {preview}")

    output = [f"Ejecutando tarea en {total} modelos...", "=" * 50]
    output.append(f"\nPrompt: {preview}")
    output.append("")

    results = []
    for i, model in enumerate(config.models, 1):
        _arena_log(f"[Arena] [{i}/{total}] Preparando {model.alias}...")
        output.append(f"\nEjecutando {model.alias}...")
        result = _run_model(model, prompt)
        results.append(result)
        output.append(_format_response(result))

    _arena_log(f"\n[Arena] ✅ Tarea completada en {total} modelo(s)")

    # Save the results
    config.last_results = results
    config.save()

    return "\n".join(output)


def _cmd_evaluate(config, cmd: str, rest: str) -> str:
    """run / compare: run the prompt on every participant, then ask the judge.

    Only responses from models that succeeded are sent to the judge; when no
    model succeeded the judge is not called at all.
    """
    if not rest:
        return f"Uso: arena {cmd} <prompt>\nEjemplo: arena run Explain quantum computing"

    if not config.models:
        return "No hay modelos configurados.\nUsa: arena add <provider/model> [alias]"

    if not config.judge:
        return "No hay juez configurado.\nUsa: arena judge <provider/model>"

    prompt = rest
    total = len(config.models)
    preview = _prompt_preview(prompt)

    _arena_log(f"\n[Arena] 🚀 ARENA MODE: Ejecutando en {total} modelo(s) + juez")
    _arena_log(f"[Arena] 📝 Prompt: {preview}")

    output = [f"Arena: Ejecutando tarea en {total} modelos", "=" * 50]
    output.append(f"\nPrompt: {preview}")

    # Run every participant
    results = []
    for i, model in enumerate(config.models, 1):
        _arena_log(f"[Arena] [{i}/{total}] Ejecutando {model.alias} ({model.provider}/{model.model_id})...")
        output.append(f"\n⏳ {model.alias} ({model.provider}/{model.model_id})...")
        result = _run_model(model, prompt)
        results.append(result)
        status = "✓" if result['success'] else "✗"
        output.append(f" {status} Completado en {result['elapsed']:.2f}s ({result['tokens_out']} tokens)")

    # A failed model has no answer to judge; sending its empty response
    # would ask the judge to rank text that does not exist.
    answered = [r for r in results if r['success']]
    if not answered:
        _arena_log("[Arena] ❌ Ningún modelo respondió; se omite la evaluación del juez")
        output.append(f"\n{'='*50}")
        output.append("Error del juez: ningún modelo participante respondió correctamente")
        config.last_results = results
        config.save()
        return "\n".join(output)

    # Build the judge's input
    _arena_log("[Arena] 📊 Preparando evaluación del juez...")
    sections = "".join(f"\n--- {r['alias']} ---\n{r['response']}\n" for r in answered)
    judge_input = (
        f"Task: {prompt}\n\nResponses from {len(answered)} models:\n\n"
        + sections
        + "\n\nEvaluate all responses and provide your ranking in JSON format."
    )

    # Run the judge
    _arena_log(f"[Arena] ⚖️ Ejecutando juez ({config.judge.provider}/{config.judge.model_id})...")
    output.append(f"\n{'='*50}")
    output.append(f" Evaluando con el juez ({config.judge.provider}/{config.judge.model_id})...")
    output.append(f"{'='*50}")

    judge_result = _run_model(config.judge, judge_input, JUDGE_PROMPT)

    if judge_result['success']:
        _arena_log("[Arena] ✅ Evaluación completada")
        output.append(_format_judge_response(judge_result['response']))
    else:
        _arena_log(f"[Arena] ❌ Error en evaluación: {judge_result['error']}")
        output.append(f"Error del juez: {judge_result['error']}")

    # Save the results; the judge record is wrapped so it can be told apart
    # from participant records.
    config.last_results = results + [{"judge": judge_result}]
    config.save()

    _arena_log("\n[Arena] ✅ ARENA completado")
    return "\n".join(output)
|