hanuscode 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. hanus/__init__.py +5 -0
  2. hanus/__main__.py +10 -0
  3. hanus/action_handlers.py +76 -0
  4. hanus/action_parser.py +82 -0
  5. hanus/agent_runner.py +1445 -0
  6. hanus/analysis/__init__.py +5 -0
  7. hanus/analysis/debt.py +702 -0
  8. hanus/analysis/dependencies.py +475 -0
  9. hanus/cache/__init__.py +5 -0
  10. hanus/cache/response_cache.py +560 -0
  11. hanus/config.py +401 -0
  12. hanus/connectors/__init__.py +19 -0
  13. hanus/connectors/base.py +114 -0
  14. hanus/connectors/claude_connector.py +146 -0
  15. hanus/connectors/gemini_connector.py +141 -0
  16. hanus/connectors/glm_connector.py +160 -0
  17. hanus/connectors/ollama_connector.py +174 -0
  18. hanus/connectors/openai_connector.py +122 -0
  19. hanus/connectors/registry.py +26 -0
  20. hanus/context/__init__.py +7 -0
  21. hanus/context/manager.py +837 -0
  22. hanus/context/selective.py +626 -0
  23. hanus/error_recovery/__init__.py +5 -0
  24. hanus/error_recovery/auto_fix.py +605 -0
  25. hanus/hooks/__init__.py +5 -0
  26. hanus/hooks/manager.py +247 -0
  27. hanus/instincts/__init__.py +44 -0
  28. hanus/instincts/cli.py +372 -0
  29. hanus/instincts/detector.py +281 -0
  30. hanus/instincts/evolver.py +361 -0
  31. hanus/instincts/manager.py +343 -0
  32. hanus/instincts/types.py +253 -0
  33. hanus/logger.py +81 -0
  34. hanus/memory/__init__.py +8 -0
  35. hanus/memory/manager.py +265 -0
  36. hanus/memory/types.py +119 -0
  37. hanus/monitor.py +341 -0
  38. hanus/parallel/__init__.py +5 -0
  39. hanus/parallel/executor.py +300 -0
  40. hanus/permissions.py +182 -0
  41. hanus/plan/__init__.py +8 -0
  42. hanus/plan/mode.py +267 -0
  43. hanus/plan/models.py +152 -0
  44. hanus/plugin_manager.py +754 -0
  45. hanus/plugin_registry.py +391 -0
  46. hanus/plugins/__init__.py +1 -0
  47. hanus/plugins/arena.py +630 -0
  48. hanus/plugins/code_review.py +123 -0
  49. hanus/plugins/cortex.py +1750 -0
  50. hanus/plugins/deps_check.py +27 -0
  51. hanus/plugins/git_ops.py +33 -0
  52. hanus/plugins/metasploit.py +530 -0
  53. hanus/plugins/notes.py +583 -0
  54. hanus/plugins/search_code.py +59 -0
  55. hanus/plugins/searchsploit.py +495 -0
  56. hanus/plugins/strategist.py +175 -0
  57. hanus/plugins/webui.py +5200 -0
  58. hanus/profiles.py +479 -0
  59. hanus/profiles_builtin/__init__.py +0 -0
  60. hanus/profiles_builtin/architect/profile.yaml +12 -0
  61. hanus/profiles_builtin/architect/system_prompt.txt +71 -0
  62. hanus/profiles_builtin/deep/profile.yaml +12 -0
  63. hanus/profiles_builtin/deep/system_prompt.txt +66 -0
  64. hanus/profiles_builtin/developer/__init__.py +0 -0
  65. hanus/profiles_builtin/developer/profile.yaml +9 -0
  66. hanus/profiles_builtin/developer/system_prompt.txt +176 -0
  67. hanus/profiles_builtin/speed/profile.yaml +12 -0
  68. hanus/profiles_builtin/speed/system_prompt.txt +51 -0
  69. hanus/project_tools.py +177 -0
  70. hanus/query_engine.py +1594 -0
  71. hanus/rules/__init__.py +237 -0
  72. hanus/search/__init__.py +5 -0
  73. hanus/search/semantic.py +596 -0
  74. hanus/session_manager.py +547 -0
  75. hanus/skill_manager.py +702 -0
  76. hanus/skills/__init__.py +4 -0
  77. hanus/subagent/__init__.py +8 -0
  78. hanus/subagent/agents/__init__.py +253 -0
  79. hanus/subagent/manager.py +309 -0
  80. hanus/subagent/types.py +266 -0
  81. hanus/suggestions/__init__.py +5 -0
  82. hanus/suggestions/proactive.py +451 -0
  83. hanus/tasks/__init__.py +8 -0
  84. hanus/tasks/manager.py +330 -0
  85. hanus/tasks/models.py +106 -0
  86. hanus/terminal_prompt.py +166 -0
  87. hanus/tools.py +1849 -0
  88. hanus/ui.py +939 -0
  89. hanuscode-1.0.0.dist-info/METADATA +1151 -0
  90. hanuscode-1.0.0.dist-info/RECORD +93 -0
  91. hanuscode-1.0.0.dist-info/WHEEL +5 -0
  92. hanuscode-1.0.0.dist-info/entry_points.txt +2 -0
  93. hanuscode-1.0.0.dist-info/top_level.txt +1 -0
hanus/plugins/arena.py ADDED
@@ -0,0 +1,630 @@
1
+ # plugins/arena.py — Model Arena: Compare multiple models with a judge
2
+ """
3
+ Plugin para comparar múltiples modelos en la misma tarea.
4
+ Un modelo "juez" evalúa las respuestas de todos los modelos participantes.
5
+
6
+ Uso:
7
+ /arena config — Ver configuración actual
8
+ /arena add <provider/model> [alias] — Añadir modelo participante
9
+ /arena remove <alias> — Eliminar modelo participante
10
+ /arena judge <provider/model> — Configurar modelo juez
11
+ /arena task <prompt> — Ejecutar tarea en todos los modelos
12
+ /arena run <prompt> — Ejecutar y evaluar con el juez
13
+ /arena compare <prompt> — Ejecutar y mostrar comparación
14
+ /arena list — Listar modelos configurados
15
+ /arena clear — Limpiar configuración
16
+ """
17
+ from __future__ import annotations
18
+ import json
19
+ import time
20
+ from pathlib import Path
21
+ from typing import Dict, List, Optional, Any
22
+ from datetime import datetime
23
+
24
# Plugin metadata.
# NOTE(review): presumably read by the host plugin loader — confirm against plugin_manager.
NAME = "arena"
DESCRIPTION = "Model Arena: Compara múltiples modelos con un juez evaluador"
USAGE = "<comando> [args...] | add <model> | judge <model> | run <prompt> | results"
# Long-form help text; run() returns it together with USAGE when called with no arguments.
AGENT_DOC = """
Plugin para comparar respuestas de múltiples modelos de IA.

Comandos disponibles:
- config — Ver configuración actual
- add <provider/model> [alias] — Añadir modelo participante (ej: add openai/gpt-4 gpt4)
- remove <alias> — Eliminar modelo participante
- judge <provider/model> — Configurar modelo juez (evalúa todas las respuestas)
- task <prompt> — Ejecutar tarea en todos los modelos (sin juez)
- run <prompt> — Ejecutar tarea y evaluar con el juez
- compare <prompt> — Ejecutar y mostrar comparación detallada
- results — Ver últimos resultados guardados
- list — Listar modelos configurados
- clear — Limpiar toda la configuración

Ejemplos:
<run_plugin name="arena" args="add anthropic/claude-sonnet-4-6 claude"/>
<run_plugin name="arena" args="add openai/gpt-4o gpt4"/>
<run_plugin name="arena" args="judge anthropic/claude-opus-4-7"/>
<run_plugin name="arena" args="run Explain quantum computing in 3 sentences"/>
<run_plugin name="arena" args="results"/>
"""

# Saved configuration: JSON file that ArenaConfig.save()/load() write and read.
CONFIG_FILE = Path.home() / ".hanus" / "arena_config.json"

# System prompt for the judge; run() passes it as the system message of the judge call.
JUDGE_PROMPT = """You are an expert judge evaluating AI model responses.

You will receive:
1. A task/prompt that was given to multiple AI models
2. Responses from each model (identified by alias)

Your job is to:
1. Evaluate each response based on: accuracy, completeness, clarity, usefulness
2. Rank the responses from best to worst
3. Provide a brief justification for your ranking
4. Assign a score (1-10) to each response

Output your evaluation in this JSON format:
{
"rankings": [
{"alias": "model_alias", "rank": 1, "score": 9.5, "strengths": ["..."], "weaknesses": ["..."]},
...
],
"best": "model_alias",
"summary": "Brief overall comparison..."
}

Be fair, objective, and thorough in your evaluation."""
77
+
78
+
79
class ModelConfig:
    """One model taking part in the arena: provider, model id and alias."""

    def __init__(self, provider: str, model_id: str, alias: str = ""):
        self.provider = provider
        self.model_id = model_id
        # An empty alias falls back to the "provider/model_id" form.
        self.alias = alias if alias else f"{provider}/{model_id}"

    def to_dict(self) -> dict:
        """Return the three fields as a plain dict (used for JSON persistence)."""
        return dict(
            provider=self.provider,
            model_id=self.model_id,
            alias=self.alias,
        )

    @classmethod
    def from_dict(cls, data: dict) -> "ModelConfig":
        """Build an instance from a dict; missing keys default to ""."""
        fields = {key: data.get(key, "") for key in ("provider", "model_id", "alias")}
        return cls(**fields)

    def __repr__(self) -> str:
        return "ModelConfig({}: {}/{})".format(self.alias, self.provider, self.model_id)
104
+
105
+
106
class ArenaConfig:
    """Arena state: participant models, the judge model and the last results.

    The state is persisted as JSON in ``CONFIG_FILE``.
    """

    def __init__(self):
        self.models: List[ModelConfig] = []
        self.judge: Optional[ModelConfig] = None
        self.last_results: List[dict] = []

    def to_dict(self) -> dict:
        """Return a JSON-serialisable snapshot of the configuration."""
        return {
            "models": [m.to_dict() for m in self.models],
            "judge": self.judge.to_dict() if self.judge else None,
            "last_results": self.last_results
        }

    @classmethod
    def from_dict(cls, data: dict) -> "ArenaConfig":
        """Rebuild a configuration from the dict produced by ``to_dict``.

        Keys that are missing or explicitly ``null`` are treated as empty, so
        a partially written file does not invalidate the rest of the config.
        """
        config = cls()
        config.models = [ModelConfig.from_dict(m) for m in (data.get("models") or [])]
        judge_data = data.get("judge")
        if judge_data:
            config.judge = ModelConfig.from_dict(judge_data)
        config.last_results = data.get("last_results") or []
        return config

    def save(self):
        """Write the configuration to ``CONFIG_FILE``, creating its directory."""
        CONFIG_FILE.parent.mkdir(parents=True, exist_ok=True)
        CONFIG_FILE.write_text(json.dumps(self.to_dict(), indent=2), encoding="utf-8")

    @classmethod
    def load(cls) -> "ArenaConfig":
        """Load the configuration from ``CONFIG_FILE``.

        Returns an empty configuration when the file does not exist or cannot
        be read or parsed. In the failure case a warning is printed to stderr,
        because the next ``save()`` will overwrite the unreadable file.
        """
        if not CONFIG_FILE.exists():
            return cls()
        try:
            data = json.loads(CONFIG_FILE.read_text(encoding="utf-8"))
            return cls.from_dict(data)
        except Exception as exc:
            # Stay best-effort (never crash the plugin on a bad file), but
            # make the reset visible instead of silently dropping the config.
            import sys
            print(
                f"[Arena] ⚠️ No se pudo leer {CONFIG_FILE}: {exc}. Usando configuración vacía.",
                file=sys.stderr,
                flush=True,
            )
            return cls()
144
+
145
+
146
# Global state: the ArenaConfig cached for this process; None until _get_config() loads it.
_config: Optional[ArenaConfig] = None
148
+
149
+
150
def _get_config() -> ArenaConfig:
    """Return the cached configuration, loading it from file on first use."""
    global _config
    if _config is not None:
        return _config
    _config = ArenaConfig.load()
    return _config
156
+
157
+
158
def _get_connector(provider: str, model_id: str):
    """Create a connector for ``provider``/``model_id``.

    Loads the Hanus configuration, overrides its provider and model id, and
    asks the connector registry for a matching connector. Progress is logged
    to stderr; any error is logged and re-raised unchanged.
    """
    import sys
    from hanus.connectors.registry import ConnectorRegistry
    from hanus.config import HanusConfig

    def _log(message):
        print(message, file=sys.stderr, flush=True)

    _log(f"[Arena] 🔧 Creando conector para {provider}/{model_id}...")

    hanus_config = HanusConfig.load()
    hanus_config.provider = provider
    hanus_config.model_id = model_id

    try:
        created = ConnectorRegistry.get(provider, hanus_config.get_connector_config())
        _log(f"[Arena] ✓ Conector creado: {created.__class__.__name__}")
        return created
    except Exception as e:
        _log(f"[Arena] ❌ Error creando conector: {e}")
        raise
177
+
178
+
179
def _run_model(model: ModelConfig, prompt: str, system_prompt: str = "") -> dict:
    """Send ``prompt`` to ``model`` and return a result dict.

    The dict always has the keys alias, provider, model_id, response,
    tokens_in, tokens_out, elapsed, success and error (in that order).
    Any exception is caught and reported through ``success=False`` and
    ``error`` instead of being raised.
    """
    import sys

    def _log(message):
        print(message, file=sys.stderr, flush=True)

    def _outcome(**fields) -> dict:
        # Identity fields first, so both outcomes share the same key order.
        return {
            "alias": model.alias,
            "provider": model.provider,
            "model_id": model.model_id,
            **fields,
        }

    started = time.time()

    # Signal that the model has started working.
    _log(f"\n[Arena] 🔄 Modelo {model.alias} ({model.provider}/{model.model_id}) está procesando...")

    try:
        connector = _get_connector(model.provider, model.model_id)

        conversation = [{"role": "user", "content": prompt}]
        if system_prompt:
            conversation.insert(0, {"role": "system", "content": system_prompt})

        _log(f"[Arena] 📤 Enviando prompt a {model.alias}...")
        reply = connector.chat(messages=conversation)

        elapsed = time.time() - started
        _log(f"[Arena] ✅ {model.alias} completado en {elapsed:.2f}s ({reply.output_tokens} tokens)")

        return _outcome(
            response=reply.text,
            tokens_in=reply.input_tokens,
            tokens_out=reply.output_tokens,
            elapsed=elapsed,
            success=True,
            error=None,
        )

    except Exception as e:
        elapsed = time.time() - started
        _log(f"[Arena] ❌ Error en {model.alias}: {e}")
        return _outcome(
            response="",
            tokens_in=0,
            tokens_out=0,
            elapsed=elapsed,
            success=False,
            error=str(e),
        )
227
+
228
+
229
+ def _format_response(result: dict, max_lines: int = 20) -> str:
230
+ """Formatea una respuesta para mostrar."""
231
+ lines = []
232
+ lines.append(f"{'='*60}")
233
+ lines.append(f"Modelo: {result['alias']} ({result['provider']}/{result['model_id']})")
234
+
235
+ if not result['success']:
236
+ lines.append(f"ERROR: {result['error']}")
237
+ return "\n".join(lines)
238
+
239
+ lines.append(f"Tokens: {result['tokens_in']} in / {result['tokens_out']} out")
240
+ lines.append(f"Tiempo: {result['elapsed']:.2f}s")
241
+ lines.append(f"{'='*60}")
242
+
243
+ response_text = result['response']
244
+ response_lines = response_text.split('\n')
245
+
246
+ if len(response_lines) > max_lines:
247
+ shown = response_lines[:max_lines]
248
+ hidden = len(response_lines) - max_lines
249
+ lines.append('\n'.join(shown))
250
+ lines.append(f"\n... ({hidden} líneas omitidas)")
251
+ else:
252
+ lines.append(response_text)
253
+
254
+ return "\n".join(lines)
255
+
256
+
257
+ def _format_judge_response(response: str) -> str:
258
+ """Formatea la respuesta del juez."""
259
+ import sys
260
+ lines = []
261
+ lines.append("=" * 60)
262
+ lines.append("JUICIO DEL MODELO JUEZ")
263
+ lines.append("=" * 60)
264
+
265
+ print(f"[Arena] 📋 Formateando respuesta del juez ({len(response)} chars)...", file=sys.stderr, flush=True)
266
+
267
+ # Si la respuesta está vacía
268
+ if not response or not response.strip():
269
+ lines.append("\n⚠️ El juez no devolvió respuesta.")
270
+ return "\n".join(lines)
271
+
272
+ # Limpiar respuesta: quitar bloques de código markdown
273
+ clean_response = response.strip()
274
+ if clean_response.startswith("```"):
275
+ # Quitar ```json o ``` del inicio
276
+ first_newline = clean_response.find('\n')
277
+ if first_newline != -1:
278
+ clean_response = clean_response[first_newline + 1:]
279
+ # Quitar ``` del final
280
+ if clean_response.endswith("```"):
281
+ clean_response = clean_response[:-3].strip()
282
+
283
+ # Intentar parsear JSON si es posible
284
+ try:
285
+ data = json.loads(clean_response)
286
+ print(f"[Arena] ✅ JSON parseado correctamente", file=sys.stderr, flush=True)
287
+ if "rankings" in data:
288
+ lines.append("\nRANKING FINAL:")
289
+ lines.append("-" * 40)
290
+ for r in data.get("rankings", []):
291
+ lines.append(f"\n{r.get('rank', '?')}. {r.get('alias', '?')} (Score: {r.get('score', '?')})")
292
+ if r.get("strengths"):
293
+ lines.append(f" Fortalezas: {', '.join(r['strengths'][:3])}")
294
+ if r.get("weaknesses"):
295
+ lines.append(f" Debilidades: {', '.join(r['weaknesses'][:3])}")
296
+
297
+ if data.get("best"):
298
+ lines.append(f"\n🏆 MEJOR: {data['best']}")
299
+
300
+ if data.get("summary"):
301
+ lines.append(f"\nRESUMEN: {data['summary']}")
302
+
303
+ return "\n".join(lines)
304
+ except (json.JSONDecodeError, TypeError) as e:
305
+ print(f"[Arena] ⚠️ JSON no válido, mostrando respuesta raw: {e}", file=sys.stderr, flush=True)
306
+
307
+ # Si no es JSON, mostrar respuesta completa sin truncar
308
+ lines.append("\nRespuesta del juez:")
309
+ lines.append("-" * 40)
310
+ lines.append(response)
311
+
312
+ return "\n".join(lines)
313
+
314
+
315
def run(args: str = "") -> str:
    """Plugin entry point: parse ``args`` and execute one arena sub-command.

    ``args`` has the form ``<command> [arguments...]``. Supported commands are
    config, add, remove, judge, list, results, judge_result (alias juez),
    clear, task, run and compare. For task/run/compare everything after the
    command word is used verbatim as the prompt.

    Args:
        args: Raw argument string received from the plugin host.

    Returns:
        Text to show to the caller: the command output, a usage message, or
        the list of available commands when the command is unknown.
    """
    import sys

    def _log(message: str) -> None:
        # Progress messages go to stderr so they do not mix with the result.
        print(message, file=sys.stderr, flush=True)

    stripped = args.strip()
    if not stripped:
        return f"Uso: {USAGE}\n\n{AGENT_DOC}"

    # `rest` is everything after the command word with its inner whitespace
    # intact; `cmd_args` is at most [first_word, remainder].
    pieces = stripped.split(maxsplit=1)
    cmd = pieces[0].lower()
    rest = pieces[1] if len(pieces) > 1 else ""
    cmd_args = rest.split(maxsplit=1)

    config = _get_config()

    # ── CONFIG ──────────────────────────────────────────────────────────
    if cmd == "config":
        output = ["Configuración del Arena", "=" * 50]

        if config.judge:
            output.append(f"\nJuez: {config.judge.alias} ({config.judge.provider}/{config.judge.model_id})")
        else:
            output.append("\nJuez: No configurado")

        if config.models:
            output.append(f"\nModelos participantes ({len(config.models)}):")
            for m in config.models:
                output.append(f" • {m.alias}: {m.provider}/{m.model_id}")
        else:
            output.append("\nModelos participantes: Ninguno")

        output.append(f"\nArchivo: {CONFIG_FILE}")
        return "\n".join(output)

    # ── ADD ────────────────────────────────────────────────────────────
    if cmd == "add":
        if not cmd_args:
            return "Uso: arena add <provider/model> [alias]\nEjemplo: arena add anthropic/claude-sonnet-4-6 claude"

        model_str = cmd_args[0]
        alias = cmd_args[1] if len(cmd_args) > 1 else ""

        if "/" not in model_str:
            return "Formato inválido. Usa: provider/model (ej: anthropic/claude-sonnet-4-6)"

        provider, model_id = model_str.split("/", 1)
        if not alias:
            # Derive an alias from the last path segment of the model id.
            alias = model_id.split("/")[-1] if "/" in model_id else model_id
            alias = alias.replace("-", "_")[:20]

        # Aliases must be unique among participants.
        if any(m.alias == alias for m in config.models):
            return f"Ya existe un modelo con alias '{alias}'. Usa: arena remove {alias} primero."

        config.models.append(ModelConfig(provider=provider, model_id=model_id, alias=alias))
        config.save()

        return f"✓ Modelo añadido: {alias} ({provider}/{model_id})\nTotal: {len(config.models)} modelos"

    # ── REMOVE ─────────────────────────────────────────────────────────
    if cmd == "remove":
        if not cmd_args:
            return "Uso: arena remove <alias>"

        # `add` can store an alias containing whitespace; prefer an exact
        # match on the full remainder and fall back to the first word.
        known_aliases = {m.alias for m in config.models}
        alias = rest if rest in known_aliases else cmd_args[0]

        original_count = len(config.models)
        config.models = [m for m in config.models if m.alias != alias]

        if len(config.models) == original_count:
            return f"No se encontró modelo con alias '{alias}'"

        config.save()
        return f"✓ Modelo eliminado: {alias}\nTotal: {len(config.models)} modelos"

    # ── JUDGE ───────────────────────────────────────────────────────────
    if cmd == "judge":
        if not cmd_args:
            if config.judge:
                return f"Juez actual: {config.judge.alias} ({config.judge.provider}/{config.judge.model_id})\n\nPara cambiar: arena judge <provider/model>"
            return "No hay juez configurado.\nUsa: arena judge <provider/model>"

        model_str = cmd_args[0]

        if "/" not in model_str:
            return "Formato inválido. Usa: provider/model (ej: anthropic/claude-opus-4-7)"

        provider, model_id = model_str.split("/", 1)
        config.judge = ModelConfig(provider=provider, model_id=model_id, alias="judge")
        config.save()

        return f"✓ Juez configurado: {provider}/{model_id}"

    # ── LIST ───────────────────────────────────────────────────────────
    if cmd == "list":
        output = ["Modelos en el Arena", "=" * 50]

        if config.judge:
            output.append(f"\n[JUEZ] {config.judge.alias}: {config.judge.provider}/{config.judge.model_id}")

        if config.models:
            output.append("\nParticipantes:")
            for i, m in enumerate(config.models, 1):
                output.append(f" {i}. {m.alias}: {m.provider}/{m.model_id}")
        else:
            output.append("\nNo hay modelos participantes.")
            output.append("Añade con: arena add <provider/model> [alias]")

        return "\n".join(output)

    # ── RESULTS ─────────────────────────────────────────────────────────
    if cmd == "results":
        if not config.last_results:
            return "No hay resultados anteriores.\nUsa: arena run <prompt> o arena task <prompt>"

        output = []

        # Split participant results from the judge result. The judge is
        # stored either wrapped as {"judge": {...}} or as an entry whose
        # alias is "judge".
        model_results = []
        judge_result = None

        for result in config.last_results:
            if isinstance(result, dict) and "judge" in result:
                judge_result = result["judge"]
            elif isinstance(result, dict) and result.get("alias") == "judge":
                judge_result = result
            else:
                model_results.append(result)

        # Participant results.
        if model_results:
            output.append("=" * 60)
            output.append("RESULTADOS DE MODELOS PARTICIPANTES")
            output.append("=" * 60)
            for i, result in enumerate(model_results, 1):
                output.append(f"\n[{i}] {result.get('alias', 'Unknown')}")
                output.append(f" Status: {'✓ OK' if result.get('success') else '✗ Error'}")
                output.append(f" Tiempo: {result.get('elapsed', 0):.2f}s")
                output.append(f" Tokens: {result.get('tokens_out', 0)} out")

        # Judge result (the important part).
        if judge_result:
            output.append("\n" + "=" * 60)
            output.append("EVALUACIÓN DEL JUEZ")
            output.append("=" * 60)

            if judge_result.get("success"):
                output.append(_format_judge_response(judge_result.get("response", "")))
            else:
                output.append(f"\n❌ Error del juez: {judge_result.get('error', 'Unknown error')}")
        else:
            output.append("\n⚠️ No hay evaluación del juez guardada.")
            output.append("Usa: arena run <prompt> para ejecutar con evaluación")

        return "\n".join(output)

    # ── JUDGE RESULT ─────────────────────────────────────────────────────
    if cmd in ("judge_result", "juez"):
        # Show only the judge's response.
        if not config.last_results:
            return "No hay resultados guardados.\nUsa: arena run <prompt> para ejecutar una evaluación."

        judge_result = None
        for result in config.last_results:
            if isinstance(result, dict) and "judge" in result:
                judge_result = result["judge"]
                break
            elif isinstance(result, dict) and result.get("alias") == "judge":
                judge_result = result
                break

        if not judge_result:
            return "No hay resultado del juez.\nUsa: arena run <prompt> para ejecutar con evaluación."

        output = ["=" * 60]
        output.append("RESPUESTA COMPLETA DEL JUEZ")
        output.append("=" * 60)

        if judge_result.get("success"):
            output.append(f"\nModelo: {judge_result.get('provider', '?')}/{judge_result.get('model_id', '?')}")
            output.append(f"Tiempo: {judge_result.get('elapsed', 0):.2f}s")
            output.append(f"Tokens: {judge_result.get('tokens_out', 0)}")
            output.append("")
            output.append(_format_judge_response(judge_result.get("response", "")))
        else:
            output.append(f"\n❌ Error: {judge_result.get('error', 'Unknown error')}")

        return "\n".join(output)

    # ── CLEAR ───────────────────────────────────────────────────────────
    if cmd == "clear":
        config.models = []
        config.judge = None
        config.last_results = []
        config.save()
        return "✓ Configuración limpiada.\nUsa 'arena add' y 'arena judge' para configurar."

    # ── TASK (execute only, no judge) ───────────────────────────────────
    if cmd == "task":
        if not cmd_args:
            return "Uso: arena task <prompt>\nEjemplo: arena task Explain quantum computing"

        if not config.models:
            return "No hay modelos configurados.\nUsa: arena add <provider/model> [alias]"

        # Use the remainder verbatim so newlines/spacing in the prompt survive.
        prompt = rest
        prompt_preview = f"{prompt[:100]}{'...' if len(prompt) > 100 else ''}"
        total = len(config.models)

        _log(f"\n[Arena] 🚀 Iniciando tarea en {total} modelo(s)...")
        _log(f"[Arena] 📝 Prompt: {prompt_preview}")

        output = [f"Ejecutando tarea en {total} modelos...", "=" * 50]
        output.append(f"\nPrompt: {prompt_preview}")
        output.append("")

        results = []
        for i, model in enumerate(config.models, 1):
            _log(f"[Arena] [{i}/{total}] Preparando {model.alias}...")
            output.append(f"\nEjecutando {model.alias}...")
            result = _run_model(model, prompt)
            results.append(result)
            output.append(_format_response(result))

        _log(f"\n[Arena] ✅ Tarea completada en {total} modelo(s)")

        # Persist the results for the `results` command.
        config.last_results = results
        config.save()

        return "\n".join(output)

    # ── RUN (execute and evaluate with the judge) ───────────────────────
    if cmd in ("run", "compare"):
        if not cmd_args:
            return f"Uso: arena {cmd} <prompt>\nEjemplo: arena run Explain quantum computing"

        if not config.models:
            return "No hay modelos configurados.\nUsa: arena add <provider/model> [alias]"

        if not config.judge:
            return "No hay juez configurado.\nUsa: arena judge <provider/model>"

        # Use the remainder verbatim so newlines/spacing in the prompt survive.
        prompt = rest
        prompt_preview = f"{prompt[:100]}{'...' if len(prompt) > 100 else ''}"
        total = len(config.models)

        _log(f"\n[Arena] 🚀 ARENA MODE: Ejecutando en {total} modelo(s) + juez")
        _log(f"[Arena] 📝 Prompt: {prompt_preview}")

        output = [f"Arena: Ejecutando tarea en {total} modelos", "=" * 50]
        output.append(f"\nPrompt: {prompt_preview}")

        # Run the prompt on every participant.
        results = []
        for i, model in enumerate(config.models, 1):
            _log(f"[Arena] [{i}/{total}] Ejecutando {model.alias} ({model.provider}/{model.model_id})...")
            output.append(f"\n⏳ {model.alias} ({model.provider}/{model.model_id})...")
            result = _run_model(model, prompt)
            results.append(result)
            status = "✓" if result['success'] else "✗"
            output.append(f" {status} Completado en {result['elapsed']:.2f}s ({result['tokens_out']} tokens)")

        # Build the judge input: the task followed by each labelled response.
        _log(f"[Arena] 📊 Preparando evaluación del juez...")
        judge_input = f"""Task: {prompt}

Responses from {len(results)} models:

"""
        judge_input += "".join(
            f"\n--- {r['alias']} ---\n{r['response']}\n" for r in results
        )
        judge_input += "\n\nEvaluate all responses and provide your ranking in JSON format."

        # Run the judge.
        _log(f"[Arena] ⚖️ Ejecutando juez ({config.judge.provider}/{config.judge.model_id})...")
        output.append(f"\n{'='*50}")
        output.append(f" Evaluando con el juez ({config.judge.provider}/{config.judge.model_id})...")
        output.append(f"{'='*50}")

        judge_result = _run_model(config.judge, judge_input, JUDGE_PROMPT)

        if judge_result['success']:
            _log(f"[Arena] ✅ Evaluación completada")
            output.append(_format_judge_response(judge_result['response']))
        else:
            _log(f"[Arena] ❌ Error en evaluación: {judge_result['error']}")
            output.append(f"Error del juez: {judge_result['error']}")

        # Persist participant results plus the wrapped judge result.
        config.last_results = results + [{"judge": judge_result}]
        config.save()

        _log(f"\n[Arena] ✅ ARENA completado")
        return "\n".join(output)

    # Unknown command.
    return f"""Comando desconocido: {cmd}

Comandos disponibles:
config — Ver configuración actual
add <provider/model> [alias] — Añadir modelo participante
remove <alias> — Eliminar modelo participante
judge <provider/model> — Configurar modelo juez
task <prompt> — Ejecutar tarea en todos los modelos (sin juez)
run <prompt> — Ejecutar y evaluar con el juez
compare <prompt> — Ejecutar y mostrar comparación detallada
results — Ver últimos resultados (modelos + juez)
judge_result — Ver solo la respuesta del juez
list — Listar modelos configurados
clear — Limpiar configuración

Ejemplo de uso:
arena add anthropic/claude-sonnet-4-6 claude
arena add openai/gpt-4o gpt4
arena judge anthropic/claude-opus-4-7
arena run Explain quantum computing in 3 sentences
arena judge_result — Ver respuesta del juez"""