mageagent-local 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,951 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ MageAgent Orchestrator - Multi-Model LLM Server for MLX
4
+ Provides an OpenAI-compatible API with intelligent model routing and validation patterns
5
+
6
+ Patterns:
7
+ - mageagent:auto - Intelligent task classification and routing
8
+ - mageagent:execute - ReAct loop with REAL tool execution (reads files, runs commands)
9
+ - mageagent:validated - Generate + validate with correction loop
10
+ - mageagent:compete - Competing models with judge
11
+ - mageagent:hybrid - Qwen-72B reasoning + Hermes-3 tool extraction
12
+ - mageagent:tools - Tool-calling specialist (Hermes-3 Q8)
13
+ - mageagent:primary - Direct access to 72B model
14
+ - mageagent:validator - Direct access to 7B validator
15
+ - mageagent:fast - Quick responses with 7B model
16
+ """
17
+
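The pattern names above are served as model identifiers on the OpenAI-compatible `/v1/chat/completions` endpoint defined later in this file. A minimal client sketch (editorial, not part of the package), assuming the server was started via the `__main__` block at the bottom (default `127.0.0.1:3457`):

```python
# Editorial sketch: one request against the local server using only the stdlib.
import json
import urllib.request

payload = {
    "model": "mageagent:auto",  # any mageagent:* pattern listed above works here
    "messages": [{"role": "user", "content": "What does this server do?"}],
    "max_tokens": 256,
}
req = urllib.request.Request(
    "http://127.0.0.1:3457/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    reply = json.load(resp)
print(reply["choices"][0]["message"]["content"])
```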
18
+ import asyncio
19
+ import json
20
+ import os
21
+ import re
22
+ import time
23
+ from pathlib import Path
24
+ from typing import Optional, List, Dict, Any
25
+ from contextlib import asynccontextmanager
26
+
27
+ from fastapi import FastAPI, HTTPException
28
+ from fastapi.middleware.cors import CORSMiddleware
29
+ from pydantic import BaseModel
30
+ import mlx.core as mx
31
+ from mlx_lm import load, generate
32
+
33
+ # Model paths - using existing downloaded models
34
+ MLX_MODELS_DIR = Path.home() / ".cache" / "mlx-models"
35
+
36
+ MODELS = {
37
+ "tools": {
38
+ "path": str(MLX_MODELS_DIR / "Hermes-3-Llama-3.1-8B-8bit"),
39
+ "role": "tool calling specialist - file operations, function execution, structured output",
40
+ "quant": "Q8_0",
41
+ "memory_gb": 9,
42
+ "supports_tools": True, # Q8 reliably supports tool calling
43
+ "tok_per_sec": 50
44
+ },
45
+ "primary": {
46
+ "path": str(MLX_MODELS_DIR / "Qwen2.5-72B-Instruct-8bit"),
47
+ "role": "primary generator - planning, analysis, complex reasoning",
48
+ "quant": "Q8_0",
49
+ "memory_gb": 77,
50
+ "supports_tools": True, # Q8 supports tool calling
51
+ "tok_per_sec": 8
52
+ },
53
+ "validator": {
54
+ "path": str(MLX_MODELS_DIR / "Qwen2.5-Coder-7B-Instruct-4bit"),
55
+ "role": "fast validation, cross-checking, judging",
56
+ "quant": "Q4_K_M",
57
+ "memory_gb": 5,
58
+ "supports_tools": False,
59
+ "tok_per_sec": 105
60
+ },
61
+ "competitor": {
62
+ "path": str(MLX_MODELS_DIR / "Qwen2.5-Coder-32B-Instruct-4bit"),
63
+ "role": "competing solution generator, code specialist",
64
+ "quant": "Q4_K_M",
65
+ "memory_gb": 18,
66
+ "supports_tools": False,
67
+ "tok_per_sec": 25
68
+ }
69
+ }
70
+
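For orientation, the four entries above declare roughly 9 + 77 + 5 + 18 = 109 GB of combined weights, which is why the server lazy-loads models on first use rather than at startup. A small sketch (editorial, not part of the package) that summarizes the registry using the `MODELS` dict defined above:

```python
# Editorial sketch: report the declared footprint and which weights are present on disk.
total_gb = sum(cfg["memory_gb"] for cfg in MODELS.values())
print(f"Declared memory if every model were resident: {total_gb} GB")
for name, cfg in MODELS.items():
    status = "found" if Path(cfg["path"]).exists() else "missing"
    print(f"  {name:10s} {cfg['quant']:7s} {cfg['memory_gb']:3d} GB  {status}: {cfg['path']}")
```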
71
+ # Lazy-loaded models cache
72
+ loaded_models: Dict[str, Any] = {}
73
+ model_tokenizers: Dict[str, Any] = {}
74
+
75
+ # Request/Response models
76
+ class ChatMessage(BaseModel):
77
+ role: str
78
+ content: str
79
+
80
+ class ChatRequest(BaseModel):
81
+ model: str
82
+ messages: List[ChatMessage]
83
+ temperature: Optional[float] = 0.7
84
+ max_tokens: Optional[int] = 2048
85
+ stream: Optional[bool] = False
86
+
87
+ class ChatChoice(BaseModel):
88
+ index: int
89
+ message: ChatMessage
90
+ finish_reason: str
91
+
92
+ class Usage(BaseModel):
93
+ prompt_tokens: int
94
+ completion_tokens: int
95
+ total_tokens: int
96
+
97
+ class ChatResponse(BaseModel):
98
+ id: str
99
+ object: str = "chat.completion"
100
+ created: int
101
+ model: str
102
+ choices: List[ChatChoice]
103
+ usage: Usage
104
+
105
+ class ModelInfo(BaseModel):
106
+ id: str
107
+ object: str = "model"
108
+ created: int
109
+ owned_by: str = "mageagent"
110
+
111
+ class ModelsResponse(BaseModel):
112
+ object: str = "list"
113
+ data: List[ModelInfo]
114
+
115
+
116
+ def get_model(model_type: str) -> tuple:
117
+ """Lazy-load and cache models"""
118
+ if model_type not in MODELS:
119
+ raise ValueError(f"Unknown model type: {model_type}")
120
+
121
+ if model_type not in loaded_models:
122
+ model_config = MODELS[model_type]
123
+ model_path = model_config["path"]
124
+
125
+ if not Path(model_path).exists():
126
+ raise FileNotFoundError(f"Model not found at {model_path}")
127
+
128
+ print(f"Loading {model_type} model from {model_path}...")
129
+ start = time.time()
130
+ model, tokenizer = load(model_path)
131
+ print(f"Loaded {model_type} in {time.time() - start:.1f}s")
132
+
133
+ loaded_models[model_type] = model
134
+ model_tokenizers[model_type] = tokenizer
135
+
136
+ return loaded_models[model_type], model_tokenizers[model_type]
137
+
138
+
139
+ def format_chat_prompt(messages: List[ChatMessage], tokenizer) -> str:
140
+ """Format messages into a chat prompt using the tokenizer's chat template"""
141
+ formatted_messages = [{"role": m.role, "content": m.content} for m in messages]
142
+
143
+ # Use the tokenizer's chat template if available
144
+ if hasattr(tokenizer, 'apply_chat_template'):
145
+ return tokenizer.apply_chat_template(
146
+ formatted_messages,
147
+ tokenize=False,
148
+ add_generation_prompt=True
149
+ )
150
+
151
+ # Fallback to simple formatting
152
+ prompt = ""
153
+ for msg in formatted_messages:
154
+ if msg["role"] == "system":
155
+ prompt += f"<|im_start|>system\n{msg['content']}<|im_end|>\n"
156
+ elif msg["role"] == "user":
157
+ prompt += f"<|im_start|>user\n{msg['content']}<|im_end|>\n"
158
+ elif msg["role"] == "assistant":
159
+ prompt += f"<|im_start|>assistant\n{msg['content']}<|im_end|>\n"
160
+ prompt += "<|im_start|>assistant\n"
161
+ return prompt
162
+
163
+
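A quick sketch (editorial, not part of the package) of what the fallback branch produces when a tokenizer has no `apply_chat_template`; the ChatML-style markers match the strings used above:

```python
# Editorial sketch: exercise the fallback formatting with a stand-in tokenizer
# that deliberately lacks apply_chat_template.
class _PlainTokenizer:
    pass

demo = format_chat_prompt(
    [
        ChatMessage(role="system", content="You are terse."),
        ChatMessage(role="user", content="ping"),
    ],
    _PlainTokenizer(),
)
print(demo)
# <|im_start|>system
# You are terse.<|im_end|>
# <|im_start|>user
# ping<|im_end|>
# <|im_start|>assistant
```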
164
+ async def generate_with_model(
165
+ model_type: str,
166
+ messages: List[ChatMessage],
167
+ max_tokens: int = 2048,
168
+ temperature: float = 0.7
169
+ ) -> str:
170
+ """Generate response using specified model"""
171
+ model, tokenizer = get_model(model_type)
172
+ prompt = format_chat_prompt(messages, tokenizer)
173
+
174
+ # Run generation in a thread pool to not block
175
+ loop = asyncio.get_running_loop()
176
+ response = await loop.run_in_executor(
177
+ None,
178
+ lambda: generate(
179
+ model,
180
+ tokenizer,
181
+ prompt=prompt,
182
+ max_tokens=max_tokens,
183
+ verbose=False
184
+ )
185
+ )
186
+
187
+ return response
188
+
189
+
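A usage sketch (editorial, not part of the package) for calling this helper directly; it assumes the validator weights exist under `~/.cache/mlx-models`. Note that the `temperature` argument is accepted but not currently forwarded to `mlx_lm.generate` above.

```python
# Editorial sketch: one direct call to the helper from a script context.
async def _demo_generate() -> None:
    reply = await generate_with_model(
        "validator",
        [ChatMessage(role="user", content="Reply with exactly one word: hello")],
        max_tokens=8,
        temperature=0.0,
    )
    print(reply)

# Uncomment to run (loads the 7B validator on first call):
# asyncio.run(_demo_generate())
```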
190
+ def needs_tool_extraction(prompt: str) -> bool:
191
+ """Check if the prompt requires tool extraction - be very liberal"""
192
+ tool_patterns = [
193
+ r'\bread\b.*\bfile\b', r'\bwrite\b.*\bfile\b', r'\blist\b',
194
+ r'\bexecute\b', r'\brun\b', r'\bcreate\b.*\bfile\b', r'\bdelete\b',
195
+ r'\bsearch\b', r'\bfind\b', r'\bedit\b', r'\bmodify\b',
196
+ r'\btool\b', r'\bfunction\b.*\bcall\b', r'\bapi\b.*\bcall\b',
197
+ r'\bglob\b', r'\bgrep\b', r'\bbash\b', r'\bshell\b',
198
+ # Filesystem-related patterns
199
+ r'\bfile[s]?\b', r'\bdirectory\b', r'\bfolder\b', r'\bpath\b',
200
+ r'/users/', r'~/', r'\.\w+$', # Path patterns
201
+ # Count/listing patterns
202
+ r'\bhow many\b', r'\bcount\b', r'\blist\b.*\bfiles?\b',
203
+ # Web patterns
204
+ r'\bweb\b.*\bsearch\b', r'\bsearch\b.*\bweb\b', r'\bonline\b',
205
+ r'\binternet\b', r'\burl\b', r'\bhttp', r'\bfetch\b',
206
+ # Command patterns
207
+ r'\bcommand\b', r'\bterminal\b', r'\bcli\b'
208
+ ]
209
+ return any(re.search(p, prompt.lower()) for p in tool_patterns)
210
+
211
+
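Two quick checks (editorial, not part of the package) showing how the liberal heuristic above behaves: filesystem and counting phrases trigger extraction, purely conversational prompts do not.

```python
# Editorial sketch: expected behaviour of the heuristic on sample prompts.
assert needs_tool_extraction("How many Python files are in ~/Projects?")
assert not needs_tool_extraction("Summarize the plot of Hamlet for me")
```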
212
+ async def extract_tool_calls(user_content: str, response: str) -> Optional[list]:
213
+ """
214
+ Use Hermes-3 Q8 to extract tool calls from any response.
215
+ This is the ONLY model that should handle tool extraction.
216
+ """
217
+ print("Hermes-3 Q8 extracting tool calls...")
218
+ tool_messages = [
219
+ ChatMessage(role="system", content="""You are an AGGRESSIVE tool-calling assistant. Your job is to identify what tools are needed to complete a task.
220
+
221
+ ALWAYS prefer using tools over generating text explanations. If the task involves:
222
+ - Reading/viewing files → Use Read tool
223
+ - Running commands → Use Bash tool
224
+ - Finding files → Use Glob or Bash with find/ls
225
+ - Searching content → Use Grep
226
+ - Counting files → Use Bash with find | wc -l
227
+ - Web search → Use WebSearch
228
+ - Fetching URLs → Use WebFetch
229
+
230
+ Output tool calls as JSON array:
231
+ [{"tool": "tool_name", "arguments": {"arg1": "value1"}}]
232
+
233
+ Available tools:
234
+ - Read: {"file_path": "path"} - Read file contents (use absolute paths)
235
+ - Write: {"file_path": "path", "content": "content"} - Write to file
236
+ - Edit: {"file_path": "path", "old_string": "text", "new_string": "text"} - Edit file
237
+ - Bash: {"command": "shell_command"} - Execute ANY shell command (ls, find, cat, etc.)
238
+ - Glob: {"pattern": "**/*.py", "path": "dir"} - Find files by pattern
239
+ - Grep: {"pattern": "regex", "path": "dir"} - Search file contents
240
+ - WebSearch: {"query": "search terms"} - Search the web
241
+ - WebFetch: {"url": "https://...", "prompt": "what to extract"} - Fetch and process URL
242
+
243
+ IMPORTANT: If the task requires getting ACTUAL data from the filesystem or web, you MUST output tools.
244
+ Only output [] if the task is purely conversational with no data needs.
245
+
246
+ Example: "How many Python files in /foo?" → [{"tool": "Bash", "arguments": {"command": "find /foo -name '*.py' | wc -l"}}]
247
+ Example: "List files in /bar" → [{"tool": "Bash", "arguments": {"command": "ls -la /bar"}}]"""),
248
+ ChatMessage(role="user", content=f"""Task: {user_content}
249
+
250
+ The model's initial response was:
251
+ {response[:1000]}
252
+
253
+ What tools should be executed to complete this task? Output JSON array only:""")
254
+ ]
255
+
256
+ tool_response = await generate_with_model("tools", tool_messages, 512, 0.1)
257
+
258
+ # Parse tool calls
259
+ try:
260
+ match = re.search(r'\[.*\]', tool_response, re.DOTALL)
261
+ if match:
262
+ return json.loads(match.group())
263
+ except json.JSONDecodeError:
264
+ pass
265
+ return None
266
+
267
+
268
+ async def execute_extracted_tools(
269
+ tool_calls: list,
270
+ user_content: str,
271
+ initial_response: str,
272
+ max_iterations: int = 3
273
+ ) -> Dict[str, Any]:
274
+ """
275
+ Execute extracted tool calls and feed results back for a final response.
276
+ This is the shared tool execution logic used by ALL patterns.
277
+ """
278
+ if not tool_calls:
279
+ return {
280
+ "final_response": initial_response,
281
+ "observations": [],
282
+ "tools_executed": 0
283
+ }
284
+
285
+ from .tool_executor import ToolExecutor
286
+ executor = ToolExecutor()
287
+
288
+ all_observations = []
289
+
290
+ # Execute all tool calls
291
+ print(f"Executing {len(tool_calls)} extracted tool(s)...")
292
+ for i, tc in enumerate(tool_calls):
293
+ tool_name = tc.get("tool", "unknown")
294
+ print(f" [{i+1}/{len(tool_calls)}] {tool_name}")
295
+
296
+ result = executor.execute(tc)
297
+ all_observations.append({
298
+ "tool": tool_name,
299
+ "arguments": tc.get("arguments", {}),
300
+ "result": result
301
+ })
302
+
303
+ if "error" in result:
304
+ print(f" ❌ {result['error']}")
305
+ else:
306
+ print(f" ✓ Success")
307
+
308
+ # Generate final response with tool results
309
+ print("Generating final response with tool results...")
310
+ obs_text = "\n\n".join([
311
+ f"**{o['tool']}** ({json.dumps(o['arguments'])}): ```{json.dumps(o['result'], indent=2)[:500]}```"
312
+ for o in all_observations
313
+ ])
314
+
315
+ final_messages = [
316
+ ChatMessage(role="system", content="You are a helpful assistant. Use the tool execution results provided to give an accurate, factual answer."),
317
+ ChatMessage(role="user", content=f"""Original task: {user_content}
318
+
319
+ Tool execution results:
320
+ {obs_text}
321
+
322
+ Based on these ACTUAL results, provide your final answer:""")
323
+ ]
324
+
325
+ final_response = await generate_with_model("tools", final_messages, 2048, 0.3)
326
+
327
+ return {
328
+ "final_response": final_response,
329
+ "observations": all_observations,
330
+ "tools_executed": len(all_observations)
331
+ }
332
+
333
+
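The `ToolExecutor` imported from `.tool_executor` is not included in this diff. The sketch below (hypothetical, not the package's implementation) shows only the minimal interface this function relies on: an `execute(tool_call)` method that returns a dict and reports failures under an `"error"` key.

```python
# Hypothetical sketch of the interface assumed above; the real .tool_executor
# module is not shown in this diff and may differ.
import subprocess
from pathlib import Path

class SketchToolExecutor:
    def execute(self, tool_call: dict) -> dict:
        tool = tool_call.get("tool")
        args = tool_call.get("arguments", {})
        try:
            if tool == "Bash":
                proc = subprocess.run(
                    args.get("command", ""), shell=True,
                    capture_output=True, text=True, timeout=30,
                )
                return {"stdout": proc.stdout, "stderr": proc.stderr,
                        "exit_code": proc.returncode}
            if tool == "Read":
                return {"content": Path(args["file_path"]).read_text()}
            return {"error": f"unsupported tool: {tool}"}
        except Exception as exc:  # surface failures the way the caller expects
            return {"error": str(exc)}
```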
334
+ def classify_task(prompt: str) -> str:
335
+ """Classify task type for model routing"""
336
+ coding_patterns = [
337
+ r'\bwrite\b.*\bcode\b', r'\bimplement\b', r'\bfunction\b',
338
+ r'\bclass\b', r'\brefactor\b', r'\bfix\b.*\bbug\b',
339
+ r'```', r'\btypescript\b', r'\bpython\b', r'\brust\b',
340
+ r'\bjavascript\b', r'\bjava\b', r'\bgo\b', r'\bc\+\+\b'
341
+ ]
342
+ reasoning_patterns = [
343
+ r'\bexplain\b', r'\banalyze\b', r'\bplan\b', r'\bdesign\b',
344
+ r'\barchitecture\b', r'\bwhy\b', r'\bhow does\b', r'\bcompare\b',
345
+ r'\bwhat is\b', r'\bdefine\b', r'\bdescribe\b'
346
+ ]
347
+
348
+ prompt_lower = prompt.lower()
349
+
350
+ coding_score = sum(1 for p in coding_patterns if re.search(p, prompt_lower))
351
+ reasoning_score = sum(1 for p in reasoning_patterns if re.search(p, prompt_lower))
352
+
353
+ if coding_score > reasoning_score:
354
+ return "coding"
355
+ elif reasoning_score > 0:
356
+ return "reasoning"
357
+ else:
358
+ return "simple"
359
+
360
+
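Three quick checks (editorial, not part of the package) illustrating the routing decisions the keyword scoring above produces:

```python
# Editorial sketch: expected classifications for sample prompts.
assert classify_task("Implement a function to parse CSV in Python") == "coding"
assert classify_task("Explain why the architecture uses a message queue") == "reasoning"
assert classify_task("Good morning!") == "simple"
```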
361
+ async def generate_with_validation(
362
+ messages: List[ChatMessage],
363
+ max_tokens: int = 2048,
364
+ temperature: float = 0.7
365
+ ) -> Dict[str, Any]:
366
+ """Generate with primary model, then validate with validator model"""
367
+
368
+ # Step 1: Generate with primary model
369
+ print("Step 1: Generating with primary model (72B)...")
370
+ primary_response = await generate_with_model(
371
+ "primary", messages, max_tokens, temperature
372
+ )
373
+
374
+ # Step 2: Validate with fast model
375
+ print("Step 2: Validating with validator model (7B)...")
376
+ user_content = messages[-1].content if messages else ""
377
+
378
+ validation_messages = [
379
+ ChatMessage(role="system", content="""You are a code reviewer. Review the response for issues:
380
+ 1. Syntax errors
381
+ 2. Logic bugs
382
+ 3. Missing error handling
383
+ 4. Security vulnerabilities
384
+ 5. Performance problems
385
+
386
+ Output ONLY "PASS" if no issues found, or "FAIL: <brief list of issues>" if problems exist."""),
387
+ ChatMessage(role="user", content=f"""Original question:
388
+ {user_content}
389
+
390
+ Response to review:
391
+ {primary_response}
392
+
393
+ Your review (PASS or FAIL with issues):""")
394
+ ]
395
+
396
+ validation = await generate_with_model(
397
+ "validator", validation_messages, 512, 0.3
398
+ )
399
+
400
+ # Step 3: If issues found, regenerate with feedback
401
+ needs_revision = "FAIL" in validation.upper() or "PASS" not in validation.upper()
402
+
403
+ if needs_revision:
404
+ print("Step 3: Issues found, regenerating with feedback...")
405
+ revision_messages = messages.copy()
406
+ revision_messages.append(ChatMessage(
407
+ role="assistant",
408
+ content=primary_response
409
+ ))
410
+ revision_messages.append(ChatMessage(
411
+ role="user",
412
+ content=f"""The previous response had these issues:
413
+ {validation}
414
+
415
+ Please provide a corrected response addressing these issues."""
416
+ ))
417
+
418
+ primary_response = await generate_with_model(
419
+ "primary", revision_messages, max_tokens, temperature
420
+ )
421
+
422
+ # Step 4: Extract AND EXECUTE tool calls if needed
423
+ tool_calls = None
424
+ tool_result = {"observations": [], "tools_executed": 0}
425
+ final_response = primary_response
426
+
427
+ if needs_tool_extraction(user_content):
428
+ print("Step 4: Hermes-3 Q8 extracting tool calls...")
429
+ tool_calls = await extract_tool_calls(user_content, primary_response)
430
+
431
+ if tool_calls:
432
+ print("Step 5: EXECUTING extracted tools...")
433
+ tool_result = await execute_extracted_tools(
434
+ tool_calls, user_content, primary_response
435
+ )
436
+ final_response = tool_result["final_response"]
437
+
438
+ return {
439
+ "response": final_response,
440
+ "validation": validation,
441
+ "revised": needs_revision,
442
+ "tool_calls": tool_calls,
443
+ "observations": tool_result["observations"],
444
+ "tools_executed": tool_result["tools_executed"],
445
+ "model_flow": f"72B-Q8 -> 7B-validator -> hermes-3-Q8 -> exec ({tool_result['tools_executed']} tools)" if tool_calls else "72B-Q8 -> 7B-validator"
446
+ }
447
+
448
+
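The revision decision above treats anything that is not an explicit PASS as needing a second pass. A small check (editorial, not part of the package) of how that condition evaluates for typical validator outputs:

```python
# Editorial sketch: the same condition as in generate_with_validation, applied to
# three representative validator verdicts.
def _would_revise(validation: str) -> bool:
    return "FAIL" in validation.upper() or "PASS" not in validation.upper()

assert _would_revise("FAIL: missing error handling")
assert not _would_revise("PASS")
assert _would_revise("Looks fine to me")  # no explicit PASS, so it is revised anyway
```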
449
+ async def generate_competing(
450
+ messages: List[ChatMessage],
451
+ max_tokens: int = 2048,
452
+ temperature: float = 0.7
453
+ ) -> Dict[str, Any]:
454
+ """Generate with two models sequentially, judge picks best"""
455
+
456
+ # Step 1: Generate with both models SEQUENTIALLY (parallel crashes Metal on large models)
457
+ print("Step 1a: Generating with primary (72B)...")
458
+ primary_response = await generate_with_model("primary", messages, max_tokens, temperature)
459
+
460
+ print("Step 1b: Generating with competitor (32B)...")
461
+ competitor_response = await generate_with_model("competitor", messages, max_tokens, temperature)
462
+
463
+ # Step 2: Judge picks best
464
+ print("Step 2: Judging with validator (7B)...")
465
+ user_content = messages[-1].content if messages else ""
466
+
467
+ judge_messages = [
468
+ ChatMessage(role="system", content="""You are a code quality judge. Compare two solutions and pick the better one.
469
+ Consider: correctness, efficiency, readability, error handling.
470
+ Output ONLY "A" or "B" followed by a brief one-sentence explanation."""),
471
+ ChatMessage(role="user", content=f"""Original question:
472
+ {user_content}
473
+
474
+ Solution A (72B reasoning model):
475
+ {primary_response}
476
+
477
+ Solution B (32B coding model):
478
+ {competitor_response}
479
+
480
+ Which is better? (A or B with brief reason):""")
481
+ ]
482
+
483
+ judgment = await generate_with_model("validator", judge_messages, 256, 0.3)
484
+
485
+ # Parse judgment
486
+ winner = "A" if judgment.strip().startswith("A") else "B"
487
+ best_response = primary_response if winner == "A" else competitor_response
488
+
489
+ # Step 3: Extract AND EXECUTE tool calls if needed
490
+ tool_calls = None
491
+ tool_result = {"observations": [], "tools_executed": 0}
492
+ final_response = best_response
493
+
494
+ if needs_tool_extraction(user_content):
495
+ print("Step 3: Hermes-3 Q8 extracting tool calls...")
496
+ tool_calls = await extract_tool_calls(user_content, best_response)
497
+
498
+ if tool_calls:
499
+ print("Step 4: EXECUTING extracted tools...")
500
+ tool_result = await execute_extracted_tools(
501
+ tool_calls, user_content, best_response
502
+ )
503
+ final_response = tool_result["final_response"]
504
+
505
+ return {
506
+ "response": final_response,
507
+ "winner": winner,
508
+ "judgment": judgment,
509
+ "solution_a": primary_response,
510
+ "solution_b": competitor_response,
511
+ "tool_calls": tool_calls,
512
+ "observations": tool_result["observations"],
513
+ "tools_executed": tool_result["tools_executed"],
514
+ "model_flow": f"72B + 32B -> 7B-judge -> exec ({tool_result['tools_executed']} tools, winner: {winner})" if tool_calls else f"72B + 32B -> 7B-judge (winner: {winner})"
515
+ }
516
+
517
+
518
+ async def generate_hybrid(
519
+ messages: List[ChatMessage],
520
+ max_tokens: int = 2048,
521
+ temperature: float = 0.7
522
+ ) -> Dict[str, Any]:
523
+ """
524
+ Hybrid pattern: Qwen-72B Q8 for reasoning + Hermes-3 Q8 for tool execution
525
+ ALWAYS extracts tools via Hermes-3 for best capability, then EXECUTES them.
526
+ """
527
+
528
+ user_content = messages[-1].content if messages else ""
529
+
530
+ # Step 1: Qwen-72B generates the main response with reasoning
531
+ print("Step 1: Qwen-72B Q8 analyzing and generating response...")
532
+ primary_response = await generate_with_model(
533
+ "primary", messages, max_tokens, temperature
534
+ )
535
+
536
+ # Step 2: Extract AND EXECUTE tool calls via Hermes-3
537
+ tool_calls = None
538
+ tool_result = {"observations": [], "tools_executed": 0}
539
+ final_response = primary_response
540
+
541
+ if needs_tool_extraction(user_content):
542
+ print("Step 2: Hermes-3 Q8 extracting tool calls...")
543
+ tool_calls = await extract_tool_calls(user_content, primary_response)
544
+
545
+ if tool_calls:
546
+ print("Step 3: EXECUTING extracted tools...")
547
+ tool_result = await execute_extracted_tools(
548
+ tool_calls, user_content, primary_response
549
+ )
550
+ final_response = tool_result["final_response"]
551
+
552
+ return {
553
+ "response": final_response,
554
+ "tool_calls": tool_calls,
555
+ "observations": tool_result["observations"],
556
+ "tools_executed": tool_result["tools_executed"],
557
+ "model_flow": f"qwen-72b-q8 -> hermes-3-q8 -> exec ({tool_result['tools_executed']} tools)" if tool_calls else "qwen-72b-q8"
558
+ }
559
+
560
+
561
+ async def generate_with_tool_execution(
562
+ messages: List[ChatMessage],
563
+ max_tokens: int = 2048,
564
+ temperature: float = 0.7,
565
+ max_iterations: int = 5
566
+ ) -> Dict[str, Any]:
567
+ """
568
+ ReAct loop: Generate → Extract Tools → ACTUALLY EXECUTE → Observe → Repeat
569
+
570
+ This is the key innovation: instead of just generating tool call JSON,
571
+ we actually execute the tools and feed real results back to the model.
572
+ """
573
+ from .tool_executor import ToolExecutor
574
+ executor = ToolExecutor()
575
+
576
+ current_messages = list(messages)
577
+ all_observations = []
578
+ iterations = 0
579
+ user_content = messages[-1].content if messages else ""
580
+
581
+ print(f"Starting ReAct loop for: {user_content[:100]}...")
582
+
583
+ while iterations < max_iterations:
584
+ iterations += 1
585
+ print(f"\n=== ReAct Iteration {iterations}/{max_iterations} ===")
586
+
587
+ # Step 1: Generate response with primary model (Qwen-72B or tools model)
588
+ # First iteration uses the primary model; later iterations switch to the faster tools model
589
+ model_to_use = "tools" if iterations > 1 else "primary"
590
+ print(f"Step 1: Generating with {model_to_use} model...")
591
+
592
+ response = await generate_with_model(
593
+ model_to_use, current_messages, max_tokens, temperature
594
+ )
595
+
596
+ # Step 2: ALWAYS extract tool calls with Hermes-3 Q8 (be aggressive)
597
+ print("Step 2: Extracting tool calls with Hermes-3 Q8...")
598
+ tool_calls = await extract_tool_calls(user_content, response)
599
+
600
+ # On first iteration, be very aggressive - if no tools extracted but task seems to need them, force it
601
+ if iterations == 1 and not tool_calls and needs_tool_extraction(user_content):
602
+ print(" Forcing tool extraction for data-requiring task...")
603
+ tool_calls = await extract_tool_calls(
604
+ user_content + "\n\nIMPORTANT: This task REQUIRES using tools to get real data. Do NOT just explain - execute tools!",
605
+ response
606
+ )
607
+
608
+ if not tool_calls:
609
+ # No more tools needed - return final response
610
+ print(f"No more tools needed. Returning final response after {iterations} iterations.")
611
+ return {
612
+ "response": response,
613
+ "observations": all_observations,
614
+ "iterations": iterations,
615
+ "tools_executed": len(all_observations),
616
+ "model_flow": f"react-loop ({iterations} iterations, {len(all_observations)} tools executed)"
617
+ }
618
+
619
+ # Step 3: ACTUALLY EXECUTE tools and collect observations
620
+ print(f"Step 3: Executing {len(tool_calls)} tool(s)...")
621
+ observations = []
622
+ for i, tc in enumerate(tool_calls):
623
+ tool_name = tc.get("tool", "unknown")
624
+ print(f" Executing [{i+1}/{len(tool_calls)}]: {tool_name}")
625
+
626
+ result = executor.execute(tc)
627
+ observations.append({
628
+ "tool": tool_name,
629
+ "arguments": tc.get("arguments", {}),
630
+ "result": result
631
+ })
632
+
633
+ # Log result summary
634
+ if "error" in result:
635
+ print(f" ❌ Error: {result['error']}")
636
+ else:
637
+ result_str = str(result)[:100]
638
+ print(f" ✓ Success: {result_str}...")
639
+
640
+ all_observations.extend(observations)
641
+
642
+ # Step 4: Feed REAL observations back to model
643
+ print("Step 4: Feeding tool results back to model...")
644
+ obs_text = "\n\n".join([
645
+ f"### Tool: {o['tool']}\n**Arguments:** {json.dumps(o['arguments'])}\n**Result:**\n```json\n{json.dumps(o['result'], indent=2)}\n```"
646
+ for o in observations
647
+ ])
648
+
649
+ current_messages.append(ChatMessage(role="assistant", content=response))
650
+ current_messages.append(ChatMessage(
651
+ role="user",
652
+ content=f"""Tool execution completed. Here are the REAL results:
653
+
654
+ {obs_text}
655
+
656
+ Based on these actual results, please continue with the task. If you have all the information you need, provide your final answer. If you need more information, specify what additional tools to call."""
657
+ ))
658
+
659
+ # Max iterations reached
660
+ print(f"Max iterations ({max_iterations}) reached.")
661
+ return {
662
+ "response": response,
663
+ "observations": all_observations,
664
+ "iterations": iterations,
665
+ "max_iterations_reached": True,
666
+ "tools_executed": len(all_observations),
667
+ "model_flow": f"react-loop (max {max_iterations} iterations, {len(all_observations)} tools executed)"
668
+ }
669
+
670
+
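A driver sketch (editorial, not part of the package) for invoking the ReAct loop directly; it assumes both the model weights and the `.tool_executor` module are available in this environment.

```python
# Editorial sketch: run one ReAct session end to end and print the summary.
async def _react_demo() -> None:
    result = await generate_with_tool_execution(
        [ChatMessage(role="user", content="How many .py files are in ~/Projects?")],
        max_tokens=1024,
        temperature=0.2,
        max_iterations=3,
    )
    print(result["model_flow"])
    print(result["response"])

# Uncomment to run (loads the 72B primary model on the first iteration):
# asyncio.run(_react_demo())
```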
671
+ # FastAPI app with lifespan
672
+ @asynccontextmanager
673
+ async def lifespan(app: FastAPI):
674
+ # Startup: Pre-load validator model (smallest, always needed)
675
+ print("MageAgent server starting...")
676
+ print(f"Available models: {list(MODELS.keys())}")
677
+
678
+ # Only pre-load validator since it's small and always used
679
+ try:
680
+ print("Pre-loading validator model...")
681
+ get_model("validator")
682
+ print("Validator model ready!")
683
+ except Exception as e:
684
+ print(f"Warning: Could not pre-load validator: {e}")
685
+
686
+ yield
687
+
688
+ # Shutdown
689
+ print("MageAgent server shutting down...")
690
+ loaded_models.clear()
691
+ model_tokenizers.clear()
692
+
693
+
694
+ app = FastAPI(
695
+ title="MageAgent Orchestrator",
696
+ description="Multi-Model LLM Server with Validation Patterns",
697
+ version="1.0.0",
698
+ lifespan=lifespan
699
+ )
700
+
701
+ app.add_middleware(
702
+ CORSMiddleware,
703
+ allow_origins=["*"],
704
+ allow_credentials=True,
705
+ allow_methods=["*"],
706
+ allow_headers=["*"],
707
+ )
708
+
709
+
710
+ @app.get("/")
711
+ async def root():
712
+ return {
713
+ "name": "MageAgent Orchestrator",
714
+ "version": "2.0.0", # Major version bump for tool execution
715
+ "models": list(MODELS.keys()),
716
+ "endpoints": [
717
+ "mageagent:auto - Intelligent routing",
718
+ "mageagent:execute - ⭐ REAL tool execution (reads files, runs commands, web search)",
719
+ "mageagent:hybrid - Qwen-72B + Hermes-3 (best capability)",
720
+ "mageagent:validated - Generate + validate",
721
+ "mageagent:compete - Competing models",
722
+ "mageagent:tools - Tool calling (Hermes-3 Q8)",
723
+ "mageagent:primary - Direct 72B access (Q8)",
724
+ "mageagent:validator - Direct 7B access",
725
+ "mageagent:competitor - Direct 32B access"
726
+ ],
727
+ "new_in_v2": "mageagent:execute - ReAct loop that ACTUALLY executes tools instead of hallucinating"
728
+ }
729
+
730
+
731
+ @app.get("/v1/models")
732
+ async def list_models():
733
+ """List available models (OpenAI compatible)"""
734
+ models = [
735
+ ModelInfo(id="mageagent:auto", created=int(time.time())),
736
+ ModelInfo(id="mageagent:execute", created=int(time.time())), # NEW: Real tool execution!
737
+ ModelInfo(id="mageagent:hybrid", created=int(time.time())),
738
+ ModelInfo(id="mageagent:validated", created=int(time.time())),
739
+ ModelInfo(id="mageagent:compete", created=int(time.time())),
740
+ ModelInfo(id="mageagent:tools", created=int(time.time())),
741
+ ModelInfo(id="mageagent:primary", created=int(time.time())),
742
+ ModelInfo(id="mageagent:validator", created=int(time.time())),
743
+ ModelInfo(id="mageagent:competitor", created=int(time.time())),
744
+ ]
745
+ return ModelsResponse(data=models)
746
+
747
+
748
+ @app.get("/health")
749
+ async def health():
750
+ return {
751
+ "status": "healthy",
752
+ "loaded_models": list(loaded_models.keys()),
753
+ "available_models": list(MODELS.keys())
754
+ }
755
+
756
+
757
+ @app.post("/v1/chat/completions")
758
+ async def chat_completions(request: ChatRequest):
759
+ """OpenAI-compatible chat completions endpoint"""
760
+
761
+ start_time = time.time()
762
+ model_name = request.model
763
+
764
+ # Extract user prompt for classification
765
+ user_prompt = request.messages[-1].content if request.messages else ""
766
+
767
+ try:
768
+ if model_name == "mageagent:execute":
769
+ # ReAct loop with REAL tool execution - the key innovation!
770
+ # This actually reads files, runs commands, and searches the web
771
+ result = await generate_with_tool_execution(
772
+ request.messages,
773
+ request.max_tokens or 2048,
774
+ request.temperature or 0.7
775
+ )
776
+ response_text = result["response"]
777
+
778
+ # Add execution summary
779
+ if result.get("observations"):
780
+ tools_summary = ", ".join([o["tool"] for o in result["observations"]])
781
+ response_text += f"\n\n---\n*Executed {len(result['observations'])} tools: {tools_summary}*"
782
+
783
+ used_model = f"mageagent:execute ({result['model_flow']})"
784
+
785
+ elif model_name == "mageagent:validated":
786
+ # Generate + validate pattern (with real tool execution)
787
+ result = await generate_with_validation(
788
+ request.messages,
789
+ request.max_tokens or 2048,
790
+ request.temperature or 0.7
791
+ )
792
+ response_text = result["response"]
793
+ # Add execution summary if tools were run
794
+ if result.get("tools_executed", 0) > 0:
795
+ tools_summary = ", ".join([o["tool"] for o in result.get("observations", [])])
796
+ response_text += f"\n\n---\n*Executed {result['tools_executed']} tools: {tools_summary}*"
797
+ used_model = f"mageagent:validated ({result.get('model_flow', '72B-Q8 -> 7B-validator')})"
798
+
799
+ elif model_name == "mageagent:compete":
800
+ # Competing models pattern (with real tool execution)
801
+ result = await generate_competing(
802
+ request.messages,
803
+ request.max_tokens or 2048,
804
+ request.temperature or 0.7
805
+ )
806
+ response_text = result["response"]
807
+ # Add execution summary if tools were run
808
+ if result.get("tools_executed", 0) > 0:
809
+ tools_summary = ", ".join([o["tool"] for o in result.get("observations", [])])
810
+ response_text += f"\n\n---\n*Executed {result['tools_executed']} tools: {tools_summary}*"
811
+ winner = result.get('winner', '?')
812
+ used_model = f"mageagent:compete ({result.get('model_flow', f'winner: {winner}')})"
813
+
814
+ elif model_name == "mageagent:hybrid":
815
+ # Hybrid pattern: Qwen-72B Q8 reasoning + Hermes-3 Q8 tools (with real execution)
816
+ result = await generate_hybrid(
817
+ request.messages,
818
+ request.max_tokens or 2048,
819
+ request.temperature or 0.7
820
+ )
821
+ response_text = result["response"]
822
+ # Add execution summary if tools were run
823
+ if result.get("tools_executed", 0) > 0:
824
+ tools_summary = ", ".join([o["tool"] for o in result.get("observations", [])])
825
+ response_text += f"\n\n---\n*Executed {result['tools_executed']} tools: {tools_summary}*"
826
+ used_model = f"mageagent:hybrid ({result['model_flow']})"
827
+
828
+ elif model_name == "mageagent:auto":
829
+ # Intelligent routing based on task classification
830
+ task_type = classify_task(user_prompt)
831
+ print(f"Task classified as: {task_type}")
832
+
833
+ if task_type == "coding":
834
+ # Use validation pattern for coding tasks (with real tool execution)
835
+ result = await generate_with_validation(
836
+ request.messages,
837
+ request.max_tokens or 2048,
838
+ request.temperature or 0.7
839
+ )
840
+ response_text = result["response"]
841
+ if result.get("tools_executed", 0) > 0:
842
+ tools_summary = ", ".join([o["tool"] for o in result.get("observations", [])])
843
+ response_text += f"\n\n---\n*Executed {result['tools_executed']} tools: {tools_summary}*"
844
+ used_model = f"mageagent:auto->validated ({result.get('model_flow', '')})"
845
+ elif task_type == "reasoning":
846
+ # Use hybrid for reasoning (with real tool execution)
847
+ result = await generate_hybrid(
848
+ request.messages,
849
+ request.max_tokens or 2048,
850
+ request.temperature or 0.7
851
+ )
852
+ response_text = result["response"]
853
+ if result.get("tools_executed", 0) > 0:
854
+ tools_summary = ", ".join([o["tool"] for o in result.get("observations", [])])
855
+ response_text += f"\n\n---\n*Executed {result['tools_executed']} tools: {tools_summary}*"
856
+ used_model = f"mageagent:auto->hybrid ({result.get('model_flow', '')})"
857
+ else:
858
+ # Use fast validator for simple tasks (no tools needed)
859
+ response_text = await generate_with_model(
860
+ "validator",
861
+ request.messages,
862
+ request.max_tokens or 2048,
863
+ request.temperature or 0.7
864
+ )
865
+ used_model = "mageagent:auto->validator"
866
+
867
+ elif model_name in ["mageagent:primary", "mageagent:reasoning"]:
868
+ # Direct primary model access
869
+ response_text = await generate_with_model(
870
+ "primary",
871
+ request.messages,
872
+ request.max_tokens or 2048,
873
+ request.temperature or 0.7
874
+ )
875
+ used_model = "mageagent:primary"
876
+
877
+ elif model_name in ["mageagent:validator", "mageagent:fast"]:
878
+ # Direct validator model access
879
+ response_text = await generate_with_model(
880
+ "validator",
881
+ request.messages,
882
+ request.max_tokens or 2048,
883
+ request.temperature or 0.7
884
+ )
885
+ used_model = "mageagent:validator"
886
+
887
+ elif model_name in ["mageagent:competitor", "mageagent:coding"]:
888
+ # Direct competitor model access
889
+ response_text = await generate_with_model(
890
+ "competitor",
891
+ request.messages,
892
+ request.max_tokens or 2048,
893
+ request.temperature or 0.7
894
+ )
895
+ used_model = "mageagent:competitor"
896
+
897
+ elif model_name in ["mageagent:tools", "mageagent:hermes"]:
898
+ # Direct tools model access (Hermes-3 Q8 for tool calling)
899
+ response_text = await generate_with_model(
900
+ "tools",
901
+ request.messages,
902
+ request.max_tokens or 2048,
903
+ request.temperature or 0.7
904
+ )
905
+ used_model = "mageagent:tools"
906
+
907
+ else:
908
+ # Unknown model name: fall back to the fast validator
909
+ response_text = await generate_with_model(
910
+ "validator",
911
+ request.messages,
912
+ request.max_tokens or 2048,
913
+ request.temperature or 0.7
914
+ )
915
+ used_model = "mageagent:default->validator"
916
+
917
+ elapsed = time.time() - start_time
918
+ print(f"Request completed in {elapsed:.1f}s using {used_model}")
919
+
920
+ # Estimate token counts
921
+ prompt_tokens = sum(len(m.content.split()) for m in request.messages)
922
+ completion_tokens = len(response_text.split())
923
+
924
+ return ChatResponse(
925
+ id=f"chatcmpl-{int(time.time())}",
926
+ created=int(time.time()),
927
+ model=used_model,
928
+ choices=[
929
+ ChatChoice(
930
+ index=0,
931
+ message=ChatMessage(role="assistant", content=response_text),
932
+ finish_reason="stop"
933
+ )
934
+ ],
935
+ usage=Usage(
936
+ prompt_tokens=prompt_tokens,
937
+ completion_tokens=completion_tokens,
938
+ total_tokens=prompt_tokens + completion_tokens
939
+ )
940
+ )
941
+
942
+ except FileNotFoundError as e:
943
+ raise HTTPException(status_code=404, detail=str(e))
944
+ except Exception as e:
945
+ print(f"Error: {e}")
946
+ raise HTTPException(status_code=500, detail=str(e))
947
+
948
+
949
+ if __name__ == "__main__":
950
+ import uvicorn
951
+ uvicorn.run(app, host="127.0.0.1", port=3457)