@meridiona/meridian-darwin-arm64 1.34.0 → 1.34.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/VERSION +1 -1
- package/bin/meridian +0 -0
- package/package.json +1 -1
- package/scripts/install-screenpipe-daemon.sh +30 -14
- package/scripts/meridian-cli.sh +2 -2
- package/services/agents/__init__.py +1 -1
- package/services/agents/config.py +1 -14
- package/services/agents/llm_selector.py +4 -339
- package/services/agents/observability.py +1 -1
- package/services/agents/run_task_linker_mlx.py +0 -1
- package/services/agents/server.py +27 -125
- package/services/agents/tests/conftest.py +0 -7
- package/services/agents/tests/test_llm_selector.py +0 -190
- package/services/pyproject.toml +1 -1
- package/services/tests/conftest.py +1 -8
- package/services/tests/evals/build_dataset.py +3 -4
- package/services/tests/evals/test_classifier.py +0 -2
- package/services/tests/evals/test_model_sweep.py +0 -2
- package/ui.tar.gz +0 -0
- package/services/agents/_hermes_setup.py +0 -44
- package/services/agents/run_task_linker.py +0 -445
- package/services/agents/tests/test_run_task_linker.py +0 -360
- package/services/tests/evals/eval_agent.py +0 -143
- package/services/tests/test_task_linker.py +0 -157
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
1.34.0
|
|
1
|
+
1.34.1
|
package/bin/meridian
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@meridiona/meridian-darwin-arm64",
|
|
3
|
-
"version": "1.34.0",
|
|
3
|
+
"version": "1.34.1",
|
|
4
4
|
"description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
|
|
5
5
|
"homepage": "https://github.com/Meridiona/meridian",
|
|
6
6
|
"repository": {
|
|
@@ -40,21 +40,37 @@ fi
|
|
|
40
40
|
# attaches to a stable binary named `screenpipe` (and survives reinstalls of the
|
|
41
41
|
# same version, since its path is fixed). Falls back to whatever `command -v`
|
|
42
42
|
# found when screenpipe is a native binary (Homebrew) rather than the npm shim.
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
echo "
|
|
43
|
+
STAGED_BIN="${HOME}/.meridian/bin/screenpipe"
|
|
44
|
+
|
|
45
|
+
# Prefer the already-staged stable binary (written by install-from-bundle.sh).
|
|
46
|
+
# On a standalone re-run of this script (e.g. `meridian repair`) resolve the
|
|
47
|
+
# real Mach-O from the npm tree and stage it so the launchd plist is immune to
|
|
48
|
+
# nvm version changes — the npm shim path under ~/.nvm is version-specific and
|
|
49
|
+
# breaks silently when the user runs `nvm use` or upgrades Node.
|
|
50
|
+
if [[ -x "${STAGED_BIN}" ]] && file "${STAGED_BIN}" 2>/dev/null | grep -q "Mach-O"; then
|
|
51
|
+
SCREENPIPE_BIN="${STAGED_BIN}"
|
|
52
|
+
echo "→ using staged screenpipe binary: ${SCREENPIPE_BIN}"
|
|
53
|
+
else
|
|
54
|
+
SCREENPIPE_BIN="$(command -v screenpipe 2>/dev/null || true)"
|
|
55
|
+
if [[ -z "${SCREENPIPE_BIN}" ]]; then
|
|
56
|
+
echo "✗ screenpipe not found in PATH — install with: npm install -g screenpipe" >&2
|
|
57
|
+
exit 1
|
|
58
|
+
fi
|
|
59
|
+
_npm_root="$(npm root -g 2>/dev/null || true)"
|
|
60
|
+
if [[ -n "${_npm_root}" && -d "${_npm_root}/screenpipe" ]]; then
|
|
61
|
+
_real=""
|
|
62
|
+
while IFS= read -r _cand; do
|
|
63
|
+
if file "${_cand}" 2>/dev/null | grep -q "Mach-O"; then _real="${_cand}"; break; fi
|
|
64
|
+
done < <(find "${_npm_root}/screenpipe" -type f -name screenpipe -perm +0111 2>/dev/null)
|
|
65
|
+
if [[ -n "${_real}" ]]; then
|
|
66
|
+
SCREENPIPE_BIN="${_real}"
|
|
67
|
+
fi
|
|
57
68
|
fi
|
|
69
|
+
mkdir -p "${HOME}/.meridian/bin"
|
|
70
|
+
cp "${SCREENPIPE_BIN}" "${STAGED_BIN}"
|
|
71
|
+
chmod +x "${STAGED_BIN}"
|
|
72
|
+
SCREENPIPE_BIN="${STAGED_BIN}"
|
|
73
|
+
echo "→ staged screenpipe binary: ${SCREENPIPE_BIN}"
|
|
58
74
|
fi
|
|
59
75
|
|
|
60
76
|
mkdir -p "${HOME}/.meridian/logs"
|
package/scripts/meridian-cli.sh
CHANGED
|
@@ -315,8 +315,8 @@ cmd_smoke() {
|
|
|
315
315
|
mlx_port="$(_smoke_read_env MLX_SERVER_PORT)"
|
|
316
316
|
mlx_port="${mlx_port:-7823}"
|
|
317
317
|
local base="http://127.0.0.1:${mlx_port}"
|
|
318
|
-
local classify_timeout=
|
|
319
|
-
[[ $classify_only -eq 1 ]] && classify_timeout=
|
|
318
|
+
local classify_timeout=180
|
|
319
|
+
[[ $classify_only -eq 1 ]] && classify_timeout=180
|
|
320
320
|
local all_ok=1
|
|
321
321
|
|
|
322
322
|
if [[ -t 1 ]]; then
|
|
@@ -1 +1 @@
|
|
|
1
|
-
"""Meridian agents — AI-powered session task classification
|
|
1
|
+
"""Meridian agents — AI-powered session task classification."""
|
|
@@ -21,15 +21,6 @@ _ENV_FILE = PROJECT_ROOT / ".env"
|
|
|
21
21
|
if _ENV_FILE.exists():
|
|
22
22
|
load_dotenv(_ENV_FILE, override=False)
|
|
23
23
|
|
|
24
|
-
# ── Hermes (AIAgent library) ──────────────────────────────────────────────────
|
|
25
|
-
HERMES_HOME = Path(os.environ.get("HERMES_HOME", str(REPO_ROOT / ".hermes")))
|
|
26
|
-
|
|
27
|
-
# Directories searched for skill files (SKILL.md, SKILL-*.md).
|
|
28
|
-
SKILLS_SEARCH_PATHS: list[Path] = [
|
|
29
|
-
REPO_ROOT / "skills" / "activity",
|
|
30
|
-
HERMES_HOME / "skills",
|
|
31
|
-
]
|
|
32
|
-
|
|
33
24
|
# ── LLM ───────────────────────────────────────────────────────────────────────
|
|
34
25
|
MODEL = os.environ.get("OLLAMA_MODEL")
|
|
35
26
|
BASE_URL = os.environ.get("OLLAMA_HOST")
|
|
@@ -43,7 +34,7 @@ if not API_KEY:
|
|
|
43
34
|
)
|
|
44
35
|
|
|
45
36
|
# Local model selection — Apple Silicon only.
|
|
46
|
-
# LLM_PREFER_LOCAL=1 tries a local model before the cloud
|
|
37
|
+
# LLM_PREFER_LOCAL=1 tries a local model before the cloud path.
|
|
47
38
|
# LLM_BUDGET_PCT controls the fraction of available Metal headroom to allocate
|
|
48
39
|
# (0.5 = 50% of free GPU memory). Set to 0 or LLM_PREFER_LOCAL=0 to disable.
|
|
49
40
|
|
|
@@ -58,10 +49,6 @@ def _env_bool(name: str, default: bool) -> bool:
|
|
|
58
49
|
LLM_PREFER_LOCAL = _env_bool("LLM_PREFER_LOCAL", True)
|
|
59
50
|
LLM_BUDGET_PCT = float(os.environ.get("LLM_BUDGET_PCT", "0.5"))
|
|
60
51
|
|
|
61
|
-
# When true, _hermes_setup.ensure_hermes_importable() prepends services/.hermes/
|
|
62
|
-
# to sys.path so the local source checkout shadows the installed hermes-agent package.
|
|
63
|
-
HERMES_DEV_MODE = os.environ.get("HERMES_DEV_MODE", "0") == "1"
|
|
64
|
-
|
|
65
52
|
# ── DB / runtime paths ────────────────────────────────────────────────────────
|
|
66
53
|
MERIDIAN_HOME = Path(os.environ.get("MERIDIAN_HOME", str(Path.home() / ".meridian")))
|
|
67
54
|
MERIDIAN_DB = Path(os.environ.get("MERIDIAN_DB", str(MERIDIAN_HOME / "meridian.db")))
|
|
@@ -28,7 +28,6 @@ import logging
|
|
|
28
28
|
import os
|
|
29
29
|
import platform
|
|
30
30
|
import re
|
|
31
|
-
import signal
|
|
32
31
|
import socket
|
|
33
32
|
import subprocess
|
|
34
33
|
import sys
|
|
@@ -263,17 +262,6 @@ class ComputeSnapshot:
|
|
|
263
262
|
mem_bw_gbs: int
|
|
264
263
|
|
|
265
264
|
|
|
266
|
-
@dataclass
|
|
267
|
-
class LocalModelEndpoint:
|
|
268
|
-
model: str # model name to pass to AIAgent
|
|
269
|
-
base_url: str # OpenAI-compatible base URL
|
|
270
|
-
api_key: str # typically "local"
|
|
271
|
-
runtime: str # "ollama" | "lmstudio" | "llamacpp" | "mlxlm" | "mlx_managed"
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
_MANAGED_SERVER_PORT = 8765
|
|
275
|
-
_MANAGED_SERVER_PID_FILE = Path.home() / ".meridian" / "mlx_lm_server.pid"
|
|
276
|
-
|
|
277
265
|
# Sentinel returned by select_mlx_model_id() when Apple Intelligence is chosen.
|
|
278
266
|
APPLE_INTELLIGENCE_ID = "apple-intelligence"
|
|
279
267
|
|
|
@@ -438,7 +426,7 @@ def local_infer(system_prompt: str, user_message: str,
|
|
|
438
426
|
"""Run inference on the best available local model.
|
|
439
427
|
|
|
440
428
|
Returns the model's text response, or None if nothing is available
|
|
441
|
-
(caller falls back to the cloud
|
|
429
|
+
(caller falls back to the cloud path).
|
|
442
430
|
|
|
443
431
|
Priority:
|
|
444
432
|
1. Already-running server with a model in memory (zero load cost)
|
|
@@ -521,325 +509,6 @@ def _infer_mlx(model_id: str, system: str, user: str, max_tokens: int) -> Option
|
|
|
521
509
|
return None
|
|
522
510
|
|
|
523
511
|
|
|
524
|
-
def _shutdown_managed_server() -> None:
|
|
525
|
-
"""Kill the managed mlx_lm.server if it is running and remove the PID file."""
|
|
526
|
-
pid_file = _MANAGED_SERVER_PID_FILE
|
|
527
|
-
if not pid_file.exists():
|
|
528
|
-
return
|
|
529
|
-
try:
|
|
530
|
-
meta = json.loads(pid_file.read_text())
|
|
531
|
-
pid = meta["pid"]
|
|
532
|
-
try:
|
|
533
|
-
os.kill(pid, 0)
|
|
534
|
-
os.kill(pid, signal.SIGTERM)
|
|
535
|
-
log.info("llm_selector: unloaded managed mlx_lm.server pid=%d model=%s",
|
|
536
|
-
pid, meta.get("model", "?"))
|
|
537
|
-
except OSError:
|
|
538
|
-
pass
|
|
539
|
-
except Exception:
|
|
540
|
-
pass
|
|
541
|
-
pid_file.unlink(missing_ok=True)
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
def _wait_for_process_exit(pid: int, timeout: float = 10.0) -> None:
|
|
545
|
-
"""Wait for a process to exit; SIGKILL after timeout."""
|
|
546
|
-
deadline = time.monotonic() + timeout
|
|
547
|
-
while time.monotonic() < deadline:
|
|
548
|
-
try:
|
|
549
|
-
os.kill(pid, 0)
|
|
550
|
-
except OSError:
|
|
551
|
-
return # dead
|
|
552
|
-
time.sleep(0.3)
|
|
553
|
-
try:
|
|
554
|
-
os.kill(pid, signal.SIGKILL)
|
|
555
|
-
except OSError:
|
|
556
|
-
pass
|
|
557
|
-
time.sleep(0.5)
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
def _wait_for_port_free(port: int, timeout: float = 5.0) -> None:
|
|
561
|
-
"""Wait until a local TCP port stops accepting connections."""
|
|
562
|
-
deadline = time.monotonic() + timeout
|
|
563
|
-
while time.monotonic() < deadline:
|
|
564
|
-
if not _tcp_open("127.0.0.1", port, timeout=0.3):
|
|
565
|
-
return
|
|
566
|
-
time.sleep(0.3)
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
def _ensure_mlx_server(model_id: str, port: int = _MANAGED_SERVER_PORT) -> bool:
|
|
570
|
-
with _tracer.start_as_current_span("llm_selector.ensure_server") as span:
|
|
571
|
-
span.set_attribute("server.model", model_id)
|
|
572
|
-
span.set_attribute("server.port", port)
|
|
573
|
-
t0 = time.monotonic()
|
|
574
|
-
|
|
575
|
-
pid_file = _MANAGED_SERVER_PID_FILE
|
|
576
|
-
if pid_file.exists():
|
|
577
|
-
try:
|
|
578
|
-
meta = json.loads(pid_file.read_text())
|
|
579
|
-
pid, existing_model, existing_port = meta["pid"], meta["model"], meta["port"]
|
|
580
|
-
try:
|
|
581
|
-
os.kill(pid, 0)
|
|
582
|
-
alive = True
|
|
583
|
-
except OSError:
|
|
584
|
-
alive = False
|
|
585
|
-
|
|
586
|
-
if alive and existing_model == model_id and existing_port == port:
|
|
587
|
-
log.info(
|
|
588
|
-
"llm_selector: managed server already running model=%s pid=%d port=%d",
|
|
589
|
-
model_id, pid, port,
|
|
590
|
-
)
|
|
591
|
-
span.set_attribute("server.action", "reused")
|
|
592
|
-
span.set_attribute("server.pid", pid)
|
|
593
|
-
span.add_event("server_reused", {"pid": pid, "model": model_id})
|
|
594
|
-
return True
|
|
595
|
-
|
|
596
|
-
if alive:
|
|
597
|
-
log.info(
|
|
598
|
-
"llm_selector: model switch %s → %s — stopping pid=%d",
|
|
599
|
-
existing_model, model_id, pid,
|
|
600
|
-
)
|
|
601
|
-
span.set_attribute("server.previous_model", existing_model)
|
|
602
|
-
span.add_event("model_switch", {
|
|
603
|
-
"from_model": existing_model,
|
|
604
|
-
"to_model": model_id,
|
|
605
|
-
"pid": pid,
|
|
606
|
-
})
|
|
607
|
-
os.kill(pid, signal.SIGTERM)
|
|
608
|
-
_wait_for_process_exit(pid)
|
|
609
|
-
_wait_for_port_free(port)
|
|
610
|
-
stop_ms = int((time.monotonic() - t0) * 1000)
|
|
611
|
-
log.info(
|
|
612
|
-
"llm_selector: stopped old managed server pid=%d model=%s elapsed_ms=%d",
|
|
613
|
-
pid, existing_model, stop_ms,
|
|
614
|
-
)
|
|
615
|
-
span.add_event("old_server_stopped", {"elapsed_ms": stop_ms})
|
|
616
|
-
else:
|
|
617
|
-
log.debug("llm_selector: stale pid file (pid=%d dead) — starting fresh", pid)
|
|
618
|
-
span.add_event("stale_pid_file", {"pid": pid})
|
|
619
|
-
except Exception:
|
|
620
|
-
pass
|
|
621
|
-
|
|
622
|
-
proc = subprocess.Popen(
|
|
623
|
-
[sys.executable, "-m", "mlx_lm.server",
|
|
624
|
-
"--model", model_id, "--port", str(port), "--max-tokens", "4096"],
|
|
625
|
-
stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
|
|
626
|
-
start_new_session=True,
|
|
627
|
-
)
|
|
628
|
-
pid_file.parent.mkdir(parents=True, exist_ok=True)
|
|
629
|
-
pid_file.write_text(json.dumps({"pid": proc.pid, "model": model_id, "port": port}))
|
|
630
|
-
log.info(
|
|
631
|
-
"llm_selector: started mlx_lm.server model=%s pid=%d port=%d — waiting for ready",
|
|
632
|
-
model_id, proc.pid, port,
|
|
633
|
-
)
|
|
634
|
-
span.set_attribute("server.action", "started")
|
|
635
|
-
span.set_attribute("server.pid", proc.pid)
|
|
636
|
-
span.add_event("server_started", {"pid": proc.pid, "model": model_id})
|
|
637
|
-
|
|
638
|
-
url = f"http://127.0.0.1:{port}/v1/models"
|
|
639
|
-
deadline = time.monotonic() + 90.0
|
|
640
|
-
while time.monotonic() < deadline:
|
|
641
|
-
if proc.poll() is not None:
|
|
642
|
-
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
|
643
|
-
log.warning(
|
|
644
|
-
"llm_selector: mlx_lm.server exited early exit=%d model=%s elapsed_ms=%d"
|
|
645
|
-
" — is mlx_lm installed?",
|
|
646
|
-
proc.returncode, model_id, elapsed_ms,
|
|
647
|
-
)
|
|
648
|
-
span.set_attribute("server.action", "failed")
|
|
649
|
-
span.set_attribute("server.exit_code", proc.returncode)
|
|
650
|
-
span.add_event("server_exited_early", {"exit_code": proc.returncode})
|
|
651
|
-
pid_file.unlink(missing_ok=True)
|
|
652
|
-
return False
|
|
653
|
-
_, status = _get_json(url, timeout=1.0)
|
|
654
|
-
if status == 200:
|
|
655
|
-
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
|
656
|
-
log.info(
|
|
657
|
-
"llm_selector: mlx_lm.server ready model=%s port=%d startup_ms=%d",
|
|
658
|
-
model_id, port, elapsed_ms,
|
|
659
|
-
)
|
|
660
|
-
span.set_attribute("server.startup_ms", elapsed_ms)
|
|
661
|
-
span.add_event("server_ready", {"startup_ms": elapsed_ms})
|
|
662
|
-
return True
|
|
663
|
-
time.sleep(1)
|
|
664
|
-
|
|
665
|
-
elapsed_ms = int((time.monotonic() - t0) * 1000)
|
|
666
|
-
log.warning(
|
|
667
|
-
"llm_selector: mlx_lm.server startup timeout model=%s elapsed_ms=%d",
|
|
668
|
-
model_id, elapsed_ms,
|
|
669
|
-
)
|
|
670
|
-
span.set_attribute("server.action", "timeout")
|
|
671
|
-
span.add_event("server_timeout", {"elapsed_ms": elapsed_ms})
|
|
672
|
-
return False
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
def select_model_for_hermes(budget_pct: Optional[float] = None) -> Optional[LocalModelEndpoint]:
|
|
676
|
-
"""Return the best available local endpoint for AIAgent, or None to use cloud."""
|
|
677
|
-
if budget_pct is None:
|
|
678
|
-
from agents.config import LLM_BUDGET_PCT
|
|
679
|
-
budget_pct = LLM_BUDGET_PCT
|
|
680
|
-
with _tracer.start_as_current_span("llm_selector.select_model") as span:
|
|
681
|
-
try:
|
|
682
|
-
result: Optional[LocalModelEndpoint] = None
|
|
683
|
-
|
|
684
|
-
if platform.system() != "Darwin":
|
|
685
|
-
span.set_attribute("llm.budget_pct", budget_pct)
|
|
686
|
-
span.set_attribute("llm.selected_model", "cloud_fallback")
|
|
687
|
-
span.set_attribute("llm.selected_runtime", "cloud")
|
|
688
|
-
span.set_attribute("llm.is_local", False)
|
|
689
|
-
return None
|
|
690
|
-
brand = _sysctl("machdep.cpu.brand_string") or ""
|
|
691
|
-
if not brand.startswith("Apple M"):
|
|
692
|
-
span.set_attribute("llm.budget_pct", budget_pct)
|
|
693
|
-
span.set_attribute("llm.selected_model", "cloud_fallback")
|
|
694
|
-
span.set_attribute("llm.selected_runtime", "cloud")
|
|
695
|
-
span.set_attribute("llm.is_local", False)
|
|
696
|
-
return None
|
|
697
|
-
|
|
698
|
-
servers = discover_running_servers()
|
|
699
|
-
if not servers:
|
|
700
|
-
log.debug("llm_selector: no external servers found — will compute budget")
|
|
701
|
-
|
|
702
|
-
for server in servers:
|
|
703
|
-
if server.runtime == "apple_fm":
|
|
704
|
-
continue
|
|
705
|
-
_shutdown_managed_server()
|
|
706
|
-
log.info("llm_selector: using external server runtime=%s model=%s",
|
|
707
|
-
server.runtime, server.best_model)
|
|
708
|
-
result = LocalModelEndpoint(
|
|
709
|
-
model=server.best_model,
|
|
710
|
-
base_url=server.base_url,
|
|
711
|
-
api_key="local",
|
|
712
|
-
runtime=server.runtime,
|
|
713
|
-
)
|
|
714
|
-
break
|
|
715
|
-
|
|
716
|
-
_reason = "cloud_fallback"
|
|
717
|
-
_headroom_gb = 0.0
|
|
718
|
-
_adj_headroom_gb = 0.0
|
|
719
|
-
_budget_gb = 0.0
|
|
720
|
-
_thermal = 0
|
|
721
|
-
_screen_locked_val = False
|
|
722
|
-
_effective_pct = budget_pct
|
|
723
|
-
|
|
724
|
-
if result is None:
|
|
725
|
-
try:
|
|
726
|
-
snap = probe_compute()
|
|
727
|
-
except Exception as exc:
|
|
728
|
-
log.warning("llm_selector: compute probe failed: %s", exc)
|
|
729
|
-
_reason = "compute_probe_failed"
|
|
730
|
-
span.set_attribute("llm.budget_pct", budget_pct)
|
|
731
|
-
span.set_attribute("llm.selected_model", "cloud_fallback")
|
|
732
|
-
span.set_attribute("llm.selected_runtime", "cloud")
|
|
733
|
-
span.set_attribute("llm.is_local", False)
|
|
734
|
-
span.set_attribute("llm.reason", _reason)
|
|
735
|
-
return None
|
|
736
|
-
|
|
737
|
-
_headroom_gb = snap.metal_headroom_gb
|
|
738
|
-
_thermal = snap.thermal_level
|
|
739
|
-
_screen_locked_val = snap.screen_locked
|
|
740
|
-
|
|
741
|
-
# If a managed server is already running, its model weight is
|
|
742
|
-
# included in Metal's "used" accounting, which shrinks headroom.
|
|
743
|
-
# Add that weight back so the selection sees the true system-wide
|
|
744
|
-
# budget rather than headroom-minus-current-model. Without this
|
|
745
|
-
# the selected model changes on every tick as headroom shifts,
|
|
746
|
-
# causing an oscillation loop (Qwen3.5 → phi-4 → gemma → …).
|
|
747
|
-
_adj_headroom_gb = _headroom_gb
|
|
748
|
-
if _MANAGED_SERVER_PID_FILE.exists():
|
|
749
|
-
try:
|
|
750
|
-
meta = json.loads(_MANAGED_SERVER_PID_FILE.read_text())
|
|
751
|
-
os.kill(meta["pid"], 0) # raises OSError if dead
|
|
752
|
-
current_ram = next(
|
|
753
|
-
(min_ram for _, _, min_ram, _, hf in _MODELS
|
|
754
|
-
if hf == meta["model"]),
|
|
755
|
-
0.0,
|
|
756
|
-
)
|
|
757
|
-
_adj_headroom_gb = _headroom_gb + current_ram
|
|
758
|
-
log.info(
|
|
759
|
-
"llm_selector: headroom adjusted %.1f→%.1f GB "
|
|
760
|
-
"(managed model=%s uses %.1f GB)",
|
|
761
|
-
_headroom_gb, _adj_headroom_gb,
|
|
762
|
-
meta["model"], current_ram,
|
|
763
|
-
)
|
|
764
|
-
except (OSError, Exception):
|
|
765
|
-
pass
|
|
766
|
-
|
|
767
|
-
_effective_pct = min(0.8, budget_pct * 1.5) if snap.screen_locked else budget_pct
|
|
768
|
-
_budget_gb = _adj_headroom_gb * _effective_pct
|
|
769
|
-
|
|
770
|
-
entry = _select_mlx_entry(_adj_headroom_gb, _effective_pct,
|
|
771
|
-
snap.thermal_level, apple_intelligence=False)
|
|
772
|
-
if entry is None:
|
|
773
|
-
_reason = "no_model_fits"
|
|
774
|
-
log.info(
|
|
775
|
-
"llm_selector: no local model fits "
|
|
776
|
-
"headroom=%.1f GB adj=%.1f GB budget=%.1f GB pct=%.2f → cloud fallback",
|
|
777
|
-
_headroom_gb, _adj_headroom_gb, _budget_gb, _effective_pct,
|
|
778
|
-
)
|
|
779
|
-
else:
|
|
780
|
-
model_id, _, min_ram, quality, hf_id = entry
|
|
781
|
-
log.info(
|
|
782
|
-
"llm_selector: selected model=%s hf=%s min_ram=%.1f GB quality=%d "
|
|
783
|
-
"headroom=%.1f GB adj=%.1f GB budget=%.1f GB pct=%.2f",
|
|
784
|
-
model_id, hf_id, min_ram, quality,
|
|
785
|
-
_headroom_gb, _adj_headroom_gb, _budget_gb, _effective_pct,
|
|
786
|
-
)
|
|
787
|
-
if _ensure_mlx_server(hf_id, _MANAGED_SERVER_PORT):
|
|
788
|
-
_reason = "mlx_managed"
|
|
789
|
-
result = LocalModelEndpoint(
|
|
790
|
-
model=hf_id,
|
|
791
|
-
base_url=f"http://127.0.0.1:{_MANAGED_SERVER_PORT}/v1",
|
|
792
|
-
api_key="local",
|
|
793
|
-
runtime="mlx_managed",
|
|
794
|
-
)
|
|
795
|
-
else:
|
|
796
|
-
_reason = "mlx_server_failed"
|
|
797
|
-
log.warning(
|
|
798
|
-
"llm_selector: mlx_lm.server failed to start for model=%s — cloud fallback",
|
|
799
|
-
hf_id,
|
|
800
|
-
)
|
|
801
|
-
else:
|
|
802
|
-
_reason = result.runtime
|
|
803
|
-
|
|
804
|
-
_selected_model = result.model if result else "cloud_fallback"
|
|
805
|
-
_selected_runtime = result.runtime if result else "cloud"
|
|
806
|
-
_is_local = result is not None
|
|
807
|
-
|
|
808
|
-
span.set_attribute("llm.budget_pct", budget_pct)
|
|
809
|
-
span.set_attribute("llm.effective_pct", round(_effective_pct, 3))
|
|
810
|
-
span.set_attribute("llm.headroom_gb", round(_headroom_gb, 2))
|
|
811
|
-
span.set_attribute("llm.adj_headroom_gb", round(_adj_headroom_gb, 2))
|
|
812
|
-
span.set_attribute("llm.budget_gb", round(_budget_gb, 2))
|
|
813
|
-
span.set_attribute("llm.thermal_level", _thermal)
|
|
814
|
-
span.set_attribute("llm.screen_locked", _screen_locked_val)
|
|
815
|
-
span.set_attribute("llm.reason", _reason)
|
|
816
|
-
span.set_attribute("llm.selected_model", _selected_model)
|
|
817
|
-
span.set_attribute("llm.selected_runtime", _selected_runtime)
|
|
818
|
-
span.set_attribute("llm.is_local", _is_local)
|
|
819
|
-
|
|
820
|
-
log.info(
|
|
821
|
-
"llm_selector: decision reason=%s model=%s runtime=%s "
|
|
822
|
-
"budget_pct=%.2f headroom_gb=%.1f budget_gb=%.1f thermal=%d",
|
|
823
|
-
_reason, _selected_model, _selected_runtime,
|
|
824
|
-
budget_pct, _adj_headroom_gb, _budget_gb, _thermal,
|
|
825
|
-
extra={
|
|
826
|
-
"llm_selector_reason": _reason,
|
|
827
|
-
"llm_selector_model": _selected_model,
|
|
828
|
-
"llm_selector_runtime": _selected_runtime,
|
|
829
|
-
"llm_selector_budget_pct": budget_pct,
|
|
830
|
-
"llm_selector_headroom_gb": round(_adj_headroom_gb, 2),
|
|
831
|
-
"llm_selector_budget_gb": round(_budget_gb, 2),
|
|
832
|
-
"llm_selector_thermal": _thermal,
|
|
833
|
-
"llm_selector_screen_locked": _screen_locked_val,
|
|
834
|
-
"llm_selector_is_local": _is_local,
|
|
835
|
-
},
|
|
836
|
-
)
|
|
837
|
-
return result
|
|
838
|
-
except Exception as exc:
|
|
839
|
-
span.record_exception(exc)
|
|
840
|
-
raise
|
|
841
|
-
|
|
842
|
-
|
|
843
512
|
def _hf_model_cached(hf_id: "str | None") -> bool:
|
|
844
513
|
"""True when a HuggingFace repo's weights are already in the local cache.
|
|
845
514
|
|
|
@@ -874,7 +543,7 @@ def select_mlx_model_id(
|
|
|
874
543
|
) -> "str | None":
|
|
875
544
|
"""Pick the best **in-process** MLX model id for this machine.
|
|
876
545
|
|
|
877
|
-
|
|
546
|
+
Returns a HuggingFace
|
|
878
547
|
repo id the caller loads directly via mlx_lm + outlines (FSM-constrained
|
|
879
548
|
decoding). It deliberately does NOT discover external servers
|
|
880
549
|
(Ollama / LM Studio / Apple Intelligence give no constrained decoding) and
|
|
@@ -1054,10 +723,6 @@ def discover_mlx_eval_server(port: int = 7823) -> "str | None":
|
|
|
1054
723
|
|
|
1055
724
|
|
|
1056
725
|
__all__ = ["local_infer", "discover_running_servers", "probe_compute",
|
|
1057
|
-
"RunningServer", "ComputeSnapshot",
|
|
1058
|
-
"
|
|
1059
|
-
"shutdown_managed_server",
|
|
726
|
+
"RunningServer", "ComputeSnapshot",
|
|
727
|
+
"select_mlx_model_id",
|
|
1060
728
|
"resolve_model", "discover_mlx_eval_server"]
|
|
1061
|
-
|
|
1062
|
-
# Public alias (no underscore) for external callers
|
|
1063
|
-
shutdown_managed_server = _shutdown_managed_server
|
|
@@ -198,7 +198,7 @@ def _configure_logging(agent_name: str) -> None:
|
|
|
198
198
|
|
|
199
199
|
root = logging.getLogger()
|
|
200
200
|
# Clear any pre-existing handlers — long-running daemons that import
|
|
201
|
-
# third-party libs (
|
|
201
|
+
# third-party libs (mcp, etc.) often leave a default basicConfig handler
|
|
202
202
|
# behind that would duplicate every line.
|
|
203
203
|
root.handlers.clear()
|
|
204
204
|
root.addHandler(file_h)
|
|
@@ -37,7 +37,6 @@ from opentelemetry.trace import StatusCode
|
|
|
37
37
|
from pydantic import BaseModel, Field
|
|
38
38
|
|
|
39
39
|
_SERVICES_DIR = Path(__file__).parent.parent
|
|
40
|
-
os.environ.setdefault("HERMES_HOME", str(_SERVICES_DIR / ".hermes"))
|
|
41
40
|
|
|
42
41
|
from agents import observability
|
|
43
42
|
from agents._prompts import build_user_message
|