npm - @meridiona/meridian-darwin-arm64 - Versions diffs - 1.53.1 → 1.54.1 - Mend

@meridiona/meridian-darwin-arm64 1.53.1 → 1.54.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/.env.example +9 -0
package/VERSION +1 -1
package/bin/meridian +0 -0
package/bin/meridian-tray +0 -0
package/package.json +1 -1
package/scripts/install-from-bundle.sh +15 -0
package/services/agents/_prompts.py +14 -2
package/services/agents/run_task_linker_mlx.py +209 -40
package/services/agents/server.py +77 -32
package/services/agents/tests/test_run_task_linker_mlx.py +53 -0
package/services/pyproject.toml +1 -1
package/ui.tar.gz +0 -0

package/.env.example CHANGED Viewed

@@ -19,6 +19,15 @@
 # MLX_SERVER_HOST=127.0.0.1
 # MLX_SERVER_PORT=7823
+# Idle eviction for the MLX model. The model holds ~7 GB of Metal memory while
+# resident, but classification is bursty — so the server unloads it after this
+# many seconds idle and reloads on the next request (~3 s cold start). Default
+# 120s (aggressive: lightest idle footprint). Raise it to keep the model warm
+# longer; set 0 to disable eviction (pin the model in memory). Avoid values
+# below ~30s: if the TTL drops under the gap between sessions in a classification
+# burst, the model evicts and cold-reloads (~3 s) repeatedly mid-burst.
+# MLX_IDLE_EVICT_S=120
 # Dashboard (Next.js UI) port. Defaults to 3939. Change this and re-run
 # `meridian setup` to move the dashboard.
 # MERIDIAN_UI_PORT=3939

package/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 1.53.1
1	+ 1.54.1

package/bin/meridian CHANGED Viewed

Binary file

package/bin/meridian-tray CHANGED Viewed

Binary file

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@meridiona/meridian-darwin-arm64",
-  "version": "1.53.1",
+  "version": "1.54.1",
   "description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
   "homepage": "https://github.com/Meridiona/meridian",
   "repository": {

package/scripts/install-from-bundle.sh CHANGED Viewed

@@ -540,6 +540,21 @@ if [[ "${SKIP_PERMISSIONS}" -eq 0 ]]; then
     echo "      ${HOME}/.meridian/bin/meridian-a11y-helper"
     echo "  Without the a11y helper, Electron apps (Claude, Codex, Slack, …) stay invisible to capture."
     read -r -p "  Press Enter once all are granted… " _ || true
+    # Notifications: the tray surfaces desktop toasts (plan nudges, worklog
+    # drafts, faults). macOS hides ALL notifications while the screen is being
+    # recorded/shared unless this is on — and screenpipe records continuously, so
+    # without it every Meridian toast is silently suppressed. No API/prompt exists
+    # for this toggle, so we can only walk the user to it.
+    echo "→ Meridian's tray shows desktop notifications. Because screenpipe records"
+    echo "  the screen, macOS hides notifications during screen sharing unless allowed."
+    read -r -p "  Press Enter to open Notifications settings… " _ || true
+    open "x-apple.systempreferences:com.apple.Notifications-Settings.extension" 2>/dev/null || true
+    echo "  → Scroll to the bottom and turn ON"
+    echo "    'Allow notifications when mirroring or sharing the display'."
+    echo "  → When 'Meridian Tray' appears, ensure its notifications are allowed"
+    echo "    (style Banners or Alerts, not None)."
+    read -r -p "  Press Enter when done… " _ || true
 fi
 # Enable a11y mode in installed VS Code-family editors (idempotent). Without

package/services/agents/_prompts.py CHANGED Viewed

@@ -106,8 +106,11 @@ def _format_candidates(tasks: list[dict]) -> str:
             desc = desc[:240] + "…"
         meta_parts = [p for p in [issue_type, f"Epic: {epic_title}" if epic_title else "", sprint_name, f"tags: {tags}" if tags else ""] if p]
         meta = "  [" + " · ".join(meta_parts) + "]" if meta_parts else ""
+        # The dev declared this ticket as today's focus on the plan page. It's a
+        # tie-breaking prior, not a forced answer — it should only be chosen if the evidence fits.
+        focus = " ★ TODAY'S FOCUS" if task.get("is_today_focus") else ""
         rows.append(
-            f"{i}. {task['task_key']}{meta}\n"
+            f"{i}. {task['task_key']}{focus}{meta}\n"
             f"   title: {title}\n"
             f"   description: {desc or '(empty)'}"
         )
@@ -152,12 +155,21 @@ def build_user_message(
         f"{_format_recent_sessions(sessions)}\n"
         "\n"
     ) if has_any_task_key else ""
+    # When the dev declared a focus for the day, name it in the header so the model
+    # treats ★ rows as a prior — preferred when the evidence plausibly fits, but
+    # never forced. Recall is preserved: every candidate is still listed.
+    has_focus = any(c.get("is_today_focus") for c in candidates)
+    candidate_header = (
+        "CANDIDATE TICKETS (★ = the dev declared this as a task they're working on "
+        "today; prefer a ★ ticket when the session plausibly matches it, but only "
+        "if the evidence fits — never force a match):\n"
+    ) if has_focus else "CANDIDATE TICKETS:\n"
     return (
         f"{recent_block}"
         "SESSION:\n"
         f"{_format_session(session)}\n"
         "\n"
-        "CANDIDATE TICKETS:\n"
+        f"{candidate_header}"
         f"{_format_candidates(candidates)}"
     )

package/services/agents/run_task_linker_mlx.py CHANGED Viewed

@@ -24,14 +24,18 @@ Method tag in results: "mlx_direct".
 """
 from __future__ import annotations
+import datetime as _dt
+import gc
 import json
 import logging
 import os
 import sqlite3 as _sqlite3
 import sys
+import threading
 import time
+from contextlib import contextmanager
 from pathlib import Path
-from typing import Any, Literal, Optional
+from typing import Any, Literal, Optional, Iterator
 from opentelemetry.trace import StatusCode
 from pydantic import BaseModel, Field
@@ -233,42 +237,144 @@ _SYSTEM_PROMPT = (
 # ---------------------------------------------------------------------------
-# Model loading — cached for the process lifetime.
-# outlines.from_mlxlm wraps the already-loaded mlx model; subsequent calls
-# skip the expensive disk load.
+# Model loading — loaded lazily on first use, evicted when idle.
+#
+# The MLX model holds ~7 GB of Metal unified memory while resident (measured;
+# note `ps`/Activity Monitor RSS does NOT show it). Classification is bursty,
+# so we keep the model only while it's being used: load on first inference,
+# and evict after MLX_IDLE_EVICT_S of inactivity (server.py runs the evictor).
+# `del + gc.collect() + mx.clear_cache()` reclaims the full 7 GB; cold reload
+# is ~3 s. `_model_lock` + `_in_flight` guarantee the evictor never frees the
+# model out from under an in-flight inference.
 # ---------------------------------------------------------------------------
 _model_cache: dict[str, Any] = {}
+_model_lock = threading.Lock()       # guards _model_cache mutation, _in_flight, _last_used, eviction
+_in_flight = 0                       # inferences currently using the model
+_last_used = time.monotonic()        # monotonic ts of the last finished inference
+# Aggressive default (2 min): the model is present only during active bursts.
+# Tune via env without a code change; 0 disables idle eviction entirely.
+_IDLE_EVICT_S = float(os.environ.get("MLX_IDLE_EVICT_S", "120"))
 def _get_model() -> Any:
-    """Return an outlines-wrapped model, loading from disk on the first call."""
+    """Return an outlines-wrapped model, loading from disk on first use or after an idle eviction.
+    Cache-miss load is done under _model_lock (double-checked) so concurrent
+    callers can't double-load and the idle evictor can't race the load.
+    """
     model_id = _resolve_model_id()
-    if model_id in _model_cache:
-        return _model_cache[model_id]
+    cached = _model_cache.get(model_id)
+    if cached is not None:
+        return cached
+    with _model_lock:
+        cached = _model_cache.get(model_id)   # re-check under lock
+        if cached is not None:
+            return cached
+        try:
+            import mlx_lm
+            import outlines
+        except ImportError as exc:
+            raise ImportError(
+                f"Required package not installed: {exc}. "
+                "Install with: pip install 'mlx-lm>=0.22' 'outlines[mlxlm]>=1.3'"
+            ) from exc
+        log.info(
+            "run_task_linker_mlx: loading %s (first call this process)", model_id
+        )
+        t0 = time.time()
+        mlx_model, tokenizer = mlx_lm.load(
+            model_id,
+            tokenizer_config={"trust_remote_code": True},
+        )
+        outlines_model = outlines.from_mlxlm(mlx_model, tokenizer)
+        log.info("run_task_linker_mlx: model loaded in %.1fs", time.time() - t0)
+        _model_cache[model_id] = outlines_model
+        return outlines_model
+@contextmanager
+def model_session() -> Iterator[Any]:
+    """Yield the loaded model, marking it in-flight so the idle evictor never
+    frees it mid-inference. Wrap every direct ``model(...)`` call in this.
+    Lock is held only briefly (to bump/clear the in-flight counter), never for
+    the duration of inference. NOTE: production serialises all MLX calls upstream
+    via the Rust llm_gate (1-permit semaphore), so inferences don't actually
+    overlap — this lock scope just avoids adding a second, redundant serialisation
+    point; it is NOT a claim that concurrent generation on the shared model is safe.
+    """
+    global _in_flight, _last_used
+    with _model_lock:
+        _in_flight += 1
     try:
-        import mlx_lm
-        import outlines
-    except ImportError as exc:
-        raise ImportError(
-            f"Required package not installed: {exc}. "
-            "Install with: pip install 'mlx-lm>=0.22' 'outlines[mlxlm]>=1.3'"
-        ) from exc
-    log.info(
-        "run_task_linker_mlx: loading %s (first call this process)", model_id
-    )
-    t0 = time.time()
-    mlx_model, tokenizer = mlx_lm.load(
-        model_id,
-        tokenizer_config={"trust_remote_code": True},
-    )
-    outlines_model = outlines.from_mlxlm(mlx_model, tokenizer)
-    log.info("run_task_linker_mlx: model loaded in %.1fs", time.time() - t0)
+        yield _get_model()
+    finally:
+        with _model_lock:
+            _in_flight -= 1
+            _last_used = time.monotonic()
+def maybe_evict_idle(idle_s: float | None = None) -> float | None:
+    """Evict the model if it's resident, nothing is in flight, and it's been
+    idle longer than ``idle_s`` (default MLX_IDLE_EVICT_S). Returns the GB freed,
+    or None if no eviction happened. Safe to call from a threadpool worker.
+    Uses a non-blocking lock acquire: if an inference/load is mutating state we
+    simply skip this tick and try again on the next one.
+    """
+    ttl = _IDLE_EVICT_S if idle_s is None else idle_s
+    if ttl <= 0:
+        return None
+    if not _model_lock.acquire(blocking=False):
+        return None
+    try:
+        if _in_flight > 0 or not _model_cache:
+            return None
+        if (time.monotonic() - _last_used) < ttl:
+            return None
+        try:
+            import mlx.core as mx
+            before = mx.get_active_memory()
+        except Exception:               # noqa: BLE001 — mx should always import here
+            mx, before = None, 0
+        _model_cache.clear()
+        gc.collect()
+        freed = 0.0
+        if mx is not None:
+            mx.clear_cache()
+            freed = max(0.0, (before - mx.get_active_memory()) / 1e9)
+        log.info(
+            "run_task_linker_mlx: evicted idle model (idle ≥ %.0fs), freed ~%.1f GB",
+            ttl, freed,
+        )
+        return freed
+    finally:
+        _model_lock.release()
+def model_resident() -> bool:
+    """True if the MLX model is currently loaded in memory."""
+    return bool(_model_cache)
-    _model_cache[model_id] = outlines_model
-    return outlines_model
+def model_active_memory_gb() -> float | None:
+    """Live Metal active-memory footprint in GB, or None if MLX is unavailable.
+    This is process-wide Metal active memory (≈ the model when resident — the
+    model dominates, though a transient load allocation can briefly inflate it).
+    It is the only honest measure: `ps`/Activity Monitor can't see Metal unified
+    memory (they undercount by ~6.5 GB).
+    """
+    try:
+        import mlx.core as mx
+        return round(mx.get_active_memory() / 1e9, 2)
+    except Exception:  # noqa: BLE001 — mx absent on non-MLX machines
+        return None
 # Apple Foundation Models has a 4096-token combined context window (input + output).
@@ -483,7 +589,53 @@ def _fetch_recent_sessions(
     return result
-def _fetch_pm_tasks(con: _sqlite3.Connection) -> list[dict[str, Any]]:
+def _local_day(started_at: str) -> str:
+    """The local calendar day (YYYY-MM-DD) of a session's UTC `started_at`.
+    `daily_plan.plan_date` is the dev's *local* day (the dashboard stamps it from
+    the browser's local date), but `app_sessions.started_at` is stored UTC. We
+    convert UTC → local here so a session is matched to the plan the dev actually
+    declared for that day. Returns "" on an unparseable timestamp (→ no boost).
+    """
+    if not started_at:
+        return ""
+    try:
+        # `astimezone()` with no arg converts an aware datetime to the host's
+        # local zone — the same zone the dashboard used to compute plan_date.
+        return _dt.datetime.fromisoformat(started_at).astimezone().date().isoformat()
+    except ValueError:
+        return ""
+def _fetch_plan_focus(con: _sqlite3.Connection, plan_date: str) -> list[str]:
+    """Ordered task_keys the dev CONFIRMED as their focus for `plan_date`.
+    Empty (→ no boost, classification proceeds exactly as before) when the day is
+    unconfirmed, explicitly skipped, has no plan rows, or the plan tables don't
+    exist yet (pre-migration-041 DB). This is a ranking signal only — never a
+    filter — so an empty result can only ever cost the boost, never recall.
+    """
+    if not plan_date:
+        return []
+    try:
+        meta = con.execute(
+            "SELECT confirmed_at, skipped FROM daily_plan_meta WHERE plan_date = ?",
+            (plan_date,),
+        ).fetchone()
+        if meta is None or meta["skipped"] or not meta["confirmed_at"]:
+            return []
+        rows = con.execute(
+            "SELECT task_key FROM daily_plan WHERE plan_date = ? ORDER BY position",
+            (plan_date,),
+        ).fetchall()
+        return [r["task_key"] for r in rows]
+    except _sqlite3.OperationalError:
+        return []
+def _fetch_pm_tasks(
+    con: _sqlite3.Connection, focus_keys: list[str] | None = None
+) -> list[dict[str, Any]]:
     # Candidate set for classification. Tickets the user explicitly EXCLUDED during
     # onboarding board-cleanup (pm_task_curation.decision = 'excluded') are dropped
     # so a cleaned-up dead ticket can never be a classification target. Everything
@@ -512,7 +664,20 @@ def _fetch_pm_tasks(con: _sqlite3.Connection) -> list[dict[str, Any]]:
         # Pre-migration-038 DB (no pm_task_curation): degrade to the unfiltered
         # candidate set rather than crashing the whole /classify_sessions call.
         rows = con.execute(base_cols).fetchall()
-    return [dict(r) for r in rows]
+    tasks = [dict(r) for r in rows]
+    # Today's-focus boost: tag the tickets the dev declared for the day and float
+    # them to the top of the candidate list, in their declared order. This is a
+    # BOOST, never a filter — every other candidate still follows, so recall is
+    # untouched. A focus key that isn't in `tasks` (e.g. excluded by curation)
+    # simply has no effect; we never resurrect a filtered-out ticket.
+    focus = focus_keys or []
+    if focus:
+        order = {key: i for i, key in enumerate(focus)}
+        for t in tasks:
+            t["is_today_focus"] = t["task_key"] in order
+        tasks.sort(key=lambda t: (0, order[t["task_key"]]) if t.get("is_today_focus") else (1, 0))
+    return tasks
 # ---------------------------------------------------------------------------
@@ -555,10 +720,13 @@ def _classify_one(
                 session_id, f"session {session_id} not found in DB", 0.0, "mlx_error"
             )
-        pm_tasks = _fetch_pm_tasks(con)
-        recent   = _fetch_recent_sessions(con, session_id)
+        plan_date  = _local_day(session_raw.get("started_at") or "")
+        focus_keys = _fetch_plan_focus(con, plan_date)
+        pm_tasks   = _fetch_pm_tasks(con, focus_keys)
+        recent     = _fetch_recent_sessions(con, session_id)
         db_span.set_attribute("pm_tasks_count", len(pm_tasks))
+        db_span.set_attribute("today_focus_count", len(focus_keys))
         db_span.set_attribute("recent_sessions_count", len(recent))
         session_text = session_raw.get("session_text") or ""
@@ -642,14 +810,14 @@ def _classify_one(
                 from mlx_lm.sample_utils import make_sampler
                 from outlines.inputs import Chat
-                model = _get_model()
-                raw = model(
-                    Chat(messages),
-                    output_type=SessionClassification,
-                    max_tokens=_MAX_TOKENS,
-                    sampler=make_sampler(temp=_TEMPERATURE),
-                    verbose=False,
-                )
+                with model_session() as model:
+                    raw = model(
+                        Chat(messages),
+                        output_type=SessionClassification,
+                        max_tokens=_MAX_TOKENS,
+                        sampler=make_sampler(temp=_TEMPERATURE),
+                        verbose=False,
+                    )
         except Exception as exc:
             elapsed = time.time() - t0
             outcome = "apple_fm_error" if _use_apple_fm else "mlx_error"
@@ -785,7 +953,8 @@ def _classify_one_logged(
     """Classify one session and append a full record to the run log."""
     # Gather inputs before classification so we can log them even on error.
     session_raw = _fetch_session(con, session_id)
-    pm_tasks = _fetch_pm_tasks(con) if session_raw else []
+    focus_keys = _fetch_plan_focus(con, _local_day(session_raw.get("started_at") or "")) if session_raw else []
+    pm_tasks = _fetch_pm_tasks(con, focus_keys) if session_raw else []
     recent = _fetch_recent_sessions(con, session_id) if session_raw else []
     if session_raw:

package/services/agents/server.py CHANGED Viewed

@@ -41,20 +41,58 @@ _DB_PATH = Path(os.environ.get("MERIDIAN_DB", Path.home() / ".meridian/meridian.
 _app_state: dict[str, Any] = {}
+async def _idle_evictor(mlx_module: Any) -> None:
+    """Background loop: evict the MLX model after it has been idle long enough.
+    Runs the (briefly blocking) eviction in a threadpool so it never stalls the
+    event loop, and never raises out — the evictor must outlive transient errors.
+    """
+    import asyncio
+    from fastapi.concurrency import run_in_threadpool
+    ttl = mlx_module._IDLE_EVICT_S
+    if ttl <= 0:
+        return
+    interval = max(15.0, ttl / 4.0)   # check ~4× per idle window
+    while True:
+        await asyncio.sleep(interval)
+        try:
+            await run_in_threadpool(mlx_module.maybe_evict_idle)
+        except Exception as exc:       # noqa: BLE001 — evictor must never die
+            log.warning("server: idle-evictor error: %s", exc)
 @asynccontextmanager
 async def _lifespan(app: FastAPI) -> AsyncIterator[None]:
+    import asyncio
     import datetime
     import agents.run_task_linker_mlx as _mlx
     _app_state["mlx_module"] = _mlx
     _app_state["loaded_at"] = datetime.datetime.now(datetime.timezone.utc).isoformat()
     from agents.llm_selector import APPLE_INTELLIGENCE_ID
+    evictor: "asyncio.Task | None" = None
     if _mlx._resolve_model_id() == APPLE_INTELLIGENCE_ID:
-        log.info("server: 8 GB machine — Apple Intelligence backend, no MLX model to pre-load")
+        log.info("server: Apple Intelligence backend — no MLX model to load")
+    elif _mlx._IDLE_EVICT_S > 0:
+        # Lazy: the ~7 GB model loads on the first inference and is evicted after
+        # MLX_IDLE_EVICT_S of inactivity, so the server idles light (~0.4 GB)
+        # instead of pinning ~7 GB of Metal memory for the whole process life.
+        log.info(
+            "server: MLX model loads on first request; idle-evict after %.0fs",
+            _mlx._IDLE_EVICT_S,
+        )
+        evictor = asyncio.create_task(_idle_evictor(_mlx))
     else:
-        log.info("server: loading MLX model at startup…")
-        _mlx._get_model()
-        log.info("server: MLX model ready")
-    yield
+        # Eviction disabled — don't spawn a no-op evictor task just to cancel it.
+        log.info("server: MLX model loads on first request; idle-eviction disabled (MLX_IDLE_EVICT_S=0)")
+    try:
+        yield
+    finally:
+        if evictor is not None:
+            import contextlib
+            evictor.cancel()
+            with contextlib.suppress(asyncio.CancelledError):
+                await evictor
 app = FastAPI(title="Meridian Agent", version="1.0.0", lifespan=_lifespan)
@@ -76,12 +114,19 @@ async def health() -> dict:
 @app.get("/info")
 async def info() -> dict:
-    """Return the identity of the loaded model."""
+    """Return the identity of the model and its live memory state.
+    `active_memory_gb` reads `mx.get_active_memory()` — the ONLY honest measure
+    of the model's footprint, since Metal unified memory is invisible to `ps`
+    and Activity Monitor (they undercount the model by ~6.5 GB).
+    """
     m = _app_state.get("mlx_module")
     return {
-        "backend":   "mlx",
-        "model_id":  m._resolve_model_id() if m else None,
-        "loaded_at": _app_state.get("loaded_at"),
+        "backend":          "mlx",
+        "model_id":         m._resolve_model_id() if m else None,
+        "loaded_at":        _app_state.get("loaded_at"),
+        "model_resident":   m.model_resident() if m else False,
+        "active_memory_gb": m.model_active_memory_gb() if m else None,
     }
@@ -143,14 +188,14 @@ async def classify(req: ClassifyRequest) -> ClassifyResponse:
             # _classify_apple_fm uses asyncio.new_event_loop() internally;
             # must run in a thread (no existing loop) not in the async handler.
             return m._classify_apple_fm(messages)
-        model = m._get_model()
-        raw = model(
-            Chat(messages),
-            output_type=m.SessionClassification,
-            max_tokens=m._MAX_TOKENS,
-            sampler=make_sampler(temp=m._TEMPERATURE),
-            verbose=False,
-        )
+        with m.model_session() as model:
+            raw = model(
+                Chat(messages),
+                output_type=m.SessionClassification,
+                max_tokens=m._MAX_TOKENS,
+                sampler=make_sampler(temp=m._TEMPERATURE),
+                verbose=False,
+            )
         return m.SessionClassification.model_validate_json(raw)
     try:
@@ -375,13 +420,13 @@ async def openai_chat_completions(req: _OAIChatRequest) -> dict:
     def _generate() -> str:
         if m._resolve_model_id() == APPLE_INTELLIGENCE_ID:
             return _infer_apple_fm(msgs, max_tokens)
-        model = m._get_model()
-        return model(
-            Chat(msgs),
-            max_tokens=max_tokens,
-            sampler=make_sampler(temp=temperature),
-            verbose=False,
-        )
+        with m.model_session() as model:
+            return model(
+                Chat(msgs),
+                max_tokens=max_tokens,
+                sampler=make_sampler(temp=temperature),
+                verbose=False,
+            )
     t0 = _time.time()
     try:
@@ -504,14 +549,14 @@ async def summarise(req: _SummariseRequest) -> _SummariseResponse:
     from outlines.inputs import Chat
     def _generate() -> str:
-        model = m._get_model()
-        return model(
-            Chat(messages),
-            output_type=_SummarySchema,
-            max_tokens=req.max_tokens,
-            sampler=make_sampler(temp=req.temperature),
-            verbose=False,
-        )
+        with m.model_session() as model:
+            return model(
+                Chat(messages),
+                output_type=_SummarySchema,
+                max_tokens=req.max_tokens,
+                sampler=make_sampler(temp=req.temperature),
+                verbose=False,
+            )
     try:
         raw = await run_in_threadpool(_generate)

package/services/agents/tests/test_run_task_linker_mlx.py CHANGED Viewed

@@ -9,6 +9,7 @@ from __future__ import annotations
 import json
 import sqlite3
 import sys
+import time
 from io import StringIO
 from pathlib import Path
 from typing import Iterator
@@ -1011,6 +1012,58 @@ class TestModelCache:
                 m._get_model()
+# ---------------------------------------------------------------------------
+# Idle eviction — model_session() in-flight tracking + maybe_evict_idle()
+# (the model holds ~7 GB while resident; the server unloads it when idle)
+# ---------------------------------------------------------------------------
+class TestModelEviction:
+    def test_model_session_loads_and_tracks_in_flight(self):
+        import agents.run_task_linker_mlx as m
+        sentinel = MagicMock(name="model")
+        with patch.object(m, "_get_model", return_value=sentinel):
+            m._in_flight = 0
+            with m.model_session() as model:
+                assert model is sentinel
+                assert m._in_flight == 1          # marked in-flight while in use
+            assert m._in_flight == 0              # released on exit
+    def test_evict_noop_when_not_idle_long_enough(self):
+        import agents.run_task_linker_mlx as m
+        m._model_cache["x"] = MagicMock()
+        m._in_flight = 0
+        m._last_used = time.monotonic()           # just used
+        assert m.maybe_evict_idle(idle_s=600) is None
+        assert m.model_resident() is True
+    def test_evict_disabled_when_ttl_zero(self):
+        import agents.run_task_linker_mlx as m
+        m._model_cache["x"] = MagicMock()
+        assert m.maybe_evict_idle(idle_s=0) is None
+        assert m.model_resident() is True
+    def test_evict_noop_when_in_flight(self):
+        import agents.run_task_linker_mlx as m
+        m._model_cache["x"] = MagicMock()
+        m._in_flight = 1                          # an inference is using the model
+        m._last_used = time.monotonic() - 1000
+        try:
+            assert m.maybe_evict_idle(idle_s=0.001) is None
+            assert m.model_resident() is True     # never freed mid-inference
+        finally:
+            m._in_flight = 0
+    def test_evict_clears_cache_when_idle(self):
+        import agents.run_task_linker_mlx as m
+        m._model_cache["x"] = MagicMock()
+        m._in_flight = 0
+        m._last_used = time.monotonic() - 1000    # idle long past the window
+        freed = m.maybe_evict_idle(idle_s=0.001)
+        assert freed is not None                  # eviction happened
+        assert m.model_resident() is False
+        assert m._model_cache == {}
 # ---------------------------------------------------------------------------
 # SessionClassification schema
 # ---------------------------------------------------------------------------

package/services/pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "meridian-agents"
-version = "1.53.1"
+version = "1.54.1"
 description = "Meridian agents — MLX classifier server and Jira worklog synthesis for meridian.db"
 requires-python = ">=3.11"
 authors = [{ name = "Meridiona" }]

package/ui.tar.gz CHANGED Viewed

Binary file