@meridiona/meridian-darwin-arm64 1.53.1 → 1.54.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/.env.example CHANGED
@@ -19,6 +19,15 @@
19
19
  # MLX_SERVER_HOST=127.0.0.1
20
20
  # MLX_SERVER_PORT=7823
21
21
 
22
+ # Idle eviction for the MLX model. The model holds ~7 GB of Metal memory while
23
+ # resident, but classification is bursty — so the server unloads it after this
24
+ # many seconds idle and reloads on the next request (~3 s cold start). Default
25
+ # 120s (aggressive: lightest idle footprint). Raise it to keep the model warm
26
+ # longer; set 0 to disable eviction (pin the model in memory). Avoid values
27
+ # below ~30s: if the TTL drops under the gap between sessions in a classification
28
+ # burst, the model evicts and cold-reloads (~3 s) repeatedly mid-burst.
29
+ # MLX_IDLE_EVICT_S=120
30
+
22
31
  # Dashboard (Next.js UI) port. Defaults to 3939. Change this and re-run
23
32
  # `meridian setup` to move the dashboard.
24
33
  # MERIDIAN_UI_PORT=3939
package/VERSION CHANGED
@@ -1 +1 @@
1
- 1.53.1
1
+ 1.54.1
package/bin/meridian CHANGED
Binary file
package/bin/meridian-tray CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@meridiona/meridian-darwin-arm64",
3
- "version": "1.53.1",
3
+ "version": "1.54.1",
4
4
  "description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
5
5
  "homepage": "https://github.com/Meridiona/meridian",
6
6
  "repository": {
@@ -540,6 +540,21 @@ if [[ "${SKIP_PERMISSIONS}" -eq 0 ]]; then
540
540
  echo " ${HOME}/.meridian/bin/meridian-a11y-helper"
541
541
  echo " Without the a11y helper, Electron apps (Claude, Codex, Slack, …) stay invisible to capture."
542
542
  read -r -p " Press Enter once all are granted… " _ || true
543
+
544
+ # Notifications: the tray surfaces desktop toasts (plan nudges, worklog
545
+ # drafts, faults). macOS hides ALL notifications while the screen is being
546
+ # recorded/shared unless this is on — and screenpipe records continuously, so
547
+ # without it every Meridian toast is silently suppressed. No API/prompt exists
548
+ # for this toggle, so we can only walk the user to it.
549
+ echo "→ Meridian's tray shows desktop notifications. Because screenpipe records"
550
+ echo " the screen, macOS hides notifications during screen sharing unless allowed."
551
+ read -r -p " Press Enter to open Notifications settings… " _ || true
552
+ open "x-apple.systempreferences:com.apple.Notifications-Settings.extension" 2>/dev/null || true
553
+ echo " → Scroll to the bottom and turn ON"
554
+ echo " 'Allow notifications when mirroring or sharing the display'."
555
+ echo " → When 'Meridian Tray' appears, ensure its notifications are allowed"
556
+ echo " (style Banners or Alerts, not None)."
557
+ read -r -p " Press Enter when done… " _ || true
543
558
  fi
544
559
 
545
560
  # Enable a11y mode in installed VS Code-family editors (idempotent). Without
@@ -106,8 +106,11 @@ def _format_candidates(tasks: list[dict]) -> str:
106
106
  desc = desc[:240] + "…"
107
107
  meta_parts = [p for p in [issue_type, f"Epic: {epic_title}" if epic_title else "", sprint_name, f"tags: {tags}" if tags else ""] if p]
108
108
  meta = " [" + " · ".join(meta_parts) + "]" if meta_parts else ""
109
+ # The dev declared this ticket as today's focus on the plan page. It's a
110
+ # tie-breaking prior, not a forced answer — only matches if the evidence fits.
111
+ focus = " ★ TODAY'S FOCUS" if task.get("is_today_focus") else ""
109
112
  rows.append(
110
- f"{i}. {task['task_key']}{meta}\n"
113
+ f"{i}. {task['task_key']}{focus}{meta}\n"
111
114
  f" title: {title}\n"
112
115
  f" description: {desc or '(empty)'}"
113
116
  )
@@ -152,12 +155,21 @@ def build_user_message(
152
155
  f"{_format_recent_sessions(sessions)}\n"
153
156
  "\n"
154
157
  ) if has_any_task_key else ""
158
+ # When the dev declared a focus for the day, name it in the header so the model
159
+ # treats ★ rows as a prior — preferred when the evidence plausibly fits, but
160
+ # never forced. Recall is preserved: every candidate is still listed.
161
+ has_focus = any(c.get("is_today_focus") for c in candidates)
162
+ candidate_header = (
163
+ "CANDIDATE TICKETS (★ = the dev declared this as a task they're working on "
164
+ "today; prefer a ★ ticket when the session plausibly matches it, but only "
165
+ "if the evidence fits — never force a match):\n"
166
+ ) if has_focus else "CANDIDATE TICKETS:\n"
155
167
  return (
156
168
  f"{recent_block}"
157
169
  "SESSION:\n"
158
170
  f"{_format_session(session)}\n"
159
171
  "\n"
160
- "CANDIDATE TICKETS:\n"
172
+ f"{candidate_header}"
161
173
  f"{_format_candidates(candidates)}"
162
174
  )
163
175
 
@@ -24,14 +24,18 @@ Method tag in results: "mlx_direct".
24
24
  """
25
25
  from __future__ import annotations
26
26
 
27
+ import datetime as _dt
28
+ import gc
27
29
  import json
28
30
  import logging
29
31
  import os
30
32
  import sqlite3 as _sqlite3
31
33
  import sys
34
+ import threading
32
35
  import time
36
+ from contextlib import contextmanager
33
37
  from pathlib import Path
34
- from typing import Any, Literal, Optional
38
+ from typing import Any, Literal, Optional, Iterator
35
39
 
36
40
  from opentelemetry.trace import StatusCode
37
41
  from pydantic import BaseModel, Field
@@ -233,42 +237,144 @@ _SYSTEM_PROMPT = (
233
237
 
234
238
 
235
239
  # ---------------------------------------------------------------------------
236
- # Model loading — cached for the process lifetime.
237
- # outlines.from_mlxlm wraps the already-loaded mlx model; subsequent calls
238
- # skip the expensive disk load.
240
+ # Model loading — loaded lazily on first use, evicted when idle.
241
+ #
242
+ # The MLX model holds ~7 GB of Metal unified memory while resident (measured;
243
+ # note `ps`/Activity Monitor RSS does NOT show it). Classification is bursty,
244
+ # so we keep the model only while it's being used: load on first inference,
245
+ # and evict after MLX_IDLE_EVICT_S of inactivity (server.py runs the evictor).
246
+ # `del + gc.collect() + mx.clear_cache()` reclaims the full 7 GB; cold reload
247
+ # is ~3 s. `_model_lock` + `_in_flight` guarantee the evictor never frees the
248
+ # model out from under an in-flight inference.
239
249
  # ---------------------------------------------------------------------------
240
250
 
241
251
# Process-wide model state shared by inference callers and the idle evictor.
_model_cache: dict[str, Any] = {}
_model_lock = threading.Lock()  # guards _model_cache mutation, _in_flight, _last_used, eviction
_in_flight = 0  # inferences currently using the model
_last_used = time.monotonic()  # monotonic ts of the last finished inference


def _idle_evict_seconds(default: float = 120.0) -> float:
    """Read ``MLX_IDLE_EVICT_S`` from the environment, tolerating bad values.

    Returns ``default`` when the variable is unset, blank, not a number, or
    NaN. A malformed value must not crash the module at import time, and a NaN
    would make every idle comparison false, so both fall back to the default
    with a warning. Zero and negative values are returned unchanged (they
    disable eviction downstream).
    """
    raw = os.environ.get("MLX_IDLE_EVICT_S", "").strip()
    if not raw:
        return default
    try:
        seconds = float(raw)
    except ValueError:
        seconds = float("nan")
    if seconds != seconds:  # NaN is the only float that is not equal to itself
        logging.getLogger(__name__).warning(
            "run_task_linker_mlx: ignoring invalid MLX_IDLE_EVICT_S=%r; using %.0fs",
            raw, default,
        )
        return default
    return seconds


# Aggressive default (2 min): the model is present only during active bursts.
# Tune via env without a code change; 0 disables idle eviction entirely.
_IDLE_EVICT_S = _idle_evict_seconds()
242
259
 
243
260
 
244
261
  def _get_model() -> Any:
245
- """Return an outlines-wrapped model, loading from disk on the first call."""
262
+ """Return an outlines-wrapped model, loading from disk on the first call.
263
+
264
+ Cache-miss load is done under _model_lock (double-checked) so concurrent
265
+ callers can't double-load and the idle evictor can't race the load.
266
+ """
246
267
  model_id = _resolve_model_id()
247
- if model_id in _model_cache:
248
- return _model_cache[model_id]
268
+ cached = _model_cache.get(model_id)
269
+ if cached is not None:
270
+ return cached
271
+
272
+ with _model_lock:
273
+ cached = _model_cache.get(model_id) # re-check under lock
274
+ if cached is not None:
275
+ return cached
276
+ try:
277
+ import mlx_lm
278
+ import outlines
279
+ except ImportError as exc:
280
+ raise ImportError(
281
+ f"Required package not installed: {exc}. "
282
+ "Install with: pip install 'mlx-lm>=0.22' 'outlines[mlxlm]>=1.3'"
283
+ ) from exc
284
+
285
+ log.info(
286
+ "run_task_linker_mlx: loading %s (first call this process)", model_id
287
+ )
288
+ t0 = time.time()
289
+ mlx_model, tokenizer = mlx_lm.load(
290
+ model_id,
291
+ tokenizer_config={"trust_remote_code": True},
292
+ )
293
+ outlines_model = outlines.from_mlxlm(mlx_model, tokenizer)
294
+ log.info("run_task_linker_mlx: model loaded in %.1fs", time.time() - t0)
249
295
 
296
+ _model_cache[model_id] = outlines_model
297
+ return outlines_model
298
+
299
+
300
+ @contextmanager
301
+ def model_session() -> Iterator[Any]:
302
+ """Yield the loaded model, marking it in-flight so the idle evictor never
303
+ frees it mid-inference. Wrap every direct ``model(...)`` call in this.
304
+
305
+ Lock is held only briefly (to bump/clear the in-flight counter), never for
306
+ the duration of inference. NOTE: production serialises all MLX calls upstream
307
+ via the Rust llm_gate (1-permit semaphore), so inferences don't actually
308
+ overlap — this lock scope just avoids adding a second, redundant serialisation
309
+ point, NOT a claim that concurrent generation on the shared model is safe.
310
+ """
311
+ global _in_flight, _last_used
312
+ with _model_lock:
313
+ _in_flight += 1
250
314
  try:
251
- import mlx_lm
252
- import outlines
253
- except ImportError as exc:
254
- raise ImportError(
255
- f"Required package not installed: {exc}. "
256
- "Install with: pip install 'mlx-lm>=0.22' 'outlines[mlxlm]>=1.3'"
257
- ) from exc
258
-
259
- log.info(
260
- "run_task_linker_mlx: loading %s (first call this process)", model_id
261
- )
262
- t0 = time.time()
263
- mlx_model, tokenizer = mlx_lm.load(
264
- model_id,
265
- tokenizer_config={"trust_remote_code": True},
266
- )
267
- outlines_model = outlines.from_mlxlm(mlx_model, tokenizer)
268
- log.info("run_task_linker_mlx: model loaded in %.1fs", time.time() - t0)
315
+ yield _get_model()
316
+ finally:
317
+ with _model_lock:
318
+ _in_flight -= 1
319
+ _last_used = time.monotonic()
320
+
321
+
322
def maybe_evict_idle(idle_s: float | None = None) -> float | None:
    """Drop the cached model when it has sat unused for long enough.

    ``idle_s`` overrides the module default ``_IDLE_EVICT_S``; a threshold of
    zero or less disables eviction. Eviction only happens when the cache is
    non-empty, no inference is in flight, and the time since the last finished
    inference has reached the threshold.

    Returns the GB of MLX active memory released (0.0 when MLX could not be
    queried), or None when nothing was evicted.

    The lock is taken without blocking: if another thread holds it, this call
    gives up immediately and the caller is expected to retry later.
    """
    threshold = idle_s if idle_s is not None else _IDLE_EVICT_S
    if threshold <= 0:
        return None

    got_lock = _model_lock.acquire(blocking=False)
    if not got_lock:
        return None
    try:
        if _in_flight > 0 or not _model_cache:
            return None
        idle_for = time.monotonic() - _last_used
        if idle_for < threshold:
            return None

        # Snapshot active memory first so the amount reclaimed can be reported.
        try:
            import mlx.core as metal
            bytes_before = metal.get_active_memory()
        except Exception:  # noqa: BLE001 — mx should always import here
            metal, bytes_before = None, 0

        _model_cache.clear()
        gc.collect()

        if metal is None:
            reclaimed_gb = 0.0
        else:
            metal.clear_cache()
            bytes_after = metal.get_active_memory()
            reclaimed_gb = max(0.0, (bytes_before - bytes_after) / 1e9)

        log.info(
            "run_task_linker_mlx: evicted idle model (idle ≥ %.0fs), freed ~%.1f GB",
            threshold, reclaimed_gb,
        )
        return reclaimed_gb
    finally:
        _model_lock.release()
358
+
359
+
360
def model_resident() -> bool:
    """Report whether the model cache currently holds any entry."""
    return len(_model_cache) > 0
269
363
 
270
- _model_cache[model_id] = outlines_model
271
- return outlines_model
364
+
365
def model_active_memory_gb() -> float | None:
    """Return MLX's current active memory in GB, rounded to two decimals.

    The figure is whatever ``mx.get_active_memory()`` reports for the whole
    process, not a per-model number. Returns None when MLX cannot be imported
    or queried.
    """
    try:
        import mlx.core as metal
        active_bytes = metal.get_active_memory()
    except Exception:  # noqa: BLE001 — mx absent on non-MLX machines
        return None
    return round(active_bytes / 1e9, 2)
272
378
 
273
379
 
274
380
  # Apple Foundation Models has a 4096-token combined context window (input + output).
@@ -483,7 +589,53 @@ def _fetch_recent_sessions(
483
589
  return result
484
590
 
485
591
 
486
- def _fetch_pm_tasks(con: _sqlite3.Connection) -> list[dict[str, Any]]:
592
def _local_day(started_at: str) -> str:
    """Return the local calendar day (YYYY-MM-DD) of a session's UTC `started_at`.

    `daily_plan.plan_date` is the dev's *local* day, while
    `app_sessions.started_at` is stored in UTC, so the timestamp is converted
    UTC → host-local before taking the date.

    A timestamp without an explicit offset is interpreted as UTC. Without
    that, `astimezone()` would assume a naive value is already local time and
    skip the conversion, which can put the session on the wrong plan day.

    Returns "" for an empty, unparseable, or out-of-range timestamp (→ no boost).
    """
    if not started_at:
        return ""
    try:
        parsed = _dt.datetime.fromisoformat(started_at)
        if parsed.tzinfo is None:
            # Stored values are UTC; make that explicit before converting.
            parsed = parsed.replace(tzinfo=_dt.timezone.utc)
        # `astimezone()` with no argument converts to the host's local zone.
        return parsed.astimezone().date().isoformat()
    except (ValueError, OverflowError, OSError):
        # ValueError: not ISO-8601. OverflowError/OSError: the platform's
        # local-time conversion rejected an extreme date.
        return ""
608
+
609
+
610
def _fetch_plan_focus(con: _sqlite3.Connection, plan_date: str) -> list[str]:
    """Return the task_keys planned for `plan_date`, ordered by `position`.

    The list is empty when `plan_date` is blank, when `daily_plan_meta` has no
    row for the day, when that row is marked skipped or has no `confirmed_at`,
    when the day has no `daily_plan` rows, or when either query raises
    `sqlite3.OperationalError` (for example, the tables do not exist).

    Rows are read by column name, so `con` must use a name-addressable row
    factory.
    """
    focus_keys: list[str] = []
    if not plan_date:
        return focus_keys

    params = (plan_date,)
    try:
        day_meta = con.execute(
            "SELECT confirmed_at, skipped FROM daily_plan_meta WHERE plan_date = ?",
            params,
        ).fetchone()
        is_confirmed = (
            day_meta is not None
            and not day_meta["skipped"]
            and bool(day_meta["confirmed_at"])
        )
        if is_confirmed:
            plan_rows = con.execute(
                "SELECT task_key FROM daily_plan WHERE plan_date = ? ORDER BY position",
                params,
            ).fetchall()
            focus_keys.extend(row["task_key"] for row in plan_rows)
    except _sqlite3.OperationalError:
        return []
    return focus_keys
634
+
635
+
636
+ def _fetch_pm_tasks(
637
+ con: _sqlite3.Connection, focus_keys: list[str] | None = None
638
+ ) -> list[dict[str, Any]]:
487
639
  # Candidate set for classification. Tickets the user explicitly EXCLUDED during
488
640
  # onboarding board-cleanup (pm_task_curation.decision = 'excluded') are dropped
489
641
  # so a cleaned-up dead ticket can never be a classification target. Everything
@@ -512,7 +664,20 @@ def _fetch_pm_tasks(con: _sqlite3.Connection) -> list[dict[str, Any]]:
512
664
  # Pre-migration-038 DB (no pm_task_curation): degrade to the unfiltered
513
665
  # candidate set rather than crashing the whole /classify_sessions call.
514
666
  rows = con.execute(base_cols).fetchall()
515
- return [dict(r) for r in rows]
667
+ tasks = [dict(r) for r in rows]
668
+
669
+ # Today's-focus boost: tag the tickets the dev declared for the day and float
670
+ # them to the top of the candidate list, in their declared order. This is a
671
+ # BOOST, never a filter — every other candidate still follows, so recall is
672
+ # untouched. A focus key that isn't in `tasks` (e.g. excluded by curation)
673
+ # simply has no effect; we never resurrect a filtered-out ticket.
674
+ focus = focus_keys or []
675
+ if focus:
676
+ order = {key: i for i, key in enumerate(focus)}
677
+ for t in tasks:
678
+ t["is_today_focus"] = t["task_key"] in order
679
+ tasks.sort(key=lambda t: (0, order[t["task_key"]]) if t.get("is_today_focus") else (1, 0))
680
+ return tasks
516
681
 
517
682
 
518
683
  # ---------------------------------------------------------------------------
@@ -555,10 +720,13 @@ def _classify_one(
555
720
  session_id, f"session {session_id} not found in DB", 0.0, "mlx_error"
556
721
  )
557
722
 
558
- pm_tasks = _fetch_pm_tasks(con)
559
- recent = _fetch_recent_sessions(con, session_id)
723
+ plan_date = _local_day(session_raw.get("started_at") or "")
724
+ focus_keys = _fetch_plan_focus(con, plan_date)
725
+ pm_tasks = _fetch_pm_tasks(con, focus_keys)
726
+ recent = _fetch_recent_sessions(con, session_id)
560
727
 
561
728
  db_span.set_attribute("pm_tasks_count", len(pm_tasks))
729
+ db_span.set_attribute("today_focus_count", len(focus_keys))
562
730
  db_span.set_attribute("recent_sessions_count", len(recent))
563
731
 
564
732
  session_text = session_raw.get("session_text") or ""
@@ -642,14 +810,14 @@ def _classify_one(
642
810
  from mlx_lm.sample_utils import make_sampler
643
811
  from outlines.inputs import Chat
644
812
 
645
- model = _get_model()
646
- raw = model(
647
- Chat(messages),
648
- output_type=SessionClassification,
649
- max_tokens=_MAX_TOKENS,
650
- sampler=make_sampler(temp=_TEMPERATURE),
651
- verbose=False,
652
- )
813
+ with model_session() as model:
814
+ raw = model(
815
+ Chat(messages),
816
+ output_type=SessionClassification,
817
+ max_tokens=_MAX_TOKENS,
818
+ sampler=make_sampler(temp=_TEMPERATURE),
819
+ verbose=False,
820
+ )
653
821
  except Exception as exc:
654
822
  elapsed = time.time() - t0
655
823
  outcome = "apple_fm_error" if _use_apple_fm else "mlx_error"
@@ -785,7 +953,8 @@ def _classify_one_logged(
785
953
  """Classify one session and append a full record to the run log."""
786
954
  # Gather inputs before classification so we can log them even on error.
787
955
  session_raw = _fetch_session(con, session_id)
788
- pm_tasks = _fetch_pm_tasks(con) if session_raw else []
956
+ focus_keys = _fetch_plan_focus(con, _local_day(session_raw.get("started_at") or "")) if session_raw else []
957
+ pm_tasks = _fetch_pm_tasks(con, focus_keys) if session_raw else []
789
958
  recent = _fetch_recent_sessions(con, session_id) if session_raw else []
790
959
 
791
960
  if session_raw:
@@ -41,20 +41,58 @@ _DB_PATH = Path(os.environ.get("MERIDIAN_DB", Path.home() / ".meridian/meridian.
41
41
  _app_state: dict[str, Any] = {}
42
42
 
43
43
 
44
async def _idle_evictor(mlx_module: Any) -> None:
    """Periodically ask the MLX module to evict its model once it has gone idle.

    Returns immediately when ``mlx_module._IDLE_EVICT_S`` is zero or negative.
    Otherwise it loops forever: sleep, then run ``mlx_module.maybe_evict_idle``
    in the threadpool so its blocking work stays off the event loop. An
    ``Exception`` raised by a tick is logged and swallowed so the loop keeps
    running.
    """
    import asyncio
    from fastapi.concurrency import run_in_threadpool

    idle_ttl = mlx_module._IDLE_EVICT_S
    if idle_ttl <= 0:
        return

    # Poll about four times per idle window, but no more often than every 15 s.
    poll_every = max(15.0, idle_ttl / 4.0)
    while True:
        await asyncio.sleep(poll_every)
        try:
            await run_in_threadpool(mlx_module.maybe_evict_idle)
        except Exception as err:  # noqa: BLE001 — evictor must never die
            log.warning("server: idle-evictor error: %s", err)
63
+
64
+
44
65
  @asynccontextmanager
45
66
  async def _lifespan(app: FastAPI) -> AsyncIterator[None]:
67
+ import asyncio
46
68
  import datetime
47
69
  import agents.run_task_linker_mlx as _mlx
48
70
  _app_state["mlx_module"] = _mlx
49
71
  _app_state["loaded_at"] = datetime.datetime.now(datetime.timezone.utc).isoformat()
50
72
  from agents.llm_selector import APPLE_INTELLIGENCE_ID
73
+ evictor: "asyncio.Task | None" = None
51
74
  if _mlx._resolve_model_id() == APPLE_INTELLIGENCE_ID:
52
- log.info("server: 8 GB machine — Apple Intelligence backend, no MLX model to pre-load")
75
+ log.info("server: Apple Intelligence backend no MLX model to load")
76
+ elif _mlx._IDLE_EVICT_S > 0:
77
+ # Lazy: the ~7 GB model loads on the first inference and is evicted after
78
+ # MLX_IDLE_EVICT_S of inactivity, so the server idles light (~0.4 GB)
79
+ # instead of pinning ~7 GB of Metal memory for the whole process life.
80
+ log.info(
81
+ "server: MLX model loads on first request; idle-evict after %.0fs",
82
+ _mlx._IDLE_EVICT_S,
83
+ )
84
+ evictor = asyncio.create_task(_idle_evictor(_mlx))
53
85
  else:
54
- log.info("server: loading MLX model at startup…")
55
- _mlx._get_model()
56
- log.info("server: MLX model ready")
57
- yield
86
+ # Eviction disabled — don't spawn a no-op evictor task just to cancel it.
87
+ log.info("server: MLX model loads on first request; idle-eviction disabled (MLX_IDLE_EVICT_S=0)")
88
+ try:
89
+ yield
90
+ finally:
91
+ if evictor is not None:
92
+ import contextlib
93
+ evictor.cancel()
94
+ with contextlib.suppress(asyncio.CancelledError):
95
+ await evictor
58
96
 
59
97
 
60
98
  app = FastAPI(title="Meridian Agent", version="1.0.0", lifespan=_lifespan)
@@ -76,12 +114,19 @@ async def health() -> dict:
76
114
 
77
115
  @app.get("/info")
78
116
  async def info() -> dict:
79
- """Return the identity of the loaded model."""
117
+ """Return the identity of the model and its live memory state.
118
+
119
+ `active_memory_gb` reads `mx.get_active_memory()` — the ONLY honest measure
120
+ of the model's footprint, since Metal unified memory is invisible to `ps`
121
+ and Activity Monitor (they undercount the model by ~6.5 GB).
122
+ """
80
123
  m = _app_state.get("mlx_module")
81
124
  return {
82
- "backend": "mlx",
83
- "model_id": m._resolve_model_id() if m else None,
84
- "loaded_at": _app_state.get("loaded_at"),
125
+ "backend": "mlx",
126
+ "model_id": m._resolve_model_id() if m else None,
127
+ "loaded_at": _app_state.get("loaded_at"),
128
+ "model_resident": m.model_resident() if m else False,
129
+ "active_memory_gb": m.model_active_memory_gb() if m else None,
85
130
  }
86
131
 
87
132
 
@@ -143,14 +188,14 @@ async def classify(req: ClassifyRequest) -> ClassifyResponse:
143
188
  # _classify_apple_fm uses asyncio.new_event_loop() internally;
144
189
  # must run in a thread (no existing loop) not in the async handler.
145
190
  return m._classify_apple_fm(messages)
146
- model = m._get_model()
147
- raw = model(
148
- Chat(messages),
149
- output_type=m.SessionClassification,
150
- max_tokens=m._MAX_TOKENS,
151
- sampler=make_sampler(temp=m._TEMPERATURE),
152
- verbose=False,
153
- )
191
+ with m.model_session() as model:
192
+ raw = model(
193
+ Chat(messages),
194
+ output_type=m.SessionClassification,
195
+ max_tokens=m._MAX_TOKENS,
196
+ sampler=make_sampler(temp=m._TEMPERATURE),
197
+ verbose=False,
198
+ )
154
199
  return m.SessionClassification.model_validate_json(raw)
155
200
 
156
201
  try:
@@ -375,13 +420,13 @@ async def openai_chat_completions(req: _OAIChatRequest) -> dict:
375
420
  def _generate() -> str:
376
421
  if m._resolve_model_id() == APPLE_INTELLIGENCE_ID:
377
422
  return _infer_apple_fm(msgs, max_tokens)
378
- model = m._get_model()
379
- return model(
380
- Chat(msgs),
381
- max_tokens=max_tokens,
382
- sampler=make_sampler(temp=temperature),
383
- verbose=False,
384
- )
423
+ with m.model_session() as model:
424
+ return model(
425
+ Chat(msgs),
426
+ max_tokens=max_tokens,
427
+ sampler=make_sampler(temp=temperature),
428
+ verbose=False,
429
+ )
385
430
 
386
431
  t0 = _time.time()
387
432
  try:
@@ -504,14 +549,14 @@ async def summarise(req: _SummariseRequest) -> _SummariseResponse:
504
549
  from outlines.inputs import Chat
505
550
 
506
551
  def _generate() -> str:
507
- model = m._get_model()
508
- return model(
509
- Chat(messages),
510
- output_type=_SummarySchema,
511
- max_tokens=req.max_tokens,
512
- sampler=make_sampler(temp=req.temperature),
513
- verbose=False,
514
- )
552
+ with m.model_session() as model:
553
+ return model(
554
+ Chat(messages),
555
+ output_type=_SummarySchema,
556
+ max_tokens=req.max_tokens,
557
+ sampler=make_sampler(temp=req.temperature),
558
+ verbose=False,
559
+ )
515
560
 
516
561
  try:
517
562
  raw = await run_in_threadpool(_generate)
@@ -9,6 +9,7 @@ from __future__ import annotations
9
9
  import json
10
10
  import sqlite3
11
11
  import sys
12
+ import time
12
13
  from io import StringIO
13
14
  from pathlib import Path
14
15
  from typing import Iterator
@@ -1011,6 +1012,58 @@ class TestModelCache:
1011
1012
  m._get_model()
1012
1013
 
1013
1014
 
1015
+ # ---------------------------------------------------------------------------
1016
+ # Idle eviction — model_session() in-flight tracking + maybe_evict_idle()
1017
+ # (the model holds ~7 GB while resident; the server unloads it when idle)
1018
+ # ---------------------------------------------------------------------------
1019
+
1020
class TestModelEviction:
    """Idle-eviction behaviour of ``model_session()`` and ``maybe_evict_idle()``.

    These tests write the module-level eviction state directly
    (``_model_cache``, ``_in_flight``, ``_last_used``). Each test therefore
    starts from an empty cache with nothing in flight, and the previous state
    is restored afterwards so a placeholder cache entry cannot leak into
    other tests.
    """

    def setup_method(self):
        import agents.run_task_linker_mlx as m
        self._saved_cache = dict(m._model_cache)
        self._saved_in_flight = m._in_flight
        self._saved_last_used = m._last_used
        m._model_cache.clear()
        m._in_flight = 0

    def teardown_method(self):
        import agents.run_task_linker_mlx as m
        # Restore in place so the module keeps using the same dict object.
        m._model_cache.clear()
        m._model_cache.update(self._saved_cache)
        m._in_flight = self._saved_in_flight
        m._last_used = self._saved_last_used

    def test_model_session_loads_and_tracks_in_flight(self):
        import agents.run_task_linker_mlx as m
        sentinel = MagicMock(name="model")
        with patch.object(m, "_get_model", return_value=sentinel):
            with m.model_session() as model:
                assert model is sentinel
                assert m._in_flight == 1  # marked in-flight while in use
            assert m._in_flight == 0  # released on exit

    def test_evict_noop_when_not_idle_long_enough(self):
        import agents.run_task_linker_mlx as m
        m._model_cache["x"] = MagicMock()
        m._last_used = time.monotonic()  # just used
        assert m.maybe_evict_idle(idle_s=600) is None
        assert m.model_resident() is True

    def test_evict_disabled_when_ttl_zero(self):
        import agents.run_task_linker_mlx as m
        m._model_cache["x"] = MagicMock()
        assert m.maybe_evict_idle(idle_s=0) is None
        assert m.model_resident() is True

    def test_evict_noop_when_in_flight(self):
        import agents.run_task_linker_mlx as m
        m._model_cache["x"] = MagicMock()
        m._in_flight = 1  # an inference is using the model
        m._last_used = time.monotonic() - 1000
        assert m.maybe_evict_idle(idle_s=0.001) is None
        assert m.model_resident() is True  # never freed mid-inference

    def test_evict_clears_cache_when_idle(self):
        import agents.run_task_linker_mlx as m
        m._model_cache["x"] = MagicMock()
        m._last_used = time.monotonic() - 1000  # idle long past the window
        freed = m.maybe_evict_idle(idle_s=0.001)
        assert freed is not None  # eviction happened
        assert m.model_resident() is False
        assert m._model_cache == {}
1065
+
1066
+
1014
1067
  # ---------------------------------------------------------------------------
1015
1068
  # SessionClassification schema
1016
1069
  # ---------------------------------------------------------------------------
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "meridian-agents"
7
- version = "1.53.1"
7
+ version = "1.54.1"
8
8
  description = "Meridian agents — MLX classifier server and Jira worklog synthesis for meridian.db"
9
9
  requires-python = ">=3.11"
10
10
  authors = [{ name = "Meridiona" }]
package/ui.tar.gz CHANGED
Binary file