@meridiona/meridian-darwin-arm64 1.53.1 → 1.54.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +9 -0
- package/VERSION +1 -1
- package/bin/meridian +0 -0
- package/bin/meridian-tray +0 -0
- package/package.json +1 -1
- package/scripts/install-from-bundle.sh +15 -0
- package/services/agents/_prompts.py +14 -2
- package/services/agents/run_task_linker_mlx.py +209 -40
- package/services/agents/server.py +77 -32
- package/services/agents/tests/test_run_task_linker_mlx.py +53 -0
- package/services/pyproject.toml +1 -1
- package/ui.tar.gz +0 -0
package/.env.example
CHANGED
|
@@ -19,6 +19,15 @@
|
|
|
19
19
|
# MLX_SERVER_HOST=127.0.0.1
|
|
20
20
|
# MLX_SERVER_PORT=7823
|
|
21
21
|
|
|
22
|
+
# Idle eviction for the MLX model. The model holds ~7 GB of Metal memory while
|
|
23
|
+
# resident, but classification is bursty — so the server unloads it after this
|
|
24
|
+
# many seconds idle and reloads on the next request (~3 s cold start). Default
|
|
25
|
+
# 120s (aggressive: lightest idle footprint). Raise it to keep the model warm
|
|
26
|
+
# longer; set 0 to disable eviction (pin the model in memory). Avoid values
|
|
27
|
+
# below ~30s: if the TTL drops under the gap between sessions in a classification
|
|
28
|
+
# burst, the model evicts and cold-reloads (~3 s) repeatedly mid-burst.
|
|
29
|
+
# MLX_IDLE_EVICT_S=120
|
|
30
|
+
|
|
22
31
|
# Dashboard (Next.js UI) port. Defaults to 3939. Change this and re-run
|
|
23
32
|
# `meridian setup` to move the dashboard.
|
|
24
33
|
# MERIDIAN_UI_PORT=3939
|
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
1.53.1
|
|
1
|
+
1.54.1
|
package/bin/meridian
CHANGED
|
Binary file
|
package/bin/meridian-tray
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@meridiona/meridian-darwin-arm64",
|
|
3
|
-
"version": "1.53.1",
|
|
3
|
+
"version": "1.54.1",
|
|
4
4
|
"description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
|
|
5
5
|
"homepage": "https://github.com/Meridiona/meridian",
|
|
6
6
|
"repository": {
|
package/scripts/install-from-bundle.sh
CHANGED
|
@@ -540,6 +540,21 @@ if [[ "${SKIP_PERMISSIONS}" -eq 0 ]]; then
|
|
|
540
540
|
echo " ${HOME}/.meridian/bin/meridian-a11y-helper"
|
|
541
541
|
echo " Without the a11y helper, Electron apps (Claude, Codex, Slack, …) stay invisible to capture."
|
|
542
542
|
read -r -p " Press Enter once all are granted… " _ || true
|
|
543
|
+
|
|
544
|
+
# Notifications: the tray surfaces desktop toasts (plan nudges, worklog
|
|
545
|
+
# drafts, faults). macOS hides ALL notifications while the screen is being
|
|
546
|
+
# recorded/shared unless this is on — and screenpipe records continuously, so
|
|
547
|
+
# without it every Meridian toast is silently suppressed. No API/prompt exists
|
|
548
|
+
# for this toggle, so we can only walk the user to it.
|
|
549
|
+
echo "→ Meridian's tray shows desktop notifications. Because screenpipe records"
|
|
550
|
+
echo " the screen, macOS hides notifications during screen sharing unless allowed."
|
|
551
|
+
read -r -p " Press Enter to open Notifications settings… " _ || true
|
|
552
|
+
open "x-apple.systempreferences:com.apple.Notifications-Settings.extension" 2>/dev/null || true
|
|
553
|
+
echo " → Scroll to the bottom and turn ON"
|
|
554
|
+
echo " 'Allow notifications when mirroring or sharing the display'."
|
|
555
|
+
echo " → When 'Meridian Tray' appears, ensure its notifications are allowed"
|
|
556
|
+
echo " (style Banners or Alerts, not None)."
|
|
557
|
+
read -r -p " Press Enter when done… " _ || true
|
|
543
558
|
fi
|
|
544
559
|
|
|
545
560
|
# Enable a11y mode in installed VS Code-family editors (idempotent). Without
|
package/services/agents/_prompts.py
CHANGED
|
@@ -106,8 +106,11 @@ def _format_candidates(tasks: list[dict]) -> str:
|
|
|
106
106
|
desc = desc[:240] + "…"
|
|
107
107
|
meta_parts = [p for p in [issue_type, f"Epic: {epic_title}" if epic_title else "", sprint_name, f"tags: {tags}" if tags else ""] if p]
|
|
108
108
|
meta = " [" + " · ".join(meta_parts) + "]" if meta_parts else ""
|
|
109
|
+
# The dev declared this ticket as today's focus on the plan page. It's a
|
|
110
|
+
# tie-breaking prior, not a forced answer — only matches if the evidence fits.
|
|
111
|
+
focus = " ★ TODAY'S FOCUS" if task.get("is_today_focus") else ""
|
|
109
112
|
rows.append(
|
|
110
|
-
f"{i}. {task['task_key']}{meta}\n"
|
|
113
|
+
f"{i}. {task['task_key']}{focus}{meta}\n"
|
|
111
114
|
f" title: {title}\n"
|
|
112
115
|
f" description: {desc or '(empty)'}"
|
|
113
116
|
)
|
|
@@ -152,12 +155,21 @@ def build_user_message(
|
|
|
152
155
|
f"{_format_recent_sessions(sessions)}\n"
|
|
153
156
|
"\n"
|
|
154
157
|
) if has_any_task_key else ""
|
|
158
|
+
# When the dev declared a focus for the day, name it in the header so the model
|
|
159
|
+
# treats ★ rows as a prior — preferred when the evidence plausibly fits, but
|
|
160
|
+
# never forced. Recall is preserved: every candidate is still listed.
|
|
161
|
+
has_focus = any(c.get("is_today_focus") for c in candidates)
|
|
162
|
+
candidate_header = (
|
|
163
|
+
"CANDIDATE TICKETS (★ = the dev declared this as a task they're working on "
|
|
164
|
+
"today; prefer a ★ ticket when the session plausibly matches it, but only "
|
|
165
|
+
"if the evidence fits — never force a match):\n"
|
|
166
|
+
) if has_focus else "CANDIDATE TICKETS:\n"
|
|
155
167
|
return (
|
|
156
168
|
f"{recent_block}"
|
|
157
169
|
"SESSION:\n"
|
|
158
170
|
f"{_format_session(session)}\n"
|
|
159
171
|
"\n"
|
|
160
|
-
"CANDIDATE TICKETS:\n"
|
|
172
|
+
f"{candidate_header}"
|
|
161
173
|
f"{_format_candidates(candidates)}"
|
|
162
174
|
)
|
|
163
175
|
|
package/services/agents/run_task_linker_mlx.py
CHANGED
|
@@ -24,14 +24,18 @@ Method tag in results: "mlx_direct".
|
|
|
24
24
|
"""
|
|
25
25
|
from __future__ import annotations
|
|
26
26
|
|
|
27
|
+
import datetime as _dt
|
|
28
|
+
import gc
|
|
27
29
|
import json
|
|
28
30
|
import logging
|
|
29
31
|
import os
|
|
30
32
|
import sqlite3 as _sqlite3
|
|
31
33
|
import sys
|
|
34
|
+
import threading
|
|
32
35
|
import time
|
|
36
|
+
from contextlib import contextmanager
|
|
33
37
|
from pathlib import Path
|
|
34
|
-
from typing import Any, Literal, Optional
|
|
38
|
+
from typing import Any, Literal, Optional, Iterator
|
|
35
39
|
|
|
36
40
|
from opentelemetry.trace import StatusCode
|
|
37
41
|
from pydantic import BaseModel, Field
|
|
@@ -233,42 +237,144 @@ _SYSTEM_PROMPT = (
|
|
|
233
237
|
|
|
234
238
|
|
|
235
239
|
# ---------------------------------------------------------------------------
|
|
236
|
-
# Model loading —
|
|
237
|
-
#
|
|
238
|
-
#
|
|
240
|
+
# Model loading — loaded lazily on first use, evicted when idle.
|
|
241
|
+
#
|
|
242
|
+
# The MLX model holds ~7 GB of Metal unified memory while resident (measured;
|
|
243
|
+
# note `ps`/Activity Monitor RSS does NOT show it). Classification is bursty,
|
|
244
|
+
# so we keep the model only while it's being used: load on first inference,
|
|
245
|
+
# and evict after MLX_IDLE_EVICT_S of inactivity (server.py runs the evictor).
|
|
246
|
+
# `del + gc.collect() + mx.clear_cache()` reclaims the full 7 GB; cold reload
|
|
247
|
+
# is ~3 s. `_model_lock` + `_in_flight` guarantee the evictor never frees the
|
|
248
|
+
# model out from under an in-flight inference.
|
|
239
249
|
# ---------------------------------------------------------------------------
|
|
240
250
|
|
|
241
251
|
_model_cache: dict[str, Any] = {}
|
|
252
|
+
_model_lock = threading.Lock() # guards _model_cache mutation, _in_flight, _last_used, eviction
|
|
253
|
+
_in_flight = 0 # inferences currently using the model
|
|
254
|
+
_last_used = time.monotonic() # monotonic ts of the last finished inference
|
|
255
|
+
|
|
256
|
+
# Aggressive default (2 min): the model is present only during active bursts.
|
|
257
|
+
# Tune via env without a code change; 0 disables idle eviction entirely.
|
|
258
|
+
_IDLE_EVICT_S = float(os.environ.get("MLX_IDLE_EVICT_S", "120"))
|
|
242
259
|
|
|
243
260
|
|
|
244
261
|
def _get_model() -> Any:
|
|
245
|
-
"""Return an outlines-wrapped model, loading from disk on the first call.
|
|
262
|
+
"""Return an outlines-wrapped model, loading from disk on the first call.
|
|
263
|
+
|
|
264
|
+
Cache-miss load is done under _model_lock (double-checked) so concurrent
|
|
265
|
+
callers can't double-load and the idle evictor can't race the load.
|
|
266
|
+
"""
|
|
246
267
|
model_id = _resolve_model_id()
|
|
247
|
-
|
|
248
|
-
|
|
268
|
+
cached = _model_cache.get(model_id)
|
|
269
|
+
if cached is not None:
|
|
270
|
+
return cached
|
|
271
|
+
|
|
272
|
+
with _model_lock:
|
|
273
|
+
cached = _model_cache.get(model_id) # re-check under lock
|
|
274
|
+
if cached is not None:
|
|
275
|
+
return cached
|
|
276
|
+
try:
|
|
277
|
+
import mlx_lm
|
|
278
|
+
import outlines
|
|
279
|
+
except ImportError as exc:
|
|
280
|
+
raise ImportError(
|
|
281
|
+
f"Required package not installed: {exc}. "
|
|
282
|
+
"Install with: pip install 'mlx-lm>=0.22' 'outlines[mlxlm]>=1.3'"
|
|
283
|
+
) from exc
|
|
284
|
+
|
|
285
|
+
log.info(
|
|
286
|
+
"run_task_linker_mlx: loading %s (first call this process)", model_id
|
|
287
|
+
)
|
|
288
|
+
t0 = time.time()
|
|
289
|
+
mlx_model, tokenizer = mlx_lm.load(
|
|
290
|
+
model_id,
|
|
291
|
+
tokenizer_config={"trust_remote_code": True},
|
|
292
|
+
)
|
|
293
|
+
outlines_model = outlines.from_mlxlm(mlx_model, tokenizer)
|
|
294
|
+
log.info("run_task_linker_mlx: model loaded in %.1fs", time.time() - t0)
|
|
249
295
|
|
|
296
|
+
_model_cache[model_id] = outlines_model
|
|
297
|
+
return outlines_model
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
@contextmanager
|
|
301
|
+
def model_session() -> Iterator[Any]:
|
|
302
|
+
"""Yield the loaded model, marking it in-flight so the idle evictor never
|
|
303
|
+
frees it mid-inference. Wrap every direct ``model(...)`` call in this.
|
|
304
|
+
|
|
305
|
+
Lock is held only briefly (to bump/clear the in-flight counter), never for
|
|
306
|
+
the duration of inference. NOTE: production serialises all MLX calls upstream
|
|
307
|
+
via the Rust llm_gate (1-permit semaphore), so inferences don't actually
|
|
308
|
+
overlap — this lock scope just avoids adding a second, redundant serialisation
|
|
309
|
+
point, NOT a claim that concurrent generation on the shared model is safe.
|
|
310
|
+
"""
|
|
311
|
+
global _in_flight, _last_used
|
|
312
|
+
with _model_lock:
|
|
313
|
+
_in_flight += 1
|
|
250
314
|
try:
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
315
|
+
yield _get_model()
|
|
316
|
+
finally:
|
|
317
|
+
with _model_lock:
|
|
318
|
+
_in_flight -= 1
|
|
319
|
+
_last_used = time.monotonic()
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def maybe_evict_idle(idle_s: float | None = None) -> float | None:
|
|
323
|
+
"""Evict the model if it's resident, nothing is in flight, and it's been
|
|
324
|
+
idle longer than ``idle_s`` (default MLX_IDLE_EVICT_S). Returns the GB freed,
|
|
325
|
+
or None if no eviction happened. Safe to call from a threadpool worker.
|
|
326
|
+
|
|
327
|
+
Uses a non-blocking lock acquire: if an inference/load is mutating state we
|
|
328
|
+
simply skip this tick and try again on the next one.
|
|
329
|
+
"""
|
|
330
|
+
ttl = _IDLE_EVICT_S if idle_s is None else idle_s
|
|
331
|
+
if ttl <= 0:
|
|
332
|
+
return None
|
|
333
|
+
if not _model_lock.acquire(blocking=False):
|
|
334
|
+
return None
|
|
335
|
+
try:
|
|
336
|
+
if _in_flight > 0 or not _model_cache:
|
|
337
|
+
return None
|
|
338
|
+
if (time.monotonic() - _last_used) < ttl:
|
|
339
|
+
return None
|
|
340
|
+
try:
|
|
341
|
+
import mlx.core as mx
|
|
342
|
+
before = mx.get_active_memory()
|
|
343
|
+
except Exception: # noqa: BLE001 — mx should always import here
|
|
344
|
+
mx, before = None, 0
|
|
345
|
+
_model_cache.clear()
|
|
346
|
+
gc.collect()
|
|
347
|
+
freed = 0.0
|
|
348
|
+
if mx is not None:
|
|
349
|
+
mx.clear_cache()
|
|
350
|
+
freed = max(0.0, (before - mx.get_active_memory()) / 1e9)
|
|
351
|
+
log.info(
|
|
352
|
+
"run_task_linker_mlx: evicted idle model (idle ≥ %.0fs), freed ~%.1f GB",
|
|
353
|
+
ttl, freed,
|
|
354
|
+
)
|
|
355
|
+
return freed
|
|
356
|
+
finally:
|
|
357
|
+
_model_lock.release()
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def model_resident() -> bool:
|
|
361
|
+
"""True if the MLX model is currently loaded in memory."""
|
|
362
|
+
return bool(_model_cache)
|
|
269
363
|
|
|
270
|
-
|
|
271
|
-
|
|
364
|
+
|
|
365
|
+
def model_active_memory_gb() -> float | None:
|
|
366
|
+
"""Live Metal active-memory footprint in GB, or None if MLX is unavailable.
|
|
367
|
+
|
|
368
|
+
Process-wide Metal active memory (≈ the model when resident — the model
|
|
369
|
+
dominates, though a transient load allocation can briefly inflate it), and
|
|
370
|
+
the only honest measure: `ps`/Activity Monitor can't see Metal unified
|
|
371
|
+
memory (they undercount by ~6.5 GB).
|
|
372
|
+
"""
|
|
373
|
+
try:
|
|
374
|
+
import mlx.core as mx
|
|
375
|
+
return round(mx.get_active_memory() / 1e9, 2)
|
|
376
|
+
except Exception: # noqa: BLE001 — mx absent on non-MLX machines
|
|
377
|
+
return None
|
|
272
378
|
|
|
273
379
|
|
|
274
380
|
# Apple Foundation Models has a 4096-token combined context window (input + output).
|
|
@@ -483,7 +589,53 @@ def _fetch_recent_sessions(
|
|
|
483
589
|
return result
|
|
484
590
|
|
|
485
591
|
|
|
486
|
-
def _fetch_pm_tasks(con: _sqlite3.Connection) -> list[dict[str, Any]]:
|
|
592
|
+
def _local_day(started_at: str) -> str:
|
|
593
|
+
"""The local calendar day (YYYY-MM-DD) of a session's UTC `started_at`.
|
|
594
|
+
|
|
595
|
+
`daily_plan.plan_date` is the dev's *local* day (the dashboard stamps it from
|
|
596
|
+
the browser's local date), but `app_sessions.started_at` is stored UTC. We
|
|
597
|
+
convert UTC → local here so a session is matched to the plan the dev actually
|
|
598
|
+
declared for that day. Returns "" on an unparseable timestamp (→ no boost).
|
|
599
|
+
"""
|
|
600
|
+
if not started_at:
|
|
601
|
+
return ""
|
|
602
|
+
try:
|
|
603
|
+
# `astimezone()` with no arg converts an aware datetime to the host's
|
|
604
|
+
# local zone — the same zone the dashboard used to compute plan_date.
|
|
605
|
+
return _dt.datetime.fromisoformat(started_at).astimezone().date().isoformat()
|
|
606
|
+
except ValueError:
|
|
607
|
+
return ""
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
def _fetch_plan_focus(con: _sqlite3.Connection, plan_date: str) -> list[str]:
|
|
611
|
+
"""Ordered task_keys the dev CONFIRMED as their focus for `plan_date`.
|
|
612
|
+
|
|
613
|
+
Empty (→ no boost, classification proceeds exactly as before) when the day is
|
|
614
|
+
unconfirmed, explicitly skipped, has no plan rows, or the plan tables don't
|
|
615
|
+
exist yet (pre-migration-041 DB). This is a ranking signal only — never a
|
|
616
|
+
filter — so an empty result can only ever cost the boost, never recall.
|
|
617
|
+
"""
|
|
618
|
+
if not plan_date:
|
|
619
|
+
return []
|
|
620
|
+
try:
|
|
621
|
+
meta = con.execute(
|
|
622
|
+
"SELECT confirmed_at, skipped FROM daily_plan_meta WHERE plan_date = ?",
|
|
623
|
+
(plan_date,),
|
|
624
|
+
).fetchone()
|
|
625
|
+
if meta is None or meta["skipped"] or not meta["confirmed_at"]:
|
|
626
|
+
return []
|
|
627
|
+
rows = con.execute(
|
|
628
|
+
"SELECT task_key FROM daily_plan WHERE plan_date = ? ORDER BY position",
|
|
629
|
+
(plan_date,),
|
|
630
|
+
).fetchall()
|
|
631
|
+
return [r["task_key"] for r in rows]
|
|
632
|
+
except _sqlite3.OperationalError:
|
|
633
|
+
return []
|
|
634
|
+
|
|
635
|
+
|
|
636
|
+
def _fetch_pm_tasks(
|
|
637
|
+
con: _sqlite3.Connection, focus_keys: list[str] | None = None
|
|
638
|
+
) -> list[dict[str, Any]]:
|
|
487
639
|
# Candidate set for classification. Tickets the user explicitly EXCLUDED during
|
|
488
640
|
# onboarding board-cleanup (pm_task_curation.decision = 'excluded') are dropped
|
|
489
641
|
# so a cleaned-up dead ticket can never be a classification target. Everything
|
|
@@ -512,7 +664,20 @@ def _fetch_pm_tasks(con: _sqlite3.Connection) -> list[dict[str, Any]]:
|
|
|
512
664
|
# Pre-migration-038 DB (no pm_task_curation): degrade to the unfiltered
|
|
513
665
|
# candidate set rather than crashing the whole /classify_sessions call.
|
|
514
666
|
rows = con.execute(base_cols).fetchall()
|
|
515
|
-
|
|
667
|
+
tasks = [dict(r) for r in rows]
|
|
668
|
+
|
|
669
|
+
# Today's-focus boost: tag the tickets the dev declared for the day and float
|
|
670
|
+
# them to the top of the candidate list, in their declared order. This is a
|
|
671
|
+
# BOOST, never a filter — every other candidate still follows, so recall is
|
|
672
|
+
# untouched. A focus key that isn't in `tasks` (e.g. excluded by curation)
|
|
673
|
+
# simply has no effect; we never resurrect a filtered-out ticket.
|
|
674
|
+
focus = focus_keys or []
|
|
675
|
+
if focus:
|
|
676
|
+
order = {key: i for i, key in enumerate(focus)}
|
|
677
|
+
for t in tasks:
|
|
678
|
+
t["is_today_focus"] = t["task_key"] in order
|
|
679
|
+
tasks.sort(key=lambda t: (0, order[t["task_key"]]) if t.get("is_today_focus") else (1, 0))
|
|
680
|
+
return tasks
|
|
516
681
|
|
|
517
682
|
|
|
518
683
|
# ---------------------------------------------------------------------------
|
|
@@ -555,10 +720,13 @@ def _classify_one(
|
|
|
555
720
|
session_id, f"session {session_id} not found in DB", 0.0, "mlx_error"
|
|
556
721
|
)
|
|
557
722
|
|
|
558
|
-
|
|
559
|
-
|
|
723
|
+
plan_date = _local_day(session_raw.get("started_at") or "")
|
|
724
|
+
focus_keys = _fetch_plan_focus(con, plan_date)
|
|
725
|
+
pm_tasks = _fetch_pm_tasks(con, focus_keys)
|
|
726
|
+
recent = _fetch_recent_sessions(con, session_id)
|
|
560
727
|
|
|
561
728
|
db_span.set_attribute("pm_tasks_count", len(pm_tasks))
|
|
729
|
+
db_span.set_attribute("today_focus_count", len(focus_keys))
|
|
562
730
|
db_span.set_attribute("recent_sessions_count", len(recent))
|
|
563
731
|
|
|
564
732
|
session_text = session_raw.get("session_text") or ""
|
|
@@ -642,14 +810,14 @@ def _classify_one(
|
|
|
642
810
|
from mlx_lm.sample_utils import make_sampler
|
|
643
811
|
from outlines.inputs import Chat
|
|
644
812
|
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
813
|
+
with model_session() as model:
|
|
814
|
+
raw = model(
|
|
815
|
+
Chat(messages),
|
|
816
|
+
output_type=SessionClassification,
|
|
817
|
+
max_tokens=_MAX_TOKENS,
|
|
818
|
+
sampler=make_sampler(temp=_TEMPERATURE),
|
|
819
|
+
verbose=False,
|
|
820
|
+
)
|
|
653
821
|
except Exception as exc:
|
|
654
822
|
elapsed = time.time() - t0
|
|
655
823
|
outcome = "apple_fm_error" if _use_apple_fm else "mlx_error"
|
|
@@ -785,7 +953,8 @@ def _classify_one_logged(
|
|
|
785
953
|
"""Classify one session and append a full record to the run log."""
|
|
786
954
|
# Gather inputs before classification so we can log them even on error.
|
|
787
955
|
session_raw = _fetch_session(con, session_id)
|
|
788
|
-
|
|
956
|
+
focus_keys = _fetch_plan_focus(con, _local_day(session_raw.get("started_at") or "")) if session_raw else []
|
|
957
|
+
pm_tasks = _fetch_pm_tasks(con, focus_keys) if session_raw else []
|
|
789
958
|
recent = _fetch_recent_sessions(con, session_id) if session_raw else []
|
|
790
959
|
|
|
791
960
|
if session_raw:
|
package/services/agents/server.py
CHANGED
|
@@ -41,20 +41,58 @@ _DB_PATH = Path(os.environ.get("MERIDIAN_DB", Path.home() / ".meridian/meridian.
|
|
|
41
41
|
_app_state: dict[str, Any] = {}
|
|
42
42
|
|
|
43
43
|
|
|
44
|
+
async def _idle_evictor(mlx_module: Any) -> None:
|
|
45
|
+
"""Background loop: evict the MLX model after it has been idle long enough.
|
|
46
|
+
|
|
47
|
+
Runs the (briefly blocking) eviction in a threadpool so it never stalls the
|
|
48
|
+
event loop, and never raises out — the evictor must outlive transient errors.
|
|
49
|
+
"""
|
|
50
|
+
import asyncio
|
|
51
|
+
from fastapi.concurrency import run_in_threadpool
|
|
52
|
+
|
|
53
|
+
ttl = mlx_module._IDLE_EVICT_S
|
|
54
|
+
if ttl <= 0:
|
|
55
|
+
return
|
|
56
|
+
interval = max(15.0, ttl / 4.0) # check ~4× per idle window
|
|
57
|
+
while True:
|
|
58
|
+
await asyncio.sleep(interval)
|
|
59
|
+
try:
|
|
60
|
+
await run_in_threadpool(mlx_module.maybe_evict_idle)
|
|
61
|
+
except Exception as exc: # noqa: BLE001 — evictor must never die
|
|
62
|
+
log.warning("server: idle-evictor error: %s", exc)
|
|
63
|
+
|
|
64
|
+
|
|
44
65
|
@asynccontextmanager
|
|
45
66
|
async def _lifespan(app: FastAPI) -> AsyncIterator[None]:
|
|
67
|
+
import asyncio
|
|
46
68
|
import datetime
|
|
47
69
|
import agents.run_task_linker_mlx as _mlx
|
|
48
70
|
_app_state["mlx_module"] = _mlx
|
|
49
71
|
_app_state["loaded_at"] = datetime.datetime.now(datetime.timezone.utc).isoformat()
|
|
50
72
|
from agents.llm_selector import APPLE_INTELLIGENCE_ID
|
|
73
|
+
evictor: "asyncio.Task | None" = None
|
|
51
74
|
if _mlx._resolve_model_id() == APPLE_INTELLIGENCE_ID:
|
|
52
|
-
log.info("server:
|
|
75
|
+
log.info("server: Apple Intelligence backend — no MLX model to load")
|
|
76
|
+
elif _mlx._IDLE_EVICT_S > 0:
|
|
77
|
+
# Lazy: the ~7 GB model loads on the first inference and is evicted after
|
|
78
|
+
# MLX_IDLE_EVICT_S of inactivity, so the server idles light (~0.4 GB)
|
|
79
|
+
# instead of pinning ~7 GB of Metal memory for the whole process life.
|
|
80
|
+
log.info(
|
|
81
|
+
"server: MLX model loads on first request; idle-evict after %.0fs",
|
|
82
|
+
_mlx._IDLE_EVICT_S,
|
|
83
|
+
)
|
|
84
|
+
evictor = asyncio.create_task(_idle_evictor(_mlx))
|
|
53
85
|
else:
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
86
|
+
# Eviction disabled — don't spawn a no-op evictor task just to cancel it.
|
|
87
|
+
log.info("server: MLX model loads on first request; idle-eviction disabled (MLX_IDLE_EVICT_S=0)")
|
|
88
|
+
try:
|
|
89
|
+
yield
|
|
90
|
+
finally:
|
|
91
|
+
if evictor is not None:
|
|
92
|
+
import contextlib
|
|
93
|
+
evictor.cancel()
|
|
94
|
+
with contextlib.suppress(asyncio.CancelledError):
|
|
95
|
+
await evictor
|
|
58
96
|
|
|
59
97
|
|
|
60
98
|
app = FastAPI(title="Meridian Agent", version="1.0.0", lifespan=_lifespan)
|
|
@@ -76,12 +114,19 @@ async def health() -> dict:
|
|
|
76
114
|
|
|
77
115
|
@app.get("/info")
|
|
78
116
|
async def info() -> dict:
|
|
79
|
-
"""Return the identity of the
|
|
117
|
+
"""Return the identity of the model and its live memory state.
|
|
118
|
+
|
|
119
|
+
`active_memory_gb` reads `mx.get_active_memory()` — the ONLY honest measure
|
|
120
|
+
of the model's footprint, since Metal unified memory is invisible to `ps`
|
|
121
|
+
and Activity Monitor (they undercount the model by ~6.5 GB).
|
|
122
|
+
"""
|
|
80
123
|
m = _app_state.get("mlx_module")
|
|
81
124
|
return {
|
|
82
|
-
"backend":
|
|
83
|
-
"model_id":
|
|
84
|
-
"loaded_at":
|
|
125
|
+
"backend": "mlx",
|
|
126
|
+
"model_id": m._resolve_model_id() if m else None,
|
|
127
|
+
"loaded_at": _app_state.get("loaded_at"),
|
|
128
|
+
"model_resident": m.model_resident() if m else False,
|
|
129
|
+
"active_memory_gb": m.model_active_memory_gb() if m else None,
|
|
85
130
|
}
|
|
86
131
|
|
|
87
132
|
|
|
@@ -143,14 +188,14 @@ async def classify(req: ClassifyRequest) -> ClassifyResponse:
|
|
|
143
188
|
# _classify_apple_fm uses asyncio.new_event_loop() internally;
|
|
144
189
|
# must run in a thread (no existing loop) not in the async handler.
|
|
145
190
|
return m._classify_apple_fm(messages)
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
191
|
+
with m.model_session() as model:
|
|
192
|
+
raw = model(
|
|
193
|
+
Chat(messages),
|
|
194
|
+
output_type=m.SessionClassification,
|
|
195
|
+
max_tokens=m._MAX_TOKENS,
|
|
196
|
+
sampler=make_sampler(temp=m._TEMPERATURE),
|
|
197
|
+
verbose=False,
|
|
198
|
+
)
|
|
154
199
|
return m.SessionClassification.model_validate_json(raw)
|
|
155
200
|
|
|
156
201
|
try:
|
|
@@ -375,13 +420,13 @@ async def openai_chat_completions(req: _OAIChatRequest) -> dict:
|
|
|
375
420
|
def _generate() -> str:
|
|
376
421
|
if m._resolve_model_id() == APPLE_INTELLIGENCE_ID:
|
|
377
422
|
return _infer_apple_fm(msgs, max_tokens)
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
423
|
+
with m.model_session() as model:
|
|
424
|
+
return model(
|
|
425
|
+
Chat(msgs),
|
|
426
|
+
max_tokens=max_tokens,
|
|
427
|
+
sampler=make_sampler(temp=temperature),
|
|
428
|
+
verbose=False,
|
|
429
|
+
)
|
|
385
430
|
|
|
386
431
|
t0 = _time.time()
|
|
387
432
|
try:
|
|
@@ -504,14 +549,14 @@ async def summarise(req: _SummariseRequest) -> _SummariseResponse:
|
|
|
504
549
|
from outlines.inputs import Chat
|
|
505
550
|
|
|
506
551
|
def _generate() -> str:
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
552
|
+
with m.model_session() as model:
|
|
553
|
+
return model(
|
|
554
|
+
Chat(messages),
|
|
555
|
+
output_type=_SummarySchema,
|
|
556
|
+
max_tokens=req.max_tokens,
|
|
557
|
+
sampler=make_sampler(temp=req.temperature),
|
|
558
|
+
verbose=False,
|
|
559
|
+
)
|
|
515
560
|
|
|
516
561
|
try:
|
|
517
562
|
raw = await run_in_threadpool(_generate)
|
package/services/agents/tests/test_run_task_linker_mlx.py
CHANGED
|
@@ -9,6 +9,7 @@ from __future__ import annotations
|
|
|
9
9
|
import json
|
|
10
10
|
import sqlite3
|
|
11
11
|
import sys
|
|
12
|
+
import time
|
|
12
13
|
from io import StringIO
|
|
13
14
|
from pathlib import Path
|
|
14
15
|
from typing import Iterator
|
|
@@ -1011,6 +1012,58 @@ class TestModelCache:
|
|
|
1011
1012
|
m._get_model()
|
|
1012
1013
|
|
|
1013
1014
|
|
|
1015
|
+
# ---------------------------------------------------------------------------
|
|
1016
|
+
# Idle eviction — model_session() in-flight tracking + maybe_evict_idle()
|
|
1017
|
+
# (the model holds ~7 GB while resident; the server unloads it when idle)
|
|
1018
|
+
# ---------------------------------------------------------------------------
|
|
1019
|
+
|
|
1020
|
+
class TestModelEviction:
|
|
1021
|
+
def test_model_session_loads_and_tracks_in_flight(self):
|
|
1022
|
+
import agents.run_task_linker_mlx as m
|
|
1023
|
+
sentinel = MagicMock(name="model")
|
|
1024
|
+
with patch.object(m, "_get_model", return_value=sentinel):
|
|
1025
|
+
m._in_flight = 0
|
|
1026
|
+
with m.model_session() as model:
|
|
1027
|
+
assert model is sentinel
|
|
1028
|
+
assert m._in_flight == 1 # marked in-flight while in use
|
|
1029
|
+
assert m._in_flight == 0 # released on exit
|
|
1030
|
+
|
|
1031
|
+
def test_evict_noop_when_not_idle_long_enough(self):
|
|
1032
|
+
import agents.run_task_linker_mlx as m
|
|
1033
|
+
m._model_cache["x"] = MagicMock()
|
|
1034
|
+
m._in_flight = 0
|
|
1035
|
+
m._last_used = time.monotonic() # just used
|
|
1036
|
+
assert m.maybe_evict_idle(idle_s=600) is None
|
|
1037
|
+
assert m.model_resident() is True
|
|
1038
|
+
|
|
1039
|
+
def test_evict_disabled_when_ttl_zero(self):
|
|
1040
|
+
import agents.run_task_linker_mlx as m
|
|
1041
|
+
m._model_cache["x"] = MagicMock()
|
|
1042
|
+
assert m.maybe_evict_idle(idle_s=0) is None
|
|
1043
|
+
assert m.model_resident() is True
|
|
1044
|
+
|
|
1045
|
+
def test_evict_noop_when_in_flight(self):
|
|
1046
|
+
import agents.run_task_linker_mlx as m
|
|
1047
|
+
m._model_cache["x"] = MagicMock()
|
|
1048
|
+
m._in_flight = 1 # an inference is using the model
|
|
1049
|
+
m._last_used = time.monotonic() - 1000
|
|
1050
|
+
try:
|
|
1051
|
+
assert m.maybe_evict_idle(idle_s=0.001) is None
|
|
1052
|
+
assert m.model_resident() is True # never freed mid-inference
|
|
1053
|
+
finally:
|
|
1054
|
+
m._in_flight = 0
|
|
1055
|
+
|
|
1056
|
+
def test_evict_clears_cache_when_idle(self):
|
|
1057
|
+
import agents.run_task_linker_mlx as m
|
|
1058
|
+
m._model_cache["x"] = MagicMock()
|
|
1059
|
+
m._in_flight = 0
|
|
1060
|
+
m._last_used = time.monotonic() - 1000 # idle long past the window
|
|
1061
|
+
freed = m.maybe_evict_idle(idle_s=0.001)
|
|
1062
|
+
assert freed is not None # eviction happened
|
|
1063
|
+
assert m.model_resident() is False
|
|
1064
|
+
assert m._model_cache == {}
|
|
1065
|
+
|
|
1066
|
+
|
|
1014
1067
|
# ---------------------------------------------------------------------------
|
|
1015
1068
|
# SessionClassification schema
|
|
1016
1069
|
# ---------------------------------------------------------------------------
|
package/services/pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "meridian-agents"
|
|
7
|
-
version = "1.53.1"
|
|
7
|
+
version = "1.54.1"
|
|
8
8
|
description = "Meridian agents — MLX classifier server and Jira worklog synthesis for meridian.db"
|
|
9
9
|
requires-python = ">=3.11"
|
|
10
10
|
authors = [{ name = "Meridiona" }]
|
package/ui.tar.gz
CHANGED
|
Binary file
|