npm - @meridiona/meridian-darwin-arm64 - Versions diffs - 1.58.0 → 1.59.0 - Mend

@meridiona/meridian-darwin-arm64 1.58.0 → 1.59.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (17) hide show

package/VERSION +1 -1
package/bin/meridian +0 -0
package/package.json +1 -1
package/scripts/install-screenpipe-daemon.sh +25 -3
package/services/agents/_prompts.py +76 -31
package/services/agents/_system_context.py +1 -1
package/services/agents/run_task_linker_mlx.py +119 -26
package/services/agents/tests/test_run_task_linker_mlx.py +2 -2
package/services/pyproject.toml +1 -1
package/services/skills/activity/task-classifier/SKILL.md +16 -12
package/services/tests/evals/build_dataset.py +7 -15
package/services/tests/evals/classify_session.py +7 -3
package/services/tests/evals/render_seeds.py +35 -14
package/services/tests/test_continuity_context.py +202 -0
package/services/tests/test_fetch_pm_tasks.py +120 -0
package/services/tests/test_format_candidates.py +45 -0
package/ui.tar.gz +0 -0

package/VERSION CHANGED Viewed

	@@ -1 +1 @@
1	- 1.58.0
1	+ 1.59.0

package/bin/meridian CHANGED Viewed

Binary file

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@meridiona/meridian-darwin-arm64",
-  "version": "1.58.0",
+  "version": "1.59.0",
   "description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
   "homepage": "https://github.com/Meridiona/meridian",
   "repository": {

package/scripts/install-screenpipe-daemon.sh CHANGED Viewed

@@ -127,10 +127,32 @@ while launchctl print "${GUI_TARGET}/${LABEL}" >/dev/null 2>&1; do
 done
 echo "→ bootstrap ${LABEL}"
+# `meridian stop` runs `launchctl disable` to clear the KeepAlive intent, which
+# persists in launchd's per-user override DB. bootstrap REFUSES a disabled label
+# with EIO (errno 5), so the override must be cleared FIRST — otherwise a plain
+# reinstall (install-dev.sh) can't revive a service that was `meridian stop`-ped.
 launchctl enable "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
-launchctl bootstrap "${GUI_TARGET}" "${PLIST_DEST}"
-launchctl enable "${GUI_TARGET}/${LABEL}"
-launchctl kickstart -k "${GUI_TARGET}/${LABEL}"
+# bootstrap is genuinely flaky: it EIOs when the prior domain entry hasn't fully
+# cleared even after the bootout-wait above. Do NOT let one transient failure
+# abort the whole install under `set -e` (that's what left screenpipe down after
+# a stop). Retry, re-enabling each round, and treat "already loaded" as success.
+_bs_try=0
+until launchctl bootstrap "${GUI_TARGET}" "${PLIST_DEST}" 2>/dev/null; do
+    if launchctl print "${GUI_TARGET}/${LABEL}" >/dev/null 2>&1; then
+        break  # already in the domain — bootstrap only "failed" because it's present
+    fi
+    _bs_try=$(( _bs_try + 1 ))
+    if [[ "${_bs_try}" -ge 5 ]]; then
+        echo "⚠ bootstrap ${LABEL} failed after ${_bs_try} attempts — see launchctl print" >&2
+        break
+    fi
+    launchctl enable "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
+    sleep 1
+done
+# Always finish with enable + kickstart, even if bootstrap was a no-op above, so a
+# disabled-but-loaded service ends up enabled AND running.
+launchctl enable "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
+launchctl kickstart -k "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
 echo
 echo "✓ screenpipe installed and started"

package/services/agents/_prompts.py CHANGED Viewed

@@ -23,6 +23,25 @@ _VSCODE_BANNER_RE = re.compile(
 # then responsible for not blowing the model's context window).
 SESSION_TEXT_CAP = int(os.environ.get("SESSION_TEXT_CAP", "10000"))
+# Max chars of each candidate ticket's description included in the prompt.
+# Default 0 = NO cap — the full description is sent. This field was previously
+# hard-capped at 240 chars, which dropped 56-83% of real ticket text (measured:
+# avg 548 chars, max 1440 across the live board), and the discriminating scope a
+# session must be matched against frequently lives past char 240. With the
+# 128K-context classifier and plan-only candidate sets (2-3 tickets), the prompt
+# has ample budget, so descriptions are sent in full by default. Set
+# CANDIDATE_DESC_CAP=<n> to re-impose a ceiling if an unusually long description
+# ever bloats the prompt (e.g. on a full-candidate fallback day).
+CANDIDATE_DESC_CAP = int(os.environ.get("CANDIDATE_DESC_CAP", "0"))
+# Recent-work continuity window (minutes). The prompt summarises the developer's
+# tracked work in this many minutes BEFORE the current session, aggregated per
+# ticket, as a weak continuity prior. Time-windowed (not count-windowed) on
+# purpose: session length is wildly variable, so "last N sessions" can be 90s of
+# micro-glances or 3h of deep work. Shared with run_task_linker_mlx.py, which
+# fetches the window. Override via CONTINUITY_WINDOW_MIN.
+_CONTINUITY_WINDOW_MIN = int(os.environ.get("CONTINUITY_WINDOW_MIN", "30"))
 def _fmt_dur(duration_s: int | float) -> str:
     secs = int(duration_s or 0)
@@ -102,8 +121,8 @@ def _format_candidates(tasks: list[dict]) -> str:
         epic_title  = (task.get("epic_title") or "").strip()
         sprint_name = (task.get("sprint_name") or "").strip()
         tags        = (task.get("tags") or "").strip()
-        if len(desc) > 240:
-            desc = desc[:240] + "…"
+        if CANDIDATE_DESC_CAP > 0 and len(desc) > CANDIDATE_DESC_CAP:
+            desc = desc[:CANDIDATE_DESC_CAP] + "…"
         meta_parts = [p for p in [issue_type, f"Epic: {epic_title}" if epic_title else "", sprint_name, f"tags: {tags}" if tags else ""] if p]
         meta = "  [" + " · ".join(meta_parts) + "]" if meta_parts else ""
         # The dev declared this ticket as today's focus on the plan page. It's a
@@ -117,44 +136,70 @@ def _format_candidates(tasks: list[dict]) -> str:
     return "\n\n".join(rows) if rows else "(no candidates)"
-def _format_recent_sessions(sessions: list[dict]) -> str:
-    if not sessions:
-        return "  (no recent session context)"
-    rows = []
-    for s in sessions:
-        time_str = _fmt_time(s.get("started_at") or "")
-        app = (s.get("app_name") or "?")[:14]
-        dur_str = _fmt_dur(s.get("duration_s") or 0)
-        task_key = s.get("task_key")
-        routing = s.get("task_routing")  # None means unclassified
-        if task_key:
-            target = f"→ {task_key}"
-        elif routing == "untracked":
-            target = "→ [untracked]"
-        elif routing is None:
-            # session captured but not yet classified
-            target = "→ [pending]"
+def _fmt_continuity_mins(seconds: float) -> str:
+    """Coarse minutes label for the continuity block: '<1 min' or '~N min'."""
+    secs = int(seconds or 0)
+    if secs < 60:
+        return "<1 min"
+    return f"~{round(secs / 60)} min"
+def _format_continuity(activity: list[dict], now_iso: str | None = None) -> str:
+    """Render the recent-ticket continuity prior — one bullet per ticket worked in
+    the window, ordered most-recent-first: total time spent, how many sessions it
+    spanned, and how long before the current session it was last active.
+    `activity` entries come from `_fetch_recent_ticket_activity` (already
+    aggregated, candidate-gated, confidence-filtered, recency-sorted). Empty input
+    → an explicit "no tracked work" line (not ""), so the block is ALWAYS present:
+    that tells the model definitively "there is no recent continuity — rely on this
+    session's own evidence" (silence is ambiguous — it can't tell "no work" from
+    "not provided") and keeps the trace node legible instead of blank. We
+    deliberately do NOT emit a raw per-session log: those rows leak internal state
+    (sub-threshold micro-sessions, not-yet-classified neighbours, two interleaved
+    classify pipelines) that the model misreads as signal. This is a derived,
+    calibrated statement of recent tracked work.
+    """
+    if not activity:
+        return "  (no tracked work in this window)"
+    lines = []
+    for a in activity:
+        total = _fmt_continuity_mins(a.get("total_s", 0))
+        n = int(a.get("sessions", 0) or 0)
+        sess = "1 session" if n == 1 else f"{n} sessions"
+        ago_s = a.get("ago_s")
+        if ago_s is None:
+            recency = ""
+        elif ago_s < 60:
+            recency = ", last active just before this session"
         else:
-            target = "→ [overhead]"
-        # Category is intentionally omitted — recent-context is a task-continuity
-        # signal only; carrying the (rule-based or prior-LLM) category tag would
-        # feed a category prior back into classification.
-        rows.append(f"  {time_str}  {app:<14}  {dur_str:<7}  {target}")
-    return "\n".join(rows)
+            recency = f", last active ~{round(ago_s / 60)} min before this session"
+        lines.append(f"  • {a['task_key']} — {total} over {sess}{recency}")
+    return "\n".join(lines)
 def build_user_message(
     session: dict,
     candidates: list[dict],
-    recent_sessions: list[dict] | None = None,
+    recent_activity: list[dict] | None = None,
+    now_iso: str | None = None,
 ) -> str:
-    sessions = recent_sessions or []
-    has_any_task_key = any(s.get("task_key") for s in sessions)
+    continuity = _format_continuity(recent_activity or [], now_iso)
+    # ALWAYS emitted (even when empty, where `continuity` is an explicit
+    # "no tracked work" line) so the model gets a definitive signal rather than
+    # ambiguous silence, and the trace node is never blank. Framed as a WEAK prior,
+    # never an instruction: an assertive "user was working on KAN-X" anchors the
+    # model into force-linking — the exact false-positive failure mode the SKILL
+    # warns against. The block states facts (ticket, time, recency); the SKILL's
+    # "classify by THIS session's evidence" rule governs.
     recent_block = (
-        "RECENT WORK CONTEXT:\n"
-        f"{_format_recent_sessions(sessions)}\n"
+        f"RECENT WORK CONTEXT — the developer's tracked work in the last "
+        f"{_CONTINUITY_WINDOW_MIN} minutes before this session. This is a WEAK "
+        "continuity hint, NOT proof: continue the most-recent ticket ONLY if this "
+        "session's own evidence also fits it; never link on continuity alone.\n"
+        f"{continuity}\n"
         "\n"
-    ) if has_any_task_key else ""
+    )
     # When the dev declared a focus for the day, name it in the header so the model
     # treats ★ rows as a prior — preferred when the evidence plausibly fits, but
     # never forced. Recall is preserved: every candidate is still listed.

package/services/agents/_system_context.py CHANGED Viewed

@@ -29,7 +29,7 @@ _DB_SHELL = shlex.quote(str(_DB_PATH))
 SYSTEM_CONTEXT = f"""You are **Meridian Intelligence** — the AI reasoning layer inside Meridian, a developer productivity platform.
-Meridian monitors a developer's screen and builds a structured record of their work. Your role is to reason over that record and take actions.
+Meridian monitors a developer's screen and builds a structured record of their work as a stream of work *sessions*. Your PRIMARY role is to reason over each session and **classify it** — determining which tracked ticket (the "task") the work belongs to, or whether it is overhead or untracked work — so Meridian can keep every ticket's progress and worklog accurate. Classifying a session correctly to its task, and reasoning carefully over the evidence to do so, is the core job.
 CURRENT CAPABILITY — session classification
   Given a work session (app, duration, screen content, recent history, open tickets), decide:

package/services/agents/run_task_linker_mlx.py CHANGED Viewed

@@ -13,7 +13,7 @@ OTel span hierarchy (when invoked as a script via main()):
             db_fetch
             classifier_input     ← the COMPLETE model input (system + user)
                 system_prompt        — classifier skill + context
-                recent_context       — past-5 sessions
+                recent_context       — 30-min per-ticket continuity prior
                 session_block        — the input session being classified
                 candidate_tickets    — ranked candidate tickets (★ = today)
             llm_inference
@@ -54,15 +54,22 @@ from agents import observability
 from agents._prompts import (
     build_user_message,
     _format_candidates,
-    _format_recent_sessions,
+    _format_continuity,
     _format_session,
+    _CONTINUITY_WINDOW_MIN,
 )
 from agents._system_context import SYSTEM_CONTEXT
 log = logging.getLogger("agents.run_task_linker_mlx")
 tracer = observability.setup("meridian-task-linker-mlx")
-_CONTEXT_WINDOW = 5
+# Recent-work continuity: only count a prior session toward the continuity block
+# if its task link is confident enough to trust (a shaky 0.5 generic match
+# shouldn't compound into a continuity nudge). 0.7 sits just above the SKILL's
+# "generic project-level match" band (0.50-0.65), so this keeps real alignments
+# and drops weak guesses. The window length lives in _prompts._CONTINUITY_WINDOW_MIN
+# (shared with the prompt label). Override via CONTINUITY_MIN_CONFIDENCE.
+_CONTINUITY_MIN_CONFIDENCE = float(os.environ.get("CONTINUITY_MIN_CONFIDENCE", "0.7"))
 _MAX_TOKENS = 1024
 _TEMPERATURE = 0.0  # greedy decoding — deterministic classification
@@ -602,23 +609,88 @@ def _fetch_session(
     return dict(row) if row else None
-def _fetch_recent_sessions(
-    con: _sqlite3.Connection, before_id: int
+def _fetch_recent_ticket_activity(
+    con: _sqlite3.Connection,
+    current_started_at: str,
+    candidate_keys: list[str],
 ) -> list[dict[str, Any]]:
-    # Recent context is a task-continuity signal only: app, time, duration and
-    # which ticket each recent session mapped to. We deliberately do NOT select
-    # session_text/excerpt or category — recent OCR is noise here and a category
-    # tag would feed a prior back into classification. (session_text is still
-    # referenced in WHERE only to skip empty-capture rows.)
+    """The developer's tracked-ticket work in the _CONTINUITY_WINDOW_MIN minutes
+    before the current session, aggregated per ticket → a calibrated continuity
+    prior (NOT a raw session log).
+    Returns one entry per ticket worked in the window:
+        {"task_key", "total_s", "sessions", "last_ended_at", "ago_s"}
+    ordered by recency (most-recently-active ticket first). Empty when there is no
+    qualifying recent work — the prompt builder then renders an explicit "no
+    tracked work" line rather than asserting a continuity that doesn't exist.
+    A session counts only if it is (a) already CLASSIFIED to a ticket
+    (task_session_type='task' — "last classified", never pending/in-flight),
+    (b) confident enough to trust as a prior (task_confidence >=
+    _CONTINUITY_MIN_CONFIDENCE), and (c) mapped to a ticket in the CURRENT
+    candidate set — a prior on a ticket the model can't even pick is pure noise.
+    Windowing is done in Python (fromisoformat) so it's robust to the stored
+    timestamp's timezone/precision; the SQL only does the cheap "strictly before
+    current" + confidence prefilter (consistent ISO format → lexicographic '<' is
+    chronological).
+    """
+    candidates = set(candidate_keys)
+    if not current_started_at or not candidates:
+        return []
+    try:
+        anchor = _dt.datetime.fromisoformat(current_started_at)
+    except (ValueError, TypeError):
+        return []
+    window_start = anchor - _dt.timedelta(minutes=_CONTINUITY_WINDOW_MIN)
     rows = con.execute(
-        "SELECT app_name, started_at, duration_s, task_key, task_routing"
+        "SELECT task_key, started_at, ended_at, duration_s, task_confidence"
         " FROM app_sessions"
-        " WHERE id < ? AND duration_s > 1 AND COALESCE(session_text,'') != ''"
-        " ORDER BY id DESC LIMIT ?",
-        (before_id, _CONTEXT_WINDOW),
+        " WHERE started_at < ?"
+        "   AND task_key IS NOT NULL"
+        "   AND task_session_type = 'task'"
+        "   AND task_confidence >= ?"
+        " ORDER BY started_at DESC LIMIT 200",
+        (current_started_at, _CONTINUITY_MIN_CONFIDENCE),
     ).fetchall()
-    result = [dict(r) for r in rows]
-    result.reverse()
+    agg: dict[str, dict[str, Any]] = {}
+    for r in rows:
+        d = dict(r)
+        tk = d.get("task_key")
+        if tk not in candidates:
+            continue
+        try:
+            s_at = _dt.datetime.fromisoformat(d["started_at"])
+        except (ValueError, TypeError):
+            continue
+        if s_at < window_start:
+            continue  # outside the continuity window
+        try:
+            e_at = _dt.datetime.fromisoformat(d.get("ended_at") or d["started_at"])
+        except (ValueError, TypeError):
+            e_at = s_at
+        entry = agg.get(tk)
+        if entry is None:
+            entry = {"task_key": tk, "total_s": 0.0, "sessions": 0, "last_ended": e_at}
+            agg[tk] = entry
+        entry["total_s"] += float(d.get("duration_s") or 0.0)
+        entry["sessions"] += 1
+        if e_at > entry["last_ended"]:
+            entry["last_ended"] = e_at
+    result: list[dict[str, Any]] = []
+    for entry in agg.values():
+        ago_s = max(0.0, (anchor - entry["last_ended"]).total_seconds())
+        result.append(
+            {
+                "task_key":      entry["task_key"],
+                "total_s":       entry["total_s"],
+                "sessions":      entry["sessions"],
+                "last_ended_at": entry["last_ended"].isoformat(),
+                "ago_s":         ago_s,
+            }
+        )
+    result.sort(key=lambda e: e["ago_s"])  # most-recently-active ticket first
     return result
@@ -813,7 +885,7 @@ def _classify_one(
         session_raw = _fetch_session(con, session_id)
         if session_raw is None:
             db_span.set_attribute("pm_tasks_count", 0)
-            db_span.set_attribute("recent_sessions_count", 0)
+            db_span.set_attribute("recent_continuity_tickets", 0)
             db_span.add_event("session_not_found", {"session_id": session_id})
             db_span.set_status(StatusCode.ERROR, f"session {session_id} not found in DB")
             return _error_result(
@@ -823,7 +895,12 @@ def _classify_one(
         plan_date  = _local_day(session_raw.get("started_at") or "")
         focus_keys = _fetch_plan_focus(con, plan_date)
         pm_tasks   = _fetch_pm_tasks(con, focus_keys)
-        recent     = _fetch_recent_sessions(con, session_id)
+        # Continuity prior needs the candidate set up front (it only names tickets
+        # the model can actually pick), so compute candidate_keys before fetching.
+        candidate_keys = [t["task_key"] for t in pm_tasks]
+        recent     = _fetch_recent_ticket_activity(
+            con, session_raw.get("started_at") or "", candidate_keys
+        )
         session_text = session_raw.get("session_text") or ""
         # Coding-agent rows (Claude Code / Codex) carry the full transcript in
@@ -839,7 +916,7 @@ def _classify_one(
         # it answers "was the right ticket even offered, and where was it ranked?"
         # without anyone having to read the prompt. Ranked order is preserved
         # (today's-focus keys float to the front in _fetch_pm_tasks).
-        candidate_keys = [t["task_key"] for t in pm_tasks]
+        # candidate_keys computed above (the continuity fetch needs it).
         recent_task_keys = [r.get("task_key") for r in recent if r.get("task_key")]
         # Session identity + the app_sessions row metadata, so a trace is
         # self-contained — you know WHICH session and its key fields (app, window
@@ -881,7 +958,13 @@ def _classify_one(
             db_span.set_attribute("summary_source", str(session_raw.get("summary_source") or ""))
         db_span.set_attribute("pm_tasks_count", len(pm_tasks))
         db_span.set_attribute("today_focus_count", len(focus_keys))
-        db_span.set_attribute("recent_sessions_count", len(recent))
+        # Continuity prior: how many tickets the dev worked in the prior window,
+        # and across how many classified sessions (0/0 when there's no qualifying
+        # recent work → the prompt block shows an explicit "no tracked work" line).
+        db_span.set_attribute("recent_continuity_tickets", len(recent))
+        db_span.set_attribute(
+            "recent_continuity_sessions", sum(int(r.get("sessions", 0) or 0) for r in recent)
+        )
         db_span.set_attribute("candidate_task_keys", ", ".join(candidate_keys) if candidate_keys else "-")
         db_span.set_attribute("today_focus_keys", ", ".join(focus_keys) if focus_keys else "-")
         # Which candidate-set policy actually applied for this session, so a trace
@@ -920,8 +1003,8 @@ def _classify_one(
     # The single drill-down span for "exactly what the classifier was asked".
     # It carries the COMPLETE input, byte-for-byte as handed to the model:
     #   • system_prompt — full system context + the task-classifier SKILL
-    #   • llm_input     — full user message: the input session block, the recent
-    #                     past-5 sessions, and the ranked candidate tickets
+    #   • llm_input     — full user message: the input session block, the 30-min
+    #                     per-ticket continuity prior, and the ranked candidate tickets
     # Both are captured POST-assembly, so any cap already applied while building
     # the prompt (e.g. SESSION_TEXT_CAP truncating the OCR excerpt) is reflected
     # here EXACTLY as the model saw it — never the pre-cap original. Concatenating
@@ -931,7 +1014,9 @@ def _classify_one(
     # capped at ~8k chars — so on that path the on-span text is the assembled
     # input, not the rewritten one.)
     with tracer.start_as_current_span("classifier_input") as bp_span:
-        user_message = build_user_message(session, pm_tasks, recent_sessions=recent)
+        user_message = build_user_message(
+            session, pm_tasks, recent_activity=recent, now_iso=session.get("started_at")
+        )
         messages = [
             {"role": "system", "content": _SYSTEM_PROMPT},
             {"role": "user",   "content": user_message},
@@ -975,7 +1060,8 @@ def _classify_one(
                     part.set_attribute(_k, _v)
         _input_part("system_prompt", _SYSTEM_PROMPT)       # classifier skill + context
-        _input_part("recent_context", _format_recent_sessions(recent))   # past-5 sessions
+        _input_part("recent_context",
+                    _format_continuity(recent, session.get("started_at")))   # 30-min continuity prior
         _input_part("session_block", _format_session(session))           # the input session
         _input_part("candidate_tickets", _format_candidates(pm_tasks),   # ranked candidates
                     ticket_count=len(pm_tasks))
@@ -1310,7 +1396,13 @@ def _classify_one_logged_inner(
     session_raw = _fetch_session(con, session_id)
     focus_keys = _fetch_plan_focus(con, _local_day(session_raw.get("started_at") or "")) if session_raw else []
     pm_tasks = _fetch_pm_tasks(con, focus_keys) if session_raw else []
-    recent = _fetch_recent_sessions(con, session_id) if session_raw else []
+    recent = (
+        _fetch_recent_ticket_activity(
+            con, session_raw.get("started_at") or "", [t["task_key"] for t in pm_tasks]
+        )
+        if session_raw
+        else []
+    )
     if session_raw:
         user_message = build_user_message(
@@ -1328,7 +1420,8 @@ def _classify_one_logged_inner(
                 "audio_snippets":      [],
             },
             pm_tasks,
-            recent_sessions=recent,
+            recent_activity=recent,
+            now_iso=session_raw.get("started_at", ""),
         )
     else:
         user_message = ""

package/services/agents/tests/test_run_task_linker_mlx.py CHANGED Viewed

@@ -348,7 +348,7 @@ class TestObservabilityClassifyOne:
     def test_db_fetch_recent_sessions_count(self, db: Path, span_exporter):
         _, spans = self._run(db, span_exporter)
         s = _span_by_name(spans, "db_fetch")
-        assert s.attributes["recent_sessions_count"] == 0  # no prior sessions
+        assert s.attributes["recent_continuity_tickets"] == 0  # no prior tracked work
     def test_db_fetch_session_loaded_event_fields(self, db: Path, span_exporter):
         _, spans = self._run(db, span_exporter)
@@ -421,7 +421,7 @@ class TestObservabilityClassifyOne:
         s = _span_by_name(spans, "build_prompt")
         assert s is not None
         assert s.attributes["pm_tasks_count"] == 1
-        assert s.attributes["recent_sessions_count"] == 0
+        assert s.attributes["recent_continuity_tickets"] == 0
         assert s.attributes["prompt_chars"] > 0
     def test_build_prompt_assembled_event(self, db: Path, span_exporter):

package/services/pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "meridian-agents"
-version = "1.58.0"
+version = "1.59.0"
 description = "Meridian agents — MLX classifier server and Jira worklog synthesis for meridian.db"
 requires-python = ">=3.11"
 authors = [{ name = "Meridiona" }]

package/services/skills/activity/task-classifier/SKILL.md CHANGED Viewed

@@ -55,7 +55,7 @@ The user message contains:
 - **SESSION** — app, duration, top window titles, and the screen content (OCR / a11y). Decide the category yourself from this evidence; no category is provided.
 - **CANDIDATE TICKETS** — all open tracked tickets (Jira, Linear, GitHub, Trello, Azure DevOps). These are the only tickets you may choose from.
-- **RECENT SESSIONS** (previous 5) — app / time / duration / which ticket each mapped to (no screen text). A **weak disambiguation hint only**: it can support a match when the current session ALSO has matching evidence, but it must never override what the current session itself shows. Recent activity on a ticket does not make the current session that ticket.
+- **RECENT WORK CONTEXT** — a summary of the developer's tracked work in the **last 30 minutes** before this session, aggregated **per ticket**: each line is a ticket they worked, with the total time spent, how many sessions it spanned, and how long before this session it was last active (most-recently-active ticket first). It lists only tickets that are in the candidate set above. This is a **weak disambiguation hint only**: it can support a match when the current session ALSO has matching evidence, but it must never override what the current session itself shows. Recent activity on a ticket does not make the current session that ticket — and a ticket worked 25 minutes ago is a much weaker hint than one active a minute ago. The block is always present: when it reads `(no tracked work in this window)`, there was no confident recent tracked work to report, so rely on this session's own evidence alone.
 ## Available capabilities
@@ -74,9 +74,9 @@ Use database queries sparingly — session data and candidate tickets are alread
 Pick **exactly one** of the candidate `task_key` values, OR return `null` if **none** fit the session.
-Use **context from previous sessions** to make smarter decisions:
-- If the current session is **generic** (e.g., Slack) but follows/precedes work on a specific ticket, consider linking it to that task.
-- If sessions alternate (coding → Slack → coding), treat them as potentially the **same task** if separated by only a few minutes.
+Use the **recent work context** to make smarter decisions:
+- If the current session is **generic** (e.g., Slack) but the recent context shows sustained, very-recent work on a specific ticket, consider linking it to that task — *only if* this session's own evidence is at least consistent with it.
+- The recent context is **recency- and time-weighted**: prefer the ticket that was active most recently and for the most time. A ticket last active a minute ago is a strong tie-breaker; one last active ~25 minutes ago is weak. When two or more tickets appear, the dev was context-switching — continuity is ambiguous, so lean harder on the current session's own evidence.
 - Overhead (system settings, music, etc) should always be `null` regardless of context.
 ## Output format
@@ -192,18 +192,22 @@ Capture every category the session shows evidence of:
 **Bad — speculative + marketing:**
 > Successfully refactored the workflow to be more efficient. The new linear design will be much faster. Next steps include adding the worklog poster and testing end-to-end on Jira.
-## Using Context from Previous Sessions
+## Using the Recent Work Context
-You have access to **the previous 5 sessions** to disambiguate the current session:
+The **RECENT WORK CONTEXT** block summarises the developer's tracked work over the prior 30 minutes, per ticket, with time spent and how recently each was active. Use it to disambiguate the current session — never to override it.
-**Example: Coding → Communication about same work → Coding**
-- Session 1 (5 min ago): VS Code, editing KAN-42 implementation → task_key: KAN-42, confidence: 0.90
-- Session 2 (3 min ago): Slack, discussing PR review for KAN-42 → **if related to same work**, task_key: KAN-42, confidence: 0.75 (work mention + prior context)
-- Session 3 (now): VS Code, editing same file → task_key: KAN-42, confidence: 0.85 (context continuity)
+**Example: the recent context shows**
+```
+RECENT WORK CONTEXT — the developer's tracked work in the last 30 minutes before this session...
+  • KAN-42 — ~22 min over 5 sessions, last active just before this session
+```
+- Current session is **VS Code editing the same file** referenced by KAN-42 → strong: the recent context *and* the current evidence agree → task_key: KAN-42, confidence ~0.85.
+- Current session is **Slack** with the channel/thread discussing the KAN-42 PR → the current content itself is about KAN-42, and the recent context supports it → task_key: KAN-42, confidence ~0.75.
+- Current session is **Slack** showing a generic standup or an unrelated thread → the current evidence is NOT about KAN-42 → return `null` / `untracked` (or a different ticket if its own evidence matches). **Do not inherit KAN-42 just because it was the recent task.**
-**Decision:** Only link Session 2 to KAN-42 if Session 2's **own content** shows it is about that work (the OCR/window discusses or searches the KAN-42 work). If Session 2 is generic, OR shows the user has moved to *different* work (another project, another team's doc, an unrelated meeting), return `null` with `session_type: "untracked"` (or a different ticket if its own evidence matches one) — **do not inherit KAN-42 just because it was the recent task.** Continuity is a tie-breaker between plausible matches, never a substitute for current-session evidence.
+**Decision rule:** continuity is a tie-breaker between plausible matches, never a substitute for current-session evidence. Weight the recent context by recency and time — a ticket "last active just before this session" with 22 minutes behind it is a strong tie-breaker; one "last active ~25 min before this session" is weak. When the block lists **more than one ticket**, the developer was switching context, so continuity is ambiguous: rely on the current session's own evidence.
-Example reasoning for Session 2 (if task-related): `"Slack discusses PR review for KAN-42 implementation mentioned in prior VS Code session; linked via work context."`
+Example reasoning (if task-related): `"Slack thread discusses the KAN-42 PR; recent context shows 22 min on KAN-42 ending just before this session — linked via work context."`
 ## Scoring heuristics

package/services/tests/evals/build_dataset.py CHANGED Viewed

@@ -34,6 +34,7 @@ if str(_SERVICES_DIR) not in sys.path:
     sys.path.insert(0, str(_SERVICES_DIR))
 from agents._prompts import build_user_message
+from agents.run_task_linker_mlx import _fetch_recent_ticket_activity
 MERIDIAN_DB = Path(os.environ.get("MERIDIAN_DB", Path.home() / ".meridian/meridian.db"))
 SESSION_IDS: list[int] = [
@@ -103,19 +104,6 @@ def _fetch_pm_tasks(con: sqlite3.Connection) -> list[dict]:
     return [dict(r) for r in rows]
-def _fetch_recent(con: sqlite3.Connection, before_id: int) -> list[dict]:
-    rows = con.execute(
-        "SELECT app_name, started_at, duration_s, task_key, task_routing, category"
-        " FROM app_sessions"
-        " WHERE id < ? AND duration_s > 1 AND COALESCE(session_text,'') != ''"
-        " ORDER BY id DESC LIMIT 5",
-        (before_id,),
-    ).fetchall()
-    result = [dict(r) for r in rows]
-    result.reverse()
-    return result
 def main() -> None:
     if not MERIDIAN_DB.exists():
         print(f"ERROR: meridian.db not found at {MERIDIAN_DB}", file=sys.stderr)
@@ -152,8 +140,12 @@ def main() -> None:
             "confidence":          s["confidence"] or 0.0,
             "audio_snippets":      [],
         }
-        recent = _fetch_recent(con, s["id"])
-        prompt_input = build_user_message(session, pm_tasks, recent_sessions=recent)
+        recent = _fetch_recent_ticket_activity(
+            con, session["started_at"], [t["task_key"] for t in pm_tasks]
+        )
+        prompt_input = build_user_message(
+            session, pm_tasks, recent_activity=recent, now_iso=session["started_at"]
+        )
         expected = {
             "task_key":     _normalise_task_key(s.get("task_key")),

package/services/tests/evals/classify_session.py CHANGED Viewed

@@ -39,7 +39,7 @@ def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
     from agents._prompts import build_user_message
     from agents.run_task_linker_mlx import (
         _fetch_pm_tasks,
-        _fetch_recent_sessions,
+        _fetch_recent_ticket_activity,
         _fetch_session,
     )
@@ -48,8 +48,10 @@ def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
     raw = _fetch_session(con, session_id)
     if raw is None:
         return None
-    recent = _fetch_recent_sessions(con, session_id)
     pm_tasks = _fetch_pm_tasks(con)
+    recent = _fetch_recent_ticket_activity(
+        con, raw.get("started_at") or "", [t["task_key"] for t in pm_tasks]
+    )
     session_text = raw.get("session_text") or ""
     if raw.get("coding_agent_session_uuid") and (raw.get("session_summary") or "").strip():
         session_text = raw["session_summary"]
@@ -66,7 +68,9 @@ def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
         "confidence": raw.get("confidence", 0.0),
         "audio_snippets": [],
     }
-    return build_user_message(session, pm_tasks, recent_sessions=recent)
+    return build_user_message(
+        session, pm_tasks, recent_activity=recent, now_iso=raw.get("started_at")
+    )
 def _classify(url: str, db_path: str, session_ids: list[int]) -> list[dict]:

package/services/tests/evals/render_seeds.py CHANGED Viewed

@@ -24,7 +24,10 @@ _SERVICES_DIR = Path(__file__).parent.parent.parent
 if str(_SERVICES_DIR) not in sys.path:
     sys.path.insert(0, str(_SERVICES_DIR))
+import sqlite3  # noqa: E402
 from agents._prompts import build_user_message  # noqa: E402
+from agents.run_task_linker_mlx import _fetch_recent_ticket_activity  # noqa: E402
 EVAL_DIR = Path(__file__).parent
 SEED_DIR = EVAL_DIR / "data" / "seeds"
@@ -34,22 +37,36 @@ PERSONA_FILES = {
 }
-def _project_recent(prior: list[dict]) -> list[dict]:
-    """Project a list of prior seed sessions into the shape build_user_message wants."""
-    out = []
+def _project_recent(
+    prior: list[dict], current_started_at: str, candidate_keys: list[str]
+) -> list[dict]:
+    """Build the per-ticket continuity prior for a seed session, reusing the EXACT
+    production aggregation (`_fetch_recent_ticket_activity`) so rendered goldens
+    match the live prompt. We load the prior scoreable seeds into a throwaway
+    in-memory DB and run the real query against it (windowing, confidence floor,
+    candidate-gating, recency ordering all happen there — one source of truth)."""
+    con = sqlite3.connect(":memory:")
+    con.row_factory = sqlite3.Row
+    con.execute(
+        "CREATE TABLE app_sessions ("
+        " id INTEGER PRIMARY KEY AUTOINCREMENT,"
+        " task_key TEXT, started_at TEXT, ended_at TEXT, duration_s REAL,"
+        " task_confidence REAL, task_session_type TEXT)"
+    )
     for s in prior:
         gt = s.get("ground_truth", {})
         tk = gt.get("task_key")
         task_key = tk if tk and tk != "none" else None
-        out.append({
-            "app_name":     s["app_name"],
-            "started_at":   s["started_at"],
-            "duration_s":   s["duration_s"],
-            "task_key":     task_key,
-            "task_routing": "auto" if task_key else None,
-            "category":     s.get("category", ""),
-        })
-    return out
+        if not task_key:
+            continue  # untracked/overhead priors carry no continuity signal
+        con.execute(
+            "INSERT INTO app_sessions"
+            " (task_key, started_at, ended_at, duration_s, task_confidence, task_session_type)"
+            " VALUES (?, ?, ?, ?, 1.0, 'task')",
+            (task_key, s["started_at"], s.get("ended_at") or s["started_at"], s["duration_s"]),
+        )
+    con.commit()
+    return _fetch_recent_ticket_activity(con, current_started_at, candidate_keys)
 def render(persona: str) -> list[dict]:
@@ -79,8 +96,12 @@ def render(persona: str) -> list[dict]:
         if not gt.get("scoreable"):
             continue
-        recent = _project_recent(scoreable_prior[-5:])
-        prompt = build_user_message(s, candidates, recent_sessions=recent)
+        recent = _project_recent(
+            scoreable_prior, s["started_at"], [c["task_key"] for c in candidates]
+        )
+        prompt = build_user_message(
+            s, candidates, recent_activity=recent, now_iso=s["started_at"]
+        )
         expected = {
             "task_key":     gt.get("task_key", "none"),

package/services/tests/test_continuity_context.py ADDED Viewed

@@ -0,0 +1,202 @@
+"""Unit tests for the 30-min per-ticket continuity context.
+Covers the two halves of the rewritten recent-work block:
+  * _fetch_recent_ticket_activity — windowing, confidence floor, candidate-gating,
+    per-ticket aggregation, recency ordering.
+  * _format_continuity            — rendering (none / single / multiple / recency).
+Run: services/.venv/bin/pytest services/tests/test_continuity_context.py -v
+(Also runnable without pytest: services/.venv/bin/python services/tests/test_continuity_context.py)
+"""
+from __future__ import annotations
+import datetime as _dt
+import sqlite3
+import sys
+from pathlib import Path
+# Make `from agents import ...` resolve (mirror tests/evals/eval_classifier.py).
+_SERVICES_DIR = Path(__file__).resolve().parent.parent
+if str(_SERVICES_DIR) not in sys.path:
+    sys.path.insert(0, str(_SERVICES_DIR))
+from agents import run_task_linker_mlx as rtl  # noqa: E402
+from agents import _prompts  # noqa: E402
+ANCHOR = "2026-06-17T10:00:00+00:00"
+def _at(minutes_before: int) -> str:
+    base = _dt.datetime.fromisoformat(ANCHOR)
+    return (base - _dt.timedelta(minutes=minutes_before)).isoformat()
+def _make_con(rows: list[dict]) -> sqlite3.Connection:
+    """In-memory meridian DB with the columns _fetch_recent_ticket_activity reads."""
+    con = sqlite3.connect(":memory:")
+    con.row_factory = sqlite3.Row
+    con.execute(
+        "CREATE TABLE app_sessions ("
+        " id INTEGER PRIMARY KEY AUTOINCREMENT,"
+        " task_key TEXT, started_at TEXT, ended_at TEXT, duration_s REAL,"
+        " task_confidence REAL, task_session_type TEXT)"
+    )
+    for r in rows:
+        con.execute(
+            "INSERT INTO app_sessions"
+            " (task_key, started_at, ended_at, duration_s, task_confidence, task_session_type)"
+            " VALUES (?, ?, ?, ?, ?, ?)",
+            (
+                r.get("task_key"),
+                r["started_at"],
+                r.get("ended_at"),
+                r.get("duration_s", 0.0),
+                r.get("task_confidence", 0.9),
+                r.get("task_session_type", "task"),
+            ),
+        )
+    con.commit()
+    return con
+def _keys(activity):
+    return [a["task_key"] for a in activity]
+# ── _fetch_recent_ticket_activity ────────────────────────────────────────────
+def test_aggregates_per_ticket_and_orders_by_recency():
+    con = _make_con([
+        # KAN-1: two sessions, most-recent ends 2 min before anchor
+        {"task_key": "KAN-1", "started_at": _at(10), "ended_at": _at(9),  "duration_s": 300},
+        {"task_key": "KAN-1", "started_at": _at(3),  "ended_at": _at(2),  "duration_s": 120},
+        # KAN-2: one session, ends 20 min before anchor
+        {"task_key": "KAN-2", "started_at": _at(21), "ended_at": _at(20), "duration_s": 600},
+    ])
+    out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1", "KAN-2"])
+    assert _keys(out) == ["KAN-1", "KAN-2"]          # most-recently-active first
+    k1 = out[0]
+    assert k1["sessions"] == 2
+    assert k1["total_s"] == 420.0                    # 300 + 120 summed
+    assert abs(k1["ago_s"] - 120) < 1                # last active ~2 min ago
+    assert abs(out[1]["ago_s"] - 1200) < 1           # KAN-2 ~20 min ago
+def test_excludes_sessions_outside_the_window():
+    con = _make_con([
+        {"task_key": "KAN-1", "started_at": _at(5),  "ended_at": _at(4),  "duration_s": 60},
+        {"task_key": "KAN-9", "started_at": _at(40), "ended_at": _at(39), "duration_s": 60},  # >30 min
+    ])
+    out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1", "KAN-9"])
+    assert _keys(out) == ["KAN-1"]
+def test_excludes_below_confidence_floor():
+    con = _make_con([
+        {"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_confidence": 0.9},
+        {"task_key": "KAN-2", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_confidence": 0.5},
+    ])
+    out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1", "KAN-2"])
+    assert _keys(out) == ["KAN-1"]
+def test_candidate_gating_drops_non_candidate_tickets():
+    con = _make_con([
+        {"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60},
+        {"task_key": "KAN-7", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60},  # not a candidate
+    ])
+    out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1"])
+    assert _keys(out) == ["KAN-1"]
+def test_excludes_untracked_and_null_task():
+    con = _make_con([
+        {"task_key": None,    "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_session_type": "untracked"},
+        {"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_session_type": "task"},
+    ])
+    out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1"])
+    assert _keys(out) == ["KAN-1"]
+def test_no_candidates_returns_empty():
+    con = _make_con([{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60}])
+    assert rtl._fetch_recent_ticket_activity(con, ANCHOR, []) == []
+def test_no_anchor_returns_empty():
+    con = _make_con([{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60}])
+    assert rtl._fetch_recent_ticket_activity(con, "", ["KAN-1"]) == []
+# ── _format_continuity ───────────────────────────────────────────────────────
+def test_format_empty_is_explicit_no_work_line():
+    out = _prompts._format_continuity([])
+    assert out.strip() == "(no tracked work in this window)"
+    assert out != ""  # explicit, never silent
+def test_format_single_ticket_recent():
+    out = _prompts._format_continuity(
+        [{"task_key": "KAN-1", "total_s": 420, "sessions": 2, "ago_s": 30}]
+    )
+    assert "KAN-1" in out
+    assert "~7 min" in out
+    assert "2 sessions" in out
+    assert "just before this session" in out  # ago < 60s
+def test_format_recency_minutes():
+    out = _prompts._format_continuity(
+        [{"task_key": "KAN-2", "total_s": 600, "sessions": 1, "ago_s": 1200}]
+    )
+    assert "1 session" in out
+    assert "~20 min before this session" in out
+def test_format_multiple_tickets_one_bullet_each():
+    out = _prompts._format_continuity([
+        {"task_key": "KAN-1", "total_s": 420, "sessions": 2, "ago_s": 30},
+        {"task_key": "KAN-2", "total_s": 600, "sessions": 1, "ago_s": 1200},
+    ])
+    assert out.count("•") == 2
+    assert "KAN-1" in out and "KAN-2" in out
+def test_build_user_message_includes_block_when_activity_present():
+    msg = _prompts.build_user_message(
+        {"app_name": "Code", "session_text": "x"},
+        [{"task_key": "KAN-1", "title": "t", "description_text": "d"}],
+        recent_activity=[{"task_key": "KAN-1", "total_s": 60, "sessions": 1, "ago_s": 30}],
+        now_iso=ANCHOR,
+    )
+    assert "RECENT WORK CONTEXT" in msg
+    assert "WEAK" in msg          # framed as a weak prior
+    assert "KAN-1" in msg
+def test_build_user_message_shows_explicit_block_when_no_activity():
+    msg = _prompts.build_user_message(
+        {"app_name": "Code", "session_text": "x"},
+        [{"task_key": "KAN-1", "title": "t", "description_text": "d"}],
+        recent_activity=[],
+        now_iso=ANCHOR,
+    )
+    assert "RECENT WORK CONTEXT" in msg            # always present now
+    assert "no tracked work in this window" in msg  # explicit empty state
+# ── plain-python runner (no pytest needed) ───────────────────────────────────
+if __name__ == "__main__":
+    fns = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)]
+    passed = 0
+    for fn in fns:
+        try:
+            fn()
+            print(f"  PASS  {fn.__name__}")
+            passed += 1
+        except Exception as exc:  # noqa: BLE001
+            print(f"  FAIL  {fn.__name__}: {exc!r}")
+    print(f"\n{passed}/{len(fns)} passed")
+    raise SystemExit(0 if passed == len(fns) else 1)

package/services/tests/test_fetch_pm_tasks.py ADDED Viewed

@@ -0,0 +1,120 @@
+"""Unit tests for `_fetch_pm_tasks` candidate-set policy.
+Covers the plan-only candidate filtering (CLASSIFY_PLAN_ONLY_CANDIDATES) added on
+top of the legacy boost-never-filter behaviour, including both safety guards:
+  * no confirmed plan  → every candidate is offered (unchanged behaviour)
+  * plan confirmed     → candidates narrowed to the confirmed plan, in order
+  * plan tickets gone  → fall back to the full set (never zero candidates)
+  * curation-excluded  → never a candidate, even if named in the plan
+Run: services/.venv/bin/pytest services/tests/test_fetch_pm_tasks.py -v
+"""
+from __future__ import annotations
+import sqlite3
+import sys
+from pathlib import Path
+import pytest
+# Make `from agents import ...` resolve (mirror tests/evals/eval_classifier.py).
+_SERVICES_DIR = Path(__file__).resolve().parent.parent
+if str(_SERVICES_DIR) not in sys.path:
+    sys.path.insert(0, str(_SERVICES_DIR))
+from agents import run_task_linker_mlx as rtl  # noqa: E402
+def _make_con(task_keys: list[str], excluded: list[str] | None = None) -> sqlite3.Connection:
+    """In-memory meridian DB with the columns `_fetch_pm_tasks` selects."""
+    con = sqlite3.connect(":memory:")
+    con.row_factory = sqlite3.Row
+    con.execute(
+        "CREATE TABLE pm_tasks ("
+        " task_key TEXT PRIMARY KEY, title TEXT, description_text TEXT,"
+        " status_raw TEXT, is_terminal INTEGER, issue_type TEXT,"
+        " parent_key TEXT, epic_title TEXT, sprint_name TEXT, tags TEXT)"
+    )
+    con.execute(
+        "CREATE TABLE pm_task_curation (task_key TEXT PRIMARY KEY, decision TEXT)"
+    )
+    for k in task_keys:
+        con.execute(
+            "INSERT INTO pm_tasks (task_key, title, description_text, status_raw,"
+            " is_terminal, issue_type, parent_key, epic_title, sprint_name, tags)"
+            " VALUES (?, ?, '', 'In Progress', 0, 'Task', '', '', '', '')",
+            (k, f"title {k}"),
+        )
+    for k in excluded or []:
+        con.execute(
+            "INSERT INTO pm_task_curation (task_key, decision) VALUES (?, 'excluded')",
+            (k,),
+        )
+    con.commit()
+    return con
+@pytest.fixture
+def plan_only(monkeypatch):
+    """Force plan-only filtering on regardless of the ambient env default."""
+    monkeypatch.setattr(rtl, "_PLAN_ONLY_CANDIDATES", True)
+@pytest.fixture
+def boost_mode(monkeypatch):
+    """Force the legacy boost-never-filter behaviour."""
+    monkeypatch.setattr(rtl, "_PLAN_ONLY_CANDIDATES", False)
+def _keys(tasks):
+    return [t["task_key"] for t in tasks]
+def test_no_plan_returns_all_unmarked(plan_only):
+    """No confirmed plan → every candidate offered, none marked as focus."""
+    con = _make_con(["K-1", "K-2", "K-3"])
+    tasks = rtl._fetch_pm_tasks(con, focus_keys=[])
+    assert set(_keys(tasks)) == {"K-1", "K-2", "K-3"}
+    assert all(not t.get("is_today_focus") for t in tasks)
+def test_plan_only_narrows_to_plan_in_declared_order(plan_only):
+    """Plan confirmed → candidates are exactly the plan, in declared order, marked."""
+    con = _make_con(["K-1", "K-2", "K-3", "K-4"])
+    tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-3", "K-1"])
+    assert _keys(tasks) == ["K-3", "K-1"]  # declared order preserved
+    assert all(t["is_today_focus"] for t in tasks)
+def test_plan_only_falls_back_when_plan_tickets_absent(plan_only):
+    """GUARD: plan tickets not in the live pool → fall back to ALL, never empty."""
+    con = _make_con(["K-1", "K-2"])
+    tasks = rtl._fetch_pm_tasks(con, focus_keys=["GHOST-9"])
+    assert set(_keys(tasks)) == {"K-1", "K-2"}  # full set, not empty
+    assert all(not t.get("is_today_focus") for t in tasks)
+def test_plan_only_drops_curation_excluded_even_if_in_plan(plan_only):
+    """An excluded ticket is never a candidate, even when named in the plan."""
+    con = _make_con(["K-1", "K-2"], excluded=["K-2"])
+    tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-2", "K-1"])
+    # K-2 excluded → only K-1 survives; still a non-empty, plan-scoped set.
+    assert _keys(tasks) == ["K-1"]
+    assert tasks[0]["is_today_focus"]
+def test_plan_only_partial_plan_keeps_only_live_plan_tickets(plan_only):
+    """Plan names a live + a dead ticket → only the live one is offered."""
+    con = _make_con(["K-1", "K-2", "K-3"])
+    tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-2", "GHOST-9"])
+    assert _keys(tasks) == ["K-2"]
+def test_boost_mode_keeps_all_with_plan_floated_to_top(boost_mode):
+    """Flag off → legacy behaviour: plan floated to top, every candidate kept."""
+    con = _make_con(["K-1", "K-2", "K-3"])
+    tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-3"])
+    assert set(_keys(tasks)) == {"K-1", "K-2", "K-3"}  # recall untouched
+    assert tasks[0]["task_key"] == "K-3"  # floated to top
+    assert tasks[0]["is_today_focus"]
+    assert sum(1 for t in tasks if t.get("is_today_focus")) == 1

package/services/tests/test_format_candidates.py ADDED Viewed

@@ -0,0 +1,45 @@
+"""Unit tests for the candidate-description cap in `_format_candidates`.
+The cap is configurable via CANDIDATE_DESC_CAP (default 0 = no cap). These cover
+the default uncapped behaviour and an explicit ceiling.
+Run: services/.venv/bin/pytest services/tests/test_format_candidates.py -v
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+import pytest
+_SERVICES_DIR = Path(__file__).resolve().parent.parent
+if str(_SERVICES_DIR) not in sys.path:
+    sys.path.insert(0, str(_SERVICES_DIR))
+from agents import _prompts  # noqa: E402
+def _task(desc: str) -> dict:
+    return {"task_key": "K-1", "title": "t", "description_text": desc}
+def test_default_no_cap_keeps_full_description(monkeypatch):
+    monkeypatch.setattr(_prompts, "CANDIDATE_DESC_CAP", 0)
+    desc = "x" * 1000
+    out = _prompts._format_candidates([_task(desc)])
+    assert desc in out          # full text present
+    assert "…" not in out       # no truncation marker
+def test_positive_cap_truncates_with_marker(monkeypatch):
+    monkeypatch.setattr(_prompts, "CANDIDATE_DESC_CAP", 50)
+    out = _prompts._format_candidates([_task("y" * 100)])
+    assert "y" * 50 + "…" in out
+    assert "y" * 51 not in out  # nothing past the cap
+def test_description_under_cap_unchanged(monkeypatch):
+    monkeypatch.setattr(_prompts, "CANDIDATE_DESC_CAP", 240)
+    out = _prompts._format_candidates([_task("short desc")])
+    assert "short desc" in out
+    assert "…" not in out

package/ui.tar.gz CHANGED Viewed

Binary file