npm - @meridiona/meridian-darwin-arm64 - Versions diffs - 1.5.0 → 1.7.0 - Mend

@meridiona/meridian-darwin-arm64 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (114)

package/VERSION +1 -1
package/bin/meridian +0 -0
package/package.json +1 -1
package/scripts/meridian-cli.sh +68 -98
package/services/agents/_prompts.py +21 -13
package/services/agents/run_task_linker_mlx.py +10 -6
package/services/pyproject.toml +1 -1
package/services/skills/activity/task-classifier/SKILL.md +14 -12
package/services/tests/evals/classify_session.py +122 -0
package/services/tests/evals/metrics.py +34 -5
package/ui/.next/BUILD_ID +1 -1
package/ui/.next/build-manifest.json +3 -3
package/ui/.next/prerender-manifest.json +3 -3
package/ui/.next/server/app/_global-error.html +1 -1
package/ui/.next/server/app/_global-error.rsc +1 -1
package/ui/.next/server/app/_global-error.segments/__PAGE__.segment.rsc +1 -1
package/ui/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
package/ui/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
package/ui/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
package/ui/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
package/ui/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
package/ui/.next/server/app/_not-found.html +1 -1
package/ui/.next/server/app/_not-found.rsc +2 -2
package/ui/.next/server/app/_not-found.segments/_full.segment.rsc +2 -2
package/ui/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
package/ui/.next/server/app/_not-found.segments/_index.segment.rsc +2 -2
package/ui/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
package/ui/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
package/ui/.next/server/app/_not-found.segments/_tree.segment.rsc +2 -2
package/ui/.next/server/app/api/settings/route.js.nft.json +1 -1
package/ui/.next/server/app/index.html +1 -1
package/ui/.next/server/app/index.rsc +3 -3
package/ui/.next/server/app/index.segments/__PAGE__.segment.rsc +2 -2
package/ui/.next/server/app/index.segments/_full.segment.rsc +3 -3
package/ui/.next/server/app/index.segments/_head.segment.rsc +1 -1
package/ui/.next/server/app/index.segments/_index.segment.rsc +2 -2
package/ui/.next/server/app/index.segments/_tree.segment.rsc +2 -2
package/ui/.next/server/app/page/react-loadable-manifest.json +1 -1
package/ui/.next/server/app/page_client-reference-manifest.js +1 -1
package/ui/.next/server/app/settings/page_client-reference-manifest.js +1 -1
package/ui/.next/server/app/settings.html +1 -1
package/ui/.next/server/app/settings.rsc +2 -2
package/ui/.next/server/app/settings.segments/_full.segment.rsc +2 -2
package/ui/.next/server/app/settings.segments/_head.segment.rsc +1 -1
package/ui/.next/server/app/settings.segments/_index.segment.rsc +2 -2
package/ui/.next/server/app/settings.segments/_tree.segment.rsc +2 -2
package/ui/.next/server/app/settings.segments/settings/__PAGE__.segment.rsc +1 -1
package/ui/.next/server/app/settings.segments/settings.segment.rsc +1 -1
package/ui/.next/server/chunks/[root-of-the-server]__0o.3lhr._.js +1 -1
package/ui/.next/server/chunks/[root-of-the-server]__0t62i3x._.js +8 -5
package/ui/.next/server/middleware-build-manifest.js +3 -3
package/ui/.next/server/pages/404.html +1 -1
package/ui/.next/server/pages/500.html +1 -1
package/ui/.next/server/server-reference-manifest.js +1 -1
package/ui/.next/server/server-reference-manifest.json +1 -1
package/ui/.next/static/chunks/0.e6xqgbosj58.css +4 -0
package/ui/.next/static/chunks/0f2ikqegp34r..js +1 -0
package/ui/.next/static/chunks/{17.0_3q.gw7x2.js → 0puw3vthktvhx.js} +1 -1
package/ui/app/api/active/route.ts +47 -0
package/ui/app/api/coding-agents/route.ts +53 -0
package/ui/app/api/queue-review/route.ts +83 -0
package/ui/app/api/settings/route.ts +18 -0
package/ui/app/api/tasks/route.ts +141 -0
package/ui/app/api/today/route.ts +299 -0
package/ui/app/api/week/route.ts +86 -0
package/ui/app/api/worklogs/[id]/route.ts +144 -0
package/ui/app/api/worklogs/route.ts +134 -0
package/ui/app/globals.css +177 -0
package/ui/app/layout.tsx +33 -0
package/ui/app/page.tsx +106 -0
package/ui/app/settings/page.tsx +6 -0
package/ui/components/CommandBar.tsx +103 -0
package/ui/components/DayTimeline.tsx +126 -0
package/ui/components/Nav.tsx +45 -0
package/ui/components/RefreshTrigger.tsx +15 -0
package/ui/components/ShapeOfDay.tsx +150 -0
package/ui/components/Sidebar.tsx +130 -0
package/ui/components/TaskBadge.tsx +223 -0
package/ui/components/TodayMetrics.tsx +110 -0
package/ui/components/TweaksPanel.tsx +200 -0
package/ui/components/atoms.tsx +254 -0
package/ui/components/ui/NumberStepper.tsx +128 -0
package/ui/components/ui/Select.tsx +109 -0
package/ui/components/ui/Switch.tsx +49 -0
package/ui/components/views/QueueView.tsx +171 -0
package/ui/components/views/SessionsView.tsx +145 -0
package/ui/components/views/SettingsView.tsx +217 -0
package/ui/components/views/TasksView.tsx +208 -0
package/ui/components/views/TodayView.tsx +522 -0
package/ui/components/views/WeekView.tsx +201 -0
package/ui/components/views/WorklogsView.tsx +379 -0
package/ui/instrumentation.ts +8 -0
package/ui/lib/app-colors.ts +40 -0
package/ui/lib/category-colors.ts +30 -0
package/ui/lib/date-utils.ts +16 -0
package/ui/lib/db-write.ts +41 -0
package/ui/lib/db.ts +45 -0
package/ui/lib/format.ts +35 -0
package/ui/lib/intervals.ts +128 -0
package/ui/lib/observability.ts +88 -0
package/ui/lib/settings.ts +73 -0
package/ui/lib/theme-context.tsx +104 -0
package/ui/lib/types.ts +106 -0
package/ui/next.config.ts +35 -0
package/ui/package-lock.json +4446 -0
package/ui/package.json +1 -1
package/ui/postcss.config.mjs +5 -0
package/ui/tsconfig.json +41 -0
package/ui/tsconfig.tsbuildinfo +1 -0
package/ui/.next/static/chunks/0laaz3a6vqgl~.css +0 -4
package/ui/.next/static/chunks/16f557ymkx721.js +0 -1
package/ui/.next/static/{mXgl3Yg8KlSupvjSyOZCC → bw5ZyxNceKY52yxsIpewS}/_buildManifest.js +0 -0
package/ui/.next/static/{mXgl3Yg8KlSupvjSyOZCC → bw5ZyxNceKY52yxsIpewS}/_clientMiddlewareManifest.js +0 -0
package/ui/.next/static/{mXgl3Yg8KlSupvjSyOZCC → bw5ZyxNceKY52yxsIpewS}/_ssgManifest.js +0 -0

package/VERSION CHANGED Viewed

@@ -1 +1 @@
-1.5.0
+1.7.0

package/bin/meridian CHANGED Viewed

Binary file

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@meridiona/meridian-darwin-arm64",
-  "version": "1.5.0",
+  "version": "1.7.0",
   "description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
   "homepage": "https://github.com/Meridiona/meridian",
   "repository": {

package/scripts/meridian-cli.sh CHANGED Viewed

@@ -190,113 +190,83 @@ cmd_logs() {
 }
 # --- doctor ---
-_check() {
-    local desc="$1" pass="$2" reason="${3:-}"
-    if [[ "$pass" == "1" ]]; then
-        ok "$desc"
-    else
-        err "$desc${reason:+ — ${reason}}"
-        DOCTOR_FAILURES=$(( DOCTOR_FAILURES + 1 ))
-    fi
+# The daemon binary owns the comprehensive, colourised, by-daemon health table
+# (system, meridian daemon, screenpipe, mlx-server, jira, ui, mcp). The wrapper
+# just delegates to it; if that binary is missing or stale, a minimal bash-only
+# fallback runs so `meridian doctor` always produces something useful.
+_group() { printf "\n  ── %s ─────────────────────────────────────────────\n" "$1"; }
+_row() {  # status check detail
+    local status="$1" check="$2" detail="${3:-}" glyph
+    case "$status" in
+        ok)   glyph="✓" ;;
+        warn) glyph="⊘" ;;
+        info) glyph="·" ;;
+        *)    glyph="✗"; DOCTOR_FAILURES=$(( DOCTOR_FAILURES + 1 )) ;;
+    esac
+    printf "  %s %-26s %s\n" "$glyph" "$check" "$detail"
 }
-_pid_from_print() {
-    local label="$1"
-    local output
-    set +e
-    output="$(launchctl print "${GUI_TARGET}/${label}" 2>/dev/null)"
-    local rc=$?
-    set -e
-    [[ $rc -ne 0 ]] && return 1
-    printf '%s\n' "$output" | grep -E '^\s+pid\s*=' | grep -oE '[0-9]+' | head -1
+_plist_row() {  # label check-label
+    local plist="${LAUNCH_AGENTS}/$1.plist"
+    if [[ -f "$plist" ]] && plutil -lint "$plist" >/dev/null 2>&1; then
+        _row ok "$2" ""
+    else
+        _row fail "$2" "run ./install.sh"
+    fi
 }
-cmd_doctor() {
-    DOCTOR_FAILURES=0
-    # 1. macOS
-    _check "macOS" "$([[ "$(uname -s)" == "Darwin" ]] && echo 1 || echo 0)" "run on macOS"
-    # 2. daemon binary
-    local bin_ok=0
+_daemon_bin() {
+    local p
     for p in /usr/local/bin/meridian-daemon "${HOME}/.local/bin/meridian-daemon"; do
-        [[ -x "$p" ]] && bin_ok=1 && break
+        [[ -x "$p" ]] && { printf '%s\n' "$p"; return 0; }
     done
-    _check "daemon binary exists and is executable" "$bin_ok" "run ./install.sh"
-    # 3. daemon plist lints
-    local dplist="${LAUNCH_AGENTS}/${LABEL_DAEMON}.plist"
-    if [[ -f "$dplist" ]]; then
-        set +e; plutil -lint "$dplist" >/dev/null 2>&1; local pl=$?; set -e
-        _check "daemon plist installed and valid" "$([[ $pl -eq 0 ]] && echo 1 || echo 0)" "plutil -lint ${dplist}"
-    else
-        _check "daemon plist installed and valid" "0" "run ./install.sh"
-    fi
-    # 4. daemon running
-    local dpid; dpid="$(_pid_from_print "$LABEL_DAEMON" 2>/dev/null)" || dpid=""
-    _check "daemon running (pid ${dpid:-?})" "$([[ -n "$dpid" ]] && echo 1 || echo 0)" "meridian start"
-    # 5. user config
-    _check "user config <repo>/.env exists" "$([[ -f "${REPO_ROOT}/.env" ]] && echo 1 || echo 0)" "run ./install.sh"
-    # 6. screenpipe plist lints
-    local spplist="${LAUNCH_AGENTS}/${LABEL_SCREENPIPE}.plist"
-    if [[ -f "$spplist" ]]; then
-        set +e; plutil -lint "$spplist" >/dev/null 2>&1; local spl=$?; set -e
-        _check "screenpipe plist installed and valid" "$([[ $spl -eq 0 ]] && echo 1 || echo 0)" "plutil -lint ${spplist}"
-    else
-        _check "screenpipe plist installed and valid" "0" "run ./install.sh"
-    fi
-    # 7. screenpipe binary in PATH
-    set +e; command -v screenpipe >/dev/null 2>&1; local spbin=$?; set -e
-    _check "screenpipe binary in PATH" "$([[ $spbin -eq 0 ]] && echo 1 || echo 0)" "install screenpipe (npm install -g screenpipe)"
-    # 8. screenpipe DB
-    _check "screenpipe DB exists" "$([[ -f "${HOME}/.screenpipe/db.sqlite" ]] && echo 1 || echo 0)" "install and run screenpipe"
-    # 9. screenpipe running
-    set +e; pgrep -x screenpipe >/dev/null 2>&1; local sp=$?; set -e
-    _check "screenpipe running" "$([[ $sp -eq 0 ]] && echo 1 || echo 0)" "start screenpipe"
-    # 10. meridian DB
-    if [[ -f "${HOME}/.meridian/meridian.db" ]]; then
-        ok "meridian DB exists"
-    else
-        warn "meridian DB not yet created (will be on first run)"
-    fi
-    # 11. Python venv
-    local venv_py="${REPO_ROOT}/services/.venv/bin/python"
-    local venv_ok=0
-    if [[ -x "$venv_py" ]]; then
-        set +e; "$venv_py" -c "import run_agent" 2>/dev/null; local vi=$?; set -e
-        [[ $vi -eq 0 ]] && venv_ok=1
-    fi
-    _check "Python venv and run_agent importable" "$venv_ok" "bash scripts/setup-services.sh"
-    # 12. MCP server built
-    _check "MCP server built" "$([[ -f "${REPO_ROOT}/packages/meridian-mcp/dist/index.js" ]] && echo 1 || echo 0)" "cd packages/meridian-mcp && npm run build"
+    return 1
+}
-    # 13. UI plist lints
-    local uiplist="${LAUNCH_AGENTS}/${LABEL_UI}.plist"
-    if [[ -f "$uiplist" ]]; then
-        set +e; plutil -lint "$uiplist" >/dev/null 2>&1; local uil=$?; set -e
-        _check "UI plist installed and valid" "$([[ $uil -eq 0 ]] && echo 1 || echo 0)" "plutil -lint ${uiplist}"
-    else
-        _check "UI plist installed and valid" "0" "run ./install.sh"
+cmd_doctor() {
+    local bin
+    if bin="$(_daemon_bin)"; then
+        set +e
+        if [[ "$*" == *--fix* ]]; then
+            # --fix has interactive guided prompts — the user is present, so run
+            # without the alarm (which would kill a prompt waiting for input).
+            "$bin" doctor "$@"
+        else
+            # Guard with a perl alarm so a stale binary (one that predates
+            # `doctor` and would fall through to starting the daemon) can never
+            # hang the terminal. The Rust report colourises itself on a tty.
+            perl -e 'alarm shift @ARGV; exec @ARGV' 30 "$bin" doctor "$@"
+        fi
+        local rc=$?
+        set -e
+        # 0 = healthy, 1 = critical issues found — both are real doctor runs.
+        if [[ $rc -eq 0 || $rc -eq 1 ]]; then return $rc; fi
+        warn "health engine timed out or is stale — rebuild: cargo build --release"
     fi
+    _doctor_fallback
+}
-    # 14. UI built
-    _check "UI built (ui/.next exists)" "$([[ -d "${REPO_ROOT}/ui/.next" ]] && echo 1 || echo 0)" "cd ui && npm ci && npm run build"
+# Minimal bash-only checks for when the daemon binary is unavailable.
+_doctor_fallback() {
+    DOCTOR_FAILURES=0
+    printf "\n  Meridian doctor (fallback — daemon binary unavailable)\n"
+    printf "  ════════════════════════════════════════════════════════\n"
+    _group "system"
+    _row "$([[ "$(uname -s)" == "Darwin" ]] && echo ok || echo fail)" "macOS" ""
+    _row "$([[ -f "${REPO_ROOT}/.env" ]] && echo ok || echo fail)" "config (.env)" ""
+    _group "services (plists)"
+    _plist_row "$LABEL_DAEMON" "daemon plist"
+    _plist_row "$LABEL_SCREENPIPE" "screenpipe plist"
+    _plist_row "$LABEL_MLX" "mlx plist"
+    _plist_row "$LABEL_UI" "ui plist"
+    _group "builds"
+    _row "$([[ -f "${REPO_ROOT}/packages/meridian-mcp/dist/index.js" ]] && echo ok || echo fail)" "mcp built" ""
+    _row "$([[ -d "${REPO_ROOT}/ui/.next" ]] && echo ok || echo fail)" "ui built" ""
     echo
-    if [[ $DOCTOR_FAILURES -eq 0 ]]; then
-        ok "all checks passed"
-    else
-        printf "  %d check%s failed\n" "$DOCTOR_FAILURES" "$([[ $DOCTOR_FAILURES -ne 1 ]] && echo s || true)"
-    fi
+    _row info "next step" "cargo build --release && meridian doctor"
+    [[ $DOCTOR_FAILURES -eq 0 ]]
 }
 # --- config ---
@@ -506,7 +476,7 @@ case "$CMD" in
     restart)          cmd_restart ;;
     status)           cmd_status ;;
     logs)             cmd_logs "$@" ;;
-    doctor)           cmd_doctor ;;
+    doctor)           cmd_doctor "$@" ;;
     config)           cmd_config "$@" ;;
     dev)              cmd_dev "$@" ;;
     uninstall)        cmd_uninstall ;;

package/services/agents/_prompts.py CHANGED Viewed

@@ -11,12 +11,17 @@ _VSCODE_BANNER_RE = re.compile(
     re.IGNORECASE | re.DOTALL,
 )
-# Max chars of session_text included in the prompt. Default 2500 (~625 tokens at
-# 4 chars/token) — enough to identify files, ticket keys, and recent activity
-# without inflating context in production. Override via SESSION_TEXT_CAP env var
-# for eval experiments; set to 0 to disable truncation entirely (caller is then
-# responsible for not blowing the model's context window).
-SESSION_TEXT_CAP = int(os.environ.get("SESSION_TEXT_CAP", "2500"))
+# Max chars of session_text included in the prompt. Default 10000 (~2500 tokens
+# at 4 chars/token). The old 2500 cap kept only the FIRST frames of a multi-frame
+# OCR capture, so when a session spanned more than one window/app the later
+# (often foreground) activity was silently dropped — e.g. a session whose head
+# showed an IDE but whose tail showed the user had moved to a different app/
+# project got misclassified on the stale head. The classifier model has a 128K
+# context window, so 2500 was far too conservative; 10000 comfortably holds a
+# full multi-frame session while staying trivial for the model. Override via
+# SESSION_TEXT_CAP env var; set to 0 to disable truncation entirely (caller is
+# then responsible for not blowing the model's context window).
+SESSION_TEXT_CAP = int(os.environ.get("SESSION_TEXT_CAP", "10000"))
 def _fmt_dur(duration_s: int | float) -> str:
@@ -51,10 +56,12 @@ def _format_session(session: dict) -> str:
         parts.append(f"time: {time_range}{dur_str}")
     elif dur is not None:
         parts.append(f"duration: {_fmt_dur(dur)}")
-    cat = session.get("category")
-    cat_conf = session.get("confidence")
-    if cat:
-        parts.append(f"category: {cat} (confidence {round(cat_conf or 0.0, 2)})")
+    # NOTE: the rule-based ETL category is intentionally NOT included here. It is
+    # a cheap heuristic derived from the SAME app/window/OCR signals the LLM
+    # already sees, so feeding it in only injects a correlated prior — when the
+    # heuristic is wrong (e.g. background-window OCR bleed), it biases the LLM
+    # toward the same mistake. The classifier re-derives category from the raw
+    # evidence and its output overwrites the rule-based value anyway.
     titles = session.get("window_titles") or []
     if titles:
         parts.append("top windows:")
@@ -116,7 +123,6 @@ def _format_recent_sessions(sessions: list[dict]) -> str:
         dur_str = _fmt_dur(s.get("duration_s") or 0)
         task_key = s.get("task_key")
         routing = s.get("task_routing")  # None means unclassified
-        category = (s.get("category") or "").strip()
         if task_key:
             target = f"→ {task_key}"
         elif routing == "untracked":
@@ -126,8 +132,10 @@ def _format_recent_sessions(sessions: list[dict]) -> str:
             target = "→ [pending]"
         else:
             target = "→ [overhead]"
-        cat_tag = f"  [{category}]" if category else ""
-        rows.append(f"  {time_str}  {app:<14}  {dur_str:<7}  {target}{cat_tag}")
+        # Category is intentionally omitted — recent-context is a task-continuity
+        # signal only; carrying the (rule-based or prior-LLM) category tag would
+        # feed a category prior back into classification.
+        rows.append(f"  {time_str}  {app:<14}  {dur_str:<7}  {target}")
     return "\n".join(rows)

package/services/agents/run_task_linker_mlx.py CHANGED Viewed

@@ -83,10 +83,10 @@ class SessionClassification(BaseModel):
     ] = Field(
         ...,
         description=(
-            "The single best activity category for this session. A rule-based "
-            "guess is supplied in the input — confirm it or correct it from the "
-            "evidence. Declared early in the schema so FSM decoding always emits "
-            "it before the long session_summary field."
+            "The single best activity category for this session. Derive it from "
+            "the evidence (app, window titles, screen content); no category is "
+            "supplied in the input. Declared early in the schema so FSM decoding "
+            "always emits it before the long session_summary field."
         ),
     )
     category_confidence: float = Field(
@@ -232,9 +232,13 @@ def _fetch_session(
 def _fetch_recent_sessions(
     con: _sqlite3.Connection, before_id: int
 ) -> list[dict[str, Any]]:
+    # Recent context is a task-continuity signal only: app, time, duration and
+    # which ticket each recent session mapped to. We deliberately do NOT select
+    # session_text/excerpt or category — recent OCR is noise here and a category
+    # tag would feed a prior back into classification. (session_text is still
+    # referenced in WHERE only to skip empty-capture rows.)
     rows = con.execute(
-        "SELECT app_name, started_at, duration_s, task_key, task_routing, category,"
-        "       COALESCE(SUBSTR(session_text, 1, 200), '') AS text_excerpt"
+        "SELECT app_name, started_at, duration_s, task_key, task_routing"
         " FROM app_sessions"
         " WHERE id < ? AND duration_s > 1 AND COALESCE(session_text,'') != ''"
         " ORDER BY id DESC LIMIT ?",

package/services/pyproject.toml CHANGED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "meridian-agents"
-version = "1.5.0"
+version = "1.7.0"
 description = "Meridian agents — hermes task linking and Jira progress updates for meridian.db"
 requires-python = ">=3.11"
 authors = [{ name = "Meridiona" }]

package/services/skills/activity/task-classifier/SKILL.md CHANGED Viewed

@@ -24,7 +24,7 @@ The task classifier sits at the center of Meridian's workflow understanding:
 ## Classification Decision Tree
-For each session, you must decide:
+For each session, decide in this order. **Core principle: do NOT try to fit every session to an existing ticket. Assign a `task_key` only when the session's OWN evidence clearly matches that specific ticket's scope. Most real work that isn't an obvious match is `untracked`, not a forced link.**
 ### 1. Is this overhead?
 If the session is **idle, music, system settings, or clearly personal/unrelated activity** → return:
@@ -33,27 +33,29 @@ If the session is **idle, music, system settings,or clearly personal/unrelated a
 ```
 **overhead is a hard discard.** These sessions are thrown away — never surfaced, never used for inference, never create tasks. When in doubt between overhead and untracked, ask: *"Would a manager care that this happened?"* If no, it's overhead.
-### 2. Is this work-related?
-If the session shows **any real work signal** (coding, research, meetings, writing, debugging, reviewing, learning) but **no Jira candidate matches** → mark as **untracked** and return:
+### 2. Is this real work that ISN'T clearly one of the candidate tickets? → untracked
+If the session shows **any real work signal** (coding, research, meetings, writing, debugging, reviewing, learning) but it does **not clearly match the scope of a candidate ticket** → mark as **untracked**:
 ```json
 {"task_key": null, "confidence": 0.6-0.8, "session_type": "untracked", "routing": "queue"}
 ```
-**untracked sessions are kept and used downstream** — for workload analysis, capacity reporting, and automatic new-task creation. Mark dimensions to capture *what* the work was. Examples that must be `untracked` (not `overhead`): standups, retros, code reviews on untracked PRs, config/infra housekeeping, general repo exploration, internal tool usage.
+**This is the important, common case — and it is what `untracked` MEANS: the user genuinely did this work, but there is no Jira ticket for it yet.** Downstream, Meridian uses untracked sessions to **create or update** the matching Jira task. So it is critical that you do **not** shoehorn this work into an unrelated existing ticket just because it is the only candidate available, or because recent sessions were on it. **A wrong task link is worse than `untracked`** — it pollutes a real ticket's worklog and hides the genuine untracked work that should have spawned its own ticket. When the evidence doesn't clearly fit a candidate, choose `untracked`.
-### 3. Can it map to an open Jira ticket?
-If the session evidence **directly or contextually matches** an open ticket → return:
+`untracked` sessions are kept and used downstream (workload analysis, capacity reporting, new-task creation). Mark dimensions to capture *what* the work was. Examples that must be `untracked` (not `overhead`): standups, retros, code reviews on untracked PRs, config/infra housekeeping, general repo exploration, general research, **and any work on a feature/bug/chore that has no matching candidate ticket**.
+### 3. Does it CLEARLY map to one specific candidate ticket? → task
+Assign a `task_key` **only** when the session's own evidence (window titles, OCR, file/branch names, an explicit ticket-key mention) directly matches the **scope described in that ticket's title/description** → return:
 ```json
 {"task_key": "KEY-123", "confidence": 0.50-0.90, "session_type": "task", "routing": "auto"}
 ```
-Cite the evidence (window title, OCR snippet, context from previous sessions) and infer activity dimensions.
+Recent-session continuity may *support* a match, but **continuity alone is never enough** — the current session must carry its own evidence that fits the ticket. If the active app/window shows the user is now on something else (a different project, a meeting, another repo, a doc for another team), classify by **that**, not by what they were doing minutes ago. Cite the specific evidence, and infer activity dimensions.
 ## Your inputs
 The user message contains:
-- **SESSION** — app, category (with confidence), duration, top window titles, and counts of OCR/audio captures.
+- **SESSION** — app, duration, top window titles, and the screen content (OCR / a11y). Decide the category yourself from this evidence; no category is provided.
 - **CANDIDATE TICKETS** — all open Jira tickets. These are the only tickets you may choose from.
-- **RECENT SESSIONS** (previous 5) — context to help disambiguate. Example: *"User was on KAN-42 (coding) 5 minutes ago, then Slack, now back in VS Code."* → likely same task, even if Slack doesn't directly match KAN-42.
+- **RECENT SESSIONS** (previous 5) — app / time / duration / which ticket each mapped to (no screen text). A **weak disambiguation hint only**: it can support a match when the current session ALSO has matching evidence, but it must never override what the current session itself shows. Recent activity on a ticket does not make the current session that ticket.
 ## Available capabilities
@@ -98,7 +100,7 @@ Reply with ONE valid JSON object — no preamble, no markdown fences, no follow-
 ### Field rules
 - `task_key` — must be one of the supplied candidates, or `null`. Never invent a key.
 - `confidence` — see Scoring heuristics section for exact ranges per outcome type.
-- `category` — the single best activity category (see taxonomy below). The input carries a rule-based guess; confirm it or correct it from the evidence.
+- `category` — the single best activity category (see taxonomy below). Derive it yourself from the evidence (app, window titles, screen content); no category is provided in the input.
 - `category_confidence` — how certain you are about `category`, `0.0`–`1.0`.
 - `category_explanation` — ONE concise sentence justifying the category, citing the app / window titles / OCR evidence. Shown in the dashboard next to the category.
 - `session_type` — `"task"` links to Jira; `"overhead"` is thrown away; `"untracked"` is kept for workload analysis.
@@ -199,7 +201,7 @@ You have access to **the previous 5 sessions** to disambiguate the current sessi
 - Session 2 (3 min ago): Slack, discussing PR review for KAN-42 → **if related to same work**, task_key: KAN-42, confidence: 0.75 (work mention + prior context)
 - Session 3 (now): VS Code, editing same file → task_key: KAN-42, confidence: 0.85 (context continuity)
-**Decision:** If Session 2 (Slack) content shows it's about the same work (discussing the work or searching about it), classify it to **KAN-42** using context from Session 1. If Slack is generic work discussion with no connection to the prior task, return `null` with `session_type: "untracked"`.
+**Decision:** Only link Session 2 to KAN-42 if Session 2's **own content** shows it is about that work (the OCR/window discusses or searches the KAN-42 work). If Session 2 is generic, OR shows the user has moved to *different* work (another project, another team's doc, an unrelated meeting), return `null` with `session_type: "untracked"` (or a different ticket if its own evidence matches one) — **do not inherit KAN-42 just because it was the recent task.** Continuity is a tie-breaker between plausible matches, never a substitute for current-session evidence.
 Example reasoning for Session 2 (if task-related): `"Slack discusses PR review for KAN-42 implementation mentioned in prior VS Code session; linked via work context."`
@@ -208,7 +210,7 @@ Example reasoning for Session 2 (if task-related): `"Slack discusses PR review f
 **When task_key is not null (matched to a ticket):**
 - **Task key + work alignment**  — `confidence ≥ 0.90`, `session_type: "task"`
 - **Work description alignment**  — `0.75–0.85`, `session_type: "task"`
-- **Context continuity**  — `0.75–0.85`, `session_type: "task"`
+- **Context continuity (current session ALSO has matching evidence)**  — `0.75–0.85`, `session_type: "task"`. Continuity with no current-session evidence is **not** a task — use `untracked`.
 - **Generic project-level match**  — `0.50–0.65`, `session_type: "task"`
 - **Task key only**  — `0.60–0.75`, `session_type: "task"` (lower than key+alignment because work intent unclear)

package/services/tests/evals/classify_session.py ADDED Viewed

@@ -0,0 +1,122 @@
+"""Dry-run the task classifier for one or more session ids — read-only.
+Drives the SAME code path the daemon uses: POSTs the session id(s) to the
+running MLX server's /classify_sessions endpoint, which runs `_classify_one`
+(fetch session + recent context + pm_tasks → build prompt → model → parse).
+The Python endpoint only RETURNS the result; the DB write is done separately by
+the Rust daemon, so calling this never mutates app_sessions — you see exactly
+what the classifier WOULD output right now, with the current code and prompt.
+Usage:
+    services/.venv/bin/python services/tests/evals/classify_session.py 20128
+    services/.venv/bin/python services/tests/evals/classify_session.py 20128 20127 --show-prompt
+    MLX_SERVER_URL=http://127.0.0.1:7823 ... classify_session.py 20128
+The MLX server must be running (meridian status / port 7823). It uses the
+already-loaded model, so this is fast — no in-process model load.
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import sqlite3
+import sys
+import urllib.request
+from pathlib import Path
+_DEFAULT_URL = os.environ.get("MLX_SERVER_URL", "http://127.0.0.1:7823").rstrip("/")
+_DEFAULT_DB = os.path.expanduser(
+    os.environ.get("MERIDIAN_DB", "~/.meridian/meridian.db")
+)
+def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
+    """Rebuild the exact prompt via the production builder (read-only)."""
+    # Import lazily so the common path (no --show-prompt) needs no agents deps.
+    sys.path.insert(0, str(Path(__file__).resolve().parents[2]))  # services/
+    from agents._prompts import build_user_message
+    from agents.run_task_linker_mlx import (
+        _fetch_pm_tasks,
+        _fetch_recent_sessions,
+        _fetch_session,
+    )
+    con = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
+    con.row_factory = sqlite3.Row
+    raw = _fetch_session(con, session_id)
+    if raw is None:
+        return None
+    recent = _fetch_recent_sessions(con, session_id)
+    pm_tasks = _fetch_pm_tasks(con)
+    session_text = raw.get("session_text") or ""
+    if raw.get("claude_session_uuid") and (raw.get("session_summary") or "").strip():
+        session_text = raw["session_summary"]
+    session = {
+        "id": session_id,
+        "app_name": raw.get("app_name"),
+        "started_at": raw.get("started_at", ""),
+        "ended_at": raw.get("ended_at", ""),
+        "duration_s": raw.get("duration_s"),
+        "session_text": session_text,
+        "session_text_source": raw.get("session_text_source", "unknown"),
+        "window_titles": json.loads(raw.get("window_titles") or "[]"),
+        "category": raw.get("category"),
+        "confidence": raw.get("confidence", 0.0),
+        "audio_snippets": [],
+    }
+    return build_user_message(session, pm_tasks, recent_sessions=recent)
+def _classify(url: str, db_path: str, session_ids: list[int]) -> list[dict]:
+    payload = json.dumps({"session_ids": session_ids, "meridian_db": db_path}).encode()
+    req = urllib.request.Request(
+        f"{url}/classify_sessions",
+        data=payload,
+        headers={"Content-Type": "application/json"},
+    )
+    with urllib.request.urlopen(req, timeout=600) as resp:
+        return json.loads(resp.read()).get("results", [])
+def main() -> int:
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("session_ids", nargs="+", type=int, help="session id(s) to classify")
+    ap.add_argument("--show-prompt", action="store_true", help="also print the exact prompt sent")
+    ap.add_argument("--url", default=_DEFAULT_URL, help=f"MLX server (default {_DEFAULT_URL})")
+    ap.add_argument("--db", default=_DEFAULT_DB, help=f"meridian.db (default {_DEFAULT_DB})")
+    ap.add_argument("--json", action="store_true", help="print raw JSON results")
+    args = ap.parse_args()
+    if args.show_prompt:
+        for sid in args.session_ids:
+            prompt = _reconstruct_prompt(args.db, sid)
+            print(f"\n{'='*30} PROMPT for session {sid} {'='*30}")
+            print(prompt if prompt is not None else f"(session {sid} not found)")
+    results = _classify(args.url, args.db, args.session_ids)
+    if args.json:
+        print(json.dumps(results, indent=2))
+        return 0
+    for r in results:
+        print(f"\n{'='*30} RESULT for session {r.get('session_id')} {'='*30}")
+        for k in (
+            "task_key",
+            "session_type",
+            "confidence",
+            "category",
+            "category_confidence",
+            "method",
+        ):
+            print(f"  {k:20} = {r.get(k)}")
+        reasoning = (r.get("reasoning") or "").strip()
+        if reasoning:
+            print(f"  reasoning            = {reasoning}")
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main())

package/services/tests/evals/metrics.py CHANGED Viewed

@@ -23,7 +23,6 @@ from pathlib import Path
 import sys
 from deepeval.metrics import BaseMetric, TaskCompletionMetric
-from deepeval.models import OllamaModel
 from deepeval.test_case import LLMTestCase
 _SERVICES_DIR = Path(__file__).parent.parent.parent
@@ -33,7 +32,33 @@ if str(_SERVICES_DIR) not in sys.path:
 _MODEL = os.environ.get("OLLAMA_MODEL", "gemma4:31b")
 _HOST  = os.environ.get("OLLAMA_HOST",  "http://localhost:11434")
-_judge = OllamaModel(model=_MODEL, base_url=_HOST)
+def _make_judge() -> "object | None":
+    """Build the LLM judge — ONLY the agent-e2e TaskCompletionMetric needs it.
+    The classifier metrics below (TaskKeyMatch / SessionTypeMatch) are pure
+    exact-match and require no judge. Importing this module must therefore NOT
+    hard-depend on Ollama: if the `ollama` package or server is unavailable we
+    return None and the classifier eval runs unaffected. Construction is inside
+    the function (not at import) because OllamaModel() pulls in `ollama` only
+    when instantiated.
+    """
+    try:
+        from deepeval.models import OllamaModel
+        return OllamaModel(model=_MODEL, base_url=_HOST)
+    except Exception as exc:  # noqa: BLE001 — missing pkg, server down, etc.
+        import warnings
+        warnings.warn(
+            f"LLM judge unavailable ({exc}); agent-e2e metrics disabled. "
+            "Classifier exact-match metrics are unaffected.",
+            stacklevel=2,
+        )
+        return None
+_judge = _make_judge()
 _NULL_LITERALS = {"none", "null", "n/a", "nil", "undefined", ""}
@@ -146,9 +171,13 @@ class SessionTypeMatchMetric(BaseMetric):
 # Metric lists — import these in eval files
 # ---------------------------------------------------------------------------
-AGENT_E2E_METRICS = [
-    TaskCompletionMetric(threshold=0.5, model=_judge, include_reason=True),
-]
+# Only built when a judge is available — otherwise empty so importing this module
+# (e.g. for the classifier eval) never requires Ollama.
+AGENT_E2E_METRICS = (
+    [TaskCompletionMetric(threshold=0.5, model=_judge, include_reason=True)]
+    if _judge is not None
+    else []
+)
 CLASSIFIER_METRICS = [
     TaskKeyMatchMetric(threshold=1.0),

package/ui/.next/BUILD_ID CHANGED Viewed

	@@ -1 +1 @@
1	- ~~mXgl3Yg8KlSupvjSyOZCC~~
1	+ bw5ZyxNceKY52yxsIpewS

package/ui/.next/build-manifest.json CHANGED Viewed

@@ -7,9 +7,9 @@
     "static/chunks/03~yq9q893hmn.js"
   ],
   "lowPriorityFiles": [
-    "static/mXgl3Yg8KlSupvjSyOZCC/_buildManifest.js",
-    "static/mXgl3Yg8KlSupvjSyOZCC/_ssgManifest.js",
-    "static/mXgl3Yg8KlSupvjSyOZCC/_clientMiddlewareManifest.js"
+    "static/bw5ZyxNceKY52yxsIpewS/_buildManifest.js",
+    "static/bw5ZyxNceKY52yxsIpewS/_ssgManifest.js",
+    "static/bw5ZyxNceKY52yxsIpewS/_clientMiddlewareManifest.js"
   ],
   "rootMainFiles": [
     "static/chunks/120gq8w9i9o8g.js",

package/ui/.next/prerender-manifest.json CHANGED Viewed

@@ -102,8 +102,8 @@
   "dynamicRoutes": {},
   "notFoundRoutes": [],
   "preview": {
-    "previewModeId": "259de61dc959206e75574e0518e18e16",
-    "previewModeSigningKey": "7e86eeeaf2ad4affdcc25109b966a3e0f836308d4cd982d39b5b04f49eb52715",
-    "previewModeEncryptionKey": "ba095c02436654b5d800d69a73ef6cda4c7e6512ef6616a3fe386d32c04b782e"
+    "previewModeId": "4bca4ef7ab7e6b0e3f6eadd427fdf4d1",
+    "previewModeSigningKey": "cea2b4fe95c40f06cfe7051da27aa2e4c3587f1aabcbef2665abc7faae9bc89b",
+    "previewModeEncryptionKey": "c8a89d44d53028881943d384e34aee74527b0113ca602ba082ead4a8bce7dc10"
   }
 }