@meridiona/meridian-darwin-arm64 1.5.0 → 1.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/VERSION +1 -1
  2. package/bin/meridian +0 -0
  3. package/package.json +1 -1
  4. package/scripts/meridian-cli.sh +68 -98
  5. package/services/agents/_prompts.py +21 -13
  6. package/services/agents/run_task_linker_mlx.py +10 -6
  7. package/services/pyproject.toml +1 -1
  8. package/services/skills/activity/task-classifier/SKILL.md +14 -12
  9. package/services/tests/evals/classify_session.py +122 -0
  10. package/services/tests/evals/metrics.py +34 -5
  11. package/ui/.next/BUILD_ID +1 -1
  12. package/ui/.next/build-manifest.json +3 -3
  13. package/ui/.next/prerender-manifest.json +3 -3
  14. package/ui/.next/server/app/_global-error.html +1 -1
  15. package/ui/.next/server/app/_global-error.rsc +1 -1
  16. package/ui/.next/server/app/_global-error.segments/__PAGE__.segment.rsc +1 -1
  17. package/ui/.next/server/app/_global-error.segments/_full.segment.rsc +1 -1
  18. package/ui/.next/server/app/_global-error.segments/_head.segment.rsc +1 -1
  19. package/ui/.next/server/app/_global-error.segments/_index.segment.rsc +1 -1
  20. package/ui/.next/server/app/_global-error.segments/_tree.segment.rsc +1 -1
  21. package/ui/.next/server/app/_not-found/page_client-reference-manifest.js +1 -1
  22. package/ui/.next/server/app/_not-found.html +1 -1
  23. package/ui/.next/server/app/_not-found.rsc +2 -2
  24. package/ui/.next/server/app/_not-found.segments/_full.segment.rsc +2 -2
  25. package/ui/.next/server/app/_not-found.segments/_head.segment.rsc +1 -1
  26. package/ui/.next/server/app/_not-found.segments/_index.segment.rsc +2 -2
  27. package/ui/.next/server/app/_not-found.segments/_not-found/__PAGE__.segment.rsc +1 -1
  28. package/ui/.next/server/app/_not-found.segments/_not-found.segment.rsc +1 -1
  29. package/ui/.next/server/app/_not-found.segments/_tree.segment.rsc +2 -2
  30. package/ui/.next/server/app/api/settings/route.js.nft.json +1 -1
  31. package/ui/.next/server/app/index.html +1 -1
  32. package/ui/.next/server/app/index.rsc +3 -3
  33. package/ui/.next/server/app/index.segments/__PAGE__.segment.rsc +2 -2
  34. package/ui/.next/server/app/index.segments/_full.segment.rsc +3 -3
  35. package/ui/.next/server/app/index.segments/_head.segment.rsc +1 -1
  36. package/ui/.next/server/app/index.segments/_index.segment.rsc +2 -2
  37. package/ui/.next/server/app/index.segments/_tree.segment.rsc +2 -2
  38. package/ui/.next/server/app/page/react-loadable-manifest.json +1 -1
  39. package/ui/.next/server/app/page_client-reference-manifest.js +1 -1
  40. package/ui/.next/server/app/settings/page_client-reference-manifest.js +1 -1
  41. package/ui/.next/server/app/settings.html +1 -1
  42. package/ui/.next/server/app/settings.rsc +2 -2
  43. package/ui/.next/server/app/settings.segments/_full.segment.rsc +2 -2
  44. package/ui/.next/server/app/settings.segments/_head.segment.rsc +1 -1
  45. package/ui/.next/server/app/settings.segments/_index.segment.rsc +2 -2
  46. package/ui/.next/server/app/settings.segments/_tree.segment.rsc +2 -2
  47. package/ui/.next/server/app/settings.segments/settings/__PAGE__.segment.rsc +1 -1
  48. package/ui/.next/server/app/settings.segments/settings.segment.rsc +1 -1
  49. package/ui/.next/server/chunks/[root-of-the-server]__0o.3lhr._.js +1 -1
  50. package/ui/.next/server/chunks/[root-of-the-server]__0t62i3x._.js +8 -5
  51. package/ui/.next/server/middleware-build-manifest.js +3 -3
  52. package/ui/.next/server/pages/404.html +1 -1
  53. package/ui/.next/server/pages/500.html +1 -1
  54. package/ui/.next/server/server-reference-manifest.js +1 -1
  55. package/ui/.next/server/server-reference-manifest.json +1 -1
  56. package/ui/.next/static/chunks/0.e6xqgbosj58.css +4 -0
  57. package/ui/.next/static/chunks/0f2ikqegp34r..js +1 -0
  58. package/ui/.next/static/chunks/{17.0_3q.gw7x2.js → 0puw3vthktvhx.js} +1 -1
  59. package/ui/app/api/active/route.ts +47 -0
  60. package/ui/app/api/coding-agents/route.ts +53 -0
  61. package/ui/app/api/queue-review/route.ts +83 -0
  62. package/ui/app/api/settings/route.ts +18 -0
  63. package/ui/app/api/tasks/route.ts +141 -0
  64. package/ui/app/api/today/route.ts +299 -0
  65. package/ui/app/api/week/route.ts +86 -0
  66. package/ui/app/api/worklogs/[id]/route.ts +144 -0
  67. package/ui/app/api/worklogs/route.ts +134 -0
  68. package/ui/app/globals.css +177 -0
  69. package/ui/app/layout.tsx +33 -0
  70. package/ui/app/page.tsx +106 -0
  71. package/ui/app/settings/page.tsx +6 -0
  72. package/ui/components/CommandBar.tsx +103 -0
  73. package/ui/components/DayTimeline.tsx +126 -0
  74. package/ui/components/Nav.tsx +45 -0
  75. package/ui/components/RefreshTrigger.tsx +15 -0
  76. package/ui/components/ShapeOfDay.tsx +150 -0
  77. package/ui/components/Sidebar.tsx +130 -0
  78. package/ui/components/TaskBadge.tsx +223 -0
  79. package/ui/components/TodayMetrics.tsx +110 -0
  80. package/ui/components/TweaksPanel.tsx +200 -0
  81. package/ui/components/atoms.tsx +254 -0
  82. package/ui/components/ui/NumberStepper.tsx +128 -0
  83. package/ui/components/ui/Select.tsx +109 -0
  84. package/ui/components/ui/Switch.tsx +49 -0
  85. package/ui/components/views/QueueView.tsx +171 -0
  86. package/ui/components/views/SessionsView.tsx +145 -0
  87. package/ui/components/views/SettingsView.tsx +217 -0
  88. package/ui/components/views/TasksView.tsx +208 -0
  89. package/ui/components/views/TodayView.tsx +522 -0
  90. package/ui/components/views/WeekView.tsx +201 -0
  91. package/ui/components/views/WorklogsView.tsx +379 -0
  92. package/ui/instrumentation.ts +8 -0
  93. package/ui/lib/app-colors.ts +40 -0
  94. package/ui/lib/category-colors.ts +30 -0
  95. package/ui/lib/date-utils.ts +16 -0
  96. package/ui/lib/db-write.ts +41 -0
  97. package/ui/lib/db.ts +45 -0
  98. package/ui/lib/format.ts +35 -0
  99. package/ui/lib/intervals.ts +128 -0
  100. package/ui/lib/observability.ts +88 -0
  101. package/ui/lib/settings.ts +73 -0
  102. package/ui/lib/theme-context.tsx +104 -0
  103. package/ui/lib/types.ts +106 -0
  104. package/ui/next.config.ts +35 -0
  105. package/ui/package-lock.json +4446 -0
  106. package/ui/package.json +1 -1
  107. package/ui/postcss.config.mjs +5 -0
  108. package/ui/tsconfig.json +41 -0
  109. package/ui/tsconfig.tsbuildinfo +1 -0
  110. package/ui/.next/static/chunks/0laaz3a6vqgl~.css +0 -4
  111. package/ui/.next/static/chunks/16f557ymkx721.js +0 -1
  112. /package/ui/.next/static/{mXgl3Yg8KlSupvjSyOZCC → bw5ZyxNceKY52yxsIpewS}/_buildManifest.js +0 -0
  113. /package/ui/.next/static/{mXgl3Yg8KlSupvjSyOZCC → bw5ZyxNceKY52yxsIpewS}/_clientMiddlewareManifest.js +0 -0
  114. /package/ui/.next/static/{mXgl3Yg8KlSupvjSyOZCC → bw5ZyxNceKY52yxsIpewS}/_ssgManifest.js +0 -0
package/VERSION CHANGED
@@ -1 +1 @@
1
- 1.5.0
1
+ 1.7.0
package/bin/meridian CHANGED
Binary file
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@meridiona/meridian-darwin-arm64",
3
- "version": "1.5.0",
3
+ "version": "1.7.0",
4
4
  "description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
5
5
  "homepage": "https://github.com/Meridiona/meridian",
6
6
  "repository": {
@@ -190,113 +190,83 @@ cmd_logs() {
190
190
  }
191
191
 
192
192
  # --- doctor ---
193
- _check() {
194
- local desc="$1" pass="$2" reason="${3:-}"
195
- if [[ "$pass" == "1" ]]; then
196
- ok "$desc"
197
- else
198
- err "$desc${reason:+ ${reason}}"
199
- DOCTOR_FAILURES=$(( DOCTOR_FAILURES + 1 ))
200
- fi
193
+ # The daemon binary owns the comprehensive, colourised, by-daemon health table
194
+ # (system, meridian daemon, screenpipe, mlx-server, jira, ui, mcp). The wrapper
195
+ # just delegates to it; if that binary is missing or stale, a minimal bash-only
196
+ # fallback runs so `meridian doctor` always produces something useful.
197
+
198
+ _group() { printf "\n ── %s ─────────────────────────────────────────────\n" "$1"; }
199
+
200
+ _row() { # status check detail
201
+ local status="$1" check="$2" detail="${3:-}" glyph
202
+ case "$status" in
203
+ ok) glyph="✓" ;;
204
+ warn) glyph="⊘" ;;
205
+ info) glyph="·" ;;
206
+ *) glyph="✗"; DOCTOR_FAILURES=$(( DOCTOR_FAILURES + 1 )) ;;
207
+ esac
208
+ printf " %s %-26s %s\n" "$glyph" "$check" "$detail"
201
209
  }
202
210
 
203
- _pid_from_print() {
204
- local label="$1"
205
- local output
206
- set +e
207
- output="$(launchctl print "${GUI_TARGET}/${label}" 2>/dev/null)"
208
- local rc=$?
209
- set -e
210
- [[ $rc -ne 0 ]] && return 1
211
- printf '%s\n' "$output" | grep -E '^\s+pid\s*=' | grep -oE '[0-9]+' | head -1
211
+ _plist_row() { # label check-label
212
+ local plist="${LAUNCH_AGENTS}/$1.plist"
213
+ if [[ -f "$plist" ]] && plutil -lint "$plist" >/dev/null 2>&1; then
214
+ _row ok "$2" ""
215
+ else
216
+ _row fail "$2" "run ./install.sh"
217
+ fi
212
218
  }
213
219
 
214
- cmd_doctor() {
215
- DOCTOR_FAILURES=0
216
-
217
- # 1. macOS
218
- _check "macOS" "$([[ "$(uname -s)" == "Darwin" ]] && echo 1 || echo 0)" "run on macOS"
219
-
220
- # 2. daemon binary
221
- local bin_ok=0
220
+ _daemon_bin() {
221
+ local p
222
222
  for p in /usr/local/bin/meridian-daemon "${HOME}/.local/bin/meridian-daemon"; do
223
- [[ -x "$p" ]] && bin_ok=1 && break
223
+ [[ -x "$p" ]] && { printf '%s\n' "$p"; return 0; }
224
224
  done
225
- _check "daemon binary exists and is executable" "$bin_ok" "run ./install.sh"
226
-
227
- # 3. daemon plist lints
228
- local dplist="${LAUNCH_AGENTS}/${LABEL_DAEMON}.plist"
229
- if [[ -f "$dplist" ]]; then
230
- set +e; plutil -lint "$dplist" >/dev/null 2>&1; local pl=$?; set -e
231
- _check "daemon plist installed and valid" "$([[ $pl -eq 0 ]] && echo 1 || echo 0)" "plutil -lint ${dplist}"
232
- else
233
- _check "daemon plist installed and valid" "0" "run ./install.sh"
234
- fi
235
-
236
- # 4. daemon running
237
- local dpid; dpid="$(_pid_from_print "$LABEL_DAEMON" 2>/dev/null)" || dpid=""
238
- _check "daemon running (pid ${dpid:-?})" "$([[ -n "$dpid" ]] && echo 1 || echo 0)" "meridian start"
239
-
240
- # 5. user config
241
- _check "user config <repo>/.env exists" "$([[ -f "${REPO_ROOT}/.env" ]] && echo 1 || echo 0)" "run ./install.sh"
242
-
243
- # 6. screenpipe plist lints
244
- local spplist="${LAUNCH_AGENTS}/${LABEL_SCREENPIPE}.plist"
245
- if [[ -f "$spplist" ]]; then
246
- set +e; plutil -lint "$spplist" >/dev/null 2>&1; local spl=$?; set -e
247
- _check "screenpipe plist installed and valid" "$([[ $spl -eq 0 ]] && echo 1 || echo 0)" "plutil -lint ${spplist}"
248
- else
249
- _check "screenpipe plist installed and valid" "0" "run ./install.sh"
250
- fi
251
-
252
- # 7. screenpipe binary in PATH
253
- set +e; command -v screenpipe >/dev/null 2>&1; local spbin=$?; set -e
254
- _check "screenpipe binary in PATH" "$([[ $spbin -eq 0 ]] && echo 1 || echo 0)" "install screenpipe (npm install -g screenpipe)"
255
-
256
- # 8. screenpipe DB
257
- _check "screenpipe DB exists" "$([[ -f "${HOME}/.screenpipe/db.sqlite" ]] && echo 1 || echo 0)" "install and run screenpipe"
258
-
259
- # 9. screenpipe running
260
- set +e; pgrep -x screenpipe >/dev/null 2>&1; local sp=$?; set -e
261
- _check "screenpipe running" "$([[ $sp -eq 0 ]] && echo 1 || echo 0)" "start screenpipe"
262
-
263
- # 10. meridian DB
264
- if [[ -f "${HOME}/.meridian/meridian.db" ]]; then
265
- ok "meridian DB exists"
266
- else
267
- warn "meridian DB not yet created (will be on first run)"
268
- fi
269
-
270
- # 11. Python venv
271
- local venv_py="${REPO_ROOT}/services/.venv/bin/python"
272
- local venv_ok=0
273
- if [[ -x "$venv_py" ]]; then
274
- set +e; "$venv_py" -c "import run_agent" 2>/dev/null; local vi=$?; set -e
275
- [[ $vi -eq 0 ]] && venv_ok=1
276
- fi
277
- _check "Python venv and run_agent importable" "$venv_ok" "bash scripts/setup-services.sh"
278
-
279
- # 12. MCP server built
280
- _check "MCP server built" "$([[ -f "${REPO_ROOT}/packages/meridian-mcp/dist/index.js" ]] && echo 1 || echo 0)" "cd packages/meridian-mcp && npm run build"
225
+ return 1
226
+ }
281
227
 
282
- # 13. UI plist lints
283
- local uiplist="${LAUNCH_AGENTS}/${LABEL_UI}.plist"
284
- if [[ -f "$uiplist" ]]; then
285
- set +e; plutil -lint "$uiplist" >/dev/null 2>&1; local uil=$?; set -e
286
- _check "UI plist installed and valid" "$([[ $uil -eq 0 ]] && echo 1 || echo 0)" "plutil -lint ${uiplist}"
287
- else
288
- _check "UI plist installed and valid" "0" "run ./install.sh"
228
+ cmd_doctor() {
229
+ local bin
230
+ if bin="$(_daemon_bin)"; then
231
+ set +e
232
+ if [[ "$*" == *--fix* ]]; then
233
+ # --fix has interactive guided prompts — the user is present, so run
234
+ # without the alarm (which would kill a prompt waiting for input).
235
+ "$bin" doctor "$@"
236
+ else
237
+ # Guard with a perl alarm so a stale binary (one that predates
238
+ # `doctor` and would fall through to starting the daemon) can never
239
+ # hang the terminal. The Rust report colourises itself on a tty.
240
+ perl -e 'alarm shift @ARGV; exec @ARGV' 30 "$bin" doctor "$@"
241
+ fi
242
+ local rc=$?
243
+ set -e
244
+ # 0 = healthy, 1 = critical issues found — both are real doctor runs.
245
+ if [[ $rc -eq 0 || $rc -eq 1 ]]; then return $rc; fi
246
+ warn "health engine timed out or is stale — rebuild: cargo build --release"
289
247
  fi
248
+ _doctor_fallback
249
+ }
290
250
 
291
- # 14. UI built
292
- _check "UI built (ui/.next exists)" "$([[ -d "${REPO_ROOT}/ui/.next" ]] && echo 1 || echo 0)" "cd ui && npm ci && npm run build"
293
-
251
+ # Minimal bash-only checks for when the daemon binary is unavailable.
252
+ _doctor_fallback() {
253
+ DOCTOR_FAILURES=0
254
+ printf "\n Meridian doctor (fallback — daemon binary unavailable)\n"
255
+ printf " ════════════════════════════════════════════════════════\n"
256
+ _group "system"
257
+ _row "$([[ "$(uname -s)" == "Darwin" ]] && echo ok || echo fail)" "macOS" ""
258
+ _row "$([[ -f "${REPO_ROOT}/.env" ]] && echo ok || echo fail)" "config (.env)" ""
259
+ _group "services (plists)"
260
+ _plist_row "$LABEL_DAEMON" "daemon plist"
261
+ _plist_row "$LABEL_SCREENPIPE" "screenpipe plist"
262
+ _plist_row "$LABEL_MLX" "mlx plist"
263
+ _plist_row "$LABEL_UI" "ui plist"
264
+ _group "builds"
265
+ _row "$([[ -f "${REPO_ROOT}/packages/meridian-mcp/dist/index.js" ]] && echo ok || echo fail)" "mcp built" ""
266
+ _row "$([[ -d "${REPO_ROOT}/ui/.next" ]] && echo ok || echo fail)" "ui built" ""
294
267
  echo
295
- if [[ $DOCTOR_FAILURES -eq 0 ]]; then
296
- ok "all checks passed"
297
- else
298
- printf " %d check%s failed\n" "$DOCTOR_FAILURES" "$([[ $DOCTOR_FAILURES -ne 1 ]] && echo s || true)"
299
- fi
268
+ _row info "next step" "cargo build --release && meridian doctor"
269
+ [[ $DOCTOR_FAILURES -eq 0 ]]
300
270
  }
301
271
 
302
272
  # --- config ---
@@ -506,7 +476,7 @@ case "$CMD" in
506
476
  restart) cmd_restart ;;
507
477
  status) cmd_status ;;
508
478
  logs) cmd_logs "$@" ;;
509
- doctor) cmd_doctor ;;
479
+ doctor) cmd_doctor "$@" ;;
510
480
  config) cmd_config "$@" ;;
511
481
  dev) cmd_dev "$@" ;;
512
482
  uninstall) cmd_uninstall ;;
@@ -11,12 +11,17 @@ _VSCODE_BANNER_RE = re.compile(
11
11
  re.IGNORECASE | re.DOTALL,
12
12
  )
13
13
 
14
- # Max chars of session_text included in the prompt. Default 2500 (~625 tokens at
15
- # 4 chars/token) enough to identify files, ticket keys, and recent activity
16
- # without inflating context in production. Override via SESSION_TEXT_CAP env var
17
- # for eval experiments; set to 0 to disable truncation entirely (caller is then
18
- # responsible for not blowing the model's context window).
19
- SESSION_TEXT_CAP = int(os.environ.get("SESSION_TEXT_CAP", "2500"))
14
+ # Max chars of session_text included in the prompt. Default 10000 (~2500 tokens
15
+ # at 4 chars/token). The old 2500 cap kept only the FIRST frames of a multi-frame
16
+ # OCR capture, so when a session spanned more than one window/app the later
17
+ # (often foreground) activity was silently dropped — e.g. a session whose head
18
+ # showed an IDE but whose tail showed the user had moved to a different app/
19
+ # project got misclassified on the stale head. The classifier model has a 128K
20
+ # context window, so 2500 was far too conservative; 10000 comfortably holds a
21
+ # full multi-frame session while staying trivial for the model. Override via
22
+ # SESSION_TEXT_CAP env var; set to 0 to disable truncation entirely (caller is
23
+ # then responsible for not blowing the model's context window).
24
+ SESSION_TEXT_CAP = int(os.environ.get("SESSION_TEXT_CAP", "10000"))
20
25
 
21
26
 
22
27
  def _fmt_dur(duration_s: int | float) -> str:
@@ -51,10 +56,12 @@ def _format_session(session: dict) -> str:
51
56
  parts.append(f"time: {time_range}{dur_str}")
52
57
  elif dur is not None:
53
58
  parts.append(f"duration: {_fmt_dur(dur)}")
54
- cat = session.get("category")
55
- cat_conf = session.get("confidence")
56
- if cat:
57
- parts.append(f"category: {cat} (confidence {round(cat_conf or 0.0, 2)})")
59
+ # NOTE: the rule-based ETL category is intentionally NOT included here. It is
60
+ # a cheap heuristic derived from the SAME app/window/OCR signals the LLM
61
+ # already sees, so feeding it in only injects a correlated prior — when the
62
+ # heuristic is wrong (e.g. background-window OCR bleed), it biases the LLM
63
+ # toward the same mistake. The classifier re-derives category from the raw
64
+ # evidence and its output overwrites the rule-based value anyway.
58
65
  titles = session.get("window_titles") or []
59
66
  if titles:
60
67
  parts.append("top windows:")
@@ -116,7 +123,6 @@ def _format_recent_sessions(sessions: list[dict]) -> str:
116
123
  dur_str = _fmt_dur(s.get("duration_s") or 0)
117
124
  task_key = s.get("task_key")
118
125
  routing = s.get("task_routing") # None means unclassified
119
- category = (s.get("category") or "").strip()
120
126
  if task_key:
121
127
  target = f"→ {task_key}"
122
128
  elif routing == "untracked":
@@ -126,8 +132,10 @@ def _format_recent_sessions(sessions: list[dict]) -> str:
126
132
  target = "→ [pending]"
127
133
  else:
128
134
  target = "→ [overhead]"
129
- cat_tag = f" [{category}]" if category else ""
130
- rows.append(f" {time_str} {app:<14} {dur_str:<7} {target}{cat_tag}")
135
+ # Category is intentionally omitted — recent-context is a task-continuity
136
+ # signal only; carrying the (rule-based or prior-LLM) category tag would
137
+ # feed a category prior back into classification.
138
+ rows.append(f" {time_str} {app:<14} {dur_str:<7} {target}")
131
139
  return "\n".join(rows)
132
140
 
133
141
 
@@ -83,10 +83,10 @@ class SessionClassification(BaseModel):
83
83
  ] = Field(
84
84
  ...,
85
85
  description=(
86
- "The single best activity category for this session. A rule-based "
87
- "guess is supplied in the input confirm it or correct it from the "
88
- "evidence. Declared early in the schema so FSM decoding always emits "
89
- "it before the long session_summary field."
86
+ "The single best activity category for this session. Derive it from "
87
+ "the evidence (app, window titles, screen content); no category is "
88
+ "supplied in the input. Declared early in the schema so FSM decoding "
89
+ "always emits it before the long session_summary field."
90
90
  ),
91
91
  )
92
92
  category_confidence: float = Field(
@@ -232,9 +232,13 @@ def _fetch_session(
232
232
  def _fetch_recent_sessions(
233
233
  con: _sqlite3.Connection, before_id: int
234
234
  ) -> list[dict[str, Any]]:
235
+ # Recent context is a task-continuity signal only: app, time, duration and
236
+ # which ticket each recent session mapped to. We deliberately do NOT select
237
+ # session_text/excerpt or category — recent OCR is noise here and a category
238
+ # tag would feed a prior back into classification. (session_text is still
239
+ # referenced in WHERE only to skip empty-capture rows.)
235
240
  rows = con.execute(
236
- "SELECT app_name, started_at, duration_s, task_key, task_routing, category,"
237
- " COALESCE(SUBSTR(session_text, 1, 200), '') AS text_excerpt"
241
+ "SELECT app_name, started_at, duration_s, task_key, task_routing"
238
242
  " FROM app_sessions"
239
243
  " WHERE id < ? AND duration_s > 1 AND COALESCE(session_text,'') != ''"
240
244
  " ORDER BY id DESC LIMIT ?",
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "meridian-agents"
7
- version = "1.5.0"
7
+ version = "1.7.0"
8
8
  description = "Meridian agents — hermes task linking and Jira progress updates for meridian.db"
9
9
  requires-python = ">=3.11"
10
10
  authors = [{ name = "Meridiona" }]
@@ -24,7 +24,7 @@ The task classifier sits at the center of Meridian's workflow understanding:
24
24
 
25
25
  ## Classification Decision Tree
26
26
 
27
- For each session, you must decide:
27
+ For each session, decide in this order. **Core principle: do NOT try to fit every session to an existing ticket. Assign a `task_key` only when the session's OWN evidence clearly matches that specific ticket's scope. Most real work that isn't an obvious match is `untracked`, not a forced link.**
28
28
 
29
29
  ### 1. Is this overhead?
30
30
  If the session is **idle, music, system settings, or clearly personal/unrelated activity** → return:
@@ -33,27 +33,29 @@ If the session is **idle, music, system settings,or clearly personal/unrelated a
33
33
  ```
34
34
  **overhead is a hard discard.** These sessions are thrown away — never surfaced, never used for inference, never create tasks. When in doubt between overhead and untracked, ask: *"Would a manager care that this happened?"* If no, it's overhead.
35
35
 
36
- ### 2. Is this work-related?
37
- If the session shows **any real work signal** (coding, research, meetings, writing, debugging, reviewing, learning) but **no Jira candidate matches** → mark as **untracked** and return:
36
+ ### 2. Is this real work that ISN'T clearly one of the candidate tickets? → untracked
37
+ If the session shows **any real work signal** (coding, research, meetings, writing, debugging, reviewing, learning) but it does **not clearly match the scope of a candidate ticket** → mark as **untracked**:
38
38
  ```json
39
39
  {"task_key": null, "confidence": 0.6-0.8, "session_type": "untracked", "routing": "queue"}
40
40
  ```
41
- **untracked sessions are kept and used downstream** for workload analysis, capacity reporting, and automatic new-task creation. Mark dimensions to capture *what* the work was. Examples that must be `untracked` (not `overhead`): standups, retros, code reviews on untracked PRs, config/infra housekeeping, general repo exploration, internal tool usage.
41
+ **This is the important, common case — and it is what `untracked` MEANS: the user genuinely did this work, but there is no Jira ticket for it yet.** Downstream, Meridian uses untracked sessions to **create or update** the matching Jira task. So it is critical that you do **not** shoehorn this work into an unrelated existing ticket just because it is the only candidate available, or because recent sessions were on it. **A wrong task link is worse than `untracked`** — it pollutes a real ticket's worklog and hides the genuine untracked work that should have spawned its own ticket. When the evidence doesn't clearly fit a candidate, choose `untracked`.
42
42
 
43
- ### 3. Can it map to an open Jira ticket?
44
- If the session evidence **directly or contextually matches** an open ticket → return:
43
+ `untracked` sessions are kept and used downstream (workload analysis, capacity reporting, new-task creation). Mark dimensions to capture *what* the work was. Examples that must be `untracked` (not `overhead`): standups, retros, code reviews on untracked PRs, config/infra housekeeping, general repo exploration, general research, **and any work on a feature/bug/chore that has no matching candidate ticket**.
44
+
45
+ ### 3. Does it CLEARLY map to one specific candidate ticket? → task
46
+ Assign a `task_key` **only** when the session's own evidence (window titles, OCR, file/branch names, an explicit ticket-key mention) directly matches the **scope described in that ticket's title/description** → return:
45
47
  ```json
46
48
  {"task_key": "KEY-123", "confidence": 0.50-0.90, "session_type": "task", "routing": "auto"}
47
49
  ```
48
- Cite the evidence (window title, OCR snippet, context from previous sessions) and infer activity dimensions.
50
+ Recent-session continuity may *support* a match, but **continuity alone is never enough** — the current session must carry its own evidence that fits the ticket. If the active app/window shows the user is now on something else (a different project, a meeting, another repo, a doc for another team), classify by **that**, not by what they were doing minutes ago. Cite the specific evidence, and infer activity dimensions.
49
51
 
50
52
  ## Your inputs
51
53
 
52
54
  The user message contains:
53
55
 
54
- - **SESSION** — app, category (with confidence), duration, top window titles, and counts of OCR/audio captures.
56
+ - **SESSION** — app, duration, top window titles, and the screen content (OCR / a11y). Decide the category yourself from this evidence; no category is provided.
55
57
  - **CANDIDATE TICKETS** — all open Jira tickets. These are the only tickets you may choose from.
56
- - **RECENT SESSIONS** (previous 5) — context to help disambiguate. Example: *"User was on KAN-42 (coding) 5 minutes ago, then Slack, now back in VS Code."* likely same task, even if Slack doesn't directly match KAN-42.
58
+ - **RECENT SESSIONS** (previous 5) — app / time / duration / which ticket each mapped to (no screen text). A **weak disambiguation hint only**: it can support a match when the current session ALSO has matching evidence, but it must never override what the current session itself shows. Recent activity on a ticket does not make the current session that ticket.
57
59
 
58
60
  ## Available capabilities
59
61
 
@@ -98,7 +100,7 @@ Reply with ONE valid JSON object — no preamble, no markdown fences, no follow-
98
100
  ### Field rules
99
101
  - `task_key` — must be one of the supplied candidates, or `null`. Never invent a key.
100
102
  - `confidence` — see Scoring heuristics section for exact ranges per outcome type.
101
- - `category` — the single best activity category (see taxonomy below). The input carries a rule-based guess; confirm it or correct it from the evidence.
103
+ - `category` — the single best activity category (see taxonomy below). Derive it yourself from the evidence (app, window titles, screen content); no category is provided in the input.
102
104
  - `category_confidence` — how certain you are about `category`, `0.0`–`1.0`.
103
105
  - `category_explanation` — ONE concise sentence justifying the category, citing the app / window titles / OCR evidence. Shown in the dashboard next to the category.
104
106
  - `session_type` — `"task"` links to Jira; `"overhead"` is thrown away; `"untracked"` is kept for workload analysis.
@@ -199,7 +201,7 @@ You have access to **the previous 5 sessions** to disambiguate the current sessi
199
201
  - Session 2 (3 min ago): Slack, discussing PR review for KAN-42 → **if related to same work**, task_key: KAN-42, confidence: 0.75 (work mention + prior context)
200
202
  - Session 3 (now): VS Code, editing same file → task_key: KAN-42, confidence: 0.85 (context continuity)
201
203
 
202
- **Decision:** If Session 2 (Slack) content shows it's about the same work (discussing the work or searching about it), classify it to **KAN-42** using context from Session 1. If Slack is generic work discussion with no connection to the prior task, return `null` with `session_type: "untracked"`.
204
+ **Decision:** Only link Session 2 to KAN-42 if Session 2's **own content** shows it is about that work (the OCR/window discusses or searches the KAN-42 work). If Session 2 is generic, OR shows the user has moved to *different* work (another project, another team's doc, an unrelated meeting), return `null` with `session_type: "untracked"` (or a different ticket if its own evidence matches one) — **do not inherit KAN-42 just because it was the recent task.** Continuity is a tie-breaker between plausible matches, never a substitute for current-session evidence.
203
205
 
204
206
  Example reasoning for Session 2 (if task-related): `"Slack discusses PR review for KAN-42 implementation mentioned in prior VS Code session; linked via work context."`
205
207
 
@@ -208,7 +210,7 @@ Example reasoning for Session 2 (if task-related): `"Slack discusses PR review f
208
210
  **When task_key is not null (matched to a ticket):**
209
211
  - **Task key + work alignment** — `confidence ≥ 0.90`, `session_type: "task"`
210
212
  - **Work description alignment** — `0.75–0.85`, `session_type: "task"`
211
- - **Context continuity** — `0.75–0.85`, `session_type: "task"`
213
+ - **Context continuity (current session ALSO has matching evidence)** — `0.75–0.85`, `session_type: "task"`. Continuity with no current-session evidence is **not** a task — use `untracked`.
212
214
  - **Generic project-level match** — `0.50–0.65`, `session_type: "task"`
213
215
  - **Task key only** — `0.60–0.75`, `session_type: "task"` (lower than key+alignment because work intent unclear)
214
216
 
@@ -0,0 +1,122 @@
1
+ """Dry-run the task classifier for one or more session ids — read-only.
2
+
3
+ Drives the SAME code path the daemon uses: POSTs the session id(s) to the
4
+ running MLX server's /classify_sessions endpoint, which runs `_classify_one`
5
+ (fetch session + recent context + pm_tasks → build prompt → model → parse).
6
+ The Python endpoint only RETURNS the result; the DB write is done separately by
7
+ the Rust daemon, so calling this never mutates app_sessions — you see exactly
8
+ what the classifier WOULD output right now, with the current code and prompt.
9
+
10
+ Usage:
11
+ services/.venv/bin/python services/tests/evals/classify_session.py 20128
12
+ services/.venv/bin/python services/tests/evals/classify_session.py 20128 20127 --show-prompt
13
+ MLX_SERVER_URL=http://127.0.0.1:7823 ... classify_session.py 20128
14
+
15
+ The MLX server must be running (meridian status / port 7823). It uses the
16
+ already-loaded model, so this is fast — no in-process model load.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import sqlite3
25
+ import sys
26
+ import urllib.request
27
+ from pathlib import Path
28
+
29
+ _DEFAULT_URL = os.environ.get("MLX_SERVER_URL", "http://127.0.0.1:7823").rstrip("/")
30
+ _DEFAULT_DB = os.path.expanduser(
31
+ os.environ.get("MERIDIAN_DB", "~/.meridian/meridian.db")
32
+ )
33
+
34
+
35
def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
    """Rebuild the classifier prompt for one session via the production builder.

    Opens ``db_path`` read-only (SQLite URI with ``mode=ro``), loads the
    session row, its recent-session context and the PM task list through the
    fetch helpers of ``agents.run_task_linker_mlx``, and passes them to
    ``agents._prompts.build_user_message``.

    Args:
        db_path: Filesystem path of the meridian SQLite database.
        session_id: Id of the session whose prompt should be rebuilt.

    Returns:
        The prompt text, or ``None`` when ``_fetch_session`` returns ``None``
        for ``session_id``.
    """
    # Import lazily so the common path (no --show-prompt) needs no agents deps.
    from contextlib import closing

    services_dir = str(Path(__file__).resolve().parents[2])  # services/
    # Guarded so that rebuilding prompts for several ids in one run does not
    # stack duplicate entries on sys.path.
    if services_dir not in sys.path:
        sys.path.insert(0, services_dir)
    from agents._prompts import build_user_message
    from agents.run_task_linker_mlx import (
        _fetch_pm_tasks,
        _fetch_recent_sessions,
        _fetch_session,
    )

    # closing() releases the connection on every exit path, including the
    # early return below; sqlite3's own context manager would only
    # commit/roll back and leave the handle open.
    with closing(sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)) as con:
        con.row_factory = sqlite3.Row
        raw = _fetch_session(con, session_id)
        if raw is None:
            return None
        recent = _fetch_recent_sessions(con, session_id)
        pm_tasks = _fetch_pm_tasks(con)

    # NOTE(review): `raw` is used as a dict-like object (.get / []);
    # presumably _fetch_session converts the row — confirm against the helper.
    session_text = raw.get("session_text") or ""
    # Sessions carrying a claude_session_uuid and a non-blank summary are
    # described by that summary instead of the raw session text.
    if raw.get("claude_session_uuid") and (raw.get("session_summary") or "").strip():
        session_text = raw["session_summary"]

    session = {
        "id": session_id,
        "app_name": raw.get("app_name"),
        "started_at": raw.get("started_at", ""),
        "ended_at": raw.get("ended_at", ""),
        "duration_s": raw.get("duration_s"),
        "session_text": session_text,
        "session_text_source": raw.get("session_text_source", "unknown"),
        # Stored as a JSON string; a missing/empty value becomes an empty list.
        "window_titles": json.loads(raw.get("window_titles") or "[]"),
        "category": raw.get("category"),
        "confidence": raw.get("confidence", 0.0),
        # Audio snippets are never reconstructed here.
        "audio_snippets": [],
    }
    return build_user_message(session, pm_tasks, recent_sessions=recent)
70
+
71
+
72
def _classify(url: str, db_path: str, session_ids: list[int]) -> list[dict]:
    """POST the session ids to the server's ``/classify_sessions`` endpoint.

    The request body is JSON with the keys ``session_ids`` and
    ``meridian_db``. The call waits up to 600 seconds for a response.

    Args:
        url: Base URL of the MLX server, without a trailing slash.
        db_path: Database path forwarded to the server as ``meridian_db``.
        session_ids: Ids of the sessions to classify.

    Returns:
        The ``results`` entry of the decoded JSON response, or an empty list
        when the response has no such key.
    """
    body = {"session_ids": session_ids, "meridian_db": db_path}
    request = urllib.request.Request(
        f"{url}/classify_sessions",
        data=json.dumps(body).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(request, timeout=600) as response:
        decoded = json.loads(response.read())
    return decoded.get("results", [])
81
+
82
+
83
def main() -> int:
    """CLI entry point: classify the given session ids and print the results.

    With ``--show-prompt`` the prompt for each id is rebuilt and printed
    before the server is contacted. With ``--json`` the raw result list is
    dumped; otherwise a fixed set of fields is printed per result.

    Returns:
        0 on success, 1 when the request to the MLX server fails.
    """
    ap = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's Usage examples on their own lines
        # instead of letting argparse re-wrap them into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument("session_ids", nargs="+", type=int, help="session id(s) to classify")
    ap.add_argument("--show-prompt", action="store_true", help="also print the exact prompt sent")
    ap.add_argument("--url", default=_DEFAULT_URL, help=f"MLX server (default {_DEFAULT_URL})")
    ap.add_argument("--db", default=_DEFAULT_DB, help=f"meridian.db (default {_DEFAULT_DB})")
    ap.add_argument("--json", action="store_true", help="print raw JSON results")
    args = ap.parse_args()

    rule = "=" * 30

    if args.show_prompt:
        for sid in args.session_ids:
            prompt = _reconstruct_prompt(args.db, sid)
            print(f"\n{rule} PROMPT for session {sid} {rule}")
            print(prompt if prompt is not None else f"(session {sid} not found)")

    try:
        results = _classify(args.url, args.db, args.session_ids)
    except OSError as exc:
        # URLError/HTTPError, refused connections and socket timeouts all
        # derive from OSError; report them without a traceback since "server
        # not running" is the expected failure mode of this tool.
        print(f"error: request to MLX server at {args.url} failed: {exc}", file=sys.stderr)
        return 1

    if args.json:
        print(json.dumps(results, indent=2))
        return 0

    fields = (
        "task_key",
        "session_type",
        "confidence",
        "category",
        "category_confidence",
        "method",
    )
    for r in results:
        print(f"\n{rule} RESULT for session {r.get('session_id')} {rule}")
        for k in fields:
            print(f" {k:20} = {r.get(k)}")
        reasoning = (r.get("reasoning") or "").strip()
        if reasoning:
            # Padded like the fields above so the '=' column lines up.
            print(f" {'reasoning':20} = {reasoning}")
    return 0
119
+
120
+
121
+ if __name__ == "__main__":
122
+ raise SystemExit(main())
@@ -23,7 +23,6 @@ from pathlib import Path
23
23
  import sys
24
24
 
25
25
  from deepeval.metrics import BaseMetric, TaskCompletionMetric
26
- from deepeval.models import OllamaModel
27
26
  from deepeval.test_case import LLMTestCase
28
27
 
29
28
  _SERVICES_DIR = Path(__file__).parent.parent.parent
@@ -33,7 +32,33 @@ if str(_SERVICES_DIR) not in sys.path:
33
32
  _MODEL = os.environ.get("OLLAMA_MODEL", "gemma4:31b")
34
33
  _HOST = os.environ.get("OLLAMA_HOST", "http://localhost:11434")
35
34
 
36
- _judge = OllamaModel(model=_MODEL, base_url=_HOST)
35
+
36
def _make_judge() -> "object | None":
    """Return the LLM judge instance, or None when it cannot be built.

    The judge is an ``OllamaModel`` configured from ``_MODEL`` and ``_HOST``.
    Both the ``deepeval.models`` import and the construction happen inside
    this function, so any failure in either step (missing package, server
    down, etc.) is turned into a warning and a ``None`` result instead of an
    import-time error for this module. Callers must therefore treat ``None``
    as "judge-based metrics are disabled".
    """
    judge = None
    try:
        from deepeval.models import OllamaModel

        judge = OllamaModel(model=_MODEL, base_url=_HOST)
    except Exception as exc:  # noqa: BLE001 — missing pkg, server down, etc.
        import warnings

        message = (
            f"LLM judge unavailable ({exc}); agent-e2e metrics disabled. "
            "Classifier exact-match metrics are unaffected."
        )
        # stacklevel=2 attributes the warning to the caller of _make_judge.
        warnings.warn(message, stacklevel=2)
    return judge
59
+
60
+
61
+ _judge = _make_judge()
37
62
 
38
63
  _NULL_LITERALS = {"none", "null", "n/a", "nil", "undefined", ""}
39
64
 
@@ -146,9 +171,13 @@ class SessionTypeMatchMetric(BaseMetric):
146
171
  # Metric lists — import these in eval files
147
172
  # ---------------------------------------------------------------------------
148
173
 
149
- AGENT_E2E_METRICS = [
150
- TaskCompletionMetric(threshold=0.5, model=_judge, include_reason=True),
151
- ]
174
+ # Only built when a judge is available — otherwise empty so importing this module
175
+ # (e.g. for the classifier eval) never requires Ollama.
176
+ AGENT_E2E_METRICS = (
177
+ [TaskCompletionMetric(threshold=0.5, model=_judge, include_reason=True)]
178
+ if _judge is not None
179
+ else []
180
+ )
152
181
 
153
182
  CLASSIFIER_METRICS = [
154
183
  TaskKeyMatchMetric(threshold=1.0),
package/ui/.next/BUILD_ID CHANGED
@@ -1 +1 @@
1
- mXgl3Yg8KlSupvjSyOZCC
1
+ bw5ZyxNceKY52yxsIpewS
@@ -7,9 +7,9 @@
7
7
  "static/chunks/03~yq9q893hmn.js"
8
8
  ],
9
9
  "lowPriorityFiles": [
10
- "static/mXgl3Yg8KlSupvjSyOZCC/_buildManifest.js",
11
- "static/mXgl3Yg8KlSupvjSyOZCC/_ssgManifest.js",
12
- "static/mXgl3Yg8KlSupvjSyOZCC/_clientMiddlewareManifest.js"
10
+ "static/bw5ZyxNceKY52yxsIpewS/_buildManifest.js",
11
+ "static/bw5ZyxNceKY52yxsIpewS/_ssgManifest.js",
12
+ "static/bw5ZyxNceKY52yxsIpewS/_clientMiddlewareManifest.js"
13
13
  ],
14
14
  "rootMainFiles": [
15
15
  "static/chunks/120gq8w9i9o8g.js",
@@ -102,8 +102,8 @@
102
102
  "dynamicRoutes": {},
103
103
  "notFoundRoutes": [],
104
104
  "preview": {
105
- "previewModeId": "259de61dc959206e75574e0518e18e16",
106
- "previewModeSigningKey": "7e86eeeaf2ad4affdcc25109b966a3e0f836308d4cd982d39b5b04f49eb52715",
107
- "previewModeEncryptionKey": "ba095c02436654b5d800d69a73ef6cda4c7e6512ef6616a3fe386d32c04b782e"
105
+ "previewModeId": "4bca4ef7ab7e6b0e3f6eadd427fdf4d1",
106
+ "previewModeSigningKey": "cea2b4fe95c40f06cfe7051da27aa2e4c3587f1aabcbef2665abc7faae9bc89b",
107
+ "previewModeEncryptionKey": "c8a89d44d53028881943d384e34aee74527b0113ca602ba082ead4a8bce7dc10"
108
108
  }
109
109
  }