@meridiona/meridian-darwin-arm64 1.58.1 → 1.60.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/VERSION +1 -1
- package/bin/meridian +0 -0
- package/package.json +1 -1
- package/services/agents/_prompts.py +76 -31
- package/services/agents/_system_context.py +31 -45
- package/services/agents/run_task_linker_mlx.py +489 -100
- package/services/agents/server.py +33 -7
- package/services/agents/tests/test_run_task_linker_mlx.py +2 -2
- package/services/observability/dashboards/classifier-debug.json +3 -3
- package/services/pyproject.toml +1 -1
- package/services/skills/activity/task-classifier/SKILL.md +66 -162
- package/services/tests/evals/build_dataset.py +7 -15
- package/services/tests/evals/build_real_goldens.py +180 -0
- package/services/tests/evals/classify_session.py +7 -3
- package/services/tests/evals/data/labels/real_curated.json +166 -0
- package/services/tests/evals/data/labels/real_curated_holdout.json +97 -0
- package/services/tests/evals/data/labels/real_curated_holdout2.json +64 -0
- package/services/tests/evals/metrics.py +75 -0
- package/services/tests/evals/render_seeds.py +35 -14
- package/services/tests/evals/test_classifier.py +13 -3
- package/services/tests/test_continuity_context.py +202 -0
- package/services/tests/test_fetch_pm_tasks.py +120 -0
- package/services/tests/test_format_candidates.py +45 -0
- package/services/tests/test_prompt_cache_equivalence.py +97 -0
- package/ui.tar.gz +0 -0
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
1.58.1
|
|
1
|
+
1.60.0
|
package/bin/meridian
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@meridiona/meridian-darwin-arm64",
|
|
3
|
-
"version": "1.58.1",
|
|
3
|
+
"version": "1.60.0",
|
|
4
4
|
"description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
|
|
5
5
|
"homepage": "https://github.com/Meridiona/meridian",
|
|
6
6
|
"repository": {
|
|
package/services/agents/_prompts.py
CHANGED
|
@@ -23,6 +23,25 @@ _VSCODE_BANNER_RE = re.compile(
|
|
|
23
23
|
# then responsible for not blowing the model's context window).
|
|
24
24
|
SESSION_TEXT_CAP = int(os.environ.get("SESSION_TEXT_CAP", "10000"))
|
|
25
25
|
|
|
26
|
+
# Max chars of each candidate ticket's description included in the prompt.
|
|
27
|
+
# Default 0 = NO cap — the full description is sent. This field was previously
|
|
28
|
+
# hard-capped at 240 chars, which dropped 56-83% of real ticket text (measured:
|
|
29
|
+
# avg 548 chars, max 1440 across the live board), and the discriminating scope a
|
|
30
|
+
# session must be matched against frequently lives past char 240. With the
|
|
31
|
+
# 128K-context classifier and plan-only candidate sets (2-3 tickets), the prompt
|
|
32
|
+
# has ample budget, so descriptions are sent in full by default. Set
|
|
33
|
+
# CANDIDATE_DESC_CAP=<n> to re-impose a ceiling if an unusually long description
|
|
34
|
+
# ever bloats the prompt (e.g. on a full-candidate fallback day).
|
|
35
|
+
CANDIDATE_DESC_CAP = int(os.environ.get("CANDIDATE_DESC_CAP", "0"))
|
|
36
|
+
|
|
37
|
+
# Recent-work continuity window (minutes). The prompt summarises the developer's
|
|
38
|
+
# tracked work in this many minutes BEFORE the current session, aggregated per
|
|
39
|
+
# ticket, as a weak continuity prior. Time-windowed (not count-windowed) on
|
|
40
|
+
# purpose: session length is wildly variable, so "last N sessions" can be 90s of
|
|
41
|
+
# micro-glances or 3h of deep work. Shared with run_task_linker_mlx.py, which
|
|
42
|
+
# fetches the window. Override via CONTINUITY_WINDOW_MIN.
|
|
43
|
+
_CONTINUITY_WINDOW_MIN = int(os.environ.get("CONTINUITY_WINDOW_MIN", "30"))
|
|
44
|
+
|
|
26
45
|
|
|
27
46
|
def _fmt_dur(duration_s: int | float) -> str:
|
|
28
47
|
secs = int(duration_s or 0)
|
|
@@ -102,8 +121,8 @@ def _format_candidates(tasks: list[dict]) -> str:
|
|
|
102
121
|
epic_title = (task.get("epic_title") or "").strip()
|
|
103
122
|
sprint_name = (task.get("sprint_name") or "").strip()
|
|
104
123
|
tags = (task.get("tags") or "").strip()
|
|
105
|
-
if len(desc) > 240:
|
|
106
|
-
desc = desc[:
|
|
124
|
+
if CANDIDATE_DESC_CAP > 0 and len(desc) > CANDIDATE_DESC_CAP:
|
|
125
|
+
desc = desc[:CANDIDATE_DESC_CAP] + "…"
|
|
107
126
|
meta_parts = [p for p in [issue_type, f"Epic: {epic_title}" if epic_title else "", sprint_name, f"tags: {tags}" if tags else ""] if p]
|
|
108
127
|
meta = " [" + " · ".join(meta_parts) + "]" if meta_parts else ""
|
|
109
128
|
# The dev declared this ticket as today's focus on the plan page. It's a
|
|
@@ -117,44 +136,70 @@ def _format_candidates(tasks: list[dict]) -> str:
|
|
|
117
136
|
return "\n\n".join(rows) if rows else "(no candidates)"
|
|
118
137
|
|
|
119
138
|
|
|
120
|
-
def
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
139
|
+
def _fmt_continuity_mins(seconds: float) -> str:
|
|
140
|
+
"""Coarse minutes label for the continuity block: '<1 min' or '~N min'."""
|
|
141
|
+
secs = int(seconds or 0)
|
|
142
|
+
if secs < 60:
|
|
143
|
+
return "<1 min"
|
|
144
|
+
return f"~{round(secs / 60)} min"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _format_continuity(activity: list[dict], now_iso: str | None = None) -> str:
|
|
148
|
+
"""Render the recent-ticket continuity prior — one bullet per ticket worked in
|
|
149
|
+
the window, ordered most-recent-first: total time spent, how many sessions it
|
|
150
|
+
spanned, and how long before the current session it was last active.
|
|
151
|
+
|
|
152
|
+
`activity` entries come from `_fetch_recent_ticket_activity` (already
|
|
153
|
+
aggregated, candidate-gated, confidence-filtered, recency-sorted). Empty input
|
|
154
|
+
→ an explicit "no tracked work" line (not ""), so the block is ALWAYS present:
|
|
155
|
+
that tells the model definitively "there is no recent continuity — rely on this
|
|
156
|
+
session's own evidence" (silence is ambiguous — it can't tell "no work" from
|
|
157
|
+
"not provided") and keeps the trace node legible instead of blank. We
|
|
158
|
+
deliberately do NOT emit a raw per-session log: those rows leak internal state
|
|
159
|
+
(sub-threshold micro-sessions, not-yet-classified neighbours, two interleaved
|
|
160
|
+
classify pipelines) that the model misreads as signal. This is a derived,
|
|
161
|
+
calibrated statement of recent tracked work.
|
|
162
|
+
"""
|
|
163
|
+
if not activity:
|
|
164
|
+
return " (no tracked work in this window)"
|
|
165
|
+
lines = []
|
|
166
|
+
for a in activity:
|
|
167
|
+
total = _fmt_continuity_mins(a.get("total_s", 0))
|
|
168
|
+
n = int(a.get("sessions", 0) or 0)
|
|
169
|
+
sess = "1 session" if n == 1 else f"{n} sessions"
|
|
170
|
+
ago_s = a.get("ago_s")
|
|
171
|
+
if ago_s is None:
|
|
172
|
+
recency = ""
|
|
173
|
+
elif ago_s < 60:
|
|
174
|
+
recency = ", last active just before this session"
|
|
137
175
|
else:
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
# feed a category prior back into classification.
|
|
142
|
-
rows.append(f" {time_str} {app:<14} {dur_str:<7} {target}")
|
|
143
|
-
return "\n".join(rows)
|
|
176
|
+
recency = f", last active ~{round(ago_s / 60)} min before this session"
|
|
177
|
+
lines.append(f" • {a['task_key']} — {total} over {sess}{recency}")
|
|
178
|
+
return "\n".join(lines)
|
|
144
179
|
|
|
145
180
|
|
|
146
181
|
def build_user_message(
|
|
147
182
|
session: dict,
|
|
148
183
|
candidates: list[dict],
|
|
149
|
-
|
|
184
|
+
recent_activity: list[dict] | None = None,
|
|
185
|
+
now_iso: str | None = None,
|
|
150
186
|
) -> str:
|
|
151
|
-
|
|
152
|
-
|
|
187
|
+
continuity = _format_continuity(recent_activity or [], now_iso)
|
|
188
|
+
# ALWAYS emitted (even when empty, where `continuity` is an explicit
|
|
189
|
+
# "no tracked work" line) so the model gets a definitive signal rather than
|
|
190
|
+
# ambiguous silence, and the trace node is never blank. Framed as a WEAK prior,
|
|
191
|
+
# never an instruction: an assertive "user was working on KAN-X" anchors the
|
|
192
|
+
# model into force-linking — the exact false-positive failure mode the SKILL
|
|
193
|
+
# warns against. The block states facts (ticket, time, recency); the SKILL's
|
|
194
|
+
# "classify by THIS session's evidence" rule governs.
|
|
153
195
|
recent_block = (
|
|
154
|
-
"RECENT WORK CONTEXT
|
|
155
|
-
f"{
|
|
196
|
+
f"RECENT WORK CONTEXT — the developer's tracked work in the last "
|
|
197
|
+
f"{_CONTINUITY_WINDOW_MIN} minutes before this session. This is a WEAK "
|
|
198
|
+
"continuity hint, NOT proof: continue the most-recent ticket ONLY if this "
|
|
199
|
+
"session's own evidence also fits it; never link on continuity alone.\n"
|
|
200
|
+
f"{continuity}\n"
|
|
156
201
|
"\n"
|
|
157
|
-
)
|
|
202
|
+
)
|
|
158
203
|
# When the dev declared a focus for the day, name it in the header so the model
|
|
159
204
|
# treats ★ rows as a prior — preferred when the evidence plausibly fits, but
|
|
160
205
|
# never forced. Recall is preserved: every candidate is still listed.
|
|
package/services/agents/_system_context.py
CHANGED
|
@@ -8,49 +8,35 @@ to ensure consistent behavior across entry points.
|
|
|
8
8
|
"""
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
CURRENT CAPABILITY — PM worklog updates
|
|
44
|
-
Given classified sessions, writes a verified worklog comment and posts it to the
|
|
45
|
-
connected PM tool (Jira, Linear, GitHub, etc.) without manual developer input.
|
|
46
|
-
|
|
47
|
-
DATABASE (for verification and ad-hoc queries)
|
|
48
|
-
Path: {_DB_PATH}
|
|
49
|
-
Query: sqlite3 {_DB_SHELL} "<SQL>"
|
|
50
|
-
Tables:
|
|
51
|
-
app_sessions: id, app_name, started_at, ended_at, duration_s, session_text,
|
|
52
|
-
session_text_source, window_titles, category, confidence,
|
|
53
|
-
task_key, task_confidence, task_routing
|
|
54
|
-
pm_tasks: task_key, title, description_text, issue_type, status_raw, is_terminal,
|
|
55
|
-
parent_key, epic_title, sprint_name, assignee_name
|
|
11
|
+
# NOTE: the classifier no longer embeds the DB path or any per-environment value
|
|
12
|
+
# into the prompt — session data and candidate tickets arrive in the message, and
|
|
13
|
+
# the model never shells out to sqlite on this path. SYSTEM_CONTEXT is therefore a
|
|
14
|
+
# pure static constant (no f-string interpolation), which is exactly what lets the
|
|
15
|
+
# MLX prompt-cache treat the whole system+skill prefix as an unchanging, cacheable
|
|
16
|
+
# block reused across every session classified this process.
|
|
17
|
+
|
|
18
|
+
SYSTEM_CONTEXT = """You are **Meridian Intelligence**, the classification engine inside Meridian — a tool that watches a developer's screen and keeps their project-management tickets up to date automatically.
|
|
19
|
+
|
|
20
|
+
YOUR JOB
|
|
21
|
+
Meridian turns screen capture into a stream of work *sessions* (one app, a time span,
|
|
22
|
+
the on-screen text). For each session you are given the session plus the developer's
|
|
23
|
+
open tracked tickets, and you decide ONE thing:
|
|
24
|
+
· **task** — the session is clearly work on one of the candidate tickets → name it.
|
|
25
|
+
· **untracked** — real work, but it doesn't clearly match any candidate ticket. Kept:
|
|
26
|
+
Meridian later turns untracked work into new tickets.
|
|
27
|
+
· **overhead** — idle / personal / unrelated (music, settings, browsing). Discarded.
|
|
28
|
+
Tickets may come from Jira, Linear, GitHub, Trello, or Azure DevOps — treat them the same.
|
|
29
|
+
|
|
30
|
+
WHY ACCURACY MATTERS
|
|
31
|
+
Your classifications are the foundation of the whole pipeline. Every session you link to a
|
|
32
|
+
ticket is later summed with the others on that ticket and summarised into a **worklog update
|
|
33
|
+
posted to the developer's PM tool** on their behalf. So a wrong link is expensive: it injects
|
|
34
|
+
work that never happened into a real ticket's worklog AND hides the genuine untracked work.
|
|
35
|
+
**When the evidence does not clearly fit a candidate ticket, choose `untracked` — never force
|
|
36
|
+
a match.** A correct `untracked` is always better than a wrong `task`.
|
|
37
|
+
|
|
38
|
+
OUTPUT
|
|
39
|
+
Return a single bare JSON object — no preamble, no markdown fences, no text around it.
|
|
40
|
+
Follow the task-classifier skill below for the exact schema, field order, and decision rules.
|
|
41
|
+
Session data and candidate tickets are passed in the message; you do not need to query anything.
|
|
56
42
|
"""
|