@meridiona/meridian-darwin-arm64 1.58.0 → 1.59.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/VERSION +1 -1
- package/bin/meridian +0 -0
- package/package.json +1 -1
- package/scripts/install-screenpipe-daemon.sh +25 -3
- package/services/agents/_prompts.py +76 -31
- package/services/agents/_system_context.py +1 -1
- package/services/agents/run_task_linker_mlx.py +119 -26
- package/services/agents/tests/test_run_task_linker_mlx.py +2 -2
- package/services/pyproject.toml +1 -1
- package/services/skills/activity/task-classifier/SKILL.md +16 -12
- package/services/tests/evals/build_dataset.py +7 -15
- package/services/tests/evals/classify_session.py +7 -3
- package/services/tests/evals/render_seeds.py +35 -14
- package/services/tests/test_continuity_context.py +202 -0
- package/services/tests/test_fetch_pm_tasks.py +120 -0
- package/services/tests/test_format_candidates.py +45 -0
- package/ui.tar.gz +0 -0
package/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
1.58.0
|
|
1
|
+
1.59.0
|
package/bin/meridian
CHANGED
|
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@meridiona/meridian-darwin-arm64",
|
|
3
|
-
"version": "1.58.0",
|
|
3
|
+
"version": "1.59.0",
|
|
4
4
|
"description": "Prebuilt Meridian app for macOS arm64 (daemon binary + dashboard + Python services). Installed via @meridiona/meridian.",
|
|
5
5
|
"homepage": "https://github.com/Meridiona/meridian",
|
|
6
6
|
"repository": {
|
|
@@ -127,10 +127,32 @@ while launchctl print "${GUI_TARGET}/${LABEL}" >/dev/null 2>&1; do
|
|
|
127
127
|
done
|
|
128
128
|
|
|
129
129
|
echo "→ bootstrap ${LABEL}"
|
|
130
|
+
# `meridian stop` runs `launchctl disable` to clear the KeepAlive intent, which
|
|
131
|
+
# persists in launchd's per-user override DB. bootstrap REFUSES a disabled label
|
|
132
|
+
# with EIO (errno 5), so the override must be cleared FIRST — otherwise a plain
|
|
133
|
+
# reinstall (install-dev.sh) can't revive a service that was `meridian stop`-ped.
|
|
130
134
|
launchctl enable "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
135
|
+
# bootstrap is genuinely flaky: it EIOs when the prior domain entry hasn't fully
|
|
136
|
+
# cleared even after the bootout-wait above. Do NOT let one transient failure
|
|
137
|
+
# abort the whole install under `set -e` (that's what left screenpipe down after
|
|
138
|
+
# a stop). Retry, re-enabling each round, and treat "already loaded" as success.
|
|
139
|
+
_bs_try=0
|
|
140
|
+
until launchctl bootstrap "${GUI_TARGET}" "${PLIST_DEST}" 2>/dev/null; do
|
|
141
|
+
if launchctl print "${GUI_TARGET}/${LABEL}" >/dev/null 2>&1; then
|
|
142
|
+
break # already in the domain — bootstrap only "failed" because it's present
|
|
143
|
+
fi
|
|
144
|
+
_bs_try=$(( _bs_try + 1 ))
|
|
145
|
+
if [[ "${_bs_try}" -ge 5 ]]; then
|
|
146
|
+
echo "⚠ bootstrap ${LABEL} failed after ${_bs_try} attempts — see launchctl print" >&2
|
|
147
|
+
break
|
|
148
|
+
fi
|
|
149
|
+
launchctl enable "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
|
|
150
|
+
sleep 1
|
|
151
|
+
done
|
|
152
|
+
# Always finish with enable + kickstart, even if bootstrap was a no-op above, so a
|
|
153
|
+
# disabled-but-loaded service ends up enabled AND running.
|
|
154
|
+
launchctl enable "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
|
|
155
|
+
launchctl kickstart -k "${GUI_TARGET}/${LABEL}" 2>/dev/null || true
|
|
134
156
|
|
|
135
157
|
echo
|
|
136
158
|
echo "✓ screenpipe installed and started"
|
|
@@ -23,6 +23,25 @@ _VSCODE_BANNER_RE = re.compile(
|
|
|
23
23
|
# then responsible for not blowing the model's context window).
|
|
24
24
|
SESSION_TEXT_CAP = int(os.environ.get("SESSION_TEXT_CAP", "10000"))
|
|
25
25
|
|
|
26
|
+
# Max chars of each candidate ticket's description included in the prompt.
|
|
27
|
+
# Default 0 = NO cap — the full description is sent. This field was previously
|
|
28
|
+
# hard-capped at 240 chars, which dropped 56-83% of real ticket text (measured:
|
|
29
|
+
# avg 548 chars, max 1440 across the live board), and the discriminating scope a
|
|
30
|
+
# session must be matched against frequently lives past char 240. With the
|
|
31
|
+
# 128K-context classifier and plan-only candidate sets (2-3 tickets), the prompt
|
|
32
|
+
# has ample budget, so descriptions are sent in full by default. Set
|
|
33
|
+
# CANDIDATE_DESC_CAP=<n> to re-impose a ceiling if an unusually long description
|
|
34
|
+
# ever bloats the prompt (e.g. on a full-candidate fallback day).
|
|
35
|
+
CANDIDATE_DESC_CAP = int(os.environ.get("CANDIDATE_DESC_CAP", "0"))
|
|
36
|
+
|
|
37
|
+
# Recent-work continuity window (minutes). The prompt summarises the developer's
|
|
38
|
+
# tracked work in this many minutes BEFORE the current session, aggregated per
|
|
39
|
+
# ticket, as a weak continuity prior. Time-windowed (not count-windowed) on
|
|
40
|
+
# purpose: session length is wildly variable, so "last N sessions" can be 90s of
|
|
41
|
+
# micro-glances or 3h of deep work. Shared with run_task_linker_mlx.py, which
|
|
42
|
+
# fetches the window. Override via CONTINUITY_WINDOW_MIN.
|
|
43
|
+
_CONTINUITY_WINDOW_MIN = int(os.environ.get("CONTINUITY_WINDOW_MIN", "30"))
|
|
44
|
+
|
|
26
45
|
|
|
27
46
|
def _fmt_dur(duration_s: int | float) -> str:
|
|
28
47
|
secs = int(duration_s or 0)
|
|
@@ -102,8 +121,8 @@ def _format_candidates(tasks: list[dict]) -> str:
|
|
|
102
121
|
epic_title = (task.get("epic_title") or "").strip()
|
|
103
122
|
sprint_name = (task.get("sprint_name") or "").strip()
|
|
104
123
|
tags = (task.get("tags") or "").strip()
|
|
105
|
-
if len(desc) >
|
|
106
|
-
desc = desc[:
|
|
124
|
+
if CANDIDATE_DESC_CAP > 0 and len(desc) > CANDIDATE_DESC_CAP:
|
|
125
|
+
desc = desc[:CANDIDATE_DESC_CAP] + "…"
|
|
107
126
|
meta_parts = [p for p in [issue_type, f"Epic: {epic_title}" if epic_title else "", sprint_name, f"tags: {tags}" if tags else ""] if p]
|
|
108
127
|
meta = " [" + " · ".join(meta_parts) + "]" if meta_parts else ""
|
|
109
128
|
# The dev declared this ticket as today's focus on the plan page. It's a
|
|
@@ -117,44 +136,70 @@ def _format_candidates(tasks: list[dict]) -> str:
|
|
|
117
136
|
return "\n\n".join(rows) if rows else "(no candidates)"
|
|
118
137
|
|
|
119
138
|
|
|
120
|
-
def
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
139
|
+
def _fmt_continuity_mins(seconds: float) -> str:
|
|
140
|
+
"""Coarse minutes label for the continuity block: '<1 min' or '~N min'."""
|
|
141
|
+
secs = int(seconds or 0)
|
|
142
|
+
if secs < 60:
|
|
143
|
+
return "<1 min"
|
|
144
|
+
return f"~{round(secs / 60)} min"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _format_continuity(activity: list[dict], now_iso: str | None = None) -> str:
|
|
148
|
+
"""Render the recent-ticket continuity prior — one bullet per ticket worked in
|
|
149
|
+
the window, ordered most-recent-first: total time spent, how many sessions it
|
|
150
|
+
spanned, and how long before the current session it was last active.
|
|
151
|
+
|
|
152
|
+
`activity` entries come from `_fetch_recent_ticket_activity` (already
|
|
153
|
+
aggregated, candidate-gated, confidence-filtered, recency-sorted). Empty input
|
|
154
|
+
→ an explicit "no tracked work" line (not ""), so the block is ALWAYS present:
|
|
155
|
+
that tells the model definitively "there is no recent continuity — rely on this
|
|
156
|
+
session's own evidence" (silence is ambiguous — it can't tell "no work" from
|
|
157
|
+
"not provided") and keeps the trace node legible instead of blank. We
|
|
158
|
+
deliberately do NOT emit a raw per-session log: those rows leak internal state
|
|
159
|
+
(sub-threshold micro-sessions, not-yet-classified neighbours, two interleaved
|
|
160
|
+
classify pipelines) that the model misreads as signal. This is a derived,
|
|
161
|
+
calibrated statement of recent tracked work.
|
|
162
|
+
"""
|
|
163
|
+
if not activity:
|
|
164
|
+
return " (no tracked work in this window)"
|
|
165
|
+
lines = []
|
|
166
|
+
for a in activity:
|
|
167
|
+
total = _fmt_continuity_mins(a.get("total_s", 0))
|
|
168
|
+
n = int(a.get("sessions", 0) or 0)
|
|
169
|
+
sess = "1 session" if n == 1 else f"{n} sessions"
|
|
170
|
+
ago_s = a.get("ago_s")
|
|
171
|
+
if ago_s is None:
|
|
172
|
+
recency = ""
|
|
173
|
+
elif ago_s < 60:
|
|
174
|
+
recency = ", last active just before this session"
|
|
137
175
|
else:
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
# feed a category prior back into classification.
|
|
142
|
-
rows.append(f" {time_str} {app:<14} {dur_str:<7} {target}")
|
|
143
|
-
return "\n".join(rows)
|
|
176
|
+
recency = f", last active ~{round(ago_s / 60)} min before this session"
|
|
177
|
+
lines.append(f" • {a['task_key']} — {total} over {sess}{recency}")
|
|
178
|
+
return "\n".join(lines)
|
|
144
179
|
|
|
145
180
|
|
|
146
181
|
def build_user_message(
|
|
147
182
|
session: dict,
|
|
148
183
|
candidates: list[dict],
|
|
149
|
-
|
|
184
|
+
recent_activity: list[dict] | None = None,
|
|
185
|
+
now_iso: str | None = None,
|
|
150
186
|
) -> str:
|
|
151
|
-
|
|
152
|
-
|
|
187
|
+
continuity = _format_continuity(recent_activity or [], now_iso)
|
|
188
|
+
# ALWAYS emitted (even when empty, where `continuity` is an explicit
|
|
189
|
+
# "no tracked work" line) so the model gets a definitive signal rather than
|
|
190
|
+
# ambiguous silence, and the trace node is never blank. Framed as a WEAK prior,
|
|
191
|
+
# never an instruction: an assertive "user was working on KAN-X" anchors the
|
|
192
|
+
# model into force-linking — the exact false-positive failure mode the SKILL
|
|
193
|
+
# warns against. The block states facts (ticket, time, recency); the SKILL's
|
|
194
|
+
# "classify by THIS session's evidence" rule governs.
|
|
153
195
|
recent_block = (
|
|
154
|
-
"RECENT WORK CONTEXT
|
|
155
|
-
f"{
|
|
196
|
+
f"RECENT WORK CONTEXT — the developer's tracked work in the last "
|
|
197
|
+
f"{_CONTINUITY_WINDOW_MIN} minutes before this session. This is a WEAK "
|
|
198
|
+
"continuity hint, NOT proof: continue the most-recent ticket ONLY if this "
|
|
199
|
+
"session's own evidence also fits it; never link on continuity alone.\n"
|
|
200
|
+
f"{continuity}\n"
|
|
156
201
|
"\n"
|
|
157
|
-
)
|
|
202
|
+
)
|
|
158
203
|
# When the dev declared a focus for the day, name it in the header so the model
|
|
159
204
|
# treats ★ rows as a prior — preferred when the evidence plausibly fits, but
|
|
160
205
|
# never forced. Recall is preserved: every candidate is still listed.
|
|
@@ -29,7 +29,7 @@ _DB_SHELL = shlex.quote(str(_DB_PATH))
|
|
|
29
29
|
|
|
30
30
|
SYSTEM_CONTEXT = f"""You are **Meridian Intelligence** — the AI reasoning layer inside Meridian, a developer productivity platform.
|
|
31
31
|
|
|
32
|
-
Meridian monitors a developer's screen and builds a structured record of their work
|
|
32
|
+
Meridian monitors a developer's screen and builds a structured record of their work as a stream of work *sessions*. Your PRIMARY role is to reason over each session and **classify it** — determining which tracked ticket (the "task") the work belongs to, or whether it is overhead or untracked work — so Meridian can keep every ticket's progress and worklog accurate. Classifying a session correctly to its task, and reasoning carefully over the evidence to do so, is the core job.
|
|
33
33
|
|
|
34
34
|
CURRENT CAPABILITY — session classification
|
|
35
35
|
Given a work session (app, duration, screen content, recent history, open tickets), decide:
|
|
@@ -13,7 +13,7 @@ OTel span hierarchy (when invoked as a script via main()):
|
|
|
13
13
|
db_fetch
|
|
14
14
|
classifier_input ← the COMPLETE model input (system + user)
|
|
15
15
|
system_prompt — classifier skill + context
|
|
16
|
-
recent_context —
|
|
16
|
+
recent_context — 30-min per-ticket continuity prior
|
|
17
17
|
session_block — the input session being classified
|
|
18
18
|
candidate_tickets — ranked candidate tickets (★ = today)
|
|
19
19
|
llm_inference
|
|
@@ -54,15 +54,22 @@ from agents import observability
|
|
|
54
54
|
from agents._prompts import (
|
|
55
55
|
build_user_message,
|
|
56
56
|
_format_candidates,
|
|
57
|
-
|
|
57
|
+
_format_continuity,
|
|
58
58
|
_format_session,
|
|
59
|
+
_CONTINUITY_WINDOW_MIN,
|
|
59
60
|
)
|
|
60
61
|
from agents._system_context import SYSTEM_CONTEXT
|
|
61
62
|
|
|
62
63
|
log = logging.getLogger("agents.run_task_linker_mlx")
|
|
63
64
|
tracer = observability.setup("meridian-task-linker-mlx")
|
|
64
65
|
|
|
65
|
-
|
|
66
|
+
# Recent-work continuity: only count a prior session toward the continuity block
|
|
67
|
+
# if its task link is confident enough to trust (a shaky 0.5 generic match
|
|
68
|
+
# shouldn't compound into a continuity nudge). 0.7 sits just above the SKILL's
|
|
69
|
+
# "generic project-level match" band (0.50-0.65), so this keeps real alignments
|
|
70
|
+
# and drops weak guesses. The window length lives in _prompts._CONTINUITY_WINDOW_MIN
|
|
71
|
+
# (shared with the prompt label). Override via CONTINUITY_MIN_CONFIDENCE.
|
|
72
|
+
_CONTINUITY_MIN_CONFIDENCE = float(os.environ.get("CONTINUITY_MIN_CONFIDENCE", "0.7"))
|
|
66
73
|
_MAX_TOKENS = 1024
|
|
67
74
|
_TEMPERATURE = 0.0 # greedy decoding — deterministic classification
|
|
68
75
|
|
|
@@ -602,23 +609,88 @@ def _fetch_session(
|
|
|
602
609
|
return dict(row) if row else None
|
|
603
610
|
|
|
604
611
|
|
|
605
|
-
def
|
|
606
|
-
con: _sqlite3.Connection,
|
|
612
|
+
def _fetch_recent_ticket_activity(
|
|
613
|
+
con: _sqlite3.Connection,
|
|
614
|
+
current_started_at: str,
|
|
615
|
+
candidate_keys: list[str],
|
|
607
616
|
) -> list[dict[str, Any]]:
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
617
|
+
"""The developer's tracked-ticket work in the _CONTINUITY_WINDOW_MIN minutes
|
|
618
|
+
before the current session, aggregated per ticket → a calibrated continuity
|
|
619
|
+
prior (NOT a raw session log).
|
|
620
|
+
|
|
621
|
+
Returns one entry per ticket worked in the window:
|
|
622
|
+
{"task_key", "total_s", "sessions", "last_ended_at", "ago_s"}
|
|
623
|
+
ordered by recency (most-recently-active ticket first). Empty when there is no
|
|
624
|
+
qualifying recent work — the prompt then carries an explicit "no tracked work"
|
|
625
|
+
line instead of asserting a continuity that doesn't exist.
|
|
626
|
+
|
|
627
|
+
A session counts only if it is (a) already CLASSIFIED to a ticket
|
|
628
|
+
(task_session_type='task' — "last classified", never pending/in-flight),
|
|
629
|
+
(b) confident enough to trust as a prior (task_confidence >=
|
|
630
|
+
_CONTINUITY_MIN_CONFIDENCE), and (c) mapped to a ticket in the CURRENT
|
|
631
|
+
candidate set — a prior on a ticket the model can't even pick is pure noise.
|
|
632
|
+
Windowing is done in Python (fromisoformat) so it's robust to the stored
|
|
633
|
+
timestamp's timezone/precision; the SQL only does the cheap "strictly before
|
|
634
|
+
current" + confidence prefilter (consistent ISO format → lexicographic '<' is
|
|
635
|
+
chronological).
|
|
636
|
+
"""
|
|
637
|
+
candidates = set(candidate_keys)
|
|
638
|
+
if not current_started_at or not candidates:
|
|
639
|
+
return []
|
|
640
|
+
try:
|
|
641
|
+
anchor = _dt.datetime.fromisoformat(current_started_at)
|
|
642
|
+
except (ValueError, TypeError):
|
|
643
|
+
return []
|
|
644
|
+
window_start = anchor - _dt.timedelta(minutes=_CONTINUITY_WINDOW_MIN)
|
|
613
645
|
rows = con.execute(
|
|
614
|
-
"SELECT
|
|
646
|
+
"SELECT task_key, started_at, ended_at, duration_s, task_confidence"
|
|
615
647
|
" FROM app_sessions"
|
|
616
|
-
" WHERE
|
|
617
|
-
"
|
|
618
|
-
|
|
648
|
+
" WHERE started_at < ?"
|
|
649
|
+
" AND task_key IS NOT NULL"
|
|
650
|
+
" AND task_session_type = 'task'"
|
|
651
|
+
" AND task_confidence >= ?"
|
|
652
|
+
" ORDER BY started_at DESC LIMIT 200",
|
|
653
|
+
(current_started_at, _CONTINUITY_MIN_CONFIDENCE),
|
|
619
654
|
).fetchall()
|
|
620
|
-
|
|
621
|
-
|
|
655
|
+
|
|
656
|
+
agg: dict[str, dict[str, Any]] = {}
|
|
657
|
+
for r in rows:
|
|
658
|
+
d = dict(r)
|
|
659
|
+
tk = d.get("task_key")
|
|
660
|
+
if tk not in candidates:
|
|
661
|
+
continue
|
|
662
|
+
try:
|
|
663
|
+
s_at = _dt.datetime.fromisoformat(d["started_at"])
|
|
664
|
+
except (ValueError, TypeError):
|
|
665
|
+
continue
|
|
666
|
+
if s_at < window_start:
|
|
667
|
+
continue # outside the continuity window
|
|
668
|
+
try:
|
|
669
|
+
e_at = _dt.datetime.fromisoformat(d.get("ended_at") or d["started_at"])
|
|
670
|
+
except (ValueError, TypeError):
|
|
671
|
+
e_at = s_at
|
|
672
|
+
entry = agg.get(tk)
|
|
673
|
+
if entry is None:
|
|
674
|
+
entry = {"task_key": tk, "total_s": 0.0, "sessions": 0, "last_ended": e_at}
|
|
675
|
+
agg[tk] = entry
|
|
676
|
+
entry["total_s"] += float(d.get("duration_s") or 0.0)
|
|
677
|
+
entry["sessions"] += 1
|
|
678
|
+
if e_at > entry["last_ended"]:
|
|
679
|
+
entry["last_ended"] = e_at
|
|
680
|
+
|
|
681
|
+
result: list[dict[str, Any]] = []
|
|
682
|
+
for entry in agg.values():
|
|
683
|
+
ago_s = max(0.0, (anchor - entry["last_ended"]).total_seconds())
|
|
684
|
+
result.append(
|
|
685
|
+
{
|
|
686
|
+
"task_key": entry["task_key"],
|
|
687
|
+
"total_s": entry["total_s"],
|
|
688
|
+
"sessions": entry["sessions"],
|
|
689
|
+
"last_ended_at": entry["last_ended"].isoformat(),
|
|
690
|
+
"ago_s": ago_s,
|
|
691
|
+
}
|
|
692
|
+
)
|
|
693
|
+
result.sort(key=lambda e: e["ago_s"]) # most-recently-active ticket first
|
|
622
694
|
return result
|
|
623
695
|
|
|
624
696
|
|
|
@@ -813,7 +885,7 @@ def _classify_one(
|
|
|
813
885
|
session_raw = _fetch_session(con, session_id)
|
|
814
886
|
if session_raw is None:
|
|
815
887
|
db_span.set_attribute("pm_tasks_count", 0)
|
|
816
|
-
db_span.set_attribute("
|
|
888
|
+
db_span.set_attribute("recent_continuity_tickets", 0)
|
|
817
889
|
db_span.add_event("session_not_found", {"session_id": session_id})
|
|
818
890
|
db_span.set_status(StatusCode.ERROR, f"session {session_id} not found in DB")
|
|
819
891
|
return _error_result(
|
|
@@ -823,7 +895,12 @@ def _classify_one(
|
|
|
823
895
|
plan_date = _local_day(session_raw.get("started_at") or "")
|
|
824
896
|
focus_keys = _fetch_plan_focus(con, plan_date)
|
|
825
897
|
pm_tasks = _fetch_pm_tasks(con, focus_keys)
|
|
826
|
-
|
|
898
|
+
# Continuity prior needs the candidate set up front (it only names tickets
|
|
899
|
+
# the model can actually pick), so compute candidate_keys before fetching.
|
|
900
|
+
candidate_keys = [t["task_key"] for t in pm_tasks]
|
|
901
|
+
recent = _fetch_recent_ticket_activity(
|
|
902
|
+
con, session_raw.get("started_at") or "", candidate_keys
|
|
903
|
+
)
|
|
827
904
|
|
|
828
905
|
session_text = session_raw.get("session_text") or ""
|
|
829
906
|
# Coding-agent rows (Claude Code / Codex) carry the full transcript in
|
|
@@ -839,7 +916,7 @@ def _classify_one(
|
|
|
839
916
|
# it answers "was the right ticket even offered, and where was it ranked?"
|
|
840
917
|
# without anyone having to read the prompt. Ranked order is preserved
|
|
841
918
|
# (today's-focus keys float to the front in _fetch_pm_tasks).
|
|
842
|
-
candidate_keys
|
|
919
|
+
# candidate_keys computed above (the continuity fetch needs it).
|
|
843
920
|
recent_task_keys = [r.get("task_key") for r in recent if r.get("task_key")]
|
|
844
921
|
# Session identity + the app_sessions row metadata, so a trace is
|
|
845
922
|
# self-contained — you know WHICH session and its key fields (app, window
|
|
@@ -881,7 +958,13 @@ def _classify_one(
|
|
|
881
958
|
db_span.set_attribute("summary_source", str(session_raw.get("summary_source") or ""))
|
|
882
959
|
db_span.set_attribute("pm_tasks_count", len(pm_tasks))
|
|
883
960
|
db_span.set_attribute("today_focus_count", len(focus_keys))
|
|
884
|
-
|
|
961
|
+
# Continuity prior: how many tickets the dev worked in the prior window,
|
|
962
|
+
# and across how many classified sessions (0/0 when there's no qualifying
|
|
963
|
+
# recent work → the prompt carries an explicit "no tracked work" line).
|
|
964
|
+
db_span.set_attribute("recent_continuity_tickets", len(recent))
|
|
965
|
+
db_span.set_attribute(
|
|
966
|
+
"recent_continuity_sessions", sum(int(r.get("sessions", 0) or 0) for r in recent)
|
|
967
|
+
)
|
|
885
968
|
db_span.set_attribute("candidate_task_keys", ", ".join(candidate_keys) if candidate_keys else "-")
|
|
886
969
|
db_span.set_attribute("today_focus_keys", ", ".join(focus_keys) if focus_keys else "-")
|
|
887
970
|
# Which candidate-set policy actually applied for this session, so a trace
|
|
@@ -920,8 +1003,8 @@ def _classify_one(
|
|
|
920
1003
|
# The single drill-down span for "exactly what the classifier was asked".
|
|
921
1004
|
# It carries the COMPLETE input, byte-for-byte as handed to the model:
|
|
922
1005
|
# • system_prompt — full system context + the task-classifier SKILL
|
|
923
|
-
# • llm_input — full user message: the input session block, the
|
|
924
|
-
#
|
|
1006
|
+
# • llm_input — full user message: the input session block, the 30-min
|
|
1007
|
+
# per-ticket continuity prior, and the ranked candidate tickets
|
|
925
1008
|
# Both are captured POST-assembly, so any cap already applied while building
|
|
926
1009
|
# the prompt (e.g. SESSION_TEXT_CAP truncating the OCR excerpt) is reflected
|
|
927
1010
|
# here EXACTLY as the model saw it — never the pre-cap original. Concatenating
|
|
@@ -931,7 +1014,9 @@ def _classify_one(
|
|
|
931
1014
|
# capped at ~8k chars — so on that path the on-span text is the assembled
|
|
932
1015
|
# input, not the rewritten one.)
|
|
933
1016
|
with tracer.start_as_current_span("classifier_input") as bp_span:
|
|
934
|
-
user_message = build_user_message(
|
|
1017
|
+
user_message = build_user_message(
|
|
1018
|
+
session, pm_tasks, recent_activity=recent, now_iso=session.get("started_at")
|
|
1019
|
+
)
|
|
935
1020
|
messages = [
|
|
936
1021
|
{"role": "system", "content": _SYSTEM_PROMPT},
|
|
937
1022
|
{"role": "user", "content": user_message},
|
|
@@ -975,7 +1060,8 @@ def _classify_one(
|
|
|
975
1060
|
part.set_attribute(_k, _v)
|
|
976
1061
|
|
|
977
1062
|
_input_part("system_prompt", _SYSTEM_PROMPT) # classifier skill + context
|
|
978
|
-
_input_part("recent_context",
|
|
1063
|
+
_input_part("recent_context",
|
|
1064
|
+
_format_continuity(recent, session.get("started_at"))) # 30-min continuity prior
|
|
979
1065
|
_input_part("session_block", _format_session(session)) # the input session
|
|
980
1066
|
_input_part("candidate_tickets", _format_candidates(pm_tasks), # ranked candidates
|
|
981
1067
|
ticket_count=len(pm_tasks))
|
|
@@ -1310,7 +1396,13 @@ def _classify_one_logged_inner(
|
|
|
1310
1396
|
session_raw = _fetch_session(con, session_id)
|
|
1311
1397
|
focus_keys = _fetch_plan_focus(con, _local_day(session_raw.get("started_at") or "")) if session_raw else []
|
|
1312
1398
|
pm_tasks = _fetch_pm_tasks(con, focus_keys) if session_raw else []
|
|
1313
|
-
recent =
|
|
1399
|
+
recent = (
|
|
1400
|
+
_fetch_recent_ticket_activity(
|
|
1401
|
+
con, session_raw.get("started_at") or "", [t["task_key"] for t in pm_tasks]
|
|
1402
|
+
)
|
|
1403
|
+
if session_raw
|
|
1404
|
+
else []
|
|
1405
|
+
)
|
|
1314
1406
|
|
|
1315
1407
|
if session_raw:
|
|
1316
1408
|
user_message = build_user_message(
|
|
@@ -1328,7 +1420,8 @@ def _classify_one_logged_inner(
|
|
|
1328
1420
|
"audio_snippets": [],
|
|
1329
1421
|
},
|
|
1330
1422
|
pm_tasks,
|
|
1331
|
-
|
|
1423
|
+
recent_activity=recent,
|
|
1424
|
+
now_iso=session_raw.get("started_at", ""),
|
|
1332
1425
|
)
|
|
1333
1426
|
else:
|
|
1334
1427
|
user_message = ""
|
|
@@ -348,7 +348,7 @@ class TestObservabilityClassifyOne:
|
|
|
348
348
|
def test_db_fetch_recent_sessions_count(self, db: Path, span_exporter):
|
|
349
349
|
_, spans = self._run(db, span_exporter)
|
|
350
350
|
s = _span_by_name(spans, "db_fetch")
|
|
351
|
-
assert s.attributes["
|
|
351
|
+
assert s.attributes["recent_continuity_tickets"] == 0 # no prior tracked work
|
|
352
352
|
|
|
353
353
|
def test_db_fetch_session_loaded_event_fields(self, db: Path, span_exporter):
|
|
354
354
|
_, spans = self._run(db, span_exporter)
|
|
@@ -421,7 +421,7 @@ class TestObservabilityClassifyOne:
|
|
|
421
421
|
s = _span_by_name(spans, "build_prompt")
|
|
422
422
|
assert s is not None
|
|
423
423
|
assert s.attributes["pm_tasks_count"] == 1
|
|
424
|
-
assert s.attributes["
|
|
424
|
+
assert s.attributes["recent_continuity_tickets"] == 0
|
|
425
425
|
assert s.attributes["prompt_chars"] > 0
|
|
426
426
|
|
|
427
427
|
def test_build_prompt_assembled_event(self, db: Path, span_exporter):
|
package/services/pyproject.toml
CHANGED
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "meridian-agents"
|
|
7
|
-
version = "1.58.0"
|
|
7
|
+
version = "1.59.0"
|
|
8
8
|
description = "Meridian agents — MLX classifier server and Jira worklog synthesis for meridian.db"
|
|
9
9
|
requires-python = ">=3.11"
|
|
10
10
|
authors = [{ name = "Meridiona" }]
|
|
@@ -55,7 +55,7 @@ The user message contains:
|
|
|
55
55
|
|
|
56
56
|
- **SESSION** — app, duration, top window titles, and the screen content (OCR / a11y). Decide the category yourself from this evidence; no category is provided.
|
|
57
57
|
- **CANDIDATE TICKETS** — all open tracked tickets (Jira, Linear, GitHub, Trello, Azure DevOps). These are the only tickets you may choose from.
|
|
58
|
-
- **RECENT
|
|
58
|
+
- **RECENT WORK CONTEXT** — a summary of the developer's tracked work in the **last 30 minutes** before this session, aggregated **per ticket**: each line is a ticket they worked, with the total time spent, how many sessions it spanned, and how long before this session it was last active (most-recently-active ticket first). It lists only tickets that are in the candidate set above. This is a **weak disambiguation hint only**: it can support a match when the current session ALSO has matching evidence, but it must never override what the current session itself shows. Recent activity on a ticket does not make the current session that ticket — and a ticket worked 25 minutes ago is a much weaker hint than one active a minute ago. When the block reads `(no tracked work in this window)`, there was no confident recent tracked work to report — classify on this session's own evidence alone.
|
|
59
59
|
|
|
60
60
|
## Available capabilities
|
|
61
61
|
|
|
@@ -74,9 +74,9 @@ Use database queries sparingly — session data and candidate tickets are alread
|
|
|
74
74
|
|
|
75
75
|
Pick **exactly one** of the candidate `task_key` values, OR return `null` if **none** fit the session.
|
|
76
76
|
|
|
77
|
-
Use **
|
|
78
|
-
- If the current session is **generic** (e.g., Slack) but
|
|
79
|
-
-
|
|
77
|
+
Use the **recent work context** to make smarter decisions:
|
|
78
|
+
- If the current session is **generic** (e.g., Slack) but the recent context shows sustained, very-recent work on a specific ticket, consider linking it to that task — *only if* this session's own evidence is at least consistent with it.
|
|
79
|
+
- The recent context is **recency- and time-weighted**: prefer the ticket that was active most recently and for the most time. A ticket last active a minute ago is a strong tie-breaker; one last active ~25 minutes ago is weak. When two or more tickets appear, the dev was context-switching — continuity is ambiguous, so lean harder on the current session's own evidence.
|
|
80
80
|
- Overhead (system settings, music, etc) should always be `null` regardless of context.
|
|
81
81
|
|
|
82
82
|
## Output format
|
|
@@ -192,18 +192,22 @@ Capture every category the session shows evidence of:
|
|
|
192
192
|
**Bad — speculative + marketing:**
|
|
193
193
|
> Successfully refactored the workflow to be more efficient. The new linear design will be much faster. Next steps include adding the worklog poster and testing end-to-end on Jira.
|
|
194
194
|
|
|
195
|
-
## Using
|
|
195
|
+
## Using the Recent Work Context
|
|
196
196
|
|
|
197
|
-
|
|
197
|
+
The **RECENT WORK CONTEXT** block summarises the developer's tracked work over the prior 30 minutes, per ticket, with time spent and how recently each was active. Use it to disambiguate the current session — never to override it.
|
|
198
198
|
|
|
199
|
-
**Example:
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
-
|
|
199
|
+
**Example: the recent context shows**
|
|
200
|
+
```
|
|
201
|
+
RECENT WORK CONTEXT — the developer's tracked work in the last 30 minutes before this session...
|
|
202
|
+
• KAN-42 — ~22 min over 5 sessions, last active just before this session
|
|
203
|
+
```
|
|
204
|
+
- Current session is **VS Code editing the same file** referenced by KAN-42 → strong: the recent context *and* the current evidence agree → task_key: KAN-42, confidence ~0.85.
|
|
205
|
+
- Current session is **Slack** with the channel/thread discussing the KAN-42 PR → the current content itself is about KAN-42, and the recent context supports it → task_key: KAN-42, confidence ~0.75.
|
|
206
|
+
- Current session is **Slack** showing a generic standup or an unrelated thread → the current evidence is NOT about KAN-42 → return `null` / `untracked` (or a different ticket if its own evidence matches). **Do not inherit KAN-42 just because it was the recent task.**
|
|
203
207
|
|
|
204
|
-
**Decision:**
|
|
208
|
+
**Decision rule:** continuity is a tie-breaker between plausible matches, never a substitute for current-session evidence. Weight the recent context by recency and time — a ticket "last active just before this session" with 22 minutes behind it is a strong tie-breaker; one "last active ~25 min before this session" is weak. When the block lists **more than one ticket**, the developer was switching context, so continuity is ambiguous: rely on the current session's own evidence.
|
|
205
209
|
|
|
206
|
-
Example reasoning
|
|
210
|
+
Example reasoning (if task-related): `"Slack thread discusses the KAN-42 PR; recent context shows 22 min on KAN-42 ending just before this session — linked via work context."`
|
|
207
211
|
|
|
208
212
|
## Scoring heuristics
|
|
209
213
|
|
|
@@ -34,6 +34,7 @@ if str(_SERVICES_DIR) not in sys.path:
|
|
|
34
34
|
sys.path.insert(0, str(_SERVICES_DIR))
|
|
35
35
|
|
|
36
36
|
from agents._prompts import build_user_message
|
|
37
|
+
from agents.run_task_linker_mlx import _fetch_recent_ticket_activity
|
|
37
38
|
|
|
38
39
|
MERIDIAN_DB = Path(os.environ.get("MERIDIAN_DB", Path.home() / ".meridian/meridian.db"))
|
|
39
40
|
SESSION_IDS: list[int] = [
|
|
@@ -103,19 +104,6 @@ def _fetch_pm_tasks(con: sqlite3.Connection) -> list[dict]:
|
|
|
103
104
|
return [dict(r) for r in rows]
|
|
104
105
|
|
|
105
106
|
|
|
106
|
-
def _fetch_recent(con: sqlite3.Connection, before_id: int) -> list[dict]:
|
|
107
|
-
rows = con.execute(
|
|
108
|
-
"SELECT app_name, started_at, duration_s, task_key, task_routing, category"
|
|
109
|
-
" FROM app_sessions"
|
|
110
|
-
" WHERE id < ? AND duration_s > 1 AND COALESCE(session_text,'') != ''"
|
|
111
|
-
" ORDER BY id DESC LIMIT 5",
|
|
112
|
-
(before_id,),
|
|
113
|
-
).fetchall()
|
|
114
|
-
result = [dict(r) for r in rows]
|
|
115
|
-
result.reverse()
|
|
116
|
-
return result
|
|
117
|
-
|
|
118
|
-
|
|
119
107
|
def main() -> None:
|
|
120
108
|
if not MERIDIAN_DB.exists():
|
|
121
109
|
print(f"ERROR: meridian.db not found at {MERIDIAN_DB}", file=sys.stderr)
|
|
@@ -152,8 +140,12 @@ def main() -> None:
|
|
|
152
140
|
"confidence": s["confidence"] or 0.0,
|
|
153
141
|
"audio_snippets": [],
|
|
154
142
|
}
|
|
155
|
-
recent =
|
|
156
|
-
|
|
143
|
+
recent = _fetch_recent_ticket_activity(
|
|
144
|
+
con, session["started_at"], [t["task_key"] for t in pm_tasks]
|
|
145
|
+
)
|
|
146
|
+
prompt_input = build_user_message(
|
|
147
|
+
session, pm_tasks, recent_activity=recent, now_iso=session["started_at"]
|
|
148
|
+
)
|
|
157
149
|
|
|
158
150
|
expected = {
|
|
159
151
|
"task_key": _normalise_task_key(s.get("task_key")),
|
|
@@ -39,7 +39,7 @@ def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
|
|
|
39
39
|
from agents._prompts import build_user_message
|
|
40
40
|
from agents.run_task_linker_mlx import (
|
|
41
41
|
_fetch_pm_tasks,
|
|
42
|
-
|
|
42
|
+
_fetch_recent_ticket_activity,
|
|
43
43
|
_fetch_session,
|
|
44
44
|
)
|
|
45
45
|
|
|
@@ -48,8 +48,10 @@ def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
|
|
|
48
48
|
raw = _fetch_session(con, session_id)
|
|
49
49
|
if raw is None:
|
|
50
50
|
return None
|
|
51
|
-
recent = _fetch_recent_sessions(con, session_id)
|
|
52
51
|
pm_tasks = _fetch_pm_tasks(con)
|
|
52
|
+
recent = _fetch_recent_ticket_activity(
|
|
53
|
+
con, raw.get("started_at") or "", [t["task_key"] for t in pm_tasks]
|
|
54
|
+
)
|
|
53
55
|
session_text = raw.get("session_text") or ""
|
|
54
56
|
if raw.get("coding_agent_session_uuid") and (raw.get("session_summary") or "").strip():
|
|
55
57
|
session_text = raw["session_summary"]
|
|
@@ -66,7 +68,9 @@ def _reconstruct_prompt(db_path: str, session_id: int) -> str | None:
|
|
|
66
68
|
"confidence": raw.get("confidence", 0.0),
|
|
67
69
|
"audio_snippets": [],
|
|
68
70
|
}
|
|
69
|
-
return build_user_message(
|
|
71
|
+
return build_user_message(
|
|
72
|
+
session, pm_tasks, recent_activity=recent, now_iso=raw.get("started_at")
|
|
73
|
+
)
|
|
70
74
|
|
|
71
75
|
|
|
72
76
|
def _classify(url: str, db_path: str, session_ids: list[int]) -> list[dict]:
|
|
@@ -24,7 +24,10 @@ _SERVICES_DIR = Path(__file__).parent.parent.parent
|
|
|
24
24
|
if str(_SERVICES_DIR) not in sys.path:
|
|
25
25
|
sys.path.insert(0, str(_SERVICES_DIR))
|
|
26
26
|
|
|
27
|
+
import sqlite3 # noqa: E402
|
|
28
|
+
|
|
27
29
|
from agents._prompts import build_user_message # noqa: E402
|
|
30
|
+
from agents.run_task_linker_mlx import _fetch_recent_ticket_activity # noqa: E402
|
|
28
31
|
|
|
29
32
|
EVAL_DIR = Path(__file__).parent
|
|
30
33
|
SEED_DIR = EVAL_DIR / "data" / "seeds"
|
|
@@ -34,22 +37,36 @@ PERSONA_FILES = {
|
|
|
34
37
|
}
|
|
35
38
|
|
|
36
39
|
|
|
37
|
-
def _project_recent(
|
|
38
|
-
|
|
39
|
-
|
|
40
|
+
def _project_recent(
|
|
41
|
+
prior: list[dict], current_started_at: str, candidate_keys: list[str]
|
|
42
|
+
) -> list[dict]:
|
|
43
|
+
"""Build the per-ticket continuity prior for a seed session, reusing the EXACT
|
|
44
|
+
production aggregation (`_fetch_recent_ticket_activity`) so rendered goldens
|
|
45
|
+
match the live prompt. We load the prior scoreable seeds into a throwaway
|
|
46
|
+
in-memory DB and run the real query against it (windowing, confidence floor,
|
|
47
|
+
candidate-gating, recency ordering all happen there — one source of truth)."""
|
|
48
|
+
con = sqlite3.connect(":memory:")
|
|
49
|
+
con.row_factory = sqlite3.Row
|
|
50
|
+
con.execute(
|
|
51
|
+
"CREATE TABLE app_sessions ("
|
|
52
|
+
" id INTEGER PRIMARY KEY AUTOINCREMENT,"
|
|
53
|
+
" task_key TEXT, started_at TEXT, ended_at TEXT, duration_s REAL,"
|
|
54
|
+
" task_confidence REAL, task_session_type TEXT)"
|
|
55
|
+
)
|
|
40
56
|
for s in prior:
|
|
41
57
|
gt = s.get("ground_truth", {})
|
|
42
58
|
tk = gt.get("task_key")
|
|
43
59
|
task_key = tk if tk and tk != "none" else None
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
"
|
|
48
|
-
"task_key"
|
|
49
|
-
"
|
|
50
|
-
"
|
|
51
|
-
|
|
52
|
-
|
|
60
|
+
if not task_key:
|
|
61
|
+
continue # untracked/overhead priors carry no continuity signal
|
|
62
|
+
con.execute(
|
|
63
|
+
"INSERT INTO app_sessions"
|
|
64
|
+
" (task_key, started_at, ended_at, duration_s, task_confidence, task_session_type)"
|
|
65
|
+
" VALUES (?, ?, ?, ?, 1.0, 'task')",
|
|
66
|
+
(task_key, s["started_at"], s.get("ended_at") or s["started_at"], s["duration_s"]),
|
|
67
|
+
)
|
|
68
|
+
con.commit()
|
|
69
|
+
return _fetch_recent_ticket_activity(con, current_started_at, candidate_keys)
|
|
53
70
|
|
|
54
71
|
|
|
55
72
|
def render(persona: str) -> list[dict]:
|
|
@@ -79,8 +96,12 @@ def render(persona: str) -> list[dict]:
|
|
|
79
96
|
if not gt.get("scoreable"):
|
|
80
97
|
continue
|
|
81
98
|
|
|
82
|
-
recent = _project_recent(
|
|
83
|
-
|
|
99
|
+
recent = _project_recent(
|
|
100
|
+
scoreable_prior, s["started_at"], [c["task_key"] for c in candidates]
|
|
101
|
+
)
|
|
102
|
+
prompt = build_user_message(
|
|
103
|
+
s, candidates, recent_activity=recent, now_iso=s["started_at"]
|
|
104
|
+
)
|
|
84
105
|
|
|
85
106
|
expected = {
|
|
86
107
|
"task_key": gt.get("task_key", "none"),
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
"""Unit tests for the 30-min per-ticket continuity context.
|
|
2
|
+
|
|
3
|
+
Covers the two halves of the rewritten recent-work block:
|
|
4
|
+
* _fetch_recent_ticket_activity — windowing, confidence floor, candidate-gating,
|
|
5
|
+
per-ticket aggregation, recency ordering.
|
|
6
|
+
* _format_continuity — rendering (none / single / multiple / recency).
|
|
7
|
+
|
|
8
|
+
Run: services/.venv/bin/pytest services/tests/test_continuity_context.py -v
|
|
9
|
+
(Also runnable without pytest: services/.venv/bin/python services/tests/test_continuity_context.py)
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import datetime as _dt
|
|
14
|
+
import sqlite3
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
# Make `from agents import ...` resolve (mirror tests/evals/eval_classifier.py).
|
|
19
|
+
_SERVICES_DIR = Path(__file__).resolve().parent.parent
|
|
20
|
+
if str(_SERVICES_DIR) not in sys.path:
|
|
21
|
+
sys.path.insert(0, str(_SERVICES_DIR))
|
|
22
|
+
|
|
23
|
+
from agents import run_task_linker_mlx as rtl # noqa: E402
|
|
24
|
+
from agents import _prompts # noqa: E402
|
|
25
|
+
|
|
26
|
+
ANCHOR = "2026-06-17T10:00:00+00:00"
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _at(minutes_before: int) -> str:
|
|
30
|
+
base = _dt.datetime.fromisoformat(ANCHOR)
|
|
31
|
+
return (base - _dt.timedelta(minutes=minutes_before)).isoformat()
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _make_con(rows: list[dict]) -> sqlite3.Connection:
|
|
35
|
+
"""In-memory meridian DB with the columns _fetch_recent_ticket_activity reads."""
|
|
36
|
+
con = sqlite3.connect(":memory:")
|
|
37
|
+
con.row_factory = sqlite3.Row
|
|
38
|
+
con.execute(
|
|
39
|
+
"CREATE TABLE app_sessions ("
|
|
40
|
+
" id INTEGER PRIMARY KEY AUTOINCREMENT,"
|
|
41
|
+
" task_key TEXT, started_at TEXT, ended_at TEXT, duration_s REAL,"
|
|
42
|
+
" task_confidence REAL, task_session_type TEXT)"
|
|
43
|
+
)
|
|
44
|
+
for r in rows:
|
|
45
|
+
con.execute(
|
|
46
|
+
"INSERT INTO app_sessions"
|
|
47
|
+
" (task_key, started_at, ended_at, duration_s, task_confidence, task_session_type)"
|
|
48
|
+
" VALUES (?, ?, ?, ?, ?, ?)",
|
|
49
|
+
(
|
|
50
|
+
r.get("task_key"),
|
|
51
|
+
r["started_at"],
|
|
52
|
+
r.get("ended_at"),
|
|
53
|
+
r.get("duration_s", 0.0),
|
|
54
|
+
r.get("task_confidence", 0.9),
|
|
55
|
+
r.get("task_session_type", "task"),
|
|
56
|
+
),
|
|
57
|
+
)
|
|
58
|
+
con.commit()
|
|
59
|
+
return con
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _keys(activity):
|
|
63
|
+
return [a["task_key"] for a in activity]
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
# ── _fetch_recent_ticket_activity ────────────────────────────────────────────
|
|
67
|
+
|
|
68
|
+
def test_aggregates_per_ticket_and_orders_by_recency():
|
|
69
|
+
con = _make_con([
|
|
70
|
+
# KAN-1: two sessions, most-recent ends 2 min before anchor
|
|
71
|
+
{"task_key": "KAN-1", "started_at": _at(10), "ended_at": _at(9), "duration_s": 300},
|
|
72
|
+
{"task_key": "KAN-1", "started_at": _at(3), "ended_at": _at(2), "duration_s": 120},
|
|
73
|
+
# KAN-2: one session, ends 20 min before anchor
|
|
74
|
+
{"task_key": "KAN-2", "started_at": _at(21), "ended_at": _at(20), "duration_s": 600},
|
|
75
|
+
])
|
|
76
|
+
out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1", "KAN-2"])
|
|
77
|
+
assert _keys(out) == ["KAN-1", "KAN-2"] # most-recently-active first
|
|
78
|
+
k1 = out[0]
|
|
79
|
+
assert k1["sessions"] == 2
|
|
80
|
+
assert k1["total_s"] == 420.0 # 300 + 120 summed
|
|
81
|
+
assert abs(k1["ago_s"] - 120) < 1 # last active ~2 min ago
|
|
82
|
+
assert abs(out[1]["ago_s"] - 1200) < 1 # KAN-2 ~20 min ago
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_excludes_sessions_outside_the_window():
|
|
86
|
+
con = _make_con([
|
|
87
|
+
{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60},
|
|
88
|
+
{"task_key": "KAN-9", "started_at": _at(40), "ended_at": _at(39), "duration_s": 60}, # >30 min
|
|
89
|
+
])
|
|
90
|
+
out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1", "KAN-9"])
|
|
91
|
+
assert _keys(out) == ["KAN-1"]
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def test_excludes_below_confidence_floor():
|
|
95
|
+
con = _make_con([
|
|
96
|
+
{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_confidence": 0.9},
|
|
97
|
+
{"task_key": "KAN-2", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_confidence": 0.5},
|
|
98
|
+
])
|
|
99
|
+
out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1", "KAN-2"])
|
|
100
|
+
assert _keys(out) == ["KAN-1"]
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def test_candidate_gating_drops_non_candidate_tickets():
|
|
104
|
+
con = _make_con([
|
|
105
|
+
{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60},
|
|
106
|
+
{"task_key": "KAN-7", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60}, # not a candidate
|
|
107
|
+
])
|
|
108
|
+
out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1"])
|
|
109
|
+
assert _keys(out) == ["KAN-1"]
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
def test_excludes_untracked_and_null_task():
|
|
113
|
+
con = _make_con([
|
|
114
|
+
{"task_key": None, "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_session_type": "untracked"},
|
|
115
|
+
{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60, "task_session_type": "task"},
|
|
116
|
+
])
|
|
117
|
+
out = rtl._fetch_recent_ticket_activity(con, ANCHOR, ["KAN-1"])
|
|
118
|
+
assert _keys(out) == ["KAN-1"]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def test_no_candidates_returns_empty():
|
|
122
|
+
con = _make_con([{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60}])
|
|
123
|
+
assert rtl._fetch_recent_ticket_activity(con, ANCHOR, []) == []
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def test_no_anchor_returns_empty():
|
|
127
|
+
con = _make_con([{"task_key": "KAN-1", "started_at": _at(5), "ended_at": _at(4), "duration_s": 60}])
|
|
128
|
+
assert rtl._fetch_recent_ticket_activity(con, "", ["KAN-1"]) == []
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
# ── _format_continuity ───────────────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
def test_format_empty_is_explicit_no_work_line():
|
|
134
|
+
out = _prompts._format_continuity([])
|
|
135
|
+
assert out.strip() == "(no tracked work in this window)"
|
|
136
|
+
assert out != "" # explicit, never silent
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def test_format_single_ticket_recent():
|
|
140
|
+
out = _prompts._format_continuity(
|
|
141
|
+
[{"task_key": "KAN-1", "total_s": 420, "sessions": 2, "ago_s": 30}]
|
|
142
|
+
)
|
|
143
|
+
assert "KAN-1" in out
|
|
144
|
+
assert "~7 min" in out
|
|
145
|
+
assert "2 sessions" in out
|
|
146
|
+
assert "just before this session" in out # ago < 60s
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def test_format_recency_minutes():
|
|
150
|
+
out = _prompts._format_continuity(
|
|
151
|
+
[{"task_key": "KAN-2", "total_s": 600, "sessions": 1, "ago_s": 1200}]
|
|
152
|
+
)
|
|
153
|
+
assert "1 session" in out
|
|
154
|
+
assert "~20 min before this session" in out
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def test_format_multiple_tickets_one_bullet_each():
|
|
158
|
+
out = _prompts._format_continuity([
|
|
159
|
+
{"task_key": "KAN-1", "total_s": 420, "sessions": 2, "ago_s": 30},
|
|
160
|
+
{"task_key": "KAN-2", "total_s": 600, "sessions": 1, "ago_s": 1200},
|
|
161
|
+
])
|
|
162
|
+
assert out.count("•") == 2
|
|
163
|
+
assert "KAN-1" in out and "KAN-2" in out
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
def test_build_user_message_includes_block_when_activity_present():
|
|
167
|
+
msg = _prompts.build_user_message(
|
|
168
|
+
{"app_name": "Code", "session_text": "x"},
|
|
169
|
+
[{"task_key": "KAN-1", "title": "t", "description_text": "d"}],
|
|
170
|
+
recent_activity=[{"task_key": "KAN-1", "total_s": 60, "sessions": 1, "ago_s": 30}],
|
|
171
|
+
now_iso=ANCHOR,
|
|
172
|
+
)
|
|
173
|
+
assert "RECENT WORK CONTEXT" in msg
|
|
174
|
+
assert "WEAK" in msg # framed as a weak prior
|
|
175
|
+
assert "KAN-1" in msg
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def test_build_user_message_shows_explicit_block_when_no_activity():
|
|
179
|
+
msg = _prompts.build_user_message(
|
|
180
|
+
{"app_name": "Code", "session_text": "x"},
|
|
181
|
+
[{"task_key": "KAN-1", "title": "t", "description_text": "d"}],
|
|
182
|
+
recent_activity=[],
|
|
183
|
+
now_iso=ANCHOR,
|
|
184
|
+
)
|
|
185
|
+
assert "RECENT WORK CONTEXT" in msg # always present now
|
|
186
|
+
assert "no tracked work in this window" in msg # explicit empty state
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
# ── plain-python runner (no pytest needed) ───────────────────────────────────
|
|
190
|
+
|
|
191
|
+
if __name__ == "__main__":
|
|
192
|
+
fns = [v for k, v in sorted(globals().items()) if k.startswith("test_") and callable(v)]
|
|
193
|
+
passed = 0
|
|
194
|
+
for fn in fns:
|
|
195
|
+
try:
|
|
196
|
+
fn()
|
|
197
|
+
print(f" PASS {fn.__name__}")
|
|
198
|
+
passed += 1
|
|
199
|
+
except Exception as exc: # noqa: BLE001
|
|
200
|
+
print(f" FAIL {fn.__name__}: {exc!r}")
|
|
201
|
+
print(f"\n{passed}/{len(fns)} passed")
|
|
202
|
+
raise SystemExit(0 if passed == len(fns) else 1)
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
"""Unit tests for `_fetch_pm_tasks` candidate-set policy.
|
|
2
|
+
|
|
3
|
+
Covers the plan-only candidate filtering (CLASSIFY_PLAN_ONLY_CANDIDATES) added on
|
|
4
|
+
top of the legacy boost-never-filter behaviour, including both safety guards:
|
|
5
|
+
* no confirmed plan → every candidate is offered (unchanged behaviour)
|
|
6
|
+
* plan confirmed → candidates narrowed to the confirmed plan, in order
|
|
7
|
+
* plan tickets gone → fall back to the full set (never zero candidates)
|
|
8
|
+
* curation-excluded → never a candidate, even if named in the plan
|
|
9
|
+
|
|
10
|
+
Run: services/.venv/bin/pytest services/tests/test_fetch_pm_tasks.py -v
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import sqlite3
|
|
15
|
+
import sys
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import pytest
|
|
19
|
+
|
|
20
|
+
# Make `from agents import ...` resolve (mirror tests/evals/eval_classifier.py).
|
|
21
|
+
_SERVICES_DIR = Path(__file__).resolve().parent.parent
|
|
22
|
+
if str(_SERVICES_DIR) not in sys.path:
|
|
23
|
+
sys.path.insert(0, str(_SERVICES_DIR))
|
|
24
|
+
|
|
25
|
+
from agents import run_task_linker_mlx as rtl # noqa: E402
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _make_con(task_keys: list[str], excluded: list[str] | None = None) -> sqlite3.Connection:
|
|
29
|
+
"""In-memory meridian DB with the columns `_fetch_pm_tasks` selects."""
|
|
30
|
+
con = sqlite3.connect(":memory:")
|
|
31
|
+
con.row_factory = sqlite3.Row
|
|
32
|
+
con.execute(
|
|
33
|
+
"CREATE TABLE pm_tasks ("
|
|
34
|
+
" task_key TEXT PRIMARY KEY, title TEXT, description_text TEXT,"
|
|
35
|
+
" status_raw TEXT, is_terminal INTEGER, issue_type TEXT,"
|
|
36
|
+
" parent_key TEXT, epic_title TEXT, sprint_name TEXT, tags TEXT)"
|
|
37
|
+
)
|
|
38
|
+
con.execute(
|
|
39
|
+
"CREATE TABLE pm_task_curation (task_key TEXT PRIMARY KEY, decision TEXT)"
|
|
40
|
+
)
|
|
41
|
+
for k in task_keys:
|
|
42
|
+
con.execute(
|
|
43
|
+
"INSERT INTO pm_tasks (task_key, title, description_text, status_raw,"
|
|
44
|
+
" is_terminal, issue_type, parent_key, epic_title, sprint_name, tags)"
|
|
45
|
+
" VALUES (?, ?, '', 'In Progress', 0, 'Task', '', '', '', '')",
|
|
46
|
+
(k, f"title {k}"),
|
|
47
|
+
)
|
|
48
|
+
for k in excluded or []:
|
|
49
|
+
con.execute(
|
|
50
|
+
"INSERT INTO pm_task_curation (task_key, decision) VALUES (?, 'excluded')",
|
|
51
|
+
(k,),
|
|
52
|
+
)
|
|
53
|
+
con.commit()
|
|
54
|
+
return con
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
@pytest.fixture
|
|
58
|
+
def plan_only(monkeypatch):
|
|
59
|
+
"""Force plan-only filtering on regardless of the ambient env default."""
|
|
60
|
+
monkeypatch.setattr(rtl, "_PLAN_ONLY_CANDIDATES", True)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
@pytest.fixture
|
|
64
|
+
def boost_mode(monkeypatch):
|
|
65
|
+
"""Force the legacy boost-never-filter behaviour."""
|
|
66
|
+
monkeypatch.setattr(rtl, "_PLAN_ONLY_CANDIDATES", False)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _keys(tasks):
|
|
70
|
+
return [t["task_key"] for t in tasks]
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def test_no_plan_returns_all_unmarked(plan_only):
|
|
74
|
+
"""No confirmed plan → every candidate offered, none marked as focus."""
|
|
75
|
+
con = _make_con(["K-1", "K-2", "K-3"])
|
|
76
|
+
tasks = rtl._fetch_pm_tasks(con, focus_keys=[])
|
|
77
|
+
assert set(_keys(tasks)) == {"K-1", "K-2", "K-3"}
|
|
78
|
+
assert all(not t.get("is_today_focus") for t in tasks)
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def test_plan_only_narrows_to_plan_in_declared_order(plan_only):
|
|
82
|
+
"""Plan confirmed → candidates are exactly the plan, in declared order, marked."""
|
|
83
|
+
con = _make_con(["K-1", "K-2", "K-3", "K-4"])
|
|
84
|
+
tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-3", "K-1"])
|
|
85
|
+
assert _keys(tasks) == ["K-3", "K-1"] # declared order preserved
|
|
86
|
+
assert all(t["is_today_focus"] for t in tasks)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def test_plan_only_falls_back_when_plan_tickets_absent(plan_only):
|
|
90
|
+
"""GUARD: plan tickets not in the live pool → fall back to ALL, never empty."""
|
|
91
|
+
con = _make_con(["K-1", "K-2"])
|
|
92
|
+
tasks = rtl._fetch_pm_tasks(con, focus_keys=["GHOST-9"])
|
|
93
|
+
assert set(_keys(tasks)) == {"K-1", "K-2"} # full set, not empty
|
|
94
|
+
assert all(not t.get("is_today_focus") for t in tasks)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def test_plan_only_drops_curation_excluded_even_if_in_plan(plan_only):
|
|
98
|
+
"""An excluded ticket is never a candidate, even when named in the plan."""
|
|
99
|
+
con = _make_con(["K-1", "K-2"], excluded=["K-2"])
|
|
100
|
+
tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-2", "K-1"])
|
|
101
|
+
# K-2 excluded → only K-1 survives; still a non-empty, plan-scoped set.
|
|
102
|
+
assert _keys(tasks) == ["K-1"]
|
|
103
|
+
assert tasks[0]["is_today_focus"]
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_plan_only_partial_plan_keeps_only_live_plan_tickets(plan_only):
|
|
107
|
+
"""Plan names a live + a dead ticket → only the live one is offered."""
|
|
108
|
+
con = _make_con(["K-1", "K-2", "K-3"])
|
|
109
|
+
tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-2", "GHOST-9"])
|
|
110
|
+
assert _keys(tasks) == ["K-2"]
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def test_boost_mode_keeps_all_with_plan_floated_to_top(boost_mode):
|
|
114
|
+
"""Flag off → legacy behaviour: plan floated to top, every candidate kept."""
|
|
115
|
+
con = _make_con(["K-1", "K-2", "K-3"])
|
|
116
|
+
tasks = rtl._fetch_pm_tasks(con, focus_keys=["K-3"])
|
|
117
|
+
assert set(_keys(tasks)) == {"K-1", "K-2", "K-3"} # recall untouched
|
|
118
|
+
assert tasks[0]["task_key"] == "K-3" # floated to top
|
|
119
|
+
assert tasks[0]["is_today_focus"]
|
|
120
|
+
assert sum(1 for t in tasks if t.get("is_today_focus")) == 1
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Unit tests for the candidate-description cap in `_format_candidates`.
|
|
2
|
+
|
|
3
|
+
The cap is configurable via CANDIDATE_DESC_CAP (default 0 = no cap). These cover
|
|
4
|
+
the default uncapped behaviour and an explicit ceiling.
|
|
5
|
+
|
|
6
|
+
Run: services/.venv/bin/pytest services/tests/test_format_candidates.py -v
|
|
7
|
+
"""
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
import pytest
|
|
14
|
+
|
|
15
|
+
_SERVICES_DIR = Path(__file__).resolve().parent.parent
|
|
16
|
+
if str(_SERVICES_DIR) not in sys.path:
|
|
17
|
+
sys.path.insert(0, str(_SERVICES_DIR))
|
|
18
|
+
|
|
19
|
+
from agents import _prompts # noqa: E402
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _task(desc: str) -> dict:
|
|
23
|
+
return {"task_key": "K-1", "title": "t", "description_text": desc}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_default_no_cap_keeps_full_description(monkeypatch):
|
|
27
|
+
monkeypatch.setattr(_prompts, "CANDIDATE_DESC_CAP", 0)
|
|
28
|
+
desc = "x" * 1000
|
|
29
|
+
out = _prompts._format_candidates([_task(desc)])
|
|
30
|
+
assert desc in out # full text present
|
|
31
|
+
assert "…" not in out # no truncation marker
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def test_positive_cap_truncates_with_marker(monkeypatch):
|
|
35
|
+
monkeypatch.setattr(_prompts, "CANDIDATE_DESC_CAP", 50)
|
|
36
|
+
out = _prompts._format_candidates([_task("y" * 100)])
|
|
37
|
+
assert "y" * 50 + "…" in out
|
|
38
|
+
assert "y" * 51 not in out # nothing past the cap
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_description_under_cap_unchanged(monkeypatch):
|
|
42
|
+
monkeypatch.setattr(_prompts, "CANDIDATE_DESC_CAP", 240)
|
|
43
|
+
out = _prompts._format_candidates([_task("short desc")])
|
|
44
|
+
assert "short desc" in out
|
|
45
|
+
assert "…" not in out
|
package/ui.tar.gz
CHANGED
|
Binary file
|