useathena 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. package/README.md +258 -0
  2. package/apps/chrome-extension/README.md +35 -0
  3. package/apps/chrome-extension/background.js +97 -0
  4. package/apps/chrome-extension/gmail.js +107 -0
  5. package/apps/chrome-extension/linkedin.js +123 -0
  6. package/apps/chrome-extension/manifest.json +27 -0
  7. package/apps/chrome-extension/options.html +60 -0
  8. package/apps/chrome-extension/options.js +36 -0
  9. package/apps/chrome-extension/popup.html +37 -0
  10. package/apps/chrome-extension/popup.js +22 -0
  11. package/bin/athena +28 -0
  12. package/dist/api/server.js +145 -0
  13. package/dist/capture/ingest.js +85 -0
  14. package/dist/cli/commands.js +201 -0
  15. package/dist/cli/format.js +76 -0
  16. package/dist/cli/setup.js +316 -0
  17. package/dist/cli.js +291 -0
  18. package/dist/config.js +26 -0
  19. package/dist/core/fixtures.js +65 -0
  20. package/dist/core/ids.js +34 -0
  21. package/dist/core/refs.js +25 -0
  22. package/dist/core/types.js +10 -0
  23. package/dist/engine/engine.js +136 -0
  24. package/dist/engine/parse.js +76 -0
  25. package/dist/engine/prompts.js +64 -0
  26. package/dist/eval/harness.js +123 -0
  27. package/dist/eval/judge.js +75 -0
  28. package/dist/eval/run-eval.js +46 -0
  29. package/dist/eval/scenarios.js +470 -0
  30. package/dist/mcp/server.js +107 -0
  31. package/dist/mcp-server.js +7 -0
  32. package/dist/model/api-model-client.js +99 -0
  33. package/dist/model/cli-model-client.js +111 -0
  34. package/dist/model/model-client.js +28 -0
  35. package/dist/model/registry.js +67 -0
  36. package/dist/sensors/claude-code-hook.js +131 -0
  37. package/dist/serve/brief.js +95 -0
  38. package/dist/serve/outcome.js +56 -0
  39. package/dist/store/open.js +19 -0
  40. package/dist/store/store.js +269 -0
  41. package/docs/schema.md +368 -0
  42. package/package.json +43 -0
  43. package/scripts/prepare.mjs +20 -0
@@ -0,0 +1,46 @@
1
+ import { LlmHypothesisEngine } from "../engine/engine.js";
2
+ import { modelClientFromSpec, resolveModelSpec } from "../model/registry.js";
3
+ import { loadConfig } from "../config.js";
4
+ import { LlmJudge, RubricJudge } from "./judge.js";
5
+ import { formatReport, runEval } from "./harness.js";
6
+ import { GOLDEN_SCENARIOS } from "./scenarios.js";
7
+ /**
8
+ * Run the real engine against the golden scenarios.
9
+ *
10
+ * npm run eval # rubric judge (free, keyword floor)
11
+ * npm run eval -- --judge llm # LLM judge (the real measure)
12
+ * ATHENA_MODEL=cli:claude:haiku npm run eval
13
+ */
14
// Shared setup: one model client drives the engine and, when requested, the LLM judge.
const config = loadConfig();
const spec = resolveModelSpec(config);
// `--judge llm` selects the LLM judge; any other value (or no flag) keeps the rubric judge.
const judgeFlagAt = process.argv.indexOf("--judge");
const useLlmJudge = judgeFlagAt !== -1 && process.argv[judgeFlagAt + 1] === "llm";
const model = modelClientFromSpec(spec, config);
const inner = new LlmHypothesisEngine(model);
const judge = useLlmJudge ? new LlmJudge(model) : new RubricJudge();
// Capture what the engine inferred so the report is inspectable, not just scoreable.
// Entries are keyed by the domain of the first instance handed to `infer`.
const inferred = new Map();
const engine = {
    async infer(instances) {
        const hypotheses = await inner.infer(instances);
        const domainKey = instances[0]?.situation.domain ?? "unknown";
        inferred.set(domainKey, hypotheses);
        return hypotheses;
    },
};
console.log(`engine model: ${model.id} judge: ${useLlmJudge ? `llm (${model.id})` : "rubric"}`);
const scenarioIds = GOLDEN_SCENARIOS.map((golden) => golden.id);
console.log(`scenarios: ${scenarioIds.join(", ")}\n`);
const startedAt = Date.now();
const report = await runEval(engine, GOLDEN_SCENARIOS, judge);
console.log(formatReport(report));
const elapsedSeconds = Math.round((Date.now() - startedAt) / 1000);
console.log(`\ndone in ${elapsedSeconds}s`);
// Per scenario, list the captured rules, looked up by the domain of the scenario's first training instance.
for (const scenario of report.scenarios) {
    const golden = GOLDEN_SCENARIOS.find((candidate) => candidate.id === scenario.scenarioId);
    const domain = golden?.train[0]?.situation.domain ?? "unknown";
    console.log(`\n=== ${scenario.scenarioId} — inferred rules ===`);
    const captured = inferred.get(domain) ?? [];
    for (const hypothesis of captured) {
        let flag = "";
        if (scenario.falseHypothesisIds.includes(hypothesis.id)) {
            flag = " [UNMATCHED BY RUBRIC]";
        }
        const parts = [
            `- (${hypothesis.status}, conf ${hypothesis.confidence}, replay ${hypothesis.replay.reproduced}/${hypothesis.replay.tested})${flag}\n`,
            ` ${hypothesis.rule}\n`,
        ];
        if (hypothesis.appliesWhen.length) {
            parts.push(` applies: ${hypothesis.appliesWhen.join("; ")}\n`);
        }
        if (hypothesis.doesNotApplyWhen.length) {
            parts.push(` except: ${hypothesis.doesNotApplyWhen.join("; ")}`);
        }
        console.log(parts.join(""));
    }
}
@@ -0,0 +1,470 @@
1
+ import { makeInstance } from "../core/fixtures.js";
2
// Counts fixtures built so far; each call to correction() bumps it, so every
// fixture gets a distinct timestamp and distinct content hashes.
let sequence = 0;
/**
 * Build one judgment-instance fixture via makeInstance.
 *
 * @param input Fixture fields: `summary`, `domain`, `before`, `after`,
 *   `diffSummary`, plus optional `kind` (defaults to "correction") and
 *   `magnitude` (defaults to "minor").
 * @returns Whatever makeInstance returns for the assembled fields.
 */
function correction(input) {
    sequence += 1;
    const HOUR_MS = 3_600_000;
    // One hour per fixture after 2026-05-01T00:00:00Z (Date.UTC months are 0-indexed, so 4 is May).
    const observedAt = new Date(Date.UTC(2026, 4, 1) + sequence * HOUR_MS).toISOString();
    const textBlob = (content, suffix) => ({
        mediaType: "text/plain",
        content,
        contentHash: `fix-${sequence}-${suffix}`,
    });
    const situation = {
        summary: input.summary,
        domain: input.domain,
        cues: [],
        objectIds: [],
    };
    const diff = {
        summary: input.diffSummary,
        hunks: [{ before: input.before, after: input.after }],
        magnitude: input.magnitude ?? "minor",
    };
    return makeInstance({
        kind: input.kind ?? "correction",
        observedAt,
        situation,
        before: textBlob(input.before, "a"),
        after: textBlob(input.after, "b"),
        diff,
    });
}
18
// --- 1. Em-dash removal (crisp lexical rule, cross-channel) ---
// One planted rule ("emdash.remove") in domain "writing.style".
// `train` holds four corrections (follow-up email, LinkedIn post, Slack summary,
// product one-pager); `heldOut` holds a single "apply" case.
// `mustMention` / `expectMention` are lists of keyword groups; how they are
// matched is decided by the consumer, which is not part of this file.
// NOTE: each correction() call advances the shared `sequence` counter, so
// reordering or inserting entries changes the generated timestamps and
// content hashes of every fixture built after it.
const emdash = {
    id: "emdash",
    title: "Removes em-dashes from drafted text",
    planted: [
        {
            key: "emdash.remove",
            statement: "Remove em-dashes from drafts; use commas, periods, or parentheses instead.",
            domain: "writing.style",
            mustMention: [
                ["em-dash", "em dash", "—", "dash"],
                ["remove", "avoid", "replace", "without", "no ", "never"],
            ],
        },
    ],
    train: [
        correction({
            domain: "writing.style",
            summary: "editing agent-drafted follow-up email to a prospect",
            before: "We're aligned on pricing — let's move to the legal review.",
            after: "We're aligned on pricing. Let's move to the legal review.",
            diffSummary: "replaced em-dash with a period",
        }),
        correction({
            domain: "writing.style",
            summary: "editing agent-drafted LinkedIn post",
            before: "Great conversation today — lots to unpack — more on this soon.",
            after: "Great conversation today, lots to unpack. More on this soon.",
            diffSummary: "replaced two em-dashes with comma and period",
        }),
        correction({
            domain: "writing.style",
            summary: "editing agent-drafted Slack summary",
            before: "Checked the Q2 numbers — they match the board deck.",
            after: "Checked the Q2 numbers, they match the board deck.",
            diffSummary: "replaced em-dash with a comma",
        }),
        correction({
            domain: "writing.style",
            summary: "editing agent-drafted product one-pager",
            before: "Our wedge — tacit knowledge capture — is hard to copy.",
            after: "Our wedge (tacit knowledge capture) is hard to copy.",
            diffSummary: "replaced em-dash pair with parentheses",
        }),
    ],
    heldOut: [
        {
            instance: correction({
                domain: "writing.style",
                summary: "editing agent-drafted intro email",
                before: "Happy to intro you to Lena — she runs RevOps at Klar.",
                after: "Happy to intro you to Lena. She runs RevOps at Klar.",
                diffSummary: "replaced em-dash with a period",
            }),
            kind: "apply",
            ruleKey: "emdash.remove",
            expectMention: [["em-dash", "em dash", "—", "dash"]],
        },
    ],
};
78
// --- 2. LinkedIn connection-note tone (calibration band: not salesy, not dry) ---
// One planted rule ("linkedin.note_band") plus one `alsoAcceptable` rule
// ("linkedin.soft_close"), both in domain "linkedin.connection_note".
// `train` holds five corrections: three where the draft pitched or promoted
// and two where it was boilerplate or generic. `heldOut` holds two "apply"
// cases, one expecting the pitch keyword group and one the
// specific/personal group.
// NOTE: each correction() call advances the shared `sequence` counter, so
// reordering or inserting entries changes the generated timestamps and
// content hashes of every fixture built after it.
const linkedinTone = {
    id: "linkedin-tone",
    title: "Connection notes: personal and specific, neither pitch nor boilerplate",
    planted: [
        {
            key: "linkedin.note_band",
            statement: "LinkedIn connection notes should be brief and personal: reference something specific about the recipient; no product pitch or meeting ask, but no empty boilerplate either.",
            domain: "linkedin.connection_note",
            mustMention: [
                ["personal", "specific", "reference"],
                ["pitch", "salesy", "sell", "demo", "promotional"],
                ["boilerplate", "generic", "dry", "empty"],
            ],
        },
    ],
    alsoAcceptable: [
        {
            key: "linkedin.soft_close",
            statement: "Close connection notes with a soft ask-free line like 'Would be glad to connect.'",
            domain: "linkedin.connection_note",
            mustMention: [["glad to connect", "soft", "close", "closing"]],
        },
    ],
    train: [
        correction({
            domain: "linkedin.connection_note",
            summary: "connection note to a RevOps lead, draft was a pitch",
            before: "Hi Maria, athena cuts agent errors by 40%. Want to book a quick demo this week? Here's my calendar link.",
            after: "Hi Maria, your post on why agent pilots stall in RevOps matched what we keep hearing. Would be glad to connect.",
            diffSummary: "removed product pitch, demo ask, and calendar link; added reference to her post",
            magnitude: "rewrite",
        }),
        correction({
            domain: "linkedin.connection_note",
            summary: "connection note to a conference speaker, draft was promotional",
            before: "Hi Jonas, we're building the future of agent reliability. Check out athena and let's set up a call.",
            after: "Hi Jonas, your SaaStr talk on tacit knowledge in sales teams stuck with me. Would be glad to connect.",
            diffSummary: "removed promotion and call ask; referenced his talk",
            magnitude: "rewrite",
        }),
        correction({
            domain: "linkedin.connection_note",
            summary: "connection note to a CTO, draft was empty boilerplate",
            before: "Hi Tom, I'd like to add you to my professional network.",
            after: "Hi Tom, your point about CRM data rot was spot on. We hear the same from every RevOps team. Would be glad to connect.",
            diffSummary: "replaced generic boilerplate with a specific reference to his content",
            magnitude: "rewrite",
        }),
        correction({
            domain: "linkedin.connection_note",
            summary: "connection note to a founder, draft was generic",
            before: "Hello Anna, expanding my network in the GTM space. Let's connect.",
            after: "Hello Anna, loved how you framed design partners as co-builders in your newsletter. Would be glad to connect.",
            diffSummary: "replaced generic networking line with a specific reference to her newsletter",
            magnitude: "rewrite",
        }),
        // No `magnitude` here, so correction() falls back to "minor".
        correction({
            domain: "linkedin.connection_note",
            summary: "connection note to a VP Sales, draft pitched in the second sentence",
            before: "Hi Priya, great panel yesterday. By the way, athena could fix your team's follow-up problem, want a demo?",
            after: "Hi Priya, great panel yesterday. Your line about reps drowning in tools was the highlight. Would be glad to connect.",
            diffSummary: "kept the personal opener, removed the pitch and demo ask",
        }),
    ],
    heldOut: [
        {
            instance: correction({
                domain: "linkedin.connection_note",
                summary: "connection note to a sales engineer, draft was salesy",
                before: "Hi Omar, athena makes agents reliable. Free trial this week only, interested?",
                after: "Hi Omar, your teardown of agent demos that fall apart in week two was brutal and accurate. Would be glad to connect.",
                diffSummary: "removed pitch and urgency, added specific reference",
                magnitude: "rewrite",
            }),
            kind: "apply",
            ruleKey: "linkedin.note_band",
            expectMention: [["pitch", "salesy", "sell", "demo", "promotional"]],
        },
        {
            instance: correction({
                domain: "linkedin.connection_note",
                summary: "connection note to an analyst, draft was bare",
                before: "Hi Dana, let's connect.",
                after: "Hi Dana, your market map of agent memory vendors is the best one out there. Would be glad to connect.",
                diffSummary: "replaced bare ask with specific reference to her work",
                magnitude: "rewrite",
            }),
            kind: "apply",
            ruleKey: "linkedin.note_band",
            expectMention: [["specific", "personal", "reference"]],
        },
    ],
};
172
// --- 3. Slack brevity (context-conditional with a legitimate exception) ---
// One planted rule ("slack.brevity") in domain "slack.reply", carrying one
// boundary: long-form is fine when a detailed write-up is explicitly asked for.
// `train` holds four corrections that shorten a reply. `heldOut` holds:
//   - a "boundary" case whose instance is an approval (before and after are
//     identical, instance kind "approval", magnitude "trivial"), and
//   - an "apply" case that shortens a reply.
// Note the two different `kind` fields in a held-out entry: the one passed to
// correction() is the instance kind, the outer one is the held-out case kind.
// NOTE: each correction() call advances the shared `sequence` counter, so
// reordering or inserting entries changes the generated timestamps and
// content hashes of every fixture built after it.
const slackBrevity = {
    id: "slack-brevity",
    title: "Slack replies stay short unless detail is asked for",
    planted: [
        {
            key: "slack.brevity",
            statement: "Slack replies should be at most a few sentences unless the thread explicitly asks for a detailed write-up.",
            domain: "slack.reply",
            mustMention: [
                ["short", "brief", "concise", "few sentences", "long"],
            ],
            boundaries: [
                {
                    statement: "Long-form is fine when someone explicitly asks for a detailed write-up or analysis.",
                    mustMention: [["unless", "except", "asked", "asks", "request", "detail"]],
                },
            ],
        },
    ],
    train: [
        correction({
            domain: "slack.reply",
            summary: "reply to a status question in #gtm",
            before: "Good question! So there are several aspects to consider here. First, the pipeline review showed that we have 14 open opportunities, of which 6 are in stage 3 or later. Second, the two enterprise deals are waiting on security review, which historically takes 2-3 weeks. Third, regarding the renewal cohort, I think we should also discuss the expansion potential, because if we look at usage data there are at least 4 accounts that crossed the seat threshold. Happy to go deeper on any of these!",
            after: "6 of 14 opps are stage 3+. Both enterprise deals are in security review (2-3 weeks). Renewals look fine.",
            diffSummary: "cut a seven-sentence reply to three short sentences with the key numbers",
            magnitude: "rewrite",
        }),
        correction({
            domain: "slack.reply",
            summary: "reply to 'did the import finish?' in #eng",
            before: "Yes! The import finished successfully. For context, we processed all the records from the staging environment and the deduplication step worked as expected. There were a few warnings about malformed rows but nothing blocking. Let me know if you want me to walk through the logs together sometime.",
            after: "Yes, finished clean. A few malformed-row warnings, nothing blocking.",
            diffSummary: "cut five sentences to two",
            magnitude: "substantive",
        }),
        correction({
            domain: "slack.reply",
            summary: "reply to a quick scheduling check in DM",
            before: "Thanks for asking! I checked my calendar and I have a few options. Tuesday is quite packed with the board prep, but Wednesday morning could work, and Thursday afternoon is also relatively free. I would say Wednesday is probably best but I'm flexible. What works best for you?",
            after: "Wednesday morning or Thursday afternoon. Slight preference for Wednesday.",
            diffSummary: "cut the reply to the two viable options",
            magnitude: "substantive",
        }),
        correction({
            domain: "slack.reply",
            summary: "reply to 'any blockers?' in standup thread",
            before: "Not really blockers per se, but there are a few things worth mentioning for visibility. The OAuth review is still pending on Google's side, which is expected at this stage. I'm also waiting for feedback on the pricing doc, though that's not urgent yet. Otherwise everything is moving along nicely and I'm optimistic about the timeline.",
            after: "No blockers. OAuth review pending on Google's side (expected).",
            diffSummary: "cut to the one fact that matters",
            magnitude: "substantive",
        }),
    ],
    heldOut: [
        {
            instance: correction({
                domain: "slack.reply",
                summary: "reply to 'can you write up the full analysis of the churned accounts?'",
                before: "We lost 3 accounts this quarter. Acme churned over a missing integration: they asked for the HubSpot sync in January, we shipped half of it. Beta Corp's champion left in March and we never rebuilt the relationship with the new VP. Gamma's usage dropped 60% after their reorg and nobody flagged it. Common thread: all three had warning signs in usage data at least six weeks out. Proposal: weekly usage-drop alert plus a champion-change playbook.",
                after: "We lost 3 accounts this quarter. Acme churned over a missing integration: they asked for the HubSpot sync in January, we shipped half of it. Beta Corp's champion left in March and we never rebuilt the relationship with the new VP. Gamma's usage dropped 60% after their reorg and nobody flagged it. Common thread: all three had warning signs in usage data at least six weeks out. Proposal: weekly usage-drop alert plus a champion-change playbook.",
                diffSummary: "kept long-form as-is: a detailed write-up was explicitly requested",
                kind: "approval",
                magnitude: "trivial",
            }),
            kind: "boundary",
            ruleKey: "slack.brevity",
            expectMention: [["unless", "except", "asked", "asks", "request", "detail"]],
        },
        {
            instance: correction({
                domain: "slack.reply",
                summary: "reply to 'how did the demo go?' in #gtm",
                before: "The demo went really well overall! We started with the usual deck intro, then moved into the live product part. They especially liked the review queue, and the CTO asked several questions about the MCP integration, which I took as a strong buying signal. There were some concerns about pricing at the end but I think we handled them well. Next step is a technical deep-dive with their platform team, probably next week.",
                after: "Went well. CTO dug into the MCP integration (good sign). Next: technical deep-dive with their platform team next week.",
                diffSummary: "cut six sentences to three",
                magnitude: "substantive",
            }),
            kind: "apply",
            ruleKey: "slack.brevity",
            expectMention: [["short", "brief", "concise", "few sentences", "long"]],
        },
    ],
};
256
// --- 4. External formality (boundary discrimination: ongoing vs first contact) ---
// One planted rule ("external.warmth") plus one `alsoAcceptable` rule
// ("external.concrete_commitments"), both in domain "email.external".
// The planted rule carries one boundary: first contact with senior
// executives stays formal.
// `train` holds four corrections that relax stiff phrasing for known contacts.
// `heldOut` holds a "boundary" case (an approval of a formal first cold email,
// before and after identical) and an "apply" case.
// The first-contact keyword group appears twice, once in the planted boundary
// and once in the held-out boundary case; keep the two lists in sync.
// NOTE: each correction() call advances the shared `sequence` counter, so
// reordering or inserting entries changes the generated timestamps and
// content hashes of every fixture built after it.
const externalFormality = {
    id: "external-formality",
    title: "Warm and direct with known externals; stiff formality only for first contact",
    planted: [
        {
            key: "external.warmth",
            statement: "With external partners we already work with, drop stiff formality: warm, direct, professional tone. Keep formal register only for first contact with senior executives.",
            domain: "email.external",
            mustMention: [
                ["formal", "formality", "stiff"],
                ["warm", "direct", "relaxed", "casual"],
            ],
            boundaries: [
                {
                    statement: "First contact with senior executives stays formal.",
                    mustMention: [
                        [
                            "first contact",
                            "first-contact",
                            "first outreach",
                            "first email",
                            "cold outreach",
                            "new contact",
                            "no existing relationship",
                            "no established relationship",
                            "senior",
                            "executive",
                        ],
                    ],
                },
            ],
        },
    ],
    alsoAcceptable: [
        {
            key: "external.concrete_commitments",
            statement: "Replace vague hedged commitments ('in due course', 'whether you might be available') with concrete specifics: a day, a duration.",
            domain: "email.external",
            mustMention: [
                ["concrete", "specific", "bounded"],
                ["vague", "hedge", "in due course", "open-ended"],
            ],
        },
    ],
    train: [
        correction({
            domain: "email.external",
            summary: "recap email to a partner we've worked with for months",
            before: "Dear Mr. Petersen, I hope this message finds you well. Pursuant to our discussion on Tuesday, please find below a summary of the agreed action items for your kind consideration.",
            after: "Hi Lars, great talking Tuesday. Quick recap of what we agreed:",
            diffSummary: "dropped formal salutation and stiff phrasing for warm direct tone with a known partner",
            magnitude: "rewrite",
        }),
        correction({
            domain: "email.external",
            summary: "reply to a design partner's feedback email",
            before: "Dear Ms. Okafor, thank you very much for your valuable feedback. We are most grateful for the time you have invested and shall revert with a detailed response in due course.",
            after: "Hi Amara, this is exactly the feedback we needed, thank you. I'll get back to you with details by Thursday.",
            diffSummary: "replaced stiff gratitude formula with warm direct reply and a concrete commitment",
            magnitude: "rewrite",
        }),
        correction({
            domain: "email.external",
            summary: "scheduling email to an agency contact we talk to weekly",
            before: "Dear Mr. Brandt, I would like to kindly inquire whether you might be available for a call next week.",
            after: "Hi Felix, do you have 30 minutes next week?",
            diffSummary: "cut formal inquiry to a direct question for a weekly contact",
            magnitude: "substantive",
        }),
        correction({
            domain: "email.external",
            summary: "follow-up to a customer champion after the QBR",
            before: "Dear Mrs. Lindqvist, it was a pleasure to meet with you and your esteemed colleagues. We remain at your disposal should any questions arise.",
            after: "Hi Sara, really enjoyed the session with your team. Ping me anytime if questions come up.",
            diffSummary: "dropped ceremonial closing for warm direct tone with an established champion",
            magnitude: "rewrite",
        }),
    ],
    heldOut: [
        {
            instance: correction({
                domain: "email.external",
                summary: "first cold email to the CFO of an enterprise prospect",
                before: "Dear Mr. Tanaka, I hope this message finds you well. I am writing to introduce athena and to ask whether a brief conversation might be of interest.",
                after: "Dear Mr. Tanaka, I hope this message finds you well. I am writing to introduce athena and to ask whether a brief conversation might be of interest.",
                diffSummary: "kept formal register: first contact with a senior executive",
                kind: "approval",
                magnitude: "trivial",
            }),
            kind: "boundary",
            ruleKey: "external.warmth",
            expectMention: [
                [
                    "first contact",
                    "first-contact",
                    "first outreach",
                    "first email",
                    "cold outreach",
                    "new contact",
                    "no existing relationship",
                    "no established relationship",
                    "senior",
                    "executive",
                ],
            ],
        },
        {
            instance: correction({
                domain: "email.external",
                summary: "status email to an implementation partner we work with daily",
                before: "Dear Mr. Novak, please be advised that the integration milestone has been reached as per the agreed schedule.",
                after: "Hi Petr, milestone done, right on schedule.",
                diffSummary: "dropped stiff formal notice for direct update to a daily collaborator",
                magnitude: "rewrite",
            }),
            kind: "apply",
            ruleKey: "external.warmth",
            expectMention: [["warm", "direct", "relaxed", "casual"]],
        },
    ],
};
378
// --- 5. Meeting scheduling push (calibration band: concrete CTA, no pressure) ---
// One planted rule ("scheduling.band") in domain "email.scheduling".
// `train` holds five corrections covering both sides of the band: drafts that
// pressured or assumed the meeting, and drafts that offered no next step.
// `heldOut` holds two "apply" cases, one expecting the concrete-proposal
// keyword group and one the pressure group.
// NOTE: each correction() call advances the shared `sequence` counter, so
// reordering or inserting entries changes the generated timestamps and
// content hashes of every fixture built after it.
const meetingPush = {
    id: "meeting-push",
    title: "Always propose a concrete next step, never with pressure",
    planted: [
        {
            key: "scheduling.band",
            statement: "When closing a message, always propose a concrete next meeting (one or two specific options), but without pressure tactics or assumed commitment.",
            domain: "email.scheduling",
            mustMention: [
                ["concrete", "specific", "propose", "options", "next step"],
                ["pressure", "pushy", "assume", "force"],
            ],
        },
    ],
    train: [
        correction({
            domain: "email.scheduling",
            summary: "closing of a follow-up email, draft was pushy",
            before: "I'll send a calendar invite for tomorrow 9am. Looking forward to closing this!",
            after: "Would Tuesday or Thursday afternoon work for a 30-minute follow-up? Happy to adjust.",
            diffSummary: "replaced assumed commitment and urgency with two concrete options",
            magnitude: "rewrite",
        }),
        correction({
            domain: "email.scheduling",
            summary: "closing of a demo recap, draft had no next step",
            before: "Thanks again for your time. Let me know if you ever want to chat further.",
            after: "Thanks again for your time. Would early next week work for a technical deep-dive? Tuesday or Wednesday morning are both open.",
            diffSummary: "replaced vague open-ended closing with a concrete proposal and two options",
            magnitude: "substantive",
        }),
        correction({
            domain: "email.scheduling",
            summary: "closing of a pricing discussion email, draft was pushy",
            before: "This offer expires Friday, so we should get a contract call on the books today. When can you sign?",
            after: "Happy to walk through the proposal together. Would Thursday or Friday work for a short call?",
            diffSummary: "removed deadline pressure and signing push; proposed two call options",
            magnitude: "rewrite",
        }),
        correction({
            domain: "email.scheduling",
            summary: "closing of a re-engagement email, draft had no ask",
            before: "Hope things are going well on your side. Don't hesitate to reach out anytime.",
            after: "Hope things are going well on your side. Worth a 20-minute catch-up? Next Wednesday or Friday morning would work here.",
            diffSummary: "added a concrete low-pressure meeting proposal with two options",
            magnitude: "substantive",
        }),
        correction({
            domain: "email.scheduling",
            summary: "closing of an intro email, draft assumed the meeting",
            before: "I've gone ahead and booked us 45 minutes on Monday. See you then!",
            after: "Would a 30-minute intro call help? Monday or Tuesday afternoon both work on my side.",
            diffSummary: "replaced pre-booked meeting with a proposal and options",
            magnitude: "rewrite",
        }),
    ],
    heldOut: [
        {
            instance: correction({
                domain: "email.scheduling",
                summary: "closing of a check-in email, draft trailed off without an ask",
                before: "Anyway, lots happening on our side too. Talk soon!",
                after: "Anyway, lots happening on our side too. Want to swap notes for 20 minutes? Thursday or Friday would work here.",
                diffSummary: "added a concrete two-option proposal to a closing that had none",
                magnitude: "substantive",
            }),
            kind: "apply",
            ruleKey: "scheduling.band",
            expectMention: [["concrete", "specific", "propose", "options", "next step"]],
        },
        {
            instance: correction({
                domain: "email.scheduling",
                summary: "closing of a negotiation email, draft pressured for commitment",
                before: "We need your decision by end of week. Can you confirm a signing call for Thursday?",
                after: "Take the time you need with the proposal. If a call would help, Thursday or early next week both work here.",
                diffSummary: "removed deadline pressure, kept a concrete but optional call offer",
                magnitude: "rewrite",
            }),
            kind: "apply",
            ruleKey: "scheduling.band",
            expectMention: [["pressure", "pushy", "assume", "force"]],
        },
    ],
};
464
/**
 * The golden scenarios, in order: em-dash removal, LinkedIn note tone,
 * Slack brevity, external formality, meeting push.
 */
export const GOLDEN_SCENARIOS = [emdash, linkedinTone, slackBrevity, externalFormality, meetingPush];
@@ -0,0 +1,107 @@
1
+ import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
2
+ import { z } from "zod";
3
+ import { ingestSensorEvent } from "../capture/ingest.js";
4
+ import { compileBrief } from "../serve/brief.js";
5
+ import { recordOutcome } from "../serve/outcome.js";
6
+ import { openRef } from "../store/open.js";
7
+ /**
8
+ * The agent surface: exactly 4 tools (design rule 8). No durable-mutation
9
+ * tools — athena_record creates evidence and outcomes; rules change only
10
+ * through the engine and human review.
11
+ */
12
// Values accepted for `kind` when athena_record is called with type=event.
const INSTANCE_KINDS = ["correction", "override", "decision", "escalation", "failed_attempt", "approval", "manual_note"];
// Values accepted for `result` when athena_record is called with type=outcome.
const OUTCOME_RESULTS = ["uncorrected", "corrected", "abandoned", "unknown"];
/**
 * Create an MCP server named "athena" and register its four tools
 * (athena_brief, athena_open, athena_record, athena_search) against `store`.
 *
 * @param store Store handed to compileBrief, openRef, recordOutcome and
 *   ingestSensorEvent, and queried directly through `store.search`.
 * @returns The configured McpServer; the caller is responsible for connecting it.
 */
export function buildMcpServer(store) {
    const server = new McpServer({ name: "athena", version: "0.1.0" });

    // type=outcome: briefId and result are both required; correctionInstanceId is forwarded only when truthy.
    const reportOutcome = (args) => {
        if (!(args.briefId && args.result)) {
            throw new Error("outcome requires briefId and result");
        }
        const report = { briefId: args.briefId, result: args.result };
        if (args.correctionInstanceId) {
            report.correctionInstanceId = args.correctionInstanceId;
        }
        return asJson(recordOutcome(store, report));
    };

    // type=event: kind and summary are both required; optional fields are copied only when provided.
    const captureEvent = (args) => {
        if (!(args.kind && args.summary)) {
            throw new Error("event requires kind and summary");
        }
        const event = {
            sensorId: "sen_mcp",
            emittedAt: new Date().toISOString(),
            kind: args.kind,
            situation: { summary: args.summary },
        };
        for (const field of ["domain", "task", "app"]) {
            if (args[field] !== undefined) {
                event.situation[field] = args[field];
            }
        }
        for (const side of ["before", "after"]) {
            if (args[side] !== undefined) {
                event[side] = { mediaType: "text/plain", content: args[side] };
            }
        }
        const instance = ingestSensorEvent(store, event);
        return asJson({ recorded: instance.id, kind: instance.kind, domain: instance.situation.domain });
    };

    server.registerTool("athena_brief", {
        description: "Call this BEFORE acting on a task. Returns the tacit judgment rules that apply (with confidence and boundary conditions), relevant facts with citations, things you must not assume, open questions, and a readiness verdict (act / act_with_caveats / inspect_first / ask_human). Honor the boundaries and do-not-assume list. After finishing the task, report what happened with athena_record type=outcome, citing the briefId.",
        inputSchema: {
            task: z.string().describe("The task you are about to perform, in one sentence"),
            domain: z.string().optional().describe('Optional dot-path domain, e.g. "email.outreach", "slack.reply", "code.review"'),
        },
    }, ({ task, domain }) => {
        // Only pass `domain` along when the caller supplied one.
        const request = { task };
        if (domain !== undefined) {
            request.domain = domain;
        }
        return asJson(compileBrief(store, request));
    });

    server.registerTool("athena_open", {
        description: "Dereference any athena:// ref from a brief or search result: the full hypothesis with evidence links, the underlying judgment instance (the actual correction), a source, an object, a brief, or an outcome.",
        inputSchema: { ref: z.string().describe("An athena:// ref, e.g. athena://hypothesis/hyp_...") },
    }, ({ ref }) => {
        const entity = openRef(store, ref);
        if (entity) {
            return asJson(entity);
        }
        throw new Error(`${ref} does not exist`);
    });

    server.registerTool("athena_record", {
        description: "Report back to athena. type=outcome: after a task where you used a brief, report whether your output was accepted unchanged (uncorrected) or edited by the human (corrected) — this is how rules earn or lose trust. type=event: capture a judgment moment you observed (a correction of your output, a human decision, a failed approach, an explicit 'remember this'). Events become evidence for new rules after review; nothing you record changes durable rules directly.",
        inputSchema: {
            type: z.enum(["outcome", "event"]),
            briefId: z.string().optional().describe("outcome: the brief this outcome judges"),
            result: z.enum(OUTCOME_RESULTS).optional().describe("outcome: what happened"),
            correctionInstanceId: z.string().optional().describe("outcome: if corrected, the instance id of the captured correction (record the event first)"),
            kind: z.enum(INSTANCE_KINDS).optional().describe("event: what kind of judgment moment"),
            summary: z.string().optional().describe("event: one-line situation summary"),
            domain: z.string().optional().describe('event: dot-path domain, e.g. "email.outreach"'),
            task: z.string().optional().describe("event: what was being attempted"),
            before: z.string().optional().describe("event: the draft/output before the human acted"),
            after: z.string().optional().describe("event: the human's version (omit for approvals)"),
            app: z.string().optional().describe("event: where this happened (gmail, slack, claude-code...)"),
        },
    }, (args) => (args.type === "outcome" ? reportOutcome(args) : captureEvent(args)));

    server.registerTool("athena_search", {
        description: "Lexical search across captured judgment instances, learned rules, and sources. Returns athena:// refs ranked by relevance — open them with athena_open.",
        inputSchema: {
            query: z.string(),
            lane: z.enum(["instance", "hypothesis", "source"]).optional().describe("Restrict to one lane"),
            limit: z.number().int().min(1).max(50).optional(),
        },
    }, ({ query, lane, limit }) => {
        // Default to 20 results when no limit is given.
        const hits = store.search(query, lane, limit ?? 20);
        return asJson(hits);
    });

    return server;
}
105
/**
 * Wrap a value as a tool result holding one text item with its pretty-printed JSON.
 *
 * @param value Any JSON-serializable value.
 * @returns `{ content: [{ type: "text", text }] }` where `text` is the
 *   2-space-indented JSON of `value`.
 * @throws {TypeError} Propagated from JSON.stringify (circular structures, BigInt).
 */
function asJson(value) {
    // JSON.stringify returns undefined (not a string) for undefined, functions
    // and symbols; fall back to the JSON literal "null" so `text` is always a string.
    const text = JSON.stringify(value, null, 2) ?? "null";
    return { content: [{ type: "text", text }] };
}
@@ -0,0 +1,7 @@
1
+ import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
2
+ import { buildMcpServer } from "./mcp/server.js";
3
+ import { AthenaStore } from "./store/store.js";
4
+ import { dbPath } from "./config.js";
5
// Entry point: open the store at the configured path, build the athena MCP
// server on top of it, and connect it to a stdio transport.
const store = new AthenaStore(dbPath());
const server = buildMcpServer(store);
const transport = new StdioServerTransport();
await server.connect(transport);