nodebench-mcp 2.51.0 → 2.53.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,143 @@
1
+ /**
2
+ * cohorts.ts — Canonical cohort definitions for N=1, N=5, N=10, N=100 benchmarks.
3
+ *
4
+ * N=1 proves it can work
5
+ * N=5 proves it is not just for you
6
+ * N=10 proves it survives role and session variance
7
+ * N=100 proves it compounds over time
8
+ */
9
+ /* ─── Canonical Users ────────────────────────────────────────────────────── */
10
/**
 * Canonical benchmark personas, keyed by a `<role>_<firstName>` slug.
 *
 * Every entry carries the same six fields, in this order: `userId`, `role`,
 * `name`, `description`, `scenarios`, `primaryEntity`. Key order is kept
 * stable because the N=100 cohort is built with `Object.values(USERS)`.
 */
const USERS = (() => {
    // Entity names used by more than one persona are spelled once here so
    // the repeated values cannot drift apart.
    const NODEBENCH = "NodeBench";
    const ANTHROPIC = "Anthropic";
    const SHOPIFY = "Shopify";

    return {
        founder_homen: {
            userId: "u_founder_homen",
            role: "founder",
            name: "Homen (Founder)",
            description: "Solo technical founder building NodeBench. Weekly reset, delegation, strategy.",
            scenarios: [
                "weekly_reset",
                "pre_delegation",
                "important_change",
                "competitor_brief",
                "packet_diff",
            ],
            primaryEntity: NODEBENCH,
        },
        banker_sarah: {
            userId: "u_banker_sarah",
            role: "banker",
            name: "Sarah (Banker)",
            description: "Investment banker evaluating AI infrastructure companies for deal flow.",
            scenarios: [
                "company_search",
                "competitor_brief",
                "memo_export",
                "role_switch",
            ],
            primaryEntity: ANTHROPIC,
        },
        ceo_marcus: {
            userId: "u_ceo_marcus",
            role: "ceo",
            name: "Marcus (CEO)",
            description: "CEO of a mid-stage startup. Quarterly strategy, board narrative, resource allocation.",
            scenarios: [
                "weekly_reset",
                "important_change",
                "company_search",
                "memo_export",
            ],
            primaryEntity: SHOPIFY,
        },
        researcher_lin: {
            userId: "u_researcher_lin",
            role: "researcher",
            name: "Lin (Researcher)",
            description: "AI strategy analyst. Competitor intelligence, market mapping, evidence synthesis.",
            scenarios: [
                "competitor_brief",
                "company_search",
                "uploaded_notes",
                "html_export",
            ],
            primaryEntity: "Supermemory",
        },
        student_aisha: {
            userId: "u_student_aisha",
            role: "student",
            name: "Aisha (Student)",
            description: "MBA student studying AI commerce strategy. Needs citation-friendly study briefs.",
            scenarios: [
                "company_search",
                "uploaded_notes",
                "memo_export",
                "role_switch",
            ],
            primaryEntity: SHOPIFY,
        },
        legal_david: {
            userId: "u_legal_david",
            role: "legal",
            name: "David (Legal)",
            description: "In-house counsel reviewing AI partnerships. Regulatory exposure, governance, IP.",
            scenarios: [
                "company_search",
                "important_change",
                "memo_export",
            ],
            primaryEntity: ANTHROPIC,
        },
        pm_rachel: {
            userId: "u_pm_rachel",
            role: "pm",
            name: "Rachel (PM)",
            description: "Product manager at an AI-native company. Feature prioritization, competitor tracking.",
            scenarios: [
                "weekly_reset",
                "competitor_brief",
                "important_change",
                "packet_diff",
            ],
            primaryEntity: NODEBENCH,
        },
        contractor_kai: {
            userId: "u_contractor_kai",
            role: "contractor",
            name: "Kai (Contractor)",
            description: "Freelance developer receiving delegation packets. Needs scoped context without full history.",
            scenarios: [
                "pre_delegation",
                "packet_diff",
                "uploaded_notes",
            ],
            primaryEntity: NODEBENCH,
        },
        investor_priya: {
            userId: "u_investor_priya",
            role: "investor",
            name: "Priya (Investor)",
            description: "VC partner evaluating AI infrastructure deals. Pipeline tracking, comparables, diligence.",
            scenarios: [
                "company_search",
                "competitor_brief",
                "memo_export",
                "role_switch",
            ],
            primaryEntity: ANTHROPIC,
        },
        content_james: {
            userId: "u_content_james",
            role: "content",
            name: "James (Content)",
            description: "Content strategist tracking AI industry trends. Post performance, audience analysis.",
            scenarios: [
                "company_search",
                "important_change",
                "competitor_brief",
                "html_export",
            ],
            primaryEntity: SHOPIFY,
        },
    };
})();
92
+ /* ─── N=1 — Golden Path ──────────────────────────────────────────────────── */
93
// The single persona exercised by the golden-path run.
const goldenPathFounder = USERS.founder_homen;

/**
 * N=1 cohort: one user (the founder persona), one session, restricted to the
 * "same_session" time horizon.
 */
export const COHORT_N1 = {
    cohortId: "cohort_n1_golden_path",
    layer: "N1",
    users: [goldenPathFounder],
    sessionsPerUser: 1,
    timeHorizons: ["same_session"],
    description: "Single founder run: weekly reset → memo export → pre-delegation → important-change. Proves the golden path works.",
};
101
+ /* ─── N=5 — Role Generalization ──────────────────────────────────────────── */
102
// Five personas, one per role: founder, banker, CEO, researcher, student.
// The entries are the same objects held in USERS (shared by reference).
const roleVarianceUsers = [
    USERS.founder_homen,
    USERS.banker_sarah,
    USERS.ceo_marcus,
    USERS.researcher_lin,
    USERS.student_aisha,
];

/**
 * N=5 cohort: five personas with distinct roles, one session each, restricted
 * to the "same_session" time horizon.
 */
export const COHORT_N5 = {
    cohortId: "cohort_n5_role_variance",
    layer: "N5",
    users: roleVarianceUsers,
    sessionsPerUser: 1,
    timeHorizons: ["same_session"],
    description: "5 users with different roles analyze the same entity. Proves role adaptation without hallucination.",
};
116
+ /* ─── N=10 — Repeated-Session Stability ──────────────────────────────────── */
117
// Five personas (founder, banker, CEO, researcher, student); with two
// sessions per user this yields the ten runs the cohort is named for.
// The entries are the same objects held in USERS (shared by reference).
const sessionStabilityUsers = [
    USERS.founder_homen,
    USERS.banker_sarah,
    USERS.ceo_marcus,
    USERS.researcher_lin,
    USERS.student_aisha,
];

/**
 * N=10 cohort: five personas × two sessions, covering the "same_session" and
 * "next_day" time horizons.
 */
export const COHORT_N10 = {
    cohortId: "cohort_n10_session_stability",
    layer: "N10",
    users: sessionStabilityUsers,
    sessionsPerUser: 2,
    timeHorizons: ["same_session", "next_day"],
    description: "5 users × 2 sessions each. Session 2 tests: remembered context, surfaced delta, refreshed packet. Proves session continuity.",
};
131
+ /* ─── N=100 — Longitudinal Compounding ───────────────────────────────────── */
132
// Every persona defined in USERS, in the object's key order.
const longitudinalUsers = Object.values(USERS);

// Time horizons listed for the longitudinal cohort, shortest first.
const longitudinalHorizons = [
    "same_session",
    "same_day",
    "next_day",
    "weekly",
    "monthly",
    "quarterly",
];

/**
 * N=100 cohort: all personas in USERS × ten sessions each, spread over six
 * time horizons from "same_session" to "quarterly".
 */
export const COHORT_N100 = {
    cohortId: "cohort_n100_longitudinal",
    layer: "N100",
    users: longitudinalUsers,
    sessionsPerUser: 10,
    timeHorizons: longitudinalHorizons,
    description: "10 users × 10 sessions across time horizons. Tests compounding memory, packet reuse, repeat-cognition avoidance, regression resistance.",
};
140
+ /* ─── All Cohorts ────────────────────────────────────────────────────────── */
141
+ export const ALL_COHORTS = [COHORT_N1, COHORT_N5, COHORT_N10, COHORT_N100];
142
+ export const ALL_USERS = USERS;
143
+ //# sourceMappingURL=cohorts.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"cohorts.js","sourceRoot":"","sources":["../../src/benchmarks/cohorts.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAIH,gFAAgF;AAEhF,MAAM,KAAK,GAAkC;IAC3C,aAAa,EAAE;QACb,MAAM,EAAE,iBAAiB;QACzB,IAAI,EAAE,SAAS;QACf,IAAI,EAAE,iBAAiB;QACvB,WAAW,EAAE,gFAAgF;QAC7F,SAAS,EAAE,CAAC,cAAc,EAAE,gBAAgB,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,aAAa,CAAC;QACpG,aAAa,EAAE,WAAW;KAC3B;IACD,YAAY,EAAE;QACZ,MAAM,EAAE,gBAAgB;QACxB,IAAI,EAAE,QAAQ;QACd,IAAI,EAAE,gBAAgB;QACtB,WAAW,EAAE,yEAAyE;QACtF,SAAS,EAAE,CAAC,gBAAgB,EAAE,kBAAkB,EAAE,aAAa,EAAE,aAAa,CAAC;QAC/E,aAAa,EAAE,WAAW;KAC3B;IACD,UAAU,EAAE;QACV,MAAM,EAAE,cAAc;QACtB,IAAI,EAAE,KAAK;QACX,IAAI,EAAE,cAAc;QACpB,WAAW,EAAE,uFAAuF;QACpG,SAAS,EAAE,CAAC,cAAc,EAAE,kBAAkB,EAAE,gBAAgB,EAAE,aAAa,CAAC;QAChF,aAAa,EAAE,SAAS;KACzB;IACD,cAAc,EAAE;QACd,MAAM,EAAE,kBAAkB;QAC1B,IAAI,EAAE,YAAY;QAClB,IAAI,EAAE,kBAAkB;QACxB,WAAW,EAAE,mFAAmF;QAChG,SAAS,EAAE,CAAC,kBAAkB,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,aAAa,CAAC;QAClF,aAAa,EAAE,aAAa;KAC7B;IACD,aAAa,EAAE;QACb,MAAM,EAAE,iBAAiB;QACzB,IAAI,EAAE,SAAS;QACf,IAAI,EAAE,iBAAiB;QACvB,WAAW,EAAE,kFAAkF;QAC/F,SAAS,EAAE,CAAC,gBAAgB,EAAE,gBAAgB,EAAE,aAAa,EAAE,aAAa,CAAC;QAC7E,aAAa,EAAE,SAAS;KACzB;IACD,WAAW,EAAE;QACX,MAAM,EAAE,eAAe;QACvB,IAAI,EAAE,OAAO;QACb,IAAI,EAAE,eAAe;QACrB,WAAW,EAAE,kFAAkF;QAC/F,SAAS,EAAE,CAAC,gBAAgB,EAAE,kBAAkB,EAAE,aAAa,CAAC;QAChE,aAAa,EAAE,WAAW;KAC3B;IACD,SAAS,EAAE;QACT,MAAM,EAAE,aAAa;QACrB,IAAI,EAAE,IAAI;QACV,IAAI,EAAE,aAAa;QACnB,WAAW,EAAE,uFAAuF;QACpG,SAAS,EAAE,CAAC,cAAc,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,aAAa,CAAC;QAClF,aAAa,EAAE,WAAW;KAC3B;IACD,cAAc,EAAE;QACd,MAAM,EAAE,kBAAkB;QAC1B,IAAI,EAAE,YAAY;QAClB,IAAI,EAAE,kBAAkB;QACxB,WAAW,EAAE,8FAA8F;QAC3G,SAAS,EAAE,CAAC,gBAAgB,EAAE,aAAa,EAAE,gBAAgB,CAAC;QAC9D,aAAa,EAAE,WAAW;KAC3B;IACD,cAAc,EAAE;QACd,MAAM,EAAE,kBAAkB;QAC1B,IAAI,EAAE,UAAU;QAChB,IAAI,EAAE,kBAAkB;QACxB,WAAW,EAAE,2FAA2F;QACxG,SAAS,EAAE,CAAC,gBAAgB,EAAE,kBAAkB,EAAE,aAAa,EAAE,aAAa,CAAC;QAC/E,aAAa,EAAE,WAAW;KAC3B;IACD,aAAa,EAAE;QACb,MAAM,EAAE,iBAAiB;QACzB,IAAI,EAAE,SAAS;QACf,IAAI,EAAE,iBAAiB;QACvB,WAA
W,EAAE,sFAAsF;QACnG,SAAS,EAAE,CAAC,gBAAgB,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,aAAa,CAAC;QACpF,aAAa,EAAE,SAAS;KACzB;CACF,CAAC;AAEF,gFAAgF;AAEhF,MAAM,CAAC,MAAM,SAAS,GAAoB;IACxC,QAAQ,EAAE,uBAAuB;IACjC,KAAK,EAAE,IAAI;IACX,KAAK,EAAE,CAAC,KAAK,CAAC,aAAa,CAAC;IAC5B,eAAe,EAAE,CAAC;IAClB,YAAY,EAAE,CAAC,cAAc,CAAC;IAC9B,WAAW,EAAE,mHAAmH;CACjI,CAAC;AAEF,gFAAgF;AAEhF,MAAM,CAAC,MAAM,SAAS,GAAoB;IACxC,QAAQ,EAAE,yBAAyB;IACnC,KAAK,EAAE,IAAI;IACX,KAAK,EAAE;QACL,KAAK,CAAC,aAAa;QACnB,KAAK,CAAC,YAAY;QAClB,KAAK,CAAC,UAAU;QAChB,KAAK,CAAC,cAAc;QACpB,KAAK,CAAC,aAAa;KACpB;IACD,eAAe,EAAE,CAAC;IAClB,YAAY,EAAE,CAAC,cAAc,CAAC;IAC9B,WAAW,EAAE,qGAAqG;CACnH,CAAC;AAEF,gFAAgF;AAEhF,MAAM,CAAC,MAAM,UAAU,GAAoB;IACzC,QAAQ,EAAE,8BAA8B;IACxC,KAAK,EAAE,KAAK;IACZ,KAAK,EAAE;QACL,KAAK,CAAC,aAAa;QACnB,KAAK,CAAC,YAAY;QAClB,KAAK,CAAC,UAAU;QAChB,KAAK,CAAC,cAAc;QACpB,KAAK,CAAC,aAAa;KACpB;IACD,eAAe,EAAE,CAAC;IAClB,YAAY,EAAE,CAAC,cAAc,EAAE,UAAU,CAAC;IAC1C,WAAW,EAAE,8HAA8H;CAC5I,CAAC;AAEF,gFAAgF;AAEhF,MAAM,CAAC,MAAM,WAAW,GAAoB;IAC1C,QAAQ,EAAE,0BAA0B;IACpC,KAAK,EAAE,MAAM;IACb,KAAK,EAAE,MAAM,CAAC,MAAM,CAAC,KAAK,CAAC;IAC3B,eAAe,EAAE,EAAE;IACnB,YAAY,EAAE,CAAC,cAAc,EAAE,UAAU,EAAE,UAAU,EAAE,QAAQ,EAAE,SAAS,EAAE,WAAW,CAAC;IACxF,WAAW,EAAE,yIAAyI;CACvJ,CAAC;AAEF,gFAAgF;AAEhF,MAAM,CAAC,MAAM,WAAW,GAAsB,CAAC,SAAS,EAAE,SAAS,EAAE,UAAU,EAAE,WAAW,CAAC,CAAC;AAC9F,MAAM,CAAC,MAAM,SAAS,GAAG,KAAK,CAAC"}
@@ -0,0 +1,91 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * longitudinalHarness.ts — Longitudinal dogfood benchmark harness for NodeBench MCP.
4
+ *
5
+ * Orchestrates N=1, N=5, N=10, N=100 cohort evaluations to measure whether
6
+ * NodeBench compounds value across users, roles, sessions, and time horizons.
7
+ *
8
+ * Core metrics:
9
+ * RCA — Repeated Cognition Avoided (% of sessions not restating prior context)
10
+ * PRR — Packet Reuse Rate (% of sessions reusing a prior packet)
11
+ *
12
+ * Usage:
13
+ * cd packages/mcp-local && npx tsx src/benchmarks/longitudinalHarness.ts [n1|n5|n10|n100|all]
14
+ */
15
/**
 * A participant in a benchmark cohort.
 *
 * NOTE(review): the meaning of `preset` and `typicalScenarios` is not visible
 * from this declaration file — confirm against the harness implementation.
 */
export interface CohortUser {
    userId: string;
    /** One of the ten roles the benchmark recognises. */
    role:
        | "founder"
        | "banker"
        | "ceo"
        | "researcher"
        | "student"
        | "legal"
        | "pm"
        | "contractor"
        | "investor"
        | "content";
    preset: string;
    typicalScenarios: Array<string>;
}
21
/**
 * Record of one benchmark session.
 *
 * The boolean flags are the inputs a per-session metric can be derived from;
 * NOTE(review): which flags feed RCA and PRR is presumed from their names
 * (`contextRestated` / `repeatQuestionDetected` and `packetReused`) — confirm
 * against the harness implementation.
 */
export interface SessionRun {
    runId: string;
    userId: string;
    /** Free-form string here, unlike the closed role union on `CohortUser`. */
    role: string;
    scenarioId: string;
    sessionIndex: number;
    timeHorizon:
        | "same_session"
        | "same_day"
        | "next_day"
        | "weekly"
        | "monthly"
        | "quarterly"
        | "yearly";
    surface:
        | "mcp"
        | "ai_app"
        | "local_dashboard"
        | "engine_api";
    toolCallCount: number;
    /** NOTE(review): presumably milliseconds, per the name — confirm what span is timed. */
    latencyMs: number;
    packetGenerated: boolean;
    packetReused: boolean;
    repeatQuestionDetected: boolean;
    contextRestated: boolean;
    exportProduced: boolean;
    /** NOTE(review): scale of the score is not shown in this file. */
    judgeScore: number;
    errors: Array<string>;
}
39
/**
 * Aggregated result for one cohort evaluation.
 *
 * NOTE(review): whether the rate/precision fields are fractions (0–1) or
 * percentages (0–100) is not visible from this declaration file — confirm
 * against the harness implementation before comparing to thresholds.
 */
export interface CohortReport {
    cohortSize: number;
    totalSessions: number;
    rolesCovered: Array<string>;
    coreLoopsCovered: number;
    /** RCA: Repeated Cognition Avoided. */
    repeatedCognitionAvoided: number;
    /** PRR: Packet Reuse Rate. */
    packetReuseRate: number;
    importantChangePrecision: number;
    contradictionPrecision: number;
    falseAlertRate: number;
    exportToActionRate: number;
    topRecurringRootCause: string;
    topRegressionRisk: string;
    passed: boolean;
    /** Numeric thresholds keyed by name; the key set is not fixed by this type. */
    passThresholds: Record<string, number>;
}
55
+ /**
56
+ * Repeated Cognition Avoided (RCA): % of sessions where the user did NOT
57
+ * restate context or re-ask old questions.
58
+ */
59
+ export declare function computeRCA(sessions: SessionRun[]): number;
60
+ /**
61
+ * Packet Reuse Rate (PRR): % of sessions where a prior packet was reused
62
+ * rather than regenerated from scratch.
63
+ */
64
+ export declare function computePRR(sessions: SessionRun[]): number;
65
+ /**
66
+ * Aggregate all metrics from a batch of sessions into a CohortReport.
67
+ */
68
+ export declare function generateCohortReport(sessions: SessionRun[], cohortSize: number, layer: "n1" | "n5" | "n10" | "n100"): CohortReport;
69
+ /**
70
+ * N=1: Single golden-path founder run.
71
+ * Validates that the core tool chain works end-to-end for one scenario.
72
+ */
73
+ export declare function runN1(): Promise<{
74
+ session: SessionRun;
75
+ report: CohortReport;
76
+ }>;
77
+ /**
78
+ * N=5: 5 users (founder, banker, ceo, researcher, student), 1 session each.
79
+ * All against the same entity "Anthropic".
80
+ */
81
+ export declare function runN5(): Promise<CohortReport>;
82
+ /**
83
+ * N=10: 10 users x 1 session each OR 5 users x 2 sessions.
84
+ * Tests session-continuity metrics.
85
+ */
86
+ export declare function runN10(): Promise<CohortReport>;
87
+ /**
88
+ * N=100: 10 users x 10 sessions each (simulated across time horizons).
89
+ * Measures RCA + PRR compounding over time.
90
+ */
91
+ export declare function runN100(): Promise<CohortReport>;