@iris-eval/mcp-server 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. package/LICENSE +21 -0
  2. package/README.md +168 -0
  3. package/dist/config/defaults.d.ts +2 -0
  4. package/dist/config/defaults.js +40 -0
  5. package/dist/config/index.d.ts +11 -0
  6. package/dist/config/index.js +106 -0
  7. package/dist/dashboard/assets/index-BStyrSkE.js +127 -0
  8. package/dist/dashboard/assets/index-DsCtYyvh.css +1 -0
  9. package/dist/dashboard/index.html +13 -0
  10. package/dist/eval/engine.d.ts +8 -0
  11. package/dist/eval/engine.js +61 -0
  12. package/dist/eval/index.d.ts +2 -0
  13. package/dist/eval/index.js +2 -0
  14. package/dist/eval/rules/completeness.d.ts +6 -0
  15. package/dist/eval/rules/completeness.js +79 -0
  16. package/dist/eval/rules/cost.d.ts +4 -0
  17. package/dist/eval/rules/cost.js +44 -0
  18. package/dist/eval/rules/custom.d.ts +2 -0
  19. package/dist/eval/rules/custom.js +88 -0
  20. package/dist/eval/rules/index.d.ts +4 -0
  21. package/dist/eval/rules/index.js +15 -0
  22. package/dist/eval/rules/relevance.d.ts +5 -0
  23. package/dist/eval/rules/relevance.js +87 -0
  24. package/dist/eval/rules/safety.d.ts +5 -0
  25. package/dist/eval/rules/safety.js +81 -0
  26. package/dist/index.d.ts +2 -0
  27. package/dist/index.js +101 -0
  28. package/dist/middleware/auth.d.ts +3 -0
  29. package/dist/middleware/auth.js +24 -0
  30. package/dist/middleware/cors.d.ts +2 -0
  31. package/dist/middleware/cors.js +29 -0
  32. package/dist/middleware/error-handler.d.ts +3 -0
  33. package/dist/middleware/error-handler.js +19 -0
  34. package/dist/middleware/index.d.ts +4 -0
  35. package/dist/middleware/index.js +4 -0
  36. package/dist/middleware/rate-limit.d.ts +3 -0
  37. package/dist/middleware/rate-limit.js +19 -0
  38. package/dist/resources/dashboard-summary.d.ts +3 -0
  39. package/dist/resources/dashboard-summary.js +14 -0
  40. package/dist/resources/index.d.ts +3 -0
  41. package/dist/resources/index.js +6 -0
  42. package/dist/resources/trace-detail.d.ts +3 -0
  43. package/dist/resources/trace-detail.js +28 -0
  44. package/dist/server.d.ts +9 -0
  45. package/dist/server.js +14 -0
  46. package/dist/storage/index.d.ts +4 -0
  47. package/dist/storage/index.js +10 -0
  48. package/dist/storage/migrations/001-initial-schema.d.ts +3 -0
  49. package/dist/storage/migrations/001-initial-schema.js +57 -0
  50. package/dist/storage/migrations/index.d.ts +2 -0
  51. package/dist/storage/migrations/index.js +22 -0
  52. package/dist/storage/sqlite-adapter.d.ts +33 -0
  53. package/dist/storage/sqlite-adapter.js +232 -0
  54. package/dist/tools/evaluate-output.d.ts +4 -0
  55. package/dist/tools/evaluate-output.js +58 -0
  56. package/dist/tools/get-traces.d.ts +3 -0
  57. package/dist/tools/get-traces.js +53 -0
  58. package/dist/tools/index.d.ts +4 -0
  59. package/dist/tools/index.js +8 -0
  60. package/dist/tools/log-trace.d.ts +3 -0
  61. package/dist/tools/log-trace.js +80 -0
  62. package/dist/transport/http.d.ts +10 -0
  63. package/dist/transport/http.js +37 -0
  64. package/dist/transport/index.d.ts +3 -0
  65. package/dist/transport/index.js +2 -0
  66. package/dist/transport/stdio.d.ts +2 -0
  67. package/dist/transport/stdio.js +4 -0
  68. package/dist/types/config.d.ts +37 -0
  69. package/dist/types/config.js +1 -0
  70. package/dist/types/eval.d.ts +51 -0
  71. package/dist/types/eval.js +1 -0
  72. package/dist/types/index.d.ts +4 -0
  73. package/dist/types/index.js +1 -0
  74. package/dist/types/query.d.ts +64 -0
  75. package/dist/types/query.js +1 -0
  76. package/dist/types/trace.d.ts +47 -0
  77. package/dist/types/trace.js +1 -0
  78. package/dist/utils/ids.d.ts +3 -0
  79. package/dist/utils/ids.js +10 -0
  80. package/dist/utils/logger.d.ts +8 -0
  81. package/dist/utils/logger.js +14 -0
  82. package/package.json +77 -0
  83. package/server.json +69 -0
@@ -0,0 +1,24 @@
1
+ import { timingSafeEqual } from 'node:crypto';
2
/**
 * Builds the bearer-token authentication middleware.
 *
 * When `config.security.apiKey` is empty the returned middleware lets every
 * request through. Otherwise every request except the two health-check paths
 * must carry `Authorization: Bearer <apiKey>`.
 *
 * @param config Object exposing `security.apiKey`.
 * @returns An Express-style `(req, res, next)` handler that answers 401 when
 *   the header is missing or not a bearer header, and 403 when the token does
 *   not match.
 */
export function createAuthMiddleware(config) {
    const { apiKey } = config.security;
    if (!apiKey) {
        return (_req, _res, next) => next();
    }
    const bearerPrefix = 'Bearer ';
    const openPaths = ['/health', '/api/v1/health'];
    const expected = Buffer.from(apiKey);
    const reject = (res, code, error) => {
        res.status(code).json({ error });
    };
    return (req, res, next) => {
        // Health probes stay reachable without credentials.
        if (openPaths.includes(req.path)) {
            return next();
        }
        const header = req.headers.authorization;
        const hasBearer = Boolean(header) && header.startsWith(bearerPrefix);
        if (!hasBearer) {
            reject(res, 401, 'Missing or invalid Authorization header');
            return;
        }
        const presented = Buffer.from(header.slice(bearerPrefix.length));
        // timingSafeEqual throws on unequal lengths, so the length is compared first.
        const matches = presented.length === expected.length && timingSafeEqual(presented, expected);
        if (!matches) {
            reject(res, 403, 'Invalid API key');
            return;
        }
        next();
    };
}
@@ -0,0 +1,2 @@
1
+ import type { RequestHandler } from 'express';
2
+ export declare function createCorsMiddleware(allowedOrigins: string[]): RequestHandler;
@@ -0,0 +1,29 @@
1
/**
 * Tells whether `origin` matches any entry of the allow-list.
 *
 * An entry matches when it is the literal `*`, when it equals the origin
 * exactly, or when it matches as a glob in which `*` stands for any run of
 * characters and every other regex metacharacter is taken literally.
 *
 * @param origin Value of the request's `Origin` header.
 * @param allowedOrigins Allow-list entries.
 * @returns `true` on the first matching entry, otherwise `false`.
 */
function isOriginAllowed(origin, allowedOrigins) {
    return allowedOrigins.some((candidate) => {
        if (candidate === '*' || candidate === origin) {
            return true;
        }
        // Escape regex metacharacters first, then widen the surviving `*` into `.*`.
        const escaped = candidate.replace(/[.+?^${}()|[\]\\]/g, '\\$&');
        const wildcard = escaped.replace(/\*/g, '.*');
        return new RegExp(`^${wildcard}$`).test(origin);
    });
}
13
/**
 * Builds the CORS middleware.
 *
 * For requests whose `Origin` header is on the allow-list, the origin is
 * echoed back in `Access-Control-Allow-Origin` together with the permitted
 * methods, headers and a 24h preflight cache lifetime. `OPTIONS` requests are
 * always answered with an empty 204; everything else continues down the chain.
 *
 * @param allowedOrigins Allow-list entries, matched by `isOriginAllowed`.
 * @returns An Express-style `(req, res, next)` handler.
 */
export function createCorsMiddleware(allowedOrigins) {
    return (req, res, next) => {
        const origin = req.headers.origin;
        // The response differs per request origin (CORS headers present or
        // absent), so caches must key on it even when the origin is rejected;
        // otherwise a header-less response could be replayed to an allowed origin.
        res.setHeader('Vary', 'Origin');
        if (origin && isOriginAllowed(origin, allowedOrigins)) {
            res.setHeader('Access-Control-Allow-Origin', origin);
            res.setHeader('Access-Control-Allow-Methods', 'GET, POST, DELETE, OPTIONS');
            res.setHeader('Access-Control-Allow-Headers', 'Content-Type, Authorization');
            res.setHeader('Access-Control-Max-Age', '86400');
        }
        // Preflight requests end here and never reach the route handlers.
        if (req.method === 'OPTIONS') {
            res.status(204).end();
            return;
        }
        next();
    };
}
@@ -0,0 +1,3 @@
1
+ import type { ErrorRequestHandler } from 'express';
2
+ import type { Logger } from '../utils/logger.js';
3
+ export declare function createErrorHandler(logger: Logger): ErrorRequestHandler;
@@ -0,0 +1,19 @@
1
/**
 * Builds the terminal Express error-handling middleware.
 *
 * - If the response has already started, the error is passed to `next` so the
 *   framework's default handler can deal with the half-written response.
 * - Zod validation errors become a 400 with the issue list in `details`.
 * - Any other error uses `err.status` / `err.statusCode` when that is an
 *   integer in the 4xx-5xx range and 500 otherwise. 5xx responses never expose
 *   the original message; the stack is included only when
 *   `NODE_ENV === 'development'`.
 *
 * @param logger Object with an `error(message, meta)` method.
 * @returns An Express-style `(err, req, res, next)` handler.
 */
export function createErrorHandler(logger) {
    return (err, _req, res, _next) => {
        // A JSON body can no longer be written once headers are on the wire.
        if (res.headersSent) {
            _next(err);
            return;
        }
        // Handle Zod validation errors
        if (err?.name === 'ZodError' || err?.constructor?.name === 'ZodError') {
            res.status(400).json({
                error: 'Validation error',
                details: err.errors ?? err.issues,
            });
            return;
        }
        // Only trust a status that is a real HTTP error code; anything else
        // (missing, non-numeric, 2xx/3xx) is reported as a server error.
        const requested = err?.status ?? err?.statusCode;
        const isHttpError = Number.isInteger(requested) && requested >= 400 && requested <= 599;
        const status = isHttpError ? requested : 500;
        const message = status >= 500 ? 'Internal server error' : (err?.message ?? 'Unknown error');
        logger.error(`Request error: ${err?.message}`, { status, stack: err?.stack });
        res.status(status).json({
            error: message,
            ...(process.env.NODE_ENV === 'development' ? { stack: err?.stack } : {}),
        });
    };
}
@@ -0,0 +1,4 @@
1
+ export { createAuthMiddleware } from './auth.js';
2
+ export { createCorsMiddleware } from './cors.js';
3
+ export { createErrorHandler } from './error-handler.js';
4
+ export { createApiRateLimiter, createMcpRateLimiter } from './rate-limit.js';
@@ -0,0 +1,4 @@
1
+ export { createAuthMiddleware } from './auth.js';
2
+ export { createCorsMiddleware } from './cors.js';
3
+ export { createErrorHandler } from './error-handler.js';
4
+ export { createApiRateLimiter, createMcpRateLimiter } from './rate-limit.js';
@@ -0,0 +1,3 @@
1
+ import type { IrisConfig } from '../types/config.js';
2
+ export declare function createApiRateLimiter(config: Pick<IrisConfig, 'security'>): import("express-rate-limit").RateLimitRequestHandler;
3
+ export declare function createMcpRateLimiter(config: Pick<IrisConfig, 'security'>): import("express-rate-limit").RateLimitRequestHandler;
@@ -0,0 +1,19 @@
1
+ import rateLimit from 'express-rate-limit';
2
/**
 * Builds the rate limiter for the REST API routes.
 *
 * @param config Object exposing `security.rateLimit.api`, the number of
 *   requests allowed per one-minute window.
 * @returns The `express-rate-limit` middleware configured with that budget.
 */
export function createApiRateLimiter(config) {
    const { api: requestsPerWindow } = config.security.rateLimit;
    const options = {
        windowMs: 60_000,
        limit: requestsPerWindow,
        standardHeaders: 'draft-7',
        legacyHeaders: false,
        message: { error: 'Too many requests, please try again later' },
    };
    return rateLimit(options);
}
11
/**
 * Builds the rate limiter for the MCP transport routes.
 *
 * @param config Object exposing `security.rateLimit.mcp`, the number of
 *   requests allowed per one-minute window.
 * @returns The `express-rate-limit` middleware configured with that budget.
 */
export function createMcpRateLimiter(config) {
    const { mcp: requestsPerWindow } = config.security.rateLimit;
    const options = {
        windowMs: 60_000,
        limit: requestsPerWindow,
        standardHeaders: 'draft-7',
        legacyHeaders: false,
        message: { error: 'Too many requests, please try again later' },
    };
    return rateLimit(options);
}
@@ -0,0 +1,3 @@
1
+ import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import type { IStorageAdapter } from '../types/query.js';
3
+ export declare function registerDashboardSummaryResource(server: McpServer, storage: IStorageAdapter): void;
@@ -0,0 +1,14 @@
1
/**
 * Registers the `dashboard-summary` resource on the MCP server.
 *
 * Reading `iris://dashboard/summary` returns the result of
 * `storage.getDashboardSummary()` as pretty-printed JSON.
 *
 * @param server MCP server exposing `resource(name, uri, metadata, reader)`.
 * @param storage Storage adapter exposing `getDashboardSummary()`.
 */
export function registerDashboardSummaryResource(server, storage) {
    const resourceUri = 'iris://dashboard/summary';
    const metadata = { description: 'Dashboard summary with key metrics and trends' };
    const readSummary = async () => {
        const text = JSON.stringify(await storage.getDashboardSummary(), null, 2);
        return {
            contents: [{ uri: resourceUri, mimeType: 'application/json', text }],
        };
    };
    server.resource('dashboard-summary', resourceUri, metadata, readSummary);
}
@@ -0,0 +1,3 @@
1
+ import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import type { IStorageAdapter } from '../types/query.js';
3
+ export declare function registerAllResources(server: McpServer, storage: IStorageAdapter): void;
@@ -0,0 +1,6 @@
1
+ import { registerDashboardSummaryResource } from './dashboard-summary.js';
2
+ import { registerTraceDetailResource } from './trace-detail.js';
3
/**
 * Registers every MCP resource this package provides, in a fixed order:
 * the dashboard summary first, then the trace detail resource.
 *
 * @param server MCP server to register on.
 * @param storage Storage adapter handed to each resource.
 */
export function registerAllResources(server, storage) {
    const registrars = [registerDashboardSummaryResource, registerTraceDetailResource];
    for (const register of registrars) {
        register(server, storage);
    }
}
@@ -0,0 +1,3 @@
1
+ import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import type { IStorageAdapter } from '../types/query.js';
3
+ export declare function registerTraceDetailResource(server: McpServer, storage: IStorageAdapter): void;
@@ -0,0 +1,28 @@
1
/**
 * Registers the `trace-detail` resource on the MCP server.
 *
 * The trace id is the last path segment of the requested URI, percent-decoded.
 * A known trace yields pretty-printed JSON `{ trace, spans, evals }`; an
 * unknown or empty id yields `{"error":"Trace not found"}`.
 *
 * @param server MCP server exposing `resource(name, uri, metadata, reader)`.
 * @param storage Storage adapter exposing `getTrace`, `getSpansByTraceId`
 *   and `getEvalsByTraceId`.
 */
export function registerTraceDetailResource(server, storage) {
    /** Wraps a payload in the single-item `contents` envelope the reader returns. */
    const asJsonContents = (uri, payload, indent) => ({
        contents: [
            {
                uri: uri.href,
                mimeType: 'application/json',
                text: JSON.stringify(payload, null, indent),
            },
        ],
    });
    // NOTE(review): the URI is passed as a plain string; confirm the SDK treats
    // `{trace_id}` as a template placeholder rather than as a literal URI.
    server.resource('trace-detail', 'iris://traces/{trace_id}', { description: 'Full trace detail with spans and evaluation results' }, async (uri) => {
        const rawId = uri.pathname.split('/').pop() ?? '';
        // URL keeps path segments percent-encoded, so decode before the lookup.
        // A malformed escape sequence falls back to the raw segment.
        let traceId = rawId;
        try {
            traceId = decodeURIComponent(rawId);
        }
        catch {
            traceId = rawId;
        }
        // An empty segment (e.g. a trailing slash) can never name a trace.
        const trace = traceId ? await storage.getTrace(traceId) : null;
        if (!trace) {
            return asJsonContents(uri, { error: 'Trace not found' }, undefined);
        }
        // The two lookups are independent of each other, so issue them together.
        const [spans, evals] = await Promise.all([
            storage.getSpansByTraceId(traceId),
            storage.getEvalsByTraceId(traceId),
        ]);
        return asJsonContents(uri, { trace, spans, evals }, 2);
    });
}
@@ -0,0 +1,9 @@
1
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import type { IrisConfig } from './types/index.js';
3
+ import type { IStorageAdapter } from './types/query.js';
4
+ import { EvalEngine } from './eval/engine.js';
5
+ export interface IrisServer {
6
+ mcpServer: McpServer;
7
+ evalEngine: EvalEngine;
8
+ }
9
+ export declare function createIrisServer(config: IrisConfig, storage: IStorageAdapter): IrisServer;
package/dist/server.js ADDED
@@ -0,0 +1,14 @@
1
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import { EvalEngine } from './eval/engine.js';
3
+ import { registerAllTools } from './tools/index.js';
4
+ import { registerAllResources } from './resources/index.js';
5
/**
 * Assembles the Iris MCP server.
 *
 * Creates the `McpServer` from `config.server`, creates the evaluation engine
 * from `config.eval.defaultThreshold`, then registers all tools and resources.
 *
 * @param config Configuration exposing `server.name`, `server.version` and
 *   `eval.defaultThreshold`.
 * @param storage Storage adapter shared by tools and resources.
 * @returns The `{ mcpServer, evalEngine }` pair.
 */
export function createIrisServer(config, storage) {
    const { name, version } = config.server;
    const mcpServer = new McpServer({ name, version });
    const evalEngine = new EvalEngine(config.eval.defaultThreshold);
    // Tools need the engine; resources only read from storage.
    registerAllTools(mcpServer, storage, evalEngine);
    registerAllResources(mcpServer, storage);
    return { mcpServer, evalEngine };
}
@@ -0,0 +1,4 @@
1
+ import type { IrisConfig } from '../types/index.js';
2
+ import type { IStorageAdapter } from '../types/query.js';
3
+ export declare function createStorage(config: IrisConfig): IStorageAdapter;
4
+ export { SqliteAdapter } from './sqlite-adapter.js';
@@ -0,0 +1,10 @@
1
+ import { SqliteAdapter } from './sqlite-adapter.js';
2
/**
 * Creates the storage adapter selected by `config.storage.type`.
 *
 * @param config Configuration exposing `storage.type` and `storage.path`.
 * @returns A `SqliteAdapter` opened on `storage.path` when the type is `'sqlite'`.
 * @throws {Error} For any other storage type.
 */
export function createStorage(config) {
    const { type } = config.storage;
    if (type === 'sqlite') {
        return new SqliteAdapter(config.storage.path);
    }
    throw new Error(`Unsupported storage type: ${type}`);
}
10
+ export { SqliteAdapter } from './sqlite-adapter.js';
@@ -0,0 +1,3 @@
1
+ import type Database from 'better-sqlite3';
2
+ export declare const id = "001-initial-schema";
3
+ export declare function up(db: Database.Database): void;
@@ -0,0 +1,57 @@
1
+ export const id = '001-initial-schema';
2
/**
 * Applies the initial schema in a single `db.exec` call.
 *
 * Creates the `traces`, `spans` and `eval_results` tables plus their indexes.
 * Every statement uses `IF NOT EXISTS`.
 *
 * @param db Database handle exposing `exec(sql)`.
 */
export function up(db) {
    const schemaSql = `
CREATE TABLE IF NOT EXISTS traces (
trace_id TEXT PRIMARY KEY,
agent_name TEXT NOT NULL,
framework TEXT,
input TEXT,
output TEXT,
tool_calls TEXT,
latency_ms REAL,
token_usage TEXT,
cost_usd REAL,
metadata TEXT,
timestamp TEXT NOT NULL,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_traces_agent_name ON traces(agent_name);
CREATE INDEX IF NOT EXISTS idx_traces_timestamp ON traces(timestamp);
CREATE INDEX IF NOT EXISTS idx_traces_framework ON traces(framework);

CREATE TABLE IF NOT EXISTS spans (
span_id TEXT PRIMARY KEY,
trace_id TEXT NOT NULL REFERENCES traces(trace_id) ON DELETE CASCADE,
parent_span_id TEXT,
name TEXT NOT NULL,
kind TEXT NOT NULL DEFAULT 'INTERNAL',
status_code TEXT NOT NULL DEFAULT 'UNSET',
status_message TEXT,
start_time TEXT NOT NULL,
end_time TEXT,
attributes TEXT,
events TEXT
);

CREATE INDEX IF NOT EXISTS idx_spans_trace_id ON spans(trace_id);
CREATE INDEX IF NOT EXISTS idx_spans_parent ON spans(parent_span_id);

CREATE TABLE IF NOT EXISTS eval_results (
id TEXT PRIMARY KEY,
trace_id TEXT REFERENCES traces(trace_id) ON DELETE SET NULL,
eval_type TEXT NOT NULL,
output_text TEXT NOT NULL,
expected_text TEXT,
score REAL NOT NULL,
passed INTEGER NOT NULL,
rule_results TEXT,
suggestions TEXT,
created_at TEXT NOT NULL DEFAULT (datetime('now'))
);

CREATE INDEX IF NOT EXISTS idx_eval_results_trace_id ON eval_results(trace_id);
CREATE INDEX IF NOT EXISTS idx_eval_results_eval_type ON eval_results(eval_type);
CREATE INDEX IF NOT EXISTS idx_eval_results_created_at ON eval_results(created_at);
`;
    db.exec(schemaSql);
}
@@ -0,0 +1,2 @@
1
+ import type Database from 'better-sqlite3';
2
+ export declare function runMigrations(db: Database.Database): void;
@@ -0,0 +1,22 @@
1
+ import * as migration001 from './001-initial-schema.js';
2
+ const migrations = [migration001];
3
/**
 * Brings the database schema up to date.
 *
 * Ensures the `_iris_migrations` bookkeeping table exists, reads the ids
 * already recorded there, and applies each migration from the module-level
 * `migrations` list that is not yet recorded. Each migration's `up(db)` and
 * the insert of its id run inside one `db.transaction`.
 *
 * @param db Database handle exposing `exec`, `prepare` and `transaction`.
 */
export function runMigrations(db) {
    db.exec(`
CREATE TABLE IF NOT EXISTS _iris_migrations (
id TEXT PRIMARY KEY,
applied_at TEXT NOT NULL DEFAULT (datetime('now'))
)
`);
    const recordedRows = db.prepare('SELECT id FROM _iris_migrations').all();
    const applied = new Set(recordedRows.map((row) => row.id));
    const pending = migrations.filter((candidate) => !applied.has(candidate.id));
    for (const migration of pending) {
        const applyOne = db.transaction(() => {
            migration.up(db);
            db.prepare('INSERT INTO _iris_migrations (id) VALUES (?)').run(migration.id);
        });
        applyOne();
    }
}
@@ -0,0 +1,33 @@
1
+ import type { IStorageAdapter, DashboardSummary, TraceQueryOptions, TraceQueryResult } from '../types/query.js';
2
+ import type { Trace, Span } from '../types/trace.js';
3
+ import type { EvalResult } from '../types/eval.js';
4
+ export declare class SqliteAdapter implements IStorageAdapter {
5
+ private db;
6
+ constructor(dbPath: string);
7
+ initialize(): Promise<void>;
8
+ close(): Promise<void>;
9
+ insertTrace(trace: Trace): Promise<void>;
10
+ getTrace(traceId: string): Promise<Trace | null>;
11
+ queryTraces(options: TraceQueryOptions): Promise<TraceQueryResult>;
12
+ insertSpan(span: Span): Promise<void>;
13
+ getSpansByTraceId(traceId: string): Promise<Span[]>;
14
+ insertEvalResult(result: EvalResult): Promise<void>;
15
+ getEvalsByTraceId(traceId: string): Promise<EvalResult[]>;
16
+ queryEvalResults(options: {
17
+ eval_type?: string;
18
+ passed?: boolean;
19
+ since?: string;
20
+ until?: string;
21
+ limit?: number;
22
+ offset?: number;
23
+ }): Promise<{
24
+ results: EvalResult[];
25
+ total: number;
26
+ }>;
27
+ getDashboardSummary(sinceHours?: number): Promise<DashboardSummary>;
28
+ deleteTracesOlderThan(days: number): Promise<number>;
29
+ getDistinctValues(column: string): Promise<string[]>;
30
+ private rowToTrace;
31
+ private rowToSpan;
32
+ private rowToEvalResult;
33
+ }
@@ -0,0 +1,232 @@
1
+ import Database from 'better-sqlite3';
2
+ import { runMigrations } from './migrations/index.js';
3
/**
 * Columns of the `traces` table that `queryTraces` accepts as `sort_by`.
 * The column name is interpolated into SQL, so it must come from this list.
 */
const SORTABLE_TRACE_COLUMNS = new Set([
    'trace_id', 'agent_name', 'framework', 'input', 'output', 'tool_calls',
    'latency_ms', 'token_usage', 'cost_usd', 'metadata', 'timestamp', 'created_at',
]);

/** Serialises a truthy value as JSON for a nullable TEXT column; falsy values become NULL. */
function toJsonOrNull(value) {
    return value ? JSON.stringify(value) : null;
}

/**
 * Inserts one span row synchronously.
 * Kept synchronous so it can run inside a `db.transaction` callback, where an
 * error must surface as a thrown exception to trigger the rollback.
 */
function writeSpan(db, span) {
    db.prepare(`
        INSERT INTO spans (span_id, trace_id, parent_span_id, name, kind, status_code, status_message, start_time, end_time, attributes, events)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    `).run(span.span_id, span.trace_id, span.parent_span_id ?? null, span.name, span.kind, span.status_code, span.status_message ?? null, span.start_time, span.end_time ?? null, toJsonOrNull(span.attributes), toJsonOrNull(span.events));
}

/**
 * Storage adapter backed by a SQLite database opened through `better-sqlite3`.
 *
 * All methods are `async` to satisfy the storage interface, but the database
 * calls they make are synchronous.
 *
 * `eval_results.created_at` is filled in by SQLite's `datetime('now')`
 * (`YYYY-MM-DD HH:MM:SS`), while time bounds arrive as ISO-8601 strings with a
 * `T` separator. Comparing the raw strings mis-orders values that fall on the
 * same calendar day, so every `created_at` filter normalises both sides with
 * SQLite's `datetime()`.
 */
export class SqliteAdapter {
    db;
    /**
     * Opens (or creates) the database file.
     * @param dbPath Path handed to the `better-sqlite3` constructor.
     */
    constructor(dbPath) {
        this.db = new Database(dbPath);
    }
    /** Enables WAL journaling and foreign-key enforcement, then runs pending migrations. */
    async initialize() {
        this.db.pragma('journal_mode = WAL');
        this.db.pragma('foreign_keys = ON');
        runMigrations(this.db);
    }
    /** Closes the underlying database handle. */
    async close() {
        this.db.close();
    }
    /**
     * Stores a trace together with any spans attached to it.
     * The trace row and all span rows are written in one transaction, so a
     * failing span leaves no partially stored trace behind.
     * @param trace Trace to store; `trace.spans`, when present, are stored
     *   with their `trace_id` forced to the trace's id.
     */
    async insertTrace(trace) {
        const insertTraceRow = this.db.prepare(`
            INSERT INTO traces (trace_id, agent_name, framework, input, output, tool_calls, latency_ms, token_usage, cost_usd, metadata, timestamp)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        `);
        const persist = this.db.transaction(() => {
            insertTraceRow.run(trace.trace_id, trace.agent_name, trace.framework ?? null, trace.input ?? null, trace.output ?? null, toJsonOrNull(trace.tool_calls), trace.latency_ms ?? null, toJsonOrNull(trace.token_usage), trace.cost_usd ?? null, toJsonOrNull(trace.metadata), trace.timestamp);
            for (const span of trace.spans ?? []) {
                writeSpan(this.db, { ...span, trace_id: trace.trace_id });
            }
        });
        persist();
    }
    /**
     * Looks up one trace by id.
     * @returns The trace, or `null` when no row matches.
     */
    async getTrace(traceId) {
        const row = this.db.prepare('SELECT * FROM traces WHERE trace_id = ?').get(traceId);
        return row ? this.rowToTrace(row) : null;
    }
    /**
     * Lists traces with optional filtering, sorting and paging.
     * @param options `filter` (agent_name, framework, since, until), `sort_by`
     *   (default `timestamp`), `sort_order` (default `desc`), `limit`
     *   (default 50) and `offset` (default 0).
     * @returns The page of traces plus the total number of matching rows.
     * @throws {Error} When `sort_by` is not a `traces` column or `sort_order`
     *   is neither `asc` nor `desc`.
     */
    async queryTraces(options) {
        const conditions = [];
        const params = [];
        const filter = options.filter;
        if (filter?.agent_name) {
            conditions.push('agent_name = ?');
            params.push(filter.agent_name);
        }
        if (filter?.framework) {
            conditions.push('framework = ?');
            params.push(filter.framework);
        }
        if (filter?.since) {
            conditions.push('timestamp >= ?');
            params.push(filter.since);
        }
        if (filter?.until) {
            conditions.push('timestamp <= ?');
            params.push(filter.until);
        }
        const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
        // Identifiers cannot be bound as parameters, so both ORDER BY parts are
        // validated against fixed lists before being interpolated.
        const sortBy = options.sort_by ?? 'timestamp';
        if (!SORTABLE_TRACE_COLUMNS.has(sortBy)) {
            throw new Error(`Column '${sortBy}' is not sortable`);
        }
        const sortOrder = String(options.sort_order ?? 'desc').toLowerCase();
        if (sortOrder !== 'asc' && sortOrder !== 'desc') {
            throw new Error(`Sort order '${options.sort_order}' is not supported`);
        }
        const limit = options.limit ?? 50;
        const offset = options.offset ?? 0;
        const countRow = this.db
            .prepare(`SELECT COUNT(*) as count FROM traces ${whereClause}`)
            .get(...params);
        const rows = this.db
            .prepare(`SELECT * FROM traces ${whereClause} ORDER BY ${sortBy} ${sortOrder} LIMIT ? OFFSET ?`)
            .all(...params, limit, offset);
        return {
            traces: rows.map((row) => this.rowToTrace(row)),
            total: countRow.count,
            limit,
            offset,
        };
    }
    /** Stores a single span row. */
    async insertSpan(span) {
        writeSpan(this.db, span);
    }
    /** Returns the spans of a trace ordered by `start_time`. */
    async getSpansByTraceId(traceId) {
        const rows = this.db
            .prepare('SELECT * FROM spans WHERE trace_id = ? ORDER BY start_time')
            .all(traceId);
        return rows.map((row) => this.rowToSpan(row));
    }
    /**
     * Stores one evaluation result.
     * Missing `rule_results` / `suggestions` are stored as empty JSON arrays,
     * because `JSON.stringify(undefined)` would yield `undefined` rather than
     * a string to bind.
     */
    async insertEvalResult(result) {
        this.db.prepare(`
            INSERT INTO eval_results (id, trace_id, eval_type, output_text, expected_text, score, passed, rule_results, suggestions)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        `).run(result.id, result.trace_id ?? null, result.eval_type, result.output_text, result.expected_text ?? null, result.score, result.passed ? 1 : 0, JSON.stringify(result.rule_results ?? []), JSON.stringify(result.suggestions ?? []));
    }
    /** Returns the evaluation results linked to a trace, newest first. */
    async getEvalsByTraceId(traceId) {
        const rows = this.db
            .prepare('SELECT * FROM eval_results WHERE trace_id = ? ORDER BY created_at DESC')
            .all(traceId);
        return rows.map((row) => this.rowToEvalResult(row));
    }
    /**
     * Lists evaluation results, newest first, with optional filtering and paging.
     * @param options `eval_type`, `passed`, `since`, `until`, `limit`
     *   (default 50) and `offset` (default 0).
     * @returns The page of results plus the total number of matching rows.
     */
    async queryEvalResults(options) {
        const conditions = [];
        const params = [];
        if (options.eval_type) {
            conditions.push('eval_type = ?');
            params.push(options.eval_type);
        }
        if (options.passed !== undefined) {
            conditions.push('passed = ?');
            params.push(options.passed ? 1 : 0);
        }
        // datetime() puts the stored value and the bound on the same format
        // before comparing (see the class comment).
        if (options.since) {
            conditions.push('datetime(created_at) >= datetime(?)');
            params.push(options.since);
        }
        if (options.until) {
            conditions.push('datetime(created_at) <= datetime(?)');
            params.push(options.until);
        }
        const whereClause = conditions.length > 0 ? `WHERE ${conditions.join(' AND ')}` : '';
        const limit = options.limit ?? 50;
        const offset = options.offset ?? 0;
        const countRow = this.db
            .prepare(`SELECT COUNT(*) as count FROM eval_results ${whereClause}`)
            .get(...params);
        const rows = this.db
            .prepare(`SELECT * FROM eval_results ${whereClause} ORDER BY created_at DESC LIMIT ? OFFSET ?`)
            .all(...params, limit, offset);
        return {
            results: rows.map((row) => this.rowToEvalResult(row)),
            total: countRow.count,
        };
    }
    /**
     * Aggregates dashboard metrics over the trailing window.
     * @param sinceHours Window length in hours (default 24).
     * @returns Trace count, average latency (2 decimals), total cost
     *   (4 decimals), share of traces with at least one `ERROR` span, share of
     *   passed evaluations, per-hour trace counts and the ten busiest agents.
     */
    async getDashboardSummary(sinceHours = 24) {
        const since = new Date(Date.now() - sinceHours * 60 * 60 * 1000).toISOString();
        const stats = this.db.prepare(`
            SELECT
                COUNT(*) as total_traces,
                COALESCE(AVG(latency_ms), 0) as avg_latency_ms,
                COALESCE(SUM(cost_usd), 0) as total_cost_usd
            FROM traces WHERE timestamp >= ?
        `).get(since);
        const errorCount = this.db.prepare(`
            SELECT COUNT(DISTINCT t.trace_id) as count
            FROM traces t
            JOIN spans s ON s.trace_id = t.trace_id
            WHERE t.timestamp >= ? AND s.status_code = 'ERROR'
        `).get(since);
        // `since` is an ISO string; normalise both sides so same-day rows are
        // not dropped by a raw string comparison.
        const evalStats = this.db.prepare(`
            SELECT
                COUNT(*) as total,
                SUM(CASE WHEN passed = 1 THEN 1 ELSE 0 END) as passed_count
            FROM eval_results WHERE datetime(created_at) >= datetime(?)
        `).get(since);
        const tracesPerHour = this.db.prepare(`
            SELECT strftime('%Y-%m-%dT%H:00:00', timestamp) as hour, COUNT(*) as count
            FROM traces WHERE timestamp >= ?
            GROUP BY hour ORDER BY hour
        `).all(since);
        const topAgents = this.db.prepare(`
            SELECT agent_name, COUNT(*) as count
            FROM traces WHERE timestamp >= ?
            GROUP BY agent_name ORDER BY count DESC LIMIT 10
        `).all(since);
        return {
            total_traces: stats.total_traces,
            avg_latency_ms: Math.round(stats.avg_latency_ms * 100) / 100,
            total_cost_usd: Math.round(stats.total_cost_usd * 10000) / 10000,
            error_rate: stats.total_traces > 0 ? errorCount.count / stats.total_traces : 0,
            eval_pass_rate: evalStats.total > 0 ? evalStats.passed_count / evalStats.total : 0,
            traces_per_hour: tracesPerHour,
            top_agents: topAgents,
        };
    }
    /**
     * Deletes traces whose `timestamp` is older than the cutoff.
     * @param days Age threshold in days.
     * @returns Number of deleted trace rows.
     */
    async deleteTracesOlderThan(days) {
        const cutoff = new Date(Date.now() - days * 24 * 60 * 60 * 1000).toISOString();
        const result = this.db.prepare('DELETE FROM traces WHERE timestamp < ?').run(cutoff);
        return result.changes;
    }
    /**
     * Lists the distinct non-null values of a `traces` column.
     * @param column Either `agent_name` or `framework`.
     * @throws {Error} For any other column name.
     */
    async getDistinctValues(column) {
        // Fixed statements per column keep the column name out of dynamic SQL.
        const queries = {
            agent_name: 'SELECT DISTINCT agent_name FROM traces WHERE agent_name IS NOT NULL ORDER BY agent_name',
            framework: 'SELECT DISTINCT framework FROM traces WHERE framework IS NOT NULL ORDER BY framework',
        };
        const query = queries[column];
        if (!query) {
            throw new Error(`Column '${column}' is not queryable`);
        }
        const rows = this.db.prepare(query).all();
        return rows.map((row) => row[column]);
    }
    /** Maps a `traces` row to a trace object, parsing the JSON text columns. */
    rowToTrace(row) {
        return {
            trace_id: row.trace_id,
            agent_name: row.agent_name,
            framework: row.framework,
            input: row.input,
            output: row.output,
            tool_calls: row.tool_calls ? JSON.parse(row.tool_calls) : undefined,
            latency_ms: row.latency_ms,
            token_usage: row.token_usage ? JSON.parse(row.token_usage) : undefined,
            cost_usd: row.cost_usd,
            metadata: row.metadata ? JSON.parse(row.metadata) : undefined,
            timestamp: row.timestamp,
            created_at: row.created_at,
        };
    }
    /** Maps a `spans` row to a span object, parsing the JSON text columns. */
    rowToSpan(row) {
        return {
            span_id: row.span_id,
            trace_id: row.trace_id,
            parent_span_id: row.parent_span_id,
            name: row.name,
            kind: row.kind,
            status_code: row.status_code,
            status_message: row.status_message,
            start_time: row.start_time,
            end_time: row.end_time,
            attributes: row.attributes ? JSON.parse(row.attributes) : undefined,
            events: row.events ? JSON.parse(row.events) : undefined,
        };
    }
    /**
     * Maps an `eval_results` row to an evaluation result.
     * `passed` becomes a boolean; NULL JSON columns become empty arrays.
     */
    rowToEvalResult(row) {
        return {
            id: row.id,
            trace_id: row.trace_id,
            eval_type: row.eval_type,
            output_text: row.output_text,
            expected_text: row.expected_text,
            score: row.score,
            passed: row.passed === 1,
            rule_results: row.rule_results ? JSON.parse(row.rule_results) : [],
            suggestions: row.suggestions ? JSON.parse(row.suggestions) : [],
            created_at: row.created_at,
        };
    }
}
@@ -0,0 +1,4 @@
1
+ import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import type { IStorageAdapter } from '../types/query.js';
3
+ import type { EvalEngine } from '../eval/engine.js';
4
+ export declare function registerEvaluateOutputTool(server: McpServer, storage: IStorageAdapter, evalEngine: EvalEngine): void;
@@ -0,0 +1,58 @@
1
+ import { z } from 'zod';
2
/** Rule kinds a caller may request in `custom_rules`. */
const CUSTOM_RULE_TYPES = [
    'regex_match',
    'regex_no_match',
    'min_length',
    'max_length',
    'contains_keywords',
    'excludes_keywords',
    'json_schema',
    'cost_threshold',
];
/** Evaluation kinds accepted by the tool's `eval_type` argument. */
const EVAL_TYPES = ['completeness', 'relevance', 'safety', 'cost', 'custom'];
/** One caller-supplied rule: a name, a rule kind, free-form config and an optional weight. */
const CustomRuleSchema = z.object({
    name: z.string(),
    type: z.enum(CUSTOM_RULE_TYPES),
    config: z.record(z.unknown()),
    weight: z.number().optional(),
});
/** Optional token counters; every field may be omitted. */
const TokenUsageSchema = z.object({
    prompt_tokens: z.number().optional(),
    completion_tokens: z.number().optional(),
    total_tokens: z.number().optional(),
});
/** Argument shape of the `evaluate_output` tool; only `output` is required. */
const inputSchema = {
    output: z.string().describe('The output text to evaluate'),
    eval_type: z.enum(EVAL_TYPES).default('completeness').describe('Type of evaluation'),
    expected: z.string().optional().describe('Expected output for comparison'),
    input: z.string().optional().describe('Original input for context'),
    trace_id: z.string().optional().describe('Link evaluation to a trace'),
    custom_rules: z.array(CustomRuleSchema).optional().describe('Custom evaluation rules'),
    cost_usd: z.number().optional().describe('Cost for cost evaluation'),
    token_usage: TokenUsageSchema.optional().describe('Token usage for cost evaluation'),
};
25
/**
 * Registers the `evaluate_output` tool on the MCP server.
 *
 * The handler runs `evalEngine.evaluate` on the supplied output, links the
 * result to `trace_id` when one is given, stores it through
 * `storage.insertEvalResult`, and replies with a JSON text item holding the
 * result's id, score, pass flag, rule results and suggestions.
 *
 * @param server MCP server exposing `registerTool(name, config, handler)`.
 * @param storage Storage adapter exposing `insertEvalResult`.
 * @param evalEngine Engine exposing `evaluate(type, context, customRules)`.
 */
export function registerEvaluateOutputTool(server, storage, evalEngine) {
    const toolConfig = {
        title: 'Evaluate Output',
        description: 'Evaluate agent output quality using configurable rules',
        inputSchema,
    };
    const handleEvaluate = async (args) => {
        const { output, expected, input, cost_usd: costUsd, token_usage: tokenUsage } = args;
        const context = { output, expected, input, costUsd, tokenUsage };
        const result = evalEngine.evaluate(args.eval_type, context, args.custom_rules);
        if (args.trace_id) {
            result.trace_id = args.trace_id;
        }
        await storage.insertEvalResult(result);
        // Only the summary fields go back to the caller.
        const { id, score, passed, rule_results, suggestions } = result;
        const text = JSON.stringify({ id, score, passed, rule_results, suggestions });
        return { content: [{ type: 'text', text }] };
    };
    server.registerTool('evaluate_output', toolConfig, handleEvaluate);
}
@@ -0,0 +1,3 @@
1
+ import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
+ import type { IStorageAdapter } from '../types/query.js';
3
+ export declare function registerGetTracesTool(server: McpServer, storage: IStorageAdapter): void;