@evalgate/sdk 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. package/CHANGELOG.md +638 -0
  2. package/README.md +398 -0
  3. package/dist/assertions.d.ts +189 -0
  4. package/dist/assertions.js +662 -0
  5. package/dist/batch.d.ts +68 -0
  6. package/dist/batch.js +179 -0
  7. package/dist/cache.d.ts +65 -0
  8. package/dist/cache.js +131 -0
  9. package/dist/cli/api.d.ts +108 -0
  10. package/dist/cli/api.js +132 -0
  11. package/dist/cli/baseline.d.ts +10 -0
  12. package/dist/cli/baseline.js +172 -0
  13. package/dist/cli/check.d.ts +73 -0
  14. package/dist/cli/check.js +355 -0
  15. package/dist/cli/ci-context.d.ts +6 -0
  16. package/dist/cli/ci-context.js +112 -0
  17. package/dist/cli/ci.d.ts +45 -0
  18. package/dist/cli/ci.js +192 -0
  19. package/dist/cli/config.d.ts +30 -0
  20. package/dist/cli/config.js +230 -0
  21. package/dist/cli/constants.d.ts +15 -0
  22. package/dist/cli/constants.js +18 -0
  23. package/dist/cli/diff.d.ts +173 -0
  24. package/dist/cli/diff.js +685 -0
  25. package/dist/cli/discover.d.ts +84 -0
  26. package/dist/cli/discover.js +419 -0
  27. package/dist/cli/doctor.d.ts +88 -0
  28. package/dist/cli/doctor.js +675 -0
  29. package/dist/cli/env.d.ts +21 -0
  30. package/dist/cli/env.js +42 -0
  31. package/dist/cli/explain.d.ts +58 -0
  32. package/dist/cli/explain.js +561 -0
  33. package/dist/cli/formatters/github.d.ts +8 -0
  34. package/dist/cli/formatters/github.js +135 -0
  35. package/dist/cli/formatters/human.d.ts +6 -0
  36. package/dist/cli/formatters/human.js +110 -0
  37. package/dist/cli/formatters/json.d.ts +6 -0
  38. package/dist/cli/formatters/json.js +10 -0
  39. package/dist/cli/formatters/pr-comment.d.ts +12 -0
  40. package/dist/cli/formatters/pr-comment.js +103 -0
  41. package/dist/cli/formatters/types.d.ts +103 -0
  42. package/dist/cli/formatters/types.js +8 -0
  43. package/dist/cli/gate.d.ts +21 -0
  44. package/dist/cli/gate.js +179 -0
  45. package/dist/cli/impact-analysis.d.ts +63 -0
  46. package/dist/cli/impact-analysis.js +252 -0
  47. package/dist/cli/index.d.ts +9 -0
  48. package/dist/cli/index.js +332 -0
  49. package/dist/cli/init.d.ts +16 -0
  50. package/dist/cli/init.js +292 -0
  51. package/dist/cli/manifest.d.ts +103 -0
  52. package/dist/cli/manifest.js +282 -0
  53. package/dist/cli/migrate.d.ts +41 -0
  54. package/dist/cli/migrate.js +349 -0
  55. package/dist/cli/policy-packs.d.ts +23 -0
  56. package/dist/cli/policy-packs.js +89 -0
  57. package/dist/cli/print-config.d.ts +29 -0
  58. package/dist/cli/print-config.js +270 -0
  59. package/dist/cli/profiles.d.ts +28 -0
  60. package/dist/cli/profiles.js +30 -0
  61. package/dist/cli/reason-codes.d.ts +17 -0
  62. package/dist/cli/reason-codes.js +19 -0
  63. package/dist/cli/regression-gate.d.ts +15 -0
  64. package/dist/cli/regression-gate.js +341 -0
  65. package/dist/cli/render/snippet.d.ts +5 -0
  66. package/dist/cli/render/snippet.js +15 -0
  67. package/dist/cli/render/sort.d.ts +10 -0
  68. package/dist/cli/render/sort.js +24 -0
  69. package/dist/cli/report/build-check-report.d.ts +19 -0
  70. package/dist/cli/report/build-check-report.js +132 -0
  71. package/dist/cli/run.d.ts +101 -0
  72. package/dist/cli/run.js +395 -0
  73. package/dist/cli/share.d.ts +17 -0
  74. package/dist/cli/share.js +91 -0
  75. package/dist/cli/upgrade.d.ts +15 -0
  76. package/dist/cli/upgrade.js +492 -0
  77. package/dist/cli/workspace.d.ts +31 -0
  78. package/dist/cli/workspace.js +68 -0
  79. package/dist/client.d.ts +368 -0
  80. package/dist/client.js +893 -0
  81. package/dist/client.request.test.d.ts +1 -0
  82. package/dist/client.request.test.js +232 -0
  83. package/dist/context.d.ts +134 -0
  84. package/dist/context.js +215 -0
  85. package/dist/errors.d.ts +82 -0
  86. package/dist/errors.js +298 -0
  87. package/dist/export.d.ts +195 -0
  88. package/dist/export.js +344 -0
  89. package/dist/index.d.ts +44 -0
  90. package/dist/index.js +153 -0
  91. package/dist/integrations/anthropic.d.ts +91 -0
  92. package/dist/integrations/anthropic.js +163 -0
  93. package/dist/integrations/openai-eval.d.ts +57 -0
  94. package/dist/integrations/openai-eval.js +232 -0
  95. package/dist/integrations/openai.d.ts +92 -0
  96. package/dist/integrations/openai.js +160 -0
  97. package/dist/local.d.ts +39 -0
  98. package/dist/local.js +148 -0
  99. package/dist/logger.d.ts +128 -0
  100. package/dist/logger.js +227 -0
  101. package/dist/matchers/index.d.ts +1 -0
  102. package/dist/matchers/index.js +6 -0
  103. package/dist/matchers/to-pass-gate.d.ts +29 -0
  104. package/dist/matchers/to-pass-gate.js +35 -0
  105. package/dist/pagination.d.ts +74 -0
  106. package/dist/pagination.js +139 -0
  107. package/dist/regression.d.ts +100 -0
  108. package/dist/regression.js +44 -0
  109. package/dist/runtime/adapters/config-to-dsl.d.ts +33 -0
  110. package/dist/runtime/adapters/config-to-dsl.js +400 -0
  111. package/dist/runtime/adapters/testsuite-to-dsl.d.ts +63 -0
  112. package/dist/runtime/adapters/testsuite-to-dsl.js +276 -0
  113. package/dist/runtime/context.d.ts +26 -0
  114. package/dist/runtime/context.js +74 -0
  115. package/dist/runtime/eval.d.ts +46 -0
  116. package/dist/runtime/eval.js +244 -0
  117. package/dist/runtime/execution-mode.d.ts +80 -0
  118. package/dist/runtime/execution-mode.js +357 -0
  119. package/dist/runtime/executor.d.ts +16 -0
  120. package/dist/runtime/executor.js +152 -0
  121. package/dist/runtime/registry.d.ts +78 -0
  122. package/dist/runtime/registry.js +403 -0
  123. package/dist/runtime/run-report.d.ts +200 -0
  124. package/dist/runtime/run-report.js +222 -0
  125. package/dist/runtime/types.d.ts +356 -0
  126. package/dist/runtime/types.js +76 -0
  127. package/dist/snapshot.d.ts +176 -0
  128. package/dist/snapshot.js +322 -0
  129. package/dist/streaming.d.ts +173 -0
  130. package/dist/streaming.js +268 -0
  131. package/dist/testing.d.ts +273 -0
  132. package/dist/testing.js +317 -0
  133. package/dist/types.d.ts +754 -0
  134. package/dist/types.js +54 -0
  135. package/dist/utils/input-hash.d.ts +8 -0
  136. package/dist/utils/input-hash.js +41 -0
  137. package/dist/version.d.ts +7 -0
  138. package/dist/version.js +10 -0
  139. package/dist/workflows.d.ts +389 -0
  140. package/dist/workflows.js +671 -0
  141. package/package.json +117 -0
@@ -0,0 +1,163 @@
1
+ "use strict";
2
+ /**
3
+ * Anthropic Integration
4
+ * Tier 1.2: Framework Auto-Instrumentation - Anthropic wrapper
5
+ *
6
+ * @example
7
+ * ```typescript
8
+ * import { traceAnthropic } from '@ai-eval-platform/sdk/integrations/anthropic';
9
+ * import Anthropic from '@anthropic-ai/sdk';
10
+ *
11
+ * const anthropic = new Anthropic({ apiKey: '...' });
12
+ * const tracedAnthropic = traceAnthropic(anthropic, client);
13
+ *
14
+ * // All calls are automatically traced
15
+ * const message = await tracedAnthropic.messages.create({
16
+ * model: 'claude-3-5-sonnet-20241022',
17
+ * max_tokens: 1024,
18
+ * messages: [{ role: 'user', content: 'Hello!' }]
19
+ * });
20
+ * ```
21
+ */
22
+ Object.defineProperty(exports, "__esModule", { value: true });
23
+ exports.traceAnthropic = traceAnthropic;
24
+ exports.traceAnthropicCall = traceAnthropicCall;
25
+ const context_1 = require("../context");
26
+ /**
27
+ * Wrap Anthropic client with automatic tracing
28
+ *
29
+ * @example
30
+ * ```typescript
31
+ * import Anthropic from '@anthropic-ai/sdk';
32
+ * import { traceAnthropic } from '@ai-eval-platform/sdk/integrations/anthropic';
33
+ *
34
+ * const anthropic = new Anthropic({ apiKey: process.env.ANTHROPIC_API_KEY });
35
+ * const tracedAnthropic = traceAnthropic(anthropic, evalClient);
36
+ *
37
+ * // Automatically traced
38
+ * const message = await tracedAnthropic.messages.create({
39
+ * model: 'claude-3-5-sonnet-20241022',
40
+ * max_tokens: 1024,
41
+ * messages: [{ role: 'user', content: 'Hello, Claude!' }]
42
+ * });
43
+ * ```
44
+ */
45
+ function traceAnthropic(anthropic, evalClient, options = {}) {
46
+ const { captureInput = true, captureOutput = true, captureMetadata = true, organizationId, tracePrefix = "anthropic", } = options;
47
+ // Create proxy for messages.create
48
+ const originalCreate = anthropic.messages.create.bind(anthropic.messages);
49
+ anthropic.messages.create = async (params, requestOptions) => {
50
+ const startTime = Date.now();
51
+ const traceId = `${tracePrefix}-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
52
+ try {
53
+ // Call original method
54
+ const message = await originalCreate(params, requestOptions);
55
+ const durationMs = Date.now() - startTime;
56
+ // Create trace with success status and complete metadata
57
+ const traceMetadata = (0, context_1.mergeWithContext)({
58
+ model: params.model,
59
+ temperature: params.temperature,
60
+ max_tokens: params.max_tokens,
61
+ ...(captureInput ? { input: params.messages } : {}),
62
+ ...(captureOutput ? { output: message.content } : {}),
63
+ ...(captureMetadata
64
+ ? {
65
+ usage: message.usage,
66
+ stop_reason: message.stop_reason,
67
+ }
68
+ : {}),
69
+ });
70
+ await evalClient.traces.create({
71
+ name: `Anthropic: ${params.model}`,
72
+ traceId,
73
+ organizationId: organizationId || evalClient.getOrganizationId(),
74
+ status: "success",
75
+ durationMs,
76
+ metadata: traceMetadata,
77
+ });
78
+ return message;
79
+ }
80
+ catch (error) {
81
+ const durationMs = Date.now() - startTime;
82
+ // Create trace with error status
83
+ const errorMetadata = (0, context_1.mergeWithContext)({
84
+ model: params.model,
85
+ temperature: params.temperature,
86
+ max_tokens: params.max_tokens,
87
+ ...(captureInput ? { input: params.messages } : {}),
88
+ ...(captureMetadata ? { params } : {}),
89
+ error: error instanceof Error ? error.message : String(error),
90
+ });
91
+ await evalClient.traces
92
+ .create({
93
+ name: `Anthropic: ${params.model}`,
94
+ traceId,
95
+ organizationId: organizationId || evalClient.getOrganizationId(),
96
+ status: "error",
97
+ durationMs,
98
+ metadata: errorMetadata,
99
+ })
100
+ .catch(() => {
101
+ // Ignore errors in trace creation to avoid masking the original error
102
+ });
103
+ throw error;
104
+ }
105
+ };
106
+ return anthropic;
107
+ }
108
+ /**
109
+ * Manual trace wrapper for Anthropic calls
110
+ *
111
+ * @example
112
+ * ```typescript
113
+ * const message = await traceAnthropicCall(
114
+ * evalClient,
115
+ * 'claude-completion',
116
+ * async () => {
117
+ * return await anthropic.messages.create({
118
+ * model: 'claude-3-5-sonnet-20241022',
119
+ * max_tokens: 1024,
120
+ * messages: [{ role: 'user', content: 'Hello!' }]
121
+ * });
122
+ * }
123
+ * );
124
+ * ```
125
+ */
126
+ async function traceAnthropicCall(evalClient, name, fn, options = {}) {
127
+ const startTime = Date.now();
128
+ const traceId = `anthropic-${Date.now()}-${Math.random().toString(36).substr(2, 9)}`;
129
+ try {
130
+ await evalClient.traces.create({
131
+ name,
132
+ traceId,
133
+ organizationId: options.organizationId || evalClient.getOrganizationId(),
134
+ status: "pending",
135
+ metadata: (0, context_1.mergeWithContext)({}),
136
+ });
137
+ const result = await fn();
138
+ const durationMs = Date.now() - startTime;
139
+ await evalClient.traces.create({
140
+ name,
141
+ traceId,
142
+ organizationId: options.organizationId || evalClient.getOrganizationId(),
143
+ status: "success",
144
+ durationMs,
145
+ metadata: (0, context_1.mergeWithContext)({}),
146
+ });
147
+ return result;
148
+ }
149
+ catch (error) {
150
+ const durationMs = Date.now() - startTime;
151
+ await evalClient.traces.create({
152
+ name,
153
+ traceId,
154
+ organizationId: options.organizationId || evalClient.getOrganizationId(),
155
+ status: "error",
156
+ durationMs,
157
+ metadata: (0, context_1.mergeWithContext)({
158
+ error: error instanceof Error ? error.message : String(error),
159
+ }),
160
+ });
161
+ throw error;
162
+ }
163
+ }
@@ -0,0 +1,57 @@
1
+ /**
2
+ * openAIChatEval — One-function OpenAI chat regression testing
3
+ *
4
+ * Run local regression tests with OpenAI. No EvalGate account required.
5
+ * CI-friendly output. Optional reportToEvalGate in v1.5.
6
+ *
7
+ * @example
8
+ * ```typescript
9
+ * import { openAIChatEval } from '@evalgate/sdk';
10
+ *
11
+ * await openAIChatEval({
12
+ * name: 'chat-regression',
13
+ * cases: [
14
+ * { input: 'Hello', expectedOutput: 'greeting' },
15
+ * { input: '2 + 2 = ?', expectedOutput: '4' }
16
+ * ]
17
+ * });
18
+ * ```
19
+ */
20
+ import type { TestSuiteCaseResult } from "../testing";
21
/** One regression case evaluated by `openAIChatEval`. */
export interface OpenAIChatEvalCase {
    /** Text sent to the model as the single user message. */
    input: string;
    /**
     * Expected text for the case. When `assertions` is not supplied, it is split
     * on whitespace and the resulting words are passed to `toContainKeywords`.
     */
    expectedOutput?: string;
    /** Platform test case ID. When provided, used directly for reportToEvalGate (no input matching). */
    testCaseId?: number;
    /** Custom checks run against the model output; when present they replace the `expectedOutput` keyword check. */
    assertions?: Array<(output: string) => import("../assertions").AssertionResult>;
}
28
/** Options accepted by `openAIChatEval`. */
export interface OpenAIChatEvalOptions {
    /** Name given to the test suite. */
    name: string;
    /** OpenAI model to call (default: "gpt-4o-mini"). */
    model?: string;
    /** OpenAI API key. Falls back to the OPENAI_API_KEY environment variable. */
    apiKey?: string;
    /** Cases to evaluate. */
    cases: Array<OpenAIChatEvalCase>;
    /** Retry failing cases N times (default: 0). Only failing cases are retried. */
    retries?: number;
    /** v1.5: Upload results to EvalGate platform for an existing evaluation. Requires evaluationId and EVALGATE_API_KEY. */
    reportToEvalGate?: boolean;
    /** Evaluation ID (from config or arg). Required when reportToEvalGate is true. */
    evaluationId?: string;
    /**
     * EvalGate API base URL. Resolution order: this option, `baseUrl` from the
     * loaded config, EVALGATE_BASE_URL, then http://localhost:3000.
     */
    baseUrl?: string;
    /** Idempotency key for import (e.g. CI run ID), sent as the `Idempotency-Key` header. Prevents duplicate runs on retry. */
    idempotencyKey?: string;
}
44
/** Summary returned by `openAIChatEval`. */
export interface OpenAIChatEvalResult {
    /** Number of passing cases. */
    passed: number;
    /** Number of cases run. */
    total: number;
    /** Pass rate as a rounded percentage (0 when `total` is 0). */
    score: number;
    /** Per-case results from the test suite. */
    results: Array<TestSuiteCaseResult>;
    /** Duration of the suite run in milliseconds, as reported by the test suite. */
    durationMs: number;
    /** Case IDs that were retried (flaky recovery). Present only when at least one case was retried. */
    retriedCases?: Array<string>;
}
53
/**
 * Evaluate a set of chat regression cases against OpenAI on the local machine.
 *
 * An EvalGate account is not needed. The summary is printed in a CI-friendly
 * form and the aggregated score is returned.
 *
 * @param options - Suite name, cases, and optional model / upload settings.
 * @returns Pass counts, score, per-case results and duration.
 * @throws Error when no OpenAI API key is available (neither `apiKey` nor OPENAI_API_KEY).
 */
export declare function openAIChatEval(
    options: OpenAIChatEvalOptions,
): Promise<OpenAIChatEvalResult>;
@@ -0,0 +1,232 @@
1
+ "use strict";
2
+ /**
3
+ * openAIChatEval — One-function OpenAI chat regression testing
4
+ *
5
+ * Run local regression tests with OpenAI. No EvalGate account required.
6
+ * CI-friendly output. Optional reportToEvalGate in v1.5.
7
+ *
8
+ * @example
9
+ * ```typescript
10
+ * import { openAIChatEval } from '@evalgate/sdk';
11
+ *
12
+ * await openAIChatEval({
13
+ * name: 'chat-regression',
14
+ * cases: [
15
+ * { input: 'Hello', expectedOutput: 'greeting' },
16
+ * { input: '2 + 2 = ?', expectedOutput: '4' }
17
+ * ]
18
+ * });
19
+ * ```
20
+ */
21
+ Object.defineProperty(exports, "__esModule", { value: true });
22
+ exports.openAIChatEval = openAIChatEval;
23
+ const assertions_1 = require("../assertions");
24
+ const config_1 = require("../cli/config");
25
+ const testing_1 = require("../testing");
26
+ const input_hash_1 = require("../utils/input-hash");
27
+ const MAX_FAILED_CASES_TO_SHOW = 5;
28
+ function getOpenAI() {
29
+ try {
30
+ const OpenAI = require("openai");
31
+ return OpenAI;
32
+ }
33
+ catch {
34
+ throw new Error("openai package is required for openAIChatEval. Install with: npm install openai");
35
+ }
36
+ }
37
/**
 * Build the executor used by the test suite: an async function that sends one
 * input as a single user message and resolves to the reply text.
 *
 * @param model - OpenAI model name used for every request.
 * @param apiKey - API key handed to the OpenAI client constructor.
 * @returns `(input) => Promise<string>`; resolves to "" when the first choice
 *   has no message content.
 */
function createExecutor(model, apiKey) {
    const OpenAICtor = getOpenAI();
    const client = new OpenAICtor({ apiKey });
    return async (input) => {
        const request = {
            model,
            messages: [{ role: "user", content: input }],
            // Low temperature, fixed for every case.
            temperature: 0.1,
        };
        const completion = await client.chat.completions.create(request);
        const firstChoice = completion.choices[0];
        return firstChoice?.message?.content ?? "";
    };
}
49
+ function printSummary(result) {
50
+ const { passed, total, results } = result;
51
+ const score = total > 0 ? Math.round((passed / total) * 100) : 0;
52
+ const failed = results.filter((r) => !r.passed);
53
+ const status = failed.length === 0 ? "PASS" : "FAIL";
54
+ console.log(`\n${status} ${passed}/${total} (score: ${score})\n`);
55
+ if (failed.length > 0) {
56
+ const toShow = failed.slice(0, MAX_FAILED_CASES_TO_SHOW);
57
+ const more = failed.length - toShow.length;
58
+ console.log(`${failed.length} failing case${failed.length === 1 ? "" : "s"}:`);
59
+ for (const r of toShow) {
60
+ const expected = r.expected ?? "(no expected)";
61
+ console.log(`- "${r.input}" → expected: ${expected}`);
62
+ }
63
+ if (more > 0) {
64
+ console.log(`+ ${more} more`);
65
+ }
66
+ console.log("\nGate this in CI:");
67
+ console.log(" npx -y @evalgate/sdk@^2 init");
68
+ }
69
+ else {
70
+ console.log("Tip: Want dashboards and history?");
71
+ console.log("Set EVALGATE_API_KEY and connect this to the platform.");
72
+ }
73
+ }
74
+ /**
75
+ * Run OpenAI chat regression tests locally.
76
+ * No EvalGate account required. Returns score and prints CI-friendly summary.
77
+ */
78
+ async function openAIChatEval(options) {
79
+ const { name, model = "gpt-4o-mini", apiKey, cases, retries = 0 } = options;
80
+ const resolvedApiKey = apiKey ?? (typeof process !== "undefined" && process.env?.OPENAI_API_KEY);
81
+ if (!resolvedApiKey) {
82
+ throw new Error("OPENAI_API_KEY is required. Set it in the environment or pass apiKey to openAIChatEval.");
83
+ }
84
+ const executor = createExecutor(model, resolvedApiKey);
85
+ const suiteCases = cases.map((c) => {
86
+ const assertions = c.assertions
87
+ ? [...c.assertions]
88
+ : c.expectedOutput
89
+ ? [
90
+ (output) => (0, assertions_1.expect)(output).toContainKeywords(c.expectedOutput?.split(/\s+/).filter(Boolean) || []),
91
+ ]
92
+ : undefined;
93
+ return {
94
+ input: c.input,
95
+ expected: c.expectedOutput,
96
+ assertions,
97
+ };
98
+ });
99
+ const suite = (0, testing_1.createTestSuite)(name, {
100
+ cases: suiteCases,
101
+ executor,
102
+ parallel: true,
103
+ retries,
104
+ });
105
+ const result = await suite.run();
106
+ const score = result.total > 0 ? Math.round((result.passed / result.total) * 100) : 0;
107
+ const evalResult = {
108
+ passed: result.passed,
109
+ total: result.total,
110
+ score,
111
+ results: result.results,
112
+ durationMs: result.durationMs,
113
+ ...(result.retriedCases &&
114
+ result.retriedCases.length > 0 && { retriedCases: result.retriedCases }),
115
+ };
116
+ printSummary(evalResult);
117
+ // v1.5: Optional report to EvalGate platform
118
+ if (options.reportToEvalGate) {
119
+ const config = typeof process !== "undefined" && process.cwd
120
+ ? (0, config_1.loadConfig)(process.cwd())
121
+ : null;
122
+ const evalId = options.evaluationId || config?.evaluationId;
123
+ if (!evalId || String(evalId).trim() === "") {
124
+ console.log("Run evalgate init and set evaluationId to upload results.");
125
+ return evalResult;
126
+ }
127
+ const evalgateKey = (typeof process !== "undefined" && process.env?.EVALGATE_API_KEY) || "";
128
+ if (!evalgateKey) {
129
+ console.log("Set EVALGATE_API_KEY to upload results.");
130
+ return evalResult;
131
+ }
132
+ const baseUrl = options.baseUrl ||
133
+ config?.baseUrl ||
134
+ (typeof process !== "undefined" && process.env?.EVALGATE_BASE_URL) ||
135
+ "http://localhost:3000";
136
+ const url = String(baseUrl).replace(/\/$/, "");
137
+ try {
138
+ // Resolve testCaseId for each result: explicit testCaseId in cases, or match by inputHash
139
+ const importResults = [];
140
+ const hasExplicitIds = cases.some((c) => c.testCaseId != null);
141
+ if (hasExplicitIds) {
142
+ // Use testCaseId from cases (same order as results)
143
+ for (let i = 0; i < result.results.length; i++) {
144
+ const tcId = cases[i]?.testCaseId;
145
+ if (tcId == null) {
146
+ console.log("reportToEvalGate: All cases must have testCaseId when unknown has it.");
147
+ return evalResult;
148
+ }
149
+ importResults.push({
150
+ testCaseId: tcId,
151
+ status: result.results[i].passed ? "passed" : "failed",
152
+ output: result.results[i].actual ?? "",
153
+ latencyMs: result.results[i].durationMs,
154
+ });
155
+ }
156
+ }
157
+ else {
158
+ // Match by inputHash (same canonicalization as platform)
159
+ const tcRes = await fetch(`${url}/api/evaluations/${evalId}/test-cases?limit=500`, {
160
+ headers: { Authorization: `Bearer ${evalgateKey}` },
161
+ });
162
+ if (!tcRes.ok) {
163
+ console.log("Could not fetch test cases. Check evaluationId and EVALGATE_API_KEY.");
164
+ return evalResult;
165
+ }
166
+ const platformCases = (await tcRes.json());
167
+ const hashToIds = new Map();
168
+ for (const tc of platformCases) {
169
+ const input = tc.input ?? "";
170
+ if (!input.trim())
171
+ continue;
172
+ const hash = (0, input_hash_1.sha256Input)(input);
173
+ const existing = hashToIds.get(hash) ?? [];
174
+ existing.push(tc.id);
175
+ hashToIds.set(hash, existing);
176
+ }
177
+ for (const r of result.results) {
178
+ const hash = (0, input_hash_1.sha256Input)(r.input ?? "");
179
+ const ids = hashToIds.get(hash);
180
+ if (ids == null || ids.length === 0) {
181
+ console.log(`No platform test case matches input: "${(r.input ?? "").slice(0, 50)}…"`);
182
+ return evalResult;
183
+ }
184
+ if (ids.length > 1) {
185
+ console.log(`Multiple platform test cases share the same input (hash collision). Use testCaseId in cases.`);
186
+ return evalResult;
187
+ }
188
+ importResults.push({
189
+ testCaseId: ids[0],
190
+ status: r.passed ? "passed" : "failed",
191
+ output: r.actual ?? "",
192
+ latencyMs: r.durationMs,
193
+ });
194
+ }
195
+ }
196
+ if (importResults.length !== result.results.length) {
197
+ console.log("Could not match all results to platform test cases.");
198
+ return evalResult;
199
+ }
200
+ const sdkVersion = "1.4.1";
201
+ const headers = {
202
+ "Content-Type": "application/json",
203
+ Authorization: `Bearer ${evalgateKey}`,
204
+ };
205
+ if (options.idempotencyKey) {
206
+ headers["Idempotency-Key"] = options.idempotencyKey;
207
+ }
208
+ const importRes = await fetch(`${url}/api/evaluations/${evalId}/runs/import`, {
209
+ method: "POST",
210
+ headers,
211
+ body: JSON.stringify({
212
+ environment: "dev",
213
+ results: importResults,
214
+ importClientVersion: sdkVersion,
215
+ }),
216
+ });
217
+ if (!importRes.ok) {
218
+ const body = await importRes.text();
219
+ console.log(`Upload failed: ${importRes.status} — ${body}`);
220
+ return evalResult;
221
+ }
222
+ const importData = (await importRes.json());
223
+ if (importData.dashboardUrl) {
224
+ console.log(`Dashboard: ${importData.dashboardUrl}`);
225
+ }
226
+ }
227
+ catch (err) {
228
+ console.log("Upload failed:", err instanceof Error ? err.message : String(err));
229
+ }
230
+ }
231
+ return evalResult;
232
+ }
@@ -0,0 +1,92 @@
1
+ /**
2
+ * OpenAI Integration
3
+ * Tier 1.2: Framework Auto-Instrumentation - OpenAI wrapper
4
+ *
5
+ * @example
6
+ * ```typescript
7
+ * import { traceOpenAI } from '@ai-eval-platform/sdk/integrations/openai';
8
+ * import OpenAI from 'openai';
9
+ *
10
+ * const openai = new OpenAI({ apiKey: '...' });
11
+ * const tracedOpenAI = traceOpenAI(openai, client);
12
+ *
13
+ * // All calls are automatically traced
14
+ * const response = await tracedOpenAI.chat.completions.create({
15
+ * model: 'gpt-4',
16
+ * messages: [{ role: 'user', content: 'Hello!' }]
17
+ * });
18
+ * ```
19
+ */
20
+ import type { AIEvalClient } from "../client";
21
/** Minimal shape of the chat-completion request that the tracing wrapper relies on. */
interface OpenAIChatParams {
    model: string;
    messages: Array<unknown>;
    temperature?: number;
    max_tokens?: number;
    /** Any other request field is accepted as-is. */
    [key: string]: unknown;
}
28
/** Minimal shape of a chat-completion response that the tracing wrapper relies on. */
interface OpenAIChatCompletion {
    choices: {
        message?: unknown;
        finish_reason?: unknown;
    }[];
    usage?: unknown;
    /** Any other response field is accepted as-is. */
    [key: string]: unknown;
}
36
/** Structural type for the part of an OpenAI client used here: `chat.completions.create`. */
interface OpenAIClient {
    chat: {
        completions: {
            create: (
                params: OpenAIChatParams,
                requestOptions?: Record<string, unknown>,
            ) => Promise<OpenAIChatCompletion>;
        };
    };
}
43
/** Settings shared by `traceOpenAI` and `traceOpenAICall`. */
export interface OpenAITraceOptions {
    /** Capture the request input in the trace (default: true). */
    captureInput?: boolean;
    /** Capture the model output in the trace (default: true). */
    captureOutput?: boolean;
    /** Capture additional metadata in the trace (default: true). */
    captureMetadata?: boolean;
    /** Organization ID to attach to traces. */
    organizationId?: number;
    /** Custom prefix for generated traces. */
    tracePrefix?: string;
}
55
/**
 * Instrument an OpenAI client so that its chat-completion calls are traced
 * automatically through the given eval client.
 *
 * @param openai - Client exposing `chat.completions.create`.
 * @param evalClient - EvalGate client that receives the traces.
 * @param options - Optional capture / naming settings.
 * @returns An instrumented client with the same `OpenAIClient` shape.
 *
 * @example
 * ```typescript
 * import OpenAI from 'openai';
 *
 * const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });
 * const tracedOpenAI = traceOpenAI(openai, evalClient);
 *
 * // Automatically traced
 * const completion = await tracedOpenAI.chat.completions.create({
 *   model: 'gpt-4',
 *   messages: [{ role: 'user', content: 'Hello!' }]
 * });
 * ```
 */
export declare function traceOpenAI(
    openai: OpenAIClient,
    evalClient: AIEvalClient,
    options?: OpenAITraceOptions,
): OpenAIClient;
74
/**
 * Trace a single OpenAI call by hand: runs `fn` and resolves to its result.
 *
 * @param evalClient - EvalGate client that receives the trace.
 * @param name - Name for the trace.
 * @param fn - Async function performing the OpenAI call.
 * @param options - Optional trace settings.
 * @returns A promise for the value `fn` resolves to.
 *
 * @example
 * ```typescript
 * const response = await traceOpenAICall(
 *   evalClient,
 *   'gpt-4-completion',
 *   async () => {
 *     return await openai.chat.completions.create({
 *       model: 'gpt-4',
 *       messages: [{ role: 'user', content: 'Hello!' }]
 *     });
 *   }
 * );
 * ```
 */
export declare function traceOpenAICall<T>(
    evalClient: AIEvalClient,
    name: string,
    fn: () => Promise<T>,
    options?: OpenAITraceOptions,
): Promise<T>;
92
+ export {};