@pauly4010/evalai-sdk 1.4.1 → 1.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +85 -0
- package/README.md +205 -543
- package/dist/assertions.d.ts +2 -2
- package/dist/assertions.js +104 -71
- package/dist/batch.js +12 -17
- package/dist/cache.js +7 -11
- package/dist/cli/api.d.ts +108 -0
- package/dist/cli/api.js +130 -0
- package/dist/cli/check.d.ts +28 -13
- package/dist/cli/check.js +249 -142
- package/dist/cli/ci-context.d.ts +6 -0
- package/dist/cli/ci-context.js +110 -0
- package/dist/cli/config.d.ts +30 -0
- package/dist/cli/config.js +207 -0
- package/dist/cli/constants.d.ts +15 -0
- package/dist/cli/constants.js +18 -0
- package/dist/cli/doctor.d.ts +11 -0
- package/dist/cli/doctor.js +82 -0
- package/dist/cli/formatters/github.d.ts +8 -0
- package/dist/cli/formatters/github.js +130 -0
- package/dist/cli/formatters/human.d.ts +6 -0
- package/dist/cli/formatters/human.js +107 -0
- package/dist/cli/formatters/json.d.ts +6 -0
- package/dist/cli/formatters/json.js +10 -0
- package/dist/cli/formatters/pr-comment.d.ts +12 -0
- package/dist/cli/formatters/pr-comment.js +101 -0
- package/dist/cli/formatters/types.d.ts +100 -0
- package/dist/cli/formatters/types.js +5 -0
- package/dist/cli/gate.d.ts +21 -0
- package/dist/cli/gate.js +175 -0
- package/dist/cli/index.d.ts +1 -0
- package/dist/cli/index.js +67 -23
- package/dist/cli/init.d.ts +7 -0
- package/dist/cli/init.js +69 -0
- package/dist/cli/policy-packs.d.ts +23 -0
- package/dist/cli/policy-packs.js +83 -0
- package/dist/cli/profiles.d.ts +28 -0
- package/dist/cli/profiles.js +30 -0
- package/dist/cli/reason-codes.d.ts +17 -0
- package/dist/cli/reason-codes.js +19 -0
- package/dist/cli/render/snippet.d.ts +5 -0
- package/dist/cli/render/snippet.js +15 -0
- package/dist/cli/render/sort.d.ts +10 -0
- package/dist/cli/render/sort.js +24 -0
- package/dist/cli/report/build-check-report.d.ts +19 -0
- package/dist/cli/report/build-check-report.js +124 -0
- package/dist/cli/share.d.ts +17 -0
- package/dist/cli/share.js +83 -0
- package/dist/client.d.ts +2 -2
- package/dist/client.js +144 -132
- package/dist/context.d.ts +1 -1
- package/dist/context.js +4 -6
- package/dist/errors.d.ts +2 -0
- package/dist/errors.js +116 -107
- package/dist/export.d.ts +6 -6
- package/dist/export.js +39 -33
- package/dist/index.d.ts +25 -24
- package/dist/index.js +62 -56
- package/dist/integrations/anthropic.d.ts +1 -1
- package/dist/integrations/anthropic.js +23 -19
- package/dist/integrations/openai-eval.d.ts +57 -0
- package/dist/integrations/openai-eval.js +230 -0
- package/dist/integrations/openai.d.ts +1 -1
- package/dist/integrations/openai.js +23 -19
- package/dist/local.d.ts +2 -2
- package/dist/local.js +25 -25
- package/dist/logger.d.ts +1 -1
- package/dist/logger.js +24 -28
- package/dist/matchers/index.d.ts +1 -0
- package/dist/matchers/index.js +6 -0
- package/dist/matchers/to-pass-gate.d.ts +29 -0
- package/dist/matchers/to-pass-gate.js +35 -0
- package/dist/pagination.d.ts +1 -1
- package/dist/pagination.js +6 -6
- package/dist/snapshot.js +24 -24
- package/dist/streaming.js +11 -11
- package/dist/testing.d.ts +6 -2
- package/dist/testing.js +30 -12
- package/dist/types.d.ts +22 -22
- package/dist/types.js +13 -13
- package/dist/utils/input-hash.d.ts +8 -0
- package/dist/utils/input-hash.js +38 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.js +10 -0
- package/dist/workflows.d.ts +7 -7
- package/dist/workflows.js +44 -44
- package/package.json +102 -90
- package/dist/__tests__/assertions.test.d.ts +0 -1
- package/dist/__tests__/assertions.test.js +0 -288
- package/dist/__tests__/client.test.d.ts +0 -1
- package/dist/__tests__/client.test.js +0 -185
- package/dist/__tests__/testing.test.d.ts +0 -1
- package/dist/__tests__/testing.test.js +0 -230
- package/dist/__tests__/workflows.test.d.ts +0 -1
- package/dist/__tests__/workflows.test.js +0 -222
package/README.md
CHANGED
|
@@ -1,641 +1,303 @@
|
|
|
1
1
|
# @pauly4010/evalai-sdk
|
|
2
2
|
|
|
3
|
-
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
3
|
+
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
4
4
|
[](https://www.npmjs.com/package/@pauly4010/evalai-sdk)
|
|
5
5
|
|
|
6
|
-
|
|
6
|
+
**Stop LLM regressions in CI in minutes.**
|
|
7
7
|
|
|
8
|
-
|
|
8
|
+
Evaluate locally in 60 seconds. Add an optional CI gate in 2 minutes.
|
|
9
|
+
No lock-in — remove by deleting `evalai.config.json`.
|
|
10
|
+
|
|
11
|
+
---
|
|
12
|
+
|
|
13
|
+
# 🚀 1) 60 seconds: Run locally (no account)
|
|
14
|
+
|
|
15
|
+
Install, run, get a score.
|
|
16
|
+
No EvalAI account. No API key. No dashboard required.
|
|
9
17
|
|
|
10
18
|
```bash
|
|
11
|
-
npm install @pauly4010/evalai-sdk
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
19
|
+
npm install @pauly4010/evalai-sdk openai
|
|
20
|
+
import { openAIChatEval } from "@pauly4010/evalai-sdk";
|
|
21
|
+
|
|
22
|
+
await openAIChatEval({
|
|
23
|
+
name: "chat-regression",
|
|
24
|
+
cases: [
|
|
25
|
+
{ input: "Hello", expectedOutput: "greeting" },
|
|
26
|
+
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
27
|
+
],
|
|
28
|
+
});
|
|
29
|
+
Set your OpenAI API key:
|
|
30
|
+
|
|
31
|
+
OPENAI_API_KEY=...
|
|
32
|
+
✅ Vitest integration (recommended)
|
|
33
|
+
import {
|
|
34
|
+
openAIChatEval,
|
|
35
|
+
extendExpectWithToPassGate,
|
|
36
|
+
} from "@pauly4010/evalai-sdk";
|
|
37
|
+
import { expect } from "vitest";
|
|
38
|
+
|
|
39
|
+
extendExpectWithToPassGate(expect);
|
|
40
|
+
|
|
41
|
+
it("passes gate", async () => {
|
|
42
|
+
const result = await openAIChatEval({
|
|
43
|
+
name: "chat-regression",
|
|
44
|
+
cases: [
|
|
45
|
+
{ input: "Hello", expectedOutput: "greeting" },
|
|
46
|
+
{ input: "2 + 2 = ?", expectedOutput: "4" },
|
|
47
|
+
],
|
|
48
|
+
});
|
|
17
49
|
|
|
18
|
-
|
|
50
|
+
expect(result).toPassGate();
|
|
51
|
+
});
|
|
52
|
+
Example output
|
|
53
|
+
PASS 2/2 (score: 100)
|
|
19
54
|
|
|
20
|
-
|
|
55
|
+
Tip: Want dashboards and history?
|
|
56
|
+
Set EVALAI_API_KEY and connect this to the platform.
|
|
57
|
+
Failures show:
|
|
21
58
|
|
|
22
|
-
|
|
59
|
+
FAIL 9/10 (score: 90)
|
|
60
|
+
with failed cases and CI guidance.
|
|
23
61
|
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
- LLM Judge API
|
|
27
|
-
- Annotations API
|
|
28
|
-
- Developer API (API Keys, Webhooks, Usage)
|
|
29
|
-
- Organizations API
|
|
30
|
-
- Assertions Library
|
|
31
|
-
- Test Suites
|
|
32
|
-
- Error Handling
|
|
62
|
+
⚡ 2) Optional: Add a CI gate (2 minutes)
|
|
63
|
+
When you're ready to gate PRs on quality and regressions:
|
|
33
64
|
|
|
34
|
-
|
|
65
|
+
npx -y @pauly4010/evalai-sdk@^1 init
|
|
66
|
+
Create an evaluation in the dashboard and paste its ID into:
|
|
35
67
|
|
|
36
|
-
|
|
68
|
+
{
|
|
69
|
+
"evaluationId": "42"
|
|
70
|
+
}
|
|
71
|
+
Add to your CI:
|
|
37
72
|
|
|
38
|
-
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
-
|
|
73
|
+
- name: EvalAI gate
|
|
74
|
+
env:
|
|
75
|
+
EVALAI_API_KEY: ${{ secrets.EVALAI_API_KEY }}
|
|
76
|
+
run: npx -y @pauly4010/evalai-sdk@^1 check --format github --onFail import --warnDrop 1
|
|
77
|
+
You’ll get:
|
|
42
78
|
|
|
43
|
-
|
|
79
|
+
GitHub annotations
|
|
44
80
|
|
|
45
|
-
|
|
46
|
-
- **Browser**: Basic context support (not safe across all async boundaries)
|
|
81
|
+
Step summary
|
|
47
82
|
|
|
48
|
-
|
|
83
|
+
Optional dashboard link
|
|
49
84
|
|
|
50
|
-
|
|
85
|
+
PASS / WARN / FAIL (v1.5.5)
|
|
86
|
+
EvalAI introduces a WARN band so teams can see meaningful regressions without always blocking merges.
|
|
51
87
|
|
|
52
|
-
|
|
53
|
-
import { AIEvalClient } from "@pauly4010/evalai-sdk";
|
|
88
|
+
Behavior
|
|
54
89
|
|
|
55
|
-
|
|
56
|
-
const client = AIEvalClient.init();
|
|
90
|
+
PASS → within thresholds
|
|
57
91
|
|
|
58
|
-
|
|
59
|
-
const client = new AIEvalClient({
|
|
60
|
-
apiKey: "your-api-key",
|
|
61
|
-
organizationId: 123,
|
|
62
|
-
debug: true,
|
|
63
|
-
});
|
|
64
|
-
```
|
|
92
|
+
WARN → regression > warnDrop but < maxDrop
|
|
65
93
|
|
|
66
|
-
|
|
94
|
+
FAIL → regression > maxDrop
|
|
67
95
|
|
|
68
|
-
|
|
96
|
+
Key flags
|
|
69
97
|
|
|
70
|
-
|
|
98
|
+
--warnDrop → soft regression warning
|
|
71
99
|
|
|
72
|
-
|
|
73
|
-
import { EvaluationTemplates } from "@pauly4010/evalai-sdk";
|
|
100
|
+
--maxDrop → hard regression fail
|
|
74
101
|
|
|
75
|
-
|
|
76
|
-
await client.evaluations.create({
|
|
77
|
-
name: "Prompt Optimization Test",
|
|
78
|
-
type: EvaluationTemplates.PROMPT_OPTIMIZATION,
|
|
79
|
-
createdBy: userId,
|
|
80
|
-
});
|
|
102
|
+
--fail-on-flake → fail if any test is unstable
|
|
81
103
|
|
|
82
|
-
|
|
83
|
-
// Core Testing
|
|
84
|
-
EvaluationTemplates.UNIT_TESTING;
|
|
85
|
-
EvaluationTemplates.OUTPUT_QUALITY;
|
|
86
|
-
|
|
87
|
-
// Advanced Evaluation
|
|
88
|
-
EvaluationTemplates.PROMPT_OPTIMIZATION;
|
|
89
|
-
EvaluationTemplates.CHAIN_OF_THOUGHT;
|
|
90
|
-
EvaluationTemplates.LONG_CONTEXT_TESTING;
|
|
91
|
-
EvaluationTemplates.MODEL_STEERING;
|
|
92
|
-
EvaluationTemplates.REGRESSION_TESTING;
|
|
93
|
-
EvaluationTemplates.CONFIDENCE_CALIBRATION;
|
|
94
|
-
|
|
95
|
-
// Safety & Compliance
|
|
96
|
-
EvaluationTemplates.SAFETY_COMPLIANCE;
|
|
97
|
-
|
|
98
|
-
// Domain-Specific
|
|
99
|
-
EvaluationTemplates.RAG_EVALUATION;
|
|
100
|
-
EvaluationTemplates.CODE_GENERATION;
|
|
101
|
-
EvaluationTemplates.SUMMARIZATION;
|
|
102
|
-
```
|
|
104
|
+
This lets teams tune the signal-to-noise ratio in CI.
|
|
103
105
|
|
|
104
|
-
|
|
106
|
+
🔒 3) No lock-in
|
|
107
|
+
To stop using EvalAI:
|
|
105
108
|
|
|
106
|
-
|
|
109
|
+
rm evalai.config.json
|
|
110
|
+
Your local openAIChatEval runs continue to work exactly the same.
|
|
107
111
|
|
|
108
|
-
|
|
109
|
-
// Get current usage and limits
|
|
110
|
-
const limits = await client.getOrganizationLimits();
|
|
112
|
+
No account cancellation needed. No data export required.
|
|
111
113
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
114
|
+
📦 Installation
|
|
115
|
+
npm install @pauly4010/evalai-sdk openai
|
|
116
|
+
# or
|
|
117
|
+
yarn add @pauly4010/evalai-sdk openai
|
|
118
|
+
# or
|
|
119
|
+
pnpm add @pauly4010/evalai-sdk openai
|
|
120
|
+
🖥️ Environment Support
|
|
121
|
+
This SDK works in both Node.js and browsers, with some Node-only features.
|
|
117
122
|
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
balance: limits.evals_per_organization?.balance,
|
|
121
|
-
total: limits.evals_per_organization?.included_usage,
|
|
122
|
-
});
|
|
123
|
+
✅ Works Everywhere (Node.js + Browser)
|
|
124
|
+
Traces API
|
|
123
125
|
|
|
124
|
-
|
|
125
|
-
usage: limits.annotations_per_organization?.usage,
|
|
126
|
-
balance: limits.annotations_per_organization?.balance,
|
|
127
|
-
total: limits.annotations_per_organization?.included_usage,
|
|
128
|
-
});
|
|
129
|
-
```
|
|
126
|
+
Evaluations API
|
|
130
127
|
|
|
131
|
-
|
|
128
|
+
LLM Judge API
|
|
132
129
|
|
|
133
|
-
|
|
134
|
-
// Create a trace
|
|
135
|
-
const trace = await client.traces.create({
|
|
136
|
-
name: "User Query",
|
|
137
|
-
traceId: "trace-123",
|
|
138
|
-
metadata: { userId: "456" },
|
|
139
|
-
});
|
|
130
|
+
Annotations API
|
|
140
131
|
|
|
141
|
-
|
|
142
|
-
const traces = await client.traces.list({
|
|
143
|
-
limit: 10,
|
|
144
|
-
status: "success",
|
|
145
|
-
});
|
|
132
|
+
Developer API (API Keys, Webhooks, Usage)
|
|
146
133
|
|
|
147
|
-
|
|
148
|
-
const span = await client.traces.createSpan(trace.id, {
|
|
149
|
-
name: "LLM Call",
|
|
150
|
-
spanId: "span-456",
|
|
151
|
-
startTime: new Date().toISOString(),
|
|
152
|
-
metadata: { model: "gpt-4" },
|
|
153
|
-
});
|
|
154
|
-
```
|
|
134
|
+
Organizations API
|
|
155
135
|
|
|
156
|
-
|
|
136
|
+
Assertions Library
|
|
157
137
|
|
|
158
|
-
|
|
159
|
-
// Create evaluation
|
|
160
|
-
const evaluation = await client.evaluations.create({
|
|
161
|
-
name: "Chatbot Responses",
|
|
162
|
-
type: EvaluationTemplates.OUTPUT_QUALITY,
|
|
163
|
-
description: "Test chatbot response quality",
|
|
164
|
-
createdBy: userId,
|
|
165
|
-
});
|
|
138
|
+
Test Suites
|
|
166
139
|
|
|
167
|
-
|
|
168
|
-
await client.evaluations.createTestCase(evaluation.id, {
|
|
169
|
-
input: "What is the capital of France?",
|
|
170
|
-
expectedOutput: "Paris",
|
|
171
|
-
});
|
|
140
|
+
Error Handling
|
|
172
141
|
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
status: "running",
|
|
176
|
-
});
|
|
177
|
-
```
|
|
142
|
+
🟡 Node.js Only
|
|
143
|
+
These require Node.js:
|
|
178
144
|
|
|
179
|
-
|
|
145
|
+
Snapshot Testing
|
|
180
146
|
|
|
181
|
-
|
|
182
|
-
// Evaluate with LLM judge
|
|
183
|
-
const result = await client.llmJudge.evaluate({
|
|
184
|
-
configId: 1,
|
|
185
|
-
input: "Translate: Hello world",
|
|
186
|
-
output: "Bonjour le monde",
|
|
187
|
-
metadata: { language: "French" },
|
|
188
|
-
});
|
|
147
|
+
Local Storage Mode
|
|
189
148
|
|
|
190
|
-
|
|
191
|
-
console.log("Reasoning:", result.result.reasoning);
|
|
192
|
-
```
|
|
149
|
+
CLI Tool
|
|
193
150
|
|
|
194
|
-
|
|
151
|
+
Export to File
|
|
195
152
|
|
|
196
|
-
|
|
153
|
+
🔄 Context Propagation
|
|
154
|
+
Node.js: full async context via AsyncLocalStorage
|
|
197
155
|
|
|
198
|
-
|
|
199
|
-
# Required
|
|
200
|
-
EVALAI_API_KEY=your-api-key
|
|
156
|
+
Browser: basic support (not safe across all async boundaries)
|
|
201
157
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
EVALAI_BASE_URL=https://api.example.com
|
|
205
|
-
```
|
|
158
|
+
🧠 AIEvalClient (Platform API)
|
|
159
|
+
import { AIEvalClient } from "@pauly4010/evalai-sdk";
|
|
206
160
|
|
|
207
|
-
|
|
161
|
+
// From env
|
|
162
|
+
const client = AIEvalClient.init();
|
|
208
163
|
|
|
209
|
-
|
|
210
|
-
const
|
|
164
|
+
// Explicit
|
|
165
|
+
const client2 = new AIEvalClient({
|
|
211
166
|
apiKey: "your-api-key",
|
|
212
167
|
organizationId: 123,
|
|
213
|
-
baseUrl: "https://api.example.com",
|
|
214
|
-
timeout: 30000,
|
|
215
168
|
debug: true,
|
|
216
|
-
logLevel: "debug",
|
|
217
|
-
retry: {
|
|
218
|
-
maxAttempts: 3,
|
|
219
|
-
backoff: "exponential",
|
|
220
|
-
retryableErrors: ["RATE_LIMIT_EXCEEDED", "TIMEOUT"],
|
|
221
|
-
},
|
|
222
169
|
});
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
170
|
+
🧪 evalai CLI (v1.5.5)
|
|
171
|
+
The CLI gates deployments on quality, regression, and policy.
|
|
172
|
+
|
|
173
|
+
Quick start
|
|
174
|
+
npx -y @pauly4010/evalai-sdk@^1 check \
|
|
175
|
+
--evaluationId 42 \
|
|
176
|
+
--apiKey $EVALAI_API_KEY
|
|
177
|
+
evalai check
|
|
178
|
+
Option Description
|
|
179
|
+
--evaluationId <id> Required. Evaluation to gate on
|
|
180
|
+
--apiKey <key> API key (or EVALAI_API_KEY)
|
|
181
|
+
--format <fmt> human, json, or github
|
|
182
|
+
--onFail import Import failing run to dashboard
|
|
183
|
+
--explain Show score breakdown
|
|
184
|
+
--minScore <n> Fail if score < n
|
|
185
|
+
--warnDrop <n> Warn if regression exceeds n
|
|
186
|
+
--maxDrop <n> Fail if regression exceeds n
|
|
187
|
+
--minN <n> Fail if test count < n
|
|
188
|
+
--allowWeakEvidence Permit weak evidence
|
|
189
|
+
--policy <name> HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511
|
|
190
|
+
--baseline <mode> published, previous, production
|
|
191
|
+
--fail-on-flake Fail if any case is flaky
|
|
192
|
+
--baseUrl <url> Override API base URL
|
|
193
|
+
|
|
194
|
+
Exit codes
|
|
195
|
+
Code Meaning
|
|
196
|
+
0 PASS
|
|
197
|
+
8 WARN
|
|
198
|
+
1 Score below threshold
|
|
199
|
+
2 Regression failure
|
|
200
|
+
3 Policy violation
|
|
201
|
+
4 API error
|
|
202
|
+
5 Bad arguments
|
|
203
|
+
6 Low test count
|
|
204
|
+
7 Weak evidence
|
|
205
|
+
evalai doctor
|
|
206
|
+
Verify CI setup before running the gate:
|
|
207
|
+
|
|
208
|
+
npx -y @pauly4010/evalai-sdk@^1 doctor \
|
|
209
|
+
--evaluationId 42 \
|
|
210
|
+
--apiKey $EVALAI_API_KEY
|
|
211
|
+
If `doctor` passes, `check` will work.
|
|
212
|
+
|
|
213
|
+
🧯 Error Handling
|
|
214
|
+
import { EvalAIError, RateLimitError } from "@pauly4010/evalai-sdk";
|
|
229
215
|
|
|
230
216
|
try {
|
|
231
|
-
await client.traces.create({
|
|
232
|
-
} catch (
|
|
233
|
-
if (
|
|
234
|
-
console.log(
|
|
235
|
-
} else if (
|
|
236
|
-
console.log(
|
|
217
|
+
await client.traces.create({ name: "User Query" });
|
|
218
|
+
} catch (err) {
|
|
219
|
+
if (err instanceof RateLimitError) {
|
|
220
|
+
console.log("Retry after:", err.retryAfter);
|
|
221
|
+
} else if (err instanceof EvalAIError) {
|
|
222
|
+
console.log(err.code, err.message, err.requestId);
|
|
237
223
|
}
|
|
238
224
|
}
|
|
239
|
-
```
|
|
240
225
|
|
|
241
|
-
## Advanced Features
|
|
242
226
|
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
withContext({ userId: "123", sessionId: "abc" }, async () => {
|
|
249
|
-
// Context automatically included in all traces
|
|
250
|
-
await client.traces.create({
|
|
251
|
-
name: "Query",
|
|
252
|
-
traceId: "trace-1",
|
|
253
|
-
});
|
|
227
|
+
🔍 Traces
|
|
228
|
+
const trace = await client.traces.create({
|
|
229
|
+
name: "User Query",
|
|
230
|
+
traceId: "trace-123",
|
|
231
|
+
metadata: { userId: "456" },
|
|
254
232
|
});
|
|
255
|
-
```
|
|
256
233
|
|
|
257
|
-
### Test Suites
|
|
258
234
|
|
|
259
|
-
|
|
260
|
-
import {
|
|
235
|
+
📝 Evaluations
|
|
236
|
+
import { EvaluationTemplates } from "@pauly4010/evalai-sdk";
|
|
261
237
|
|
|
262
|
-
const
|
|
263
|
-
name: "Chatbot
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
name: "Greeting",
|
|
267
|
-
input: "Hello",
|
|
268
|
-
expectedOutput: "Hi there!",
|
|
269
|
-
},
|
|
270
|
-
],
|
|
238
|
+
const evaluation = await client.evaluations.create({
|
|
239
|
+
name: "Chatbot Responses",
|
|
240
|
+
type: EvaluationTemplates.OUTPUT_QUALITY,
|
|
241
|
+
createdBy: userId,
|
|
271
242
|
});
|
|
272
243
|
|
|
273
|
-
await suite.run(client);
|
|
274
|
-
```
|
|
275
244
|
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
```typescript
|
|
245
|
+
🔌 Framework Integrations
|
|
279
246
|
import { traceOpenAI } from "@pauly4010/evalai-sdk/integrations/openai";
|
|
280
247
|
import OpenAI from "openai";
|
|
281
248
|
|
|
282
249
|
const openai = traceOpenAI(new OpenAI(), client);
|
|
283
250
|
|
|
284
|
-
|
|
285
|
-
const response = await openai.chat.completions.create({
|
|
251
|
+
await openai.chat.completions.create({
|
|
286
252
|
model: "gpt-4",
|
|
287
253
|
messages: [{ role: "user", content: "Hello" }],
|
|
288
254
|
});
|
|
289
|
-
```
|
|
290
|
-
|
|
291
|
-
## TypeScript Support
|
|
292
|
-
|
|
293
|
-
The SDK is fully typed with TypeScript generics for type-safe metadata:
|
|
294
|
-
|
|
295
|
-
```typescript
|
|
296
|
-
interface CustomMetadata {
|
|
297
|
-
userId: string;
|
|
298
|
-
sessionId: string;
|
|
299
|
-
model: string;
|
|
300
|
-
}
|
|
301
|
-
|
|
302
|
-
const trace = await client.traces.create<CustomMetadata>({
|
|
303
|
-
name: "Query",
|
|
304
|
-
traceId: "trace-1",
|
|
305
|
-
metadata: {
|
|
306
|
-
userId: "123",
|
|
307
|
-
sessionId: "abc",
|
|
308
|
-
model: "gpt-4",
|
|
309
|
-
},
|
|
310
|
-
});
|
|
311
|
-
|
|
312
|
-
// TypeScript knows the exact metadata type
|
|
313
|
-
console.log(trace.metadata.userId);
|
|
314
|
-
```
|
|
315
|
-
|
|
316
|
-
## 📋 Annotations API (v1.2.0)
|
|
317
|
-
|
|
318
|
-
Human-in-the-loop evaluation for quality assurance:
|
|
319
|
-
|
|
320
|
-
```typescript
|
|
321
|
-
// Create an annotation
|
|
322
|
-
const annotation = await client.annotations.create({
|
|
323
|
-
evaluationRunId: 123,
|
|
324
|
-
testCaseId: 456,
|
|
325
|
-
rating: 5,
|
|
326
|
-
feedback: "Excellent response!",
|
|
327
|
-
labels: { category: "helpful", sentiment: "positive" },
|
|
328
|
-
});
|
|
329
|
-
|
|
330
|
-
// List annotations
|
|
331
|
-
const annotations = await client.annotations.list({
|
|
332
|
-
evaluationRunId: 123,
|
|
333
|
-
});
|
|
334
|
-
|
|
335
|
-
// Annotation Tasks
|
|
336
|
-
const task = await client.annotations.tasks.create({
|
|
337
|
-
name: "Q4 Quality Review",
|
|
338
|
-
type: "classification",
|
|
339
|
-
organizationId: 1,
|
|
340
|
-
instructions: "Rate responses from 1-5",
|
|
341
|
-
});
|
|
342
|
-
|
|
343
|
-
const tasks = await client.annotations.tasks.list({
|
|
344
|
-
organizationId: 1,
|
|
345
|
-
status: "pending",
|
|
346
|
-
});
|
|
347
|
-
|
|
348
|
-
const taskDetail = await client.annotations.tasks.get(taskId);
|
|
349
|
-
|
|
350
|
-
// Annotation Items
|
|
351
|
-
const item = await client.annotations.tasks.items.create(taskId, {
|
|
352
|
-
content: "Response to evaluate",
|
|
353
|
-
annotation: { rating: 4, category: "good" },
|
|
354
|
-
});
|
|
355
|
-
|
|
356
|
-
const items = await client.annotations.tasks.items.list(taskId);
|
|
357
|
-
```
|
|
358
|
-
|
|
359
|
-
## 🔑 Developer API (v1.2.0)
|
|
360
|
-
|
|
361
|
-
Manage API keys, webhooks, and monitor usage:
|
|
362
|
-
|
|
363
|
-
### API Keys
|
|
364
|
-
|
|
365
|
-
```typescript
|
|
366
|
-
// Create an API key
|
|
367
|
-
const { apiKey, id, keyPrefix } = await client.developer.apiKeys.create({
|
|
368
|
-
name: "Production Key",
|
|
369
|
-
organizationId: 1,
|
|
370
|
-
scopes: ["traces:read", "traces:write", "evaluations:read"],
|
|
371
|
-
expiresAt: "2025-12-31T23:59:59Z",
|
|
372
|
-
});
|
|
373
255
|
|
|
374
|
-
// IMPORTANT: Save the apiKey securely - it's only shown once!
|
|
375
256
|
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
});
|
|
257
|
+
🧭 Changelog
|
|
258
|
+
v1.5.5 (Latest)
|
|
259
|
+
PASS/WARN/FAIL gate semantics
|
|
380
260
|
|
|
381
|
-
|
|
382
|
-
await client.developer.apiKeys.update(keyId, {
|
|
383
|
-
name: "Updated Name",
|
|
384
|
-
scopes: ["traces:read"],
|
|
385
|
-
});
|
|
261
|
+
--warnDrop soft regression band
|
|
386
262
|
|
|
387
|
-
|
|
388
|
-
await client.developer.apiKeys.revoke(keyId);
|
|
263
|
+
Flake intelligence + per-case pass rates
|
|
389
264
|
|
|
390
|
-
|
|
391
|
-
const usage = await client.developer.apiKeys.getUsage(keyId);
|
|
392
|
-
console.log("Total requests:", usage.totalRequests);
|
|
393
|
-
console.log("By endpoint:", usage.usageByEndpoint);
|
|
394
|
-
```
|
|
265
|
+
--fail-on-flake enforcement
|
|
395
266
|
|
|
396
|
-
|
|
267
|
+
Golden regression suite
|
|
397
268
|
|
|
398
|
-
|
|
399
|
-
// Create a webhook
|
|
400
|
-
const webhook = await client.developer.webhooks.create({
|
|
401
|
-
organizationId: 1,
|
|
402
|
-
url: "https://your-app.com/webhooks/evalai",
|
|
403
|
-
events: ["trace.created", "evaluation.completed", "annotation.created"],
|
|
404
|
-
});
|
|
269
|
+
Nightly determinism + performance audits
|
|
405
270
|
|
|
406
|
-
|
|
407
|
-
const webhooks = await client.developer.webhooks.list({
|
|
408
|
-
organizationId: 1,
|
|
409
|
-
status: "active",
|
|
410
|
-
});
|
|
271
|
+
Audit trail, observability, retention, and migration safety docs
|
|
411
272
|
|
|
412
|
-
|
|
413
|
-
|
|
273
|
+
v1.5.0
|
|
274
|
+
GitHub annotations formatter
|
|
414
275
|
|
|
415
|
-
|
|
416
|
-
await client.developer.webhooks.update(webhookId, {
|
|
417
|
-
url: "https://new-url.com/webhooks",
|
|
418
|
-
events: ["trace.created"],
|
|
419
|
-
status: "inactive",
|
|
420
|
-
});
|
|
276
|
+
JSON formatter
|
|
421
277
|
|
|
422
|
-
|
|
423
|
-
await client.developer.webhooks.delete(webhookId);
|
|
278
|
+
--onFail import
|
|
424
279
|
|
|
425
|
-
|
|
426
|
-
const deliveries = await client.developer.webhooks.getDeliveries(webhookId, {
|
|
427
|
-
limit: 50,
|
|
428
|
-
success: false, // Only failed deliveries
|
|
429
|
-
});
|
|
430
|
-
```
|
|
280
|
+
--explain
|
|
431
281
|
|
|
432
|
-
|
|
282
|
+
evalai doctor
|
|
433
283
|
|
|
434
|
-
|
|
435
|
-
// Get detailed usage statistics
|
|
436
|
-
const stats = await client.developer.getUsage({
|
|
437
|
-
organizationId: 1,
|
|
438
|
-
startDate: "2025-01-01",
|
|
439
|
-
endDate: "2025-01-31",
|
|
440
|
-
});
|
|
284
|
+
CI pinned invocation guidance
|
|
441
285
|
|
|
442
|
-
console.log("Traces:", stats.traces.total);
|
|
443
|
-
console.log("Evaluations by type:", stats.evaluations.byType);
|
|
444
|
-
console.log("API calls by endpoint:", stats.apiCalls.byEndpoint);
|
|
445
286
|
|
|
446
|
-
|
|
447
|
-
const summary = await client.developer.getUsageSummary(organizationId);
|
|
448
|
-
console.log("Current period:", summary.currentPeriod);
|
|
449
|
-
console.log("Limits:", summary.limits);
|
|
450
|
-
```
|
|
451
|
-
|
|
452
|
-
## ⚖️ LLM Judge Extended (v1.2.0)
|
|
287
|
+
Environment Variable Safety
|
|
453
288
|
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
```typescript
|
|
457
|
-
// Create a judge configuration
|
|
458
|
-
const config = await client.llmJudge.createConfig({
|
|
459
|
-
name: "GPT-4 Accuracy Judge",
|
|
460
|
-
description: "Evaluates factual accuracy",
|
|
461
|
-
model: "gpt-4",
|
|
462
|
-
rubric: "Score 1-10 based on factual accuracy...",
|
|
463
|
-
temperature: 0.3,
|
|
464
|
-
maxTokens: 500,
|
|
465
|
-
organizationId: 1,
|
|
466
|
-
createdBy: userId,
|
|
467
|
-
});
|
|
289
|
+
The SDK never assumes `process.env` exists. All environment reads are guarded, so the client can initialize safely in browser, edge, and server runtimes.
|
|
468
290
|
|
|
469
|
-
|
|
470
|
-
const configs = await client.llmJudge.listConfigs({
|
|
471
|
-
organizationId: 1,
|
|
472
|
-
});
|
|
291
|
+
If environment variables are unavailable, the SDK falls back to explicit config.
|
|
473
292
|
|
|
474
|
-
// List results
|
|
475
|
-
const results = await client.llmJudge.listResults({
|
|
476
|
-
configId: config.id,
|
|
477
|
-
evaluationId: 123,
|
|
478
|
-
});
|
|
479
|
-
|
|
480
|
-
// Get alignment analysis
|
|
481
|
-
const alignment = await client.llmJudge.getAlignment({
|
|
482
|
-
configId: config.id,
|
|
483
|
-
startDate: "2025-01-01",
|
|
484
|
-
endDate: "2025-01-31",
|
|
485
|
-
});
|
|
486
|
-
|
|
487
|
-
console.log("Average score:", alignment.averageScore);
|
|
488
|
-
console.log("Accuracy:", alignment.alignmentMetrics.accuracy);
|
|
489
|
-
console.log("Agreement with human:", alignment.comparisonWithHuman?.agreement);
|
|
490
|
-
```
|
|
491
|
-
|
|
492
|
-
## 🏢 Organizations API (v1.2.0)
|
|
493
|
-
|
|
494
|
-
Manage organization details:
|
|
495
|
-
|
|
496
|
-
```typescript
|
|
497
|
-
// Get current organization
|
|
498
|
-
const org = await client.organizations.getCurrent();
|
|
499
|
-
console.log("Organization:", org.name);
|
|
500
|
-
console.log("Plan:", org.plan);
|
|
501
|
-
console.log("Status:", org.status);
|
|
502
|
-
```
|
|
503
|
-
|
|
504
|
-
## evalai CLI (v1.4.1)
|
|
505
|
-
|
|
506
|
-
The SDK includes a CLI for CI/CD evaluation gates. Install globally or use via `npx`:
|
|
507
|
-
|
|
508
|
-
```bash
|
|
509
|
-
# Via npx (no global install)
|
|
510
|
-
npx @pauly4010/evalai-sdk check --minScore 92 --evaluationId 42 --apiKey $EVALAI_API_KEY
|
|
511
|
-
|
|
512
|
-
# Or install globally
|
|
513
|
-
npm install -g @pauly4010/evalai-sdk
|
|
514
|
-
evalai check --minScore 92 --evaluationId 42
|
|
515
|
-
```
|
|
516
|
-
|
|
517
|
-
### evalai check
|
|
518
|
-
|
|
519
|
-
Gate deployments on quality scores, regression, and compliance:
|
|
520
|
-
|
|
521
|
-
| Option | Description |
|
|
522
|
-
|--------|-------------|
|
|
523
|
-
| `--evaluationId <id>` | **Required.** Evaluation to gate on |
|
|
524
|
-
| `--apiKey <key>` | API key (or `EVALAI_API_KEY` env) |
|
|
525
|
-
| `--minScore <n>` | Fail if score < n (0–100) |
|
|
526
|
-
| `--maxDrop <n>` | Fail if score dropped > n from baseline |
|
|
527
|
-
| `--minN <n>` | Fail if total test cases < n |
|
|
528
|
-
| `--allowWeakEvidence` | Permit weak evidence level |
|
|
529
|
-
| `--policy <name>` | Enforce HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511 |
|
|
530
|
-
| `--baseline <mode>` | `published`, `previous`, or `production` |
|
|
531
|
-
| `--baseUrl <url>` | API base URL |
|
|
532
|
-
|
|
533
|
-
**Exit codes:** 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
|
|
534
|
-
|
|
535
|
-
## Changelog
|
|
536
|
-
|
|
537
|
-
### v1.4.1 (Latest)
|
|
538
|
-
|
|
539
|
-
- **evalai check `--baseline production`** — Compare against latest prod-tagged run
|
|
540
|
-
- **Package hardening** — Leaner npm publish with `files`, `sideEffects: false`
|
|
541
|
-
|
|
542
|
-
### v1.4.0
|
|
543
|
-
|
|
544
|
-
- **evalai CLI** — Command-line tool for CI/CD evaluation gates
|
|
545
|
-
- `evalai check` — Gate deployments on quality scores, regression, and compliance
|
|
546
|
-
- `--minScore <n>` — Fail if quality score < n (0–100)
|
|
547
|
-
- `--maxDrop <n>` — Fail if score dropped > n points from baseline
|
|
548
|
-
- `--minN <n>` — Fail if total test cases < n
|
|
549
|
-
- `--allowWeakEvidence` — Permit weak evidence level (default: fail)
|
|
550
|
-
- `--policy <name>` — Enforce compliance (HIPAA, SOC2, GDPR, PCI_DSS, FINRA_4511)
|
|
551
|
-
- `--baseline <mode>` — Compare to `published` or `previous` run
|
|
552
|
-
- `--evaluationId <id>` — Required. Evaluation to gate on
|
|
553
|
-
- Environment: `EVALAI_API_KEY`, `EVALAI_BASE_URL`
|
|
554
|
-
- Exit codes: 0=pass, 1=score below, 2=regression, 3=policy violation, 4=API error, 5=bad args, 6=low N, 7=weak evidence
|
|
555
|
-
- **CLI Exports** — `parseArgs`, `runCheck`, `EXIT` from `@pauly4010/evalai-sdk` for programmatic use
|
|
556
|
-
|
|
557
|
-
### v1.3.0
|
|
558
|
-
|
|
559
|
-
- **Workflow Tracing** — Multi-agent orchestration with full lifecycle instrumentation
|
|
560
|
-
- `WorkflowTracer` class with `startWorkflow`, `endWorkflow`, `startAgentSpan`, `endAgentSpan`
|
|
561
|
-
- `createWorkflowTracer` convenience factory
|
|
562
|
-
- `traceWorkflowStep` generic wrapper for any async function
|
|
563
|
-
- Agent handoff recording (`delegation`, `escalation`, `parallel`, `fallback`)
|
|
564
|
-
- Decision auditing with alternatives, confidence scores, reasoning, and context factors
|
|
565
|
-
- Cost tracking per span/workflow with automatic pricing (16+ models)
|
|
566
|
-
- Cost breakdown by category (`llm`, `tool`, `embedding`, `other`)
|
|
567
|
-
- **Framework Integrations** — Wrap popular multi-agent frameworks:
|
|
568
|
-
- `traceLangChainAgent` — wraps `.invoke()` and `.call()` with auto-tracing
|
|
569
|
-
- `traceCrewAI` — wraps `.kickoff()` with workflow start/end
|
|
570
|
-
- `traceAutoGen` — wraps `.initiate_chat()` with workflow start/end
|
|
571
|
-
- **Performance Utilities**
|
|
572
|
-
- `RequestCache` with configurable TTL (`CacheTTL` presets)
|
|
573
|
-
- `PaginatedIterator` / `createPaginatedIterator` / `autoPaginate` for cursor-based pagination
|
|
574
|
-
- `RequestBatcher` for batching API calls
|
|
575
|
-
- `RateLimiter` client-side rate limit handling
|
|
576
|
-
- **Cost Tracking Types** — `CostRecord`, `CostBreakdown`, `ProviderPricing` interfaces
|
|
577
|
-
- **Agent Decision Auditing Types** — `AgentDecision`, `DecisionAlternative`, `RecordDecisionParams` interfaces
|
|
578
|
-
- **Benchmark Types** — `Benchmark`, `BenchmarkResult`, `AgentConfig` interfaces
|
|
579
|
-
|
|
580
|
-
### v1.2.1 (Bug Fixes)
|
|
581
|
-
|
|
582
|
-
- 🐛 **Critical Fixes**
|
|
583
|
-
- Fixed CLI import paths for proper npm package distribution
|
|
584
|
-
- Fixed duplicate trace creation in OpenAI/Anthropic integrations
|
|
585
|
-
- Fixed Commander.js command structure
|
|
586
|
-
- Added browser/Node.js environment detection and helpful errors
|
|
587
|
-
- Fixed context system to work in both Node.js and browsers
|
|
588
|
-
- Added security checks to snapshot path sanitization
|
|
589
|
-
- Removed misleading empty exports (StreamingClient, BatchClient)
|
|
590
|
-
- 📦 **Dependencies**
|
|
591
|
-
- Updated Commander to v14
|
|
592
|
-
- Added peer dependencies for OpenAI and Anthropic SDKs (optional)
|
|
593
|
-
- Added Node.js engine requirement (>=16.0.0)
|
|
594
|
-
- 📚 **Documentation**
|
|
595
|
-
- Clarified Node.js-only vs universal features
|
|
596
|
-
- Added environment support section
|
|
597
|
-
- Updated examples with security best practices
|
|
598
|
-
|
|
599
|
-
### v1.2.0
|
|
600
|
-
|
|
601
|
-
- 🎉 **100% API Coverage** - All backend endpoints now supported!
|
|
602
|
-
- 📋 **Annotations API** - Complete human-in-the-loop evaluation
|
|
603
|
-
- Create and list annotations
|
|
604
|
-
- Manage annotation tasks
|
|
605
|
-
- Handle annotation items
|
|
606
|
-
- 🔑 **Developer API** - Full API key and webhook management
|
|
607
|
-
- CRUD operations for API keys
|
|
608
|
-
- Webhook management with delivery tracking
|
|
609
|
-
- Usage analytics and monitoring
|
|
610
|
-
- ⚖️ **LLM Judge Extended** - Enhanced judge capabilities
|
|
611
|
-
- Configuration management
|
|
612
|
-
- Results querying
|
|
613
|
-
- Alignment analysis
|
|
614
|
-
- 🏢 **Organizations API** - Organization details access
|
|
615
|
-
- 📊 **Enhanced Types** - 40+ new TypeScript interfaces
|
|
616
|
-
- 📚 **Comprehensive Documentation** - Examples for all new features
|
|
617
|
-
|
|
618
|
-
### v1.1.0
|
|
619
|
-
|
|
620
|
-
- ✨ Added comprehensive evaluation template types
|
|
621
|
-
- ✨ Added organization resource limits tracking
|
|
622
|
-
- ✨ Added `getOrganizationLimits()` method
|
|
623
|
-
- 📚 Enhanced documentation with new features
|
|
624
|
-
|
|
625
|
-
### v1.0.0
|
|
626
|
-
|
|
627
|
-
- 🎉 Initial release
|
|
628
|
-
- ✅ Traces, Evaluations, LLM Judge APIs
|
|
629
|
-
- ✅ Framework integrations (OpenAI, Anthropic)
|
|
630
|
-
- ✅ Test suite builder
|
|
631
|
-
- ✅ Context propagation
|
|
632
|
-
- ✅ Error handling & retries
|
|
633
|
-
|
|
634
|
-
## License
|
|
635
293
|
|
|
294
|
+
📄 License
|
|
636
295
|
MIT
|
|
637
296
|
|
|
638
|
-
|
|
297
|
+
🤝 Support
|
|
298
|
+
Documentation:
|
|
299
|
+
https://v0-ai-evaluation-platform-nu.vercel.app/documentation
|
|
639
300
|
|
|
640
|
-
|
|
641
|
-
|
|
301
|
+
Issues:
|
|
302
|
+
https://github.com/pauly7610/ai-evaluation-platform/issues
|
|
303
|
+
```
|