npm - @langwatch/mcp-server - Versions diffs - 0.3.2 → 0.4.0 - Mend

@langwatch/mcp-server 0.3.2 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

package/CHANGELOG.md +20 -0
package/README.md +97 -25
package/dist/chunk-AAQNA53E.js +28 -0
package/dist/chunk-AAQNA53E.js.map +1 -0
package/dist/chunk-HOPTUDCZ.js +90 -0
package/dist/chunk-HOPTUDCZ.js.map +1 -0
package/dist/chunk-ZXKLPC2E.js +27 -0
package/dist/chunk-ZXKLPC2E.js.map +1 -0
package/dist/config-FIQWQRUB.js +11 -0
package/dist/config-FIQWQRUB.js.map +1 -0
package/dist/create-prompt-UBC537BJ.js +22 -0
package/dist/create-prompt-UBC537BJ.js.map +1 -0
package/dist/discover-schema-3T52ORPB.js +446 -0
package/dist/discover-schema-3T52ORPB.js.map +1 -0
package/dist/get-analytics-3IFTN6MY.js +55 -0
package/dist/get-analytics-3IFTN6MY.js.map +1 -0
package/dist/get-prompt-2ZB5B3QC.js +48 -0
package/dist/get-prompt-2ZB5B3QC.js.map +1 -0
package/dist/get-trace-7IXKKCJJ.js +50 -0
package/dist/get-trace-7IXKKCJJ.js.map +1 -0
package/dist/index.d.ts +2 -0
package/dist/index.js +20003 -0
package/dist/index.js.map +1 -0
package/dist/list-prompts-J72LTP7Z.js +33 -0
package/dist/list-prompts-J72LTP7Z.js.map +1 -0
package/dist/search-traces-RW2NDHN5.js +72 -0
package/dist/search-traces-RW2NDHN5.js.map +1 -0
package/dist/update-prompt-G6HHZSUM.js +31 -0
package/dist/update-prompt-G6HHZSUM.js.map +1 -0
package/package.json +8 -8
package/src/__tests__/config.unit.test.ts +89 -0
package/src/__tests__/date-parsing.unit.test.ts +78 -0
package/src/__tests__/discover-schema.unit.test.ts +118 -0
package/src/__tests__/integration.integration.test.ts +313 -0
package/src/__tests__/langwatch-api.unit.test.ts +309 -0
package/src/__tests__/schemas.unit.test.ts +85 -0
package/src/__tests__/tools.unit.test.ts +729 -0
package/src/config.ts +31 -0
package/src/index.ts +254 -0
package/src/langwatch-api.ts +265 -0
package/src/schemas/analytics-groups.ts +78 -0
package/src/schemas/analytics-metrics.ts +179 -0
package/src/schemas/filter-fields.ts +119 -0
package/src/schemas/index.ts +3 -0
package/src/tools/create-prompt.ts +29 -0
package/src/tools/discover-schema.ts +106 -0
package/src/tools/get-analytics.ts +71 -0
package/src/tools/get-prompt.ts +56 -0
package/src/tools/get-trace.ts +61 -0
package/src/tools/list-prompts.ts +35 -0
package/src/tools/search-traces.ts +91 -0
package/src/tools/update-prompt.ts +44 -0
package/src/utils/date-parsing.ts +31 -0
package/tests/evaluations.ipynb +634 -634
package/tests/scenario-openai.test.ts +3 -1

package/src/schemas/analytics-metrics.ts ADDED Viewed

@@ -0,0 +1,179 @@
+/**
+ * Shape of one entry in the `analyticsMetrics` catalog.
+ */
+export interface MetricInfo {
+  /** Category the metric is listed under (e.g. "metadata", "performance"). */
+  category: string;
+  /** Metric identifier within its category. */
+  name: string;
+  /** Human-readable display label. */
+  label: string;
+  /** Names of the aggregation types listed as usable with this metric. */
+  allowedAggregations: string[];
+  /** Short explanation of what the metric measures. */
+  description: string;
+}
+// Aggregations listed for every purely numeric metric.
+const numericAggregations = ["avg", "sum", "min", "max", "median", "p99", "p95", "p90"];
+// Aggregation listed for metrics that only count distinct values.
+const countOnly = ["cardinality"];
+// Metric definitions keyed by category. Key insertion order and the order of
+// entries inside each category determine the order of the exported catalog.
+// The shared aggregation lists are spread so each metric owns its own array.
+const metricsByCategory: Record<string, Array<Omit<MetricInfo, "category">>> = {
+  metadata: [
+    {
+      name: "trace_id",
+      label: "Traces",
+      allowedAggregations: [...countOnly],
+      description: "Count of unique traces",
+    },
+    {
+      name: "user_id",
+      label: "Users",
+      allowedAggregations: [...countOnly],
+      description: "Count of unique users",
+    },
+    {
+      name: "thread_id",
+      label: "Threads",
+      allowedAggregations: [...countOnly],
+      description: "Count of unique conversation threads",
+    },
+    {
+      name: "span_type",
+      label: "Span Type",
+      allowedAggregations: [...countOnly],
+      description: "Count of spans, optionally filtered by span type",
+    },
+  ],
+  sentiment: [
+    {
+      name: "input_sentiment",
+      label: "Input Sentiment Score",
+      allowedAggregations: [...numericAggregations],
+      description: "Sentiment analysis score of inputs",
+    },
+    {
+      name: "thumbs_up_down",
+      label: "Thumbs Up/Down Score",
+      allowedAggregations: ["terms", "cardinality", ...numericAggregations],
+      description: "User feedback score (-1 to 1)",
+    },
+  ],
+  performance: [
+    {
+      name: "completion_time",
+      label: "Completion Time",
+      allowedAggregations: [...numericAggregations],
+      description: "Time to complete the trace (ms)",
+    },
+    {
+      name: "first_token",
+      label: "Time to First Token",
+      allowedAggregations: [...numericAggregations],
+      description: "Time to first token (ms)",
+    },
+    {
+      name: "total_cost",
+      label: "Total Cost",
+      allowedAggregations: [...numericAggregations],
+      description: "Cost per trace in USD",
+    },
+    {
+      name: "prompt_tokens",
+      label: "Prompt Tokens",
+      allowedAggregations: [...numericAggregations],
+      description: "Input token count",
+    },
+    {
+      name: "completion_tokens",
+      label: "Completion Tokens",
+      allowedAggregations: [...numericAggregations],
+      description: "Output token count",
+    },
+    {
+      name: "total_tokens",
+      label: "Total Tokens",
+      allowedAggregations: [...numericAggregations],
+      description: "Total token count (input + output)",
+    },
+    {
+      name: "tokens_per_second",
+      label: "Tokens per Second",
+      allowedAggregations: [...numericAggregations],
+      description: "Token generation speed",
+    },
+  ],
+  events: [
+    {
+      name: "event_type",
+      label: "Event Type",
+      allowedAggregations: [...countOnly],
+      description: "Count of events, optionally filtered by event type",
+    },
+    {
+      name: "event_score",
+      label: "Event Score",
+      allowedAggregations: ["terms", ...numericAggregations],
+      description: "Numeric score from events (requires event_type key and metrics key)",
+    },
+    {
+      name: "event_details",
+      label: "Event Details",
+      allowedAggregations: [...countOnly],
+      description:
+        "Event detail key/value distribution (requires event_type key and details key)",
+    },
+  ],
+  evaluations: [
+    {
+      name: "evaluation_score",
+      label: "Evaluation Score",
+      allowedAggregations: [...numericAggregations],
+      description: "Numeric evaluation score (requires evaluator_id key)",
+    },
+    {
+      name: "evaluation_pass_rate",
+      label: "Evaluation Pass Rate",
+      allowedAggregations: [...numericAggregations],
+      description:
+        "Percentage of traces passing evaluation (requires evaluator_id key)",
+    },
+    {
+      name: "evaluation_runs",
+      label: "Evaluation Runs",
+      allowedAggregations: [...countOnly],
+      description: "Count of evaluation executions",
+    },
+  ],
+  threads: [
+    {
+      name: "average_duration_per_thread",
+      label: "Thread Duration",
+      allowedAggregations: ["avg"],
+      description: "Average duration of conversation threads (ms)",
+    },
+  ],
+};
+/**
+ * Flat catalog of analytics metrics, each tagged with its category and the
+ * aggregations listed for it. Entries appear category by category in the
+ * order they are declared above.
+ */
+export const analyticsMetrics: MetricInfo[] = Object.entries(
+  metricsByCategory
+).flatMap(([category, metrics]) =>
+  metrics.map((metric) => ({ category, ...metric }))
+);

package/src/schemas/filter-fields.ts ADDED Viewed

@@ -0,0 +1,119 @@
+/**
+ * Shape of one entry in the `filterFields` catalog.
+ */
+export interface FilterFieldInfo {
+  /** Dotted field name (e.g. "metadata.user_id"). */
+  field: string;
+  /** Short explanation of what the field holds. */
+  description: string;
+  /** Optional sample value for the field. */
+  example?: string;
+}
+// Builds one catalog entry. The `example` key is left off entirely when no
+// example is supplied, rather than being present with an undefined value.
+function filterField(
+  field: string,
+  description: string,
+  example?: string
+): FilterFieldInfo {
+  return example === undefined
+    ? { field, description }
+    : { field, description, example };
+}
+/**
+ * Catalog of the filter fields, each with a description and, where one is
+ * given, a sample value.
+ */
+export const filterFields: FilterFieldInfo[] = [
+  // topics
+  filterField("topics.topics", "Main topic classification of the trace", "billing"),
+  filterField("topics.subtopics", "Subtopic classification", "refund-request"),
+  // metadata
+  filterField("metadata.user_id", "User ID from trace metadata", "user-123"),
+  filterField("metadata.thread_id", "Conversation thread ID", "thread-456"),
+  filterField("metadata.customer_id", "Customer/organization ID", "customer-789"),
+  filterField("metadata.labels", "Custom labels attached to traces", "production"),
+  filterField("metadata.key", "Custom metadata key", "environment"),
+  filterField(
+    "metadata.value",
+    "Custom metadata value (used with metadata.key)",
+    "staging"
+  ),
+  filterField("metadata.prompt_ids", "Prompt IDs used in the trace"),
+  // traces and spans
+  filterField("traces.error", "Whether the trace has errors", "true"),
+  filterField("spans.type", "Span type (llm, tool, agent, chain, rag)", "llm"),
+  filterField("spans.model", "LLM model name used in spans", "gpt-4o"),
+  // evaluations
+  filterField("evaluations.evaluator_id", "Evaluator that ran on the trace"),
+  filterField(
+    "evaluations.evaluator_id.guardrails_only",
+    "Evaluator ID filtered to guardrails only"
+  ),
+  filterField("evaluations.passed", "Whether evaluations passed", "true"),
+  filterField("evaluations.score", "Evaluation score (numeric)"),
+  filterField("evaluations.state", "Evaluation state (processed, error, skipped)"),
+  filterField("evaluations.label", "Evaluation label result"),
+  // events
+  filterField(
+    "events.event_type",
+    "Type of event (thumbs_up_down, custom)",
+    "thumbs_up_down"
+  ),
+  filterField("events.metrics.key", "Event metric key"),
+  filterField("events.metrics.value", "Event metric value (numeric)"),
+  filterField("events.event_details.key", "Event detail key"),
+  // annotations and sentiment
+  filterField("annotations.hasAnnotation", "Whether trace has human annotations", "true"),
+  filterField("sentiment.input_sentiment", "Detected sentiment of input", "positive"),
+];

package/src/schemas/index.ts ADDED Viewed

@@ -0,0 +1,3 @@
+// Barrel module for the schema catalogs: re-exports each data table together
+// with the type that describes its entries.
+export { filterFields, type FilterFieldInfo } from "./filter-fields.js";
+export { analyticsMetrics, type MetricInfo } from "./analytics-metrics.js";
+export { analyticsGroups, type GroupByInfo } from "./analytics-groups.js";

package/src/tools/create-prompt.ts ADDED Viewed

@@ -0,0 +1,29 @@
+import { createPrompt as apiCreatePrompt } from "../langwatch-api.js";
+/**
+ * MCP tool handler for `create_prompt`.
+ *
+ * Forwards the parameters unchanged to the LangWatch API and renders a short
+ * markdown confirmation. ID, handle and version lines are emitted only when
+ * the API response carries them; the name falls back to the requested name.
+ *
+ * @param params - Prompt definition passed through to the API.
+ * @returns Newline-joined markdown summary of the created prompt.
+ */
+export async function handleCreatePrompt(params: {
+  name: string;
+  handle?: string;
+  messages: Array<{ role: string; content: string }>;
+  model: string;
+  modelProvider: string;
+  description?: string;
+}): Promise<string> {
+  const created = await apiCreatePrompt(params);
+  const summary: string[] = ["Prompt created successfully!\n"];
+  if (created.id) {
+    summary.push(`**ID**: ${created.id}`);
+  }
+  if (created.handle) {
+    summary.push(`**Handle**: ${created.handle}`);
+  }
+  summary.push(
+    `**Name**: ${created.name || params.name}`,
+    `**Model**: ${params.model} (${params.modelProvider})`
+  );
+  // `!= null` keeps a version number of 0 while skipping null/undefined.
+  if (created.latestVersionNumber != null) {
+    summary.push(`**Version**: v${created.latestVersionNumber}`);
+  }
+  return summary.join("\n");
+}

package/src/tools/discover-schema.ts ADDED Viewed

@@ -0,0 +1,106 @@
+import { filterFields } from "../schemas/filter-fields.js";
+import { analyticsMetrics } from "../schemas/analytics-metrics.js";
+import { analyticsGroups } from "../schemas/analytics-groups.js";
+/**
+ * Selects which section of the schema documentation `formatSchema` renders;
+ * "all" renders every section.
+ */
+export type Category =
+  | "filters"
+  | "metrics"
+  | "aggregations"
+  | "groups"
+  | "all";
+/**
+ * Renders the LangWatch analytics schema as human-readable markdown.
+ *
+ * @param category - Section to render, or "all" for every section.
+ * @returns The selected sections in the fixed order filters, metrics,
+ *   aggregations, groups, separated by a blank line.
+ */
+export function formatSchema(category: Category): string {
+  // Ordered dispatch table; the array order is the output order.
+  const renderers: Array<[Exclude<Category, "all">, () => string]> = [
+    ["filters", formatFilters],
+    ["metrics", formatMetrics],
+    ["aggregations", formatAggregations],
+    ["groups", formatGroups],
+  ];
+  const rendered: string[] = [];
+  for (const [key, render] of renderers) {
+    if (category === "all" || category === key) {
+      rendered.push(render());
+    }
+  }
+  return rendered.join("\n\n");
+}
+// Renders the filter-field catalog as a markdown bullet list, appending the
+// sample value in backticks for fields that define one.
+function formatFilters(): string {
+  const out: string[] = [
+    "## Available Filter Fields",
+    "",
+    "Use these in the `filters` parameter of `search_traces` and `get_analytics`.",
+    'Format: `{ "field_name": ["value1", "value2"] }`',
+    "",
+  ];
+  for (const entry of filterFields) {
+    let bullet = `- **${entry.field}**: ${entry.description}`;
+    if (entry.example) {
+      bullet += ` (e.g., \`${entry.example}\`)`;
+    }
+    out.push(bullet);
+  }
+  return out.join("\n");
+}
+// Renders the metric catalog as markdown, one `###` section per category in
+// order of first appearance, with each metric's aggregations on a second line.
+function formatMetrics(): string {
+  // Bucket metrics by category; a Map keeps first-insertion order.
+  const grouped = new Map<string, typeof analyticsMetrics>();
+  for (const metric of analyticsMetrics) {
+    const bucket = grouped.get(metric.category);
+    if (bucket) {
+      bucket.push(metric);
+    } else {
+      grouped.set(metric.category, [metric]);
+    }
+  }
+  const out: string[] = [
+    "## Available Metrics",
+    "",
+    "Use these in `get_analytics` as `metric` parameter in `category.name` format.",
+    "",
+  ];
+  for (const [categoryName, entries] of grouped) {
+    out.push(`### ${categoryName}`);
+    for (const entry of entries) {
+      out.push(
+        `- **${categoryName}.${entry.name}** (${entry.label}): ${entry.description}`,
+        `  Aggregations: ${entry.allowedAggregations.join(", ")}`
+      );
+    }
+    // Blank separator after every category, including the last one.
+    out.push("");
+  }
+  return out.join("\n");
+}
+// Renders the fixed list of aggregation types as a markdown bullet list,
+// followed by a note that not every metric accepts every aggregation.
+function formatAggregations(): string {
+  const heading = "## Available Aggregation Types";
+  const bullets = [
+    "- **cardinality**: Count unique values",
+    "- **terms**: Distribution/breakdown of values",
+    "- **avg**: Average",
+    "- **sum**: Sum total",
+    "- **min**: Minimum",
+    "- **max**: Maximum",
+    "- **median**: 50th percentile",
+    "- **p90**: 90th percentile",
+    "- **p95**: 95th percentile",
+    "- **p99**: 99th percentile",
+  ];
+  const note =
+    "Note: Not all aggregations are available for all metrics. Check the metric's allowed aggregations.";
+  return [heading, "", ...bullets, "", note].join("\n");
+}
+// Renders the group-by catalog as a markdown bullet list.
+function formatGroups(): string {
+  const out: string[] = [
+    "## Available Group-By Options",
+    "",
+    "Use these in the `groupBy` parameter of `get_analytics`.",
+    "",
+  ];
+  for (const group of analyticsGroups) {
+    out.push(`- **${group.name}** (${group.label}): ${group.description}`);
+  }
+  return out.join("\n");
+}

package/src/tools/get-analytics.ts ADDED Viewed

@@ -0,0 +1,71 @@
+import { getAnalyticsTimeseries as apiGetAnalytics } from "../langwatch-api.js";
+import { parseRelativeDate } from "../utils/date-parsing.js";
+/**
+ * MCP tool handler for `get_analytics`.
+ *
+ * Requests a single-series analytics timeseries and renders the current
+ * period as a markdown table of date/value rows.
+ *
+ * Defaults: the window is the last seven days ending now, the aggregation is
+ * "avg" and the time zone is "UTC". A metric without a dot is placed in the
+ * "metadata" category; a dotted metric keeps only its first two segments.
+ *
+ * @param params - Metric, optional aggregation, date range, time zone,
+ *   grouping and filters.
+ * @returns Newline-joined markdown report.
+ */
+export async function handleGetAnalytics(params: {
+  metric: string;
+  aggregation?: string;
+  startDate?: string;
+  endDate?: string;
+  timeZone?: string;
+  groupBy?: string;
+  filters?: Record<string, string[]>;
+}): Promise<string> {
+  const requestedAt = Date.now();
+  const startDate = params.startDate
+    ? parseRelativeDate(params.startDate)
+    : requestedAt - 7 * 86400000;
+  const endDate = params.endDate
+    ? parseRelativeDate(params.endDate)
+    : requestedAt;
+  // Normalize to "category.name".
+  const metricKey = params.metric.includes(".")
+    ? params.metric.split(".", 2).join(".")
+    : `metadata.${params.metric}`;
+  const aggregation = params.aggregation ?? "avg";
+  const result = await apiGetAnalytics({
+    series: [{ metric: metricKey, aggregation }],
+    startDate,
+    endDate,
+    timeZone: params.timeZone ?? "UTC",
+    groupBy: params.groupBy,
+    filters: params.filters,
+  });
+  // Only the calendar-day part of each ISO timestamp is shown.
+  const firstDay = new Date(startDate).toISOString().split("T")[0];
+  const lastDay = new Date(endDate).toISOString().split("T")[0];
+  const report: string[] = [
+    `# Analytics: ${metricKey} (${aggregation})\n`,
+    `Period: ${firstDay} to ${lastDay}`,
+  ];
+  if (params.groupBy) {
+    report.push(`Grouped by: ${params.groupBy}`);
+  }
+  report.push("");
+  const buckets = result.currentPeriod ?? [];
+  if (buckets.length > 0) {
+    report.push("| Date | Value |", "|------|-------|");
+    for (const bucket of buckets) {
+      // The value is the first numeric property other than "date".
+      let value: unknown = "N/A";
+      for (const [key, candidate] of Object.entries(bucket)) {
+        if (key !== "date" && typeof candidate === "number") {
+          value = candidate;
+          break;
+        }
+      }
+      report.push(`| ${bucket.date} | ${value} |`);
+    }
+  } else {
+    report.push("No data available for this period.");
+  }
+  report.push(
+    "\n> Tip: Use `discover_schema` to see all available metrics and aggregation types."
+  );
+  return report.join("\n");
+}

package/src/tools/get-prompt.ts ADDED Viewed

@@ -0,0 +1,56 @@
+import { getPrompt as apiGetPrompt } from "../langwatch-api.js";
+/**
+ * MCP tool handler for `get_prompt`.
+ *
+ * Fetches a prompt by ID or handle (optionally at a given version) and
+ * renders it as markdown: identifying fields, model configuration, messages,
+ * and up to ten entries of version history.
+ *
+ * @param params - Prompt ID or handle, plus an optional version number.
+ * @returns Newline-joined markdown description of the prompt.
+ */
+export async function handleGetPrompt(params: {
+  idOrHandle: string;
+  version?: number;
+}): Promise<string> {
+  const prompt = await apiGetPrompt(params.idOrHandle, params.version);
+  const title = prompt.name || prompt.handle || prompt.id;
+  const out: string[] = [`# Prompt: ${title}\n`];
+  if (prompt.handle) {
+    out.push(`**Handle**: ${prompt.handle}`);
+  }
+  if (prompt.id) {
+    out.push(`**ID**: ${prompt.id}`);
+  }
+  if (prompt.description) {
+    out.push(`**Description**: ${prompt.description}`);
+  }
+  if (prompt.latestVersionNumber != null) {
+    out.push(`**Latest Version**: v${prompt.latestVersionNumber}`);
+  }
+  // Model config comes from the first listed version when there is one,
+  // otherwise from the prompt object itself.
+  const current = prompt.versions?.[0] ?? prompt;
+  if (current.model) {
+    out.push(`**Model**: ${current.model}`);
+  }
+  if (current.modelProvider) {
+    out.push(`**Provider**: ${current.modelProvider}`);
+  }
+  const messages = current.messages || prompt.prompt || [];
+  if (Array.isArray(messages) && messages.length > 0) {
+    out.push("\n## Messages");
+    for (const msg of messages) {
+      out.push(`\n### ${msg.role}`, msg.content);
+    }
+  }
+  const history = prompt.versions;
+  if (history && history.length > 0) {
+    const shownCount = 10;
+    out.push("\n## Version History");
+    for (const entry of history.slice(0, shownCount)) {
+      const label = entry.version ?? "?";
+      const message = entry.commitMessage || "No message";
+      out.push(`- **v${label}**: ${message}`);
+    }
+    // Summarize whatever did not fit in the list above.
+    const hiddenCount = history.length - shownCount;
+    if (hiddenCount > 0) {
+      out.push(`... and ${hiddenCount} more versions`);
+    }
+  }
+  return out.join("\n");
+}

package/src/tools/get-trace.ts ADDED Viewed

@@ -0,0 +1,61 @@
+import { getTraceById as apiGetTraceById } from "../langwatch-api.js";
+/**
+ * MCP tool handler for `get_trace`.
+ *
+ * Fetches one trace by ID. With `format: "json"` the API result is returned
+ * as pretty-printed JSON. Otherwise (default "digest") a markdown digest is
+ * built from the timestamps, metadata, evaluations and formatted trace found
+ * on the result.
+ *
+ * @param params - Trace ID and optional output format.
+ * @returns JSON text or a newline-joined markdown digest.
+ */
+export async function handleGetTrace(params: {
+  traceId: string;
+  format?: "digest" | "json";
+}): Promise<string> {
+  const format = params.format ?? "digest";
+  const result = await apiGetTraceById(params.traceId, format);
+  if (format === "json") {
+    return JSON.stringify(result, null, 2);
+  }
+  const out: string[] = [`# Trace: ${params.traceId}\n`];
+  const stamps = result.timestamps;
+  if (stamps) {
+    out.push(`**Started**: ${stamps.started_at}`);
+    if (stamps.updated_at) {
+      out.push(`**Updated**: ${stamps.updated_at}`);
+    }
+  }
+  const meta = result.metadata;
+  if (meta) {
+    if (meta.user_id) {
+      out.push(`**User**: ${meta.user_id}`);
+    }
+    if (meta.thread_id) {
+      out.push(`**Thread**: ${meta.thread_id}`);
+    }
+    if (meta.customer_id) {
+      out.push(`**Customer**: ${meta.customer_id}`);
+    }
+    if (meta.labels?.length) {
+      out.push(`**Labels**: ${meta.labels.join(", ")}`);
+    }
+  }
+  if (result.evaluations && result.evaluations.length > 0) {
+    out.push("\n## Evaluations");
+    for (const evaluation of result.evaluations) {
+      // Only strict booleans count as a verdict; anything else is "N/A".
+      let status = "N/A";
+      if (evaluation.passed === true) {
+        status = "PASSED";
+      } else if (evaluation.passed === false) {
+        status = "FAILED";
+      }
+      const title = evaluation.name || evaluation.evaluator_id;
+      const scorePart =
+        evaluation.score != null ? ` (score: ${evaluation.score})` : "";
+      const labelPart = evaluation.label ? ` [${evaluation.label}]` : "";
+      out.push(`- **${title}**: ${status}${scorePart}${labelPart}`);
+    }
+  }
+  if (result.formatted_trace) {
+    out.push(`\n## Trace Details\n${result.formatted_trace}`);
+  }
+  out.push(
+    '\n> Tip: Use `get_trace` with `format: "json"` to get the full raw trace data.'
+  );
+  return out.join("\n");
+}

package/src/tools/list-prompts.ts ADDED Viewed

@@ -0,0 +1,35 @@
+import { listPrompts as apiListPrompts } from "../langwatch-api.js";
+/**
+ * Prepares text for a single markdown table cell.
+ *
+ * Line breaks (with surrounding whitespace) are collapsed to one space and
+ * pipe characters are backslash-escaped, so the text cannot end the row or
+ * start a new column. Truncation runs before escaping so an escape sequence
+ * is never cut in half.
+ *
+ * @param text - Raw cell text.
+ * @param maxLength - Optional maximum length applied before escaping.
+ * @returns Text safe to place between two `|` delimiters.
+ */
+function toTableCell(text: string, maxLength?: number): string {
+  const singleLine = text.replace(/\s*[\r\n]+\s*/g, " ");
+  const clipped =
+    maxLength === undefined ? singleLine : singleLine.slice(0, maxLength);
+  return clipped.replace(/\|/g, "\\|");
+}
+/**
+ * Handles the list_prompts MCP tool invocation.
+ *
+ * Lists all prompts in the LangWatch project, formatted as an
+ * AI-readable markdown table. Descriptions are limited to 60 characters.
+ * Every cell is sanitized so a pipe or line break in a handle, name or
+ * description cannot corrupt the table layout.
+ *
+ * @returns The markdown table, or a fixed message when the API result is
+ *   not a non-empty array.
+ */
+export async function handleListPrompts(): Promise<string> {
+  const prompts = await apiListPrompts();
+  if (!Array.isArray(prompts) || prompts.length === 0) {
+    return "No prompts found in this project.";
+  }
+  const lines: string[] = [];
+  lines.push(`# Prompts (${prompts.length} total)\n`);
+  lines.push("| Handle | Name | Latest Version | Description |");
+  lines.push("|--------|------|----------------|-------------|");
+  for (const p of prompts) {
+    const handle = toTableCell(p.handle || p.id || "N/A");
+    const name = toTableCell(p.name || "Untitled");
+    const versionNum = p.latestVersionNumber ?? p.version;
+    const version = versionNum != null ? `v${versionNum}` : "N/A";
+    const desc = toTableCell(p.description || "", 60);
+    lines.push(`| ${handle} | ${name} | ${version} | ${desc} |`);
+  }
+  lines.push(
+    "\n> Use `get_prompt` with the handle or ID to see full prompt details."
+  );
+  return lines.join("\n");
+}