@mastra/mcp-docs-server 1.0.0-beta.4 → 1.0.0-beta.6
This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- package/.docs/organized/changelogs/%40internal%2Fstorage-test-utils.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fagent-builder.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fai-sdk.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fastra.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fchroma.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fclickhouse.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fclient-js.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fcloudflare-d1.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fcloudflare.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fconvex.md +29 -0
- package/.docs/organized/changelogs/%40mastra%2Fcore.md +411 -211
- package/.docs/organized/changelogs/%40mastra%2Fcouchbase.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloud.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-cloudflare.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-netlify.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fdeployer-vercel.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fdeployer.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fduckdb.md +42 -0
- package/.docs/organized/changelogs/%40mastra%2Fdynamodb.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Felasticsearch.md +52 -0
- package/.docs/organized/changelogs/%40mastra%2Fevals.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Flance.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Flibsql.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Floggers.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fmcp-docs-server.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fmcp-registry-registry.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fmcp.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fmemory.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fmongodb.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fmssql.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fopensearch.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fpg.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fpinecone.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fplayground-ui.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fqdrant.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Frag.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Freact.md +89 -1
- package/.docs/organized/changelogs/%40mastra%2Fs3vectors.md +9 -0
- package/.docs/organized/changelogs/%40mastra%2Fschema-compat.md +42 -0
- package/.docs/organized/changelogs/%40mastra%2Fserver.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fturbopuffer.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fupstash.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvectorize.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-azure.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-cloudflare.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-deepgram.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-elevenlabs.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-gladia.md +92 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-google-gemini-live.md +67 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-google.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-murf.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-openai-realtime.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-openai.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-playai.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-sarvam.md +201 -1
- package/.docs/organized/changelogs/%40mastra%2Fvoice-speechify.md +201 -1
- package/.docs/organized/changelogs/create-mastra.md +201 -1
- package/.docs/organized/changelogs/mastra.md +201 -1
- package/.docs/organized/code-examples/agui.md +1 -0
- package/.docs/organized/code-examples/ai-sdk-v5.md +1 -0
- package/.docs/organized/code-examples/mcp-server-adapters.md +721 -0
- package/.docs/organized/code-examples/memory-with-processors.md +1 -1
- package/.docs/organized/code-examples/quick-start.md +1 -1
- package/.docs/organized/code-examples/server-app-access.md +342 -0
- package/.docs/raw/agents/adding-voice.mdx +7 -10
- package/.docs/raw/agents/agent-approval.mdx +189 -0
- package/.docs/raw/agents/guardrails.mdx +26 -23
- package/.docs/raw/agents/networks.mdx +2 -2
- package/.docs/raw/agents/overview.mdx +27 -62
- package/.docs/raw/agents/processors.mdx +279 -0
- package/.docs/raw/agents/using-tools.mdx +4 -5
- package/.docs/raw/course/01-first-agent/05-running-playground.md +5 -5
- package/.docs/raw/course/01-first-agent/09-testing-your-agent.md +3 -3
- package/.docs/raw/course/01-first-agent/13-testing-your-tool.md +3 -3
- package/.docs/raw/course/01-first-agent/17-testing-memory.md +2 -2
- package/.docs/raw/course/04-workflows/07-using-playground.md +1 -1
- package/.docs/raw/deployment/building-mastra.mdx +1 -1
- package/.docs/raw/deployment/cloud-providers/amazon-ec2.mdx +1 -1
- package/.docs/raw/deployment/cloud-providers/aws-lambda.mdx +1 -1
- package/.docs/raw/deployment/cloud-providers/azure-app-services.mdx +1 -1
- package/.docs/raw/deployment/cloud-providers/digital-ocean.mdx +1 -1
- package/.docs/raw/deployment/cloud-providers/index.mdx +20 -27
- package/.docs/raw/deployment/cloud-providers/netlify-deployer.mdx +44 -13
- package/.docs/raw/deployment/mastra-cloud/observability.mdx +19 -17
- package/.docs/raw/deployment/mastra-cloud/setting-up.mdx +1 -1
- package/.docs/raw/deployment/overview.mdx +2 -2
- package/.docs/raw/deployment/web-framework.mdx +5 -5
- package/.docs/raw/evals/custom-scorers.mdx +3 -5
- package/.docs/raw/evals/overview.mdx +2 -3
- package/.docs/raw/evals/running-in-ci.mdx +0 -2
- package/.docs/raw/{guides/guide → getting-started}/manual-install.mdx +2 -2
- package/.docs/raw/getting-started/project-structure.mdx +1 -1
- package/.docs/raw/getting-started/start.mdx +72 -0
- package/.docs/raw/getting-started/studio.mdx +1 -1
- package/.docs/raw/{frameworks/agentic-uis/ai-sdk.mdx → guides/build-your-ui/ai-sdk-ui.mdx} +113 -11
- package/.docs/raw/{frameworks/web-frameworks → guides/getting-started}/astro.mdx +23 -25
- package/.docs/raw/{frameworks/servers → guides/getting-started}/express.mdx +3 -4
- package/.docs/raw/guides/{quickstarts/nextjs.mdx → getting-started/next-js.mdx} +11 -11
- package/.docs/raw/guides/{quickstarts/standalone-server.mdx → getting-started/quickstart.mdx} +7 -7
- package/.docs/raw/{frameworks/web-frameworks → guides/getting-started}/sveltekit.mdx +23 -25
- package/.docs/raw/{frameworks/web-frameworks → guides/getting-started}/vite-react.mdx +7 -7
- package/.docs/raw/guides/guide/ai-recruiter.mdx +2 -3
- package/.docs/raw/guides/guide/chef-michel.mdx +2 -3
- package/.docs/raw/guides/guide/notes-mcp-server.mdx +2 -2
- package/.docs/raw/guides/guide/research-assistant.mdx +7 -8
- package/.docs/raw/guides/guide/stock-agent.mdx +4 -6
- package/.docs/raw/guides/guide/web-search.mdx +12 -10
- package/.docs/raw/guides/guide/whatsapp-chat-bot.mdx +421 -0
- package/.docs/raw/guides/index.mdx +3 -35
- package/.docs/raw/guides/migrations/agentnetwork.mdx +4 -4
- package/.docs/raw/guides/migrations/ai-sdk-v4-to-v5.mdx +1 -1
- package/.docs/raw/guides/migrations/upgrade-to-v1/agent.mdx +40 -0
- package/.docs/raw/guides/migrations/upgrade-to-v1/tools.mdx +5 -0
- package/.docs/raw/guides/migrations/upgrade-to-v1/workflows.mdx +51 -0
- package/.docs/raw/guides/migrations/vnext-to-standard-apis.mdx +2 -2
- package/.docs/raw/index.mdx +2 -2
- package/.docs/raw/mcp/overview.mdx +3 -5
- package/.docs/raw/memory/memory-processors.mdx +264 -79
- package/.docs/raw/memory/semantic-recall.mdx +7 -7
- package/.docs/raw/memory/storage/memory-with-libsql.mdx +2 -4
- package/.docs/raw/memory/storage/memory-with-mongodb.mdx +2 -4
- package/.docs/raw/memory/storage/memory-with-pg.mdx +2 -4
- package/.docs/raw/memory/storage/memory-with-upstash.mdx +2 -4
- package/.docs/raw/memory/threads-and-resources.mdx +3 -3
- package/.docs/raw/memory/working-memory.mdx +14 -7
- package/.docs/raw/{logging.mdx → observability/logging.mdx} +1 -1
- package/.docs/raw/observability/overview.mdx +2 -3
- package/.docs/raw/observability/tracing/bridges/otel.mdx +176 -0
- package/.docs/raw/observability/tracing/exporters/arize.mdx +17 -0
- package/.docs/raw/observability/tracing/exporters/braintrust.mdx +19 -0
- package/.docs/raw/observability/tracing/exporters/langfuse.mdx +20 -0
- package/.docs/raw/observability/tracing/exporters/langsmith.mdx +12 -0
- package/.docs/raw/observability/tracing/exporters/otel.mdx +25 -5
- package/.docs/raw/observability/tracing/exporters/posthog.mdx +107 -0
- package/.docs/raw/observability/tracing/overview.mdx +74 -8
- package/.docs/raw/observability/tracing/processors/sensitive-data-filter.mdx +0 -1
- package/.docs/raw/rag/chunking-and-embedding.mdx +16 -17
- package/.docs/raw/rag/overview.mdx +3 -2
- package/.docs/raw/rag/retrieval.mdx +43 -38
- package/.docs/raw/rag/vector-databases.mdx +93 -2
- package/.docs/raw/reference/agents/agent.mdx +7 -10
- package/.docs/raw/reference/agents/generate.mdx +55 -6
- package/.docs/raw/reference/agents/generateLegacy.mdx +2 -2
- package/.docs/raw/reference/agents/getLLM.mdx +1 -1
- package/.docs/raw/reference/agents/network.mdx +46 -3
- package/.docs/raw/reference/cli/mastra.mdx +2 -1
- package/.docs/raw/reference/client-js/agents.mdx +3 -3
- package/.docs/raw/reference/client-js/memory.mdx +43 -0
- package/.docs/raw/reference/client-js/workflows.mdx +92 -63
- package/.docs/raw/reference/core/getLogger.mdx +1 -1
- package/.docs/raw/reference/core/listLogs.mdx +1 -1
- package/.docs/raw/reference/core/listLogsByRunId.mdx +1 -1
- package/.docs/raw/reference/core/mastra-model-gateway.mdx +5 -19
- package/.docs/raw/reference/core/setLogger.mdx +1 -1
- package/.docs/raw/reference/core/setTelemetry.mdx +1 -1
- package/.docs/raw/reference/deployer/netlify.mdx +1 -2
- package/.docs/raw/reference/evals/answer-relevancy.mdx +28 -98
- package/.docs/raw/reference/evals/answer-similarity.mdx +12 -258
- package/.docs/raw/reference/evals/bias.mdx +29 -87
- package/.docs/raw/reference/evals/completeness.mdx +31 -90
- package/.docs/raw/reference/evals/content-similarity.mdx +28 -88
- package/.docs/raw/reference/evals/context-precision.mdx +28 -130
- package/.docs/raw/reference/evals/context-relevance.mdx +11 -11
- package/.docs/raw/reference/evals/faithfulness.mdx +28 -101
- package/.docs/raw/reference/evals/hallucination.mdx +28 -103
- package/.docs/raw/reference/evals/keyword-coverage.mdx +28 -107
- package/.docs/raw/reference/evals/noise-sensitivity.mdx +11 -11
- package/.docs/raw/reference/evals/prompt-alignment.mdx +15 -15
- package/.docs/raw/reference/evals/scorer-utils.mdx +362 -0
- package/.docs/raw/reference/evals/textual-difference.mdx +27 -100
- package/.docs/raw/reference/evals/tone-consistency.mdx +25 -98
- package/.docs/raw/reference/evals/tool-call-accuracy.mdx +7 -7
- package/.docs/raw/reference/evals/toxicity.mdx +29 -92
- package/.docs/raw/reference/index.mdx +1 -0
- package/.docs/raw/reference/memory/memory-class.mdx +5 -7
- package/.docs/raw/reference/observability/tracing/bridges/otel.mdx +150 -0
- package/.docs/raw/reference/observability/tracing/configuration.mdx +0 -4
- package/.docs/raw/reference/observability/tracing/exporters/arize.mdx +4 -0
- package/.docs/raw/reference/observability/tracing/exporters/langsmith.mdx +17 -1
- package/.docs/raw/reference/observability/tracing/exporters/otel.mdx +6 -0
- package/.docs/raw/reference/observability/tracing/exporters/posthog.mdx +132 -0
- package/.docs/raw/reference/observability/tracing/instances.mdx +0 -4
- package/.docs/raw/reference/observability/tracing/interfaces.mdx +29 -4
- package/.docs/raw/reference/observability/tracing/spans.mdx +0 -4
- package/.docs/raw/reference/processors/batch-parts-processor.mdx +1 -1
- package/.docs/raw/reference/processors/language-detector.mdx +10 -3
- package/.docs/raw/reference/processors/message-history-processor.mdx +131 -0
- package/.docs/raw/reference/processors/moderation-processor.mdx +12 -5
- package/.docs/raw/reference/processors/pii-detector.mdx +12 -5
- package/.docs/raw/reference/processors/processor-interface.mdx +502 -0
- package/.docs/raw/reference/processors/prompt-injection-detector.mdx +10 -3
- package/.docs/raw/reference/processors/semantic-recall-processor.mdx +197 -0
- package/.docs/raw/reference/processors/system-prompt-scrubber.mdx +3 -4
- package/.docs/raw/reference/processors/token-limiter-processor.mdx +2 -2
- package/.docs/raw/reference/processors/tool-call-filter.mdx +125 -0
- package/.docs/raw/reference/processors/unicode-normalizer.mdx +1 -1
- package/.docs/raw/reference/processors/working-memory-processor.mdx +221 -0
- package/.docs/raw/reference/rag/embeddings.mdx +5 -5
- package/.docs/raw/reference/rag/rerank.mdx +1 -2
- package/.docs/raw/reference/rag/rerankWithScorer.mdx +0 -1
- package/.docs/raw/reference/storage/cloudflare-d1.mdx +37 -0
- package/.docs/raw/reference/storage/convex.mdx +164 -0
- package/.docs/raw/reference/storage/lance.mdx +33 -0
- package/.docs/raw/reference/storage/libsql.mdx +37 -0
- package/.docs/raw/reference/storage/mongodb.mdx +39 -0
- package/.docs/raw/reference/storage/mssql.mdx +37 -0
- package/.docs/raw/reference/storage/postgresql.mdx +37 -0
- package/.docs/raw/reference/streaming/ChunkType.mdx +1 -1
- package/.docs/raw/reference/streaming/agents/stream.mdx +64 -2
- package/.docs/raw/reference/streaming/workflows/observeStream.mdx +7 -9
- package/.docs/raw/reference/streaming/workflows/{resumeStreamVNext.mdx → resumeStream.mdx} +51 -11
- package/.docs/raw/reference/streaming/workflows/stream.mdx +83 -24
- package/.docs/raw/reference/templates/overview.mdx +1 -4
- package/.docs/raw/reference/tools/client.mdx +1 -2
- package/.docs/raw/reference/tools/create-tool.mdx +132 -0
- package/.docs/raw/reference/tools/graph-rag-tool.mdx +5 -5
- package/.docs/raw/reference/tools/mcp-client.mdx +76 -21
- package/.docs/raw/reference/tools/mcp-server.mdx +1 -2
- package/.docs/raw/reference/tools/vector-query-tool.mdx +14 -15
- package/.docs/raw/reference/vectors/chroma.mdx +81 -1
- package/.docs/raw/reference/vectors/convex.mdx +429 -0
- package/.docs/raw/reference/vectors/couchbase.mdx +24 -17
- package/.docs/raw/reference/vectors/duckdb.mdx +462 -0
- package/.docs/raw/reference/vectors/elasticsearch.mdx +310 -0
- package/.docs/raw/reference/vectors/lance.mdx +38 -22
- package/.docs/raw/reference/vectors/libsql.mdx +35 -2
- package/.docs/raw/reference/vectors/mongodb.mdx +35 -2
- package/.docs/raw/reference/vectors/opensearch.mdx +37 -16
- package/.docs/raw/reference/vectors/pg.mdx +43 -36
- package/.docs/raw/reference/vectors/pinecone.mdx +48 -1
- package/.docs/raw/reference/vectors/qdrant.mdx +36 -1
- package/.docs/raw/reference/vectors/turbopuffer.mdx +74 -0
- package/.docs/raw/reference/voice/google.mdx +159 -20
- package/.docs/raw/reference/voice/openai-realtime.mdx +2 -2
- package/.docs/raw/reference/voice/voice.addInstructions.mdx +2 -3
- package/.docs/raw/reference/voice/voice.addTools.mdx +1 -1
- package/.docs/raw/reference/voice/voice.answer.mdx +1 -1
- package/.docs/raw/reference/voice/voice.close.mdx +1 -1
- package/.docs/raw/reference/voice/voice.connect.mdx +1 -1
- package/.docs/raw/reference/voice/voice.off.mdx +1 -1
- package/.docs/raw/reference/voice/voice.on.mdx +1 -1
- package/.docs/raw/reference/voice/voice.send.mdx +1 -1
- package/.docs/raw/reference/voice/voice.updateConfig.mdx +1 -1
- package/.docs/raw/reference/workflows/run-methods/restart.mdx +142 -0
- package/.docs/raw/reference/workflows/run-methods/resume.mdx +44 -0
- package/.docs/raw/reference/workflows/run-methods/start.mdx +44 -0
- package/.docs/raw/reference/workflows/run.mdx +13 -5
- package/.docs/raw/reference/workflows/step.mdx +13 -0
- package/.docs/raw/reference/workflows/workflow.mdx +19 -0
- package/.docs/raw/server-db/mastra-client.mdx +1 -2
- package/.docs/raw/server-db/mastra-server.mdx +30 -1
- package/.docs/raw/server-db/request-context.mdx +0 -1
- package/.docs/raw/server-db/storage.mdx +11 -0
- package/.docs/raw/streaming/overview.mdx +26 -15
- package/.docs/raw/streaming/tool-streaming.mdx +48 -5
- package/.docs/raw/streaming/workflow-streaming.mdx +5 -11
- package/.docs/raw/tools-mcp/advanced-usage.mdx +1 -2
- package/.docs/raw/tools-mcp/mcp-overview.mdx +3 -5
- package/.docs/raw/voice/overview.mdx +21 -41
- package/.docs/raw/voice/speech-to-speech.mdx +4 -4
- package/.docs/raw/voice/speech-to-text.mdx +1 -2
- package/.docs/raw/voice/text-to-speech.mdx +1 -2
- package/.docs/raw/workflows/control-flow.mdx +180 -0
- package/.docs/raw/workflows/error-handling.mdx +1 -0
- package/.docs/raw/workflows/human-in-the-loop.mdx +4 -4
- package/.docs/raw/workflows/overview.mdx +56 -44
- package/.docs/raw/workflows/snapshots.mdx +1 -0
- package/.docs/raw/workflows/suspend-and-resume.mdx +85 -16
- package/.docs/raw/workflows/time-travel.mdx +313 -0
- package/.docs/raw/workflows/workflow-state.mdx +191 -0
- package/CHANGELOG.md +18 -0
- package/dist/{chunk-5NJC7NRO.js → chunk-4CM2BQNP.js} +24 -4
- package/dist/prepare-docs/package-changes.d.ts.map +1 -1
- package/dist/prepare-docs/prepare.js +1 -1
- package/dist/stdio.js +1 -1
- package/package.json +7 -7
- package/.docs/raw/agents/human-in-the-loop-with-tools.mdx +0 -90
- package/.docs/raw/frameworks/agentic-uis/cedar-os.mdx +0 -102
- package/.docs/raw/frameworks/agentic-uis/openrouter.mdx +0 -179
- package/.docs/raw/frameworks/web-frameworks/next-js.mdx +0 -379
- package/.docs/raw/getting-started/quickstart.mdx +0 -27
- package/.docs/raw/getting-started/templates.mdx +0 -73
- package/.docs/raw/reference/streaming/workflows/observeStreamVNext.mdx +0 -47
- package/.docs/raw/reference/streaming/workflows/streamVNext.mdx +0 -153
- /package/.docs/raw/{frameworks/agentic-uis → guides/build-your-ui}/assistant-ui.mdx +0 -0
- /package/.docs/raw/{frameworks/agentic-uis → guides/build-your-ui}/copilotkit.mdx +0 -0
|
@@ -0,0 +1,362 @@
|
|
|
1
|
+
---
|
|
2
|
+
title: "Reference: Scorer Utils | Evals"
|
|
3
|
+
description: Utility functions for extracting data from scorer run inputs and outputs, including text content, reasoning, system messages, and tool calls.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Scorer Utils
|
|
7
|
+
|
|
8
|
+
Mastra provides utility functions to help extract and process data from scorer run inputs and outputs. These utilities are particularly useful in the `preprocess` step of custom scorers.
|
|
9
|
+
|
|
10
|
+
## Import
|
|
11
|
+
|
|
12
|
+
```typescript
|
|
13
|
+
import {
|
|
14
|
+
getAssistantMessageFromRunOutput,
|
|
15
|
+
getReasoningFromRunOutput,
|
|
16
|
+
getUserMessageFromRunInput,
|
|
17
|
+
getSystemMessagesFromRunInput,
|
|
18
|
+
getCombinedSystemPrompt,
|
|
19
|
+
extractToolCalls,
|
|
20
|
+
extractInputMessages,
|
|
21
|
+
extractAgentResponseMessages,
|
|
22
|
+
} from "@mastra/evals/scorers/utils";
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Message Extraction
|
|
26
|
+
|
|
27
|
+
### getAssistantMessageFromRunOutput
|
|
28
|
+
|
|
29
|
+
Extracts the text content from the first assistant message in the run output.
|
|
30
|
+
|
|
31
|
+
```typescript
|
|
32
|
+
const scorer = createScorer({
|
|
33
|
+
id: "my-scorer",
|
|
34
|
+
description: "My scorer",
|
|
35
|
+
type: "agent",
|
|
36
|
+
})
|
|
37
|
+
.preprocess(({ run }) => {
|
|
38
|
+
const response = getAssistantMessageFromRunOutput(run.output);
|
|
39
|
+
return { response };
|
|
40
|
+
})
|
|
41
|
+
.generateScore(({ results }) => {
|
|
42
|
+
return results.preprocessStepResult?.response ? 1 : 0;
|
|
43
|
+
});
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
<PropertiesTable
|
|
47
|
+
content={[
|
|
48
|
+
{
|
|
49
|
+
name: "output",
|
|
50
|
+
type: "ScorerRunOutputForAgent",
|
|
51
|
+
isOptional: true,
|
|
52
|
+
description: "The scorer run output (array of MastraDBMessage)",
|
|
53
|
+
},
|
|
54
|
+
]}
|
|
55
|
+
/>
|
|
56
|
+
|
|
57
|
+
**Returns:** `string | undefined` - The assistant message text, or `undefined` if no assistant message is found.
|
|
58
|
+
|
|
59
|
+
### getUserMessageFromRunInput
|
|
60
|
+
|
|
61
|
+
Extracts the text content from the first user message in the run input.
|
|
62
|
+
|
|
63
|
+
```typescript
|
|
64
|
+
.preprocess(({ run }) => {
|
|
65
|
+
const userMessage = getUserMessageFromRunInput(run.input);
|
|
66
|
+
return { userMessage };
|
|
67
|
+
})
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
<PropertiesTable
|
|
71
|
+
content={[
|
|
72
|
+
{
|
|
73
|
+
name: "input",
|
|
74
|
+
type: "ScorerRunInputForAgent",
|
|
75
|
+
isOptional: true,
|
|
76
|
+
description: "The scorer run input containing input messages",
|
|
77
|
+
},
|
|
78
|
+
]}
|
|
79
|
+
/>
|
|
80
|
+
|
|
81
|
+
**Returns:** `string | undefined` - The user message text, or `undefined` if no user message is found.
|
|
82
|
+
|
|
83
|
+
### extractInputMessages
|
|
84
|
+
|
|
85
|
+
Extracts text content from all input messages as an array.
|
|
86
|
+
|
|
87
|
+
```typescript
|
|
88
|
+
.preprocess(({ run }) => {
|
|
89
|
+
const allUserMessages = extractInputMessages(run.input);
|
|
90
|
+
return { conversationHistory: allUserMessages.join("\n") };
|
|
91
|
+
})
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Returns:** `string[]` - Array of text strings from each input message.
|
|
95
|
+
|
|
96
|
+
### extractAgentResponseMessages
|
|
97
|
+
|
|
98
|
+
Extracts text content from all assistant response messages as an array.
|
|
99
|
+
|
|
100
|
+
```typescript
|
|
101
|
+
.preprocess(({ run }) => {
|
|
102
|
+
const allResponses = extractAgentResponseMessages(run.output);
|
|
103
|
+
return { allResponses };
|
|
104
|
+
})
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
**Returns:** `string[]` - Array of text strings from each assistant message.
|
|
108
|
+
|
|
109
|
+
## Reasoning Extraction
|
|
110
|
+
|
|
111
|
+
### getReasoningFromRunOutput
|
|
112
|
+
|
|
113
|
+
Extracts reasoning text from the run output. This is particularly useful when evaluating responses from reasoning models like `deepseek-reasoner` that produce chain-of-thought reasoning.
|
|
114
|
+
|
|
115
|
+
Reasoning can be stored in two places:
|
|
116
|
+
1. `content.reasoning` - a string field on the message content
|
|
117
|
+
2. `content.parts` - entries with `type: 'reasoning'` that contain `details`
|
|
118
|
+
|
|
119
|
+
```typescript
|
|
120
|
+
import {
|
|
121
|
+
getReasoningFromRunOutput,
|
|
122
|
+
getAssistantMessageFromRunOutput
|
|
123
|
+
} from "@mastra/evals/scorers/utils";
|
|
124
|
+
|
|
125
|
+
const reasoningQualityScorer = createScorer({
|
|
126
|
+
id: "reasoning-quality",
|
|
127
|
+
name: "Reasoning Quality",
|
|
128
|
+
description: "Evaluates the quality of model reasoning",
|
|
129
|
+
type: "agent",
|
|
130
|
+
})
|
|
131
|
+
.preprocess(({ run }) => {
|
|
132
|
+
const reasoning = getReasoningFromRunOutput(run.output);
|
|
133
|
+
const response = getAssistantMessageFromRunOutput(run.output);
|
|
134
|
+
return { reasoning, response };
|
|
135
|
+
})
|
|
136
|
+
.analyze(({ results }) => {
|
|
137
|
+
const { reasoning } = results.preprocessStepResult || {};
|
|
138
|
+
return {
|
|
139
|
+
hasReasoning: !!reasoning,
|
|
140
|
+
reasoningLength: reasoning?.length || 0,
|
|
141
|
+
hasStepByStep: reasoning?.includes("step") || false,
|
|
142
|
+
};
|
|
143
|
+
})
|
|
144
|
+
.generateScore(({ results }) => {
|
|
145
|
+
const { hasReasoning, reasoningLength } = results.analyzeStepResult || {};
|
|
146
|
+
if (!hasReasoning) return 0;
|
|
147
|
+
// Score based on reasoning length (normalized to 0-1)
|
|
148
|
+
return Math.min(reasoningLength / 500, 1);
|
|
149
|
+
})
|
|
150
|
+
.generateReason(({ results, score }) => {
|
|
151
|
+
const { hasReasoning, reasoningLength } = results.analyzeStepResult || {};
|
|
152
|
+
if (!hasReasoning) {
|
|
153
|
+
return "No reasoning was provided by the model.";
|
|
154
|
+
}
|
|
155
|
+
return `Model provided ${reasoningLength} characters of reasoning. Score: ${score}`;
|
|
156
|
+
});
|
|
157
|
+
```
|
|
158
|
+
|
|
159
|
+
<PropertiesTable
|
|
160
|
+
content={[
|
|
161
|
+
{
|
|
162
|
+
name: "output",
|
|
163
|
+
type: "ScorerRunOutputForAgent",
|
|
164
|
+
isOptional: true,
|
|
165
|
+
description: "The scorer run output (array of MastraDBMessage)",
|
|
166
|
+
},
|
|
167
|
+
]}
|
|
168
|
+
/>
|
|
169
|
+
|
|
170
|
+
**Returns:** `string | undefined` - The reasoning text, or `undefined` if no reasoning is present.
|
|
171
|
+
|
|
172
|
+
## System Message Extraction
|
|
173
|
+
|
|
174
|
+
### getSystemMessagesFromRunInput
|
|
175
|
+
|
|
176
|
+
Extracts all system messages from the run input, including both standard system messages and tagged system messages (specialized prompts like memory instructions).
|
|
177
|
+
|
|
178
|
+
```typescript
|
|
179
|
+
.preprocess(({ run }) => {
|
|
180
|
+
const systemMessages = getSystemMessagesFromRunInput(run.input);
|
|
181
|
+
return {
|
|
182
|
+
systemPromptCount: systemMessages.length,
|
|
183
|
+
systemPrompts: systemMessages
|
|
184
|
+
};
|
|
185
|
+
})
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
**Returns:** `string[]` - Array of system message strings.
|
|
189
|
+
|
|
190
|
+
### getCombinedSystemPrompt
|
|
191
|
+
|
|
192
|
+
Combines all system messages into a single prompt string, joined with double newlines.
|
|
193
|
+
|
|
194
|
+
```typescript
|
|
195
|
+
.preprocess(({ run }) => {
|
|
196
|
+
const fullSystemPrompt = getCombinedSystemPrompt(run.input);
|
|
197
|
+
return { fullSystemPrompt };
|
|
198
|
+
})
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
**Returns:** `string` - Combined system prompt string.
|
|
202
|
+
|
|
203
|
+
## Tool Call Extraction
|
|
204
|
+
|
|
205
|
+
### extractToolCalls
|
|
206
|
+
|
|
207
|
+
Extracts information about all tool calls from the run output, including tool names, call IDs, and their positions in the message array.
|
|
208
|
+
|
|
209
|
+
```typescript
|
|
210
|
+
const toolUsageScorer = createScorer({
|
|
211
|
+
id: "tool-usage",
|
|
212
|
+
description: "Evaluates tool usage patterns",
|
|
213
|
+
type: "agent",
|
|
214
|
+
})
|
|
215
|
+
.preprocess(({ run }) => {
|
|
216
|
+
const { tools, toolCallInfos } = extractToolCalls(run.output);
|
|
217
|
+
return {
|
|
218
|
+
toolsUsed: tools,
|
|
219
|
+
toolCount: tools.length,
|
|
220
|
+
toolDetails: toolCallInfos,
|
|
221
|
+
};
|
|
222
|
+
})
|
|
223
|
+
.generateScore(({ results }) => {
|
|
224
|
+
const { toolCount } = results.preprocessStepResult || {};
|
|
225
|
+
// Score based on appropriate tool usage
|
|
226
|
+
return toolCount > 0 ? 1 : 0;
|
|
227
|
+
});
|
|
228
|
+
```
|
|
229
|
+
|
|
230
|
+
**Returns:**
|
|
231
|
+
|
|
232
|
+
```typescript
|
|
233
|
+
{
|
|
234
|
+
tools: string[]; // Array of tool names
|
|
235
|
+
toolCallInfos: ToolCallInfo[]; // Detailed tool call information
|
|
236
|
+
}
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
Where `ToolCallInfo` is:
|
|
240
|
+
|
|
241
|
+
```typescript
|
|
242
|
+
type ToolCallInfo = {
|
|
243
|
+
toolName: string; // Name of the tool
|
|
244
|
+
toolCallId: string; // Unique call identifier
|
|
245
|
+
messageIndex: number; // Index in the output array
|
|
246
|
+
invocationIndex: number; // Index within message's tool invocations
|
|
247
|
+
};
|
|
248
|
+
```
|
|
249
|
+
|
|
250
|
+
## Test Utilities
|
|
251
|
+
|
|
252
|
+
These utilities help create test data for scorer development.
|
|
253
|
+
|
|
254
|
+
### createTestMessage
|
|
255
|
+
|
|
256
|
+
Creates a `MastraDBMessage` object for testing purposes.
|
|
257
|
+
|
|
258
|
+
```typescript
|
|
259
|
+
import { createTestMessage } from "@mastra/evals/scorers/utils";
|
|
260
|
+
|
|
261
|
+
const userMessage = createTestMessage({
|
|
262
|
+
content: "What is the weather?",
|
|
263
|
+
role: "user",
|
|
264
|
+
});
|
|
265
|
+
|
|
266
|
+
const assistantMessage = createTestMessage({
|
|
267
|
+
content: "The weather is sunny.",
|
|
268
|
+
role: "assistant",
|
|
269
|
+
toolInvocations: [
|
|
270
|
+
{
|
|
271
|
+
toolCallId: "call-1",
|
|
272
|
+
toolName: "weatherTool",
|
|
273
|
+
args: { location: "London" },
|
|
274
|
+
result: { temp: 20 },
|
|
275
|
+
state: "result",
|
|
276
|
+
},
|
|
277
|
+
],
|
|
278
|
+
});
|
|
279
|
+
```
|
|
280
|
+
|
|
281
|
+
### createAgentTestRun
|
|
282
|
+
|
|
283
|
+
Creates a complete test run object for testing scorers.
|
|
284
|
+
|
|
285
|
+
```typescript
|
|
286
|
+
import { createAgentTestRun, createTestMessage } from "@mastra/evals/scorers/utils";
|
|
287
|
+
|
|
288
|
+
const testRun = createAgentTestRun({
|
|
289
|
+
inputMessages: [
|
|
290
|
+
createTestMessage({ content: "Hello", role: "user" }),
|
|
291
|
+
],
|
|
292
|
+
output: [
|
|
293
|
+
createTestMessage({ content: "Hi there!", role: "assistant" }),
|
|
294
|
+
],
|
|
295
|
+
});
|
|
296
|
+
|
|
297
|
+
// Run your scorer with the test data
|
|
298
|
+
const result = await myScorer.run({
|
|
299
|
+
input: testRun.input,
|
|
300
|
+
output: testRun.output,
|
|
301
|
+
});
|
|
302
|
+
```
|
|
303
|
+
|
|
304
|
+
## Complete Example
|
|
305
|
+
|
|
306
|
+
Here's a complete example showing how to use multiple utilities together:
|
|
307
|
+
|
|
308
|
+
```typescript
|
|
309
|
+
import { createScorer } from "@mastra/core/evals";
|
|
310
|
+
import {
|
|
311
|
+
getAssistantMessageFromRunOutput,
|
|
312
|
+
getReasoningFromRunOutput,
|
|
313
|
+
getUserMessageFromRunInput,
|
|
314
|
+
getCombinedSystemPrompt,
|
|
315
|
+
extractToolCalls,
|
|
316
|
+
} from "@mastra/evals/scorers/utils";
|
|
317
|
+
|
|
318
|
+
const comprehensiveScorer = createScorer({
|
|
319
|
+
id: "comprehensive-analysis",
|
|
320
|
+
name: "Comprehensive Analysis",
|
|
321
|
+
description: "Analyzes all aspects of an agent response",
|
|
322
|
+
type: "agent",
|
|
323
|
+
})
|
|
324
|
+
.preprocess(({ run }) => {
|
|
325
|
+
// Extract all relevant data
|
|
326
|
+
const userMessage = getUserMessageFromRunInput(run.input);
|
|
327
|
+
const response = getAssistantMessageFromRunOutput(run.output);
|
|
328
|
+
const reasoning = getReasoningFromRunOutput(run.output);
|
|
329
|
+
const systemPrompt = getCombinedSystemPrompt(run.input);
|
|
330
|
+
const { tools, toolCallInfos } = extractToolCalls(run.output);
|
|
331
|
+
|
|
332
|
+
return {
|
|
333
|
+
userMessage,
|
|
334
|
+
response,
|
|
335
|
+
reasoning,
|
|
336
|
+
systemPrompt,
|
|
337
|
+
toolsUsed: tools,
|
|
338
|
+
toolCount: tools.length,
|
|
339
|
+
};
|
|
340
|
+
})
|
|
341
|
+
.generateScore(({ results }) => {
|
|
342
|
+
const { response, reasoning, toolCount } = results.preprocessStepResult || {};
|
|
343
|
+
|
|
344
|
+
let score = 0;
|
|
345
|
+
if (response && response.length > 0) score += 0.4;
|
|
346
|
+
if (reasoning) score += 0.3;
|
|
347
|
+
if (toolCount > 0) score += 0.3;
|
|
348
|
+
|
|
349
|
+
return score;
|
|
350
|
+
})
|
|
351
|
+
.generateReason(({ results, score }) => {
|
|
352
|
+
const { response, reasoning, toolCount } = results.preprocessStepResult || {};
|
|
353
|
+
|
|
354
|
+
const parts = [];
|
|
355
|
+
if (response) parts.push("provided a response");
|
|
356
|
+
if (reasoning) parts.push("included reasoning");
|
|
357
|
+
if (toolCount > 0) parts.push(`used ${toolCount} tool(s)`);
|
|
358
|
+
|
|
359
|
+
return `Score: ${score}. The agent ${parts.join(", ")}.`;
|
|
360
|
+
});
|
|
361
|
+
```
|
|
362
|
+
|
|
@@ -83,118 +83,45 @@ A textual difference score between 0 and 1:
|
|
|
83
83
|
- **0.1–0.3**: Major differences – extensive changes needed.
|
|
84
84
|
- **0.0**: Completely different texts.
|
|
85
85
|
|
|
86
|
-
##
|
|
86
|
+
## Example
|
|
87
87
|
|
|
88
|
-
|
|
88
|
+
Measure textual differences between expected and actual agent outputs:
|
|
89
89
|
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
```typescript title="src/example-no-differences.ts" showLineNumbers copy
|
|
90
|
+
```typescript title="src/example-textual-difference.ts" showLineNumbers copy
|
|
91
|
+
import { runEvals } from "@mastra/core/evals";
|
|
93
92
|
import { createTextualDifferenceScorer } from "@mastra/evals/scorers/prebuilt";
|
|
93
|
+
import { myAgent } from "./agent";
|
|
94
94
|
|
|
95
95
|
const scorer = createTextualDifferenceScorer();
|
|
96
96
|
|
|
97
|
-
const
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
}
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
analyzeStepResult: {
|
|
117
|
-
confidence: 1,
|
|
118
|
-
ratio: 1,
|
|
119
|
-
changes: 0,
|
|
120
|
-
lengthDiff: 0,
|
|
97
|
+
const result = await runEvals({
|
|
98
|
+
data: [
|
|
99
|
+
{
|
|
100
|
+
input: "Summarize the concept of recursion",
|
|
101
|
+
groundTruth:
|
|
102
|
+
"Recursion is when a function calls itself to solve a problem by breaking it into smaller subproblems.",
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
input: "What is the capital of France?",
|
|
106
|
+
groundTruth: "The capital of France is Paris.",
|
|
107
|
+
},
|
|
108
|
+
],
|
|
109
|
+
scorers: [scorer],
|
|
110
|
+
target: myAgent,
|
|
111
|
+
onItemComplete: ({ scorerResults }) => {
|
|
112
|
+
console.log({
|
|
113
|
+
score: scorerResults[scorer.id].score,
|
|
114
|
+
groundTruth: scorerResults[scorer.id].groundTruth,
|
|
115
|
+
});
|
|
121
116
|
},
|
|
122
|
-
}
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
### Minor differences example
|
|
126
|
-
|
|
127
|
-
In this example, the texts have small variations. The scorer detects these minor differences and returns a moderate similarity score.
|
|
128
|
-
|
|
129
|
-
```typescript title="src/example-minor-differences.ts" showLineNumbers copy
|
|
130
|
-
import { createTextualDifferenceScorer } from "@mastra/evals/scorers/prebuilt";
|
|
131
|
-
|
|
132
|
-
const scorer = createTextualDifferenceScorer();
|
|
133
|
-
|
|
134
|
-
const input = "Hello world! How are you?";
|
|
135
|
-
const output = "Hello there! How is it going?";
|
|
136
|
-
|
|
137
|
-
const result = await scorer.run({
|
|
138
|
-
input: [{ role: "user", content: input }],
|
|
139
|
-
output: { role: "assistant", text: output },
|
|
140
117
|
});
|
|
141
118
|
|
|
142
|
-
console.log(
|
|
143
|
-
console.log("AnalyzeStepResult:", result.analyzeStepResult);
|
|
119
|
+
console.log(result.scores);
|
|
144
120
|
```
|
|
145
121
|
|
|
146
|
-
|
|
122
|
+
For more details on `runEvals`, see the [runEvals reference](/reference/v1/evals/run-evals).
|
|
147
123
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
```typescript
|
|
151
|
-
{
|
|
152
|
-
score: 0.5925925925925926,
|
|
153
|
-
analyzeStepResult: {
|
|
154
|
-
confidence: 0.8620689655172413,
|
|
155
|
-
ratio: 0.5925925925925926,
|
|
156
|
-
changes: 5,
|
|
157
|
-
lengthDiff: 0.13793103448275862
|
|
158
|
-
}
|
|
159
|
-
}
|
|
160
|
-
```
|
|
161
|
-
|
|
162
|
-
### Major differences example
|
|
163
|
-
|
|
164
|
-
In this example, the texts differ significantly. The scorer detects extensive changes and returns a low similarity score.
|
|
165
|
-
|
|
166
|
-
```typescript title="src/example-major-differences.ts" showLineNumbers copy
|
|
167
|
-
import { createTextualDifferenceScorer } from "@mastra/evals/scorers/prebuilt";
|
|
168
|
-
|
|
169
|
-
const scorer = createTextualDifferenceScorer();
|
|
170
|
-
|
|
171
|
-
const input = "Python is a high-level programming language";
|
|
172
|
-
const output = "JavaScript is used for web development";
|
|
173
|
-
|
|
174
|
-
const result = await scorer.run({
|
|
175
|
-
input: [{ role: "user", content: input }],
|
|
176
|
-
output: { role: "assistant", text: output },
|
|
177
|
-
});
|
|
178
|
-
|
|
179
|
-
console.log("Score:", result.score);
|
|
180
|
-
console.log("AnalyzeStepResult:", result.analyzeStepResult);
|
|
181
|
-
```
|
|
182
|
-
|
|
183
|
-
#### Major differences output
|
|
184
|
-
|
|
185
|
-
The scorer returns a low score due to significant differences between the texts. The detailed `analyzeStepResult` shows numerous changes and a notable length difference.
|
|
186
|
-
|
|
187
|
-
```typescript
|
|
188
|
-
{
|
|
189
|
-
score: 0.3170731707317073,
|
|
190
|
-
analyzeStepResult: {
|
|
191
|
-
confidence: 0.8636363636363636,
|
|
192
|
-
ratio: 0.3170731707317073,
|
|
193
|
-
changes: 8,
|
|
194
|
-
lengthDiff: 0.13636363636363635
|
|
195
|
-
}
|
|
196
|
-
}
|
|
197
|
-
```
|
|
124
|
+
To add this scorer to an agent, see the [Scorers overview](/docs/v1/evals/overview#adding-scorers-to-agents) guide.
|
|
198
125
|
|
|
199
126
|
## Related
|
|
200
127
|
|
|
@@ -94,116 +94,43 @@ Object with tone metrics:
|
|
|
94
94
|
- **avgSentiment**: Average sentiment across sentences (stability mode).
|
|
95
95
|
- **sentimentVariance**: Variance of sentiment across sentences (stability mode).
|
|
96
96
|
|
|
97
|
-
##
|
|
97
|
+
## Example
|
|
98
98
|
|
|
99
|
-
|
|
99
|
+
Evaluate tone consistency between related agent responses:
|
|
100
100
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
```typescript title="src/example-positive-tone.ts" showLineNumbers copy
|
|
104
|
-
import { createToneScorer } from "@mastra/evals/scorers/prebuilt";
|
|
105
|
-
|
|
106
|
-
const scorer = createToneScorer();
|
|
107
|
-
|
|
108
|
-
const input = "This product is fantastic and amazing!";
|
|
109
|
-
const output = "The product is excellent and wonderful!";
|
|
110
|
-
|
|
111
|
-
const result = await scorer.run({
|
|
112
|
-
input: [{ role: "user", content: input }],
|
|
113
|
-
output: { role: "assistant", text: output },
|
|
114
|
-
});
|
|
115
|
-
|
|
116
|
-
console.log("Score:", result.score);
|
|
117
|
-
console.log("AnalyzeStepResult:", result.analyzeStepResult);
|
|
118
|
-
```
|
|
119
|
-
|
|
120
|
-
#### Positive tone output
|
|
121
|
-
|
|
122
|
-
The scorer returns a high score reflecting strong sentiment alignment. The `analyzeStepResult` field provides sentiment values and the difference between them.
|
|
123
|
-
|
|
124
|
-
```typescript
|
|
125
|
-
{
|
|
126
|
-
score: 0.8333333333333335,
|
|
127
|
-
analyzeStepResult: {
|
|
128
|
-
responseSentiment: 1.3333333333333333,
|
|
129
|
-
referenceSentiment: 1.1666666666666667,
|
|
130
|
-
difference: 0.16666666666666652,
|
|
131
|
-
},
|
|
132
|
-
}
|
|
133
|
-
```
|
|
134
|
-
|
|
135
|
-
### Stable tone example
|
|
136
|
-
|
|
137
|
-
In this example, the text’s internal tone consistency is analyzed by passing an empty response. This signals the scorer to evaluate sentiment stability within the single input text, resulting in a score reflecting how uniform the tone is throughout.
|
|
138
|
-
|
|
139
|
-
```typescript title="src/example-stable-tone.ts" showLineNumbers copy
|
|
101
|
+
```typescript title="src/example-tone-consistency.ts" showLineNumbers copy
|
|
102
|
+
import { runEvals } from "@mastra/core/evals";
|
|
140
103
|
import { createToneScorer } from "@mastra/evals/scorers/prebuilt";
|
|
104
|
+
import { myAgent } from "./agent";
|
|
141
105
|
|
|
142
106
|
const scorer = createToneScorer();
|
|
143
107
|
|
|
144
|
-
const
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
{
|
|
162
|
-
score: 0.9444444444444444,
|
|
163
|
-
analyzeStepResult: {
|
|
164
|
-
avgSentiment: 1.3333333333333333,
|
|
165
|
-
sentimentVariance: 0.05555555555555556,
|
|
108
|
+
const result = await runEvals({
|
|
109
|
+
data: [
|
|
110
|
+
{
|
|
111
|
+
input: "How was your experience with our service?",
|
|
112
|
+
groundTruth: "The service was excellent and exceeded expectations!",
|
|
113
|
+
},
|
|
114
|
+
{
|
|
115
|
+
input: "Tell me about the customer support",
|
|
116
|
+
groundTruth: "The support team was friendly and very helpful.",
|
|
117
|
+
},
|
|
118
|
+
],
|
|
119
|
+
scorers: [scorer],
|
|
120
|
+
target: myAgent,
|
|
121
|
+
onItemComplete: ({ scorerResults }) => {
|
|
122
|
+
console.log({
|
|
123
|
+
score: scorerResults[scorer.id].score,
|
|
124
|
+
});
|
|
166
125
|
},
|
|
167
|
-
}
|
|
168
|
-
```
|
|
169
|
-
|
|
170
|
-
### Mixed tone example
|
|
171
|
-
|
|
172
|
-
In this example, the input and response have different emotional tones. The scorer picks up on these variations and gives a lower consistency score.
|
|
173
|
-
|
|
174
|
-
```typescript title="src/example-mixed-tone.ts" showLineNumbers copy
|
|
175
|
-
import { createToneScorer } from "@mastra/evals/scorers/prebuilt";
|
|
176
|
-
|
|
177
|
-
const scorer = createToneScorer();
|
|
178
|
-
|
|
179
|
-
const input =
|
|
180
|
-
"The interface is frustrating and confusing, though it has potential.";
|
|
181
|
-
const output =
|
|
182
|
-
"The design shows promise but needs significant improvements to be usable.";
|
|
183
|
-
|
|
184
|
-
const result = await scorer.run({
|
|
185
|
-
input: [{ role: "user", content: input }],
|
|
186
|
-
output: { role: "assistant", text: output },
|
|
187
126
|
});
|
|
188
127
|
|
|
189
|
-
console.log(
|
|
190
|
-
console.log("AnalyzeStepResult:", result.analyzeStepResult);
|
|
128
|
+
console.log(result.scores);
|
|
191
129
|
```
|
|
192
130
|
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
The scorer returns a low score due to the noticeable differences in emotional tone. The `analyzeStepResult` field highlights the sentiment values and the degree of variation between them.
|
|
131
|
+
For more details on `runEvals`, see the [runEvals reference](/reference/v1/evals/run-evals).
|
|
196
132
|
|
|
197
|
-
|
|
198
|
-
{
|
|
199
|
-
score: 0.4181818181818182,
|
|
200
|
-
analyzeStepResult: {
|
|
201
|
-
responseSentiment: -0.4,
|
|
202
|
-
referenceSentiment: 0.18181818181818182,
|
|
203
|
-
difference: 0.5818181818181818,
|
|
204
|
-
},
|
|
205
|
-
}
|
|
206
|
-
```
|
|
133
|
+
To add this scorer to an agent, see the [Scorers overview](/docs/v1/evals/overview#adding-scorers-to-agents) guide.
|
|
207
134
|
|
|
208
135
|
## Related
|
|
209
136
|
|
|
@@ -349,7 +349,7 @@ The LLM-based scorer provides:
|
|
|
349
349
|
```typescript showLineNumbers copy
|
|
350
350
|
// Basic configuration
|
|
351
351
|
const basicLLMScorer = createLLMScorer({
|
|
352
|
-
model: 'openai/gpt-
|
|
352
|
+
model: 'openai/gpt-5.1',
|
|
353
353
|
availableTools: [
|
|
354
354
|
{ name: 'tool1', description: 'Description 1' },
|
|
355
355
|
{ name: 'tool2', description: 'Description 2' }
|
|
@@ -358,7 +358,7 @@ const basicLLMScorer = createLLMScorer({
|
|
|
358
358
|
|
|
359
359
|
// With different model
|
|
360
360
|
const customModelScorer = createLLMScorer({
|
|
361
|
-
model: openai
|
|
361
|
+
model: 'openai/gpt-5', // More powerful model for complex evaluations
|
|
362
362
|
availableTools: [...]
|
|
363
363
|
});
|
|
364
364
|
```
|
|
@@ -389,7 +389,7 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
|
|
|
389
389
|
|
|
390
390
|
```typescript title="src/example-llm-basic.ts" showLineNumbers copy
|
|
391
391
|
const llmScorer = createToolCallAccuracyScorerLLM({
|
|
392
|
-
model: "openai/gpt-
|
|
392
|
+
model: "openai/gpt-5.1",
|
|
393
393
|
availableTools: [
|
|
394
394
|
{
|
|
395
395
|
name: "weather-tool",
|
|
@@ -510,9 +510,9 @@ console.log(result.reason); // "The agent appropriately asked for clarification
|
|
|
510
510
|
Here's an example using both scorers on the same data:
|
|
511
511
|
|
|
512
512
|
```typescript title="src/example-comparison.ts" showLineNumbers copy
|
|
513
|
-
import {
|
|
514
|
-
createToolCallAccuracyScorerCode as createCodeScorer,
|
|
515
|
-
createToolCallAccuracyScorerLLM as createLLMScorer
|
|
513
|
+
import {
|
|
514
|
+
createToolCallAccuracyScorerCode as createCodeScorer,
|
|
515
|
+
createToolCallAccuracyScorerLLM as createLLMScorer
|
|
516
516
|
} from "@mastra/evals/scorers/prebuilt";
|
|
517
517
|
|
|
518
518
|
// Setup both scorers
|
|
@@ -522,7 +522,7 @@ const codeScorer = createCodeScorer({
|
|
|
522
522
|
});
|
|
523
523
|
|
|
524
524
|
const llmScorer = createLLMScorer({
|
|
525
|
-
model: "openai/gpt-
|
|
525
|
+
model: "openai/gpt-5.1",
|
|
526
526
|
availableTools: [
|
|
527
527
|
{ name: "weather-tool", description: "Get weather information" },
|
|
528
528
|
{ name: "search-tool", description: "Search the web" },
|