@codemation/agent-skills 0.1.9 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,263 @@
1
+ Load this when you need to see a complete workflow that exercises most authoring features end-to-end.
2
+
3
+ ## The dense example (manual trigger — full fluent sugar)
4
+
5
+ The fluent `.map`/`.if`/`.switch`/`.split`/`.agent`/`.node` helpers are only available after `.manualTrigger(...)`. The example below is a manual-trigger workflow so it can demonstrate them (all except `.switch`, which this example does not need). For cron / webhook variants, see the snippet at the bottom.
6
+
7
+ ```ts
8
+ // src/workflows/dailyCsvDigest.ts
9
+ //
10
+ // Theme: a manual-triggered "daily CSV digest". Caller passes { date: "YYYY-MM-DD" }.
11
+ // The flow fetches that day's sales CSV from a reporting API, parses each row,
12
+ // classifies rows with an LLM agent, and sends a per-row digest email.
13
+ //
14
+ // Register in codemation.config.ts:
15
+ // import dailyCsvDigest from "./src/workflows/dailyCsvDigest";
16
+ // workflows: [dailyCsvDigest]
17
+
18
+ import { z } from "zod";
19
+ import { callableTool, itemExpr } from "@codemation/core";
20
+ import { HttpRequest } from "@codemation/core-nodes";
21
+ import { workflow } from "@codemation/host";
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Types
25
+ // ---------------------------------------------------------------------------
26
+
27
+ type TriggerInput = { date: string }; // e.g. "2025-05-14"
28
+
29
+ type FetchMeta = {
30
+ url: string;
31
+ ok: boolean;
32
+ status: number;
33
+ binarySlot: string;
34
+ };
35
+
36
+ type CsvRow = {
37
+ region: string;
38
+ product: string;
39
+ revenue: number;
40
+ anomaly: boolean;
41
+ };
42
+
43
+ type ClassifiedRow = CsvRow & {
44
+ classification: "normal" | "warning" | "critical";
45
+ rationale: string;
46
+ };
47
+
48
+ // ---------------------------------------------------------------------------
49
+ // Inline callable tool — classify a single row
50
+ // ---------------------------------------------------------------------------
51
+
52
+ const classifyRowTool = callableTool({
53
+ name: "classify_row",
54
+ description: "Classify a revenue row as normal, warning, or critical.",
55
+ inputSchema: z.object({
56
+ region: z.string(),
57
+ product: z.string(),
58
+ revenue: z.number(),
59
+ anomaly: z.boolean(),
60
+ }),
61
+ outputSchema: z.object({
62
+ classification: z.enum(["normal", "warning", "critical"]),
63
+ rationale: z.string(),
64
+ }),
65
+ execute: async ({ input }) => {
66
+ // Fallback executor if the agent doesn't call the tool — keeps the workflow deterministic in tests.
67
+ const classification =
68
+ input.anomaly || input.revenue < 0 ? "critical" : input.revenue < 1000 ? "warning" : "normal";
69
+ return { classification, rationale: `Revenue ${input.revenue}, anomaly=${input.anomaly}` };
70
+ },
71
+ });
72
+
73
+ // ---------------------------------------------------------------------------
74
+ // Workflow
75
+ // ---------------------------------------------------------------------------
76
+
77
+ export default workflow("wf.daily-csv-digest")
78
+ .name("Daily CSV Digest")
79
+ // Manual trigger seeded with a default date — callers can override at run time.
80
+ .manualTrigger<TriggerInput>("Start", { date: "2025-05-14" })
81
+
82
+ // ── Step 1: build the fetch URL ────────────────────────────────────────────
83
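+   // async .map — use when the callback needs await. Nothing is awaited here (the URL is built synchronously; the fetch itself happens in the HttpRequest node below) — async only illustrates the form.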
84
+ .map("Build fetch URL", async (item, _ctx) => ({
85
+ date: item.json.date,
86
+ reportUrl: `https://reports.internal/sales/${item.json.date}.csv`,
87
+ }))
88
+
89
+ // HttpRequest with responseFormat:"binary" stores the body in ctx.binary automatically.
90
+ // Explicit id "fetch-report" keeps the credential binding stable across label renames.
91
+ .then(
92
+ new HttpRequest("Fetch report CSV", {
93
+ id: "fetch-report",
94
+ urlField: "reportUrl",
95
+ responseFormat: "binary",
96
+ responseBinarySlot: "csvFile",
97
+ credentialSlot: "reportApi",
98
+ }),
99
+ )
100
+
101
+ // ── Step 2: gate on HTTP success ───────────────────────────────────────────
102
+ // .if predicate receives (item, ctx). Use for fast boolean branches; .switch is overkill for two outcomes.
103
+ .if((item: { json: FetchMeta }, _ctx) => item.json.ok, {
104
+ true: (branch) =>
105
+ branch
106
+ // ── Step 3: parse CSV from binary ──────────────────────────────────────
107
+ // async .map — needs await to read from binary storage.
108
+ .map("Parse CSV rows", async (item: { json: FetchMeta }, ctx) => {
109
+ const stream = await ctx.binary.openReadStream(item.json.binarySlot);
110
+ const text = await streamToText(stream);
111
+ const rows = parseCsv(text);
112
+ return { rows, fetchedAt: item.json.url };
113
+ })
114
+
115
+ // .split emits one item per CSV row so downstream steps run per-row.
116
+ .split("Split rows", (item: { json: { rows: CsvRow[] } }) => item.json.rows)
117
+
118
+ // ── Step 4: classify each row with an agent ──────────────────────────
119
+ // itemExpr defers message construction to per-item runtime — required when content depends on the current item.
120
+ .agent("Classify row", {
121
+ model: "openai:gpt-4o-mini",
122
+ messages: itemExpr(({ item }: { item: { json: CsvRow } }) => [
123
+ { role: "system" as const, content: "You are a revenue analyst. Use classify_row." },
124
+ { role: "user" as const, content: JSON.stringify(item.json) },
125
+ ]),
126
+ tools: [classifyRowTool],
127
+ outputSchema: z.object({
128
+ classification: z.enum(["normal", "warning", "critical"]),
129
+ rationale: z.string(),
130
+ }),
131
+ })
132
+
133
+ // ── Step 5: merge agent output with the original row via ctx.data ──────
134
+ // ctx.data is keyed by node id (the slug of the node label).
135
+         // "Split rows" slugs to "split-rows"; we read its first emitted item (items[0]) back here.
136
+ // sync .map — pure object merge, no I/O.
137
+ .map("Enrich classification", (item: { json: { classification: string; rationale: string } }, ctx) => {
138
+ const originalRow = ctx.data["split-rows"]?.items?.[0]?.json as CsvRow | undefined;
139
+ return {
140
+ ...originalRow,
141
+ classification: item.json.classification as ClassifiedRow["classification"],
142
+ rationale: item.json.rationale,
143
+ } satisfies Partial<ClassifiedRow>;
144
+ })
145
+
146
+ // ── Step 6: send digest email via a registered node ───────────────────
147
+ // .node(name, config, options) — explicit id keeps credential binding stable.
148
+ // SendEmailNodeConfig is illustrative; replace with the email node available in your project.
149
+ .node(
150
+ "Send digest email",
151
+ new SendEmailNodeConfig({
152
+ // itemExpr on a config field — engine resolves once per item at execution time.
153
+ subject: itemExpr(
154
+ ({ item }: { item: { json: Partial<ClassifiedRow> } }) =>
155
+ `[${item.json.classification?.toUpperCase()}] ${item.json.region} – ${item.json.product}`,
156
+ ),
157
+ to: "ops-team@example.com",
158
+ body: itemExpr(
159
+ ({ item }: { item: { json: Partial<ClassifiedRow> } }) =>
160
+ `Region: ${item.json.region}\nRevenue: ${item.json.revenue}\nRationale: ${item.json.rationale}`,
161
+ ),
162
+ }),
163
+ { id: "send-digest-email" },
164
+ ),
165
+
166
+ false: (branch) =>
167
+ branch.map("Log fetch failure", (item: { json: FetchMeta }, _ctx) => ({
168
+ error: `Fetch failed: HTTP ${item.json.status}`,
169
+ url: item.json.url,
170
+ })),
171
+ })
172
+
173
+ // .build() validates non-empty + unique node ids (including agent connection children).
174
+ // Throws WorkflowDefinitionError on violation.
175
+ .build();
176
+
177
+ // ---------------------------------------------------------------------------
178
+ // Helpers (inline for brevity — promote to lib/ if reused)
179
+ // ---------------------------------------------------------------------------
180
+
181
+ async function streamToText(stream: AsyncIterable<Uint8Array>): Promise<string> {
182
+ const chunks: Buffer[] = [];
183
+ for await (const chunk of stream) chunks.push(Buffer.from(chunk));
184
+ return Buffer.concat(chunks).toString("utf-8");
185
+ }
186
+
187
+ function parseCsv(text: string): CsvRow[] {
188
+ const [header, ...lines] = text.trim().split("\n");
189
+ const cols = header!.split(",");
190
+ return lines.map((line) => {
191
+ const vals = line.split(",");
192
+ return {
193
+ region: vals[cols.indexOf("region")] ?? "",
194
+ product: vals[cols.indexOf("product")] ?? "",
195
+ revenue: Number(vals[cols.indexOf("revenue")] ?? 0),
196
+ anomaly: vals[cols.indexOf("anomaly")] === "true",
197
+ };
198
+ });
199
+ }
200
+ ```
201
+
202
+ ## What this exercises
203
+
204
+ - **Manual trigger with typed default item** → `workflow("...").manualTrigger<TriggerInput>("Start", {...})`
205
+ - **sync `.map`** → "Enrich classification" — pure object merge, no `await`
206
+ - **async `.map`** → "Build fetch URL" and "Parse CSV rows" — the latter uses `await` for the binary read; the former is async in form only
207
+ - **`.if` per-item predicate** → `(item, _ctx) => item.json.ok` with branch factories
208
+ - **`HttpRequest` with explicit `id:`** → `id: "fetch-report"` (credential binding stability)
209
+ - **`.split`** → fan-out one batch into many items
210
+ - **`.agent(...)` with `messages`, `model`, `tools`, `outputSchema`** → typed structured output
211
+ - **`callableTool` with Zod schemas and `execute({ input })`** → inline tool definition
212
+ - **`itemExpr(...)`** → on agent messages (per-item content) and on `.node` config fields (per-item subject/body)
213
+ - **`.node(name, config, options)` with explicit id** → stable credential binding
214
+ - **`ctx.data["<slug>"]`** → reading earlier node output without threading it through every step
215
+ - **`ctx.binary.openReadStream(slot)`** → reading bytes from a binary slot attached upstream
216
+ - **`.build()`** → final validation pass
217
+
218
+ ## Cron / webhook variant (alternative trigger)
219
+
220
+ When the trigger isn't manual, the fluent `.map`/`.if`/`.agent` sugar isn't available by default — you use the lower-level builder and `.then(new SomeNodeConfig(...))`. Shape:
221
+
222
+ ```ts
223
+ import { Callback, CronTrigger, createWorkflowBuilder, HttpRequest } from "@codemation/core-nodes";
224
+
225
+ export default createWorkflowBuilder({
226
+ id: "wf.daily-csv-digest.cron",
227
+ name: "Daily CSV Digest (cron)",
228
+ })
229
+ .trigger(new CronTrigger("Daily 06:00", { schedule: "0 6 * * *", timezone: "UTC" }))
230
+ // Cron fires one item per tick: { firedAt, scheduledFor }. Wrap downstream logic in Callback configs:
231
+ .then(
232
+ new Callback("Build fetch URL", (items, _ctx) => {
233
+ return items.map((item) => {
234
+ const date = new Date((item.json as { scheduledFor: string }).scheduledFor).toISOString().slice(0, 10);
235
+ return { date, reportUrl: `https://reports.internal/sales/${date}.csv` };
236
+ });
237
+ }),
238
+ )
239
+ .then(
240
+ new HttpRequest("Fetch report CSV", {
241
+ id: "fetch-report",
242
+ urlField: "reportUrl",
243
+ responseFormat: "binary",
244
+ responseBinarySlot: "csvFile",
245
+ credentialSlot: "reportApi",
246
+ }),
247
+ )
248
+ // For branching, use `new If(...)`. For per-item agent calls, use `new AIAgent({...})`.
249
+ // For row fan-out, use `new Split(...)`. The execution semantics match the fluent helpers
250
+ // — only the surface syntax differs.
251
+ .build();
252
+ ```
253
+
254
+ If you need both cron + the fluent sugar in the same workflow, you can wrap the cursor manually:
255
+
256
+ ```ts
257
+ import { WorkflowChain } from "@codemation/core-nodes";
258
+
259
+ const cursor = createWorkflowBuilder({ id, name }).trigger(new CronTrigger("Tick", { schedule: "..." }));
260
+ export default new WorkflowChain(cursor).map("First step", (item) => ({ ...item.json })).build();
261
+ ```
262
+
263
+ This is uncommon in production code; reach for it only when the fluent helpers genuinely help readability.
@@ -0,0 +1,194 @@
1
+ # Workflow Testing
2
+
3
+ ## Use this reference when
4
+
5
+ You are authoring or reviewing a workflow that needs **end-to-end tests**: validate agent behavior, regression-test branching, score LLM outputs over time, or assert that a workflow produces the expected output for a known set of inputs.
6
+
7
+ This is **not** for unit-testing individual nodes — use `WorkflowTestKit` from `@codemation/core/testing` for that.
8
+
9
+ ## Three building blocks
10
+
11
+ 1. **`TestTrigger`** — drops on the canvas alongside live triggers (Webhook / Cron / Gmail / etc.). Authored callback yields one item per test case.
12
+ 2. **`IsTestRun`** — per-item router with `true` / `false` ports. Branches based on whether the run was started by the test orchestrator.
13
+ 3. **`Assertion`** — generic per-item assertion node; returns one or more `AssertionResult`s per input item, one persisted `TestAssertion` row per result.
14
+
15
+ ## Typical workflow shape
16
+
17
+ ```
18
+ [GmailTrigger: new email] ──┐
19
+
20
+ [TestTrigger: 10 fixtures]──┴─→ [ClassifyAgent]
21
+
22
+ [IsTestRun?]
23
+ │ │
24
+ true│ │false
25
+ ↓ ↓
26
+ [Assertion] [SendReply] (real side effect — skipped in tests)
27
+ ```
28
+
29
+ ## Authoring a TestTrigger
30
+
31
+ ```ts
32
+ import { TestTrigger } from "@codemation/core-nodes";
33
+ import { gmailCredentialType, type GmailSession } from "@codemation/core-nodes-gmail";
34
+
35
+ export const fixtureMailsTrigger = new TestTrigger<{ subject: string; body: string }>({
36
+ name: "Email fixtures",
37
+ credentialRequirements: [
38
+ { slotKey: "gmail", label: "Gmail", acceptedTypes: [gmailCredentialType.definition.typeId] },
39
+ ],
40
+ async *generateItems(ctx) {
41
+ const gmail = await ctx.getCredential<GmailSession>("gmail");
42
+ const messages = await gmail.listMessages({ labelIds: ["Label_test_mails"] });
43
+ for (const message of messages) {
44
+ if (ctx.signal.aborted) break;
45
+ yield { json: { subject: message.subject, body: message.body } };
46
+ }
47
+ },
48
+ concurrency: 8, // optional; default 4
49
+ caseLabel: (item) => item.json.subject, // optional; rows fall back to runId
50
+ });
51
+ ```
52
+
53
+ Notes:
54
+
55
+ - `triggerKind: "test"` is set automatically — `TriggerRuntimeService` skips it during live activation.
56
+ - `ctx.signal` is an `AbortSignal` that is aborted when the suite is cancelled; long pulls should check it and bail out.
57
+ - For hardcoded fixtures, just `yield { json: { ... } }` — no need to use credentials.
58
+ - Set `caseLabel` so the Tests-tab tree-table shows something readable instead of opaque runIds.
59
+
60
+ ## Branching in the workflow
61
+
62
+ ```ts
63
+ import { IsTestRun } from "@codemation/core-nodes";
64
+
65
+ const isTestRun = new IsTestRun("Skip side effects in tests");
66
+ ```
67
+
68
+ Or read `ctx.testContext` directly from a custom node:
69
+
70
+ ```ts
71
+ async execute({ item, ctx }) {
72
+ if (ctx.testContext) {
73
+ return { json: { result: "synthetic-test-output" } };
74
+ }
75
+ return { json: await this.realApi.send(item.json) };
76
+ }
77
+ ```
78
+
79
+ ## Authoring assertions
80
+
81
+ ```ts
82
+ import { Assertion } from "@codemation/core-nodes";
83
+
84
+ const checkClassification = new Assertion<{ label: string; confidence: number }>({
85
+ name: "Classification checks",
86
+ assertions: (item) => [
87
+ {
88
+ // Boolean-style: 1 = pass, 0 = fail. Default threshold (0.5) handles this.
89
+ name: "label is spam",
90
+ score: item.json.label === "spam" ? 1 : 0,
91
+ expected: "spam",
92
+ actual: item.json.label,
93
+ },
94
+ {
95
+ // Continuous-score: declare the threshold explicitly.
96
+ name: "confidence ≥ 0.8",
97
+ score: item.json.confidence,
98
+ passThreshold: 0.8,
99
+ expected: "≥ 0.8",
100
+ actual: item.json.confidence,
101
+ },
102
+ ],
103
+ });
104
+ ```
105
+
106
+ The `AssertionResult` shape (stable; persister + chart UIs key off these fields):
107
+
108
+ ```ts
109
+ interface AssertionResult {
110
+ readonly name: string;
111
+ /** 0..1 score. Source of truth for pass/fail (compared against `passThreshold`). */
112
+ readonly score: number;
113
+ /** 0..1 threshold for "passed". When omitted, consumers default to 0.5. */
114
+ readonly passThreshold?: number;
115
+ /** True when evaluating the assertion threw — treated as fail regardless of `score`. */
116
+ readonly errored?: true;
117
+ readonly expected?: JsonValue;
118
+ readonly actual?: JsonValue;
119
+ readonly message?: string;
120
+ readonly details?: Readonly<Record<string, JsonValue>>;
121
+ }
122
+ ```
123
+
124
+ Pass/fail derivation (canonical, in `@codemation/core`):
125
+
126
+ ```ts
127
+ import { deriveAssertionPassed } from "@codemation/core";
128
+ // errored ? false : score >= (passThreshold ?? 0.5)
129
+ ```
130
+
131
+ `errored: true` is for the assertion code itself crashing (judge agent crashed, JSON parse failed) — use it to separate "broken evaluator" from "wrong workflow output" in dashboards:
132
+
133
+ ```ts
134
+ assertions: async (item, ctx) => {
135
+ try {
136
+ const j = await runJudge(item, ctx);
137
+ return [{ name: "polite reply", score: j.score, passThreshold: 0.7, message: j.reason }];
138
+ } catch (err) {
139
+ return [{ name: "polite reply", score: 0, errored: true, message: String(err) }];
140
+ }
141
+ };
142
+ ```
143
+
144
+ ## Judge-by-Agent
145
+
146
+ A judge-by-agent is just an AI agent step feeding into an Assertion callback. Run an agent that returns a structured judgment, then map its output to an `AssertionResult` (`score: 0..1`, set `passThreshold`).
147
+
148
+ ## Running tests
149
+
150
+ - **From the UI**: open the workflow → **Tests** tab. Pick a TestTrigger from the dropdown (the picker lists every `triggerKind === "test"` node), click **Run tests**. Use the metric selector on the trend chart to plot pass-rate, per-assertion average scores, or case counts. Click two historical runs to compare them side-by-side.
151
+ - **From code**: instantiate `TestSuiteOrchestrator` from `@codemation/core/bootstrap`, call `runSuite({ workflow, triggerNodeId })`.
152
+ - **From HTTP**: `POST /api/workflows/:workflowId/test-suite-runs` with `{ triggerNodeId, concurrency? }`.
153
+
154
+ ## Status
155
+
156
+ ### Per case (`Run.testCaseStatus`)
157
+
158
+ | Status | Meaning |
159
+ | ----------- | ------------------------------------------------------------------------------------- |
160
+ | `running` | Workflow run dispatched, not yet finished. |
161
+ | `succeeded` | Workflow completed AND every assertion passed. |
162
+ | `failed` | Assertion-rollup downgrade OR the workflow itself reported failure. |
163
+ | `errored` | Workflow run threw before reaching a terminal state (engine error, not an assertion). |
164
+ | `cancelled` | Suite's `AbortSignal` fired before this case completed. |
165
+
166
+ ### Suite
167
+
168
+ | Status | Meaning |
169
+ | ----------- | ------------------------------------------------------------------- |
170
+ | `succeeded` | All cases passed (or zero cases yielded). |
171
+ | `failed` | Every case failed. |
172
+ | `partial` | Some passed, some failed — **the normal "1 of 10 failed" outcome**. |
173
+ | `cancelled` | Suite was aborted before all cases finished. |
174
+ | `errored` | The `generateItems` callback itself threw. |
175
+
176
+ The suite counters and status are re-derived from the final per-case statuses, so an "all workflows completed cleanly but assertions caught regressions" suite reports `partial` (or `failed`, if every case failed) rather than `succeeded`.
177
+
178
+ ## Best practices
179
+
180
+ - **Don't `throw` from `execute` to fail a case.** Throwing skips downstream nodes — including the Assertion node — so you lose all assertion data and only get a run-level error. Instead, let the workflow complete and assert on the (wrong) output. The assertion-rollup downgrades the case to `failed`.
181
+ - Use `score: 1`/`score: 0` for boolean checks (equality, contains, regex). The default `passThreshold = 0.5` handles them.
182
+ - Use `passThreshold` for continuous metrics (confidence, judge ratings, similarity).
183
+ - Reserve `errored: true` for assertion-code crashes, not low scores.
184
+ - Keep TestTriggers as source-controlled fixtures so historical chart comparisons are apples-to-apples.
185
+
186
+ ## What's deferred (Phase 2)
187
+
188
+ - **Test-input snapshots** — Phase 1 fetches inputs live every run (rolling-input). Snapshotting will land in Phase 2 for stable judge-score charts.
189
+ - **Declarative assertion shorthands** — `StringEqualsAssertion`, `JudgeByAgentAssertion`, etc. compose on top of the generic `Assertion` shipping today.
190
+ - **CLI / cron / GitHub PR integration** — currently suites are triggered manually only: via the UI, HTTP, or `TestSuiteOrchestrator` in code.
191
+
192
+ ## Read more
193
+
194
+ - Top-level walkthrough: [`docs/workflow-testing.md`](../../../../docs/workflow-testing.md)