hatch3r 1.7.1 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +38 -12
- package/agents/hatch3r-a11y-auditor.md +4 -0
- package/agents/hatch3r-architect.md +4 -0
- package/agents/hatch3r-ci-watcher.md +4 -0
- package/agents/hatch3r-context-rules.md +26 -6
- package/agents/hatch3r-creator.md +6 -1
- package/agents/hatch3r-dependency-auditor.md +4 -0
- package/agents/hatch3r-devops.md +4 -0
- package/agents/hatch3r-docs-writer.md +4 -0
- package/agents/hatch3r-fixer.md +4 -0
- package/agents/hatch3r-handoff-loader.md +243 -0
- package/agents/hatch3r-handoff-preparer.md +134 -0
- package/agents/hatch3r-implementer.md +12 -0
- package/agents/hatch3r-learnings-loader.md +5 -1
- package/agents/hatch3r-lint-fixer.md +4 -0
- package/agents/hatch3r-perf-profiler.md +8 -0
- package/agents/hatch3r-researcher.md +4 -0
- package/agents/hatch3r-reviewer.md +94 -0
- package/agents/hatch3r-security-auditor.md +24 -0
- package/agents/hatch3r-test-writer.md +4 -0
- package/agents/modes/requirements-elicitation.md +4 -1
- package/agents/modes/similar-implementation.md +6 -0
- package/agents/modes/user-flows.md +76 -0
- package/agents/shared/quality-charter.md +128 -0
- package/agents/shared/user-content-templates.md +31 -1
- package/commands/hatch3r-agent-customize.md +4 -0
- package/commands/hatch3r-api-spec.md +7 -0
- package/commands/hatch3r-benchmark.md +7 -0
- package/commands/hatch3r-board-fill.md +8 -0
- package/commands/hatch3r-board-groom.md +4 -0
- package/commands/hatch3r-board-init.md +51 -0
- package/commands/hatch3r-board-pickup.md +8 -0
- package/commands/hatch3r-board-refresh.md +4 -0
- package/commands/hatch3r-board-shared.md +6 -6
- package/commands/hatch3r-bug-plan.md +7 -0
- package/commands/hatch3r-codebase-map.md +8 -0
- package/commands/hatch3r-command-customize.md +4 -0
- package/commands/hatch3r-context-health.md +5 -0
- package/commands/hatch3r-create.md +59 -4
- package/commands/hatch3r-debug.md +7 -0
- package/commands/hatch3r-dep-audit.md +4 -0
- package/commands/hatch3r-feature-plan.md +7 -0
- package/commands/hatch3r-handoff.md +133 -0
- package/commands/hatch3r-healthcheck.md +4 -0
- package/commands/hatch3r-hooks.md +4 -0
- package/commands/hatch3r-learn.md +16 -0
- package/commands/hatch3r-migration-plan.md +7 -0
- package/commands/hatch3r-onboard.md +7 -0
- package/commands/hatch3r-pr-resolve.md +12 -1
- package/commands/hatch3r-project-spec.md +8 -0
- package/commands/hatch3r-quick-change.md +11 -2
- package/commands/hatch3r-recipe.md +4 -0
- package/commands/hatch3r-refactor-plan.md +7 -0
- package/commands/hatch3r-release.md +5 -0
- package/commands/hatch3r-revision.md +7 -0
- package/commands/hatch3r-roadmap.md +8 -0
- package/commands/hatch3r-rule-customize.md +4 -0
- package/commands/hatch3r-security-audit.md +4 -0
- package/commands/hatch3r-skill-customize.md +4 -0
- package/commands/hatch3r-test-plan.md +7 -0
- package/commands/hatch3r-workflow.md +11 -1
- package/dist/cli/index.js +4814 -1130
- package/dist/cli/index.js.map +1 -1
- package/package.json +10 -5
- package/rules/hatch3r-accessibility-standards.md +21 -0
- package/rules/hatch3r-accessibility-standards.mdc +21 -0
- package/rules/hatch3r-agent-orchestration-detail.md +3 -0
- package/rules/hatch3r-agent-orchestration-detail.mdc +3 -0
- package/rules/hatch3r-agent-orchestration.md +34 -3
- package/rules/hatch3r-agent-orchestration.mdc +34 -3
- package/rules/hatch3r-ai-evals.md +158 -0
- package/rules/hatch3r-ai-evals.mdc +154 -0
- package/rules/hatch3r-ai-ux-patterns.md +131 -0
- package/rules/hatch3r-ai-ux-patterns.mdc +127 -0
- package/rules/hatch3r-api-design.md +67 -9
- package/rules/hatch3r-api-design.mdc +67 -9
- package/rules/hatch3r-api-versioning.md +119 -0
- package/rules/hatch3r-api-versioning.mdc +115 -0
- package/rules/hatch3r-auth-patterns.md +170 -0
- package/rules/hatch3r-auth-patterns.mdc +166 -0
- package/rules/hatch3r-component-conventions.md +30 -0
- package/rules/hatch3r-component-conventions.mdc +30 -0
- package/rules/hatch3r-container-hardening.md +131 -0
- package/rules/hatch3r-container-hardening.mdc +127 -0
- package/rules/hatch3r-contract-testing.md +117 -0
- package/rules/hatch3r-contract-testing.mdc +113 -0
- package/rules/hatch3r-deep-context.md +2 -0
- package/rules/hatch3r-deep-context.mdc +2 -0
- package/rules/hatch3r-dependency-management.md +73 -1
- package/rules/hatch3r-dependency-management.mdc +72 -0
- package/rules/hatch3r-design-system-detection.md +142 -0
- package/rules/hatch3r-design-system-detection.mdc +138 -0
- package/rules/hatch3r-event-schema-evolution.md +90 -0
- package/rules/hatch3r-event-schema-evolution.mdc +86 -0
- package/rules/hatch3r-handoff-readiness.md +45 -0
- package/rules/hatch3r-handoff-readiness.mdc +40 -0
- package/rules/hatch3r-i18n.md +13 -0
- package/rules/hatch3r-i18n.mdc +13 -0
- package/rules/hatch3r-iteration-summary.md +2 -0
- package/rules/hatch3r-iteration-summary.mdc +2 -0
- package/rules/hatch3r-migrations.md +61 -16
- package/rules/hatch3r-migrations.mdc +61 -16
- package/rules/hatch3r-observability-logging.md +1 -1
- package/rules/hatch3r-observability-logging.mdc +1 -1
- package/rules/hatch3r-observability-metrics.md +1 -1
- package/rules/hatch3r-observability-metrics.mdc +1 -1
- package/rules/hatch3r-observability-tracing-detail.md +8 -149
- package/rules/hatch3r-observability-tracing-detail.mdc +7 -149
- package/rules/hatch3r-observability-tracing.md +154 -6
- package/rules/hatch3r-observability-tracing.mdc +154 -6
- package/rules/hatch3r-observability.md +1 -0
- package/rules/hatch3r-observability.mdc +1 -0
- package/rules/hatch3r-operability.md +149 -0
- package/rules/hatch3r-operability.mdc +145 -0
- package/rules/hatch3r-passkey-server.md +181 -0
- package/rules/hatch3r-passkey-server.mdc +177 -0
- package/rules/hatch3r-progressive-delivery.md +120 -0
- package/rules/hatch3r-progressive-delivery.mdc +116 -0
- package/rules/hatch3r-resilience-patterns.md +154 -0
- package/rules/hatch3r-resilience-patterns.mdc +150 -0
- package/rules/hatch3r-secrets-management.md +29 -0
- package/rules/hatch3r-secrets-management.mdc +29 -0
- package/rules/hatch3r-testing.md +139 -43
- package/rules/hatch3r-testing.mdc +139 -43
- package/rules/hatch3r-ux-states-and-flows.md +149 -0
- package/rules/hatch3r-ux-states-and-flows.mdc +145 -0
- package/skills/hatch3r-a11y-audit/SKILL.md +14 -0
- package/skills/hatch3r-agent-customize/SKILL.md +10 -0
- package/skills/hatch3r-ai-feature/SKILL.md +136 -0
- package/skills/hatch3r-api-spec/SKILL.md +73 -0
- package/skills/hatch3r-architecture-review/SKILL.md +14 -0
- package/skills/hatch3r-bug-fix/SKILL.md +5 -0
- package/skills/hatch3r-ci-pipeline/SKILL.md +14 -0
- package/skills/hatch3r-cli-aichat/SKILL.md +84 -0
- package/skills/hatch3r-cli-ast-grep/SKILL.md +85 -0
- package/skills/hatch3r-cli-az-devops/SKILL.md +89 -0
- package/skills/hatch3r-cli-bat/SKILL.md +85 -0
- package/skills/hatch3r-cli-comby/SKILL.md +85 -0
- package/skills/hatch3r-cli-csvkit/SKILL.md +84 -0
- package/skills/hatch3r-cli-delta/SKILL.md +86 -0
- package/skills/hatch3r-cli-difftastic/SKILL.md +84 -0
- package/skills/hatch3r-cli-docker/SKILL.md +89 -0
- package/skills/hatch3r-cli-duckdb/SKILL.md +84 -0
- package/skills/hatch3r-cli-fd/SKILL.md +85 -0
- package/skills/hatch3r-cli-fzf/SKILL.md +84 -0
- package/skills/hatch3r-cli-gh/SKILL.md +90 -0
- package/skills/hatch3r-cli-glab/SKILL.md +89 -0
- package/skills/hatch3r-cli-jq/SKILL.md +89 -0
- package/skills/hatch3r-cli-lazygit/SKILL.md +78 -0
- package/skills/hatch3r-cli-llm/SKILL.md +84 -0
- package/skills/hatch3r-cli-miller/SKILL.md +84 -0
- package/skills/hatch3r-cli-mods/SKILL.md +84 -0
- package/skills/hatch3r-cli-overview/SKILL.md +60 -0
- package/skills/hatch3r-cli-playwright/SKILL.md +89 -0
- package/skills/hatch3r-cli-podman/SKILL.md +84 -0
- package/skills/hatch3r-cli-qsv/SKILL.md +91 -0
- package/skills/hatch3r-cli-ripgrep/SKILL.md +85 -0
- package/skills/hatch3r-cli-rtk/SKILL.md +91 -0
- package/skills/hatch3r-cli-sd/SKILL.md +85 -0
- package/skills/hatch3r-cli-stagehand/SKILL.md +111 -0
- package/skills/hatch3r-cli-taplo/SKILL.md +84 -0
- package/skills/hatch3r-cli-yq/SKILL.md +85 -0
- package/skills/hatch3r-cli-zstd/SKILL.md +85 -0
- package/skills/hatch3r-command-customize/SKILL.md +10 -0
- package/skills/hatch3r-context-health/SKILL.md +14 -0
- package/skills/hatch3r-cost-tracking/SKILL.md +14 -0
- package/skills/hatch3r-customize/SKILL.md +17 -0
- package/skills/hatch3r-dep-audit/SKILL.md +14 -0
- package/skills/hatch3r-design-system-detect/SKILL.md +164 -0
- package/skills/hatch3r-feature/SKILL.md +2 -0
- package/skills/hatch3r-gh-agentic-workflows/SKILL.md +13 -0
- package/skills/hatch3r-handoff-prepare/SKILL.md +160 -0
- package/skills/hatch3r-handoff-resume/SKILL.md +171 -0
- package/skills/hatch3r-incident-response/SKILL.md +14 -0
- package/skills/hatch3r-issue-workflow/SKILL.md +5 -0
- package/skills/hatch3r-logical-refactor/SKILL.md +14 -0
- package/skills/hatch3r-migration/SKILL.md +14 -0
- package/skills/hatch3r-observability-verify/SKILL.md +134 -0
- package/skills/hatch3r-perf-audit/SKILL.md +14 -0
- package/skills/hatch3r-pr-creation/SKILL.md +14 -0
- package/skills/hatch3r-qa-validation/SKILL.md +18 -0
- package/skills/hatch3r-recipe/SKILL.md +14 -0
- package/skills/hatch3r-refactor/SKILL.md +14 -0
- package/skills/hatch3r-release/SKILL.md +14 -0
- package/skills/hatch3r-reliability-verify/SKILL.md +146 -0
- package/skills/hatch3r-rule-customize/SKILL.md +10 -0
- package/skills/hatch3r-skill-customize/SKILL.md +10 -0
- package/skills/hatch3r-ui-ux-verify/SKILL.md +138 -0
- package/skills/hatch3r-visual-refactor/SKILL.md +15 -1
package/rules/hatch3r-i18n.md
CHANGED
|
@@ -93,3 +93,16 @@ ICU MessageFormat 2.0 reached Final Candidate status in CLDR 46.1 (January 2025)
|
|
|
93
93
|
- **Migration strategy:** New translation keys should use MF2 syntax. Existing MF1 keys can be migrated incrementally — both syntaxes can coexist during transition.
|
|
94
94
|
- **Tooling:** Verify that your translation management system (TMS) supports MF2 syntax before migrating. Test with a small key set first.
|
|
95
95
|
- **Stability:** The MF2 specification has stability guarantees post-approval (mid-2025). Syntax and semantics will not change incompatibly after that point.
|
|
96
|
+
|
|
97
|
+
## Microcopy and Tone
|
|
98
|
+
|
|
99
|
+
Translation strings are user-facing copy — write them as product copy, not as technical labels.
|
|
100
|
+
|
|
101
|
+
- Use plain language. Default to second person ("you", "your") for end-user surfaces.
|
|
102
|
+
- Use a corrective verb in error messages: "Try again", "Reconnect", "Enter a valid email" — not "Error" or "Oops".
|
|
103
|
+
- Never expose to end users: protocol acronyms ("FIDO2", "WebAuthn"), raw HTTP status codes ("500", "401"), language sentinel values (`null`, `undefined`), or internal record/ID strings. Translate these into a user-visible cause + recovery.
|
|
104
|
+
- CTA labels are action-oriented and specific: "Save changes" beats "Submit"; "Delete project" beats "Confirm"; "Send invite" beats "OK".
|
|
105
|
+
- Error tone explains the cause and offers a recovery path. Do not blame the user. Replace "You entered an invalid value" with "This field needs a valid email address — for example, name@example.com".
|
|
106
|
+
- Use ICU MessageFormat (1.0 or 2.0 per the MF2 section above) for every plural, gender, and select pattern. Never concatenate translated fragments to build a sentence — each complete sentence is a single translation key with its own placeholders.
|
|
107
|
+
- Tone source-of-truth: the GOV.UK Design System content style guide (https://design-system.service.gov.uk/styles/) and IBM Carbon Design System voice and tone guidance (https://carbondesignsystem.com/guidelines/content/general/) — cite both when reviewing tone or training a translator.
|
|
108
|
+
- See the Microcopy subsection of `rules/hatch3r-ux-states-and-flows.md` for the state-driven copy patterns (loading, empty, error, partial) that share this tone contract.
|
package/rules/hatch3r-i18n.mdc
CHANGED
|
@@ -88,3 +88,16 @@ ICU MessageFormat 2.0 reached Final Candidate status in CLDR 46.1 (January 2025)
|
|
|
88
88
|
- **Migration strategy:** New translation keys should use MF2 syntax. Existing MF1 keys can be migrated incrementally — both syntaxes can coexist during transition.
|
|
89
89
|
- **Tooling:** Verify that your translation management system (TMS) supports MF2 syntax before migrating. Test with a small key set first.
|
|
90
90
|
- **Stability:** The MF2 specification has stability guarantees post-approval (mid-2025). Syntax and semantics will not change incompatibly after that point.
|
|
91
|
+
|
|
92
|
+
## Microcopy and Tone
|
|
93
|
+
|
|
94
|
+
Translation strings are user-facing copy — write them as product copy, not as technical labels.
|
|
95
|
+
|
|
96
|
+
- Use plain language. Default to second person ("you", "your") for end-user surfaces.
|
|
97
|
+
- Use a corrective verb in error messages: "Try again", "Reconnect", "Enter a valid email" — not "Error" or "Oops".
|
|
98
|
+
- Never expose to end users: protocol acronyms ("FIDO2", "WebAuthn"), raw HTTP status codes ("500", "401"), language sentinel values (`null`, `undefined`), or internal record/ID strings. Translate these into a user-visible cause + recovery.
|
|
99
|
+
- CTA labels are action-oriented and specific: "Save changes" beats "Submit"; "Delete project" beats "Confirm"; "Send invite" beats "OK".
|
|
100
|
+
- Error tone explains the cause and offers a recovery path. Do not blame the user. Replace "You entered an invalid value" with "This field needs a valid email address — for example, name@example.com".
|
|
101
|
+
- Use ICU MessageFormat (1.0 or 2.0 per the MF2 section above) for every plural, gender, and select pattern. Never concatenate translated fragments to build a sentence — each complete sentence is a single translation key with its own placeholders.
|
|
102
|
+
- Tone source-of-truth: the GOV.UK Design System content style guide (https://design-system.service.gov.uk/styles/) and IBM Carbon Design System voice and tone guidance (https://carbondesignsystem.com/guidelines/content/general/) — cite both when reviewing tone or training a translator.
|
|
103
|
+
- See the Microcopy subsection of `rules/hatch3r-ux-states-and-flows.md` for the state-driven copy patterns (loading, empty, error, partial) that share this tone contract.
|
|
@@ -67,6 +67,8 @@ Append only when they carry information. Do not include empty headers.
|
|
|
67
67
|
**Suggested Next Action:** {one line}
|
|
68
68
|
```
|
|
69
69
|
|
|
70
|
+
The **End-of-Turn Delegation Attestation** (defined in `hatch3r-agent-orchestration` -> End-of-Turn Delegation Attestation) is conditionally required and appears immediately BEFORE this Iteration Summary block. It applies when the turn is on a Tier >= 2 tracked task AND caused at least one file mutation. The Iteration Summary's 5-field contract is unchanged — the Attestation lives in a separate block to preserve backward compatibility for the 15 adapter outputs.
|
|
71
|
+
|
|
70
72
|
## Field Semantics
|
|
71
73
|
|
|
72
74
|
- **Outcome** is one sentence. The user should grasp what happened from this line alone.
|
|
@@ -62,6 +62,8 @@ Append only when they carry information. Do not include empty headers.
|
|
|
62
62
|
**Suggested Next Action:** {one line}
|
|
63
63
|
```
|
|
64
64
|
|
|
65
|
+
The **End-of-Turn Delegation Attestation** (defined in `hatch3r-agent-orchestration` -> End-of-Turn Delegation Attestation) is conditionally required and appears immediately BEFORE this Iteration Summary block. It applies when the turn is on a Tier >= 2 tracked task AND caused at least one file mutation. The Iteration Summary's 5-field contract is unchanged — the Attestation lives in a separate block to preserve backward compatibility for the 15 adapter outputs.
|
|
66
|
+
|
|
65
67
|
## Field Semantics
|
|
66
68
|
|
|
67
69
|
- **Outcome** is one sentence. The user should grasp what happened from this line alone.
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
id: hatch3r-migrations
|
|
3
3
|
type: rule
|
|
4
|
-
description: Database migration and schema change patterns
|
|
4
|
+
description: Database migration and schema change patterns — expand-contract, online DDL, backfills, compatibility windows, reversibility, multi-region, tooling
|
|
5
5
|
scope: "**/migrations/**,**/*migration*,**/migrate/**,**/seeds/**,**/seeders/**,**/prisma/migrations/**,**/drizzle/**,**/knex/**"
|
|
6
6
|
tags: [implementation, brownfield]
|
|
7
7
|
quality_charter: agents/shared/quality-charter.md
|
|
@@ -9,23 +9,68 @@ cache_friendly: true
|
|
|
9
9
|
---
|
|
10
10
|
# Migrations
|
|
11
11
|
|
|
12
|
-
- Schema changes must be backward-compatible. Add fields with defaults; never remove or rename without migration.
|
|
13
12
|
- Migration scripts live in a dedicated `migrations/` directory. One script per migration.
|
|
14
|
-
- Every migration is idempotent
|
|
15
|
-
-
|
|
16
|
-
-
|
|
17
|
-
- Order: deploy new code (handles old + new schema) → run migration → remove old schema handling.
|
|
18
|
-
- Document schema changes in project data model spec.
|
|
19
|
-
- Rollback plan required for every migration. Never run destructive migrations without backup verification.
|
|
20
|
-
- Hot documents must stay within size limits after migration.
|
|
13
|
+
- Every migration is idempotent (re-running produces the same result). Use a version column, `migratedAt` timestamp, or migration ledger row to skip already-applied work.
|
|
14
|
+
- Test every migration against an emulator or staging dataset before production. Verify data integrity after each step, not just at the end.
|
|
15
|
+
- Document the schema change in the project data model spec. Hot documents must stay within size limits after migration.
|
|
21
16
|
|
|
22
|
-
##
|
|
17
|
+
## Expand-Contract Pattern (mandatory for non-trivial schema changes)
|
|
23
18
|
|
|
24
|
-
-
|
|
25
|
-
- Include count checks: the number of records processed should match the number of records in the source collection. Log discrepancies as errors, not warnings.
|
|
26
|
-
- For large datasets, migrate in batches with progress checkpoints. If a batch fails, resume from the last checkpoint rather than restarting the entire migration.
|
|
19
|
+
Non-trivial = anything beyond pure-additive nullable columns on small tables, or any rename/drop/type-change. Use a 3-deploy cadence; split Migrate into two deploys when dual-write is required (4 deploys total).
|
|
27
20
|
|
|
28
|
-
|
|
21
|
+
1. **Deploy 1 — Expand.** Add new column nullable, add new table, or `CREATE INDEX CONCURRENTLY`. Add new constraints with `NOT VALID` first. Old code paths still work. No app behavior change in this deploy.
|
|
22
|
+
2. **Deploy 2 — Migrate (backfill + dual-write).** Run a batched, idempotent, resumable backfill job. If the change is a column rename / type swap, app code writes to both old and new columns during this phase. Validate row counts and per-block checksums on the new shape before proceeding.
|
|
23
|
+
3. **Deploy 3 — Contract.** Switch reads to the new shape (feature-flag-gated; flip is the rollback). Drop the old column, old table, or old index. Wait at least one full release cycle plus one on-call rotation between Expand and Contract — old code must remain executable to roll back inside the deploy window.
|
|
29
24
|
|
|
30
|
-
|
|
31
|
-
|
|
25
|
+
Hard rules: never rename a column in a single step; never add a `NOT NULL` column to a populated table without a default or a deferred `ADD CONSTRAINT ... NOT NULL ... NOT VALID` → `VALIDATE CONSTRAINT`; every phase must be valid in isolation so that any deploy is independently rollbackable.
|
|
26
|
+
|
|
27
|
+
## Online Schema Changes
|
|
28
|
+
|
|
29
|
+
Set `lock_timeout` and `statement_timeout` (Postgres) or `lock_wait_timeout` (MySQL) before every DDL statement to bound blast radius. Selection by engine:
|
|
30
|
+
|
|
31
|
+
- **Postgres 18.x.** Use `CREATE INDEX CONCURRENTLY` (outside any transaction block — disable the migration tool's transaction wrapper). On failure, the index is left `INVALID`; emit a `DROP INDEX IF EXISTS` + retry step. For FK and CHECK constraints, use `ALTER TABLE ... ADD CONSTRAINT ... NOT VALID` (skips the full-table scan at add time) followed later by `VALIDATE CONSTRAINT` (scans the table, but under `SHARE UPDATE EXCLUSIVE`, so reads and writes continue). Postgres 18 also supports `NOT VALID` on not-null constraints (`ALTER TABLE ... ADD CONSTRAINT ... NOT NULL col NOT VALID`, then `VALIDATE CONSTRAINT`). Use `pg_repack` 1.5.x for bloat removal instead of `VACUUM FULL`. Avoid `ALTER TABLE ... ADD COLUMN ... DEFAULT volatile_expression` (e.g., `random()`, `clock_timestamp()`) on large tables — it rewrites every row.
|
|
32
|
+
- **MySQL 8.4 LTS.** `ALGORITHM=INSTANT` is the default for many metadata ops (ADD COLUMN at end, RENAME COLUMN, some index meta) — verify against the 8.4 online DDL operations matrix. Hard limit: 64 row versions per table in 8.4. When `INSTANT` is rejected, fall back to `ALGORITHM=INPLACE`. For `ALGORITHM=COPY` operations on large tables, use `gh-ost` v1.1.8 (trigger-free, binlog-based, checkpoint + resume + revert) when the table has no incoming FKs and the cluster is not Galera / Percona XtraDB. Use `pt-online-schema-change` when FKs are present (`--alter-foreign-keys-method`) or under Galera. `lhm` is unmaintained — do not propose it for new code.
|
|
33
|
+
|
|
34
|
+
## Backfill Jobs
|
|
35
|
+
|
|
36
|
+
Every backfill must be batched, idempotent, resumable, throttled, and observable.
|
|
37
|
+
|
|
38
|
+
- **Batched.** Order by PK or a monotonic key. Chunk by `id BETWEEN ? AND ?` (range), not `LIMIT/OFFSET` — offsets drift under concurrent writes. Default chunk 1k–10k rows; tune by table width.
|
|
39
|
+
- **Idempotent.** Write `UPDATE ... SET new = f(old) WHERE id = ? AND new IS NULL` (or upsert with a deterministic source-derived value). Re-running on the same range must produce the same final state.
|
|
40
|
+
- **Resumable.** Persist the last-processed boundary (`last_id` or timestamp cursor) to a control table after each batch commit. Resume from the checkpoint on restart; never restart from zero on partial failure.
|
|
41
|
+
- **Throttled.** Poll replication lag (`pg_stat_replication`, `SHOW REPLICA STATUS`) between batches; pause when lag exceeds 30 seconds or the project's lag SLO, whichever is lower. Cap concurrency at the IO budget of the slowest replica.
|
|
42
|
+
- **Observable.** Emit `migration.backfill.rows_processed` (counter), `migration.backfill.error_rate` (counter), `migration.backfill.eta_seconds` (gauge), and `migration.backfill.current_boundary` (gauge). Wire dashboards before launch. Avoid single mega-DML — one `UPDATE` over 50M+ rows produces multi-hour locks and table bloat.
|
|
43
|
+
|
|
44
|
+
## Compatibility Window
|
|
45
|
+
|
|
46
|
+
Schema changes deploy before the code that depends on them when widening (add column, add table, add index). Schema changes deploy after the code that no longer depends on them when narrowing (drop column, drop table). During the window, app code reads both shapes — the new shape if populated, fall back to the old shape otherwise. Rollback compatibility (old code remains executable against the current schema) must hold for at least 1 full release cycle plus 1 on-call rotation — minimum 7 calendar days, longer when the on-call rotation is longer.
|
|
47
|
+
|
|
48
|
+
## Reversibility
|
|
49
|
+
|
|
50
|
+
Every migration ships a tested down-migration script. Forward-only migrations are permitted only when the operation is data-destructive (e.g., a `DROP COLUMN` after Contract) — these require an explicit `IRREVERSIBLE: <reason>` annotation in the migration header and reviewer sign-off. A compensating forward migration that restores the prior shape is acceptable in place of a down-script for tools that lack reversibility (Prisma Migrate, Drizzle Kit — surface the gap to the reviewer). Default for every migration: reversible.
|
|
51
|
+
|
|
52
|
+
## Data Integrity Verification
|
|
53
|
+
|
|
54
|
+
Apply layered verification from cheapest to most thorough; stop at the cheapest layer that detects no drift.
|
|
55
|
+
|
|
56
|
+
1. **Pre-migration backup drill.** Full restore to staging plus a smoke query within 24 hours prior to a destructive migration. "Backup exists" is not verification.
|
|
57
|
+
2. **Row-count parity per chunk.** Source rows processed equals target rows written. Log discrepancies as errors, not warnings.
|
|
58
|
+
3. **Aggregate checks.** SUM, MIN, MAX, COUNT(DISTINCT) on numeric and date columns per partition or batch.
|
|
59
|
+
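4. **Per-block checksums.** SHA-256 or MD5 over concatenated key columns for blocks of N rows, with a field delimiter and explicit NULL handling so that distinct rows cannot collide and rows with NULL values are not silently dropped (e.g., `md5(string_agg(id::text || ':' || coalesce(col::text, ''), ',' ORDER BY id))`).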
|
|
60
|
+
5. **Cross-system diff.** Datafold Reconcile, dbt-data-diff, or a hand-rolled sample-then-drill comparison for value-level differences.
|
|
61
|
+
6. **Canary dual-read.** Read both shapes in production for 24–72 hours before cutover; shadow-diff and alert on mismatch.
|
|
62
|
+
7. **Reconciliation control table.** Per-batch row count plus checksum stored alongside the checkpoint; auto-stop the backfill on drift above the configured threshold.
|
|
63
|
+
|
|
64
|
+
## Multi-Region & Replica Lag
|
|
65
|
+
|
|
66
|
+
- Pause backfill writes when any replica lag exceeds 30 seconds (or the project's lag SLO, whichever is lower). Resume only after lag returns to baseline for 5 consecutive minutes.
|
|
67
|
+
- Roll migrations across regions sequentially; never alter an active partition during the peak traffic window of any region.
|
|
68
|
+
- FK validation (`VALIDATE CONSTRAINT`) reads the entire dependent table — schedule outside peak read windows on replica-heavy topologies.
|
|
69
|
+
- For Postgres major-version upgrades, use native logical replication (PG17+ preserves slots through `pg_upgrade`); advance sequences manually at cutover — logical replication does not replicate sequences, DDL, or large objects.
|
|
70
|
+
- For ongoing cross-system replication, prefer Debezium (when Kafka is already deployed) or AWS DMS (managed, AWS-native). DMS hard limit: 200 tasks per replication instance — relevant for schema-per-tenant designs.
|
|
71
|
+
|
|
72
|
+
## Tooling Mandate
|
|
73
|
+
|
|
74
|
+
Pick one schema-management tool per project and commit the schema declaration to the repo. Greenfield default: Atlas (50+ destructive/locking linters, auto-generated down migrations, GitHub Actions approval policies) or dbmate (plain-SQL portability with first-class `-- migrate:down`). Existing-project default: whatever already ships migrations in the repo. Acceptable tools: Atlas, Prisma Migrate (forward-only — surface to reviewer), Drizzle Kit (forward-only — surface to reviewer), Flyway 11+, Liquibase 4.27+ Pro, sqitch, Alembic, Knex, dbmate, Bytebase. Run a migration linter in CI — Atlas analyze, `squawk` for raw Postgres SQL — fail the PR on destructive operations without an explicit `IRREVERSIBLE:` annotation.
|
|
75
|
+
|
|
76
|
+
Cross-references: see `hatch3r-data-classification` (PII / encrypted-column migration requirements), `hatch3r-feature-flags` (read-path switchover gating), `hatch3r-observability-metrics` (backfill progress metrics).
|
|
@@ -1,27 +1,72 @@
|
|
|
1
1
|
---
|
|
2
|
-
description: Database migration and schema change patterns
|
|
2
|
+
description: Database migration and schema change patterns — expand-contract, online DDL, backfills, compatibility windows, reversibility, multi-region, tooling
|
|
3
3
|
globs: ["**/migrations/**", "**/*migration*", "**/migrate/**", "**/seeds/**", "**/seeders/**", "**/prisma/migrations/**", "**/drizzle/**", "**/knex/**"]
|
|
4
4
|
alwaysApply: false
|
|
5
5
|
---
|
|
6
6
|
# Migrations
|
|
7
7
|
|
|
8
|
-
- Schema changes must be backward-compatible. Add fields with defaults; never remove or rename without migration.
|
|
9
8
|
- Migration scripts live in a dedicated `migrations/` directory. One script per migration.
|
|
10
|
-
- Every migration is idempotent
|
|
11
|
-
-
|
|
12
|
-
-
|
|
13
|
-
- Order: deploy new code (handles old + new schema) → run migration → remove old schema handling.
|
|
14
|
-
- Document schema changes in project data model spec.
|
|
15
|
-
- Rollback plan required for every migration. Never run destructive migrations without backup verification.
|
|
16
|
-
- Hot documents must stay within size limits after migration.
|
|
9
|
+
- Every migration is idempotent (re-running produces the same result). Use a version column, `migratedAt` timestamp, or migration ledger row to skip already-applied work.
|
|
10
|
+
- Test every migration against an emulator or staging dataset before production. Verify data integrity after each step, not just at the end.
|
|
11
|
+
- Document the schema change in the project data model spec. Hot documents must stay within size limits after migration.
|
|
17
12
|
|
|
18
|
-
##
|
|
13
|
+
## Expand-Contract Pattern (mandatory for non-trivial schema changes)
|
|
19
14
|
|
|
20
|
-
-
|
|
21
|
-
- Include count checks: the number of records processed should match the number of records in the source collection. Log discrepancies as errors, not warnings.
|
|
22
|
-
- For large datasets, migrate in batches with progress checkpoints. If a batch fails, resume from the last checkpoint rather than restarting the entire migration.
|
|
15
|
+
Non-trivial = anything beyond pure-additive nullable columns on small tables, or any rename/drop/type-change. Use a 3-deploy cadence; split Migrate into two deploys when dual-write is required (4 deploys total).
|
|
23
16
|
|
|
24
|
-
|
|
17
|
+
1. **Deploy 1 — Expand.** Add new column nullable, add new table, or `CREATE INDEX CONCURRENTLY`. Add new constraints with `NOT VALID` first. Old code paths still work. No app behavior change in this deploy.
|
|
18
|
+
2. **Deploy 2 — Migrate (backfill + dual-write).** Run a batched, idempotent, resumable backfill job. If the change is a column rename / type swap, app code writes to both old and new columns during this phase. Validate row counts and per-block checksums on the new shape before proceeding.
|
|
19
|
+
3. **Deploy 3 — Contract.** Switch reads to the new shape (feature-flag-gated; flip is the rollback). Drop the old column, old table, or old index. Wait at least one full release cycle plus one on-call rotation between Expand and Contract — old code must remain executable to roll back inside the deploy window.
|
|
25
20
|
|
|
26
|
-
|
|
27
|
-
|
|
21
|
+
Hard rules: never rename a column in a single step; never add a `NOT NULL` column to a populated table without a default or a deferred `ADD CONSTRAINT ... NOT NULL ... NOT VALID` → `VALIDATE CONSTRAINT`; every phase must be valid in isolation so that any deploy is independently rollbackable.
|
|
22
|
+
|
|
23
|
+
## Online Schema Changes
|
|
24
|
+
|
|
25
|
+
Set `lock_timeout` and `statement_timeout` (Postgres) or `lock_wait_timeout` (MySQL) before every DDL statement to bound blast radius. Selection by engine:
|
|
26
|
+
|
|
27
|
+
- **Postgres 18.x.** Use `CREATE INDEX CONCURRENTLY` (outside any transaction block — disable the migration tool's transaction wrapper). On failure, the index is left `INVALID`; emit a `DROP INDEX IF EXISTS` + retry step. For FK and CHECK constraints, use `ALTER TABLE ... ADD CONSTRAINT ... NOT VALID` (skips the full-table scan at add time) followed later by `VALIDATE CONSTRAINT` (scans the table, but under `SHARE UPDATE EXCLUSIVE`, so reads and writes continue). Postgres 18 also supports `NOT VALID` on not-null constraints (`ALTER TABLE ... ADD CONSTRAINT ... NOT NULL col NOT VALID`, then `VALIDATE CONSTRAINT`). Use `pg_repack` 1.5.x for bloat removal instead of `VACUUM FULL`. Avoid `ALTER TABLE ... ADD COLUMN ... DEFAULT volatile_expression` (e.g., `random()`, `clock_timestamp()`) on large tables — it rewrites every row.
|
|
28
|
+
- **MySQL 8.4 LTS.** `ALGORITHM=INSTANT` is the default for many metadata ops (ADD COLUMN at end, RENAME COLUMN, some index meta) — verify against the 8.4 online DDL operations matrix. Hard limit: 64 row versions per table in 8.4. When `INSTANT` is rejected, fall back to `ALGORITHM=INPLACE`. For `ALGORITHM=COPY` operations on large tables, use `gh-ost` v1.1.8 (trigger-free, binlog-based, checkpoint + resume + revert) when the table has no incoming FKs and the cluster is not Galera / Percona XtraDB. Use `pt-online-schema-change` when FKs are present (`--alter-foreign-keys-method`) or under Galera. `lhm` is unmaintained — do not propose it for new code.
|
|
29
|
+
|
|
30
|
+
## Backfill Jobs
|
|
31
|
+
|
|
32
|
+
Every backfill must be batched, idempotent, resumable, throttled, and observable.
|
|
33
|
+
|
|
34
|
+
- **Batched.** Order by PK or a monotonic key. Chunk by `id BETWEEN ? AND ?` (range), not `LIMIT/OFFSET` — offsets drift under concurrent writes. Default chunk 1k–10k rows; tune by table width.
|
|
35
|
+
- **Idempotent.** Write `UPDATE ... SET new = f(old) WHERE id = ? AND new IS NULL` (or upsert with a deterministic source-derived value). Re-running on the same range must produce the same final state.
|
|
36
|
+
- **Resumable.** Persist the last-processed boundary (`last_id` or timestamp cursor) to a control table after each batch commit. Resume from the checkpoint on restart; never restart from zero on partial failure.
|
|
37
|
+
- **Throttled.** Poll replication lag (`pg_stat_replication`, `SHOW REPLICA STATUS`) between batches; pause when lag exceeds 30 seconds or the SLO threshold. Cap concurrency at the IO budget of the slowest replica.
|
|
38
|
+
- **Observable.** Emit `migration.backfill.rows_processed` (counter), `migration.backfill.error_rate` (counter), `migration.backfill.eta_seconds` (gauge), and `migration.backfill.current_boundary` (gauge). Wire dashboards before launch. Avoid single mega-DML — one `UPDATE` over 50M+ rows produces multi-hour locks and table bloat.
|
|
39
|
+
|
|
40
|
+
## Compatibility Window
|
|
41
|
+
|
|
42
|
+
Schema changes deploy before the code that depends on them when widening (add column, add table, add index). Schema changes deploy after the code that no longer depends on them when narrowing (drop column, drop table). During the window, app code reads both shapes — the new shape if populated, fall back to the old shape otherwise. Rollback compatibility (old code remains executable against the current schema) must hold for at least 1 full release cycle plus 1 on-call rotation — minimum 7 calendar days, longer when the on-call rotation is longer.
|
|
43
|
+
|
|
44
|
+
## Reversibility
|
|
45
|
+
|
|
46
|
+
Every migration ships a tested down-migration script. Forward-only migrations are permitted only when the operation is data-destructive (e.g., a `DROP COLUMN` after Contract) — these require an explicit `IRREVERSIBLE: <reason>` annotation in the migration header and reviewer sign-off. A compensating forward migration that restores the prior shape is acceptable in place of a down-script for tools that lack reversibility (Prisma Migrate, Drizzle Kit — surface the gap to the reviewer). Default for every migration: reversible.
|
|
47
|
+
|
|
48
|
+
## Data Integrity Verification
|
|
49
|
+
|
|
50
|
+
Apply layered verification from cheapest to most thorough; stop at the cheapest layer that detects no drift.
|
|
51
|
+
|
|
52
|
+
1. **Pre-migration backup drill.** Full restore to staging plus a smoke query within 24 hours prior to a destructive migration. "Backup exists" is not verification.
|
|
53
|
+
2. **Row-count parity per chunk.** Source rows processed equals target rows written. Log discrepancies as errors, not warnings.
|
|
54
|
+
3. **Aggregate checks.** SUM, MIN, MAX, COUNT(DISTINCT) on numeric and date columns per partition or batch.
|
|
55
|
+
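4. **Per-block checksums.** SHA-256 or MD5 over concatenated key columns for blocks of N rows, with a field delimiter and explicit NULL handling so that distinct rows cannot collide or silently drop out of the aggregate (e.g., `md5(string_agg(id::text || ':' || coalesce(col::text, ''), ',' ORDER BY id))`).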
|
|
56
|
+
5. **Cross-system diff.** Datafold Reconcile, dbt-data-diff, or a hand-rolled sample-then-drill comparison for value-level differences.
|
|
57
|
+
6. **Canary dual-read.** Read both shapes in production for 24–72 hours before cutover; shadow-diff and alert on mismatch.
|
|
58
|
+
7. **Reconciliation control table.** Per-batch row count plus checksum stored alongside the checkpoint; auto-stop the backfill on drift above the configured threshold.
|
|
59
|
+
|
|
60
|
+
## Multi-Region & Replica Lag
|
|
61
|
+
|
|
62
|
+
- Pause backfill writes when any replica lag exceeds 30 seconds (or the project's lag SLO, whichever is lower). Resume only after lag returns to baseline for 5 consecutive minutes.
|
|
63
|
+
- Roll migrations across regions sequentially; never alter an active partition during the peak traffic window of any region.
|
|
64
|
+
- FK validation (`VALIDATE CONSTRAINT`) reads the entire dependent table — schedule outside peak read windows on replica-heavy topologies.
|
|
65
|
+
- For Postgres major-version upgrades, use native logical replication (PG17+ preserves slots through `pg_upgrade`); advance sequences manually at cutover — logical replication does not replicate sequences, DDL, or large objects.
|
|
66
|
+
- For ongoing cross-system replication, prefer Debezium (when Kafka is already deployed) or AWS DMS (managed, AWS-native). DMS hard limit: 200 tasks per replication instance — relevant for schema-per-tenant designs.
|
|
67
|
+
|
|
68
|
+
## Tooling Mandate
|
|
69
|
+
|
|
70
|
+
Pick one schema-management tool per project and commit the schema declaration to the repo. Greenfield default: Atlas (50+ destructive/locking linters, auto-generated down migrations, GitHub Actions approval policies) or dbmate (plain-SQL portability with first-class `-- migrate:down`). Existing-project default: whatever already ships migrations in the repo. Acceptable tools: Atlas, Prisma Migrate (forward-only — surface to reviewer), Drizzle Kit (forward-only — surface), Flyway 11+, Liquibase 4.27+ Pro, sqitch, Alembic, Knex, dbmate, Bytebase. Run a migration linter in CI — Atlas analyze, `squawk` for raw Postgres SQL — fail the PR on destructive operations without an explicit `IRREVERSIBLE:` annotation.
|
|
71
|
+
|
|
72
|
+
Cross-references: see `hatch3r-data-classification` (PII / encrypted-column migration requirements), `hatch3r-feature-flags` (read-path switchover gating), `hatch3r-observability-metrics` (backfill progress metrics).
|
|
@@ -3,7 +3,7 @@ id: hatch3r-observability-logging
|
|
|
3
3
|
type: rule
|
|
4
4
|
description: Structured logging and error reporting conventions for the project
|
|
5
5
|
scope: conditional
|
|
6
|
-
globs: "**/*log*,**/*logger*,**/*logging*,**/*error*,**/observability/**"
|
|
6
|
+
globs: "**/*log*,**/*logger*,**/*logging*,**/*error*,**/observability/**,**/routes/**,**/handlers/**,**/services/**,**/api/**,**/middleware/**,**/controllers/**,**/lib/**"
|
|
7
7
|
tags: [devops]
|
|
8
8
|
quality_charter: agents/shared/quality-charter.md
|
|
9
9
|
cache_friendly: true
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
description: Structured logging and error reporting conventions for the project
|
|
3
|
-
globs: ["**/*log*", "**/*logger*", "**/*logging*", "**/*error*", "**/observability/**"]
|
|
3
|
+
globs: ["**/*log*", "**/*logger*", "**/*logging*", "**/*error*", "**/observability/**", "**/routes/**", "**/handlers/**", "**/services/**", "**/api/**", "**/middleware/**", "**/controllers/**", "**/lib/**"]
|
|
4
4
|
alwaysApply: false
|
|
5
5
|
---
|
|
6
6
|
# Observability -- Logging & Error Reporting
|
|
@@ -3,7 +3,7 @@ id: hatch3r-observability-metrics
|
|
|
3
3
|
type: rule
|
|
4
4
|
description: Metrics, SLO/SLI definitions, alerting, and dashboard conventions for the project
|
|
5
5
|
scope: conditional
|
|
6
|
-
globs: "**/*metric*,**/*slo*,**/*sli*,**/*alert*,**/*dashboard*,**/observability/**"
|
|
6
|
+
globs: "**/*metric*,**/*slo*,**/*sli*,**/*alert*,**/*dashboard*,**/observability/**,**/routes/**,**/handlers/**,**/services/**,**/api/**,**/middleware/**,**/controllers/**,**/lib/**"
|
|
7
7
|
tags: [devops]
|
|
8
8
|
quality_charter: agents/shared/quality-charter.md
|
|
9
9
|
cache_friendly: true
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
---
|
|
2
2
|
description: Metrics, SLO/SLI definitions, alerting, and dashboard conventions for the project
|
|
3
|
-
globs: ["**/*metric*", "**/*slo*", "**/*sli*", "**/*alert*", "**/*dashboard*", "**/observability/**"]
|
|
3
|
+
globs: ["**/*metric*", "**/*slo*", "**/*sli*", "**/*alert*", "**/*dashboard*", "**/observability/**", "**/routes/**", "**/handlers/**", "**/services/**", "**/api/**", "**/middleware/**", "**/controllers/**", "**/lib/**"]
|
|
4
4
|
alwaysApply: false
|
|
5
5
|
---
|
|
6
6
|
# Observability -- Metrics, SLOs & Alerting
|
|
@@ -1,161 +1,20 @@
|
|
|
1
1
|
---
|
|
2
2
|
id: hatch3r-observability-tracing-detail
|
|
3
3
|
type: rule
|
|
4
|
-
description:
|
|
4
|
+
description: "[Deprecated] AI agent tracing detail rule -- consolidated into hatch3r-observability-tracing's AI Agent Instrumentation section"
|
|
5
5
|
scope: conditional
|
|
6
|
-
globs: "**/*trac*,**/*span*,**/*telemetry*,**/*otel*,**/*agent*,**/observability/**"
|
|
6
|
+
globs: "**/*trac*,**/*span*,**/*telemetry*,**/*otel*,**/*agent*,**/observability/**,**/routes/**,**/handlers/**,**/services/**,**/api/**,**/middleware/**,**/controllers/**,**/lib/**"
|
|
7
7
|
tags: [devops]
|
|
8
8
|
quality_charter: agents/shared/quality-charter.md
|
|
9
|
+
deprecated: true
|
|
9
10
|
cache_friendly: true
|
|
10
11
|
---
|
|
11
|
-
# Observability -- Tracing Extended Reference
|
|
12
|
+
# Observability -- Tracing Extended Reference (Deprecated Redirect)
|
|
12
13
|
|
|
13
|
-
|
|
14
|
+
This rule has been merged into `hatch3r-observability-tracing`. Load that rule for AI agent instrumentation, tool call spans, LLM request/response tracing, tool call audit trails, and correlation ID patterns.
|
|
14
15
|
|
|
15
|
-
|
|
16
|
+
- See `hatch3r-observability-tracing` § "AI Agent Instrumentation" for: GenAI span attributes, agent invocation spans, tool call spans, LLM request/response tracing, tool call audit trail, correlation IDs for agent workflows.
|
|
16
17
|
|
|
17
|
-
|
|
18
|
+
<!-- DEPRECATED-CONTENT-REMOVED -->
|
|
18
19
|
|
|
19
|
-
|
|
20
|
-
|-----------|------|-------------|---------|
|
|
21
|
-
| `gen_ai.system` | string | GenAI provider system name | `openai`, `anthropic`, `azure_openai` |
|
|
22
|
-
| `gen_ai.request.model` | string | Model name as specified in the request | `gpt-4o`, `claude-sonnet-4-20250514` |
|
|
23
|
-
| `gen_ai.response.model` | string | Model name as returned in the response | `gpt-4o-2024-08-06` |
|
|
24
|
-
| `gen_ai.request.max_tokens` | int | Maximum tokens requested for generation | `4096` |
|
|
25
|
-
| `gen_ai.request.temperature` | float | Temperature parameter | `0.7` |
|
|
26
|
-
| `gen_ai.response.finish_reasons` | string[] | Reasons the model stopped generating | `["stop"]`, `["length"]` |
|
|
27
|
-
| `gen_ai.usage.input_tokens` | int | Tokens in the input/prompt | `1250` |
|
|
28
|
-
| `gen_ai.usage.output_tokens` | int | Tokens in the generated output | `530` |
|
|
29
|
-
|
|
30
|
-
- Always set `gen_ai.system` and `gen_ai.request.model` on every GenAI span.
|
|
31
|
-
- Record `gen_ai.usage.input_tokens` and `gen_ai.usage.output_tokens` from the API response for cost dashboards.
|
|
32
|
-
- Use `gen_ai.response.finish_reasons` to detect truncated outputs (`length`) and trigger re-prompting.
|
|
33
|
-
|
|
34
|
-
## Agent Invocation Spans
|
|
35
|
-
|
|
36
|
-
Instrument the full lifecycle of an agent invocation with a dedicated span. This span is the parent for all LLM calls, tool executions, and sub-agent delegations.
|
|
37
|
-
|
|
38
|
-
- **Span name pattern:** `agent.{agent_name}.invoke`
|
|
39
|
-
- **Required attributes:** `agent.id`, `agent.name`, `agent.parent_id`, `agent.task`, `agent.framework`
|
|
40
|
-
- **Span events for state transitions:** `agent.planning`, `agent.tool_selection`, `agent.awaiting_human`, `agent.delegating`, `agent.completed`, `agent.error`
|
|
41
|
-
|
|
42
|
-
```typescript
|
|
43
|
-
const agentSpan = tracer.startSpan('agent.code_reviewer.invoke', {
|
|
44
|
-
attributes: {
|
|
45
|
-
'agent.id': invocationId,
|
|
46
|
-
'agent.name': 'code_reviewer',
|
|
47
|
-
'agent.parent_id': parentAgentId ?? '',
|
|
48
|
-
'agent.task': `review PR #${prNumber}`,
|
|
49
|
-
'agent.framework': 'custom',
|
|
50
|
-
},
|
|
51
|
-
});
|
|
52
|
-
agentSpan.addEvent('agent.planning');
|
|
53
|
-
// ... agent reasoning and tool calls happen as child spans ...
|
|
54
|
-
agentSpan.addEvent('agent.completed');
|
|
55
|
-
agentSpan.end();
|
|
56
|
-
```
|
|
57
|
-
|
|
58
|
-
## Tool Call Spans
|
|
59
|
-
|
|
60
|
-
Every tool invocation by an agent creates a child span of the agent invocation span.
|
|
61
|
-
|
|
62
|
-
- **Span name pattern:** `tool.{tool_name}.execute`
|
|
63
|
-
- **Required attributes:** `tool.name`, `tool.input_hash` (SHA-256), `tool.output_status`, `tool.duration_ms`, `tool.parameters_count`
|
|
64
|
-
- Tool spans must be children of the invoking agent span. Set span status to `ERROR` when `tool.output_status` is `error` or `timeout`.
|
|
65
|
-
- For tools performing I/O, create nested child spans using appropriate semantic conventions (`http.*`, `db.*`).
|
|
66
|
-
|
|
67
|
-
```typescript
|
|
68
|
-
const toolSpan = tracer.startSpan(
|
|
69
|
-
'tool.git_diff.execute',
|
|
70
|
-
{ attributes: { 'tool.name': 'git_diff' } },
|
|
71
|
-
trace.setSpan(context.active(), agentSpan),
|
|
72
|
-
);
|
|
73
|
-
try {
|
|
74
|
-
const result = await tools.gitDiff(params);
|
|
75
|
-
toolSpan.setAttributes({
|
|
76
|
-
'tool.output_status': 'success',
|
|
77
|
-
'tool.duration_ms': performance.now() - startTime,
|
|
78
|
-
'tool.input_hash': hashInput(params),
|
|
79
|
-
});
|
|
80
|
-
} catch (err) {
|
|
81
|
-
toolSpan.setAttributes({ 'tool.output_status': 'error' });
|
|
82
|
-
toolSpan.setStatus({ code: SpanStatusCode.ERROR, message: err.message });
|
|
83
|
-
toolSpan.recordException(err);
|
|
84
|
-
throw err;
|
|
85
|
-
} finally {
|
|
86
|
-
toolSpan.end();
|
|
87
|
-
}
|
|
88
|
-
```
|
|
89
|
-
|
|
90
|
-
## LLM Request/Response Tracing
|
|
91
|
-
|
|
92
|
-
- **Span name pattern:** `gen_ai.{operation}` (e.g., `gen_ai.chat`, `gen_ai.completion`)
|
|
93
|
-
- **Token tracking:** Capture `gen_ai.usage.input_tokens` and `gen_ai.usage.output_tokens`. Aggregate in metrics: Counter `gen_ai.tokens_total` with labels `{direction, model, agent_name}`, Histogram `gen_ai.request_duration_ms`.
|
|
94
|
-
- **Model version tracking:** Record both `gen_ai.request.model` and `gen_ai.response.model` for drift detection.
|
|
95
|
-
- **Retry spans:** Each retry attempt is a separate child span. Set `gen_ai.request.retries` on the final span. Record `http.response.status_code` on failed spans (429 vs 500+).
|
|
96
|
-
- Never log raw prompt content or full model responses as span attributes. Use token counts for cost tracking and correlated logs for prompt debugging in non-production environments.
|
|
97
|
-
- Sample GenAI spans at 50-100% in production (higher than general spans) because each call is expensive and low volume.
|
|
98
|
-
|
|
99
|
-
## Tool Call Audit Trail
|
|
100
|
-
|
|
101
|
-
Maintain a structured audit log for every tool invocation in agentic workflows, separate from tracing spans.
|
|
102
|
-
|
|
103
|
-
| Field | Type | Description |
|
|
104
|
-
|-------|------|-------------|
|
|
105
|
-
| `tool.name` | string | Name of the tool invoked |
|
|
106
|
-
| `tool.input_hash` | string | SHA-256 hash of tool input (never log raw input) |
|
|
107
|
-
| `tool.output_status` | string | `success`, `error`, `timeout`, or `denied` |
|
|
108
|
-
| `tool.duration_ms` | float | Execution time in milliseconds |
|
|
109
|
-
| `agent.id` | string | ID of the invoking agent |
|
|
110
|
-
| `agent.name` | string | Human-readable agent name |
|
|
111
|
-
| `correlation.id` | string | Trace correlation ID |
|
|
112
|
-
| `timestamp` | string | ISO 8601 timestamp |
|
|
113
|
-
| `session.id` | string | Session identifier |
|
|
114
|
-
|
|
115
|
-
- Log tool invocations at `info` level, failures at `error` level with `error.type` and `error.message`.
|
|
116
|
-
- Aggregate tool call counts per agent per session for anomaly detection.
|
|
117
|
-
- Retain audit logs for a minimum of 90 days.
|
|
118
|
-
|
|
119
|
-
## Correlation IDs for Agent Workflows
|
|
120
|
-
|
|
121
|
-
- Use UUIDv4 with workflow-type prefix: `{workflow-type}-{uuid}` (e.g., `agent-run-550e8400-...`).
|
|
122
|
-
- Generate at the workflow entry point. Propagate to all sub-agents and tool calls.
|
|
123
|
-
- Every log entry, span, and metric must include `correlation.id`.
|
|
124
|
-
- Cross-process: propagate via `X-Correlation-ID` header alongside W3C Trace Context.
|
|
125
|
-
- Use OpenTelemetry `SpanLink` for cross-workflow references (e.g., agent run triggered by CI event).
|
|
126
|
-
|
|
127
|
-
```typescript
|
|
128
|
-
import { randomUUID } from 'node:crypto';
|
|
129
|
-
import { context, trace, SpanStatusCode } from '@opentelemetry/api';
|
|
130
|
-
|
|
131
|
-
function generateCorrelationId(workflowType: string): string {
|
|
132
|
-
return `${workflowType}-${randomUUID()}`;
|
|
133
|
-
}
|
|
134
|
-
|
|
135
|
-
async function runAgentWorkflow(task: string): Promise<void> {
|
|
136
|
-
const correlationId = generateCorrelationId('agent-run');
|
|
137
|
-
const tracer = trace.getTracer('agent-orchestrator');
|
|
138
|
-
const rootSpan = tracer.startSpan('agent.orchestrator.invoke', {
|
|
139
|
-
attributes: {
|
|
140
|
-
'correlation.id': correlationId,
|
|
141
|
-
'agent.name': 'orchestrator',
|
|
142
|
-
'agent.task': task,
|
|
143
|
-
},
|
|
144
|
-
});
|
|
145
|
-
try {
|
|
146
|
-
await context.with(trace.setSpan(context.active(), rootSpan), async () => {
|
|
147
|
-
await delegateToSubAgent('code_reviewer', {
|
|
148
|
-
correlationId,
|
|
149
|
-
parentSpanId: rootSpan.spanContext().spanId,
|
|
150
|
-
task: 'review changes',
|
|
151
|
-
});
|
|
152
|
-
});
|
|
153
|
-
} catch (err) {
|
|
154
|
-
rootSpan.setStatus({ code: SpanStatusCode.ERROR, message: (err as Error).message });
|
|
155
|
-
rootSpan.recordException(err as Error);
|
|
156
|
-
throw err;
|
|
157
|
-
} finally {
|
|
158
|
-
rootSpan.end();
|
|
159
|
-
}
|
|
160
|
-
}
|
|
161
|
-
```
|
|
20
|
+
The full content has been migrated to `hatch3r-observability-tracing`.
|