autonomous-coding-toolkit 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude-plugin/marketplace.json +22 -0
- package/.claude-plugin/plugin.json +13 -0
- package/LICENSE +21 -0
- package/Makefile +21 -0
- package/README.md +140 -0
- package/SECURITY.md +28 -0
- package/agents/bash-expert.md +113 -0
- package/agents/dependency-auditor.md +138 -0
- package/agents/integration-tester.md +120 -0
- package/agents/lesson-scanner.md +149 -0
- package/agents/python-expert.md +179 -0
- package/agents/service-monitor.md +141 -0
- package/agents/shell-expert.md +147 -0
- package/benchmarks/runner.sh +147 -0
- package/benchmarks/tasks/01-rest-endpoint/rubric.sh +29 -0
- package/benchmarks/tasks/01-rest-endpoint/task.md +17 -0
- package/benchmarks/tasks/02-refactor-module/task.md +8 -0
- package/benchmarks/tasks/03-fix-integration-bug/task.md +8 -0
- package/benchmarks/tasks/04-add-test-coverage/task.md +8 -0
- package/benchmarks/tasks/05-multi-file-feature/task.md +8 -0
- package/bin/act.js +238 -0
- package/commands/autocode.md +6 -0
- package/commands/cancel-ralph.md +18 -0
- package/commands/code-factory.md +53 -0
- package/commands/create-prd.md +55 -0
- package/commands/ralph-loop.md +18 -0
- package/commands/run-plan.md +117 -0
- package/commands/submit-lesson.md +122 -0
- package/docs/ARCHITECTURE.md +630 -0
- package/docs/CONTRIBUTING.md +125 -0
- package/docs/lessons/0001-bare-exception-swallowing.md +34 -0
- package/docs/lessons/0002-async-def-without-await.md +28 -0
- package/docs/lessons/0003-create-task-without-callback.md +28 -0
- package/docs/lessons/0004-hardcoded-test-counts.md +28 -0
- package/docs/lessons/0005-sqlite-without-closing.md +33 -0
- package/docs/lessons/0006-venv-pip-path.md +27 -0
- package/docs/lessons/0007-runner-state-self-rejection.md +35 -0
- package/docs/lessons/0008-quality-gate-blind-spot.md +33 -0
- package/docs/lessons/0009-parser-overcount-empty-batches.md +36 -0
- package/docs/lessons/0010-local-outside-function-bash.md +33 -0
- package/docs/lessons/0011-batch-tests-for-unimplemented-code.md +36 -0
- package/docs/lessons/0012-api-markdown-unescaped-chars.md +33 -0
- package/docs/lessons/0013-export-prefix-env-parsing.md +33 -0
- package/docs/lessons/0014-decorator-registry-import-side-effect.md +43 -0
- package/docs/lessons/0015-frontend-backend-schema-drift.md +43 -0
- package/docs/lessons/0016-event-driven-cold-start-seeding.md +44 -0
- package/docs/lessons/0017-copy-paste-logic-diverges.md +43 -0
- package/docs/lessons/0018-layer-passes-pipeline-broken.md +45 -0
- package/docs/lessons/0019-systemd-envfile-ignores-export.md +41 -0
- package/docs/lessons/0020-persist-state-incrementally.md +44 -0
- package/docs/lessons/0021-dual-axis-testing.md +48 -0
- package/docs/lessons/0022-jsx-factory-shadowing.md +43 -0
- package/docs/lessons/0023-static-analysis-spiral.md +51 -0
- package/docs/lessons/0024-shared-pipeline-implementation.md +55 -0
- package/docs/lessons/0025-defense-in-depth-all-entry-points.md +65 -0
- package/docs/lessons/0026-linter-no-rules-false-enforcement.md +54 -0
- package/docs/lessons/0027-jsx-silent-prop-drop.md +64 -0
- package/docs/lessons/0028-no-infrastructure-in-client-code.md +49 -0
- package/docs/lessons/0029-never-write-secrets-to-files.md +61 -0
- package/docs/lessons/0030-cache-merge-not-replace.md +62 -0
- package/docs/lessons/0031-verify-units-at-boundaries.md +66 -0
- package/docs/lessons/0032-module-lifecycle-subscribe-unsubscribe.md +89 -0
- package/docs/lessons/0033-async-iteration-mutable-snapshot.md +72 -0
- package/docs/lessons/0034-caller-missing-await-silent-discard.md +65 -0
- package/docs/lessons/0035-duplicate-registration-silent-overwrite.md +85 -0
- package/docs/lessons/0036-websocket-dirty-disconnect.md +33 -0
- package/docs/lessons/0037-parallel-agents-worktree-corruption.md +31 -0
- package/docs/lessons/0038-subscribe-no-stored-ref.md +36 -0
- package/docs/lessons/0039-fallback-or-default-hides-bugs.md +34 -0
- package/docs/lessons/0040-event-firehose-filter-first.md +36 -0
- package/docs/lessons/0041-ambiguous-base-dir-path-nesting.md +32 -0
- package/docs/lessons/0042-spec-compliance-insufficient.md +36 -0
- package/docs/lessons/0043-exact-count-extensible-collections.md +32 -0
- package/docs/lessons/0044-relative-file-deps-worktree.md +39 -0
- package/docs/lessons/0045-iterative-design-improvement.md +33 -0
- package/docs/lessons/0046-plan-assertion-math-bugs.md +38 -0
- package/docs/lessons/0047-pytest-single-threaded-default.md +37 -0
- package/docs/lessons/0048-integration-wiring-batch.md +40 -0
- package/docs/lessons/0049-ab-verification.md +41 -0
- package/docs/lessons/0050-editing-sourced-files-during-execution.md +33 -0
- package/docs/lessons/0051-infrastructure-fixes-cant-self-heal.md +30 -0
- package/docs/lessons/0052-uncommitted-changes-poison-quality-gates.md +31 -0
- package/docs/lessons/0053-jq-compact-flag-inconsistency.md +31 -0
- package/docs/lessons/0054-parser-matches-inside-code-blocks.md +30 -0
- package/docs/lessons/0055-agents-compensate-for-garbled-prompts.md +31 -0
- package/docs/lessons/0056-grep-count-exit-code-on-zero.md +42 -0
- package/docs/lessons/0057-new-artifacts-break-git-clean-gates.md +42 -0
- package/docs/lessons/0058-dead-config-keys-never-consumed.md +49 -0
- package/docs/lessons/0059-contract-test-shared-structures.md +53 -0
- package/docs/lessons/0060-set-e-silent-death-in-runners.md +53 -0
- package/docs/lessons/0061-context-injection-dirty-state.md +50 -0
- package/docs/lessons/0062-sibling-bug-neighborhood-scan.md +29 -0
- package/docs/lessons/0063-one-flag-two-lifetimes.md +31 -0
- package/docs/lessons/0064-test-passes-wrong-reason.md +31 -0
- package/docs/lessons/0065-pipefail-grep-count-double-output.md +39 -0
- package/docs/lessons/0066-local-keyword-outside-function.md +37 -0
- package/docs/lessons/0067-stdin-hang-non-interactive-shell.md +36 -0
- package/docs/lessons/0068-agent-builds-wrong-thing-correctly.md +31 -0
- package/docs/lessons/0069-plan-quality-dominates-execution.md +30 -0
- package/docs/lessons/0070-spec-echo-back-prevents-drift.md +31 -0
- package/docs/lessons/0071-positive-instructions-outperform-negative.md +30 -0
- package/docs/lessons/0072-lost-in-the-middle-context-placement.md +30 -0
- package/docs/lessons/0073-unscoped-lessons-cause-false-positives.md +30 -0
- package/docs/lessons/0074-stale-context-injection-wrong-batch.md +32 -0
- package/docs/lessons/0075-research-artifacts-must-persist.md +32 -0
- package/docs/lessons/0076-wrong-decomposition-contaminates-downstream.md +30 -0
- package/docs/lessons/0077-cherry-pick-merges-need-manual-resolution.md +30 -0
- package/docs/lessons/0078-static-review-without-live-test.md +30 -0
- package/docs/lessons/0079-integration-wiring-batch-required.md +32 -0
- package/docs/lessons/FRAMEWORK.md +161 -0
- package/docs/lessons/SUMMARY.md +201 -0
- package/docs/lessons/TEMPLATE.md +85 -0
- package/docs/plans/2026-02-21-code-factory-v2-design.md +204 -0
- package/docs/plans/2026-02-21-code-factory-v2-implementation-plan.md +2189 -0
- package/docs/plans/2026-02-21-code-factory-v2-phase4-design.md +537 -0
- package/docs/plans/2026-02-21-code-factory-v2-phase4-implementation-plan.md +2012 -0
- package/docs/plans/2026-02-21-hardening-pass-design.md +108 -0
- package/docs/plans/2026-02-21-hardening-pass-plan.md +1378 -0
- package/docs/plans/2026-02-21-mab-research-report.md +406 -0
- package/docs/plans/2026-02-21-marketplace-restructure-design.md +240 -0
- package/docs/plans/2026-02-21-marketplace-restructure-plan.md +832 -0
- package/docs/plans/2026-02-21-phase4-completion-plan.md +697 -0
- package/docs/plans/2026-02-21-validator-suite-design.md +148 -0
- package/docs/plans/2026-02-21-validator-suite-plan.md +540 -0
- package/docs/plans/2026-02-22-mab-research-round2.md +556 -0
- package/docs/plans/2026-02-22-mab-run-design.md +462 -0
- package/docs/plans/2026-02-22-mab-run-plan.md +2046 -0
- package/docs/plans/2026-02-22-operations-design-methodology-research.md +681 -0
- package/docs/plans/2026-02-22-research-agent-failure-taxonomy.md +532 -0
- package/docs/plans/2026-02-22-research-code-guideline-policies.md +886 -0
- package/docs/plans/2026-02-22-research-codebase-audit-refactoring.md +908 -0
- package/docs/plans/2026-02-22-research-coding-standards-documentation.md +541 -0
- package/docs/plans/2026-02-22-research-competitive-landscape.md +687 -0
- package/docs/plans/2026-02-22-research-comprehensive-testing.md +1076 -0
- package/docs/plans/2026-02-22-research-context-utilization.md +459 -0
- package/docs/plans/2026-02-22-research-cost-quality-tradeoff.md +548 -0
- package/docs/plans/2026-02-22-research-lesson-transferability.md +508 -0
- package/docs/plans/2026-02-22-research-multi-agent-coordination.md +312 -0
- package/docs/plans/2026-02-22-research-phase-integration.md +602 -0
- package/docs/plans/2026-02-22-research-plan-quality.md +428 -0
- package/docs/plans/2026-02-22-research-prompt-engineering.md +558 -0
- package/docs/plans/2026-02-22-research-unconventional-perspectives.md +528 -0
- package/docs/plans/2026-02-22-research-user-adoption.md +638 -0
- package/docs/plans/2026-02-22-research-verification-effectiveness.md +433 -0
- package/docs/plans/2026-02-23-agent-suite-design.md +299 -0
- package/docs/plans/2026-02-23-agent-suite-plan.md +578 -0
- package/docs/plans/2026-02-23-phase3-cost-infrastructure-design.md +148 -0
- package/docs/plans/2026-02-23-phase3-cost-infrastructure-plan.md +1062 -0
- package/docs/plans/2026-02-23-research-bash-expert-agent.md +543 -0
- package/docs/plans/2026-02-23-research-dependency-auditor-agent.md +564 -0
- package/docs/plans/2026-02-23-research-improving-existing-agents.md +503 -0
- package/docs/plans/2026-02-23-research-integration-tester-agent.md +454 -0
- package/docs/plans/2026-02-23-research-python-expert-agent.md +429 -0
- package/docs/plans/2026-02-23-research-service-monitor-agent.md +425 -0
- package/docs/plans/2026-02-23-research-shell-expert-agent.md +533 -0
- package/docs/plans/2026-02-23-roadmap-to-completion.md +530 -0
- package/docs/plans/2026-02-24-headless-module-split-design.md +98 -0
- package/docs/plans/2026-02-24-headless-module-split.md +443 -0
- package/docs/plans/2026-02-24-lesson-scope-metadata-design.md +228 -0
- package/docs/plans/2026-02-24-lesson-scope-metadata-plan.md +968 -0
- package/docs/plans/2026-02-24-npm-packaging-design.md +841 -0
- package/docs/plans/2026-02-24-npm-packaging-plan.md +1965 -0
- package/docs/plans/audit-findings.md +186 -0
- package/docs/telegram-notification-format.md +98 -0
- package/examples/example-plan.md +51 -0
- package/examples/example-prd.json +72 -0
- package/examples/example-roadmap.md +33 -0
- package/examples/quickstart-plan.md +63 -0
- package/hooks/hooks.json +26 -0
- package/hooks/setup-symlinks.sh +48 -0
- package/hooks/stop-hook.sh +135 -0
- package/package.json +47 -0
- package/policies/bash.md +71 -0
- package/policies/python.md +71 -0
- package/policies/testing.md +61 -0
- package/policies/universal.md +60 -0
- package/scripts/analyze-report.sh +97 -0
- package/scripts/architecture-map.sh +145 -0
- package/scripts/auto-compound.sh +273 -0
- package/scripts/batch-audit.sh +42 -0
- package/scripts/batch-test.sh +101 -0
- package/scripts/entropy-audit.sh +221 -0
- package/scripts/failure-digest.sh +51 -0
- package/scripts/generate-ast-rules.sh +96 -0
- package/scripts/init.sh +112 -0
- package/scripts/lesson-check.sh +428 -0
- package/scripts/lib/common.sh +61 -0
- package/scripts/lib/cost-tracking.sh +153 -0
- package/scripts/lib/ollama.sh +60 -0
- package/scripts/lib/progress-writer.sh +128 -0
- package/scripts/lib/run-plan-context.sh +215 -0
- package/scripts/lib/run-plan-echo-back.sh +231 -0
- package/scripts/lib/run-plan-headless.sh +396 -0
- package/scripts/lib/run-plan-notify.sh +57 -0
- package/scripts/lib/run-plan-parser.sh +81 -0
- package/scripts/lib/run-plan-prompt.sh +215 -0
- package/scripts/lib/run-plan-quality-gate.sh +132 -0
- package/scripts/lib/run-plan-routing.sh +315 -0
- package/scripts/lib/run-plan-sampling.sh +170 -0
- package/scripts/lib/run-plan-scoring.sh +146 -0
- package/scripts/lib/run-plan-state.sh +142 -0
- package/scripts/lib/run-plan-team.sh +199 -0
- package/scripts/lib/telegram.sh +54 -0
- package/scripts/lib/thompson-sampling.sh +176 -0
- package/scripts/license-check.sh +74 -0
- package/scripts/mab-run.sh +575 -0
- package/scripts/module-size-check.sh +146 -0
- package/scripts/patterns/async-no-await.yml +5 -0
- package/scripts/patterns/bare-except.yml +6 -0
- package/scripts/patterns/empty-catch.yml +6 -0
- package/scripts/patterns/hardcoded-localhost.yml +9 -0
- package/scripts/patterns/retry-loop-no-backoff.yml +12 -0
- package/scripts/pipeline-status.sh +197 -0
- package/scripts/policy-check.sh +226 -0
- package/scripts/prior-art-search.sh +133 -0
- package/scripts/promote-mab-lessons.sh +126 -0
- package/scripts/prompts/agent-a-superpowers.md +29 -0
- package/scripts/prompts/agent-b-ralph.md +29 -0
- package/scripts/prompts/judge-agent.md +61 -0
- package/scripts/prompts/planner-agent.md +44 -0
- package/scripts/pull-community-lessons.sh +90 -0
- package/scripts/quality-gate.sh +266 -0
- package/scripts/research-gate.sh +90 -0
- package/scripts/run-plan.sh +329 -0
- package/scripts/scope-infer.sh +159 -0
- package/scripts/setup-ralph-loop.sh +155 -0
- package/scripts/telemetry.sh +230 -0
- package/scripts/tests/run-all-tests.sh +52 -0
- package/scripts/tests/test-act-cli.sh +46 -0
- package/scripts/tests/test-agents-md.sh +87 -0
- package/scripts/tests/test-analyze-report.sh +114 -0
- package/scripts/tests/test-architecture-map.sh +89 -0
- package/scripts/tests/test-auto-compound.sh +169 -0
- package/scripts/tests/test-batch-test.sh +65 -0
- package/scripts/tests/test-benchmark-runner.sh +25 -0
- package/scripts/tests/test-common.sh +168 -0
- package/scripts/tests/test-cost-tracking.sh +158 -0
- package/scripts/tests/test-echo-back.sh +180 -0
- package/scripts/tests/test-entropy-audit.sh +146 -0
- package/scripts/tests/test-failure-digest.sh +66 -0
- package/scripts/tests/test-generate-ast-rules.sh +145 -0
- package/scripts/tests/test-helpers.sh +82 -0
- package/scripts/tests/test-init.sh +47 -0
- package/scripts/tests/test-lesson-check.sh +278 -0
- package/scripts/tests/test-lesson-local.sh +55 -0
- package/scripts/tests/test-license-check.sh +109 -0
- package/scripts/tests/test-mab-run.sh +182 -0
- package/scripts/tests/test-ollama-lib.sh +49 -0
- package/scripts/tests/test-ollama.sh +60 -0
- package/scripts/tests/test-pipeline-status.sh +198 -0
- package/scripts/tests/test-policy-check.sh +124 -0
- package/scripts/tests/test-prior-art-search.sh +96 -0
- package/scripts/tests/test-progress-writer.sh +140 -0
- package/scripts/tests/test-promote-mab-lessons.sh +110 -0
- package/scripts/tests/test-pull-community-lessons.sh +149 -0
- package/scripts/tests/test-quality-gate.sh +241 -0
- package/scripts/tests/test-research-gate.sh +132 -0
- package/scripts/tests/test-run-plan-cli.sh +86 -0
- package/scripts/tests/test-run-plan-context.sh +305 -0
- package/scripts/tests/test-run-plan-e2e.sh +153 -0
- package/scripts/tests/test-run-plan-headless.sh +424 -0
- package/scripts/tests/test-run-plan-notify.sh +124 -0
- package/scripts/tests/test-run-plan-parser.sh +217 -0
- package/scripts/tests/test-run-plan-prompt.sh +254 -0
- package/scripts/tests/test-run-plan-quality-gate.sh +222 -0
- package/scripts/tests/test-run-plan-routing.sh +178 -0
- package/scripts/tests/test-run-plan-scoring.sh +148 -0
- package/scripts/tests/test-run-plan-state.sh +261 -0
- package/scripts/tests/test-run-plan-team.sh +157 -0
- package/scripts/tests/test-scope-infer.sh +150 -0
- package/scripts/tests/test-setup-ralph-loop.sh +63 -0
- package/scripts/tests/test-telegram-env.sh +38 -0
- package/scripts/tests/test-telegram.sh +121 -0
- package/scripts/tests/test-telemetry.sh +46 -0
- package/scripts/tests/test-thompson-sampling.sh +139 -0
- package/scripts/tests/test-validate-all.sh +60 -0
- package/scripts/tests/test-validate-commands.sh +89 -0
- package/scripts/tests/test-validate-hooks.sh +98 -0
- package/scripts/tests/test-validate-lessons.sh +150 -0
- package/scripts/tests/test-validate-plan-quality.sh +235 -0
- package/scripts/tests/test-validate-plans.sh +187 -0
- package/scripts/tests/test-validate-plugin.sh +106 -0
- package/scripts/tests/test-validate-prd.sh +184 -0
- package/scripts/tests/test-validate-skills.sh +134 -0
- package/scripts/validate-all.sh +57 -0
- package/scripts/validate-commands.sh +67 -0
- package/scripts/validate-hooks.sh +89 -0
- package/scripts/validate-lessons.sh +98 -0
- package/scripts/validate-plan-quality.sh +369 -0
- package/scripts/validate-plans.sh +120 -0
- package/scripts/validate-plugin.sh +86 -0
- package/scripts/validate-policies.sh +42 -0
- package/scripts/validate-prd.sh +118 -0
- package/scripts/validate-skills.sh +96 -0
- package/skills/autocode/SKILL.md +285 -0
- package/skills/autocode/ab-verification.md +51 -0
- package/skills/autocode/code-quality-standards.md +37 -0
- package/skills/autocode/competitive-mode.md +364 -0
- package/skills/brainstorming/SKILL.md +97 -0
- package/skills/capture-lesson/SKILL.md +187 -0
- package/skills/check-lessons/SKILL.md +116 -0
- package/skills/dispatching-parallel-agents/SKILL.md +110 -0
- package/skills/executing-plans/SKILL.md +85 -0
- package/skills/finishing-a-development-branch/SKILL.md +201 -0
- package/skills/receiving-code-review/SKILL.md +72 -0
- package/skills/requesting-code-review/SKILL.md +59 -0
- package/skills/requesting-code-review/code-reviewer.md +82 -0
- package/skills/research/SKILL.md +145 -0
- package/skills/roadmap/SKILL.md +115 -0
- package/skills/subagent-driven-development/SKILL.md +98 -0
- package/skills/subagent-driven-development/code-quality-reviewer-prompt.md +18 -0
- package/skills/subagent-driven-development/implementer-prompt.md +73 -0
- package/skills/subagent-driven-development/spec-reviewer-prompt.md +57 -0
- package/skills/systematic-debugging/SKILL.md +134 -0
- package/skills/systematic-debugging/condition-based-waiting.md +64 -0
- package/skills/systematic-debugging/defense-in-depth.md +32 -0
- package/skills/systematic-debugging/root-cause-tracing.md +55 -0
- package/skills/test-driven-development/SKILL.md +167 -0
- package/skills/using-git-worktrees/SKILL.md +219 -0
- package/skills/using-superpowers/SKILL.md +54 -0
- package/skills/verification-before-completion/SKILL.md +140 -0
- package/skills/verify/SKILL.md +82 -0
- package/skills/writing-plans/SKILL.md +128 -0
- package/skills/writing-skills/SKILL.md +93 -0
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0041
|
|
3
|
+
title: "Ambiguous base dir variable causes path double-nesting"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [python, shell, all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Variable named log_dir already contains subdirectory, but os.path.join adds it again"
|
|
11
|
+
fix: "Name variables to encode their scope (log_base_dir vs intelligence_dir); verify paths before first use"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
log_dir = "/var/logs/app/intelligence"
|
|
15
|
+
# Developer thinks log_dir is base, adds another level
|
|
16
|
+
intelligence_output = os.path.join(log_dir, "intelligence", "output.json")
|
|
17
|
+
# Result: /var/logs/app/intelligence/intelligence/output.json
|
|
18
|
+
good: |
|
|
19
|
+
log_base_dir = "/var/logs/app"
|
|
20
|
+
intelligence_dir = os.path.join(log_base_dir, "intelligence")
|
|
21
|
+
intelligence_output = os.path.join(intelligence_dir, "output.json")
|
|
22
|
+
# Result: /var/logs/app/intelligence/output.json
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Observation
|
|
26
|
+
Path variables are created with unclear semantics. A variable named `log_dir` might contain `/var/logs/app` or `/var/logs/app/intelligence`. Later code blindly adds subdirectories without checking the base, resulting in nested duplicates like `intelligence/intelligence/output.json`.
|
|
27
|
+
|
|
28
|
+
## Insight
|
|
29
|
+
Variable naming doesn't encode the directory's depth or scope. Different developers interpret the same variable name differently, leading to double-nesting or missing levels.
|
|
30
|
+
|
|
31
|
+
## Lesson
|
|
32
|
+
Name path variables to encode their scope: use `_base_dir` for top-level, `_dir` for specific subdirectories. Verify all paths at initialization time before they're used. Print and assert the structure early: `assert log_base_dir.endswith('/logs/app')` and `assert intelligence_dir.endswith('/intelligence')`. Test with actual filesystem operations to catch these bugs immediately.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0042
|
|
3
|
+
title: "Spec compliance without quality review misses defensive gaps"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Code review checks only spec compliance but misses error handling, cleanup, validation, and timeouts"
|
|
11
|
+
fix: "Include a defensive gaps checklist in code review, separate from spec compliance"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Spec: "Call API and return result"
|
|
15
|
+
def fetch_data(url):
|
|
16
|
+
response = requests.get(url) # No timeout, no error handling
|
|
17
|
+
return response.json() # Crashes if invalid JSON
|
|
18
|
+
good: |
|
|
19
|
+
# Spec + defensive: Call API with timeout, handle errors, validate
|
|
20
|
+
def fetch_data(url):
|
|
21
|
+
try:
|
|
22
|
+
response = requests.get(url, timeout=30)
|
|
23
|
+
return response.json()
|
|
24
|
+
except (requests.Timeout, requests.JSONDecodeError) as e:
|
|
25
|
+
logger.error(f"Fetch failed: {e}")
|
|
26
|
+
return None
|
|
27
|
+
---
|
|
28
|
+
|
|
29
|
+
## Observation
|
|
30
|
+
Code review focuses on whether the implementation matches the specification (does it call the API? does it return the result?). It skips defensive programming: timeouts, error handling, input validation, cleanup paths, and null checks. The code is spec-compliant but fragile.
|
|
31
|
+
|
|
32
|
+
## Insight
|
|
33
|
+
Spec compliance is a floor, not a ceiling. Defensive programming is orthogonal to spec compliance. Reviewers who are trained to check spec often skip defensive gaps because they're not part of the spec.
|
|
34
|
+
|
|
35
|
+
## Lesson
|
|
36
|
+
Create a separate defensive gaps checklist for code review: Does the code have timeouts? Error handling? Input validation? Cleanup paths? Null checks? Is there logging for failure cases? Run this checklist independently from spec compliance. Make it part of the merge gate, not optional. Test with fault injection and chaos testing to verify defensive behavior.
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0043
|
|
3
|
+
title: "Exact count assertions on extensible collections break on addition"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [python, javascript, all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: test-anti-patterns
|
|
8
|
+
pattern:
|
|
9
|
+
type: syntactic
|
|
10
|
+
regex: "assert.*len\\(.*==\\s*\\d+"
|
|
11
|
+
description: "Test asserts exact collection length that breaks when collection grows"
|
|
12
|
+
fix: "Use >= for extensible collections, or assert specific items exist rather than total count"
|
|
13
|
+
example:
|
|
14
|
+
bad: |
|
|
15
|
+
def test_users():
|
|
16
|
+
users = get_users()
|
|
17
|
+
assert len(users) == 3 # Breaks when a 4th user is added
|
|
18
|
+
good: |
|
|
19
|
+
def test_users():
|
|
20
|
+
users = get_users()
|
|
21
|
+
assert len(users) >= 3 # Allows growth
|
|
22
|
+
assert "alice" in [u.name for u in users]
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Observation
|
|
26
|
+
Tests assert that a collection has an exact count (`assert len(items) == 5`). When the feature grows and items are added to the collection, the test fails even though the new behavior is correct. Tests become brittle and must be updated constantly.
|
|
27
|
+
|
|
28
|
+
## Insight
|
|
29
|
+
Exact counts are too restrictive for evolving features. The test really cares about specific items being present, not the total count. Relying on exact assertions makes tests fragile to future additions.
|
|
30
|
+
|
|
31
|
+
## Lesson
|
|
32
|
+
Use `>=` for collection length assertions in tests of extensible collections. Instead of asserting total count, assert that specific items exist: `assert "item" in collection` or `assert any(x.id == 5 for x in items)`. This makes tests resilient to future growth. Only use exact counts for fixed-size collections (e.g., tuple return values).
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0044
|
|
3
|
+
title: "Relative `file:` deps break in git worktrees"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [javascript, typescript]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "package.json file: dependencies use relative paths that break in git worktrees at different depths"
|
|
11
|
+
fix: "Use workspace protocols, absolute paths resolved at install time, or npm/yarn workspaces"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
// package.json in monorepo/services/api
|
|
15
|
+
{
|
|
16
|
+
"dependencies": {
|
|
17
|
+
"shared": "file:../shared" // Breaks in worktree
|
|
18
|
+
}
|
|
19
|
+
}
|
|
20
|
+
good: |
|
|
21
|
+
{
|
|
22
|
+
"workspaces": [
|
|
23
|
+
"packages/*",
|
|
24
|
+
"services/*"
|
|
25
|
+
],
|
|
26
|
+
"dependencies": {
|
|
27
|
+
"shared": "workspace:*"
|
|
28
|
+
}
|
|
29
|
+
}
|
|
30
|
+
---
|
|
31
|
+
|
|
32
|
+
## Observation
|
|
33
|
+
npm/yarn `file:` dependencies use relative paths. When code is checked out into a git worktree at a different depth than the main repo, the relative path resolves to the wrong location (or doesn't exist). This breaks CI in specific git workflows.
|
|
34
|
+
|
|
35
|
+
## Insight
|
|
36
|
+
Git worktrees can be created at arbitrary depths relative to the main repo. Relative path dependencies were designed for a single repository layout and fail when the layout changes.
|
|
37
|
+
|
|
38
|
+
## Lesson
|
|
39
|
+
Use workspace protocols (`workspace:*`) in monorepos instead of `file:` dependencies. If `file:` is necessary, resolve relative paths to absolute paths at install time. For standalone packages, use npm/yarn workspaces or lerna to manage dependencies. Test with `git worktree add` at different depths to verify dependencies resolve correctly.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0045
|
|
3
|
+
title: "Iterative 'how would you improve' catches 35% more design gaps"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Single-pass design review misses gaps that iterative improvement rounds would catch"
|
|
11
|
+
fix: "Ask 'how would you improve this section?' after each design section; 5 rounds is the sweet spot"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Single design pass
|
|
15
|
+
Review once. Approve. Start building.
|
|
16
|
+
# Later: discover missing error handling, untested edge case
|
|
17
|
+
good: |
|
|
18
|
+
# Iterative design
|
|
19
|
+
Round 1: "What could break here?" -> Add timeout handling
|
|
20
|
+
Round 2: "How would this scale to 10K items?" -> Add pagination
|
|
21
|
+
Round 3: "What if database is down?" -> Add circuit breaker
|
|
22
|
+
Round 4: "How to monitor this?" -> Add metrics
|
|
23
|
+
Round 5: "Any security risks?" -> Add auth validation
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Observation
|
|
27
|
+
Design review done in a single pass typically covers the happy path. Iterative rounds of "how would you improve this section?" reveal gaps: edge cases, scale limits, failure modes, monitoring, and security issues that a single review missed.
|
|
28
|
+
|
|
29
|
+
## Insight
|
|
30
|
+
Single-pass review relies on reviewers catching everything. Iterative rounds make gaps explicit by forcing the designer to consider improvements from different angles. Each round builds on the previous one and surfaces new concerns.
|
|
31
|
+
|
|
32
|
+
## Lesson
|
|
33
|
+
After each major design section, ask "How would you improve this section?" Require at least 3 rounds; 5 is optimal. Each round should surface a new category: performance, fault tolerance, monitoring, security, or operational concerns. Document improvements and rationale. This catches design gaps before implementation and reduces rework later.
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0046
|
|
3
|
+
title: "Plan-specified test assertions can have math bugs"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: test-anti-patterns
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Implementation plan specifies test thresholds with math errors that implementer copies verbatim"
|
|
11
|
+
fix: "Verify threshold boundary logic independently before writing the test"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Plan says: "Assert that 90% of requests succeed"
|
|
15
|
+
# Implementer writes (copying from plan):
|
|
16
|
+
assert success_count / total_count >= 0.9
|
|
17
|
+
# But 0.9 is already 90%, so this is correct.
|
|
18
|
+
# But what if plan meant: "Assert that error rate is below 10%"?
|
|
19
|
+
# assert error_count / total_count <= 0.1 # Different logic
|
|
20
|
+
|
|
21
|
+
# Implementer didn't verify the math matched intent
|
|
22
|
+
good: |
|
|
23
|
+
# Plan specifies: "Assert 90% success rate (>= 0.9)"
|
|
24
|
+
# Before implementing, verify:
|
|
25
|
+
# 90% = 0.9 (correct multiplier)
|
|
26
|
+
# 10% = 0.1 (correct error rate)
|
|
27
|
+
# Test with known values: 9/10 = 0.9 ✓
|
|
28
|
+
assert success_count / total_count >= 0.9
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Observation
|
|
32
|
+
Implementation plans specify test thresholds and assertions. Implementers copy these verbatim without verifying the math. If the plan has a boundary condition error (off-by-one, wrong direction, incorrect multiplier), the implementer creates a test that passes despite incorrect logic.
|
|
33
|
+
|
|
34
|
+
## Insight
|
|
35
|
+
Plan authors may write thresholds informally or with implicit assumptions. Implementers assume the math is correct and don't double-check. Boundary logic errors slip through undetected.
|
|
36
|
+
|
|
37
|
+
## Lesson
|
|
38
|
+
Before implementing any threshold-based assertion, verify the math independently. Test with concrete values to confirm the boundary is correct. For example, if the plan says "90% success rate," verify: success_count=9, total=10, then assert 9/10 >= 0.9 should pass. success_count=8, total=10, then assert 8/10 >= 0.9 should fail. Write and run these boundary tests before implementing the main test.
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0047
|
|
3
|
+
title: "pytest runs single-threaded by default -- add xdist"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [python]
|
|
6
|
+
scope: [framework:pytest]
|
|
7
|
+
category: performance
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "pytest test suite runs single-threaded when parallel execution would be significantly faster"
|
|
11
|
+
fix: "Add pytest-xdist to dev deps and addopts = '-n auto' to pytest config"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# pytest.ini or pyproject.toml
|
|
15
|
+
[tool.pytest.ini_options]
|
|
16
|
+
testpaths = ["tests"]
|
|
17
|
+
# Result: runs tests one at a time (slow)
|
|
18
|
+
|
|
19
|
+
good: |
|
|
20
|
+
# pyproject.toml
|
|
21
|
+
[tool.pytest.ini_options]
|
|
22
|
+
testpaths = ["tests"]
|
|
23
|
+
addopts = "-n auto --dist load"
|
|
24
|
+
|
|
25
|
+
# requirements-dev.txt or pyproject.toml
|
|
26
|
+
pytest-xdist>=3.5.0
|
|
27
|
+
# Result: runs tests in parallel (fast)
|
|
28
|
+
---
|
|
29
|
+
|
|
30
|
+
## Observation
|
|
31
|
+
pytest, by default, runs tests sequentially in a single worker process. For test suites with 50+ tests, this is significantly slower than parallel execution. Developers run test suites serially and accept the slow feedback loop, unaware that xdist can parallelize.
|
|
32
|
+
|
|
33
|
+
## Insight
|
|
34
|
+
pytest-xdist provides automatic parallelization across multiple CPU cores. Running tests in parallel often provides 3-6x speedup on modern hardware, but requires explicit configuration. This is a low-effort, high-impact performance improvement.
|
|
35
|
+
|
|
36
|
+
## Lesson
|
|
37
|
+
Add `pytest-xdist>=3.5.0` to dev dependencies. Add `addopts = "-n auto --dist load"` to pytest configuration. This parallelizes tests automatically, using all available CPU cores. Use `-n 0` to disable parallelization temporarily for debugging. Test with your specific test suite to measure speedup. For very large test suites, use a fixed worker count (e.g. `-n 6`) instead of `-n auto` to prevent memory exhaustion.
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0048
|
|
3
|
+
title: "Multi-batch plans need explicit integration wiring batch"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Multi-batch plan builds components separately but skips the step of wiring them together"
|
|
11
|
+
fix: "Plans with 3+ batches must include a final integration wiring batch"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Plan with 3 batches:
|
|
15
|
+
Batch 1: Build API endpoint
|
|
16
|
+
Batch 2: Build database schema
|
|
17
|
+
Batch 3: Build client code
|
|
18
|
+
# Missing: wire components together
|
|
19
|
+
|
|
20
|
+
# Result: Each piece works in isolation, but together they fail
|
|
21
|
+
good: |
|
|
22
|
+
# Plan with 4 batches:
|
|
23
|
+
Batch 1: Build API endpoint
|
|
24
|
+
Batch 2: Build database schema
|
|
25
|
+
Batch 3: Build client code
|
|
26
|
+
Batch 4: Integration wiring
|
|
27
|
+
- Connect API to database
|
|
28
|
+
- Connect client to API
|
|
29
|
+
- Verify end-to-end flow
|
|
30
|
+
- Run integration tests
|
|
31
|
+
---
|
|
32
|
+
|
|
33
|
+
## Observation
|
|
34
|
+
Multi-batch plans build components (API, database, client) independently. Each batch passes its own tests. But components aren't wired together during implementation. Integration happens only at the end, revealing coupling issues, interface mismatches, and missing adapters too late.
|
|
35
|
+
|
|
36
|
+
## Insight
|
|
37
|
+
Batch-driven development optimizes for parallel work but can miss integration points. Components are unit-tested in isolation but may fail when combined. Without an explicit wiring batch, integration is assumed to "just work."
|
|
38
|
+
|
|
39
|
+
## Lesson
|
|
40
|
+
Plans with 3+ batches must include a final integration wiring batch. This batch connects components built in earlier batches, verifies data flows through the full pipeline, and runs end-to-end integration tests. Include this batch in the plan before implementation starts. Test the full system (not just individual components) after wiring is complete.
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0049
|
|
3
|
+
title: "A/B verification finds zero-overlap bug classes"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Using only bottom-up or only top-down review misses entire classes of bugs"
|
|
11
|
+
fix: "Run both bottom-up (code-level) and top-down (architecture-level) review after 3+ batch implementations"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Bottom-up only: review each component's code
|
|
15
|
+
# Result: logic errors caught, but coupling issues missed
|
|
16
|
+
# Reviewer doesn't see: API expects array, client sends object
|
|
17
|
+
|
|
18
|
+
# Top-down only: review architecture diagrams
|
|
19
|
+
# Result: structure looks good, but off-by-one in retry logic missed
|
|
20
|
+
# Reviewer doesn't see: code-level bugs
|
|
21
|
+
good: |
|
|
22
|
+
# Bottom-up: Review code implementation
|
|
23
|
+
- Are loops correct? Error handling present? State managed correctly?
|
|
24
|
+
|
|
25
|
+
# Top-down: Review architecture
|
|
26
|
+
- Do components couple correctly? Is data flow end-to-end?
|
|
27
|
+
|
|
28
|
+
# Both perspectives together catch more bugs than either alone
|
|
29
|
+
---
|
|
30
|
+
|
|
31
|
+
## Observation
|
|
32
|
+
Code reviews conducted only from the bottom-up (code-level logic) miss architectural coupling issues. Reviews conducted only from the top-down (architecture diagrams) miss implementation bugs. Different bugs are visible from different angles.
|
|
33
|
+
|
|
34
|
+
## Insight
|
|
35
|
+
Bugs fall into different categories based on visibility:
|
|
36
|
+
- **Bottom-up visible:** off-by-one errors, null checks, state management, loop logic
|
|
37
|
+
- **Top-down visible:** coupling between components, interface mismatches, data flow breaks, missing error propagation
|
|
38
|
+
- **Requires both:** race conditions, distributed state consistency, integration deadlocks
|
|
39
|
+
|
|
40
|
+
## Lesson
|
|
41
|
+
Run both bottom-up and top-down review after implementing 3+ batches. Bottom-up: inspect code for logic errors, edge cases, resource cleanup. Top-down: trace data flow end-to-end, verify component interfaces match, check for coupling leaks. Document findings from each perspective. Bugs caught only in top-down review indicate architectural issues; bugs caught only in bottom-up indicate implementation issues. Fix both before declaring done.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0050
|
|
3
|
+
title: "Editing files sourced by a running process breaks function signatures"
|
|
4
|
+
severity: blocker
|
|
5
|
+
languages: [shell]
|
|
6
|
+
scope: [project:autonomous-coding-toolkit]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Modifying function signatures in files that are actively sourced by a running bash process (e.g., editing run-plan-notify.sh while run-plan.sh is executing)"
|
|
11
|
+
fix: "Never edit library files while they're being sourced by a running process. Wait for the run to complete, or commit changes that only new runs will pick up."
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# While run-plan.sh is running (sources run-plan-notify.sh at startup):
|
|
15
|
+
# Edit run-plan-notify.sh to change format_success_message from 6 to 9 params
|
|
16
|
+
# -> Next batch call crashes with wrong argument count
|
|
17
|
+
good: |
|
|
18
|
+
# Wait for run-plan.sh to finish, then edit
|
|
19
|
+
# Or: make changes backward-compatible (add params with defaults)
|
|
20
|
+
format_success_message() {
|
|
21
|
+
local plan="$1" batch="$2" total="${3:-?}" title="${4:-}"
|
|
22
|
+
# ... rest uses defaults for missing params
|
|
23
|
+
}
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Observation
|
|
27
|
+
During Phase 4 execution, `run-plan-notify.sh` was edited to add `total_batches` and `batch_title` parameters to `format_success_message` (6 → 9 params). The running `run-plan.sh` process had already sourced the original file at startup. When the next batch called `notify_success` with the old 6-parameter signature, the quality gate detected uncommitted changes and failed.
|
|
28
|
+
|
|
29
|
+
## Insight
|
|
30
|
+
Bash sources files once at startup — there's no hot-reload. But the *file on disk* is what `git diff` sees. So editing a sourced file creates a two-way failure: (1) the running process uses stale function signatures, and (2) the quality gate sees uncommitted changes. The fix had to be committed to unblock the gate, but that commit changed signatures the running process was still calling with old argument counts.
|
|
31
|
+
|
|
32
|
+
## Lesson
|
|
33
|
+
Treat sourced library files as immutable during execution. If you must change them: (a) make changes backward-compatible with default parameter values, (b) commit immediately so the quality gate stays clean, and (c) accept that the current run uses the old behavior. Never change function arity in a file that a running process has already sourced.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0051
|
|
3
|
+
title: "Infrastructure fixes in a plan cannot benefit the run executing that plan"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [shell]
|
|
6
|
+
scope: [project:autonomous-coding-toolkit]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "A plan includes tasks that fix the execution infrastructure (e.g., empty batch detection, parser improvements) but the current run-plan.sh process loaded the old code at startup"
|
|
11
|
+
fix: "Place infrastructure fixes in a separate pre-flight plan, or accept that the current run uses old behavior and the fix only helps future runs."
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Plan Batch 1: Fix empty batch detection in run-plan-headless.sh
|
|
15
|
+
# -> Fix is committed, but the running bash process already loaded old code
|
|
16
|
+
# -> Batches 6-19 still spawn claude for empty batches (43s each)
|
|
17
|
+
good: |
|
|
18
|
+
# Option A: Separate pre-flight plan for infra fixes, then main plan
|
|
19
|
+
# Option B: Accept the cost — document that infra fixes are forward-looking
|
|
20
|
+
# Option C: Use --start-batch to re-run from where infra fix takes effect
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Observation
|
|
24
|
+
The Phase 4 plan included Task 1: "Fix empty batch detection in run-plan-headless.sh." The fix was committed during Batch 1. However, the `run-plan.sh` bash process had already loaded `run-plan-headless.sh` at startup. Batches 6-19 (parser artifacts) still spawned a `claude -p` process for each empty batch (~30-50s each), wasting ~7 minutes and API calls.
|
|
25
|
+
|
|
26
|
+
## Insight
|
|
27
|
+
Bash reads `source` files once. The running process keeps the in-memory version of all sourced functions. Committing a fix to disk doesn't update the running process — only a new invocation reads the new code. This is fundamentally different from interpreted languages with hot-reload (Python's importlib, Node's require cache invalidation).
|
|
28
|
+
|
|
29
|
+
## Lesson
|
|
30
|
+
Infrastructure fixes (parser, quality gate, notification format) cannot benefit the execution that implements them. Either: (1) run infra fixes as a separate pre-flight step before the main plan, (2) accept the waste and document it as known, or (3) after the infra batch, stop and re-run with `--start-batch` so a fresh process loads the fixed code.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0052
|
|
3
|
+
title: "Uncommitted changes from parallel work fail the quality gate git-clean check"
|
|
4
|
+
severity: blocker
|
|
5
|
+
languages: [shell]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Manual edits to files in a worktree where run-plan.sh is executing — the git-clean check in quality-gate.sh detects uncommitted changes and fails the batch"
|
|
11
|
+
fix: "Never make uncommitted changes in a worktree with an active run-plan. Use a separate worktree or commit before the next quality gate runs."
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# run-plan.sh is executing batches in ~/project/
|
|
15
|
+
# Meanwhile, manually edit scripts/lib/run-plan-notify.sh
|
|
16
|
+
# -> Quality gate runs check_git_clean() -> finds dirty working tree -> FAIL
|
|
17
|
+
good: |
|
|
18
|
+
# Option A: Edit in a separate worktree
|
|
19
|
+
git worktree add ../project-notify-fix -b fix/notifications
|
|
20
|
+
# Option B: Commit immediately before next quality gate
|
|
21
|
+
git add scripts/lib/run-plan-notify.sh && git commit -m "fix: ..."
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Observation
|
|
25
|
+
During Phase 4 execution, Telegram notification format was improved by editing `run-plan-notify.sh` and its test file directly in the worktree where `run-plan.sh` was running. When Batch 9 completed and the quality gate ran `check_git_clean()`, it found 3 uncommitted files and failed the batch. The batch agent then spent a full retry attempt (5+ minutes) trying to fix a problem that wasn't caused by its own work.
|
|
26
|
+
|
|
27
|
+
## Insight
|
|
28
|
+
The quality gate's git-clean check exists to ensure every batch's work is committed before the next batch starts. It can't distinguish between "the batch agent forgot to commit" and "a human made parallel edits." Both look the same: dirty working tree. The retry agent wastes time investigating a failure it can't fix, since the dirty files aren't part of its batch.
|
|
29
|
+
|
|
30
|
+
## Lesson
|
|
31
|
+
A worktree with an active run-plan is a no-edit zone. All parallel work must happen in a separate worktree or be committed immediately. If you must edit files in the active worktree, commit them before the next quality gate runs. The cost of a wasted retry (5+ minutes, API calls) far exceeds the cost of a quick commit.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0053
|
|
3
|
+
title: "Missing jq -c flag causes string comparison failures in tests"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [shell]
|
|
6
|
+
scope: [project:autonomous-coding-toolkit]
|
|
7
|
+
category: test-anti-patterns
|
|
8
|
+
pattern:
|
|
9
|
+
type: syntactic
|
|
10
|
+
regex: "assert_eq.*\\$\\(.*jq [^-]"
|
|
11
|
+
description: "Using jq without -c flag in a string comparison assertion — pretty-printed output won't match compact expected values"
|
|
12
|
+
fix: "Always use jq -c (compact) when the output will be compared as a string. Or compare with jq equality instead of string equality."
|
|
13
|
+
example:
|
|
14
|
+
bad: |
|
|
15
|
+
result=$(echo "$json" | jq '.[0] | sort')
|
|
16
|
+
assert_eq "group is [1]" '[1]' "$result"
|
|
17
|
+
# FAIL: expected [1], got [\n 1\n]
|
|
18
|
+
good: |
|
|
19
|
+
result=$(echo "$json" | jq -c '.[0] | sort')
|
|
20
|
+
assert_eq "group is [1]" '[1]' "$result"
|
|
21
|
+
# PASS: both are [1]
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Observation
|
|
25
|
+
In `test-run-plan-team.sh`, three assertions failed because one `jq` call used `jq '.[2] | sort'` (pretty-printed) while the test expected compact JSON `[4]`. The other two calls on adjacent lines correctly used `jq -c`. The inconsistency was introduced when the test was generated — two of three similar lines got the `-c` flag, one didn't.
|
|
26
|
+
|
|
27
|
+
## Insight
|
|
28
|
+
jq defaults to pretty-printing (multi-line, indented). When output is stored in a variable and compared with `assert_eq`, the multi-line string `[\n 4\n]` never matches the compact string `[4]`. This is invisible until the test runs because the pattern looks correct at a glance. The failure message shows the actual as multi-line, making the `-c` omission obvious only in hindsight.
|
|
29
|
+
|
|
30
|
+
## Lesson
|
|
31
|
+
In shell test scripts, always use `jq -c` when the result will be compared as a string. Better yet, use `jq -e` for boolean checks or compare with `jq --argjson expected '[4]' '. == $expected'` to avoid format sensitivity entirely.
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0054
|
|
3
|
+
title: "Markdown parser matches headers inside code blocks and test fixtures"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [shell]
|
|
6
|
+
scope: [project:autonomous-coding-toolkit]
|
|
7
|
+
category: silent-failures
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "A markdown parser using simple regex (grep/awk) matches ## headers that appear inside fenced code blocks, heredocs, or test fixture content — inflating batch/task counts"
|
|
11
|
+
fix: "Track fenced code block state (``` toggles) and skip matches inside code blocks. Or use a proper markdown AST parser."
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# count_batches uses: grep -c '^## Batch'
|
|
15
|
+
# Plan has a test fixture with '## Batch 2: Also Real' inside a heredoc
|
|
16
|
+
# -> Parser counts 19 batches for a 5-batch plan
|
|
17
|
+
good: |
|
|
18
|
+
count_batches() {
|
|
19
|
+
awk '/^```/{fence=!fence} !fence && /^## Batch/{n++} END{print n}' "$1"
|
|
20
|
+
}
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Observation
|
|
24
|
+
The Phase 4 plan had 5 real batches, but `count_batches` found 19. The extra 14 came from `## Batch` and `### Task` headers inside test fixtures, code examples, and plan documentation sections. Each phantom batch spawned a `claude -p` process (~30-50s each), wasting ~7 minutes and API credits.
|
|
25
|
+
|
|
26
|
+
## Insight
|
|
27
|
+
Simple `grep '^## Batch'` treats all lines equally — it cannot distinguish a real plan header from one inside a fenced code block (` ``` `), a heredoc, or an inline example. This is a fundamental limitation of line-by-line regex parsing of markdown. The problem compounds: the plan's own test (Task 1) includes sample plan content with headers, creating a recursive parsing trap.
|
|
28
|
+
|
|
29
|
+
## Lesson
|
|
30
|
+
Any markdown parser that affects execution (batch counting, task extraction) must be code-block-aware. Minimum viable fix: track ` ``` ` fence state with a toggle variable and skip matches inside fences. Better: use a dedicated markdown heading extraction that respects the CommonMark spec. The empty-batch-skip mitigates the cost but doesn't prevent the API calls for the initial `claude -p` attempt on each phantom batch.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0055
|
|
3
|
+
title: "LLM agents compensate for garbled batch prompts using cross-batch context"
|
|
4
|
+
severity: nice-to-have
|
|
5
|
+
languages: [all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "An agent receives a malformed or empty batch prompt but successfully infers the correct work from progress.txt, recent git commits, and the full plan file"
|
|
11
|
+
fix: "Design for resilience: include progress notes, recent commits, and the full plan in every batch prompt so agents can self-correct when the parsed batch content is wrong."
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Batch prompt: "Batch 9: (empty)" with no tasks
|
|
15
|
+
# Agent has no context -> does nothing or hallucinates
|
|
16
|
+
good: |
|
|
17
|
+
# Batch prompt: "Batch 9: (empty)" BUT includes:
|
|
18
|
+
# - progress.txt with completed tasks listed
|
|
19
|
+
# - Recent git log showing what's been done
|
|
20
|
+
# - Full plan file reference
|
|
21
|
+
# -> Agent reads plan, deduces remaining work, implements correctly
|
|
22
|
+
---
|
|
23
|
+
|
|
24
|
+
## Observation
|
|
25
|
+
During Phase 4, batches 2 and 9 received garbled prompts — Batch 2 got fake content from a test fixture ("Task 2: Do more / Write more code"), and Batch 9 got an empty batch title. Despite this, both agents successfully implemented the correct plan tasks. Batch 2 implemented Tasks 7-9 (context assembler), and Batch 9 implemented Tasks 10, 11, 12, 15, and 17 (ast-grep + team mode).
|
|
26
|
+
|
|
27
|
+
## Insight
|
|
28
|
+
The cross-batch context system (progress.txt, recent commits in the prompt, and the plan file reference) provides enough information for agents to self-correct. The agent reads what's been done, compares it to the full plan, and picks up the next logical tasks. This resilience is an emergent property of including redundant context — no single source needs to be correct as long as the ensemble is informative.
|
|
29
|
+
|
|
30
|
+
## Lesson
|
|
31
|
+
Always include multiple context signals in batch prompts: (1) progress notes listing completed work, (2) recent git commits showing actual changes, (3) the full plan file path for reference. This creates graceful degradation — even when the parser sends wrong batch content, agents can figure out what work remains. The cost is slightly larger prompts; the benefit is resilience to parser bugs.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0056
|
|
3
|
+
title: "grep -c exits 1 on zero matches, breaking || fallback arithmetic"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [shell]
|
|
6
|
+
scope: [language:bash]
|
|
7
|
+
category: silent-failures
|
|
8
|
+
pattern:
|
|
9
|
+
type: syntactic
|
|
10
|
+
regex: "grep\\s+-c.*\\|\\|\\s*echo\\s+[\"']?0[\"']?"
|
|
11
|
+
description: "grep -c with || echo 0 fallback — produces multiline output on zero matches"
|
|
12
|
+
fix: "Use || true with ${var:-0} default instead of || echo 0"
|
|
13
|
+
example:
|
|
14
|
+
bad: |
|
|
15
|
+
count=$(echo "$text" | grep -c "pattern" || echo "0")
|
|
16
|
+
result=$((count + 1)) # breaks: count="0\n0" from both outputs
|
|
17
|
+
good: |
|
|
18
|
+
count=$(echo "$text" | grep -c "pattern" || true)
|
|
19
|
+
count=${count:-0}
|
|
20
|
+
result=$((count + 1))
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Observation
|
|
24
|
+
|
|
25
|
+
`grep -c` returns both the count AND exit code 1 when count is 0.
|
|
26
|
+
With `|| echo "0"`, the fallback fires AND grep's "0" output is kept,
|
|
27
|
+
producing `"0\n0"`. Bash arithmetic `$((0\n0 + 1))` fails with
|
|
28
|
+
"syntax error in expression".
|
|
29
|
+
|
|
30
|
+
## Insight
|
|
31
|
+
|
|
32
|
+
`grep -c` violates the common assumption that exit code 1 means "error."
|
|
33
|
+
In grep, exit 1 means "no matches found" — a valid result, not a failure.
|
|
34
|
+
The `|| echo "0"` pattern double-counts because the subshell captures
|
|
35
|
+
grep's stdout ("0") AND the fallback echo ("0") on separate lines.
|
|
36
|
+
|
|
37
|
+
## Lesson
|
|
38
|
+
|
|
39
|
+
Never use `grep -c ... || echo "0"` for count fallback. Use
|
|
40
|
+
`grep -c ... || true` to suppress the exit code, then `${var:-0}` as
|
|
41
|
+
the numeric default. This pattern is safe because `|| true` doesn't
|
|
42
|
+
add to stdout — it only prevents `set -e` from aborting the script.
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
---
|
|
2
|
+
id: 0057
|
|
3
|
+
title: "New generated artifacts break git-clean quality gates"
|
|
4
|
+
severity: should-fix
|
|
5
|
+
languages: [all]
|
|
6
|
+
scope: [universal]
|
|
7
|
+
category: integration-boundaries
|
|
8
|
+
pattern:
|
|
9
|
+
type: semantic
|
|
10
|
+
description: "Adding a new generated file to a pipeline without updating gitignore and E2E tests"
|
|
11
|
+
fix: "When adding generated artifacts, update .gitignore AND all E2E test gitignore fixtures"
|
|
12
|
+
example:
|
|
13
|
+
bad: |
|
|
14
|
+
# Added generate_agents_md() to startup
|
|
15
|
+
# AGENTS.md created in worktree
|
|
16
|
+
# E2E test fails: "uncommitted changes in worktree"
|
|
17
|
+
good: |
|
|
18
|
+
# Added generate_agents_md() to startup
|
|
19
|
+
# Updated E2E test .gitignore to include AGENTS.md
|
|
20
|
+
# E2E test passes: git-clean check ignores AGENTS.md
|
|
21
|
+
---
|
|
22
|
+
|
|
23
|
+
## Observation
|
|
24
|
+
|
|
25
|
+
Adding `generate_agents_md()` to the headless runner startup created
|
|
26
|
+
AGENTS.md in the worktree. The function's own unit test passed. But the
|
|
27
|
+
E2E test failed because its git worktree now had an untracked file,
|
|
28
|
+
and the quality gate's `check_git_clean` rejected it.
|
|
29
|
+
|
|
30
|
+
## Insight
|
|
31
|
+
|
|
32
|
+
This is Cluster B (Integration Boundaries). When a pipeline generates
|
|
33
|
+
new files, the git-clean check sees them as uncommitted work. Every
|
|
34
|
+
generated artifact needs a corresponding gitignore entry — both in the
|
|
35
|
+
real project AND in test fixtures that simulate the worktree.
|
|
36
|
+
|
|
37
|
+
## Lesson
|
|
38
|
+
|
|
39
|
+
Whenever you add a new generated file to a pipeline: (1) add it to the
|
|
40
|
+
project's `.gitignore`, (2) add it to every E2E test fixture's
|
|
41
|
+
`.gitignore`, (3) run the E2E test before committing. The unit test for
|
|
42
|
+
the generator won't catch this because it doesn't run the quality gate.
|