@hallucination-studio/harness-engine 1.0.0 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/LICENSE +21 -0
  2. package/README.md +185 -27
  3. package/bin/install.js +29 -17
  4. package/package.json +10 -4
  5. package/skills/harness-engine/SKILL.md +97 -0
  6. package/skills/harness-engine/agents/openai.yaml +4 -0
  7. package/skills/harness-engine/evals/cases.json +94 -0
  8. package/skills/harness-engine/evals/harness_engine_evals/__init__.py +1 -0
  9. package/skills/harness-engine/evals/harness_engine_evals/cases_frontend.py +211 -0
  10. package/skills/harness-engine/evals/harness_engine_evals/cases_lifecycle.py +1616 -0
  11. package/skills/harness-engine/evals/harness_engine_evals/helpers.py +155 -0
  12. package/skills/harness-engine/evals/harness_engine_evals/registry.py +55 -0
  13. package/skills/harness-engine/evals/harness_engine_evals/report.py +36 -0
  14. package/skills/harness-engine/evals/harness_engine_evals/runner.py +53 -0
  15. package/skills/harness-engine/evals/run_evals.py +14 -0
  16. package/skills/{harness-repo-bootstrap → harness-engine}/references/evaluation-loop.md +8 -2
  17. package/skills/harness-engine/references/evidence-first-evals.md +187 -0
  18. package/skills/harness-engine/references/exec-plans.md +59 -0
  19. package/skills/{harness-repo-bootstrap → harness-engine}/references/file-map.md +3 -3
  20. package/skills/{harness-repo-bootstrap → harness-engine}/references/knowledge-capture.md +2 -2
  21. package/skills/{harness-repo-bootstrap → harness-engine}/references/sop-index.md +3 -0
  22. package/skills/harness-engine/references/template-policy.md +17 -0
  23. package/skills/harness-engine/references/workflow.md +62 -0
  24. package/skills/harness-engine/scripts/harness_engine/__init__.py +1 -0
  25. package/skills/harness-engine/scripts/harness_engine/analysis.py +240 -0
  26. package/skills/harness-engine/scripts/harness_engine/checks.py +287 -0
  27. package/skills/harness-engine/scripts/harness_engine/cli.py +656 -0
  28. package/skills/harness-engine/scripts/harness_engine/common.py +977 -0
  29. package/skills/harness-engine/scripts/harness_engine/continuation.py +520 -0
  30. package/skills/harness-engine/scripts/harness_engine/git_ops.py +88 -0
  31. package/skills/harness-engine/scripts/harness_engine/knowledge.py +329 -0
  32. package/skills/harness-engine/scripts/harness_engine/plans.py +630 -0
  33. package/skills/harness-engine/scripts/harness_engine/templates.py +124 -0
  34. package/skills/harness-engine/scripts/manage_harness.py +14 -0
  35. package/skills/harness-repo-bootstrap/SKILL.md +0 -68
  36. package/skills/harness-repo-bootstrap/agents/openai.yaml +0 -4
  37. package/skills/harness-repo-bootstrap/evals/cases.json +0 -18
  38. package/skills/harness-repo-bootstrap/evals/run_evals.py +0 -337
  39. package/skills/harness-repo-bootstrap/references/exec-plans.md +0 -39
  40. package/skills/harness-repo-bootstrap/references/template-policy.md +0 -12
  41. package/skills/harness-repo-bootstrap/references/workflow.md +0 -47
  42. package/skills/harness-repo-bootstrap/scripts/manage_harness.py +0 -1181
  43. package/skills/{harness-repo-bootstrap → harness-engine}/assets/repo-template/.keep +0 -0
  44. package/skills/{harness-repo-bootstrap → harness-engine}/assets/sops/.keep +0 -0
  45. package/skills/{harness-repo-bootstrap → harness-engine}/references/question-catalog.md +0 -0
@@ -0,0 +1,977 @@
1
+ import hashlib
2
+ import json
3
+ import os
4
+ import re
5
+ import shutil
6
+ import subprocess
7
+ import time
8
+ from datetime import UTC, datetime
9
+ from pathlib import Path
10
+
11
+ MANAGED_MARKER = "<!-- harness-engine:managed -->"  # HTML-comment tag naming this tool; NOTE(review): presumably the value substituted for `{marker}` in the templates below - confirm
12
+ OBSOLETE_MANAGED_MARKERS = [  # same tag shape under two other tool names; NOTE(review): presumably still recognized on files written by earlier releases - confirm against callers
13
+ "<!-- harness-repo-bootstrap:managed -->",
14
+ "<!-- harness-init:managed -->",
15
+ ]
16
+ DEFAULT_KNOWLEDGE_PLACEHOLDER = "- [ ] Add durable facts here as they emerge -> <destination-doc>"  # an unchecked Markdown task item; NOTE(review): presumably the default for `{knowledge_section}` in PLAN_TEMPLATE - confirm
17
+ DEFAULT_DEFECT_PLACEHOLDER = "None."  # NOTE(review): presumably the default for `{defect_section}` in PLAN_TEMPLATE - confirm
18
+ PLAN_PLACEHOLDERS = [  # starter sentences; six of the nine occur verbatim in PLAN_TEMPLATE below
19
+ "Define in-scope work.",
20
+ "Define out-of-scope work.",
21
+ "Add relevant product, architecture, reliability, security, or delivery constraints.",
22
+ "Add the first concrete step.",
23
+ "Add the next concrete step.",
24
+ "Add the next step.",  # not present in PLAN_TEMPLATE as shown; NOTE(review): presumably wording from an older template - confirm
25
+ "Describe how the work will be verified.",
26
+ "List product, architecture, reliability, security, or delivery constraints.",  # not present in PLAN_TEMPLATE as shown
27
+ "Describe what is included and excluded.",  # not present in PLAN_TEMPLATE as shown
28
+ ]
29
+ GITIGNORE_BLOCK_START = "# harness-engine transient files"  # begins with '#', so it reads as a comment line in .gitignore syntax
30
+ GITIGNORE_BLOCK_END = "# end harness-engine transient files"  # NOTE(review): presumably START/END bracket a tool-owned block inside .gitignore - confirm against callers
31
+ GITIGNORE_ENTRIES = [  # directory patterns (trailing slash); the AGENTS.md template below describes both as ignored by default
32
+ ".codex/skills/",
33
+ "docs/generated/",
34
+ ]
35
+ CLEAN_INIT_DIRS = [  # NOTE(review): usage is outside this view; presumably directories reset during init - confirm
36
+ "docs/generated",
37
+ ]
38
+ GIT_CLEAN_PATHS = [  # the same two paths as GITIGNORE_ENTRIES, written without the trailing slash; usage is outside this view
39
+ ".codex/skills",
40
+ "docs/generated",
41
+ ]
42
+ PLAN_TEMPLATE = """# Execution Plan: {title}
43
+
44
+ ## Goal
45
+
46
+ {goal}
47
+
48
+ ## Scope
49
+
50
+ - Define in-scope work.
51
+ - Define out-of-scope work.
52
+
53
+ ## Constraints
54
+
55
+ - Add relevant product, architecture, reliability, security, or delivery constraints.
56
+
57
+ ## Steps
58
+
59
+ 1. Add the first concrete step.
60
+ 2. Add the next concrete step.
61
+
62
+ ## Validation
63
+
64
+ - Describe how the work will be verified.
65
+
66
+ ## Acceptance Contract
67
+
68
+ Status: draft
69
+ Fingerprint: pending
70
+
71
+ Run `acceptance-set` before implementation to define specific product, UX, architecture, reliability, and security acceptance criteria.
72
+
73
+ | Dimension | Criteria |
74
+ | --- | --- |
75
+ | Product correctness | pending |
76
+ | UX and operator clarity | pending |
77
+ | Architecture and maintainability | pending |
78
+ | Reliability and observability | pending |
79
+ | Security and data handling | pending |
80
+
81
+ ## Quality Result
82
+
83
+ Status: pending
84
+ Minimum score: 8.0
85
+ Average score: pending
86
+ Last scored: pending
87
+ Criteria fingerprint: pending
88
+
89
+ Run `quality-score` after implementation and validation. Scores must cite evidence for the ready acceptance contract.
90
+
91
+ ## Defects To Resolve
92
+
93
+ {defect_section}
94
+
95
+ ## Rework Required
96
+
97
+ - Acceptance Contract is not ready.
98
+
99
+ ## Continuation Decision
100
+
101
+ Decision: pending
102
+ Workstream: none
103
+ Next target: none
104
+ Next action: none
105
+ Closure reason: none
106
+ Resume notes: none
107
+
108
+ ## Durable Knowledge To Capture
109
+
110
+ {knowledge_section}
111
+
112
+ ## Completion Notes
113
+
114
+ Pending.
115
+ """
116
+
117
+ ROOT_FILES = {
118
+ "AGENTS.md": """{marker}
119
+ # AGENTS
120
+
121
+ Read this file first, then follow the linked docs.
122
+
123
+ ## Routing
124
+
125
+ - Read `ARCHITECTURE.md` before changing boundaries, data flow, or integrations.
126
+ - Read `docs/PLANS.md` before any repository change. Every code, doc, config, test, dependency, build, release, or runtime-behavior change needs an execution plan.
127
+ - Read `docs/exec-plans/workstreams.md` before resuming interrupted feature, refactor, reliability, security, frontend, or cleanup work.
128
+ - Read `docs/exec-plans/active/` before changing files; use `plan-start` when no active plan covers the requested repository change.
129
+ - Read `docs/QUALITY_SCORE.md` before evaluating tradeoffs or readiness.
130
+ - Read `docs/RELIABILITY.md` for runtime validation and failure handling.
131
+ - Read `docs/SECURITY.md` before touching auth, secrets, permissions, or sensitive data.
132
+ - Read `docs/FRONTEND.md` and `docs/DESIGN.md` for UI, terminal interface, layout, visual-state, canvas, or interaction changes.
133
+ - Read the matching file in `docs/sops/` before architecture changes, UI validation, observability work, evidence-first evals, or knowledge capture.
134
+
135
+ ## Harness Task Intake
136
+
137
+ Default rule: any request that changes repository files or behavior goes through the harness lifecycle. This includes code, docs, configuration, tests, dependencies, generated templates, build/release scripts, runtime behavior, migrations, cleanup, and fixes found during review. Codex starts or reuses an execution plan, sets acceptance before implementation, validates with evidence, runs `quality-score`, closes the plan, then runs `check`.
138
+
139
+ No-plan exceptions are narrow: pure question answering, read-only investigation, showing command output, or status reporting with no file changes. If the work moves from investigation to editing files, create or reuse an active plan before editing.
140
+
141
+ | Request Type | Read First | SOP | Minimum Evidence |
142
+ | --- | --- | --- | --- |
143
+ | New feature or product behavior | `docs/PRODUCT_SENSE.md`, `docs/product-specs/`, `docs/PLANS.md` | `docs/sops/evidence-first-eval-loop.md` | Product assertions, workflow checks, tests or smoke evidence |
144
+ | Bug, regression, or user-reported issue | `AGENTS.md` Issue Workflows, affected domain docs, `docs/PLANS.md` | Domain SOP from Issue Workflows | Reproduction, regression assertion, fix validation, defect log if confirmed |
145
+ | Refactor, cleanup, or code organization | `ARCHITECTURE.md`, `docs/PLANS.md`, `docs/exec-plans/workstreams.md` | `docs/sops/layered-domain-architecture-setup.md` when boundaries change | Before/after behavior checks, boundary or dependency notes, compatibility evidence |
146
+ | Frontend, UI, design, layout, terminal interface, visual state, or interaction | `docs/FRONTEND.md`, `docs/DESIGN.md`, `docs/QUALITY_SCORE.md` | `docs/sops/chrome-devtools-ui-validation-loop.md` and evidence-first eval loop | Browser or local-runtime evidence for workflows, states, and relevant viewports |
147
+ | Tests, evals, fixtures, or validation harnesses | `docs/QUALITY_SCORE.md`, `docs/sops/evidence-first-eval-loop.md`, relevant product or architecture docs | `docs/sops/evidence-first-eval-loop.md` | Failing-before or coverage rationale, passing test/eval output, artifact paths when produced |
148
+ | Documentation, policy, specs, or generated harness templates | `docs/PLANS.md`, affected docs, `docs/QUALITY_SCORE.md` | `docs/sops/encode-unseen-knowledge.md` when durable facts change | Doc diff review, link/path validation, generated-output or eval evidence when templates change |
149
+ | Dependencies, tooling, package manager, or build system | `ARCHITECTURE.md`, `docs/RELIABILITY.md`, `docs/SECURITY.md` | Local observability SOP when runtime behavior can change | Install/build/test output, lockfile or package diff, compatibility and rollback notes |
150
+ | Build, release, deployment, or packaging | `ARCHITECTURE.md`, `docs/RELIABILITY.md`, `docs/SECURITY.md` | `docs/sops/local-observability-feedback-loop.md` | Repeatable build/package output, smoke check, release-risk notes |
151
+ | Configuration, environment, flags, secrets handling, or policy gates | `docs/SECURITY.md`, `docs/RELIABILITY.md`, `ARCHITECTURE.md` | Local observability SOP; security review rules | Config diff, secret-handling review, permission or failure-mode evidence |
152
+ | Data, migrations, storage, cache, queues, or file formats | `ARCHITECTURE.md`, `docs/RELIABILITY.md`, `docs/SECURITY.md` | Evidence-first eval loop | Fixtures or migration checks, rollback/compatibility evidence, data-loss risk notes |
153
+ | Performance, reliability, observability, or operational behavior | `docs/RELIABILITY.md`, `ARCHITECTURE.md`, `docs/QUALITY_SCORE.md` | `docs/sops/local-observability-feedback-loop.md` | Baseline measurement, repeatable benchmark or smoke check, logs/traces, before/after evidence |
154
+ | Security, privacy, auth, authorization, or sensitive data | `docs/SECURITY.md`, `ARCHITECTURE.md`, `docs/QUALITY_SCORE.md` | Evidence-first eval loop plus security review rules | Threat check, sensitive-data path, permission test, and secret-handling evidence |
155
+ | Code review finding or user feedback that requires changes | Affected domain docs, `docs/PLANS.md`, `docs/QUALITY_SCORE.md` | Matching domain SOP | Finding reproduction or rationale, changed-file validation, defect log when it is a bug |
156
+
157
+ For every repository change:
158
+
159
+ - Inspect the relevant code path, runtime path, and user/operator workflow before editing.
160
+ - Codex creates or reuses an active plan with `plan-start`; keep plan scope lightweight for small changes, but do not skip the lifecycle.
161
+ - Codex defines a ready Acceptance Contract with `acceptance-set` before implementation.
162
+ - Convert requirements, risks, or reported failures into assertions, tests, smoke checks, or review evidence.
163
+ - Log confirmed defects or missing evidence with `defect-log`; unresolved defects must block `plan-close`, and scoring must be rerun after defects are resolved.
164
+ - Run task-appropriate validation, then have Codex score with `quality-score` using concrete evidence notes for every dimension.
165
+ - Codex closes with `plan-close` only after validation, passing quality, resolved defects, and durable knowledge updates are complete.
166
+ - Codex runs the local harness check before handoff.
167
+
168
+ ## Issue Workflows
169
+
170
+ For any user-reported issue, classify the domain first, read the listed files, then reproduce,
171
+ fix, and validate with evidence before judging the result. Issue handling is one branch of Harness Task Intake; if a fix or repository change is needed, the full plan, acceptance, quality, close, and check lifecycle applies.
172
+
173
+ | Domain | Read First | Required Evidence |
174
+ | --- | --- | --- |
175
+ | Product contract or acceptance drift | `docs/PRODUCT_SENSE.md`, `docs/product-specs/`, `docs/sops/evidence-first-eval-loop.md` | Product assertions, acceptance checks, or documented limitation |
176
+ | Frontend, UI, layout, interaction, responsive, canvas, visual state, or design fidelity | `docs/FRONTEND.md`, `docs/DESIGN.md`, `docs/sops/evidence-first-eval-loop.md` | Browser or local-runtime evidence across relevant workflows and viewports |
177
+ | Backend, API, runtime behavior, background jobs, or integrations | `ARCHITECTURE.md`, `docs/RELIABILITY.md`, `docs/sops/local-observability-feedback-loop.md` | Narrow reproduction, tests or API smoke checks, logs, and failure-mode evidence |
178
+ | Architecture boundaries, layering, data flow, or dependency direction | `ARCHITECTURE.md`, `docs/PLANS.md`, `docs/sops/layered-domain-architecture-setup.md` | Boundary map, tradeoff notes, migration or compatibility plan, and validation path |
179
+ | Data, state, migrations, cache, queues, or file formats | `ARCHITECTURE.md`, `docs/RELIABILITY.md`, `docs/SECURITY.md` | Fixtures or migration checks, rollback/compatibility evidence, and data-loss risk notes |
180
+ | Security, privacy, auth, authorization, secrets, or sensitive data | `docs/SECURITY.md`, `ARCHITECTURE.md` | Threat check, sensitive-data path, permission test, and secret-handling evidence |
181
+ | Performance, capacity, timeout, resource use, or availability | `docs/RELIABILITY.md`, `ARCHITECTURE.md`, `docs/sops/local-observability-feedback-loop.md` | Baseline measurement, repeatable benchmark or smoke check, and before/after evidence |
182
+
183
+ ## Repository Focus
184
+
185
+ - Project: {project_name}
186
+ - Domain: {product_domain}
187
+ - Primary outcome: {project_summary}
188
+ - Main users: {primary_users}
189
+
190
+ ## Operating Rules
191
+
192
+ - Keep durable decisions in repo docs, not only in chat.
193
+ - Keep active plans in `docs/exec-plans/active/` and completed plans in `docs/exec-plans/completed/`; both the Markdown plans and JSON sidecars are version-controlled project state.
194
+ - Keep resumable feature, refactor, reliability, security, frontend, and cleanup work in `docs/exec-plans/workstreams.md`.
195
+ - Update plans during the work, not only at the end.
196
+ - Codex defines acceptance criteria with `acceptance-set` before implementation, then scores completed work with `quality-score` before closing an execution plan.
197
+ - If `quality-score` fails, treat `## Rework Required` as the next implementation input and do not close the plan.
198
+ - Encode durable facts learned during execution into permanent docs before closing the task.
199
+ - Before handoff, Codex runs the local harness check. Active plans must have ready Acceptance Contracts; completed plans must have passing Quality Results scored against the current contract.
200
+ - Keep generated evidence and transient artifacts in `docs/generated/`; it is ignored by default unless intentionally promoted into tracked docs.
201
+ - Keep local skill installs in `.codex/skills/`; they are ignored by default.
202
+ - Keep external references in `docs/references/`.
203
+ """,
204
+
205
+ "ARCHITECTURE.md": """{marker}
206
+ # Architecture
207
+
208
+ ## System Summary
209
+
210
+ {project_summary}
211
+
212
+ ## Domain Boundaries
213
+
214
+ - Product domain: {product_domain}
215
+ - Primary users: {primary_users}
216
+ - Deployment targets: {deployment_targets}
217
+
218
+ ## Repository Shape
219
+
220
+ - Detected languages: {languages}
221
+ - Detected package managers: {package_managers}
222
+ - Detected frameworks: {frameworks}
223
+
224
+ ## Reliability Architecture
225
+
226
+ {reliability_targets}
227
+
228
+ ## Security Architecture
229
+
230
+ {security_constraints}
231
+
232
+ ## Open Questions
233
+
234
+ - Document major runtime boundaries, shared libraries, and integration seams here as the codebase grows.
235
+ """,
236
+ }
237
+
238
+ FRONTEND_DOC_FILES = {
239
+ "docs/DESIGN.md": """---
240
+ version: alpha
241
+ name: {project_name} Design System
242
+ description: Project-owned unified visual specification for frontend and interface work.
243
+ frontend: true
244
+ source: harness-engine-template
245
+ colors:
246
+ primary: "#1A1C1E"
247
+ on-primary: "#FFFFFF"
248
+ primary-container: "#F0F1F2"
249
+ on-primary-container: "#1A1C1E"
250
+ secondary: "#6C7278"
251
+ on-secondary: "#FFFFFF"
252
+ tertiary: "#B8422E"
253
+ on-tertiary: "#FFFFFF"
254
+ neutral: "#F7F5F2"
255
+ surface: "#FFFFFF"
256
+ surface-muted: "#F3F4F6"
257
+ surface-elevated: "#FFFFFF"
258
+ text: "#1A1C1E"
259
+ muted: "#6C7278"
260
+ border: "#D7D9DD"
261
+ focus: "#2563EB"
262
+ success: "#166534"
263
+ warning: "#A16207"
264
+ danger: "#B91C1C"
265
+ typography:
266
+ display-xl:
267
+ fontFamily: Inter
268
+ fontSize: 56px
269
+ fontWeight: "700"
270
+ lineHeight: 1.1
271
+ letterSpacing: 0px
272
+ display-md:
273
+ fontFamily: Inter
274
+ fontSize: 44px
275
+ fontWeight: "700"
276
+ lineHeight: 1.12
277
+ letterSpacing: 0px
278
+ headline-lg:
279
+ fontFamily: Inter
280
+ fontSize: 32px
281
+ fontWeight: "650"
282
+ lineHeight: 1.2
283
+ letterSpacing: 0px
284
+ headline-md:
285
+ fontFamily: Inter
286
+ fontSize: 24px
287
+ fontWeight: "650"
288
+ lineHeight: 1.25
289
+ letterSpacing: 0px
290
+ title-lg:
291
+ fontFamily: Inter
292
+ fontSize: 20px
293
+ fontWeight: "650"
294
+ lineHeight: 28px
295
+ letterSpacing: 0px
296
+ title-md:
297
+ fontFamily: Inter
298
+ fontSize: 18px
299
+ fontWeight: "650"
300
+ lineHeight: 26px
301
+ letterSpacing: 0px
302
+ body-lg:
303
+ fontFamily: Inter
304
+ fontSize: 18px
305
+ fontWeight: "400"
306
+ lineHeight: 30px
307
+ letterSpacing: 0px
308
+ body-md:
309
+ fontFamily: Inter
310
+ fontSize: 16px
311
+ fontWeight: "400"
312
+ lineHeight: 24px
313
+ letterSpacing: 0px
314
+ body-sm:
315
+ fontFamily: Inter
316
+ fontSize: 14px
317
+ fontWeight: "400"
318
+ lineHeight: 20px
319
+ letterSpacing: 0px
320
+ label-md:
321
+ fontFamily: Inter
322
+ fontSize: 13px
323
+ fontWeight: "600"
324
+ lineHeight: 20px
325
+ letterSpacing: 0px
326
+ label-sm:
327
+ fontFamily: Inter
328
+ fontSize: 12px
329
+ fontWeight: "600"
330
+ lineHeight: 16px
331
+ letterSpacing: 0px
332
+ rounded:
333
+ xs: 2px
334
+ sm: 4px
335
+ md: 8px
336
+ lg: 12px
337
+ xl: 16px
338
+ full: 9999px
339
+ spacing:
340
+ base: 8px
341
+ xs: 4px
342
+ sm: 8px
343
+ md: 16px
344
+ lg: 24px
345
+ xl: 40px
346
+ xxl: 64px
347
+ gutter: 24px
348
+ page: 48px
349
+ components:
350
+ button-primary:
351
+ backgroundColor: "{{colors.tertiary}}"
352
+ textColor: "{{colors.on-tertiary}}"
353
+ typography: "{{typography.label-md}}"
354
+ rounded: "{{rounded.md}}"
355
+ padding: "{{spacing.md}}"
356
+ height: 44px
357
+ button-primary-hover:
358
+ backgroundColor: "{{colors.primary}}"
359
+ textColor: "{{colors.on-primary}}"
360
+ button-secondary:
361
+ backgroundColor: "{{colors.surface}}"
362
+ textColor: "{{colors.primary}}"
363
+ typography: "{{typography.label-md}}"
364
+ rounded: "{{rounded.md}}"
365
+ padding: "{{spacing.md}}"
366
+ height: 44px
367
+ button-secondary-hover:
368
+ backgroundColor: "{{colors.primary-container}}"
369
+ textColor: "{{colors.on-primary-container}}"
370
+ card:
371
+ backgroundColor: "{{colors.surface-elevated}}"
372
+ textColor: "{{colors.text}}"
373
+ rounded: "{{rounded.lg}}"
374
+ padding: "{{spacing.lg}}"
375
+ input:
376
+ backgroundColor: "{{colors.surface}}"
377
+ textColor: "{{colors.text}}"
378
+ typography: "{{typography.body-md}}"
379
+ rounded: "{{rounded.sm}}"
380
+ padding: "{{spacing.md}}"
381
+ height: 40px
382
+ badge:
383
+ backgroundColor: "{{colors.surface-muted}}"
384
+ textColor: "{{colors.muted}}"
385
+ typography: "{{typography.label-sm}}"
386
+ rounded: "{{rounded.full}}"
387
+ padding: "{{spacing.xs}}"
388
+ table-row:
389
+ backgroundColor: "{{colors.surface}}"
390
+ textColor: "{{colors.text}}"
391
+ typography: "{{typography.body-sm}}"
392
+ height: 44px
393
+ ---
394
+
395
+ # Design System: {project_name}
396
+
397
+ ## Overview
398
+
399
+ {project_summary}
400
+
401
+ Project positioning: {product_domain}
402
+
403
+ Primary users: {primary_users}
404
+
405
+ Frontend context: {frontend_stack_notes}
406
+
407
+ Requested style direction: {design_style_direction}
408
+
409
+ Existing frontend code signals: {existing_frontend_style_notes}
410
+
411
+ This document is the repository-owned visual system. It follows the DESIGN.md pattern of YAML tokens plus markdown rationale; the upstream `design.md` format is used only as a reference for structure. Do not depend on external design-generation skills or packages during init. Refine this file from the human-confirmed style direction and the existing code signals above before large UI work.
412
+
413
+ ## Colors
414
+
415
+ Use the YAML tokens as the source of truth. Replace the starter palette with project-specific colors before major UI implementation. Derive replacements from the human-confirmed style direction and existing frontend code, not from an external generator. Every UI surface must map colors to semantic roles instead of introducing one-off hex values in components.
416
+
417
+ - **Primary / On Primary:** Durable brand presence, selected navigation, and high-emphasis surfaces.
418
+ - **Secondary:** Metadata, borders, captions, inactive states, and lower-emphasis UI.
419
+ - **Tertiary / On Tertiary:** Primary actions and critical interactive emphasis.
420
+ - **Neutral / Surface:** Page backgrounds, panels, cards, table rows, and form fields.
421
+ - **State colors:** Success, warning, danger, and focus must be used consistently for feedback and validation.
422
+ - **Borders:** Use the `border` token for rules, dividers, field strokes, table separators, and card outlines.
423
+
424
+ ## Typography
425
+
426
+ Use one primary UI font family across the product until the project explicitly documents a second family. All headings, labels, body text, metadata, tables, and controls must map to the typography tokens in frontmatter. Do not create local font sizes or weights in component files unless `docs/DESIGN.md` is updated first.
427
+
428
+ - **Display XL / Display MD:** Rare product-level moments, onboarding, or empty states. Do not use inside dense panels.
429
+ - **Headline LG / Headline MD:** Page, section, and major panel titles.
430
+ - **Title LG / Title MD:** Card titles, modal titles, table group headings, and toolbar labels.
431
+ - **Body LG / Body MD / Body SM:** Main reading text, dense table copy, helper text, and secondary descriptions.
432
+ - **Label MD / Label SM:** Buttons, form labels, badges, tabs, compact metadata, and column headers.
433
+ - **Font rule:** Use the tokenized `fontFamily`, `fontSize`, `fontWeight`, `lineHeight`, and `letterSpacing` values in shared style files so type remains uniform across the product.
434
+
435
+ ## Layout
436
+
437
+ Use the spacing tokens as the implementation scale. Keep workflow surfaces dense enough for repeated use but leave enough separation for scanning, comparison, and error recovery. Document fixed-format surfaces, responsive breakpoints, page gutters, panel padding, table density, and primary task areas here before implementing them. Do not add ad hoc margins that bypass the token scale.
438
+
439
+ ## Elevation & Depth
440
+
441
+ Prefer hierarchy through spacing, contrast, borders, tonal layers, and restrained shadows. Record the allowed elevation levels for base page, raised surface, modal/popover, and active drag/focus states. If the product is flat, say so and use border/contrast tokens consistently instead of shadows.
442
+
443
+ ## Shapes
444
+
445
+ Use the rounded token scale consistently. Buttons, inputs, cards, chips, modals, tables, and fixed-format controls should share a coherent corner-radius language. Do not mix pill, sharp, and soft-rounded styles without documenting the role of each shape.
446
+
447
+ ## Components
448
+
449
+ Define component treatment before scaling UI work:
450
+
451
+ - **Buttons:** color role, icon placement, loading state, disabled state, and hover/focus behavior.
452
+ - **Forms:** field shape, validation state, helper text, density, and keyboard ergonomics.
453
+ - **Navigation:** selected state, hierarchy, collapsed behavior, and responsive fallback.
454
+ - **Cards and panels:** surface color, border/elevation, padding, and information density.
455
+ - **Tables and lists:** row height, selected state, sorting/filtering affordances, empty state, and overflow behavior.
456
+ - **Feedback states:** loading, empty, error, success, warning, and permission-denied patterns.
457
+
458
+ All shared UI components must consume tokens from this document through the project's existing style layer, such as CSS variables, Tailwind theme config, theme modules, component library theme objects, or generated token notes. Component-local styling is allowed only for layout-specific composition, not for redefining global color, type, spacing, or radius decisions.
459
+
460
+ ## Do's and Don'ts
461
+
462
+ - Do update this file with project-specific visual decisions before large UI changes.
463
+ - Do reconcile the requested style direction with the current frontend code before changing shared styles.
464
+ - Do keep tokens and prose aligned: tokens provide exact values, prose explains when to use them.
465
+ - Do map tokens into the project's shared style file, theme config, or component theme module before broad UI implementation.
466
+ - Do cite this file in frontend plans and code-review notes when UI choices matter.
467
+ - Do validate meaningful UI work in a real browser before closing it out.
468
+ - Don't call external design skills or packages during harness init.
469
+ - Don't create one-off component styles that contradict this file without updating it.
470
+ - Don't leave generic palette or typography defaults in place for production-facing UI.
471
+ - Don't add untracked font families, font sizes, shadows, radii, or semantic colors directly in component files.
472
+ """,
473
+ "docs/FRONTEND.md": """{marker}
474
+ # Frontend
475
+
476
+ ## Project Positioning
477
+
478
+ Project: {project_name}
479
+
480
+ Domain: {product_domain}
481
+
482
+ Primary users: {primary_users}
483
+
484
+ Product purpose: {project_summary}
485
+
486
+ Requested style direction: {design_style_direction}
487
+
488
+ Existing frontend code signals: {existing_frontend_style_notes}
489
+
490
+ ## Scope
491
+
492
+ {frontend_scope}
493
+
494
+ ## Stack Notes
495
+
496
+ {frontend_stack_notes}
497
+
498
+ ## Validation Loop
499
+
500
+ {frontend_validation_loop}
501
+
502
+ ## Design Style Contract
503
+
504
+ - Read `docs/DESIGN.md` before implementing frontend, UI, layout, visual-state, canvas, or interaction work.
505
+ - Treat `docs/DESIGN.md` as the project-owned unified visual specification. It is written and maintained in this repository.
506
+ - Use the human-confirmed style direction and existing frontend code signals as the inputs for refining `docs/DESIGN.md`.
507
+ - Treat `docs/DESIGN.md` as the source of truth for UI tokens, colors, typography, spacing, radius, elevation, component treatment, and Do's and Don'ts.
508
+ - Files controlled by `docs/DESIGN.md` include token notes under `docs/design-docs/`, Tailwind theme files, global CSS variables, component theme modules, Storybook/theme previews, and any UI implementation that consumes those tokens or style rules.
509
+ - Agents must read in this order for UI work: `docs/FRONTEND.md`, `docs/DESIGN.md`, then the component, theme, or stylesheet being changed.
510
+ - When implementing UI, map `docs/DESIGN.md` tokens into the project's shared style layer first: CSS variables, Tailwind config, theme module, component-library theme, or equivalent existing style file.
511
+ - Do not add new fonts, font sizes, semantic colors, shadows, radii, or spacing scales directly in component files. Add or update tokens in `docs/DESIGN.md`, then update the shared style layer and consume it from components.
512
+ - Do not call external design-generation skills or package CLIs as part of harness init. If a project later adopts a generator, record that decision here first.
513
+
514
+ ## Evidence For Meaningful UI Work
515
+
516
+ - Capture desktop and mobile evidence for significant UI changes.
517
+ - Assert primary text, controls, selected state, loading state, empty state, error state, and primary interactions from the DOM or accessibility tree.
518
+ - Define and verify layout invariants for the changed surface, including readable content, non-overlapping controls, usable primary work area, stable fixed-format elements, and reachable actions.
519
+ - For responsive UI, verify that navigation, side panels, inspectors, toolbars, and secondary panes preserve the primary task area at intended breakpoints.
520
+ - For canvas, WebGL, or game UIs, add pixel or scene-state checks so a blank render cannot pass.
521
+ - Record browser limitations and fallback checks instead of claiming full UX validation when browser evidence is unavailable.
522
+ """,
523
+ "docs/design-docs/index.md": """{marker}
524
+ # Design Docs Index
525
+
526
+ - Add one document per durable design decision.
527
+ - Link active design decisions from plans and specs.
528
+ """,
529
+ "docs/design-docs/style-options.md": """{marker}
530
+ # Design System Control
531
+
532
+ The project owns `docs/DESIGN.md`. Harness Engine initializes the document from a local template modeled on the structure of the upstream `design.md` format, then the project refines it with its own product, brand, human-confirmed style direction, and existing frontend code.
533
+
534
+ ## Controlled Files
535
+
536
+ - `docs/DESIGN.md`: source of truth for design tokens and design rationale.
537
+ - `docs/design-docs/`: durable design decisions, option notes, and validation evidence.
538
+ - `src/styles/`, `app/styles/`, or equivalent style directories: CSS variables, Tailwind themes, or framework-specific theme modules.
539
+ - Component theme files, Storybook theme previews, and UI implementation files that consume shared tokens or style rules.
540
+
541
+ ## Operating Rules
542
+
543
+ - Read `docs/FRONTEND.md` before editing controlled files.
544
+ - Read `docs/DESIGN.md` before changing UI implementation.
545
+ - Keep token values and prose rationale in sync.
546
+ - Record major visual-system changes in this folder or in the active plan.
547
+ """,
548
+ "docs/design-docs/core-beliefs.md": """{marker}
549
+ # Core Beliefs
550
+
551
+ - Keep the repository as the system of record.
552
+ - Prefer explicit policies over implied team memory.
553
+ - Prefer repeatable checks over remembered rules.
554
+ """,
555
+ }
556
+
557
# DOC_FILES maps a repository-relative path to the initial text of that file.
# Most values are Markdown templates that begin with a `{marker}` placeholder
# and may contain further named placeholders (`{project_summary}`,
# `{primary_users}`, `{quality_focus}`, `{reliability_targets}`,
# `{security_constraints}`, `{project_name}`).
# NOTE(review): the placeholders look like str.format fields, but the code that
# fills them is not part of this section — confirm before adding literal braces
# to any template.
DOC_FILES = {
    # --- Top-level policy documents ---
    "docs/PLANS.md": """{marker}
# Plans

## Plan Lifecycle

- Create or reuse an execution plan for every repository change: code, docs, configuration, tests, dependencies, build/release scripts, generated templates, runtime behavior, migrations, cleanup, and fixes found during review.
- Put active execution plans in `docs/exec-plans/active/`.
- Move completed plans to `docs/exec-plans/completed/`.
- Commit active plans, completed plans, JSON sidecars, and `docs/exec-plans/workstreams.md` as durable project state.
- Track resumable multi-plan workstreams in `docs/exec-plans/workstreams.md`.
- Record cross-cutting follow-up work in `docs/exec-plans/tech-debt-tracker.md`.

## Authoring Rules

- Keep plans concrete, testable, and scoped.
- For small changes, keep the plan lightweight: narrow scope, short steps, and focused validation are fine, but the Acceptance Contract and Quality Result are still required.
- Update plans during the work, not after the fact.
- Link to specs, decisions, and validation artifacts when they exist.
- Include a section for durable knowledge that must be written back into permanent docs.
- Record a continuation decision before closing every plan. Use workstreams only for resumable continue or pause decisions.
- Do not treat plans as the final home for product, architecture, or policy knowledge.

## No-Plan Exceptions

Only skip an execution plan for pure question answering, read-only investigation, showing command output, or status reporting with no file changes. If the work moves from investigation to editing files, create or reuse an active plan before editing.
""",
    "docs/PRODUCT_SENSE.md": """{marker}
# Product Sense

## Product Summary

{project_summary}

## Users

{primary_users}

## Decision Rules

- Optimize for the main user outcome before edge polish.
- Make tradeoffs explicit when speed, quality, and scope conflict.
- Capture durable product decisions in `docs/product-specs/`.
""",
    "docs/QUALITY_SCORE.md": """{marker}
# Quality Score

## Priority Areas

{quality_focus}

## Scoring Dimensions

- Product correctness
- UX and operator clarity
- Architecture and maintainability
- Reliability and observability
- Security and data handling

## Evidence Requirements

- Product correctness scores must cite product contract checks, tests, browser assertions, or documented limitations.
- UX scores for frontend work must cite browser evidence such as screenshots, DOM/accessibility snapshots, or responsive viewport checks.
- Backend and runtime scores must cite narrow reproductions, tests, API smoke checks, logs, or integration evidence.
- Architecture scores must cite boundary, dependency, data-flow, migration, or compatibility evidence.
- Data and state scores must cite fixtures, migration checks, rollback checks, or data-loss risk analysis.
- Security scores must cite threat checks, permission tests, sensitive-data path review, or secret-handling evidence.
- Performance and reliability scores must cite baseline measurements, repeatable checks, failure-mode tests, or before/after evidence.
- Reliability scores must cite repeatable commands, smoke checks, logs, traces, or failure-mode tests.
- Every quality-score dimension requires a concrete evidence note tied to the ready Acceptance Contract; do not leave score notes empty.
- Open defects must be logged with `defect-log`; do not hide known failures inside a high numeric score.
- Treat LLM or human judgment as a summary over evidence, not as the only eval signal.

## Usage

- Score changes by affected domain and layer.
- Read `AGENTS.md` Harness Task Intake, Issue Workflows, and `docs/sops/evidence-first-eval-loop.md` before closing repository-mutating work.
- Document recurring weak spots and improvement themes here.
""",
    "docs/RELIABILITY.md": """{marker}
# Reliability

## Reliability Targets

{reliability_targets}

## Runtime Validation

- Define the smallest useful local validation loop.
- Document required health checks, logs, and dashboards.
- Capture recurring incidents or near misses in repo docs.
""",
    "docs/SECURITY.md": """{marker}
# Security

## Security Constraints

{security_constraints}

## Review Rules

- Review auth, authorization, secrets, and sensitive data changes explicitly.
- Prefer least privilege and traceable configuration.
- Record security-sensitive assumptions in durable docs.
""",
    # --- Execution-plan scaffolding ---
    "docs/exec-plans/tech-debt-tracker.md": """{marker}
# Tech Debt Tracker

Record follow-up work that should survive beyond a single execution plan.
""",
    "docs/exec-plans/workstreams.md": """{marker}
# Workstreams

Use this ledger only for resumable work that spans plans or is intentionally paused.

## Index

| ID | Status | Current Plan | Last Completed Plan | Next Action | Last Updated |
| --- | --- | --- | --- | --- | --- |

## Operating Rules

- Add a workstream when work spans multiple execution plans or may be resumed by another agent.
- Do not add one-off completed plans here unless their continuation decision is `continue` or `pause`.
- Keep `Current Plan` pointed at the active plan when one exists.
- Keep `Last Completed Plan` pointed at the latest completed plan after `plan-close`.
- Keep `Next Action` concrete enough that another agent can resume without chat history.
- If a workstream is paused, record the restart condition in `Next Action`.
""",
    "docs/exec-plans/active/README.md": """{marker}
# Active Execution Plans

Create one markdown file per in-flight repository change. A repository change includes code, docs, configuration, tests, dependencies, build/release scripts, generated templates, runtime behavior, migrations, cleanup, and fixes found during review.

Suggested filename:

`YYYY-MM-DD-short-task-name.md`

Minimum contents:

- goal
- scope
- constraints
- steps
- validation
- acceptance contract
- quality result
- defects to resolve
- rework required
- continuation decision
- durable knowledge to capture

Use a lightweight plan for small changes, but still set a ready Acceptance Contract, record a Quality Result, close the plan, and run the local harness check.
""",
    # The five rows of the Acceptance Contract table below use the same labels
    # as the "Scoring Dimensions" list in docs/QUALITY_SCORE.md above.
    "docs/exec-plans/active/_template.md": """{marker}
# Execution Plan: <title>

## Goal

Describe the intended outcome.

## Scope

Describe what is included and excluded.

## Constraints

List product, architecture, reliability, security, or delivery constraints.

## Steps

1. Add the first concrete step.
2. Add the next step.

## Validation

- Describe how the work will be verified.

## Acceptance Contract

Status: draft
Fingerprint: pending

Run `acceptance-set` before implementation to define specific product, UX, architecture, reliability, and security acceptance criteria.

| Dimension | Criteria |
| --- | --- |
| Product correctness | pending |
| UX and operator clarity | pending |
| Architecture and maintainability | pending |
| Reliability and observability | pending |
| Security and data handling | pending |

## Quality Result

Status: pending
Minimum score: 8.0
Average score: pending
Last scored: pending
Criteria fingerprint: pending

Run `quality-score` after implementation and validation. Scores must cite evidence for the ready acceptance contract.

## Rework Required

- Acceptance Contract is not ready.

## Continuation Decision

Decision: pending
Workstream: none
Next target: none
Next action: none
Closure reason: none
Resume notes: none

## Durable Knowledge To Capture

- List facts that must be written back into permanent docs before completion.

## Completion Notes

Summarize outcomes, follow-ups, and doc updates.
""",
    "docs/exec-plans/completed/README.md": """{marker}
# Completed Execution Plans

Move finished plans here after:

1. validation is complete
2. the Acceptance Contract is ready and the Quality Result has passed
3. a continuation decision has been recorded
4. permanent docs have been updated
5. any remaining follow-ups are recorded in workstreams, tech debt, or new plans
""",
    # --- Generated artifacts and product specs ---
    "docs/generated/db-schema.md": """{marker}
# Generated DB Schema

Place generated database or storage schema snapshots here when relevant.
""",
    "docs/product-specs/index.md": """{marker}
# Product Specs Index

- Add one durable product spec per important workflow or product area.
- Link the active plan that created or changed each spec when useful.
""",
    "docs/product-specs/new-user-onboarding.md": """{marker}
# New User Onboarding

## Outcome

Describe the desired first successful experience for a new user of {project_name}.

## Open Questions

- What must a new user understand before reaching value?
- Which steps are fragile or confusing today?
""",
    # --- Plain-text reference stubs (single line, no `{marker}` header) ---
    "docs/references/design-system-reference-llms.txt": "Add model-friendly design system notes or links here.\n",
    "docs/references/nixpacks-llms.txt": "Add model-friendly deployment or buildpack notes here.\n",
    "docs/references/uv-llms.txt": "Add model-friendly Python tooling notes here.\n",
    # --- Standard operating procedures ---
    "docs/sops/layered-domain-architecture-setup.md": """{marker}
# SOP: Layered Domain Architecture Setup

1. Identify user-facing domains and bounded contexts.
2. Map code ownership and integration seams.
3. Record allowed dependency direction between layers.
4. Capture the result in `ARCHITECTURE.md` and the relevant design docs.
""",
    "docs/sops/encode-unseen-knowledge.md": """{marker}
# SOP: Encode Unseen Knowledge

1. Notice repeated chat-only facts or tribal knowledge.
2. Decide the right durable home inside `docs/`.
3. Write the fact in concise, retrievable language.
4. Link it from the nearest routing doc if it will be reused often.
""",
    "docs/sops/local-observability-feedback-loop.md": """{marker}
# SOP: Local Observability Feedback Loop

1. Run the narrowest local reproduction of the issue.
2. Capture logs, metrics, traces, or screenshots.
3. Tighten the validation loop until failures are easy to observe.
4. Record the durable validation path in `docs/RELIABILITY.md`.
""",
    "docs/sops/chrome-devtools-ui-validation-loop.md": """{marker}
# SOP: Chrome DevTools UI Validation Loop

1. Open the relevant route in a browser.
2. Check layout, interaction, loading, error, and empty states.
3. Verify responsive behavior for the intended breakpoints.
4. Write reusable findings back to `docs/FRONTEND.md` or `docs/design-docs/`.
""",
    "docs/sops/evidence-first-eval-loop.md": """{marker}
# SOP: Evidence-First Eval Loop

1. Read Harness Task Intake in `AGENTS.md`; every repository-mutating change needs an active plan unless it is a documented no-plan exception.
2. Convert product requirements into explicit product contract checks and write them with `acceptance-set` before implementation.
3. Run deterministic validation before scoring: tests, API smoke checks, CLI checks, browser actions, and state assertions.
4. Read the Issue Workflows in `AGENTS.md` and the domain docs named there before judging or fixing reported bugs.
5. For frontend work, capture browser evidence: screenshots, DOM/accessibility snapshots, responsive checks, and layout invariants.
6. For backend, architecture, data, security, and performance work, capture the domain evidence named in `AGENTS.md`.
7. Log every discovered bug or evidence gap with `defect-log` before running `quality-score`.
8. Resolve defects only after fixes have passing evidence, then rerun validation and `quality-score`.
9. Report per-case results, failed assertions, artifact paths, and recommended next actions to the user.
""",
}
864
+
865
# Catalog of intake questions. Every entry is a dict with exactly three keys, in
# this order: "id" (short identifier), "prompt" (the question text) and
# "reason" (why the answer is needed). The entries are written as
# (id, prompt, reason) triples and expanded into dicts so the table stays compact.
# NOTE(review): several ids share their names with placeholders used in the doc
# templates (e.g. `project_summary`, `quality_focus`) — presumably the answers
# feed those templates; confirm against the rendering code.
QUESTION_CATALOG = [
    {"id": question_id, "prompt": question_prompt, "reason": question_reason}
    for question_id, question_prompt, question_reason in (
        (
            "project_summary",
            "What is the main user or business outcome this repository exists to deliver?",
            "Needed for AGENTS, ARCHITECTURE, and product docs.",
        ),
        (
            "primary_users",
            "Who are the primary users or operators of this repository?",
            "Needed to make product and quality tradeoffs concrete.",
        ),
        (
            "deployment_targets",
            "Where does this system run or get deployed?",
            "Needed for architecture and reliability guidance.",
        ),
        (
            "product_domain",
            "Which product domain best describes this repository?",
            "Needed for quality scoring and policy language.",
        ),
        (
            "reliability_targets",
            "Which uptime, recovery, or runtime validation expectations matter most?",
            "Needed for reliability docs and validation loops.",
        ),
        (
            "security_constraints",
            "Which security, compliance, auth, or sensitive-data constraints matter here?",
            "Needed for security review guidance.",
        ),
        (
            "frontend_stack_notes",
            "If there is a frontend, what experience bar, platforms, or UX constraints should the docs enforce?",
            "Needed for design and frontend policies.",
        ),
        (
            "design_style_direction",
            "If there is a frontend, what visual style should the project follow? Describe the concrete reference, mood, density, color/typography preferences, and hard don'ts.",
            "Needed to generate the project-owned DESIGN.md without external design-generation skills.",
        ),
        (
            "quality_focus",
            "Which product areas or architectural layers deserve the strictest quality scoring?",
            "Needed for QUALITY_SCORE.md.",
        ),
    )
]
912
+
913
# Ordered (key, label) pairs, one per quality dimension. Written as a mapping
# and flattened with .items(), which yields the pairs as tuples in insertion
# order.
QUALITY_DIMENSIONS = list(
    {
        "product_correctness": "Product correctness",
        "ux_operator_clarity": "UX and operator clarity",
        "architecture_maintainability": "Architecture and maintainability",
        "reliability_observability": "Reliability and observability",
        "security_data_handling": "Security and data handling",
    }.items()
)
# Dimension key -> short argument name for that dimension's acceptance criteria.
# NOTE(review): presumably these are CLI option names for `acceptance-set` —
# confirm against the argument parser.
ACCEPTANCE_ARGS = dict(
    product_correctness="product",
    ux_operator_clarity="ux",
    architecture_maintainability="architecture",
    reliability_observability="reliability",
    security_data_handling="security",
)
# Dimension key -> note argument name; each one is the acceptance argument name
# with a "-note" suffix (e.g. "product" -> "product-note").
QUALITY_NOTE_ARGS = {
    dimension_key: short_name + "-note"
    for dimension_key, short_name in ACCEPTANCE_ARGS.items()
}
934
# Lowercase boilerplate phrases. The last entry is a bare fragment of the first.
# NOTE(review): presumably used to flag acceptance criteria that are too generic
# to be useful — confirm against the code that reads this list.
GENERIC_ACCEPTANCE_PHRASES = [
    "confirm the requested behavior is complete",
    "confirm the user or operator experience is understandable",
    "confirm the implementation is clean and easy to change",
    "confirm the validation loop and failure handling are sufficient",
    "confirm secrets and sensitive data are handled safely",
    "requested behavior is complete",
]

# Short lowercase fragments grouped by theme; the order matches the original.
# NOTE(review): presumably matched as substrings of quality-score notes to
# decide whether a note cites evidence — confirm against the caller. Entries
# such as "dom", "log" and "path" are very short, so substring matching would
# be permissive.
EVIDENCE_HINTS = [
    # test runs and smoke checks
    "test", "pytest", "go test", "npm test", "smoke",
    # browser evidence
    "browser", "screenshot", "dom", "accessibility",
    # runtime output
    "log", "trace",
    # review and verification verbs
    "review", "inspected", "verified", "validated",
    # commands and file references
    "command", "path", "file",
    # file extensions and relative-path prefix
    ".md", ".py", ".js", ".ts", ".tsx", "./",
]

# NOTE(review): presumably the schema version written into the JSON sidecars
# mentioned in the plan docs — confirm where this constant is read.
SIDECAR_VERSION = 1
969
+
970
+
971
class PlanCloseError(RuntimeError):
    """Runtime error that carries a reason code and optional details.

    Args:
        reason: Value stored unchanged on ``self.reason``.
        message: Human-readable text passed to ``RuntimeError``; it becomes
            ``str(error)``.
        details: Optional extra context. A truthy value is stored as-is
            (same object); ``None`` or any other falsy value is replaced
            by a new empty dict.
    """

    def __init__(self, reason, message, details=None):
        super().__init__(message)
        self.reason = reason
        # Falsy inputs (None, {}, ...) fall back to a fresh dict so that
        # ``details`` is always safe to index or iterate.
        self.details = details if details else {}
976
+
977
+