waypoint-codex 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -24
- package/dist/src/cli.js +15 -36
- package/dist/src/core.js +16 -323
- package/dist/src/templates.js +1 -4
- package/dist/src/upgrade.js +98 -8
- package/package.json +1 -1
- package/templates/.agents/skills/backend-context-interview/SKILL.md +70 -0
- package/templates/.agents/skills/backend-ship-audit/SKILL.md +221 -0
- package/templates/.agents/skills/backend-ship-audit/agents/openai.yaml +3 -0
- package/templates/.agents/skills/backend-ship-audit/references/audit-framework.md +228 -0
- package/templates/.agents/skills/backend-ship-audit/references/report-template.md +92 -0
- package/templates/.agents/skills/frontend-context-interview/SKILL.md +60 -0
- package/templates/.agents/skills/frontend-ship-audit/SKILL.md +87 -0
- package/templates/.agents/skills/frontend-ship-audit/agents/openai.yaml +3 -0
- package/templates/.agents/skills/frontend-ship-audit/references/guidance-file-updates.md +57 -0
- package/templates/.agents/skills/frontend-ship-audit/references/report-template.md +51 -0
- package/templates/.agents/skills/frontend-ship-audit/references/review-framework.md +83 -0
- package/templates/.agents/skills/frontend-ship-audit/scripts/create_frontend_audit.py +81 -0
- package/templates/.codex/agents/plan-reviewer.toml +1 -2
- package/templates/.waypoint/README.md +2 -5
- package/templates/.waypoint/SOUL.md +1 -1
- package/templates/.waypoint/agent-operating-manual.md +12 -9
- package/templates/.waypoint/agents/code-health-reviewer.md +10 -1
- package/templates/.waypoint/agents/plan-reviewer.md +7 -2
- package/templates/.waypoint/config.toml +0 -3
- package/templates/managed-agents-block.md +38 -2
- package/templates/.waypoint/automations/README.md +0 -18
- package/templates/.waypoint/automations/docs-garden.toml +0 -7
- package/templates/.waypoint/automations/repo-health.toml +0 -8
- package/templates/.waypoint/rules/README.md +0 -6
package/dist/src/upgrade.js
CHANGED
|
@@ -9,17 +9,84 @@ export function buildInitArgs(projectRoot, config) {
|
|
|
9
9
|
if (config.profile === "app-friendly") {
|
|
10
10
|
args.push("--app-friendly");
|
|
11
11
|
}
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
12
|
+
return args;
|
|
13
|
+
}
|
|
14
|
+
// Parses a semver-style version string into comparable parts.
//
// Accepts an optional leading "v" and surrounding whitespace. A trailing
// build-metadata segment ("+…") is accepted and ignored, since semver
// precedence disregards build metadata.
//
// @param {string} version - version string such as "1.2.3" or "v1.2.3-rc.1".
// @returns {{core: number[], prerelease: string[]} | null} parsed major/minor/patch
//   numbers and prerelease identifiers, or null when the string is not a
//   valid semver version.
function parseVersion(version) {
  const trimmed = version.trim().replace(/^v/, "");
  const match = trimmed.match(/^(\d+)\.(\d+)\.(\d+)(?:-([0-9A-Za-z.-]+))?(?:\+[0-9A-Za-z.-]+)?$/);
  if (!match) {
    return null;
  }
  return {
    core: [Number(match[1]), Number(match[2]), Number(match[3])],
    prerelease: match[4] ? match[4].split(".") : [],
  };
}
|
|
25
|
+
// Compares two semver prerelease identifiers per the semver 2.0.0 spec:
// identifiers consisting only of digits compare numerically, numeric
// identifiers always rank below alphanumeric ones, and alphanumeric
// identifiers compare in ASCII sort order.
//
// @param {string} left
// @param {string} right
// @returns {number} negative when left < right, positive when left > right,
//   0 when equal.
function compareIdentifiers(left, right) {
  const leftNumeric = /^\d+$/.test(left);
  const rightNumeric = /^\d+$/.test(right);
  if (leftNumeric && rightNumeric) {
    return Number(left) - Number(right);
  }
  if (leftNumeric) {
    return -1;
  }
  if (rightNumeric) {
    return 1;
  }
  // Code-unit comparison gives the ASCII ordering semver requires;
  // localeCompare is locale-dependent and can rank mixed-case
  // identifiers differently (e.g. "Beta" vs "alpha").
  if (left === right) {
    return 0;
  }
  return left < right ? -1 : 1;
}
|
|
39
|
+
// Compares two version strings using semver precedence rules.
//
// When either side fails to parse as semver, falls back to a plain
// locale-aware string comparison so the function still totally orders
// arbitrary strings.
//
// @param {string} left
// @param {string} right
// @returns {number} negative when left < right, positive when left > right,
//   0 when the versions have equal precedence.
export function compareVersions(left, right) {
  const parsedLeft = parseVersion(left);
  const parsedRight = parseVersion(right);
  if (!parsedLeft || !parsedRight) {
    return left.localeCompare(right);
  }

  // Major, minor, patch: the first differing component decides.
  for (let i = 0; i < parsedLeft.core.length; i += 1) {
    const delta = parsedLeft.core[i] - parsedRight.core[i];
    if (delta !== 0) {
      return delta;
    }
  }

  const preLeft = parsedLeft.prerelease;
  const preRight = parsedRight.prerelease;

  // A release version outranks any prerelease of the same core version.
  if (preLeft.length === 0 && preRight.length === 0) {
    return 0;
  }
  if (preLeft.length === 0) {
    return 1;
  }
  if (preRight.length === 0) {
    return -1;
  }

  // Compare prerelease identifiers pairwise; when all shared identifiers
  // are equal, the shorter identifier list ranks lower.
  const count = Math.max(preLeft.length, preRight.length);
  for (let i = 0; i < count; i += 1) {
    const idLeft = preLeft[i];
    const idRight = preRight[i];
    if (idLeft === undefined) {
      return -1;
    }
    if (idRight === undefined) {
      return 1;
    }
    const delta = compareIdentifiers(idLeft, idRight);
    if (delta !== 0) {
      return delta;
    }
  }
  return 0;
}
|
|
79
|
+
// Queries the npm registry for the latest published waypoint-codex version.
//
// Resolves the npm binary from the explicit option, then the
// WAYPOINT_NPM_COMMAND environment variable, then the platform default.
//
// @param {{npmBinary?: string}} options
// @returns {string | null} the latest version string, or null when the
//   lookup fails or produces no output.
function latestWaypointVersion(options) {
  const npmBinary =
    options.npmBinary ?? process.env.WAYPOINT_NPM_COMMAND ?? npmBinaryForPlatform();
  const lookup = spawnSync(npmBinary, ["view", "waypoint-codex", "version"], {
    stdio: "pipe",
    encoding: "utf8",
  });
  // A missing exit status (e.g. the spawn itself failed) counts as failure.
  const exitCode = lookup.status ?? 1;
  if (exitCode !== 0) {
    return null;
  }
  const reported = lookup.stdout?.trim();
  return reported ? reported : null;
}
|
|
24
91
|
function hasWaypointConfig(projectRoot) {
|
|
25
92
|
return existsSync(path.join(projectRoot, ".waypoint/config.toml"));
|
|
@@ -53,3 +120,26 @@ export function upgradeWaypoint(options) {
|
|
|
53
120
|
});
|
|
54
121
|
return doctor.status ?? 1;
|
|
55
122
|
}
|
|
123
|
+
// Upgrades the globally installed waypoint-codex CLI when a newer version
// is published, then re-runs `init` through the re-executed CLI entry.
//
// @param {object} options
// @param {string} options.currentVersion - version of the running CLI.
// @param {string} options.cliEntry - path of the CLI entry script to re-exec.
// @param {string[]} options.initArgs - arguments forwarded to `init`.
// @param {string} [options.nodeBinary] - node executable; defaults to process.execPath.
// @param {string} [options.npmBinary] - npm executable override.
// @param {string} [options.stdio] - stdio mode for spawned processes; defaults to "inherit".
// @returns {number | null} null when no upgrade is needed (or the latest
//   version could not be determined); otherwise the exit status of the
//   failed upgrade or of the re-executed init.
export function maybeUpgradeWaypointBeforeInit(options) {
  const nodeBinary = options.nodeBinary ?? process.execPath;
  const npmBinary =
    options.npmBinary ?? process.env.WAYPOINT_NPM_COMMAND ?? npmBinaryForPlatform();
  const stdio = options.stdio ?? "inherit";

  const latestVersion = latestWaypointVersion({ npmBinary });
  const alreadyCurrent =
    !latestVersion || compareVersions(latestVersion, options.currentVersion) <= 0;
  if (alreadyCurrent) {
    return null;
  }

  console.log(`Waypoint CLI ${options.currentVersion} is older than latest ${latestVersion}. Updating before init...`);
  const update = spawnSync(npmBinary, ["install", "-g", "waypoint-codex@latest"], {
    stdio,
  });
  const updateStatus = update.status ?? 1;
  if (updateStatus !== 0) {
    return updateStatus;
  }

  // Re-exec init with --skip-cli-update appended (unless already present)
  // so the freshly installed CLI does not attempt another upgrade.
  const reexecArgs = options.initArgs.includes("--skip-cli-update")
    ? options.initArgs
    : [...options.initArgs, "--skip-cli-update"];
  const init = spawnSync(nodeBinary, [options.cliEntry, "init", ...reexecArgs], {
    stdio,
  });
  return init.status ?? 1;
}
|
package/templates/.agents/skills/backend-context-interview/SKILL.md
CHANGED
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: backend-context-interview
|
|
3
|
+
description: Gather and persist durable backend project context when missing or insufficient for implementation, architecture decisions, or ship-readiness review. Use when backend choices depend on scale, criticality, compliance, tenant model, compatibility, reliability, security posture, or similar context that is not clearly documented.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Backend Context Interview
|
|
7
|
+
|
|
8
|
+
Use this skill when relevant backend context is missing, stale, contradictory, or too weak to support correct implementation or review decisions.
|
|
9
|
+
|
|
10
|
+
## Goals
|
|
11
|
+
|
|
12
|
+
1. identify the missing backend context that materially affects the work
|
|
13
|
+
2. ask only high-leverage questions that cannot be answered from the repo or guidance files
|
|
14
|
+
3. persist durable context into the project root guidance file
|
|
15
|
+
4. avoid repeated questioning in future tasks
|
|
16
|
+
|
|
17
|
+
## When to use
|
|
18
|
+
|
|
19
|
+
Use this skill when the current task depends on context such as:
|
|
20
|
+
- internal tool vs public internet-facing product
|
|
21
|
+
- expected scale, concurrency, and criticality
|
|
22
|
+
- regulatory, privacy, or compliance requirements
|
|
23
|
+
- multi-tenant vs single-tenant behavior
|
|
24
|
+
- backward compatibility requirements
|
|
25
|
+
- uptime and reliability expectations
|
|
26
|
+
- migration and rollback risk tolerance
|
|
27
|
+
- security posture expectations
|
|
28
|
+
- observability or incident response expectations
|
|
29
|
+
- infrastructure constraints that materially affect design
|
|
30
|
+
|
|
31
|
+
Do not use this skill when the answer is already clearly present in `AGENTS.md`, architecture docs, runbooks, or the task itself.
|
|
32
|
+
|
|
33
|
+
## Workflow
|
|
34
|
+
|
|
35
|
+
### 1. Check persisted context first
|
|
36
|
+
|
|
37
|
+
Inspect the project root guidance files.
|
|
38
|
+
|
|
39
|
+
Priority:
|
|
40
|
+
1. `AGENTS.md`
|
|
41
|
+
|
|
42
|
+
Look for:
|
|
43
|
+
- `## Project Context`
|
|
44
|
+
- `## Backend Context`
|
|
45
|
+
- equivalent sections with the same intent
|
|
46
|
+
|
|
47
|
+
If the existing section is accurate and sufficient, do not interview the user.
|
|
48
|
+
|
|
49
|
+
### 2. Determine what is actually missing
|
|
50
|
+
|
|
51
|
+
Only ask questions that materially affect implementation or review choices.
|
|
52
|
+
|
|
53
|
+
Good triggers:
|
|
54
|
+
- public service vs internal tool changes reliability and security bar
|
|
55
|
+
- scale and concurrency change architecture depth and observability expectations
|
|
56
|
+
- compatibility requirements change migration and API decisions
|
|
57
|
+
- tenant model changes authorization and data-isolation design
|
|
58
|
+
|
|
59
|
+
Do not ask broad or low-value questions.
|
|
60
|
+
|
|
61
|
+
### 3. Ask concise grouped questions
|
|
62
|
+
|
|
63
|
+
Ask the minimum set of questions needed.
|
|
64
|
+
|
|
65
|
+
Suggested categories:
|
|
66
|
+
- product type and exposure
|
|
67
|
+
- scale and criticality
|
|
68
|
+
- data sensitivity and compliance
|
|
69
|
+
|
|
70
|
+
Do not ask generic product questions that do not affect backend engineering.
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: backend-ship-audit
|
|
3
|
+
description: Audit a backend scope for practical ship readiness with evidence-based findings focused on real release risk rather than style. Use when reviewing a backend service, feature, endpoint group, worker, scheduler, API surface, pull request, or directory to decide whether it is ready to ship; when the backend scope must be resolved from repository structure; when complete-file reading is required to understand behavior and dependencies; when only high-leverage deployment-context questions should be asked after repository exploration; when durable backend context should be persisted in project-root AGENTS.md; or when a timestamped audit should be written under .waypoint/audit/.
|
|
4
|
+
---
|
|
5
|
+
|
|
6
|
+
# Backend ship audit
|
|
7
|
+
|
|
8
|
+
Follow this workflow in order. Optimize for practical release readiness. Ignore style-only issues.
|
|
9
|
+
|
|
10
|
+
Use bundled resources as follows:
|
|
11
|
+
- Use `references/audit-framework.md` for detailed evaluation prompts and severity calibration.
|
|
12
|
+
- Use `references/report-template.md` for the audit structure and finding format.
|
|
13
|
+
|
|
14
|
+
## 1. Resolve the reviewable unit
|
|
15
|
+
|
|
16
|
+
Turn the user request into the narrowest defensible backend unit that can be audited end to end.
|
|
17
|
+
|
|
18
|
+
Determine and state:
|
|
19
|
+
- requested scope
|
|
20
|
+
- assumed scope used for the audit
|
|
21
|
+
- code and docs directly in scope
|
|
22
|
+
- adjacent dependencies and boundaries that materially affect behavior
|
|
23
|
+
- explicit out-of-scope areas
|
|
24
|
+
|
|
25
|
+
Use repository structure to narrow broad requests. Prefer a reviewable unit such as a single service, endpoint group, worker flow, API surface, migration set, or pull request boundary over a vague "entire backend" scope.
|
|
26
|
+
|
|
27
|
+
If the request is ambiguous, make the best defensible narrowing decision from the repository and state it in the audit. Do not block on clarification unless the ambiguity itself is the primary risk.
|
|
28
|
+
|
|
29
|
+
## 2. Build complete understanding from the repository
|
|
30
|
+
|
|
31
|
+
Explore the repository deeply enough to understand the scoped backend and the dependencies that materially affect ship readiness.
|
|
32
|
+
|
|
33
|
+
Start with a map, then move to complete reads:
|
|
34
|
+
1. Read project guidance first. Prefer root `AGENTS.md`, plus any scoped guidance files if present.
|
|
35
|
+
2. Read architecture docs, ADRs, onboarding docs, runbooks, incident notes, and API or contract docs that touch the scope.
|
|
36
|
+
3. Map entry points and main flows with fast search tools.
|
|
37
|
+
4. Read the full contents of every file that matters to architecture understanding, behavior, or findings.
|
|
38
|
+
5. Trace the end-to-end flow across handlers, validation, services, repositories, persistence, jobs, queues, external clients, configuration, and tests.
|
|
39
|
+
|
|
40
|
+
Read complete files for relevant materials. Do not rely on grep hits, matched snippets, partial chunks, or truncated previews for any file that informs architectural understanding or a finding.
|
|
41
|
+
|
|
42
|
+
Read complete files for all relevant artifacts you encounter, including when present:
|
|
43
|
+
- backend source files
|
|
44
|
+
- handlers, controllers, routes, services, jobs, workers, schedulers
|
|
45
|
+
- schemas, models, serializers, DTOs, validators
|
|
46
|
+
- repositories, query layers, persistence code
|
|
47
|
+
- migrations and migration helpers
|
|
48
|
+
- queue and background processing code
|
|
49
|
+
- configuration files and environment examples
|
|
50
|
+
- tests and test helpers
|
|
51
|
+
- API specs and contract definitions
|
|
52
|
+
- runbooks and operational docs
|
|
53
|
+
- incident notes or postmortems
|
|
54
|
+
|
|
55
|
+
Skip obviously irrelevant material such as vendored dependencies, generated artifacts, build outputs, caches, or unrelated subsystems.
|
|
56
|
+
|
|
57
|
+
Keep a working inventory of fully read files. Include that inventory in the audit under a "What was read" section.
|
|
58
|
+
|
|
59
|
+
When code and docs disagree, trust code for current behavior unless the docs clearly describe runtime behavior not visible in the repository. Call out the discrepancy if it matters to shipping risk.
|
|
60
|
+
|
|
61
|
+
## 3. Build an internal system model
|
|
62
|
+
|
|
63
|
+
Before asking questions or writing findings, form a concrete model of:
|
|
64
|
+
- request and event entry points
|
|
65
|
+
- trust boundaries
|
|
66
|
+
- data flow and persistence path
|
|
67
|
+
- transaction and consistency boundaries
|
|
68
|
+
- async boundaries and retry paths
|
|
69
|
+
- authn and authz assumptions
|
|
70
|
+
- tenant scoping model when relevant
|
|
71
|
+
- configuration, feature flags, and runtime dependencies
|
|
72
|
+
- observability and operational control points
|
|
73
|
+
|
|
74
|
+
Use that model to identify what can fail, what can race, what can duplicate, what can corrupt data, and what can become hard to detect or recover.
|
|
75
|
+
|
|
76
|
+
## 4. Ask only missing high-leverage questions
|
|
77
|
+
|
|
78
|
+
Ask questions only after repository exploration and only when the answer materially changes the ship-readiness bar.
|
|
79
|
+
|
|
80
|
+
Group questions by topic. Keep them concise. Usually ask no more than 3 to 8 questions total unless the backend context is unusually under-specified.
|
|
81
|
+
|
|
82
|
+
Target questions at facts that cannot be established from the codebase or docs, such as:
|
|
83
|
+
- deployment exposure and user type
|
|
84
|
+
- expected scale, burstiness, and concurrency
|
|
85
|
+
- data sensitivity or compliance constraints
|
|
86
|
+
- single-tenant or multi-tenant expectations
|
|
87
|
+
- uptime, recovery, and rollback tolerance
|
|
88
|
+
- backward compatibility and API stability requirements
|
|
89
|
+
- observability and incident response expectations
|
|
90
|
+
- security posture or access-control expectations
|
|
91
|
+
|
|
92
|
+
Do not ask generic architecture-preference questions. Do not ask for information already visible in the repository.
|
|
93
|
+
|
|
94
|
+
If answers are unavailable, proceed with explicit assumptions. Place assumptions in the audit and calibrate severity to the confidence of the evidence.
|
|
95
|
+
|
|
96
|
+
## 5. Persist durable backend context
|
|
97
|
+
|
|
98
|
+
Persist durable deployment context into the project root guidance file by editing it manually.
|
|
99
|
+
|
|
100
|
+
Choose the target file in this order:
|
|
101
|
+
1. `AGENTS.md` in the project root
|
|
102
|
+
2. no edit if it does not exist, unless the user explicitly asked to create one
|
|
103
|
+
|
|
104
|
+
Prefer updating an existing `## Backend Context` section. Add the section only if it is missing. Preserve surrounding content exactly. Do not overwrite unrelated sections. Do not duplicate existing context. Do not make any edit if the existing section is already accurate and sufficiently complete.
|
|
105
|
+
|
|
106
|
+
Persist only stable context such as:
|
|
107
|
+
- internal vs internet-facing exposure
|
|
108
|
+
- expected scale and criticality
|
|
109
|
+
- data sensitivity or compliance constraints
|
|
110
|
+
- tenant model
|
|
111
|
+
- compatibility requirements
|
|
112
|
+
- reliability expectations
|
|
113
|
+
- security posture assumptions
|
|
114
|
+
- rollback tolerance
|
|
115
|
+
|
|
116
|
+
Do not persist transient audit findings, temporary mitigations, or release-specific defects.
|
|
117
|
+
|
|
118
|
+
Make this edit manually. Prefer the smallest precise change that preserves all unrelated guidance exactly.
|
|
119
|
+
|
|
120
|
+
## 6. Evaluate ship readiness with a backend-risk lens
|
|
121
|
+
|
|
122
|
+
Assess the scoped backend like a strong backend reviewer. Focus on real ship risk, not code taste.
|
|
123
|
+
|
|
124
|
+
Evaluate at least these categories when relevant:
|
|
125
|
+
- scope and architecture fit
|
|
126
|
+
- API and contract quality
|
|
127
|
+
- input validation and trust boundaries
|
|
128
|
+
- domain modeling correctness
|
|
129
|
+
- data integrity and consistency
|
|
130
|
+
- transaction boundaries and idempotency
|
|
131
|
+
- migration safety and rollback safety
|
|
132
|
+
- failure handling and retry semantics
|
|
133
|
+
- timeouts, cancellation, and backpressure
|
|
134
|
+
- concurrency and race risks
|
|
135
|
+
- queue and background job correctness
|
|
136
|
+
- authorization and access control
|
|
137
|
+
- authentication assumptions
|
|
138
|
+
- secret handling and configuration safety
|
|
139
|
+
- tenant isolation
|
|
140
|
+
- security vulnerabilities and unsafe defaults
|
|
141
|
+
- boundary clarity between layers and services
|
|
142
|
+
- reliability under expected production conditions
|
|
143
|
+
- observability, alertability, and debuggability
|
|
144
|
+
- test coverage for meaningful failure modes
|
|
145
|
+
- future legibility and maintainability as it affects shipping risk
|
|
146
|
+
|
|
147
|
+
Use judgment. Do not force findings in every category.
|
|
148
|
+
|
|
149
|
+
Treat missing evidence carefully:
|
|
150
|
+
- Missing tests, docs, or operational controls can be findings if the absence creates real release risk.
|
|
151
|
+
- Unknown deployment context can lower confidence or elevate a question into a release condition.
|
|
152
|
+
- Lack of proof is not the same as proof of failure. State uncertainty explicitly.
|
|
153
|
+
|
|
154
|
+
## 7. Calibrate priority consistently
|
|
155
|
+
|
|
156
|
+
Use this model exactly:
|
|
157
|
+
- `P0`: clear ship blocker; likely severe outage, data loss, data corruption, critical security issue, or fundamentally unsafe release
|
|
158
|
+
- `P1`: serious issue that should usually be fixed before shipping; substantial reliability, security, integrity, or operational risk
|
|
159
|
+
- `P2`: important issue that may be acceptable only with conscious risk acceptance; not an immediate blocker in all contexts
|
|
160
|
+
- `P3`: moderate weakness or gap; address soon but not necessarily before launch
|
|
161
|
+
- `P4`: minor improvement with limited near-term impact
|
|
162
|
+
|
|
163
|
+
Do not inflate severity. Severity depends on deployment context, blast radius, reversibility, exploitability, and confidence.
|
|
164
|
+
|
|
165
|
+
## 8. Write the audit document
|
|
166
|
+
|
|
167
|
+
Write the final audit to:
|
|
168
|
+
|
|
169
|
+
`.waypoint/audit/dd-mm-yyyy-hh-mm-backend-audit.md`
|
|
170
|
+
|
|
171
|
+
Create directories if needed.
|
|
172
|
+
|
|
173
|
+
Use `references/report-template.md` as the base structure. Include at minimum:
|
|
174
|
+
- title and timestamp
|
|
175
|
+
- requested scope and assumed scope
|
|
176
|
+
- in-scope, adjacent dependencies, and out-of-scope areas
|
|
177
|
+
- what was read
|
|
178
|
+
- open questions and explicit assumptions
|
|
179
|
+
- concise system understanding
|
|
180
|
+
- ship recommendation
|
|
181
|
+
- prioritized findings
|
|
182
|
+
- release conditions or follow-up actions
|
|
183
|
+
|
|
184
|
+
Prefer path and line-based evidence when practical. If line numbers are not practical, cite the full file path plus the specific symbol, function, class, migration, endpoint, or test that supports the finding.
|
|
185
|
+
|
|
186
|
+
## 9. Format findings for actionability
|
|
187
|
+
|
|
188
|
+
Give every finding:
|
|
189
|
+
- ID
|
|
190
|
+
- title
|
|
191
|
+
- priority from `P0` to `P4`
|
|
192
|
+
- why it matters
|
|
193
|
+
- evidence
|
|
194
|
+
- affected area
|
|
195
|
+
- risk if shipped as-is
|
|
196
|
+
- recommended fix
|
|
197
|
+
- confidence level if evidence is incomplete
|
|
198
|
+
|
|
199
|
+
Use concise, concrete language. Tie every finding to repository evidence or an explicit unanswered question that materially affects readiness.
|
|
200
|
+
|
|
201
|
+
## 10. Make the ship decision explicit
|
|
202
|
+
|
|
203
|
+
End with a clear recommendation:
|
|
204
|
+
- `Ready to ship`
|
|
205
|
+
- `Ready to ship with explicit risk acceptance`
|
|
206
|
+
- `Not ready to ship`
|
|
207
|
+
|
|
208
|
+
Use `Not ready to ship` when there is a `P0`, an unresolved `P1`, or a material unknown that could plausibly conceal a `P0` or `P1` under the stated deployment context.
|
|
209
|
+
|
|
210
|
+
If the scope is acceptable only with conditions, list the exact conditions.
|
|
211
|
+
|
|
212
|
+
## 11. Avoid low-value review behavior
|
|
213
|
+
|
|
214
|
+
Do not include:
|
|
215
|
+
- style policing
|
|
216
|
+
- subjective nitpicks
|
|
217
|
+
- trivial refactor suggestions without ship impact
|
|
218
|
+
- generic best-practice commentary unsupported by repository evidence
|
|
219
|
+
- vague advice such as "add more tests" without naming the missing failure mode or blind spot
|
|
220
|
+
|
|
221
|
+
Prefer a short audit with strong evidence over a long audit with weak claims.
|
|
@@ -0,0 +1,3 @@
|
|
|
1
|
+
display_name: Backend Ship Audit
|
|
2
|
+
short_description: Audit a backend scope for real ship risk and write an evidence-based readiness report.
|
|
3
|
+
default_prompt: Audit this backend scope for ship readiness. Resolve scope from the repository, read relevant backend code and docs completely, ask only missing high-leverage questions, persist durable backend context, and write a prioritized audit under .waypoint/audit/.
|
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
# Backend audit framework
|
|
2
|
+
|
|
3
|
+
Use this reference to turn repository understanding into a practical release-risk review.
|
|
4
|
+
|
|
5
|
+
## Table of contents
|
|
6
|
+
|
|
7
|
+
1. [Scope and architecture fit](#1-scope-and-architecture-fit)
|
|
8
|
+
2. [API and contract quality](#2-api-and-contract-quality)
|
|
9
|
+
3. [Domain modeling and correctness](#3-domain-modeling-and-correctness)
|
|
10
|
+
4. [Data integrity and consistency](#4-data-integrity-and-consistency)
|
|
11
|
+
5. [Migration safety and rollback safety](#5-migration-safety-and-rollback-safety)
|
|
12
|
+
6. [Async, queue, and worker correctness](#6-async-queue-and-worker-correctness)
|
|
13
|
+
7. [Failure handling, timeouts, and backpressure](#7-failure-handling-timeouts-and-backpressure)
|
|
14
|
+
8. [Security, auth, and tenant isolation](#8-security-auth-and-tenant-isolation)
|
|
15
|
+
9. [Reliability and operational readiness](#9-reliability-and-operational-readiness)
|
|
16
|
+
10. [Test evidence](#10-test-evidence)
|
|
17
|
+
11. [Severity calibration](#11-severity-calibration)
|
|
18
|
+
12. [Handling unknowns](#12-handling-unknowns)
|
|
19
|
+
|
|
20
|
+
## 1. Scope and architecture fit
|
|
21
|
+
|
|
22
|
+
Check whether the scoped backend unit has a clear responsibility and a defensible boundary.
|
|
23
|
+
|
|
24
|
+
Look for:
|
|
25
|
+
- hidden coupling to unrelated subsystems that makes the requested scope impossible to reason about safely
|
|
26
|
+
- ownership split across layers or services without clear contracts
|
|
27
|
+
- feature behavior controlled by undocumented flags or runtime configuration
|
|
28
|
+
- operational behavior that only exists in deployment config but is absent from docs and tests
|
|
29
|
+
|
|
30
|
+
Evidence to seek:
|
|
31
|
+
- route or handler entry points
|
|
32
|
+
- service and repository boundaries
|
|
33
|
+
- config loading paths
|
|
34
|
+
- feature flag checks
|
|
35
|
+
- ADRs or architecture docs
|
|
36
|
+
|
|
37
|
+
## 2. API and contract quality
|
|
38
|
+
|
|
39
|
+
Check whether external or internal contracts are safe to release under the expected compatibility bar.
|
|
40
|
+
|
|
41
|
+
Look for:
|
|
42
|
+
- request validation gaps
|
|
43
|
+
- response-shape drift from spec or tests
|
|
44
|
+
- undocumented behavior changes
|
|
45
|
+
- missing idempotency on retryable writes
|
|
46
|
+
- unsafe pagination or filtering semantics
|
|
47
|
+
- partial-success behavior without clear client contract
|
|
48
|
+
- incompatible changes to event payloads, queues, or partner-facing APIs
|
|
49
|
+
|
|
50
|
+
Evidence to seek:
|
|
51
|
+
- handlers, validators, serializers, DTOs, OpenAPI or protobuf specs
|
|
52
|
+
- consumer tests or contract tests
|
|
53
|
+
- backward-compatibility notes in docs or migrations
|
|
54
|
+
|
|
55
|
+
## 3. Domain modeling and correctness
|
|
56
|
+
|
|
57
|
+
Check whether the code actually enforces business invariants.
|
|
58
|
+
|
|
59
|
+
Look for:
|
|
60
|
+
- invariants enforced only in UI or caller code
|
|
61
|
+
- state transitions with missing guards
|
|
62
|
+
- invalid states representable in persistence
|
|
63
|
+
- derived fields that can drift from source-of-truth data
|
|
64
|
+
- duplicate logic that can diverge under edge cases
|
|
65
|
+
|
|
66
|
+
Evidence to seek:
|
|
67
|
+
- model logic, service logic, validators, persistence constraints, tests
|
|
68
|
+
|
|
69
|
+
## 4. Data integrity and consistency
|
|
70
|
+
|
|
71
|
+
Check how writes behave under failure, retries, and concurrent activity.
|
|
72
|
+
|
|
73
|
+
Look for:
|
|
74
|
+
- missing transactions around multi-step writes
|
|
75
|
+
- transactions that include external calls and can hang or partially commit
|
|
76
|
+
- non-atomic read-modify-write flows
|
|
77
|
+
- retry paths that can duplicate side effects
|
|
78
|
+
- absence of uniqueness or foreign-key enforcement where the invariant depends on it
|
|
79
|
+
- eventual-consistency behavior with no compensating logic or visibility
|
|
80
|
+
|
|
81
|
+
Evidence to seek:
|
|
82
|
+
- transaction boundaries
|
|
83
|
+
- repositories and queries
|
|
84
|
+
- migration constraints and indexes
|
|
85
|
+
- retry wrappers and dedup logic
|
|
86
|
+
- job enqueue order relative to commits
|
|
87
|
+
|
|
88
|
+
## 5. Migration safety and rollback safety
|
|
89
|
+
|
|
90
|
+
Check whether schema and data changes are safe under real deploy conditions.
|
|
91
|
+
|
|
92
|
+
Look for:
|
|
93
|
+
- destructive or irreversible migrations without a rollback story
|
|
94
|
+
- schema changes that break old binaries during rolling deploys
|
|
95
|
+
- backfills mixed into request paths or startup
|
|
96
|
+
- long-running or locking operations on hot tables
|
|
97
|
+
- reliance on application code to keep dual-write or dual-read windows safe without tests
|
|
98
|
+
|
|
99
|
+
Evidence to seek:
|
|
100
|
+
- migration files
|
|
101
|
+
- deploy docs
|
|
102
|
+
- backfill code
|
|
103
|
+
- startup hooks
|
|
104
|
+
- compatibility tests
|
|
105
|
+
|
|
106
|
+
## 6. Async, queue, and worker correctness
|
|
107
|
+
|
|
108
|
+
Check whether background processing is safe under duplicate delivery, delay, and partial failure.
|
|
109
|
+
|
|
110
|
+
Look for:
|
|
111
|
+
- no idempotency for retryable jobs
|
|
112
|
+
- poison-message loops
|
|
113
|
+
- missing dead-letter or terminal-failure handling
|
|
114
|
+
- enqueue-before-commit races
|
|
115
|
+
- jobs that assume fresh state but run much later
|
|
116
|
+
- scheduled tasks that can overlap or race
|
|
117
|
+
- side effects without deduplication keys
|
|
118
|
+
|
|
119
|
+
Evidence to seek:
|
|
120
|
+
- worker code
|
|
121
|
+
- queue wrappers
|
|
122
|
+
- job payload definitions
|
|
123
|
+
- retry config
|
|
124
|
+
- scheduling config
|
|
125
|
+
- tests for duplicate or delayed execution
|
|
126
|
+
|
|
127
|
+
## 7. Failure handling, timeouts, and backpressure
|
|
128
|
+
|
|
129
|
+
Check whether the system fails predictably under dependency issues and traffic spikes.
|
|
130
|
+
|
|
131
|
+
Look for:
|
|
132
|
+
- outbound calls without timeout or cancellation
|
|
133
|
+
- retries without budget, jitter, or bounding
|
|
134
|
+
- fan-out paths with unbounded concurrency
|
|
135
|
+
- silent fallbacks that hide corruption or dropped work
|
|
136
|
+
- blocking behavior in request paths that should shed load
|
|
137
|
+
- missing admission control or queue bounds where bursts are expected
|
|
138
|
+
|
|
139
|
+
Evidence to seek:
|
|
140
|
+
- HTTP or RPC client construction
|
|
141
|
+
- context propagation
|
|
142
|
+
- timeout configuration
|
|
143
|
+
- retry policies
|
|
144
|
+
- worker pool limits
|
|
145
|
+
- queue depth controls
|
|
146
|
+
|
|
147
|
+
## 8. Security, auth, and tenant isolation
|
|
148
|
+
|
|
149
|
+
Check whether trust boundaries are explicit and enforced server-side.
|
|
150
|
+
|
|
151
|
+
Look for:
|
|
152
|
+
- authz performed only in callers or UI layers
|
|
153
|
+
- object-level access checks missing on reads or writes
|
|
154
|
+
- tenant scoping inferred from untrusted input
|
|
155
|
+
- secrets in code, logs, examples, or unsafe defaults
|
|
156
|
+
- admin flows exposed through general routes without stronger controls
|
|
157
|
+
- unsafe deserialization, query injection, SSRF, path traversal, or similar flaws relevant to the stack
|
|
158
|
+
|
|
159
|
+
Evidence to seek:
|
|
160
|
+
- middleware
|
|
161
|
+
- route guards
|
|
162
|
+
- policy checks
|
|
163
|
+
- repository filters
|
|
164
|
+
- config files and environment examples
|
|
165
|
+
- tests for unauthorized access
|
|
166
|
+
|
|
167
|
+
## 9. Reliability and operational readiness
|
|
168
|
+
|
|
169
|
+
Check whether on-call engineers could detect, triage, and mitigate failures.
|
|
170
|
+
|
|
171
|
+
Look for:
|
|
172
|
+
- missing structured logs around critical writes or state transitions
|
|
173
|
+
- no metrics for queues, retries, dead letters, or error rates
|
|
174
|
+
- no tracing through core flows
|
|
175
|
+
- lack of alerting hooks for silent backlog growth or repeated failure
|
|
176
|
+
- missing runbooks for risky operations or recovery paths
|
|
177
|
+
- no feature flag, kill switch, or disable path for a new high-risk flow
|
|
178
|
+
|
|
179
|
+
Evidence to seek:
|
|
180
|
+
- logging calls
|
|
181
|
+
- metrics and tracing instrumentation
|
|
182
|
+
- alert config
|
|
183
|
+
- dashboards
|
|
184
|
+
- runbooks
|
|
185
|
+
- incident notes
|
|
186
|
+
|
|
187
|
+
## 10. Test evidence
|
|
188
|
+
|
|
189
|
+
Check whether tests cover the failure modes that matter for shipping.
|
|
190
|
+
|
|
191
|
+
Look for:
|
|
192
|
+
- tests only for happy paths
|
|
193
|
+
- no coverage for authorization failures, duplicate delivery, retries, partial failures, migrations, or concurrency-sensitive paths
|
|
194
|
+
- mocks that hide contract drift or persistence behavior
|
|
195
|
+
- no integration coverage where correctness depends on transaction, queue, or DB behavior
|
|
196
|
+
|
|
197
|
+
Evidence to seek:
|
|
198
|
+
- unit tests
|
|
199
|
+
- integration tests
|
|
200
|
+
- end-to-end tests
|
|
201
|
+
- test helpers and fixtures
|
|
202
|
+
- migration or backfill tests
|
|
203
|
+
|
|
204
|
+
## 11. Severity calibration
|
|
205
|
+
|
|
206
|
+
Calibrate severity using blast radius, exploitability, reversibility, and confidence.
|
|
207
|
+
|
|
208
|
+
Typical anchors:
|
|
209
|
+
- P0: a likely severe outage, corruption path, data-loss path, or critical security flaw with no acceptable workaround
|
|
210
|
+
- P1: a serious integrity, security, or reliability gap that should usually block release
|
|
211
|
+
- P2: a material weakness that may be acceptable with explicit risk acceptance and monitoring
|
|
212
|
+
- P3: a moderate gap with limited immediate blast radius
|
|
213
|
+
- P4: a minor improvement with little near-term release impact
|
|
214
|
+
|
|
215
|
+
Increase severity when the backend is public, high-scale, handles sensitive data, or has low rollback tolerance.
|
|
216
|
+
Decrease severity when the flow is internal, low-volume, reversible, heavily monitored, and has a strong kill switch.
|
|
217
|
+
|
|
218
|
+
## 12. Handling unknowns
|
|
219
|
+
|
|
220
|
+
Treat missing context as a first-class part of the audit.
|
|
221
|
+
|
|
222
|
+
When context is missing:
|
|
223
|
+
- ask only the questions that change severity or release conditions
|
|
224
|
+
- state assumptions explicitly
|
|
225
|
+
- lower confidence when evidence is incomplete
|
|
226
|
+
- promote the unknown into a release condition when the uncertainty itself is risky
|
|
227
|
+
|
|
228
|
+
Do not invent certainty.
|