npm - forgecraft-mcp - Versions diffs - 1.2.0 → 1.3.2 - Mend

forgecraft-mcp 1.2.0 → 1.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (136)

package/README.md +525 -525
package/dist/cli/help.js +44 -44
package/dist/registry/renderer-skeletons.js +92 -92
package/dist/shared/gs-score-logger.js +6 -6
package/dist/tools/add-module.js +123 -123
package/dist/tools/advice-registry.js +18 -18
package/dist/tools/check-cascade-report.js +64 -64
package/dist/tools/configure-mcp.d.ts +3 -0
package/dist/tools/configure-mcp.d.ts.map +1 -1
package/dist/tools/configure-mcp.js +10 -0
package/dist/tools/configure-mcp.js.map +1 -1
package/dist/tools/forgecraft-dispatch.d.ts.map +1 -1
package/dist/tools/forgecraft-dispatch.js +3 -0
package/dist/tools/forgecraft-dispatch.js.map +1 -1
package/dist/tools/forgecraft-schema-params.d.ts +9 -0
package/dist/tools/forgecraft-schema-params.d.ts.map +1 -1
package/dist/tools/forgecraft-schema-params.js +21 -0
package/dist/tools/forgecraft-schema-params.js.map +1 -1
package/dist/tools/forgecraft-schema.d.ts +9 -0
package/dist/tools/forgecraft-schema.d.ts.map +1 -1
package/dist/tools/refresh-output.js +14 -14
package/dist/tools/scaffold-spec-stubs.js +115 -115
package/dist/tools/scaffold-templates.js +62 -62
package/dist/tools/setup-artifact-writers.d.ts +30 -0
package/dist/tools/setup-artifact-writers.d.ts.map +1 -1
package/dist/tools/setup-artifact-writers.js +120 -8
package/dist/tools/setup-artifact-writers.js.map +1 -1
package/dist/tools/setup-phase1.d.ts +3 -0
package/dist/tools/setup-phase1.d.ts.map +1 -1
package/dist/tools/setup-phase1.js +79 -35
package/dist/tools/setup-phase1.js.map +1 -1
package/dist/tools/setup-phase2.d.ts +2 -0
package/dist/tools/setup-phase2.d.ts.map +1 -1
package/dist/tools/setup-phase2.js +10 -1
package/dist/tools/setup-phase2.js.map +1 -1
package/dist/tools/setup-project.d.ts +18 -0
package/dist/tools/setup-project.d.ts.map +1 -1
package/dist/tools/setup-project.js +77 -1
package/dist/tools/setup-project.js.map +1 -1
package/dist/tools/spec-parser-tags.d.ts +9 -0
package/dist/tools/spec-parser-tags.d.ts.map +1 -1
package/dist/tools/spec-parser-tags.js +92 -0
package/dist/tools/spec-parser-tags.js.map +1 -1
package/package.json +89 -86
package/templates/analytics/instructions.yaml +37 -37
package/templates/analytics/mcp-servers.yaml +11 -11
package/templates/analytics/structure.yaml +25 -25
package/templates/api/instructions.yaml +231 -231
package/templates/api/mcp-servers.yaml +22 -13
package/templates/api/nfr.yaml +23 -23
package/templates/api/review.yaml +103 -103
package/templates/api/structure.yaml +34 -34
package/templates/api/verification.yaml +132 -132
package/templates/cli/instructions.yaml +31 -31
package/templates/cli/mcp-servers.yaml +11 -11
package/templates/cli/review.yaml +53 -53
package/templates/cli/structure.yaml +16 -16
package/templates/data-lineage/instructions.yaml +28 -28
package/templates/data-lineage/mcp-servers.yaml +22 -22
package/templates/data-pipeline/instructions.yaml +84 -84
package/templates/data-pipeline/mcp-servers.yaml +13 -13
package/templates/data-pipeline/nfr.yaml +39 -39
package/templates/data-pipeline/structure.yaml +23 -23
package/templates/fintech/hooks.yaml +55 -55
package/templates/fintech/instructions.yaml +112 -112
package/templates/fintech/mcp-servers.yaml +13 -13
package/templates/fintech/nfr.yaml +46 -46
package/templates/fintech/playbook.yaml +210 -210
package/templates/fintech/verification.yaml +239 -239
package/templates/game/instructions.yaml +289 -289
package/templates/game/mcp-servers.yaml +38 -38
package/templates/game/nfr.yaml +64 -64
package/templates/game/playbook.yaml +214 -214
package/templates/game/review.yaml +97 -97
package/templates/game/structure.yaml +67 -67
package/templates/game/verification.yaml +174 -174
package/templates/healthcare/instructions.yaml +42 -42
package/templates/healthcare/mcp-servers.yaml +13 -13
package/templates/healthcare/nfr.yaml +47 -47
package/templates/hipaa/instructions.yaml +41 -41
package/templates/hipaa/mcp-servers.yaml +13 -13
package/templates/infra/instructions.yaml +104 -104
package/templates/infra/mcp-servers.yaml +20 -20
package/templates/infra/nfr.yaml +46 -46
package/templates/infra/review.yaml +65 -65
package/templates/infra/structure.yaml +25 -25
package/templates/library/instructions.yaml +36 -36
package/templates/library/mcp-servers.yaml +20 -20
package/templates/library/review.yaml +56 -56
package/templates/library/structure.yaml +19 -19
package/templates/medallion-architecture/instructions.yaml +41 -41
package/templates/medallion-architecture/mcp-servers.yaml +22 -22
package/templates/ml/instructions.yaml +85 -85
package/templates/ml/mcp-servers.yaml +11 -11
package/templates/ml/nfr.yaml +39 -39
package/templates/ml/structure.yaml +25 -25
package/templates/ml/verification.yaml +156 -156
package/templates/mobile/instructions.yaml +44 -44
package/templates/mobile/mcp-servers.yaml +11 -11
package/templates/mobile/nfr.yaml +49 -49
package/templates/mobile/structure.yaml +27 -27
package/templates/mobile/verification.yaml +121 -121
package/templates/observability-xray/instructions.yaml +40 -40
package/templates/observability-xray/mcp-servers.yaml +15 -15
package/templates/realtime/instructions.yaml +42 -42
package/templates/realtime/mcp-servers.yaml +13 -13
package/templates/soc2/instructions.yaml +41 -41
package/templates/soc2/mcp-servers.yaml +24 -24
package/templates/social/instructions.yaml +43 -43
package/templates/social/mcp-servers.yaml +24 -24
package/templates/state-machine/instructions.yaml +42 -42
package/templates/state-machine/mcp-servers.yaml +11 -11
package/templates/tools-registry.yaml +164 -164
package/templates/universal/hooks.yaml +531 -531
package/templates/universal/instructions.yaml +1692 -1692
package/templates/universal/mcp-servers.yaml +50 -50
package/templates/universal/nfr.yaml +197 -197
package/templates/universal/reference.yaml +326 -326
package/templates/universal/review.yaml +204 -204
package/templates/universal/skills.yaml +262 -262
package/templates/universal/structure.yaml +67 -67
package/templates/universal/verification.yaml +416 -416
package/templates/web-react/hooks.yaml +44 -44
package/templates/web-react/instructions.yaml +207 -207
package/templates/web-react/mcp-servers.yaml +20 -20
package/templates/web-react/nfr.yaml +27 -27
package/templates/web-react/review.yaml +94 -94
package/templates/web-react/structure.yaml +46 -46
package/templates/web-react/verification.yaml +126 -126
package/templates/web-static/instructions.yaml +115 -115
package/templates/web-static/mcp-servers.yaml +20 -20
package/templates/web3/instructions.yaml +44 -44
package/templates/web3/mcp-servers.yaml +11 -11
package/templates/web3/verification.yaml +159 -159
package/templates/zero-trust/instructions.yaml +41 -41
package/templates/zero-trust/mcp-servers.yaml +15 -15

package/templates/ml/mcp-servers.yaml CHANGED Viewed

@@ -1,11 +1,11 @@
-tag: ML
-section: mcp-servers
-servers:
-  - name: jupyter
-    description: "Jupyter notebook management — create, execute, and inspect cells"
-    command: npx
-    args: ["-y", "mcp-server-jupyter"]
-    tags: [ML, DATA-PIPELINE]
-    category: ai-ml
-    tier: recommended
-    url: "https://github.com/datalayer/jupyter-mcp-server"
+tag: ML
+section: mcp-servers
+servers:
+  - name: jupyter
+    description: "Jupyter notebook management — create, execute, and inspect cells"
+    command: npx
+    args: ["-y", "mcp-server-jupyter"]
+    tags: [ML, DATA-PIPELINE]
+    category: ai-ml
+    tier: recommended
+    url: "https://github.com/datalayer/jupyter-mcp-server"

package/templates/ml/nfr.yaml CHANGED Viewed

@@ -1,39 +1,39 @@
-tag: ML
-section: nfr
-blocks:
-  - id: ml-performance
-    tier: recommended
-    title: "ML Model Performance"
-    content: |
-      ## NFR: ML Model Performance
-      ### Inference Latency
-      - p50 inference latency: < {{ml_p50_ms | default: 50ms}}.
-      - p99 inference latency: < {{ml_p99_ms | default: 200ms}}.
-      - Batch inference throughput: process {{batch_throughput | default: 10000}} samples/minute.
-      ### Model Quality
-      - Primary metric: {{primary_metric | default: accuracy}} ≥ {{primary_threshold | default: 0.90}}.
-      - Model quality regression blocks deployment — automated evaluation in CI/CD.
-      - A/B testing framework for comparing model versions in production.
-      ### Data Quality
-      - Data drift detection on model inputs. Alert when distribution shifts beyond threshold.
-      - Schema validation on training and inference data. Reject malformed inputs.
-      - Feature freshness monitored. Stale features flagged.
-  - id: ml-reproducibility
-    tier: recommended
-    title: "ML Reproducibility"
-    content: |
-      ## NFR: ML Reproducibility
-      ### Experiment Tracking
-      - Every training run logged: hyperparameters, metrics, dataset version, code version.
-      - Experiment tracker (MLflow, Weights & Biases, Neptune) mandatory — not optional.
-      - Model lineage: trace any deployed model back to its training data and code.
-      ### Versioning
-      - Datasets versioned (DVC, Delta Lake, or artifact store).
-      - Models versioned with metadata: training date, dataset hash, performance metrics.
-      - Random seeds set and logged for reproducibility.
+tag: ML
+section: nfr
+blocks:
+  - id: ml-performance
+    tier: recommended
+    title: "ML Model Performance"
+    content: |
+      ## NFR: ML Model Performance
+      ### Inference Latency
+      - p50 inference latency: < {{ml_p50_ms | default: 50ms}}.
+      - p99 inference latency: < {{ml_p99_ms | default: 200ms}}.
+      - Batch inference throughput: process {{batch_throughput | default: 10000}} samples/minute.
+      ### Model Quality
+      - Primary metric: {{primary_metric | default: accuracy}} ≥ {{primary_threshold | default: 0.90}}.
+      - Model quality regression blocks deployment — automated evaluation in CI/CD.
+      - A/B testing framework for comparing model versions in production.
+      ### Data Quality
+      - Data drift detection on model inputs. Alert when distribution shifts beyond threshold.
+      - Schema validation on training and inference data. Reject malformed inputs.
+      - Feature freshness monitored. Stale features flagged.
+  - id: ml-reproducibility
+    tier: recommended
+    title: "ML Reproducibility"
+    content: |
+      ## NFR: ML Reproducibility
+      ### Experiment Tracking
+      - Every training run logged: hyperparameters, metrics, dataset version, code version.
+      - Experiment tracker (MLflow, Weights & Biases, Neptune) mandatory — not optional.
+      - Model lineage: trace any deployed model back to its training data and code.
+      ### Versioning
+      - Datasets versioned (DVC, Delta Lake, or artifact store).
+      - Models versioned with metadata: training date, dataset hash, performance metrics.
+      - Random seeds set and logged for reproducibility.

package/templates/ml/structure.yaml CHANGED Viewed

@@ -1,25 +1,25 @@
-tag: ML
-section: structure
-entries:
-  - path: notebooks/
-    description: "Exploration notebooks — never used in production code"
-  - path: src/data/
-    description: "Data loading, validation, and preprocessing pipelines"
-  - path: src/features/
-    description: "Feature engineering: transformations, feature store connectors"
-  - path: src/models/
-    description: "Model definitions, training loops, evaluation"
-  - path: src/serving/
-    description: "Model serving: API wrappers, batch inference scripts"
-  - path: src/config/
-    description: "Hyperparameters, feature configs, experiment configs (YAML)"
-  - path: tests/
-    description: "Unit tests for data transforms, feature logic, model contracts"
-  - path: tests/data_validation/
-    description: "Data quality tests: schema checks, distribution tests"
-  - path: models/
-    description: "Serialized models and metadata (gitignored, stored in artifact registry)"
-  - path: data/
-    description: "Local data cache (gitignored). Source of truth is remote storage."
-  - path: pipelines/
-    description: "Orchestration definitions: Airflow DAGs, Kubeflow pipelines, or Prefect flows"
+tag: ML
+section: structure
+entries:
+  - path: notebooks/
+    description: "Exploration notebooks — never used in production code"
+  - path: src/data/
+    description: "Data loading, validation, and preprocessing pipelines"
+  - path: src/features/
+    description: "Feature engineering: transformations, feature store connectors"
+  - path: src/models/
+    description: "Model definitions, training loops, evaluation"
+  - path: src/serving/
+    description: "Model serving: API wrappers, batch inference scripts"
+  - path: src/config/
+    description: "Hyperparameters, feature configs, experiment configs (YAML)"
+  - path: tests/
+    description: "Unit tests for data transforms, feature logic, model contracts"
+  - path: tests/data_validation/
+    description: "Data quality tests: schema checks, distribution tests"
+  - path: models/
+    description: "Serialized models and metadata (gitignored, stored in artifact registry)"
+  - path: data/
+    description: "Local data cache (gitignored). Source of truth is remote storage."
+  - path: pipelines/
+    description: "Orchestration definitions: Airflow DAGs, Kubeflow pipelines, or Prefect flows"

package/templates/ml/verification.yaml CHANGED Viewed

@@ -1,156 +1,156 @@
-tag: ML
-section: verification
-title: "Hyperparameter Tuning + Convergence Verification"
-description: >
-  ML model verification is primarily heuristic: the correct hyperparameter set
-  cannot be derived analytically and must be found by search. Verification must
-  confirm convergence (the search plateaued, not just stopped early), generalization
-  (the model works on held-out data), and that the training pipeline is deterministic
-  given a fixed seed. Without these contracts, ML outputs are unverifiable black boxes.
-uncertainty_levels:
-  - heuristic
-completeness_ceiling: 0.80
-phases:
-  - id: contract-definition
-    title: "Define Model Quality Contracts and Search Bounds"
-    rationale: >
-      ML contracts are metric bounds achieved on a fixed test set. They distinguish
-      a model that is "good enough for production" from one that merely overfit the
-      validation set. Contracts must be defined before any training run begins.
-    steps:
-      - id: define-evaluation-metrics
-        instruction: >
-          For the model's task, define:
-          - Primary metric (e.g., F1@0.5, AUROC, RMSE, mAP@0.5)
-          - Minimum acceptable threshold on the test set (e.g., F1 ≥ 0.85)
-          - Maximum acceptable inference latency (e.g., p99 ≤ 50ms on CPU)
-          - Maximum acceptable model size (e.g., ≤ 500MB on disk)
-          Store in docs/ml-contracts.md.
-        contract: "docs/ml-contracts.md with numeric threshold per metric"
-        tools: ["filesystem"]
-        expected_output: "| F1@0.5 | ≥ 0.85 | held-out test set | 1 run = 1 epoch on full test |"
-        pass_criterion: "File exists; all thresholds are numeric with direction (≥ or ≤)"
-      - id: fix-seeds-and-environment
-        instruction: >
-          Fix all randomness sources:
-          - Python: `random.seed(42); numpy.random.seed(42); torch.manual_seed(42)`
-          - TensorFlow: `tf.random.set_seed(42)`
-          - DataLoader: `worker_init_fn` + `generator=torch.Generator().manual_seed(42)`
-          Record the full environment: Python version, framework versions, CUDA version,
-          hardware (GPU model). Two runs with the same seed on the same hardware must
-          produce bit-identical model checkpoints.
-        contract: >
-          Determinism test: train for 3 epochs with seed=42, save checkpoint.
-          Re-train for 3 epochs with seed=42. Checkpoints are identical (sha256 match).
-        tools: ["torch.save", "sha256sum", "mlflow.log_artifact"]
-        expected_output: "Determinism verification: checkpoint_run1.sha256 == checkpoint_run2.sha256"
-        pass_criterion: "SHA-256 of both checkpoints match"
-  - id: hyperparameter-search
-    title: "Warm Runs + Pruned Search + Plateau Detection"
-    rationale: >
-      Naive grid search wastes GPU time. Bayesian search with pruning focuses on
-      promising regions. Warm runs avoid rescheduling known-good configurations.
-      Plateau detection terminates the search when additional trials stop improving
-      the primary metric — preventing overfitting the validation set via trial count.
-    steps:
-      - id: warm-run-from-checkpoint
-        instruction: >
-          Before launching the search, check for warm-run-checkpoint.json.
-          If present, load the best hyperparameters from the previous search as the
-          first trial of the new search. Compute baseline performance on the current
-          validation set. Record to warm-run-baseline.json.
-        contract: "warm-run-baseline.json exists with primary_metric_value before new search"
-        tools: ["optuna", "ray.tune", "wandb", "mlflow"]
-        expected_output: "warm-run-baseline.json: {hyperparams, primary_metric, dataset_split_hash}"
-        pass_criterion: "File exists; primary_metric > 0 (model not trivially broken)"
-      - id: run-pruned-bayesian-search
-        instruction: >
-          Run a Bayesian hyperparameter search:
-          - Sampler: TPE (Tree-structured Parzen Estimator)
-          - Pruner: HyperbandPruner (prune unpromising trials at 25% epoch mark)
-          - Max trials: min(200, budget); max wall-clock: configured via SEARCH_TIMEOUT_HOURS
-          - Log all trial results to MLflow or W&B experiment tracking
-          - Forbidden: learning_rate < 1e-6 or batch_size > GPU_VRAM / dtype_bytes
-        contract: >
-          Search runs to configured budget or plateau. Trial logs present.
-          No forbidden regions in top-10 trials.
-        tools: ["optuna", "optuna.integration.mlflow", "ray.tune", "wandb.sweep"]
-        expected_output: "search-results.json: {best_params, best_metric, n_trials, n_pruned, experiment_url}"
-        pass_criterion: "best_metric improves over warm-run-baseline or is within 1% (plateau)"
-      - id: detect-plateau
-        instruction: >
-          After the search, check for plateau:
-          - Compute the rolling best metric over the last 30 trials
-          - If max improvement in the last 30 < 0.1% relative, the search has plateaued
-          - Record plateau verdict in search-results.json as converged: true|false
-          If converged=false after the trial budget, flag it — the search space or prior
-          may be misconfigured. Escalate to human review.
-        contract: >
-          search-results.json contains converged field.
-          If converged=false, a human review note explains why additional trials are needed.
-        tools: ["pandas rolling max", "numpy", "optuna.study.best_trials"]
-        expected_output: '{"converged": true, "plateau_detected_at_trial": 142}'
-        pass_criterion: "converged field present; if false, requires_human_review escalation logged"
-        requires_human_review: false
-  - id: evaluation
-    title: "Test Set Evaluation Against Quality Contracts"
-    rationale: >
-      Validation set performance is used to tune hyperparameters and is therefore
-      optimistically biased. The test set is held out throughout the entire search.
-      Only after the best model is selected is the test set evaluated once.
-    steps:
-      - id: evaluate-on-test-set
-        instruction: >
-          Load the best checkpoint from the search. Run inference on the test set exactly once.
-          Do NOT use test set results to re-tune — doing so invalidates the test set.
-          Record all metrics contracted in docs/ml-contracts.md at inference time.
-        contract: >
-          All primary and secondary metrics from docs/ml-contracts.md are recorded.
-          Test evaluation runs exactly once (no multiple evaluation loops).
-          Each metric meets its contracted threshold.
-        tools: ["torch.no_grad()", "sklearn.metrics", "torchmetrics", "mlflow.log_metrics"]
-        expected_output: "test-evaluation.json: {metric, value, threshold, pass} per contracted metric"
-        pass_criterion: "All pass fields = true in test-evaluation.json"
-      - id: measure-inference-performance
-        instruction: >
-          Benchmark inference latency on the target deployment hardware (CPU if deployed on CPU).
-          Run 1,000 inference calls with the same input shape as production data.
-          Record: mean, p50, p95, p99 latency. Record model size on disk.
-        contract: >
-          p99 latency ≤ contracted max in docs/ml-contracts.md.
-          Model size ≤ contracted max.
-        tools: ["time.perf_counter", "torch.autocast", "ONNX Runtime", "locust"]
-        expected_output: "latency-report.json: {mean_ms, p50_ms, p95_ms, p99_ms, model_size_mb}"
-        pass_criterion: "p99_ms ≤ threshold; model_size_mb ≤ threshold"
-  - id: evidence
-    title: "Persist Model Artifacts and Experiment Logs"
-    rationale: >
-      ML models without reproducibility artifacts are not production-grade.
-      The exact checkpoint, the exact dataset split hash, and the exact code commit
-      that produced them must be committed or referenced in the experiment tracker.
-    steps:
-      - id: persist-experiment-artifacts
-        instruction: >
-          Log to MLflow / W&B:
-          - best_params.json (hyperparameters)
-          - test-evaluation.json (metric results)
-          - latency-report.json
-          - warm-run-checkpoint.json (updated with current best params for next run)
-          - model checkpoint (via log_artifact)
-          Tag the experiment run with the git commit SHA.
-        contract: >
-          Experiment tracker run exists with all artifacts.
-          Run is tagged with exact git commit SHA.
-          warm-run-checkpoint.json is updated for next run.
-        tools: ["mlflow.log_artifact", "wandb.log_artifact", "git rev-parse HEAD"]
-        expected_output: "MLflow run ID in experiment-run.json; all artifacts listed"
-        pass_criterion: "experiment-run.json exists; artifacts present in tracker"
+tag: ML
+section: verification
+title: "Hyperparameter Tuning + Convergence Verification"
+description: >
+  ML model verification is primarily heuristic: the correct hyperparameter set
+  cannot be derived analytically and must be found by search. Verification must
+  confirm convergence (the search plateaued, not just stopped early), generalization
+  (the model works on held-out data), and that the training pipeline is deterministic
+  given a fixed seed. Without these contracts, ML outputs are unverifiable black boxes.
+uncertainty_levels:
+  - heuristic
+completeness_ceiling: 0.80
+phases:
+  - id: contract-definition
+    title: "Define Model Quality Contracts and Search Bounds"
+    rationale: >
+      ML contracts are metric bounds achieved on a fixed test set. They distinguish
+      a model that is "good enough for production" from one that merely overfit the
+      validation set. Contracts must be defined before any training run begins.
+    steps:
+      - id: define-evaluation-metrics
+        instruction: >
+          For the model's task, define:
+          - Primary metric (e.g., F1@0.5, AUROC, RMSE, mAP@0.5)
+          - Minimum acceptable threshold on the test set (e.g., F1 ≥ 0.85)
+          - Maximum acceptable inference latency (e.g., p99 ≤ 50ms on CPU)
+          - Maximum acceptable model size (e.g., ≤ 500MB on disk)
+          Store in docs/ml-contracts.md.
+        contract: "docs/ml-contracts.md with numeric threshold per metric"
+        tools: ["filesystem"]
+        expected_output: "| F1@0.5 | ≥ 0.85 | held-out test set | 1 run = 1 epoch on full test |"
+        pass_criterion: "File exists; all thresholds are numeric with direction (≥ or ≤)"
+      - id: fix-seeds-and-environment
+        instruction: >
+          Fix all randomness sources:
+          - Python: `random.seed(42); numpy.random.seed(42); torch.manual_seed(42)`
+          - TensorFlow: `tf.random.set_seed(42)`
+          - DataLoader: `worker_init_fn` + `generator=torch.Generator().manual_seed(42)`
+          Record the full environment: Python version, framework versions, CUDA version,
+          hardware (GPU model). Two runs with the same seed on the same hardware must
+          produce bit-identical model checkpoints.
+        contract: >
+          Determinism test: train for 3 epochs with seed=42, save checkpoint.
+          Re-train for 3 epochs with seed=42. Checkpoints are identical (sha256 match).
+        tools: ["torch.save", "sha256sum", "mlflow.log_artifact"]
+        expected_output: "Determinism verification: checkpoint_run1.sha256 == checkpoint_run2.sha256"
+        pass_criterion: "SHA-256 of both checkpoints match"
+  - id: hyperparameter-search
+    title: "Warm Runs + Pruned Search + Plateau Detection"
+    rationale: >
+      Naive grid search wastes GPU time. Bayesian search with pruning focuses on
+      promising regions. Warm runs avoid rescheduling known-good configurations.
+      Plateau detection terminates the search when additional trials stop improving
+      the primary metric — preventing overfitting the validation set via trial count.
+    steps:
+      - id: warm-run-from-checkpoint
+        instruction: >
+          Before launching the search, check for warm-run-checkpoint.json.
+          If present, load the best hyperparameters from the previous search as the
+          first trial of the new search. Compute baseline performance on the current
+          validation set. Record to warm-run-baseline.json.
+        contract: "warm-run-baseline.json exists with primary_metric_value before new search"
+        tools: ["optuna", "ray.tune", "wandb", "mlflow"]
+        expected_output: "warm-run-baseline.json: {hyperparams, primary_metric, dataset_split_hash}"
+        pass_criterion: "File exists; primary_metric > 0 (model not trivially broken)"
+      - id: run-pruned-bayesian-search
+        instruction: >
+          Run a Bayesian hyperparameter search:
+          - Sampler: TPE (Tree-structured Parzen Estimator)
+          - Pruner: HyperbandPruner (prune unpromising trials at 25% epoch mark)
+          - Max trials: min(200, budget); max wall-clock: configured via SEARCH_TIMEOUT_HOURS
+          - Log all trial results to MLflow or W&B experiment tracking
+          - Forbidden: learning_rate < 1e-6 or batch_size > GPU_VRAM / dtype_bytes
+        contract: >
+          Search runs to configured budget or plateau. Trial logs present.
+          No forbidden regions in top-10 trials.
+        tools: ["optuna", "optuna.integration.mlflow", "ray.tune", "wandb.sweep"]
+        expected_output: "search-results.json: {best_params, best_metric, n_trials, n_pruned, experiment_url}"
+        pass_criterion: "best_metric improves over warm-run-baseline or is within 1% (plateau)"
+      - id: detect-plateau
+        instruction: >
+          After the search, check for plateau:
+          - Compute the rolling best metric over the last 30 trials
+          - If max improvement in the last 30 < 0.1% relative, the search has plateaued
+          - Record plateau verdict in search-results.json as converged: true|false
+          If converged=false after the trial budget, flag it — the search space or prior
+          may be misconfigured. Escalate to human review.
+        contract: >
+          search-results.json contains converged field.
+          If converged=false, a human review note explains why additional trials are needed.
+        tools: ["pandas rolling max", "numpy", "optuna.study.best_trials"]
+        expected_output: '{"converged": true, "plateau_detected_at_trial": 142}'
+        pass_criterion: "converged field present; if false, requires_human_review escalation logged"
+        requires_human_review: false
+  - id: evaluation
+    title: "Test Set Evaluation Against Quality Contracts"
+    rationale: >
+      Validation set performance is used to tune hyperparameters and is therefore
+      optimistically biased. The test set is held out throughout the entire search.
+      Only after the best model is selected is the test set evaluated once.
+    steps:
+      - id: evaluate-on-test-set
+        instruction: >
+          Load the best checkpoint from the search. Run inference on the test set exactly once.
+          Do NOT use test set results to re-tune — doing so invalidates the test set.
+          Record all metrics contracted in docs/ml-contracts.md at inference time.
+        contract: >
+          All primary and secondary metrics from docs/ml-contracts.md are recorded.
+          Test evaluation runs exactly once (no multiple evaluation loops).
+          Each metric meets its contracted threshold.
+        tools: ["torch.no_grad()", "sklearn.metrics", "torchmetrics", "mlflow.log_metrics"]
+        expected_output: "test-evaluation.json: {metric, value, threshold, pass} per contracted metric"
+        pass_criterion: "All pass fields = true in test-evaluation.json"
+      - id: measure-inference-performance
+        instruction: >
+          Benchmark inference latency on the target deployment hardware (CPU if deployed on CPU).
+          Run 1,000 inference calls with the same input shape as production data.
+          Record: mean, p50, p95, p99 latency. Record model size on disk.
+        contract: >
+          p99 latency ≤ contracted max in docs/ml-contracts.md.
+          Model size ≤ contracted max.
+        tools: ["time.perf_counter", "torch.autocast", "ONNX Runtime", "locust"]
+        expected_output: "latency-report.json: {mean_ms, p50_ms, p95_ms, p99_ms, model_size_mb}"
+        pass_criterion: "p99_ms ≤ threshold; model_size_mb ≤ threshold"
+  - id: evidence
+    title: "Persist Model Artifacts and Experiment Logs"
+    rationale: >
+      ML models without reproducibility artifacts are not production-grade.
+      The exact checkpoint, the exact dataset split hash, and the exact code commit
+      that produced them must be committed or referenced in the experiment tracker.
+    steps:
+      - id: persist-experiment-artifacts
+        instruction: >
+          Log to MLflow / W&B:
+          - best_params.json (hyperparameters)
+          - test-evaluation.json (metric results)
+          - latency-report.json
+          - warm-run-checkpoint.json (updated with current best params for next run)
+          - model checkpoint (via log_artifact)
+          Tag the experiment run with the git commit SHA.
+        contract: >
+          Experiment tracker run exists with all artifacts.
+          Run is tagged with exact git commit SHA.
+          warm-run-checkpoint.json is updated for next run.
+        tools: ["mlflow.log_artifact", "wandb.log_artifact", "git rev-parse HEAD"]
+        expected_output: "MLflow run ID in experiment-run.json; all artifacts listed"
+        pass_criterion: "experiment-run.json exists; artifacts present in tracker"

package/templates/mobile/instructions.yaml CHANGED Viewed

@@ -1,44 +1,44 @@
-tag: MOBILE
-section: instructions
-blocks:
-  - id: responsive-offline
-    tier: recommended
-    title: "Responsive Design & Offline-First"
-    content: |
-      ## Responsive Design & Offline-First Patterns
-      - Design mobile-first: start with the smallest viewport and progressively enhance for larger screens. Use CSS media queries or container queries to adapt layout, not separate mobile pages.
-      - Implement an offline-first architecture: the app should be functional without a network connection, using locally cached data, and sync when connectivity is restored.
-      - Use a local database (SQLite, IndexedDB, Realm, WatermelonDB) as the primary data source for the UI. Sync with the server in the background using a conflict resolution strategy (last-write-wins, CRDTs, or manual merge).
-      - Queue mutations (writes, updates, deletes) locally when offline. Process the queue in order when connectivity resumes, handling conflicts and failures for each operation.
-      - Implement a network status indicator in the UI. Clearly communicate to users when they are offline and which features are unavailable or operating on stale data.
-      - Cache API responses with appropriate TTLs. Use stale-while-revalidate patterns to show cached data immediately while fetching fresh data in the background.
-      - Test all user flows in airplane mode and on throttled connections (3G, high latency). Offline behavior should feel intentional, not broken.
-  - id: push-lifecycle
-    tier: recommended
-    title: "Push Notifications & App Lifecycle"
-    content: |
-      ## Push Notifications & App Lifecycle
-      - Request notification permission contextually—explain the value before prompting. Never request permission on first launch without context; wait until the user encounters a feature that benefits from notifications.
-      - Implement a server-side notification preferences model. Users should control notification categories (marketing, transactional, social) independently, and preferences must be enforced server-side.
-      - Handle notification payloads gracefully in all app states: foreground (in-app banner), background (system notification), and terminated (cold start with deep link to relevant screen).
-      - Use silent/data notifications for background data sync. Keep background execution time under OS limits (< 30 seconds on iOS) to avoid being throttled.
-      - Manage app lifecycle transitions (active → background → suspended → terminated) explicitly. Save unsaved state on `onPause`/`onBackground`, restore it on `onResume`, and release expensive resources (camera, GPS, Bluetooth) when backgrounded.
-      - Handle deep links and universal links with a centralized routing mechanism. Every screen reachable by deep link must handle being the entry point (no assumed navigation stack).
-      - Implement token refresh for push notification services (FCM, APNs). Re-register the device token on every app launch and update the server if it changes.
-  - id: platform-performance
-    tier: recommended
-    title: "Platform Patterns & Performance"
-    content: |
-      ## Platform-Specific Patterns & Mobile Performance
-      - Respect platform conventions: use platform-native navigation patterns (bottom tabs on iOS, drawer on Android), system fonts, haptic feedback, and gesture behaviors that users expect.
-      - Optimize list rendering: use virtualized/recycled lists (FlatList, RecyclerView, UICollectionView) for any list longer than ~20 items. Never render hundreds of items in a scroll view.
-      - Minimize main thread work. Move file I/O, JSON parsing, image decoding, and network calls off the main/UI thread. Jank-free scrolling requires < 16ms per frame on the UI thread.
-      - Reduce app binary size: use tree shaking, asset compression, on-demand resource loading, and avoid bundling unused native libraries. Monitor binary size in CI and set growth budgets.
-      - Implement graceful permission handling: check permission status before requesting, explain why the permission is needed, handle denial gracefully, and provide a path to re-enable via system settings.
-      - Cache images in memory and on disk with size limits. Use progressive loading (blur placeholder → thumbnail → full resolution) for a smooth visual experience.
-      - Test on real devices across OS versions, screen sizes, and memory configurations. Emulators miss real-world issues like thermal throttling, low memory kills, and vendor-specific OS behaviors.
+tag: MOBILE
+section: instructions
+blocks:
+  - id: responsive-offline
+    tier: recommended
+    title: "Responsive Design & Offline-First"
+    content: |
+      ## Responsive Design & Offline-First Patterns
+      - Design mobile-first: start with the smallest viewport and progressively enhance for larger screens. Use CSS media queries or container queries to adapt layout, not separate mobile pages.
+      - Implement an offline-first architecture: the app should be functional without a network connection, using locally cached data, and sync when connectivity is restored.
+      - Use a local database (SQLite, IndexedDB, Realm, WatermelonDB) as the primary data source for the UI. Sync with the server in the background using a conflict resolution strategy (last-write-wins, CRDTs, or manual merge).
+      - Queue mutations (writes, updates, deletes) locally when offline. Process the queue in order when connectivity resumes, handling conflicts and failures for each operation.
+      - Implement a network status indicator in the UI. Clearly communicate to users when they are offline and which features are unavailable or operating on stale data.
+      - Cache API responses with appropriate TTLs. Use stale-while-revalidate patterns to show cached data immediately while fetching fresh data in the background.
+      - Test all user flows in airplane mode and on throttled connections (3G, high latency). Offline behavior should feel intentional, not broken.
+  - id: push-lifecycle
+    tier: recommended
+    title: "Push Notifications & App Lifecycle"
+    content: |
+      ## Push Notifications & App Lifecycle
+      - Request notification permission contextually—explain the value before prompting. Never request permission on first launch without context; wait until the user encounters a feature that benefits from notifications.
+      - Implement a server-side notification preferences model. Users should control notification categories (marketing, transactional, social) independently, and preferences must be enforced server-side.
+      - Handle notification payloads gracefully in all app states: foreground (in-app banner), background (system notification), and terminated (cold start with deep link to relevant screen).
+      - Use silent/data notifications for background data sync. Keep background execution time under OS limits (< 30 seconds on iOS) to avoid being throttled.
+      - Manage app lifecycle transitions (active → background → suspended → terminated) explicitly. Save unsaved state on `onPause`/`onBackground`, restore it on `onResume`, and release expensive resources (camera, GPS, Bluetooth) when backgrounded.
+      - Handle deep links and universal links with a centralized routing mechanism. Every screen reachable by deep link must handle being the entry point (no assumed navigation stack).
+      - Implement token refresh for push notification services (FCM, APNs). Re-register the device token on every app launch and update the server if it changes.
+  - id: platform-performance
+    tier: recommended
+    title: "Platform Patterns & Performance"
+    content: |
+      ## Platform-Specific Patterns & Mobile Performance
+      - Respect platform conventions: use platform-native navigation patterns (bottom tabs on iOS, drawer on Android), system fonts, haptic feedback, and gesture behaviors that users expect.
+      - Optimize list rendering: use virtualized/recycled lists (FlatList, RecyclerView, UICollectionView) for any list longer than ~20 items. Never render hundreds of items in a scroll view.
+      - Minimize main thread work. Move file I/O, JSON parsing, image decoding, and network calls off the main/UI thread. Jank-free scrolling requires < 16ms per frame on the UI thread.
+      - Reduce app binary size: use tree shaking, asset compression, on-demand resource loading, and avoid bundling unused native libraries. Monitor binary size in CI and set growth budgets.
+      - Implement graceful permission handling: check permission status before requesting, explain why the permission is needed, handle denial gracefully, and provide a path to re-enable via system settings.
+      - Cache images in memory and on disk with size limits. Use progressive loading (blur placeholder → thumbnail → full resolution) for a smooth visual experience.
+      - Test on real devices across OS versions, screen sizes, and memory configurations. Emulators miss real-world issues like thermal throttling, low memory kills, and vendor-specific OS behaviors.

package/templates/mobile/mcp-servers.yaml CHANGED Viewed

@@ -1,11 +1,11 @@
-tag: MOBILE
-section: mcp-servers
-servers:
-  - name: android-emulator
-    description: "Android emulator control — screenshots, tap, swipe, and app lifecycle"
-    command: npx
-    args: ["-y", "mcp-server-android"]
-    tags: [MOBILE]
-    category: devtools
-    tier: recommended
-    url: "https://github.com/nicholasgriffintn/android-mcp-server"
+tag: MOBILE
+section: mcp-servers
+servers:
+  - name: android-emulator
+    description: "Android emulator control — screenshots, tap, swipe, and app lifecycle"
+    command: npx
+    args: ["-y", "mcp-server-android"]
+    tags: [MOBILE]
+    category: devtools
+    tier: recommended
+    url: "https://github.com/nicholasgriffintn/android-mcp-server"