aes_cli-0.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. aes/__init__.py +5 -0
  2. aes/__main__.py +37 -0
  3. aes/analyzer.py +487 -0
  4. aes/commands/__init__.py +0 -0
  5. aes/commands/init.py +727 -0
  6. aes/commands/inspect.py +204 -0
  7. aes/commands/install.py +379 -0
  8. aes/commands/publish.py +432 -0
  9. aes/commands/search.py +65 -0
  10. aes/commands/status.py +153 -0
  11. aes/commands/sync.py +413 -0
  12. aes/commands/validate.py +77 -0
  13. aes/config.py +43 -0
  14. aes/domains.py +1382 -0
  15. aes/frameworks.py +522 -0
  16. aes/mcp_server.py +213 -0
  17. aes/registry.py +294 -0
  18. aes/scaffold/agent.yaml.jinja +135 -0
  19. aes/scaffold/agentignore.jinja +61 -0
  20. aes/scaffold/instructions.md.jinja +311 -0
  21. aes/scaffold/local.example.yaml.jinja +35 -0
  22. aes/scaffold/local.yaml.jinja +29 -0
  23. aes/scaffold/operations.md.jinja +33 -0
  24. aes/scaffold/orchestrator.md.jinja +95 -0
  25. aes/scaffold/permissions.yaml.jinja +151 -0
  26. aes/scaffold/setup.md.jinja +244 -0
  27. aes/scaffold/skill.md.jinja +27 -0
  28. aes/scaffold/skill.yaml.jinja +175 -0
  29. aes/scaffold/workflow.yaml.jinja +44 -0
  30. aes/scaffold/workflow_command.md.jinja +48 -0
  31. aes/schemas/agent.schema.json +188 -0
  32. aes/schemas/permissions.schema.json +100 -0
  33. aes/schemas/registry.schema.json +72 -0
  34. aes/schemas/skill.schema.json +209 -0
  35. aes/schemas/workflow.schema.json +92 -0
  36. aes/targets/__init__.py +29 -0
  37. aes/targets/_base.py +77 -0
  38. aes/targets/_composer.py +338 -0
  39. aes/targets/claude.py +153 -0
  40. aes/targets/copilot.py +48 -0
  41. aes/targets/cursor.py +46 -0
  42. aes/targets/windsurf.py +46 -0
  43. aes/validator.py +394 -0
  44. aes_cli-0.2.0.dist-info/METADATA +110 -0
  45. aes_cli-0.2.0.dist-info/RECORD +48 -0
  46. aes_cli-0.2.0.dist-info/WHEEL +5 -0
  47. aes_cli-0.2.0.dist-info/entry_points.txt +3 -0
  48. aes_cli-0.2.0.dist-info/top_level.txt +1 -0
aes/domains.py ADDED
@@ -0,0 +1,1382 @@
1
+ """Domain-specific configuration for aes init.
2
+
3
+ Each supported domain (ml, web, devops) has pre-filled content drawn from
4
+ the reference examples. Templates receive a DomainConfig instance; when it
5
+ is None the templates fall back to the existing TODO scaffolding.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass, field
11
+ from typing import Dict, List, Optional
12
+
13
+
14
+ # ---------------------------------------------------------------------------
15
+ # Data classes
16
+ # ---------------------------------------------------------------------------
17
+
18
+ @dataclass
19
+ class SkillDef:
20
+ """Definition for a single skill (manifest + runbook content)."""
21
+
22
+ id: str
23
+ name: str
24
+ version: str
25
+ description: str
26
+ stage: int
27
+ phase: str
28
+ inputs_required: List[Dict[str, str]] = field(default_factory=list)
29
+ inputs_optional: List[Dict[str, str]] = field(default_factory=list)
30
+ inputs_environment: List[str] = field(default_factory=list)
31
+ outputs: List[Dict[str, str]] = field(default_factory=list)
32
+ trigger_command: str = ""
33
+ error_strategy: str = "per-item-isolation"
34
+ code_primary: str = ""
35
+ tags: List[str] = field(default_factory=list)
36
+ depends_on: List[str] = field(default_factory=list)
37
+ blocks: List[str] = field(default_factory=list)
38
+ # New fields: description quality, activation, permissions
39
+ negative_triggers: List[str] = field(default_factory=list)
40
+ activation: str = "explicit" # "auto", "explicit", or "hybrid"
41
+ allowed_tools: Optional[Dict[str, object]] = None
42
+ # Runbook content sections
43
+ runbook_purpose: str = ""
44
+ runbook_when: str = ""
45
+ runbook_how: str = ""
46
+ runbook_decision_tree: str = ""
47
+ runbook_error_handling: str = ""
48
+
49
+
50
+ @dataclass
51
+ class WorkflowStateDef:
52
+ """A single state in a workflow."""
53
+
54
+ id: str
55
+ description: str
56
+ initial: bool = False
57
+ terminal: bool = False
58
+ active: bool = False
59
+
60
+
61
+ @dataclass
62
+ class WorkflowTransitionDef:
63
+ """A transition between workflow states."""
64
+
65
+ from_state: str
66
+ to_state: str
67
+ skill: str = ""
68
+ conditions: List[str] = field(default_factory=list)
69
+ on_failure: str = ""
70
+ description: str = ""
71
+
72
+
73
+ @dataclass
74
+ class WorkflowDef:
75
+ """Complete workflow definition."""
76
+
77
+ id: str
78
+ entity: str
79
+ description: str
80
+ states: List[WorkflowStateDef] = field(default_factory=list)
81
+ transitions: List[WorkflowTransitionDef] = field(default_factory=list)
82
+
83
+
84
+ @dataclass
85
+ class CommandDef:
86
+ """Definition for a workflow-initiating command (e.g. /train, /build)."""
87
+
88
+ id: str
89
+ trigger: str # e.g. "/build", "/train", "/process"
90
+ description: str
91
+ runbook_purpose: str
92
+ runbook_phases: List[Dict[str, str]] = field(default_factory=list)
93
+ worker_specialty: str = "" # one-line specialty for worker identity in memory
94
+
95
+
96
+ @dataclass
97
+ class DomainConfig:
98
+ """All domain-specific content for aes init."""
99
+
100
+ mode: str = "dev-assist" # "dev-assist" or "agent-integrated"
101
+ workflow_commands: List[CommandDef] = field(default_factory=list)
102
+
103
+ # instructions.md content
104
+ instructions_description: str = ""
105
+ instructions_quick_ref: str = ""
106
+ instructions_project_structure: str = ""
107
+ instructions_rules: List[str] = field(default_factory=list)
108
+ instructions_workflow_phases: List[Dict[str, str]] = field(default_factory=list)
109
+ instructions_key_principle: str = ""
110
+ instructions_gotchas: List[str] = field(default_factory=list)
111
+
112
+ # Skills
113
+ skills: List[SkillDef] = field(default_factory=list)
114
+
115
+ # Orchestrator content
116
+ orchestrator_pipeline: str = ""
117
+ orchestrator_status_flow: str = ""
118
+ orchestrator_decision_tree: str = ""
119
+ orchestrator_when_to_stop: str = ""
120
+
121
+ # Workflow
122
+ workflow: Optional[WorkflowDef] = None
123
+
124
+ # Permissions additions
125
+ permissions_shell_read: List[str] = field(default_factory=list)
126
+ permissions_shell_execute: List[str] = field(default_factory=list)
127
+ permissions_file_write: List[str] = field(default_factory=list)
128
+ permissions_deny_shell: List[str] = field(default_factory=list)
129
+ permissions_confirm_shell: List[str] = field(default_factory=list)
130
+ permissions_confirm_actions: List[str] = field(default_factory=list)
131
+ permissions_resource_limits: Optional[Dict[str, object]] = None
132
+
133
+ # Environment
134
+ env_required: List[Dict[str, str]] = field(default_factory=list)
135
+ env_optional: List[Dict[str, str]] = field(default_factory=list)
136
+
137
+
138
+ # ---------------------------------------------------------------------------
139
+ # ML domain config — drawn from examples/ml-pipeline
140
+ # ---------------------------------------------------------------------------
141
+
142
+ _ML_SKILLS = [
143
+ SkillDef(
144
+ id="discover",
145
+ name="Discover Datasets",
146
+ version="1.0.0",
147
+ description="Find new public datasets from OpenML and Kaggle APIs. Use when the pipeline needs fresh data or no datasets are in discovered status. Queries multiple sources, deduplicates, and filters by quality criteria.",
148
+ stage=1,
149
+ phase="ingestion",
150
+ inputs_required=[
151
+ {"name": "db_connection", "type": "sqlite3.Connection",
152
+ "description": "Active database connection"},
153
+ ],
154
+ inputs_optional=[
155
+ {"name": "max_datasets", "type": "int", "default": "50",
156
+ "description": "Maximum datasets to discover per run"},
157
+ ],
158
+ inputs_environment=["OPENML_APIKEY", "KAGGLE_USERNAME", "KAGGLE_KEY"],
159
+ outputs=[
160
+ {"name": "new_dataset_ids", "type": "list[int]",
161
+ "description": "IDs of newly discovered datasets"},
162
+ ],
163
+ trigger_command="python scripts/run_pipeline.py --stage discover",
164
+ error_strategy="per-item-isolation",
165
+ code_primary="pipeline/discover.py",
166
+ tags=["data-ingestion", "openml", "kaggle"],
167
+ blocks=["examine"],
168
+ negative_triggers=["Do NOT use for manual CSV imports or local file ingestion"],
169
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "data/**"]}, "network": True},
170
+ runbook_purpose="Find new public datasets from OpenML and Kaggle that meet quality and licensing criteria.",
171
+ runbook_when="- No datasets in `discovered` status\n- User requests new data sources\n- Scheduled daily",
172
+ runbook_how="1. Query OpenML API for datasets matching size/license filters\n2. Query Kaggle API for datasets in target domains\n3. Deduplicate against existing records via `dataset_exists()`\n4. Insert new records via `insert_dataset()`\n5. Record attribution via `insert_attribution()`",
173
+ runbook_decision_tree="For each candidate dataset:\n |- Already exists? -> Skip\n |- License not in whitelist? -> Skip\n |- Rows < 100 or > 500,000? -> Skip\n |- Features < 3? -> Skip\n \\- Passes all checks? -> Insert as \"discovered\"",
174
+ runbook_error_handling="- **API timeout**: Retry once, then skip source\n- **Rate limit**: Sleep and retry\n- **Invalid response**: Log debug, skip dataset",
175
+ ),
176
+ SkillDef(
177
+ id="examine",
178
+ name="Examine Dataset",
179
+ version="1.0.0",
180
+ description="Download, profile, and compute quality score for a dataset. Use after discover completes and datasets are in discovered status. Detects feature types, checks hard rejections, and saves as parquet.",
181
+ stage=2,
182
+ phase="profiling",
183
+ inputs_required=[
184
+ {"name": "db_connection", "type": "sqlite3.Connection",
185
+ "description": "Active database connection"},
186
+ {"name": "dataset_id", "type": "int",
187
+ "description": "Dataset to examine"},
188
+ ],
189
+ outputs=[
190
+ {"name": "quality_score", "type": "float",
191
+ "description": "0.0-1.0 weighted quality score"},
192
+ ],
193
+ trigger_command="python scripts/run_pipeline.py --stage examine --dataset-id {ID}",
194
+ error_strategy="per-item-isolation",
195
+ code_primary="pipeline/examine.py",
196
+ tags=["data-quality", "profiling"],
197
+ depends_on=["discover"],
198
+ negative_triggers=["Do NOT use on datasets already at examined status or beyond"],
199
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "data/**"]}, "network": True},
200
+ runbook_purpose="Download a dataset, compute quality score, detect feature types, and decide if it's worth training on.",
201
+ runbook_when="- Dataset is at `discovered` status\n- After discover skill completes",
202
+ runbook_how="1. Download data from source (OpenML API or Kaggle)\n2. Compute quality score (weighted): missing 30%, dupes 15%, constants 15%, imbalance 20%, features 10%, cardinality 10%\n3. Check hard rejections: >50% missing, <3 features, <10 minority samples\n4. Detect feature types: numeric, categorical, datetime, text\n5. Save as parquet\n6. Advance to `examined`",
203
+ runbook_decision_tree="Download dataset\n |- Download fails? -> Reject: \"download_failed\"\n |- >50% missing values? -> Reject: \"too_many_missing\"\n |- <3 features? -> Reject: \"too_few_features\"\n |- <10 minority samples? -> Reject: \"insufficient_minority\"\n \\- Passes? -> Status: \"examined\"",
204
+ runbook_error_handling="- **Network error**: Log and continue\n- **Invalid data**: Skip with log",
205
+ ),
206
+ SkillDef(
207
+ id="classify",
208
+ name="Classify Problem",
209
+ version="1.0.0",
210
+ description="Detect problem type and select candidate models from the registry. Use after examine when datasets reach examined status with quality_score >= 0.30. Supports binary, multiclass, regression, time-series, anomaly, and clustering.",
211
+ stage=3,
212
+ phase="classification",
213
+ inputs_required=[
214
+ {"name": "db_connection", "type": "sqlite3.Connection",
215
+ "description": "Active database connection"},
216
+ {"name": "dataset_id", "type": "int",
217
+ "description": "Dataset to classify"},
218
+ ],
219
+ outputs=[
220
+ {"name": "problem_type", "type": "str",
221
+ "description": "Detected problem type (binary, multiclass, regression, etc.)"},
222
+ {"name": "selected_models", "type": "list[str]",
223
+ "description": "Model keys selected from the registry"},
224
+ ],
225
+ trigger_command="python scripts/run_pipeline.py --stage classify --dataset-id {ID}",
226
+ error_strategy="per-item-isolation",
227
+ code_primary="pipeline/classify.py",
228
+ tags=["classification", "model-selection"],
229
+ depends_on=["examine"],
230
+ blocks=["train"],
231
+ negative_triggers=["Do NOT use on datasets below quality threshold (quality_score < 0.30)"],
232
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**"]}, "network": False},
233
+ runbook_purpose="Detect the problem type (binary, multiclass, regression, time-series, anomaly, clustering) and select candidate models from the model registry.",
234
+ runbook_when="- Dataset is at `examined` status\n- After examine skill completes with quality_score >= 0.30",
235
+ runbook_how="1. Load dataset profile from examine stage\n2. Analyze target column: cardinality, distribution, dtype\n3. Detect problem type using heuristics\n4. Query model_registry for compatible models\n5. Filter models by dataset size and feature types\n6. Save selected models and advance to `classified`",
236
+ runbook_decision_tree="Analyze target column:\n |- Numeric + high cardinality? -> regression\n |- Categorical + 2 classes? -> binary_classification\n |- Categorical + 3+ classes? -> multiclass_classification\n |- Datetime target? -> time_series\n |- No target column? -> clustering or anomaly_detection\n \\- Ambiguous? -> Default to multiclass_classification\n\nFor each compatible model:\n |- Supports problem type? -> Include\n \\- Not compatible? -> Skip",
237
+ runbook_error_handling="- **Ambiguous target**: Default to multiclass, log warning\n- **No compatible models**: Reject dataset with reason",
238
+ ),
239
+ SkillDef(
240
+ id="train",
241
+ name="Train Models",
242
+ version="1.0.0",
243
+ description="Run Optuna HPO and train all candidate models for a dataset. Use after classify when datasets reach classified status and resource limits are met. Trains each model independently with per-item isolation.",
244
+ stage=4,
245
+ phase="training",
246
+ inputs_required=[
247
+ {"name": "db_connection", "type": "sqlite3.Connection",
248
+ "description": "Active database connection"},
249
+ {"name": "dataset_id", "type": "int",
250
+ "description": "Dataset to train"},
251
+ ],
252
+ inputs_optional=[
253
+ {"name": "model_keys", "type": "list[str]",
254
+ "description": "Specific models to train (default: all selected)"},
255
+ ],
256
+ inputs_environment=["OPTUNA_TIMEOUT", "OPTUNA_N_TRIALS"],
257
+ outputs=[
258
+ {"name": "experiment_ids", "type": "list[int]",
259
+ "description": "IDs of completed experiments"},
260
+ ],
261
+ trigger_command="python scripts/run_pipeline.py --stage train --dataset-id {ID}",
262
+ error_strategy="per-item-isolation",
263
+ code_primary="pipeline/train.py",
264
+ tags=["training", "optuna", "hpo"],
265
+ depends_on=["classify"],
266
+ blocks=["evaluate"],
267
+ negative_triggers=["Do NOT use when CPU > 70% or memory > 75%", "Do NOT use on unclassified datasets"],
268
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "models/**", "data/**"]}, "network": False},
269
+ runbook_purpose="Run Optuna hyperparameter optimization and train all candidate models for a dataset.",
270
+ runbook_when="- Dataset is at `classified` status\n- Resource limits met (CPU <70%, memory <75%)",
271
+ runbook_how="For each selected model:\n1. Preprocess data (framework-aware)\n2. Run Optuna HPO (TPESampler, MedianPruner)\n3. Train final model on best params\n4. Evaluate on held-out test set\n5. Save model in native format\n6. Log to MLflow and SQLite",
272
+ runbook_decision_tree="For each model_key in selected_models:\n |- Preprocess fails? -> Mark experiment failed, continue\n |- Optuna finds no good trial? -> Mark failed, continue\n |- Training crashes? -> Mark failed with error_message, continue\n \\- Success? -> Save model, log metrics, mark completed\n\nAfter all models:\n |- At least 1 completed? -> Status: \"trained\"\n \\- All failed? -> Status: \"rejected\"",
273
+ runbook_error_handling="- Each model trains independently (per-item-isolation)\n- One model failing doesn't affect others\n- Error messages stored in experiment.error_message",
274
+ ),
275
+ SkillDef(
276
+ id="evaluate",
277
+ name="Evaluate Models",
278
+ version="1.0.0",
279
+ description="Compare trained models, check overfitting, and apply quality gates. Use after train when datasets reach trained status with at least one completed experiment. Ranks models, detects overfitting, and validates against baselines.",
280
+ stage=5,
281
+ phase="evaluation",
282
+ inputs_required=[
283
+ {"name": "db_connection", "type": "sqlite3.Connection",
284
+ "description": "Active database connection"},
285
+ {"name": "dataset_id", "type": "int",
286
+ "description": "Dataset to evaluate"},
287
+ ],
288
+ inputs_optional=[
289
+ {"name": "quality_gates", "type": "dict",
290
+ "description": "Custom quality gate thresholds (default: from config)"},
291
+ ],
292
+ outputs=[
293
+ {"name": "best_experiment_id", "type": "int",
294
+ "description": "ID of the best performing experiment"},
295
+ {"name": "passes_quality_gates", "type": "bool",
296
+ "description": "Whether the best model meets quality criteria"},
297
+ ],
298
+ trigger_command="python scripts/run_pipeline.py --stage evaluate --dataset-id {ID}",
299
+ error_strategy="per-item-isolation",
300
+ code_primary="pipeline/evaluate.py",
301
+ tags=["evaluation", "quality-gates", "model-comparison"],
302
+ depends_on=["train"],
303
+ blocks=["package"],
304
+ negative_triggers=["Do NOT use when no experiments have completed"],
305
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**"]}, "network": False},
306
+ runbook_purpose="Compare all trained models, detect overfitting, check quality gates, and select the best experiment.",
307
+ runbook_when="- Dataset is at `trained` status\n- At least one experiment completed successfully",
308
+ runbook_how="1. Load all completed experiments for the dataset\n2. Rank by primary metric (accuracy, RMSE, etc.)\n3. Check overfitting: train-val gap > 0.15 is a warning\n4. Check quality gates: minimum metric thresholds\n5. Check baseline: best model must beat random/majority\n6. Select best experiment and advance to `evaluated`",
309
+ runbook_decision_tree="For each completed experiment:\n |- Train-val gap > 0.15? -> Flag overfitting warning\n |- Below quality gate? -> Mark as not passing\n |- Worse than baseline? -> Mark as not passing\n \\- Passes all checks? -> Candidate for best\n\nAfter ranking:\n |- At least 1 passes? -> Select best, status: ready for packaging\n \\- None pass? -> Consider reframe (back to classify)",
310
+ runbook_error_handling="- **No experiments**: Cannot evaluate, keep at trained status\n- **All overfitting**: Log warning, still select best if above quality gate",
311
+ ),
312
+ SkillDef(
313
+ id="package",
314
+ name="Package Model",
315
+ version="1.0.0",
316
+ description="Export best model in native format and create deployment zip bundle. Use after evaluate confirms passes_quality_gates=true. Supports CatBoost, XGBoost, LightGBM, and sklearn serialization formats.",
317
+ stage=6,
318
+ phase="packaging",
319
+ inputs_required=[
320
+ {"name": "db_connection", "type": "sqlite3.Connection",
321
+ "description": "Active database connection"},
322
+ {"name": "dataset_id", "type": "int",
323
+ "description": "Dataset whose best model to package"},
324
+ ],
325
+ outputs=[
326
+ {"name": "package_path", "type": "str",
327
+ "description": "Path to the created zip bundle"},
328
+ ],
329
+ trigger_command="python scripts/run_pipeline.py --stage package --dataset-id {ID}",
330
+ error_strategy="fail-fast",
331
+ code_primary="pipeline/package.py",
332
+ tags=["packaging", "serialization", "deployment"],
333
+ depends_on=["evaluate"],
334
+ blocks=["publish"],
335
+ negative_triggers=["Do NOT use when best model fails quality gates"],
336
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "models/**", "packages/**"]}, "network": False},
337
+ runbook_purpose="Export the best model in its native serialization format and bundle it into a deployment-ready zip.",
338
+ runbook_when="- Dataset has a best experiment that passes quality gates\n- After evaluate skill confirms passes_quality_gates=true",
339
+ runbook_how="1. Load best experiment and its trained model\n2. Export in native format (CatBoost .cbm, XGBoost .json, LightGBM .txt, sklearn .joblib)\n3. Generate model card with metrics and metadata\n4. Create zip bundle: model file + model card + config\n5. Verify bundle integrity\n6. Advance to `packaged`",
340
+ runbook_decision_tree="Load best experiment:\n |- Model file exists? -> Export in native format\n | |- CatBoost? -> .cbm format\n | |- XGBoost? -> .json format\n | |- LightGBM? -> .txt format\n | \\- sklearn? -> .joblib format\n |- Model file missing? -> Abort, re-train needed\n \\- Bundle created? -> Verify checksum, advance status",
341
+ runbook_error_handling="- **Model file missing**: Abort, dataset stays at trained\n- **Serialization error**: Log error, try alternative format\n- **Zip creation failure**: Retry once, then abort",
342
+ ),
343
+ SkillDef(
344
+ id="publish",
345
+ name="Publish Model",
346
+ version="1.0.0",
347
+ description="Upload model to HuggingFace Hub and register with the prediction API. Use after package when a verified zip bundle exists. Requires HF_TOKEN environment variable.",
348
+ stage=7,
349
+ phase="distribution",
350
+ inputs_required=[
351
+ {"name": "db_connection", "type": "sqlite3.Connection",
352
+ "description": "Active database connection"},
353
+ {"name": "dataset_id", "type": "int",
354
+ "description": "Dataset whose packaged model to publish"},
355
+ ],
356
+ inputs_environment=["HF_TOKEN"],
357
+ outputs=[
358
+ {"name": "published_urls", "type": "list[str]",
359
+ "description": "URLs where the model is now available"},
360
+ ],
361
+ trigger_command="python scripts/run_pipeline.py --stage publish --dataset-id {ID}",
362
+ error_strategy="per-item-isolation",
363
+ code_primary="pipeline/publish.py",
364
+ tags=["publishing", "huggingface", "api-registration"],
365
+ depends_on=["package"],
366
+ negative_triggers=["Do NOT use without HF_TOKEN configured", "Do NOT use on unpackaged models"],
367
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**"]}, "network": True},
368
+ runbook_purpose="Upload the packaged model to HuggingFace Hub and register it with the metered prediction API.",
369
+ runbook_when="- Dataset is at `packaged` status\n- Package zip exists and is verified\n- HF_TOKEN environment variable is set",
370
+ runbook_how="1. Load package zip from package stage\n2. Upload to HuggingFace Hub with model card\n3. Register model with prediction API endpoint\n4. Generate API key for metered access\n5. Verify both platforms respond correctly\n6. Advance to `published`",
371
+ runbook_decision_tree="Publish to platforms:\n |- HuggingFace upload\n | |- Success? -> Record URL\n | \\- Failure? -> Log error, continue to API\n |- API registration\n | |- Success? -> Record endpoint URL\n | \\- Failure? -> Log error\n \\- At least one succeeded? -> Status: published\n \\- Both failed? -> Keep at packaged, log errors",
372
+ runbook_error_handling="- **HF_TOKEN invalid**: Abort HF upload, try API only\n- **Network error**: Retry once per platform\n- **API registration failure**: Log and keep at packaged status",
373
+ ),
374
+ ]
375
+
376
+ _ML_WORKFLOW = WorkflowDef(
377
+ id="dataset-pipeline",
378
+ entity="dataset",
379
+ description="Dataset lifecycle from discovery through publication",
380
+ states=[
381
+ WorkflowStateDef("discovered", "Found and registered in database", initial=True),
382
+ WorkflowStateDef("examined", "Downloaded, profiled, quality-scored"),
383
+ WorkflowStateDef("classified", "Problem type detected, models selected"),
384
+ WorkflowStateDef("training", "Models being trained", active=True),
385
+ WorkflowStateDef("trained", "All models trained, awaiting evaluation"),
386
+ WorkflowStateDef("packaged", "Best model packaged as zip"),
387
+ WorkflowStateDef("published", "Live on API and HuggingFace", terminal=True),
388
+ WorkflowStateDef("rejected", "Failed quality criteria", terminal=True),
389
+ ],
390
+ transitions=[
391
+ WorkflowTransitionDef("discovered", "examined", skill="examine",
392
+ conditions=["Data file downloadable"], on_failure="rejected"),
393
+ WorkflowTransitionDef("examined", "classified", skill="classify",
394
+ conditions=["quality_score >= 0.30"], on_failure="rejected"),
395
+ WorkflowTransitionDef("classified", "training", skill="train",
396
+ conditions=["At least one model selected", "Resource limits met"]),
397
+ WorkflowTransitionDef("training", "trained", skill="train",
398
+ conditions=["At least one experiment completed"],
399
+ on_failure="rejected"),
400
+ WorkflowTransitionDef("trained", "packaged", skill="package",
401
+ conditions=["Best experiment passes quality gates",
402
+ "Best experiment beats baseline"],
403
+ on_failure="rejected"),
404
+ WorkflowTransitionDef("packaged", "published", skill="publish",
405
+ conditions=["At least one platform succeeds"]),
406
+ WorkflowTransitionDef("trained", "classified", skill="classify",
407
+ conditions=["All models below quality gates"],
408
+ description="Reframe problem type when all models fail"),
409
+ ],
410
+ )
411
+
412
+ ML_CONFIG = DomainConfig(
413
+ mode="agent-integrated",
414
+ workflow_commands=[
415
+ CommandDef(
416
+ id="build",
417
+ trigger="/build",
418
+ description="Build the ML pipeline codebase from scratch: project structure, database, pipeline stages, model registry, config, scripts, tests",
419
+ runbook_purpose="Construct the complete ML pipeline codebase. The agent creates project structure, sets up the database, implements each pipeline stage, builds the model registry, writes configuration, adds CLI scripts, and verifies with tests.",
420
+ worker_specialty="Constructing ML pipeline codebases — modules, database, stages, registry",
421
+ runbook_phases=[
422
+ {"title": "Project Structure", "content": "Create directory layout: pipeline/, trainers/, config/, serving/, scripts/, tests/. Set up pyproject.toml, __init__.py files, and virtual environment."},
423
+ {"title": "Database & Storage", "content": "Implement SQLite schema for datasets, models, and runs. Create migration scripts and helper functions (insert, update, query)."},
424
+ {"title": "Pipeline Stages", "content": "Build each stage module: discover.py, examine.py, classify.py, train.py, evaluate.py, package.py, publish.py. Each reads from DB, processes, writes results back."},
425
+ {"title": "Model Registry", "content": "Create config/model_registry.py — the brain. Define model entries (name, class, search space, metrics). Adding a model = adding a dict entry."},
426
+ {"title": "Configuration", "content": "Create config/settings.py with environment-based config. Define quality gates, thresholds, resource limits, and API endpoints."},
427
+ {"title": "Scripts & CLI", "content": "Build scripts/run_pipeline.py with --stage and --dataset-id flags. Add convenience scripts for common operations."},
428
+ {"title": "Tests & Verification", "content": "Write unit tests for each pipeline stage. Add integration test that runs discover->examine->classify on a small dataset. Verify all imports and CLI commands work."},
429
+ ],
430
+ ),
431
+ CommandDef(
432
+ id="train",
433
+ trigger="/train",
434
+ description="Run the full ML pipeline: discover, examine, classify, train, evaluate, package, publish",
435
+ runbook_purpose="Execute the complete ML pipeline end-to-end. The agent discovers datasets, profiles them, trains models via Optuna HPO, evaluates results, and publishes winners.",
436
+ worker_specialty="Executing ML training pipelines — HPO, evaluation, packaging",
437
+ runbook_phases=[
438
+ {"title": "Discover", "content": "Find new datasets from OpenML/Kaggle APIs matching quality and licensing criteria."},
439
+ {"title": "Examine", "content": "Download, profile, and compute quality scores. Reject datasets below thresholds."},
440
+ {"title": "Classify", "content": "Detect problem type and select candidate models from the registry."},
441
+ {"title": "Train", "content": "Run Optuna HPO for each candidate model. Train final models on best params."},
442
+ {"title": "Evaluate", "content": "Score models against quality gates. Compare to baselines."},
443
+ {"title": "Package", "content": "Serialize models, generate model cards, and prepare artifacts."},
444
+ {"title": "Publish", "content": "Push passing models to HuggingFace Hub with full metadata."},
445
+ ],
446
+ ),
447
+ ],
448
+ instructions_description="Automated ML pipeline that discovers datasets, trains models, evaluates quality, packages winners, and serves predictions.",
449
+ instructions_quick_ref="<!-- AGENT: Extract commands from scripts/, Makefile, or pyproject.toml. Show the 3-5 most common commands for running the pipeline, training, and testing. -->",
450
+ instructions_project_structure="<!-- AGENT: Run directory listing and annotate key directories. Typical ML pipeline structure: pipeline stages, model trainers, configuration, serving layer, scripts, tests. -->",
451
+ instructions_rules=[
452
+ "**Model registry is the brain** -- adding a model means adding a configuration entry, not writing new training code.",
453
+ "**Resource limits** -- monitor CPU and memory during training. Skip or queue work when limits are exceeded.",
454
+ "**Fail graceful** -- each dataset/model wrapped in error handling. Log the error, continue with the next item.",
455
+ ],
456
+ instructions_workflow_phases=[
457
+ {"title": "Find Data", "content": "Search data sources or ingest user-provided datasets."},
458
+ {"title": "Run Pipeline", "content": "Execute discover -> examine -> classify -> train -> evaluate stages."},
459
+ {"title": "Analyze Results (DO NOT SKIP)", "content": "Check for: overfitting (train-val gap too large), underfitting (below quality gates), all models failed, baseline not beaten, problem type mismatch."},
460
+ {"title": "Iterate", "content": "Levers in order: hyperparameter tuning (more trials/time) -> model selection (search space) -> problem reframing -> preprocessing changes -> quality gate adjustment."},
461
+ {"title": "Package and Publish", "content": "Only after quality is confirmed. Evaluate -> package -> publish."},
462
+ ],
463
+ instructions_key_principle="The agent's job is NOT just to run commands. It is to understand, analyze, iterate, and deliver quality.",
464
+ instructions_gotchas=[],
465
+ skills=_ML_SKILLS,
466
+ orchestrator_pipeline="discover -> examine -> classify -> train -> evaluate -> package -> publish",
467
+ orchestrator_status_flow="discovered -> examined -> classified -> training -> trained -> packaged -> published\n | | |\n v v v\n rejected rejected rejected",
468
+ orchestrator_decision_tree="FIRST: Check if pipeline is already complete (all items at terminal status).\n If complete -> report status summary, ask user: re-run / new session / re-validate / exit.\n If not -> proceed:\n\nfor each stage in [discover, examine, classify, train, evaluate, package, publish]:\n 1. Check resource limits (CPU <70%, memory <75%)\n 2. Get datasets at current status (or single dataset if --dataset-id)\n 3. For each dataset:\n a. Run stage function\n b. On success: advance status to next stage\n c. On failure: log error, mark rejected if unrecoverable\n 4. Report: N processed, N failed, N skipped\n\nSpecial: after train stage, run ANALYSIS before evaluate:\n - Check overfitting (train-val gap >0.15)\n - Check underfitting (all below quality gates)\n - Check baseline (better than random?)\n - If all fail: consider reframe (trained -> classified)",
469
+ orchestrator_when_to_stop="- All datasets at terminal status (published or rejected)\n- Resource limits exceeded\n- User requests stop\n- No datasets to process",
470
+ workflow=_ML_WORKFLOW,
471
+ permissions_shell_read=[
472
+ "scripts/job.sh status *",
473
+ "scripts/job.sh logs *",
474
+ "scripts/job.sh list",
475
+ "scripts/job.sh results *",
476
+ ],
477
+ permissions_shell_execute=[
478
+ "python scripts/run_pipeline.py *",
479
+ "python -m pytest *",
480
+ "scripts/job.sh start *",
481
+ ],
482
+ permissions_file_write=[
483
+ "config/**/*.py",
484
+ "pipeline/**/*.py",
485
+ "trainers/**/*.py",
486
+ ],
487
+ permissions_deny_shell=[
488
+ "rm -rf *",
489
+ "docker rm *",
490
+ "systemctl *",
491
+ "kill *",
492
+ ],
493
+ permissions_confirm_shell=[
494
+ "scripts/job.sh stop *",
495
+ "git push *",
496
+ ],
497
+ permissions_confirm_actions=["publish_model", "create_api_key", "lower_quality_gates"],
498
+ permissions_resource_limits={
499
+ "max_cpu_percent": 70,
500
+ "max_memory_percent": 75,
501
+ "check_before": ["train", "evaluate"],
502
+ "on_exceeded": "warn_and_skip",
503
+ },
504
+ env_required=[
505
+ {"name": "OPENML_APIKEY", "description": "OpenML API key for dataset discovery"},
506
+ {"name": "HF_TOKEN", "description": "HuggingFace token for model publishing"},
507
+ ],
508
+ env_optional=[
509
+ {"name": "OPTUNA_TIMEOUT", "default": "300", "description": "Seconds per model for HPO"},
510
+ {"name": "OPTUNA_N_TRIALS", "default": "50", "description": "Max trials per model"},
511
+ ],
512
+ )
513
+
514
+
515
+ # ---------------------------------------------------------------------------
516
+ # Web domain config — drawn from examples/web-app
517
+ # ---------------------------------------------------------------------------
518
+
519
+ _WEB_SKILLS = [
520
+ SkillDef(
521
+ id="scaffold",
522
+ name="Scaffold Feature",
523
+ version="1.0.0",
524
+ description="Generate boilerplate for a new feature: migration, route, component, test. Use when starting a new feature or the user says 'add feature X'. Creates all necessary stubs and file structure.",
525
+ stage=1,
526
+ phase="setup",
527
+ inputs_required=[
528
+ {"name": "feature_name", "type": "string",
529
+ "description": "Name of the feature to scaffold"},
530
+ {"name": "needs_db", "type": "bool",
531
+ "description": "Whether a database migration is needed"},
532
+ ],
533
+ outputs=[
534
+ {"name": "files_created", "type": "list[str]",
535
+ "description": "Paths of generated files"},
536
+ ],
537
+ trigger_command="npx plop feature {name}",
538
+ error_strategy="fail-fast",
539
+ code_primary="plopfile.ts",
540
+ tags=["scaffolding", "code-generation"],
541
+ blocks=["implement"],
542
+ negative_triggers=["Do NOT use for modifying existing features", "Do NOT use for bug fixes"],
543
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["src/**", "tests/**", "migrations/**"]}, "network": False},
544
+ runbook_purpose="Generate all boilerplate files for a new feature: migration, API route, UI component, and test stubs.",
545
+ runbook_when="- Starting a new feature\n- User says \"add feature X\"",
546
+ runbook_how="1. Create migration file if DB changes needed\n2. Create API route with auth middleware\n3. Create React component (server or client)\n4. Create test files (unit + integration)\n5. Update feature flag env var",
547
+ runbook_decision_tree="New feature request:\n |- Needs DB? -> Create migration first\n |- Needs API? -> Create route with withAuth middleware\n |- Needs UI? -> Create component (server-first)\n \\- Always -> Create test stubs",
548
+ runbook_error_handling="- **Template error**: Fail fast, fix template\n- **Migration conflict**: Resolve before continuing",
549
+ ),
550
+ SkillDef(
551
+ id="implement",
552
+ name="Implement Feature",
553
+ version="1.0.0",
554
+ description="Write migration, routes, components, and feature flag integration. Use after scaffold creates boilerplate files and feature requirements are clear. Populates all scaffolded stubs with business logic.",
555
+ stage=2,
556
+ phase="development",
557
+ inputs_required=[
558
+ {"name": "feature_name", "type": "string",
559
+ "description": "Name of the feature to implement"},
560
+ {"name": "files_created", "type": "list[str]",
561
+ "description": "Scaffold output files to populate"},
562
+ ],
563
+ outputs=[
564
+ {"name": "files_modified", "type": "list[str]",
565
+ "description": "Paths of files modified during implementation"},
566
+ ],
567
+ trigger_command="npm run dev",
568
+ error_strategy="fail-fast",
569
+ code_primary="src/",
570
+ tags=["development", "implementation"],
571
+ depends_on=["scaffold"],
572
+ blocks=["test"],
573
+ negative_triggers=["Do NOT use before scaffold has run", "Do NOT use without clear requirements"],
574
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["src/**", "tests/**", "migrations/**"]}, "network": False},
575
+ runbook_purpose="Implement the feature by writing migration logic, API routes, UI components, and wiring up the feature flag.",
576
+ runbook_when="- After scaffold creates boilerplate files\n- Feature requirements are clear",
577
+ runbook_how="1. Write database migration (if needed)\n2. Implement API route with business logic and auth middleware\n3. Build React component (server-first, client when interactive)\n4. Wire up feature flag for gradual rollout\n5. Verify dev server runs without errors",
578
+ runbook_decision_tree="For each scaffolded file:\n |- Migration file? -> Write schema changes, add rollback\n |- API route? -> Add business logic, input validation, auth\n |- Component? -> Implement UI, add loading/error states\n \\- All files populated? -> Run dev server to verify",
579
+ runbook_error_handling="- **Type error**: Fix before moving to tests\n- **Migration conflict**: Resolve with existing migrations\n- **Dev server crash**: Check imports and dependencies",
580
+ ),
581
+ SkillDef(
582
+ id="test",
583
+ name="Run Tests",
584
+ version="1.0.0",
585
+ description="Run unit, integration, and e2e test suites. Use after implementation is complete and before deployment. Supports Jest, React Testing Library, Supertest, and Playwright.",
586
+ stage=3,
587
+ phase="quality",
588
+ inputs_optional=[
589
+ {"name": "suite", "type": "string", "default": "all",
590
+ "description": "Which suite: unit, integration, e2e, or all"},
591
+ ],
592
+ outputs=[
593
+ {"name": "results", "type": "object",
594
+ "description": "Test pass/fail counts"},
595
+ ],
596
+ trigger_command="npm run test",
597
+ error_strategy="fail-fast",
598
+ code_primary="jest.config.ts",
599
+ tags=["testing", "quality"],
600
+ depends_on=["implement"],
601
+ blocks=["review"],
602
+ negative_triggers=["Do NOT use during active implementation — wait until code compiles"],
603
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["tests/**"]}, "network": False},
604
+ runbook_purpose="Run the full test suite to verify feature quality before deployment.",
605
+ runbook_when="- After implementation complete\n- Before deployment",
606
+ runbook_how="1. Unit tests: Jest + React Testing Library\n2. Integration tests: Supertest against Express API\n3. E2E tests: Playwright against running dev server",
607
+ runbook_decision_tree="Run unit tests\n |- Fails? -> Fix before continuing\n \\- Passes? -> Run integration tests\n |- Fails? -> Fix API route or middleware\n \\- Passes? -> Run e2e tests\n |- Fails? -> Fix UI interaction\n \\- All pass? -> Ready for deployment",
608
+ runbook_error_handling="- **Test failure**: Fix before continuing to next suite\n- **Timeout**: Check for hanging async operations",
609
+ ),
610
+ SkillDef(
611
+ id="review",
612
+ name="Code Review",
613
+ version="1.0.0",
614
+ description="Run linting, type checking, bundle analysis, and security scan. Use after all tests pass and before deployment to staging. Catches code quality and security issues.",
615
+ stage=4,
616
+ phase="quality",
617
+ inputs_required=[
618
+ {"name": "feature_name", "type": "string",
619
+ "description": "Name of the feature under review"},
620
+ ],
621
+ outputs=[
622
+ {"name": "review_passed", "type": "bool",
623
+ "description": "Whether all review checks passed"},
624
+ {"name": "issues", "type": "list[str]",
625
+ "description": "List of issues found during review"},
626
+ ],
627
+ trigger_command="npm run lint && npm run typecheck",
628
+ error_strategy="fail-fast",
629
+ code_primary="eslint.config.js",
630
+ tags=["review", "linting", "security"],
631
+ depends_on=["test"],
632
+ blocks=["deploy"],
633
+ negative_triggers=["Do NOT use before tests pass", "Do NOT use for runtime debugging"],
634
+ allowed_tools={"shell": True, "files": {"read": True, "write": False}, "network": False},
635
+ runbook_purpose="Run automated code quality checks: linting, type checking, bundle size analysis, and security scanning.",
636
+ runbook_when="- After all tests pass\n- Before deployment to staging",
637
+ runbook_how="1. Run ESLint with project rules\n2. Run TypeScript type checker (strict mode)\n3. Analyze bundle size for regressions\n4. Run npm audit for security vulnerabilities\n5. Collect all issues into a report",
638
+ runbook_decision_tree="Run lint:\n |- Errors? -> Fix before continuing\n \\- Clean? -> Run typecheck\n |- Type errors? -> Fix before continuing\n \\- Clean? -> Check bundle size\n |- >10% increase? -> Investigate, optimize\n \\- Acceptable? -> Run security scan\n |- Critical vulns? -> Fix before deploy\n \\- Clean? -> Review passed",
639
+ runbook_error_handling="- **Lint errors**: Must fix, cannot deploy with lint errors\n- **Type errors**: Must fix, strict mode is non-negotiable\n- **Security vulnerability**: Critical = block, moderate = warn",
640
+ ),
641
+ SkillDef(
642
+ id="deploy",
643
+ name="Deploy",
644
+ version="1.0.0",
645
+ description="Deploy to staging or production environment. Use after review passes all checks. Runs build, migrations, deploy, health check, and post-deploy monitoring.",
646
+ stage=5,
647
+ phase="delivery",
648
+ inputs_required=[
649
+ {"name": "environment", "type": "string",
650
+ "description": "Target: staging or production"},
651
+ ],
652
+ trigger_command="npm run deploy:{environment}",
653
+ error_strategy="fail-fast",
654
+ code_primary="deploy.config.ts",
655
+ tags=["deployment", "ci-cd"],
656
+ depends_on=["review"],
657
+ negative_triggers=["Do NOT deploy to production without staging verification first", "Do NOT use when tests or review have not passed"],
658
+ allowed_tools={"shell": True, "files": {"read": True, "write": False}, "network": True},
659
+ runbook_purpose="Deploy the application to staging or production.",
660
+ runbook_when="- All tests pass\n- Feature reviewed and approved",
661
+ runbook_how="1. Build production bundle\n2. Run database migrations\n3. Deploy to target environment\n4. Verify health check\n5. Monitor error rates for 15 minutes",
662
+ runbook_decision_tree="Deploy to staging\n |- Health check fails? -> Rollback, investigate\n |- Error rate spikes? -> Rollback, investigate\n \\- Stable for 15 min? -> Promote to production (with confirmation)",
663
+ runbook_error_handling="- **Build failure**: Abort deploy\n- **Health check failure**: Rollback immediately\n- **Error rate spike**: Rollback and investigate",
664
+ ),
665
+ ]
666
+
667
# State machine for a web feature's lifecycle, consumed via
# WEB_CONFIG.workflow. States name the lifecycle stages; transitions bind a
# skill (by id, matching _WEB_SKILLS) plus human-readable gate conditions.
#
# NOTE(review): no transition targets "blocked" — presumably that state is
# entered out-of-band by the orchestrator; confirm against the workflow
# schema. Also the "test" skill is not attached to any transition (the
# testing->staging edge uses "review") — confirm this is intentional.
_WEB_WORKFLOW = WorkflowDef(
    id="feature-lifecycle",
    entity="feature",
    description="Feature development from planning through deployment",
    states=[
        WorkflowStateDef("planned", "Requirements understood, ready to build", initial=True),
        WorkflowStateDef("in-progress", "Implementation underway", active=True),
        WorkflowStateDef("testing", "All tests running"),
        WorkflowStateDef("staging", "Deployed to staging for verification"),
        WorkflowStateDef("deployed", "Live in production", terminal=True),
        WorkflowStateDef("blocked", "Cannot proceed due to dependency or issue", terminal=True),
    ],
    transitions=[
        WorkflowTransitionDef("planned", "in-progress", skill="scaffold",
                              conditions=["Requirements are clear"]),
        WorkflowTransitionDef("in-progress", "testing", skill="implement",
                              conditions=["Implementation complete", "Code compiles without errors"]),
        WorkflowTransitionDef("testing", "staging", skill="review",
                              conditions=["All tests pass", "Code review passes"]),
        WorkflowTransitionDef("staging", "deployed", skill="deploy",
                              conditions=["Health check passes", "Error rate normal",
                                          "User confirms promotion"]),
        # Backward edge: failing tests send the feature back to development.
        WorkflowTransitionDef("testing", "in-progress",
                              conditions=["Tests fail"],
                              description="Fix failing tests"),
    ],
)
694
+
695
# Domain configuration for the web-app template ("dev-assist" mode).
# Bundles the /build workflow command, scaffolded instruction text, the skill
# catalog (_WEB_SKILLS), the lifecycle state machine (_WEB_WORKFLOW), shell /
# file permission patterns, and required/optional environment variables.
# The "<!-- AGENT: ... -->" strings are runtime template content addressed to
# the agent at scaffold time — they are data, not comments.
WEB_CONFIG = DomainConfig(
    mode="dev-assist",
    workflow_commands=[
        CommandDef(
            id="build",
            trigger="/build",
            description="Build a feature end-to-end: scaffold, implement, test, review, deploy",
            runbook_purpose="Guide the agent through the full feature development lifecycle. The agent scaffolds boilerplate, implements the feature, runs tests, reviews code quality, and deploys.",
            worker_specialty="Building web features end-to-end — scaffold, implement, test, deploy",
            # One phase per skill in _WEB_SKILLS, in pipeline order.
            runbook_phases=[
                {"title": "Scaffold", "content": "Generate migration, route, component, and test stubs for the feature."},
                {"title": "Implement", "content": "Write migration logic, API routes, UI components, and wire up feature flags."},
                {"title": "Test", "content": "Run unit, integration, and e2e test suites. Fix failures before proceeding."},
                {"title": "Review", "content": "Run linting, type checking, bundle analysis, and security scan."},
                {"title": "Deploy", "content": "Deploy to staging, verify health checks, then promote to production."},
            ],
        ),
    ],
    instructions_description="Full-stack web application with authentication, billing, and real-time updates.",
    instructions_quick_ref="<!-- AGENT: Extract commands from package.json scripts, Makefile, or equivalent. Show the 3-5 most common commands for dev server, testing, migrations, and deployment. -->",
    instructions_project_structure="<!-- AGENT: Run directory listing and annotate key directories. Typical web app structure: pages/routes, components, API layer, database/ORM, auth, billing, tests. -->",
    instructions_rules=[
        "**Auth on every API route** -- use auth middleware consistently. No unprotected endpoints.",
        "**Feature flags** -- new features behind feature flags until stable.",
    ],
    instructions_workflow_phases=[
        {"title": "Understand Requirements", "content": "What does the feature do? What data does it need? How does it interact with existing features?"},
        {"title": "Implement", "content": "Schema migration -> API route -> UI component -> tests."},
        {"title": "Test (DO NOT SKIP)", "content": "Unit tests pass, integration tests pass, manual QA on staging."},
        {"title": "Deploy", "content": "Staging first, verify metrics, then production."},
    ],
    instructions_key_principle="Ship incrementally. Every feature has a migration, tests, and feature flag before going to production.",
    instructions_gotchas=[],
    skills=_WEB_SKILLS,
    orchestrator_pipeline="scaffold -> implement -> test -> review -> deploy",
    # NOTE(review): this display text says "in_progress" but the workflow
    # state id in _WEB_WORKFLOW is "in-progress" — confirm which spelling the
    # orchestrator actually matches on.
    orchestrator_status_flow="planned -> in_progress -> testing -> staging -> deployed\n /\n blocked (any stage)",
    orchestrator_decision_tree="FIRST: Check if pipeline is already complete (all items at terminal status).\n If complete -> report status summary, ask user: re-run / new session / re-validate / exit.\n If not -> proceed:\n\n1. Understand feature requirements\n2. Create migration if schema change needed\n3. Implement API route with auth middleware\n4. Implement UI component (server-first, client when interactive)\n5. Write tests (unit + integration + e2e)\n6. Deploy to staging\n7. Verify on staging (manual + automated checks)\n8. Deploy to production behind feature flag\n9. Monitor metrics, then remove flag",
    orchestrator_when_to_stop="- Feature deployed and stable in production\n- Feature flag removed after verification\n- All tests passing on main branch",
    workflow=_WEB_WORKFLOW,
    # Glob-style command/path patterns; deny/confirm lists gate destructive
    # and externally-visible operations.
    permissions_shell_read=[],
    permissions_shell_execute=[
        "npm run *",
        "npx *",
        "node *",
    ],
    permissions_file_write=[
        "src/**",
        "tests/**",
    ],
    permissions_deny_shell=[
        "rm -rf *",
        "DROP DATABASE *",
    ],
    permissions_confirm_shell=[
        "npm run deploy:*",
        "git push *",
        "npx drizzle-kit push *",
    ],
    permissions_confirm_actions=["deploy_production", "modify_billing"],
    env_required=[
        {"name": "DATABASE_URL", "description": "PostgreSQL connection string"},
        {"name": "STRIPE_SECRET_KEY", "description": "Stripe API key for billing"},
    ],
    env_optional=[
        {"name": "NODE_ENV", "default": "development", "description": "Runtime environment"},
    ],
)
762
+
763
+
764
+ # ---------------------------------------------------------------------------
765
+ # DevOps domain config — drawn from examples/devops
766
+ # ---------------------------------------------------------------------------
767
+
768
# Skill catalog for the devops domain: provision -> configure -> deploy, plus
# out-of-band rollback (stage 4) and monitor (stage 5). As in the other
# domains, runbook_* fields are freeform text rendered into agent runbooks.
#
# NOTE(review): output "type" values here use "str"/"object" while the web
# skills use "string"/"bool" — confirm which vocabulary the skill schema
# expects, or whether both are accepted.
_DEVOPS_SKILLS = [
    SkillDef(
        id="provision",
        name="Provision Infrastructure",
        version="1.0.0",
        description="Create or update cloud infrastructure via Terraform. Use when new infrastructure is needed or existing resources require scaling. Always previews changes with plan before applying.",
        stage=1,
        phase="infrastructure",
        inputs_required=[
            {"name": "service", "type": "string",
             "description": "Service to provision"},
        ],
        # Environment variables the skill reads at run time.
        inputs_environment=["AWS_PROFILE"],
        trigger_command="terraform plan && terraform apply",
        error_strategy="fail-fast",
        code_primary="terraform/",
        tags=["terraform", "infrastructure", "aws"],
        blocks=["configure"],
        negative_triggers=[
            "Do NOT use for configuration management — use configure skill instead",
            "Do NOT apply destructive changes without user confirmation",
        ],
        allowed_tools={"shell": True, "files": {"read": True, "write": ["terraform/**"]}, "network": True},
        runbook_purpose="Create or update cloud infrastructure using Terraform.",
        runbook_when="- New service needs infrastructure\n- Existing service needs scaling or config change",
        runbook_how="1. `terraform plan -out=plan.tfplan` -- preview all changes\n2. Review plan for destructive actions (destroy, replace)\n3. `terraform apply plan.tfplan` -- apply only after review\n4. Verify resources created via `terraform state list`",
        runbook_decision_tree="terraform plan\n |- No changes? -> Skip (already up to date)\n |- Only additions? -> Safe to apply\n |- Modifications? -> Review carefully, apply if benign\n \\- Destructions? -> STOP -- confirm with user before applying",
        runbook_error_handling="- **Plan error**: Fix config before proceeding\n- **Apply error**: Check state, do NOT retry blindly",
    ),
    SkillDef(
        id="configure",
        name="Configure Service",
        version="1.0.0",
        description="Apply service configuration via Ansible with dry-run verification. Use after infrastructure is provisioned or when configuration changes are needed. Always runs --check mode first.",
        stage=2,
        phase="configuration",
        inputs_required=[
            {"name": "service", "type": "string",
             "description": "Service to configure"},
            {"name": "environment", "type": "string",
             "description": "Target environment"},
        ],
        inputs_environment=["ANSIBLE_VAULT_PASSWORD"],
        outputs=[
            {"name": "configured_services", "type": "list[str]",
             "description": "Services that were successfully configured"},
        ],
        # "{service}" placeholder matches the required "service" input.
        trigger_command="ansible-playbook -i inventory configure.yml --limit {service}",
        error_strategy="fail-fast",
        code_primary="ansible/",
        tags=["ansible", "configuration", "automation"],
        depends_on=["provision"],
        blocks=["deploy"],
        negative_triggers=[
            "Do NOT use before infrastructure is provisioned",
            "Do NOT skip dry-run verification",
        ],
        allowed_tools={"shell": True, "files": {"read": True, "write": ["ansible/**"]}, "network": True},
        runbook_purpose="Apply service configuration using Ansible playbooks after infrastructure is provisioned.",
        runbook_when="- Infrastructure provisioned (Terraform applied)\n- Configuration changes needed\n- New service setup",
        runbook_how="1. Run `ansible-playbook --check` for dry-run preview\n2. Review changes for unexpected modifications\n3. Apply configuration with `ansible-playbook`\n4. Verify services are running with correct config\n5. Advance to `configured`",
        runbook_decision_tree="Dry-run playbook:\n |- No changes? -> Skip (already configured)\n |- Expected changes only? -> Apply\n |- Unexpected changes? -> STOP, investigate\n \\- After apply:\n |- Service healthy? -> Status: configured\n \\- Service unhealthy? -> Rollback config, investigate",
        runbook_error_handling="- **Vault password missing**: Abort, cannot decrypt secrets\n- **Playbook error**: Fix playbook before retrying\n- **Service unhealthy after config**: Rollback to previous config",
    ),
    SkillDef(
        id="deploy",
        name="Deploy Service",
        version="1.0.0",
        description="Blue-green deploy with health checks and monitoring. Use when a new version is ready and staging has been verified. Supports rollback on health check failure or error rate spike.",
        stage=3,
        phase="delivery",
        inputs_required=[
            {"name": "service", "type": "string",
             "description": "Service to deploy"},
            {"name": "environment", "type": "string",
             "description": "Target: staging or production"},
        ],
        inputs_environment=["DEPLOY_ENV"],
        trigger_command="python scripts/manage.py deploy --service {service} --env {environment}",
        error_strategy="fail-fast",
        code_primary="scripts/manage.py",
        tags=["deployment", "blue-green"],
        depends_on=["configure"],
        negative_triggers=[
            "Do NOT deploy to production without staging verification",
            "Do NOT use when infrastructure is not provisioned",
        ],
        # Deploy needs network but no file writes.
        allowed_tools={"shell": True, "files": {"read": True, "write": False}, "network": True},
        runbook_purpose="Deploy a service using blue-green strategy with health checks.",
        runbook_when="- Infrastructure provisioned\n- New version ready to ship\n- Staging verified (for production deploys)",
        runbook_how="1. Build new container image\n2. Deploy to \"green\" target\n3. Run health checks against green\n4. Switch traffic from blue to green\n5. Monitor for 5 minutes\n6. Tear down old blue (or keep for rollback)",
        runbook_decision_tree="Deploy green instance\n |- Build fails? -> Abort, fix build\n |- Health check fails? -> Abort, keep blue\n |- Traffic switch\n | |- Error rate spikes? -> Rollback to blue immediately\n | \\- All healthy for 5 min? -> Confirm deploy, tear down blue",
        runbook_error_handling="- **Build failure**: Abort immediately\n- **Health check failure**: Keep previous deployment\n- **Error rate spike**: Rollback to blue",
    ),
    SkillDef(
        id="rollback",
        name="Rollback Service",
        version="1.0.0",
        description="Rollback a service to the previous known-good deployment. Use when error rates spike after deploy, health checks fail, or user requests emergency rollback. Preserves failed deployment for debugging.",
        stage=4,
        phase="recovery",
        inputs_required=[
            {"name": "service", "type": "string",
             "description": "Service to rollback"},
            {"name": "environment", "type": "string",
             "description": "Target environment"},
        ],
        trigger_command="python scripts/manage.py rollback --service {service} --env {environment}",
        error_strategy="fail-fast",
        code_primary="scripts/manage.py",
        tags=["rollback", "recovery", "incident-response"],
        # Intentionally no depends_on: rollback is reachable out-of-band,
        # not as part of the linear pipeline.
        negative_triggers=[
            "Do NOT use when no previous deployment exists — escalate to human instead",
        ],
        allowed_tools={"shell": True, "files": {"read": True, "write": False}, "network": True},
        runbook_purpose="Revert a service to the previous known-good deployment immediately.",
        runbook_when="- Error rate spikes after deploy\n- Health checks fail after deploy\n- User requests emergency rollback",
        runbook_how="1. Identify previous deployment (blue instance)\n2. Switch traffic back to previous\n3. Verify health of reverted service\n4. Log rollback event with reason\n5. Keep failed deployment for debugging",
        runbook_decision_tree="Rollback initiated\n |- Previous deployment available? -> Switch traffic\n | |- Health check passes? -> Rollback successful\n | \\- Health check fails? -> ESCALATE to human\n \\- No previous deployment? -> ESCALATE to human",
        runbook_error_handling="- **No previous deployment**: Escalate to human\n- **Health check failure after rollback**: Escalate immediately",
    ),
    SkillDef(
        id="monitor",
        name="Monitor Service",
        version="1.0.0",
        description="Check health endpoints, error rates, latency, and resource usage. Use after deploy completes, after rollback to verify recovery, or on-demand health check requests. Reports healthy, degraded, or unhealthy status.",
        stage=5,
        phase="observability",
        inputs_required=[
            {"name": "service", "type": "string",
             "description": "Service to monitor"},
            {"name": "environment", "type": "string",
             "description": "Target environment"},
        ],
        inputs_optional=[
            # NOTE(review): default is the string "300" although "type" is
            # int — confirm whether defaults are coerced downstream.
            {"name": "duration_seconds", "type": "int", "default": "300",
             "description": "How long to monitor in seconds"},
        ],
        outputs=[
            {"name": "health_status", "type": "str",
             "description": "Overall health: healthy, degraded, or unhealthy"},
            {"name": "metrics_summary", "type": "object",
             "description": "Summary of collected metrics"},
        ],
        trigger_command="python scripts/manage.py monitor --service {service} --env {environment}",
        # Unlike the fail-fast skills above, monitoring tolerates partial
        # failures per item.
        error_strategy="per-item-isolation",
        code_primary="monitoring/",
        tags=["monitoring", "observability", "health-checks"],
        depends_on=["deploy"],
        negative_triggers=[
            "Do NOT use as a substitute for proper monitoring infrastructure",
        ],
        allowed_tools={"shell": True, "files": {"read": True, "write": False}, "network": True},
        runbook_purpose="Monitor a deployed service by checking health endpoints, error rates, latency percentiles, and resource usage.",
        runbook_when="- After deploy completes successfully\n- After rollback to verify recovery\n- On-demand health check requested",
        runbook_how="1. Poll health endpoint every 10 seconds\n2. Collect error rate from logs/metrics\n3. Measure latency p50, p95, p99\n4. Check CPU and memory usage\n5. Compare against thresholds for duration\n6. Report final health status",
        runbook_decision_tree="Monitor for duration_seconds:\n |- Health endpoint down? -> Status: unhealthy\n |- Error rate > 1%? -> Status: degraded\n |- Latency p99 > threshold? -> Status: degraded\n |- CPU > 80% or Memory > 85%? -> Status: degraded\n \\- All metrics normal? -> Status: healthy\n\nAfter monitoring:\n |- Healthy? -> Confirm deployment\n |- Degraded? -> Trigger rollback consideration\n \\- Unhealthy? -> Immediate rollback",
        runbook_error_handling="- **Health endpoint unreachable**: Mark unhealthy immediately\n- **Metrics collection failure**: Log warning, continue with available data\n- **Threshold breach**: Alert and recommend rollback",
    ),
]
928
+
929
# State machine for a devops service's deployment lifecycle, consumed via
# DEVOPS_CONFIG.workflow. Includes degradation/rollback edges in addition to
# the happy path (planned -> provisioning -> configured -> deploying ->
# deployed).
#
# NOTE(review): "deployed" is declared terminal=True yet has an outgoing
# transition to "degraded" below — confirm the workflow schema permits
# transitions out of terminal states, or drop the terminal flag.
_DEVOPS_WORKFLOW = WorkflowDef(
    id="service-lifecycle",
    entity="service",
    description="Service deployment lifecycle with monitoring and rollback",
    states=[
        WorkflowStateDef("planned", "Infrastructure design approved", initial=True),
        WorkflowStateDef("provisioning", "Terraform creating resources", active=True),
        WorkflowStateDef("configured", "Ansible config applied"),
        WorkflowStateDef("deploying", "Blue-green deploy in progress", active=True),
        WorkflowStateDef("deployed", "Service live and receiving traffic", terminal=True),
        WorkflowStateDef("degraded", "Service live but metrics abnormal"),
        WorkflowStateDef("rolled-back", "Reverted to previous version"),
        WorkflowStateDef("failed", "Deployment failed, no traffic switched", terminal=True),
    ],
    transitions=[
        WorkflowTransitionDef("planned", "provisioning", skill="provision",
                              conditions=["Terraform plan reviewed"]),
        WorkflowTransitionDef("provisioning", "configured", skill="configure",
                              conditions=["All resources created", "Health checks pass"],
                              on_failure="failed"),
        WorkflowTransitionDef("configured", "deploying", skill="deploy",
                              conditions=["Tests pass in staging"]),
        # Promotion edge has no skill: the deploy skill has already run and
        # this transition only records the health-gated status change.
        WorkflowTransitionDef("deploying", "deployed",
                              conditions=["Health check passes",
                                          "Error rate < 1% for 5 min"],
                              on_failure="failed"),
        WorkflowTransitionDef("deployed", "degraded",
                              conditions=["Error rate > 1% or latency p99 > threshold"]),
        WorkflowTransitionDef("degraded", "rolled-back", skill="rollback",
                              conditions=["Degradation confirmed"]),
        # Recovery loop: after a fix, a rolled-back service re-enters deploy.
        WorkflowTransitionDef("rolled-back", "deploying", skill="deploy",
                              conditions=["Fix applied and tested"],
                              description="Re-deploy after fixing the issue"),
    ],
)
964
+
965
+ DEVOPS_CONFIG = DomainConfig(
966
+ mode="dev-assist",
967
+ workflow_commands=[
968
+ CommandDef(
969
+ id="provision",
970
+ trigger="/provision",
971
+ description="Provision and deploy infrastructure: provision, configure, deploy",
972
+ runbook_purpose="Guide the agent through the full infrastructure provisioning and deployment lifecycle. The agent provisions resources via Terraform, configures services via Ansible, and deploys with blue-green strategy.",
973
+ worker_specialty="Provisioning and deploying infrastructure — Terraform, Ansible, blue-green deploys",
974
+ runbook_phases=[
975
+ {"title": "Provision", "content": "Run terraform plan, review for destructive changes, then apply."},
976
+ {"title": "Configure", "content": "Apply Ansible playbooks with dry-run verification first."},
977
+ {"title": "Deploy", "content": "Blue-green deploy with health checks. Monitor for 5 minutes. Rollback if degraded."},
978
+ ],
979
+ ),
980
+ ],
981
+ instructions_description="Infrastructure automation — provision, configure, deploy, monitor, and rollback.",
982
+ instructions_quick_ref="<!-- AGENT: Extract commands from scripts/, Makefile, or CI config. Show the 3-5 most common commands for planning, applying, deploying, and rolling back. -->",
983
+ instructions_project_structure="<!-- AGENT: Run directory listing and annotate key directories. Typical infrastructure structure: IaC definitions, configuration management, deployment scripts, monitoring, container definitions. -->",
984
+ instructions_rules=[
985
+ "**Never apply without preview** -- always preview infrastructure changes before applying.",
986
+ "**Staging first** -- every change hits staging before production.",
987
+ "**Rollback ready** -- every deploy must have a tested rollback path.",
988
+ "**No hardcoded secrets** -- use a secrets manager. Never commit credentials.",
989
+ "**Tag everything** -- all resources tagged with service, environment, owner.",
990
+ ],
991
+ instructions_workflow_phases=[
992
+ {"title": "Verify Prerequisites", "content": "Check: image built, tests pass, staging healthy, rollback tested."},
993
+ {"title": "Deploy to Staging", "content": "Blue-green deployment, health check, smoke tests."},
994
+ {"title": "Monitor (DO NOT SKIP)", "content": "Watch error rates, latency p99, CPU/memory for 5 minutes."},
995
+ {"title": "Promote or Rollback", "content": "If metrics stable -> promote to production. If metrics degrade -> immediate rollback."},
996
+ ],
997
+ instructions_key_principle="Infrastructure changes are permanent and visible. Measure twice, apply once. Always have a rollback plan.",
998
+ instructions_gotchas=[],
999
+ skills=_DEVOPS_SKILLS,
1000
+ orchestrator_pipeline="provision -> configure -> deploy -> monitor -> verify",
1001
+ orchestrator_status_flow="planned -> provisioning -> configured -> deploying -> deployed -> monitored\n | |\n failed degraded -> rollback -> deployed",
1002
+ orchestrator_decision_tree="FIRST: Check if pipeline is already complete (all items at terminal status).\n If complete -> report status summary, ask user: re-run / new session / re-validate / exit.\n If not -> proceed:\n\n1. Provision infrastructure (Terraform)\n |- Plan shows destructive changes? -> STOP, confirm with user\n \\- Plan is additive? -> Apply\n2. Configure services (Ansible)\n |- Dry-run shows unexpected changes? -> STOP, investigate\n \\- Dry-run clean? -> Apply\n3. Deploy service (blue-green)\n |- Health check fails? -> Rollback immediately\n |- Error rate > 1%? -> Rollback immediately\n \\- All healthy? -> Mark deployed\n4. Monitor for 5 minutes\n |- Metrics degrade? -> Rollback\n \\- Stable? -> Confirm deployment",
1003
+ orchestrator_when_to_stop="- Service deployed and metrics stable\n- Rollback completed successfully\n- Escalation required (human intervention needed)",
1004
+ workflow=_DEVOPS_WORKFLOW,
1005
+ permissions_shell_read=[
1006
+ "terraform plan *",
1007
+ "terraform state list *",
1008
+ "ansible-inventory *",
1009
+ "docker ps *",
1010
+ "kubectl get *",
1011
+ ],
1012
+ permissions_shell_execute=[
1013
+ "python scripts/manage.py *",
1014
+ "ansible-playbook --check *",
1015
+ ],
1016
+ permissions_file_write=[
1017
+ "terraform/**/*.tf",
1018
+ "ansible/**/*.yml",
1019
+ ],
1020
+ permissions_deny_shell=[
1021
+ "terraform destroy *",
1022
+ "rm -rf *",
1023
+ "kubectl delete namespace *",
1024
+ ],
1025
+ permissions_confirm_shell=[
1026
+ "terraform apply *",
1027
+ "ansible-playbook *",
1028
+ "python scripts/manage.py deploy --env production *",
1029
+ "python scripts/manage.py rollback *",
1030
+ ],
1031
+ permissions_confirm_actions=["deploy_production", "scale_down", "destroy_resource"],
1032
+ env_required=[
1033
+ {"name": "AWS_PROFILE", "description": "AWS credentials profile"},
1034
+ {"name": "DEPLOY_ENV", "description": "Target environment (staging/production)"},
1035
+ ],
1036
+ env_optional=[
1037
+ {"name": "ROLLBACK_WINDOW", "default": "300", "description": "Seconds to monitor before confirming deploy"},
1038
+ ],
1039
+ )
1040
+
1041
+
1042
+ # ---------------------------------------------------------------------------
1043
+ # Research domain config — agent-integrated content processing
1044
+ # ---------------------------------------------------------------------------
1045
+
1046
+ _RESEARCH_SKILLS = [
1047
+ SkillDef(
1048
+ id="ingest",
1049
+ name="Ingest Content",
1050
+ version="1.0.0",
1051
+ description="Collect papers and content from sources (arXiv, Semantic Scholar, RSS, web). Use when the pipeline needs fresh content or no items are in ingested status. Deduplicates by DOI, URL, and title similarity.",
1052
+ stage=1,
1053
+ phase="ingestion",
1054
+ inputs_required=[
1055
+ {"name": "sources", "type": "list[str]",
1056
+ "description": "Content sources to query (URLs, search terms, feed URLs)"},
1057
+ ],
1058
+ inputs_optional=[
1059
+ {"name": "max_items", "type": "int", "default": "50",
1060
+ "description": "Maximum items to ingest per run"},
1061
+ {"name": "date_range", "type": "str", "default": "7d",
1062
+ "description": "How far back to look (e.g. 7d, 30d, 1y)"},
1063
+ ],
1064
+ outputs=[
1065
+ {"name": "ingested_ids", "type": "list[str]",
1066
+ "description": "IDs of newly ingested items"},
1067
+ ],
1068
+ trigger_command="python scripts/pipeline.py --stage ingest",
1069
+ error_strategy="per-item-isolation",
1070
+ code_primary="pipeline/ingest.py",
1071
+ tags=["ingestion", "arxiv", "semantic-scholar", "rss"],
1072
+ blocks=["parse"],
1073
+ negative_triggers=["Do NOT use for manual document upload — add files directly to data/"],
1074
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "data/**"]}, "network": True},
1075
+ runbook_purpose="Collect new papers, articles, and content from configured sources.",
1076
+ runbook_when="- No items in `ingested` status\n- User requests new content\n- Scheduled daily/weekly",
1077
+ runbook_how="1. Query each configured source (arXiv API, Semantic Scholar, RSS feeds, web scraping)\n2. Deduplicate against existing records by DOI, URL, or title similarity\n3. Store raw content with source metadata\n4. Record provenance and access timestamps",
1078
+ runbook_decision_tree="For each source:\n |- API available? -> Query API with filters\n |- RSS feed? -> Parse feed entries\n |- Web URL? -> Fetch and extract content\n \\- For each item:\n |- Already exists? -> Skip\n |- Matches topic filters? -> Ingest\n \\- No match? -> Skip",
1079
+ runbook_error_handling="- **API rate limit**: Back off and retry\n- **Network error**: Skip source, continue with others\n- **Duplicate detected**: Skip silently",
1080
+ ),
1081
+ SkillDef(
1082
+ id="parse",
1083
+ name="Parse Content",
1084
+ version="1.0.0",
1085
+ description="Extract structure, metadata, key sections, and citations from raw content. Use after ingest when items reach ingested status. Supports PDF, HTML, and plain text formats.",
1086
+ stage=2,
1087
+ phase="extraction",
1088
+ inputs_required=[
1089
+ {"name": "item_id", "type": "str",
1090
+ "description": "ID of item to parse"},
1091
+ ],
1092
+ outputs=[
1093
+ {"name": "parsed_content", "type": "object",
1094
+ "description": "Structured representation with sections, metadata, citations"},
1095
+ ],
1096
+ trigger_command="python scripts/pipeline.py --stage parse --item-id {ID}",
1097
+ error_strategy="per-item-isolation",
1098
+ code_primary="pipeline/parse.py",
1099
+ tags=["parsing", "extraction", "metadata"],
1100
+ depends_on=["ingest"],
1101
+ blocks=["analyze"],
1102
+ negative_triggers=["Do NOT use on already-parsed items"],
1103
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "data/**"]}, "network": False},
1104
+ runbook_purpose="Extract structured information from raw ingested content: title, authors, abstract, sections, figures, citations, and metadata.",
1105
+ runbook_when="- Item is at `ingested` status\n- After ingest skill completes",
1106
+ runbook_how="1. Detect content type (PDF, HTML, plain text)\n2. Extract metadata: title, authors, date, DOI, venue\n3. Extract key sections: abstract, introduction, methods, results, conclusion\n4. Extract citations and build reference list\n5. Extract figures/tables if present\n6. Advance to `parsed`",
1107
+ runbook_decision_tree="Detect format:\n |- PDF? -> Use PDF parser (PyMuPDF/pdfplumber)\n |- HTML? -> Use BeautifulSoup/trafilatura\n |- Plain text? -> Use regex-based extraction\n \\- After extraction:\n |- Missing title or abstract? -> Mark for manual review\n \\- Complete? -> Status: \"parsed\"",
1108
+ runbook_error_handling="- **Corrupted PDF**: Reject with reason\n- **Encoding error**: Try fallback encodings\n- **Missing metadata**: Infer from content where possible",
1109
+ ),
1110
+ SkillDef(
1111
+ id="analyze",
1112
+ name="Analyze Content",
1113
+ version="1.0.0",
1114
+ description="Classify topics, extract insights, and identify key findings. Use after parse when items reach parsed status. Computes relevance scores and finds connections to existing knowledge.",
1115
+ stage=3,
1116
+ phase="analysis",
1117
+ inputs_required=[
1118
+ {"name": "item_id", "type": "str",
1119
+ "description": "ID of item to analyze"},
1120
+ ],
1121
+ outputs=[
1122
+ {"name": "topics", "type": "list[str]",
1123
+ "description": "Classified topic tags"},
1124
+ {"name": "key_findings", "type": "list[str]",
1125
+ "description": "Extracted key findings and insights"},
1126
+ {"name": "relevance_score", "type": "float",
1127
+ "description": "0.0-1.0 relevance to research focus"},
1128
+ ],
1129
+ trigger_command="python scripts/pipeline.py --stage analyze --item-id {ID}",
1130
+ error_strategy="per-item-isolation",
1131
+ code_primary="pipeline/analyze.py",
1132
+ tags=["analysis", "classification", "insights"],
1133
+ depends_on=["parse"],
1134
+ blocks=["organize"],
1135
+ negative_triggers=["Do NOT use on unparsed items — run parse first"],
1136
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "data/**"]}, "network": False},
1137
+ runbook_purpose="Classify topics, extract key findings, compute relevance scores, and identify connections to existing knowledge.",
1138
+ runbook_when="- Item is at `parsed` status\n- After parse skill completes",
1139
+ runbook_how="1. Classify into topic taxonomy (multi-label)\n2. Extract key findings and contributions\n3. Identify methodology and approach\n4. Compute relevance score against research focus\n5. Find connections to previously analyzed items\n6. Advance to `analyzed`",
1140
+ runbook_decision_tree="Analyze parsed content:\n |- Relevance score < 0.2? -> Reject: \"low_relevance\"\n |- No identifiable findings? -> Reject: \"no_findings\"\n |- Duplicate findings? -> Merge with existing, note source\n \\- Valid analysis? -> Status: \"analyzed\"",
1141
+ runbook_error_handling="- **Ambiguous topic**: Assign multiple labels, flag for review\n- **Low confidence**: Lower relevance score, keep for manual review",
1142
+ ),
1143
+ SkillDef(
1144
+ id="organize",
1145
+ name="Organize Content",
1146
+ version="1.0.0",
1147
+ description="Categorize by topic and relevance, build taxonomy, cross-reference related items. Use after analyze when items reach analyzed status with relevance_score >= 0.2. Updates the knowledge graph.",
1148
+ stage=4,
1149
+ phase="taxonomy",
1150
+ inputs_required=[
1151
+ {"name": "item_id", "type": "str",
1152
+ "description": "ID of item to organize"},
1153
+ ],
1154
+ outputs=[
1155
+ {"name": "categories", "type": "list[str]",
1156
+ "description": "Assigned taxonomy categories"},
1157
+ {"name": "cross_refs", "type": "list[str]",
1158
+ "description": "IDs of related items"},
1159
+ ],
1160
+ trigger_command="python scripts/pipeline.py --stage organize --item-id {ID}",
1161
+ error_strategy="per-item-isolation",
1162
+ code_primary="pipeline/organize.py",
1163
+ tags=["taxonomy", "categorization", "cross-reference"],
1164
+ depends_on=["analyze"],
1165
+ blocks=["display"],
1166
+ negative_triggers=["Do NOT use on items below relevance threshold (relevance_score < 0.2)"],
1167
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["pipeline/**", "data/**", "output/**"]}, "network": False},
1168
+ runbook_purpose="Place analyzed content into the knowledge taxonomy, cross-reference with related items, and update the knowledge graph.",
1169
+ runbook_when="- Item is at `analyzed` status\n- After analyze skill completes",
1170
+ runbook_how="1. Map topics to taxonomy categories\n2. Find related items by topic overlap and citation links\n3. Update knowledge graph with new connections\n4. Compute cluster membership\n5. Update category summaries\n6. Advance to `organized`",
1171
+ runbook_decision_tree="For analyzed item:\n |- Fits existing category? -> Assign and cross-reference\n |- New topic cluster? -> Create new category, assign\n |- Contradicts existing findings? -> Flag for attention\n \\- Organized? -> Status: \"organized\"",
1172
+ runbook_error_handling="- **Category conflict**: Assign to multiple, flag for review\n- **Missing cross-references**: Continue without, note gap",
1173
+ ),
1174
+ SkillDef(
1175
+ id="display",
1176
+ name="Display Results",
1177
+ version="1.0.0",
1178
+ description="Generate dashboard, report, or organized output for consumption. Use after organize completes or when user requests a summary. Supports markdown, HTML, and JSON output formats.",
1179
+ stage=5,
1180
+ phase="delivery",
1181
+ inputs_optional=[
1182
+ {"name": "format", "type": "str", "default": "markdown",
1183
+ "description": "Output format: markdown, html, json"},
1184
+ {"name": "focus_topics", "type": "list[str]",
1185
+ "description": "Topics to highlight in output"},
1186
+ ],
1187
+ outputs=[
1188
+ {"name": "report_path", "type": "str",
1189
+ "description": "Path to generated report"},
1190
+ ],
1191
+ trigger_command="python scripts/pipeline.py --stage display",
1192
+ error_strategy="fail-fast",
1193
+ code_primary="pipeline/display.py",
1194
+ tags=["reporting", "dashboard", "output"],
1195
+ depends_on=["organize"],
1196
+ negative_triggers=["Do NOT use when no items have been organized yet"],
1197
+ allowed_tools={"shell": True, "files": {"read": True, "write": ["output/**"]}, "network": False},
1198
+ runbook_purpose="Generate readable output: topic summaries, key findings digest, trend analysis, and cross-reference maps.",
1199
+ runbook_when="- After organize completes\n- User requests a report or summary",
1200
+ runbook_how="1. Gather all organized items by category\n2. Generate topic summaries with key findings\n3. Build trend analysis (new topics, growing areas)\n4. Create cross-reference visualization\n5. Output in requested format (markdown, HTML, JSON)",
1201
+ runbook_decision_tree="Generate report:\n |- No organized items? -> Report: \"No content processed yet\"\n |- Focus topics specified? -> Filter to those topics\n \\- Full report? -> Include all categories with summaries",
1202
+ runbook_error_handling="- **Empty category**: Include with note \"no items yet\"\n- **Template error**: Fall back to plain text output",
1203
+ ),
1204
+ ]
1205
+
1206
+ _RESEARCH_WORKFLOW = WorkflowDef(
1207
+ id="content-pipeline",
1208
+ entity="content-item",
1209
+ description="Content processing pipeline from ingestion through organized display",
1210
+ states=[
1211
+ WorkflowStateDef("ingested", "Raw content collected from source", initial=True),
1212
+ WorkflowStateDef("parsed", "Structure and metadata extracted"),
1213
+ WorkflowStateDef("analyzed", "Topics classified, findings extracted", active=True),
1214
+ WorkflowStateDef("organized", "Categorized and cross-referenced"),
1215
+ WorkflowStateDef("displayed", "Included in generated output", terminal=True),
1216
+ WorkflowStateDef("rejected", "Failed quality checks or irrelevant", terminal=True),
1217
+ ],
1218
+ transitions=[
1219
+ WorkflowTransitionDef("ingested", "parsed", skill="parse",
1220
+ conditions=["Content is accessible and readable"]),
1221
+ WorkflowTransitionDef("parsed", "analyzed", skill="analyze",
1222
+ conditions=["Metadata extraction successful"]),
1223
+ WorkflowTransitionDef("analyzed", "organized", skill="organize",
1224
+ conditions=["Relevance score >= 0.2", "Key findings extracted"]),
1225
+ WorkflowTransitionDef("organized", "displayed", skill="display",
1226
+ conditions=["Category assigned", "Cross-references resolved"]),
1227
+ WorkflowTransitionDef("ingested", "rejected",
1228
+ conditions=["Content unreadable or corrupted"],
1229
+ description="Reject unprocessable content"),
1230
+ WorkflowTransitionDef("analyzed", "rejected",
1231
+ conditions=["Relevance score < 0.2"],
1232
+ description="Reject irrelevant content"),
1233
+ ],
1234
+ )
1235
+
1236
+ RESEARCH_CONFIG = DomainConfig(
1237
+ mode="agent-integrated",
1238
+ workflow_commands=[
1239
+ CommandDef(
1240
+ id="build",
1241
+ trigger="/build",
1242
+ description="Build the research pipeline codebase from scratch: project structure, source adapters, storage, pipeline stages, taxonomy, scripts, tests",
1243
+ runbook_purpose="Construct the complete research content pipeline codebase. The agent creates project structure, implements source adapters, sets up storage, builds each pipeline stage, defines the topic taxonomy, adds CLI scripts, and verifies with tests.",
1244
+ worker_specialty="Constructing research pipeline codebases — sources, storage, stages, taxonomy",
1245
+ runbook_phases=[
1246
+ {"title": "Project Structure", "content": "Create directory layout: pipeline/, sources/, config/, output/, data/, scripts/, tests/. Set up pyproject.toml, __init__.py files, and virtual environment."},
1247
+ {"title": "Source Adapters", "content": "Implement source adapters: arxiv.py, semantic_scholar.py, rss.py, web.py. Each adapter handles authentication, rate limiting, and returns normalized content objects."},
1248
+ {"title": "Storage Layer", "content": "Set up SQLite or file-based storage for raw and processed content. Create schema for items, metadata, citations, and topic assignments. Add helper functions."},
1249
+ {"title": "Pipeline Stages", "content": "Build each stage module: ingest.py, parse.py, analyze.py, organize.py, display.py. Each reads items at current status, processes them, and advances status."},
1250
+ {"title": "Topic Taxonomy", "content": "Create config/topic_taxonomy.py — the classification hierarchy. Define top-level categories, subcategories, and keyword mappings. Taxonomy evolves as content is processed."},
1251
+ {"title": "Scripts & CLI", "content": "Build scripts/pipeline.py with --stage and --sources flags. Add convenience scripts for common operations (search, report generation)."},
1252
+ {"title": "Tests & Verification", "content": "Write unit tests for each pipeline stage and source adapter. Add integration test that runs ingest->parse->analyze on sample content. Verify all imports and CLI commands work."},
1253
+ ],
1254
+ ),
1255
+ CommandDef(
1256
+ id="process",
1257
+ trigger="/process",
1258
+ description="Run the content pipeline: ingest, parse, analyze, organize, display",
1259
+ runbook_purpose="Execute the complete content processing pipeline. The agent ingests content from sources, extracts structure, analyzes findings, organizes into taxonomy, and generates output.",
1260
+ worker_specialty="Processing research content through the pipeline — ingest, analyze, organize",
1261
+ runbook_phases=[
1262
+ {"title": "Ingest", "content": "Collect papers/articles from configured sources (arXiv, Semantic Scholar, RSS, web)."},
1263
+ {"title": "Parse", "content": "Extract structure, metadata, sections, and citations from raw content."},
1264
+ {"title": "Analyze", "content": "Classify topics, extract key findings, compute relevance scores."},
1265
+ {"title": "Organize", "content": "Categorize by topic, build taxonomy, cross-reference related items."},
1266
+ {"title": "Display", "content": "Generate reports, summaries, and visualizations of processed content."},
1267
+ ],
1268
+ ),
1269
+ ],
1270
+ instructions_description="Research content processing pipeline. Ingests papers and articles, extracts structure and metadata, analyzes findings, organizes by topic taxonomy, and generates reports.",
1271
+ instructions_quick_ref="<!-- AGENT: Extract commands from scripts/, Makefile, or pyproject.toml. Show the 3-5 most common commands for running the pipeline, ingesting sources, and generating output. -->",
1272
+ instructions_project_structure="<!-- AGENT: Run directory listing and annotate key directories. Typical research pipeline structure: pipeline stages, source adapters, configuration, output, data storage. -->",
1273
+ instructions_rules=[
1274
+ "**Source attribution** -- always record provenance and access date for every ingested item.",
1275
+ "**Deduplication** -- check by identifier, URL, and title similarity before ingesting.",
1276
+ "**Relevance filtering** -- reject items below the configured relevance threshold.",
1277
+ "**Incremental processing** -- resume from last completed stage, never reprocess completed items.",
1278
+ "**Structured output** -- all analysis results stored in structured format, not just prose.",
1279
+ ],
1280
+ instructions_workflow_phases=[
1281
+ {"title": "Ingest Sources", "content": "Query configured sources for new content matching topic filters."},
1282
+ {"title": "Extract Structure", "content": "Parse content into structured sections, metadata, and citations."},
1283
+ {"title": "Analyze & Classify", "content": "Topic classification, key finding extraction, relevance scoring."},
1284
+ {"title": "Organize & Connect", "content": "Build taxonomy, cross-reference, update knowledge graph."},
1285
+ {"title": "Generate Output", "content": "Produce reports, summaries, and dashboards in requested format."},
1286
+ ],
1287
+ instructions_key_principle="Content is the raw material; structured knowledge is the product. Every item flows through the pipeline from raw ingestion to organized, cross-referenced output.",
1288
+ instructions_gotchas=[],
1289
+ skills=_RESEARCH_SKILLS,
1290
+ orchestrator_pipeline="ingest -> parse -> analyze -> organize -> display",
1291
+ orchestrator_status_flow="ingested -> parsed -> analyzed -> organized -> displayed\n | |\n v v\n rejected rejected",
1292
+ orchestrator_decision_tree="FIRST: Check if pipeline is already complete (all items at terminal status).\n If complete -> report status summary, ask user: re-run / new session / re-validate / exit.\n If not -> proceed:\n\nfor each stage in [ingest, parse, analyze, organize, display]:\n 1. Get items at current status\n 2. For each item:\n a. Run stage skill\n b. On success: advance status\n c. On failure: log error, reject if unrecoverable\n 3. Report: N processed, N rejected, N skipped\n\nAfter analyze stage:\n - Check relevance scores\n - Reject items below threshold\n - Flag items that contradict existing findings",
1293
+ orchestrator_when_to_stop="- All items at terminal status (displayed or rejected)\n- No new items from sources\n- User requests stop",
1294
+ workflow=_RESEARCH_WORKFLOW,
1295
+ permissions_shell_read=[
1296
+ "python scripts/pipeline.py status *",
1297
+ "python scripts/pipeline.py list *",
1298
+ ],
1299
+ permissions_shell_execute=[
1300
+ "python scripts/pipeline.py *",
1301
+ "python -m pytest *",
1302
+ ],
1303
+ permissions_file_write=[
1304
+ "pipeline/**/*.py",
1305
+ "config/**/*.py",
1306
+ "output/**",
1307
+ "data/**",
1308
+ ],
1309
+ permissions_deny_shell=[
1310
+ "rm -rf *",
1311
+ ],
1312
+ permissions_confirm_shell=[
1313
+ "git push *",
1314
+ ],
1315
+ permissions_confirm_actions=[],
1316
+ env_required=[],
1317
+ env_optional=[
1318
+ {"name": "SEMANTIC_SCHOLAR_API_KEY", "default": "", "description": "Semantic Scholar API key (optional, increases rate limits)"},
1319
+ {"name": "MAX_ITEMS_PER_RUN", "default": "50", "description": "Maximum items to process per pipeline run"},
1320
+ ],
1321
+ )
1322
+
1323
+
1324
+ # ---------------------------------------------------------------------------
1325
+ # Generic agent-integrated base (used for Custom type in interactive picker)
1326
+ # ---------------------------------------------------------------------------
1327
+
1328
+ _BUILD_COMMAND = CommandDef(
1329
+ id="build",
1330
+ trigger="/build",
1331
+ description="Build the project codebase: structure, core modules, storage, configuration, scripts, tests",
1332
+ runbook_purpose="Construct the project codebase from scratch. The agent creates directory structure, implements core modules, sets up storage, writes configuration, adds scripts, and verifies with tests.",
1333
+ worker_specialty="Constructing project codebases from scratch",
1334
+ runbook_phases=[
1335
+ {"title": "Project Structure", "content": "Create directory layout, set up package config, __init__.py files, and virtual environment or package manager."},
1336
+ {"title": "Core Modules", "content": "Implement the primary modules that define the system's behavior. Each module has a clear responsibility and interface."},
1337
+ {"title": "Storage & State", "content": "Set up database, file storage, or state management. Create schemas, migrations, and helper functions."},
1338
+ {"title": "Configuration", "content": "Create settings module with environment-based config. Define thresholds, endpoints, and operational parameters."},
1339
+ {"title": "Integration Layer", "content": "Wire modules together. Implement the main entry point and any API or CLI interfaces."},
1340
+ {"title": "Scripts & CLI", "content": "Build run scripts with appropriate flags and options. Add convenience scripts for common operations."},
1341
+ {"title": "Tests & Verification", "content": "Write unit tests for each module. Add integration test for the primary workflow. Verify all imports and commands work."},
1342
+ ],
1343
+ )
1344
+
1345
+ DEV_ASSIST_BASE_CONFIG = DomainConfig(
1346
+ mode="dev-assist",
1347
+ workflow_commands=[_BUILD_COMMAND],
1348
+ instructions_description="", # empty -> falls through to TODO scaffolding in template
1349
+ )
1350
+
1351
+ AGENT_INTEGRATED_BASE_CONFIG = DomainConfig(
1352
+ mode="agent-integrated",
1353
+ workflow_commands=[
1354
+ _BUILD_COMMAND,
1355
+ CommandDef(
1356
+ id="run",
1357
+ trigger="/run",
1358
+ description="Execute the primary pipeline or workflow end-to-end",
1359
+ runbook_purpose="Run the project's main workflow from start to finish. The agent executes each stage, monitors progress, handles errors, and reports results.",
1360
+ worker_specialty="Executing the primary project workflow end-to-end",
1361
+ runbook_phases=[
1362
+ {"title": "Pre-flight Check", "content": "Verify environment, dependencies, and configuration. Ensure storage is accessible and previous state is consistent."},
1363
+ {"title": "Execute Pipeline", "content": "Run each stage of the pipeline in order. Monitor progress and resource usage. Handle per-item errors gracefully."},
1364
+ {"title": "Analyze Results", "content": "Check output quality, verify expected outcomes, and flag anomalies or failures."},
1365
+ {"title": "Report & Clean Up", "content": "Generate summary of results. Archive artifacts. Update state for next run."},
1366
+ ],
1367
+ ),
1368
+ ],
1369
+ instructions_description="", # empty -> falls through to TODO scaffolding in template
1370
+ )
1371
+
1372
+
1373
+ # ---------------------------------------------------------------------------
1374
+ # Public mapping
1375
+ # ---------------------------------------------------------------------------
1376
+
1377
+ DOMAIN_CONFIGS: Dict[str, DomainConfig] = {
1378
+ "ml": ML_CONFIG,
1379
+ "web": WEB_CONFIG,
1380
+ "devops": DEVOPS_CONFIG,
1381
+ "research": RESEARCH_CONFIG,
1382
+ }