agentic-qe 3.7.18 → 3.7.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. package/.claude/skills/iterative-loop/SKILL.md +371 -0
  2. package/.claude/skills/skills-manifest.json +35 -7
  3. package/.claude/skills/validation-pipeline/SKILL.md +164 -0
  4. package/.claude/skills/validation-pipeline/evals/validation-pipeline.yaml +544 -0
  5. package/.claude/skills/validation-pipeline/schemas/output.json +193 -0
  6. package/.claude/skills/validation-pipeline/scripts/validate-config.json +34 -0
  7. package/README.md +5 -3
  8. package/assets/skills/skills-manifest.json +17 -1
  9. package/assets/skills/validation-pipeline/SKILL.md +164 -0
  10. package/assets/skills/validation-pipeline/evals/validation-pipeline.yaml +544 -0
  11. package/assets/skills/validation-pipeline/schemas/output.json +193 -0
  12. package/assets/skills/validation-pipeline/scripts/validate-config.json +34 -0
  13. package/dist/cli/bundle.js +2 -2
  14. package/dist/context/compiler.js +4 -0
  15. package/dist/context/index.d.ts +2 -0
  16. package/dist/context/index.js +2 -0
  17. package/dist/context/sources/defect-source.d.ts +17 -0
  18. package/dist/context/sources/defect-source.js +102 -0
  19. package/dist/context/sources/index.d.ts +2 -0
  20. package/dist/context/sources/index.js +2 -0
  21. package/dist/context/sources/requirements-source.d.ts +17 -0
  22. package/dist/context/sources/requirements-source.js +119 -0
  23. package/dist/coordination/task-executor.js +7 -1
  24. package/dist/coordination/yaml-pipeline-loader.d.ts +32 -0
  25. package/dist/coordination/yaml-pipeline-loader.js +389 -0
  26. package/dist/coordination/yaml-pipeline-registry.d.ts +61 -0
  27. package/dist/coordination/yaml-pipeline-registry.js +143 -0
  28. package/dist/mcp/bundle.js +8670 -1244
  29. package/dist/mcp/entry.js +21 -0
  30. package/dist/mcp/handlers/domain-handler-configs.js +11 -0
  31. package/dist/mcp/handlers/index.d.ts +2 -0
  32. package/dist/mcp/handlers/index.js +4 -0
  33. package/dist/mcp/handlers/pipeline-handlers.d.ts +75 -0
  34. package/dist/mcp/handlers/pipeline-handlers.js +208 -0
  35. package/dist/mcp/handlers/validation-pipeline-handler.d.ts +53 -0
  36. package/dist/mcp/handlers/validation-pipeline-handler.js +118 -0
  37. package/dist/mcp/protocol-server.js +167 -1
  38. package/dist/mcp/server.js +75 -1
  39. package/dist/workers/daemon.js +3 -2
  40. package/dist/workers/index.d.ts +6 -0
  41. package/dist/workers/index.js +6 -0
  42. package/dist/workers/workers/heartbeat-scheduler.d.ts +45 -0
  43. package/dist/workers/workers/heartbeat-scheduler.js +312 -0
  44. package/dist/workers/workers/index.d.ts +2 -1
  45. package/dist/workers/workers/index.js +2 -1
  46. package/package.json +1 -1
@@ -0,0 +1,544 @@
1
+ # =============================================================================
2
+ # AQE Skill Evaluation Test Suite: Validation Pipeline v1.0.0
3
+ # =============================================================================
4
+ #
5
+ # Comprehensive evaluation suite for the validation-pipeline skill.
6
+ # Tests structured step-by-step validation with gate enforcement,
7
+ # per-step scoring, and report generation.
8
+ #
9
+ # Schema: .claude/skills/.validation/schemas/skill-eval.schema.json
10
+ # Validator: .claude/skills/validation-pipeline/scripts/validate-config.json
11
+ #
12
+ # Coverage:
13
+ # - Full requirements pipeline execution (13 steps)
14
+ # - Blocking gate enforcement (halt on failure)
15
+ # - Continue-on-failure mode
16
+ # - Step filtering (--steps)
17
+ # - Per-step scoring and weighted rollup
18
+ # - Report generation (markdown and JSON)
19
+ #
20
+ # =============================================================================
21
+
22
+ skill: validation-pipeline
23
+ version: 1.0.0
24
+ description: >
25
+ Evaluation suite for the validation-pipeline skill. Tests full pipeline
26
+ execution, gate enforcement, step filtering, scoring, and reporting
27
+ across the 13-step requirements validation pipeline.
28
+
29
+ # =============================================================================
30
+ # Multi-Model Configuration
31
+ # =============================================================================
32
+
33
+ models_to_test:
34
+ - claude-3.5-sonnet
35
+ - claude-3-haiku
36
+
37
+ # =============================================================================
38
+ # MCP Integration Configuration
39
+ # =============================================================================
40
+
41
+ mcp_integration:
42
+ enabled: true
43
+ namespace: skill-validation
44
+ query_patterns: true
45
+ track_outcomes: true
46
+ store_patterns: true
47
+ share_learning: true
48
+ update_quality_gate: true
49
+ target_agents:
50
+ - qe-requirements-validator
51
+ - qe-quality-gate
52
+
53
+ # =============================================================================
54
+ # Learning Configuration
55
+ # =============================================================================
56
+
57
+ learning:
58
+ store_success_patterns: true
59
+ store_failure_patterns: true
60
+ pattern_ttl_days: 90
61
+ min_confidence_to_store: 0.7
62
+ cross_model_comparison: true
63
+
64
+ # =============================================================================
65
+ # Result Format
66
+ # =============================================================================
67
+
68
+ result_format:
69
+ json_output: true
70
+ markdown_report: true
71
+ include_raw_output: false
72
+ include_timing: true
73
+ include_token_usage: true
74
+
75
+ # =============================================================================
76
+ # Setup
77
+ # =============================================================================
78
+
79
+ setup:
80
+ required_tools: []
81
+ environment_variables: {}
82
+ fixtures: []
83
+
84
+ # =============================================================================
85
+ # TEST CASES
86
+ # =============================================================================
87
+
88
+ test_cases:
89
+ # ---------------------------------------------------------------------------
90
+ # CATEGORY: Full Pipeline Execution
91
+ # ---------------------------------------------------------------------------
92
+
93
+ - id: tc001_full_requirements_pipeline
94
+ description: "Execute all 13 requirements validation steps on a well-formed document"
95
+ category: pipeline_execution
96
+ priority: critical
97
+
98
+ input:
99
+ prompt: |
100
+ Run the validation pipeline on this requirements document:
101
+
102
+ # User Management Requirements
103
+
104
+ ## Overview
105
+ This document specifies the requirements for the user management module.
106
+
107
+ ## Requirements
108
+
109
+ ### REQ-001: User Registration
110
+ As a visitor, I want to register an account so I can access the platform.
111
+
112
+ **Acceptance Criteria:**
113
+ - User can register with email and password (8+ chars, 1 uppercase, 1 number)
114
+ - Duplicate email rejected with clear error message
115
+ - Verification email sent within 30 seconds
116
+ - Registration completes in under 2 seconds
117
+
118
+ ### REQ-002: User Login
119
+ As a registered user, I want to log in so I can access my account.
120
+
121
+ **Acceptance Criteria:**
122
+ - User can log in with email and password
123
+ - Invalid credentials show generic error (no email enumeration)
124
+ - Account locks after 5 failed attempts for 15 minutes
125
+ - Session expires after 30 minutes of inactivity
126
+
127
+ ## Scope
128
+ In scope: registration, login, password reset.
129
+ Out of scope: social login, SSO.
130
+
131
+ context:
132
+ pipeline: "requirements"
133
+
134
+ expected_output:
135
+ must_contain:
136
+ - "pipeline"
137
+ - "validation"
138
+ - "score"
139
+ - "step"
140
+ - "finding"
141
+ must_not_contain:
142
+ - "error"
143
+ - "unable"
144
+ severity_classification: critical
145
+
146
+ validation:
147
+ schema_check: true
148
+ keyword_match_threshold: 0.8
149
+ reasoning_quality_min: 0.75
150
+
151
+ - id: tc002_blocking_gate_enforcement
152
+ description: "Pipeline halts at blocking step failure"
153
+ category: gate_enforcement
154
+ priority: critical
155
+
156
+ input:
157
+ prompt: |
158
+ Run the validation pipeline on this minimal document:
159
+
160
+ just some text without any structure
161
+
162
+ The pipeline should halt at the format-check step (blocking)
163
+ because there are no headings, required sections, or structure.
164
+ context:
165
+ pipeline: "requirements"
166
+
167
+ expected_output:
168
+ must_contain:
169
+ - "halt"
170
+ - "block"
171
+ - "format"
172
+ - "fail"
173
+ severity_classification: critical
174
+ finding_count:
175
+ min: 1
176
+
177
+ validation:
178
+ schema_check: true
179
+ keyword_match_threshold: 0.7
180
+
181
+ - id: tc003_continue_on_failure
182
+ description: "Pipeline continues past blocking failures with --continue-on-failure"
183
+ category: gate_enforcement
184
+ priority: high
185
+
186
+ input:
187
+ prompt: |
188
+ Run the validation pipeline with --continue-on-failure on this document:
189
+
190
+ just some text without any structure
191
+
192
+ Even though format-check will fail (blocking), the pipeline should
193
+ continue executing remaining steps and report all findings.
194
+ context:
195
+ pipeline: "requirements"
196
+ continue_on_failure: true
197
+
198
+ expected_output:
199
+ must_contain:
200
+ - "continue"
201
+ - "format"
202
+ - "step"
203
+ - "finding"
204
+ finding_count:
205
+ min: 2
206
+
207
+ validation:
208
+ schema_check: true
209
+ keyword_match_threshold: 0.7
210
+
211
+ # ---------------------------------------------------------------------------
212
+ # CATEGORY: Step Filtering
213
+ # ---------------------------------------------------------------------------
214
+
215
+ - id: tc004_step_filtering
216
+ description: "Run only specific steps from the pipeline"
217
+ category: step_filtering
218
+ priority: high
219
+
220
+ input:
221
+ prompt: |
222
+ Run only the format-check and vague-term-detection steps on:
223
+
224
+ # Requirements Document
225
+
226
+ ## Overview
227
+ The system should handle various types of user input properly.
228
+
229
+ ## Requirements
230
+ Users might want to do several things with the system.
231
+
232
+ ## Scope
233
+ The system should support etc.
234
+
235
+ context:
236
+ pipeline: "requirements"
237
+ steps: ["format-check", "vague-term-detection"]
238
+
239
+ expected_output:
240
+ must_contain:
241
+ - "format"
242
+ - "vague"
243
+ - "should"
244
+ - "step"
245
+ finding_count:
246
+ min: 1
247
+
248
+ validation:
249
+ schema_check: true
250
+ keyword_match_threshold: 0.75
251
+
252
+ # ---------------------------------------------------------------------------
253
+ # CATEGORY: Scoring and Reporting
254
+ # ---------------------------------------------------------------------------
255
+
256
+ - id: tc005_weighted_score_calculation
257
+ description: "Overall score uses category-weighted averages"
258
+ category: scoring
259
+ priority: high
260
+
261
+ input:
262
+ prompt: |
263
+ Run the full requirements pipeline and verify the scoring breakdown:
264
+
265
+ # API Gateway Requirements
266
+
267
+ ## Overview
268
+ Requirements for the API gateway service.
269
+
270
+ ## Requirements
271
+
272
+ ### REQ-001: Rate Limiting
273
+ Limit API requests to 1000/minute per client.
274
+
275
+ **Acceptance Criteria:**
276
+ - Requests beyond limit return 429 status
277
+ - Rate limit headers included in all responses
278
+ - Configurable per-endpoint limits
279
+
280
+ ## Scope
281
+ Rate limiting, authentication proxy, request routing.
282
+
283
+ Show the per-step scores and how the overall weighted score is calculated.
284
+ context:
285
+ pipeline: "requirements"
286
+
287
+ expected_output:
288
+ must_contain:
289
+ - "score"
290
+ - "weight"
291
+ - "step"
292
+ - "overall"
293
+
294
+ validation:
295
+ schema_check: true
296
+ keyword_match_threshold: 0.75
297
+
298
+ - id: tc006_markdown_report_format
299
+ description: "Pipeline produces well-formatted markdown report"
300
+ category: reporting
301
+ priority: high
302
+
303
+ input:
304
+ prompt: |
305
+ Run the requirements pipeline and output a markdown report:
306
+
307
+ # Payment Processing
308
+
309
+ ## Overview
310
+ Handle payment transactions securely.
311
+
312
+ ## Requirements
313
+
314
+ ### REQ-001: Card Payment
315
+ Process credit/debit card payments via Stripe.
316
+
317
+ **Acceptance Criteria:**
318
+ - Support Visa, Mastercard, Amex
319
+ - PCI DSS compliant tokenization
320
+ - Transaction completes in under 5 seconds
321
+
322
+ ## Scope
323
+ Card payments only. Wire transfers out of scope.
324
+
325
+ context:
326
+ pipeline: "requirements"
327
+ format: "markdown"
328
+
329
+ expected_output:
330
+ must_contain:
331
+ - "Validation Report"
332
+ - "Step Results"
333
+ - "Score"
334
+ - "Duration"
335
+ - "Finding"
336
+
337
+ validation:
338
+ schema_check: true
339
+ keyword_match_threshold: 0.8
340
+
341
+ # ---------------------------------------------------------------------------
342
+ # CATEGORY: Individual Step Validation
343
+ # ---------------------------------------------------------------------------
344
+
345
+ - id: tc007_invest_criteria_check
346
+ description: "INVEST criteria step evaluates requirement quality"
347
+ category: individual_steps
348
+ priority: high
349
+
350
+ input:
351
+ prompt: |
352
+ Run only the invest-criteria step on:
353
+
354
+ # Requirements
355
+
356
+ ## Overview
357
+ System requirements for user management.
358
+
359
+ ## Requirements
360
+
361
+ ### REQ-001: User CRUD
362
+ The system shall provide full CRUD operations for user entities including
363
+ create, read, update, and delete with proper authorization checks,
364
+ audit logging, soft delete support, batch operations, import/export,
365
+ admin override, and integration with 5 external systems.
366
+
367
+ ## Scope
368
+ User management module.
369
+
370
+ Check: is REQ-001 Independent, Negotiable, Valuable, Estimable, Small, Testable?
371
+ context:
372
+ pipeline: "requirements"
373
+ steps: ["invest-criteria"]
374
+
375
+ expected_output:
376
+ must_contain:
377
+ - "INVEST"
378
+ - "small"
379
+ - "testable"
380
+ - "independent"
381
+ finding_count:
382
+ min: 1
383
+
384
+ validation:
385
+ schema_check: true
386
+ keyword_match_threshold: 0.7
387
+
388
+ - id: tc008_vague_term_detection
389
+ description: "Detects vague and ambiguous terms"
390
+ category: individual_steps
391
+ priority: high
392
+
393
+ input:
394
+ prompt: |
395
+ Run only the vague-term-detection step on:
396
+
397
+ # Requirements
398
+
399
+ ## Overview
400
+ The system should handle various scenarios properly.
401
+
402
+ ## Requirements
403
+ The platform might need to support several user types etc.
404
+ Performance should be adequate for most use cases.
405
+ The UI should be user-friendly and intuitive.
406
+
407
+ ## Scope
408
+ Various features and improvements.
409
+
410
+ context:
411
+ pipeline: "requirements"
412
+ steps: ["vague-term-detection"]
413
+
414
+ expected_output:
415
+ must_contain:
416
+ - "vague"
417
+ - "should"
418
+ - "various"
419
+ - "etc"
420
+ finding_count:
421
+ min: 3
422
+
423
+ validation:
424
+ schema_check: true
425
+ keyword_match_threshold: 0.75
426
+
427
+ # ---------------------------------------------------------------------------
428
+ # CATEGORY: Negative / Edge Cases
429
+ # ---------------------------------------------------------------------------
430
+
431
+ - id: tc009_empty_document
432
+ description: "Pipeline handles empty document gracefully"
433
+ category: negative
434
+ priority: high
435
+
436
+ input:
437
+ prompt: |
438
+ Run the requirements pipeline on an empty document (no content).
439
+ context:
440
+ pipeline: "requirements"
441
+ content: ""
442
+
443
+ expected_output:
444
+ must_contain:
445
+ - "fail"
446
+ - "empty"
447
+ - "format"
448
+ severity_classification: critical
449
+ finding_count:
450
+ min: 1
451
+
452
+ validation:
453
+ schema_check: true
454
+ allow_partial: true
455
+
456
+ - id: tc010_high_quality_document
457
+ description: "High-quality document scores well across all steps"
458
+ category: positive
459
+ priority: high
460
+
461
+ input:
462
+ prompt: |
463
+ Run the full pipeline on this well-structured requirements document:
464
+
465
+ # Authentication Service Requirements v2.1
466
+
467
+ ## Overview
468
+ This document specifies authentication requirements for the platform.
469
+ All requirements have been reviewed by the security team and product owner.
470
+
471
+ ## Requirements
472
+
473
+ ### REQ-001: OAuth2 Login
474
+ As a user, I want to authenticate via OAuth2 providers so I can use
475
+ existing credentials without creating a new password.
476
+
477
+ **Acceptance Criteria:**
478
+ - Given a user clicks "Sign in with Google", when they authorize,
479
+ then they are redirected to the dashboard within 3 seconds
480
+ - Given an invalid OAuth token, when login is attempted,
481
+ then the system returns a 401 with descriptive error
482
+ - Given a new OAuth user, when they first authenticate,
483
+ then a local account is created automatically
484
+
485
+ **Tests:** TC-001, TC-002, TC-003
486
+ **Dependencies:** REQ-005 (Session Management)
487
+
488
+ ### REQ-002: Password Requirements
489
+ As a security administrator, I want password complexity rules enforced
490
+ so that user accounts are protected against brute force attacks.
491
+
492
+ **Acceptance Criteria:**
493
+ - Minimum 12 characters, 1 uppercase, 1 lowercase, 1 number, 1 symbol
494
+ - Password strength meter shows real-time feedback
495
+ - Common passwords (top 10,000) are rejected with suggestion
496
+ - Password history prevents reuse of last 5 passwords
497
+
498
+ **Tests:** TC-010, TC-011, TC-012, TC-013
499
+ **Dependencies:** None
500
+
501
+ ## Scope
502
+ In scope: OAuth2, password management, MFA.
503
+ Out of scope: Biometric authentication, hardware keys.
504
+
505
+ context:
506
+ pipeline: "requirements"
507
+
508
+ expected_output:
509
+ must_contain:
510
+ - "pass"
511
+ - "score"
512
+ - "step"
513
+ - "validation"
514
+ severity_classification: high
515
+
516
+ validation:
517
+ schema_check: true
518
+ keyword_match_threshold: 0.8
519
+ reasoning_quality_min: 0.8
520
+
521
+ # =============================================================================
522
+ # SUCCESS CRITERIA
523
+ # =============================================================================
524
+
525
+ success_criteria:
526
+ pass_rate: 0.8
527
+ critical_pass_rate: 1.0
528
+ avg_reasoning_quality: 0.75
529
+ max_execution_time_ms: 300000
530
+ cross_model_variance: 0.15
531
+
532
+ # =============================================================================
533
+ # METADATA
534
+ # =============================================================================
535
+
536
+ metadata:
537
+ author: "qe-requirements-validator"
538
+ created: "2026-03-12"
539
+ last_updated: "2026-03-12"
540
+ coverage_target: >
541
+ Full 13-step requirements pipeline execution, blocking gate enforcement,
542
+ continue-on-failure mode, step filtering, weighted score calculation,
543
+ markdown report generation, INVEST criteria validation, vague term detection,
544
+ empty document handling, and high-quality document scoring.