agentic-qe 3.6.9 → 3.6.11

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (117)
  1. package/.claude/skills/.validation/schemas/skill-eval.schema.json +11 -1
  2. package/.claude/skills/pr-review/SKILL.md +2 -2
  3. package/.claude/skills/qcsd-production-swarm/SKILL.md +2781 -0
  4. package/.claude/skills/qcsd-production-swarm/evals/qcsd-production-swarm.yaml +246 -0
  5. package/.claude/skills/qcsd-production-swarm/schemas/output.json +505 -0
  6. package/.claude/skills/qcsd-production-swarm/scripts/validate-config.json +25 -0
  7. package/.claude/skills/skills-manifest.json +5 -5
  8. package/package.json +1 -1
  9. package/scripts/benchmark-hnsw-loading.ts +480 -0
  10. package/scripts/benchmark-kg-assisted.ts +725 -0
  11. package/scripts/collect-production-telemetry.sh +291 -0
  12. package/scripts/detect-skill-conflicts.ts +347 -0
  13. package/scripts/eval-driven-workflow.ts +704 -0
  14. package/scripts/run-skill-eval.ts +210 -10
  15. package/scripts/score-skill-quality.ts +511 -0
  16. package/v3/CHANGELOG.md +44 -0
  17. package/v3/assets/skills/pr-review/SKILL.md +2 -2
  18. package/v3/dist/cli/bundle.js +1526 -700
  19. package/v3/dist/cli/commands/code.d.ts.map +1 -1
  20. package/v3/dist/cli/commands/code.js +9 -85
  21. package/v3/dist/cli/commands/code.js.map +1 -1
  22. package/v3/dist/cli/commands/coverage.d.ts.map +1 -1
  23. package/v3/dist/cli/commands/coverage.js +3 -28
  24. package/v3/dist/cli/commands/coverage.js.map +1 -1
  25. package/v3/dist/cli/commands/hooks.d.ts.map +1 -1
  26. package/v3/dist/cli/commands/hooks.js +143 -2
  27. package/v3/dist/cli/commands/hooks.js.map +1 -1
  28. package/v3/dist/cli/commands/security.d.ts.map +1 -1
  29. package/v3/dist/cli/commands/security.js +3 -29
  30. package/v3/dist/cli/commands/security.js.map +1 -1
  31. package/v3/dist/cli/commands/test.d.ts.map +1 -1
  32. package/v3/dist/cli/commands/test.js +11 -58
  33. package/v3/dist/cli/commands/test.js.map +1 -1
  34. package/v3/dist/cli/utils/file-discovery.d.ts +27 -0
  35. package/v3/dist/cli/utils/file-discovery.d.ts.map +1 -0
  36. package/v3/dist/cli/utils/file-discovery.js +105 -0
  37. package/v3/dist/cli/utils/file-discovery.js.map +1 -0
  38. package/v3/dist/coordination/task-executor.d.ts.map +1 -1
  39. package/v3/dist/coordination/task-executor.js +304 -44
  40. package/v3/dist/coordination/task-executor.js.map +1 -1
  41. package/v3/dist/domains/code-intelligence/coordinator.d.ts.map +1 -1
  42. package/v3/dist/domains/code-intelligence/coordinator.js +8 -1
  43. package/v3/dist/domains/code-intelligence/coordinator.js.map +1 -1
  44. package/v3/dist/domains/code-intelligence/services/metric-collector/index.d.ts.map +1 -1
  45. package/v3/dist/domains/code-intelligence/services/metric-collector/index.js +10 -0
  46. package/v3/dist/domains/code-intelligence/services/metric-collector/index.js.map +1 -1
  47. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.d.ts +7 -1
  48. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.d.ts.map +1 -1
  49. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.js +10 -1
  50. package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.js.map +1 -1
  51. package/v3/dist/domains/code-intelligence/services/metric-collector/loc-counter.js +34 -10
  52. package/v3/dist/domains/code-intelligence/services/metric-collector/loc-counter.js.map +1 -1
  53. package/v3/dist/domains/coverage-analysis/services/hnsw-index.d.ts +9 -0
  54. package/v3/dist/domains/coverage-analysis/services/hnsw-index.d.ts.map +1 -1
  55. package/v3/dist/domains/coverage-analysis/services/hnsw-index.js +38 -3
  56. package/v3/dist/domains/coverage-analysis/services/hnsw-index.js.map +1 -1
  57. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.d.ts.map +1 -1
  58. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js +58 -6
  59. package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js.map +1 -1
  60. package/v3/dist/domains/test-generation/generators/mocha-generator.d.ts.map +1 -1
  61. package/v3/dist/domains/test-generation/generators/mocha-generator.js +79 -7
  62. package/v3/dist/domains/test-generation/generators/mocha-generator.js.map +1 -1
  63. package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts +4 -0
  64. package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts.map +1 -1
  65. package/v3/dist/domains/test-generation/generators/pytest-generator.js +77 -10
  66. package/v3/dist/domains/test-generation/generators/pytest-generator.js.map +1 -1
  67. package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts +21 -0
  68. package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts.map +1 -1
  69. package/v3/dist/domains/test-generation/interfaces.d.ts +21 -0
  70. package/v3/dist/domains/test-generation/interfaces.d.ts.map +1 -1
  71. package/v3/dist/domains/test-generation/services/test-generator.d.ts +22 -0
  72. package/v3/dist/domains/test-generation/services/test-generator.d.ts.map +1 -1
  73. package/v3/dist/domains/test-generation/services/test-generator.js +163 -3
  74. package/v3/dist/domains/test-generation/services/test-generator.js.map +1 -1
  75. package/v3/dist/init/init-wizard-hooks.d.ts +8 -1
  76. package/v3/dist/init/init-wizard-hooks.d.ts.map +1 -1
  77. package/v3/dist/init/init-wizard-hooks.js +47 -39
  78. package/v3/dist/init/init-wizard-hooks.js.map +1 -1
  79. package/v3/dist/init/phases/07-hooks.d.ts +11 -1
  80. package/v3/dist/init/phases/07-hooks.d.ts.map +1 -1
  81. package/v3/dist/init/phases/07-hooks.js +46 -50
  82. package/v3/dist/init/phases/07-hooks.js.map +1 -1
  83. package/v3/dist/init/settings-merge.d.ts +35 -0
  84. package/v3/dist/init/settings-merge.d.ts.map +1 -0
  85. package/v3/dist/init/settings-merge.js +140 -0
  86. package/v3/dist/init/settings-merge.js.map +1 -0
  87. package/v3/dist/integrations/agentic-flow/model-router/router.js +1 -1
  88. package/v3/dist/integrations/agentic-flow/model-router/router.js.map +1 -1
  89. package/v3/dist/integrations/agentic-flow/model-router/score-calculator.d.ts.map +1 -1
  90. package/v3/dist/integrations/agentic-flow/model-router/score-calculator.js +18 -3
  91. package/v3/dist/integrations/agentic-flow/model-router/score-calculator.js.map +1 -1
  92. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.d.ts +3 -3
  93. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.d.ts.map +1 -1
  94. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.js +18 -0
  95. package/v3/dist/integrations/agentic-flow/model-router/signal-collector.js.map +1 -1
  96. package/v3/dist/kernel/unified-memory-hnsw.d.ts +29 -0
  97. package/v3/dist/kernel/unified-memory-hnsw.d.ts.map +1 -1
  98. package/v3/dist/kernel/unified-memory-hnsw.js +136 -0
  99. package/v3/dist/kernel/unified-memory-hnsw.js.map +1 -1
  100. package/v3/dist/kernel/unified-memory.d.ts +2 -2
  101. package/v3/dist/kernel/unified-memory.d.ts.map +1 -1
  102. package/v3/dist/kernel/unified-memory.js +7 -9
  103. package/v3/dist/kernel/unified-memory.js.map +1 -1
  104. package/v3/dist/learning/qe-hooks.d.ts.map +1 -1
  105. package/v3/dist/learning/qe-hooks.js +34 -3
  106. package/v3/dist/learning/qe-hooks.js.map +1 -1
  107. package/v3/dist/mcp/bundle.js +1403 -425
  108. package/v3/dist/mcp/handlers/domain-handler-configs.d.ts.map +1 -1
  109. package/v3/dist/mcp/handlers/domain-handler-configs.js +40 -31
  110. package/v3/dist/mcp/handlers/domain-handler-configs.js.map +1 -1
  111. package/v3/dist/mcp/handlers/task-handlers.d.ts.map +1 -1
  112. package/v3/dist/mcp/handlers/task-handlers.js +68 -5
  113. package/v3/dist/mcp/handlers/task-handlers.js.map +1 -1
  114. package/v3/dist/mcp/protocol-server.d.ts.map +1 -1
  115. package/v3/dist/mcp/protocol-server.js +16 -2
  116. package/v3/dist/mcp/protocol-server.js.map +1 -1
  117. package/v3/package.json +1 -1
@@ -42,6 +42,7 @@ interface TestCase {
42
42
  priority: 'critical' | 'high' | 'medium' | 'low';
43
43
  skip?: boolean;
44
44
  skip_reason?: string;
45
+ negative_control?: boolean;
45
46
  input: TestInput;
46
47
  expected_output: ExpectedOutput;
47
48
  validation?: ValidationConfig;
@@ -84,6 +85,7 @@ interface ValidationConfig {
84
85
  reasoning_quality_min?: number;
85
86
  semantic_similarity_min?: number;
86
87
  allow_partial?: boolean;
88
+ adaptive_rubric?: boolean;
87
89
  grading_rubric?: {
88
90
  completeness?: number;
89
91
  accuracy?: number;
@@ -165,6 +167,16 @@ interface TestCaseResult {
165
167
  regex_misses: string[];
166
168
  severity_matched: boolean;
167
169
  finding_count_matched: boolean;
170
+ negative_control_passed?: boolean;
171
+ finding_count_actual?: number;
172
+ schema_validation_passed?: boolean;
173
+ rubric_breakdown?: {
174
+ completeness: number;
175
+ accuracy: number;
176
+ actionability: number;
177
+ weighted_score: number;
178
+ };
179
+ adaptive_keywords_extracted?: string[];
168
180
  };
169
181
  raw_output?: string;
170
182
  error?: string;
@@ -568,7 +580,7 @@ class SkillEvaluationRunner {
568
580
  const executionTime = Date.now() - startTime;
569
581
 
570
582
  // Validate output against expectations
571
- const validation = this.validateOutput(output, testCase.expected_output, testCase.validation);
583
+ const validation = this.validateOutput(output, testCase.expected_output, testCase.validation, testCase);
572
584
 
573
585
  return {
574
586
  id: testCase.id,
@@ -650,12 +662,19 @@ class SkillEvaluationRunner {
650
662
  }
651
663
 
652
664
  /**
653
- * Validate LLM output against expected output criteria
665
+ * Validate LLM output against expected output criteria.
666
+ *
667
+ * Implements:
668
+ * - P1-6: Negative control (inverted pass logic for "should decline" tests)
669
+ * - P1-5a: Finding count enforcement + schema_path validation
670
+ * - P1-5b: Grading rubric weighted scoring (completeness/accuracy/actionability)
671
+ * - P1-4: Adaptive rubric (dynamic keyword extraction from prompt)
654
672
  */
655
673
  private validateOutput(
656
674
  output: string,
657
675
  expected: ExpectedOutput,
658
- config?: ValidationConfig
676
+ config?: ValidationConfig,
677
+ testCase?: TestCase
659
678
  ): {
660
679
  passed: boolean;
661
680
  keywordMatchScore: number;
@@ -664,12 +683,39 @@ class SkillEvaluationRunner {
664
683
  } {
665
684
  const outputLower = output.toLowerCase();
666
685
  const threshold = config?.keyword_match_threshold ?? 0.8;
686
+ const isNegativeControl = testCase?.negative_control ?? false;
687
+
688
+ // --- P1-4: Adaptive Rubric — extract additional keywords from prompt ---
689
+ const adaptiveKeywords: string[] = [];
690
+ let effectiveMustContain = expected.must_contain || [];
691
+
692
+ if (config?.adaptive_rubric && testCase?.input?.prompt) {
693
+ const prompt = testCase.input.prompt;
694
+ // Extract quoted strings
695
+ const quoted = prompt.match(/"([^"]+)"/g)?.map((s) => s.replace(/"/g, '')) || [];
696
+ // Extract format keywords
697
+ const formatWords = prompt.match(/\b(JSON|YAML|markdown|XML|CSV|HTML|SQL)\b/gi) || [];
698
+ // Extract named standards
699
+ const standards =
700
+ prompt.match(/\b(OWASP|WCAG|PCI[-\s]DSS|HIPAA|SOC2|GDPR|ISO\s?\d+)\b/gi) || [];
701
+ // Extract meaningful numbers (>1 digit or >5)
702
+ const numbers = (prompt.match(/\b\d+(\.\d+)?\b/g) || []).filter(
703
+ (n) => n.length > 1 || parseInt(n) > 5
704
+ );
705
+
706
+ adaptiveKeywords.push(...quoted, ...formatWords, ...standards, ...numbers);
707
+
708
+ // Merge with existing must_contain (deduplicate)
709
+ const existingLower = new Set(effectiveMustContain.map((k) => k.toLowerCase()));
710
+ const newKeywords = adaptiveKeywords.filter((k) => !existingLower.has(k.toLowerCase()));
711
+ effectiveMustContain = [...effectiveMustContain, ...newKeywords];
712
+ }
667
713
 
668
714
  // Check must_contain
669
715
  const mustContainMatches: string[] = [];
670
716
  const mustContainMisses: string[] = [];
671
717
 
672
- for (const keyword of expected.must_contain || []) {
718
+ for (const keyword of effectiveMustContain) {
673
719
  if (outputLower.includes(keyword.toLowerCase())) {
674
720
  mustContainMatches.push(keyword);
675
721
  } else {
@@ -702,7 +748,7 @@ class SkillEvaluationRunner {
702
748
  }
703
749
 
704
750
  // Calculate keyword match score
705
- const totalKeywords = (expected.must_contain?.length || 0) + (expected.must_match_regex?.length || 0);
751
+ const totalKeywords = effectiveMustContain.length + (expected.must_match_regex?.length || 0);
706
752
  const matchedKeywords = mustContainMatches.length + regexMatches.length;
707
753
  const keywordMatchScore = totalKeywords > 0 ? matchedKeywords / totalKeywords : 1;
708
754
 
@@ -711,16 +757,165 @@ class SkillEvaluationRunner {
711
757
  !expected.severity_classification ||
712
758
  outputLower.includes(expected.severity_classification.toLowerCase());
713
759
 
714
- // Check finding count (mock implementation)
715
- const findingCountMatched = true; // Would parse JSON output in production
760
+ // --- P1-5a: Finding count enforcement ---
761
+ let findingCountMatched = true;
762
+ let findingCountActual: number | undefined;
763
+
764
+ if (expected.finding_count) {
765
+ // Try JSON parsing first
766
+ try {
767
+ const parsed = JSON.parse(output);
768
+ if (Array.isArray(parsed?.findings)) {
769
+ findingCountActual = parsed.findings.length;
770
+ }
771
+ } catch {
772
+ // Fallback: count severity-keyword occurrences or numbered findings
773
+ const severityPattern =
774
+ /\b(critical|high|medium|low|info)\b.*?(vulnerability|finding|issue)/gi;
775
+ const numberedPattern = /^\s*\d+[.)]\s+/gm;
776
+ const severityCount = (output.match(severityPattern) || []).length;
777
+ const numberedCount = (output.match(numberedPattern) || []).length;
778
+ findingCountActual = Math.max(severityCount, numberedCount);
779
+ }
780
+
781
+ if (findingCountActual !== undefined) {
782
+ if (
783
+ expected.finding_count.min !== undefined &&
784
+ findingCountActual < expected.finding_count.min
785
+ ) {
786
+ findingCountMatched = false;
787
+ }
788
+ if (
789
+ expected.finding_count.max !== undefined &&
790
+ findingCountActual > expected.finding_count.max
791
+ ) {
792
+ findingCountMatched = false;
793
+ }
794
+ }
795
+ }
796
+
797
+ // --- P1-5a: Schema validation ---
798
+ let schemaValidationPassed: boolean | undefined;
799
+ if (expected.schema_path) {
800
+ try {
801
+ const schemaContent = readFileSync(expected.schema_path, 'utf-8');
802
+ const schema = JSON.parse(schemaContent);
803
+ const parsed = JSON.parse(output);
804
+ // Structural check: verify all required fields exist
805
+ if (schema.required && Array.isArray(schema.required)) {
806
+ const missingFields = schema.required.filter(
807
+ (field: string) => !(field in parsed)
808
+ );
809
+ schemaValidationPassed = missingFields.length === 0;
810
+ } else {
811
+ schemaValidationPassed = true;
812
+ }
813
+ } catch {
814
+ schemaValidationPassed = false;
815
+ }
816
+ }
817
+
818
+ // --- P1-5b: Grading Rubric scoring ---
819
+ let rubricBreakdown:
820
+ | {
821
+ completeness: number;
822
+ accuracy: number;
823
+ actionability: number;
824
+ weighted_score: number;
825
+ }
826
+ | undefined;
827
+
828
+ if (config?.grading_rubric) {
829
+ const weights = config.grading_rubric;
830
+ const totalWeight =
831
+ (weights.completeness ?? 0) + (weights.accuracy ?? 0) + (weights.actionability ?? 0);
832
+
833
+ // Completeness = fraction of must_contain matched
834
+ const completenessScore = totalKeywords > 0 ? matchedKeywords / totalKeywords : 1;
835
+
836
+ // Accuracy = 1 - (violations / total_must_not_contain)
837
+ const totalMustNotContain = expected.must_not_contain?.length || 0;
838
+ const accuracyScore =
839
+ totalMustNotContain > 0 ? 1 - violations.length / totalMustNotContain : 1;
840
+
841
+ // Actionability = heuristic: code blocks, numbered steps, specific recommendations
842
+ let actionabilityScore = 0;
843
+ const hasCodeBlocks = /```[\s\S]*?```/.test(output) || /`[^`]+`/.test(output);
844
+ const hasNumberedSteps = /^\s*\d+[.)]\s+/m.test(output);
845
+ const hasRecommendations =
846
+ /\b(recommend|suggest|should|must|consider|implement|use|apply|ensure)\b/i.test(output);
847
+ const hasSpecificActions =
848
+ /\b(install|configure|update|replace|add|remove|change|set|enable|disable)\b/i.test(
849
+ output
850
+ );
851
+
852
+ if (hasCodeBlocks) actionabilityScore += 0.3;
853
+ if (hasNumberedSteps) actionabilityScore += 0.3;
854
+ if (hasRecommendations) actionabilityScore += 0.2;
855
+ if (hasSpecificActions) actionabilityScore += 0.2;
856
+ actionabilityScore = Math.min(1, actionabilityScore);
857
+
858
+ // Weighted sum
859
+ const weightedScore =
860
+ totalWeight > 0
861
+ ? ((weights.completeness ?? 0) * completenessScore +
862
+ (weights.accuracy ?? 0) * accuracyScore +
863
+ (weights.actionability ?? 0) * actionabilityScore) /
864
+ totalWeight
865
+ : completenessScore;
866
+
867
+ rubricBreakdown = {
868
+ completeness: completenessScore,
869
+ accuracy: accuracyScore,
870
+ actionability: actionabilityScore,
871
+ weighted_score: weightedScore,
872
+ };
873
+ }
874
+
875
+ // Calculate reasoning quality — use rubric if available, else simple heuristic
876
+ const reasoningQualityScore = rubricBreakdown
877
+ ? rubricBreakdown.weighted_score
878
+ : keywordMatchScore * 0.8 + (violations.length === 0 ? 0.2 : 0);
879
+
880
+ // --- P1-6: Negative Control logic ---
881
+ if (isNegativeControl) {
882
+ // For negative control: pass when must_contain items are ABSENT and no violations
883
+ const negativeControlPassed =
884
+ mustContainMatches.length === 0 && violations.length === 0;
885
+
886
+ return {
887
+ passed: negativeControlPassed,
888
+ keywordMatchScore,
889
+ reasoningQualityScore: negativeControlPassed ? 1.0 : 0.0,
890
+ details: {
891
+ must_contain_matches: mustContainMatches,
892
+ must_contain_misses: mustContainMisses,
893
+ must_not_contain_violations: violations,
894
+ regex_matches: regexMatches,
895
+ regex_misses: regexMisses,
896
+ severity_matched: severityMatched,
897
+ finding_count_matched: findingCountMatched,
898
+ negative_control_passed: negativeControlPassed,
899
+ finding_count_actual: findingCountActual,
900
+ schema_validation_passed: schemaValidationPassed,
901
+ rubric_breakdown: rubricBreakdown,
902
+ adaptive_keywords_extracted:
903
+ adaptiveKeywords.length > 0 ? adaptiveKeywords : undefined,
904
+ },
905
+ };
906
+ }
716
907
 
717
- // Calculate reasoning quality (simplified - would use embeddings in production)
718
- const reasoningQualityScore = keywordMatchScore * 0.8 + (violations.length === 0 ? 0.2 : 0);
908
+ // --- Standard pass/fail determination ---
909
+ const rubricGate = rubricBreakdown
910
+ ? rubricBreakdown.weighted_score >= threshold
911
+ : true;
719
912
 
720
- // Determine if test passed
721
913
  const passed =
722
914
  keywordMatchScore >= threshold &&
723
915
  violations.length === 0 &&
916
+ findingCountMatched &&
917
+ rubricGate &&
918
+ (schemaValidationPassed === undefined || schemaValidationPassed) &&
724
919
  (config?.reasoning_quality_min === undefined ||
725
920
  reasoningQualityScore >= config.reasoning_quality_min) &&
726
921
  (config?.allow_partial || mustContainMisses.length === 0);
@@ -737,6 +932,11 @@ class SkillEvaluationRunner {
737
932
  regex_misses: regexMisses,
738
933
  severity_matched: severityMatched,
739
934
  finding_count_matched: findingCountMatched,
935
+ finding_count_actual: findingCountActual,
936
+ schema_validation_passed: schemaValidationPassed,
937
+ rubric_breakdown: rubricBreakdown,
938
+ adaptive_keywords_extracted:
939
+ adaptiveKeywords.length > 0 ? adaptiveKeywords : undefined,
740
940
  },
741
941
  };
742
942
  }