agentic-qe 3.6.9 → 3.6.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/skills/.validation/schemas/skill-eval.schema.json +11 -1
- package/.claude/skills/pr-review/SKILL.md +2 -2
- package/.claude/skills/qcsd-production-swarm/SKILL.md +2781 -0
- package/.claude/skills/qcsd-production-swarm/evals/qcsd-production-swarm.yaml +246 -0
- package/.claude/skills/qcsd-production-swarm/schemas/output.json +505 -0
- package/.claude/skills/qcsd-production-swarm/scripts/validate-config.json +25 -0
- package/.claude/skills/skills-manifest.json +5 -5
- package/package.json +1 -1
- package/scripts/benchmark-hnsw-loading.ts +480 -0
- package/scripts/benchmark-kg-assisted.ts +725 -0
- package/scripts/collect-production-telemetry.sh +291 -0
- package/scripts/detect-skill-conflicts.ts +347 -0
- package/scripts/eval-driven-workflow.ts +704 -0
- package/scripts/run-skill-eval.ts +210 -10
- package/scripts/score-skill-quality.ts +511 -0
- package/v3/CHANGELOG.md +44 -0
- package/v3/assets/skills/pr-review/SKILL.md +2 -2
- package/v3/dist/cli/bundle.js +1526 -700
- package/v3/dist/cli/commands/code.d.ts.map +1 -1
- package/v3/dist/cli/commands/code.js +9 -85
- package/v3/dist/cli/commands/code.js.map +1 -1
- package/v3/dist/cli/commands/coverage.d.ts.map +1 -1
- package/v3/dist/cli/commands/coverage.js +3 -28
- package/v3/dist/cli/commands/coverage.js.map +1 -1
- package/v3/dist/cli/commands/hooks.d.ts.map +1 -1
- package/v3/dist/cli/commands/hooks.js +143 -2
- package/v3/dist/cli/commands/hooks.js.map +1 -1
- package/v3/dist/cli/commands/security.d.ts.map +1 -1
- package/v3/dist/cli/commands/security.js +3 -29
- package/v3/dist/cli/commands/security.js.map +1 -1
- package/v3/dist/cli/commands/test.d.ts.map +1 -1
- package/v3/dist/cli/commands/test.js +11 -58
- package/v3/dist/cli/commands/test.js.map +1 -1
- package/v3/dist/cli/utils/file-discovery.d.ts +27 -0
- package/v3/dist/cli/utils/file-discovery.d.ts.map +1 -0
- package/v3/dist/cli/utils/file-discovery.js +105 -0
- package/v3/dist/cli/utils/file-discovery.js.map +1 -0
- package/v3/dist/coordination/task-executor.d.ts.map +1 -1
- package/v3/dist/coordination/task-executor.js +304 -44
- package/v3/dist/coordination/task-executor.js.map +1 -1
- package/v3/dist/domains/code-intelligence/coordinator.d.ts.map +1 -1
- package/v3/dist/domains/code-intelligence/coordinator.js +8 -1
- package/v3/dist/domains/code-intelligence/coordinator.js.map +1 -1
- package/v3/dist/domains/code-intelligence/services/metric-collector/index.d.ts.map +1 -1
- package/v3/dist/domains/code-intelligence/services/metric-collector/index.js +10 -0
- package/v3/dist/domains/code-intelligence/services/metric-collector/index.js.map +1 -1
- package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.d.ts +7 -1
- package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.d.ts.map +1 -1
- package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.js +10 -1
- package/v3/dist/domains/code-intelligence/services/metric-collector/interfaces.js.map +1 -1
- package/v3/dist/domains/code-intelligence/services/metric-collector/loc-counter.js +34 -10
- package/v3/dist/domains/code-intelligence/services/metric-collector/loc-counter.js.map +1 -1
- package/v3/dist/domains/coverage-analysis/services/hnsw-index.d.ts +9 -0
- package/v3/dist/domains/coverage-analysis/services/hnsw-index.d.ts.map +1 -1
- package/v3/dist/domains/coverage-analysis/services/hnsw-index.js +38 -3
- package/v3/dist/domains/coverage-analysis/services/hnsw-index.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js +58 -6
- package/v3/dist/domains/test-generation/generators/jest-vitest-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/mocha-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/mocha-generator.js +79 -7
- package/v3/dist/domains/test-generation/generators/mocha-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts +4 -0
- package/v3/dist/domains/test-generation/generators/pytest-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/generators/pytest-generator.js +77 -10
- package/v3/dist/domains/test-generation/generators/pytest-generator.js.map +1 -1
- package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts +21 -0
- package/v3/dist/domains/test-generation/interfaces/test-generator.interface.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/interfaces.d.ts +21 -0
- package/v3/dist/domains/test-generation/interfaces.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/services/test-generator.d.ts +22 -0
- package/v3/dist/domains/test-generation/services/test-generator.d.ts.map +1 -1
- package/v3/dist/domains/test-generation/services/test-generator.js +163 -3
- package/v3/dist/domains/test-generation/services/test-generator.js.map +1 -1
- package/v3/dist/init/init-wizard-hooks.d.ts +8 -1
- package/v3/dist/init/init-wizard-hooks.d.ts.map +1 -1
- package/v3/dist/init/init-wizard-hooks.js +47 -39
- package/v3/dist/init/init-wizard-hooks.js.map +1 -1
- package/v3/dist/init/phases/07-hooks.d.ts +11 -1
- package/v3/dist/init/phases/07-hooks.d.ts.map +1 -1
- package/v3/dist/init/phases/07-hooks.js +46 -50
- package/v3/dist/init/phases/07-hooks.js.map +1 -1
- package/v3/dist/init/settings-merge.d.ts +35 -0
- package/v3/dist/init/settings-merge.d.ts.map +1 -0
- package/v3/dist/init/settings-merge.js +140 -0
- package/v3/dist/init/settings-merge.js.map +1 -0
- package/v3/dist/integrations/agentic-flow/model-router/router.js +1 -1
- package/v3/dist/integrations/agentic-flow/model-router/router.js.map +1 -1
- package/v3/dist/integrations/agentic-flow/model-router/score-calculator.d.ts.map +1 -1
- package/v3/dist/integrations/agentic-flow/model-router/score-calculator.js +18 -3
- package/v3/dist/integrations/agentic-flow/model-router/score-calculator.js.map +1 -1
- package/v3/dist/integrations/agentic-flow/model-router/signal-collector.d.ts +3 -3
- package/v3/dist/integrations/agentic-flow/model-router/signal-collector.d.ts.map +1 -1
- package/v3/dist/integrations/agentic-flow/model-router/signal-collector.js +18 -0
- package/v3/dist/integrations/agentic-flow/model-router/signal-collector.js.map +1 -1
- package/v3/dist/kernel/unified-memory-hnsw.d.ts +29 -0
- package/v3/dist/kernel/unified-memory-hnsw.d.ts.map +1 -1
- package/v3/dist/kernel/unified-memory-hnsw.js +136 -0
- package/v3/dist/kernel/unified-memory-hnsw.js.map +1 -1
- package/v3/dist/kernel/unified-memory.d.ts +2 -2
- package/v3/dist/kernel/unified-memory.d.ts.map +1 -1
- package/v3/dist/kernel/unified-memory.js +7 -9
- package/v3/dist/kernel/unified-memory.js.map +1 -1
- package/v3/dist/learning/qe-hooks.d.ts.map +1 -1
- package/v3/dist/learning/qe-hooks.js +34 -3
- package/v3/dist/learning/qe-hooks.js.map +1 -1
- package/v3/dist/mcp/bundle.js +1403 -425
- package/v3/dist/mcp/handlers/domain-handler-configs.d.ts.map +1 -1
- package/v3/dist/mcp/handlers/domain-handler-configs.js +40 -31
- package/v3/dist/mcp/handlers/domain-handler-configs.js.map +1 -1
- package/v3/dist/mcp/handlers/task-handlers.d.ts.map +1 -1
- package/v3/dist/mcp/handlers/task-handlers.js +68 -5
- package/v3/dist/mcp/handlers/task-handlers.js.map +1 -1
- package/v3/dist/mcp/protocol-server.d.ts.map +1 -1
- package/v3/dist/mcp/protocol-server.js +16 -2
- package/v3/dist/mcp/protocol-server.js.map +1 -1
- package/v3/package.json +1 -1
|
@@ -42,6 +42,7 @@ interface TestCase {
|
|
|
42
42
|
priority: 'critical' | 'high' | 'medium' | 'low';
|
|
43
43
|
skip?: boolean;
|
|
44
44
|
skip_reason?: string;
|
|
45
|
+
negative_control?: boolean;
|
|
45
46
|
input: TestInput;
|
|
46
47
|
expected_output: ExpectedOutput;
|
|
47
48
|
validation?: ValidationConfig;
|
|
@@ -84,6 +85,7 @@ interface ValidationConfig {
|
|
|
84
85
|
reasoning_quality_min?: number;
|
|
85
86
|
semantic_similarity_min?: number;
|
|
86
87
|
allow_partial?: boolean;
|
|
88
|
+
adaptive_rubric?: boolean;
|
|
87
89
|
grading_rubric?: {
|
|
88
90
|
completeness?: number;
|
|
89
91
|
accuracy?: number;
|
|
@@ -165,6 +167,16 @@ interface TestCaseResult {
|
|
|
165
167
|
regex_misses: string[];
|
|
166
168
|
severity_matched: boolean;
|
|
167
169
|
finding_count_matched: boolean;
|
|
170
|
+
negative_control_passed?: boolean;
|
|
171
|
+
finding_count_actual?: number;
|
|
172
|
+
schema_validation_passed?: boolean;
|
|
173
|
+
rubric_breakdown?: {
|
|
174
|
+
completeness: number;
|
|
175
|
+
accuracy: number;
|
|
176
|
+
actionability: number;
|
|
177
|
+
weighted_score: number;
|
|
178
|
+
};
|
|
179
|
+
adaptive_keywords_extracted?: string[];
|
|
168
180
|
};
|
|
169
181
|
raw_output?: string;
|
|
170
182
|
error?: string;
|
|
@@ -568,7 +580,7 @@ class SkillEvaluationRunner {
|
|
|
568
580
|
const executionTime = Date.now() - startTime;
|
|
569
581
|
|
|
570
582
|
// Validate output against expectations
|
|
571
|
-
const validation = this.validateOutput(output, testCase.expected_output, testCase.validation);
|
|
583
|
+
const validation = this.validateOutput(output, testCase.expected_output, testCase.validation, testCase);
|
|
572
584
|
|
|
573
585
|
return {
|
|
574
586
|
id: testCase.id,
|
|
@@ -650,12 +662,19 @@ class SkillEvaluationRunner {
|
|
|
650
662
|
}
|
|
651
663
|
|
|
652
664
|
/**
|
|
653
|
-
* Validate LLM output against expected output criteria
|
|
665
|
+
* Validate LLM output against expected output criteria.
|
|
666
|
+
*
|
|
667
|
+
* Implements:
|
|
668
|
+
* - P1-6: Negative control (inverted pass logic for "should decline" tests)
|
|
669
|
+
* - P1-5a: Finding count enforcement + schema_path validation
|
|
670
|
+
* - P1-5b: Grading rubric weighted scoring (completeness/accuracy/actionability)
|
|
671
|
+
* - P1-4: Adaptive rubric (dynamic keyword extraction from prompt)
|
|
654
672
|
*/
|
|
655
673
|
private validateOutput(
|
|
656
674
|
output: string,
|
|
657
675
|
expected: ExpectedOutput,
|
|
658
|
-
config?: ValidationConfig
|
|
676
|
+
config?: ValidationConfig,
|
|
677
|
+
testCase?: TestCase
|
|
659
678
|
): {
|
|
660
679
|
passed: boolean;
|
|
661
680
|
keywordMatchScore: number;
|
|
@@ -664,12 +683,39 @@ class SkillEvaluationRunner {
|
|
|
664
683
|
} {
|
|
665
684
|
const outputLower = output.toLowerCase();
|
|
666
685
|
const threshold = config?.keyword_match_threshold ?? 0.8;
|
|
686
|
+
const isNegativeControl = testCase?.negative_control ?? false;
|
|
687
|
+
|
|
688
|
+
// --- P1-4: Adaptive Rubric — extract additional keywords from prompt ---
|
|
689
|
+
const adaptiveKeywords: string[] = [];
|
|
690
|
+
let effectiveMustContain = expected.must_contain || [];
|
|
691
|
+
|
|
692
|
+
if (config?.adaptive_rubric && testCase?.input?.prompt) {
|
|
693
|
+
const prompt = testCase.input.prompt;
|
|
694
|
+
// Extract quoted strings
|
|
695
|
+
const quoted = prompt.match(/"([^"]+)"/g)?.map((s) => s.replace(/"/g, '')) || [];
|
|
696
|
+
// Extract format keywords
|
|
697
|
+
const formatWords = prompt.match(/\b(JSON|YAML|markdown|XML|CSV|HTML|SQL)\b/gi) || [];
|
|
698
|
+
// Extract named standards
|
|
699
|
+
const standards =
|
|
700
|
+
prompt.match(/\b(OWASP|WCAG|PCI[-\s]DSS|HIPAA|SOC2|GDPR|ISO\s?\d+)\b/gi) || [];
|
|
701
|
+
// Extract meaningful numbers (>1 digit or >5)
|
|
702
|
+
const numbers = (prompt.match(/\b\d+(\.\d+)?\b/g) || []).filter(
|
|
703
|
+
(n) => n.length > 1 || parseInt(n) > 5
|
|
704
|
+
);
|
|
705
|
+
|
|
706
|
+
adaptiveKeywords.push(...quoted, ...formatWords, ...standards, ...numbers);
|
|
707
|
+
|
|
708
|
+
// Merge with existing must_contain (deduplicate)
|
|
709
|
+
const existingLower = new Set(effectiveMustContain.map((k) => k.toLowerCase()));
|
|
710
|
+
const newKeywords = adaptiveKeywords.filter((k) => !existingLower.has(k.toLowerCase()));
|
|
711
|
+
effectiveMustContain = [...effectiveMustContain, ...newKeywords];
|
|
712
|
+
}
|
|
667
713
|
|
|
668
714
|
// Check must_contain
|
|
669
715
|
const mustContainMatches: string[] = [];
|
|
670
716
|
const mustContainMisses: string[] = [];
|
|
671
717
|
|
|
672
|
-
for (const keyword of
|
|
718
|
+
for (const keyword of effectiveMustContain) {
|
|
673
719
|
if (outputLower.includes(keyword.toLowerCase())) {
|
|
674
720
|
mustContainMatches.push(keyword);
|
|
675
721
|
} else {
|
|
@@ -702,7 +748,7 @@ class SkillEvaluationRunner {
|
|
|
702
748
|
}
|
|
703
749
|
|
|
704
750
|
// Calculate keyword match score
|
|
705
|
-
const totalKeywords =
|
|
751
|
+
const totalKeywords = effectiveMustContain.length + (expected.must_match_regex?.length || 0);
|
|
706
752
|
const matchedKeywords = mustContainMatches.length + regexMatches.length;
|
|
707
753
|
const keywordMatchScore = totalKeywords > 0 ? matchedKeywords / totalKeywords : 1;
|
|
708
754
|
|
|
@@ -711,16 +757,165 @@ class SkillEvaluationRunner {
|
|
|
711
757
|
!expected.severity_classification ||
|
|
712
758
|
outputLower.includes(expected.severity_classification.toLowerCase());
|
|
713
759
|
|
|
714
|
-
//
|
|
715
|
-
|
|
760
|
+
// --- P1-5a: Finding count enforcement ---
|
|
761
|
+
let findingCountMatched = true;
|
|
762
|
+
let findingCountActual: number | undefined;
|
|
763
|
+
|
|
764
|
+
if (expected.finding_count) {
|
|
765
|
+
// Try JSON parsing first
|
|
766
|
+
try {
|
|
767
|
+
const parsed = JSON.parse(output);
|
|
768
|
+
if (Array.isArray(parsed?.findings)) {
|
|
769
|
+
findingCountActual = parsed.findings.length;
|
|
770
|
+
}
|
|
771
|
+
} catch {
|
|
772
|
+
// Fallback: count severity-keyword occurrences or numbered findings
|
|
773
|
+
const severityPattern =
|
|
774
|
+
/\b(critical|high|medium|low|info)\b.*?(vulnerability|finding|issue)/gi;
|
|
775
|
+
const numberedPattern = /^\s*\d+[.)]\s+/gm;
|
|
776
|
+
const severityCount = (output.match(severityPattern) || []).length;
|
|
777
|
+
const numberedCount = (output.match(numberedPattern) || []).length;
|
|
778
|
+
findingCountActual = Math.max(severityCount, numberedCount);
|
|
779
|
+
}
|
|
780
|
+
|
|
781
|
+
if (findingCountActual !== undefined) {
|
|
782
|
+
if (
|
|
783
|
+
expected.finding_count.min !== undefined &&
|
|
784
|
+
findingCountActual < expected.finding_count.min
|
|
785
|
+
) {
|
|
786
|
+
findingCountMatched = false;
|
|
787
|
+
}
|
|
788
|
+
if (
|
|
789
|
+
expected.finding_count.max !== undefined &&
|
|
790
|
+
findingCountActual > expected.finding_count.max
|
|
791
|
+
) {
|
|
792
|
+
findingCountMatched = false;
|
|
793
|
+
}
|
|
794
|
+
}
|
|
795
|
+
}
|
|
796
|
+
|
|
797
|
+
// --- P1-5a: Schema validation ---
|
|
798
|
+
let schemaValidationPassed: boolean | undefined;
|
|
799
|
+
if (expected.schema_path) {
|
|
800
|
+
try {
|
|
801
|
+
const schemaContent = readFileSync(expected.schema_path, 'utf-8');
|
|
802
|
+
const schema = JSON.parse(schemaContent);
|
|
803
|
+
const parsed = JSON.parse(output);
|
|
804
|
+
// Structural check: verify all required fields exist
|
|
805
|
+
if (schema.required && Array.isArray(schema.required)) {
|
|
806
|
+
const missingFields = schema.required.filter(
|
|
807
|
+
(field: string) => !(field in parsed)
|
|
808
|
+
);
|
|
809
|
+
schemaValidationPassed = missingFields.length === 0;
|
|
810
|
+
} else {
|
|
811
|
+
schemaValidationPassed = true;
|
|
812
|
+
}
|
|
813
|
+
} catch {
|
|
814
|
+
schemaValidationPassed = false;
|
|
815
|
+
}
|
|
816
|
+
}
|
|
817
|
+
|
|
818
|
+
// --- P1-5b: Grading Rubric scoring ---
|
|
819
|
+
let rubricBreakdown:
|
|
820
|
+
| {
|
|
821
|
+
completeness: number;
|
|
822
|
+
accuracy: number;
|
|
823
|
+
actionability: number;
|
|
824
|
+
weighted_score: number;
|
|
825
|
+
}
|
|
826
|
+
| undefined;
|
|
827
|
+
|
|
828
|
+
if (config?.grading_rubric) {
|
|
829
|
+
const weights = config.grading_rubric;
|
|
830
|
+
const totalWeight =
|
|
831
|
+
(weights.completeness ?? 0) + (weights.accuracy ?? 0) + (weights.actionability ?? 0);
|
|
832
|
+
|
|
833
|
+
// Completeness = fraction of must_contain matched
|
|
834
|
+
const completenessScore = totalKeywords > 0 ? matchedKeywords / totalKeywords : 1;
|
|
835
|
+
|
|
836
|
+
// Accuracy = 1 - (violations / total_must_not_contain)
|
|
837
|
+
const totalMustNotContain = expected.must_not_contain?.length || 0;
|
|
838
|
+
const accuracyScore =
|
|
839
|
+
totalMustNotContain > 0 ? 1 - violations.length / totalMustNotContain : 1;
|
|
840
|
+
|
|
841
|
+
// Actionability = heuristic: code blocks, numbered steps, specific recommendations
|
|
842
|
+
let actionabilityScore = 0;
|
|
843
|
+
const hasCodeBlocks = /```[\s\S]*?```/.test(output) || /`[^`]+`/.test(output);
|
|
844
|
+
const hasNumberedSteps = /^\s*\d+[.)]\s+/m.test(output);
|
|
845
|
+
const hasRecommendations =
|
|
846
|
+
/\b(recommend|suggest|should|must|consider|implement|use|apply|ensure)\b/i.test(output);
|
|
847
|
+
const hasSpecificActions =
|
|
848
|
+
/\b(install|configure|update|replace|add|remove|change|set|enable|disable)\b/i.test(
|
|
849
|
+
output
|
|
850
|
+
);
|
|
851
|
+
|
|
852
|
+
if (hasCodeBlocks) actionabilityScore += 0.3;
|
|
853
|
+
if (hasNumberedSteps) actionabilityScore += 0.3;
|
|
854
|
+
if (hasRecommendations) actionabilityScore += 0.2;
|
|
855
|
+
if (hasSpecificActions) actionabilityScore += 0.2;
|
|
856
|
+
actionabilityScore = Math.min(1, actionabilityScore);
|
|
857
|
+
|
|
858
|
+
// Weighted sum
|
|
859
|
+
const weightedScore =
|
|
860
|
+
totalWeight > 0
|
|
861
|
+
? ((weights.completeness ?? 0) * completenessScore +
|
|
862
|
+
(weights.accuracy ?? 0) * accuracyScore +
|
|
863
|
+
(weights.actionability ?? 0) * actionabilityScore) /
|
|
864
|
+
totalWeight
|
|
865
|
+
: completenessScore;
|
|
866
|
+
|
|
867
|
+
rubricBreakdown = {
|
|
868
|
+
completeness: completenessScore,
|
|
869
|
+
accuracy: accuracyScore,
|
|
870
|
+
actionability: actionabilityScore,
|
|
871
|
+
weighted_score: weightedScore,
|
|
872
|
+
};
|
|
873
|
+
}
|
|
874
|
+
|
|
875
|
+
// Calculate reasoning quality — use rubric if available, else simple heuristic
|
|
876
|
+
const reasoningQualityScore = rubricBreakdown
|
|
877
|
+
? rubricBreakdown.weighted_score
|
|
878
|
+
: keywordMatchScore * 0.8 + (violations.length === 0 ? 0.2 : 0);
|
|
879
|
+
|
|
880
|
+
// --- P1-6: Negative Control logic ---
|
|
881
|
+
if (isNegativeControl) {
|
|
882
|
+
// For negative control: pass when must_contain items are ABSENT and no violations
|
|
883
|
+
const negativeControlPassed =
|
|
884
|
+
mustContainMatches.length === 0 && violations.length === 0;
|
|
885
|
+
|
|
886
|
+
return {
|
|
887
|
+
passed: negativeControlPassed,
|
|
888
|
+
keywordMatchScore,
|
|
889
|
+
reasoningQualityScore: negativeControlPassed ? 1.0 : 0.0,
|
|
890
|
+
details: {
|
|
891
|
+
must_contain_matches: mustContainMatches,
|
|
892
|
+
must_contain_misses: mustContainMisses,
|
|
893
|
+
must_not_contain_violations: violations,
|
|
894
|
+
regex_matches: regexMatches,
|
|
895
|
+
regex_misses: regexMisses,
|
|
896
|
+
severity_matched: severityMatched,
|
|
897
|
+
finding_count_matched: findingCountMatched,
|
|
898
|
+
negative_control_passed: negativeControlPassed,
|
|
899
|
+
finding_count_actual: findingCountActual,
|
|
900
|
+
schema_validation_passed: schemaValidationPassed,
|
|
901
|
+
rubric_breakdown: rubricBreakdown,
|
|
902
|
+
adaptive_keywords_extracted:
|
|
903
|
+
adaptiveKeywords.length > 0 ? adaptiveKeywords : undefined,
|
|
904
|
+
},
|
|
905
|
+
};
|
|
906
|
+
}
|
|
716
907
|
|
|
717
|
-
//
|
|
718
|
-
const
|
|
908
|
+
// --- Standard pass/fail determination ---
|
|
909
|
+
const rubricGate = rubricBreakdown
|
|
910
|
+
? rubricBreakdown.weighted_score >= threshold
|
|
911
|
+
: true;
|
|
719
912
|
|
|
720
|
-
// Determine if test passed
|
|
721
913
|
const passed =
|
|
722
914
|
keywordMatchScore >= threshold &&
|
|
723
915
|
violations.length === 0 &&
|
|
916
|
+
findingCountMatched &&
|
|
917
|
+
rubricGate &&
|
|
918
|
+
(schemaValidationPassed === undefined || schemaValidationPassed) &&
|
|
724
919
|
(config?.reasoning_quality_min === undefined ||
|
|
725
920
|
reasoningQualityScore >= config.reasoning_quality_min) &&
|
|
726
921
|
(config?.allow_partial || mustContainMisses.length === 0);
|
|
@@ -737,6 +932,11 @@ class SkillEvaluationRunner {
|
|
|
737
932
|
regex_misses: regexMisses,
|
|
738
933
|
severity_matched: severityMatched,
|
|
739
934
|
finding_count_matched: findingCountMatched,
|
|
935
|
+
finding_count_actual: findingCountActual,
|
|
936
|
+
schema_validation_passed: schemaValidationPassed,
|
|
937
|
+
rubric_breakdown: rubricBreakdown,
|
|
938
|
+
adaptive_keywords_extracted:
|
|
939
|
+
adaptiveKeywords.length > 0 ? adaptiveKeywords : undefined,
|
|
740
940
|
},
|
|
741
941
|
};
|
|
742
942
|
}
|