@vfarcic/dot-ai 0.111.0 → 0.113.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/ai-provider-factory.d.ts +0 -10
- package/dist/core/ai-provider-factory.d.ts.map +1 -1
- package/dist/core/ai-provider-factory.js +14 -24
- package/dist/core/ai-provider.interface.d.ts +28 -1
- package/dist/core/ai-provider.interface.d.ts.map +1 -1
- package/dist/core/capabilities.d.ts +1 -1
- package/dist/core/capabilities.d.ts.map +1 -1
- package/dist/core/capabilities.js +7 -4
- package/dist/core/capability-scan-workflow.js +2 -2
- package/dist/core/embedding-service.d.ts +35 -2
- package/dist/core/embedding-service.d.ts.map +1 -1
- package/dist/core/embedding-service.js +228 -15
- package/dist/core/model-config.d.ts +23 -0
- package/dist/core/model-config.d.ts.map +1 -0
- package/dist/core/model-config.js +28 -0
- package/dist/core/platform-operations.d.ts.map +1 -1
- package/dist/core/platform-operations.js +3 -5
- package/dist/core/platform-utils.d.ts +13 -2
- package/dist/core/platform-utils.d.ts.map +1 -1
- package/dist/core/platform-utils.js +91 -9
- package/dist/core/providers/anthropic-provider.d.ts +6 -1
- package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
- package/dist/core/providers/anthropic-provider.js +99 -27
- package/dist/core/providers/provider-debug-utils.d.ts +53 -20
- package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
- package/dist/core/providers/provider-debug-utils.js +106 -51
- package/dist/core/providers/vercel-provider.d.ts +6 -1
- package/dist/core/providers/vercel-provider.d.ts.map +1 -1
- package/dist/core/providers/vercel-provider.js +212 -130
- package/dist/core/schema.d.ts +1 -101
- package/dist/core/schema.d.ts.map +1 -1
- package/dist/core/schema.js +20 -154
- package/dist/core/unified-creation-session.d.ts.map +1 -1
- package/dist/core/unified-creation-session.js +15 -7
- package/dist/evaluation/dataset-analyzer.d.ts +118 -0
- package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
- package/dist/evaluation/dataset-analyzer.js +234 -0
- package/dist/evaluation/datasets/loader.d.ts +42 -0
- package/dist/evaluation/datasets/loader.d.ts.map +1 -0
- package/dist/evaluation/datasets/loader.js +104 -0
- package/dist/evaluation/eval-runner.d.ts +9 -0
- package/dist/evaluation/eval-runner.d.ts.map +1 -0
- package/dist/evaluation/eval-runner.js +399 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts +94 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base-comparative.js +187 -0
- package/dist/evaluation/evaluators/base.d.ts +47 -0
- package/dist/evaluation/evaluators/base.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base.js +10 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/capability-comparative.js +104 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/policy-comparative.js +97 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
- package/dist/evaluation/platform-synthesizer.d.ts +54 -0
- package/dist/evaluation/platform-synthesizer.d.ts.map +1 -0
- package/dist/evaluation/platform-synthesizer.js +368 -0
- package/dist/evaluation/run-platform-synthesis.d.ts +9 -0
- package/dist/evaluation/run-platform-synthesis.d.ts.map +1 -0
- package/dist/evaluation/run-platform-synthesis.js +45 -0
- package/dist/interfaces/mcp.d.ts.map +1 -1
- package/dist/interfaces/mcp.js +23 -29
- package/dist/interfaces/rest-api.d.ts.map +1 -1
- package/dist/tools/answer-question.d.ts +2 -0
- package/dist/tools/answer-question.d.ts.map +1 -1
- package/dist/tools/answer-question.js +18 -11
- package/dist/tools/generate-manifests.d.ts +2 -0
- package/dist/tools/generate-manifests.d.ts.map +1 -1
- package/dist/tools/generate-manifests.js +11 -12
- package/dist/tools/organizational-data.d.ts +1 -0
- package/dist/tools/organizational-data.d.ts.map +1 -1
- package/dist/tools/organizational-data.js +2 -1
- package/dist/tools/recommend.d.ts +1 -0
- package/dist/tools/recommend.d.ts.map +1 -1
- package/dist/tools/recommend.js +13 -21
- package/dist/tools/remediate.d.ts +3 -0
- package/dist/tools/remediate.d.ts.map +1 -1
- package/dist/tools/remediate.js +35 -14
- package/dist/tools/test-docs.d.ts +1 -0
- package/dist/tools/test-docs.d.ts.map +1 -1
- package/dist/tools/test-docs.js +4 -2
- package/dist/tools/version.d.ts +5 -1
- package/dist/tools/version.d.ts.map +1 -1
- package/dist/tools/version.js +23 -8
- package/package.json +19 -1
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Standard Evaluator Interface Following OpenAI Evals Pattern
|
|
3
|
+
*
|
|
4
|
+
* Based on OpenAI Evals framework standards:
|
|
5
|
+
* - Each evaluator has a name and description
|
|
6
|
+
* - evaluate() method takes input, output, and optional ideal
|
|
7
|
+
* - Returns standardized EvaluationScore
|
|
8
|
+
*/
|
|
9
|
+
export interface EvaluationScore {
|
|
10
|
+
key: string;
|
|
11
|
+
score: number;
|
|
12
|
+
comment?: string;
|
|
13
|
+
confidence?: number;
|
|
14
|
+
}
|
|
15
|
+
export interface EvaluationSample {
|
|
16
|
+
input: Record<string, any>;
|
|
17
|
+
output: string;
|
|
18
|
+
ideal?: any;
|
|
19
|
+
metadata?: Record<string, any>;
|
|
20
|
+
}
|
|
21
|
+
export interface PerformanceMetrics {
|
|
22
|
+
duration_ms: number;
|
|
23
|
+
input_tokens: number;
|
|
24
|
+
output_tokens: number;
|
|
25
|
+
total_tokens: number;
|
|
26
|
+
cost_usd?: number;
|
|
27
|
+
iterations?: number;
|
|
28
|
+
tool_calls_executed?: number;
|
|
29
|
+
cache_hit_rate?: number;
|
|
30
|
+
model_version: string;
|
|
31
|
+
}
|
|
32
|
+
export interface EvaluationResult {
|
|
33
|
+
sample_id: string;
|
|
34
|
+
model: string;
|
|
35
|
+
timestamp: string;
|
|
36
|
+
quality_scores: Record<string, EvaluationScore>;
|
|
37
|
+
performance: PerformanceMetrics;
|
|
38
|
+
efficiency: {
|
|
39
|
+
quality_per_second: number;
|
|
40
|
+
quality_per_dollar?: number;
|
|
41
|
+
quality_per_token: number;
|
|
42
|
+
};
|
|
43
|
+
input: Record<string, any>;
|
|
44
|
+
output: string;
|
|
45
|
+
ideal?: any;
|
|
46
|
+
}
|
|
47
|
+
//# sourceMappingURL=base.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"base.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/base.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAED,MAAM,WAAW,gBAAgB;IAC/B,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,GAAG,CAAC;IACZ,QAAQ,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;CAChC;AAED,MAAM,WAAW,kBAAkB;IACjC,WAAW,EAAE,MAAM,CAAC;IACpB,YAAY,EAAE,MAAM,CAAC;IACrB,aAAa,EAAE,MAAM,CAAC;IACtB,YAAY,EAAE,MAAM,CAAC;IACrB,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,aAAa,EAAE,MAAM,CAAC;CACvB;AAED,MAAM,WAAW,gBAAgB;IAC/B,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,MAAM,CAAC;IAGlB,cAAc,EAAE,MAAM,CAAC,MAAM,EAAE,eAAe,CAAC,CAAC;IAGhD,WAAW,EAAE,kBAAkB,CAAC;IAGhC,UAAU,EAAE;QACV,kBAAkB,EAAE,MAAM,CAAC;QAC3B,kBAAkB,CAAC,EAAE,MAAM,CAAC;QAC5B,iBAAiB,EAAE,MAAM,CAAC;KAC3B,CAAC;IAGF,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC3B,MAAM,EAAE,MAAM,CAAC;IACf,KAAK,CAAC,EAAE,GAAG,CAAC;CACb"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Standard Evaluator Interface Following OpenAI Evals Pattern
|
|
4
|
+
*
|
|
5
|
+
* Based on OpenAI Evals framework standards:
|
|
6
|
+
* - Each evaluator has a name and description
|
|
7
|
+
* - evaluate() method takes input, output, and optional ideal
|
|
8
|
+
* - Returns standardized EvaluationScore
|
|
9
|
+
*/
|
|
10
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Capability Comparative Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Compares multiple AI models on Kubernetes capability inference scenarios
|
|
5
|
+
* Groups by interaction_id (e.g., auto_scan, crud_auto_scan) and evaluates
|
|
6
|
+
* quality of capability analyses across different models
|
|
7
|
+
*/
|
|
8
|
+
import { BaseComparativeEvaluator, ComparativeEvaluationScore } from './base-comparative.js';
|
|
9
|
+
import { ComparisonScenario } from '../dataset-analyzer.js';
|
|
10
|
+
export declare class CapabilityComparativeEvaluator extends BaseComparativeEvaluator {
|
|
11
|
+
readonly name = "capability-comparative";
|
|
12
|
+
readonly description = "Compares AI models on Kubernetes capability inference quality";
|
|
13
|
+
protected readonly promptFileName = "capability-comparative.md";
|
|
14
|
+
protected readonly toolName = "capability";
|
|
15
|
+
constructor(datasetDir?: string);
|
|
16
|
+
evaluateAllScenarios(): Promise<ComparativeEvaluationScore[]>;
|
|
17
|
+
/**
|
|
18
|
+
* Build the evaluation prompt - uses base class reliability context with capability-specific template
|
|
19
|
+
*/
|
|
20
|
+
protected buildEvaluationPrompt(scenario: ComparisonScenario, modelResponsesText: string, modelList: string): string;
|
|
21
|
+
private extractResourceName;
|
|
22
|
+
/**
|
|
23
|
+
* Get detailed breakdown of evaluation phases available
|
|
24
|
+
*/
|
|
25
|
+
getEvaluationPhases(): {
|
|
26
|
+
phase: string;
|
|
27
|
+
description: string;
|
|
28
|
+
availableModels: string[];
|
|
29
|
+
scenarioCount: number;
|
|
30
|
+
}[];
|
|
31
|
+
}
|
|
32
|
+
//# sourceMappingURL=capability-comparative.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"capability-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/capability-comparative.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,wBAAwB,EAAE,0BAA0B,EAAE,MAAM,uBAAuB,CAAC;AAC7F,OAAO,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAE5D,qBAAa,8BAA+B,SAAQ,wBAAwB;IAC1E,QAAQ,CAAC,IAAI,4BAA4B;IACzC,QAAQ,CAAC,WAAW,mEAAmE;IACvF,SAAS,CAAC,QAAQ,CAAC,cAAc,+BAA+B;IAChE,SAAS,CAAC,QAAQ,CAAC,QAAQ,gBAAgB;gBAE/B,UAAU,CAAC,EAAE,MAAM;IAKzB,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IA+BnE;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IAYpH,OAAO,CAAC,mBAAmB;IAS3B;;OAEG;IACH,mBAAmB,IAAI;QACrB,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CAsCJ"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Capability Comparative Evaluator
|
|
4
|
+
*
|
|
5
|
+
* Compares multiple AI models on Kubernetes capability inference scenarios
|
|
6
|
+
* Groups by interaction_id (e.g., auto_scan, crud_auto_scan) and evaluates
|
|
7
|
+
* quality of capability analyses across different models
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.CapabilityComparativeEvaluator = void 0;
|
|
11
|
+
const base_comparative_js_1 = require("./base-comparative.js");
|
|
12
|
+
class CapabilityComparativeEvaluator extends base_comparative_js_1.BaseComparativeEvaluator {
|
|
13
|
+
name = 'capability-comparative';
|
|
14
|
+
description = 'Compares AI models on Kubernetes capability inference quality';
|
|
15
|
+
promptFileName = 'capability-comparative.md';
|
|
16
|
+
toolName = 'capability';
|
|
17
|
+
constructor(datasetDir) {
|
|
18
|
+
super(datasetDir);
|
|
19
|
+
this.initializePrompt();
|
|
20
|
+
}
|
|
21
|
+
async evaluateAllScenarios() {
|
|
22
|
+
try {
|
|
23
|
+
const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
|
|
24
|
+
const results = [];
|
|
25
|
+
console.log(`Found ${scenarios.length} capability scenarios with multiple models for comparative evaluation`);
|
|
26
|
+
for (const scenario of scenarios) {
|
|
27
|
+
try {
|
|
28
|
+
const result = await this.evaluateScenario(scenario);
|
|
29
|
+
results.push(result);
|
|
30
|
+
}
|
|
31
|
+
catch (error) {
|
|
32
|
+
console.error(`Failed to evaluate scenario ${scenario.interaction_id}:`, error);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
return results;
|
|
36
|
+
}
|
|
37
|
+
catch (error) {
|
|
38
|
+
console.error(`Capability comparative evaluation failed:`, error);
|
|
39
|
+
return [{
|
|
40
|
+
key: `${this.name}_error`,
|
|
41
|
+
score: 0,
|
|
42
|
+
comment: `Evaluation error: ${error instanceof Error ? error.message : String(error)}`,
|
|
43
|
+
confidence: 0,
|
|
44
|
+
modelRankings: [],
|
|
45
|
+
bestModel: 'unknown',
|
|
46
|
+
modelCount: 0
|
|
47
|
+
}];
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Build the evaluation prompt - uses base class reliability context with capability-specific template
|
|
52
|
+
*/
|
|
53
|
+
buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
|
|
54
|
+
// Use the base class's properly formatted model responses which include:
|
|
55
|
+
// - Reliability Status (✅ Completed successfully OR ⚠️ TIMEOUT FAILURE)
|
|
56
|
+
// - Performance metrics
|
|
57
|
+
// - All model responses
|
|
58
|
+
return this.promptTemplate
|
|
59
|
+
.replace('{scenario_name}', scenario.interaction_id)
|
|
60
|
+
.replace('{model_responses}', modelResponsesText)
|
|
61
|
+
.replace('{models}', modelList);
|
|
62
|
+
}
|
|
63
|
+
extractResourceName(input) {
|
|
64
|
+
if (input?.issue) {
|
|
65
|
+
const match = input.issue.match(/resource: (.+)/);
|
|
66
|
+
return match ? match[1] : 'unknown';
|
|
67
|
+
}
|
|
68
|
+
return 'unknown';
|
|
69
|
+
}
|
|
70
|
+
/**
|
|
71
|
+
* Get detailed breakdown of evaluation phases available
|
|
72
|
+
*/
|
|
73
|
+
getEvaluationPhases() {
|
|
74
|
+
const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
|
|
75
|
+
const phaseGroups = new Map();
|
|
76
|
+
// Group scenarios by phase type
|
|
77
|
+
for (const scenario of scenarios) {
|
|
78
|
+
const phase = scenario.interaction_id;
|
|
79
|
+
if (!phaseGroups.has(phase)) {
|
|
80
|
+
phaseGroups.set(phase, {
|
|
81
|
+
models: new Set(),
|
|
82
|
+
count: 0
|
|
83
|
+
});
|
|
84
|
+
}
|
|
85
|
+
const group = phaseGroups.get(phase);
|
|
86
|
+
scenario.models.forEach(model => group.models.add(model.model));
|
|
87
|
+
group.count++;
|
|
88
|
+
}
|
|
89
|
+
// Convert to structured output with descriptions
|
|
90
|
+
const phaseDescriptions = {
|
|
91
|
+
'auto_scan': 'Auto Scan Phase - How well each model analyzes cluster resource capabilities automatically',
|
|
92
|
+
'crud_auto_scan': 'CRUD Auto Scan Phase - How well each model handles capability analysis with CRUD operations',
|
|
93
|
+
'list_auto_scan': 'List Auto Scan Phase - How well each model handles capability listing and organization',
|
|
94
|
+
'search_auto_scan': 'Search Auto Scan Phase - How well each model handles capability search and filtering'
|
|
95
|
+
};
|
|
96
|
+
return Array.from(phaseGroups.entries()).map(([phase, data]) => ({
|
|
97
|
+
phase,
|
|
98
|
+
description: phaseDescriptions[phase] || `${phase} phase evaluation`,
|
|
99
|
+
availableModels: Array.from(data.models).sort(),
|
|
100
|
+
scenarioCount: data.count
|
|
101
|
+
}));
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
exports.CapabilityComparativeEvaluator = CapabilityComparativeEvaluator;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Pattern Comparative Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Compares multiple AI models on Kubernetes organizational pattern management scenarios
|
|
5
|
+
* Groups by interaction_id (e.g., pattern_create_workflow) and evaluates
|
|
6
|
+
* quality of pattern creation, identification, and management across different models
|
|
7
|
+
*/
|
|
8
|
+
import { BaseComparativeEvaluator, ComparativeEvaluationScore } from './base-comparative.js';
|
|
9
|
+
import { ComparisonScenario } from '../dataset-analyzer.js';
|
|
10
|
+
export declare class PatternComparativeEvaluator extends BaseComparativeEvaluator {
|
|
11
|
+
readonly name = "pattern-comparative";
|
|
12
|
+
readonly description = "Compares AI models on Kubernetes organizational pattern management quality";
|
|
13
|
+
protected readonly promptFileName = "pattern-comparative.md";
|
|
14
|
+
protected readonly toolName = "pattern";
|
|
15
|
+
constructor(datasetDir?: string);
|
|
16
|
+
evaluateAllScenarios(): Promise<ComparativeEvaluationScore[]>;
|
|
17
|
+
/**
|
|
18
|
+
* Build the evaluation prompt - uses base class reliability context with pattern-specific template
|
|
19
|
+
*/
|
|
20
|
+
protected buildEvaluationPrompt(scenario: ComparisonScenario, modelResponsesText: string, modelList: string): string;
|
|
21
|
+
/**
|
|
22
|
+
* Get detailed breakdown of evaluation phases available
|
|
23
|
+
*/
|
|
24
|
+
getEvaluationPhases(): {
|
|
25
|
+
phase: string;
|
|
26
|
+
description: string;
|
|
27
|
+
availableModels: string[];
|
|
28
|
+
scenarioCount: number;
|
|
29
|
+
}[];
|
|
30
|
+
}
|
|
31
|
+
//# sourceMappingURL=pattern-comparative.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pattern-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/pattern-comparative.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,wBAAwB,EAAE,0BAA0B,EAAE,MAAM,uBAAuB,CAAC;AAC7F,OAAO,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAE5D,qBAAa,2BAA4B,SAAQ,wBAAwB;IACvE,QAAQ,CAAC,IAAI,yBAAyB;IACtC,QAAQ,CAAC,WAAW,gFAAgF;IACpG,SAAS,CAAC,QAAQ,CAAC,cAAc,4BAA4B;IAC7D,SAAS,CAAC,QAAQ,CAAC,QAAQ,aAAa;gBAE5B,UAAU,CAAC,EAAE,MAAM;IAKzB,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IA+BnE;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IAYpH;;OAEG;IACH,mBAAmB,IAAI;QACrB,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CAsCJ"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Pattern Comparative Evaluator
|
|
4
|
+
*
|
|
5
|
+
* Compares multiple AI models on Kubernetes organizational pattern management scenarios
|
|
6
|
+
* Groups by interaction_id (e.g., pattern_create_workflow) and evaluates
|
|
7
|
+
* quality of pattern creation, identification, and management across different models
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.PatternComparativeEvaluator = void 0;
|
|
11
|
+
const base_comparative_js_1 = require("./base-comparative.js");
|
|
12
|
+
class PatternComparativeEvaluator extends base_comparative_js_1.BaseComparativeEvaluator {
|
|
13
|
+
name = 'pattern-comparative';
|
|
14
|
+
description = 'Compares AI models on Kubernetes organizational pattern management quality';
|
|
15
|
+
promptFileName = 'pattern-comparative.md';
|
|
16
|
+
toolName = 'pattern';
|
|
17
|
+
constructor(datasetDir) {
|
|
18
|
+
super(datasetDir);
|
|
19
|
+
this.initializePrompt();
|
|
20
|
+
}
|
|
21
|
+
async evaluateAllScenarios() {
|
|
22
|
+
try {
|
|
23
|
+
const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
|
|
24
|
+
const results = [];
|
|
25
|
+
console.log(`Found ${scenarios.length} pattern scenarios with multiple models for comparative evaluation`);
|
|
26
|
+
for (const scenario of scenarios) {
|
|
27
|
+
try {
|
|
28
|
+
const result = await this.evaluateScenario(scenario);
|
|
29
|
+
results.push(result);
|
|
30
|
+
}
|
|
31
|
+
catch (error) {
|
|
32
|
+
console.error(`Failed to evaluate scenario ${scenario.interaction_id}:`, error);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
return results;
|
|
36
|
+
}
|
|
37
|
+
catch (error) {
|
|
38
|
+
console.error(`Pattern comparative evaluation failed:`, error);
|
|
39
|
+
return [{
|
|
40
|
+
key: `${this.name}_error`,
|
|
41
|
+
score: 0,
|
|
42
|
+
comment: `Evaluation error: ${error instanceof Error ? error.message : String(error)}`,
|
|
43
|
+
confidence: 0,
|
|
44
|
+
modelRankings: [],
|
|
45
|
+
bestModel: 'unknown',
|
|
46
|
+
modelCount: 0
|
|
47
|
+
}];
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Build the evaluation prompt - uses base class reliability context with pattern-specific template
|
|
52
|
+
*/
|
|
53
|
+
buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
|
|
54
|
+
// Use the base class's properly formatted model responses which include:
|
|
55
|
+
// - Reliability Status (✅ Completed successfully OR ⚠️ TIMEOUT FAILURE)
|
|
56
|
+
// - Performance metrics
|
|
57
|
+
// - All model responses
|
|
58
|
+
return this.promptTemplate
|
|
59
|
+
.replace('{scenario_name}', scenario.interaction_id)
|
|
60
|
+
.replace('{model_responses}', modelResponsesText)
|
|
61
|
+
.replace('{models}', modelList);
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Get detailed breakdown of evaluation phases available
|
|
65
|
+
*/
|
|
66
|
+
getEvaluationPhases() {
|
|
67
|
+
const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
|
|
68
|
+
const phaseGroups = new Map();
|
|
69
|
+
// Group scenarios by phase type
|
|
70
|
+
for (const scenario of scenarios) {
|
|
71
|
+
const phase = scenario.interaction_id;
|
|
72
|
+
if (!phaseGroups.has(phase)) {
|
|
73
|
+
phaseGroups.set(phase, {
|
|
74
|
+
models: new Set(),
|
|
75
|
+
count: 0
|
|
76
|
+
});
|
|
77
|
+
}
|
|
78
|
+
const group = phaseGroups.get(phase);
|
|
79
|
+
scenario.models.forEach(model => group.models.add(model.model));
|
|
80
|
+
group.count++;
|
|
81
|
+
}
|
|
82
|
+
// Convert to structured output with descriptions
|
|
83
|
+
const phaseDescriptions = {
|
|
84
|
+
'pattern_create_workflow': 'Pattern Creation Workflow - How well each model guides users through creating organizational patterns',
|
|
85
|
+
'trigger_expansion': 'Trigger Expansion Phase - How well each model expands infrastructure triggers for patterns',
|
|
86
|
+
'pattern_validation': 'Pattern Validation Phase - How well each model validates organizational patterns',
|
|
87
|
+
'pattern_matching': 'Pattern Matching Phase - How well each model matches user requirements to existing patterns'
|
|
88
|
+
};
|
|
89
|
+
return Array.from(phaseGroups.entries()).map(([phase, data]) => ({
|
|
90
|
+
phase,
|
|
91
|
+
description: phaseDescriptions[phase] || `${phase} phase evaluation`,
|
|
92
|
+
availableModels: Array.from(data.models).sort(),
|
|
93
|
+
scenarioCount: data.count
|
|
94
|
+
}));
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
exports.PatternComparativeEvaluator = PatternComparativeEvaluator;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Policy Comparative Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Compares multiple AI models on Kubernetes organizational policy intent management scenarios
|
|
5
|
+
* Groups by interaction_id (e.g., policy_create_workflow) and evaluates
|
|
6
|
+
* quality of policy creation, validation, and enforcement recommendations across different models
|
|
7
|
+
*/
|
|
8
|
+
import { BaseComparativeEvaluator, ComparativeEvaluationScore } from './base-comparative.js';
|
|
9
|
+
import { ComparisonScenario } from '../dataset-analyzer.js';
|
|
10
|
+
export declare class PolicyComparativeEvaluator extends BaseComparativeEvaluator {
|
|
11
|
+
readonly name = "policy-comparative";
|
|
12
|
+
readonly description = "Compares AI models on Kubernetes organizational policy intent management quality";
|
|
13
|
+
protected readonly promptFileName = "policy-comparative.md";
|
|
14
|
+
protected readonly toolName = "policy";
|
|
15
|
+
constructor(datasetDir?: string);
|
|
16
|
+
evaluateAllScenarios(): Promise<ComparativeEvaluationScore[]>;
|
|
17
|
+
/**
|
|
18
|
+
* Build the evaluation prompt - uses base class reliability context with policy-specific template
|
|
19
|
+
*/
|
|
20
|
+
protected buildEvaluationPrompt(scenario: ComparisonScenario, modelResponsesText: string, modelList: string): string;
|
|
21
|
+
/**
|
|
22
|
+
* Get detailed breakdown of evaluation phases available
|
|
23
|
+
*/
|
|
24
|
+
getEvaluationPhases(): {
|
|
25
|
+
phase: string;
|
|
26
|
+
description: string;
|
|
27
|
+
availableModels: string[];
|
|
28
|
+
scenarioCount: number;
|
|
29
|
+
}[];
|
|
30
|
+
}
|
|
31
|
+
//# sourceMappingURL=policy-comparative.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"policy-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/policy-comparative.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,wBAAwB,EAAE,0BAA0B,EAAE,MAAM,uBAAuB,CAAC;AAC7F,OAAO,EAAE,kBAAkB,EAAE,MAAM,wBAAwB,CAAC;AAE5D,qBAAa,0BAA2B,SAAQ,wBAAwB;IACtE,QAAQ,CAAC,IAAI,wBAAwB;IACrC,QAAQ,CAAC,WAAW,sFAAsF;IAC1G,SAAS,CAAC,QAAQ,CAAC,cAAc,2BAA2B;IAC5D,SAAS,CAAC,QAAQ,CAAC,QAAQ,YAAY;gBAE3B,UAAU,CAAC,EAAE,MAAM;IAKzB,oBAAoB,IAAI,OAAO,CAAC,0BAA0B,EAAE,CAAC;IA+BnE;;OAEG;IACH,SAAS,CAAC,qBAAqB,CAAC,QAAQ,EAAE,kBAAkB,EAAE,kBAAkB,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,GAAG,MAAM;IAYpH;;OAEG;IACH,mBAAmB,IAAI;QACrB,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CAsCJ"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Policy Comparative Evaluator
|
|
4
|
+
*
|
|
5
|
+
* Compares multiple AI models on Kubernetes organizational policy intent management scenarios
|
|
6
|
+
* Groups by interaction_id (e.g., policy_create_workflow) and evaluates
|
|
7
|
+
* quality of policy creation, validation, and enforcement recommendations across different models
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.PolicyComparativeEvaluator = void 0;
|
|
11
|
+
const base_comparative_js_1 = require("./base-comparative.js");
|
|
12
|
+
class PolicyComparativeEvaluator extends base_comparative_js_1.BaseComparativeEvaluator {
|
|
13
|
+
name = 'policy-comparative';
|
|
14
|
+
description = 'Compares AI models on Kubernetes organizational policy intent management quality';
|
|
15
|
+
promptFileName = 'policy-comparative.md';
|
|
16
|
+
toolName = 'policy';
|
|
17
|
+
constructor(datasetDir) {
|
|
18
|
+
super(datasetDir);
|
|
19
|
+
this.initializePrompt();
|
|
20
|
+
}
|
|
21
|
+
async evaluateAllScenarios() {
|
|
22
|
+
try {
|
|
23
|
+
const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
|
|
24
|
+
const results = [];
|
|
25
|
+
console.log(`Found ${scenarios.length} policy scenarios with multiple models for comparative evaluation`);
|
|
26
|
+
for (const scenario of scenarios) {
|
|
27
|
+
try {
|
|
28
|
+
const result = await this.evaluateScenario(scenario);
|
|
29
|
+
results.push(result);
|
|
30
|
+
}
|
|
31
|
+
catch (error) {
|
|
32
|
+
console.error(`Failed to evaluate scenario ${scenario.interaction_id}:`, error);
|
|
33
|
+
}
|
|
34
|
+
}
|
|
35
|
+
return results;
|
|
36
|
+
}
|
|
37
|
+
catch (error) {
|
|
38
|
+
console.error(`Policy comparative evaluation failed:`, error);
|
|
39
|
+
return [{
|
|
40
|
+
key: `${this.name}_error`,
|
|
41
|
+
score: 0,
|
|
42
|
+
comment: `Evaluation error: ${error instanceof Error ? error.message : String(error)}`,
|
|
43
|
+
confidence: 0,
|
|
44
|
+
modelRankings: [],
|
|
45
|
+
bestModel: 'unknown',
|
|
46
|
+
modelCount: 0
|
|
47
|
+
}];
|
|
48
|
+
}
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Build the evaluation prompt - uses base class reliability context with policy-specific template
|
|
52
|
+
*/
|
|
53
|
+
buildEvaluationPrompt(scenario, modelResponsesText, modelList) {
|
|
54
|
+
// Use the base class's properly formatted model responses which include:
|
|
55
|
+
// - Reliability Status (✅ Completed successfully OR ⚠️ TIMEOUT FAILURE)
|
|
56
|
+
// - Performance metrics
|
|
57
|
+
// - All model responses
|
|
58
|
+
return this.promptTemplate
|
|
59
|
+
.replace('{scenario_name}', scenario.interaction_id)
|
|
60
|
+
.replace('{model_responses}', modelResponsesText)
|
|
61
|
+
.replace('{models}', modelList);
|
|
62
|
+
}
|
|
63
|
+
/**
|
|
64
|
+
* Get detailed breakdown of evaluation phases available
|
|
65
|
+
*/
|
|
66
|
+
getEvaluationPhases() {
|
|
67
|
+
const scenarios = this.datasetAnalyzer.groupByScenario(this.toolName);
|
|
68
|
+
const phaseGroups = new Map();
|
|
69
|
+
// Group scenarios by phase type
|
|
70
|
+
for (const scenario of scenarios) {
|
|
71
|
+
const phase = scenario.interaction_id;
|
|
72
|
+
if (!phaseGroups.has(phase)) {
|
|
73
|
+
phaseGroups.set(phase, {
|
|
74
|
+
models: new Set(),
|
|
75
|
+
count: 0
|
|
76
|
+
});
|
|
77
|
+
}
|
|
78
|
+
const group = phaseGroups.get(phase);
|
|
79
|
+
scenario.models.forEach(model => group.models.add(model.model));
|
|
80
|
+
group.count++;
|
|
81
|
+
}
|
|
82
|
+
// Convert to structured output with descriptions
|
|
83
|
+
const phaseDescriptions = {
|
|
84
|
+
'policy_create_workflow': 'Policy Creation Workflow - How well each model guides users through creating organizational policies',
|
|
85
|
+
'policy_validation': 'Policy Validation Phase - How well each model validates policy intent correctness and enforceability',
|
|
86
|
+
'policy_enforcement_recommendations': 'Policy Enforcement Recommendations - How well each model provides enforcement strategies',
|
|
87
|
+
'policy_compliance_analysis': 'Policy Compliance Analysis - How well each model analyzes compliance with existing policies'
|
|
88
|
+
};
|
|
89
|
+
return Array.from(phaseGroups.entries()).map(([phase, data]) => ({
|
|
90
|
+
phase,
|
|
91
|
+
description: phaseDescriptions[phase] || `${phase} phase evaluation`,
|
|
92
|
+
availableModels: Array.from(data.models).sort(),
|
|
93
|
+
scenarioCount: data.count
|
|
94
|
+
}));
|
|
95
|
+
}
|
|
96
|
+
}
|
|
97
|
+
exports.PolicyComparativeEvaluator = PolicyComparativeEvaluator;
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Recommendation Comparative Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Compares multiple AI models on Kubernetes recommendation scenarios
|
|
5
|
+
* Uses dynamic model inclusion based on available datasets
|
|
6
|
+
* Follows reference-free comparative evaluation methodology
|
|
7
|
+
*/
|
|
8
|
+
import { BaseComparativeEvaluator } from './base-comparative.js';
|
|
9
|
+
export declare class RecommendationComparativeEvaluator extends BaseComparativeEvaluator {
|
|
10
|
+
readonly name = "recommendation_comparative";
|
|
11
|
+
readonly description = "Compares multiple AI models on Kubernetes deployment recommendation scenarios";
|
|
12
|
+
protected readonly promptFileName = "recommendation-comparative.md";
|
|
13
|
+
protected readonly toolName = "recommend";
|
|
14
|
+
constructor(datasetDir?: string);
|
|
15
|
+
/**
|
|
16
|
+
* Get detailed breakdown of evaluation phases available
|
|
17
|
+
*/
|
|
18
|
+
getEvaluationPhases(): {
|
|
19
|
+
phase: string;
|
|
20
|
+
description: string;
|
|
21
|
+
availableModels: string[];
|
|
22
|
+
scenarioCount: number;
|
|
23
|
+
}[];
|
|
24
|
+
}
|
|
25
|
+
//# sourceMappingURL=recommendation-comparative.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"recommendation-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/recommendation-comparative.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AAEjE,qBAAa,kCAAmC,SAAQ,wBAAwB;IAC9E,QAAQ,CAAC,IAAI,gCAAgC;IAC7C,QAAQ,CAAC,WAAW,mFAAmF;IACvG,SAAS,CAAC,QAAQ,CAAC,cAAc,mCAAmC;IACpE,SAAS,CAAC,QAAQ,CAAC,QAAQ,eAAe;gBAE9B,UAAU,CAAC,EAAE,MAAM;IAK/B;;OAEG;IACH,mBAAmB,IAAI;QACrB,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CAsCJ"}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Recommendation Comparative Evaluator
|
|
4
|
+
*
|
|
5
|
+
* Compares multiple AI models on Kubernetes recommendation scenarios
|
|
6
|
+
* Uses dynamic model inclusion based on available datasets
|
|
7
|
+
* Follows reference-free comparative evaluation methodology
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.RecommendationComparativeEvaluator = void 0;
|
|
11
|
+
const base_comparative_js_1 = require("./base-comparative.js");
|
|
12
|
+
/**
 * Comparative evaluator for the `recommend` tool.
 *
 * Groups recorded scenarios by workflow phase and reports, per phase, which
 * models have data and how many scenarios exist.
 */
class RecommendationComparativeEvaluator extends base_comparative_js_1.BaseComparativeEvaluator {
    name = 'recommendation_comparative';
    description = 'Compares multiple AI models on Kubernetes deployment recommendation scenarios';
    promptFileName = 'recommendation-comparative.md';
    toolName = 'recommend';
    constructor(datasetDir) {
        super(datasetDir);
        this.initializePrompt();
    }
    /**
     * Get detailed breakdown of evaluation phases available
     */
    getEvaluationPhases() {
        // Human-readable labels for the known recommendation workflow phases.
        const descriptions = {
            'clarification_phase': 'Intent Analysis Phase - How well each model analyzes user intents and identifies missing context',
            'question_generation': 'Question Generation Phase - How well each model generates clarifying questions to enhance requirements',
            'solution_assembly': 'Solution Assembly Phase - How well each model selects appropriate Kubernetes resources and deployment patterns',
            'generate_manifests_phase': 'Manifest Generation Phase - How well each model generates production-ready Kubernetes manifests'
        };
        // Accumulate, per phase id, the set of participating models and a
        // scenario counter (Map preserves first-seen phase order).
        const groups = new Map();
        for (const scenario of this.datasetAnalyzer.groupByScenario(this.toolName)) {
            const key = scenario.interaction_id;
            let entry = groups.get(key);
            if (entry === undefined) {
                entry = { models: new Set(), count: 0 };
                groups.set(key, entry);
            }
            for (const m of scenario.models) {
                entry.models.add(m.model);
            }
            entry.count += 1;
        }
        // Emit one structured record per phase, with models sorted for stable output.
        const phases = [];
        for (const [phase, info] of groups) {
            phases.push({
                phase,
                description: descriptions[phase] || `${phase} phase evaluation`,
                availableModels: [...info.models].sort(),
                scenarioCount: info.count
            });
        }
        return phases;
    }
}
exports.RecommendationComparativeEvaluator = RecommendationComparativeEvaluator;
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Remediation Comparative Evaluator
|
|
3
|
+
*
|
|
4
|
+
* Compares multiple AI models on Kubernetes troubleshooting scenarios
|
|
5
|
+
* Uses dynamic model inclusion based on available datasets
|
|
6
|
+
* Follows reference-free comparative evaluation methodology
|
|
7
|
+
*/
|
|
8
|
+
import { BaseComparativeEvaluator } from './base-comparative.js';
|
|
9
|
+
export declare class RemediationComparativeEvaluator extends BaseComparativeEvaluator {
    /** Evaluator identifier used to select this evaluator by name. */
    readonly name = "remediation_comparative";
    /** Human-readable summary of what this evaluator compares. */
    readonly description = "Compares multiple AI models on Kubernetes troubleshooting scenarios";
    /** Prompt template file name — presumably loaded by BaseComparativeEvaluator's prompt initialization; confirm in base class. */
    protected readonly promptFileName = "remediation-comparative.md";
    /** Dataset tool name whose recorded scenarios are evaluated. */
    protected readonly toolName = "remediate";
    /** @param datasetDir Optional dataset directory, forwarded to the base class. */
    constructor(datasetDir?: string);
    /**
     * Get detailed breakdown of evaluation phases available
     *
     * @returns One entry per phase found in the datasets: the phase id, a
     * human-readable description, the sorted model names with data for that
     * phase, and the number of scenarios recorded for it.
     */
    getEvaluationPhases(): {
        phase: string;
        description: string;
        availableModels: string[];
        scenarioCount: number;
    }[];
}
|
|
25
|
+
//# sourceMappingURL=remediation-comparative.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"remediation-comparative.d.ts","sourceRoot":"","sources":["../../../src/evaluation/evaluators/remediation-comparative.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,EAAE,wBAAwB,EAAE,MAAM,uBAAuB,CAAC;AAEjE,qBAAa,+BAAgC,SAAQ,wBAAwB;IAC3E,QAAQ,CAAC,IAAI,6BAA6B;IAC1C,QAAQ,CAAC,WAAW,yEAAyE;IAC7F,SAAS,CAAC,QAAQ,CAAC,cAAc,gCAAgC;IACjE,SAAS,CAAC,QAAQ,CAAC,QAAQ,eAAe;gBAE9B,UAAU,CAAC,EAAE,MAAM;IAK/B;;OAEG;IACH,mBAAmB,IAAI;QACrB,KAAK,EAAE,MAAM,CAAC;QACd,WAAW,EAAE,MAAM,CAAC;QACpB,eAAe,EAAE,MAAM,EAAE,CAAC;QAC1B,aAAa,EAAE,MAAM,CAAC;KACvB,EAAE;CAqCJ"}
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Remediation Comparative Evaluator
|
|
4
|
+
*
|
|
5
|
+
* Compares multiple AI models on Kubernetes troubleshooting scenarios
|
|
6
|
+
* Uses dynamic model inclusion based on available datasets
|
|
7
|
+
* Follows reference-free comparative evaluation methodology
|
|
8
|
+
*/
|
|
9
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
10
|
+
exports.RemediationComparativeEvaluator = void 0;
|
|
11
|
+
const base_comparative_js_1 = require("./base-comparative.js");
|
|
12
|
+
/**
 * Comparative evaluator for the `remediate` tool.
 *
 * Groups recorded scenarios by workflow phase and reports, per phase, which
 * models have data and how many scenarios exist.
 */
class RemediationComparativeEvaluator extends base_comparative_js_1.BaseComparativeEvaluator {
    name = 'remediation_comparative';
    description = 'Compares multiple AI models on Kubernetes troubleshooting scenarios';
    promptFileName = 'remediation-comparative.md';
    toolName = 'remediate';
    constructor(datasetDir) {
        super(datasetDir);
        this.initializePrompt();
    }
    /**
     * Get detailed breakdown of evaluation phases available
     */
    getEvaluationPhases() {
        // Human-readable labels for the known remediation workflow phases.
        const descriptions = {
            'manual_analyze': 'Manual Investigation Phase - How well each model investigates and diagnoses issues',
            'manual_execute': 'Manual Execution Phase - How well each model validates and confirms fixes worked',
            'automatic_analyze_execute': 'Automatic Full Workflow - End-to-end troubleshooting in single automated workflow'
        };
        // Accumulate, per phase id, the set of participating models and a
        // scenario counter (Map preserves first-seen phase order).
        const groups = new Map();
        for (const scenario of this.datasetAnalyzer.groupByScenario(this.toolName)) {
            const key = scenario.interaction_id;
            let entry = groups.get(key);
            if (entry === undefined) {
                entry = { models: new Set(), count: 0 };
                groups.set(key, entry);
            }
            for (const m of scenario.models) {
                entry.models.add(m.model);
            }
            entry.count += 1;
        }
        // Emit one structured record per phase, with models sorted for stable output.
        const phases = [];
        for (const [phase, info] of groups) {
            phases.push({
                phase,
                description: descriptions[phase] || `${phase} phase evaluation`,
                availableModels: [...info.models].sort(),
                scenarioCount: info.count
            });
        }
        return phases;
    }
}
exports.RemediationComparativeEvaluator = RemediationComparativeEvaluator;
|