@vfarcic/dot-ai 0.111.0 → 0.113.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/ai-provider-factory.d.ts +0 -10
- package/dist/core/ai-provider-factory.d.ts.map +1 -1
- package/dist/core/ai-provider-factory.js +14 -24
- package/dist/core/ai-provider.interface.d.ts +28 -1
- package/dist/core/ai-provider.interface.d.ts.map +1 -1
- package/dist/core/capabilities.d.ts +1 -1
- package/dist/core/capabilities.d.ts.map +1 -1
- package/dist/core/capabilities.js +7 -4
- package/dist/core/capability-scan-workflow.js +2 -2
- package/dist/core/embedding-service.d.ts +35 -2
- package/dist/core/embedding-service.d.ts.map +1 -1
- package/dist/core/embedding-service.js +228 -15
- package/dist/core/model-config.d.ts +23 -0
- package/dist/core/model-config.d.ts.map +1 -0
- package/dist/core/model-config.js +28 -0
- package/dist/core/platform-operations.d.ts.map +1 -1
- package/dist/core/platform-operations.js +3 -5
- package/dist/core/platform-utils.d.ts +13 -2
- package/dist/core/platform-utils.d.ts.map +1 -1
- package/dist/core/platform-utils.js +91 -9
- package/dist/core/providers/anthropic-provider.d.ts +6 -1
- package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
- package/dist/core/providers/anthropic-provider.js +99 -27
- package/dist/core/providers/provider-debug-utils.d.ts +53 -20
- package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
- package/dist/core/providers/provider-debug-utils.js +106 -51
- package/dist/core/providers/vercel-provider.d.ts +6 -1
- package/dist/core/providers/vercel-provider.d.ts.map +1 -1
- package/dist/core/providers/vercel-provider.js +212 -130
- package/dist/core/schema.d.ts +1 -101
- package/dist/core/schema.d.ts.map +1 -1
- package/dist/core/schema.js +20 -154
- package/dist/core/unified-creation-session.d.ts.map +1 -1
- package/dist/core/unified-creation-session.js +15 -7
- package/dist/evaluation/dataset-analyzer.d.ts +118 -0
- package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
- package/dist/evaluation/dataset-analyzer.js +234 -0
- package/dist/evaluation/datasets/loader.d.ts +42 -0
- package/dist/evaluation/datasets/loader.d.ts.map +1 -0
- package/dist/evaluation/datasets/loader.js +104 -0
- package/dist/evaluation/eval-runner.d.ts +9 -0
- package/dist/evaluation/eval-runner.d.ts.map +1 -0
- package/dist/evaluation/eval-runner.js +399 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts +94 -0
- package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base-comparative.js +187 -0
- package/dist/evaluation/evaluators/base.d.ts +47 -0
- package/dist/evaluation/evaluators/base.d.ts.map +1 -0
- package/dist/evaluation/evaluators/base.js +10 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
- package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/capability-comparative.js +104 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
- package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/policy-comparative.js +97 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
- package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
- package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
- package/dist/evaluation/platform-synthesizer.d.ts +54 -0
- package/dist/evaluation/platform-synthesizer.d.ts.map +1 -0
- package/dist/evaluation/platform-synthesizer.js +368 -0
- package/dist/evaluation/run-platform-synthesis.d.ts +9 -0
- package/dist/evaluation/run-platform-synthesis.d.ts.map +1 -0
- package/dist/evaluation/run-platform-synthesis.js +45 -0
- package/dist/interfaces/mcp.d.ts.map +1 -1
- package/dist/interfaces/mcp.js +23 -29
- package/dist/interfaces/rest-api.d.ts.map +1 -1
- package/dist/tools/answer-question.d.ts +2 -0
- package/dist/tools/answer-question.d.ts.map +1 -1
- package/dist/tools/answer-question.js +18 -11
- package/dist/tools/generate-manifests.d.ts +2 -0
- package/dist/tools/generate-manifests.d.ts.map +1 -1
- package/dist/tools/generate-manifests.js +11 -12
- package/dist/tools/organizational-data.d.ts +1 -0
- package/dist/tools/organizational-data.d.ts.map +1 -1
- package/dist/tools/organizational-data.js +2 -1
- package/dist/tools/recommend.d.ts +1 -0
- package/dist/tools/recommend.d.ts.map +1 -1
- package/dist/tools/recommend.js +13 -21
- package/dist/tools/remediate.d.ts +3 -0
- package/dist/tools/remediate.d.ts.map +1 -1
- package/dist/tools/remediate.js +35 -14
- package/dist/tools/test-docs.d.ts +1 -0
- package/dist/tools/test-docs.d.ts.map +1 -1
- package/dist/tools/test-docs.js +4 -2
- package/dist/tools/version.d.ts +5 -1
- package/dist/tools/version.d.ts.map +1 -1
- package/dist/tools/version.js +23 -8
- package/package.json +19 -1
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Dataset Analyzer for Multi-Model Comparison
|
|
4
|
+
*
|
|
5
|
+
* Analyzes evaluation datasets to group them by scenario and extract
|
|
6
|
+
* model responses for comparative evaluation.
|
|
7
|
+
*/
|
|
8
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
9
|
+
exports.DatasetAnalyzer = void 0;
|
|
10
|
+
const fs_1 = require("fs");
|
|
11
|
+
const path_1 = require("path");
|
|
12
|
+
/**
 * Analyzes evaluation datasets on disk, grouping them by scenario and
 * extracting per-model responses for comparative (multi-model) evaluation.
 */
class DatasetAnalyzer {
    // Directory scanned for `{tool}_*.jsonl` dataset files.
    datasetDir;
    /**
     * @param {string} [datasetDir='./eval/datasets'] - Directory containing dataset files.
     */
    constructor(datasetDir = './eval/datasets') {
        this.datasetDir = datasetDir;
    }
    /**
     * Find all available datasets for a specific tool.
     * @param {string} tool - Tool name used as the filename prefix (e.g. 'remediate').
     * @returns {string[]} Full paths of matching `.jsonl` files.
     */
    findDatasets(tool) {
        const files = (0, fs_1.readdirSync)(this.datasetDir);
        return files
            .filter(file => file.startsWith(`${tool}_`) && file.endsWith('.jsonl'))
            .map(file => (0, path_1.join)(this.datasetDir, file));
    }
    /**
     * Parse a dataset filename into its components.
     * Format: {tool}_{interaction_id}_{sdk}_{model}_{timestamp}.jsonl
     * e.g. remediate_manual_analyze_vercel_gpt_timestamp
     * @param {string} filename - Path or bare filename of a dataset.
     * @returns {object|null} Parsed parts, or null when the name does not match the format.
     */
    parseDatasetFilename(filename) {
        // Strip any directory prefix and the .jsonl extension before splitting.
        const basename = filename.replace(/^.*\//, '').replace(/\.jsonl$/, '');
        const parts = basename.split('_');
        if (parts.length < 5)
            return null;
        const tool = parts[0];
        const timestamp = parts[parts.length - 1];
        // The literal 'vercel' SDK segment anchors where interaction_id ends
        // and the model name begins (both may themselves contain underscores).
        const sdkIndex = parts.indexOf('vercel');
        if (sdkIndex === -1)
            return null;
        // interaction_id is everything between tool and sdk.
        const interaction_id = parts.slice(1, sdkIndex).join('_');
        const sdk = parts[sdkIndex];
        const model = parts.slice(sdkIndex + 1, -1).join('_');
        return { tool, interaction_id, sdk, model, timestamp };
    }
    /**
     * Load and parse a dataset file (a single JSON document).
     * @param {string} filepath - Path to the dataset file.
     * @returns {object|null} Parsed sample, or null when the file is empty or unreadable.
     */
    loadDataset(filepath) {
        try {
            const content = (0, fs_1.readFileSync)(filepath, 'utf8').trim();
            if (!content)
                return null;
            return JSON.parse(content);
        }
        catch (error) {
            // Best-effort: a single broken file should not abort the whole scan.
            console.warn(`Failed to load dataset ${filepath}:`, error);
            return null;
        }
    }
    /**
     * Group datasets by scenario for comparative evaluation.
     * Groups by both tool and interaction_id so each phase gets its own evaluation;
     * ALL scenarios are included, even those covered by a single model.
     * @param {string} tool - Tool name to group datasets for.
     * @returns {Array<object>} One comparison scenario per scenario key.
     */
    groupByScenario(tool) {
        const datasets = this.findDatasets(tool);
        // scenarioKey -> (modelKey -> interactions[])
        const scenarioGroups = new Map();
        for (const filepath of datasets) {
            const sample = this.loadDataset(filepath);
            if (!sample)
                continue;
            // Scenario key is the filename prefix up to (excluding) the SDK segment.
            const filename = filepath.replace(/^.*\//, '');
            const filenameParts = filename.split('_');
            const beforeProvider = [];
            for (const part of filenameParts) {
                if (part === 'vercel')
                    break; // Stop at SDK name
                beforeProvider.push(part);
            }
            const scenarioKey = beforeProvider.join('_');
            // Group by model within each scenario.
            const modelKey = `${sample.performance.sdk}_${sample.performance.model_version}`;
            if (!scenarioGroups.has(scenarioKey)) {
                scenarioGroups.set(scenarioKey, new Map());
            }
            const modelGroups = scenarioGroups.get(scenarioKey);
            if (!modelGroups.has(modelKey)) {
                modelGroups.set(modelKey, []);
            }
            // failure_analysis may be stored as a JSON string or an object; normalize it.
            let failure_analysis = undefined;
            if (sample.metadata.failure_analysis && sample.metadata.failure_analysis !== "") {
                try {
                    if (typeof sample.metadata.failure_analysis === 'string') {
                        failure_analysis = JSON.parse(sample.metadata.failure_analysis);
                    }
                    else {
                        failure_analysis = sample.metadata.failure_analysis;
                    }
                }
                catch (error) {
                    // If parsing fails, treat as no failure analysis.
                    failure_analysis = undefined;
                }
            }
            modelGroups.get(modelKey).push({
                model: modelKey,
                response: sample.output,
                performance: sample.performance,
                metadata: {
                    timestamp: sample.metadata.timestamp,
                    complexity: sample.metadata.complexity,
                    test_scenario: sample.metadata.test_scenario,
                    issue: sample.input.issue,
                    failure_analysis
                }
            });
        }
        // Convert the nested maps into flat comparison scenarios.
        const scenarios = [];
        for (const [scenarioKey, modelGroups] of scenarioGroups) {
            // Each model may have multiple interactions; collapse them to one entry per model.
            const allModelResponses = [];
            for (const [modelKey, interactions] of modelGroups) {
                if (interactions.length === 1) {
                    allModelResponses.push(interactions[0]);
                }
                else {
                    // Multiple interactions per model - combine them.
                    const combinedResponse = this.combineModelInteractions(modelKey, interactions);
                    allModelResponses.push(combinedResponse);
                }
            }
            // Representative issue comes from the first model's first interaction.
            const firstModel = Array.from(modelGroups.values())[0]?.[0];
            const issue = firstModel?.metadata?.issue || scenarioKey;
            scenarios.push({
                issue,
                interaction_id: scenarioKey,
                tool,
                models: allModelResponses
            });
        }
        return scenarios;
    }
    /**
     * Combine multiple interactions for one model into a single response for evaluation.
     * @param {string} modelKey - `${sdk}_${model_version}` identifier.
     * @param {Array<object>} interactions - Interactions recorded for that model.
     * @returns {object} One merged model response with aggregated performance metrics.
     */
    combineModelInteractions(modelKey, interactions) {
        // Sort a COPY by timestamp: Array.prototype.sort mutates in place, and the
        // caller's array must not be reordered as a side effect of combining.
        const sorted = [...interactions].sort((a, b) => new Date(a.metadata.timestamp).getTime() - new Date(b.metadata.timestamp).getTime());
        // Create combined response showing all interactions in chronological order.
        const combinedResponse = sorted.map((interaction, index) => `**Interaction ${index + 1}:**\n` +
            `Issue: ${interaction.metadata.issue}\n` +
            `Response: ${interaction.response}\n`).join('\n---\n');
        // Aggregate performance metrics across all interactions.
        const totalDuration = sorted.reduce((sum, i) => sum + i.performance.duration_ms, 0);
        const totalInputTokens = sorted.reduce((sum, i) => sum + i.performance.input_tokens, 0);
        const totalOutputTokens = sorted.reduce((sum, i) => sum + i.performance.output_tokens, 0);
        // Collect all failure analyses from all interactions that have them.
        const allFailures = [];
        sorted.forEach((interaction, index) => {
            if (interaction.metadata.failure_analysis) {
                allFailures.push({
                    interaction_number: index + 1,
                    issue: interaction.metadata.issue,
                    ...interaction.metadata.failure_analysis
                });
            }
        });
        // First failure is the primary failure_analysis; all of them are preserved.
        const primaryFailureAnalysis = allFailures.length > 0 ? allFailures[0] : undefined;
        return {
            model: modelKey,
            response: combinedResponse,
            performance: {
                ...sorted[0].performance,
                duration_ms: totalDuration,
                input_tokens: totalInputTokens,
                output_tokens: totalOutputTokens,
                total_tokens: totalInputTokens + totalOutputTokens
            },
            metadata: {
                ...sorted[0].metadata,
                issue: sorted[0].metadata.issue, // Use first interaction's issue as primary
                interaction_count: interactions.length,
                failure_analysis: primaryFailureAnalysis,
                all_failures: allFailures.length > 0 ? allFailures : undefined
            }
        };
    }
    /**
     * Get a sorted, de-duplicated summary of models seen across all datasets for a tool.
     * @param {string} tool - Tool name to scan datasets for.
     * @returns {string[]} Sorted `${sdk}_${model}` identifiers.
     */
    getAvailableModels(tool) {
        const datasets = this.findDatasets(tool);
        const models = new Set();
        for (const filepath of datasets) {
            const parsed = this.parseDatasetFilename(filepath);
            if (parsed) {
                models.add(`${parsed.sdk}_${parsed.model}`);
            }
        }
        return Array.from(models).sort();
    }
    /**
     * Get statistics about dataset availability for a tool.
     * @param {string} tool - Tool name to scan datasets for.
     * @returns {object} Counts plus available models and interaction types.
     */
    getDatasetStats(tool) {
        const scenarios = this.groupByScenario(tool);
        const datasets = this.findDatasets(tool);
        const interactionTypes = new Set();
        for (const filepath of datasets) {
            const parsed = this.parseDatasetFilename(filepath);
            if (parsed) {
                interactionTypes.add(parsed.interaction_id);
            }
        }
        return {
            totalDatasets: datasets.length,
            availableModels: this.getAvailableModels(tool),
            scenariosWithMultipleModels: scenarios.length,
            interactionTypes: Array.from(interactionTypes).sort()
        };
    }
}
|
|
234
|
+
exports.DatasetAnalyzer = DatasetAnalyzer;
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Dataset Loader for Standard OpenAI Evals Format
|
|
3
|
+
*
|
|
4
|
+
* Loads JSONL evaluation datasets following OpenAI Evals standard:
|
|
5
|
+
* - Each line contains: {input, ideal, metadata}
|
|
6
|
+
* - Supports filtering by category, complexity, tags
|
|
7
|
+
* - Used by both integration tests and evaluation framework
|
|
8
|
+
*/
|
|
9
|
+
/** One evaluation sample in the standard OpenAI Evals JSONL format. */
export interface StandardEvalSample {
    /** Tool input for the sample (free-form key/value payload). */
    input: Record<string, any>;
    /** Expected ("ideal") output used when grading the sample. */
    ideal: any;
    /** Descriptive metadata used for filtering and reporting. */
    metadata: {
        category: string;
        complexity: 'low' | 'medium' | 'high';
        tags: string[];
        /** Where the sample came from. */
        source: string;
        /** Optional test phase this sample belongs to. */
        phase?: string;
        /** Optional tool name this sample targets. */
        tool?: string;
    };
}
/**
 * Criteria for selecting a subset of samples.
 * Every field that is present must match for a sample to be kept.
 */
export interface DatasetFilter {
    category?: string;
    complexity?: 'low' | 'medium' | 'high';
    /** Sample must carry ALL of these tags (not just one). */
    tags?: string[];
    phase?: string;
    tool?: string;
}
/**
 * Load evaluation dataset from JSONL file
 * @param datasetName - Name of the dataset file (without .jsonl extension)
 * @param filter - Optional filter criteria
 * @returns Array of evaluation samples
 */
export declare function loadEvalDataset(datasetName: string, filter?: DatasetFilter): StandardEvalSample[];
/**
 * Load samples for a specific test phase
 * @param datasetName - Dataset name
 * @param phase - Test phase to load
 * @returns Array of samples for that phase
 */
export declare function loadTestPhase(datasetName: string, phase: string): StandardEvalSample[];
|
|
42
|
+
//# sourceMappingURL=loader.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../../src/evaluation/datasets/loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAKH,MAAM,WAAW,kBAAkB;IACjC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC3B,KAAK,EAAE,GAAG,CAAC;IACX,QAAQ,EAAE;QACR,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QACtC,IAAI,EAAE,MAAM,EAAE,CAAC;QACf,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,IAAI,CAAC,EAAE,MAAM,CAAC;KACf,CAAC;CACH;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;IACvC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAC7B,WAAW,EAAE,MAAM,EACnB,MAAM,CAAC,EAAE,aAAa,GACrB,kBAAkB,EAAE,CA+CtB;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,WAAW,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,kBAAkB,EAAE,CAEtF"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Dataset Loader for Standard OpenAI Evals Format
|
|
4
|
+
*
|
|
5
|
+
* Loads JSONL evaluation datasets following OpenAI Evals standard:
|
|
6
|
+
* - Each line contains: {input, ideal, metadata}
|
|
7
|
+
* - Supports filtering by category, complexity, tags
|
|
8
|
+
* - Used by both integration tests and evaluation framework
|
|
9
|
+
*/
|
|
10
|
+
// --- Module interop helpers (standard `__importStar` boilerplate as emitted
// by the TypeScript compiler for CommonJS output; not hand-written code). ---
// Copies a single re-exported property from module `m` onto namespace `o`.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
      desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
// Attaches the original module as `default` on the namespace object.
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
// Wraps a CommonJS module as an ES-module-like namespace object
// (pass-through when the module already has the __esModule marker).
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
|
|
43
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
44
|
+
exports.loadEvalDataset = loadEvalDataset;
|
|
45
|
+
exports.loadTestPhase = loadTestPhase;
|
|
46
|
+
const fs = __importStar(require("fs"));
|
|
47
|
+
const path = __importStar(require("path"));
|
|
48
|
+
/**
 * Load an evaluation dataset from a JSONL file under `<cwd>/eval/datasets`.
 * Each non-blank line is parsed as one JSON sample; an optional filter keeps
 * only samples whose metadata matches every provided criterion.
 * @param datasetName - Name of the dataset file (without .jsonl extension)
 * @param filter - Optional filter criteria
 * @returns Array of evaluation samples
 * @throws When the file is missing or any line is not valid JSON.
 */
function loadEvalDataset(datasetName, filter) {
    const datasetsDir = path.join(process.cwd(), 'eval', 'datasets');
    const datasetPath = path.join(datasetsDir, `${datasetName}.jsonl`);
    if (!fs.existsSync(datasetPath)) {
        throw new Error(`Dataset not found: ${datasetPath}`);
    }
    // One JSON document per non-blank line (JSONL).
    const rawLines = fs
        .readFileSync(datasetPath, 'utf8')
        .trim()
        .split('\n')
        .filter(line => line.trim());
    const samples = rawLines.map((line, index) => {
        try {
            return JSON.parse(line);
        }
        catch (error) {
            // Report the 1-based line number so the bad record is easy to find.
            throw new Error(`Invalid JSON at line ${index + 1} in ${datasetName}.jsonl: ${error}`);
        }
    });
    if (!filter) {
        return samples;
    }
    // A sample survives only if every criterion present in the filter matches.
    const matchesFilter = (sample) => {
        const meta = sample.metadata;
        if (filter.category && meta.category !== filter.category) {
            return false;
        }
        if (filter.complexity && meta.complexity !== filter.complexity) {
            return false;
        }
        if (filter.phase && meta.phase !== filter.phase) {
            return false;
        }
        if (filter.tool && meta.tool !== filter.tool) {
            return false;
        }
        // Tag filtering requires the sample to carry ALL requested tags.
        if (filter.tags && !filter.tags.every(tag => meta.tags.includes(tag))) {
            return false;
        }
        return true;
    };
    return samples.filter(matchesFilter);
}
|
|
96
|
+
/**
 * Load every sample belonging to one test phase of a dataset.
 * Thin convenience wrapper over {@link loadEvalDataset} with a phase-only filter.
 * @param datasetName - Dataset name
 * @param phase - Test phase to load
 * @returns Array of samples for that phase
 */
function loadTestPhase(datasetName, phase) {
    const phaseFilter = { phase };
    return loadEvalDataset(datasetName, phaseFilter);
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
#!/usr/bin/env npx tsx
|
|
2
|
+
/**
|
|
3
|
+
* Evaluation Runner for Multi-Model Comparative Analysis
|
|
4
|
+
*
|
|
5
|
+
* Runs comparative evaluation on available datasets from multiple models
|
|
6
|
+
* Automatically detects and evaluates both remediation and recommendation datasets
|
|
7
|
+
*/
|
|
8
|
+
export {};
|
|
9
|
+
//# sourceMappingURL=eval-runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-runner.d.ts","sourceRoot":"","sources":["../../src/evaluation/eval-runner.ts"],"names":[],"mappings":";AAEA;;;;;GAKG"}
|