@vfarcic/dot-ai 0.111.0 → 0.113.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (95)
  1. package/dist/core/ai-provider-factory.d.ts +0 -10
  2. package/dist/core/ai-provider-factory.d.ts.map +1 -1
  3. package/dist/core/ai-provider-factory.js +14 -24
  4. package/dist/core/ai-provider.interface.d.ts +28 -1
  5. package/dist/core/ai-provider.interface.d.ts.map +1 -1
  6. package/dist/core/capabilities.d.ts +1 -1
  7. package/dist/core/capabilities.d.ts.map +1 -1
  8. package/dist/core/capabilities.js +7 -4
  9. package/dist/core/capability-scan-workflow.js +2 -2
  10. package/dist/core/embedding-service.d.ts +35 -2
  11. package/dist/core/embedding-service.d.ts.map +1 -1
  12. package/dist/core/embedding-service.js +228 -15
  13. package/dist/core/model-config.d.ts +23 -0
  14. package/dist/core/model-config.d.ts.map +1 -0
  15. package/dist/core/model-config.js +28 -0
  16. package/dist/core/platform-operations.d.ts.map +1 -1
  17. package/dist/core/platform-operations.js +3 -5
  18. package/dist/core/platform-utils.d.ts +13 -2
  19. package/dist/core/platform-utils.d.ts.map +1 -1
  20. package/dist/core/platform-utils.js +91 -9
  21. package/dist/core/providers/anthropic-provider.d.ts +6 -1
  22. package/dist/core/providers/anthropic-provider.d.ts.map +1 -1
  23. package/dist/core/providers/anthropic-provider.js +99 -27
  24. package/dist/core/providers/provider-debug-utils.d.ts +53 -20
  25. package/dist/core/providers/provider-debug-utils.d.ts.map +1 -1
  26. package/dist/core/providers/provider-debug-utils.js +106 -51
  27. package/dist/core/providers/vercel-provider.d.ts +6 -1
  28. package/dist/core/providers/vercel-provider.d.ts.map +1 -1
  29. package/dist/core/providers/vercel-provider.js +212 -130
  30. package/dist/core/schema.d.ts +1 -101
  31. package/dist/core/schema.d.ts.map +1 -1
  32. package/dist/core/schema.js +20 -154
  33. package/dist/core/unified-creation-session.d.ts.map +1 -1
  34. package/dist/core/unified-creation-session.js +15 -7
  35. package/dist/evaluation/dataset-analyzer.d.ts +118 -0
  36. package/dist/evaluation/dataset-analyzer.d.ts.map +1 -0
  37. package/dist/evaluation/dataset-analyzer.js +234 -0
  38. package/dist/evaluation/datasets/loader.d.ts +42 -0
  39. package/dist/evaluation/datasets/loader.d.ts.map +1 -0
  40. package/dist/evaluation/datasets/loader.js +104 -0
  41. package/dist/evaluation/eval-runner.d.ts +9 -0
  42. package/dist/evaluation/eval-runner.d.ts.map +1 -0
  43. package/dist/evaluation/eval-runner.js +399 -0
  44. package/dist/evaluation/evaluators/base-comparative.d.ts +94 -0
  45. package/dist/evaluation/evaluators/base-comparative.d.ts.map +1 -0
  46. package/dist/evaluation/evaluators/base-comparative.js +187 -0
  47. package/dist/evaluation/evaluators/base.d.ts +47 -0
  48. package/dist/evaluation/evaluators/base.d.ts.map +1 -0
  49. package/dist/evaluation/evaluators/base.js +10 -0
  50. package/dist/evaluation/evaluators/capability-comparative.d.ts +32 -0
  51. package/dist/evaluation/evaluators/capability-comparative.d.ts.map +1 -0
  52. package/dist/evaluation/evaluators/capability-comparative.js +104 -0
  53. package/dist/evaluation/evaluators/pattern-comparative.d.ts +31 -0
  54. package/dist/evaluation/evaluators/pattern-comparative.d.ts.map +1 -0
  55. package/dist/evaluation/evaluators/pattern-comparative.js +97 -0
  56. package/dist/evaluation/evaluators/policy-comparative.d.ts +31 -0
  57. package/dist/evaluation/evaluators/policy-comparative.d.ts.map +1 -0
  58. package/dist/evaluation/evaluators/policy-comparative.js +97 -0
  59. package/dist/evaluation/evaluators/recommendation-comparative.d.ts +25 -0
  60. package/dist/evaluation/evaluators/recommendation-comparative.d.ts.map +1 -0
  61. package/dist/evaluation/evaluators/recommendation-comparative.js +55 -0
  62. package/dist/evaluation/evaluators/remediation-comparative.d.ts +25 -0
  63. package/dist/evaluation/evaluators/remediation-comparative.d.ts.map +1 -0
  64. package/dist/evaluation/evaluators/remediation-comparative.js +54 -0
  65. package/dist/evaluation/platform-synthesizer.d.ts +54 -0
  66. package/dist/evaluation/platform-synthesizer.d.ts.map +1 -0
  67. package/dist/evaluation/platform-synthesizer.js +368 -0
  68. package/dist/evaluation/run-platform-synthesis.d.ts +9 -0
  69. package/dist/evaluation/run-platform-synthesis.d.ts.map +1 -0
  70. package/dist/evaluation/run-platform-synthesis.js +45 -0
  71. package/dist/interfaces/mcp.d.ts.map +1 -1
  72. package/dist/interfaces/mcp.js +23 -29
  73. package/dist/interfaces/rest-api.d.ts.map +1 -1
  74. package/dist/tools/answer-question.d.ts +2 -0
  75. package/dist/tools/answer-question.d.ts.map +1 -1
  76. package/dist/tools/answer-question.js +18 -11
  77. package/dist/tools/generate-manifests.d.ts +2 -0
  78. package/dist/tools/generate-manifests.d.ts.map +1 -1
  79. package/dist/tools/generate-manifests.js +11 -12
  80. package/dist/tools/organizational-data.d.ts +1 -0
  81. package/dist/tools/organizational-data.d.ts.map +1 -1
  82. package/dist/tools/organizational-data.js +2 -1
  83. package/dist/tools/recommend.d.ts +1 -0
  84. package/dist/tools/recommend.d.ts.map +1 -1
  85. package/dist/tools/recommend.js +13 -21
  86. package/dist/tools/remediate.d.ts +3 -0
  87. package/dist/tools/remediate.d.ts.map +1 -1
  88. package/dist/tools/remediate.js +35 -14
  89. package/dist/tools/test-docs.d.ts +1 -0
  90. package/dist/tools/test-docs.d.ts.map +1 -1
  91. package/dist/tools/test-docs.js +4 -2
  92. package/dist/tools/version.d.ts +5 -1
  93. package/dist/tools/version.d.ts.map +1 -1
  94. package/dist/tools/version.js +23 -8
  95. package/package.json +19 -1
@@ -0,0 +1,234 @@
1
+ "use strict";
2
+ /**
3
+ * Dataset Analyzer for Multi-Model Comparison
4
+ *
5
+ * Analyzes evaluation datasets to group them by scenario and extract
6
+ * model responses for comparative evaluation.
7
+ */
8
+ Object.defineProperty(exports, "__esModule", { value: true });
9
+ exports.DatasetAnalyzer = void 0;
10
+ const fs_1 = require("fs");
11
+ const path_1 = require("path");
12
+ class DatasetAnalyzer {
13
+ datasetDir;
14
+ constructor(datasetDir = './eval/datasets') {
15
+ this.datasetDir = datasetDir;
16
+ }
17
+ /**
18
+ * Find all available datasets for a specific tool
19
+ */
20
+ findDatasets(tool) {
21
+ const files = (0, fs_1.readdirSync)(this.datasetDir);
22
+ return files
23
+ .filter(file => file.startsWith(`${tool}_`) && file.endsWith('.jsonl'))
24
+ .map(file => (0, path_1.join)(this.datasetDir, file));
25
+ }
26
+ /**
27
+ * Parse dataset filename to extract components
28
+ * Format: {tool}_{interaction_id}_{sdk}_{model}_{timestamp}.jsonl
29
+ */
30
+ parseDatasetFilename(filename) {
31
+ const basename = filename.replace(/^.*\//, '').replace(/\.jsonl$/, '');
32
+ const parts = basename.split('_');
33
+ if (parts.length < 5)
34
+ return null;
35
+ // For remediate datasets: remediate_{phase}_{action}_vercel_{model}_{timestamp}
36
+ // e.g., remediate_manual_analyze_vercel_gpt_timestamp
37
+ const tool = parts[0];
38
+ const timestamp = parts[parts.length - 1];
39
+ // Find 'vercel' SDK position to split correctly
40
+ const sdkIndex = parts.indexOf('vercel');
41
+ if (sdkIndex === -1)
42
+ return null;
43
+ // interaction_id is everything between tool and sdk
44
+ const interaction_id = parts.slice(1, sdkIndex).join('_');
45
+ const sdk = parts[sdkIndex];
46
+ const model = parts.slice(sdkIndex + 1, -1).join('_');
47
+ return { tool, interaction_id, sdk, model, timestamp };
48
+ }
49
+ /**
50
+ * Load and parse a dataset file
51
+ */
52
+ loadDataset(filepath) {
53
+ try {
54
+ const content = (0, fs_1.readFileSync)(filepath, 'utf8').trim();
55
+ if (!content)
56
+ return null;
57
+ return JSON.parse(content);
58
+ }
59
+ catch (error) {
60
+ console.warn(`Failed to load dataset ${filepath}:`, error);
61
+ return null;
62
+ }
63
+ }
64
+ /**
65
+ * Group datasets by scenario for comparative evaluation
66
+ * Returns scenarios that have data from multiple models
67
+ * Groups by both tool and interaction_id to create separate evaluations for each phase
68
+ */
69
+ groupByScenario(tool) {
70
+ const datasets = this.findDatasets(tool);
71
+ const scenarioGroups = new Map();
72
+ // Group datasets by filename pattern up to provider, then by model
73
+ for (const filepath of datasets) {
74
+ const sample = this.loadDataset(filepath);
75
+ if (!sample)
76
+ continue;
77
+ // Extract scenario key from filename pattern (up to provider)
78
+ const filename = filepath.replace(/^.*\//, ''); // Remove directory path
79
+ const filenameParts = filename.split('_');
80
+ const beforeProvider = [];
81
+ for (const part of filenameParts) {
82
+ if (part === 'vercel')
83
+ break; // Stop at SDK name
84
+ beforeProvider.push(part);
85
+ }
86
+ const scenarioKey = beforeProvider.join('_');
87
+ // Group by model within each scenario
88
+ const modelKey = `${sample.performance.sdk}_${sample.performance.model_version}`;
89
+ if (!scenarioGroups.has(scenarioKey)) {
90
+ scenarioGroups.set(scenarioKey, new Map());
91
+ }
92
+ const modelGroups = scenarioGroups.get(scenarioKey);
93
+ if (!modelGroups.has(modelKey)) {
94
+ modelGroups.set(modelKey, []);
95
+ }
96
+ // Parse failure_analysis if it exists
97
+ let failure_analysis = undefined;
98
+ if (sample.metadata.failure_analysis && sample.metadata.failure_analysis !== "") {
99
+ try {
100
+ if (typeof sample.metadata.failure_analysis === 'string') {
101
+ failure_analysis = JSON.parse(sample.metadata.failure_analysis);
102
+ }
103
+ else {
104
+ failure_analysis = sample.metadata.failure_analysis;
105
+ }
106
+ }
107
+ catch (error) {
108
+ // If parsing fails, treat as no failure analysis
109
+ failure_analysis = undefined;
110
+ }
111
+ }
112
+ modelGroups.get(modelKey).push({
113
+ model: modelKey,
114
+ response: sample.output,
115
+ performance: sample.performance,
116
+ metadata: {
117
+ timestamp: sample.metadata.timestamp,
118
+ complexity: sample.metadata.complexity,
119
+ test_scenario: sample.metadata.test_scenario,
120
+ issue: sample.input.issue,
121
+ failure_analysis
122
+ }
123
+ });
124
+ }
125
+ // Convert to comparison scenarios - include ALL scenarios (remove multi-model filter)
126
+ const scenarios = [];
127
+ for (const [scenarioKey, modelGroups] of scenarioGroups) {
128
+ // Flatten model groups: each model may have multiple interactions
129
+ const allModelResponses = [];
130
+ for (const [modelKey, interactions] of modelGroups) {
131
+ // Combine multiple interactions per model into a single response
132
+ if (interactions.length === 1) {
133
+ allModelResponses.push(interactions[0]);
134
+ }
135
+ else {
136
+ // Multiple interactions per model - combine them
137
+ const combinedResponse = this.combineModelInteractions(modelKey, interactions);
138
+ allModelResponses.push(combinedResponse);
139
+ }
140
+ }
141
+ // Get representative issue from first model's first interaction
142
+ const firstModel = Array.from(modelGroups.values())[0]?.[0];
143
+ const issue = firstModel?.metadata?.issue || scenarioKey;
144
+ scenarios.push({
145
+ issue,
146
+ interaction_id: scenarioKey,
147
+ tool,
148
+ models: allModelResponses
149
+ });
150
+ }
151
+ return scenarios;
152
+ }
153
+ /**
154
+ * Combine multiple interactions per model into a single response for evaluation
155
+ */
156
+ combineModelInteractions(modelKey, interactions) {
157
+ // Sort interactions by timestamp
158
+ const sorted = interactions.sort((a, b) => new Date(a.metadata.timestamp).getTime() - new Date(b.metadata.timestamp).getTime());
159
+ // Create combined response showing all interactions
160
+ const combinedResponse = sorted.map((interaction, index) => `**Interaction ${index + 1}:**\n` +
161
+ `Issue: ${interaction.metadata.issue}\n` +
162
+ `Response: ${interaction.response}\n`).join('\n---\n');
163
+ // Aggregate performance metrics
164
+ const totalDuration = sorted.reduce((sum, i) => sum + i.performance.duration_ms, 0);
165
+ const totalInputTokens = sorted.reduce((sum, i) => sum + i.performance.input_tokens, 0);
166
+ const totalOutputTokens = sorted.reduce((sum, i) => sum + i.performance.output_tokens, 0);
167
+ // Collect all failure analyses from all interactions that have them
168
+ const allFailures = [];
169
+ sorted.forEach((interaction, index) => {
170
+ if (interaction.metadata.failure_analysis) {
171
+ allFailures.push({
172
+ interaction_number: index + 1,
173
+ issue: interaction.metadata.issue,
174
+ ...interaction.metadata.failure_analysis
175
+ });
176
+ }
177
+ });
178
+ // Use the first failure as the primary failure_analysis, but preserve all failures
179
+ const primaryFailureAnalysis = allFailures.length > 0 ? allFailures[0] : undefined;
180
+ return {
181
+ model: modelKey,
182
+ response: combinedResponse,
183
+ performance: {
184
+ ...sorted[0].performance,
185
+ duration_ms: totalDuration,
186
+ input_tokens: totalInputTokens,
187
+ output_tokens: totalOutputTokens,
188
+ total_tokens: totalInputTokens + totalOutputTokens
189
+ },
190
+ metadata: {
191
+ ...sorted[0].metadata,
192
+ issue: sorted[0].metadata.issue, // Use first interaction's issue as primary
193
+ interaction_count: interactions.length,
194
+ failure_analysis: primaryFailureAnalysis,
195
+ all_failures: allFailures.length > 0 ? allFailures : undefined
196
+ }
197
+ };
198
+ }
199
+ /**
200
+ * Get summary of available models across all scenarios for a tool
201
+ */
202
+ getAvailableModels(tool) {
203
+ const datasets = this.findDatasets(tool);
204
+ const models = new Set();
205
+ for (const filepath of datasets) {
206
+ const parsed = this.parseDatasetFilename(filepath);
207
+ if (parsed) {
208
+ models.add(`${parsed.sdk}_${parsed.model}`);
209
+ }
210
+ }
211
+ return Array.from(models).sort();
212
+ }
213
+ /**
214
+ * Get statistics about dataset availability
215
+ */
216
+ getDatasetStats(tool) {
217
+ const scenarios = this.groupByScenario(tool);
218
+ const datasets = this.findDatasets(tool);
219
+ const interactionTypes = new Set();
220
+ for (const filepath of datasets) {
221
+ const parsed = this.parseDatasetFilename(filepath);
222
+ if (parsed) {
223
+ interactionTypes.add(parsed.interaction_id);
224
+ }
225
+ }
226
+ return {
227
+ totalDatasets: datasets.length,
228
+ availableModels: this.getAvailableModels(tool),
229
+ scenariosWithMultipleModels: scenarios.length,
230
+ interactionTypes: Array.from(interactionTypes).sort()
231
+ };
232
+ }
233
+ }
234
+ exports.DatasetAnalyzer = DatasetAnalyzer;
@@ -0,0 +1,42 @@
1
+ /**
2
+ * Dataset Loader for Standard OpenAI Evals Format
3
+ *
4
+ * Loads JSONL evaluation datasets following OpenAI Evals standard:
5
+ * - Each line contains: {input, ideal, metadata}
6
+ * - Supports filtering by category, complexity, tags
7
+ * - Used by both integration tests and evaluation framework
8
+ */
9
+ export interface StandardEvalSample {
10
+ input: Record<string, any>;
11
+ ideal: any;
12
+ metadata: {
13
+ category: string;
14
+ complexity: 'low' | 'medium' | 'high';
15
+ tags: string[];
16
+ source: string;
17
+ phase?: string;
18
+ tool?: string;
19
+ };
20
+ }
21
+ export interface DatasetFilter {
22
+ category?: string;
23
+ complexity?: 'low' | 'medium' | 'high';
24
+ tags?: string[];
25
+ phase?: string;
26
+ tool?: string;
27
+ }
28
+ /**
29
+ * Load evaluation dataset from JSONL file
30
+ * @param datasetName - Name of the dataset file (without .jsonl extension)
31
+ * @param filter - Optional filter criteria
32
+ * @returns Array of evaluation samples
33
+ */
34
+ export declare function loadEvalDataset(datasetName: string, filter?: DatasetFilter): StandardEvalSample[];
35
+ /**
36
+ * Load samples for a specific test phase
37
+ * @param datasetName - Dataset name
38
+ * @param phase - Test phase to load
39
+ * @returns Array of samples for that phase
40
+ */
41
+ export declare function loadTestPhase(datasetName: string, phase: string): StandardEvalSample[];
42
+ //# sourceMappingURL=loader.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"loader.d.ts","sourceRoot":"","sources":["../../../src/evaluation/datasets/loader.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAKH,MAAM,WAAW,kBAAkB;IACjC,KAAK,EAAE,MAAM,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAC3B,KAAK,EAAE,GAAG,CAAC;IACX,QAAQ,EAAE;QACR,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;QACtC,IAAI,EAAE,MAAM,EAAE,CAAC;QACf,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,CAAC,EAAE,MAAM,CAAC;QACf,IAAI,CAAC,EAAE,MAAM,CAAC;KACf,CAAC;CACH;AAED,MAAM,WAAW,aAAa;IAC5B,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,CAAC,EAAE,KAAK,GAAG,QAAQ,GAAG,MAAM,CAAC;IACvC,IAAI,CAAC,EAAE,MAAM,EAAE,CAAC;IAChB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;CACf;AAED;;;;;GAKG;AACH,wBAAgB,eAAe,CAC7B,WAAW,EAAE,MAAM,EACnB,MAAM,CAAC,EAAE,aAAa,GACrB,kBAAkB,EAAE,CA+CtB;AAED;;;;;GAKG;AACH,wBAAgB,aAAa,CAAC,WAAW,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,GAAG,kBAAkB,EAAE,CAEtF"}
@@ -0,0 +1,104 @@
1
+ "use strict";
2
+ /**
3
+ * Dataset Loader for Standard OpenAI Evals Format
4
+ *
5
+ * Loads JSONL evaluation datasets following OpenAI Evals standard:
6
+ * - Each line contains: {input, ideal, metadata}
7
+ * - Supports filtering by category, complexity, tags
8
+ * - Used by both integration tests and evaluation framework
9
+ */
10
+ var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
11
+ if (k2 === undefined) k2 = k;
12
+ var desc = Object.getOwnPropertyDescriptor(m, k);
13
+ if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
14
+ desc = { enumerable: true, get: function() { return m[k]; } };
15
+ }
16
+ Object.defineProperty(o, k2, desc);
17
+ }) : (function(o, m, k, k2) {
18
+ if (k2 === undefined) k2 = k;
19
+ o[k2] = m[k];
20
+ }));
21
+ var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
22
+ Object.defineProperty(o, "default", { enumerable: true, value: v });
23
+ }) : function(o, v) {
24
+ o["default"] = v;
25
+ });
26
+ var __importStar = (this && this.__importStar) || (function () {
27
+ var ownKeys = function(o) {
28
+ ownKeys = Object.getOwnPropertyNames || function (o) {
29
+ var ar = [];
30
+ for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
31
+ return ar;
32
+ };
33
+ return ownKeys(o);
34
+ };
35
+ return function (mod) {
36
+ if (mod && mod.__esModule) return mod;
37
+ var result = {};
38
+ if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
39
+ __setModuleDefault(result, mod);
40
+ return result;
41
+ };
42
+ })();
43
+ Object.defineProperty(exports, "__esModule", { value: true });
44
+ exports.loadEvalDataset = loadEvalDataset;
45
+ exports.loadTestPhase = loadTestPhase;
46
+ const fs = __importStar(require("fs"));
47
+ const path = __importStar(require("path"));
48
+ /**
49
+ * Load evaluation dataset from JSONL file
50
+ * @param datasetName - Name of the dataset file (without .jsonl extension)
51
+ * @param filter - Optional filter criteria
52
+ * @returns Array of evaluation samples
53
+ */
54
+ function loadEvalDataset(datasetName, filter) {
55
+ const datasetsDir = path.join(process.cwd(), 'eval', 'datasets');
56
+ const datasetPath = path.join(datasetsDir, `${datasetName}.jsonl`);
57
+ if (!fs.existsSync(datasetPath)) {
58
+ throw new Error(`Dataset not found: ${datasetPath}`);
59
+ }
60
+ const fileContent = fs.readFileSync(datasetPath, 'utf8');
61
+ const lines = fileContent.trim().split('\n').filter(line => line.trim());
62
+ const samples = lines.map((line, index) => {
63
+ try {
64
+ return JSON.parse(line);
65
+ }
66
+ catch (error) {
67
+ throw new Error(`Invalid JSON at line ${index + 1} in ${datasetName}.jsonl: ${error}`);
68
+ }
69
+ });
70
+ // Apply filters if provided
71
+ if (filter) {
72
+ return samples.filter(sample => {
73
+ if (filter.category && sample.metadata.category !== filter.category) {
74
+ return false;
75
+ }
76
+ if (filter.complexity && sample.metadata.complexity !== filter.complexity) {
77
+ return false;
78
+ }
79
+ if (filter.phase && sample.metadata.phase !== filter.phase) {
80
+ return false;
81
+ }
82
+ if (filter.tool && sample.metadata.tool !== filter.tool) {
83
+ return false;
84
+ }
85
+ if (filter.tags) {
86
+ const hasAllTags = filter.tags.every(tag => sample.metadata.tags.includes(tag));
87
+ if (!hasAllTags) {
88
+ return false;
89
+ }
90
+ }
91
+ return true;
92
+ });
93
+ }
94
+ return samples;
95
+ }
96
+ /**
97
+ * Load samples for a specific test phase
98
+ * @param datasetName - Dataset name
99
+ * @param phase - Test phase to load
100
+ * @returns Array of samples for that phase
101
+ */
102
+ function loadTestPhase(datasetName, phase) {
103
+ return loadEvalDataset(datasetName, { phase });
104
+ }
@@ -0,0 +1,9 @@
1
+ #!/usr/bin/env npx tsx
2
+ /**
3
+ * Evaluation Runner for Multi-Model Comparative Analysis
4
+ *
5
+ * Runs comparative evaluation on available datasets from multiple models
6
+ * Automatically detects and evaluates both remediation and recommendation datasets
7
+ */
8
+ export {};
9
+ //# sourceMappingURL=eval-runner.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"eval-runner.d.ts","sourceRoot":"","sources":["../../src/evaluation/eval-runner.ts"],"names":[],"mappings":";AAEA;;;;;GAKG"}