substrai-evalforge 0.5.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.d.ts +2 -0
- package/index.js +9 -0
- package/package.json +5 -28
- package/dist/config.d.ts +0 -29
- package/dist/config.js +0 -49
- package/dist/index.d.ts +0 -7
- package/dist/index.js +0 -14
- package/dist/metrics.d.ts +0 -43
- package/dist/metrics.js +0 -85
- package/dist/pipeline.d.ts +0 -22
- package/dist/pipeline.js +0 -72
- package/dist/result.d.ts +0 -25
- package/dist/result.js +0 -40
package/index.d.ts
ADDED
package/index.js
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* substrai-evalforge v0.6.0
|
|
4
|
+
* Automated LLM evaluation pipeline generator
|
|
5
|
+
*
|
|
6
|
+
* This is the JavaScript/TypeScript companion package.
|
|
7
|
+
* Primary implementation is in Python: pip install substrai-evalforge
|
|
8
|
+
*/
|
|
9
|
+
module.exports = { version: "0.6.0", name: "substrai-evalforge" };
|
package/package.json
CHANGED
|
@@ -1,38 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "substrai-evalforge",
|
|
3
|
-
"version": "0.5.1",
|
|
3
|
+
"version": "0.6.0",
|
|
4
4
|
"description": "Automated LLM evaluation pipeline generator",
|
|
5
|
-
"main": "dist/index.js",
|
|
6
|
-
"types": "dist/index.d.ts",
|
|
7
|
-
"files": [
|
|
8
|
-
"dist/**/*"
|
|
9
|
-
],
|
|
10
|
-
"scripts": {
|
|
11
|
-
"build": "tsc",
|
|
12
|
-
"prepublishOnly": "npm run build"
|
|
13
|
-
},
|
|
14
|
-
"keywords": [
|
|
15
|
-
"llm",
|
|
16
|
-
"evaluation",
|
|
17
|
-
"testing",
|
|
18
|
-
"mlops",
|
|
19
|
-
"genai",
|
|
20
|
-
"rag",
|
|
21
|
-
"metrics",
|
|
22
|
-
"pipeline",
|
|
23
|
-
"serverless"
|
|
24
|
-
],
|
|
5
|
+
"main": "index.js",
|
|
6
|
+
"types": "index.d.ts",
|
|
7
|
+
"keywords": ["genai", "llm", "serverless", "aws", "lambda", "ai", "agents"],
|
|
25
8
|
"author": "Gaurav Kumar Sinha <gaurav@substrai.dev>",
|
|
26
9
|
"license": "MIT",
|
|
10
|
+
"homepage": "https://substrai.dev",
|
|
27
11
|
"repository": {
|
|
28
12
|
"type": "git",
|
|
29
13
|
"url": "https://github.com/substrai/evalforge"
|
|
30
|
-
},
|
|
31
|
-
"homepage": "https://github.com/substrai/evalforge",
|
|
32
|
-
"engines": {
|
|
33
|
-
"node": ">=16.0.0"
|
|
34
|
-
},
|
|
35
|
-
"devDependencies": {
|
|
36
|
-
"typescript": "^5.0.0"
|
|
37
14
|
}
|
|
38
15
|
}
|
package/dist/config.d.ts
DELETED
|
@@ -1,29 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* EvalForge configuration and use case types.
|
|
3
|
-
*/
|
|
4
|
-
export declare enum UseCaseType {
|
|
5
|
-
RAG = "rag",
|
|
6
|
-
SUMMARIZATION = "summarization",
|
|
7
|
-
CLASSIFICATION = "classification",
|
|
8
|
-
GENERATION = "generation",
|
|
9
|
-
CHAT = "chat",
|
|
10
|
-
CODE = "code"
|
|
11
|
-
}
|
|
12
|
-
export declare const DEFAULT_METRICS: Record<string, string[]>;
|
|
13
|
-
export declare const DEFAULT_THRESHOLDS: Record<string, Record<string, number>>;
|
|
14
|
-
export interface ModelConfig {
|
|
15
|
-
provider: string;
|
|
16
|
-
modelId: string;
|
|
17
|
-
region: string;
|
|
18
|
-
}
|
|
19
|
-
export interface EvalConfig {
|
|
20
|
-
projectName: string;
|
|
21
|
-
version: string;
|
|
22
|
-
useCaseType: UseCaseType;
|
|
23
|
-
description: string;
|
|
24
|
-
model: ModelConfig;
|
|
25
|
-
metrics: string[];
|
|
26
|
-
thresholds: Record<string, number>;
|
|
27
|
-
getThreshold(metric: string): number;
|
|
28
|
-
}
|
|
29
|
-
export declare function createConfig(useCaseType: string, projectName?: string): EvalConfig;
|
package/dist/config.js
DELETED
|
@@ -1,49 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
/**
|
|
3
|
-
* EvalForge configuration and use case types.
|
|
4
|
-
*/
|
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.DEFAULT_THRESHOLDS = exports.DEFAULT_METRICS = exports.UseCaseType = void 0;
|
|
7
|
-
exports.createConfig = createConfig;
|
|
8
|
-
var UseCaseType;
|
|
9
|
-
(function (UseCaseType) {
|
|
10
|
-
UseCaseType["RAG"] = "rag";
|
|
11
|
-
UseCaseType["SUMMARIZATION"] = "summarization";
|
|
12
|
-
UseCaseType["CLASSIFICATION"] = "classification";
|
|
13
|
-
UseCaseType["GENERATION"] = "generation";
|
|
14
|
-
UseCaseType["CHAT"] = "chat";
|
|
15
|
-
UseCaseType["CODE"] = "code";
|
|
16
|
-
})(UseCaseType || (exports.UseCaseType = UseCaseType = {}));
|
|
17
|
-
exports.DEFAULT_METRICS = {
|
|
18
|
-
rag: ["faithfulness", "answer_relevancy", "context_precision", "context_recall", "toxicity"],
|
|
19
|
-
summarization: ["rouge_l", "bleu", "coherence", "conciseness", "fluency"],
|
|
20
|
-
classification: ["accuracy", "precision", "recall", "f1_score"],
|
|
21
|
-
generation: ["fluency", "coherence", "toxicity", "bias_detection"],
|
|
22
|
-
chat: ["coherence", "toxicity", "injection_resistance", "fluency"],
|
|
23
|
-
code: ["accuracy", "coherence"],
|
|
24
|
-
};
|
|
25
|
-
exports.DEFAULT_THRESHOLDS = {
|
|
26
|
-
rag: { faithfulness: 0.85, answer_relevancy: 0.80, context_precision: 0.75, context_recall: 0.75, toxicity: 0.05 },
|
|
27
|
-
summarization: { rouge_l: 0.70, bleu: 0.60, coherence: 0.80, conciseness: 0.85, fluency: 0.85 },
|
|
28
|
-
classification: { accuracy: 0.90, precision: 0.85, recall: 0.85, f1_score: 0.85 },
|
|
29
|
-
generation: { fluency: 0.85, coherence: 0.80, toxicity: 0.05, bias_detection: 0.10 },
|
|
30
|
-
chat: { coherence: 0.80, toxicity: 0.05, injection_resistance: 0.90, fluency: 0.85 },
|
|
31
|
-
code: { accuracy: 0.90, coherence: 0.80 },
|
|
32
|
-
};
|
|
33
|
-
function createConfig(useCaseType, projectName = "evaluation") {
|
|
34
|
-
const ucType = useCaseType;
|
|
35
|
-
const metrics = exports.DEFAULT_METRICS[ucType] || exports.DEFAULT_METRICS.rag;
|
|
36
|
-
const thresholds = exports.DEFAULT_THRESHOLDS[ucType] || exports.DEFAULT_THRESHOLDS.rag;
|
|
37
|
-
return {
|
|
38
|
-
projectName,
|
|
39
|
-
version: "1.0.0",
|
|
40
|
-
useCaseType: ucType,
|
|
41
|
-
description: `Default ${ucType} evaluation`,
|
|
42
|
-
model: { provider: "bedrock", modelId: "anthropic.claude-3-haiku-20240307-v1:0", region: "us-east-1" },
|
|
43
|
-
metrics,
|
|
44
|
-
thresholds,
|
|
45
|
-
getThreshold(metric) {
|
|
46
|
-
return this.thresholds[metric] ?? 0.8;
|
|
47
|
-
},
|
|
48
|
-
};
|
|
49
|
-
}
|
package/dist/index.d.ts
DELETED
|
@@ -1,7 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* EvalForge - Automated LLM Evaluation Pipeline Generator
|
|
3
|
-
*/
|
|
4
|
-
export { UseCaseType, EvalConfig, DEFAULT_METRICS, DEFAULT_THRESHOLDS } from "./config";
|
|
5
|
-
export { BaseMetric, MetricInput, MetricOutput } from "./metrics";
|
|
6
|
-
export { EvalPipeline, TestSample } from "./pipeline";
|
|
7
|
-
export { EvalResult, MetricScore } from "./result";
|
package/dist/index.js
DELETED
|
@@ -1,14 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
-
exports.EvalPipeline = exports.BaseMetric = exports.DEFAULT_THRESHOLDS = exports.DEFAULT_METRICS = exports.UseCaseType = void 0;
|
|
4
|
-
/**
|
|
5
|
-
* EvalForge - Automated LLM Evaluation Pipeline Generator
|
|
6
|
-
*/
|
|
7
|
-
var config_1 = require("./config");
|
|
8
|
-
Object.defineProperty(exports, "UseCaseType", { enumerable: true, get: function () { return config_1.UseCaseType; } });
|
|
9
|
-
Object.defineProperty(exports, "DEFAULT_METRICS", { enumerable: true, get: function () { return config_1.DEFAULT_METRICS; } });
|
|
10
|
-
Object.defineProperty(exports, "DEFAULT_THRESHOLDS", { enumerable: true, get: function () { return config_1.DEFAULT_THRESHOLDS; } });
|
|
11
|
-
var metrics_1 = require("./metrics");
|
|
12
|
-
Object.defineProperty(exports, "BaseMetric", { enumerable: true, get: function () { return metrics_1.BaseMetric; } });
|
|
13
|
-
var pipeline_1 = require("./pipeline");
|
|
14
|
-
Object.defineProperty(exports, "EvalPipeline", { enumerable: true, get: function () { return pipeline_1.EvalPipeline; } });
|
package/dist/metrics.d.ts
DELETED
|
@@ -1,43 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* EvalForge metrics - base interface and built-in implementations.
|
|
3
|
-
*/
|
|
4
|
-
export interface MetricInput {
|
|
5
|
-
query: string;
|
|
6
|
-
response: string;
|
|
7
|
-
context?: string;
|
|
8
|
-
reference?: string;
|
|
9
|
-
}
|
|
10
|
-
export interface MetricOutput {
|
|
11
|
-
score: number;
|
|
12
|
-
passed: boolean;
|
|
13
|
-
details: Record<string, any>;
|
|
14
|
-
explanation: string;
|
|
15
|
-
}
|
|
16
|
-
export declare abstract class BaseMetric {
|
|
17
|
-
abstract name: string;
|
|
18
|
-
abstract description: string;
|
|
19
|
-
abstract category: string;
|
|
20
|
-
abstract evaluate(input: MetricInput, threshold: number): MetricOutput;
|
|
21
|
-
evaluateBatch(inputs: MetricInput[], threshold: number): MetricOutput[];
|
|
22
|
-
aggregate(outputs: MetricOutput[]): number;
|
|
23
|
-
}
|
|
24
|
-
export declare class Faithfulness extends BaseMetric {
|
|
25
|
-
name: string;
|
|
26
|
-
description: string;
|
|
27
|
-
category: string;
|
|
28
|
-
evaluate(input: MetricInput, threshold?: number): MetricOutput;
|
|
29
|
-
}
|
|
30
|
-
export declare class Toxicity extends BaseMetric {
|
|
31
|
-
name: string;
|
|
32
|
-
description: string;
|
|
33
|
-
category: string;
|
|
34
|
-
private patterns;
|
|
35
|
-
evaluate(input: MetricInput, threshold?: number): MetricOutput;
|
|
36
|
-
}
|
|
37
|
-
export declare class RougeL extends BaseMetric {
|
|
38
|
-
name: string;
|
|
39
|
-
description: string;
|
|
40
|
-
category: string;
|
|
41
|
-
evaluate(input: MetricInput, threshold?: number): MetricOutput;
|
|
42
|
-
}
|
|
43
|
-
export declare const METRIC_REGISTRY: Record<string, new () => BaseMetric>;
|
package/dist/metrics.js
DELETED
|
@@ -1,85 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
/**
|
|
3
|
-
* EvalForge metrics - base interface and built-in implementations.
|
|
4
|
-
*/
|
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.METRIC_REGISTRY = exports.RougeL = exports.Toxicity = exports.Faithfulness = exports.BaseMetric = void 0;
|
|
7
|
-
class BaseMetric {
|
|
8
|
-
evaluateBatch(inputs, threshold) {
|
|
9
|
-
return inputs.map((inp) => this.evaluate(inp, threshold));
|
|
10
|
-
}
|
|
11
|
-
aggregate(outputs) {
|
|
12
|
-
if (outputs.length === 0)
|
|
13
|
-
return 0;
|
|
14
|
-
return outputs.reduce((sum, o) => sum + o.score, 0) / outputs.length;
|
|
15
|
-
}
|
|
16
|
-
}
|
|
17
|
-
exports.BaseMetric = BaseMetric;
|
|
18
|
-
function tokenize(text) {
|
|
19
|
-
return (text.toLowerCase().match(/\b\w{3,}\b/g) || []);
|
|
20
|
-
}
|
|
21
|
-
class Faithfulness extends BaseMetric {
|
|
22
|
-
constructor() {
|
|
23
|
-
super(...arguments);
|
|
24
|
-
this.name = "faithfulness";
|
|
25
|
-
this.description = "Is the answer grounded in context?";
|
|
26
|
-
this.category = "rag";
|
|
27
|
-
}
|
|
28
|
-
evaluate(input, threshold = 0.85) {
|
|
29
|
-
if (!input.response || !input.context) {
|
|
30
|
-
return { score: 0, passed: false, details: {}, explanation: "Missing data" };
|
|
31
|
-
}
|
|
32
|
-
const responseWords = new Set(tokenize(input.response));
|
|
33
|
-
const contextWords = new Set(tokenize(input.context));
|
|
34
|
-
const overlap = [...responseWords].filter((w) => contextWords.has(w)).length;
|
|
35
|
-
const score = responseWords.size > 0 ? Math.min(overlap / (responseWords.size * 0.6), 1.0) : 0;
|
|
36
|
-
return { score: Math.round(score * 10000) / 10000, passed: score >= threshold, details: { overlap }, explanation: "" };
|
|
37
|
-
}
|
|
38
|
-
}
|
|
39
|
-
exports.Faithfulness = Faithfulness;
|
|
40
|
-
class Toxicity extends BaseMetric {
|
|
41
|
-
constructor() {
|
|
42
|
-
super(...arguments);
|
|
43
|
-
this.name = "toxicity";
|
|
44
|
-
this.description = "Detects toxic content";
|
|
45
|
-
this.category = "safety";
|
|
46
|
-
this.patterns = [/\b(hate|kill|stupid|idiot|moron)\b/gi];
|
|
47
|
-
}
|
|
48
|
-
evaluate(input, threshold = 0.05) {
|
|
49
|
-
if (!input.response)
|
|
50
|
-
return { score: 0, passed: true, details: {}, explanation: "Clean" };
|
|
51
|
-
let matches = 0;
|
|
52
|
-
for (const p of this.patterns) {
|
|
53
|
-
matches += (input.response.match(p) || []).length;
|
|
54
|
-
}
|
|
55
|
-
const score = Math.min(matches / 5, 1.0);
|
|
56
|
-
return { score, passed: score <= threshold, details: { matches }, explanation: "" };
|
|
57
|
-
}
|
|
58
|
-
}
|
|
59
|
-
exports.Toxicity = Toxicity;
|
|
60
|
-
class RougeL extends BaseMetric {
|
|
61
|
-
constructor() {
|
|
62
|
-
super(...arguments);
|
|
63
|
-
this.name = "rouge_l";
|
|
64
|
-
this.description = "Longest common subsequence overlap";
|
|
65
|
-
this.category = "text";
|
|
66
|
-
}
|
|
67
|
-
evaluate(input, threshold = 0.70) {
|
|
68
|
-
if (!input.response || !input.reference) {
|
|
69
|
-
return { score: 0, passed: false, details: {}, explanation: "Missing data" };
|
|
70
|
-
}
|
|
71
|
-
const respWords = tokenize(input.response);
|
|
72
|
-
const refWords = tokenize(input.reference);
|
|
73
|
-
const overlap = new Set(respWords.filter((w) => refWords.includes(w))).size;
|
|
74
|
-
const precision = respWords.length > 0 ? overlap / respWords.length : 0;
|
|
75
|
-
const recall = refWords.length > 0 ? overlap / refWords.length : 0;
|
|
76
|
-
const score = precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
|
|
77
|
-
return { score: Math.round(score * 10000) / 10000, passed: score >= threshold, details: {}, explanation: "" };
|
|
78
|
-
}
|
|
79
|
-
}
|
|
80
|
-
exports.RougeL = RougeL;
|
|
81
|
-
exports.METRIC_REGISTRY = {
|
|
82
|
-
faithfulness: Faithfulness,
|
|
83
|
-
toxicity: Toxicity,
|
|
84
|
-
rouge_l: RougeL,
|
|
85
|
-
};
|
package/dist/pipeline.d.ts
DELETED
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* EvalForge pipeline - orchestrates metric execution.
|
|
3
|
-
*/
|
|
4
|
-
import { EvalConfig } from "./config";
|
|
5
|
-
import { EvalResult } from "./result";
|
|
6
|
-
export interface TestSample {
|
|
7
|
-
query: string;
|
|
8
|
-
response: string;
|
|
9
|
-
context?: string;
|
|
10
|
-
reference?: string;
|
|
11
|
-
category?: string;
|
|
12
|
-
}
|
|
13
|
-
export declare class EvalPipeline {
|
|
14
|
-
private config;
|
|
15
|
-
private metrics;
|
|
16
|
-
constructor(config: EvalConfig);
|
|
17
|
-
static forUseCase(useCase: string, projectName?: string): EvalPipeline;
|
|
18
|
-
private loadMetrics;
|
|
19
|
-
run(samples?: TestSample[], metricFilter?: string[]): EvalResult;
|
|
20
|
-
listMetrics(): string[];
|
|
21
|
-
private defaultSamples;
|
|
22
|
-
}
|
package/dist/pipeline.js
DELETED
|
@@ -1,72 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
/**
|
|
3
|
-
* EvalForge pipeline - orchestrates metric execution.
|
|
4
|
-
*/
|
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.EvalPipeline = void 0;
|
|
7
|
-
const config_1 = require("./config");
|
|
8
|
-
const metrics_1 = require("./metrics");
|
|
9
|
-
const result_1 = require("./result");
|
|
10
|
-
class EvalPipeline {
|
|
11
|
-
constructor(config) {
|
|
12
|
-
this.metrics = [];
|
|
13
|
-
this.config = config;
|
|
14
|
-
this.loadMetrics();
|
|
15
|
-
}
|
|
16
|
-
static forUseCase(useCase, projectName = "evaluation") {
|
|
17
|
-
return new EvalPipeline((0, config_1.createConfig)(useCase, projectName));
|
|
18
|
-
}
|
|
19
|
-
loadMetrics() {
|
|
20
|
-
for (const name of this.config.metrics) {
|
|
21
|
-
const MetricClass = metrics_1.METRIC_REGISTRY[name];
|
|
22
|
-
if (MetricClass) {
|
|
23
|
-
this.metrics.push(new MetricClass());
|
|
24
|
-
}
|
|
25
|
-
}
|
|
26
|
-
}
|
|
27
|
-
run(samples, metricFilter) {
|
|
28
|
-
const start = Date.now();
|
|
29
|
-
const testSamples = samples || this.defaultSamples();
|
|
30
|
-
const metricsToRun = metricFilter
|
|
31
|
-
? this.metrics.filter((m) => metricFilter.includes(m.name))
|
|
32
|
-
: this.metrics;
|
|
33
|
-
const scores = [];
|
|
34
|
-
for (const metric of metricsToRun) {
|
|
35
|
-
const threshold = this.config.getThreshold(metric.name);
|
|
36
|
-
const metricStart = Date.now();
|
|
37
|
-
const outputs = testSamples.map((sample) => metric.evaluate({ query: sample.query, response: sample.response, context: sample.context, reference: sample.reference }, threshold));
|
|
38
|
-
const avgScore = outputs.length > 0 ? outputs.reduce((s, o) => s + o.score, 0) / outputs.length : 0;
|
|
39
|
-
const isSafety = metric.category === "safety";
|
|
40
|
-
const passed = isSafety ? avgScore <= threshold : avgScore >= threshold;
|
|
41
|
-
scores.push({
|
|
42
|
-
name: metric.name,
|
|
43
|
-
score: Math.round(avgScore * 10000) / 10000,
|
|
44
|
-
threshold,
|
|
45
|
-
passed,
|
|
46
|
-
samplesEvaluated: outputs.length,
|
|
47
|
-
latencyMs: Date.now() - metricStart,
|
|
48
|
-
});
|
|
49
|
-
}
|
|
50
|
-
return (0, result_1.createResult)(this.config.projectName, this.config.useCaseType, scores, testSamples.length, Date.now() - start, `${this.config.model.provider}/${this.config.model.modelId}`);
|
|
51
|
-
}
|
|
52
|
-
listMetrics() {
|
|
53
|
-
return this.metrics.map((m) => m.name);
|
|
54
|
-
}
|
|
55
|
-
defaultSamples() {
|
|
56
|
-
return [
|
|
57
|
-
{
|
|
58
|
-
query: "What is the return policy?",
|
|
59
|
-
response: "Our return policy allows returns within 30 days with a valid receipt.",
|
|
60
|
-
context: "Return Policy: Customers may return items within 30 days. A valid receipt is required.",
|
|
61
|
-
reference: "Returns accepted within 30 days with receipt.",
|
|
62
|
-
},
|
|
63
|
-
{
|
|
64
|
-
query: "How do I reset my password?",
|
|
65
|
-
response: "Go to Settings > Security > Reset Password.",
|
|
66
|
-
context: "Password Reset: Navigate to Settings, then Security, then Reset Password.",
|
|
67
|
-
reference: "Go to Settings > Security > Reset Password.",
|
|
68
|
-
},
|
|
69
|
-
];
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
exports.EvalPipeline = EvalPipeline;
|
package/dist/result.d.ts
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* EvalForge result models.
|
|
3
|
-
*/
|
|
4
|
-
export interface MetricScore {
|
|
5
|
-
name: string;
|
|
6
|
-
score: number;
|
|
7
|
-
threshold: number;
|
|
8
|
-
passed: boolean;
|
|
9
|
-
samplesEvaluated: number;
|
|
10
|
-
latencyMs: number;
|
|
11
|
-
}
|
|
12
|
-
export interface EvalResult {
|
|
13
|
-
projectName: string;
|
|
14
|
-
useCaseType: string;
|
|
15
|
-
timestamp: number;
|
|
16
|
-
scores: MetricScore[];
|
|
17
|
-
totalSamples: number;
|
|
18
|
-
totalLatencyMs: number;
|
|
19
|
-
model: string;
|
|
20
|
-
allPassing: boolean;
|
|
21
|
-
overallScore: number;
|
|
22
|
-
passRate: number;
|
|
23
|
-
}
|
|
24
|
-
export declare function createResult(projectName: string, useCaseType: string, scores: MetricScore[], totalSamples: number, totalLatencyMs: number, model: string): EvalResult;
|
|
25
|
-
export declare function summarizeResult(result: EvalResult): string;
|
package/dist/result.js
DELETED
|
@@ -1,40 +0,0 @@
|
|
|
1
|
-
"use strict";
|
|
2
|
-
/**
|
|
3
|
-
* EvalForge result models.
|
|
4
|
-
*/
|
|
5
|
-
Object.defineProperty(exports, "__esModule", { value: true });
|
|
6
|
-
exports.createResult = createResult;
|
|
7
|
-
exports.summarizeResult = summarizeResult;
|
|
8
|
-
function createResult(projectName, useCaseType, scores, totalSamples, totalLatencyMs, model) {
|
|
9
|
-
const allPassing = scores.every((s) => s.passed);
|
|
10
|
-
const overallScore = scores.length > 0 ? scores.reduce((sum, s) => sum + s.score, 0) / scores.length : 0;
|
|
11
|
-
const passRate = scores.length > 0 ? scores.filter((s) => s.passed).length / scores.length : 0;
|
|
12
|
-
return {
|
|
13
|
-
projectName,
|
|
14
|
-
useCaseType,
|
|
15
|
-
timestamp: Date.now() / 1000,
|
|
16
|
-
scores,
|
|
17
|
-
totalSamples,
|
|
18
|
-
totalLatencyMs,
|
|
19
|
-
model,
|
|
20
|
-
allPassing,
|
|
21
|
-
overallScore: Math.round(overallScore * 10000) / 10000,
|
|
22
|
-
passRate: Math.round(passRate * 10000) / 10000,
|
|
23
|
-
};
|
|
24
|
-
}
|
|
25
|
-
function summarizeResult(result) {
|
|
26
|
-
const status = result.allPassing ? "PASS" : "FAIL";
|
|
27
|
-
const lines = [
|
|
28
|
-
`EvalForge Results: ${result.projectName}`,
|
|
29
|
-
`Status: ${status} (${result.scores.filter((s) => s.passed).length}/${result.scores.length} metrics passing)`,
|
|
30
|
-
`Use case: ${result.useCaseType}`,
|
|
31
|
-
`Samples: ${result.totalSamples}`,
|
|
32
|
-
"",
|
|
33
|
-
"Metrics:",
|
|
34
|
-
];
|
|
35
|
-
for (const s of result.scores) {
|
|
36
|
-
const icon = s.passed ? "✓" : "✗";
|
|
37
|
-
lines.push(` ${icon} ${s.name}: ${s.score.toFixed(4)} (threshold: ${s.threshold})`);
|
|
38
|
-
}
|
|
39
|
-
return lines.join("\n");
|
|
40
|
-
}
|