judgeval 0.1.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +202 -0
- package/README.md +340 -0
- package/dist/clients.d.ts +7 -0
- package/dist/clients.js +78 -0
- package/dist/clients.js.map +1 -0
- package/dist/common/integrations/langgraph.d.ts +40 -0
- package/dist/common/integrations/langgraph.js +444 -0
- package/dist/common/integrations/langgraph.js.map +1 -0
- package/dist/common/logger-instance.d.ts +3 -0
- package/dist/common/logger-instance.js +64 -0
- package/dist/common/logger-instance.js.map +1 -0
- package/dist/common/logger.d.ts +54 -0
- package/dist/common/logger.js +221 -0
- package/dist/common/logger.js.map +1 -0
- package/dist/common/tracer.d.ts +205 -0
- package/dist/common/tracer.js +1035 -0
- package/dist/common/tracer.js.map +1 -0
- package/dist/constants.d.ts +51 -0
- package/dist/constants.js +344 -0
- package/dist/constants.js.map +1 -0
- package/dist/data/example.d.ts +70 -0
- package/dist/data/example.js +125 -0
- package/dist/data/example.js.map +1 -0
- package/dist/data/result.d.ts +51 -0
- package/dist/data/result.js +83 -0
- package/dist/data/result.js.map +1 -0
- package/dist/evaluation-run.d.ts +44 -0
- package/dist/evaluation-run.js +136 -0
- package/dist/evaluation-run.js.map +1 -0
- package/dist/index.d.ts +10 -0
- package/dist/index.js +73 -0
- package/dist/index.js.map +1 -0
- package/dist/judgment-client.d.ts +179 -0
- package/dist/judgment-client.js +1038 -0
- package/dist/judgment-client.js.map +1 -0
- package/dist/rules.d.ts +120 -0
- package/dist/rules.js +322 -0
- package/dist/rules.js.map +1 -0
- package/dist/run-evaluation.d.ts +78 -0
- package/dist/run-evaluation.js +618 -0
- package/dist/run-evaluation.js.map +1 -0
- package/dist/scorers/api-scorer.d.ts +79 -0
- package/dist/scorers/api-scorer.js +291 -0
- package/dist/scorers/api-scorer.js.map +1 -0
- package/dist/scorers/base-scorer.d.ts +100 -0
- package/dist/scorers/base-scorer.js +190 -0
- package/dist/scorers/base-scorer.js.map +1 -0
- package/dist/scorers/exact-match-scorer.d.ts +10 -0
- package/dist/scorers/exact-match-scorer.js +84 -0
- package/dist/scorers/exact-match-scorer.js.map +1 -0
- package/package.json +88 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
import { Example } from './data/example';
|
|
2
|
+
import { ScoringResult } from './data/result';
|
|
3
|
+
import { APIJudgmentScorer, JudgevalScorer, ScorerWrapper } from './scorers/base-scorer';
|
|
4
|
+
import { Rule } from './rules';
|
|
5
|
+
/**
|
|
6
|
+
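* Singleton-style client for the Judgment platform: use getInstance() for the shared instance (note the constructor is also declared public, so the type does not prevent direct instantiation)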
|
|
7
|
+
*/
|
|
8
|
+
export declare class JudgmentClient {
|
|
9
|
+
private static instance;
|
|
10
|
+
private judgmentApiKey;
|
|
11
|
+
private organizationId;
|
|
12
|
+
/**
|
|
13
|
+
* Get the singleton instance of JudgmentClient
|
|
14
|
+
*/
|
|
15
|
+
static getInstance(judgmentApiKey?: string, organizationId?: string): JudgmentClient;
|
|
16
|
+
/**
|
|
17
|
+
* Constructor for JudgmentClient
|
|
18
|
+
* @param judgmentApiKey The Judgment API key
|
|
19
|
+
* @param organizationId The organization ID
|
|
20
|
+
*/
|
|
21
|
+
constructor(judgmentApiKey?: string, organizationId?: string);
|
|
22
|
+
/**
|
|
23
|
+
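* Run an evaluation asynchronously. Unlike runEvaluation, this signature has no asyncExecution parameter, so `rules` is the 12th positional argument here (13th in runEvaluation).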
|
|
24
|
+
*/
|
|
25
|
+
aRunEvaluation(examples: Example[], scorers: Array<ScorerWrapper | JudgevalScorer | APIJudgmentScorer>, model: string | string[] | any, aggregator?: string, metadata?: Record<string, any>, logResults?: boolean, projectName?: string, evalRunName?: string, override?: boolean, useJudgment?: boolean, ignoreErrors?: boolean, rules?: Rule[]): Promise<ScoringResult[]>;
|
|
26
|
+
/**
|
|
27
|
+
* Run an evaluation using positional arguments (evaluate() takes the corresponding options as a single config object)
|
|
28
|
+
*/
|
|
29
|
+
runEvaluation(examples: Example[], scorers: Array<ScorerWrapper | JudgevalScorer | APIJudgmentScorer>, model: string | string[] | any, aggregator?: string, metadata?: Record<string, any>, logResults?: boolean, projectName?: string, evalRunName?: string, override?: boolean, useJudgment?: boolean, ignoreErrors?: boolean, asyncExecution?: boolean, rules?: Rule[]): Promise<ScoringResult[]>;
|
|
30
|
+
/**
|
|
31
|
+
* Run an evaluation with a simplified interface (recommended)
|
|
32
|
+
* @param config Configuration object for the evaluation
|
|
33
|
+
* @returns Promise<ScoringResult[]> The evaluation results
|
|
34
|
+
*/
|
|
35
|
+
evaluate(config: {
|
|
36
|
+
examples: Example[];
|
|
37
|
+
scorers: Array<ScorerWrapper | JudgevalScorer | APIJudgmentScorer>;
|
|
38
|
+
model?: string | string[] | any;
|
|
39
|
+
aggregator?: string;
|
|
40
|
+
metadata?: Record<string, any>;
|
|
41
|
+
projectName?: string;
|
|
42
|
+
evalName?: string;
|
|
43
|
+
logResults?: boolean;
|
|
44
|
+
useJudgment?: boolean;
|
|
45
|
+
ignoreErrors?: boolean;
|
|
46
|
+
asyncExecution?: boolean;
|
|
47
|
+
rules?: Rule[];
|
|
48
|
+
override?: boolean;
|
|
49
|
+
}): Promise<ScoringResult[]>;
|
|
50
|
+
/**
|
|
51
|
+
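* Evaluate a dataset. Note: `scorers` is typed here as ScorerWrapper | JudgevalScorer; APIJudgmentScorer is not part of the union, unlike runEvaluation and evaluate.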
|
|
52
|
+
*/
|
|
53
|
+
evaluateDataset(dataset: any, // EvalDataset would be implemented separately
|
|
54
|
+
scorers: Array<ScorerWrapper | JudgevalScorer>, model: string | string[] | any, aggregator?: string, metadata?: Record<string, any>, projectName?: string, evalRunName?: string, logResults?: boolean, useJudgment?: boolean, rules?: Rule[]): Promise<ScoringResult[]>;
|
|
55
|
+
/**
|
|
56
|
+
* Create a dataset
|
|
57
|
+
*/
|
|
58
|
+
createDataset(): any;
|
|
59
|
+
/**
|
|
60
|
+
* Push a dataset to the Judgment platform
|
|
61
|
+
*/
|
|
62
|
+
pushDataset(alias: string, dataset: any, projectName: string, overwrite?: boolean): Promise<boolean>;
|
|
63
|
+
/**
|
|
64
|
+
* Pull a dataset from the Judgment platform
|
|
65
|
+
*/
|
|
66
|
+
pullDataset(alias: string, projectName: string): Promise<any>;
|
|
67
|
+
/**
|
|
68
|
+
* Delete a dataset from the Judgment platform
|
|
69
|
+
*/
|
|
70
|
+
deleteDataset(alias: string, projectName: string): Promise<boolean>;
|
|
71
|
+
/**
|
|
72
|
+
* Pull project dataset stats from the Judgment platform
|
|
73
|
+
*/
|
|
74
|
+
pullProjectDatasetStats(projectName: string): Promise<Record<string, any>>;
|
|
75
|
+
/**
|
|
76
|
+
* Insert examples into a dataset on the Judgment platform
|
|
77
|
+
*/
|
|
78
|
+
insertDataset(alias: string, examples: Example[], projectName: string): Promise<boolean>;
|
|
79
|
+
/**
|
|
80
|
+
* Pull evaluation results from the server
|
|
81
|
+
* @param projectName Name of the project
|
|
82
|
+
* @param evalRunName Name of the evaluation run
|
|
83
|
+
* @returns Array containing one object with 'id' and 'results' (list of ScoringResult)
|
|
84
|
+
*/
|
|
85
|
+
pullEval(projectName: string, evalRunName: string): Promise<Array<Record<string, any | ScoringResult[]>>>;
|
|
86
|
+
/**
|
|
87
|
+
* Get evaluation run results (alias for pullEval with a more intuitive name)
|
|
88
|
+
* @param projectName Name of the project
|
|
89
|
+
* @param evalRunName Name of the evaluation run
|
|
90
|
+
* @returns Array containing one object with 'id' and 'results' (list of ScoringResult)
|
|
91
|
+
*/
|
|
92
|
+
getEvalRun(projectName: string, evalRunName: string): Promise<Array<Record<string, any | ScoringResult[]>>>;
|
|
93
|
+
/**
|
|
94
|
+
* List all evaluation runs for a project
|
|
95
|
+
* @param projectName Name of the project
|
|
96
|
+
* @param limit Maximum number of evaluation runs to return (default: 100)
|
|
97
|
+
* @param offset Offset for pagination (default: 0)
|
|
98
|
+
* @returns List of evaluation run metadata
|
|
99
|
+
*/
|
|
100
|
+
listEvalRuns(projectName: string, limit?: number, offset?: number): Promise<Array<Record<string, any>>>;
|
|
101
|
+
/**
|
|
102
|
+
* Get evaluation run statistics
|
|
103
|
+
* @param projectName Name of the project
|
|
104
|
+
* @param evalRunName Name of the evaluation run
|
|
105
|
+
* @returns Statistics for the evaluation run
|
|
106
|
+
*/
|
|
107
|
+
getEvalRunStats(projectName: string, evalRunName: string): Promise<Record<string, any>>;
|
|
108
|
+
/**
|
|
109
|
+
* Export evaluation results as a string serialized in the requested format
|
|
110
|
+
* @param projectName Name of the project
|
|
111
|
+
* @param evalRunName Name of the evaluation run
|
|
112
|
+
* @param format Export format ('json' or 'csv')
|
|
113
|
+
* @returns The exported data as a string
|
|
114
|
+
*/
|
|
115
|
+
exportEvalResults(projectName: string, evalRunName: string, format?: 'json' | 'csv'): Promise<string>;
|
|
116
|
+
/**
|
|
117
|
+
* Delete one or more evaluation runs, identified by name, from the server
|
|
118
|
+
*/
|
|
119
|
+
deleteEval(projectName: string, evalRunNames: string[]): Promise<boolean>;
|
|
120
|
+
/**
|
|
121
|
+
* Delete all evaluations from the server for a given project
|
|
122
|
+
*/
|
|
123
|
+
deleteProjectEvals(projectName: string): Promise<boolean>;
|
|
124
|
+
/**
|
|
125
|
+
* Create a project on the server
|
|
126
|
+
*/
|
|
127
|
+
createProject(projectName: string): Promise<boolean>;
|
|
128
|
+
/**
|
|
129
|
+
* Delete a project from the server
|
|
130
|
+
*/
|
|
131
|
+
deleteProject(projectName: string): Promise<boolean>;
|
|
132
|
+
/**
|
|
133
|
+
* Check whether the user's API key is valid
|
|
134
|
+
*/
|
|
135
|
+
private validateApiKey;
|
|
136
|
+
/**
|
|
137
|
+
* Assert a test by running the evaluation and checking the results for success
|
|
138
|
+
*/
|
|
139
|
+
assertTest(examples: Example[], scorers: Array<APIJudgmentScorer | JudgevalScorer>, // Type matches Python's intent
|
|
140
|
+
model: string | string[] | any, aggregator?: string, metadata?: Record<string, any>, logResults?: boolean, projectName?: string, evalRunName?: string, override?: boolean, rules?: Rule[]): Promise<void>;
|
|
141
|
+
/**
|
|
142
|
+
* Pull the results of an evaluation run. Matches `pullEval` logic but returns only the ScoringResult array.
|
|
143
|
+
* @param projectName The name of the project
|
|
144
|
+
* @param evalRunName The name of the evaluation run
|
|
145
|
+
* @returns The results of the evaluation run as ScoringResult[] or empty array on error/no results.
|
|
146
|
+
*/
|
|
147
|
+
pullEvalResults(projectName: string, evalRunName: string): Promise<ScoringResult[]>;
|
|
148
|
+
/**
|
|
149
|
+
* Check the status of an evaluation run using the fetch endpoint.
|
|
150
|
+
* This is a heuristic approach as the endpoint might return full results or status info.
|
|
151
|
+
* @param projectName The name of the project
|
|
152
|
+
* @param evalRunName The name of the evaluation run
|
|
153
|
+
* @returns An object representing the status { status: string, progress: number, message: string, error?: string }
|
|
154
|
+
*/
|
|
155
|
+
checkEvalStatus(projectName: string, evalRunName: string): Promise<{
|
|
156
|
+
status: string;
|
|
157
|
+
progress: number;
|
|
158
|
+
message: string;
|
|
159
|
+
error?: string;
|
|
160
|
+
}>;
|
|
161
|
+
/**
|
|
162
|
+
* Wait for an async evaluation to complete and return the results
|
|
163
|
+
* @param projectName The name of the project
|
|
164
|
+
* @param evalRunName The name of the evaluation run
|
|
165
|
+
* @param options Optional configuration for polling: intervalMs, maxAttempts, showProgress
|
|
166
|
+
* @returns The evaluation results as ScoringResult[] or empty array on timeout/failure.
|
|
167
|
+
*/
|
|
168
|
+
waitForEvaluation(projectName: string, evalRunName: string, options?: {
|
|
169
|
+
intervalMs?: number;
|
|
170
|
+
maxAttempts?: number;
|
|
171
|
+
showProgress?: boolean;
|
|
172
|
+
}): Promise<ScoringResult[]>;
|
|
173
|
+
/**
|
|
174
|
+
* Create a simple ASCII progress bar
|
|
175
|
+
* @param percent The percentage to display (0-100)
|
|
176
|
+
* @returns A string representing the progress bar
|
|
177
|
+
*/
|
|
178
|
+
private _createProgressBar;
|
|
179
|
+
}
|