judgeval 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +95 -68
- package/dist/cjs/common/tracer.js +235 -143
- package/dist/cjs/common/tracer.js.map +1 -1
- package/dist/cjs/constants.js +8 -5
- package/dist/cjs/constants.js.map +1 -1
- package/dist/cjs/data/datasets/eval-dataset-client.js +349 -0
- package/dist/cjs/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/cjs/data/datasets/eval-dataset.js +405 -0
- package/dist/cjs/data/datasets/eval-dataset.js.map +1 -0
- package/dist/cjs/data/example.js +22 -1
- package/dist/cjs/data/example.js.map +1 -1
- package/dist/cjs/e2etests/eval-operations.test.js +282 -0
- package/dist/cjs/e2etests/eval-operations.test.js.map +1 -0
- package/dist/cjs/e2etests/judgee-traces.test.js +278 -0
- package/dist/cjs/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/cjs/index.js +1 -3
- package/dist/cjs/index.js.map +1 -1
- package/dist/cjs/judgment-client.js +326 -645
- package/dist/cjs/judgment-client.js.map +1 -1
- package/dist/cjs/scorers/api-scorer.js +56 -48
- package/dist/cjs/scorers/api-scorer.js.map +1 -1
- package/dist/cjs/scorers/base-scorer.js +66 -11
- package/dist/cjs/scorers/base-scorer.js.map +1 -1
- package/dist/esm/common/tracer.js +236 -144
- package/dist/esm/common/tracer.js.map +1 -1
- package/dist/esm/constants.js +7 -4
- package/dist/esm/constants.js.map +1 -1
- package/dist/esm/data/datasets/eval-dataset-client.js +342 -0
- package/dist/esm/data/datasets/eval-dataset-client.js.map +1 -0
- package/dist/esm/data/datasets/eval-dataset.js +375 -0
- package/dist/esm/data/datasets/eval-dataset.js.map +1 -0
- package/dist/esm/data/example.js +22 -1
- package/dist/esm/data/example.js.map +1 -1
- package/dist/esm/e2etests/eval-operations.test.js +254 -0
- package/dist/esm/e2etests/eval-operations.test.js.map +1 -0
- package/dist/esm/e2etests/judgee-traces.test.js +253 -0
- package/dist/esm/e2etests/judgee-traces.test.js.map +1 -0
- package/dist/esm/index.js +0 -1
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/judgment-client.js +328 -647
- package/dist/esm/judgment-client.js.map +1 -1
- package/dist/esm/scorers/api-scorer.js +56 -48
- package/dist/esm/scorers/api-scorer.js.map +1 -1
- package/dist/esm/scorers/base-scorer.js +66 -11
- package/dist/esm/scorers/base-scorer.js.map +1 -1
- package/dist/types/common/tracer.d.ts +27 -14
- package/dist/types/constants.d.ts +4 -4
- package/dist/types/data/datasets/eval-dataset-client.d.ts +39 -0
- package/dist/types/data/datasets/eval-dataset.d.ts +45 -0
- package/dist/types/data/example.d.ts +24 -12
- package/dist/types/e2etests/eval-operations.test.d.ts +5 -0
- package/dist/types/e2etests/judgee-traces.test.d.ts +5 -0
- package/dist/types/index.d.ts +0 -1
- package/dist/types/judgment-client.d.ts +3 -47
- package/dist/types/scorers/api-scorer.d.ts +15 -15
- package/dist/types/scorers/base-scorer.d.ts +53 -10
- package/package.json +2 -1
- package/dist/cjs/scorers/exact-match-scorer.js +0 -84
- package/dist/cjs/scorers/exact-match-scorer.js.map +0 -1
- package/dist/esm/scorers/exact-match-scorer.js +0 -80
- package/dist/esm/scorers/exact-match-scorer.js.map +0 -1
- package/dist/types/scorers/exact-match-scorer.d.ts +0 -10
|
@@ -26,13 +26,15 @@ interface TraceEntry {
|
|
|
26
26
|
function: string;
|
|
27
27
|
span_id: string;
|
|
28
28
|
depth: number;
|
|
29
|
-
|
|
29
|
+
created_at: number;
|
|
30
30
|
duration?: number;
|
|
31
31
|
output?: any;
|
|
32
32
|
inputs?: Record<string, any>;
|
|
33
33
|
span_type: SpanType;
|
|
34
34
|
parent_span_id?: string;
|
|
35
35
|
evaluation_runs?: any[];
|
|
36
|
+
trace_id?: string;
|
|
37
|
+
message?: string;
|
|
36
38
|
}
|
|
37
39
|
interface TraceSavePayload {
|
|
38
40
|
trace_id: string;
|
|
@@ -49,8 +51,7 @@ interface TraceSavePayload {
|
|
|
49
51
|
total_cost_usd: number;
|
|
50
52
|
};
|
|
51
53
|
entries: CondensedSpanEntry[];
|
|
52
|
-
|
|
53
|
-
empty_save: boolean;
|
|
54
|
+
evaluation_runs: any[];
|
|
54
55
|
overwrite: boolean;
|
|
55
56
|
parent_trace_id?: string | null;
|
|
56
57
|
parent_name?: string | null;
|
|
@@ -59,15 +60,24 @@ interface CondensedSpanEntry {
|
|
|
59
60
|
span_id: string;
|
|
60
61
|
function: string;
|
|
61
62
|
depth: number;
|
|
62
|
-
|
|
63
|
+
created_at: string;
|
|
63
64
|
parent_span_id?: string | null;
|
|
64
65
|
span_type: SpanType;
|
|
65
66
|
inputs: Record<string, any> | null;
|
|
66
67
|
output: any | null;
|
|
67
|
-
evaluation_runs: any[];
|
|
68
68
|
duration: number | null;
|
|
69
|
+
trace_id?: string;
|
|
69
70
|
children?: CondensedSpanEntry[];
|
|
70
71
|
}
|
|
72
|
+
interface TokenCostResponse {
|
|
73
|
+
model: string;
|
|
74
|
+
prompt_tokens: number;
|
|
75
|
+
completion_tokens: number;
|
|
76
|
+
total_tokens: number;
|
|
77
|
+
prompt_tokens_cost_usd: number;
|
|
78
|
+
completion_tokens_cost_usd: number;
|
|
79
|
+
total_cost_usd: number;
|
|
80
|
+
}
|
|
71
81
|
/**
|
|
72
82
|
* Client for interacting with Judgment trace API endpoints.
|
|
73
83
|
*/
|
|
@@ -77,10 +87,20 @@ declare class TraceManagerClient {
|
|
|
77
87
|
constructor(apiKey: string, organizationId: string);
|
|
78
88
|
private _fetch;
|
|
79
89
|
fetchTrace(traceId: string): Promise<any>;
|
|
80
|
-
saveTrace(traceData: TraceSavePayload
|
|
90
|
+
saveTrace(traceData: TraceSavePayload): Promise<any>;
|
|
81
91
|
deleteTrace(traceId: string): Promise<any>;
|
|
82
92
|
deleteTraces(traceIds: string[]): Promise<any>;
|
|
83
93
|
addTraceToEvalQueue(traceData: TraceSavePayload): Promise<any>;
|
|
94
|
+
/**
|
|
95
|
+
* Calculate token costs directly using the API endpoint.
|
|
96
|
+
* This is more accurate than client-side calculation as it uses the most up-to-date pricing.
|
|
97
|
+
*
|
|
98
|
+
* @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
|
|
99
|
+
* @param promptTokens Number of tokens in the prompt/input
|
|
100
|
+
* @param completionTokens Number of tokens in the completion/output
|
|
101
|
+
* @returns Object containing token counts and calculated costs in USD
|
|
102
|
+
*/
|
|
103
|
+
calculateTokenCosts(model: string, promptTokens: number, completionTokens: number): Promise<TokenCostResponse | null>;
|
|
84
104
|
}
|
|
85
105
|
/**
|
|
86
106
|
* Represents an ongoing trace context.
|
|
@@ -100,6 +120,7 @@ declare class TraceClient {
|
|
|
100
120
|
private apiKey;
|
|
101
121
|
private organizationId;
|
|
102
122
|
private originalName;
|
|
123
|
+
private _spanDepths;
|
|
103
124
|
constructor(config: {
|
|
104
125
|
tracer: Tracer;
|
|
105
126
|
traceId?: string;
|
|
@@ -154,14 +175,6 @@ declare class TraceClient {
|
|
|
154
175
|
model?: string;
|
|
155
176
|
logResults?: boolean;
|
|
156
177
|
}): Promise<void>;
|
|
157
|
-
/**
|
|
158
|
-
* Private helper to add an evaluation entry to the trace.
|
|
159
|
-
* This mirrors the structure of Python's add_eval_run.
|
|
160
|
-
*
|
|
161
|
-
* @param evalRunPayload The constructed payload for the evaluation.
|
|
162
|
-
* @param startTime The start time (in seconds) of the evaluation process.
|
|
163
|
-
*/
|
|
164
|
-
private _addEvalRun;
|
|
165
178
|
getOriginalName(): string;
|
|
166
179
|
}
|
|
167
180
|
/**
|
|
@@ -21,15 +21,15 @@ export declare enum APIScorer {
|
|
|
21
21
|
}
|
|
22
22
|
export declare const UNBOUNDED_SCORERS: Set<APIScorer>;
|
|
23
23
|
export declare const ROOT_API = "https://api.judgmentlabs.ai";
|
|
24
|
-
export declare const JUDGMENT_EVAL_API_URL = "https://api.judgmentlabs.ai/evaluate/";
|
|
25
24
|
export declare const JUDGMENT_DATASETS_PUSH_API_URL = "https://api.judgmentlabs.ai/datasets/push/";
|
|
26
25
|
export declare const JUDGMENT_DATASETS_PULL_API_URL = "https://api.judgmentlabs.ai/datasets/pull/";
|
|
27
26
|
export declare const JUDGMENT_DATASETS_DELETE_API_URL = "https://api.judgmentlabs.ai/datasets/delete/";
|
|
28
27
|
export declare const JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = "https://api.judgmentlabs.ai/datasets/export_jsonl/";
|
|
29
28
|
export declare const JUDGMENT_DATASETS_PROJECT_STATS_API_URL = "https://api.judgmentlabs.ai/datasets/fetch_stats_by_project/";
|
|
30
29
|
export declare const JUDGMENT_DATASETS_INSERT_API_URL = "https://api.judgmentlabs.ai/datasets/insert_examples/";
|
|
31
|
-
export declare const JUDGMENT_EVAL_LOG_API_URL = "https://api.judgmentlabs.ai/log_eval_results/";
|
|
32
30
|
export declare const JUDGMENT_EVAL_FETCH_API_URL = "https://api.judgmentlabs.ai/fetch_eval_results/";
|
|
31
|
+
export declare const JUDGMENT_EVAL_API_URL = "https://api.judgmentlabs.ai/evaluate/";
|
|
32
|
+
export declare const JUDGMENT_EVAL_LOG_API_URL = "https://api.judgmentlabs.ai/log_eval_results/";
|
|
33
33
|
export declare const JUDGMENT_EVAL_DELETE_API_URL = "https://api.judgmentlabs.ai/delete_eval_results_by_project_and_run_names/";
|
|
34
34
|
export declare const JUDGMENT_EVAL_DELETE_PROJECT_API_URL = "https://api.judgmentlabs.ai/delete_eval_results_by_project/";
|
|
35
35
|
export declare const JUDGMENT_PROJECT_DELETE_API_URL = "https://api.judgmentlabs.ai/projects/delete/";
|
|
@@ -37,8 +37,8 @@ export declare const JUDGMENT_PROJECT_CREATE_API_URL = "https://api.judgmentlabs
|
|
|
37
37
|
export declare const JUDGMENT_TRACES_FETCH_API_URL = "https://api.judgmentlabs.ai/traces/fetch/";
|
|
38
38
|
export declare const JUDGMENT_TRACES_SAVE_API_URL = "https://api.judgmentlabs.ai/traces/save/";
|
|
39
39
|
export declare const JUDGMENT_TRACES_DELETE_API_URL = "https://api.judgmentlabs.ai/traces/delete/";
|
|
40
|
-
export declare const JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = "https://api.judgmentlabs.ai/traces/
|
|
41
|
-
export declare const
|
|
40
|
+
export declare const JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = "https://api.judgmentlabs.ai/traces/add_to_eval_queue/";
|
|
41
|
+
export declare const JUDGMENT_CALCULATE_TOKEN_COSTS_API_URL = "https://api.judgmentlabs.ai/calculate-token-costs";
|
|
42
42
|
export declare const JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = "https://api.judgmentlabs.ai/add_to_run_eval_queue/";
|
|
43
43
|
export declare const RABBITMQ_HOST: string;
|
|
44
44
|
export declare const RABBITMQ_PORT: number;
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { Example } from '../example.js';
|
|
2
|
+
import { EvalDataset } from './eval-dataset.js';
|
|
3
|
+
import { AxiosResponse } from 'axios';
|
|
4
|
+
export declare class EvalDatasetClient {
|
|
5
|
+
private judgmentApiKey;
|
|
6
|
+
private organizationId;
|
|
7
|
+
constructor(judgmentApiKey: string, organizationId: string);
|
|
8
|
+
createDataset(examples?: Example[]): EvalDataset;
|
|
9
|
+
/**
|
|
10
|
+
* Pushes the dataset to the Judgment platform.
|
|
11
|
+
* @returns True if successful, false otherwise.
|
|
12
|
+
*/
|
|
13
|
+
pushDataset(dataset: EvalDataset, alias: string, projectName: string, overwrite?: boolean): Promise<boolean>;
|
|
14
|
+
/**
|
|
15
|
+
* Pulls the dataset from the Judgment platform.
|
|
16
|
+
*/
|
|
17
|
+
pullDataset(alias: string, projectName: string): Promise<EvalDataset>;
|
|
18
|
+
/**
|
|
19
|
+
* Deletes the dataset from the Judgment platform.
|
|
20
|
+
* @returns True if successful, false otherwise.
|
|
21
|
+
*/
|
|
22
|
+
deleteDataset(alias: string, projectName: string): Promise<boolean>;
|
|
23
|
+
/**
|
|
24
|
+
* Pulls dataset statistics for a project from the Judgment platform.
|
|
25
|
+
*/
|
|
26
|
+
pullProjectDatasetStats(projectName: string): Promise<Record<string, any>>;
|
|
27
|
+
/**
|
|
28
|
+
* Inserts new examples into an existing dataset on the Judgment platform.
|
|
29
|
+
* @returns True if successful, false otherwise.
|
|
30
|
+
*/
|
|
31
|
+
insertDataset(alias: string, examples: Example[], projectName: string): Promise<boolean>;
|
|
32
|
+
/**
|
|
33
|
+
* Exports a dataset in JSONL format from the Judgment platform.
|
|
34
|
+
* @returns AxiosResponse containing the stream if successful.
|
|
35
|
+
*/
|
|
36
|
+
exportJsonl(alias: string, projectName: string): Promise<AxiosResponse>;
|
|
37
|
+
private getAuthHeaders;
|
|
38
|
+
private handleApiError;
|
|
39
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { Example, ExampleOptions } from '../example.js';
|
|
2
|
+
type SaveFileType = 'json' | 'csv' | 'yaml';
|
|
3
|
+
export declare class EvalDataset {
|
|
4
|
+
examples: Example[];
|
|
5
|
+
private _alias;
|
|
6
|
+
private _id;
|
|
7
|
+
constructor(examples?: Example[]);
|
|
8
|
+
addExample(e: Example): void;
|
|
9
|
+
get length(): number;
|
|
10
|
+
get alias(): string | null;
|
|
11
|
+
set alias(value: string | null);
|
|
12
|
+
get id(): string | null;
|
|
13
|
+
set id(value: string | null);
|
|
14
|
+
/**
|
|
15
|
+
* Adds examples from a JSON file.
|
|
16
|
+
* Assumes the JSON file has a top-level key "examples" containing an array of example objects.
|
|
17
|
+
* @param filePath Path to the JSON file.
|
|
18
|
+
*/
|
|
19
|
+
addFromJson(filePath: string): void;
|
|
20
|
+
/**
|
|
21
|
+
* Adds examples from a YAML file.
|
|
22
|
+
* Assumes the YAML file has a top-level key "examples" containing an array of example objects.
|
|
23
|
+
* @param filePath Path to the YAML file.
|
|
24
|
+
*/
|
|
25
|
+
addFromYaml(filePath: string): void;
|
|
26
|
+
/**
|
|
27
|
+
* Adds examples from a CSV file.
|
|
28
|
+
* @param filePath Path to the CSV file.
|
|
29
|
+
* @param headerMapping Dictionary mapping Example headers (keys) to custom headers in the CSV (values).
|
|
30
|
+
* @param primaryDelimiter Main delimiter used in CSV file. Defaults to ",".
|
|
31
|
+
* @param secondaryDelimiter Secondary delimiter for list fields (context, retrieval_context, etc.). Defaults to ";".
|
|
32
|
+
*/
|
|
33
|
+
addFromCsv(filePath: string, headerMapping: {
|
|
34
|
+
[key in keyof ExampleOptions]?: string;
|
|
35
|
+
}, primaryDelimiter?: string, secondaryDelimiter?: string): void;
|
|
36
|
+
/**
|
|
37
|
+
* Saves the dataset as a file.
|
|
38
|
+
* @param fileType The file type to save as ('json', 'csv', 'yaml').
|
|
39
|
+
* @param dirPath The directory path to save the file to.
|
|
40
|
+
* @param saveName Optional: The name of the file (without extension). Defaults to a timestamp.
|
|
41
|
+
* @param secondaryDelimiter Optional: The delimiter used for joining list fields in CSV output. Defaults to ";".
|
|
42
|
+
*/
|
|
43
|
+
saveAs(fileType: SaveFileType, dirPath: string, saveName?: string, secondaryDelimiter?: string): void;
|
|
44
|
+
}
|
|
45
|
+
export {};
|
|
@@ -3,29 +3,35 @@
|
|
|
3
3
|
*/
|
|
4
4
|
export interface ExampleOptions {
|
|
5
5
|
input: string;
|
|
6
|
-
actualOutput?: string;
|
|
7
|
-
expectedOutput?: string;
|
|
6
|
+
actualOutput?: string | string[];
|
|
7
|
+
expectedOutput?: string | string[];
|
|
8
8
|
context?: string[];
|
|
9
9
|
retrievalContext?: string[];
|
|
10
10
|
additionalMetadata?: Record<string, any>;
|
|
11
|
-
toolsCalled?:
|
|
12
|
-
expectedTools?:
|
|
11
|
+
toolsCalled?: string[];
|
|
12
|
+
expectedTools?: string[];
|
|
13
|
+
name?: string;
|
|
13
14
|
exampleId?: string;
|
|
14
15
|
exampleIndex?: number;
|
|
15
16
|
timestamp?: string;
|
|
17
|
+
traceId?: string;
|
|
18
|
+
example?: boolean;
|
|
16
19
|
}
|
|
17
20
|
export declare class Example {
|
|
18
21
|
input: string;
|
|
19
|
-
actualOutput?: string;
|
|
20
|
-
expectedOutput?: string;
|
|
22
|
+
actualOutput?: string | string[];
|
|
23
|
+
expectedOutput?: string | string[];
|
|
21
24
|
context?: string[];
|
|
22
25
|
retrievalContext?: string[];
|
|
23
26
|
additionalMetadata?: Record<string, any>;
|
|
24
|
-
toolsCalled?:
|
|
25
|
-
expectedTools?:
|
|
27
|
+
toolsCalled?: string[];
|
|
28
|
+
expectedTools?: string[];
|
|
29
|
+
name?: string;
|
|
26
30
|
exampleId: string;
|
|
27
31
|
exampleIndex?: number;
|
|
28
32
|
timestamp?: string;
|
|
33
|
+
traceId?: string;
|
|
34
|
+
example?: boolean;
|
|
29
35
|
constructor(options: ExampleOptions);
|
|
30
36
|
/**
|
|
31
37
|
* Generate a UUID for the example ID
|
|
@@ -52,19 +58,25 @@ export declare class ExampleBuilder {
|
|
|
52
58
|
private _additionalMetadata?;
|
|
53
59
|
private _toolsCalled?;
|
|
54
60
|
private _expectedTools?;
|
|
61
|
+
private _name?;
|
|
55
62
|
private _exampleId?;
|
|
56
63
|
private _exampleIndex?;
|
|
57
64
|
private _timestamp?;
|
|
65
|
+
private _traceId?;
|
|
66
|
+
private _example?;
|
|
58
67
|
input(input: string): ExampleBuilder;
|
|
59
|
-
actualOutput(actualOutput: string): ExampleBuilder;
|
|
60
|
-
expectedOutput(expectedOutput: string): ExampleBuilder;
|
|
68
|
+
actualOutput(actualOutput: string | string[]): ExampleBuilder;
|
|
69
|
+
expectedOutput(expectedOutput: string | string[]): ExampleBuilder;
|
|
61
70
|
context(context: string[]): ExampleBuilder;
|
|
62
71
|
retrievalContext(retrievalContext: string[]): ExampleBuilder;
|
|
63
72
|
additionalMetadata(additionalMetadata: Record<string, any>): ExampleBuilder;
|
|
64
|
-
toolsCalled(toolsCalled:
|
|
65
|
-
expectedTools(expectedTools:
|
|
73
|
+
toolsCalled(toolsCalled: string[]): ExampleBuilder;
|
|
74
|
+
expectedTools(expectedTools: string[]): ExampleBuilder;
|
|
75
|
+
name(name: string): ExampleBuilder;
|
|
66
76
|
exampleId(exampleId: string): ExampleBuilder;
|
|
67
77
|
exampleIndex(exampleIndex: number): ExampleBuilder;
|
|
68
78
|
timestamp(timestamp: string): ExampleBuilder;
|
|
79
|
+
traceId(traceId: string): ExampleBuilder;
|
|
80
|
+
example(example: boolean): ExampleBuilder;
|
|
69
81
|
build(): Example;
|
|
70
82
|
}
|
package/dist/types/index.d.ts
CHANGED
|
@@ -3,7 +3,6 @@ export { ScoringResult, ScoringResultBuilder, ScorerData, ScoringResultOptions }
|
|
|
3
3
|
export { Tracer, SpanType, wrap, TraceClient } from './common/tracer.js';
|
|
4
4
|
export { Scorer, APIJudgmentScorer, JudgevalScorer, ScorerWrapper } from './scorers/base-scorer.js';
|
|
5
5
|
export { AnswerCorrectnessScorer, AnswerRelevancyScorer, ComparisonScorer, ContextualPrecisionScorer, ContextualRecallScorer, ContextualRelevancyScorer, ExecutionOrderScorer, FaithfulnessScorer, GroundednessScorer, HallucinationScorer, InstructionAdherenceScorer, JsonCorrectnessScorer, SummarizationScorer } from './scorers/api-scorer.js';
|
|
6
|
-
export { ExactMatchScorer } from './scorers/exact-match-scorer.js';
|
|
7
6
|
export { AlertStatus, Condition, NotificationConfig, Rule, AlertResult, RulesEngine } from './rules.js';
|
|
8
7
|
export { EvaluationRun, EvaluationRunOptions } from './evaluation-run.js';
|
|
9
8
|
export { runEval, assertTest, JudgmentAPIError, sendToRabbitMQ, executeApiEval, mergeResults, checkMissingScorerData, checkEvalRunNameExists, logEvaluationResults, checkExamples } from './run-evaluation.js';
|
|
@@ -50,32 +50,8 @@ export declare class JudgmentClient {
|
|
|
50
50
|
/**
|
|
51
51
|
* Evaluate a dataset
|
|
52
52
|
*/
|
|
53
|
-
evaluateDataset(dataset: any, //
|
|
53
|
+
evaluateDataset(dataset: any, // Keep type loose for stub
|
|
54
54
|
scorers: Array<ScorerWrapper | JudgevalScorer>, model: string | string[] | any, aggregator?: string, metadata?: Record<string, any>, projectName?: string, evalRunName?: string, logResults?: boolean, useJudgment?: boolean, rules?: Rule[]): Promise<ScoringResult[]>;
|
|
55
|
-
/**
|
|
56
|
-
* Create a dataset
|
|
57
|
-
*/
|
|
58
|
-
createDataset(): any;
|
|
59
|
-
/**
|
|
60
|
-
* Push a dataset to the Judgment platform
|
|
61
|
-
*/
|
|
62
|
-
pushDataset(alias: string, dataset: any, projectName: string, overwrite?: boolean): Promise<boolean>;
|
|
63
|
-
/**
|
|
64
|
-
* Pull a dataset from the Judgment platform
|
|
65
|
-
*/
|
|
66
|
-
pullDataset(alias: string, projectName: string): Promise<any>;
|
|
67
|
-
/**
|
|
68
|
-
* Delete a dataset from the Judgment platform
|
|
69
|
-
*/
|
|
70
|
-
deleteDataset(alias: string, projectName: string): Promise<boolean>;
|
|
71
|
-
/**
|
|
72
|
-
* Pull project dataset stats from the Judgment platform
|
|
73
|
-
*/
|
|
74
|
-
pullProjectDatasetStats(projectName: string): Promise<Record<string, any>>;
|
|
75
|
-
/**
|
|
76
|
-
* Insert examples into a dataset on the Judgment platform
|
|
77
|
-
*/
|
|
78
|
-
insertDataset(alias: string, examples: Example[], projectName: string): Promise<boolean>;
|
|
79
55
|
/**
|
|
80
56
|
* Pull evaluation results from the server
|
|
81
57
|
* @param projectName Name of the project
|
|
@@ -83,28 +59,6 @@ export declare class JudgmentClient {
|
|
|
83
59
|
* @returns Array containing one object with 'id' and 'results' (list of ScoringResult)
|
|
84
60
|
*/
|
|
85
61
|
pullEval(projectName: string, evalRunName: string): Promise<Array<Record<string, any | ScoringResult[]>>>;
|
|
86
|
-
/**
|
|
87
|
-
* Get evaluation run results (alias for pullEval with a more intuitive name)
|
|
88
|
-
* @param projectName Name of the project
|
|
89
|
-
* @param evalRunName Name of the evaluation run
|
|
90
|
-
* @returns Array containing one object with 'id' and 'results' (list of ScoringResult)
|
|
91
|
-
*/
|
|
92
|
-
getEvalRun(projectName: string, evalRunName: string): Promise<Array<Record<string, any | ScoringResult[]>>>;
|
|
93
|
-
/**
|
|
94
|
-
* List all evaluation runs for a project
|
|
95
|
-
* @param projectName Name of the project
|
|
96
|
-
* @param limit Maximum number of evaluation runs to return (default: 100)
|
|
97
|
-
* @param offset Offset for pagination (default: 0)
|
|
98
|
-
* @returns List of evaluation run metadata
|
|
99
|
-
*/
|
|
100
|
-
listEvalRuns(projectName: string, limit?: number, offset?: number): Promise<Array<Record<string, any>>>;
|
|
101
|
-
/**
|
|
102
|
-
* Get evaluation run statistics
|
|
103
|
-
* @param projectName Name of the project
|
|
104
|
-
* @param evalRunName Name of the evaluation run
|
|
105
|
-
* @returns Statistics for the evaluation run
|
|
106
|
-
*/
|
|
107
|
-
getEvalRunStats(projectName: string, evalRunName: string): Promise<Record<string, any>>;
|
|
108
62
|
/**
|
|
109
63
|
* Export evaluation results to a file format
|
|
110
64
|
* @param projectName Name of the project
|
|
@@ -176,4 +130,6 @@ export declare class JudgmentClient {
|
|
|
176
130
|
* @returns A string representing the progress bar
|
|
177
131
|
*/
|
|
178
132
|
private _createProgressBar;
|
|
133
|
+
private getAuthHeaders;
|
|
134
|
+
private handleApiError;
|
|
179
135
|
}
|
|
@@ -5,67 +5,67 @@ import { ScorerData } from '../data/result.js';
|
|
|
5
5
|
* Implementation of API-based scorers
|
|
6
6
|
*/
|
|
7
7
|
export declare class AnswerCorrectnessScorer extends APIJudgmentScorer {
|
|
8
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
8
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
9
9
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
10
10
|
}
|
|
11
11
|
export declare class AnswerRelevancyScorer extends APIJudgmentScorer {
|
|
12
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
12
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
13
13
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
14
14
|
}
|
|
15
15
|
export declare class ComparisonScorer extends APIJudgmentScorer {
|
|
16
16
|
criteria: string[];
|
|
17
17
|
description: string;
|
|
18
|
-
constructor(threshold?: number, criteria?: string[], description?: string, additional_metadata?: Record<string, any>,
|
|
18
|
+
constructor(threshold?: number, criteria?: string[], description?: string, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
19
19
|
toJSON(): Record<string, any>;
|
|
20
20
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
21
21
|
}
|
|
22
22
|
export declare class ContextualPrecisionScorer extends APIJudgmentScorer {
|
|
23
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
23
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
24
24
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
25
25
|
}
|
|
26
26
|
export declare class ContextualRecallScorer extends APIJudgmentScorer {
|
|
27
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
27
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
28
28
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
29
29
|
}
|
|
30
30
|
export declare class ContextualRelevancyScorer extends APIJudgmentScorer {
|
|
31
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
31
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
32
32
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
33
33
|
}
|
|
34
34
|
export declare class ExecutionOrderScorer extends APIJudgmentScorer {
|
|
35
35
|
strictMode: boolean;
|
|
36
36
|
expectedTools?: string[];
|
|
37
|
-
constructor(threshold?: number,
|
|
37
|
+
constructor(threshold?: number, expectedTools?: string[], additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
38
38
|
toJSON(): Record<string, any>;
|
|
39
39
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
40
40
|
}
|
|
41
41
|
export declare class FaithfulnessScorer extends APIJudgmentScorer {
|
|
42
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
42
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
43
43
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
44
44
|
}
|
|
45
45
|
export declare class GroundednessScorer extends APIJudgmentScorer {
|
|
46
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
46
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
47
47
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
48
48
|
}
|
|
49
49
|
export declare class HallucinationScorer extends APIJudgmentScorer {
|
|
50
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
50
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
51
51
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
52
52
|
}
|
|
53
53
|
export declare class InstructionAdherenceScorer extends APIJudgmentScorer {
|
|
54
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
54
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
55
55
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
56
56
|
}
|
|
57
57
|
export declare class JsonCorrectnessScorer extends APIJudgmentScorer {
|
|
58
58
|
jsonSchema?: Record<string, any>;
|
|
59
|
-
constructor(threshold?: number, jsonSchema?: Record<string, any>, additional_metadata?: Record<string, any>,
|
|
59
|
+
constructor(threshold?: number, jsonSchema?: Record<string, any>, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
60
60
|
toJSON(): Record<string, any>;
|
|
61
61
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
62
62
|
}
|
|
63
63
|
export declare class SummarizationScorer extends APIJudgmentScorer {
|
|
64
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
64
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
65
65
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
66
66
|
}
|
|
67
67
|
export declare class Text2SQLScorer extends APIJudgmentScorer {
|
|
68
|
-
constructor(threshold?: number, additional_metadata?: Record<string, any>,
|
|
68
|
+
constructor(threshold?: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
69
69
|
a_score_example(example: Example): Promise<ScorerData>;
|
|
70
70
|
}
|
|
71
71
|
export declare class ScorerWrapper {
|
|
@@ -75,5 +75,5 @@ export declare class ScorerWrapper {
|
|
|
75
75
|
get threshold(): number;
|
|
76
76
|
get additional_metadata(): Record<string, any> | undefined;
|
|
77
77
|
toJSON(): Record<string, any>;
|
|
78
|
-
static fromType(type: string, threshold: number, additional_metadata?: Record<string, any>,
|
|
78
|
+
static fromType(type: string, threshold: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean): APIJudgmentScorer;
|
|
79
79
|
}
|
|
@@ -8,8 +8,18 @@ export interface Scorer {
|
|
|
8
8
|
scoreType: string;
|
|
9
9
|
threshold: number;
|
|
10
10
|
score?: number;
|
|
11
|
+
score_breakdown?: Record<string, any>;
|
|
12
|
+
reason?: string;
|
|
13
|
+
success?: boolean;
|
|
14
|
+
evaluation_model?: string;
|
|
15
|
+
strict_mode: boolean;
|
|
16
|
+
async_mode: boolean;
|
|
17
|
+
verbose_mode: boolean;
|
|
18
|
+
include_reason: boolean;
|
|
19
|
+
error?: string;
|
|
20
|
+
evaluation_cost?: number;
|
|
21
|
+
verbose_logs?: string;
|
|
11
22
|
additional_metadata?: Record<string, any>;
|
|
12
|
-
verbose: boolean;
|
|
13
23
|
validateThreshold(): void;
|
|
14
24
|
toJSON(): Record<string, any>;
|
|
15
25
|
successCheck(): boolean;
|
|
@@ -22,9 +32,13 @@ export declare abstract class APIJudgmentScorer implements Scorer {
|
|
|
22
32
|
get scoreType(): string;
|
|
23
33
|
readonly threshold: number;
|
|
24
34
|
score?: number;
|
|
35
|
+
score_breakdown?: Record<string, any>;
|
|
25
36
|
additional_metadata?: Record<string, any>;
|
|
26
|
-
|
|
27
|
-
|
|
37
|
+
strict_mode: boolean;
|
|
38
|
+
async_mode: boolean;
|
|
39
|
+
verbose_mode: boolean;
|
|
40
|
+
include_reason: boolean;
|
|
41
|
+
constructor(type: string, threshold: number, additional_metadata?: Record<string, any>, strict_mode?: boolean, async_mode?: boolean, verbose_mode?: boolean, include_reason?: boolean);
|
|
28
42
|
/**
|
|
29
43
|
* Check if the score meets the threshold
|
|
30
44
|
*/
|
|
@@ -47,27 +61,46 @@ export declare abstract class JudgevalScorer implements Scorer {
|
|
|
47
61
|
scoreType: string;
|
|
48
62
|
threshold: number;
|
|
49
63
|
score?: number;
|
|
64
|
+
score_breakdown?: Record<string, any>;
|
|
65
|
+
reason?: string;
|
|
66
|
+
success?: boolean;
|
|
67
|
+
evaluation_model?: string;
|
|
68
|
+
strict_mode: boolean;
|
|
69
|
+
async_mode: boolean;
|
|
70
|
+
verbose_mode: boolean;
|
|
71
|
+
include_reason: boolean;
|
|
72
|
+
error?: string;
|
|
73
|
+
evaluation_cost?: number;
|
|
74
|
+
verbose_logs?: string;
|
|
50
75
|
additional_metadata?: Record<string, any>;
|
|
51
|
-
|
|
52
|
-
constructor(type: string, threshold: number, additional_metadata?: Record<string, any>, verbose?: boolean);
|
|
76
|
+
constructor(type: string, threshold: number, additional_metadata?: Record<string, any>, include_reason?: boolean, async_mode?: boolean, strict_mode?: boolean, verbose_mode?: boolean);
|
|
53
77
|
/**
|
|
54
78
|
* Check if the score meets the threshold
|
|
55
79
|
*/
|
|
56
80
|
successCheck(): boolean;
|
|
81
|
+
/**
|
|
82
|
+
* Internal method to check success
|
|
83
|
+
* This is equivalent to Python's _success_check method
|
|
84
|
+
*/
|
|
85
|
+
protected _successCheck(): boolean;
|
|
57
86
|
/**
|
|
58
87
|
* Validate that the threshold is within the allowed range
|
|
59
88
|
*/
|
|
60
89
|
validateThreshold(): void;
|
|
90
|
+
/**
|
|
91
|
+
* Convert the scorer to a plain object
|
|
92
|
+
*/
|
|
93
|
+
toJSON(): Record<string, any>;
|
|
61
94
|
/**
|
|
62
95
|
* Score an example
|
|
63
|
-
*
|
|
64
|
-
* @returns A ScorerData object with the score
|
|
96
|
+
* This must be implemented by subclasses
|
|
65
97
|
*/
|
|
66
98
|
abstract scoreExample(example: Example): Promise<ScorerData>;
|
|
67
99
|
/**
|
|
68
|
-
*
|
|
100
|
+
* Get the name of the scorer
|
|
101
|
+
* This is equivalent to Python's __name__ property
|
|
69
102
|
*/
|
|
70
|
-
|
|
103
|
+
get name(): string;
|
|
71
104
|
}
|
|
72
105
|
/**
|
|
73
106
|
* Wrapper for scorers to allow dynamic loading of implementations
|
|
@@ -77,8 +110,18 @@ export declare class ScorerWrapper implements Scorer {
|
|
|
77
110
|
scoreType: string;
|
|
78
111
|
threshold: number;
|
|
79
112
|
score?: number;
|
|
113
|
+
score_breakdown?: Record<string, any>;
|
|
114
|
+
reason?: string;
|
|
115
|
+
success?: boolean;
|
|
116
|
+
evaluation_model?: string;
|
|
117
|
+
strict_mode: boolean;
|
|
118
|
+
async_mode: boolean;
|
|
119
|
+
verbose_mode: boolean;
|
|
120
|
+
include_reason: boolean;
|
|
121
|
+
error?: string;
|
|
122
|
+
evaluation_cost?: number;
|
|
123
|
+
verbose_logs?: string;
|
|
80
124
|
additional_metadata?: Record<string, any>;
|
|
81
|
-
verbose: boolean;
|
|
82
125
|
scorer: any;
|
|
83
126
|
constructor(scorer: any);
|
|
84
127
|
/**
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "judgeval",
|
|
3
|
-
"version": "0.2.
|
|
3
|
+
"version": "0.2.2",
|
|
4
4
|
"description": "Judgment SDK for TypeScript/JavaScript",
|
|
5
5
|
"main": "./dist/cjs/index.js",
|
|
6
6
|
"module": "./dist/esm/index.js",
|
|
@@ -49,6 +49,7 @@
|
|
|
49
49
|
"@types/node": "^20.12.12",
|
|
50
50
|
"@typescript-eslint/eslint-plugin": "^7.10.0",
|
|
51
51
|
"@typescript-eslint/parser": "^7.10.0",
|
|
52
|
+
"cross-env": "^7.0.3",
|
|
52
53
|
"eslint": "^8.57.0",
|
|
53
54
|
"eslint-config-prettier": "^9.1.0",
|
|
54
55
|
"eslint-plugin-prettier": "^5.1.3",
|