judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/datasets/dataset.py DELETED
@@ -1,341 +0,0 @@
import ast
import csv
import datetime
import json
import os
import yaml
from dataclasses import dataclass, field
from typing import List, Union, Literal, Optional

from judgeval.data import Example, Trace
from judgeval.common.logger import judgeval_logger
from judgeval.utils.file_utils import get_examples_from_yaml


@dataclass
class EvalDataset:
    examples: List[Example]
    traces: List[Trace]
    _alias: Union[str, None] = field(default=None)
    _id: Union[str, None] = field(default=None)
    judgment_api_key: str = field(default="")
    organization_id: str = field(default="")

    def __init__(
        self,
        judgment_api_key: str = os.getenv("JUDGMENT_API_KEY", ""),
        organization_id: str = os.getenv("JUDGMENT_ORG_ID", ""),
        examples: Optional[List[Example]] = None,
        traces: Optional[List[Trace]] = None,
    ):
        if not judgment_api_key:
            judgeval_logger.error("No judgment_api_key provided")
        self.examples = examples or []
        self.traces = traces or []
        self._alias = None
        self._id = None
        self.judgment_api_key = judgment_api_key
        self.organization_id = organization_id

    def add_from_json(self, file_path: str) -> None:
        """
        Adds examples from a JSON file.

        The format of the JSON file is expected to be a dictionary with one key: "examples".
        The value of the key is a list of dictionaries, where each dictionary represents an example.

        The JSON file is expected to have the following format:
        {
            "examples": [
                {
                    "input": "test input",
                    "actual_output": "test output",
                    "expected_output": "expected output",
                    "context": [
                        "context1",
                        "context2"
                    ],
                    "retrieval_context": [
                        "retrieval1"
                    ],
                    "additional_metadata": {
                        "key": "value"
                    },
                    "tools_called": [
                        "tool1"
                    ],
                    "expected_tools": [
                        "tool1",
                        "tool2"
                    ],
                    "name": "test example",
                    "example_id": null,
                    "timestamp": "20241230_160117",
                    "trace_id": "123"
                }
            ]
        }
        """
        try:
            with open(file_path, "r") as file:
                payload = json.load(file)
                examples = payload.get("examples", [])
        except FileNotFoundError:
            judgeval_logger.error(f"JSON file not found: {file_path}")
            raise FileNotFoundError(f"The file {file_path} was not found.")
        except json.JSONDecodeError:
            judgeval_logger.error(f"Invalid JSON file: {file_path}")
            raise ValueError(f"The file {file_path} is not a valid JSON file.")

        new_examples = [Example(**e) for e in examples]
        for e in new_examples:
            self.add_example(e)

    def add_from_csv(
        self,
        file_path: str,
        header_mapping: dict,
        primary_delimiter: str = ",",
        secondary_delimiter: str = ";",
    ) -> None:
        """
        Add Examples from a CSV file.

        Args:
            file_path (str): Path to the CSV file
            header_mapping (dict): Dictionary mapping Example headers to custom headers
            primary_delimiter (str, optional): Main delimiter used in CSV file. Defaults to ","
            secondary_delimiter (str, optional): Secondary delimiter for list fields. Defaults to ";"
        """
        try:
            import pandas as pd
        except ModuleNotFoundError:
            raise ModuleNotFoundError(
                "Please install pandas to use this method. 'pip install pandas'"
            )

        # Pandas naturally reads numbers in data files as ints, not strings (can lead to unexpected behavior)
        df = pd.read_csv(file_path, dtype={"trace_id": str}, sep=primary_delimiter)
        """
        The user should pass in a dict mapping from Judgment Example headers to their custom defined headers.
        Available headers for Example objects are as follows:

        "input", "actual_output", "expected_output", "context", \
        "retrieval_context", "additional_metadata", "tools_called", \
        "expected_tools", "name", "comments", "source_file", "example", \
        "trace_id"

        We want to collect the examples separately which can
        be determined by the "example" column. If the value is True, then it is an
        example, and we expect the `input` and `actual_output` fields to be non-null.

        We also assume that if there are multiple retrieval contexts, contexts, or tools called, they are separated by semicolons.
        This can be adjusted using the `secondary_delimiter` parameter.
        """
        examples = []

        def process_csv_row(value, header):
            """
            Maps a singular value in the CSV file to the appropriate type based on the header.
            If value exists and can be split into type List[*], we will split upon the user's provided secondary delimiter.
            """
            # check that the CSV value is not null for entry
            null_replacement = dict() if header == "additional_metadata" else None
            if pd.isna(value) or value == "":
                return null_replacement
            try:
                value = (
                    ast.literal_eval(value)
                    if header == "additional_metadata"
                    else str(value)
                )
            except (ValueError, SyntaxError):
                value = str(value)
            if header in [
                "context",
                "retrieval_context",
                "tools_called",
                "expected_tools",
            ]:
                # attempt to split the value by the secondary delimiter
                value = value.split(secondary_delimiter)

            return value

        for _, row in df.iterrows():
            data = {
                header: process_csv_row(row[header_mapping[header]], header)
                for header in header_mapping
            }
            if "example" in header_mapping and row[header_mapping["example"]]:
                if "name" in header_mapping:
                    data["name"] = (
                        row[header_mapping["name"]]
                        if pd.notna(row[header_mapping["name"]])
                        else None
                    )
                # every Example has `input` and `actual_output` fields
                if data["input"] is not None and data["actual_output"] is not None:
                    e = Example(**data)
                    examples.append(e)
                else:
                    raise ValueError(
                        "Every example must have an 'input' and 'actual_output' field."
                    )

        for e in examples:
            self.add_example(e)

    def add_from_yaml(self, file_path: str) -> None:
        """
        Adds examples from a YAML file.

        The format of the YAML file is expected to be a dictionary with one key: "examples".
        The value of the key is a list of dictionaries, where each dictionary represents an example.

        The YAML file is expected to have the following format:
        examples:
          - input: "test input"
            actual_output: "test output"
            expected_output: "expected output"
            context:
              - "context1"
              - "context2"
            retrieval_context:
              - "retrieval1"
            additional_metadata:
              key: "value"
            tools_called:
              - "tool1"
            expected_tools:
              - "tool1"
              - "tool2"
            name: "test example"
            example_id: null
            timestamp: "20241230_160117"
            trace_id: "123"
        """
        examples = get_examples_from_yaml(file_path)

        for e in examples:
            self.add_example(e)

    def add_example(self, e: Example) -> None:
        self.examples.append(e)
        # TODO if we need to add rank, then we need to do it here

    def add_trace(self, t: Trace) -> None:
        self.traces.append(t)

    def save_as(
        self,
        file_type: Literal["json", "csv", "yaml"],
        dir_path: str,
        save_name: str | None = None,
    ) -> None:
        """
        Saves the dataset as a file. Save only the examples.

        Args:
            file_type (Literal["json", "csv"]): The file type to save the dataset as.
            dir_path (str): The directory path to save the file to.
            save_name (str, optional): The name of the file to save. Defaults to None.
        """
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        file_name = (
            datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            if save_name is None
            else save_name
        )
        complete_path = os.path.join(dir_path, f"{file_name}.{file_type}")
        if file_type == "json":
            with open(complete_path, "w") as file:
                json.dump(
                    {
                        "examples": [e.to_dict() for e in self.examples],
                    },
                    file,
                    indent=4,
                )
        elif file_type == "csv":
            with open(complete_path, "w", newline="") as file:
                writer = csv.writer(file)
                writer.writerow(
                    [
                        "input",
                        "actual_output",
                        "expected_output",
                        "context",
                        "retrieval_context",
                        "additional_metadata",
                        "tools_called",
                        "expected_tools",
                        "name",
                        "comments",
                        "source_file",
                        "example",
                        "trace_id",
                    ]
                )
                for e in self.examples:
                    writer.writerow(
                        [
                            e.input,
                            e.actual_output,
                            e.expected_output,
                            ";".join(e.context),
                            ";".join(e.retrieval_context),
                            e.additional_metadata,
                            ";".join(e.tools_called),
                            ";".join(e.expected_tools),
                            e.name,
                            None,  # Example does not have comments
                            None,  # Example does not have source file
                            True,  # Adding an Example
                        ]
                    )

        elif file_type == "yaml":
            with open(complete_path, "w") as file:
                yaml_data = {
                    "examples": [
                        {
                            "input": e.input,
                            "actual_output": e.actual_output,
                            "expected_output": e.expected_output,
                            "context": e.context,
                            "retrieval_context": e.retrieval_context,
                            "additional_metadata": e.additional_metadata,
                            "tools_called": e.tools_called,
                            "expected_tools": e.expected_tools,
                            "name": e.name,
                            "comments": None,  # Example does not have comments
                            "source_file": None,  # Example does not have source file
                            "example": True,  # Adding an Example
                        }
                        for e in self.examples
                    ],
                }
                yaml.dump(yaml_data, file, default_flow_style=False)
        else:
            ACCEPTABLE_FILE_TYPES = ["json", "csv", "yaml"]
            raise TypeError(
                f"Invalid file type: {file_type}. Please choose from {ACCEPTABLE_FILE_TYPES}"
            )

    def __iter__(self):
        return iter(self.examples)

    def __len__(self):
        return len(self.examples)

    def __str__(self):
        return (
            f"{self.__class__.__name__}("
            f"examples={self.examples}, "
            f"traces={self.traces}, "
            f"_alias={self._alias}, "
            f"_id={self._id}"
            f")"
        )
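For reference, here is a minimal usage sketch of the removed EvalDataset class, based only on the methods shown in the hunk above; the file name examples.json, the example values, and the export directory are illustrative placeholders, not values taken from this diff.

# Hypothetical usage of the removed EvalDataset (judgeval <= 0.1.x); paths and values are placeholders.
from judgeval.data.datasets import EvalDataset
from judgeval.data import Example

dataset = EvalDataset()  # reads JUDGMENT_API_KEY / JUDGMENT_ORG_ID from the environment

# Load examples from a JSON file shaped like the docstring above ({"examples": [...]}).
dataset.add_from_json("examples.json")

# Examples could also be appended programmatically.
dataset.add_example(Example(input="What is 2 + 2?", actual_output="4"))

print(len(dataset))  # __len__ counts examples only, not traces
dataset.save_as("yaml", dir_path="./exports", save_name="my_dataset")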
judgeval/data/datasets/eval_dataset_client.py DELETED
@@ -1,214 +0,0 @@
from typing import Optional, List
from rich.progress import Progress, SpinnerColumn, TextColumn
from judgeval.common.logger import judgeval_logger
from judgeval.common.api import JudgmentApiClient
from judgeval.data import Example, Trace
from judgeval.data.datasets import EvalDataset


class EvalDatasetClient:
    def __init__(self, judgment_api_key: str, organization_id: str):
        self.api_client = JudgmentApiClient(judgment_api_key, organization_id)

    def create_dataset(self) -> EvalDataset:
        return EvalDataset(judgment_api_key=self.api_client.api_key)

    def push(
        self,
        dataset: EvalDataset,
        alias: str,
        project_name: str,
        overwrite: Optional[bool] = False,
    ) -> bool:
        if overwrite:
            judgeval_logger.warning(f"Overwrite enabled for alias '{alias}'")
        """
        Pushes the dataset to Judgment platform

        Mock request:
        dataset = {
            "alias": alias,
            "examples": [...],
            "overwrite": overwrite
        } ==>
        {
            "_alias": alias,
            "_id": "..."  # ID of the dataset
        }
        """
        with Progress(
            SpinnerColumn(style="rgb(106,0,255)"),
            TextColumn("[progress.description]{task.description}"),
            transient=False,
        ) as progress:
            task_id = progress.add_task(
                f"Pushing [rgb(106,0,255)]'{alias}' to Judgment...",
                total=100,
            )
            try:
                payload = self.api_client.push_dataset(
                    dataset_alias=alias,
                    project_name=project_name,
                    examples=[e.to_dict() for e in dataset.examples],
                    traces=[t.model_dump() for t in dataset.traces],
                    overwrite=overwrite or False,
                )
            except Exception as e:
                judgeval_logger.error(f"Error during push: {e}")
                raise
            dataset._alias = payload.get("_alias")
            dataset._id = payload.get("_id")
            progress.update(
                task_id,
                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
            )
            return True

    def append_examples(
        self, alias: str, examples: List[Example], project_name: str
    ) -> bool:
        """
        Appends the dataset to Judgment platform

        Mock request:
        dataset = {
            "alias": alias,
            "examples": [...],
            "project_name": project_name
        } ==>
        {
            "_alias": alias,
            "_id": "..."  # ID of the dataset
        }
        """
        with Progress(
            SpinnerColumn(style="rgb(106,0,255)"),
            TextColumn("[progress.description]{task.description}"),
            transient=False,
        ) as progress:
            task_id = progress.add_task(
                f"Appending [rgb(106,0,255)]'{alias}' to Judgment...",
                total=100,
            )
            try:
                self.api_client.append_examples(
                    dataset_alias=alias,
                    project_name=project_name,
                    examples=[e.to_dict() for e in examples],
                )
            except Exception as e:
                judgeval_logger.error(f"Error during append: {e}")
                raise

            progress.update(
                task_id,
                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
            )
            return True

    def pull(self, alias: str, project_name: str) -> EvalDataset:
        """
        Pulls the dataset from Judgment platform

        Mock request:
        {
            "alias": alias,
            "project_name": project_name
        }
        ==>
        {
            "examples": [...],
            "_alias": alias,
            "_id": "..."  # ID of the dataset
        }
        """
        # Make a POST request to the Judgment API to get the dataset
        dataset = self.create_dataset()

        with Progress(
            SpinnerColumn(style="rgb(106,0,255)"),
            TextColumn("[progress.description]{task.description}"),
            transient=False,
        ) as progress:
            task_id = progress.add_task(
                f"Pulling [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
                total=100,
            )
            try:
                payload = self.api_client.pull_dataset(
                    dataset_alias=alias,
                    project_name=project_name,
                )
            except Exception as e:
                judgeval_logger.error(f"Error pulling dataset: {str(e)}")
                raise
            dataset.examples = [Example(**e) for e in payload.get("examples", [])]
            dataset.traces = [Trace(**t) for t in payload.get("traces", [])]
            dataset._alias = payload.get("alias")
            dataset._id = payload.get("id")
            progress.update(
                task_id,
                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
            )

            return dataset

    def delete(self, alias: str, project_name: str) -> bool:
        with Progress(
            SpinnerColumn(style="rgb(106,0,255)"),
            TextColumn("[progress.description]{task.description}"),
            transient=False,
        ) as progress:
            progress.add_task(
                f"Deleting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] from Judgment...",
                total=100,
            )
            try:
                self.api_client.delete_dataset(
                    dataset_alias=alias,
                    project_name=project_name,
                )
            except Exception as e:
                judgeval_logger.error(f"Error deleting dataset: {str(e)}")
                raise

            return True

    def pull_project_dataset_stats(self, project_name: str) -> dict:
        """
        Pulls the project datasets stats from Judgment platform

        Mock request:
        {
            "project_name": project_name
        }
        ==>
        {
            "test_dataset_1": {"examples_count": len(dataset1.examples)},
            "test_dataset_2": {"examples_count": len(dataset2.examples)},
            ...
        }
        """
        # Make a POST request to the Judgment API to get the dataset

        with Progress(
            SpinnerColumn(style="rgb(106,0,255)"),
            TextColumn("[progress.description]{task.description}"),
            transient=False,
        ) as progress:
            task_id = progress.add_task(
                "Pulling [rgb(106,0,255)]' datasets'[/rgb(106,0,255)] from Judgment...",
                total=100,
            )
            try:
                payload = self.api_client.get_project_dataset_stats(project_name)
            except Exception as e:
                judgeval_logger.error(f"Error pulling dataset: {str(e)}")
                raise

            progress.update(
                task_id,
                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
            )

            return payload
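A companion sketch of how the removed EvalDatasetClient was driven, using only the methods shown in the hunk above; the environment variables, alias, and project name are placeholders.

# Hypothetical usage of the removed EvalDatasetClient; credentials and names are placeholders.
import os
from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

client = EvalDatasetClient(
    judgment_api_key=os.environ["JUDGMENT_API_KEY"],
    organization_id=os.environ["JUDGMENT_ORG_ID"],
)

dataset = client.create_dataset()
dataset.add_from_json("examples.json")

# Push the local dataset, then pull it back and inspect per-project stats.
client.push(dataset, alias="qa-regression", project_name="demo-project", overwrite=False)
pulled = client.pull(alias="qa-regression", project_name="demo-project")
print(client.pull_project_dataset_stats("demo-project"))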
judgeval/data/tool.py DELETED
judgeval/data/trace_run.py DELETED
@@ -1,37 +0,0 @@
from pydantic import BaseModel
from typing import List, Optional, Dict, Any, Union
from judgeval.data import Trace
from judgeval.scorers import APIScorerConfig, BaseScorer
from judgeval.rules import Rule


class TraceRun(BaseModel):
    """
    Stores example and evaluation scorers together for running an eval task

    Args:
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): A name for this evaluation run
        traces (List[Trace]): The traces to evaluate
        scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
        rules (Optional[List[Rule]]): Rules to evaluate against scoring results
        append (Optional[bool]): Whether to append to existing evaluation results
        tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
    """

    organization_id: Optional[str] = None
    project_name: Optional[str] = None
    eval_name: Optional[str] = None
    traces: Optional[List[Trace]] = None
    scorers: List[Union[APIScorerConfig, BaseScorer]]
    model: Optional[str] = "gpt-4.1"
    trace_span_id: Optional[str] = None
    append: Optional[bool] = False
    override: Optional[bool] = False
    rules: Optional[List[Rule]] = None
    tools: Optional[List[Dict[str, Any]]] = None

    class Config:
        arbitrary_types_allowed = True
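A small sketch of constructing the removed TraceRun request model; the empty traces and scorers lists are only to keep the snippet self-contained (real runs passed Trace objects and APIScorerConfig/BaseScorer instances), and the names are placeholders.

# Hypothetical construction of the removed TraceRun pydantic model; values are placeholders.
from judgeval.data.trace_run import TraceRun

run = TraceRun(
    project_name="demo-project",
    eval_name="trace-eval-1",
    traces=[],   # normally Trace objects pulled from a dataset or the platform
    scorers=[],  # required field; normally APIScorerConfig / BaseScorer instances
    model="gpt-4.1",
)
payload = run.model_dump()  # plain dict suitable for serialization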
judgeval/evaluation_run.py DELETED
@@ -1,75 +0,0 @@
from typing import List, Optional, Union
from pydantic import BaseModel, field_validator, Field

from judgeval.data import Example
from judgeval.scorers import BaseScorer, APIScorerConfig
from judgeval.constants import ACCEPTABLE_MODELS


class EvaluationRun(BaseModel):
    """
    Stores example and evaluation scorers together for running an eval task

    Args:
        project_name (str): The name of the project the evaluation results belong to
        eval_name (str): A name for this evaluation run
        examples (List[Example]): The examples to evaluate
        scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
        model (str): The model used as a judge when using LLM as a Judge
        metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
    """

    organization_id: Optional[str] = None
    project_name: Optional[str] = Field(default=None, validate_default=True)
    eval_name: Optional[str] = Field(default=None, validate_default=True)
    examples: List[Example]
    scorers: List[Union[APIScorerConfig, BaseScorer]]
    model: Optional[str] = "gpt-4.1"
    trace_span_id: Optional[str] = None
    # API Key will be "" until user calls client.run_eval(), then API Key will be set
    override: Optional[bool] = False
    append: Optional[bool] = False

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)

        data["scorers"] = [
            scorer.model_dump() for scorer in self.scorers
        ]  # Pydantic has problems with properly calling model_dump() on the scorers, so we need to do it manually

        return data

    @field_validator("examples")
    def validate_examples(cls, v):
        if not v:
            raise ValueError("Examples cannot be empty.")
        return v

    @field_validator("scorers", mode="before")
    def validate_scorers(cls, v):
        if not v:
            raise ValueError("Scorers cannot be empty.")
        if not all(
            isinstance(scorer, BaseScorer) or isinstance(scorer, APIScorerConfig)
            for scorer in v
        ):
            raise ValueError(
                "All scorers must be of type BaseScorer or APIScorerConfig."
            )
        return v

    @field_validator("model")
    def validate_model(cls, v, values):
        if not v:
            raise ValueError("Model cannot be empty.")

        # Check if model is string or list of strings
        if isinstance(v, str):
            if v not in ACCEPTABLE_MODELS:
                raise ValueError(
                    f"Model name {v} not recognized. Please select a valid model name.)"
                )
            return v

    class Config:
        arbitrary_types_allowed = True
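The removed EvaluationRun mainly added request-side validation on top of pydantic; a brief sketch of how those validators surfaced, using placeholder values.

# Hypothetical demonstration of the removed EvaluationRun validators; values are placeholders.
from pydantic import ValidationError
from judgeval.evaluation_run import EvaluationRun

try:
    EvaluationRun(project_name="demo-project", eval_name="run-1", examples=[], scorers=[])
except ValidationError as err:
    # Both custom validators reject empty lists:
    # "Examples cannot be empty." and "Scorers cannot be empty."
    print(err)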