judgeval 0.2.0__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/common/api/api.py +38 -7
- judgeval/common/api/constants.py +9 -1
- judgeval/common/storage/s3_storage.py +2 -3
- judgeval/common/tracer/core.py +66 -32
- judgeval/common/tracer/otel_span_processor.py +4 -50
- judgeval/common/tracer/span_transformer.py +16 -10
- judgeval/common/utils.py +46 -38
- judgeval/constants.py +2 -0
- judgeval/data/example.py +9 -37
- judgeval/data/judgment_types.py +23 -45
- judgeval/data/result.py +8 -14
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +3 -4
- judgeval/dataset.py +192 -0
- judgeval/evaluation_run.py +1 -0
- judgeval/judges/litellm_judge.py +2 -2
- judgeval/judges/mixture_of_judges.py +6 -6
- judgeval/judges/together_judge.py +6 -3
- judgeval/judgment_client.py +9 -71
- judgeval/run_evaluation.py +41 -9
- judgeval/scorers/score.py +11 -7
- judgeval/scorers/utils.py +3 -3
- judgeval/utils/file_utils.py +40 -25
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/METADATA +10 -6
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/RECORD +27 -29
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/WHEEL +0 -0
- {judgeval-0.2.0.dist-info → judgeval-0.3.1.dist-info}/licenses/LICENSE.md +0 -0
judgeval/common/utils.py
CHANGED
@@ -16,7 +16,7 @@ from types import TracebackType
 from judgeval.common.api.constants import ROOT_API
 from judgeval.utils.requests import requests
 import pprint
-from typing import Any, Dict, List,
+from typing import Any, Dict, List, Optional, TypeAlias, Union, TypeGuard

 # Third-party imports
 import litellm
@@ -138,7 +138,9 @@ def validate_api_key(judgment_api_key: str):


 def fetch_together_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     Fetches a single response from the Together API for a given model and messages.
@@ -167,7 +169,9 @@ def fetch_together_api_response(


 async def afetch_together_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     ASYNCHRONOUSLY Fetches a single response from the Together API for a given model and messages.
@@ -192,8 +196,8 @@ async def afetch_together_api_response(

 def query_together_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Together API for multiple calls in parallel
@@ -230,7 +234,7 @@ def query_together_api_multiple_calls(

     num_workers = int(os.getenv("NUM_WORKER_THREADS", MAX_WORKER_THREADS))
     # Initialize results to maintain ordered outputs
-    out: List[str
+    out: List[Union[str, None]] = [None] * len(messages)
     with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
         # Submit all queries to together API with index, gets back the response content
         futures = {
@@ -255,8 +259,8 @@ def query_together_api_multiple_calls(

 async def aquery_together_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Together API for multiple calls in parallel
@@ -314,7 +318,9 @@ async def aquery_together_api_multiple_calls(


 def fetch_litellm_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     Fetches a single response from the Litellm API for a given model and messages.
@@ -339,8 +345,8 @@ def fetch_litellm_api_response(

 def fetch_custom_litellm_api_response(
     custom_model_parameters: CustomModelParameters,
-    messages: List[
-    response_format: pydantic.BaseModel = None,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     if messages is None or messages == []:
         raise ValueError("Messages cannot be empty")
@@ -372,7 +378,9 @@ def fetch_custom_litellm_api_response(


 async def afetch_litellm_api_response(
-    model: str,
+    model: str,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
@@ -402,8 +410,8 @@ async def afetch_litellm_api_response(

 async def afetch_custom_litellm_api_response(
     custom_model_parameters: CustomModelParameters,
-    messages: List[
-    response_format: pydantic.BaseModel = None,
+    messages: List[Dict[str, str]],
+    response_format: Union[pydantic.BaseModel, None] = None,
 ) -> str:
     """
     ASYNCHRONOUSLY Fetches a single response from the Litellm API for a given model and messages.
@@ -439,8 +447,8 @@ async def afetch_custom_litellm_api_response(

 def query_litellm_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Litellm API for multiple calls in parallel
@@ -480,8 +488,8 @@ def query_litellm_api_multiple_calls(

 async def aquery_litellm_api_multiple_calls(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
+    messages: List[List[Dict[str, str]]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
 ) -> List[Union[str, None]]:
     """
     Queries the Litellm API for multiple calls in parallel
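For reference, a minimal sketch of how the reworked signatures read at a call site. The Answer schema and the stub function body are illustrative only; the parameter and return annotations mirror the hunks above, and the model string is the DEFAULT_TOGETHER_MODEL added in constants.py.

from typing import Dict, List, Union

import pydantic


class Answer(pydantic.BaseModel):
    # Hypothetical structured-output schema, not part of the package.
    city: str


def fetch_together_api_response(
    model: str,
    messages: List[Dict[str, str]],
    response_format: Union[pydantic.BaseModel, None] = None,
) -> str:
    # Stub body: the real function calls the Together API; this one just echoes its inputs.
    return f"{model} got {len(messages)} messages (response_format={response_format})"


messages: List[Dict[str, str]] = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is the capital of France?"},
]
print(fetch_together_api_response("meta-llama/Meta-Llama-3-8B-Instruct-Lite", messages))
# response_format is typically handed over as a model class at call sites.
print(fetch_together_api_response("meta-llama/Meta-Llama-3-8B-Instruct-Lite", messages, Answer))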
@@ -565,14 +573,14 @@ def validate_batched_chat_messages(messages):


 def is_batched_messages(
-    messages: Union[List[
-) -> TypeGuard[List[List[
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+) -> TypeGuard[List[List[Dict[str, str]]]]:
     return isinstance(messages, list) and all(isinstance(msg, list) for msg in messages)


 def is_simple_messages(
-    messages: Union[List[
-) -> TypeGuard[List[
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+) -> TypeGuard[List[Dict[str, str]]]:
     return isinstance(messages, list) and all(
         not isinstance(msg, list) for msg in messages
     )
@@ -580,10 +588,10 @@ def is_simple_messages(

 def get_chat_completion(
     model_type: str,
-    messages: Union[List[
-    response_format: pydantic.BaseModel = None,
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+    response_format: Union[pydantic.BaseModel, None] = None,
     batched: bool = False,
-) -> Union[str, List[str
+) -> Union[str, List[Union[str, None]]]:
     """
     Generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.

@@ -653,10 +661,10 @@ def get_chat_completion(

 async def aget_chat_completion(
     model_type: str,
-    messages: Union[List[
-    response_format: pydantic.BaseModel = None,
+    messages: Union[List[Dict[str, str]], List[List[Dict[str, str]]]],
+    response_format: Union[pydantic.BaseModel, None] = None,
     batched: bool = False,
-) -> Union[str, List[str
+) -> Union[str, List[Union[str, None]]]:
     """
     ASYNCHRONOUSLY generates chat completions using a single model and potentially several messages. Supports closed-source and OSS models.

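The two TypeGuard helpers let get_chat_completion narrow the messages union before dispatching. A self-contained sketch of that narrowing; the Message alias and count_conversations are illustrative, while the is_batched_messages body is taken from the hunk above.

from typing import Dict, List, TypeGuard, Union

Message = Dict[str, str]  # assumption: shorthand for the chat-message dicts used above


def is_batched_messages(
    messages: Union[List[Message], List[List[Message]]],
) -> TypeGuard[List[List[Message]]]:
    # True only when every element is itself a list of messages.
    return isinstance(messages, list) and all(isinstance(msg, list) for msg in messages)


def count_conversations(messages: Union[List[Message], List[List[Message]]]) -> int:
    if is_batched_messages(messages):
        # Statically narrowed to List[List[Message]] inside this branch.
        return len(messages)
    return 1


print(count_conversations([{"role": "user", "content": "hi"}]))        # 1
print(count_conversations([[{"role": "user", "content": "hi"}]] * 3))  # 3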
@@ -722,9 +730,9 @@ async def aget_chat_completion(

 def get_completion_multiple_models(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
-) -> List[str
+    messages: List[List[Dict[str, str]]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
+) -> List[Union[str, None]]:
     """
     Retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.

@@ -801,9 +809,9 @@ def get_completion_multiple_models(

 async def aget_completion_multiple_models(
     models: List[str],
-    messages: List[List[
-    response_formats: List[pydantic.BaseModel]
-) -> List[str
+    messages: List[List[Dict[str, str]]],
+    response_formats: Union[List[Union[pydantic.BaseModel, None]], None] = None,
+) -> List[Union[str, None]]:
     """
     ASYNCHRONOUSLY retrieves completions for a single prompt from multiple models in parallel. Supports closed-source and OSS models.

@@ -875,7 +883,7 @@ async def aget_completion_multiple_models(


 if __name__ == "__main__":
-    batched_messages: List[List[
+    batched_messages: List[List[Dict[str, str]]] = [
         [
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "What is the capital of France?"},
@@ -886,12 +894,12 @@ if __name__ == "__main__":
         ],
     ]

-    non_batched_messages: List[
+    non_batched_messages: List[Dict[str, str]] = [
         {"role": "system", "content": "You are a helpful assistant."},
         {"role": "user", "content": "What is the capital of France?"},
     ]

-    batched_messages_2: List[List[
+    batched_messages_2: List[List[Dict[str, str]]] = [
         [
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": "What is the capital of China?"},
@@ -937,4 +945,4 @@ if __name__ == "__main__":
     )

 ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
-OptExcInfo: TypeAlias = ExcInfo
+OptExcInfo: TypeAlias = Union[ExcInfo, tuple[None, None, None]]
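The widened OptExcInfo alias now covers both shapes sys.exc_info() can return, i.e. the all-None tuple outside an active exception. A short illustration; current_exc_info is a hypothetical wrapper, not library code.

import sys
from types import TracebackType
from typing import TypeAlias, Union

ExcInfo: TypeAlias = tuple[type[BaseException], BaseException, TracebackType]
OptExcInfo: TypeAlias = Union[ExcInfo, tuple[None, None, None]]


def current_exc_info() -> OptExcInfo:
    # sys.exc_info() returns (None, None, None) when no exception is being handled,
    # which is exactly the case the widened alias adds.
    return sys.exc_info()


print(current_exc_info())  # (None, None, None) outside an except block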
judgeval/constants.py
CHANGED
@@ -104,6 +104,8 @@ TOGETHER_SUPPORTED_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1",
 ]

+DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"
+
 JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}

 ACCEPTABLE_MODELS = (
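The diff only adds the constant; one plausible use is as a fallback when no Together model is specified. A hypothetical sketch, resolve_together_model is not part of the package and the shortened model list is illustrative.

from typing import Optional

TOGETHER_SUPPORTED_MODELS = [
    "meta-llama/Meta-Llama-3-8B-Instruct-Lite",
    "mistralai/Mistral-7B-Instruct-v0.1",
]
DEFAULT_TOGETHER_MODEL = "meta-llama/Meta-Llama-3-8B-Instruct-Lite"


def resolve_together_model(requested: Optional[str]) -> str:
    # Fall back to the package default when the caller does not pick a model.
    model = requested or DEFAULT_TOGETHER_MODEL
    if model not in TOGETHER_SUPPORTED_MODELS:
        raise ValueError(f"Unsupported Together model: {model}")
    return model


print(resolve_together_model(None))  # meta-llama/Meta-Llama-3-8B-Instruct-Lite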
judgeval/data/example.py
CHANGED
@@ -4,6 +4,7 @@ Classes for representing examples in a dataset.

 from enum import Enum
 from datetime import datetime
+from typing import Dict, Any, Optional
 from judgeval.data.judgment_types import ExampleJudgmentType


@@ -15,47 +16,18 @@ class ExampleParams(str, Enum):
     RETRIEVAL_CONTEXT = "retrieval_context"
     TOOLS_CALLED = "tools_called"
     EXPECTED_TOOLS = "expected_tools"
-    REASONING = "reasoning"
     ADDITIONAL_METADATA = "additional_metadata"


 class Example(ExampleJudgmentType):
     example_id: str = ""
+    created_at: str = datetime.now().isoformat()
+    name: Optional[str] = None

-    def
-
-
-        super().__init__(**data)
-        self.example_id = None
+    def to_dict(self) -> Dict[str, Any]:
+        data = super().model_dump(warnings=False)
+        return data

-    def
-
-
-            "actual_output": self.actual_output,
-            "expected_output": self.expected_output,
-            "context": self.context,
-            "retrieval_context": self.retrieval_context,
-            "additional_metadata": self.additional_metadata,
-            "tools_called": self.tools_called,
-            "expected_tools": self.expected_tools,
-            "name": self.name,
-            "example_id": self.example_id,
-            "example_index": self.example_index,
-            "created_at": self.created_at,
-        }
-
-    def __str__(self):
-        return (
-            f"Example(input={self.input}, "
-            f"actual_output={self.actual_output}, "
-            f"expected_output={self.expected_output}, "
-            f"context={self.context}, "
-            f"retrieval_context={self.retrieval_context}, "
-            f"additional_metadata={self.additional_metadata}, "
-            f"tools_called={self.tools_called}, "
-            f"expected_tools={self.expected_tools}, "
-            f"name={self.name}, "
-            f"example_id={self.example_id}, "
-            f"example_index={self.example_index}, "
-            f"created_at={self.created_at}, "
-        )
+    def get_fields(self):
+        excluded = {"example_id", "name", "created_at"}
+        return self.model_dump(exclude=excluded)
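A minimal sketch of the slimmed-down Example behavior. ExampleSketch is a stand-in, not the library class; it only mirrors the fields and the get_fields exclusion set shown in the hunk above (the parent's extra="allow" config is what lets arbitrary example fields ride along).

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, ConfigDict


class ExampleSketch(BaseModel):
    # Mirrors the reworked Example: metadata fields are declared,
    # everything else is accepted because of extra="allow".
    model_config = ConfigDict(extra="allow")
    example_id: str = ""
    created_at: str = datetime.now().isoformat()
    name: Optional[str] = None

    def get_fields(self):
        excluded = {"example_id", "name", "created_at"}
        return self.model_dump(exclude=excluded)


ex = ExampleSketch(input="What is 2 + 2?", actual_output="4")
print(ex.get_fields())  # {'input': 'What is 2 + 2?', 'actual_output': '4'}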
judgeval/data/judgment_types.py
CHANGED
@@ -1,12 +1,12 @@
 # generated by datamodel-codegen:
 # filename: openapi_new.json
-# timestamp: 2025-07-
+# timestamp: 2025-07-29T18:13:07+00:00

 from __future__ import annotations

 from typing import Annotated, Any, Dict, List, Optional, Union

-from pydantic import BaseModel, Field
+from pydantic import BaseModel, ConfigDict, Field


 class ValidationErrorJudgmentType(BaseModel):
@@ -31,6 +31,15 @@ class ScorerDataJudgmentType(BaseModel):
     ] = None


+class ExampleJudgmentType(BaseModel):
+    model_config = ConfigDict(
+        extra="allow",
+    )
+    example_id: Annotated[str, Field(title="Example Id")]
+    created_at: Annotated[str, Field(title="Created At")]
+    name: Annotated[Optional[str], Field(title="Name")] = None
+
+
 class ScorerConfigJudgmentType(BaseModel):
     score_type: Annotated[str, Field(title="Score Type")]
     name: Annotated[Optional[str], Field(title="Name")] = None
@@ -81,6 +90,17 @@ class HTTPValidationErrorJudgmentType(BaseModel):
     ] = None


+class JudgmentEvalJudgmentType(BaseModel):
+    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
+    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
+    examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
+    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
+    model: Annotated[str, Field(title="Model")]
+    append: Annotated[Optional[bool], Field(title="Append")] = False
+    override: Annotated[Optional[bool], Field(title="Override")] = False
+    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+
+
 class TraceSpanJudgmentType(BaseModel):
     span_id: Annotated[str, Field(title="Span Id")]
     trace_id: Annotated[str, Field(title="Trace Id")]
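The regenerated ExampleJudgmentType keeps only three declared fields and relies on ConfigDict(extra="allow") instead of enumerating every example attribute. A small self-contained illustration of what that setting does; the Probe model is hypothetical.

from pydantic import BaseModel, ConfigDict


class Probe(BaseModel):
    model_config = ConfigDict(extra="allow")
    example_id: str = ""


p = Probe(example_id="e-1", input="2 + 2?", actual_output="4")
print(p.input)         # extra keys are kept as attributes on the instance
print(p.model_dump())  # {'example_id': 'e-1', 'input': '2 + 2?', 'actual_output': '4'}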
@@ -109,43 +129,12 @@ class TraceSpanJudgmentType(BaseModel):
     update_id: Annotated[Optional[int], Field(title="Update Id")] = 1


-class ExampleJudgmentType(BaseModel):
-    input: Annotated[Optional[Union[str, Dict[str, Any]]], Field(title="Input")] = None
-    actual_output: Annotated[
-        Optional[Union[str, List[str]]], Field(title="Actual Output")
-    ] = None
-    expected_output: Annotated[
-        Optional[Union[str, List[str]]], Field(title="Expected Output")
-    ] = None
-    context: Annotated[Optional[List[str]], Field(title="Context")] = None
-    retrieval_context: Annotated[
-        Optional[List[str]], Field(title="Retrieval Context")
-    ] = None
-    additional_metadata: Annotated[
-        Optional[Dict[str, Any]], Field(title="Additional Metadata")
-    ] = None
-    tools_called: Annotated[Optional[List[str]], Field(title="Tools Called")] = Field(
-        default_factory=list
-    )
-    expected_tools: Annotated[
-        Optional[List[ToolJudgmentType]], Field(title="Expected Tools")
-    ] = Field(default_factory=list)
-    name: Annotated[Optional[str], Field(title="Name")] = None
-    example_id: Annotated[str, Field(title="Example Id")]
-    example_index: Annotated[Optional[int], Field(title="Example Index")] = None
-    created_at: Annotated[Optional[str], Field(title="Created At")] = None
-    trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
-    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
-    dataset_id: Annotated[Optional[str], Field(title="Dataset Id")] = None
-
-
 class TraceJudgmentType(BaseModel):
     trace_id: Annotated[str, Field(title="Trace Id")]
     name: Annotated[str, Field(title="Name")]
     created_at: Annotated[str, Field(title="Created At")]
     duration: Annotated[float, Field(title="Duration")]
     trace_spans: Annotated[List[TraceSpanJudgmentType], Field(title="Trace Spans")]
-    overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
     offline_mode: Annotated[Optional[bool], Field(title="Offline Mode")] = False
     rules: Annotated[Optional[Dict[str, Any]], Field(title="Rules")] = Field(
         default_factory=dict
@@ -165,7 +154,7 @@ class ScoringResultJudgmentType(BaseModel):
     success: Annotated[bool, Field(title="Success")]
     scorers_data: Annotated[
         Optional[List[ScorerDataJudgmentType]], Field(title="Scorers Data")
-    ]
+    ] = None
     name: Annotated[Optional[str], Field(title="Name")] = None
     data_object: Annotated[
         Optional[Union[TraceSpanJudgmentType, ExampleJudgmentType]],
@@ -188,17 +177,6 @@ class TraceRunJudgmentType(BaseModel):
     tools: Annotated[Optional[List[Dict[str, Any]]], Field(title="Tools")] = None


-class JudgmentEvalJudgmentType(BaseModel):
-    project_name: Annotated[Optional[str], Field(title="Project Name")] = None
-    eval_name: Annotated[Optional[str], Field(title="Eval Name")] = None
-    examples: Annotated[List[ExampleJudgmentType], Field(title="Examples")]
-    scorers: Annotated[List[ScorerConfigJudgmentType], Field(title="Scorers")]
-    model: Annotated[str, Field(title="Model")]
-    append: Annotated[Optional[bool], Field(title="Append")] = False
-    override: Annotated[Optional[bool], Field(title="Override")] = False
-    trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
-
-
 class EvalResultsJudgmentType(BaseModel):
     results: Annotated[List[ScoringResultJudgmentType], Field(title="Results")]
     run: Annotated[
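One behavioral consequence of the `] = None` change on ScoringResultJudgmentType.scorers_data is that the field no longer has to be supplied at construction time. A sketch with stand-in models, not the generated classes.

from typing import List, Optional

from pydantic import BaseModel


class ScorerDataSketch(BaseModel):  # stand-in for ScorerDataJudgmentType
    name: str
    score: float


class ScoringResultSketch(BaseModel):  # stand-in for ScoringResultJudgmentType
    success: bool
    scorers_data: Optional[List[ScorerDataSketch]] = None  # now defaults to None


# Valid under the 0.3.1-style schema even before any scorer data is attached:
print(ScoringResultSketch(success=False))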
judgeval/data/result.py
CHANGED
@@ -17,15 +17,14 @@ class ScoringResult(ScoringResultJudgmentType):

     """

-
-
-
-
-
-
-
-
-    }
+    data_object: (
+        Example  # Need to override this so that it uses this repo's Example class
+    )
+
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        data["data_object"] = self.data_object.model_dump()
+        return data

     def __str__(self) -> str:
         return f"ScoringResult(\
@@ -47,12 +46,7 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    if hasattr(data_object, "name") and data_object.name is not None:
-        name = data_object.name
-    else:
-        name = "Test Case Placeholder"
     scoring_result = ScoringResult(
-        name=name,
         data_object=data_object,
         success=success,
         scorers_data=scorers_data,
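The model_dump override above exists because pydantic v2 serializes nested models through its core serializer, so a subclass's customized model_dump would otherwise be bypassed for the nested field. A self-contained sketch of the same pattern; Inner and Outer are illustrative, not judgeval classes.

from pydantic import BaseModel


class Inner(BaseModel):
    value: int

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)
        data["extra_view"] = self.value * 2  # subclass-specific serialization
        return data


class Outer(BaseModel):
    data_object: Inner

    def model_dump(self, **kwargs):
        # Same pattern as ScoringResult above: dump normally, then replace the
        # nested object with its own (overridden) model_dump output.
        data = super().model_dump(**kwargs)
        data["data_object"] = self.data_object.model_dump()
        return data


print(Outer(data_object=Inner(value=2)).model_dump())
# {'data_object': {'value': 2, 'extra_view': 4}}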
judgeval/data/scripts/openapi_transform.py
CHANGED
@@ -1,7 +1,7 @@
-import
+import orjson
 import sys
 from typing import Any, Dict, Generator, List
-
+import requests

 spec_file = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000/openapi.json"

@@ -10,8 +10,8 @@ if spec_file.startswith("http"):
     r.raise_for_status()
     SPEC = r.json()
 else:
-    with open(spec_file, "
-    SPEC =
+    with open(spec_file, "rb") as f:
+        SPEC = orjson.loads(f.read())

 JUDGEVAL_PATHS: List[str] = [
     "/log_eval_results/",
@@ -120,4 +120,4 @@ spec = {
     },
 }

-print(
+print(orjson.dumps(spec, option=orjson.OPT_INDENT_2).decode("utf-8"))
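The script now reads the spec in binary mode and round-trips it through orjson, which works on bytes rather than str. A minimal sketch of that round trip; requires the orjson package, and the payload dict is made up.

import orjson

payload = {"openapi": "3.1.0", "paths": {"/log_eval_results/": {}}}
encoded = orjson.dumps(payload, option=orjson.OPT_INDENT_2)  # returns bytes
decoded = orjson.loads(encoded)                              # accepts bytes or str
print(encoded.decode("utf-8"))
print(decoded == payload)  # True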
judgeval/data/trace.py
CHANGED
@@ -1,7 +1,7 @@
 from typing import Any
-import json
 import sys
 import threading
+import orjson
 from datetime import datetime, timezone
 from judgeval.data.judgment_types import (
     TraceUsageJudgmentType,
@@ -83,7 +83,7 @@ class TraceSpan(TraceSpanJudgmentType):
     def _is_json_serializable(self, obj: Any) -> bool:
         """Helper method to check if an object is JSON serializable."""
         try:
-
+            orjson.dumps(obj)
             return True
         except (TypeError, OverflowError, ValueError):
             return False
@@ -177,9 +177,8 @@ class TraceSpan(TraceSpanJudgmentType):
             # Recursively serialize list/tuple items
             return [serialize_value(item, current_depth + 1) for item in value]
         else:
-            # Try direct JSON serialization first
             try:
-
+                orjson.dumps(value)
                 return value
             except (TypeError, OverflowError, ValueError):
                 # Fallback to safe stringification
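The try/except around orjson.dumps is the serializability probe TraceSpan uses; the same check reduced to a standalone helper for illustration (is_orjson_serializable is not part of the package). orjson raises JSONEncodeError, a TypeError subclass, for unsupported types, so the except clause above still catches it.

from typing import Any

import orjson


def is_orjson_serializable(obj: Any) -> bool:
    # Probe serializability the same way TraceSpan._is_json_serializable does.
    try:
        orjson.dumps(obj)
        return True
    except (TypeError, OverflowError, ValueError):
        return False


print(is_orjson_serializable({"ok": 1}))  # True
print(is_orjson_serializable({1, 2, 3}))  # False: orjson does not serialize sets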
|