judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: the registry flags this release of judgeval as potentially problematic; see the registry listing for details.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/result.py
CHANGED
The standalone `ScoringResult` class is replaced by a subclass of the generated judgment type. The old class carried its own copies of example fields (`expected_output`, `context`, `retrieval_context`, `additional_metadata`, `tools_called`, `expected_tools`, `trace_id`, `example_id`, `eval_run_name`), a `scorers_data: Union[List[ScorerData], None]` field, and a hand-written `to_dict()`; `generate_scoring_result` used to copy those fields off a `ProcessExample`. New version:

```python
from typing import List, Union
from judgeval.data import ScorerData, Example
from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult


class ScoringResult(JudgmentScoringResult):
    """
    A ScoringResult contains the output of one or more scorers applied to a single example.
    Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)

    Args:
        success (bool): Whether the evaluation was successful.
            This means that all scorers applied to this example returned a success.
        scorer_data (List[ScorerData]): The scorers data for the evaluated example
        data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, WorkflowRun (future)

    """

    # Need to override this so that it uses this repo's Example class
    data_object: Example
    scorers_data: List[ScorerData]

    def model_dump(self, **kwargs):
        data = super().model_dump(**kwargs)
        data["data_object"] = self.data_object.model_dump()
        return data

    def __str__(self) -> str:
        return f"ScoringResult(\
            success={self.success}, \
            scorers_data={self.scorers_data}, \
            data_object={self.data_object}, \
            run_duration={self.run_duration})"


def generate_scoring_result(
    data_object: Union[Example],
    scorers_data: List[ScorerData],
    run_duration: float,
    success: bool,
) -> ScoringResult:
    """
    Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.

    When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
    At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
    """
    scoring_result = ScoringResult(
        data_object=data_object,
        success=success,
        scorers_data=scorers_data,
        run_duration=run_duration,
    )
    return scoring_result
```
judgeval/data/scorer_data.py
CHANGED
The locally defined Pydantic `ScorerData` model (name, threshold, success, score, reason, strict_mode, evaluation_model, error, evaluation_cost, verbose_logs, additional_metadata) and its `to_dict()` are removed in favour of the generated `ScorerData` judgment type; `create_scorer_data` now accepts a `BaseScorer` instead of a `JudgevalScorer` and returns a list so that an optional `internal_scorer` can contribute a second record. New version (the elided docstring lines are not shown in the diff):

```python
"""
Implementation of the ScorerData class.

ScorerData holds the information related to a single, completed Scorer evaluation run.
"""

from __future__ import annotations

from judgeval.data.judgment_types import ScorerData
from judgeval.scorers import BaseScorer
from typing import List


def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
    """
    After a `scorer` is run, it contains information about the example that was evaluated
    using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
    ...
    contains the output of the scorer run that can be exported to be logged as a part of
    the ScorerResult.
    """
    scorers_result = list()

    scorers_result.append(
        ScorerData(
            name=scorer.name,
            threshold=scorer.threshold,
            score=scorer.score,
            reason=scorer.reason,
            success=scorer.success,
            strict_mode=scorer.strict_mode,
            evaluation_model=scorer.model,
            error=scorer.error,
            additional_metadata=scorer.additional_metadata,
        )
    )
    if hasattr(scorer, "internal_scorer") and scorer.internal_scorer is not None:
        scorers_result.append(
            ScorerData(
                name=scorer.internal_scorer.name,
                score=scorer.internal_scorer.score,
                threshold=scorer.internal_scorer.threshold,
                reason=scorer.internal_scorer.reason,
                success=scorer.internal_scorer.success,
                strict_mode=scorer.internal_scorer.strict_mode,
                evaluation_model=scorer.internal_scorer.model,
                error=scorer.internal_scorer.error,
                additional_metadata=scorer.internal_scorer.additional_metadata,
            )
        )
    return scorers_result
```
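A hedged usage sketch of the new `create_scorer_data`: it only reads attributes off the scorer it is given (`name`, `threshold`, `score`, `reason`, `success`, `strict_mode`, `model`, `error`, `additional_metadata`, plus an optional `internal_scorer`), so a duck-typed object is enough to illustrate the one-or-two-record output. `SimpleNamespace` is used purely for illustration and assumes judgeval 0.22.x is installed; real callers pass a `BaseScorer`, and the exact field validation lives in the generated `ScorerData` judgment type.

```python
from types import SimpleNamespace

from judgeval.data.scorer_data import create_scorer_data

# Hypothetical scorer with a nested internal scorer; values are made up.
inner = SimpleNamespace(
    name="faithfulness_claim_check", threshold=0.7, score=0.9,
    reason="all claims supported", success=True, strict_mode=False,
    model="gpt-4o-mini", error=None, additional_metadata=None,
)
scorer = SimpleNamespace(
    name="faithfulness", threshold=0.7, score=0.85,
    reason="1 unsupported claim", success=True, strict_mode=False,
    model="gpt-4o-mini", error=None, additional_metadata=None,
    internal_scorer=inner,
)

records = create_scorer_data(scorer)  # one ScorerData per scorer, plus the internal one
print([r.name for r in records])      # ['faithfulness', 'faithfulness_claim_check']
```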
judgeval/data/scripts/fix_default_factory.py
ADDED
New helper script that rewrites mutable literal defaults in the generated Pydantic models into `default_factory` fields:

```python
#!/usr/bin/env python3
"""
Post-process generated Pydantic models with default_factory defaults.
"""

import sys


def fix_mutable_defaults(file_path: str) -> None:
    """Fix mutable defaults in generated Pydantic models."""

    with open(file_path, "r") as f:
        content = f.read()

    content = content.replace(" = {}", " = Field(default_factory=dict)")
    content = content.replace(" = []", " = Field(default_factory=list)")
    with open(file_path, "w") as f:
        f.write(content)


if __name__ == "__main__":
    file_path = sys.argv[1]
    fix_mutable_defaults(file_path)
```
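What the script rewrites is easiest to see on a small fragment of generated model code. The class below is hypothetical; the substitutions are exactly the two performed by `fix_mutable_defaults`, and the script is invoked with the generated file as its only argument.

```python
# A fragment of generated model source (hypothetical) before post-processing.
before = (
    "class Foo(BaseModel):\n"
    "    tags: List[str] = []\n"
    "    extra: Dict[str, Any] = {}\n"
)

# The same two replacements the script applies in place to the file.
after = before.replace(" = {}", " = Field(default_factory=dict)")
after = after.replace(" = []", " = Field(default_factory=list)")
print(after)
# class Foo(BaseModel):
#     tags: List[str] = Field(default_factory=list)
#     extra: Dict[str, Any] = Field(default_factory=dict)
```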
judgeval/data/scripts/openapi_transform.py
ADDED
New script that loads an OpenAPI spec (from a URL or a file), keeps only the judgeval-relevant paths, and renames every referenced schema with a `_JudgmentType` suffix before printing the reduced spec:

```python
import orjson
import sys
from typing import Any, Dict, Generator, List
import requests

spec_file = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000/openapi.json"

if spec_file.startswith("http"):
    r = requests.get(spec_file)
    r.raise_for_status()
    SPEC = r.json()
else:
    with open(spec_file, "rb") as f:
        SPEC = orjson.loads(f.read())

JUDGEVAL_PATHS: List[str] = [
    "/log_eval_results/",
]


def resolve_ref(ref: str) -> str:
    assert ref.startswith("#/components/schemas/"), (
        "Reference must start with #/components/schemas/"
    )
    return ref.replace("#/components/schemas/", "")


def walk(obj: Any) -> Generator[Any, None, None]:
    yield obj
    if isinstance(obj, list):
        for item in obj:
            yield from walk(item)
    elif isinstance(obj, dict):
        for value in obj.values():
            yield from walk(value)


def get_referenced_schemas(obj: Any) -> Generator[str, None, None]:
    for value in walk(obj):
        if isinstance(value, dict) and "$ref" in value:
            ref = value["$ref"]
            resolved = resolve_ref(ref)
            assert isinstance(ref, str), "Reference must be a string"
            # Strip the _JudgmentType suffix if it exists to get the original schema name
            if resolved.endswith("_JudgmentType"):
                resolved = resolved[: -len("_JudgmentType")]
            yield resolved


def transform_schema_refs(obj: Any) -> Any:
    """Transform all $ref values in a schema to use the _JudgmentType suffix"""
    if isinstance(obj, dict):
        result = {}
        for key, value in obj.items():
            if (
                key == "$ref"
                and isinstance(value, str)
                and value.startswith("#/components/schemas/")
            ):
                # Update the reference to use the suffixed name
                original_name = resolve_ref(value)
                suffixed_name = f"{original_name}_JudgmentType"
                result[key] = f"#/components/schemas/{suffixed_name}"
            else:
                result[key] = transform_schema_refs(value)
        return result
    elif isinstance(obj, list):
        return [transform_schema_refs(item) for item in obj]
    else:
        return obj


filtered_paths = {
    path: spec_data
    for path, spec_data in SPEC["paths"].items()
    if path in JUDGEVAL_PATHS
}


def filter_schemas() -> Dict[str, Any]:
    result: Dict[str, Any] = {}
    processed_original_names: set[str] = set()
    schemas_to_scan: Any = filtered_paths

    while True:
        to_commit: Dict[str, Any] = {}
        for original_schema_name in get_referenced_schemas(schemas_to_scan):
            if original_schema_name in processed_original_names:
                continue

            assert original_schema_name in SPEC["components"]["schemas"], (
                f"Schema {original_schema_name} not found in components.schemas"
            )
            # Transform the schema to update any internal references
            original_schema = SPEC["components"]["schemas"][original_schema_name]
            transformed_schema = transform_schema_refs(original_schema)
            suffixed_name = f"{original_schema_name}_JudgmentType"
            to_commit[suffixed_name] = transformed_schema
            processed_original_names.add(original_schema_name)

        if not to_commit:
            break

        result.update(to_commit)
        schemas_to_scan = to_commit

    return result


# Transform the filtered paths to update schema references
transformed_paths = transform_schema_refs(filtered_paths)

spec = {
    "openapi": SPEC["openapi"],
    "info": SPEC["info"],
    "paths": transformed_paths,
    "components": {
        **SPEC["components"],
        "schemas": filter_schemas(),
    },
}

print(orjson.dumps(spec, option=orjson.OPT_INDENT_2).decode("utf-8"))
```
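The core of the script is the `$ref` rewrite. Since the module is written as a CLI that prints a whole spec, the rewrite is restated below as a compact standalone function so its effect on a single schema fragment is visible; the fragment itself is made up for illustration.

```python
# Condensed restatement of transform_schema_refs: append _JudgmentType to every
# schema reference so generated model names don't collide with local class names.
def rewrite_refs(obj):
    if isinstance(obj, dict):
        return {
            k: (
                f"#/components/schemas/{v.split('/')[-1]}_JudgmentType"
                if k == "$ref"
                and isinstance(v, str)
                and v.startswith("#/components/schemas/")
                else rewrite_refs(v)
            )
            for k, v in obj.items()
        }
    if isinstance(obj, list):
        return [rewrite_refs(i) for i in obj]
    return obj


schema = {"properties": {"scorer": {"$ref": "#/components/schemas/ScorerData"}}}
print(rewrite_refs(schema))
# {'properties': {'scorer': {'$ref': '#/components/schemas/ScorerData_JudgmentType'}}}
```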
judgeval/data/trace.py
ADDED
New module wrapping the generated OTel trace types with convenience classes (`TraceUsage`, `TraceScore`, `TraceRule`, `TraceSpan`, `Trace`):

```python
from typing import Optional, List, Dict, Any
from pydantic import BaseModel
from .judgment_types import (
    OtelSpanDetailScores,
    OtelSpanDetail,
    OtelTraceListItem,
)


class TraceUsage(BaseModel):
    prompt_tokens: Optional[int] = None
    completion_tokens: Optional[int] = None
    cache_creation_input_tokens: Optional[int] = None
    cache_read_input_tokens: Optional[int] = None
    total_tokens: Optional[int] = None
    prompt_tokens_cost_usd: Optional[float] = None
    completion_tokens_cost_usd: Optional[float] = None
    total_cost_usd: Optional[float] = None
    model_name: Optional[str] = None


class TraceScore(OtelSpanDetailScores):
    """Score information for a trace or span."""

    pass


class TraceRule(BaseModel):
    """Rule that was triggered for a trace."""

    rule_id: str
    rule_name: str


class TraceSpan(OtelSpanDetail):
    """Individual span within a trace with complete telemetry data."""

    @classmethod
    def from_otel_span_detail(cls, span_detail: OtelSpanDetail) -> "TraceSpan":
        """Create TraceSpan from OtelSpanDetail, converting scores to TraceScore."""
        data = span_detail.model_dump()

        if "scores" in data and data["scores"]:
            data["scores"] = [TraceScore(**score) for score in data["scores"]]

        return cls(**data)

    def to_dict(self) -> Dict[str, Any]:
        """Convert TraceSpan to dictionary."""
        return self.model_dump(exclude_none=True)


class Trace(OtelTraceListItem):
    """Complete trace with metadata and all associated spans."""

    spans: List[TraceSpan] = []
    rules: Optional[List[TraceRule]] = []

    @classmethod
    def from_dataset_trace_with_spans(cls, dataset_trace: Any) -> "Trace":
        """Create Trace from DatasetTraceWithSpans (handles both API and judgment types)."""

        if hasattr(dataset_trace, "trace_detail"):
            trace_detail = dataset_trace.trace_detail
            spans_data = dataset_trace.spans
        else:
            trace_detail = dataset_trace.get("trace_detail", {})
            spans_data = dataset_trace.get("spans", [])

        if hasattr(trace_detail, "model_dump"):
            trace_data = trace_detail.model_dump()
        elif isinstance(trace_detail, dict):
            trace_data = trace_detail.copy()
        else:
            trace_data = dict(trace_detail)

        spans = []
        for span in spans_data:
            if hasattr(span, "model_dump"):
                spans.append(TraceSpan.from_otel_span_detail(span))
            else:
                # Handle dict spans
                span_data = dict(span) if not isinstance(span, dict) else span.copy()
                if "scores" in span_data and span_data["scores"]:
                    span_data["scores"] = [
                        TraceScore(**score)
                        if isinstance(score, dict)
                        else TraceScore(**score.model_dump())
                        for score in span_data["scores"]
                    ]
                spans.append(TraceSpan(**span_data))

        rules = []
        if "rule_id" in trace_data and trace_data["rule_id"]:
            rules = [
                TraceRule(
                    rule_id=trace_data["rule_id"],
                    rule_name=f"Rule {trace_data['rule_id']}",
                )
            ]

        trace_data.pop("scores", [])
        trace_data.pop("rule_id", None)
        trace = cls(**trace_data)

        trace.spans = spans
        trace.rules = rules

        return trace

    def to_dict(self) -> Dict[str, Any]:
        """Convert Trace to dictionary."""
        return self.model_dump(exclude_none=True)

    def __len__(self) -> int:
        """Return the number of spans in the trace."""
        return len(self.spans)

    def __iter__(self):
        """Iterate over spans in the trace."""
        return iter(self.spans)
```
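A small hedged sketch around the new `TraceUsage` model: every field is optional, so callers fill in totals only when both components are known. The numbers below are made up for illustration; only the field names come from the module above, and judgeval 0.22.x is assumed to be installed.

```python
from judgeval.data.trace import TraceUsage

usage = TraceUsage(
    prompt_tokens=1200,
    completion_tokens=350,
    prompt_tokens_cost_usd=0.0030,
    completion_tokens_cost_usd=0.0053,
    model_name="gpt-4o-mini",
)

# Derive totals only when both sides are present, since every field defaults to None.
if usage.prompt_tokens is not None and usage.completion_tokens is not None:
    usage.total_tokens = usage.prompt_tokens + usage.completion_tokens
if usage.prompt_tokens_cost_usd is not None and usage.completion_tokens_cost_usd is not None:
    usage.total_cost_usd = usage.prompt_tokens_cost_usd + usage.completion_tokens_cost_usd

print(usage.total_tokens, round(usage.total_cost_usd, 4))  # 1550 0.0083
```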