judgeval 0.9.4__py3-none-any.whl → 0.10.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +2 -2
- judgeval/api/__init__.py +28 -96
- judgeval/api/api_types.py +49 -140
- judgeval/constants.py +1 -5
- judgeval/data/__init__.py +1 -3
- judgeval/data/example.py +4 -2
- judgeval/data/judgment_types.py +57 -165
- judgeval/data/result.py +1 -2
- judgeval/data/trace.py +14 -40
- judgeval/dataset/__init__.py +15 -42
- judgeval/evaluation/__init__.py +23 -34
- judgeval/scorers/__init__.py +9 -7
- judgeval/scorers/api_scorer.py +8 -0
- judgeval/scorers/base_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +43 -4
- judgeval/tracer/__init__.py +13 -50
- judgeval/tracer/local_eval_queue.py +2 -2
- judgeval/tracer/processors/__init__.py +1 -1
- judgeval/tracer/utils.py +1 -1
- judgeval/trainer/trainer.py +4 -4
- {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/METADATA +1 -1
- {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/RECORD +30 -35
- judgeval/data/trace_run.py +0 -39
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/scorers/trace_api_scorer.py +0 -5
- {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/WHEEL +0 -0
- {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/entry_points.txt +0 -0
- {judgeval-0.9.4.dist-info → judgeval-0.10.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/base_scorer.py
CHANGED
@@ -10,24 +10,16 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
|
|
10
10
|
from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
|
11
11
|
InstructionAdherenceScorer,
|
12
12
|
)
|
13
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
|
14
|
-
DerailmentScorer,
|
15
|
-
)
|
16
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
|
17
13
|
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
|
14
|
+
TracePromptScorer,
|
18
15
|
PromptScorer,
|
19
16
|
)
|
20
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
|
21
|
-
ToolDependencyScorer,
|
22
|
-
)
|
23
17
|
|
24
18
|
__all__ = [
|
25
19
|
"FaithfulnessScorer",
|
26
20
|
"AnswerRelevancyScorer",
|
27
21
|
"AnswerCorrectnessScorer",
|
28
22
|
"InstructionAdherenceScorer",
|
29
|
-
"
|
30
|
-
"ToolOrderScorer",
|
23
|
+
"TracePromptScorer",
|
31
24
|
"PromptScorer",
|
32
|
-
"ToolDependencyScorer",
|
33
25
|
]
|
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
# Internal imports
|
9
|
-
from judgeval.scorers.api_scorer import
|
9
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
10
10
|
from judgeval.constants import APIScorerType
|
11
11
|
from judgeval.data import ExampleParams
|
12
12
|
from typing import List
|
13
13
|
|
14
14
|
|
15
|
-
class AnswerCorrectnessScorer(
|
15
|
+
class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
|
16
16
|
score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
|
17
17
|
required_params: List[ExampleParams] = [
|
18
18
|
ExampleParams.INPUT,
|
@@ -1,10 +1,10 @@
|
|
1
|
-
from judgeval.scorers.api_scorer import
|
1
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
2
2
|
from judgeval.constants import APIScorerType
|
3
3
|
from judgeval.data import ExampleParams
|
4
4
|
from typing import List
|
5
5
|
|
6
6
|
|
7
|
-
class AnswerRelevancyScorer(
|
7
|
+
class AnswerRelevancyScorer(ExampleAPIScorerConfig):
|
8
8
|
score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
|
9
9
|
required_params: List[ExampleParams] = [
|
10
10
|
ExampleParams.INPUT,
|
@@ -6,13 +6,13 @@ TODO add link to docs page for this scorer
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
# Internal imports
|
9
|
-
from judgeval.scorers.api_scorer import
|
9
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
10
10
|
from judgeval.constants import APIScorerType
|
11
11
|
from judgeval.data import ExampleParams
|
12
12
|
from typing import List
|
13
13
|
|
14
14
|
|
15
|
-
class FaithfulnessScorer(
|
15
|
+
class FaithfulnessScorer(ExampleAPIScorerConfig):
|
16
16
|
score_type: APIScorerType = APIScorerType.FAITHFULNESS
|
17
17
|
required_params: List[ExampleParams] = [
|
18
18
|
ExampleParams.INPUT,
|
@@ -6,12 +6,12 @@ TODO add link to docs page for this scorer
|
|
6
6
|
"""
|
7
7
|
|
8
8
|
# Internal imports
|
9
|
-
from judgeval.scorers.api_scorer import
|
9
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
10
10
|
from judgeval.constants import APIScorerType
|
11
11
|
from judgeval.data import ExampleParams
|
12
12
|
|
13
13
|
|
14
|
-
class InstructionAdherenceScorer(
|
14
|
+
class InstructionAdherenceScorer(ExampleAPIScorerConfig):
|
15
15
|
def __init__(self, threshold: float):
|
16
16
|
super().__init__(
|
17
17
|
threshold=threshold,
|
@@ -1,4 +1,8 @@
|
|
1
|
-
from judgeval.scorers.api_scorer import
|
1
|
+
from judgeval.scorers.api_scorer import (
|
2
|
+
APIScorerConfig,
|
3
|
+
ExampleAPIScorerConfig,
|
4
|
+
TraceAPIScorerConfig,
|
5
|
+
)
|
2
6
|
from judgeval.constants import APIScorerType
|
3
7
|
from typing import Dict, Any, Optional
|
4
8
|
from judgeval.api import JudgmentSyncClient
|
@@ -6,6 +10,7 @@ from judgeval.exceptions import JudgmentAPIError
|
|
6
10
|
import os
|
7
11
|
from copy import copy
|
8
12
|
from judgeval.logger import judgeval_logger
|
13
|
+
from abc import ABC
|
9
14
|
|
10
15
|
|
11
16
|
def push_prompt_scorer(
|
@@ -15,6 +20,7 @@ def push_prompt_scorer(
|
|
15
20
|
options: Optional[Dict[str, float]] = None,
|
16
21
|
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or "",
|
17
22
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
|
23
|
+
is_trace: Optional[bool] = None,
|
18
24
|
) -> str:
|
19
25
|
client = JudgmentSyncClient(judgment_api_key, organization_id)
|
20
26
|
try:
|
@@ -24,6 +30,7 @@ def push_prompt_scorer(
|
|
24
30
|
"prompt": prompt,
|
25
31
|
"threshold": threshold,
|
26
32
|
"options": options,
|
33
|
+
"is_trace": is_trace,
|
27
34
|
}
|
28
35
|
)
|
29
36
|
except JudgmentAPIError as e:
|
@@ -88,7 +95,7 @@ def scorer_exists(
|
|
88
95
|
)
|
89
96
|
|
90
97
|
|
91
|
-
class
|
98
|
+
class BasePromptScorer(ABC, APIScorerConfig):
|
92
99
|
"""
|
93
100
|
In the Judgment backend, this scorer is implemented as a PromptScorer that takes
|
94
101
|
1. a system role that may involve the Example object
|
@@ -97,9 +104,9 @@ class PromptScorer(APIScorerConfig):
|
|
97
104
|
and uses a judge to execute the evaluation from the system role and classify into one of the options
|
98
105
|
"""
|
99
106
|
|
107
|
+
score_type: APIScorerType
|
100
108
|
prompt: str
|
101
109
|
options: Optional[Dict[str, float]] = None
|
102
|
-
score_type: APIScorerType = APIScorerType.PROMPT_SCORER
|
103
110
|
judgment_api_key: str = os.getenv("JUDGMENT_API_KEY") or ""
|
104
111
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID") or ""
|
105
112
|
|
@@ -111,7 +118,18 @@ class PromptScorer(APIScorerConfig):
|
|
111
118
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
|
112
119
|
):
|
113
120
|
scorer_config = fetch_prompt_scorer(name, judgment_api_key, organization_id)
|
121
|
+
if scorer_config["is_trace"] != issubclass(cls, TracePromptScorer):
|
122
|
+
raise JudgmentAPIError(
|
123
|
+
status_code=400,
|
124
|
+
detail=f"Scorer with name {name} is not a {cls.__name__}",
|
125
|
+
response=None, # type: ignore
|
126
|
+
)
|
127
|
+
if issubclass(cls, TracePromptScorer):
|
128
|
+
score_type = APIScorerType.TRACE_PROMPT_SCORER
|
129
|
+
else:
|
130
|
+
score_type = APIScorerType.PROMPT_SCORER
|
114
131
|
return cls(
|
132
|
+
score_type=score_type,
|
115
133
|
name=name,
|
116
134
|
prompt=scorer_config["prompt"],
|
117
135
|
threshold=scorer_config["threshold"],
|
@@ -131,11 +149,24 @@ class PromptScorer(APIScorerConfig):
|
|
131
149
|
organization_id: str = os.getenv("JUDGMENT_ORG_ID") or "",
|
132
150
|
):
|
133
151
|
if not scorer_exists(name, judgment_api_key, organization_id):
|
152
|
+
if issubclass(cls, TracePromptScorer):
|
153
|
+
is_trace = True
|
154
|
+
score_type = APIScorerType.TRACE_PROMPT_SCORER
|
155
|
+
else:
|
156
|
+
is_trace = False
|
157
|
+
score_type = APIScorerType.PROMPT_SCORER
|
134
158
|
push_prompt_scorer(
|
135
|
-
name,
|
159
|
+
name,
|
160
|
+
prompt,
|
161
|
+
threshold,
|
162
|
+
options,
|
163
|
+
judgment_api_key,
|
164
|
+
organization_id,
|
165
|
+
is_trace,
|
136
166
|
)
|
137
167
|
judgeval_logger.info(f"Successfully created PromptScorer: {name}")
|
138
168
|
return cls(
|
169
|
+
score_type=score_type,
|
139
170
|
name=name,
|
140
171
|
prompt=prompt,
|
141
172
|
threshold=threshold,
|
@@ -251,3 +282,11 @@ class PromptScorer(APIScorerConfig):
|
|
251
282
|
k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
|
252
283
|
}
|
253
284
|
return base
|
285
|
+
|
286
|
+
|
287
|
+
class PromptScorer(BasePromptScorer, ExampleAPIScorerConfig):
|
288
|
+
pass
|
289
|
+
|
290
|
+
|
291
|
+
class TracePromptScorer(BasePromptScorer, TraceAPIScorerConfig):
|
292
|
+
pass
|
judgeval/tracer/__init__.py
CHANGED
@@ -43,8 +43,7 @@ from judgeval.env import (
|
|
43
43
|
JUDGMENT_ORG_ID,
|
44
44
|
)
|
45
45
|
from judgeval.logger import judgeval_logger
|
46
|
-
from judgeval.scorers.api_scorer import
|
47
|
-
from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
|
46
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig, TraceAPIScorerConfig
|
48
47
|
from judgeval.scorers.base_scorer import BaseScorer
|
49
48
|
from judgeval.tracer.constants import JUDGEVAL_TRACER_INSTRUMENTING_MODULE_NAME
|
50
49
|
from judgeval.tracer.managers import (
|
@@ -485,11 +484,11 @@ class Tracer:
|
|
485
484
|
safe_serialize(format_inputs(f, args, kwargs)),
|
486
485
|
)
|
487
486
|
|
487
|
+
self.judgment_processor.emit_partial()
|
488
|
+
|
488
489
|
if scorer_config:
|
489
490
|
self._set_pending_trace_eval(span, scorer_config, args, kwargs)
|
490
491
|
|
491
|
-
self.judgment_processor.emit_partial()
|
492
|
-
|
493
492
|
result = f(*args, **kwargs)
|
494
493
|
except Exception as user_exc:
|
495
494
|
span.record_exception(user_exc)
|
@@ -537,13 +536,13 @@ class Tracer:
|
|
537
536
|
safe_serialize(format_inputs(f, args, kwargs)),
|
538
537
|
)
|
539
538
|
|
539
|
+
self.judgment_processor.emit_partial()
|
540
|
+
|
540
541
|
if scorer_config:
|
541
542
|
self._set_pending_trace_eval(
|
542
543
|
main_span, scorer_config, args, kwargs
|
543
544
|
)
|
544
545
|
|
545
|
-
self.judgment_processor.emit_partial()
|
546
|
-
|
547
546
|
generator = f(*args, **kwargs)
|
548
547
|
set_span_attribute(
|
549
548
|
main_span, AttributeKeys.JUDGMENT_OUTPUT, "<generator>"
|
@@ -587,11 +586,11 @@ class Tracer:
|
|
587
586
|
safe_serialize(format_inputs(f, args, kwargs)),
|
588
587
|
)
|
589
588
|
|
589
|
+
self.judgment_processor.emit_partial()
|
590
|
+
|
590
591
|
if scorer_config:
|
591
592
|
self._set_pending_trace_eval(span, scorer_config, args, kwargs)
|
592
593
|
|
593
|
-
self.judgment_processor.emit_partial()
|
594
|
-
|
595
594
|
result = await f(*args, **kwargs)
|
596
595
|
except Exception as user_exc:
|
597
596
|
span.record_exception(user_exc)
|
@@ -639,13 +638,13 @@ class Tracer:
|
|
639
638
|
safe_serialize(format_inputs(f, args, kwargs)),
|
640
639
|
)
|
641
640
|
|
641
|
+
self.judgment_processor.emit_partial()
|
642
|
+
|
642
643
|
if scorer_config:
|
643
644
|
self._set_pending_trace_eval(
|
644
645
|
main_span, scorer_config, args, kwargs
|
645
646
|
)
|
646
647
|
|
647
|
-
self.judgment_processor.emit_partial()
|
648
|
-
|
649
648
|
async_generator = f(*args, **kwargs)
|
650
649
|
set_span_attribute(
|
651
650
|
main_span, AttributeKeys.JUDGMENT_OUTPUT, "<async_generator>"
|
@@ -825,42 +824,6 @@ class Tracer:
|
|
825
824
|
|
826
825
|
return sync_wrapper
|
827
826
|
|
828
|
-
@overload
|
829
|
-
def observe_tools(
|
830
|
-
self,
|
831
|
-
cls: Cls,
|
832
|
-
/,
|
833
|
-
*,
|
834
|
-
exclude_methods: List[str] = [],
|
835
|
-
include_private: bool = False,
|
836
|
-
) -> Cls: ...
|
837
|
-
|
838
|
-
@overload
|
839
|
-
def observe_tools(
|
840
|
-
self,
|
841
|
-
cls: None = None,
|
842
|
-
/,
|
843
|
-
*,
|
844
|
-
exclude_methods: List[str] = [],
|
845
|
-
include_private: bool = False,
|
846
|
-
) -> Callable[[Cls], Cls]: ...
|
847
|
-
|
848
|
-
def observe_tools(
|
849
|
-
self,
|
850
|
-
cls: Cls | None = None,
|
851
|
-
/,
|
852
|
-
*,
|
853
|
-
exclude_methods: List[str] = [],
|
854
|
-
include_private: bool = False,
|
855
|
-
) -> Cls | Callable[[Cls], Cls]:
|
856
|
-
if cls is None:
|
857
|
-
return partial(
|
858
|
-
self.observe_tools,
|
859
|
-
exclude_methods=exclude_methods,
|
860
|
-
include_private=include_private,
|
861
|
-
)
|
862
|
-
return cls
|
863
|
-
|
864
827
|
def wrap(self, client: ApiClient) -> ApiClient:
|
865
828
|
return wrap_provider(self, client)
|
866
829
|
|
@@ -899,7 +862,7 @@ class Tracer:
|
|
899
862
|
self,
|
900
863
|
/,
|
901
864
|
*,
|
902
|
-
scorer: Union[
|
865
|
+
scorer: Union[ExampleAPIScorerConfig, BaseScorer],
|
903
866
|
example: Example,
|
904
867
|
model: str = JUDGMENT_DEFAULT_GPT_MODEL,
|
905
868
|
sampling_rate: float = 1.0,
|
@@ -908,9 +871,9 @@ class Tracer:
|
|
908
871
|
judgeval_logger.info("Evaluation is not enabled, skipping evaluation")
|
909
872
|
return
|
910
873
|
|
911
|
-
if not isinstance(scorer, (
|
874
|
+
if not isinstance(scorer, (ExampleAPIScorerConfig, BaseScorer)):
|
912
875
|
judgeval_logger.error(
|
913
|
-
"Scorer must be an instance of
|
876
|
+
"Scorer must be an instance of ExampleAPIScorerConfig or BaseScorer, got %s, skipping evaluation."
|
914
877
|
% type(scorer)
|
915
878
|
)
|
916
879
|
return
|
@@ -939,7 +902,7 @@ class Tracer:
|
|
939
902
|
span_context = self.get_current_span().get_span_context()
|
940
903
|
trace_id = format(span_context.trace_id, "032x")
|
941
904
|
span_id = format(span_context.span_id, "016x")
|
942
|
-
hosted_scoring = isinstance(scorer,
|
905
|
+
hosted_scoring = isinstance(scorer, ExampleAPIScorerConfig) or (
|
943
906
|
isinstance(scorer, BaseScorer) and scorer.server_hosted
|
944
907
|
)
|
945
908
|
eval_run_name = f"async_evaluate_{span_id}" # note this name doesnt matter because we don't save the experiment only the example and scorer_data
|
@@ -24,7 +24,7 @@ class LocalEvaluationQueue:
|
|
24
24
|
"""Lightweight in-memory queue for local evaluation runs.
|
25
25
|
|
26
26
|
Only supports EvaluationRuns with local scorers (BaseScorer instances).
|
27
|
-
API scorers (
|
27
|
+
API scorers (ExampleAPIScorerConfig) are not supported as they have their own queue.
|
28
28
|
"""
|
29
29
|
|
30
30
|
def __init__(
|
@@ -54,7 +54,7 @@ class LocalEvaluationQueue:
|
|
54
54
|
if not evaluation_run.custom_scorers:
|
55
55
|
raise ValueError(
|
56
56
|
"LocalEvaluationQueue only supports runs with local scorers (BaseScorer). "
|
57
|
-
"Found only
|
57
|
+
"Found only ExampleAPIScorerConfig instances."
|
58
58
|
)
|
59
59
|
|
60
60
|
return safe_run_async(
|
@@ -97,7 +97,7 @@ class JudgmentSpanProcessor(BatchSpanProcessor):
|
|
97
97
|
resource_attributes[ResourceKeys.JUDGMENT_PROJECT_ID] = self.project_id
|
98
98
|
else:
|
99
99
|
judgeval_logger.error(
|
100
|
-
f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/projects. Skipping Judgment export."
|
100
|
+
f"Failed to resolve project {self.project_name}, please create it first at https://app.judgmentlabs.ai/org/{self.organization_id}/projects. Skipping Judgment export."
|
101
101
|
)
|
102
102
|
|
103
103
|
self.resource_attributes = resource_attributes
|
judgeval/tracer/utils.py
CHANGED
@@ -2,7 +2,7 @@ from typing import Any
|
|
2
2
|
from opentelemetry.trace import Span
|
3
3
|
from pydantic import BaseModel
|
4
4
|
from typing import Callable, Optional
|
5
|
-
from judgeval.scorers.
|
5
|
+
from judgeval.scorers.api_scorer import TraceAPIScorerConfig
|
6
6
|
from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
|
7
7
|
|
8
8
|
|
judgeval/trainer/trainer.py
CHANGED
@@ -10,7 +10,7 @@ from judgeval.tracer.exporters.store import SpanStore
|
|
10
10
|
from judgeval.tracer.exporters import InMemorySpanExporter
|
11
11
|
from judgeval.tracer.keys import AttributeKeys
|
12
12
|
from judgeval import JudgmentClient
|
13
|
-
from judgeval.scorers import BaseScorer,
|
13
|
+
from judgeval.scorers import BaseScorer, ExampleAPIScorerConfig
|
14
14
|
from judgeval.data import Example
|
15
15
|
from .console import _spinner_progress, _print_progress, _print_progress_update
|
16
16
|
from judgeval.exceptions import JudgmentRuntimeError
|
@@ -154,7 +154,7 @@ class JudgmentTrainer:
|
|
154
154
|
async def generate_rollouts_and_rewards(
|
155
155
|
self,
|
156
156
|
agent_function: Callable[[Any], Any],
|
157
|
-
scorers: List[Union[
|
157
|
+
scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
|
158
158
|
prompts: List[Any],
|
159
159
|
num_prompts_per_step: Optional[int] = None,
|
160
160
|
num_generations_per_prompt: Optional[int] = None,
|
@@ -264,7 +264,7 @@ class JudgmentTrainer:
|
|
264
264
|
async def run_reinforcement_learning(
|
265
265
|
self,
|
266
266
|
agent_function: Callable[[Any], Any],
|
267
|
-
scorers: List[Union[
|
267
|
+
scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
|
268
268
|
prompts: List[Any],
|
269
269
|
) -> ModelConfig:
|
270
270
|
"""
|
@@ -370,7 +370,7 @@ class JudgmentTrainer:
|
|
370
370
|
async def train(
|
371
371
|
self,
|
372
372
|
agent_function: Callable[[Any], Any],
|
373
|
-
scorers: List[Union[
|
373
|
+
scorers: List[Union[ExampleAPIScorerConfig, BaseScorer]],
|
374
374
|
prompts: List[Any],
|
375
375
|
rft_provider: Optional[str] = None,
|
376
376
|
) -> ModelConfig:
|
@@ -1,69 +1,64 @@
|
|
1
|
-
judgeval/__init__.py,sha256=
|
1
|
+
judgeval/__init__.py,sha256=MqB1s0zp-Fr_KvKFjGKnRHUeulutmrlMcUyjNRRAU_4,4962
|
2
2
|
judgeval/cli.py,sha256=R5IiIQmSVg21kQHX2kL3sOeXCxvvAMSqyva3Z9AoSXc,1560
|
3
|
-
judgeval/constants.py,sha256=
|
3
|
+
judgeval/constants.py,sha256=h7Cuf_2uvNzHZi8nqRFoMpvsQUZMS3mlNB3s2uduse8,3557
|
4
4
|
judgeval/env.py,sha256=R0bj7XU29RIVVQjkVMa11ObhOYVMbaE_3LTvL3I9dWM,2212
|
5
5
|
judgeval/exceptions.py,sha256=tTbfe4yoOtPXmn22UQz9-6a-5PT9uOko85xaRRwr0Sw,621
|
6
6
|
judgeval/logger.py,sha256=ZWbp0QfT1CJnQIjV-Zle4n489nFCKEmD2-ukx--iiow,1553
|
7
7
|
judgeval/version.py,sha256=kJtYsih3hTYZ_rY_Lt0RcFqvjAfF5Xo1uNq0jZWJ5pw,73
|
8
8
|
judgeval/warnings.py,sha256=LbGte14ppiFjrkp-JJYueZ40NWFvMkWRvPXr6r-fUWw,73
|
9
|
-
judgeval/api/__init__.py,sha256=
|
10
|
-
judgeval/api/api_types.py,sha256=
|
11
|
-
judgeval/data/__init__.py,sha256=
|
9
|
+
judgeval/api/__init__.py,sha256=RWQDwzT93nXWih3WYMPl1OL2ga9uk0dUGYV7fEDzBso,12764
|
10
|
+
judgeval/api/api_types.py,sha256=uyz8ePQI-ec88PVwhHN-KVmldAmNgRjOVmesVDKIBUw,6461
|
11
|
+
judgeval/data/__init__.py,sha256=1tU0EN0ThIfQ1fad5I3dKxAfTcZ5U8cvTLcQ6qLVLU0,407
|
12
12
|
judgeval/data/evaluation_run.py,sha256=G7ad4eDQTjketfcQRITk8bs8CIO8rm058H1G_qkLmhc,4729
|
13
|
-
judgeval/data/example.py,sha256=
|
14
|
-
judgeval/data/judgment_types.py,sha256=
|
15
|
-
judgeval/data/result.py,sha256=
|
13
|
+
judgeval/data/example.py,sha256=eGJpF-lyUH734Cg90B7WtU9f8iKoS3VFGeV6R-GVCCc,1039
|
14
|
+
judgeval/data/judgment_types.py,sha256=JkhNG6fRBFdryG8ogVZsMWtq3W3JmWh0AYIR8LdBAT4,11773
|
15
|
+
judgeval/data/result.py,sha256=LA0OzwcVKwD5NkmtmFuA_EusmYRyE10mjDMXa2bgU1g,2067
|
16
16
|
judgeval/data/scorer_data.py,sha256=g9PE0DNLikW0LgxGWhgpCiNVOX8PzqEaZKivifLOUDI,2997
|
17
17
|
judgeval/data/tool.py,sha256=bj_WxFg22mypUUVR5KqQRxMDHWvKwiE1MMPjLnTCoDU,99
|
18
|
-
judgeval/data/trace.py,sha256=
|
19
|
-
judgeval/data/trace_run.py,sha256=VCQUdDlrHixyiqWW1RUiCtLgqMt-3oW1M1A7CCer2Ok,1635
|
18
|
+
judgeval/data/trace.py,sha256=R9RF1kv1JHeOpjXLjErJcxV2RrNrJUSqWcWe73l3f9k,503
|
20
19
|
judgeval/data/scripts/fix_default_factory.py,sha256=lvp2JwYZqz-XpD9LZNa3mANZVP-jJSZoNzolI6JWERM,591
|
21
20
|
judgeval/data/scripts/openapi_transform.py,sha256=Sm04JClzyP1ga8KA3gkIdsae8Hlx-XU7-x0gHCQYOhg,3877
|
22
|
-
judgeval/dataset/__init__.py,sha256=
|
23
|
-
judgeval/evaluation/__init__.py,sha256=
|
21
|
+
judgeval/dataset/__init__.py,sha256=S1iLL7ivDLIT3aTNO1ardHqhIRxXMuoW5PFLFIkt4uY,5731
|
22
|
+
judgeval/evaluation/__init__.py,sha256=u-aDyLTRebPZigeBbJHpnZk3wQAS7jv_VgLXIi-jMGU,15075
|
24
23
|
judgeval/integrations/langgraph/__init__.py,sha256=VvqCKOk65A2gLlr8uWrJVzpRF5OnIja5zwF4hGPEFsw,27540
|
25
24
|
judgeval/judges/__init__.py,sha256=e7JnTc1TG_SwqydDHTXHIP0EBazQxt-ydMQG7ghSU5A,228
|
26
25
|
judgeval/judges/base_judge.py,sha256=_dz0qWsKRxzXxpRY9l6mrxTRYPSF2FE4ZXkrzhZ4gbY,986
|
27
26
|
judgeval/judges/litellm_judge.py,sha256=5vEF0IUo7HVWnOF2ww-DMke8Xkarnz32B_qbgKjc0-I,4182
|
28
27
|
judgeval/judges/together_judge.py,sha256=GzwlXZJzle8hT-vWKmq39JyIeanJqJfHDOkrksUbzk0,4398
|
29
28
|
judgeval/judges/utils.py,sha256=ITbYwvjU3o9-FIAReFvxh24yJrx9LV3l9BnSBgKUpxg,2068
|
30
|
-
judgeval/scorers/__init__.py,sha256=
|
29
|
+
judgeval/scorers/__init__.py,sha256=34PMPsfR2_3n7T96wpSfAZJWzWlU6v53S3mGX2PE87k,665
|
31
30
|
judgeval/scorers/agent_scorer.py,sha256=V1NSwhGWgtXPsX-blKLkDLsPPbEiP-A4614X-95dtlQ,565
|
32
|
-
judgeval/scorers/api_scorer.py,sha256=
|
33
|
-
judgeval/scorers/base_scorer.py,sha256=
|
31
|
+
judgeval/scorers/api_scorer.py,sha256=8TUJut9r74v-qMACiSKAUbDI1v3ZItPXrTz8s4_Lrgk,2287
|
32
|
+
judgeval/scorers/base_scorer.py,sha256=naGiZYHnkn9HVwY-jpOY7O6cYPJJJe5dHbrRBSOikxw,2723
|
34
33
|
judgeval/scorers/example_scorer.py,sha256=o_BGUztJXjnKnuOqIa9T4PXe0wPoWg63FyH518N1LxA,561
|
35
34
|
judgeval/scorers/exceptions.py,sha256=ACDHK5-TWiF3NTk-wycaedpbrdobm-CvvC1JA_iP-Mk,179
|
36
35
|
judgeval/scorers/score.py,sha256=xquM59SCtNeuAsrBsHFgBQk3CHp4-bms4oFs24xfcU0,7176
|
37
|
-
judgeval/scorers/trace_api_scorer.py,sha256=B2Vp8Jj2I7N-G1weHMm1b_9gVbn0BMcOtestMFNtx08,112
|
38
36
|
judgeval/scorers/utils.py,sha256=iSZONwK0HecxUPz-cMCyra_87DSCag1E8BdpF2a4_44,377
|
39
37
|
judgeval/scorers/judgeval_scorers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
|
-
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=
|
41
|
-
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=
|
42
|
-
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=
|
43
|
-
judgeval/scorers/judgeval_scorers/api_scorers/
|
44
|
-
judgeval/scorers/judgeval_scorers/api_scorers/
|
45
|
-
judgeval/scorers/judgeval_scorers/api_scorers/
|
46
|
-
judgeval/
|
47
|
-
judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py,sha256=BhrLnIASZOTT9XJ6giYSoVfdR7NYsjRRTOTNioNtEiU,610
|
48
|
-
judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py,sha256=bMu0WMJaXdMyDTN42sVLoWV-lrUHCEa8iDrCI_K7nlQ,808
|
49
|
-
judgeval/tracer/__init__.py,sha256=0DM6ixBI75FuVG7UMG_k-KHJm1MyFbRyhAUPm2GYu9A,36057
|
38
|
+
judgeval/scorers/judgeval_scorers/api_scorers/__init__.py,sha256=wrq7y9I30GZbwDXIrSh81KRO_-j7i-1DjwX5Hc3PScI,728
|
39
|
+
judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py,sha256=_qa1sOHUwJubBCfyx6lsE_4vZsUh65VoTZba1NSouis,558
|
40
|
+
judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py,sha256=ciiFBQQC4UDsk9qou9OiKbAR31s82eRUY1ZTt1gdM-0,407
|
41
|
+
judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py,sha256=lIJ3GgOI9tfbrC7voZMvlxXdK3X1bhdj2zNxqdaGIkM,545
|
42
|
+
judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py,sha256=bSwbpVNhpkpEeX3GtCJuyz5vFyY1gbyqYEfaBF2KTVY,697
|
43
|
+
judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py,sha256=E2_TVO88iLSBAdcKYnfHYp4cUyffgG_p1th5aCpjCd8,9680
|
44
|
+
judgeval/tracer/__init__.py,sha256=mQQaca8XJRYwSRn7a5x63dFQeA8xGjwfoZYikQCAAyI,35214
|
50
45
|
judgeval/tracer/constants.py,sha256=ae8tivAW97awJQxdRB9OMqX50wOLX3zqChT_AGkPBu0,85
|
51
46
|
judgeval/tracer/keys.py,sha256=qXPoZSkEhVF-YYfQ9-zeDMVdr4GtpPf2W7MPJaN2AQo,2889
|
52
|
-
judgeval/tracer/local_eval_queue.py,sha256=
|
47
|
+
judgeval/tracer/local_eval_queue.py,sha256=iv9on1G4woGlhYn1mZATEMkzCiz-qVn2cdzEINzQFYQ,7242
|
53
48
|
judgeval/tracer/managers.py,sha256=h2ZHJ61_vf3cS-HlEUiodFzKDUuQWIhYC6n7pMVyM9c,6113
|
54
|
-
judgeval/tracer/utils.py,sha256=
|
49
|
+
judgeval/tracer/utils.py,sha256=3_8ZjjF4XgNyAu9LpThq5dVOcwdwI-E3vb-HRl_Px8c,594
|
55
50
|
judgeval/tracer/exporters/__init__.py,sha256=lnZXfPGaQH844HAIuZCQqjqhnmZGA98kHY8Xp-Oi4Ws,1220
|
56
51
|
judgeval/tracer/exporters/s3.py,sha256=N9gmw17cnR0VkfAQQkLsNj5BksgNRETThR5qYhWRjP4,4360
|
57
52
|
judgeval/tracer/exporters/store.py,sha256=KQV3cyqteesByQjR-9VdPXT9OlUZ-6F08ogqj837_c0,1012
|
58
53
|
judgeval/tracer/exporters/utils.py,sha256=JRcoSQuEHxMDJbXfyrUIfA2SHBVkZM82h4bTbYGxkNw,1154
|
59
54
|
judgeval/tracer/llm/__init__.py,sha256=p9uwWPg9k-NcWjj9TbwQj55sHhBOqRYx2-Ld6YHaFUs,42625
|
60
55
|
judgeval/tracer/llm/providers.py,sha256=QQLJlSNnDjXRAc2Wqw78o254COJUSXX39D7D_mx3NVA,2651
|
61
|
-
judgeval/tracer/processors/__init__.py,sha256=
|
56
|
+
judgeval/tracer/processors/__init__.py,sha256=tXbQaXGMQeutgM_7d5Y2EFTeSjbVEBky685Dst_v3rg,8672
|
62
57
|
judgeval/trainer/__init__.py,sha256=h_DDVV7HFF7HUPAJFpt2d9wjqgnmEVcHxqZyB1k7pPQ,257
|
63
58
|
judgeval/trainer/config.py,sha256=8s0X8B334PJomorwONaUpb6K8cAMxRdYAeQdtx7HPHs,4258
|
64
59
|
judgeval/trainer/console.py,sha256=PJ0rCnDwC7aoW-VsLDS96ZyMyagh-l9EOJKff1ATIpo,4342
|
65
60
|
judgeval/trainer/trainable_model.py,sha256=vSDtHJJ-fLczC2gkaY9jG6TQvLgWqaVjElm1l8YlJcU,8959
|
66
|
-
judgeval/trainer/trainer.py,sha256=
|
61
|
+
judgeval/trainer/trainer.py,sha256=YhepEm3M-5z1RB50cAEsLbZiOIE_fOWiX-thyvBj6v4,16578
|
67
62
|
judgeval/utils/async_utils.py,sha256=lgCgi8gkLUcAEepruEkx-AGQgJnAJpKmBIhZx6Y0q2s,935
|
68
63
|
judgeval/utils/decorators.py,sha256=rdqY1w0zNL6O6GU6Wdeo0-x5EgpFTEhU2vkgiWsRYdc,525
|
69
64
|
judgeval/utils/file_utils.py,sha256=3LI1YCZwO5ogTgJreyOgRgDksey3natO2Td1PQqaPyY,3252
|
@@ -73,8 +68,8 @@ judgeval/utils/serialize.py,sha256=QXR-8Nj5rqOrI9zLx0oRLdk6DW6Bc7j8eyF4zQ7PLxA,6
|
|
73
68
|
judgeval/utils/testing.py,sha256=kJOq4LlEXaNThfg9oSIRqSK7IH8AwLgbukjn5uxMY7A,3661
|
74
69
|
judgeval/utils/url.py,sha256=Shf0v3XcbaWpL0m1eGJEEO_z4TsQCnDB2Rl25OTUmiI,195
|
75
70
|
judgeval/utils/version_check.py,sha256=kcF6SvB6GbVKI0Gv9QRVm-kvBn9_z-c3jmPORsXO3h0,1015
|
76
|
-
judgeval-0.
|
77
|
-
judgeval-0.
|
78
|
-
judgeval-0.
|
79
|
-
judgeval-0.
|
80
|
-
judgeval-0.
|
71
|
+
judgeval-0.10.0.dist-info/METADATA,sha256=vpsStrROABbjYIuuO8UqssmVjq70k4rLH2AvEz4jie8,8870
|
72
|
+
judgeval-0.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
73
|
+
judgeval-0.10.0.dist-info/entry_points.txt,sha256=-eoeD-oDLn4A7MSgeBS9Akwanf3_0r0cgEleBcIOjg0,46
|
74
|
+
judgeval-0.10.0.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
|
75
|
+
judgeval-0.10.0.dist-info/RECORD,,
|
judgeval/data/trace_run.py
DELETED
@@ -1,39 +0,0 @@
|
|
1
|
-
from pydantic import BaseModel
|
2
|
-
from typing import List, Optional, Dict, Any, Union
|
3
|
-
from judgeval.data import Trace
|
4
|
-
from judgeval.scorers import APIScorerConfig, BaseScorer
|
5
|
-
from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
|
6
|
-
|
7
|
-
|
8
|
-
class TraceRun(BaseModel):
|
9
|
-
"""
|
10
|
-
Stores example and evaluation scorers together for running an eval task
|
11
|
-
|
12
|
-
Args:
|
13
|
-
project_name (str): The name of the project the evaluation results belong to
|
14
|
-
eval_name (str): A name for this evaluation run
|
15
|
-
traces (List[Trace]): The traces to evaluate
|
16
|
-
scorers (List[Union[JudgmentScorer, BaseScorer]]): A list of scorers to use for evaluation
|
17
|
-
model (str): The model used as a judge when using LLM as a Judge
|
18
|
-
metadata (Optional[Dict[str, Any]]): Additional metadata to include for this evaluation run, e.g. comments, dataset name, purpose, etc.
|
19
|
-
rules (Optional[List[Rule]]): Rules to evaluate against scoring results
|
20
|
-
append (Optional[bool]): Whether to append to existing evaluation results
|
21
|
-
tools (Optional[List[Dict[str, Any]]]): List of tools to use for evaluation
|
22
|
-
"""
|
23
|
-
|
24
|
-
organization_id: Optional[str] = None
|
25
|
-
project_name: Optional[str] = None
|
26
|
-
eval_name: Optional[str] = None
|
27
|
-
traces: Optional[List[Trace]] = None
|
28
|
-
scorers: List[Union[APIScorerConfig, BaseScorer]]
|
29
|
-
model: Optional[str] = JUDGMENT_DEFAULT_GPT_MODEL
|
30
|
-
trace_span_id: Optional[str] = None
|
31
|
-
append: Optional[bool] = False
|
32
|
-
override: Optional[bool] = False
|
33
|
-
|
34
|
-
# TODO: ?
|
35
|
-
rules: Any = None
|
36
|
-
tools: Optional[List[Dict[str, Any]]] = None
|
37
|
-
|
38
|
-
class Config:
|
39
|
-
arbitrary_types_allowed = True
|
@@ -1,14 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
`judgeval` answer relevancy scorer
|
3
|
-
|
4
|
-
TODO add link to docs page for this scorer
|
5
|
-
|
6
|
-
"""
|
7
|
-
|
8
|
-
# Internal imports
|
9
|
-
from judgeval.scorers.api_scorer import APIScorerConfig
|
10
|
-
from judgeval.constants import APIScorerType
|
11
|
-
|
12
|
-
|
13
|
-
class DerailmentScorer(APIScorerConfig):
|
14
|
-
score_type: APIScorerType = APIScorerType.DERAILMENT
|
@@ -1,20 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
`judgeval` tool dependency scorer
|
3
|
-
"""
|
4
|
-
|
5
|
-
# Internal imports
|
6
|
-
from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
|
7
|
-
from judgeval.constants import APIScorerType
|
8
|
-
from typing import Optional, Dict
|
9
|
-
|
10
|
-
|
11
|
-
class ToolDependencyScorer(TraceAPIScorerConfig):
|
12
|
-
kwargs: Optional[Dict] = None
|
13
|
-
|
14
|
-
def __init__(self, threshold: float = 1.0, enable_param_checking: bool = True):
|
15
|
-
super().__init__(threshold=threshold, score_type=APIScorerType.TOOL_DEPENDENCY)
|
16
|
-
self.kwargs = {"enable_param_checking": enable_param_checking}
|
17
|
-
|
18
|
-
@property
|
19
|
-
def __name__(self):
|
20
|
-
return "Tool Dependency"
|
@@ -1,27 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
`judgeval` tool order scorer
|
3
|
-
"""
|
4
|
-
|
5
|
-
# Internal imports
|
6
|
-
from judgeval.scorers.trace_api_scorer import TraceAPIScorerConfig
|
7
|
-
from judgeval.constants import APIScorerType
|
8
|
-
from typing import Dict, Any
|
9
|
-
|
10
|
-
|
11
|
-
class ToolOrderScorer(TraceAPIScorerConfig):
|
12
|
-
score_type: APIScorerType = APIScorerType.TOOL_ORDER
|
13
|
-
threshold: float = 1.0
|
14
|
-
exact_match: bool = False
|
15
|
-
|
16
|
-
def model_dump(self, *args, **kwargs) -> Dict[str, Any]:
|
17
|
-
base = super().model_dump(*args, **kwargs)
|
18
|
-
base_fields = set(TraceAPIScorerConfig.model_fields.keys())
|
19
|
-
all_fields = set(self.__class__.model_fields.keys())
|
20
|
-
|
21
|
-
extra_fields = all_fields - base_fields - {"kwargs"}
|
22
|
-
|
23
|
-
base["kwargs"] = {
|
24
|
-
k: getattr(self, k) for k in extra_fields if getattr(self, k) is not None
|
25
|
-
}
|
26
|
-
|
27
|
-
return base
|