judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
judgeval/scorers/agent_scorer.py
CHANGED
|
@@ -1,21 +1,17 @@
|
|
|
1
|
-
from judgeval.scorers.base_scorer import BaseScorer
|
|
2
|
-
from judgeval.data import Trace
|
|
3
|
-
from typing import List, Optional
|
|
4
|
-
from abc import abstractmethod
|
|
1
|
+
# from judgeval.scorers.base_scorer import BaseScorer
|
|
2
|
+
# from judgeval.data.judgment_types import Trace as JudgmentTrace
|
|
3
|
+
# from typing import List, Optional
|
|
4
|
+
# from abc import abstractmethod
|
|
5
5
|
|
|
6
|
-
from judgeval.common.logger import warning, error
|
|
7
6
|
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
raise NotImplementedError(
|
|
20
|
-
"You must implement the `a_score_trace` method in your custom scorer"
|
|
21
|
-
)
|
|
7
|
+
# class TraceScorer(BaseScorer):
|
|
8
|
+
# @abstractmethod
|
|
9
|
+
# async def a_score_trace(
|
|
10
|
+
# self, trace: JudgmentTrace, tools: Optional[List] = None, *args, **kwargs
|
|
11
|
+
# ) -> float:
|
|
12
|
+
# """
|
|
13
|
+
# Asynchronously measures the score on a trace
|
|
14
|
+
# """
|
|
15
|
+
# raise NotImplementedError(
|
|
16
|
+
# "You must implement the `a_score_trace` method in your custom scorer"
|
|
17
|
+
# )
|
judgeval/scorers/api_scorer.py
CHANGED
|
@@ -4,11 +4,13 @@ Judgment Scorer class.
|
|
|
4
4
|
Scores `Example`s using ready-made Judgment evaluators.
|
|
5
5
|
"""
|
|
6
6
|
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
7
9
|
from pydantic import BaseModel, field_validator
|
|
8
10
|
from typing import List
|
|
9
|
-
from judgeval.
|
|
10
|
-
from judgeval.
|
|
11
|
-
from judgeval.
|
|
11
|
+
from judgeval.constants import APIScorerType
|
|
12
|
+
from judgeval.data.example import ExampleParams
|
|
13
|
+
from judgeval.env import JUDGMENT_DEFAULT_GPT_MODEL
|
|
12
14
|
|
|
13
15
|
|
|
14
16
|
class APIScorerConfig(BaseModel):
|
|
@@ -28,9 +30,10 @@ class APIScorerConfig(BaseModel):
|
|
|
28
30
|
name: str = ""
|
|
29
31
|
threshold: float = 0.5
|
|
30
32
|
strict_mode: bool = False
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
] = []
|
|
33
|
+
model: str = JUDGMENT_DEFAULT_GPT_MODEL
|
|
34
|
+
|
|
35
|
+
required_params: List[ExampleParams] = []
|
|
36
|
+
|
|
34
37
|
kwargs: dict = {}
|
|
35
38
|
|
|
36
39
|
@field_validator("threshold")
|
|
@@ -40,31 +43,26 @@ class APIScorerConfig(BaseModel):
|
|
|
40
43
|
Validates that the threshold is between 0 and 1 inclusive.
|
|
41
44
|
"""
|
|
42
45
|
score_type = info.data.get("score_type")
|
|
43
|
-
if
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
)
|
|
48
|
-
raise ValueError(
|
|
49
|
-
f"Threshold for {score_type} must be greater than 0, got: {v}"
|
|
50
|
-
)
|
|
51
|
-
else:
|
|
52
|
-
if not 0 <= v <= 1:
|
|
53
|
-
judgeval_logger.error(
|
|
54
|
-
f"Threshold for {score_type} must be between 0 and 1, got: {v}"
|
|
55
|
-
)
|
|
56
|
-
raise ValueError(
|
|
57
|
-
f"Threshold for {score_type} must be between 0 and 1, got: {v}"
|
|
58
|
-
)
|
|
46
|
+
if not 0 <= v <= 1:
|
|
47
|
+
raise ValueError(
|
|
48
|
+
f"Threshold for {score_type} must be between 0 and 1, got: {v}"
|
|
49
|
+
)
|
|
59
50
|
return v
|
|
60
51
|
|
|
61
52
|
@field_validator("name", mode="after")
|
|
62
53
|
@classmethod
|
|
63
54
|
def set_name_to_score_type_if_none(cls, v, info):
|
|
64
|
-
"""Set name to score_type if not provided"""
|
|
65
55
|
if v is None:
|
|
66
56
|
return info.data.get("score_type")
|
|
67
57
|
return v
|
|
68
58
|
|
|
69
59
|
def __str__(self):
|
|
70
60
|
return f"JudgmentScorer(score_type={self.score_type.value}, threshold={self.threshold})"
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class ExampleAPIScorerConfig(APIScorerConfig):
|
|
64
|
+
pass
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class TraceAPIScorerConfig(APIScorerConfig):
|
|
68
|
+
pass
|
judgeval/scorers/base_scorer.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
Base class for all scorers.
|
|
3
3
|
"""
|
|
4
4
|
|
|
5
|
+
from __future__ import annotations
|
|
5
6
|
from typing import Dict, Optional
|
|
6
7
|
|
|
7
8
|
from pydantic import BaseModel
|
|
@@ -19,45 +20,63 @@ class BaseScorer(BaseModel):
|
|
|
19
20
|
where none of Judgment's scorers are suitable.
|
|
20
21
|
"""
|
|
21
22
|
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
)
|
|
29
|
-
|
|
23
|
+
# type of your scorer (Faithfulness, PromptScorer)
|
|
24
|
+
score_type: str
|
|
25
|
+
|
|
26
|
+
# The threshold to pass a test while using this scorer as a scorer
|
|
27
|
+
threshold: float = 0.5
|
|
28
|
+
|
|
29
|
+
# name of your scorer (Faithfulness, PromptScorer-randomslug)
|
|
30
|
+
name: str = ""
|
|
31
|
+
|
|
32
|
+
# The name of the class of the scorer
|
|
33
|
+
class_name: Optional[str] = None
|
|
34
|
+
|
|
35
|
+
# The float score of the scorer run on the test case
|
|
36
|
+
score: Optional[float] = None
|
|
37
|
+
|
|
30
38
|
score_breakdown: Optional[Dict] = None
|
|
31
39
|
reason: Optional[str] = ""
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
40
|
+
|
|
41
|
+
# Whether the model is a native model
|
|
42
|
+
using_native_model: Optional[bool] = None
|
|
43
|
+
|
|
44
|
+
# Whether the test case passed or failed
|
|
45
|
+
success: bool = False
|
|
46
|
+
|
|
47
|
+
# The name of the model used to evaluate the test case
|
|
48
|
+
model: Optional[str] = None
|
|
49
|
+
|
|
50
|
+
# The model used to evaluate the test case
|
|
51
|
+
model_client: Optional[Any] = Field(default=None, exclude=True)
|
|
52
|
+
|
|
53
|
+
# Whether to run the scorer in strict mode
|
|
54
|
+
strict_mode: bool = False
|
|
55
|
+
|
|
56
|
+
# The error message if the scorer failed
|
|
57
|
+
error: Optional[str] = None
|
|
58
|
+
|
|
59
|
+
# Additional metadata for the scorer
|
|
60
|
+
additional_metadata: Optional[Dict] = None
|
|
61
|
+
|
|
62
|
+
# The user ID of the scorer
|
|
63
|
+
user: Optional[str] = None
|
|
64
|
+
|
|
65
|
+
# Whether the scorer is hosted on the server
|
|
66
|
+
server_hosted: bool = False
|
|
67
|
+
|
|
68
|
+
@model_validator(mode="after")
|
|
69
|
+
def enforce_strict_threshold(self):
|
|
70
|
+
if self.strict_mode:
|
|
71
|
+
self.threshold = 1.0
|
|
72
|
+
return self
|
|
49
73
|
|
|
50
74
|
@model_validator(mode="after")
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
if not
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
if class_name and getattr(m.__class__, "__name__", None):
|
|
57
|
-
m.name = m.__class__.__name__
|
|
58
|
-
else:
|
|
59
|
-
m.name = m.score_type
|
|
60
|
-
return m
|
|
75
|
+
def default_name(self):
|
|
76
|
+
self.class_name = self.__class__.__name__
|
|
77
|
+
if not self.name:
|
|
78
|
+
self.name = self.class_name
|
|
79
|
+
return self
|
|
61
80
|
|
|
62
81
|
def _add_model(self, model: str):
|
|
63
82
|
"""
|
|
@@ -66,7 +85,6 @@ class BaseScorer(BaseModel):
|
|
|
66
85
|
This method is used at eval time
|
|
67
86
|
"""
|
|
68
87
|
self.model_client, self.using_native_model = create_judge(model)
|
|
69
|
-
self.model = self.model_client.get_model_name() or model
|
|
70
88
|
|
|
71
89
|
def success_check(self) -> bool:
|
|
72
90
|
"""
|
|
@@ -2,18 +2,16 @@ from judgeval.scorers.base_scorer import BaseScorer
|
|
|
2
2
|
from judgeval.data import Example
|
|
3
3
|
from typing import List
|
|
4
4
|
from pydantic import Field
|
|
5
|
-
from judgeval.common.logger import judgeval_logger
|
|
6
5
|
|
|
7
6
|
|
|
8
7
|
class ExampleScorer(BaseScorer):
|
|
9
|
-
score_type: str = "Custom"
|
|
8
|
+
score_type: str = "Custom"
|
|
10
9
|
required_params: List[str] = Field(default_factory=list)
|
|
11
10
|
|
|
12
11
|
async def a_score_example(self, example: Example, *args, **kwargs) -> float:
|
|
13
12
|
"""
|
|
14
13
|
Asynchronously measures the score on a single example
|
|
15
14
|
"""
|
|
16
|
-
judgeval_logger.error("a_score_example method not implemented")
|
|
17
15
|
raise NotImplementedError(
|
|
18
16
|
"You must implement the `a_score_example` method in your custom scorer"
|
|
19
17
|
)
|
|
@@ -1,9 +1,3 @@
|
|
|
1
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.execution_order import (
|
|
2
|
-
ExecutionOrderScorer,
|
|
3
|
-
)
|
|
4
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.hallucination import (
|
|
5
|
-
HallucinationScorer,
|
|
6
|
-
)
|
|
7
1
|
from judgeval.scorers.judgeval_scorers.api_scorers.faithfulness import (
|
|
8
2
|
FaithfulnessScorer,
|
|
9
3
|
)
|
|
@@ -16,32 +10,16 @@ from judgeval.scorers.judgeval_scorers.api_scorers.answer_correctness import (
|
|
|
16
10
|
from judgeval.scorers.judgeval_scorers.api_scorers.instruction_adherence import (
|
|
17
11
|
InstructionAdherenceScorer,
|
|
18
12
|
)
|
|
19
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.derailment_scorer import (
|
|
20
|
-
DerailmentScorer,
|
|
21
|
-
)
|
|
22
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.tool_order import ToolOrderScorer
|
|
23
13
|
from judgeval.scorers.judgeval_scorers.api_scorers.prompt_scorer import (
|
|
14
|
+
TracePromptScorer,
|
|
24
15
|
PromptScorer,
|
|
25
16
|
)
|
|
26
|
-
from judgeval.scorers.judgeval_scorers.api_scorers.tool_dependency import (
|
|
27
|
-
ToolDependencyScorer,
|
|
28
|
-
)
|
|
29
17
|
|
|
30
18
|
__all__ = [
|
|
31
|
-
"ExecutionOrderScorer",
|
|
32
|
-
"JSONCorrectnessScorer",
|
|
33
|
-
"SummarizationScorer",
|
|
34
|
-
"HallucinationScorer",
|
|
35
19
|
"FaithfulnessScorer",
|
|
36
|
-
"ContextualRelevancyScorer",
|
|
37
|
-
"ContextualPrecisionScorer",
|
|
38
|
-
"ContextualRecallScorer",
|
|
39
20
|
"AnswerRelevancyScorer",
|
|
40
21
|
"AnswerCorrectnessScorer",
|
|
41
22
|
"InstructionAdherenceScorer",
|
|
42
|
-
"
|
|
43
|
-
"DerailmentScorer",
|
|
44
|
-
"ToolOrderScorer",
|
|
23
|
+
"TracePromptScorer",
|
|
45
24
|
"PromptScorer",
|
|
46
|
-
"ToolDependencyScorer",
|
|
47
25
|
]
|
|
@@ -1,18 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
`judgeval` answer relevancy scorer
|
|
3
|
-
|
|
4
|
-
TODO add link to docs page for this scorer
|
|
5
|
-
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
# Internal imports
|
|
9
|
-
from judgeval.scorers.api_scorer import APIScorerConfig
|
|
1
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
|
10
2
|
from judgeval.constants import APIScorerType
|
|
11
3
|
from judgeval.data import ExampleParams
|
|
12
4
|
from typing import List
|
|
13
5
|
|
|
14
6
|
|
|
15
|
-
class AnswerCorrectnessScorer(
|
|
7
|
+
class AnswerCorrectnessScorer(ExampleAPIScorerConfig):
|
|
16
8
|
score_type: APIScorerType = APIScorerType.ANSWER_CORRECTNESS
|
|
17
9
|
required_params: List[ExampleParams] = [
|
|
18
10
|
ExampleParams.INPUT,
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
from judgeval.scorers.api_scorer import
|
|
1
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
|
2
2
|
from judgeval.constants import APIScorerType
|
|
3
3
|
from judgeval.data import ExampleParams
|
|
4
4
|
from typing import List
|
|
5
5
|
|
|
6
6
|
|
|
7
|
-
class AnswerRelevancyScorer(
|
|
7
|
+
class AnswerRelevancyScorer(ExampleAPIScorerConfig):
|
|
8
8
|
score_type: APIScorerType = APIScorerType.ANSWER_RELEVANCY
|
|
9
9
|
required_params: List[ExampleParams] = [
|
|
10
10
|
ExampleParams.INPUT,
|
|
@@ -1,18 +1,10 @@
|
|
|
1
|
-
|
|
2
|
-
`judgeval` faithfulness scorer
|
|
3
|
-
|
|
4
|
-
TODO add link to docs page for this scorer
|
|
5
|
-
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
# Internal imports
|
|
9
|
-
from judgeval.scorers.api_scorer import APIScorerConfig
|
|
1
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
|
10
2
|
from judgeval.constants import APIScorerType
|
|
11
3
|
from judgeval.data import ExampleParams
|
|
12
4
|
from typing import List
|
|
13
5
|
|
|
14
6
|
|
|
15
|
-
class FaithfulnessScorer(
|
|
7
|
+
class FaithfulnessScorer(ExampleAPIScorerConfig):
|
|
16
8
|
score_type: APIScorerType = APIScorerType.FAITHFULNESS
|
|
17
9
|
required_params: List[ExampleParams] = [
|
|
18
10
|
ExampleParams.INPUT,
|
|
@@ -1,17 +1,9 @@
|
|
|
1
|
-
|
|
2
|
-
`judgeval` instruction adherence scorer
|
|
3
|
-
|
|
4
|
-
TODO add link to docs page for this scorer
|
|
5
|
-
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
# Internal imports
|
|
9
|
-
from judgeval.scorers.api_scorer import APIScorerConfig
|
|
1
|
+
from judgeval.scorers.api_scorer import ExampleAPIScorerConfig
|
|
10
2
|
from judgeval.constants import APIScorerType
|
|
11
3
|
from judgeval.data import ExampleParams
|
|
12
4
|
|
|
13
5
|
|
|
14
|
-
class InstructionAdherenceScorer(
|
|
6
|
+
class InstructionAdherenceScorer(ExampleAPIScorerConfig):
|
|
15
7
|
def __init__(self, threshold: float):
|
|
16
8
|
super().__init__(
|
|
17
9
|
threshold=threshold,
|
|
@@ -21,7 +13,3 @@ class InstructionAdherenceScorer(APIScorerConfig):
|
|
|
21
13
|
ExampleParams.ACTUAL_OUTPUT,
|
|
22
14
|
],
|
|
23
15
|
)
|
|
24
|
-
|
|
25
|
-
@property
|
|
26
|
-
def __name__(self):
|
|
27
|
-
return "Instruction Adherence"
|