judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +173 -10
- judgeval/api/__init__.py +523 -0
- judgeval/api/api_types.py +413 -0
- judgeval/cli.py +112 -0
- judgeval/constants.py +7 -30
- judgeval/data/__init__.py +1 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +14 -40
- judgeval/data/judgment_types.py +396 -146
- judgeval/data/result.py +11 -18
- judgeval/data/scorer_data.py +3 -26
- judgeval/data/scripts/openapi_transform.py +5 -5
- judgeval/data/trace.py +115 -194
- judgeval/dataset/__init__.py +335 -0
- judgeval/env.py +55 -0
- judgeval/evaluation/__init__.py +346 -0
- judgeval/exceptions.py +28 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +51 -0
- judgeval/judges/__init__.py +2 -2
- judgeval/judges/litellm_judge.py +77 -16
- judgeval/judges/together_judge.py +88 -17
- judgeval/judges/utils.py +7 -20
- judgeval/judgment_attribute_keys.py +55 -0
- judgeval/{common/logger.py → logger.py} +24 -8
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +11 -11
- judgeval/scorers/agent_scorer.py +15 -19
- judgeval/scorers/api_scorer.py +21 -23
- judgeval/scorers/base_scorer.py +54 -36
- judgeval/scorers/example_scorer.py +1 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
- judgeval/scorers/score.py +64 -47
- judgeval/scorers/utils.py +2 -107
- judgeval/tracer/__init__.py +1111 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +123 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +392 -0
- judgeval/trainer/trainable_model.py +252 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +74 -28
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/{version_check.py → utils/version_check.py} +5 -3
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/v1/__init__.py +88 -0
- judgeval/v1/data/__init__.py +7 -0
- judgeval/v1/data/example.py +44 -0
- judgeval/v1/data/scorer_data.py +42 -0
- judgeval/v1/data/scoring_result.py +44 -0
- judgeval/v1/datasets/__init__.py +6 -0
- judgeval/v1/datasets/dataset.py +214 -0
- judgeval/v1/datasets/dataset_factory.py +94 -0
- judgeval/v1/evaluation/__init__.py +6 -0
- judgeval/v1/evaluation/evaluation.py +182 -0
- judgeval/v1/evaluation/evaluation_factory.py +17 -0
- judgeval/v1/instrumentation/__init__.py +6 -0
- judgeval/v1/instrumentation/llm/__init__.py +7 -0
- judgeval/v1/instrumentation/llm/config.py +78 -0
- judgeval/v1/instrumentation/llm/constants.py +11 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
- judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
- judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
- judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
- judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
- judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
- judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
- judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
- judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
- judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
- judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
- judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
- judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
- judgeval/v1/instrumentation/llm/providers.py +19 -0
- judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
- judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
- judgeval/v1/integrations/langgraph/__init__.py +13 -0
- judgeval/v1/integrations/openlit/__init__.py +47 -0
- judgeval/v1/internal/api/__init__.py +525 -0
- judgeval/v1/internal/api/api_types.py +413 -0
- judgeval/v1/prompts/__init__.py +6 -0
- judgeval/v1/prompts/prompt.py +29 -0
- judgeval/v1/prompts/prompt_factory.py +189 -0
- judgeval/v1/py.typed +0 -0
- judgeval/v1/scorers/__init__.py +6 -0
- judgeval/v1/scorers/api_scorer.py +82 -0
- judgeval/v1/scorers/base_scorer.py +17 -0
- judgeval/v1/scorers/built_in/__init__.py +17 -0
- judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
- judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
- judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
- judgeval/v1/scorers/built_in/faithfulness.py +28 -0
- judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
- judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
- judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
- judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
- judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
- judgeval/v1/scorers/scorers_factory.py +49 -0
- judgeval/v1/tracer/__init__.py +7 -0
- judgeval/v1/tracer/base_tracer.py +520 -0
- judgeval/v1/tracer/exporters/__init__.py +14 -0
- judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
- judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
- judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
- judgeval/v1/tracer/exporters/span_store.py +50 -0
- judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
- judgeval/v1/tracer/processors/__init__.py +6 -0
- judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
- judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
- judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
- judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
- judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
- judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
- judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
- judgeval/v1/tracer/tracer.py +67 -0
- judgeval/v1/tracer/tracer_factory.py +38 -0
- judgeval/v1/trainers/__init__.py +5 -0
- judgeval/v1/trainers/base_trainer.py +62 -0
- judgeval/v1/trainers/config.py +123 -0
- judgeval/v1/trainers/console.py +144 -0
- judgeval/v1/trainers/fireworks_trainer.py +392 -0
- judgeval/v1/trainers/trainable_model.py +252 -0
- judgeval/v1/trainers/trainers_factory.py +37 -0
- judgeval/v1/utils.py +18 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.23.0.dist-info/METADATA +266 -0
- judgeval-0.23.0.dist-info/RECORD +201 -0
- judgeval-0.23.0.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -34
- judgeval/common/__init__.py +0 -13
- judgeval/common/api/__init__.py +0 -3
- judgeval/common/api/api.py +0 -352
- judgeval/common/api/constants.py +0 -165
- judgeval/common/exceptions.py +0 -27
- judgeval/common/storage/__init__.py +0 -6
- judgeval/common/storage/s3_storage.py +0 -98
- judgeval/common/tracer/__init__.py +0 -31
- judgeval/common/tracer/constants.py +0 -22
- judgeval/common/tracer/core.py +0 -1916
- judgeval/common/tracer/otel_exporter.py +0 -108
- judgeval/common/tracer/otel_span_processor.py +0 -234
- judgeval/common/tracer/span_processor.py +0 -37
- judgeval/common/tracer/span_transformer.py +0 -211
- judgeval/common/tracer/trace_manager.py +0 -92
- judgeval/common/utils.py +0 -940
- judgeval/data/datasets/__init__.py +0 -4
- judgeval/data/datasets/dataset.py +0 -341
- judgeval/data/datasets/eval_dataset_client.py +0 -214
- judgeval/data/tool.py +0 -5
- judgeval/data/trace_run.py +0 -37
- judgeval/evaluation_run.py +0 -75
- judgeval/integrations/langgraph.py +0 -843
- judgeval/judges/mixture_of_judges.py +0 -286
- judgeval/judgment_client.py +0 -369
- judgeval/rules.py +0 -521
- judgeval/run_evaluation.py +0 -684
- judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
- judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
- judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
- judgeval/utils/alerts.py +0 -93
- judgeval/utils/requests.py +0 -50
- judgeval-0.1.0.dist-info/METADATA +0 -202
- judgeval-0.1.0.dist-info/RECORD +0 -73
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
- {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.v1.internal.api.api_types import ScorerConfig
|
|
6
|
+
from judgeval.v1.scorers.base_scorer import BaseScorer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class APIScorer(BaseScorer):
    """Scorer whose evaluation is executed remotely by the Judgment API.

    This class only carries configuration (score type, threshold, model,
    required example fields, and free-form extra properties); no scoring
    happens locally. `get_scorer_config` serializes the configuration into
    the `ScorerConfig` payload sent to the server.
    """

    __slots__ = (
        "_score_type",
        "_required_params",
        "_threshold",
        "_name",
        "_strict_mode",
        "_model",
        "_additional_properties",
    )

    def __init__(
        self,
        score_type: str,
        required_params: Optional[List[str]] = None,
        threshold: float = 0.5,
        name: Optional[str] = None,
        strict_mode: bool = False,
        model: Optional[str] = None,
        **additional_properties: Any,
    ):
        """Initialize the scorer configuration.

        Args:
            score_type: Server-side identifier of the scoring algorithm.
            required_params: Example fields the scorer needs; defaults to [].
            threshold: Pass/fail cutoff in [0, 1].
            name: Display name; defaults to ``score_type``.
            strict_mode: Whether the server applies strict scoring.
            model: Optional judge-model override.
            **additional_properties: Extra kwargs forwarded verbatim to the API.

        Raises:
            ValueError: If ``threshold`` is outside [0, 1].
        """
        self._score_type = score_type
        self._required_params = required_params or []
        # Route through the setter so the [0, 1] bounds check is enforced at
        # construction time too, not only on later updates (previously the
        # constructor accepted out-of-range thresholds that set_threshold
        # would have rejected).
        self.set_threshold(threshold)
        self._name = name or score_type
        self._strict_mode = strict_mode
        self._model = model
        self._additional_properties = additional_properties

    def get_name(self) -> str:
        """Return the scorer's display name."""
        return self._name

    def get_score_type(self) -> str:
        """Return the server-side score type identifier."""
        return self._score_type

    def get_threshold(self) -> float:
        """Return the pass/fail threshold."""
        return self._threshold

    def get_strict_mode(self) -> bool:
        """Return whether strict scoring is enabled."""
        return self._strict_mode

    def get_model(self) -> Optional[str]:
        """Return the judge-model override, if any."""
        return self._model

    def get_required_params(self) -> List[str]:
        """Return a defensive copy of the required example fields."""
        return self._required_params.copy()

    def set_threshold(self, threshold: float) -> None:
        """Set the pass/fail threshold.

        Raises:
            ValueError: If ``threshold`` is outside [0, 1].
        """
        if threshold < 0 or threshold > 1:
            raise ValueError(f"Threshold must be between 0 and 1, got: {threshold}")
        self._threshold = threshold

    def set_name(self, name: str) -> None:
        """Set the scorer's display name."""
        self._name = name

    def set_strict_mode(self, strict_mode: bool) -> None:
        """Enable or disable strict scoring."""
        self._strict_mode = strict_mode

    def set_model(self, model: str) -> None:
        """Set the judge-model override."""
        self._model = model

    def get_scorer_config(self) -> ScorerConfig:
        """Serialize this scorer into the API's ``ScorerConfig`` payload.

        Extra constructor properties are forwarded via ``kwargs``; the model,
        when set, is folded into ``kwargs`` as well.
        """
        kwargs: Dict[str, Any] = dict(self._additional_properties)
        if self._model:
            kwargs["model"] = self._model

        return ScorerConfig(
            score_type=self._score_type,
            threshold=self._threshold,
            name=self._name,
            strict_mode=self._strict_mode,
            required_params=self._required_params,
            kwargs=kwargs,
        )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
|
|
5
|
+
from judgeval.v1.internal.api.api_types import ScorerConfig
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class BaseScorer(ABC):
    """Abstract interface shared by every judgeval scorer.

    A concrete scorer must expose a display name and be able to render
    itself as a ``ScorerConfig`` payload for the Judgment API.
    """

    __slots__ = ()

    @abstractmethod
    def get_name(self) -> str:
        """Return the human-readable name of this scorer."""
        ...

    @abstractmethod
    def get_scorer_config(self) -> ScorerConfig:
        """Return the API-serializable configuration for this scorer."""
        ...
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from judgeval.v1.scorers.built_in.answer_correctness import AnswerCorrectnessScorer
|
|
4
|
+
from judgeval.v1.scorers.built_in.answer_relevancy import AnswerRelevancyScorer
|
|
5
|
+
from judgeval.v1.scorers.built_in.built_in_factory import BuiltInScorersFactory
|
|
6
|
+
from judgeval.v1.scorers.built_in.faithfulness import FaithfulnessScorer
|
|
7
|
+
from judgeval.v1.scorers.built_in.instruction_adherence import (
|
|
8
|
+
InstructionAdherenceScorer,
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"AnswerCorrectnessScorer",
|
|
13
|
+
"AnswerRelevancyScorer",
|
|
14
|
+
"FaithfulnessScorer",
|
|
15
|
+
"InstructionAdherenceScorer",
|
|
16
|
+
"BuiltInScorersFactory",
|
|
17
|
+
]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.constants import APIScorerType
|
|
6
|
+
from judgeval.v1.scorers.api_scorer import APIScorer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AnswerCorrectnessScorer(APIScorer):
    """Built-in scorer comparing the produced answer against the expected one."""

    def __init__(
        self,
        threshold: float = 0.5,
        name: Optional[str] = None,
        strict_mode: bool = False,
        model: Optional[str] = None,
    ):
        # Correctness needs the question, the produced answer, and the gold
        # answer to compare against.
        needed_fields = ["input", "actual_output", "expected_output"]
        super().__init__(
            score_type=APIScorerType.ANSWER_CORRECTNESS.value,
            required_params=needed_fields,
            threshold=threshold,
            name=name,
            strict_mode=strict_mode,
            model=model,
        )

    @staticmethod
    def create(threshold: float = 0.5) -> AnswerCorrectnessScorer:
        """Convenience constructor used by the built-in scorer factory."""
        return AnswerCorrectnessScorer(threshold)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.constants import APIScorerType
|
|
6
|
+
from judgeval.v1.scorers.api_scorer import APIScorer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class AnswerRelevancyScorer(APIScorer):
    """Built-in scorer judging how relevant the answer is to the question."""

    def __init__(
        self,
        threshold: float = 0.5,
        name: Optional[str] = None,
        strict_mode: bool = False,
        model: Optional[str] = None,
    ):
        # Relevancy only needs the question and the produced answer.
        needed_fields = ["input", "actual_output"]
        super().__init__(
            score_type=APIScorerType.ANSWER_RELEVANCY.value,
            required_params=needed_fields,
            threshold=threshold,
            name=name,
            strict_mode=strict_mode,
            model=model,
        )

    @staticmethod
    def create(threshold: float = 0.5) -> AnswerRelevancyScorer:
        """Convenience constructor used by the built-in scorer factory."""
        return AnswerRelevancyScorer(threshold)
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from judgeval.v1.scorers.built_in.answer_correctness import AnswerCorrectnessScorer
|
|
4
|
+
from judgeval.v1.scorers.built_in.answer_relevancy import AnswerRelevancyScorer
|
|
5
|
+
from judgeval.v1.scorers.built_in.faithfulness import FaithfulnessScorer
|
|
6
|
+
from judgeval.v1.scorers.built_in.instruction_adherence import (
|
|
7
|
+
InstructionAdherenceScorer,
|
|
8
|
+
)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class BuiltInScorersFactory:
    """Factory exposing one constructor per built-in Judgment scorer."""

    __slots__ = ()

    def answer_correctness(self, threshold: float = 0.5) -> AnswerCorrectnessScorer:
        """Build an answer-correctness scorer with the given threshold."""
        return AnswerCorrectnessScorer.create(threshold=threshold)

    def answer_relevancy(self, threshold: float = 0.5) -> AnswerRelevancyScorer:
        """Build an answer-relevancy scorer with the given threshold."""
        return AnswerRelevancyScorer.create(threshold=threshold)

    def faithfulness(self, threshold: float = 0.5) -> FaithfulnessScorer:
        """Build a faithfulness scorer with the given threshold."""
        return FaithfulnessScorer.create(threshold=threshold)

    def instruction_adherence(
        self, threshold: float = 0.5
    ) -> InstructionAdherenceScorer:
        """Build an instruction-adherence scorer with the given threshold."""
        return InstructionAdherenceScorer.create(threshold=threshold)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.constants import APIScorerType
|
|
6
|
+
from judgeval.v1.scorers.api_scorer import APIScorer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class FaithfulnessScorer(APIScorer):
    """Built-in scorer checking the answer is grounded in the given context."""

    def __init__(
        self,
        threshold: float = 0.5,
        name: Optional[str] = None,
        strict_mode: bool = False,
        model: Optional[str] = None,
    ):
        # Faithfulness grades the answer against the retrieved context.
        needed_fields = ["context", "actual_output"]
        super().__init__(
            score_type=APIScorerType.FAITHFULNESS.value,
            required_params=needed_fields,
            threshold=threshold,
            name=name,
            strict_mode=strict_mode,
            model=model,
        )

    @staticmethod
    def create(threshold: float = 0.5) -> FaithfulnessScorer:
        """Convenience constructor used by the built-in scorer factory."""
        return FaithfulnessScorer(threshold)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.constants import APIScorerType
|
|
6
|
+
from judgeval.v1.scorers.api_scorer import APIScorer
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class InstructionAdherenceScorer(APIScorer):
    """Built-in scorer judging how well the answer follows the instructions."""

    def __init__(
        self,
        threshold: float = 0.5,
        name: Optional[str] = None,
        strict_mode: bool = False,
        model: Optional[str] = None,
    ):
        # Adherence needs the instructions (input) and the produced answer.
        needed_fields = ["input", "actual_output"]
        super().__init__(
            score_type=APIScorerType.INSTRUCTION_ADHERENCE.value,
            required_params=needed_fields,
            threshold=threshold,
            name=name,
            strict_mode=strict_mode,
            model=model,
        )

    @staticmethod
    def create(threshold: float = 0.5) -> InstructionAdherenceScorer:
        """Convenience constructor used by the built-in scorer factory."""
        return InstructionAdherenceScorer(threshold)
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import TYPE_CHECKING
|
|
4
|
+
from judgeval.constants import APIScorerType
|
|
5
|
+
|
|
6
|
+
if TYPE_CHECKING:
|
|
7
|
+
from judgeval.v1.internal.api.api_types import (
|
|
8
|
+
BaseScorer as BaseScorerDict,
|
|
9
|
+
ScorerConfig,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
from judgeval.v1.scorers.base_scorer import BaseScorer
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class CustomScorer(BaseScorer):
    """Reference to a user-defined scorer executed server-side.

    Only identifying metadata (name, backing class name, hosting flag) lives
    on the client; serialization for API payloads goes through ``to_dict``.
    """

    __slots__ = ("_name", "_class_name", "_server_hosted")

    def __init__(
        self,
        name: str,
        class_name: str = "",
        server_hosted: bool = True,
    ):
        self._name = name
        # Fall back to the scorer name when no explicit class name is given.
        self._class_name = class_name if class_name else name
        self._server_hosted = server_hosted

    def get_name(self) -> str:
        """Return the scorer's registered name."""
        return self._name

    def get_class_name(self) -> str:
        """Return the class name backing this scorer."""
        return self._class_name

    def is_server_hosted(self) -> bool:
        """Return whether the scorer implementation runs server-side."""
        return self._server_hosted

    def get_scorer_config(self) -> ScorerConfig:
        # Custom scorers are serialized via to_dict(), not ScorerConfig.
        raise NotImplementedError("CustomScorer does not use get_scorer_config")

    def to_dict(self) -> BaseScorerDict:
        """Serialize the scorer's metadata for API payloads."""
        payload: BaseScorerDict = {
            "score_type": APIScorerType.CUSTOM.value,
            "name": self._name,
            "class_name": self._class_name,
            "server_hosted": self._server_hosted,
        }
        return payload
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.v1.scorers.custom_scorer.custom_scorer import CustomScorer
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class CustomScorerFactory:
    """Factory producing references to server-hosted custom scorers."""

    __slots__ = ()

    def get(self, name: str, class_name: Optional[str] = None) -> CustomScorer:
        """Return a server-hosted custom scorer reference.

        ``class_name`` defaults to ``name`` when omitted or empty.
        """
        resolved_class = class_name or name
        return CustomScorer(
            name=name,
            class_name=resolved_class,
            server_hosted=True,
        )
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, Optional
|
|
4
|
+
|
|
5
|
+
from judgeval.constants import APIScorerType
|
|
6
|
+
from judgeval.v1.internal.api.api_types import ScorerConfig
|
|
7
|
+
from judgeval.v1.scorers.api_scorer import APIScorer
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class PromptScorer(APIScorer):
    """API scorer driven by a user-authored judge prompt.

    Wraps a prompt (plus optional label options and description) into the
    standard ``ScorerConfig`` payload. When ``is_trace`` is set, the scorer
    targets whole traces instead of single examples.
    """

    __slots__ = (
        "_prompt",
        "_options",
        "_description",
        "_judgment_api_key",
        "_organization_id",
        "_is_trace",
    )

    def __init__(
        self,
        name: str,
        prompt: str,
        threshold: float = 0.5,
        options: Optional[Dict[str, float]] = None,
        model: Optional[str] = None,
        description: Optional[str] = None,
        judgment_api_key: str = "",
        organization_id: str = "",
        is_trace: bool = False,
    ):
        # Use .value so score_type is a plain str, matching the
        # `score_type: str` contract of APIScorer.__init__ and the other
        # built-in scorers (which all pass APIScorerType.X.value).
        score_type = (
            APIScorerType.TRACE_PROMPT_SCORER.value
            if is_trace
            else APIScorerType.PROMPT_SCORER.value
        )
        super().__init__(
            score_type=score_type,
            threshold=threshold,
            name=name,
            model=model,
        )
        self._prompt = prompt
        # Copy to keep later caller-side mutation from leaking in.
        self._options = options.copy() if options else None
        self._description = description
        # NOTE(review): api key / organization id are stored but not read in
        # this class — presumably consumed by callers; confirm before removal.
        self._judgment_api_key = judgment_api_key
        self._organization_id = organization_id
        self._is_trace = is_trace

    def get_prompt(self) -> str:
        """Return the judge prompt text."""
        return self._prompt

    def get_options(self) -> Optional[Dict[str, float]]:
        """Return a defensive copy of the label options, if any."""
        return self._options.copy() if self._options else None

    def get_description(self) -> Optional[str]:
        """Return the scorer description, if any."""
        return self._description

    def set_prompt(self, prompt: str) -> None:
        """Replace the judge prompt text."""
        self._prompt = prompt

    def set_options(self, options: Dict[str, float]) -> None:
        """Replace the label options (stored as a copy)."""
        self._options = options.copy()

    def set_description(self, description: str) -> None:
        """Replace the scorer description."""
        self._description = description

    def append_to_prompt(self, addition: str) -> None:
        """Append text to the end of the judge prompt."""
        self._prompt = self._prompt + addition

    def get_scorer_config(self) -> ScorerConfig:
        """Serialize this prompt scorer into a ``ScorerConfig`` payload.

        The prompt always goes into ``kwargs``; options, model, and
        description are included only when set.
        """
        kwargs: Dict[str, Any] = {"prompt": self._prompt}

        if self._options:
            kwargs["options"] = self._options
        if self._model:
            kwargs["model"] = self._model
        if self._description:
            kwargs["description"] = self._description

        return ScorerConfig(
            score_type=self._score_type,
            threshold=self._threshold,
            name=self._name,
            kwargs=kwargs,
        )
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from typing import Dict, Tuple
|
|
4
|
+
|
|
5
|
+
from judgeval.v1.internal.api import JudgmentSyncClient
|
|
6
|
+
from judgeval.v1.internal.api.api_types import (
|
|
7
|
+
FetchPromptScorersRequest,
|
|
8
|
+
FetchPromptScorersResponse,
|
|
9
|
+
PromptScorer as APIPromptScorer,
|
|
10
|
+
)
|
|
11
|
+
from judgeval.exceptions import JudgmentAPIError
|
|
12
|
+
from judgeval.v1.scorers.prompt_scorer.prompt_scorer import PromptScorer
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class PromptScorerFactory:
    """Fetches prompt scorers from the Judgment API and builds PromptScorer objects.

    Fetched scorer payloads are memoized in a class-level cache keyed by
    (name, organization, api key, is_trace), so the cache is intentionally
    shared across factory instances for the process lifetime.
    """

    __slots__ = ("_client", "_is_trace")
    _cache: Dict[Tuple[str, str, str, bool], APIPromptScorer] = {}

    def __init__(
        self,
        client: JudgmentSyncClient,
        is_trace: bool,
    ):
        self._client = client
        self._is_trace = is_trace

    def get(self, name: str) -> PromptScorer:
        """Fetch (or reuse a cached) prompt scorer by name.

        Raises:
            JudgmentAPIError: 404 when no scorer with that name exists,
                400 when the scorer's trace/example kind doesn't match this
                factory, 500 on any other fetch failure.
        """
        cache_key = (
            name,
            self._client.organization_id,
            self._client.api_key,
            self._is_trace,
        )
        cached = self._cache.get(cache_key)

        if cached is None:
            # _is_trace is always a bool here, so always send it (the old
            # `if self._is_trace is not None` guard could never be false).
            request: FetchPromptScorersRequest = {
                "names": [name],
                "is_trace": self._is_trace,
            }

            try:
                response: FetchPromptScorersResponse = self._client.fetch_scorers(
                    request
                )
                scorers = response.get("scorers", [])

                if not scorers:
                    raise JudgmentAPIError(
                        404, f"Failed to fetch prompt scorer '{name}': not found", None
                    )

                scorer = scorers[0]
                scorer_is_trace = scorer.get("is_trace", False)

                # Reject a scorer of the wrong kind rather than silently
                # building the wrong wrapper type.
                if scorer_is_trace != self._is_trace:
                    expected_type = (
                        "TracePromptScorer" if self._is_trace else "PromptScorer"
                    )
                    actual_type = (
                        "TracePromptScorer" if scorer_is_trace else "PromptScorer"
                    )
                    raise JudgmentAPIError(
                        400,
                        f"Scorer with name {name} is a {actual_type}, not a {expected_type}",
                        None,
                    )

                self._cache[cache_key] = scorer
                cached = scorer
            except JudgmentAPIError:
                raise
            except Exception as e:
                # Chain the cause so the original failure is visible in
                # tracebacks instead of being flattened into the message only.
                raise JudgmentAPIError(
                    500, f"Failed to fetch prompt scorer '{name}': {e}", None
                ) from e

        return PromptScorer(
            name=name,
            prompt=cached.get("prompt", ""),
            threshold=cached.get("threshold", 0.5),
            options=cached.get("options"),
            model=cached.get("model"),
            description=cached.get("description"),
            is_trace=self._is_trace,
        )
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from judgeval.v1.internal.api import JudgmentSyncClient
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class ScorersFactory:
    """Top-level entry point exposing every scorer family as a factory.

    Factory imports are deferred to property access so that merely
    constructing a ``ScorersFactory`` stays cheap and import-cycle free.
    """

    __slots__ = ("_client",)

    def __init__(
        self,
        client: JudgmentSyncClient,
    ):
        self._client = client

    @property
    def prompt_scorer(self):
        """Factory for example-level prompt scorers."""
        from judgeval.v1.scorers.prompt_scorer.prompt_scorer_factory import (
            PromptScorerFactory,
        )

        return PromptScorerFactory(client=self._client, is_trace=False)

    @property
    def trace_prompt_scorer(self):
        """Factory for trace-level prompt scorers."""
        from judgeval.v1.scorers.prompt_scorer.prompt_scorer_factory import (
            PromptScorerFactory,
        )

        return PromptScorerFactory(client=self._client, is_trace=True)

    @property
    def custom_scorer(self):
        """Factory for server-hosted custom scorers."""
        from judgeval.v1.scorers.custom_scorer.custom_scorer_factory import (
            CustomScorerFactory,
        )

        return CustomScorerFactory()

    @property
    def built_in(self):
        """Factory for the built-in Judgment scorers."""
        from judgeval.v1.scorers.built_in.built_in_factory import BuiltInScorersFactory

        return BuiltInScorersFactory()
|