judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of judgeval might be problematic.
- judgeval/__init__.py +177 -12
- judgeval/api/__init__.py +519 -0
- judgeval/api/api_types.py +407 -0
- judgeval/cli.py +79 -0
- judgeval/constants.py +76 -47
- judgeval/data/__init__.py +3 -3
- judgeval/data/evaluation_run.py +125 -0
- judgeval/data/example.py +15 -56
- judgeval/data/judgment_types.py +450 -0
- judgeval/data/result.py +29 -73
- judgeval/data/scorer_data.py +29 -62
- judgeval/data/scripts/fix_default_factory.py +23 -0
- judgeval/data/scripts/openapi_transform.py +123 -0
- judgeval/data/trace.py +121 -0
- judgeval/dataset/__init__.py +264 -0
- judgeval/env.py +52 -0
- judgeval/evaluation/__init__.py +344 -0
- judgeval/exceptions.py +27 -0
- judgeval/integrations/langgraph/__init__.py +13 -0
- judgeval/integrations/openlit/__init__.py +50 -0
- judgeval/judges/__init__.py +2 -3
- judgeval/judges/base_judge.py +2 -3
- judgeval/judges/litellm_judge.py +100 -20
- judgeval/judges/together_judge.py +101 -20
- judgeval/judges/utils.py +20 -24
- judgeval/logger.py +62 -0
- judgeval/prompt/__init__.py +330 -0
- judgeval/scorers/__init__.py +18 -25
- judgeval/scorers/agent_scorer.py +17 -0
- judgeval/scorers/api_scorer.py +45 -41
- judgeval/scorers/base_scorer.py +83 -38
- judgeval/scorers/example_scorer.py +17 -0
- judgeval/scorers/exceptions.py +1 -0
- judgeval/scorers/judgeval_scorers/__init__.py +0 -148
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
- judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
- judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
- judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
- judgeval/scorers/score.py +77 -306
- judgeval/scorers/utils.py +4 -199
- judgeval/tracer/__init__.py +1122 -2
- judgeval/tracer/constants.py +1 -0
- judgeval/tracer/exporters/__init__.py +40 -0
- judgeval/tracer/exporters/s3.py +119 -0
- judgeval/tracer/exporters/store.py +59 -0
- judgeval/tracer/exporters/utils.py +32 -0
- judgeval/tracer/keys.py +63 -0
- judgeval/tracer/llm/__init__.py +7 -0
- judgeval/tracer/llm/config.py +78 -0
- judgeval/tracer/llm/constants.py +9 -0
- judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
- judgeval/tracer/llm/llm_anthropic/config.py +6 -0
- judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
- judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
- judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
- judgeval/tracer/llm/llm_google/__init__.py +3 -0
- judgeval/tracer/llm/llm_google/config.py +6 -0
- judgeval/tracer/llm/llm_google/generate_content.py +127 -0
- judgeval/tracer/llm/llm_google/wrapper.py +30 -0
- judgeval/tracer/llm/llm_openai/__init__.py +3 -0
- judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
- judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
- judgeval/tracer/llm/llm_openai/config.py +6 -0
- judgeval/tracer/llm/llm_openai/responses.py +506 -0
- judgeval/tracer/llm/llm_openai/utils.py +42 -0
- judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
- judgeval/tracer/llm/llm_together/__init__.py +3 -0
- judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
- judgeval/tracer/llm/llm_together/config.py +6 -0
- judgeval/tracer/llm/llm_together/wrapper.py +52 -0
- judgeval/tracer/llm/providers.py +19 -0
- judgeval/tracer/managers.py +167 -0
- judgeval/tracer/processors/__init__.py +220 -0
- judgeval/tracer/utils.py +19 -0
- judgeval/trainer/__init__.py +14 -0
- judgeval/trainer/base_trainer.py +122 -0
- judgeval/trainer/config.py +128 -0
- judgeval/trainer/console.py +144 -0
- judgeval/trainer/fireworks_trainer.py +396 -0
- judgeval/trainer/trainable_model.py +243 -0
- judgeval/trainer/trainer.py +70 -0
- judgeval/utils/async_utils.py +39 -0
- judgeval/utils/decorators/__init__.py +0 -0
- judgeval/utils/decorators/dont_throw.py +37 -0
- judgeval/utils/decorators/use_once.py +13 -0
- judgeval/utils/file_utils.py +97 -0
- judgeval/utils/guards.py +36 -0
- judgeval/utils/meta.py +27 -0
- judgeval/utils/project.py +15 -0
- judgeval/utils/serialize.py +253 -0
- judgeval/utils/testing.py +70 -0
- judgeval/utils/url.py +10 -0
- judgeval/utils/version_check.py +28 -0
- judgeval/utils/wrappers/README.md +3 -0
- judgeval/utils/wrappers/__init__.py +15 -0
- judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
- judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
- judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
- judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
- judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
- judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
- judgeval/utils/wrappers/py.typed +0 -0
- judgeval/utils/wrappers/utils.py +35 -0
- judgeval/version.py +5 -0
- judgeval/warnings.py +4 -0
- judgeval-0.22.2.dist-info/METADATA +265 -0
- judgeval-0.22.2.dist-info/RECORD +112 -0
- judgeval-0.22.2.dist-info/entry_points.txt +2 -0
- judgeval/clients.py +0 -39
- judgeval/common/__init__.py +0 -8
- judgeval/common/exceptions.py +0 -28
- judgeval/common/logger.py +0 -189
- judgeval/common/tracer.py +0 -798
- judgeval/common/utils.py +0 -763
- judgeval/data/api_example.py +0 -111
- judgeval/data/datasets/__init__.py +0 -5
- judgeval/data/datasets/dataset.py +0 -286
- judgeval/data/datasets/eval_dataset_client.py +0 -193
- judgeval/data/datasets/ground_truth.py +0 -54
- judgeval/data/datasets/utils.py +0 -74
- judgeval/evaluation_run.py +0 -132
- judgeval/judges/mixture_of_judges.py +0 -248
- judgeval/judgment_client.py +0 -354
- judgeval/run_evaluation.py +0 -439
- judgeval/scorers/judgeval_scorer.py +0 -140
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
- judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
- judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
- judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
- judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
- judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
- judgeval/scorers/prompt_scorer.py +0 -439
- judgeval-0.0.11.dist-info/METADATA +0 -36
- judgeval-0.0.11.dist-info/RECORD +0 -84
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
- {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/trainer/trainer.py ADDED
@@ -0,0 +1,70 @@
+from typing import Optional
+from .config import TrainerConfig
+from .base_trainer import BaseTrainer
+from .fireworks_trainer import FireworksTrainer
+from .trainable_model import TrainableModel
+from judgeval.tracer import Tracer
+from judgeval.exceptions import JudgmentRuntimeError
+
+
+def JudgmentTrainer(
+    config: TrainerConfig,
+    trainable_model: TrainableModel,
+    tracer: Tracer,
+    project_name: Optional[str] = None,
+) -> BaseTrainer:
+    """
+    Factory function for creating reinforcement learning trainers.
+
+    This factory creates and returns provider-specific trainer implementations
+    (FireworksTrainer, VerifiersTrainer, etc.) based on the configured RFT provider.
+
+    The factory pattern allows for easy extension to support multiple training
+    providers without changing the client-facing API.
+
+    Example:
+        config = TrainerConfig(
+            deployment_id="my-deployment",
+            user_id="my-user",
+            model_id="my-model",
+            rft_provider="fireworks"  # or "verifiers" in the future
+        )
+
+        # User creates and configures the trainable model
+        trainable_model = TrainableModel(config)
+        tracer = Tracer()
+
+        # JudgmentTrainer automatically creates the appropriate provider-specific trainer
+        trainer = JudgmentTrainer(config, trainable_model, tracer)
+
+        # The returned trainer implements the BaseTrainer interface
+        model_config = await trainer.train(agent_function, scorers, prompts)
+
+    Args:
+        config: TrainerConfig instance with training parameters including rft_provider
+        trainable_model: Provider-specific trainable model instance (e.g., TrainableModel for Fireworks)
+        tracer: Tracer for observability
+        project_name: Project name for organizing training runs and evaluations
+
+    Returns:
+        Provider-specific trainer instance (FireworksTrainer, etc.) that implements
+        the BaseTrainer interface
+
+    Raises:
+        JudgmentRuntimeError: If the specified provider is not supported
+    """
+    provider = config.rft_provider.lower()
+
+    if provider == "fireworks":
+        return FireworksTrainer(config, trainable_model, tracer, project_name)
+    elif provider == "verifiers":
+        # Placeholder for future implementation
+        raise JudgmentRuntimeError(
+            "Verifiers provider is not yet implemented. "
+            "Currently supported providers: 'fireworks'"
+        )
+    else:
+        raise JudgmentRuntimeError(
+            f"Unsupported RFT provider: '{config.rft_provider}'. "
+            f"Currently supported providers: 'fireworks'"
+        )
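Only the "fireworks" provider is implemented in this release, so the factory's error path is the one callers are most likely to hit. A minimal sketch of guarding against it, assuming judgeval.trainer re-exports these names (its __init__.py is added in this release) and reusing the illustrative TrainerConfig values from the docstring above:

    from judgeval.tracer import Tracer
    from judgeval.exceptions import JudgmentRuntimeError
    from judgeval.trainer import JudgmentTrainer, TrainerConfig, TrainableModel  # assumed re-exports

    config = TrainerConfig(
        deployment_id="my-deployment",
        user_id="my-user",
        model_id="my-model",
        rft_provider="verifiers",  # recognized but not yet implemented
    )
    try:
        trainer = JudgmentTrainer(config, TrainableModel(config), Tracer())
    except JudgmentRuntimeError as err:
        print(err)  # "Verifiers provider is not yet implemented. ..."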
judgeval/utils/async_utils.py ADDED
@@ -0,0 +1,39 @@
+"""Async utilities for judgeval."""
+
+import asyncio
+import concurrent.futures
+from typing import Awaitable, TypeVar, Coroutine
+
+
+T = TypeVar("T")
+
+
+def safe_run_async(coro: Awaitable[T]) -> T:
+    """Safely execute an async *coro* from synchronous code.
+
+    This helper handles two common situations:
+
+    1. **No running event loop** - Simply delegates to ``asyncio.run``.
+    2. **Existing running loop** - Executes the coroutine in a separate
+       thread so that we don't attempt to nest event loops (which would raise
+       ``RuntimeError``).
+
+    Args:
+        coro: The coroutine to execute.
+
+    Returns:
+        The result returned by *coro*.
+    """
+    if not isinstance(coro, Coroutine):
+        raise TypeError("The provided awaitable must be a coroutine.")
+
+    try:
+        asyncio.get_running_loop()
+    except RuntimeError:
+        return asyncio.run(coro)
+
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        future: concurrent.futures.Future[T] = executor.submit(
+            lambda: asyncio.run(coro)
+        )
+        return future.result()
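A minimal sketch of the two situations the docstring describes — calling safe_run_async where no loop is running, and calling it from inside a coroutine that already holds the loop:

    import asyncio
    from judgeval.utils.async_utils import safe_run_async

    async def add(a: int, b: int) -> int:
        return a + b

    # 1. No running event loop: delegates to asyncio.run.
    assert safe_run_async(add(1, 2)) == 3

    # 2. Called while a loop is running: the coroutine executes on a
    #    worker thread with its own loop, avoiding the nested-loop RuntimeError.
    async def main() -> None:
        assert safe_run_async(add(3, 4)) == 7

    asyncio.run(main())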
judgeval/utils/decorators/__init__.py ADDED
File without changes
judgeval/utils/decorators/dont_throw.py ADDED
@@ -0,0 +1,37 @@
+from functools import wraps
+from typing import Any, Callable, ParamSpec, TypeVar, overload
+
+from judgeval.logger import judgeval_logger
+
+T = TypeVar("T")
+D = TypeVar("D")
+P = ParamSpec("P")
+
+
+@overload
+def dont_throw(func: Callable[P, T], /) -> Callable[P, T | None]: ...
+
+
+@overload
+def dont_throw(
+    func: None = None, /, *, default: D
+) -> Callable[[Callable[P, T]], Callable[P, T | D]]: ...
+
+
+def dont_throw(func: Callable[P, T] | None = None, /, *, default: Any = None):
+    def decorator(f: Callable[P, T]) -> Callable[P, T | Any]:
+        @wraps(f)
+        def wrapper(*args: P.args, **kwargs: P.kwargs) -> T | Any:
+            try:
+                return f(*args, **kwargs)
+            except Exception as e:
+                judgeval_logger.debug(
+                    f"[Caught] An exception was raised in {f.__name__}", exc_info=e
+                )
+                return default
+
+        return wrapper
+
+    if func is None:
+        return decorator
+    return decorator(func)
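The two @overload declarations correspond to the two ways the decorator is applied. A short sketch of both forms:

    from judgeval.utils.decorators.dont_throw import dont_throw

    @dont_throw
    def parse_port(value: str) -> int:
        return int(value)

    @dont_throw(default=8080)
    def parse_port_or_default(value: str) -> int:
        return int(value)

    assert parse_port("not-a-number") is None             # exception logged at debug level, None returned
    assert parse_port_or_default("not-a-number") == 8080  # explicit fallback instead of None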
judgeval/utils/decorators/use_once.py ADDED
@@ -0,0 +1,13 @@
+from functools import lru_cache, wraps
+from typing import Callable, TypeVar
+
+T = TypeVar("T")
+
+
+def use_once(func: Callable[..., T]) -> Callable[..., T]:
+    @lru_cache(maxsize=1)
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        return func(*args, **kwargs)
+
+    return wrapper
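Since the wrapper is just an lru_cache of size one, the wrapped body runs once per cached argument tuple; for the typical zero-argument initializer it runs exactly once. A sketch:

    from judgeval.utils.decorators.use_once import use_once

    calls = []

    @use_once
    def initialize() -> str:
        calls.append(1)
        return "ready"

    assert initialize() == "ready"
    assert initialize() == "ready"  # cached result; the body does not run again
    assert len(calls) == 1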
judgeval/utils/file_utils.py ADDED
@@ -0,0 +1,97 @@
+import importlib.util
+import yaml
+import orjson
+from pathlib import Path
+from typing import List
+from judgeval.logger import judgeval_logger
+
+from judgeval.data.example import Example
+
+
+def get_examples_from_yaml(file_path: str) -> List[Example]:
+    """
+    Loads examples from a YAML file.
+
+    The YAML file is expected to have the following format:
+    - key_01: value_01
+      key_02: value_02
+    - key_11: value_11
+      key_12: value_12
+      key_13: value_13
+    ...
+    """
+    try:
+        with open(file_path, "r") as file:
+            payload = yaml.safe_load(file)
+            if payload is None:
+                raise ValueError("The YAML file is empty.")
+    except FileNotFoundError:
+        judgeval_logger.error(f"YAML file not found: {file_path}")
+        raise FileNotFoundError(f"The file {file_path} was not found.")
+    except yaml.YAMLError:
+        judgeval_logger.error(f"Invalid YAML file: {file_path}")
+        raise ValueError(f"The file {file_path} is not a valid YAML file.")
+
+    new_examples = [Example(**e) for e in payload]
+    return new_examples
+
+
+def get_examples_from_json(file_path: str) -> List[Example]:
+    """
+    Loads examples from a JSON file.
+
+    The JSON file is expected to have the following format:
+    [
+        {
+            "key_01": "value_01",
+            "key_02": "value_02"
+        },
+        {
+            "key_11": "value_11",
+            "key_12": "value_12",
+            "key_13": "value_13"
+        },
+        ...
+    ]
+    """
+    try:
+        with open(file_path, "rb") as file:
+            payload = orjson.loads(file.read())
+    except FileNotFoundError:
+        judgeval_logger.error(f"JSON file not found: {file_path}")
+        raise FileNotFoundError(f"The file {file_path} was not found.")
+    except orjson.JSONDecodeError:
+        judgeval_logger.error(f"Invalid JSON file: {file_path}")
+        raise ValueError(f"The file {file_path} is not a valid JSON file.")
+
+    new_examples = [Example(**e) for e in payload]
+    return new_examples
+
+
+def extract_scorer_name(scorer_file_path: str) -> str:
+    try:
+        spec = importlib.util.spec_from_file_location("scorer_module", scorer_file_path)
+        if spec is None or spec.loader is None:
+            raise ImportError(f"Could not load spec from {scorer_file_path}")
+
+        module = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(module)
+
+        for attr_name in dir(module):
+            attr = getattr(module, attr_name)
+            if (
+                isinstance(attr, type)
+                and any("Scorer" in str(base) for base in attr.__mro__)
+                and attr.__module__ == "scorer_module"
+            ):
+                try:
+                    scorer_instance = attr()
+                    if hasattr(scorer_instance, "name"):
+                        return scorer_instance.name
+                except Exception:
+                    continue
+
+        raise AttributeError("No scorer class found or could be instantiated")
+    except Exception as e:
+        judgeval_logger.warning(f"Could not extract scorer name: {e}")
+        return Path(scorer_file_path).stem
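A sketch of feeding get_examples_from_json the documented format; the keys here are illustrative — whatever keyword arguments Example (judgeval/data/example.py) accepts are the valid keys:

    import orjson
    import tempfile
    from judgeval.utils.file_utils import get_examples_from_json

    records = [
        {"input": "What is 2 + 2?", "actual_output": "4"},  # assumed Example fields
    ]
    with tempfile.NamedTemporaryFile("wb", suffix=".json", delete=False) as f:
        f.write(orjson.dumps(records))

    examples = get_examples_from_json(f.name)
    assert len(examples) == 1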
judgeval/utils/guards.py ADDED
@@ -0,0 +1,36 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+from judgeval.logger import judgeval_logger
+
+if TYPE_CHECKING:
+    from typing import TypeVar
+
+    T = TypeVar("T")
+
+
+def expect_exists(value: T | None, message: str, default: T) -> T:
+    if not value:
+        judgeval_logger.error(message)
+        return default
+
+    return value
+
+
+def expect_api_key(api_key: str | None) -> str | None:
+    return expect_exists(
+        api_key,
+        "API Key is not set, please set JUDGMENT_API_KEY in the environment variables or pass it as `api_key`",
+        default=None,
+    )
+
+
+def expect_organization_id(organization_id: str | None) -> str | None:
+    return expect_exists(
+        organization_id,
+        "Organization ID is not set, please set JUDGMENT_ORG_ID in the environment variables or pass it as `organization_id`",
+        default=None,
+    )
+
+
+__all__ = ("expect_exists", "expect_api_key", "expect_organization_id")
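These guards log an error and fall back to a default rather than raising, so missing credentials surface in logs while the caller decides what to do with the None. A sketch:

    import os
    from judgeval.utils.guards import expect_api_key

    # Logs "API Key is not set, ..." and returns None when the variable is unset.
    api_key = expect_api_key(os.environ.get("JUDGMENT_API_KEY"))
    if api_key is None:
        ...  # skip the authenticated code path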
judgeval/utils/meta.py ADDED
@@ -0,0 +1,27 @@
+from __future__ import annotations
+from typing import TypeVar, Dict, cast, Type
+
+T = TypeVar("T")
+
+
+class SingletonMeta(type):
+    """
+    Metaclass for creating singleton classes.
+    """
+
+    _instances: Dict[type, object] = {}
+
+    def __call__(cls, *args, **kwargs):
+        if cls not in SingletonMeta._instances:
+            SingletonMeta._instances[cls] = super(SingletonMeta, cls).__call__(
+                *args, **kwargs
+            )
+        return SingletonMeta._instances[cls]
+
+    def get_instance(cls: Type[T]) -> T | None:
+        """Get the singleton instance if it exists, otherwise return None"""
+        instance = SingletonMeta._instances.get(cls, None)
+        return cast(T, instance) if instance is not None else None
+
+
+__all__ = ("SingletonMeta",)
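Because get_instance is defined on the metaclass, it is invoked on the class object itself and reports None until the first construction. A sketch:

    from judgeval.utils.meta import SingletonMeta

    class AppState(metaclass=SingletonMeta):
        def __init__(self, value: int = 0) -> None:
            self.value = value

    assert AppState.get_instance() is None  # nothing constructed yet
    a = AppState(1)
    b = AppState(2)  # constructor arguments are ignored after the first call
    assert a is b and b.value == 1
    assert AppState.get_instance() is a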
judgeval/utils/project.py ADDED
@@ -0,0 +1,15 @@
+from judgeval.utils.decorators.dont_throw import dont_throw
+import functools
+from judgeval.api import JudgmentSyncClient
+
+
+@dont_throw
+@functools.lru_cache(maxsize=64)
+def _resolve_project_id(project_name: str, api_key: str, organization_id: str) -> str:
+    """Resolve project_id from project_name using the API."""
+    client = JudgmentSyncClient(
+        api_key=api_key,
+        organization_id=organization_id,
+    )
+    response = client.projects_resolve({"project_name": project_name})
+    return response["project_id"]
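The decorator order is the interesting part: lru_cache sits inside, so only successful lookups are cached (a raised exception is never stored), while dont_throw on the outside converts those failures to None. The same pattern with a stand-in function, no network required:

    import functools
    from judgeval.utils.decorators.dont_throw import dont_throw

    attempts = []

    @dont_throw
    @functools.lru_cache(maxsize=64)
    def resolve(name: str) -> str:
        attempts.append(name)
        if name == "missing":
            raise LookupError(name)
        return f"id-{name}"

    assert resolve("demo") == "id-demo"
    assert resolve("demo") == "id-demo"  # served from the cache; body ran once
    assert resolve("missing") is None    # failure swallowed by dont_throw
    assert resolve("missing") is None    # and retried, since errors are not cached
    assert attempts == ["demo", "missing", "missing"]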
judgeval/utils/serialize.py ADDED
@@ -0,0 +1,253 @@
+"""
+
+This is a modified version of https://docs.powertools.aws.dev/lambda/python/2.35.1/api/event_handler/openapi/encoders.html
+
+"""
+
+import dataclasses
+import datetime
+from collections import defaultdict, deque
+from decimal import Decimal
+from enum import Enum
+from pathlib import Path, PurePath
+from re import Pattern
+from types import GeneratorType
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, Union
+from uuid import UUID
+
+from pydantic import BaseModel
+from pydantic.types import SecretBytes, SecretStr
+import orjson
+
+from judgeval.logger import judgeval_logger
+
+
+"""
+This module contains the encoders used by jsonable_encoder to convert Python objects to JSON serializable data types.
+"""
+
+
+def _model_dump(
+    model: BaseModel, mode: Literal["json", "python"] = "json", **kwargs: Any
+) -> Any:
+    return model.model_dump(mode=mode, **kwargs)
+
+
+def json_encoder(
+    obj: Any,
+    custom_serializer: Optional[Callable[[Any], str]] = None,
+) -> Any:
+    """
+    JSON encodes an arbitrary Python object into JSON serializable data types.
+
+    This is a modified version of fastapi.encoders.jsonable_encoder that supports
+    encoding of pydantic.BaseModel objects.
+
+    Parameters
+    ----------
+    obj : Any
+        The object to encode
+    custom_serializer : Callable, optional
+        A custom serializer to use for encoding the object, when everything else fails.
+
+    Returns
+    -------
+    Any
+        The JSON serializable data types
+    """
+    # Pydantic models
+    if isinstance(obj, BaseModel):
+        return _dump_base_model(
+            obj=obj,
+        )
+
+    # Dataclasses
+    if dataclasses.is_dataclass(obj):
+        obj_dict = dataclasses.asdict(obj)  # type: ignore[arg-type]
+        return json_encoder(
+            obj_dict,
+        )
+
+    # Enums
+    if isinstance(obj, Enum):
+        return obj.value
+
+    # Paths
+    if isinstance(obj, PurePath):
+        return str(obj)
+
+    # Scalars
+    if isinstance(obj, (str, int, float, type(None))):
+        return obj
+
+    # Dictionaries
+    if isinstance(obj, dict):
+        return _dump_dict(
+            obj=obj,
+        )
+
+    # Sequences
+    if isinstance(obj, (list, set, frozenset, tuple, deque)):
+        return _dump_sequence(
+            obj=obj,
+        )
+
+    # Other types
+    if type(obj) in ENCODERS_BY_TYPE:
+        return ENCODERS_BY_TYPE[type(obj)](obj)
+
+    for encoder, classes_tuple in encoders_by_class_tuples.items():
+        if isinstance(obj, classes_tuple):
+            return encoder(obj)
+
+    # Use custom serializer if present
+    if custom_serializer:
+        return custom_serializer(obj)
+
+    # Default
+    return _dump_other(
+        obj=obj,
+    )
+
+
+def _dump_base_model(
+    *,
+    obj: Any,
+):
+    """
+    Dump a BaseModel object to a dict, using the same parameters as jsonable_encoder
+    """
+    obj_dict = _model_dump(
+        obj,
+        mode="json",
+    )
+    if "__root__" in obj_dict:
+        obj_dict = obj_dict["__root__"]
+
+    return json_encoder(
+        obj_dict,
+    )
+
+
+def _dump_dict(
+    *,
+    obj: Any,
+) -> Dict[str, Any]:
+    """
+    Dump a dict to a dict, using the same parameters as jsonable_encoder
+    """
+    encoded_dict = {}
+    allowed_keys = set(obj.keys())
+    for key, value in obj.items():
+        if key in allowed_keys:
+            encoded_key = json_encoder(
+                key,
+            )
+            encoded_value = json_encoder(
+                value,
+            )
+            encoded_dict[encoded_key] = encoded_value
+    return encoded_dict
+
+
+def _dump_sequence(
+    *,
+    obj: Any,
+) -> List[Any]:
+    """
+    Dump a sequence to a list, using the same parameters as jsonable_encoder
+    """
+    encoded_list = []
+    for item in obj:
+        encoded_list.append(
+            json_encoder(
+                item,
+            ),
+        )
+    return encoded_list
+
+
+def _dump_other(
+    *,
+    obj: Any,
+) -> Any:
+    """
+    Dump an object to a representation without iterating it.
+
+    Avoids calling dict(obj) which can consume iterators/generators or
+    invoke user-defined iteration protocols.
+    """
+    try:
+        return repr(obj)
+    except Exception:
+        return str(obj)
+
+
+def iso_format(o: Union[datetime.date, datetime.time]) -> str:
+    """
+    ISO format for date and time
+    """
+    return o.isoformat()
+
+
+def decimal_encoder(dec_value: Decimal) -> Union[int, float]:
+    """
+    Encodes a Decimal as int if there's no exponent, otherwise float
+
+    This is useful when we use ConstrainedDecimal to represent Numeric(x,0)
+    where an integer (but not int typed) is used. Encoding this as a float
+    results in failed round-tripping between encode and parse.
+
+    >>> decimal_encoder(Decimal("1.0"))
+    1.0
+
+    >>> decimal_encoder(Decimal("1"))
+    1
+    """
+    if dec_value.as_tuple().exponent >= 0:  # type: ignore[operator]
+        return int(dec_value)
+    else:
+        return float(dec_value)
+
+
+ENCODERS_BY_TYPE: Dict[Type[Any], Callable[[Any], Any]] = {
+    bytes: lambda o: o.decode(),
+    datetime.date: iso_format,
+    datetime.datetime: iso_format,
+    datetime.time: iso_format,
+    datetime.timedelta: lambda td: td.total_seconds(),
+    Decimal: decimal_encoder,
+    Enum: lambda o: o.value,
+    frozenset: list,
+    deque: list,
+    GeneratorType: repr,
+    Path: str,
+    Pattern: lambda o: o.pattern,
+    SecretBytes: str,
+    SecretStr: str,
+    set: list,
+    UUID: str,
+}
+
+
+# Generates a mapping of encoders to a tuple of classes that they can encode
+def generate_encoders_by_class_tuples(
+    type_encoder_map: Dict[Any, Callable[[Any], Any]],
+) -> Dict[Callable[[Any], Any], Tuple[Any, ...]]:
+    encoders: Dict[Callable[[Any], Any], Tuple[Any, ...]] = defaultdict(tuple)
+    for type_, encoder in type_encoder_map.items():
+        encoders[encoder] += (type_,)
+    return encoders
+
+
+# Mapping of encoders to a tuple of classes that they can encode
+encoders_by_class_tuples = generate_encoders_by_class_tuples(ENCODERS_BY_TYPE)
+
+
+# Serialize arbitrary object to a json string
+def safe_serialize(obj: Any) -> str:
+    try:
+        return orjson.dumps(json_encoder(obj), option=orjson.OPT_NON_STR_KEYS).decode()
+    except Exception as e:
+        judgeval_logger.warning(f"Error serializing object: {e}")
+        return repr(obj)
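A sketch of safe_serialize across a few of the types in the encoder table — a pydantic model, a Decimal, a date, and a set all reduce to plain JSON in one pass:

    import datetime
    from decimal import Decimal
    from pydantic import BaseModel
    from judgeval.utils.serialize import safe_serialize

    class Span(BaseModel):
        name: str
        tags: set[str]

    payload = {
        "span": Span(name="llm_call", tags={"openai"}),
        "latency": Decimal("1.25"),  # negative exponent, so encoded as float
        "at": datetime.date(2024, 1, 1),
    }
    print(safe_serialize(payload))
    # {"span":{"name":"llm_call","tags":["openai"]},"latency":1.25,"at":"2024-01-01"}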
judgeval/utils/testing.py ADDED
@@ -0,0 +1,70 @@
+from rich import print as rprint
+
+from typing import List
+from judgeval.evaluation import ScoringResult
+from judgeval.data import ScorerData
+from judgeval.exceptions import JudgmentTestError
+
+
+def assert_test_results(scoring_results: List[ScoringResult]) -> None:
+    failed_cases: List[List[ScorerData]] = []
+    for result in scoring_results:
+        if not result.success:
+            test_case = []
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        test_case.append(scorer_data)
+            failed_cases.append(test_case)
+
+    if failed_cases:
+        error_msg = "The following test cases failed: \n"
+        for fail_case in failed_cases:
+            for fail_scorer in fail_case:
+                error_msg += (
+                    f"\nScorer Name: {fail_scorer.name}\n"
+                    f"Threshold: {fail_scorer.threshold}\n"
+                    f"Success: {fail_scorer.success}\n"
+                    f"Score: {fail_scorer.score}\n"
+                    f"Reason: {fail_scorer.reason}\n"
+                    f"Strict Mode: {fail_scorer.strict_mode}\n"
+                    f"Evaluation Model: {fail_scorer.evaluation_model}\n"
+                    f"Error: {fail_scorer.error}\n"
+                    f"Additional Metadata: {fail_scorer.additional_metadata}\n"
+                )
+            error_msg += "-" * 100
+
+    total_tests = len(scoring_results)
+    failed_tests = len(failed_cases)
+    passed_tests = total_tests - failed_tests
+
+    rprint("\n" + "=" * 80)
+    if failed_tests == 0:
+        rprint(
+            f"[bold green]🎉 ALL TESTS PASSED! {passed_tests}/{total_tests} tests successful[/bold green]"
+        )
+    else:
+        rprint(
+            f"[bold red]⚠️ TEST RESULTS: {passed_tests}/{total_tests} passed ({failed_tests} failed)[/bold red]"
+        )
+    rprint("=" * 80 + "\n")
+
+    for i, result in enumerate(scoring_results):
+        test_num = i + 1
+        if result.success:
+            rprint(f"[green]✓ Test {test_num}: PASSED[/green]")
+        else:
+            rprint(f"[red]✗ Test {test_num}: FAILED[/red]")
+            if result.scorers_data:
+                for scorer_data in result.scorers_data:
+                    if not scorer_data.success:
+                        rprint(f" [yellow]Scorer: {scorer_data.name}[/yellow]")
+                        rprint(f" [red] Score: {scorer_data.score}[/red]")
+                        rprint(f" [red] Reason: {scorer_data.reason}[/red]")
+                        if scorer_data.error:
+                            rprint(f" [red] Error: {scorer_data.error}[/red]")
+                        rprint(" " + "-" * 40)
+
+    rprint("\n" + "=" * 80)
+    if failed_tests > 0:
+        raise JudgmentTestError(failed_cases)
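A sketch of the failure path. The ScoringResult and ScorerData constructions below are hypothetical — the field names mirror the attributes the function reads above, but the models' actual required fields live in judgeval/data:

    from judgeval.data import ScorerData
    from judgeval.evaluation import ScoringResult
    from judgeval.exceptions import JudgmentTestError
    from judgeval.utils.testing import assert_test_results

    failing = ScoringResult(  # hypothetical kwargs
        success=False,
        scorers_data=[
            ScorerData(  # hypothetical kwargs
                name="Faithfulness", threshold=0.8, success=False, score=0.4,
                reason="claims unsupported by retrieval context", strict_mode=False,
                evaluation_model="gpt-4", error=None, additional_metadata={},
            )
        ],
    )
    try:
        assert_test_results([failing])  # prints the per-test summary, then raises
    except JudgmentTestError:
        pass  # raised whenever any result has success=False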