judgeval 0.0.17__py3-none-any.whl → 0.0.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +1 -3
- judgeval/clients.py +0 -7
- judgeval/common/logger.py +0 -1
- judgeval/common/tracer.py +250 -42
- judgeval/common/utils.py +9 -5
- judgeval/constants.py +6 -1
- judgeval/data/__init__.py +2 -0
- judgeval/data/api_example.py +2 -2
- judgeval/data/datasets/__init__.py +1 -2
- judgeval/data/datasets/dataset.py +4 -5
- judgeval/data/datasets/eval_dataset_client.py +1 -2
- judgeval/data/datasets/utils.py +1 -2
- judgeval/data/example.py +71 -16
- judgeval/data/scorer_data.py +1 -1
- judgeval/evaluation_run.py +2 -2
- judgeval/judges/__init__.py +0 -1
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +7 -2
- judgeval/judgment_client.py +8 -4
- judgeval/rules.py +2 -4
- judgeval/run_evaluation.py +2 -5
- judgeval/scorers/__init__.py +6 -0
- judgeval/scorers/api_scorer.py +12 -6
- judgeval/scorers/base_scorer.py +12 -6
- judgeval/scorers/judgeval_scorer.py +7 -3
- judgeval/scorers/judgeval_scorers/__init__.py +24 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
- judgeval/scorers/prompt_scorer.py +7 -5
- judgeval/scorers/utils.py +1 -1
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/METADATA +1 -1
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/RECORD +56 -48
- /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/WHEEL +0 -0
- {judgeval-0.0.17.dist-info → judgeval-0.0.19.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
@@ -1,12 +1,10 @@
 # Import key components that should be publicly accessible
-from judgeval.clients import client,
+from judgeval.clients import client, together_client
 from judgeval.judgment_client import JudgmentClient
 
 __all__ = [
     # Clients
     'client',
-    'langfuse',
     'together_client',
-
     'JudgmentClient',
 ]
judgeval/clients.py
CHANGED
@@ -1,19 +1,12 @@
 import os
 from dotenv import load_dotenv
 from openai import OpenAI
-from langfuse import Langfuse
 from typing import Optional
 from together import Together, AsyncTogether
 
 PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv(dotenv_path=PATH_TO_DOTENV)
 
-# Initialize required clients
-langfuse = Langfuse(
-    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
-    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
-    host=os.getenv("LANGFUSE_HOST"),
-)
 
 # Initialize optional OpenAI client
 client: Optional['OpenAI'] = None
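Taken together, the two hunks above drop Langfuse from the package's public import surface. A minimal sketch of what still imports cleanly in 0.0.19 (nothing here beyond what the new __all__ exports):

    # 0.0.19 public imports (per the new __all__)
    from judgeval import client, together_client, JudgmentClient

    # No longer available:
    # from judgeval import langfuse   # raises ImportError in 0.0.19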
judgeval/common/logger.py
CHANGED
judgeval/common/tracer.py
CHANGED
@@ -1,60 +1,68 @@
 """
 Tracing system for judgeval that allows for function tracing using decorators.
 """
-
-import
-import time
+# Standard library imports
+import asyncio
 import functools
-import requests
-import uuid
-from contextlib import contextmanager
-from typing import (
-    Optional,
-    Any,
-    List,
-    Literal,
-    Tuple,
-    Generator,
-    TypeAlias,
-    Union
-)
-from dataclasses import (
-    dataclass,
-    field
-)
-from datetime import datetime
-from openai import OpenAI
-from together import Together
-from anthropic import Anthropic
-from typing import Dict
 import inspect
-import asyncio
 import json
+import os
+import time
+import uuid
 import warnings
-from
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime
 from http import HTTPStatus
+from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
+from rich import print as rprint
+from uuid import UUID
+from collections.abc import Sequence
 
+# Third-party imports
 import pika
-import
+import requests
+from pydantic import BaseModel
+from rich import print as rprint
+from openai import OpenAI
+from together import Together
+from anthropic import Anthropic
 
-
+# Local application/library-specific imports
+from judgeval.constants import (
+    JUDGMENT_TRACES_SAVE_API_URL,
+    JUDGMENT_TRACES_FETCH_API_URL,
+    RABBITMQ_HOST,
+    RABBITMQ_PORT,
+    RABBITMQ_QUEUE,
+    JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
+)
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.
+from judgeval.data.result import ScoringResult
 
-from
+from langchain_core.language_models import BaseChatModel
+from langchain_huggingface import ChatHuggingFace
+from langchain_openai import ChatOpenAI
+from langchain_anthropic import ChatAnthropic
+from langchain_core.utils.function_calling import convert_to_openai_tool
+from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
+from langchain_core.agents import AgentAction, AgentFinish
+from langchain_core.outputs import LLMResult
 
-from
+from langchain_core.messages.ai import AIMessage
+from langchain_core.messages.tool import ToolMessage
+from langchain_core.messages.base import BaseMessage
+from langchain_core.documents import Document
 
 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
-SpanType = Literal['span', 'tool', 'llm', 'evaluation']
-
-
+SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
 @dataclass
 class TraceEntry:
     """Represents a single trace entry with its visual representation.
@@ -419,7 +427,7 @@ class TraceClient:
 
             # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
             if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-                raise ValueError("Cannot use Judgeval scorers
+                raise ValueError("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
         except Exception as e:
             warnings.warn(f"Failed to load scorers: {str(e)}")
@@ -455,9 +463,12 @@ class TraceClient:
         if self._current_span:
             duration = time.time() - start_time  # Calculate duration from start_time
 
+            prev_entry = self.entries[-1]
+
+            # Select the last entry in the trace if it's an LLM call, otherwise use the current span
             self.add_entry(TraceEntry(
                 type="evaluation",
-                function=self._current_span,
+                function=prev_entry.function if prev_entry.span_type == "llm" else self._current_span,
                 depth=self.tracer.depth,
                 message=f"Evaluation results for {self._current_span}",
                 timestamp=time.time(),
@@ -531,7 +542,7 @@ class TraceClient:
         active_functions = []  # Stack to track nested function calls
         function_entries = {}  # Store entries for each function
 
-        for entry in entries:
+        for i, entry in enumerate(entries):
            function = entry["function"]
 
            if entry["type"] == "enter":
@@ -553,9 +564,12 @@ class TraceClient:
                current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"]
                condensed.append(current_entry)
                active_functions.remove(function)
-               del function_entries[function]
+               # del function_entries[function]
 
-
+           # The OR condition is to handle the LLM client case.
+           # LLM client is a special case where we exit the span, so when we attach evaluations to it,
+           # we have to check if the previous entry is an LLM call.
+           elif function in active_functions or entry["type"] == "evaluation" and entries[i-1]["function"] == entry["function"]:
                # Update existing function entry with additional data
                current_entry = function_entries[function]
 
@@ -570,6 +584,7 @@ class TraceClient:
 
        # Sort by timestamp
        condensed.sort(key=lambda x: x["timestamp"])
+
       return condensed
 
    def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
@@ -581,6 +596,7 @@ class TraceClient:
       total_duration = self.get_duration()
 
       raw_entries = [entry.to_dict() for entry in self.entries]
+
       condensed_entries = self.condense_trace(raw_entries)
 
       # Calculate total token counts from LLM API calls
@@ -755,7 +771,7 @@ class Tracer:
             with trace.span(span_name, span_type=span_type) as span:
                 # Record inputs
                 span.record_input({
-                    'args':
+                    'args': str(args),
                     'kwargs': kwargs
                 })
 
@@ -792,7 +808,7 @@ class Tracer:
             with trace.span(span_name, span_type=span_type) as span:
                 # Record inputs
                 span.record_input({
-                    'args':
+                    'args': str(args),
                     'kwargs': kwargs
                 })
 
@@ -810,6 +826,28 @@ class Tracer:
             self._current_trace = None
 
         return wrapper
+
+    def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
+        """
+        Decorator to trace function execution with detailed entry/exit information.
+        """
+        if func is None:
+            return lambda f: self.observe(f, name=name, span_type=span_type)
+
+        if asyncio.iscoroutinefunction(func):
+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                if self._current_trace:
+                    self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model=model, log_results=log_results)
+            return async_wrapper
+        else:
+            @functools.wraps(func)
+            def wrapper(*args, **kwargs):
+                if self._current_trace:
+                    self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
+            return wrapper
+
+
 
 def wrap(client: Any) -> Any:
     """
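A rough usage sketch for the new Tracer.score decorator (assumed usage, not taken from this diff; `judgment` is assumed to be an existing Tracer instance and AnswerRelevancyScorer an assumed scorer class):

    # Assumed usage of Tracer.score; the names below are illustrative only.
    @judgment.score(scorers=[AnswerRelevancyScorer(threshold=0.7)], model="gpt-4o-mini")
    def answer(question: str) -> str:
        ...

    # Note: as written in the hunk above, the wrapper only calls
    # self._current_trace.async_evaluate when a trace is active and does not
    # invoke the wrapped function itself.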
@@ -920,3 +958,173 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
             "total_tokens": response.usage.input_tokens + response.usage.output_tokens
         }
     }
+
+class JudgevalCallbackHandler(BaseCallbackHandler):
+    def __init__(self, trace_client: TraceClient):
+        self.trace_client = trace_client
+        self.openai_count = 1
+
+    def start_span(self, name: str, span_type: SpanType = "span"):
+        start_time = time.time()
+
+        # Record span entry
+        self.trace_client.add_entry(TraceEntry(
+            type="enter",
+            function=name,
+            depth=self.trace_client.tracer.depth,
+            message=name,
+            timestamp=start_time,
+            span_type=span_type
+        ))
+
+        self.trace_client.tracer.depth += 1
+        self.trace_client.prev_span = self.trace_client._current_span
+        self.trace_client._current_span = name
+        self._start_time = start_time
+
+    def end_span(self, name: str, span_type: SpanType = "span"):
+        self.trace_client.tracer.depth -= 1
+        duration = time.time() - self._start_time
+
+        # Record span exit
+        self.trace_client.add_entry(TraceEntry(
+            type="exit",
+            function=name,
+            depth=self.trace_client.tracer.depth,
+            message=f"← {name}",
+            timestamp=time.time(),
+            duration=duration,
+            span_type=span_type
+        ))
+        self.trace_client._current_span = self.trace_client.prev_span
+
+    def on_retriever_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        query: str,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        metadata: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        name = "RETRIEVER_CALL"
+        if serialized and "name" in serialized:
+            name = f"RETRIEVER_{serialized['name'].upper()}"
+
+        self.start_span(name, span_type="retriever")
+        self.trace_client.record_input({
+            'query': query,
+            'tags': tags,
+            'metadata': metadata,
+            'kwargs': kwargs
+        })
+
+    def on_retriever_end(
+        self,
+        documents: Sequence[Document],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any
+    ) -> Any:
+        # Process the retrieved documents into a format suitable for logging
+        doc_summary = []
+        for i, doc in enumerate(documents):
+            # Extract key information from each document
+            doc_data = {
+                "index": i,
+                "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
+                "metadata": doc.metadata
+            }
+            doc_summary.append(doc_data)
+
+        # Record the document data
+        self.trace_client.record_output({
+            "document_count": len(documents),
+            "documents": doc_summary
+        })
+
+        # End the retriever span
+        self.end_span(self.trace_client._current_span, span_type="retriever")
+
+    def on_tool_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        input_str: str,
+        run_id: Optional[UUID] = None,
+        parent_run_id: Optional[UUID] = None,
+        inputs: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ):
+        name = serialized["name"]
+        self.start_span(name, span_type="tool")
+        self.trace_client.record_input({
+            'args': input_str,
+            'kwargs': kwargs
+        })
+
+    def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
+        self.trace_client.record_output(output)
+        self.end_span(self.trace_client._current_span, span_type="tool")
+
+    def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
+        print(f"Agent action: {action}")
+
+    def on_agent_finish(
+        self,
+        finish: AgentFinish,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        print(f"Agent action: {finish}")
+
+    def on_llm_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        prompts: list[str],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+        name = "LLM call"
+        self.start_span(name, span_type="llm")
+        self.trace_client.record_input({
+            'args': prompts,
+            'kwargs': kwargs
+        })
+
+    def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
+        self.trace_client.record_output(response.generations[0][0].text)
+        self.end_span(self.trace_client._current_span, span_type="llm")
+
+    def on_chat_model_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        messages: list[list[BaseMessage]],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+
+        if "openai" in serialized["id"]:
+            name = f"OPENAI_API_CALL_{self.openai_count}"
+            self.openai_count += 1
+        elif "anthropic" in serialized["id"]:
+            name = "ANTHROPIC_API_CALL"
+        elif "together" in serialized["id"]:
+            name = "TOGETHER_API_CALL"
+        else:
+            name = "LLM call"
+
+        self.start_span(name, span_type="llm")
+        self.trace_client.record_input({
+            'args': str(messages),
+            'kwargs': kwargs
+        })
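A minimal sketch of wiring the new handler into a LangChain model (assumed usage; `trace_client` is an existing TraceClient instance, which this hunk does not show how to construct):

    from judgeval.common.tracer import JudgevalCallbackHandler
    from langchain_openai import ChatOpenAI

    handler = JudgevalCallbackHandler(trace_client)   # trace_client: TraceClient (assumed)
    llm = ChatOpenAI(model="gpt-4o-mini", callbacks=[handler])
    llm.invoke("Hello")   # on_chat_model_start / on_llm_end open and close an 'llm' span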
judgeval/common/utils.py
CHANGED
@@ -8,15 +8,19 @@ For API calling, we support:
 NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is an asynchronous function
 """
 
-
-from typing import List, Mapping, Dict, Union, Optional, Literal, Any
+# Standard library imports
 import asyncio
+import concurrent.futures
+import os
+import pprint
+from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+
+# Third-party imports
 import litellm
 import pydantic
-import
-import os
-from dotenv import load_dotenv
+from dotenv import load_dotenv
 
+# Local application/library-specific imports
 from judgeval.clients import async_together_client, together_client
 from judgeval.constants import *
 from judgeval.common.logger import debug, error
judgeval/constants.py
CHANGED
@@ -21,8 +21,11 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RECALL = "contextual_recall"
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
+    INSTRUCTION_ADHERENCE = "instruction_adherence"
     TOOL_CORRECTNESS = "tool_correctness"
     JSON_CORRECTNESS = "json_correctness"
+    COMPARISON = "comparison"
+    GROUNDEDNESS = "groundedness"
 
     @classmethod
     def _missing_(cls, value):
@@ -31,6 +34,8 @@ class APIScorer(str, Enum):
             if member.value == value.lower():
                 return member
 
+UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not bounded between 0-1
+
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
@@ -110,7 +115,7 @@ TOGETHER_SUPPORTED_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1"
 ]
 
-JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
 ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 
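A small sketch of how the new UNBOUNDED_SCORERS set can be used; the helper below is illustrative, not part of the package:

    from judgeval.constants import APIScorer, UNBOUNDED_SCORERS

    def is_bounded(scorer: APIScorer) -> bool:
        # Comparison scores are not restricted to the 0-1 range
        return scorer not in UNBOUNDED_SCORERS

    assert is_bounded(APIScorer.GROUNDEDNESS)
    assert not is_bounded(APIScorer.COMPARISON)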
judgeval/data/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.ground_truth import GroundTruthExample
 
 __all__ = [
     "Example",
@@ -12,4 +13,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "GroundTruthExample",
 ]
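With GroundTruthExample now re-exported from judgeval.data (and the old datasets/ground_truth.py module relocated, per the file list above), downstream imports can switch to the new path:

    # 0.0.19
    from judgeval.data import Example, GroundTruthExample
    # 0.0.17 (old path, removed in this release)
    # from judgeval.data.datasets.ground_truth import GroundTruthExample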
judgeval/data/api_example.py
CHANGED
@@ -1,5 +1,5 @@
-from typing import List, Optional, Dict, Any
-from pydantic import BaseModel,
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
 from judgeval.data.scorer_data import ScorerData
judgeval/data/datasets/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from judgeval.data.datasets.dataset import EvalDataset
-from judgeval.data.datasets.ground_truth import GroundTruthExample
 from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 
-__all__ = ["EvalDataset", "EvalDatasetClient"
+__all__ = ["EvalDataset", "EvalDatasetClient"]
judgeval/data/datasets/dataset.py
CHANGED
@@ -1,13 +1,12 @@
 import ast
 import csv
-import datetime
+import datetime
 import json
-from dataclasses import dataclass, field
 import os
-from
+from dataclasses import dataclass, field
+from typing import List, Union, Literal
 
-from judgeval.data
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -11,9 +11,8 @@ from judgeval.constants import (
     JUDGMENT_DATASETS_EDIT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 from judgeval.data.datasets import EvalDataset
-from judgeval.data.datasets.ground_truth import GroundTruthExample
 
 
 
judgeval/data/datasets/utils.py
CHANGED
@@ -1,7 +1,6 @@
 from typing import List, Optional
 
-from judgeval.data
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 
 
 def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]:
judgeval/data/example.py
CHANGED
@@ -2,17 +2,12 @@
 Classes for representing examples in a dataset.
 """
 
-
-from typing import TypeVar, Optional, Any, Dict, List
+from typing import Optional, Any, Dict, List
 from uuid import uuid4
 from pydantic import BaseModel, Field, field_validator
 from enum import Enum
 from datetime import datetime
-import time
-
 
-Input = TypeVar('Input')
-Output = TypeVar('Output')
 
 class ExampleParams(Enum):
     INPUT = "input"
@@ -23,11 +18,12 @@ class ExampleParams(Enum):
     TOOLS_CALLED = "tools_called"
     EXPECTED_TOOLS = "expected_tools"
     REASONING = "reasoning"
+    ADDITIONAL_METADATA = "additional_metadata"
 
 
 class Example(BaseModel):
-    input:
-    actual_output:
+    input: str
+    actual_output: str
     expected_output: Optional[str] = None
     context: Optional[List[str]] = None
     retrieval_context: Optional[List[str]] = None
@@ -39,22 +35,81 @@ class Example(BaseModel):
     example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
-
-    @field_validator('input', 'actual_output', mode='before')
-    def convert_to_str(cls, value):
-        try:
-            return str(value)
-        except Exception:
-            return repr(value)
 
     def __init__(self, **data):
+        # Check that required fields are provided
+        if 'input' not in data:
+            raise ValueError("Example must be initialized with 'input' field.")
+        if 'actual_output' not in data:
+            raise ValueError("Example must be initialized with 'actual_output' field.")
+
         if 'example_id' not in data:
             data['example_id'] = str(uuid4())
         # Set timestamp if not provided
         if 'timestamp' not in data:
             data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
         super().__init__(**data)
-
+
+    @field_validator('input', mode='before')
+    @classmethod
+    def validate_input(cls, v):
+        if not v or not isinstance(v, str):
+            raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+        return v
+
+    @field_validator('actual_output', mode='before')
+    @classmethod
+    def validate_actual_output(cls, v):
+        if not isinstance(v, str):
+            raise ValueError(f"Actual output must be a string but got '{v}' of type {type(v)}")
+        return v
+
+    @field_validator('expected_output', mode='before')
+    @classmethod
+    def validate_expected_output(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+    @classmethod
+    def validate_string_lists(cls, v, info):
+        field_name = info.field_name
+        if v is not None:
+            if not isinstance(v, list):
+                raise ValueError(f"{field_name} must be a list of strings or None but got {v} of type {type(v)}")
+            for i, item in enumerate(v):
+                if not isinstance(item, str):
+                    raise ValueError(f"All items in {field_name} must be strings but item at index {i} is {item} of type {type(item)}")
+        return v
+
+    @field_validator('additional_metadata', mode='before')
+    @classmethod
+    def validate_additional_metadata(cls, v):
+        if v is not None and not isinstance(v, dict):
+            raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('example_index', mode='before')
+    @classmethod
+    def validate_example_index(cls, v):
+        if v is not None and not isinstance(v, int):
+            raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('timestamp', mode='before')
+    @classmethod
+    def validate_timestamp(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
+        return v
+
+    @field_validator('trace_id', mode='before')
+    @classmethod
+    def validate_trace_id(cls, v):
+        if v is not None and not isinstance(v, str):
+            raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
+        return v
 
     def to_dict(self):
         return {