judgeval 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- judgeval/__init__.py +1 -3
- judgeval/clients.py +0 -6
- judgeval/common/logger.py +0 -1
- judgeval/common/tracer.py +270 -62
- judgeval/common/utils.py +9 -5
- judgeval/constants.py +7 -2
- judgeval/data/__init__.py +2 -0
- judgeval/data/api_example.py +2 -2
- judgeval/data/datasets/__init__.py +1 -2
- judgeval/data/datasets/dataset.py +4 -5
- judgeval/data/datasets/eval_dataset_client.py +11 -7
- judgeval/data/datasets/utils.py +1 -2
- judgeval/data/example.py +72 -17
- judgeval/data/scorer_data.py +1 -1
- judgeval/evaluation_run.py +2 -2
- judgeval/judges/__init__.py +0 -1
- judgeval/judges/base_judge.py +1 -1
- judgeval/judges/mixture_of_judges.py +7 -2
- judgeval/judgment_client.py +16 -8
- judgeval/rules.py +2 -4
- judgeval/run_evaluation.py +8 -8
- judgeval/scorers/__init__.py +6 -0
- judgeval/scorers/api_scorer.py +12 -6
- judgeval/scorers/base_scorer.py +12 -6
- judgeval/scorers/judgeval_scorer.py +7 -3
- judgeval/scorers/judgeval_scorers/__init__.py +24 -3
- judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
- judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
- judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
- judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
- judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
- judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
- judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
- judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
- judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
- judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
- judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
- judgeval/scorers/prompt_scorer.py +7 -5
- judgeval/scorers/utils.py +1 -1
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/METADATA +1 -1
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/RECORD +56 -48
- /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/WHEEL +0 -0
- {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py
CHANGED
@@ -1,12 +1,10 @@
 # Import key components that should be publicly accessible
-from judgeval.clients import client,
+from judgeval.clients import client, together_client
 from judgeval.judgment_client import JudgmentClient
 
 __all__ = [
     # Clients
     'client',
-    'langfuse',
     'together_client',
-
     'JudgmentClient',
 ]
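The Langfuse client is no longer constructed or exported, so the package-level surface in 0.0.18 reduces to the names left in `__all__` above. A minimal import sketch (anything outside these names, such as the old `langfuse` export, would now fail):

    # Valid in 0.0.18; `from judgeval import langfuse` no longer works.
    from judgeval import client, together_client, JudgmentClient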
judgeval/clients.py
CHANGED
@@ -8,12 +8,6 @@ from together import Together, AsyncTogether
 PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
 load_dotenv(dotenv_path=PATH_TO_DOTENV)
 
-# Initialize required clients
-langfuse = Langfuse(
-    secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
-    public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
-    host=os.getenv("LANGFUSE_HOST"),
-)
 
 # Initialize optional OpenAI client
 client: Optional['OpenAI'] = None
judgeval/common/logger.py
CHANGED
judgeval/common/tracer.py
CHANGED
@@ -1,60 +1,68 @@
 """
 Tracing system for judgeval that allows for function tracing using decorators.
 """
-
-import
-import time
+# Standard library imports
+import asyncio
 import functools
-import requests
-import uuid
-from contextlib import contextmanager
-from typing import (
-    Optional,
-    Any,
-    List,
-    Literal,
-    Tuple,
-    Generator,
-    TypeAlias,
-    Union
-)
-from dataclasses import (
-    dataclass,
-    field
-)
-from datetime import datetime
-from openai import OpenAI
-from together import Together
-from anthropic import Anthropic
-from typing import Dict
 import inspect
-import asyncio
 import json
+import os
+import time
+import uuid
 import warnings
-from
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+from datetime import datetime
 from http import HTTPStatus
+from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
+from rich import print as rprint
+from uuid import UUID
+from collections.abc import Sequence
 
+# Third-party imports
 import pika
-import
+import requests
+from pydantic import BaseModel
+from rich import print as rprint
+from openai import OpenAI
+from together import Together
+from anthropic import Anthropic
 
-
+# Local application/library-specific imports
+from judgeval.constants import (
+    JUDGMENT_TRACES_SAVE_API_URL,
+    JUDGMENT_TRACES_FETCH_API_URL,
+    RABBITMQ_HOST,
+    RABBITMQ_PORT,
+    RABBITMQ_QUEUE,
+    JUDGMENT_TRACES_DELETE_API_URL,
+    JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
+)
 from judgeval.judgment_client import JudgmentClient
 from judgeval.data import Example
 from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
 from judgeval.rules import Rule
 from judgeval.evaluation_run import EvaluationRun
-from judgeval.
+from judgeval.data.result import ScoringResult
 
-from
+from langchain_core.language_models import BaseChatModel
+from langchain_huggingface import ChatHuggingFace
+from langchain_openai import ChatOpenAI
+from langchain_anthropic import ChatAnthropic
+from langchain_core.utils.function_calling import convert_to_openai_tool
+from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
+from langchain_core.agents import AgentAction, AgentFinish
+from langchain_core.outputs import LLMResult
 
-from
+from langchain_core.messages.ai import AIMessage
+from langchain_core.messages.tool import ToolMessage
+from langchain_core.messages.base import BaseMessage
+from langchain_core.documents import Document
 
 # Define type aliases for better code readability and maintainability
 ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
 TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
-SpanType = Literal['span', 'tool', 'llm', 'evaluation']
-
-
+SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
 @dataclass
 class TraceEntry:
     """Represents a single trace entry with its visual representation.
@@ -207,7 +215,8 @@ class TraceManagerClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=True
         )
 
         if response.status_code != HTTPStatus.OK:
@@ -231,7 +240,8 @@ class TraceManagerClient:
                 "Content-Type": "application/json",
                 "Authorization": f"Bearer {self.judgment_api_key}",
                 "X-Organization-Id": self.organization_id
-            }
+            },
+            verify=True
         )
 
         if response.status_code == HTTPStatus.BAD_REQUEST:
@@ -417,7 +427,7 @@ class TraceClient:
 
                 # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
                 if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-                    raise ValueError("Cannot use Judgeval scorers
+                    raise ValueError("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
             except Exception as e:
                 warnings.warn(f"Failed to load scorers: {str(e)}")
@@ -453,9 +463,12 @@ class TraceClient:
         if self._current_span:
             duration = time.time() - start_time  # Calculate duration from start_time
 
+            prev_entry = self.entries[-1]
+
+            # Select the last entry in the trace if it's an LLM call, otherwise use the current span
             self.add_entry(TraceEntry(
                 type="evaluation",
-                function=self._current_span,
+                function=prev_entry.function if prev_entry.span_type == "llm" else self._current_span,
                 depth=self.tracer.depth,
                 message=f"Evaluation results for {self._current_span}",
                 timestamp=time.time(),
@@ -529,7 +542,7 @@ class TraceClient:
         active_functions = []  # Stack to track nested function calls
         function_entries = {}  # Store entries for each function
 
-        for entry in entries:
+        for i, entry in enumerate(entries):
             function = entry["function"]
 
             if entry["type"] == "enter":
@@ -551,9 +564,12 @@ class TraceClient:
                 current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"]
                 condensed.append(current_entry)
                 active_functions.remove(function)
-                del function_entries[function]
+                # del function_entries[function]
 
-
+            # The OR condition is to handle the LLM client case.
+            # LLM client is a special case where we exit the span, so when we attach evaluations to it,
+            # we have to check if the previous entry is an LLM call.
+            elif function in active_functions or entry["type"] == "evaluation" and entries[i-1]["function"] == entry["function"]:
                 # Update existing function entry with additional data
                 current_entry = function_entries[function]
 
@@ -568,6 +584,7 @@ class TraceClient:
 
         # Sort by timestamp
         condensed.sort(key=lambda x: x["timestamp"])
+
         return condensed
 
     def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
@@ -579,6 +596,7 @@ class TraceClient:
         total_duration = self.get_duration()
 
         raw_entries = [entry.to_dict() for entry in self.entries]
+
        condensed_entries = self.condense_trace(raw_entries)
 
         # Calculate total token counts from LLM API calls
@@ -617,25 +635,23 @@ class TraceClient:
         }
         # Execute asynchrous evaluation in the background
         if not empty_save:  # Only send to RabbitMQ if the trace is not empty
-
-
-
-
-
-            trace_data["judgment_api_key"] = self.tracer.api_key
-            trace_data["organization_id"] = self.tracer.organization_id
-            channel.basic_publish(
-                exchange='',
-                routing_key=RABBITMQ_QUEUE,
-                body=json.dumps(trace_data),
-                properties=pika.BasicProperties(
-                    delivery_mode=pika.DeliveryMode.Transient,  # Changed from Persistent to Transient
+            # Send trace data to evaluation queue via API
+            try:
+                response = requests.post(
+                    JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
+                    json=trace_data,
                     headers={
-
-
-
-
-
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.tracer.api_key}",
+                        "X-Organization-Id": self.tracer.organization_id
+                    },
+                    verify=True
+                )
+
+                if response.status_code != HTTPStatus.OK:
+                    warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
+            except Exception as e:
+                warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")
 
         self.trace_manager_client.save_trace(trace_data, empty_save)
 
@@ -755,7 +771,7 @@ class Tracer:
                 with trace.span(span_name, span_type=span_type) as span:
                     # Record inputs
                     span.record_input({
-                        'args':
+                        'args': str(args),
                         'kwargs': kwargs
                     })
 
@@ -792,7 +808,7 @@ class Tracer:
                 with trace.span(span_name, span_type=span_type) as span:
                     # Record inputs
                     span.record_input({
-                        'args':
+                        'args': str(args),
                         'kwargs': kwargs
                     })
 
@@ -810,6 +826,28 @@ class Tracer:
             self._current_trace = None
 
         return wrapper
+
+    def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
+        """
+        Decorator to trace function execution with detailed entry/exit information.
+        """
+        if func is None:
+            return lambda f: self.observe(f, name=name, span_type=span_type)
+
+        if asyncio.iscoroutinefunction(func):
+            @functools.wraps(func)
+            async def async_wrapper(*args, **kwargs):
+                if self._current_trace:
+                    self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model=model, log_results=log_results)
+            return async_wrapper
+        else:
+            @functools.wraps(func)
+            def wrapper(*args, **kwargs):
+                if self._current_trace:
+                    self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
+            return wrapper
+
+
 
 def wrap(client: Any) -> Any:
     """
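A hedged usage sketch of the new score decorator. Only the decorator signature above is confirmed by this diff; the Tracer construction and the scorer instance are assumptions, and note that, as committed, the keyword-only call path falls back to self.observe while the direct-function path triggers async_evaluate without invoking the wrapped function:

    from judgeval.common.tracer import Tracer
    from judgeval.scorers import AnswerRelevancyScorer  # assumed exported scorer; swap in any APIJudgmentScorer

    judgment = Tracer()  # assumed constructor; the diff only shows Tracer attributes, not __init__

    @judgment.score(scorers=[AnswerRelevancyScorer(threshold=0.7)], model="gpt-4o-mini", name="answer")
    def answer(question: str) -> str:
        return "..."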
@@ -920,3 +958,173 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
             "total_tokens": response.usage.input_tokens + response.usage.output_tokens
         }
     }
+
+class JudgevalCallbackHandler(BaseCallbackHandler):
+    def __init__(self, trace_client: TraceClient):
+        self.trace_client = trace_client
+        self.openai_count = 1
+
+    def start_span(self, name: str, span_type: SpanType = "span"):
+        start_time = time.time()
+
+        # Record span entry
+        self.trace_client.add_entry(TraceEntry(
+            type="enter",
+            function=name,
+            depth=self.trace_client.tracer.depth,
+            message=name,
+            timestamp=start_time,
+            span_type=span_type
+        ))
+
+        self.trace_client.tracer.depth += 1
+        self.trace_client.prev_span = self.trace_client._current_span
+        self.trace_client._current_span = name
+        self._start_time = start_time
+
+    def end_span(self, name: str, span_type: SpanType = "span"):
+        self.trace_client.tracer.depth -= 1
+        duration = time.time() - self._start_time
+
+        # Record span exit
+        self.trace_client.add_entry(TraceEntry(
+            type="exit",
+            function=name,
+            depth=self.trace_client.tracer.depth,
+            message=f"← {name}",
+            timestamp=time.time(),
+            duration=duration,
+            span_type=span_type
+        ))
+        self.trace_client._current_span = self.trace_client.prev_span
+
+    def on_retriever_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        query: str,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        metadata: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ) -> Any:
+        name = "RETRIEVER_CALL"
+        if serialized and "name" in serialized:
+            name = f"RETRIEVER_{serialized['name'].upper()}"
+
+        self.start_span(name, span_type="retriever")
+        self.trace_client.record_input({
+            'query': query,
+            'tags': tags,
+            'metadata': metadata,
+            'kwargs': kwargs
+        })
+
+    def on_retriever_end(
+        self,
+        documents: Sequence[Document],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any
+    ) -> Any:
+        # Process the retrieved documents into a format suitable for logging
+        doc_summary = []
+        for i, doc in enumerate(documents):
+            # Extract key information from each document
+            doc_data = {
+                "index": i,
+                "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
+                "metadata": doc.metadata
+            }
+            doc_summary.append(doc_data)
+
+        # Record the document data
+        self.trace_client.record_output({
+            "document_count": len(documents),
+            "documents": doc_summary
+        })
+
+        # End the retriever span
+        self.end_span(self.trace_client._current_span, span_type="retriever")
+
+    def on_tool_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        input_str: str,
+        run_id: Optional[UUID] = None,
+        parent_run_id: Optional[UUID] = None,
+        inputs: Optional[dict[str, Any]] = None,
+        **kwargs: Any,
+    ):
+        name = serialized["name"]
+        self.start_span(name, span_type="tool")
+        self.trace_client.record_input({
+            'args': input_str,
+            'kwargs': kwargs
+        })
+
+    def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
+        self.trace_client.record_output(output)
+        self.end_span(self.trace_client._current_span, span_type="tool")
+
+    def on_agent_action (self, action: AgentAction, **kwargs: Any) -> Any:
+        print(f"Agent action: {action}")
+
+    def on_agent_finish(
+        self,
+        finish: AgentFinish,
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        tags: Optional[list[str]] = None,
+        **kwargs: Any,
+    ) -> None:
+        print(f"Agent action: {finish}")
+
+    def on_llm_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        prompts: list[str],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+        name = "LLM call"
+        self.start_span(name, span_type="llm")
+        self.trace_client.record_input({
+            'args': prompts,
+            'kwargs': kwargs
+        })
+
+    def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
+        self.trace_client.record_output(response.generations[0][0].text)
+        self.end_span(self.trace_client._current_span, span_type="llm")
+
+    def on_chat_model_start(
+        self,
+        serialized: Optional[dict[str, Any]],
+        messages: list[list[BaseMessage]],
+        *,
+        run_id: UUID,
+        parent_run_id: Optional[UUID] = None,
+        **kwargs: Any,
+    ) -> Any:
+
+        if "openai" in serialized["id"]:
+            name = f"OPENAI_API_CALL_{self.openai_count}"
+            self.openai_count += 1
+        elif "anthropic" in serialized["id"]:
+            name = "ANTHROPIC_API_CALL"
+        elif "together" in serialized["id"]:
+            name = "TOGETHER_API_CALL"
+        else:
+            name = "LLM call"
+
+        self.start_span(name, span_type="llm")
+        self.trace_client.record_input({
+            'args': str(messages),
+            'kwargs': kwargs
+        })
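A brief usage sketch for the new LangChain handler. Only the JudgevalCallbackHandler(trace_client) constructor and the imports are confirmed by this diff; how the active TraceClient is obtained, and the model choice, are assumptions:

    from judgeval.common.tracer import JudgevalCallbackHandler
    from langchain_openai import ChatOpenAI

    # `trace_client` is assumed to be an active TraceClient managed by a judgeval Tracer.
    handler = JudgevalCallbackHandler(trace_client)

    # Standard LangChain callback wiring: pass the handler per invocation so that
    # chat-model, tool, and retriever events are recorded as trace entries.
    llm = ChatOpenAI(model="gpt-4o-mini")
    llm.invoke("Summarize the release notes.", config={"callbacks": [handler]})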
judgeval/common/utils.py
CHANGED
@@ -8,15 +8,19 @@ For API calling, we support:
 NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is an asynchronous function
 """
 
-
-from typing import List, Mapping, Dict, Union, Optional, Literal, Any
+# Standard library imports
 import asyncio
+import concurrent.futures
+import os
+import pprint
+from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+
+# Third-party imports
 import litellm
 import pydantic
-import
-import os
-from dotenv import load_dotenv
+from dotenv import load_dotenv
 
+# Local application/library-specific imports
 from judgeval.clients import async_together_client, together_client
 from judgeval.constants import *
 from judgeval.common.logger import debug, error
judgeval/constants.py
CHANGED
@@ -21,8 +21,11 @@ class APIScorer(str, Enum):
     CONTEXTUAL_RECALL = "contextual_recall"
     CONTEXTUAL_RELEVANCY = "contextual_relevancy"
     CONTEXTUAL_PRECISION = "contextual_precision"
+    INSTRUCTION_ADHERENCE = "instruction_adherence"
     TOOL_CORRECTNESS = "tool_correctness"
     JSON_CORRECTNESS = "json_correctness"
+    COMPARISON = "comparison"
+    GROUNDEDNESS = "groundedness"
 
     @classmethod
     def _missing_(cls, value):
@@ -31,6 +34,8 @@ class APIScorer(str, Enum):
             if member.value == value.lower():
                 return member
 
+UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not bounded between 0-1
+
 ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 # API URLs
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
@@ -46,7 +51,7 @@ JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_proje
 JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
 JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
 JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-
+JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_eval_queue/"
 # RabbitMQ
 RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
 RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
@@ -110,7 +115,7 @@ TOGETHER_SUPPORTED_MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.1"
 ]
 
-JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
 ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 
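A small sketch of how the new scorer constants behave, based only on the enum and set shown above (the case-insensitive lookup comes from the existing _missing_ hook):

    from judgeval.constants import APIScorer, UNBOUNDED_SCORERS

    scorer = APIScorer("Groundedness")                  # case-insensitive -> APIScorer.GROUNDEDNESS
    print(scorer in UNBOUNDED_SCORERS)                  # False: groundedness stays within [0, 1]
    print(APIScorer.COMPARISON in UNBOUNDED_SCORERS)    # True: comparison scores are not bounded to [0, 1]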
judgeval/data/__init__.py
CHANGED
@@ -2,6 +2,7 @@ from judgeval.data.example import Example, ExampleParams
 from judgeval.data.api_example import ProcessExample, create_process_example
 from judgeval.data.scorer_data import ScorerData, create_scorer_data
 from judgeval.data.result import ScoringResult, generate_scoring_result
+from judgeval.data.ground_truth import GroundTruthExample
 
 __all__ = [
     "Example",
@@ -12,4 +13,5 @@ __all__ = [
     "create_scorer_data",
     "ScoringResult",
     "generate_scoring_result",
+    "GroundTruthExample",
 ]
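GroundTruthExample now lives at judgeval/data/ground_truth.py (see the file move listed near the top of this diff) and is re-exported from the data package, so the import below replaces the old judgeval.data.datasets.ground_truth path:

    from judgeval.data import Example, GroundTruthExample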
judgeval/data/api_example.py
CHANGED
@@ -1,5 +1,5 @@
-from typing import List, Optional, Dict, Any
-from pydantic import BaseModel,
+from typing import List, Optional, Dict, Any
+from pydantic import BaseModel, ConfigDict, model_validator
 
 from judgeval.data.example import Example
 from judgeval.data.scorer_data import ScorerData
judgeval/data/datasets/__init__.py
CHANGED
@@ -1,5 +1,4 @@
 from judgeval.data.datasets.dataset import EvalDataset
-from judgeval.data.datasets.ground_truth import GroundTruthExample
 from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 
-__all__ = ["EvalDataset", "EvalDatasetClient"
+__all__ = ["EvalDataset", "EvalDatasetClient"]
judgeval/data/datasets/dataset.py
CHANGED
@@ -1,13 +1,12 @@
 import ast
 import csv
-import datetime
+import datetime
 import json
-from dataclasses import dataclass, field
 import os
-from
+from dataclasses import dataclass, field
+from typing import List, Union, Literal
 
-from judgeval.data
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 from judgeval.common.logger import debug, error, warning, info
 
 @dataclass
judgeval/data/datasets/eval_dataset_client.py
CHANGED
@@ -11,9 +11,8 @@ from judgeval.constants import (
     JUDGMENT_DATASETS_EDIT_API_URL,
     JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 from judgeval.data.datasets import EvalDataset
-from judgeval.data.datasets.ground_truth import GroundTruthExample
 
 
 
@@ -68,7 +67,8 @@ class EvalDatasetClient:
                     "Content-Type": "application/json",
                     "Authorization": f"Bearer {self.judgment_api_key}",
                     "X-Organization-Id": self.organization_id
-                }
+                },
+                verify=True
             )
             if response.status_code == 500:
                 error(f"Server error during push: {content.get('message')}")
@@ -132,7 +132,8 @@ class EvalDatasetClient:
                     "Content-Type": "application/json",
                     "Authorization": f"Bearer {self.judgment_api_key}",
                     "X-Organization-Id": self.organization_id
-                }
+                },
+                verify=True
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -190,7 +191,8 @@ class EvalDatasetClient:
                     "Content-Type": "application/json",
                     "Authorization": f"Bearer {self.judgment_api_key}",
                     "X-Organization-Id": self.organization_id
-                }
+                },
+                verify=True
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -243,7 +245,8 @@ class EvalDatasetClient:
                     "Content-Type": "application/json",
                     "Authorization": f"Bearer {self.judgment_api_key}",
                     "X-Organization-Id": self.organization_id
-                }
+                },
+                verify=True
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -274,7 +277,8 @@ class EvalDatasetClient:
                     "Authorization": f"Bearer {self.judgment_api_key}",
                     "X-Organization-Id": self.organization_id
                 },
-                stream=True
+                stream=True,
+                verify=True
             )
             response.raise_for_status()
         except requests.exceptions.HTTPError as err:
judgeval/data/datasets/utils.py
CHANGED
@@ -1,7 +1,6 @@
 from typing import List, Optional
 
-from judgeval.data
-from judgeval.data import Example
+from judgeval.data import Example, GroundTruthExample
 
 
 def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]: