judgeval 0.0.17__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. judgeval/__init__.py +1 -3
  2. judgeval/clients.py +0 -6
  3. judgeval/common/logger.py +0 -1
  4. judgeval/common/tracer.py +250 -42
  5. judgeval/common/utils.py +9 -5
  6. judgeval/constants.py +6 -1
  7. judgeval/data/__init__.py +2 -0
  8. judgeval/data/api_example.py +2 -2
  9. judgeval/data/datasets/__init__.py +1 -2
  10. judgeval/data/datasets/dataset.py +4 -5
  11. judgeval/data/datasets/eval_dataset_client.py +1 -2
  12. judgeval/data/datasets/utils.py +1 -2
  13. judgeval/data/example.py +72 -17
  14. judgeval/data/scorer_data.py +1 -1
  15. judgeval/evaluation_run.py +2 -2
  16. judgeval/judges/__init__.py +0 -1
  17. judgeval/judges/base_judge.py +1 -1
  18. judgeval/judges/mixture_of_judges.py +7 -2
  19. judgeval/judgment_client.py +8 -4
  20. judgeval/rules.py +2 -4
  21. judgeval/run_evaluation.py +2 -5
  22. judgeval/scorers/__init__.py +6 -0
  23. judgeval/scorers/api_scorer.py +12 -6
  24. judgeval/scorers/base_scorer.py +12 -6
  25. judgeval/scorers/judgeval_scorer.py +7 -3
  26. judgeval/scorers/judgeval_scorers/__init__.py +24 -3
  27. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
  28. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
  29. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
  30. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
  31. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
  32. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
  36. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
  42. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
  43. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
  44. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
  45. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
  48. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
  49. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
  50. judgeval/scorers/prompt_scorer.py +7 -5
  51. judgeval/scorers/utils.py +1 -1
  52. {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/METADATA +1 -1
  53. {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/RECORD +56 -48
  54. /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
  55. {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/WHEEL +0 -0
  56. {judgeval-0.0.17.dist-info → judgeval-0.0.18.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -1,12 +1,10 @@
  # Import key components that should be publicly accessible
- from judgeval.clients import client, langfuse, together_client
+ from judgeval.clients import client, together_client
  from judgeval.judgment_client import JudgmentClient
 
  __all__ = [
      # Clients
      'client',
-     'langfuse',
      'together_client',
-
      'JudgmentClient',
  ]
judgeval/clients.py CHANGED
@@ -8,12 +8,6 @@ from together import Together, AsyncTogether
  PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
  load_dotenv(dotenv_path=PATH_TO_DOTENV)
 
- # Initialize required clients
- langfuse = Langfuse(
-     secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
-     public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
-     host=os.getenv("LANGFUSE_HOST"),
- )
 
  # Initialize optional OpenAI client
  client: Optional['OpenAI'] = None
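Note: with the Langfuse client removed from judgeval.clients, downstream code that relied on the exported langfuse object has to create its own instance. A minimal migration sketch, assuming the optional langfuse package is installed and the same environment variables are set (illustrative, not part of the package):

    # Illustrative migration sketch (not from the package): recreate the client
    # that judgeval.clients no longer exports, using the same env vars as before.
    import os
    from langfuse import Langfuse

    langfuse = Langfuse(
        secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
        public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
        host=os.getenv("LANGFUSE_HOST"),
    )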
judgeval/common/logger.py CHANGED
@@ -2,7 +2,6 @@ import logging
  from logging.handlers import RotatingFileHandler
  import sys
  from pathlib import Path
- from datetime import datetime
  from contextlib import contextmanager
 
  # Global variables
judgeval/common/tracer.py CHANGED
@@ -1,60 +1,68 @@
  """
  Tracing system for judgeval that allows for function tracing using decorators.
  """
-
- import os
- import time
+ # Standard library imports
+ import asyncio
  import functools
- import requests
- import uuid
- from contextlib import contextmanager
- from typing import (
-     Optional,
-     Any,
-     List,
-     Literal,
-     Tuple,
-     Generator,
-     TypeAlias,
-     Union
- )
- from dataclasses import (
-     dataclass,
-     field
- )
- from datetime import datetime
- from openai import OpenAI
- from together import Together
- from anthropic import Anthropic
- from typing import Dict
  import inspect
- import asyncio
  import json
+ import os
+ import time
+ import uuid
  import warnings
- from pydantic import BaseModel
+ from contextlib import contextmanager
+ from dataclasses import dataclass, field
+ from datetime import datetime
  from http import HTTPStatus
+ from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
+ from rich import print as rprint
+ from uuid import UUID
+ from collections.abc import Sequence
 
+ # Third-party imports
  import pika
- import os
+ import requests
+ from pydantic import BaseModel
+ from rich import print as rprint
+ from openai import OpenAI
+ from together import Together
+ from anthropic import Anthropic
 
- from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE, JUDGMENT_TRACES_DELETE_API_URL,JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
+ # Local application/library-specific imports
+ from judgeval.constants import (
+     JUDGMENT_TRACES_SAVE_API_URL,
+     JUDGMENT_TRACES_FETCH_API_URL,
+     RABBITMQ_HOST,
+     RABBITMQ_PORT,
+     RABBITMQ_QUEUE,
+     JUDGMENT_TRACES_DELETE_API_URL,
+     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
+ )
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
  from judgeval.rules import Rule
  from judgeval.evaluation_run import EvaluationRun
- from judgeval.judges import JudgevalJudge
+ from judgeval.data.result import ScoringResult
 
- from rich import print as rprint
+ from langchain_core.language_models import BaseChatModel
+ from langchain_huggingface import ChatHuggingFace
+ from langchain_openai import ChatOpenAI
+ from langchain_anthropic import ChatAnthropic
+ from langchain_core.utils.function_calling import convert_to_openai_tool
+ from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
+ from langchain_core.agents import AgentAction, AgentFinish
+ from langchain_core.outputs import LLMResult
 
- from judgeval.data.result import ScoringResult
+ from langchain_core.messages.ai import AIMessage
+ from langchain_core.messages.tool import ToolMessage
+ from langchain_core.messages.base import BaseMessage
+ from langchain_core.documents import Document
 
  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
  TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
- SpanType = Literal['span', 'tool', 'llm', 'evaluation']
-
-
+ SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
  @dataclass
  class TraceEntry:
      """Represents a single trace entry with its visual representation.
@@ -419,7 +427,7 @@ class TraceClient:
 
          # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
          if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-             raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+             raise ValueError("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.")
 
      except Exception as e:
          warnings.warn(f"Failed to load scorers: {str(e)}")
@@ -455,9 +463,12 @@ class TraceClient:
          if self._current_span:
              duration = time.time() - start_time  # Calculate duration from start_time
 
+             prev_entry = self.entries[-1]
+
+             # Select the last entry in the trace if it's an LLM call, otherwise use the current span
              self.add_entry(TraceEntry(
                  type="evaluation",
-                 function=self._current_span,
+                 function=prev_entry.function if prev_entry.span_type == "llm" else self._current_span,
                  depth=self.tracer.depth,
                  message=f"Evaluation results for {self._current_span}",
                  timestamp=time.time(),
@@ -531,7 +542,7 @@ class TraceClient:
          active_functions = []  # Stack to track nested function calls
          function_entries = {}  # Store entries for each function
 
-         for entry in entries:
+         for i, entry in enumerate(entries):
              function = entry["function"]
 
              if entry["type"] == "enter":
@@ -553,9 +564,12 @@ class TraceClient:
                  current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"]
                  condensed.append(current_entry)
                  active_functions.remove(function)
-                 del function_entries[function]
+                 # del function_entries[function]
 
-             elif function in active_functions:
+             # The OR condition is to handle the LLM client case.
+             # LLM client is a special case where we exit the span, so when we attach evaluations to it,
+             # we have to check if the previous entry is an LLM call.
+             elif function in active_functions or entry["type"] == "evaluation" and entries[i-1]["function"] == entry["function"]:
                  # Update existing function entry with additional data
                  current_entry = function_entries[function]
@@ -570,6 +584,7 @@ class TraceClient:
 
          # Sort by timestamp
          condensed.sort(key=lambda x: x["timestamp"])
+
          return condensed
 
      def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
581
596
  total_duration = self.get_duration()
582
597
 
583
598
  raw_entries = [entry.to_dict() for entry in self.entries]
599
+
584
600
  condensed_entries = self.condense_trace(raw_entries)
585
601
 
586
602
  # Calculate total token counts from LLM API calls
@@ -755,7 +771,7 @@ class Tracer:
              with trace.span(span_name, span_type=span_type) as span:
                  # Record inputs
                  span.record_input({
-                     'args': list(args),
+                     'args': str(args),
                      'kwargs': kwargs
                  })
 
@@ -792,7 +808,7 @@ class Tracer:
              with trace.span(span_name, span_type=span_type) as span:
                  # Record inputs
                  span.record_input({
-                     'args': list(args),
+                     'args': str(args),
                      'kwargs': kwargs
                  })
 
@@ -810,6 +826,28 @@ class Tracer:
              self._current_trace = None
 
          return wrapper
+
+     def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
+         """
+         Decorator to trace function execution with detailed entry/exit information.
+         """
+         if func is None:
+             return lambda f: self.observe(f, name=name, span_type=span_type)
+
+         if asyncio.iscoroutinefunction(func):
+             @functools.wraps(func)
+             async def async_wrapper(*args, **kwargs):
+                 if self._current_trace:
+                     self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model=model, log_results=log_results)
+             return async_wrapper
+         else:
+             @functools.wraps(func)
+             def wrapper(*args, **kwargs):
+                 if self._current_trace:
+                     self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
+             return wrapper
+
+
 
  def wrap(client: Any) -> Any:
      """
@@ -920,3 +958,173 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
                  "total_tokens": response.usage.input_tokens + response.usage.output_tokens
              }
          }
+
+ class JudgevalCallbackHandler(BaseCallbackHandler):
+     def __init__(self, trace_client: TraceClient):
+         self.trace_client = trace_client
+         self.openai_count = 1
+
+     def start_span(self, name: str, span_type: SpanType = "span"):
+         start_time = time.time()
+
+         # Record span entry
+         self.trace_client.add_entry(TraceEntry(
+             type="enter",
+             function=name,
+             depth=self.trace_client.tracer.depth,
+             message=name,
+             timestamp=start_time,
+             span_type=span_type
+         ))
+
+         self.trace_client.tracer.depth += 1
+         self.trace_client.prev_span = self.trace_client._current_span
+         self.trace_client._current_span = name
+         self._start_time = start_time
+
+     def end_span(self, name: str, span_type: SpanType = "span"):
+         self.trace_client.tracer.depth -= 1
+         duration = time.time() - self._start_time
+
+         # Record span exit
+         self.trace_client.add_entry(TraceEntry(
+             type="exit",
+             function=name,
+             depth=self.trace_client.tracer.depth,
+             message=f"← {name}",
+             timestamp=time.time(),
+             duration=duration,
+             span_type=span_type
+         ))
+         self.trace_client._current_span = self.trace_client.prev_span
+
+     def on_retriever_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         query: str,
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         tags: Optional[list[str]] = None,
+         metadata: Optional[dict[str, Any]] = None,
+         **kwargs: Any,
+     ) -> Any:
+         name = "RETRIEVER_CALL"
+         if serialized and "name" in serialized:
+             name = f"RETRIEVER_{serialized['name'].upper()}"
+
+         self.start_span(name, span_type="retriever")
+         self.trace_client.record_input({
+             'query': query,
+             'tags': tags,
+             'metadata': metadata,
+             'kwargs': kwargs
+         })
+
+     def on_retriever_end(
+         self,
+         documents: Sequence[Document],
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         **kwargs: Any
+     ) -> Any:
+         # Process the retrieved documents into a format suitable for logging
+         doc_summary = []
+         for i, doc in enumerate(documents):
+             # Extract key information from each document
+             doc_data = {
+                 "index": i,
+                 "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
+                 "metadata": doc.metadata
+             }
+             doc_summary.append(doc_data)
+
+         # Record the document data
+         self.trace_client.record_output({
+             "document_count": len(documents),
+             "documents": doc_summary
+         })
+
+         # End the retriever span
+         self.end_span(self.trace_client._current_span, span_type="retriever")
+
+     def on_tool_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         input_str: str,
+         run_id: Optional[UUID] = None,
+         parent_run_id: Optional[UUID] = None,
+         inputs: Optional[dict[str, Any]] = None,
+         **kwargs: Any,
+     ):
+         name = serialized["name"]
+         self.start_span(name, span_type="tool")
+         self.trace_client.record_input({
+             'args': input_str,
+             'kwargs': kwargs
+         })
+
+     def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
+         self.trace_client.record_output(output)
+         self.end_span(self.trace_client._current_span, span_type="tool")
+
+     def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
+         print(f"Agent action: {action}")
+
+     def on_agent_finish(
+         self,
+         finish: AgentFinish,
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         tags: Optional[list[str]] = None,
+         **kwargs: Any,
+     ) -> None:
+         print(f"Agent action: {finish}")
+
+     def on_llm_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         prompts: list[str],
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         **kwargs: Any,
+     ) -> Any:
+         name = "LLM call"
+         self.start_span(name, span_type="llm")
+         self.trace_client.record_input({
+             'args': prompts,
+             'kwargs': kwargs
+         })
+
+     def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
+         self.trace_client.record_output(response.generations[0][0].text)
+         self.end_span(self.trace_client._current_span, span_type="llm")
+
+     def on_chat_model_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         messages: list[list[BaseMessage]],
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         **kwargs: Any,
+     ) -> Any:
+
+         if "openai" in serialized["id"]:
+             name = f"OPENAI_API_CALL_{self.openai_count}"
+             self.openai_count += 1
+         elif "anthropic" in serialized["id"]:
+             name = "ANTHROPIC_API_CALL"
+         elif "together" in serialized["id"]:
+             name = "TOGETHER_API_CALL"
+         else:
+             name = "LLM call"
+
+         self.start_span(name, span_type="llm")
+         self.trace_client.record_input({
+             'args': str(messages),
+             'kwargs': kwargs
+         })
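Illustrative usage sketch for the new JudgevalCallbackHandler (not part of the diff): since it subclasses LangChain's BaseCallbackHandler, it would typically be passed through a runnable's callbacks config. The trace variable below is assumed to be an already-started TraceClient from a judgeval Tracer (its construction is omitted), and ChatOpenAI credentials are assumed to be configured.

    # Hypothetical wiring sketch: attach the handler so LangChain callbacks are
    # recorded as judgeval trace entries.
    from langchain_openai import ChatOpenAI
    from judgeval.common.tracer import JudgevalCallbackHandler

    handler = JudgevalCallbackHandler(trace_client=trace)  # `trace` assumed to exist

    llm = ChatOpenAI(model="gpt-4o-mini")
    response = llm.invoke(
        "Summarize the judgeval 0.0.18 changes in one sentence.",
        config={"callbacks": [handler]},  # on_chat_model_start / on_llm_end fire here
    )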
judgeval/common/utils.py CHANGED
@@ -8,15 +8,19 @@ For API calling, we support:
  NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is an asynchronous function
  """
 
- import concurrent.futures
- from typing import List, Mapping, Dict, Union, Optional, Literal, Any
+ # Standard library imports
  import asyncio
+ import concurrent.futures
+ import os
+ import pprint
+ from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+
+ # Third-party imports
  import litellm
  import pydantic
- import pprint
- import os
- from dotenv import load_dotenv
+ from dotenv import load_dotenv
 
+ # Local application/library-specific imports
  from judgeval.clients import async_together_client, together_client
  from judgeval.constants import *
  from judgeval.common.logger import debug, error
judgeval/constants.py CHANGED
@@ -21,8 +21,11 @@ class APIScorer(str, Enum):
      CONTEXTUAL_RECALL = "contextual_recall"
      CONTEXTUAL_RELEVANCY = "contextual_relevancy"
      CONTEXTUAL_PRECISION = "contextual_precision"
+     INSTRUCTION_ADHERENCE = "instruction_adherence"
      TOOL_CORRECTNESS = "tool_correctness"
      JSON_CORRECTNESS = "json_correctness"
+     COMPARISON = "comparison"
+     GROUNDEDNESS = "groundedness"
 
      @classmethod
      def _missing_(cls, value):
@@ -31,6 +34,8 @@ class APIScorer(str, Enum):
              if member.value == value.lower():
                  return member
 
+ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not bounded between 0-1
+
  ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  # API URLs
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
@@ -110,7 +115,7 @@ TOGETHER_SUPPORTED_MODELS = [
      "mistralai/Mistral-7B-Instruct-v0.1"
  ]
 
- JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+ JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}
 
  ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS
 
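Illustrative note on the new constants (not part of the diff): UNBOUNDED_SCORERS marks scorers such as APIScorer.COMPARISON whose scores are not confined to the 0-1 range, so threshold validation would presumably branch on membership in that set. A hedged sketch of such a check, where check_threshold is a hypothetical helper:

    # Hypothetical helper: skip the 0-1 bound check for unbounded scorers.
    from judgeval.constants import APIScorer, UNBOUNDED_SCORERS

    def check_threshold(scorer: APIScorer, threshold: float) -> None:
        if scorer in UNBOUNDED_SCORERS:
            return  # e.g. comparison scores may legitimately exceed 1.0
        if not 0.0 <= threshold <= 1.0:
            raise ValueError(f"Threshold for {scorer.value} must be between 0 and 1, got {threshold}")

    check_threshold(APIScorer.GROUNDEDNESS, 0.8)  # bounded scorer, checked
    check_threshold(APIScorer.COMPARISON, 3.0)    # unbounded scorer, allowed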
judgeval/data/__init__.py CHANGED
@@ -2,6 +2,7 @@ from judgeval.data.example import Example, ExampleParams
  from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
+ from judgeval.data.ground_truth import GroundTruthExample
 
  __all__ = [
      "Example",
@@ -12,4 +13,5 @@ __all__ = [
      "create_scorer_data",
      "ScoringResult",
      "generate_scoring_result",
+     "GroundTruthExample",
  ]
judgeval/data/api_example.py CHANGED
@@ -1,5 +1,5 @@
- from typing import List, Optional, Dict, Any, Union
- from pydantic import BaseModel, Field, ConfigDict, model_validator
+ from typing import List, Optional, Dict, Any
+ from pydantic import BaseModel, ConfigDict, model_validator
 
  from judgeval.data.example import Example
  from judgeval.data.scorer_data import ScorerData
judgeval/data/datasets/__init__.py CHANGED
@@ -1,5 +1,4 @@
  from judgeval.data.datasets.dataset import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample
  from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient
 
- __all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
+ __all__ = ["EvalDataset", "EvalDatasetClient"]
judgeval/data/datasets/dataset.py CHANGED
@@ -1,13 +1,12 @@
  import ast
  import csv
- import datetime
+ import datetime
  import json
- from dataclasses import dataclass, field
  import os
- from typing import List, Optional, Union, Literal
+ from dataclasses import dataclass, field
+ from typing import List, Union, Literal
 
- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data import Example
+ from judgeval.data import Example, GroundTruthExample
  from judgeval.common.logger import debug, error, warning, info
 
  @dataclass
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -11,9 +11,8 @@ from judgeval.constants import (
      JUDGMENT_DATASETS_EDIT_API_URL,
      JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
- from judgeval.data import Example
+ from judgeval.data import Example, GroundTruthExample
  from judgeval.data.datasets import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample
 
 
 
judgeval/data/datasets/utils.py CHANGED
@@ -1,7 +1,6 @@
  from typing import List, Optional
 
- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data import Example
+ from judgeval.data import Example, GroundTruthExample
 
 
  def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]:
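Taken together with file 54 in the list above (ground_truth.py moves from judgeval/data/datasets/ to judgeval/data/), these hunks imply the following import migration for downstream code (illustrative, not from the diff):

    # 0.0.17:
    # from judgeval.data.datasets.ground_truth import GroundTruthExample
    # 0.0.18:
    from judgeval.data import Example, GroundTruthExample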
judgeval/data/example.py CHANGED
@@ -2,17 +2,12 @@
  Classes for representing examples in a dataset.
  """
 
-
- from typing import TypeVar, Optional, Any, Dict, List
+ from typing import Optional, Any, Dict, List
  from uuid import uuid4
- from pydantic import BaseModel, Field, field_validator
+ from pydantic import BaseModel, Field
  from enum import Enum
  from datetime import datetime
- import time
-
 
-
- Input = TypeVar('Input')
- Output = TypeVar('Output')
 
  class ExampleParams(Enum):
      INPUT = "input"
@@ -23,11 +18,12 @@ class ExampleParams(Enum):
      TOOLS_CALLED = "tools_called"
      EXPECTED_TOOLS = "expected_tools"
      REASONING = "reasoning"
+     ADDITIONAL_METADATA = "additional_metadata"
 
 
  class Example(BaseModel):
-     input: Input
-     actual_output: Output
+     input: str
+     actual_output: str
      expected_output: Optional[str] = None
      context: Optional[List[str]] = None
      retrieval_context: Optional[List[str]] = None
@@ -39,22 +35,81 @@ class Example(BaseModel):
      example_index: Optional[int] = None
      timestamp: Optional[str] = None
      trace_id: Optional[str] = None
-
-     @field_validator('input', 'actual_output', mode='before')
-     def convert_to_str(cls, value):
-         try:
-             return str(value)
-         except Exception:
-             return repr(value)
 
      def __init__(self, **data):
+         # Check that required fields are provided
+         if 'input' not in data:
+             raise ValueError("Example must be initialized with 'input' field.")
+         if 'actual_output' not in data:
+             raise ValueError("Example must be initialized with 'actual_output' field.")
+
          if 'example_id' not in data:
              data['example_id'] = str(uuid4())
          # Set timestamp if not provided
          if 'timestamp' not in data:
              data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
          super().__init__(**data)
-
+
+     @field_validator('input', mode='before')
+     @classmethod
+     def validate_input(cls, v):
+         if not v or not isinstance(v, str):
+             raise ValueError(f"Input must be a non-empty string but got '{v}' of type {type(v)}")
+         return v
+
+     @field_validator('actual_output', mode='before')
+     @classmethod
+     def validate_actual_output(cls, v):
+         if not isinstance(v, str):
+             raise ValueError(f"Actual output must be a string but got '{v}' of type {type(v)}")
+         return v
+
+     @field_validator('expected_output', mode='before')
+     @classmethod
+     def validate_expected_output(cls, v):
+         if v is not None and not isinstance(v, str):
+             raise ValueError(f"Expected output must be a string or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('context', 'retrieval_context', 'tools_called', 'expected_tools', mode='before')
+     @classmethod
+     def validate_string_lists(cls, v, info):
+         field_name = info.field_name
+         if v is not None:
+             if not isinstance(v, list):
+                 raise ValueError(f"{field_name} must be a list of strings or None but got {v} of type {type(v)}")
+             for i, item in enumerate(v):
+                 if not isinstance(item, str):
+                     raise ValueError(f"All items in {field_name} must be strings but item at index {i} is {item} of type {type(item)}")
+         return v
+
+     @field_validator('additional_metadata', mode='before')
+     @classmethod
+     def validate_additional_metadata(cls, v):
+         if v is not None and not isinstance(v, dict):
+             raise ValueError(f"Additional metadata must be a dictionary or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('example_index', mode='before')
+     @classmethod
+     def validate_example_index(cls, v):
+         if v is not None and not isinstance(v, int):
+             raise ValueError(f"Example index must be an integer or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('timestamp', mode='before')
+     @classmethod
+     def validate_timestamp(cls, v):
+         if v is not None and not isinstance(v, str):
+             raise ValueError(f"Timestamp must be a string or None but got {v} of type {type(v)}")
+         return v
+
+     @field_validator('trace_id', mode='before')
+     @classmethod
+     def validate_trace_id(cls, v):
+         if v is not None and not isinstance(v, str):
+             raise ValueError(f"Trace ID must be a string or None but got {v} of type {type(v)}")
+         return v
 
      def to_dict(self):
          return {
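Illustrative sketch of the stricter Example validation (not part of the diff): input and actual_output are now declared as str and validated rather than coerced via str(), so non-string values raise instead of being silently converted. The values below are hypothetical.

    from judgeval.data import Example

    ok = Example(
        input="What is the capital of France?",
        actual_output="Paris",
        expected_output="Paris",                      # optional, must be str or None
        retrieval_context=["Paris is the capital."],  # optional, must be a list of str
    )

    try:
        Example(input=42, actual_output="Paris")      # 0.0.17 coerced this to "42"
    except ValueError as e:
        print(e)  # validation error citing "Input must be a non-empty string ..."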
judgeval/data/scorer_data.py CHANGED
@@ -5,7 +5,7 @@ ScorerData holds the information related to a single, completed Scorer evaluation
  """
 
  from typing import List, Union, Optional, Dict
- from pydantic import BaseModel, Field
+ from pydantic import BaseModel
 
  from judgeval.scorers import JudgevalScorer