judgeval 0.0.16__py3-none-any.whl → 0.0.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. judgeval/__init__.py +1 -3
  2. judgeval/clients.py +0 -6
  3. judgeval/common/logger.py +0 -1
  4. judgeval/common/tracer.py +270 -62
  5. judgeval/common/utils.py +9 -5
  6. judgeval/constants.py +7 -2
  7. judgeval/data/__init__.py +2 -0
  8. judgeval/data/api_example.py +2 -2
  9. judgeval/data/datasets/__init__.py +1 -2
  10. judgeval/data/datasets/dataset.py +4 -5
  11. judgeval/data/datasets/eval_dataset_client.py +11 -7
  12. judgeval/data/datasets/utils.py +1 -2
  13. judgeval/data/example.py +72 -17
  14. judgeval/data/scorer_data.py +1 -1
  15. judgeval/evaluation_run.py +2 -2
  16. judgeval/judges/__init__.py +0 -1
  17. judgeval/judges/base_judge.py +1 -1
  18. judgeval/judges/mixture_of_judges.py +7 -2
  19. judgeval/judgment_client.py +16 -8
  20. judgeval/rules.py +2 -4
  21. judgeval/run_evaluation.py +8 -8
  22. judgeval/scorers/__init__.py +6 -0
  23. judgeval/scorers/api_scorer.py +12 -6
  24. judgeval/scorers/base_scorer.py +12 -6
  25. judgeval/scorers/judgeval_scorer.py +7 -3
  26. judgeval/scorers/judgeval_scorers/__init__.py +24 -3
  27. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +6 -0
  28. judgeval/scorers/judgeval_scorers/api_scorers/comparison.py +35 -0
  29. judgeval/scorers/judgeval_scorers/api_scorers/groundedness.py +19 -0
  30. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +19 -0
  31. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +4 -1
  32. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -1
  33. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +2 -2
  34. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +7 -6
  35. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +2 -2
  36. judgeval/scorers/judgeval_scorers/local_implementations/comparison/__init__.py +0 -0
  37. judgeval/scorers/judgeval_scorers/local_implementations/comparison/comparison_scorer.py +161 -0
  38. judgeval/scorers/judgeval_scorers/local_implementations/comparison/prompts.py +222 -0
  39. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +2 -2
  40. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +2 -2
  41. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +2 -2
  42. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +1 -8
  43. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +7 -6
  44. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +2 -2
  45. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/instruction_adherence.py +232 -0
  46. judgeval/scorers/judgeval_scorers/local_implementations/instruction_adherence/prompt.py +102 -0
  47. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +7 -7
  48. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +7 -6
  49. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +1 -2
  50. judgeval/scorers/prompt_scorer.py +7 -5
  51. judgeval/scorers/utils.py +1 -1
  52. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/METADATA +1 -1
  53. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/RECORD +56 -48
  54. /judgeval/data/{datasets/ground_truth.py → ground_truth.py} +0 -0
  55. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/WHEEL +0 -0
  56. {judgeval-0.0.16.dist-info → judgeval-0.0.18.dist-info}/licenses/LICENSE.md +0 -0
judgeval/__init__.py CHANGED
@@ -1,12 +1,10 @@
  # Import key components that should be publicly accessible
- from judgeval.clients import client, langfuse, together_client
+ from judgeval.clients import client, together_client
  from judgeval.judgment_client import JudgmentClient

  __all__ = [
      # Clients
      'client',
-     'langfuse',
      'together_client',
-
      'JudgmentClient',
  ]
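Note: with the Langfuse client removed, the public surface is now just client, together_client, and JudgmentClient. A minimal sketch of the updated imports follows; constructing JudgmentClient with no arguments is an assumption here, not something shown in this diff:

from judgeval import client, together_client, JudgmentClient

judgment = JudgmentClient()  # assumption: the API key is picked up from the environment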
judgeval/clients.py CHANGED
@@ -8,12 +8,6 @@ from together import Together, AsyncTogether
  PATH_TO_DOTENV = os.path.join(os.path.dirname(__file__), ".env")
  load_dotenv(dotenv_path=PATH_TO_DOTENV)

- # Initialize required clients
- langfuse = Langfuse(
-     secret_key=os.getenv("LANGFUSE_SECRET_KEY"),
-     public_key=os.getenv("LANGFUSE_PUBLIC_KEY"),
-     host=os.getenv("LANGFUSE_HOST"),
- )

  # Initialize optional OpenAI client
  client: Optional['OpenAI'] = None
judgeval/common/logger.py CHANGED
@@ -2,7 +2,6 @@ import logging
  from logging.handlers import RotatingFileHandler
  import sys
  from pathlib import Path
- from datetime import datetime
  from contextlib import contextmanager

  # Global variables
judgeval/common/tracer.py CHANGED
@@ -1,60 +1,68 @@
  """
  Tracing system for judgeval that allows for function tracing using decorators.
  """
-
- import os
- import time
+ # Standard library imports
+ import asyncio
  import functools
- import requests
- import uuid
- from contextlib import contextmanager
- from typing import (
-     Optional,
-     Any,
-     List,
-     Literal,
-     Tuple,
-     Generator,
-     TypeAlias,
-     Union
- )
- from dataclasses import (
-     dataclass,
-     field
- )
- from datetime import datetime
- from openai import OpenAI
- from together import Together
- from anthropic import Anthropic
- from typing import Dict
  import inspect
- import asyncio
  import json
+ import os
+ import time
+ import uuid
  import warnings
- from pydantic import BaseModel
+ from contextlib import contextmanager
+ from dataclasses import dataclass, field
+ from datetime import datetime
  from http import HTTPStatus
+ from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, TypeAlias, Union
+ from rich import print as rprint
+ from uuid import UUID
+ from collections.abc import Sequence

+ # Third-party imports
  import pika
- import os
+ import requests
+ from pydantic import BaseModel
+ from rich import print as rprint
+ from openai import OpenAI
+ from together import Together
+ from anthropic import Anthropic

- from judgeval.constants import JUDGMENT_TRACES_SAVE_API_URL, JUDGMENT_TRACES_FETCH_API_URL, RABBITMQ_HOST, RABBITMQ_PORT, RABBITMQ_QUEUE, JUDGMENT_TRACES_DELETE_API_URL
+ # Local application/library-specific imports
+ from judgeval.constants import (
+     JUDGMENT_TRACES_SAVE_API_URL,
+     JUDGMENT_TRACES_FETCH_API_URL,
+     RABBITMQ_HOST,
+     RABBITMQ_PORT,
+     RABBITMQ_QUEUE,
+     JUDGMENT_TRACES_DELETE_API_URL,
+     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL
+ )
  from judgeval.judgment_client import JudgmentClient
  from judgeval.data import Example
  from judgeval.scorers import APIJudgmentScorer, JudgevalScorer, ScorerWrapper
  from judgeval.rules import Rule
  from judgeval.evaluation_run import EvaluationRun
- from judgeval.judges import JudgevalJudge
+ from judgeval.data.result import ScoringResult

- from rich import print as rprint
+ from langchain_core.language_models import BaseChatModel
+ from langchain_huggingface import ChatHuggingFace
+ from langchain_openai import ChatOpenAI
+ from langchain_anthropic import ChatAnthropic
+ from langchain_core.utils.function_calling import convert_to_openai_tool
+ from langchain_core.callbacks import CallbackManager, BaseCallbackHandler
+ from langchain_core.agents import AgentAction, AgentFinish
+ from langchain_core.outputs import LLMResult

- from judgeval.data.result import ScoringResult
+ from langchain_core.messages.ai import AIMessage
+ from langchain_core.messages.tool import ToolMessage
+ from langchain_core.messages.base import BaseMessage
+ from langchain_core.documents import Document

  # Define type aliases for better code readability and maintainability
  ApiClient: TypeAlias = Union[OpenAI, Together, Anthropic]  # Supported API clients
  TraceEntryType = Literal['enter', 'exit', 'output', 'input', 'evaluation']  # Valid trace entry types
- SpanType = Literal['span', 'tool', 'llm', 'evaluation']
-
-
+ SpanType = Literal['span', 'tool', 'llm', 'evaluation', 'chain']
  @dataclass
  class TraceEntry:
      """Represents a single trace entry with its visual representation.
@@ -207,7 +215,8 @@ class TraceManagerClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )

          if response.status_code != HTTPStatus.OK:
@@ -231,7 +240,8 @@ class TraceManagerClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )

          if response.status_code == HTTPStatus.BAD_REQUEST:
@@ -417,7 +427,7 @@ class TraceClient:

              # Prevent using JudgevalScorer with rules - only APIJudgmentScorer allowed with rules
              if loaded_rules and any(isinstance(scorer, JudgevalScorer) for scorer in loaded_scorers):
-                 raise ValueError("Cannot use Judgeval scorers (only API scorers) when using rules. Please either remove rules or use only APIJudgmentScorer types.")
+                 raise ValueError("Cannot use Judgeval scorers, you can only use API scorers when using rules. Please either remove rules or use only APIJudgmentScorer types.")

          except Exception as e:
              warnings.warn(f"Failed to load scorers: {str(e)}")
@@ -453,9 +463,12 @@ class TraceClient:
          if self._current_span:
              duration = time.time() - start_time  # Calculate duration from start_time

+             prev_entry = self.entries[-1]
+
+             # Select the last entry in the trace if it's an LLM call, otherwise use the current span
              self.add_entry(TraceEntry(
                  type="evaluation",
-                 function=self._current_span,
+                 function=prev_entry.function if prev_entry.span_type == "llm" else self._current_span,
                  depth=self.tracer.depth,
                  message=f"Evaluation results for {self._current_span}",
                  timestamp=time.time(),
@@ -529,7 +542,7 @@ class TraceClient:
          active_functions = []  # Stack to track nested function calls
          function_entries = {}  # Store entries for each function

-         for entry in entries:
+         for i, entry in enumerate(entries):
              function = entry["function"]

              if entry["type"] == "enter":
@@ -551,9 +564,12 @@
                  current_entry["duration"] = entry["timestamp"] - current_entry["timestamp"]
                  condensed.append(current_entry)
                  active_functions.remove(function)
-                 del function_entries[function]
+                 # del function_entries[function]

-             elif function in active_functions:
+             # The OR condition is to handle the LLM client case.
+             # LLM client is a special case where we exit the span, so when we attach evaluations to it,
+             # we have to check if the previous entry is an LLM call.
+             elif function in active_functions or entry["type"] == "evaluation" and entries[i-1]["function"] == entry["function"]:
                  # Update existing function entry with additional data
                  current_entry = function_entries[function]

@@ -568,6 +584,7 @@

          # Sort by timestamp
          condensed.sort(key=lambda x: x["timestamp"])
+
          return condensed

      def save(self, empty_save: bool = False, overwrite: bool = False) -> Tuple[str, dict]:
@@ -579,6 +596,7 @@
          total_duration = self.get_duration()

          raw_entries = [entry.to_dict() for entry in self.entries]
+
          condensed_entries = self.condense_trace(raw_entries)

          # Calculate total token counts from LLM API calls
@@ -617,25 +635,23 @@
          }
          # Execute asynchrous evaluation in the background
          if not empty_save:  # Only send to RabbitMQ if the trace is not empty
-             connection = pika.BlockingConnection(
-                 pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
-             channel = connection.channel()
-
-             channel.queue_declare(queue=RABBITMQ_QUEUE, durable=True)
-             trace_data["judgment_api_key"] = self.tracer.api_key
-             trace_data["organization_id"] = self.tracer.organization_id
-             channel.basic_publish(
-                 exchange='',
-                 routing_key=RABBITMQ_QUEUE,
-                 body=json.dumps(trace_data),
-                 properties=pika.BasicProperties(
-                     delivery_mode=pika.DeliveryMode.Transient,  # Changed from Persistent to Transient
+             # Send trace data to evaluation queue via API
+             try:
+                 response = requests.post(
+                     JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL,
+                     json=trace_data,
                      headers={
-                         'api_key': self.tracer.api_key,
-                         'organization_id': self.tracer.organization_id
-                     }
-                 ))
-             connection.close()
+                         "Content-Type": "application/json",
+                         "Authorization": f"Bearer {self.tracer.api_key}",
+                         "X-Organization-Id": self.tracer.organization_id
+                     },
+                     verify=True
+                 )
+
+                 if response.status_code != HTTPStatus.OK:
+                     warnings.warn(f"Failed to add trace to evaluation queue: {response.text}")
+             except Exception as e:
+                 warnings.warn(f"Error sending trace to evaluation queue: {str(e)}")

          self.trace_manager_client.save_trace(trace_data, empty_save)

@@ -755,7 +771,7 @@ class Tracer:
              with trace.span(span_name, span_type=span_type) as span:
                  # Record inputs
                  span.record_input({
-                     'args': list(args),
+                     'args': str(args),
                      'kwargs': kwargs
                  })

@@ -792,7 +808,7 @@
              with trace.span(span_name, span_type=span_type) as span:
                  # Record inputs
                  span.record_input({
-                     'args': list(args),
+                     'args': str(args),
                      'kwargs': kwargs
                  })

@@ -810,6 +826,28 @@
              self._current_trace = None

          return wrapper
+
+     def score(self, func=None, scorers: List[Union[APIJudgmentScorer, JudgevalScorer]] = None, model: str = None, log_results: bool = True, *, name: str = None, span_type: SpanType = "span"):
+         """
+         Decorator to trace function execution with detailed entry/exit information.
+         """
+         if func is None:
+             return lambda f: self.observe(f, name=name, span_type=span_type)
+
+         if asyncio.iscoroutinefunction(func):
+             @functools.wraps(func)
+             async def async_wrapper(*args, **kwargs):
+                 if self._current_trace:
+                     self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model=model, log_results=log_results)
+             return async_wrapper
+         else:
+             @functools.wraps(func)
+             def wrapper(*args, **kwargs):
+                 if self._current_trace:
+                     self._current_trace.async_evaluate(scorers=[scorers], input=args, actual_output=kwargs, model="gpt-4o-mini", log_results=True)
+             return wrapper
+
+

  def wrap(client: Any) -> Any:
      """
@@ -920,3 +958,173 @@ def _format_output_data(client: ApiClient, response: Any) -> dict:
              "total_tokens": response.usage.input_tokens + response.usage.output_tokens
          }
      }
+
+ class JudgevalCallbackHandler(BaseCallbackHandler):
+     def __init__(self, trace_client: TraceClient):
+         self.trace_client = trace_client
+         self.openai_count = 1
+
+     def start_span(self, name: str, span_type: SpanType = "span"):
+         start_time = time.time()
+
+         # Record span entry
+         self.trace_client.add_entry(TraceEntry(
+             type="enter",
+             function=name,
+             depth=self.trace_client.tracer.depth,
+             message=name,
+             timestamp=start_time,
+             span_type=span_type
+         ))
+
+         self.trace_client.tracer.depth += 1
+         self.trace_client.prev_span = self.trace_client._current_span
+         self.trace_client._current_span = name
+         self._start_time = start_time
+
+     def end_span(self, name: str, span_type: SpanType = "span"):
+         self.trace_client.tracer.depth -= 1
+         duration = time.time() - self._start_time
+
+         # Record span exit
+         self.trace_client.add_entry(TraceEntry(
+             type="exit",
+             function=name,
+             depth=self.trace_client.tracer.depth,
+             message=f"← {name}",
+             timestamp=time.time(),
+             duration=duration,
+             span_type=span_type
+         ))
+         self.trace_client._current_span = self.trace_client.prev_span
+
+     def on_retriever_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         query: str,
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         tags: Optional[list[str]] = None,
+         metadata: Optional[dict[str, Any]] = None,
+         **kwargs: Any,
+     ) -> Any:
+         name = "RETRIEVER_CALL"
+         if serialized and "name" in serialized:
+             name = f"RETRIEVER_{serialized['name'].upper()}"
+
+         self.start_span(name, span_type="retriever")
+         self.trace_client.record_input({
+             'query': query,
+             'tags': tags,
+             'metadata': metadata,
+             'kwargs': kwargs
+         })
+
+     def on_retriever_end(
+         self,
+         documents: Sequence[Document],
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         **kwargs: Any
+     ) -> Any:
+         # Process the retrieved documents into a format suitable for logging
+         doc_summary = []
+         for i, doc in enumerate(documents):
+             # Extract key information from each document
+             doc_data = {
+                 "index": i,
+                 "page_content": doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content,
+                 "metadata": doc.metadata
+             }
+             doc_summary.append(doc_data)
+
+         # Record the document data
+         self.trace_client.record_output({
+             "document_count": len(documents),
+             "documents": doc_summary
+         })
+
+         # End the retriever span
+         self.end_span(self.trace_client._current_span, span_type="retriever")
+
+     def on_tool_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         input_str: str,
+         run_id: Optional[UUID] = None,
+         parent_run_id: Optional[UUID] = None,
+         inputs: Optional[dict[str, Any]] = None,
+         **kwargs: Any,
+     ):
+         name = serialized["name"]
+         self.start_span(name, span_type="tool")
+         self.trace_client.record_input({
+             'args': input_str,
+             'kwargs': kwargs
+         })
+
+     def on_tool_end(self, output: Any, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any) -> Any:
+         self.trace_client.record_output(output)
+         self.end_span(self.trace_client._current_span, span_type="tool")
+
+     def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any:
+         print(f"Agent action: {action}")
+
+     def on_agent_finish(
+         self,
+         finish: AgentFinish,
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         tags: Optional[list[str]] = None,
+         **kwargs: Any,
+     ) -> None:
+         print(f"Agent action: {finish}")
+
+     def on_llm_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         prompts: list[str],
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         **kwargs: Any,
+     ) -> Any:
+         name = "LLM call"
+         self.start_span(name, span_type="llm")
+         self.trace_client.record_input({
+             'args': prompts,
+             'kwargs': kwargs
+         })
+
+     def on_llm_end(self, response: LLMResult, *, run_id: UUID, parent_run_id: Optional[UUID] = None, **kwargs: Any):
+         self.trace_client.record_output(response.generations[0][0].text)
+         self.end_span(self.trace_client._current_span, span_type="llm")
+
+     def on_chat_model_start(
+         self,
+         serialized: Optional[dict[str, Any]],
+         messages: list[list[BaseMessage]],
+         *,
+         run_id: UUID,
+         parent_run_id: Optional[UUID] = None,
+         **kwargs: Any,
+     ) -> Any:
+
+         if "openai" in serialized["id"]:
+             name = f"OPENAI_API_CALL_{self.openai_count}"
+             self.openai_count += 1
+         elif "anthropic" in serialized["id"]:
+             name = "ANTHROPIC_API_CALL"
+         elif "together" in serialized["id"]:
+             name = "TOGETHER_API_CALL"
+         else:
+             name = "LLM call"
+
+         self.start_span(name, span_type="llm")
+         self.trace_client.record_input({
+             'args': str(messages),
+             'kwargs': kwargs
+         })
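Note: the new JudgevalCallbackHandler maps LangChain callback events (chat model, LLM, tool, retriever, agent) onto judgeval trace spans. A rough usage sketch follows; how a TraceClient is obtained is an assumption (the tracer.trace(...) context manager below is hypothetical and may differ from the real entry point), while passing callbacks through the runnable config is standard LangChain usage:

from judgeval.common.tracer import Tracer, JudgevalCallbackHandler
from langchain_openai import ChatOpenAI

tracer = Tracer()  # assumption: API key and organization id come from the environment

with tracer.trace("langchain-demo") as trace:  # hypothetical way to obtain a TraceClient
    handler = JudgevalCallbackHandler(trace)
    llm = ChatOpenAI(model="gpt-4o-mini")
    # on_chat_model_start / on_llm_end above open and close an "llm" span for this call
    reply = llm.invoke("Say hello", config={"callbacks": [handler]})
    print(reply.content)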
judgeval/common/utils.py CHANGED
@@ -8,15 +8,19 @@ For API calling, we support:
  NOTE: any function beginning with 'a', e.g. 'afetch_together_api_response', is an asynchronous function
  """

- import concurrent.futures
- from typing import List, Mapping, Dict, Union, Optional, Literal, Any
+ # Standard library imports
  import asyncio
+ import concurrent.futures
+ import os
+ import pprint
+ from typing import Any, Dict, List, Literal, Mapping, Optional, Union
+
+ # Third-party imports
  import litellm
  import pydantic
- import pprint
- import os
- from dotenv import load_dotenv
+ from dotenv import load_dotenv

+ # Local application/library-specific imports
  from judgeval.clients import async_together_client, together_client
  from judgeval.constants import *
  from judgeval.common.logger import debug, error
judgeval/constants.py CHANGED
@@ -21,8 +21,11 @@ class APIScorer(str, Enum):
      CONTEXTUAL_RECALL = "contextual_recall"
      CONTEXTUAL_RELEVANCY = "contextual_relevancy"
      CONTEXTUAL_PRECISION = "contextual_precision"
+     INSTRUCTION_ADHERENCE = "instruction_adherence"
      TOOL_CORRECTNESS = "tool_correctness"
      JSON_CORRECTNESS = "json_correctness"
+     COMPARISON = "comparison"
+     GROUNDEDNESS = "groundedness"

      @classmethod
      def _missing_(cls, value):
@@ -31,6 +34,8 @@ class APIScorer(str, Enum):
              if member.value == value.lower():
                  return member

+ UNBOUNDED_SCORERS = set([APIScorer.COMPARISON])  # scorers whose scores are not bounded between 0-1
+
  ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
  # API URLs
  JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
@@ -46,7 +51,7 @@ JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_proje
  JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
  JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
  JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
-
+ JUDGMENT_TRACES_ADD_TO_EVAL_QUEUE_API_URL = f"{ROOT_API}/traces/add_to_eval_queue/"
  # RabbitMQ
  RABBITMQ_HOST = os.getenv("RABBITMQ_HOST", "rabbitmq-networklb-faa155df16ec9085.elb.us-west-1.amazonaws.com")
  RABBITMQ_PORT = os.getenv("RABBITMQ_PORT", 5672)
@@ -110,7 +115,7 @@ TOGETHER_SUPPORTED_MODELS = [
      "mistralai/Mistral-7B-Instruct-v0.1"
  ]

- JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini"}
+ JUDGMENT_SUPPORTED_MODELS = {"osiris-large", "osiris-mini", "osiris"}

  ACCEPTABLE_MODELS = set(litellm.model_list) | set(TOGETHER_SUPPORTED_MODELS) | JUDGMENT_SUPPORTED_MODELS

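Note: the new scorer enum members and UNBOUNDED_SCORERS can be exercised directly; the _missing_ hook shown above keeps value lookups case-insensitive. A small sketch:

from judgeval.constants import APIScorer, UNBOUNDED_SCORERS

assert APIScorer("COMPARISON") is APIScorer.COMPARISON  # case-insensitive lookup via _missing_
assert APIScorer.GROUNDEDNESS.value == "groundedness"
assert APIScorer.INSTRUCTION_ADHERENCE.value == "instruction_adherence"

# Comparison scores are not clamped to the 0-1 range, so callers can branch on membership:
if APIScorer.COMPARISON in UNBOUNDED_SCORERS:
    print("comparison scores are unbounded")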
judgeval/data/__init__.py CHANGED
@@ -2,6 +2,7 @@ from judgeval.data.example import Example, ExampleParams
  from judgeval.data.api_example import ProcessExample, create_process_example
  from judgeval.data.scorer_data import ScorerData, create_scorer_data
  from judgeval.data.result import ScoringResult, generate_scoring_result
+ from judgeval.data.ground_truth import GroundTruthExample

  __all__ = [
      "Example",
@@ -12,4 +13,5 @@ __all__ = [
      "create_scorer_data",
      "ScoringResult",
      "generate_scoring_result",
+     "GroundTruthExample",
  ]
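Note: GroundTruthExample now lives at judgeval/data/ground_truth.py (see file 54 in the list above) and is re-exported from judgeval.data, so downstream imports move off the datasets subpackage:

# 0.0.16 (no longer available in 0.0.18)
# from judgeval.data.datasets import GroundTruthExample

# 0.0.18
from judgeval.data import GroundTruthExample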
judgeval/data/api_example.py CHANGED
@@ -1,5 +1,5 @@
- from typing import List, Optional, Dict, Any, Union
- from pydantic import BaseModel, Field, ConfigDict, model_validator
+ from typing import List, Optional, Dict, Any
+ from pydantic import BaseModel, ConfigDict, model_validator

  from judgeval.data.example import Example
  from judgeval.data.scorer_data import ScorerData
judgeval/data/datasets/__init__.py CHANGED
@@ -1,5 +1,4 @@
  from judgeval.data.datasets.dataset import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample
  from judgeval.data.datasets.eval_dataset_client import EvalDatasetClient

- __all__ = ["EvalDataset", "EvalDatasetClient", "GroundTruthExample"]
+ __all__ = ["EvalDataset", "EvalDatasetClient"]
judgeval/data/datasets/dataset.py CHANGED
@@ -1,13 +1,12 @@
  import ast
  import csv
- import datetime
+ import datetime
  import json
- from dataclasses import dataclass, field
  import os
- from typing import List, Optional, Union, Literal
+ from dataclasses import dataclass, field
+ from typing import List, Union, Literal

- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data import Example
+ from judgeval.data import Example, GroundTruthExample
  from judgeval.common.logger import debug, error, warning, info

  @dataclass
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -11,9 +11,8 @@ from judgeval.constants import (
      JUDGMENT_DATASETS_EDIT_API_URL,
      JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
  )
- from judgeval.data import Example
+ from judgeval.data import Example, GroundTruthExample
  from judgeval.data.datasets import EvalDataset
- from judgeval.data.datasets.ground_truth import GroundTruthExample


@@ -68,7 +67,8 @@ class EvalDatasetClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )
          if response.status_code == 500:
              error(f"Server error during push: {content.get('message')}")
@@ -132,7 +132,8 @@ class EvalDatasetClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )
          response.raise_for_status()
      except requests.exceptions.RequestException as e:
@@ -190,7 +191,8 @@ class EvalDatasetClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )
          response.raise_for_status()
      except requests.exceptions.RequestException as e:
@@ -243,7 +245,8 @@ class EvalDatasetClient:
                  "Content-Type": "application/json",
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
-             }
+             },
+             verify=True
          )
          response.raise_for_status()
      except requests.exceptions.RequestException as e:
@@ -274,7 +277,8 @@ class EvalDatasetClient:
                  "Authorization": f"Bearer {self.judgment_api_key}",
                  "X-Organization-Id": self.organization_id
              },
-             stream=True
+             stream=True,
+             verify=True
          )
          response.raise_for_status()
      except requests.exceptions.HTTPError as err:
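Note: every outbound request in this module (and in tracer.py above) now passes verify=True explicitly. That is already the requests default, so TLS certificate verification behavior is unchanged; the flag only makes the intent visible. Illustrative pattern only; the payload keys below are placeholders, not the real dataset API schema:

import requests
from judgeval.constants import JUDGMENT_DATASETS_EDIT_API_URL

response = requests.post(
    JUDGMENT_DATASETS_EDIT_API_URL,
    json={"alias": "..."},                            # placeholder payload
    headers={"Content-Type": "application/json"},
    verify=True,                                      # explicit, but identical to the requests default
)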
judgeval/data/datasets/utils.py CHANGED
@@ -1,7 +1,6 @@
  from typing import List, Optional

- from judgeval.data.datasets.ground_truth import GroundTruthExample
- from judgeval.data import Example
+ from judgeval.data import Example, GroundTruthExample


  def examples_to_ground_truths(examples: List[Example]) -> List[GroundTruthExample]:
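Note: examples_to_ground_truths keeps its signature; only the GroundTruthExample import path changed. A hedged usage sketch, where the Example field names are assumed typical for judgeval and not taken from this diff:

from judgeval.data import Example
from judgeval.data.datasets.utils import examples_to_ground_truths

examples = [Example(input="What is 2 + 2?", actual_output="4")]  # field names assumed
ground_truths = examples_to_ground_truths(examples)  # returns List[GroundTruthExample]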