judgeval 0.1.0__py3-none-any.whl → 0.23.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (234)
  1. judgeval/__init__.py +173 -10
  2. judgeval/api/__init__.py +523 -0
  3. judgeval/api/api_types.py +413 -0
  4. judgeval/cli.py +112 -0
  5. judgeval/constants.py +7 -30
  6. judgeval/data/__init__.py +1 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +14 -40
  9. judgeval/data/judgment_types.py +396 -146
  10. judgeval/data/result.py +11 -18
  11. judgeval/data/scorer_data.py +3 -26
  12. judgeval/data/scripts/openapi_transform.py +5 -5
  13. judgeval/data/trace.py +115 -194
  14. judgeval/dataset/__init__.py +335 -0
  15. judgeval/env.py +55 -0
  16. judgeval/evaluation/__init__.py +346 -0
  17. judgeval/exceptions.py +28 -0
  18. judgeval/integrations/langgraph/__init__.py +13 -0
  19. judgeval/integrations/openlit/__init__.py +51 -0
  20. judgeval/judges/__init__.py +2 -2
  21. judgeval/judges/litellm_judge.py +77 -16
  22. judgeval/judges/together_judge.py +88 -17
  23. judgeval/judges/utils.py +7 -20
  24. judgeval/judgment_attribute_keys.py +55 -0
  25. judgeval/{common/logger.py → logger.py} +24 -8
  26. judgeval/prompt/__init__.py +330 -0
  27. judgeval/scorers/__init__.py +11 -11
  28. judgeval/scorers/agent_scorer.py +15 -19
  29. judgeval/scorers/api_scorer.py +21 -23
  30. judgeval/scorers/base_scorer.py +54 -36
  31. judgeval/scorers/example_scorer.py +1 -3
  32. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +2 -24
  33. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +2 -10
  34. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +2 -2
  35. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +2 -10
  36. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +2 -14
  37. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +171 -59
  38. judgeval/scorers/score.py +64 -47
  39. judgeval/scorers/utils.py +2 -107
  40. judgeval/tracer/__init__.py +1111 -2
  41. judgeval/tracer/constants.py +1 -0
  42. judgeval/tracer/exporters/__init__.py +40 -0
  43. judgeval/tracer/exporters/s3.py +119 -0
  44. judgeval/tracer/exporters/store.py +59 -0
  45. judgeval/tracer/exporters/utils.py +32 -0
  46. judgeval/tracer/keys.py +63 -0
  47. judgeval/tracer/llm/__init__.py +7 -0
  48. judgeval/tracer/llm/config.py +78 -0
  49. judgeval/tracer/llm/constants.py +9 -0
  50. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  51. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  52. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  53. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  54. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  55. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  56. judgeval/tracer/llm/llm_google/config.py +6 -0
  57. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  58. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  59. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  60. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  61. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  62. judgeval/tracer/llm/llm_openai/config.py +6 -0
  63. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  64. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  65. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  66. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  67. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  68. judgeval/tracer/llm/llm_together/config.py +6 -0
  69. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  70. judgeval/tracer/llm/providers.py +19 -0
  71. judgeval/tracer/managers.py +167 -0
  72. judgeval/tracer/processors/__init__.py +220 -0
  73. judgeval/tracer/utils.py +19 -0
  74. judgeval/trainer/__init__.py +14 -0
  75. judgeval/trainer/base_trainer.py +122 -0
  76. judgeval/trainer/config.py +123 -0
  77. judgeval/trainer/console.py +144 -0
  78. judgeval/trainer/fireworks_trainer.py +392 -0
  79. judgeval/trainer/trainable_model.py +252 -0
  80. judgeval/trainer/trainer.py +70 -0
  81. judgeval/utils/async_utils.py +39 -0
  82. judgeval/utils/decorators/__init__.py +0 -0
  83. judgeval/utils/decorators/dont_throw.py +37 -0
  84. judgeval/utils/decorators/use_once.py +13 -0
  85. judgeval/utils/file_utils.py +74 -28
  86. judgeval/utils/guards.py +36 -0
  87. judgeval/utils/meta.py +27 -0
  88. judgeval/utils/project.py +15 -0
  89. judgeval/utils/serialize.py +253 -0
  90. judgeval/utils/testing.py +70 -0
  91. judgeval/utils/url.py +10 -0
  92. judgeval/{version_check.py → utils/version_check.py} +5 -3
  93. judgeval/utils/wrappers/README.md +3 -0
  94. judgeval/utils/wrappers/__init__.py +15 -0
  95. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  96. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  97. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  98. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  99. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  100. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  101. judgeval/utils/wrappers/py.typed +0 -0
  102. judgeval/utils/wrappers/utils.py +35 -0
  103. judgeval/v1/__init__.py +88 -0
  104. judgeval/v1/data/__init__.py +7 -0
  105. judgeval/v1/data/example.py +44 -0
  106. judgeval/v1/data/scorer_data.py +42 -0
  107. judgeval/v1/data/scoring_result.py +44 -0
  108. judgeval/v1/datasets/__init__.py +6 -0
  109. judgeval/v1/datasets/dataset.py +214 -0
  110. judgeval/v1/datasets/dataset_factory.py +94 -0
  111. judgeval/v1/evaluation/__init__.py +6 -0
  112. judgeval/v1/evaluation/evaluation.py +182 -0
  113. judgeval/v1/evaluation/evaluation_factory.py +17 -0
  114. judgeval/v1/instrumentation/__init__.py +6 -0
  115. judgeval/v1/instrumentation/llm/__init__.py +7 -0
  116. judgeval/v1/instrumentation/llm/config.py +78 -0
  117. judgeval/v1/instrumentation/llm/constants.py +11 -0
  118. judgeval/v1/instrumentation/llm/llm_anthropic/__init__.py +5 -0
  119. judgeval/v1/instrumentation/llm/llm_anthropic/config.py +6 -0
  120. judgeval/v1/instrumentation/llm/llm_anthropic/messages.py +414 -0
  121. judgeval/v1/instrumentation/llm/llm_anthropic/messages_stream.py +307 -0
  122. judgeval/v1/instrumentation/llm/llm_anthropic/wrapper.py +61 -0
  123. judgeval/v1/instrumentation/llm/llm_google/__init__.py +5 -0
  124. judgeval/v1/instrumentation/llm/llm_google/config.py +6 -0
  125. judgeval/v1/instrumentation/llm/llm_google/generate_content.py +121 -0
  126. judgeval/v1/instrumentation/llm/llm_google/wrapper.py +30 -0
  127. judgeval/v1/instrumentation/llm/llm_openai/__init__.py +5 -0
  128. judgeval/v1/instrumentation/llm/llm_openai/beta_chat_completions.py +212 -0
  129. judgeval/v1/instrumentation/llm/llm_openai/chat_completions.py +477 -0
  130. judgeval/v1/instrumentation/llm/llm_openai/config.py +6 -0
  131. judgeval/v1/instrumentation/llm/llm_openai/responses.py +472 -0
  132. judgeval/v1/instrumentation/llm/llm_openai/utils.py +41 -0
  133. judgeval/v1/instrumentation/llm/llm_openai/wrapper.py +63 -0
  134. judgeval/v1/instrumentation/llm/llm_together/__init__.py +5 -0
  135. judgeval/v1/instrumentation/llm/llm_together/chat_completions.py +382 -0
  136. judgeval/v1/instrumentation/llm/llm_together/config.py +6 -0
  137. judgeval/v1/instrumentation/llm/llm_together/wrapper.py +57 -0
  138. judgeval/v1/instrumentation/llm/providers.py +19 -0
  139. judgeval/v1/integrations/claude_agent_sdk/__init__.py +119 -0
  140. judgeval/v1/integrations/claude_agent_sdk/wrapper.py +564 -0
  141. judgeval/v1/integrations/langgraph/__init__.py +13 -0
  142. judgeval/v1/integrations/openlit/__init__.py +47 -0
  143. judgeval/v1/internal/api/__init__.py +525 -0
  144. judgeval/v1/internal/api/api_types.py +413 -0
  145. judgeval/v1/prompts/__init__.py +6 -0
  146. judgeval/v1/prompts/prompt.py +29 -0
  147. judgeval/v1/prompts/prompt_factory.py +189 -0
  148. judgeval/v1/py.typed +0 -0
  149. judgeval/v1/scorers/__init__.py +6 -0
  150. judgeval/v1/scorers/api_scorer.py +82 -0
  151. judgeval/v1/scorers/base_scorer.py +17 -0
  152. judgeval/v1/scorers/built_in/__init__.py +17 -0
  153. judgeval/v1/scorers/built_in/answer_correctness.py +28 -0
  154. judgeval/v1/scorers/built_in/answer_relevancy.py +28 -0
  155. judgeval/v1/scorers/built_in/built_in_factory.py +26 -0
  156. judgeval/v1/scorers/built_in/faithfulness.py +28 -0
  157. judgeval/v1/scorers/built_in/instruction_adherence.py +28 -0
  158. judgeval/v1/scorers/custom_scorer/__init__.py +6 -0
  159. judgeval/v1/scorers/custom_scorer/custom_scorer.py +50 -0
  160. judgeval/v1/scorers/custom_scorer/custom_scorer_factory.py +16 -0
  161. judgeval/v1/scorers/prompt_scorer/__init__.py +6 -0
  162. judgeval/v1/scorers/prompt_scorer/prompt_scorer.py +86 -0
  163. judgeval/v1/scorers/prompt_scorer/prompt_scorer_factory.py +85 -0
  164. judgeval/v1/scorers/scorers_factory.py +49 -0
  165. judgeval/v1/tracer/__init__.py +7 -0
  166. judgeval/v1/tracer/base_tracer.py +520 -0
  167. judgeval/v1/tracer/exporters/__init__.py +14 -0
  168. judgeval/v1/tracer/exporters/in_memory_span_exporter.py +25 -0
  169. judgeval/v1/tracer/exporters/judgment_span_exporter.py +42 -0
  170. judgeval/v1/tracer/exporters/noop_span_exporter.py +19 -0
  171. judgeval/v1/tracer/exporters/span_store.py +50 -0
  172. judgeval/v1/tracer/judgment_tracer_provider.py +70 -0
  173. judgeval/v1/tracer/processors/__init__.py +6 -0
  174. judgeval/v1/tracer/processors/_lifecycles/__init__.py +28 -0
  175. judgeval/v1/tracer/processors/_lifecycles/agent_id_processor.py +53 -0
  176. judgeval/v1/tracer/processors/_lifecycles/context_keys.py +11 -0
  177. judgeval/v1/tracer/processors/_lifecycles/customer_id_processor.py +29 -0
  178. judgeval/v1/tracer/processors/_lifecycles/registry.py +18 -0
  179. judgeval/v1/tracer/processors/judgment_span_processor.py +165 -0
  180. judgeval/v1/tracer/processors/noop_span_processor.py +42 -0
  181. judgeval/v1/tracer/tracer.py +67 -0
  182. judgeval/v1/tracer/tracer_factory.py +38 -0
  183. judgeval/v1/trainers/__init__.py +5 -0
  184. judgeval/v1/trainers/base_trainer.py +62 -0
  185. judgeval/v1/trainers/config.py +123 -0
  186. judgeval/v1/trainers/console.py +144 -0
  187. judgeval/v1/trainers/fireworks_trainer.py +392 -0
  188. judgeval/v1/trainers/trainable_model.py +252 -0
  189. judgeval/v1/trainers/trainers_factory.py +37 -0
  190. judgeval/v1/utils.py +18 -0
  191. judgeval/version.py +5 -0
  192. judgeval/warnings.py +4 -0
  193. judgeval-0.23.0.dist-info/METADATA +266 -0
  194. judgeval-0.23.0.dist-info/RECORD +201 -0
  195. judgeval-0.23.0.dist-info/entry_points.txt +2 -0
  196. judgeval/clients.py +0 -34
  197. judgeval/common/__init__.py +0 -13
  198. judgeval/common/api/__init__.py +0 -3
  199. judgeval/common/api/api.py +0 -352
  200. judgeval/common/api/constants.py +0 -165
  201. judgeval/common/exceptions.py +0 -27
  202. judgeval/common/storage/__init__.py +0 -6
  203. judgeval/common/storage/s3_storage.py +0 -98
  204. judgeval/common/tracer/__init__.py +0 -31
  205. judgeval/common/tracer/constants.py +0 -22
  206. judgeval/common/tracer/core.py +0 -1916
  207. judgeval/common/tracer/otel_exporter.py +0 -108
  208. judgeval/common/tracer/otel_span_processor.py +0 -234
  209. judgeval/common/tracer/span_processor.py +0 -37
  210. judgeval/common/tracer/span_transformer.py +0 -211
  211. judgeval/common/tracer/trace_manager.py +0 -92
  212. judgeval/common/utils.py +0 -940
  213. judgeval/data/datasets/__init__.py +0 -4
  214. judgeval/data/datasets/dataset.py +0 -341
  215. judgeval/data/datasets/eval_dataset_client.py +0 -214
  216. judgeval/data/tool.py +0 -5
  217. judgeval/data/trace_run.py +0 -37
  218. judgeval/evaluation_run.py +0 -75
  219. judgeval/integrations/langgraph.py +0 -843
  220. judgeval/judges/mixture_of_judges.py +0 -286
  221. judgeval/judgment_client.py +0 -369
  222. judgeval/rules.py +0 -521
  223. judgeval/run_evaluation.py +0 -684
  224. judgeval/scorers/judgeval_scorers/api_scorers/derailment_scorer.py +0 -14
  225. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  226. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  227. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +0 -20
  228. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +0 -27
  229. judgeval/utils/alerts.py +0 -93
  230. judgeval/utils/requests.py +0 -50
  231. judgeval-0.1.0.dist-info/METADATA +0 -202
  232. judgeval-0.1.0.dist-info/RECORD +0 -73
  233. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/WHEEL +0 -0
  234. {judgeval-0.1.0.dist-info → judgeval-0.23.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,125 @@
1
+ from typing import List, Optional, Union, Tuple, Sequence
2
+ from pydantic import field_validator, model_validator, Field, BaseModel
3
+ from datetime import datetime, timezone
4
+ import uuid
5
+
6
+ from judgeval.data import Example
7
+ from judgeval.scorers import APIScorerConfig
8
+ from judgeval.scorers.example_scorer import ExampleScorer
9
+ from judgeval.constants import ACCEPTABLE_MODELS
10
+ from judgeval.data.judgment_types import (
11
+ ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
12
+ TraceEvaluationRun as TraceEvaluationRunJudgmentType,
13
+ )
14
+
15
+
16
class EvaluationRun(BaseModel):
    """
    Base configuration shared by the example and trace evaluation runs.

    Scorers may be supplied as one mixed ``scorers`` list; ``__init__`` splits
    that list into ``custom_scorers`` (``ExampleScorer`` instances) and
    ``judgment_scorers`` (``APIScorerConfig`` instances). After validation,
    exactly one of those two lists must be non-empty.
    """

    # Random UUID4 string generated per run unless the caller supplies one.
    id: str = Field(default_factory=lambda: str(uuid.uuid4()))
    # ISO-8601 timestamp taken in UTC at construction time.
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat()
    )
    custom_scorers: List[ExampleScorer] = Field(default_factory=list)
    judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
    # NOTE(review): ``__init__`` consumes its ``scorers`` argument and never
    # forwards it, so this field keeps its empty default -- confirm intended.
    scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
        default_factory=list
    )
    model: Optional[str] = None

    def __init__(
        self,
        scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
        **kwargs,
    ):
        """
        Initialize EvaluationRun with automatic scorer classification.

        Args:
            scorers: Mixed list of scorers. ``ExampleScorer`` instances are
                routed to ``custom_scorers`` and ``APIScorerConfig`` instances
                to ``judgment_scorers``; items of any other type are not
                forwarded. When given, this overrides any ``custom_scorers`` /
                ``judgment_scorers`` present in ``kwargs``.
            **kwargs: Remaining field values passed through to the model.
        """
        if scorers is not None:
            # Always set both keys (even to empty lists) so the scorer-list
            # validator sees a definite value for each.
            kwargs["custom_scorers"] = [
                s for s in scorers if isinstance(s, ExampleScorer)
            ]
            kwargs["judgment_scorers"] = [
                s for s in scorers if isinstance(s, APIScorerConfig)
            ]

        super().__init__(**kwargs)

    def model_dump(self, **kwargs):
        """
        Dump the model to a dict, re-serializing both scorer lists explicitly.

        ``kwargs`` are forwarded to the base ``model_dump`` only; each scorer
        is then dumped with its own argument-less ``model_dump()`` and the two
        scorer keys are overwritten with those results.
        """
        data = super().model_dump(**kwargs)
        data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
        data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]

        return data

    @model_validator(mode="after")
    def validate_scorer_lists(self):
        """
        Require exactly one of ``custom_scorers`` / ``judgment_scorers``.

        Written as an instance method, the form Pydantic v2 documents for
        ``mode="after"`` validators (the classmethod form is deprecated).

        Raises:
            ValueError: If both lists are empty, or if both are non-empty.
        """
        custom_scorers = self.custom_scorers
        judgment_scorers = self.judgment_scorers

        # Reject the case where neither list has any scorer.
        if not custom_scorers and not judgment_scorers:
            raise ValueError(
                "At least one of custom_scorers or judgment_scorers must be provided."
            )

        # Reject the case where both lists are populated.
        if custom_scorers and judgment_scorers:
            raise ValueError(
                "Only one of custom_scorers or judgment_scorers can be provided, not both."
            )

        return self

    @field_validator("model")
    @classmethod
    def validate_model(cls, v, values):
        """
        Check that a string model name is one of ``ACCEPTABLE_MODELS``.

        Args:
            v: The candidate value for ``model``; ``None`` passes unchanged.
            values: Second positional argument supplied by Pydantic (the
                validation info object under Pydantic v2); unused here.

        Raises:
            ValueError: If ``v`` is a string not present in ``ACCEPTABLE_MODELS``.
        """
        if isinstance(v, str) and v not in ACCEPTABLE_MODELS:
            raise ValueError(
                f"Model name {v} not recognized. Please select a valid model name."
            )
        return v
87
+
88
+
89
class ExampleEvaluationRun(EvaluationRun, ExampleEvaluationRunJudgmentType):  # type: ignore
    """
    Bundles examples with the scorers that evaluate them for one eval task.

    Args:
        project_name (str): Project that the evaluation results belong to
        eval_name (str): Name given to this evaluation run
        examples (List[Example]): Examples to be evaluated
        scorers (List[Union[BaseScorer, APIScorerConfig]]): Scorers used for the evaluation
        model (str): Judge model used when running LLM as a Judge
    """

    examples: List[Example]  # type: ignore

    @field_validator("examples")
    def validate_examples(cls, v):
        # An evaluation run needs at least one example.
        if not v:
            raise ValueError("Examples cannot be empty.")
        # Report the first entry (in list order) that is not an Example.
        offending = [entry for entry in v if not isinstance(entry, Example)]
        if offending:
            raise ValueError(f"Item of type {type(offending[0])} is not a Example")
        return v

    def model_dump(self, **kwargs):
        # Start from the parent dump, then replace "examples" with each
        # example's own argument-less dump.
        dumped = super().model_dump(**kwargs)
        serialized_examples = []
        for entry in self.examples:
            serialized_examples.append(entry.model_dump())
        dumped["examples"] = serialized_examples
        return dumped
116
+
117
+
118
# Evaluation run that targets existing traces, identified by a list of
# 2-tuples of strings held in ``trace_and_span_ids``.
class TraceEvaluationRun(EvaluationRun, TraceEvaluationRunJudgmentType):  # type: ignore
    trace_and_span_ids: List[Tuple[str, str]]  # type: ignore

    @field_validator("trace_and_span_ids")
    def validate_trace_and_span_ids(cls, v):
        # Guard clause: a non-empty list passes through untouched.
        if v:
            return v
        raise ValueError("Trace and span IDs are required for trace evaluations.")
judgeval/data/example.py CHANGED
@@ -4,7 +4,10 @@ Classes for representing examples in a dataset.
4
4
 
5
5
  from enum import Enum
6
6
  from datetime import datetime
7
- from judgeval.data.judgment_types import ExampleJudgmentType
7
+ from typing import Dict, Any, Optional
8
+ from judgeval.data.judgment_types import Example as JudgmentExample
9
+ from uuid import uuid4
10
+ from pydantic import Field
8
11
 
9
12
 
10
13
  class ExampleParams(str, Enum):
@@ -15,47 +18,18 @@ class ExampleParams(str, Enum):
15
18
  RETRIEVAL_CONTEXT = "retrieval_context"
16
19
  TOOLS_CALLED = "tools_called"
17
20
  EXPECTED_TOOLS = "expected_tools"
18
- REASONING = "reasoning"
19
21
  ADDITIONAL_METADATA = "additional_metadata"
20
22
 
21
23
 
22
- class Example(ExampleJudgmentType):
23
- example_id: str = ""
24
+ class Example(JudgmentExample):
25
+ example_id: str = Field(default_factory=lambda: str(uuid4()))
26
+ created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
27
+ name: Optional[str] = None
24
28
 
25
- def __init__(self, **data):
26
- if "created_at" not in data:
27
- data["created_at"] = datetime.now().isoformat()
28
- super().__init__(**data)
29
- self.example_id = None
29
+ def to_dict(self) -> Dict[str, Any]:
30
+ data = super().model_dump(warnings=False)
31
+ return data
30
32
 
31
- def to_dict(self):
32
- return {
33
- "input": self.input,
34
- "actual_output": self.actual_output,
35
- "expected_output": self.expected_output,
36
- "context": self.context,
37
- "retrieval_context": self.retrieval_context,
38
- "additional_metadata": self.additional_metadata,
39
- "tools_called": self.tools_called,
40
- "expected_tools": self.expected_tools,
41
- "name": self.name,
42
- "example_id": self.example_id,
43
- "example_index": self.example_index,
44
- "created_at": self.created_at,
45
- }
46
-
47
- def __str__(self):
48
- return (
49
- f"Example(input={self.input}, "
50
- f"actual_output={self.actual_output}, "
51
- f"expected_output={self.expected_output}, "
52
- f"context={self.context}, "
53
- f"retrieval_context={self.retrieval_context}, "
54
- f"additional_metadata={self.additional_metadata}, "
55
- f"tools_called={self.tools_called}, "
56
- f"expected_tools={self.expected_tools}, "
57
- f"name={self.name}, "
58
- f"example_id={self.example_id}, "
59
- f"example_index={self.example_index}, "
60
- f"created_at={self.created_at}, "
61
- )
33
+ def get_fields(self):
34
+ excluded = {"example_id", "name", "created_at"}
35
+ return self.model_dump(exclude=excluded)