judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/result.py CHANGED
@@ -1,81 +1,43 @@
-from dataclasses import dataclass
-from typing import List, Union, Optional, Dict, Any
+from typing import List, Union
+from judgeval.data import ScorerData, Example
+from judgeval.data.judgment_types import ScoringResult as JudgmentScoringResult
 
-from judgeval.data import ScorerData, ProcessExample
 
-@dataclass
-class ScoringResult:
+class ScoringResult(JudgmentScoringResult):
     """
     A ScoringResult contains the output of one or more scorers applied to a single example.
     Ie: One input, one actual_output, one expected_output, etc..., and 1+ scorer (Faithfulness, Hallucination, Summarization, etc...)
 
     Args:
-        success (bool): Whether the evaluation was successful.
+        success (bool): Whether the evaluation was successful.
            This means that all scorers applied to this example returned a success.
         scorer_data (List[ScorerData]): The scorers data for the evaluated example
-        input (Optional[str]): The input to the example
-        actual_output (Optional[str]): The actual output of the example
-        expected_output (Optional[str]): The expected output of the example
-        context (Optional[List[str]]): The context of the example
-        retrieval_context (Optional[List[str]]): The retrieval context of the example
-        additional_metadata (Optional[Dict[str, Any]]): The additional metadata of the example
-        tools_called (Optional[List[str]]): The tools called by the example
-        expected_tools (Optional[List[str]]): The expected tools of the example
-        trace_id (Optional[str]): The trace id of the example
-
+        data_object (Optional[Example]): The original example object that was used to create the ScoringResult, can be Example, WorkflowRun (future)
+
     """
-    # Fields for scoring outputs
-    success: bool  # used for unit testing
-    scorers_data: Union[List[ScorerData], None]
 
-    # Inputs from the original example
-    input: Optional[str] = None
-    actual_output: Optional[str] = None
-    expected_output: Optional[str] = None
-    context: Optional[List[str]] = None
-    retrieval_context: Optional[List[str]] = None
-    additional_metadata: Optional[Dict[str, Any]] = None
-    tools_called: Optional[List[str]] = None
-    expected_tools: Optional[List[str]] = None
-    trace_id: Optional[str] = None
-
-    example_id: Optional[str] = None
-    eval_run_name: Optional[str] = None
-
-    def to_dict(self) -> dict:
-        """Convert the ScoringResult instance to a dictionary, properly serializing scorer_data."""
-        return {
-            "success": self.success,
-            "scorers_data": [scorer_data.to_dict() for scorer_data in self.scorers_data] if self.scorers_data else None,
-            "input": self.input,
-            "actual_output": self.actual_output,
-            "expected_output": self.expected_output,
-            "context": self.context,
-            "retrieval_context": self.retrieval_context,
-            "additional_metadata": self.additional_metadata,
-            "tools_called": self.tools_called,
-            "expected_tools": self.expected_tools,
-            "trace_id": self.trace_id,
-            "example_id": self.example_id
-        }
-
+    # Need to override this so that it uses this repo's Example class
+    data_object: Example
+    scorers_data: List[ScorerData]
+
+    def model_dump(self, **kwargs):
+        data = super().model_dump(**kwargs)
+        data["data_object"] = self.data_object.model_dump()
+        return data
+
     def __str__(self) -> str:
         return f"ScoringResult(\
             success={self.success}, \
-            scorer_data={self.scorers_data}, \
-            input={self.input}, \
-            actual_output={self.actual_output}, \
-            expected_output={self.expected_output}, \
-            context={self.context}, \
-            retrieval_context={self.retrieval_context}, \
-            additional_metadata={self.additional_metadata}, \
-            tools_called={self.tools_called}, \
-            expected_tools={self.expected_tools}, \
-            trace_id={self.trace_id})"
+            scorers_data={self.scorers_data}, \
+            data_object={self.data_object}, \
+            run_duration={self.run_duration})"
 
 
 def generate_scoring_result(
-    process_example: ProcessExample,
+    data_object: Union[Example],
+    scorers_data: List[ScorerData],
+    run_duration: float,
+    success: bool,
 ) -> ScoringResult:
     """
     Creates a final ScoringResult object for an evaluation run based on the results from a completed LLMApiTestCase.
@@ -83,16 +45,10 @@ def generate_scoring_result(
     When an LLMTestCase is executed, it turns into an LLMApiTestCase and the progress of the evaluation run is tracked.
     At the end of the evaluation run, we create a TestResult object out of the completed LLMApiTestCase.
     """
-    return ScoringResult(
-        success=process_example.success,
-        scorers_data=process_example.scorers_data,
-        input=process_example.input,
-        actual_output=process_example.actual_output,
-        expected_output=process_example.expected_output,
-        context=process_example.context,
-        retrieval_context=process_example.retrieval_context,
-        additional_metadata=process_example.additional_metadata,
-        tools_called=process_example.tools_called,
-        expected_tools=process_example.expected_tools,
-        trace_id=process_example.trace_id
+    scoring_result = ScoringResult(
+        data_object=data_object,
+        success=success,
+        scorers_data=scorers_data,
+        run_duration=run_duration,
     )
+    return scoring_result
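
For orientation, a minimal sketch of how the reworked ScoringResult might be assembled after this change. The Example and ScorerData constructor arguments shown are illustrative assumptions; the generated judgment types define the actual required fields.

from judgeval.data import Example, ScorerData
from judgeval.data.result import generate_scoring_result

# Hypothetical inputs, for illustration only
example = Example(input="What is the capital of France?", actual_output="Paris")
scorer_data = ScorerData(name="Faithfulness", threshold=0.7, score=0.9, success=True)

result = generate_scoring_result(
    data_object=example,
    scorers_data=[scorer_data],
    run_duration=1.2,
    success=True,
)
print(result.model_dump())  # data_object is serialized via Example.model_dump()
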
judgeval/data/scorer_data.py CHANGED
@@ -4,50 +4,14 @@ Implementation of the ScorerData class.
 ScorerData holds the information related to a single, completed Scorer evaluation run.
 """
 
-from typing import List, Union, Optional, Dict
-from pydantic import BaseModel, Field
+from __future__ import annotations
 
-from judgeval.scorers import JudgevalScorer
+from judgeval.data.judgment_types import ScorerData
+from judgeval.scorers import BaseScorer
+from typing import List
 
-class ScorerData(BaseModel):
-    """
-    ScorerData holds the information related to a single, completed Scorer evaluation run.
-
-    For example, if running the Judgment Faithfulness scorer on an example, the ScorerData
-    object will contain whether the example passed its threshold expectation, as well as more detailed
-    information surrounding the evaluation run such as the claims and verdicts generated by the
-    judge model(s).
-    """
-    name: str
-    threshold: float
-    success: bool
-    score: Optional[float] = None
-    reason: Optional[str] = None
-    strict_mode: Optional[bool] = None
-    evaluation_model: Union[List[str], str] = None
-    error: Optional[str] = None
-    evaluation_cost: Union[float, None] = None
-    verbose_logs: Optional[str] = None
-    additional_metadata: Optional[Dict] = None
 
-    def to_dict(self) -> dict:
-        """Convert the ScorerData instance to a JSON-serializable dictionary."""
-        return {
-            "name": self.name,
-            "threshold": self.threshold,
-            "success": self.success,
-            "score": self.score,
-            "reason": self.reason,
-            "strict_mode": self.strict_mode,
-            "evaluation_model": self.evaluation_model,
-            "error": self.error,
-            "evaluation_cost": self.evaluation_cost,
-            "verbose_logs": self.verbose_logs,
-            "additional_metadata": self.additional_metadata
-        }
-
-
-def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
+def create_scorer_data(scorer: BaseScorer) -> List[ScorerData]:
     """
     After a `scorer` is run, it contains information about the example that was evaluated
     using the scorer. For example, after computing Faithfulness, the `scorer` object will contain
@@ -57,30 +21,33 @@ def create_scorer_data(scorer: JudgevalScorer) -> ScorerData:
     contains the output of the scorer run that can be exported to be logged as a part of
     the ScorerResult.
     """
-    if scorer.error is not None:  # error occurred during eval run
-        return ScorerData(
-            name=scorer.__name__,
+    scorers_result = list()
+
+    scorers_result.append(
+        ScorerData(
+            name=scorer.name,
             threshold=scorer.threshold,
-            score=None,
-            reason=None,
-            success=False,
-            strict_mode=scorer.strict_mode,
-            evaluation_model=scorer.evaluation_model,
-            error=scorer.error,
-            evaluation_cost=scorer.evaluation_cost,
-            verbose_logs=scorer.verbose_logs,
-        )
-    else:  # standard execution, no error
-        return ScorerData(
-            name=scorer.__name__,
             score=scorer.score,
-            threshold=scorer.threshold,
             reason=scorer.reason,
-            success=scorer._success_check(),
+            success=scorer.success,
             strict_mode=scorer.strict_mode,
-            evaluation_model=scorer.evaluation_model,
-            error=None,
-            evaluation_cost=scorer.evaluation_cost,
-            verbose_logs=scorer.verbose_logs,
+            evaluation_model=scorer.model,
+            error=scorer.error,
             additional_metadata=scorer.additional_metadata,
         )
+    )
+    if hasattr(scorer, "internal_scorer") and scorer.internal_scorer is not None:
+        scorers_result.append(
+            ScorerData(
+                name=scorer.internal_scorer.name,
+                score=scorer.internal_scorer.score,
+                threshold=scorer.internal_scorer.threshold,
+                reason=scorer.internal_scorer.reason,
+                success=scorer.internal_scorer.success,
+                strict_mode=scorer.internal_scorer.strict_mode,
+                evaluation_model=scorer.internal_scorer.model,
+                error=scorer.internal_scorer.error,
+                additional_metadata=scorer.internal_scorer.additional_metadata,
+            )
+        )
+    return scorers_result
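
A hedged sketch of the new create_scorer_data flow: it now returns a list so that an internal_scorer, when present, is reported alongside the main scorer. The BaseScorer construction and attribute assignments below are assumptions for illustration; the attribute names mirror the fields read in the diff above.

from judgeval.scorers import BaseScorer
from judgeval.data.scorer_data import create_scorer_data

# Hypothetical scorer state after an evaluation run
scorer = BaseScorer(name="Faithfulness", threshold=0.7)
scorer.score, scorer.success, scorer.reason = 0.92, True, "All claims are supported."

for scorer_data in create_scorer_data(scorer):  # now returns List[ScorerData]
    print(scorer_data.name, scorer_data.score, scorer_data.success)
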
judgeval/data/scripts/fix_default_factory.py ADDED
@@ -0,0 +1,23 @@
+#!/usr/bin/env python3
+"""
+Post-process generated Pydantic models with default_factory defaults.
+"""
+
+import sys
+
+
+def fix_mutable_defaults(file_path: str) -> None:
+    """Fix mutable defaults in generated Pydantic models."""
+
+    with open(file_path, "r") as f:
+        content = f.read()
+
+    content = content.replace(" = {}", " = Field(default_factory=dict)")
+    content = content.replace(" = []", " = Field(default_factory=list)")
+    with open(file_path, "w") as f:
+        f.write(content)
+
+
+if __name__ == "__main__":
+    file_path = sys.argv[1]
+    fix_mutable_defaults(file_path)
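
The script rewrites mutable default values in a generated Pydantic module into Field(default_factory=...) calls. A usage sketch, assuming the target is the generated judgment_types module (the path is an assumption, not stated in this diff):

from judgeval.data.scripts.fix_default_factory import fix_mutable_defaults

# Equivalent CLI form: python judgeval/data/scripts/fix_default_factory.py <generated_module.py>
fix_mutable_defaults("judgeval/data/judgment_types.py")  # hypothetical target path
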
judgeval/data/scripts/openapi_transform.py ADDED
@@ -0,0 +1,123 @@
+import orjson
+import sys
+from typing import Any, Dict, Generator, List
+import requests
+
+spec_file = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8000/openapi.json"
+
+if spec_file.startswith("http"):
+    r = requests.get(spec_file)
+    r.raise_for_status()
+    SPEC = r.json()
+else:
+    with open(spec_file, "rb") as f:
+        SPEC = orjson.loads(f.read())
+
+JUDGEVAL_PATHS: List[str] = [
+    "/log_eval_results/",
+]
+
+
+def resolve_ref(ref: str) -> str:
+    assert ref.startswith("#/components/schemas/"), (
+        "Reference must start with #/components/schemas/"
+    )
+    return ref.replace("#/components/schemas/", "")
+
+
+def walk(obj: Any) -> Generator[Any, None, None]:
+    yield obj
+    if isinstance(obj, list):
+        for item in obj:
+            yield from walk(item)
+    elif isinstance(obj, dict):
+        for value in obj.values():
+            yield from walk(value)
+
+
+def get_referenced_schemas(obj: Any) -> Generator[str, None, None]:
+    for value in walk(obj):
+        if isinstance(value, dict) and "$ref" in value:
+            ref = value["$ref"]
+            resolved = resolve_ref(ref)
+            assert isinstance(ref, str), "Reference must be a string"
+            # Strip the _JudgmentType suffix if it exists to get the original schema name
+            if resolved.endswith("_JudgmentType"):
+                resolved = resolved[: -len("_JudgmentType")]
+            yield resolved
+
+
+def transform_schema_refs(obj: Any) -> Any:
+    """Transform all $ref values in a schema to use the _JudgmentType suffix"""
+    if isinstance(obj, dict):
+        result = {}
+        for key, value in obj.items():
+            if (
+                key == "$ref"
+                and isinstance(value, str)
+                and value.startswith("#/components/schemas/")
+            ):
+                # Update the reference to use the suffixed name
+                original_name = resolve_ref(value)
+                suffixed_name = f"{original_name}_JudgmentType"
+                result[key] = f"#/components/schemas/{suffixed_name}"
+            else:
+                result[key] = transform_schema_refs(value)
+        return result
+    elif isinstance(obj, list):
+        return [transform_schema_refs(item) for item in obj]
+    else:
+        return obj
+
+
+filtered_paths = {
+    path: spec_data
+    for path, spec_data in SPEC["paths"].items()
+    if path in JUDGEVAL_PATHS
+}
+
+
+def filter_schemas() -> Dict[str, Any]:
+    result: Dict[str, Any] = {}
+    processed_original_names: set[str] = set()
+    schemas_to_scan: Any = filtered_paths
+
+    while True:
+        to_commit: Dict[str, Any] = {}
+        for original_schema_name in get_referenced_schemas(schemas_to_scan):
+            if original_schema_name in processed_original_names:
+                continue
+
+            assert original_schema_name in SPEC["components"]["schemas"], (
+                f"Schema {original_schema_name} not found in components.schemas"
+            )
+            # Transform the schema to update any internal references
+            original_schema = SPEC["components"]["schemas"][original_schema_name]
+            transformed_schema = transform_schema_refs(original_schema)
+            suffixed_name = f"{original_schema_name}_JudgmentType"
+            to_commit[suffixed_name] = transformed_schema
+            processed_original_names.add(original_schema_name)
+
+        if not to_commit:
+            break
+
+        result.update(to_commit)
+        schemas_to_scan = to_commit
+
+    return result
+
+
+# Transform the filtered paths to update schema references
+transformed_paths = transform_schema_refs(filtered_paths)
+
+spec = {
+    "openapi": SPEC["openapi"],
+    "info": SPEC["info"],
+    "paths": transformed_paths,
+    "components": {
+        **SPEC["components"],
+        "schemas": filter_schemas(),
+    },
+}
+
+print(orjson.dumps(spec, option=orjson.OPT_INDENT_2).decode("utf-8"))
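
The script filters an OpenAPI spec down to the paths listed in JUDGEVAL_PATHS, renames every referenced schema with a _JudgmentType suffix, and prints the trimmed spec to stdout. A minimal illustration of the renaming convention; the CLI invocation in the comment is an assumed workflow, not documented in this diff.

# Assumed invocation: python judgeval/data/scripts/openapi_transform.py http://localhost:8000/openapi.json > filtered_spec.json
ref = "#/components/schemas/ScoringResult"
name = ref.replace("#/components/schemas/", "")
print(f"#/components/schemas/{name}_JudgmentType")  # -> #/components/schemas/ScoringResult_JudgmentType
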
judgeval/data/trace.py ADDED
@@ -0,0 +1,121 @@
+from typing import Optional, List, Dict, Any
+from pydantic import BaseModel
+from .judgment_types import (
+    OtelSpanDetailScores,
+    OtelSpanDetail,
+    OtelTraceListItem,
+)
+
+
+class TraceUsage(BaseModel):
+    prompt_tokens: Optional[int] = None
+    completion_tokens: Optional[int] = None
+    cache_creation_input_tokens: Optional[int] = None
+    cache_read_input_tokens: Optional[int] = None
+    total_tokens: Optional[int] = None
+    prompt_tokens_cost_usd: Optional[float] = None
+    completion_tokens_cost_usd: Optional[float] = None
+    total_cost_usd: Optional[float] = None
+    model_name: Optional[str] = None
+
+
+class TraceScore(OtelSpanDetailScores):
+    """Score information for a trace or span."""
+
+    pass
+
+
+class TraceRule(BaseModel):
+    """Rule that was triggered for a trace."""
+
+    rule_id: str
+    rule_name: str
+
+
+class TraceSpan(OtelSpanDetail):
+    """Individual span within a trace with complete telemetry data."""
+
+    @classmethod
+    def from_otel_span_detail(cls, span_detail: OtelSpanDetail) -> "TraceSpan":
+        """Create TraceSpan from OtelSpanDetail, converting scores to TraceScore."""
+        data = span_detail.model_dump()
+
+        if "scores" in data and data["scores"]:
+            data["scores"] = [TraceScore(**score) for score in data["scores"]]
+
+        return cls(**data)
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert TraceSpan to dictionary."""
+        return self.model_dump(exclude_none=True)
+
+
+class Trace(OtelTraceListItem):
+    """Complete trace with metadata and all associated spans."""
+
+    spans: List[TraceSpan] = []
+    rules: Optional[List[TraceRule]] = []
+
+    @classmethod
+    def from_dataset_trace_with_spans(cls, dataset_trace: Any) -> "Trace":
+        """Create Trace from DatasetTraceWithSpans (handles both API and judgment types)."""
+
+        if hasattr(dataset_trace, "trace_detail"):
+            trace_detail = dataset_trace.trace_detail
+            spans_data = dataset_trace.spans
+        else:
+            trace_detail = dataset_trace.get("trace_detail", {})
+            spans_data = dataset_trace.get("spans", [])
+
+        if hasattr(trace_detail, "model_dump"):
+            trace_data = trace_detail.model_dump()
+        elif isinstance(trace_detail, dict):
+            trace_data = trace_detail.copy()
+        else:
+            trace_data = dict(trace_detail)
+
+        spans = []
+        for span in spans_data:
+            if hasattr(span, "model_dump"):
+                spans.append(TraceSpan.from_otel_span_detail(span))
+            else:
+                # Handle dict spans
+                span_data = dict(span) if not isinstance(span, dict) else span.copy()
+                if "scores" in span_data and span_data["scores"]:
+                    span_data["scores"] = [
+                        TraceScore(**score)
+                        if isinstance(score, dict)
+                        else TraceScore(**score.model_dump())
+                        for score in span_data["scores"]
+                    ]
+                spans.append(TraceSpan(**span_data))
+
+        rules = []
+        if "rule_id" in trace_data and trace_data["rule_id"]:
+            rules = [
+                TraceRule(
+                    rule_id=trace_data["rule_id"],
+                    rule_name=f"Rule {trace_data['rule_id']}",
+                )
+            ]
+
+        trace_data.pop("scores", [])
+        trace_data.pop("rule_id", None)
+        trace = cls(**trace_data)
+
+        trace.spans = spans
+        trace.rules = rules
+
+        return trace
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert Trace to dictionary."""
+        return self.model_dump(exclude_none=True)
+
+    def __len__(self) -> int:
+        """Return the number of spans in the trace."""
+        return len(self.spans)
+
+    def __iter__(self):
+        """Iterate over spans in the trace."""
+        return iter(self.spans)
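
A minimal usage sketch for the new Trace model. The payload keys below (trace_id, rule_id, span_id, and the score fields) are illustrative assumptions; the generated OtelTraceListItem and OtelSpanDetail schemas define the real required fields.

from judgeval.data.trace import Trace

# Hypothetical payload shaped like DatasetTraceWithSpans
payload = {
    "trace_detail": {"trace_id": "abc123", "rule_id": "rule-1"},
    "spans": [{"span_id": "span-1", "scores": [{"name": "Faithfulness", "score": 0.9}]}],
}

trace = Trace.from_dataset_trace_with_spans(payload)
print(len(trace))      # number of spans via __len__
for span in trace:     # iterate spans via __iter__
    print(span.to_dict())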