judgeval 0.0.11__py3-none-any.whl → 0.22.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of judgeval might be problematic.

Files changed (171)
  1. judgeval/__init__.py +177 -12
  2. judgeval/api/__init__.py +519 -0
  3. judgeval/api/api_types.py +407 -0
  4. judgeval/cli.py +79 -0
  5. judgeval/constants.py +76 -47
  6. judgeval/data/__init__.py +3 -3
  7. judgeval/data/evaluation_run.py +125 -0
  8. judgeval/data/example.py +15 -56
  9. judgeval/data/judgment_types.py +450 -0
  10. judgeval/data/result.py +29 -73
  11. judgeval/data/scorer_data.py +29 -62
  12. judgeval/data/scripts/fix_default_factory.py +23 -0
  13. judgeval/data/scripts/openapi_transform.py +123 -0
  14. judgeval/data/trace.py +121 -0
  15. judgeval/dataset/__init__.py +264 -0
  16. judgeval/env.py +52 -0
  17. judgeval/evaluation/__init__.py +344 -0
  18. judgeval/exceptions.py +27 -0
  19. judgeval/integrations/langgraph/__init__.py +13 -0
  20. judgeval/integrations/openlit/__init__.py +50 -0
  21. judgeval/judges/__init__.py +2 -3
  22. judgeval/judges/base_judge.py +2 -3
  23. judgeval/judges/litellm_judge.py +100 -20
  24. judgeval/judges/together_judge.py +101 -20
  25. judgeval/judges/utils.py +20 -24
  26. judgeval/logger.py +62 -0
  27. judgeval/prompt/__init__.py +330 -0
  28. judgeval/scorers/__init__.py +18 -25
  29. judgeval/scorers/agent_scorer.py +17 -0
  30. judgeval/scorers/api_scorer.py +45 -41
  31. judgeval/scorers/base_scorer.py +83 -38
  32. judgeval/scorers/example_scorer.py +17 -0
  33. judgeval/scorers/exceptions.py +1 -0
  34. judgeval/scorers/judgeval_scorers/__init__.py +0 -148
  35. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +19 -17
  36. judgeval/scorers/judgeval_scorers/api_scorers/answer_correctness.py +13 -19
  37. judgeval/scorers/judgeval_scorers/api_scorers/answer_relevancy.py +12 -19
  38. judgeval/scorers/judgeval_scorers/api_scorers/faithfulness.py +13 -19
  39. judgeval/scorers/judgeval_scorers/api_scorers/instruction_adherence.py +15 -0
  40. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +327 -0
  41. judgeval/scorers/score.py +77 -306
  42. judgeval/scorers/utils.py +4 -199
  43. judgeval/tracer/__init__.py +1122 -2
  44. judgeval/tracer/constants.py +1 -0
  45. judgeval/tracer/exporters/__init__.py +40 -0
  46. judgeval/tracer/exporters/s3.py +119 -0
  47. judgeval/tracer/exporters/store.py +59 -0
  48. judgeval/tracer/exporters/utils.py +32 -0
  49. judgeval/tracer/keys.py +63 -0
  50. judgeval/tracer/llm/__init__.py +7 -0
  51. judgeval/tracer/llm/config.py +78 -0
  52. judgeval/tracer/llm/constants.py +9 -0
  53. judgeval/tracer/llm/llm_anthropic/__init__.py +3 -0
  54. judgeval/tracer/llm/llm_anthropic/config.py +6 -0
  55. judgeval/tracer/llm/llm_anthropic/messages.py +452 -0
  56. judgeval/tracer/llm/llm_anthropic/messages_stream.py +322 -0
  57. judgeval/tracer/llm/llm_anthropic/wrapper.py +59 -0
  58. judgeval/tracer/llm/llm_google/__init__.py +3 -0
  59. judgeval/tracer/llm/llm_google/config.py +6 -0
  60. judgeval/tracer/llm/llm_google/generate_content.py +127 -0
  61. judgeval/tracer/llm/llm_google/wrapper.py +30 -0
  62. judgeval/tracer/llm/llm_openai/__init__.py +3 -0
  63. judgeval/tracer/llm/llm_openai/beta_chat_completions.py +216 -0
  64. judgeval/tracer/llm/llm_openai/chat_completions.py +501 -0
  65. judgeval/tracer/llm/llm_openai/config.py +6 -0
  66. judgeval/tracer/llm/llm_openai/responses.py +506 -0
  67. judgeval/tracer/llm/llm_openai/utils.py +42 -0
  68. judgeval/tracer/llm/llm_openai/wrapper.py +63 -0
  69. judgeval/tracer/llm/llm_together/__init__.py +3 -0
  70. judgeval/tracer/llm/llm_together/chat_completions.py +406 -0
  71. judgeval/tracer/llm/llm_together/config.py +6 -0
  72. judgeval/tracer/llm/llm_together/wrapper.py +52 -0
  73. judgeval/tracer/llm/providers.py +19 -0
  74. judgeval/tracer/managers.py +167 -0
  75. judgeval/tracer/processors/__init__.py +220 -0
  76. judgeval/tracer/utils.py +19 -0
  77. judgeval/trainer/__init__.py +14 -0
  78. judgeval/trainer/base_trainer.py +122 -0
  79. judgeval/trainer/config.py +128 -0
  80. judgeval/trainer/console.py +144 -0
  81. judgeval/trainer/fireworks_trainer.py +396 -0
  82. judgeval/trainer/trainable_model.py +243 -0
  83. judgeval/trainer/trainer.py +70 -0
  84. judgeval/utils/async_utils.py +39 -0
  85. judgeval/utils/decorators/__init__.py +0 -0
  86. judgeval/utils/decorators/dont_throw.py +37 -0
  87. judgeval/utils/decorators/use_once.py +13 -0
  88. judgeval/utils/file_utils.py +97 -0
  89. judgeval/utils/guards.py +36 -0
  90. judgeval/utils/meta.py +27 -0
  91. judgeval/utils/project.py +15 -0
  92. judgeval/utils/serialize.py +253 -0
  93. judgeval/utils/testing.py +70 -0
  94. judgeval/utils/url.py +10 -0
  95. judgeval/utils/version_check.py +28 -0
  96. judgeval/utils/wrappers/README.md +3 -0
  97. judgeval/utils/wrappers/__init__.py +15 -0
  98. judgeval/utils/wrappers/immutable_wrap_async.py +74 -0
  99. judgeval/utils/wrappers/immutable_wrap_async_iterator.py +84 -0
  100. judgeval/utils/wrappers/immutable_wrap_sync.py +66 -0
  101. judgeval/utils/wrappers/immutable_wrap_sync_iterator.py +84 -0
  102. judgeval/utils/wrappers/mutable_wrap_async.py +67 -0
  103. judgeval/utils/wrappers/mutable_wrap_sync.py +67 -0
  104. judgeval/utils/wrappers/py.typed +0 -0
  105. judgeval/utils/wrappers/utils.py +35 -0
  106. judgeval/version.py +5 -0
  107. judgeval/warnings.py +4 -0
  108. judgeval-0.22.2.dist-info/METADATA +265 -0
  109. judgeval-0.22.2.dist-info/RECORD +112 -0
  110. judgeval-0.22.2.dist-info/entry_points.txt +2 -0
  111. judgeval/clients.py +0 -39
  112. judgeval/common/__init__.py +0 -8
  113. judgeval/common/exceptions.py +0 -28
  114. judgeval/common/logger.py +0 -189
  115. judgeval/common/tracer.py +0 -798
  116. judgeval/common/utils.py +0 -763
  117. judgeval/data/api_example.py +0 -111
  118. judgeval/data/datasets/__init__.py +0 -5
  119. judgeval/data/datasets/dataset.py +0 -286
  120. judgeval/data/datasets/eval_dataset_client.py +0 -193
  121. judgeval/data/datasets/ground_truth.py +0 -54
  122. judgeval/data/datasets/utils.py +0 -74
  123. judgeval/evaluation_run.py +0 -132
  124. judgeval/judges/mixture_of_judges.py +0 -248
  125. judgeval/judgment_client.py +0 -354
  126. judgeval/run_evaluation.py +0 -439
  127. judgeval/scorers/judgeval_scorer.py +0 -140
  128. judgeval/scorers/judgeval_scorers/api_scorers/contextual_precision.py +0 -19
  129. judgeval/scorers/judgeval_scorers/api_scorers/contextual_recall.py +0 -19
  130. judgeval/scorers/judgeval_scorers/api_scorers/contextual_relevancy.py +0 -22
  131. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -19
  132. judgeval/scorers/judgeval_scorers/api_scorers/json_correctness.py +0 -32
  133. judgeval/scorers/judgeval_scorers/api_scorers/summarization.py +0 -20
  134. judgeval/scorers/judgeval_scorers/api_scorers/tool_correctness.py +0 -19
  135. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  136. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  137. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -54
  138. judgeval/scorers/judgeval_scorers/local_implementations/__init__.py +0 -24
  139. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/__init__.py +0 -4
  140. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/answer_correctness_scorer.py +0 -277
  141. judgeval/scorers/judgeval_scorers/local_implementations/answer_correctness/prompts.py +0 -169
  142. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/__init__.py +0 -4
  143. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/answer_relevancy_scorer.py +0 -298
  144. judgeval/scorers/judgeval_scorers/local_implementations/answer_relevancy/prompts.py +0 -174
  145. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/__init__.py +0 -3
  146. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/contextual_precision_scorer.py +0 -264
  147. judgeval/scorers/judgeval_scorers/local_implementations/contextual_precision/prompts.py +0 -106
  148. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/__init__.py +0 -3
  149. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/contextual_recall_scorer.py +0 -254
  150. judgeval/scorers/judgeval_scorers/local_implementations/contextual_recall/prompts.py +0 -142
  151. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/__init__.py +0 -3
  152. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/contextual_relevancy_scorer.py +0 -245
  153. judgeval/scorers/judgeval_scorers/local_implementations/contextual_relevancy/prompts.py +0 -121
  154. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/__init__.py +0 -3
  155. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/faithfulness_scorer.py +0 -325
  156. judgeval/scorers/judgeval_scorers/local_implementations/faithfulness/prompts.py +0 -268
  157. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/__init__.py +0 -3
  158. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/hallucination_scorer.py +0 -263
  159. judgeval/scorers/judgeval_scorers/local_implementations/hallucination/prompts.py +0 -104
  160. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/__init__.py +0 -5
  161. judgeval/scorers/judgeval_scorers/local_implementations/json_correctness/json_correctness_scorer.py +0 -134
  162. judgeval/scorers/judgeval_scorers/local_implementations/summarization/__init__.py +0 -3
  163. judgeval/scorers/judgeval_scorers/local_implementations/summarization/prompts.py +0 -247
  164. judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarization_scorer.py +0 -550
  165. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py +0 -3
  166. judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py +0 -157
  167. judgeval/scorers/prompt_scorer.py +0 -439
  168. judgeval-0.0.11.dist-info/METADATA +0 -36
  169. judgeval-0.0.11.dist-info/RECORD +0 -84
  170. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/WHEEL +0 -0
  171. {judgeval-0.0.11.dist-info → judgeval-0.22.2.dist-info}/licenses/LICENSE.md +0 -0
judgeval/data/evaluation_run.py ADDED
@@ -0,0 +1,125 @@
+ from typing import List, Optional, Union, Tuple, Sequence
+ from pydantic import field_validator, model_validator, Field, BaseModel
+ from datetime import datetime, timezone
+ import uuid
+
+ from judgeval.data import Example
+ from judgeval.scorers import APIScorerConfig
+ from judgeval.scorers.example_scorer import ExampleScorer
+ from judgeval.constants import ACCEPTABLE_MODELS
+ from judgeval.data.judgment_types import (
+     ExampleEvaluationRun as ExampleEvaluationRunJudgmentType,
+     TraceEvaluationRun as TraceEvaluationRunJudgmentType,
+ )
+
+
+ class EvaluationRun(BaseModel):
+     id: str = Field(default_factory=lambda: str(uuid.uuid4()))
+     created_at: str = Field(
+         default_factory=lambda: datetime.now(timezone.utc).isoformat()
+     )
+     custom_scorers: List[ExampleScorer] = Field(default_factory=list)
+     judgment_scorers: Sequence[APIScorerConfig] = Field(default_factory=list)
+     scorers: Sequence[Union[ExampleScorer, APIScorerConfig]] = Field(
+         default_factory=list
+     )
+     model: Optional[str] = None
+
+     def __init__(
+         self,
+         scorers: Optional[List[Union[ExampleScorer, APIScorerConfig]]] = None,
+         **kwargs,
+     ):
+         """
+         Initialize EvaluationRun with automatic scorer classification.
+
+         Args:
+             scorers: List of scorers that will be automatically sorted into custom_scorers or judgment_scorers
+             **kwargs: Other initialization arguments
+         """
+         if scorers is not None:
+             # Automatically sort scorers into appropriate fields
+             custom_scorers = [s for s in scorers if isinstance(s, ExampleScorer)]
+             judgment_scorers = [s for s in scorers if isinstance(s, APIScorerConfig)]
+
+             # Always set both fields as lists (even if empty) to satisfy validation
+             kwargs["custom_scorers"] = custom_scorers
+             kwargs["judgment_scorers"] = judgment_scorers
+
+         super().__init__(**kwargs)
+
+     def model_dump(self, **kwargs):
+         data = super().model_dump(**kwargs)
+         data["custom_scorers"] = [s.model_dump() for s in self.custom_scorers]
+         data["judgment_scorers"] = [s.model_dump() for s in self.judgment_scorers]
+
+         return data
+
+     @model_validator(mode="after")
+     @classmethod
+     def validate_scorer_lists(cls, values):
+         custom_scorers = values.custom_scorers
+         judgment_scorers = values.judgment_scorers
+
+         # Check that both lists are not empty
+         if not custom_scorers and not judgment_scorers:
+             raise ValueError(
+                 "At least one of custom_scorers or judgment_scorers must be provided."
+             )
+
+         # Check that only one list is filled
+         if custom_scorers and judgment_scorers:
+             raise ValueError(
+                 "Only one of custom_scorers or judgment_scorers can be provided, not both."
+             )
+
+         return values
+
+     @field_validator("model")
+     def validate_model(cls, v, values):
+         # Check if model is string or list of strings
+         if v is not None and isinstance(v, str):
+             if v not in ACCEPTABLE_MODELS:
+                 raise ValueError(
+                     f"Model name {v} not recognized. Please select a valid model name.)"
+                 )
+         return v
+
+
+ class ExampleEvaluationRun(EvaluationRun, ExampleEvaluationRunJudgmentType):  # type: ignore
+     """
+     Stores example and evaluation scorers together for running an eval task
+
+     Args:
+         project_name (str): The name of the project the evaluation results belong to
+         eval_name (str): A name for this evaluation run
+         examples (List[Example]): The examples to evaluate
+         scorers (List[Union[BaseScorer, APIScorerConfig]]): A list of scorers to use for evaluation
+         model (str): The model used as a judge when using LLM as a Judge
+     """
+
+     examples: List[Example]  # type: ignore
+
+     @field_validator("examples")
+     def validate_examples(cls, v):
+         if not v:
+             raise ValueError("Examples cannot be empty.")
+         for item in v:
+             if not isinstance(item, Example):
+                 raise ValueError(f"Item of type {type(item)} is not a Example")
+         return v
+
+     def model_dump(self, **kwargs):
+         data = super().model_dump(**kwargs)
+         data["examples"] = [example.model_dump() for example in self.examples]
+         return data
+
+
+ class TraceEvaluationRun(EvaluationRun, TraceEvaluationRunJudgmentType):  # type: ignore
+     trace_and_span_ids: List[Tuple[str, str]]  # type: ignore
+
+     @field_validator("trace_and_span_ids")
+     def validate_trace_and_span_ids(cls, v):
+         if not v:
+             raise ValueError("Trace and span IDs are required for trace evaluations.")
+         return v
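
The rewritten constructor above routes a single scorers list into custom_scorers and judgment_scorers via isinstance checks, and validate_scorer_lists then requires exactly one of the two to be non-empty. A minimal sketch of that behavior, assuming FaithfulnessScorer (see api_scorers/faithfulness.py in the file list) is an APIScorerConfig subclass exported by judgeval.scorers:

# Sketch, not from the package: FaithfulnessScorer is assumed to be an
# APIScorerConfig subclass exported by judgeval.scorers.
from judgeval.data.evaluation_run import EvaluationRun
from judgeval.scorers import FaithfulnessScorer

run = EvaluationRun(scorers=[FaithfulnessScorer(threshold=0.8)])
assert run.judgment_scorers    # routed here by the isinstance() check
assert not run.custom_scorers  # left empty, satisfying validate_scorer_lists

# Mixing an APIScorerConfig with a custom ExampleScorer in the same list
# would populate both fields and make validate_scorer_lists raise ValueError.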
judgeval/data/example.py CHANGED
@@ -2,17 +2,15 @@
  Classes for representing examples in a dataset.
  """
 
- 
- from typing import TypeVar, Optional, Any, Dict, List
- from pydantic import BaseModel
  from enum import Enum
  from datetime import datetime
+ from typing import Dict, Any, Optional
+ from judgeval.data.judgment_types import Example as JudgmentExample
+ from uuid import uuid4
+ from pydantic import Field
 
 
- Input = TypeVar('Input')
- Output = TypeVar('Output')
- 
- class ExampleParams(Enum):
+ class ExampleParams(str, Enum):
      INPUT = "input"
      ACTUAL_OUTPUT = "actual_output"
      EXPECTED_OUTPUT = "expected_output"
@@ -20,57 +18,18 @@ class ExampleParams(Enum):
      RETRIEVAL_CONTEXT = "retrieval_context"
      TOOLS_CALLED = "tools_called"
      EXPECTED_TOOLS = "expected_tools"
-     REASONING = "reasoning"
+     ADDITIONAL_METADATA = "additional_metadata"
 
 
- class Example(BaseModel):
-     input: Input
-     actual_output: Output
-     expected_output: Optional[str] = None
-     context: Optional[List[str]] = None
-     retrieval_context: Optional[List[str]] = None
-     additional_metadata: Optional[Dict[str, Any]] = None
-     tools_called: Optional[List[str]] = None
-     expected_tools: Optional[List[str]] = None
+ class Example(JudgmentExample):
+     example_id: str = Field(default_factory=lambda: str(uuid4()))
+     created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
      name: Optional[str] = None
-     example_id: Optional[str] = None
-     timestamp: Optional[str] = None
-     trace_id: Optional[str] = None
-
-     def __init__(self, **data):
-         super().__init__(**data)
-         # Set timestamp if not provided
-         if self.timestamp is None:
-             self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
 
-     def to_dict(self):
-         return {
-             "input": self.input,
-             "actual_output": self.actual_output,
-             "expected_output": self.expected_output,
-             "context": self.context,
-             "retrieval_context": self.retrieval_context,
-             "additional_metadata": self.additional_metadata,
-             "tools_called": self.tools_called,
-             "expected_tools": self.expected_tools,
-             "name": self.name,
-             "example_id": self.example_id,
-             "timestamp": self.timestamp,
-             "trace_id": self.trace_id
-         }
+     def to_dict(self) -> Dict[str, Any]:
+         data = super().model_dump(warnings=False)
+         return data
 
-     def __str__(self):
-         return (
-             f"Example(input={self.input}, "
-             f"actual_output={self.actual_output}, "
-             f"expected_output={self.expected_output}, "
-             f"context={self.context}, "
-             f"retrieval_context={self.retrieval_context}, "
-             f"additional_metadata={self.additional_metadata}, "
-             f"tools_called={self.tools_called}, "
-             f"expected_tools={self.expected_tools}, "
-             f"name={self.name}, "
-             f"example_id={self.example_id}, "
-             f"timestamp={self.timestamp}, "
-             f"trace_id={self.trace_id})"
-         )
+     def get_fields(self):
+         excluded = {"example_id", "name", "created_at"}
+         return self.model_dump(exclude=excluded)
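
Example now subclasses the generated JudgmentExample (defined in judgment_types.py below), whose extra="allow" config means evaluation fields such as input and actual_output ride along as free-form extras rather than declared attributes. A hypothetical construction under that assumption:

# Hypothetical usage; only example_id, created_at, and name are declared
# fields, so the remaining keyword arguments are kept via extra="allow".
from judgeval.data import Example

ex = Example(
    input="What is the capital of France?",
    actual_output="Paris",
    expected_output="Paris",
)
print(ex.example_id)    # auto-generated uuid4 string
print(ex.get_fields())  # model_dump() minus example_id, name, created_at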
judgeval/data/judgment_types.py ADDED
@@ -0,0 +1,450 @@
+ # generated by datamodel-codegen:
+ #   filename: .openapi.json
+ #   timestamp: 2025-10-25T22:30:19+00:00
+
+ from __future__ import annotations
+ from typing import Annotated, Any, Dict, List, Optional, Union
+ from pydantic import AwareDatetime, BaseModel, ConfigDict, Field, RootModel
+ from enum import Enum
+
+
+ class TraceAndSpanId(RootModel[List]):
+     root: Annotated[List, Field(max_length=2, min_length=2)]
+
+
+ class EvalResultsFetch(BaseModel):
+     experiment_run_id: Annotated[str, Field(title="Experiment Run Id")]
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class DatasetFetch(BaseModel):
+     dataset_name: Annotated[str, Field(title="Dataset Name")]
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class DatasetsFetch(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ProjectAdd(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ProjectAddResponse(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+
+
+ class ProjectDeleteFromJudgevalResponse(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ProjectDeleteResponse(BaseModel):
+     message: Annotated[str, Field(title="Message")]
+
+
+ class ScorerExistsRequest(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+
+
+ class ScorerExistsResponse(BaseModel):
+     exists: Annotated[bool, Field(title="Exists")]
+
+
+ class SavePromptScorerRequest(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     threshold: Annotated[float, Field(title="Threshold")]
+     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+     options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+     description: Annotated[Optional[str], Field(title="Description")] = None
+
+
+ class FetchPromptScorersRequest(BaseModel):
+     names: Annotated[Optional[List[str]], Field(title="Names")] = None
+     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = None
+
+
+ class CustomScorerUploadPayload(BaseModel):
+     scorer_name: Annotated[str, Field(title="Scorer Name")]
+     scorer_code: Annotated[str, Field(title="Scorer Code")]
+     requirements_text: Annotated[str, Field(title="Requirements Text")]
+     overwrite: Annotated[Optional[bool], Field(title="Overwrite")] = False
+
+
+ class CustomScorerTemplateResponse(BaseModel):
+     scorer_name: Annotated[str, Field(title="Scorer Name")]
+     status: Annotated[str, Field(title="Status")]
+     message: Annotated[str, Field(title="Message")]
+
+
+ class PromptInsertRequest(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     tags: Annotated[List[str], Field(title="Tags")]
+
+
+ class PromptInsertResponse(BaseModel):
+     commit_id: Annotated[str, Field(title="Commit Id")]
+     parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+     created_at: Annotated[str, Field(title="Created At")]
+
+
+ class PromptTagRequest(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+     name: Annotated[str, Field(title="Name")]
+     commit_id: Annotated[str, Field(title="Commit Id")]
+     tags: Annotated[List[str], Field(title="Tags")]
+
+
+ class PromptTagResponse(BaseModel):
+     commit_id: Annotated[str, Field(title="Commit Id")]
+
+
+ class PromptUntagRequest(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+     name: Annotated[str, Field(title="Name")]
+     tags: Annotated[List[str], Field(title="Tags")]
+
+
+ class PromptUntagResponse(BaseModel):
+     commit_ids: Annotated[List[str], Field(title="Commit Ids")]
+
+
+ class ResolveProjectNameRequest(BaseModel):
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class ResolveProjectNameResponse(BaseModel):
+     project_id: Annotated[str, Field(title="Project Id")]
+
+
+ class TraceIdRequest(BaseModel):
+     trace_id: Annotated[str, Field(title="Trace Id")]
+
+
+ class SpanScoreRequest(BaseModel):
+     span_id: Annotated[str, Field(title="Span Id")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+
+
+ class BaseScorer(BaseModel):
+     score_type: Annotated[str, Field(title="Score Type")]
+     threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+     name: Annotated[Optional[str], Field(title="Name")] = None
+     class_name: Annotated[Optional[str], Field(title="Class Name")] = None
+     score: Annotated[Optional[float], Field(title="Score")] = None
+     score_breakdown: Annotated[
+         Optional[Dict[str, Any]], Field(title="Score Breakdown")
+     ] = None
+     reason: Annotated[Optional[str], Field(title="Reason")] = ""
+     using_native_model: Annotated[Optional[bool], Field(title="Using Native Model")] = (
+         None
+     )
+     success: Annotated[Optional[bool], Field(title="Success")] = None
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     model_client: Annotated[Any, Field(title="Model Client")] = None
+     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+     error: Annotated[Optional[str], Field(title="Error")] = None
+     additional_metadata: Annotated[
+         Optional[Dict[str, Any]], Field(title="Additional Metadata")
+     ] = None
+     user: Annotated[Optional[str], Field(title="User")] = None
+     server_hosted: Annotated[Optional[bool], Field(title="Server Hosted")] = False
+
+
+ class ScorerConfig(BaseModel):
+     score_type: Annotated[str, Field(title="Score Type")]
+     name: Annotated[Optional[str], Field(title="Name")] = None
+     threshold: Annotated[Optional[float], Field(title="Threshold")] = 0.5
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = False
+     required_params: Annotated[Optional[List[str]], Field(title="Required Params")] = []
+     kwargs: Annotated[Optional[Dict[str, Any]], Field(title="Kwargs")] = None
+
+
+ class Example(BaseModel):
+     model_config = ConfigDict(
+         extra="allow",
+     )
+     example_id: Annotated[Optional[str], Field(title="Example Id")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None
+     name: Annotated[Optional[str], Field(title="Name")] = None
+
+
+ class ValidationError(BaseModel):
+     loc: Annotated[List[Union[str, int]], Field(title="Location")]
+     msg: Annotated[str, Field(title="Message")]
+     type: Annotated[str, Field(title="Error Type")]
+
+
+ class UsageInfo(BaseModel):
+     total_judgees: Annotated[int, Field(title="Total Judgees")]
+     regular_use: Annotated[int, Field(title="Regular Use")]
+     pay_as_you_go_use: Annotated[int, Field(title="Pay As You Go Use")]
+     remaining_regular: Annotated[int, Field(title="Remaining Regular")]
+     remaining_after: Annotated[int, Field(title="Remaining After")]
+
+
+ class DatasetKind(Enum):
+     trace = "trace"
+     example = "example"
+
+
+ class PromptScorer(BaseModel):
+     id: Annotated[str, Field(title="Id")]
+     user_id: Annotated[str, Field(title="User Id")]
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     threshold: Annotated[float, Field(title="Threshold")]
+     model: Annotated[Optional[str], Field(title="Model")] = "gpt-5"
+     options: Annotated[Optional[Dict[str, float]], Field(title="Options")] = None
+     description: Annotated[Optional[str], Field(title="Description")] = None
+     created_at: Annotated[Optional[AwareDatetime], Field(title="Created At")] = None
+     updated_at: Annotated[Optional[AwareDatetime], Field(title="Updated At")] = None
+     is_trace: Annotated[Optional[bool], Field(title="Is Trace")] = False
+     is_bucket_rubric: Annotated[Optional[bool], Field(title="Is Bucket Rubric")] = None
+
+
+ class PromptCommitInfo(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     prompt: Annotated[str, Field(title="Prompt")]
+     tags: Annotated[List[str], Field(title="Tags")]
+     commit_id: Annotated[str, Field(title="Commit Id")]
+     parent_commit_id: Annotated[Optional[str], Field(title="Parent Commit Id")] = None
+     created_at: Annotated[str, Field(title="Created At")]
+     first_name: Annotated[str, Field(title="First Name")]
+     last_name: Annotated[str, Field(title="Last Name")]
+     user_email: Annotated[str, Field(title="User Email")]
+
+
+ class ScorerData(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
+     name: Annotated[str, Field(title="Name")]
+     threshold: Annotated[float, Field(title="Threshold")]
+     success: Annotated[bool, Field(title="Success")]
+     score: Annotated[Optional[float], Field(title="Score")] = None
+     reason: Annotated[Optional[str], Field(title="Reason")] = None
+     strict_mode: Annotated[Optional[bool], Field(title="Strict Mode")] = None
+     evaluation_model: Annotated[Optional[str], Field(title="Evaluation Model")] = None
+     error: Annotated[Optional[str], Field(title="Error")] = None
+     additional_metadata: Annotated[
+         Optional[Dict[str, Any]], Field(title="Additional Metadata")
+     ] = None
+
+
+ class OtelTraceSpan(BaseModel):
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     project_id: Annotated[Optional[str], Field(title="Project Id")] = None
+     user_id: Annotated[str, Field(title="User Id")]
+     timestamp: Annotated[str, Field(title="Timestamp")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+     span_id: Annotated[str, Field(title="Span Id")]
+     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+     trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+     span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+     service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+     resource_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Resource Attributes")
+     ] = None
+     span_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Span Attributes")
+     ] = None
+     duration: Annotated[Optional[int], Field(title="Duration")] = None
+     status_code: Annotated[Optional[int], Field(title="Status Code")] = None
+     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
+     links: Annotated[Optional[List[Dict[str, Any]]], Field(title="Links")] = None
+
+
+ class OtelSpanListItemScores(BaseModel):
+     success: Annotated[bool, Field(title="Success")]
+     score: Annotated[float, Field(title="Score")]
+     reason: Annotated[Optional[str], Field(title="Reason")] = None
+     name: Annotated[str, Field(title="Name")]
+
+
+ class OtelSpanDetailScores(BaseModel):
+     success: Annotated[bool, Field(title="Success")]
+     score: Annotated[float, Field(title="Score")]
+     reason: Annotated[Optional[str], Field(title="Reason")] = None
+     name: Annotated[str, Field(title="Name")]
+     example_id: Annotated[Optional[str], Field(title="Example Id")] = None
+
+
+ class ExampleEvaluationRun(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
+     custom_scorers: Annotated[
+         Optional[List[BaseScorer]], Field(title="Custom Scorers")
+     ] = []
+     judgment_scorers: Annotated[
+         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
+     ] = []
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None
+     examples: Annotated[List[Example], Field(title="Examples")]
+     trace_span_id: Annotated[Optional[str], Field(title="Trace Span Id")] = None
+     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+
+
+ class HTTPValidationError(BaseModel):
+     detail: Annotated[Optional[List[ValidationError]], Field(title="Detail")] = None
+
+
+ class TraceEvaluationRun(BaseModel):
+     id: Annotated[Optional[str], Field(title="Id")] = None
+     project_name: Annotated[str, Field(title="Project Name")]
+     eval_name: Annotated[str, Field(title="Eval Name")]
+     custom_scorers: Annotated[
+         Optional[List[BaseScorer]], Field(title="Custom Scorers")
+     ] = []
+     judgment_scorers: Annotated[
+         Optional[List[ScorerConfig]], Field(title="Judgment Scorers")
+     ] = []
+     model: Annotated[Optional[str], Field(title="Model")] = None
+     created_at: Annotated[Optional[str], Field(title="Created At")] = None
+     trace_and_span_ids: Annotated[
+         List[TraceAndSpanId], Field(title="Trace And Span Ids")
+     ]
+     is_offline: Annotated[Optional[bool], Field(title="Is Offline")] = False
+     is_bucket_run: Annotated[Optional[bool], Field(title="Is Bucket Run")] = False
+
+
+ class DatasetInsertExamples(BaseModel):
+     dataset_name: Annotated[str, Field(title="Dataset Name")]
+     examples: Annotated[List[Example], Field(title="Examples")]
+     project_name: Annotated[str, Field(title="Project Name")]
+
+
+ class DatasetInfo(BaseModel):
+     dataset_id: Annotated[str, Field(title="Dataset Id")]
+     name: Annotated[str, Field(title="Name")]
+     created_at: Annotated[str, Field(title="Created At")]
+     kind: DatasetKind
+     entries: Annotated[int, Field(title="Entries")]
+     creator: Annotated[str, Field(title="Creator")]
+
+
+ class DatasetCreate(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     dataset_kind: DatasetKind
+     project_name: Annotated[str, Field(title="Project Name")]
+     examples: Annotated[List[Example], Field(title="Examples")]
+     overwrite: Annotated[bool, Field(title="Overwrite")]
+
+
+ class SavePromptScorerResponse(BaseModel):
+     scorer_response: PromptScorer
+
+
+ class FetchPromptScorersResponse(BaseModel):
+     scorers: Annotated[List[PromptScorer], Field(title="Scorers")]
+
+
+ class PromptFetchResponse(BaseModel):
+     commit: Optional[PromptCommitInfo] = None
+
+
+ class PromptVersionsResponse(BaseModel):
+     versions: Annotated[List[PromptCommitInfo], Field(title="Versions")]
+
+
+ class ScoringResult(BaseModel):
+     success: Annotated[bool, Field(title="Success")]
+     scorers_data: Annotated[List[ScorerData], Field(title="Scorers Data")]
+     name: Annotated[Optional[str], Field(title="Name")] = None
+     data_object: Annotated[
+         Optional[Union[OtelTraceSpan, Example]], Field(title="Data Object")
+     ] = None
+     trace_id: Annotated[Optional[str], Field(title="Trace Id")] = None
+     run_duration: Annotated[Optional[float], Field(title="Run Duration")] = None
+     evaluation_cost: Annotated[Optional[float], Field(title="Evaluation Cost")] = None
+
+
+ class OtelTraceListItem(BaseModel):
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     project_id: Annotated[str, Field(title="Project Id")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+     created_at: Annotated[AwareDatetime, Field(title="Created At")]
+     duration: Annotated[Optional[int], Field(title="Duration")] = None
+     tags: Annotated[Optional[List[str]], Field(title="Tags")] = None
+     experiment_run_id: Annotated[Optional[str], Field(title="Experiment Run Id")] = None
+     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+     llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+     error: Annotated[Optional[str], Field(title="Error")] = ""
+     scores: Annotated[
+         Optional[List[OtelSpanListItemScores]], Field(title="Scores")
+     ] = []
+     rules_invoked: Annotated[Optional[List[str]], Field(title="Rules Invoked")] = []
+     customer_id: Annotated[Optional[str], Field(title="Customer Id")] = None
+     input: Annotated[Optional[str], Field(title="Input")] = None
+     output: Annotated[Optional[str], Field(title="Output")] = None
+     input_preview: Annotated[Optional[str], Field(title="Input Preview")] = None
+     output_preview: Annotated[Optional[str], Field(title="Output Preview")] = None
+     annotation_count: Annotated[Optional[int], Field(title="Annotation Count")] = 0
+     span_id: Annotated[str, Field(title="Span Id")]
+     rule_id: Annotated[Optional[str], Field(title="Rule Id")] = None
+
+
+ class OtelSpanDetail(BaseModel):
+     organization_id: Annotated[str, Field(title="Organization Id")]
+     project_id: Annotated[str, Field(title="Project Id")]
+     timestamp: Annotated[AwareDatetime, Field(title="Timestamp")]
+     trace_id: Annotated[str, Field(title="Trace Id")]
+     span_id: Annotated[str, Field(title="Span Id")]
+     parent_span_id: Annotated[Optional[str], Field(title="Parent Span Id")] = None
+     trace_state: Annotated[Optional[str], Field(title="Trace State")] = None
+     span_name: Annotated[Optional[str], Field(title="Span Name")] = None
+     span_kind: Annotated[Optional[str], Field(title="Span Kind")] = None
+     service_name: Annotated[Optional[str], Field(title="Service Name")] = None
+     resource_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Resource Attributes")
+     ] = None
+     span_attributes: Annotated[
+         Optional[Dict[str, Any]], Field(title="Span Attributes")
+     ] = None
+     duration: Annotated[Optional[int], Field(title="Duration")] = None
+     status_code: Annotated[Optional[int], Field(title="Status Code")] = None
+     status_message: Annotated[Optional[str], Field(title="Status Message")] = None
+     events: Annotated[Optional[List[Dict[str, Any]]], Field(title="Events")] = None
+     links: Annotated[
+         Optional[Union[List[Dict[str, Any]], Dict[str, Any]]], Field(title="Links")
+     ] = None
+     llm_cost: Annotated[Optional[float], Field(title="Llm Cost")] = None
+     prompt_tokens: Annotated[Optional[int], Field(title="Prompt Tokens")] = None
+     completion_tokens: Annotated[Optional[int], Field(title="Completion Tokens")] = None
+     scores: Annotated[Optional[List[OtelSpanDetailScores]], Field(title="Scores")] = (
+         None
+     )
+
+
+ class EvaluateResponse(BaseModel):
+     status: Annotated[str, Field(title="Status")]
+     results: Annotated[List[ScoringResult], Field(title="Results")]
+     resource_usage: Optional[UsageInfo] = None
+
+
+ class EvalResults(BaseModel):
+     results: Annotated[List[ScoringResult], Field(title="Results")]
+     run: Annotated[Union[ExampleEvaluationRun, TraceEvaluationRun], Field(title="Run")]
+
+
+ class DatasetTraceWithSpans(BaseModel):
+     dataset_id: Annotated[str, Field(title="Dataset Id")]
+     trace_detail: OtelTraceListItem
+     spans: Annotated[List[OtelSpanDetail], Field(title="Spans")]
+
+
+ class DatasetReturn(BaseModel):
+     name: Annotated[str, Field(title="Name")]
+     project_name: Annotated[str, Field(title="Project Name")]
+     dataset_kind: DatasetKind
+     examples: Annotated[Optional[List[Example]], Field(title="Examples")] = None
+     traces: Annotated[Optional[List[DatasetTraceWithSpans]], Field(title="Traces")] = (
+         None
+     )
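
These generated classes are ordinary pydantic v2 models, so API payloads can be validated straight into them. A small sketch with an illustrative payload; the field names match the ScorerData and ScoringResult models defined above:

# Sketch: the payload values are made up, but the shape follows the
# ScorerData / ScoringResult definitions in judgment_types.py.
from judgeval.data.judgment_types import ScoringResult

result = ScoringResult.model_validate(
    {
        "success": True,
        "scorers_data": [
            {"name": "faithfulness", "threshold": 0.7, "success": True, "score": 0.92}
        ],
    }
)
print(result.scorers_data[0].score)  # 0.92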