judgeval 0.7.1__py3-none-any.whl → 0.9.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (96)
  1. judgeval/__init__.py +139 -12
  2. judgeval/api/__init__.py +501 -0
  3. judgeval/api/api_types.py +344 -0
  4. judgeval/cli.py +2 -4
  5. judgeval/constants.py +10 -26
  6. judgeval/data/evaluation_run.py +49 -26
  7. judgeval/data/example.py +2 -2
  8. judgeval/data/judgment_types.py +266 -82
  9. judgeval/data/result.py +4 -5
  10. judgeval/data/scorer_data.py +4 -2
  11. judgeval/data/tool.py +2 -2
  12. judgeval/data/trace.py +7 -50
  13. judgeval/data/trace_run.py +7 -4
  14. judgeval/{dataset.py → dataset/__init__.py} +43 -28
  15. judgeval/env.py +67 -0
  16. judgeval/{run_evaluation.py → evaluation/__init__.py} +29 -95
  17. judgeval/exceptions.py +27 -0
  18. judgeval/integrations/langgraph/__init__.py +788 -0
  19. judgeval/judges/__init__.py +2 -2
  20. judgeval/judges/litellm_judge.py +75 -15
  21. judgeval/judges/together_judge.py +86 -18
  22. judgeval/judges/utils.py +7 -21
  23. judgeval/{common/logger.py → logger.py} +8 -6
  24. judgeval/scorers/__init__.py +0 -4
  25. judgeval/scorers/agent_scorer.py +3 -7
  26. judgeval/scorers/api_scorer.py +8 -13
  27. judgeval/scorers/base_scorer.py +52 -32
  28. judgeval/scorers/example_scorer.py +1 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +0 -14
  30. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +45 -20
  31. judgeval/scorers/judgeval_scorers/api_scorers/tool_dependency.py +2 -2
  32. judgeval/scorers/judgeval_scorers/api_scorers/tool_order.py +3 -3
  33. judgeval/scorers/score.py +21 -31
  34. judgeval/scorers/trace_api_scorer.py +5 -0
  35. judgeval/scorers/utils.py +1 -103
  36. judgeval/tracer/__init__.py +1075 -2
  37. judgeval/tracer/constants.py +1 -0
  38. judgeval/tracer/exporters/__init__.py +37 -0
  39. judgeval/tracer/exporters/s3.py +119 -0
  40. judgeval/tracer/exporters/store.py +43 -0
  41. judgeval/tracer/exporters/utils.py +32 -0
  42. judgeval/tracer/keys.py +67 -0
  43. judgeval/tracer/llm/__init__.py +1233 -0
  44. judgeval/{common/tracer → tracer/llm}/providers.py +5 -10
  45. judgeval/{local_eval_queue.py → tracer/local_eval_queue.py} +15 -10
  46. judgeval/tracer/managers.py +188 -0
  47. judgeval/tracer/processors/__init__.py +181 -0
  48. judgeval/tracer/utils.py +20 -0
  49. judgeval/trainer/__init__.py +5 -0
  50. judgeval/{common/trainer → trainer}/config.py +12 -9
  51. judgeval/{common/trainer → trainer}/console.py +2 -9
  52. judgeval/{common/trainer → trainer}/trainable_model.py +12 -7
  53. judgeval/{common/trainer → trainer}/trainer.py +119 -17
  54. judgeval/utils/async_utils.py +2 -3
  55. judgeval/utils/decorators.py +24 -0
  56. judgeval/utils/file_utils.py +37 -4
  57. judgeval/utils/guards.py +32 -0
  58. judgeval/utils/meta.py +14 -0
  59. judgeval/{common/api/json_encoder.py → utils/serialize.py} +7 -1
  60. judgeval/utils/testing.py +88 -0
  61. judgeval/utils/url.py +10 -0
  62. judgeval/{version_check.py → utils/version_check.py} +3 -3
  63. judgeval/version.py +5 -0
  64. judgeval/warnings.py +4 -0
  65. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/METADATA +12 -14
  66. judgeval-0.9.0.dist-info/RECORD +80 -0
  67. judgeval/clients.py +0 -35
  68. judgeval/common/__init__.py +0 -13
  69. judgeval/common/api/__init__.py +0 -3
  70. judgeval/common/api/api.py +0 -375
  71. judgeval/common/api/constants.py +0 -186
  72. judgeval/common/exceptions.py +0 -27
  73. judgeval/common/storage/__init__.py +0 -6
  74. judgeval/common/storage/s3_storage.py +0 -97
  75. judgeval/common/tracer/__init__.py +0 -31
  76. judgeval/common/tracer/constants.py +0 -22
  77. judgeval/common/tracer/core.py +0 -2427
  78. judgeval/common/tracer/otel_exporter.py +0 -108
  79. judgeval/common/tracer/otel_span_processor.py +0 -188
  80. judgeval/common/tracer/span_processor.py +0 -37
  81. judgeval/common/tracer/span_transformer.py +0 -207
  82. judgeval/common/tracer/trace_manager.py +0 -101
  83. judgeval/common/trainer/__init__.py +0 -5
  84. judgeval/common/utils.py +0 -948
  85. judgeval/integrations/langgraph.py +0 -844
  86. judgeval/judges/mixture_of_judges.py +0 -287
  87. judgeval/judgment_client.py +0 -267
  88. judgeval/rules.py +0 -521
  89. judgeval/scorers/judgeval_scorers/api_scorers/execution_order.py +0 -52
  90. judgeval/scorers/judgeval_scorers/api_scorers/hallucination.py +0 -28
  91. judgeval/utils/alerts.py +0 -93
  92. judgeval/utils/requests.py +0 -50
  93. judgeval-0.7.1.dist-info/RECORD +0 -82
  94. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/WHEEL +0 -0
  95. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/entry_points.txt +0 -0
  96. {judgeval-0.7.1.dist-info → judgeval-0.9.0.dist-info}/licenses/LICENSE.md +0 -0
@@ -1,186 +0,0 @@
1
- import os
2
- from typing import Optional, TypedDict, List, Dict, Any
3
-
4
- ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
5
-
6
- # Traces API
7
- JUDGMENT_TRACES_FETCH_API_URL = f"{ROOT_API}/traces/fetch/"
8
- JUDGMENT_TRACES_SAVE_API_URL = f"{ROOT_API}/traces/save/"
9
- JUDGMENT_TRACES_UPSERT_API_URL = f"{ROOT_API}/traces/upsert/"
10
- JUDGMENT_TRACES_DELETE_API_URL = f"{ROOT_API}/traces/delete/"
11
- JUDGMENT_TRACES_SPANS_BATCH_API_URL = f"{ROOT_API}/traces/spans/batch/"
12
- JUDGMENT_TRACES_EVALUATION_RUNS_BATCH_API_URL = (
13
- f"{ROOT_API}/traces/evaluation_runs/batch/"
14
- )
15
-
16
-
17
- class TraceFetchPayload(TypedDict):
18
- trace_id: str
19
-
20
-
21
- class TraceDeletePayload(TypedDict):
22
- trace_ids: List[str]
23
-
24
-
25
- class SpansBatchPayload(TypedDict):
26
- spans: List[Dict[str, Any]]
27
- organization_id: str
28
-
29
-
30
- class EvaluationEntryResponse(TypedDict):
31
- evaluation_run: Dict[str, Any]
32
- associated_span: Dict[str, Any]
33
- queued_at: Optional[float]
34
-
35
-
36
- class EvaluationRunsBatchPayload(TypedDict):
37
- organization_id: str
38
- evaluation_entries: List[EvaluationEntryResponse]
39
-
40
-
41
- # Evaluation API
42
- JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
43
- JUDGMENT_TRACE_EVAL_API_URL = f"{ROOT_API}/evaluate_trace/"
44
- JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
45
- JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_experiment_run/"
46
- JUDGMENT_EVAL_DELETE_API_URL = (
47
- f"{ROOT_API}/delete_eval_results_by_project_and_run_names/"
48
- )
49
- JUDGMENT_EVAL_DELETE_PROJECT_API_URL = f"{ROOT_API}/delete_eval_results_by_project/"
50
- JUDGMENT_ADD_TO_RUN_EVAL_QUEUE_API_URL = f"{ROOT_API}/add_to_run_eval_queue/"
51
- JUDGMENT_GET_EVAL_STATUS_API_URL = f"{ROOT_API}/get_evaluation_status/"
52
-
53
- # Custom Scorers API
54
- JUDGMENT_CUSTOM_SCORER_UPLOAD_API_URL = f"{ROOT_API}/upload_scorer/"
55
-
56
-
57
- # Evaluation API Payloads
58
- class EvalRunRequestBody(TypedDict):
59
- eval_name: str
60
- project_name: str
61
- judgment_api_key: str
62
-
63
-
64
- class DeleteEvalRunRequestBody(TypedDict):
65
- eval_names: List[str]
66
- project_name: str
67
- judgment_api_key: str
68
-
69
-
70
- class EvalLogPayload(TypedDict):
71
- results: List[Dict[str, Any]]
72
- run: Dict[str, Any]
73
-
74
-
75
- class EvalStatusPayload(TypedDict):
76
- experiment_run_id: str
77
- judgment_api_key: str
78
- project_name: str
79
-
80
-
81
- class CheckExperimentTypePayload(TypedDict):
82
- eval_name: str
83
- project_name: str
84
- judgment_api_key: str
85
- is_trace: bool
86
-
87
-
88
- class EvalRunNameExistsPayload(TypedDict):
89
- eval_name: str
90
- project_name: str
91
- judgment_api_key: str
92
-
93
-
94
- class CheckExampleKeysPayload(TypedDict):
95
- keys: List[str]
96
- eval_name: str
97
- project_name: str
98
-
99
-
100
- # Datasets API
101
- JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
102
- JUDGMENT_DATASETS_APPEND_EXAMPLES_API_URL = f"{ROOT_API}/datasets/insert_examples/"
103
- JUDGMENT_DATASETS_APPEND_TRACES_API_URL = f"{ROOT_API}/traces/add_to_dataset/"
104
- JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull_for_judgeval/"
105
- JUDGMENT_DATASETS_DELETE_API_URL = f"{ROOT_API}/datasets/delete/"
106
- JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
107
- JUDGMENT_DATASETS_PROJECT_STATS_API_URL = f"{ROOT_API}/datasets/fetch_stats_by_project/"
108
- JUDGMENT_DATASETS_INSERT_API_URL = f"{ROOT_API}/datasets/insert_examples/"
109
-
110
-
111
- class DatasetPushPayload(TypedDict):
112
- dataset_alias: str
113
- project_name: str
114
- examples: List[Dict[str, Any]]
115
- traces: List[Dict[str, Any]]
116
- overwrite: bool
117
-
118
-
119
- class DatasetAppendPayload(TypedDict):
120
- dataset_alias: str
121
- project_name: str
122
- examples: List[Dict[str, Any]]
123
-
124
-
125
- class DatasetPullPayload(TypedDict):
126
- dataset_alias: str
127
- project_name: str
128
-
129
-
130
- class DatasetDeletePayload(TypedDict):
131
- dataset_alias: str
132
- project_name: str
133
-
134
-
135
- class DatasetExportPayload(TypedDict):
136
- dataset_alias: str
137
- project_name: str
138
-
139
-
140
- class DatasetStatsPayload(TypedDict):
141
- project_name: str
142
-
143
-
144
- # Projects API
145
- JUDGMENT_PROJECT_DELETE_API_URL = f"{ROOT_API}/projects/delete_from_judgeval/"
146
- JUDGMENT_PROJECT_CREATE_API_URL = f"{ROOT_API}/projects/add/"
147
-
148
-
149
- class ProjectDeletePayload(TypedDict):
150
- project_list: List[str]
151
-
152
-
153
- class ProjectCreatePayload(TypedDict):
154
- project_name: str
155
-
156
-
157
- JUDGMENT_SCORER_SAVE_API_URL = f"{ROOT_API}/save_scorer/"
158
- JUDGMENT_SCORER_FETCH_API_URL = f"{ROOT_API}/fetch_scorer/"
159
- JUDGMENT_SCORER_EXISTS_API_URL = f"{ROOT_API}/scorer_exists/"
160
-
161
-
162
- class ScorerSavePayload(TypedDict):
163
- name: str
164
- prompt: str
165
- threshold: float
166
- options: Optional[dict]
167
-
168
-
169
- class ScorerFetchPayload(TypedDict):
170
- name: str
171
-
172
-
173
- class ScorerExistsPayload(TypedDict):
174
- name: str
175
-
176
-
177
- class CustomScorerUploadPayload(TypedDict):
178
- scorer_name: str
179
- scorer_code: str
180
- requirements_text: str
181
-
182
-
183
- class CustomScorerTemplateResponse(TypedDict):
184
- scorer_name: str
185
- status: str
186
- message: str
@@ -1,27 +0,0 @@
1
- """
2
- Common Exceptions in Judgeval
3
- """
4
-
5
-
6
- class MissingTestCaseParamsError(Exception):
7
- pass
8
-
9
-
10
- class JudgmentAPIError(Exception):
11
- """
12
- Exception raised when an error occurs while executing a Judgment API request
13
- """
14
-
15
- def __init__(self, message: str):
16
- super().__init__(message)
17
- self.message = message
18
-
19
-
20
- class InvalidJudgeModelError(Exception):
21
- """
22
- Exception raised when an invalid judge model is provided
23
- """
24
-
25
- def __init__(self, message: str):
26
- super().__init__(message)
27
- self.message = message
@@ -1,6 +0,0 @@
1
- from judgeval.common.storage.s3_storage import S3Storage
2
-
3
-
4
- __all__ = [
5
- "S3Storage",
6
- ]
@@ -1,97 +0,0 @@
1
- import os
2
- import boto3
3
- import orjson
4
- from typing import Optional
5
- from datetime import datetime, UTC
6
- from botocore.exceptions import ClientError
7
- from judgeval.common.logger import judgeval_logger
8
-
9
-
10
- class S3Storage:
11
- """Utility class for storing and retrieving trace data from S3."""
12
-
13
- def __init__(
14
- self,
15
- bucket_name: str,
16
- aws_access_key_id: Optional[str] = None,
17
- aws_secret_access_key: Optional[str] = None,
18
- region_name: Optional[str] = None,
19
- ):
20
- """Initialize S3 storage with credentials and bucket name.
21
-
22
- Args:
23
- bucket_name: Name of the S3 bucket to store traces in
24
- aws_access_key_id: AWS access key ID (optional, will use environment variables if not provided)
25
- aws_secret_access_key: AWS secret access key (optional, will use environment variables if not provided)
26
- region_name: AWS region name (optional, will use environment variables if not provided)
27
- """
28
- self.bucket_name = bucket_name
29
- self.s3_client = boto3.client(
30
- "s3",
31
- aws_access_key_id=aws_access_key_id or os.getenv("AWS_ACCESS_KEY_ID"),
32
- aws_secret_access_key=aws_secret_access_key
33
- or os.getenv("AWS_SECRET_ACCESS_KEY"),
34
- region_name=region_name or os.getenv("AWS_REGION", "us-west-1"),
35
- )
36
-
37
- def _ensure_bucket_exists(self):
38
- """Ensure the S3 bucket exists, creating it if necessary."""
39
- try:
40
- self.s3_client.head_bucket(Bucket=self.bucket_name)
41
- except ClientError as e:
42
- error_code = e.response["Error"]["Code"]
43
- if error_code == "404":
44
- # Bucket doesn't exist, create it
45
- try:
46
- self.s3_client.create_bucket(
47
- Bucket=self.bucket_name,
48
- CreateBucketConfiguration={
49
- "LocationConstraint": self.s3_client.meta.region_name
50
- },
51
- ) if self.s3_client.meta.region_name != "us-east-1" else self.s3_client.create_bucket(
52
- Bucket=self.bucket_name
53
- )
54
- except ClientError as create_error:
55
- if (
56
- create_error.response["Error"]["Code"]
57
- == "BucketAlreadyOwnedByYou"
58
- ):
59
- # Bucket was just created by another process
60
- judgeval_logger.warning(
61
- f"Bucket {self.bucket_name} was just created by another process"
62
- )
63
- pass
64
- else:
65
- raise create_error
66
- else:
67
- # Some other error occurred
68
- raise e
69
-
70
- def save_trace(self, trace_data: dict, trace_id: str, project_name: str) -> str:
71
- """Save trace data to S3.
72
-
73
- Args:
74
- trace_data: The trace data to save
75
- trace_id: Unique identifier for the trace
76
- project_name: Name of the project the trace belongs to
77
-
78
- Returns:
79
- str: S3 key where the trace was saved
80
- """
81
- # Ensure bucket exists before saving
82
- self._ensure_bucket_exists()
83
-
84
- # Create a timestamped key for the trace
85
- timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
86
- s3_key = f"traces/{project_name}/{trace_id}_{timestamp}.json"
87
-
88
- trace_json = orjson.dumps(trace_data).decode("utf-8")
89
-
90
- self.s3_client.put_object(
91
- Bucket=self.bucket_name,
92
- Key=s3_key,
93
- Body=trace_json,
94
- ContentType="application/json",
95
- )
96
-
97
- return s3_key
@@ -1,31 +0,0 @@
1
- from judgeval.common.tracer.core import (
2
- TraceClient,
3
- _DeepTracer,
4
- Tracer,
5
- wrap,
6
- current_span_var,
7
- current_trace_var,
8
- SpanType,
9
- cost_per_token,
10
- )
11
- from judgeval.common.tracer.otel_exporter import JudgmentAPISpanExporter
12
- from judgeval.common.tracer.otel_span_processor import JudgmentSpanProcessor
13
- from judgeval.common.tracer.span_processor import SpanProcessorBase
14
- from judgeval.common.tracer.trace_manager import TraceManagerClient
15
- from judgeval.data import TraceSpan
16
-
17
- __all__ = [
18
- "_DeepTracer",
19
- "TraceClient",
20
- "Tracer",
21
- "wrap",
22
- "current_span_var",
23
- "current_trace_var",
24
- "TraceManagerClient",
25
- "JudgmentAPISpanExporter",
26
- "JudgmentSpanProcessor",
27
- "SpanProcessorBase",
28
- "SpanType",
29
- "cost_per_token",
30
- "TraceSpan",
31
- ]
@@ -1,22 +0,0 @@
1
- import os
2
- import site
3
- import sysconfig
4
-
5
-
6
- # NOTE: This builds once, can be tweaked if we are missing / capturing other unnecessary modules
7
- # @link https://docs.python.org/3.13/library/sysconfig.html
8
- _TRACE_FILEPATH_BLOCKLIST = tuple(
9
- os.path.realpath(p) + os.sep
10
- for p in {
11
- sysconfig.get_paths()["stdlib"],
12
- sysconfig.get_paths().get("platstdlib", ""),
13
- *site.getsitepackages(),
14
- site.getusersitepackages(),
15
- *(
16
- [os.path.join(os.path.dirname(__file__), "../../judgeval/")]
17
- if os.environ.get("JUDGMENT_DEV")
18
- else []
19
- ),
20
- }
21
- if p
22
- )