judgeval 0.0.54__py3-none-any.whl → 0.1.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (43)
  1. judgeval/common/api/__init__.py +3 -0
  2. judgeval/common/api/api.py +352 -0
  3. judgeval/common/api/constants.py +165 -0
  4. judgeval/common/storage/__init__.py +6 -0
  5. judgeval/common/tracer/__init__.py +31 -0
  6. judgeval/common/tracer/constants.py +22 -0
  7. judgeval/common/tracer/core.py +1916 -0
  8. judgeval/common/tracer/otel_exporter.py +108 -0
  9. judgeval/common/tracer/otel_span_processor.py +234 -0
  10. judgeval/common/tracer/span_processor.py +37 -0
  11. judgeval/common/tracer/span_transformer.py +211 -0
  12. judgeval/common/tracer/trace_manager.py +92 -0
  13. judgeval/common/utils.py +2 -2
  14. judgeval/constants.py +3 -30
  15. judgeval/data/datasets/eval_dataset_client.py +29 -156
  16. judgeval/data/judgment_types.py +4 -12
  17. judgeval/data/result.py +1 -1
  18. judgeval/data/scorer_data.py +2 -2
  19. judgeval/data/scripts/openapi_transform.py +1 -1
  20. judgeval/data/trace.py +66 -1
  21. judgeval/data/trace_run.py +0 -3
  22. judgeval/evaluation_run.py +0 -2
  23. judgeval/integrations/langgraph.py +43 -164
  24. judgeval/judgment_client.py +17 -211
  25. judgeval/run_evaluation.py +209 -611
  26. judgeval/scorers/__init__.py +2 -6
  27. judgeval/scorers/base_scorer.py +4 -23
  28. judgeval/scorers/judgeval_scorers/api_scorers/__init__.py +3 -3
  29. judgeval/scorers/judgeval_scorers/api_scorers/prompt_scorer.py +215 -0
  30. judgeval/scorers/score.py +2 -1
  31. judgeval/scorers/utils.py +1 -13
  32. judgeval/utils/requests.py +21 -0
  33. judgeval-0.1.0.dist-info/METADATA +202 -0
  34. {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/RECORD +37 -29
  35. judgeval/common/tracer.py +0 -3215
  36. judgeval/scorers/judgeval_scorers/api_scorers/classifier_scorer.py +0 -73
  37. judgeval/scorers/judgeval_scorers/classifiers/__init__.py +0 -3
  38. judgeval/scorers/judgeval_scorers/classifiers/text2sql/__init__.py +0 -3
  39. judgeval/scorers/judgeval_scorers/classifiers/text2sql/text2sql_scorer.py +0 -53
  40. judgeval-0.0.54.dist-info/METADATA +0 -1384
  41. /judgeval/common/{s3_storage.py → storage/s3_storage.py} +0 -0
  42. {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/WHEEL +0 -0
  43. {judgeval-0.0.54.dist-info → judgeval-0.1.0.dist-info}/licenses/LICENSE.md +0 -0
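
The bulk of the diff below is judgeval/judgment_client.py: the hand-rolled requests calls, the ClassifierScorer fetch/push helpers, and the async execution path (a_run_evaluation, async_execution, SpinnerWrappedTask) are removed, and all HTTP traffic is routed through the new JudgmentApiClient. A minimal caller-side sketch of the resulting 0.1.0 surface, inferred only from the signatures visible in this diff (Example fields, scorer construction, and credential handling are assumptions, not shown here):

    # Hypothetical usage sketch for judgeval 0.1.0, based only on the signatures in this diff.
    from judgeval.judgment_client import JudgmentClient
    from judgeval.data import Example

    client = JudgmentClient()  # assumed to pick up the API key / org id as in 0.0.54
    client.create_project("demo_project")  # now delegates to JudgmentApiClient and returns True

    my_scorers = []  # APIScorerConfig / BaseScorer instances; construction is not shown in this diff
    results = client.run_evaluation(
        examples=[Example(input="What is 2+2?", actual_output="4")],  # field names assumed
        scorers=my_scorers,
        model="gpt-4.1",
        project_name="demo_project",
        eval_run_name="demo_run",
    )
    # run_evaluation now always returns List[ScoringResult];
    # the async_execution flag and a_run_evaluation are gone in 0.1.0.
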
@@ -5,11 +5,7 @@ Implements the JudgmentClient to interact with the Judgment API.
 import os
 from uuid import uuid4
 from typing import Optional, List, Dict, Any, Union, Callable
-from requests import codes
-from judgeval.utils.requests import requests
-import asyncio
 
-from judgeval.constants import ROOT_API
 from judgeval.data.datasets import EvalDataset, EvalDatasetClient
 from judgeval.data import (
     ScoringResult,
@@ -19,40 +15,31 @@ from judgeval.data import (
 from judgeval.scorers import (
     APIScorerConfig,
     BaseScorer,
-    ClassifierScorer,
 )
 from judgeval.evaluation_run import EvaluationRun
 from judgeval.run_evaluation import (
     run_eval,
     assert_test,
     run_trace_eval,
-    safe_run_async,
 )
 from judgeval.data.trace_run import TraceRun
-from judgeval.constants import (
-    JUDGMENT_EVAL_FETCH_API_URL,
-    JUDGMENT_PROJECT_DELETE_API_URL,
-    JUDGMENT_PROJECT_CREATE_API_URL,
-)
+from judgeval.common.api import JudgmentApiClient
 from judgeval.common.exceptions import JudgmentAPIError
 from langchain_core.callbacks import BaseCallbackHandler
 from judgeval.common.tracer import Tracer
 from judgeval.common.utils import validate_api_key
 from pydantic import BaseModel
-from judgeval.run_evaluation import SpinnerWrappedTask
 from judgeval.common.logger import judgeval_logger
 
 
 class EvalRunRequestBody(BaseModel):
     eval_name: str
     project_name: str
-    judgment_api_key: str
 
 
 class DeleteEvalRunRequestBody(BaseModel):
     eval_names: List[str]
     project_name: str
-    judgment_api_key: str
 
 
 class SingletonMeta(type):
@@ -83,6 +70,7 @@ class JudgmentClient(metaclass=SingletonMeta):
 
         self.judgment_api_key = api_key
         self.organization_id = organization_id
+        self.api_client = JudgmentApiClient(api_key, organization_id)
         self.eval_dataset_client = EvalDatasetClient(api_key, organization_id)
 
         # Verify API key is valid
@@ -93,29 +81,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         else:
             judgeval_logger.info("Successfully initialized JudgmentClient!")
 
-    def a_run_evaluation(
-        self,
-        examples: List[Example],
-        scorers: List[Union[APIScorerConfig, BaseScorer]],
-        model: Optional[str] = "gpt-4.1",
-        project_name: str = "default_project",
-        eval_run_name: str = "default_eval_run",
-        override: bool = False,
-        append: bool = False,
-    ) -> List[ScoringResult]:
-        result = self.run_evaluation(
-            examples=examples,
-            scorers=scorers,
-            model=model,
-            project_name=project_name,
-            eval_run_name=eval_run_name,
-            override=override,
-            append=append,
-            async_execution=True,
-        )
-        assert not isinstance(result, (asyncio.Task, SpinnerWrappedTask))
-        return result
-
     def run_trace_evaluation(
         self,
         scorers: List[Union[APIScorerConfig, BaseScorer]],
@@ -147,11 +112,12 @@ class JudgmentClient(metaclass=SingletonMeta):
                 scorers=scorers,
                 model=model,
                 append=append,
-                judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
                 tools=tools,
             )
-            return run_trace_eval(trace_run, override, function, tracer, examples)
+            return run_trace_eval(
+                trace_run, self.judgment_api_key, override, function, tracer, examples
+            )
         except ValueError as e:
             raise ValueError(
                 f"Please check your TraceRun object, one or more fields are invalid: \n{str(e)}"
@@ -168,8 +134,7 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = "default_eval_run",
         override: bool = False,
         append: bool = False,
-        async_execution: bool = False,
-    ) -> Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]:
+    ) -> List[ScoringResult]:
         """
         Executes an evaluation of `Example`s using one or more `Scorer`s
 
@@ -181,7 +146,6 @@ class JudgmentClient(metaclass=SingletonMeta):
             eval_run_name (str): A name for this evaluation run
             override (bool): Whether to override an existing evaluation run with the same name
             append (bool): Whether to append to an existing evaluation run with the same name
-            async_execution (bool): Whether to execute the evaluation asynchronously
 
         Returns:
             List[ScoringResult]: The results of the evaluation
@@ -194,18 +158,18 @@ class JudgmentClient(metaclass=SingletonMeta):
         try:
             eval = EvaluationRun(
                 append=append,
+                override=override,
                 project_name=project_name,
                 eval_name=eval_run_name,
                 examples=examples,
                 scorers=scorers,
                 model=model,
-                judgment_api_key=self.judgment_api_key,
                 organization_id=self.organization_id,
             )
             return run_eval(
                 eval,
+                self.judgment_api_key,
                 override,
-                async_execution=async_execution,
             )
         except ValueError as e:
             raise ValueError(
@@ -292,158 +256,21 @@ class JudgmentClient(metaclass=SingletonMeta):
             - id (str): The evaluation run ID
             - results (List[ScoringResult]): List of scoring results
         """
-        eval_run_request_body = EvalRunRequestBody(
-            project_name=project_name,
-            eval_name=eval_run_name,
-            judgment_api_key=self.judgment_api_key,
-        )
-        eval_run = requests.post(
-            JUDGMENT_EVAL_FETCH_API_URL,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            json=eval_run_request_body.model_dump(),
-            verify=True,
-        )
-        if eval_run.status_code != codes.ok:
-            raise ValueError(f"Error fetching eval results: {eval_run.json()}")
-
-        return eval_run.json()
+        return self.api_client.fetch_evaluation_results(project_name, eval_run_name)
 
     def create_project(self, project_name: str) -> bool:
         """
         Creates a project on the server.
         """
-        response = requests.post(
-            JUDGMENT_PROJECT_CREATE_API_URL,
-            json={
-                "project_name": project_name,
-            },
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-        )
-        if response.status_code != codes.ok:
-            raise ValueError(f"Error creating project: {response.json()}")
-        return response.json()
+        self.api_client.create_project(project_name)
+        return True
 
     def delete_project(self, project_name: str) -> bool:
         """
         Deletes a project from the server. Which also deletes all evaluations and traces associated with the project.
         """
-        response = requests.delete(
-            JUDGMENT_PROJECT_DELETE_API_URL,
-            json={
-                "project_name": project_name,
-            },
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-        )
-        if response.status_code != codes.ok:
-            raise ValueError(f"Error deleting project: {response.json()}")
-        return response.json()
-
-    def fetch_classifier_scorer(self, slug: str) -> ClassifierScorer:
-        """
-        Fetches a classifier scorer configuration from the Judgment API.
-
-        Args:
-            slug (str): Slug identifier of the custom scorer to fetch
-
-        Returns:
-            ClassifierScorer: The configured classifier scorer object
-
-        Raises:
-            JudgmentAPIError: If the scorer cannot be fetched or doesn't exist
-        """
-        request_body = {
-            "slug": slug,
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/fetch_scorer/",
-            json=request_body,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(
-                f"The server is temporarily unavailable. Please try your request again in a few moments. Error details: {response.json().get('detail', '')}"
-            )
-        elif response.status_code != 200:
-            raise JudgmentAPIError(
-                f"Failed to fetch classifier scorer '{slug}': {response.json().get('detail', '')}"
-            )
-
-        scorer_config = response.json()
-        scorer_config.pop("created_at")
-        scorer_config.pop("updated_at")
-
-        try:
-            return ClassifierScorer(**scorer_config)
-        except Exception as e:
-            raise JudgmentAPIError(
-                f"Failed to create classifier scorer '{slug}' with config {scorer_config}: {str(e)}"
-            )
-
-    def push_classifier_scorer(
-        self, scorer: ClassifierScorer, slug: str | None = None
-    ) -> str:
-        """
-        Pushes a classifier scorer configuration to the Judgment API.
-
-        Args:
-            slug (str): Slug identifier for the scorer. If it exists, the scorer will be updated.
-            scorer (ClassifierScorer): The classifier scorer to save
-
-        Returns:
-            str: The slug identifier of the saved scorer
-
-        Raises:
-            JudgmentAPIError: If there's an error saving the scorer
-        """
-        request_body = {
-            "name": scorer.name,
-            "conversation": scorer.conversation,
-            "options": scorer.options,
-            "slug": slug,
-        }
-
-        response = requests.post(
-            f"{ROOT_API}/save_scorer/",
-            json=request_body,
-            headers={
-                "Content-Type": "application/json",
-                "Authorization": f"Bearer {self.judgment_api_key}",
-                "X-Organization-Id": self.organization_id,
-            },
-            verify=True,
-        )
-
-        if response.status_code == 500:
-            raise JudgmentAPIError(
-                f"The server is temporarily unavailable. \
-                Please try your request again in a few moments. \
-                Error details: {response.json().get('detail', '')}"
-            )
-        elif response.status_code != 200:
-            raise JudgmentAPIError(
-                f"Failed to save classifier scorer: {response.json().get('detail', '')}"
-            )
-
-        return response.json()["slug"]
+        self.api_client.delete_project(project_name)
+        return True
 
     def assert_test(
         self,
@@ -454,7 +281,6 @@ class JudgmentClient(metaclass=SingletonMeta):
         eval_run_name: str = str(uuid4()),
         override: bool = False,
         append: bool = False,
-        async_execution: bool = False,
    ) -> None:
         """
         Asserts a test by running the evaluation and checking the results for success
@@ -470,7 +296,7 @@ class JudgmentClient(metaclass=SingletonMeta):
             async_execution (bool): Whether to run the evaluation asynchronously
         """
 
-        results: Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]
+        results: List[ScoringResult]
 
         results = self.run_evaluation(
             examples=examples,
@@ -480,19 +306,8 @@ class JudgmentClient(metaclass=SingletonMeta):
             eval_run_name=eval_run_name,
             override=override,
             append=append,
-            async_execution=async_execution,
         )
-
-        if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):
-
-            async def run_async():  # Using wrapper here to resolve mypy error with passing Task into asyncio.run
-                return await results
-
-            actual_results = safe_run_async(run_async())
-            assert_test(actual_results)  # Call the synchronous imported function
-        else:
-            # 'results' is already List[ScoringResult] here (synchronous path)
-            assert_test(results)  # Call the synchronous imported function
+        assert_test(results)
 
     def assert_trace_test(
         self,
@@ -535,7 +350,7 @@ class JudgmentClient(metaclass=SingletonMeta):
                    f"You must provide the 'tools' argument to assert_test when using a scorer with enable_param_checking=True. If you do not want to do param checking, explicitly set enable_param_checking=False for the {scorer.__name__} scorer."
                )
 
-        results: Union[List[ScoringResult], asyncio.Task | SpinnerWrappedTask]
+        results: List[ScoringResult]
 
         results = self.run_trace_evaluation(
             examples=examples,
@@ -551,13 +366,4 @@ class JudgmentClient(metaclass=SingletonMeta):
             tools=tools,
         )
 
-        if async_execution and isinstance(results, (asyncio.Task, SpinnerWrappedTask)):
-
-            async def run_async():  # Using wrapper here to resolve mypy error with passing Task into asyncio.run
-                return await results
-
-            actual_results = safe_run_async(run_async())
-            assert_test(actual_results)  # Call the synchronous imported function
-        else:
-            # 'results' is already List[ScoringResult] here (synchronous path)
-            assert_test(results)  # Call the synchronous imported function
+        assert_test(results)
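
judgeval/common/api/api.py itself is not part of this excerpt; only the call sites above show its surface (JudgmentApiClient(api_key, organization_id), fetch_evaluation_results, create_project, delete_project). A rough sketch of the delegation pattern those call sites imply, assuming the wrapper centralizes the headers the old inline requests code duplicated; the class name, endpoint path, and base URL below are placeholders, not the real implementation:

    # Illustrative only: the real client lives in judgeval/common/api/api.py (+352 lines, not shown here).
    from typing import Any, Dict
    import requests

    class JudgmentApiClientSketch:
        def __init__(self, api_key: str, organization_id: str, base_url: str) -> None:
            self.api_key = api_key
            self.organization_id = organization_id
            self.base_url = base_url  # the real base URL lives in the package, not here

        def _headers(self) -> Dict[str, str]:
            # The same three headers the removed inline requests calls built by hand.
            return {
                "Content-Type": "application/json",
                "Authorization": f"Bearer {self.api_key}",
                "X-Organization-Id": self.organization_id,
            }

        def create_project(self, project_name: str) -> Any:
            # Placeholder route; the real paths are defined in judgeval/common/api/constants.py.
            resp = requests.post(
                f"{self.base_url}/projects/add/",
                json={"project_name": project_name},
                headers=self._headers(),
            )
            resp.raise_for_status()
            return resp.json()

Whatever the real routes are, the effect visible in this diff is that judgment_client.py no longer builds URLs, headers, or status-code checks itself; each public method is reduced to a single delegation call.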