judgeval 0.0.11__py3-none-any.whl → 0.0.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
judgeval/common/tracer.py CHANGED
@@ -199,10 +199,11 @@ class TraceManagerClient:
             JUDGMENT_TRACES_FETCH_API_URL,
             json={
                 "trace_id": trace_id,
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
@@ -225,6 +226,7 @@ class TraceManagerClient:
             json=trace_data,
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
@@ -248,6 +250,7 @@ class TraceManagerClient:
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
@@ -263,11 +266,12 @@ class TraceManagerClient:
         response = requests.delete(
             JUDGMENT_TRACES_DELETE_API_URL,
             json={
-                "judgment_api_key": self.judgment_api_key,
+                # "judgment_api_key": self.judgment_api_key,
                 "trace_ids": trace_ids,
             },
             headers={
                 "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
             }
         )
 
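Note: the four hunks above all make the same change — the API key moves out of the JSON request body (where it is now commented out) and into a standard Authorization: Bearer header. A minimal sketch of the before/after pattern, using a placeholder endpoint and key rather than the package's real constants:

    import requests

    API_URL = "https://api.judgmentlabs.ai/traces/fetch/"  # placeholder endpoint
    api_key = "YOUR_JUDGMENT_API_KEY"                      # placeholder key

    # 0.0.11 style: the key traveled in the JSON body
    # requests.post(API_URL, json={"trace_id": "abc", "judgment_api_key": api_key})

    # 0.0.13 style: the key travels in a Bearer header
    response = requests.post(
        API_URL,
        json={"trace_id": "abc"},
        headers={
            "Content-Type": "application/json",
            "Authorization": f"Bearer {api_key}",
        },
    )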
@@ -557,7 +561,8 @@ class TraceClient:
             "overwrite": overwrite
         }
 
-        if not empty_save:
+        # Execute asynchronous evaluation in the background
+        if not empty_save:  # Only send to RabbitMQ if the trace is not empty
             connection = pika.BlockingConnection(
                 pika.ConnectionParameters(host=RABBITMQ_HOST, port=RABBITMQ_PORT))
             channel = connection.channel()
@@ -575,6 +580,25 @@ class TraceClient:
 
         self.trace_manager_client.save_trace(trace_data, empty_save)
 
+
+        # Save trace data by making POST request to API
+        response = requests.post(
+            JUDGMENT_TRACES_SAVE_API_URL,
+            json=trace_data,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.tracer.api_key}"  # Bearer token format
+            }
+        )
+
+        if response.status_code == HTTPStatus.BAD_REQUEST:
+            raise ValueError(f"Failed to save trace data: Check your Trace name for conflicts, set overwrite=True to overwrite existing traces: {response.text}")
+        elif response.status_code != HTTPStatus.OK:
+            raise ValueError(f"Failed to save trace data: {response.text}")
+
+        if not empty_save and "ui_results_url" in response.json():
+            rprint(f"\n🔍 You can view your trace data here: [rgb(106,0,255)]{response.json()['ui_results_url']}[/]\n")
+
         return self.trace_id, trace_data
 
     def delete(self):
@@ -588,23 +612,31 @@ class Tracer:
             cls._instance = super(Tracer, cls).__new__(cls)
         return cls._instance
 
-    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY")):
+    def __init__(self, api_key: str = os.getenv("JUDGMENT_API_KEY"), project_name: str = "default_project"):
         if not hasattr(self, 'initialized'):
-
             if not api_key:
                 raise ValueError("Tracer must be configured with a Judgment API key")
 
             self.api_key: str = api_key
+            self.project_name: str = project_name
             self.client: JudgmentClient = JudgmentClient(judgment_api_key=api_key)
             self.depth: int = 0
             self._current_trace: Optional[str] = None
             self.initialized: bool = True
+        elif hasattr(self, 'project_name') and self.project_name != project_name:
+            warnings.warn(
+                f"Attempting to initialize Tracer with project_name='{project_name}' but it was already initialized with "
+                f"project_name='{self.project_name}'. Due to the singleton pattern, the original project_name will be used. "
+                "To use a different project name, ensure the first Tracer initialization uses the desired project name.",
+                RuntimeWarning
+            )
 
     @contextmanager
-    def trace(self, name: str, project_name: str = "default_project", overwrite: bool = False) -> Generator[TraceClient, None, None]:
+    def trace(self, name: str, project_name: str = None, overwrite: bool = False) -> Generator[TraceClient, None, None]:
         """Start a new trace context using a context manager"""
         trace_id = str(uuid.uuid4())
-        trace = TraceClient(self, trace_id, name, project_name=project_name, overwrite=overwrite)
+        project = project_name if project_name is not None else self.project_name
+        trace = TraceClient(self, trace_id, name, project_name=project, overwrite=overwrite)
         prev_trace = self._current_trace
         self._current_trace = trace
 
@@ -623,28 +655,40 @@ class Tracer:
         """
         return self._current_trace
 
-    def observe(self, func=None, *, name=None, span_type: SpanType = "span"):
+    def observe(self, func=None, *, name=None, span_type: SpanType = "span", project_name: str = None, overwrite: bool = False):
         """
         Decorator to trace function execution with detailed entry/exit information.
 
         Args:
-            func: The function to trace
-            name: Optional custom name for the function
-            span_type: The type of span to use for this observation (default: "span")
+            func: The function to decorate
+            name: Optional custom name for the span (defaults to function name)
+            span_type: Type of span (default "span")
+            project_name: Optional project name override
+            overwrite: Whether to overwrite existing traces
         """
         if func is None:
-            return lambda f: self.observe(f, name=name, span_type=span_type)
+            return lambda f: self.observe(f, name=name, span_type=span_type, project_name=project_name, overwrite=overwrite)
+
+        # Use provided name or fall back to function name
+        span_name = name or func.__name__
 
         if asyncio.iscoroutinefunction(func):
             @functools.wraps(func)
             async def async_wrapper(*args, **kwargs):
+                # If there's already a trace, use it. Otherwise create a new one
                 if self._current_trace:
-                    span_name = name or func.__name__
-
-                    with self._current_trace.span(span_name, span_type=span_type) as span:
-                        # Set the span type
-                        span.span_type = span_type
-
+                    trace = self._current_trace
+                else:
+                    trace_id = str(uuid.uuid4())
+                    trace_name = str(uuid.uuid4())
+                    project = project_name if project_name is not None else self.project_name
+                    trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                    self._current_trace = trace
+                    # Only save empty trace for the root call
+                    trace.save(empty_save=True, overwrite=overwrite)
+
+                try:
+                    with trace.span(span_name, span_type=span_type) as span:
                         # Record inputs
                         span.record_input({
                             'args': list(args),
@@ -658,19 +702,30 @@ class Tracer:
                         span.record_output(result)
 
                         return result
-
-                return await func(*args, **kwargs)
+                finally:
+                    # Only save and cleanup if this is the root observe call
+                    if self.depth == 0:
+                        trace.save(empty_save=False, overwrite=overwrite)
+                        self._current_trace = None
+
             return async_wrapper
         else:
             @functools.wraps(func)
             def wrapper(*args, **kwargs):
+                # If there's already a trace, use it. Otherwise create a new one
                 if self._current_trace:
-                    span_name = name or func.__name__
-
-                    with self._current_trace.span(span_name, span_type=span_type) as span:
-                        # Set the span type
-                        span.span_type = span_type
-
+                    trace = self._current_trace
+                else:
+                    trace_id = str(uuid.uuid4())
+                    trace_name = str(uuid.uuid4())
+                    project = project_name if project_name is not None else self.project_name
+                    trace = TraceClient(self, trace_id, trace_name, project_name=project, overwrite=overwrite)
+                    self._current_trace = trace
+                    # Only save empty trace for the root call
+                    trace.save(empty_save=True, overwrite=overwrite)
+
+                try:
+                    with trace.span(span_name, span_type=span_type) as span:
                         # Record inputs
                         span.record_input({
                             'args': list(args),
@@ -684,8 +739,12 @@ class Tracer:
                         span.record_output(result)
 
                         return result
-
-                return func(*args, **kwargs)
+                finally:
+                    # Only save and cleanup if this is the root observe call
+                    if self.depth == 0:
+                        trace.save(empty_save=False, overwrite=overwrite)
+                        self._current_trace = None
+
             return wrapper
 
     def wrap(client: Any) -> Any:
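Taken together, the Tracer changes mean project_name is now configured once on the singleton and inherited by trace() and observe() unless overridden per call, and observe() can now run standalone, creating and saving its own trace when no enclosing trace exists. A rough usage sketch under those assumptions (the function and project names are illustrative):

    from judgeval.common.tracer import Tracer

    tracer = Tracer(project_name="my_project")  # first initialization wins

    @tracer.observe(span_type="tool")
    def lookup(query: str) -> str:
        return f"results for {query}"

    # With no enclosing tracer.trace(...) context, observe() creates a trace
    # of its own (named with a random UUID) and saves it when the call returns.
    lookup("weather")

    # Re-initializing with a different project_name only emits a RuntimeWarning;
    # the original "my_project" is kept because of the singleton pattern.
    Tracer(project_name="other_project")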
judgeval/constants.py CHANGED
@@ -36,7 +36,9 @@ ROOT_API = os.getenv("JUDGMENT_API_URL", "https://api.judgmentlabs.ai")
 JUDGMENT_EVAL_API_URL = f"{ROOT_API}/evaluate/"
 JUDGMENT_DATASETS_PUSH_API_URL = f"{ROOT_API}/datasets/push/"
 JUDGMENT_DATASETS_PULL_API_URL = f"{ROOT_API}/datasets/pull/"
+JUDGMENT_DATASETS_EXPORT_JSONL_API_URL = f"{ROOT_API}/datasets/export_jsonl/"
 JUDGMENT_DATASETS_PULL_ALL_API_URL = f"{ROOT_API}/datasets/get_all_stats/"
+JUDGMENT_DATASETS_EDIT_API_URL = f"{ROOT_API}/datasets/edit/"
 JUDGMENT_EVAL_LOG_API_URL = f"{ROOT_API}/log_eval_results/"
 JUDGMENT_EVAL_FETCH_API_URL = f"{ROOT_API}/fetch_eval_results/"
 JUDGMENT_EVAL_DELETE_API_URL = f"{ROOT_API}/delete_eval_results_by_project_and_run_name/"
judgeval/data/datasets/dataset.py CHANGED
@@ -162,7 +162,8 @@ class EvalDataset:
             "additional_metadata": ast.literal_eval(row["additional_metadata"]) if pd.notna(row["additional_metadata"]) else dict(),
             "tools_called": row["tools_called"].split(";") if pd.notna(row["tools_called"]) else [],
             "expected_tools": row["expected_tools"].split(";") if pd.notna(row["expected_tools"]) else [],
-            "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None
+            "trace_id": row["trace_id"] if pd.notna(row["trace_id"]) else None,
+            "example_id": str(row["example_id"]) if pd.notna(row["example_id"]) else None
         }
         if row["example"]:
             data["name"] = row["name"] if pd.notna(row["name"]) else None
judgeval/data/datasets/eval_dataset_client.py CHANGED
@@ -1,5 +1,5 @@
 
-from typing import Optional
+from typing import Optional, List
 import requests
 from rich.progress import Progress, SpinnerColumn, TextColumn
 
@@ -7,7 +7,9 @@ from judgeval.common.logger import debug, error, warning, info
 from judgeval.constants import (
     JUDGMENT_DATASETS_PUSH_API_URL,
     JUDGMENT_DATASETS_PULL_API_URL,
-    JUDGMENT_DATASETS_PULL_ALL_API_URL
+    JUDGMENT_DATASETS_PULL_ALL_API_URL,
+    JUDGMENT_DATASETS_EDIT_API_URL,
+    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL
 )
 from judgeval.data import Example
 from judgeval.data.datasets import EvalDataset
@@ -23,7 +25,7 @@ class EvalDatasetClient:
     def create_dataset(self) -> EvalDataset:
         return EvalDataset(judgment_api_key=self.judgment_api_key)
 
-    def push(self, dataset: EvalDataset, alias: str,overwrite: Optional[bool] = False) -> bool:
+    def push(self, dataset: EvalDataset, alias: str, overwrite: Optional[bool] = False) -> bool:
         debug(f"Pushing dataset with alias '{alias}' (overwrite={overwrite})")
         if overwrite:
             warning(f"Overwrite enabled for alias '{alias}'")
@@ -56,12 +58,16 @@ class EvalDatasetClient:
             "ground_truths": [g.to_dict() for g in dataset.ground_truths],
             "examples": [e.to_dict() for e in dataset.examples],
             "overwrite": overwrite,
-            "judgment_api_key": dataset.judgment_api_key
+            # "judgment_api_key": dataset.judgment_api_key
         }
         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PUSH_API_URL,
-                json=content
+                json=content,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             if response.status_code == 500:
                 error(f"Server error during push: {content.get('message')}")
@@ -115,13 +121,17 @@ class EvalDatasetClient:
         )
         request_body = {
             "alias": alias,
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }
 
         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PULL_API_URL,
-                json=request_body
+                json=request_body,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -169,13 +179,17 @@ class EvalDatasetClient:
             total=100,
         )
         request_body = {
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }
 
         try:
             response = requests.post(
                 JUDGMENT_DATASETS_PULL_ALL_API_URL,
-                json=request_body
+                json=request_body,
+                headers={
+                    "Content-Type": "application/json",
+                    "Authorization": f"Bearer {self.judgment_api_key}"
+                }
             )
             response.raise_for_status()
         except requests.exceptions.RequestException as e:
@@ -191,3 +205,86 @@ class EvalDatasetClient:
         )
 
         return payload
+
+    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+        """
+        Edits the dataset on Judgment platform by adding new examples and ground truths
+
+        Mock request:
+        {
+            "alias": alias,
+            "examples": [...],
+            "ground_truths": [...],
+            "judgment_api_key": self.judgment_api_key
+        }
+        """
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Editing dataset [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] on Judgment...",
+                total=100,
+            )
+
+            content = {
+                "alias": alias,
+                "examples": [e.to_dict() for e in examples],
+                "ground_truths": [g.to_dict() for g in ground_truths],
+                "judgment_api_key": self.judgment_api_key
+            }
+
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EDIT_API_URL,
+                    json=content
+                )
+                response.raise_for_status()
+            except requests.exceptions.RequestException as e:
+                error(f"Error editing dataset: {str(e)}")
+                return False
+
+            info(f"Successfully edited dataset '{alias}'")
+            return True
+
+    def export_jsonl(self, alias: str) -> requests.Response:
+        """Export dataset in JSONL format from Judgment platform"""
+        debug(f"Exporting dataset with alias '{alias}' as JSONL")
+        with Progress(
+            SpinnerColumn(style="rgb(106,0,255)"),
+            TextColumn("[progress.description]{task.description}"),
+            transient=False,
+        ) as progress:
+            task_id = progress.add_task(
+                f"Exporting [rgb(106,0,255)]'{alias}'[/rgb(106,0,255)] as JSONL...",
+                total=100,
+            )
+            try:
+                response = requests.post(
+                    JUDGMENT_DATASETS_EXPORT_JSONL_API_URL,
+                    json={"alias": alias},
+                    headers={
+                        "Content-Type": "application/json",
+                        "Authorization": f"Bearer {self.judgment_api_key}"
+                    },
+                    stream=True
+                )
+                response.raise_for_status()
+            except requests.exceptions.HTTPError as err:
+                if err.response.status_code == 404:
+                    error(f"Dataset not found: {alias}")
+                else:
+                    error(f"HTTP error during export: {err}")
+                raise
+            except Exception as e:
+                error(f"Error during export: {str(e)}")
+                raise
+
+            info(f"Successfully exported dataset with alias '{alias}'")
+            progress.update(
+                task_id,
+                description=f"{progress.tasks[task_id].description} [rgb(25,227,160)]Done!)",
+            )
+
+            return response
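A usage sketch for the two new client methods, assuming an EvalDatasetClient constructed with a judgment_api_key (the constructor is not shown in this diff) and a dataset alias that already exists on the platform:

    from judgeval.data import Example
    from judgeval.data.datasets import EvalDatasetClient

    client = EvalDatasetClient(judgment_api_key="YOUR_JUDGMENT_API_KEY")

    # Append new examples (and optionally ground truths) to an existing dataset
    client.edit_dataset(
        alias="my-dataset",
        examples=[Example(input="What is 2+2?", actual_output="4")],
        ground_truths=[],
    )

    # Stream the dataset back as JSONL; export_jsonl() returns the raw
    # streaming requests.Response (note stream=True in the implementation)
    response = client.export_jsonl("my-dataset")
    with open("my-dataset.jsonl", "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)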
judgeval/data/example.py CHANGED
@@ -4,9 +4,11 @@ Classes for representing examples in a dataset.
 
 
 from typing import TypeVar, Optional, Any, Dict, List
-from pydantic import BaseModel
+from uuid import uuid4
+from pydantic import BaseModel, Field
 from enum import Enum
 from datetime import datetime
+import time
 
 
 Input = TypeVar('Input')
@@ -33,15 +35,19 @@ class Example(BaseModel):
     tools_called: Optional[List[str]] = None
     expected_tools: Optional[List[str]] = None
     name: Optional[str] = None
-    example_id: Optional[str] = None
+    example_id: str = Field(default_factory=lambda: str(uuid4()))
+    example_index: Optional[int] = None
     timestamp: Optional[str] = None
     trace_id: Optional[str] = None
 
     def __init__(self, **data):
-        super().__init__(**data)
+        if 'example_id' not in data:
+            data['example_id'] = str(uuid4())
         # Set timestamp if not provided
-        if self.timestamp is None:
-            self.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        if 'timestamp' not in data:
+            data['timestamp'] = datetime.now().strftime("%Y%m%d_%H%M%S")
+        super().__init__(**data)
+
 
     def to_dict(self):
         return {
@@ -55,6 +61,7 @@ class Example(BaseModel):
             "expected_tools": self.expected_tools,
             "name": self.name,
             "example_id": self.example_id,
+            "example_index": self.example_index,
             "timestamp": self.timestamp,
             "trace_id": self.trace_id
         }
@@ -71,6 +78,7 @@ class Example(BaseModel):
             f"expected_tools={self.expected_tools}, "
             f"name={self.name}, "
             f"example_id={self.example_id}, "
+            f"example_index={self.example_index}, "
             f"timestamp={self.timestamp}, "
             f"trace_id={self.trace_id})"
         )
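The net effect on Example: every instance now gets a uuid4-based example_id and a timestamp at construction time, whether or not the caller supplies them, and the new example_index field is reserved for a positional index assigned later (see the run_evaluation.py hunks below). A quick sketch, assuming input and actual_output are accepted keyword fields as their presence in to_dict() suggests:

    from judgeval.data import Example

    e1 = Example(input="What is 2+2?", actual_output="4")
    e2 = Example(input="What is 2+2?", actual_output="4")

    assert e1.example_id != e2.example_id   # fresh uuid4 string per instance
    assert e1.timestamp is not None         # e.g. "20250101_120000"
    assert e1.example_index is None         # populated later by run_eval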
judgeval/judgment_client.py CHANGED
@@ -6,7 +6,7 @@ from typing import Optional, List, Dict, Any, Union
 import requests
 
 from judgeval.constants import ROOT_API
-from judgeval.data.datasets import EvalDataset, EvalDatasetClient
+from judgeval.data.datasets import EvalDataset, EvalDatasetClient, GroundTruthExample
 from judgeval.data import (
     ScoringResult,
     Example
@@ -164,6 +164,11 @@ class JudgmentClient:
         """
         return self.eval_dataset_client.pull_all_user_dataset_stats()
 
+    def edit_dataset(self, alias: str, examples: List[Example], ground_truths: List[GroundTruthExample]) -> bool:
+        """
+        Edits the dataset on Judgment platform by adding new examples and ground truths
+        """
+        return self.eval_dataset_client.edit_dataset(alias, examples, ground_truths)
 
     # Maybe add option where you can pass in the EvaluationRun object and it will pull the eval results from the backend
     def pull_eval(self, project_name: str, eval_run_name: str) -> List[Dict[str, Union[str, List[ScoringResult]]]]:
@@ -182,6 +187,10 @@ class JudgmentClient:
                                  eval_name=eval_run_name,
                                  judgment_api_key=self.judgment_api_key)
         eval_run = requests.post(JUDGMENT_EVAL_FETCH_API_URL,
+                                 headers={
+                                     "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}"
+                                 },
                                  json=eval_run_request_body.model_dump())
         if eval_run.status_code != requests.codes.ok:
             raise ValueError(f"Error fetching eval results: {eval_run.json()}")
@@ -213,6 +222,7 @@ class JudgmentClient:
                                  json=eval_run_request_body.model_dump(),
                                  headers={
                                      "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}"
                                  })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -235,6 +245,7 @@ class JudgmentClient:
                                  },
                                  headers={
                                      "Content-Type": "application/json",
+                                     "Authorization": f"Bearer {self.judgment_api_key}"
                                  })
         if response.status_code != requests.codes.ok:
             raise ValueError(f"Error deleting eval results: {response.json()}")
@@ -246,7 +257,11 @@ class JudgmentClient:
         """
         response = requests.post(
             f"{ROOT_API}/validate_api_key/",
-            json={"api_key": self.judgment_api_key}
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}",
+            },
+            json={}  # Empty body now
         )
         if response.status_code == 200:
             return True, response.json()
@@ -268,12 +283,16 @@ class JudgmentClient:
         """
         request_body = {
             "slug": slug,
-            "judgment_api_key": self.judgment_api_key
+            # "judgment_api_key": self.judgment_api_key
         }
 
         response = requests.post(
             f"{ROOT_API}/fetch_scorer/",
-            json=request_body
+            json=request_body,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
+            }
         )
 
         if response.status_code == 500:
@@ -306,13 +325,17 @@ class JudgmentClient:
             "name": scorer.name,
             "conversation": scorer.conversation,
             "options": scorer.options,
-            "judgment_api_key": self.judgment_api_key,
+            # "judgment_api_key": self.judgment_api_key,
             "slug": slug
         }
 
         response = requests.post(
             f"{ROOT_API}/save_scorer/",
-            json=request_body
+            json=request_body,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {self.judgment_api_key}"
+            }
        )
 
         if response.status_code == 500:
judgeval/run_evaluation.py CHANGED
@@ -47,7 +47,12 @@ def execute_api_eval(evaluation_run: EvaluationRun) -> List[Dict]:
     try:
         # submit API request to execute evals
         payload = evaluation_run.model_dump(warnings=False)
-        response = requests.post(JUDGMENT_EVAL_API_URL, json=payload)
+        response = requests.post(
+            JUDGMENT_EVAL_API_URL, headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            },
+            json=payload)
         response_data = response.json()
     except Exception as e:
         error(f"Error: {e}")
@@ -151,6 +156,10 @@ def check_eval_run_name_exists(eval_name: str, project_name: str, judgment_api_k
     try:
         response = requests.post(
             f"{ROOT_API}/eval-run-name-exists/",
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {judgment_api_key}"
+            },
             json={
                 "eval_name": eval_name,
                 "project_name": project_name,
@@ -188,6 +197,10 @@ def log_evaluation_results(merged_results: List[ScoringResult], evaluation_run:
     try:
         res = requests.post(
             JUDGMENT_EVAL_LOG_API_URL,
+            headers={
+                "Content-Type": "application/json",
+                "Authorization": f"Bearer {evaluation_run.judgment_api_key}"
+            },
             json={
                 "results": [result.to_dict() for result in merged_results],
                 "judgment_api_key": evaluation_run.judgment_api_key,
@@ -247,12 +260,10 @@ def run_eval(evaluation_run: EvaluationRun, override: bool = False) -> List[Scor
     # Set example IDs if not already set
     debug("Initializing examples with IDs and timestamps")
     for idx, example in enumerate(evaluation_run.examples):
-        if example.example_id is None:
-            example.example_id = idx
-            debug(f"Set example ID {idx} for input: {example.input[:50]}...")
+        example.example_index = idx  # Set numeric index
         example.timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
         with example_logging_context(example.timestamp, example.example_id):
-            debug(f"Initialized example {example.example_id}")
+            debug(f"Initialized example {example.example_id} (index: {example.example_index})")
             debug(f"Input: {example.input}")
             debug(f"Actual output: {example.actual_output}")
             if example.expected_output:
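In other words, the positional index that 0.0.11 wrote into example_id (clobbering the field with an int) now lives in the dedicated example_index field, while example_id stays a stable uuid4 string for the life of the example. The new bookkeeping reduces to:

    # Sketch of the 0.0.13 behavior, using names from the diff
    for idx, example in enumerate(evaluation_run.examples):
        example.example_index = idx   # positional index, new in 0.0.13
        # example.example_id is untouched: a uuid4 string set at construction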
judgeval-0.0.11.dist-info/METADATA → judgeval-0.0.13.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: judgeval
-Version: 0.0.11
+Version: 0.0.13
 Summary: Judgeval Package
 Project-URL: Homepage, https://github.com/JudgmentLabs/judgeval
 Project-URL: Issues, https://github.com/JudgmentLabs/judgeval/issues
judgeval-0.0.11.dist-info/RECORD → judgeval-0.0.13.dist-info/RECORD RENAMED
@@ -1,22 +1,22 @@
 judgeval/__init__.py,sha256=xiiG4CkeaOtey4fusCd9CBz0BVqzTIbV-K2EFIU0rUM,283
 judgeval/clients.py,sha256=Ns5ljrgPPXUMo7fSPJxO12H64lcPyKeQPIVG_RMi2cM,1162
-judgeval/constants.py,sha256=oL3kWHg9CzQJiTInDTgJgxRhF3fgylhvEVP360UqG8A,2654
+judgeval/constants.py,sha256=43hGesvBbX1uzc4KXvjLCVdd6cyZRMSnEJp11oA7h74,2794
 judgeval/evaluation_run.py,sha256=ev-IbL34SwRv8lwB4KHfYag1jYo6b049R8mmwNBqmnM,5923
-judgeval/judgment_client.py,sha256=thmSXi2essIlmd_j5SjlBw9_8qJJp6N3djoWdLaMrj0,13770
-judgeval/run_evaluation.py,sha256=YOQ6s9RuUrXPTgoYexf7r6Hl1QKIMSTdvHl9kw-ZMzw,20103
+judgeval/judgment_client.py,sha256=7vaarj6zXQmQ44m0cVCe72S4e92eZ4tK8sqNTnx4FLQ,14957
+judgeval/run_evaluation.py,sha256=vl6TcwJVH2jN60Gja1E1tPI3Jvv6YNeNMTDVTcWkqZY,20520
 judgeval/common/__init__.py,sha256=7d24BRxtncpMj3AAJCj8RS7TqgjXmW777HVZH6-3sBs,289
 judgeval/common/exceptions.py,sha256=U-TxHLn7oVMezsMuoYouNDb2XuS8RCggfntYf5_6u4E,565
 judgeval/common/logger.py,sha256=QXN3UMymmKu2iMEMEgATLBnMDjGr_pE2iOSEFoICgg8,6092
-judgeval/common/tracer.py,sha256=wp-oGl8rdAe3_UXcvrEKFg7V6Vnvrnz9y_RVVgYOjCY,29934
+judgeval/common/tracer.py,sha256=szU7mhyMIoG9EvPIb6dtxv7ix83WVuv7TtVX31FWMoQ,33582
 judgeval/common/utils.py,sha256=3WRyyX0tvnnj_VAVlEdtZrfzyWj6zfX04xdpCtE1m5Y,33736
 judgeval/data/__init__.py,sha256=YferxwmUqoBi18hrdgro0BD0h4pt20LAqISeUzGMcVU,474
 judgeval/data/api_example.py,sha256=vwWFbI6eJr5VgURCRbuSiMtEXLUbTCih_BcaqEBy-pg,4108
-judgeval/data/example.py,sha256=lymGZ3jG818-r2vyFunt6OLFrhESOyJnbhao_ljTjlA,2471
+judgeval/data/example.py,sha256=r_ZA_Fq0k-1xSutSLURwj0-Ug0C_yQl4GQlqtDxbYT0,2771
 judgeval/data/result.py,sha256=8FIO-bFKPegZuByKRjA2_sumjb8oGWQ5ZeQ1RVz5z2w,4393
 judgeval/data/scorer_data.py,sha256=pYljblCPZrlMIv5Eg7R-clnmsqzUBAwokKjZpwa0DXE,3280
 judgeval/data/datasets/__init__.py,sha256=eO6ayeM_bTGwIt0eDSlTBIIBvXvIWRWWSfYZrZROPiQ,265
-judgeval/data/datasets/dataset.py,sha256=AGdU21vZ4iVjqbjQ7JY-u29FzJrdDFTgdvhzvYVJNyo,11833
-judgeval/data/datasets/eval_dataset_client.py,sha256=TaCDzymGFNFjGRrieEdQB8dT8xqNPpsEi2XLGFyrJno,7113
+judgeval/data/datasets/dataset.py,sha256=6-BhkGiwMmvROxnFbefgzsFZy7wAaLi9kiTQ6p0h_xk,11928
+judgeval/data/datasets/eval_dataset_client.py,sha256=6wybPyt0BjrMQcOl3cTkcY3c9Pbm_K1fnpMiuzh56E4,11006
 judgeval/data/datasets/ground_truth.py,sha256=OTBs3VZe-Wp0vEXEsq14GPZHYtpWT16bhGQTycIvkKc,2057
 judgeval/data/datasets/utils.py,sha256=lQxyl7mevct7JcDSyIrU_8QOzT-EYPWEvoUiAeOdeek,2502
 judgeval/judges/__init__.py,sha256=tyQ5KY88Kp1Ctfw2IJxnVEpy8DnFCtmy04JdPOpp-As,339
@@ -78,7 +78,7 @@ judgeval/scorers/judgeval_scorers/local_implementations/summarization/summarizat
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/__init__.py,sha256=JUB3TMqS1OHr6PqpIGqkyiBNbyfUaw7lZuUATjU3_ek,168
 judgeval/scorers/judgeval_scorers/local_implementations/tool_correctness/tool_correctness_scorer.py,sha256=CYGRJY5EuyICYzHrmFdLykwXakX8AC7G3Bhj7p6szfY,5493
 judgeval/tracer/__init__.py,sha256=wy3DYpH8U_z0GO_K_gOSkK0tTTD-u5eLDo0T5xIBoAc,147
-judgeval-0.0.11.dist-info/METADATA,sha256=WH8aPpUNCwE1Zr21qJ0H0WEVB_i_dilyLSbw9e5nXZo,1283
-judgeval-0.0.11.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-judgeval-0.0.11.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
-judgeval-0.0.11.dist-info/RECORD,,
+judgeval-0.0.13.dist-info/METADATA,sha256=6BQFdiV0_9Oe119PBqfNnmgX1ZWXjN-_6x0q9lVvnDg,1283
+judgeval-0.0.13.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+judgeval-0.0.13.dist-info/licenses/LICENSE.md,sha256=tKmCg7k5QOmxPK19XMfzim04QiQJPmgIm0pAn55IJwk,11352
+judgeval-0.0.13.dist-info/RECORD,,