deepeval 3.5.0__py3-none-any.whl → 3.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deepeval/_version.py CHANGED
@@ -1 +1 @@
1
- __version__: str = "3.5.0"
1
+ __version__: str = "3.5.2"
deepeval/confident/api.py CHANGED
@@ -89,7 +89,9 @@ class Endpoints(Enum):
89
89
  TEST_RUN_ENDPOINT = "/v1/test-run"
90
90
  TRACES_ENDPOINT = "/v1/traces"
91
91
  ANNOTATIONS_ENDPOINT = "/v1/annotations"
92
+ PROMPTS_VERSION_ID_ENDPOINT = "/v1/prompts/:alias/versions/:versionId"
92
93
  PROMPTS_ENDPOINT = "/v1/prompts"
94
+ PROMPTS_VERSIONS_ENDPOINT = "/v1/prompts/:alias/versions"
93
95
  SIMULATE_ENDPOINT = "/v1/simulate"
94
96
  EVALUATE_ENDPOINT = "/v1/evaluate"
95
97
 
@@ -1,5 +1,4 @@
1
- from .callback import CallbackHandler
2
- from .patch import tool
1
+ from .callback import CallbackHandler, tool
3
2
 
4
3
 
5
- __all__ = ["CallbackHandler"]
4
+ __all__ = ["CallbackHandler", "tool"]
@@ -1,14 +1,12 @@
1
1
  from typing import Any, Optional, List, Dict
2
2
  from uuid import UUID
3
3
  from time import perf_counter
4
+ from deepeval.tracing.context import current_trace_context
4
5
  from deepeval.tracing.types import (
5
6
  LlmOutput,
6
7
  LlmToolCall,
7
- TraceAttributes,
8
8
  )
9
- from deepeval.metrics import BaseMetric, TaskCompletionMetric
10
- from deepeval.test_case import LLMTestCase
11
- from deepeval.test_run import global_test_run_manager
9
+ from deepeval.metrics import BaseMetric
12
10
 
13
11
  try:
14
12
  from langchain_core.callbacks.base import BaseCallbackHandler
@@ -19,11 +17,13 @@ try:
19
17
  # contains langchain imports
20
18
  from deepeval.integrations.langchain.utils import (
21
19
  parse_prompts_to_messages,
22
- prepare_dict,
23
20
  extract_name,
24
21
  safe_extract_model_name,
25
22
  safe_extract_token_usage,
23
+ enter_current_context,
24
+ exit_current_context,
26
25
  )
26
+ from deepeval.integrations.langchain.patch import tool
27
27
 
28
28
  langchain_installed = True
29
29
  except:
@@ -37,13 +37,8 @@ def is_langchain_installed():
37
37
  )
38
38
 
39
39
 
40
- # ASSUMPTIONS:
41
- # cycle for a single invoke call
42
- # one trace per cycle
43
-
44
40
  from deepeval.tracing import trace_manager
45
41
  from deepeval.tracing.types import (
46
- BaseSpan,
47
42
  LlmSpan,
48
43
  RetrieverSpan,
49
44
  TraceSpanStatus,
@@ -54,115 +49,32 @@ from deepeval.telemetry import capture_tracing_integration
54
49
 
55
50
  class CallbackHandler(BaseCallbackHandler):
56
51
 
57
- active_trace_id: Optional[str] = None
58
- metrics: List[BaseMetric] = []
59
- metric_collection: Optional[str] = None
60
-
61
52
  def __init__(
62
53
  self,
63
- metrics: List[BaseMetric] = [],
64
- metric_collection: Optional[str] = None,
65
54
  name: Optional[str] = None,
66
55
  tags: Optional[List[str]] = None,
67
56
  metadata: Optional[Dict[str, Any]] = None,
68
57
  thread_id: Optional[str] = None,
69
58
  user_id: Optional[str] = None,
59
+ metrics: Optional[List[BaseMetric]] = None,
60
+ metric_collection: Optional[str] = None,
70
61
  ):
71
62
  is_langchain_installed()
72
63
  with capture_tracing_integration("langchain.callback.CallbackHandler"):
64
+ trace = trace_manager.start_new_trace()
65
+
66
+ self.trace_uuid = trace.uuid
67
+
68
+ trace.name = name
69
+ trace.tags = tags
70
+ trace.metadata = metadata
71
+ trace.thread_id = thread_id
72
+ trace.user_id = user_id
73
73
  self.metrics = metrics
74
74
  self.metric_collection = metric_collection
75
- self.trace_attributes = TraceAttributes(
76
- name=name,
77
- tags=tags,
78
- metadata=metadata,
79
- thread_id=thread_id,
80
- user_id=user_id,
81
- )
75
+ current_trace_context.set(trace)
82
76
  super().__init__()
83
77
 
84
- def check_active_trace_id(self):
85
- if self.active_trace_id is None:
86
- self.active_trace_id = trace_manager.start_new_trace().uuid
87
-
88
- def add_span_to_trace(self, span: BaseSpan):
89
- trace_manager.add_span(span)
90
- trace_manager.add_span_to_trace(span)
91
-
92
- def end_span(self, span: BaseSpan):
93
- span.end_time = perf_counter()
94
- span.status = TraceSpanStatus.SUCCESS
95
- trace_manager.remove_span(str(span.uuid))
96
-
97
- ######## Conditions to add metric_collection to span ########
98
- if (
99
- self.metric_collection and span.parent_uuid is None
100
- ): # if span is a root span
101
- span.metric_collection = self.metric_collection
102
-
103
- ######## Conditions to add metrics to span ########
104
- if self.metrics and span.parent_uuid is None: # if span is a root span
105
-
106
- # prepare test_case for task_completion metric
107
- for metric in self.metrics:
108
- if isinstance(metric, TaskCompletionMetric):
109
- self.prepare_span_metric_test_case(metric, span)
110
-
111
- def end_trace(self, span: BaseSpan):
112
- current_trace = trace_manager.get_trace_by_uuid(self.active_trace_id)
113
-
114
- ######## Conditions send the trace for evaluation ########
115
- if self.metrics:
116
- trace_manager.evaluating = (
117
- True # to avoid posting the trace to the server
118
- )
119
- trace_manager.evaluation_loop = (
120
- True # to avoid traces being evaluated twice
121
- )
122
- trace_manager.integration_traces_to_evaluate.append(current_trace)
123
-
124
- if current_trace is not None:
125
- current_trace.input = span.input
126
- current_trace.output = span.output
127
-
128
- # set trace attributes
129
- if self.trace_attributes:
130
- if self.trace_attributes.name:
131
- current_trace.name = self.trace_attributes.name
132
- if self.trace_attributes.tags:
133
- current_trace.tags = self.trace_attributes.tags
134
- if self.trace_attributes.metadata:
135
- current_trace.metadata = self.trace_attributes.metadata
136
- if self.trace_attributes.thread_id:
137
- current_trace.thread_id = self.trace_attributes.thread_id
138
- if self.trace_attributes.user_id:
139
- current_trace.user_id = self.trace_attributes.user_id
140
-
141
- trace_manager.end_trace(self.active_trace_id)
142
- self.active_trace_id = None
143
-
144
- def prepare_span_metric_test_case(
145
- self, metric: TaskCompletionMetric, span: BaseSpan
146
- ):
147
- task_completion_metric = TaskCompletionMetric(
148
- threshold=metric.threshold,
149
- model=metric.model,
150
- include_reason=metric.include_reason,
151
- async_mode=metric.async_mode,
152
- strict_mode=metric.strict_mode,
153
- verbose_mode=metric.verbose_mode,
154
- )
155
- task_completion_metric.evaluation_cost = 0
156
- _llm_test_case = LLMTestCase(input="None", actual_output="None")
157
- _llm_test_case._trace_dict = trace_manager.create_nested_spans_dict(
158
- span
159
- )
160
- task, _ = task_completion_metric._extract_task_and_outcome(
161
- _llm_test_case
162
- )
163
- task_completion_metric.task = task
164
- span.metrics = [task_completion_metric]
165
-
166
78
  def on_chain_start(
167
79
  self,
168
80
  serialized: dict[str, Any],
@@ -174,43 +86,32 @@ class CallbackHandler(BaseCallbackHandler):
174
86
  metadata: Optional[dict[str, Any]] = None,
175
87
  **kwargs: Any,
176
88
  ) -> Any:
177
-
178
- self.check_active_trace_id()
179
- base_span = BaseSpan(
180
- uuid=str(run_id),
181
- status=TraceSpanStatus.ERRORED,
182
- children=[],
183
- trace_uuid=self.active_trace_id,
184
- parent_uuid=str(parent_run_id) if parent_run_id else None,
185
- start_time=perf_counter(),
186
- name=extract_name(serialized, **kwargs),
187
- input=inputs,
188
- metadata=prepare_dict(
189
- serialized=serialized, tags=tags, metadata=metadata, **kwargs
190
- ),
191
- # fallback for on_end callback
192
- end_time=perf_counter(),
193
- )
194
- self.add_span_to_trace(base_span)
89
+ if parent_run_id is None:
90
+ uuid_str = str(run_id)
91
+ base_span = enter_current_context(
92
+ uuid_str=uuid_str,
93
+ span_type="custom",
94
+ func_name=extract_name(serialized, **kwargs),
95
+ )
96
+ base_span.input = inputs
97
+ current_trace_context.get().input = inputs
98
+ base_span.metrics = self.metrics
99
+ base_span.metric_collection = self.metric_collection
195
100
 
196
101
  def on_chain_end(
197
102
  self,
198
- outputs: dict[str, Any],
103
+ output: Any,
199
104
  *,
200
105
  run_id: UUID,
201
106
  parent_run_id: Optional[UUID] = None,
202
- **kwargs: Any, # un-logged kwargs
107
+ **kwargs: Any,
203
108
  ) -> Any:
204
-
205
- base_span = trace_manager.get_span_by_uuid(str(run_id))
206
- if base_span is None:
207
- return
208
-
209
- base_span.output = outputs
210
- self.end_span(base_span)
211
-
212
- if parent_run_id is None:
213
- self.end_trace(base_span)
109
+ uuid_str = str(run_id)
110
+ base_span = trace_manager.get_span_by_uuid(uuid_str)
111
+ if base_span:
112
+ base_span.output = output
113
+ current_trace_context.get().output = output
114
+ exit_current_context(uuid_str=uuid_str)
214
115
 
215
116
  def on_llm_start(
216
117
  self,
@@ -223,36 +124,24 @@ class CallbackHandler(BaseCallbackHandler):
223
124
  metadata: Optional[dict[str, Any]] = None,
224
125
  **kwargs: Any,
225
126
  ) -> Any:
226
-
227
- self.check_active_trace_id()
228
-
229
- # extract input
127
+ uuid_str = str(run_id)
230
128
  input_messages = parse_prompts_to_messages(prompts, **kwargs)
231
-
232
- # extract model name
233
129
  model = safe_extract_model_name(metadata, **kwargs)
234
130
 
235
- llm_span = LlmSpan(
236
- uuid=str(run_id),
237
- status=TraceSpanStatus.ERRORED,
238
- children=[],
239
- trace_uuid=self.active_trace_id,
240
- parent_uuid=str(parent_run_id) if parent_run_id else None,
241
- start_time=perf_counter(),
242
- name=extract_name(serialized, **kwargs),
243
- input=input_messages,
244
- output="",
245
- metadata=prepare_dict(
246
- serialized=serialized, tags=tags, metadata=metadata, **kwargs
247
- ),
248
- model=model,
249
- # fallback for on_end callback
250
- end_time=perf_counter(),
251
- metric_collection=metadata.get("metric_collection", None),
252
- metrics=metadata.get("metrics", None),
131
+ llm_span: LlmSpan = enter_current_context(
132
+ uuid_str=uuid_str,
133
+ span_type="llm",
134
+ func_name=extract_name(serialized, **kwargs),
253
135
  )
254
136
 
255
- self.add_span_to_trace(llm_span)
137
+ llm_span.input = input_messages
138
+ llm_span.model = model
139
+ metrics = metadata.pop("metrics", None)
140
+ metric_collection = metadata.pop("metric_collection", None)
141
+ prompt = metadata.pop("prompt", None)
142
+ llm_span.metrics = metrics
143
+ llm_span.metric_collection = metric_collection
144
+ llm_span.prompt = prompt
256
145
 
257
146
  def on_llm_end(
258
147
  self,
@@ -262,12 +151,8 @@ class CallbackHandler(BaseCallbackHandler):
262
151
  parent_run_id: Optional[UUID] = None,
263
152
  **kwargs: Any, # un-logged kwargs
264
153
  ) -> Any:
265
- llm_span: LlmSpan = trace_manager.get_span_by_uuid(str(run_id))
266
- if llm_span is None:
267
- return
268
-
269
- if not isinstance(llm_span, LlmSpan):
270
- return
154
+ uuid_str = str(run_id)
155
+ llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
271
156
 
272
157
  output = ""
273
158
  total_input_tokens = 0
@@ -317,9 +202,38 @@ class CallbackHandler(BaseCallbackHandler):
317
202
  total_output_tokens if total_output_tokens > 0 else None
318
203
  )
319
204
 
320
- self.end_span(llm_span)
321
- if parent_run_id is None:
322
- self.end_trace(llm_span)
205
+ exit_current_context(uuid_str=uuid_str)
206
+
207
+ def on_llm_error(
208
+ self,
209
+ error: BaseException,
210
+ *,
211
+ run_id: UUID,
212
+ parent_run_id: Optional[UUID] = None,
213
+ **kwargs: Any,
214
+ ) -> Any:
215
+ uuid_str = str(run_id)
216
+ llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
217
+ llm_span.status = TraceSpanStatus.ERRORED
218
+ llm_span.error = str(error)
219
+ exit_current_context(uuid_str=uuid_str)
220
+
221
+ def on_llm_new_token(
222
+ self,
223
+ token: str,
224
+ *,
225
+ chunk,
226
+ run_id: UUID,
227
+ parent_run_id: Optional[UUID] = None,
228
+ tags: Optional[list[str]] = None,
229
+ **kwargs: Any,
230
+ ):
231
+ uuid_str = str(run_id)
232
+ llm_span: LlmSpan = trace_manager.get_span_by_uuid(uuid_str)
233
+ if llm_span.token_intervals is None:
234
+ llm_span.token_intervals = {perf_counter(): token}
235
+ else:
236
+ llm_span.token_intervals[perf_counter()] = token
323
237
 
324
238
  def on_tool_start(
325
239
  self,
@@ -333,27 +247,16 @@ class CallbackHandler(BaseCallbackHandler):
333
247
  inputs: Optional[dict[str, Any]] = None,
334
248
  **kwargs: Any,
335
249
  ) -> Any:
336
-
337
- self.check_active_trace_id()
338
-
339
- tool_span = ToolSpan(
340
- uuid=str(run_id),
341
- status=TraceSpanStatus.ERRORED,
342
- children=[],
343
- trace_uuid=self.active_trace_id,
344
- parent_uuid=str(parent_run_id) if parent_run_id else None,
345
- start_time=perf_counter(),
346
- name=extract_name(serialized, **kwargs),
347
- input=input_str,
348
- metadata=prepare_dict(
349
- serialized=serialized, tags=tags, metadata=metadata, **kwargs
350
- ),
351
- # fallback for on_end callback
352
- end_time=perf_counter(),
353
- metric_collection=metadata.get("metric_collection", None),
354
- metrics=metadata.get("metrics", None),
250
+ uuid_str = str(run_id)
251
+
252
+ tool_span = enter_current_context(
253
+ uuid_str=uuid_str,
254
+ span_type="tool",
255
+ func_name=extract_name(
256
+ serialized, **kwargs
257
+ ), # ignored when setting the input
355
258
  )
356
- self.add_span_to_trace(tool_span)
259
+ tool_span.input = inputs
357
260
 
358
261
  def on_tool_end(
359
262
  self,
@@ -364,16 +267,24 @@ class CallbackHandler(BaseCallbackHandler):
364
267
  **kwargs: Any, # un-logged kwargs
365
268
  ) -> Any:
366
269
 
367
- tool_span = trace_manager.get_span_by_uuid(str(run_id))
368
- if tool_span is None:
369
- return
370
-
270
+ uuid_str = str(run_id)
271
+ tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
371
272
  tool_span.output = output
273
+ exit_current_context(uuid_str=uuid_str)
372
274
 
373
- self.end_span(tool_span)
374
-
375
- if parent_run_id is None:
376
- self.end_trace(tool_span)
275
+ def on_tool_error(
276
+ self,
277
+ error: BaseException,
278
+ *,
279
+ run_id: UUID,
280
+ parent_run_id: Optional[UUID] = None,
281
+ **kwargs: Any, # un-logged kwargs
282
+ ) -> Any:
283
+ uuid_str = str(run_id)
284
+ tool_span: ToolSpan = trace_manager.get_span_by_uuid(uuid_str)
285
+ tool_span.status = TraceSpanStatus.ERRORED
286
+ tool_span.error = str(error)
287
+ exit_current_context(uuid_str=uuid_str)
377
288
 
378
289
  def on_retriever_start(
379
290
  self,
@@ -386,28 +297,16 @@ class CallbackHandler(BaseCallbackHandler):
386
297
  metadata: Optional[dict[str, Any]] = None,
387
298
  **kwargs: Any, # un-logged kwargs
388
299
  ) -> Any:
389
-
390
- self.check_active_trace_id()
391
-
392
- retriever_span = RetrieverSpan(
393
- uuid=str(run_id),
394
- status=TraceSpanStatus.ERRORED,
395
- children=[],
396
- trace_uuid=self.active_trace_id,
397
- parent_uuid=str(parent_run_id) if parent_run_id else None,
398
- start_time=perf_counter(),
399
- name=extract_name(serialized, **kwargs),
400
- embedder=metadata.get("ls_embedding_provider", "unknown"),
401
- metadata=prepare_dict(
402
- serialized=serialized, tags=tags, metadata=metadata, **kwargs
403
- ),
404
- # fallback for on_end callback
405
- end_time=perf_counter(),
300
+ uuid_str = str(run_id)
301
+ retriever_span = enter_current_context(
302
+ uuid_str=uuid_str,
303
+ span_type="retriever",
304
+ func_name=extract_name(serialized, **kwargs),
305
+ observe_kwargs={
306
+ "embedder": metadata.get("ls_embedding_provider", "unknown"),
307
+ },
406
308
  )
407
309
  retriever_span.input = query
408
- retriever_span.retrieval_context = []
409
-
410
- self.add_span_to_trace(retriever_span)
411
310
 
412
311
  def on_retriever_end(
413
312
  self,
@@ -417,11 +316,8 @@ class CallbackHandler(BaseCallbackHandler):
417
316
  parent_run_id: Optional[UUID] = None,
418
317
  **kwargs: Any, # un-logged kwargs
419
318
  ) -> Any:
420
-
421
- retriever_span = trace_manager.get_span_by_uuid(str(run_id))
422
-
423
- if retriever_span is None:
424
- return
319
+ uuid_str = str(run_id)
320
+ retriever_span: RetrieverSpan = trace_manager.get_span_by_uuid(uuid_str)
425
321
 
426
322
  # prepare output
427
323
  output_list = []
@@ -431,58 +327,8 @@ class CallbackHandler(BaseCallbackHandler):
431
327
  else:
432
328
  output_list.append(str(output))
433
329
 
434
- retriever_span.input = retriever_span.input
435
- retriever_span.retrieval_context = output_list
436
-
437
- self.end_span(retriever_span)
438
-
439
- if parent_run_id is None:
440
- self.end_trace(retriever_span)
441
-
442
- ################## on_error callbacks ###############
443
-
444
- def on_chain_error(
445
- self,
446
- error: BaseException,
447
- *,
448
- run_id: UUID,
449
- parent_run_id: Optional[UUID] = None,
450
- **kwargs: Any,
451
- ) -> None:
452
- base_span = trace_manager.get_span_by_uuid(str(run_id))
453
- if base_span is None:
454
- return
455
-
456
- base_span.end_time = perf_counter()
457
-
458
- def on_llm_error(
459
- self,
460
- error: BaseException,
461
- *,
462
- run_id: UUID,
463
- parent_run_id: Optional[UUID] = None,
464
- **kwargs: Any,
465
- ) -> Any:
466
-
467
- llm_span = trace_manager.get_span_by_uuid(str(run_id))
468
- if llm_span is None:
469
- return
470
-
471
- llm_span.end_time = perf_counter()
472
-
473
- def on_tool_error(
474
- self,
475
- error: BaseException,
476
- *,
477
- run_id: UUID,
478
- parent_run_id: Optional[UUID] = None,
479
- **kwargs: Any,
480
- ) -> Any:
481
- tool_span = trace_manager.get_span_by_uuid(str(run_id))
482
- if tool_span is None:
483
- return
484
-
485
- tool_span.end_time = perf_counter()
330
+ retriever_span.output = output_list
331
+ exit_current_context(uuid_str=uuid_str)
486
332
 
487
333
  def on_retriever_error(
488
334
  self,
@@ -490,10 +336,10 @@ class CallbackHandler(BaseCallbackHandler):
490
336
  *,
491
337
  run_id: UUID,
492
338
  parent_run_id: Optional[UUID] = None,
493
- **kwargs: Any,
339
+ **kwargs: Any, # un-logged kwargs
494
340
  ) -> Any:
495
- retriever_span = trace_manager.get_span_by_uuid(str(run_id))
496
- if retriever_span is None:
497
- return
498
-
499
- retriever_span.end_time = perf_counter()
341
+ uuid_str = str(run_id)
342
+ retriever_span: RetrieverSpan = trace_manager.get_span_by_uuid(uuid_str)
343
+ retriever_span.status = TraceSpanStatus.ERRORED
344
+ retriever_span.error = str(error)
345
+ exit_current_context(uuid_str=uuid_str)
@@ -1,7 +1,8 @@
1
- from langchain_core.tools import tool as original_tool, BaseTool
1
+ import functools
2
2
  from deepeval.metrics import BaseMetric
3
- from typing import List, Optional, Callable, Any
4
- from functools import wraps
3
+ from deepeval.tracing.context import current_span_context
4
+ from typing import List, Optional, Callable
5
+ from langchain_core.tools import tool as original_tool, BaseTool
5
6
 
6
7
 
7
8
  def tool(
@@ -16,17 +17,27 @@ def tool(
16
17
 
17
18
  # original_tool returns a decorator function, so we need to return a decorator
18
19
  def decorator(func: Callable) -> BaseTool:
19
-
20
- # Apply the original tool decorator to get the BaseTool
20
+ func = _patch_tool_decorator(func, metrics, metric_collection)
21
21
  tool_instance = original_tool(*args, **kwargs)(func)
22
-
23
- if isinstance(tool_instance, BaseTool):
24
- if tool_instance.metadata is None:
25
- tool_instance.metadata = {}
26
-
27
- tool_instance.metadata["metric_collection"] = metric_collection
28
- tool_instance.metadata["metrics"] = metrics
29
-
30
22
  return tool_instance
31
23
 
32
24
  return decorator
25
+
26
+
27
+ def _patch_tool_decorator(
28
+ func: Callable,
29
+ metrics: Optional[List[BaseMetric]] = None,
30
+ metric_collection: Optional[str] = None,
31
+ ):
32
+ original_func = func
33
+
34
+ @functools.wraps(original_func)
35
+ def wrapper(*args, **kwargs):
36
+ current_span = current_span_context.get()
37
+ current_span.metrics = metrics
38
+ current_span.metric_collection = metric_collection
39
+ res = original_func(*args, **kwargs)
40
+ return res
41
+
42
+ tool = wrapper
43
+ return tool