deepeval 3.6.6__py3-none-any.whl → 3.6.7__py3-none-any.whl

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. deepeval/_version.py +1 -1
  2. deepeval/benchmarks/equity_med_qa/equity_med_qa.py +1 -0
  3. deepeval/cli/main.py +42 -0
  4. deepeval/confident/api.py +1 -0
  5. deepeval/config/settings.py +22 -4
  6. deepeval/constants.py +8 -1
  7. deepeval/dataset/dataset.py +2 -11
  8. deepeval/dataset/utils.py +1 -1
  9. deepeval/evaluate/evaluate.py +5 -1
  10. deepeval/evaluate/execute.py +97 -42
  11. deepeval/evaluate/utils.py +20 -116
  12. deepeval/integrations/crewai/__init__.py +6 -1
  13. deepeval/integrations/crewai/handler.py +1 -1
  14. deepeval/integrations/crewai/subs.py +51 -0
  15. deepeval/integrations/crewai/wrapper.py +45 -5
  16. deepeval/metrics/answer_relevancy/answer_relevancy.py +12 -3
  17. deepeval/metrics/api.py +281 -0
  18. deepeval/metrics/argument_correctness/argument_correctness.py +12 -2
  19. deepeval/metrics/bias/bias.py +12 -3
  20. deepeval/metrics/contextual_precision/contextual_precision.py +12 -3
  21. deepeval/metrics/contextual_recall/contextual_recall.py +12 -3
  22. deepeval/metrics/contextual_relevancy/contextual_relevancy.py +12 -1
  23. deepeval/metrics/conversation_completeness/conversation_completeness.py +12 -0
  24. deepeval/metrics/conversational_dag/conversational_dag.py +12 -0
  25. deepeval/metrics/conversational_dag/nodes.py +12 -4
  26. deepeval/metrics/conversational_g_eval/conversational_g_eval.py +73 -59
  27. deepeval/metrics/dag/dag.py +12 -0
  28. deepeval/metrics/dag/nodes.py +12 -4
  29. deepeval/metrics/faithfulness/faithfulness.py +12 -1
  30. deepeval/metrics/g_eval/g_eval.py +11 -0
  31. deepeval/metrics/hallucination/hallucination.py +12 -1
  32. deepeval/metrics/indicator.py +8 -2
  33. deepeval/metrics/json_correctness/json_correctness.py +12 -1
  34. deepeval/metrics/knowledge_retention/knowledge_retention.py +12 -0
  35. deepeval/metrics/mcp/mcp_task_completion.py +13 -0
  36. deepeval/metrics/mcp/multi_turn_mcp_use_metric.py +13 -0
  37. deepeval/metrics/mcp_use_metric/mcp_use_metric.py +12 -1
  38. deepeval/metrics/misuse/misuse.py +12 -1
  39. deepeval/metrics/multimodal_metrics/image_coherence/image_coherence.py +3 -0
  40. deepeval/metrics/multimodal_metrics/image_editing/image_editing.py +3 -0
  41. deepeval/metrics/multimodal_metrics/image_helpfulness/image_helpfulness.py +3 -0
  42. deepeval/metrics/multimodal_metrics/image_reference/image_reference.py +3 -0
  43. deepeval/metrics/multimodal_metrics/multimodal_answer_relevancy/multimodal_answer_relevancy.py +6 -1
  44. deepeval/metrics/multimodal_metrics/multimodal_contextual_precision/multimodal_contextual_precision.py +6 -1
  45. deepeval/metrics/multimodal_metrics/multimodal_contextual_recall/multimodal_contextual_recall.py +3 -0
  46. deepeval/metrics/multimodal_metrics/multimodal_contextual_relevancy/multimodal_contextual_relevancy.py +3 -0
  47. deepeval/metrics/multimodal_metrics/multimodal_faithfulness/multimodal_faithfulness.py +3 -0
  48. deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +3 -0
  49. deepeval/metrics/multimodal_metrics/multimodal_tool_correctness/multimodal_tool_correctness.py +10 -5
  50. deepeval/metrics/non_advice/non_advice.py +12 -0
  51. deepeval/metrics/pii_leakage/pii_leakage.py +12 -1
  52. deepeval/metrics/prompt_alignment/prompt_alignment.py +12 -1
  53. deepeval/metrics/role_adherence/role_adherence.py +12 -0
  54. deepeval/metrics/role_violation/role_violation.py +12 -0
  55. deepeval/metrics/summarization/summarization.py +12 -1
  56. deepeval/metrics/task_completion/task_completion.py +3 -0
  57. deepeval/metrics/tool_correctness/tool_correctness.py +8 -0
  58. deepeval/metrics/toxicity/toxicity.py +12 -0
  59. deepeval/metrics/turn_relevancy/turn_relevancy.py +12 -0
  60. deepeval/models/llms/grok_model.py +1 -1
  61. deepeval/models/llms/openai_model.py +2 -0
  62. deepeval/openai/__init__.py +14 -32
  63. deepeval/openai/extractors.py +24 -34
  64. deepeval/openai/patch.py +256 -161
  65. deepeval/openai/types.py +20 -0
  66. deepeval/openai/utils.py +98 -56
  67. deepeval/prompt/__init__.py +19 -1
  68. deepeval/prompt/api.py +160 -0
  69. deepeval/prompt/prompt.py +244 -62
  70. deepeval/prompt/utils.py +144 -2
  71. deepeval/synthesizer/chunking/context_generator.py +209 -152
  72. deepeval/synthesizer/chunking/doc_chunker.py +46 -12
  73. deepeval/synthesizer/synthesizer.py +8 -5
  74. deepeval/test_case/api.py +131 -0
  75. deepeval/test_run/__init__.py +1 -0
  76. deepeval/test_run/hyperparameters.py +47 -8
  77. deepeval/test_run/test_run.py +104 -1
  78. deepeval/tracing/api.py +3 -1
  79. deepeval/tracing/message_types/__init__.py +10 -0
  80. deepeval/tracing/message_types/base.py +6 -0
  81. deepeval/tracing/message_types/messages.py +14 -0
  82. deepeval/tracing/message_types/tools.py +18 -0
  83. deepeval/tracing/otel/utils.py +1 -1
  84. deepeval/tracing/trace_context.py +73 -4
  85. deepeval/tracing/tracing.py +51 -3
  86. deepeval/tracing/types.py +16 -0
  87. deepeval/tracing/utils.py +8 -0
  88. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/METADATA +1 -1
  89. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/RECORD +92 -84
  90. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/LICENSE.md +0 -0
  91. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/WHEEL +0 -0
  92. {deepeval-3.6.6.dist-info → deepeval-3.6.7.dist-info}/entry_points.txt +0 -0
@@ -10,13 +10,17 @@ import math
10
10
  import sys
11
11
  import os
12
12
  import gc
13
-
13
+ import tempfile
14
+ import logging
14
15
 
15
16
  from deepeval.synthesizer.utils import (
16
17
  print_synthesizer_status,
17
18
  SynthesizerStatus,
18
19
  )
19
- from deepeval.synthesizer.chunking.doc_chunker import DocumentChunker
20
+ from deepeval.synthesizer.chunking.doc_chunker import (
21
+ DocumentChunker,
22
+ get_chromadb,
23
+ )
20
24
  from deepeval.metrics.utils import trimAndLoadJson, initialize_model
21
25
  from deepeval.synthesizer.templates.template import FilterTemplate
22
26
  from deepeval.models.base_model import (
@@ -24,6 +28,10 @@ from deepeval.models.base_model import (
24
28
  DeepEvalBaseLLM,
25
29
  )
26
30
  from deepeval.utils import update_pbar, add_pbar, remove_pbars
31
+ from deepeval.config.settings import get_settings
32
+
33
+
34
+ logger = logging.getLogger(__name__)
27
35
 
28
36
  # Monkey patch shutil.rmtree to handle locked files better on Windows
29
37
  original_rmtree = shutil.rmtree
@@ -122,6 +130,10 @@ class ContextGenerator:
122
130
  self.context_number = 0
123
131
  self.pbar_filling_contexts_ids = []
124
132
 
133
+ self.max_concurrency = int(
134
+ get_settings().DEEPEVAL_MAX_CONCURRENT_DOC_PROCESSING
135
+ )
136
+
125
137
  #########################################################
126
138
  ### Generate Contexts ###################################
127
139
  #########################################################
@@ -135,18 +147,23 @@ class ContextGenerator:
135
147
  progress: Optional[Progress] = None,
136
148
  pbar_id: Optional[int] = None,
137
149
  ) -> Tuple[List[List[str]], List[str], List[float]]:
138
- from chromadb.api.models.Collection import Collection
139
-
140
- vector_db_path = ".vector_db"
141
- if os.path.exists(vector_db_path):
142
- shutil.rmtree(vector_db_path)
150
+ # one temp root and one client for the whole run
151
+ temp_root = tempfile.mkdtemp(prefix="deepeval_chroma_")
152
+ chroma = get_chromadb()
153
+ from chromadb.config import Settings as ChromaSettings
154
+
155
+ client = chroma.PersistentClient(
156
+ path=temp_root,
157
+ settings=ChromaSettings(anonymized_telemetry=False),
158
+ )
143
159
 
144
160
  try:
145
- # Initialize lists for scores, contexts, and source files
146
- scores = []
147
- contexts = []
148
- source_files = []
161
+ # accumulators
162
+ scores: List[float] = []
163
+ contexts: List[List[str]] = []
164
+ source_files: List[str] = []
149
165
 
166
+ # progress bars
150
167
  pbar_load_docs_id = add_pbar(
151
168
  progress,
152
169
  f"\t📚 Loading {len(self.document_paths)} documents",
@@ -166,90 +183,107 @@ class ContextGenerator:
166
183
  self.pbar_chunk_docs_id = pbar_chunk_docs_id
167
184
  self.pbar_generate_contexts_id = pbar_generate_contexts_id
168
185
 
169
- # Load the source files and create document chunkers for each file
186
+ # load docs
170
187
  source_file_to_chunker_map: Dict[str, DocumentChunker] = (
171
188
  self._load_docs(progress, pbar_load_docs_id)
172
189
  )
173
190
  update_pbar(progress, pbar_id, remove=False)
174
191
 
175
- # Chunk each file into a chroma collection of chunks
176
- source_files_to_chunk_collections_map: Dict[str, Collection] = {}
177
- for key, chunker in source_file_to_chunker_map.items():
178
- collection = chunker.chunk_doc(
179
- self.chunk_size, self.chunk_overlap
180
- )
181
- self.validate_chunk_size(
182
- min_contexts_per_source_file, collection
183
- )
184
- source_files_to_chunk_collections_map[key] = collection
185
- update_pbar(progress, pbar_chunk_docs_id, remove=False)
186
- update_pbar(progress, pbar_id, remove=False)
192
+ # process each doc end-to-end (sync), with per-doc error logging
193
+ for path, chunker in source_file_to_chunker_map.items():
194
+ collection = None
195
+ try:
196
+ # chunk this doc into its own collection on the shared client
197
+ collection = chunker.chunk_doc(
198
+ self.chunk_size,
199
+ self.chunk_overlap,
200
+ client=client,
201
+ )
202
+ collection_count = collection.count()
187
203
 
188
- # Initialize progress bar for context generation
189
- num_contexts = sum(
190
- min(max_contexts_per_source_file, collection.count())
191
- for _, collection in source_files_to_chunk_collections_map.items()
192
- )
193
- self.total_chunks = sum(
194
- collection.count()
195
- for _, collection in source_files_to_chunk_collections_map.items()
196
- )
204
+ self.validate_chunk_size(
205
+ min_contexts_per_source_file, collection
206
+ )
207
+ update_pbar(progress, pbar_chunk_docs_id, remove=False)
197
208
 
198
- # Update progress bar total length for context generation after determining number of contexts
199
- if progress and pbar_generate_contexts_id:
200
- progress.update(
201
- pbar_generate_contexts_id,
202
- total=(self.max_retries + max_context_size - 1)
203
- * num_contexts,
204
- completed=0,
205
- )
209
+ # ensure we can generate at least the minimum context size
210
+ self.validate_context_size(
211
+ min_context_size, path, collection
212
+ )
206
213
 
207
- # Generate contexts for each source file
208
- for (
209
- path,
210
- collection,
211
- ) in source_files_to_chunk_collections_map.items():
212
- self.validate_context_size(min_context_size, path, collection)
213
- max_context_size = min(max_context_size, collection.count())
214
- num_context_per_source_file = min(
215
- max_contexts_per_source_file, collection.count()
216
- )
217
- contexts_per_source_file, scores_per_source_file = (
218
- self._generate_contexts_per_source_file(
219
- path=path,
220
- n_contexts_per_source_file=num_context_per_source_file,
221
- context_size=max_context_size,
222
- similarity_threshold=self.similarity_threshold,
223
- source_files_to_collections_map=source_files_to_chunk_collections_map,
224
- progress=progress,
225
- pbar_generate_contexts_id=pbar_generate_contexts_id,
214
+ # generate contexts for this doc using a map
215
+ single_map = {path: collection}
216
+ self.total_chunks += collection_count
217
+ max_sz_for_doc = min(max_context_size, collection_count)
218
+ n_ctx_for_doc = min(
219
+ max_contexts_per_source_file, collection_count
226
220
  )
227
- )
228
- contexts.extend(contexts_per_source_file)
229
- scores.extend(scores_per_source_file)
230
- source_files.extend([path] * len(contexts_per_source_file))
221
+
222
+ if progress and pbar_generate_contexts_id:
223
+ # keep simple; adjust total as we learn per-doc work
224
+ progress.update(
225
+ pbar_generate_contexts_id,
226
+ total=progress.tasks[
227
+ pbar_generate_contexts_id
228
+ ].total
229
+ + (self.max_retries + max_sz_for_doc - 1)
230
+ * n_ctx_for_doc,
231
+ )
232
+
233
+ # fill contexts for that doc
234
+ ctxs_for_doc, scores_for_doc = (
235
+ self._generate_contexts_per_source_file(
236
+ path=path,
237
+ n_contexts_per_source_file=n_ctx_for_doc,
238
+ context_size=max_sz_for_doc,
239
+ similarity_threshold=self.similarity_threshold,
240
+ source_files_to_collections_map=single_map,
241
+ progress=progress,
242
+ pbar_generate_contexts_id=pbar_generate_contexts_id,
243
+ )
244
+ )
245
+
246
+ contexts.extend(ctxs_for_doc)
247
+ scores.extend(scores_for_doc)
248
+ source_files.extend([path] * len(ctxs_for_doc))
249
+
250
+ except Exception as exc:
251
+ # record and continue with other docs
252
+ logger.exception(
253
+ "Document pipeline failed for %s", path, exc_info=exc
254
+ )
255
+ finally:
256
+ # drop the collection asap to avoid too many open collections
257
+ try:
258
+ if collection is not None:
259
+ client.delete_collection(
260
+ name=collection.name
261
+ ) # if supported
262
+ except Exception:
263
+ pass
264
+
265
+ # finalize progress bars
266
+ update_pbar(progress, pbar_id, remove=False)
231
267
  update_pbar(
232
268
  progress,
233
269
  pbar_generate_contexts_id,
234
270
  advance_to_end=True,
235
271
  remove=False,
236
272
  )
237
- update_pbar(progress, pbar_id, remove=False)
238
273
  remove_pbars(progress, self.pbar_filling_contexts_ids)
239
274
 
240
275
  if self.not_enough_chunks:
241
276
  print_synthesizer_status(
242
277
  SynthesizerStatus.WARNING,
243
278
  "Filtering not applied",
244
- f"Nnot enough chunks in smallest document",
279
+ "Not enough chunks in smallest document",
245
280
  )
246
281
 
247
282
  return contexts, source_files, scores
248
283
 
249
284
  finally:
250
- # Always delete the .vector_db folder if it exists, regardless of success or failure
251
- if os.path.exists(vector_db_path):
252
- shutil.rmtree(vector_db_path)
285
+ if os.path.exists(temp_root):
286
+ shutil.rmtree(temp_root)
253
287
 
254
288
  async def a_generate_contexts(
255
289
  self,
@@ -260,17 +294,21 @@ class ContextGenerator:
260
294
  progress: Optional[Progress] = None,
261
295
  pbar_id: Optional[int] = None,
262
296
  ) -> Tuple[List[List[str]], List[str], List[float]]:
263
- from chromadb.api.models.Collection import Collection
264
297
 
265
- vector_db_path = ".vector_db"
266
- if os.path.exists(vector_db_path):
267
- shutil.rmtree(vector_db_path)
298
+ temp_root = tempfile.mkdtemp(prefix="deepeval_chroma_")
299
+ chroma = get_chromadb()
300
+ from chromadb.config import Settings as ChromaSettings
301
+
302
+ client = chroma.PersistentClient(
303
+ path=temp_root,
304
+ settings=ChromaSettings(anonymized_telemetry=False),
305
+ )
268
306
 
269
307
  try:
270
308
  # Initialize lists for scores, contexts, and source files
271
- scores = []
272
- contexts = []
273
- source_files = []
309
+ scores: List[float] = []
310
+ contexts: List[List[str]] = []
311
+ source_files: List[str] = []
274
312
 
275
313
  # Check if chunk_size and max_context_size is valid for document lengths
276
314
  pbar_load_docs_id = add_pbar(
@@ -297,85 +335,98 @@ class ContextGenerator:
297
335
  )
298
336
  update_pbar(progress, pbar_id, remove=False)
299
337
 
300
- # Chunk each file into a chroma collection of chunks
301
- async def a_chunk_and_store(
302
- key,
303
- chunker: DocumentChunker,
304
- progress: Optional[Progress] = None,
305
- pbar_chunk_docs_id: Optional[int] = None,
306
- ):
307
- collection = await chunker.a_chunk_doc(
308
- self.chunk_size, self.chunk_overlap
309
- )
310
- self.validate_chunk_size(
311
- min_contexts_per_source_file, collection
312
- )
313
- source_files_to_chunk_collections_map[key] = collection
314
- update_pbar(progress, pbar_chunk_docs_id, remove=False)
315
-
316
- source_files_to_chunk_collections_map: Dict[str, Collection] = {}
317
- tasks = [
318
- a_chunk_and_store(key, chunker, progress, pbar_chunk_docs_id)
319
- for key, chunker in source_file_to_chunker_map.items()
320
- ]
321
- await asyncio.gather(*tasks)
322
- update_pbar(progress, pbar_id, remove=False)
323
-
324
- # Initialize progress bar for context generation
325
- num_contexts = sum(
326
- min(max_contexts_per_source_file, collection.count())
327
- for _, collection in source_files_to_chunk_collections_map.items()
328
- )
329
- self.total_chunks = sum(
330
- collection.count()
331
- for _, collection in source_files_to_chunk_collections_map.items()
332
- )
333
-
334
- # Update progress bar total length for context generation after determining number of contexts
335
- if progress and pbar_generate_contexts_id:
336
- progress.update(
337
- pbar_generate_contexts_id,
338
- total=(self.max_retries + max_context_size - 1)
339
- * num_contexts,
340
- completed=0,
341
- )
342
-
343
- # Generate contexts for each source file
344
- tasks = []
345
- for (
346
- path,
347
- collection,
348
- ) in source_files_to_chunk_collections_map.items():
349
- self.validate_context_size(min_context_size, path, collection)
350
- max_context_size = min(max_context_size, collection.count())
351
- n_contexts_per_source_file = min(
352
- max_contexts_per_source_file, collection.count()
353
- )
354
- tasks.append(
355
- self._a_process_document_async(
356
- path=path,
357
- num_context_per_source_file=n_contexts_per_source_file,
358
- max_context_size=max_context_size,
359
- source_files_to_collections_map=source_files_to_chunk_collections_map,
360
- progress=progress,
361
- pbar_generate_contexts_id=pbar_generate_contexts_id,
338
+ # stream each doc end-to-end on the shared client, with bounded concurrency
339
+ semaphore = asyncio.Semaphore(self.max_concurrency)
340
+
341
+ async def pipeline(path: str, chunker: DocumentChunker):
342
+ collection = None
343
+ async with semaphore: # bound the whole pipeline
344
+ try:
345
+ # chunk this doc into its own collection on the shared client
346
+ collection = await chunker.a_chunk_doc(
347
+ self.chunk_size,
348
+ self.chunk_overlap,
349
+ client=client,
350
+ )
351
+ collection_count = collection.count()
352
+
353
+ self.validate_chunk_size(
354
+ min_contexts_per_source_file, collection
355
+ )
356
+ update_pbar(progress, pbar_chunk_docs_id, remove=False)
357
+
358
+ # ensure we can generate at least the minimum context size
359
+ self.validate_context_size(
360
+ min_context_size, path, collection
361
+ )
362
+
363
+ # generate contexts for this doc using a map
364
+ single_map = {path: collection}
365
+ self.total_chunks += collection_count
366
+ max_sz_for_doc = min(max_context_size, collection_count)
367
+ n_ctx_for_doc = min(
368
+ max_contexts_per_source_file, collection_count
369
+ )
370
+
371
+ if progress and pbar_generate_contexts_id:
372
+ progress.update(
373
+ pbar_generate_contexts_id,
374
+ total=progress.tasks[
375
+ pbar_generate_contexts_id
376
+ ].total
377
+ + (self.max_retries + max_sz_for_doc - 1)
378
+ * n_ctx_for_doc,
379
+ )
380
+
381
+ # fill contexts for that doc
382
+ _, contexts_for_doc, scores_per_doc = (
383
+ await self._a_process_document_async(
384
+ path=path,
385
+ num_context_per_source_file=n_ctx_for_doc,
386
+ max_context_size=max_sz_for_doc,
387
+ source_files_to_collections_map=single_map,
388
+ progress=progress,
389
+ pbar_generate_contexts_id=pbar_generate_contexts_id,
390
+ )
391
+ )
392
+ return contexts_for_doc, scores_per_doc
393
+ finally:
394
+ # drop the collection asap to avoid too many open collections
395
+ try:
396
+ if collection is not None:
397
+ client.delete_collection(name=collection.name)
398
+ except Exception:
399
+ pass
400
+
401
+ # kick off bounded pipelines
402
+ paths = list(source_file_to_chunker_map.keys())
403
+ tasks = [pipeline(p, source_file_to_chunker_map[p]) for p in paths]
404
+ results = await asyncio.gather(*tasks, return_exceptions=True)
405
+
406
+ # Collect results, surface any errors after cleanup
407
+ for path, res in zip(paths, results):
408
+ if isinstance(res, Exception):
409
+ logger.error(
410
+ "Document pipeline failed for %s",
411
+ path,
412
+ exc_info=(type(res), res, res.__traceback__),
362
413
  )
414
+ continue
415
+ contexts_for_doc, scores_per_doc = (
416
+ res # see pipeline return below
363
417
  )
418
+ contexts.extend(contexts_for_doc)
419
+ scores.extend(scores_per_doc)
420
+ source_files.extend([path] * len(contexts_for_doc))
364
421
 
365
- results = await asyncio.gather(*tasks)
422
+ update_pbar(progress, pbar_id, remove=False)
366
423
  update_pbar(
367
424
  progress,
368
425
  pbar_generate_contexts_id,
369
426
  advance_to_end=True,
370
427
  remove=False,
371
428
  )
372
- update_pbar(progress, pbar_id, remove=False)
373
429
  remove_pbars(progress, self.pbar_filling_contexts_ids)
374
- for path, contexts_per_doc, scores_per_doc in results:
375
- contexts.extend(contexts_per_doc)
376
- scores.extend(scores_per_doc)
377
- for _ in contexts_per_doc:
378
- source_files.append(path)
379
430
 
380
431
  if self.not_enough_chunks:
381
432
  print_synthesizer_status(
@@ -387,8 +438,8 @@ class ContextGenerator:
387
438
  return contexts, source_files, scores
388
439
 
389
440
  finally:
390
- if os.path.exists(vector_db_path):
391
- shutil.rmtree(vector_db_path)
441
+ if os.path.exists(temp_root):
442
+ shutil.rmtree(temp_root)
392
443
 
393
444
  async def _a_process_document_async(
394
445
  self,
@@ -458,6 +509,7 @@ class ContextGenerator:
458
509
  f"\t\t🔋 Filling context #{self.context_number}",
459
510
  (context_size - 1),
460
511
  )
512
+
461
513
  self.pbar_filling_contexts_ids.append(pbar_filling_contexts_id)
462
514
  self.context_number += 1
463
515
  context = [random_chunk]
@@ -914,21 +966,26 @@ class ContextGenerator:
914
966
  progress: Optional[Progress] = None,
915
967
  pbar_load_docs_id: Optional[int] = None,
916
968
  ):
917
- doc_to_chunker_map = {}
969
+ doc_to_chunker_map: Dict[str, DocumentChunker] = {}
970
+
971
+ semaphore = asyncio.Semaphore(self.max_concurrency)
918
972
 
919
973
  async def a_process_document(
920
974
  path: str,
921
975
  progress: Optional[Progress] = None,
922
976
  pbar_load_docs_id: Optional[int] = None,
923
977
  ):
924
- doc_chunker = DocumentChunker(self.embedder)
925
- await doc_chunker.a_load_doc(path, self.encoding)
926
- doc_to_chunker_map[path] = doc_chunker
927
- update_pbar(progress, pbar_load_docs_id, remove=False)
978
+ async with semaphore:
979
+ doc_chunker = DocumentChunker(self.embedder)
980
+ await doc_chunker.a_load_doc(path, self.encoding)
981
+ doc_to_chunker_map[path] = doc_chunker
982
+ update_pbar(progress, pbar_load_docs_id, remove=False)
928
983
 
929
984
  tasks = [
930
985
  a_process_document(path, progress, pbar_load_docs_id)
931
986
  for path in self.document_paths
932
987
  ]
988
+
933
989
  await asyncio.gather(*tasks)
990
+
934
991
  return doc_to_chunker_map
@@ -1,6 +1,6 @@
1
1
  import os
2
2
 
3
- from typing import Dict, List, Optional, Type, TYPE_CHECKING
3
+ from typing import Any, Dict, List, Optional, Type, TYPE_CHECKING
4
4
  from types import SimpleNamespace
5
5
 
6
6
  from deepeval.models.base_model import DeepEvalBaseEmbeddingModel
@@ -53,7 +53,7 @@ def _get_langchain():
53
53
  )
54
54
 
55
55
 
56
- def _get_chromadb():
56
+ def get_chromadb():
57
57
  """Return the chromadb module, or raise ImportError with root cause."""
58
58
  global _chroma_mod, _chroma_import_error
59
59
  if _chroma_mod is not None:
@@ -91,10 +91,16 @@ class DocumentChunker:
91
91
  #########################################################
92
92
 
93
93
  async def a_chunk_doc(
94
- self, chunk_size: int = 1024, chunk_overlap: int = 0
94
+ self,
95
+ chunk_size: int = 1024,
96
+ chunk_overlap: int = 0,
97
+ client: Optional[Any] = None,
98
+ collection_name: Optional[str] = None,
95
99
  ) -> "Collection":
96
100
  lc = _get_langchain()
97
- chroma = _get_chromadb()
101
+ chroma = get_chromadb()
102
+
103
+ from chromadb.config import Settings as ChromaSettings
98
104
 
99
105
  # Raise error if chunk_doc is called before load_doc
100
106
  if self.sections is None or self.source_file is None:
@@ -102,12 +108,22 @@ class DocumentChunker:
102
108
  "Document Chunker has yet to properly load documents"
103
109
  )
104
110
 
105
- # Create ChromaDB client
111
+ # Determine client and collection_name
106
112
  full_document_path, _ = os.path.splitext(self.source_file)
107
113
  document_name = os.path.basename(full_document_path)
108
- client = chroma.PersistentClient(path=f".vector_db/{document_name}")
114
+ if client is None:
115
+ client = chroma.PersistentClient(
116
+ path=f".vector_db/{document_name}",
117
+ settings=ChromaSettings(anonymized_telemetry=True),
118
+ )
119
+ default_coll = f"processed_chunks_{chunk_size}_{chunk_overlap}"
120
+ else:
121
+ # namespace by doc to support sharing a single client across many docs
122
+ default_coll = (
123
+ f"{document_name}_processed_chunks_{chunk_size}_{chunk_overlap}"
124
+ )
125
+ collection_name = collection_name or default_coll
109
126
 
110
- collection_name = f"processed_chunks_{chunk_size}_{chunk_overlap}"
111
127
  try:
112
128
  collection = client.get_collection(name=collection_name)
113
129
  except Exception:
@@ -140,9 +156,17 @@ class DocumentChunker:
140
156
  )
141
157
  return collection
142
158
 
143
- def chunk_doc(self, chunk_size: int = 1024, chunk_overlap: int = 0):
159
+ def chunk_doc(
160
+ self,
161
+ chunk_size: int = 1024,
162
+ chunk_overlap: int = 0,
163
+ client: Optional[Any] = None,
164
+ collection_name: Optional[str] = None,
165
+ ):
144
166
  lc = _get_langchain()
145
- chroma = _get_chromadb()
167
+ chroma = get_chromadb()
168
+
169
+ from chromadb.config import Settings as ChromaSettings
146
170
 
147
171
  # Raise error if chunk_doc is called before load_doc
148
172
  if self.sections is None or self.source_file is None:
@@ -150,12 +174,22 @@ class DocumentChunker:
150
174
  "Document Chunker has yet to properly load documents"
151
175
  )
152
176
 
153
- # Create ChromaDB client
177
+ # Determine client and collection_name
154
178
  full_document_path, _ = os.path.splitext(self.source_file)
155
179
  document_name = os.path.basename(full_document_path)
156
- client = chroma.PersistentClient(path=f".vector_db/{document_name}")
180
+ if client is None:
181
+ client = chroma.PersistentClient(
182
+ path=f".vector_db/{document_name}",
183
+ settings=ChromaSettings(anonymized_telemetry=True),
184
+ )
185
+ default_coll = f"processed_chunks_{chunk_size}_{chunk_overlap}"
186
+ else:
187
+ # namespace by doc to support sharing a single client across many docs
188
+ default_coll = (
189
+ f"{document_name}_processed_chunks_{chunk_size}_{chunk_overlap}"
190
+ )
191
+ collection_name = collection_name or default_coll
157
192
 
158
- collection_name = f"processed_chunks_{chunk_size}_{chunk_overlap}"
159
193
  try:
160
194
  collection = client.get_collection(name=collection_name)
161
195
  except Exception:
@@ -898,6 +898,7 @@ class Synthesizer:
898
898
  update_pbar(progress, pbar_id)
899
899
 
900
900
  # Evolve inputs
901
+ evolved_prompts = []
901
902
  for i, data in enumerate(synthetic_data):
902
903
  pbar_evolve_input_id = add_pbar(
903
904
  progress,
@@ -911,14 +912,16 @@ class Synthesizer:
911
912
  progress=progress,
912
913
  pbar_evolve_input_id=pbar_evolve_input_id,
913
914
  )
915
+ evolved_prompts.append(evolved_prompt)
914
916
  update_pbar(progress, pbar_id)
915
917
 
916
918
  # Synthesize Goldens
917
- golden = Golden(
918
- input=evolved_prompt,
919
- additional_metadata={"evolutions": evolutions_used},
920
- )
921
- goldens.append(golden)
919
+ for evolved_prompt in evolved_prompts:
920
+ golden = Golden(
921
+ input=evolved_prompt,
922
+ additional_metadata={"evolutions": evolutions_used},
923
+ )
924
+ goldens.append(golden)
922
925
 
923
926
  # Wrap up Synthesis
924
927
  self.synthetic_goldens.extend(goldens)