deepeval 3.7.0__py3-none-any.whl → 3.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepeval/__init__.py +0 -4
- deepeval/_version.py +1 -1
- deepeval/cli/main.py +7 -0
- deepeval/confident/api.py +6 -1
- deepeval/config/settings.py +5 -0
- deepeval/evaluate/compare.py +219 -4
- deepeval/evaluate/types.py +6 -0
- deepeval/evaluate/utils.py +30 -0
- deepeval/key_handler.py +1 -0
- deepeval/metrics/arena_g_eval/arena_g_eval.py +5 -1
- deepeval/metrics/arena_g_eval/utils.py +5 -5
- deepeval/metrics/conversational_g_eval/conversational_g_eval.py +9 -18
- deepeval/metrics/g_eval/g_eval.py +5 -1
- deepeval/metrics/g_eval/utils.py +1 -1
- deepeval/metrics/multimodal_metrics/multimodal_g_eval/multimodal_g_eval.py +5 -1
- deepeval/metrics/utils.py +1 -1
- deepeval/models/llms/gemini_model.py +27 -5
- deepeval/openai_agents/callback_handler.py +12 -3
- deepeval/prompt/prompt.py +25 -14
- deepeval/simulator/template.py +1 -1
- deepeval/synthesizer/config.py +9 -0
- deepeval/synthesizer/schema.py +23 -0
- deepeval/synthesizer/synthesizer.py +1137 -2
- deepeval/synthesizer/templates/__init__.py +11 -2
- deepeval/synthesizer/templates/template.py +554 -1
- deepeval/synthesizer/templates/template_extraction.py +32 -0
- deepeval/synthesizer/templates/template_prompt.py +262 -0
- deepeval/test_case/__init__.py +2 -1
- deepeval/test_case/arena_test_case.py +15 -4
- deepeval/test_case/mllm_test_case.py +45 -22
- deepeval/test_run/cache.py +31 -10
- deepeval/test_run/hyperparameters.py +5 -1
- deepeval/test_run/test_run.py +28 -9
- deepeval/tracing/tracing.py +1 -1
- deepeval/utils.py +4 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/METADATA +3 -2
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/RECORD +40 -40
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/LICENSE.md +0 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/WHEEL +0 -0
- {deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/entry_points.txt +0 -0
@@ -37,6 +37,69 @@ class PromptSynthesizerTemplate:
         JSON:
         """

+    @staticmethod
+    def generate_synthetic_conversational_scenarios(
+        scenario: str,
+        conversational_task: str,
+        participant_roles: str,
+        num_goldens: int,
+    ):
+        return f"""
+        Generate a series of conversational SCENARIOS from scratch based on the provided scenario description,
+        conversational task, and participant roles.
+
+        A SCENARIO is a narrative description of a situation in which a conversation naturally occurs.
+        It is NOT a question, NOT a prompt, and NOT a user query. It MUST purely describe context.
+
+        Each scenario MUST depict a realistic MULTI-TURN conversational situation involving the given participants.
+
+        **
+        IMPORTANT FORMAT:
+        - Only return JSON
+        - JSON MUST contain: {{ "data": [ {{ "scenario": "..." }}, ... ] }}
+        - You MUST TRY to generate {num_goldens} items
+        **
+
+        Example of GOOD scenarios (situational descriptions):
+        - "During a late afternoon code review session, a junior engineer asks their senior engineer why an async function is inconsistent, leading to a detailed back-and-forth about race conditions."
+        - "While preparing for a sprint demo, a senior engineer helps a junior engineer interpret stack traces, prompting a step-by-step explanation."
+
+        Example of BAD scenarios (DO NOT DO):
+        - "Why does my async function return inconsistent results?" (This is a prompt)
+        - "Explain how to debug race conditions." (Instruction)
+        - "What is the freezing point of water?" (Question)
+
+        CRITICAL REQUIREMENTS:
+        - Scenario MUST be a narrative description of a SITUATION.
+        - Scenario MUST involve these participant roles: {participant_roles}
+        - Scenario MUST align with this conversational task: {conversational_task}
+        - Scenario MUST feel natural, real-world, and MULTI-TURN.
+        - Scenario MUST NOT contain:
+            • direct questions
+            • instructions
+            • tasks
+            • explicit prompts
+            • standalone facts
+        - Scenario MUST be grounded in the meaning of the provided base scenario description.
+
+        You MUST TRY to generate {num_goldens} high-quality, non-repetitive scenarios.
+        **
+
+        Base Scenario Description:
+        {scenario}
+
+        Conversational Task:
+        {conversational_task}
+
+        Participant Roles:
+        {participant_roles}
+
+        Num Scenarios:
+        {num_goldens}
+
+        JSON:
+        """
+

 ######################################################################################################
 ##### Approach similar to https://github.com/nlpxucan/WizardLM/blob/main/Evol_Instruct/depth.py ######
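
For orientation, a minimal sketch of driving the new scenario-generation template. The method signature matches the hunk above; the import path (`deepeval.synthesizer.templates.template_prompt`, the file that gains +262 lines in this release) and the downstream LLM call are assumptions, not shown in this diff.

    # Hedged sketch: builds the scenario-generation prompt added in 3.7.2.
    # Assumption: PromptSynthesizerTemplate is importable from this module path.
    from deepeval.synthesizer.templates.template_prompt import PromptSynthesizerTemplate

    prompt = PromptSynthesizerTemplate.generate_synthetic_conversational_scenarios(
        scenario="A support engineer helps a customer migrate a legacy deployment.",
        conversational_task="Troubleshoot and plan the migration together.",
        participant_roles="support engineer, customer",
        num_goldens=3,
    )

    # The returned string is a plain prompt; send it to any LLM client and parse
    # the {"data": [{"scenario": "..."}]} JSON it instructs the model to return.
    print(prompt[:200])
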
@@ -282,3 +345,202 @@ class PromptEvolutionTemplate:
             Rewritten Input:
             """
         )
+
+
+class ConversationalPromptEvolutionTemplate:
+
+    base_instruction = """I want you to act as a conversational scenario rewriter.
+    Your objective is to rewrite the given `Scenario`. You MUST complicate the `Scenario` using the following method:"""
+
+    @staticmethod
+    def reasoning_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` to force participants into multi-step conversational reasoning.
+            2. Add layered inferences or analytical leaps required in dialogue.
+            3. `Rewritten Scenario` must stay concise, human-readable, and remain a conversation setup.
+            4. Do NOT exceed **15 words**.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two students discuss climate change.
+            Example rewritten scenario:
+            Two students debate climate impacts, tracing cause-effect chains across multiple evidence sources.
+
+            --------------------------
+
+            Example scenario:
+            A doctor explains treatment options.
+            Example rewritten scenario:
+            Doctor and patient reason through symptoms requiring sequential diagnostic logic.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def concretizing_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Replace broad conversation setup with a **more specific, concrete** conversational scene.
+            2. Add real-world detail (location, constraint, specific topic).
+            3. Keep under **15 words**, concise, and still a dialogue setup.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two engineers talk about safety.
+            Example rewritten scenario:
+            Two engineers argue over failing brake-system logs during late-night review.
+
+            --------------------------
+
+            Example scenario:
+            Two friends discuss exercise.
+            Example rewritten scenario:
+            Two friends compare heart-rate sensor issues during a marathon-training chat.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def constrained_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Add at least one new constraint shaping the conversation.
+            2. Constraint must significantly affect the dialogue.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two coworkers plan a report.
+            Example rewritten scenario:
+            Two coworkers plan a report with strict no-internet constraint.
+
+            --------------------------
+
+            Example scenario:
+            A teacher reviews homework.
+            Example rewritten scenario:
+            Teacher and student discuss homework under urgent submission deadline.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def comparative_question_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` so the conversation centers on comparing two+ items.
+            2. Must highlight similarities/differences through dialogue.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two analysts discuss tools.
+            Example rewritten scenario:
+            Two analysts compare legacy analytics pipeline vs. new automated system.
+
+            --------------------------
+
+            Example scenario:
+            Two students study history.
+            Example rewritten scenario:
+            Two students contrast Renaissance ideals with Enlightenment philosophies.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def hypothetical_scenario_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` to introduce a hypothetical twist derived from the setup.
+            2. The hypothetical MUST drive the conversation.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two scientists discuss pollution.
+            Example rewritten scenario:
+            Two scientists debate effects if emissions doubled overnight.
+
+            --------------------------
+
+            Example scenario:
+            A medic trains a recruit.
+            Example rewritten scenario:
+            Medic and recruit plan response to hypothetical antibiotic-resistant outbreak.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
+
+    @staticmethod
+    def in_breadth_evolution(scenario):
+        return (
+            ConversationalPromptEvolutionTemplate.base_instruction
+            + f"""
+            1. Rewrite `Scenario` into a new conversation within the same domain.
+            2. The new conversation must explore a rarer, niche angle.
+            3. Keep under **15 words**, concise, conversational.
+
+            **
+            EXAMPLES
+
+            Example scenario:
+            Two doctors discuss patient care.
+            Example rewritten scenario:
+            Two doctors debate rare autoimmune disorder diagnostics.
+
+            --------------------------
+
+            Example scenario:
+            Two programmers discuss bugs.
+            Example rewritten scenario:
+            Two programmers examine obscure concurrency race-condition failures.
+
+            --------------------------
+
+            Scenario:
+            {scenario}
+            Rewritten Scenario:
+            """
+        )
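
A similar sketch for the new ConversationalPromptEvolutionTemplate: each static method returns `base_instruction` plus its rewrite rules, ending in "Rewritten Scenario:" for a model to complete. The import path is the same assumption as above.

    # Hedged sketch: complicates a generated scenario with one evolution method.
    # Assumption: ConversationalPromptEvolutionTemplate lives in template_prompt.py.
    from deepeval.synthesizer.templates.template_prompt import (
        ConversationalPromptEvolutionTemplate,
    )

    base_scenario = "Two students discuss climate change."

    # Returns the base instruction followed by the reasoning-specific rules;
    # the trailing "Rewritten Scenario:" is left for the LLM to fill in.
    evolution_prompt = ConversationalPromptEvolutionTemplate.reasoning_evolution(
        base_scenario
    )
    print(evolution_prompt)
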
deepeval/test_case/__init__.py
CHANGED

@@ -10,7 +10,7 @@ from .conversational_test_case import (
     TurnParams,
 )
 from .mllm_test_case import MLLMTestCase, MLLMTestCaseParams, MLLMImage
-from .arena_test_case import ArenaTestCase
+from .arena_test_case import ArenaTestCase, Contestant
 from .mcp import (
     MCPServer,
     MCPPromptCall,

@@ -35,4 +35,5 @@ __all__ = [
     "MLLMTestCaseParams",
     "MLLMImage",
     "ArenaTestCase",
+    "Contestant",
 ]
deepeval/test_case/arena_test_case.py
CHANGED

@@ -1,20 +1,31 @@
+from typing import List, Dict, Optional, Union
 from dataclasses import dataclass
-from
+from pydantic import BaseModel
+
 from deepeval.test_case import (
     LLMTestCase,
 )
+from deepeval.prompt import Prompt
+
+
+class Contestant(BaseModel):
+    name: str
+    test_case: LLMTestCase
+    hyperparameters: Optional[Dict[str, Union[str, int, float, Prompt]]] = None
+
+    model_config = {"arbitrary_types_allowed": True}


 @dataclass
 class ArenaTestCase:
-    contestants:
+    contestants: List[Contestant]

     def __post_init__(self):
-        contestant_names =
+        contestant_names = [contestant.name for contestant in self.contestants]
         if len(contestant_names) != len(set(contestant_names)):
             raise ValueError("All contestant names must be unique.")

-        cases =
+        cases = [contestant.test_case for contestant in self.contestants]
         ref_input = cases[0].input
         for case in cases[1:]:
             if case.input != ref_input:
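
Based only on the fields visible in this hunk, a minimal sketch of the reworked arena API; the contestant outputs and hyperparameter values are placeholders.

    # Hedged sketch: constructing the new Contestant-based ArenaTestCase.
    from deepeval.test_case import ArenaTestCase, Contestant, LLMTestCase

    shared_input = "Summarize the quarterly report in two sentences."

    arena_case = ArenaTestCase(
        contestants=[
            Contestant(
                name="variant-a",  # names must be unique across contestants
                test_case=LLMTestCase(input=shared_input, actual_output="Summary A..."),
                hyperparameters={"temperature": 0.2},
            ),
            Contestant(
                name="variant-b",
                test_case=LLMTestCase(input=shared_input, actual_output="Summary B..."),
            ),
        ]
    )
    # __post_init__ enforces unique contestant names and identical inputs.
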
deepeval/test_case/mllm_test_case.py
CHANGED

@@ -11,33 +11,50 @@ from deepeval.test_case import ToolCall

 @dataclass
 class MLLMImage:
-
+    dataBase64: Optional[str] = None
+    mimeType: Optional[str] = None
+    url: Optional[str] = None
     local: Optional[bool] = None
-    filename: Optional[str] =
-    mimeType: Optional[str] = field(default=None, init=False, repr=False)
-    dataBase64: Optional[str] = field(default=None, init=False, repr=False)
+    filename: Optional[str] = None

     def __post_init__(self):
-
-        if self.
-
-
-
-
-
-
-
-        self.filename = os.path.basename(path)
-        self.mimeType = (
-            mimetypes.guess_type(path)[0] or "application/octet-stream"
+
+        if self.url and self.dataBase64:
+            raise ValueError(
+                "You cannot provide both 'url' and 'dataBase64' at the same time when creating an MLLMImage."
+            )
+
+        if not self.url and not self.dataBase64:
+            raise ValueError(
+                "You must provide either a 'url' or both 'dataBase64' and 'mimeType' to create an MLLMImage."
             )
-
-
-        self.
+
+        if self.dataBase64 is not None:
+            if self.mimeType is None:
+                raise ValueError(
+                    "mimeType must be provided when initializing from Base64 data."
+                )
         else:
-
-        self.
-
+            is_local = self.is_local_path(self.url)
+            if self.local is not None:
+                assert self.local == is_local, "Local path mismatch"
+            else:
+                self.local = is_local
+
+            # compute filename, mime_type, and Base64 data
+            if self.local:
+                path = self.process_url(self.url)
+                self.filename = os.path.basename(path)
+                self.mimeType = (
+                    mimetypes.guess_type(path)[0] or "application/octet-stream"
+                )
+                with open(path, "rb") as f:
+                    raw = f.read()
+                self.dataBase64 = base64.b64encode(raw).decode("ascii")
+            else:
+                self.filename = None
+                self.mimeType = None
+                self.dataBase64 = None

     @staticmethod
     def process_url(url: str) -> str:

@@ -69,6 +86,12 @@ class MLLMImage:
             return os.path.exists(path)
         return False

+    def as_data_uri(self) -> Optional[str]:
+        """Return the image as a data URI string, if Base64 data is available."""
+        if not self.dataBase64 or not self.mimeType:
+            return None
+        return f"data:{self.mimeType};base64,{self.dataBase64}"
+

 class MLLMTestCaseParams(Enum):
     INPUT = "input"
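
A small sketch of the two construction paths MLLMImage now supports and the new as_data_uri() helper; the URL and Base64 payload below are placeholders.

    # Hedged sketch: the two ways to build an MLLMImage after this change.
    from deepeval.test_case import MLLMImage

    # 1) From a URL: for remote URLs, filename/mimeType/dataBase64 stay None;
    #    for local paths they are derived and the file is read in __post_init__.
    remote_image = MLLMImage(url="https://example.com/dashboard.png")  # placeholder URL

    # 2) From raw Base64 data: mimeType is now required alongside dataBase64.
    inline_image = MLLMImage(dataBase64="iVBORw0KGgo...", mimeType="image/png")

    # New helper: returns "data:<mimeType>;base64,<data>", or None when no data.
    print(inline_image.as_data_uri()[:30])
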
deepeval/test_run/cache.py
CHANGED

@@ -1,8 +1,8 @@
-import
+import logging
 import sys
 import json
 import os
-from typing import List, Optional,
+from typing import List, Optional, Dict, Union
 from enum import Enum
 from pydantic import BaseModel, Field

@@ -12,11 +12,26 @@ from deepeval.test_case import LLMTestCaseParams, LLMTestCase, ToolCallParams
 from deepeval.test_run.api import MetricData
 from deepeval.utils import (
     delete_file_if_exists,
+    is_read_only_env,
     serialize,
 )
 from deepeval.metrics import BaseMetric
 from deepeval.constants import HIDDEN_DIR

+
+logger = logging.getLogger(__name__)
+
+
+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        logger.warning("failed to import portalocker: %s", e)
+else:
+    logger.warning("READ_ONLY filesystem: skipping disk cache for test runs.")
+
+
 CACHE_FILE_NAME = f"{HIDDEN_DIR}/.deepeval-cache.json"
 TEMP_CACHE_FILE_NAME = f"{HIDDEN_DIR}/.temp-deepeval-cache.json"

@@ -97,7 +112,7 @@ class TestRunCacheManager:
     def get_cached_test_case(
         self, test_case: LLMTestCase, hyperparameters: Union[Dict, None]
     ) -> Union[CachedTestCase, None]:
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return None

         cached_test_run = self.get_cached_test_run()

@@ -122,7 +137,7 @@ class TestRunCacheManager:
         hyperparameters: Union[Dict, None],
         to_temp: bool = False,
     ):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return
         cache_dict = {
             LLMTestCaseParams.INPUT.value: test_case.input,

@@ -142,7 +157,7 @@ class TestRunCacheManager:
     def set_cached_test_run(
         self, cached_test_run: CachedTestRun, temp: bool = False
     ):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         if temp:

@@ -151,7 +166,7 @@ class TestRunCacheManager:
         self.cached_test_run = cached_test_run

     def save_cached_test_run(self, to_temp: bool = False):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         if to_temp:

@@ -178,7 +193,7 @@ class TestRunCacheManager:
         )

     def create_cached_test_run(self, temp: bool = False):
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         cached_test_run = CachedTestRun()

@@ -188,7 +203,7 @@ class TestRunCacheManager:
     def get_cached_test_run(
         self, from_temp: bool = False
     ) -> Union[CachedTestRun, None]:
-        if self.disable_write_cache:
+        if self.disable_write_cache or portalocker is None:
             return

         should_create_cached_test_run = False

@@ -209,7 +224,7 @@ class TestRunCacheManager:
                 try:
                     data = json.loads(content)
                     self.temp_cached_test_run = CachedTestRun.load(data)
-                except Exception
+                except Exception:
                     should_create_cached_test_run = True
             except portalocker.exceptions.LockException as e:
                 print(

@@ -217,6 +232,9 @@ class TestRunCacheManager:
                     file=sys.stderr,
                 )

+            if should_create_cached_test_run:
+                self.create_cached_test_run(temp=from_temp)
+
             return self.temp_cached_test_run
         else:
             if self.cached_test_run:

@@ -250,6 +268,9 @@ class TestRunCacheManager:
         return self.cached_test_run

     def wrap_up_cached_test_run(self):
+        if portalocker is None:
+            return
+
         if self.disable_write_cache:
             # Clear cache if write cache is disabled
             delete_file_if_exists(self.cache_file_name)

@@ -330,7 +351,7 @@ class Cache:
                     if criteria_value != cached_criteria_value:
                         return False
                     continue
-                except:
+                except Exception:
                     # For non-GEval
                     continue
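
The same guarded optional-import pattern appears in both cache.py (above) and test_run.py (further below): portalocker is only imported when the environment is not read-only, and every disk path short-circuits when it is missing. A standalone restatement of the pattern, with `read_only()` and `save_cache()` as illustrative stand-ins rather than deepeval API:

    # Hedged sketch of the guarded optional-import pattern; names are illustrative.
    import logging
    import os

    logger = logging.getLogger(__name__)

    def read_only() -> bool:
        # Illustrative stand-in for deepeval.utils.is_read_only_env().
        return os.environ.get("APP_READ_ONLY", "0") == "1"

    portalocker = None
    if not read_only():
        try:
            import portalocker  # file-locking dependency, may be absent
        except Exception as exc:
            logger.warning("failed to import portalocker: %s", exc)
    else:
        logger.warning("read-only filesystem: skipping disk cache.")

    def save_cache(payload: str) -> None:
        # Every write path bails out when the lock library is unavailable.
        if portalocker is None:
            return
        with portalocker.Lock(".cache.json", mode="w") as fh:
            fh.write(payload)
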
deepeval/test_run/hyperparameters.py
CHANGED

@@ -33,7 +33,11 @@ def process_hyperparameters(
         )

         if isinstance(value, Prompt):
-
+            try:
+                prompt_key = f"{value.alias}_{value.version}"
+            except AttributeError:
+                prompt_key = f"{value.alias}_00.00.01"
+
             if value._prompt_version_id is not None and value.type is not None:
                 processed_hyperparameters[key] = PromptApi(
                     id=value._prompt_version_id,
deepeval/test_run/test_run.py
CHANGED

@@ -6,11 +6,11 @@ from typing import Any, Optional, List, Dict, Union, Tuple
 import shutil
 import sys
 import datetime
-import portalocker
 from rich.table import Table
 from rich.console import Console
 from rich import print

+
 from deepeval.metrics import BaseMetric
 from deepeval.confident.api import Api, Endpoints, HttpMethods, is_confident
 from deepeval.test_run.api import (

@@ -25,6 +25,7 @@ from deepeval.test_case import LLMTestCase, ConversationalTestCase, MLLMTestCase
 from deepeval.utils import (
     delete_file_if_exists,
     get_is_running_deepeval,
+    is_read_only_env,
     open_browser,
     shorten,
     format_turn,

@@ -42,6 +43,21 @@ from rich.panel import Panel
 from rich.columns import Columns


+portalocker = None
+if not is_read_only_env():
+    try:
+        import portalocker
+    except Exception as e:
+        print(
+            f"Warning: failed to import portalocker: {e}",
+            file=sys.stderr,
+        )
+else:
+    print(
+        "Warning: DeepEval is configured for read only environment. Test runs will not be written to disk."
+    )
+
+
 TEMP_FILE_PATH = f"{HIDDEN_DIR}/.temp_test_run_data.json"
 LATEST_TEST_RUN_FILE_PATH = f"{HIDDEN_DIR}/.latest_test_run.json"
 LATEST_TEST_RUN_DATA_KEY = "testRunData"

@@ -456,7 +472,7 @@ class TestRunManager:
         if self.test_run is None:
             self.create_test_run(identifier=identifier)

-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 with portalocker.Lock(
                     self.temp_file_path,

@@ -479,7 +495,7 @@ class TestRunManager:
         return self.test_run

     def save_test_run(self, path: str, save_under_key: Optional[str] = None):
-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 # ensure parent directory exists
                 parent = os.path.dirname(path)

@@ -505,11 +521,14 @@ class TestRunManager:
                 pass

     def save_final_test_run_link(self, link: str):
-
-
-
-
-
+        if portalocker:
+            try:
+                with portalocker.Lock(
+                    LATEST_TEST_RUN_FILE_PATH, mode="w"
+                ) as file:
+                    json.dump({LATEST_TEST_RUN_LINK_KEY: link}, file)
+            except portalocker.exceptions.LockException:
+                pass

     def update_test_run(
         self,

@@ -523,7 +542,7 @@ class TestRunManager:
         ):
             return

-        if self.save_to_disk:
+        if portalocker and self.save_to_disk:
             try:
                 with portalocker.Lock(
                     self.temp_file_path,
deepeval/tracing/tracing.py
CHANGED

deepeval/utils.py
CHANGED
{deepeval-3.7.0.dist-info → deepeval-3.7.2.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: deepeval
-Version: 3.7.0
+Version: 3.7.2
 Summary: The LLM Evaluation Framework
 Home-page: https://github.com/confident-ai/deepeval
 License: Apache-2.0

@@ -32,7 +32,7 @@ Requires-Dist: pyfiglet
 Requires-Dist: pytest
 Requires-Dist: pytest-asyncio
 Requires-Dist: pytest-repeat
-Requires-Dist: pytest-rerunfailures
+Requires-Dist: pytest-rerunfailures
 Requires-Dist: pytest-xdist
 Requires-Dist: python-dotenv (>=1.1.1,<2.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)

@@ -439,6 +439,7 @@ Using `.env.local` or `.env` is optional. If they are missing, DeepEval uses you
 ```bash
 cp .env.example .env.local
 # then edit .env.local (ignored by git)
+```

 <br />
