validmind 2.8.20__py3-none-any.whl → 2.8.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. validmind/__init__.py +3 -0
  2. validmind/__version__.py +1 -1
  3. validmind/ai/utils.py +89 -0
  4. validmind/api_client.py +4 -0
  5. validmind/client.py +3 -0
  6. validmind/experimental/__init__.py +0 -0
  7. validmind/experimental/agents.py +65 -0
  8. validmind/template.py +3 -2
  9. validmind/tests/data_validation/MutualInformation.py +14 -2
  10. validmind/tests/model_validation/ragas/AnswerCorrectness.py +5 -1
  11. validmind/tests/model_validation/ragas/AspectCritic.py +5 -1
  12. validmind/tests/model_validation/ragas/ContextEntityRecall.py +5 -1
  13. validmind/tests/model_validation/ragas/ContextPrecision.py +5 -1
  14. validmind/tests/model_validation/ragas/ContextPrecisionWithoutReference.py +5 -1
  15. validmind/tests/model_validation/ragas/ContextRecall.py +5 -1
  16. validmind/tests/model_validation/ragas/Faithfulness.py +5 -1
  17. validmind/tests/model_validation/ragas/NoiseSensitivity.py +3 -1
  18. validmind/tests/model_validation/ragas/ResponseRelevancy.py +6 -4
  19. validmind/tests/model_validation/ragas/SemanticSimilarity.py +5 -1
  20. validmind/tests/model_validation/ragas/utils.py +4 -24
  21. validmind/tests/model_validation/sklearn/OverfitDiagnosis.py +11 -1
  22. validmind/tests/model_validation/sklearn/WeakspotsDiagnosis.py +13 -0
  23. validmind/tests/prompt_validation/Bias.py +2 -1
  24. validmind/tests/prompt_validation/Clarity.py +2 -1
  25. validmind/tests/prompt_validation/Conciseness.py +2 -1
  26. validmind/tests/prompt_validation/Delimitation.py +2 -1
  27. validmind/tests/prompt_validation/NegativeInstruction.py +2 -1
  28. validmind/tests/prompt_validation/Robustness.py +3 -2
  29. validmind/tests/prompt_validation/Specificity.py +2 -1
  30. validmind/tests/prompt_validation/ai_powered_test.py +18 -17
  31. validmind/vm_models/dataset/dataset.py +64 -27
  32. validmind/vm_models/result/__init__.py +16 -2
  33. validmind/vm_models/result/result.py +127 -14
  34. {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/METADATA +4 -3
  35. {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/RECORD +38 -36
  36. {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/WHEEL +1 -1
  37. {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/LICENSE +0 -0
  38. {validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/entry_points.txt +0 -0

validmind/tests/prompt_validation/NegativeInstruction.py
@@ -52,7 +52,7 @@ Prompt:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def NegativeInstruction(model, min_threshold=7):
+def NegativeInstruction(model, min_threshold=7, judge_llm=None):
     """
     Evaluates and grades the use of affirmative, proactive language over negative instructions in LLM prompts.
 
@@ -101,6 +101,7 @@ def NegativeInstruction(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
    )
    score = get_score(response)
    explanation = get_explanation(response)

validmind/tests/prompt_validation/Robustness.py
@@ -25,7 +25,7 @@ Contradictions, edge cases, typos, bad phrasing, distracting, complex or out-of-
 Be creative and think step-by-step how you would break the prompt.
 Then generate {num_tests} inputs for the user-submitted prompt template that would break the prompt.
 Each input should be different from the others.
-Each input should be retured as a new line in your response.
+Each input should be returned as a new line in your response.
 Respond only with the values to be inserted into the prompt template and do not include quotes, explanations or any extra text.
 
 Example:
@@ -56,7 +56,7 @@ Input:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Robustness(model, dataset, num_tests=10):
+def Robustness(model, dataset, num_tests=10, judge_llm=None):
     """
     Assesses the robustness of prompts provided to a Large Language Model under varying conditions and contexts. This test
     specifically measures the model's ability to generate correct classifications with the given prompt even when the
@@ -112,6 +112,7 @@ def Robustness(model, dataset, num_tests=10):
     generated_inputs = call_model(
         system_prompt=SYSTEM.format(num_tests=num_tests),
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
    ).split("\n")
 
    responses = model.predict(

validmind/tests/prompt_validation/Specificity.py
@@ -52,7 +52,7 @@ Prompt:
 
 @tags("llm", "zero_shot", "few_shot")
 @tasks("text_classification", "text_summarization")
-def Specificity(model, min_threshold=7):
+def Specificity(model, min_threshold=7, judge_llm=None):
     """
     Evaluates and scores the specificity of prompts provided to a Large Language Model (LLM), based on clarity, detail,
     and relevance.
@@ -97,6 +97,7 @@ def Specificity(model, min_threshold=7):
     response = call_model(
         system_prompt=SYSTEM,
         user_prompt=USER.format(prompt_to_test=model.prompt.template),
+        judge_llm=judge_llm,
    )
    score = get_score(response)
    explanation = get_explanation(response)

validmind/tests/prompt_validation/ai_powered_test.py
@@ -4,7 +4,7 @@
 
 import re
 
-from validmind.ai.utils import get_client_and_model, is_configured
+from validmind.ai.utils import get_judge_config, is_configured
 
 missing_prompt_message = """
 Cannot run prompt validation tests on a model with no prompt.
@@ -21,7 +21,12 @@ my_vm_model = vm.init_model(
 
 
 def call_model(
-    system_prompt: str, user_prompt: str, temperature: float = 0.0, seed: int = 42
+    system_prompt: str,
+    user_prompt: str,
+    temperature: float = 0.0,
+    seed: int = 42,
+    judge_llm=None,
+    judge_embeddings=None,
 ):
     """Call LLM with the given prompts and return the response"""
     if not is_configured():
@@ -31,21 +36,17 @@ def call_model(
             "enabled for your account."
         )
 
-    client, model = get_client_and_model()
-
-    return (
-        client.chat.completions.create(
-            model=model,
-            messages=[
-                {"role": "system", "content": system_prompt.strip("\n").strip()},
-                {"role": "user", "content": user_prompt.strip("\n").strip()},
-            ],
-            temperature=temperature,
-            seed=seed,
-        )
-        .choices[0]
-        .message.content
-    )
+    judge_llm, judge_embeddings = get_judge_config(judge_llm, judge_embeddings)
+    messages = [
+        ("system", system_prompt.strip("\n").strip()),
+        ("user", user_prompt.strip("\n").strip()),
+    ]
+
+    return judge_llm.invoke(
+        messages,
+        temperature=temperature,
+        seed=seed,
+    ).content
 
 
 def get_score(response: str):
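
The hunks above thread an optional judge_llm through the prompt-validation tests and swap the raw OpenAI client for a LangChain-style chat model (call_model now returns judge_llm.invoke(messages, ...).content). A rough usage sketch, assuming test keyword arguments are supplied through run_test's params and using langchain-openai (a declared dependency of this package); the model name and my_prompt_model are illustrative placeholders, not taken from this diff:

# Sketch only: supply a custom judge model to a prompt-validation test.
import validmind as vm
from langchain_openai import ChatOpenAI  # ships with the "llm" extra

judge = ChatOpenAI(model="gpt-4o", temperature=0.0)  # illustrative model name

result = vm.tests.run_test(
    "validmind.prompt_validation.Specificity",
    inputs={"model": my_prompt_model},  # placeholder: a vm.init_model(...) that carries a prompt
    params={"min_threshold": 7, "judge_llm": judge},
)
result.log()

If judge_llm is left as None, get_judge_config presumably falls back to the account-level LLM configuration checked by is_configured().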

validmind/vm_models/dataset/dataset.py
@@ -47,6 +47,7 @@ class VMDataset(VMInput):
         target_class_labels (Dict): The class labels for the target columns.
         df (pd.DataFrame): The dataset as a pandas DataFrame.
         extra_columns (Dict): Extra columns to include in the dataset.
+        copy_data (bool): Whether to copy the data. Defaults to True.
    """
 
    def __repr__(self):
@@ -66,6 +67,7 @@ class VMDataset(VMInput):
         text_column: str = None,
         extra_columns: dict = None,
         target_class_labels: dict = None,
+        copy_data: bool = True,
    ):
        """
        Initializes a VMDataset instance.
@@ -82,6 +84,7 @@ class VMDataset(VMInput):
             feature_columns (str, optional): The feature column names of the dataset. Defaults to None.
             text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
             target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
+            copy_data (bool, optional): Whether to copy the data. Defaults to True.
        """
        # initialize input_id
        self.input_id = input_id
@@ -112,6 +115,7 @@ class VMDataset(VMInput):
         self.target_class_labels = target_class_labels
         self.extra_columns = ExtraColumns.from_dict(extra_columns)
         self._set_feature_columns(feature_columns)
+        self._copy_data = copy_data
 
        if model:
            self.assign_predictions(model)
@@ -129,16 +133,19 @@ class VMDataset(VMInput):
         excluded = [self.target_column, *self.extra_columns.flatten()]
         self.feature_columns = [col for col in self.columns if col not in excluded]
 
-        self.feature_columns_numeric = (
-            self._df[self.feature_columns]
-            .select_dtypes(include=[np.number])
-            .columns.tolist()
-        )
-        self.feature_columns_categorical = (
-            self._df[self.feature_columns]
-            .select_dtypes(include=[object, pd.Categorical])
-            .columns.tolist()
-        )
+        # Get dtypes without loading data into memory
+        feature_dtypes = self._df[self.feature_columns].dtypes
+
+        self.feature_columns_numeric = feature_dtypes[
+            feature_dtypes.apply(lambda x: pd.api.types.is_numeric_dtype(x))
+        ].index.tolist()
+
+        self.feature_columns_categorical = feature_dtypes[
+            feature_dtypes.apply(
+                lambda x: pd.api.types.is_categorical_dtype(x)
+                or pd.api.types.is_object_dtype(x)
+            )
+        ].index.tolist()
 
    def _add_column(self, column_name, column_values):
        column_values = np.array(column_values)
@@ -397,8 +404,18 @@ class VMDataset(VMInput):
             assert self.target_column not in columns
             columns.append(self.target_column)
 
-        # return a copy to prevent accidental modification
-        return as_df(self._df[columns]).copy()
+        # Check if all columns in self._df are requested
+        all_columns = set(columns) == set(self._df.columns)
+
+        # For copy_data=False and all columns: return exact same DataFrame object
+        if not self._copy_data and all_columns:
+            return self._df
+        # For copy_data=False and subset of columns: return view with shared data
+        elif not self._copy_data:
+            return as_df(self._df[columns])
+        # For copy_data=True: return independent copy with duplicated data
+        else:
+            return as_df(self._df[columns]).copy()
 
    @property
    def x(self) -> np.ndarray:
@@ -522,9 +539,10 @@ class DataFrameDataset(VMDataset):
         text_column: str = None,
         target_class_labels: dict = None,
         date_time_index: bool = False,
+        copy_data: bool = True,
    ):
        """
-        Initializes a DataFrameDataset instance.
+        Initializes a DataFrameDataset instance, preserving original pandas dtypes.
 
        Args:
            raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
@@ -536,25 +554,44 @@ class DataFrameDataset(VMDataset):
             text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
             target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
             date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
+            copy_data (bool, optional): Whether to create a copy of the input data. Defaults to True.
        """
+
+        VMInput.__init__(self)
+
+        self.input_id = input_id
+
        index = None
        if isinstance(raw_dataset.index, pd.Index):
            index = raw_dataset.index.values
+        self.index = index
 
-        super().__init__(
-            raw_dataset=raw_dataset.values,
-            input_id=input_id,
-            model=model,
-            index_name=raw_dataset.index.name,
-            index=index,
-            columns=raw_dataset.columns.to_list(),
-            target_column=target_column,
-            extra_columns=extra_columns,
-            feature_columns=feature_columns,
-            text_column=text_column,
-            target_class_labels=target_class_labels,
-            date_time_index=date_time_index,
-        )
+        # Store the DataFrame directly
+        self._df = raw_dataset
+
+        if date_time_index:
+            self._df = convert_index_to_datetime(self._df)
+
+        self.columns = raw_dataset.columns.tolist()
+        self.column_aliases = {}
+        self.target_column = target_column
+        self.text_column = text_column
+        self.target_class_labels = target_class_labels
+        self.extra_columns = ExtraColumns.from_dict(extra_columns)
+        self._copy_data = copy_data
+
+        # Add warning when copy_data is False
+        if not copy_data:
+            logger.warning(
+                "Dataset initialized with copy_data=False. Changes to the original DataFrame "
+                "may affect this dataset. Use this option only when memory efficiency is critical "
+                "and you won't modify the source data."
+            )
+
+        self._set_feature_columns(feature_columns)
+
+        if model:
+            self.assign_predictions(model)
 
 
 class PolarsDataset(VMDataset):
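
The dataset changes add a copy_data flag: with the default True the dataset keeps returning independent copies, while False returns the original frame (when all columns are requested) or a view (for a subset), trading mutation safety for memory. A minimal sketch of building a dataset that shares memory with its source; it constructs DataFrameDataset directly because this diff does not show whether vm.init_dataset forwards the new flag, and the column names are illustrative:

# Sketch only: share memory with the source DataFrame instead of copying it.
import pandas as pd
from validmind.vm_models.dataset.dataset import DataFrameDataset

raw = pd.DataFrame({"age": [34, 58], "income": [52_000, 91_000], "default": [0, 1]})

ds = DataFrameDataset(
    raw_dataset=raw,
    input_id="training_ds",
    target_column="default",
    copy_data=False,  # logs a warning; `raw` must not be mutated afterwards
)

print(ds.feature_columns_numeric)  # dtypes are read from the frame without materializing a copy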

validmind/vm_models/result/__init__.py
@@ -2,6 +2,20 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial
 
-from .result import ErrorResult, RawData, Result, ResultTable, TestResult
+from .result import (
+    ErrorResult,
+    RawData,
+    Result,
+    ResultTable,
+    TestResult,
+    TextGenerationResult,
+)
 
-__all__ = ["ErrorResult", "RawData", "Result", "ResultTable", "TestResult"]
+__all__ = [
+    "ErrorResult",
+    "RawData",
+    "Result",
+    "ResultTable",
+    "TestResult",
+    "TextGenerationResult",
+]

validmind/vm_models/result/result.py
@@ -129,6 +129,7 @@ class Result:
 
     result_id: str = None
     name: str = None
+    result_type: str = None
 
    def __str__(self) -> str:
        """May be overridden by subclasses."""
@@ -445,6 +446,7 @@ class TestResult(Result):
     async def log_async(
         self,
         section_id: str = None,
+        content_id: str = None,
         position: int = None,
         config: Dict[str, bool] = None,
    ):
@@ -464,7 +466,7 @@ class TestResult(Result):
                 )
             )
 
-        if self.tables or self.figures or self.description:
+        if self.tables:
            tasks.append(
                api_client.alog_test_result(
                    result=self.serialize(),
@@ -473,30 +475,32 @@ class TestResult(Result):
                     config=config,
                 )
             )
-
+        if self.figures:
            tasks.extend(
                [api_client.alog_figure(figure) for figure in (self.figures or [])]
            )
+        if self.description:
+            revision_name = (
+                AI_REVISION_NAME
+                if self._was_description_generated
+                else DEFAULT_REVISION_NAME
+            )
 
-        if self.description:
-            revision_name = (
-                AI_REVISION_NAME
-                if self._was_description_generated
-                else DEFAULT_REVISION_NAME
-            )
-
-        tasks.append(
-            update_metadata(
-                content_id=f"test_description:{self.result_id}::{revision_name}",
-                text=self.description,
-            )
+            tasks.append(
+                update_metadata(
+                    content_id=f"{content_id}::{revision_name}"
+                    if content_id
+                    else f"test_description:{self.result_id}::{revision_name}",
+                    text=self.description,
                )
+            )
 
        return await asyncio.gather(*tasks)
 
    def log(
        self,
        section_id: str = None,
+        content_id: str = None,
        position: int = None,
        unsafe: bool = False,
        config: Dict[str, bool] = None,
@@ -506,6 +510,7 @@ class TestResult(Result):
         Args:
             section_id (str): The section ID within the model document to insert the
                 test result.
+            content_id (str): The content ID to log the result to.
            position (int): The position (index) within the section to insert the test
                result.
            unsafe (bool): If True, log the result even if it contains sensitive data
@@ -533,6 +538,7 @@ class TestResult(Result):
         run_async(
             self.log_async,
             section_id=section_id,
+            content_id=content_id,
            position=position,
            config=config,
        )
@@ -568,3 +574,110 @@ class TestResult(Result):
         raise InvalidParameterError(
             f"Values for config keys must be boolean. Non-boolean values found for keys: {', '.join(non_bool_keys)}"
         )
+
+
+@dataclass
+class TextGenerationResult(Result):
+    """Test result."""
+
+    name: str = "Text Generation Result"
+    ref_id: str = None
+    title: Optional[str] = None
+    doc: Optional[str] = None
+    description: Optional[Union[str, DescriptionFuture]] = None
+    params: Optional[Dict[str, Any]] = None
+    metadata: Optional[Dict[str, Any]] = None
+    _was_description_generated: bool = False
+
+    def __post_init__(self):
+        if self.ref_id is None:
+            self.ref_id = str(uuid4())
+
+    def __repr__(self) -> str:
+        attrs = [
+            attr
+            for attr in [
+                "doc",
+                "description",
+                "params",
+            ]
+            if getattr(self, attr) is not None
+            and (
+                len(getattr(self, attr)) > 0
+                if isinstance(getattr(self, attr), list)
+                else True
+            )
+        ]
+
+        return f'TextGenerationResult("{self.result_id}", {", ".join(attrs)})'
+
+    def __getattribute__(self, name):
+        # lazy load description if its a DescriptionFuture (generated in background)
+        if name == "description":
+            description = super().__getattribute__("description")
+
+            if isinstance(description, DescriptionFuture):
+                self._was_description_generated = True
+                self.description = description.get_description()
+
+        return super().__getattribute__(name)
+
+    @property
+    def test_name(self) -> str:
+        """Get the test name, using custom title if available."""
+        return self.title or test_id_to_name(self.result_id)
+
+    def to_widget(self):
+        template_data = {
+            "test_name": self.test_name,
+            "description": self.description.replace("h3", "strong"),
+            "params": (
+                json.dumps(self.params, cls=NumpyEncoder, indent=2)
+                if self.params
+                else None
+            ),
+        }
+        rendered = get_result_template().render(**template_data)
+
+        widgets = [HTML(rendered)]
+
+        return VBox(widgets)
+
+    def serialize(self):
+        """Serialize the result for the API."""
+        return {
+            "test_name": self.result_id,
+            "title": self.title,
+            "ref_id": self.ref_id,
+            "params": self.params,
+            "metadata": self.metadata,
+        }
+
+    async def log_async(
+        self,
+        content_id: str = None,
+    ):
+        return await asyncio.gather(
+            update_metadata(
+                content_id=f"{content_id}",
+                text=self.description,
+            )
+        )
+
+    def log(
+        self,
+        content_id: str = None,
+    ):
+        """Log the result to ValidMind.
+
+        Args:
+            section_id (str): The section ID within the model document to insert the
+                test result.
+            content_id (str): The content ID to log the result to.
+            position (int): The position (index) within the section to insert the test
+                result.
+        """
+        run_async(
+            self.log_async,
+            content_id=content_id,
+        )
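
Both TestResult.log and the new TextGenerationResult.log accept a content_id, which routes the (possibly AI-generated) description to a named content block instead of the default test_description:<result_id> metadata key. A hedged sketch of what that call might look like; the test ID comes from the file list above, while the content_id value and the ds input are placeholders:

# Sketch only: log a result's description under a specific content block ID.
import validmind as vm

result = vm.tests.run_test(
    "validmind.data_validation.MutualInformation",
    inputs={"dataset": ds},  # ds: a previously initialized VM dataset
)

# Stored as "<content_id>::<revision>" rather than "test_description:<result_id>::<revision>".
result.log(content_id="data_quality_summary")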

{validmind-2.8.20.dist-info → validmind-2.8.26.dist-info}/METADATA
@@ -1,11 +1,11 @@
 Metadata-Version: 2.3
 Name: validmind
-Version: 2.8.20
+Version: 2.8.26
 Summary: ValidMind Library
 License: Commercial License
 Author: Andres Rodriguez
 Author-email: andres@validmind.ai
-Requires-Python: >=3.8.1,<3.12
+Requires-Python: >=3.9.0,<3.12
 Classifier: License :: Other/Proprietary License
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.9
@@ -22,6 +22,7 @@ Requires-Dist: bert-score (>=0.3.13)
 Requires-Dist: catboost
 Requires-Dist: datasets (>=2.10.0,<3.0.0)
 Requires-Dist: evaluate
+Requires-Dist: h11 (>=0.16.0)
 Requires-Dist: ipywidgets
 Requires-Dist: kaleido (>=0.2.1,!=0.2.1.post1)
 Requires-Dist: langchain-openai (>=0.1.8) ; extra == "all" or extra == "llm"
@@ -53,7 +54,7 @@ Requires-Dist: statsmodels
 Requires-Dist: tabulate (>=0.8.9,<0.9.0)
 Requires-Dist: textblob (>=0.18.0.post0,<0.19.0)
 Requires-Dist: tiktoken
-Requires-Dist: torch (>=1.10.0) ; extra == "all" or extra == "llm" or extra == "pytorch"
+Requires-Dist: torch (==2.7.0) ; extra == "all" or extra == "llm" or extra == "pytorch"
 Requires-Dist: tqdm
 Requires-Dist: transformers (>=4.32.0,<5.0.0) ; extra == "all" or extra == "huggingface" or extra == "llm"
 Requires-Dist: xgboost (>=1.5.2,<3)