validmind 2.4.1__py3-none-any.whl → 2.4.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/api_client.py +17 -15
- validmind/datasets/nlp/cnn_dailymail.py +10 -1
- validmind/datasets/nlp/datasets/cnn_dailymail_100_with_predictions.csv +33 -209
- validmind/datasets/nlp/datasets/cnn_dailymail_500_with_predictions.csv +156 -1051
- validmind/models/huggingface.py +0 -1
- validmind/tests/data_validation/IQROutliersBarPlot.py +16 -9
- validmind/tests/data_validation/IQROutliersTable.py +13 -6
- validmind/tests/data_validation/TabularDescriptionTables.py +96 -148
- validmind/tests/model_validation/embeddings/ClusterDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py +1 -1
- validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py +1 -1
- validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py +1 -1
- validmind/tests/model_validation/embeddings/StabilityAnalysis.py +3 -2
- validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py +9 -2
- validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py +0 -1
- validmind/tests/model_validation/sklearn/HyperParametersTuning.py +2 -1
- validmind/tests/run.py +1 -1
- validmind/vm_models/dataset/dataset.py +18 -6
- validmind/vm_models/test_suite/summary.py +2 -2
- {validmind-2.4.1.dist-info → validmind-2.4.5.dist-info}/METADATA +4 -3
- {validmind-2.4.1.dist-info → validmind-2.4.5.dist-info}/RECORD +25 -25
- {validmind-2.4.1.dist-info → validmind-2.4.5.dist-info}/LICENSE +0 -0
- {validmind-2.4.1.dist-info → validmind-2.4.5.dist-info}/WHEEL +0 -0
- {validmind-2.4.1.dist-info → validmind-2.4.5.dist-info}/entry_points.txt +0 -0
validmind/models/huggingface.py
CHANGED
@@ -56,7 +56,6 @@ class HFModel(VMModel):
             return [result["label"] for result in results]
         elif tasks[-1] == "feature_extraction":
             # Extract [CLS] token embedding for each input and return as list of lists
-            print(f"len(results): {len(results)}")
             return [embedding[0][0] for embedding in results]
         else:
             return results
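For context on the `feature_extraction` branch above: a Hugging Face feature-extraction pipeline returns one nested list per input, shaped [batch][token][hidden], so `embedding[0][0]` selects the [CLS] token vector. A minimal sketch (not the packaged code; the model name is only illustrative):

```python
from transformers import pipeline

extractor = pipeline("feature-extraction", model="distilbert-base-uncased")
results = extractor(["first input text", "second input text"])

# Each result is nested as [batch][token][hidden]; [0][0] is the [CLS] token embedding.
cls_embeddings = [embedding[0][0] for embedding in results]
print(len(cls_embeddings), len(cls_embeddings[0]))  # 2 inputs, hidden size of the model
```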
validmind/tests/data_validation/IQROutliersBarPlot.py
CHANGED
@@ -4,7 +4,6 @@

 from dataclasses import dataclass

-import numpy as np
 import plotly.graph_objects as go

 from validmind.vm_models import Figure, Metric
@@ -62,22 +61,27 @@ class IQROutliersBarPlot(Metric):

     name = "iqr_outliers_bar_plot"
     required_inputs = ["dataset"]
-    default_params = {"threshold": 1.5, "…
+    default_params = {"threshold": 1.5, "fig_width": 800}
     tasks = ["classification", "regression"]
     tags = ["tabular_data", "visualization", "numerical_data"]

     def run(self):
         df = self.inputs.dataset.df
-
+
+        # Select numerical features
+        features = self.inputs.dataset.feature_columns_numeric
+
+        # Select non-binary features
+        features = [
+            feature
+            for feature in features
+            if len(self.inputs.dataset.df[feature].unique()) > 2
+        ]
+
         threshold = self.params["threshold"]
         fig_width = self.params["fig_width"]

-
-        # Otherwise, only use the columns provided in num_features.
-        if num_features is None:
-            df = df.select_dtypes(include=[np.number])
-        else:
-            df = df[num_features]
+        df = df[features]

         return self.detect_and_visualize_outliers(df, threshold, fig_width)

@@ -98,6 +102,9 @@ class IQROutliersBarPlot(Metric):
             # Compute outliers
             outliers = self.compute_outliers(df[col], threshold)

+            if outliers.empty:
+                continue  # Skip plotting if there are no outliers
+
             Q1_count = outliers[
                 (outliers >= 0) & (outliers < outliers.quantile(0.25))
             ].count()
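The `compute_outliers` call above applies the standard IQR rule (the same rule appears in the IQROutliersTable hunk below). A minimal sketch of that rule, assuming a plain pandas Series:

```python
import pandas as pd

def iqr_outliers(series: pd.Series, threshold: float = 1.5) -> pd.Series:
    # Flag values outside [Q1 - threshold*IQR, Q3 + threshold*IQR]
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    lower, upper = q1 - threshold * iqr, q3 + threshold * iqr
    return series[(series < lower) | (series > upper)]

print(iqr_outliers(pd.Series([1, 2, 3, 4, 5, 100])))  # flags 100
```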
validmind/tests/data_validation/IQROutliersTable.py
CHANGED
@@ -4,7 +4,6 @@

 from dataclasses import dataclass

-import numpy as np
 import pandas as pd

 from validmind.vm_models import Metric, ResultSummary, ResultTable, ResultTableMetadata
@@ -53,12 +52,22 @@ class IQROutliersTable(Metric):

     name = "iqr_outliers_table"
     required_inputs = ["dataset"]
-    default_params = {"…
+    default_params = {"threshold": 1.5}
     tasks = ["classification", "regression"]
     tags = ["tabular_data", "numerical_data"]

     def run(self):
-
+
+        # Select numerical features
+        features = self.inputs.dataset.feature_columns_numeric
+
+        # Select non-binary features
+        features = [
+            feature
+            for feature in features
+            if len(self.inputs.dataset.df[feature].unique()) > 2
+        ]
+
         threshold = self.params["threshold"]

         df = self.inputs.dataset.df
@@ -80,9 +89,7 @@ class IQROutliersTable(Metric):
         upper_bound = Q3 + threshold * IQR
         return series[(series < lower_bound) | (series > upper_bound)]

-    def detect_and_analyze_outliers(self, df, features…
-        if features is None:
-            features = df.select_dtypes(include=[np.number]).columns.tolist()
+    def detect_and_analyze_outliers(self, df, features, threshold=1.5):

         outliers_summary = []
         for feature in features:
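Both IQR tests now derive their feature list from `feature_columns_numeric` and drop binary columns instead of accepting a `num_features` parameter. A minimal sketch of that filter, assuming a plain DataFrame in place of the VMDataset wrapper (`select_dtypes` stands in for `feature_columns_numeric`):

```python
import pandas as pd

df = pd.DataFrame({
    "age": [23, 45, 31, 62],          # numeric, non-binary: kept
    "is_active": [0, 1, 1, 0],        # binary: dropped
    "segment": ["a", "b", "a", "c"],  # non-numeric: dropped
})

numeric = df.select_dtypes("number").columns
features = [col for col in numeric if df[col].nunique() > 2]
print(features)  # ['age']
```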
validmind/tests/data_validation/TabularDescriptionTables.py
CHANGED
@@ -2,15 +2,14 @@
 # See the LICENSE file in the root of this repository for details.
 # SPDX-License-Identifier: AGPL-3.0 AND ValidMind Commercial

-from dataclasses import dataclass
-
 import pandas as pd

-from validmind…
+from validmind import tags, tasks


-@…
-… [removed line not captured in the source diff]
+@tags("tabular_data")
+@tasks("classification", "regression")
+def TabularDescriptionTables(dataset):
     """
     Summarizes key descriptive statistics for numerical, categorical, and datetime variables in a dataset.

@@ -54,155 +53,104 @@ class TabularDescriptionTables(Metric):
     chosen algorithm.
     """

-… [removed lines 57-101 of the old class-based implementation not captured in the source diff]
-            )
-            summary_stats.loc[column, "Missing Values (%)"] = (
-                self.inputs.dataset.df[column].isnull().mean() * 100
-            )
-            summary_stats.loc[column, "Data Type"] = str(
-                self.inputs.dataset.df[column].dtype
-            )
-
-        summary_stats = summary_stats.sort_values(
-            by="Missing Values (%)", ascending=False
-        )
-        summary_stats.reset_index(inplace=True)
-        summary_stats.rename(
-            columns={"index": "Categorical Variable"}, inplace=True
-        )
-        return summary_stats
-
-    def get_summary_statistics_datetime(self, datetime_fields):
-        summary_stats = pd.DataFrame()
-        for column in self.inputs.dataset.df[datetime_fields].columns:
-            summary_stats.loc[column, "Num of Obs"] = int(
-                self.inputs.dataset.df[column].count()
-            )
-            summary_stats.loc[column, "Num of Unique Values"] = self.inputs.dataset.df[
+    numerical_fields = get_numerical_columns(dataset)
+    categorical_fields = get_categorical_columns(dataset)
+    datetime_fields = get_datetime_columns(dataset)
+
+    summary_stats_numerical = get_summary_statistics_numerical(
+        dataset, numerical_fields
+    )
+    summary_stats_categorical = get_summary_statistics_categorical(
+        dataset, categorical_fields
+    )
+    summary_stats_datetime = get_summary_statistics_datetime(dataset, datetime_fields)
+
+    return (summary_stats_numerical, summary_stats_categorical, summary_stats_datetime)
+
+
+def get_summary_statistics_numerical(dataset, numerical_fields):
+    summary_stats = dataset.df[numerical_fields].describe().T
+    summary_stats["Missing Values (%)"] = (
+        dataset.df[numerical_fields].isnull().mean() * 100
+    )
+    summary_stats["Data Type"] = dataset.df[numerical_fields].dtypes.astype(str)
+    summary_stats = summary_stats[
+        ["count", "mean", "min", "max", "Missing Values (%)", "Data Type"]
+    ]
+    summary_stats.columns = [
+        "Num of Obs",
+        "Mean",
+        "Min",
+        "Max",
+        "Missing Values (%)",
+        "Data Type",
+    ]
+    summary_stats["Num of Obs"] = summary_stats["Num of Obs"].astype(int)
+    summary_stats = summary_stats.sort_values(by="Missing Values (%)", ascending=False)
+    summary_stats.reset_index(inplace=True)
+    summary_stats.rename(columns={"index": "Numerical Variable"}, inplace=True)
+    return summary_stats
+
+
+def get_summary_statistics_categorical(dataset, categorical_fields):
+    summary_stats = pd.DataFrame()
+    if categorical_fields:  # check if the list is not empty
+        for column in dataset.df[categorical_fields].columns:
+            summary_stats.loc[column, "Num of Obs"] = int(dataset.df[column].count())
+            summary_stats.loc[column, "Num of Unique Values"] = dataset.df[
                 column
             ].nunique()
-            summary_stats.loc[column, "…
-                column
-            ].min()
-            summary_stats.loc[column, "Latest Date"] = self.inputs.dataset.df[
-                column
-            ].max()
-            summary_stats.loc[column, "Missing Values (%)"] = (
-                self.inputs.dataset.df[column].isnull().mean() * 100
+            summary_stats.loc[column, "Unique Values"] = str(
+                dataset.df[column].unique()
             )
-            summary_stats.loc[column, "…
-
+            summary_stats.loc[column, "Missing Values (%)"] = (
+                dataset.df[column].isnull().mean() * 100
             )
+            summary_stats.loc[column, "Data Type"] = str(dataset.df[column].dtype)

-… [removed lines 141-143 not captured in the source diff]
-        )
+        summary_stats = summary_stats.sort_values(
+            by="Missing Values (%)", ascending=False
+        )
         summary_stats.reset_index(inplace=True)
-        summary_stats.rename(columns={"index": "…
-… [removed lines 147-158 not captured in the source diff]
-            ),
-            ResultTable(
-                data=summary_stats_categorical,
-                metadata=ResultTableMetadata(title="Categorical Variables"),
-            ),
-            ResultTable(
-                data=summary_stats_datetime,
-                metadata=ResultTableMetadata(title="Datetime Variables"),
-            ),
-        ]
+        summary_stats.rename(columns={"index": "Categorical Variable"}, inplace=True)
+    return summary_stats
+
+
+def get_summary_statistics_datetime(dataset, datetime_fields):
+    summary_stats = pd.DataFrame()
+    for column in dataset.df[datetime_fields].columns:
+        summary_stats.loc[column, "Num of Obs"] = int(dataset.df[column].count())
+        summary_stats.loc[column, "Num of Unique Values"] = dataset.df[column].nunique()
+        summary_stats.loc[column, "Earliest Date"] = dataset.df[column].min()
+        summary_stats.loc[column, "Latest Date"] = dataset.df[column].max()
+        summary_stats.loc[column, "Missing Values (%)"] = (
+            dataset.df[column].isnull().mean() * 100
         )
+        summary_stats.loc[column, "Data Type"] = str(dataset.df[column].dtype)

-… [removed lines 171-173 not captured in the source diff]
-        ).columns.tolist()
-        return categorical_columns
-
-    def get_numerical_columns(self):
-        numerical_columns = self.inputs.dataset.df.select_dtypes(
-            include=["int", "float", "uint8"]
-        ).columns.tolist()
-        return numerical_columns
-
-    def get_datetime_columns(self):
-        datetime_columns = self.inputs.dataset.df.select_dtypes(
-            include=["datetime"]
-        ).columns.tolist()
-        return datetime_columns
-
-    def run(self):
-        numerical_fields = self.get_numerical_columns()
-        categorical_fields = self.get_categorical_columns()
-        datetime_fields = self.get_datetime_columns()
-
-        summary_stats_numerical = self.get_summary_statistics_numerical(
-            numerical_fields
-        )
-        summary_stats_categorical = self.get_summary_statistics_categorical(
-            categorical_fields
-        )
-        summary_stats_datetime = self.get_summary_statistics_datetime(datetime_fields)
-
-        return self.cache_results(
-            {
-                "numerical": summary_stats_numerical.to_dict(orient="records"),
-                "categorical": summary_stats_categorical.to_dict(orient="records"),
-                "datetime": summary_stats_datetime.to_dict(orient="records"),
-            }
+    if not summary_stats.empty:
+        summary_stats = summary_stats.sort_values(
+            by="Missing Values (%)", ascending=False
         )
+        summary_stats.reset_index(inplace=True)
+        summary_stats.rename(columns={"index": "Datetime Variable"}, inplace=True)
+    return summary_stats
+
+
+def get_categorical_columns(dataset):
+    categorical_columns = dataset.df.select_dtypes(
+        include=["object", "category"]
+    ).columns.tolist()
+    return categorical_columns
+
+
+def get_numerical_columns(dataset):
+    numerical_columns = dataset.df.select_dtypes(
+        include=["int", "float", "uint8"]
+    ).columns.tolist()
+    return numerical_columns
+
+
+def get_datetime_columns(dataset):
+    datetime_columns = dataset.df.select_dtypes(include=["datetime"]).columns.tolist()
+    return datetime_columns
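The rewritten test is now a plain decorated function returning three summary tables. A minimal sketch of the numerical summary it builds, assuming a toy stand-in for the dataset wrapper (only the `.df` attribute is used here):

```python
from types import SimpleNamespace

import numpy as np
import pandas as pd

dataset = SimpleNamespace(df=pd.DataFrame({
    "income": [50_000.0, 62_000.0, np.nan, 58_000.0],
    "age": [34, 29, 41, 38],
}))

stats = dataset.df.describe().T
stats["Missing Values (%)"] = dataset.df.isnull().mean() * 100
stats["Data Type"] = dataset.df.dtypes.astype(str)
stats = stats[["count", "mean", "min", "max", "Missing Values (%)", "Data Type"]]
stats.columns = ["Num of Obs", "Mean", "Min", "Max", "Missing Values (%)", "Data Type"]
print(stats)
```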
validmind/tests/model_validation/embeddings/ClusterDistribution.py
CHANGED
@@ -52,7 +52,7 @@ class ClusterDistribution(Metric):
         "num_clusters": 5,
     }
     tasks = ["feature_extraction"]
-    tags = ["llm", "text_data", "…
+    tags = ["llm", "text_data", "embeddings", "visualization"]

     def run(self):
         # run kmeans clustering on embeddings
validmind/tests/model_validation/embeddings/CosineSimilarityDistribution.py
CHANGED
@@ -51,7 +51,7 @@ class CosineSimilarityDistribution(Metric):
     name = "Text Embeddings Cosine Similarity Distribution"
     required_inputs = ["model", "dataset"]
     tasks = ["feature_extraction"]
-    tags = ["llm", "text_data", "…
+    tags = ["llm", "text_data", "embeddings", "visualization"]

     def run(self):
         # Compute cosine similarity
validmind/tests/model_validation/embeddings/DescriptiveAnalytics.py
CHANGED
@@ -54,7 +54,7 @@ class DescriptiveAnalytics(Metric):
     name = "Descriptive Analytics for Text Embeddings Models"
     required_inputs = ["model", "dataset"]
     tasks = ["feature_extraction"]
-    tags = ["llm", "text_data", "…
+    tags = ["llm", "text_data", "embeddings", "visualization"]

     def run(self):
         # Assuming y_pred returns a 2D array of embeddings [samples, features]
validmind/tests/model_validation/embeddings/EmbeddingsVisualization2D.py
CHANGED
@@ -54,7 +54,7 @@ class EmbeddingsVisualization2D(Metric):
         "perplexity": 30,
     }
     tasks = ["feature_extraction"]
-    tags = ["llm", "text_data", "…
+    tags = ["llm", "text_data", "embeddings", "visualization"]

     def run(self):
         cluster_column = self.params.get("cluster_column")
validmind/tests/model_validation/embeddings/StabilityAnalysis.py
CHANGED
@@ -30,7 +30,7 @@ class StabilityAnalysis(ThresholdTest):
         "mean_similarity_threshold": 0.7,
     }
     tasks = ["feature_extraction"]
-    tags = ["llm", "text_data", "…
+    tags = ["llm", "text_data", "embeddings", "visualization"]

     @abstractmethod
     def perturb_data(self, data: str) -> str:
@@ -62,7 +62,8 @@ class StabilityAnalysis(ThresholdTest):

     def run(self):
         # Perturb the test dataset
-… [removed line not captured in the source diff]
+        text_column = self.inputs.dataset.text_column
+        original = self.inputs.dataset.df[[text_column]]
         perturbed = original.copy()
         perturbed.update(
             perturbed.select_dtypes(include="object").applymap(self.perturb_data)
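The new run() reads the dataset's `text_column` explicitly before perturbing. A minimal sketch of the perturb-and-update step, with a trivial stand-in for the abstract `perturb_data`:

```python
import pandas as pd

def perturb_data(text: str) -> str:
    return text.upper()  # placeholder perturbation

text_column = "text"
original = pd.DataFrame({text_column: ["first sample", "second sample"]})

perturbed = original.copy()
perturbed.update(perturbed.select_dtypes(include="object").applymap(perturb_data))
print(perturbed)
```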
validmind/tests/model_validation/embeddings/StabilityAnalysisTranslation.py
CHANGED
@@ -4,8 +4,12 @@

 from transformers import MarianMTModel, MarianTokenizer

+from validmind.logging import get_logger
+
 from .StabilityAnalysis import StabilityAnalysis

+logger = get_logger(__name__)
+

 class StabilityAnalysisTranslation(StabilityAnalysis):
     """
@@ -61,8 +65,11 @@ class StabilityAnalysisTranslation(StabilityAnalysis):
     }

     def perturb_data(self, data: str):
-        if…
-… [removed line not captured in the source diff]
+        if len(data) > 512:
+            logger.info(
+                "Data length exceeds 512 tokens. Truncating data to 512 tokens."
+            )
+            data = data[:512]

         source_lang = self.params["source_lang"]
         target_lang = self.params["target_lang"]
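The new guard truncates long inputs before translation (note it counts characters while the log message speaks of tokens). A minimal sketch, assuming the standard-library logger in place of `validmind.logging.get_logger`:

```python
import logging

logger = logging.getLogger(__name__)

def truncate_for_translation(data: str, max_len: int = 512) -> str:
    # Truncate by character count, mirroring data[:512] in the diff above
    if len(data) > max_len:
        logger.info("Input length exceeds %d; truncating.", max_len)
        data = data[:max_len]
    return data

print(len(truncate_for_translation("x" * 1000)))  # 512
```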
validmind/tests/model_validation/embeddings/TSNEComponentsPairwisePlots.py
CHANGED
@@ -53,7 +53,6 @@ def TSNEComponentsPairwisePlots(
     - t-SNE visualizations can be misleading if interpreted without considering the stochastic nature of the algorithm;
     two runs with the same parameters might yield different visual outputs, necessitating multiple runs for a consistent interpretation.
     """
-
     # Get embeddings from the dataset using the model
     embeddings = np.stack(dataset.y_pred(model))

validmind/tests/model_validation/sklearn/HyperParametersTuning.py
CHANGED
@@ -60,8 +60,9 @@ class HyperParametersTuning(Metric):
         param_grid = self.params["param_grid"]
         if param_grid is None:
             raise SkipTestError(
-                "param_grid in…
+                "param_grid in dictonary format must be provided to run this test"
             )
+
         model = self.inputs.model.model
         estimators = GridSearchCV(
             model, param_grid=param_grid, scoring=self.params["scoring"]
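The test hands the wrapped model to scikit-learn's GridSearchCV together with the user-supplied `param_grid` and `scoring`. A minimal sketch of that call pattern, with an illustrative grid for a logistic regression:

```python
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)
param_grid = {"C": [0.1, 1.0, 10.0]}  # illustrative grid

estimators = GridSearchCV(
    LogisticRegression(max_iter=1000), param_grid=param_grid, scoring="accuracy"
)
estimators.fit(X, y)
print(estimators.best_params_)
```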
validmind/tests/run.py
CHANGED
@@ -118,7 +118,7 @@ def _combine_figures(figure_lists: List[List[Any]], input_groups: List[Dict[str,

     title_template = "{current_title}({input_description})"

-    for…
+    for figures in list(zip(*figure_lists)):
         if is_plotly_figure(figures[0].figure):
             _update_plotly_titles(figures, input_groups, title_template)
         elif is_matplotlib_figure(figures[0].figure):
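`zip(*figure_lists)` regroups the figures so that figures produced at the same position across input groups are retitled together. A small sketch of the regrouping, using strings as stand-ins for figure objects:

```python
figure_lists = [
    ["group1-fig1", "group1-fig2"],  # figures from the first input group
    ["group2-fig1", "group2-fig2"],  # figures from the second input group
]

for figures in list(zip(*figure_lists)):
    print(figures)
# ('group1-fig1', 'group2-fig1')
# ('group1-fig2', 'group2-fig2')
```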
validmind/vm_models/dataset/dataset.py
CHANGED
@@ -139,13 +139,25 @@ class VMDataset:
         )

     def _add_column(self, column_name, column_values):
-… [removed lines not captured in the source diff]
-        )
+        column_values = np.array(column_values)
+
+        if column_values.ndim == 1:
+            if len(column_values) != len(self.df):
+                raise ValueError(
+                    "Length of values doesn't match number of rows in the DataFrame."
+                )
+            self.columns.append(column_name)
+            self.df[column_name] = column_values
+        elif column_values.ndim == 2:
+            if column_values.shape[0] != len(self.df):
+                raise ValueError(
+                    "Number of rows in values doesn't match number of rows in the DataFrame."
+                )
+            self.columns.append(column_name)
+            self.df[column_name] = column_values.tolist()

-… [removed lines not captured in the source diff]
+        else:
+            raise ValueError("Only 1D and 2D arrays are supported for column_values.")

     def _validate_assign_predictions(
         self,
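`_add_column` now accepts both 1D arrays (one value per row) and 2D arrays (one vector per row, e.g. embeddings), storing the latter as per-row lists. A minimal sketch of the same idea on a plain DataFrame:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"text": ["a", "b", "c"]})

preds = np.array([0, 1, 1])        # 1D: one value per row
embeddings = np.random.rand(3, 4)  # 2D: one vector per row

df["prediction"] = preds               # regular column
df["embedding"] = embeddings.tolist()  # list-valued column, one vector per row
print(df)
```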
validmind/vm_models/test_suite/summary.py
CHANGED
@@ -93,10 +93,10 @@ class TestSuiteSummary:

     def _add_results_link(self):
         # avoid circular import
-        from ...api_client import get_api_host,…
+        from ...api_client import get_api_host, get_api_model

         ui_host = get_api_host().replace("/api/v1/tracking", "").replace("api", "app")
-        link = f"{ui_host}/projects/{…
+        link = f"{ui_host}/projects/{get_api_model()}/project-overview"
         results_link = f"""
         <h3>
             Check out the updated documentation in your
{validmind-2.4.1.dist-info → validmind-2.4.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: validmind
-Version: 2.4.1
+Version: 2.4.5
 Summary: ValidMind Developer Framework
 License: Commercial License
 Author: Andres Rodriguez
@@ -32,7 +32,7 @@ Requires-Dist: mistune (>=3.0.2,<4.0.0)
 Requires-Dist: nltk (>=3.8.1,<4.0.0)
 Requires-Dist: numba (<0.59.0)
 Requires-Dist: numpy
-Requires-Dist: openai (>=1) ; extra == "all"…
+Requires-Dist: openai (>=1) ; extra == "all"…
 Requires-Dist: pandas (>=1.1,<2)
 Requires-Dist: plotly
 Requires-Dist: plotly-express
@@ -46,6 +46,7 @@ Requires-Dist: scikit-learn
 Requires-Dist: scipy
 Requires-Dist: scorecardpy (>=0.1.9.6,<0.2.0.0)
 Requires-Dist: seaborn
+Requires-Dist: sentencepiece (>=0.2.0,<0.3.0) ; extra == "all" or extra == "huggingface" or extra == "llm"
 Requires-Dist: sentry-sdk (>=1.24.0,<2.0.0)
 Requires-Dist: shap (>=0.42.0,<0.43.0)
 Requires-Dist: statsmodels
@@ -53,7 +54,7 @@ Requires-Dist: tabulate (>=0.8.9,<0.9.0)
 Requires-Dist: textblob (>=0.18.0.post0,<0.19.0)
 Requires-Dist: torch (>=1.10.0) ; extra == "all" or extra == "llm" or extra == "pytorch"
 Requires-Dist: tqdm
-Requires-Dist: transformers (>=4.32.0,<5.0.0) ; extra == "all" or extra == "…
+Requires-Dist: transformers (>=4.32.0,<5.0.0) ; extra == "all" or extra == "huggingface" or extra == "llm"
 Requires-Dist: xgboost (>=1.5.2,<3)
 Requires-Dist: ydata-profiling
 Description-Content-Type: text/markdown