validmind 2.8.20__py3-none-any.whl → 2.8.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- validmind/__version__.py +1 -1
- validmind/client.py +3 -0
- validmind/template.py +3 -2
- validmind/vm_models/dataset/dataset.py +64 -27
- {validmind-2.8.20.dist-info → validmind-2.8.22.dist-info}/METADATA +1 -1
- {validmind-2.8.20.dist-info → validmind-2.8.22.dist-info}/RECORD +9 -9
- {validmind-2.8.20.dist-info → validmind-2.8.22.dist-info}/LICENSE +0 -0
- {validmind-2.8.20.dist-info → validmind-2.8.22.dist-info}/WHEEL +0 -0
- {validmind-2.8.20.dist-info → validmind-2.8.22.dist-info}/entry_points.txt +0 -0
validmind/__version__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "2.8.20"
|
1
|
+
__version__ = "2.8.22"
|
validmind/client.py
CHANGED
@@ -61,6 +61,7 @@ def init_dataset(
|
|
61
61
|
class_labels: Optional[Dict[str, Any]] = None,
|
62
62
|
type: Optional[str] = None,
|
63
63
|
input_id: Optional[str] = None,
|
64
|
+
copy_data: bool = True,
|
64
65
|
__log: bool = True,
|
65
66
|
) -> VMDataset:
|
66
67
|
"""
|
@@ -92,6 +93,7 @@ def init_dataset(
|
|
92
93
|
this will be set to `dataset` but if you are passing this dataset as a
|
93
94
|
test input using some other key than `dataset`, then you should set
|
94
95
|
this to the same key.
|
96
|
+
copy_data (bool, optional): Whether to copy the data. Defaults to True.
|
95
97
|
__log (bool): Whether to log the input. Defaults to True.
|
96
98
|
|
97
99
|
Raises:
|
@@ -121,6 +123,7 @@ def init_dataset(
|
|
121
123
|
extra_columns=extra_columns,
|
122
124
|
target_class_labels=class_labels,
|
123
125
|
date_time_index=date_time_index,
|
126
|
+
copy_data=copy_data,
|
124
127
|
)
|
125
128
|
elif isinstance(dataset, pl.DataFrame):
|
126
129
|
vm_dataset = PolarsDataset(
|
validmind/template.py
CHANGED
@@ -53,8 +53,9 @@ def _convert_sections_to_section_tree(
|
|
53
53
|
|
54
54
|
if start_section_id and not section_tree:
|
55
55
|
raise ValueError(f"Section {start_section_id} not found in template")
|
56
|
-
|
57
|
-
|
56
|
+
# sort the section tree by the order of the sections in the template (if provided)
|
57
|
+
# set the order to 9999 for the sections that do not have an order
|
58
|
+
return sorted(section_tree, key=lambda x: x.get("order", 9999))
|
58
59
|
|
59
60
|
|
60
61
|
def _create_content_widget(content: Dict[str, Any]) -> Widget:
|
validmind/vm_models/dataset/dataset.py
CHANGED
@@ -47,6 +47,7 @@ class VMDataset(VMInput):
|
|
47
47
|
target_class_labels (Dict): The class labels for the target columns.
|
48
48
|
df (pd.DataFrame): The dataset as a pandas DataFrame.
|
49
49
|
extra_columns (Dict): Extra columns to include in the dataset.
|
50
|
+
copy_data (bool): Whether to copy the data. Defaults to True.
|
50
51
|
"""
|
51
52
|
|
52
53
|
def __repr__(self):
|
@@ -66,6 +67,7 @@ class VMDataset(VMInput):
|
|
66
67
|
text_column: str = None,
|
67
68
|
extra_columns: dict = None,
|
68
69
|
target_class_labels: dict = None,
|
70
|
+
copy_data: bool = True,
|
69
71
|
):
|
70
72
|
"""
|
71
73
|
Initializes a VMDataset instance.
|
@@ -82,6 +84,7 @@ class VMDataset(VMInput):
|
|
82
84
|
feature_columns (str, optional): The feature column names of the dataset. Defaults to None.
|
83
85
|
text_column (str, optional): The text column name of the dataset for nlp tasks. Defaults to None.
|
84
86
|
target_class_labels (Dict, optional): The class labels for the target columns. Defaults to None.
|
87
|
+
copy_data (bool, optional): Whether to copy the data. Defaults to True.
|
85
88
|
"""
|
86
89
|
# initialize input_id
|
87
90
|
self.input_id = input_id
|
@@ -112,6 +115,7 @@ class VMDataset(VMInput):
|
|
112
115
|
self.target_class_labels = target_class_labels
|
113
116
|
self.extra_columns = ExtraColumns.from_dict(extra_columns)
|
114
117
|
self._set_feature_columns(feature_columns)
|
118
|
+
self._copy_data = copy_data
|
115
119
|
|
116
120
|
if model:
|
117
121
|
self.assign_predictions(model)
|
@@ -129,16 +133,19 @@ class VMDataset(VMInput):
|
|
129
133
|
excluded = [self.target_column, *self.extra_columns.flatten()]
|
130
134
|
self.feature_columns = [col for col in self.columns if col not in excluded]
|
131
135
|
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
.
|
141
|
-
|
136
|
+
# Get dtypes without loading data into memory
|
137
|
+
feature_dtypes = self._df[self.feature_columns].dtypes
|
138
|
+
|
139
|
+
self.feature_columns_numeric = feature_dtypes[
|
140
|
+
feature_dtypes.apply(lambda x: pd.api.types.is_numeric_dtype(x))
|
141
|
+
].index.tolist()
|
142
|
+
|
143
|
+
self.feature_columns_categorical = feature_dtypes[
|
144
|
+
feature_dtypes.apply(
|
145
|
+
lambda x: pd.api.types.is_categorical_dtype(x)
|
146
|
+
or pd.api.types.is_object_dtype(x)
|
147
|
+
)
|
148
|
+
].index.tolist()
|
142
149
|
|
143
150
|
def _add_column(self, column_name, column_values):
|
144
151
|
column_values = np.array(column_values)
|
@@ -397,8 +404,18 @@ class VMDataset(VMInput):
|
|
397
404
|
assert self.target_column not in columns
|
398
405
|
columns.append(self.target_column)
|
399
406
|
|
400
|
-
#
|
401
|
-
|
407
|
+
# Check if all columns in self._df are requested
|
408
|
+
all_columns = set(columns) == set(self._df.columns)
|
409
|
+
|
410
|
+
# For copy_data=False and all columns: return exact same DataFrame object
|
411
|
+
if not self._copy_data and all_columns:
|
412
|
+
return self._df
|
413
|
+
# For copy_data=False and subset of columns: return view with shared data
|
414
|
+
elif not self._copy_data:
|
415
|
+
return as_df(self._df[columns])
|
416
|
+
# For copy_data=True: return independent copy with duplicated data
|
417
|
+
else:
|
418
|
+
return as_df(self._df[columns]).copy()
|
402
419
|
|
403
420
|
@property
|
404
421
|
def x(self) -> np.ndarray:
|
@@ -522,9 +539,10 @@ class DataFrameDataset(VMDataset):
|
|
522
539
|
text_column: str = None,
|
523
540
|
target_class_labels: dict = None,
|
524
541
|
date_time_index: bool = False,
|
542
|
+
copy_data: bool = True,
|
525
543
|
):
|
526
544
|
"""
|
527
|
-
Initializes a DataFrameDataset instance.
|
545
|
+
Initializes a DataFrameDataset instance, preserving original pandas dtypes.
|
528
546
|
|
529
547
|
Args:
|
530
548
|
raw_dataset (pd.DataFrame): The raw dataset as a pandas DataFrame.
|
@@ -536,25 +554,44 @@ class DataFrameDataset(VMDataset):
|
|
536
554
|
text_column (str, optional): The text column name of the dataset for NLP tasks. Defaults to None.
|
537
555
|
target_class_labels (dict, optional): The class labels for the target columns. Defaults to None.
|
538
556
|
date_time_index (bool, optional): Whether to use date-time index. Defaults to False.
|
557
|
+
copy_data (bool, optional): Whether to create a copy of the input data. Defaults to True.
|
539
558
|
"""
|
559
|
+
|
560
|
+
VMInput.__init__(self)
|
561
|
+
|
562
|
+
self.input_id = input_id
|
563
|
+
|
540
564
|
index = None
|
541
565
|
if isinstance(raw_dataset.index, pd.Index):
|
542
566
|
index = raw_dataset.index.values
|
567
|
+
self.index = index
|
543
568
|
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
552
|
-
|
553
|
-
|
554
|
-
|
555
|
-
|
556
|
-
|
557
|
-
|
569
|
+
# Store the DataFrame directly
|
570
|
+
self._df = raw_dataset
|
571
|
+
|
572
|
+
if date_time_index:
|
573
|
+
self._df = convert_index_to_datetime(self._df)
|
574
|
+
|
575
|
+
self.columns = raw_dataset.columns.tolist()
|
576
|
+
self.column_aliases = {}
|
577
|
+
self.target_column = target_column
|
578
|
+
self.text_column = text_column
|
579
|
+
self.target_class_labels = target_class_labels
|
580
|
+
self.extra_columns = ExtraColumns.from_dict(extra_columns)
|
581
|
+
self._copy_data = copy_data
|
582
|
+
|
583
|
+
# Add warning when copy_data is False
|
584
|
+
if not copy_data:
|
585
|
+
logger.warning(
|
586
|
+
"Dataset initialized with copy_data=False. Changes to the original DataFrame "
|
587
|
+
"may affect this dataset. Use this option only when memory efficiency is critical "
|
588
|
+
"and you won't modify the source data."
|
589
|
+
)
|
590
|
+
|
591
|
+
self._set_feature_columns(feature_columns)
|
592
|
+
|
593
|
+
if model:
|
594
|
+
self.assign_predictions(model)
|
558
595
|
|
559
596
|
|
560
597
|
class PolarsDataset(VMDataset):
|
{validmind-2.8.20.dist-info → validmind-2.8.22.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
validmind/__init__.py,sha256=qmC6WY6ifIQpCU38V91EN91UlIEcOG7a9jWK3smZJoY,4220
|
2
|
-
validmind/__version__.py,sha256=
|
2
|
+
validmind/__version__.py,sha256=KYh5XihUKlbKxUpWkhRlJg_POPRyBFW9RUgXYfdQc0s,23
|
3
3
|
validmind/ai/test_descriptions.py,sha256=eBF09MAyqAAD-Ah7vxXVRbHxOmGx5_10ZkoJmMvEaEA,7123
|
4
4
|
validmind/ai/utils.py,sha256=O5gTkvGsPCCdKCdBGvpDaJM1oL_msdm2xKkf9fFpIy8,4172
|
5
5
|
validmind/api_client.py,sha256=slvf0FJ8olYsK1-EPetMVYV7UvhjMAIFBxwBWSVT9BI,16807
|
6
|
-
validmind/client.py,sha256=
|
6
|
+
validmind/client.py,sha256=XKb4uc7yXVV_3NH9-zTrS9jCbLPX2zZZU12vKKlSpIc,19049
|
7
7
|
validmind/client_config.py,sha256=O1gopTaNADM4ZVPj383AJTjcpjdxyEvUQY5cFt7nbIs,1366
|
8
8
|
validmind/datasets/__init__.py,sha256=c0hQZN_6GrUEJxdFHdQaEsQrSYNABG84ZCY0H-PzOZk,260
|
9
9
|
validmind/datasets/classification/__init__.py,sha256=p2p9r3SE4FIm4WNNJb4-axnkD5EexVW9VxIRYTbA6dI,1941
|
@@ -73,7 +73,7 @@ validmind/models/pipeline.py,sha256=nSskKWxaS4SGmx_B0IAvS5ogDZyh6tdx_aUkyxSXt88,
|
|
73
73
|
validmind/models/pytorch.py,sha256=aAEUWtISwLh-PMvHkcLwBEbBStAByt4J-NpK-Ndv38E,1826
|
74
74
|
validmind/models/r_model.py,sha256=TPUwPmxz3cNzJ1bAA5vz6P9xS6deVcLTuIO1e7rD1vY,7306
|
75
75
|
validmind/models/sklearn.py,sha256=lOCJlP2wvd5IJHtBS1XG9FXrtIvO_f8xm2Qp1UdsiBw,2406
|
76
|
-
validmind/template.py,sha256=
|
76
|
+
validmind/template.py,sha256=ezmHLee5QkccBf4n8iBFcXw7EMljSceGgGJb3O0c4cE,7902
|
77
77
|
validmind/test_suites/__init__.py,sha256=ofNaXD2SdkHko_Fy_RTr29YBHPGWiXVzHd1nENwi2ms,6973
|
78
78
|
validmind/test_suites/classifier.py,sha256=0ZE3z5X_ZewTvmwQ3cVGJQh7dPgg0IlqcQshJJxCFWQ,4003
|
79
79
|
validmind/test_suites/cluster.py,sha256=Wc2NViwivjiuiJMwrnGbOJYeZ3ApN8usWlOPYZYWAgE,2276
|
@@ -309,7 +309,7 @@ validmind/unit_metrics/regression/RootMeanSquaredError.py,sha256=uIDsSpy75Z7W3zu
|
|
309
309
|
validmind/utils.py,sha256=59WWVV_JhvxzPr8Y625qw_wsyu_ZVRoLJyi0Pw3MTMU,28613
|
310
310
|
validmind/vm_models/__init__.py,sha256=lcqf9q2aRzrVrNN6R--81IkrnSa6BXPbhJ8SnkT_hcI,702
|
311
311
|
validmind/vm_models/dataset/__init__.py,sha256=U4CxZjdoc0dd9u2AqBl5PJh1UVbzXWNrmundmjLF-qE,346
|
312
|
-
validmind/vm_models/dataset/dataset.py,sha256=
|
312
|
+
validmind/vm_models/dataset/dataset.py,sha256=lP0XLncHV6V5P-DG-Zs__wgirlUrEHokST-dQpb81Ro,28038
|
313
313
|
validmind/vm_models/dataset/utils.py,sha256=g6mBPrBmVYf8wJAlTxeg9DTiNvts4ZaaT5mbnQAPWaU,5638
|
314
314
|
validmind/vm_models/figure.py,sha256=ZMO_nIIleNhkBV1vJeF_UUsVDCzrXNOYwV1Lbg9E0XY,6303
|
315
315
|
validmind/vm_models/input.py,sha256=nTBQB6aqirhF-0Gmg5mYc4_vNyypvbYUfahMovcK02M,1095
|
@@ -323,8 +323,8 @@ validmind/vm_models/test_suite/runner.py,sha256=JqW8LW4X1Ri2C6wSsAGSki-JxGUGV8zm
|
|
323
323
|
validmind/vm_models/test_suite/summary.py,sha256=7P4zhfeU7a3I1MMBn8f7s-2lzdAz7U4y6LblpR89_vE,5401
|
324
324
|
validmind/vm_models/test_suite/test.py,sha256=C8xPGKSyYF9oMJ3VegwFJDF7cwYlIgtQoQ7nzXIS1uc,3914
|
325
325
|
validmind/vm_models/test_suite/test_suite.py,sha256=CciC6IhrLEeWwcpY3Np8EmQCB8XEF2ljwEXcvmNYgZc,5090
|
326
|
-
validmind-2.8.
|
327
|
-
validmind-2.8.
|
328
|
-
validmind-2.8.
|
329
|
-
validmind-2.8.
|
330
|
-
validmind-2.8.
|
326
|
+
validmind-2.8.22.dist-info/LICENSE,sha256=XonPUfwjvrC5Ombl3y-ko0Wubb1xdG_7nzvIbkZRKHw,35772
|
327
|
+
validmind-2.8.22.dist-info/METADATA,sha256=cLg53_Ec1ZxQQ1eiIsBugTOh8e2_wjgCnPfWBWAEuZc,6032
|
328
|
+
validmind-2.8.22.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
|
329
|
+
validmind-2.8.22.dist-info/entry_points.txt,sha256=HuW7YyOv9u_OEWpViQXtv0nfoI67uieJHawKWA4Hv9A,76
|
330
|
+
validmind-2.8.22.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|