molcraft 0.1.0a13__py3-none-any.whl → 0.1.0a14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of molcraft might be problematic.
- molcraft/__init__.py +1 -1
- molcraft/datasets.py +88 -80
- molcraft/models.py +6 -7
- {molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/METADATA +1 -1
- {molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/RECORD +8 -8
- {molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/WHEEL +0 -0
- {molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/licenses/LICENSE +0 -0
- {molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/top_level.txt +0 -0
molcraft/__init__.py
CHANGED
molcraft/datasets.py
CHANGED
@@ -1,123 +1,131 @@
 import numpy as np
 import pandas as pd
+import typing


 def split(
     data: pd.DataFrame | np.ndarray,
+    *,
     train_size: float | None = None,
     validation_size: float | None = None,
-    test_size: float =
+    test_size: float | None = None,
+    groups: str | np.ndarray = None,
     shuffle: bool = False,
     random_state: int | None = None,
-) ->
-    """Splits dataset into subsets.
+) -> tuple[np.ndarray | pd.DataFrame, ...]:
+    """Splits the dataset into subsets.

     Args:
         data:
             A pd.DataFrame or np.ndarray object.
         train_size:
-
+            The size of the train set.
         validation_size:
-
+            The size of the validation set.
         test_size:
-
+            The size of the test set.
+        groups:
+            The groups to perform the splitting on.
         shuffle:
             Whether the dataset should be shuffled prior to splitting.
         random_state:
-            The random state
+            The random state/seed. Only applicable if shuffling.
     """
+    if not isinstance(data, (pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unsupported `data` type ({type(data)}).')

-    if
-
-
-
-        )
-
-    size = len(data)
+    if isinstance(groups, str):
+        groups = data[groups].values
+    elif groups is None:
+        groups = np.arange(len(data))

-
-
-
-
-            f'Test size needs to be positive. Found: {test_size}. '
-            'Either specify a positive `float` (fraction) or '
-            'a positive `int` (size).'
-        )
-    if train_size is not None and train_size <= 0:
-        raise ValueError(
-            f'Train size needs to be None or positive. Found: {train_size}. '
-            'Either specify `None`, a positive `float` (fraction) or '
-            'a positive `int` (size).'
-        )
-    if validation_size is not None and validation_size <= 0:
+    indices = np.unique(groups)
+    size = len(indices)
+
+    if not train_size and not test_size:
         raise ValueError(
-            f'
-            '
-            'a positive `int` (size).'
+            f'Found both `train_size` and `test_size` to be `None`, '
+            f'specify at least one of them.'
         )
-
     if isinstance(test_size, float):
         test_size = int(size * test_size)
-    if
+    if isinstance(train_size, float):
+        train_size = int(size * train_size)
+    if isinstance(validation_size, float):
         validation_size = int(size * validation_size)
     elif not validation_size:
         validation_size = 0

-    if train_size and isinstance(train_size, float):
-        train_size = int(size * train_size)
-    elif not train_size:
-        train_size = 0
-
     if not train_size:
-        train_size = size - test_size
-
-
-
+        train_size = (size - test_size - validation_size)
+    if not test_size:
+        test_size = (size - train_size - validation_size)
+
     remainder = size - (train_size + validation_size + test_size)
-
     if remainder < 0:
         raise ValueError(
-            '
-            f'{size} < ({train_size} + {validation_size} + {test_size})'
+            f'subset sizes added up to more than the data size.'
         )
-    if test_size <= 0:
-        raise ValueError(
-            f'Test size needs to be greater than 0. Found: {test_size}.'
-        )
-    if train_size <= 0:
-        raise ValueError(
-            f'Train size needs to be greater than 0. Found: {train_size}.'
-        )
-
     train_size += remainder

-    if
-
-
-            frac=1.0, replace=False, random_state=random_state
-        )
-        train_data = data.iloc[:train_size]
-        test_data = data.iloc[-test_size:]
-        if not validation_size:
-            return train_data, test_data
-        validation_data = data.iloc[train_size:-test_size]
-        return train_data, validation_data, test_data
-
-    if not isinstance(data, np.ndarray):
-        data = np.asarray(data)
-
-    np.random.seed(random_state)
-
-    random_indices = np.arange(size)
-    np.random.shuffle(random_indices)
-    data = data[random_indices]
+    if shuffle:
+        np.random.seed(random_state)
+        np.random.shuffle(indices)

-
-
+    train_mask = np.isin(groups, indices[:train_size])
+    test_mask = np.isin(groups, indices[-test_size:])
     if not validation_size:
-        return
-
-    return
+        return data[train_mask], data[test_mask]
+    validation_mask = np.isin(groups, indices[train_size:-test_size])
+    return data[train_mask], data[validation_mask], data[test_mask]

+def cv_split(
+    data: pd.DataFrame | np.ndarray,
+    num_splits: int = 10,
+    groups: str | np.ndarray = None,
+    shuffle: bool = False,
+    random_state: int | None = None,
+) -> typing.Iterator[
+    tuple[np.ndarray | pd.DataFrame, np.ndarray | pd.DataFrame]
+]:
+    """Splits the dataset into cross-validation folds.

+    Args:
+        data:
+            A pd.DataFrame or np.ndarray object.
+        num_splits:
+            The number of cross-validation folds.
+        groups:
+            The groups to perform the splitting on.
+        shuffle:
+            Whether the dataset should be shuffled prior to splitting.
+        random_state:
+            The random state/seed. Only applicable if shuffling.
+    """
+    if not isinstance(data, (pd.DataFrame, np.ndarray)):
+        raise ValueError(f'Unsupported `data` type ({type(data)}).')
+
+    if isinstance(groups, str):
+        groups = data[groups].values
+    elif groups is None:
+        groups = np.arange(len(data))
+
+    indices = np.unique(groups)
+    size = len(indices)

+    if num_splits > size:
+        raise ValueError(
+            f'`num_splits` ({num_splits}) must not be greater than'
+            f'the data size or the number of groups ({size}).'
+        )
+    if shuffle:
+        np.random.seed(random_state)
+        np.random.shuffle(indices)
+
+    indices_splits = np.array_split(indices, num_splits)
+
+    for k in range(num_splits):
+        test_indices = indices_splits[k]
+        test_mask = np.isin(groups, test_indices)
+        train_mask = ~test_mask
+        yield data[train_mask], data[test_mask]
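Taken together, the new datasets API makes the subset sizes keyword-only, accepts an optional `groups` argument (a column name or array) so that rows sharing a group always land in the same subset, and adds a `cv_split` generator for grouped cross-validation folds. Below is a minimal usage sketch against the new signatures; the DataFrame and its `scaffold` grouping column are purely illustrative and not part of the package.

import numpy as np
import pandas as pd
from molcraft import datasets

# Illustrative data only; 'scaffold' is a hypothetical grouping column.
df = pd.DataFrame({
    'smiles': ['CCO', 'CCN', 'CCC', 'CCCl', 'CCBr', 'CCI'],
    'scaffold': ['a', 'a', 'b', 'b', 'c', 'c'],
    'y': np.arange(6.0),
})

# Sizes are now keyword-only; rows sharing a scaffold stay in the same subset.
train_df, test_df = datasets.split(
    df, train_size=0.8, test_size=0.2, groups='scaffold',
    shuffle=True, random_state=42,
)

# Grouped cross-validation folds from the new cv_split generator.
for fold, (train_fold, test_fold) in enumerate(
        datasets.cv_split(df, num_splits=3, groups='scaffold')):
    print(fold, len(train_fold), len(test_fold))

Note that fractional sizes are interpreted relative to the number of unique groups (or rows, when no groups are given), not the number of rows, since the new code computes sizes from np.unique(groups).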
molcraft/models.py
CHANGED
@@ -250,7 +250,7 @@ class GraphModel(layers.GraphLayer, keras.models.Model):
             val_size = int(val_split * x.num_subgraphs)
             x_val = _make_dataset(x[-val_size:], batch_size)
             x = x[:-val_size]
-        x = _make_dataset(x, batch_size)
+        x = _make_dataset(x, batch_size, shuffle=kwargs.get('shuffle', True))
         return super().fit(x, validation_data=x_val, **kwargs)

     def evaluate(self, x: tensors.GraphTensor | tf.data.Dataset, **kwargs):
@@ -561,9 +561,8 @@ def _functional_init_arguments(args, kwargs):
         or ("inputs" in kwargs and "outputs" in kwargs)
     )

-def _make_dataset(x: tensors.GraphTensor, batch_size: int):
-
-
-        .
-
-    )
+def _make_dataset(x: tensors.GraphTensor, batch_size: int, shuffle: bool = False):
+    ds = tf.data.Dataset.from_tensor_slices(x)
+    if shuffle:
+        ds = ds.shuffle(buffer_size=ds.cardinality())
+    return ds.batch(batch_size).prefetch(-1)
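The net effect of the models.py change is that `fit` now forwards a `shuffle` flag (defaulting to True) into `_make_dataset`, which shuffles the tf.data pipeline over its full cardinality before batching and prefetching. The following is a standalone sketch of that same tf.data pattern using plain tensors in place of a molcraft GraphTensor; the `make_dataset` helper here is hypothetical and not part of the package.

import tensorflow as tf

def make_dataset(x, batch_size: int, shuffle: bool = False) -> tf.data.Dataset:
    # Same pattern as the diff: slice, optionally shuffle the whole dataset
    # (buffer_size equals the dataset cardinality), then batch and prefetch.
    ds = tf.data.Dataset.from_tensor_slices(x)
    if shuffle:
        ds = ds.shuffle(buffer_size=ds.cardinality())
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)  # AUTOTUNE == -1

# Example with plain tensors standing in for graph data.
features = tf.random.normal([10, 4])
for batch in make_dataset(features, batch_size=4, shuffle=True):
    print(batch.shape)  # (4, 4), (4, 4), (2, 4)

Using a shuffle buffer equal to the cardinality gives a full-dataset reshuffle each epoch, at the cost of buffering every element in memory.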
{molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/RECORD
CHANGED
@@ -1,21 +1,21 @@
-molcraft/__init__.py,sha256=
+molcraft/__init__.py,sha256=lReyUDRgBySoe9LPZzlwv1N_x9unwr6nHxIU70u3mLU,464
 molcraft/callbacks.py,sha256=x5HnkZhqcFRrW6xdApt_jZ4X08A-0fxcnFKfdmRKa0c,3571
 molcraft/chem.py,sha256=--4AdZV0TCj_cf5i-TRidNJGSFyab1ksUEMjmDi7zaM,21837
 molcraft/conformers.py,sha256=K6ZtiSUNDN_fwqGP9JrPcwALLFFvlMlF_XejEJH3Sr4,4205
-molcraft/datasets.py,sha256=
+molcraft/datasets.py,sha256=QKHi9SUBKvJvdkRFmRQNowhrnu35pQqtujuLatOK8bE,4151
 molcraft/descriptors.py,sha256=jJpT0XWu3Tx_bxnwk1rENySRkaM8cMDMaDIjG8KKvtg,3097
 molcraft/features.py,sha256=GwOecLCNUIuGfbIVzsAJH4LikkzWMKj5IT7zSgGTttU,13846
 molcraft/featurizers.py,sha256=QiyNEFRJdMcKZM-gJGHU6Soy300RWDtLeYw0QEkFG20,27129
 molcraft/layers.py,sha256=cUpo9dqqNEnc7rNf-Dze8adFhOkTV5F9IhHOKs13OUI,60134
 molcraft/losses.py,sha256=qnS2yC5g-O3n_zVea9MR6TNiFraW2yqRgePOisoUP4A,1065
-molcraft/models.py,sha256=
+molcraft/models.py,sha256=h9cRAdCOU-_UAxROC9Utuz4AR4HfFE9QqJ4geLYlynE,21878
 molcraft/ops.py,sha256=TaAD26V-b7eSNKFKswWt9IExSgIBOmLqwlPPcdpt8wk,5496
 molcraft/records.py,sha256=MbvYkcCunbAmpy_MWXmQ9WBGi2WvwxFUlwQSPKPvSSk,5534
 molcraft/tensors.py,sha256=EOUKx496KUZsjA1zA2ABc7tU_TW3Jv7AXDsug_QsLbA,22407
 molcraft/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 molcraft/apps/peptides.py,sha256=N5wJDGDIDRbmOmxin_dTY-odLqb0avAX9FU22U6x6c0,14576
-molcraft-0.1.
-molcraft-0.1.
-molcraft-0.1.
-molcraft-0.1.
-molcraft-0.1.
+molcraft-0.1.0a14.dist-info/licenses/LICENSE,sha256=sbVeqlrtZ0V63uYhZGL5dCxUm8rBAOqe2avyA1zIQNk,1074
+molcraft-0.1.0a14.dist-info/METADATA,sha256=1Op3VxuV9hkciALrrOXx2KnGShFI5a9n_XbhT-oPpKI,3893
+molcraft-0.1.0a14.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+molcraft-0.1.0a14.dist-info/top_level.txt,sha256=dENV6MfOceshM6MQCgJlcN1ojZkiCL9B4F7XyUge3QM,9
+molcraft-0.1.0a14.dist-info/RECORD,,
{molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/WHEEL
File without changes
{molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/licenses/LICENSE
File without changes
{molcraft-0.1.0a13.dist-info → molcraft-0.1.0a14.dist-info}/top_level.txt
File without changes