chemotools 0.1.4__py3-none-any.whl → 0.1.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- chemotools/datasets/_base.py +78 -25
- chemotools/feature_selection/_range_cut.py +3 -0
- {chemotools-0.1.4.dist-info → chemotools-0.1.5.dist-info}/METADATA +4 -2
- {chemotools-0.1.4.dist-info → chemotools-0.1.5.dist-info}/RECORD +9 -9
- tests/test_datasets.py +73 -5
- tests/test_functionality.py +15 -1
- {chemotools-0.1.4.dist-info → chemotools-0.1.5.dist-info}/LICENSE +0 -0
- {chemotools-0.1.4.dist-info → chemotools-0.1.5.dist-info}/WHEEL +0 -0
- {chemotools-0.1.4.dist-info → chemotools-0.1.5.dist-info}/top_level.txt +0 -0
chemotools/datasets/_base.py
CHANGED
@@ -1,14 +1,22 @@
|
|
1
|
-
import pandas as pd
|
2
1
|
import os
|
3
2
|
|
3
|
+
|
4
|
+
import pandas as pd
|
5
|
+
import polars as pl
|
6
|
+
|
4
7
|
PACKAGE_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
|
5
8
|
|
6
9
|
|
7
|
-
def load_fermentation_train():
|
10
|
+
def load_fermentation_train(set_output="pandas"):
|
8
11
|
"""
|
9
|
-
Loads the training data of the fermentation dataset. This data corresponds to a synthetic dataset measured
|
12
|
+
Loads the training data of the fermentation dataset. This data corresponds to a synthetic dataset measured
|
10
13
|
off-line. This dataset is designed to represent the variability of real fermentation data.
|
11
14
|
|
15
|
+
Arguments
|
16
|
+
-------
|
17
|
+
set_output: str, default='pandas'
|
18
|
+
The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
|
19
|
+
|
12
20
|
Returns
|
13
21
|
-------
|
14
22
|
train_spectra: pd.DataFrame A pandas DataFrame containing the synthetic spectra measured to train the model.
|
@@ -20,17 +28,32 @@ def load_fermentation_train():
|
|
20
28
|
Mauricio Iglesias Miguel, Gernaey Krist V. Transforming data into information:
|
21
29
|
A parallel hybrid model for real-time state estimation in lignocellulose ethanol fermentations.
|
22
30
|
"""
|
23
|
-
|
24
|
-
|
25
|
-
|
31
|
+
if set_output == "pandas":
|
32
|
+
train_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
|
33
|
+
train_spectra.columns = train_spectra.columns.astype(float)
|
34
|
+
train_hplc = pd.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
|
35
|
+
return train_spectra, train_hplc
|
26
36
|
|
27
|
-
|
37
|
+
if set_output == "polars":
|
38
|
+
train_spectra = pl.read_csv(PACKAGE_DIRECTORY + "/data/train_spectra.csv")
|
39
|
+
train_hplc = pl.read_csv(PACKAGE_DIRECTORY + "/data/train_hplc.csv")
|
40
|
+
return train_spectra, train_hplc
|
28
41
|
|
42
|
+
else:
|
43
|
+
raise ValueError(
|
44
|
+
"Invalid value for set_output. Please use 'pandas' or 'polars'."
|
45
|
+
)
|
29
46
|
|
30
|
-
|
47
|
+
|
48
|
+
def load_fermentation_test(set_output="pandas"):
|
31
49
|
"""
|
32
50
|
Loads the testing data of the fermentation dataset. This data corresponds to real fermentation data measured
|
33
|
-
on-line during a fermentation process.
|
51
|
+
on-line during a fermentation process.
|
52
|
+
|
53
|
+
Arguments
|
54
|
+
-------
|
55
|
+
set_output: str, default='pandas'
|
56
|
+
The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
|
34
57
|
|
35
58
|
Returns
|
36
59
|
-------
|
@@ -43,27 +66,57 @@ def load_fermentation_test():
|
|
43
66
|
Mauricio Iglesias Miguel, Gernaey Krist V. Transforming data into information:
|
44
67
|
A parallel hybrid model for real-time state estimation in lignocellulose ethanol fermentations.
|
45
68
|
"""
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
69
|
+
if set_output == "pandas":
|
70
|
+
fermentation_spectra = pd.read_csv(
|
71
|
+
PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
|
72
|
+
)
|
73
|
+
fermentation_spectra.columns = fermentation_spectra.columns.astype(float)
|
74
|
+
fermentation_hplc = pd.read_csv(
|
75
|
+
PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv"
|
76
|
+
)
|
77
|
+
return fermentation_spectra, fermentation_hplc
|
78
|
+
|
79
|
+
if set_output == "polars":
|
80
|
+
fermentation_spectra = pl.read_csv(
|
81
|
+
PACKAGE_DIRECTORY + "/data/fermentation_spectra.csv"
|
82
|
+
)
|
83
|
+
fermentation_hplc = pl.read_csv(
|
84
|
+
PACKAGE_DIRECTORY + "/data/fermentation_hplc.csv"
|
85
|
+
)
|
86
|
+
return fermentation_spectra, fermentation_hplc
|
87
|
+
|
88
|
+
else:
|
89
|
+
raise ValueError(
|
90
|
+
"Invalid value for set_output. Please use 'pandas' or 'polars'."
|
91
|
+
)
|
92
|
+
|
93
|
+
|
94
|
+
def load_coffee(set_output="pandas"):
|
56
95
|
"""
|
57
|
-
Loads the coffee dataset. This data corresponds to a coffee spectra from three different origins
|
96
|
+
Loads the coffee dataset. This data corresponds to a coffee spectra from three different origins
|
58
97
|
measured off-line using attenuated total reflectance Fourier transform infrared spectroscopy (ATR-FTIR).
|
59
98
|
|
99
|
+
Arguments
|
100
|
+
-------
|
101
|
+
set_output: str, default='pandas'
|
102
|
+
The output format of the data. It can be 'pandas' or 'polars'. If 'polars', the data is returned as a polars DataFrame.
|
103
|
+
|
60
104
|
Returns
|
61
105
|
-------
|
62
106
|
coffee_spectra: pd.DataFrame A pandas DataFrame containing the coffee spectra.
|
63
107
|
coffee_labels: pd.DataFrame A pandas DataFrame containing the corresponding labels.
|
64
108
|
"""
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
109
|
+
if set_output == "pandas":
|
110
|
+
coffee_spectra = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
|
111
|
+
coffee_labels = pd.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")
|
112
|
+
return coffee_spectra, coffee_labels
|
113
|
+
|
114
|
+
if set_output == "polars":
|
115
|
+
coffee_spectra = pl.read_csv(PACKAGE_DIRECTORY + "/data/coffee_spectra.csv")
|
116
|
+
coffee_labels = pl.read_csv(PACKAGE_DIRECTORY + "/data/coffee_labels.csv")
|
117
|
+
return coffee_spectra, coffee_labels
|
118
|
+
|
119
|
+
else:
|
120
|
+
raise ValueError(
|
121
|
+
"Invalid value for set_output. Please use 'pandas' or 'polars'."
|
122
|
+
)
|
@@ -34,6 +34,8 @@ class RangeCut(BaseEstimator, SelectorMixin):
|
|
34
34
|
end_index_ : int
|
35
35
|
The index of the end of the range. It is -1 if the wavenumbers are not provided.
|
36
36
|
|
37
|
+
wavenumbers_ : array-like
|
38
|
+
The cut wavenumbers of the input data.
|
37
39
|
|
38
40
|
Methods
|
39
41
|
-------
|
@@ -75,6 +77,7 @@ class RangeCut(BaseEstimator, SelectorMixin):
|
|
75
77
|
if self.wavenumbers is None:
|
76
78
|
self.start_index_ = self.start
|
77
79
|
self.end_index_ = self.end
|
80
|
+
self.wavenumbers_ = None
|
78
81
|
else:
|
79
82
|
self.start_index_ = self._find_index(self.start)
|
80
83
|
self.end_index_ = self._find_index(self.end)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: chemotools
|
3
|
-
Version: 0.1.4
|
3
|
+
Version: 0.1.5
|
4
4
|
Summary: Package to integrate chemometrics in scikit-learn pipelines
|
5
5
|
Home-page: https://github.com/paucablop/chemotools
|
6
6
|
Author: Pau Cabaneros Lopez
|
@@ -14,8 +14,10 @@ Description-Content-Type: text/markdown
|
|
14
14
|
License-File: LICENSE
|
15
15
|
Requires-Dist: numpy
|
16
16
|
Requires-Dist: pandas
|
17
|
+
Requires-Dist: polars
|
18
|
+
Requires-Dist: pyarrow
|
17
19
|
Requires-Dist: scipy
|
18
|
-
Requires-Dist: scikit-learn
|
20
|
+
Requires-Dist: scikit-learn >=1.4.0
|
19
21
|
|
20
22
|

|
21
23
|
|
@@ -16,7 +16,7 @@ chemotools/baseline/_non_negative.py,sha256=SyiS_-cfnypLXY3gC80oo7doqXUlHAAgmwrk
|
|
16
16
|
chemotools/baseline/_polynomial_correction.py,sha256=0w9qA_w5dc9IIv5KMmAOZ06hWDuk-uyealsTaZX2qgw,3749
|
17
17
|
chemotools/baseline/_subtract_reference.py,sha256=vfre6Z-bgDCwwl3VnpahmGJTBFJVK9HGBrUsjfl2O9o,3135
|
18
18
|
chemotools/datasets/__init__.py,sha256=ojqxb-C_eDmizwUqVCJ8BqJxwULD7_hWCyVIA1uRO0c,116
|
19
|
-
chemotools/datasets/_base.py,sha256=
|
19
|
+
chemotools/datasets/_base.py,sha256=ftAmf2jHWUW_YQobXCsIFC617PeXwsmZIwAgab9EvL8,4890
|
20
20
|
chemotools/datasets/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
21
21
|
chemotools/datasets/data/coffee_labels.csv,sha256=ZXQWQIf8faLHjdnHfRoXfxMR56kq9Q1BGPZBkQyhGlY,487
|
22
22
|
chemotools/datasets/data/coffee_spectra.csv,sha256=VA-sN4u0hC5iALlRxxkj-K87Lz3b3mmUHBJPoDXychI,2206147
|
@@ -29,7 +29,7 @@ chemotools/derivative/_norris_william.py,sha256=NKmuo95vNWHQOdcww7APU9Z4s1wWExIR
|
|
29
29
|
chemotools/derivative/_savitzky_golay.py,sha256=5At4sexJH0RvjkrvVfJvhIfaxXD3vE4Ozq1VClb3qlU,3417
|
30
30
|
chemotools/feature_selection/__init__.py,sha256=p47SuyI7jMpV7kiaAsv2hA20smKf5Yo6447LfrNdDhY,76
|
31
31
|
chemotools/feature_selection/_index_selector.py,sha256=2z2aAyMUOuP7x1n19RV5JGf6ZcM3mtJZby8tEgBOix4,3379
|
32
|
-
chemotools/feature_selection/_range_cut.py,sha256=
|
32
|
+
chemotools/feature_selection/_range_cut.py,sha256=ikWW9FhsbyzijSUYTcx048eOyK65mdbfOuFRF_Ee3rk,3424
|
33
33
|
chemotools/scale/__init__.py,sha256=CQPUPx-8pUeHHbN9p5smFro3xtl_UEE0YeXHLVd7Lfk,118
|
34
34
|
chemotools/scale/_min_max_scaler.py,sha256=-Wnr7zW-zmW6nR5J5yPdBm1KNuQDa9w27Un7rAr-s8E,2806
|
35
35
|
chemotools/scale/_norm_scaler.py,sha256=bjMg1-x2I1xZmmbIgl4vXZZweJV-w3Euta0KGff_2Gk,2363
|
@@ -48,11 +48,11 @@ chemotools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
|
|
48
48
|
chemotools/utils/check_inputs.py,sha256=fRAV4HIaGamdj_PNXSNnl7LurXytACNTGO51rhPpMUY,512
|
49
49
|
tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
50
50
|
tests/fixtures.py,sha256=Xa-Vd62Kd1fyWg3PLUSP6iIkOK8etrbyOkMJTn3dvX8,1933
|
51
|
-
tests/test_datasets.py,sha256=
|
52
|
-
tests/test_functionality.py,sha256=
|
51
|
+
tests/test_datasets.py,sha256=ZdyjSJVX-iJyz8SoRgFfRLP9-ajNEyqWxs00ZfIv0eo,2712
|
52
|
+
tests/test_functionality.py,sha256=v8dH7TPA2D-5byl1nwpPW9ejx1Fzd5QsKuQQ4aouCjo,21707
|
53
53
|
tests/test_sklearn_compliance.py,sha256=CRB_0X9HRGj0pOpUCmiSHwJkCsVB-yK_apsyUONmfmw,5856
|
54
|
-
chemotools-0.1.4.dist-info/LICENSE,sha256=qtyOy2wDQVX9hxp58h3T-6Lmfv-mSCHoSRkcLUdM9bg,1070
|
55
|
-
chemotools-0.1.
|
56
|
-
chemotools-0.1.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
57
|
-
chemotools-0.1.4.dist-info/top_level.txt,sha256=eNcNcKSdo-1H_2gwSDrS__dr7BM3R73Cnn-pBiW5FEw,17
|
58
|
-
chemotools-0.1.4.dist-info/RECORD,,
|
54
|
+
chemotools-0.1.5.dist-info/LICENSE,sha256=qtyOy2wDQVX9hxp58h3T-6Lmfv-mSCHoSRkcLUdM9bg,1070
|
55
|
+
chemotools-0.1.5.dist-info/METADATA,sha256=s3KJEhQ3jgq6DPl7PW5Hl3x9f5kKyCDi-Cedon48DDA,5071
|
56
|
+
chemotools-0.1.5.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
57
|
+
chemotools-0.1.5.dist-info/top_level.txt,sha256=eNcNcKSdo-1H_2gwSDrS__dr7BM3R73Cnn-pBiW5FEw,17
|
58
|
+
chemotools-0.1.5.dist-info/RECORD,,
|
tests/test_datasets.py
CHANGED
@@ -1,9 +1,15 @@
|
|
1
1
|
import pandas as pd
|
2
|
+
import polars as pl
|
3
|
+
import pytest
|
2
4
|
|
3
|
-
from chemotools.datasets import
|
5
|
+
from chemotools.datasets import (
|
6
|
+
load_coffee,
|
7
|
+
load_fermentation_test,
|
8
|
+
load_fermentation_train,
|
9
|
+
)
|
4
10
|
|
5
11
|
|
6
|
-
def test_load_coffee():
|
12
|
+
def test_load_coffee_pandas():
|
7
13
|
# Arrange
|
8
14
|
|
9
15
|
# Act
|
@@ -16,7 +22,28 @@ def test_load_coffee():
|
|
16
22
|
assert isinstance(coffee_labels, pd.DataFrame)
|
17
23
|
|
18
24
|
|
19
|
-
def test_load_fermentation_test():
|
25
|
+
def test_load_coffee_polars():
|
26
|
+
# Arrange
|
27
|
+
|
28
|
+
# Act
|
29
|
+
coffee_spectra, coffee_labels = load_coffee(set_output="polars")
|
30
|
+
|
31
|
+
# Assert
|
32
|
+
assert coffee_spectra.shape == (60, 1841)
|
33
|
+
assert coffee_labels.shape == (60, 1)
|
34
|
+
assert isinstance(coffee_spectra, pl.DataFrame)
|
35
|
+
assert isinstance(coffee_labels, pl.DataFrame)
|
36
|
+
|
37
|
+
|
38
|
+
def test_load_coffee_exception():
|
39
|
+
# Arrange
|
40
|
+
|
41
|
+
# Act and Assert
|
42
|
+
with pytest.raises(ValueError):
|
43
|
+
coffee_spectra, coffee_labels = load_coffee(set_output="plars")
|
44
|
+
|
45
|
+
|
46
|
+
def test_load_fermentation_test_pandas():
|
20
47
|
# Arrange
|
21
48
|
|
22
49
|
# Act
|
@@ -28,7 +55,29 @@ def test_load_fermentation_test():
|
|
28
55
|
assert isinstance(test_spectra, pd.DataFrame)
|
29
56
|
assert isinstance(test_hplc, pd.DataFrame)
|
30
57
|
|
31
|
-
|
58
|
+
|
59
|
+
def test_load_fermentation_test_polars():
|
60
|
+
# Arrange
|
61
|
+
|
62
|
+
# Act
|
63
|
+
test_spectra, test_hplc = load_fermentation_test(set_output="polars")
|
64
|
+
|
65
|
+
# Assert
|
66
|
+
assert test_spectra.shape == (1629, 1047)
|
67
|
+
assert test_hplc.shape == (34, 6)
|
68
|
+
assert isinstance(test_spectra, pl.DataFrame)
|
69
|
+
assert isinstance(test_hplc, pl.DataFrame)
|
70
|
+
|
71
|
+
|
72
|
+
def test_load_fermentation_test_exception():
|
73
|
+
# Arrange
|
74
|
+
|
75
|
+
# Act and Assert
|
76
|
+
with pytest.raises(ValueError):
|
77
|
+
test_spectra, test_hplc = load_fermentation_test(set_output="plars")
|
78
|
+
|
79
|
+
|
80
|
+
def test_load_fermentation_train_pandas():
|
32
81
|
# Arrange
|
33
82
|
|
34
83
|
# Act
|
@@ -40,4 +89,23 @@ def test_load_fermentation_train():
|
|
40
89
|
assert isinstance(train_spectra, pd.DataFrame)
|
41
90
|
assert isinstance(train_hplc, pd.DataFrame)
|
42
91
|
|
43
|
-
|
92
|
+
|
93
|
+
def test_load_fermentation_train_polars():
|
94
|
+
# Arrange
|
95
|
+
|
96
|
+
# Act
|
97
|
+
train_spectra, train_hplc = load_fermentation_train(set_output="polars")
|
98
|
+
|
99
|
+
# Assert
|
100
|
+
assert train_spectra.shape == (21, 1047)
|
101
|
+
assert train_hplc.shape == (21, 1)
|
102
|
+
assert isinstance(train_spectra, pl.DataFrame)
|
103
|
+
assert isinstance(train_hplc, pl.DataFrame)
|
104
|
+
|
105
|
+
|
106
|
+
def test_load_fermentation_train_exception():
|
107
|
+
# Arrange
|
108
|
+
|
109
|
+
# Act and Assert
|
110
|
+
with pytest.raises(ValueError):
|
111
|
+
train_spectra, train_hplc = load_fermentation_train(set_output="plars")
|
tests/test_functionality.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
import numpy as np
|
2
2
|
import pandas as pd
|
3
|
+
import polars as pl
|
3
4
|
import pytest
|
4
5
|
|
5
6
|
from chemotools.augmentation import (
|
@@ -625,7 +626,7 @@ def test_range_cut_by_wavenumber_with_list():
|
|
625
626
|
assert range_cut.wavenumbers_ == [2, 3, 4, 5, 6, 7]
|
626
627
|
|
627
628
|
|
628
|
-
def test_range_cut_by_wavenumber_with_dataframe():
|
629
|
+
def test_range_cut_by_wavenumber_with_pandas_dataframe():
|
629
630
|
# Arrange
|
630
631
|
wavenumbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
631
632
|
spectrum = pd.DataFrame(np.array([[10, 12, 14, 16, 14, 12, 10, 12, 14, 16]]))
|
@@ -638,6 +639,19 @@ def test_range_cut_by_wavenumber_with_dataframe():
|
|
638
639
|
assert type(spectrum_corrected) == pd.DataFrame
|
639
640
|
|
640
641
|
|
642
|
+
def test_range_cut_by_wavenumber_with_polars_dataframe():
|
643
|
+
# Arrange
|
644
|
+
wavenumbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
645
|
+
spectrum = pl.DataFrame(np.array([[10, 12, 14, 16, 14, 12, 10, 12, 14, 16]]))
|
646
|
+
range_cut = RangeCut(start=2.5, end=7.9, wavenumbers=wavenumbers).set_output(transform='polars')
|
647
|
+
|
648
|
+
# Act
|
649
|
+
spectrum_corrected = range_cut.fit_transform(spectrum)
|
650
|
+
|
651
|
+
# Assert
|
652
|
+
assert type(spectrum_corrected) == pl.DataFrame
|
653
|
+
|
654
|
+
|
641
655
|
def test_robust_normal_variate():
|
642
656
|
# Arrange
|
643
657
|
spectrum = np.array([2, 3.5, 5, 27, 8, 9]).reshape(1, -1)
|
File without changes
|
File without changes
|
File without changes
|