pyAgrum-nightly 2.3.0.9.dev202512061764412981 (cp310-abi3-macosx_11_0_arm64 wheel)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyagrum/__init__.py +165 -0
- pyagrum/_pyagrum.so +0 -0
- pyagrum/bnmixture/BNMInference.py +268 -0
- pyagrum/bnmixture/BNMLearning.py +376 -0
- pyagrum/bnmixture/BNMixture.py +464 -0
- pyagrum/bnmixture/__init__.py +60 -0
- pyagrum/bnmixture/notebook.py +1058 -0
- pyagrum/causal/_CausalFormula.py +280 -0
- pyagrum/causal/_CausalModel.py +436 -0
- pyagrum/causal/__init__.py +81 -0
- pyagrum/causal/_causalImpact.py +356 -0
- pyagrum/causal/_dSeparation.py +598 -0
- pyagrum/causal/_doAST.py +761 -0
- pyagrum/causal/_doCalculus.py +361 -0
- pyagrum/causal/_doorCriteria.py +374 -0
- pyagrum/causal/_exceptions.py +95 -0
- pyagrum/causal/_types.py +61 -0
- pyagrum/causal/causalEffectEstimation/_CausalEffectEstimation.py +1175 -0
- pyagrum/causal/causalEffectEstimation/_IVEstimators.py +718 -0
- pyagrum/causal/causalEffectEstimation/_RCTEstimators.py +132 -0
- pyagrum/causal/causalEffectEstimation/__init__.py +46 -0
- pyagrum/causal/causalEffectEstimation/_backdoorEstimators.py +774 -0
- pyagrum/causal/causalEffectEstimation/_causalBNEstimator.py +324 -0
- pyagrum/causal/causalEffectEstimation/_frontdoorEstimators.py +396 -0
- pyagrum/causal/causalEffectEstimation/_learners.py +118 -0
- pyagrum/causal/causalEffectEstimation/_utils.py +466 -0
- pyagrum/causal/notebook.py +171 -0
- pyagrum/clg/CLG.py +658 -0
- pyagrum/clg/GaussianVariable.py +111 -0
- pyagrum/clg/SEM.py +312 -0
- pyagrum/clg/__init__.py +63 -0
- pyagrum/clg/canonicalForm.py +408 -0
- pyagrum/clg/constants.py +54 -0
- pyagrum/clg/forwardSampling.py +202 -0
- pyagrum/clg/learning.py +776 -0
- pyagrum/clg/notebook.py +480 -0
- pyagrum/clg/variableElimination.py +271 -0
- pyagrum/common.py +60 -0
- pyagrum/config.py +319 -0
- pyagrum/ctbn/CIM.py +513 -0
- pyagrum/ctbn/CTBN.py +573 -0
- pyagrum/ctbn/CTBNGenerator.py +216 -0
- pyagrum/ctbn/CTBNInference.py +459 -0
- pyagrum/ctbn/CTBNLearner.py +161 -0
- pyagrum/ctbn/SamplesStats.py +671 -0
- pyagrum/ctbn/StatsIndepTest.py +355 -0
- pyagrum/ctbn/__init__.py +79 -0
- pyagrum/ctbn/constants.py +54 -0
- pyagrum/ctbn/notebook.py +264 -0
- pyagrum/defaults.ini +199 -0
- pyagrum/deprecated.py +95 -0
- pyagrum/explain/_ComputationCausal.py +75 -0
- pyagrum/explain/_ComputationConditional.py +48 -0
- pyagrum/explain/_ComputationMarginal.py +48 -0
- pyagrum/explain/_CustomShapleyCache.py +110 -0
- pyagrum/explain/_Explainer.py +176 -0
- pyagrum/explain/_Explanation.py +70 -0
- pyagrum/explain/_FIFOCache.py +54 -0
- pyagrum/explain/_ShallCausalValues.py +204 -0
- pyagrum/explain/_ShallConditionalValues.py +155 -0
- pyagrum/explain/_ShallMarginalValues.py +155 -0
- pyagrum/explain/_ShallValues.py +296 -0
- pyagrum/explain/_ShapCausalValues.py +208 -0
- pyagrum/explain/_ShapConditionalValues.py +126 -0
- pyagrum/explain/_ShapMarginalValues.py +191 -0
- pyagrum/explain/_ShapleyValues.py +298 -0
- pyagrum/explain/__init__.py +81 -0
- pyagrum/explain/_explGeneralizedMarkovBlanket.py +152 -0
- pyagrum/explain/_explIndependenceListForPairs.py +146 -0
- pyagrum/explain/_explInformationGraph.py +264 -0
- pyagrum/explain/notebook/__init__.py +54 -0
- pyagrum/explain/notebook/_bar.py +142 -0
- pyagrum/explain/notebook/_beeswarm.py +174 -0
- pyagrum/explain/notebook/_showShapValues.py +97 -0
- pyagrum/explain/notebook/_waterfall.py +220 -0
- pyagrum/explain/shapley.py +225 -0
- pyagrum/lib/__init__.py +46 -0
- pyagrum/lib/_colors.py +390 -0
- pyagrum/lib/bn2graph.py +299 -0
- pyagrum/lib/bn2roc.py +1026 -0
- pyagrum/lib/bn2scores.py +217 -0
- pyagrum/lib/bn_vs_bn.py +605 -0
- pyagrum/lib/cn2graph.py +305 -0
- pyagrum/lib/discreteTypeProcessor.py +1102 -0
- pyagrum/lib/discretizer.py +58 -0
- pyagrum/lib/dynamicBN.py +390 -0
- pyagrum/lib/explain.py +57 -0
- pyagrum/lib/export.py +84 -0
- pyagrum/lib/id2graph.py +258 -0
- pyagrum/lib/image.py +387 -0
- pyagrum/lib/ipython.py +307 -0
- pyagrum/lib/mrf2graph.py +471 -0
- pyagrum/lib/notebook.py +1821 -0
- pyagrum/lib/proba_histogram.py +552 -0
- pyagrum/lib/utils.py +138 -0
- pyagrum/pyagrum.py +31495 -0
- pyagrum/skbn/_MBCalcul.py +242 -0
- pyagrum/skbn/__init__.py +49 -0
- pyagrum/skbn/_learningMethods.py +282 -0
- pyagrum/skbn/_utils.py +297 -0
- pyagrum/skbn/bnclassifier.py +1014 -0
- pyagrum_nightly-2.3.0.9.dev202512061764412981.dist-info/LICENSE.md +12 -0
- pyagrum_nightly-2.3.0.9.dev202512061764412981.dist-info/LICENSES/LGPL-3.0-or-later.txt +304 -0
- pyagrum_nightly-2.3.0.9.dev202512061764412981.dist-info/LICENSES/MIT.txt +18 -0
- pyagrum_nightly-2.3.0.9.dev202512061764412981.dist-info/METADATA +145 -0
- pyagrum_nightly-2.3.0.9.dev202512061764412981.dist-info/RECORD +107 -0
- pyagrum_nightly-2.3.0.9.dev202512061764412981.dist-info/WHEEL +4 -0
pyagrum/lib/discreteTypeProcessor.py
@@ -0,0 +1,1102 @@
############################################################################
#   This file is part of the aGrUM/pyAgrum library.                        #
#                                                                          #
#   Copyright (c) 2005-2025 by                                             #
#       - Pierre-Henri WUILLEMIN(_at_LIP6)                                 #
#       - Christophe GONZALES(_at_AMU)                                     #
#                                                                          #
#   The aGrUM/pyAgrum library is free software; you can redistribute it    #
#   and/or modify it under the terms of either :                           #
#                                                                          #
#    - the GNU Lesser General Public License as published by               #
#      the Free Software Foundation, either version 3 of the License,      #
#      or (at your option) any later version,                              #
#    - the MIT license (MIT),                                              #
#    - or both in dual license, as here.                                   #
#                                                                          #
#   (see https://agrum.gitlab.io/articles/dual-licenses-lgplv3mit.html)    #
#                                                                          #
#   This aGrUM/pyAgrum library is distributed in the hope that it will be  #
#   useful, but WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,          #
#   INCLUDING BUT NOT LIMITED TO THE WARRANTIES MERCHANTABILITY or FITNESS #
#   FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE    #
#   AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER #
#   LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,        #
#   ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR  #
#   OTHER DEALINGS IN THE SOFTWARE.                                        #
#                                                                          #
#   See LICENCES for more details.                                         #
#                                                                          #
#   SPDX-FileCopyrightText: Copyright 2005-2025                            #
#       - Pierre-Henri WUILLEMIN(_at_LIP6)                                 #
#       - Christophe GONZALES(_at_AMU)                                     #
#   SPDX-License-Identifier: LGPL-3.0-or-later OR MIT                      #
#                                                                          #
#   Contact  : info_at_agrum_dot_org                                       #
#   homepage : http://agrum.gitlab.io                                      #
#   gitlab   : https://gitlab.com/agrumery/agrum                           #
#                                                                          #
############################################################################

"""
This module contains the DiscreteTypeProcessor class, used to infer a type from the values of a database before learning a (discrete) graphical model.
In particular, the DiscreteTypeProcessor class is used to discretize some continuous variables in a database.

The discretization is done using one of the following methods: 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM' and 'MDLP'.
Some methods need specialized treatment of arguments for classification (y separated from X).
"""

import math

import numpy
import pandas
import scipy
import sklearn
import sklearn.preprocessing as skp

from typing import Any

import pyagrum as gum


def check_int(v: Any) -> bool:
  """
  Test if v is an int or a str representing an int.
  """
  if isinstance(v, (bool, numpy.bool_)):
    return False
  if isinstance(v, (int, numpy.integer)):
    return True

  if isinstance(v, str):
    if v == "":  # an empty string is not an int
      return False
    if v[0] in ("-", "+"):
      return v[1:].isdigit()
    return v.isdigit()
  return False

def check_float(v: Any) -> bool:
  """
  Test if v is a float or a str representing a float.

  Parameters
  ----------
  v : Any
    the value to test

  Returns
  -------
  bool:
    True if v is a float or a str representing a float
  """
  if isinstance(v, (bool, numpy.bool_)):
    return False

  try:
    float(v)
    return True
  except (TypeError, ValueError):  # TypeError covers non-numeric, non-string inputs such as None
    return False

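# A quick sanity sketch of the two helpers above (hypothetical values, doctest-style):
#   >>> check_int("+42"), check_int(True), check_int("3.5")
#   (True, False, False)
#   >>> check_float("3.5"), check_float("abc")
#   (True, False)
# Booleans are rejected explicitly because bool is a subclass of int in Python.
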
class DiscreteTypeProcessor:
  """
  Represents a tool to process the types of the variables in a database in order to learn a pyAgrum discrete graphical model.
  In particular, the DiscreteTypeProcessor class is used to discretize some continuous variables in a database.

  Warnings
  --------
  - The data are represented by tabular data (X and possibly y) where the columns are the variables and the rows are the samples. Generally, X can be replaced by the name of a csv file.
  - In the case of a classification, y is the class variable and X are the features. y does not have to be binary.

  Parameters
  ----------
  defaultDiscretizationMethod: str
    sets the default method of discretization for this discretizer. Possible values are: `quantile`, `uniform`,
    `kmeans`, `NML`, `CAIM` and `MDLP`. This method will be used if the user has not specified another method
    for that specific variable using the setDiscretizationParameters method.
  defaultNumberOfBins: str or int
    sets the number of bins if the method used is `quantile`, `kmeans` or `uniform`. In this case this parameter can also
    be set to the string `elbowMethod` so that the best number of bins is found automatically.
    If the method used is NML, this parameter sets the maximum number of bins up to which the NML algorithm
    searches for the optimal number of bins. In this case this parameter must be an int.
    If any other discretization method is used, this parameter is ignored.
  discretizationThreshold: int or float
    When using default parameters, a variable will be treated as continuous only if it has more unique values than
    this number (if the number is an int greater than 1).
    If the number is a float between 0 and 1, we test instead whether the proportion of unique values is bigger than this
    number. For example, if you have entered 0.95, the variable will be treated as continuous only if more than 95%
    of its values are unique.
  """

  def __init__(
    self,
    defaultDiscretizationMethod="quantile",
    defaultNumberOfBins=10,
    discretizationThreshold=25,
  ):
    """
    Initializes the DiscreteTypeProcessor object.

    Parameters
    ----------
    defaultDiscretizationMethod: str
      sets the default method of discretization for this discretizer. Possible values are: 'quantile', 'uniform',
      'kmeans', 'NML', 'CAIM' and 'MDLP'. This method will be used if the user has not specified another method
      for that specific variable using the setDiscretizationParameters method.
    defaultNumberOfBins: str or int
      sets the number of bins if the method used is quantile, kmeans or uniform. In this case this parameter can
      also be set to the string 'elbowMethod' so that the best number of bins is found automatically.
      If the method used is NML, this parameter sets the maximum number of bins up to which the NML algorithm
      searches for the optimal number of bins. In this case this parameter must be an int.
    discretizationThreshold: int or float
      When using default parameters, a variable will be treated as continuous only if it has more unique values than
      this number (if the number is an int greater than 1).
      If the number is a float between 0 and 1, we test instead whether the proportion of unique values is bigger than
      this number. For example, if you have entered 0.95, the variable will be treated as continuous only if more
      than 95% of its values are unique.
    """
    self.discretizationParametersDictionary = {}
    self.numberOfContinuous = 0
    self.totalNumberOfBins = 0
    self.defaultMethod = None
    self.defaultParamDiscretizationMethod = None
    self.setDiscretizationParameters(None, defaultDiscretizationMethod, defaultNumberOfBins)

    self.discretizationThreshold = discretizationThreshold

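  # A minimal construction sketch (argument values are hypothetical):
  #   >>> proc = DiscreteTypeProcessor(defaultDiscretizationMethod="kmeans",
  #   ...                              defaultNumberOfBins="elbowMethod", discretizationThreshold=10)
  # Any column with more than 10 distinct numeric values is then discretized with k-means, the
  # number of bins being chosen automatically by the elbow heuristic implemented further below.
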
  def clear(self, clearDiscretizationParameters=False):
    """
    Sets the number of continuous variables and the total number of bins created by this discretizer back to 0. If
    clearDiscretizationParameters is True, also clears the parameters for discretization the user has set for each
    variable.

    Parameters
    ----------
    clearDiscretizationParameters: bool
      if True, this method also clears the parameters the user has set for each variable and resets them to the default.
    """
    self.numberOfContinuous = 0
    self.totalNumberOfBins = 0
    if clearDiscretizationParameters:
      self.discretizationParametersDictionary = {}

  @gum.deprecated_arg(newA="parameters", oldA="paramDiscretizationMethod", version="2.0.0")
  def setDiscretizationParameters(self, variableName: str, method: str, parameters: Any = None):
    """
    Sets the discretization parameters for a variable. If variableName is None, sets the default parameters.

    Parameters
    ----------
    variableName: str
      the name of the variable you want to set the discretization parameters of. Set to None to set the new
      default.
    method: str
      The method of discretization used for this variable. Use "NoDiscretization" if you do not want to discretize this
      variable. Possible values are: 'NoDiscretization', 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM', 'MDLP' and 'expert'.
    parameters:
      Each method of discretization has a parameter that can be set.
      - 'quantile': the number of bins
      - 'kmeans', 'uniform': the number of bins. The parameter can also be set to the string 'elbowMethod' so that the best
      number of bins is found automatically.
      - 'NML': this parameter sets the maximum number of bins up to which the NML algorithm searches for the optimal
      number of bins.
      - 'MDLP', 'CAIM': this parameter is ignored.
      - 'expert': this parameter is the list of ticks proposed by the expert. The discretized variable will set the flag
      'empirical', which means that values found in the data that do not fall in the proposed intervals do not raise
      any exception but are nevertheless accepted (as belonging to the smallest or biggest interval).
      - 'NoDiscretization': this parameter is a superset of the values for the variable found in the database (or None).
    """
    if parameters is None:
      parameters = self.defaultParamDiscretizationMethod

    match method:
      case "quantile" | "NML":
        if type(parameters) is not int:
          raise ValueError(
            "The parameter for the quantile/NML method must be an integer. You have entered: " + str(parameters)
          )
      case "kmeans" | "uniform":
        if type(parameters) is not int and str(parameters) != "elbowMethod":
          raise ValueError(
            "The parameter for the kmeans/uniform method must be an integer or the string 'elbowMethod'. You have entered: "
            + str(parameters)
          )
      case "expert":
        if not (isinstance(parameters, list) and all(map(check_float, parameters))):
          raise ValueError(
            "The parameter for the expert method must be a list of float. You have entered: " + str(parameters)
          )
      case "NoDiscretization":
        if parameters is not None and not isinstance(parameters, str):
          raise ValueError(
            "The parameter for the NoDiscretization method must be a string (fastVar syntax) or None. You have "
            "entered: " + str(parameters)
          )
      case "CAIM" | "MDLP":
        pass
      case _:
        raise ValueError(
          "This discretization method is not recognized! Possible values are kmeans, uniform, quantile, NML, "
          "CAIM, MDLP, NoDiscretization or expert. You have entered " + str(method)
        )

    if variableName is None:
      self.defaultMethod = method
      self.defaultParamDiscretizationMethod = parameters
    else:
      self.discretizationParametersDictionary[variableName] = {}
      self.discretizationParametersDictionary[variableName]["method"] = method
      self.discretizationParametersDictionary[variableName]["param"] = parameters

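  # Per-variable override sketches, assuming a DiscreteTypeProcessor `proc` and hypothetical columns:
  #   >>> proc.setDiscretizationParameters("age", "expert", [0.0, 18.0, 65.0, 120.0])  # expert-given ticks
  #   >>> proc.setDiscretizationParameters("color", "NoDiscretization", "{red|green|blue}")  # fastVar syntax
  #   >>> proc.setDiscretizationParameters(None, "NML", 15)  # new default: NML, searching up to 15 bins
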
  def audit(self, X, y=None):
    """
    Audits the passed values of X and y: guesses which columns in X are already discrete and which need to
    be discretized, as well as the discretization algorithm that will be used to discretize them. The parameters which
    are suggested will be used when creating the variables. To change this, the user can manually set discretization
    parameters for each variable using the setDiscretizationParameters method.

    Parameters
    ----------
    X: {array-like, pandas or polars dataframe} of shape (n_samples, n_features) or str (filename)
      training data
    y: {array-like, pandas or polars dataframe} of shape (n_samples,) or str (classname)
      Target values

    Returns
    -------
    Dict
      for each variable, the proposition of audit
    """
    if isinstance(X, str):
      Xp = pandas.read_csv(X)
    elif hasattr(X, "to_pandas"):  # for instance, a polars dataframe
      Xp = X.to_pandas()
    else:
      Xp = X

    if isinstance(y, str):
      yp = Xp[y]
    elif y is not None and hasattr(y, "to_pandas"):  # for instance, a polars dataframe
      yp = y.to_pandas()
    else:
      yp = y

    return self._audit(Xp, yp)

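  # Audit sketch on a small, hypothetical pandas DataFrame:
  #   >>> df = pandas.DataFrame({"height": numpy.random.normal(170.0, 10.0, 500)})
  #   >>> proc.audit(df)
  # would suggest, e.g., {'height': {'method': 'quantile', 'nbBins': 10, 'type': 'Continuous', ...}}
  # with default settings; the returned dict only describes what would be done, nothing is discretized yet.
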
  def _audit(self, X, y=None):
    """
    Audits the passed values of X and y: tells us which columns in X we think are already discrete and which need to
    be discretized, as well as the discretization algorithm that will be used to discretize them. The parameters which
    are suggested will be used when creating the variables. To change this, the user can manually set discretization
    parameters for each variable using the setDiscretizationParameters method.

    Parameters
    ----------
    X: {array-like, sparse matrix} of shape (n_samples, n_features)
      training data
    y: array-like of shape (n_samples,)
      Target values

    Returns
    -------
    Dict
      for each variable, the proposition of audit
    """

    auditDict = {}

    if isinstance(X, pandas.DataFrame):
      variableNames = X.columns.tolist()
    elif type(X) is pandas.core.series.Series:
      variableNames = [X.name]
    else:
      variableNames = None

    if y is not None:
      X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True)
    else:
      X = sklearn.utils.check_array(X, dtype=None)

    d = X.shape[1]

    if variableNames is None:
      variableNames = ["x" + str(i) for i in range(d)]

    possibleValues = {}  # counting the possible values for each column

    for i in range(d):
      possibleValues[i] = numpy.unique(X[:, i])
    possibleValues[d] = numpy.unique(y)

    for i in range(d):
      variable = variableNames[i]
      auditDict[variable] = {}
      try:
        sklearn.utils.check_array(X[:, i], dtype="float", ensure_2d=False)
        isNumeric = True
      except ValueError:
        isNumeric = False
      if variable in self.discretizationParametersDictionary.keys():
        auditDict[variable] = self.discretizationParametersDictionary[variable]
        if self.discretizationParametersDictionary[variable]["method"] != "NoDiscretization" and not isNumeric:
          raise ValueError("The variable " + variable + " is not numeric and cannot be discretized!")
      else:
        if len(possibleValues[i]) > self.discretizationThreshold and isNumeric:
          auditDict[variable]["method"] = self.defaultMethod
          auditDict[variable]["nbBins"] = self.defaultParamDiscretizationMethod
        else:
          auditDict[variable]["method"] = "NoDiscretization"
          auditDict[variable]["values"] = possibleValues[i]

      if auditDict[variable]["method"] == "NoDiscretization":
        auditDict[variable]["type"] = "Discrete"
      else:
        auditDict[variable]["type"] = "Continuous"
        auditDict[variable]["minInData"] = min(possibleValues[i])
        auditDict[variable]["maxInData"] = max(possibleValues[i])

    return auditDict

  @staticmethod
  def _discretizationElbowMethodRotation(discretizationStrategy, X):
    """
    Calculates the sum of squared errors as a function of the number of clusters using the discretization strategy
    that is passed as a parameter. Returns the bins that are optimal for minimizing the variation and the number of
    bins at the same time. Uses the elbow method to find this optimal point. To find the "elbow" we rotate the curve
    and look for its minimum.

    Parameters
    ----------
    discretizationStrategy: str
      The method of discretization that will be used. Possible values are: 'quantile', 'kmeans' and 'uniform'
    X: one dimensional ndarray
      Contains the data that should be discretized

    Returns
    -------
    List[float]
      the edges of the bins the algorithm has chosen.
    """

    if discretizationStrategy not in {"kmeans", "quantile", "uniform"}:
      raise ValueError("cannot use elbow method with this type of discretization")
    variationArray = numpy.zeros(14)
    Xsorted = X[X.argsort(axis=None)]
    binEdgeMatrix = [[]] * 14
    for k in range(2, 16):
      discretizer = skp.KBinsDiscretizer(
        k,
        strategy=discretizationStrategy,
        quantile_method="averaged_inverted_cdf",
        subsample=None,
      )
      discretizer.fit(Xsorted.reshape(-1, 1))  # KBinsDiscretizer expects a 2D (n_samples, 1) array
      binEdges = discretizer.bin_edges_[0]
      centresArray = (binEdges[1:] + binEdges[:-1]) / 2
      i = 0
      sumOfSquaredErrors = 0
      for x in Xsorted:
        if x > binEdges[i + 1]:
          i = i + 1
        sumOfSquaredErrors += (x - centresArray[i]) ** 2
      variationArray[k - 2] = sumOfSquaredErrors
      binEdgeMatrix[k - 2] = binEdges.tolist()

    # we calculate the slope of the line that connects the first and last point on our graph
    slope = (variationArray[13] - variationArray[0]) / 13

    # we calculate the slope of the line perpendicular to it
    otherSlope = -1 / slope

    # we change the coordinate system to the one with the two lines previously mentioned as its axes
    coordinateChangeMatrix = numpy.array([[1, slope], [1 / otherSlope, 1]])

    # we linearly transform the coordinates of every point in our curve
    transformedCoordinates = numpy.zeros((2, 14))
    for i in range(14):
      transformedCoordinates[:, i] = numpy.matmul(
        coordinateChangeMatrix,
        numpy.array([[i], [variationArray[i] - variationArray[0]]]),
      ).reshape(2)

    # we search for the minimum in our newly obtained curve
    minkIndex = 0
    for k in range(14):
      if transformedCoordinates[1][minkIndex] > transformedCoordinates[1][k]:
        minkIndex = k
    # when we have found the minimum, we apply the inverse linear transformation to recover the optimal value of k
    minimumVector = numpy.matmul(
      numpy.linalg.inv(coordinateChangeMatrix),
      transformedCoordinates[:, minkIndex].reshape(2, 1),
    )

    # we return the list of bin edges found using said optimal number of bins
    return binEdgeMatrix[int(round(minimumVector[0, 0]))]

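  # Geometry behind the elbow detection above: with curve points (i, v_i - v_0) for i = 0..13, the
  # matrix [[1, s], [1/s', 1]] (s = slope of the chord joining the first and last points, s' = -1/s)
  # maps the chord onto one axis of a new coordinate system; the point with the smallest second
  # coordinate in that system is the point farthest below the chord, i.e. the elbow, and inverting
  # the matrix recovers its original index (hence the optimal number of bins).
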
  def _discretizationMDLP(self, X, y, possibleValuesX, possibleValuesY):
    """
    Uses the MDLP algorithm described in Fayyad and Irani, 1993 to discretize the values of x.

    Parameters
    ----------
    X: ndarray with shape (n,1) where n is the number of samples
      Column-vector that contains all the data that needs to be discretized
    y: ndarray with shape (n,1) where n is the number of samples
      Column-vector that contains the class of each sample. This vector will not be discretized, but the class value of each sample is needed to properly apply the algorithm
    possibleValuesX: one dimensional ndarray
      Contains all the possible values that x can take, sorted in increasing order and without duplicates
    possibleValuesY: one dimensional ndarray
      Contains the possible values of y.

    Returns
    -------
    List[float]
      a list of the edges of the bins that are chosen by this algorithm
    """
    xAndY = numpy.concatenate((X, y), axis=1)
    xAndY = xAndY[xAndY[:, 0].argsort()]
    B = (possibleValuesX[1:] + possibleValuesX[:-1]) / 2
    [class0, _] = possibleValuesY

    binEdgesIndex = []
    nbElementsByIntervalClass0 = numpy.zeros(len(B) + 1)
    nbElementsByIntervalClass1 = numpy.zeros(len(B) + 1)
    currentIntervalIndex = 0
    for X in xAndY:
      if currentIntervalIndex < len(B) and X[0] > B[currentIntervalIndex]:
        currentIntervalIndex += 1
      if X[1] == class0:
        nbElementsByIntervalClass0[currentIntervalIndex] += 1
      else:
        nbElementsByIntervalClass1[currentIntervalIndex] += 1

    Class0ByLargeInterval = [nbElementsByIntervalClass0.sum()]
    Class1ByLargeInterval = [nbElementsByIntervalClass1.sum()]

    totalCountByLargeInterval = [Class0ByLargeInterval[0] + Class1ByLargeInterval[0]]

    probabilityClass0 = Class0ByLargeInterval[0] / totalCountByLargeInterval[0]
    probabilityClass1 = Class1ByLargeInterval[0] / totalCountByLargeInterval[0]
    shannonEntropyByLargeInterval = [
      (-1) * (probabilityClass0 * math.log2(probabilityClass0) + probabilityClass1 * math.log2(probabilityClass1))
    ]

    continueDividingInterval = [True]

    currentValues = {}
    minimalValues = {}

    while any(continueDividingInterval):
      minimalValues["classInformationEntropy"] = math.inf
      for param in {
        "boundaryIndex",
        "leftSubintervalClass0",
        "leftSubintervalClass1",
        "leftSubintervalShannonEntropy",
        "rightSubintervalClass0",
        "rightSubintervalClass1",
        "rightSubintervalShannonEntropy",
      }:
        (currentValues[param], minimalValues[param]) = (0, 0)

      position = 0
      while currentValues["boundaryIndex"] < len(B):
        while not continueDividingInterval[position]:
          position = position + 1
          currentValues["boundaryIndex"] = binEdgesIndex[position - 1] + 1

        if position < len(binEdgesIndex) and currentValues["boundaryIndex"] == binEdgesIndex[position]:
          # this function decides whether to accept the cut point in this interval and updates the relevant lists if
          # the value is accepted.
          self._divideIntervalMDLP(
            minimalValues,
            shannonEntropyByLargeInterval,
            Class0ByLargeInterval,
            Class1ByLargeInterval,
            continueDividingInterval,
            totalCountByLargeInterval,
            position,
            binEdgesIndex,
          )
          position += 1
          while position < len(continueDividingInterval) and not continueDividingInterval[position]:
            position += 1
          if position == len(continueDividingInterval):
            break
          else:
            currentValues["boundaryIndex"] = binEdgesIndex[position - 1] + 1
            (
              currentValues["leftSubintervalClass0"],
              currentValues["leftSubintervalClass1"],
            ) = (0, 0)
            minimalValues["classInformationEntropy"] = math.inf
            continue

        currentValues["leftSubintervalClass0"] += nbElementsByIntervalClass0[currentValues["boundaryIndex"]]
        currentValues["leftSubintervalClass1"] += nbElementsByIntervalClass1[currentValues["boundaryIndex"]]

        totalCountLeftInterval = currentValues["leftSubintervalClass0"] + currentValues["leftSubintervalClass1"]

        probabilityClass0 = currentValues["leftSubintervalClass0"] / totalCountLeftInterval
        probabilityClass1 = currentValues["leftSubintervalClass1"] / totalCountLeftInterval
        currentValues["leftSubintervalShannonEntropy"] = 0
        if probabilityClass0 > 0:
          currentValues["leftSubintervalShannonEntropy"] += (-1) * probabilityClass0 * math.log2(probabilityClass0)
        if probabilityClass1 > 0:
          currentValues["leftSubintervalShannonEntropy"] += (-1) * probabilityClass1 * math.log2(probabilityClass1)

        classInformationEntropy = (totalCountLeftInterval / totalCountByLargeInterval[position]) * currentValues[
          "leftSubintervalShannonEntropy"
        ]

        currentValues["rightSubintervalClass0"] = (
          Class0ByLargeInterval[position] - currentValues["leftSubintervalClass0"]
        )
        currentValues["rightSubintervalClass1"] = (
          Class1ByLargeInterval[position] - currentValues["leftSubintervalClass1"]
        )
        NRightInterval = currentValues["rightSubintervalClass0"] + currentValues["rightSubintervalClass1"]

        probabilityClass0 = currentValues["rightSubintervalClass0"] / NRightInterval
        probabilityClass1 = currentValues["rightSubintervalClass1"] / NRightInterval
        currentValues["rightSubintervalShannonEntropy"] = 0
        if probabilityClass0 > 0:
          currentValues["rightSubintervalShannonEntropy"] += (-1) * probabilityClass0 * math.log2(probabilityClass0)
        if probabilityClass1 > 0:
          currentValues["rightSubintervalShannonEntropy"] += (-1) * probabilityClass1 * math.log2(probabilityClass1)

        classInformationEntropy += (NRightInterval / totalCountByLargeInterval[position]) * currentValues[
          "rightSubintervalShannonEntropy"
        ]

        if classInformationEntropy < minimalValues["classInformationEntropy"]:
          minimalValues = currentValues.copy()
          minimalValues["classInformationEntropy"] = classInformationEntropy

        currentValues["boundaryIndex"] += 1

      if continueDividingInterval[-1]:
        self._divideIntervalMDLP(
          minimalValues,
          shannonEntropyByLargeInterval,
          Class0ByLargeInterval,
          Class1ByLargeInterval,
          continueDividingInterval,
          totalCountByLargeInterval,
          position,
          binEdgesIndex,
        )
    binEdges = [xAndY[0][0]]
    for index in binEdgesIndex:
      binEdges.append(B[index])
    binEdges.append(xAndY[-1][0])

    return binEdges

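  # MDLP acceptance criterion used by the method above (via _divideIntervalMDLP): a cut of interval S
  # (N samples, entropy Ent(S)) into S1 and S2 is kept only when
  #   Gain = Ent(S) - [|S1|/N * Ent(S1) + |S2|/N * Ent(S2)] > (log2(N - 1) + delta) / N,
  # with delta = log2(3^k - 2) - [k*Ent(S) - k1*Ent(S1) - k2*Ent(S2)]; here k = 2 classes, which is
  # where the literal math.log2(7) in _divideIntervalMDLP comes from.
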
  @staticmethod
  def _discretizationCAIM(X, y, possibleValuesX, possibleValuesY):
    """
    Applies the CAIM algorithm to discretize the values of x.

    Parameters
    ----------
    X: ndarray with shape (n,1) where n is the number of samples
      Column-vector that contains all the data that needs to be discretized
    y: ndarray with shape (n,1) where n is the number of samples
      Column-vector that contains the class of each sample. This vector will not be discretized, but the class value of each sample is needed to properly apply the algorithm
    possibleValuesX: one dimensional ndarray
      Contains all the possible values that x can take, sorted in increasing order and without duplicates
    possibleValuesY: one dimensional ndarray
      Contains the possible values of y.

    Returns
    -------
    list[float]
      a list of the edges of the bins that are chosen by this algorithm
    """
    xAndY = numpy.concatenate((X, y), axis=1)
    xAndY = xAndY[xAndY[:, 0].argsort()]
    B = (possibleValuesX[1:] + possibleValuesX[:-1]) / 2
    [class0, _] = possibleValuesY

    binEdgesIndex = []
    nbElementsByIntervalClass0 = numpy.zeros(len(B) + 1)
    nbElementsByIntervalClass1 = numpy.zeros(len(B) + 1)
    currentIntervalIndex = 0
    for X in xAndY:
      if currentIntervalIndex < len(B) and X[0] > B[currentIntervalIndex]:
        currentIntervalIndex += 1
      if X[1] == class0:
        nbElementsByIntervalClass0[currentIntervalIndex] += 1
      else:
        nbElementsByIntervalClass1[currentIntervalIndex] += 1

    Class0ByLargeInterval = [nbElementsByIntervalClass0.sum()]
    Class1ByLargeInterval = [nbElementsByIntervalClass1.sum()]

    k = 0
    globalCAIM = 0.0
    oldCaim = 0.0
    while True:
      caimMax = 0
      maxPosition = 0
      maxBoundaryIndex = 0
      position = 0
      currentSumClass0 = 0
      currentSumClass1 = 0
      maxLeftIntervalClass0 = currentSumClass0
      maxLeftIntervalClass1 = currentSumClass1
      maxRightIntervalClass0 = maxLeftIntervalClass0
      maxRightIntervalClass1 = maxLeftIntervalClass1

      for boundaryIndex in range(len(B)):
        if position < len(binEdgesIndex) and boundaryIndex == binEdgesIndex[position]:
          position += 1
          if Class0ByLargeInterval[position] > Class1ByLargeInterval[position]:
            oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class0ByLargeInterval[position], 2) / (
              Class0ByLargeInterval[position] + Class1ByLargeInterval[position]
            )
          else:
            oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class1ByLargeInterval[position], 2) / (
              Class0ByLargeInterval[position] + Class1ByLargeInterval[position]
            )
          currentSumClass0 = 0
          currentSumClass1 = 0
          continue

        currentSumClass0 += nbElementsByIntervalClass0[boundaryIndex]
        currentSumClass1 += nbElementsByIntervalClass1[boundaryIndex]
        caim = oldCaim

        if currentSumClass0 > currentSumClass1:
          caim = caim + math.pow(currentSumClass0, 2) / (currentSumClass0 + currentSumClass1)
        else:
          caim = caim + math.pow(currentSumClass1, 2) / (currentSumClass0 + currentSumClass1)

        intervalClass0 = Class0ByLargeInterval[position] - currentSumClass0
        intervalClass1 = Class1ByLargeInterval[position] - currentSumClass1

        if intervalClass0 > intervalClass1:
          caim = caim + math.pow(intervalClass0, 2) / (intervalClass0 + intervalClass1)
        else:
          caim = caim + math.pow(intervalClass1, 2) / (intervalClass0 + intervalClass1)

        caim = caim / (len(Class0ByLargeInterval) + 1)

        if caim > caimMax:
          maxLeftIntervalClass0 = currentSumClass0
          maxLeftIntervalClass1 = currentSumClass1
          maxRightIntervalClass0 = intervalClass0
          maxRightIntervalClass1 = intervalClass1
          caimMax = caim
          maxBoundaryIndex = boundaryIndex
          maxPosition = position

      if caimMax > globalCAIM:
        globalCAIM = caimMax
        binEdgesIndex.insert(maxPosition, maxBoundaryIndex)
        Class0ByLargeInterval.insert(maxPosition + 1, maxRightIntervalClass0)
        Class1ByLargeInterval.insert(maxPosition + 1, maxRightIntervalClass1)
        Class0ByLargeInterval[maxPosition] = maxLeftIntervalClass0
        Class1ByLargeInterval[maxPosition] = maxLeftIntervalClass1
        k = k + 1
        if Class0ByLargeInterval[0] > Class1ByLargeInterval[0]:
          oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class0ByLargeInterval[0], 2) / (
            Class0ByLargeInterval[0] + Class1ByLargeInterval[0]
          )
        else:
          oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class1ByLargeInterval[0], 2) / (
            Class0ByLargeInterval[0] + Class1ByLargeInterval[0]
          )

      else:
        break

    binEdges = [xAndY[0][0]]
    for index in binEdgesIndex:
      binEdges.append(B[index])
    binEdges.append(xAndY[-1][0])

    return binEdges

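  # CAIM criterion maximized by the method above: for a scheme with n intervals,
  #   caim = (1/n) * sum over intervals r of max_r**2 / M_r,
  # where max_r is the count of the majority class in interval r and M_r its total count; boundary
  # points are added greedily as long as the global CAIM value keeps increasing.
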
  @staticmethod
  def _discretizationNML(X, possibleValuesX, kMax=10, epsilon=None):
    """
    Uses the discretization algorithm described in "MDL Histogram Density Estimation", Kontkanen and Myllymäki, 2007 to
    discretize.

    Parameters
    ----------
    X: one dimensional ndarray
      array that contains all the data that needs to be discretized
    possibleValuesX: one dimensional ndarray
      Contains all the possible values that x can take, sorted in increasing order and without duplicates.
    kMax: int
      the maximum number of bins before the algorithm stops itself.
    epsilon: float or None
      the value of epsilon used in the algorithm. Should be as small as possible. If None is passed, the value is
      automatically calculated.

    Returns
    -------
    List[float]
      a list of the edges of the bins that are chosen by this algorithm
    """
    Xsorted = X[X.argsort(axis=None)]
    if epsilon is None:
      epsilon = numpy.amin(possibleValuesX[1:] - possibleValuesX[:-1]) / 2
    epsilon = epsilon / 2
    candidateCutPoints = numpy.zeros(2 * len(possibleValuesX))
    for i in range(len(possibleValuesX)):
      candidateCutPoints[2 * i] = possibleValuesX[i] - epsilon
      candidateCutPoints[2 * i + 1] = possibleValuesX[i] + epsilon

    E = len(candidateCutPoints)

    binCount = numpy.zeros(E)
    counter = 0
    j = 0
    for x in Xsorted:
      while x > candidateCutPoints[j]:
        binCount[j] = counter
        j = j + 1
      counter = counter + 1
    binCount[j] = counter
    n = binCount[-1]
    Rkminus1 = numpy.ones(E)
    Rk = numpy.zeros(E)
    for i in range(1, E):
      ne = int(binCount[i])
      total_amount = 0
      for h1 in range(ne + 1):
        h2 = ne - h1
        total_amount += math.pow(h1 / ne, h1) * math.pow(h2 / ne, h2) * scipy.special.comb(ne, h1)
      Rk[i] = total_amount

    k = 2

    Bkminus1 = numpy.zeros(E)
    for e in range(1, E):
      ne = binCount[e]
      Bkminus1[e] = -ne * (math.log(2 * epsilon * ne) - math.log(n * (candidateCutPoints[e] - candidateCutPoints[0])))

    Bk = numpy.zeros(E)
    cutpoints = [candidateCutPoints[0]]
    Bvalues = [Bkminus1[-1]]
    minimumeprime = 0
    while k <= kMax:
      for e in range(k, E):
        minimum = math.inf
        minimumeprime = 0
        for eprime in range(k - 1, e):
          if binCount[e] > binCount[eprime]:
            temp = Bkminus1[eprime] - (binCount[e] - binCount[eprime]) * (
              math.log(2 * epsilon * (binCount[e] - binCount[eprime]))
              - math.log(n * (candidateCutPoints[e] - candidateCutPoints[eprime]))
            )
          else:
            temp = Bkminus1[eprime]
          temp = temp + math.log(Rk[e] / Rkminus1[eprime])
          if minimum > temp:
            minimum = temp
            minimumeprime = eprime
        minimum = minimum + math.log((E - k) / (k - 1))
        Bk[e] = minimum
      cutpoints.append(candidateCutPoints[minimumeprime])
      Bvalues.append(Bk[-1])

      k = k + 1
      temp = Rk
      Rk = Rk + numpy.multiply(binCount, Rkminus1) / (k - 2)
      Rkminus1 = temp
      Bkminus1 = Bk.copy()  # copy (not an alias): Bk is rewritten in place at the next level
    minimum = math.inf
    minimumIndex = 0
    for k in range(1, len(Bvalues)):
      if Bvalues[k] < minimum:
        minimum = Bvalues[k]
        minimumIndex = k
    cutpoints = sorted(set(cutpoints[: minimumIndex + 1]))
    cutpoints.append(candidateCutPoints[-1])

    return cutpoints

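  # Reading the dynamic program above: Bk[e] approximates the minimal code length of a k-bin
  # histogram over the first e candidate cut points, combining a maximum-likelihood term (the
  # -n_e*log(...) parts), a parametric-complexity correction log(Rk/Rkminus1), and
  # log((E-k)/(k-1)) for coding the cut points themselves; the k whose final value in Bvalues
  # is smallest wins.
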
  def _createVariable(self, variableName, X, y=None, possibleValuesY=None):
    """
    Creates a variable for the column passed in as a parameter.

    Parameters
    ----------
    variableName:
      the name of the created variable
    X: ndarray shape(n,1)
      A column vector containing n samples of a feature. The column for which the variable will be created
    y: ndarray shape(n,1)
      A column vector containing the corresponding class for each element in X.
    possibleValuesY: ndarray
      An ndarray containing all the unique values of y
    Returns
    -------
    pyagrum.DiscreteVariable
      the created variable
    """

    if y is not None:
      X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True, ensure_2d=False)
    X = sklearn.utils.check_array(X, dtype=None, ensure_2d=False)
    try:
      Xtransformed = sklearn.utils.check_array(X, dtype="float", ensure_2d=False)
      isNumeric = True
    except ValueError:
      Xtransformed = X
      isNumeric = False

    foundValuesX = set(numpy.unique(X))
    n = len(X)

    if (
      variableName not in self.discretizationParametersDictionary
    ):  # The user has not manually set the discretization parameters for this variable
      if (
        isNumeric
        and 1 <= self.discretizationThreshold < len(foundValuesX)
        or (self.discretizationThreshold < 1 and len(foundValuesX) / len(X) > self.discretizationThreshold)
      ):
        self.discretizationParametersDictionary[variableName] = {}
        self.discretizationParametersDictionary[variableName]["method"] = self.defaultMethod
        self.discretizationParametersDictionary[variableName]["param"] = self.defaultParamDiscretizationMethod
      else:
        self.discretizationParametersDictionary[variableName] = {}
        self.discretizationParametersDictionary[variableName]["method"] = "NoDiscretization"
      usingDefaultParameters = True
    else:  # The user has manually set the discretization parameters for this variable
      usingDefaultParameters = False
      if self.discretizationParametersDictionary[variableName]["method"] != "NoDiscretization" and not isNumeric:
        raise ValueError(f"The variable {variableName} is not numeric and cannot be discretized!")

    if self.discretizationParametersDictionary[variableName]["method"] == "NoDiscretization":
      is_int_var = True

      varSyntax = ""
      if "param" in self.discretizationParametersDictionary[variableName]:
        varSyntax = self.discretizationParametersDictionary[variableName]["param"]
        if varSyntax is None:
          varSyntax = ""

      if varSyntax != "":
        var = gum.fastVariable(variableName + varSyntax)
        possibleValuesX = set(var.labels())
        f = {str(x) for x in foundValuesX}
        if not f.issubset(possibleValuesX):
          raise ValueError(
            f"The values passed in possibleValues ({sorted(possibleValuesX)}) do not match database values ("
            f"{sorted(f)})"
          )
        return var

      possibleValuesX = sorted(foundValuesX)
      is_int_var = all(map(check_int, possibleValuesX))
      if is_int_var:
        possibleValuesX = [int(x) for x in possibleValuesX]
        max_v = int(possibleValuesX[-1])  # sorted
        min_v = int(possibleValuesX[0])

        if len(possibleValuesX) == max_v - min_v + 1:  # no hole in the list of ints
          return gum.RangeVariable(variableName, variableName, min_v, max_v)
        else:
          return gum.IntegerVariable(variableName, variableName, possibleValuesX)

      is_float_var = all(map(check_float, possibleValuesX))
      if is_float_var:
        possibleValuesX = [float(x) for x in possibleValuesX]
        return gum.NumericalDiscreteVariable(variableName, variableName, possibleValuesX)
      else:
        return gum.LabelizedVariable(variableName, variableName, [str(v) for v in possibleValuesX])
    else:
      self.numberOfContinuous += 1
      if self.discretizationParametersDictionary[variableName]["method"] == "expert":
        binEdges = self.discretizationParametersDictionary[variableName]["param"]
      elif self.discretizationParametersDictionary[variableName]["method"] == "CAIM":
        if y is None:
          raise ValueError(
            "The CAIM discretization method requires a list of the associated classes for each data vector since it "
            "is a supervised discretization method. You should pass it as y."
          )
        if possibleValuesY is None:
          possibleValuesY = numpy.unique(y)
        binEdges = self._discretizationCAIM(
          Xtransformed.reshape(n, 1),
          y.reshape(n, 1),
          numpy.unique(Xtransformed),
          possibleValuesY,
        )
      elif self.discretizationParametersDictionary[variableName]["method"] == "MDLP":
        if y is None:
          raise ValueError(
            "The MDLP discretization method requires a list of the associated classes for each data vector since it "
            "is a supervised discretization method. You should pass it as y."
          )
        if possibleValuesY is None:
          possibleValuesY = numpy.unique(y)
        binEdges = self._discretizationMDLP(
          Xtransformed.reshape(n, 1),
          y.reshape(n, 1),
          numpy.unique(Xtransformed),
          possibleValuesY,
        )
      elif self.discretizationParametersDictionary[variableName]["method"] == "NML":
        binEdges = self._discretizationNML(
          Xtransformed.flatten(),
          numpy.unique(Xtransformed),
          kMax=self.discretizationParametersDictionary[variableName]["param"],
        )
      else:
        if self.discretizationParametersDictionary[variableName]["param"] == "elbowMethod":
          binEdges = self._discretizationElbowMethodRotation(
            self.discretizationParametersDictionary[variableName]["method"],
            Xtransformed.flatten(),
          )
        else:
          discre = skp.KBinsDiscretizer(
            self.discretizationParametersDictionary[variableName]["param"],
            strategy=self.discretizationParametersDictionary[variableName]["method"],
            quantile_method="averaged_inverted_cdf",
            subsample=None,
          )
          discre.fit(X.reshape(-1, 1))
          binEdges = discre.bin_edges_[0].tolist()

      if len(binEdges) == 2:
        raise ValueError(
          "Due to an error, the discretization method "
          + str(self.discretizationParametersDictionary[variableName]["method"])
          + " using "
          + str(self.discretizationParametersDictionary[variableName]["param"])
          + " bins for the variable "
          + str(variableName)
          + " gave only 1 bin. Try increasing the number of bins used by this variable using "
          "setDiscretizationParameters to avoid this error"
        )

      self.totalNumberOfBins += len(binEdges) - 1
      var = gum.DiscretizedVariable(variableName, variableName, binEdges)
      var.setEmpirical(True)

      if usingDefaultParameters:
        self.discretizationParametersDictionary.pop(variableName)

      return var

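  # Type inference performed by the NoDiscretization path of _createVariable (hypothetical columns):
  #   values {1, 2, 3, 4}  ->  gum.RangeVariable("v", "v", 1, 4)               # consecutive integers
  #   values {1, 2, 5}     ->  gum.IntegerVariable("v", "v", [1, 2, 5])        # integers with holes
  #   values {0.5, 1.5}    ->  gum.NumericalDiscreteVariable("v", "v", [0.5, 1.5])
  #   values {"a", "b"}    ->  gum.LabelizedVariable("v", "v", ["a", "b"])
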
  @staticmethod
  def _divideIntervalMDLP(
    minimalValues,
    shannonEntropyByLargeInterval,
    Class0ByLargeInterval,
    Class1ByLargeInterval,
    continueDividingInterval,
    totalCountByLargeInterval,
    position,
    binEdgesIndex,
  ):
    shannonEntropy = shannonEntropyByLargeInterval[position]

    gain = shannonEntropy - minimalValues["classInformationEntropy"]

    # all the 2's here should be replaced by the number of classes present in the interval. However, we know that if
    # the number of classes in the interval is equal to 1, then the shannon entropy will be 0, so the product of the
    # two will be 0.
    deltaS = math.log2(7) - (
      2 * shannonEntropy
      - 2 * minimalValues["leftSubintervalShannonEntropy"]
      - 2 * minimalValues["rightSubintervalShannonEntropy"]
    )

    if (
      gain > (math.log2(totalCountByLargeInterval[position] - 1) + deltaS) / totalCountByLargeInterval[position]
      or len(Class0ByLargeInterval) == 1
    ):
      binEdgesIndex.insert(position, minimalValues["boundaryIndex"])

      Class0ByLargeInterval.insert(position + 1, minimalValues["rightSubintervalClass0"])
      Class1ByLargeInterval.insert(position + 1, minimalValues["rightSubintervalClass1"])
      continueDividingInterval.insert(position + 1, True)
      totalCountByLargeInterval.insert(
        position + 1,
        minimalValues["rightSubintervalClass0"] + minimalValues["rightSubintervalClass1"],
      )
      shannonEntropyByLargeInterval.insert(position + 1, minimalValues["rightSubintervalShannonEntropy"])

      Class0ByLargeInterval[position] = minimalValues["leftSubintervalClass0"]
      Class1ByLargeInterval[position] = minimalValues["leftSubintervalClass1"]
      totalCountByLargeInterval[position] = (
        minimalValues["leftSubintervalClass0"] + minimalValues["leftSubintervalClass1"]
      )
      shannonEntropyByLargeInterval[position] = minimalValues["leftSubintervalShannonEntropy"]

      # if the class information entropy is 0, then we have perfectly cut the interval so that a class only appears
      # on one side, so we do not need to cut any further.
      if minimalValues["leftSubintervalShannonEntropy"] == 0:
        continueDividingInterval[position] = False
      if minimalValues["rightSubintervalShannonEntropy"] == 0:
        continueDividingInterval[position + 1] = False

      # if there are no potential boundary points left in this interval, we can't divide it any further
      if position > 0 and minimalValues["boundaryIndex"] - 1 == binEdgesIndex[position - 1]:
        continueDividingInterval[position] = False

      if minimalValues["boundaryIndex"] == 0:
        continueDividingInterval[position] = False

      if position < len(binEdgesIndex) - 1 and binEdgesIndex[position] + 1 == binEdgesIndex[position + 1]:
        continueDividingInterval[position + 1] = False
      position += 1
    else:
      continueDividingInterval[position] = False

  def discretizedTemplate(self, X, y=None, *, possibleValuesY=None, template=None):
    """
    Returns a graphical model discretized using the suggestions of this DiscreteTypeProcessor for data source X (and for target y).
    This graphical model only contains the discretized variables.
    For instance, it can be used as a template for a BNLearner.

    Parameters
    ----------
    X: {array-like, sparse matrix, pandas or polars dataframe} of shape (n_samples, n_features) or str (filename)
      training data
    y: array-like, pandas or polars dataframe of shape (n_samples,) or str (classname)
      Target values
    possibleValuesY: ndarray
      An ndarray containing all the unique values of y
    template: a graphical model such as pyagrum.BayesNet, pyagrum.MRF, etc.
      the template that will contain the discretized variables.
      If None, a new Bayesian network is created.

    Returns
    -------
    pyagrum.BayesNet or other graphical model:
      the discretized graphical model (only the (discretized) random variables are created in the model)

    Example
    -------
    >>> discretizer = DiscreteTypeProcessor(
    ...   defaultDiscretizationMethod="uniform", defaultNumberOfBins=7, discretizationThreshold=10
    ... )
    >>> learner = gum.BNLearner(data, discretizer.discretizedTemplate(data))
    """
    if template is None:
      template = gum.BayesNet()

    if isinstance(X, str):
      Xp = pandas.read_csv(X)
    elif hasattr(X, "to_pandas"):
      Xp = X.to_pandas()
    else:
      Xp = X

    if isinstance(y, str):
      yp = Xp[y]
    elif y is not None and hasattr(y, "to_pandas"):
      yp = y.to_pandas()
    else:
      yp = y

    for name in Xp:
      template.add(self._createVariable(name, Xp[name], yp, possibleValuesY))
    return template
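
# End-to-end sketch tying the pieces together ("data.csv", the column "age" and the class "Y" are hypothetical):
#   >>> proc = DiscreteTypeProcessor(defaultDiscretizationMethod="quantile", defaultNumberOfBins=5)
#   >>> proc.setDiscretizationParameters("age", "MDLP")         # supervised method for one column
#   >>> template = proc.discretizedTemplate("data.csv", y="Y")  # empty graphical model with typed variables
#   >>> bn = gum.BNLearner("data.csv", template).learnBN()      # learn structure and parameters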