pyAgrum-nightly 2.3.1.9.dev202512261765915415__cp310-abi3-macosx_10_15_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyagrum/__init__.py +165 -0
- pyagrum/_pyagrum.so +0 -0
- pyagrum/bnmixture/BNMInference.py +268 -0
- pyagrum/bnmixture/BNMLearning.py +376 -0
- pyagrum/bnmixture/BNMixture.py +464 -0
- pyagrum/bnmixture/__init__.py +60 -0
- pyagrum/bnmixture/notebook.py +1058 -0
- pyagrum/causal/_CausalFormula.py +280 -0
- pyagrum/causal/_CausalModel.py +436 -0
- pyagrum/causal/__init__.py +81 -0
- pyagrum/causal/_causalImpact.py +356 -0
- pyagrum/causal/_dSeparation.py +598 -0
- pyagrum/causal/_doAST.py +761 -0
- pyagrum/causal/_doCalculus.py +361 -0
- pyagrum/causal/_doorCriteria.py +374 -0
- pyagrum/causal/_exceptions.py +95 -0
- pyagrum/causal/_types.py +61 -0
- pyagrum/causal/causalEffectEstimation/_CausalEffectEstimation.py +1175 -0
- pyagrum/causal/causalEffectEstimation/_IVEstimators.py +718 -0
- pyagrum/causal/causalEffectEstimation/_RCTEstimators.py +132 -0
- pyagrum/causal/causalEffectEstimation/__init__.py +46 -0
- pyagrum/causal/causalEffectEstimation/_backdoorEstimators.py +774 -0
- pyagrum/causal/causalEffectEstimation/_causalBNEstimator.py +324 -0
- pyagrum/causal/causalEffectEstimation/_frontdoorEstimators.py +396 -0
- pyagrum/causal/causalEffectEstimation/_learners.py +118 -0
- pyagrum/causal/causalEffectEstimation/_utils.py +466 -0
- pyagrum/causal/notebook.py +172 -0
- pyagrum/clg/CLG.py +658 -0
- pyagrum/clg/GaussianVariable.py +111 -0
- pyagrum/clg/SEM.py +312 -0
- pyagrum/clg/__init__.py +63 -0
- pyagrum/clg/canonicalForm.py +408 -0
- pyagrum/clg/constants.py +54 -0
- pyagrum/clg/forwardSampling.py +202 -0
- pyagrum/clg/learning.py +776 -0
- pyagrum/clg/notebook.py +480 -0
- pyagrum/clg/variableElimination.py +271 -0
- pyagrum/common.py +60 -0
- pyagrum/config.py +319 -0
- pyagrum/ctbn/CIM.py +513 -0
- pyagrum/ctbn/CTBN.py +573 -0
- pyagrum/ctbn/CTBNGenerator.py +216 -0
- pyagrum/ctbn/CTBNInference.py +459 -0
- pyagrum/ctbn/CTBNLearner.py +161 -0
- pyagrum/ctbn/SamplesStats.py +671 -0
- pyagrum/ctbn/StatsIndepTest.py +355 -0
- pyagrum/ctbn/__init__.py +79 -0
- pyagrum/ctbn/constants.py +54 -0
- pyagrum/ctbn/notebook.py +264 -0
- pyagrum/defaults.ini +199 -0
- pyagrum/deprecated.py +95 -0
- pyagrum/explain/_ComputationCausal.py +75 -0
- pyagrum/explain/_ComputationConditional.py +48 -0
- pyagrum/explain/_ComputationMarginal.py +48 -0
- pyagrum/explain/_CustomShapleyCache.py +110 -0
- pyagrum/explain/_Explainer.py +176 -0
- pyagrum/explain/_Explanation.py +70 -0
- pyagrum/explain/_FIFOCache.py +54 -0
- pyagrum/explain/_ShallCausalValues.py +204 -0
- pyagrum/explain/_ShallConditionalValues.py +155 -0
- pyagrum/explain/_ShallMarginalValues.py +155 -0
- pyagrum/explain/_ShallValues.py +296 -0
- pyagrum/explain/_ShapCausalValues.py +208 -0
- pyagrum/explain/_ShapConditionalValues.py +126 -0
- pyagrum/explain/_ShapMarginalValues.py +191 -0
- pyagrum/explain/_ShapleyValues.py +298 -0
- pyagrum/explain/__init__.py +81 -0
- pyagrum/explain/_explGeneralizedMarkovBlanket.py +152 -0
- pyagrum/explain/_explIndependenceListForPairs.py +146 -0
- pyagrum/explain/_explInformationGraph.py +264 -0
- pyagrum/explain/notebook/__init__.py +54 -0
- pyagrum/explain/notebook/_bar.py +142 -0
- pyagrum/explain/notebook/_beeswarm.py +174 -0
- pyagrum/explain/notebook/_showShapValues.py +97 -0
- pyagrum/explain/notebook/_waterfall.py +220 -0
- pyagrum/explain/shapley.py +225 -0
- pyagrum/lib/__init__.py +46 -0
- pyagrum/lib/_colors.py +390 -0
- pyagrum/lib/bn2graph.py +299 -0
- pyagrum/lib/bn2roc.py +1026 -0
- pyagrum/lib/bn2scores.py +217 -0
- pyagrum/lib/bn_vs_bn.py +605 -0
- pyagrum/lib/cn2graph.py +305 -0
- pyagrum/lib/discreteTypeProcessor.py +1102 -0
- pyagrum/lib/discretizer.py +58 -0
- pyagrum/lib/dynamicBN.py +390 -0
- pyagrum/lib/explain.py +57 -0
- pyagrum/lib/export.py +84 -0
- pyagrum/lib/id2graph.py +258 -0
- pyagrum/lib/image.py +387 -0
- pyagrum/lib/ipython.py +307 -0
- pyagrum/lib/mrf2graph.py +471 -0
- pyagrum/lib/notebook.py +1821 -0
- pyagrum/lib/proba_histogram.py +552 -0
- pyagrum/lib/utils.py +138 -0
- pyagrum/pyagrum.py +31495 -0
- pyagrum/skbn/_MBCalcul.py +242 -0
- pyagrum/skbn/__init__.py +49 -0
- pyagrum/skbn/_learningMethods.py +282 -0
- pyagrum/skbn/_utils.py +297 -0
- pyagrum/skbn/bnclassifier.py +1014 -0
- pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/LICENSE.md +12 -0
- pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/LICENSES/LGPL-3.0-or-later.txt +304 -0
- pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/LICENSES/MIT.txt +18 -0
- pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/METADATA +145 -0
- pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/RECORD +107 -0
- pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/WHEEL +4 -0
@@ -0,0 +1,1014 @@
############################################################################
# This file is part of the aGrUM/pyAgrum library. #
# #
# Copyright (c) 2005-2025 by #
# - Pierre-Henri WUILLEMIN(_at_LIP6) #
# - Christophe GONZALES(_at_AMU) #
# #
# The aGrUM/pyAgrum library is free software; you can redistribute it #
# and/or modify it under the terms of either : #
# #
# - the GNU Lesser General Public License as published by #
# the Free Software Foundation, either version 3 of the License, #
# or (at your option) any later version, #
# - the MIT license (MIT), #
# - or both in dual license, as here. #
# #
# (see https://agrum.gitlab.io/articles/dual-licenses-lgplv3mit.html) #
# #
# This aGrUM/pyAgrum library is distributed in the hope that it will be #
# useful, but WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES MERCHANTABILITY or FITNESS #
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE #
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER #
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, #
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR #
# OTHER DEALINGS IN THE SOFTWARE. #
# #
# See LICENCES for more details. #
# #
# SPDX-FileCopyrightText: Copyright 2005-2025 #
# - Pierre-Henri WUILLEMIN(_at_LIP6) #
# - Christophe GONZALES(_at_AMU) #
# SPDX-License-Identifier: LGPL-3.0-or-later OR MIT #
# #
# Contact : info_at_agrum_dot_org #
# homepage : http://agrum.gitlab.io #
# gitlab : https://gitlab.com/agrumery/agrum #
# #
############################################################################

from typing import List, Union

import pandas
import numpy
import os
import tempfile
import warnings

import sklearn

import pyagrum as gum
from pyagrum.lib.discreteTypeProcessor import DiscreteTypeProcessor
from pyagrum.lib.discreteTypeProcessor import check_int

from ._utils import _ImplementPrior as IPrior
from ._utils import _CalculateThreshold as CThreshold
from ._utils import _DFNames as DFNames
from ._utils import _createCSVfromNDArrays as CSV

from ._MBCalcul import compileMarkovBlanket
from ._MBCalcul import (
  _calcul_proba_for_binary_class,
  _calcul_most_probable_for_nary_class,
  _calcul_proba_for_nary_class,
)

from ._learningMethods import _fitStandard as BN_fitStandard
from ._learningMethods import _fitNaiveBayes as BN_fitNaiveBayes
from ._learningMethods import _fitTAN as BN_fitTAN
from ._learningMethods import _fitChowLiu as BN_fitChowLiu


class BNClassifier(sklearn.base.BaseEstimator, sklearn.base.ClassifierMixin):
  """
  Represents a (scikit-learn compliant) classifier which uses a BN to classify. A BNClassifier is built using

  - a Bayesian network,
  - a database, a learning algorithm and its parameters,
  - a DiscreteTypeProcessor to discretize some variables with different algorithms.

  The classifier can be used to predict the class of new data.

  Warnings
  --------
  This class can be pickled. However, the state of this class is only the classifier itself,
  not the parameters used to train it.


  Parameters
  ----------
  learningMethod: str
    A string designating which learning algorithm we want to use. Possible values are: Chow-Liu, NaiveBayes,
    TAN, MIIC + (MDL or NML), GHC, Tabu.
    GHC designates Greedy Hill Climbing.
    MIIC designates Multivariate Information-based Inductive Causation.
    TAN designates Tree-augmented NaiveBayes.
    Tabu designates Tabu list search.

  prior: str
    A string designating the type of prior we want to use. Possible values are Smoothing,
    BDeu, Dirichlet and NoPrior.
    Note: if using the Dirichlet prior, DirichletCsv cannot be set to None.
    By default (when prior is None), a Smoothing(0.01) prior is applied.

  scoringType: str
    A string designating the scoring method we want to use. Since scoring is used while constructing the
    network and not when learning its parameters, the scoring will be ignored if using a learning algorithm
    with a fixed network structure such as Chow-Liu, TAN or NaiveBayes.
    Possible values are: AIC, BIC, BD, BDeu, K2, Log2.
    AIC means Akaike Information Criterion.
    BIC means Bayesian Information Criterion.
    BD means Bayesian-Dirichlet scoring.
    BDeu means Bayesian-Dirichlet equivalent uniform.
    Log2 means log2 likelihood ratio test.

  constraints: dict()
    A dictionary designating the constraints that we want to put on the structure of the Bayesian network.
    Ignored if using a learning algorithm where the structure is fixed such as TAN or NaiveBayes.
    The keys of the dictionary should be the strings "PossibleEdges", "MandatoryArcs" and "ForbiddenArcs".
    The format of the values should be a tuple of strings (tail,head) which designates the arc from
    tail to head. For example, if we put the value ("x0","y") in MandatoryArcs, the network will surely have
    an arc going from x0 to y.

    Note: between nodes x and y, PossibleEdges allows either (x,y) or (y,x) (or neither of them)
    to be added to the Bayesian network, while the other constraints are not symmetric.

  priorWeight: double
    The weight used for the prior.

  possibleSkeleton: pyagrum.undigraph
    An undirected graph that serves as a possible skeleton for the Bayesian network.

  DirichletCsv: str
    The file name of the csv file we want to use for the Dirichlet prior. Will be ignored if prior is not
    set to Dirichlet.

  discretizationStrategy: str
    Sets the default method of discretization for this discretizer. This method will be used if the user has
    not specified another method for that specific variable using the setDiscretizationParameters method.
    Possible values are: 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM' and 'MDLP'.

  discretizationNbBins: str or int
    Sets the number of bins if the method used is quantile, kmeans or uniform. In this case this parameter can
    also be set to the string 'elbowMethod' so that the best number of bins is found automatically.
    If the method used is NML, this parameter sets the maximum number of bins up to which the NML
    algorithm searches for the optimal number of bins. In this case this parameter must be an int.
    If any other discretization method is used, this parameter is ignored.

  discretizationThreshold: int or float
    When using default parameters, a variable will be treated as continuous only if it has more unique values
    than this number (if the number is an int greater than 1).
    If the number is a float between 0 and 1, we test instead whether the proportion of unique values is bigger
    than this number.
    For instance, if you have entered 0.95, the variable will be treated as continuous only if more than 95%
    of its values are unique.

  usePR: bool
    Indicates whether the threshold is chosen from the Precision-Recall curve or, by default, from the
    ROC curve.
    ROC curves should be used when there are roughly equal numbers of observations for each class.
    Precision-Recall curves should be used when there is a moderate to large class imbalance, especially for
    the target's class.

  beta: float
    If you choose the Precision-Recall curve's threshold, the F-beta score is maximized. By default, beta=1
    to have the F1 score. A value smaller than 1 gives more weight to precision, while a value greater than 1
    gives more weight to recall. For example, beta = 0.5 makes precision twice as important as recall,
    while beta = 2 does the opposite.

  significant_digit: int
    Number of significant digits when computing probabilities.
  """

  def __init__(
    self,
    learningMethod="MIIC",
    prior=None,
    scoringType="BIC",
    constraints=None,
    priorWeight=1,
    possibleSkeleton=None,
    DirichletCsv=None,
    discretizationStrategy="quantile",
    discretizationNbBins=5,
    discretizationThreshold=25,
    usePR=False,
    beta=1,
    significant_digit=10,
  ):
    """
    Parameters
    ----------
    learningMethod: str
      A string designating which type of learning we want to use. Possible values are: Chow-Liu, NaiveBayes,
      TAN, MIIC + (MDL or NML), GHC, Tabu.
      GHC designates Greedy Hill Climbing.
      MIIC designates Multivariate Information-based Inductive Causation.
      TAN designates Tree-augmented NaiveBayes.
      Tabu designates Tabu list search.

    prior: str
      A string designating the type of prior we want to use. Possible values are Smoothing, BDeu,
      Dirichlet and NoPrior.
      Note: if using the Dirichlet prior, DirichletCsv cannot be set to None.

    scoringType: str
      A string designating the type of scoring we want to use. Since scoring is used while constructing the
      network and not when learning its parameters, the scoring will be ignored if using a learning algorithm
      with a fixed network structure such as Chow-Liu, TAN or NaiveBayes.
      Possible values are: AIC, BIC, BD, BDeu, K2, Log2.
      AIC means Akaike Information Criterion.
      BIC means Bayesian Information Criterion.
      BD means Bayesian-Dirichlet scoring.
      BDeu means Bayesian-Dirichlet equivalent uniform.
      Log2 means log2 likelihood ratio test.

    constraints: dict()
      A dictionary designating the constraints that we want to put on the structure of the Bayesian network.
      Ignored if using a learning algorithm where the structure is fixed such as TAN or NaiveBayes.
      The keys of the dictionary should be the strings "PossibleEdges", "MandatoryArcs" and "ForbiddenArcs".
      The format of the values should be a tuple of strings (tail,head) which designates the arc from
      tail to head. For example, if we put the value ("x0","y") in MandatoryArcs, the network will surely have
      an arc going from x0 to y.
      Note: PossibleEdges allows for both (tail,head) and (head,tail) to be added to the Bayesian network,
      while the others are not symmetric.

    priorWeight: double
      The weight used for the prior.

    possibleSkeleton: pyagrum.undigraph
      An undirected graph that serves as a possible skeleton for the Bayesian network.

    DirichletCsv: str
      The file name of the csv file we want to use for the Dirichlet prior. Will be ignored if prior is not
      set to Dirichlet.

    discretizationStrategy: str
      Sets the default method of discretization for this discretizer. This method will be used if the user has
      not specified another method for that specific variable using the setDiscretizationParameters method.
      Possible values are: 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM' and 'MDLP'.

    discretizationNbBins: str or int
      Sets the number of bins if the method used is quantile, kmeans or uniform. In this case this parameter can
      also be set to the string 'elbowMethod' so that the best number of bins is found automatically.
      If the method used is NML, this parameter sets the maximum number of bins up to which the NML
      algorithm searches for the optimal number of bins. In this case this parameter must be an int.
      If any other discretization method is used, this parameter is ignored.

    discretizationThreshold: int or float
      When using default parameters, a variable will be treated as continuous only if it has more unique values
      than this number (if the number is an int greater than 1).
      If the number is a float between 0 and 1, we test instead whether the proportion of unique values is bigger
      than this number.
      For instance, if you have entered 0.95, the variable will be treated as continuous only if more than 95%
      of its values are unique.

    usePR: bool
      Indicates whether the threshold is chosen from the Precision-Recall curve or, by default, from the
      ROC curve.
      ROC curves should be used when there are roughly equal numbers of observations for each class.
      Precision-Recall curves should be used when there is a moderate to large class imbalance, especially for
      the target's class.

    significant_digit: int
      Number of significant digits when computing probabilities.
    """

    # the learning method used
    self.learningMethod = learningMethod

    # used to store the pyAgrum learner object
    self.learner = None

    # used to store the learned Bayesian network
    self.bn = None

    # The threshold used for predicting the class. The algorithm calculates the probability of a certain class;
    # the classifier assigns that class only if the probability is higher than the threshold.
    # The ROC (or Precision-Recall) curve is used to calculate the optimal threshold.
    self.threshold = 0.5
    self.usePR = usePR
    self.beta = beta

    # the type of prior used
    self.prior = prior

    # the weight used for the prior
    self.priorWeight = priorWeight

    # the type of scoring used
    self.scoringType = scoringType

    # the constraints forced onto the structure of the Bayesian network
    self.constraints = constraints

    self.possibleSkeleton = possibleSkeleton

    self.DirichletCsv = DirichletCsv

    self.MarkovBlanket = None

    self.significant_digit = significant_digit

    self.discretizationNbBins = discretizationNbBins
    self.discretizationStrategy = discretizationStrategy
    self.discretizationThreshold = discretizationThreshold
    self.type_processor = DiscreteTypeProcessor(
      defaultDiscretizationMethod=discretizationStrategy,
      defaultNumberOfBins=discretizationNbBins,
      discretizationThreshold=discretizationThreshold,
    )

    # boolean that tells us whether this classifier is obtained from an already trained model (using the function
    # fromTrainedModel) or not
    self.fromModel = False

    self.label = "1.0"

    # the name of the target variable
    self.target = "y"

    # the type of the target variable
    self.targetType = None
    self.isBinaryClassifier = None

    # dict(str:int)
    # The keys of this dictionary are the names of the variables. The value associated to each name is
    # the index of the variable.
    self.variableNameIndexDictionary = None

  def fit(self, X=None, y=None, data=None, targetName=None):
    """
    Fits the model to the training data provided. The two possible uses of this function are `fit(X,y)`
    and `fit(data=...,targetName=...)`. Any other combination will raise a ValueError.

    Parameters
    ----------
    X: {array-like, sparse matrix} of shape (n_samples, n_features)
      training data. Warning: raises ValueError if either data or targetName is not None. Raises ValueError
      if y is None.
    y: array-like of shape (n_samples)
      Target values. Warning: raises ValueError if either data or targetName is not None. Raises ValueError
      if X is None.
    data: Union[str,pandas.DataFrame]
      the source of training data: csv filename or pandas.DataFrame. targetName is mandatory to find the class
      in this source.
    targetName: str
      specifies the name of the target variable in the csv file. Warning: raises ValueError if either X or y is
      not None. Raises ValueError if data is None.
    """
    if data is None:
      if targetName is not None:
        raise ValueError(
          "This function should be used either as fit(X,y) or fit(data=...,targetName=...). You have set "
          "data to None, but have entered a targetName"
        )
      if X is None or y is None:
        raise ValueError(
          "This function should be used either as fit(X,y) or fit(data=...,targetName=...). You have not "
          "entered a data source (filename or pandas.DataFrame) and not specified the X and y matrices"
          " that should be used"
        )
    else:
      if targetName is None:
        raise ValueError(
          "This function should be used either as fit(X,y) or fit(data=...,targetName=...). The name of the "
          "target must be specified if using this function with data containing a csv filename or a pandas.DataFrame."
        )
      if X is not None or y is not None:
        raise ValueError(
          "This function should be used either as fit(X,y) or fit(data=...,targetName=...). You cannot give "
          "data and the X and y matrices at the same time."
        )
      if isinstance(data, str):
        X, y = self.XYfromCSV(data, True, targetName)
      else:  # pandas.DataFrame
        y = data[targetName]
        X = data.drop(targetName, axis=1)

    self.fromModel = False
    variableNames = None
    self.type_processor.clear()

    if isinstance(y, pandas.DataFrame):
      self.target = y.columns.tolist()[0]
      if check_int(self.target):
        self.target = "Y"
    elif type(y) is pandas.core.series.Series:
      self.target = y.name
    else:
      self.target = "y"

    if isinstance(X, pandas.DataFrame):
      variableNames = [f"X{x}" if check_int(x) else x for x in X.columns]

    # verifies the shape of the two arrays
    X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True)

    d = X.shape[1]

    if variableNames is None:
      variableNames = ["x" + str(i) for i in range(d)]

    self.variableNameIndexDictionary = dict()

    for i in range(d):
      self.variableNameIndexDictionary[variableNames[i]] = i

    self.targetType = y.dtype

    possibleValuesY = numpy.unique(y)

    if len(possibleValuesY) == 1:
      raise ValueError("There is only one possible value for Y in the data provided")
    if len(possibleValuesY) > 10:
      warnings.warn(
        f"A classifier with too many possible values for Y (here: {possibleValuesY}) "
        "in the data provided is not meaningful "
        "(please use regression methods instead)."
      )

    self.isBinaryClassifier = len(possibleValuesY) == 2

    self.bn = gum.BayesNet("Template")

    is_int_varY = True
    min_vY = max_vY = None
    for value in possibleValuesY:
      if not check_int(value):
        is_int_varY = False
        break
      else:
        v = int(value)
        if min_vY is None or min_vY > v:
          min_vY = v
        if max_vY is None or max_vY < v:
          max_vY = v

    if is_int_varY:
      if len(possibleValuesY) == max_vY - min_vY + 1:  # no hole in the list of ints
        var = gum.RangeVariable(self.target, self.target, min_vY, max_vY)
      else:
        var = gum.IntegerVariable(self.target, self.target, [int(v) for v in possibleValuesY])
    else:
      var = gum.LabelizedVariable(self.target, self.target, [str(v) for v in possibleValuesY])
    self.bn.add(var)

    for i in range(d):
      var = self.type_processor._createVariable(variableNames[i], X[:, i], y, possibleValuesY)
      self.bn.add(var)

    csvfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfilename = csvfile.name
    csvfilename = tmpfilename + ".csv"
    csvfile.close()

    CSV(X, y, self.target, self.variableNameIndexDictionary, csvfilename)

    self.learner = gum.BNLearner(csvfilename, self.bn)

    IPrior(self.prior, self.learner, self.priorWeight, self.DirichletCsv)

    if self.learningMethod == "NaiveBayes":
      self.bn = BN_fitNaiveBayes(X, y, self.bn, self.learner, variableNames, self.target, self.constraints)
    elif self.learningMethod == "TAN":
      self.bn = BN_fitTAN(X, y, self.bn, self.learner, variableNames, self.target)
    elif self.learningMethod == "Chow-Liu":
      self.bn = BN_fitChowLiu(X, y, self.bn, self.learner, variableNames, self.target)
    else:
      self.bn = BN_fitStandard(
        X, y, self.learner, self.learningMethod, self.possibleSkeleton, self.scoringType, self.constraints
      )

    self.label = self.bn.variableFromName(self.target).labels()[1]

    self.MarkovBlanket = compileMarkovBlanket(self.bn, self.target)

    if self.isBinaryClassifier:
      self.threshold = CThreshold(
        self.MarkovBlanket, self.target, csvfilename, self.usePR, self.beta, self.significant_digit
      )

    os.remove(csvfilename)
    os.remove(tmpfilename)
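
  # Editor's note (illustrative sketch, not part of the packaged file): the two
  # accepted calling conventions of fit, with hypothetical data sources:
  #
  #   >>> clf = BNClassifier(learningMethod="TAN")
  #   >>> clf.fit(data="train.csv", targetName="Y")   # from a csv file
  #   >>> clf.fit(X_train, y_train)                   # from X/y matrices
  #
  # Mixing the two forms (e.g., fit(X_train, data="train.csv")) raises a
  # ValueError, as enforced at the top of the method.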

  def fromTrainedModel(self, bn, targetAttribute, targetModality="", copy=False, threshold=0.5, variableList=None):
    """
    Creates a BN classifier from an already trained pyAgrum Bayesian network.

    Parameters
    ----------
    bn: pyagrum.BayesNet
      The Bayesian network we want to use for this classifier
    targetAttribute: str
      the attribute that will be the target in this classifier
    targetModality: str
      If this is a binary classifier, we have to specify which modality we are looking at if the target
      attribute has more than 2 possible values.
      If !="", a binary classifier is created.
      If =="", a classifier is created that can be non-binary depending on the number of modalities
      for targetAttribute. If binary, the second label is taken as targetModality.
    copy: bool
      Indicates whether we want to put a copy of bn in the classifier, or bn itself.
    threshold: double
      The classification threshold. If the probability that the target modality is true is larger than this
      threshold, we predict that modality.
    variableList: list(str)
      A list of strings. variableList[i] is the name of the variable that has the index i. We use this
      information when calling predict to know which column corresponds to which variable.
      If this list is set to None, we use the order in which the variables were added to the network.
    """

    self.fromModel = True

    # the set of the names of all the variables in the Bayesian network
    namesSet = set(bn.names())

    # The target specified must be a variable in the Bayesian network
    if targetAttribute not in namesSet:
      raise ValueError("the target variable does not appear in the Bayesian network")

    self.target = targetAttribute

    self.learner = None

    if copy:
      self.bn = gum.BayesNet(bn)
    else:
      self.bn = bn

    self.threshold = threshold

    self.MarkovBlanket = compileMarkovBlanket(self.bn, self.target)

    self.variableNameIndexDictionary = dict()
    # if the user specified an order for the variables then we use this order
    if variableList is not None:
      if len(namesSet) - 1 != len(variableList):
        raise ValueError("variableList should include all variables in the Bayesian network except the target")

      i = 0
      for name in variableList:
        if name not in namesSet:
          raise ValueError("variableList includes a name that does not appear in the Bayesian network")
        self.variableNameIndexDictionary[name] = i
        i = i + 1

    # if the user didn't specify an order, we use the order in which the variables were added
    else:
      variableList = bn.names()
      i = 0
      for name in variableList:
        if name == self.target:
          continue
        self.variableNameIndexDictionary[name] = i
        i = i + 1

    if targetModality != "":
      self.isBinaryClassifier = True
      self.label = targetModality
    else:
      if self.bn.variableFromName(self.target).domainSize() == 2:
        self.isBinaryClassifier = True
        self.label = self.bn.variableFromName(self.target).labels()[1]  # we take label 1 as targetModality
      else:
        self.isBinaryClassifier = False
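
  # Editor's note (illustrative sketch, not part of the packaged file):
  # wrapping an existing network as a classifier; the "smoking->cancer" model
  # is a hypothetical example:
  #
  #   >>> import pyagrum as gum
  #   >>> bn = gum.fastBN("smoking->cancer")
  #   >>> clf = BNClassifier()
  #   >>> clf.fromTrainedModel(bn, targetAttribute="cancer", threshold=0.5)
  #   >>> clf.predict([["1"]])  # one row; column order given by bn.names()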

  def changeVariableName(self, oldName, newName):
    """
    Changes the name of a variable inside the Bayesian network

    Parameters
    ----------
    oldName: str
      the old name of the variable
    newName: str
      the new name of the variable
    """
    if oldName == self.target:
      self.bn.changeVariableName(oldName, newName)
      self.target = newName
      self.MarkovBlanket.changeVariableName(oldName, newName)
      return

    if oldName not in self.variableNameIndexDictionary:
      raise ValueError("The oldName you have specified is not a name of a variable in the Bayesian network")
    index = self.variableNameIndexDictionary.pop(oldName)

    self.variableNameIndexDictionary[newName] = index

    self.bn.changeVariableName(oldName, newName)

    if oldName in self.MarkovBlanket.names():
      self.MarkovBlanket.changeVariableName(oldName, newName)

  # ------------------ Markov blanket and predict methods ---------------------

  def predict(self, X, with_labels=True):
    """
    Predicts the most likely class for each row of input data, using the bn's Markov blanket

    Parameters
    ----------
    X: {array-like, sparse matrix} of shape (n_samples, n_features) or str
      test data, can be either a DataFrame, a matrix or the name of a csv file
    with_labels: bool
      tells us whether the csv includes the labels themselves or their indexes.

    Returns
    -------
    y: array-like of shape (n_samples,)
      Predicted classes
    """
    if isinstance(X, str):
      X, _ = self.XYfromCSV(X, target=self.target)

    if isinstance(X, pandas.DataFrame):
      dictName = DFNames(X)
    else:
      dictName = self.variableNameIndexDictionary

    if self.fromModel:
      X = sklearn.utils.check_array(X, dtype="str", ensure_2d=False)
    else:
      X = sklearn.utils.check_array(X, dtype=None, ensure_2d=False)

    if self.isBinaryClassifier:
      returned_list = self._binary_predict(X, dictName)
    else:
      returned_list = self._nary_predict(X, dictName, with_labels)

    returned_list = numpy.array(returned_list)
    if not self.fromModel:
      if self.targetType == "bool":
        returned_list = returned_list == "True"
      elif numpy.issubdtype(self.targetType, numpy.number):
        returned_list = returned_list.astype("float")

    return returned_list
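
  # Editor's note (illustrative sketch, not part of the packaged file): predict
  # accepts a DataFrame, a matrix or a csv filename; "test.csv" and `df` are
  # hypothetical, `clf` is a fitted classifier:
  #
  #   >>> clf.predict("test.csv")             # from a csv file
  #   >>> clf.predict(df.drop("Y", axis=1))   # from a DataFrame
  #
  # For a binary classifier the decision rule is P(target = label | Markov
  # blanket values) >= self.threshold, as implemented in _binary_predict below.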

  def _nary_predict(self, X, dictName, with_labels) -> Union[List[str], List[int]]:
    """
    For an n-ary classifier, predicts the most likely class for each row of input data, using the bn's
    Markov blanket

    Parameters
    ----------
    X: {array-like, sparse matrix} of shape (n_samples, n_features)
      the test data
    dictName: Dict[str,int]
      dictionary mapping the name of a variable to its column in the database
    with_labels: bool
      whether `X` contains the labels themselves or their ids.

    Returns
    -------
    array-like of shape (n_samples,)
      the list of predictions, as label ids or label names.
    """
    returned_list = []
    I = self.MarkovBlanket.completeInstantiation()
    I.erase(self.target)
    for x in X:
      vals, _ = _calcul_most_probable_for_nary_class(x, I, dictName, self.MarkovBlanket, self.target)
      if with_labels:
        returned_list.append(self.MarkovBlanket.variable(self.target).label(vals[0][0][self.target]))
      else:
        returned_list.append(vals[0][0][self.target])

    return returned_list

  def _binary_predict(self, X, dictName) -> Union[List[str], List[bool]]:
    """
    For a binary classifier, predicts the most likely class for each row of input data, using the bn's
    Markov blanket

    Parameters
    ----------
    X: {array-like, sparse matrix} of shape (n_samples, n_features)
      the test data
    dictName: Dict[str,int]
      dictionary mapping the name of a variable to its column in the database

    Returns
    -------
    array-like of shape (n_samples,)
      the list of predictions
    """
    returned_list = []
    # list of the other labels of the target
    labels = [
      self.bn.variable(self.target).label(i)
      for i in range(self.bn.variable(self.target).domainSize())
      if self.bn.variable(self.target).label(i) != self.label
    ]

    # negative value to add to the returned list
    label0 = labels[0]
    # label of the target
    label1 = self.label
    # Instantiation used to plug the values of the database into the Markov blanket
    I = self.MarkovBlanket.completeInstantiation()
    # iterate over the rows of the database
    for x in X:
      res = round(
        _calcul_proba_for_binary_class(x, label1, labels, I, dictName, self.MarkovBlanket, self.target),
        self.significant_digit,
      )

      if res >= self.threshold:  # positive value predicted
        if self.fromModel:
          returned_list.append(True)
        else:
          returned_list.append(label1)
      else:  # negative value predicted
        if self.fromModel:
          returned_list.append(False)
        else:
          returned_list.append(label0)

    return returned_list
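
  # Editor's note (illustrative sketch, not part of the packaged file): a tiny
  # numeric view of the decision rule above, with a hypothetical threshold of
  # 0.4 (e.g., chosen on a Precision-Recall curve by fit when usePR=True):
  #
  #   >>> res = 0.45     # P(target = label1 | evidence)
  #   >>> res >= 0.4     # -> the positive label is predicted
  #   True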

  # ------------------ interaction with sklearn, for ROC and Precision-Recall ---------------------

  def predict_proba(self, X):
    """
    Predicts the probability of each class for each row of input data, using the bn's Markov blanket

    Parameters
    ----------
    X: {array-like, sparse matrix} of shape (n_samples, n_features) or str
      test data, can be either a DataFrame, a matrix or the name of a csv file

    Returns
    -------
    array-like of shape (n_samples, n_classes)
      Predicted probabilities for each class
    """
    # dictionary mapping the name of a variable to its column in the database
    dictName = self.variableNameIndexDictionary

    if isinstance(X, pandas.DataFrame):
      dictName = DFNames(X)
      vals = X.to_numpy()
    elif isinstance(X, str):
      vals, _ = self.XYfromCSV(X, target=self.target)
      dictName = DFNames(vals)
      vals = vals.to_numpy()
    else:
      vals = X

    if self.fromModel:
      vals = sklearn.utils.check_array(vals, dtype="str", ensure_2d=False)
    else:
      sklearn.utils.check_array(vals, dtype=None, ensure_2d=False)

    returned_list = []

    # label of the target
    label1 = self.label
    # list of the other labels of the target
    labels = [
      self.bn.variable(self.target).label(i)
      for i in range(self.bn.variable(self.target).domainSize())
      if self.bn.variable(self.target).label(i) != self.label
    ]

    # Instantiation used to plug the values of the database into the Markov blanket
    I = self.MarkovBlanket.completeInstantiation()

    # iterate over the rows of the database
    if self.isBinaryClassifier:
      for x in vals:
        res = round(
          _calcul_proba_for_binary_class(x, label1, labels, I, dictName, self.MarkovBlanket, self.target),
          self.significant_digit,
        )
        returned_list.append([1 - res, res])
    else:
      local_inst = gum.Instantiation(I)
      local_inst.erase(self.target)
      for x in vals:
        returned_list.append(
          _calcul_proba_for_nary_class(x, local_inst, dictName, self.MarkovBlanket, self.target).tolist()
        )

    return numpy.array(returned_list)
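
  # Editor's note (illustrative sketch, not part of the packaged file): for a
  # binary classifier, column 1 holds the probability of the positive label,
  # which plugs directly into sklearn metrics (hypothetical X_test/y_test):
  #
  #   >>> from sklearn.metrics import roc_auc_score
  #   >>> roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])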

  # ------------------ making pyAgrum's BNClassifier compatible with sklearn ---------------------

  def XYfromCSV(self, filename, with_labels=True, target=None):
    """
    Reads the data from a csv file and separates it into an X matrix and a y column vector.

    Parameters
    ----------
    filename: str
      the name of the csv file
    with_labels: bool
      tells us whether the csv includes the labels themselves or their indexes.
    target: str or None
      The name of the column that will be put in the dataframe y. If target is None, we use the target that is
      already specified in the classifier

    Returns
    -------
    Tuple(pandas.Dataframe,pandas.Dataframe)
      the matrix X containing the data and the column vector y containing the class for each row of X
    """
    if self.fromModel:
      dataframe = pandas.read_csv(filename, dtype="str")
    else:
      dataframe = pandas.read_csv(filename)

    if target is None:
      target = self.target
    y = dataframe[target]
    X = dataframe.drop(target, axis=1)

    if not with_labels:
      variableList = X.columns.tolist()
      targetVariable = self.bn.variableFromName(target)
      for index in range(len(variableList)):
        variableList[index] = self.bn.variableFromName(variableList[index])
      for row in X:
        for i in range(len(row)):
          row[i] = variableList[i].label(int(row[i]))  # map label indexes to label names
      if self.fromModel:
        if self.isBinaryClassifier:
          labelIndex = 0
          labelList = targetVariable.labels()
          while labelIndex < len(labelList):
            if labelList[labelIndex] == self.label:
              break
            labelIndex += 1
          y = y == labelIndex
        else:
          for index in range(len(y)):
            y[index] = targetVariable.label(int(y[index]))  # map label indexes to label names

    elif self.fromModel:
      y = y.astype("str")
      if self.isBinaryClassifier:
        y = y == self.label

    return X, y
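
  # Editor's note (illustrative sketch, not part of the packaged file):
  # splitting a hypothetical "test.csv" whose class column is "Y":
  #
  #   >>> X_test, y_test = clf.XYfromCSV("test.csv", with_labels=True, target="Y")
  #   >>> X_test.shape, y_test.shape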

  def preparedData(self, X=None, y=None, data=None):
    """
    Given an X and a y (or a data source: filename or pandas.DataFrame),
    returns a pandas.DataFrame with the prepared (especially discretized) values of the database

    Parameters
    ----------
    X: {array-like, sparse matrix} of shape (n_samples, n_features)
      training data. Warning: raises ValueError if data is not None. Raises ValueError
      if y is None.
    y: array-like of shape (n_samples)
      Target values. Warning: raises ValueError if data is not None. Raises ValueError
      if X is None.
    data: Union[str,pandas.DataFrame]
      specifies the csv file or the DataFrame where the data values are located. Warning: raises ValueError
      if either X or y is not None.

    Returns
    -------
    pandas.DataFrame
    """
    if self.variableNameIndexDictionary is None:
      raise ValueError("First, you need to fit a model!")

    targetName = self.target
    if data is None:
      if X is None or y is None:
        raise ValueError(
          "This function should be used either as preparedData(X,y) or preparedData(data=...). You have not "
          "entered a csv file name and not specified the X and y matrices that should be used"
        )
    else:
      if targetName is None:
        raise ValueError(
          "This function should be used either as preparedData(X,y) or preparedData(data=...). The name of the "
          "target must be specified if using this function with a csv file."
        )
      if X is not None or y is not None:
        raise ValueError(
          "This function should be used either as preparedData(X,y) or preparedData(data=...). You have entered "
          "a data source and the X and y matrices at the same time."
        )

      if isinstance(data, str):
        X, y = self.XYfromCSV(data, True, targetName)
      else:  # pandas.DataFrame
        y = data[targetName]
        X = data.drop(targetName, axis=1)

    def bestTypedVal(v, idx):
      if v.varType() == gum.VarType_DISCRETIZED:
        return v.label(idx)
      elif v.varType() == gum.VarType_INTEGER:
        return int(v.numerical(idx))
      elif v.varType() == gum.VarType_LABELIZED:
        return v.label(idx)
      elif v.varType() == gum.VarType_RANGE:
        return int(v.numerical(idx))
      elif v.varType() == gum.VarType_NUMERICAL:
        return float(v.numerical(idx))
      else:
        raise gum.NotFound("This type of variable does not exist yet.")

    reverse = {v: k for k, v in self.variableNameIndexDictionary.items()}
    if isinstance(X, pandas.DataFrame):  # to be sure of the names of the columns
      X = X.rename(columns=reverse)
    varY = self.bn.variable(self.target)
    df = pandas.DataFrame([], columns=[reverse[k] for k in range(len(reverse))] + [self.target])

    for n in range(len(X)):
      ligne = []  # one prepared row
      for k in range(len(reverse)):
        if isinstance(X, pandas.DataFrame):
          val = X[reverse[k]][n]
        else:  # numpy.ndarray
          val = X[n][k]
        var = self.bn.variable(reverse[k])
        ligne.append(bestTypedVal(var, var[str(val)]))

      ligne.append(bestTypedVal(varY, varY[str(y[n])]))
      df.loc[len(df)] = ligne

    return df
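
  # Editor's note (illustrative sketch, not part of the packaged file):
  # inspecting how the classifier discretized a hypothetical training
  # DataFrame `df` with target "Y":
  #
  #   >>> clf.fit(data=df, targetName="Y")
  #   >>> clf.preparedData(data=df).head()   # discretized/typed view of the base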

  def __getstate__(self):
    """
    Returns the state of the object (used for pickling)

    Warnings
    --------
    This method is used for pickling and should not be called directly.
    Note that the "learning side" of the BNClassifier is not in the state, which focuses on the classifier itself.

    Returns
    -------
    the state as a dictionary.
    """
    return {
      "bn": self.bn,
      "target": self.target,
      "targetType": self.targetType,
      "label": self.label,
      "fromModel": self.fromModel,
      "threshold": self.threshold,
      "variableNameIndexDictionary": self.variableNameIndexDictionary,
      "params": self.get_params(),
      "discretizer": self.type_processor,
    }

  def __setstate__(self, state):
    """
    Sets the state of the object (used for pickling)

    Warnings
    --------
    This method is used for pickling and should not be called directly.
    Note that the "learning side" of the BNClassifier is not in the state, which focuses on the classifier itself.

    Parameters
    ----------
    state : dict
      the state of the object as a dictionary

    Returns
    -------
    self
    """
    self.__init__()
    self.fromTrainedModel(bn=state["bn"], targetAttribute=state["target"], targetModality=state["label"], copy=False)
    self.targetType = state["targetType"]
    self.fromModel = state["fromModel"]
    self.threshold = state["threshold"]
    self.variableNameIndexDictionary = state["variableNameIndexDictionary"]
    self.set_params(**state["params"])
    self.type_processor = state["discretizer"]  # restore the DiscreteTypeProcessor saved by __getstate__
    return self
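
  # Editor's note (illustrative sketch, not part of the packaged file): the
  # pickle round-trip enabled by __getstate__/__setstate__. Only the classifier
  # state (bn, target, threshold, ...) survives, not the learning side:
  #
  #   >>> import pickle
  #   >>> blob = pickle.dumps(clf)
  #   >>> clf2 = pickle.loads(blob)
  #   >>> clf2.predict(X_test)   # same predictions, but clf2.learner is None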

  def showROC_PR(self, data, *, beta=1, save_fig=False, show_progress=False, bgcolor=None):
    """
    Uses the `pyagrum.lib.bn2roc` tools to create the ROC and Precision-Recall curves

    Parameters
    ----------
    data: str | DataFrame
      a csv filename or a DataFrame
    beta: float
      the value of beta for the F-beta score
    save_fig: bool
      whether the figure should be saved
    show_progress: bool
      indicates whether progress is displayed while computing the curves
    bgcolor: str
      HTML background color for the figure (default: None, i.e., transparent)
    """
    import pyagrum.lib.bn2roc as bn2roc

    bn2roc.showROC_PR(
      self.bn,
      data,
      self.target,
      self.label,
      beta=beta,
      significant_digits=self.significant_digit,
      save_fig=save_fig,
      show_progress=show_progress,
      bgcolor=bgcolor,
    )
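
  # Editor's note (illustrative sketch, not part of the packaged file):
  # plotting both curves for a fitted binary classifier on a hypothetical
  # "test.csv":
  #
  #   >>> clf.showROC_PR("test.csv", beta=1, show_progress=True)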