pyAgrum-nightly 2.3.1.9.dev202512261765915415-cp310-abi3-macosx_10_15_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. pyagrum/__init__.py +165 -0
  2. pyagrum/_pyagrum.so +0 -0
  3. pyagrum/bnmixture/BNMInference.py +268 -0
  4. pyagrum/bnmixture/BNMLearning.py +376 -0
  5. pyagrum/bnmixture/BNMixture.py +464 -0
  6. pyagrum/bnmixture/__init__.py +60 -0
  7. pyagrum/bnmixture/notebook.py +1058 -0
  8. pyagrum/causal/_CausalFormula.py +280 -0
  9. pyagrum/causal/_CausalModel.py +436 -0
  10. pyagrum/causal/__init__.py +81 -0
  11. pyagrum/causal/_causalImpact.py +356 -0
  12. pyagrum/causal/_dSeparation.py +598 -0
  13. pyagrum/causal/_doAST.py +761 -0
  14. pyagrum/causal/_doCalculus.py +361 -0
  15. pyagrum/causal/_doorCriteria.py +374 -0
  16. pyagrum/causal/_exceptions.py +95 -0
  17. pyagrum/causal/_types.py +61 -0
  18. pyagrum/causal/causalEffectEstimation/_CausalEffectEstimation.py +1175 -0
  19. pyagrum/causal/causalEffectEstimation/_IVEstimators.py +718 -0
  20. pyagrum/causal/causalEffectEstimation/_RCTEstimators.py +132 -0
  21. pyagrum/causal/causalEffectEstimation/__init__.py +46 -0
  22. pyagrum/causal/causalEffectEstimation/_backdoorEstimators.py +774 -0
  23. pyagrum/causal/causalEffectEstimation/_causalBNEstimator.py +324 -0
  24. pyagrum/causal/causalEffectEstimation/_frontdoorEstimators.py +396 -0
  25. pyagrum/causal/causalEffectEstimation/_learners.py +118 -0
  26. pyagrum/causal/causalEffectEstimation/_utils.py +466 -0
  27. pyagrum/causal/notebook.py +172 -0
  28. pyagrum/clg/CLG.py +658 -0
  29. pyagrum/clg/GaussianVariable.py +111 -0
  30. pyagrum/clg/SEM.py +312 -0
  31. pyagrum/clg/__init__.py +63 -0
  32. pyagrum/clg/canonicalForm.py +408 -0
  33. pyagrum/clg/constants.py +54 -0
  34. pyagrum/clg/forwardSampling.py +202 -0
  35. pyagrum/clg/learning.py +776 -0
  36. pyagrum/clg/notebook.py +480 -0
  37. pyagrum/clg/variableElimination.py +271 -0
  38. pyagrum/common.py +60 -0
  39. pyagrum/config.py +319 -0
  40. pyagrum/ctbn/CIM.py +513 -0
  41. pyagrum/ctbn/CTBN.py +573 -0
  42. pyagrum/ctbn/CTBNGenerator.py +216 -0
  43. pyagrum/ctbn/CTBNInference.py +459 -0
  44. pyagrum/ctbn/CTBNLearner.py +161 -0
  45. pyagrum/ctbn/SamplesStats.py +671 -0
  46. pyagrum/ctbn/StatsIndepTest.py +355 -0
  47. pyagrum/ctbn/__init__.py +79 -0
  48. pyagrum/ctbn/constants.py +54 -0
  49. pyagrum/ctbn/notebook.py +264 -0
  50. pyagrum/defaults.ini +199 -0
  51. pyagrum/deprecated.py +95 -0
  52. pyagrum/explain/_ComputationCausal.py +75 -0
  53. pyagrum/explain/_ComputationConditional.py +48 -0
  54. pyagrum/explain/_ComputationMarginal.py +48 -0
  55. pyagrum/explain/_CustomShapleyCache.py +110 -0
  56. pyagrum/explain/_Explainer.py +176 -0
  57. pyagrum/explain/_Explanation.py +70 -0
  58. pyagrum/explain/_FIFOCache.py +54 -0
  59. pyagrum/explain/_ShallCausalValues.py +204 -0
  60. pyagrum/explain/_ShallConditionalValues.py +155 -0
  61. pyagrum/explain/_ShallMarginalValues.py +155 -0
  62. pyagrum/explain/_ShallValues.py +296 -0
  63. pyagrum/explain/_ShapCausalValues.py +208 -0
  64. pyagrum/explain/_ShapConditionalValues.py +126 -0
  65. pyagrum/explain/_ShapMarginalValues.py +191 -0
  66. pyagrum/explain/_ShapleyValues.py +298 -0
  67. pyagrum/explain/__init__.py +81 -0
  68. pyagrum/explain/_explGeneralizedMarkovBlanket.py +152 -0
  69. pyagrum/explain/_explIndependenceListForPairs.py +146 -0
  70. pyagrum/explain/_explInformationGraph.py +264 -0
  71. pyagrum/explain/notebook/__init__.py +54 -0
  72. pyagrum/explain/notebook/_bar.py +142 -0
  73. pyagrum/explain/notebook/_beeswarm.py +174 -0
  74. pyagrum/explain/notebook/_showShapValues.py +97 -0
  75. pyagrum/explain/notebook/_waterfall.py +220 -0
  76. pyagrum/explain/shapley.py +225 -0
  77. pyagrum/lib/__init__.py +46 -0
  78. pyagrum/lib/_colors.py +390 -0
  79. pyagrum/lib/bn2graph.py +299 -0
  80. pyagrum/lib/bn2roc.py +1026 -0
  81. pyagrum/lib/bn2scores.py +217 -0
  82. pyagrum/lib/bn_vs_bn.py +605 -0
  83. pyagrum/lib/cn2graph.py +305 -0
  84. pyagrum/lib/discreteTypeProcessor.py +1102 -0
  85. pyagrum/lib/discretizer.py +58 -0
  86. pyagrum/lib/dynamicBN.py +390 -0
  87. pyagrum/lib/explain.py +57 -0
  88. pyagrum/lib/export.py +84 -0
  89. pyagrum/lib/id2graph.py +258 -0
  90. pyagrum/lib/image.py +387 -0
  91. pyagrum/lib/ipython.py +307 -0
  92. pyagrum/lib/mrf2graph.py +471 -0
  93. pyagrum/lib/notebook.py +1821 -0
  94. pyagrum/lib/proba_histogram.py +552 -0
  95. pyagrum/lib/utils.py +138 -0
  96. pyagrum/pyagrum.py +31495 -0
  97. pyagrum/skbn/_MBCalcul.py +242 -0
  98. pyagrum/skbn/__init__.py +49 -0
  99. pyagrum/skbn/_learningMethods.py +282 -0
  100. pyagrum/skbn/_utils.py +297 -0
  101. pyagrum/skbn/bnclassifier.py +1014 -0
  102. pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/LICENSE.md +12 -0
  103. pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/LICENSES/LGPL-3.0-or-later.txt +304 -0
  104. pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/LICENSES/MIT.txt +18 -0
  105. pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/METADATA +145 -0
  106. pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/RECORD +107 -0
  107. pyagrum_nightly-2.3.1.9.dev202512261765915415.dist-info/WHEEL +4 -0
@@ -0,0 +1,1102 @@
+ ############################################################################
+ # This file is part of the aGrUM/pyAgrum library. #
+ # #
+ # Copyright (c) 2005-2025 by #
+ # - Pierre-Henri WUILLEMIN(_at_LIP6) #
+ # - Christophe GONZALES(_at_AMU) #
+ # #
+ # The aGrUM/pyAgrum library is free software; you can redistribute it #
+ # and/or modify it under the terms of either : #
+ # #
+ # - the GNU Lesser General Public License as published by #
+ # the Free Software Foundation, either version 3 of the License, #
+ # or (at your option) any later version, #
+ # - the MIT license (MIT), #
+ # - or both in dual license, as here. #
+ # #
+ # (see https://agrum.gitlab.io/articles/dual-licenses-lgplv3mit.html) #
+ # #
+ # This aGrUM/pyAgrum library is distributed in the hope that it will be #
+ # useful, but WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, #
+ # INCLUDING BUT NOT LIMITED TO THE WARRANTIES MERCHANTABILITY or FITNESS #
+ # FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE #
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER #
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, #
+ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR #
+ # OTHER DEALINGS IN THE SOFTWARE. #
+ # #
+ # See LICENCES for more details. #
+ # #
+ # SPDX-FileCopyrightText: Copyright 2005-2025 #
+ # - Pierre-Henri WUILLEMIN(_at_LIP6) #
+ # - Christophe GONZALES(_at_AMU) #
+ # SPDX-License-Identifier: LGPL-3.0-or-later OR MIT #
+ # #
+ # Contact : info_at_agrum_dot_org #
+ # homepage : http://agrum.gitlab.io #
+ # gitlab : https://gitlab.com/agrumery/agrum #
+ # #
+ ############################################################################
+
+ """
+ This module contains the DiscreteTypeProcessor class, used to infer a type for each variable from the values in a database before learning a (discrete) Graphical Model.
+ In particular, the DiscreteTypeProcessor class is used to discretize some continuous variables in a database.
+
+ The discretization is done using one of the following methods: 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM' and 'MDLP'.
+ Some methods need specialized treatment of the arguments for classification (y separated from X).
+ """
+
+ import math
+
+ import numpy
+ import pandas
+ import scipy
+ import sklearn
+ import sklearn.preprocessing as skp
+
+ from typing import Any
+
+ import pyagrum as gum
+
+
+ def check_int(v: Any) -> bool:
+   """
+   Test if v is an int or a str representing an int.
+   """
+   if isinstance(v, (bool, numpy.bool_)):
+     return False
+   if isinstance(v, (int, numpy.integer)):
+     return True
+
+   if isinstance(v, str):
+     if v and v[0] in ("-", "+"):  # guard against the empty string before indexing
+       return v[1:].isdigit()
+     return v.isdigit()
+   return False
+
+
+ def check_float(v: Any) -> bool:
+   """
+   Test if v is a float or a str representing a float.
+
+   Parameters
+   ----------
+   v : Any
+     the value to test
+
+   Returns
+   -------
+   bool:
+     True if v is a float or a str representing a float
+   """
+   if isinstance(v, (bool, numpy.bool_)):
+     return False
+
+   try:
+     float(v)
+     return True
+   except (TypeError, ValueError):  # float(None) raises TypeError, float("abc") raises ValueError
+     return False
+
+
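+ # A quick sanity check of the two helpers above (an editorial sketch, not part of
+ # the original file): booleans are deliberately rejected, strings are parsed.
+ # >>> check_int("42"), check_int("+3"), check_int(True), check_int("3.5")
+ # (True, True, False, False)
+ # >>> check_float("3.5"), check_float("abc"), check_float(numpy.float64(1.0))
+ # (True, False, True)
+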
+ class DiscreteTypeProcessor:
+   """
+   Represents a tool to process the types of the variables in a database in order to learn a discrete pyAgrum
+   Graphical Model. In particular, the DiscreteTypeProcessor class is used to discretize some continuous variables
+   in a database.
+
+   Warnings
+   --------
+   - The data are tabular (X and possibly y): the columns are the variables and the rows are the samples. In general,
+     X can be replaced by the name of a CSV file.
+   - In the case of a classification, y is the class variable and X contains the features. y does not have to be
+     binary.
+
+   Parameters
+   ----------
+   defaultDiscretizationMethod: str
+     sets the default discretization method for this discretizer. Possible values are: `quantile`, `uniform`,
+     `kmeans`, `NML`, `CAIM` and `MDLP`. This method will be used unless the user has specified another method
+     for a specific variable using the setDiscretizationParameters method.
+   defaultNumberOfBins: str or int
+     sets the number of bins if the method used is `quantile`, `kmeans` or `uniform`. In this case this parameter can
+     also be set to the string `elbowMethod` so that the best number of bins is found automatically.
+     If the method used is `NML`, this parameter sets the maximum number of bins up to which the NML algorithm
+     searches for the optimal number of bins; in this case this parameter must be an int.
+     If any other discretization method is used, this parameter is ignored.
+   discretizationThreshold: int or float
+     When using default parameters, a variable will be treated as continuous only if it has more unique values than
+     this number (if the number is an int greater than 1).
+     If the number is a float between 0 and 1, we test whether the proportion of unique values is bigger than this
+     number. For example, if you enter 0.95, the variable will be treated as continuous only if more than 95%
+     of its values are unique.
+   """
+
+   def __init__(
+     self,
+     defaultDiscretizationMethod="quantile",
+     defaultNumberOfBins=10,
+     discretizationThreshold=25,
+   ):
+     """
+     Initializes the DiscreteTypeProcessor object.
+
+     Parameters
+     ----------
+     defaultDiscretizationMethod: str
+       sets the default discretization method for this discretizer. Possible values are: 'quantile', 'uniform',
+       'kmeans', 'NML', 'CAIM' and 'MDLP'. This method will be used unless the user has specified another method
+       for a specific variable using the setDiscretizationParameters method.
+     defaultNumberOfBins: str or int
+       sets the number of bins if the method used is quantile, kmeans or uniform. In this case this parameter can
+       also be set to the string 'elbowMethod' so that the best number of bins is found automatically.
+       If the method used is NML, this parameter sets the maximum number of bins up to which the NML algorithm
+       searches for the optimal number of bins; in this case this parameter must be an int.
+       If any other discretization method is used, this parameter is ignored.
+     discretizationThreshold: int or float
+       When using default parameters, a variable will be treated as continuous only if it has more unique values
+       than this number (if the number is an int greater than 1).
+       If the number is a float between 0 and 1, we test whether the proportion of unique values is bigger than
+       this number. For example, if you enter 0.95, the variable will be treated as continuous only if more
+       than 95% of its values are unique.
+     """
+     self.discretizationParametersDictionary = {}
+     self.numberOfContinuous = 0
+     self.totalNumberOfBins = 0
+     self.defaultMethod = None
+     self.defaultParamDiscretizationMethod = None
+     self.setDiscretizationParameters(None, defaultDiscretizationMethod, defaultNumberOfBins)
+
+     self.discretizationThreshold = discretizationThreshold
+
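+   # A minimal construction sketch (editorial, not part of the original file): with
+   # the defaults below, any numeric column with more than 25 unique values is
+   # discretized into 10 quantile bins.
+   # >>> dtp = DiscreteTypeProcessor()
+   # >>> dtp = DiscreteTypeProcessor(defaultDiscretizationMethod="kmeans", defaultNumberOfBins="elbowMethod")
+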
+   def clear(self, clearDiscretizationParameters=False):
+     """
+     Sets the number of continuous variables and the total number of bins created by this discretizer back to 0. If
+     clearDiscretizationParameters is True, also clears the discretization parameters the user has set for each
+     variable.
+
+     Parameters
+     ----------
+     clearDiscretizationParameters: bool
+       if True, this method also clears the parameters the user has set for each variable and resets them to the
+       default.
+     """
+     self.numberOfContinuous = 0
+     self.totalNumberOfBins = 0
+     if clearDiscretizationParameters:
+       self.discretizationParametersDictionary = {}
+
+   @gum.deprecated_arg(newA="parameters", oldA="paramDiscretizationMethod", version="2.0.0")
+   def setDiscretizationParameters(self, variableName: str, method: str, parameters: Any = None):
+     """
+     Sets the discretization parameters for a variable. If variableName is None, sets the default parameters.
+
+     Parameters
+     ----------
+     variableName: str
+       the name of the variable whose discretization parameters you want to set. Set to None to set the new
+       default.
+     method: str
+       The discretization method used for this variable. Use "NoDiscretization" if you do not want to discretize
+       this variable. Possible values are: 'NoDiscretization', 'quantile', 'uniform', 'kmeans', 'NML', 'CAIM',
+       'MDLP' and 'expert'.
+     parameters:
+       Each discretization method has a parameter that can be set.
+       - 'quantile': the number of bins
+       - 'kmeans', 'uniform': the number of bins. The parameter can also be set to the string 'elbowMethod' so that
+         the best number of bins is found automatically.
+       - 'NML': this parameter sets the maximum number of bins up to which the NML algorithm searches for the
+         optimal number of bins.
+       - 'MDLP', 'CAIM': this parameter is ignored
+       - 'expert': this parameter is the list of ticks proposed by the expert. The discretized variable will set the
+         flag 'empirical', which means that values found in the data that do not fall in the proposed intervals do
+         not raise an exception but are nevertheless accepted (as belonging to the smallest or biggest interval).
+       - 'NoDiscretization': this parameter is a superset of the values for the variable found in the database (or
+         None).
+     """
+     if parameters is None:
+       parameters = self.defaultParamDiscretizationMethod
+
+     match method:
+       case "quantile" | "NML":
+         if type(parameters) is not int:
+           raise ValueError(
+             "The parameter for the quantile/NML method must be an integer. You have entered: " + str(parameters)
+           )
+       case "kmeans" | "uniform":
+         if type(parameters) is not int and str(parameters) != "elbowMethod":
+           raise ValueError(
+             "The parameter for the kmeans/uniform method must be an integer or the string 'elbowMethod'. You have entered: "
+             + str(parameters)
+           )
+       case "expert":
+         if not (isinstance(parameters, list) and all(map(check_float, parameters))):
+           raise ValueError(
+             "The parameter for the expert method must be a list of float. You have entered: " + str(parameters)
+           )
+       case "NoDiscretization":
+         if parameters is not None and not (isinstance(parameters, str)):
+           raise ValueError(
+             "The parameter for the NoDiscretization method must be a string (fastVar syntax) or None. You have "
+             "entered: " + str(parameters)
+           )
+       case "CAIM" | "MDLP":
+         pass
+       case _:
+         raise ValueError(
+           "This discretization method is not recognized! Possible values are kmeans, uniform, quantile, NML, "
+           "CAIM, MDLP, NoDiscretization or expert. You have entered " + str(method)
+         )
+
+     if variableName is None:
+       self.defaultMethod = method
+       self.defaultParamDiscretizationMethod = parameters
+     else:
+       self.discretizationParametersDictionary[variableName] = {}
+       self.discretizationParametersDictionary[variableName]["method"] = method
+       self.discretizationParametersDictionary[variableName]["param"] = parameters
+
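+   # Per-variable configuration sketches (editorial, not part of the original file;
+   # the column names "age", "height", "city" and "weight" are made up):
+   # >>> dtp.setDiscretizationParameters("age", "quantile", 4)
+   # >>> dtp.setDiscretizationParameters("height", "expert", [0.0, 150.0, 180.0, 210.0])
+   # >>> dtp.setDiscretizationParameters("city", "NoDiscretization", "{Paris|London|Rome}")
+   # >>> dtp.setDiscretizationParameters("weight", "kmeans", "elbowMethod")
+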
+   def audit(self, X, y=None):
+     """
+     Audits the passed values of X and y: guesses which columns in X are already discrete and which need to
+     be discretized, as well as the discretization algorithm that will be used to discretize them. The suggested
+     parameters will be used when creating the variables. To override them, the user can manually set the
+     discretization parameters for each variable using the setDiscretizationParameters method.
+
+     Parameters
+     ----------
+     X: {array-like, pandas or polars dataframe} of shape (n_samples, n_features) or str (filename)
+       training data
+     y: {array-like, pandas or polars dataframe} of shape (n_samples,) or str (classname)
+       Target values
+
+     Returns
+     -------
+     Dict
+       for each variable, the proposed audit
+     """
+     if isinstance(X, str):
+       Xp = pandas.read_csv(X)
+     elif hasattr(X, "to_pandas"):  # for instance, polars dataframe
+       Xp = X.to_pandas()
+     else:
+       Xp = X
+
+     if isinstance(y, str):
+       yp = Xp[y]
+     elif y is not None and hasattr(y, "to_pandas"):  # for instance, polars dataframe
+       yp = y.to_pandas()
+     else:
+       yp = y
+
+     return self._audit(Xp, yp)
+
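+   # Audit sketch (editorial, not part of the original file): with the default
+   # threshold of 25 unique values, an integer column with a handful of values is
+   # kept discrete, while a float column with many distinct values is proposed for
+   # quantile discretization.
+   # >>> df = pandas.DataFrame({"age": numpy.random.randint(3, size=100), "size": numpy.random.rand(100)})
+   # >>> DiscreteTypeProcessor().audit(df)
+   # {'age': {'method': 'NoDiscretization', ...}, 'size': {'method': 'quantile', ...}}
+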
+   def _audit(self, X, y=None):
+     """
+     Audits the passed values of X and y: tells us which columns in X we think are already discrete and which need
+     to be discretized, as well as the discretization algorithm that will be used to discretize them. The suggested
+     parameters will be used when creating the variables. To override them, the user can manually set the
+     discretization parameters for each variable using the setDiscretizationParameters method.
+
+     Parameters
+     ----------
+     X: {array-like, sparse matrix} of shape (n_samples, n_features)
+       training data
+     y: array-like of shape (n_samples,)
+       Target values
+
+     Returns
+     -------
+     Dict
+       for each variable, the proposed audit
+     """
+
+     auditDict = {}
+
+     if isinstance(X, pandas.DataFrame):
+       variableNames = X.columns.tolist()
+     elif isinstance(X, pandas.Series):
+       variableNames = [X.name]
+     else:
+       variableNames = None
+
+     if y is not None:
+       X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True)
+     else:
+       X = sklearn.utils.check_array(X, dtype=None)
+
+     d = X.shape[1]
+
+     if variableNames is None:
+       variableNames = ["x" + str(i) for i in range(d)]
+
+     possibleValues = {}  # counting the possible values for each column
+
+     for i in range(d):
+       possibleValues[i] = numpy.unique(X[:, i])
+     if y is not None:  # guard: y may be absent
+       possibleValues[d] = numpy.unique(y)
+
+     for i in range(d):
+       variable = variableNames[i]
+       auditDict[variable] = {}
+       try:
+         sklearn.utils.check_array(X[:, i], dtype="float", ensure_2d=False)
+         isNumeric = True
+       except ValueError:
+         isNumeric = False
+       if variable in self.discretizationParametersDictionary.keys():
+         auditDict[variable] = self.discretizationParametersDictionary[variable]
+         if self.discretizationParametersDictionary[variable]["method"] != "NoDiscretization" and not isNumeric:
+           raise ValueError("The variable " + variable + " is not numeric and cannot be discretized!")
+       else:
+         if len(possibleValues[i]) > self.discretizationThreshold and isNumeric:
+           auditDict[variable]["method"] = self.defaultMethod
+           auditDict[variable]["nbBins"] = self.defaultParamDiscretizationMethod
+         else:
+           auditDict[variable]["method"] = "NoDiscretization"
+           auditDict[variable]["values"] = possibleValues[i]
+
+       if auditDict[variable]["method"] == "NoDiscretization":
+         auditDict[variable]["type"] = "Discrete"
+       else:
+         auditDict[variable]["type"] = "Continuous"
+         auditDict[variable]["minInData"] = min(possibleValues[i])
+         auditDict[variable]["maxInData"] = max(possibleValues[i])
+
+     return auditDict
+
+   @staticmethod
+   def _discretizationElbowMethodRotation(discretizationStrategy, X):
+     """
+     Calculates the sum of squared errors as a function of the number of clusters using the discretization strategy
+     that is passed as a parameter. Returns the bins that jointly minimize the variation and the number of bins.
+     Uses the elbow method to find this optimal point: to find the "elbow" we rotate the curve and look for its
+     minimum.
+
+     Parameters
+     ----------
+     discretizationStrategy: str
+       The discretization method that will be used. Possible values are: 'quantile', 'kmeans' and 'uniform'
+     X: one dimensional ndarray
+       Contains the data that should be discretized
+
+     Returns
+     -------
+     List[float]
+       the edges of the bins the algorithm has chosen.
+     """
+
+     if discretizationStrategy not in {"kmeans", "quantile", "uniform"}:
+       raise ValueError("cannot use elbow method with this type of discretization")
+     variationArray = numpy.zeros(14)
+     Xsorted = X[X.argsort(axis=None)]
+     binEdgeMatrix = [[]] * 14
+     for k in range(2, 16):
+       discretizer = skp.KBinsDiscretizer(
+         k,
+         strategy=discretizationStrategy,
+         quantile_method="averaged_inverted_cdf",
+         subsample=None,
+       )
+       discretizer.fit(Xsorted.reshape(-1, 1))  # KBinsDiscretizer expects a 2D array
+       binEdges = discretizer.bin_edges_[0]
+       centresArray = (binEdges[1:] + binEdges[:-1]) / 2
+       i = 0
+       sumOfSquaredErrors = 0
+       for x in Xsorted:
+         if x > binEdges[i + 1]:
+           i = i + 1
+         sumOfSquaredErrors += (x - centresArray[i]) ** 2
+       variationArray[k - 2] = sumOfSquaredErrors
+       binEdgeMatrix[k - 2] = binEdges.tolist()
+
+     # we calculate the slope of the line that connects the first and last point on our graph
+     slope = (variationArray[13] - variationArray[0]) / 13
+
+     # we calculate the slope of the line perpendicular to it
+     otherSlope = -1 / slope
+
+     # we change the coordinate system to the one with the two lines previously mentioned as its axes
+     coordinateChangeMatrix = numpy.array([[1, slope], [1 / otherSlope, 1]])
+
+     # we linearly transform the coordinates of every point in our curve
+     transformedCoordinates = numpy.zeros((2, 14))
+     for i in range(14):
+       transformedCoordinates[:, i] = numpy.matmul(
+         coordinateChangeMatrix,
+         numpy.array([[i], [variationArray[i] - variationArray[0]]]),
+       ).reshape(2)
+
+     # we search for the minimum in our newly obtained curve
+     minkIndex = 0
+     for k in range(14):
+       if transformedCoordinates[1][minkIndex] > transformedCoordinates[1][k]:
+         minkIndex = k
+     # when we have found the minimum, we apply the inverse linear transformation to recover the optimal value of k
+     minimumVector = numpy.matmul(
+       numpy.linalg.inv(coordinateChangeMatrix),
+       transformedCoordinates[:, minkIndex].reshape(2, 1),
+     )
+
+     # we return the list of bin edges found using said optimal number of k
+     return binEdgeMatrix[int(round(minimumVector[0, 0]))]
+
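+   # Elbow usage sketch (editorial, not part of the original file): the method is
+   # reached through the public API by passing "elbowMethod" as the parameter, e.g.
+   # >>> dtp.setDiscretizationParameters("size", "kmeans", "elbowMethod")
+   # Internally, the SSE curve for k = 2..15 is rotated so that the chord joining its
+   # endpoints becomes an axis; the "knee" is then simply the transformed curve's minimum.
+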
+   def _discretizationMDLP(self, X, y, possibleValuesX, possibleValuesY):
+     """
+     Uses the MDLP algorithm described in Fayyad, 1995 to discretize the values of x.
+
+     Parameters
+     ----------
+     X: ndarray with shape (n,1) where n is the number of samples
+       Column-vector that contains all the data that needs to be discretized
+     y: ndarray with shape (n,1) where n is the number of samples
+       Column-vector that contains the class of each sample. This vector will not be discretized, but the class of
+       each sample is needed to properly apply the algorithm
+     possibleValuesX: one dimensional ndarray
+       Contains all the possible values that x can take, sorted in increasing order, without duplicates
+     possibleValuesY: one dimensional ndarray
+       Contains the possible values of y.
+
+     Returns
+     -------
+     List[float]
+       a list of the edges of the bins that are chosen by this algorithm
+     """
+     xAndY = numpy.concatenate((X, y), axis=1)
+     xAndY = xAndY[xAndY[:, 0].argsort()]
+     B = (possibleValuesX[1:] + possibleValuesX[:-1]) / 2
+     [class0, _] = possibleValuesY
+
+     binEdgesIndex = []
+     nbElementsByIntervalClass0 = numpy.zeros(len(B) + 1)
+     nbElementsByIntervalClass1 = numpy.zeros(len(B) + 1)
+     currentIntervalIndex = 0
+     for row in xAndY:
+       if currentIntervalIndex < len(B) and row[0] > B[currentIntervalIndex]:
+         currentIntervalIndex += 1
+       if row[1] == class0:
+         nbElementsByIntervalClass0[currentIntervalIndex] += 1
+       else:
+         nbElementsByIntervalClass1[currentIntervalIndex] += 1
+
+     Class0ByLargeInterval = [nbElementsByIntervalClass0.sum()]
+     Class1ByLargeInterval = [nbElementsByIntervalClass1.sum()]
+
+     totalCountByLargeInterval = [Class0ByLargeInterval[0] + Class1ByLargeInterval[0]]
+
+     probabilityClass0 = Class0ByLargeInterval[0] / totalCountByLargeInterval[0]
+     probabilityClass1 = Class1ByLargeInterval[0] / totalCountByLargeInterval[0]
+     shannonEntropyByLargeInterval = [
+       (-1) * (probabilityClass0 * math.log2(probabilityClass0) + probabilityClass1 * math.log2(probabilityClass1))
+     ]
+
+     continueDividingInterval = [True]
+
+     currentValues = {}
+     minimalValues = {}
+
+     while any(continueDividingInterval):
+       minimalValues["classInformationEntropy"] = math.inf
+       for param in {
+         "boundaryIndex",
+         "leftSubintervalClass0",
+         "leftSubintervalClass1",
+         "leftSubintervalShannonEntropy",
+         "rightSubintervalClass0",
+         "rightSubintervalClass1",
+         "rightSubintervalShannonEntropy",
+       }:
+         (currentValues[param], minimalValues[param]) = (0, 0)
+
+       position = 0
+       while currentValues["boundaryIndex"] < len(B):
+         while not continueDividingInterval[position]:
+           position = position + 1
+           currentValues["boundaryIndex"] = binEdgesIndex[position - 1] + 1
+
+         if position < len(binEdgesIndex) and currentValues["boundaryIndex"] == binEdgesIndex[position]:
+           # this function decides whether to accept the cut point in this interval and updates the relevant lists
+           # if the value is accepted.
+           self._divideIntervalMDLP(
+             minimalValues,
+             shannonEntropyByLargeInterval,
+             Class0ByLargeInterval,
+             Class1ByLargeInterval,
+             continueDividingInterval,
+             totalCountByLargeInterval,
+             position,
+             binEdgesIndex,
+           )
+           position += 1
+           while position < len(continueDividingInterval) and not continueDividingInterval[position]:
+             position += 1
+           if position == len(continueDividingInterval):
+             break
+           else:
+             currentValues["boundaryIndex"] = binEdgesIndex[position - 1] + 1
+           (
+             currentValues["leftSubintervalClass0"],
+             currentValues["leftSubintervalClass1"],
+           ) = (0, 0)
+           minimalValues["classInformationEntropy"] = math.inf
+           continue
+
+         currentValues["leftSubintervalClass0"] += nbElementsByIntervalClass0[currentValues["boundaryIndex"]]
+         currentValues["leftSubintervalClass1"] += nbElementsByIntervalClass1[currentValues["boundaryIndex"]]
+
+         totalCountLeftInterval = currentValues["leftSubintervalClass0"] + currentValues["leftSubintervalClass1"]
+
+         probabilityClass0 = currentValues["leftSubintervalClass0"] / totalCountLeftInterval
+         probabilityClass1 = currentValues["leftSubintervalClass1"] / totalCountLeftInterval
+         currentValues["leftSubintervalShannonEntropy"] = 0
+         if probabilityClass0 > 0:
+           currentValues["leftSubintervalShannonEntropy"] += (-1) * probabilityClass0 * math.log2(probabilityClass0)
+         if probabilityClass1 > 0:
+           currentValues["leftSubintervalShannonEntropy"] += (-1) * probabilityClass1 * math.log2(probabilityClass1)
+
+         classInformationEntropy = (totalCountLeftInterval / totalCountByLargeInterval[position]) * currentValues[
+           "leftSubintervalShannonEntropy"
+         ]
+
+         currentValues["rightSubintervalClass0"] = (
+           Class0ByLargeInterval[position] - currentValues["leftSubintervalClass0"]
+         )
+         currentValues["rightSubintervalClass1"] = (
+           Class1ByLargeInterval[position] - currentValues["leftSubintervalClass1"]
+         )
+         NRightInterval = currentValues["rightSubintervalClass0"] + currentValues["rightSubintervalClass1"]
+
+         probabilityClass0 = currentValues["rightSubintervalClass0"] / NRightInterval
+         probabilityClass1 = currentValues["rightSubintervalClass1"] / NRightInterval
+         currentValues["rightSubintervalShannonEntropy"] = 0
+         if probabilityClass0 > 0:
+           currentValues["rightSubintervalShannonEntropy"] += (-1) * probabilityClass0 * math.log2(probabilityClass0)
+         if probabilityClass1 > 0:
+           currentValues["rightSubintervalShannonEntropy"] += (-1) * probabilityClass1 * math.log2(probabilityClass1)
+
+         classInformationEntropy += (NRightInterval / totalCountByLargeInterval[position]) * currentValues[
+           "rightSubintervalShannonEntropy"
+         ]
+
+         if classInformationEntropy < minimalValues["classInformationEntropy"]:
+           minimalValues = currentValues.copy()
+           minimalValues["classInformationEntropy"] = classInformationEntropy
+
+         currentValues["boundaryIndex"] += 1
+
+       if continueDividingInterval[-1]:
+         self._divideIntervalMDLP(
+           minimalValues,
+           shannonEntropyByLargeInterval,
+           Class0ByLargeInterval,
+           Class1ByLargeInterval,
+           continueDividingInterval,
+           totalCountByLargeInterval,
+           position,
+           binEdgesIndex,
+         )
+     binEdges = [xAndY[0][0]]
+     for index in binEdgesIndex:
+       binEdges.append(B[index])
+     binEdges.append(xAndY[-1][0])
+
+     return binEdges
+
+   @staticmethod
+   def _discretizationCAIM(X, y, possibleValuesX, possibleValuesY):
+     """
+     Applies the CAIM algorithm to discretize the values of x.
+
+     Parameters
+     ----------
+     X: ndarray with shape (n,1) where n is the number of samples
+       Column-vector that contains all the data that needs to be discretized
+     y: ndarray with shape (n,1) where n is the number of samples
+       Column-vector that contains the class of each sample. This vector will not be discretized, but the class of
+       each sample is needed to properly apply the algorithm
+     possibleValuesX: one dimensional ndarray
+       Contains all the possible values that x can take, sorted in increasing order, without duplicates
+     possibleValuesY: one dimensional ndarray
+       Contains the possible values of y.
+
+     Returns
+     -------
+     List[float]
+       a list of the edges of the bins that are chosen by this algorithm
+     """
+     xAndY = numpy.concatenate((X, y), axis=1)
+     xAndY = xAndY[xAndY[:, 0].argsort()]
+     B = (possibleValuesX[1:] + possibleValuesX[:-1]) / 2
+     [class0, _] = possibleValuesY
+
+     binEdgesIndex = []
+     nbElementsByIntervalClass0 = numpy.zeros(len(B) + 1)
+     nbElementsByIntervalClass1 = numpy.zeros(len(B) + 1)
+     currentIntervalIndex = 0
+     for row in xAndY:
+       if currentIntervalIndex < len(B) and row[0] > B[currentIntervalIndex]:
+         currentIntervalIndex += 1
+       if row[1] == class0:
+         nbElementsByIntervalClass0[currentIntervalIndex] += 1
+       else:
+         nbElementsByIntervalClass1[currentIntervalIndex] += 1
+
+     Class0ByLargeInterval = [nbElementsByIntervalClass0.sum()]
+     Class1ByLargeInterval = [nbElementsByIntervalClass1.sum()]
+
+     k = 0
+     globalCAIM = 0.0
+     oldCaim = 0.0
+     while True:
+       caimMax = 0
+       maxPosition = 0
+       maxBoundaryIndex = 0
+       position = 0
+       currentSumClass0 = 0
+       currentSumClass1 = 0
+       maxLeftIntervalClass0 = currentSumClass0
+       maxLeftIntervalClass1 = currentSumClass1
+       maxRightIntervalClass0 = maxLeftIntervalClass0
+       maxRightIntervalClass1 = maxLeftIntervalClass1
+
+       for boundaryIndex in range(len(B)):
+         if position < len(binEdgesIndex) and boundaryIndex == binEdgesIndex[position]:
+           position += 1
+           if Class0ByLargeInterval[position] > Class1ByLargeInterval[position]:
+             oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class0ByLargeInterval[position], 2) / (
+               Class0ByLargeInterval[position] + Class1ByLargeInterval[position]
+             )
+           else:
+             oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class1ByLargeInterval[position], 2) / (
+               Class0ByLargeInterval[position] + Class1ByLargeInterval[position]
+             )
+           currentSumClass0 = 0
+           currentSumClass1 = 0
+           continue
+
+         currentSumClass0 += nbElementsByIntervalClass0[boundaryIndex]
+         currentSumClass1 += nbElementsByIntervalClass1[boundaryIndex]
+         caim = oldCaim
+
+         if currentSumClass0 > currentSumClass1:
+           caim = caim + math.pow(currentSumClass0, 2) / (currentSumClass0 + currentSumClass1)
+         else:
+           caim = caim + math.pow(currentSumClass1, 2) / (currentSumClass0 + currentSumClass1)
+
+         intervalClass0 = Class0ByLargeInterval[position] - currentSumClass0
+         intervalClass1 = Class1ByLargeInterval[position] - currentSumClass1
+
+         if intervalClass0 > intervalClass1:
+           caim = caim + math.pow(intervalClass0, 2) / (intervalClass0 + intervalClass1)
+         else:
+           caim = caim + math.pow(intervalClass1, 2) / (intervalClass0 + intervalClass1)
+
+         caim = caim / (len(Class0ByLargeInterval) + 1)
+
+         if caim > caimMax:
+           maxLeftIntervalClass0 = currentSumClass0
+           maxLeftIntervalClass1 = currentSumClass1
+           maxRightIntervalClass0 = intervalClass0
+           maxRightIntervalClass1 = intervalClass1
+           caimMax = caim
+           maxBoundaryIndex = boundaryIndex
+           maxPosition = position
+
+       if caimMax > globalCAIM:
+         globalCAIM = caimMax
+         binEdgesIndex.insert(maxPosition, maxBoundaryIndex)
+         Class0ByLargeInterval.insert(maxPosition + 1, maxRightIntervalClass0)
+         Class1ByLargeInterval.insert(maxPosition + 1, maxRightIntervalClass1)
+         Class0ByLargeInterval[maxPosition] = maxLeftIntervalClass0
+         Class1ByLargeInterval[maxPosition] = maxLeftIntervalClass1
+         k = k + 1
+         if Class0ByLargeInterval[0] > Class1ByLargeInterval[0]:
+           oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class0ByLargeInterval[0], 2) / (
+             Class0ByLargeInterval[0] + Class1ByLargeInterval[0]
+           )
+         else:
+           oldCaim = globalCAIM * len(Class0ByLargeInterval) - math.pow(Class1ByLargeInterval[0], 2) / (
+             Class0ByLargeInterval[0] + Class1ByLargeInterval[0]
+           )
+
+       else:
+         break
+
+     binEdges = [xAndY[0][0]]
+     for index in binEdgesIndex:
+       binEdges.append(B[index])
+     binEdges.append(xAndY[-1][0])
+
+     return binEdges
+
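+   # Editorial note (a sketch of the criterion above, not part of the original file):
+   # CAIM (Kurgan & Cios) scores a partition of the data into r intervals as
+   #   caim = (1/r) * sum over intervals of max_class(count)**2 / total(count),
+   # and the greedy loop above keeps inserting the single cut point that most
+   # increases this score, stopping as soon as no insertion improves globalCAIM.
+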
+   @staticmethod
+   def _discretizationNML(X, possibleValuesX, kMax=10, epsilon=None):
+     """
+     Uses the discretization algorithm described in "MDL Histogram Density Estimation", Kontkanen and Myllymäki,
+     2007 to discretize.
+
+     Parameters
+     ----------
+     X: one dimensional ndarray
+       array that contains all the data that needs to be discretized
+     possibleValuesX: one dimensional ndarray
+       Contains all the possible values that x can take, sorted in increasing order, without duplicates.
+     kMax: int
+       the maximum number of bins before the algorithm stops itself.
+     epsilon: float or None
+       the value of epsilon used in the algorithm. Should be as small as possible. If None is passed, the value is
+       automatically calculated.
+
+     Returns
+     -------
+     List[float]
+       a list of the edges of the bins that are chosen by this algorithm
+     """
+     Xsorted = X[X.argsort(axis=None)]
+     if epsilon is None:
+       epsilon = numpy.amin(possibleValuesX[1:] - possibleValuesX[:-1]) / 2
+     epsilon = epsilon / 2
+     candidateCutPoints = numpy.zeros(2 * len(possibleValuesX))
+     for i in range(len(possibleValuesX)):
+       candidateCutPoints[2 * i] = possibleValuesX[i] - epsilon
+       candidateCutPoints[2 * i + 1] = possibleValuesX[i] + epsilon
+
+     E = len(candidateCutPoints)
+
+     binCount = numpy.zeros(E)
+     counter = 0
+     j = 0
+     for x in Xsorted:
+       while x > candidateCutPoints[j]:
+         binCount[j] = counter
+         j = j + 1
+       counter = counter + 1
+     binCount[j] = counter
+     n = binCount[-1]
+     Rkminus1 = numpy.ones(E)
+     Rk = numpy.zeros(E)
+     for i in range(1, E):
+       ne = int(binCount[i])
+       total_amount = 0
+       for h1 in range(ne + 1):
+         h2 = ne - h1
+         total_amount += math.pow(h1 / ne, h1) * math.pow(h2 / ne, h2) * scipy.special.comb(ne, h1)
+       Rk[i] = total_amount
+
+     k = 2
+
+     Bkminus1 = numpy.zeros(E)
+     for e in range(1, E):
+       ne = binCount[e]
+       Bkminus1[e] = -ne * (math.log(2 * epsilon * ne) - math.log(n * (candidateCutPoints[e] - candidateCutPoints[0])))
+
+     Bk = numpy.zeros(E)
+     cutpoints = [candidateCutPoints[0]]
+     Bvalues = [Bkminus1[-1]]
+     minimumeprime = 0
+     while k <= kMax:
+       for e in range(k, E):
+         minimum = math.inf
+         minimumeprime = 0
+         for eprime in range(k - 1, e):
+           if binCount[e] > binCount[eprime]:
+             temp = Bkminus1[eprime] - (binCount[e] - binCount[eprime]) * (
+               math.log(2 * epsilon * (binCount[e] - binCount[eprime]))
+               - math.log(n * (candidateCutPoints[e] - candidateCutPoints[eprime]))
+             )
+           else:
+             temp = Bkminus1[eprime]
+           temp = temp + math.log(Rk[e] / Rkminus1[eprime])
+           if minimum > temp:
+             minimum = temp
+             minimumeprime = eprime
+         minimum = minimum + math.log((E - k) / (k - 1))
+         Bk[e] = minimum
+       cutpoints.append(candidateCutPoints[minimumeprime])
+       Bvalues.append(Bk[-1])
+
+       k = k + 1
+       temp = Rk
+       Rk = Rk + numpy.multiply(binCount, Rkminus1) / (k - 2)
+       Rkminus1 = temp
+       Bkminus1 = Bk.copy()  # copy so that updating Bk in the next pass does not also overwrite Bkminus1
+     minimum = math.inf
+     minimumIndex = 0
+     for k in range(1, len(Bvalues)):
+       if Bvalues[k] < minimum:
+         minimum = Bvalues[k]
+         minimumIndex = k
+     cutpoints = sorted(set(cutpoints[: minimumIndex + 1]))
+     cutpoints.append(candidateCutPoints[-1])
+
+     return cutpoints
+
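+   # NML usage sketch (editorial, not part of the original file; "temperature" is a
+   # made-up column name): through the public API, the single integer parameter is
+   # the kMax bound of the search:
+   # >>> dtp.setDiscretizationParameters("temperature", "NML", 15)
+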
+   def _createVariable(self, variableName, X, y=None, possibleValuesY=None):
+     """
+     Creates a variable for the column passed in as a parameter.
+
+     Parameters
+     ----------
+     variableName:
+       the name of the created variable
+     X: ndarray shape(n,1)
+       A column vector containing n samples of a feature. The column for which the variable will be created
+     y: ndarray shape(n,1)
+       A column vector containing the corresponding class for each element in X.
+     possibleValuesY: ndarray
+       An ndarray containing all the unique values of y
+
+     Returns
+     -------
+     pyagrum.DiscreteVariable
+       the created variable
+     """
+
+     if y is not None:
+       X, y = sklearn.utils.check_X_y(X, y, dtype=None, accept_sparse=True, ensure_2d=False)
+     X = sklearn.utils.check_array(X, dtype=None, ensure_2d=False)
+     try:
+       Xtransformed = sklearn.utils.check_array(X, dtype="float", ensure_2d=False)
+       isNumeric = True
+     except ValueError:
+       Xtransformed = X
+       isNumeric = False
+
+     foundValuesX = set(numpy.unique(X))
+     n = len(X)
+
+     if (
+       variableName not in self.discretizationParametersDictionary
+     ):  # The user has not manually set the discretization parameters for this variable
+       if isNumeric and (
+         1 <= self.discretizationThreshold < len(foundValuesX)
+         or (self.discretizationThreshold < 1 and len(foundValuesX) / len(X) > self.discretizationThreshold)
+       ):  # only numeric variables may be routed to discretization
+         self.discretizationParametersDictionary[variableName] = {}
+         self.discretizationParametersDictionary[variableName]["method"] = self.defaultMethod
+         self.discretizationParametersDictionary[variableName]["param"] = self.defaultParamDiscretizationMethod
+       else:
+         self.discretizationParametersDictionary[variableName] = {}
+         self.discretizationParametersDictionary[variableName]["method"] = "NoDiscretization"
+       usingDefaultParameters = True
+     else:  # The user has manually set the discretization parameters for this variable
+       usingDefaultParameters = False
+       if self.discretizationParametersDictionary[variableName]["method"] != "NoDiscretization" and not isNumeric:
+         raise ValueError(f"The variable {variableName} is not numeric and cannot be discretized!")
+
+     if self.discretizationParametersDictionary[variableName]["method"] == "NoDiscretization":
+       varSyntax = ""
+       if "param" in self.discretizationParametersDictionary[variableName]:
+         varSyntax = self.discretizationParametersDictionary[variableName]["param"]
+         if varSyntax is None:
+           varSyntax = ""
+
+       if varSyntax != "":
+         var = gum.fastVariable(variableName + varSyntax)
+         possibleValuesX = set(var.labels())
+         f = {str(x) for x in foundValuesX}
+         if not f.issubset(possibleValuesX):
+           raise ValueError(
+             f"The values passed in possibleValues ({sorted(possibleValuesX)}) do not match database values ("
+             f"{sorted(f)})"
+           )
+         return var
+
+       possibleValuesX = sorted(foundValuesX)
+       is_int_var = all(map(check_int, possibleValuesX))
+       if is_int_var:
+         possibleValuesX = [int(x) for x in possibleValuesX]
+         max_v = int(possibleValuesX[-1])  # sorted
+         min_v = int(possibleValuesX[0])
+
+         if len(possibleValuesX) == max_v - min_v + 1:  # no hole in the list of int
+           return gum.RangeVariable(variableName, variableName, min_v, max_v)
+         else:
+           return gum.IntegerVariable(variableName, variableName, possibleValuesX)
+
+       is_float_var = all(map(check_float, possibleValuesX))
+       if is_float_var:
+         possibleValuesX = [float(x) for x in possibleValuesX]
+         return gum.NumericalDiscreteVariable(variableName, variableName, possibleValuesX)
+       else:
+         return gum.LabelizedVariable(variableName, variableName, [str(v) for v in possibleValuesX])
+     else:
+       self.numberOfContinuous += 1
+       if self.discretizationParametersDictionary[variableName]["method"] == "expert":
+         binEdges = self.discretizationParametersDictionary[variableName]["param"]
+       elif self.discretizationParametersDictionary[variableName]["method"] == "CAIM":
+         if y is None:
+           raise ValueError(
+             "The CAIM discretization method requires a list of the associated classes for each data vector since "
+             "it is a supervised discretization method. You should pass it as y."
+           )
+         if possibleValuesY is None:
+           possibleValuesY = numpy.unique(y)
+         binEdges = self._discretizationCAIM(
+           Xtransformed.reshape(n, 1),
+           y.reshape(n, 1),
+           numpy.unique(Xtransformed),
+           possibleValuesY,
+         )
+       elif self.discretizationParametersDictionary[variableName]["method"] == "MDLP":
+         if y is None:
+           raise ValueError(
+             "The MDLP discretization method requires a list of the associated classes for each data vector since "
+             "it is a supervised discretization method. You should pass it as y."
+           )
+         if possibleValuesY is None:
+           possibleValuesY = numpy.unique(y)
+         binEdges = self._discretizationMDLP(
+           Xtransformed.reshape(n, 1),
+           y.reshape(n, 1),
+           numpy.unique(Xtransformed),
+           possibleValuesY,
+         )
+       elif self.discretizationParametersDictionary[variableName]["method"] == "NML":
+         binEdges = self._discretizationNML(
+           Xtransformed.flatten(),
+           numpy.unique(Xtransformed),
+           kMax=self.discretizationParametersDictionary[variableName]["param"],
+         )
+       else:
+         if self.discretizationParametersDictionary[variableName]["param"] == "elbowMethod":
+           binEdges = self._discretizationElbowMethodRotation(
+             self.discretizationParametersDictionary[variableName]["method"],
+             Xtransformed.flatten(),
+           )
+         else:
+           discre = skp.KBinsDiscretizer(
+             self.discretizationParametersDictionary[variableName]["param"],
+             strategy=self.discretizationParametersDictionary[variableName]["method"],
+             quantile_method="averaged_inverted_cdf",
+             subsample=None,
+           )
+           discre.fit(X.reshape(-1, 1))
+           binEdges = discre.bin_edges_[0].tolist()
+
+       if len(binEdges) == 2:
+         raise ValueError(
+           "Due to an error the discretization method "
+           + str(self.discretizationParametersDictionary[variableName]["method"])
+           + " using "
+           + str(self.discretizationParametersDictionary[variableName]["param"])
+           + " bins for the variable "
+           + str(variableName)
+           + " gave only 1 bin. Try increasing the number of bins used by this variable using "
+           "setDiscretizationParameters to avoid this error"
+         )
+
+       self.totalNumberOfBins += len(binEdges) - 1
+       var = gum.DiscretizedVariable(variableName, variableName, binEdges)
+       var.setEmpirical(True)
+
+     if usingDefaultParameters:
+       self.discretizationParametersDictionary.pop(variableName)
+
+     return var
+
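+   # Type-inference sketch (editorial, not part of the original file): for columns
+   # left undiscretized, the kind of variable depends on the values found in the data:
+   # >>> dtp._createVariable("a", numpy.array([0, 1, 2]))     # RangeVariable [0,2] (no hole)
+   # >>> dtp._createVariable("b", numpy.array([1, 3, 7]))     # IntegerVariable {1,3,7}
+   # >>> dtp._createVariable("c", numpy.array([0.5, 1.5]))    # NumericalDiscreteVariable
+   # >>> dtp._createVariable("d", numpy.array(["lo", "hi"]))  # LabelizedVariable
+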
+   @staticmethod
+   def _divideIntervalMDLP(
+     minimalValues,
+     shannonEntropyByLargeInterval,
+     Class0ByLargeInterval,
+     Class1ByLargeInterval,
+     continueDividingInterval,
+     totalCountByLargeInterval,
+     position,
+     binEdgesIndex,
+   ):
+     shannonEntropy = shannonEntropyByLargeInterval[position]
+
+     gain = shannonEntropy - minimalValues["classInformationEntropy"]
+
+     # all the 2's here should be replaced by the number of classes present in the interval. However, we know that
+     # if the number of classes in the interval is equal to 1, then the shannon entropy will be 0, so the product of
+     # the two will be 0.
+     deltaS = math.log2(7) - (
+       2 * shannonEntropy
+       - 2 * minimalValues["leftSubintervalShannonEntropy"]
+       - 2 * minimalValues["rightSubintervalShannonEntropy"]
+     )
+
+     if (
+       gain > (math.log2(totalCountByLargeInterval[position] - 1) + deltaS) / totalCountByLargeInterval[position]
+       or len(Class0ByLargeInterval) == 1
+     ):
+       binEdgesIndex.insert(position, minimalValues["boundaryIndex"])
+
+       Class0ByLargeInterval.insert(position + 1, minimalValues["rightSubintervalClass0"])
+       Class1ByLargeInterval.insert(position + 1, minimalValues["rightSubintervalClass1"])
+       continueDividingInterval.insert(position + 1, True)
+       totalCountByLargeInterval.insert(
+         position + 1,
+         minimalValues["rightSubintervalClass0"] + minimalValues["rightSubintervalClass1"],
+       )
+       shannonEntropyByLargeInterval.insert(position + 1, minimalValues["rightSubintervalShannonEntropy"])
+
+       Class0ByLargeInterval[position] = minimalValues["leftSubintervalClass0"]
+       Class1ByLargeInterval[position] = minimalValues["leftSubintervalClass1"]
+       totalCountByLargeInterval[position] = (
+         minimalValues["leftSubintervalClass0"] + minimalValues["leftSubintervalClass1"]
+       )
+       shannonEntropyByLargeInterval[position] = minimalValues["leftSubintervalShannonEntropy"]
+
+       # if the class information entropy is 0, then we have perfectly cut the interval so that a class only appears
+       # on one side, so we do not need to cut any further.
+       if minimalValues["leftSubintervalShannonEntropy"] == 0:
+         continueDividingInterval[position] = False
+       if minimalValues["rightSubintervalShannonEntropy"] == 0:
+         continueDividingInterval[position + 1] = False
+
+       # if there are no potential boundary points left in this interval, we can't divide it any further
+       if position > 0 and minimalValues["boundaryIndex"] - 1 == binEdgesIndex[position - 1]:
+         continueDividingInterval[position] = False
+
+       if minimalValues["boundaryIndex"] == 0:
+         continueDividingInterval[position] = False
+
+       if position < len(binEdgesIndex) - 1 and binEdgesIndex[position] + 1 == binEdgesIndex[position + 1]:
+         continueDividingInterval[position + 1] = False
+       position += 1
+     else:
+       continueDividingInterval[position] = False
+
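+   # Editorial note (MDLPC stopping rule, not part of the original file): with two
+   # classes, the test above is the Fayyad-Irani criterion
+   #   gain(T; S) > ( log2(N - 1) + delta(T; S) ) / N
+   # with delta(T; S) = log2(3**2 - 2) - [ 2*Ent(S) - 2*Ent(S1) - 2*Ent(S2) ],
+   # which is where the math.log2(7) constant comes from: log2(3**2 - 2) = log2(7).
+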
+   def discretizedTemplate(self, X, y=None, *, possibleValuesY=None, template=None):
+     """
+     Returns a graphical model whose variables are discretized following the suggestions of the
+     DiscreteTypeProcessor for the data source X (and the target y).
+     This graphical model only contains the (discretized) variables.
+     For instance, it can be used as a template for a BNLearner.
+
+     Parameters
+     ----------
+     X: {array-like, sparse matrix, pandas or polars dataframe} of shape (n_samples, n_features) or str (filename)
+       training data
+     y: array-like, pandas or polars dataframe of shape (n_samples,) or str (classname)
+       Target values
+     possibleValuesY: ndarray
+       An ndarray containing all the unique values of y
+     template: a graphical model such as pyagrum.BayesNet, pyagrum.MRF, etc.
+       the template that will contain the discretized variables.
+       If None, a new Bayesian network is created.
+
+     Returns
+     -------
+     pyagrum.BayesNet or other graphical model:
+       the discretized graphical model (only the (discretized) random variables are created in the model)
+
+     Example
+     -------
+     >>> discretizer = DiscreteTypeProcessor(
+     ...   defaultDiscretizationMethod="uniform", defaultNumberOfBins=7, discretizationThreshold=10
+     ... )
+     >>> learner = gum.BNLearner(data, discretizer.discretizedTemplate(data))
+     """
+     if template is None:
+       template = gum.BayesNet()
+
+     if isinstance(X, str):
+       Xp = pandas.read_csv(X)
+     elif hasattr(X, "to_pandas"):
+       Xp = X.to_pandas()
+     else:
+       Xp = X
+
+     if isinstance(y, str):
+       yp = Xp[y]
+     elif y is not None and hasattr(y, "to_pandas"):
+       yp = y.to_pandas()
+     else:
+       yp = y
+
+     for name in Xp:
+       template.add(self._createVariable(name, Xp[name], yp, possibleValuesY))
+     return template
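+
+ # End-to-end sketch (editorial, not part of the original file): a typical flow,
+ # assuming a file "data.csv" with a binary class column "sick":
+ # >>> dtp = DiscreteTypeProcessor(defaultDiscretizationMethod="quantile", defaultNumberOfBins=5)
+ # >>> dtp.setDiscretizationParameters("age", "MDLP")      # supervised: needs the y below
+ # >>> tmpl = dtp.discretizedTemplate("data.csv", "sick")  # variables only, no arcs
+ # >>> bn = gum.BNLearner("data.csv", tmpl).learnBN()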