machinegnostics 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- machinegnostics/__init__.py +24 -0
- machinegnostics/magcal/__init__.py +37 -0
- machinegnostics/magcal/characteristics.py +460 -0
- machinegnostics/magcal/criteria_eval.py +268 -0
- machinegnostics/magcal/criterion.py +140 -0
- machinegnostics/magcal/data_conversion.py +381 -0
- machinegnostics/magcal/gcor.py +64 -0
- machinegnostics/magcal/gdf/__init__.py +2 -0
- machinegnostics/magcal/gdf/base_df.py +39 -0
- machinegnostics/magcal/gdf/base_distfunc.py +1202 -0
- machinegnostics/magcal/gdf/base_egdf.py +823 -0
- machinegnostics/magcal/gdf/base_eldf.py +830 -0
- machinegnostics/magcal/gdf/base_qgdf.py +1234 -0
- machinegnostics/magcal/gdf/base_qldf.py +1019 -0
- machinegnostics/magcal/gdf/cluster_analysis.py +456 -0
- machinegnostics/magcal/gdf/data_cluster.py +975 -0
- machinegnostics/magcal/gdf/data_intervals.py +853 -0
- machinegnostics/magcal/gdf/data_membership.py +536 -0
- machinegnostics/magcal/gdf/der_egdf.py +243 -0
- machinegnostics/magcal/gdf/distfunc_engine.py +841 -0
- machinegnostics/magcal/gdf/egdf.py +324 -0
- machinegnostics/magcal/gdf/eldf.py +297 -0
- machinegnostics/magcal/gdf/eldf_intv.py +609 -0
- machinegnostics/magcal/gdf/eldf_ma.py +627 -0
- machinegnostics/magcal/gdf/homogeneity.py +1218 -0
- machinegnostics/magcal/gdf/intv_engine.py +1523 -0
- machinegnostics/magcal/gdf/marginal_intv_analysis.py +558 -0
- machinegnostics/magcal/gdf/qgdf.py +289 -0
- machinegnostics/magcal/gdf/qldf.py +296 -0
- machinegnostics/magcal/gdf/scedasticity.py +197 -0
- machinegnostics/magcal/gdf/wedf.py +181 -0
- machinegnostics/magcal/gdf/z0_estimator.py +1047 -0
- machinegnostics/magcal/layer_base.py +42 -0
- machinegnostics/magcal/layer_history_base.py +74 -0
- machinegnostics/magcal/layer_io_process_base.py +238 -0
- machinegnostics/magcal/layer_param_base.py +448 -0
- machinegnostics/magcal/mg_weights.py +36 -0
- machinegnostics/magcal/sample_characteristics.py +532 -0
- machinegnostics/magcal/scale_optimization.py +185 -0
- machinegnostics/magcal/scale_param.py +313 -0
- machinegnostics/magcal/util/__init__.py +0 -0
- machinegnostics/magcal/util/dis_docstring.py +18 -0
- machinegnostics/magcal/util/logging.py +24 -0
- machinegnostics/magcal/util/min_max_float.py +34 -0
- machinegnostics/magnet/__init__.py +0 -0
- machinegnostics/metrics/__init__.py +28 -0
- machinegnostics/metrics/accu.py +61 -0
- machinegnostics/metrics/accuracy.py +67 -0
- machinegnostics/metrics/auto_correlation.py +183 -0
- machinegnostics/metrics/auto_covariance.py +204 -0
- machinegnostics/metrics/cls_report.py +130 -0
- machinegnostics/metrics/conf_matrix.py +93 -0
- machinegnostics/metrics/correlation.py +178 -0
- machinegnostics/metrics/cross_variance.py +167 -0
- machinegnostics/metrics/divi.py +82 -0
- machinegnostics/metrics/evalmet.py +109 -0
- machinegnostics/metrics/f1_score.py +128 -0
- machinegnostics/metrics/gmmfe.py +108 -0
- machinegnostics/metrics/hc.py +141 -0
- machinegnostics/metrics/mae.py +72 -0
- machinegnostics/metrics/mean.py +117 -0
- machinegnostics/metrics/median.py +122 -0
- machinegnostics/metrics/mg_r2.py +167 -0
- machinegnostics/metrics/mse.py +78 -0
- machinegnostics/metrics/precision.py +119 -0
- machinegnostics/metrics/r2.py +122 -0
- machinegnostics/metrics/recall.py +108 -0
- machinegnostics/metrics/rmse.py +77 -0
- machinegnostics/metrics/robr2.py +119 -0
- machinegnostics/metrics/std.py +144 -0
- machinegnostics/metrics/variance.py +101 -0
- machinegnostics/models/__init__.py +2 -0
- machinegnostics/models/classification/__init__.py +1 -0
- machinegnostics/models/classification/layer_history_log_reg.py +121 -0
- machinegnostics/models/classification/layer_io_process_log_reg.py +98 -0
- machinegnostics/models/classification/layer_mlflow_log_reg.py +107 -0
- machinegnostics/models/classification/layer_param_log_reg.py +275 -0
- machinegnostics/models/classification/mg_log_reg.py +273 -0
- machinegnostics/models/cross_validation.py +118 -0
- machinegnostics/models/data_split.py +106 -0
- machinegnostics/models/regression/__init__.py +2 -0
- machinegnostics/models/regression/layer_histroy_rob_reg.py +139 -0
- machinegnostics/models/regression/layer_io_process_rob_rig.py +88 -0
- machinegnostics/models/regression/layer_mlflow_rob_reg.py +134 -0
- machinegnostics/models/regression/layer_param_rob_reg.py +212 -0
- machinegnostics/models/regression/mg_lin_reg.py +253 -0
- machinegnostics/models/regression/mg_poly_reg.py +258 -0
- machinegnostics-0.0.1.dist-info/METADATA +246 -0
- machinegnostics-0.0.1.dist-info/RECORD +93 -0
- machinegnostics-0.0.1.dist-info/WHEEL +5 -0
- machinegnostics-0.0.1.dist-info/licenses/LICENSE +674 -0
- machinegnostics-0.0.1.dist-info/top_level.txt +2 -0
@@ -0,0 +1,178 @@
+'''
+Gnostic Correlation Metric
+
+This module provides a function to compute the Gnostic correlation between two data samples.
+
+Author: Nirmal Parmar
+Machine Gnostics
+'''
+
+import numpy as np
+from machinegnostics.magcal import EGDF, QGDF, DataHomogeneity
+import logging
+from machinegnostics.magcal.util.logging import get_logger
+
+def correlation(X: np.ndarray, y: np.ndarray, case: str = 'i', verbose: bool = False) -> float:
+    """
+    Calculate the Gnostic correlation coefficient between a feature array X and a target array y.
+
+    Parameters:
+    ----------
+    X : np.ndarray
+        The feature data sample. Must be a numpy array without NaN or Inf values.
+        If X has more than one column, pass each column one by one to this function.
+    y : np.ndarray
+        The target data sample. Must be a 1D numpy array without NaN or Inf values.
+    case : str, optional, default='i'
+        Specifies the type of geometry to use:
+        - 'i': Estimation geometry (EGDF).
+        - 'j': Quantifying geometry (QGDF).
+    verbose : bool, optional, default=False
+        If True, enables detailed logging for debugging purposes.
+
+    Returns:
+    -------
+    float
+        The Gnostic correlation coefficient between the two data samples.
+
+    Examples:
+    ---------
+    Example 1: Compute correlation for two simple datasets
+    >>> import numpy as np
+    >>> from machinegnostics.metrics import correlation
+    >>> X = np.array([1, 2, 3, 4, 5])
+    >>> y = np.array([5, 4, 3, 2, 1])
+    >>> corr = correlation(X, y, case='i', verbose=False)
+    >>> print(f"Correlation (case='i'): {corr}")
+
+    Example 2: For multi-column X
+    >>> X = np.array([[1, 10], [2, 20], [3, 30], [4, 40], [5, 50]])
+    >>> y = np.array([5, 4, 3, 2, 1])
+    >>> for i in range(X.shape[1]):
+    ...     corr = correlation(X[:, i], y)
+    ...     print(f"Correlation for column {i}: {corr}")
+
+    Raises:
+    ------
+    ValueError
+        If the input arrays are not of the same length, are empty, contain NaN/Inf values,
+        or are not 1D numpy arrays. Also raised if `case` is not 'i' or 'j'.
+
+    Notes:
+    -----
+    - If X has more than one column, pass each column separately (e.g., X[:, i]).
+    - y must be a 1D array.
+    - This metric is robust to data uncertainty and provides meaningful estimates even
+      in the presence of noise or outliers.
+    - Ensure that the input data is preprocessed and cleaned for optimal results.
+    - In cases where data homogeneity is not met, a warning is raised, and the scale
+      parameter is adjusted to improve results.
+    """
+    logger = get_logger('correlation', level=logging.WARNING if not verbose else logging.INFO)
+    logger.info("Starting correlation computation.")
+
+    # Validate inputs
+    if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
+        logger.error("Inputs must be numpy arrays.")
+        raise ValueError("Inputs must be numpy arrays.")
+
+    # Flatten X and y to 1D if possible
+    X = X.flatten()
+    y = y.flatten()
+
+    if len(X) != len(y):
+        logger.error("Input arrays must have the same length.")
+        raise ValueError("Input arrays must have the same length.")
+    if len(X) == 0 or len(y) == 0:
+        logger.error("Input arrays must not be empty.")
+        raise ValueError("Input arrays must not be empty.")
+    if np.any(np.isnan(X)) or np.any(np.isnan(y)):
+        logger.error("Input arrays must not contain NaN values.")
+        raise ValueError("Input arrays must not contain NaN values.")
+    if np.any(np.isinf(X)) or np.any(np.isinf(y)):
+        logger.error("Input arrays must not contain Inf values.")
+        raise ValueError("Input arrays must not contain Inf values.")
+    if case not in ['i', 'j']:
+        logger.error("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")
+        raise ValueError("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")
+
+    # Internal defaults for the sub-model fits (kept quiet regardless of `verbose`)
+    FLUSH = False
+    VERBOSE = False
+
+    if case == 'i':
+        logger.info("Using Estimation Global Distribution Function (EGDF) for correlation computation.")
+        egdf_X = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_X.fit(X)
+
+        egdf_y = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_y.fit(y)
+
+        logger.info("Performing data homogeneity check.")
+        dh_X = DataHomogeneity(gdf=egdf_X, verbose=VERBOSE, flush=FLUSH)
+        is_homo_X = dh_X.fit()
+
+        dh_y = DataHomogeneity(gdf=egdf_y, verbose=VERBOSE, flush=FLUSH)
+        is_homo_y = dh_y.fit()
+
+        if not is_homo_X:
+            logger.warning("X is not homogeneous. Switching to S=1 for better results.")
+            logger.info("Fitting EGDF with S=1.")
+            egdf_X = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+            egdf_X.fit(X)
+
+        if not is_homo_y:
+            logger.warning("y is not homogeneous. Switching to S=1 for better results.")
+            logger.info("Fitting EGDF with S=1.")
+            egdf_y = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+            egdf_y.fit(y)
+
+        hc_X = np.mean(egdf_X.hi, axis=0)
+        hc_y = np.mean(egdf_y.hi, axis=0)
+
+    if case == 'j':
+        logger.info("Fitting EGDF first to assess data homogeneity.")
+        egdf_X = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_X.fit(X)
+
+        egdf_y = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_y.fit(y)
+
+        logger.info("Checking data homogeneity.")
+        dh_X = DataHomogeneity(gdf=egdf_X, verbose=VERBOSE, flush=FLUSH)
+        is_homo_X = dh_X.fit()
+
+        dh_y = DataHomogeneity(gdf=egdf_y, verbose=VERBOSE, flush=FLUSH)
+        is_homo_y = dh_y.fit()
+
+        if not is_homo_X:
+            logger.warning("X is not homogeneous. Switching to S=1 for better results.")
+        if not is_homo_y:
+            logger.warning("y is not homogeneous. Switching to S=1 for better results.")
+
+        logger.info("Using Quantification Global Distribution Function (QGDF) for correlation computation.")
+        qgdf_X = QGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+        qgdf_X.fit(X)
+
+        # S=1 here as well, for consistency with the warning above
+        qgdf_y = QGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+        qgdf_y.fit(y)
+
+        hc_X = np.mean(qgdf_X.hj, axis=0)
+        hc_y = np.mean(qgdf_y.hj, axis=0)
+
+    hc_X = np.clip(hc_X, 1, 1e12)
+    hc_y = np.clip(hc_y, 1, 1e12)
+
+    def compute_correlation(hc_X: np.ndarray, hc_y: np.ndarray) -> float:
+        logger.info("Computing correlation.")
+        numerator = np.sum(hc_X * hc_y)
+        denominator = np.sqrt(np.sum(hc_X**2)) * np.sqrt(np.sum(hc_y**2))
+        # Guard against division by zero before computing the ratio
+        if denominator == 0:
+            return np.nan
+        return numerator / denominator
+
+    corr = compute_correlation(hc_X, hc_y)
+    logger.info("Correlation computed successfully.")
+    return corr
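A minimal usage sketch for the correlation metric above (not part of the package diff). It assumes only the public import shown in the docstring; no expected value is asserted, since the result depends on the fitted EGDF:

import numpy as np
from machinegnostics.metrics import correlation

rng = np.random.default_rng(0)
X = np.linspace(1, 10, 20)
y = 2 * X + rng.normal(0, 0.1, size=X.size)
y_outlier = y.copy()
y_outlier[5] = 100.0  # one gross outlier

# Per the docstring, the gnostic correlation should degrade gracefully here,
# whereas an outlier can dominate an ordinary Pearson correlation.
print(correlation(X, y, case='i'))
print(correlation(X, y_outlier, case='i'))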
@@ -0,0 +1,167 @@
+'''
+Gnostic Cross-Variance
+
+Author: Nirmal Parmar
+Machine Gnostics
+'''
+
+import numpy as np
+from machinegnostics.magcal import EGDF, QGDF, DataHomogeneity
+import logging
+from machinegnostics.magcal.util.logging import get_logger
+
+def cross_covariance(X: np.ndarray, y: np.ndarray, case: str = 'i', verbose: bool = False) -> float:
+    """
+    Calculate the Gnostic cross-covariance between a feature array X and a target array y.
+
+    Parameters:
+    ----------
+    X : np.ndarray
+        The feature data sample. Must be a 1D numpy array (single feature/column).
+        If X has more than one column, pass each column separately (e.g., X[:, i]).
+    y : np.ndarray
+        The target data sample. Must be a 1D numpy array without NaN or Inf values.
+    case : str, optional, default='i'
+        Specifies the type of geometry to use:
+        - 'i': Estimation geometry.
+        - 'j': Quantifying geometry.
+    verbose : bool, optional, default=False
+        If True, enables detailed logging for debugging purposes.
+
+    Returns:
+    -------
+    float
+        The Gnostic cross-covariance between the two data samples.
+
+    Examples:
+    ---------
+    Example 1: Compute cross-covariance for two simple datasets
+    >>> import numpy as np
+    >>> from machinegnostics.metrics import cross_covariance
+    >>> X = np.array([1, 2, 3, 4, 5])
+    >>> y = np.array([5, 4, 3, 2, 1])
+    >>> covar = cross_covariance(X, y, case='i', verbose=False)
+    >>> print(f"Cross-Covariance (case='i'): {covar}")
+
+    Example 2: For multi-column X
+    >>> X = np.array([[1, 10], [2, 20], [3, 30], [4, 40], [5, 50]])
+    >>> y = np.array([5, 4, 3, 2, 1])
+    >>> for i in range(X.shape[1]):
+    ...     covar = cross_covariance(X[:, i], y)
+    ...     print(f"Cross-Covariance for column {i}: {covar}")
+
+    Raises:
+    ------
+    ValueError
+        If the input arrays are not of the same length, are empty, contain NaN/Inf values,
+        or are not 1D numpy arrays. Also raised if `case` is not 'i' or 'j'.
+
+    Notes:
+    -----
+    - X must be a 1D numpy array (single column). For multi-column X, pass each column separately.
+    - y must be a 1D numpy array.
+    - This metric is robust to data uncertainty and provides meaningful estimates even
+      in the presence of noise or outliers.
+    - Ensure that the input data is preprocessed and cleaned for optimal results.
+    - In cases where data homogeneity is not met, a warning is raised, and the scale
+      parameter is adjusted to improve results.
+    """
+    logger = get_logger('cross_covariance', level=logging.WARNING if not verbose else logging.INFO)
+    logger.info("Starting cross-covariance computation.")
+    # Validate inputs
+    if not isinstance(X, np.ndarray) or not isinstance(y, np.ndarray):
+        logger.error("Inputs must be numpy arrays.")
+        raise ValueError("Inputs must be numpy arrays.")
+    if len(X) != len(y):
+        logger.error("Input arrays must have the same length.")
+        raise ValueError("Input arrays must have the same length.")
+    if len(X) == 0 or len(y) == 0:
+        logger.error("Input arrays must not be empty.")
+        raise ValueError("Input arrays must not be empty.")
+    # Flatten the arrays if they are not 1D
+    X = X.flatten()
+    y = y.flatten()
+    # Avoid Inf and NaN in data
+    if np.any(np.isnan(X)) or np.any(np.isnan(y)):
+        logger.error("Input arrays must not contain NaN values.")
+        raise ValueError("Input arrays must not contain NaN values.")
+    if np.any(np.isinf(X)) or np.any(np.isinf(y)):
+        logger.error("Input arrays must not contain Inf values.")
+        raise ValueError("Input arrays must not contain Inf values.")
+    if case not in ['i', 'j']:
+        logger.error("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")
+        raise ValueError("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")
+
+    # Internal defaults for the sub-model fits (kept quiet regardless of `verbose`)
+    FLUSH = False
+    VERBOSE = False
+
+    if case == 'i':
+        logger.info("Using Estimation Global Distribution Function (EGDF) for cross-covariance computation.")
+        egdf_data_1 = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_data_1.fit(X)
+
+        egdf_data_2 = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_data_2.fit(y)
+
+        logger.info("Performing data homogeneity check.")
+        dh_data_1 = DataHomogeneity(gdf=egdf_data_1, verbose=VERBOSE, flush=FLUSH)
+        is_homo_data_1 = dh_data_1.fit()
+
+        dh_data_2 = DataHomogeneity(gdf=egdf_data_2, verbose=VERBOSE, flush=FLUSH)
+        is_homo_data_2 = dh_data_2.fit()
+
+        if not is_homo_data_1:
+            logger.warning("X is not homogeneous. Switching to S=1 for better results.")
+            logger.info("Fitting EGDF with S=1.")
+            egdf_data_1 = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+            egdf_data_1.fit(X)
+
+        if not is_homo_data_2:
+            logger.warning("y is not homogeneous. Switching to S=1 for better results.")
+            logger.info("Fitting EGDF with S=1.")
+            egdf_data_2 = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+            egdf_data_2.fit(y)
+
+        hc_data_1 = np.mean(egdf_data_1.hi, axis=0)
+        hc_data_2 = np.mean(egdf_data_2.hi, axis=0)
+
+    if case == 'j':
+        logger.info("Fitting EGDF first to assess data homogeneity.")
+        egdf_data_1 = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_data_1.fit(X)
+
+        egdf_data_2 = EGDF(flush=FLUSH, verbose=VERBOSE)
+        egdf_data_2.fit(y)
+
+        logger.info("Checking data homogeneity.")
+        dh_data_1 = DataHomogeneity(gdf=egdf_data_1, verbose=VERBOSE, flush=FLUSH)
+        is_homo_data_1 = dh_data_1.fit()
+
+        dh_data_2 = DataHomogeneity(gdf=egdf_data_2, verbose=VERBOSE, flush=FLUSH)
+        is_homo_data_2 = dh_data_2.fit()
+
+        if not is_homo_data_1:
+            logger.warning("X is not homogeneous. Switching to S=1 for better results.")
+        if not is_homo_data_2:
+            logger.warning("y is not homogeneous. Switching to S=1 for better results.")
+
+        logger.info("Using Quantification Global Distribution Function (QGDF) for cross-covariance computation.")
+        qgdf_data_1 = QGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+        qgdf_data_1.fit(X)
+
+        # S=1 here as well, for consistency with the warning above
+        qgdf_data_2 = QGDF(flush=FLUSH, verbose=VERBOSE, S=1)
+        qgdf_data_2.fit(y)
+
+        hc_data_1 = np.mean(qgdf_data_1.hj, axis=0)
+        hc_data_2 = np.mean(qgdf_data_2.hj, axis=0)
+
+    hc_data_1 = np.clip(hc_data_1, 1, 1e12)
+    hc_data_2 = np.clip(hc_data_2, 1, 1e12)
+
+    cross_covar = np.mean(hc_data_1 * hc_data_2)
+    logger.info("Cross-covariance calculated successfully.")
+    return cross_covar
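Note how correlation and cross_covariance share the same per-point irrelevance vectors hc and differ only in aggregation: a cosine-style normalization versus a plain mean of products. A stand-alone numpy sketch with hypothetical hc values (in the library these come from EGDF/QGDF fits and are clipped to [1, 1e12]):

import numpy as np

hc_X = np.array([1.0, 1.2, 1.1, 1.4])  # hypothetical irrelevance means
hc_y = np.array([1.1, 1.0, 1.3, 1.2])

# Same final formulas as in the two functions above
corr = np.sum(hc_X * hc_y) / (np.sqrt(np.sum(hc_X**2)) * np.sqrt(np.sum(hc_y**2)))
cross_covar = np.mean(hc_X * hc_y)
print(corr, cross_covar)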
@@ -0,0 +1,82 @@
+'''
+ManGo - Machine Gnostics Library
+Copyright (C) 2025 ManGo Team
+
+Author: Nirmal Parmar
+'''
+from machinegnostics.magcal.util.logging import get_logger
+import logging
+import numpy as np
+from machinegnostics.magcal.criteria_eval import CriteriaEvaluator
+
+def divI(y: np.ndarray, y_fit: np.ndarray, verbose: bool = False) -> float:
+    """
+    Compute the Divergence Information (DivI) for evaluating the fit between observed data and model predictions.
+
+    The DivI is a statistical metric that measures the divergence between the distributions of the observed and fitted values
+    using gnostic characteristics. It is particularly useful for assessing the quality of model fits in various applications.
+
+    Parameters
+    ----------
+    y : np.ndarray
+        The observed data (ground truth). Must be a 1D array of numerical values.
+    y_fit : np.ndarray
+        The fitted data (model predictions). Must be a 1D array of the same shape as `y`.
+    verbose : bool, optional
+        If True, enables detailed logging for debugging purposes. Default is False.
+
+    Returns
+    -------
+    float
+        The computed Divergence Information (DivI) value.
+
+    Raises
+    ------
+    ValueError
+        If `y` and `y_fit` do not have the same shape.
+    ValueError
+        If `y` or `y_fit` are not 1D arrays.
+
+    Notes
+    -----
+    - The DivI is calculated using gnostic characteristics, which provide a robust way to measure divergence between distributions.
+
+    References
+    ----------
+    - Kovanic P., Humber M.B (2015) The Economics of Information - Mathematical Gnostics for Data Analysis, Chapter 19.3.4
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from machinegnostics.metrics import divI
+    >>> y = np.array([1.0, 2.0, 3.0, 4.0])
+    >>> y_fit = np.array([1.1, 1.9, 3.2, 3.8])
+    >>> divI(y, y_fit)
+    """
+    logger = get_logger('DivI', level=logging.WARNING if not verbose else logging.INFO)
+    logger.info("Starting DivI calculation.")
+    # Ensure y and y_fit are 1D arrays
+    if y.ndim != 1 or y_fit.ndim != 1:
+        logger.error("Both y and y_fit must be 1D arrays.")
+        raise ValueError("Both y and y_fit must be 1D arrays.")
+
+    # Ensure y and y_fit have the same shape
+    if y.shape != y_fit.shape:
+        logger.error("y and y_fit must have the same shape.")
+        raise ValueError("y and y_fit must have the same shape.")
+
+    # Convert to numpy arrays and flatten
+    y = np.asarray(y).flatten()
+    y_fit = np.asarray(y_fit).flatten()
+
+    # Compute the Divergence Information (DivI)
+    evaluator = CriteriaEvaluator(y, y_fit, verbose=verbose)
+    divI_value = evaluator._divI()
+    logger.info("Divergence Information (DivI) calculation completed.")
+    return divI_value
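A hedged smoke test for divI, reusing the docstring's data. The import path assumes machinegnostics/metrics/__init__.py re-exports divI, and no expected output is asserted because the value comes from CriteriaEvaluator's internal gnostic characteristics:

import numpy as np
from machinegnostics.metrics import divI  # assumed re-export

y = np.array([1.0, 2.0, 3.0, 4.0])
y_fit = np.array([1.1, 1.9, 3.2, 3.8])
print(divI(y, y_fit, verbose=True))  # verbose=True surfaces the INFO-level log lines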
@@ -0,0 +1,109 @@
+'''
+ManGo - Machine Gnostics Library
+Copyright (C) 2025 ManGo Team
+
+Author: Nirmal Parmar
+'''
+from machinegnostics.magcal.util.logging import get_logger
+import logging
+import numpy as np
+from machinegnostics.magcal.criteria_eval import CriteriaEvaluator
+
+def evalMet(y: np.ndarray, y_fit: np.ndarray, w: np.ndarray = None, verbose: bool = False) -> float:
+    """
+    Compute the Evaluation Metric (EvalMet) for evaluating the fit between observed data and model predictions.
+
+    The EvalMet is a composite metric that combines Robust R-squared (RobR2), Geometric Mean of Model Fit Error (GMMFE),
+    and Divergence Information (DivI) to provide a comprehensive assessment of model performance.
+
+    Parameters
+    ----------
+    y : np.ndarray
+        The observed data (ground truth). Must be a 1D array of numerical values.
+    y_fit : np.ndarray
+        The fitted data (model predictions). Must be a 1D array of the same shape as `y`.
+    w : np.ndarray, optional
+        Weights for the data points. If not provided, an array of ones is used.
+    verbose : bool, optional
+        If True, enables detailed logging for debugging purposes. Default is False.
+
+    Returns
+    -------
+    float
+        The computed Evaluation Metric (EvalMet) value.
+
+    Raises
+    ------
+    ValueError
+        If `y` and `y_fit` do not have the same shape.
+    ValueError
+        If `w` is provided and does not have the same shape as `y`.
+    ValueError
+        If `y` or `y_fit` are not 1D arrays.
+
+    Notes
+    -----
+    - The EvalMet is calculated as:
+        EvalMet = RobR2 / (GMMFE * DivI)
+      where:
+        - RobR2 = Robust R-squared value
+        - GMMFE = Geometric Mean of Model Fit Error
+        - DivI = Divergence Information
+
+    References
+    ----------
+    - Kovanic P., Humber M.B (2015) The Economics of Information - Mathematical Gnostics for Data Analysis, Chapter 19.3.4
+
+    Example
+    -------
+    >>> import numpy as np
+    >>> from machinegnostics.metrics import evalMet
+    >>> y = np.array([1.0, 2.0, 3.0, 4.0])
+    >>> y_fit = np.array([1.1, 1.9, 3.2, 3.8])
+    >>> evalMet(y, y_fit)
+    """
+    logger = get_logger('EvalMet', level=logging.WARNING if not verbose else logging.INFO)
+    logger.info("Starting EvalMet calculation.")
+    # Ensure y and y_fit are 1D arrays
+    if y.ndim != 1 or y_fit.ndim != 1:
+        logger.error("Both y and y_fit must be 1D arrays.")
+        raise ValueError("Both y and y_fit must be 1D arrays.")
+
+    # Ensure y and y_fit have the same shape
+    if y.shape != y_fit.shape:
+        logger.error("y and y_fit must have the same shape.")
+        raise ValueError("y and y_fit must have the same shape.")
+
+    # Empty, NaN, and Inf checks
+    if y.size == 0 or y_fit.size == 0:
+        logger.error("y and y_fit must not be empty.")
+        raise ValueError("y and y_fit must not be empty.")
+    if np.any(np.isnan(y)) or np.any(np.isnan(y_fit)):
+        logger.error("y and y_fit must not contain NaN values.")
+        raise ValueError("y and y_fit must not contain NaN values.")
+    if np.any(np.isinf(y)) or np.any(np.isinf(y_fit)):
+        logger.error("y and y_fit must not contain Inf values.")
+        raise ValueError("y and y_fit must not contain Inf values.")
+
+    # If weights are not provided, use an array of ones
+    if w is None:
+        w = np.ones_like(y)
+
+    # Ensure weights have the same shape as y
+    if w.shape != y.shape:
+        logger.error("Weights must have the same shape as y.")
+        raise ValueError("Weights must have the same shape as y.")
+
+    # Convert to numpy arrays and flatten
+    y = np.asarray(y).flatten()
+    y_fit = np.asarray(y_fit).flatten()
+
+    # Compute the Evaluation Metric (EvalMet)
+    evaluator = CriteriaEvaluator(y, y_fit, w, verbose=verbose)
+    evalmet = evaluator._evalmet()
+    logger.info("EvalMet calculation completed.")
+    return evalmet
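The composite formula rewards accurate, well-behaved fits: RobR2 pushes the score up, while GMMFE and DivI penalize it. A purely illustrative computation with made-up component values (not produced by CriteriaEvaluator):

rob_r2 = 0.95   # hypothetical Robust R-squared
gmmfe = 1.10    # hypothetical Geometric Mean of Model Fit Error
div_i = 1.05    # hypothetical Divergence Information
eval_met = rob_r2 / (gmmfe * div_i)
print(round(eval_met, 3))  # 0.823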
@@ -0,0 +1,128 @@
+import numpy as np
+import pandas as pd
+from machinegnostics.magcal.util.logging import get_logger
+import logging
+
+def f1_score(y_true: np.ndarray | pd.Series | list,
+             y_pred: np.ndarray | pd.Series | list,
+             average='binary',
+             labels=None,
+             verbose: bool = False) -> float | np.ndarray:
+    """
+    Computes the F1 score for classification tasks.
+
+    The F1 score is the harmonic mean of precision and recall.
+    Supports binary and multiclass classification.
+
+    Parameters
+    ----------
+    y_true : array-like or pandas Series/DataFrame column of shape (n_samples,)
+        Ground truth (correct) target values.
+
+    y_pred : array-like or pandas Series/DataFrame column of shape (n_samples,)
+        Estimated targets as returned by a classifier.
+
+    average : {'binary', 'micro', 'macro', 'weighted', None}, default='binary'
+        - 'binary': Only report results for the positive class, taken as the second of the two sorted labels (default for binary).
+        - 'micro': Calculate metrics globally by counting the total true positives, false negatives and false positives.
+        - 'macro': Calculate metrics for each label, and find their unweighted mean.
+        - 'weighted': Calculate metrics for each label, and find their average weighted by support (the number of true instances for each label).
+        - None: Return the F1 score for each class.
+
+    labels : array-like, default=None
+        List of labels to include. If None, uses sorted unique labels from y_true and y_pred.
+
+    verbose : bool, optional
+        If True, enables detailed logging for debugging purposes. Default is False.
+
+    Returns
+    -------
+    f1 : float or array of floats
+        F1 score(s). Float if average is not None, array otherwise.
+
+    Examples
+    --------
+    >>> y_true = [0, 1, 2, 2, 0]
+    >>> y_pred = [0, 0, 2, 2, 0]
+    >>> f1_score(y_true, y_pred, average='macro')
+    0.6
+
+    >>> import pandas as pd
+    >>> df = pd.DataFrame({'true': [1, 0, 1], 'pred': [1, 1, 1]})
+    >>> f1_score(df['true'], df['pred'], average='binary')
+    0.8
+    """
+    logger = get_logger('f1_score', level=logging.WARNING if not verbose else logging.INFO)
+    logger.info("Calculating F1 Score...")
+    # If input is a DataFrame, raise error (must select column)
+    if isinstance(y_true, pd.DataFrame) or isinstance(y_pred, pd.DataFrame):
+        logger.error("y_true and y_pred must be 1D array-like or pandas Series, not DataFrame. Select a column.")
+        raise ValueError("y_true and y_pred must be 1D array-like or pandas Series, not DataFrame. Select a column.")
+
+    # Convert pandas Series to numpy array
+    if isinstance(y_true, pd.Series):
+        y_true = y_true.values
+    if isinstance(y_pred, pd.Series):
+        y_pred = y_pred.values
+
+    # Convert to numpy arrays and flatten
+    y_true = np.asarray(y_true).flatten()
+    y_pred = np.asarray(y_pred).flatten()
+
+    if y_true.shape != y_pred.shape:
+        logger.error("Shape mismatch between y_true and y_pred.")
+        raise ValueError("Shape of y_true and y_pred must be the same.")
+    if y_true.size == 0:
+        logger.error("Empty input arrays.")
+        raise ValueError("y_true and y_pred must not be empty.")
+    # Inf and NaN check
+    if np.any(np.isnan(y_true)) or np.any(np.isnan(y_pred)):
+        logger.error("Input contains NaN values.")
+        raise ValueError("y_true and y_pred must not contain NaN values.")
+    if np.any(np.isinf(y_true)) or np.any(np.isinf(y_pred)):
+        logger.error("Input contains Inf values.")
+        raise ValueError("y_true and y_pred must not contain Inf values.")
+
+    # Get unique labels
+    if labels is None:
+        labels = np.unique(np.concatenate([y_true, y_pred]))
+    else:
+        labels = np.asarray(labels)
+
+    precisions = []
+    recalls = []
+    for label in labels:
+        tp = np.sum((y_pred == label) & (y_true == label))
+        fp = np.sum((y_pred == label) & (y_true != label))
+        fn = np.sum((y_pred != label) & (y_true == label))
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        precisions.append(precision)
+        recalls.append(recall)
+
+    precisions = np.array(precisions)
+    recalls = np.array(recalls)
+    f1s = np.where((precisions + recalls) > 0, 2 * precisions * recalls / (precisions + recalls), 0.0)
+
+    logger.info("F1 Score calculation completed.")
+    if average == 'binary':
+        if len(labels) != 2:
+            logger.error("Binary average is only supported for binary classification with 2 classes.")
+            raise ValueError("Binary average is only supported for binary classification with 2 classes.")
+        return f1s[1]
+    elif average == 'micro':
+        tp = sum(np.sum((y_pred == label) & (y_true == label)) for label in labels)
+        fp = sum(np.sum((y_pred == label) & (y_true != label)) for label in labels)
+        fn = sum(np.sum((y_pred != label) & (y_true == label)) for label in labels)
+        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
+        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
+        return 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
+    elif average == 'macro':
+        return np.mean(f1s)
+    elif average == 'weighted':
+        support = np.array([np.sum(y_true == label) for label in labels])
+        return np.average(f1s, weights=support)
+    elif average is None:
+        return f1s
+    else:
+        logger.error(f"Unknown average type: {average}")
+        raise ValueError(f"Unknown average type: {average}")