machinegnostics-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- __init__.py +0 -0
- machinegnostics/__init__.py +24 -0
- machinegnostics/magcal/__init__.py +37 -0
- machinegnostics/magcal/characteristics.py +460 -0
- machinegnostics/magcal/criteria_eval.py +268 -0
- machinegnostics/magcal/criterion.py +140 -0
- machinegnostics/magcal/data_conversion.py +381 -0
- machinegnostics/magcal/gcor.py +64 -0
- machinegnostics/magcal/gdf/__init__.py +2 -0
- machinegnostics/magcal/gdf/base_df.py +39 -0
- machinegnostics/magcal/gdf/base_distfunc.py +1202 -0
- machinegnostics/magcal/gdf/base_egdf.py +823 -0
- machinegnostics/magcal/gdf/base_eldf.py +830 -0
- machinegnostics/magcal/gdf/base_qgdf.py +1234 -0
- machinegnostics/magcal/gdf/base_qldf.py +1019 -0
- machinegnostics/magcal/gdf/cluster_analysis.py +456 -0
- machinegnostics/magcal/gdf/data_cluster.py +975 -0
- machinegnostics/magcal/gdf/data_intervals.py +853 -0
- machinegnostics/magcal/gdf/data_membership.py +536 -0
- machinegnostics/magcal/gdf/der_egdf.py +243 -0
- machinegnostics/magcal/gdf/distfunc_engine.py +841 -0
- machinegnostics/magcal/gdf/egdf.py +324 -0
- machinegnostics/magcal/gdf/eldf.py +297 -0
- machinegnostics/magcal/gdf/eldf_intv.py +609 -0
- machinegnostics/magcal/gdf/eldf_ma.py +627 -0
- machinegnostics/magcal/gdf/homogeneity.py +1218 -0
- machinegnostics/magcal/gdf/intv_engine.py +1523 -0
- machinegnostics/magcal/gdf/marginal_intv_analysis.py +558 -0
- machinegnostics/magcal/gdf/qgdf.py +289 -0
- machinegnostics/magcal/gdf/qldf.py +296 -0
- machinegnostics/magcal/gdf/scedasticity.py +197 -0
- machinegnostics/magcal/gdf/wedf.py +181 -0
- machinegnostics/magcal/gdf/z0_estimator.py +1047 -0
- machinegnostics/magcal/layer_base.py +42 -0
- machinegnostics/magcal/layer_history_base.py +74 -0
- machinegnostics/magcal/layer_io_process_base.py +238 -0
- machinegnostics/magcal/layer_param_base.py +448 -0
- machinegnostics/magcal/mg_weights.py +36 -0
- machinegnostics/magcal/sample_characteristics.py +532 -0
- machinegnostics/magcal/scale_optimization.py +185 -0
- machinegnostics/magcal/scale_param.py +313 -0
- machinegnostics/magcal/util/__init__.py +0 -0
- machinegnostics/magcal/util/dis_docstring.py +18 -0
- machinegnostics/magcal/util/logging.py +24 -0
- machinegnostics/magcal/util/min_max_float.py +34 -0
- machinegnostics/magnet/__init__.py +0 -0
- machinegnostics/metrics/__init__.py +28 -0
- machinegnostics/metrics/accu.py +61 -0
- machinegnostics/metrics/accuracy.py +67 -0
- machinegnostics/metrics/auto_correlation.py +183 -0
- machinegnostics/metrics/auto_covariance.py +204 -0
- machinegnostics/metrics/cls_report.py +130 -0
- machinegnostics/metrics/conf_matrix.py +93 -0
- machinegnostics/metrics/correlation.py +178 -0
- machinegnostics/metrics/cross_variance.py +167 -0
- machinegnostics/metrics/divi.py +82 -0
- machinegnostics/metrics/evalmet.py +109 -0
- machinegnostics/metrics/f1_score.py +128 -0
- machinegnostics/metrics/gmmfe.py +108 -0
- machinegnostics/metrics/hc.py +141 -0
- machinegnostics/metrics/mae.py +72 -0
- machinegnostics/metrics/mean.py +117 -0
- machinegnostics/metrics/median.py +122 -0
- machinegnostics/metrics/mg_r2.py +167 -0
- machinegnostics/metrics/mse.py +78 -0
- machinegnostics/metrics/precision.py +119 -0
- machinegnostics/metrics/r2.py +122 -0
- machinegnostics/metrics/recall.py +108 -0
- machinegnostics/metrics/rmse.py +77 -0
- machinegnostics/metrics/robr2.py +119 -0
- machinegnostics/metrics/std.py +144 -0
- machinegnostics/metrics/variance.py +101 -0
- machinegnostics/models/__init__.py +2 -0
- machinegnostics/models/classification/__init__.py +1 -0
- machinegnostics/models/classification/layer_history_log_reg.py +121 -0
- machinegnostics/models/classification/layer_io_process_log_reg.py +98 -0
- machinegnostics/models/classification/layer_mlflow_log_reg.py +107 -0
- machinegnostics/models/classification/layer_param_log_reg.py +275 -0
- machinegnostics/models/classification/mg_log_reg.py +273 -0
- machinegnostics/models/cross_validation.py +118 -0
- machinegnostics/models/data_split.py +106 -0
- machinegnostics/models/regression/__init__.py +2 -0
- machinegnostics/models/regression/layer_histroy_rob_reg.py +139 -0
- machinegnostics/models/regression/layer_io_process_rob_rig.py +88 -0
- machinegnostics/models/regression/layer_mlflow_rob_reg.py +134 -0
- machinegnostics/models/regression/layer_param_rob_reg.py +212 -0
- machinegnostics/models/regression/mg_lin_reg.py +253 -0
- machinegnostics/models/regression/mg_poly_reg.py +258 -0
- machinegnostics-0.0.1.dist-info/METADATA +246 -0
- machinegnostics-0.0.1.dist-info/RECORD +93 -0
- machinegnostics-0.0.1.dist-info/WHEEL +5 -0
- machinegnostics-0.0.1.dist-info/licenses/LICENSE +674 -0
- machinegnostics-0.0.1.dist-info/top_level.txt +2 -0
machinegnostics/metrics/accuracy.py
@@ -0,0 +1,67 @@
import numpy as np
import pandas as pd
import logging
from machinegnostics.magcal.util.logging import get_logger
from machinegnostics.metrics.mean import mean

def accuracy_score(y_true: np.ndarray, y_pred: np.ndarray, verbose: bool = False) -> float:
    """
    Computes the accuracy classification score.

    Supports input as numpy arrays, lists, or pandas Series/DataFrame columns.

    Parameters
    ----------
    y_true : array-like or pandas Series/DataFrame column of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like or pandas Series/DataFrame column of shape (n_samples,)
        Estimated targets as returned by a classifier.

    verbose : bool, optional
        If True, enables detailed logging for debugging purposes. Default is False.

    Returns
    -------
    accuracy : float
        The accuracy score as a float in the range [0, 1].

    Examples
    --------
    >>> y_true = [0, 1, 2, 2, 0]
    >>> y_pred = [0, 0, 2, 2, 0]
    >>> accuracy_score(y_true, y_pred)
    0.8

    >>> import pandas as pd
    >>> df = pd.DataFrame({'true': [1, 0, 1], 'pred': [1, 1, 1]})
    >>> accuracy_score(df['true'], df['pred'])
    """
    logger = get_logger('accuracy_score', level=logging.WARNING if not verbose else logging.INFO)
    logger.info("Calculating Accuracy Score...")
    # Check for empty input
    if y_true is None or y_pred is None:
        logger.error("y_true and y_pred must not be None.")
        raise ValueError("y_true and y_pred must not be None.")
    # If input is a DataFrame, raise an error (a column must be selected)
    if isinstance(y_true, pd.DataFrame) or isinstance(y_pred, pd.DataFrame):
        logger.error("y_true and y_pred must be 1D array-like or pandas Series, not DataFrame. Select a column.")
        raise ValueError("y_true and y_pred must be 1D array-like or pandas Series, not DataFrame. Select a column.")

    # Convert pandas Series to numpy arrays
    if isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_pred, pd.Series):
        y_pred = y_pred.values

    # Convert to numpy arrays and flatten
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()

    if y_true.shape != y_pred.shape:
        raise ValueError("Shape of y_true and y_pred must be the same.")

    correct = np.sum(y_true == y_pred)
    total = y_true.size
    accuracy = correct / total
    logger.info("Accuracy score calculation complete.")
    return accuracy
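A minimal usage sketch for accuracy.py above. The import path assumes accuracy_score is re-exported from machinegnostics.metrics (the metrics __init__.py is not shown in this diff, so that re-export is an assumption); the expected value matches the function's own docstring example:

import numpy as np
from machinegnostics.metrics import accuracy_score  # assumed re-export

y_true = np.array([0, 1, 2, 2, 0])
y_pred = np.array([0, 0, 2, 2, 0])
print(accuracy_score(y_true, y_pred))  # 4 of 5 labels match -> 0.8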
machinegnostics/metrics/auto_correlation.py
@@ -0,0 +1,183 @@
"""
Auto-Correlation Metric

This module provides a function to compute the auto-correlation of a data sample.

Author: Nirmal Parmar
Machine Gnostics
"""

import logging
from machinegnostics.magcal.util.logging import get_logger
import numpy as np
from machinegnostics.magcal import EGDF, QGDF, DataHomogeneity

def auto_correlation(data: np.ndarray, lag: int = 0, case: str = 'i', verbose: bool = False) -> float:
    """
    Calculate the Gnostic auto-correlation of a data sample.

    Auto-correlation measures the similarity between a data sample and a lagged version of itself.
    This function uses the principles of Gnostic theory to compute robust estimates of auto-correlation.

    Parameters
    ----------
    data : np.ndarray
        The data sample. Must be a 1D numpy array without NaN or Inf values.
    lag : int, optional, default=0
        The lag value for which the auto-correlation is computed. Must be non-negative and less than the length of the data.
    case : str, optional, default='i'
        Specifies the type of geometry to use:
        - 'i': Estimation geometry (EGDF).
        - 'j': Quantifying geometry (QGDF).
    verbose : bool, optional, default=False
        If True, enables detailed logging for debugging purposes.

    Returns
    -------
    float
        The Gnostic auto-correlation coefficient for the given lag.

    Raises
    ------
    ValueError
        If the input array is empty, contains NaN/Inf values, is not 1D, or if the lag is invalid.

    Examples
    --------
    Example 1: Compute auto-correlation for a simple dataset
    >>> import numpy as np
    >>> from machinegnostics.metrics import auto_correlation
    >>> data = np.array([1, 2, 3, 4, 5])
    >>> lag = 1
    >>> auto_corr = auto_correlation(data, lag=lag, case='i', verbose=False)
    >>> print(f"Auto-Correlation (lag={lag}, case='i'): {auto_corr}")

    Notes
    -----
    - This metric is robust to data uncertainty and provides meaningful estimates even in the presence of noise or outliers.
    - Ensure that the input data is preprocessed and cleaned for optimal results.
    """
    logger = get_logger('auto_correlation', level=logging.WARNING if not verbose else logging.INFO)
    logger.info("Starting auto-correlation computation.")

    # Validate inputs
    if not isinstance(data, np.ndarray):
        logger.error("Input must be a numpy array.")
        raise ValueError("Input must be a numpy array.")
    # Flatten data
    data = data.flatten()
    if data.ndim != 1:
        logger.error("Input array must be 1D.")
        raise ValueError("Input array must be 1D.")
    if len(data) == 0:
        logger.error("Input array must not be empty.")
        raise ValueError("Input array must not be empty.")
    if np.any(np.isnan(data)):
        logger.error("Input array must not contain NaN values.")
        raise ValueError("Input array must not contain NaN values.")
    if np.any(np.isinf(data)):
        logger.error("Input array must not contain Inf values.")
        raise ValueError("Input array must not contain Inf values.")
    if lag < 0 or lag >= len(data):
        logger.error("Lag must be non-negative and less than the length of the data.")
        raise ValueError("Lag must be non-negative and less than the length of the data.")
    if case not in ['i', 'j']:
        logger.error("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")
        raise ValueError("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")

    # Shift data by lag and truncate both series to the overlap region
    data_lagged = np.roll(data, -lag)
    data_lagged = data_lagged[:-lag] if lag > 0 else data_lagged
    data = data[:len(data_lagged)]

    # Default arguments for gnostic functions
    FLUSH = False
    VERBOSE = False

    if case == 'i':
        logger.info("Using Estimation Global Distribution Function (EGDF) for irrelevance computation.")
        # Fit EGDF to both series
        egdf_data = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data.fit(data)

        egdf_data_lagged = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data_lagged.fit(data_lagged)

        # Data homogeneity
        logger.info("Performing data homogeneity check.")
        dh_data = DataHomogeneity(gdf=egdf_data, verbose=VERBOSE, flush=FLUSH)
        is_homo_data = dh_data.fit()

        dh_data_lagged = DataHomogeneity(gdf=egdf_data_lagged, verbose=VERBOSE, flush=FLUSH)
        is_homo_data_lagged = dh_data_lagged.fit()

        # Refit with S=1 when a sample is not homogeneous
        if not is_homo_data:
            logger.warning("Data is not homogeneous. Switching to S=1 for better results.")
            logger.info("Fitting EGDF with S=1.")
            egdf_data = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
            egdf_data.fit(data)

        if not is_homo_data_lagged:
            logger.warning("Lagged data is not homogeneous. Switching to S=1 for better results.")
            logger.info("Fitting EGDF with S=1.")
            egdf_data_lagged = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
            egdf_data_lagged.fit(data_lagged)

        # Get irrelevance of the data samples
        logger.info("Getting irrelevance of the data sample.")
        hc_data = np.mean(egdf_data.hi, axis=0)
        hc_data_lagged = np.mean(egdf_data_lagged.hi, axis=0)

    if case == 'j':
        logger.info("Using Quantifying Global Distribution Function (QGDF) for irrelevance computation.")
        # EGDF fits are used here only for the homogeneity check
        egdf_data = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data.fit(data)

        egdf_data_lagged = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data_lagged.fit(data_lagged)

        # Data homogeneity
        logger.info("Performing data homogeneity check.")
        dh_data = DataHomogeneity(gdf=egdf_data, verbose=VERBOSE, flush=FLUSH)
        is_homo_data = dh_data.fit()

        dh_data_lagged = DataHomogeneity(gdf=egdf_data_lagged, verbose=VERBOSE, flush=FLUSH)
        is_homo_data_lagged = dh_data_lagged.fit()

        if not is_homo_data:
            logger.info("Data is not homogeneous.")
        if not is_homo_data_lagged:
            logger.info("Lagged data is not homogeneous.")

        # Fit QGDF to both series
        logger.info("Fitting QGDF.")
        qgdf_data = QGDF(flush=FLUSH, verbose=VERBOSE)
        qgdf_data.fit(data)

        qgdf_data_lagged = QGDF(flush=FLUSH, verbose=VERBOSE)
        qgdf_data_lagged.fit(data_lagged)

        # Get irrelevance of the data samples
        hc_data = np.mean(qgdf_data.hj, axis=0)
        hc_data_lagged = np.mean(qgdf_data_lagged.hj, axis=0)

    # Prevent overflow by clipping irrelevance values to the range [1, 1e12]
    hc_data = np.clip(hc_data, 1, 1e12)
    hc_data_lagged = np.clip(hc_data_lagged, 1, 1e12)

    # Compute correlation (check the denominator before dividing to avoid a zero division)
    def compute_correlation(hc_data_1: np.ndarray, hc_data_2: np.ndarray) -> float:
        logger.debug("Computing correlation.")
        numerator = np.sum(hc_data_1 * hc_data_2)
        denominator = np.sqrt(np.sum(hc_data_1**2)) * np.sqrt(np.sum(hc_data_2**2))
        if denominator == 0:
            return np.nan
        return numerator / denominator

    auto_corr = compute_correlation(hc_data, hc_data_lagged)

    return auto_corr
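A usage sketch for auto_correlation.py above, following the import path shown in its own docstring. The returned value depends on the EGDF/QGDF fits, so no expected output is stated:

import numpy as np
from machinegnostics.metrics import auto_correlation

data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
# lag=1 compares the series against itself shifted by one step;
# case='i' takes the estimation-geometry (EGDF) path above.
r = auto_correlation(data, lag=1, case='i')
print(f"Auto-correlation (lag=1): {r}")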
machinegnostics/metrics/auto_covariance.py
@@ -0,0 +1,204 @@
'''
Gnostic auto-covariance

Author: Nirmal Parmar
Machine Gnostics
'''

import numpy as np
from machinegnostics.magcal import EGDF, QGDF, DataHomogeneity
from machinegnostics.magcal.util.logging import get_logger
import logging

def auto_covariance(data: np.ndarray, lag: int = 0, case: str = 'i', verbose: bool = False) -> float:
    """
    Calculate the Gnostic auto-covariance of a data sample.

    Auto-covariance measures the relationship between a data sample and a lagged version of itself.
    This function uses the principles of Gnostic theory to compute robust estimates of auto-covariance.

    Parameters
    ----------
    data : np.ndarray
        The data sample. Must be a 1D numpy array without NaN or Inf values.
        The input data should represent a time series or sequential data points.
    lag : int, optional, default=0
        The lag value for which the auto-covariance is computed. Must be non-negative and less than the length of the data.
        A lag of 0 computes the covariance of the data with itself.
    case : str, optional, default='i'
        Specifies the type of geometry to use for irrelevance computation:
        - 'i': Estimation Geometry Distribution Function.
        - 'j': Quantifying Geometry Distribution Function.
    verbose : bool, optional, default=False
        If True, detailed logging information will be printed during the computation.

    Returns
    -------
    float
        The Gnostic auto-covariance coefficient for the given lag.

    Raises
    ------
    ValueError
        If the input array is invalid (e.g., not a numpy array, contains NaN/Inf values, is not 1D, or is empty).
        If the lag is negative or greater than or equal to the length of the data.
        If the case is not one of ['i', 'j'].

    Notes
    -----
    - This function uses Gnostic theory to compute irrelevance values for the data and its lagged version.
    - Irrelevance values are clipped to avoid overflow, with a maximum value of 1e12.
    - Homogeneity checks are performed on the data and its lagged version. If the data is not homogeneous, warnings are raised.

    Warnings
    --------
    - If the data or its lagged version is not homogeneous, a warning is printed suggesting the use of a scale parameter (S = 1) for better results.

    Examples
    --------
    Example 1: Compute auto-covariance for a simple dataset
    >>> from machinegnostics.metrics import auto_covariance
    >>> import machinegnostics as mg  # alternative import
    >>> import numpy as np
    >>> data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    >>> lag = 1
    >>> auto_covar = auto_covariance(data, lag=lag, case='i')
    >>> auto_covar = mg.auto_covariance(data, lag=lag, case='i')  # alternative usage
    >>> print(f"Auto-covariance with lag={lag}: {auto_covar}")

    Example 2: Compute auto-covariance for a dataset with QGDF
    >>> data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
    >>> lag = 2
    >>> auto_covar = auto_covariance(data, lag=lag, case='j')
    >>> print(f"Auto-covariance with lag={lag}: {auto_covar}")

    Example 3: Handle invalid input
    >>> data = np.array([1.0, np.nan, 3.0, 4.0, 5.0])
    >>> lag = 1
    >>> try:
    ...     auto_covar = auto_covariance(data, lag=lag, case='i')
    ... except ValueError as e:
    ...     print(f"Error: {e}")
    """
    logger = get_logger('auto_covariance', level=logging.WARNING if not verbose else logging.INFO)
    # Validate inputs
    if not isinstance(data, np.ndarray):
        logger.error("Input must be a numpy array.")
        raise ValueError("Input must be a numpy array.")
    # Flatten data
    data = data.flatten()
    if data.ndim != 1:
        logger.error("Input array must be 1D.")
        raise ValueError("Input array must be 1D.")
    if len(data) == 0:
        logger.error("Input array must not be empty.")
        raise ValueError("Input array must not be empty.")
    if np.any(np.isnan(data)):
        logger.error("Input array must not contain NaN values.")
        raise ValueError("Input array must not contain NaN values.")
    if np.any(np.isinf(data)):
        logger.error("Input array must not contain Inf values.")
        raise ValueError("Input array must not contain Inf values.")
    if lag < 0 or lag >= len(data):
        logger.error("Lag must be non-negative and less than the length of the data.")
        raise ValueError("Lag must be non-negative and less than the length of the data.")
    if case not in ['i', 'j']:
        logger.error("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")
        raise ValueError("Case must be 'i' for estimation geometry or 'j' for quantifying geometry.")

    # Shift data by lag and truncate both series to the overlap region
    data_lagged = np.roll(data, -lag)
    data_lagged = data_lagged[:-lag] if lag > 0 else data_lagged
    data = data[:len(data_lagged)]

    # Default arguments for gnostic functions
    FLUSH = False
    VERBOSE = False

    if case == 'i':
        logger.info("Using Estimation Global Distribution Function (EGDF) for irrelevance computation.")
        # Fit EGDF to both series
        egdf_data = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data.fit(data)

        egdf_data_lagged = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data_lagged.fit(data_lagged)

        # Data homogeneity
        logger.info("Performing data homogeneity check.")
        dh_data = DataHomogeneity(gdf=egdf_data, verbose=VERBOSE, flush=FLUSH)
        is_homo_data = dh_data.fit()

        dh_data_lagged = DataHomogeneity(gdf=egdf_data_lagged, verbose=VERBOSE, flush=FLUSH)
        is_homo_data_lagged = dh_data_lagged.fit()

        # Refit with S=1 when a sample is not homogeneous
        if not is_homo_data:
            logger.warning("Data is not homogeneous. Switching to S=1 for better results.")
            logger.info("Fitting EGDF with S=1.")
            egdf_data = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
            egdf_data.fit(data)

        if not is_homo_data_lagged:
            logger.warning("Lagged data is not homogeneous. Switching to S=1 for better results.")
            logger.info("Fitting EGDF with S=1.")
            egdf_data_lagged = EGDF(flush=FLUSH, verbose=VERBOSE, S=1)
            egdf_data_lagged.fit(data_lagged)

        # Get irrelevance of the data samples
        logger.info("Getting irrelevance of the data sample.")
        hc_data = np.mean(egdf_data.hi, axis=0)
        hc_data_lagged = np.mean(egdf_data_lagged.hi, axis=0)

    if case == 'j':
        logger.info("Using Quantifying Global Distribution Function (QGDF) for irrelevance computation.")
        # EGDF fits are used here only for the homogeneity check
        egdf_data = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data.fit(data)

        egdf_data_lagged = EGDF(flush=FLUSH, verbose=VERBOSE)
        egdf_data_lagged.fit(data_lagged)

        # Data homogeneity
        logger.info("Performing data homogeneity check.")
        dh_data = DataHomogeneity(gdf=egdf_data, verbose=VERBOSE, flush=FLUSH)
        is_homo_data = dh_data.fit()

        dh_data_lagged = DataHomogeneity(gdf=egdf_data_lagged, verbose=VERBOSE, flush=FLUSH)
        is_homo_data_lagged = dh_data_lagged.fit()

        if not is_homo_data:
            logger.info("Data is not homogeneous.")
        if not is_homo_data_lagged:
            logger.info("Lagged data is not homogeneous.")

        # Fit QGDF to both series
        logger.info("Fitting QGDF.")
        qgdf_data = QGDF(flush=FLUSH, verbose=VERBOSE)
        qgdf_data.fit(data)

        qgdf_data_lagged = QGDF(flush=FLUSH, verbose=VERBOSE)
        qgdf_data_lagged.fit(data_lagged)

        # Get irrelevance of the data samples
        hc_data = np.mean(qgdf_data.hj, axis=0)
        hc_data_lagged = np.mean(qgdf_data_lagged.hj, axis=0)

    # Prevent overflow by clipping irrelevance values to the range [1, 1e12]
    hc_data = np.clip(hc_data, 1, 1e12)
    hc_data_lagged = np.clip(hc_data_lagged, 1, 1e12)

    # Compute auto-covariance
    numerator = np.sum(hc_data * hc_data_lagged)
    denominator = len(data) - lag
    if denominator == 0:
        auto_covar = 0
    else:
        auto_covar = numerator / denominator

    return auto_covar
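A usage sketch for auto_covariance.py above, following the import path given in its docstring. The sum of irrelevance products is divided by len(data) - lag (with data already truncated to the overlap region), so the result scales with the remaining sample size; no expected output is stated since it depends on the EGDF/QGDF fits:

import numpy as np
from machinegnostics.metrics import auto_covariance

data = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
cov_i = auto_covariance(data, lag=1, case='i')  # estimation geometry (EGDF)
cov_j = auto_covariance(data, lag=2, case='j')  # quantifying geometry (QGDF)
print(cov_i, cov_j)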
machinegnostics/metrics/cls_report.py
@@ -0,0 +1,130 @@
import numpy as np
import pandas as pd
from machinegnostics.metrics import precision_score, recall_score, f1_score
from machinegnostics.magcal.util.logging import get_logger
import logging

def classification_report(
        y_true: np.ndarray | pd.Series,
        y_pred: np.ndarray | pd.Series,
        labels=None,
        target_names=None,
        digits=2,
        output_dict=False,
        verbose: bool = False
):
    """
    Builds a text summary or dictionary of the precision, recall, F1 score, and support for each class.

    Uses the precision_score, recall_score, and f1_score functions for consistency.

    Parameters
    ----------
    y_true : array-like or pandas Series/DataFrame column of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like or pandas Series/DataFrame column of shape (n_samples,)
        Estimated targets as returned by a classifier.

    labels : array-like, default=None
        List of labels to include in the report. If None, uses sorted unique labels from y_true and y_pred.

    target_names : list of str, default=None
        Optional display names matching the labels (same order).

    digits : int, default=2
        Number of digits for formatting output.

    output_dict : bool, default=False
        If True, return output as a dict for programmatic use. If False, return as a formatted string.

    verbose : bool, optional
        If True, enables detailed logging for debugging purposes. Default is False.

    Returns
    -------
    report : str or dict
        Text summary or dictionary of the precision, recall, and F1 score for each class.
    """
    logger = get_logger('classification_report', level=logging.WARNING if not verbose else logging.INFO)
    logger.info("Generating Classification Report...")
    # Convert pandas Series to numpy arrays
    if isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_pred, pd.Series):
        y_pred = y_pred.values

    # Convert to numpy arrays and flatten
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()

    if y_true.shape != y_pred.shape:
        logger.error("Shape of y_true and y_pred must be the same.")
        raise ValueError("Shape of y_true and y_pred must be the same.")

    # Get unique labels
    if labels is None:
        labels = np.unique(np.concatenate([y_true, y_pred]))
    else:
        labels = np.asarray(labels)

    n_labels = len(labels)
    if target_names is not None:
        if len(target_names) != n_labels:
            logger.error("target_names length must match number of labels")
            raise ValueError("target_names length must match number of labels")
    else:
        target_names = [str(label) for label in labels]

    # Per-class metrics via the package's own metric functions
    precisions = precision_score(y_true, y_pred, average=None, labels=labels)
    recalls = recall_score(y_true, y_pred, average=None, labels=labels)
    f1s = f1_score(y_true, y_pred, average=None, labels=labels)
    supports = np.array([(y_true == label).sum() for label in labels])

    # Support-weighted averages
    total_support = supports.sum()
    avg_precision = np.average(precisions, weights=supports) if total_support > 0 else 0.0
    avg_recall = np.average(recalls, weights=supports) if total_support > 0 else 0.0
    avg_f1 = np.average(f1s, weights=supports) if total_support > 0 else 0.0

    if output_dict:
        report = {}
        for i, label in enumerate(labels):
            report[target_names[i]] = {
                "precision": round(precisions[i], digits),
                "recall": round(recalls[i], digits),
                "f1-score": round(f1s[i], digits),
                "support": int(supports[i])
            }
        report["avg/total"] = {
            "precision": round(avg_precision, digits),
            "recall": round(avg_recall, digits),
            "f1-score": round(avg_f1, digits),
            "support": int(total_support)
        }
        return report

    # Build report string
    header = f"{'Class':<15}{'Precision':>10}{'Recall':>10}{'F1-score':>10}{'Support':>10}\n"
    report = header
    report += "=" * len(header) + "\n"
    for i in range(n_labels):
        report += (
            f"{target_names[i]:<15}"
            f"{precisions[i]:>10.{digits}f}"
            f"{recalls[i]:>10.{digits}f}"
            f"{f1s[i]:>10.{digits}f}"
            f"{supports[i]:>10}\n"
        )
    report += "=" * len(header) + "\n"
    report += (
        f"{'Avg/Total':<15}"
        f"{avg_precision:>10.{digits}f}"
        f"{avg_recall:>10.{digits}f}"
        f"{avg_f1:>10.{digits}f}"
        f"{total_support:>10}\n"
    )

    logger.info("Classification Report generated.")
    return report
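A usage sketch for cls_report.py above. It assumes classification_report is re-exported from machinegnostics.metrics alongside precision_score, recall_score, and f1_score (which the module itself imports from there):

import numpy as np
from machinegnostics.metrics import classification_report  # assumed re-export

y_true = np.array([0, 1, 1, 2, 2, 2])
y_pred = np.array([0, 1, 2, 2, 2, 1])
# Formatted text table, one row per class plus a weighted Avg/Total row:
print(classification_report(y_true, y_pred, target_names=['cat', 'dog', 'bird']))
# Dict form for programmatic use; per-class keys plus an 'avg/total' entry:
report = classification_report(y_true, y_pred, output_dict=True)
print(report['avg/total'])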
machinegnostics/metrics/conf_matrix.py
@@ -0,0 +1,93 @@
import numpy as np
import pandas as pd
from machinegnostics.magcal.util.logging import get_logger
import logging

def confusion_matrix(y_true: np.ndarray | pd.Series,
                     y_pred: np.ndarray | pd.Series,
                     labels=None, verbose=False) -> np.ndarray:
    """
    Computes the confusion matrix to evaluate the accuracy of a classification.

    By definition, entry (i, j) in the confusion matrix is the number of observations
    actually in class i but predicted to be in class j.

    Parameters
    ----------
    y_true : array-like or pandas Series/DataFrame column of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : array-like or pandas Series/DataFrame column of shape (n_samples,)
        Estimated targets as returned by a classifier.

    labels : array-like, default=None
        List of labels to index the matrix. This may be used to reorder or select a subset of labels.
        If None, labels that appear at least once in y_true or y_pred are used in sorted order.
    verbose : bool, optional
        If True, enables detailed logging for debugging purposes. Default is False.

    Returns
    -------
    cm : ndarray of shape (n_classes, n_classes)
        Confusion matrix whose i-th row and j-th column entry indicates the number of samples with
        true label being i-th class and predicted label being j-th class.

    Examples
    --------
    >>> y_true = [2, 0, 2, 2, 0, 1]
    >>> y_pred = [0, 0, 2, 2, 0, 2]
    >>> confusion_matrix(y_true, y_pred)
    array([[2, 0, 0],
           [0, 0, 1],
           [1, 0, 2]])
    """
    logger = get_logger('confusion_matrix', level=logging.WARNING if not verbose else logging.INFO)
    logger.info("Calculating Confusion Matrix...")
    # Convert pandas Series to numpy arrays
    if isinstance(y_true, pd.Series):
        y_true = y_true.values
    if isinstance(y_pred, pd.Series):
        y_pred = y_pred.values

    # Convert to numpy arrays and flatten
    y_true = np.asarray(y_true).flatten()
    y_pred = np.asarray(y_pred).flatten()

    if y_true.shape != y_pred.shape:
        logger.error("Shape of y_true and y_pred must be the same.")
        raise ValueError("Shape of y_true and y_pred must be the same.")
    if y_true.size == 0:
        logger.error("y_true and y_pred must not be empty.")
        raise ValueError("y_true and y_pred must not be empty.")
    # Ensure 1D arrays
    if y_true.ndim != 1 or y_pred.ndim != 1:
        logger.error("y_true and y_pred must be 1D arrays.")
        raise ValueError("y_true and y_pred must be 1D arrays.")
    # Inf and NaN checks
    if np.any(np.isnan(y_true)) or np.any(np.isnan(y_pred)):
        logger.error("y_true and y_pred must not contain NaN values.")
        raise ValueError("y_true and y_pred must not contain NaN values.")
    if np.any(np.isinf(y_true)) or np.any(np.isinf(y_pred)):
        logger.error("y_true and y_pred must not contain Inf values.")
        raise ValueError("y_true and y_pred must not contain Inf values.")

    # Determine labels
    if labels is None:
        labels = np.unique(np.concatenate([y_true, y_pred]))
    else:
        labels = np.asarray(labels)
    n_labels = len(labels)
    label_to_index = {label: idx for idx, label in enumerate(labels)}

    # Initialize confusion matrix
    cm = np.zeros((n_labels, n_labels), dtype=int)

    # Populate confusion matrix; pairs outside `labels` are ignored
    for true, pred in zip(y_true, y_pred):
        if true in label_to_index and pred in label_to_index:
            i = label_to_index[true]
            j = label_to_index[pred]
            cm[i, j] += 1

    logger.info("Confusion Matrix calculation completed.")
    return cm
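A usage sketch for conf_matrix.py above; the expected matrix is the one given in the function's own docstring (the import assumes confusion_matrix is re-exported from machinegnostics.metrics):

from machinegnostics.metrics import confusion_matrix  # assumed re-export

y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
cm = confusion_matrix(y_true, y_pred)
print(cm)
# Rows = true labels, columns = predicted labels (sorted: 0, 1, 2):
# [[2 0 0]
#  [0 0 1]
#  [1 0 2]]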