mlquantify 0.0.11.4__tar.gz → 0.0.11.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/PKG-INFO +2 -2
  2. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/README.md +1 -1
  3. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/gac.py +16 -23
  4. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/gpac.py +17 -26
  5. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py +0 -1
  6. mlquantify-0.0.11.6/mlquantify/methods/aggregative/mixtureModels/dys.py +107 -0
  7. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/dys_syn.py +63 -16
  8. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/hdy.py +44 -7
  9. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/PKG-INFO +2 -2
  10. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/setup.py +1 -1
  11. mlquantify-0.0.11.4/mlquantify/methods/aggregative/mixtureModels/dys.py +0 -55
  12. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/__init__.py +0 -0
  13. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/base.py +0 -0
  14. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/classification/__init__.py +0 -0
  15. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/classification/pwkclf.py +0 -0
  16. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/__init__.py +0 -0
  17. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/__init__.py +0 -0
  18. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/ae.py +0 -0
  19. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/bias.py +0 -0
  20. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/kld.py +0 -0
  21. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/mse.py +0 -0
  22. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/nae.py +0 -0
  23. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/nkld.py +0 -0
  24. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/nrae.py +0 -0
  25. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/rae.py +0 -0
  26. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/measures/se.py +0 -0
  27. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/_Protocol.py +0 -0
  28. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/__init__.py +0 -0
  29. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/app.py +0 -0
  30. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/evaluation/protocol/npp.py +0 -0
  31. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/__init__.py +0 -0
  32. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/_ThreholdOptimization.py +0 -0
  33. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/__init__.py +0 -0
  34. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/acc.py +0 -0
  35. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/max.py +0 -0
  36. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/ms.py +0 -0
  37. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/ms2.py +0 -0
  38. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/pacc.py +0 -0
  39. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/t50.py +0 -0
  40. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/ThreholdOptm/x.py +0 -0
  41. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/__init__.py +0 -0
  42. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/cc.py +0 -0
  43. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/emq.py +0 -0
  44. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/fm.py +0 -0
  45. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/__init__.py +0 -0
  46. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/smm.py +0 -0
  47. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/mixtureModels/sord.py +0 -0
  48. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/pcc.py +0 -0
  49. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/aggregative/pwk.py +0 -0
  50. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/meta/__init__.py +0 -0
  51. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/meta/ensemble.py +0 -0
  52. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/non_aggregative/__init__.py +0 -0
  53. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/methods/non_aggregative/hdx.py +0 -0
  54. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/model_selection.py +0 -0
  55. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/plots/__init__.py +0 -0
  56. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/plots/distribution_plot.py +0 -0
  57. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/plots/protocol_plot.py +0 -0
  58. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/__init__.py +0 -0
  59. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/__init__.py +0 -0
  60. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/convert_col_to_array.py +0 -0
  61. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/generate_artificial_indexes.py +0 -0
  62. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/get_real_prev.py +0 -0
  63. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/load_quantifier.py +0 -0
  64. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/make_prevs.py +0 -0
  65. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/normalize.py +0 -0
  66. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/parallel.py +0 -0
  67. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/general_purposes/round_protocol_df.py +0 -0
  68. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/__init__.py +0 -0
  69. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/distances.py +0 -0
  70. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/getHist.py +0 -0
  71. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/get_scores.py +0 -0
  72. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/moss.py +0 -0
  73. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/ternary_search.py +0 -0
  74. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify/utils/method_purposes/tprfpr.py +0 -0
  75. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/SOURCES.txt +0 -0
  76. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/dependency_links.txt +0 -0
  77. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/requires.txt +0 -0
  78. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/mlquantify.egg-info/top_level.txt +0 -0
  79. {mlquantify-0.0.11.4 → mlquantify-0.0.11.6}/setup.cfg +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mlquantify
-Version: 0.0.11.4
+Version: 0.0.11.6
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -32,7 +32,7 @@ ___
 
 ## Latest Release
 
-- **Version 0.0.1**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
 - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
 - Explore the [API documentation](#) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
README.md
@@ -9,7 +9,7 @@ ___
 
 ## Latest Release
 
-- **Version 0.0.1**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
 - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
 - Explore the [API documentation](#) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
mlquantify/methods/aggregative/gac.py
@@ -2,7 +2,7 @@ import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator
 from sklearn.metrics import confusion_matrix
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import train_test_split
 
 from ...base import AggregativeQuantifier
 
@@ -13,10 +13,12 @@ class GAC(AggregativeQuantifier):
     and solve it via constrained least-squares regression.
     """
 
-    def __init__(self, learner: BaseEstimator):
+    def __init__(self, learner: BaseEstimator, train_size:float=0.6, random_state:int=None):
         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
         self.learner = learner
         self.cond_prob_matrix = None
+        self.train_size = train_size
+        self.random_state = random_state
 
     def _fit_method(self, X, y):
         # Ensure X and y are DataFrames
@@ -29,26 +31,17 @@ class GAC(AggregativeQuantifier):
             y_pred = self.learner.predict(X)
             y_label = y
         else:
-            # Cross-validation for generating predictions
-            skf = StratifiedKFold(n_splits=self.cv_folds)
-            y_pred = []
-            y_label = []
+            X_train, X_val, y_train, y_val = train_test_split(
+                X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
+            )
 
-            for train_index, valid_index in skf.split(X, y):
-
-                train_data = pd.DataFrame(X.iloc[train_index])
-                train_label = y.iloc[train_index]
-
-                valid_data = pd.DataFrame(X.iloc[valid_index])
-                valid_label = y.iloc[valid_index]
-
-                self.learner.fit(train_data, train_label)
-
-                y_pred.extend(self.learner.predict(valid_data))
-                y_label.extend(valid_label)
+            self.learner.fit(X_train, y_train)
+
+            y_label = y_val
+            y_pred = self.learner.predict(X_val)
 
         # Compute conditional probability matrix
-        self.cond_prob_matrix = self.get_cond_prob_matrix(self.classes, y, y_pred)
+        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_label, y_pred)
 
         return self
 
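The net effect of the hunk above: GAC's `_fit_method` no longer pools out-of-fold predictions from a StratifiedKFold loop; it fits the learner once on a stratified holdout split and builds the misclassification estimates from the held-out part, controlled by the new `train_size` and `random_state` parameters. This is cheaper (one fit instead of `cv_folds` fits) but estimates the confusion matrix from a single validation subset rather than from every training example. A minimal sketch of the new flow; the classifier and synthetic data here are stand-ins, not part of the package:

    import numpy as np
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))       # stand-in feature matrix
    y = rng.integers(0, 2, size=200)    # stand-in binary labels

    # 0.0.11.6 behaviour: a single stratified split replaces k-fold CV
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, train_size=0.6, stratify=y, random_state=0
    )
    learner = LogisticRegression().fit(X_train, y_train)
    y_pred = learner.predict(X_val)     # held-out predictions feed the confusion matrix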
@@ -66,11 +59,11 @@ class GAC(AggregativeQuantifier):
         return adjusted_prevalences
 
     @classmethod
-    def get_cond_prob_matrix(cls, classes:list, true_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
+    def get_cond_prob_matrix(cls, classes:list, y_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
         """ Estimate the conditional probability matrix P(yi|yj)"""
 
-        CM = confusion_matrix(true_labels, predictions, labels=classes).T
-        CM = CM.astype(np.float32)
+        CM = confusion_matrix(y_labels, predictions, labels=classes).T
+        CM = CM.astype(float)
         class_counts = CM.sum(axis=0)
         for i, _ in enumerate(classes):
             if class_counts[i] == 0:
@@ -91,6 +84,6 @@ class GAC(AggregativeQuantifier):
             adjusted_prevalences = np.linalg.solve(A, B)
             adjusted_prevalences = np.clip(adjusted_prevalences, 0, 1)
             adjusted_prevalences /= adjusted_prevalences.sum()
-        except (np.linalg.LinAlgError, ValueError):
+        except (np.linalg.LinAlgError):
             adjusted_prevalences = predicted_prevalences # No way to adjust them
         return adjusted_prevalences
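For context on what the solver code in the hunk above is doing with `A` and `B`: GAC corrects classify-and-count prevalences by solving the linear system C·p_true = p_observed, where C is the conditional probability matrix, then clipping and renormalizing; if C is singular, the uncorrected prevalences are returned. A toy sketch under that reading (the function name and the 2-class numbers are illustrative, not the package's API):

    import numpy as np

    def adjust_prevalences(cond_prob_matrix, predicted_prevalences):
        # Solve P(predicted) = C @ P(true) for P(true); fall back if C is singular
        try:
            adjusted = np.linalg.solve(cond_prob_matrix, predicted_prevalences)
            adjusted = np.clip(adjusted, 0, 1)
            adjusted /= adjusted.sum()
        except np.linalg.LinAlgError:
            adjusted = predicted_prevalences  # no way to adjust them
        return adjusted

    C = np.array([[0.9, 0.2], [0.1, 0.8]])  # column j: P(predicted = i | true = j)
    b = np.array([0.62, 0.38])              # observed classify-and-count prevalences
    print(adjust_prevalences(C, b))         # -> [0.6 0.4]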
mlquantify/methods/aggregative/gpac.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 from sklearn.base import BaseEstimator
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import train_test_split
 
 from .gac import GAC
 from ...base import AggregativeQuantifier
@@ -14,10 +14,12 @@ class GPAC(AggregativeQuantifier):
     """
 
 
-    def __init__(self, learner: BaseEstimator):
+    def __init__(self, learner: BaseEstimator, train_size:float=0.6, random_state:int=None):
         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
         self.learner = learner
         self.cond_prob_matrix = None
+        self.train_size = train_size
+        self.random_state = random_state
 
     def _fit_method(self, X, y):
         # Convert X and y to DataFrames if they are numpy arrays
@@ -28,31 +30,20 @@ class GPAC(AggregativeQuantifier):
 
         if self.learner_fitted:
             # Use existing model to predict
-            predictions = self.learner.predict(X)
-            true_labels = y
+            y_pred = self.learner.predict(X)
+            y_labels = y
         else:
-            # Perform cross-validation to generate predictions
-            skf = StratifiedKFold(n_splits=self.cv_folds)
-            predictions = []
-            true_labels = []
+            X_train, X_val, y_train, y_val = train_test_split(
+                X, y, train_size=self.train_size, stratify=y, random_state=self.random_state
+            )
 
-            for train_index, valid_index in skf.split(X, y):
-                # Split data into training and validation sets
-                train_data = pd.DataFrame(X.iloc[train_index])
-                train_labels = y.iloc[train_index]
-
-                valid_data = pd.DataFrame(X.iloc[valid_index])
-                valid_labels = y.iloc[valid_index]
-
-                # Train the learner
-                self.learner.fit(train_data, train_labels)
-
-                # Predict and collect results
-                predictions.extend(self.learner.predict(valid_data))
-                true_labels.extend(valid_labels)
+            self.learner.fit(X_train, y_train)
+
+            y_labels = y_val
+            y_pred = self.learner.predict(X_val)
 
         # Compute conditional probability matrix using GAC
-        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, true_labels, predictions)
+        self.cond_prob_matrix = GAC.get_cond_prob_matrix(self.classes, y_labels, y_pred)
 
         return self
 
@@ -73,15 +64,15 @@ class GPAC(AggregativeQuantifier):
         return adjusted_prevalences
 
     @classmethod
-    def get_cond_prob_matrix(cls, classes:list, true_labels:np.ndarray, predictions:np.ndarray) -> np.ndarray:
+    def get_cond_prob_matrix(cls, classes:list, y_labels:np.ndarray, y_pred:np.ndarray) -> np.ndarray:
         """Estimate the matrix where entry (i,j) is the estimate of P(yi|yj)"""
 
         n_classes = len(classes)
         cond_prob_matrix = np.eye(n_classes)
 
         for i, class_ in enumerate(classes):
-            class_indices = true_labels == class_
+            class_indices = y_labels == class_
             if class_indices.any():
-                cond_prob_matrix[i] = predictions[class_indices].mean(axis=0)
+                cond_prob_matrix[i] = y_pred[class_indices].mean(axis=0)
 
         return cond_prob_matrix.T
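Unlike GAC, which builds a hard confusion matrix from predicted labels, GPAC's version of `get_cond_prob_matrix` averages the learner's probability vectors over the examples of each true class. A worked toy example of that logic (the numbers are illustrative):

    import numpy as np

    classes = [0, 1]
    y_labels = np.array([0, 0, 1, 1])
    # Each row is a predict_proba output for one validation example
    y_pred = np.array([[0.8, 0.2],
                       [0.6, 0.4],
                       [0.3, 0.7],
                       [0.1, 0.9]])

    cond_prob_matrix = np.eye(len(classes))
    for i, class_ in enumerate(classes):
        mask = y_labels == class_
        if mask.any():
            # Average probability vector over examples whose true class is class_
            cond_prob_matrix[i] = y_pred[mask].mean(axis=0)

    print(cond_prob_matrix.T)  # entry (i, j): estimate of P(predicted i | true j)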
mlquantify/methods/aggregative/mixtureModels/_MixtureModel.py
@@ -16,7 +16,6 @@ class MixtureModel(AggregativeQuantifier):
         self.learner = learner
         self.pos_scores = None
         self.neg_scores = None
-        self.distance = None
 
     @property
     def multiclass_method(self) -> bool:
mlquantify/methods/aggregative/mixtureModels/dys.py (new file)
@@ -0,0 +1,107 @@
+import numpy as np
+from sklearn.base import BaseEstimator
+
+from ._MixtureModel import MixtureModel
+from ....utils import getHist, ternary_search
+
+class DyS(MixtureModel):
+    """Distribution y-Similarity framework. Is a
+    method that generalises the HDy approach by
+    considering the dissimilarity function DS as
+    a parameter of the model
+    """
+
+    def __init__(self, learner:BaseEstimator, measure:str="topsoe", bins_size:np.ndarray=None):
+        assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
+        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
+        super().__init__(learner)
+
+        # Set up bins_size
+        if not bins_size:
+            bins_size = np.append(np.linspace(2,20,10), 30)
+        if isinstance(bins_size, list):
+            bins_size = np.asarray(bins_size)
+
+        self.bins_size = bins_size
+        self.measure = measure
+        self.prevs = None # Array of prevalences that minimizes the distances
+
+
+    def _compute_prevalence(self, test_scores:np.ndarray) -> float:
+
+        prevs = self.GetMinDistancesDyS(test_scores)
+        # Use the median of the prevalences as the final prevalence estimate
+        prevalence = np.median(prevs)
+
+        return prevalence
+
+
+
+    def best_distance(self, X_test) -> float:
+
+        test_scores = self.learner.predict_proba(X_test)
+
+        prevs = self.GetMinDistancesDyS(test_scores)
+
+        size = len(prevs)
+        best_prev = np.median(prevs)
+
+        if size % 2 != 0: # ODD
+            index = np.argmax(prevs == best_prev)
+            bin_size = self.bins_size[index]
+        else: # EVEN
+            # Sort the values in self.prevs
+            ordered_prevs = np.sort(prevs)
+
+            # Find the two middle indices
+            middle1 = np.floor(size / 2).astype(int)
+            middle2 = np.ceil(size / 2).astype(int)
+
+            # Get the values corresponding to the median positions
+            median1 = ordered_prevs[middle1]
+            median2 = ordered_prevs[middle2]
+
+            # Find the indices of median1 and median2 in prevs
+            index1 = np.argmax(prevs == median1)
+            index2 = np.argmax(prevs == median2)
+
+            # Calculate the average of the corresponding bin sizes
+            bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
+
+
+        pos_bin_density = getHist(self.pos_scores, bin_size)
+        neg_bin_density = getHist(self.neg_scores, bin_size)
+        test_bin_density = getHist(test_scores, bin_size)
+
+        train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
+
+        distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
+
+        return distance
+
+
+    def GetMinDistancesDyS(self, test_scores) -> list:
+        # Compute prevalence by evaluating the distance metric across various bin sizes
+
+        prevs = []
+
+        # Iterate over each bin size
+        for bins in self.bins_size:
+            # Compute histogram densities for positive, negative, and test scores
+            pos_bin_density = getHist(self.pos_scores, bins)
+            neg_bin_density = getHist(self.neg_scores, bins)
+            test_bin_density = getHist(test_scores, bins)
+
+            # Define the function to minimize
+            def f(x):
+                # Combine densities using a mixture of positive and negative densities
+                train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
+                # Calculate the distance between combined density and test density
+                return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
+
+            # Use ternary search to find the best x that minimizes the distance
+            prevs.append(ternary_search(0, 1, f))
+
+        return prevs
+
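The new dys.py leans on two helpers from `mlquantify.utils` that do not appear in this diff: `getHist` and `ternary_search`. As a rough guide to what they must provide, here is a minimal sketch assuming `getHist` returns a normalized histogram of scores on [0, 1] and `ternary_search` minimizes a unimodal function over an interval; the package's real implementations may differ in binning and stopping details:

    import numpy as np

    def getHist(scores, nbins):
        # Assumed behaviour: normalized histogram of positive-class scores on [0, 1]
        hist, _ = np.histogram(scores, bins=int(nbins), range=(0, 1))
        return hist / max(hist.sum(), 1)

    def ternary_search(left, right, f, eps=1e-4):
        # Assumed behaviour: minimize a unimodal f over [left, right]
        while right - left > eps:
            m1 = left + (right - left) / 3
            m2 = right - (right - left) / 3
            if f(m1) < f(m2):
                right = m2
            else:
                left = m1
        return (left + right) / 2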
mlquantify/methods/aggregative/mixtureModels/dys_syn.py
@@ -34,6 +34,7 @@ class DySsyn(MixtureModel):
         self.m = None
 
 
+
     def _fit_method(self, X, y):
         if not self.learner_fitted:
             self.learner.fit(X, y)
@@ -45,16 +46,41 @@ class DySsyn(MixtureModel):
 
 
     def _compute_prevalence(self, test_scores:np.ndarray) -> float: #creating bins from 10 to 110 with step size 10
+
+        distances = self.GetMinDistancesDySsyn(test_scores)
+
+        # Use the median of the prevss as the final prevalence estimate
+        index = min(distances, key=lambda d: distances[d][0])
+        prevalence = distances[index][1]
+
+        return prevalence
+
+
+    def best_distance(self, X_test):
+
+        test_scores = self.learner.predict_proba(X_test)
+
+        distances = self.GetMinDistancesDySsyn(test_scores)
+
+        index = min(distances, key=lambda d: distances[d][0])
+
+        distance = distances[index][0]
+
+        return distance
+
+
+
+    def GetMinDistancesDySsyn(self, test_scores) -> list:
         # Compute prevalence by evaluating the distance metric across various bin sizes
         if self.n is None:
             self.n = len(test_scores)
 
-        distances = {}
+        values = {}
 
         # Iterate over each bin size
        for m in self.merge_factor:
             pos_scores, neg_scores = MoSS(self.n, self.alpha_train, m)
-            result = []
+            prevs = []
             for bins in self.bins_size:
                 # Compute histogram densities for positive, negative, and test scores
                 pos_bin_density = getHist(pos_scores, bins)
@@ -69,21 +95,42 @@ class DySsyn(MixtureModel):
                     return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
 
                 # Use ternary search to find the best x that minimizes the distance
-                result.append(ternary_search(0, 1, f))
-            prevalence = np.median(result)
+                prevs.append(ternary_search(0, 1, f))
+
+            size = len(prevs)
+            best_prev = np.median(prevs)
+
+            if size % 2 != 0: # ODD
+                index = np.argmax(prevs == best_prev)
+                bin_size = self.bins_size[index]
+            else: # EVEN
+                # Sort the values in self.prevs
+                ordered_prevs = np.sort(prevs)
+
+                # Find the two middle indices
+                middle1 = np.floor(size / 2).astype(int)
+                middle2 = np.ceil(size / 2).astype(int)
+
+                # Get the values corresponding to the median positions
+                median1 = ordered_prevs[middle1]
+                median2 = ordered_prevs[middle2]
+
+                # Find the indices of median1 and median2 in prevs
+                index1 = np.argmax(prevs == median1)
+                index2 = np.argmax(prevs == median2)
+
+                # Calculate the average of the corresponding bin sizes
+                bin_size = np.mean([self.bins_size[index1], self.bins_size[index2]])
+
 
-            bins_size = self.bins_size[result == prevalence][0]
+            pos_bin_density = getHist(pos_scores, bin_size)
+            neg_bin_density = getHist(neg_scores, bin_size)
+            test_bin_density = getHist(test_scores, bin_size)
 
-            pos_bin_density = getHist(pos_scores, bins_size)
-            neg_bin_density = getHist(neg_scores, bins_size)
-            test_bin_density = getHist(test_scores, bins_size)
+            train_combined_density = (pos_bin_density * best_prev) + (neg_bin_density * (1 - best_prev))
 
-            train_combined_density = (pos_bin_density * prevalence) + (neg_bin_density * (1 - prevalence))
-            d = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-            distances[m] = (d, prevalence)
-        # Use the median of the results as the final prevalence estimate
-        index = min(distances, key=lambda d: distances[d][0])
-        prevalence = distances[index][1]
+            distance = self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
 
-        return prevalence
-
+            values[m] = (distance, best_prev)
+
+        return values
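After this refactor, `GetMinDistancesDySsyn` returns a dict keyed by merge factor `m` (despite the `-> list` annotation), and both `_compute_prevalence` and `best_distance` reduce it the same way: pick the entry with the smallest distance. In outline, with illustrative numbers:

    # Shape of the return value: {m: (distance, best_prev)}
    values = {0.1: (0.042, 0.31), 0.3: (0.017, 0.28), 0.5: (0.055, 0.35)}

    best_m = min(values, key=lambda m: values[m][0])
    prevalence = values[best_m][1]   # _compute_prevalence returns this -> 0.28
    distance = values[best_m][0]     # best_distance returns this       -> 0.017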
mlquantify/methods/aggregative/mixtureModels/hdy.py
@@ -14,15 +14,54 @@ class HDy(MixtureModel):
     def __init__(self, learner: BaseEstimator):
         assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
         super().__init__(learner)
+
 
     def _compute_prevalence(self, test_scores: np.ndarray) -> float:
+
+        best_alphas, _ = self.GetMinDistancesHDy(test_scores)
+        # Compute the median of the best alpha values as the final prevalence estimate
+        prevalence = np.median(best_alphas)
+
+        return prevalence
+
+
+
+    def best_distance(self, X_test) -> float:
+
+        test_scores = self.learner.predict_proba(X_test)
+
+        _, distances = self.GetMinDistancesHDy(test_scores)
+
+        size = len(distances)
+
+        if size % 2 != 0: # ODD
+            index = size // 2
+            distance = distances[index]
+        else: # EVEN
+            # Find the two middle indices
+            middle1 = np.floor(size / 2).astype(int)
+            middle2 = np.ceil(size / 2).astype(int)
+
+            # Get the values corresponding to the median positions
+            dist1 = distances[middle1]
+            dist2 = distances[middle2]
+
+            # Calculate the average of the corresponding distances
+            distance = np.mean([dist1, dist2])
+
+        return distance
+
+
+    def GetMinDistancesHDy(self, test_scores: np.ndarray) -> tuple:
+
         # Define bin sizes and alpha values
-        bin_size = np.arange(10, 110, 11) # Bins from 10 to 110 with a step size of 10
+        bins_size = np.arange(10, 110, 11) # Bins from 10 to 110 with a step size of 10
         alpha_values = np.round(np.linspace(0, 1, 101), 2) # Alpha values from 0 to 1, rounded to 2 decimal places
 
         best_alphas = []
-
-        for bins in bin_size:
+        distances = []
+
+        for bins in bins_size:
 
             pos_bin_density = getHist(self.pos_scores, bins)
             neg_bin_density = getHist(self.neg_scores, bins)
@@ -39,8 +78,6 @@ class HDy(MixtureModel):
 
             # Find the alpha value that minimizes the distance
             best_alphas.append(alpha_values[np.argmin(distances)])
-
-        # Compute the median of the best alpha values as the final prevalence estimate
-        prevalence = np.median(best_alphas)
+            distances.append(min(distances))
 
-        return prevalence
+        return best_alphas, distances
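The per-bin `distances` list consumed by `np.argmin` above is built in code this diff does not show; from the surrounding context, each bin's search scans the fixed alpha grid and keeps the mixture closest to the test histogram. A sketch of that inner step under those assumptions (the function name and signature are illustrative, not the package's API):

    import numpy as np

    alpha_values = np.round(np.linspace(0, 1, 101), 2)

    def best_alpha_for_bin(pos_density, neg_density, test_density, get_distance):
        # Distance of each candidate mixture alpha*pos + (1-alpha)*neg to the test histogram
        distances = [
            get_distance(alpha * pos_density + (1 - alpha) * neg_density, test_density)
            for alpha in alpha_values
        ]
        return alpha_values[np.argmin(distances)], min(distances)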
mlquantify.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: mlquantify
-Version: 0.0.11.4
+Version: 0.0.11.6
 Summary: Quantification Library
 Home-page: https://github.com/luizfernandolj/QuantifyML/tree/master
 Maintainer: Luiz Fernando Luth Junior
@@ -32,7 +32,7 @@ ___
 
 ## Latest Release
 
-- **Version 0.0.1**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
+- **Version 0.0.11.6**: Inicial beta version. For a detailed list of changes, check the [changelog](#).
 - In case you need any help, refer to the [wiki](https://github.com/luizfernandolj/mlquantify/wiki).
 - Explore the [API documentation](#) for detailed developer information.
 - See also the library in the pypi site in [pypi mlquantify](https://pypi.org/project/mlquantify/)
setup.py
@@ -6,7 +6,7 @@ here = pathlib.Path(__file__).parent.resolve()
 
 long_description = (here / 'README.md').read_text(encoding='utf-8')
 
-VERSION = '0.0.11.4'
+VERSION = '0.0.11.6'
 DESCRIPTION = 'Quantification Library'
 
 # Setting up
mlquantify-0.0.11.4/mlquantify/methods/aggregative/mixtureModels/dys.py (file removed; superseded by the new dys.py above)
@@ -1,55 +0,0 @@
-import numpy as np
-from sklearn.base import BaseEstimator
-
-from ._MixtureModel import MixtureModel
-from ....utils import getHist, ternary_search
-
-class DyS(MixtureModel):
-    """Distribution y-Similarity framework. Is a
-    method that generalises the HDy approach by
-    considering the dissimilarity function DS as
-    a parameter of the model
-    """
-
-    def __init__(self, learner:BaseEstimator, measure:str="topsoe", bins_size:np.ndarray=None):
-        assert measure in ["hellinger", "topsoe", "probsymm"], "measure not valid"
-        assert isinstance(learner, BaseEstimator), "learner object is not an estimator"
-        super().__init__(learner)
-
-        # Set up bins_size
-        if not bins_size:
-            bins_size = np.append(np.linspace(2,20,10), 30)
-        if isinstance(bins_size, list):
-            bins_size = np.asarray(bins_size)
-
-        self.bins_size = bins_size
-        self.measure = measure
-
-
-    def _compute_prevalence(self, test_scores:np.ndarray) -> float: #creating bins from 10 to 110 with step size 10
-        # Compute prevalence by evaluating the distance metric across various bin sizes
-
-        result = []
-
-        # Iterate over each bin size
-        for bins in self.bins_size:
-            # Compute histogram densities for positive, negative, and test scores
-            pos_bin_density = getHist(self.pos_scores, bins)
-            neg_bin_density = getHist(self.neg_scores, bins)
-            test_bin_density = getHist(test_scores, bins)
-
-            # Define the function to minimize
-            def f(x):
-                # Combine densities using a mixture of positive and negative densities
-                train_combined_density = (pos_bin_density * x) + (neg_bin_density * (1 - x))
-                # Calculate the distance between combined density and test density
-                return self.get_distance(train_combined_density, test_bin_density, measure=self.measure)
-
-            # Use ternary search to find the best x that minimizes the distance
-            result.append(ternary_search(0, 1, f))
-
-        # Use the median of the results as the final prevalence estimate
-        prevalence = np.median(result)
-
-        return prevalence
-