libmultilabel 0.7.0__tar.gz → 0.7.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/PKG-INFO +3 -3
  2. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/linear/linear.py +4 -4
  3. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/linear/metrics.py +4 -4
  4. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/linear/tree.py +48 -9
  5. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/linear/utils.py +6 -4
  6. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel.egg-info/PKG-INFO +3 -3
  7. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel.egg-info/requires.txt +2 -2
  8. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/setup.cfg +3 -3
  9. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/LICENSE +0 -0
  10. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/README.md +0 -0
  11. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/__init__.py +0 -0
  12. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/common_utils.py +0 -0
  13. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/linear/__init__.py +0 -0
  14. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/linear/data_utils.py +0 -0
  15. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/linear/preprocessor.py +0 -0
  16. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/logging.py +0 -0
  17. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/__init__.py +0 -0
  18. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/attentionxml.py +0 -0
  19. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/data_utils.py +0 -0
  20. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/metrics.py +0 -0
  21. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/model.py +0 -0
  22. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/__init__.py +0 -0
  23. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/bert.py +0 -0
  24. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/bert_attention.py +0 -0
  25. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/caml.py +0 -0
  26. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/kim_cnn.py +0 -0
  27. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/labelwise_attention_networks.py +0 -0
  28. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/modules.py +0 -0
  29. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/networks/xml_cnn.py +0 -0
  30. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel/nn/nn_utils.py +0 -0
  31. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel.egg-info/SOURCES.txt +0 -0
  32. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel.egg-info/dependency_links.txt +0 -0
  33. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/libmultilabel.egg-info/top_level.txt +0 -0
  34. {libmultilabel-0.7.0 → libmultilabel-0.7.2}/pyproject.toml +0 -0
--- libmultilabel-0.7.0/PKG-INFO
+++ libmultilabel-0.7.2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: libmultilabel
-Version: 0.7.0
+Version: 0.7.2
 Summary: A library for multi-class and multi-label classification
 Home-page: https://github.com/ASUS-AICS/LibMultiLabel
 Author: LibMultiLabel Team
@@ -24,12 +24,12 @@ Requires-Dist: numba
 Requires-Dist: pandas>1.3.0
 Requires-Dist: PyYAML
 Requires-Dist: scikit-learn
-Requires-Dist: scipy
+Requires-Dist: scipy<1.14.0
 Requires-Dist: tqdm
 Provides-Extra: nn
 Requires-Dist: lightning==2.0.9; extra == "nn"
 Requires-Dist: nltk; extra == "nn"
-Requires-Dist: torch; extra == "nn"
+Requires-Dist: torch<=2.3; extra == "nn"
 Requires-Dist: torchmetrics==0.10.3; extra == "nn"
 Requires-Dist: torchtext; extra == "nn"
 Requires-Dist: transformers; extra == "nn"
--- libmultilabel-0.7.0/libmultilabel/linear/linear.py
+++ libmultilabel-0.7.2/libmultilabel/linear/linear.py
@@ -79,7 +79,7 @@ def train_1vsrest(
     options: str = "",
     verbose: bool = True,
 ) -> FlatModel:
-    """Trains a linear model for multiabel data using a one-vs-rest strategy.
+    """Trains a linear model for multi-label data using a one-vs-rest strategy.
 
     Args:
         y (sparse.csr_matrix): A 0/1 matrix with dimensions number of instances * number of classes.
@@ -139,7 +139,7 @@ def _prepare_options(x: sparse.csr_matrix, options: str) -> tuple[sparse.csr_mat
             raise ValueError("Invalid LIBLINEAR solver type. Only classification solvers are allowed.")
     else:
         # workaround for liblinear warning about unspecified solver
-        options_split.extend(["-s", "2"])
+        options_split.extend(["-s", "1"])
 
     bias = -1.0
     if "-B" in options_split:
@@ -396,7 +396,7 @@ def train_cost_sensitive(
     options: str = "",
     verbose: bool = True,
 ) -> FlatModel:
-    """Trains a linear model for multilabel data using a one-vs-rest strategy
+    """Trains a linear model for multi-label data using a one-vs-rest strategy
     and cross-validation to pick an optimal asymmetric misclassification cost
     for Macro-F1.
     Outperforms train_1vsrest in most aspects at the cost of higher
@@ -500,7 +500,7 @@ def train_cost_sensitive_micro(
     options: str = "",
     verbose: bool = True,
 ) -> FlatModel:
-    """Trains a linear model for multilabel data using a one-vs-rest strategy
+    """Trains a linear model for multi-label data using a one-vs-rest strategy
     and cross-validation to pick an optimal asymmetric misclassification cost
     for Micro-F1.
     Outperforms train_1vsrest in most aspects at the cost of higher
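The linear.py changes above fix two docstring typos and switch the fallback LIBLINEAR solver from "-s 2" to "-s 1". Below is a minimal usage sketch of the trainer whose docstring is corrected here, assuming synthetic sparse inputs; the explicit "-s 1" option and the random data are illustrative only, not part of the diff.

import numpy as np
import scipy.sparse as sparse
import libmultilabel.linear as linear

rng = np.random.default_rng(0)
x = sparse.random(100, 500, density=0.05, format="csr", random_state=0)   # 100 instances, 500 features
y = sparse.csr_matrix((rng.random((100, 20)) > 0.9).astype(np.float64))   # 0/1 label matrix, 20 classes

model = linear.train_1vsrest(y, x, options="-s 1")   # one-vs-rest training, solver chosen explicitly
decision_values = linear.predict_values(model, x)    # shape (100, 20), one column per class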
--- libmultilabel-0.7.0/libmultilabel/linear/metrics.py
+++ libmultilabel-0.7.2/libmultilabel/linear/metrics.py
@@ -300,13 +300,13 @@ def get_metrics(monitor_metrics: list[str], num_classes: int, multiclass: bool =
         monitor_metrics = []
     metrics = {}
     for metric in monitor_metrics:
-        if re.match("P@\d+", metric):
+        if re.match(r"P@\d+", metric):
             metrics[metric] = PrecisionAtK(top_k=int(metric[2:]))
-        elif re.match("R@\d+", metric):
+        elif re.match(r"R@\d+", metric):
             metrics[metric] = RecallAtK(top_k=int(metric[2:]))
-        elif re.match("RP@\d+", metric):
+        elif re.match(r"RP@\d+", metric):
             metrics[metric] = RPrecisionAtK(top_k=int(metric[3:]))
-        elif re.match("NDCG@\d+", metric):
+        elif re.match(r"NDCG@\d+", metric):
             metrics[metric] = NDCGAtK(top_k=int(metric[5:]))
         elif metric in {"Another-Macro-F1", "Macro-F1", "Micro-F1"}:
             metrics[metric] = F1(num_classes, average=metric[:-3].lower(), multiclass=multiclass)
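The only change here is switching the metric-name patterns to raw strings. A short sketch (not part of the package) of why this matters on recent Python versions, where "\d" in a plain string literal is an invalid escape sequence:

import re

metric = "P@5"                      # metric name in the style parsed by get_metrics
assert re.match(r"P@\d+", metric)   # raw string: the backslash reaches the regex engine intact
top_k = int(metric[2:])             # -> 5
# With a plain "P@\d+" literal, Python 3.12 emits a SyntaxWarning (previously a
# DeprecationWarning) for the invalid escape "\d", even though the regex still matches.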
--- libmultilabel-0.7.0/libmultilabel/linear/tree.py
+++ libmultilabel-0.7.2/libmultilabel/linear/tree.py
@@ -7,6 +7,7 @@ import scipy.sparse as sparse
 import sklearn.cluster
 import sklearn.preprocessing
 from tqdm import tqdm
+import psutil
 
 from . import linear
 
@@ -26,6 +27,7 @@ class Node:
         """
         self.label_map = label_map
         self.children = children
+        self.is_root = False
 
     def isLeaf(self) -> bool:
         return len(self.children) == 0
@@ -57,7 +59,7 @@ class TreeModel:
         x: sparse.csr_matrix,
         beam_width: int = 10,
     ) -> np.ndarray:
-        """Calculates the decision values associated with x.
+        """Calculates the probability estimates associated with x.
 
         Args:
             x (sparse.csr_matrix): A matrix with dimension number of instances * number of features.
@@ -71,10 +73,10 @@ class TreeModel:
         return np.vstack([self._beam_search(all_preds[i], beam_width) for i in range(all_preds.shape[0])])
 
     def _beam_search(self, instance_preds: np.ndarray, beam_width: int) -> np.ndarray:
-        """Predict with beam search using cached decision values for a single instance.
+        """Predict with beam search using cached probability estimates for a single instance.
 
         Args:
-            instance_preds (np.ndarray): A vector of cached decision values of each node, has dimension number of labels + total number of metalabels.
+            instance_preds (np.ndarray): A vector of cached probability estimates of each node, has dimension number of labels + total number of metalabels.
             beam_width (int): Number of candidates considered.
 
         Returns:
@@ -93,18 +95,18 @@ class TreeModel:
                     continue
                 slice = np.s_[self.weight_map[node.index] : self.weight_map[node.index + 1]]
                 pred = instance_preds[slice]
-                children_score = score - np.maximum(0, 1 - pred) ** 2
+                children_score = score - np.square(np.maximum(0, 1 - pred))
                 next_level.extend(zip(node.children, children_score.tolist()))
 
             cur_level = sorted(next_level, key=lambda pair: -pair[1])[:beam_width]
             next_level = []
 
         num_labels = len(self.root.label_map)
-        scores = np.full(num_labels, -np.inf)
+        scores = np.full(num_labels, 0.0)
         for node, score in cur_level:
             slice = np.s_[self.weight_map[node.index] : self.weight_map[node.index + 1]]
             pred = instance_preds[slice]
-            scores[node.label_map] = np.exp(score - np.maximum(0, 1 - pred) ** 2)
+            scores[node.label_map] = np.exp(score - np.square(np.maximum(0, 1 - pred)))
         return scores
 
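The beam-search hunk above replaces "** 2" with np.square and, more importantly, initializes unreached labels to 0.0 instead of -inf, which is consistent with the exponentiated scores now documented as probability estimates. A small numeric sketch (not library code) of how a path score is accumulated and exponentiated:

import numpy as np

root_score = 0.0
meta_pred = np.array([1.2, 0.3])                        # decision values of two metalabel children
children_score = root_score - np.square(np.maximum(0, 1 - meta_pred))
# -> [ 0.  , -0.49]; the first child incurs no squared-hinge loss, the second is penalized

leaf_pred = 0.8                                         # decision value at a leaf under the first child
leaf_score = np.exp(children_score[0] - np.square(np.maximum(0, 1 - leaf_pred)))
# -> exp(-0.04) ~ 0.961, a value in (0, 1], hence 0.0 as the default for labels never reached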
@@ -116,7 +118,7 @@ def train_tree(
     dmax=10,
     verbose: bool = True,
 ) -> TreeModel:
-    """Trains a linear model for multiabel data using a divide-and-conquer strategy.
+    """Trains a linear model for multi-label data using a divide-and-conquer strategy.
     The algorithm used is based on https://github.com/xmc-aalto/bonsai.
 
     Args:
@@ -133,20 +135,39 @@ def train_tree(
     label_representation = (y.T * x).tocsr()
     label_representation = sklearn.preprocessing.normalize(label_representation, norm="l2", axis=1)
     root = _build_tree(label_representation, np.arange(y.shape[1]), 0, K, dmax)
+    root.is_root = True
 
     num_nodes = 0
+    # Both type(x) and type(y) are sparse.csr_matrix
+    # However, type((x != 0).T) becomes sparse.csc_matrix
+    # So type((x != 0).T * y) results in sparse.csc_matrix
+    features_used_perlabel = (x != 0).T * y
 
     def count(node):
         nonlocal num_nodes
         num_nodes += 1
+        node.num_features_used = np.count_nonzero(features_used_perlabel[:, node.label_map].sum(axis=1))
 
     root.dfs(count)
 
+    model_size = get_estimated_model_size(root)
+    print(f"The estimated tree model size is: {model_size / (1024**3):.3f} GB")
+
+    # Calculate the total memory (excluding swap) on the local machine
+    total_memory = psutil.virtual_memory().total
+    print(f"Your system memory is: {total_memory / (1024**3):.3f} GB")
+
+    if total_memory <= model_size:
+        raise MemoryError(f"Not enough memory to train the model.")
+
     pbar = tqdm(total=num_nodes, disable=not verbose)
 
     def visit(node):
-        relevant_instances = y[:, node.label_map].getnnz(axis=1) > 0
-        _train_node(y[relevant_instances], x[relevant_instances], options, node)
+        if node.is_root:
+            _train_node(y, x, options, node)
+        else:
+            relevant_instances = y[:, node.label_map].getnnz(axis=1) > 0
+            _train_node(y[relevant_instances], x[relevant_instances], options, node)
         pbar.update()
 
     root.dfs(visit)
@@ -195,6 +216,24 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray,
     return Node(label_map=label_map, children=children)
 
 
+def get_estimated_model_size(root):
+    total_num_weights = 0
+
+    def collect_stat(node: Node):
+        nonlocal total_num_weights
+
+        if node.isLeaf():
+            total_num_weights += len(node.label_map) * node.num_features_used
+        else:
+            total_num_weights += len(node.children) * node.num_features_used
+
+    root.dfs(collect_stat)
+
+    # 16 is because when storing sparse matrices, indices (int64) require 8 bytes and floats require 8 bytes
+    # Our study showed that among the used features of every binary classification problem, on average no more than 2/3 of weights obtained by the dual coordinate descent method are non-zeros.
+    return total_num_weights * 16 * 2 / 3
+
+
 def _train_node(y: sparse.csr_matrix, x: sparse.csr_matrix, options: str, node: Node):
     """If node is internal, computes the metalabels representing each child and trains
     on the metalabels. Otherwise, train on y.
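The new get_estimated_model_size above gates training on available RAM (via psutil), charging roughly 16 bytes per stored weight and assuming an empirical 2/3 non-zero ratio. A back-of-the-envelope sketch (hypothetical numbers, not library code) of what that formula yields for a single node:

# Hypothetical node: 100 labels in a leaf, 50,000 features used by those labels.
num_weights = 100 * 50_000                        # one weight vector per label over the used features
estimated_bytes = num_weights * 16 * 2 / 3        # int64 index + float64 value, ~2/3 assumed non-zero
print(f"{estimated_bytes / (1024**3):.3f} GB")    # ~0.050 GB for this node alone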
--- libmultilabel-0.7.0/libmultilabel/linear/utils.py
+++ libmultilabel-0.7.2/libmultilabel/linear/utils.py
@@ -76,17 +76,18 @@ class MultiLabelEstimator(sklearn.base.BaseEstimator):
         scoring_metric (str, optional): The scoring metric. Defaults to 'P@1'.
     """
 
-    def __init__(self, options: str = "", linear_technique: str = "1vsrest", scoring_metric: str = "P@1"):
+    def __init__(self, options: str = "", linear_technique: str = "1vsrest", scoring_metric: str = "P@1", multiclass: bool = False):
         super().__init__()
         self.options = options
         self.linear_technique = linear_technique
         self.scoring_metric = scoring_metric
         self._is_fitted = False
+        self.multiclass = multiclass
 
     def fit(self, X: sparse.csr_matrix, y: sparse.csr_matrix):
         X, y = sklearn.utils.validation.check_X_y(X, y, accept_sparse=True, multi_output=True)
         self._is_fitted = True
-        self.model = LINEAR_TECHNIQUES[self.linear_technique](y, X, self.options)
+        self.model = LINEAR_TECHNIQUES[self.linear_technique](y, X, options=self.options)
         return self
 
     def predict(self, X: sparse.csr_matrix) -> np.ndarray:
@@ -96,8 +97,9 @@ class MultiLabelEstimator(sklearn.base.BaseEstimator):
 
     def score(self, X: sparse.csr_matrix, y: sparse.csr_matrix) -> float:
         metrics = linear.get_metrics(
-            [self.scoring_metric],
-            y.shape[1],
+            monitor_metrics=[self.scoring_metric],
+            num_classes=y.shape[1],
+            multiclass=self.multiclass
         )
         preds = self.predict(X)
         metrics.update(preds, y.toarray())
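With the new multiclass flag threaded through to get_metrics, the estimator stays compatible with scikit-learn's search utilities. A hedged sketch of how it might be used; the linear.MultiLabelEstimator import path and the liblinear option strings are assumptions for illustration:

import libmultilabel.linear as linear
from sklearn.model_selection import GridSearchCV

estimator = linear.MultiLabelEstimator(
    options="-s 2 -B 1",             # illustrative liblinear options
    linear_technique="1vsrest",
    scoring_metric="P@1",
    multiclass=False,                # set True for single-label (multi-class) data
)
search = GridSearchCV(estimator, param_grid={"options": ["-s 2 -c 0.5", "-s 2 -c 1"]}, cv=3)
# search.fit(x_train, y_train)       # x_train, y_train as scipy.sparse.csr_matrix, as in fit() above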
--- libmultilabel-0.7.0/libmultilabel.egg-info/PKG-INFO
+++ libmultilabel-0.7.2/libmultilabel.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: libmultilabel
-Version: 0.7.0
+Version: 0.7.2
 Summary: A library for multi-class and multi-label classification
 Home-page: https://github.com/ASUS-AICS/LibMultiLabel
 Author: LibMultiLabel Team
@@ -24,12 +24,12 @@ Requires-Dist: numba
 Requires-Dist: pandas>1.3.0
 Requires-Dist: PyYAML
 Requires-Dist: scikit-learn
-Requires-Dist: scipy
+Requires-Dist: scipy<1.14.0
 Requires-Dist: tqdm
 Provides-Extra: nn
 Requires-Dist: lightning==2.0.9; extra == "nn"
 Requires-Dist: nltk; extra == "nn"
-Requires-Dist: torch; extra == "nn"
+Requires-Dist: torch<=2.3; extra == "nn"
 Requires-Dist: torchmetrics==0.10.3; extra == "nn"
 Requires-Dist: torchtext; extra == "nn"
 Requires-Dist: transformers; extra == "nn"
--- libmultilabel-0.7.0/libmultilabel.egg-info/requires.txt
+++ libmultilabel-0.7.2/libmultilabel.egg-info/requires.txt
@@ -3,13 +3,13 @@ numba
 pandas>1.3.0
 PyYAML
 scikit-learn
-scipy
+scipy<1.14.0
 tqdm
 
 [nn]
 lightning==2.0.9
 nltk
-torch
+torch<=2.3
 torchmetrics==0.10.3
 torchtext
 transformers
--- libmultilabel-0.7.0/setup.cfg
+++ libmultilabel-0.7.2/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = libmultilabel
-version = 0.7.0
+version = 0.7.2
 author = LibMultiLabel Team
 license = MIT License
 license_file = LICENSE
@@ -30,7 +30,7 @@ install_requires =
     pandas>1.3.0
     PyYAML
     scikit-learn
-    scipy
+    scipy<1.14.0
     tqdm
 python_requires = >=3.8
 
@@ -38,7 +38,7 @@ python_requires = >=3.8
 nn =
     lightning==2.0.9
     nltk
-    torch
+    torch<=2.3
     torchmetrics==0.10.3
     torchtext
     transformers