PyPI - libmultilabel - Versions diffs - 0.5.0__tar.gz → 0.5.2__tar.gz - Mend

libmultilabel 0.5.0.tar.gz → 0.5.2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33)

{libmultilabel-0.5.0 → libmultilabel-0.5.2}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: libmultilabel
-Version: 0.5.0
+Version: 0.5.2
 Summary: A library for multi-label text classification
 Home-page: https://github.com/ASUS-AICS/LibMultiLabel
 Author: LibMultiLabel Team

{libmultilabel-0.5.0 → libmultilabel-0.5.2}/libmultilabel/common_utils.py RENAMED Viewed

@@ -3,6 +3,7 @@ import json
 import logging
 import os
 import time
+from functools import wraps
 import numpy as np
@@ -41,36 +42,6 @@ class AttributeDict(dict):
         return {k: self[k] for k in self._used}
-class Timer(object):
-    """Computes elasped time."""
-    def __init__(self):
-        self.reset()
-    def reset(self):
-        self.running = True
-        self.total = 0
-        self.start = time.time()
-        return self
-    def resume(self):
-        if not self.running:
-            self.running = True
-            self.start = time.time()
-        return self
-    def stop(self):
-        if self.running:
-            self.running = False
-            self.total += time.time() - self.start
-        return self
-    def time(self):
-        if self.running:
-            return self.total + time.time() - self.start
-        return self.total
 def dump_log(log_path, metrics=None, split=None, config=None):
     """Write log including the used items of config and the evaluation scores.
@@ -156,3 +127,17 @@ def is_multiclass_dataset(dataset, label="label"):
             a multi-class problem."""
         )
     return ratio == 1.0
+def timer(func):
+    """Log info-level wall time"""
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        start_time = time.time()
+        value = func(*args, **kwargs)
+        wall_time = time.time() - start_time
+        logging.info(f"{repr(func.__name__)} finished in {wall_time:.2f} seconds")
+        return value
+    return wrapper

{libmultilabel-0.5.0 → libmultilabel-0.5.2}/libmultilabel/linear/metrics.py RENAMED Viewed

@@ -64,6 +64,7 @@ class NDCG:
         dcg = _DCG_argsort(argsort_preds, target, self.top_k)
         idcg = _IDCG(target, self.top_k)
         ndcg_score = dcg / idcg
+        # by convention, ndcg is 0 for zero label instances
         self.score += np.nan_to_num(ndcg_score, nan=0.0).sum()
         self.num_sample += argsort_preds.shape[0]
@@ -95,6 +96,7 @@ class RPrecision:
     def update_argsort(self, argsort_preds: np.ndarray, target: np.ndarray):
         top_k_idx = argsort_preds[:, -self.top_k :]
         num_relevant = np.take_along_axis(target, top_k_idx, axis=-1).sum(axis=-1)  # (batch_size, )
+        # by convention, rprecision is 0 for zero label instances
         self.score += np.nan_to_num(num_relevant / np.minimum(self.top_k, target.sum(axis=-1)), nan=0.0).sum()
         self.num_sample += argsort_preds.shape[0]
@@ -167,7 +169,8 @@ class Recall:
     def update_argsort(self, argsort_preds: np.ndarray, target: np.ndarray):
         top_k_idx = argsort_preds[:, -self.top_k :]
         num_relevant = np.take_along_axis(target, top_k_idx, -1).sum(axis=-1)
-        self.score += np.nan_to_num(num_relevant / target.sum(axis=-1), nan=1.0).sum()
+        # by convention, recall is 0 for zero label instances
+        self.score += np.nan_to_num(num_relevant / target.sum(axis=-1), nan=0.0).sum()
         self.num_sample += argsort_preds.shape[0]
     def compute(self) -> float:
@@ -210,14 +213,15 @@ class F1:
     def compute(self) -> float:
         prev_settings = np.seterr("ignore")
+        # F1 is 0 for the cases where there are no positive instances
         if self.average == "macro":
             score = np.nansum(2 * self.tp / (2 * self.tp + self.fp + self.fn)) / self.num_classes
         elif self.average == "micro":
-            score = np.nan_to_num(2 * np.sum(self.tp) / np.sum(2 * self.tp + self.fp + self.fn))
+            score = np.nan_to_num(2 * np.sum(self.tp) / np.sum(2 * self.tp + self.fp + self.fn), nan=0.0)
         elif self.average == "another-macro":
             macro_prec = np.nansum(self.tp / (self.tp + self.fp)) / self.num_classes
             macro_recall = np.nansum(self.tp / (self.tp + self.fn)) / self.num_classes
-            score = np.nan_to_num(2 * macro_prec * macro_recall / (macro_prec + macro_recall))
+            score = np.nan_to_num(2 * macro_prec * macro_recall / (macro_prec + macro_recall), nan=0.0)
         np.seterr(**prev_settings)
         return score

{libmultilabel-0.5.0 → libmultilabel-0.5.2}/libmultilabel/nn/data_utils.py RENAMED Viewed

@@ -2,7 +2,6 @@ import csv
 import gc
 import logging
 import warnings
-from concurrent.futures import ProcessPoolExecutor
 import pandas as pd
 import torch
@@ -159,7 +158,7 @@ def _load_raw_data(data, is_test=False, tokenize_text=True, remove_no_label_data
             This is effective only when is_test=False. Defaults to False.
     Returns:
-        pandas.DataFrame: Data composed of index, label, and tokenized text.
+        dict: [{(optional: "index": ..., )"label": ..., "text": ...}, ...]
     """
     assert isinstance(data, str) or isinstance(data, pd.DataFrame), "Data must be from a file or pandas dataframe."
     if isinstance(data, str):
@@ -176,9 +175,7 @@ def _load_raw_data(data, is_test=False, tokenize_text=True, remove_no_label_data
     data["label"] = data["label"].astype(str).map(lambda s: s.split())
     if tokenize_text:
-        # multiprocessing requires serializable objects
-        with ProcessPoolExecutor() as executor:
-            data["text"] = pd.Series(tqdm(executor.map(tokenize, data["text"]), total=len(data["text"])))
+        data["text"] = data["text"].map(tokenize)
     data = data.to_dict("records")
     if not is_test:
         num_no_label_data = sum(1 for d in data if len(d["label"]) == 0)
@@ -222,15 +219,12 @@ def load_datasets(
     Returns:
         dict: A dictionary of datasets.
     """
-    if isinstance(training_data, str) or isinstance(test_data, str):
-        assert training_data or test_data, "At least one of `training_data` and `test_data` must be specified."
-    elif isinstance(training_data, pd.DataFrame) or isinstance(test_data, pd.DataFrame):
-        assert (
-            not training_data.empty or not test_data.empty
-        ), "At least one of `training_data` and `test_data` must be specified."
+    if training_data is None and test_data is None:
+        raise ValueError("At least one of `training_data` and `test_data` must be specified.")
     datasets = {}
     if training_data is not None:
+        logging.info(f"Loading training data")
         datasets["train"] = _load_raw_data(
             training_data, tokenize_text=tokenize_text, remove_no_label_data=remove_no_label_data
         )
@@ -243,11 +237,12 @@ def load_datasets(
         datasets["train"], datasets["val"] = train_test_split(datasets["train"], test_size=val_size, random_state=42)
     if test_data is not None:
+        logging.info(f"Loading test data")
         datasets["test"] = _load_raw_data(
             test_data, is_test=True, tokenize_text=tokenize_text, remove_no_label_data=remove_no_label_data
         )
-    if merge_train_val:
+    if merge_train_val and "val" in datasets:
         datasets["train"] = datasets["train"] + datasets["val"]
         for i in range(len(datasets["train"])):
             datasets["train"][i]["index"] = i

{libmultilabel-0.5.0 → libmultilabel-0.5.2}/libmultilabel/nn/metrics.py RENAMED Viewed

@@ -217,19 +217,17 @@ def get_metrics(metric_threshold, monitor_metrics, num_classes, top_k=None):
         if match_top_k:
             metric_abbr = match_top_k.group(1)  # P, R, PR, or nDCG
-            top_k = int(match_top_k.group(2))
-            if top_k >= num_classes:
-                raise ValueError(
-                    f"Invalid metric: {metric}. top_k ({top_k}) is greater than num_classes({num_classes})."
-                )
+            k = int(match_top_k.group(2))
+            if k >= num_classes:
+                raise ValueError(f"Invalid metric: {metric}. k ({k}) is greater than num_classes({num_classes}).")
             if metric_abbr == "P":
-                metrics[metric] = Precision(num_classes, average="samples", top_k=top_k)
+                metrics[metric] = Precision(num_classes, average="samples", top_k=k)
             elif metric_abbr == "R":
-                metrics[metric] = Recall(num_classes, average="samples", top_k=top_k)
+                metrics[metric] = Recall(num_classes, average="samples", top_k=k)
             elif metric_abbr == "RP":
-                metrics[metric] = RPrecision(top_k=top_k)
+                metrics[metric] = RPrecision(top_k=k)
             elif metric_abbr == "nDCG":
-                metrics[metric] = NDCG(top_k=top_k)
+                metrics[metric] = NDCG(top_k=k)
                 # The implementation in torchmetrics stores the prediction/target of all batches,
                 # which can lead to CUDA out of memory.
                 # metrics[metric] = RetrievalNormalizedDCG(k=top_k)

{libmultilabel-0.5.0 → libmultilabel-0.5.2}/libmultilabel.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: libmultilabel
-Version: 0.5.0
+Version: 0.5.2
 Summary: A library for multi-label text classification
 Home-page: https://github.com/ASUS-AICS/LibMultiLabel
 Author: LibMultiLabel Team

{libmultilabel-0.5.0 → libmultilabel-0.5.2}/setup.cfg RENAMED Viewed

@@ -1,6 +1,6 @@
 [metadata]
 name = libmultilabel
-version = 0.5.0
+version = 0.5.2
 author = LibMultiLabel Team
 license = MIT License
 license_file = LICENSE