libmultilabel 0.7.4__tar.gz → 0.8.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/PKG-INFO +13 -9
  2. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/README.md +3 -3
  3. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/linear.py +18 -4
  4. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/tree.py +23 -22
  5. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/attentionxml.py +1 -1
  6. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/data_utils.py +125 -67
  7. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/model.py +1 -1
  8. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/nn_utils.py +1 -2
  9. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/PKG-INFO +13 -9
  10. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/requires.txt +4 -4
  11. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/setup.cfg +11 -8
  12. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/LICENSE +0 -0
  13. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/__init__.py +0 -0
  14. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/common_utils.py +0 -0
  15. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/__init__.py +0 -0
  16. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/data_utils.py +0 -0
  17. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/metrics.py +0 -0
  18. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/preprocessor.py +0 -0
  19. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/utils.py +0 -0
  20. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/logging.py +0 -0
  21. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/__init__.py +0 -0
  22. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/metrics.py +0 -0
  23. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/__init__.py +0 -0
  24. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/bert.py +0 -0
  25. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/bert_attention.py +0 -0
  26. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/caml.py +0 -0
  27. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/kim_cnn.py +0 -0
  28. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/labelwise_attention_networks.py +0 -0
  29. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/modules.py +0 -0
  30. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/xml_cnn.py +0 -0
  31. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/SOURCES.txt +0 -0
  32. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/dependency_links.txt +0 -0
  33. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/top_level.txt +0 -0
  34. {libmultilabel-0.7.4 → libmultilabel-0.8.1}/pyproject.toml +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: libmultilabel
3
- Version: 0.7.4
3
+ Version: 0.8.1
4
4
  Summary: A library for multi-class and multi-label classification
5
5
  Home-page: https://github.com/ASUS-AICS/LibMultiLabel
6
6
  Author: LibMultiLabel Team
@@ -8,7 +8,7 @@ License: MIT License
8
8
  Project-URL: Bug Tracker, https://github.com/ASUS-AICS/LibMultiLabel/issues
9
9
  Project-URL: Documentation, https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
10
10
  Project-URL: Source Code, https://github.com/ASUS-AICS/LibMultiLabel/
11
- Classifier: Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1
11
+ Classifier: Environment :: GPU :: NVIDIA CUDA :: 12
12
12
  Classifier: Environment :: GPU :: NVIDIA CUDA :: 11.8
13
13
  Classifier: Intended Audience :: Developers
14
14
  Classifier: Intended Audience :: Education
@@ -16,23 +16,27 @@ Classifier: Intended Audience :: Science/Research
16
16
  Classifier: License :: OSI Approved :: MIT License
17
17
  Classifier: Operating System :: OS Independent
18
18
  Classifier: Programming Language :: Python :: 3
19
- Classifier: Programming Language :: Python :: 3.8
20
- Requires-Python: >=3.8
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Requires-Python: >=3.10
21
24
  License-File: LICENSE
22
25
  Requires-Dist: liblinear-multicore>=2.49.0
23
26
  Requires-Dist: numba
24
27
  Requires-Dist: pandas>1.3.0
25
28
  Requires-Dist: PyYAML
26
29
  Requires-Dist: scikit-learn
27
- Requires-Dist: scipy<1.14.0
30
+ Requires-Dist: scipy
28
31
  Requires-Dist: tqdm
29
32
  Requires-Dist: psutil
33
+ Requires-Dist: sparsekmeans
30
34
  Provides-Extra: nn
31
- Requires-Dist: lightning==2.0.9; extra == "nn"
35
+ Requires-Dist: lightning; extra == "nn"
32
36
  Requires-Dist: nltk; extra == "nn"
33
- Requires-Dist: torch<=2.3; extra == "nn"
37
+ Requires-Dist: torch; extra == "nn"
34
38
  Requires-Dist: torchmetrics==0.10.3; extra == "nn"
35
- Requires-Dist: torchtext; extra == "nn"
36
39
  Requires-Dist: transformers; extra == "nn"
40
+ Dynamic: license-file
37
41
 
38
42
  See documentation here: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
@@ -9,9 +9,9 @@ LibMultiLabel is a library for binary, multi-class, and multi-label classificati
9
9
  This is an on-going development so many improvements are still being made. Comments are very welcome.
10
10
 
11
11
  ## Environments
12
- - Python: 3.8+
13
- - CUDA: 11.8, 12.1 (if training neural networks by GPU)
14
- - Pytorch: 2.0.1+
12
+ - Python: 3.10+
13
+ - CUDA: 11.8, 12.1, 12.6 (if training neural networks by GPU)
14
+ - Pytorch: 2.3.0+
15
15
 
16
16
  If you have a different version of CUDA, follow the installation instructions for PyTorch LTS at their [website](https://pytorch.org/).
17
17
 
@@ -27,7 +27,7 @@ class FlatModel:
27
27
  def __init__(
28
28
  self,
29
29
  name: str,
30
- weights: np.matrix,
30
+ weights: np.matrix | sparse.csr_matrix,
31
31
  bias: float,
32
32
  thresholds: float | np.ndarray,
33
33
  multiclass: bool,
@@ -69,7 +69,21 @@ class FlatModel:
69
69
  "csr",
70
70
  )
71
71
 
72
- return (x * self.weights).A + self.thresholds
72
+ return self._to_dense_array(x * self.weights) + self.thresholds
73
+
74
+ def _to_dense_array(self, matrix: np.matrix | sparse.csr_matrix) -> np.ndarray:
75
+ """Convert a numpy or scipy matrix to a dense ndarray.
76
+
77
+ Args:
78
+ matrix (np.matrix | sparse.csr_matrix): A numpy or scipy sparse matrix.
79
+
80
+ Returns:
81
+ np.ndarray: A dense ndarray of `matrix`.
82
+ """
83
+ if sparse.issparse(matrix):
84
+ return matrix.toarray()
85
+ elif isinstance(matrix, np.matrix):
86
+ return np.asarray(matrix)
73
87
 
74
88
 
75
89
  def train_1vsrest(
@@ -458,7 +472,7 @@ def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str)
458
472
 
459
473
  param_space = [1, 1.33, 1.8, 2.5, 3.67, 6, 13]
460
474
 
461
- bestScore = -np.Inf
475
+ bestScore = -np.inf
462
476
  for a in param_space:
463
477
  cv_options = f"{options} -w1 {a}"
464
478
  pred = _cross_validate(y, x, cv_options, perm)
@@ -532,7 +546,7 @@ def train_cost_sensitive_micro(
532
546
  l = y.shape[0]
533
547
  perm = np.random.permutation(l)
534
548
  param_space = [1, 1.33, 1.8, 2.5, 3.67, 6, 13]
535
- bestScore = -np.Inf
549
+ bestScore = -np.inf
536
550
 
537
551
  if verbose:
538
552
  logging.info(f"Training cost-sensitive model for Micro-F1 on {num_class} labels")
@@ -4,7 +4,7 @@ from typing import Callable
4
4
 
5
5
  import numpy as np
6
6
  import scipy.sparse as sparse
7
- import sklearn.cluster
7
+ from sparsekmeans import LloydKmeans, ElkanKmeans
8
8
  import sklearn.preprocessing
9
9
  from tqdm import tqdm
10
10
  import psutil
@@ -101,7 +101,7 @@ class TreeModel:
101
101
  self.subtree_models = []
102
102
  for i in range(len(self.root.children)):
103
103
  subtree_weights_start = self.node_ptr[self.root.children[i].index]
104
- subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else -1
104
+ subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else self.node_ptr[-1]
105
105
  slice = np.s_[:, subtree_weights_start:subtree_weights_end]
106
106
  subtree_flatmodel = linear.FlatModel(
107
107
  name="subtree-flattened-tree",
@@ -274,28 +274,29 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray,
274
274
  Returns:
275
275
  Node: Root of the (sub)tree built from label_representation.
276
276
  """
277
- if d >= dmax or label_representation.shape[0] <= K:
278
- return Node(label_map=label_map, children=[])
279
-
280
- metalabels = (
281
- sklearn.cluster.KMeans(
282
- K,
283
- random_state=np.random.randint(2**31 - 1),
284
- n_init=1,
285
- max_iter=300,
286
- tol=0.0001,
287
- algorithm="elkan",
277
+ children = []
278
+ if d < dmax and label_representation.shape[0] > K:
279
+ if label_representation.shape[0] > 10000:
280
+ kmeans_algo = ElkanKmeans
281
+ else:
282
+ kmeans_algo = LloydKmeans
283
+
284
+ kmeans = kmeans_algo(
285
+ n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=True
288
286
  )
289
- .fit(label_representation)
290
- .labels_
291
- )
287
+ metalabels = kmeans.fit(label_representation)
292
288
 
293
- children = []
294
- for i in range(K):
295
- child_representation = label_representation[metalabels == i]
296
- child_map = label_map[metalabels == i]
297
- child = _build_tree(child_representation, child_map, d + 1, K, dmax)
298
- children.append(child)
289
+ unique_labels = np.unique(metalabels)
290
+ if len(unique_labels) == K:
291
+ create_child_node = lambda i: _build_tree(
292
+ label_representation[metalabels == i], label_map[metalabels == i], d + 1, K, dmax
293
+ )
294
+ else:
295
+ create_child_node = lambda i: Node(label_map=label_map[metalabels == i], children=[])
296
+
297
+ for i in range(K):
298
+ child = create_child_node(i)
299
+ children.append(child)
299
300
 
300
301
  return Node(label_map=label_map, children=children)
301
302
 
@@ -489,7 +489,7 @@ class PLTTrainer:
489
489
  # Convert words to numbers according to their indices in word_dict. Then pad each instance to a certain length.
490
490
  encoded_text = list(
491
491
  map(
492
- lambda text: torch.tensor([self.word_dict[word] for word in text], dtype=torch.int64)
492
+ lambda text: torch.tensor([self.word_dict.get(word, self.word_dict[UNK]) for word in text], dtype=torch.int64)
493
493
  if text
494
494
  else torch.tensor([self.word_dict[UNK]], dtype=torch.int64),
495
495
  [instance["text"][: self.max_seq_length] for instance in dataset],
@@ -1,7 +1,12 @@
1
1
  import csv
2
2
  import gc
3
3
  import logging
4
+ import os
5
+ import re
4
6
  import warnings
7
+ import zipfile
8
+ from urllib.request import urlretrieve
9
+ from collections import Counter, OrderedDict
5
10
 
6
11
  import pandas as pd
7
12
  import torch
@@ -11,7 +16,6 @@ from sklearn.model_selection import train_test_split
11
16
  from sklearn.preprocessing import MultiLabelBinarizer
12
17
  from torch.nn.utils.rnn import pad_sequence
13
18
  from torch.utils.data import Dataset
14
- from torchtext.vocab import build_vocab_from_iterator, pretrained_aliases, Vocab
15
19
  from tqdm import tqdm
16
20
 
17
21
  transformers.logging.set_verbosity_error()
@@ -19,6 +23,14 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
19
23
 
20
24
  UNK = "<unk>"
21
25
  PAD = "<pad>"
26
+ GLOVE_WORD_EMBEDDING = {
27
+ "glove.42B.300d",
28
+ "glove.840B.300d",
29
+ "glove.6B.50d",
30
+ "glove.6B.100d",
31
+ "glove.6B.200d",
32
+ "glove.6B.300d",
33
+ }
22
34
 
23
35
 
24
36
  class TextDataset(Dataset):
@@ -31,8 +43,7 @@ class TextDataset(Dataset):
31
43
  add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
32
44
  tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
33
45
  the transformer-based pretrained language model. Defaults to None.
34
- word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
35
- map tokens to indices. Defaults to None.
46
+ word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
36
47
  """
37
48
 
38
49
  def __init__(
@@ -55,7 +66,7 @@ class TextDataset(Dataset):
55
66
  self.num_classes = len(self.classes)
56
67
  self.label_binarizer = MultiLabelBinarizer().fit([classes])
57
68
 
58
- if not isinstance(self.word_dict, Vocab) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
69
+ if not isinstance(self.word_dict, dict) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
59
70
  raise ValueError("Please specify exactly one of word_dict or tokenizer")
60
71
 
61
72
  def __len__(self):
@@ -71,7 +82,7 @@ class TextDataset(Dataset):
71
82
  else:
72
83
  input_ids = self.tokenizer.encode(data["text"], add_special_tokens=False)
73
84
  else:
74
- input_ids = [self.word_dict[word] for word in data["text"]]
85
+ input_ids = [self.word_dict.get(word, self.word_dict[UNK]) for word in data["text"]]
75
86
  return {
76
87
  "text": torch.LongTensor(input_ids[: self.max_seq_length]),
77
88
  "label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]),
@@ -128,8 +139,7 @@ def get_dataset_loader(
128
139
  add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
129
140
  tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
130
141
  the transformer-based pretrained language model. Defaults to None.
131
- word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
132
- map tokens to indices. Defaults to None.
142
+ word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
133
143
 
134
144
  Returns:
135
145
  torch.utils.data.DataLoader: A pytorch DataLoader.
@@ -154,6 +164,7 @@ def _load_raw_data(data, is_test=False, tokenize_text=True, remove_no_label_data
154
164
  Args:
155
165
  data (Union[str, pandas,.Dataframe]): Training, test, or validation data in file or dataframe.
156
166
  is_test (bool, optional): Whether the data is for test or not. Defaults to False.
167
+ tokenize_text (bool, optional): Whether to tokenize text. Defaults to True.
157
168
  remove_no_label_data (bool, optional): Whether to remove training/validation instances that have no labels.
158
169
  This is effective only when is_test=False. Defaults to False.
159
170
 
@@ -265,20 +276,19 @@ def load_or_build_text_dict(
265
276
  ):
266
277
  """Build or load the vocabulary from the training dataset or the predefined `vocab_file`.
267
278
  The pretrained embedding can be either from a self-defined `embed_file` or from one of
268
- the vectors defined in torchtext.vocab.pretrained_aliases
269
- (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py).
279
+ the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, or `glove.840B.300d`.
270
280
 
271
281
  Args:
272
282
  dataset (list): List of training instances with index, label, and tokenized text.
273
283
  vocab_file (str, optional): Path to a file holding vocabuaries. Defaults to None.
274
284
  min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
275
- embed_file (str): Path to a file holding pre-trained embeddings.
285
+ embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding. Defaults to None.
276
286
  embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
277
287
  silent (bool, optional): Enable silent mode. Defaults to False.
278
288
  normalize_embed (bool, optional): Whether the embeddings of each word is normalized to a unit vector. Defaults to False.
279
289
 
280
290
  Returns:
281
- tuple[torchtext.vocab.Vocab, torch.Tensor]: A vocab object which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
291
+ tuple[dict, torch.Tensor]: A dictionary which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
282
292
  """
283
293
  if vocab_file:
284
294
  logging.info(f"Load vocab from {vocab_file}")
@@ -286,14 +296,14 @@ def load_or_build_text_dict(
286
296
  vocab_list = [[vocab.strip() for vocab in fp.readlines()]]
287
297
  # Keep PAD index 0 to align `padding_idx` of
288
298
  # class Embedding in libmultilabel.nn.networks.modules.
289
- vocabs = build_vocab_from_iterator(vocab_list, min_freq=1, specials=[PAD, UNK])
299
+ word_dict = _build_word_dict(vocab_list, min_vocab_freq=1, specials=[PAD, UNK])
290
300
  else:
291
301
  vocab_list = [set(data["text"]) for data in dataset]
292
- vocabs = build_vocab_from_iterator(vocab_list, min_freq=min_vocab_freq, specials=[PAD, UNK])
293
- vocabs.set_default_index(vocabs[UNK])
294
- logging.info(f"Read {len(vocabs)} vocabularies.")
302
+ word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])
303
+
304
+ logging.info(f"Read {len(word_dict)} vocabularies.")
295
305
 
296
- embedding_weights = get_embedding_weights_from_file(vocabs, embed_file, silent, embed_cache_dir)
306
+ embedding_weights = get_embedding_weights_from_file(word_dict, embed_file, silent, embed_cache_dir)
297
307
 
298
308
  if normalize_embed:
299
309
  # To have better precision for calculating the normalization, we convert the original
@@ -306,7 +316,41 @@ def load_or_build_text_dict(
306
316
  embedding_weights[i] = vector / float(torch.linalg.norm(vector) + 1e-6)
307
317
  embedding_weights = embedding_weights.float()
308
318
 
309
- return vocabs, embedding_weights
319
+ return word_dict, embedding_weights
320
+
321
+
322
+ def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
323
+ r"""Build word dictionary, modified from `torchtext.vocab.build-vocab-from-iterator`
324
+ (https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
325
+
326
+ Args:
327
+ vocab_list: List of words.
328
+ min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
329
+ specials: Special tokens (e.g., <unk>, <pad>) to add. Defaults to None.
330
+
331
+ Returns:
332
+ dict: A dictionary which maps tokens to indices.
333
+ """
334
+
335
+ counter = Counter()
336
+ for tokens in vocab_list:
337
+ counter.update(tokens)
338
+
339
+ # sort by descending frequency, then lexicographically
340
+ sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
341
+ ordered_dict = OrderedDict(sorted_by_freq_tuples)
342
+
343
+ # add special tokens at the beginning
344
+ tokens = specials or []
345
+ for token, freq in ordered_dict.items():
346
+ if freq >= min_vocab_freq:
347
+ tokens.append(token)
348
+
349
+ # build token to indices dict
350
+ word_dict = dict()
351
+ for idx, token in enumerate(tokens):
352
+ word_dict[token] = idx
353
+ return word_dict
310
354
 
311
355
 
312
356
  def load_or_build_label(datasets, label_file=None, include_test_labels=False):
@@ -344,70 +388,84 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False):
344
388
  return classes
345
389
 
346
390
 
347
- def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache=None):
348
- """If the word exists in the embedding file, load the pretrained word embedding.
349
- Otherwise, assign a zero vector to that word.
391
+ def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None):
392
+ """Obtain the word embeddings from file. If the word exists in the embedding file,
393
+ load the pretrained word embedding. Otherwise, assign a zero vector to that word.
394
+ If the given `embed_file` is the name of a pretrained GloVe embedding, the function
395
+ will first download the corresponding file.
350
396
 
351
397
  Args:
352
- word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
353
- embed_file (str): Path to a file holding pre-trained embeddings.
398
+ word_dict (dict): A dictionary for mapping tokens to indices.
399
+ embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding.
354
400
  silent (bool, optional): Enable silent mode. Defaults to False.
355
- cache (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
401
+ cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
356
402
 
357
403
  Returns:
358
404
  torch.Tensor: Embedding weights (vocab_size, embed_size).
359
405
  """
360
- # Load pretrained word embedding
361
- load_embedding_from_file = embed_file not in pretrained_aliases
362
- if load_embedding_from_file:
363
- logging.info(f"Load pretrained embedding from file: {embed_file}.")
364
- with open(embed_file) as f:
365
- word_vectors = f.readlines()
366
- embed_size = len(word_vectors[0].split()) - 1
367
- vector_dict = {}
368
- for word_vector in tqdm(word_vectors, disable=silent):
369
- word, vector = word_vector.rstrip().split(" ", 1)
370
- vector = torch.Tensor(list(map(float, vector.split())))
371
- vector_dict[word] = vector
372
- else:
373
- logging.info(f"Load pretrained embedding from torchtext.")
374
- # Adapted from https://pytorch.org/text/0.9.0/_modules/torchtext/vocab.html#Vocab.load_vectors.
375
- if embed_file not in pretrained_aliases:
376
- raise ValueError(
377
- "Got embed_file {}, but allowed pretrained "
378
- "vectors are {}".format(embed_file, list(pretrained_aliases.keys()))
379
- )
380
-
381
- # Hotfix: Glove URLs are outdated in Torchtext
382
- # (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py#L213-L217)
383
- pretrained_cls = pretrained_aliases[embed_file]
384
- if embed_file.startswith("glove"):
385
- for name, url in pretrained_cls.func.url.items():
386
- file_name = url.split("/")[-1]
387
- pretrained_cls.func.url[name] = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{file_name}"
388
-
389
- vector_dict = pretrained_cls(cache=cache)
390
- embed_size = vector_dict.dim
391
406
 
392
- embedding_weights = torch.zeros(len(word_dict), embed_size)
407
+ if embed_file in GLOVE_WORD_EMBEDDING:
408
+ embed_file = _download_glove_embedding(embed_file, cache_dir=cache_dir)
409
+ elif not os.path.isfile(embed_file):
410
+ raise ValueError(
411
+ "Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, GLOVE_WORD_EMBEDDING)
412
+ )
413
+
414
+ logging.info(f"Load pretrained embedding from {embed_file}.")
415
+ with open(embed_file) as f:
416
+ word_vectors = f.readlines()
417
+ embed_size = len(word_vectors[0].split()) - 1
393
418
 
394
- if load_embedding_from_file:
395
- # Add UNK embedding
396
- # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size)
397
- # CAML: np.random.randn(embed_size)
398
- unk_vector = torch.randn(embed_size)
399
- embedding_weights[word_dict[UNK]] = unk_vector
419
+ vector_dict = {}
420
+ for word_vector in tqdm(word_vectors, disable=silent):
421
+ word, vector = word_vector.rstrip().split(" ", 1)
422
+ vector = torch.Tensor(list(map(float, vector.split())))
423
+ vector_dict[word] = vector
424
+
425
+ embedding_weights = torch.zeros(len(word_dict), embed_size)
426
+ # Add UNK embedding
427
+ # AttentionXML: np.random.uniform(-1.0, 1.0, embed_size)
428
+ # CAML: np.random.randn(embed_size)
429
+ unk_vector = torch.randn(embed_size)
430
+ embedding_weights[word_dict[UNK]] = unk_vector
400
431
 
401
432
  # Store pretrained word embedding
402
433
  vec_counts = 0
403
- for word in word_dict.get_itos():
404
- # The condition can be used to process the word that does not in the embedding file.
405
- # Note that torchtext vector object has already dealt with this,
406
- # so we can directly make a query without addtional handling.
407
- if (load_embedding_from_file and word in vector_dict) or not load_embedding_from_file:
434
+ for word in word_dict.keys():
435
+ if word in vector_dict:
408
436
  embedding_weights[word_dict[word]] = vector_dict[word]
409
437
  vec_counts += 1
410
438
 
411
- logging.info(f"loaded {vec_counts}/{len(word_dict)} word embeddings")
439
+ logging.info(f"Loaded {vec_counts}/{len(word_dict)} word embeddings")
412
440
 
413
441
  return embedding_weights
442
+
443
+
444
+ def _download_glove_embedding(embed_name, cache_dir=None):
445
+ """Download pretrained glove embedding from https://huggingface.co/stanfordnlp/glove/tree/main.
446
+
447
+ Args:
448
+ embed_name (str): The name of the pretrained GloVe embedding. Defaults to None.
449
+ cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
450
+
451
+ Returns:
452
+ str: Path to the file that contains the cached embeddings.
453
+ """
454
+ cache_dir = ".vector_cache" if cache_dir is None else cache_dir
455
+ cached_embed_file = f"{cache_dir}/{embed_name}.txt"
456
+ if os.path.isfile(cached_embed_file):
457
+ return cached_embed_file
458
+ os.makedirs(cache_dir, exist_ok=True)
459
+
460
+ remote_embed_file = re.sub(r"6B.*", "6B", embed_name) + ".zip"
461
+ url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}"
462
+ logging.info(f"Downloading pretrained embeddings from {url}.")
463
+ try:
464
+ zip_file, _ = urlretrieve(url, f"{cache_dir}/{remote_embed_file}")
465
+ with zipfile.ZipFile(zip_file, "r") as zf:
466
+ zf.extractall(cache_dir)
467
+ except Exception as e:
468
+ os.remove(zip_file)
469
+ raise e
470
+ logging.info(f"Downloaded pretrained embeddings {embed_name} to {cached_embed_file}.")
471
+ return cached_embed_file
@@ -181,7 +181,7 @@ class Model(MultiLabelModel):
181
181
 
182
182
  Args:
183
183
  classes (list): List of class names.
184
- word_dict (torchtext.vocab.Vocab): A vocab object which maps tokens to indices.
184
+ word_dict (dict): A dictionary for mapping tokens to indices.
185
185
  network (nn.Module): Network (i.e., CAML, KimCNN, or XMLCNN).
186
186
  loss_function (str, optional): Loss function name (i.e., binary_cross_entropy_with_logits,
187
187
  cross_entropy). Defaults to 'binary_cross_entropy_with_logits'.
@@ -61,8 +61,7 @@ def init_model(
61
61
  model_name (str): Model to be used such as KimCNN.
62
62
  network_config (dict): Configuration for defining the network.
63
63
  classes (list): List of class names.
64
- word_dict (torchtext.vocab.Vocab, optional): A vocab object for word tokenizer to
65
- map tokens to indices. Defaults to None.
64
+ word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
66
65
  embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape
67
66
  (vocab_size, embed_dim). Defaults to None.
68
67
  init_weight (str): Weight initialization method from `torch.nn.init`.
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.4
2
2
  Name: libmultilabel
3
- Version: 0.7.4
3
+ Version: 0.8.1
4
4
  Summary: A library for multi-class and multi-label classification
5
5
  Home-page: https://github.com/ASUS-AICS/LibMultiLabel
6
6
  Author: LibMultiLabel Team
@@ -8,7 +8,7 @@ License: MIT License
8
8
  Project-URL: Bug Tracker, https://github.com/ASUS-AICS/LibMultiLabel/issues
9
9
  Project-URL: Documentation, https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
10
10
  Project-URL: Source Code, https://github.com/ASUS-AICS/LibMultiLabel/
11
- Classifier: Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1
11
+ Classifier: Environment :: GPU :: NVIDIA CUDA :: 12
12
12
  Classifier: Environment :: GPU :: NVIDIA CUDA :: 11.8
13
13
  Classifier: Intended Audience :: Developers
14
14
  Classifier: Intended Audience :: Education
@@ -16,23 +16,27 @@ Classifier: Intended Audience :: Science/Research
16
16
  Classifier: License :: OSI Approved :: MIT License
17
17
  Classifier: Operating System :: OS Independent
18
18
  Classifier: Programming Language :: Python :: 3
19
- Classifier: Programming Language :: Python :: 3.8
20
- Requires-Python: >=3.8
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Requires-Python: >=3.10
21
24
  License-File: LICENSE
22
25
  Requires-Dist: liblinear-multicore>=2.49.0
23
26
  Requires-Dist: numba
24
27
  Requires-Dist: pandas>1.3.0
25
28
  Requires-Dist: PyYAML
26
29
  Requires-Dist: scikit-learn
27
- Requires-Dist: scipy<1.14.0
30
+ Requires-Dist: scipy
28
31
  Requires-Dist: tqdm
29
32
  Requires-Dist: psutil
33
+ Requires-Dist: sparsekmeans
30
34
  Provides-Extra: nn
31
- Requires-Dist: lightning==2.0.9; extra == "nn"
35
+ Requires-Dist: lightning; extra == "nn"
32
36
  Requires-Dist: nltk; extra == "nn"
33
- Requires-Dist: torch<=2.3; extra == "nn"
37
+ Requires-Dist: torch; extra == "nn"
34
38
  Requires-Dist: torchmetrics==0.10.3; extra == "nn"
35
- Requires-Dist: torchtext; extra == "nn"
36
39
  Requires-Dist: transformers; extra == "nn"
40
+ Dynamic: license-file
37
41
 
38
42
  See documentation here: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
@@ -3,14 +3,14 @@ numba
3
3
  pandas>1.3.0
4
4
  PyYAML
5
5
  scikit-learn
6
- scipy<1.14.0
6
+ scipy
7
7
  tqdm
8
8
  psutil
9
+ sparsekmeans
9
10
 
10
11
  [nn]
11
- lightning==2.0.9
12
+ lightning
12
13
  nltk
13
- torch<=2.3
14
+ torch
14
15
  torchmetrics==0.10.3
15
- torchtext
16
16
  transformers
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = libmultilabel
3
- version = 0.7.4
3
+ version = 0.8.1
4
4
  author = LibMultiLabel Team
5
5
  license = MIT License
6
6
  license_file = LICENSE
@@ -12,7 +12,7 @@ project_urls =
12
12
  Documentation = https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
13
13
  Source Code = https://github.com/ASUS-AICS/LibMultiLabel/
14
14
  classifiers =
15
- Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1
15
+ Environment :: GPU :: NVIDIA CUDA :: 12
16
16
  Environment :: GPU :: NVIDIA CUDA :: 11.8
17
17
  Intended Audience :: Developers
18
18
  Intended Audience :: Education
@@ -20,7 +20,10 @@ classifiers =
20
20
  License :: OSI Approved :: MIT License
21
21
  Operating System :: OS Independent
22
22
  Programming Language :: Python :: 3
23
- Programming Language :: Python :: 3.8
23
+ Programming Language :: Python :: 3.10
24
+ Programming Language :: Python :: 3.11
25
+ Programming Language :: Python :: 3.12
26
+ Programming Language :: Python :: 3.13
24
27
 
25
28
  [options]
26
29
  packages = find:
@@ -30,18 +33,18 @@ install_requires =
30
33
  pandas>1.3.0
31
34
  PyYAML
32
35
  scikit-learn
33
- scipy<1.14.0
36
+ scipy
34
37
  tqdm
35
38
  psutil
36
- python_requires = >=3.8
39
+ sparsekmeans
40
+ python_requires = >=3.10
37
41
 
38
42
  [options.extras_require]
39
43
  nn =
40
- lightning==2.0.9
44
+ lightning
41
45
  nltk
42
- torch<=2.3
46
+ torch
43
47
  torchmetrics==0.10.3
44
- torchtext
45
48
  transformers
46
49
 
47
50
  [options.packages.find]
File without changes