libmultilabel 0.7.4__tar.gz → 0.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/PKG-INFO +13 -9
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/README.md +3 -3
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/linear.py +18 -4
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/tree.py +23 -22
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/attentionxml.py +1 -1
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/data_utils.py +125 -67
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/model.py +1 -1
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/nn_utils.py +1 -2
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/PKG-INFO +13 -9
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/requires.txt +4 -4
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/setup.cfg +11 -8
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/LICENSE +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/__init__.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/common_utils.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/__init__.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/data_utils.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/metrics.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/preprocessor.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/linear/utils.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/logging.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/__init__.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/metrics.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/__init__.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/bert.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/bert_attention.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/caml.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/kim_cnn.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/labelwise_attention_networks.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/modules.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel/nn/networks/xml_cnn.py +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/SOURCES.txt +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/dependency_links.txt +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/libmultilabel.egg-info/top_level.txt +0 -0
- {libmultilabel-0.7.4 → libmultilabel-0.8.1}/pyproject.toml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: libmultilabel
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: A library for multi-class and multi-label classification
|
|
5
5
|
Home-page: https://github.com/ASUS-AICS/LibMultiLabel
|
|
6
6
|
Author: LibMultiLabel Team
|
|
@@ -8,7 +8,7 @@ License: MIT License
|
|
|
8
8
|
Project-URL: Bug Tracker, https://github.com/ASUS-AICS/LibMultiLabel/issues
|
|
9
9
|
Project-URL: Documentation, https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
|
|
10
10
|
Project-URL: Source Code, https://github.com/ASUS-AICS/LibMultiLabel/
|
|
11
|
-
Classifier: Environment :: GPU :: NVIDIA CUDA :: 12
|
|
11
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA :: 12
|
|
12
12
|
Classifier: Environment :: GPU :: NVIDIA CUDA :: 11.8
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
14
14
|
Classifier: Intended Audience :: Education
|
|
@@ -16,23 +16,27 @@ Classifier: Intended Audience :: Science/Research
|
|
|
16
16
|
Classifier: License :: OSI Approved :: MIT License
|
|
17
17
|
Classifier: Operating System :: OS Independent
|
|
18
18
|
Classifier: Programming Language :: Python :: 3
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.
|
|
20
|
-
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Requires-Python: >=3.10
|
|
21
24
|
License-File: LICENSE
|
|
22
25
|
Requires-Dist: liblinear-multicore>=2.49.0
|
|
23
26
|
Requires-Dist: numba
|
|
24
27
|
Requires-Dist: pandas>1.3.0
|
|
25
28
|
Requires-Dist: PyYAML
|
|
26
29
|
Requires-Dist: scikit-learn
|
|
27
|
-
Requires-Dist: scipy
|
|
30
|
+
Requires-Dist: scipy
|
|
28
31
|
Requires-Dist: tqdm
|
|
29
32
|
Requires-Dist: psutil
|
|
33
|
+
Requires-Dist: sparsekmeans
|
|
30
34
|
Provides-Extra: nn
|
|
31
|
-
Requires-Dist: lightning
|
|
35
|
+
Requires-Dist: lightning; extra == "nn"
|
|
32
36
|
Requires-Dist: nltk; extra == "nn"
|
|
33
|
-
Requires-Dist: torch
|
|
37
|
+
Requires-Dist: torch; extra == "nn"
|
|
34
38
|
Requires-Dist: torchmetrics==0.10.3; extra == "nn"
|
|
35
|
-
Requires-Dist: torchtext; extra == "nn"
|
|
36
39
|
Requires-Dist: transformers; extra == "nn"
|
|
40
|
+
Dynamic: license-file
|
|
37
41
|
|
|
38
42
|
See documentation here: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
|
|
@@ -9,9 +9,9 @@ LibMultiLabel is a library for binary, multi-class, and multi-label classificati
|
|
|
9
9
|
This project is under active development, so many improvements are still being made. Comments are very welcome.
|
|
10
10
|
|
|
11
11
|
## Environments
|
|
12
|
-
- Python: 3.
|
|
13
|
-
- CUDA: 11.8, 12.1 (if training neural networks by GPU)
|
|
14
|
-
- Pytorch: 2.0
|
|
12
|
+
- Python: 3.10+
|
|
13
|
+
- CUDA: 11.8, 12.1, 12.6 (if training neural networks by GPU)
|
|
14
|
+
- Pytorch: 2.3.0+
|
|
15
15
|
|
|
16
16
|
If you have a different version of CUDA, follow the installation instructions for PyTorch LTS at their [website](https://pytorch.org/).
|
|
17
17
|
|
|
@@ -27,7 +27,7 @@ class FlatModel:
|
|
|
27
27
|
def __init__(
|
|
28
28
|
self,
|
|
29
29
|
name: str,
|
|
30
|
-
weights: np.matrix,
|
|
30
|
+
weights: np.matrix | sparse.csr_matrix,
|
|
31
31
|
bias: float,
|
|
32
32
|
thresholds: float | np.ndarray,
|
|
33
33
|
multiclass: bool,
|
|
@@ -69,7 +69,21 @@ class FlatModel:
|
|
|
69
69
|
"csr",
|
|
70
70
|
)
|
|
71
71
|
|
|
72
|
-
return (x * self.weights)
|
|
72
|
+
return self._to_dense_array(x * self.weights) + self.thresholds
|
|
73
|
+
|
|
74
|
+
def _to_dense_array(self, matrix: np.matrix | sparse.csr_matrix) -> np.ndarray:
|
|
75
|
+
"""Convert a numpy or scipy matrix to a dense ndarray.
|
|
76
|
+
|
|
77
|
+
Args:
|
|
78
|
+
matrix (np.matrix | sparse.csr_matrix): A numpy or scipy sparse matrix.
|
|
79
|
+
|
|
80
|
+
Returns:
|
|
81
|
+
np.ndarray: A dense ndarray of `matrix`.
|
|
82
|
+
"""
|
|
83
|
+
if sparse.issparse(matrix):
|
|
84
|
+
return matrix.toarray()
|
|
85
|
+
elif isinstance(matrix, np.matrix):
|
|
86
|
+
return np.asarray(matrix)
|
|
73
87
|
|
|
74
88
|
|
|
75
89
|
def train_1vsrest(
|
|
@@ -458,7 +472,7 @@ def _cost_sensitive_one_label(y: np.ndarray, x: sparse.csr_matrix, options: str)
|
|
|
458
472
|
|
|
459
473
|
param_space = [1, 1.33, 1.8, 2.5, 3.67, 6, 13]
|
|
460
474
|
|
|
461
|
-
bestScore = -np.
|
|
475
|
+
bestScore = -np.inf
|
|
462
476
|
for a in param_space:
|
|
463
477
|
cv_options = f"{options} -w1 {a}"
|
|
464
478
|
pred = _cross_validate(y, x, cv_options, perm)
|
|
@@ -532,7 +546,7 @@ def train_cost_sensitive_micro(
|
|
|
532
546
|
l = y.shape[0]
|
|
533
547
|
perm = np.random.permutation(l)
|
|
534
548
|
param_space = [1, 1.33, 1.8, 2.5, 3.67, 6, 13]
|
|
535
|
-
bestScore = -np.
|
|
549
|
+
bestScore = -np.inf
|
|
536
550
|
|
|
537
551
|
if verbose:
|
|
538
552
|
logging.info(f"Training cost-sensitive model for Micro-F1 on {num_class} labels")
|
|
@@ -4,7 +4,7 @@ from typing import Callable
|
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import scipy.sparse as sparse
|
|
7
|
-
import
|
|
7
|
+
from sparsekmeans import LloydKmeans, ElkanKmeans
|
|
8
8
|
import sklearn.preprocessing
|
|
9
9
|
from tqdm import tqdm
|
|
10
10
|
import psutil
|
|
@@ -101,7 +101,7 @@ class TreeModel:
|
|
|
101
101
|
self.subtree_models = []
|
|
102
102
|
for i in range(len(self.root.children)):
|
|
103
103
|
subtree_weights_start = self.node_ptr[self.root.children[i].index]
|
|
104
|
-
subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else -1
|
|
104
|
+
subtree_weights_end = self.node_ptr[self.root.children[i+1].index] if i+1 < len(self.root.children) else self.node_ptr[-1]
|
|
105
105
|
slice = np.s_[:, subtree_weights_start:subtree_weights_end]
|
|
106
106
|
subtree_flatmodel = linear.FlatModel(
|
|
107
107
|
name="subtree-flattened-tree",
|
|
@@ -274,28 +274,29 @@ def _build_tree(label_representation: sparse.csr_matrix, label_map: np.ndarray,
|
|
|
274
274
|
Returns:
|
|
275
275
|
Node: Root of the (sub)tree built from label_representation.
|
|
276
276
|
"""
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
max_iter=300,
|
|
286
|
-
tol=0.0001,
|
|
287
|
-
algorithm="elkan",
|
|
277
|
+
children = []
|
|
278
|
+
if d < dmax and label_representation.shape[0] > K:
|
|
279
|
+
if label_representation.shape[0] > 10000:
|
|
280
|
+
kmeans_algo = ElkanKmeans
|
|
281
|
+
else:
|
|
282
|
+
kmeans_algo = LloydKmeans
|
|
283
|
+
|
|
284
|
+
kmeans = kmeans_algo(
|
|
285
|
+
n_clusters=K, max_iter=300, tol=0.0001, random_state=np.random.randint(2**31 - 1), verbose=True
|
|
288
286
|
)
|
|
289
|
-
.fit(label_representation)
|
|
290
|
-
.labels_
|
|
291
|
-
)
|
|
287
|
+
metalabels = kmeans.fit(label_representation)
|
|
292
288
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
289
|
+
unique_labels = np.unique(metalabels)
|
|
290
|
+
if len(unique_labels) == K:
|
|
291
|
+
create_child_node = lambda i: _build_tree(
|
|
292
|
+
label_representation[metalabels == i], label_map[metalabels == i], d + 1, K, dmax
|
|
293
|
+
)
|
|
294
|
+
else:
|
|
295
|
+
create_child_node = lambda i: Node(label_map=label_map[metalabels == i], children=[])
|
|
296
|
+
|
|
297
|
+
for i in range(K):
|
|
298
|
+
child = create_child_node(i)
|
|
299
|
+
children.append(child)
|
|
299
300
|
|
|
300
301
|
return Node(label_map=label_map, children=children)
|
|
301
302
|
|
|
@@ -489,7 +489,7 @@ class PLTTrainer:
|
|
|
489
489
|
# Convert words to numbers according to their indices in word_dict. Then pad each instance to a certain length.
|
|
490
490
|
encoded_text = list(
|
|
491
491
|
map(
|
|
492
|
-
lambda text: torch.tensor([self.word_dict[
|
|
492
|
+
lambda text: torch.tensor([self.word_dict.get(word, self.word_dict[UNK]) for word in text], dtype=torch.int64)
|
|
493
493
|
if text
|
|
494
494
|
else torch.tensor([self.word_dict[UNK]], dtype=torch.int64),
|
|
495
495
|
[instance["text"][: self.max_seq_length] for instance in dataset],
|
|
@@ -1,7 +1,12 @@
|
|
|
1
1
|
import csv
|
|
2
2
|
import gc
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
5
|
+
import re
|
|
4
6
|
import warnings
|
|
7
|
+
import zipfile
|
|
8
|
+
from urllib.request import urlretrieve
|
|
9
|
+
from collections import Counter, OrderedDict
|
|
5
10
|
|
|
6
11
|
import pandas as pd
|
|
7
12
|
import torch
|
|
@@ -11,7 +16,6 @@ from sklearn.model_selection import train_test_split
|
|
|
11
16
|
from sklearn.preprocessing import MultiLabelBinarizer
|
|
12
17
|
from torch.nn.utils.rnn import pad_sequence
|
|
13
18
|
from torch.utils.data import Dataset
|
|
14
|
-
from torchtext.vocab import build_vocab_from_iterator, pretrained_aliases, Vocab
|
|
15
19
|
from tqdm import tqdm
|
|
16
20
|
|
|
17
21
|
transformers.logging.set_verbosity_error()
|
|
@@ -19,6 +23,14 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
|
|
|
19
23
|
|
|
20
24
|
UNK = "<unk>"
|
|
21
25
|
PAD = "<pad>"
|
|
26
|
+
GLOVE_WORD_EMBEDDING = {
|
|
27
|
+
"glove.42B.300d",
|
|
28
|
+
"glove.840B.300d",
|
|
29
|
+
"glove.6B.50d",
|
|
30
|
+
"glove.6B.100d",
|
|
31
|
+
"glove.6B.200d",
|
|
32
|
+
"glove.6B.300d",
|
|
33
|
+
}
|
|
22
34
|
|
|
23
35
|
|
|
24
36
|
class TextDataset(Dataset):
|
|
@@ -31,8 +43,7 @@ class TextDataset(Dataset):
|
|
|
31
43
|
add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
|
|
32
44
|
tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
|
|
33
45
|
the transformer-based pretrained language model. Defaults to None.
|
|
34
|
-
word_dict (
|
|
35
|
-
map tokens to indices. Defaults to None.
|
|
46
|
+
word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
|
|
36
47
|
"""
|
|
37
48
|
|
|
38
49
|
def __init__(
|
|
@@ -55,7 +66,7 @@ class TextDataset(Dataset):
|
|
|
55
66
|
self.num_classes = len(self.classes)
|
|
56
67
|
self.label_binarizer = MultiLabelBinarizer().fit([classes])
|
|
57
68
|
|
|
58
|
-
if not isinstance(self.word_dict,
|
|
69
|
+
if not isinstance(self.word_dict, dict) ^ isinstance(self.tokenizer, transformers.PreTrainedTokenizerBase):
|
|
59
70
|
raise ValueError("Please specify exactly one of word_dict or tokenizer")
|
|
60
71
|
|
|
61
72
|
def __len__(self):
|
|
@@ -71,7 +82,7 @@ class TextDataset(Dataset):
|
|
|
71
82
|
else:
|
|
72
83
|
input_ids = self.tokenizer.encode(data["text"], add_special_tokens=False)
|
|
73
84
|
else:
|
|
74
|
-
input_ids = [self.word_dict[
|
|
85
|
+
input_ids = [self.word_dict.get(word, self.word_dict[UNK]) for word in data["text"]]
|
|
75
86
|
return {
|
|
76
87
|
"text": torch.LongTensor(input_ids[: self.max_seq_length]),
|
|
77
88
|
"label": torch.IntTensor(self.label_binarizer.transform([data["label"]])[0]),
|
|
@@ -128,8 +139,7 @@ def get_dataset_loader(
|
|
|
128
139
|
add_special_tokens (bool, optional): Whether to add the special tokens. Defaults to True.
|
|
129
140
|
tokenizer (transformers.PreTrainedTokenizerBase, optional): HuggingFace's tokenizer of
|
|
130
141
|
the transformer-based pretrained language model. Defaults to None.
|
|
131
|
-
word_dict (
|
|
132
|
-
map tokens to indices. Defaults to None.
|
|
142
|
+
word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
|
|
133
143
|
|
|
134
144
|
Returns:
|
|
135
145
|
torch.utils.data.DataLoader: A pytorch DataLoader.
|
|
@@ -154,6 +164,7 @@ def _load_raw_data(data, is_test=False, tokenize_text=True, remove_no_label_data
|
|
|
154
164
|
Args:
|
|
155
165
|
data (Union[str, pandas.DataFrame]): Training, test, or validation data in file or dataframe.
|
|
156
166
|
is_test (bool, optional): Whether the data is for test or not. Defaults to False.
|
|
167
|
+
tokenize_text (bool, optional): Whether to tokenize text. Defaults to True.
|
|
157
168
|
remove_no_label_data (bool, optional): Whether to remove training/validation instances that have no labels.
|
|
158
169
|
This is effective only when is_test=False. Defaults to False.
|
|
159
170
|
|
|
@@ -265,20 +276,19 @@ def load_or_build_text_dict(
|
|
|
265
276
|
):
|
|
266
277
|
"""Build or load the vocabulary from the training dataset or the predefined `vocab_file`.
|
|
267
278
|
The pretrained embedding can be either from a self-defined `embed_file` or from one of
|
|
268
|
-
the vectors
|
|
269
|
-
(https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py).
|
|
279
|
+
the vectors: `glove.6B.50d`, `glove.6B.100d`, `glove.6B.200d`, `glove.6B.300d`, `glove.42B.300d`, or `glove.840B.300d`.
|
|
270
280
|
|
|
271
281
|
Args:
|
|
272
282
|
dataset (list): List of training instances with index, label, and tokenized text.
|
|
273
283
|
vocab_file (str, optional): Path to a file holding vocabularies. Defaults to None.
|
|
274
284
|
min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
|
|
275
|
-
embed_file (str): Path to a file holding pre-trained embeddings.
|
|
285
|
+
embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding. Defaults to None.
|
|
276
286
|
embed_cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
|
|
277
287
|
silent (bool, optional): Enable silent mode. Defaults to False.
|
|
278
288
|
normalize_embed (bool, optional): Whether the embedding of each word is normalized to a unit vector. Defaults to False.
|
|
279
289
|
|
|
280
290
|
Returns:
|
|
281
|
-
tuple[
|
|
291
|
+
tuple[dict, torch.Tensor]: A dictionary which maps tokens to indices and the pre-trained word vectors of shape (vocab_size, embed_dim).
|
|
282
292
|
"""
|
|
283
293
|
if vocab_file:
|
|
284
294
|
logging.info(f"Load vocab from {vocab_file}")
|
|
@@ -286,14 +296,14 @@ def load_or_build_text_dict(
|
|
|
286
296
|
vocab_list = [[vocab.strip() for vocab in fp.readlines()]]
|
|
287
297
|
# Keep PAD index 0 to align `padding_idx` of
|
|
288
298
|
# class Embedding in libmultilabel.nn.networks.modules.
|
|
289
|
-
|
|
299
|
+
word_dict = _build_word_dict(vocab_list, min_vocab_freq=1, specials=[PAD, UNK])
|
|
290
300
|
else:
|
|
291
301
|
vocab_list = [set(data["text"]) for data in dataset]
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
logging.info(f"Read {len(
|
|
302
|
+
word_dict = _build_word_dict(vocab_list, min_vocab_freq=min_vocab_freq, specials=[PAD, UNK])
|
|
303
|
+
|
|
304
|
+
logging.info(f"Read {len(word_dict)} vocabularies.")
|
|
295
305
|
|
|
296
|
-
embedding_weights = get_embedding_weights_from_file(
|
|
306
|
+
embedding_weights = get_embedding_weights_from_file(word_dict, embed_file, silent, embed_cache_dir)
|
|
297
307
|
|
|
298
308
|
if normalize_embed:
|
|
299
309
|
# To have better precision for calculating the normalization, we convert the original
|
|
@@ -306,7 +316,41 @@ def load_or_build_text_dict(
|
|
|
306
316
|
embedding_weights[i] = vector / float(torch.linalg.norm(vector) + 1e-6)
|
|
307
317
|
embedding_weights = embedding_weights.float()
|
|
308
318
|
|
|
309
|
-
return
|
|
319
|
+
return word_dict, embedding_weights
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def _build_word_dict(vocab_list, min_vocab_freq=1, specials=None):
|
|
323
|
+
r"""Build word dictionary, modified from `torchtext.vocab.build_vocab_from_iterator`
|
|
324
|
+
(https://docs.pytorch.org/text/stable/vocab.html#build-vocab-from-iterator)
|
|
325
|
+
|
|
326
|
+
Args:
|
|
327
|
+
vocab_list (list): List of token collections (e.g., one list or set of words per instance) whose tokens are counted.
|
|
328
|
+
min_vocab_freq (int, optional): The minimum frequency needed to include a token in the vocabulary. Defaults to 1.
|
|
329
|
+
specials: Special tokens (e.g., <unk>, <pad>) to add. Defaults to None.
|
|
330
|
+
|
|
331
|
+
Returns:
|
|
332
|
+
dict: A dictionary which maps tokens to indices.
|
|
333
|
+
"""
|
|
334
|
+
|
|
335
|
+
counter = Counter()
|
|
336
|
+
for tokens in vocab_list:
|
|
337
|
+
counter.update(tokens)
|
|
338
|
+
|
|
339
|
+
# sort by descending frequency, then lexicographically
|
|
340
|
+
sorted_by_freq_tuples = sorted(counter.items(), key=lambda x: (-x[1], x[0]))
|
|
341
|
+
ordered_dict = OrderedDict(sorted_by_freq_tuples)
|
|
342
|
+
|
|
343
|
+
# add special tokens at the beginning
|
|
344
|
+
tokens = specials or []
|
|
345
|
+
for token, freq in ordered_dict.items():
|
|
346
|
+
if freq >= min_vocab_freq:
|
|
347
|
+
tokens.append(token)
|
|
348
|
+
|
|
349
|
+
# build token to indices dict
|
|
350
|
+
word_dict = dict()
|
|
351
|
+
for idx, token in enumerate(tokens):
|
|
352
|
+
word_dict[token] = idx
|
|
353
|
+
return word_dict
|
|
310
354
|
|
|
311
355
|
|
|
312
356
|
def load_or_build_label(datasets, label_file=None, include_test_labels=False):
|
|
@@ -344,70 +388,84 @@ def load_or_build_label(datasets, label_file=None, include_test_labels=False):
|
|
|
344
388
|
return classes
|
|
345
389
|
|
|
346
390
|
|
|
347
|
-
def get_embedding_weights_from_file(word_dict, embed_file, silent=False,
|
|
348
|
-
"""If the word exists in the embedding file,
|
|
349
|
-
Otherwise, assign a zero vector to that word.
|
|
391
|
+
def get_embedding_weights_from_file(word_dict, embed_file, silent=False, cache_dir=None):
|
|
392
|
+
"""Obtain the word embeddings from file. If the word exists in the embedding file,
|
|
393
|
+
load the pretrained word embedding. Otherwise, assign a zero vector to that word.
|
|
394
|
+
If the given `embed_file` is the name of a pretrained GloVe embedding, the function
|
|
395
|
+
will first download the corresponding file.
|
|
350
396
|
|
|
351
397
|
Args:
|
|
352
|
-
word_dict (
|
|
353
|
-
embed_file (str): Path to a file holding pre-trained embeddings.
|
|
398
|
+
word_dict (dict): A dictionary for mapping tokens to indices.
|
|
399
|
+
embed_file (str): Path to a file holding pre-trained embeddings or the name of the pretrained GloVe embedding.
|
|
354
400
|
silent (bool, optional): Enable silent mode. Defaults to False.
|
|
355
|
-
|
|
401
|
+
cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
|
|
356
402
|
|
|
357
403
|
Returns:
|
|
358
404
|
torch.Tensor: Embedding weights (vocab_size, embed_size).
|
|
359
405
|
"""
|
|
360
|
-
# Load pretrained word embedding
|
|
361
|
-
load_embedding_from_file = embed_file not in pretrained_aliases
|
|
362
|
-
if load_embedding_from_file:
|
|
363
|
-
logging.info(f"Load pretrained embedding from file: {embed_file}.")
|
|
364
|
-
with open(embed_file) as f:
|
|
365
|
-
word_vectors = f.readlines()
|
|
366
|
-
embed_size = len(word_vectors[0].split()) - 1
|
|
367
|
-
vector_dict = {}
|
|
368
|
-
for word_vector in tqdm(word_vectors, disable=silent):
|
|
369
|
-
word, vector = word_vector.rstrip().split(" ", 1)
|
|
370
|
-
vector = torch.Tensor(list(map(float, vector.split())))
|
|
371
|
-
vector_dict[word] = vector
|
|
372
|
-
else:
|
|
373
|
-
logging.info(f"Load pretrained embedding from torchtext.")
|
|
374
|
-
# Adapted from https://pytorch.org/text/0.9.0/_modules/torchtext/vocab.html#Vocab.load_vectors.
|
|
375
|
-
if embed_file not in pretrained_aliases:
|
|
376
|
-
raise ValueError(
|
|
377
|
-
"Got embed_file {}, but allowed pretrained "
|
|
378
|
-
"vectors are {}".format(embed_file, list(pretrained_aliases.keys()))
|
|
379
|
-
)
|
|
380
|
-
|
|
381
|
-
# Hotfix: Glove URLs are outdated in Torchtext
|
|
382
|
-
# (https://github.com/pytorch/text/blob/main/torchtext/vocab/vectors.py#L213-L217)
|
|
383
|
-
pretrained_cls = pretrained_aliases[embed_file]
|
|
384
|
-
if embed_file.startswith("glove"):
|
|
385
|
-
for name, url in pretrained_cls.func.url.items():
|
|
386
|
-
file_name = url.split("/")[-1]
|
|
387
|
-
pretrained_cls.func.url[name] = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{file_name}"
|
|
388
|
-
|
|
389
|
-
vector_dict = pretrained_cls(cache=cache)
|
|
390
|
-
embed_size = vector_dict.dim
|
|
391
406
|
|
|
392
|
-
|
|
407
|
+
if embed_file in GLOVE_WORD_EMBEDDING:
|
|
408
|
+
embed_file = _download_glove_embedding(embed_file, cache_dir=cache_dir)
|
|
409
|
+
elif not os.path.isfile(embed_file):
|
|
410
|
+
raise ValueError(
|
|
411
|
+
"Got embed_file {}, but allowed pretrained " "embeddings are {}".format(embed_file, GLOVE_WORD_EMBEDDING)
|
|
412
|
+
)
|
|
413
|
+
|
|
414
|
+
logging.info(f"Load pretrained embedding from {embed_file}.")
|
|
415
|
+
with open(embed_file) as f:
|
|
416
|
+
word_vectors = f.readlines()
|
|
417
|
+
embed_size = len(word_vectors[0].split()) - 1
|
|
393
418
|
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
419
|
+
vector_dict = {}
|
|
420
|
+
for word_vector in tqdm(word_vectors, disable=silent):
|
|
421
|
+
word, vector = word_vector.rstrip().split(" ", 1)
|
|
422
|
+
vector = torch.Tensor(list(map(float, vector.split())))
|
|
423
|
+
vector_dict[word] = vector
|
|
424
|
+
|
|
425
|
+
embedding_weights = torch.zeros(len(word_dict), embed_size)
|
|
426
|
+
# Add UNK embedding
|
|
427
|
+
# AttentionXML: np.random.uniform(-1.0, 1.0, embed_size)
|
|
428
|
+
# CAML: np.random.randn(embed_size)
|
|
429
|
+
unk_vector = torch.randn(embed_size)
|
|
430
|
+
embedding_weights[word_dict[UNK]] = unk_vector
|
|
400
431
|
|
|
401
432
|
# Store pretrained word embedding
|
|
402
433
|
vec_counts = 0
|
|
403
|
-
for word in word_dict.
|
|
404
|
-
|
|
405
|
-
# Note that torchtext vector object has already dealt with this,
|
|
406
|
-
# so we can directly make a query without addtional handling.
|
|
407
|
-
if (load_embedding_from_file and word in vector_dict) or not load_embedding_from_file:
|
|
434
|
+
for word in word_dict.keys():
|
|
435
|
+
if word in vector_dict:
|
|
408
436
|
embedding_weights[word_dict[word]] = vector_dict[word]
|
|
409
437
|
vec_counts += 1
|
|
410
438
|
|
|
411
|
-
logging.info(f"
|
|
439
|
+
logging.info(f"Loaded {vec_counts}/{len(word_dict)} word embeddings")
|
|
412
440
|
|
|
413
441
|
return embedding_weights
|
|
442
|
+
|
|
443
|
+
|
|
444
|
+
def _download_glove_embedding(embed_name, cache_dir=None):
|
|
445
|
+
"""Download pretrained glove embedding from https://huggingface.co/stanfordnlp/glove/tree/main.
|
|
446
|
+
|
|
447
|
+
Args:
|
|
448
|
+
embed_name (str): The name of the pretrained GloVe embedding.
|
|
449
|
+
cache_dir (str, optional): Path to a directory for storing cached embeddings. Defaults to None.
|
|
450
|
+
|
|
451
|
+
Returns:
|
|
452
|
+
str: Path to the file that contains the cached embeddings.
|
|
453
|
+
"""
|
|
454
|
+
cache_dir = ".vector_cache" if cache_dir is None else cache_dir
|
|
455
|
+
cached_embed_file = f"{cache_dir}/{embed_name}.txt"
|
|
456
|
+
if os.path.isfile(cached_embed_file):
|
|
457
|
+
return cached_embed_file
|
|
458
|
+
os.makedirs(cache_dir, exist_ok=True)
|
|
459
|
+
|
|
460
|
+
remote_embed_file = re.sub(r"6B.*", "6B", embed_name) + ".zip"
|
|
461
|
+
url = f"https://huggingface.co/stanfordnlp/glove/resolve/main/{remote_embed_file}"
|
|
462
|
+
logging.info(f"Downloading pretrained embeddings from {url}.")
|
|
463
|
+
try:
|
|
464
|
+
zip_file, _ = urlretrieve(url, f"{cache_dir}/{remote_embed_file}")
|
|
465
|
+
with zipfile.ZipFile(zip_file, "r") as zf:
|
|
466
|
+
zf.extractall(cache_dir)
|
|
467
|
+
except Exception as e:
|
|
468
|
+
os.remove(zip_file)
|
|
469
|
+
raise e
|
|
470
|
+
logging.info(f"Downloaded pretrained embeddings {embed_name} to {cached_embed_file}.")
|
|
471
|
+
return cached_embed_file
|
|
@@ -181,7 +181,7 @@ class Model(MultiLabelModel):
|
|
|
181
181
|
|
|
182
182
|
Args:
|
|
183
183
|
classes (list): List of class names.
|
|
184
|
-
word_dict (
|
|
184
|
+
word_dict (dict): A dictionary for mapping tokens to indices.
|
|
185
185
|
network (nn.Module): Network (i.e., CAML, KimCNN, or XMLCNN).
|
|
186
186
|
loss_function (str, optional): Loss function name (i.e., binary_cross_entropy_with_logits,
|
|
187
187
|
cross_entropy). Defaults to 'binary_cross_entropy_with_logits'.
|
|
@@ -61,8 +61,7 @@ def init_model(
|
|
|
61
61
|
model_name (str): Model to be used such as KimCNN.
|
|
62
62
|
network_config (dict): Configuration for defining the network.
|
|
63
63
|
classes (list): List of class names.
|
|
64
|
-
word_dict (
|
|
65
|
-
map tokens to indices. Defaults to None.
|
|
64
|
+
word_dict (dict, optional): A dictionary for mapping tokens to indices. Defaults to None.
|
|
66
65
|
embed_vecs (torch.Tensor, optional): The pre-trained word vectors of shape
|
|
67
66
|
(vocab_size, embed_dim). Defaults to None.
|
|
68
67
|
init_weight (str): Weight initialization method from `torch.nn.init`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
Metadata-Version: 2.
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
2
|
Name: libmultilabel
|
|
3
|
-
Version: 0.
|
|
3
|
+
Version: 0.8.1
|
|
4
4
|
Summary: A library for multi-class and multi-label classification
|
|
5
5
|
Home-page: https://github.com/ASUS-AICS/LibMultiLabel
|
|
6
6
|
Author: LibMultiLabel Team
|
|
@@ -8,7 +8,7 @@ License: MIT License
|
|
|
8
8
|
Project-URL: Bug Tracker, https://github.com/ASUS-AICS/LibMultiLabel/issues
|
|
9
9
|
Project-URL: Documentation, https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
|
|
10
10
|
Project-URL: Source Code, https://github.com/ASUS-AICS/LibMultiLabel/
|
|
11
|
-
Classifier: Environment :: GPU :: NVIDIA CUDA :: 12
|
|
11
|
+
Classifier: Environment :: GPU :: NVIDIA CUDA :: 12
|
|
12
12
|
Classifier: Environment :: GPU :: NVIDIA CUDA :: 11.8
|
|
13
13
|
Classifier: Intended Audience :: Developers
|
|
14
14
|
Classifier: Intended Audience :: Education
|
|
@@ -16,23 +16,27 @@ Classifier: Intended Audience :: Science/Research
|
|
|
16
16
|
Classifier: License :: OSI Approved :: MIT License
|
|
17
17
|
Classifier: Operating System :: OS Independent
|
|
18
18
|
Classifier: Programming Language :: Python :: 3
|
|
19
|
-
Classifier: Programming Language :: Python :: 3.
|
|
20
|
-
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Requires-Python: >=3.10
|
|
21
24
|
License-File: LICENSE
|
|
22
25
|
Requires-Dist: liblinear-multicore>=2.49.0
|
|
23
26
|
Requires-Dist: numba
|
|
24
27
|
Requires-Dist: pandas>1.3.0
|
|
25
28
|
Requires-Dist: PyYAML
|
|
26
29
|
Requires-Dist: scikit-learn
|
|
27
|
-
Requires-Dist: scipy
|
|
30
|
+
Requires-Dist: scipy
|
|
28
31
|
Requires-Dist: tqdm
|
|
29
32
|
Requires-Dist: psutil
|
|
33
|
+
Requires-Dist: sparsekmeans
|
|
30
34
|
Provides-Extra: nn
|
|
31
|
-
Requires-Dist: lightning
|
|
35
|
+
Requires-Dist: lightning; extra == "nn"
|
|
32
36
|
Requires-Dist: nltk; extra == "nn"
|
|
33
|
-
Requires-Dist: torch
|
|
37
|
+
Requires-Dist: torch; extra == "nn"
|
|
34
38
|
Requires-Dist: torchmetrics==0.10.3; extra == "nn"
|
|
35
|
-
Requires-Dist: torchtext; extra == "nn"
|
|
36
39
|
Requires-Dist: transformers; extra == "nn"
|
|
40
|
+
Dynamic: license-file
|
|
37
41
|
|
|
38
42
|
See documentation here: https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[metadata]
|
|
2
2
|
name = libmultilabel
|
|
3
|
-
version = 0.
|
|
3
|
+
version = 0.8.1
|
|
4
4
|
author = LibMultiLabel Team
|
|
5
5
|
license = MIT License
|
|
6
6
|
license_file = LICENSE
|
|
@@ -12,7 +12,7 @@ project_urls =
|
|
|
12
12
|
Documentation = https://www.csie.ntu.edu.tw/~cjlin/libmultilabel
|
|
13
13
|
Source Code = https://github.com/ASUS-AICS/LibMultiLabel/
|
|
14
14
|
classifiers =
|
|
15
|
-
Environment :: GPU :: NVIDIA CUDA :: 12
|
|
15
|
+
Environment :: GPU :: NVIDIA CUDA :: 12
|
|
16
16
|
Environment :: GPU :: NVIDIA CUDA :: 11.8
|
|
17
17
|
Intended Audience :: Developers
|
|
18
18
|
Intended Audience :: Education
|
|
@@ -20,7 +20,10 @@ classifiers =
|
|
|
20
20
|
License :: OSI Approved :: MIT License
|
|
21
21
|
Operating System :: OS Independent
|
|
22
22
|
Programming Language :: Python :: 3
|
|
23
|
-
Programming Language :: Python :: 3.
|
|
23
|
+
Programming Language :: Python :: 3.10
|
|
24
|
+
Programming Language :: Python :: 3.11
|
|
25
|
+
Programming Language :: Python :: 3.12
|
|
26
|
+
Programming Language :: Python :: 3.13
|
|
24
27
|
|
|
25
28
|
[options]
|
|
26
29
|
packages = find:
|
|
@@ -30,18 +33,18 @@ install_requires =
|
|
|
30
33
|
pandas>1.3.0
|
|
31
34
|
PyYAML
|
|
32
35
|
scikit-learn
|
|
33
|
-
scipy
|
|
36
|
+
scipy
|
|
34
37
|
tqdm
|
|
35
38
|
psutil
|
|
36
|
-
|
|
39
|
+
sparsekmeans
|
|
40
|
+
python_requires = >=3.10
|
|
37
41
|
|
|
38
42
|
[options.extras_require]
|
|
39
43
|
nn =
|
|
40
|
-
lightning
|
|
44
|
+
lightning
|
|
41
45
|
nltk
|
|
42
|
-
torch
|
|
46
|
+
torch
|
|
43
47
|
torchmetrics==0.10.3
|
|
44
|
-
torchtext
|
|
45
48
|
transformers
|
|
46
49
|
|
|
47
50
|
[options.packages.find]
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|