deeplotx 0.2.20__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff shows the content of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- deeplotx/__init__.py +1 -1
- deeplotx/encoder/__init__.py +1 -0
- deeplotx/encoder/bert_encoder.py +7 -6
- deeplotx/encoder/long_text_encoder.py +3 -2
- deeplotx/encoder/longformer_encoder.py +30 -0
- deeplotx/nn/base_neural_network.py +3 -0
- deeplotx/similarity/__init__.py +17 -0
- deeplotx/similarity/distribution.py +32 -0
- deeplotx/similarity/set.py +19 -0
- deeplotx/similarity/vector.py +36 -0
- {deeplotx-0.2.20.dist-info → deeplotx-0.3.1.dist-info}/METADATA +9 -3
- deeplotx-0.3.1.dist-info/RECORD +25 -0
- {deeplotx-0.2.20.dist-info → deeplotx-0.3.1.dist-info}/WHEEL +1 -1
- deeplotx-0.2.20.dist-info/RECORD +0 -20
- {deeplotx-0.2.20.dist-info → deeplotx-0.3.1.dist-info}/licenses/LICENSE +0 -0
- {deeplotx-0.2.20.dist-info → deeplotx-0.3.1.dist-info}/top_level.txt +0 -0
deeplotx/__init__.py
CHANGED
@@ -3,7 +3,7 @@ import os
 
 __ROOT__ = os.path.dirname(os.path.abspath(__file__))
 
-from .encoder import BertEncoder, LongTextEncoder
+from .encoder import BertEncoder, LongTextEncoder, LongformerEncoder
 from .nn import LinearRegression, LogisticRegression, SoftmaxRegression
 from .trainer import TextBinaryClassifierTrainer
 
deeplotx/encoder/__init__.py
CHANGED
deeplotx/encoder/bert_encoder.py
CHANGED
@@ -13,19 +13,20 @@ class BertEncoder(nn.Module):
     def __init__(self, model_name_or_path: str = DEFAULT_BERT):
         super().__init__()
         self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                       cache_dir=CACHE_PATH)
+                                                       cache_dir=CACHE_PATH, _from_auto=True)
         self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                              cache_dir=CACHE_PATH)
+                                              cache_dir=CACHE_PATH, _from_auto=True)
+        self.embed_dim = self.bert.config.max_position_embeddings
 
-    def forward(self, input_ids, attention_mask: torch.Tensor) -> torch.Tensor:
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
         def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
             return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
 
-        num_chunks = math.ceil(input_ids.shape[-1] /
+        num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
         chunks = chunk_results = []
         for i in range(num_chunks):
-            start_idx = i *
-            end_idx = min(start_idx +
+            start_idx = i * self.embed_dim
+            end_idx = min(start_idx + self.embed_dim, input_ids.shape[-1])
             chunks.append((input_ids[:, start_idx: end_idx], attention_mask[:, start_idx: end_idx]))
         ori_mode = self.bert.training
         self.bert.eval()
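Note: the new `embed_dim` attribute is read from `max_position_embeddings` (512 for stock BERT checkpoints) and drives the fixed-width chunking of over-long token sequences above. A standalone sketch of just that window arithmetic; `chunk_bounds` is a hypothetical helper name, not part of the package:

```
import math

def chunk_bounds(seq_len: int, window: int = 512) -> list[tuple[int, int]]:
    # Same arithmetic as BertEncoder.forward: ceil-divide the sequence into
    # windows, clamping the last window to the sequence end.
    num_chunks = math.ceil(seq_len / window)
    return [(i * window, min(i * window + window, seq_len)) for i in range(num_chunks)]

print(chunk_bounds(1300))  # [(0, 512), (512, 1024), (1024, 1300)]
```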
deeplotx/encoder/long_text_encoder.py
CHANGED
@@ -24,7 +24,7 @@ class LongTextEncoder(BertEncoder):
         return input_tup[0], super().forward(input_tup[1], attention_mask=input_tup[2])
 
     @override
-    def encode(self, text: str) -> torch.Tensor:
+    def encode(self, text: str, use_cache: bool = True) -> torch.Tensor:
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
@@ -58,5 +58,6 @@ class LongTextEncoder(BertEncoder):
         fin_emb_tensor = torch.cat((fin_emb_tensor.detach().clone(), emb.detach().clone()), dim=-1)
         fin_emb_tensor = fin_emb_tensor.squeeze()
         # write cache
-
+        if use_cache:
+            self._cache[_text_hash] = fin_emb_tensor
         return fin_emb_tensor
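Note: in the hunk above, only the cache write is guarded by the new `use_cache` flag. A hedged usage sketch; `LongTextEncoder`'s constructor arguments are not part of this diff, so default construction is an assumption:

```
from deeplotx import LongTextEncoder

encoder = LongTextEncoder()  # assumption: default-constructible; check the class for real arguments
emb = encoder.encode('some very long document ...')                   # embedding stored in the in-memory cache
emb = encoder.encode('some very long document ...', use_cache=False)  # skips the cache write shown above
```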
deeplotx/encoder/longformer_encoder.py
ADDED
@@ -0,0 +1,30 @@
+import torch
+from torch import nn
+from transformers import LongformerTokenizer, LongformerModel
+
+from deeplotx import __ROOT__
+
+CACHE_PATH = f'{__ROOT__}\\.cache'
+DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+
+
+class LongformerEncoder(nn.Module):
+    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER):
+        super().__init__()
+        self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                             cache_dir=CACHE_PATH, _from_auto=True)
+        self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                    cache_dir=CACHE_PATH, _from_auto=True)
+
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        ori_mode = self.bert.training
+        self.bert.eval()
+        with torch.no_grad():
+            res = self.bert.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+        self.bert.train(mode=ori_mode)
+        return res
+
+    def encode(self, text: str) -> torch.Tensor:
+        _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long)
+        _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int)
+        return self.forward(_input_ids, _att_mask).squeeze()
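Note: unlike `BertEncoder`, the new `LongformerEncoder` runs the whole sequence in one pass (Longformer natively supports up to 4096 positions) and returns the first-token hidden state. A hedged usage sketch; the printed shape assumes the default `allenai/longformer-base-4096` checkpoint, whose hidden size is 768:

```
from deeplotx import LongformerEncoder

# Weights download into the package's .cache directory on first use.
encoder = LongformerEncoder()
emb = encoder.encode('A document far longer than a vanilla BERT window ...')
print(emb.shape)  # expected: torch.Size([768])
```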
deeplotx/nn/base_neural_network.py
CHANGED
@@ -1,3 +1,5 @@
+from abc import abstractmethod
+
 import torch
 from torch import nn
 
@@ -28,6 +30,7 @@ class BaseNeuralNetwork(nn.Module):
     def elastic_net(self, alpha: float = 1e-4, rho: float = 0.5) -> torch.Tensor:
         return alpha * (rho * self.l1(_lambda=1.) + (1 - rho) * self.l2(_lambda=1.))
 
+    @abstractmethod
     def forward(self, x) -> torch.Tensor: ...
 
     def predict(self, x) -> torch.Tensor:
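Note: because `BaseNeuralNetwork` extends `nn.Module` (metaclass `type`) rather than `abc.ABC` (metaclass `ABCMeta`), the `@abstractmethod` marker here documents intent but is not enforced: instantiating the base class still succeeds. A minimal sketch of what the enforced variant would look like, for contrast:

```
from abc import ABC, abstractmethod

import torch
from torch import nn


class EnforcedBase(nn.Module, ABC):  # ABCMeta subclasses type, so the mix-in is legal
    @abstractmethod
    def forward(self, x) -> torch.Tensor: ...


class Tiny(EnforcedBase):
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 1)

    def forward(self, x) -> torch.Tensor:
        return self.fc(x)


print(Tiny()(torch.randn(2, 4)).shape)  # torch.Size([2, 1])
# EnforcedBase()  # would raise TypeError: can't instantiate abstract class
```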
deeplotx/similarity/__init__.py
ADDED
@@ -0,0 +1,17 @@
+import numpy as np
+import torch
+
+bias = 1e-12
+
+
+def ndarray_adapter(*args) -> tuple | np.ndarray:
+    args = list(args)
+    for i, arg in enumerate(args):
+        match arg.__class__:
+            case torch.Tensor:
+                args[i] = arg.detach().cpu().numpy()
+            case List:
+                args[i] = np.asarray(arg)
+    if len(args) > 1:
+        return tuple(args)
+    return args[0]
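Note: `case List:` is a bare-name capture pattern under Python's `match` semantics (no `List` is imported), so it matches any non-`torch.Tensor` class and routes the value through `np.asarray`; lists are converted and existing ndarrays pass through unchanged. A usage sketch:

```
import numpy as np
import torch

from deeplotx.similarity import ndarray_adapter

a = ndarray_adapter(torch.tensor([1., 2.]))               # single arg -> one np.ndarray
p, q = ndarray_adapter([0.2, 0.8], np.array([0.5, 0.5]))  # multiple args -> tuple of np.ndarray
print(type(a).__name__, type(p).__name__, type(q).__name__)  # ndarray ndarray ndarray
```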
deeplotx/similarity/distribution.py
ADDED
@@ -0,0 +1,32 @@
+import numpy as np
+import torch
+
+from deeplotx.similarity import bias, ndarray_adapter
+
+
+def cross_entropy(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
+    p, q = ndarray_adapter(p, q)
+    q = np.clip(q, bias, 1 - bias)
+    return -1 * (np.sum(p * np.log(q)) / p.shape[0])
+
+
+def kl_divergence(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
+    p, q = ndarray_adapter(p, q)
+    q = np.where(q == 0, bias, q)
+    p = p / np.sum(p)
+    q = q / np.sum(q)
+    return np.sum(p * np.log(p / q))
+
+
+def js_divergence(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
+    p, q = ndarray_adapter(p, q)
+    m = (p + q) / 2
+    return (kl_divergence(p, m) + kl_divergence(q, m)) / 2
+
+
+def hellinger_distance(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
+    p, q = ndarray_adapter(p, q)
+    p = p / np.sum(p)
+    q = q / np.sum(q)
+    squared_diff = (np.sqrt(p) - np.sqrt(q)) ** 2
+    return np.sqrt(np.sum(squared_diff)) / np.sqrt(2)
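A worked example for the distribution measures; KL uses the natural log, so results are in nats, and inputs are normalized internally by `kl_divergence` / `hellinger_distance` (values below rounded):

```
import numpy as np

from deeplotx.similarity.distribution import kl_divergence, js_divergence, hellinger_distance

p = np.array([0.5, 0.5])
q = np.array([0.9, 0.1])
print(kl_divergence(p, q))       # sum(p * ln(p/q)) ~= 0.511
print(js_divergence(p, q))       # symmetrized against m = (p + q) / 2, ~= 0.102
print(hellinger_distance(p, p))  # 0.0 for identical distributions
```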
deeplotx/similarity/set.py
ADDED
@@ -0,0 +1,19 @@
+from deeplotx.similarity import bias
+
+
+def jaccard_similarity(set1: set, set2: set) -> float:
+    return (len(set1.intersection(set2)) + bias) / (len(set1.union(set2)) + bias)
+
+
+def overlap_coefficient(set1: set, set2: set) -> float:
+    return (len(set1.intersection(set2)) + bias) / (min(len(set1), len(set2)) + bias)
+
+
+def dice_coefficient(set1: set, set2: set) -> float:
+    return (2 * len(set1.intersection(set2)) + bias) / (len(set1) + len(set2) + bias)
+
+
+def ochiai_similarity(set1: set, set2: set) -> float:
+    intersection = len(set1.intersection(set2))
+    product = len(set1) ** 0.5 * len(set2) ** 0.5
+    return (intersection + bias) / (product + bias)
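A worked example for the set measures; the shared `bias` term (1e-12) keeps each ratio defined when both sets are empty:

```
from deeplotx.similarity.set import jaccard_similarity, overlap_coefficient, dice_coefficient

a = {'deep', 'long', 'text'}
b = {'long', 'text', 'model'}
print(jaccard_similarity(a, b))   # |a & b| / |a | b| = 2 / 4 = 0.5
print(overlap_coefficient(a, b))  # 2 / min(3, 3) ~= 0.667
print(dice_coefficient(a, b))     # 2 * 2 / (3 + 3) ~= 0.667
```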
deeplotx/similarity/vector.py
ADDED
@@ -0,0 +1,36 @@
+import numpy as np
+import torch
+
+from deeplotx.similarity import ndarray_adapter
+
+
+def l2_normalize(x: torch.Tensor | np.ndarray) -> np.ndarray:
+    x = ndarray_adapter(x)
+    return x / np.sqrt(np.sum(np.multiply(x, x)))
+
+
+def z_score_normalize(x: torch.Tensor | np.ndarray) -> np.ndarray:
+    x = ndarray_adapter(x)
+    mean = np.mean(x)
+    std_dev = np.std(x)
+    return (x - mean) / std_dev
+
+
+def euclidean_similarity(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
+    p, q = ndarray_adapter(p, q)
+    distance = p - q
+    distance = np.sum(np.multiply(distance, distance))
+    return np.sqrt(distance)
+
+
+def cosine_similarity(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
+    p, q = ndarray_adapter(p, q)
+    a = np.matmul(np.transpose(p), q)
+    b = np.sum(np.multiply(p, p))
+    c = np.sum(np.multiply(q, q))
+    return 1 - (a / (np.sqrt(b) * np.sqrt(c)))
+
+
+def chebyshev_similarity(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
+    p, q = ndarray_adapter(p, q)
+    return np.max(np.abs(p - q))
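Note from the source: despite their names, these functions return distances rather than similarities. `cosine_similarity` is `1 - cos(theta)` (0 for parallel vectors, 1 for orthogonal ones), and `euclidean_similarity` / `chebyshev_similarity` are the plain L2 and L-infinity distances. A usage sketch:

```
import numpy as np

from deeplotx.similarity.vector import cosine_similarity, euclidean_similarity, chebyshev_similarity

p = np.array([1., 0.])
q = np.array([0., 1.])
print(cosine_similarity(p, p))     # 0.0 (same direction)
print(cosine_similarity(p, q))     # 1.0 (orthogonal)
print(euclidean_similarity(p, q))  # sqrt(2) ~= 1.414
print(chebyshev_similarity(p, q))  # 1.0 (largest coordinate gap)
```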
{deeplotx-0.2.20.dist-info → deeplotx-0.3.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.2.20
+Version: 0.3.1
 Summary: Easy-2-use long text classifier trainers.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -21,13 +21,19 @@ Dynamic: license-file
 - Install with pip
 
 ```
-pip install
+pip install -U deeplotx
 ```
 
 - Install with uv
 
 ```
-uv add
+uv add -U deeplotx
+```
+
+- Install from github
+
+```
+pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
 ```
 
 ## Quick Start
deeplotx-0.3.1.dist-info/RECORD
ADDED
@@ -0,0 +1,25 @@
+deeplotx/__init__.py,sha256=C6N717chqnk3jqh9nuh9oM5hPldX9mCusCn-LqGWJJg,935
+deeplotx/encoder/__init__.py,sha256=EM-xrTsHoGaiiFpj-iFAxilMHXC_sQKWYrcq1qCnI3U,138
+deeplotx/encoder/bert_encoder.py,sha256=A-B7Gj94xv6UhvsFTBH7tnkAdGHRhfUZA2QjSnTKB6c,1970
+deeplotx/encoder/long_text_encoder.py,sha256=V6VxaHW6bMMaZHgU1UZ8n19UfSIV2f2sarWXquiFffQ,3018
+deeplotx/encoder/longformer_encoder.py,sha256=mZpC5TrGHQo98-ydGtVQQ9KRHgCGl1sRoxcQs7r4SSo,1409
+deeplotx/nn/__init__.py,sha256=9gh8rhKqVWtJyvryU_wHPTLEQIorwOBhAQRc0DtNamM,153
+deeplotx/nn/base_neural_network.py,sha256=Rkwu58mXXcuusf-59yLX89MywQx-EvTsSXOvlzUptRE,1621
+deeplotx/nn/linear_regression.py,sha256=D4mEWVOq6q1Fm2otm57rgZ_E06HJLZBV5k636PprAf4,1520
+deeplotx/nn/logistic_regression.py,sha256=QAtZp2oyqOW8-1pJWVcahsSM83bzfA68EHObg-wSHHY,463
+deeplotx/nn/softmax_regression.py,sha256=eUn3mVNlye9ewVdw3McPHZuKbUvvaamsUgFIJMVMgBU,487
+deeplotx/similarity/__init__.py,sha256=JA1om2zeDcQVS1R04nDMdP6yegxdLJ14WF63pSTL3oo,418
+deeplotx/similarity/distribution.py,sha256=wQGouuuW531pZeBRKBujXsdsoz4fDnPw7_GW81jwepc,1066
+deeplotx/similarity/set.py,sha256=zhGFxtSIXlWqvipBYzoiPahp4g0boAIoUiMfG0wl07A,686
+deeplotx/similarity/vector.py,sha256=WVbDHqykt-fvuILVrhUCtIFAOEjY_zvttrXGM9eylG0,1125
+deeplotx/trainer/__init__.py,sha256=Fl5DR9UecQc5VtBcczU9sx_HtPNoFohpuELOh-Jrsks,77
+deeplotx/trainer/base_trainer.py,sha256=z0MeAT-rRYmjeBXt0ckt7J1itYArR0Cx02wHesXUoZE,385
+deeplotx/trainer/text_binary_classification_trainer.py,sha256=5O-5dwVMCj5EDX9gjJwCA468OR4UozJ7V8b-JxeUB0s,4080
+deeplotx/util/__init__.py,sha256=JxqAK_WOOHcYVSTHBT1-WuBwWrPEVDTV3titeVWvNUM,74
+deeplotx/util/hash.py,sha256=wwsC6kOQvbpuvwKsNQOARd78_wePmW9i3oaUuXRUnpc,352
+deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
+deeplotx-0.3.1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.3.1.dist-info/METADATA,sha256=GzIi1llGFDTLvHDLF3GOQ4G6MafM10M-7IeosZwZlaY,1616
+deeplotx-0.3.1.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
+deeplotx-0.3.1.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.3.1.dist-info/RECORD,,
deeplotx-0.2.20.dist-info/RECORD
DELETED
@@ -1,20 +0,0 @@
|
|
1
|
-
deeplotx/__init__.py,sha256=Bhxc6HRnuhPZCMNlBc6oKcFTpJbWRGrZmt00vVOsNf0,916
|
2
|
-
deeplotx/encoder/__init__.py,sha256=x7k8IE0FXvDl7kCJGWPsetOHFdvNCiCXHbYOdvo7_JQ,87
|
3
|
-
deeplotx/encoder/bert_encoder.py,sha256=rdT8YgZzvRoqYqtzPW95ilagSQTAQgUl7mMVetGKxCY,1822
|
4
|
-
deeplotx/encoder/long_text_encoder.py,sha256=yEEtTVZYHJ0W3OSbh7BHm6xI33nJmVYlSrgD5RVcJLY,2967
|
5
|
-
deeplotx/nn/__init__.py,sha256=9gh8rhKqVWtJyvryU_wHPTLEQIorwOBhAQRc0DtNamM,153
|
6
|
-
deeplotx/nn/base_neural_network.py,sha256=MXuID5bagdHyrFOkoybW1oiXAY2d4FGnzZoR37LZfUI,1566
|
7
|
-
deeplotx/nn/linear_regression.py,sha256=D4mEWVOq6q1Fm2otm57rgZ_E06HJLZBV5k636PprAf4,1520
|
8
|
-
deeplotx/nn/logistic_regression.py,sha256=QAtZp2oyqOW8-1pJWVcahsSM83bzfA68EHObg-wSHHY,463
|
9
|
-
deeplotx/nn/softmax_regression.py,sha256=eUn3mVNlye9ewVdw3McPHZuKbUvvaamsUgFIJMVMgBU,487
|
10
|
-
deeplotx/trainer/__init__.py,sha256=Fl5DR9UecQc5VtBcczU9sx_HtPNoFohpuELOh-Jrsks,77
|
11
|
-
deeplotx/trainer/base_trainer.py,sha256=z0MeAT-rRYmjeBXt0ckt7J1itYArR0Cx02wHesXUoZE,385
|
12
|
-
deeplotx/trainer/text_binary_classification_trainer.py,sha256=5O-5dwVMCj5EDX9gjJwCA468OR4UozJ7V8b-JxeUB0s,4080
|
13
|
-
deeplotx/util/__init__.py,sha256=JxqAK_WOOHcYVSTHBT1-WuBwWrPEVDTV3titeVWvNUM,74
|
14
|
-
deeplotx/util/hash.py,sha256=wwsC6kOQvbpuvwKsNQOARd78_wePmW9i3oaUuXRUnpc,352
|
15
|
-
deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
|
16
|
-
deeplotx-0.2.20.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
|
17
|
-
deeplotx-0.2.20.dist-info/METADATA,sha256=NQgRWucDSAI4awAJNf9984IujFRo9PurR1qrqpmWIzA,1573
|
18
|
-
deeplotx-0.2.20.dist-info/WHEEL,sha256=0CuiUZ_p9E4cD6NyLD6UG80LBXYyiSYZOKDm5lp32xk,91
|
19
|
-
deeplotx-0.2.20.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
|
20
|
-
deeplotx-0.2.20.dist-info/RECORD,,
|
{deeplotx-0.2.20.dist-info → deeplotx-0.3.1.dist-info}/licenses/LICENSE
File without changes
{deeplotx-0.2.20.dist-info → deeplotx-0.3.1.dist-info}/top_level.txt
File without changes