deeplotx 0.5.3__tar.gz → 0.5.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deeplotx-0.5.3 → deeplotx-0.5.6}/PKG-INFO +7 -5
- {deeplotx-0.5.3 → deeplotx-0.5.6}/README.md +2 -1
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/__init__.py +1 -1
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/encoder/__init__.py +1 -1
- deeplotx-0.5.3/deeplotx/encoder/bert_encoder.py → deeplotx-0.5.6/deeplotx/encoder/encoder.py +13 -13
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/encoder/long_text_encoder.py +21 -17
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/trainer/text_binary_classification_trainer.py +1 -1
- deeplotx-0.5.6/deeplotx/util/hash.py +29 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/PKG-INFO +7 -5
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/SOURCES.txt +1 -1
- deeplotx-0.5.6/deeplotx.egg-info/requires.txt +9 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/pyproject.toml +17 -16
- deeplotx-0.5.3/deeplotx/util/hash.py +0 -15
- deeplotx-0.5.3/deeplotx.egg-info/requires.txt +0 -8
- {deeplotx-0.5.3 → deeplotx-0.5.6}/LICENSE +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/encoder/longformer_encoder.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/__init__.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/auto_regression.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/base_neural_network.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/linear_regression.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/logistic_regression.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/long_context_auto_regression.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/recursive_sequential.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/self_attention.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/softmax_regression.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/__init__.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/distribution.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/set.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/vector.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/trainer/__init__.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/trainer/base_trainer.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/util/__init__.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/util/read_file.py +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/dependency_links.txt +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/top_level.txt +0 -0
- {deeplotx-0.5.3 → deeplotx-0.5.6}/setup.cfg +0 -0
@@ -1,18 +1,19 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: deeplotx
|
3
|
-
Version: 0.5.3
|
3
|
+
Version: 0.5.6
|
4
4
|
Summary: Easy-2-use long text NLP toolkit.
|
5
5
|
Requires-Python: >=3.10
|
6
6
|
Description-Content-Type: text/markdown
|
7
7
|
License-File: LICENSE
|
8
|
-
Requires-Dist: hf-xet
|
8
|
+
Requires-Dist: hf-xet
|
9
9
|
Requires-Dist: jupyter
|
10
10
|
Requires-Dist: numpy
|
11
11
|
Requires-Dist: protobuf
|
12
|
-
Requires-Dist: python-dotenv
|
12
|
+
Requires-Dist: python-dotenv
|
13
13
|
Requires-Dist: torch
|
14
14
|
Requires-Dist: transformers
|
15
|
-
Requires-Dist: typing-extensions
|
15
|
+
Requires-Dist: typing-extensions
|
16
|
+
Requires-Dist: vortezwohl>=0.0.6
|
16
17
|
Dynamic: license-file
|
17
18
|
|
18
19
|
[](https://deepwiki.com/vortezwohl/DeepLoTX)
|
@@ -264,7 +265,8 @@ Dynamic: license-file
|
|
264
265
|
long_text_encoder = LongTextEncoder(
|
265
266
|
max_length=2048, # 最大文本大小, 超出截断
|
266
267
|
chunk_size=448, # 块大小 (按 Token 计)
|
267
|
-
overlapping=32 # 块间重叠大小 (按 Token 计)
|
268
|
+
overlapping=32, # 块间重叠大小 (按 Token 计)
|
269
|
+
cache_capacity=512 # 缓存大小
|
268
270
|
)
|
269
271
|
|
270
272
|
trainer = TextBinaryClassifierTrainer(
|
@@ -247,7 +247,8 @@
|
|
247
247
|
long_text_encoder = LongTextEncoder(
|
248
248
|
max_length=2048, # 最大文本大小, 超出截断
|
249
249
|
chunk_size=448, # 块大小 (按 Token 计)
|
250
|
-
overlapping=32 # 块间重叠大小 (按 Token 计)
|
250
|
+
overlapping=32, # 块间重叠大小 (按 Token 计)
|
251
|
+
cache_capacity=512 # 缓存大小
|
251
252
|
)
|
252
253
|
|
253
254
|
trainer = TextBinaryClassifierTrainer(
|
@@ -3,7 +3,7 @@ import os
|
|
3
3
|
|
4
4
|
__ROOT__ = os.path.dirname(os.path.abspath(__file__))
|
5
5
|
|
6
|
-
from .encoder import
|
6
|
+
from .encoder import Encoder, LongTextEncoder, LongformerEncoder
|
7
7
|
from .nn import (
|
8
8
|
LinearRegression,
|
9
9
|
LogisticRegression,
|
deeplotx-0.5.3/deeplotx/encoder/bert_encoder.py → deeplotx-0.5.6/deeplotx/encoder/encoder.py
RENAMED
@@ -4,30 +4,30 @@ import math
|
|
4
4
|
|
5
5
|
import torch
|
6
6
|
from torch import nn
|
7
|
-
from transformers import
|
7
|
+
from transformers import AutoTokenizer, AutoModel
|
8
8
|
|
9
9
|
from deeplotx import __ROOT__
|
10
10
|
|
11
11
|
CACHE_PATH = os.path.join(__ROOT__, '.cache')
|
12
|
-
DEFAULT_BERT = '
|
12
|
+
DEFAULT_BERT = 'FacebookAI/xlm-roberta-base'
|
13
13
|
logger = logging.getLogger('deeplotx.embedding')
|
14
14
|
|
15
15
|
|
16
|
-
class BertEncoder(nn.Module):
|
16
|
+
class Encoder(nn.Module):
|
17
17
|
def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
|
18
18
|
super().__init__()
|
19
19
|
self.device = torch.device(device) if device is not None \
|
20
20
|
else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
21
|
-
self.tokenizer =
|
21
|
+
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
|
22
22
|
cache_dir=CACHE_PATH, _from_auto=True)
|
23
|
-
self.
|
24
|
-
|
25
|
-
self.embed_dim = self.
|
26
|
-
logger.debug(f'{
|
23
|
+
self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
|
24
|
+
cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
|
25
|
+
self.embed_dim = self.encoder.config.max_position_embeddings
|
26
|
+
logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
|
27
27
|
|
28
|
-
def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
|
28
|
+
def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
29
29
|
def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
30
|
-
return self.
|
30
|
+
return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
|
31
31
|
|
32
32
|
num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
|
33
33
|
chunks = chunk_results = []
|
@@ -35,11 +35,11 @@ class BertEncoder(nn.Module):
|
|
35
35
|
start_idx = i * self.embed_dim
|
36
36
|
end_idx = min(start_idx + self.embed_dim, input_ids.shape[-1])
|
37
37
|
chunks.append((input_ids[:, start_idx: end_idx], attention_mask[:, start_idx: end_idx]))
|
38
|
-
ori_mode = self.
|
39
|
-
self.
|
38
|
+
ori_mode = self.encoder.training
|
39
|
+
self.encoder.eval()
|
40
40
|
with torch.no_grad():
|
41
41
|
chunk_results = [_encoder(x) for x in chunks]
|
42
|
-
self.
|
42
|
+
self.encoder.train(mode=ori_mode)
|
43
43
|
return torch.cat(chunk_results, dim=-1)
|
44
44
|
|
45
45
|
def encode(self, text: str) -> torch.Tensor:
|
@@ -1,30 +1,36 @@
|
|
1
1
|
import logging
|
2
2
|
import math
|
3
|
-
from concurrent.futures import ThreadPoolExecutor
|
4
3
|
from typing_extensions import override
|
5
4
|
|
6
5
|
import torch
|
6
|
+
from vortezwohl.concurrent import ThreadPool
|
7
|
+
from vortezwohl.cache import LRUCache
|
7
8
|
|
8
|
-
from deeplotx.encoder.
|
9
|
-
from deeplotx.util.hash import
|
9
|
+
from deeplotx.encoder.encoder import Encoder, DEFAULT_BERT
|
10
|
+
from deeplotx.util.hash import sha512
|
10
11
|
|
11
12
|
logger = logging.getLogger('deeplotx.embedding')
|
12
13
|
|
13
14
|
|
14
|
-
class LongTextEncoder(BertEncoder):
|
15
|
-
def __init__(self, max_length: int, chunk_size: int =
|
16
|
-
overlapping: int =
|
15
|
+
class LongTextEncoder(Encoder):
|
16
|
+
def __init__(self, max_length: int, chunk_size: int = 448,
|
17
|
+
overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
|
18
|
+
cache_capacity: int = 64, device: str | None = None):
|
17
19
|
super().__init__(model_name_or_path=model_name_or_path, device=device)
|
18
20
|
self._max_length = max_length
|
19
21
|
self._chunk_size = chunk_size
|
20
22
|
self._overlapping = overlapping
|
21
|
-
self._cache =
|
23
|
+
self._cache = LRUCache(capacity=cache_capacity)
|
22
24
|
|
23
|
-
def __chunk_embedding(self,
|
24
|
-
return
|
25
|
+
def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
|
26
|
+
return idx, super().forward(x, attention_mask=mask)
|
25
27
|
|
26
28
|
@override
|
27
|
-
def
|
29
|
+
def forward(self, text: str, flatten: bool = False, *args, **kwargs) -> torch.Tensor:
|
30
|
+
return self.encode(text=text, flatten=flatten)
|
31
|
+
|
32
|
+
@override
|
33
|
+
def encode(self, text: str, flatten: bool = False) -> torch.Tensor:
|
28
34
|
def postprocess(tensors: list[torch.Tensor], _flatten: bool) -> torch.Tensor:
|
29
35
|
if not _flatten:
|
30
36
|
return torch.stack(tensors, dim=0).squeeze()
|
@@ -36,8 +42,8 @@ class LongTextEncoder(BertEncoder):
|
|
36
42
|
_text_to_show = text.replace("\n", str())
|
37
43
|
logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
|
38
44
|
# read cache
|
39
|
-
_text_hash =
|
40
|
-
if _text_hash in self._cache
|
45
|
+
_text_hash = sha512(text)
|
46
|
+
if _text_hash in self._cache:
|
41
47
|
return postprocess(self._cache[_text_hash], flatten)
|
42
48
|
_text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
|
43
49
|
_text_to_input_ids_att_mask = []
|
@@ -57,11 +63,9 @@ class LongTextEncoder(BertEncoder):
|
|
57
63
|
_tmp_right = (i + 1) * self._chunk_size + self._overlapping
|
58
64
|
chunks.append((i, torch.tensor([_text_to_input_ids[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device),
|
59
65
|
torch.tensor([_text_to_input_ids_att_mask[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device)))
|
60
|
-
|
61
|
-
|
62
|
-
embeddings.sort(key=lambda x: x[0])
|
66
|
+
embeddings = list(ThreadPool(max_workers=min(num_chunks + 1, 8)).map(self.__chunk_embedding, chunks))
|
67
|
+
embeddings = sorted([x.returns for x in embeddings], key=lambda x: x[0], reverse=False)
|
63
68
|
fin_embedding = [x[1] for x in embeddings]
|
64
69
|
# write cache
|
65
|
-
|
66
|
-
self._cache[_text_hash] = fin_embedding
|
70
|
+
self._cache[_text_hash] = fin_embedding
|
67
71
|
return postprocess(fin_embedding, flatten)
|
@@ -31,7 +31,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
|
|
31
31
|
positive_texts = positive_texts[:min_length]
|
32
32
|
negative_texts = negative_texts[:min_length]
|
33
33
|
all_texts = positive_texts + negative_texts
|
34
|
-
text_embeddings = [self._long_text_encoder.encode(x, flatten=False
|
34
|
+
text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
|
35
35
|
feature_dim = text_embeddings[0].shape[-1]
|
36
36
|
dtype = text_embeddings[0].dtype
|
37
37
|
labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]
|
@@ -0,0 +1,29 @@
|
|
1
|
+
import hashlib
|
2
|
+
|
3
|
+
|
4
|
+
def md5(text: str) -> str:
|
5
|
+
_hash = hashlib.md5()
|
6
|
+
text_bytes = text.encode('utf-8')
|
7
|
+
_hash.update(text_bytes)
|
8
|
+
return _hash.hexdigest()
|
9
|
+
|
10
|
+
|
11
|
+
def sha1(text: str) -> str:
|
12
|
+
_hash = hashlib.sha1()
|
13
|
+
text_bytes = text.encode('utf-8')
|
14
|
+
_hash.update(text_bytes)
|
15
|
+
return _hash.hexdigest()
|
16
|
+
|
17
|
+
|
18
|
+
def sha256(text: str) -> str:
|
19
|
+
_hash = hashlib.sha256()
|
20
|
+
text_bytes = text.encode('utf-8')
|
21
|
+
_hash.update(text_bytes)
|
22
|
+
return _hash.hexdigest()
|
23
|
+
|
24
|
+
|
25
|
+
def sha512(text: str) -> str:
|
26
|
+
_hash = hashlib.sha512()
|
27
|
+
text_bytes = text.encode('utf-8')
|
28
|
+
_hash.update(text_bytes)
|
29
|
+
return _hash.hexdigest()
|
@@ -1,18 +1,19 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: deeplotx
|
3
|
-
Version: 0.5.3
|
3
|
+
Version: 0.5.6
|
4
4
|
Summary: Easy-2-use long text NLP toolkit.
|
5
5
|
Requires-Python: >=3.10
|
6
6
|
Description-Content-Type: text/markdown
|
7
7
|
License-File: LICENSE
|
8
|
-
Requires-Dist: hf-xet
|
8
|
+
Requires-Dist: hf-xet
|
9
9
|
Requires-Dist: jupyter
|
10
10
|
Requires-Dist: numpy
|
11
11
|
Requires-Dist: protobuf
|
12
|
-
Requires-Dist: python-dotenv
|
12
|
+
Requires-Dist: python-dotenv
|
13
13
|
Requires-Dist: torch
|
14
14
|
Requires-Dist: transformers
|
15
|
-
Requires-Dist: typing-extensions
|
15
|
+
Requires-Dist: typing-extensions
|
16
|
+
Requires-Dist: vortezwohl>=0.0.6
|
16
17
|
Dynamic: license-file
|
17
18
|
|
18
19
|
[](https://deepwiki.com/vortezwohl/DeepLoTX)
|
@@ -264,7 +265,8 @@ Dynamic: license-file
|
|
264
265
|
long_text_encoder = LongTextEncoder(
|
265
266
|
max_length=2048, # 最大文本大小, 超出截断
|
266
267
|
chunk_size=448, # 块大小 (按 Token 计)
|
267
|
-
overlapping=32 # 块间重叠大小 (按 Token 计)
|
268
|
+
overlapping=32, # 块间重叠大小 (按 Token 计)
|
269
|
+
cache_capacity=512 # 缓存大小
|
268
270
|
)
|
269
271
|
|
270
272
|
trainer = TextBinaryClassifierTrainer(
|
@@ -8,7 +8,7 @@ deeplotx.egg-info/dependency_links.txt
|
|
8
8
|
deeplotx.egg-info/requires.txt
|
9
9
|
deeplotx.egg-info/top_level.txt
|
10
10
|
deeplotx/encoder/__init__.py
|
11
|
-
deeplotx/encoder/bert_encoder.py
|
11
|
+
deeplotx/encoder/encoder.py
|
12
12
|
deeplotx/encoder/long_text_encoder.py
|
13
13
|
deeplotx/encoder/longformer_encoder.py
|
14
14
|
deeplotx/nn/__init__.py
|
@@ -1,16 +1,17 @@
|
|
1
|
-
[project]
|
2
|
-
name = "deeplotx"
|
3
|
-
version = "0.5.3"
|
4
|
-
description = "Easy-2-use long text NLP toolkit."
|
5
|
-
readme = "README.md"
|
6
|
-
requires-python = ">=3.10"
|
7
|
-
dependencies = [
|
8
|
-
"hf-xet
|
9
|
-
"jupyter",
|
10
|
-
"numpy",
|
11
|
-
"protobuf",
|
12
|
-
"python-dotenv
|
13
|
-
"torch",
|
14
|
-
"transformers",
|
15
|
-
"typing-extensions
|
16
|
-
|
1
|
+
[project]
|
2
|
+
name = "deeplotx"
|
3
|
+
version = "0.5.6"
|
4
|
+
description = "Easy-2-use long text NLP toolkit."
|
5
|
+
readme = "README.md"
|
6
|
+
requires-python = ">=3.10"
|
7
|
+
dependencies = [
|
8
|
+
"hf-xet",
|
9
|
+
"jupyter",
|
10
|
+
"numpy",
|
11
|
+
"protobuf",
|
12
|
+
"python-dotenv",
|
13
|
+
"torch",
|
14
|
+
"transformers",
|
15
|
+
"typing-extensions",
|
16
|
+
"vortezwohl>=0.0.6",
|
17
|
+
]
|
@@ -1,15 +0,0 @@
|
|
1
|
-
import hashlib
|
2
|
-
|
3
|
-
|
4
|
-
def md5(text: str) -> str:
|
5
|
-
md5_hash = hashlib.md5()
|
6
|
-
text_bytes = text.encode('utf-8')
|
7
|
-
md5_hash.update(text_bytes)
|
8
|
-
return md5_hash.hexdigest()
|
9
|
-
|
10
|
-
|
11
|
-
def sha1(text: str) -> str:
|
12
|
-
md5_hash = hashlib.sha1()
|
13
|
-
text_bytes = text.encode('utf-8')
|
14
|
-
md5_hash.update(text_bytes)
|
15
|
-
return md5_hash.hexdigest()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|