deeplotx 0.5.3__tar.gz → 0.5.5__tar.gz

This diff shows the differences between the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (37)
  1. {deeplotx-0.5.3 → deeplotx-0.5.5}/PKG-INFO +5 -4
  2. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/__init__.py +1 -1
  3. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/encoder/__init__.py +1 -1
  4. deeplotx-0.5.3/deeplotx/encoder/bert_encoder.py → deeplotx-0.5.5/deeplotx/encoder/encoder.py +12 -12
  5. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/encoder/long_text_encoder.py +17 -17
  6. deeplotx-0.5.5/deeplotx/util/hash.py +29 -0
  7. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx.egg-info/PKG-INFO +5 -4
  8. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx.egg-info/SOURCES.txt +1 -1
  9. deeplotx-0.5.5/deeplotx.egg-info/requires.txt +9 -0
  10. {deeplotx-0.5.3 → deeplotx-0.5.5}/pyproject.toml +17 -16
  11. deeplotx-0.5.3/deeplotx/util/hash.py +0 -15
  12. deeplotx-0.5.3/deeplotx.egg-info/requires.txt +0 -8
  13. {deeplotx-0.5.3 → deeplotx-0.5.5}/LICENSE +0 -0
  14. {deeplotx-0.5.3 → deeplotx-0.5.5}/README.md +0 -0
  15. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/encoder/longformer_encoder.py +0 -0
  16. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/__init__.py +0 -0
  17. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/auto_regression.py +0 -0
  18. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/base_neural_network.py +0 -0
  19. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/linear_regression.py +0 -0
  20. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/logistic_regression.py +0 -0
  21. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/long_context_auto_regression.py +0 -0
  22. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  23. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/recursive_sequential.py +0 -0
  24. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/self_attention.py +0 -0
  25. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/nn/softmax_regression.py +0 -0
  26. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/similarity/__init__.py +0 -0
  27. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/similarity/distribution.py +0 -0
  28. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/similarity/set.py +0 -0
  29. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/similarity/vector.py +0 -0
  30. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/trainer/__init__.py +0 -0
  31. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/trainer/base_trainer.py +0 -0
  32. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
  33. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/util/__init__.py +0 -0
  34. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx/util/read_file.py +0 -0
  35. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx.egg-info/dependency_links.txt +0 -0
  36. {deeplotx-0.5.3 → deeplotx-0.5.5}/deeplotx.egg-info/top_level.txt +0 -0
  37. {deeplotx-0.5.3 → deeplotx-0.5.5}/setup.cfg +0 -0
@@ -1,18 +1,19 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.5.3
3
+ Version: 0.5.5
4
4
  Summary: Easy-2-use long text NLP toolkit.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
7
7
  License-File: LICENSE
8
- Requires-Dist: hf-xet>=1.0.5
8
+ Requires-Dist: hf-xet
9
9
  Requires-Dist: jupyter
10
10
  Requires-Dist: numpy
11
11
  Requires-Dist: protobuf
12
- Requires-Dist: python-dotenv>=1.1.0
12
+ Requires-Dist: python-dotenv
13
13
  Requires-Dist: torch
14
14
  Requires-Dist: transformers
15
- Requires-Dist: typing-extensions>=4.13.2
15
+ Requires-Dist: typing-extensions
16
+ Requires-Dist: vortezwohl>=0.0.6
16
17
  Dynamic: license-file
17
18
 
18
19
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -3,7 +3,7 @@ import os
3
3
 
4
4
  __ROOT__ = os.path.dirname(os.path.abspath(__file__))
5
5
 
6
- from .encoder import BertEncoder, LongTextEncoder, LongformerEncoder
6
+ from .encoder import Encoder, LongTextEncoder, LongformerEncoder
7
7
  from .nn import (
8
8
  LinearRegression,
9
9
  LogisticRegression,
@@ -1,3 +1,3 @@
1
- from .bert_encoder import BertEncoder
1
+ from .encoder import Encoder
2
2
  from .long_text_encoder import LongTextEncoder
3
3
  from .longformer_encoder import LongformerEncoder
@@ -4,30 +4,30 @@ import math
4
4
 
5
5
  import torch
6
6
  from torch import nn
7
- from transformers import BertTokenizer, BertModel
7
+ from transformers import AutoTokenizer, AutoModel
8
8
 
9
9
  from deeplotx import __ROOT__
10
10
 
11
11
  CACHE_PATH = os.path.join(__ROOT__, '.cache')
12
- DEFAULT_BERT = 'bert-base-uncased'
12
+ DEFAULT_BERT = 'FacebookAI/xlm-roberta-base'
13
13
  logger = logging.getLogger('deeplotx.embedding')
14
14
 
15
15
 
16
- class BertEncoder(nn.Module):
16
+ class Encoder(nn.Module):
17
17
  def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
18
18
  super().__init__()
19
19
  self.device = torch.device(device) if device is not None \
20
20
  else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
21
- self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
21
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
22
22
  cache_dir=CACHE_PATH, _from_auto=True)
23
- self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
24
- cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
25
- self.embed_dim = self.bert.config.max_position_embeddings
26
- logger.debug(f'{BertEncoder.__name__} initialized on device: {self.device}.')
23
+ self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
24
+ cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
25
+ self.embed_dim = self.encoder.config.max_position_embeddings
26
+ logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
27
27
 
28
28
  def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
29
29
  def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
30
- return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
30
+ return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
31
31
 
32
32
  num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
33
33
  chunks = chunk_results = []
@@ -35,11 +35,11 @@ class BertEncoder(nn.Module):
35
35
  start_idx = i * self.embed_dim
36
36
  end_idx = min(start_idx + self.embed_dim, input_ids.shape[-1])
37
37
  chunks.append((input_ids[:, start_idx: end_idx], attention_mask[:, start_idx: end_idx]))
38
- ori_mode = self.bert.training
39
- self.bert.eval()
38
+ ori_mode = self.encoder.training
39
+ self.encoder.eval()
40
40
  with torch.no_grad():
41
41
  chunk_results = [_encoder(x) for x in chunks]
42
- self.bert.train(mode=ori_mode)
42
+ self.encoder.train(mode=ori_mode)
43
43
  return torch.cat(chunk_results, dim=-1)
44
44
 
45
45
  def encode(self, text: str) -> torch.Tensor:
@@ -1,30 +1,32 @@
1
1
  import logging
2
2
  import math
3
- from concurrent.futures import ThreadPoolExecutor
4
3
  from typing_extensions import override
5
4
 
6
5
  import torch
6
+ from vortezwohl.concurrent import ThreadPool
7
+ from vortezwohl.cache import LRUCache
7
8
 
8
- from deeplotx.encoder.bert_encoder import BertEncoder, DEFAULT_BERT
9
- from deeplotx.util.hash import md5
9
+ from deeplotx.encoder.encoder import Encoder, DEFAULT_BERT
10
+ from deeplotx.util.hash import sha512
10
11
 
11
12
  logger = logging.getLogger('deeplotx.embedding')
12
13
 
13
14
 
14
- class LongTextEncoder(BertEncoder):
15
- def __init__(self, max_length: int, chunk_size: int = 256,
16
- overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
15
+ class LongTextEncoder(Encoder):
16
+ def __init__(self, max_length: int, chunk_size: int = 448,
17
+ overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
18
+ cache_capacity: int = 64, device: str | None = None):
17
19
  super().__init__(model_name_or_path=model_name_or_path, device=device)
18
20
  self._max_length = max_length
19
21
  self._chunk_size = chunk_size
20
22
  self._overlapping = overlapping
21
- self._cache = dict()
23
+ self._cache = LRUCache(capacity=cache_capacity)
22
24
 
23
- def __chunk_embedding(self, input_tup: tuple[int, torch.Tensor]) -> tuple[int, torch.Tensor]:
24
- return input_tup[0], super().forward(input_tup[1], attention_mask=input_tup[2])
25
+ def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
26
+ return idx, super().forward(x, attention_mask=mask)
25
27
 
26
28
  @override
27
- def encode(self, text: str, flatten: bool = True, use_cache: bool = True) -> torch.Tensor:
29
+ def encode(self, text: str, flatten: bool = False) -> torch.Tensor:
28
30
  def postprocess(tensors: list[torch.Tensor], _flatten: bool) -> torch.Tensor:
29
31
  if not _flatten:
30
32
  return torch.stack(tensors, dim=0).squeeze()
@@ -36,8 +38,8 @@ class LongTextEncoder(BertEncoder):
36
38
  _text_to_show = text.replace("\n", str())
37
39
  logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
38
40
  # read cache
39
- _text_hash = md5(text)
40
- if _text_hash in self._cache.keys():
41
+ _text_hash = sha512(text)
42
+ if _text_hash in self._cache:
41
43
  return postprocess(self._cache[_text_hash], flatten)
42
44
  _text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
43
45
  _text_to_input_ids_att_mask = []
@@ -57,11 +59,9 @@ class LongTextEncoder(BertEncoder):
57
59
  _tmp_right = (i + 1) * self._chunk_size + self._overlapping
58
60
  chunks.append((i, torch.tensor([_text_to_input_ids[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device),
59
61
  torch.tensor([_text_to_input_ids_att_mask[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device)))
60
- with ThreadPoolExecutor(max_workers=min(num_chunks + 1, 3)) as executor:
61
- embeddings = list(executor.map(self.__chunk_embedding, chunks))
62
- embeddings.sort(key=lambda x: x[0])
62
+ embeddings = list(ThreadPool(max_workers=min(num_chunks + 1, 8)).map(self.__chunk_embedding, chunks))
63
+ embeddings = sorted([x.returns for x in embeddings], key=lambda x: x[0], reverse=False)
63
64
  fin_embedding = [x[1] for x in embeddings]
64
65
  # write cache
65
- if use_cache:
66
- self._cache[_text_hash] = fin_embedding
66
+ self._cache[_text_hash] = fin_embedding
67
67
  return postprocess(fin_embedding, flatten)
@@ -0,0 +1,29 @@
1
+ import hashlib
2
+
3
+
4
+ def md5(text: str) -> str:
5
+ _hash = hashlib.md5()
6
+ text_bytes = text.encode('utf-8')
7
+ _hash.update(text_bytes)
8
+ return _hash.hexdigest()
9
+
10
+
11
+ def sha1(text: str) -> str:
12
+ _hash = hashlib.sha1()
13
+ text_bytes = text.encode('utf-8')
14
+ _hash.update(text_bytes)
15
+ return _hash.hexdigest()
16
+
17
+
18
+ def sha256(text: str) -> str:
19
+ _hash = hashlib.sha256()
20
+ text_bytes = text.encode('utf-8')
21
+ _hash.update(text_bytes)
22
+ return _hash.hexdigest()
23
+
24
+
25
+ def sha512(text: str) -> str:
26
+ _hash = hashlib.sha512()
27
+ text_bytes = text.encode('utf-8')
28
+ _hash.update(text_bytes)
29
+ return _hash.hexdigest()
@@ -1,18 +1,19 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.5.3
3
+ Version: 0.5.5
4
4
  Summary: Easy-2-use long text NLP toolkit.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
7
7
  License-File: LICENSE
8
- Requires-Dist: hf-xet>=1.0.5
8
+ Requires-Dist: hf-xet
9
9
  Requires-Dist: jupyter
10
10
  Requires-Dist: numpy
11
11
  Requires-Dist: protobuf
12
- Requires-Dist: python-dotenv>=1.1.0
12
+ Requires-Dist: python-dotenv
13
13
  Requires-Dist: torch
14
14
  Requires-Dist: transformers
15
- Requires-Dist: typing-extensions>=4.13.2
15
+ Requires-Dist: typing-extensions
16
+ Requires-Dist: vortezwohl>=0.0.6
16
17
  Dynamic: license-file
17
18
 
18
19
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -8,7 +8,7 @@ deeplotx.egg-info/dependency_links.txt
8
8
  deeplotx.egg-info/requires.txt
9
9
  deeplotx.egg-info/top_level.txt
10
10
  deeplotx/encoder/__init__.py
11
- deeplotx/encoder/bert_encoder.py
11
+ deeplotx/encoder/encoder.py
12
12
  deeplotx/encoder/long_text_encoder.py
13
13
  deeplotx/encoder/longformer_encoder.py
14
14
  deeplotx/nn/__init__.py
@@ -0,0 +1,9 @@
1
+ hf-xet
2
+ jupyter
3
+ numpy
4
+ protobuf
5
+ python-dotenv
6
+ torch
7
+ transformers
8
+ typing-extensions
9
+ vortezwohl>=0.0.6
@@ -1,16 +1,17 @@
1
- [project]
2
- name = "deeplotx"
3
- version = "0.5.3"
4
- description = "Easy-2-use long text NLP toolkit."
5
- readme = "README.md"
6
- requires-python = ">=3.10"
7
- dependencies = [
8
- "hf-xet>=1.0.5",
9
- "jupyter",
10
- "numpy",
11
- "protobuf",
12
- "python-dotenv>=1.1.0",
13
- "torch",
14
- "transformers",
15
- "typing-extensions>=4.13.2",
16
- ]
1
+ [project]
2
+ name = "deeplotx"
3
+ version = "0.5.5"
4
+ description = "Easy-2-use long text NLP toolkit."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ dependencies = [
8
+ "hf-xet",
9
+ "jupyter",
10
+ "numpy",
11
+ "protobuf",
12
+ "python-dotenv",
13
+ "torch",
14
+ "transformers",
15
+ "typing-extensions",
16
+ "vortezwohl>=0.0.6",
17
+ ]
@@ -1,15 +0,0 @@
1
- import hashlib
2
-
3
-
4
- def md5(text: str) -> str:
5
- md5_hash = hashlib.md5()
6
- text_bytes = text.encode('utf-8')
7
- md5_hash.update(text_bytes)
8
- return md5_hash.hexdigest()
9
-
10
-
11
- def sha1(text: str) -> str:
12
- md5_hash = hashlib.sha1()
13
- text_bytes = text.encode('utf-8')
14
- md5_hash.update(text_bytes)
15
- return md5_hash.hexdigest()
@@ -1,8 +0,0 @@
1
- hf-xet>=1.0.5
2
- jupyter
3
- numpy
4
- protobuf
5
- python-dotenv>=1.1.0
6
- torch
7
- transformers
8
- typing-extensions>=4.13.2
File without changes
File without changes
File without changes