deeplotx 0.8.7__tar.gz → 0.8.8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {deeplotx-0.8.7 → deeplotx-0.8.8}/PKG-INFO +2 -1
  2. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/encoder.py +7 -5
  3. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/long_text_encoder.py +1 -1
  4. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/longformer_encoder.py +6 -5
  5. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/PKG-INFO +2 -1
  6. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/requires.txt +1 -0
  7. {deeplotx-0.8.7 → deeplotx-0.8.8}/pyproject.toml +2 -1
  8. {deeplotx-0.8.7 → deeplotx-0.8.8}/LICENSE +0 -0
  9. {deeplotx-0.8.7 → deeplotx-0.8.8}/README.md +0 -0
  10. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/__init__.py +0 -0
  11. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/__init__.py +0 -0
  12. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/__init__.py +0 -0
  13. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/attention.py +0 -0
  14. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/auto_regression.py +0 -0
  15. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/base_neural_network.py +0 -0
  16. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/feed_forward.py +0 -0
  17. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/linear_regression.py +0 -0
  18. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/logistic_regression.py +0 -0
  19. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/long_context_auto_regression.py +0 -0
  20. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  21. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/multi_head_attention.py +0 -0
  22. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/multi_head_feed_forward.py +0 -0
  23. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/recursive_sequential.py +0 -0
  24. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/roformer_encoder.py +0 -0
  25. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/rope.py +0 -0
  26. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/softmax_regression.py +0 -0
  27. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/__init__.py +0 -0
  28. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/distribution.py +0 -0
  29. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/set.py +0 -0
  30. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/vector.py +0 -0
  31. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/trainer/__init__.py +0 -0
  32. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/trainer/base_trainer.py +0 -0
  33. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
  34. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/util/__init__.py +0 -0
  35. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/util/hash.py +0 -0
  36. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/util/read_file.py +0 -0
  37. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/SOURCES.txt +0 -0
  38. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/dependency_links.txt +0 -0
  39. {deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/top_level.txt +0 -0
  40. {deeplotx-0.8.7 → deeplotx-0.8.8}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.8.7
3
+ Version: 0.8.8
4
4
  Summary: Easy-2-use long text NLP toolkit.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -10,6 +10,7 @@ Requires-Dist: jupyter
10
10
  Requires-Dist: numpy
11
11
  Requires-Dist: protobuf
12
12
  Requires-Dist: python-dotenv
13
+ Requires-Dist: tiktoken
13
14
  Requires-Dist: torch
14
15
  Requires-Dist: transformers
15
16
  Requires-Dist: typing-extensions
@@ -43,9 +43,11 @@ class Encoder(nn.Module):
43
43
  self.embed_dim = self.encoder.config.max_position_embeddings
44
44
  logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
45
45
 
46
- def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, *args, **kwargs) -> torch.Tensor:
46
+ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True,
47
+ *args, **kwargs) -> torch.Tensor:
47
48
  def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
48
- return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
49
+ emb_seq = self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state
50
+ return emb_seq[:, 0, :] if cls_only else emb_seq
49
51
 
50
52
  num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
51
53
  chunks, chunk_results = [], []
@@ -58,9 +60,9 @@ class Encoder(nn.Module):
58
60
  with torch.no_grad():
59
61
  chunk_results = [_encoder(x) for x in chunks]
60
62
  self.encoder.train(mode=ori_mode)
61
- return torch.cat(chunk_results, dim=-1)
63
+ return torch.cat(chunk_results, dim=-1) if cls_only else torch.cat(chunk_results, dim=-2)
62
64
 
63
- def encode(self, text: str) -> torch.Tensor:
65
+ def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
64
66
  _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
65
67
  _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
66
- return self.forward(_input_ids, _att_mask).squeeze()
68
+ return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()
@@ -25,7 +25,7 @@ class LongTextEncoder(Encoder):
25
25
  self._worker_group = ThreadPool(max_workers=max_workers)
26
26
 
27
27
  def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
28
- return idx, super().forward(x, attention_mask=mask)
28
+ return idx, super().forward(x, attention_mask=mask, cls_only=True)
29
29
 
30
30
  @override
31
31
  def forward(self, text: str, flatten: bool = False, *args, **kwargs) -> torch.Tensor:
@@ -9,7 +9,7 @@ from requests.exceptions import ConnectTimeout, SSLError
9
9
  from deeplotx import __ROOT__
10
10
 
11
11
  CACHE_PATH = os.path.join(__ROOT__, '.cache')
12
- DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
12
+ DEFAULT_LONGFORMER = 'severinsimmler/xlm-roberta-longformer-base-16384'
13
13
  logger = logging.getLogger('deeplotx.embedding')
14
14
 
15
15
 
@@ -41,15 +41,16 @@ class LongformerEncoder(nn.Module):
41
41
  trust_remote_code=True, local_files_only=True).to(self.device)
42
42
  logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
43
43
 
44
- def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
44
+ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True) -> torch.Tensor:
45
45
  ori_mode = self.encoder.training
46
46
  self.encoder.eval()
47
47
  with torch.no_grad():
48
- res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
48
+ emb_seq = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state
49
+ res = emb_seq[:, 0, :] if cls_only else emb_seq
49
50
  self.encoder.train(mode=ori_mode)
50
51
  return res
51
52
 
52
- def encode(self, text: str) -> torch.Tensor:
53
+ def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
53
54
  _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
54
55
  _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
55
- return self.forward(_input_ids, _att_mask).squeeze()
56
+ return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.8.7
3
+ Version: 0.8.8
4
4
  Summary: Easy-2-use long text NLP toolkit.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -10,6 +10,7 @@ Requires-Dist: jupyter
10
10
  Requires-Dist: numpy
11
11
  Requires-Dist: protobuf
12
12
  Requires-Dist: python-dotenv
13
+ Requires-Dist: tiktoken
13
14
  Requires-Dist: torch
14
15
  Requires-Dist: transformers
15
16
  Requires-Dist: typing-extensions
@@ -3,6 +3,7 @@ jupyter
3
3
  numpy
4
4
  protobuf
5
5
  python-dotenv
6
+ tiktoken
6
7
  torch
7
8
  transformers
8
9
  typing-extensions
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "deeplotx"
3
- version = "0.8.7"
3
+ version = "0.8.8"
4
4
  description = "Easy-2-use long text NLP toolkit."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -10,6 +10,7 @@ dependencies = [
10
10
  "numpy",
11
11
  "protobuf",
12
12
  "python-dotenv",
13
+ "tiktoken",
13
14
  "torch",
14
15
  "transformers",
15
16
  "typing-extensions",
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes