deeplotx 0.8.7__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {deeplotx-0.8.7 → deeplotx-0.9.0}/PKG-INFO +4 -1
  2. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/__init__.py +3 -0
  3. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/encoder.py +7 -5
  4. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/long_text_encoder.py +1 -1
  5. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/longformer_encoder.py +6 -5
  6. deeplotx-0.9.0/deeplotx/ner/__init__.py +3 -0
  7. deeplotx-0.9.0/deeplotx/ner/base_ner.py +7 -0
  8. deeplotx-0.9.0/deeplotx/ner/bert_ner.py +72 -0
  9. deeplotx-0.9.0/deeplotx/ner/named_entity.py +8 -0
  10. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/PKG-INFO +4 -1
  11. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/SOURCES.txt +4 -0
  12. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/requires.txt +3 -0
  13. {deeplotx-0.8.7 → deeplotx-0.9.0}/pyproject.toml +4 -1
  14. {deeplotx-0.8.7 → deeplotx-0.9.0}/LICENSE +0 -0
  15. {deeplotx-0.8.7 → deeplotx-0.9.0}/README.md +0 -0
  16. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/__init__.py +0 -0
  17. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/__init__.py +0 -0
  18. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/attention.py +0 -0
  19. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/auto_regression.py +0 -0
  20. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/base_neural_network.py +0 -0
  21. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/feed_forward.py +0 -0
  22. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/linear_regression.py +0 -0
  23. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/logistic_regression.py +0 -0
  24. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/long_context_auto_regression.py +0 -0
  25. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  26. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/multi_head_attention.py +0 -0
  27. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/multi_head_feed_forward.py +0 -0
  28. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/recursive_sequential.py +0 -0
  29. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/roformer_encoder.py +0 -0
  30. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/rope.py +0 -0
  31. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/softmax_regression.py +0 -0
  32. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/__init__.py +0 -0
  33. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/distribution.py +0 -0
  34. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/set.py +0 -0
  35. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/vector.py +0 -0
  36. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/trainer/__init__.py +0 -0
  37. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/trainer/base_trainer.py +0 -0
  38. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
  39. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/util/__init__.py +0 -0
  40. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/util/hash.py +0 -0
  41. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/util/read_file.py +0 -0
  42. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/dependency_links.txt +0 -0
  43. {deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/top_level.txt +0 -0
  44. {deeplotx-0.8.7 → deeplotx-0.9.0}/setup.cfg +0 -0
{deeplotx-0.8.7 → deeplotx-0.9.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deeplotx
- Version: 0.8.7
+ Version: 0.9.0
  Summary: Easy-2-use long text NLP toolkit.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -10,10 +10,13 @@ Requires-Dist: jupyter
  Requires-Dist: numpy
  Requires-Dist: protobuf
  Requires-Dist: python-dotenv
+ Requires-Dist: sentencepiece
+ Requires-Dist: tiktoken
  Requires-Dist: torch
  Requires-Dist: transformers
  Requires-Dist: typing-extensions
  Requires-Dist: vortezwohl>=0.0.8
+ Requires-Dist: name2gender>=0.0.4a0
  Dynamic: license-file

  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/__init__.py
@@ -4,6 +4,7 @@ import os
  __ROOT__ = os.path.dirname(os.path.abspath(__file__))

  from .encoder import Encoder, LongTextEncoder, LongformerEncoder
+ from .ner import BertNER, NamedEntity
  from .nn import (
      FeedForward,
      MultiHeadFeedForward,
@@ -40,3 +41,5 @@ logger = logging.getLogger('deeplotx.trainer')
  logger.setLevel(logging.DEBUG)
  logger = logging.getLogger('deeplotx.embedding')
  logger.setLevel(logging.DEBUG)
+ logger = logging.getLogger('deeplotx.ner')
+ logger.setLevel(logging.DEBUG)
{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/encoder.py
@@ -43,9 +43,11 @@ class Encoder(nn.Module):
          self.embed_dim = self.encoder.config.max_position_embeddings
          logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')

-     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True,
+                 *args, **kwargs) -> torch.Tensor:
          def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-             return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
+             emb_seq = self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state
+             return emb_seq[:, 0, :] if cls_only else emb_seq

          num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
          chunks, chunk_results = [], []
@@ -58,9 +60,9 @@ class Encoder(nn.Module):
          with torch.no_grad():
              chunk_results = [_encoder(x) for x in chunks]
          self.encoder.train(mode=ori_mode)
-         return torch.cat(chunk_results, dim=-1)
+         return torch.cat(chunk_results, dim=-1) if cls_only else torch.cat(chunk_results, dim=-2)

-     def encode(self, text: str) -> torch.Tensor:
+     def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
          _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
          _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-         return self.forward(_input_ids, _att_mask).squeeze()
+         return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()
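The new cls_only flag defaults to True, so existing callers keep getting only the [CLS] embedding per chunk; passing cls_only=False returns the full hidden-state sequence instead. A minimal usage sketch (the Encoder constructor is not part of this diff, so the instance below is assumed to already exist):

    # assuming `encoder` is an already-constructed deeplotx Encoder instance
    cls_vec = encoder.encode('some long input text')                    # [CLS]-only, same behaviour as 0.8.7
    token_seq = encoder.encode('some long input text', cls_only=False)  # full per-token hidden states, chunks concatenated along the sequence dim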
{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/long_text_encoder.py
@@ -25,7 +25,7 @@ class LongTextEncoder(Encoder):
          self._worker_group = ThreadPool(max_workers=max_workers)

      def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
-         return idx, super().forward(x, attention_mask=mask)
+         return idx, super().forward(x, attention_mask=mask, cls_only=True)

      @override
      def forward(self, text: str, flatten: bool = False, *args, **kwargs) -> torch.Tensor:
{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/longformer_encoder.py
@@ -9,7 +9,7 @@ from requests.exceptions import ConnectTimeout, SSLError
  from deeplotx import __ROOT__

  CACHE_PATH = os.path.join(__ROOT__, '.cache')
- DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+ DEFAULT_LONGFORMER = 'severinsimmler/xlm-roberta-longformer-base-16384'
  logger = logging.getLogger('deeplotx.embedding')


@@ -41,15 +41,16 @@ class LongformerEncoder(nn.Module):
                                                   trust_remote_code=True, local_files_only=True).to(self.device)
          logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')

-     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True) -> torch.Tensor:
          ori_mode = self.encoder.training
          self.encoder.eval()
          with torch.no_grad():
-             res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+             emb_seq = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state
+             res = emb_seq[:, 0, :] if cls_only else emb_seq
          self.encoder.train(mode=ori_mode)
          return res

-     def encode(self, text: str) -> torch.Tensor:
+     def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
          _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
          _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-         return self.forward(_input_ids, _att_mask).squeeze()
+         return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()
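LongformerEncoder gains the same cls_only switch, and its default checkpoint changes from allenai/longformer-base-4096 to severinsimmler/xlm-roberta-longformer-base-16384 (a multilingual long-context model). A rough sketch, assuming the constructor can still be called with no arguments to use the default checkpoint (its signature is not shown in this diff):

    from deeplotx import LongformerEncoder

    encoder = LongformerEncoder()                      # assumed to load the new 16384-token default checkpoint
    cls_vec = encoder.encode('a very long document')   # CLS vector, as before
    token_seq = encoder.encode('a very long document', cls_only=False)  # full token embedding sequence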
deeplotx-0.9.0/deeplotx/ner/__init__.py (new file)
@@ -0,0 +1,3 @@
+ from .named_entity import NamedEntity
+ from .base_ner import BaseNER
+ from .bert_ner import BertNER
deeplotx-0.9.0/deeplotx/ner/base_ner.py (new file)
@@ -0,0 +1,7 @@
+ from deeplotx.ner.named_entity import NamedEntity
+
+
+ class BaseNER:
+     def __init__(self): ...
+
+     def extract_entities(self, s: str, *args, **kwargs) -> list[NamedEntity]: ...
deeplotx-0.9.0/deeplotx/ner/bert_ner.py (new file)
@@ -0,0 +1,72 @@
+ import logging
+ import os
+ from requests.exceptions import ConnectTimeout, SSLError
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+
+ from deeplotx import __ROOT__
+ from deeplotx.ner.base_ner import BaseNER
+ from deeplotx.ner.named_entity import NamedEntity
+
+ CACHE_PATH = os.path.join(__ROOT__, '.cache')
+ DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
+ logger = logging.getLogger('deeplotx.ner')
+
+
+ class BertNER(BaseNER):
+     def __init__(self, model_name_or_path: str = DEFAULT_BERT_NER, device: str | None = None):
+         super().__init__()
+         self.device = torch.device(device) if device is not None \
+             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                            trust_remote_code=True)
+             self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                                            trust_remote_code=True).to(self.device)
+         except ConnectTimeout:
+             self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                            trust_remote_code=True, local_files_only=True)
+             self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                                            trust_remote_code=True, local_files_only=True).to(self.device)
+         except SSLError:
+             self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                            trust_remote_code=True, local_files_only=True)
+             self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                                            trust_remote_code=True, local_files_only=True).to(self.device)
+         self.embed_dim = self.encoder.config.max_position_embeddings
+         self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
+         logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
+
+     def extract_entities(self, s: str, prob_threshold: float = .0, *args, **kwargs) -> list[NamedEntity]:
+         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
+         s = ' ' + s
+         raw_entities = self._ner_pipeline(s)
+         entities = []
+         for ent in raw_entities:
+             entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
+         while True:
+             for i, ent in enumerate(entities):
+                 if len(ent[0].strip()) < 1:
+                     del entities[i]
+                 if ent[1].upper().startswith('I') and entities[i - 1][1].upper().startswith('B'):
+                     entities[i - 1][0] += ent[0]
+                     entities[i - 1][2] *= ent[2]
+                     del entities[i]
+             _continue = False
+             for ent in entities:
+                 if ent[1].upper().startswith('I'):
+                     _continue = True
+             if not _continue:
+                 break
+         for ent in entities:
+             ent[0] = ent[0].strip()
+             if ent[1].upper().startswith('B'):
+                 ent[1] = ent[1].upper()[1:].strip('-')
+         return [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]
deeplotx-0.9.0/deeplotx/ner/named_entity.py (new file)
@@ -0,0 +1,8 @@
+ from dataclasses import dataclass
+
+
+ @dataclass
+ class NamedEntity:
+     text: str
+     type: str
+     probability: float
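Together, these new modules add a ready-to-use NER wrapper: BertNER merges B-/I- subword spans, multiplies their scores, and returns NamedEntity records. A brief usage sketch based on the code above (the example sentence and threshold are illustrative, not taken from the package):

    from deeplotx import BertNER, NamedEntity  # re-exported from the package root per the __init__.py change

    ner = BertNER()  # defaults to 'Davlan/xlm-roberta-base-ner-hrl'
    entities: list[NamedEntity] = ner.extract_entities('Barack Obama was born in Hawaii.', prob_threshold=0.5)
    for ent in entities:
        print(ent.text, ent.type, ent.probability)  # merged spans with types such as 'PER' or 'LOC'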
{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deeplotx
- Version: 0.8.7
+ Version: 0.9.0
  Summary: Easy-2-use long text NLP toolkit.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -10,10 +10,13 @@ Requires-Dist: jupyter
  Requires-Dist: numpy
  Requires-Dist: protobuf
  Requires-Dist: python-dotenv
+ Requires-Dist: sentencepiece
+ Requires-Dist: tiktoken
  Requires-Dist: torch
  Requires-Dist: transformers
  Requires-Dist: typing-extensions
  Requires-Dist: vortezwohl>=0.0.8
+ Requires-Dist: name2gender>=0.0.4a0
  Dynamic: license-file

  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/SOURCES.txt
@@ -11,6 +11,10 @@ deeplotx/encoder/__init__.py
  deeplotx/encoder/encoder.py
  deeplotx/encoder/long_text_encoder.py
  deeplotx/encoder/longformer_encoder.py
+ deeplotx/ner/__init__.py
+ deeplotx/ner/base_ner.py
+ deeplotx/ner/bert_ner.py
+ deeplotx/ner/named_entity.py
  deeplotx/nn/__init__.py
  deeplotx/nn/attention.py
  deeplotx/nn/auto_regression.py
{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/requires.txt
@@ -3,7 +3,10 @@ jupyter
  numpy
  protobuf
  python-dotenv
+ sentencepiece
+ tiktoken
  torch
  transformers
  typing-extensions
  vortezwohl>=0.0.8
+ name2gender>=0.0.4a0
{deeplotx-0.8.7 → deeplotx-0.9.0}/pyproject.toml
@@ -1,6 +1,6 @@
  [project]
  name = "deeplotx"
- version = "0.8.7"
+ version = "0.9.0"
  description = "Easy-2-use long text NLP toolkit."
  readme = "README.md"
  requires-python = ">=3.10"
@@ -10,8 +10,11 @@ dependencies = [
      "numpy",
      "protobuf",
      "python-dotenv",
+     "sentencepiece",
+     "tiktoken",
      "torch",
      "transformers",
      "typing-extensions",
      "vortezwohl>=0.0.8",
+     "name2gender>=0.0.4a0",
  ]