deeplotx 0.9.4__tar.gz → 0.9.5__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. {deeplotx-0.9.4 → deeplotx-0.9.5}/PKG-INFO +1 -1
  2. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/bert_ner.py +32 -10
  3. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/PKG-INFO +1 -1
  4. {deeplotx-0.9.4 → deeplotx-0.9.5}/pyproject.toml +1 -1
  5. {deeplotx-0.9.4 → deeplotx-0.9.5}/LICENSE +0 -0
  6. {deeplotx-0.9.4 → deeplotx-0.9.5}/README.md +0 -0
  7. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/__init__.py +0 -0
  8. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/__init__.py +0 -0
  9. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/encoder.py +0 -0
  10. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/long_text_encoder.py +0 -0
  11. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/longformer_encoder.py +0 -0
  12. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/__init__.py +0 -0
  13. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/base_ner.py +0 -0
  14. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/n2g/__init__.py +0 -0
  15. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/named_entity.py +0 -0
  16. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/__init__.py +0 -0
  17. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/attention.py +0 -0
  18. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/auto_regression.py +0 -0
  19. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/base_neural_network.py +0 -0
  20. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/feed_forward.py +0 -0
  21. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/linear_regression.py +0 -0
  22. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/logistic_regression.py +0 -0
  23. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/long_context_auto_regression.py +0 -0
  24. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  25. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/multi_head_attention.py +0 -0
  26. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/multi_head_feed_forward.py +0 -0
  27. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/recursive_sequential.py +0 -0
  28. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/roformer_encoder.py +0 -0
  29. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/rope.py +0 -0
  30. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/softmax_regression.py +0 -0
  31. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/__init__.py +0 -0
  32. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/distribution.py +0 -0
  33. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/set.py +0 -0
  34. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/vector.py +0 -0
  35. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/trainer/__init__.py +0 -0
  36. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/trainer/base_trainer.py +0 -0
  37. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
  38. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/util/__init__.py +0 -0
  39. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/util/hash.py +0 -0
  40. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/util/read_file.py +0 -0
  41. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/SOURCES.txt +0 -0
  42. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/dependency_links.txt +0 -0
  43. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/requires.txt +0 -0
  44. {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/top_level.txt +0 -0
  45. {deeplotx-0.9.4 → deeplotx-0.9.5}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.9.4
3
+ Version: 0.9.5
4
4
  Summary: An out-of-the-box long-text NLP framework.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -11,6 +11,8 @@ from deeplotx.ner.base_ner import BaseNER
11
11
  from deeplotx.ner.named_entity import NamedEntity, NamedPerson
12
12
 
13
13
  CACHE_PATH = os.path.join(__ROOT__, '.cache')
14
+ NEW_LINE, BLANK = '\n', ' '
15
+ DEFAULT_LENGTH_THRESHOLD = 384
14
16
  DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
15
17
  N2G_MODEL: list[Name2Gender] = []
16
18
  logger = logging.getLogger('deeplotx.ner')
@@ -44,11 +46,11 @@ class BertNER(BaseNER):
44
46
  trust_remote_code=True, local_files_only=True).to(self.device)
45
47
  self.embed_dim = self.encoder.config.max_position_embeddings
46
48
  self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
47
- logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
49
+ logger.debug(f'{BertNER.__name__} initialized on device: {self.device}.')
48
50
 
49
51
  def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
50
52
  assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
51
- s = f' {s} '
53
+ s = f' {s.replace(NEW_LINE, BLANK)} '
52
54
  raw_entities = self._ner_pipeline(s)
53
55
  entities = []
54
56
  for ent in raw_entities:
@@ -69,8 +71,21 @@ class BertNER(BaseNER):
69
71
  break
70
72
  for ent in entities:
71
73
  ent[0] = ent[0].strip()
74
+ # stripping
75
+ while not ent[0][0].isalpha():
76
+ if len(ent[0]) < 2:
77
+ break
78
+ if not ent[0][0].isnumeric():
79
+ ent[0] = ent[0][1:]
80
+ while not ent[0][-1].isalpha():
81
+ if len(ent[0]) < 2:
82
+ break
83
+ if not ent[0][-1].isnumeric():
84
+ ent[0] = ent[0][:-1]
72
85
  if ent[1].upper().startswith('B'):
73
86
  ent[1] = ent[1].upper()[1:].strip('-')
87
+ if len(entities) > 0:
88
+ logger.debug(f'Entities: {[_[0] for _ in entities]}, extracted from: "{s.strip()}".')
74
89
  entities = [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]
75
90
  if not with_gender:
76
91
  return entities
@@ -88,14 +103,19 @@ class BertNER(BaseNER):
88
103
  return entities
89
104
 
90
105
  def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0, deduplicate: bool = True) -> list[NamedEntity]:
91
- _entities = self._fast_extract(s, with_gender=with_gender, prob_threshold=prob_threshold) if len(s) < 512 else []
92
- if len(s) >= 512:
93
- window_size: int = 512
94
- offset = window_size // 6
95
- for _offset in [- offset, offset]:
96
- _window_size = window_size + _offset
97
- for i in range(0, len(s) + _window_size, _window_size):
98
- _entities.extend(self._fast_extract(s[i: i + _window_size], with_gender=with_gender, prob_threshold=prob_threshold))
106
+ _length_threshold = DEFAULT_LENGTH_THRESHOLD
107
+ _s_seq = self.tokenizer.encode(s, add_special_tokens=False)
108
+ _entities = self._fast_extract(self.tokenizer.decode(_s_seq, skip_special_tokens=True),
109
+ with_gender=with_gender,
110
+ prob_threshold=prob_threshold) if len(_s_seq) < _length_threshold else []
111
+ # sliding window extracting
112
+ if len(_s_seq) >= _length_threshold:
113
+ _window_size = _length_threshold
114
+ _stride = _length_threshold // 4
115
+ for i in range(0, len(_s_seq) + _stride, _stride):
116
+ _window_text = self.tokenizer.decode(_s_seq[i: i + _window_size], skip_special_tokens=True)
117
+ _entities.extend(self._fast_extract(_window_text, with_gender=with_gender, prob_threshold=prob_threshold))
118
+ # entity combination
99
119
  _tmp_entities = sorted(_entities, key=lambda x: len(x.text), reverse=True)
100
120
  for _ent_i in _tmp_entities:
101
121
  for _ent_j in _entities:
@@ -103,6 +123,7 @@ class BertNER(BaseNER):
103
123
  and len(_ent_j.text) != len(_ent_i.text)
104
124
  and _ent_j in _tmp_entities):
105
125
  _tmp_entities.remove(_ent_j)
126
+ # entity cleaning
106
127
  while True:
107
128
  for _ent in _tmp_entities:
108
129
  if _ent.text not in s or len(_ent.text) < 2:
@@ -116,6 +137,7 @@ class BertNER(BaseNER):
116
137
  break
117
138
  if not deduplicate:
118
139
  return sorted(_tmp_entities, key=lambda _: _.text[0], reverse=False)
140
+ # entity deduplication
119
141
  _fin_entities = dict()
120
142
  texts = set([text.text for text in _tmp_entities])
121
143
  for text in texts:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.9.4
3
+ Version: 0.9.5
4
4
  Summary: An out-of-the-box long-text NLP framework.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "deeplotx"
3
- version = "0.9.4"
3
+ version = "0.9.5"
4
4
  description = "An out-of-the-box long-text NLP framework."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes