deeplotx 0.9.4__py3-none-any.whl → 0.9.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deeplotx/ner/bert_ner.py CHANGED
@@ -11,6 +11,8 @@ from deeplotx.ner.base_ner import BaseNER
11
11
  from deeplotx.ner.named_entity import NamedEntity, NamedPerson
12
12
 
13
13
  CACHE_PATH = os.path.join(__ROOT__, '.cache')
14
+ NEW_LINE, BLANK = '\n', ' '
15
+ DEFAULT_LENGTH_THRESHOLD = 448
14
16
  DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
15
17
  N2G_MODEL: list[Name2Gender] = []
16
18
  logger = logging.getLogger('deeplotx.ner')
@@ -44,11 +46,11 @@ class BertNER(BaseNER):
44
46
  trust_remote_code=True, local_files_only=True).to(self.device)
45
47
  self.embed_dim = self.encoder.config.max_position_embeddings
46
48
  self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
47
- logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
49
+ logger.debug(f'{BertNER.__name__} initialized on device: {self.device}.')
48
50
 
49
51
  def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
50
52
  assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
51
- s = f' {s} '
53
+ s = f' {s.replace(NEW_LINE, BLANK * 2)} '
52
54
  raw_entities = self._ner_pipeline(s)
53
55
  entities = []
54
56
  for ent in raw_entities:
@@ -58,7 +60,10 @@ class BertNER(BaseNER):
58
60
  if len(ent[0].strip()) < 1:
59
61
  del entities[i]
60
62
  if ent[1].upper().startswith('I') and entities[i - 1][1].upper().startswith('B'):
61
- entities[i - 1][0] += ent[0]
63
+ if entities[i - 1][0] + ent[0] in s:
64
+ entities[i - 1][0] += ent[0]
65
+ else:
66
+ entities[i - 1][0] += f' {ent[0]}'
62
67
  entities[i - 1][2] *= ent[2]
63
68
  del entities[i]
64
69
  _continue = False
@@ -69,8 +74,21 @@ class BertNER(BaseNER):
69
74
  break
70
75
  for ent in entities:
71
76
  ent[0] = ent[0].strip()
77
+ # stripping
78
+ while not ent[0][0].isalpha():
79
+ if len(ent[0]) < 2:
80
+ break
81
+ if not ent[0][0].isnumeric():
82
+ ent[0] = ent[0][1:]
83
+ while not ent[0][-1].isalpha():
84
+ if len(ent[0]) < 2:
85
+ break
86
+ if not ent[0][-1].isnumeric():
87
+ ent[0] = ent[0][:-1]
72
88
  if ent[1].upper().startswith('B'):
73
89
  ent[1] = ent[1].upper()[1:].strip('-')
90
+ if len(entities) > 0:
91
+ logger.debug(f'Entities: {[_[0] for _ in entities]}, extracted from: "{s.strip()}".')
74
92
  entities = [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]
75
93
  if not with_gender:
76
94
  return entities
@@ -87,15 +105,22 @@ class BertNER(BaseNER):
87
105
  gender_probability=gender_prob)
88
106
  return entities
89
107
 
90
- def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0, deduplicate: bool = True) -> list[NamedEntity]:
91
- _entities = self._fast_extract(s, with_gender=with_gender, prob_threshold=prob_threshold) if len(s) < 512 else []
92
- if len(s) >= 512:
93
- window_size: int = 512
94
- offset = window_size // 6
95
- for _offset in [- offset, offset]:
96
- _window_size = window_size + _offset
97
- for i in range(0, len(s) + _window_size, _window_size):
98
- _entities.extend(self._fast_extract(s[i: i + _window_size], with_gender=with_gender, prob_threshold=prob_threshold))
108
+ def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0,
109
+ window_size: int = DEFAULT_LENGTH_THRESHOLD, deduplicate: bool = True) -> list[NamedEntity]:
110
+ _s_seq = self.tokenizer.encode(s, add_special_tokens=False)
111
+ _entities = self._fast_extract(self.tokenizer.decode(_s_seq, skip_special_tokens=True),
112
+ with_gender=with_gender,
113
+ prob_threshold=prob_threshold) if len(_s_seq) < window_size else []
114
+ # sliding window extracting
115
+ if len(_s_seq) >= window_size:
116
+ _stride = window_size // 4
117
+ for i in range(0, len(_s_seq) + window_size, _stride):
118
+ _tmp_s_seq = _s_seq[i: i + window_size]
119
+ if len(_tmp_s_seq) < 1:
120
+ continue
121
+ _window_text = self.tokenizer.decode(_tmp_s_seq, skip_special_tokens=True)
122
+ _entities.extend(self._fast_extract(_window_text, with_gender=with_gender, prob_threshold=prob_threshold))
123
+ # entity combination
99
124
  _tmp_entities = sorted(_entities, key=lambda x: len(x.text), reverse=True)
100
125
  for _ent_i in _tmp_entities:
101
126
  for _ent_j in _entities:
@@ -103,6 +128,7 @@ class BertNER(BaseNER):
103
128
  and len(_ent_j.text) != len(_ent_i.text)
104
129
  and _ent_j in _tmp_entities):
105
130
  _tmp_entities.remove(_ent_j)
131
+ # entity cleaning
106
132
  while True:
107
133
  for _ent in _tmp_entities:
108
134
  if _ent.text not in s or len(_ent.text) < 2:
@@ -116,6 +142,7 @@ class BertNER(BaseNER):
116
142
  break
117
143
  if not deduplicate:
118
144
  return sorted(_tmp_entities, key=lambda _: _.text[0], reverse=False)
145
+ # entity deduplication
119
146
  _fin_entities = dict()
120
147
  texts = set([text.text for text in _tmp_entities])
121
148
  for text in texts:
@@ -133,4 +160,5 @@ class BertNER(BaseNER):
133
160
  return self._fast_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold)
134
161
  else:
135
162
  return self._slow_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold,
163
+ window_size=kwargs.get('window_size', DEFAULT_LENGTH_THRESHOLD),
136
164
  deduplicate=kwargs.get('deduplicate', True))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.9.4
3
+ Version: 0.9.6
4
4
  Summary: An out-of-the-box long-text NLP framework.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -27,6 +27,28 @@ Dynamic: license-file
27
27
 
28
28
  > Author: [vortezwohl](https://github.com/vortezwohl)
29
29
 
30
+ ## Citation
31
+
32
+ If you are incorporating the `DeepLoTX` framework into your research, please remember to properly cite it to acknowledge its contribution to your work.
33
+
34
+ Если вы интегрируете фреймворк `DeepLoTX` в своё исследование, пожалуйста, не забудьте правильно сослаться на него, указывая его вклад в вашу работу.
35
+
36
+ もしあなたが研究に `DeepLoTX` フレームワークを組み入れているなら、その貢献を認めるために適切に引用することを忘れないでください。
37
+
38
+ 如果您正在將 `DeepLoTX` 框架整合到您的研究中，請務必正確引用它，以聲明它對您工作的貢獻。
39
+
40
+ ```bibtex
41
+ @software{Wu_DeepLoTX_2025,
42
+ author = {Wu, Zihao},
43
+ license = {GPL-3.0},
44
+ month = aug,
45
+ title = {{DeepLoTX}},
46
+ url = {https://github.com/vortezwohl/DeepLoTX},
47
+ version = {0.9.5},
48
+ year = {2025}
49
+ }
50
+ ```
51
+
30
52
  ## Installation
31
53
 
32
54
  - **With pip**
@@ -5,7 +5,7 @@ deeplotx/encoder/long_text_encoder.py,sha256=4oRa9FqfGNZ8-gq14UKuhDkZC0A1Xi-wKmb
5
5
  deeplotx/encoder/longformer_encoder.py,sha256=7Lm65AUD3qwbrzrhJ3dPZkyHeNRSapga3f-5QJCxV5A,3538
6
6
  deeplotx/ner/__init__.py,sha256=Rss1pup9HzHZCG8U9ub8niWa9zRjWCy3Z7zg378KZQg,114
7
7
  deeplotx/ner/base_ner.py,sha256=pZTl50OrHH_FJm4rKp9iuixeOE6FX_AzgDXD32aXsN0,204
8
- deeplotx/ner/bert_ner.py,sha256=bp2oFhNdyiI_VORNBppo8HDc3SI0YBsWH0IrZIMXkk4,7866
8
+ deeplotx/ner/bert_ner.py,sha256=_IpR2AqXzEP_QBUj8KKVS7jO9mHZyFGDxr7q1LUgJf4,9246
9
9
  deeplotx/ner/named_entity.py,sha256=c6XufIwH6yloJ-ccUjagf4mBl1XbbYDT8xyEJJ_-ZNs,269
10
10
  deeplotx/ner/n2g/__init__.py,sha256=b6fOWJVLaOCtoz8Qlp8NWQbL5lUSbn6H3-8fnVNIPi0,3940
11
11
  deeplotx/nn/__init__.py,sha256=YILwbxb-NHdiJjfOwBKH8F7PuZSDZSrGpTznPDucTro,710
@@ -33,8 +33,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=TFxOX8rWU_zKliI9zm
33
33
  deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
34
34
  deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
35
35
  deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
36
- deeplotx-0.9.4.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
37
- deeplotx-0.9.4.dist-info/METADATA,sha256=FkHL9xnVxqKEE22X2Dpfcf6RJlHhVkMJoFXVwa7ufGA,13472
38
- deeplotx-0.9.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
- deeplotx-0.9.4.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
40
- deeplotx-0.9.4.dist-info/RECORD,,
36
+ deeplotx-0.9.6.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
37
+ deeplotx-0.9.6.dist-info/METADATA,sha256=yjG3QmoZfUJwenoR76b7MXAjzCt7Qh9UMr1CishZFG4,14442
38
+ deeplotx-0.9.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
+ deeplotx-0.9.6.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
40
+ deeplotx-0.9.6.dist-info/RECORD,,