deeplotx 0.9.4__py3-none-any.whl → 0.9.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeplotx/ner/bert_ner.py +40 -12
- {deeplotx-0.9.4.dist-info → deeplotx-0.9.6.dist-info}/METADATA +23 -1
- {deeplotx-0.9.4.dist-info → deeplotx-0.9.6.dist-info}/RECORD +6 -6
- {deeplotx-0.9.4.dist-info → deeplotx-0.9.6.dist-info}/WHEEL +0 -0
- {deeplotx-0.9.4.dist-info → deeplotx-0.9.6.dist-info}/licenses/LICENSE +0 -0
- {deeplotx-0.9.4.dist-info → deeplotx-0.9.6.dist-info}/top_level.txt +0 -0
deeplotx/ner/bert_ner.py
CHANGED
@@ -11,6 +11,8 @@ from deeplotx.ner.base_ner import BaseNER
|
|
11
11
|
from deeplotx.ner.named_entity import NamedEntity, NamedPerson
|
12
12
|
|
13
13
|
CACHE_PATH = os.path.join(__ROOT__, '.cache')
|
14
|
+
NEW_LINE, BLANK = '\n', ' '
|
15
|
+
DEFAULT_LENGTH_THRESHOLD = 448
|
14
16
|
DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
|
15
17
|
N2G_MODEL: list[Name2Gender] = []
|
16
18
|
logger = logging.getLogger('deeplotx.ner')
|
@@ -44,11 +46,11 @@ class BertNER(BaseNER):
|
|
44
46
|
trust_remote_code=True, local_files_only=True).to(self.device)
|
45
47
|
self.embed_dim = self.encoder.config.max_position_embeddings
|
46
48
|
self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
|
47
|
-
logger.debug(f'{
|
49
|
+
logger.debug(f'{BertNER.__name__} initialized on device: {self.device}.')
|
48
50
|
|
49
51
|
def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
|
50
52
|
assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
|
51
|
-
s = f' {s} '
|
53
|
+
s = f' {s.replace(NEW_LINE, BLANK * 2)} '
|
52
54
|
raw_entities = self._ner_pipeline(s)
|
53
55
|
entities = []
|
54
56
|
for ent in raw_entities:
|
@@ -58,7 +60,10 @@ class BertNER(BaseNER):
|
|
58
60
|
if len(ent[0].strip()) < 1:
|
59
61
|
del entities[i]
|
60
62
|
if ent[1].upper().startswith('I') and entities[i - 1][1].upper().startswith('B'):
|
61
|
-
entities[i - 1][0]
|
63
|
+
if entities[i - 1][0] + ent[0] in s:
|
64
|
+
entities[i - 1][0] += ent[0]
|
65
|
+
else:
|
66
|
+
entities[i - 1][0] += f' {ent[0]}'
|
62
67
|
entities[i - 1][2] *= ent[2]
|
63
68
|
del entities[i]
|
64
69
|
_continue = False
|
@@ -69,8 +74,21 @@ class BertNER(BaseNER):
|
|
69
74
|
break
|
70
75
|
for ent in entities:
|
71
76
|
ent[0] = ent[0].strip()
|
77
|
+
# stripping
|
78
|
+
while not ent[0][0].isalpha():
|
79
|
+
if len(ent[0]) < 2:
|
80
|
+
break
|
81
|
+
if not ent[0][0].isnumeric():
|
82
|
+
ent[0] = ent[0][1:]
|
83
|
+
while not ent[0][-1].isalpha():
|
84
|
+
if len(ent[0]) < 2:
|
85
|
+
break
|
86
|
+
if not ent[0][-1].isnumeric():
|
87
|
+
ent[0] = ent[0][:-1]
|
72
88
|
if ent[1].upper().startswith('B'):
|
73
89
|
ent[1] = ent[1].upper()[1:].strip('-')
|
90
|
+
if len(entities) > 0:
|
91
|
+
logger.debug(f'Entities: {[_[0] for _ in entities]}, extracted from: "{s.strip()}".')
|
74
92
|
entities = [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]
|
75
93
|
if not with_gender:
|
76
94
|
return entities
|
@@ -87,15 +105,22 @@ class BertNER(BaseNER):
|
|
87
105
|
gender_probability=gender_prob)
|
88
106
|
return entities
|
89
107
|
|
90
|
-
def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0,
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
108
|
+
def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0,
|
109
|
+
window_size: int = DEFAULT_LENGTH_THRESHOLD, deduplicate: bool = True) -> list[NamedEntity]:
|
110
|
+
_s_seq = self.tokenizer.encode(s, add_special_tokens=False)
|
111
|
+
_entities = self._fast_extract(self.tokenizer.decode(_s_seq, skip_special_tokens=True),
|
112
|
+
with_gender=with_gender,
|
113
|
+
prob_threshold=prob_threshold) if len(_s_seq) < window_size else []
|
114
|
+
# sliding window extracting
|
115
|
+
if len(_s_seq) >= window_size:
|
116
|
+
_stride = window_size // 4
|
117
|
+
for i in range(0, len(_s_seq) + window_size, _stride):
|
118
|
+
_tmp_s_seq = _s_seq[i: i + window_size]
|
119
|
+
if len(_tmp_s_seq) < 1:
|
120
|
+
continue
|
121
|
+
_window_text = self.tokenizer.decode(_tmp_s_seq, skip_special_tokens=True)
|
122
|
+
_entities.extend(self._fast_extract(_window_text, with_gender=with_gender, prob_threshold=prob_threshold))
|
123
|
+
# entity combination
|
99
124
|
_tmp_entities = sorted(_entities, key=lambda x: len(x.text), reverse=True)
|
100
125
|
for _ent_i in _tmp_entities:
|
101
126
|
for _ent_j in _entities:
|
@@ -103,6 +128,7 @@ class BertNER(BaseNER):
|
|
103
128
|
and len(_ent_j.text) != len(_ent_i.text)
|
104
129
|
and _ent_j in _tmp_entities):
|
105
130
|
_tmp_entities.remove(_ent_j)
|
131
|
+
# entity cleaning
|
106
132
|
while True:
|
107
133
|
for _ent in _tmp_entities:
|
108
134
|
if _ent.text not in s or len(_ent.text) < 2:
|
@@ -116,6 +142,7 @@ class BertNER(BaseNER):
|
|
116
142
|
break
|
117
143
|
if not deduplicate:
|
118
144
|
return sorted(_tmp_entities, key=lambda _: _.text[0], reverse=False)
|
145
|
+
# entity deduplication
|
119
146
|
_fin_entities = dict()
|
120
147
|
texts = set([text.text for text in _tmp_entities])
|
121
148
|
for text in texts:
|
@@ -133,4 +160,5 @@ class BertNER(BaseNER):
|
|
133
160
|
return self._fast_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold)
|
134
161
|
else:
|
135
162
|
return self._slow_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold,
|
163
|
+
window_size=kwargs.get('window_size', DEFAULT_LENGTH_THRESHOLD),
|
136
164
|
deduplicate=kwargs.get('deduplicate', True))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: deeplotx
|
3
|
-
Version: 0.9.4
|
3
|
+
Version: 0.9.6
|
4
4
|
Summary: An out-of-the-box long-text NLP framework.
|
5
5
|
Requires-Python: >=3.10
|
6
6
|
Description-Content-Type: text/markdown
|
@@ -27,6 +27,28 @@ Dynamic: license-file
|
|
27
27
|
|
28
28
|
> Author: [vortezwohl](https://github.com/vortezwohl)
|
29
29
|
|
30
|
+
## Citation
|
31
|
+
|
32
|
+
If you are incorporating the `DeepLoTX` framework into your research, please remember to properly cite it to acknowledge its contribution to your work.
|
33
|
+
|
34
|
+
Если вы интегрируете фреймворк `DeepLoTX` в своё исследование, пожалуйста, не забудьте корректно сослаться на него, чтобы отметить его вклад в вашу работу.
|
35
|
+
|
36
|
+
もしあなたが研究に `DeepLoTX` フレームワークを組み入れているなら、その貢献を認めるために適切に引用することを忘れないでください。
|
37
|
+
|
38
|
+
如果您正在將 `DeepLoTX` 框架整合到您的研究中,請務必正確引用它,以聲明它對您工作的貢獻。
|
39
|
+
|
40
|
+
```bibtex
|
41
|
+
@software{Wu_DeepLoTX_2025,
|
42
|
+
author = {Wu, Zihao},
|
43
|
+
license = {GPL-3.0},
|
44
|
+
month = aug,
|
45
|
+
title = {{DeepLoTX}},
|
46
|
+
url = {https://github.com/vortezwohl/DeepLoTX},
|
47
|
+
version = {0.9.5},
|
48
|
+
year = {2025}
|
49
|
+
}
|
50
|
+
```
|
51
|
+
|
30
52
|
## Installation
|
31
53
|
|
32
54
|
- **With pip**
|
@@ -5,7 +5,7 @@ deeplotx/encoder/long_text_encoder.py,sha256=4oRa9FqfGNZ8-gq14UKuhDkZC0A1Xi-wKmb
|
|
5
5
|
deeplotx/encoder/longformer_encoder.py,sha256=7Lm65AUD3qwbrzrhJ3dPZkyHeNRSapga3f-5QJCxV5A,3538
|
6
6
|
deeplotx/ner/__init__.py,sha256=Rss1pup9HzHZCG8U9ub8niWa9zRjWCy3Z7zg378KZQg,114
|
7
7
|
deeplotx/ner/base_ner.py,sha256=pZTl50OrHH_FJm4rKp9iuixeOE6FX_AzgDXD32aXsN0,204
|
8
|
-
deeplotx/ner/bert_ner.py,sha256=
|
8
|
+
deeplotx/ner/bert_ner.py,sha256=_IpR2AqXzEP_QBUj8KKVS7jO9mHZyFGDxr7q1LUgJf4,9246
|
9
9
|
deeplotx/ner/named_entity.py,sha256=c6XufIwH6yloJ-ccUjagf4mBl1XbbYDT8xyEJJ_-ZNs,269
|
10
10
|
deeplotx/ner/n2g/__init__.py,sha256=b6fOWJVLaOCtoz8Qlp8NWQbL5lUSbn6H3-8fnVNIPi0,3940
|
11
11
|
deeplotx/nn/__init__.py,sha256=YILwbxb-NHdiJjfOwBKH8F7PuZSDZSrGpTznPDucTro,710
|
@@ -33,8 +33,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=TFxOX8rWU_zKliI9zm
|
|
33
33
|
deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
|
34
34
|
deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
|
35
35
|
deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
|
36
|
-
deeplotx-0.9.
|
37
|
-
deeplotx-0.9.
|
38
|
-
deeplotx-0.9.
|
39
|
-
deeplotx-0.9.
|
40
|
-
deeplotx-0.9.
|
36
|
+
deeplotx-0.9.6.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
|
37
|
+
deeplotx-0.9.6.dist-info/METADATA,sha256=yjG3QmoZfUJwenoR76b7MXAjzCt7Qh9UMr1CishZFG4,14442
|
38
|
+
deeplotx-0.9.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
39
|
+
deeplotx-0.9.6.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
|
40
|
+
deeplotx-0.9.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|