deeplotx 0.9.4__tar.gz → 0.9.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deeplotx-0.9.4 → deeplotx-0.9.5}/PKG-INFO +1 -1
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/bert_ner.py +32 -10
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/PKG-INFO +1 -1
- {deeplotx-0.9.4 → deeplotx-0.9.5}/pyproject.toml +1 -1
- {deeplotx-0.9.4 → deeplotx-0.9.5}/LICENSE +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/README.md +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/encoder.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/long_text_encoder.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/encoder/longformer_encoder.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/base_ner.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/n2g/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/named_entity.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/attention.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/auto_regression.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/base_neural_network.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/feed_forward.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/linear_regression.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/logistic_regression.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/long_context_auto_regression.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/multi_head_attention.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/multi_head_feed_forward.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/recursive_sequential.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/roformer_encoder.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/rope.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/nn/softmax_regression.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/distribution.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/set.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/similarity/vector.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/trainer/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/trainer/base_trainer.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/util/__init__.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/util/hash.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/util/read_file.py +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/SOURCES.txt +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/dependency_links.txt +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/requires.txt +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx.egg-info/top_level.txt +0 -0
- {deeplotx-0.9.4 → deeplotx-0.9.5}/setup.cfg +0 -0
{deeplotx-0.9.4 → deeplotx-0.9.5}/deeplotx/ner/bert_ner.py

```diff
@@ -11,6 +11,8 @@ from deeplotx.ner.base_ner import BaseNER
 from deeplotx.ner.named_entity import NamedEntity, NamedPerson
 
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
+NEW_LINE, BLANK = '\n', ' '
+DEFAULT_LENGTH_THRESHOLD = 384
 DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
 N2G_MODEL: list[Name2Gender] = []
 logger = logging.getLogger('deeplotx.ner')
@@ -44,11 +46,11 @@ class BertNER(BaseNER):
                                          trust_remote_code=True, local_files_only=True).to(self.device)
         self.embed_dim = self.encoder.config.max_position_embeddings
         self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
-        logger.debug(f'{
+        logger.debug(f'{BertNER.__name__} initialized on device: {self.device}.')
 
     def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
-        s = f' {s} '
+        s = f' {s.replace(NEW_LINE, BLANK)} '
         raw_entities = self._ner_pipeline(s)
         entities = []
         for ent in raw_entities:
```
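The `_fast_extract` change pads the input with single spaces and flattens newlines to spaces before the text reaches the token-classification pipeline, so entity spans are not broken across line breaks. Below is a minimal standalone sketch of that normalization step; the helper name `normalize_for_ner` is illustrative and not part of deeplotx, which performs this inline.

```python
# Sketch of the input normalization added to _fast_extract in 0.9.5.
NEW_LINE, BLANK = '\n', ' '

def normalize_for_ner(s: str) -> str:
    # Replace hard line breaks with spaces and pad both ends,
    # so word-boundary handling behaves the same at the edges.
    return f' {s.replace(NEW_LINE, BLANK)} '

print(repr(normalize_for_ner('Angela Merkel\nvisited Paris.')))
# ' Angela Merkel visited Paris. '
```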
```diff
@@ -69,8 +71,21 @@ class BertNER(BaseNER):
                     break
         for ent in entities:
             ent[0] = ent[0].strip()
+            # stripping
+            while not ent[0][0].isalpha():
+                if len(ent[0]) < 2:
+                    break
+                if not ent[0][0].isnumeric():
+                    ent[0] = ent[0][1:]
+            while not ent[0][-1].isalpha():
+                if len(ent[0]) < 2:
+                    break
+                if not ent[0][-1].isnumeric():
+                    ent[0] = ent[0][:-1]
             if ent[1].upper().startswith('B'):
                 ent[1] = ent[1].upper()[1:].strip('-')
+        if len(entities) > 0:
+            logger.debug(f'Entities: {[_[0] for _ in entities]}, extracted from: "{s.strip()}".')
         entities = [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]
         if not with_gender:
             return entities
```
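The new stripping loops trim leading and trailing characters that are neither letters nor digits from each recognized surface form. A standalone sketch of the same boundary-trimming idea is shown below; it is written as a plain function for illustration, whereas the package mutates `ent[0]` in place with while-loops.

```python
def trim_entity_boundaries(text: str) -> str:
    """Drop leading/trailing characters that are neither letters nor digits.

    Illustrative restatement of the boundary stripping added in 0.9.5;
    not the package's exact code.
    """
    text = text.strip()
    while len(text) >= 2 and not (text[0].isalpha() or text[0].isnumeric()):
        text = text[1:]
    while len(text) >= 2 and not (text[-1].isalpha() or text[-1].isnumeric()):
        text = text[:-1]
    return text

print(trim_entity_boundaries('(Barack Obama),'))  # Barack Obama
```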
```diff
@@ -88,14 +103,19 @@ class BertNER(BaseNER):
         return entities
 
     def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0, deduplicate: bool = True) -> list[NamedEntity]:
-
-
-
-
-
-
-
-
+        _length_threshold = DEFAULT_LENGTH_THRESHOLD
+        _s_seq = self.tokenizer.encode(s, add_special_tokens=False)
+        _entities = self._fast_extract(self.tokenizer.decode(_s_seq, skip_special_tokens=True),
+                                       with_gender=with_gender,
+                                       prob_threshold=prob_threshold) if len(_s_seq) < _length_threshold else []
+        # sliding window extracting
+        if len(_s_seq) >= _length_threshold:
+            _window_size = _length_threshold
+            _stride = _length_threshold // 4
+            for i in range(0, len(_s_seq) + _stride, _stride):
+                _window_text = self.tokenizer.decode(_s_seq[i: i + _window_size], skip_special_tokens=True)
+                _entities.extend(self._fast_extract(_window_text, with_gender=with_gender, prob_threshold=prob_threshold))
+        # entity combination
         _tmp_entities = sorted(_entities, key=lambda x: len(x.text), reverse=True)
         for _ent_i in _tmp_entities:
             for _ent_j in _entities:
```
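In 0.9.5, `_slow_extract` first tokenizes the input: texts shorter than 384 tokens go straight through `_fast_extract`, while longer texts are decoded back window by window with a window of 384 tokens and a stride of 96 (384 // 4), and the per-window results are pooled before combination, cleaning, and deduplication. The sketch below shows only that overlapping-window schedule on a plain token count, with no tokenizer or model involved; the function name and return shape are ours, not deeplotx's.

```python
# Sketch of the overlapping sliding-window schedule used by _slow_extract.
# deeplotx wraps each span with tokenizer.decode(...) and re-runs _fast_extract.
DEFAULT_LENGTH_THRESHOLD = 384

def window_spans(n_tokens: int,
                 window_size: int = DEFAULT_LENGTH_THRESHOLD,
                 stride: int = DEFAULT_LENGTH_THRESHOLD // 4) -> list[tuple[int, int]]:
    # Consecutive windows overlap by (window_size - stride) tokens, so an
    # entity cut at one window edge reappears whole in a later window.
    return [(i, min(i + window_size, n_tokens))
            for i in range(0, n_tokens + stride, stride)
            if i < n_tokens]

print(window_spans(10, window_size=6, stride=3))
# [(0, 6), (3, 9), (6, 10), (9, 10)]
```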
```diff
@@ -103,6 +123,7 @@ class BertNER(BaseNER):
                         and len(_ent_j.text) != len(_ent_i.text)
                         and _ent_j in _tmp_entities):
                     _tmp_entities.remove(_ent_j)
+        # entity cleaning
         while True:
             for _ent in _tmp_entities:
                 if _ent.text not in s or len(_ent.text) < 2:
@@ -116,6 +137,7 @@ class BertNER(BaseNER):
                         break
         if not deduplicate:
             return sorted(_tmp_entities, key=lambda _: _.text[0], reverse=False)
+        # entity deduplication
         _fin_entities = dict()
         texts = set([text.text for text in _tmp_entities])
         for text in texts:
```
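These last two hunks only add section comments: the combination pass keeps the longest overlapping surface forms, the cleaning pass drops entities that no longer occur in the source string or are shorter than two characters, and the deduplication pass collapses repeated surface forms. The exact merge policy is not visible in this diff; the sketch below shows one plausible text-keyed deduplication and is an assumption for illustration, not the package's code.

```python
from collections import namedtuple

# Hypothetical stand-in for deeplotx.ner.named_entity.NamedEntity,
# reduced to the fields referenced in this diff (text, label, probability).
Entity = namedtuple('Entity', ['text', 'label', 'prob'])

def deduplicate_by_text(entities: list[Entity]) -> list[Entity]:
    # Assumption: keep the highest-probability occurrence per surface form,
    # then return the survivors sorted by text, mirroring the sorted() calls above.
    best: dict[str, Entity] = {}
    for ent in entities:
        if ent.text not in best or ent.prob > best[ent.text].prob:
            best[ent.text] = ent
    return sorted(best.values(), key=lambda e: e.text)

ents = [Entity('Paris', 'LOC', 0.91), Entity('Paris', 'LOC', 0.88), Entity('Ada', 'PER', 0.97)]
print(deduplicate_by_text(ents))
```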