deeplotx 0.9.2__tar.gz → 0.9.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deeplotx-0.9.2 → deeplotx-0.9.4}/PKG-INFO +1 -1
- deeplotx-0.9.4/deeplotx/ner/base_ner.py +7 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/ner/bert_ner.py +49 -4
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx.egg-info/PKG-INFO +1 -1
- {deeplotx-0.9.2 → deeplotx-0.9.4}/pyproject.toml +1 -1
- deeplotx-0.9.2/deeplotx/ner/base_ner.py +0 -10
- {deeplotx-0.9.2 → deeplotx-0.9.4}/LICENSE +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/README.md +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/encoder/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/encoder/encoder.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/encoder/long_text_encoder.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/encoder/longformer_encoder.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/ner/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/ner/n2g/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/ner/named_entity.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/attention.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/auto_regression.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/base_neural_network.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/feed_forward.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/linear_regression.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/logistic_regression.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/long_context_auto_regression.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/multi_head_attention.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/multi_head_feed_forward.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/recursive_sequential.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/roformer_encoder.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/rope.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/nn/softmax_regression.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/similarity/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/similarity/distribution.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/similarity/set.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/similarity/vector.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/trainer/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/trainer/base_trainer.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/util/__init__.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/util/hash.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx/util/read_file.py +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx.egg-info/SOURCES.txt +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx.egg-info/dependency_links.txt +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx.egg-info/requires.txt +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/deeplotx.egg-info/top_level.txt +0 -0
- {deeplotx-0.9.2 → deeplotx-0.9.4}/setup.cfg +0 -0
@@ -46,9 +46,9 @@ class BertNER(BaseNER):
|
|
46
46
|
self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
|
47
47
|
logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
|
48
48
|
|
49
|
-
def
|
49
|
+
def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
|
50
50
|
assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
|
51
|
-
s = ' '
|
51
|
+
s = f' {s} '
|
52
52
|
raw_entities = self._ner_pipeline(s)
|
53
53
|
entities = []
|
54
54
|
for ent in raw_entities:
|
@@ -87,5 +87,50 @@ class BertNER(BaseNER):
|
|
87
87
|
gender_probability=gender_prob)
|
88
88
|
return entities
|
89
89
|
|
90
|
-
def
|
91
|
-
|
90
|
+
def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0, deduplicate: bool = True) -> list[NamedEntity]:
    """Extract entities from text of any length by running ``_fast_extract`` on windows.

    Text shorter than 512 characters is passed to ``_fast_extract`` as a whole.
    Longer text is scanned twice, with two different window widths
    (``512 - 512 // 6`` and ``512 + 512 // 6`` characters), and the results of
    both passes are merged.
    NOTE(review): the two widths presumably exist so that an entity cut at a
    window edge in one pass is seen whole in the other -- confirm with the author.

    The merged candidates are then filtered:

    * an entity whose text is a proper substring of another candidate's text
      is dropped;
    * an entity whose text does not occur in ``s`` or is shorter than two
      characters is dropped.

    :param s: text to analyse.
    :param with_gender: forwarded unchanged to ``_fast_extract``.
    :param prob_threshold: forwarded unchanged to ``_fast_extract``.
    :param deduplicate: when true, keep only one entity per distinct text,
        the one with the highest ``base_probability`` (the first one wins ties).
    :return: the surviving entities ordered by the first character of their
        text; entities sharing a first character keep longest-text-first order.
    """
    window_size = 512
    extract_options = dict(with_gender=with_gender, prob_threshold=prob_threshold)

    if len(s) < window_size:
        candidates = self._fast_extract(s, **extract_options)
    else:
        candidates = []
        shift = window_size // 6
        for width in (window_size - shift, window_size + shift):
            # Stop at len(s): starting a window beyond the end only yields
            # empty slices and a pointless model invocation.
            for start in range(0, len(s), width):
                candidates.extend(self._fast_extract(s[start: start + width], **extract_options))

    # Texts that are strictly contained in another candidate's text
    # (fragments of a longer entity). Computed once on the distinct texts.
    distinct_texts = {ent.text for ent in candidates}
    fragments = {
        text for text in distinct_texts
        if any(text != other and text in other for other in distinct_texts)
    }

    # Build a new list instead of removing from a list that is being iterated,
    # which skips elements and needed a retry loop to converge.
    kept = [
        ent for ent in sorted(candidates, key=lambda ent: len(ent.text), reverse=True)
        if ent.text not in fragments and ent.text in s and len(ent.text) >= 2
    ]

    if deduplicate:
        # A single pass over the ordered list keeps the result deterministic;
        # dicts preserve insertion order, so the longest-first order survives.
        best_by_text = {}
        for ent in kept:
            current = best_by_text.get(ent.text)
            if current is None or ent.base_probability > current.base_probability:
                best_by_text[ent.text] = ent
        kept = list(best_by_text.values())

    # Every kept text has at least two characters, so text[0] is always valid.
    return sorted(kept, key=lambda ent: ent.text[0])
|
130
|
+
|
131
|
+
def __call__(self, s: str, with_gender: bool = True, prob_threshold: float = .0, fast_mode: bool = False, *args, **kwargs):
    """Run entity extraction on ``s``.

    :param s: text to analyse.
    :param with_gender: forwarded to the selected extraction method.
    :param prob_threshold: forwarded to the selected extraction method.
    :param fast_mode: when true, delegate to ``_fast_extract``; otherwise
        delegate to ``_slow_extract``.
    :param args: accepted but not used.
    :param kwargs: only ``deduplicate`` is read (default ``True``), and only
        when ``fast_mode`` is false; it is forwarded to ``_slow_extract``.
    :return: whatever the selected extraction method returns.
    """
    if not fast_mode:
        deduplicate = kwargs.get('deduplicate', True)
        return self._slow_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold,
                                  deduplicate=deduplicate)
    return self._fast_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold)
|
@@ -1,10 +0,0 @@
|
|
1
|
-
from deeplotx.ner.named_entity import NamedEntity, NamedPerson
|
2
|
-
|
3
|
-
|
4
|
-
class BaseNER:
    """Base class for named-entity recognizers.

    Calling an instance delegates to ``extract_entities``, whose body here is
    a placeholder (``...``).
    NOTE(review): subclasses are presumably expected to override
    ``extract_entities`` -- confirm against the concrete recognizers.
    """

    def __init__(self): ...

    def __call__(self, s: str, *args, **kwargs) -> list[NamedEntity | NamedPerson]:
        """Extract entities from ``s`` by delegating to ``extract_entities``.

        :param s: text to analyse.
        :param args: extra positional arguments forwarded unchanged.
        :param kwargs: extra keyword arguments forwarded unchanged.
        :return: the result of ``extract_entities``.
        """
        # Forward `s` positionally. Writing `s=s, *args` lets the first extra
        # positional argument bind to `s` as well, which raises
        # "TypeError: got multiple values for argument 's'".
        return self.extract_entities(s, *args, **kwargs)

    def extract_entities(self, s: str, *args, **kwargs) -> list[NamedEntity | NamedPerson]: ...
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|