deeplotx 0.9.2__py3-none-any.whl → 0.9.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deeplotx/ner/base_ner.py CHANGED
@@ -4,7 +4,4 @@ from deeplotx.ner.named_entity import NamedEntity, NamedPerson
4
4
  class BaseNER:
5
5
  def __init__(self): ...
6
6
 
7
- def __call__(self, s: str, *args, **kwargs) -> list[NamedEntity | NamedPerson]:
8
- return self.extract_entities(s=s, *args, **kwargs)
9
-
10
- def extract_entities(self, s: str, *args, **kwargs) -> list[NamedEntity | NamedPerson]: ...
7
+ def __call__(self, s: str, *args, **kwargs) -> list[NamedEntity | NamedPerson]: ...
deeplotx/ner/bert_ner.py CHANGED
@@ -46,9 +46,9 @@ class BertNER(BaseNER):
46
46
  self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
47
47
  logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
48
48
 
49
- def extract_entities(self, s: str, with_gender: bool = True, prob_threshold: float = .0, *args, **kwargs) -> list[NamedEntity]:
49
+ def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
50
50
  assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
51
- s = ' ' + s
51
+ s = f' {s} '
52
52
  raw_entities = self._ner_pipeline(s)
53
53
  entities = []
54
54
  for ent in raw_entities:
@@ -87,5 +87,49 @@ class BertNER(BaseNER):
87
87
  gender_probability=gender_prob)
88
88
  return entities
89
89
 
90
- def __call__(self, s: str, with_gender: bool = True, prob_threshold: float = .0, *args, **kwargs):
91
- return self.extract_entities(s=s, with_gender=with_gender, prob_threshold=prob_threshold, *args, **kwargs)
90
+ def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0, deduplicate: bool = True) -> list[NamedEntity]:
91
+ _entities = self._fast_extract(s, with_gender=with_gender, prob_threshold=prob_threshold) if len(s) < 512 else []
92
+ if len(s) >= 512:
93
+ window_size: int = 512
94
+ offset = window_size // 6
95
+ for _offset in [- offset, offset]:
96
+ _window_size = window_size + _offset
97
+ for i in range(0, len(s) + _window_size, _window_size):
98
+ _entities.extend(self._fast_extract(s[i: i + _window_size], with_gender=with_gender, prob_threshold=prob_threshold))
99
+ _tmp_entities = sorted(_entities, key=lambda x: len(x.text), reverse=True)
100
+ for _ent_i in _tmp_entities:
101
+ for _ent_j in _entities:
102
+ if (_ent_j.text in _ent_i.text
103
+ and len(_ent_j.text) != len(_ent_i.text)
104
+ and _ent_j in _tmp_entities):
105
+ _tmp_entities.remove(_ent_j)
106
+ while True:
107
+ for _ent in _tmp_entities:
108
+ if _ent.text not in s or len(_ent.text) < 2:
109
+ _tmp_entities.remove(_ent)
110
+ _continue = False
111
+ for _ent in _tmp_entities:
112
+ if _ent.text not in s or len(_ent.text) < 2:
113
+ _continue = True
114
+ break
115
+ if not _continue:
116
+ break
117
+ if not deduplicate:
118
+ return _tmp_entities
119
+ _fin_entities = dict()
120
+ texts = set([text.text for text in _tmp_entities])
121
+ for text in texts:
122
+ for _ent in _tmp_entities:
123
+ if _ent.text == text:
124
+ if _ent.text not in _fin_entities.keys():
125
+ _fin_entities[_ent.text] = _ent
126
+ else:
127
+ if _ent.base_probability > _fin_entities[_ent.text].base_probability:
128
+ _fin_entities[_ent.text] = _ent
129
+ return [v for k, v in _fin_entities.items()]
130
+
131
+ def __call__(self, s: str, with_gender: bool = True, prob_threshold: float = .0, fast_mode: bool = False, *args, **kwargs):
132
+ if fast_mode:
133
+ return self._fast_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold)
134
+ else:
135
+ return self._slow_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold, deduplicate=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.9.2
3
+ Version: 0.9.3
4
4
  Summary: An out-of-the-box long-text NLP framework.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -4,8 +4,8 @@ deeplotx/encoder/encoder.py,sha256=wVRl3p_7eg7qT_tJEit5qnmZx7dXkMVLxAtao5vImkk,4
4
4
  deeplotx/encoder/long_text_encoder.py,sha256=4oRa9FqfGNZ8-gq14UKuhDkZC0A1Xi-wKmbQsn-uZ58,3966
5
5
  deeplotx/encoder/longformer_encoder.py,sha256=7Lm65AUD3qwbrzrhJ3dPZkyHeNRSapga3f-5QJCxV5A,3538
6
6
  deeplotx/ner/__init__.py,sha256=Rss1pup9HzHZCG8U9ub8niWa9zRjWCy3Z7zg378KZQg,114
7
- deeplotx/ner/base_ner.py,sha256=bAp7R6mawsfO7owBONXtbPN0rzMSltMJVEGGNKhi41A,359
8
- deeplotx/ner/bert_ner.py,sha256=RkqHVBY4SBJtHHR0YuR006v5gFmAaKqJCCKkOOs9ulY,5458
7
+ deeplotx/ner/base_ner.py,sha256=pZTl50OrHH_FJm4rKp9iuixeOE6FX_AzgDXD32aXsN0,204
8
+ deeplotx/ner/bert_ner.py,sha256=I8yFsarsLEQv0vcnNU2JIc0-LuPJcxaO-mLhDFCh1PI,7704
9
9
  deeplotx/ner/named_entity.py,sha256=c6XufIwH6yloJ-ccUjagf4mBl1XbbYDT8xyEJJ_-ZNs,269
10
10
  deeplotx/ner/n2g/__init__.py,sha256=b6fOWJVLaOCtoz8Qlp8NWQbL5lUSbn6H3-8fnVNIPi0,3940
11
11
  deeplotx/nn/__init__.py,sha256=YILwbxb-NHdiJjfOwBKH8F7PuZSDZSrGpTznPDucTro,710
@@ -33,8 +33,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=TFxOX8rWU_zKliI9zm
33
33
  deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
34
34
  deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
35
35
  deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
36
- deeplotx-0.9.2.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
37
- deeplotx-0.9.2.dist-info/METADATA,sha256=lA_h92G6v6cT3ff94pmxVAi0LLj-qO2qrEjAVLFTYHw,13472
38
- deeplotx-0.9.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
- deeplotx-0.9.2.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
40
- deeplotx-0.9.2.dist-info/RECORD,,
36
+ deeplotx-0.9.3.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
37
+ deeplotx-0.9.3.dist-info/METADATA,sha256=Fg0KzWIxFcMtuTfmuQ9BBJDFXjNTWtl9l3Cuuc1sX3I,13472
38
+ deeplotx-0.9.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
39
+ deeplotx-0.9.3.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
40
+ deeplotx-0.9.3.dist-info/RECORD,,