PyPI - deeplotx - Version diffs - 0.9.3.tar.gz → 0.9.5.tar.gz - Mend

deeplotx 0.9.3.tar.gz → 0.9.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) hide show

{deeplotx-0.9.3 → deeplotx-0.9.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.9.3
+Version: 0.9.5
 Summary: An out-of-the-box long-text NLP framework.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown

{deeplotx-0.9.3 → deeplotx-0.9.5}/deeplotx/ner/bert_ner.py RENAMED Viewed

@@ -11,6 +11,8 @@ from deeplotx.ner.base_ner import BaseNER
 from deeplotx.ner.named_entity import NamedEntity, NamedPerson
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
+NEW_LINE, BLANK = '\n', ' '
+DEFAULT_LENGTH_THRESHOLD = 384
 DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
 N2G_MODEL: list[Name2Gender] = []
 logger = logging.getLogger('deeplotx.ner')
@@ -44,11 +46,11 @@ class BertNER(BaseNER):
                                                                            trust_remote_code=True, local_files_only=True).to(self.device)
         self.embed_dim = self.encoder.config.max_position_embeddings
         self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
-        logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
+        logger.debug(f'{BertNER.__name__} initialized on device: {self.device}.')
     def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
-        s = f' {s} '
+        s = f' {s.replace(NEW_LINE, BLANK)} '
         raw_entities = self._ner_pipeline(s)
         entities = []
         for ent in raw_entities:
@@ -69,8 +71,21 @@ class BertNER(BaseNER):
                 break
         for ent in entities:
             ent[0] = ent[0].strip()
+            # stripping
+            while not ent[0][0].isalpha():
+                if len(ent[0]) < 2:
+                    break
+                if not ent[0][0].isnumeric():
+                    ent[0] = ent[0][1:]
+            while not ent[0][-1].isalpha():
+                if len(ent[0]) < 2:
+                    break
+                if not ent[0][-1].isnumeric():
+                    ent[0] = ent[0][:-1]
             if ent[1].upper().startswith('B'):
                 ent[1] = ent[1].upper()[1:].strip('-')
+        if len(entities) > 0:
+            logger.debug(f'Entities: {[_[0] for _ in entities]}, extracted from: "{s.strip()}".')
         entities = [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]
         if not with_gender:
             return entities
@@ -88,14 +103,19 @@ class BertNER(BaseNER):
         return entities
     def _slow_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0, deduplicate: bool = True) -> list[NamedEntity]:
-        _entities = self._fast_extract(s, with_gender=with_gender, prob_threshold=prob_threshold) if len(s) < 512 else []
-        if len(s) >= 512:
-            window_size: int = 512
-            offset = window_size // 6
-            for _offset in [- offset, offset]:
-                _window_size = window_size + _offset
-                for i in range(0, len(s) + _window_size, _window_size):
-                    _entities.extend(self._fast_extract(s[i: i + _window_size], with_gender=with_gender, prob_threshold=prob_threshold))
+        _length_threshold = DEFAULT_LENGTH_THRESHOLD
+        _s_seq = self.tokenizer.encode(s, add_special_tokens=False)
+        _entities = self._fast_extract(self.tokenizer.decode(_s_seq, skip_special_tokens=True),
+                                       with_gender=with_gender,
+                                       prob_threshold=prob_threshold) if len(_s_seq) < _length_threshold else []
+        # sliding window extracting
+        if len(_s_seq) >= _length_threshold:
+            _window_size = _length_threshold
+            _stride = _length_threshold // 4
+            for i in range(0, len(_s_seq) + _stride, _stride):
+                _window_text = self.tokenizer.decode(_s_seq[i: i + _window_size], skip_special_tokens=True)
+                _entities.extend(self._fast_extract(_window_text, with_gender=with_gender, prob_threshold=prob_threshold))
+        # entity combination
         _tmp_entities = sorted(_entities, key=lambda x: len(x.text), reverse=True)
         for _ent_i in _tmp_entities:
             for _ent_j in _entities:
@@ -103,6 +123,7 @@ class BertNER(BaseNER):
                         and len(_ent_j.text) != len(_ent_i.text)
                         and _ent_j in _tmp_entities):
                     _tmp_entities.remove(_ent_j)
+        # entity cleaning
         while True:
             for _ent in _tmp_entities:
                 if _ent.text not in s or len(_ent.text) < 2:
@@ -115,7 +136,8 @@ class BertNER(BaseNER):
             if not _continue:
                 break
         if not deduplicate:
-            return _tmp_entities
+            return sorted(_tmp_entities, key=lambda _: _.text[0], reverse=False)
+        # entity deduplication
         _fin_entities = dict()
         texts = set([text.text for text in _tmp_entities])
         for text in texts:
@@ -126,10 +148,11 @@ class BertNER(BaseNER):
                     else:
                         if _ent.base_probability > _fin_entities[_ent.text].base_probability:
                             _fin_entities[_ent.text] = _ent
-        return [v for k, v in _fin_entities.items()]
+        return sorted([v for k, v in _fin_entities.items()], key=lambda _: _.text[0], reverse=False)
     def __call__(self, s: str, with_gender: bool = True, prob_threshold: float = .0, fast_mode: bool = False, *args, **kwargs):
         if fast_mode:
             return self._fast_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold)
         else:
-            return self._slow_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold, deduplicate=True)
+            return self._slow_extract(s=s, with_gender=with_gender, prob_threshold=prob_threshold,
+                                      deduplicate=kwargs.get('deduplicate', True))

{deeplotx-0.9.3 → deeplotx-0.9.5}/deeplotx.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.9.3
+Version: 0.9.5
 Summary: An out-of-the-box long-text NLP framework.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown

{deeplotx-0.9.3 → deeplotx-0.9.5}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "deeplotx"
-version = "0.9.3"
+version = "0.9.5"
 description = "An out-of-the-box long-text NLP framework."
 readme = "README.md"
 requires-python = ">=3.10"