PyPI - deeplotx - Versions diffs - 0.8.7__tar.gz → 0.9.0__tar.gz - Mend

deeplotx 0.8.7__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

{deeplotx-0.8.7 → deeplotx-0.9.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.7
+Version: 0.9.0
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -10,10 +10,13 @@ Requires-Dist: jupyter
 Requires-Dist: numpy
 Requires-Dist: protobuf
 Requires-Dist: python-dotenv
+Requires-Dist: sentencepiece
+Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions
 Requires-Dist: vortezwohl>=0.0.8
+Requires-Dist: name2gender>=0.0.4a0
 Dynamic: license-file
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/__init__.py RENAMED Viewed

@@ -4,6 +4,7 @@ import os
 __ROOT__ = os.path.dirname(os.path.abspath(__file__))
 from .encoder import Encoder, LongTextEncoder, LongformerEncoder
+from .ner import BertNER, NamedEntity
 from .nn import (
     FeedForward,
     MultiHeadFeedForward,
@@ -40,3 +41,5 @@ logger = logging.getLogger('deeplotx.trainer')
 logger.setLevel(logging.DEBUG)
 logger = logging.getLogger('deeplotx.embedding')
 logger.setLevel(logging.DEBUG)
+logger = logging.getLogger('deeplotx.ner')
+logger.setLevel(logging.DEBUG)

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/encoder.py RENAMED Viewed

@@ -43,9 +43,11 @@ class Encoder(nn.Module):
         self.embed_dim = self.encoder.config.max_position_embeddings
         logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True,
+                *args, **kwargs) -> torch.Tensor:
         def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-            return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
+            emb_seq = self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state
+            return emb_seq[:, 0, :] if cls_only else emb_seq
         num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
         chunks, chunk_results = [], []
@@ -58,9 +60,9 @@ class Encoder(nn.Module):
         with torch.no_grad():
             chunk_results = [_encoder(x) for x in chunks]
         self.encoder.train(mode=ori_mode)
-        return torch.cat(chunk_results, dim=-1)
+        return torch.cat(chunk_results, dim=-1) if cls_only else torch.cat(chunk_results, dim=-2)
-    def encode(self, text: str) -> torch.Tensor:
+    def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
         _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
         _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-        return self.forward(_input_ids, _att_mask).squeeze()
+        return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/long_text_encoder.py RENAMED Viewed

@@ -25,7 +25,7 @@ class LongTextEncoder(Encoder):
         self._worker_group = ThreadPool(max_workers=max_workers)
     def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
-        return idx, super().forward(x, attention_mask=mask)
+        return idx, super().forward(x, attention_mask=mask, cls_only=True)
     @override
     def forward(self, text: str, flatten: bool = False, *args, **kwargs) -> torch.Tensor:

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/longformer_encoder.py RENAMED Viewed

@@ -9,7 +9,7 @@ from requests.exceptions import ConnectTimeout, SSLError
 from deeplotx import __ROOT__
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
-DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+DEFAULT_LONGFORMER = 'severinsimmler/xlm-roberta-longformer-base-16384'
 logger = logging.getLogger('deeplotx.embedding')
@@ -41,15 +41,16 @@ class LongformerEncoder(nn.Module):
                                                      trust_remote_code=True, local_files_only=True).to(self.device)
         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True) -> torch.Tensor:
         ori_mode = self.encoder.training
         self.encoder.eval()
         with torch.no_grad():
-            res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+            emb_seq = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state
+            res = emb_seq[:, 0, :] if cls_only else emb_seq
         self.encoder.train(mode=ori_mode)
         return res
-    def encode(self, text: str) -> torch.Tensor:
+    def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
         _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
         _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-        return self.forward(_input_ids, _att_mask).squeeze()
+        return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()

deeplotx-0.9.0/deeplotx/ner/__init__.py ADDED Viewed

@@ -0,0 +1,3 @@
+from .named_entity import NamedEntity
+from .base_ner import BaseNER
+from .bert_ner import BertNER

deeplotx-0.9.0/deeplotx/ner/base_ner.py ADDED Viewed

@@ -0,0 +1,7 @@
+from deeplotx.ner.named_entity import NamedEntity
+class BaseNER:
+    def __init__(self): ...
+    def extract_entities(self, s: str, *args, **kwargs) -> list[NamedEntity]: ...

deeplotx-0.9.0/deeplotx/ner/bert_ner.py ADDED Viewed

@@ -0,0 +1,72 @@
+import logging
+import os
+from requests.exceptions import ConnectTimeout, SSLError
+import torch
+from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
+from deeplotx import __ROOT__
+from deeplotx.ner.base_ner import BaseNER
+from deeplotx.ner.named_entity import NamedEntity
+CACHE_PATH = os.path.join(__ROOT__, '.cache')
+DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
+logger = logging.getLogger('deeplotx.ner')
+class BertNER(BaseNER):
+    def __init__(self, model_name_or_path: str = DEFAULT_BERT_NER, device: str | None = None):
+        super().__init__()
+        self.device = torch.device(device) if device is not None \
+            else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True)
+            self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                                           trust_remote_code=True).to(self.device)
+        except ConnectTimeout:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                                           trust_remote_code=True, local_files_only=True).to(self.device)
+        except SSLError:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                                           trust_remote_code=True, local_files_only=True).to(self.device)
+        self.embed_dim = self.encoder.config.max_position_embeddings
+        self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
+        logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
+    def extract_entities(self, s: str, prob_threshold: float = .0, *args, **kwargs) -> list[NamedEntity]:
+        assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
+        s = ' ' + s
+        raw_entities = self._ner_pipeline(s)
+        entities = []
+        for ent in raw_entities:
+            entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
+        while True:
+            for i, ent in enumerate(entities):
+                if len(ent[0].strip()) < 1:
+                    del entities[i]
+                if ent[1].upper().startswith('I') and entities[i - 1][1].upper().startswith('B'):
+                    entities[i - 1][0] += ent[0]
+                    entities[i - 1][2] *= ent[2]
+                    del entities[i]
+            _continue = False
+            for ent in entities:
+                if ent[1].upper().startswith('I'):
+                    _continue = True
+            if not _continue:
+                break
+        for ent in entities:
+            ent[0] = ent[0].strip()
+            if ent[1].upper().startswith('B'):
+                ent[1] = ent[1].upper()[1:].strip('-')
+        return [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]

deeplotx-0.9.0/deeplotx/ner/named_entity.py ADDED Viewed

@@ -0,0 +1,8 @@
+from dataclasses import dataclass
+@dataclass
+class NamedEntity:
+    text: str
+    type: str
+    probability: float

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.7
+Version: 0.9.0
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -10,10 +10,13 @@ Requires-Dist: jupyter
 Requires-Dist: numpy
 Requires-Dist: protobuf
 Requires-Dist: python-dotenv
+Requires-Dist: sentencepiece
+Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions
 Requires-Dist: vortezwohl>=0.0.8
+Requires-Dist: name2gender>=0.0.4a0
 Dynamic: license-file
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/SOURCES.txt RENAMED Viewed

@@ -11,6 +11,10 @@ deeplotx/encoder/__init__.py
 deeplotx/encoder/encoder.py
 deeplotx/encoder/long_text_encoder.py
 deeplotx/encoder/longformer_encoder.py
+deeplotx/ner/__init__.py
+deeplotx/ner/base_ner.py
+deeplotx/ner/bert_ner.py
+deeplotx/ner/named_entity.py
 deeplotx/nn/__init__.py
 deeplotx/nn/attention.py
 deeplotx/nn/auto_regression.py

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/requires.txt RENAMED Viewed

@@ -3,7 +3,10 @@ jupyter
 numpy
 protobuf
 python-dotenv
+sentencepiece
+tiktoken
 torch
 transformers
 typing-extensions
 vortezwohl>=0.0.8
+name2gender>=0.0.4a0

{deeplotx-0.8.7 → deeplotx-0.9.0}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "deeplotx"
-version = "0.8.7"
+version = "0.9.0"
 description = "Easy-2-use long text NLP toolkit."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -10,8 +10,11 @@ dependencies = [
     "numpy",
     "protobuf",
     "python-dotenv",
+    "sentencepiece",
+    "tiktoken",
     "torch",
     "transformers",
     "typing-extensions",
     "vortezwohl>=0.0.8",
+    "name2gender>=0.0.4a0",
 ]

{deeplotx-0.8.7 → deeplotx-0.9.0}/LICENSE RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/README.md RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/encoder/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/attention.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/auto_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/base_neural_network.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/feed_forward.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/linear_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/logistic_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/long_context_auto_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/long_context_recursive_sequential.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/multi_head_attention.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/multi_head_feed_forward.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/recursive_sequential.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/roformer_encoder.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/rope.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/nn/softmax_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/distribution.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/set.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/similarity/vector.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/trainer/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/trainer/base_trainer.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/trainer/text_binary_classification_trainer.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/util/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/util/hash.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx/util/read_file.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/deeplotx.egg-info/top_level.txt RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.9.0}/setup.cfg RENAMED Viewed

File without changes

deeplotx 0.8.7__tar.gz → 0.9.0__tar.gz

deeplotx 0.8.7__tar.gz → 0.9.0__tar.gz