deeplotx 0.9.8a0__py3-none-any.whl → 0.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeplotx/__init__.py +0 -1
- deeplotx/ner/bert_ner.py +9 -1
- deeplotx/ner/n2g/__init__.py +17 -23
- deeplotx/util/__init__.py +1 -1
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/METADATA +1 -1
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/RECORD +9 -12
- deeplotx/trainer/__init__.py +0 -1
- deeplotx/trainer/base_trainer.py +0 -13
- deeplotx/trainer/text_binary_classification_trainer.py +0 -103
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/WHEEL +0 -0
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/licenses/LICENSE +0 -0
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/top_level.txt +0 -0
deeplotx/__init__.py
CHANGED
deeplotx/ner/bert_ner.py
CHANGED
@@ -43,12 +43,15 @@ class BertNER(BaseNER):
 
     def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
+        # entity length cannot be longer than the whole seq
+        __max_search_backward = len(self.tokenizer.encode(s, add_special_tokens=False))
         s = f' {s.replace(NEW_LINE, BLANK * 2)} '
         raw_entities = self._ner_pipeline(s)
         entities = []
         for ent in raw_entities:
             entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
-
+        __search_backward = -2
+        while __search_backward < __max_search_backward:
             for i, ent in enumerate(entities):
                 if len(ent[0].strip()) < 1:
                     del entities[i]
@@ -65,6 +68,11 @@ class BertNER(BaseNER):
                     _continue = True
             if not _continue:
                 break
+            __search_backward += 1
+        # adjust all I-ENTs
+        for ent in entities:
+            if ent[1].upper().startswith('I'):
+                ent[1] = f'B{ent[1][1:]}'
         for ent in entities:
             ent[0] = ent[0].strip()
             if len(ent[0]) < 1:
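Taken together, the new code caps the backward search at the tokenized length of the input and, once the loop exits, promotes any remaining I- (inside) tag to a B- (beginning) tag. A minimal, standalone sketch of that tag adjustment, using a hypothetical entities list in the [text, tag, score] layout built above:

entities = [['Alice', 'I-PER', 0.98], ['Berlin', 'B-LOC', 0.95]]
for ent in entities:
    if ent[1].upper().startswith('I'):
        ent[1] = f'B{ent[1][1:]}'  # 'I-PER' -> 'B-PER'
print(entities)  # [['Alice', 'B-PER', 0.98], ['Berlin', 'B-LOC', 0.95]]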
deeplotx/ner/n2g/__init__.py
CHANGED
@@ -13,13 +13,12 @@ from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 
 __CACHE_DIR__ = os.path.join(__ROOT__, '.cache', '.n2g')
 ENCODER = Encoder(model_name_or_path='FacebookAI/xlm-roberta-base')
-
-SMALL_MODEL = 'name2gender-small'
+DEFAULT_MODEL = 'name2gender-small'
 _MIN_FILE_SIZE = 1024 * 5
 
 
 def download_model(model_name: str):
-    quiet = bool(os.getenv('
+    quiet = bool(os.getenv('N2G_QUIET_DOWNLOAD', False))
     os.makedirs(__CACHE_DIR__, exist_ok=True)
     _proxies = {
         'http': os.getenv('HTTP_PROXY', os.getenv('http_proxy')),
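Note that the new N2G_QUIET_DOWNLOAD flag is read with bool(os.getenv(...)), so any non-empty value, including '0' or 'false', enables quiet mode; only an unset or empty-string variable keeps downloads verbose. A standalone illustration of that truthiness:

import os

os.environ['N2G_QUIET_DOWNLOAD'] = '0'
print(bool(os.getenv('N2G_QUIET_DOWNLOAD', False)))  # True: any non-empty string is truthy

os.environ['N2G_QUIET_DOWNLOAD'] = ''
print(bool(os.getenv('N2G_QUIET_DOWNLOAD', False)))  # False: empty string is falsy

del os.environ['N2G_QUIET_DOWNLOAD']
print(bool(os.getenv('N2G_QUIET_DOWNLOAD', False)))  # False: falls back to the False default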
@@ -51,25 +50,20 @@ def download_model(model_name: str):
 
 def load_model(model_name: str = 'name2gender-small', dtype: torch.dtype | None = torch.float16) -> BaseNeuralNetwork:
     n2g_model = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        download_model(SMALL_MODEL)
-        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                       num_heads=6, num_layers=2,
-                                       head_layers=1, expansion_factor=1.5,
-                                       model_name=SMALL_MODEL, dtype=dtype)
+    if 'base' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=12, num_layers=4,
+                                       head_layers=1, expansion_factor=2,
+                                       model_name=model_name, dtype=dtype)
+    elif 'small' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=6, num_layers=2,
+                                       head_layers=1, expansion_factor=1.5,
+                                       model_name=model_name, dtype=dtype)
+    else:
+        raise FileNotFoundError(f"Model \"{model_name}\" doesn't exists.")
     return n2g_model.load(model_dir=__CACHE_DIR__)
 
 
@@ -77,7 +71,7 @@ class Name2Gender:
     def __init__(self, model: BaseNeuralNetwork | None = None):
         super().__init__()
         if model is None:
-            model = load_model(
+            model = load_model(DEFAULT_MODEL)
         self._model = model
 
     def __call__(self, name: str, return_probability: bool = False, threshold: float = .5) -> tuple[Gender, float] | Gender:
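After this change, load_model picks the architecture from the model name ('base' builds the larger head with num_heads=12 and num_layers=4, 'small' the lighter one, anything else raises FileNotFoundError), and Name2Gender() falls back to DEFAULT_MODEL. A usage sketch, assuming deeplotx 0.9.10 and network access for the download; 'name2gender-base' is a hypothetical checkpoint name used only to exercise the 'base' branch:

from deeplotx.ner.n2g import DEFAULT_MODEL, Name2Gender, load_model

n2g = Name2Gender()  # no model passed: loads DEFAULT_MODEL ('name2gender-small')
gender = n2g('Alice')  # Gender only
gender, prob = n2g('Alice', return_probability=True, threshold=.5)  # with probability

# Hypothetical larger variant, if such a checkpoint is published:
n2g_base = Name2Gender(model=load_model('name2gender-base'))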
deeplotx/util/__init__.py
CHANGED
@@ -1,2 +1,2 @@
 from .hash import md5, sha1, sha256, sha512
-from .read_file import read_file, get_files
+from .read_file import read_file, write_file, get_files
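deeplotx.util now re-exports write_file alongside read_file and get_files. A round-trip sketch; the diff does not show write_file's signature, so the (path, content) argument order and read_file returning the file's text are assumptions here:

from deeplotx.util import read_file, write_file

write_file('note.txt', 'hello deeplotx')  # assumed (path, content) order
assert read_file('note.txt') == 'hello deeplotx'  # assumes read_file returns the text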
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/RECORD
CHANGED
@@ -1,13 +1,13 @@
-deeplotx/__init__.py,sha256=
+deeplotx/__init__.py,sha256=0OWLsgXlStzwm0m9ScaoZvBnsx3a0xTmlzYBUgarl-g,1306
 deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
 deeplotx/encoder/encoder.py,sha256=uJswUSrYVDWP84HeCD40R9KgXGPUEa080konv7jEp8I,3539
 deeplotx/encoder/long_text_encoder.py,sha256=4oRa9FqfGNZ8-gq14UKuhDkZC0A1Xi-wKmbQsn-uZ58,3966
 deeplotx/encoder/longformer_encoder.py,sha256=mfAI_NE3QQZhvGHbZkP7S6g0Jj59wmLWQ9QW7HOjqm0,2876
 deeplotx/ner/__init__.py,sha256=Rss1pup9HzHZCG8U9ub8niWa9zRjWCy3Z7zg378KZQg,114
 deeplotx/ner/base_ner.py,sha256=pZTl50OrHH_FJm4rKp9iuixeOE6FX_AzgDXD32aXsN0,204
-deeplotx/ner/bert_ner.py,sha256=
+deeplotx/ner/bert_ner.py,sha256=tfbM3CQBEpZsD0KYVA7GVNJax-7kzNOQgwlpo2S8h-c,8986
 deeplotx/ner/named_entity.py,sha256=c6XufIwH6yloJ-ccUjagf4mBl1XbbYDT8xyEJJ_-ZNs,269
-deeplotx/ner/n2g/__init__.py,sha256=
+deeplotx/ner/n2g/__init__.py,sha256=L1IJ8W1nApzqHx2u7JMtPCLfABm5qKJvh_bHMWdvdLY,3538
 deeplotx/nn/__init__.py,sha256=YILwbxb-NHdiJjfOwBKH8F7PuZSDZSrGpTznPDucTro,710
 deeplotx/nn/attention.py,sha256=R-i-Rd7gnsh6hwXDeYfqLQOJvfSZIGfQbFzRlC91XLo,2879
 deeplotx/nn/auto_regression.py,sha256=j_R7WGPq9REngjpLuX5c0AaNqOpgGm2Vfrolw-XjWXw,877
@@ -27,14 +27,11 @@ deeplotx/similarity/__init__.py,sha256=s3u-KSgxjnMcWpIItKgXNltFMPQ7YY3CqsqHI-5F1
 deeplotx/similarity/distribution.py,sha256=wQGouuuW531pZeBRKBujXsdsoz4fDnPw7_GW81jwepc,1066
 deeplotx/similarity/set.py,sha256=zhGFxtSIXlWqvipBYzoiPahp4g0boAIoUiMfG0wl07A,686
 deeplotx/similarity/vector.py,sha256=WVbDHqykt-fvuILVrhUCtIFAOEjY_zvttrXGM9eylG0,1125
-deeplotx/
-deeplotx/trainer/base_trainer.py,sha256=z0MeAT-rRYmjeBXt0ckt7J1itYArR0Cx02wHesXUoZE,385
-deeplotx/trainer/text_binary_classification_trainer.py,sha256=TFxOX8rWU_zKliI9zm7F5ZH7snR2d-sk95s3pfTmm78,6601
-deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
+deeplotx/util/__init__.py,sha256=ppQwp3A4rhAWBQ7DEobIYxloIiythxxUswCn-7UrMeA,102
 deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
 deeplotx/util/read_file.py,sha256=O9nieNgAGQ7Ct1EFxCdcgL6hVs5s2Vw_ItcUK6VeTwY,981
-deeplotx-0.9.
-deeplotx-0.9.
-deeplotx-0.9.
-deeplotx-0.9.
-deeplotx-0.9.
+deeplotx-0.9.10.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.9.10.dist-info/METADATA,sha256=6m2igF02QAdr5xns8BamtyMfrgLwBsuej2jihR36dCE,14443
+deeplotx-0.9.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deeplotx-0.9.10.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.9.10.dist-info/RECORD,,
deeplotx/trainer/__init__.py
DELETED
@@ -1 +0,0 @@
-from .text_binary_classification_trainer import TextBinaryClassifierTrainer
deeplotx/trainer/base_trainer.py
DELETED
@@ -1,13 +0,0 @@
-from abc import abstractmethod
-
-from deeplotx.nn.base_neural_network import BaseNeuralNetwork
-
-
-class BaseTrainer(object):
-    def __init__(self, batch_size: int, train_ratio: float):
-        self._batch_size = batch_size
-        self._train_ratio = train_ratio
-        self.model = None
-
-    @abstractmethod
-    def train(self, *args, **kwargs) -> BaseNeuralNetwork: ...
deeplotx/trainer/text_binary_classification_trainer.py
DELETED
@@ -1,103 +0,0 @@
-import logging
-from typing_extensions import override
-
-import torch
-from torch import nn, optim
-from torch.utils.data import DataLoader, TensorDataset
-
-from deeplotx.encoder.long_text_encoder import LongTextEncoder
-from deeplotx.nn.attention import DEFAULT_THETA
-from deeplotx.nn.long_context_recursive_sequential import LongContextRecursiveSequential
-from deeplotx.trainer.base_trainer import BaseTrainer
-
-logger = logging.getLogger('deeplotx.trainer')
-
-
-class TextBinaryClassifierTrainer(BaseTrainer):
-    def __init__(self, long_text_encoder: LongTextEncoder, batch_size: int = 2, train_ratio: float = 0.8):
-        super().__init__(batch_size=batch_size, train_ratio=train_ratio)
-        self._long_text_encoder = long_text_encoder
-        self.device = self._long_text_encoder.device
-        self.train_dataset_loader = None
-        self.valid_dataset_loader = None
-
-    @override
-    def train(self, positive_texts: list[str], negative_texts: list[str],
-              num_epochs: int, learning_rate: float = 2e-6, balancing_dataset: bool = True,
-              train_loss_threshold: float = 0.0, valid_loss_threshold: float = 0.0,
-              alpha: float = 1e-4, rho: float = 0.2, encoder_layers: int = 4, attn_heads: int = 6,
-              recursive_layers: int = 2, recursive_hidden_dim: int = 256, **kwargs) -> LongContextRecursiveSequential:
-        if balancing_dataset:
-            min_length = min(len(positive_texts), len(negative_texts))
-            positive_texts = positive_texts[:min_length]
-            negative_texts = negative_texts[:min_length]
-        all_texts = positive_texts + negative_texts
-        text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
-        feature_dim = text_embeddings[0].shape[-1]
-        dtype = text_embeddings[0].dtype
-        labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]
-                  + [torch.tensor([.0], dtype=dtype, device=self.device) for _ in range(len(negative_texts))])
-        inputs = torch.stack(text_embeddings).to(self.device)
-        labels = torch.stack(labels).to(self.device)
-        dataset_size = len(labels)
-        train_size = int(self._train_ratio * dataset_size)
-        train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
-        valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])
-        self.train_dataset_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
-        self.valid_dataset_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
-        if self.model is not None and self.model.in_features != feature_dim:
-            logger.warning("The dimension of features doesn't match. A new model instance will be created.")
-            self.model = None
-        if self.model is None:
-            ffn_heads = kwargs.get('ffn_heads', 2)
-            ffn_layers = kwargs.get('ffn_layers', 5)
-            ffn_expansion_factor = kwargs.get('ffn_expansion_factor', 2)
-            bias = kwargs.get('bias', True)
-            dropout_rate = kwargs.get('dropout_rate', 0.1)
-            encoder_ffn_layers = kwargs.get('encoder_ffn_layers', ffn_layers)
-            encoder_expansion_factor = kwargs.get('encoder_expansion_factor', ffn_expansion_factor)
-            encoder_dropout_rate = kwargs.get('encoder_dropout_rate', dropout_rate)
-            attn_ffn_layers = kwargs.get('attn_ffn_layers', 1)
-            attn_expansion_factor = kwargs.get('attn_expansion_factor', ffn_expansion_factor)
-            attn_dropout_rate = kwargs.get('attn_dropout_rate', dropout_rate)
-            theta = kwargs.get('theta', DEFAULT_THETA)
-            self.model = LongContextRecursiveSequential(input_dim=feature_dim, output_dim=1, bias=bias,
-                                                        encoder_layers=encoder_layers, attn_heads=attn_heads,
-                                                        recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
-                                                        ffn_layers=ffn_layers, ffn_heads=ffn_heads, ffn_expansion_factor=ffn_expansion_factor,
-                                                        dropout_rate=dropout_rate, encoder_ffn_layers=encoder_ffn_layers,
-                                                        encoder_expansion_factor=encoder_expansion_factor, encoder_dropout_rate=encoder_dropout_rate,
-                                                        attn_ffn_layers=attn_ffn_layers, attn_expansion_factor=attn_expansion_factor,
-                                                        attn_dropout_rate=attn_dropout_rate, theta=theta).initialize_weights()
-        logger.debug(f'Training Model: \n{self.model}')
-        loss_function = nn.BCELoss()
-        optimizer = optim.Adamax(self.model.parameters(), lr=learning_rate)
-        for epoch in range(num_epochs):
-            self.model.train()
-            total_loss = 0.0
-            for batch_texts, batch_labels in self.train_dataset_loader:
-                outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-                total_loss += loss.item()
-            if epoch % 3 == 0:
-                total_valid_loss = 0.0
-                for batch_texts, batch_labels in self.valid_dataset_loader:
-                    with torch.no_grad():
-                        self.model.eval()
-                        outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                        loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                        total_valid_loss += loss.item()
-                self.model.train()
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | "
-                             f"Train Loss: {total_loss:.4f} | "
-                             f"Valid Loss: {total_valid_loss:.4f}")
-                if total_valid_loss < valid_loss_threshold:
-                    break
-            else:
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {total_loss:.4f}")
-                if total_loss < train_loss_threshold:
-                    break
-        return self.model
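For reference, the deleted trainer was driven roughly as below; this is a sketch reconstructed from the removed code above, with LongTextEncoder constructor arguments left out because they are not part of this diff. 0.9.10 ships no in-package replacement, and the trainer entries are gone from RECORD as well.

# Pre-0.9.10 API, removed in this release.
from deeplotx.encoder.long_text_encoder import LongTextEncoder
from deeplotx.trainer import TextBinaryClassifierTrainer

encoder = LongTextEncoder()  # constructor args not shown in this diff
trainer = TextBinaryClassifierTrainer(long_text_encoder=encoder, batch_size=2, train_ratio=0.8)
model = trainer.train(positive_texts=['good text ...'], negative_texts=['bad text ...'],
                      num_epochs=30)  # returns a trained LongContextRecursiveSequential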
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/WHEEL
File without changes
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/licenses/LICENSE
File without changes
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/top_level.txt
File without changes