deeplotx 0.9.9__tar.gz → 0.9.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {deeplotx-0.9.9 → deeplotx-0.9.11}/PKG-INFO +2 -2
  2. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/__init__.py +0 -1
  3. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/bert_ner.py +9 -1
  4. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/n2g/__init__.py +17 -23
  5. deeplotx-0.9.11/deeplotx/util/__init__.py +2 -0
  6. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/PKG-INFO +2 -2
  7. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/SOURCES.txt +1 -6
  8. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/requires.txt +1 -1
  9. {deeplotx-0.9.9 → deeplotx-0.9.11}/pyproject.toml +2 -2
  10. deeplotx-0.9.9/deeplotx/trainer/__init__.py +0 -1
  11. deeplotx-0.9.9/deeplotx/trainer/base_trainer.py +0 -13
  12. deeplotx-0.9.9/deeplotx/trainer/text_binary_classification_trainer.py +0 -103
  13. deeplotx-0.9.9/deeplotx/util/__init__.py +0 -2
  14. deeplotx-0.9.9/deeplotx/util/hash.py +0 -29
  15. deeplotx-0.9.9/deeplotx/util/read_file.py +0 -32
  16. {deeplotx-0.9.9 → deeplotx-0.9.11}/LICENSE +0 -0
  17. {deeplotx-0.9.9 → deeplotx-0.9.11}/README.md +0 -0
  18. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/__init__.py +0 -0
  19. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/encoder.py +0 -0
  20. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/long_text_encoder.py +0 -0
  21. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/longformer_encoder.py +0 -0
  22. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/__init__.py +0 -0
  23. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/base_ner.py +0 -0
  24. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/named_entity.py +0 -0
  25. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/__init__.py +0 -0
  26. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/attention.py +0 -0
  27. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/auto_regression.py +0 -0
  28. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/base_neural_network.py +0 -0
  29. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/feed_forward.py +0 -0
  30. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/linear_regression.py +0 -0
  31. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/logistic_regression.py +0 -0
  32. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/long_context_auto_regression.py +0 -0
  33. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  34. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/multi_head_attention.py +0 -0
  35. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/multi_head_feed_forward.py +0 -0
  36. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/recursive_sequential.py +0 -0
  37. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/roformer_encoder.py +0 -0
  38. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/rope.py +0 -0
  39. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/softmax_regression.py +0 -0
  40. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/__init__.py +0 -0
  41. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/distribution.py +0 -0
  42. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/set.py +0 -0
  43. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/vector.py +0 -0
  44. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/dependency_links.txt +0 -0
  45. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/top_level.txt +0 -0
  46. {deeplotx-0.9.9 → deeplotx-0.9.11}/setup.cfg +0 -0
{deeplotx-0.9.9 → deeplotx-0.9.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.9.9
+Version: 0.9.11
 Summary: An out-of-the-box long-text NLP framework.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -15,7 +15,7 @@ Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions
-Requires-Dist: vortezwohl>=0.0.8
+Requires-Dist: vortezwohl>=0.0.10
 Requires-Dist: name4py>=0.1.4
 Dynamic: license-file
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/__init__.py
@@ -21,7 +21,6 @@ from .nn import (
     AutoRegression,
     LongContextAutoRegression
 )
-from .trainer import TextBinaryClassifierTrainer

 __AUTHOR__ = '吴子豪 / Vortez Wohl'
 __EMAIL__ = 'vortez.wohl@gmail.com'
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/bert_ner.py
@@ -43,12 +43,15 @@ class BertNER(BaseNER):

     def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
+        # entity length cannot be longer than the whole seq
+        __max_search_backward = len(self.tokenizer.encode(s, add_special_tokens=False))
         s = f' {s.replace(NEW_LINE, BLANK * 2)} '
         raw_entities = self._ner_pipeline(s)
         entities = []
         for ent in raw_entities:
             entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
-        while True:
+        __search_backward = -2
+        while __search_backward < __max_search_backward:
             for i, ent in enumerate(entities):
                 if len(ent[0].strip()) < 1:
                     del entities[i]
@@ -65,6 +68,11 @@ class BertNER(BaseNER):
                     _continue = True
             if not _continue:
                 break
+            __search_backward += 1
+        # adjust all I-ENTs
+        for ent in entities:
+            if ent[1].upper().startswith('I'):
+                ent[1] = f'B{ent[1][1:]}'
         for ent in entities:
             ent[0] = ent[0].strip()
             if len(ent[0]) < 1:
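Reviewer note: the two hunks above replace an unbounded `while True` merge loop with one capped by the token length of the input (an entity can never span more tokens than the whole sequence), and then normalize leftover inside-tags to begin-tags once merging stops. A minimal standalone sketch of that tag normalization; the entity tuples here are illustrative, but the `[text, tag, score]` layout mirrors how `_fast_extract` builds them:

```python
# Sketch of the I- to B- tag normalization added in 0.9.11.
entities = [['Alice', 'I-PER', 0.98], ['Acme', 'B-ORG', 0.95]]  # hypothetical data

for ent in entities:
    if ent[1].upper().startswith('I'):
        # 'I-PER' -> 'B-PER': replace the leading 'I', keep the rest of the tag
        ent[1] = f'B{ent[1][1:]}'

print(entities)  # [['Alice', 'B-PER', 0.98], ['Acme', 'B-ORG', 0.95]]
```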
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/n2g/__init__.py
@@ -13,13 +13,12 @@ from deeplotx.nn.base_neural_network import BaseNeuralNetwork

 __CACHE_DIR__ = os.path.join(__ROOT__, '.cache', '.n2g')
 ENCODER = Encoder(model_name_or_path='FacebookAI/xlm-roberta-base')
-BASE_MODEL = 'name2gender-base'
-SMALL_MODEL = 'name2gender-small'
+DEFAULT_MODEL = 'name2gender-small'
 _MIN_FILE_SIZE = 1024 * 5


 def download_model(model_name: str):
-    quiet = bool(os.getenv('QUIET_DOWNLOAD', False))
+    quiet = bool(os.getenv('N2G_QUIET_DOWNLOAD', False))
     os.makedirs(__CACHE_DIR__, exist_ok=True)
     _proxies = {
         'http': os.getenv('HTTP_PROXY', os.getenv('http_proxy')),
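Reviewer note: the quiet-download switch is renamed from `QUIET_DOWNLOAD` to `N2G_QUIET_DOWNLOAD`, scoping it to the n2g downloader. Because the flag is read with `bool(os.getenv(...))`, any non-empty string enables it, `'0'` included. A hedged usage sketch:

```python
import os

# Suppress n2g model-download output. 0.9.11 reads N2G_QUIET_DOWNLOAD
# (0.9.9 read QUIET_DOWNLOAD); any non-empty value counts as truthy here.
os.environ['N2G_QUIET_DOWNLOAD'] = '1'
```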
@@ -51,25 +50,20 @@ def download_model(model_name: str):

 def load_model(model_name: str = 'name2gender-small', dtype: torch.dtype | None = torch.float16) -> BaseNeuralNetwork:
     n2g_model = None
-    match model_name:
-        case 'name2gender-base' | 'n2g-base' | 'base':
-            download_model(BASE_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=12, num_layers=4,
-                                           head_layers=1, expansion_factor=2,
-                                           model_name=BASE_MODEL, dtype=dtype)
-        case 'name2gender-small' | 'n2g-base' | 'small':
-            download_model(SMALL_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=6, num_layers=2,
-                                           head_layers=1, expansion_factor=1.5,
-                                           model_name=SMALL_MODEL, dtype=dtype)
-        case _:
-            download_model(SMALL_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=6, num_layers=2,
-                                           head_layers=1, expansion_factor=1.5,
-                                           model_name=SMALL_MODEL, dtype=dtype)
+    if 'base' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=12, num_layers=4,
+                                       head_layers=1, expansion_factor=2,
+                                       model_name=model_name, dtype=dtype)
+    elif 'small' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=6, num_layers=2,
+                                       head_layers=1, expansion_factor=1.5,
+                                       model_name=model_name, dtype=dtype)
+    else:
+        raise FileNotFoundError(f"Model \"{model_name}\" doesn't exists.")
     return n2g_model.load(model_dir=__CACHE_DIR__)
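Reviewer note: this rewrite also changes failure behavior. The 0.9.9 `match` silently fell back to the small model for unknown names (and its small branch appears to match `'n2g-base'` by typo), whereas 0.9.11 dispatches on substrings and raises `FileNotFoundError` for anything containing neither `'base'` nor `'small'`. A hedged sketch of the new dispatch (model weights are fetched on first use):

```python
from deeplotx.ner.n2g import load_model  # module path per the file list above

small = load_model('name2gender-small')   # 'small' substring -> 6-head, 2-layer config
base = load_model('name2gender-base')     # 'base' substring  -> 12-head, 4-layer config
try:
    load_model('name2gender-tiny')         # no 'base'/'small' substring
except FileNotFoundError as e:
    print(e)  # Model "name2gender-tiny" doesn't exists.
```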
@@ -77,7 +71,7 @@ class Name2Gender:
     def __init__(self, model: BaseNeuralNetwork | None = None):
         super().__init__()
         if model is None:
-            model = load_model(SMALL_MODEL)
+            model = load_model(DEFAULT_MODEL)
         self._model = model

     def __call__(self, name: str, return_probability: bool = False, threshold: float = .5) -> tuple[Gender, float] | Gender:
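Reviewer note: with the constant rename, a bare `Name2Gender()` now resolves through `DEFAULT_MODEL` ('name2gender-small'). A hedged usage sketch; the return types follow the `__call__` signature shown above, and the `Gender` type presumably comes from the name4py dependency:

```python
from deeplotx.ner.n2g import Name2Gender  # import path assumed from the file list

n2g = Name2Gender()           # no model given -> load_model(DEFAULT_MODEL)
gender = n2g('Alice')         # -> Gender
gender, prob = n2g('Alice', return_probability=True, threshold=.5)  # -> (Gender, float)
```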
deeplotx-0.9.11/deeplotx/util/__init__.py (new file)
@@ -0,0 +1,2 @@
+from vortezwohl.crypt.hash import *
+from vortezwohl.io import *
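Reviewer note: this new `deeplotx/util/__init__.py` replaces the deleted local helpers (see the removed `hash.py` and `read_file.py` at the end of this diff) with star imports from vortezwohl, which is why the dependency floor moves to `vortezwohl>=0.0.10`. Assuming vortezwohl exports same-named helpers (not verifiable from this diff), old call sites would keep working:

```python
# Hedged sketch: these names are assumed to come from vortezwohl via the star
# imports above, mirroring the deleted deeplotx.util helpers (md5/sha256 from
# vortezwohl.crypt.hash, read_file/write_file from vortezwohl.io).
from deeplotx.util import md5, sha256, read_file, write_file

print(md5('deeplotx'))                       # hex digest of the UTF-8 bytes
path = write_file('hello', '/tmp/demo.txt')  # returns the written path (assumed)
print(read_file(path))                       # 'hello'
```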
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.9.9
+Version: 0.9.11
 Summary: An out-of-the-box long-text NLP framework.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -15,7 +15,7 @@ Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions
-Requires-Dist: vortezwohl>=0.0.8
+Requires-Dist: vortezwohl>=0.0.10
 Requires-Dist: name4py>=0.1.4
 Dynamic: license-file
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/SOURCES.txt
@@ -35,9 +35,4 @@ deeplotx/similarity/__init__.py
 deeplotx/similarity/distribution.py
 deeplotx/similarity/set.py
 deeplotx/similarity/vector.py
-deeplotx/trainer/__init__.py
-deeplotx/trainer/base_trainer.py
-deeplotx/trainer/text_binary_classification_trainer.py
-deeplotx/util/__init__.py
-deeplotx/util/hash.py
-deeplotx/util/read_file.py
+deeplotx/util/__init__.py
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/requires.txt
@@ -8,5 +8,5 @@ tiktoken
 torch
 transformers
 typing-extensions
-vortezwohl>=0.0.8
+vortezwohl>=0.0.10
 name4py>=0.1.4
{deeplotx-0.9.9 → deeplotx-0.9.11}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "deeplotx"
-version = '0.9.9'
+version = '0.9.11'
 description = "An out-of-the-box long-text NLP framework."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -15,6 +15,6 @@ dependencies = [
     "torch",
     "transformers",
     "typing-extensions",
-    "vortezwohl>=0.0.8",
+    "vortezwohl>=0.0.10",
     "name4py>=0.1.4",
 ]
deeplotx-0.9.9/deeplotx/trainer/__init__.py (deleted)
@@ -1 +0,0 @@
-from .text_binary_classification_trainer import TextBinaryClassifierTrainer
deeplotx-0.9.9/deeplotx/trainer/base_trainer.py (deleted)
@@ -1,13 +0,0 @@
-from abc import abstractmethod
-
-from deeplotx.nn.base_neural_network import BaseNeuralNetwork
-
-
-class BaseTrainer(object):
-    def __init__(self, batch_size: int, train_ratio: float):
-        self._batch_size = batch_size
-        self._train_ratio = train_ratio
-        self.model = None
-
-    @abstractmethod
-    def train(self, *args, **kwargs) -> BaseNeuralNetwork: ...
deeplotx-0.9.9/deeplotx/trainer/text_binary_classification_trainer.py (deleted)
@@ -1,103 +0,0 @@
-import logging
-from typing_extensions import override
-
-import torch
-from torch import nn, optim
-from torch.utils.data import DataLoader, TensorDataset
-
-from deeplotx.encoder.long_text_encoder import LongTextEncoder
-from deeplotx.nn.attention import DEFAULT_THETA
-from deeplotx.nn.long_context_recursive_sequential import LongContextRecursiveSequential
-from deeplotx.trainer.base_trainer import BaseTrainer
-
-logger = logging.getLogger('deeplotx.trainer')
-
-
-class TextBinaryClassifierTrainer(BaseTrainer):
-    def __init__(self, long_text_encoder: LongTextEncoder, batch_size: int = 2, train_ratio: float = 0.8):
-        super().__init__(batch_size=batch_size, train_ratio=train_ratio)
-        self._long_text_encoder = long_text_encoder
-        self.device = self._long_text_encoder.device
-        self.train_dataset_loader = None
-        self.valid_dataset_loader = None
-
-    @override
-    def train(self, positive_texts: list[str], negative_texts: list[str],
-              num_epochs: int, learning_rate: float = 2e-6, balancing_dataset: bool = True,
-              train_loss_threshold: float = 0.0, valid_loss_threshold: float = 0.0,
-              alpha: float = 1e-4, rho: float = 0.2, encoder_layers: int = 4, attn_heads: int = 6,
-              recursive_layers: int = 2, recursive_hidden_dim: int = 256, **kwargs) -> LongContextRecursiveSequential:
-        if balancing_dataset:
-            min_length = min(len(positive_texts), len(negative_texts))
-            positive_texts = positive_texts[:min_length]
-            negative_texts = negative_texts[:min_length]
-        all_texts = positive_texts + negative_texts
-        text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
-        feature_dim = text_embeddings[0].shape[-1]
-        dtype = text_embeddings[0].dtype
-        labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]
-                  + [torch.tensor([.0], dtype=dtype, device=self.device) for _ in range(len(negative_texts))])
-        inputs = torch.stack(text_embeddings).to(self.device)
-        labels = torch.stack(labels).to(self.device)
-        dataset_size = len(labels)
-        train_size = int(self._train_ratio * dataset_size)
-        train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
-        valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])
-        self.train_dataset_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
-        self.valid_dataset_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
-        if self.model is not None and self.model.in_features != feature_dim:
-            logger.warning("The dimension of features doesn't match. A new model instance will be created.")
-            self.model = None
-        if self.model is None:
-            ffn_heads = kwargs.get('ffn_heads', 2)
-            ffn_layers = kwargs.get('ffn_layers', 5)
-            ffn_expansion_factor = kwargs.get('ffn_expansion_factor', 2)
-            bias = kwargs.get('bias', True)
-            dropout_rate = kwargs.get('dropout_rate', 0.1)
-            encoder_ffn_layers = kwargs.get('encoder_ffn_layers', ffn_layers)
-            encoder_expansion_factor = kwargs.get('encoder_expansion_factor', ffn_expansion_factor)
-            encoder_dropout_rate = kwargs.get('encoder_dropout_rate', dropout_rate)
-            attn_ffn_layers = kwargs.get('attn_ffn_layers', 1)
-            attn_expansion_factor = kwargs.get('attn_expansion_factor', ffn_expansion_factor)
-            attn_dropout_rate = kwargs.get('attn_dropout_rate', dropout_rate)
-            theta = kwargs.get('theta', DEFAULT_THETA)
-            self.model = LongContextRecursiveSequential(input_dim=feature_dim, output_dim=1, bias=bias,
-                                                        encoder_layers=encoder_layers, attn_heads=attn_heads,
-                                                        recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
-                                                        ffn_layers=ffn_layers, ffn_heads=ffn_heads, ffn_expansion_factor=ffn_expansion_factor,
-                                                        dropout_rate=dropout_rate, encoder_ffn_layers=encoder_ffn_layers,
-                                                        encoder_expansion_factor=encoder_expansion_factor, encoder_dropout_rate=encoder_dropout_rate,
-                                                        attn_ffn_layers=attn_ffn_layers, attn_expansion_factor=attn_expansion_factor,
-                                                        attn_dropout_rate=attn_dropout_rate, theta=theta).initialize_weights()
-        logger.debug(f'Training Model: \n{self.model}')
-        loss_function = nn.BCELoss()
-        optimizer = optim.Adamax(self.model.parameters(), lr=learning_rate)
-        for epoch in range(num_epochs):
-            self.model.train()
-            total_loss = 0.0
-            for batch_texts, batch_labels in self.train_dataset_loader:
-                outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-                total_loss += loss.item()
-            if epoch % 3 == 0:
-                total_valid_loss = 0.0
-                for batch_texts, batch_labels in self.valid_dataset_loader:
-                    with torch.no_grad():
-                        self.model.eval()
-                        outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                        loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                        total_valid_loss += loss.item()
-                    self.model.train()
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | "
-                             f"Train Loss: {total_loss:.4f} | "
-                             f"Valid Loss: {total_valid_loss:.4f}")
-                if total_valid_loss < valid_loss_threshold:
-                    break
-            else:
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {total_loss:.4f}")
-                if total_loss < train_loss_threshold:
-                    break
-        return self.model
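Reviewer note: the whole trainer package is removed in 0.9.11 with no replacement visible in this diff, and the top-level import was dropped from deeplotx/__init__.py accordingly. Downstream code that still uses it needs a version pin or an import guard; a hedged sketch:

```python
# TextBinaryClassifierTrainer exists only in deeplotx <= 0.9.9; 0.9.11 removes it
# without a replacement, so guard the import (or pin deeplotx==0.9.9).
try:
    from deeplotx import TextBinaryClassifierTrainer
except ImportError:
    TextBinaryClassifierTrainer = None  # not available on 0.9.11+
```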
deeplotx-0.9.9/deeplotx/util/__init__.py (deleted)
@@ -1,2 +0,0 @@
-from .hash import md5, sha1, sha256, sha512
-from .read_file import read_file, write_file, get_files
deeplotx-0.9.9/deeplotx/util/hash.py (deleted)
@@ -1,29 +0,0 @@
-import hashlib
-
-
-def md5(text: str) -> str:
-    _hash = hashlib.md5()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
-
-
-def sha1(text: str) -> str:
-    _hash = hashlib.sha1()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
-
-
-def sha256(text: str) -> str:
-    _hash = hashlib.sha256()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
-
-
-def sha512(text: str) -> str:
-    _hash = hashlib.sha512()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
deeplotx-0.9.9/deeplotx/util/read_file.py (deleted)
@@ -1,32 +0,0 @@
-import os
-
-
-def read_file(path: str, encoding: str = 'utf-8') -> str:
-    try:
-        with open(path, mode='r', encoding=encoding) as f:
-            return f.read()
-    except UnicodeDecodeError:
-        try:
-            with open(path, mode='r', encoding='gbk') as f:
-                return f.read()
-        except UnicodeDecodeError:
-            pass
-
-
-def write_file(content: str | bytes, path: str, encoding: str = 'utf-8') -> str:
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    if isinstance(content, bytes):
-        with open(path, mode='wb') as f:
-            f.write(content)
-        return path
-    with open(path, mode='w', encoding=encoding) as f:
-        f.write(content)
-    return path
-
-
-def get_files(path: str) -> list:
-    if os.path.exists(path):
-        entries = os.listdir(path)
-        return [os.path.join(path, entry) for entry in entries if os.path.isfile(os.path.join(path, entry))]
-    else:
-        return []