deeplotx 0.5.1__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeplotx/__init__.py +1 -1
- deeplotx/encoder/__init__.py +1 -1
- deeplotx/encoder/{bert_encoder.py → encoder.py} +12 -12
- deeplotx/encoder/long_text_encoder.py +17 -17
- deeplotx/trainer/text_binary_classification_trainer.py +7 -4
- deeplotx/util/hash.py +20 -6
- {deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/METADATA +90 -5
- {deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/RECORD +11 -11
- {deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/WHEEL +0 -0
- {deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/licenses/LICENSE +0 -0
- {deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/top_level.txt +0 -0
deeplotx/__init__.py
CHANGED
```diff
@@ -3,7 +3,7 @@ import os
 
 __ROOT__ = os.path.dirname(os.path.abspath(__file__))
 
-from .encoder import
+from .encoder import Encoder, LongTextEncoder, LongformerEncoder
 from .nn import (
     LinearRegression,
     LogisticRegression,
```
deeplotx/encoder/__init__.py
CHANGED
deeplotx/encoder/{bert_encoder.py → encoder.py}
CHANGED
```diff
@@ -4,30 +4,30 @@ import math
 
 import torch
 from torch import nn
-from transformers import
+from transformers import AutoTokenizer, AutoModel
 
 from deeplotx import __ROOT__
 
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
-DEFAULT_BERT = '
+DEFAULT_BERT = 'FacebookAI/xlm-roberta-base'
 logger = logging.getLogger('deeplotx.embedding')
 
 
-class BertEncoder(nn.Module):
+class Encoder(nn.Module):
     def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
         super().__init__()
         self.device = torch.device(device) if device is not None \
             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.tokenizer =
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                        cache_dir=CACHE_PATH, _from_auto=True)
-        self.
-
-        self.embed_dim = self.
-        logger.debug(f'{
+        self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                 cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
+        self.embed_dim = self.encoder.config.max_position_embeddings
+        logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
 
     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
         def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-            return self.
+            return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
 
         num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
         chunks = chunk_results = []
@@ -35,11 +35,11 @@ class BertEncoder(nn.Module):
             start_idx = i * self.embed_dim
             end_idx = min(start_idx + self.embed_dim, input_ids.shape[-1])
             chunks.append((input_ids[:, start_idx: end_idx], attention_mask[:, start_idx: end_idx]))
-        ori_mode = self.
-        self.
+        ori_mode = self.encoder.training
+        self.encoder.eval()
         with torch.no_grad():
             chunk_results = [_encoder(x) for x in chunks]
-        self.
+        self.encoder.train(mode=ori_mode)
         return torch.cat(chunk_results, dim=-1)
 
     def encode(self, text: str) -> torch.Tensor:
```
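For orientation, a minimal usage sketch of the renamed `Encoder` class, imported from the package root per the `__init__.py` change above. The model id and sample sentence are illustrative, and this assumes `deeplotx==0.5.5` and its dependencies are installed:

```python
from deeplotx import Encoder

# The constructor falls back to DEFAULT_BERT ('FacebookAI/xlm-roberta-base') when no model is given.
encoder = Encoder(model_name_or_path='FacebookAI/xlm-roberta-base')

# encode() tokenizes the text and returns its embedding as a torch.Tensor.
embedding = encoder.encode('DeepLoTX is an easy-to-use long text NLP toolkit.')
print(embedding.shape)
```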
deeplotx/encoder/long_text_encoder.py
CHANGED
```diff
@@ -1,30 +1,32 @@
 import logging
 import math
-from concurrent.futures import ThreadPoolExecutor
 from typing_extensions import override
 
 import torch
+from vortezwohl.concurrent import ThreadPool
+from vortezwohl.cache import LRUCache
 
-from deeplotx.encoder.
-from deeplotx.util.hash import
+from deeplotx.encoder.encoder import Encoder, DEFAULT_BERT
+from deeplotx.util.hash import sha512
 
 logger = logging.getLogger('deeplotx.embedding')
 
 
-class LongTextEncoder(BertEncoder):
-    def __init__(self, max_length: int, chunk_size: int =
-                 overlapping: int =
+class LongTextEncoder(Encoder):
+    def __init__(self, max_length: int, chunk_size: int = 448,
+                 overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
+                 cache_capacity: int = 64, device: str | None = None):
         super().__init__(model_name_or_path=model_name_or_path, device=device)
         self._max_length = max_length
         self._chunk_size = chunk_size
         self._overlapping = overlapping
-        self._cache =
+        self._cache = LRUCache(capacity=cache_capacity)
 
-    def __chunk_embedding(self,
-        return
+    def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
+        return idx, super().forward(x, attention_mask=mask)
 
     @override
-    def encode(self, text: str, flatten: bool =
+    def encode(self, text: str, flatten: bool = False) -> torch.Tensor:
         def postprocess(tensors: list[torch.Tensor], _flatten: bool) -> torch.Tensor:
             if not _flatten:
                 return torch.stack(tensors, dim=0).squeeze()
@@ -36,8 +38,8 @@ class LongTextEncoder(BertEncoder):
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
-        _text_hash =
-        if _text_hash in self._cache
+        _text_hash = sha512(text)
+        if _text_hash in self._cache:
             return postprocess(self._cache[_text_hash], flatten)
         _text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
         _text_to_input_ids_att_mask = []
@@ -57,11 +59,9 @@ class LongTextEncoder(BertEncoder):
             _tmp_right = (i + 1) * self._chunk_size + self._overlapping
             chunks.append((i, torch.tensor([_text_to_input_ids[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device),
                               torch.tensor([_text_to_input_ids_att_mask[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device)))
-
-
-        embeddings.sort(key=lambda x: x[0])
+        embeddings = list(ThreadPool(max_workers=min(num_chunks + 1, 8)).map(self.__chunk_embedding, chunks))
+        embeddings = sorted([x.returns for x in embeddings], key=lambda x: x[0], reverse=False)
         fin_embedding = [x[1] for x in embeddings]
         # write cache
-
-        self._cache[_text_hash] = fin_embedding
+        self._cache[_text_hash] = fin_embedding
         return postprocess(fin_embedding, flatten)
```
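A brief usage sketch of the updated `LongTextEncoder`. The constructor defaults (`chunk_size=448`, `overlapping=32`, `cache_capacity=64`) and the new `flatten=False` default are taken from the diff above; the sample text is illustrative and this assumes `deeplotx==0.5.5` is installed:

```python
from deeplotx import LongTextEncoder

# Chunked long-text encoder with an LRU cache keyed by the SHA-512 of the input text.
encoder = LongTextEncoder(max_length=2048, chunk_size=448, overlapping=32, cache_capacity=64)

# flatten now defaults to False, so per-chunk embeddings come back stacked rather than concatenated.
doc = 'A very long document ' * 200
emb = encoder.encode(doc)

# A second call with identical text is served from the cache instead of re-running the model.
emb_again = encoder.encode(doc)
```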
deeplotx/trainer/text_binary_classification_trainer.py
CHANGED
```diff
@@ -17,6 +17,8 @@ class TextBinaryClassifierTrainer(BaseTrainer):
         super().__init__(batch_size=batch_size, train_ratio=train_ratio)
         self._long_text_encoder = long_text_encoder
         self.device = self._long_text_encoder.device
+        self.train_dataset_loader = None
+        self.valid_dataset_loader = None
 
     @override
     def train(self, positive_texts: list[str], negative_texts: list[str],
@@ -40,8 +42,9 @@ class TextBinaryClassifierTrainer(BaseTrainer):
         train_size = int(self._train_ratio * dataset_size)
         train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
         valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])
-
-
+        self.train_dataset_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
+        self.valid_dataset_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
+
         if self.model is not None and self.model.fc1.in_features != feature_dim:
             logger.warning("The dimension of features doesn't match. A new model instance will be created.")
             self.model = None
@@ -55,7 +58,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
         for epoch in range(num_epochs):
             self.model.train()
             total_loss = 0.0
-            for batch_texts, batch_labels in
+            for batch_texts, batch_labels in self.train_dataset_loader:
                 outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
                 loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
                 optimizer.zero_grad()
@@ -64,7 +67,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
                 total_loss += loss.item()
             if epoch % 3 == 0:
                 total_valid_loss = 0.0
-                for batch_texts, batch_labels in
+                for batch_texts, batch_labels in self.valid_dataset_loader:
                     with torch.no_grad():
                         self.model.eval()
                         outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
```
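The trainer change above simply materializes the train/validation split as `DataLoader` attributes, so the epoch loop and the periodic validation pass both read from loaders stored on the trainer. A standalone sketch of that split pattern in plain PyTorch (the shapes, the 0.9 ratio, and the batch size are illustrative, not taken from the package):

```python
import torch
from torch.utils.data import TensorDataset, DataLoader

inputs = torch.randn(100, 256)                    # illustrative feature tensors
labels = torch.randint(0, 2, (100, 1)).float()    # illustrative binary labels

train_ratio, batch_size = 0.9, 2
train_size = int(train_ratio * len(inputs))

train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])

# Keeping the loaders around (as 0.5.5 now does on the trainer instance) lets later
# epochs and validation passes reuse the same batching configuration.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True)
```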
deeplotx/util/hash.py
CHANGED
```diff
@@ -2,14 +2,28 @@ import hashlib
 
 
 def md5(text: str) -> str:
-
+    _hash = hashlib.md5()
     text_bytes = text.encode('utf-8')
-
-    return
+    _hash.update(text_bytes)
+    return _hash.hexdigest()
 
 
 def sha1(text: str) -> str:
-
+    _hash = hashlib.sha1()
     text_bytes = text.encode('utf-8')
-
-    return
+    _hash.update(text_bytes)
+    return _hash.hexdigest()
+
+
+def sha256(text: str) -> str:
+    _hash = hashlib.sha256()
+    text_bytes = text.encode('utf-8')
+    _hash.update(text_bytes)
+    return _hash.hexdigest()
+
+
+def sha512(text: str) -> str:
+    _hash = hashlib.sha512()
+    text_bytes = text.encode('utf-8')
+    _hash.update(text_bytes)
+    return _hash.hexdigest()
```
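The new `sha256` and `sha512` helpers mirror the existing `md5`/`sha1` wrappers around `hashlib`; `sha512` is what `LongTextEncoder` now uses as its cache key. A short usage sketch, assuming `deeplotx==0.5.5` is installed:

```python
from deeplotx.util.hash import sha256, sha512

text = 'DeepLoTX'
print(sha256(text))  # 64-character hex digest
print(sha512(text))  # 128-character hex digest, used as the LongTextEncoder cache key
```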
{deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/METADATA
CHANGED
````diff
@@ -1,18 +1,19 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.5.1
+Version: 0.5.5
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: hf-xet
+Requires-Dist: hf-xet
 Requires-Dist: jupyter
 Requires-Dist: numpy
 Requires-Dist: protobuf
-Requires-Dist: python-dotenv
+Requires-Dist: python-dotenv
 Requires-Dist: torch
 Requires-Dist: transformers
-Requires-Dist: typing-extensions
+Requires-Dist: typing-extensions
+Requires-Dist: vortezwohl>=0.0.6
 Dynamic: license-file
 
 [](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -166,7 +167,10 @@ Dynamic: license-file
     LogisticRegression,  # logistic regression / binary or multi-label classification
     SoftmaxRegression,  # softmax regression / multi-class classification
     RecursiveSequential,  # sequence model / recurrent neural network
-
+    LongContextRecursiveSequential,  # long-context sequence model / RNN fused with self-attention
+    SelfAttention,  # self-attention module
+    AutoRegression,  # autoregressive model / recurrent neural network
+    LongContextAutoRegression  # long-context autoregressive model / RNN fused with self-attention
 )
 ```
 
@@ -211,3 +215,84 @@ Dynamic: license-file
         x = self.fc5(x)
         return x
 ```
+
+Self-attention module:
+
+```python
+from typing_extensions import override
+
+import torch
+from torch import nn, softmax
+
+from deeplotx.nn.base_neural_network import BaseNeuralNetwork
+
+
+class SelfAttention(BaseNeuralNetwork):
+    def __init__(self, feature_dim: int, model_name: str | None = None,
+                 device: str | None = None, dtype: torch.dtype | None = None):
+        super().__init__(model_name=model_name, device=device, dtype=dtype)
+        self._feature_dim = feature_dim
+        self.q_proj = nn.Linear(in_features=self._feature_dim, out_features=self._feature_dim,
+                                bias=True, device=self.device, dtype=self.dtype)
+        self.k_proj = nn.Linear(in_features=self._feature_dim, out_features=self._feature_dim,
+                                bias=True, device=self.device, dtype=self.dtype)
+        self.v_proj = nn.Linear(in_features=self._feature_dim, out_features=self._feature_dim,
+                                bias=True, device=self.device, dtype=self.dtype)
+
+    def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        q, k = self.q_proj(x), self.k_proj(x)
+        attn = torch.matmul(q, k.transpose(-2, -1))
+        attn = attn / (self._feature_dim ** 0.5)
+        attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
+        return softmax(attn, dim=-1)
+
+    @override
+    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+        if mask is not None:
+            mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
+        v = self.v_proj(x)
+        return torch.matmul(self._attention(x, mask), v)
+```
+
+- ### Text binary classification with the predefined trainer
+
+```python
+from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
+from deeplotx.util import get_files, read_file
+
+# Define the embedding strategy (bert-base-uncased is used as the default embedding model)
+long_text_encoder = LongTextEncoder(
+    max_length=2048,  # maximum text length; longer texts are truncated
+    chunk_size=448,  # chunk size (in tokens)
+    overlapping=32  # overlap between adjacent chunks (in tokens)
+)
+
+trainer = TextBinaryClassifierTrainer(
+    long_text_encoder=long_text_encoder,
+    batch_size=2,
+    train_ratio=0.9  # ratio of training data to validation data
+)
+
+# Read the data
+pos_data_path = 'path/to/pos_dir'
+neg_data_path = 'path/to/neg_dir'
+pos_data = [read_file(x) for x in get_files(pos_data_path)]
+neg_data = [read_file(x) for x in get_files(neg_data_path)]
+
+# Start training
+model = trainer.train(pos_data, neg_data,
+                      num_epochs=36, learning_rate=2e-5,  # number of epochs and learning rate
+                      balancing_dataset=True,  # whether to balance the dataset
+                      alpha=1e-4, rho=.2,  # elastic net regularization hyperparameters alpha and rho
+                      hidden_dim=256, recursive_layers=2)  # recurrent network architecture
+
+# Save the model weights
+model.save(model_name='test_model', model_dir='model')
+
+# Load the saved model
+model = model.load(model_name='test_model', model_dir='model')
+
+# Predict with the trained model
+model.predict(long_text_encoder.encode('This is a test text.', flatten=False))
+```
````
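The `SelfAttention` block added to the README computes single-head scaled dot-product attention, `softmax(Q K^T / sqrt(d)) V`, with learned Q/K/V projections. A self-contained sketch of the same computation in plain torch (the tensor sizes and random projections are illustrative):

```python
import torch

feature_dim, seq_len = 8, 5
x = torch.randn(1, seq_len, feature_dim)

# Illustrative projections; in deeplotx these are learned nn.Linear layers (q_proj, k_proj, v_proj).
w_q, w_k, w_v = (torch.randn(feature_dim, feature_dim) for _ in range(3))
q, k, v = x @ w_q, x @ w_k, x @ w_v

# Scaled dot-product attention: softmax(Q K^T / sqrt(d)) V
scores = (q @ k.transpose(-2, -1)) / (feature_dim ** 0.5)
weights = torch.softmax(scores, dim=-1)
out = weights @ v
print(out.shape)  # torch.Size([1, 5, 8])
```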
{deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/RECORD
CHANGED
```diff
@@ -1,7 +1,7 @@
-deeplotx/__init__.py,sha256=
-deeplotx/encoder/__init__.py,sha256=
-deeplotx/encoder/
-deeplotx/encoder/long_text_encoder.py,sha256=
+deeplotx/__init__.py,sha256=6El66QXHDrgNMsNIG9bG97WO8BhPK5btXbTikzx2ce4,1087
+deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
+deeplotx/encoder/encoder.py,sha256=p1e4Dx3-Ghdl0MGNalr0D_OnafwaJnbhscEDVq-y73A,2400
+deeplotx/encoder/long_text_encoder.py,sha256=GatkOF1QQHLtvyuikfCP4xpzfDvszJJyonaS9f7wSxg,3401
 deeplotx/encoder/longformer_encoder.py,sha256=A8FXqd4mdHxSn_o_R689XtpT73ISDT788EgMQRGLC2g,1822
 deeplotx/nn/__init__.py,sha256=CS0UwyYKa8wI6vu6FBIYxvm-HAmw39MTMFlZDtqi6UA,444
 deeplotx/nn/auto_regression.py,sha256=7P63opWCWMqE2DigwbsL6kfXtFtJPz00Yo1RqflBz4A,572
@@ -19,12 +19,12 @@ deeplotx/similarity/set.py,sha256=zhGFxtSIXlWqvipBYzoiPahp4g0boAIoUiMfG0wl07A,68
 deeplotx/similarity/vector.py,sha256=WVbDHqykt-fvuILVrhUCtIFAOEjY_zvttrXGM9eylG0,1125
 deeplotx/trainer/__init__.py,sha256=Fl5DR9UecQc5VtBcczU9sx_HtPNoFohpuELOh-Jrsks,77
 deeplotx/trainer/base_trainer.py,sha256=z0MeAT-rRYmjeBXt0ckt7J1itYArR0Cx02wHesXUoZE,385
-deeplotx/trainer/text_binary_classification_trainer.py,sha256=
+deeplotx/trainer/text_binary_classification_trainer.py,sha256=BNBQdpaD8nB1dQv8naHNIravNcQC8JjOMqD-WRSrUH0,4931
 deeplotx/util/__init__.py,sha256=JxqAK_WOOHcYVSTHBT1-WuBwWrPEVDTV3titeVWvNUM,74
-deeplotx/util/hash.py,sha256=
+deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
 deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
-deeplotx-0.5.
-deeplotx-0.5.
-deeplotx-0.5.
-deeplotx-0.5.
-deeplotx-0.5.
+deeplotx-0.5.5.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.5.5.dist-info/METADATA,sha256=QE1R1jodTrnPFY7cbu4mQNPt8_BgKNJuHoSDswopueo,10880
+deeplotx-0.5.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deeplotx-0.5.5.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.5.5.dist-info/RECORD,,
```
{deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/WHEEL
File without changes
{deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/licenses/LICENSE
File without changes
{deeplotx-0.5.1.dist-info → deeplotx-0.5.5.dist-info}/top_level.txt
File without changes