deeplotx 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- deeplotx/encoder/long_text_encoder.py +13 -8
- deeplotx/encoder/longformer_encoder.py +27 -9
- deeplotx/nn/attention.py +1 -1
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/METADATA +53 -37
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/RECORD +8 -8
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/WHEEL +0 -0
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/licenses/LICENSE +0 -0
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/top_level.txt +0 -0
deeplotx/encoder/long_text_encoder.py
CHANGED
@@ -13,9 +13,9 @@ logger = logging.getLogger('deeplotx.embedding')
 
 
 class LongTextEncoder(Encoder):
-    def __init__(self,
-
-
+    def __init__(self, chunk_size: int = 448, overlapping: int = 32, max_length: int = -1,
+                 model_name_or_path: str = DEFAULT_BERT, cache_capacity: int = 64,
+                 max_workers: int = 8, device: str | None = None):
         super().__init__(model_name_or_path=model_name_or_path, device=device)
         assert overlapping < chunk_size, f'overlapping ({overlapping}) must be less than chunk size ({chunk_size}).'
         self._max_length = max_length
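The new constructor makes the chunking parameters and the length cap explicit, with `max_length` now defaulting to `-1` (no truncation). A minimal usage sketch based on the 0.8.3 signature above (parameter values are illustrative):

```python
from deeplotx import LongTextEncoder

# max_length=-1 (the new default) keeps the full tokenized text instead of truncating it.
encoder = LongTextEncoder(chunk_size=448, overlapping=32, max_length=-1)

# A positive max_length still caps the input; tokens beyond the cap are dropped.
capped_encoder = LongTextEncoder(chunk_size=448, overlapping=32, max_length=2048)
```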
@@ -41,23 +41,28 @@ class LongTextEncoder(Encoder):
             _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
             return _fin_emb_tensor.squeeze()
 
+        _tmp_max_length = self._max_length
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
         _text_hash = sha512(text)
         if _text_hash in self._cache:
             return postprocess(self._cache[_text_hash], flatten)
-        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        # variable length
+        if _tmp_max_length < 0:
+            _tmp_max_length = len(_text_to_input_ids)
+        _text_to_input_ids = _text_to_input_ids[:_tmp_max_length]
         _text_to_input_ids_att_mask = []
         # padding
         pad_token = self.tokenizer.pad_token_type_id
-        if len(_text_to_input_ids) <
-            _text_to_input_ids.extend([pad_token] * (
+        if len(_text_to_input_ids) < _tmp_max_length:
+            _text_to_input_ids.extend([pad_token] * (_tmp_max_length - len(_text_to_input_ids)))
         pads = _text_to_input_ids.count(pad_token)
-        non_pads =
+        non_pads = _tmp_max_length - pads
         _text_to_input_ids_att_mask.extend([1] * non_pads)
         _text_to_input_ids_att_mask.extend([0] * pads)
-        num_chunks = math.ceil(
+        num_chunks = math.ceil(_tmp_max_length / self._chunk_size)
         # split chunks
         chunks = []
         for i in range(num_chunks):
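In the rewritten encode path, an effective length is derived from the input when `max_length < 0`, and the padding, attention mask, and chunk count are all computed from that effective length. A standalone sketch of the same arithmetic (illustrative only; the function name and arguments are stand-ins, not the library's internals):

```python
import math

def pad_and_mask(input_ids: list[int], pad_token: int, max_length: int, chunk_size: int):
    # max_length < 0 means "variable length": keep the whole tokenized text.
    effective_len = len(input_ids) if max_length < 0 else max_length
    ids = input_ids[:effective_len]
    # Pad up to the effective length.
    if len(ids) < effective_len:
        ids = ids + [pad_token] * (effective_len - len(ids))
    pads = ids.count(pad_token)
    attention_mask = [1] * (effective_len - pads) + [0] * pads
    # Number of chunks the padded sequence is split into.
    num_chunks = math.ceil(effective_len / chunk_size)
    return ids, attention_mask, num_chunks
```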
deeplotx/encoder/longformer_encoder.py
CHANGED
@@ -3,7 +3,8 @@ import os
 
 import torch
 from torch import nn
-from transformers import
+from transformers import AutoModel, AutoTokenizer
+from requests.exceptions import ConnectTimeout, SSLError
 
 from deeplotx import __ROOT__
 
@@ -17,18 +18,35 @@ class LongformerEncoder(nn.Module):
         super().__init__()
         self.device = torch.device(device) if device is not None \
             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-
-
-
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True).to(self.device)
+        except ConnectTimeout:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
+        except SSLError:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
 
     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        ori_mode = self.
-        self.
+        ori_mode = self.encoder.training
+        self.encoder.eval()
         with torch.no_grad():
-            res = self.
-            self.
+            res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+            self.encoder.train(mode=ori_mode)
         return res
 
     def encode(self, text: str) -> torch.Tensor:
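The 0.8.3 constructor keeps working without network access: when fetching from the Hugging Face Hub raises ConnectTimeout or SSLError, it retries with `local_files_only=True` so a previously downloaded copy in the cache directory is reused. A condensed sketch of that fallback pattern (the helper name and the single combined except clause are mine; the library handles the two exceptions in separate branches, as shown above):

```python
from requests.exceptions import ConnectTimeout, SSLError
from transformers import AutoModel, AutoTokenizer

def load_encoder(model_name_or_path: str, cache_dir: str, device: str = 'cpu'):
    """Load tokenizer and model from the Hub, falling back to the local cache on network errors."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                                  trust_remote_code=True)
        model = AutoModel.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                          trust_remote_code=True)
    except (ConnectTimeout, SSLError):
        # Offline or behind a broken proxy: reuse whatever is already cached locally.
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                                  trust_remote_code=True, local_files_only=True)
        model = AutoModel.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                          trust_remote_code=True, local_files_only=True)
    return tokenizer, model.to(device)
```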
deeplotx/nn/attention.py
CHANGED
@@ -36,7 +36,7 @@ class Attention(BaseNeuralNetwork):
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
     def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
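The only change here pins the softmax output to the module's configured dtype rather than whatever dtype `attn` happens to carry, keeping the attention weights in the module's working precision (e.g. when it runs in float16). A standalone illustration of the effect (not deeplotx code):

```python
import torch

attn = torch.randn(2, 4, 4, dtype=torch.float32)  # attention scores
module_dtype = torch.float16                       # the module's working precision

# Without dtype=..., softmax returns float32 here; with it, the weights
# come back in the requested dtype.
weights = torch.softmax(attn, dim=-1, dtype=module_dtype)
assert weights.dtype == module_dtype
```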
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.1
+Version: 0.8.3
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -48,24 +48,23 @@ Dynamic: license-file
 
 - ### Long Text Embedding
 
-- **Long text embedding based on a general-purpose BERT** (maximum supported length: unlimited,
+- **Long text embedding based on a general-purpose BERT** (maximum supported length: unlimited; can be capped via max_length)
 
 ```python
 from deeplotx import LongTextEncoder
 
-#
+# Chunk size of 448 tokens, with a 32-token overlap between chunks.
 encoder = LongTextEncoder(
-
-
-    overlapping=64
+    chunk_size=448,
+    overlapping=32
 )
-# Compute the embedding of "我是吴子豪, 这是一个测试文本.",
-encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=
+# Compute the embedding of "我是吴子豪, 这是一个测试文本." and stack the chunk embeddings.
+encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
 ```
 
 Output:
 ```
-tensor([
+tensor([ 2.2316e-01,  2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
 ```
 
 - **Long text embedding based on Longformer** (maximum supported length: 4096 tokens)
@@ -77,6 +76,11 @@ Dynamic: license-file
 encoder.encode('我是吴子豪, 这是一个测试文本.')
 ```
 
+Output:
+```
+tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+```
+
 - ### Similarity Computation
 
 - **Vector-based similarity**
@@ -163,14 +167,16 @@ Dynamic: license-file
 
 ```python
 from deeplotx import (
-    BaseNeuralNetwork, # base class for deep neural networks
     FeedForward, # feed-forward network
     LinearRegression, # linear regression
     LogisticRegression, # logistic regression / binary classification / multi-label classification
     SoftmaxRegression, # softmax regression / multi-class classification
     RecursiveSequential, # sequence model / recurrent neural network
     LongContextRecursiveSequential, # long-context sequence model / RNN fused with self-attention
-
+    RoPE, # RoPE positional encoding
+    Attention, # self-attention / cross-attention
+    MultiHeadAttention, # parallel multi-head attention
+    RoFormerEncoder, # RoFormer (Transformer + RoPE) encoder model
     AutoRegression, # autoregressive model / recurrent neural network
     LongContextAutoRegression # long-context autoregressive model / RNN fused with self-attention
 )
@@ -193,13 +199,13 @@ Dynamic: license-file
                  device: str | None = None, dtype: torch.dtype | None = None):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
         self._dropout_rate = dropout_rate
-        self.
-
-        self.
-
-        self.
+        self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+                                 bias=bias, device=self.device, dtype=self.dtype)
+        self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+                                   bias=bias, device=self.device, dtype=self.dtype)
+        self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
                                         device=self.device, dtype=self.dtype)
-        self.layer_norm = nn.LayerNorm(normalized_shape=self.
+        self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
                                        device=self.device, dtype=self.dtype)
 
     @override
@@ -207,11 +213,11 @@ Dynamic: license-file
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
         residual = x
         x = self.layer_norm(x)
-        x = self.
-        x = self.
+        x = self.up_proj(x)
+        x = self.parametric_relu(x)
         if self._dropout_rate > .0:
             x = torch.dropout(x, p=self._dropout_rate, train=self.training)
-        return self.
+        return self.down_proj(x) + residual
 
 
 class FeedForward(BaseNeuralNetwork):
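The README excerpt above describes a pre-norm residual feed-forward unit: LayerNorm, an up-projection, PReLU and optional dropout, then a down-projection added back onto the residual. A minimal self-contained PyTorch sketch of that pattern (a plain nn.Module, not the actual BaseNeuralNetwork subclass):

```python
import torch
from torch import nn

class SimpleFeedForwardUnit(nn.Module):
    """Pre-norm residual FFN: x + down(act(up(norm(x)))), mirroring the excerpt above."""
    def __init__(self, feature_dim: int, expansion_factor: float = 2.0, dropout_rate: float = 0.02):
        super().__init__()
        hidden_dim = int(feature_dim * expansion_factor)
        self.layer_norm = nn.LayerNorm(feature_dim, eps=1e-9)
        self.up_proj = nn.Linear(feature_dim, hidden_dim)
        self.act = nn.PReLU(num_parameters=1, init=5e-3)
        self.down_proj = nn.Linear(hidden_dim, feature_dim)
        self.dropout_rate = dropout_rate

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.act(self.up_proj(self.layer_norm(x)))
        if self.dropout_rate > 0:
            x = torch.dropout(x, p=self.dropout_rate, train=self.training)
        return self.down_proj(x) + residual
```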
@@ -224,7 +230,7 @@ Dynamic: license-file
         self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
                                                           expansion_factor=expansion_factor, bias=bias,
                                                           dropout_rate=dropout_rate,
-                                                          device=self.device, dtype=self.dtype)
+                                                          device=self.device, dtype=self.dtype) for _ in range(num_layers)])
 
     @override
     def forward(self, x: torch.Tensor) -> torch.Tensor:
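The corrected line builds one feed-forward unit per layer inside the ModuleList comprehension. A short stacking sketch of the same idea, reusing SimpleFeedForwardUnit from the sketch above (again illustrative, not the library class):

```python
import torch
from torch import nn

class SimpleFeedForward(nn.Module):
    """Stack num_layers feed-forward units and apply them in order."""
    def __init__(self, feature_dim: int, num_layers: int = 2):
        super().__init__()
        self.ffn_layers = nn.ModuleList([SimpleFeedForwardUnit(feature_dim) for _ in range(num_layers)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.ffn_layers:
            x = layer(x)
        return x
```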
@@ -234,7 +240,7 @@ Dynamic: license-file
         return x
 ```
 
-
+Attention module:
 
 ```python
 from typing_extensions import override
@@ -243,14 +249,17 @@ Dynamic: license-file
 
 from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 from deeplotx.nn.feed_forward import FeedForward
+from deeplotx.nn.rope import RoPE, DEFAULT_THETA
 
 
-class
-    def __init__(self, feature_dim: int, bias: bool = True,
-                 proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None
+class Attention(BaseNeuralNetwork):
+    def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+                 proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+                 **kwargs):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
                          device=device, dtype=dtype)
+        self._positional = positional
         self._feature_dim = feature_dim
         self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
@@ -261,21 +270,27 @@ Dynamic: license-file
         self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
                                   bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+        if self._positional:
+            self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+                             device=self.device, dtype=self.dtype)
 
-    def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
-        q, k = self.q_proj(x), self.k_proj(
+    def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        q, k = self.q_proj(x), self.k_proj(y)
+        if self._positional:
+            q, k = self.rope(q), self.rope(k)
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
-    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+        y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
         if mask is not None:
             mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
-        v = self.v_proj(
-        return torch.matmul(self._attention(x, mask), v)
+        v = self.v_proj(y)
+        return torch.matmul(self._attention(x, y, mask), v)
 ```
 
 - ### Text binary classification with the predefined trainer
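As the two hunks above show, the README's Attention class now takes an optional second input: with only x it does self-attention, and with a separate y it attends from x over y, applying RoPE to the queries and keys when positional=True. A hedged usage sketch (tensor shapes are illustrative; it assumes the Attention export added to deeplotx's top-level imports earlier in this METADATA diff):

```python
import torch
from deeplotx import Attention

attn = Attention(feature_dim=256, positional=True)

x = torch.randn(2, 16, 256)  # queries are projected from x
y = torch.randn(2, 24, 256)  # keys and values are projected from y when it is given

self_out = attn(x)      # y defaults to x, so this is self-attention
cross_out = attn(x, y)  # cross-attention: x attends over y
```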
@@ -284,7 +299,7 @@ Dynamic: license-file
 from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
 from deeplotx.util import get_files, read_file
 
-# Define the vector-encoding strategy (by default,
+# Define the vector-encoding strategy (FacebookAI/xlm-roberta-base is used as the embedding model by default)
 long_text_encoder = LongTextEncoder(
     max_length=2048, # maximum text length in tokens; longer inputs are truncated
     chunk_size=448, # chunk size (in tokens)
@@ -306,10 +321,11 @@ Dynamic: license-file
 
 # Start training
 model = trainer.train(pos_data, neg_data,
-
-
-
-
+                      num_epochs=36, learning_rate=2e-5,
+                      balancing_dataset=True, alpha=1e-4,
+                      rho=.2, encoder_layers=2, # 2 RoFormer encoder layers
+                      attn_heads=8, # 8 attention heads
+                      recursive_layers=2) # 2 Bi-LSTM layers
 
 # Save model weights
 model.save(model_name='test_model', model_dir='model')
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
 deeplotx/__init__.py,sha256=oNeA-vNu5YGiEQg0IcpKEdGh_Y_2uPvo2nqaNL_Zgv8,1159
 deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
 deeplotx/encoder/encoder.py,sha256=oSBdA-MiwMKNfTFJWR-RdvNS0G0qfX-Qchwy4LuwB00,3985
-deeplotx/encoder/long_text_encoder.py,sha256=
-deeplotx/encoder/longformer_encoder.py,sha256=
+deeplotx/encoder/long_text_encoder.py,sha256=3ScdKDi65J5tdO8PFCXBjCzNUCLlJRwVhpDR0BrphG4,3951
+deeplotx/encoder/longformer_encoder.py,sha256=NNYLr5I9tdeh0C8Ir7QcbEMU9gDk6U7CiF3Tbg6NEsE,3372
 deeplotx/nn/__init__.py,sha256=01I_yqx9GTa4wy3uNyAqhtxp66tDqxgMLC4Ky5Vnkrg,651
-deeplotx/nn/attention.py,sha256=
+deeplotx/nn/attention.py,sha256=R-i-Rd7gnsh6hwXDeYfqLQOJvfSZIGfQbFzRlC91XLo,2879
 deeplotx/nn/auto_regression.py,sha256=uISx29t_zkDGS8s2wvGB6wOGYZitQ4hQ7wyoQl4lcqY,857
 deeplotx/nn/base_neural_network.py,sha256=FjQEDFH810fJS7JV3aLgJZnaMqC6DH--wlBvuj-ghTc,5900
 deeplotx/nn/feed_forward.py,sha256=4ozj7EDalO9pb6JUhZtsJqE0r8bIHFApHRt2zTrl4ho,2931
@@ -27,8 +27,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=QMLR4cC8NCUP-v7SOY
 deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
 deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
 deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
-deeplotx-0.8.
-deeplotx-0.8.
-deeplotx-0.8.
-deeplotx-0.8.
-deeplotx-0.8.
+deeplotx-0.8.3.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.8.3.dist-info/METADATA,sha256=Lif2B7wUDIQQKWvUt_Vl_XYPlMf_EhskiQcq8ZYv6TQ,13079
+deeplotx-0.8.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deeplotx-0.8.3.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.8.3.dist-info/RECORD,,
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/WHEEL
File without changes
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/licenses/LICENSE
File without changes
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/top_level.txt
File without changes