deeplotx 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
--- a/deeplotx/encoder/bert_encoder.py
+++ b/deeplotx/encoder/bert_encoder.py
@@ -1,3 +1,4 @@
+ import logging
  import os
  import math

@@ -9,21 +10,26 @@ from deeplotx import __ROOT__

  CACHE_PATH = os.path.join(__ROOT__, '.cache')
  DEFAULT_BERT = 'bert-base-uncased'
+ logger = logging.getLogger('deeplotx.embedding')


  class BertEncoder(nn.Module):
-     def __init__(self, model_name_or_path: str = DEFAULT_BERT):
+     def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
          super().__init__()
+         self.device = device if device is not None else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                         cache_dir=CACHE_PATH, _from_auto=True)
          self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                               cache_dir=CACHE_PATH, _from_auto=True)
+                                               cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
          self.embed_dim = self.bert.config.max_position_embeddings
+         logger.debug(f'{BertEncoder.__name__} initialized on device: {self.device}.')

      def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
          def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
              return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]

+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
          num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
          chunks = chunk_results = []
          for i in range(num_chunks):
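The device plumbing added above follows the usual CUDA-if-available fallback, and `forward` now moves its inputs onto the encoder's device itself. A minimal usage sketch (not from the package: the module path comes from the RECORD below, and whether `BertEncoder` is re-exported at top level is not shown in this diff):

```python
from deeplotx.encoder.bert_encoder import BertEncoder

# Omitting device reproduces the fallback added in __init__:
# CUDA when available, otherwise CPU.
encoder = BertEncoder()

# Explicit override, e.g. to keep the model off a busy GPU. A plain str is
# fine here: nn.Module.to() accepts device strings as well as torch.device.
cpu_encoder = BertEncoder(device='cpu')

# Tokenize anywhere; forward() now calls .to(self.device) on both tensors,
# so inputs no longer have to be co-located with the model by hand.
tokens = cpu_encoder.tokenizer('hello world', return_tensors='pt')
embedding = cpu_encoder.forward(tokens['input_ids'], tokens['attention_mask'])
```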
--- a/deeplotx/encoder/long_text_encoder.py
+++ b/deeplotx/encoder/long_text_encoder.py
@@ -13,8 +13,8 @@ logger = logging.getLogger('deeplotx.embedding')

  class LongTextEncoder(BertEncoder):
      def __init__(self, max_length: int, chunk_size: int = 256,
-                  overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT):
-         super().__init__(model_name_or_path=model_name_or_path)
+                  overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
+         super().__init__(model_name_or_path=model_name_or_path, device=device)
          self._max_length = max_length
          self._chunk_size = chunk_size
          self._overlapping = overlapping
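For intuition about the constructor parameters kept in this hunk: consecutive windows in an overlapping chunking scheme advance by `chunk_size - overlapping` tokens. A back-of-the-envelope sketch with assumed values (the encoder's actual splitting logic lies outside this hunk):

```python
import math

# Assumed values, matching the README example in the new METADATA below.
max_length, chunk_size, overlapping = 2048, 512, 64

# Each window advances 512 - 64 = 448 tokens, so covering a
# 2048-token text takes ceil(2048 / 448) = 5 overlapping chunks.
stride = chunk_size - overlapping
print(math.ceil(max_length / stride))  # 5
```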
--- a/deeplotx/encoder/longformer_encoder.py
+++ b/deeplotx/encoder/longformer_encoder.py
@@ -1,3 +1,4 @@
+ import logging
  import os

  import torch
@@ -8,17 +9,22 @@ from deeplotx import __ROOT__

  CACHE_PATH = os.path.join(__ROOT__, '.cache')
  DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+ logger = logging.getLogger('deeplotx.embedding')


  class LongformerEncoder(nn.Module):
-     def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER):
+     def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
          super().__init__()
+         self.device = device if device is not None else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                               cache_dir=CACHE_PATH, _from_auto=True)
          self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                     cache_dir=CACHE_PATH, _from_auto=True)
+                                                     cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
+         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')

      def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
          ori_mode = self.bert.training
          self.bert.eval()
          with torch.no_grad():
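The Longformer path gets the same treatment: the model is moved to the resolved device at load time, and `forward` relocates its inputs before stashing the training flag in `ori_mode` and switching to eval under `no_grad`. A hedged usage sketch; the top-level import is the one the package README advertises, and the input text is illustrative:

```python
from deeplotx import LongformerEncoder

# Pin the encoder explicitly; omitting device falls back to CUDA when available.
encoder = LongformerEncoder(device='cpu')

# The tokenizer returns CPU tensors; forward() now moves both onto
# self.device itself before running inference.
tokens = encoder.tokenizer('a very long document ...', return_tensors='pt')
embedding = encoder.forward(tokens['input_ids'], tokens['attention_mask'])
print(embedding.shape)
```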
--- /dev/null
+++ b/deeplotx-0.4.9.dist-info/METADATA
@@ -0,0 +1,211 @@
+ Metadata-Version: 2.4
+ Name: deeplotx
+ Version: 0.4.9
+ Summary: Easy-2-use long text NLP toolkit.
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: hf-xet>=1.0.5
+ Requires-Dist: jupyter
+ Requires-Dist: numpy
+ Requires-Dist: protobuf>=6.31.1
+ Requires-Dist: python-dotenv>=1.1.0
+ Requires-Dist: torch
+ Requires-Dist: transformers
+ Requires-Dist: typing-extensions>=4.13.2
+ Dynamic: license-file
+ 
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
+ 
+ # Deep Long Text Learning Kit
+ 
+ > Author: 吴子豪
+ 
+ **An out-of-the-box framework for long-text semantic modeling**
+ 
+ ## Installation
+ 
+ - With pip
+ 
+   ```
+   pip install -U deeplotx
+   ```
+ 
+ - With uv (recommended)
+ 
+   ```
+   uv add -U deeplotx
+   ```
+ 
+ - Latest features from GitHub
+ 
+   ```
+   pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
+   ```
+ 
+ ## Core features
+ 
+ - ### Long text embedding
+ 
+   - **Long-text embedding based on general-purpose BERT** (maximum supported length is unbounded; set it via max_length)
+ 
+     ```python
+     from deeplotx import LongTextEncoder
+ 
+     # Maximum text length of 2048 tokens, 512-token chunks, 64 tokens of overlap between chunks.
+     encoder = LongTextEncoder(
+         max_length=2048,
+         chunk_size=512,
+         overlapping=64
+     )
+     # Embed '我是吴子豪, 这是一个测试文本.' and flatten the result.
+     encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+     ```
+ 
+     Output:
+     ```
+     tensor([ 0.5163,  0.2497,  0.5896,  ..., -0.9815, -0.3095,  0.4232])
+     ```
+ 
+   - **Long-text embedding based on Longformer** (maximum supported length of 4096 tokens)
+ 
+     ```python
+     from deeplotx import LongformerEncoder
+ 
+     encoder = LongformerEncoder()
+     encoder.encode('我是吴子豪, 这是一个测试文本.')
+     ```
+ 
+ - ### Similarity measures
+ 
+   - **Vector-based similarity**
+ 
+     ```python
+     import deeplotx.similarity as sim
+ 
+     vector_0, vector_1 = [1, 2, 3, 4], [4, 3, 2, 1]
+     # Euclidean distance
+     distance_0 = sim.euclidean_similarity(vector_0, vector_1)
+     print(distance_0)
+     # Cosine distance
+     distance_1 = sim.cosine_similarity(vector_0, vector_1)
+     print(distance_1)
+     # Chebyshev distance
+     distance_2 = sim.chebyshev_similarity(vector_0, vector_1)
+     print(distance_2)
+     ```
+ 
+     Output:
+     ```
+     4.47213595499958
+     0.33333333333333337
+     3
+     ```
+ 
+   - **Set-based similarity**
+ 
+     ```python
+     import deeplotx.similarity as sim
+ 
+     set_0, set_1 = {1, 2, 3, 4}, {4, 5, 6, 7}
+     # Jaccard distance
+     distance_0 = sim.jaccard_similarity(set_0, set_1)
+     print(distance_0)
+     # Ochiai distance
+     distance_1 = sim.ochiai_similarity(set_0, set_1)
+     print(distance_1)
+     # Dice coefficient
+     distance_2 = sim.dice_coefficient(set_0, set_1)
+     print(distance_2)
+     # Overlap coefficient
+     distance_3 = sim.overlap_coefficient(set_0, set_1)
+     print(distance_3)
+     ```
+ 
+     Output:
+     ```
+     0.1428571428572653
+     0.2500000000001875
+     0.25000000000009376
+     0.2500000000001875
+     ```
+ 
+   - **Similarity between probability distributions**
+ 
+     ```python
+     import deeplotx.similarity as sim
+ 
+     dist_0, dist_1 = [0.3, 0.2, 0.1, 0.4], [0.2, 0.1, 0.3, 0.4]
+     # Cross-entropy
+     distance_0 = sim.cross_entropy(dist_0, dist_1)
+     print(distance_0)
+     # KL divergence
+     distance_1 = sim.kl_divergence(dist_0, dist_1)
+     print(distance_1)
+     # JS divergence
+     distance_2 = sim.js_divergence(dist_0, dist_1)
+     print(distance_2)
+     # Hellinger distance
+     distance_3 = sim.hellinger_distance(dist_0, dist_1)
+     print(distance_3)
+     ```
+ 
+     Output:
+     ```
+     0.3575654913778237
+     0.15040773967762736
+     0.03969123741566945
+     0.20105866986400994
+     ```
+ 
+ - ### Predefined deep neural networks
+ 
+   ```python
+   from deeplotx import (
+       LinearRegression,  # linear regression
+       LogisticRegression,  # logistic regression / binary or multi-label classification
+       SoftmaxRegression,  # softmax regression / multi-class classification
+       RecursiveSequential,  # sequence model / recurrent neural network
+       AutoRegression  # autoregressive model
+   )
+   ```
+ 
+   Base network structure:
+ 
+   ```python
+   from typing_extensions import override
+ 
+   import torch
+   from torch import nn
+ 
+   from deeplotx.nn.base_neural_network import BaseNeuralNetwork
+ 
+ 
+   class LinearRegression(BaseNeuralNetwork):
+       def __init__(self, input_dim: int, output_dim: int, model_name: str | None = None):
+           super().__init__(model_name=model_name)
+           self.fc1 = nn.Linear(input_dim, 1024)
+           self.fc1_to_fc4_res = nn.Linear(1024, 64)
+           self.fc2 = nn.Linear(1024, 768)
+           self.fc3 = nn.Linear(768, 128)
+           self.fc4 = nn.Linear(128, 64)
+           self.fc5 = nn.Linear(64, output_dim)
+           self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3)
+           self.parametric_relu_2 = nn.PReLU(num_parameters=1, init=5e-3)
+           self.parametric_relu_3 = nn.PReLU(num_parameters=1, init=5e-3)
+           self.parametric_relu_4 = nn.PReLU(num_parameters=1, init=5e-3)
+ 
+       @override
+       def forward(self, x) -> torch.Tensor:
+           fc1_out = self.parametric_relu_1(self.fc1(x))
+           x = nn.LayerNorm(normalized_shape=1024, eps=1e-9)(fc1_out)
+           x = torch.dropout(x, p=0.2, train=self.training)
+           x = self.parametric_relu_2(self.fc2(x))
+           x = nn.LayerNorm(normalized_shape=768, eps=1e-9)(x)
+           x = torch.dropout(x, p=0.2, train=self.training)
+           x = self.parametric_relu_3(self.fc3(x))
+           x = torch.dropout(x, p=0.2, train=self.training)
+           x = self.parametric_relu_4(self.fc4(x)) + self.fc1_to_fc4_res(fc1_out)
+           x = self.fc5(x)
+           return x
+   ```
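The vector-similarity outputs quoted in the README above can be reproduced with plain arithmetic, which also makes clear that these `*_similarity` helpers return distances (smaller means more similar). A standalone check that does not depend on deeplotx:

```python
import math

v0, v1 = [1, 2, 3, 4], [4, 3, 2, 1]

# Euclidean distance: sqrt(9 + 1 + 1 + 9) = sqrt(20)
print(math.sqrt(sum((a - b) ** 2 for a, b in zip(v0, v1))))  # 4.47213595499958

# Cosine distance: dot = 20 and both norms are sqrt(30), hence 1 - 20/30
dot = sum(a * b for a, b in zip(v0, v1))
norms = math.sqrt(sum(a * a for a in v0)) * math.sqrt(sum(b * b for b in v1))
print(1 - dot / norms)  # 0.33333333333333337

# Chebyshev distance: the largest coordinate-wise gap, |1 - 4| = 3
print(max(abs(a - b) for a, b in zip(v0, v1)))  # 3
```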
--- a/deeplotx-0.4.8.dist-info/RECORD
+++ b/deeplotx-0.4.9.dist-info/RECORD
@@ -1,8 +1,8 @@
  deeplotx/__init__.py,sha256=wMN_AI14V-0BPbQghYpvd2y7eUGfhr7jKTTuur-5Upg,1002
  deeplotx/encoder/__init__.py,sha256=EM-xrTsHoGaiiFpj-iFAxilMHXC_sQKWYrcq1qCnI3U,138
- deeplotx/encoder/bert_encoder.py,sha256=6QY2pOvayWNz4w749JAGndvQ-jeKJgy3BalQl2JCkgk,1994
- deeplotx/encoder/long_text_encoder.py,sha256=FP0ACiOaOCjK2buRSWqBs-peg3IWQKuIdP2S00LNvSs,3271
- deeplotx/encoder/longformer_encoder.py,sha256=4avKYsLN6TTpPoky8BQ0nIhQm8lVxMvvzqkrdKCWj3Q,1433
+ deeplotx/encoder/bert_encoder.py,sha256=VCmYsBSqB9bRL_ge4bYssyx-Xy4oR0-DE1cMTuTn1tU,2412
+ deeplotx/encoder/long_text_encoder.py,sha256=7On6NuaINDZLqgb3HsSJBEzbWXNZPh_MXAvO5KY471k,3313
+ deeplotx/encoder/longformer_encoder.py,sha256=J8Si8Ta0bh7Vo7YsV0XdC7jGrvIt54GKcHr_pq2qHbI,1857
  deeplotx/nn/__init__.py,sha256=oQ-vYXyuaGelfCOs2im_gZXAiiBlCCVXh1uw9yjvRMs,253
  deeplotx/nn/auto_regression.py,sha256=o82C9TREZbhGdj2knSVGTXhjJne0LGEqc7BllByJJWE,449
  deeplotx/nn/base_neural_network.py,sha256=xWKG4FX6Jzdlrfc1HOW1aO9uh0Af3D-dB5Jl7eCxsAk,1635
@@ -20,8 +20,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=5O-5dwVMCj5EDX9gjJ
  deeplotx/util/__init__.py,sha256=JxqAK_WOOHcYVSTHBT1-WuBwWrPEVDTV3titeVWvNUM,74
  deeplotx/util/hash.py,sha256=wwsC6kOQvbpuvwKsNQOARd78_wePmW9i3oaUuXRUnpc,352
  deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
- deeplotx-0.4.8.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
- deeplotx-0.4.8.dist-info/METADATA,sha256=KMyt-My-d5261MGBfC1_HsyqbJ_KVEvZ--kZNFq8B2A,1656
- deeplotx-0.4.8.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
- deeplotx-0.4.8.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
- deeplotx-0.4.8.dist-info/RECORD,,
+ deeplotx-0.4.9.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+ deeplotx-0.4.9.dist-info/METADATA,sha256=VucPny6Tz6-bCc7xB6G_3MJxI2_0FySACWbPX7CDTTo,6292
+ deeplotx-0.4.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ deeplotx-0.4.9.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+ deeplotx-0.4.9.dist-info/RECORD,,
--- a/deeplotx-0.4.8.dist-info/WHEEL
+++ b/deeplotx-0.4.9.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.4.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

--- a/deeplotx-0.4.8.dist-info/METADATA
+++ /dev/null
@@ -1,72 +0,0 @@
- Metadata-Version: 2.4
- Name: deeplotx
- Version: 0.4.8
- Summary: Easy-2-use long text NLP toolkit.
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: hf-xet>=1.0.5
- Requires-Dist: jupyter
- Requires-Dist: numpy
- Requires-Dist: python-dotenv>=1.1.0
- Requires-Dist: torch
- Requires-Dist: transformers
- Requires-Dist: typing-extensions>=4.13.2
- Dynamic: license-file
- 
- [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/LoTC)
- 
- # DeepLoTX
- 
- An Easy-2-use long text NLP toolkit
- 
- ## Installation
- 
- - Install with pip
- 
-   ```
-   pip install -U deeplotx
-   ```
- 
- - Install with uv
- 
-   ```
-   uv add -U deeplotx
-   ```
- 
- - Install from github
- 
-   ```
-   pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
-   ```
- 
- ## Quick Start
- 
- To train a binary classifier from text files:
- 
- ```python
- from deeplotx.util import get_files, read_file
- from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
- 
- long_text_encoder = LongTextEncoder(
-     max_length=2048,
-     chunk_size=512,
-     overlapping=128
- )
- 
- trainer = TextBinaryClassifierTrainer(
-     long_text_encoder=long_text_encoder,
-     batch_size=4,
-     train_ratio=0.9
- )
- 
- pos_data_path = './data/pos'
- neg_data_path = './data/neg'
- pos_data = [read_file(x) for x in get_files(pos_data_path)]
- neg_data = [read_file(x) for x in get_files(neg_data_path)]
- model = trainer.train(pos_data, neg_data, num_epochs=20, learning_rate=2e-5, train_loss_threshold=1)
- model.save()
- 
- model = model.load()
- model.predict(long_text_encoder.encode('这是一个测试文本.').squeeze())
- ```