PyPI - deeplotx - Versions diffs - 0.5.3__tar.gz → 0.5.6__tar.gz - Mend

deeplotx 0.5.3__tar.gz → 0.5.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

{deeplotx-0.5.3 → deeplotx-0.5.6}/PKG-INFO RENAMED Viewed

@@ -1,18 +1,19 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.5.3
+Version: 0.5.6
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: hf-xet>=1.0.5
+Requires-Dist: hf-xet
 Requires-Dist: jupyter
 Requires-Dist: numpy
 Requires-Dist: protobuf
-Requires-Dist: python-dotenv>=1.1.0
+Requires-Dist: python-dotenv
 Requires-Dist: torch
 Requires-Dist: transformers
-Requires-Dist: typing-extensions>=4.13.2
+Requires-Dist: typing-extensions
+Requires-Dist: vortezwohl>=0.0.6
 Dynamic: license-file
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -264,7 +265,8 @@ Dynamic: license-file
     long_text_encoder = LongTextEncoder(
         max_length=2048,  # 最大文本大小, 超出截断
         chunk_size=448,  # 块大小 (按 Token 计)
-        overlapping=32  # 块间重叠大小 (按 Token 计)
+        overlapping=32,  # 块间重叠大小 (按 Token 计)
+        cache_capacity=512  # 缓存大小
     )
     trainer = TextBinaryClassifierTrainer(

{deeplotx-0.5.3 → deeplotx-0.5.6}/README.md RENAMED Viewed

@@ -247,7 +247,8 @@
     long_text_encoder = LongTextEncoder(
         max_length=2048,  # 最大文本大小, 超出截断
         chunk_size=448,  # 块大小 (按 Token 计)
-        overlapping=32  # 块间重叠大小 (按 Token 计)
+        overlapping=32,  # 块间重叠大小 (按 Token 计)
+        cache_capacity=512  # 缓存大小
     )
     trainer = TextBinaryClassifierTrainer(

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/__init__.py RENAMED Viewed

@@ -3,7 +3,7 @@ import os
 __ROOT__ = os.path.dirname(os.path.abspath(__file__))
-from .encoder import BertEncoder, LongTextEncoder, LongformerEncoder
+from .encoder import Encoder, LongTextEncoder, LongformerEncoder
 from .nn import (
     LinearRegression,
     LogisticRegression,

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/encoder/__init__.py RENAMED Viewed

@@ -1,3 +1,3 @@
-from .bert_encoder import BertEncoder
+from .encoder import Encoder
 from .long_text_encoder import LongTextEncoder
 from .longformer_encoder import LongformerEncoder

deeplotx-0.5.3/deeplotx/encoder/bert_encoder.py → deeplotx-0.5.6/deeplotx/encoder/encoder.py RENAMED Viewed

@@ -4,30 +4,30 @@ import math
 import torch
 from torch import nn
-from transformers import BertTokenizer, BertModel
+from transformers import AutoTokenizer, AutoModel
 from deeplotx import __ROOT__
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
-DEFAULT_BERT = 'bert-base-uncased'
+DEFAULT_BERT = 'FacebookAI/xlm-roberta-base'
 logger = logging.getLogger('deeplotx.embedding')
-class BertEncoder(nn.Module):
+class Encoder(nn.Module):
     def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
         super().__init__()
         self.device = torch.device(device) if device is not None \
             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                        cache_dir=CACHE_PATH, _from_auto=True)
-        self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                              cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
-        self.embed_dim = self.bert.config.max_position_embeddings
-        logger.debug(f'{BertEncoder.__name__} initialized on device: {self.device}.')
+        self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                 cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
+        self.embed_dim = self.encoder.config.max_position_embeddings
+        logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, *args, **kwargs) -> torch.Tensor:
         def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-            return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
+            return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
         num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
         chunks = chunk_results = []
@@ -35,11 +35,11 @@ class BertEncoder(nn.Module):
             start_idx = i * self.embed_dim
             end_idx = min(start_idx + self.embed_dim, input_ids.shape[-1])
             chunks.append((input_ids[:, start_idx: end_idx], attention_mask[:, start_idx: end_idx]))
-        ori_mode = self.bert.training
-        self.bert.eval()
+        ori_mode = self.encoder.training
+        self.encoder.eval()
         with torch.no_grad():
             chunk_results = [_encoder(x) for x in chunks]
-        self.bert.train(mode=ori_mode)
+        self.encoder.train(mode=ori_mode)
         return torch.cat(chunk_results, dim=-1)
     def encode(self, text: str) -> torch.Tensor:

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/encoder/long_text_encoder.py RENAMED Viewed

@@ -1,30 +1,36 @@
 import logging
 import math
-from concurrent.futures import ThreadPoolExecutor
 from typing_extensions import override
 import torch
+from vortezwohl.concurrent import ThreadPool
+from vortezwohl.cache import LRUCache
-from deeplotx.encoder.bert_encoder import BertEncoder, DEFAULT_BERT
-from deeplotx.util.hash import md5
+from deeplotx.encoder.encoder import Encoder, DEFAULT_BERT
+from deeplotx.util.hash import sha512
 logger = logging.getLogger('deeplotx.embedding')
-class LongTextEncoder(BertEncoder):
-    def __init__(self, max_length: int, chunk_size: int = 256,
-                 overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
+class LongTextEncoder(Encoder):
+    def __init__(self, max_length: int, chunk_size: int = 448,
+                 overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
+                 cache_capacity: int = 64, device: str | None = None):
         super().__init__(model_name_or_path=model_name_or_path, device=device)
         self._max_length = max_length
         self._chunk_size = chunk_size
         self._overlapping = overlapping
-        self._cache = dict()
+        self._cache = LRUCache(capacity=cache_capacity)
-    def __chunk_embedding(self, input_tup: tuple[int, torch.Tensor]) -> tuple[int, torch.Tensor]:
-        return input_tup[0], super().forward(input_tup[1], attention_mask=input_tup[2])
+    def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
+        return idx, super().forward(x, attention_mask=mask)
     @override
-    def encode(self, text: str, flatten: bool = True, use_cache: bool = True) -> torch.Tensor:
+    def forward(self, text: str, flatten: bool = False, *args, **kwargs) -> torch.Tensor:
+        return self.encode(text=text, flatten=flatten)
+    @override
+    def encode(self, text: str, flatten: bool = False) -> torch.Tensor:
         def postprocess(tensors: list[torch.Tensor], _flatten: bool) -> torch.Tensor:
             if not _flatten:
                 return torch.stack(tensors, dim=0).squeeze()
@@ -36,8 +42,8 @@ class LongTextEncoder(BertEncoder):
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
-        _text_hash = md5(text)
-        if _text_hash in self._cache.keys():
+        _text_hash = sha512(text)
+        if _text_hash in self._cache:
             return postprocess(self._cache[_text_hash], flatten)
         _text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
         _text_to_input_ids_att_mask = []
@@ -57,11 +63,9 @@ class LongTextEncoder(BertEncoder):
             _tmp_right = (i + 1) * self._chunk_size + self._overlapping
             chunks.append((i, torch.tensor([_text_to_input_ids[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device),
                            torch.tensor([_text_to_input_ids_att_mask[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device)))
-        with ThreadPoolExecutor(max_workers=min(num_chunks + 1, 3)) as executor:
-            embeddings = list(executor.map(self.__chunk_embedding, chunks))
-        embeddings.sort(key=lambda x: x[0])
+        embeddings = list(ThreadPool(max_workers=min(num_chunks + 1, 8)).map(self.__chunk_embedding, chunks))
+        embeddings = sorted([x.returns for x in embeddings], key=lambda x: x[0], reverse=False)
         fin_embedding = [x[1] for x in embeddings]
         # write cache
-        if use_cache:
-            self._cache[_text_hash] = fin_embedding
+        self._cache[_text_hash] = fin_embedding
         return postprocess(fin_embedding, flatten)

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/trainer/text_binary_classification_trainer.py RENAMED Viewed

@@ -31,7 +31,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
             positive_texts = positive_texts[:min_length]
             negative_texts = negative_texts[:min_length]
         all_texts = positive_texts + negative_texts
-        text_embeddings = [self._long_text_encoder.encode(x, flatten=False, use_cache=True) for x in all_texts]
+        text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
         feature_dim = text_embeddings[0].shape[-1]
         dtype = text_embeddings[0].dtype
         labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]

deeplotx-0.5.6/deeplotx/util/hash.py ADDED Viewed

@@ -0,0 +1,29 @@
+import hashlib
+def md5(text: str) -> str:
+    _hash = hashlib.md5()
+    text_bytes = text.encode('utf-8')
+    _hash.update(text_bytes)
+    return _hash.hexdigest()
+def sha1(text: str) -> str:
+    _hash = hashlib.sha1()
+    text_bytes = text.encode('utf-8')
+    _hash.update(text_bytes)
+    return _hash.hexdigest()
+def sha256(text: str) -> str:
+    _hash = hashlib.sha256()
+    text_bytes = text.encode('utf-8')
+    _hash.update(text_bytes)
+    return _hash.hexdigest()
+def sha512(text: str) -> str:
+    _hash = hashlib.sha512()
+    text_bytes = text.encode('utf-8')
+    _hash.update(text_bytes)
+    return _hash.hexdigest()

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/PKG-INFO RENAMED Viewed

@@ -1,18 +1,19 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.5.3
+Version: 0.5.6
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
 License-File: LICENSE
-Requires-Dist: hf-xet>=1.0.5
+Requires-Dist: hf-xet
 Requires-Dist: jupyter
 Requires-Dist: numpy
 Requires-Dist: protobuf
-Requires-Dist: python-dotenv>=1.1.0
+Requires-Dist: python-dotenv
 Requires-Dist: torch
 Requires-Dist: transformers
-Requires-Dist: typing-extensions>=4.13.2
+Requires-Dist: typing-extensions
+Requires-Dist: vortezwohl>=0.0.6
 Dynamic: license-file
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -264,7 +265,8 @@ Dynamic: license-file
     long_text_encoder = LongTextEncoder(
         max_length=2048,  # 最大文本大小, 超出截断
         chunk_size=448,  # 块大小 (按 Token 计)
-        overlapping=32  # 块间重叠大小 (按 Token 计)
+        overlapping=32,  # 块间重叠大小 (按 Token 计)
+        cache_capacity=512  # 缓存大小
     )
     trainer = TextBinaryClassifierTrainer(

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/SOURCES.txt RENAMED Viewed

@@ -8,7 +8,7 @@ deeplotx.egg-info/dependency_links.txt
 deeplotx.egg-info/requires.txt
 deeplotx.egg-info/top_level.txt
 deeplotx/encoder/__init__.py
-deeplotx/encoder/bert_encoder.py
+deeplotx/encoder/encoder.py
 deeplotx/encoder/long_text_encoder.py
 deeplotx/encoder/longformer_encoder.py
 deeplotx/nn/__init__.py

deeplotx-0.5.6/deeplotx.egg-info/requires.txt ADDED Viewed

@@ -0,0 +1,9 @@
+hf-xet
+jupyter
+numpy
+protobuf
+python-dotenv
+torch
+transformers
+typing-extensions
+vortezwohl>=0.0.6

{deeplotx-0.5.3 → deeplotx-0.5.6}/pyproject.toml RENAMED Viewed

@@ -1,16 +1,17 @@
-[project]
-name = "deeplotx"
-version = "0.5.3"
-description = "Easy-2-use long text NLP toolkit."
-readme = "README.md"
-requires-python = ">=3.10"
-dependencies = [
-    "hf-xet>=1.0.5",
-    "jupyter",
-    "numpy",
-    "protobuf",
-    "python-dotenv>=1.1.0",
-    "torch",
-    "transformers",
-    "typing-extensions>=4.13.2",
-]
+[project]
+name = "deeplotx"
+version = "0.5.6"
+description = "Easy-2-use long text NLP toolkit."
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "hf-xet",
+    "jupyter",
+    "numpy",
+    "protobuf",
+    "python-dotenv",
+    "torch",
+    "transformers",
+    "typing-extensions",
+    "vortezwohl>=0.0.6",
+]

deeplotx-0.5.3/deeplotx/util/hash.py DELETED Viewed

@@ -1,15 +0,0 @@
-import hashlib
-def md5(text: str) -> str:
-    md5_hash = hashlib.md5()
-    text_bytes = text.encode('utf-8')
-    md5_hash.update(text_bytes)
-    return md5_hash.hexdigest()
-def sha1(text: str) -> str:
-    md5_hash = hashlib.sha1()
-    text_bytes = text.encode('utf-8')
-    md5_hash.update(text_bytes)
-    return md5_hash.hexdigest()

deeplotx-0.5.3/deeplotx.egg-info/requires.txt DELETED Viewed

@@ -1,8 +0,0 @@
-hf-xet>=1.0.5
-jupyter
-numpy
-protobuf
-python-dotenv>=1.1.0
-torch
-transformers
-typing-extensions>=4.13.2

{deeplotx-0.5.3 → deeplotx-0.5.6}/LICENSE RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/encoder/longformer_encoder.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/auto_regression.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/base_neural_network.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/linear_regression.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/logistic_regression.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/long_context_auto_regression.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/long_context_recursive_sequential.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/recursive_sequential.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/self_attention.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/nn/softmax_regression.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/distribution.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/set.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/similarity/vector.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/trainer/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/trainer/base_trainer.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/util/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx/util/read_file.py RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/deeplotx.egg-info/top_level.txt RENAMED Viewed

File without changes

{deeplotx-0.5.3 → deeplotx-0.5.6}/setup.cfg RENAMED Viewed

File without changes

deeplotx 0.5.3__tar.gz → 0.5.6__tar.gz

deeplotx 0.5.3__tar.gz → 0.5.6__tar.gz