PyPI - deeplotx - Versions diffs - 0.8.7__tar.gz → 0.8.8__tar.gz - Mend

deeplotx 0.8.7.tar.gz → 0.8.8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (40) hide show

{deeplotx-0.8.7 → deeplotx-0.8.8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.7
+Version: 0.8.8
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -10,6 +10,7 @@ Requires-Dist: jupyter
 Requires-Dist: numpy
 Requires-Dist: protobuf
 Requires-Dist: python-dotenv
+Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/encoder.py RENAMED Viewed

@@ -43,9 +43,11 @@ class Encoder(nn.Module):
         self.embed_dim = self.encoder.config.max_position_embeddings
         logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True,
+                *args, **kwargs) -> torch.Tensor:
         def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-            return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
+            emb_seq = self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state
+            return emb_seq[:, 0, :] if cls_only else emb_seq
         num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
         chunks, chunk_results = [], []
@@ -58,9 +60,9 @@ class Encoder(nn.Module):
         with torch.no_grad():
             chunk_results = [_encoder(x) for x in chunks]
         self.encoder.train(mode=ori_mode)
-        return torch.cat(chunk_results, dim=-1)
+        return torch.cat(chunk_results, dim=-1) if cls_only else torch.cat(chunk_results, dim=-2)
-    def encode(self, text: str) -> torch.Tensor:
+    def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
         _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
         _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-        return self.forward(_input_ids, _att_mask).squeeze()
+        return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/long_text_encoder.py RENAMED Viewed

@@ -25,7 +25,7 @@ class LongTextEncoder(Encoder):
         self._worker_group = ThreadPool(max_workers=max_workers)
     def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
-        return idx, super().forward(x, attention_mask=mask)
+        return idx, super().forward(x, attention_mask=mask, cls_only=True)
     @override
     def forward(self, text: str, flatten: bool = False, *args, **kwargs) -> torch.Tensor:

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/longformer_encoder.py RENAMED Viewed

@@ -9,7 +9,7 @@ from requests.exceptions import ConnectTimeout, SSLError
 from deeplotx import __ROOT__
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
-DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+DEFAULT_LONGFORMER = 'severinsimmler/xlm-roberta-longformer-base-16384'
 logger = logging.getLogger('deeplotx.embedding')
@@ -41,15 +41,16 @@ class LongformerEncoder(nn.Module):
                                                      trust_remote_code=True, local_files_only=True).to(self.device)
         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, cls_only: bool = True) -> torch.Tensor:
         ori_mode = self.encoder.training
         self.encoder.eval()
         with torch.no_grad():
-            res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+            emb_seq = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state
+            res = emb_seq[:, 0, :] if cls_only else emb_seq
         self.encoder.train(mode=ori_mode)
         return res
-    def encode(self, text: str) -> torch.Tensor:
+    def encode(self, text: str, cls_only: bool = True) -> torch.Tensor:
         _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
         _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-        return self.forward(_input_ids, _att_mask).squeeze()
+        return self.forward(input_ids=_input_ids, attention_mask=_att_mask, cls_only=cls_only).squeeze()

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.7
+Version: 0.8.8
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -10,6 +10,7 @@ Requires-Dist: jupyter
 Requires-Dist: numpy
 Requires-Dist: protobuf
 Requires-Dist: python-dotenv
+Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/requires.txt RENAMED Viewed

@@ -3,6 +3,7 @@ jupyter
 numpy
 protobuf
 python-dotenv
+tiktoken
 torch
 transformers
 typing-extensions

{deeplotx-0.8.7 → deeplotx-0.8.8}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "deeplotx"
-version = "0.8.7"
+version = "0.8.8"
 description = "Easy-2-use long text NLP toolkit."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -10,6 +10,7 @@ dependencies = [
     "numpy",
     "protobuf",
     "python-dotenv",
+    "tiktoken",
     "torch",
     "transformers",
     "typing-extensions",

{deeplotx-0.8.7 → deeplotx-0.8.8}/LICENSE RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/README.md RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/encoder/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/attention.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/auto_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/base_neural_network.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/feed_forward.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/linear_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/logistic_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/long_context_auto_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/long_context_recursive_sequential.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/multi_head_attention.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/multi_head_feed_forward.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/recursive_sequential.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/roformer_encoder.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/rope.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/nn/softmax_regression.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/distribution.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/set.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/similarity/vector.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/trainer/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/trainer/base_trainer.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/trainer/text_binary_classification_trainer.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/util/__init__.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/util/hash.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx/util/read_file.py RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/SOURCES.txt RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/dependency_links.txt RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/deeplotx.egg-info/top_level.txt RENAMED Viewed

File without changes

{deeplotx-0.8.7 → deeplotx-0.8.8}/setup.cfg RENAMED Viewed

File without changes

deeplotx 0.8.7__tar.gz → 0.8.8__tar.gz

deeplotx 0.8.7.tar.gz → 0.8.8.tar.gz