deeplotx 0.5.5__tar.gz → 0.5.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deeplotx-0.5.5 → deeplotx-0.5.6}/PKG-INFO +3 -2
- {deeplotx-0.5.5 → deeplotx-0.5.6}/README.md +2 -1
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/encoder/encoder.py +1 -1
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/encoder/long_text_encoder.py +4 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/trainer/text_binary_classification_trainer.py +1 -1
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx.egg-info/PKG-INFO +3 -2
- {deeplotx-0.5.5 → deeplotx-0.5.6}/pyproject.toml +1 -1
- {deeplotx-0.5.5 → deeplotx-0.5.6}/LICENSE +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/__init__.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/encoder/__init__.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/encoder/longformer_encoder.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/__init__.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/auto_regression.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/base_neural_network.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/linear_regression.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/logistic_regression.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/long_context_auto_regression.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/recursive_sequential.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/self_attention.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/nn/softmax_regression.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/similarity/__init__.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/similarity/distribution.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/similarity/set.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/similarity/vector.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/trainer/__init__.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/trainer/base_trainer.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/util/__init__.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/util/hash.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx/util/read_file.py +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx.egg-info/SOURCES.txt +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx.egg-info/dependency_links.txt +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx.egg-info/requires.txt +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/deeplotx.egg-info/top_level.txt +0 -0
- {deeplotx-0.5.5 → deeplotx-0.5.6}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: deeplotx
|
3
|
-
Version: 0.5.5
|
3
|
+
Version: 0.5.6
|
4
4
|
Summary: Easy-2-use long text NLP toolkit.
|
5
5
|
Requires-Python: >=3.10
|
6
6
|
Description-Content-Type: text/markdown
|
@@ -265,7 +265,8 @@ Dynamic: license-file
|
|
265
265
|
long_text_encoder = LongTextEncoder(
|
266
266
|
max_length=2048, # 最大文本大小, 超出截断
|
267
267
|
chunk_size=448, # 块大小 (按 Token 计)
|
268
|
-
overlapping=32 # 块间重叠大小 (按 Token 计)
|
268
|
+
overlapping=32, # 块间重叠大小 (按 Token 计)
|
269
|
+
cache_capacity=512 # 缓存大小
|
269
270
|
)
|
270
271
|
|
271
272
|
trainer = TextBinaryClassifierTrainer(
|
@@ -247,7 +247,8 @@
|
|
247
247
|
long_text_encoder = LongTextEncoder(
|
248
248
|
max_length=2048, # 最大文本大小, 超出截断
|
249
249
|
chunk_size=448, # 块大小 (按 Token 计)
|
250
|
-
overlapping=32 # 块间重叠大小 (按 Token 计)
|
250
|
+
overlapping=32, # 块间重叠大小 (按 Token 计)
|
251
|
+
cache_capacity=512 # 缓存大小
|
251
252
|
)
|
252
253
|
|
253
254
|
trainer = TextBinaryClassifierTrainer(
|
@@ -25,7 +25,7 @@ class Encoder(nn.Module):
|
|
25
25
|
self.embed_dim = self.encoder.config.max_position_embeddings
|
26
26
|
logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
|
27
27
|
|
28
|
-
def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
|
28
|
+
def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor, *args, **kwargs) -> torch.Tensor:
|
29
29
|
def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
|
30
30
|
return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
|
31
31
|
|
@@ -25,6 +25,10 @@ class LongTextEncoder(Encoder):
|
|
25
25
|
def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
|
26
26
|
return idx, super().forward(x, attention_mask=mask)
|
27
27
|
|
28
|
+
@override
|
29
|
+
def forward(self, text: str, flatten: bool = False, *args, **kwargs) -> torch.Tensor:
|
30
|
+
return self.encode(text=text, flatten=flatten)
|
31
|
+
|
28
32
|
@override
|
29
33
|
def encode(self, text: str, flatten: bool = False) -> torch.Tensor:
|
30
34
|
def postprocess(tensors: list[torch.Tensor], _flatten: bool) -> torch.Tensor:
|
@@ -31,7 +31,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
|
|
31
31
|
positive_texts = positive_texts[:min_length]
|
32
32
|
negative_texts = negative_texts[:min_length]
|
33
33
|
all_texts = positive_texts + negative_texts
|
34
|
-
text_embeddings = [self._long_text_encoder.encode(x, flatten=False
|
34
|
+
text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
|
35
35
|
feature_dim = text_embeddings[0].shape[-1]
|
36
36
|
dtype = text_embeddings[0].dtype
|
37
37
|
labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: deeplotx
|
3
|
-
Version: 0.5.5
|
3
|
+
Version: 0.5.6
|
4
4
|
Summary: Easy-2-use long text NLP toolkit.
|
5
5
|
Requires-Python: >=3.10
|
6
6
|
Description-Content-Type: text/markdown
|
@@ -265,7 +265,8 @@ Dynamic: license-file
|
|
265
265
|
long_text_encoder = LongTextEncoder(
|
266
266
|
max_length=2048, # 最大文本大小, 超出截断
|
267
267
|
chunk_size=448, # 块大小 (按 Token 计)
|
268
|
-
overlapping=32 # 块间重叠大小 (按 Token 计)
|
268
|
+
overlapping=32, # 块间重叠大小 (按 Token 计)
|
269
|
+
cache_capacity=512 # 缓存大小
|
269
270
|
)
|
270
271
|
|
271
272
|
trainer = TextBinaryClassifierTrainer(
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|