deeplotx 0.5.1__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deeplotx/__init__.py CHANGED
@@ -3,7 +3,7 @@ import os
 
  __ROOT__ = os.path.dirname(os.path.abspath(__file__))
 
- from .encoder import BertEncoder, LongTextEncoder, LongformerEncoder
+ from .encoder import Encoder, LongTextEncoder, LongformerEncoder
  from .nn import (
      LinearRegression,
      LogisticRegression,
deeplotx/encoder/__init__.py CHANGED
@@ -1,3 +1,3 @@
- from .bert_encoder import BertEncoder
+ from .encoder import Encoder
  from .long_text_encoder import LongTextEncoder
  from .longformer_encoder import LongformerEncoder
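
The public re-exports change with the rename, so downstream imports need a one-line update. A minimal migration sketch, assuming nothing else in the calling code relies on BERT-specific behavior:

```python
# deeplotx 0.5.1 exposed the BERT-specific class:
#     from deeplotx import BertEncoder
# deeplotx 0.5.5 exposes the generic Encoder instead:
from deeplotx import Encoder

encoder = Encoder()  # falls back to the package's DEFAULT_BERT checkpoint when no model is passed
```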
deeplotx/encoder/bert_encoder.py → deeplotx/encoder/encoder.py RENAMED
@@ -4,30 +4,30 @@ import math
 
  import torch
  from torch import nn
- from transformers import BertTokenizer, BertModel
+ from transformers import AutoTokenizer, AutoModel
 
  from deeplotx import __ROOT__
 
  CACHE_PATH = os.path.join(__ROOT__, '.cache')
- DEFAULT_BERT = 'bert-base-uncased'
+ DEFAULT_BERT = 'FacebookAI/xlm-roberta-base'
  logger = logging.getLogger('deeplotx.embedding')
 
 
- class BertEncoder(nn.Module):
+ class Encoder(nn.Module):
      def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
          super().__init__()
          self.device = torch.device(device) if device is not None \
              else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-         self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+         self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                         cache_dir=CACHE_PATH, _from_auto=True)
-         self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                               cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
-         self.embed_dim = self.bert.config.max_position_embeddings
-         logger.debug(f'{BertEncoder.__name__} initialized on device: {self.device}.')
+         self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                  cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
+         self.embed_dim = self.encoder.config.max_position_embeddings
+         logger.debug(f'{Encoder.__name__} initialized on device: {self.device}.')
 
      def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
          def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
-             return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
+             return self.encoder.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
 
          num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
          chunks = chunk_results = []
@@ -35,11 +35,11 @@ class BertEncoder(nn.Module):
              start_idx = i * self.embed_dim
              end_idx = min(start_idx + self.embed_dim, input_ids.shape[-1])
              chunks.append((input_ids[:, start_idx: end_idx], attention_mask[:, start_idx: end_idx]))
-         ori_mode = self.bert.training
-         self.bert.eval()
+         ori_mode = self.encoder.training
+         self.encoder.eval()
          with torch.no_grad():
              chunk_results = [_encoder(x) for x in chunks]
-         self.bert.train(mode=ori_mode)
+         self.encoder.train(mode=ori_mode)
          return torch.cat(chunk_results, dim=-1)
 
      def encode(self, text: str) -> torch.Tensor:
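
Replacing BertTokenizer/BertModel with AutoTokenizer/AutoModel lets the encoder load any checkpoint AutoModel supports, and the default checkpoint moves from bert-base-uncased to FacebookAI/xlm-roberta-base. A minimal usage sketch of the renamed class; the explicit model name and device below are only illustrative:

```python
from deeplotx import Encoder

# Any AutoModel-compatible checkpoint can be passed; omitting it falls back to
# DEFAULT_BERT ('FacebookAI/xlm-roberta-base' as of 0.5.5).
encoder = Encoder(model_name_or_path='FacebookAI/xlm-roberta-base', device='cpu')

# encode() returns an embedding tensor for the text.
embedding = encoder.encode('DeepLoTX makes long-text embedding straightforward.')
print(embedding.shape)
```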
deeplotx/encoder/long_text_encoder.py CHANGED
@@ -1,30 +1,32 @@
  import logging
  import math
- from concurrent.futures import ThreadPoolExecutor
  from typing_extensions import override
 
  import torch
+ from vortezwohl.concurrent import ThreadPool
+ from vortezwohl.cache import LRUCache
 
- from deeplotx.encoder.bert_encoder import BertEncoder, DEFAULT_BERT
- from deeplotx.util.hash import md5
+ from deeplotx.encoder.encoder import Encoder, DEFAULT_BERT
+ from deeplotx.util.hash import sha512
 
  logger = logging.getLogger('deeplotx.embedding')
 
 
- class LongTextEncoder(BertEncoder):
-     def __init__(self, max_length: int, chunk_size: int = 256,
-                  overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
+ class LongTextEncoder(Encoder):
+     def __init__(self, max_length: int, chunk_size: int = 448,
+                  overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
+                  cache_capacity: int = 64, device: str | None = None):
          super().__init__(model_name_or_path=model_name_or_path, device=device)
          self._max_length = max_length
          self._chunk_size = chunk_size
          self._overlapping = overlapping
-         self._cache = dict()
+         self._cache = LRUCache(capacity=cache_capacity)
 
-     def __chunk_embedding(self, input_tup: tuple[int, torch.Tensor]) -> tuple[int, torch.Tensor]:
-         return input_tup[0], super().forward(input_tup[1], attention_mask=input_tup[2])
+     def __chunk_embedding(self, idx: int, x: torch.Tensor, mask: torch.Tensor) -> tuple[int, torch.Tensor]:
+         return idx, super().forward(x, attention_mask=mask)
 
      @override
-     def encode(self, text: str, flatten: bool = True, use_cache: bool = True) -> torch.Tensor:
+     def encode(self, text: str, flatten: bool = False) -> torch.Tensor:
          def postprocess(tensors: list[torch.Tensor], _flatten: bool) -> torch.Tensor:
              if not _flatten:
                  return torch.stack(tensors, dim=0).squeeze()
@@ -36,8 +38,8 @@ class LongTextEncoder(BertEncoder):
          _text_to_show = text.replace("\n", str())
          logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
          # read cache
-         _text_hash = md5(text)
-         if _text_hash in self._cache.keys():
+         _text_hash = sha512(text)
+         if _text_hash in self._cache:
              return postprocess(self._cache[_text_hash], flatten)
          _text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
          _text_to_input_ids_att_mask = []
@@ -57,11 +59,9 @@ class LongTextEncoder(BertEncoder):
              _tmp_right = (i + 1) * self._chunk_size + self._overlapping
              chunks.append((i, torch.tensor([_text_to_input_ids[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device),
                             torch.tensor([_text_to_input_ids_att_mask[_tmp_left: _tmp_right]], dtype=torch.int, device=self.device)))
-         with ThreadPoolExecutor(max_workers=min(num_chunks + 1, 3)) as executor:
-             embeddings = list(executor.map(self.__chunk_embedding, chunks))
-         embeddings.sort(key=lambda x: x[0])
+         embeddings = list(ThreadPool(max_workers=min(num_chunks + 1, 8)).map(self.__chunk_embedding, chunks))
+         embeddings = sorted([x.returns for x in embeddings], key=lambda x: x[0], reverse=False)
          fin_embedding = [x[1] for x in embeddings]
          # write cache
-         if use_cache:
-             self._cache[_text_hash] = fin_embedding
+         self._cache[_text_hash] = fin_embedding
          return postprocess(fin_embedding, flatten)
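
For callers, the practical changes are the new chunking defaults (chunk_size=448, overlapping=32), a bounded LRU cache keyed by sha512 instead of an unbounded dict keyed by md5, and encode() now defaulting to flatten=False. A short sketch of the updated constructor and call, with illustrative argument values:

```python
from deeplotx import LongTextEncoder

encoder = LongTextEncoder(
    max_length=2048,    # truncate input to at most 2048 tokens
    chunk_size=448,     # new default chunk size in 0.5.5
    overlapping=32,     # new default overlap between chunks
    cache_capacity=64   # new bound on the per-text embedding cache
)

stacked = encoder.encode('a very long document ...')             # flatten=False is the new default
flat = encoder.encode('a very long document ...', flatten=True)  # previous flattened behavior
```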
deeplotx/trainer/text_binary_classification_trainer.py CHANGED
@@ -17,6 +17,8 @@ class TextBinaryClassifierTrainer(BaseTrainer):
          super().__init__(batch_size=batch_size, train_ratio=train_ratio)
          self._long_text_encoder = long_text_encoder
          self.device = self._long_text_encoder.device
+         self.train_dataset_loader = None
+         self.valid_dataset_loader = None
 
      @override
      def train(self, positive_texts: list[str], negative_texts: list[str],
@@ -40,8 +42,9 @@ class TextBinaryClassifierTrainer(BaseTrainer):
          train_size = int(self._train_ratio * dataset_size)
          train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
          valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])
-         train_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
-         valid_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
+         self.train_dataset_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
+         self.valid_dataset_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
+ 
          if self.model is not None and self.model.fc1.in_features != feature_dim:
              logger.warning("The dimension of features doesn't match. A new model instance will be created.")
              self.model = None
@@ -55,7 +58,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
          for epoch in range(num_epochs):
              self.model.train()
              total_loss = 0.0
-             for batch_texts, batch_labels in train_loader:
+             for batch_texts, batch_labels in self.train_dataset_loader:
                  outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
                  loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
                  optimizer.zero_grad()
@@ -64,7 +67,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
                  total_loss += loss.item()
              if epoch % 3 == 0:
                  total_valid_loss = 0.0
-                 for batch_texts, batch_labels in valid_loader:
+                 for batch_texts, batch_labels in self.valid_dataset_loader:
                      with torch.no_grad():
                          self.model.eval()
                          outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
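
Because the DataLoaders are now stored on the trainer as train_dataset_loader / valid_dataset_loader rather than kept as locals, they remain accessible after train() returns. A hedged sketch, reusing the trainer, pos_data, and neg_data variables from the README example further down:

```python
model = trainer.train(pos_data, neg_data,
                      num_epochs=36, learning_rate=2e-5,
                      balancing_dataset=True,
                      alpha=1e-4, rho=.2,
                      hidden_dim=256, recursive_layers=2)

# The loaders built inside train() are now attributes on the trainer.
if trainer.valid_dataset_loader is not None:
    for batch_features, batch_labels in trainer.valid_dataset_loader:
        pass  # e.g. run additional offline evaluation on the held-out batches
```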
deeplotx/util/hash.py CHANGED
@@ -2,14 +2,28 @@ import hashlib
 
 
  def md5(text: str) -> str:
-     md5_hash = hashlib.md5()
+     _hash = hashlib.md5()
      text_bytes = text.encode('utf-8')
-     md5_hash.update(text_bytes)
-     return md5_hash.hexdigest()
+     _hash.update(text_bytes)
+     return _hash.hexdigest()
 
 
  def sha1(text: str) -> str:
-     md5_hash = hashlib.sha1()
+     _hash = hashlib.sha1()
      text_bytes = text.encode('utf-8')
-     md5_hash.update(text_bytes)
-     return md5_hash.hexdigest()
+     _hash.update(text_bytes)
+     return _hash.hexdigest()
+ 
+ 
+ def sha256(text: str) -> str:
+     _hash = hashlib.sha256()
+     text_bytes = text.encode('utf-8')
+     _hash.update(text_bytes)
+     return _hash.hexdigest()
+ 
+ 
+ def sha512(text: str) -> str:
+     _hash = hashlib.sha512()
+     text_bytes = text.encode('utf-8')
+     _hash.update(text_bytes)
+     return _hash.hexdigest()
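
The new sha256 and sha512 helpers mirror the existing md5/sha1 pattern, and LongTextEncoder now keys its cache with sha512(text). A quick usage sketch:

```python
from deeplotx.util.hash import md5, sha256, sha512

text = 'hello world'
print(md5(text))     # 32-character hex digest
print(sha256(text))  # 64-character hex digest
print(sha512(text))  # 128-character hex digest, now used as the LongTextEncoder cache key
```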
deeplotx-0.5.1.dist-info/METADATA → deeplotx-0.5.5.dist-info/METADATA RENAMED
@@ -1,18 +1,19 @@
  Metadata-Version: 2.4
  Name: deeplotx
- Version: 0.5.1
+ Version: 0.5.5
  Summary: Easy-2-use long text NLP toolkit.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
  License-File: LICENSE
- Requires-Dist: hf-xet>=1.0.5
+ Requires-Dist: hf-xet
  Requires-Dist: jupyter
  Requires-Dist: numpy
  Requires-Dist: protobuf
- Requires-Dist: python-dotenv>=1.1.0
+ Requires-Dist: python-dotenv
  Requires-Dist: torch
  Requires-Dist: transformers
- Requires-Dist: typing-extensions>=4.13.2
+ Requires-Dist: typing-extensions
+ Requires-Dist: vortezwohl>=0.0.6
  Dynamic: license-file
 
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -166,7 +167,10 @@ Dynamic: license-file
      LogisticRegression,  # logistic regression / binary classification / multi-label classification
      SoftmaxRegression,  # softmax regression / multi-class classification
      RecursiveSequential,  # sequence model / recurrent neural network
-     AutoRegression  # autoregressive model
+     LongContextRecursiveSequential,  # long-context sequence model / RNN with self-attention fusion
+     SelfAttention,  # self-attention module
+     AutoRegression,  # autoregressive model / recurrent neural network
+     LongContextAutoRegression  # long-context autoregressive model / RNN with self-attention fusion
  )
  ```
 
@@ -211,3 +215,84 @@ Dynamic: license-file
          x = self.fc5(x)
          return x
  ```
+ 
+ Self-attention module:
+ 
+ ```python
+ from typing_extensions import override
+ 
+ import torch
+ from torch import nn, softmax
+ 
+ from deeplotx.nn.base_neural_network import BaseNeuralNetwork
+ 
+ 
+ class SelfAttention(BaseNeuralNetwork):
+     def __init__(self, feature_dim: int, model_name: str | None = None,
+                  device: str | None = None, dtype: torch.dtype | None = None):
+         super().__init__(model_name=model_name, device=device, dtype=dtype)
+         self._feature_dim = feature_dim
+         self.q_proj = nn.Linear(in_features=self._feature_dim, out_features=self._feature_dim,
+                                 bias=True, device=self.device, dtype=self.dtype)
+         self.k_proj = nn.Linear(in_features=self._feature_dim, out_features=self._feature_dim,
+                                 bias=True, device=self.device, dtype=self.dtype)
+         self.v_proj = nn.Linear(in_features=self._feature_dim, out_features=self._feature_dim,
+                                 bias=True, device=self.device, dtype=self.dtype)
+ 
+     def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+         q, k = self.q_proj(x), self.k_proj(x)
+         attn = torch.matmul(q, k.transpose(-2, -1))
+         attn = attn / (self._feature_dim ** 0.5)
+         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
+         return softmax(attn, dim=-1)
+ 
+     @override
+     def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+         if mask is not None:
+             mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
+         v = self.v_proj(x)
+         return torch.matmul(self._attention(x, mask), v)
+ ```
+ 
+ - ### Text binary classification with the predefined trainer
+ 
+ ```python
+ from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
+ from deeplotx.util import get_files, read_file
+ 
+ # Define the embedding strategy (bert-base-uncased is used as the embedding model by default)
+ long_text_encoder = LongTextEncoder(
+     max_length=2048,  # maximum text length; longer inputs are truncated
+     chunk_size=448,  # chunk size (in tokens)
+     overlapping=32  # overlap between chunks (in tokens)
+ )
+ 
+ trainer = TextBinaryClassifierTrainer(
+     long_text_encoder=long_text_encoder,
+     batch_size=2,
+     train_ratio=0.9  # train/validation split ratio
+ )
+ 
+ # Load the data
+ pos_data_path = 'path/to/pos_dir'
+ neg_data_path = 'path/to/neg_dir'
+ pos_data = [read_file(x) for x in get_files(pos_data_path)]
+ neg_data = [read_file(x) for x in get_files(neg_data_path)]
+ 
+ # Start training
+ model = trainer.train(pos_data, neg_data,
+                       num_epochs=36, learning_rate=2e-5,  # number of epochs and learning rate
+                       balancing_dataset=True,  # whether to balance the dataset
+                       alpha=1e-4, rho=.2,  # elastic net regularization hyperparameters alpha and rho
+                       hidden_dim=256, recursive_layers=2)  # recurrent network architecture
+ 
+ # Save the model weights
+ model.save(model_name='test_model', model_dir='model')
+ 
+ # Load the saved model
+ model = model.load(model_name='test_model', model_dir='model')
+ 
+ # Predict with the trained model
+ model.predict(long_text_encoder.encode('This is a test text.', flatten=False))
+ ```
deeplotx-0.5.1.dist-info/RECORD → deeplotx-0.5.5.dist-info/RECORD RENAMED
@@ -1,7 +1,7 @@
- deeplotx/__init__.py,sha256=3rVjGSRdcxpxZzHIQohT8dheB5mVdeXIrBkfH2yorcQ,1091
- deeplotx/encoder/__init__.py,sha256=EM-xrTsHoGaiiFpj-iFAxilMHXC_sQKWYrcq1qCnI3U,138
- deeplotx/encoder/bert_encoder.py,sha256=uLqGcXH6AGY6CcjjbYbh09VWYqSpsg-y-jHYB6Fmp3w,2377
- deeplotx/encoder/long_text_encoder.py,sha256=hl_O8kR9o1kcII9YfSx2rf_Pk0l_Rv7LNbsS9UsTU0c,3373
+ deeplotx/__init__.py,sha256=6El66QXHDrgNMsNIG9bG97WO8BhPK5btXbTikzx2ce4,1087
+ deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
+ deeplotx/encoder/encoder.py,sha256=p1e4Dx3-Ghdl0MGNalr0D_OnafwaJnbhscEDVq-y73A,2400
+ deeplotx/encoder/long_text_encoder.py,sha256=GatkOF1QQHLtvyuikfCP4xpzfDvszJJyonaS9f7wSxg,3401
  deeplotx/encoder/longformer_encoder.py,sha256=A8FXqd4mdHxSn_o_R689XtpT73ISDT788EgMQRGLC2g,1822
  deeplotx/nn/__init__.py,sha256=CS0UwyYKa8wI6vu6FBIYxvm-HAmw39MTMFlZDtqi6UA,444
  deeplotx/nn/auto_regression.py,sha256=7P63opWCWMqE2DigwbsL6kfXtFtJPz00Yo1RqflBz4A,572
@@ -19,12 +19,12 @@ deeplotx/similarity/set.py,sha256=zhGFxtSIXlWqvipBYzoiPahp4g0boAIoUiMfG0wl07A,68
  deeplotx/similarity/vector.py,sha256=WVbDHqykt-fvuILVrhUCtIFAOEjY_zvttrXGM9eylG0,1125
  deeplotx/trainer/__init__.py,sha256=Fl5DR9UecQc5VtBcczU9sx_HtPNoFohpuELOh-Jrsks,77
  deeplotx/trainer/base_trainer.py,sha256=z0MeAT-rRYmjeBXt0ckt7J1itYArR0Cx02wHesXUoZE,385
- deeplotx/trainer/text_binary_classification_trainer.py,sha256=Ktdk4rCNHgTFdXVFmbTnvIlGIJi1gphGRkuRgL2bVOo,4793
+ deeplotx/trainer/text_binary_classification_trainer.py,sha256=BNBQdpaD8nB1dQv8naHNIravNcQC8JjOMqD-WRSrUH0,4931
  deeplotx/util/__init__.py,sha256=JxqAK_WOOHcYVSTHBT1-WuBwWrPEVDTV3titeVWvNUM,74
- deeplotx/util/hash.py,sha256=wwsC6kOQvbpuvwKsNQOARd78_wePmW9i3oaUuXRUnpc,352
+ deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
  deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
- deeplotx-0.5.1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
- deeplotx-0.5.1.dist-info/METADATA,sha256=LatUJZ1YzKrlPMDNI2UiOqSf5h9mP57kf4f5ngnfa8Q,6954
- deeplotx-0.5.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- deeplotx-0.5.1.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
- deeplotx-0.5.1.dist-info/RECORD,,
+ deeplotx-0.5.5.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+ deeplotx-0.5.5.dist-info/METADATA,sha256=QE1R1jodTrnPFY7cbu4mQNPt8_BgKNJuHoSDswopueo,10880
+ deeplotx-0.5.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ deeplotx-0.5.5.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+ deeplotx-0.5.5.dist-info/RECORD,,