deeplotx 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
deeplotx/encoder/long_text_encoder.py CHANGED
@@ -13,9 +13,9 @@ logger = logging.getLogger('deeplotx.embedding')
 
 
 class LongTextEncoder(Encoder):
-    def __init__(self, max_length: int, chunk_size: int = 448,
-                 overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
-                 cache_capacity: int = 64, max_workers: int = 8, device: str | None = None):
+    def __init__(self, chunk_size: int = 448, overlapping: int = 32, max_length: int = -1,
+                 model_name_or_path: str = DEFAULT_BERT, cache_capacity: int = 64,
+                 max_workers: int = 8, device: str | None = None):
         super().__init__(model_name_or_path=model_name_or_path, device=device)
         assert overlapping < chunk_size, f'overlapping ({overlapping}) must be less than chunk size ({chunk_size}).'
         self._max_length = max_length
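With this change `max_length` becomes optional and defaults to `-1`, which the encoder treats as "no truncation" (see the next hunk). A minimal usage sketch, not taken from the package's docs, showing the two call styles under the new signature:

```python
from deeplotx import LongTextEncoder

# Sketch based on the 0.8.3 constructor shown above.
variable_length_encoder = LongTextEncoder(chunk_size=448, overlapping=32)            # no truncation
capped_encoder = LongTextEncoder(chunk_size=448, overlapping=32, max_length=2048)    # truncate at 2048 tokens
```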
@@ -41,23 +41,28 @@ class LongTextEncoder(Encoder):
             _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
             return _fin_emb_tensor.squeeze()
 
+        _tmp_max_length = self._max_length
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
         _text_hash = sha512(text)
         if _text_hash in self._cache:
             return postprocess(self._cache[_text_hash], flatten)
-        _text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
+        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        # variable length
+        if _tmp_max_length < 0:
+            _tmp_max_length = len(_text_to_input_ids)
+        _text_to_input_ids = _text_to_input_ids[:_tmp_max_length]
         _text_to_input_ids_att_mask = []
         # padding
         pad_token = self.tokenizer.pad_token_type_id
-        if len(_text_to_input_ids) < self._max_length:
-            _text_to_input_ids.extend([pad_token] * (self._max_length - len(_text_to_input_ids)))
+        if len(_text_to_input_ids) < _tmp_max_length:
+            _text_to_input_ids.extend([pad_token] * (_tmp_max_length - len(_text_to_input_ids)))
         pads = _text_to_input_ids.count(pad_token)
-        non_pads = self._max_length - pads
+        non_pads = _tmp_max_length - pads
         _text_to_input_ids_att_mask.extend([1] * non_pads)
         _text_to_input_ids_att_mask.extend([0] * pads)
-        num_chunks = math.ceil(self._max_length / self._chunk_size)
+        num_chunks = math.ceil(_tmp_max_length / self._chunk_size)
         # split chunks
         chunks = []
         for i in range(num_chunks):
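The arithmetic behind the new variable-length branch is easy to check in isolation. The helper below is illustrative only (it is not deeplotx code); it mirrors the truncate → pad → chunk-count steps of the hunk above, with a negative `max_length` meaning "use the full tokenized length".

```python
import math

def pad_and_chunk_count(input_ids: list[int], pad_token: int,
                        max_length: int, chunk_size: int) -> tuple[list[int], int]:
    # Negative max_length -> variable length: keep every token.
    if max_length < 0:
        max_length = len(input_ids)
    input_ids = input_ids[:max_length]
    # Right-pad short inputs so the attention mask and chunks line up.
    if len(input_ids) < max_length:
        input_ids = input_ids + [pad_token] * (max_length - len(input_ids))
    # Number of chunks the encoder will process.
    return input_ids, math.ceil(max_length / chunk_size)

ids, n_chunks = pad_and_chunk_count(list(range(1000)), pad_token=0,
                                    max_length=-1, chunk_size=448)
print(len(ids), n_chunks)  # 1000 3
```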
deeplotx/encoder/longformer_encoder.py CHANGED
@@ -3,7 +3,8 @@ import os
 
 import torch
 from torch import nn
-from transformers import LongformerTokenizer, LongformerModel
+from transformers import AutoModel, AutoTokenizer
+from requests.exceptions import ConnectTimeout, SSLError
 
 from deeplotx import __ROOT__
 
@@ -17,18 +18,35 @@ class LongformerEncoder(nn.Module):
         super().__init__()
         self.device = torch.device(device) if device is not None \
             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                             cache_dir=CACHE_PATH, _from_auto=True)
-        self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                    cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True).to(self.device)
+        except ConnectTimeout:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
+        except SSLError:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
 
     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        ori_mode = self.bert.training
-        self.bert.eval()
+        ori_mode = self.encoder.training
+        self.encoder.eval()
         with torch.no_grad():
-            res = self.bert.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
-        self.bert.train(mode=ori_mode)
+            res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+        self.encoder.train(mode=ori_mode)
         return res
 
     def encode(self, text: str) -> torch.Tensor:
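The two `except` branches above are identical, so the same fallback can be expressed more compactly. The following is a sketch of the idea, not the package's code, using only standard `transformers` loading arguments:

```python
from requests.exceptions import ConnectTimeout, SSLError
from transformers import AutoModel, AutoTokenizer

def load_encoder(model_name_or_path: str, cache_dir: str):
    # Try the model hub first; if the network is unreachable (timeout or TLS
    # failure), fall back to whatever is already in the local cache.
    kwargs = dict(cache_dir=cache_dir, trust_remote_code=True)
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **kwargs)
        model = AutoModel.from_pretrained(model_name_or_path, **kwargs)
    except (ConnectTimeout, SSLError):
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, local_files_only=True, **kwargs)
        model = AutoModel.from_pretrained(model_name_or_path, local_files_only=True, **kwargs)
    return tokenizer, model
```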
deeplotx/nn/attention.py CHANGED
@@ -36,7 +36,7 @@ class Attention(BaseNeuralNetwork):
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
     def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
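A quick check of what the added `dtype` argument does: `torch.softmax` casts its input to the requested dtype before normalizing, so attention scores produced in a lower-precision dtype are normalized in the module's working dtype. The values below are illustrative only:

```python
import torch

scores = torch.randn(2, 4, 4, dtype=torch.float16)           # e.g. half-precision attention scores
probs = torch.softmax(scores, dim=-1, dtype=torch.float32)   # cast, then normalize
print(probs.dtype)          # torch.float32
print(probs.sum(dim=-1))    # each row sums to 1
```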
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.1
+Version: 0.8.3
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -48,24 +48,23 @@ Dynamic: license-file
 
 - ### Long Text Embedding
 
-  - **Long text embedding based on a general-purpose BERT** (maximum supported length: unlimited, set via max_length)
+  - **Long text embedding based on a general-purpose BERT** (maximum supported length: unlimited, optionally capped via max_length)
 
     ```python
     from deeplotx import LongTextEncoder
 
-    # Maximum text length of 2048 tokens, chunk size of 512 tokens, 64 tokens of overlap between chunks.
+    # Chunk size of 448 tokens, 32 tokens of overlap between chunks.
     encoder = LongTextEncoder(
-        max_length=2048,
-        chunk_size=512,
-        overlapping=64
+        chunk_size=448,
+        overlapping=32
     )
-    # Compute the embedding of "我是吴子豪, 这是一个测试文本." and flatten it.
-    encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+    # Compute the embedding of "我是吴子豪, 这是一个测试文本." and keep the stacked (non-flattened) form.
+    encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
     ```
 
    Output:
    ```
-   tensor([ 0.5163, 0.2497, 0.5896, ..., -0.9815, -0.3095, 0.4232])
+   tensor([ 2.2316e-01, 2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
    ```
 
 - **Long text embedding based on Longformer** (maximum supported length: 4096 tokens)
@@ -77,6 +76,11 @@ Dynamic: license-file
    encoder.encode('我是吴子豪, 这是一个测试文本.')
    ```
 
+   Output:
+   ```
+   tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+   ```
+
 - ### Similarity Calculation
 
   - **Vector-based similarity**
@@ -163,14 +167,16 @@ Dynamic: license-file
 
   ```python
   from deeplotx import (
-      BaseNeuralNetwork,  # base class for deep neural networks
       FeedForward,  # feed-forward network
       LinearRegression,  # linear regression
       LogisticRegression,  # logistic regression / binary classification / multi-label classification
       SoftmaxRegression,  # softmax regression / multi-class classification
       RecursiveSequential,  # sequence model / recurrent neural network
       LongContextRecursiveSequential,  # long-context sequence model / RNN fused with self-attention
-      SelfAttention,  # self-attention module
+      RoPE,  # RoPE positional encoding
+      Attention,  # self-attention / cross-attention
+      MultiHeadAttention,  # parallel multi-head attention
+      RoFormerEncoder,  # RoFormer (Transformer + RoPE) encoder model
       AutoRegression,  # autoregressive model / recurrent neural network
       LongContextAutoRegression  # long-context autoregressive model / RNN fused with self-attention
   )
@@ -193,13 +199,13 @@ Dynamic: license-file
                  device: str | None = None, dtype: torch.dtype | None = None):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
         self._dropout_rate = dropout_rate
-        self.fc1 = nn.Linear(feature_dim, int(feature_dim * expansion_factor), bias=bias,
-                             device=self.device, dtype=self.dtype)
-        self.fc2 = nn.Linear(int(feature_dim * expansion_factor), feature_dim, bias=bias,
-                             device=self.device, dtype=self.dtype)
-        self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3,
+        self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+                                 bias=bias, device=self.device, dtype=self.dtype)
+        self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+                                   bias=bias, device=self.device, dtype=self.dtype)
+        self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
                                         device=self.device, dtype=self.dtype)
-        self.layer_norm = nn.LayerNorm(normalized_shape=self.fc1.in_features, eps=1e-9,
+        self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
                                        device=self.device, dtype=self.dtype)
 
     @override
@@ -207,11 +213,11 @@ Dynamic: license-file
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
         residual = x
         x = self.layer_norm(x)
-        x = self.fc1(x)
-        x = self.parametric_relu_1(x)
+        x = self.up_proj(x)
+        x = self.parametric_relu(x)
         if self._dropout_rate > .0:
             x = torch.dropout(x, p=self._dropout_rate, train=self.training)
-        return self.fc2(x) + residual
+        return self.down_proj(x) + residual
 
 
 class FeedForward(BaseNeuralNetwork):
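Read together, the renamed pieces form a standard pre-norm residual feed-forward block. The condensed sketch below is for illustration only (it drops the deeplotx base class and the device/dtype plumbing) and follows the same order of operations as the diff: LayerNorm, up-projection, PReLU, dropout, down-projection, residual add.

```python
import torch
from torch import nn

class PreNormFFN(nn.Module):
    # Illustrative stand-alone version of the renamed block; not deeplotx code.
    def __init__(self, feature_dim: int, expansion_factor: float = 2.0, dropout_rate: float = 0.02):
        super().__init__()
        self.layer_norm = nn.LayerNorm(feature_dim, eps=1e-9)
        self.up_proj = nn.Linear(feature_dim, int(feature_dim * expansion_factor))
        self.down_proj = nn.Linear(int(feature_dim * expansion_factor), feature_dim)
        self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3)
        self._dropout_rate = dropout_rate

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.parametric_relu(self.up_proj(self.layer_norm(x)))
        x = torch.dropout(x, p=self._dropout_rate, train=self.training)
        return self.down_proj(x) + residual
```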
@@ -224,7 +230,7 @@ Dynamic: license-file
         self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
                                                          expansion_factor=expansion_factor, bias=bias,
                                                          dropout_rate=dropout_rate,
-                                                         device=self.device, dtype=self.dtype)] * num_layers)
+                                                         device=self.device, dtype=self.dtype) for _ in range(num_layers)])
 
     @override
     def forward(self, x: torch.Tensor) -> torch.Tensor:
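This small-looking change fixes a real bug: multiplying a one-element list repeats the same module object, so every "layer" shares one set of weights, whereas the list comprehension builds independent units. A minimal demonstration with plain `nn.Linear` (illustrative, not deeplotx code):

```python
import torch
from torch import nn

shared = nn.ModuleList([nn.Linear(4, 4)] * 3)                      # one Linear, referenced 3 times
independent = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])   # three distinct Linears

print(shared[0].weight is shared[1].weight)            # True  -> parameters are shared
print(independent[0].weight is independent[1].weight)  # False -> each layer trains its own weights
print(sum(p.numel() for p in shared.parameters()),
      sum(p.numel() for p in independent.parameters()))  # 20 vs 60
```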
@@ -234,7 +240,7 @@ Dynamic: license-file
         return x
   ```
 
-  Self-attention module:
+  Attention module:
 
   ```python
   from typing_extensions import override
@@ -243,14 +249,17 @@ Dynamic: license-file
 
 from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 from deeplotx.nn.feed_forward import FeedForward
+from deeplotx.nn.rope import RoPE, DEFAULT_THETA
 
 
-class SelfAttention(BaseNeuralNetwork):
-    def __init__(self, feature_dim: int, bias: bool = True, proj_layers: int = 1,
-                 proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+class Attention(BaseNeuralNetwork):
+    def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+                 proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+                 **kwargs):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
                          device=device, dtype=dtype)
+        self._positional = positional
         self._feature_dim = feature_dim
         self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
@@ -261,21 +270,27 @@ Dynamic: license-file
         self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
                                   bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+        if self._positional:
+            self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+                             device=self.device, dtype=self.dtype)
 
-    def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
-        q, k = self.q_proj(x), self.k_proj(x)
+    def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        q, k = self.q_proj(x), self.k_proj(y)
+        if self._positional:
+            q, k = self.rope(q), self.rope(k)
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
-    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+        y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
         if mask is not None:
             mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
-        v = self.v_proj(x)
-        return torch.matmul(self._attention(x, mask), v)
+        v = self.v_proj(y)
+        return torch.matmul(self._attention(x, y, mask), v)
   ```
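With the new `forward(x, y=None, mask=None)` signature the same module covers both self- and cross-attention: when `y` is omitted it falls back to `x`. A usage sketch follows; the batch, sequence and feature sizes are illustrative, not taken from the diff.

```python
import torch
from deeplotx import Attention

attn = Attention(feature_dim=64, positional=True)   # RoPE is applied to q and k

x = torch.randn(2, 16, 64)   # queries come from x
y = torch.randn(2, 24, 64)   # keys/values come from y

self_out = attn(x)        # self-attention: y defaults to x -> shape (2, 16, 64)
cross_out = attn(x, y)    # cross-attention over y          -> shape (2, 16, 64)
```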
 
 - ### Text Binary Classification with a Predefined Trainer
@@ -284,7 +299,7 @@ Dynamic: license-file
 from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
 from deeplotx.util import get_files, read_file
 
-# Define the text-encoding strategy (bert-base-uncased is the default embedding model)
+# Define the text-encoding strategy (FacebookAI/xlm-roberta-base is the default embedding model)
 long_text_encoder = LongTextEncoder(
     max_length=2048,  # maximum text length; longer inputs are truncated
     chunk_size=448,  # chunk size (in tokens)
@@ -306,10 +321,11 @@ Dynamic: license-file
 
 # Start training
 model = trainer.train(pos_data, neg_data,
-                      num_epochs=36, learning_rate=2e-5,  # number of epochs and learning rate
-                      balancing_dataset=True,  # whether to balance the dataset
-                      alpha=1e-4, rho=.2,  # hyperparameters alpha and rho for elastic-net regularization
-                      hidden_dim=256, recursive_layers=2)  # structure of the recurrent network
+                      num_epochs=36, learning_rate=2e-5,
+                      balancing_dataset=True, alpha=1e-4,
+                      rho=.2, encoder_layers=2,  # 2 RoFormer encoder layers
+                      attn_heads=8,  # 8 attention heads
+                      recursive_layers=2)  # 2 Bi-LSTM layers
 
 # Save model weights
 model.save(model_name='test_model', model_dir='model')
@@ -1,10 +1,10 @@
 deeplotx/__init__.py,sha256=oNeA-vNu5YGiEQg0IcpKEdGh_Y_2uPvo2nqaNL_Zgv8,1159
 deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
 deeplotx/encoder/encoder.py,sha256=oSBdA-MiwMKNfTFJWR-RdvNS0G0qfX-Qchwy4LuwB00,3985
-deeplotx/encoder/long_text_encoder.py,sha256=PFR6jjGyg1N58TQlKsPaNQEd-EDl13Hyhu7A1KtGBbA,3743
-deeplotx/encoder/longformer_encoder.py,sha256=A8FXqd4mdHxSn_o_R689XtpT73ISDT788EgMQRGLC2g,1822
+deeplotx/encoder/long_text_encoder.py,sha256=3ScdKDi65J5tdO8PFCXBjCzNUCLlJRwVhpDR0BrphG4,3951
+deeplotx/encoder/longformer_encoder.py,sha256=NNYLr5I9tdeh0C8Ir7QcbEMU9gDk6U7CiF3Tbg6NEsE,3372
 deeplotx/nn/__init__.py,sha256=01I_yqx9GTa4wy3uNyAqhtxp66tDqxgMLC4Ky5Vnkrg,651
-deeplotx/nn/attention.py,sha256=HZ4nfFtkk7AnJ9nuoDSK6zIlIhZ_hbpZc3o6JQIBqJ8,2861
+deeplotx/nn/attention.py,sha256=R-i-Rd7gnsh6hwXDeYfqLQOJvfSZIGfQbFzRlC91XLo,2879
 deeplotx/nn/auto_regression.py,sha256=uISx29t_zkDGS8s2wvGB6wOGYZitQ4hQ7wyoQl4lcqY,857
 deeplotx/nn/base_neural_network.py,sha256=FjQEDFH810fJS7JV3aLgJZnaMqC6DH--wlBvuj-ghTc,5900
 deeplotx/nn/feed_forward.py,sha256=4ozj7EDalO9pb6JUhZtsJqE0r8bIHFApHRt2zTrl4ho,2931
@@ -27,8 +27,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=QMLR4cC8NCUP-v7SOY
 deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
 deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
 deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
-deeplotx-0.8.1.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
-deeplotx-0.8.1.dist-info/METADATA,sha256=zMKRLmdsEibLnN_hAx3OM7AbX3SiM7X1-8w4eFJGxNY,12251
-deeplotx-0.8.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-deeplotx-0.8.1.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
-deeplotx-0.8.1.dist-info/RECORD,,
+deeplotx-0.8.3.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.8.3.dist-info/METADATA,sha256=Lif2B7wUDIQQKWvUt_Vl_XYPlMf_EhskiQcq8ZYv6TQ,13079
+deeplotx-0.8.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deeplotx-0.8.3.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.8.3.dist-info/RECORD,,