deeplotx 0.8.2.tar.gz → 0.8.5.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. {deeplotx-0.8.2 → deeplotx-0.8.5}/PKG-INFO +54 -37
  2. {deeplotx-0.8.2 → deeplotx-0.8.5}/README.md +53 -36
  3. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/__init__.py +1 -0
  4. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/encoder/long_text_encoder.py +13 -8
  5. deeplotx-0.8.5/deeplotx/encoder/longformer_encoder.py +55 -0
  6. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/__init__.py +1 -0
  7. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/auto_regression.py +2 -2
  8. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/linear_regression.py +5 -4
  9. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/logistic_regression.py +4 -4
  10. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/long_context_recursive_sequential.py +2 -3
  11. deeplotx-0.8.5/deeplotx/nn/multi_head_feed_forward.py +32 -0
  12. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/recursive_sequential.py +5 -5
  13. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/softmax_regression.py +4 -4
  14. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/trainer/text_binary_classification_trainer.py +6 -5
  15. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/PKG-INFO +54 -37
  16. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/SOURCES.txt +1 -0
  17. {deeplotx-0.8.2 → deeplotx-0.8.5}/pyproject.toml +17 -17
  18. deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py +0 -37
  19. {deeplotx-0.8.2 → deeplotx-0.8.5}/LICENSE +0 -0
  20. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/encoder/__init__.py +0 -0
  21. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/encoder/encoder.py +0 -0
  22. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/attention.py +0 -0
  23. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/base_neural_network.py +0 -0
  24. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/feed_forward.py +0 -0
  25. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/long_context_auto_regression.py +0 -0
  26. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/multi_head_attention.py +0 -0
  27. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/roformer_encoder.py +0 -0
  28. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/rope.py +0 -0
  29. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/__init__.py +0 -0
  30. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/distribution.py +0 -0
  31. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/set.py +0 -0
  32. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/vector.py +0 -0
  33. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/trainer/__init__.py +0 -0
  34. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/trainer/base_trainer.py +0 -0
  35. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/util/__init__.py +0 -0
  36. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/util/hash.py +0 -0
  37. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/util/read_file.py +0 -0
  38. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/dependency_links.txt +0 -0
  39. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/requires.txt +0 -0
  40. {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/top_level.txt +0 -0
  41. {deeplotx-0.8.2 → deeplotx-0.8.5}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deeplotx
- Version: 0.8.2
+ Version: 0.8.5
  Summary: Easy-2-use long text NLP toolkit.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -48,24 +48,23 @@ Dynamic: license-file

  - ### Long Text Embedding

-   - **Long text embedding with a general-purpose BERT** (maximum supported length: unlimited, set via max_length)
+   - **Long text embedding with a general-purpose BERT** (maximum supported length: unlimited; optionally capped via max_length)

  ```python
  from deeplotx import LongTextEncoder

- # Maximum text length of 2048 tokens, chunk size of 512 tokens, 64-token overlap between chunks.
+ # Chunk size of 448 tokens, 32-token overlap between chunks.
  encoder = LongTextEncoder(
-     max_length=2048,
-     chunk_size=512,
-     overlapping=64
+     chunk_size=448,
+     overlapping=32
  )
- # Embed "我是吴子豪, 这是一个测试文本." and flatten the result.
- encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+ # Embed "我是吴子豪, 这是一个测试文本." and keep the per-chunk embeddings stacked.
+ encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
  ```

  Output:
  ```
- tensor([ 0.5163, 0.2497, 0.5896, ..., -0.9815, -0.3095, 0.4232])
+ tensor([ 2.2316e-01, 2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
  ```

  - **Long text embedding with Longformer** (maximum supported length: 4096 tokens)
@@ -77,6 +76,11 @@ Dynamic: license-file
  encoder.encode('我是吴子豪, 这是一个测试文本.')
  ```

+ Output:
+ ```
+ tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+ ```
+
  - ### Similarity Computation

  - **Vector-based similarity**
@@ -163,14 +167,17 @@ Dynamic: license-file

  ```python
  from deeplotx import (
-     BaseNeuralNetwork,  # base class for deep neural networks
      FeedForward,  # feed-forward network
+     MultiHeadFeedForward,  # multi-head feed-forward network
      LinearRegression,  # linear regression
      LogisticRegression,  # logistic regression / binary or multi-label classification
      SoftmaxRegression,  # softmax regression / multi-class classification
      RecursiveSequential,  # sequence model / recurrent neural network
      LongContextRecursiveSequential,  # long-context sequence model / RNN fused with self-attention
-     SelfAttention,  # self-attention module
+     RoPE,  # RoPE positional encoding
+     Attention,  # self-attention / cross-attention
+     MultiHeadAttention,  # parallel multi-head attention
+     RoFormerEncoder,  # RoFormer (Transformer + RoPE) encoder model
      AutoRegression,  # autoregressive model / recurrent neural network
      LongContextAutoRegression  # long-context autoregressive model / RNN fused with self-attention
  )
@@ -193,13 +200,13 @@ Dynamic: license-file
                   device: str | None = None, dtype: torch.dtype | None = None):
          super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
          self._dropout_rate = dropout_rate
-         self.fc1 = nn.Linear(feature_dim, int(feature_dim * expansion_factor), bias=bias,
-                              device=self.device, dtype=self.dtype)
-         self.fc2 = nn.Linear(int(feature_dim * expansion_factor), feature_dim, bias=bias,
-                              device=self.device, dtype=self.dtype)
-         self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3,
+         self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+                                  bias=bias, device=self.device, dtype=self.dtype)
+         self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+                                    bias=bias, device=self.device, dtype=self.dtype)
+         self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
                                          device=self.device, dtype=self.dtype)
-         self.layer_norm = nn.LayerNorm(normalized_shape=self.fc1.in_features, eps=1e-9,
+         self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
                                         device=self.device, dtype=self.dtype)

      @override
@@ -207,11 +214,11 @@ Dynamic: license-file
          x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
          residual = x
          x = self.layer_norm(x)
-         x = self.fc1(x)
-         x = self.parametric_relu_1(x)
+         x = self.up_proj(x)
+         x = self.parametric_relu(x)
          if self._dropout_rate > .0:
              x = torch.dropout(x, p=self._dropout_rate, train=self.training)
-         return self.fc2(x) + residual
+         return self.down_proj(x) + residual


  class FeedForward(BaseNeuralNetwork):
@@ -224,7 +231,7 @@ Dynamic: license-file
          self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
                                                           expansion_factor=expansion_factor, bias=bias,
                                                           dropout_rate=dropout_rate,
-                                                          device=self.device, dtype=self.dtype)] * num_layers)
+                                                          device=self.device, dtype=self.dtype) for _ in range(num_layers)])

      @override
      def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -234,7 +241,7 @@ Dynamic: license-file
          return x
  ```

- Self-attention module:
+ Attention module:

  ```python
  from typing_extensions import override
@@ -243,14 +250,17 @@ Dynamic: license-file

  from deeplotx.nn.base_neural_network import BaseNeuralNetwork
  from deeplotx.nn.feed_forward import FeedForward
+ from deeplotx.nn.rope import RoPE, DEFAULT_THETA


- class SelfAttention(BaseNeuralNetwork):
-     def __init__(self, feature_dim: int, bias: bool = True, proj_layers: int = 1,
-                  proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
-                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+ class Attention(BaseNeuralNetwork):
+     def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+                  proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+                  **kwargs):
          super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
                           device=device, dtype=dtype)
+         self._positional = positional
          self._feature_dim = feature_dim
          self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                    expansion_factor=proj_expansion_factor,
@@ -261,21 +271,27 @@ Dynamic: license-file
          self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                    expansion_factor=proj_expansion_factor,
                                    bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+         if self._positional:
+             self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+                              device=self.device, dtype=self.dtype)

-     def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
-         q, k = self.q_proj(x), self.k_proj(x)
+     def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+         q, k = self.q_proj(x), self.k_proj(y)
+         if self._positional:
+             q, k = self.rope(q), self.rope(k)
          attn = torch.matmul(q, k.transpose(-2, -1))
          attn = attn / (self._feature_dim ** 0.5)
          attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-         return torch.softmax(attn, dim=-1)
+         return torch.softmax(attn, dtype=self.dtype, dim=-1)

      @override
-     def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+     def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
          x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+         y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
          if mask is not None:
              mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
-         v = self.v_proj(x)
-         return torch.matmul(self._attention(x, mask), v)
+         v = self.v_proj(y)
+         return torch.matmul(self._attention(x, y, mask), v)
  ```

  - ### Text Binary Classification with the Predefined Trainer
@@ -284,7 +300,7 @@ Dynamic: license-file
  from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
  from deeplotx.util import get_files, read_file

- # Define the text-encoding strategy (bert-base-uncased is the default embedding model)
+ # Define the text-encoding strategy (FacebookAI/xlm-roberta-base is the default embedding model)
  long_text_encoder = LongTextEncoder(
      max_length=2048,  # maximum text length; longer inputs are truncated
      chunk_size=448,   # chunk size (in tokens)
@@ -306,10 +322,11 @@ Dynamic: license-file

  # Start training
  model = trainer.train(pos_data, neg_data,
-                       num_epochs=36, learning_rate=2e-5,  # number of epochs and learning rate
-                       balancing_dataset=True,  # whether to balance the dataset
-                       alpha=1e-4, rho=.2,  # elastic net regularization hyperparameters alpha and rho
-                       hidden_dim=256, recursive_layers=2)  # RNN architecture
+                       num_epochs=36, learning_rate=2e-5,
+                       balancing_dataset=True, alpha=1e-4,
+                       rho=.2, encoder_layers=2,  # 2 RoFormer encoders
+                       attn_heads=8,  # 8 attention heads
+                       recursive_layers=2)  # 2 Bi-LSTM layers

  # Save model weights
  model.save(model_name='test_model', model_dir='model')
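
The `Attention` block in the documentation above replaces the old `SelfAttention`: calling `forward` with only `x` keeps the previous self-attention behaviour, while passing a second tensor `y` switches to cross-attention, with RoPE applied to queries and keys when `positional=True`. A minimal sketch; the shapes are purely illustrative and default device/dtype selection is assumed:

```python
import torch
from deeplotx import Attention

attn = Attention(feature_dim=256, positional=True)

x = torch.randn(1, 16, 256)   # query sequence
y = torch.randn(1, 24, 256)   # key/value sequence from another source

self_out = attn(x)            # y is None -> self-attention over x
cross_out = attn(x, y)        # cross-attention: queries from x, keys/values from y
print(self_out.shape, cross_out.shape)  # both (1, 16, 256)
```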
README.md

(The README.md changes are identical to the README portion of the PKG-INFO diff above; the same edits appear at hunks @@ -30,24 +30,23 @@, @@ -59,6 +58,11 @@, @@ -145,14 +149,17 @@, @@ -175,13 +182,13 @@, @@ -189,11 +196,11 @@, @@ -206,7 +213,7 @@, @@ -216,7 +223,7 @@, @@ -225,14 +232,17 @@, @@ -243,21 +253,27 @@, @@ -266,7 +282,7 @@ and @@ -288,10 +304,11 @@.)
deeplotx/__init__.py

@@ -6,6 +6,7 @@ __ROOT__ = os.path.dirname(os.path.abspath(__file__))
  from .encoder import Encoder, LongTextEncoder, LongformerEncoder
  from .nn import (
      FeedForward,
+     MultiHeadFeedForward,
      LinearRegression,
      LogisticRegression,
      SoftmaxRegression,
deeplotx/encoder/long_text_encoder.py

@@ -13,9 +13,9 @@ logger = logging.getLogger('deeplotx.embedding')


  class LongTextEncoder(Encoder):
-     def __init__(self, max_length: int, chunk_size: int = 448,
-                  overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
-                  cache_capacity: int = 64, max_workers: int = 8, device: str | None = None):
+     def __init__(self, chunk_size: int = 448, overlapping: int = 32, max_length: int = -1,
+                  model_name_or_path: str = DEFAULT_BERT, cache_capacity: int = 64,
+                  max_workers: int = 8, device: str | None = None):
          super().__init__(model_name_or_path=model_name_or_path, device=device)
          assert overlapping < chunk_size, f'overlapping ({overlapping}) must be less than chunk size ({chunk_size}).'
          self._max_length = max_length
@@ -41,23 +41,28 @@ class LongTextEncoder(Encoder):
                  _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
              return _fin_emb_tensor.squeeze()

+         _tmp_max_length = self._max_length
          _text_to_show = text.replace("\n", str())
          logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
          # read cache
          _text_hash = sha512(text)
          if _text_hash in self._cache:
              return postprocess(self._cache[_text_hash], flatten)
-         _text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
+         _text_to_input_ids = self.tokenizer.encode(text.strip())
+         # variable length
+         if _tmp_max_length < 0:
+             _tmp_max_length = len(_text_to_input_ids)
+         _text_to_input_ids = _text_to_input_ids[:_tmp_max_length]
          _text_to_input_ids_att_mask = []
          # padding
          pad_token = self.tokenizer.pad_token_type_id
-         if len(_text_to_input_ids) < self._max_length:
-             _text_to_input_ids.extend([pad_token] * (self._max_length - len(_text_to_input_ids)))
+         if len(_text_to_input_ids) < _tmp_max_length:
+             _text_to_input_ids.extend([pad_token] * (_tmp_max_length - len(_text_to_input_ids)))
          pads = _text_to_input_ids.count(pad_token)
-         non_pads = self._max_length - pads
+         non_pads = _tmp_max_length - pads
          _text_to_input_ids_att_mask.extend([1] * non_pads)
          _text_to_input_ids_att_mask.extend([0] * pads)
-         num_chunks = math.ceil(self._max_length / self._chunk_size)
+         num_chunks = math.ceil(_tmp_max_length / self._chunk_size)
          # split chunks
          chunks = []
          for i in range(num_chunks):
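
With this change `max_length` defaults to `-1`, in which case the tokenized length of each input becomes the effective maximum, so nothing is truncated or padded beyond the text itself. A minimal usage sketch mirroring the updated README (the default checkpoint is downloaded on first use):

```python
from deeplotx import LongTextEncoder

# No max_length: variable-length input, bounded only by the chunking scheme.
encoder = LongTextEncoder(chunk_size=448, overlapping=32)
# flatten=False keeps one embedding per chunk, stacked into a single tensor.
emb = encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
print(emb.shape)
```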
deeplotx-0.8.5/deeplotx/encoder/longformer_encoder.py

@@ -0,0 +1,55 @@
+ import logging
+ import os
+
+ import torch
+ from torch import nn
+ from transformers import AutoModel, AutoTokenizer
+ from requests.exceptions import ConnectTimeout, SSLError
+
+ from deeplotx import __ROOT__
+
+ CACHE_PATH = os.path.join(__ROOT__, '.cache')
+ DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+ logger = logging.getLogger('deeplotx.embedding')
+
+
+ class LongformerEncoder(nn.Module):
+     def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
+         super().__init__()
+         self.device = torch.device(device) if device is not None \
+             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+         try:
+             self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                            trust_remote_code=True)
+             self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                      cache_dir=CACHE_PATH, _from_auto=True,
+                                                      trust_remote_code=True).to(self.device)
+         except ConnectTimeout:
+             self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                            trust_remote_code=True, local_files_only=True)
+             self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                      cache_dir=CACHE_PATH, _from_auto=True,
+                                                      trust_remote_code=True, local_files_only=True).to(self.device)
+         except SSLError:
+             self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                            cache_dir=CACHE_PATH, _from_auto=True,
+                                                            trust_remote_code=True, local_files_only=True)
+             self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                      cache_dir=CACHE_PATH, _from_auto=True,
+                                                      trust_remote_code=True, local_files_only=True).to(self.device)
+         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
+
+     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+         ori_mode = self.encoder.training
+         self.encoder.eval()
+         with torch.no_grad():
+             res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+         self.encoder.train(mode=ori_mode)
+         return res
+
+     def encode(self, text: str) -> torch.Tensor:
+         _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
+         _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
+         return self.forward(_input_ids, _att_mask).squeeze()
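
The rewritten encoder swaps the Longformer-specific classes for `AutoModel`/`AutoTokenizer` and falls back to locally cached files when the Hugging Face Hub is unreachable (`ConnectTimeout`/`SSLError`). A usage sketch, assuming the default `allenai/longformer-base-4096` checkpoint is available locally or downloadable:

```python
from deeplotx import LongformerEncoder

encoder = LongformerEncoder()          # or LongformerEncoder(device='cuda')
vec = encoder.encode('我是吴子豪, 这是一个测试文本.')
print(vec.shape)                       # 1D CLS embedding, 768 dims for the base model
```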
deeplotx/nn/__init__.py

@@ -1,5 +1,6 @@
  from .base_neural_network import BaseNeuralNetwork
  from .feed_forward import FeedForward
+ from .multi_head_feed_forward import MultiHeadFeedForward
  from .linear_regression import LinearRegression
  from .logistic_regression import LogisticRegression
  from .softmax_regression import SoftmaxRegression
deeplotx/nn/auto_regression.py

@@ -7,8 +7,8 @@ class AutoRegression(RecursiveSequential):
      def __init__(self, feature_dim: int, bias: bool = True,
                   recursive_layers: int = 1, recursive_hidden_dim: int | None = None,
                   ffn_layers: int = 1, ffn_expansion_factor: int | float = 2, dropout_rate: float = 0.05,
-                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None, **kwargs):
          super().__init__(input_dim=feature_dim, output_dim=feature_dim, bias=bias,
                           recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
                           ffn_layers=ffn_layers, ffn_expansion_factor=ffn_expansion_factor,
-                          dropout_rate=dropout_rate, model_name=model_name, device=device, dtype=dtype)
+                          dropout_rate=dropout_rate, model_name=model_name, device=device, dtype=dtype, **kwargs)
deeplotx/nn/linear_regression.py

@@ -4,16 +4,17 @@ import torch
  from torch import nn

  from deeplotx.nn.base_neural_network import BaseNeuralNetwork
- from deeplotx.nn.feed_forward import FeedForward
+ from deeplotx.nn.multi_head_feed_forward import MultiHeadFeedForward


  class LinearRegression(BaseNeuralNetwork):
-     def __init__(self, input_dim: int, output_dim: int, num_layers: int = 1,
+     def __init__(self, input_dim: int, output_dim: int, num_heads: int = 1, num_layers: int = 1,
                   expansion_factor: int | float = 1.5, bias: bool = True, dropout_rate: float = 0.1,
                   model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
          super().__init__(in_features=input_dim, out_features=output_dim, model_name=model_name, device=device, dtype=dtype)
-         self.ffn = FeedForward(feature_dim=input_dim, num_layers=num_layers, expansion_factor=expansion_factor,
-                                bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+         self.ffn = MultiHeadFeedForward(feature_dim=input_dim, num_heads=num_heads,
+                                         num_layers=num_layers, expansion_factor=expansion_factor,
+                                         bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
          self.proj = nn.Linear(in_features=input_dim, out_features=output_dim,
                                bias=bias, device=self.device, dtype=self.dtype)

deeplotx/nn/logistic_regression.py

@@ -6,10 +6,10 @@ from deeplotx.nn.linear_regression import LinearRegression


  class LogisticRegression(LinearRegression):
-     def __init__(self, input_dim: int, output_dim: int = 1, num_layers: int = 1, expansion_factor: int | float = 1.5,
-                  bias: bool = True, dropout_rate: float = 0.1, model_name: str | None = None,
-                  device: str | None = None, dtype: torch.dtype | None = None):
-         super().__init__(input_dim=input_dim, output_dim=output_dim, num_layers=num_layers,
+     def __init__(self, input_dim: int, output_dim: int = 1, num_heads: int = 1, num_layers: int = 1,
+                  expansion_factor: int | float = 1.5, bias: bool = True, dropout_rate: float = 0.1,
+                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+         super().__init__(input_dim=input_dim, output_dim=output_dim, num_heads=num_heads, num_layers=num_layers,
                           expansion_factor=expansion_factor, bias=bias, dropout_rate=dropout_rate,
                           model_name=model_name, device=device, dtype=dtype)

deeplotx/nn/long_context_recursive_sequential.py

@@ -12,12 +12,11 @@ class LongContextRecursiveSequential(RecursiveSequential):
      def __init__(self, input_dim: int, output_dim: int, bias: bool = True,
                   encoder_layers: int = 1, attn_heads: int = 1, recursive_layers: int = 2, recursive_hidden_dim: int | None = None,
                   ffn_layers: int = 1, ffn_expansion_factor: int | float = 2, dropout_rate: float = 0.05,
-                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
-                  **kwargs):
+                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None, **kwargs):
          super().__init__(input_dim=input_dim, output_dim=output_dim, bias=bias,
                           recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
                           ffn_layers=ffn_layers, ffn_expansion_factor=ffn_expansion_factor, dropout_rate=dropout_rate,
-                          model_name=model_name, device=device, dtype=dtype)
+                          model_name=model_name, device=device, dtype=dtype, **kwargs)
          self.roformer_encoders = nn.ModuleList([RoFormerEncoder(feature_dim=input_dim, attn_heads=attn_heads, bias=bias,
                                                                  ffn_layers=kwargs.get('encoder_ffn_layers', ffn_layers),
                                                                  ffn_expansion_factor=kwargs.get('encoder_expansion_factor', ffn_expansion_factor),
deeplotx-0.8.5/deeplotx/nn/multi_head_feed_forward.py

@@ -0,0 +1,32 @@
+ from typing_extensions import override
+
+ import torch
+ from torch import nn
+
+ from deeplotx.nn.base_neural_network import BaseNeuralNetwork
+ from deeplotx.nn.feed_forward import FeedForward
+
+
+ class MultiHeadFeedForward(BaseNeuralNetwork):
+     def __init__(self, feature_dim: int, num_heads: int = 1, num_layers: int = 1, expansion_factor: int | float = 2,
+                  bias: bool = True, dropout_rate: float = 0.05, model_name: str | None = None,
+                  device: str | None = None, dtype: torch.dtype | None = None):
+         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
+                          device=device, dtype=dtype)
+         self._num_heads = num_heads
+         self.expand_proj = nn.Linear(in_features=feature_dim, out_features=feature_dim * self._num_heads, bias=bias,
+                                      device=self.device, dtype=self.dtype)
+         self.ffn_heads = nn.ModuleList([FeedForward(feature_dim=feature_dim, num_layers=num_layers,
+                                                     expansion_factor=expansion_factor, bias=bias,
+                                                     dropout_rate=dropout_rate, device=self.device,
+                                                     dtype=self.dtype) for _ in range(self._num_heads)])
+         self.out_proj = nn.Linear(in_features=feature_dim * self._num_heads, out_features=feature_dim, bias=bias,
+                                   device=self.device, dtype=self.dtype)
+
+     @override
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+         x = self.expand_proj(x)
+         x_heads = x.split(self.in_features, dim=-1)
+         head_outs = [self.ffn_heads[_](x_heads[_]) for _ in range(self._num_heads)]
+         return self.out_proj(torch.concat(head_outs, dim=-1))
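
`MultiHeadFeedForward` is shape-preserving: `expand_proj` widens the input to `num_heads * feature_dim`, each head runs its own `FeedForward`, and `out_proj` maps the concatenated head outputs back to `feature_dim`. A small sketch with assumed dimensions:

```python
import torch
from deeplotx import MultiHeadFeedForward

ffn = MultiHeadFeedForward(feature_dim=256, num_heads=4, num_layers=2)
x = torch.randn(8, 256)
print(ffn(x).shape)   # torch.Size([8, 256]) -- same shape in, same shape out
```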
deeplotx/nn/recursive_sequential.py

@@ -4,14 +4,14 @@ import torch
  from torch import nn

  from deeplotx.nn.base_neural_network import BaseNeuralNetwork
- from deeplotx.nn.feed_forward import FeedForward
+ from deeplotx.nn.multi_head_feed_forward import MultiHeadFeedForward


  class RecursiveSequential(BaseNeuralNetwork):
      def __init__(self, input_dim: int, output_dim: int, bias: bool = True,
                   recursive_layers: int = 1, recursive_hidden_dim: int | None = None,
                   ffn_layers: int = 1, ffn_expansion_factor: int | float = 2, dropout_rate: float = 0.05,
-                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None, **kwargs):
          super().__init__(in_features=input_dim, out_features=output_dim, model_name=model_name,
                           device=device, dtype=dtype)
          if recursive_hidden_dim is None:
@@ -20,9 +20,9 @@ class RecursiveSequential(BaseNeuralNetwork):
                              num_layers=recursive_layers, batch_first=True,
                              bias=True, bidirectional=True, device=self.device,
                              dtype=self.dtype)
-         self.ffn = FeedForward(feature_dim=recursive_hidden_dim * 2, num_layers=ffn_layers,
-                                expansion_factor=ffn_expansion_factor, bias=bias, dropout_rate=dropout_rate,
-                                device=self.device, dtype=self.dtype)
+         self.ffn = MultiHeadFeedForward(feature_dim=recursive_hidden_dim * 2, num_heads=kwargs.get('ffn_heads', 1),
+                                         num_layers=ffn_layers, expansion_factor=ffn_expansion_factor,
+                                         bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
          self.__proj = nn.Linear(in_features=recursive_hidden_dim * 2, out_features=output_dim, bias=bias,
                                  device=self.device, dtype=self.dtype)

deeplotx/nn/softmax_regression.py

@@ -6,10 +6,10 @@ from deeplotx.nn.linear_regression import LinearRegression


  class SoftmaxRegression(LinearRegression):
-     def __init__(self, input_dim: int, output_dim: int, num_layers: int = 1, expansion_factor: int | float = 1.5,
-                  bias: bool = True, dropout_rate: float = 0.1, model_name: str | None = None,
-                  device: str | None = None, dtype: torch.dtype | None = None):
-         super().__init__(input_dim=input_dim, output_dim=output_dim, num_layers=num_layers,
+     def __init__(self, input_dim: int, output_dim: int, num_heads: int = 1, num_layers: int = 1,
+                  expansion_factor: int | float = 1.5, bias: bool = True, dropout_rate: float = 0.1,
+                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+         super().__init__(input_dim=input_dim, output_dim=output_dim, num_heads=num_heads, num_layers=num_layers,
                           expansion_factor=expansion_factor, bias=bias, dropout_rate=dropout_rate,
                           model_name=model_name, device=device, dtype=dtype)

deeplotx/trainer/text_binary_classification_trainer.py

@@ -49,6 +49,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
              logger.warning("The dimension of features doesn't match. A new model instance will be created.")
              self.model = None
          if self.model is None:
+             ffn_heads = kwargs.get('ffn_heads', 2)
              ffn_layers = kwargs.get('ffn_layers', 5)
              ffn_expansion_factor = kwargs.get('ffn_expansion_factor', 2)
              bias = kwargs.get('bias', True)
@@ -63,11 +64,11 @@ class TextBinaryClassifierTrainer(BaseTrainer):
              self.model = LongContextRecursiveSequential(input_dim=feature_dim, output_dim=1, bias=bias,
                                                          encoder_layers=encoder_layers, attn_heads=attn_heads,
                                                          recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
-                                                         ffn_layers=ffn_layers, ffn_expansion_factor=ffn_expansion_factor, dropout_rate=dropout_rate,
-                                                         encoder_ffn_layers=encoder_ffn_layers, encoder_expansion_factor=encoder_expansion_factor,
-                                                         encoder_dropout_rate=encoder_dropout_rate, attn_ffn_layers=attn_ffn_layers,
-                                                         attn_expansion_factor=attn_expansion_factor, attn_dropout_rate=attn_dropout_rate,
-                                                         theta=theta).initialize_weights()
+                                                         ffn_layers=ffn_layers, ffn_heads=ffn_heads, ffn_expansion_factor=ffn_expansion_factor,
+                                                         dropout_rate=dropout_rate, encoder_ffn_layers=encoder_ffn_layers,
+                                                         encoder_expansion_factor=encoder_expansion_factor, encoder_dropout_rate=encoder_dropout_rate,
+                                                         attn_ffn_layers=attn_ffn_layers, attn_expansion_factor=attn_expansion_factor,
+                                                         attn_dropout_rate=attn_dropout_rate, theta=theta).initialize_weights()
          logger.debug(f'Training Model: \n{self.model}')
          loss_function = nn.BCELoss()
          optimizer = optim.Adamax(self.model.parameters(), lr=learning_rate)
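
The trainer now reads `ffn_heads` from its kwargs (default 2) and passes it, together with the encoder and attention settings, into `LongContextRecursiveSequential`, which forwards unrecognized kwargs on to `RecursiveSequential`. A sketch of the equivalent direct construction; the dimensions and hyperparameter values here are illustrative assumptions, not necessarily the trainer's defaults:

```python
from deeplotx import LongContextRecursiveSequential

model = LongContextRecursiveSequential(
    input_dim=768, output_dim=1, bias=True,
    encoder_layers=2, attn_heads=8,                # RoFormer encoder stack
    recursive_layers=2, recursive_hidden_dim=256,  # Bi-LSTM backbone
    ffn_layers=5, ffn_heads=2,                     # ffn_heads reaches RecursiveSequential via **kwargs
    ffn_expansion_factor=2, dropout_rate=0.1,
).initialize_weights()
```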
deeplotx.egg-info/PKG-INFO

(The generated egg-info metadata changes are identical to the PKG-INFO diff shown above: the same version bump and the same README hunks.)
deeplotx.egg-info/SOURCES.txt

@@ -21,6 +21,7 @@ deeplotx/nn/logistic_regression.py
  deeplotx/nn/long_context_auto_regression.py
  deeplotx/nn/long_context_recursive_sequential.py
  deeplotx/nn/multi_head_attention.py
+ deeplotx/nn/multi_head_feed_forward.py
  deeplotx/nn/recursive_sequential.py
  deeplotx/nn/roformer_encoder.py
  deeplotx/nn/rope.py
pyproject.toml

@@ -1,17 +1,17 @@
- [project]
- name = "deeplotx"
- version = "0.8.2"
- description = "Easy-2-use long text NLP toolkit."
- readme = "README.md"
- requires-python = ">=3.10"
- dependencies = [
-     "hf-xet",
-     "jupyter",
-     "numpy",
-     "protobuf",
-     "python-dotenv",
-     "torch",
-     "transformers",
-     "typing-extensions",
-     "vortezwohl>=0.0.8",
- ]
+ [project]
+ name = "deeplotx"
+ version = "0.8.5"
+ description = "Easy-2-use long text NLP toolkit."
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "hf-xet",
+     "jupyter",
+     "numpy",
+     "protobuf",
+     "python-dotenv",
+     "torch",
+     "transformers",
+     "typing-extensions",
+     "vortezwohl>=0.0.8",
+ ]
deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py

@@ -1,37 +0,0 @@
- import logging
- import os
-
- import torch
- from torch import nn
- from transformers import LongformerTokenizer, LongformerModel
-
- from deeplotx import __ROOT__
-
- CACHE_PATH = os.path.join(__ROOT__, '.cache')
- DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
- logger = logging.getLogger('deeplotx.embedding')
-
-
- class LongformerEncoder(nn.Module):
-     def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
-         super().__init__()
-         self.device = torch.device(device) if device is not None \
-             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-         self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                              cache_dir=CACHE_PATH, _from_auto=True)
-         self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                     cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
-         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
-
-     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-         ori_mode = self.bert.training
-         self.bert.eval()
-         with torch.no_grad():
-             res = self.bert.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
-         self.bert.train(mode=ori_mode)
-         return res
-
-     def encode(self, text: str) -> torch.Tensor:
-         _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
-         _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-         return self.forward(_input_ids, _att_mask).squeeze()