deeplotx 0.8.2__tar.gz → 0.8.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. {deeplotx-0.8.2 → deeplotx-0.8.3}/PKG-INFO +53 -37
  2. {deeplotx-0.8.2 → deeplotx-0.8.3}/README.md +52 -36
  3. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/encoder/long_text_encoder.py +13 -8
  4. deeplotx-0.8.3/deeplotx/encoder/longformer_encoder.py +55 -0
  5. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/PKG-INFO +53 -37
  6. {deeplotx-0.8.2 → deeplotx-0.8.3}/pyproject.toml +1 -1
  7. deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py +0 -37
  8. {deeplotx-0.8.2 → deeplotx-0.8.3}/LICENSE +0 -0
  9. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/__init__.py +0 -0
  10. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/encoder/__init__.py +0 -0
  11. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/encoder/encoder.py +0 -0
  12. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/__init__.py +0 -0
  13. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/attention.py +0 -0
  14. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/auto_regression.py +0 -0
  15. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/base_neural_network.py +0 -0
  16. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/feed_forward.py +0 -0
  17. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/linear_regression.py +0 -0
  18. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/logistic_regression.py +0 -0
  19. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/long_context_auto_regression.py +0 -0
  20. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  21. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/multi_head_attention.py +0 -0
  22. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/recursive_sequential.py +0 -0
  23. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/roformer_encoder.py +0 -0
  24. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/rope.py +0 -0
  25. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/softmax_regression.py +0 -0
  26. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/__init__.py +0 -0
  27. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/distribution.py +0 -0
  28. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/set.py +0 -0
  29. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/vector.py +0 -0
  30. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/trainer/__init__.py +0 -0
  31. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/trainer/base_trainer.py +0 -0
  32. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
  33. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/util/__init__.py +0 -0
  34. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/util/hash.py +0 -0
  35. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/util/read_file.py +0 -0
  36. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/SOURCES.txt +0 -0
  37. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/dependency_links.txt +0 -0
  38. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/requires.txt +0 -0
  39. {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/top_level.txt +0 -0
  40. {deeplotx-0.8.2 → deeplotx-0.8.3}/setup.cfg +0 -0

{deeplotx-0.8.2 → deeplotx-0.8.3}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deeplotx
- Version: 0.8.2
+ Version: 0.8.3
  Summary: Easy-2-use long text NLP toolkit.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -48,24 +48,23 @@ Dynamic: license-file

  - ### Long Text Embedding

- - **Long text embedding based on general-purpose BERT** (maximum supported length: unlimited, defined via max_length)
+ - **Long text embedding based on general-purpose BERT** (maximum supported length: unlimited, optionally capped via max_length)

  ```python
  from deeplotx import LongTextEncoder

- # Maximum text length of 2048 tokens, chunk size of 512 tokens, 64-token overlap between chunks.
+ # Chunk size of 448 tokens, 32-token overlap between chunks.
  encoder = LongTextEncoder(
- max_length=2048,
- chunk_size=512,
- overlapping=64
+ chunk_size=448,
+ overlapping=32
  )
- # Compute the embedding of "我是吴子豪, 这是一个测试文本." and flatten it.
- encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+ # Compute the embedding of "我是吴子豪, 这是一个测试文本." and stack it.
+ encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
  ```

  Output:
  ```
- tensor([ 0.5163, 0.2497, 0.5896, ..., -0.9815, -0.3095, 0.4232])
+ tensor([ 2.2316e-01, 2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
  ```

  - **Long text embedding based on Longformer** (maximum supported length: 4096 tokens)
@@ -77,6 +76,11 @@ Dynamic: license-file
  encoder.encode('我是吴子豪, 这是一个测试文本.')
  ```

+ Output:
+ ```
+ tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+ ```
+

  - ### Similarity Computation
  - **Vector-based similarity**
@@ -163,14 +167,16 @@ Dynamic: license-file

  ```python
  from deeplotx import (
- BaseNeuralNetwork, # base class for deep neural networks
  FeedForward, # feed-forward network
  LinearRegression, # linear regression
  LogisticRegression, # logistic regression / binary classification / multi-label classification
  SoftmaxRegression, # Softmax regression / multi-class classification
  RecursiveSequential, # sequence model / recurrent neural network
  LongContextRecursiveSequential, # long-context sequence model / RNN fused with self-attention
- SelfAttention, # self-attention module
+ RoPE, # RoPE positional encoding
+ Attention, # self-attention / cross-attention
+ MultiHeadAttention, # parallel multi-head attention
+ RoFormerEncoder, # Roformer (Transformer + RoPE) encoder model
  AutoRegression, # autoregressive model / recurrent neural network
  LongContextAutoRegression # long-context autoregressive model / RNN fused with self-attention
  )
@@ -193,13 +199,13 @@ Dynamic: license-file
  device: str | None = None, dtype: torch.dtype | None = None):
  super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
  self._dropout_rate = dropout_rate
- self.fc1 = nn.Linear(feature_dim, int(feature_dim * expansion_factor), bias=bias,
- device=self.device, dtype=self.dtype)
- self.fc2 = nn.Linear(int(feature_dim * expansion_factor), feature_dim, bias=bias,
- device=self.device, dtype=self.dtype)
- self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3,
+ self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+ bias=bias, device=self.device, dtype=self.dtype)
+ self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+ bias=bias, device=self.device, dtype=self.dtype)
+ self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
  device=self.device, dtype=self.dtype)
- self.layer_norm = nn.LayerNorm(normalized_shape=self.fc1.in_features, eps=1e-9,
+ self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
  device=self.device, dtype=self.dtype)

  @override
@@ -207,11 +213,11 @@ Dynamic: license-file
  x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
  residual = x
  x = self.layer_norm(x)
- x = self.fc1(x)
- x = self.parametric_relu_1(x)
+ x = self.up_proj(x)
+ x = self.parametric_relu(x)
  if self._dropout_rate > .0:
  x = torch.dropout(x, p=self._dropout_rate, train=self.training)
- return self.fc2(x) + residual
+ return self.down_proj(x) + residual


  class FeedForward(BaseNeuralNetwork):
@@ -224,7 +230,7 @@ Dynamic: license-file
  self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
  expansion_factor=expansion_factor, bias=bias,
  dropout_rate=dropout_rate,
- device=self.device, dtype=self.dtype)] * num_layers)
+ device=self.device, dtype=self.dtype) for _ in range(num_layers)])

  @override
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -234,7 +240,7 @@ Dynamic: license-file
  return x
  ```

- Self-attention module:
+ Attention module:

  ```python
  from typing_extensions import override
@@ -243,14 +249,17 @@ Dynamic: license-file

  from deeplotx.nn.base_neural_network import BaseNeuralNetwork
  from deeplotx.nn.feed_forward import FeedForward
+ from deeplotx.nn.rope import RoPE, DEFAULT_THETA


- class SelfAttention(BaseNeuralNetwork):
- def __init__(self, feature_dim: int, bias: bool = True, proj_layers: int = 1,
- proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
- model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+ class Attention(BaseNeuralNetwork):
+ def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+ proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+ model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+ **kwargs):
  super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
  device=device, dtype=dtype)
+ self._positional = positional
  self._feature_dim = feature_dim
  self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
  expansion_factor=proj_expansion_factor,
@@ -261,21 +270,27 @@ Dynamic: license-file
  self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
  expansion_factor=proj_expansion_factor,
  bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+ if self._positional:
+ self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+ device=self.device, dtype=self.dtype)

- def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
- q, k = self.q_proj(x), self.k_proj(x)
+ def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+ q, k = self.q_proj(x), self.k_proj(y)
+ if self._positional:
+ q, k = self.rope(q), self.rope(k)
  attn = torch.matmul(q, k.transpose(-2, -1))
  attn = attn / (self._feature_dim ** 0.5)
  attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
- return torch.softmax(attn, dim=-1)
+ return torch.softmax(attn, dtype=self.dtype, dim=-1)

  @override
- def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+ def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
  x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+ y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
  if mask is not None:
  mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
- v = self.v_proj(x)
- return torch.matmul(self._attention(x, mask), v)
+ v = self.v_proj(y)
+ return torch.matmul(self._attention(x, y, mask), v)
  ```

  - ### Text binary classification with the predefined trainer
@@ -284,7 +299,7 @@ Dynamic: license-file
  from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
  from deeplotx.util import get_files, read_file

- # Define the embedding strategy (bert-base-uncased is the default embedding model)
+ # Define the embedding strategy (FacebookAI/xlm-roberta-base is the default embedding model)
  long_text_encoder = LongTextEncoder(
  max_length=2048, # maximum text length; longer inputs are truncated
  chunk_size=448, # chunk size (in tokens)
@@ -306,10 +321,11 @@ Dynamic: license-file

  # Start training
  model = trainer.train(pos_data, neg_data,
- num_epochs=36, learning_rate=2e-5, # number of epochs and learning rate
- balancing_dataset=True, # whether to balance the dataset
- alpha=1e-4, rho=.2, # elastic net regularization hyper-parameters alpha and rho
- hidden_dim=256, recursive_layers=2) # structure of the recurrent network
+ num_epochs=36, learning_rate=2e-5,
+ balancing_dataset=True, alpha=1e-4,
+ rho=.2, encoder_layers=2, # 2 Roformer encoders
+ attn_heads=8, # 8 attention heads
+ recursive_layers=2) # 2 Bi-LSTM layers

  # Save model weights
  model.save(model_name='test_model', model_dir='model')
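
One detail in the FeedForward hunk above is easy to miss: `nn.ModuleList([FeedForwardUnit(...)] * num_layers)` repeats a single module object, so every "layer" in the stack shares one set of weights, while the new list comprehension builds an independent unit per layer. A minimal sketch of the difference, using plain `nn.Linear` stand-ins rather than the deeplotx classes:

```python
from torch import nn

# List multiplication stores the SAME module object three times: weights are shared.
shared = nn.ModuleList([nn.Linear(4, 4)] * 3)

# A comprehension constructs a fresh module per layer, as in the 0.8.3 code.
independent = nn.ModuleList([nn.Linear(4, 4) for _ in range(3)])

print(shared[0] is shared[1])                             # True
print(independent[0] is independent[1])                   # False
print(sum(p.numel() for p in shared.parameters()))        # 20: one 4x4 weight plus bias
print(sum(p.numel() for p in independent.parameters()))   # 60: three independent units
```

With the old construction the extra layers added depth only nominally; after the change each `FeedForwardUnit` trains its own parameters.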

{deeplotx-0.8.2 → deeplotx-0.8.3}/README.md

@@ -30,24 +30,23 @@

  - ### Long Text Embedding

- - **Long text embedding based on general-purpose BERT** (maximum supported length: unlimited, defined via max_length)
+ - **Long text embedding based on general-purpose BERT** (maximum supported length: unlimited, optionally capped via max_length)

  ```python
  from deeplotx import LongTextEncoder

- # Maximum text length of 2048 tokens, chunk size of 512 tokens, 64-token overlap between chunks.
+ # Chunk size of 448 tokens, 32-token overlap between chunks.
  encoder = LongTextEncoder(
- max_length=2048,
- chunk_size=512,
- overlapping=64
+ chunk_size=448,
+ overlapping=32
  )
- # Compute the embedding of "我是吴子豪, 这是一个测试文本." and flatten it.
- encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+ # Compute the embedding of "我是吴子豪, 这是一个测试文本." and stack it.
+ encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
  ```

  Output:
  ```
- tensor([ 0.5163, 0.2497, 0.5896, ..., -0.9815, -0.3095, 0.4232])
+ tensor([ 2.2316e-01, 2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
  ```

  - **Long text embedding based on Longformer** (maximum supported length: 4096 tokens)
@@ -59,6 +58,11 @@
  encoder.encode('我是吴子豪, 这是一个测试文本.')
  ```

+ Output:
+ ```
+ tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+ ```
+

  - ### Similarity Computation
  - **Vector-based similarity**
@@ -145,14 +149,16 @@

  ```python
  from deeplotx import (
- BaseNeuralNetwork, # base class for deep neural networks
  FeedForward, # feed-forward network
  LinearRegression, # linear regression
  LogisticRegression, # logistic regression / binary classification / multi-label classification
  SoftmaxRegression, # Softmax regression / multi-class classification
  RecursiveSequential, # sequence model / recurrent neural network
  LongContextRecursiveSequential, # long-context sequence model / RNN fused with self-attention
- SelfAttention, # self-attention module
+ RoPE, # RoPE positional encoding
+ Attention, # self-attention / cross-attention
+ MultiHeadAttention, # parallel multi-head attention
+ RoFormerEncoder, # Roformer (Transformer + RoPE) encoder model
  AutoRegression, # autoregressive model / recurrent neural network
  LongContextAutoRegression # long-context autoregressive model / RNN fused with self-attention
  )
@@ -175,13 +181,13 @@
  device: str | None = None, dtype: torch.dtype | None = None):
  super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
  self._dropout_rate = dropout_rate
- self.fc1 = nn.Linear(feature_dim, int(feature_dim * expansion_factor), bias=bias,
- device=self.device, dtype=self.dtype)
- self.fc2 = nn.Linear(int(feature_dim * expansion_factor), feature_dim, bias=bias,
- device=self.device, dtype=self.dtype)
- self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3,
+ self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+ bias=bias, device=self.device, dtype=self.dtype)
+ self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+ bias=bias, device=self.device, dtype=self.dtype)
+ self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
  device=self.device, dtype=self.dtype)
- self.layer_norm = nn.LayerNorm(normalized_shape=self.fc1.in_features, eps=1e-9,
+ self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
  device=self.device, dtype=self.dtype)

  @override
@@ -189,11 +195,11 @@
  x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
  residual = x
  x = self.layer_norm(x)
- x = self.fc1(x)
- x = self.parametric_relu_1(x)
+ x = self.up_proj(x)
+ x = self.parametric_relu(x)
  if self._dropout_rate > .0:
  x = torch.dropout(x, p=self._dropout_rate, train=self.training)
- return self.fc2(x) + residual
+ return self.down_proj(x) + residual


  class FeedForward(BaseNeuralNetwork):
@@ -206,7 +212,7 @@
  self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
  expansion_factor=expansion_factor, bias=bias,
  dropout_rate=dropout_rate,
- device=self.device, dtype=self.dtype)] * num_layers)
+ device=self.device, dtype=self.dtype) for _ in range(num_layers)])

  @override
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -216,7 +222,7 @@
  return x
  ```

- Self-attention module:
+ Attention module:

  ```python
  from typing_extensions import override
@@ -225,14 +231,17 @@

  from deeplotx.nn.base_neural_network import BaseNeuralNetwork
  from deeplotx.nn.feed_forward import FeedForward
+ from deeplotx.nn.rope import RoPE, DEFAULT_THETA


- class SelfAttention(BaseNeuralNetwork):
- def __init__(self, feature_dim: int, bias: bool = True, proj_layers: int = 1,
- proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
- model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+ class Attention(BaseNeuralNetwork):
+ def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+ proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+ model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+ **kwargs):
  super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
  device=device, dtype=dtype)
+ self._positional = positional
  self._feature_dim = feature_dim
  self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
  expansion_factor=proj_expansion_factor,
@@ -243,21 +252,27 @@
  self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
  expansion_factor=proj_expansion_factor,
  bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+ if self._positional:
+ self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+ device=self.device, dtype=self.dtype)

- def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
- q, k = self.q_proj(x), self.k_proj(x)
+ def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+ q, k = self.q_proj(x), self.k_proj(y)
+ if self._positional:
+ q, k = self.rope(q), self.rope(k)
  attn = torch.matmul(q, k.transpose(-2, -1))
  attn = attn / (self._feature_dim ** 0.5)
  attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
- return torch.softmax(attn, dim=-1)
+ return torch.softmax(attn, dtype=self.dtype, dim=-1)

  @override
- def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+ def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
  x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+ y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
  if mask is not None:
  mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
- v = self.v_proj(x)
- return torch.matmul(self._attention(x, mask), v)
+ v = self.v_proj(y)
+ return torch.matmul(self._attention(x, y, mask), v)
  ```

  - ### Text binary classification with the predefined trainer
@@ -266,7 +281,7 @@
  from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
  from deeplotx.util import get_files, read_file

- # Define the embedding strategy (bert-base-uncased is the default embedding model)
+ # Define the embedding strategy (FacebookAI/xlm-roberta-base is the default embedding model)
  long_text_encoder = LongTextEncoder(
  max_length=2048, # maximum text length; longer inputs are truncated
  chunk_size=448, # chunk size (in tokens)
@@ -288,10 +303,11 @@

  # Start training
  model = trainer.train(pos_data, neg_data,
- num_epochs=36, learning_rate=2e-5, # number of epochs and learning rate
- balancing_dataset=True, # whether to balance the dataset
- alpha=1e-4, rho=.2, # elastic net regularization hyper-parameters alpha and rho
- hidden_dim=256, recursive_layers=2) # structure of the recurrent network
+ num_epochs=36, learning_rate=2e-5,
+ balancing_dataset=True, alpha=1e-4,
+ rho=.2, encoder_layers=2, # 2 Roformer encoders
+ attn_heads=8, # 8 attention heads
+ recursive_layers=2) # 2 Bi-LSTM layers

  # Save model weights
  model.save(model_name='test_model', model_dir='model')
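
The `Attention` snippet above replaces `SelfAttention`: `forward` now accepts an optional second sequence `y`, falling back to self-attention when `y` is omitted and attending over `y` (cross-attention) when it is given, with RoPE applied to the query/key projections when `positional=True`. A usage sketch, assuming `Attention` is imported from `deeplotx` as in the README import block and that inputs are `(batch, seq_len, feature_dim)` tensors:

```python
import torch
from deeplotx import Attention

attn = Attention(feature_dim=256, positional=True)  # RoPE positional encoding on q and k

x = torch.randn(2, 16, 256)  # query sequence: batch of 2, 16 tokens
y = torch.randn(2, 24, 256)  # key/value sequence from elsewhere: 24 tokens

self_out = attn(x)      # y defaults to x, i.e. plain self-attention
cross_out = attn(x, y)  # cross-attention: queries from x, keys/values from y

print(self_out.shape)   # torch.Size([2, 16, 256])
print(cross_out.shape)  # torch.Size([2, 16, 256]); the output length follows the queries
```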

{deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/encoder/long_text_encoder.py

@@ -13,9 +13,9 @@ logger = logging.getLogger('deeplotx.embedding')


  class LongTextEncoder(Encoder):
- def __init__(self, max_length: int, chunk_size: int = 448,
- overlapping: int = 32, model_name_or_path: str = DEFAULT_BERT,
- cache_capacity: int = 64, max_workers: int = 8, device: str | None = None):
+ def __init__(self, chunk_size: int = 448, overlapping: int = 32, max_length: int = -1,
+ model_name_or_path: str = DEFAULT_BERT, cache_capacity: int = 64,
+ max_workers: int = 8, device: str | None = None):
  super().__init__(model_name_or_path=model_name_or_path, device=device)
  assert overlapping < chunk_size, f'overlapping ({overlapping}) must be less than chunk size ({chunk_size}).'
  self._max_length = max_length
@@ -41,23 +41,28 @@ class LongTextEncoder(Encoder):
  _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
  return _fin_emb_tensor.squeeze()

+ _tmp_max_length = self._max_length
  _text_to_show = text.replace("\n", str())
  logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
  # read cache
  _text_hash = sha512(text)
  if _text_hash in self._cache:
  return postprocess(self._cache[_text_hash], flatten)
- _text_to_input_ids = self.tokenizer.encode(text.strip())[:self._max_length]
+ _text_to_input_ids = self.tokenizer.encode(text.strip())
+ # variable length
+ if _tmp_max_length < 0:
+ _tmp_max_length = len(_text_to_input_ids)
+ _text_to_input_ids = _text_to_input_ids[:_tmp_max_length]
  _text_to_input_ids_att_mask = []
  # padding
  pad_token = self.tokenizer.pad_token_type_id
- if len(_text_to_input_ids) < self._max_length:
- _text_to_input_ids.extend([pad_token] * (self._max_length - len(_text_to_input_ids)))
+ if len(_text_to_input_ids) < _tmp_max_length:
+ _text_to_input_ids.extend([pad_token] * (_tmp_max_length - len(_text_to_input_ids)))
  pads = _text_to_input_ids.count(pad_token)
- non_pads = self._max_length - pads
+ non_pads = _tmp_max_length - pads
  _text_to_input_ids_att_mask.extend([1] * non_pads)
  _text_to_input_ids_att_mask.extend([0] * pads)
- num_chunks = math.ceil(self._max_length / self._chunk_size)
+ num_chunks = math.ceil(_tmp_max_length / self._chunk_size)
  # split chunks
  chunks = []
  for i in range(num_chunks):
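
The constructor change above makes `max_length` optional with a default of `-1`. When it is negative, `_tmp_max_length` is set to the tokenized length of the input, so each text is chunked at its natural length with no global padding or truncation; passing an explicit `max_length` restores the fixed-budget behaviour of 0.8.2. A short sketch of the two modes, assuming the 0.8.3 signature shown in this hunk:

```python
from deeplotx import LongTextEncoder

# Variable-length mode (new default, max_length=-1): the text is chunked at its own
# tokenized length, so nothing is padded or cut off.
encoder = LongTextEncoder(chunk_size=448, overlapping=32)

# Fixed-length mode (the 0.8.2 behaviour): inputs are truncated to 2048 tokens and
# shorter inputs are padded up to 2048 before chunking.
fixed_encoder = LongTextEncoder(chunk_size=448, overlapping=32, max_length=2048)

emb = encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
print(emb.shape)  # chunk embeddings stacked rather than flattened
```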

deeplotx-0.8.3/deeplotx/encoder/longformer_encoder.py (new file)

@@ -0,0 +1,55 @@
+ import logging
+ import os
+
+ import torch
+ from torch import nn
+ from transformers import AutoModel, AutoTokenizer
+ from requests.exceptions import ConnectTimeout, SSLError
+
+ from deeplotx import __ROOT__
+
+ CACHE_PATH = os.path.join(__ROOT__, '.cache')
+ DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+ logger = logging.getLogger('deeplotx.embedding')
+
+
+ class LongformerEncoder(nn.Module):
+ def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
+ super().__init__()
+ self.device = torch.device(device) if device is not None \
+ else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+ try:
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+ cache_dir=CACHE_PATH, _from_auto=True,
+ trust_remote_code=True)
+ self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+ cache_dir=CACHE_PATH, _from_auto=True,
+ trust_remote_code=True).to(self.device)
+ except ConnectTimeout:
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+ cache_dir=CACHE_PATH, _from_auto=True,
+ trust_remote_code=True, local_files_only=True)
+ self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+ cache_dir=CACHE_PATH, _from_auto=True,
+ trust_remote_code=True, local_files_only=True).to(self.device)
+ except SSLError:
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+ cache_dir=CACHE_PATH, _from_auto=True,
+ trust_remote_code=True, local_files_only=True)
+ self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+ cache_dir=CACHE_PATH, _from_auto=True,
+ trust_remote_code=True, local_files_only=True).to(self.device)
+ logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
+
+ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+ ori_mode = self.encoder.training
+ self.encoder.eval()
+ with torch.no_grad():
+ res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+ self.encoder.train(mode=ori_mode)
+ return res
+
+ def encode(self, text: str) -> torch.Tensor:
+ _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
+ _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
+ return self.forward(_input_ids, _att_mask).squeeze()
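
The rewritten encoder loads its weights through `AutoModel`/`AutoTokenizer` and, on a `ConnectTimeout` or `SSLError`, retries with `local_files_only=True`, so a previously downloaded checkpoint keeps working when the Hugging Face Hub is unreachable. A minimal usage sketch; the import path follows the file listing above, and the 768-dimensional output assumes the default `allenai/longformer-base-4096` checkpoint:

```python
from deeplotx.encoder.longformer_encoder import LongformerEncoder

# The first run downloads allenai/longformer-base-4096 into deeplotx's .cache directory;
# later runs fall back to that cache if the download fails with a network error.
encoder = LongformerEncoder(device='cpu')

emb = encoder.encode('我是吴子豪, 这是一个测试文本.')
print(emb.shape)          # torch.Size([768]): the [CLS]-position embedding
print(emb.requires_grad)  # False, since forward() runs under torch.no_grad()
```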

{deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: deeplotx
- Version: 0.8.2
+ Version: 0.8.3
  Summary: Easy-2-use long text NLP toolkit.
  Requires-Python: >=3.10
  Description-Content-Type: text/markdown
@@ -48,24 +48,23 @@ Dynamic: license-file

  - ### Long Text Embedding

- - **Long text embedding based on general-purpose BERT** (maximum supported length: unlimited, defined via max_length)
+ - **Long text embedding based on general-purpose BERT** (maximum supported length: unlimited, optionally capped via max_length)

  ```python
  from deeplotx import LongTextEncoder

- # Maximum text length of 2048 tokens, chunk size of 512 tokens, 64-token overlap between chunks.
+ # Chunk size of 448 tokens, 32-token overlap between chunks.
  encoder = LongTextEncoder(
- max_length=2048,
- chunk_size=512,
- overlapping=64
+ chunk_size=448,
+ overlapping=32
  )
- # Compute the embedding of "我是吴子豪, 这是一个测试文本." and flatten it.
- encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+ # Compute the embedding of "我是吴子豪, 这是一个测试文本." and stack it.
+ encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
  ```

  Output:
  ```
- tensor([ 0.5163, 0.2497, 0.5896, ..., -0.9815, -0.3095, 0.4232])
+ tensor([ 2.2316e-01, 2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
  ```

  - **Long text embedding based on Longformer** (maximum supported length: 4096 tokens)
@@ -77,6 +76,11 @@ Dynamic: license-file
  encoder.encode('我是吴子豪, 这是一个测试文本.')
  ```

+ Output:
+ ```
+ tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+ ```
+

  - ### Similarity Computation
  - **Vector-based similarity**
@@ -163,14 +167,16 @@ Dynamic: license-file

  ```python
  from deeplotx import (
- BaseNeuralNetwork, # base class for deep neural networks
  FeedForward, # feed-forward network
  LinearRegression, # linear regression
  LogisticRegression, # logistic regression / binary classification / multi-label classification
  SoftmaxRegression, # Softmax regression / multi-class classification
  RecursiveSequential, # sequence model / recurrent neural network
  LongContextRecursiveSequential, # long-context sequence model / RNN fused with self-attention
- SelfAttention, # self-attention module
+ RoPE, # RoPE positional encoding
+ Attention, # self-attention / cross-attention
+ MultiHeadAttention, # parallel multi-head attention
+ RoFormerEncoder, # Roformer (Transformer + RoPE) encoder model
  AutoRegression, # autoregressive model / recurrent neural network
  LongContextAutoRegression # long-context autoregressive model / RNN fused with self-attention
  )
@@ -193,13 +199,13 @@ Dynamic: license-file
  device: str | None = None, dtype: torch.dtype | None = None):
  super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
  self._dropout_rate = dropout_rate
- self.fc1 = nn.Linear(feature_dim, int(feature_dim * expansion_factor), bias=bias,
- device=self.device, dtype=self.dtype)
- self.fc2 = nn.Linear(int(feature_dim * expansion_factor), feature_dim, bias=bias,
- device=self.device, dtype=self.dtype)
- self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3,
+ self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+ bias=bias, device=self.device, dtype=self.dtype)
+ self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+ bias=bias, device=self.device, dtype=self.dtype)
+ self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
  device=self.device, dtype=self.dtype)
- self.layer_norm = nn.LayerNorm(normalized_shape=self.fc1.in_features, eps=1e-9,
+ self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
  device=self.device, dtype=self.dtype)

  @override
@@ -207,11 +213,11 @@ Dynamic: license-file
  x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
  residual = x
  x = self.layer_norm(x)
- x = self.fc1(x)
- x = self.parametric_relu_1(x)
+ x = self.up_proj(x)
+ x = self.parametric_relu(x)
  if self._dropout_rate > .0:
  x = torch.dropout(x, p=self._dropout_rate, train=self.training)
- return self.fc2(x) + residual
+ return self.down_proj(x) + residual


  class FeedForward(BaseNeuralNetwork):
@@ -224,7 +230,7 @@ Dynamic: license-file
  self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
  expansion_factor=expansion_factor, bias=bias,
  dropout_rate=dropout_rate,
- device=self.device, dtype=self.dtype)] * num_layers)
+ device=self.device, dtype=self.dtype) for _ in range(num_layers)])

  @override
  def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -234,7 +240,7 @@ Dynamic: license-file
  return x
  ```

- Self-attention module:
+ Attention module:

  ```python
  from typing_extensions import override
@@ -243,14 +249,17 @@ Dynamic: license-file

  from deeplotx.nn.base_neural_network import BaseNeuralNetwork
  from deeplotx.nn.feed_forward import FeedForward
+ from deeplotx.nn.rope import RoPE, DEFAULT_THETA


- class SelfAttention(BaseNeuralNetwork):
- def __init__(self, feature_dim: int, bias: bool = True, proj_layers: int = 1,
- proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
- model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+ class Attention(BaseNeuralNetwork):
+ def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+ proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+ model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+ **kwargs):
  super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
  device=device, dtype=dtype)
+ self._positional = positional
  self._feature_dim = feature_dim
  self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
  expansion_factor=proj_expansion_factor,
@@ -261,21 +270,27 @@ Dynamic: license-file
  self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
  expansion_factor=proj_expansion_factor,
  bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+ if self._positional:
+ self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+ device=self.device, dtype=self.dtype)

- def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
- q, k = self.q_proj(x), self.k_proj(x)
+ def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+ q, k = self.q_proj(x), self.k_proj(y)
+ if self._positional:
+ q, k = self.rope(q), self.rope(k)
  attn = torch.matmul(q, k.transpose(-2, -1))
  attn = attn / (self._feature_dim ** 0.5)
  attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
- return torch.softmax(attn, dim=-1)
+ return torch.softmax(attn, dtype=self.dtype, dim=-1)

  @override
- def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+ def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
  x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+ y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
  if mask is not None:
  mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
- v = self.v_proj(x)
- return torch.matmul(self._attention(x, mask), v)
+ v = self.v_proj(y)
+ return torch.matmul(self._attention(x, y, mask), v)
  ```

  - ### Text binary classification with the predefined trainer
@@ -284,7 +299,7 @@ Dynamic: license-file
  from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
  from deeplotx.util import get_files, read_file

- # Define the embedding strategy (bert-base-uncased is the default embedding model)
+ # Define the embedding strategy (FacebookAI/xlm-roberta-base is the default embedding model)
  long_text_encoder = LongTextEncoder(
  max_length=2048, # maximum text length; longer inputs are truncated
  chunk_size=448, # chunk size (in tokens)
@@ -306,10 +321,11 @@ Dynamic: license-file

  # Start training
  model = trainer.train(pos_data, neg_data,
- num_epochs=36, learning_rate=2e-5, # number of epochs and learning rate
- balancing_dataset=True, # whether to balance the dataset
- alpha=1e-4, rho=.2, # elastic net regularization hyper-parameters alpha and rho
- hidden_dim=256, recursive_layers=2) # structure of the recurrent network
+ num_epochs=36, learning_rate=2e-5,
+ balancing_dataset=True, alpha=1e-4,
+ rho=.2, encoder_layers=2, # 2 Roformer encoders
+ attn_heads=8, # 8 attention heads
+ recursive_layers=2) # 2 Bi-LSTM layers

  # Save model weights
  model.save(model_name='test_model', model_dir='model')

{deeplotx-0.8.2 → deeplotx-0.8.3}/pyproject.toml

@@ -1,6 +1,6 @@
  [project]
  name = "deeplotx"
- version = "0.8.2"
+ version = "0.8.3"
  description = "Easy-2-use long text NLP toolkit."
  readme = "README.md"
  requires-python = ">=3.10"

deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py (removed)

@@ -1,37 +0,0 @@
- import logging
- import os
-
- import torch
- from torch import nn
- from transformers import LongformerTokenizer, LongformerModel
-
- from deeplotx import __ROOT__
-
- CACHE_PATH = os.path.join(__ROOT__, '.cache')
- DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
- logger = logging.getLogger('deeplotx.embedding')
-
-
- class LongformerEncoder(nn.Module):
- def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
- super().__init__()
- self.device = torch.device(device) if device is not None \
- else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
- self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
- cache_dir=CACHE_PATH, _from_auto=True)
- self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
- cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
- logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
-
- def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
- ori_mode = self.bert.training
- self.bert.eval()
- with torch.no_grad():
- res = self.bert.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
- self.bert.train(mode=ori_mode)
- return res
-
- def encode(self, text: str) -> torch.Tensor:
- _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
- _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
- return self.forward(_input_ids, _att_mask).squeeze()