deeplotx 0.8.2__tar.gz → 0.8.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deeplotx-0.8.2 → deeplotx-0.8.5}/PKG-INFO +54 -37
- {deeplotx-0.8.2 → deeplotx-0.8.5}/README.md +53 -36
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/__init__.py +1 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/encoder/long_text_encoder.py +13 -8
- deeplotx-0.8.5/deeplotx/encoder/longformer_encoder.py +55 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/__init__.py +1 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/auto_regression.py +2 -2
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/linear_regression.py +5 -4
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/logistic_regression.py +4 -4
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/long_context_recursive_sequential.py +2 -3
- deeplotx-0.8.5/deeplotx/nn/multi_head_feed_forward.py +32 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/recursive_sequential.py +5 -5
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/softmax_regression.py +4 -4
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/trainer/text_binary_classification_trainer.py +6 -5
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/PKG-INFO +54 -37
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/SOURCES.txt +1 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/pyproject.toml +17 -17
- deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py +0 -37
- {deeplotx-0.8.2 → deeplotx-0.8.5}/LICENSE +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/encoder/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/encoder/encoder.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/attention.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/base_neural_network.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/feed_forward.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/long_context_auto_regression.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/multi_head_attention.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/roformer_encoder.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/nn/rope.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/distribution.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/set.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/similarity/vector.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/trainer/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/trainer/base_trainer.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/util/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/util/hash.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx/util/read_file.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/dependency_links.txt +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/requires.txt +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/deeplotx.egg-info/top_level.txt +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.5}/setup.cfg +0 -0
--- deeplotx-0.8.2/PKG-INFO
+++ deeplotx-0.8.5/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.2
+Version: 0.8.5
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
(The remaining PKG-INFO hunks, @@ -48,24 +48,23 @@ through @@ -306,10 +322,11 @@, update the embedded long description and are identical to the README.md changes shown in full below.)
--- deeplotx-0.8.2/README.md
+++ deeplotx-0.8.5/README.md
@@ -30,24 +30,23 @@
 
 - ### 长文本嵌入
 
-- **基于通用 BERT 的长文本嵌入** (最大支持长度, 无限长,
+- **基于通用 BERT 的长文本嵌入** (最大支持长度, 无限长, 可通过 max_length 限制长度)
 
 ```python
 from deeplotx import LongTextEncoder
 
-#
+# 块大小为 448 个 tokens, 块间重叠部分为 32 个 tokens.
 encoder = LongTextEncoder(
-
-
-    overlapping=64
+    chunk_size=448,
+    overlapping=32
 )
-# 对 "我是吴子豪, 这是一个测试文本." 计算嵌入,
-encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=
+# 对 "我是吴子豪, 这是一个测试文本." 计算嵌入, 并堆叠.
+encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
 ```
 
 输出:
 ```
-tensor([
+tensor([ 2.2316e-01, 2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
 ```
 
 - **基于 Longformer 的长文本嵌入** (最大支持长度 4096 个 tokens)
@@ -59,6 +58,11 @@
 encoder.encode('我是吴子豪, 这是一个测试文本.')
 ```
 
+输出:
+```
+tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+```
+
 - ### 相似性计算
 
 - **基于向量的相似性**
@@ -145,14 +149,17 @@
 
 ```python
 from deeplotx import (
-    BaseNeuralNetwork, # 深度神经网络基类
     FeedForward, # 前馈神经网络
+    MultiHeadFeedForward, # 多头前馈神经网络
     LinearRegression, # 线性回归
     LogisticRegression, # 逻辑回归 / 二分类 / 多标签分类
     SoftmaxRegression, # Softmax 回归 / 多分类
     RecursiveSequential, # 序列模型 / 循环神经网络
     LongContextRecursiveSequential, # 长上下文序列模型 / 自注意力融合循环神经网络
-
+    RoPE, # RoPE 位置编码
+    Attention, # 自注意力 / 交叉注意力
+    MultiHeadAttention, # 并行多头注意力
+    RoFormerEncoder, # Roformer (Transformer + RoPE) 编码器模型
     AutoRegression, # 自回归模型 / 循环神经网络
     LongContextAutoRegression # 长上下文自回归模型 / 自注意力融合循环神经网络
 )
@@ -175,13 +182,13 @@
                  device: str | None = None, dtype: torch.dtype | None = None):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
         self._dropout_rate = dropout_rate
-        self.
-
-        self.
-
-        self.
+        self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+                                 bias=bias, device=self.device, dtype=self.dtype)
+        self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+                                   bias=bias, device=self.device, dtype=self.dtype)
+        self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
                                         device=self.device, dtype=self.dtype)
-        self.layer_norm = nn.LayerNorm(normalized_shape=self.
+        self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
                                        device=self.device, dtype=self.dtype)
 
     @override
@@ -189,11 +196,11 @@
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
         residual = x
         x = self.layer_norm(x)
-        x = self.
-        x = self.
+        x = self.up_proj(x)
+        x = self.parametric_relu(x)
         if self._dropout_rate > .0:
             x = torch.dropout(x, p=self._dropout_rate, train=self.training)
-        return self.
+        return self.down_proj(x) + residual
 
 
 class FeedForward(BaseNeuralNetwork):
@@ -206,7 +213,7 @@
         self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
                                                          expansion_factor=expansion_factor, bias=bias,
                                                          dropout_rate=dropout_rate,
-                                                         device=self.device, dtype=self.dtype)
+                                                         device=self.device, dtype=self.dtype) for _ in range(num_layers)])
 
     @override
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -216,7 +223,7 @@
         return x
 ```
 
-
+注意力模块:
 
 ```python
 from typing_extensions import override
@@ -225,14 +232,17 @@
 
 from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 from deeplotx.nn.feed_forward import FeedForward
+from deeplotx.nn.rope import RoPE, DEFAULT_THETA
 
 
-class
-    def __init__(self, feature_dim: int, bias: bool = True,
-                 proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None
+class Attention(BaseNeuralNetwork):
+    def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+                 proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+                 **kwargs):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
                          device=device, dtype=dtype)
+        self._positional = positional
         self._feature_dim = feature_dim
         self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
@@ -243,21 +253,27 @@
         self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
                                   bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+        if self._positional:
+            self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+                             device=self.device, dtype=self.dtype)
 
-    def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
-        q, k = self.q_proj(x), self.k_proj(
+    def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        q, k = self.q_proj(x), self.k_proj(y)
+        if self._positional:
+            q, k = self.rope(q), self.rope(k)
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
-    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+        y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
         if mask is not None:
             mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
-        v = self.v_proj(
-        return torch.matmul(self._attention(x, mask), v)
+        v = self.v_proj(y)
+        return torch.matmul(self._attention(x, y, mask), v)
 ```
 
 - ### 使用预定义训练器实现文本二分类任务
@@ -266,7 +282,7 @@
 from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
 from deeplotx.util import get_files, read_file
 
-# 定义向量编码策略 (默认使用
+# 定义向量编码策略 (默认使用 FacebookAI/xlm-roberta-base 作为嵌入模型)
 long_text_encoder = LongTextEncoder(
     max_length=2048, # 最大文本大小, 超出截断
     chunk_size=448, # 块大小 (按 Token 计)
@@ -288,10 +304,11 @@
 
 # 开始训练
 model = trainer.train(pos_data, neg_data,
-
-
-
-
+                      num_epochs=36, learning_rate=2e-5,
+                      balancing_dataset=True, alpha=1e-4,
+                      rho=.2, encoder_layers=2, # 2 层 Roformer 编码器
+                      attn_heads=8, # 8 个注意力头
+                      recursive_layers=2) # 2 层 Bi-LSTM
 
 # 保存模型权重
 model.save(model_name='test_model', model_dir='model')
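The README's Attention example above gains an optional second input y, so the same module covers self-attention (y omitted) and cross-attention (y supplied). A minimal usage sketch; the tensor shapes and batch layout are illustrative assumptions, not taken from the package docs:

```python
import torch
from deeplotx import Attention

attn = Attention(feature_dim=256, positional=True)
x = torch.randn(2, 16, 256)    # queries are projected from x
y = torch.randn(2, 24, 256)    # keys/values are projected from y
self_out = attn(x)             # y=None: plain self-attention over x
cross_out = attn(x, y)         # cross-attention from x onto y
```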
--- deeplotx-0.8.2/deeplotx/encoder/long_text_encoder.py
+++ deeplotx-0.8.5/deeplotx/encoder/long_text_encoder.py
@@ -13,9 +13,9 @@ logger = logging.getLogger('deeplotx.embedding')
 
 
 class LongTextEncoder(Encoder):
-    def __init__(self,
-
-
+    def __init__(self, chunk_size: int = 448, overlapping: int = 32, max_length: int = -1,
+                 model_name_or_path: str = DEFAULT_BERT, cache_capacity: int = 64,
+                 max_workers: int = 8, device: str | None = None):
         super().__init__(model_name_or_path=model_name_or_path, device=device)
         assert overlapping < chunk_size, f'overlapping ({overlapping}) must be less than chunk size ({chunk_size}).'
         self._max_length = max_length
@@ -41,23 +41,28 @@ class LongTextEncoder(Encoder):
             _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
         return _fin_emb_tensor.squeeze()
 
+        _tmp_max_length = self._max_length
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
         _text_hash = sha512(text)
         if _text_hash in self._cache:
             return postprocess(self._cache[_text_hash], flatten)
-        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        # variable length
+        if _tmp_max_length < 0:
+            _tmp_max_length = len(_text_to_input_ids)
+        _text_to_input_ids = _text_to_input_ids[:_tmp_max_length]
         _text_to_input_ids_att_mask = []
         # padding
         pad_token = self.tokenizer.pad_token_type_id
-        if len(_text_to_input_ids) <
-            _text_to_input_ids.extend([pad_token] * (
+        if len(_text_to_input_ids) < _tmp_max_length:
+            _text_to_input_ids.extend([pad_token] * (_tmp_max_length - len(_text_to_input_ids)))
         pads = _text_to_input_ids.count(pad_token)
-        non_pads =
+        non_pads = _tmp_max_length - pads
         _text_to_input_ids_att_mask.extend([1] * non_pads)
         _text_to_input_ids_att_mask.extend([0] * pads)
-        num_chunks = math.ceil(
+        num_chunks = math.ceil(_tmp_max_length / self._chunk_size)
         # split chunks
         chunks = []
         for i in range(num_chunks):
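In the hunk above, LongTextEncoder now defaults to max_length=-1, and the new code path treats a negative value as "use the tokenized length of the input" rather than a fixed ceiling. A minimal sketch of the two modes, assuming the constructor defaults shown in the diff:

```python
from deeplotx import LongTextEncoder

# Variable-length mode (new default): no truncation; chunking follows the
# actual token count of each input.
encoder = LongTextEncoder(chunk_size=448, overlapping=32)

# Fixed-length mode: inputs longer than 2048 tokens are truncated, shorter
# ones are padded up to 2048 before chunking.
bounded = LongTextEncoder(max_length=2048, chunk_size=448, overlapping=32)
emb = bounded.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
```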
--- /dev/null
+++ deeplotx-0.8.5/deeplotx/encoder/longformer_encoder.py
@@ -0,0 +1,55 @@
+import logging
+import os
+
+import torch
+from torch import nn
+from transformers import AutoModel, AutoTokenizer
+from requests.exceptions import ConnectTimeout, SSLError
+
+from deeplotx import __ROOT__
+
+CACHE_PATH = os.path.join(__ROOT__, '.cache')
+DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+logger = logging.getLogger('deeplotx.embedding')
+
+
+class LongformerEncoder(nn.Module):
+    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
+        super().__init__()
+        self.device = torch.device(device) if device is not None \
+            else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True).to(self.device)
+        except ConnectTimeout:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
+        except SSLError:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
+        logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
+
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        ori_mode = self.encoder.training
+        self.encoder.eval()
+        with torch.no_grad():
+            res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+        self.encoder.train(mode=ori_mode)
+        return res
+
+    def encode(self, text: str) -> torch.Tensor:
+        _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
+        _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
+        return self.forward(_input_ids, _att_mask).squeeze()
--- deeplotx-0.8.2/deeplotx/nn/__init__.py
+++ deeplotx-0.8.5/deeplotx/nn/__init__.py
@@ -1,5 +1,6 @@
 from .base_neural_network import BaseNeuralNetwork
 from .feed_forward import FeedForward
+from .multi_head_feed_forward import MultiHeadFeedForward
 from .linear_regression import LinearRegression
 from .logistic_regression import LogisticRegression
 from .softmax_regression import SoftmaxRegression
--- deeplotx-0.8.2/deeplotx/nn/auto_regression.py
+++ deeplotx-0.8.5/deeplotx/nn/auto_regression.py
@@ -7,8 +7,8 @@ class AutoRegression(RecursiveSequential):
     def __init__(self, feature_dim: int, bias: bool = True,
                  recursive_layers: int = 1, recursive_hidden_dim: int | None = None,
                  ffn_layers: int = 1, ffn_expansion_factor: int | float = 2, dropout_rate: float = 0.05,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None, **kwargs):
         super().__init__(input_dim=feature_dim, output_dim=feature_dim, bias=bias,
                          recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
                          ffn_layers=ffn_layers, ffn_expansion_factor=ffn_expansion_factor,
-                         dropout_rate=dropout_rate, model_name=model_name, device=device, dtype=dtype)
+                         dropout_rate=dropout_rate, model_name=model_name, device=device, dtype=dtype, **kwargs)
--- deeplotx-0.8.2/deeplotx/nn/linear_regression.py
+++ deeplotx-0.8.5/deeplotx/nn/linear_regression.py
@@ -4,16 +4,17 @@ import torch
 from torch import nn
 
 from deeplotx.nn.base_neural_network import BaseNeuralNetwork
-from deeplotx.nn.
+from deeplotx.nn.multi_head_feed_forward import MultiHeadFeedForward
 
 
 class LinearRegression(BaseNeuralNetwork):
-    def __init__(self, input_dim: int, output_dim: int, num_layers: int = 1,
+    def __init__(self, input_dim: int, output_dim: int, num_heads: int = 1, num_layers: int = 1,
                  expansion_factor: int | float = 1.5, bias: bool = True, dropout_rate: float = 0.1,
                  model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
         super().__init__(in_features=input_dim, out_features=output_dim, model_name=model_name, device=device, dtype=dtype)
-        self.ffn =
-
+        self.ffn = MultiHeadFeedForward(feature_dim=input_dim, num_heads=num_heads,
+                                        num_layers=num_layers, expansion_factor=expansion_factor,
+                                        bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
         self.proj = nn.Linear(in_features=input_dim, out_features=output_dim,
                               bias=bias, device=self.device, dtype=self.dtype)
 
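With this change, LinearRegression builds its hidden block from the new MultiHeadFeedForward, exposing num_heads next to num_layers. A minimal construction sketch with assumed dimensions (the output shape is an assumption based on the proj layer above):

```python
import torch
from deeplotx import LinearRegression

# 256-dim features regressed to one output through 2 parallel FFN heads.
model = LinearRegression(input_dim=256, output_dim=1, num_heads=2, num_layers=3)
pred = model(torch.randn(4, 256))  # expected shape: (4, 1)
```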
--- deeplotx-0.8.2/deeplotx/nn/logistic_regression.py
+++ deeplotx-0.8.5/deeplotx/nn/logistic_regression.py
@@ -6,10 +6,10 @@ from deeplotx.nn.linear_regression import LinearRegression
 
 
 class LogisticRegression(LinearRegression):
-    def __init__(self, input_dim: int, output_dim: int = 1,
-                 bias: bool = True, dropout_rate: float = 0.1,
-                 device: str | None = None, dtype: torch.dtype | None = None):
-        super().__init__(input_dim=input_dim, output_dim=output_dim, num_layers=num_layers,
+    def __init__(self, input_dim: int, output_dim: int = 1, num_heads: int = 1, num_layers: int = 1,
+                 expansion_factor: int | float = 1.5, bias: bool = True, dropout_rate: float = 0.1,
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+        super().__init__(input_dim=input_dim, output_dim=output_dim, num_heads=num_heads, num_layers=num_layers,
                          expansion_factor=expansion_factor, bias=bias, dropout_rate=dropout_rate,
                          model_name=model_name, device=device, dtype=dtype)
 
--- deeplotx-0.8.2/deeplotx/nn/long_context_recursive_sequential.py
+++ deeplotx-0.8.5/deeplotx/nn/long_context_recursive_sequential.py
@@ -12,12 +12,11 @@ class LongContextRecursiveSequential(RecursiveSequential):
     def __init__(self, input_dim: int, output_dim: int, bias: bool = True,
                  encoder_layers: int = 1, attn_heads: int = 1, recursive_layers: int = 2, recursive_hidden_dim: int | None = None,
                  ffn_layers: int = 1, ffn_expansion_factor: int | float = 2, dropout_rate: float = 0.05,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
-                 **kwargs):
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None, **kwargs):
         super().__init__(input_dim=input_dim, output_dim=output_dim, bias=bias,
                          recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
                          ffn_layers=ffn_layers, ffn_expansion_factor=ffn_expansion_factor, dropout_rate=dropout_rate,
-                         model_name=model_name, device=device, dtype=dtype)
+                         model_name=model_name, device=device, dtype=dtype, **kwargs)
         self.roformer_encoders = nn.ModuleList([RoFormerEncoder(feature_dim=input_dim, attn_heads=attn_heads, bias=bias,
                                                                  ffn_layers=kwargs.get('encoder_ffn_layers', ffn_layers),
                                                                  ffn_expansion_factor=kwargs.get('encoder_expansion_factor', ffn_expansion_factor),
--- /dev/null
+++ deeplotx-0.8.5/deeplotx/nn/multi_head_feed_forward.py
@@ -0,0 +1,32 @@
+from typing_extensions import override
+
+import torch
+from torch import nn
+
+from deeplotx.nn.base_neural_network import BaseNeuralNetwork
+from deeplotx.nn.feed_forward import FeedForward
+
+
+class MultiHeadFeedForward(BaseNeuralNetwork):
+    def __init__(self, feature_dim: int, num_heads: int = 1, num_layers: int = 1, expansion_factor: int | float = 2,
+                 bias: bool = True, dropout_rate: float = 0.05, model_name: str | None = None,
+                 device: str | None = None, dtype: torch.dtype | None = None):
+        super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
+                         device=device, dtype=dtype)
+        self._num_heads = num_heads
+        self.expand_proj = nn.Linear(in_features=feature_dim, out_features=feature_dim * self._num_heads, bias=bias,
+                                     device=self.device, dtype=self.dtype)
+        self.ffn_heads = nn.ModuleList([FeedForward(feature_dim=feature_dim, num_layers=num_layers,
+                                                    expansion_factor=expansion_factor, bias=bias,
+                                                    dropout_rate=dropout_rate, device=self.device,
+                                                    dtype=self.dtype) for _ in range(self._num_heads)])
+        self.out_proj = nn.Linear(in_features=feature_dim * self._num_heads, out_features=feature_dim, bias=bias,
+                                  device=self.device, dtype=self.dtype)
+
+    @override
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+        x = self.expand_proj(x)
+        x_heads = x.split(self.in_features, dim=-1)
+        head_outs = [self.ffn_heads[_](x_heads[_]) for _ in range(self._num_heads)]
+        return self.out_proj(torch.concat(head_outs, dim=-1))
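MultiHeadFeedForward (new file above) expands the input into num_heads slices with expand_proj, runs an independent FeedForward over each slice, and projects the concatenation back to feature_dim, so the output shape matches the input. A minimal sketch with assumed shapes:

```python
import torch
from deeplotx.nn import MultiHeadFeedForward

mhff = MultiHeadFeedForward(feature_dim=128, num_heads=4, num_layers=2)
out = mhff(torch.randn(8, 128))  # shape preserved: (8, 128)
```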
--- deeplotx-0.8.2/deeplotx/nn/recursive_sequential.py
+++ deeplotx-0.8.5/deeplotx/nn/recursive_sequential.py
@@ -4,14 +4,14 @@ import torch
 from torch import nn
 
 from deeplotx.nn.base_neural_network import BaseNeuralNetwork
-from deeplotx.nn.
+from deeplotx.nn.multi_head_feed_forward import MultiHeadFeedForward
 
 
 class RecursiveSequential(BaseNeuralNetwork):
     def __init__(self, input_dim: int, output_dim: int, bias: bool = True,
                  recursive_layers: int = 1, recursive_hidden_dim: int | None = None,
                  ffn_layers: int = 1, ffn_expansion_factor: int | float = 2, dropout_rate: float = 0.05,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None, **kwargs):
         super().__init__(in_features=input_dim, out_features=output_dim, model_name=model_name,
                          device=device, dtype=dtype)
         if recursive_hidden_dim is None:
@@ -20,9 +20,9 @@ class RecursiveSequential(BaseNeuralNetwork):
                             num_layers=recursive_layers, batch_first=True,
                             bias=True, bidirectional=True, device=self.device,
                             dtype=self.dtype)
-        self.ffn =
-
-
+        self.ffn = MultiHeadFeedForward(feature_dim=recursive_hidden_dim * 2, num_heads=kwargs.get('ffn_heads', 1),
+                                        num_layers=ffn_layers, expansion_factor=ffn_expansion_factor,
+                                        bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
         self.__proj = nn.Linear(in_features=recursive_hidden_dim * 2, out_features=output_dim, bias=bias,
                                 device=self.device, dtype=self.dtype)
 
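RecursiveSequential now accepts **kwargs and reads ffn_heads from them when building its MultiHeadFeedForward block over the Bi-LSTM output. A construction-only sketch with assumed dimensions (the forward/state API is unchanged and not shown here):

```python
from deeplotx import RecursiveSequential

# ffn_heads is consumed via **kwargs by the MultiHeadFeedForward shown above.
seq = RecursiveSequential(input_dim=64, output_dim=2, recursive_layers=2, ffn_heads=2)
```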
--- deeplotx-0.8.2/deeplotx/nn/softmax_regression.py
+++ deeplotx-0.8.5/deeplotx/nn/softmax_regression.py
@@ -6,10 +6,10 @@ from deeplotx.nn.linear_regression import LinearRegression
 
 
 class SoftmaxRegression(LinearRegression):
-    def __init__(self, input_dim: int, output_dim: int,
-                 bias: bool = True, dropout_rate: float = 0.1,
-                 device: str | None = None, dtype: torch.dtype | None = None):
-        super().__init__(input_dim=input_dim, output_dim=output_dim, num_layers=num_layers,
+    def __init__(self, input_dim: int, output_dim: int, num_heads: int = 1, num_layers: int = 1,
+                 expansion_factor: int | float = 1.5, bias: bool = True, dropout_rate: float = 0.1,
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None):
+        super().__init__(input_dim=input_dim, output_dim=output_dim, num_heads=num_heads, num_layers=num_layers,
                          expansion_factor=expansion_factor, bias=bias, dropout_rate=dropout_rate,
                          model_name=model_name, device=device, dtype=dtype)
 
--- deeplotx-0.8.2/deeplotx/trainer/text_binary_classification_trainer.py
+++ deeplotx-0.8.5/deeplotx/trainer/text_binary_classification_trainer.py
@@ -49,6 +49,7 @@ class TextBinaryClassifierTrainer(BaseTrainer):
                 logger.warning("The dimension of features doesn't match. A new model instance will be created.")
                 self.model = None
         if self.model is None:
+            ffn_heads = kwargs.get('ffn_heads', 2)
             ffn_layers = kwargs.get('ffn_layers', 5)
             ffn_expansion_factor = kwargs.get('ffn_expansion_factor', 2)
             bias = kwargs.get('bias', True)
@@ -63,11 +64,11 @@
             self.model = LongContextRecursiveSequential(input_dim=feature_dim, output_dim=1, bias=bias,
                                                         encoder_layers=encoder_layers, attn_heads=attn_heads,
                                                         recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
-                                                        ffn_layers=ffn_layers,
-
-
-
-                                                        theta=theta).initialize_weights()
+                                                        ffn_layers=ffn_layers, ffn_heads=ffn_heads, ffn_expansion_factor=ffn_expansion_factor,
+                                                        dropout_rate=dropout_rate, encoder_ffn_layers=encoder_ffn_layers,
+                                                        encoder_expansion_factor=encoder_expansion_factor, encoder_dropout_rate=encoder_dropout_rate,
+                                                        attn_ffn_layers=attn_ffn_layers, attn_expansion_factor=attn_expansion_factor,
+                                                        attn_dropout_rate=attn_dropout_rate, theta=theta).initialize_weights()
             logger.debug(f'Training Model: \n{self.model}')
             loss_function = nn.BCELoss()
             optimizer = optim.Adamax(self.model.parameters(), lr=learning_rate)
--- deeplotx-0.8.2/deeplotx.egg-info/PKG-INFO
+++ deeplotx-0.8.5/deeplotx.egg-info/PKG-INFO
(Identical to the PKG-INFO changes shown above.)
--- deeplotx-0.8.2/deeplotx.egg-info/SOURCES.txt
+++ deeplotx-0.8.5/deeplotx.egg-info/SOURCES.txt
@@ -21,6 +21,7 @@ deeplotx/nn/logistic_regression.py
 deeplotx/nn/long_context_auto_regression.py
 deeplotx/nn/long_context_recursive_sequential.py
 deeplotx/nn/multi_head_attention.py
+deeplotx/nn/multi_head_feed_forward.py
 deeplotx/nn/recursive_sequential.py
 deeplotx/nn/roformer_encoder.py
 deeplotx/nn/rope.py
--- deeplotx-0.8.2/pyproject.toml
+++ deeplotx-0.8.5/pyproject.toml
@@ -1,17 +1,17 @@
-[project]
-name = "deeplotx"
-version = "0.8.2"
-description = "Easy-2-use long text NLP toolkit."
-readme = "README.md"
-requires-python = ">=3.10"
-dependencies = [
-    "hf-xet",
-    "jupyter",
-    "numpy",
-    "protobuf",
-    "python-dotenv",
-    "torch",
-    "transformers",
-    "typing-extensions",
-    "vortezwohl>=0.0.8",
-]
+[project]
+name = "deeplotx"
+version = "0.8.5"
+description = "Easy-2-use long text NLP toolkit."
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "hf-xet",
+    "jupyter",
+    "numpy",
+    "protobuf",
+    "python-dotenv",
+    "torch",
+    "transformers",
+    "typing-extensions",
+    "vortezwohl>=0.0.8",
+]
--- deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import logging
-import os
-
-import torch
-from torch import nn
-from transformers import LongformerTokenizer, LongformerModel
-
-from deeplotx import __ROOT__
-
-CACHE_PATH = os.path.join(__ROOT__, '.cache')
-DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
-logger = logging.getLogger('deeplotx.embedding')
-
-
-class LongformerEncoder(nn.Module):
-    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
-        super().__init__()
-        self.device = torch.device(device) if device is not None \
-            else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                             cache_dir=CACHE_PATH, _from_auto=True)
-        self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                    cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
-        logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
-
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        ori_mode = self.bert.training
-        self.bert.eval()
-        with torch.no_grad():
-            res = self.bert.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
-        self.bert.train(mode=ori_mode)
-        return res
-
-    def encode(self, text: str) -> torch.Tensor:
-        _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
-        _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-        return self.forward(_input_ids, _att_mask).squeeze()