deeplotx 0.8.2__tar.gz → 0.8.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {deeplotx-0.8.2 → deeplotx-0.8.3}/PKG-INFO +53 -37
- {deeplotx-0.8.2 → deeplotx-0.8.3}/README.md +52 -36
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/encoder/long_text_encoder.py +13 -8
- deeplotx-0.8.3/deeplotx/encoder/longformer_encoder.py +55 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/PKG-INFO +53 -37
- {deeplotx-0.8.2 → deeplotx-0.8.3}/pyproject.toml +1 -1
- deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py +0 -37
- {deeplotx-0.8.2 → deeplotx-0.8.3}/LICENSE +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/encoder/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/encoder/encoder.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/attention.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/auto_regression.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/base_neural_network.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/feed_forward.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/linear_regression.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/logistic_regression.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/long_context_auto_regression.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/multi_head_attention.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/recursive_sequential.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/roformer_encoder.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/rope.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/nn/softmax_regression.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/distribution.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/set.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/similarity/vector.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/trainer/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/trainer/base_trainer.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/util/__init__.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/util/hash.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx/util/read_file.py +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/SOURCES.txt +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/dependency_links.txt +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/requires.txt +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/deeplotx.egg-info/top_level.txt +0 -0
- {deeplotx-0.8.2 → deeplotx-0.8.3}/setup.cfg +0 -0
--- deeplotx-0.8.2/PKG-INFO
+++ deeplotx-0.8.3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.2
+Version: 0.8.3
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -48,24 +48,23 @@ Dynamic: license-file
 
 - ### Long Text Embedding
 
-- **Long text embedding based on a general BERT model** (maximum supported length: unlimited,
+- **Long text embedding based on a general BERT model** (maximum supported length: unlimited; can be capped via max_length)
 
 ```python
 from deeplotx import LongTextEncoder
 
-#
+# Chunk size of 448 tokens, with 32 tokens of overlap between chunks.
 encoder = LongTextEncoder(
-
-
-    overlapping=64
+    chunk_size=448,
+    overlapping=32
 )
-# Compute the embedding of "我是吴子豪, 这是一个测试文本.",
-encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=
+# Compute the embedding of "我是吴子豪, 这是一个测试文本." and keep the chunk embeddings stacked.
+encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
 ```
 
 Output:
 ```
-tensor([
+tensor([ 2.2316e-01,  2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
 ```
 
 - **Long text embedding based on Longformer** (maximum supported length: 4096 tokens)
@@ -77,6 +76,11 @@ Dynamic: license-file
 encoder.encode('我是吴子豪, 这是一个测试文本.')
 ```
 
+Output:
+```
+tensor([-2.7490e-02,  6.6503e-02, ..., -6.5937e-02,  6.7802e-03])
+```
+
 - ### Similarity Computation
 
 - **Vector-based similarity**
@@ -163,14 +167,16 @@ Dynamic: license-file
 
 ```python
 from deeplotx import (
-    BaseNeuralNetwork,  # base class for deep neural networks
     FeedForward,  # feed-forward network
     LinearRegression,  # linear regression
     LogisticRegression,  # logistic regression / binary classification / multi-label classification
     SoftmaxRegression,  # softmax regression / multi-class classification
     RecursiveSequential,  # sequence model / recurrent neural network
     LongContextRecursiveSequential,  # long-context sequence model / RNN fused with self-attention
-
+    RoPE,  # RoPE positional encoding
+    Attention,  # self-attention / cross-attention
+    MultiHeadAttention,  # parallel multi-head attention
+    RoFormerEncoder,  # RoFormer (Transformer + RoPE) encoder model
     AutoRegression,  # autoregressive model / recurrent neural network
     LongContextAutoRegression  # long-context autoregressive model / RNN fused with self-attention
 )
@@ -193,13 +199,13 @@ Dynamic: license-file
                  device: str | None = None, dtype: torch.dtype | None = None):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
         self._dropout_rate = dropout_rate
-        self.
-
-        self.
-
-        self.
+        self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+                                 bias=bias, device=self.device, dtype=self.dtype)
+        self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+                                   bias=bias, device=self.device, dtype=self.dtype)
+        self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
                                         device=self.device, dtype=self.dtype)
-        self.layer_norm = nn.LayerNorm(normalized_shape=self.
+        self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
                                        device=self.device, dtype=self.dtype)
 
     @override
@@ -207,11 +213,11 @@ Dynamic: license-file
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
         residual = x
         x = self.layer_norm(x)
-        x = self.
-        x = self.
+        x = self.up_proj(x)
+        x = self.parametric_relu(x)
         if self._dropout_rate > .0:
             x = torch.dropout(x, p=self._dropout_rate, train=self.training)
-        return self.
+        return self.down_proj(x) + residual
 
 
 class FeedForward(BaseNeuralNetwork):
@@ -224,7 +230,7 @@ Dynamic: license-file
         self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
                                                           expansion_factor=expansion_factor, bias=bias,
                                                           dropout_rate=dropout_rate,
-                                                          device=self.device, dtype=self.dtype)
+                                                          device=self.device, dtype=self.dtype) for _ in range(num_layers)])
 
     @override
     def forward(self, x: torch.Tensor) -> torch.Tensor:
@@ -234,7 +240,7 @@ Dynamic: license-file
         return x
 ```
 
-
+Attention module:
 
 ```python
 from typing_extensions import override
@@ -243,14 +249,17 @@ Dynamic: license-file
 
 from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 from deeplotx.nn.feed_forward import FeedForward
+from deeplotx.nn.rope import RoPE, DEFAULT_THETA
 
 
-class
-    def __init__(self, feature_dim: int, bias: bool = True,
-                 proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None
+class Attention(BaseNeuralNetwork):
+    def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+                 proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+                 **kwargs):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
                          device=device, dtype=dtype)
+        self._positional = positional
         self._feature_dim = feature_dim
         self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
@@ -261,21 +270,27 @@ Dynamic: license-file
         self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
                                   bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+        if self._positional:
+            self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+                             device=self.device, dtype=self.dtype)
 
-    def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
-        q, k = self.q_proj(x), self.k_proj(
+    def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        q, k = self.q_proj(x), self.k_proj(y)
+        if self._positional:
+            q, k = self.rope(q), self.rope(k)
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
-    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+        y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
         if mask is not None:
             mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
-        v = self.v_proj(
-        return torch.matmul(self._attention(x, mask), v)
+        v = self.v_proj(y)
+        return torch.matmul(self._attention(x, y, mask), v)
 ```
 
 - ### Using the predefined trainer for text binary classification
@@ -284,7 +299,7 @@ Dynamic: license-file
 from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
 from deeplotx.util import get_files, read_file
 
-# Define the embedding strategy (by default uses
+# Define the embedding strategy (by default uses FacebookAI/xlm-roberta-base as the embedding model)
 long_text_encoder = LongTextEncoder(
     max_length=2048,  # maximum text length; longer inputs are truncated
     chunk_size=448,  # chunk size (in tokens)
@@ -306,10 +321,11 @@ Dynamic: license-file
 
 # Start training
 model = trainer.train(pos_data, neg_data,
-
-
-
-
+                      num_epochs=36, learning_rate=2e-5,
+                      balancing_dataset=True, alpha=1e-4,
+                      rho=.2, encoder_layers=2,  # 2 RoFormer encoder layers
+                      attn_heads=8,  # 8 attention heads
+                      recursive_layers=2)  # 2 Bi-LSTM layers
 
 # Save model weights
 model.save(model_name='test_model', model_dir='model')
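The PKG-INFO hunks above show the main API additions in 0.8.3: `Attention` gains an optional cross-attention input `y`, RoPE positional encoding, and a configurable projection depth. A minimal sketch of the new call pattern, assuming the `Attention` class and constructor defaults shown in the diff (tensor shapes are illustrative only):

```python
import torch
from deeplotx import Attention

# Assumes the 0.8.3 signature shown above: forward(x, y=None, mask=None).
attn = Attention(feature_dim=768, positional=True)  # RoPE is applied to q and k

x = torch.randn(2, 16, 768)  # query sequence
y = torch.randn(2, 24, 768)  # key/value sequence

self_out = attn(x)      # y defaults to x, i.e. self-attention
cross_out = attn(x, y)  # a separate y gives cross-attention
print(self_out.shape, cross_out.shape)  # both torch.Size([2, 16, 768])
```

In both cases the output keeps the query sequence's shape, since the attention weights are applied to `v_proj(y)`.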
--- deeplotx-0.8.2/README.md
+++ deeplotx-0.8.3/README.md
(The README.md changes are identical to the PKG-INFO changes shown above, minus the metadata header: the same 11 hunks appear at offsets @@ -30,24 +30,23 @@ through @@ -288,10 +303,11 @@.)
--- deeplotx-0.8.2/deeplotx/encoder/long_text_encoder.py
+++ deeplotx-0.8.3/deeplotx/encoder/long_text_encoder.py
@@ -13,9 +13,9 @@ logger = logging.getLogger('deeplotx.embedding')
 
 
 class LongTextEncoder(Encoder):
-    def __init__(self,
-
-
+    def __init__(self, chunk_size: int = 448, overlapping: int = 32, max_length: int = -1,
+                 model_name_or_path: str = DEFAULT_BERT, cache_capacity: int = 64,
+                 max_workers: int = 8, device: str | None = None):
         super().__init__(model_name_or_path=model_name_or_path, device=device)
         assert overlapping < chunk_size, f'overlapping ({overlapping}) must be less than chunk size ({chunk_size}).'
         self._max_length = max_length
@@ -41,23 +41,28 @@ class LongTextEncoder(Encoder):
                 _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
             return _fin_emb_tensor.squeeze()
 
+        _tmp_max_length = self._max_length
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
         _text_hash = sha512(text)
         if _text_hash in self._cache:
             return postprocess(self._cache[_text_hash], flatten)
-        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        # variable length
+        if _tmp_max_length < 0:
+            _tmp_max_length = len(_text_to_input_ids)
+        _text_to_input_ids = _text_to_input_ids[:_tmp_max_length]
         _text_to_input_ids_att_mask = []
         # padding
         pad_token = self.tokenizer.pad_token_type_id
-        if len(_text_to_input_ids) <
-            _text_to_input_ids.extend([pad_token] * (
+        if len(_text_to_input_ids) < _tmp_max_length:
+            _text_to_input_ids.extend([pad_token] * (_tmp_max_length - len(_text_to_input_ids)))
         pads = _text_to_input_ids.count(pad_token)
-        non_pads =
+        non_pads = _tmp_max_length - pads
         _text_to_input_ids_att_mask.extend([1] * non_pads)
         _text_to_input_ids_att_mask.extend([0] * pads)
-        num_chunks = math.ceil(
+        num_chunks = math.ceil(_tmp_max_length / self._chunk_size)
         # split chunks
         chunks = []
         for i in range(num_chunks):
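These hunks give `LongTextEncoder.__init__` the defaults `chunk_size=448`, `overlapping=32`, and `max_length=-1`, and make the encode path variable-length: when `max_length` is negative, `_tmp_max_length` falls back to the token count of the input, so padding and the number of chunks adapt per text. A small sketch of the two modes, assuming the 0.8.3 constructor shown above (model downloads and the resulting tensors depend on the environment):

```python
from deeplotx import LongTextEncoder

# Variable-length mode (new default): max_length=-1, so the number of chunks is
# ceil(num_tokens / chunk_size) for each individual text.
flexible_encoder = LongTextEncoder()  # chunk_size=448, overlapping=32, max_length=-1

# Fixed-length mode (previous behaviour, still available): every text is truncated
# or padded to 2048 tokens before chunking.
fixed_encoder = LongTextEncoder(max_length=2048, chunk_size=448, overlapping=32)

emb_a = flexible_encoder.encode('我是吴子豪, 这是一个测试文本.')
emb_b = fixed_encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)  # keep chunk embeddings stacked
```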
--- /dev/null
+++ deeplotx-0.8.3/deeplotx/encoder/longformer_encoder.py
@@ -0,0 +1,55 @@
+import logging
+import os
+
+import torch
+from torch import nn
+from transformers import AutoModel, AutoTokenizer
+from requests.exceptions import ConnectTimeout, SSLError
+
+from deeplotx import __ROOT__
+
+CACHE_PATH = os.path.join(__ROOT__, '.cache')
+DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+logger = logging.getLogger('deeplotx.embedding')
+
+
+class LongformerEncoder(nn.Module):
+    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
+        super().__init__()
+        self.device = torch.device(device) if device is not None \
+            else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True).to(self.device)
+        except ConnectTimeout:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
+        except SSLError:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
+        logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
+
+    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        ori_mode = self.encoder.training
+        self.encoder.eval()
+        with torch.no_grad():
+            res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+        self.encoder.train(mode=ori_mode)
+        return res
+
+    def encode(self, text: str) -> torch.Tensor:
+        _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
+        _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
+        return self.forward(_input_ids, _att_mask).squeeze()
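The rewritten `longformer_encoder.py` replaces the `LongformerTokenizer`/`LongformerModel` pair from 0.8.2 (the deleted module appears further below) with `AutoTokenizer`/`AutoModel`, adds `trust_remote_code=True`, and falls back to `local_files_only=True` on `ConnectTimeout` or `SSLError`, so a previously cached checkpoint still loads when the network is unavailable. A minimal usage sketch, assuming the module path and default checkpoint shown in the diff (the first call downloads `allenai/longformer-base-4096` into deeplotx's `.cache` directory):

```python
from deeplotx.encoder.longformer_encoder import LongformerEncoder

encoder = LongformerEncoder()  # defaults to 'allenai/longformer-base-4096'
embedding = encoder.encode('我是吴子豪, 这是一个测试文本.')
print(embedding.shape)  # 1-D [CLS] embedding; torch.Size([768]) for the base checkpoint
```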
--- deeplotx-0.8.2/deeplotx.egg-info/PKG-INFO
+++ deeplotx-0.8.3/deeplotx.egg-info/PKG-INFO
(Identical to the top-level PKG-INFO diff shown above: the version bump in @@ -1,6 +1,6 @@ plus the same README hunks.)
--- deeplotx-0.8.2/deeplotx/encoder/longformer_encoder.py
+++ /dev/null
@@ -1,37 +0,0 @@
-import logging
-import os
-
-import torch
-from torch import nn
-from transformers import LongformerTokenizer, LongformerModel
-
-from deeplotx import __ROOT__
-
-CACHE_PATH = os.path.join(__ROOT__, '.cache')
-DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
-logger = logging.getLogger('deeplotx.embedding')
-
-
-class LongformerEncoder(nn.Module):
-    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
-        super().__init__()
-        self.device = torch.device(device) if device is not None \
-            else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                             cache_dir=CACHE_PATH, _from_auto=True)
-        self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                    cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
-        logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
-
-    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        ori_mode = self.bert.training
-        self.bert.eval()
-        with torch.no_grad():
-            res = self.bert.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
-        self.bert.train(mode=ori_mode)
-        return res
-
-    def encode(self, text: str) -> torch.Tensor:
-        _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long, device=self.device)
-        _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int, device=self.device)
-        return self.forward(_input_ids, _att_mask).squeeze()
(The remaining files listed above with +0 -0 are unchanged between 0.8.2 and 0.8.3.)