deeplotx 0.8.1__py3-none-any.whl → 0.8.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- deeplotx/encoder/long_text_encoder.py +13 -8
- deeplotx/encoder/longformer_encoder.py +27 -9
- deeplotx/nn/attention.py +1 -1
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/METADATA +53 -37
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/RECORD +8 -8
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/WHEEL +0 -0
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/licenses/LICENSE +0 -0
- {deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/top_level.txt +0 -0
deeplotx/encoder/long_text_encoder.py
CHANGED
@@ -13,9 +13,9 @@ logger = logging.getLogger('deeplotx.embedding')
 
 
 class LongTextEncoder(Encoder):
-    def __init__(self,
-
-
+    def __init__(self, chunk_size: int = 448, overlapping: int = 32, max_length: int = -1,
+                 model_name_or_path: str = DEFAULT_BERT, cache_capacity: int = 64,
+                 max_workers: int = 8, device: str | None = None):
         super().__init__(model_name_or_path=model_name_or_path, device=device)
         assert overlapping < chunk_size, f'overlapping ({overlapping}) must be less than chunk size ({chunk_size}).'
         self._max_length = max_length
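The new constructor makes the chunking parameters and the length cap explicit, with `max_length` now defaulting to `-1` (no truncation). A minimal usage sketch based on the 0.8.3 signature above (parameter values are illustrative):

```python
from deeplotx import LongTextEncoder

# max_length=-1 (the new default) keeps the full tokenized text instead of truncating it.
encoder = LongTextEncoder(chunk_size=448, overlapping=32, max_length=-1)

# A positive max_length still caps the input; tokens beyond the cap are dropped.
capped_encoder = LongTextEncoder(chunk_size=448, overlapping=32, max_length=2048)
```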
@@ -41,23 +41,28 @@ class LongTextEncoder(Encoder):
             _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
             return _fin_emb_tensor.squeeze()
 
+        _tmp_max_length = self._max_length
         _text_to_show = text.replace("\n", str())
         logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
         # read cache
         _text_hash = sha512(text)
         if _text_hash in self._cache:
             return postprocess(self._cache[_text_hash], flatten)
-        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        _text_to_input_ids = self.tokenizer.encode(text.strip())
+        # variable length
+        if _tmp_max_length < 0:
+            _tmp_max_length = len(_text_to_input_ids)
+        _text_to_input_ids = _text_to_input_ids[:_tmp_max_length]
         _text_to_input_ids_att_mask = []
         # padding
         pad_token = self.tokenizer.pad_token_type_id
-        if len(_text_to_input_ids) <
-            _text_to_input_ids.extend([pad_token] * (
+        if len(_text_to_input_ids) < _tmp_max_length:
+            _text_to_input_ids.extend([pad_token] * (_tmp_max_length - len(_text_to_input_ids)))
         pads = _text_to_input_ids.count(pad_token)
-        non_pads =
+        non_pads = _tmp_max_length - pads
         _text_to_input_ids_att_mask.extend([1] * non_pads)
         _text_to_input_ids_att_mask.extend([0] * pads)
-        num_chunks = math.ceil(
+        num_chunks = math.ceil(_tmp_max_length / self._chunk_size)
         # split chunks
         chunks = []
         for i in range(num_chunks):
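In the rewritten encode path, an effective length is derived from the input when `max_length < 0`, and the padding, attention mask, and chunk count are all computed from that effective length. A standalone sketch of the same arithmetic (illustrative only; the function name and arguments are stand-ins, not the library's internals):

```python
import math

def pad_and_mask(input_ids: list[int], pad_token: int, max_length: int, chunk_size: int):
    # max_length < 0 means "variable length": keep the whole tokenized text.
    effective_len = len(input_ids) if max_length < 0 else max_length
    ids = input_ids[:effective_len]
    # Pad up to the effective length.
    if len(ids) < effective_len:
        ids = ids + [pad_token] * (effective_len - len(ids))
    pads = ids.count(pad_token)
    attention_mask = [1] * (effective_len - pads) + [0] * pads
    # Number of chunks the padded sequence is split into.
    num_chunks = math.ceil(effective_len / chunk_size)
    return ids, attention_mask, num_chunks
```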
deeplotx/encoder/longformer_encoder.py
CHANGED
@@ -3,7 +3,8 @@ import os
 
 import torch
 from torch import nn
-from transformers import
+from transformers import AutoModel, AutoTokenizer
+from requests.exceptions import ConnectTimeout, SSLError
 
 from deeplotx import __ROOT__
 
@@ -17,18 +18,35 @@ class LongformerEncoder(nn.Module):
         super().__init__()
         self.device = torch.device(device) if device is not None \
             else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-
-
-
+        try:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True).to(self.device)
+        except ConnectTimeout:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
+        except SSLError:
+            self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                           cache_dir=CACHE_PATH, _from_auto=True,
+                                                           trust_remote_code=True, local_files_only=True)
+            self.encoder = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
+                                                     cache_dir=CACHE_PATH, _from_auto=True,
+                                                     trust_remote_code=True, local_files_only=True).to(self.device)
         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
 
     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-        ori_mode = self.
-        self.
+        ori_mode = self.encoder.training
+        self.encoder.eval()
         with torch.no_grad():
-            res = self.
-            self.
+            res = self.encoder.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
+            self.encoder.train(mode=ori_mode)
         return res
 
     def encode(self, text: str) -> torch.Tensor:
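The 0.8.3 constructor keeps working without network access: when fetching from the Hugging Face Hub raises ConnectTimeout or SSLError, it retries with `local_files_only=True` so a previously downloaded copy in the cache directory is reused. A condensed sketch of that fallback pattern (the helper name and the single combined except clause are mine; the library handles the two exceptions in separate branches, as shown above):

```python
from requests.exceptions import ConnectTimeout, SSLError
from transformers import AutoModel, AutoTokenizer

def load_encoder(model_name_or_path: str, cache_dir: str, device: str = 'cpu'):
    """Load tokenizer and model from the Hub, falling back to the local cache on network errors."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                                  trust_remote_code=True)
        model = AutoModel.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                          trust_remote_code=True)
    except (ConnectTimeout, SSLError):
        # Offline or behind a broken proxy: reuse whatever is already cached locally.
        tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                                  trust_remote_code=True, local_files_only=True)
        model = AutoModel.from_pretrained(model_name_or_path, cache_dir=cache_dir,
                                          trust_remote_code=True, local_files_only=True)
    return tokenizer, model.to(device)
```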
deeplotx/nn/attention.py
CHANGED
@@ -36,7 +36,7 @@ class Attention(BaseNeuralNetwork):
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
     def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
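The only change here pins the softmax output to the module's configured dtype rather than whatever dtype `attn` happens to carry, keeping the attention weights in the module's working precision (e.g. when it runs in float16). A standalone illustration of the effect (not deeplotx code):

```python
import torch

attn = torch.randn(2, 4, 4, dtype=torch.float32)  # attention scores
module_dtype = torch.float16                       # the module's working precision

# Without dtype=..., softmax returns float32 here; with it, the weights
# come back in the requested dtype.
weights = torch.softmax(attn, dim=-1, dtype=module_dtype)
assert weights.dtype == module_dtype
```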
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.8.1
+Version: 0.8.3
 Summary: Easy-2-use long text NLP toolkit.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -48,24 +48,23 @@ Dynamic: license-file
 
 - ### Long Text Embedding
 
-- **Long text embedding based on a general-purpose BERT** (maximum supported length: unlimited,
+- **Long text embedding based on a general-purpose BERT** (maximum supported length: unlimited; can be capped via max_length)
 
 ```python
 from deeplotx import LongTextEncoder
 
-#
+# Chunk size of 448 tokens, with a 32-token overlap between chunks.
 encoder = LongTextEncoder(
-
-
-    overlapping=64
+    chunk_size=448,
+    overlapping=32
 )
-# Compute the embedding of "我是吴子豪, 这是一个测试文本.",
-encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=
+# Compute the embedding of "我是吴子豪, 这是一个测试文本." and stack the chunk embeddings.
+encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=False)
 ```
 
 Output:
 ```
-tensor([
+tensor([ 2.2316e-01,  2.0300e-01, ..., 1.5578e-01, -6.6735e-02])
 ```
 
 - **Long text embedding based on Longformer** (maximum supported length: 4096 tokens)
@@ -77,6 +76,11 @@ Dynamic: license-file
 encoder.encode('我是吴子豪, 这是一个测试文本.')
 ```
 
+Output:
+```
+tensor([-2.7490e-02, 6.6503e-02, ..., -6.5937e-02, 6.7802e-03])
+```
+
 - ### Similarity Computation
 
 - **Vector-based similarity**
@@ -163,14 +167,16 @@ Dynamic: license-file
 
 ```python
 from deeplotx import (
-    BaseNeuralNetwork, # base class for deep neural networks
     FeedForward, # feed-forward network
     LinearRegression, # linear regression
     LogisticRegression, # logistic regression / binary classification / multi-label classification
     SoftmaxRegression, # softmax regression / multi-class classification
     RecursiveSequential, # sequence model / recurrent neural network
     LongContextRecursiveSequential, # long-context sequence model / RNN fused with self-attention
-
+    RoPE, # RoPE positional encoding
+    Attention, # self-attention / cross-attention
+    MultiHeadAttention, # parallel multi-head attention
+    RoFormerEncoder, # RoFormer (Transformer + RoPE) encoder model
     AutoRegression, # autoregressive model / recurrent neural network
     LongContextAutoRegression # long-context autoregressive model / RNN fused with self-attention
 )
@@ -193,13 +199,13 @@ Dynamic: license-file
                  device: str | None = None, dtype: torch.dtype | None = None):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name, device=device, dtype=dtype)
         self._dropout_rate = dropout_rate
-        self.
-
-        self.
-
-        self.
+        self.up_proj = nn.Linear(in_features=feature_dim, out_features=int(feature_dim * expansion_factor),
+                                 bias=bias, device=self.device, dtype=self.dtype)
+        self.down_proj = nn.Linear(in_features=int(feature_dim * expansion_factor), out_features=feature_dim,
+                                   bias=bias, device=self.device, dtype=self.dtype)
+        self.parametric_relu = nn.PReLU(num_parameters=1, init=5e-3,
                                         device=self.device, dtype=self.dtype)
-        self.layer_norm = nn.LayerNorm(normalized_shape=self.
+        self.layer_norm = nn.LayerNorm(normalized_shape=self.up_proj.in_features, eps=1e-9,
                                        device=self.device, dtype=self.dtype)
 
     @override
@@ -207,11 +213,11 @@ Dynamic: license-file
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
         residual = x
         x = self.layer_norm(x)
-        x = self.
-        x = self.
+        x = self.up_proj(x)
+        x = self.parametric_relu(x)
         if self._dropout_rate > .0:
             x = torch.dropout(x, p=self._dropout_rate, train=self.training)
-        return self.
+        return self.down_proj(x) + residual
 
 
 class FeedForward(BaseNeuralNetwork):
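The README excerpt above describes a pre-norm residual feed-forward unit: LayerNorm, an up-projection, PReLU and optional dropout, then a down-projection added back onto the residual. A minimal self-contained PyTorch sketch of that pattern (a plain nn.Module, not the actual BaseNeuralNetwork subclass):

```python
import torch
from torch import nn

class SimpleFeedForwardUnit(nn.Module):
    """Pre-norm residual FFN: x + down(act(up(norm(x)))), mirroring the excerpt above."""
    def __init__(self, feature_dim: int, expansion_factor: float = 2.0, dropout_rate: float = 0.02):
        super().__init__()
        hidden_dim = int(feature_dim * expansion_factor)
        self.layer_norm = nn.LayerNorm(feature_dim, eps=1e-9)
        self.up_proj = nn.Linear(feature_dim, hidden_dim)
        self.act = nn.PReLU(num_parameters=1, init=5e-3)
        self.down_proj = nn.Linear(hidden_dim, feature_dim)
        self.dropout_rate = dropout_rate

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        residual = x
        x = self.act(self.up_proj(self.layer_norm(x)))
        if self.dropout_rate > 0:
            x = torch.dropout(x, p=self.dropout_rate, train=self.training)
        return self.down_proj(x) + residual
```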
@@ -224,7 +230,7 @@ Dynamic: license-file
         self.ffn_layers = nn.ModuleList([FeedForwardUnit(feature_dim=feature_dim,
                                                           expansion_factor=expansion_factor, bias=bias,
                                                           dropout_rate=dropout_rate,
-                                                          device=self.device, dtype=self.dtype)
+                                                          device=self.device, dtype=self.dtype) for _ in range(num_layers)])
 
     @override
     def forward(self, x: torch.Tensor) -> torch.Tensor:
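The corrected line builds one feed-forward unit per layer inside the ModuleList comprehension. A short stacking sketch of the same idea, reusing SimpleFeedForwardUnit from the sketch above (again illustrative, not the library class):

```python
import torch
from torch import nn

class SimpleFeedForward(nn.Module):
    """Stack num_layers feed-forward units and apply them in order."""
    def __init__(self, feature_dim: int, num_layers: int = 2):
        super().__init__()
        self.ffn_layers = nn.ModuleList([SimpleFeedForwardUnit(feature_dim) for _ in range(num_layers)])

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        for layer in self.ffn_layers:
            x = layer(x)
        return x
```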
@@ -234,7 +240,7 @@ Dynamic: license-file
         return x
 ```
 
-
+Attention module:
 
 ```python
 from typing_extensions import override
@@ -243,14 +249,17 @@ Dynamic: license-file
 
 from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 from deeplotx.nn.feed_forward import FeedForward
+from deeplotx.nn.rope import RoPE, DEFAULT_THETA
 
 
-class
-    def __init__(self, feature_dim: int, bias: bool = True,
-                 proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
-                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None
+class Attention(BaseNeuralNetwork):
+    def __init__(self, feature_dim: int, bias: bool = True, positional: bool = True,
+                 proj_layers: int = 1, proj_expansion_factor: int | float = 1.5, dropout_rate: float = 0.02,
+                 model_name: str | None = None, device: str | None = None, dtype: torch.dtype | None = None,
+                 **kwargs):
         super().__init__(in_features=feature_dim, out_features=feature_dim, model_name=model_name,
                          device=device, dtype=dtype)
+        self._positional = positional
         self._feature_dim = feature_dim
         self.q_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
@@ -261,21 +270,27 @@ Dynamic: license-file
         self.v_proj = FeedForward(feature_dim=self._feature_dim, num_layers=proj_layers,
                                   expansion_factor=proj_expansion_factor,
                                   bias=bias, dropout_rate=dropout_rate, device=self.device, dtype=self.dtype)
+        if self._positional:
+            self.rope = RoPE(feature_dim=self._feature_dim, theta=kwargs.get('theta', DEFAULT_THETA),
+                             device=self.device, dtype=self.dtype)
 
-    def _attention(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
-        q, k = self.q_proj(x), self.k_proj(
+    def _attention(self, x: torch.Tensor, y: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+        q, k = self.q_proj(x), self.k_proj(y)
+        if self._positional:
+            q, k = self.rope(q), self.rope(k)
         attn = torch.matmul(q, k.transpose(-2, -1))
         attn = attn / (self._feature_dim ** 0.5)
         attn = attn.masked_fill(mask == 0, -1e9) if mask is not None else attn
-        return torch.softmax(attn, dim=-1)
+        return torch.softmax(attn, dtype=self.dtype, dim=-1)
 
     @override
-    def forward(self, x: torch.Tensor, mask: torch.Tensor | None = None) -> torch.Tensor:
+    def forward(self, x: torch.Tensor, y: torch.Tensor | None = None, mask: torch.Tensor | None = None) -> torch.Tensor:
         x = self.ensure_device_and_dtype(x, device=self.device, dtype=self.dtype)
+        y = x if y is None else self.ensure_device_and_dtype(y, device=self.device, dtype=self.dtype)
         if mask is not None:
             mask = self.ensure_device_and_dtype(mask, device=self.device, dtype=self.dtype)
-        v = self.v_proj(
-        return torch.matmul(self._attention(x, mask), v)
+        v = self.v_proj(y)
+        return torch.matmul(self._attention(x, y, mask), v)
 ```
 
 - ### Text binary classification with the predefined trainer
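As the two hunks above show, the README's Attention class now takes an optional second input: with only x it does self-attention, and with a separate y it attends from x over y, applying RoPE to the queries and keys when positional=True. A hedged usage sketch (tensor shapes are illustrative; it assumes the Attention export added to deeplotx's top-level imports earlier in this METADATA diff):

```python
import torch
from deeplotx import Attention

attn = Attention(feature_dim=256, positional=True)

x = torch.randn(2, 16, 256)  # queries are projected from x
y = torch.randn(2, 24, 256)  # keys and values are projected from y when it is given

self_out = attn(x)      # y defaults to x, so this is self-attention
cross_out = attn(x, y)  # cross-attention: x attends over y
```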
@@ -284,7 +299,7 @@ Dynamic: license-file
 from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
 from deeplotx.util import get_files, read_file
 
-# Define the vector-encoding strategy (by default,
+# Define the vector-encoding strategy (FacebookAI/xlm-roberta-base is used as the embedding model by default)
 long_text_encoder = LongTextEncoder(
     max_length=2048, # maximum text length in tokens; longer inputs are truncated
     chunk_size=448, # chunk size (in tokens)
@@ -306,10 +321,11 @@ Dynamic: license-file
 
 # Start training
 model = trainer.train(pos_data, neg_data,
-
-
-
-
+                      num_epochs=36, learning_rate=2e-5,
+                      balancing_dataset=True, alpha=1e-4,
+                      rho=.2, encoder_layers=2, # 2 RoFormer encoder layers
+                      attn_heads=8, # 8 attention heads
+                      recursive_layers=2) # 2 Bi-LSTM layers
 
 # Save model weights
 model.save(model_name='test_model', model_dir='model')
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/RECORD
CHANGED
@@ -1,10 +1,10 @@
 deeplotx/__init__.py,sha256=oNeA-vNu5YGiEQg0IcpKEdGh_Y_2uPvo2nqaNL_Zgv8,1159
 deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
 deeplotx/encoder/encoder.py,sha256=oSBdA-MiwMKNfTFJWR-RdvNS0G0qfX-Qchwy4LuwB00,3985
-deeplotx/encoder/long_text_encoder.py,sha256=
-deeplotx/encoder/longformer_encoder.py,sha256=
+deeplotx/encoder/long_text_encoder.py,sha256=3ScdKDi65J5tdO8PFCXBjCzNUCLlJRwVhpDR0BrphG4,3951
+deeplotx/encoder/longformer_encoder.py,sha256=NNYLr5I9tdeh0C8Ir7QcbEMU9gDk6U7CiF3Tbg6NEsE,3372
 deeplotx/nn/__init__.py,sha256=01I_yqx9GTa4wy3uNyAqhtxp66tDqxgMLC4Ky5Vnkrg,651
-deeplotx/nn/attention.py,sha256=
+deeplotx/nn/attention.py,sha256=R-i-Rd7gnsh6hwXDeYfqLQOJvfSZIGfQbFzRlC91XLo,2879
 deeplotx/nn/auto_regression.py,sha256=uISx29t_zkDGS8s2wvGB6wOGYZitQ4hQ7wyoQl4lcqY,857
 deeplotx/nn/base_neural_network.py,sha256=FjQEDFH810fJS7JV3aLgJZnaMqC6DH--wlBvuj-ghTc,5900
 deeplotx/nn/feed_forward.py,sha256=4ozj7EDalO9pb6JUhZtsJqE0r8bIHFApHRt2zTrl4ho,2931
@@ -27,8 +27,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=QMLR4cC8NCUP-v7SOY
 deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
 deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
 deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
-deeplotx-0.8.
-deeplotx-0.8.
-deeplotx-0.8.
-deeplotx-0.8.
-deeplotx-0.8.
+deeplotx-0.8.3.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.8.3.dist-info/METADATA,sha256=Lif2B7wUDIQQKWvUt_Vl_XYPlMf_EhskiQcq8ZYv6TQ,13079
+deeplotx-0.8.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deeplotx-0.8.3.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.8.3.dist-info/RECORD,,
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/WHEEL
File without changes
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/licenses/LICENSE
File without changes
{deeplotx-0.8.1.dist-info → deeplotx-0.8.3.dist-info}/top_level.txt
File without changes