deeplotx 0.4.7__tar.gz → 0.4.9__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeplotx-0.4.9/PKG-INFO +211 -0
- deeplotx-0.4.9/README.md +194 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/encoder/bert_encoder.py +8 -2
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/encoder/long_text_encoder.py +5 -5
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/encoder/longformer_encoder.py +8 -2
- deeplotx-0.4.9/deeplotx.egg-info/PKG-INFO +211 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx.egg-info/requires.txt +1 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/pyproject.toml +2 -1
- deeplotx-0.4.7/PKG-INFO +0 -72
- deeplotx-0.4.7/README.md +0 -56
- deeplotx-0.4.7/deeplotx.egg-info/PKG-INFO +0 -72
- {deeplotx-0.4.7 → deeplotx-0.4.9}/LICENSE +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/__init__.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/encoder/__init__.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/nn/__init__.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/nn/auto_regression.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/nn/base_neural_network.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/nn/linear_regression.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/nn/logistic_regression.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/nn/recursive_sequential.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/nn/softmax_regression.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/similarity/__init__.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/similarity/distribution.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/similarity/set.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/similarity/vector.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/trainer/__init__.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/trainer/base_trainer.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/util/__init__.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/util/hash.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/util/read_file.py +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx.egg-info/SOURCES.txt +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx.egg-info/dependency_links.txt +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx.egg-info/top_level.txt +0 -0
- {deeplotx-0.4.7 → deeplotx-0.4.9}/setup.cfg +0 -0
deeplotx-0.4.9/PKG-INFO
ADDED
@@ -0,0 +1,211 @@
+Metadata-Version: 2.4
+Name: deeplotx
+Version: 0.4.9
+Summary: Easy-2-use long text NLP toolkit.
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: hf-xet>=1.0.5
+Requires-Dist: jupyter
+Requires-Dist: numpy
+Requires-Dist: protobuf>=6.31.1
+Requires-Dist: python-dotenv>=1.1.0
+Requires-Dist: torch
+Requires-Dist: transformers
+Requires-Dist: typing-extensions>=4.13.2
+Dynamic: license-file
+
+[](https://deepwiki.com/vortezwohl/DeepLoTX)
+
+# Deep Long Text Learning Kit
+
+> Author: 吴子豪
+
+**An out-of-the-box long-text semantic modeling framework**
+
+## Installation
+
+- With pip
+
+```
+pip install -U deeplotx
+```
+
+- With uv (recommended)
+
+```
+uv add -U deeplotx
+```
+
+- Install the latest features from GitHub
+
+```
+pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
+```
+
+## Core features
+
+- ### Long-text embeddings
+
+- **Long-text embeddings with a general-purpose BERT** (no hard length cap; the maximum supported length is set via max_length)
+
+```python
+from deeplotx import LongTextEncoder
+
+# Maximum text length of 2048 tokens, chunk size of 512 tokens, 64-token overlap between chunks.
+encoder = LongTextEncoder(
+    max_length=2048,
+    chunk_size=512,
+    overlapping=64
+)
+# Compute the embedding of "我是吴子豪, 这是一个测试文本." and flatten it.
+encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+```
+
+Output:
+```
+tensor([ 0.5163, 0.2497, 0.5896, ..., -0.9815, -0.3095, 0.4232])
+```
+
+- **Long-text embeddings with Longformer** (maximum supported length of 4096 tokens)
+
+```python
+from deeplotx import LongformerEncoder
+
+encoder = LongformerEncoder()
+encoder.encode('我是吴子豪, 这是一个测试文本.')
+```
+
+- ### Similarity measures
+
+- **Vector-based similarity**
+
+```python
+import deeplotx.similarity as sim
+
+vector_0, vector_1 = [1, 2, 3, 4], [4, 3, 2, 1]
+# Euclidean distance
+distance_0 = sim.euclidean_similarity(vector_0, vector_1)
+print(distance_0)
+# Cosine distance
+distance_1 = sim.cosine_similarity(vector_0, vector_1)
+print(distance_1)
+# Chebyshev distance
+distance_2 = sim.chebyshev_similarity(vector_0, vector_1)
+print(distance_2)
+```
+
+Output:
+```
+4.47213595499958
+0.33333333333333337
+3
+```
+
+- **Set-based similarity**
+
+```python
+import deeplotx.similarity as sim
+
+set_0, set_1 = {1, 2, 3, 4}, {4, 5, 6, 7}
+# Jaccard distance
+distance_0 = sim.jaccard_similarity(set_0, set_1)
+print(distance_0)
+# Ochiai distance
+distance_1 = sim.ochiai_similarity(set_0, set_1)
+print(distance_1)
+# Dice coefficient
+distance_2 = sim.dice_coefficient(set_0, set_1)
+print(distance_2)
+# Overlap coefficient
+distance_3 = sim.overlap_coefficient(set_0, set_1)
+print(distance_3)
+```
+
+Output:
+```
+0.1428571428572653
+0.2500000000001875
+0.25000000000009376
+0.2500000000001875
+```
+
+- **Probability-distribution-based similarity**
+
+```python
+import deeplotx.similarity as sim
+
+dist_0, dist_1 = [0.3, 0.2, 0.1, 0.4], [0.2, 0.1, 0.3, 0.4]
+# Cross-entropy
+distance_0 = sim.cross_entropy(dist_0, dist_1)
+print(distance_0)
+# KL divergence
+distance_1 = sim.kl_divergence(dist_0, dist_1)
+print(distance_1)
+# JS divergence
+distance_2 = sim.js_divergence(dist_0, dist_1)
+print(distance_2)
+# Hellinger distance
+distance_3 = sim.hellinger_distance(dist_0, dist_1)
+print(distance_3)
+```
+
+Output:
+```
+0.3575654913778237
+0.15040773967762736
+0.03969123741566945
+0.20105866986400994
+```
+
+- ### Predefined deep neural networks
+
+```python
+from deeplotx import (
+    LinearRegression,  # linear regression
+    LogisticRegression,  # logistic regression / binary classification / multi-label classification
+    SoftmaxRegression,  # softmax regression / multi-class classification
+    RecursiveSequential,  # sequence model / recurrent neural network
+    AutoRegression  # autoregressive model
+)
+```
+
+Base network structure:
+
+```python
+from typing_extensions import override
+
+import torch
+from torch import nn
+
+from deeplotx.nn.base_neural_network import BaseNeuralNetwork
+
+
+class LinearRegression(BaseNeuralNetwork):
+    def __init__(self, input_dim: int, output_dim: int, model_name: str | None = None):
+        super().__init__(model_name=model_name)
+        self.fc1 = nn.Linear(input_dim, 1024)
+        self.fc1_to_fc4_res = nn.Linear(1024, 64)
+        self.fc2 = nn.Linear(1024, 768)
+        self.fc3 = nn.Linear(768, 128)
+        self.fc4 = nn.Linear(128, 64)
+        self.fc5 = nn.Linear(64, output_dim)
+        self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3)
+        self.parametric_relu_2 = nn.PReLU(num_parameters=1, init=5e-3)
+        self.parametric_relu_3 = nn.PReLU(num_parameters=1, init=5e-3)
+        self.parametric_relu_4 = nn.PReLU(num_parameters=1, init=5e-3)
+
+    @override
+    def forward(self, x) -> torch.Tensor:
+        fc1_out = self.parametric_relu_1(self.fc1(x))
+        x = nn.LayerNorm(normalized_shape=1024, eps=1e-9)(fc1_out)
+        x = torch.dropout(x, p=0.2, train=self.training)
+        x = self.parametric_relu_2(self.fc2(x))
+        x = nn.LayerNorm(normalized_shape=768, eps=1e-9)(x)
+        x = torch.dropout(x, p=0.2, train=self.training)
+        x = self.parametric_relu_3(self.fc3(x))
+        x = torch.dropout(x, p=0.2, train=self.training)
+        x = self.parametric_relu_4(self.fc4(x)) + self.fc1_to_fc4_res(fc1_out)
+        x = self.fc5(x)
+        return x
+```
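An aside on the README example above (not part of the package contents): the LongTextEncoder(max_length=2048, chunk_size=512, overlapping=64) configuration implies overlapping token windows. Below is a minimal sketch of the window arithmetic, mirroring the `_tmp_left`/`_tmp_right` logic in the long_text_encoder.py hunk further down; the chunk-count formula here is an assumption for illustration.

```python
import math

max_length, chunk_size, overlapping = 2048, 512, 64
num_chunks = math.ceil(max_length / chunk_size)  # assumed: 4 chunks for a full-length input
for i in range(num_chunks):
    left = max(i * chunk_size - overlapping, 0)  # window start, clamped at 0
    right = (i + 1) * chunk_size + overlapping   # window end (exclusive)
    print(f'chunk {i}: tokens [{left}, {right})')
# chunk 0: tokens [0, 576)
# chunk 1: tokens [448, 1088)
# chunk 2: tokens [960, 1600)
# chunk 3: tokens [1472, 2112)
```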
deeplotx-0.4.9/README.md
ADDED
@@ -0,0 +1,194 @@
(Content identical to the README portion of deeplotx-0.4.9/PKG-INFO above, i.e. lines 18–211 of that file.)
{deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/encoder/bert_encoder.py
@@ -1,3 +1,4 @@
+import logging
 import os
 import math
 
@@ -9,21 +10,26 @@ from deeplotx import __ROOT__
 
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
 DEFAULT_BERT = 'bert-base-uncased'
+logger = logging.getLogger('deeplotx.embedding')
 
 
 class BertEncoder(nn.Module):
-    def __init__(self, model_name_or_path: str = DEFAULT_BERT):
+    def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
         super().__init__()
+        self.device = device if device is not None else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                        cache_dir=CACHE_PATH, _from_auto=True)
         self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                              cache_dir=CACHE_PATH, _from_auto=True)
+                                              cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
         self.embed_dim = self.bert.config.max_position_embeddings
+        logger.debug(f'{BertEncoder.__name__} initialized on device: {self.device}.')
 
     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
         def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
             return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
 
+        input_ids = input_ids.to(self.device)
+        attention_mask = attention_mask.to(self.device)
         num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
         chunks = chunk_results = []
         for i in range(num_chunks):
{deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/encoder/long_text_encoder.py
@@ -13,8 +13,8 @@ logger = logging.getLogger('deeplotx.embedding')
 
 class LongTextEncoder(BertEncoder):
     def __init__(self, max_length: int, chunk_size: int = 256,
-                 overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT):
-        super().__init__(model_name_or_path=model_name_or_path)
+                 overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
+        super().__init__(model_name_or_path=model_name_or_path, device=device)
         self._max_length = max_length
         self._chunk_size = chunk_size
         self._overlapping = overlapping
@@ -28,8 +28,8 @@ class LongTextEncoder(BertEncoder):
         def postprocess(tensors: list[torch.Tensor], _flatten: bool) -> torch.Tensor:
             if not _flatten:
                 return torch.stack(tensors, dim=0).squeeze()
-            _fin_emb_tensor = torch.tensor([], dtype=
-            for _emb in
+            _fin_emb_tensor = torch.tensor([], dtype=tensors[0].dtype)
+            for _emb in tensors:
                 _fin_emb_tensor = torch.cat((_fin_emb_tensor.detach().clone(), _emb.detach().clone()), dim=-1)
             return _fin_emb_tensor.squeeze()
 
@@ -55,7 +55,7 @@ class LongTextEncoder(BertEncoder):
         for i in range(num_chunks):
             _tmp_left = max(i * self._chunk_size - self._overlapping, 0)
             _tmp_right = (i + 1) * self._chunk_size + self._overlapping
-            chunks.append((i, torch.tensor([_text_to_input_ids[_tmp_left: _tmp_right]], dtype=torch.
+            chunks.append((i, torch.tensor([_text_to_input_ids[_tmp_left: _tmp_right]], dtype=torch.int),
                            torch.tensor([_text_to_input_ids_att_mask[_tmp_left: _tmp_right]], dtype=torch.int)))
         with ThreadPoolExecutor(max_workers=min(num_chunks + 1, 3)) as executor:
            embeddings = list(executor.map(self.__chunk_embedding, chunks))
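The postprocess change completes the flatten path: the final embedding is now built by concatenating the per-chunk embeddings along the last dimension, using the dtype of the first chunk. A standalone sketch of that behavior, assuming 1-D per-chunk embeddings (illustrative only):

```python
import torch

# Three fake per-chunk embeddings, 768-dimensional each.
chunks = [torch.randn(768) for _ in range(3)]

# The same concatenation the fixed postprocess performs when flatten=True.
flat = torch.tensor([], dtype=chunks[0].dtype)
for emb in chunks:
    flat = torch.cat((flat, emb), dim=-1)

assert flat.shape == (3 * 768,)  # chunks laid end to end
```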
{deeplotx-0.4.7 → deeplotx-0.4.9}/deeplotx/encoder/longformer_encoder.py
@@ -1,3 +1,4 @@
+import logging
 import os
 
 import torch
@@ -8,17 +9,22 @@ from deeplotx import __ROOT__
 
 CACHE_PATH = os.path.join(__ROOT__, '.cache')
 DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+logger = logging.getLogger('deeplotx.embedding')
 
 
 class LongformerEncoder(nn.Module):
-    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER):
+    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
         super().__init__()
+        self.device = device if device is not None else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                              cache_dir=CACHE_PATH, _from_auto=True)
         self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                    cache_dir=CACHE_PATH, _from_auto=True)
+                                                    cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
+        logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')
 
     def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+        input_ids = input_ids.to(self.device)
+        attention_mask = attention_mask.to(self.device)
         ori_mode = self.bert.training
         self.bert.eval()
         with torch.no_grad():
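LongformerEncoder gains the same optional `device` handling as BertEncoder. A minimal usage sketch, combining the hunk above with the `encode` call shown in the README (illustrative, not taken from the package):

```python
import torch
from deeplotx import LongformerEncoder

# Choose the device explicitly instead of relying on the CUDA fallback.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
encoder = LongformerEncoder(device=device)
embedding = encoder.encode('This is a test sentence.')
```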
deeplotx-0.4.9/deeplotx.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,211 @@
(Content identical to deeplotx-0.4.9/PKG-INFO above.)
{deeplotx-0.4.7 → deeplotx-0.4.9}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "deeplotx"
-version = "0.4.7"
+version = "0.4.9"
 description = "Easy-2-use long text NLP toolkit."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -8,6 +8,7 @@ dependencies = [
     "hf-xet>=1.0.5",
     "jupyter",
     "numpy",
+    "protobuf>=6.31.1",
     "python-dotenv>=1.1.0",
     "torch",
     "transformers",
deeplotx-0.4.7/PKG-INFO
DELETED
@@ -1,72 +0,0 @@
-Metadata-Version: 2.4
-Name: deeplotx
-Version: 0.4.7
-Summary: Easy-2-use long text NLP toolkit.
-Requires-Python: >=3.10
-Description-Content-Type: text/markdown
-License-File: LICENSE
-Requires-Dist: hf-xet>=1.0.5
-Requires-Dist: jupyter
-Requires-Dist: numpy
-Requires-Dist: python-dotenv>=1.1.0
-Requires-Dist: torch
-Requires-Dist: transformers
-Requires-Dist: typing-extensions>=4.13.2
-Dynamic: license-file
-
-[](https://deepwiki.com/vortezwohl/LoTC)
-
-# DeepLoTX
-
-An Easy-2-use long text NLP toolkit
-
-## Installation
-
-- Install with pip
-
-```
-pip install -U deeplotx
-```
-
-- Install with uv
-
-```
-uv add -U deeplotx
-```
-
-- Install from github
-
-```
-pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
-```
-
-## Quick Start
-
-To train a binary classifier from text files:
-
-```python
-from deeplotx.util import get_files, read_file
-from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
-
-long_text_encoder = LongTextEncoder(
-    max_length=2048,
-    chunk_size=512,
-    overlapping=128
-)
-
-trainer = TextBinaryClassifierTrainer(
-    long_text_encoder=long_text_encoder,
-    batch_size=4,
-    train_ratio=0.9
-)
-
-pos_data_path = './data/pos'
-neg_data_path = './data/neg'
-pos_data = [read_file(x) for x in get_files(pos_data_path)]
-neg_data = [read_file(x) for x in get_files(neg_data_path)]
-model = trainer.train(pos_data, neg_data, num_epochs=20, learning_rate=2e-5, train_loss_threshold=1)
-model.save()
-
-model = model.load()
-model.predict(long_text_encoder.encode('这是一个测试文本.').squeeze())
-```
deeplotx-0.4.7/README.md
DELETED
@@ -1,56 +0,0 @@
(Content identical to the README portion of deeplotx-0.4.7/PKG-INFO above, i.e. lines 17–72 of that file.)
deeplotx-0.4.7/deeplotx.egg-info/PKG-INFO
DELETED
@@ -1,72 +0,0 @@
(Content identical to deeplotx-0.4.7/PKG-INFO above.)
All remaining files listed above are unchanged between 0.4.7 and 0.4.9.