deeplotx 0.4.8__py3-none-any.whl → 0.4.9__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
--- a/deeplotx/encoder/bert_encoder.py
+++ b/deeplotx/encoder/bert_encoder.py
@@ -1,3 +1,4 @@
+ import logging
  import os
  import math

@@ -9,21 +10,26 @@ from deeplotx import __ROOT__

  CACHE_PATH = os.path.join(__ROOT__, '.cache')
  DEFAULT_BERT = 'bert-base-uncased'
+ logger = logging.getLogger('deeplotx.embedding')


  class BertEncoder(nn.Module):
-     def __init__(self, model_name_or_path: str = DEFAULT_BERT):
+     def __init__(self, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
          super().__init__()
+         self.device = device if device is not None else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                         cache_dir=CACHE_PATH, _from_auto=True)
          self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                               cache_dir=CACHE_PATH, _from_auto=True)
+                                               cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
          self.embed_dim = self.bert.config.max_position_embeddings
+         logger.debug(f'{BertEncoder.__name__} initialized on device: {self.device}.')

      def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
          def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
              return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]

+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
          num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
          chunks = chunk_results = []
          for i in range(num_chunks):
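The device plumbing added above follows the usual CUDA-if-available fallback, and `forward` now moves its inputs onto the encoder's device itself. A minimal usage sketch (not from the package: the module path comes from the RECORD below, and whether `BertEncoder` is re-exported at top level is not shown in this diff):

```python
from deeplotx.encoder.bert_encoder import BertEncoder

# Omitting device reproduces the fallback added in __init__:
# CUDA when available, otherwise CPU.
encoder = BertEncoder()

# Explicit override, e.g. to keep the model off a busy GPU. A plain str is
# fine here: nn.Module.to() accepts device strings as well as torch.device.
cpu_encoder = BertEncoder(device='cpu')

# Tokenize anywhere; forward() now calls .to(self.device) on both tensors,
# so inputs no longer have to be co-located with the model by hand.
tokens = cpu_encoder.tokenizer('hello world', return_tensors='pt')
embedding = cpu_encoder.forward(tokens['input_ids'], tokens['attention_mask'])
```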
--- a/deeplotx/encoder/long_text_encoder.py
+++ b/deeplotx/encoder/long_text_encoder.py
@@ -13,8 +13,8 @@ logger = logging.getLogger('deeplotx.embedding')

  class LongTextEncoder(BertEncoder):
      def __init__(self, max_length: int, chunk_size: int = 256,
-                  overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT):
-         super().__init__(model_name_or_path=model_name_or_path)
+                  overlapping: int = 0, model_name_or_path: str = DEFAULT_BERT, device: str | None = None):
+         super().__init__(model_name_or_path=model_name_or_path, device=device)
          self._max_length = max_length
          self._chunk_size = chunk_size
          self._overlapping = overlapping
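For intuition about the constructor parameters kept in this hunk: consecutive windows in an overlapping chunking scheme advance by `chunk_size - overlapping` tokens. A back-of-the-envelope sketch with assumed values (the encoder's actual splitting logic lies outside this hunk):

```python
import math

# Assumed values, matching the README example in the new METADATA below.
max_length, chunk_size, overlapping = 2048, 512, 64

# Each window advances 512 - 64 = 448 tokens, so covering a
# 2048-token text takes ceil(2048 / 448) = 5 overlapping chunks.
stride = chunk_size - overlapping
print(math.ceil(max_length / stride))  # 5
```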
--- a/deeplotx/encoder/longformer_encoder.py
+++ b/deeplotx/encoder/longformer_encoder.py
@@ -1,3 +1,4 @@
+ import logging
  import os

  import torch
@@ -8,17 +9,22 @@ from deeplotx import __ROOT__

  CACHE_PATH = os.path.join(__ROOT__, '.cache')
  DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
+ logger = logging.getLogger('deeplotx.embedding')


  class LongformerEncoder(nn.Module):
-     def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER):
+     def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER, device: str | None = None):
          super().__init__()
+         self.device = device if device is not None else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
          self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                               cache_dir=CACHE_PATH, _from_auto=True)
          self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
-                                                     cache_dir=CACHE_PATH, _from_auto=True)
+                                                     cache_dir=CACHE_PATH, _from_auto=True).to(self.device)
+         logger.debug(f'{LongformerEncoder.__name__} initialized on device: {self.device}.')

      def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+         input_ids = input_ids.to(self.device)
+         attention_mask = attention_mask.to(self.device)
          ori_mode = self.bert.training
          self.bert.eval()
          with torch.no_grad():
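The Longformer path gets the same treatment: the model is moved to the resolved device at load time, and `forward` relocates its inputs before stashing the training flag in `ori_mode` and switching to eval under `no_grad`. A hedged usage sketch; the top-level import is the one the package README advertises, and the input text is illustrative:

```python
from deeplotx import LongformerEncoder

# Pin the encoder explicitly; omitting device falls back to CUDA when available.
encoder = LongformerEncoder(device='cpu')

# The tokenizer returns CPU tensors; forward() now moves both onto
# self.device itself before running inference.
tokens = encoder.tokenizer('a very long document ...', return_tensors='pt')
embedding = encoder.forward(tokens['input_ids'], tokens['attention_mask'])
print(embedding.shape)
```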
--- /dev/null
+++ b/deeplotx-0.4.9.dist-info/METADATA
@@ -0,0 +1,211 @@
+ Metadata-Version: 2.4
+ Name: deeplotx
+ Version: 0.4.9
+ Summary: Easy-2-use long text NLP toolkit.
+ Requires-Python: >=3.10
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: hf-xet>=1.0.5
+ Requires-Dist: jupyter
+ Requires-Dist: numpy
+ Requires-Dist: protobuf>=6.31.1
+ Requires-Dist: python-dotenv>=1.1.0
+ Requires-Dist: torch
+ Requires-Dist: transformers
+ Requires-Dist: typing-extensions>=4.13.2
+ Dynamic: license-file
+ 
+ [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
+ 
+ # Deep Long Text Learning Kit
+ 
+ > Author: 吴子豪
+ 
+ **An out-of-the-box framework for long-text semantic modeling**
+ 
+ ## Installation
+ 
+ - With pip
+ 
+   ```
+   pip install -U deeplotx
+   ```
+ 
+ - With uv (recommended)
+ 
+   ```
+   uv add -U deeplotx
+   ```
+ 
+ - Latest features from GitHub
+ 
+   ```
+   pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
+   ```
+ 
+ ## Core features
+ 
+ - ### Long text embedding
+ 
+   - **Long-text embedding based on general-purpose BERT** (maximum supported length is unbounded; set it via max_length)
+ 
+     ```python
+     from deeplotx import LongTextEncoder
+ 
+     # Maximum text length of 2048 tokens, 512-token chunks, 64 tokens of overlap between chunks.
+     encoder = LongTextEncoder(
+         max_length=2048,
+         chunk_size=512,
+         overlapping=64
+     )
+     # Embed '我是吴子豪, 这是一个测试文本.' and flatten the result.
+     encoder.encode('我是吴子豪, 这是一个测试文本.', flatten=True, use_cache=True)
+     ```
+ 
+     Output:
+     ```
+     tensor([ 0.5163,  0.2497,  0.5896,  ..., -0.9815, -0.3095,  0.4232])
+     ```
+ 
+   - **Long-text embedding based on Longformer** (maximum supported length of 4096 tokens)
+ 
+     ```python
+     from deeplotx import LongformerEncoder
+ 
+     encoder = LongformerEncoder()
+     encoder.encode('我是吴子豪, 这是一个测试文本.')
+     ```
+ 
+ - ### Similarity measures
+ 
+   - **Vector-based similarity**
+ 
+     ```python
+     import deeplotx.similarity as sim
+ 
+     vector_0, vector_1 = [1, 2, 3, 4], [4, 3, 2, 1]
+     # Euclidean distance
+     distance_0 = sim.euclidean_similarity(vector_0, vector_1)
+     print(distance_0)
+     # Cosine distance
+     distance_1 = sim.cosine_similarity(vector_0, vector_1)
+     print(distance_1)
+     # Chebyshev distance
+     distance_2 = sim.chebyshev_similarity(vector_0, vector_1)
+     print(distance_2)
+     ```
+ 
+     Output:
+     ```
+     4.47213595499958
+     0.33333333333333337
+     3
+     ```
+ 
+   - **Set-based similarity**
+ 
+     ```python
+     import deeplotx.similarity as sim
+ 
+     set_0, set_1 = {1, 2, 3, 4}, {4, 5, 6, 7}
+     # Jaccard distance
+     distance_0 = sim.jaccard_similarity(set_0, set_1)
+     print(distance_0)
+     # Ochiai distance
+     distance_1 = sim.ochiai_similarity(set_0, set_1)
+     print(distance_1)
+     # Dice coefficient
+     distance_2 = sim.dice_coefficient(set_0, set_1)
+     print(distance_2)
+     # Overlap coefficient
+     distance_3 = sim.overlap_coefficient(set_0, set_1)
+     print(distance_3)
+     ```
+ 
+     Output:
+     ```
+     0.1428571428572653
+     0.2500000000001875
+     0.25000000000009376
+     0.2500000000001875
+     ```
+ 
+   - **Similarity between probability distributions**
+ 
+     ```python
+     import deeplotx.similarity as sim
+ 
+     dist_0, dist_1 = [0.3, 0.2, 0.1, 0.4], [0.2, 0.1, 0.3, 0.4]
+     # Cross-entropy
+     distance_0 = sim.cross_entropy(dist_0, dist_1)
+     print(distance_0)
+     # KL divergence
+     distance_1 = sim.kl_divergence(dist_0, dist_1)
+     print(distance_1)
+     # JS divergence
+     distance_2 = sim.js_divergence(dist_0, dist_1)
+     print(distance_2)
+     # Hellinger distance
+     distance_3 = sim.hellinger_distance(dist_0, dist_1)
+     print(distance_3)
+     ```
+ 
+     Output:
+     ```
+     0.3575654913778237
+     0.15040773967762736
+     0.03969123741566945
+     0.20105866986400994
+     ```
+ 
+ - ### Predefined deep neural networks
+ 
+   ```python
+   from deeplotx import (
+       LinearRegression,  # linear regression
+       LogisticRegression,  # logistic regression / binary or multi-label classification
+       SoftmaxRegression,  # softmax regression / multi-class classification
+       RecursiveSequential,  # sequence model / recurrent neural network
+       AutoRegression  # autoregressive model
+   )
+   ```
+ 
+   Base network structure:
+ 
+   ```python
+   from typing_extensions import override
+ 
+   import torch
+   from torch import nn
+ 
+   from deeplotx.nn.base_neural_network import BaseNeuralNetwork
+ 
+ 
+   class LinearRegression(BaseNeuralNetwork):
+       def __init__(self, input_dim: int, output_dim: int, model_name: str | None = None):
+           super().__init__(model_name=model_name)
+           self.fc1 = nn.Linear(input_dim, 1024)
+           self.fc1_to_fc4_res = nn.Linear(1024, 64)
+           self.fc2 = nn.Linear(1024, 768)
+           self.fc3 = nn.Linear(768, 128)
+           self.fc4 = nn.Linear(128, 64)
+           self.fc5 = nn.Linear(64, output_dim)
+           self.parametric_relu_1 = nn.PReLU(num_parameters=1, init=5e-3)
+           self.parametric_relu_2 = nn.PReLU(num_parameters=1, init=5e-3)
+           self.parametric_relu_3 = nn.PReLU(num_parameters=1, init=5e-3)
+           self.parametric_relu_4 = nn.PReLU(num_parameters=1, init=5e-3)
+ 
+       @override
+       def forward(self, x) -> torch.Tensor:
+           fc1_out = self.parametric_relu_1(self.fc1(x))
+           x = nn.LayerNorm(normalized_shape=1024, eps=1e-9)(fc1_out)
+           x = torch.dropout(x, p=0.2, train=self.training)
+           x = self.parametric_relu_2(self.fc2(x))
+           x = nn.LayerNorm(normalized_shape=768, eps=1e-9)(x)
+           x = torch.dropout(x, p=0.2, train=self.training)
+           x = self.parametric_relu_3(self.fc3(x))
+           x = torch.dropout(x, p=0.2, train=self.training)
+           x = self.parametric_relu_4(self.fc4(x)) + self.fc1_to_fc4_res(fc1_out)
+           x = self.fc5(x)
+           return x
+   ```
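The vector-similarity outputs quoted in the README above can be reproduced with plain arithmetic, which also makes clear that these `*_similarity` helpers return distances (smaller means more similar). A standalone check that does not depend on deeplotx:

```python
import math

v0, v1 = [1, 2, 3, 4], [4, 3, 2, 1]

# Euclidean distance: sqrt(9 + 1 + 1 + 9) = sqrt(20)
print(math.sqrt(sum((a - b) ** 2 for a, b in zip(v0, v1))))  # 4.47213595499958

# Cosine distance: dot = 20 and both norms are sqrt(30), hence 1 - 20/30
dot = sum(a * b for a, b in zip(v0, v1))
norms = math.sqrt(sum(a * a for a in v0)) * math.sqrt(sum(b * b for b in v1))
print(1 - dot / norms)  # 0.33333333333333337

# Chebyshev distance: the largest coordinate-wise gap, |1 - 4| = 3
print(max(abs(a - b) for a, b in zip(v0, v1)))  # 3
```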
--- a/deeplotx-0.4.8.dist-info/RECORD
+++ b/deeplotx-0.4.9.dist-info/RECORD
@@ -1,8 +1,8 @@
  deeplotx/__init__.py,sha256=wMN_AI14V-0BPbQghYpvd2y7eUGfhr7jKTTuur-5Upg,1002
  deeplotx/encoder/__init__.py,sha256=EM-xrTsHoGaiiFpj-iFAxilMHXC_sQKWYrcq1qCnI3U,138
- deeplotx/encoder/bert_encoder.py,sha256=6QY2pOvayWNz4w749JAGndvQ-jeKJgy3BalQl2JCkgk,1994
- deeplotx/encoder/long_text_encoder.py,sha256=FP0ACiOaOCjK2buRSWqBs-peg3IWQKuIdP2S00LNvSs,3271
- deeplotx/encoder/longformer_encoder.py,sha256=4avKYsLN6TTpPoky8BQ0nIhQm8lVxMvvzqkrdKCWj3Q,1433
+ deeplotx/encoder/bert_encoder.py,sha256=VCmYsBSqB9bRL_ge4bYssyx-Xy4oR0-DE1cMTuTn1tU,2412
+ deeplotx/encoder/long_text_encoder.py,sha256=7On6NuaINDZLqgb3HsSJBEzbWXNZPh_MXAvO5KY471k,3313
+ deeplotx/encoder/longformer_encoder.py,sha256=J8Si8Ta0bh7Vo7YsV0XdC7jGrvIt54GKcHr_pq2qHbI,1857
  deeplotx/nn/__init__.py,sha256=oQ-vYXyuaGelfCOs2im_gZXAiiBlCCVXh1uw9yjvRMs,253
  deeplotx/nn/auto_regression.py,sha256=o82C9TREZbhGdj2knSVGTXhjJne0LGEqc7BllByJJWE,449
  deeplotx/nn/base_neural_network.py,sha256=xWKG4FX6Jzdlrfc1HOW1aO9uh0Af3D-dB5Jl7eCxsAk,1635
@@ -20,8 +20,8 @@ deeplotx/trainer/text_binary_classification_trainer.py,sha256=5O-5dwVMCj5EDX9gjJ
  deeplotx/util/__init__.py,sha256=JxqAK_WOOHcYVSTHBT1-WuBwWrPEVDTV3titeVWvNUM,74
  deeplotx/util/hash.py,sha256=wwsC6kOQvbpuvwKsNQOARd78_wePmW9i3oaUuXRUnpc,352
  deeplotx/util/read_file.py,sha256=ptzouvEQeeW8KU5BrWNJlXw-vFXVrpS9SkAUxsu6A8A,612
- deeplotx-0.4.8.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
- deeplotx-0.4.8.dist-info/METADATA,sha256=KMyt-My-d5261MGBfC1_HsyqbJ_KVEvZ--kZNFq8B2A,1656
- deeplotx-0.4.8.dist-info/WHEEL,sha256=DnLRTWE75wApRYVsjgc6wsVswC54sMSJhAEd4xhDpBk,91
- deeplotx-0.4.8.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
- deeplotx-0.4.8.dist-info/RECORD,,
+ deeplotx-0.4.9.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+ deeplotx-0.4.9.dist-info/METADATA,sha256=VucPny6Tz6-bCc7xB6G_3MJxI2_0FySACWbPX7CDTTo,6292
+ deeplotx-0.4.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ deeplotx-0.4.9.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+ deeplotx-0.4.9.dist-info/RECORD,,
--- a/deeplotx-0.4.8.dist-info/WHEEL
+++ b/deeplotx-0.4.9.dist-info/WHEEL
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (80.4.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

--- a/deeplotx-0.4.8.dist-info/METADATA
+++ /dev/null
@@ -1,72 +0,0 @@
- Metadata-Version: 2.4
- Name: deeplotx
- Version: 0.4.8
- Summary: Easy-2-use long text NLP toolkit.
- Requires-Python: >=3.10
- Description-Content-Type: text/markdown
- License-File: LICENSE
- Requires-Dist: hf-xet>=1.0.5
- Requires-Dist: jupyter
- Requires-Dist: numpy
- Requires-Dist: python-dotenv>=1.1.0
- Requires-Dist: torch
- Requires-Dist: transformers
- Requires-Dist: typing-extensions>=4.13.2
- Dynamic: license-file
- 
- [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/LoTC)
- 
- # DeepLoTX
- 
- An Easy-2-use long text NLP toolkit
- 
- ## Installation
- 
- - Install with pip
- 
-   ```
-   pip install -U deeplotx
-   ```
- 
- - Install with uv
- 
-   ```
-   uv add -U deeplotx
-   ```
- 
- - Install from github
- 
-   ```
-   pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
-   ```
- 
- ## Quick Start
- 
- To train a binary classifier from text files:
- 
- ```python
- from deeplotx.util import get_files, read_file
- from deeplotx import TextBinaryClassifierTrainer, LongTextEncoder
- 
- long_text_encoder = LongTextEncoder(
-     max_length=2048,
-     chunk_size=512,
-     overlapping=128
- )
- 
- trainer = TextBinaryClassifierTrainer(
-     long_text_encoder=long_text_encoder,
-     batch_size=4,
-     train_ratio=0.9
- )
- 
- pos_data_path = './data/pos'
- neg_data_path = './data/neg'
- pos_data = [read_file(x) for x in get_files(pos_data_path)]
- neg_data = [read_file(x) for x in get_files(neg_data_path)]
- model = trainer.train(pos_data, neg_data, num_epochs=20, learning_rate=2e-5, train_loss_threshold=1)
- model.save()
- 
- model = model.load()
- model.predict(long_text_encoder.encode('这是一个测试文本.').squeeze())
- ```