deeplotx 0.9.8a0__py3-none-any.whl → 0.9.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deeplotx/__init__.py +0 -1
- deeplotx/ner/bert_ner.py +9 -1
- deeplotx/ner/n2g/__init__.py +17 -23
- deeplotx/util/__init__.py +1 -1
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/METADATA +1 -1
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/RECORD +9 -12
- deeplotx/trainer/__init__.py +0 -1
- deeplotx/trainer/base_trainer.py +0 -13
- deeplotx/trainer/text_binary_classification_trainer.py +0 -103
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/WHEEL +0 -0
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/licenses/LICENSE +0 -0
- {deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/top_level.txt +0 -0
deeplotx/__init__.py
CHANGED
deeplotx/ner/bert_ner.py
CHANGED
@@ -43,12 +43,15 @@ class BertNER(BaseNER):
 
     def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
+        # entity length cannot be longer than the whole seq
+        __max_search_backward = len(self.tokenizer.encode(s, add_special_tokens=False))
         s = f' {s.replace(NEW_LINE, BLANK * 2)} '
         raw_entities = self._ner_pipeline(s)
         entities = []
         for ent in raw_entities:
             entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
-
+        __search_backward = -2
+        while __search_backward < __max_search_backward:
             for i, ent in enumerate(entities):
                 if len(ent[0].strip()) < 1:
                     del entities[i]
@@ -65,6 +68,11 @@ class BertNER(BaseNER):
                     _continue = True
             if not _continue:
                 break
+            __search_backward += 1
+        # adjust all I-ENTs
+        for ent in entities:
+            if ent[1].upper().startswith('I'):
+                ent[1] = f'B{ent[1][1:]}'
         for ent in entities:
             ent[0] = ent[0].strip()
             if len(ent[0]) < 1:
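Taken together, the new code caps the backward search at the tokenized length of the input and, once the loop exits, promotes any remaining I- (inside) tag to a B- (beginning) tag. A minimal, standalone sketch of that tag adjustment, using a hypothetical entities list in the [text, tag, score] layout built above:

entities = [['Alice', 'I-PER', 0.98], ['Berlin', 'B-LOC', 0.95]]
for ent in entities:
    if ent[1].upper().startswith('I'):
        ent[1] = f'B{ent[1][1:]}'  # 'I-PER' -> 'B-PER'
print(entities)  # [['Alice', 'B-PER', 0.98], ['Berlin', 'B-LOC', 0.95]]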
deeplotx/ner/n2g/__init__.py
CHANGED
@@ -13,13 +13,12 @@ from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 
 __CACHE_DIR__ = os.path.join(__ROOT__, '.cache', '.n2g')
 ENCODER = Encoder(model_name_or_path='FacebookAI/xlm-roberta-base')
-
-SMALL_MODEL = 'name2gender-small'
+DEFAULT_MODEL = 'name2gender-small'
 _MIN_FILE_SIZE = 1024 * 5
 
 
 def download_model(model_name: str):
-    quiet = bool(os.getenv('
+    quiet = bool(os.getenv('N2G_QUIET_DOWNLOAD', False))
     os.makedirs(__CACHE_DIR__, exist_ok=True)
     _proxies = {
         'http': os.getenv('HTTP_PROXY', os.getenv('http_proxy')),
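Note that the new N2G_QUIET_DOWNLOAD flag is read with bool(os.getenv(...)), so any non-empty value, including '0' or 'false', enables quiet mode; only an unset or empty-string variable keeps downloads verbose. A standalone illustration of that truthiness:

import os

os.environ['N2G_QUIET_DOWNLOAD'] = '0'
print(bool(os.getenv('N2G_QUIET_DOWNLOAD', False)))  # True: any non-empty string is truthy

os.environ['N2G_QUIET_DOWNLOAD'] = ''
print(bool(os.getenv('N2G_QUIET_DOWNLOAD', False)))  # False: empty string is falsy

del os.environ['N2G_QUIET_DOWNLOAD']
print(bool(os.getenv('N2G_QUIET_DOWNLOAD', False)))  # False: falls back to the False default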
@@ -51,25 +50,20 @@ def download_model(model_name: str):
 
 def load_model(model_name: str = 'name2gender-small', dtype: torch.dtype | None = torch.float16) -> BaseNeuralNetwork:
     n2g_model = None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        download_model(SMALL_MODEL)
-        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                       num_heads=6, num_layers=2,
-                                       head_layers=1, expansion_factor=1.5,
-                                       model_name=SMALL_MODEL, dtype=dtype)
+    if 'base' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=12, num_layers=4,
+                                       head_layers=1, expansion_factor=2,
+                                       model_name=model_name, dtype=dtype)
+    elif 'small' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=6, num_layers=2,
+                                       head_layers=1, expansion_factor=1.5,
+                                       model_name=model_name, dtype=dtype)
+    else:
+        raise FileNotFoundError(f"Model \"{model_name}\" doesn't exists.")
     return n2g_model.load(model_dir=__CACHE_DIR__)
 
 
@@ -77,7 +71,7 @@ class Name2Gender:
     def __init__(self, model: BaseNeuralNetwork | None = None):
         super().__init__()
         if model is None:
-            model = load_model(
+            model = load_model(DEFAULT_MODEL)
         self._model = model
 
     def __call__(self, name: str, return_probability: bool = False, threshold: float = .5) -> tuple[Gender, float] | Gender:
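After this change, load_model picks the architecture from the model name ('base' builds the larger head with num_heads=12 and num_layers=4, 'small' the lighter one, anything else raises FileNotFoundError), and Name2Gender() falls back to DEFAULT_MODEL. A usage sketch, assuming deeplotx 0.9.10 and network access for the download; 'name2gender-base' is a hypothetical checkpoint name used only to exercise the 'base' branch:

from deeplotx.ner.n2g import DEFAULT_MODEL, Name2Gender, load_model

n2g = Name2Gender()  # no model passed: loads DEFAULT_MODEL ('name2gender-small')
gender = n2g('Alice')  # Gender only
gender, prob = n2g('Alice', return_probability=True, threshold=.5)  # with probability

# Hypothetical larger variant, if such a checkpoint is published:
n2g_base = Name2Gender(model=load_model('name2gender-base'))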
deeplotx/util/__init__.py
CHANGED
@@ -1,2 +1,2 @@
 from .hash import md5, sha1, sha256, sha512
-from .read_file import read_file, get_files
+from .read_file import read_file, write_file, get_files
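deeplotx.util now re-exports write_file alongside read_file and get_files. A round-trip sketch; the diff does not show write_file's signature, so the (path, content) argument order and read_file returning the file's text are assumptions here:

from deeplotx.util import read_file, write_file

write_file('note.txt', 'hello deeplotx')  # assumed (path, content) order
assert read_file('note.txt') == 'hello deeplotx'  # assumes read_file returns the text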
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/RECORD
CHANGED
@@ -1,13 +1,13 @@
-deeplotx/__init__.py,sha256=
+deeplotx/__init__.py,sha256=0OWLsgXlStzwm0m9ScaoZvBnsx3a0xTmlzYBUgarl-g,1306
 deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
 deeplotx/encoder/encoder.py,sha256=uJswUSrYVDWP84HeCD40R9KgXGPUEa080konv7jEp8I,3539
 deeplotx/encoder/long_text_encoder.py,sha256=4oRa9FqfGNZ8-gq14UKuhDkZC0A1Xi-wKmbQsn-uZ58,3966
 deeplotx/encoder/longformer_encoder.py,sha256=mfAI_NE3QQZhvGHbZkP7S6g0Jj59wmLWQ9QW7HOjqm0,2876
 deeplotx/ner/__init__.py,sha256=Rss1pup9HzHZCG8U9ub8niWa9zRjWCy3Z7zg378KZQg,114
 deeplotx/ner/base_ner.py,sha256=pZTl50OrHH_FJm4rKp9iuixeOE6FX_AzgDXD32aXsN0,204
-deeplotx/ner/bert_ner.py,sha256=
+deeplotx/ner/bert_ner.py,sha256=tfbM3CQBEpZsD0KYVA7GVNJax-7kzNOQgwlpo2S8h-c,8986
 deeplotx/ner/named_entity.py,sha256=c6XufIwH6yloJ-ccUjagf4mBl1XbbYDT8xyEJJ_-ZNs,269
-deeplotx/ner/n2g/__init__.py,sha256=
+deeplotx/ner/n2g/__init__.py,sha256=L1IJ8W1nApzqHx2u7JMtPCLfABm5qKJvh_bHMWdvdLY,3538
 deeplotx/nn/__init__.py,sha256=YILwbxb-NHdiJjfOwBKH8F7PuZSDZSrGpTznPDucTro,710
 deeplotx/nn/attention.py,sha256=R-i-Rd7gnsh6hwXDeYfqLQOJvfSZIGfQbFzRlC91XLo,2879
 deeplotx/nn/auto_regression.py,sha256=j_R7WGPq9REngjpLuX5c0AaNqOpgGm2Vfrolw-XjWXw,877
@@ -27,14 +27,11 @@ deeplotx/similarity/__init__.py,sha256=s3u-KSgxjnMcWpIItKgXNltFMPQ7YY3CqsqHI-5F1
 deeplotx/similarity/distribution.py,sha256=wQGouuuW531pZeBRKBujXsdsoz4fDnPw7_GW81jwepc,1066
 deeplotx/similarity/set.py,sha256=zhGFxtSIXlWqvipBYzoiPahp4g0boAIoUiMfG0wl07A,686
 deeplotx/similarity/vector.py,sha256=WVbDHqykt-fvuILVrhUCtIFAOEjY_zvttrXGM9eylG0,1125
-deeplotx/
-deeplotx/trainer/base_trainer.py,sha256=z0MeAT-rRYmjeBXt0ckt7J1itYArR0Cx02wHesXUoZE,385
-deeplotx/trainer/text_binary_classification_trainer.py,sha256=TFxOX8rWU_zKliI9zm7F5ZH7snR2d-sk95s3pfTmm78,6601
-deeplotx/util/__init__.py,sha256=5CH4MTeSgsmCe3LPMfvKoSBpwh6jDSBuHVElJvzQzgs,90
+deeplotx/util/__init__.py,sha256=ppQwp3A4rhAWBQ7DEobIYxloIiythxxUswCn-7UrMeA,102
 deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
 deeplotx/util/read_file.py,sha256=O9nieNgAGQ7Ct1EFxCdcgL6hVs5s2Vw_ItcUK6VeTwY,981
-deeplotx-0.9.
-deeplotx-0.9.
-deeplotx-0.9.
-deeplotx-0.9.
-deeplotx-0.9.
+deeplotx-0.9.10.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.9.10.dist-info/METADATA,sha256=6m2igF02QAdr5xns8BamtyMfrgLwBsuej2jihR36dCE,14443
+deeplotx-0.9.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deeplotx-0.9.10.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.9.10.dist-info/RECORD,,
deeplotx/trainer/__init__.py
DELETED
@@ -1 +0,0 @@
-from .text_binary_classification_trainer import TextBinaryClassifierTrainer
deeplotx/trainer/base_trainer.py
DELETED
@@ -1,13 +0,0 @@
-from abc import abstractmethod
-
-from deeplotx.nn.base_neural_network import BaseNeuralNetwork
-
-
-class BaseTrainer(object):
-    def __init__(self, batch_size: int, train_ratio: float):
-        self._batch_size = batch_size
-        self._train_ratio = train_ratio
-        self.model = None
-
-    @abstractmethod
-    def train(self, *args, **kwargs) -> BaseNeuralNetwork: ...
deeplotx/trainer/text_binary_classification_trainer.py
DELETED
@@ -1,103 +0,0 @@
-import logging
-from typing_extensions import override
-
-import torch
-from torch import nn, optim
-from torch.utils.data import DataLoader, TensorDataset
-
-from deeplotx.encoder.long_text_encoder import LongTextEncoder
-from deeplotx.nn.attention import DEFAULT_THETA
-from deeplotx.nn.long_context_recursive_sequential import LongContextRecursiveSequential
-from deeplotx.trainer.base_trainer import BaseTrainer
-
-logger = logging.getLogger('deeplotx.trainer')
-
-
-class TextBinaryClassifierTrainer(BaseTrainer):
-    def __init__(self, long_text_encoder: LongTextEncoder, batch_size: int = 2, train_ratio: float = 0.8):
-        super().__init__(batch_size=batch_size, train_ratio=train_ratio)
-        self._long_text_encoder = long_text_encoder
-        self.device = self._long_text_encoder.device
-        self.train_dataset_loader = None
-        self.valid_dataset_loader = None
-
-    @override
-    def train(self, positive_texts: list[str], negative_texts: list[str],
-              num_epochs: int, learning_rate: float = 2e-6, balancing_dataset: bool = True,
-              train_loss_threshold: float = 0.0, valid_loss_threshold: float = 0.0,
-              alpha: float = 1e-4, rho: float = 0.2, encoder_layers: int = 4, attn_heads: int = 6,
-              recursive_layers: int = 2, recursive_hidden_dim: int = 256, **kwargs) -> LongContextRecursiveSequential:
-        if balancing_dataset:
-            min_length = min(len(positive_texts), len(negative_texts))
-            positive_texts = positive_texts[:min_length]
-            negative_texts = negative_texts[:min_length]
-        all_texts = positive_texts + negative_texts
-        text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
-        feature_dim = text_embeddings[0].shape[-1]
-        dtype = text_embeddings[0].dtype
-        labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]
-                  + [torch.tensor([.0], dtype=dtype, device=self.device) for _ in range(len(negative_texts))])
-        inputs = torch.stack(text_embeddings).to(self.device)
-        labels = torch.stack(labels).to(self.device)
-        dataset_size = len(labels)
-        train_size = int(self._train_ratio * dataset_size)
-        train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
-        valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])
-        self.train_dataset_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
-        self.valid_dataset_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
-        if self.model is not None and self.model.in_features != feature_dim:
-            logger.warning("The dimension of features doesn't match. A new model instance will be created.")
-            self.model = None
-        if self.model is None:
-            ffn_heads = kwargs.get('ffn_heads', 2)
-            ffn_layers = kwargs.get('ffn_layers', 5)
-            ffn_expansion_factor = kwargs.get('ffn_expansion_factor', 2)
-            bias = kwargs.get('bias', True)
-            dropout_rate = kwargs.get('dropout_rate', 0.1)
-            encoder_ffn_layers = kwargs.get('encoder_ffn_layers', ffn_layers)
-            encoder_expansion_factor = kwargs.get('encoder_expansion_factor', ffn_expansion_factor)
-            encoder_dropout_rate = kwargs.get('encoder_dropout_rate', dropout_rate)
-            attn_ffn_layers = kwargs.get('attn_ffn_layers', 1)
-            attn_expansion_factor = kwargs.get('attn_expansion_factor', ffn_expansion_factor)
-            attn_dropout_rate = kwargs.get('attn_dropout_rate', dropout_rate)
-            theta = kwargs.get('theta', DEFAULT_THETA)
-            self.model = LongContextRecursiveSequential(input_dim=feature_dim, output_dim=1, bias=bias,
-                                                        encoder_layers=encoder_layers, attn_heads=attn_heads,
-                                                        recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
-                                                        ffn_layers=ffn_layers, ffn_heads=ffn_heads, ffn_expansion_factor=ffn_expansion_factor,
-                                                        dropout_rate=dropout_rate, encoder_ffn_layers=encoder_ffn_layers,
-                                                        encoder_expansion_factor=encoder_expansion_factor, encoder_dropout_rate=encoder_dropout_rate,
-                                                        attn_ffn_layers=attn_ffn_layers, attn_expansion_factor=attn_expansion_factor,
-                                                        attn_dropout_rate=attn_dropout_rate, theta=theta).initialize_weights()
-        logger.debug(f'Training Model: \n{self.model}')
-        loss_function = nn.BCELoss()
-        optimizer = optim.Adamax(self.model.parameters(), lr=learning_rate)
-        for epoch in range(num_epochs):
-            self.model.train()
-            total_loss = 0.0
-            for batch_texts, batch_labels in self.train_dataset_loader:
-                outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-                total_loss += loss.item()
-            if epoch % 3 == 0:
-                total_valid_loss = 0.0
-                for batch_texts, batch_labels in self.valid_dataset_loader:
-                    with torch.no_grad():
-                        self.model.eval()
-                        outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                        loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                        total_valid_loss += loss.item()
-                self.model.train()
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | "
-                             f"Train Loss: {total_loss:.4f} | "
-                             f"Valid Loss: {total_valid_loss:.4f}")
-                if total_valid_loss < valid_loss_threshold:
-                    break
-            else:
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {total_loss:.4f}")
-                if total_loss < train_loss_threshold:
-                    break
-        return self.model
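For reference, the deleted trainer was driven roughly as below; this is a sketch reconstructed from the removed code above, with LongTextEncoder constructor arguments left out because they are not part of this diff. 0.9.10 ships no in-package replacement, and the trainer entries are gone from RECORD as well.

# Pre-0.9.10 API, removed in this release.
from deeplotx.encoder.long_text_encoder import LongTextEncoder
from deeplotx.trainer import TextBinaryClassifierTrainer

encoder = LongTextEncoder()  # constructor args not shown in this diff
trainer = TextBinaryClassifierTrainer(long_text_encoder=encoder, batch_size=2, train_ratio=0.8)
model = trainer.train(positive_texts=['good text ...'], negative_texts=['bad text ...'],
                      num_epochs=30)  # returns a trained LongContextRecursiveSequential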
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/WHEEL
File without changes
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/licenses/LICENSE
File without changes
{deeplotx-0.9.8a0.dist-info → deeplotx-0.9.10.dist-info}/top_level.txt
File without changes