deeplotx 0.9.9__py3-none-any.whl → 0.9.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
deeplotx/__init__.py CHANGED
@@ -21,7 +21,6 @@ from .nn import (
     AutoRegression,
     LongContextAutoRegression
 )
-from .trainer import TextBinaryClassifierTrainer
 
 __AUTHOR__ = '吴子豪 / Vortez Wohl'
 __EMAIL__ = 'vortez.wohl@gmail.com'
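
The removed import corresponds to the deletion of the entire deeplotx.trainer subpackage (see the deleted files at the end of this diff). A minimal sketch of the downstream effect, assuming code that relied on the old top-level export:

    # Worked in deeplotx <= 0.9.9; raises ImportError in 0.9.10.
    try:
        from deeplotx import TextBinaryClassifierTrainer
    except ImportError:
        # No in-package replacement ships in 0.9.10; pin deeplotx<=0.9.9
        # if you still depend on the trainer.
        TextBinaryClassifierTrainer = None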
deeplotx/ner/bert_ner.py CHANGED
@@ -43,12 +43,15 @@ class BertNER(BaseNER):
 
     def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
+        # entity length cannot be longer than the whole seq
+        __max_search_backward = len(self.tokenizer.encode(s, add_special_tokens=False))
         s = f' {s.replace(NEW_LINE, BLANK * 2)} '
         raw_entities = self._ner_pipeline(s)
         entities = []
         for ent in raw_entities:
             entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
-        while True:
+        __search_backward = -2
+        while __search_backward < __max_search_backward:
             for i, ent in enumerate(entities):
                 if len(ent[0].strip()) < 1:
                     del entities[i]
@@ -65,6 +68,11 @@ class BertNER(BaseNER):
                     _continue = True
             if not _continue:
                 break
+            __search_backward += 1
+        # adjust all I-ENTs
+        for ent in entities:
+            if ent[1].upper().startswith('I'):
+                ent[1] = f'B{ent[1][1:]}'
         for ent in entities:
             ent[0] = ent[0].strip()
             if len(ent[0]) < 1:
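
Two behavioral changes here: the span-merging loop, formerly an unbounded while True:, is now capped by the input's token count (no entity can span more tokens than the whole sequence, so the loop is guaranteed to terminate), and once merging stops, any leftover I-* tag is promoted to B-* so every surviving entity opens its own span. A small sketch of that relabeling step on hypothetical data:

    # Hypothetical post-merge entities: [surface form, BIO tag, score]
    entities = [['Obama', 'I-PER', 0.99], ['Berlin', 'B-LOC', 0.97]]
    for ent in entities:
        if ent[1].upper().startswith('I'):
            ent[1] = f'B{ent[1][1:]}'  # 'I-PER' -> 'B-PER'
    print(entities)  # [['Obama', 'B-PER', 0.99], ['Berlin', 'B-LOC', 0.97]]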
deeplotx/ner/n2g/__init__.py CHANGED
@@ -13,13 +13,12 @@ from deeplotx.nn.base_neural_network import BaseNeuralNetwork
 
 __CACHE_DIR__ = os.path.join(__ROOT__, '.cache', '.n2g')
 ENCODER = Encoder(model_name_or_path='FacebookAI/xlm-roberta-base')
-BASE_MODEL = 'name2gender-base'
-SMALL_MODEL = 'name2gender-small'
+DEFAULT_MODEL = 'name2gender-small'
 _MIN_FILE_SIZE = 1024 * 5
 
 
 def download_model(model_name: str):
-    quiet = bool(os.getenv('QUIET_DOWNLOAD', False))
+    quiet = bool(os.getenv('N2G_QUIET_DOWNLOAD', False))
     os.makedirs(__CACHE_DIR__, exist_ok=True)
     _proxies = {
         'http': os.getenv('HTTP_PROXY', os.getenv('http_proxy')),
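
The quiet-download flag is now namespaced to N2G_QUIET_DOWNLOAD, so it no longer collides with a generic QUIET_DOWNLOAD variable set by other tools. Note that bool(os.getenv(...)) treats any non-empty string as truthy; a usage sketch:

    import os

    os.environ['N2G_QUIET_DOWNLOAD'] = '1'   # any non-empty value silences downloads
    # bool('0') is True in Python, so even '0' or 'false' enables quiet mode.
    # To restore progress output, remove the variable entirely:
    os.environ.pop('N2G_QUIET_DOWNLOAD', None)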
@@ -51,25 +50,20 @@ def download_model(model_name: str):
 
 
 def load_model(model_name: str = 'name2gender-small', dtype: torch.dtype | None = torch.float16) -> BaseNeuralNetwork:
     n2g_model = None
-    match model_name:
-        case 'name2gender-base' | 'n2g-base' | 'base':
-            download_model(BASE_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=12, num_layers=4,
-                                           head_layers=1, expansion_factor=2,
-                                           model_name=BASE_MODEL, dtype=dtype)
-        case 'name2gender-small' | 'n2g-base' | 'small':
-            download_model(SMALL_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=6, num_layers=2,
-                                           head_layers=1, expansion_factor=1.5,
-                                           model_name=SMALL_MODEL, dtype=dtype)
-        case _:
-            download_model(SMALL_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=6, num_layers=2,
-                                           head_layers=1, expansion_factor=1.5,
-                                           model_name=SMALL_MODEL, dtype=dtype)
+    if 'base' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=12, num_layers=4,
+                                       head_layers=1, expansion_factor=2,
+                                       model_name=model_name, dtype=dtype)
+    elif 'small' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=6, num_layers=2,
+                                       head_layers=1, expansion_factor=1.5,
+                                       model_name=model_name, dtype=dtype)
+    else:
+        raise FileNotFoundError(f"Model \"{model_name}\" doesn't exist.")
     return n2g_model.load(model_dir=__CACHE_DIR__)
 
 
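load_model now dispatches on a substring of the model name instead of an exact match, and unknown names raise FileNotFoundError rather than silently falling back to the small configuration as the old case _ branch did. (The old small branch also listed 'n2g-base' among its patterns, apparently a typo for 'n2g-small'.) A sketch of the new behavior:

    model = load_model('name2gender-base')    # 'base' substring -> 12 heads, 4 layers
    model = load_model('name2gender-small')   # 'small' substring -> 6 heads, 2 layers
    load_model('name2gender-large')           # no match -> FileNotFoundError
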
@@ -77,7 +71,7 @@ class Name2Gender:
     def __init__(self, model: BaseNeuralNetwork | None = None):
         super().__init__()
         if model is None:
-            model = load_model(SMALL_MODEL)
+            model = load_model(DEFAULT_MODEL)
         self._model = model
 
     def __call__(self, name: str, return_probability: bool = False, threshold: float = .5) -> tuple[Gender, float] | Gender:
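
With the constant rename, the default construction path is behaviorally unchanged: an argument-less Name2Gender() still loads the small checkpoint, now spelled DEFAULT_MODEL. A minimal usage sketch based on the signatures shown above:

    from deeplotx.ner.n2g import Name2Gender

    n2g = Name2Gender()   # loads DEFAULT_MODEL, i.e. 'name2gender-small'
    gender = n2g('Alice')                                   # -> Gender
    gender, prob = n2g('Alice', return_probability=True)    # -> (Gender, float)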
deeplotx-0.9.10.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.9.9
+Version: 0.9.10
 Summary: An out-of-the-box long-text NLP framework.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
deeplotx-0.9.10.dist-info/RECORD CHANGED
@@ -1,13 +1,13 @@
-deeplotx/__init__.py,sha256=x4CbJuW20al6S5KkKyrReeuwNGv04JGoqtGUyx-ACtg,1356
+deeplotx/__init__.py,sha256=0OWLsgXlStzwm0m9ScaoZvBnsx3a0xTmlzYBUgarl-g,1306
 deeplotx/encoder/__init__.py,sha256=BrsF5_4O-4pfihYF2wjExDOoAY-03kGJTH-Mhez4tsE,129
 deeplotx/encoder/encoder.py,sha256=uJswUSrYVDWP84HeCD40R9KgXGPUEa080konv7jEp8I,3539
 deeplotx/encoder/long_text_encoder.py,sha256=4oRa9FqfGNZ8-gq14UKuhDkZC0A1Xi-wKmbQsn-uZ58,3966
 deeplotx/encoder/longformer_encoder.py,sha256=mfAI_NE3QQZhvGHbZkP7S6g0Jj59wmLWQ9QW7HOjqm0,2876
 deeplotx/ner/__init__.py,sha256=Rss1pup9HzHZCG8U9ub8niWa9zRjWCy3Z7zg378KZQg,114
 deeplotx/ner/base_ner.py,sha256=pZTl50OrHH_FJm4rKp9iuixeOE6FX_AzgDXD32aXsN0,204
-deeplotx/ner/bert_ner.py,sha256=6al5iMc8gb3XuJf7dbJd-noXShLYmRoLXTd8L1-wYMM,8581
+deeplotx/ner/bert_ner.py,sha256=tfbM3CQBEpZsD0KYVA7GVNJax-7kzNOQgwlpo2S8h-c,8986
 deeplotx/ner/named_entity.py,sha256=c6XufIwH6yloJ-ccUjagf4mBl1XbbYDT8xyEJJ_-ZNs,269
-deeplotx/ner/n2g/__init__.py,sha256=b6fOWJVLaOCtoz8Qlp8NWQbL5lUSbn6H3-8fnVNIPi0,3940
+deeplotx/ner/n2g/__init__.py,sha256=L1IJ8W1nApzqHx2u7JMtPCLfABm5qKJvh_bHMWdvdLY,3538
 deeplotx/nn/__init__.py,sha256=YILwbxb-NHdiJjfOwBKH8F7PuZSDZSrGpTznPDucTro,710
 deeplotx/nn/attention.py,sha256=R-i-Rd7gnsh6hwXDeYfqLQOJvfSZIGfQbFzRlC91XLo,2879
 deeplotx/nn/auto_regression.py,sha256=j_R7WGPq9REngjpLuX5c0AaNqOpgGm2Vfrolw-XjWXw,877
@@ -27,14 +27,11 @@ deeplotx/similarity/__init__.py,sha256=s3u-KSgxjnMcWpIItKgXNltFMPQ7YY3CqsqHI-5F1
 deeplotx/similarity/distribution.py,sha256=wQGouuuW531pZeBRKBujXsdsoz4fDnPw7_GW81jwepc,1066
 deeplotx/similarity/set.py,sha256=zhGFxtSIXlWqvipBYzoiPahp4g0boAIoUiMfG0wl07A,686
 deeplotx/similarity/vector.py,sha256=WVbDHqykt-fvuILVrhUCtIFAOEjY_zvttrXGM9eylG0,1125
-deeplotx/trainer/__init__.py,sha256=Fl5DR9UecQc5VtBcczU9sx_HtPNoFohpuELOh-Jrsks,77
-deeplotx/trainer/base_trainer.py,sha256=z0MeAT-rRYmjeBXt0ckt7J1itYArR0Cx02wHesXUoZE,385
-deeplotx/trainer/text_binary_classification_trainer.py,sha256=TFxOX8rWU_zKliI9zm7F5ZH7snR2d-sk95s3pfTmm78,6601
 deeplotx/util/__init__.py,sha256=ppQwp3A4rhAWBQ7DEobIYxloIiythxxUswCn-7UrMeA,102
 deeplotx/util/hash.py,sha256=qbNU3RLBWGQYFVte9WZBAkZ1BkdjCXiKLDaKPN54KFk,662
 deeplotx/util/read_file.py,sha256=O9nieNgAGQ7Ct1EFxCdcgL6hVs5s2Vw_ItcUK6VeTwY,981
-deeplotx-0.9.9.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
-deeplotx-0.9.9.dist-info/METADATA,sha256=00OSBOncepG2pRHu6K_HOWksiQCOCOUEvqubydI_yXA,14442
-deeplotx-0.9.9.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-deeplotx-0.9.9.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
-deeplotx-0.9.9.dist-info/RECORD,,
+deeplotx-0.9.10.dist-info/licenses/LICENSE,sha256=IwGE9guuL-ryRPEKi6wFPI_zOhg7zDZbTYuHbSt_SAk,35823
+deeplotx-0.9.10.dist-info/METADATA,sha256=6m2igF02QAdr5xns8BamtyMfrgLwBsuej2jihR36dCE,14443
+deeplotx-0.9.10.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+deeplotx-0.9.10.dist-info/top_level.txt,sha256=hKg4pVDXZ-WWxkRfJFczRIll1Sv7VyfKCmzHLXbuh1U,9
+deeplotx-0.9.10.dist-info/RECORD,,
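
For readers verifying these entries: each RECORD row is path,hash,size, and the hash is the urlsafe-base64 SHA-256 digest of the file with trailing '=' padding stripped, per the wheel RECORD format. A small sketch to reproduce one locally:

    import base64
    import hashlib

    def record_hash(path: str) -> str:
        # urlsafe base64 of the SHA-256 digest, '=' padding removed
        digest = hashlib.sha256(open(path, 'rb').read()).digest()
        return 'sha256=' + base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')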
deeplotx/trainer/__init__.py DELETED
@@ -1 +0,0 @@
-from .text_binary_classification_trainer import TextBinaryClassifierTrainer
deeplotx/trainer/base_trainer.py DELETED
@@ -1,13 +0,0 @@
-from abc import abstractmethod
-
-from deeplotx.nn.base_neural_network import BaseNeuralNetwork
-
-
-class BaseTrainer(object):
-    def __init__(self, batch_size: int, train_ratio: float):
-        self._batch_size = batch_size
-        self._train_ratio = train_ratio
-        self.model = None
-
-    @abstractmethod
-    def train(self, *args, **kwargs) -> BaseNeuralNetwork: ...
deeplotx/trainer/text_binary_classification_trainer.py DELETED
@@ -1,103 +0,0 @@
-import logging
-from typing_extensions import override
-
-import torch
-from torch import nn, optim
-from torch.utils.data import DataLoader, TensorDataset
-
-from deeplotx.encoder.long_text_encoder import LongTextEncoder
-from deeplotx.nn.attention import DEFAULT_THETA
-from deeplotx.nn.long_context_recursive_sequential import LongContextRecursiveSequential
-from deeplotx.trainer.base_trainer import BaseTrainer
-
-logger = logging.getLogger('deeplotx.trainer')
-
-
-class TextBinaryClassifierTrainer(BaseTrainer):
-    def __init__(self, long_text_encoder: LongTextEncoder, batch_size: int = 2, train_ratio: float = 0.8):
-        super().__init__(batch_size=batch_size, train_ratio=train_ratio)
-        self._long_text_encoder = long_text_encoder
-        self.device = self._long_text_encoder.device
-        self.train_dataset_loader = None
-        self.valid_dataset_loader = None
-
-    @override
-    def train(self, positive_texts: list[str], negative_texts: list[str],
-              num_epochs: int, learning_rate: float = 2e-6, balancing_dataset: bool = True,
-              train_loss_threshold: float = 0.0, valid_loss_threshold: float = 0.0,
-              alpha: float = 1e-4, rho: float = 0.2, encoder_layers: int = 4, attn_heads: int = 6,
-              recursive_layers: int = 2, recursive_hidden_dim: int = 256, **kwargs) -> LongContextRecursiveSequential:
-        if balancing_dataset:
-            min_length = min(len(positive_texts), len(negative_texts))
-            positive_texts = positive_texts[:min_length]
-            negative_texts = negative_texts[:min_length]
-        all_texts = positive_texts + negative_texts
-        text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
-        feature_dim = text_embeddings[0].shape[-1]
-        dtype = text_embeddings[0].dtype
-        labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]
-                  + [torch.tensor([.0], dtype=dtype, device=self.device) for _ in range(len(negative_texts))])
-        inputs = torch.stack(text_embeddings).to(self.device)
-        labels = torch.stack(labels).to(self.device)
-        dataset_size = len(labels)
-        train_size = int(self._train_ratio * dataset_size)
-        train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
-        valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])
-        self.train_dataset_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
-        self.valid_dataset_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
-        if self.model is not None and self.model.in_features != feature_dim:
-            logger.warning("The dimension of features doesn't match. A new model instance will be created.")
-            self.model = None
-        if self.model is None:
-            ffn_heads = kwargs.get('ffn_heads', 2)
-            ffn_layers = kwargs.get('ffn_layers', 5)
-            ffn_expansion_factor = kwargs.get('ffn_expansion_factor', 2)
-            bias = kwargs.get('bias', True)
-            dropout_rate = kwargs.get('dropout_rate', 0.1)
-            encoder_ffn_layers = kwargs.get('encoder_ffn_layers', ffn_layers)
-            encoder_expansion_factor = kwargs.get('encoder_expansion_factor', ffn_expansion_factor)
-            encoder_dropout_rate = kwargs.get('encoder_dropout_rate', dropout_rate)
-            attn_ffn_layers = kwargs.get('attn_ffn_layers', 1)
-            attn_expansion_factor = kwargs.get('attn_expansion_factor', ffn_expansion_factor)
-            attn_dropout_rate = kwargs.get('attn_dropout_rate', dropout_rate)
-            theta = kwargs.get('theta', DEFAULT_THETA)
-            self.model = LongContextRecursiveSequential(input_dim=feature_dim, output_dim=1, bias=bias,
-                                                        encoder_layers=encoder_layers, attn_heads=attn_heads,
-                                                        recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
-                                                        ffn_layers=ffn_layers, ffn_heads=ffn_heads, ffn_expansion_factor=ffn_expansion_factor,
-                                                        dropout_rate=dropout_rate, encoder_ffn_layers=encoder_ffn_layers,
-                                                        encoder_expansion_factor=encoder_expansion_factor, encoder_dropout_rate=encoder_dropout_rate,
-                                                        attn_ffn_layers=attn_ffn_layers, attn_expansion_factor=attn_expansion_factor,
-                                                        attn_dropout_rate=attn_dropout_rate, theta=theta).initialize_weights()
-        logger.debug(f'Training Model: \n{self.model}')
-        loss_function = nn.BCELoss()
-        optimizer = optim.Adamax(self.model.parameters(), lr=learning_rate)
-        for epoch in range(num_epochs):
-            self.model.train()
-            total_loss = 0.0
-            for batch_texts, batch_labels in self.train_dataset_loader:
-                outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-                total_loss += loss.item()
-            if epoch % 3 == 0:
-                total_valid_loss = 0.0
-                for batch_texts, batch_labels in self.valid_dataset_loader:
-                    with torch.no_grad():
-                        self.model.eval()
-                        outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                        loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                        total_valid_loss += loss.item()
-                self.model.train()
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | "
-                             f"Train Loss: {total_loss:.4f} | "
-                             f"Valid Loss: {total_valid_loss:.4f}")
-                if total_valid_loss < valid_loss_threshold:
-                    break
-            else:
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {total_loss:.4f}")
-            if total_loss < train_loss_threshold:
-                break
-        return self.model
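
Since 0.9.10 ships no replacement for this trainer, downstream code now has to bring its own loop. A minimal sketch mirroring the removed trainer's core choices (sigmoid head, BCELoss, Adamax), assuming you encode texts into tensors yourself and skipping the elastic-net term and validation logic:

    import torch
    from torch import nn, optim
    from torch.utils.data import DataLoader, TensorDataset

    def train_binary_classifier(model: nn.Module, inputs: torch.Tensor, labels: torch.Tensor,
                                num_epochs: int, learning_rate: float = 2e-6,
                                batch_size: int = 2) -> nn.Module:
        loader = DataLoader(TensorDataset(inputs, labels), batch_size=batch_size, shuffle=True)
        loss_fn = nn.BCELoss()
        optimizer = optim.Adamax(model.parameters(), lr=learning_rate)
        for _ in range(num_epochs):
            model.train()
            for x, y in loader:
                out = torch.sigmoid(model(x))   # model is assumed to return logits
                loss = loss_fn(out, y)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        return model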