deeplotx 0.9.9__tar.gz → 0.9.11__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. {deeplotx-0.9.9 → deeplotx-0.9.11}/PKG-INFO +2 -2
  2. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/__init__.py +0 -1
  3. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/bert_ner.py +9 -1
  4. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/n2g/__init__.py +17 -23
  5. deeplotx-0.9.11/deeplotx/util/__init__.py +2 -0
  6. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/PKG-INFO +2 -2
  7. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/SOURCES.txt +1 -6
  8. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/requires.txt +1 -1
  9. {deeplotx-0.9.9 → deeplotx-0.9.11}/pyproject.toml +2 -2
  10. deeplotx-0.9.9/deeplotx/trainer/__init__.py +0 -1
  11. deeplotx-0.9.9/deeplotx/trainer/base_trainer.py +0 -13
  12. deeplotx-0.9.9/deeplotx/trainer/text_binary_classification_trainer.py +0 -103
  13. deeplotx-0.9.9/deeplotx/util/__init__.py +0 -2
  14. deeplotx-0.9.9/deeplotx/util/hash.py +0 -29
  15. deeplotx-0.9.9/deeplotx/util/read_file.py +0 -32
  16. {deeplotx-0.9.9 → deeplotx-0.9.11}/LICENSE +0 -0
  17. {deeplotx-0.9.9 → deeplotx-0.9.11}/README.md +0 -0
  18. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/__init__.py +0 -0
  19. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/encoder.py +0 -0
  20. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/long_text_encoder.py +0 -0
  21. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/encoder/longformer_encoder.py +0 -0
  22. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/__init__.py +0 -0
  23. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/base_ner.py +0 -0
  24. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/named_entity.py +0 -0
  25. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/__init__.py +0 -0
  26. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/attention.py +0 -0
  27. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/auto_regression.py +0 -0
  28. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/base_neural_network.py +0 -0
  29. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/feed_forward.py +0 -0
  30. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/linear_regression.py +0 -0
  31. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/logistic_regression.py +0 -0
  32. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/long_context_auto_regression.py +0 -0
  33. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  34. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/multi_head_attention.py +0 -0
  35. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/multi_head_feed_forward.py +0 -0
  36. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/recursive_sequential.py +0 -0
  37. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/roformer_encoder.py +0 -0
  38. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/rope.py +0 -0
  39. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/nn/softmax_regression.py +0 -0
  40. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/__init__.py +0 -0
  41. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/distribution.py +0 -0
  42. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/set.py +0 -0
  43. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/similarity/vector.py +0 -0
  44. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/dependency_links.txt +0 -0
  45. {deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/top_level.txt +0 -0
  46. {deeplotx-0.9.9 → deeplotx-0.9.11}/setup.cfg +0 -0
{deeplotx-0.9.9 → deeplotx-0.9.11}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.9.9
+Version: 0.9.11
 Summary: An out-of-the-box long-text NLP framework.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -15,7 +15,7 @@ Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions
-Requires-Dist: vortezwohl>=0.0.8
+Requires-Dist: vortezwohl>=0.0.10
 Requires-Dist: name4py>=0.1.4
 Dynamic: license-file
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/__init__.py
@@ -21,7 +21,6 @@ from .nn import (
     AutoRegression,
     LongContextAutoRegression
 )
-from .trainer import TextBinaryClassifierTrainer

 __AUTHOR__ = '吴子豪 / Vortez Wohl'
 __EMAIL__ = 'vortez.wohl@gmail.com'
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/bert_ner.py
@@ -43,12 +43,15 @@ class BertNER(BaseNER):

     def _fast_extract(self, s: str, with_gender: bool = True, prob_threshold: float = .0) -> list[NamedEntity]:
         assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
+        # entity length cannot be longer than the whole seq
+        __max_search_backward = len(self.tokenizer.encode(s, add_special_tokens=False))
         s = f' {s.replace(NEW_LINE, BLANK * 2)} '
         raw_entities = self._ner_pipeline(s)
         entities = []
         for ent in raw_entities:
             entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
-        while True:
+        __search_backward = -2
+        while __search_backward < __max_search_backward:
             for i, ent in enumerate(entities):
                 if len(ent[0].strip()) < 1:
                     del entities[i]
@@ -65,6 +68,11 @@ class BertNER(BaseNER):
                     _continue = True
             if not _continue:
                 break
+            __search_backward += 1
+        # adjust all I-ENTs
+        for ent in entities:
+            if ent[1].upper().startswith('I'):
+                ent[1] = f'B{ent[1][1:]}'
         for ent in entities:
             ent[0] = ent[0].strip()
             if len(ent[0]) < 1:
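Reviewer note: the two hunks above replace an unbounded `while True` merge loop with one capped by the token length of the input (an entity can never span more tokens than the whole sequence), and then normalize leftover inside-tags to begin-tags once merging stops. A minimal standalone sketch of that tag normalization; the entity tuples here are illustrative, but the `[text, tag, score]` layout mirrors how `_fast_extract` builds them:

```python
# Sketch of the I- to B- tag normalization added in 0.9.11.
entities = [['Alice', 'I-PER', 0.98], ['Acme', 'B-ORG', 0.95]]  # hypothetical data

for ent in entities:
    if ent[1].upper().startswith('I'):
        # 'I-PER' -> 'B-PER': replace the leading 'I', keep the rest of the tag
        ent[1] = f'B{ent[1][1:]}'

print(entities)  # [['Alice', 'B-PER', 0.98], ['Acme', 'B-ORG', 0.95]]
```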
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx/ner/n2g/__init__.py
@@ -13,13 +13,12 @@ from deeplotx.nn.base_neural_network import BaseNeuralNetwork

 __CACHE_DIR__ = os.path.join(__ROOT__, '.cache', '.n2g')
 ENCODER = Encoder(model_name_or_path='FacebookAI/xlm-roberta-base')
-BASE_MODEL = 'name2gender-base'
-SMALL_MODEL = 'name2gender-small'
+DEFAULT_MODEL = 'name2gender-small'
 _MIN_FILE_SIZE = 1024 * 5


 def download_model(model_name: str):
-    quiet = bool(os.getenv('QUIET_DOWNLOAD', False))
+    quiet = bool(os.getenv('N2G_QUIET_DOWNLOAD', False))
     os.makedirs(__CACHE_DIR__, exist_ok=True)
     _proxies = {
         'http': os.getenv('HTTP_PROXY', os.getenv('http_proxy')),
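Reviewer note: the quiet-download switch is renamed from `QUIET_DOWNLOAD` to `N2G_QUIET_DOWNLOAD`, scoping it to the n2g downloader. Because the flag is read with `bool(os.getenv(...))`, any non-empty string enables it, `'0'` included. A hedged usage sketch:

```python
import os

# Suppress n2g model-download output. 0.9.11 reads N2G_QUIET_DOWNLOAD
# (0.9.9 read QUIET_DOWNLOAD); any non-empty value counts as truthy here.
os.environ['N2G_QUIET_DOWNLOAD'] = '1'
```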
@@ -51,25 +50,20 @@ def download_model(model_name: str):

 def load_model(model_name: str = 'name2gender-small', dtype: torch.dtype | None = torch.float16) -> BaseNeuralNetwork:
     n2g_model = None
-    match model_name:
-        case 'name2gender-base' | 'n2g-base' | 'base':
-            download_model(BASE_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=12, num_layers=4,
-                                           head_layers=1, expansion_factor=2,
-                                           model_name=BASE_MODEL, dtype=dtype)
-        case 'name2gender-small' | 'n2g-base' | 'small':
-            download_model(SMALL_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=6, num_layers=2,
-                                           head_layers=1, expansion_factor=1.5,
-                                           model_name=SMALL_MODEL, dtype=dtype)
-        case _:
-            download_model(SMALL_MODEL)
-            n2g_model = LogisticRegression(input_dim=768, output_dim=1,
-                                           num_heads=6, num_layers=2,
-                                           head_layers=1, expansion_factor=1.5,
-                                           model_name=SMALL_MODEL, dtype=dtype)
+    if 'base' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=12, num_layers=4,
+                                       head_layers=1, expansion_factor=2,
+                                       model_name=model_name, dtype=dtype)
+    elif 'small' in model_name.lower():
+        download_model(model_name)
+        n2g_model = LogisticRegression(input_dim=768, output_dim=1,
+                                       num_heads=6, num_layers=2,
+                                       head_layers=1, expansion_factor=1.5,
+                                       model_name=model_name, dtype=dtype)
+    else:
+        raise FileNotFoundError(f"Model \"{model_name}\" doesn't exists.")
     return n2g_model.load(model_dir=__CACHE_DIR__)
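Reviewer note: this rewrite also changes failure behavior. The 0.9.9 `match` silently fell back to the small model for unknown names (and its small branch appears to match `'n2g-base'` by typo), whereas 0.9.11 dispatches on substrings and raises `FileNotFoundError` for anything containing neither `'base'` nor `'small'`. A hedged sketch of the new dispatch (model weights are fetched on first use):

```python
from deeplotx.ner.n2g import load_model  # module path per the file list above

small = load_model('name2gender-small')   # 'small' substring -> 6-head, 2-layer config
base = load_model('name2gender-base')     # 'base' substring  -> 12-head, 4-layer config
try:
    load_model('name2gender-tiny')         # no 'base'/'small' substring
except FileNotFoundError as e:
    print(e)  # Model "name2gender-tiny" doesn't exists.
```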
@@ -77,7 +71,7 @@ class Name2Gender:
     def __init__(self, model: BaseNeuralNetwork | None = None):
         super().__init__()
         if model is None:
-            model = load_model(SMALL_MODEL)
+            model = load_model(DEFAULT_MODEL)
         self._model = model

     def __call__(self, name: str, return_probability: bool = False, threshold: float = .5) -> tuple[Gender, float] | Gender:
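Reviewer note: with the constant rename, a bare `Name2Gender()` now resolves through `DEFAULT_MODEL` ('name2gender-small'). A hedged usage sketch; the return types follow the `__call__` signature shown above, and the `Gender` type presumably comes from the name4py dependency:

```python
from deeplotx.ner.n2g import Name2Gender  # import path assumed from the file list

n2g = Name2Gender()           # no model given -> load_model(DEFAULT_MODEL)
gender = n2g('Alice')         # -> Gender
gender, prob = n2g('Alice', return_probability=True, threshold=.5)  # -> (Gender, float)
```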
deeplotx-0.9.11/deeplotx/util/__init__.py (new file)
@@ -0,0 +1,2 @@
+from vortezwohl.crypt.hash import *
+from vortezwohl.io import *
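Reviewer note: this new `deeplotx/util/__init__.py` replaces the deleted local helpers (see the removed `hash.py` and `read_file.py` at the end of this diff) with star imports from vortezwohl, which is why the dependency floor moves to `vortezwohl>=0.0.10`. Assuming vortezwohl exports same-named helpers (not verifiable from this diff), old call sites would keep working:

```python
# Hedged sketch: these names are assumed to come from vortezwohl via the star
# imports above, mirroring the deleted deeplotx.util helpers (md5/sha256 from
# vortezwohl.crypt.hash, read_file/write_file from vortezwohl.io).
from deeplotx.util import md5, sha256, read_file, write_file

print(md5('deeplotx'))                       # hex digest of the UTF-8 bytes
path = write_file('hello', '/tmp/demo.txt')  # returns the written path (assumed)
print(read_file(path))                       # 'hello'
```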
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: deeplotx
-Version: 0.9.9
+Version: 0.9.11
 Summary: An out-of-the-box long-text NLP framework.
 Requires-Python: >=3.10
 Description-Content-Type: text/markdown
@@ -15,7 +15,7 @@ Requires-Dist: tiktoken
 Requires-Dist: torch
 Requires-Dist: transformers
 Requires-Dist: typing-extensions
-Requires-Dist: vortezwohl>=0.0.8
+Requires-Dist: vortezwohl>=0.0.10
 Requires-Dist: name4py>=0.1.4
 Dynamic: license-file
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/SOURCES.txt
@@ -35,9 +35,4 @@ deeplotx/similarity/__init__.py
 deeplotx/similarity/distribution.py
 deeplotx/similarity/set.py
 deeplotx/similarity/vector.py
-deeplotx/trainer/__init__.py
-deeplotx/trainer/base_trainer.py
-deeplotx/trainer/text_binary_classification_trainer.py
-deeplotx/util/__init__.py
-deeplotx/util/hash.py
-deeplotx/util/read_file.py
+deeplotx/util/__init__.py
{deeplotx-0.9.9 → deeplotx-0.9.11}/deeplotx.egg-info/requires.txt
@@ -8,5 +8,5 @@ tiktoken
 torch
 transformers
 typing-extensions
-vortezwohl>=0.0.8
+vortezwohl>=0.0.10
 name4py>=0.1.4
{deeplotx-0.9.9 → deeplotx-0.9.11}/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "deeplotx"
-version = '0.9.9'
+version = '0.9.11'
 description = "An out-of-the-box long-text NLP framework."
 readme = "README.md"
 requires-python = ">=3.10"
@@ -15,6 +15,6 @@ dependencies = [
     "torch",
     "transformers",
     "typing-extensions",
-    "vortezwohl>=0.0.8",
+    "vortezwohl>=0.0.10",
     "name4py>=0.1.4",
 ]
deeplotx-0.9.9/deeplotx/trainer/__init__.py (deleted)
@@ -1 +0,0 @@
-from .text_binary_classification_trainer import TextBinaryClassifierTrainer
deeplotx-0.9.9/deeplotx/trainer/base_trainer.py (deleted)
@@ -1,13 +0,0 @@
-from abc import abstractmethod
-
-from deeplotx.nn.base_neural_network import BaseNeuralNetwork
-
-
-class BaseTrainer(object):
-    def __init__(self, batch_size: int, train_ratio: float):
-        self._batch_size = batch_size
-        self._train_ratio = train_ratio
-        self.model = None
-
-    @abstractmethod
-    def train(self, *args, **kwargs) -> BaseNeuralNetwork: ...
deeplotx-0.9.9/deeplotx/trainer/text_binary_classification_trainer.py (deleted)
@@ -1,103 +0,0 @@
-import logging
-from typing_extensions import override
-
-import torch
-from torch import nn, optim
-from torch.utils.data import DataLoader, TensorDataset
-
-from deeplotx.encoder.long_text_encoder import LongTextEncoder
-from deeplotx.nn.attention import DEFAULT_THETA
-from deeplotx.nn.long_context_recursive_sequential import LongContextRecursiveSequential
-from deeplotx.trainer.base_trainer import BaseTrainer
-
-logger = logging.getLogger('deeplotx.trainer')
-
-
-class TextBinaryClassifierTrainer(BaseTrainer):
-    def __init__(self, long_text_encoder: LongTextEncoder, batch_size: int = 2, train_ratio: float = 0.8):
-        super().__init__(batch_size=batch_size, train_ratio=train_ratio)
-        self._long_text_encoder = long_text_encoder
-        self.device = self._long_text_encoder.device
-        self.train_dataset_loader = None
-        self.valid_dataset_loader = None
-
-    @override
-    def train(self, positive_texts: list[str], negative_texts: list[str],
-              num_epochs: int, learning_rate: float = 2e-6, balancing_dataset: bool = True,
-              train_loss_threshold: float = 0.0, valid_loss_threshold: float = 0.0,
-              alpha: float = 1e-4, rho: float = 0.2, encoder_layers: int = 4, attn_heads: int = 6,
-              recursive_layers: int = 2, recursive_hidden_dim: int = 256, **kwargs) -> LongContextRecursiveSequential:
-        if balancing_dataset:
-            min_length = min(len(positive_texts), len(negative_texts))
-            positive_texts = positive_texts[:min_length]
-            negative_texts = negative_texts[:min_length]
-        all_texts = positive_texts + negative_texts
-        text_embeddings = [self._long_text_encoder.encode(x, flatten=False) for x in all_texts]
-        feature_dim = text_embeddings[0].shape[-1]
-        dtype = text_embeddings[0].dtype
-        labels = ([torch.tensor([1.], dtype=dtype, device=self.device) for _ in range(len(positive_texts))]
-                  + [torch.tensor([.0], dtype=dtype, device=self.device) for _ in range(len(negative_texts))])
-        inputs = torch.stack(text_embeddings).to(self.device)
-        labels = torch.stack(labels).to(self.device)
-        dataset_size = len(labels)
-        train_size = int(self._train_ratio * dataset_size)
-        train_dataset = TensorDataset(inputs[:train_size], labels[:train_size])
-        valid_dataset = TensorDataset(inputs[train_size:], labels[train_size:])
-        self.train_dataset_loader = DataLoader(train_dataset, batch_size=self._batch_size, shuffle=True)
-        self.valid_dataset_loader = DataLoader(valid_dataset, batch_size=self._batch_size, shuffle=True)
-        if self.model is not None and self.model.in_features != feature_dim:
-            logger.warning("The dimension of features doesn't match. A new model instance will be created.")
-            self.model = None
-        if self.model is None:
-            ffn_heads = kwargs.get('ffn_heads', 2)
-            ffn_layers = kwargs.get('ffn_layers', 5)
-            ffn_expansion_factor = kwargs.get('ffn_expansion_factor', 2)
-            bias = kwargs.get('bias', True)
-            dropout_rate = kwargs.get('dropout_rate', 0.1)
-            encoder_ffn_layers = kwargs.get('encoder_ffn_layers', ffn_layers)
-            encoder_expansion_factor = kwargs.get('encoder_expansion_factor', ffn_expansion_factor)
-            encoder_dropout_rate = kwargs.get('encoder_dropout_rate', dropout_rate)
-            attn_ffn_layers = kwargs.get('attn_ffn_layers', 1)
-            attn_expansion_factor = kwargs.get('attn_expansion_factor', ffn_expansion_factor)
-            attn_dropout_rate = kwargs.get('attn_dropout_rate', dropout_rate)
-            theta = kwargs.get('theta', DEFAULT_THETA)
-            self.model = LongContextRecursiveSequential(input_dim=feature_dim, output_dim=1, bias=bias,
-                                                        encoder_layers=encoder_layers, attn_heads=attn_heads,
-                                                        recursive_layers=recursive_layers, recursive_hidden_dim=recursive_hidden_dim,
-                                                        ffn_layers=ffn_layers, ffn_heads=ffn_heads, ffn_expansion_factor=ffn_expansion_factor,
-                                                        dropout_rate=dropout_rate, encoder_ffn_layers=encoder_ffn_layers,
-                                                        encoder_expansion_factor=encoder_expansion_factor, encoder_dropout_rate=encoder_dropout_rate,
-                                                        attn_ffn_layers=attn_ffn_layers, attn_expansion_factor=attn_expansion_factor,
-                                                        attn_dropout_rate=attn_dropout_rate, theta=theta).initialize_weights()
-        logger.debug(f'Training Model: \n{self.model}')
-        loss_function = nn.BCELoss()
-        optimizer = optim.Adamax(self.model.parameters(), lr=learning_rate)
-        for epoch in range(num_epochs):
-            self.model.train()
-            total_loss = 0.0
-            for batch_texts, batch_labels in self.train_dataset_loader:
-                outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-                total_loss += loss.item()
-            if epoch % 3 == 0:
-                total_valid_loss = 0.0
-                for batch_texts, batch_labels in self.valid_dataset_loader:
-                    with torch.no_grad():
-                        self.model.eval()
-                        outputs = torch.sigmoid(self.model.forward(batch_texts, self.model.initial_state(batch_texts.shape[0]))[0])
-                        loss = loss_function(outputs, batch_labels) + self.model.elastic_net(alpha=alpha, rho=rho)
-                        total_valid_loss += loss.item()
-                    self.model.train()
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | "
-                             f"Train Loss: {total_loss:.4f} | "
-                             f"Valid Loss: {total_valid_loss:.4f}")
-                if total_valid_loss < valid_loss_threshold:
-                    break
-            else:
-                logger.debug(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {total_loss:.4f}")
-                if total_loss < train_loss_threshold:
-                    break
-        return self.model
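Reviewer note: the whole trainer package is removed in 0.9.11 with no replacement visible in this diff, and the top-level import was dropped from deeplotx/__init__.py accordingly. Downstream code that still uses it needs a version pin or an import guard; a hedged sketch:

```python
# TextBinaryClassifierTrainer exists only in deeplotx <= 0.9.9; 0.9.11 removes it
# without a replacement, so guard the import (or pin deeplotx==0.9.9).
try:
    from deeplotx import TextBinaryClassifierTrainer
except ImportError:
    TextBinaryClassifierTrainer = None  # not available on 0.9.11+
```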
deeplotx-0.9.9/deeplotx/util/__init__.py (deleted)
@@ -1,2 +0,0 @@
-from .hash import md5, sha1, sha256, sha512
-from .read_file import read_file, write_file, get_files
deeplotx-0.9.9/deeplotx/util/hash.py (deleted)
@@ -1,29 +0,0 @@
-import hashlib
-
-
-def md5(text: str) -> str:
-    _hash = hashlib.md5()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
-
-
-def sha1(text: str) -> str:
-    _hash = hashlib.sha1()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
-
-
-def sha256(text: str) -> str:
-    _hash = hashlib.sha256()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
-
-
-def sha512(text: str) -> str:
-    _hash = hashlib.sha512()
-    text_bytes = text.encode('utf-8')
-    _hash.update(text_bytes)
-    return _hash.hexdigest()
deeplotx-0.9.9/deeplotx/util/read_file.py (deleted)
@@ -1,32 +0,0 @@
-import os
-
-
-def read_file(path: str, encoding: str = 'utf-8') -> str:
-    try:
-        with open(path, mode='r', encoding=encoding) as f:
-            return f.read()
-    except UnicodeDecodeError:
-        try:
-            with open(path, mode='r', encoding='gbk') as f:
-                return f.read()
-        except UnicodeDecodeError:
-            pass
-
-
-def write_file(content: str | bytes, path: str, encoding: str = 'utf-8') -> str:
-    os.makedirs(os.path.dirname(path), exist_ok=True)
-    if isinstance(content, bytes):
-        with open(path, mode='wb') as f:
-            f.write(content)
-        return path
-    with open(path, mode='w', encoding=encoding) as f:
-        f.write(content)
-    return path
-
-
-def get_files(path: str) -> list:
-    if os.path.exists(path):
-        entries = os.listdir(path)
-        return [os.path.join(path, entry) for entry in entries if os.path.isfile(os.path.join(path, entry))]
-    else:
-        return []