deeplotx 0.2.20__tar.gz → 0.3.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. {deeplotx-0.2.20 → deeplotx-0.3.1}/PKG-INFO +9 -3
  2. {deeplotx-0.2.20 → deeplotx-0.3.1}/README.md +8 -2
  3. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/__init__.py +1 -1
  4. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/encoder/__init__.py +1 -0
  5. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/encoder/bert_encoder.py +7 -6
  6. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/encoder/long_text_encoder.py +3 -2
  7. deeplotx-0.3.1/deeplotx/encoder/longformer_encoder.py +30 -0
  8. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/nn/base_neural_network.py +3 -0
  9. deeplotx-0.3.1/deeplotx/similarity/__init__.py +17 -0
  10. deeplotx-0.3.1/deeplotx/similarity/distribution.py +32 -0
  11. deeplotx-0.3.1/deeplotx/similarity/set.py +19 -0
  12. deeplotx-0.3.1/deeplotx/similarity/vector.py +36 -0
  13. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx.egg-info/PKG-INFO +9 -3
  14. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx.egg-info/SOURCES.txt +5 -0
  15. {deeplotx-0.2.20 → deeplotx-0.3.1}/pyproject.toml +1 -1
  16. {deeplotx-0.2.20 → deeplotx-0.3.1}/LICENSE +0 -0
  17. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/nn/__init__.py +0 -0
  18. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/nn/linear_regression.py +0 -0
  19. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/nn/logistic_regression.py +0 -0
  20. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/nn/softmax_regression.py +0 -0
  21. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/trainer/__init__.py +0 -0
  22. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/trainer/base_trainer.py +0 -0
  23. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
  24. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/util/__init__.py +0 -0
  25. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/util/hash.py +0 -0
  26. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx/util/read_file.py +0 -0
  27. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx.egg-info/dependency_links.txt +0 -0
  28. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx.egg-info/requires.txt +0 -0
  29. {deeplotx-0.2.20 → deeplotx-0.3.1}/deeplotx.egg-info/top_level.txt +0 -0
  30. {deeplotx-0.2.20 → deeplotx-0.3.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.2.20
3
+ Version: 0.3.1
4
4
  Summary: Easy-2-use long text classifier trainers.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -21,13 +21,19 @@ Dynamic: license-file
21
21
  - Install with pip
22
22
 
23
23
  ```
24
- pip install git+https://github.com/vortezwohl/DeepLoTX.git
24
+ pip install -U deeplotx
25
25
  ```
26
26
 
27
27
  - Install with uv
28
28
 
29
29
  ```
30
- uv add git+https://github.com/vortezwohl/DeepLoTX.git
30
+ uv add -U deeplotx
31
+ ```
32
+
33
+ - Install from github
34
+
35
+ ```
36
+ pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
31
37
  ```
32
38
 
33
39
  ## Quick Start
@@ -5,13 +5,19 @@
5
5
  - Install with pip
6
6
 
7
7
  ```
8
- pip install git+https://github.com/vortezwohl/DeepLoTX.git
8
+ pip install -U deeplotx
9
9
  ```
10
10
 
11
11
  - Install with uv
12
12
 
13
13
  ```
14
- uv add git+https://github.com/vortezwohl/DeepLoTX.git
14
+ uv add -U deeplotx
15
+ ```
16
+
17
+ - Install from github
18
+
19
+ ```
20
+ pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
15
21
  ```
16
22
 
17
23
  ## Quick Start
@@ -3,7 +3,7 @@ import os
3
3
 
4
4
  __ROOT__ = os.path.dirname(os.path.abspath(__file__))
5
5
 
6
- from .encoder import BertEncoder, LongTextEncoder
6
+ from .encoder import BertEncoder, LongTextEncoder, LongformerEncoder
7
7
  from .nn import LinearRegression, LogisticRegression, SoftmaxRegression
8
8
  from .trainer import TextBinaryClassifierTrainer
9
9
 
@@ -1,2 +1,3 @@
1
1
  from .bert_encoder import BertEncoder
2
2
  from .long_text_encoder import LongTextEncoder
3
+ from .longformer_encoder import LongformerEncoder
@@ -13,19 +13,20 @@ class BertEncoder(nn.Module):
13
13
  def __init__(self, model_name_or_path: str = DEFAULT_BERT):
14
14
  super().__init__()
15
15
  self.tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
16
- cache_dir=CACHE_PATH)
16
+ cache_dir=CACHE_PATH, _from_auto=True)
17
17
  self.bert = BertModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
18
- cache_dir=CACHE_PATH)
18
+ cache_dir=CACHE_PATH, _from_auto=True)
19
+ self.embed_dim = self.bert.config.max_position_embeddings
19
20
 
20
- def forward(self, input_ids, attention_mask: torch.Tensor) -> torch.Tensor:
21
+ def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
21
22
  def _encoder(_input_tup: tuple[torch.Tensor, torch.Tensor]) -> torch.Tensor:
22
23
  return self.bert.forward(_input_tup[0], attention_mask=_input_tup[1]).last_hidden_state[:, 0, :]
23
24
 
24
- num_chunks = math.ceil(input_ids.shape[-1] / 512)
25
+ num_chunks = math.ceil(input_ids.shape[-1] / self.embed_dim)
25
26
  chunks = chunk_results = []
26
27
  for i in range(num_chunks):
27
- start_idx = i * 512
28
- end_idx = min(start_idx + 512, input_ids.shape[-1])
28
+ start_idx = i * self.embed_dim
29
+ end_idx = min(start_idx + self.embed_dim, input_ids.shape[-1])
29
30
  chunks.append((input_ids[:, start_idx: end_idx], attention_mask[:, start_idx: end_idx]))
30
31
  ori_mode = self.bert.training
31
32
  self.bert.eval()
@@ -24,7 +24,7 @@ class LongTextEncoder(BertEncoder):
24
24
  return input_tup[0], super().forward(input_tup[1], attention_mask=input_tup[2])
25
25
 
26
26
  @override
27
- def encode(self, text: str) -> torch.Tensor:
27
+ def encode(self, text: str, use_cache: bool = True) -> torch.Tensor:
28
28
  _text_to_show = text.replace("\n", str())
29
29
  logger.debug(f'Embedding \"{_text_to_show if len(_text_to_show) < 128 else _text_to_show[:128] + "..."}\".')
30
30
  # read cache
@@ -58,5 +58,6 @@ class LongTextEncoder(BertEncoder):
58
58
  fin_emb_tensor = torch.cat((fin_emb_tensor.detach().clone(), emb.detach().clone()), dim=-1)
59
59
  fin_emb_tensor = fin_emb_tensor.squeeze()
60
60
  # write cache
61
- self._cache[_text_hash] = fin_emb_tensor
61
+ if use_cache:
62
+ self._cache[_text_hash] = fin_emb_tensor
62
63
  return fin_emb_tensor
@@ -0,0 +1,30 @@
1
import os

import torch
from torch import nn
from transformers import LongformerTokenizer, LongformerModel

from deeplotx import __ROOT__
6
+
7
# Directory handed to `from_pretrained(cache_dir=...)` for downloaded files.
# Built with os.path.join so the separator matches the running platform; the
# previous hard-coded backslash only formed a sub-directory path on Windows.
CACHE_PATH = os.path.join(__ROOT__, '.cache')
# Checkpoint used when the caller does not name one explicitly.
DEFAULT_LONGFORMER = 'allenai/longformer-base-4096'
9
+
10
+
11
class LongformerEncoder(nn.Module):
    """Encode text into a single vector with a pretrained Longformer model.

    The tokenizer and the model are both loaded with ``from_pretrained`` and
    cached under ``CACHE_PATH``. The model is stored as ``self.bert``.
    """

    def __init__(self, model_name_or_path: str = DEFAULT_LONGFORMER):
        """Load the tokenizer and model.

        Args:
            model_name_or_path: Checkpoint identifier or local path passed to
                ``from_pretrained``. Defaults to ``DEFAULT_LONGFORMER``.
        """
        super().__init__()
        self.tokenizer = LongformerTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                             cache_dir=CACHE_PATH, _from_auto=True)
        self.bert = LongformerModel.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
                                                    cache_dir=CACHE_PATH, _from_auto=True)

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """Run the model without gradients and return the position-0 hidden state.

        The model is switched to eval mode for the call and put back into
        whatever mode it was in before, even if the forward pass raises.

        Args:
            input_ids: Token ids, indexed as ``[batch, position]``.
            attention_mask: Mask passed through to the model unchanged.

        Returns:
            ``last_hidden_state[:, 0, :]`` of the model output
            (presumably the leading special-token embedding -- confirm).
        """
        ori_mode = self.bert.training
        self.bert.eval()
        try:
            with torch.no_grad():
                return self.bert.forward(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        finally:
            # Always restore the caller's train/eval mode, including on error.
            self.bert.train(mode=ori_mode)

    def encode(self, text: str) -> torch.Tensor:
        """Tokenize ``text`` as a batch of one and return its squeezed embedding.

        NOTE(review): the token sequence is not truncated here; inputs longer
        than the model supports are passed through as-is -- confirm callers
        guarantee a suitable length.
        """
        _input_ids = torch.tensor([self.tokenizer.encode(text)], dtype=torch.long)
        # Every position is a real token, so the mask is all ones.
        _att_mask = torch.tensor([[1] * _input_ids.shape[-1]], dtype=torch.int)
        return self.forward(_input_ids, _att_mask).squeeze()
@@ -1,3 +1,5 @@
1
+ from abc import abstractmethod
2
+
1
3
  import torch
2
4
  from torch import nn
3
5
 
@@ -28,6 +30,7 @@ class BaseNeuralNetwork(nn.Module):
28
30
  def elastic_net(self, alpha: float = 1e-4, rho: float = 0.5) -> torch.Tensor:
29
31
  return alpha * (rho * self.l1(_lambda=1.) + (1 - rho) * self.l2(_lambda=1.))
30
32
 
33
+ @abstractmethod
31
34
  def forward(self, x) -> torch.Tensor: ...
32
35
 
33
36
  def predict(self, x) -> torch.Tensor:
@@ -0,0 +1,17 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+ bias = 1e-12
5
+
6
+
7
+ def ndarray_adapter(*args) -> tuple | np.ndarray:
8
+ args = list(args)
9
+ for i, arg in enumerate(args):
10
+ match arg.__class__:
11
+ case torch.Tensor:
12
+ args[i] = arg.detach().cpu().numpy()
13
+ case List:
14
+ args[i] = np.asarray(arg)
15
+ if len(args) > 1:
16
+ return tuple(args)
17
+ return args[0]
@@ -0,0 +1,32 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+ from deeplotx.similarity import bias, ndarray_adapter
5
+
6
+
7
def cross_entropy(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
    """Return the cross-entropy of ``q`` measured against ``p``.

    Both inputs are converted with ``ndarray_adapter``. ``q`` is clipped into
    ``[bias, 1 - bias]`` before the logarithm so ``log(0)`` is never taken;
    ``p`` is used as given.

    NOTE(review): the summed term is divided by ``p.shape[0]``. For a 1-D
    vector that is the number of entries rather than a batch size -- confirm
    this is intended.
    """
    target, predicted = ndarray_adapter(p, q)
    log_predicted = np.log(np.clip(predicted, bias, 1 - bias))
    weighted_sum = np.sum(target * log_predicted)
    return -1 * (weighted_sum / target.shape[0])
11
+
12
+
13
def kl_divergence(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
    """Return the Kullback-Leibler divergence ``KL(p || q)``.

    Both inputs are converted with ``ndarray_adapter`` and rescaled to sum
    to 1. Exact zeros in ``q`` are replaced by ``bias`` first so the ratio
    ``p / q`` stays finite.

    Entries where ``p`` is zero contribute 0 to the sum (the usual
    ``0 * log(0) = 0`` convention) instead of turning the result into NaN.

    Args:
        p: Reference distribution (non-negative weights).
        q: Approximating distribution (non-negative weights).

    Returns:
        The summed divergence as a NumPy scalar.
    """
    p, q = ndarray_adapter(p, q)
    q = np.where(q == 0, bias, q)
    p = p / np.sum(p)
    q = q / np.sum(q)
    nonzero = p != 0
    # Inside the log, swap zero entries of p for 1 so no log(0) is evaluated;
    # those positions are masked to 0 in the final sum anyway.
    ratio = np.where(nonzero, p, 1.0) / q
    return np.sum(np.where(nonzero, p * np.log(ratio), 0.0))
19
+
20
+
21
def js_divergence(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
    """Return the Jensen-Shannon divergence between ``p`` and ``q``.

    Both inputs are converted with ``ndarray_adapter`` and rescaled to sum to
    1 *before* the midpoint distribution ``m = (p + q) / 2`` is formed.
    Without that step, inputs with different totals would be weighted
    unequally in ``m``. Inputs that already sum to 1 are unaffected apart
    from floating-point rounding.

    Returns:
        ``(KL(p || m) + KL(q || m)) / 2`` computed with ``kl_divergence``.
    """
    p, q = ndarray_adapter(p, q)
    p = p / np.sum(p)
    q = q / np.sum(q)
    m = (p + q) / 2
    return (kl_divergence(p, m) + kl_divergence(q, m)) / 2
25
+
26
+
27
def hellinger_distance(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
    """Return the Hellinger distance between ``p`` and ``q``.

    Both inputs are converted with ``ndarray_adapter`` and rescaled to sum
    to 1. The result is the Euclidean norm of the difference of their
    element-wise square roots, divided by ``sqrt(2)``.
    """
    lhs, rhs = ndarray_adapter(p, q)
    lhs = lhs / np.sum(lhs)
    rhs = rhs / np.sum(rhs)
    root_gap = np.sqrt(lhs) - np.sqrt(rhs)
    return np.sqrt(np.sum(root_gap ** 2)) / np.sqrt(2)
@@ -0,0 +1,19 @@
1
+ from deeplotx.similarity import bias
2
+
3
+
4
def jaccard_similarity(set1: set, set2: set) -> float:
    """Return the smoothed Jaccard index ``|A ∩ B| / |A ∪ B|``.

    ``bias`` is added to numerator and denominator, so two empty sets give
    1.0 instead of a division by zero.
    """
    shared = len(set1.intersection(set2))
    combined = len(set1.union(set2))
    return (shared + bias) / (combined + bias)
6
+
7
+
8
def overlap_coefficient(set1: set, set2: set) -> float:
    """Return the smoothed overlap coefficient ``|A ∩ B| / min(|A|, |B|)``.

    ``bias`` is added to numerator and denominator, so the result is 1.0
    (not a division by zero) whenever either set is empty.
    """
    shared = len(set1.intersection(set2))
    smaller = min(len(set1), len(set2))
    return (shared + bias) / (smaller + bias)
10
+
11
+
12
def dice_coefficient(set1: set, set2: set) -> float:
    """Return the smoothed Dice coefficient ``2|A ∩ B| / (|A| + |B|)``.

    ``bias`` is added to numerator and denominator, so two empty sets give
    1.0 instead of a division by zero.
    """
    shared = len(set1.intersection(set2))
    total = len(set1) + len(set2)
    return (2 * shared + bias) / (total + bias)
14
+
15
+
16
def ochiai_similarity(set1: set, set2: set) -> float:
    """Return the smoothed Ochiai coefficient ``|A ∩ B| / sqrt(|A| * |B|)``.

    ``bias`` is added to numerator and denominator, so the result is 1.0
    (not a division by zero) whenever either set is empty.
    """
    shared = len(set1.intersection(set2))
    geometric = len(set1) ** 0.5 * len(set2) ** 0.5
    return (shared + bias) / (geometric + bias)
@@ -0,0 +1,36 @@
1
+ import numpy as np
2
+ import torch
3
+
4
+ from deeplotx.similarity import ndarray_adapter
5
+
6
+
7
def l2_normalize(x: torch.Tensor | np.ndarray) -> np.ndarray:
    """Scale ``x`` so that its overall L2 norm is 1.

    The input is converted with ``ndarray_adapter``; the norm is taken over
    all elements. An all-zero input has no direction, so it is returned as
    zeros rather than as NaNs from a division by zero.

    Args:
        x: Tensor or array to normalise.

    Returns:
        ``x`` divided by its L2 norm.
    """
    x = ndarray_adapter(x)
    norm = np.sqrt(np.sum(np.multiply(x, x)))
    # Dividing by 1 keeps a zero vector at zero instead of producing NaNs.
    return x / (1.0 if norm == 0 else norm)
10
+
11
+
12
def z_score_normalize(x: torch.Tensor | np.ndarray) -> np.ndarray:
    """Standardise ``x`` to zero mean and unit standard deviation.

    The input is converted with ``ndarray_adapter``; mean and standard
    deviation are taken over all elements. Constant input has a standard
    deviation of 0, so it is returned as zeros rather than as NaNs from a
    division by zero.

    Args:
        x: Tensor or array to standardise.

    Returns:
        ``(x - mean) / std``.
    """
    x = ndarray_adapter(x)
    mean = np.mean(x)
    std_dev = np.std(x)
    # Dividing by 1 keeps constant input at zero instead of producing NaNs.
    return (x - mean) / (1.0 if std_dev == 0 else std_dev)
17
+
18
+
19
def euclidean_similarity(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
    """Return the Euclidean (L2) distance between ``p`` and ``q``.

    Despite the name, the value is a distance: 0 for identical inputs and
    larger as they move apart. Both inputs are converted with
    ``ndarray_adapter`` first.
    """
    lhs, rhs = ndarray_adapter(p, q)
    delta = lhs - rhs
    return np.sqrt(np.sum(np.multiply(delta, delta)))
24
+
25
+
26
def cosine_similarity(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
    """Return ``1 - cos(p, q)``.

    Despite the name, the value is one minus the cosine of the angle, so it
    is 0 when the inputs point the same way. Both inputs are converted with
    ``ndarray_adapter`` first. If either input is all zeros the denominator
    is 0 and the result is not finite.
    """
    lhs, rhs = ndarray_adapter(p, q)
    dot = np.matmul(np.transpose(lhs), rhs)
    lhs_norm = np.sqrt(np.sum(np.multiply(lhs, lhs)))
    rhs_norm = np.sqrt(np.sum(np.multiply(rhs, rhs)))
    return 1 - (dot / (lhs_norm * rhs_norm))
32
+
33
+
34
def chebyshev_similarity(p: torch.Tensor | np.ndarray, q: torch.Tensor | np.ndarray) -> np.float32:
    """Return the Chebyshev (L-infinity) distance between ``p`` and ``q``.

    Despite the name, the value is a distance: the largest absolute
    element-wise difference. Both inputs are converted with
    ``ndarray_adapter`` first.
    """
    lhs, rhs = ndarray_adapter(p, q)
    gaps = np.abs(lhs - rhs)
    return np.max(gaps)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.2.20
3
+ Version: 0.3.1
4
4
  Summary: Easy-2-use long text classifier trainers.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -21,13 +21,19 @@ Dynamic: license-file
21
21
  - Install with pip
22
22
 
23
23
  ```
24
- pip install git+https://github.com/vortezwohl/DeepLoTX.git
24
+ pip install -U deeplotx
25
25
  ```
26
26
 
27
27
  - Install with uv
28
28
 
29
29
  ```
30
- uv add git+https://github.com/vortezwohl/DeepLoTX.git
30
+ uv add -U deeplotx
31
+ ```
32
+
33
+ - Install from github
34
+
35
+ ```
36
+ pip install -U git+https://github.com/vortezwohl/DeepLoTX.git
31
37
  ```
32
38
 
33
39
  ## Quick Start
@@ -10,11 +10,16 @@ deeplotx.egg-info/top_level.txt
10
10
  deeplotx/encoder/__init__.py
11
11
  deeplotx/encoder/bert_encoder.py
12
12
  deeplotx/encoder/long_text_encoder.py
13
+ deeplotx/encoder/longformer_encoder.py
13
14
  deeplotx/nn/__init__.py
14
15
  deeplotx/nn/base_neural_network.py
15
16
  deeplotx/nn/linear_regression.py
16
17
  deeplotx/nn/logistic_regression.py
17
18
  deeplotx/nn/softmax_regression.py
19
+ deeplotx/similarity/__init__.py
20
+ deeplotx/similarity/distribution.py
21
+ deeplotx/similarity/set.py
22
+ deeplotx/similarity/vector.py
18
23
  deeplotx/trainer/__init__.py
19
24
  deeplotx/trainer/base_trainer.py
20
25
  deeplotx/trainer/text_binary_classification_trainer.py
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "deeplotx"
3
- version = "0.2.20"
3
+ version = "0.3.1"
4
4
  description = "Easy-2-use long text classifier trainers."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
File without changes
File without changes