deeplotx 0.8.8__tar.gz → 0.9.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. {deeplotx-0.8.8 → deeplotx-0.9.0}/PKG-INFO +3 -1
  2. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/__init__.py +3 -0
  3. deeplotx-0.9.0/deeplotx/ner/__init__.py +3 -0
  4. deeplotx-0.9.0/deeplotx/ner/base_ner.py +7 -0
  5. deeplotx-0.9.0/deeplotx/ner/bert_ner.py +72 -0
  6. deeplotx-0.9.0/deeplotx/ner/named_entity.py +8 -0
  7. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx.egg-info/PKG-INFO +3 -1
  8. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx.egg-info/SOURCES.txt +4 -0
  9. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx.egg-info/requires.txt +2 -0
  10. {deeplotx-0.8.8 → deeplotx-0.9.0}/pyproject.toml +3 -1
  11. {deeplotx-0.8.8 → deeplotx-0.9.0}/LICENSE +0 -0
  12. {deeplotx-0.8.8 → deeplotx-0.9.0}/README.md +0 -0
  13. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/encoder/__init__.py +0 -0
  14. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/encoder/encoder.py +0 -0
  15. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/encoder/long_text_encoder.py +0 -0
  16. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/encoder/longformer_encoder.py +0 -0
  17. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/__init__.py +0 -0
  18. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/attention.py +0 -0
  19. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/auto_regression.py +0 -0
  20. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/base_neural_network.py +0 -0
  21. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/feed_forward.py +0 -0
  22. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/linear_regression.py +0 -0
  23. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/logistic_regression.py +0 -0
  24. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/long_context_auto_regression.py +0 -0
  25. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/long_context_recursive_sequential.py +0 -0
  26. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/multi_head_attention.py +0 -0
  27. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/multi_head_feed_forward.py +0 -0
  28. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/recursive_sequential.py +0 -0
  29. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/roformer_encoder.py +0 -0
  30. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/rope.py +0 -0
  31. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/nn/softmax_regression.py +0 -0
  32. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/similarity/__init__.py +0 -0
  33. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/similarity/distribution.py +0 -0
  34. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/similarity/set.py +0 -0
  35. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/similarity/vector.py +0 -0
  36. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/trainer/__init__.py +0 -0
  37. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/trainer/base_trainer.py +0 -0
  38. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/trainer/text_binary_classification_trainer.py +0 -0
  39. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/util/__init__.py +0 -0
  40. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/util/hash.py +0 -0
  41. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx/util/read_file.py +0 -0
  42. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx.egg-info/dependency_links.txt +0 -0
  43. {deeplotx-0.8.8 → deeplotx-0.9.0}/deeplotx.egg-info/top_level.txt +0 -0
  44. {deeplotx-0.8.8 → deeplotx-0.9.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.8.8
3
+ Version: 0.9.0
4
4
  Summary: Easy-2-use long text NLP toolkit.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -10,11 +10,13 @@ Requires-Dist: jupyter
10
10
  Requires-Dist: numpy
11
11
  Requires-Dist: protobuf
12
12
  Requires-Dist: python-dotenv
13
+ Requires-Dist: sentencepiece
13
14
  Requires-Dist: tiktoken
14
15
  Requires-Dist: torch
15
16
  Requires-Dist: transformers
16
17
  Requires-Dist: typing-extensions
17
18
  Requires-Dist: vortezwohl>=0.0.8
19
+ Requires-Dist: name2gender>=0.0.4a0
18
20
  Dynamic: license-file
19
21
 
20
22
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -4,6 +4,7 @@ import os
4
4
  __ROOT__ = os.path.dirname(os.path.abspath(__file__))
5
5
 
6
6
  from .encoder import Encoder, LongTextEncoder, LongformerEncoder
7
+ from .ner import BertNER, NamedEntity
7
8
  from .nn import (
8
9
  FeedForward,
9
10
  MultiHeadFeedForward,
@@ -40,3 +41,5 @@ logger = logging.getLogger('deeplotx.trainer')
40
41
  logger.setLevel(logging.DEBUG)
41
42
  logger = logging.getLogger('deeplotx.embedding')
42
43
  logger.setLevel(logging.DEBUG)
44
+ logger = logging.getLogger('deeplotx.ner')
45
+ logger.setLevel(logging.DEBUG)
@@ -0,0 +1,3 @@
1
+ from .named_entity import NamedEntity
2
+ from .base_ner import BaseNER
3
+ from .bert_ner import BertNER
@@ -0,0 +1,7 @@
1
+ from deeplotx.ner.named_entity import NamedEntity
2
+
3
+
4
+ class BaseNER:
5
+ def __init__(self): ...
6
+
7
+ def extract_entities(self, s: str, *args, **kwargs) -> list[NamedEntity]: ...
@@ -0,0 +1,72 @@
1
+ import logging
2
+ import os
3
+ from requests.exceptions import ConnectTimeout, SSLError
4
+
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
7
+
8
+ from deeplotx import __ROOT__
9
+ from deeplotx.ner.base_ner import BaseNER
10
+ from deeplotx.ner.named_entity import NamedEntity
11
+
12
+ CACHE_PATH = os.path.join(__ROOT__, '.cache')
13
+ DEFAULT_BERT_NER = 'Davlan/xlm-roberta-base-ner-hrl'
14
+ logger = logging.getLogger('deeplotx.ner')
15
+
16
+
17
+ class BertNER(BaseNER):
18
+ def __init__(self, model_name_or_path: str = DEFAULT_BERT_NER, device: str | None = None):
19
+ super().__init__()
20
+ self.device = torch.device(device) if device is not None \
21
+ else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
22
+ try:
23
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
24
+ cache_dir=CACHE_PATH, _from_auto=True,
25
+ trust_remote_code=True)
26
+ self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
27
+ cache_dir=CACHE_PATH, _from_auto=True,
28
+ trust_remote_code=True).to(self.device)
29
+ except ConnectTimeout:
30
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
31
+ cache_dir=CACHE_PATH, _from_auto=True,
32
+ trust_remote_code=True, local_files_only=True)
33
+ self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
34
+ cache_dir=CACHE_PATH, _from_auto=True,
35
+ trust_remote_code=True, local_files_only=True).to(self.device)
36
+ except SSLError:
37
+ self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
38
+ cache_dir=CACHE_PATH, _from_auto=True,
39
+ trust_remote_code=True, local_files_only=True)
40
+ self.encoder = AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path,
41
+ cache_dir=CACHE_PATH, _from_auto=True,
42
+ trust_remote_code=True, local_files_only=True).to(self.device)
43
+ self.embed_dim = self.encoder.config.max_position_embeddings
44
+ self._ner_pipeline = pipeline(task='ner', model=self.encoder, tokenizer=self.tokenizer, trust_remote_code=True)
45
+ logger.debug(f'{BaseNER.__name__} initialized on device: {self.device}.')
46
+
47
+ def extract_entities(self, s: str, prob_threshold: float = .0, *args, **kwargs) -> list[NamedEntity]:
48
+ assert prob_threshold <= 1., f'prob_threshold ({prob_threshold}) cannot be larger than 1.'
49
+ s = ' ' + s
50
+ raw_entities = self._ner_pipeline(s)
51
+ entities = []
52
+ for ent in raw_entities:
53
+ entities.append([s[ent['start']: ent['end']], ent['entity'], ent['score'].item()])
54
+ while True:
55
+ for i, ent in enumerate(entities):
56
+ if len(ent[0].strip()) < 1:
57
+ del entities[i]
58
+ if ent[1].upper().startswith('I') and entities[i - 1][1].upper().startswith('B'):
59
+ entities[i - 1][0] += ent[0]
60
+ entities[i - 1][2] *= ent[2]
61
+ del entities[i]
62
+ _continue = False
63
+ for ent in entities:
64
+ if ent[1].upper().startswith('I'):
65
+ _continue = True
66
+ if not _continue:
67
+ break
68
+ for ent in entities:
69
+ ent[0] = ent[0].strip()
70
+ if ent[1].upper().startswith('B'):
71
+ ent[1] = ent[1].upper()[1:].strip('-')
72
+ return [NamedEntity(*_) for _ in entities if _[2] >= prob_threshold]
@@ -0,0 +1,8 @@
1
+ from dataclasses import dataclass
2
+
3
+
4
+ @dataclass
5
+ class NamedEntity:
6
+ text: str
7
+ type: str
8
+ probability: float
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: deeplotx
3
- Version: 0.8.8
3
+ Version: 0.9.0
4
4
  Summary: Easy-2-use long text NLP toolkit.
5
5
  Requires-Python: >=3.10
6
6
  Description-Content-Type: text/markdown
@@ -10,11 +10,13 @@ Requires-Dist: jupyter
10
10
  Requires-Dist: numpy
11
11
  Requires-Dist: protobuf
12
12
  Requires-Dist: python-dotenv
13
+ Requires-Dist: sentencepiece
13
14
  Requires-Dist: tiktoken
14
15
  Requires-Dist: torch
15
16
  Requires-Dist: transformers
16
17
  Requires-Dist: typing-extensions
17
18
  Requires-Dist: vortezwohl>=0.0.8
19
+ Requires-Dist: name2gender>=0.0.4a0
18
20
  Dynamic: license-file
19
21
 
20
22
  [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/vortezwohl/DeepLoTX)
@@ -11,6 +11,10 @@ deeplotx/encoder/__init__.py
11
11
  deeplotx/encoder/encoder.py
12
12
  deeplotx/encoder/long_text_encoder.py
13
13
  deeplotx/encoder/longformer_encoder.py
14
+ deeplotx/ner/__init__.py
15
+ deeplotx/ner/base_ner.py
16
+ deeplotx/ner/bert_ner.py
17
+ deeplotx/ner/named_entity.py
14
18
  deeplotx/nn/__init__.py
15
19
  deeplotx/nn/attention.py
16
20
  deeplotx/nn/auto_regression.py
@@ -3,8 +3,10 @@ jupyter
3
3
  numpy
4
4
  protobuf
5
5
  python-dotenv
6
+ sentencepiece
6
7
  tiktoken
7
8
  torch
8
9
  transformers
9
10
  typing-extensions
10
11
  vortezwohl>=0.0.8
12
+ name2gender>=0.0.4a0
@@ -1,6 +1,6 @@
1
1
  [project]
2
2
  name = "deeplotx"
3
- version = "0.8.8"
3
+ version = "0.9.0"
4
4
  description = "Easy-2-use long text NLP toolkit."
5
5
  readme = "README.md"
6
6
  requires-python = ">=3.10"
@@ -10,9 +10,11 @@ dependencies = [
10
10
  "numpy",
11
11
  "protobuf",
12
12
  "python-dotenv",
13
+ "sentencepiece",
13
14
  "tiktoken",
14
15
  "torch",
15
16
  "transformers",
16
17
  "typing-extensions",
17
18
  "vortezwohl>=0.0.8",
19
+ "name2gender>=0.0.4a0",
18
20
  ]
File without changes
File without changes
File without changes
File without changes
File without changes