SinaTools 0.1.11__py2.py3-none-any.whl → 0.1.13__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/METADATA +2 -3
  2. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/RECORD +47 -26
  3. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/entry_points.txt +7 -3
  4. sinatools/CLI/DataDownload/download_files.py +0 -10
  5. sinatools/CLI/ner/corpus_entity_extractor.py +9 -6
  6. sinatools/CLI/ner/entity_extractor.py +18 -42
  7. sinatools/CLI/utils/arStrip.py +8 -8
  8. sinatools/CLI/utils/implication.py +0 -8
  9. sinatools/CLI/utils/jaccard.py +5 -14
  10. sinatools/CLI/utils/remove_latin.py +2 -2
  11. sinatools/CLI/utils/text_dublication_detector.py +25 -0
  12. sinatools/VERSION +1 -1
  13. sinatools/morphology/ALMA_multi_word.py +14 -16
  14. sinatools/morphology/__init__.py +32 -31
  15. sinatools/ner/__init__.py +28 -2
  16. sinatools/ner/data/__init__.py +1 -0
  17. sinatools/ner/data/datasets.py +146 -0
  18. sinatools/ner/data/transforms.py +118 -0
  19. sinatools/ner/data.py +124 -0
  20. sinatools/ner/data_format.py +124 -0
  21. sinatools/ner/datasets.py +146 -0
  22. sinatools/ner/entity_extractor.py +34 -54
  23. sinatools/ner/helpers.py +86 -0
  24. sinatools/ner/metrics.py +69 -0
  25. sinatools/ner/nn/BaseModel.py +22 -0
  26. sinatools/ner/nn/BertNestedTagger.py +34 -0
  27. sinatools/ner/nn/BertSeqTagger.py +17 -0
  28. sinatools/ner/nn/__init__.py +3 -0
  29. sinatools/ner/trainers/BaseTrainer.py +117 -0
  30. sinatools/ner/trainers/BertNestedTrainer.py +203 -0
  31. sinatools/ner/trainers/BertTrainer.py +163 -0
  32. sinatools/ner/trainers/__init__.py +3 -0
  33. sinatools/ner/transforms.py +119 -0
  34. sinatools/semantic_relatedness/__init__.py +20 -0
  35. sinatools/semantic_relatedness/compute_relatedness.py +31 -0
  36. sinatools/synonyms/__init__.py +18 -0
  37. sinatools/synonyms/synonyms_generator.py +192 -0
  38. sinatools/utils/text_dublication_detector.py +110 -0
  39. sinatools/wsd/__init__.py +11 -0
  40. sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
  41. sinatools/{salma → wsd}/wsd.py +1 -1
  42. sinatools/CLI/salma/salma_tools.py +0 -68
  43. sinatools/salma/__init__.py +0 -12
  44. sinatools/utils/utils.py +0 -2
  45. {SinaTools-0.1.11.data → SinaTools-0.1.13.data}/data/sinatools/environment.yml +0 -0
  46. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/AUTHORS.rst +0 -0
  47. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/LICENSE +0 -0
  48. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/WHEEL +0 -0
  49. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/top_level.txt +0 -0
  50. /sinatools/{salma → wsd}/settings.py +0 -0
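Beyond the renamed dist-info files, the listing shows a package reorganization: the salma package becomes wsd, and the NER code gains data, nn, and trainers subpackages alongside new synonyms and semantic_relatedness packages. A minimal import sketch of the new layout, assuming a SinaTools 0.1.13 install; only the module paths are taken from the listing above, no function names are implied:

    # Hypothetical import sketch based on the file paths listed above (SinaTools 0.1.13)
    from sinatools.wsd import disambiguator                 # formerly sinatools/salma/views.py
    from sinatools.ner.data import datasets, transforms     # new NER data subpackage
    from sinatools.synonyms import synonyms_generator       # added in 0.1.13
    from sinatools.utils import text_dublication_detector   # added in 0.1.13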
sinatools/ner/data/datasets.py ADDED
@@ -0,0 +1,146 @@
+ import logging
+ import torch
+ from torch.utils.data import Dataset
+ from torch.nn.utils.rnn import pad_sequence
+ from sinatools.ner.data.transforms import (
+     BertSeqTransform,
+     NestedTagsTransform
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class Token:
+     def __init__(self, text=None, pred_tag=None, gold_tag=None):
+         """
+         Token object to hold token attributes
+         :param text: str
+         :param pred_tag: str
+         :param gold_tag: str
+         """
+         self.text = text
+         self.gold_tag = gold_tag
+         self.pred_tag = pred_tag
+         self.subwords = None
+
+     @property
+     def subwords(self):
+         return self._subwords
+
+     @subwords.setter
+     def subwords(self, value):
+         self._subwords = value
+
+     def __str__(self):
+         """
+         Token text representation
+         :return: str
+         """
+         gold_tags = "|".join(self.gold_tag)
+         pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
+
+         if self.gold_tag:
+             r = f"{self.text}\t{gold_tags}\t{pred_tags}"
+         else:
+             r = f"{self.text}\t{pred_tags}"
+
+         return r
+
+
+ class DefaultDataset(Dataset):
+     def __init__(
+         self,
+         examples=None,
+         vocab=None,
+         bert_model="aubmindlab/bert-base-arabertv2",
+         max_seq_len=512,
+     ):
+         """
+         The dataset that is used to transform the segments into training data
+         :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                          You can generate examples from -- arabiner.data.dataset.parse_conll_files
+         :param vocab: vocab object containing indexed tags and tokens
+         :param bert_model: str - BERT model
+         :param max_seq_len: int - maximum sequence length
+         """
+         self.transform = BertSeqTransform(bert_model, vocab, max_seq_len=max_seq_len)
+         self.examples = examples
+         self.vocab = vocab
+
+     def __len__(self):
+         return len(self.examples)
+
+     def __getitem__(self, item):
+         subwords, tags, tokens, valid_len = self.transform(self.examples[item])
+         return subwords, tags, tokens, valid_len
+
+     def collate_fn(self, batch):
+         """
+         Collate function that is called when the batch is assembled by the trainer
+         :param batch: Dataloader batch
+         :return: Same output as the __getitem__ function
+         """
+         subwords, tags, tokens, valid_len = zip(*batch)
+
+         # Pad sequences in this batch
+         # subwords and tokens are padded with zeros
+         # tags are padded with the index of the O tag
+         subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+         tags = pad_sequence(
+             tags, batch_first=True, padding_value=self.vocab.tags[0].get_stoi()["O"]
+         )
+         return subwords, tags, tokens, valid_len
+
+
+ class NestedTagsDataset(Dataset):
+     def __init__(
+         self,
+         examples=None,
+         vocab=None,
+         bert_model="aubmindlab/bert-base-arabertv2",
+         max_seq_len=512,
+     ):
+         """
+         The dataset that is used to transform the segments into training data
+         :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                          You can generate examples from -- arabiner.data.dataset.parse_conll_files
+         :param vocab: vocab object containing indexed tags and tokens
+         :param bert_model: str - BERT model
+         :param max_seq_len: int - maximum sequence length
+         """
+         self.transform = NestedTagsTransform(
+             bert_model, vocab, max_seq_len=max_seq_len
+         )
+         self.examples = examples
+         self.vocab = vocab
+
+     def __len__(self):
+         return len(self.examples)
+
+     def __getitem__(self, item):
+         subwords, tags, tokens, masks, valid_len = self.transform(self.examples[item])
+         return subwords, tags, tokens, masks, valid_len
+
+     def collate_fn(self, batch):
+         """
+         Collate function that is called when the batch is assembled by the trainer
+         :param batch: Dataloader batch
+         :return: Same output as the __getitem__ function
+         """
+         subwords, tags, tokens, masks, valid_len = zip(*batch)
+
+         # Pad sequences in this batch
+         # subwords and tokens are padded with zeros
+         # tags are padded with the index of the O tag
+         subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+
+         masks = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), 0)(mask)
+                  for tag, mask in zip(tags, masks)]
+         masks = torch.cat(masks)
+
+         # Pad the tags, do the padding for each tag type
+         tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["<pad>"])(tag)
+                 for tag, vocab in zip(tags, self.vocab.tags[1:])]
+         tags = torch.cat(tags)
+
+         return subwords, tags, tokens, masks, valid_len
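For reference, a short sketch of how the Token class above renders; the word and tags are made up for illustration, and pred_tag is a list of dicts with a "tag" key, which is what __str__ expects:

    from sinatools.ner.data.datasets import Token

    token = Token(text="فلسطين", gold_tag=["B-GPE"], pred_tag=[{"tag": "B-GPE"}])
    print(token)  # prints the text, gold tags, and predicted tags separated by tabs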
sinatools/ner/data/transforms.py ADDED
@@ -0,0 +1,118 @@
+ import torch
+ from transformers import BertTokenizer
+ from functools import partial
+ import re
+ import itertools
+ from sinatools.ner.data import datasets
+ class BertSeqTransform:
+     def __init__(self, bert_model, vocab, max_seq_len=512):
+         self.tokenizer = BertTokenizer.from_pretrained(bert_model)
+         self.encoder = partial(
+             self.tokenizer.encode,
+             max_length=max_seq_len,
+             truncation=True,
+         )
+         self.max_seq_len = max_seq_len
+         self.vocab = vocab
+
+     def __call__(self, segment):
+         subwords, tags, tokens = list(), list(), list()
+         unk_token = datasets.Token(text="UNK")
+
+         for token in segment:
+             token_subwords = self.encoder(token.text)[1:-1]
+             subwords += token_subwords
+             tags += [self.vocab.tags[0].get_stoi()[token.gold_tag[0]]] + [self.vocab.tags[0].get_stoi()["O"]] * (len(token_subwords) - 1)
+             tokens += [token] + [unk_token] * (len(token_subwords) - 1)
+
+         # Truncate to max_seq_len
+         if len(subwords) > self.max_seq_len - 2:
+             text = " ".join([t.text for t in tokens if t.text != "UNK"])
+
+             subwords = subwords[:self.max_seq_len - 2]
+             tags = tags[:self.max_seq_len - 2]
+             tokens = tokens[:self.max_seq_len - 2]
+
+         subwords.insert(0, self.tokenizer.cls_token_id)
+         subwords.append(self.tokenizer.sep_token_id)
+
+         tags.insert(0, self.vocab.tags[0].get_stoi()["O"])
+         tags.append(self.vocab.tags[0].get_stoi()["O"])
+
+         tokens.insert(0, unk_token)
+         tokens.append(unk_token)
+
+         return torch.LongTensor(subwords), torch.LongTensor(tags), tokens, len(tokens)
+
+
+ class NestedTagsTransform:
+     def __init__(self, bert_model, vocab, max_seq_len=512):
+         self.tokenizer = BertTokenizer.from_pretrained(bert_model)
+         self.encoder = partial(
+             self.tokenizer.encode,
+             max_length=max_seq_len,
+             truncation=True,
+         )
+         self.max_seq_len = max_seq_len
+         self.vocab = vocab
+
+     def __call__(self, segment):
+         tags, tokens, subwords = list(), list(), list()
+         unk_token = datasets.Token(text="UNK")
+
+         # Encode each token and get its subwords and IDs
+         for token in segment:
+             token.subwords = self.encoder(token.text)[1:-1]
+             subwords += token.subwords
+             tokens += [token] + [unk_token] * (len(token.subwords) - 1)
+
+         # Construct the labels for each tag type
+         # The sequence will have a list of tags for each type
+         # The final tags for a sequence is a matrix NUM_TAG_TYPES x SEQ_LEN
+         # Example:
+         # [
+         #     [O, O, B-PERS, I-PERS, O, O, O]
+         #     [B-ORG, I-ORG, O, O, O, O, O]
+         #     [O, O, O, O, O, O, B-GPE]
+         # ]
+         for vocab in self.vocab.tags[1:]:
+             vocab_tags = "|".join([t for t in vocab.get_itos() if "-" in t])
+             r = re.compile(vocab_tags)
+
+             # This is really messy
+             # For a given token we find a matching tag_name, BUT we might find
+             # multiple matches (i.e. a token can be labeled B-ORG and I-ORG); in this
+             # case we take only the first tag, as we do not have overlapping tags of the same type
+             single_type_tags = [[(list(filter(r.match, token.gold_tag))
+                                   or ["O"])[0]] + ["O"] * (len(token.subwords) - 1)
+                                 for token in segment]
+             single_type_tags = list(itertools.chain(*single_type_tags))
+             tags.append([vocab.get_stoi()[tag] for tag in single_type_tags])
+
+         # Truncate to max_seq_len
+         if len(subwords) > self.max_seq_len - 2:
+             text = " ".join([t.text for t in tokens if t.text != "UNK"])
+
+             subwords = subwords[:self.max_seq_len - 2]
+             tags = [t[:self.max_seq_len - 2] for t in tags]
+             tokens = tokens[:self.max_seq_len - 2]
+
+         # Add dummy tokens at the start and end of the sequence
+         tokens.insert(0, unk_token)
+         tokens.append(unk_token)
+
+         # Add CLS and SEP at the start and end of subwords
+         subwords.insert(0, self.tokenizer.cls_token_id)
+         subwords.append(self.tokenizer.sep_token_id)
+         subwords = torch.LongTensor(subwords)
+
+         # Add "O" tags for the first and last subwords
+         tags = torch.Tensor(tags)
+         tags = torch.column_stack((
+             torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
+             tags,
+             torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
+         )).unsqueeze(0)
+
+         mask = torch.ones_like(tags)
+         return subwords, tags, tokens, mask, len(tokens)
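The core behaviour of BertSeqTransform above is the tag/subword alignment: a token keeps its gold tag only on its first subword, and every additional subword produced by the WordPiece tokenizer is labelled "O" (with an UNK placeholder in the token list). A minimal standalone sketch of that alignment rule, using made-up subword splits so no model download is needed:

    # Hypothetical (token, subwords, gold_tag) triples; the real transform gets subwords
    # from BertTokenizer.encode and maps tags to indices via vocab.tags[0].get_stoi().
    segment = [("القدس", ["الق", "##دس"], "B-GPE"), ("عاصمة", ["عاصمة"], "O")]

    tags = []
    for _, subwords, gold_tag in segment:
        # the first subword carries the gold tag, the remaining subwords get "O"
        tags += [gold_tag] + ["O"] * (len(subwords) - 1)

    print(tags)  # ['B-GPE', 'O', 'O']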
sinatools/ner/data.py ADDED
@@ -0,0 +1,124 @@
+ from torch.utils.data import DataLoader
+ from torchtext.vocab import vocab
+ from collections import Counter, namedtuple
+ import logging
+ import re
+ import itertools
+ from sinatools.ner.helpers import load_object
+ from sinatools.ner.datasets import Token
+ from sinatools.utils.tokenizers_words import simple_word_tokenize
+
+ logger = logging.getLogger(__name__)
+
+
+ def conll_to_segments(filename):
+     """
+     Convert CoNLL files to segments. This returns a list of segments and each segment is
+     a list of tuples (token, tag)
+     :param filename: Path
+     :return: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+     """
+     segments, segment = list(), list()
+
+     with open(filename, "r") as fh:
+         for token in fh.read().splitlines():
+             if not token.strip():
+                 segments.append(segment)
+                 segment = list()
+             else:
+                 parts = token.split()
+                 token = Token(text=parts[0], gold_tag=parts[1:])
+                 segment.append(token)
+
+         segments.append(segment)
+
+     return segments
+
+
+ def parse_conll_files(data_paths):
+     """
+     Parse CoNLL formatted files and return list of segments for each file and index
+     the vocabs and tags across all data_paths
+     :param data_paths: tuple(Path) - tuple of filenames
+     :return: tuple( [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i]
+                     [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i+1],
+                     ...
+                   )
+              List of segments for each dataset and each segment has list of (tokens, tags)
+     """
+     vocabs = namedtuple("Vocab", ["tags", "tokens"])
+     datasets, tags, tokens = list(), list(), list()
+
+     for data_path in data_paths:
+         dataset = conll_to_segments(data_path)
+         datasets.append(dataset)
+         tokens += [token.text for segment in dataset for token in segment]
+         tags += [token.gold_tag for segment in dataset for token in segment]
+
+     # Flatten list of tags
+     tags = list(itertools.chain(*tags))
+
+     # Generate vocabs for tags and tokens
+     tag_vocabs = tag_vocab_by_type(tags)
+     tag_vocabs.insert(0, vocab(Counter(tags)))
+     vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
+     return tuple(datasets), vocabs
+
+
+ def tag_vocab_by_type(tags):
+     vocabs = list()
+     c = Counter(tags)
+     tag_names = c.keys()
+     tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
+
+     for tag_type in tag_types:
+         r = re.compile(".*-" + tag_type)
+         t = list(filter(r.match, tags)) + ["O"]
+         vocabs.append(vocab(Counter(t), specials=["<pad>"]))
+
+     return vocabs
+
+
+ def text2segments(text):
+     """
+     Convert text to a dataset and index the tokens
+     """
+     # dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
+     list_of_tokens = simple_word_tokenize(text)
+     dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
+     tokens = [token.text for segment in dataset for token in segment]
+
+     # Generate vocabs for the tokens
+     segment_vocab = vocab(Counter(tokens), specials=["UNK"])
+     return dataset, segment_vocab
+
+
+ def get_dataloaders(
+     datasets, vocab, data_config, batch_size=32, num_workers=0, shuffle=(True, False, False)
+ ):
+     """
+     From the datasets generate the dataloaders
+     :param datasets: list - list of the datasets, list of list of segments and tokens
+     :param batch_size: int
+     :param num_workers: int
+     :param shuffle: boolean - to shuffle the data or not
+     :return: List[torch.utils.data.DataLoader]
+     """
+     dataloaders = list()
+
+     for i, examples in enumerate(datasets):
+         data_config["kwargs"].update({"examples": examples, "vocab": vocab})
+         dataset = load_object("sinatools." + data_config["fn"], data_config["kwargs"])
+
+         dataloader = DataLoader(
+             dataset=dataset,
+             shuffle=shuffle[i],
+             batch_size=batch_size,
+             num_workers=num_workers,
+             collate_fn=dataset.collate_fn,
+         )
+
+         logger.info("%s batches found", len(dataloader))
+         dataloaders.append(dataloader)
+
+     return dataloaders
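As a usage sketch of the helpers above: text2segments wraps plain text into a single segment of "O"-tagged tokens, while parse_conll_files plus get_dataloaders builds training dataloaders. The file names and the data_config values below are hypothetical; the "fn"/"kwargs" layout simply follows how get_dataloaders reads data_config:

    from sinatools.ner.data import parse_conll_files, text2segments, get_dataloaders

    # Inference-style input: plain text becomes one segment of "O"-tagged Token objects
    dataset, token_vocab = text2segments("ولد محمود درويش في البروة")

    # Training-style input: CoNLL files (hypothetical paths) plus a dataset config
    datasets, vocabs = parse_conll_files(("train.txt", "val.txt"))
    data_config = {
        "fn": "ner.data.datasets.DefaultDataset",  # resolved as load_object("sinatools." + fn, kwargs)
        "kwargs": {"bert_model": "aubmindlab/bert-base-arabertv2", "max_seq_len": 512},
    }
    train_loader, val_loader = get_dataloaders(datasets, vocabs, data_config, batch_size=32)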
sinatools/ner/data_format.py ADDED
@@ -0,0 +1,124 @@
+ from torch.utils.data import DataLoader
+ from torchtext.vocab import vocab
+ from collections import Counter, namedtuple
+ import logging
+ import re
+ import itertools
+ from sinatools.ner.helpers import load_object
+ from sinatools.ner.datasets import Token
+ from sinatools.utils.tokenizers_words import simple_word_tokenize
+
+ logger = logging.getLogger(__name__)
+
+
+ def conll_to_segments(filename):
+     """
+     Convert CoNLL files to segments. This returns a list of segments and each segment is
+     a list of tuples (token, tag)
+     :param filename: Path
+     :return: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+     """
+     segments, segment = list(), list()
+
+     with open(filename, "r") as fh:
+         for token in fh.read().splitlines():
+             if not token.strip():
+                 segments.append(segment)
+                 segment = list()
+             else:
+                 parts = token.split()
+                 token = Token(text=parts[0], gold_tag=parts[1:])
+                 segment.append(token)
+
+         segments.append(segment)
+
+     return segments
+
+
+ def parse_conll_files(data_paths):
+     """
+     Parse CoNLL formatted files and return list of segments for each file and index
+     the vocabs and tags across all data_paths
+     :param data_paths: tuple(Path) - tuple of filenames
+     :return: tuple( [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i]
+                     [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i+1],
+                     ...
+                   )
+              List of segments for each dataset and each segment has list of (tokens, tags)
+     """
+     vocabs = namedtuple("Vocab", ["tags", "tokens"])
+     datasets, tags, tokens = list(), list(), list()
+
+     for data_path in data_paths:
+         dataset = conll_to_segments(data_path)
+         datasets.append(dataset)
+         tokens += [token.text for segment in dataset for token in segment]
+         tags += [token.gold_tag for segment in dataset for token in segment]
+
+     # Flatten list of tags
+     tags = list(itertools.chain(*tags))
+
+     # Generate vocabs for tags and tokens
+     tag_vocabs = tag_vocab_by_type(tags)
+     tag_vocabs.insert(0, vocab(Counter(tags)))
+     vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
+     return tuple(datasets), vocabs
+
+
+ def tag_vocab_by_type(tags):
+     vocabs = list()
+     c = Counter(tags)
+     tag_names = c.keys()
+     tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
+
+     for tag_type in tag_types:
+         r = re.compile(".*-" + tag_type)
+         t = list(filter(r.match, tags)) + ["O"]
+         vocabs.append(vocab(Counter(t), specials=["<pad>"]))
+
+     return vocabs
+
+
+ def text2segments(text):
+     """
+     Convert text to a dataset and index the tokens
+     """
+     # dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
+     list_of_tokens = simple_word_tokenize(text)
+     dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
+     tokens = [token.text for segment in dataset for token in segment]
+
+     # Generate vocabs for the tokens
+     segment_vocab = vocab(Counter(tokens), specials=["UNK"])
+     return dataset, segment_vocab
+
+
+ def get_dataloaders(
+     datasets, vocab, data_config, batch_size=32, num_workers=0, shuffle=(True, False, False)
+ ):
+     """
+     From the datasets generate the dataloaders
+     :param datasets: list - list of the datasets, list of list of segments and tokens
+     :param batch_size: int
+     :param num_workers: int
+     :param shuffle: boolean - to shuffle the data or not
+     :return: List[torch.utils.data.DataLoader]
+     """
+     dataloaders = list()
+
+     for i, examples in enumerate(datasets):
+         data_config["kwargs"].update({"examples": examples, "vocab": vocab})
+         dataset = load_object(data_config["fn"], data_config["kwargs"])
+
+         dataloader = DataLoader(
+             dataset=dataset,
+             shuffle=shuffle[i],
+             batch_size=batch_size,
+             num_workers=num_workers,
+             collate_fn=dataset.collate_fn,
+         )
+
+         logger.info("%s batches found", len(dataloader))
+         dataloaders.append(dataloader)
+
+     return dataloaders
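conll_to_segments above expects one token per line, with the token text in the first column and one or more gold tags in the remaining columns, and a blank line separating segments. A small sketch of that input layout; the file name and tokens are illustrative only:

    from sinatools.ner.data_format import conll_to_segments

    # "example.conll" is a hypothetical file laid out as "token tag [tag ...]", e.g.:
    #   وزارة     B-ORG
    #   الخارجية  I-ORG
    #   في        O
    #
    #   (blank line, then the next segment)
    segments = conll_to_segments("example.conll")
    print(len(segments), [t.text for t in segments[0]])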
sinatools/ner/datasets.py ADDED
@@ -0,0 +1,146 @@
+ import logging
+ import torch
+ from torch.utils.data import Dataset
+ from torch.nn.utils.rnn import pad_sequence
+ from sinatools.ner.transforms import (
+     BertSeqTransform,
+     NestedTagsTransform
+ )
+
+ logger = logging.getLogger(__name__)
+
+
+ class Token:
+     def __init__(self, text=None, pred_tag=None, gold_tag=None):
+         """
+         Token object to hold token attributes
+         :param text: str
+         :param pred_tag: str
+         :param gold_tag: str
+         """
+         self.text = text
+         self.gold_tag = gold_tag
+         self.pred_tag = pred_tag
+         self.subwords = None
+
+     @property
+     def subwords(self):
+         return self._subwords
+
+     @subwords.setter
+     def subwords(self, value):
+         self._subwords = value
+
+     def __str__(self):
+         """
+         Token text representation
+         :return: str
+         """
+         gold_tags = "|".join(self.gold_tag)
+         pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
+
+         if self.gold_tag:
+             r = f"{self.text}\t{gold_tags}\t{pred_tags}"
+         else:
+             r = f"{self.text}\t{pred_tags}"
+
+         return r
+
+
+ class DefaultDataset(Dataset):
+     def __init__(
+         self,
+         examples=None,
+         vocab=None,
+         bert_model="aubmindlab/bert-base-arabertv2",
+         max_seq_len=512,
+     ):
+         """
+         The dataset that is used to transform the segments into training data
+         :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                          You can generate examples from -- arabiner.data.dataset.parse_conll_files
+         :param vocab: vocab object containing indexed tags and tokens
+         :param bert_model: str - BERT model
+         :param max_seq_len: int - maximum sequence length
+         """
+         self.transform = BertSeqTransform(bert_model, vocab, max_seq_len=max_seq_len)
+         self.examples = examples
+         self.vocab = vocab
+
+     def __len__(self):
+         return len(self.examples)
+
+     def __getitem__(self, item):
+         subwords, tags, tokens, valid_len = self.transform(self.examples[item])
+         return subwords, tags, tokens, valid_len
+
+     def collate_fn(self, batch):
+         """
+         Collate function that is called when the batch is assembled by the trainer
+         :param batch: Dataloader batch
+         :return: Same output as the __getitem__ function
+         """
+         subwords, tags, tokens, valid_len = zip(*batch)
+
+         # Pad sequences in this batch
+         # subwords and tokens are padded with zeros
+         # tags are padded with the index of the O tag
+         subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+         tags = pad_sequence(
+             tags, batch_first=True, padding_value=self.vocab.tags[0].get_stoi()["O"]
+         )
+         return subwords, tags, tokens, valid_len
+
+
+ class NestedTagsDataset(Dataset):
+     def __init__(
+         self,
+         examples=None,
+         vocab=None,
+         bert_model="aubmindlab/bert-base-arabertv2",
+         max_seq_len=512,
+     ):
+         """
+         The dataset that is used to transform the segments into training data
+         :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                          You can generate examples from -- arabiner.data.dataset.parse_conll_files
+         :param vocab: vocab object containing indexed tags and tokens
+         :param bert_model: str - BERT model
+         :param max_seq_len: int - maximum sequence length
+         """
+         self.transform = NestedTagsTransform(
+             bert_model, vocab, max_seq_len=max_seq_len
+         )
+         self.examples = examples
+         self.vocab = vocab
+
+     def __len__(self):
+         return len(self.examples)
+
+     def __getitem__(self, item):
+         subwords, tags, tokens, masks, valid_len = self.transform(self.examples[item])
+         return subwords, tags, tokens, masks, valid_len
+
+     def collate_fn(self, batch):
+         """
+         Collate function that is called when the batch is assembled by the trainer
+         :param batch: Dataloader batch
+         :return: Same output as the __getitem__ function
+         """
+         subwords, tags, tokens, masks, valid_len = zip(*batch)
+
+         # Pad sequences in this batch
+         # subwords and tokens are padded with zeros
+         # tags are padded with the index of the O tag
+         subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+
+         masks = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), 0)(mask)
+                  for tag, mask in zip(tags, masks)]
+         masks = torch.cat(masks)
+
+         # Pad the tags, do the padding for each tag type
+         tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["<pad>"])(tag)
+                 for tag, vocab in zip(tags, self.vocab.tags[1:])]
+         tags = torch.cat(tags)
+
+         return subwords, tags, tokens, masks, valid_len
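The nested collate_fn above right-pads each per-type tag row out to the padded subword length using that type's <pad> index, and pads the corresponding masks with zeros. A small standalone sketch of the same ConstantPad1d call; the sizes and the pad index 0 are made up for illustration and stand in for vocab.get_stoi()["<pad>"]:

    import torch

    subword_len = 10                       # batch width after pad_sequence
    tag_row = torch.randint(1, 5, (1, 7))  # one tag type, 7 subword positions before padding
    pad_index = 0                          # stands in for vocab.get_stoi()["<pad>"]

    padded = torch.nn.ConstantPad1d((0, subword_len - tag_row.shape[-1]), pad_index)(tag_row)
    print(padded.shape)  # torch.Size([1, 10]); the last 3 positions hold pad_index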