SinaTools 0.1.11__py2.py3-none-any.whl → 0.1.12__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/METADATA +2 -3
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/RECORD +47 -26
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/entry_points.txt +7 -3
- sinatools/CLI/DataDownload/download_files.py +0 -10
- sinatools/CLI/ner/corpus_entity_extractor.py +6 -6
- sinatools/CLI/ner/entity_extractor.py +18 -42
- sinatools/CLI/utils/arStrip.py +8 -8
- sinatools/CLI/utils/implication.py +0 -8
- sinatools/CLI/utils/jaccard.py +5 -14
- sinatools/CLI/utils/remove_latin.py +2 -2
- sinatools/CLI/utils/text_dublication_detector.py +25 -0
- sinatools/VERSION +1 -1
- sinatools/morphology/ALMA_multi_word.py +14 -16
- sinatools/morphology/__init__.py +32 -31
- sinatools/ner/__init__.py +28 -2
- sinatools/ner/data/__init__.py +1 -0
- sinatools/ner/data/datasets.py +146 -0
- sinatools/ner/data/transforms.py +118 -0
- sinatools/ner/data.py +124 -0
- sinatools/ner/data_format.py +124 -0
- sinatools/ner/datasets.py +146 -0
- sinatools/ner/entity_extractor.py +34 -54
- sinatools/ner/helpers.py +86 -0
- sinatools/ner/metrics.py +69 -0
- sinatools/ner/nn/BaseModel.py +22 -0
- sinatools/ner/nn/BertNestedTagger.py +34 -0
- sinatools/ner/nn/BertSeqTagger.py +17 -0
- sinatools/ner/nn/__init__.py +3 -0
- sinatools/ner/trainers/BaseTrainer.py +117 -0
- sinatools/ner/trainers/BertNestedTrainer.py +203 -0
- sinatools/ner/trainers/BertTrainer.py +163 -0
- sinatools/ner/trainers/__init__.py +3 -0
- sinatools/ner/transforms.py +119 -0
- sinatools/semantic_relatedness/__init__.py +20 -0
- sinatools/semantic_relatedness/compute_relatedness.py +31 -0
- sinatools/synonyms/__init__.py +18 -0
- sinatools/synonyms/synonyms_generator.py +192 -0
- sinatools/utils/text_dublication_detector.py +110 -0
- sinatools/wsd/__init__.py +11 -0
- sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
- sinatools/{salma → wsd}/wsd.py +1 -1
- sinatools/CLI/salma/salma_tools.py +0 -68
- sinatools/salma/__init__.py +0 -12
- sinatools/utils/utils.py +0 -2
- {SinaTools-0.1.11.data → SinaTools-0.1.12.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/top_level.txt +0 -0
- /sinatools/{salma → wsd}/settings.py +0 -0
sinatools/ner/data/datasets.py
ADDED
@@ -0,0 +1,146 @@
+import logging
+import torch
+from torch.utils.data import Dataset
+from torch.nn.utils.rnn import pad_sequence
+from sinatools.ner.data.transforms import (
+    BertSeqTransform,
+    NestedTagsTransform
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Token:
+    def __init__(self, text=None, pred_tag=None, gold_tag=None):
+        """
+        Token object to hold token attributes
+        :param text: str
+        :param pred_tag: str
+        :param gold_tag: str
+        """
+        self.text = text
+        self.gold_tag = gold_tag
+        self.pred_tag = pred_tag
+        self.subwords = None
+
+    @property
+    def subwords(self):
+        return self._subwords
+
+    @subwords.setter
+    def subwords(self, value):
+        self._subwords = value
+
+    def __str__(self):
+        """
+        Token text representation
+        :return: str
+        """
+        gold_tags = "|".join(self.gold_tag)
+        pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
+
+        if self.gold_tag:
+            r = f"{self.text}\t{gold_tags}\t{pred_tags}"
+        else:
+            r = f"{self.text}\t{pred_tags}"
+
+        return r
+
+
+class DefaultDataset(Dataset):
+    def __init__(
+        self,
+        examples=None,
+        vocab=None,
+        bert_model="aubmindlab/bert-base-arabertv2",
+        max_seq_len=512,
+    ):
+        """
+        The dataset that used to transform the segments into training data
+        :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                         You can get generate examples from -- arabiner.data.dataset.parse_conll_files
+        :param vocab: vocab object containing indexed tags and tokens
+        :param bert_model: str - BERT model
+        :param: int - maximum sequence length
+        """
+        self.transform = BertSeqTransform(bert_model, vocab, max_seq_len=max_seq_len)
+        self.examples = examples
+        self.vocab = vocab
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, item):
+        subwords, tags, tokens, valid_len = self.transform(self.examples[item])
+        return subwords, tags, tokens, valid_len
+
+    def collate_fn(self, batch):
+        """
+        Collate function that is called when the batch is called by the trainer
+        :param batch: Dataloader batch
+        :return: Same output as the __getitem__ function
+        """
+        subwords, tags, tokens, valid_len = zip(*batch)
+
+        # Pad sequences in this batch
+        # subwords and tokens are padded with zeros
+        # tags are padding with the index of the O tag
+        subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+        tags = pad_sequence(
+            tags, batch_first=True, padding_value=self.vocab.tags[0].get_stoi()["O"]
+        )
+        return subwords, tags, tokens, valid_len
+
+
+class NestedTagsDataset(Dataset):
+    def __init__(
+        self,
+        examples=None,
+        vocab=None,
+        bert_model="aubmindlab/bert-base-arabertv2",
+        max_seq_len=512,
+    ):
+        """
+        The dataset that used to transform the segments into training data
+        :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                         You can get generate examples from -- arabiner.data.dataset.parse_conll_files
+        :param vocab: vocab object containing indexed tags and tokens
+        :param bert_model: str - BERT model
+        :param: int - maximum sequence length
+        """
+        self.transform = NestedTagsTransform(
+            bert_model, vocab, max_seq_len=max_seq_len
+        )
+        self.examples = examples
+        self.vocab = vocab
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, item):
+        subwords, tags, tokens, masks, valid_len = self.transform(self.examples[item])
+        return subwords, tags, tokens, masks, valid_len
+
+    def collate_fn(self, batch):
+        """
+        Collate function that is called when the batch is called by the trainer
+        :param batch: Dataloader batch
+        :return: Same output as the __getitem__ function
+        """
+        subwords, tags, tokens, masks, valid_len = zip(*batch)
+
+        # Pad sequences in this batch
+        # subwords and tokens are padded with zeros
+        # tags are padding with the index of the O tag
+        subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+
+        masks = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), 0)(mask)
+                 for tag, mask in zip(tags, masks)]
+        masks = torch.cat(masks)
+
+        # Pad the tags, do the padding for each tag type
+        tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["<pad>"])(tag)
+                for tag, vocab in zip(tags, self.vocab.tags[1:])]
+        tags = torch.cat(tags)
+
+        return subwords, tags, tokens, masks, valid_len
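
Note (not part of the diff): a minimal sketch of how the new `DefaultDataset` and its `collate_fn` can be exercised on their own. The toy tag vocabulary, the sample sentence, and the batch size below are illustrative assumptions; in the package the vocab and examples normally come from `parse_conll_files` or from a pretrained checkpoint's saved vocab.

```python
from collections import Counter, namedtuple
from torch.utils.data import DataLoader
from torchtext.vocab import vocab as build_vocab
from sinatools.ner.data.datasets import Token, DefaultDataset

# Toy vocab: DefaultDataset only needs the flat tag vocabulary at tags[0]
Vocab = namedtuple("Vocab", ["tags", "tokens"])
toy_vocab = Vocab(tags=[build_vocab(Counter(["O", "B-PERS", "I-PERS"]))], tokens=None)

# One segment = one sentence; every token carries a list of gold tags
segment = [Token(text=word, gold_tag=["O"]) for word in ["ذهب", "الطفل", "إلى", "المدرسة"]]

dataset = DefaultDataset(examples=[segment], vocab=toy_vocab)
loader = DataLoader(dataset, batch_size=1, collate_fn=dataset.collate_fn)

# A batch is padded subword IDs, padded tag IDs, the original Token objects, and valid lengths
subwords, tags, tokens, valid_len = next(iter(loader))
print(subwords.shape, tags.shape, valid_len)
```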
sinatools/ner/data/transforms.py
ADDED
@@ -0,0 +1,118 @@
+import torch
+from transformers import BertTokenizer
+from functools import partial
+import re
+import itertools
+from sinatools.ner.data import datasets
+class BertSeqTransform:
+    def __init__(self, bert_model, vocab, max_seq_len=512):
+        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
+        self.encoder = partial(
+            self.tokenizer.encode,
+            max_length=max_seq_len,
+            truncation=True,
+        )
+        self.max_seq_len = max_seq_len
+        self.vocab = vocab
+
+    def __call__(self, segment):
+        subwords, tags, tokens = list(), list(), list()
+        unk_token = datasets.Token(text="UNK")
+
+        for token in segment:
+            token_subwords = self.encoder(token.text)[1:-1]
+            subwords += token_subwords
+            tags += [self.vocab.tags[0].get_stoi()[token.gold_tag[0]]] + [self.vocab.tags[0].get_stoi()["O"]] * (len(token_subwords) - 1)
+            tokens += [token] + [unk_token] * (len(token_subwords) - 1)
+
+        # Truncate to max_seq_len
+        if len(subwords) > self.max_seq_len - 2:
+            text = " ".join([t.text for t in tokens if t.text != "UNK"])
+
+            subwords = subwords[:self.max_seq_len - 2]
+            tags = tags[:self.max_seq_len - 2]
+            tokens = tokens[:self.max_seq_len - 2]
+
+        subwords.insert(0, self.tokenizer.cls_token_id)
+        subwords.append(self.tokenizer.sep_token_id)
+
+        tags.insert(0, self.vocab.tags[0].get_stoi()["O"])
+        tags.append(self.vocab.tags[0].get_stoi()["O"])
+
+        tokens.insert(0, unk_token)
+        tokens.append(unk_token)
+
+        return torch.LongTensor(subwords), torch.LongTensor(tags), tokens, len(tokens)
+
+
+class NestedTagsTransform:
+    def __init__(self, bert_model, vocab, max_seq_len=512):
+        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
+        self.encoder = partial(
+            self.tokenizer.encode,
+            max_length=max_seq_len,
+            truncation=True,
+        )
+        self.max_seq_len = max_seq_len
+        self.vocab = vocab
+
+    def __call__(self, segment):
+        tags, tokens, subwords = list(), list(), list()
+        unk_token = datasets.Token(text="UNK")
+
+        # Encode each token and get its subwords and IDs
+        for token in segment:
+            token.subwords = self.encoder(token.text)[1:-1]
+            subwords += token.subwords
+            tokens += [token] + [unk_token] * (len(token.subwords) - 1)
+
+        # Construct the labels for each tag type
+        # The sequence will have a list of tags for each type
+        # The final tags for a sequence is a matrix NUM_TAG_TYPES x SEQ_LEN
+        # Example:
+        # [
+        #    [O, O, B-PERS, I-PERS, O, O, O]
+        #    [B-ORG, I-ORG, O, O, O, O, O]
+        #    [O, O, O, O, O, O, B-GPE]
+        # ]
+        for vocab in self.vocab.tags[1:]:
+            vocab_tags = "|".join([t for t in vocab.get_itos() if "-" in t])
+            r = re.compile(vocab_tags)
+
+            # This is really messy
+            # For a given token we find a matching tag_name, BUT we might find
+            # multiple matches (i.e. a token can be labeled B-ORG and I-ORG) in this
+            # case we get only the first tag as we do not have overlapping of same type
+            single_type_tags = [[(list(filter(r.match, token.gold_tag))
+                                  or ["O"])[0]] + ["O"] * (len(token.subwords) - 1)
+                                for token in segment]
+            single_type_tags = list(itertools.chain(*single_type_tags))
+            tags.append([vocab.get_stoi()[tag] for tag in single_type_tags])
+
+        # Truncate to max_seq_len
+        if len(subwords) > self.max_seq_len - 2:
+            text = " ".join([t.text for t in tokens if t.text != "UNK"])
+
+            subwords = subwords[:self.max_seq_len - 2]
+            tags = [t[:self.max_seq_len - 2] for t in tags]
+            tokens = tokens[:self.max_seq_len - 2]
+
+        # Add dummy token at the start end of sequence
+        tokens.insert(0, unk_token)
+        tokens.append(unk_token)
+
+        # Add CLS and SEP at start end of subwords
+        subwords.insert(0, self.tokenizer.cls_token_id)
+        subwords.append(self.tokenizer.sep_token_id)
+        subwords = torch.LongTensor(subwords)
+
+        # Add "O" tags for the first and last subwords
+        tags = torch.Tensor(tags)
+        tags = torch.column_stack((
+            torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
+            tags,
+            torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
+        )).unsqueeze(0)
+
+        mask = torch.ones_like(tags)
+        return subwords, tags, tokens, mask, len(tokens)
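
Note (not part of the diff): a minimal sketch of what `BertSeqTransform` returns for a single segment. The toy tag vocabulary and the two-word sentence are illustrative assumptions; in the package `vocab.tags[0]` is the flat tag vocabulary built by the data-loading code or shipped with the pretrained checkpoint.

```python
from collections import Counter, namedtuple
from torchtext.vocab import vocab as build_vocab
from sinatools.ner.data.datasets import Token
from sinatools.ner.data.transforms import BertSeqTransform

Vocab = namedtuple("Vocab", ["tags", "tokens"])
toy_vocab = Vocab(tags=[build_vocab(Counter(["O", "B-GPE", "I-GPE"]))], tokens=None)

transform = BertSeqTransform("aubmindlab/bert-base-arabertv2", toy_vocab, max_seq_len=512)
segment = [Token(text="القدس", gold_tag=["B-GPE"]), Token(text="مدينة", gold_tag=["O"])]

subwords, tags, tokens, valid_len = transform(segment)
# Only the first subword of a word keeps its gold tag; continuation subwords
# and the added [CLS]/[SEP] positions are tagged "O"
print(subwords.tolist())
print(tags.tolist())
print(valid_len)
```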
sinatools/ner/data.py
ADDED
@@ -0,0 +1,124 @@
+from torch.utils.data import DataLoader
+from torchtext.vocab import vocab
+from collections import Counter, namedtuple
+import logging
+import re
+import itertools
+from sinatools.ner.helpers import load_object
+from sinatools.ner.datasets import Token
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+
+logger = logging.getLogger(__name__)
+
+
+def conll_to_segments(filename):
+    """
+    Convert CoNLL files to segments. This return list of segments and each segment is
+    a list of tuples (token, tag)
+    :param filename: Path
+    :return: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+    """
+    segments, segment = list(), list()
+
+    with open(filename, "r") as fh:
+        for token in fh.read().splitlines():
+            if not token.strip():
+                segments.append(segment)
+                segment = list()
+            else:
+                parts = token.split()
+                token = Token(text=parts[0], gold_tag=parts[1:])
+                segment.append(token)
+
+        segments.append(segment)
+
+    return segments
+
+
+def parse_conll_files(data_paths):
+    """
+    Parse CoNLL formatted files and return list of segments for each file and index
+    the vocabs and tags across all data_paths
+    :param data_paths: tuple(Path) - tuple of filenames
+    :return: tuple( [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i]
+                    [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i+1],
+                    ...
+                  )
+             List of segments for each dataset and each segment has list of (tokens, tags)
+    """
+    vocabs = namedtuple("Vocab", ["tags", "tokens"])
+    datasets, tags, tokens = list(), list(), list()
+
+    for data_path in data_paths:
+        dataset = conll_to_segments(data_path)
+        datasets.append(dataset)
+        tokens += [token.text for segment in dataset for token in segment]
+        tags += [token.gold_tag for segment in dataset for token in segment]
+
+    # Flatten list of tags
+    tags = list(itertools.chain(*tags))
+
+    # Generate vocabs for tags and tokens
+    tag_vocabs = tag_vocab_by_type(tags)
+    tag_vocabs.insert(0, vocab(Counter(tags)))
+    vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
+    return tuple(datasets), vocabs
+
+
+def tag_vocab_by_type(tags):
+    vocabs = list()
+    c = Counter(tags)
+    tag_names = c.keys()
+    tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
+
+    for tag_type in tag_types:
+        r = re.compile(".*-" + tag_type)
+        t = list(filter(r.match, tags)) + ["O"]
+        vocabs.append(vocab(Counter(t), specials=["<pad>"]))
+
+    return vocabs
+
+
+def text2segments(text):
+    """
+    Convert text to a datasets and index the tokens
+    """
+    #dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
+    list_of_tokens = simple_word_tokenize(text)
+    dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
+    tokens = [token.text for segment in dataset for token in segment]
+
+    # Generate vocabs for the tokens
+    segment_vocab = vocab(Counter(tokens), specials=["UNK"])
+    return dataset, segment_vocab
+
+
+def get_dataloaders(
+    datasets, vocab, data_config, batch_size=32, num_workers=0, shuffle=(True, False, False)
+):
+    """
+    From the datasets generate the dataloaders
+    :param datasets: list - list of the datasets, list of list of segments and tokens
+    :param batch_size: int
+    :param num_workers: int
+    :param shuffle: boolean - to shuffle the data or not
+    :return: List[torch.utils.data.DataLoader]
+    """
+    dataloaders = list()
+
+    for i, examples in enumerate(datasets):
+        data_config["kwargs"].update({"examples": examples, "vocab": vocab})
+        dataset = load_object("sinatools."+data_config["fn"], data_config["kwargs"])
+
+        dataloader = DataLoader(
+            dataset=dataset,
+            shuffle=shuffle[i],
+            batch_size=batch_size,
+            num_workers=num_workers,
+            collate_fn=dataset.collate_fn,
+        )
+
+        logger.info("%s batches found", len(dataloader))
+        dataloaders.append(dataloader)
+
+    return dataloaders
sinatools/ner/data_format.py
ADDED
@@ -0,0 +1,124 @@
+from torch.utils.data import DataLoader
+from torchtext.vocab import vocab
+from collections import Counter, namedtuple
+import logging
+import re
+import itertools
+from sinatools.ner.helpers import load_object
+from sinatools.ner.datasets import Token
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+
+logger = logging.getLogger(__name__)
+
+
+def conll_to_segments(filename):
+    """
+    Convert CoNLL files to segments. This return list of segments and each segment is
+    a list of tuples (token, tag)
+    :param filename: Path
+    :return: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+    """
+    segments, segment = list(), list()
+
+    with open(filename, "r") as fh:
+        for token in fh.read().splitlines():
+            if not token.strip():
+                segments.append(segment)
+                segment = list()
+            else:
+                parts = token.split()
+                token = Token(text=parts[0], gold_tag=parts[1:])
+                segment.append(token)
+
+        segments.append(segment)
+
+    return segments
+
+
+def parse_conll_files(data_paths):
+    """
+    Parse CoNLL formatted files and return list of segments for each file and index
+    the vocabs and tags across all data_paths
+    :param data_paths: tuple(Path) - tuple of filenames
+    :return: tuple( [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i]
+                    [[(token, tag), ...], [(token, tag), ...]],  -> segments for data_paths[i+1],
+                    ...
+                  )
+             List of segments for each dataset and each segment has list of (tokens, tags)
+    """
+    vocabs = namedtuple("Vocab", ["tags", "tokens"])
+    datasets, tags, tokens = list(), list(), list()
+
+    for data_path in data_paths:
+        dataset = conll_to_segments(data_path)
+        datasets.append(dataset)
+        tokens += [token.text for segment in dataset for token in segment]
+        tags += [token.gold_tag for segment in dataset for token in segment]
+
+    # Flatten list of tags
+    tags = list(itertools.chain(*tags))
+
+    # Generate vocabs for tags and tokens
+    tag_vocabs = tag_vocab_by_type(tags)
+    tag_vocabs.insert(0, vocab(Counter(tags)))
+    vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
+    return tuple(datasets), vocabs
+
+
+def tag_vocab_by_type(tags):
+    vocabs = list()
+    c = Counter(tags)
+    tag_names = c.keys()
+    tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
+
+    for tag_type in tag_types:
+        r = re.compile(".*-" + tag_type)
+        t = list(filter(r.match, tags)) + ["O"]
+        vocabs.append(vocab(Counter(t), specials=["<pad>"]))
+
+    return vocabs
+
+
+def text2segments(text):
+    """
+    Convert text to a datasets and index the tokens
+    """
+    #dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
+    list_of_tokens = simple_word_tokenize(text)
+    dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
+    tokens = [token.text for segment in dataset for token in segment]
+
+    # Generate vocabs for the tokens
+    segment_vocab = vocab(Counter(tokens), specials=["UNK"])
+    return dataset, segment_vocab
+
+
+def get_dataloaders(
+    datasets, vocab, data_config, batch_size=32, num_workers=0, shuffle=(True, False, False)
+):
+    """
+    From the datasets generate the dataloaders
+    :param datasets: list - list of the datasets, list of list of segments and tokens
+    :param batch_size: int
+    :param num_workers: int
+    :param shuffle: boolean - to shuffle the data or not
+    :return: List[torch.utils.data.DataLoader]
+    """
+    dataloaders = list()
+
+    for i, examples in enumerate(datasets):
+        data_config["kwargs"].update({"examples": examples, "vocab": vocab})
+        dataset = load_object(data_config["fn"], data_config["kwargs"])
+
+        dataloader = DataLoader(
+            dataset=dataset,
+            shuffle=shuffle[i],
+            batch_size=batch_size,
+            num_workers=num_workers,
+            collate_fn=dataset.collate_fn,
+        )
+
+        logger.info("%s batches found", len(dataloader))
+        dataloaders.append(dataloader)
+
+    return dataloaders
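
Note (not part of the diff): a minimal sketch of the two helpers from the new `sinatools.ner.data_format` module that are most relevant for inference. The sample sentence and the CoNLL lines are made up for illustration; the expected CoNLL layout is one "token tag [tag ...]" pair per line with a blank line between sentences.

```python
import tempfile
from sinatools.ner.data_format import text2segments, conll_to_segments

# Raw text -> one segment of Token objects (gold_tag defaults to ["O"]) plus a token vocab
dataset, segment_vocab = text2segments("ولد المتنبي في الكوفة")
print([token.text for token in dataset[0]])
print(segment_vocab.get_itos())

# Two sentences in CoNLL format: "token tag" per line, blank line between segments
conll = "محمد B-PERS\nيعيش O\nفي O\nرام B-GPE\nالله I-GPE\n\nمرحبا O\n"
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False, encoding="utf-8") as fh:
    fh.write(conll)

segments = conll_to_segments(fh.name)
print(len(segments), [token.text for token in segments[0]])
```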
sinatools/ner/datasets.py
ADDED
@@ -0,0 +1,146 @@
+import logging
+import torch
+from torch.utils.data import Dataset
+from torch.nn.utils.rnn import pad_sequence
+from sinatools.ner.transforms import (
+    BertSeqTransform,
+    NestedTagsTransform
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Token:
+    def __init__(self, text=None, pred_tag=None, gold_tag=None):
+        """
+        Token object to hold token attributes
+        :param text: str
+        :param pred_tag: str
+        :param gold_tag: str
+        """
+        self.text = text
+        self.gold_tag = gold_tag
+        self.pred_tag = pred_tag
+        self.subwords = None
+
+    @property
+    def subwords(self):
+        return self._subwords
+
+    @subwords.setter
+    def subwords(self, value):
+        self._subwords = value
+
+    def __str__(self):
+        """
+        Token text representation
+        :return: str
+        """
+        gold_tags = "|".join(self.gold_tag)
+        pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
+
+        if self.gold_tag:
+            r = f"{self.text}\t{gold_tags}\t{pred_tags}"
+        else:
+            r = f"{self.text}\t{pred_tags}"
+
+        return r
+
+
+class DefaultDataset(Dataset):
+    def __init__(
+        self,
+        examples=None,
+        vocab=None,
+        bert_model="aubmindlab/bert-base-arabertv2",
+        max_seq_len=512,
+    ):
+        """
+        The dataset that used to transform the segments into training data
+        :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                         You can get generate examples from -- arabiner.data.dataset.parse_conll_files
+        :param vocab: vocab object containing indexed tags and tokens
+        :param bert_model: str - BERT model
+        :param: int - maximum sequence length
+        """
+        self.transform = BertSeqTransform(bert_model, vocab, max_seq_len=max_seq_len)
+        self.examples = examples
+        self.vocab = vocab
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, item):
+        subwords, tags, tokens, valid_len = self.transform(self.examples[item])
+        return subwords, tags, tokens, valid_len
+
+    def collate_fn(self, batch):
+        """
+        Collate function that is called when the batch is called by the trainer
+        :param batch: Dataloader batch
+        :return: Same output as the __getitem__ function
+        """
+        subwords, tags, tokens, valid_len = zip(*batch)
+
+        # Pad sequences in this batch
+        # subwords and tokens are padded with zeros
+        # tags are padding with the index of the O tag
+        subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+        tags = pad_sequence(
+            tags, batch_first=True, padding_value=self.vocab.tags[0].get_stoi()["O"]
+        )
+        return subwords, tags, tokens, valid_len
+
+
+class NestedTagsDataset(Dataset):
+    def __init__(
+        self,
+        examples=None,
+        vocab=None,
+        bert_model="aubmindlab/bert-base-arabertv2",
+        max_seq_len=512,
+    ):
+        """
+        The dataset that used to transform the segments into training data
+        :param examples: list[[tuple]] - [[(token, tag), (token, tag), ...], [(token, tag), ...]]
+                         You can get generate examples from -- arabiner.data.dataset.parse_conll_files
+        :param vocab: vocab object containing indexed tags and tokens
+        :param bert_model: str - BERT model
+        :param: int - maximum sequence length
+        """
+        self.transform = NestedTagsTransform(
+            bert_model, vocab, max_seq_len=max_seq_len
+        )
+        self.examples = examples
+        self.vocab = vocab
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, item):
+        subwords, tags, tokens, masks, valid_len = self.transform(self.examples[item])
+        return subwords, tags, tokens, masks, valid_len
+
+    def collate_fn(self, batch):
+        """
+        Collate function that is called when the batch is called by the trainer
+        :param batch: Dataloader batch
+        :return: Same output as the __getitem__ function
+        """
+        subwords, tags, tokens, masks, valid_len = zip(*batch)
+
+        # Pad sequences in this batch
+        # subwords and tokens are padded with zeros
+        # tags are padding with the index of the O tag
+        subwords = pad_sequence(subwords, batch_first=True, padding_value=0)
+
+        masks = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), 0)(mask)
+                 for tag, mask in zip(tags, masks)]
+        masks = torch.cat(masks)
+
+        # Pad the tags, do the padding for each tag type
+        tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["<pad>"])(tag)
+                for tag, vocab in zip(tags, self.vocab.tags[1:])]
+        tags = torch.cat(tags)
+
+        return subwords, tags, tokens, masks, valid_len