SinaTools 0.1.11__py2.py3-none-any.whl → 0.1.12__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/METADATA +2 -3
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/RECORD +47 -26
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/entry_points.txt +7 -3
- sinatools/CLI/DataDownload/download_files.py +0 -10
- sinatools/CLI/ner/corpus_entity_extractor.py +6 -6
- sinatools/CLI/ner/entity_extractor.py +18 -42
- sinatools/CLI/utils/arStrip.py +8 -8
- sinatools/CLI/utils/implication.py +0 -8
- sinatools/CLI/utils/jaccard.py +5 -14
- sinatools/CLI/utils/remove_latin.py +2 -2
- sinatools/CLI/utils/text_dublication_detector.py +25 -0
- sinatools/VERSION +1 -1
- sinatools/morphology/ALMA_multi_word.py +14 -16
- sinatools/morphology/__init__.py +32 -31
- sinatools/ner/__init__.py +28 -2
- sinatools/ner/data/__init__.py +1 -0
- sinatools/ner/data/datasets.py +146 -0
- sinatools/ner/data/transforms.py +118 -0
- sinatools/ner/data.py +124 -0
- sinatools/ner/data_format.py +124 -0
- sinatools/ner/datasets.py +146 -0
- sinatools/ner/entity_extractor.py +34 -54
- sinatools/ner/helpers.py +86 -0
- sinatools/ner/metrics.py +69 -0
- sinatools/ner/nn/BaseModel.py +22 -0
- sinatools/ner/nn/BertNestedTagger.py +34 -0
- sinatools/ner/nn/BertSeqTagger.py +17 -0
- sinatools/ner/nn/__init__.py +3 -0
- sinatools/ner/trainers/BaseTrainer.py +117 -0
- sinatools/ner/trainers/BertNestedTrainer.py +203 -0
- sinatools/ner/trainers/BertTrainer.py +163 -0
- sinatools/ner/trainers/__init__.py +3 -0
- sinatools/ner/transforms.py +119 -0
- sinatools/semantic_relatedness/__init__.py +20 -0
- sinatools/semantic_relatedness/compute_relatedness.py +31 -0
- sinatools/synonyms/__init__.py +18 -0
- sinatools/synonyms/synonyms_generator.py +192 -0
- sinatools/utils/text_dublication_detector.py +110 -0
- sinatools/wsd/__init__.py +11 -0
- sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
- sinatools/{salma → wsd}/wsd.py +1 -1
- sinatools/CLI/salma/salma_tools.py +0 -68
- sinatools/salma/__init__.py +0 -12
- sinatools/utils/utils.py +0 -2
- {SinaTools-0.1.11.data → SinaTools-0.1.12.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.12.dist-info}/top_level.txt +0 -0
- /sinatools/{salma → wsd}/settings.py +0 -0
sinatools/ner/trainers/BertTrainer.py
@@ -0,0 +1,163 @@
import os
import logging
import torch
import numpy as np
from sinatools.ner.trainers import BaseTrainer
from sinatools.ner.metrics import compute_single_label_metrics

logger = logging.getLogger(__name__)


class BertTrainer(BaseTrainer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def train(self):
        best_val_loss, test_loss = np.inf, np.inf
        num_train_batch = len(self.train_dataloader)
        patience = self.patience

        for epoch_index in range(self.max_epochs):
            self.current_epoch = epoch_index
            train_loss = 0

            for batch_index, (_, gold_tags, _, _, logits) in enumerate(self.tag(
                self.train_dataloader, is_train=True
            ), 1):
                self.current_timestep += 1
                batch_loss = self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
                batch_loss.backward()

                # Avoid exploding gradients by clipping the gradient norm
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)

                self.optimizer.step()
                self.scheduler.step()
                train_loss += batch_loss.item()

                if self.current_timestep % self.log_interval == 0:
                    logger.info(
                        "Epoch %d | Batch %d/%d | Timestep %d | LR %.10f | Loss %f",
                        epoch_index,
                        batch_index,
                        num_train_batch,
                        self.current_timestep,
                        self.optimizer.param_groups[0]['lr'],
                        batch_loss.item()
                    )

            train_loss /= num_train_batch

            logger.info("** Evaluating on validation dataset **")
            val_preds, segments, valid_len, val_loss = self.eval(self.val_dataloader)
            val_metrics = compute_single_label_metrics(segments)

            epoch_summary_loss = {
                "train_loss": train_loss,
                "val_loss": val_loss
            }
            epoch_summary_metrics = {
                "val_micro_f1": val_metrics.micro_f1,
                "val_precision": val_metrics.precision,
                "val_recall": val_metrics.recall
            }

            logger.info(
                "Epoch %d | Timestep %d | Train Loss %f | Val Loss %f | F1 %f",
                epoch_index,
                self.current_timestep,
                train_loss,
                val_loss,
                val_metrics.micro_f1
            )

            if val_loss < best_val_loss:
                patience = self.patience
                best_val_loss = val_loss
                logger.info("** Validation improved, evaluating test data **")
                test_preds, segments, valid_len, test_loss = self.eval(self.test_dataloader)
                self.segments_to_file(segments, os.path.join(self.output_path, "predictions.txt"))
                test_metrics = compute_single_label_metrics(segments)

                epoch_summary_loss["test_loss"] = test_loss
                epoch_summary_metrics["test_micro_f1"] = test_metrics.micro_f1
                epoch_summary_metrics["test_precision"] = test_metrics.precision
                epoch_summary_metrics["test_recall"] = test_metrics.recall

                logger.info(
                    "Epoch %d | Timestep %d | Test Loss %f | F1 %f",
                    epoch_index,
                    self.current_timestep,
                    test_loss,
                    test_metrics.micro_f1
                )

                self.save()
            else:
                patience -= 1

                # No improvement, terminate early
                if patience == 0:
                    logger.info("Early termination triggered")
                    break

            self.summary_writer.add_scalars("Loss", epoch_summary_loss, global_step=self.current_timestep)
            self.summary_writer.add_scalars("Metrics", epoch_summary_metrics, global_step=self.current_timestep)

    def eval(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()
        loss = 0

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            loss += self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
            segments += tokens
            valid_lens += list(valid_len)

        loss /= len(dataloader)

        # Update segments, attach predicted tags to each token
        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)

        return preds, segments, valid_lens, loss.item()

    def infer(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
            segments += tokens
            valid_lens += list(valid_len)

        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
        return segments

    def to_segments(self, segments, preds, valid_lens, vocab):
        if vocab is None:
            vocab = self.vocab

        tagged_segments = list()
        tokens_stoi = vocab.tokens.get_stoi()
        tags_itos = vocab.tags[0].get_itos()
        unk_id = tokens_stoi["UNK"]

        for segment, pred, valid_len in zip(segments, preds, valid_lens):
            # Skip the token at index 0 ([CLS]) and the token at index n ([SEP]),
            # then combine the remaining tokens with their corresponding predictions
            segment_pred = zip(segment[1:valid_len-1], pred[1:valid_len-1])

            # Ignore the sub-tokens/subwords, which are identified by the text "UNK"
            segment_pred = list(filter(lambda t: tokens_stoi[t[0].text] != unk_id, segment_pred))

            # Attach the predicted tags to each token
            list(map(lambda t: setattr(t[0], 'pred_tag', [{"tag": tags_itos[t[1]]}]), segment_pred))

            # We only need the tagged tokens; the raw model predictions are no longer required
            tagged_segment = [t for t, _ in segment_pred]
            tagged_segments.append(tagged_segment)

        return tagged_segments
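The trainer computes a token-level loss by flattening the logits (batch, seq_len, num_tags) and gold tags (batch, seq_len) before passing them to `self.loss`. The following is an illustrative sketch only, not part of the package: it uses hypothetical shapes and assumes a standard `torch.nn.CrossEntropyLoss` (the actual loss object is supplied by `BaseTrainer`).

```python
# Illustrative sketch: the flatten-then-loss pattern used in BertTrainer.train().
import torch

batch, seq_len, num_tags = 2, 6, 5                      # hypothetical shapes
logits = torch.randn(batch, seq_len, num_tags)          # model outputs
gold_tags = torch.randint(0, num_tags, (batch, seq_len))

loss_fn = torch.nn.CrossEntropyLoss()                   # assumed; set by BaseTrainer in practice
batch_loss = loss_fn(logits.view(-1, logits.shape[-1]),  # (batch*seq_len, num_tags)
                     gold_tags.view(-1))                  # (batch*seq_len,)
print(batch_loss.item())
```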
sinatools/ner/transforms.py
@@ -0,0 +1,119 @@
import torch
from transformers import BertTokenizer
from functools import partial
import re
import itertools
from sinatools.ner import datasets

class BertSeqTransform:
    def __init__(self, bert_model, vocab, max_seq_len=512):
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        self.encoder = partial(
            self.tokenizer.encode,
            max_length=max_seq_len,
            truncation=True,
        )
        self.max_seq_len = max_seq_len
        self.vocab = vocab

    def __call__(self, segment):
        subwords, tags, tokens = list(), list(), list()
        unk_token = datasets.Token(text="UNK")

        for token in segment:
            token_subwords = self.encoder(token.text)[1:-1]
            subwords += token_subwords
            tags += [self.vocab.tags[0].get_stoi()[token.gold_tag[0]]] + [self.vocab.tags[0].get_stoi()["O"]] * (len(token_subwords) - 1)
            tokens += [token] + [unk_token] * (len(token_subwords) - 1)

        # Truncate to max_seq_len
        if len(subwords) > self.max_seq_len - 2:
            text = " ".join([t.text for t in tokens if t.text != "UNK"])

            subwords = subwords[:self.max_seq_len - 2]
            tags = tags[:self.max_seq_len - 2]
            tokens = tokens[:self.max_seq_len - 2]

        subwords.insert(0, self.tokenizer.cls_token_id)
        subwords.append(self.tokenizer.sep_token_id)

        tags.insert(0, self.vocab.tags[0].get_stoi()["O"])
        tags.append(self.vocab.tags[0].get_stoi()["O"])

        tokens.insert(0, unk_token)
        tokens.append(unk_token)

        return torch.LongTensor(subwords), torch.LongTensor(tags), tokens, len(tokens)


class NestedTagsTransform:
    def __init__(self, bert_model, vocab, max_seq_len=512):
        self.tokenizer = BertTokenizer.from_pretrained(bert_model)
        self.encoder = partial(
            self.tokenizer.encode,
            max_length=max_seq_len,
            truncation=True,
        )
        self.max_seq_len = max_seq_len
        self.vocab = vocab

    def __call__(self, segment):
        tags, tokens, subwords = list(), list(), list()
        unk_token = datasets.Token(text="UNK")

        # Encode each token and get its subwords and IDs
        for token in segment:
            token.subwords = self.encoder(token.text)[1:-1]
            subwords += token.subwords
            tokens += [token] + [unk_token] * (len(token.subwords) - 1)

        # Construct the labels for each tag type.
        # The sequence gets one list of tags per type, so the final tags for a
        # sequence form a matrix of shape NUM_TAG_TYPES x SEQ_LEN.
        # Example:
        # [
        #   [O, O, B-PERS, I-PERS, O, O, O],
        #   [B-ORG, I-ORG, O, O, O, O, O],
        #   [O, O, O, O, O, O, B-GPE]
        # ]
        for vocab in self.vocab.tags[1:]:
            vocab_tags = "|".join([t for t in vocab.get_itos() if "-" in t])
            r = re.compile(vocab_tags)

            # This is rather messy: for a given token we look for a matching tag name,
            # but there may be multiple matches (i.e. a token labeled both B-ORG and
            # I-ORG). In that case we keep only the first tag, since overlapping
            # mentions of the same type are not allowed.
            single_type_tags = [[(list(filter(r.match, token.gold_tag))
                                  or ["O"])[0]] + ["O"] * (len(token.subwords) - 1)
                                for token in segment]
            single_type_tags = list(itertools.chain(*single_type_tags))
            tags.append([vocab.get_stoi()[tag] for tag in single_type_tags])

        # Truncate to max_seq_len
        if len(subwords) > self.max_seq_len - 2:
            text = " ".join([t.text for t in tokens if t.text != "UNK"])

            subwords = subwords[:self.max_seq_len - 2]
            tags = [t[:self.max_seq_len - 2] for t in tags]
            tokens = tokens[:self.max_seq_len - 2]

        # Add dummy tokens at the start and end of the sequence
        tokens.insert(0, unk_token)
        tokens.append(unk_token)

        # Add CLS and SEP at the start and end of the subwords
        subwords.insert(0, self.tokenizer.cls_token_id)
        subwords.append(self.tokenizer.sep_token_id)
        subwords = torch.LongTensor(subwords)

        # Add "O" tags for the first and last subwords
        tags = torch.Tensor(tags)
        tags = torch.column_stack((
            torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
            tags,
            torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
        )).unsqueeze(0)

        mask = torch.ones_like(tags)
        return subwords, tags, tokens, mask, len(tokens)
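BertSeqTransform keeps the gold tag on a token's first subword, pads the remaining subwords with "O", and mirrors the token list with "UNK" placeholders so that sub-pieces can be filtered out later. The sketch below illustrates only that alignment rule; the `align` helper and the toy `split` function are hypothetical stand-ins for the transform and for `BertTokenizer.encode`.

```python
# Illustrative sketch of the tag/subword alignment rule used by BertSeqTransform.
def align(tokens_with_tags, split):
    subwords, tags, tokens = [], [], []
    for text, gold_tag in tokens_with_tags:
        pieces = split(text)                              # subword pieces for this token
        subwords += pieces
        tags += [gold_tag] + ["O"] * (len(pieces) - 1)    # gold tag on the first piece only
        tokens += [text] + ["UNK"] * (len(pieces) - 1)    # later pieces marked as UNK
    return subwords, tags, tokens

# Hypothetical split: "playing" -> ["play", "##ing"]
split = lambda w: [w[:4], "##" + w[4:]] if len(w) > 4 else [w]
print(align([("playing", "B-PERS"), ("now", "O")], split))
# (['play', '##ing', 'now'], ['B-PERS', 'O', 'O'], ['playing', 'UNK', 'now'])
```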
sinatools/semantic_relatedness/__init__.py
@@ -0,0 +1,20 @@
import warnings
warnings.filterwarnings("ignore")
from sinatools.DataDownload import downloader
import os
from transformers import BertTokenizer, BertModel

model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
path = downloader.get_appdatadir()
model_file_path = os.path.join(path, model_file_name)

tokenizer_file_name = "bert-base-arabertv02"
path = downloader.get_appdatadir()
tokenizer_file_path = os.path.join(path, tokenizer_file_name)

model = BertModel.from_pretrained('{}'.format(model_file_path),
                                  output_hidden_states=True,
                                  num_labels=2
                                  )

tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))
sinatools/semantic_relatedness/compute_relatedness.py
@@ -0,0 +1,31 @@
import torch
from . import tokenizer
from . import model

# Cosine similarity using average embeddings
def get_similarity_score(sentence1, sentence2):

    # Tokenize and encode sentences
    inputs1 = tokenizer(sentence1, return_tensors="pt")
    inputs2 = tokenizer(sentence2, return_tensors="pt")

    # Extract embeddings
    with torch.no_grad():
        outputs1 = model(**inputs1)
        outputs2 = model(**inputs2)

    embeddings1 = outputs1.last_hidden_state
    embeddings2 = outputs2.last_hidden_state

    # Mask padding tokens
    attention_mask1 = inputs1["attention_mask"]
    attention_mask2 = inputs2["attention_mask"]

    # Average pool across tokens, excluding padding
    embeddings1_avg = torch.sum(embeddings1 * attention_mask1.unsqueeze(-1), dim=1) / torch.sum(attention_mask1, dim=1, keepdim=True)
    embeddings2_avg = torch.sum(embeddings2 * attention_mask2.unsqueeze(-1), dim=1) / torch.sum(attention_mask2, dim=1, keepdim=True)

    # Calculate cosine similarity
    similarity = torch.nn.functional.cosine_similarity(embeddings1_avg, embeddings2_avg)

    return similarity.item()
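A minimal usage sketch of `get_similarity_score`, assuming the fine-tuned AraBERT model and tokenizer have already been downloaded into the SinaTools data directory (the `semantic_relatedness` package loads them at import time). The example sentences are hypothetical.

```python
# Illustrative usage sketch (assumes the model files are already downloaded).
from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score

sentence1 = "جاء الولد الى المدرسة"
sentence2 = "ذهب الولد الى المدرسة"
score = get_similarity_score(sentence1, sentence2)
print(score)  # cosine similarity of the average-pooled embeddings, in [-1, 1]
```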
sinatools/synonyms/__init__.py
@@ -0,0 +1,18 @@
import pickle
from sinatools.DataDownload import downloader
import os

synonyms_level2_dict = {}
level2_dict = 'synonyms_level2.pkl'
path = downloader.get_appdatadir()
file_path = os.path.join(path, level2_dict)
with open(file_path, 'rb') as f:
    synonyms_level2_dict = pickle.load(f, encoding='utf-8')


synonyms_level3_dict = {}
#level3_dict = 'synonyms_level3.pkl'
#path = downloader.get_appdatadir()
#file_path = os.path.join(path, level3_dict)
#with open(file_path, 'rb') as f:
#    synonyms_level3_dict = pickle.load(f, encoding='utf-8')
sinatools/synonyms/synonyms_generator.py
@@ -0,0 +1,192 @@
from . import synonyms_level2_dict, synonyms_level3_dict
from copy import deepcopy

def dfs(graph, start, end, level):
    level = level - 2
    edge = [(start, [])]
    while edge:
        state, path = edge.pop()
        if path and state == end:
            yield path
            continue
        for next_state in graph[state]:
            if next_state not in path:
                edge.append((next_state, path + [next_state]))

        if len(path) > level:
            break


def find_cycles(level, synset, used_graph):
    cycles = []
    source_with_unique_candidates = {}
    for source in synset:
        source_with_unique_candidates[source] = set()

        for path in dfs(used_graph, source, source, level):
            cycle = [source] + path
            if len(cycle) <= level:
                cycles.append(cycle)
                source_with_unique_candidates[source] = set(source_with_unique_candidates[source].union(set(cycle)))
    return cycles, source_with_unique_candidates


def get_list_of_unique_synonems(synset, cycles, unique_synonyms, synonems_with_unique_candidates):
    list_of_unique_synonyms = []
    for i in range(0, len(unique_synonyms)):
        synonym = unique_synonyms[i]
        count = 0
        syn_count = 0
        for cycle in cycles:
            if synonym in cycle:
                count = count + 1

        for v in synonems_with_unique_candidates:
            tmp = list(synonems_with_unique_candidates[v])
            if synonym in tmp:
                syn_count = syn_count + 1

        list_of_unique_synonyms.append([synonym, count, syn_count])
    return list_of_unique_synonyms


def find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cycles, length_of_synset, synset):
    list_of_synon_with_fuzzy_value = []

    if level == 4:
        theta1 = 0.5
        theta2 = 0.5
    elif level == 3:
        theta1 = 0.4
        theta2 = 0.6
    elif level == 2:
        theta1 = 0.4
        theta2 = 0.6
    else:
        theta1 = 0
        theta2 = 0

    for unique_syn in list_of_unique_synonyms:
        if unique_syn[0] not in synset:
            equ = (unique_syn[1] / number_of_cycles) * theta1 + (unique_syn[2] / length_of_synset) * theta2
            list_of_synon_with_fuzzy_value.append([unique_syn[0], equ])
    return list_of_synon_with_fuzzy_value


def extend_synonyms(synset, level):

    used_graph = {}
    if level == 2:
        used_graph = synonyms_level2_dict
    elif level == 3:
        used_graph = synonyms_level3_dict
    else:
        return "Please choose the correct level"

    cycles = []
    nodes = []
    synonems_with_unique_candidates = {}
    number_of_cycles = 0
    final_synset = []
    if synset != None:
        synset = synset.split("|")
        for syn in synset:
            syn = syn.strip()
            if syn in list(used_graph.keys()):
                synonems_with_unique_candidates[syn] = set()
                final_synset.append(syn)

                cycles_list = used_graph[syn]
                number_of_cycles = number_of_cycles + len(cycles_list)
                for cycle in cycles_list:
                    cycles.append(cycle)
                    for c in cycle:
                        nodes.append(c)
                        synonems_with_unique_candidates[syn] = set(synonems_with_unique_candidates[syn].union(set([c])))

    unique_synonyms = list(set(nodes))
    list_of_unique_synonyms = get_list_of_unique_synonems(final_synset, cycles, unique_synonyms, synonems_with_unique_candidates)

    length_of_synset = len(final_synset)

    list_of_synon_with_fuzzy_value = find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cycles, length_of_synset, final_synset)

    list_of_synon_with_fuzzy_value.sort(key=lambda row: (row[1], row[0]), reverse=True)

    return list_of_synon_with_fuzzy_value

def evaluate_synonyms(synset, level):

    used_graph = {}
    if level == 2:
        used_graph = synonyms_level2_dict
    elif level == 3:
        used_graph = synonyms_level3_dict
    else:
        return "Please choose the correct level"

    cycles = []
    synonems_with_unique_candidates = {}
    number_of_cycles = 0
    final_synset = []

    if synset != None:
        synset = synset.split("|")

        for syn in synset:
            syn = syn.strip()
            if syn in list(used_graph.keys()):
                synonems_with_unique_candidates[syn] = set()
                final_synset.append(syn)

                cycles_list = used_graph[syn]
                for cycle in cycles_list:
                    cycles.append(cycle)
                    for c in cycle:
                        synonems_with_unique_candidates[syn] = set(synonems_with_unique_candidates[syn].union(set([c])))

    if len(final_synset) > 1:
        fuzzy_result = []
        for syn in final_synset:
            included = False
            tmp_synset = deepcopy(final_synset)
            tmp_synset.remove(syn)

            tmp_cycles = deepcopy(cycles)
            filtered_cycle = [x for x in tmp_cycles if x[0] != syn]

            nodes = []
            for tmp_cycle in filtered_cycle:
                for c in tmp_cycle:
                    nodes.append(c)

            tmp_synonems_with_unique_candidates = deepcopy(synonems_with_unique_candidates)
            del tmp_synonems_with_unique_candidates[syn]

            unique_synonyms = list(set(nodes))

            tmp_unique_synonyms = deepcopy(unique_synonyms)

            number_of_cycles = len(filtered_cycle)

            length_of_synset = len(tmp_synset)

            list_of_unique_synonyms = get_list_of_unique_synonems(tmp_synset, filtered_cycle, tmp_unique_synonyms, tmp_synonems_with_unique_candidates)

            list_of_synon_with_fuzzy_value = find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cycles, length_of_synset, tmp_synset)

            for x in list_of_synon_with_fuzzy_value:
                if x[0] == syn:
                    fuzzy_result.append(x)
                    included = True

            if included == False:
                fuzzy_result.append([syn, 0])
            else:
                included = False

        fuzzy_result.sort(key=lambda row: (row[1], row[0]), reverse=True)

        return fuzzy_result
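A minimal usage sketch of the two public functions above. It assumes `synonyms_level2.pkl` has already been downloaded, since the `synonyms` package loads it at import time; the synset is a pipe-separated string and the Arabic words here are hypothetical examples.

```python
# Illustrative usage sketch (assumes the level-2 synonyms graph is downloaded).
from sinatools.synonyms.synonyms_generator import extend_synonyms, evaluate_synonyms

candidates = extend_synonyms("ممتاز | رائع", level=2)
print(candidates)  # [[candidate_synonym, fuzzy_value], ...] sorted by fuzzy_value, descending

scores = evaluate_synonyms("ممتاز | رائع | جيد", level=2)
print(scores)      # each input synonym with the fuzzy value it receives from the others
```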
sinatools/utils/text_dublication_detector.py
@@ -0,0 +1,110 @@

import pandas as pd
import re
import math
from collections import Counter
from sinatools.utils.parser import arStrip
from sinatools.utils.parser import remove_punctuation

def validator(sentence, max_tokens=500):
    tokens = len(sentence.split())
    if tokens > max_tokens:
        return f"Invalid: Sentence has {tokens} tokens, which exceeds the maximum allowed ({max_tokens})."
    else:
        return "Valid"


def removal(csv_file, columnName, finalFileName, deletedFileName, similarityThreshold=0.8):
    # Read the CSV file
    try:
        df = pd.read_csv(csv_file)
    except FileNotFoundError:
        return "Error: CSV file not found."

    # Check if the specified column exists
    if columnName not in df.columns:
        return f"Error: Column '{columnName}' does not exist in the CSV file."

    # Create an empty DataFrame to store the final results
    finalDf = pd.DataFrame(columns=df.columns)

    # Create a temporary DataFrame for deleted sentences
    deletedSentencesDf = pd.DataFrame(columns=df.columns)

    # Iterate through each row in the DataFrame
    for index, row in df.iterrows():
        currentSentence = str(row[columnName])

        # Call the validator function for each sentence
        #validationResult = validator(currentSentence)
        validationResult = "Valid"

        if validationResult == "Valid":
            # Check cosine similarity with all sentences in the final DataFrame
            isDuplicate = False
            DublicatedRow = ""
            for _, finalRow in finalDf.iterrows():
                finalSentence = str(finalRow[columnName])
                currentSentence = remove_punctuation(arStrip(currentSentence, diacs=False, smallDiacs=False, shaddah=False, digit=True, alif=False, specialChars=True))
                finalSentence = remove_punctuation(arStrip(finalSentence, diacs=False, smallDiacs=False, shaddah=False, digit=True, alif=False, specialChars=True))
                if currentSentence != "" and finalSentence != "":
                    similarity = calculateCosineSimilarity(currentSentence, finalSentence)

                    if similarity >= similarityThreshold:
                        isDuplicate = True
                        DublicatedRow = finalSentence
                        print("DublicatedRow : ", DublicatedRow)
                        break

            if not isDuplicate:
                # If not a duplicate, add the sentence to the final DataFrame
                finalDf = finalDf.append(row, ignore_index=True)
            else:
                # If a duplicate, add the sentence to the deleted sentences DataFrame
                #deletedSentencesDf = deletedSentencesDf.append(row, ignore_index=True)
                deletedSentencesDf = deletedSentencesDf.append({**row, 'Dublicated': DublicatedRow}, ignore_index=True)
        else:
            # If validation fails, return the error message
            return validationResult

    # Save the final results to CSV files
    finalDf.to_csv(finalFileName, index=False)
    deletedSentencesDf.to_csv(deletedFileName, index=False)


def calculateCosineSimilarity(sentence1, sentence2):
    vector1 = textToVector(sentence1)
    vector2 = textToVector(sentence2)
    cosine = getCosine(vector1, vector2)

    return cosine


def getCosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def textToVector(text):
    WORD = re.compile(r"\w+")
    words = WORD.findall(text)
    return Counter(words)


# columnName = "Message"
# csvFile = "Arabic-Oct7-Feb12.csv"
# similarityThreshold = 0.8
# finalFileName = "Arabic-Oct7-Feb12FINAL.csv"
# deletedFileName = "Arabic-Oct7-Feb12DeletedSent.csv"

# result = removal(csvFile, columnName, finalFileName, deletedFileName, similarityThreshold)
# print(result)
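The duplication check rests on a simple bag-of-words cosine over word counts. A short sketch of calling that helper directly, assuming SinaTools is installed; the sentences are hypothetical.

```python
# Illustrative sketch: the bag-of-words cosine used by the duplication detector.
from sinatools.utils.text_dublication_detector import calculateCosineSimilarity

s1 = "هذا النص مثال للتجربة"
s2 = "هذا النص مثال"
print(calculateCosineSimilarity(s1, s2))  # near 1.0 for near-identical word counts, 0.0 for no shared words
```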
sinatools/wsd/__init__.py
@@ -0,0 +1,11 @@
from sinatools.wsd import settings
import pickle
from sinatools.DataDownload import downloader
import os

settings.glosses_dic = {}
filename = 'glosses_dic.pickle'
path = downloader.get_appdatadir()
file_path = os.path.join(path, filename)
with open(file_path, 'rb') as f:
    settings.glosses_dic = pickle.load(f)