SinaTools 0.1.11__py2.py3-none-any.whl → 0.1.13__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (50)
  1. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/METADATA +2 -3
  2. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/RECORD +47 -26
  3. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/entry_points.txt +7 -3
  4. sinatools/CLI/DataDownload/download_files.py +0 -10
  5. sinatools/CLI/ner/corpus_entity_extractor.py +9 -6
  6. sinatools/CLI/ner/entity_extractor.py +18 -42
  7. sinatools/CLI/utils/arStrip.py +8 -8
  8. sinatools/CLI/utils/implication.py +0 -8
  9. sinatools/CLI/utils/jaccard.py +5 -14
  10. sinatools/CLI/utils/remove_latin.py +2 -2
  11. sinatools/CLI/utils/text_dublication_detector.py +25 -0
  12. sinatools/VERSION +1 -1
  13. sinatools/morphology/ALMA_multi_word.py +14 -16
  14. sinatools/morphology/__init__.py +32 -31
  15. sinatools/ner/__init__.py +28 -2
  16. sinatools/ner/data/__init__.py +1 -0
  17. sinatools/ner/data/datasets.py +146 -0
  18. sinatools/ner/data/transforms.py +118 -0
  19. sinatools/ner/data.py +124 -0
  20. sinatools/ner/data_format.py +124 -0
  21. sinatools/ner/datasets.py +146 -0
  22. sinatools/ner/entity_extractor.py +34 -54
  23. sinatools/ner/helpers.py +86 -0
  24. sinatools/ner/metrics.py +69 -0
  25. sinatools/ner/nn/BaseModel.py +22 -0
  26. sinatools/ner/nn/BertNestedTagger.py +34 -0
  27. sinatools/ner/nn/BertSeqTagger.py +17 -0
  28. sinatools/ner/nn/__init__.py +3 -0
  29. sinatools/ner/trainers/BaseTrainer.py +117 -0
  30. sinatools/ner/trainers/BertNestedTrainer.py +203 -0
  31. sinatools/ner/trainers/BertTrainer.py +163 -0
  32. sinatools/ner/trainers/__init__.py +3 -0
  33. sinatools/ner/transforms.py +119 -0
  34. sinatools/semantic_relatedness/__init__.py +20 -0
  35. sinatools/semantic_relatedness/compute_relatedness.py +31 -0
  36. sinatools/synonyms/__init__.py +18 -0
  37. sinatools/synonyms/synonyms_generator.py +192 -0
  38. sinatools/utils/text_dublication_detector.py +110 -0
  39. sinatools/wsd/__init__.py +11 -0
  40. sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
  41. sinatools/{salma → wsd}/wsd.py +1 -1
  42. sinatools/CLI/salma/salma_tools.py +0 -68
  43. sinatools/salma/__init__.py +0 -12
  44. sinatools/utils/utils.py +0 -2
  45. {SinaTools-0.1.11.data → SinaTools-0.1.13.data}/data/sinatools/environment.yml +0 -0
  46. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/AUTHORS.rst +0 -0
  47. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/LICENSE +0 -0
  48. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/WHEEL +0 -0
  49. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/top_level.txt +0 -0
  50. /sinatools/{salma → wsd}/settings.py +0 -0
@@ -0,0 +1,163 @@
+ import os
+ import logging
+ import torch
+ import numpy as np
+ from sinatools.ner.trainers import BaseTrainer
+ from sinatools.ner.metrics import compute_single_label_metrics
+
+ logger = logging.getLogger(__name__)
+
+
+ class BertTrainer(BaseTrainer):
+     def __init__(self, **kwargs):
+         super().__init__(**kwargs)
+
+     def train(self):
+         best_val_loss, test_loss = np.inf, np.inf
+         num_train_batch = len(self.train_dataloader)
+         patience = self.patience
+
+         for epoch_index in range(self.max_epochs):
+             self.current_epoch = epoch_index
+             train_loss = 0
+
+             for batch_index, (_, gold_tags, _, _, logits) in enumerate(self.tag(
+                 self.train_dataloader, is_train=True
+             ), 1):
+                 self.current_timestep += 1
+                 batch_loss = self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
+                 batch_loss.backward()
+
+                 # Avoid exploding gradients by clipping them
+                 torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
+
+                 self.optimizer.step()
+                 self.scheduler.step()
+                 train_loss += batch_loss.item()
+
+                 if self.current_timestep % self.log_interval == 0:
+                     logger.info(
+                         "Epoch %d | Batch %d/%d | Timestep %d | LR %.10f | Loss %f",
+                         epoch_index,
+                         batch_index,
+                         num_train_batch,
+                         self.current_timestep,
+                         self.optimizer.param_groups[0]['lr'],
+                         batch_loss.item()
+                     )
+
+             train_loss /= num_train_batch
+
+             logger.info("** Evaluating on validation dataset **")
+             val_preds, segments, valid_len, val_loss = self.eval(self.val_dataloader)
+             val_metrics = compute_single_label_metrics(segments)
+
+             epoch_summary_loss = {
+                 "train_loss": train_loss,
+                 "val_loss": val_loss
+             }
+             epoch_summary_metrics = {
+                 "val_micro_f1": val_metrics.micro_f1,
+                 "val_precision": val_metrics.precision,
+                 "val_recall": val_metrics.recall
+             }
+
+             logger.info(
+                 "Epoch %d | Timestep %d | Train Loss %f | Val Loss %f | F1 %f",
+                 epoch_index,
+                 self.current_timestep,
+                 train_loss,
+                 val_loss,
+                 val_metrics.micro_f1
+             )
+
+             if val_loss < best_val_loss:
+                 patience = self.patience
+                 best_val_loss = val_loss
+                 logger.info("** Validation improved, evaluating test data **")
+                 test_preds, segments, valid_len, test_loss = self.eval(self.test_dataloader)
+                 self.segments_to_file(segments, os.path.join(self.output_path, "predictions.txt"))
+                 test_metrics = compute_single_label_metrics(segments)
+
+                 epoch_summary_loss["test_loss"] = test_loss
+                 epoch_summary_metrics["test_micro_f1"] = test_metrics.micro_f1
+                 epoch_summary_metrics["test_precision"] = test_metrics.precision
+                 epoch_summary_metrics["test_recall"] = test_metrics.recall
+
+                 logger.info(
+                     "Epoch %d | Timestep %d | Test Loss %f | F1 %f",
+                     epoch_index,
+                     self.current_timestep,
+                     test_loss,
+                     test_metrics.micro_f1
+                 )
+
+                 self.save()
+             else:
+                 patience -= 1
+
+                 # No improvement, terminate early
+                 if patience == 0:
+                     logger.info("Early termination triggered")
+                     break
+
+             self.summary_writer.add_scalars("Loss", epoch_summary_loss, global_step=self.current_timestep)
+             self.summary_writer.add_scalars("Metrics", epoch_summary_metrics, global_step=self.current_timestep)
+
+     def eval(self, dataloader):
+         golds, preds, segments, valid_lens = list(), list(), list(), list()
+         loss = 0
+
+         for _, gold_tags, tokens, valid_len, logits in self.tag(
+             dataloader, is_train=False
+         ):
+             loss += self.loss(logits.view(-1, logits.shape[-1]), gold_tags.view(-1))
+             preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
+             segments += tokens
+             valid_lens += list(valid_len)
+
+         loss /= len(dataloader)
+
+         # Update segments, attach predicted tags to each token
+         segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
+
+         return preds, segments, valid_lens, loss.item()
+
+     def infer(self, dataloader):
+         golds, preds, segments, valid_lens = list(), list(), list(), list()
+
+         for _, gold_tags, tokens, valid_len, logits in self.tag(
+             dataloader, is_train=False
+         ):
+             preds += torch.argmax(logits, dim=2).detach().cpu().numpy().tolist()
+             segments += tokens
+             valid_lens += list(valid_len)
+
+         segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
+         return segments
+
+     def to_segments(self, segments, preds, valid_lens, vocab):
+         if vocab is None:
+             vocab = self.vocab
+
+         tagged_segments = list()
+         tokens_stoi = vocab.tokens.get_stoi()
+         tags_itos = vocab.tags[0].get_itos()
+         unk_id = tokens_stoi["UNK"]
+
+         for segment, pred, valid_len in zip(segments, preds, valid_lens):
+             # Skip the token at the 0th index ([CLS]) and the token at the nth index ([SEP]),
+             # then combine the tokens with their corresponding predictions
+             segment_pred = zip(segment[1:valid_len-1], pred[1:valid_len-1])
+
+             # Ignore the sub-tokens/subwords, which are identified by their text being UNK
+             segment_pred = list(filter(lambda t: tokens_stoi[t[0].text] != unk_id, segment_pred))
+
+             # Attach the predicted tags to each token
+             list(map(lambda t: setattr(t[0], 'pred_tag', [{"tag": tags_itos[t[1]]}]), segment_pred))
+
+             # We are only interested in the tagged tokens; the raw model predictions are no longer needed
+             tagged_segment = [t for t, _ in segment_pred]
+             tagged_segments.append(tagged_segment)
+
+         return tagged_segments
@@ -0,0 +1,3 @@
+ from sinatools.ner.trainers.BaseTrainer import BaseTrainer
+ from sinatools.ner.trainers.BertTrainer import BertTrainer
+ from sinatools.ner.trainers.BertNestedTrainer import BertNestedTrainer
@@ -0,0 +1,119 @@
+ import torch
+ from transformers import BertTokenizer
+ from functools import partial
+ import re
+ import itertools
+ from sinatools.ner import datasets
+
+ class BertSeqTransform:
+     def __init__(self, bert_model, vocab, max_seq_len=512):
+         self.tokenizer = BertTokenizer.from_pretrained(bert_model)
+         self.encoder = partial(
+             self.tokenizer.encode,
+             max_length=max_seq_len,
+             truncation=True,
+         )
+         self.max_seq_len = max_seq_len
+         self.vocab = vocab
+
+     def __call__(self, segment):
+         subwords, tags, tokens = list(), list(), list()
+         unk_token = datasets.Token(text="UNK")
+
+         for token in segment:
+             token_subwords = self.encoder(token.text)[1:-1]
+             subwords += token_subwords
+             tags += [self.vocab.tags[0].get_stoi()[token.gold_tag[0]]] + [self.vocab.tags[0].get_stoi()["O"]] * (len(token_subwords) - 1)
+             tokens += [token] + [unk_token] * (len(token_subwords) - 1)
+
+         # Truncate to max_seq_len
+         if len(subwords) > self.max_seq_len - 2:
+             text = " ".join([t.text for t in tokens if t.text != "UNK"])
+
+             subwords = subwords[:self.max_seq_len - 2]
+             tags = tags[:self.max_seq_len - 2]
+             tokens = tokens[:self.max_seq_len - 2]
+
+         subwords.insert(0, self.tokenizer.cls_token_id)
+         subwords.append(self.tokenizer.sep_token_id)
+
+         tags.insert(0, self.vocab.tags[0].get_stoi()["O"])
+         tags.append(self.vocab.tags[0].get_stoi()["O"])
+
+         tokens.insert(0, unk_token)
+         tokens.append(unk_token)
+
+         return torch.LongTensor(subwords), torch.LongTensor(tags), tokens, len(tokens)
+
+
+ class NestedTagsTransform:
+     def __init__(self, bert_model, vocab, max_seq_len=512):
+         self.tokenizer = BertTokenizer.from_pretrained(bert_model)
+         self.encoder = partial(
+             self.tokenizer.encode,
+             max_length=max_seq_len,
+             truncation=True,
+         )
+         self.max_seq_len = max_seq_len
+         self.vocab = vocab
+
+     def __call__(self, segment):
+         tags, tokens, subwords = list(), list(), list()
+         unk_token = datasets.Token(text="UNK")
+
+         # Encode each token and get its subwords and IDs
+         for token in segment:
+             token.subwords = self.encoder(token.text)[1:-1]
+             subwords += token.subwords
+             tokens += [token] + [unk_token] * (len(token.subwords) - 1)
+
+         # Construct the labels for each tag type
+         # The sequence will have a list of tags for each type
+         # The final tags for a sequence form a matrix of NUM_TAG_TYPES x SEQ_LEN
+         # Example:
+         #   [
+         #       [O, O, B-PERS, I-PERS, O, O, O]
+         #       [B-ORG, I-ORG, O, O, O, O, O]
+         #       [O, O, O, O, O, O, B-GPE]
+         #   ]
+         for vocab in self.vocab.tags[1:]:
+             vocab_tags = "|".join([t for t in vocab.get_itos() if "-" in t])
+             r = re.compile(vocab_tags)
+
+             # This is really messy
+             # For a given token we find a matching tag_name, BUT we might find
+             # multiple matches (i.e. a token can be labeled both B-ORG and I-ORG); in this
+             # case we take only the first tag, since there is no overlap of the same type
+             single_type_tags = [[(list(filter(r.match, token.gold_tag))
+                                   or ["O"])[0]] + ["O"] * (len(token.subwords) - 1)
+                                 for token in segment]
+             single_type_tags = list(itertools.chain(*single_type_tags))
+             tags.append([vocab.get_stoi()[tag] for tag in single_type_tags])
+
+         # Truncate to max_seq_len
+         if len(subwords) > self.max_seq_len - 2:
+             text = " ".join([t.text for t in tokens if t.text != "UNK"])
+
+             subwords = subwords[:self.max_seq_len - 2]
+             tags = [t[:self.max_seq_len - 2] for t in tags]
+             tokens = tokens[:self.max_seq_len - 2]
+
+         # Add a dummy token at the start and end of the sequence
+         tokens.insert(0, unk_token)
+         tokens.append(unk_token)
+
+         # Add CLS and SEP at the start and end of the subwords
+         subwords.insert(0, self.tokenizer.cls_token_id)
+         subwords.append(self.tokenizer.sep_token_id)
+         subwords = torch.LongTensor(subwords)
+
+         # Add "O" tags for the first and last subwords
+         tags = torch.Tensor(tags)
+         tags = torch.column_stack((
+             torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
+             tags,
+             torch.Tensor([vocab.get_stoi()["O"] for vocab in self.vocab.tags[1:]]),
+         )).unsqueeze(0)
+
+         mask = torch.ones_like(tags)
+         return subwords, tags, tokens, mask, len(tokens)
@@ -0,0 +1,20 @@
+ import warnings
+ warnings.filterwarnings("ignore")
+ from sinatools.DataDownload import downloader
+ import os
+ from transformers import BertTokenizer, BertModel
+
+ model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
+ path = downloader.get_appdatadir()
+ model_file_path = os.path.join(path, model_file_name)
+
+ tokenizer_file_name = "bert-base-arabertv02"
+ path = downloader.get_appdatadir()
+ tokenizer_file_path = os.path.join(path, tokenizer_file_name)
+
+ model = BertModel.from_pretrained('{}'.format(model_file_path),
+                                   output_hidden_states=True,
+                                   num_labels=2
+                                   )
+
+ tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))
@@ -0,0 +1,31 @@
+ import torch
+ from . import tokenizer
+ from . import model
+
+ # cosine using average embedding
+ def get_similarity_score(sentence1, sentence2):
+
+     # Tokenize and encode sentences
+     inputs1 = tokenizer(sentence1, return_tensors="pt")
+     inputs2 = tokenizer(sentence2, return_tensors="pt")
+
+     # Extract embeddings
+     with torch.no_grad():
+         outputs1 = model(**inputs1)
+         outputs2 = model(**inputs2)
+
+     embeddings1 = outputs1.last_hidden_state
+     embeddings2 = outputs2.last_hidden_state
+
+     # Mask padding tokens
+     attention_mask1 = inputs1["attention_mask"]
+     attention_mask2 = inputs2["attention_mask"]
+
+     # Average pool across tokens, excluding padding
+     embeddings1_avg = torch.sum(embeddings1 * attention_mask1.unsqueeze(-1), dim=1) / torch.sum(attention_mask1, dim=1, keepdim=True)
+     embeddings2_avg = torch.sum(embeddings2 * attention_mask2.unsqueeze(-1), dim=1) / torch.sum(attention_mask2, dim=1, keepdim=True)
+
+     # Calculate cosine similarity
+     similarity = torch.nn.functional.cosine_similarity(embeddings1_avg, embeddings2_avg)
+
+     return similarity.item()
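
A minimal usage sketch for the new semantic relatedness API. It assumes the model and tokenizer folders named in sinatools/semantic_relatedness/__init__.py have already been downloaded into downloader.get_appdatadir(); the example sentences are illustrative only.

    # Hypothetical usage; requires the AraBERT-based relatedness model files to be present locally.
    from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score

    score = get_similarity_score("ذهب الولد إلى المدرسة", "توجه الطفل إلى المدرسة")
    print(round(score, 3))  # cosine of the two mean-pooled sentence embeddings, in [-1, 1]
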
@@ -0,0 +1,18 @@
+ import pickle
+ from sinatools.DataDownload import downloader
+ import os
+
+ synonyms_level2_dict = {}
+ level2_dict = 'synonyms_level2.pkl'
+ path = downloader.get_appdatadir()
+ file_path = os.path.join(path, level2_dict)
+ with open(file_path, 'rb') as f:
+     synonyms_level2_dict = pickle.load(f, encoding='utf-8')
+
+
+ synonyms_level3_dict = {}
+ #level3_dict = 'synonyms_level3.pkl'
+ #path = downloader.get_appdatadir()
+ #file_path = os.path.join(path, level3_dict)
+ #with open(file_path, 'rb') as f:
+ #    synonyms_level3_dict = pickle.load(f, encoding='utf-8')
@@ -0,0 +1,192 @@
+ from . import synonyms_level2_dict, synonyms_level3_dict
+ from copy import deepcopy
+
+ def dfs(graph, start, end, level):
+     level = level - 2
+     edge = [(start, [])]
+     while edge:
+         state, path = edge.pop()
+         if path and state == end:
+             yield path
+             continue
+         for next_state in graph[state]:
+             if next_state not in path:
+                 edge.append((next_state, path + [next_state]))
+
+         if len(path) > level:
+             break
+
+
+ def find_cycles(level, synset, used_graph):
+     cycles = []
+     source_with_unique_candidates = {}
+     for source in synset:
+         source_with_unique_candidates[source] = set()
+
+         for path in dfs(used_graph, source, source, level):
+             cycle = [source] + path
+             if len(cycle) <= level:
+                 cycles.append(cycle)
+                 source_with_unique_candidates[source] = set(source_with_unique_candidates[source].union(set(cycle)))
+     return cycles, source_with_unique_candidates
+
+
+
+ def get_list_of_unique_synonems(synset, cycles, unique_synonyms, synonems_with_unique_candidates):
+     list_of_unique_synonyms = []
+     for i in range(0, len(unique_synonyms)):
+         synonym = unique_synonyms[i]
+         count = 0
+         syn_count = 0
+         for cycle in cycles:
+             if synonym in cycle:
+                 count = count + 1
+
+         for v in synonems_with_unique_candidates:
+             tmp = list(synonems_with_unique_candidates[v])
+             if synonym in tmp:
+                 syn_count = syn_count + 1
+
+
+         list_of_unique_synonyms.append([synonym, count, syn_count])
+     return list_of_unique_synonyms
+
+
+ def find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cycles, length_of_synset, synset):
+     list_of_synon_with_fuzzy_value = []
+
+     if level == 4:
+         theta1 = 0.5
+         theta2 = 0.5
+     elif level == 3:
+         theta1 = 0.4
+         theta2 = 0.6
+     elif level == 2:
+         theta1 = 0.4
+         theta2 = 0.6
+     else:
+         theta1 = 0
+         theta2 = 0
+
+     for unique_syn in list_of_unique_synonyms:
+         if unique_syn[0] not in synset:
+             equ = (unique_syn[1] / number_of_cycles) * theta1 + (unique_syn[2] / length_of_synset) * theta2
+             list_of_synon_with_fuzzy_value.append([unique_syn[0], equ])
+     return list_of_synon_with_fuzzy_value
+
+
+ def extend_synonyms(synset, level):
+
+     used_graph = {}
+     if level == 2:
+         used_graph = synonyms_level2_dict
+     elif level == 3:
+         used_graph = synonyms_level3_dict
+     else:
+         return "Please choose the correct level"
+
+     cycles = []
+     nodes = []
+     synonems_with_unique_candidates = {}
+     number_of_cycles = 0
+     final_synset = []
+     if synset != None:
+         synset = synset.split("|")
+         for syn in synset:
+             syn = syn.strip()
+             if syn in list(used_graph.keys()):
+                 synonems_with_unique_candidates[syn] = set()
+                 final_synset.append(syn)
+
+                 cycles_list = used_graph[syn]
+                 number_of_cycles = number_of_cycles + len(cycles_list)
+                 for cycle in cycles_list:
+                     cycles.append(cycle)
+                     for c in cycle:
+                         nodes.append(c)
+                         synonems_with_unique_candidates[syn] = set(synonems_with_unique_candidates[syn].union(set([c])))
+
+     unique_synonyms = list(set(nodes))
+     list_of_unique_synonyms = get_list_of_unique_synonems(final_synset, cycles, unique_synonyms, synonems_with_unique_candidates)
+
+     length_of_synset = len(final_synset)
+
+     list_of_synon_with_fuzzy_value = find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cycles, length_of_synset, final_synset)
+
+     list_of_synon_with_fuzzy_value.sort(key=lambda row: (row[1], row[0]), reverse=True)
+
+     return list_of_synon_with_fuzzy_value
+
+ def evaluate_synonyms(synset, level):
+
+     used_graph = {}
+     if level == 2:
+         used_graph = synonyms_level2_dict
+     elif level == 3:
+         used_graph = synonyms_level3_dict
+     else:
+         return "Please choose the correct level"
+
+     cycles = []
+     synonems_with_unique_candidates = {}
+     number_of_cycles = 0
+     final_synset = []
+
+     if synset != None:
+         synset = synset.split("|")
+
+         for syn in synset:
+             syn = syn.strip()
+             if syn in list(used_graph.keys()):
+                 synonems_with_unique_candidates[syn] = set()
+                 final_synset.append(syn)
+
+                 cycles_list = used_graph[syn]
+                 for cycle in cycles_list:
+                     cycles.append(cycle)
+                     for c in cycle:
+                         synonems_with_unique_candidates[syn] = set(synonems_with_unique_candidates[syn].union(set([c])))
+
+     if len(final_synset) > 1:
+         fuzzy_result = []
+         for syn in final_synset:
+             included = False
+             tmp_synset = deepcopy(final_synset)
+             tmp_synset.remove(syn)
+
+             tmp_cycles = deepcopy(cycles)
+             filtered_cycle = [x for x in tmp_cycles if x[0] != syn]
+
+             nodes = []
+             for tmp_cycle in filtered_cycle:
+                 for c in tmp_cycle:
+                     nodes.append(c)
+
+             tmp_synonems_with_unique_candidates = deepcopy(synonems_with_unique_candidates)
+             del tmp_synonems_with_unique_candidates[syn]
+
+             unique_synonyms = list(set(nodes))
+
+             tmp_unique_synonyms = deepcopy(unique_synonyms)
+
+             number_of_cycles = len(filtered_cycle)
+
+             length_of_synset = len(tmp_synset)
+
+             list_of_unique_synonyms = get_list_of_unique_synonems(tmp_synset, filtered_cycle, tmp_unique_synonyms, tmp_synonems_with_unique_candidates)
+
+             list_of_synon_with_fuzzy_value = find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cycles, length_of_synset, tmp_synset)
+
+             for x in list_of_synon_with_fuzzy_value:
+                 if x[0] == syn:
+                     fuzzy_result.append(x)
+                     included = True
+
+             if included == False:
+                 fuzzy_result.append([syn, 0])
+             else:
+                 included = False
+
+         fuzzy_result.sort(key=lambda row: (row[1], row[0]), reverse=True)
+
+         return fuzzy_result
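
A usage sketch for the synonym functions above, assuming synonyms_level2.pkl is already in downloader.get_appdatadir() (sinatools/synonyms/__init__.py loads it at import time; the level-3 pickle is commented out in this release, so only level=2 is usable). Both calls return a list of [synonym, fuzzy_value] pairs sorted by descending score; the example synset is illustrative.

    # Hypothetical usage of the level-2 synonym graph.
    from sinatools.synonyms.synonyms_generator import extend_synonyms, evaluate_synonyms

    # Propose new synonyms for a "|"-separated synset, each scored with a fuzzy membership value.
    candidates = extend_synonyms("سعادة|فرح", 2)

    # Score how strongly each existing member is supported by the rest of the synset.
    member_scores = evaluate_synonyms("سعادة|فرح|سرور", 2)
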
@@ -0,0 +1,110 @@
+
+ import pandas as pd
+ import re
+ import math
+ from collections import Counter
+ from sinatools.utils.parser import arStrip
+ from sinatools.utils.parser import remove_punctuation
+
+ def validator(sentence, max_tokens=500):
+     tokens = len(sentence.split())
+     if tokens > max_tokens:
+         return f"Invalid: Sentence has {tokens} tokens, which exceeds the maximum allowed ({max_tokens})."
+     else:
+         return "Valid"
+
+
+ def removal(csv_file, columnName, finalFileName, deletedFileName, similarityThreshold=0.8):
+     # Read CSV file
+     try:
+         df = pd.read_csv(csv_file)
+     except FileNotFoundError:
+         return "Error: CSV file not found."
+
+     # Check if the specified column exists
+     if columnName not in df.columns:
+         return f"Error: Column '{columnName}' does not exist in the CSV file."
+
+     # Create an empty DataFrame to store the final results
+     finalDf = pd.DataFrame(columns=df.columns)
+
+     # Create temporary DataFrames for deleted sentences
+     deletedSentencesDf = pd.DataFrame(columns=df.columns)
+
+     # Iterate through each row in the DataFrame
+     for index, row in df.iterrows():
+         currentSentence = str(row[columnName])
+
+         # Call the validator function for each sentence
+         #validationResult = validator(currentSentence)
+         validationResult = "Valid"
+
+         if validationResult == "Valid":
+             # Check cosine similarity with all sentences in the final DataFrame
+             isDuplicate = False
+             DublicatedRow = ""
+             for _, finalRow in finalDf.iterrows():
+                 finalSentence = str(finalRow[columnName])
+                 currentSentence = remove_punctuation(arStrip(currentSentence, diacs=False, smallDiacs=False, shaddah=False, digit=True, alif=False, specialChars=True))
+                 finalSentence = remove_punctuation(arStrip(finalSentence, diacs=False, smallDiacs=False, shaddah=False, digit=True, alif=False, specialChars=True))
+                 if currentSentence != "" and finalSentence != "":
+                     similarity = calculateCosineSimilarity(currentSentence, finalSentence)
+
+                     if similarity >= similarityThreshold:
+                         isDuplicate = True
+                         DublicatedRow = finalSentence
+                         print("DublicatedRow : ", DublicatedRow)
+                         break
+
+             if not isDuplicate:
+                 # If not a duplicate, add the sentence to the final DataFrame
+                 finalDf = finalDf.append(row, ignore_index=True)
+             else:
+                 # If a duplicate, add the sentence to the deleted sentences DataFrame
+                 #deletedSentencesDf = deletedSentencesDf.append(row, ignore_index=True)
+                 deletedSentencesDf = deletedSentencesDf.append({**row, 'Dublicated': DublicatedRow}, ignore_index=True)
+         else:
+             # If validation fails, return the error message
+             return validationResult
+
+     # Save the final results to CSV files
+     finalDf.to_csv(finalFileName, index=False)
+     deletedSentencesDf.to_csv(deletedFileName, index=False)
+
+
+ def calculateCosineSimilarity(sentence1, sentence2):
+     vector1 = textToVector(sentence1)
+     vector2 = textToVector(sentence2)
+     cosine = getCosine(vector1, vector2)
+
+     return cosine
+
+
+ def getCosine(vec1, vec2):
+     intersection = set(vec1.keys()) & set(vec2.keys())
+     numerator = sum([vec1[x] * vec2[x] for x in intersection])
+
+     sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
+     sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
+     denominator = math.sqrt(sum1) * math.sqrt(sum2)
+
+     if not denominator:
+         return 0.0
+     else:
+         return float(numerator) / denominator
+
+
+ def textToVector(text):
+     WORD = re.compile(r"\w+")
+     words = WORD.findall(text)
+     return Counter(words)
+
+
+ # columnName = "Message"
+ # csvFile = "Arabic-Oct7-Feb12.csv"
+ # similarityThreshold = 0.8
+ # finalFileName = "Arabic-Oct7-Feb12FINAL.csv"
+ # deletedFileName = "Arabic-Oct7-Feb12DeletedSent.csv"
+
+ # result = removal(csvFile, columnName, finalFileName, deletedFileName, similarityThreshold)
+ # print(result)
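
The duplicate detector relies on a simple bag-of-words cosine, so its behaviour is easy to check in isolation; a small worked example (with made-up English tokens) follows.

    from sinatools.utils.text_dublication_detector import calculateCosineSimilarity

    # The two sentences share the tokens {"NLP", "is"}, so the score is
    # 2 / (sqrt(3) * sqrt(3)) ≈ 0.667.
    print(calculateCosineSimilarity("NLP is fun", "NLP is great"))
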
@@ -0,0 +1,11 @@
+ from sinatools.wsd import settings
+ import pickle
+ from sinatools.DataDownload import downloader
+ import os
+
+ settings.glosses_dic = {}
+ filename = 'glosses_dic.pickle'
+ path = downloader.get_appdatadir()
+ file_path = os.path.join(path, filename)
+ with open(file_path, 'rb') as f:
+     settings.glosses_dic = pickle.load(f)