SinaTools 0.1.26__py2.py3-none-any.whl → 0.1.28__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.28.dist-info/METADATA +64 -0
- {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/RECORD +33 -30
- sinatools/CLI/DataDownload/download_files.py +5 -8
- sinatools/CLI/morphology/ALMA_multi_word.py +0 -34
- sinatools/CLI/morphology/morph_analyzer.py +1 -1
- sinatools/CLI/ner/corpus_entity_extractor.py +17 -4
- sinatools/CLI/ner/entity_extractor.py +8 -8
- sinatools/CLI/utils/implication.py +3 -3
- sinatools/CLI/utils/jaccard.py +2 -2
- sinatools/DataDownload/downloader.py +2 -2
- sinatools/VERSION +1 -1
- sinatools/morphology/morph_analyzer.py +44 -45
- sinatools/ner/__init__.py +6 -1
- sinatools/ner/entity_extractor.py +42 -1
- sinatools/ner/relation_extractor.py +201 -0
- sinatools/semantic_relatedness/compute_relatedness.py +22 -0
- sinatools/synonyms/__init__.py +2 -2
- sinatools/synonyms/synonyms_generator.py +45 -1
- sinatools/utils/jaccard.py +1 -1
- sinatools/utils/parser.py +12 -15
- sinatools/utils/similarity.py +240 -0
- sinatools/utils/text_dublication_detector.py +22 -0
- sinatools/utils/text_transliteration.py +1 -1
- sinatools/utils/tokenizer.py +1 -1
- sinatools/utils/word_compare.py +667 -0
- sinatools/wsd/__init__.py +1 -1
- sinatools/wsd/disambiguator.py +20 -19
- SinaTools-0.1.26.dist-info/METADATA +0 -34
- {SinaTools-0.1.26.data → SinaTools-0.1.28.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.26.dist-info → SinaTools-0.1.28.dist-info}/top_level.txt +0 -0
sinatools/ner/relation_extractor.py
ADDED
@@ -0,0 +1,201 @@
+import torch
+import json
+from urllib.request import Request, urlopen
+from sinatools.ner.entity_extractor import extract
+from . import pipe
+
+
+# ============================ Extract entities and their types ========================
+def jsons_to_list_of_lists(json_list):
+    return [[d['token'], d['tags']] for d in json_list]
+
+def entities_and_types(sentence):
+    output_list = jsons_to_list_of_lists(extract(sentence))
+    json_short = distill_entities(output_list)
+
+    entities = {}
+    for entity in json_short:
+        name = entity[0]
+        entity_type = entity[1]
+        entities[name] = entity_type
+
+    return entities
+
+def distill_entities(entities):
+    # The list that collects the output we need
+    list_output = list()
+
+    # Sort the tags and save the result to temp_entities
+    temp_entities = sortTags(entities)
+
+    # This list helps us build the output
+    temp_list = list()
+
+    # Initialize the temp_list
+    temp_list.append(["", "", 0, 0])
+    word_position = 0
+
+    # For each entity, convert IOB tags to a distilled list.
+    for entity in temp_entities:
+        # Tag counter for this entity
+        counter_tag = 0
+        # For each tag
+        for tag in str(entity[1]).split():
+            # If the tag counter is greater than or equal to the length of temp_list, append an empty value to temp_list
+            if counter_tag >= len(temp_list):
+                temp_list.append(["", "", 0, 0])
+
+            # If the tag equals O and the word position of this tag is not zero, add all
+            # non-empty elements of temp_list to the output list
+            if "O" == tag and word_position != 0:
+                for j in range(0, len(temp_list)):
+                    if temp_list[j][1] != "":
+                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                    temp_list[j][0] = ""
+                    temp_list[j][1] = ""
+                    temp_list[j][2] = word_position
+                    temp_list[j][3] = word_position
+            # If the tag is not O, splitting it by '-' gives two parts, and the first part
+            # of the split is B
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
+                # If the temp_list entry at this counter is not empty, append it to the output list, then
+                # reinitialize it with the new string and tag
+                if temp_list[counter_tag][1] != "":
+                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
+                temp_list[counter_tag][0] = str(entity[0]) + " "
+                temp_list[counter_tag][1] = str(tag).split("-")[1]
+                temp_list[counter_tag][2] = word_position
+                temp_list[counter_tag][3] = word_position
+
+            # If the tag is not O, splitting it by '-' gives two parts, and the first part
+            # of the split is I
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
+                # For each entry of temp_list, check whether the tag at this counter matches this tag;
+                # if so continue it, otherwise save it to the output list and check the next one
+                for j in range(counter_tag, len(temp_list)):
+                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
+                        temp_list[j][0] += str(entity[0]) + " "
+                        temp_list[j][3] += 1
+                        break
+                    else:
+                        if temp_list[j][1] != "":
+                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                        temp_list[j][0] = ""
+                        temp_list[j][1] = ""
+                        temp_list[j][2] = word_position
+                        temp_list[j][3] = word_position
+            counter_tag += 1
+        word_position += 1
+    # At the end of the previous loop there may still be values left
+    # in temp_list; save them to the output list
+    for j in range(0, len(temp_list)):
+        if temp_list[j][1] != "":
+            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+    return sorted(list_output, key=lambda x: (x[2]))
+
+def sortTags(entities):
+    temp_entities = entities
+    temp_counter = 0
+    # For each entity, this loop sorts its tags: first it checks whether the
+    # previous tags have the same count as this tag, then it sorts the tags and checks that they are correct
+    for entity in temp_entities:
+        tags = entity[1].split()
+        for tag in tags:
+            # If the counter is not 0, continue
+            if temp_counter != 0:
+                # If this tag starts with I-, count how many matching tags are in this tag list and
+                # how many are in the previous tag list
+                if "I-" == tag[0:2]:
+                    counter_of_this_tag = 0
+                    counter_of_previous_tag = 0
+                    for word in tags:
+                        if tag.split("-")[1] in word:
+                            counter_of_this_tag += 1
+                    for word in temp_entities[temp_counter-1][1].split():
+                        if tag.split("-")[1] in word:
+                            counter_of_previous_tag += 1
+                    # If the count in the previous tags is bigger than the count in this tag list,
+                    # we need to add an I- tag to this tag list
+                    if counter_of_previous_tag > counter_of_this_tag:
+                        tags.append("I-"+tag.split("-")[1])
+        # Sort the tags
+        tags.sort()
+        # Reverse the tags because the list should begin with I
+        tags.reverse()
+        # If the counter is not 0, continue
+        if temp_counter != 0:
+            this_tags = tags
+            previous_tags = temp_entities[temp_counter - 1][1].split()
+            sorted_tags = list()
+
+            # If neither this tag list nor the previous tag list contains O, continue;
+            # otherwise ignore this tag list
+            if "O" not in this_tags and "O" not in previous_tags:
+                index = 0
+                # For each previous tag, sort this tag list by the previous tags; I- tags are matched, B- tags are skipped
+                for i in previous_tags:
+                    j = 0
+                    while this_tags and j < len(this_tags):
+                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
+                            sorted_tags.insert(index, this_tags.pop(j))
+                            break
+                        elif this_tags[j][0:2] == "B-":
+                            break
+                        j += 1
+                    index += 1
+                sorted_tags += this_tags
+                tags = sorted_tags
+        str_tag = " "
+        str_tag = str_tag.join(tags)
+        str_tag = str_tag.strip()
+        temp_entities[temp_counter][1] = str_tag
+        temp_counter += 1
+    return temp_entities
+
+# ============= Prepare Templates and Categorize Extracted Entities ================
+temp03 = {'location': 'مكان حدوث', 'agent': 'أحد المتأثرين في', 'happened at': 'تاريخ حدوث'}
+categories = {
+    'agent': ['PERS', 'NORP', 'OCC', 'ORG'],
+    'location': ['LOC', 'FAC', 'GPE'],
+    'happened at': ['DATE', 'TIME']
+}
+
+def get_entity_category(entity_type, categories):
+    for category, types in categories.items():
+        if entity_type in types:
+            return category
+    return None
+
+
+# ============ Extract entities, their types and categorize them ===============
+def relation_extraction(sentence):
+    #test_sentence="صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
+    entities = entities_and_types(sentence)
+
+    event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type == 'EVENT']
+    arg_event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type != 'EVENT']
+
+    output_list = []
+
+    for i in event_indices:
+        event_entity = list(entities.keys())[i]
+        for j in arg_event_indices:
+            arg_name = list(entities.keys())[j]
+            arg_type = entities[arg_name]
+            category = get_entity_category(arg_type, categories)
+
+            if category in temp03:
+                relation_sentence = f"[CLS] {sentence} [SEP] {event_entity} {temp03[category]} {arg_name}"
+                predicted_relation = pipe(relation_sentence)
+                score = predicted_relation[0][0]['score']
+                if score > 0.50:
+                    #print(f"Event:{event_entity} Relation:{category} Argument:{arg_name}\n")
+                    #output_list.append([{event_entity} ,{category}, {arg_name}])
+                    output_list.append(f"Event:{event_entity}, Relation:{category}, Argument:{arg_name}")
+
+                else:
+                    #print(f"Event:{event_entity} Relation:No relation Argument:{arg_name}\n")
+                    #output_list.append([{event_entity} ,'No relation', {arg_name}])
+                    output_list.append(f"Event:{event_entity}, Relation:No relation, Argument:{arg_name}")
+
+    return output_list
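The new module's entry point is relation_extraction, which ships without a usage example. A minimal sketch, assuming the required NER and relation models have already been downloaded and that the `pipe` classifier (initialized in sinatools/ner/__init__.py, not shown in this diff) is ready:

    from sinatools.ner.relation_extractor import relation_extraction

    # Any Arabic sentence containing an EVENT entity; this one is the
    # commented-out test sentence from the module itself.
    sentence = "صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
    for relation in relation_extraction(sentence):
        print(relation)
    # Each item is a string of the form:
    # Event:<event>, Relation:<agent|location|happened at>, Argument:<entity>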
sinatools/semantic_relatedness/compute_relatedness.py
CHANGED
@@ -4,6 +4,28 @@ from . import model
 
 #cosine using average embedding
 def get_similarity_score(sentence1, sentence2):
+    """
+    Computes the degree of association between two sentences across various dimensions: meaning, underlying concepts, domain-specificity, topic overlap, and viewpoint alignment. This method is described and implemented in the accompanying article.
+
+    Args:
+        sentence1 (:obj:`str`) – The Arabic sentence whose semantic relatedness to the second sentence is to be measured.
+        sentence2 (:obj:`str`) – The Arabic sentence whose semantic relatedness to the first sentence is to be measured.
+
+    Returns:
+        :obj:`float`: A float representing the degree of relatedness between the two provided sentences.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score
+
+        sentence1 = "تبلغ سرعة دوران الأرض حول الشمس حوالي 110 كيلومتر في الساعة."
+        sentence2 = "تدور الأرض حول محورها بسرعة تصل تقريبا 1670 كيلومتر في الساعة."
+        get_similarity_score(sentence1, sentence2)
+        Score = 0.90
+    """
 
     # Tokenize and encode sentences
     inputs1 = tokenizer(sentence1, return_tensors="pt")
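The hunk is cut off just after the first sentence is tokenized. The `#cosine using average embedding` comment points at the usual mean-pooling approach; a sketch of that standard pattern, assuming `tokenizer` and `model` are the Hugging Face objects imported at the top of the module (the exact pooling inside SinaTools may differ):

    import torch

    def average_embedding_cosine(tokenizer, model, sentence1, sentence2):
        with torch.no_grad():
            # Average the token embeddings of each sentence into one vector.
            emb1 = model(**tokenizer(sentence1, return_tensors="pt")).last_hidden_state.mean(dim=1)
            emb2 = model(**tokenizer(sentence2, return_tensors="pt")).last_hidden_state.mean(dim=1)
        # Cosine similarity of the two averaged vectors.
        return torch.nn.functional.cosine_similarity(emb1, emb2).item()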
sinatools/synonyms/__init__.py
CHANGED
@@ -3,7 +3,7 @@ from sinatools.DataDownload import downloader
 import os
 
 synonyms_level2_dict = {}
-level2_dict = '
+level2_dict = 'graph_l2.pkl'
 path = downloader.get_appdatadir()
 file_path = os.path.join(path, level2_dict)
 with open(file_path, 'rb') as f:
@@ -11,7 +11,7 @@ with open(file_path, 'rb') as f:
 
 
 synonyms_level3_dict = {}
-level3_dict = '
+level3_dict = 'graph_l3.pkl'
 path = downloader.get_appdatadir()
 file_path = os.path.join(path, level3_dict)
 with open(file_path, 'rb') as f:
sinatools/synonyms/synonyms_generator.py
CHANGED
@@ -76,7 +76,28 @@ def find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cy
 
 
 def extend_synonyms(synset, level):
-
+    """
+    This method receives a set of one or more synonyms and a level number, then extends the set with additional synonyms. The more synonyms in the input, the more accurate the results. Each synonym in the output is assigned a fuzzy value indicating how likely it is to be a synonym. You can try the demo online; read the article for more details.
+
+    Args:
+        synset (:obj:`str`) – A set of initial synonyms to be extended (a string of synonyms separated by |).
+        level (:obj:`int`) – The level number indicates the depth of the synonym graph that the method should explore. The level can be 2 or 3; the 3rd level is richer, but the 2nd is faster.
+
+    Returns:
+        :obj:`list`: A list of lists, where each inner list contains:
+            synonym: a synonym related to the given synset (set of synonyms).
+            fuzzy_value: the synonym's strength as a percentage out of 100.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.synonyms.synonyms_generator import extend_synonyms
+        extend_synonyms('ممر | طريق', 2)
+        [["مَسْلَك","61%"],["سبيل","61%"],["وَجْه","30%"],["نَهْج", "30%"],["نَمَطٌ","30%"],["مِنْهَج","30%"],["مِنهاج", "30%"],["مَوْر","30%"],["مَسَار","30%"],["مَرصَد", "30%"],["مَذْهَبٌ","30%"],["مَدْرَج","30%"],["مَجَاز","30%"]]
+
+    """
     used_graph = {}
     if level == 2:
         used_graph = synonyms_level2_dict
@@ -119,6 +140,29 @@ def extend_synonyms(synset, level):
 
 def evaluate_synonyms(synset, level):
 
+    """
+    This method receives a set of synonyms and a level number, then evaluates how much each of the input synonyms really is a synonym (i.e., how much it belongs to the set). You can try the demo online.
+
+    Args:
+        synset (:obj:`str`) – A set of synonyms to be evaluated (a string of synonyms separated by |).
+        level (:obj:`int`) – The level number indicating the depth of the synonym graph that the method will explore, which can be 2 or 3.
+
+    Returns:
+        :obj:`list`: A list of lists, where each inner list contains:
+            synonym: a synonym related to the given synset (set of synonyms).
+            fuzzy_value: the synonym's strength as a percentage out of 100.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.synonyms.synonyms_generator import evaluate_synonyms
+
+        evaluate_synonyms('ممر | طريق | مَسْلَك | سبيل', 2)
+        [["مَسْلَك","61%"],["سبيل","60%"],["طريق","40%"],["ممر", "40%"]]
+    """
+
     used_graph = {}
     if level == 2:
         used_graph = synonyms_level2_dict
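Both methods return fuzzy values as percentage strings such as "61%", so a caller who wants only strong candidates needs a small post-processing step. A sketch under that assumption (the threshold of 50 is an arbitrary illustration, not a library default):

    from sinatools.synonyms.synonyms_generator import extend_synonyms

    def strong_synonyms(synset, level=2, threshold=50):
        # Keep only synonyms whose fuzzy value meets the threshold.
        return [syn for syn, fuzzy in extend_synonyms(synset, level)
                if int(fuzzy.rstrip('%')) >= threshold]

    print(strong_synonyms('ممر | طريق'))  # e.g. ['مَسْلَك', 'سبيل']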
sinatools/utils/jaccard.py
CHANGED
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from sinatools.utils.parser import arStrip
-from sinatools.utils.
+from sinatools.utils.word_compare import Implication
 import argparse
 
 def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
sinatools/utils/parser.py
CHANGED
@@ -4,16 +4,16 @@ import argparse
 def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, alif=True , special_chars=True ):
 
     """
-    This method
+    This method optionally removes Arabic diacritics, small diacritics, shaddah, Latin and Arabic digits, special characters, extra spaces, underscores, and Arabic tatweel from the input text, and unifies alif variants.
 
     Args:
         text (:obj:`str`): Arabic text to be processed.
-        diacs (:obj:`bool`): flag to remove Arabic diacretics [
-        small_diacs (:obj:`bool`): flag to remove small
+        diacs (:obj:`bool`): flag to remove these 7 Arabic diacritics [ ٍ ِ ْ ٌ ُ َ ً] (default is True).
+        small_diacs (:obj:`bool`): flag to remove all Quranic annotation signs in the range [06D6-06ED], in addition to small alif (default is True).
         shaddah (:obj:`bool`): flag to remove shaddah (default is True).
         digit (:obj:`bool`): flag to remove Latin and Arabic digits (default is True).
-        alif (:obj:`bool`): flag to unify alif (default is True).
-        special_chars (:obj:`bool`): flag to remove special characters (default is True).
+        alif (:obj:`bool`): flag to unify alif: replaces [ٱ أ إ آ] with [ا] (default is True).
+        special_chars (:obj:`bool`): flag to remove these special characters [?؟!@#$%] (default is True).
 
     Returns:
         :obj:`str`: stripped text.
@@ -30,10 +30,10 @@ def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, al
     # output
     الجو جميل
 
-    output =
+    output = parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ', True, True, True, True, False, False )
     print(output)
     #output
-
+    ألم یأن للذین ءامنوا أن تخشع قلوبهم لذكر ٱلله وما نزل من ٱلحق ولا یكونوا كٱلذین أوتوا ٱلكتب من قبل فطال علیهم ٱلأمد فقست قلوبهم وكثیر منهم فسقون
     """
     try:
         if text: # if the input string is not empty do the following
@@ -67,13 +67,13 @@ def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, al
 
 def remove_punctuation(text):
     """
-    Removes punctuation marks from the text.
+    Removes these Arabic and English punctuation marks from the text: [! " # $ % & ' ( ) * + , - . / : ; > = < ? @ [ \ ] ^ _ ` { | } ~ ، ؛ ؞ ؟ ـ ٓ ٬ ٪ ٫ ٭ ۔].
 
     Args:
         text (:obj:`str`): The input text.
 
     Returns:
-
+        :obj:`str`
 
     **Example:**
 
@@ -109,15 +109,12 @@ def remove_punctuation(text):
 
 def remove_latin(text):
     """
-    This method removes all Latin
+    This method removes all Latin letters from the input text.
 
-
+    Parameters:
         text (:obj:`str`): The input text.
-
     Returns:
-
-    Note:
-        If an error occurs during processing, the original text is returned.
+        :obj:`str`
     **Example:**
 
     .. highlight:: python
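The last hunk is truncated before remove_latin's example. Based on the documented behaviour (stripping all Latin letters and leaving everything else), a usage would look roughly like:

    from sinatools.utils.parser import remove_latin

    print(remove_latin("abcمرحباxyz"))  # expected output: مرحبا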
sinatools/utils/similarity.py
ADDED
@@ -0,0 +1,240 @@
+# -*- coding: utf-8 -*-
+
+from sinatools.utils.parser import arStrip
+from sinatools.utils.word_compare import Implication
+import argparse
+
+def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
+    if ignore_all_diacritics_but_not_shadda:
+        word = arStrip(word, True, True, False, False, False, False)
+
+    if ignore_shadda_diacritic:
+        word = arStrip(word, False, False, True, False, False, False)
+
+    return word
+
+
+def get_preferred_word(word1, word2):
+    implication = Implication(word1, word2)
+
+    direction = implication.get_direction()
+
+    if direction in (0, 2):
+        return word1
+
+    elif direction == 1:
+        return word2
+
+    elif direction == 3:
+        if not word1.endswith("َ") and not word1.endswith("ُ"):
+            return word2
+        return word1
+
+
+def get_non_preferred_word(word1, word2):
+
+    implication = Implication(word1, word2)
+    if implication.get_distance() < 15:
+        direction = implication.get_direction()
+        if direction == 0 or direction == 1:
+            return word1
+        elif direction == 2:
+            return word2
+        elif direction == 3:
+            if not word1.endswith("َ") and not word1.endswith("ُ"):
+                return word1
+            return word2
+    return "#"
+
+def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
+    """
+    Computes the intersection of two sets of Arabic words, accounting for differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
+
+    Args:
+        list1 (:obj:`list`): The first list.
+        list2 (:obj:`list`): The second list.
+        ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
+        ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
+
+    Returns:
+        :obj:`list`: The intersection of the two lists, ignoring diacritics if the flags are true.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.utils.similarity import get_intersection
+        list1 = ["كتب","فَعل","فَعَلَ"]
+        list2 = ["كتب","فَعّل"]
+        print(get_intersection(list1, list2, False, True))
+        #output: ["كتب" ,"فعل"]
+    """
+    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
+    list1 = [str(i.strip()) for i in list1]
+
+    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
+    list2 = [str(i.strip()) for i in list2]
+
+    intersection_list = []
+
+    for list1_word in list1:
+        for list2_word in list2:
+            word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+            word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+
+            implication = Implication(word1, word2)
+            if implication.get_direction() >= 0 and implication.get_distance() < 15:
+                intersection_list.append(get_preferred_word(word1, word2))
+
+    i = 0
+    while i < len(intersection_list):
+        j = i + 1
+        while j < len(intersection_list):
+            non_preferred_word = get_non_preferred_word(intersection_list[i], intersection_list[j])
+            if non_preferred_word != "#":
+                intersection_list.remove(non_preferred_word)
+            j += 1
+        i += 1
+
+    return intersection_list
+
+
+
+def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
+    """
+    Computes the union of two sets of Arabic words, accounting for differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
+
+    Args:
+        list1 (:obj:`list`): The first list.
+        list2 (:obj:`list`): The second list.
+        ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
+        ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
+
+    Returns:
+        :obj:`list`: The union of the two lists, ignoring diacritics if the flags are true.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.utils.similarity import get_union
+        list1 = ["كتب","فَعل","فَعَلَ"]
+        list2 = ["كتب","فَعّل"]
+        print(get_union(list1, list2, False, True))
+        #output: ["كتب" ,"فَعل" ,"فَعَلَ"]
+    """
+    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
+
+    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
+
+    union_list = []
+
+    for list1_word in list1:
+        word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+        union_list.append(word1)
+
+    for list2_word in list2:
+        word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+        union_list.append(word2)
+
+    i = 0
+    while i < len(union_list):
+        j = i + 1
+        while j < len(union_list):
+            non_preferred_word = get_non_preferred_word(union_list[i], union_list[j])
+            if (non_preferred_word != "#"):
+                union_list.remove(non_preferred_word)
+            j = j + 1
+        i = i + 1
+
+    return union_list
+
+
+
+def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
+    """
+    Calculates the Jaccard similarity coefficient between two lists of Arabic words, accounting for differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
+
+    Args:
+        list1 (:obj:`list`): The first list.
+        list2 (:obj:`list`): The second list.
+        ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
+        ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
+
+    Returns:
+        :obj:`float`: The Jaccard similarity coefficient between the two lists, ignoring diacritics if the flags are true.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.utils.similarity import get_jaccard_similarity
+        list1 = ["كتب","فَعل","فَعَلَ"]
+        list2 = ["كتب","فَعّل"]
+        print(get_jaccard_similarity(list1, list2, True, True))
+        #output: 0.67
+    """
+
+    intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+
+    union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+
+    return float(len(intersection_list)) / float(len(union_list))
+
+def get_jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
+    """
+    Calculates and returns Jaccard values (union, intersection, or Jaccard similarity) between two lists of Arabic words, accounting for differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
+
+    Args:
+        delimiter (:obj:`str`): The delimiter used to split the input strings.
+        str1 (:obj:`str`): The first input string to compare.
+        str2 (:obj:`str`): The second input string to compare.
+        selection (:obj:`str`) – The desired operation to perform on the two sets of strings. Must be one of intersection, union, jaccardSimilarity, or jaccardAll.
+        ignoreAllDiacriticsButNotShadda (:obj:`bool`) – If True, ignore all diacritics except for the shadda diacritic (default is True).
+        ignoreShaddaDiacritic (:obj:`bool`) – If True, ignore the shadda diacritic (default is True).
+
+    Returns:
+        The intersection, union, or Jaccard similarity between the two lists of Arabic words, or all three, depending on the selection parameter.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.utils.similarity import get_jaccard
+        str1 = "فَعَلَ | فَعل"
+        str2 = "فَعّل"
+        print(get_jaccard("|", str1, str2, "jaccardAll", True, True))
+        #output: ['intersection:', ['فعل'], 'union:', ['فعل', 'فعل'], 'similarity:', 0.5]
+    """
+    try:
+        list1 = str1.split(delimiter)
+        list2 = str2.split(delimiter)
+
+        if selection == "intersection":
+            intersection = get_intersection(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            return intersection
+        elif selection == "union":
+            union = get_union(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            return union
+        elif selection == "jaccardSimilarity":
+            similarity = get_jaccard_similarity(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            return similarity
+        elif selection == "jaccardAll":
+            intersection = get_intersection(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            union = get_union(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            similarity = get_jaccard_similarity(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            output_list = ["intersection:", intersection, "union:", union, "similarity:", similarity]
+            return output_list
+        else:
+            return 'Invalid selection option'
+
+    except AttributeError as ae:
+        print(f"Attribute error occurred: {str(ae)}")
+        return 'Invalid input type'
+    except Exception as e:
+        print(f"Error occurred: {str(e)}")
+        return 'An error has occurred'
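The helper normalize_word at the top of the new module carries no docstring; restating its two arStrip calls as a usage sketch (the flag semantics are the ones documented in parser.arStrip):

    from sinatools.utils.similarity import normalize_word

    word = "فَعّلَ"
    # Strip all diacritics but keep the shadda:
    print(normalize_word(word, True, False))
    # Strip the shadda as well:
    print(normalize_word(word, True, True))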
sinatools/utils/text_dublication_detector.py
CHANGED
@@ -15,6 +15,28 @@ def validator(sentence, max_tokens=500):
 
 
 def removal(csv_file, columnName, finalFileName, deletedFileName, similarityThreshold=0.8):
+    """
+    This method identifies duplicate text in a given corpus. It processes a CSV file of sentences and removes duplicate sentences based on a specified threshold, using cosine similarity to measure the similarity between words and sentences. The method saves the filtered results and the identified duplicates to separate files.
+
+    Args:
+        csv_file (:obj:`str`) – The CSV file containing the Arabic text to be cleaned.
+        columnName (:obj:`str`) – The name of the column containing the text to be checked for duplicate removal.
+        finalFileName (:obj:`str`) – The name of the CSV file that will contain the data after duplicate removal.
+        deletedFileName (:obj:`str`) – The name of the file that will contain all the duplicate records that were deleted.
+        similarityThreshold (:obj:`float`) – A floating-point number; the default value is 0.8, indicating the similarity percentage the function should use when deleting duplicates from the text column.
+
+    Returns:
+        CSV files.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.utils.text_dublication_detector import removal
+        removal("/path/to/csv/file1", "sentences", "/path/to/csv/file2", "/path/to/csv/file3", 0.8)
+    """
+
     # Read CSV file
     try:
         df = pd.read_csv(csv_file)
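The body of removal is truncated after the CSV read, and the docstring only says that cosine similarity drives the deduplication. One common way to realize that idea, not necessarily what SinaTools does internally, is pairwise cosine over TF-IDF vectors:

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    def find_duplicate_indices(sentences, threshold=0.8):
        # Vectorize the sentences and compare every pair.
        matrix = TfidfVectorizer().fit_transform(sentences)
        sims = cosine_similarity(matrix)
        duplicates = set()
        for i in range(len(sentences)):
            for j in range(i + 1, len(sentences)):
                if sims[i, j] >= threshold:
                    duplicates.add(j)  # keep the first occurrence, drop later ones
        return duplicates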
sinatools/utils/text_transliteration.py
CHANGED
@@ -165,7 +165,7 @@ bw2ar_map = {
 #It takes a text and the schema as input and return 2-values: the transliteration and a flag of whether all chars are transliterated or not
 def perform_transliteration(text , schema ):
     """
-    This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
+    This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text is based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
 
     Args:
         text (:obj:`str`): The input text to be transliterated.
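The transliteration hunk stops inside the Args section. Given the bw2ar_map named in the hunk header, a call presumably looks like the following; the schema identifier "bw2ar" is an assumption inferred from that map name, so check the library documentation for the exact values:

    from sinatools.utils.text_transliteration import perform_transliteration

    # "bw2ar" (Buckwalter to Arabic) is a hypothetical schema name here.
    transliterated, fully_mapped = perform_transliteration("mrHbA", "bw2ar")
    print(transliterated, fully_mapped)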