SinaTools 0.1.27__py2.py3-none-any.whl → 0.1.29__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/METADATA +2 -2
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/RECORD +27 -26
- sinatools/CLI/DataDownload/download_files.py +2 -5
- sinatools/CLI/morphology/ALMA_multi_word.py +0 -34
- sinatools/CLI/morphology/morph_analyzer.py +1 -1
- sinatools/CLI/ner/corpus_entity_extractor.py +17 -4
- sinatools/CLI/ner/entity_extractor.py +8 -8
- sinatools/CLI/utils/implication.py +3 -3
- sinatools/VERSION +1 -1
- sinatools/morphology/morph_analyzer.py +44 -45
- sinatools/ner/entity_extractor.py +41 -0
- sinatools/semantic_relatedness/compute_relatedness.py +22 -0
- sinatools/synonyms/synonyms_generator.py +45 -1
- sinatools/utils/jaccard.py +1 -1
- sinatools/utils/parser.py +12 -15
- sinatools/utils/similarity.py +95 -4
- sinatools/utils/text_dublication_detector.py +22 -0
- sinatools/utils/text_transliteration.py +1 -1
- sinatools/utils/tokenizer.py +1 -1
- sinatools/utils/word_compare.py +667 -0
- sinatools/wsd/disambiguator.py +20 -19
- {SinaTools-0.1.27.data → SinaTools-0.1.29.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.29.dist-info}/top_level.txt +0 -0
sinatools/utils/similarity.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
# -*- coding: utf-8 -*-
|
2
2
|
|
3
3
|
from sinatools.utils.parser import arStrip
|
4
|
-
from sinatools.utils.
|
4
|
+
from sinatools.utils.word_compare import Implication
|
5
5
|
import argparse
|
6
6
|
|
7
7
|
def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
|
@@ -47,7 +47,29 @@ def get_non_preferred_word(word1, word2):
|
|
47
47
|
return "#"
|
48
48
|
|
49
49
|
def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
|
50
|
-
|
50
|
+
"""
|
51
|
+
Computes the intersection of two sets of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
52
|
+
|
53
|
+
Args:
|
54
|
+
list1 (:obj:`list`): The first list.
|
55
|
+
list2 (:obj:`list`): The second list.
|
56
|
+
ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
|
57
|
+
ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
|
58
|
+
|
59
|
+
Returns:
|
60
|
+
:obj:`list`: The intersection of the two lists, ignores diacritics if flags are true.
|
61
|
+
|
62
|
+
**Example:**
|
63
|
+
|
64
|
+
.. highlight:: python
|
65
|
+
.. code-block:: python
|
66
|
+
|
67
|
+
from sinatools.utils.similarity import get_intersection
|
68
|
+
list1 = ["كتب","فَعل","فَعَلَ"]
|
69
|
+
list2 = ["كتب","فَعّل"]
|
70
|
+
print(get_intersection(list1, list2, False, True))
|
71
|
+
#output: ["كتب" ,"فعل"]
|
72
|
+
"""
|
51
73
|
list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
|
52
74
|
list1 = [str(i.strip()) for i in list1]
|
53
75
|
|
@@ -80,7 +102,29 @@ def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, i
|
|
80
102
|
|
81
103
|
|
82
104
|
def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
|
83
|
-
|
105
|
+
"""
|
106
|
+
Computes the union of two sets of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
107
|
+
|
108
|
+
Args:
|
109
|
+
list1 (:obj:`list`): The first list.
|
110
|
+
list2 (:obj:`list`): The second list.
|
111
|
+
ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
|
112
|
+
ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
|
113
|
+
|
114
|
+
Returns:
|
115
|
+
:obj:`list`: The union of the two lists, ignoring diacritics if flags are true.
|
116
|
+
|
117
|
+
**Example:**
|
118
|
+
|
119
|
+
.. highlight:: python
|
120
|
+
.. code-block:: python
|
121
|
+
|
122
|
+
from sinatools.utils.similarity import get_union
|
123
|
+
list1 = ["كتب","فَعل","فَعَلَ"]
|
124
|
+
list2 = ["كتب","فَعّل"]
|
125
|
+
print(get_union(list1, list2, False, True))
|
126
|
+
#output: ["كتب" ,"فَعل" ,"فَعَلَ"]
|
127
|
+
"""
|
84
128
|
list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
|
85
129
|
|
86
130
|
list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
|
@@ -110,7 +154,30 @@ def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_
|
|
110
154
|
|
111
155
|
|
112
156
|
def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
|
157
|
+
"""
|
158
|
+
Calculates the Jaccard similarity coefficient between two lists of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
113
159
|
|
160
|
+
Args:
|
161
|
+
list1 (:obj:`list`): The first list.
|
162
|
+
list2 (:obj:`list`): The second list.
|
163
|
+
ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
|
164
|
+
ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
|
165
|
+
|
166
|
+
Returns:
|
167
|
+
:obj:`float`: The Jaccard similarity coefficient between the two lists, ignoring diacritics if flags are true.
|
168
|
+
|
169
|
+
**Example:**
|
170
|
+
|
171
|
+
.. highlight:: python
|
172
|
+
.. code-block:: python
|
173
|
+
|
174
|
+
from sinatools.utils.similarity import get_jaccard_similarity
|
175
|
+
list1 = ["كتب","فَعل","فَعَلَ"]
|
176
|
+
list2 = ["كتب","فَعّل"]
|
177
|
+
print(get_jaccard_similarity(list1, list2, True, True))
|
178
|
+
#output: 0.67
|
179
|
+
"""
|
180
|
+
|
114
181
|
intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
|
115
182
|
|
116
183
|
union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
|
@@ -118,7 +185,31 @@ def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_n
|
|
118
185
|
return float(len(intersection_list)) / float(len(union_list))
|
119
186
|
|
120
187
|
def get_jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
|
121
|
-
|
188
|
+
"""
|
189
|
+
Calculates and returns the Jaccard similarity values (union, intersection, or Jaccard similarity) between two lists of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
|
190
|
+
|
191
|
+
Args:
|
192
|
+
delimiter (:obj:`str`): The delimiter used to split the input strings.
|
193
|
+
str1 (:obj:`str`): The first input string to compare.
|
194
|
+
str2 (:obj:`str`): The second input string to compare.
|
195
|
+
selection (:obj:`str`) – The desired operation to perform on the two sets of strings. Must be one of intersection, union, jaccardSimilarity, or jaccardAll.
|
196
|
+
ignoreAllDiacriticsButNotShadda (:obj:`bool`) – If True, ignore all diacritics except for the Shadda diacritic. (Default is True)
|
197
|
+
ignoreShaddaDiacritic (:obj:`bool`) – If True, ignore the Shadda diacritic. (Default is True)
|
198
|
+
|
199
|
+
Returns:
|
200
|
+
Three values (Jaccard similarity, union, or intersection) between the two lists of Arabic words depending on the parameter selection.
|
201
|
+
|
202
|
+
**Example:**
|
203
|
+
|
204
|
+
.. highlight:: python
|
205
|
+
.. code-block:: python
|
206
|
+
|
207
|
+
from sinatools.utils.similarity import get_jaccard
|
208
|
+
str1 = "فَعَلَ | فَعل"
|
209
|
+
str2 = "فَعّل"
|
210
|
+
print(get_jaccard("|", "jaccardAll", str1, str2, True, True))
|
211
|
+
#output: ['intersection:', ['فعل'], 'union:', ['فعل', 'فعل'], 'similarity:', 0.5]
|
212
|
+
"""
|
122
213
|
try:
|
123
214
|
list1 = str1.split(delimiter)
|
124
215
|
list2 = str2.split(delimiter)
|
@@ -15,6 +15,28 @@ def validator(sentence, max_tokens=500):
|
|
15
15
|
|
16
16
|
|
17
17
|
def removal(csv_file, columnName, finalFileName, deletedFileName, similarityThreshold=0.8):
|
18
|
+
"""
|
19
|
+
This method is designed to identify duplicate text in a given corpus. It processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold. We used cosine similarity to measure similarity between words and sentences. The method saves the filtered results and the identified duplicates to separate files.
|
20
|
+
|
21
|
+
Args:
|
22
|
+
csv_file (:obj:`str`) – The CSV file contains Arabic text that needs to be cleaned.
|
23
|
+
column_name (:obj:`str`) – This is the name of the column containing the text that needs to be checked for duplicate removal.
|
24
|
+
final_file_name (:obj:`str`) – This is the name of the CSV file that will contain the data after duplicate removal.
|
25
|
+
deleted_file_name (:obj:`str`) – This is the name of the file that will contain all the duplicate records that are deleted.
|
26
|
+
similarity_threshold (:obj:`float`) – This is a floating-point number. The default value is 0.8, indicating the percentage of similarity that the function should use when deleting duplicates from the text column.
|
27
|
+
|
28
|
+
Returns:
|
29
|
+
csv files.
|
30
|
+
|
31
|
+
**Example:**
|
32
|
+
|
33
|
+
.. highlight:: python
|
34
|
+
.. code-block:: python
|
35
|
+
|
36
|
+
from sinatools.utils.text_dublication_detector import removal
|
37
|
+
removal("/path/to/csv/file1", "sentences", "/path/to/csv/file2", "/path/to/csv/deleted", 0.8)
|
38
|
+
"""
|
39
|
+
|
18
40
|
# Read CSV file
|
19
41
|
try:
|
20
42
|
df = pd.read_csv(csv_file)
|
@@ -165,7 +165,7 @@ bw2ar_map = {
|
|
165
165
|
#It takes a text and the schema as input and return 2-values: the transliteration and a flag of whether all chars are transliterated or not
|
166
166
|
def perform_transliteration(text , schema ):
|
167
167
|
"""
|
168
|
-
This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
|
168
|
+
This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text is based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
|
169
169
|
|
170
170
|
Args:
|
171
171
|
text (:obj:`str`): The input text to be transliterated.
|
sinatools/utils/tokenizer.py
CHANGED
@@ -58,7 +58,7 @@ def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclam
|
|
58
58
|
|
59
59
|
def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
|
60
60
|
"""
|
61
|
-
This method receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in
|
61
|
+
This method is designed to tokenize a corpus into words. It receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in one CSV file. The data within files was split into sentences using the sentence_tokenizer module and into words using a word tokenizer. Additionally, it added a set of ids (row_id, docs_sentence_word_id, global_sentence_id, sentence_id, word_position).
|
62
62
|
|
63
63
|
Args:
|
64
64
|
dir_path (:obj:`str`): The path of the directory containing multiple Arabic txt files.
|