SinaTools 0.1.27__py2.py3-none-any.whl → 0.1.29__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
2
 
3
3
  from sinatools.utils.parser import arStrip
4
- from sinatools.utils.implication import Implication
4
+ from sinatools.utils.word_compare import Implication
5
5
  import argparse
6
6
 
7
7
  def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
@@ -47,7 +47,29 @@ def get_non_preferred_word(word1, word2):
47
47
  return "#"
48
48
 
49
49
  def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
50
-
50
+ """
51
+ Computes the intersection of two sets of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
52
+
53
+ Args:
54
+ list1 (:obj:`list`): The first list.
55
+ list2 (:obj:`list`): The second list.
56
+ ignore_all_diacritics_but_not_shadda (:obj:`bool`, optional) – A flag to ignore all diacritics except for the shadda. Defaults to False.
57
+ ignore_shadda_diacritic (:obj:`bool`, optional) – A flag to ignore the shadda diacritic. Defaults to False.
58
+
59
+ Returns:
60
+ :obj:`list`: The intersection of the two lists, ignores diacritics if flags are true.
61
+
62
+ **Example:**
63
+
64
+ .. highlight:: python
65
+ .. code-block:: python
66
+
67
+ from sinatools.utils.similarity import get_intersection
68
+ list1 = ["كتب","فَعل","فَعَلَ"]
69
+ list2 = ["كتب","فَعّل"]
70
+ print(get_intersection(list1, list2, False, True))
71
+ #output: ["كتب" ,"فعل"]
72
+ """
51
73
  list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
52
74
  list1 = [str(i.strip()) for i in list1]
53
75
 
@@ -80,7 +102,29 @@ def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, i
80
102
 
81
103
 
82
104
  def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
83
-
105
+ """
106
+ Computes the union of two sets of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
107
+
108
+ Args:
109
+ list1 (:obj:`list`): The first list.
110
+ list2 (:obj:`list`): The second list.
111
+ ignore_all_diacritics_but_not_shadda (:obj:`bool`) – A flag to ignore all diacritics except for the shadda.
112
+ ignore_shadda_diacritic (:obj:`bool`) – A flag to ignore the shadda diacritic.
113
+
114
+ Returns:
115
+ :obj:`list`: The union of the two lists, ignoring diacritics if flags are true.
116
+
117
+ **Example:**
118
+
119
+ .. highlight:: python
120
+ .. code-block:: python
121
+
122
+ from sinatools.utils.similarity import get_union
123
+ list1 = ["كتب","فَعل","فَعَلَ"]
124
+ list2 = ["كتب","فَعّل"]
125
+ print(get_union(list1, list2, False, True))
126
+ #output: ["كتب" ,"فَعل" ,"فَعَلَ"]
127
+ """
84
128
  list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
85
129
 
86
130
  list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
@@ -110,7 +154,30 @@ def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_
110
154
 
111
155
 
112
156
  def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
157
+ """
158
+ Calculates the Jaccard similarity coefficient between two lists of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
113
159
 
160
+ Args:
161
+ list1 (:obj:`list`): The first list.
162
+ list2 (:obj:`list`): The second list.
163
+ ignore_all_diacritics_but_not_shadda (:obj:`bool`) – A flag to ignore all diacritics except for the shadda.
164
+ ignore_shadda_diacritic (:obj:`bool`) – A flag to ignore the shadda diacritic.
165
+
166
+ Returns:
167
+ :obj:`float`: The Jaccard similarity coefficient between the two lists, ignoring diacritics if flags are true.
168
+
169
+ **Example:**
170
+
171
+ .. highlight:: python
172
+ .. code-block:: python
173
+
174
+ from sinatools.utils.similarity import get_jaccard_similarity
175
+ list1 = ["كتب","فَعل","فَعَلَ"]
176
+ list2 = ["كتب","فَعّل"]
177
+ print(get_jaccard_similarity(list1, list2, True, True))
178
+ #output: 0.67
179
+ """
180
+
114
181
  intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
115
182
 
116
183
  union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
@@ -118,7 +185,31 @@ def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_n
118
185
  return float(len(intersection_list)) / float(len(union_list))
119
186
 
120
187
  def get_jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
121
-
188
+ """
189
+ Calculates and returns the Jaccard similarity values (union, intersection, or Jaccard similarity) between two lists of Arabic words, considering the differences in their diacritization. The method provides two options for handling diacritics: (i) ignore all diacritics except for shadda, and (ii) ignore the shadda diacritic as well. You can try the demo online.
190
+
191
+ Args:
192
+ delimiter (:obj:`str`): The delimiter used to split the input strings.
193
+ str1 (:obj:`str`): The first input string to compare.
194
+ str2 (:obj:`str`): The second input string to compare.
195
+ selection (:obj:`str`) – The desired operation to perform on the two sets of strings. Must be one of intersection, union, jaccardSimilarity, or jaccardAll.
196
+ ignoreAllDiacriticsButNotShadda (:obj:`bool`) – If True, ignore all diacritics except for the Shadda diacritic. (Default is True)
197
+ ignoreShaddaDiacritic (:obj:`bool`) – If True, ignore the Shadda diacritic. (Default is True)
198
+
199
+ Returns:
200
+ Three values (Jaccard similarity, union, or intersection) between the two lists of Arabic words depending on the parameter selection.
201
+
202
+ **Example:**
203
+
204
+ .. highlight:: python
205
+ .. code-block:: python
206
+
207
+ from sinatools.utils.similarity import get_jaccard
208
+ str1 = "فَعَلَ | فَعل"
209
+ str2 = "فَعّل"
210
+ print(get_jaccard("|", str1, str2, "jaccardAll", True, True))
211
+ #output: ['intersection:', ['فعل'], 'union:', ['فعل', 'فعل'], 'similarity:', 0.5]
212
+ """
122
213
  try:
123
214
  list1 = str1.split(delimiter)
124
215
  list2 = str2.split(delimiter)
@@ -15,6 +15,28 @@ def validator(sentence, max_tokens=500):
15
15
 
16
16
 
17
17
  def removal(csv_file, columnName, finalFileName, deletedFileName, similarityThreshold=0.8):
18
+ """
19
+ This method is designed to identify duplicate text in a given corpora/text. It processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold. We used cosine similarity to measure similarity between words and sentences. The method saves the filtered results and the identified duplicates to separate files.
20
+
21
+ Args:
22
+ csv_file (:obj:`str`) – The CSV file contains Arabic text that needs to be cleaned.
23
+ column_name (:obj:`str`) – This is the name of the column containing the text that needs to be checked for duplicate removal.
24
+ final_file_name (:obj:`str`) – This is the name of the CSV file that will contain the data after duplicate removal.
25
+ deleted_file_name (:obj:`str`) – This is the name of the file that will contain all the duplicate records that are deleted.
26
+ similarity_threshold (:obj:`float`) – This is a floating-point number. The default value is 0.8, indicating the percentage of similarity that the function should use when deleting duplicates from the text column.
27
+
28
+ Returns:
29
+ csv files.
30
+
31
+ **Example:**
32
+
33
+ .. highlight:: python
34
+ .. code-block:: python
35
+
36
+ from sinatools.utils.text_dublication_detector import removal
37
+ removal("/path/to/csv/file1", "sentences", "/path/to/csv/file2", "/path/to/csv/file3", 0.8)
38
+ """
39
+
18
40
  # Read CSV file
19
41
  try:
20
42
  df = pd.read_csv(csv_file)
@@ -165,7 +165,7 @@ bw2ar_map = {
165
165
  #It takes a text and the schema as input and return 2-values: the transliteration and a flag of whether all chars are transliterated or not
166
166
  def perform_transliteration(text , schema ):
167
167
  """
168
- This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
168
+ This method takes a text and a schema as input and returns a tuple of two values: the transliteration of the text based on the given schema and a flag indicating whether all characters in the text were transliterated or not.
169
169
 
170
170
  Args:
171
171
  text (:obj:`str`): The input text to be transliterated.
@@ -58,7 +58,7 @@ def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclam
58
58
 
59
59
  def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
60
60
  """
61
- This method receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in a CSV file.
61
+ This method is designed to tokenize a corpus into words. It receives a directory and tokenizes all files within the input directory, as well as all files within subdirectories within the main directory. The results are then stored in one CSV file. The data within files was split into sentences using the sentence_tokenizer module and into words using a word tokenizer. Additionally, it added a set of ids (row_id, docs_sentence_word_id, global_sentence_id, sentence_id, word_position).
62
62
 
63
63
  Args:
64
64
  dir_path (:obj:`str`): The path of the directory containing multiple Arabic txt files.