sharedkernel 1.6.1__tar.gz → 1.6.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/PKG-INFO +5 -1
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/README.md +4 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/setup.py +1 -1
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel.egg-info/PKG-INFO +5 -1
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel.egg-info/SOURCES.txt +0 -1
- sharedkernel-1.6.1/sharedkernel/normalizer.py +0 -498
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/setup.cfg +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/common.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/database/__init__.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/database/mongo_generic_repository.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/database/vector_database_repository/__init__.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/database/vector_database_repository/chroma_startegy.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/database/vector_database_repository/milvus_strategy.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/database/vector_database_repository/vector_database_repository.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/database/vector_database_repository/vector_database_strategy.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/date_converter.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/enum/__init__.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/enum/error_code.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/enum/vector_database_type.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/exception/__init__.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/exception/exception.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/exception/exception_handlers.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/jwt_service.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/objects/__init__.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/objects/base_document.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/objects/jwt_model.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/objects/result.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/regex_masking.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel/string_extentions.py +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel.egg-info/dependency_links.txt +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel.egg-info/requires.txt +0 -0
- {sharedkernel-1.6.1 → sharedkernel-1.6.3}/sharedkernel.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sharedkernel
|
|
3
|
-
Version: 1.6.1
|
|
3
|
+
Version: 1.6.3
|
|
4
4
|
Summary: sharekernel is a shared package between all python projects
|
|
5
5
|
Author: Smilinno
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -20,6 +20,10 @@ Requires-Dist: persiantools
|
|
|
20
20
|
this a shared kernel package
|
|
21
21
|
|
|
22
22
|
# Change Log
|
|
23
|
+
### Version 1.6.3
|
|
24
|
+
- Fix minor bug in phone normalizer
|
|
25
|
+
### Version 1.6.2
|
|
26
|
+
- Minor update: normalize function name
|
|
23
27
|
### Version 1.6.1
|
|
24
28
|
- Minor update normalize functions
|
|
25
29
|
### Version 1.6
|
|
@@ -33,7 +33,7 @@ setup(
|
|
|
33
33
|
"persiantools"
|
|
34
34
|
],
|
|
35
35
|
# *strongly* suggested for sharing
|
|
36
|
-
version="1.6.1",
|
|
36
|
+
version="1.6.3",
|
|
37
37
|
description="sharekernel is a shared package between all python projects",
|
|
38
38
|
long_description=long_description,
|
|
39
39
|
long_description_content_type="text/markdown",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sharedkernel
|
|
3
|
-
Version: 1.6.1
|
|
3
|
+
Version: 1.6.3
|
|
4
4
|
Summary: sharekernel is a shared package between all python projects
|
|
5
5
|
Author: Smilinno
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -20,6 +20,10 @@ Requires-Dist: persiantools
|
|
|
20
20
|
this a shared kernel package
|
|
21
21
|
|
|
22
22
|
# Change Log
|
|
23
|
+
### Version 1.6.3
|
|
24
|
+
- Fix minor bug in phone normalizer
|
|
25
|
+
### Version 1.6.2
|
|
26
|
+
- Minor update: normalize function name
|
|
23
27
|
### Version 1.6.1
|
|
24
28
|
- Minor update normalize functions
|
|
25
29
|
### Version 1.6
|
|
@@ -1,498 +0,0 @@
|
|
|
1
|
-
import re
|
|
2
|
-
import itertools
|
|
3
|
-
|
|
4
|
-
"""
|
|
5
|
-
Speech-to-Text Processing Systems for Iranian Dialects:
|
|
6
|
-
|
|
7
|
-
Speech-to-text processing systems often struggle to accurately transcribe phone numbers spoken by Iranians.
|
|
8
|
-
As a result, phone numbers and sequential digits may not be written accurately.
|
|
9
|
-
This is because individuals sometimes read phone numbers in a shortened or rounded manner.
|
|
10
|
-
Therefore, it is necessary to design a module that considers all these variations, normalizes the text,
|
|
11
|
-
and constructs a correct phone number.
|
|
12
|
-
|
|
13
|
-
Input and Output Specifications:
|
|
14
|
-
|
|
15
|
-
Input: The input will be in str format.
|
|
16
|
-
Output: The output will also be in str format.
|
|
17
|
-
If a phone number is identified, it will be processed and replaced with a corrected version.
|
|
18
|
-
If no phone number is found, the text will be returned in its original format and structure.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
class DigitMapping:
|
|
23
|
-
PERSIAN_DIGITS = "۰۱۲۳۴۵۶۷۸۹"
|
|
24
|
-
WESTERN_DIGITS = "0123456789"
|
|
25
|
-
PHONE_NUMBER_LENGTH_FULL = 11
|
|
26
|
-
PHONE_NUMBER_LENGTH_PARTIAL = 10
|
|
27
|
-
SEVEN_DIGIT_PREFIX_LENGTH = 7
|
|
28
|
-
MIN_NUMBER_FOR_CHECK = 8
|
|
29
|
-
MIN_NUMBER_FOR_GENERATE = 5
|
|
30
|
-
MAX_ITERATIONS_CHECK = 4
|
|
31
|
-
PICK_FIRST_OR_LAST_LENGTH_NUMBER = 4
|
|
32
|
-
MAX_CHAR_CHECK = 30
|
|
33
|
-
MIN_CHAR_CHECK = 7
|
|
34
|
-
FIND_MIN_REPEAT_NUM = 3
|
|
35
|
-
GENERATIVE_VALID_NUMBER_COUNT = 1
|
|
36
|
-
number_words = {
|
|
37
|
-
# Persian
|
|
38
|
-
"صفر": "0",
|
|
39
|
-
"یک": "1",
|
|
40
|
-
"دو": "2",
|
|
41
|
-
"سه": "3",
|
|
42
|
-
"چهار": "4",
|
|
43
|
-
"پنج": "5",
|
|
44
|
-
"شش": "6",
|
|
45
|
-
"هفت": "7",
|
|
46
|
-
"هشت": "8",
|
|
47
|
-
"نه": "9",
|
|
48
|
-
}
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
# Creating an instance of the DigitMapping class
|
|
54
|
-
digit_mapping = DigitMapping()
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
class PhoneNumberProcessor:
|
|
58
|
-
# Iranian mobile phone number area code
|
|
59
|
-
|
|
60
|
-
def __replace_number_words(self, text):
|
|
61
|
-
pattern = r"(\d+)([a-zA-Z]+)"
|
|
62
|
-
text = re.sub(pattern, r"\1 \2", text)
|
|
63
|
-
for word, digit in digit_mapping.number_words.items():
|
|
64
|
-
text = re.sub(r"\b" + word + r"\b", digit, text, flags=re.IGNORECASE)
|
|
65
|
-
return text
|
|
66
|
-
|
|
67
|
-
def __first_to_in_phone_number(self, number_str):
|
|
68
|
-
"""
|
|
69
|
-
Identify patterns where a number is followed by "تا" and another number.
|
|
70
|
-
Generate all combinations for numbers preceding and following "تا"
|
|
71
|
-
and return them as a list.
|
|
72
|
-
"""
|
|
73
|
-
generated_numbers = []
|
|
74
|
-
ta_pattern = re.compile(r"(\d)(?=\s*تا\s*(\d+))")
|
|
75
|
-
matches = ta_pattern.findall(number_str)
|
|
76
|
-
|
|
77
|
-
if matches:
|
|
78
|
-
num1, num2 = matches[0] # Extract the first match
|
|
79
|
-
combinations = self.__generate_prefix_combinations(num1, num2)
|
|
80
|
-
|
|
81
|
-
for combo in combinations:
|
|
82
|
-
repeated_number = combo[1] * int(combo[0])
|
|
83
|
-
pattern = re.escape(f"{combo[0]} تا {combo[1]}")
|
|
84
|
-
generated_numbers.append(re.sub(pattern, repeated_number, number_str))
|
|
85
|
-
|
|
86
|
-
return generated_numbers
|
|
87
|
-
return number_str
|
|
88
|
-
|
|
89
|
-
def __process_to_constructions(self, number_str):
|
|
90
|
-
"""
|
|
91
|
-
Process occurrences of 'تا' in the input string to generate possible number combinations.
|
|
92
|
-
This function calls `__first_to_in_phone_number` to handle the first 'تا' occurrence and
|
|
93
|
-
iteratively processes further occurrences if necessary.
|
|
94
|
-
"""
|
|
95
|
-
# Start processing the first 'تا'
|
|
96
|
-
generated_numbers = self.__first_to_in_phone_number(number_str)
|
|
97
|
-
|
|
98
|
-
if isinstance(generated_numbers, str):
|
|
99
|
-
return generated_numbers
|
|
100
|
-
|
|
101
|
-
ta_pattern = re.compile(r"(\d+)(?=\s*تا\s*(\d+))")
|
|
102
|
-
|
|
103
|
-
i = 0
|
|
104
|
-
while i < len(generated_numbers):
|
|
105
|
-
current_number = generated_numbers[i]
|
|
106
|
-
if ta_pattern.search(current_number):
|
|
107
|
-
new_combinations = self.__first_to_in_phone_number(current_number)
|
|
108
|
-
generated_numbers.pop(i)
|
|
109
|
-
generated_numbers[i:i] = new_combinations
|
|
110
|
-
else:
|
|
111
|
-
i += 1
|
|
112
|
-
|
|
113
|
-
# Select the appropriate number from the generated list
|
|
114
|
-
selected_number = next(
|
|
115
|
-
(
|
|
116
|
-
num
|
|
117
|
-
for num in generated_numbers
|
|
118
|
-
if len("".join(num.split())) == digit_mapping.PHONE_NUMBER_LENGTH_FULL
|
|
119
|
-
),
|
|
120
|
-
next(
|
|
121
|
-
(
|
|
122
|
-
num
|
|
123
|
-
for num in generated_numbers
|
|
124
|
-
if len("".join(num.split()))
|
|
125
|
-
== digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL
|
|
126
|
-
),
|
|
127
|
-
None,
|
|
128
|
-
),
|
|
129
|
-
)
|
|
130
|
-
|
|
131
|
-
return "".join(selected_number.split()) if selected_number else number_str
|
|
132
|
-
|
|
133
|
-
def __generate_prefix_combinations(self, prefix1: str, prefix2: str):
|
|
134
|
-
"""
|
|
135
|
-
Generate all possible combinations of the prefixes of `prefix1` and `prefix2`
|
|
136
|
-
using itertools to avoid explicit loops.
|
|
137
|
-
"""
|
|
138
|
-
len1, len2 = len(prefix1), len(prefix2)
|
|
139
|
-
indices_product = itertools.product(range(1, len1 + 1), range(1, len2 + 1))
|
|
140
|
-
|
|
141
|
-
# Generate the combinations using the indices
|
|
142
|
-
return [(prefix1[:i], prefix2[:j]) for i, j in indices_product]
|
|
143
|
-
|
|
144
|
-
def __add_spaces_around_to(self, input_text):
|
|
145
|
-
"""
|
|
146
|
-
Ensure "تا" has spaces around it and remove any redundant spaces.
|
|
147
|
-
"""
|
|
148
|
-
# Use regular expressions to add spaces around "تا"
|
|
149
|
-
modified_text = re.sub(r"\s*تا\s*", " تا ", input_text)
|
|
150
|
-
|
|
151
|
-
return modified_text
|
|
152
|
-
|
|
153
|
-
def __generate_next_numbers(self, num):
|
|
154
|
-
str_num = str(num)
|
|
155
|
-
next_numbers = set()
|
|
156
|
-
for i in range(len(str_num) - 1):
|
|
157
|
-
current_digit = int(str_num[i])
|
|
158
|
-
next_digit = int(str_num[i + 1])
|
|
159
|
-
new_num = (
|
|
160
|
-
str_num[:i]
|
|
161
|
-
+ str(int(current_digit) * str(next_digit))
|
|
162
|
-
+ str_num[i + 2 :]
|
|
163
|
-
)
|
|
164
|
-
if len(new_num) <= 7:
|
|
165
|
-
next_numbers.add(str(new_num))
|
|
166
|
-
return next_numbers
|
|
167
|
-
|
|
168
|
-
def __find_seven_chain_numbers(self, start_num):
|
|
169
|
-
current_numbers = {start_num}
|
|
170
|
-
all_numbers = set(current_numbers)
|
|
171
|
-
while len(all_numbers) < digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH:
|
|
172
|
-
next_numbers = set()
|
|
173
|
-
|
|
174
|
-
# Using a for loop to generate new numbers
|
|
175
|
-
for num in current_numbers:
|
|
176
|
-
generated_numbers = self.__generate_next_numbers(num)
|
|
177
|
-
new_numbers = generated_numbers - all_numbers
|
|
178
|
-
next_numbers.update(new_numbers)
|
|
179
|
-
|
|
180
|
-
if not next_numbers: # Exit loop if no new numbers are generated
|
|
181
|
-
break
|
|
182
|
-
|
|
183
|
-
all_numbers.update(next_numbers)
|
|
184
|
-
current_numbers = next_numbers
|
|
185
|
-
|
|
186
|
-
# Early exit if we find any 7-digit number
|
|
187
|
-
if any(
|
|
188
|
-
len(str(num)) == digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH
|
|
189
|
-
for num in next_numbers
|
|
190
|
-
):
|
|
191
|
-
break
|
|
192
|
-
# Collect all 7-digit numbers and return them sorted
|
|
193
|
-
seven_digit_numbers = sorted(
|
|
194
|
-
num
|
|
195
|
-
for num in all_numbers
|
|
196
|
-
if len(str(num)) == digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH
|
|
197
|
-
)
|
|
198
|
-
return seven_digit_numbers
|
|
199
|
-
|
|
200
|
-
def __apply_filter(self, numbers, index, condition):
|
|
201
|
-
return (
|
|
202
|
-
[num for num in numbers if condition(num)] if len(numbers) >= 2 else numbers
|
|
203
|
-
)
|
|
204
|
-
|
|
205
|
-
def __generate_valid_numbers(self, start_num):
|
|
206
|
-
numbers_list = self.__find_seven_chain_numbers(start_num)
|
|
207
|
-
|
|
208
|
-
# If the number list length matches the valid criteria, return the first number
|
|
209
|
-
if len(numbers_list) == digit_mapping.GENERATIVE_VALID_NUMBER_COUNT:
|
|
210
|
-
return numbers_list[0] if numbers_list else start_num
|
|
211
|
-
|
|
212
|
-
start_num_str = str(start_num)
|
|
213
|
-
unique_count = len(set(start_num_str))
|
|
214
|
-
# Step 1: Filter numbers with the same or one less unique digit count
|
|
215
|
-
filtered_numbers = [
|
|
216
|
-
num
|
|
217
|
-
for num in numbers_list
|
|
218
|
-
if len(set(str(num))) in {unique_count, unique_count - 1}
|
|
219
|
-
]
|
|
220
|
-
# Apply additional filters only if there are at least 2 candidates remaining
|
|
221
|
-
# Step 2: Filter by last digit
|
|
222
|
-
filtered_numbers = self.__apply_filter(
|
|
223
|
-
filtered_numbers, -1, lambda num: str(num)[-1] == start_num_str[-1]
|
|
224
|
-
)
|
|
225
|
-
# Step 3: Filter by second digit
|
|
226
|
-
filtered_numbers = self.__apply_filter(
|
|
227
|
-
filtered_numbers, 1, lambda num: str(num)[1] == start_num_str[1]
|
|
228
|
-
)
|
|
229
|
-
# Step 4: Filter by last 4 digits
|
|
230
|
-
filtered_numbers = self.__apply_filter(
|
|
231
|
-
filtered_numbers, -4, lambda num: str(num)[-4:] == start_num_str[-4:]
|
|
232
|
-
)
|
|
233
|
-
|
|
234
|
-
# Return the first valid number or fallback
|
|
235
|
-
return (
|
|
236
|
-
filtered_numbers[0]
|
|
237
|
-
if filtered_numbers
|
|
238
|
-
else (numbers_list[0] if numbers_list else start_num)
|
|
239
|
-
)
|
|
240
|
-
|
|
241
|
-
def __check_area_code(self, input_text):
|
|
242
|
-
"""
|
|
243
|
-
Processes the input Persian text to ensure it has a valid area
|
|
244
|
-
code and valid number format.
|
|
245
|
-
"""
|
|
246
|
-
# Remove all non-digit characters from the input
|
|
247
|
-
digits_only = re.sub(r"\D", "", input_text)
|
|
248
|
-
|
|
249
|
-
# Ensure there are enough digits to process
|
|
250
|
-
if len(digits_only) <= digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER:
|
|
251
|
-
return digits_only
|
|
252
|
-
|
|
253
|
-
# Extract the area code (first 4 digits) and the remaining number
|
|
254
|
-
area_code = digits_only[: digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER]
|
|
255
|
-
number_part = digits_only[digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER :]
|
|
256
|
-
|
|
257
|
-
# Generate valid number formats based on the remaining part
|
|
258
|
-
valid_number = self.__generate_valid_numbers(number_part)
|
|
259
|
-
# Return the formatted result
|
|
260
|
-
if not valid_number:
|
|
261
|
-
return digits_only
|
|
262
|
-
return f"{area_code}{valid_number}"
|
|
263
|
-
|
|
264
|
-
def __insert_repeated_number(self, number_str):
|
|
265
|
-
"""
|
|
266
|
-
Finds the longest repeated sequence in the string and inserts one more instance of the repeated number
|
|
267
|
-
to extend the sequence, without using explicit `for` loops.
|
|
268
|
-
"""
|
|
269
|
-
# Group consecutive identical characters and find the longest group
|
|
270
|
-
grouped = [
|
|
271
|
-
(char, len(list(group))) for char, group in itertools.groupby(number_str)
|
|
272
|
-
]
|
|
273
|
-
|
|
274
|
-
# Find the max repeated group
|
|
275
|
-
max_repeated_char, max_repeated_len = max(grouped, key=lambda x: x[1])
|
|
276
|
-
|
|
277
|
-
# Only extend the sequence if the maximum repeated length is greater than 1
|
|
278
|
-
if max_repeated_len > 1:
|
|
279
|
-
# Find the start index of the sequence
|
|
280
|
-
start_index = number_str.find(max_repeated_char * max_repeated_len)
|
|
281
|
-
|
|
282
|
-
# Extend the sequence by one character and return the modified string
|
|
283
|
-
extended_sequence = max_repeated_char * (max_repeated_len + 1)
|
|
284
|
-
return (
|
|
285
|
-
number_str[:start_index]
|
|
286
|
-
+ extended_sequence
|
|
287
|
-
+ number_str[start_index + max_repeated_len :]
|
|
288
|
-
)
|
|
289
|
-
|
|
290
|
-
return number_str
|
|
291
|
-
|
|
292
|
-
def __add_single_repeating_digit_between_repeats(self, number_str):
|
|
293
|
-
# If the input string is already longer than the partial phone number length, return it as is
|
|
294
|
-
if len(number_str) > digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL:
|
|
295
|
-
return number_str
|
|
296
|
-
|
|
297
|
-
# Split the string into two parts: the prefix (e.g., area code) and the remainder
|
|
298
|
-
prefix = number_str[: digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER]
|
|
299
|
-
number_str = number_str[digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER :]
|
|
300
|
-
|
|
301
|
-
result = []
|
|
302
|
-
i = 0
|
|
303
|
-
repeat_count = 1 # Track the count of consecutive repeating digits
|
|
304
|
-
|
|
305
|
-
while i < len(number_str):
|
|
306
|
-
# Check if the next digit is the same as the current one
|
|
307
|
-
if i > 0 and number_str[i] == number_str[i - 1]:
|
|
308
|
-
repeat_count += 1
|
|
309
|
-
else:
|
|
310
|
-
repeat_count = 1 # Reset the repeat count if the digit changes
|
|
311
|
-
|
|
312
|
-
# If we have a sequence of three or more repeating digits
|
|
313
|
-
if repeat_count == digit_mapping.FIND_MIN_REPEAT_NUM:
|
|
314
|
-
# Add a single extra repeating digit and then reset the counter
|
|
315
|
-
result.append(number_str[i])
|
|
316
|
-
repeat_count = 1
|
|
317
|
-
|
|
318
|
-
# Add the current digit to the result
|
|
319
|
-
result.append(number_str[i])
|
|
320
|
-
i += 1
|
|
321
|
-
|
|
322
|
-
# Concatenate the prefix with the processed result and return the final string
|
|
323
|
-
return prefix + "".join(result)
|
|
324
|
-
|
|
325
|
-
def __process_phone_number(self, phone_number):
|
|
326
|
-
"""
|
|
327
|
-
Process the Persian phone number string by adding a leading zero if missing,
|
|
328
|
-
removing spaces, and checking the area code.
|
|
329
|
-
"""
|
|
330
|
-
# Add leading zero to the area code if it's missing
|
|
331
|
-
if phone_number[0] != "0":
|
|
332
|
-
phone_number = "0" + phone_number
|
|
333
|
-
|
|
334
|
-
# Remove any spaces from the phone number
|
|
335
|
-
phone_number = "".join(phone_number.split())
|
|
336
|
-
|
|
337
|
-
# If the phone number is 10 digits or fewer, validate the area code
|
|
338
|
-
if len(phone_number) <= digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL:
|
|
339
|
-
validated_number = self.__check_area_code(phone_number)
|
|
340
|
-
return validated_number, phone_number
|
|
341
|
-
|
|
342
|
-
# If the phone number is longer than expected, return it as-is
|
|
343
|
-
return phone_number, phone_number
|
|
344
|
-
|
|
345
|
-
def __clean_and_concatenate_numbers(self, text):
|
|
346
|
-
"""
|
|
347
|
-
Process the input text by adding spaces around specific constructions,
|
|
348
|
-
removing unnecessary spaces, handling special constructions, and concatenating numbers.
|
|
349
|
-
"""
|
|
350
|
-
# Add spaces around the Persian word "تا"
|
|
351
|
-
text = self.__add_spaces_around_to(text)
|
|
352
|
-
# Remove spaces between numbers
|
|
353
|
-
text = self.__remove_spaces_between_numbers(text)
|
|
354
|
-
""" Handle special constructions like replacing "تا"
|
|
355
|
-
with corresponding numbers and removing it"""
|
|
356
|
-
text = self.__process_to_constructions(text)
|
|
357
|
-
# Concatenate digits without any spaces between them
|
|
358
|
-
text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
|
|
359
|
-
|
|
360
|
-
return text
|
|
361
|
-
|
|
362
|
-
def __english_to_persian(self, text):
|
|
363
|
-
farsi_to_latin = str.maketrans(
|
|
364
|
-
digit_mapping.WESTERN_DIGITS, digit_mapping.PERSIAN_DIGITS
|
|
365
|
-
)
|
|
366
|
-
return text.translate(farsi_to_latin)
|
|
367
|
-
|
|
368
|
-
def __persian_to_western(self, persian_number):
|
|
369
|
-
translation_table = str.maketrans(
|
|
370
|
-
digit_mapping.PERSIAN_DIGITS, digit_mapping.WESTERN_DIGITS
|
|
371
|
-
)
|
|
372
|
-
return persian_number.translate(translation_table)
|
|
373
|
-
|
|
374
|
-
def __remove_spaces_between_numbers(self, text):
|
|
375
|
-
# This regex will match spaces that are between two digits
|
|
376
|
-
return re.sub(r"(\d)\s+(\d)", r"\1\2", text)
|
|
377
|
-
|
|
378
|
-
def __process_patterned_numbers(self, number_sequence):
|
|
379
|
-
if number_sequence is None:
|
|
380
|
-
return number_sequence
|
|
381
|
-
# Convert Persian numbers to Western (English) numbers
|
|
382
|
-
western_number_sequence = self.__persian_to_western(number_sequence)
|
|
383
|
-
|
|
384
|
-
# Remove spaces between numbers to form a continuous sequence
|
|
385
|
-
contiguous_numbers = self.__remove_spaces_between_numbers(western_number_sequence)
|
|
386
|
-
|
|
387
|
-
# Clean up any remaining spaces
|
|
388
|
-
clean_number_sequence = "".join(contiguous_numbers.split())
|
|
389
|
-
|
|
390
|
-
# Clean and concatenate numbers to form the phone number
|
|
391
|
-
concatenated_numbers = self.__clean_and_concatenate_numbers(clean_number_sequence)
|
|
392
|
-
|
|
393
|
-
# Add single repeating digit between repeats in the phone number
|
|
394
|
-
processed_sequence = self.__add_single_repeating_digit_between_repeats(
|
|
395
|
-
concatenated_numbers
|
|
396
|
-
)
|
|
397
|
-
|
|
398
|
-
# Process and validate the phone number
|
|
399
|
-
final_number, original_sequence = self.__process_phone_number(processed_sequence)
|
|
400
|
-
|
|
401
|
-
# Limit the number of iterations to avoid infinite loops
|
|
402
|
-
|
|
403
|
-
iterations = 0
|
|
404
|
-
|
|
405
|
-
# Loop to handle sequences of length 8 to 10 digits
|
|
406
|
-
while (
|
|
407
|
-
digit_mapping.MIN_NUMBER_FOR_CHECK
|
|
408
|
-
<= len(str(final_number))
|
|
409
|
-
<= digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL
|
|
410
|
-
and iterations < digit_mapping.MAX_ITERATIONS_CHECK
|
|
411
|
-
):
|
|
412
|
-
if len(final_number) == 0:
|
|
413
|
-
modified_sequence = self.__insert_repeated_number(str(original_sequence))
|
|
414
|
-
else:
|
|
415
|
-
modified_sequence = self.__insert_repeated_number(str(final_number))
|
|
416
|
-
|
|
417
|
-
# Re-process the phone number
|
|
418
|
-
final_number, original_sequence = self.__process_phone_number(
|
|
419
|
-
modified_sequence
|
|
420
|
-
)
|
|
421
|
-
|
|
422
|
-
iterations += 1
|
|
423
|
-
# Handle case when final_number is a list
|
|
424
|
-
if isinstance(final_number, list):
|
|
425
|
-
final_number = final_number[0]
|
|
426
|
-
|
|
427
|
-
return final_number
|
|
428
|
-
|
|
429
|
-
def __update_text_with_number(self, text, new_number, old_prefix, new_prefix):
|
|
430
|
-
"""
|
|
431
|
-
Update the text by replacing the old prefix with the new prefix and converting it to Persian.
|
|
432
|
-
"""
|
|
433
|
-
processed_text = self.__process_patterned_numbers(new_number)
|
|
434
|
-
persian_text = self.__english_to_persian(processed_text)
|
|
435
|
-
text = re.sub(old_prefix, new_prefix, text, count=1)
|
|
436
|
-
return re.sub(re.escape(new_number), persian_text, text)
|
|
437
|
-
|
|
438
|
-
def __update_general_case(self, text, number):
|
|
439
|
-
"""
|
|
440
|
-
Handle general number replacement cases and update the text.
|
|
441
|
-
"""
|
|
442
|
-
processed_text = self.__process_patterned_numbers(number)
|
|
443
|
-
persian_text = self.__english_to_persian(processed_text)
|
|
444
|
-
return re.sub(re.escape(number), persian_text, text)
|
|
445
|
-
|
|
446
|
-
def __process_number_replacement(self, text, number):
|
|
447
|
-
"""
|
|
448
|
-
Process specific number patterns like '۰۹۹' and '۹۹' and replace them with correct forms.
|
|
449
|
-
"""
|
|
450
|
-
first_digits = number[:3]
|
|
451
|
-
|
|
452
|
-
if first_digits == "۰۹۹":
|
|
453
|
-
new_number = number.replace("۰۹۹", "۰۹۹۹", 1)
|
|
454
|
-
return self.__update_text_with_number(text, new_number, "۰۹۹", "۰۹۹۹")
|
|
455
|
-
|
|
456
|
-
return self.__update_general_case(text, number)
|
|
457
|
-
|
|
458
|
-
@staticmethod
|
|
459
|
-
def normalize(text):
|
|
460
|
-
"""
|
|
461
|
-
Normalize phone numbers in the input Persian text by handling specific patterns
|
|
462
|
-
and replacing them with the correct numeric forms.
|
|
463
|
-
"""
|
|
464
|
-
|
|
465
|
-
# Compile regex pattern to match number patterns
|
|
466
|
-
pattern_regex = re.compile(
|
|
467
|
-
r"((0?[1-9][0-9]{1,3})|(۰?[۱-۹][۰-۹]{1,3}))[\s۰-۹0-9تا]*"
|
|
468
|
-
)
|
|
469
|
-
|
|
470
|
-
# Clean up text by removing extra spaces
|
|
471
|
-
cleaned_text = re.sub(r"\s{2,}", " ", text)
|
|
472
|
-
|
|
473
|
-
processor = PhoneNumberProcessor()
|
|
474
|
-
|
|
475
|
-
# Replace Persian number words with numeric equivalents
|
|
476
|
-
cleaned_text = processor.__replace_number_words(cleaned_text)
|
|
477
|
-
|
|
478
|
-
# Find all matching number patterns
|
|
479
|
-
matches = pattern_regex.finditer(cleaned_text)
|
|
480
|
-
results = [
|
|
481
|
-
match.group()
|
|
482
|
-
for match in matches
|
|
483
|
-
if digit_mapping.MIN_CHAR_CHECK
|
|
484
|
-
<= len(match.group())
|
|
485
|
-
<= digit_mapping.MAX_CHAR_CHECK
|
|
486
|
-
]
|
|
487
|
-
|
|
488
|
-
# Process each match to handle specific cases
|
|
489
|
-
for result in results:
|
|
490
|
-
digits = re.findall(r"\d", result)
|
|
491
|
-
if (
|
|
492
|
-
digit_mapping.MIN_NUMBER_FOR_GENERATE
|
|
493
|
-
<= len(digits)
|
|
494
|
-
<= digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL
|
|
495
|
-
):
|
|
496
|
-
cleaned_text = processor.__process_number_replacement(cleaned_text, result)
|
|
497
|
-
|
|
498
|
-
return cleaned_text if results else text
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|