sharedkernel 1.6.8__tar.gz → 1.6.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/PKG-INFO +10 -6
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/README.md +9 -4
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/setup.py +2 -2
- sharedkernel-1.6.10/sharedkernel/database/vector_database_repository/__init__.py +2 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/database/vector_database_repository/vector_database_repository.py +5 -5
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/normalizer/phone_number_normalizer.py +7 -408
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/objects/base_document.py +1 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel.egg-info/PKG-INFO +10 -6
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel.egg-info/SOURCES.txt +0 -1
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel.egg-info/requires.txt +0 -1
- sharedkernel-1.6.8/sharedkernel/database/vector_database_repository/__init__.py +0 -3
- sharedkernel-1.6.8/sharedkernel/database/vector_database_repository/milvus_strategy.py +0 -50
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/setup.cfg +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/common.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/database/__init__.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/database/mongo_generic_repository.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/database/vector_database_repository/chroma_startegy.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/database/vector_database_repository/vector_database_strategy.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/date_converter.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/enum/__init__.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/enum/error_code.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/enum/vector_database_type.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/exception/__init__.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/exception/exception.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/exception/exception_handlers.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/jwt_service.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/normalizer/__init__.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/objects/__init__.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/objects/jwt_model.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/objects/result.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/regex_masking.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/string_extentions.py +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel.egg-info/dependency_links.txt +0 -0
- {sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sharedkernel
|
|
3
|
-
Version: 1.6.8
|
|
3
|
+
Version: 1.6.10
|
|
4
4
|
Summary: sharekernel is a shared package between all python projects
|
|
5
5
|
Author: Smilinno
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -9,7 +9,6 @@ Requires-Dist: requests
|
|
|
9
9
|
Requires-Dist: pymongo
|
|
10
10
|
Requires-Dist: fastapi==0.111.0
|
|
11
11
|
Requires-Dist: PyJWT
|
|
12
|
-
Requires-Dist: pymilvus
|
|
13
12
|
Requires-Dist: chromadb
|
|
14
13
|
Requires-Dist: persian_tools
|
|
15
14
|
Requires-Dist: sentry-sdk
|
|
@@ -20,6 +19,11 @@ Requires-Dist: persiantools
|
|
|
20
19
|
this a shared kernel package
|
|
21
20
|
|
|
22
21
|
# Change Log
|
|
22
|
+
### Version 1.6.10
|
|
23
|
+
- Add updated_on to BaseDocument
|
|
24
|
+
- Update phonenumber normalizer
|
|
25
|
+
### Version 1.6.9
|
|
26
|
+
- Delete milvus
|
|
23
27
|
### Version 1.6.8
|
|
24
28
|
- Fix Date Converter Bug
|
|
25
29
|
### Version 1.6.7
|
|
@@ -62,10 +66,10 @@ this a shared kernel package
|
|
|
62
66
|
### Version 1.2.0
|
|
63
67
|
- Implement Regex Masking
|
|
64
68
|
# Create Package
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
+
python -m pip install --upgrade build
|
|
70
|
+
python -m build
|
|
71
|
+
python -m pip install --upgrade twine
|
|
72
|
+
python -m twine upload dist/*
|
|
69
73
|
|
|
70
74
|
# Pypi
|
|
71
75
|
pip install sharedkernel
|
|
@@ -2,6 +2,11 @@
|
|
|
2
2
|
this a shared kernel package
|
|
3
3
|
|
|
4
4
|
# Change Log
|
|
5
|
+
### Version 1.6.10
|
|
6
|
+
- Add updated_on to BaseDocument
|
|
7
|
+
- Update phonenumber normalizer
|
|
8
|
+
### Version 1.6.9
|
|
9
|
+
- Delete milvus
|
|
5
10
|
### Version 1.6.8
|
|
6
11
|
- Fix Date Converter Bug
|
|
7
12
|
### Version 1.6.7
|
|
@@ -44,10 +49,10 @@ this a shared kernel package
|
|
|
44
49
|
### Version 1.2.0
|
|
45
50
|
- Implement Regex Masking
|
|
46
51
|
# Create Package
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
52
|
+
python -m pip install --upgrade build
|
|
53
|
+
python -m build
|
|
54
|
+
python -m pip install --upgrade twine
|
|
55
|
+
python -m twine upload dist/*
|
|
51
56
|
|
|
52
57
|
# Pypi
|
|
53
58
|
pip install sharedkernel
|
|
@@ -26,7 +26,7 @@ setup(
|
|
|
26
26
|
"pymongo",
|
|
27
27
|
"fastapi==0.111.0",
|
|
28
28
|
"PyJWT",
|
|
29
|
-
"pymilvus",
|
|
29
|
+
#"pymilvus",
|
|
30
30
|
"chromadb",
|
|
31
31
|
"persian_tools",
|
|
32
32
|
"sentry-sdk",
|
|
@@ -34,7 +34,7 @@ setup(
|
|
|
34
34
|
"persiantools"
|
|
35
35
|
],
|
|
36
36
|
# *strongly* suggested for sharing
|
|
37
|
-
version="1.6.8",
|
|
37
|
+
version="1.6.10",
|
|
38
38
|
description="sharekernel is a shared package between all python projects",
|
|
39
39
|
long_description=long_description,
|
|
40
40
|
long_description_content_type="text/markdown",
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import numpy as np
|
|
2
2
|
from .vector_database_strategy import VectorDatabaseStrategy
|
|
3
|
-
from .milvus_strategy import MilvusStrategy
|
|
3
|
+
# from .milvus_strategy import MilvusStrategy
|
|
4
4
|
from .chroma_startegy import ChromaStrategy
|
|
5
5
|
from sharedkernel.enum.vector_database_type import VectorDatabaseType
|
|
6
6
|
|
|
@@ -10,10 +10,10 @@ class VectorRepository:
|
|
|
10
10
|
self.strategy.connect(**connection_params)
|
|
11
11
|
|
|
12
12
|
def _get_strategy(self, database_type: VectorDatabaseType, collection_name: str) -> VectorDatabaseStrategy:
|
|
13
|
-
if database_type == VectorDatabaseType.MILVUS:
|
|
14
|
-
return MilvusStrategy(collection_name)
|
|
15
|
-
else:
|
|
16
|
-
return ChromaStrategy(collection_name)
|
|
13
|
+
# if database_type == VectorDatabaseType.MILVUS:
|
|
14
|
+
# return MilvusStrategy(collection_name)
|
|
15
|
+
# else:
|
|
16
|
+
return ChromaStrategy(collection_name)
|
|
17
17
|
|
|
18
18
|
def add_vector(self, vector: np.ndarray, metadata: dict) -> str:
|
|
19
19
|
return self.strategy.insert_vector(vector, metadata)
|
{sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/normalizer/phone_number_normalizer.py
RENAMED
|
@@ -18,6 +18,7 @@ import itertools
|
|
|
18
18
|
If no phone number is found, the text will be returned in its original format and structure.
|
|
19
19
|
"""
|
|
20
20
|
|
|
21
|
+
|
|
21
22
|
class DigitMapping:
|
|
22
23
|
PERSIAN_DIGITS = "۰۱۲۳۴۵۶۷۸۹"
|
|
23
24
|
WESTERN_DIGITS = "0123456789"
|
|
@@ -47,6 +48,8 @@ class DigitMapping:
|
|
|
47
48
|
}
|
|
48
49
|
|
|
49
50
|
|
|
51
|
+
|
|
52
|
+
|
|
50
53
|
# Creating an instance of the DigitMapping class
|
|
51
54
|
digit_mapping = DigitMapping()
|
|
52
55
|
|
|
@@ -211,412 +214,8 @@ class PhoneNumberNormalizer:
|
|
|
211
214
|
if len(numbers_list) == digit_mapping.GENERATIVE_VALID_NUMBER_COUNT:
|
|
212
215
|
return numbers_list[0] if numbers_list else start_num
|
|
213
216
|
|
|
214
|
-
start_num_str = str(start_num)
|
|
215
|
-
unique_count = len(set(start_num_str))
|
|
216
|
-
# Step 1: Filter numbers with the same or one less unique digit count
|
|
217
|
-
# Remove version 1.6.x
|
|
218
|
-
# Apply additional filters only if there are at least 2 candidates remaining
|
|
219
|
-
|
|
220
|
-
# Return the first valid number or fallback
|
|
221
|
-
# version 1.16.x remove two generate number
|
|
222
|
-
return (
|
|
223
|
-
numbers_list[1]
|
|
224
|
-
if numbers_list and len(numbers_list) <= 2
|
|
225
|
-
else start_num)
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
def __check_area_code(self, input_text):
|
|
230
|
-
"""
|
|
231
|
-
Processes the input Persian text to ensure it has a valid area
|
|
232
|
-
code and valid number format.
|
|
233
|
-
"""
|
|
234
|
-
# Remove all non-digit characters from the input
|
|
235
|
-
digits_only = re.sub(r"\D", "", input_text)
|
|
236
|
-
|
|
237
|
-
# Ensure there are enough digits to process
|
|
238
|
-
if len(digits_only) <= digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER:
|
|
239
|
-
return digits_only
|
|
240
|
-
|
|
241
|
-
# Extract the area code (first 4 digits) and the remaining number
|
|
242
|
-
area_code = digits_only[: digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER]
|
|
243
|
-
number_part = digits_only[digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER :]
|
|
244
|
-
|
|
245
|
-
# Generate valid number formats based on the remaining part
|
|
246
|
-
valid_number = self.__generate_valid_numbers(number_part)
|
|
247
|
-
# Return the formatted result
|
|
248
|
-
if not valid_number:
|
|
249
|
-
return digits_only
|
|
250
|
-
return f"{area_code}{valid_number}"
|
|
251
|
-
#Remove faction version 1.6.2
|
|
252
|
-
#def __insert_repeated_number(self, number_str):
|
|
253
|
-
"""
|
|
254
|
-
Finds the longest repeated sequence in the string and inserts one more instance of the repeated number
|
|
255
|
-
to extend the sequence, without using explicit `for` loops.
|
|
256
|
-
"""
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
def __add_single_repeating_digit_between_repeats(self, number_str):
|
|
260
|
-
# If the input string is already longer than the partial phone number length, return it as is
|
|
261
|
-
if len(number_str) > digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL:
|
|
262
|
-
return number_str
|
|
263
|
-
|
|
264
|
-
# Split the string into two parts: the prefix (e.g., area code) and the remainder
|
|
265
|
-
prefix = number_str[: digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER]
|
|
266
|
-
number_str = number_str[digit_mapping.PICK_FIRST_OR_LAST_LENGTH_NUMBER :]
|
|
267
|
-
|
|
268
|
-
result = []
|
|
269
|
-
i = 0
|
|
270
|
-
repeat_count = 1 # Track the count of consecutive repeating digits
|
|
271
|
-
|
|
272
|
-
while i < len(number_str):
|
|
273
|
-
# Check if the next digit is the same as the current one
|
|
274
|
-
if i > 0 and number_str[i] == number_str[i - 1] and int(number_str[i]) != 0:
|
|
275
|
-
repeat_count += 1
|
|
276
|
-
else:
|
|
277
|
-
repeat_count = 1 # Reset the repeat count if the digit changes
|
|
278
|
-
|
|
279
|
-
# If we have a sequence of three or more repeating digits
|
|
280
|
-
if repeat_count == digit_mapping.FIND_MIN_REPEAT_NUM:
|
|
281
|
-
# Add a single extra repeating digit and then reset the counter
|
|
282
|
-
result.append(number_str[i])
|
|
283
|
-
repeat_count = 1
|
|
284
|
-
|
|
285
|
-
# Add the current digit to the result
|
|
286
|
-
result.append(number_str[i])
|
|
287
|
-
i += 1
|
|
288
|
-
|
|
289
|
-
# Concatenate the prefix with the processed result and return the final string
|
|
290
|
-
return prefix + "".join(result)
|
|
291
|
-
|
|
292
|
-
def __process_phone_number(self, phone_number):
|
|
293
|
-
"""
|
|
294
|
-
Process the Persian phone number string by adding a leading zero if missing,
|
|
295
|
-
removing spaces, and checking the area code.
|
|
296
|
-
"""
|
|
297
|
-
# Add leading zero to the area code if it's missing
|
|
298
|
-
phone_number = self.__ensureLeadingZero(phone_number)
|
|
299
|
-
# Remove any spaces from the phone number
|
|
300
|
-
phone_number = "".join(phone_number.split())
|
|
301
|
-
|
|
302
|
-
# If the phone number is 10 digits or fewer, validate the area code
|
|
303
|
-
if len(phone_number) <= digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL:
|
|
304
|
-
validated_number = self.__check_area_code(phone_number)
|
|
305
|
-
return validated_number, phone_number
|
|
306
|
-
|
|
307
|
-
# If the phone number is longer than expected, return it as-is
|
|
308
|
-
return phone_number, phone_number
|
|
309
|
-
|
|
310
|
-
def __clean_and_concatenate_numbers(self, text):
|
|
311
|
-
"""
|
|
312
|
-
Process the input text by adding spaces around specific constructions,
|
|
313
|
-
removing unnecessary spaces, handling special constructions, and concatenating numbers.
|
|
314
|
-
"""
|
|
315
|
-
# Add spaces around the Persian word "تا"
|
|
316
|
-
text = self.__add_spaces_around_to(text)
|
|
317
|
-
# Remove spaces between numbers
|
|
318
|
-
text = self.__remove_spaces_between_numbers(text)
|
|
319
|
-
""" Handle special constructions like replacing "تا"
|
|
320
|
-
with corresponding numbers and removing it"""
|
|
321
|
-
text = self.__process_to_constructions(text)
|
|
322
|
-
# Concatenate digits without any spaces between them
|
|
323
|
-
text = re.sub(r"(\d)\s+(\d)", r"\1\2", text)
|
|
324
|
-
|
|
325
|
-
return text
|
|
326
|
-
|
|
327
|
-
def __english_to_persian(self, text):
|
|
328
|
-
farsi_to_latin = str.maketrans(
|
|
329
|
-
digit_mapping.WESTERN_DIGITS, digit_mapping.PERSIAN_DIGITS
|
|
330
|
-
)
|
|
331
|
-
return text.translate(farsi_to_latin)
|
|
332
|
-
|
|
333
|
-
def __persian_to_western(self, persian_number):
|
|
334
|
-
translation_table = str.maketrans(
|
|
335
|
-
digit_mapping.PERSIAN_DIGITS, digit_mapping.WESTERN_DIGITS
|
|
336
|
-
)
|
|
337
|
-
return persian_number.translate(translation_table)
|
|
338
|
-
|
|
339
|
-
def __remove_spaces_between_numbers(self, text):
|
|
340
|
-
# This regex will match spaces that are between two digits
|
|
341
|
-
return re.sub(r"(\d)\s+(\d)", r"\1\2", text)
|
|
342
|
-
|
|
343
|
-
def __process_patterned_numbers(self, number_sequence):
|
|
344
|
-
if number_sequence is None:
|
|
345
|
-
return number_sequence
|
|
346
|
-
# Convert Persian numbers to Western (English) numbers
|
|
347
|
-
western_number_sequence = self.__persian_to_western(number_sequence)
|
|
348
|
-
|
|
349
|
-
# Remove spaces between numbers to form a continuous sequence
|
|
350
|
-
contiguous_numbers = self.__remove_spaces_between_numbers(western_number_sequence)
|
|
351
|
-
|
|
352
|
-
# Clean up any remaining spaces
|
|
353
|
-
clean_number_sequence = "".join(contiguous_numbers.split())
|
|
354
|
-
|
|
355
|
-
# Clean and concatenate numbers to form the phone number
|
|
356
|
-
concatenated_numbers = self.__clean_and_concatenate_numbers(clean_number_sequence)
|
|
357
|
-
# Add single repeating digit between repeats in the phone number
|
|
358
|
-
|
|
359
|
-
# Process and validate the phone number
|
|
360
|
-
final_number, original_sequence = self.__process_phone_number(concatenated_numbers)
|
|
361
|
-
# Handle case when final_number is a list
|
|
362
|
-
if not isinstance(final_number, list):
|
|
363
|
-
processed_sequence = self.__add_single_repeating_digit_between_repeats(
|
|
364
|
-
final_number
|
|
365
|
-
)
|
|
366
|
-
return processed_sequence
|
|
367
|
-
processed_sequence = self.__add_single_repeating_digit_between_repeats(
|
|
368
|
-
final_number[0]
|
|
369
|
-
)
|
|
370
|
-
|
|
371
|
-
return processed_sequence
|
|
372
|
-
|
|
373
|
-
def __update_text_with_number(self, text, new_number, old_prefix, new_prefix):
|
|
374
|
-
"""
|
|
375
|
-
Update the text by replacing the old prefix with the new prefix and converting it to Persian.
|
|
376
|
-
"""
|
|
377
|
-
processed_text = self.__process_patterned_numbers(new_number)
|
|
378
|
-
if len(processed_text) != 11:
|
|
379
|
-
phone_number_proc = "".join(new_number.split())
|
|
380
|
-
processed_text = phone_number_proc
|
|
381
|
-
persian_text = self.__english_to_persian(processed_text)
|
|
382
|
-
#remove v.1.6.2
|
|
383
|
-
#text = re.sub(old_prefix, new_prefix, text, count=1)
|
|
384
|
-
return re.sub(re.escape(' ' * 8), persian_text, text)
|
|
385
|
-
|
|
386
|
-
def __update_general_case(self, text, number):
|
|
387
|
-
"""
|
|
388
|
-
Handle general number replacement cases and update the text.
|
|
389
|
-
"""
|
|
390
|
-
processed_text = self.__process_patterned_numbers(number)
|
|
391
|
-
if len(processed_text) != 11:
|
|
392
|
-
phone_number_proc = "".join(number.split())
|
|
393
|
-
processed_text = phone_number_proc
|
|
394
|
-
persian_text = self.__english_to_persian(processed_text)
|
|
395
|
-
return re.sub(re.escape(' ' * 8), persian_text, text)
|
|
396
|
-
|
|
397
|
-
def __process_number_replacement(self, text, number):
|
|
398
|
-
"""
|
|
399
|
-
Process specific number patterns like '۰۹۹' and '۹۹' and replace them with correct forms.
|
|
400
|
-
"""
|
|
401
|
-
if ((number[:3] == "۰۹۹")): #and (number[:4] != "۰۹۹۹")):
|
|
402
|
-
new_number = number.replace("۰۹۹", "۰۹۹۹", 1)
|
|
403
|
-
return self.__update_text_with_number(text, new_number, "۰۹۹", "۰۹۹۹")
|
|
404
|
-
if ((number[:3] == "099")): #and (number[:4] != "0999")):
|
|
405
|
-
new_number = number.replace("099", "0999", 1)
|
|
406
|
-
return self.__update_text_with_number(text, new_number, "099", "0999")
|
|
407
|
-
return self.__update_general_case(text, number)
|
|
408
|
-
|
|
409
|
-
def __normalize_format_text(self, text):
|
|
410
|
-
# اضافه کردن فاصله قبل و بعد از اعداد
|
|
411
|
-
text_with_spaces = re.sub(r'(\d+)', r' \1 ', text)
|
|
412
|
-
# حذف فضاهای اضافی
|
|
413
|
-
formatted_text = re.sub(r'\s+', ' ', text_with_spaces).strip()
|
|
414
|
-
return formatted_text
|
|
415
|
-
@staticmethod
|
|
416
|
-
def normalize_phone_numbers(text):
|
|
417
|
-
"""
|
|
418
|
-
Normalize phone numbers in the input Persian text by handling specific patterns
|
|
419
|
-
and replacing them with the correct numeric forms.
|
|
420
|
-
"""
|
|
421
|
-
|
|
422
|
-
# Compile regex pattern to match number patterns
|
|
423
|
-
pattern_regex = re.compile(
|
|
424
|
-
r"((0?[1-9][0-9]{1,3})|(۰?[۱-۹][۰-۹]{1,3}))[\s۰-۹0-9تا]*"
|
|
425
|
-
)
|
|
426
|
-
|
|
427
|
-
# Clean up text by removing extra spaces
|
|
428
|
-
cleaned_text = re.sub(r"\s{2,}", " ", text)
|
|
429
|
-
|
|
430
|
-
processor = PhoneNumberProcessor()
|
|
431
|
-
|
|
432
|
-
# Replace Persian number words with numeric equivalents
|
|
433
|
-
cleaned_text = processor.__replace_number_words(cleaned_text)
|
|
434
|
-
|
|
435
|
-
# Find all matching number patterns
|
|
436
|
-
matches = pattern_regex.finditer(cleaned_text)
|
|
437
|
-
|
|
438
|
-
results = [
|
|
439
|
-
match.group()
|
|
440
|
-
for match in matches
|
|
441
|
-
if digit_mapping.MIN_CHAR_CHECK
|
|
442
|
-
<= len(match.group())
|
|
443
|
-
<= digit_mapping.MAX_CHAR_CHECK
|
|
444
|
-
]
|
|
445
|
-
# Process each match to handle specific cases
|
|
446
|
-
for result in results:
|
|
447
|
-
digits = re.findall(r"\d", result)
|
|
448
|
-
if (
|
|
449
|
-
digit_mapping.MIN_NUMBER_FOR_GENERATE
|
|
450
|
-
<= len(digits)
|
|
451
|
-
<= digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL
|
|
452
|
-
):
|
|
453
|
-
|
|
454
|
-
cleaned_text = cleaned_text.replace(result, ' ' * 8)
|
|
455
|
-
cleaned_text = processor.__process_number_replacement(cleaned_text, result)
|
|
456
|
-
#new_version 2.0 fixed bug space between number and word
|
|
457
|
-
cleaned_text = processor.__normalize_format_text(cleaned_text)
|
|
458
|
-
return cleaned_text if results else text
|
|
459
|
-
# Iranian mobile phone number area code
|
|
460
|
-
|
|
461
|
-
def __replace_number_words(self, text):
|
|
462
|
-
pattern = r"(\d+)([a-zA-Z]+)"
|
|
463
|
-
text = re.sub(pattern, r"\1 \2", text)
|
|
464
|
-
for word, digit in digit_mapping.number_words.items():
|
|
465
|
-
text = re.sub(r"\b" + word + r"\b", digit, text, flags=re.IGNORECASE)
|
|
466
|
-
return text
|
|
467
|
-
|
|
468
|
-
def __first_to_in_phone_number(self, number_str):
|
|
469
|
-
"""
|
|
470
|
-
Identify patterns where a number is followed by "تا" and another number.
|
|
471
|
-
Generate all combinations for numbers preceding and following "تا"
|
|
472
|
-
and return them as a list.
|
|
473
|
-
"""
|
|
474
|
-
generated_numbers = []
|
|
475
|
-
ta_pattern = re.compile(r"(\d)(?=\s*تا\s*(\d+))")
|
|
476
|
-
matches = ta_pattern.findall(number_str)
|
|
477
|
-
|
|
478
|
-
if matches:
|
|
479
|
-
num1, num2 = matches[0] # Extract the first match
|
|
480
|
-
combinations = self.__generate_prefix_combinations(num1, num2)
|
|
481
|
-
for combo in combinations:
|
|
482
|
-
repeated_number = combo[1] * int(combo[0])
|
|
483
|
-
pattern = re.escape(f"{combo[0]} تا {combo[1]}")
|
|
484
|
-
generated_numbers.append(re.sub(pattern, repeated_number, number_str))
|
|
485
|
-
return generated_numbers
|
|
486
|
-
return number_str
|
|
487
|
-
|
|
488
|
-
def __ensureLeadingZero(self, number_string):
|
|
489
|
-
if not number_string.startswith('0'):
|
|
490
|
-
return '0' + number_string
|
|
491
|
-
return number_string
|
|
492
|
-
|
|
493
|
-
def __process_to_constructions(self, number_str):
|
|
494
|
-
"""
|
|
495
|
-
Process occurrences of 'تا' in the input string to generate possible number combinations.
|
|
496
|
-
This function calls `__first_to_in_phone_number` to handle the first 'تا' occurrence and
|
|
497
|
-
iteratively processes further occurrences if necessary.
|
|
498
|
-
"""
|
|
499
|
-
# Start processing the first 'تا'
|
|
500
|
-
#add version 1.6.2 check zero
|
|
501
|
-
number_str = self.__ensureLeadingZero(number_str)
|
|
502
|
-
generated_numbers = self.__first_to_in_phone_number(number_str)
|
|
503
|
-
|
|
504
|
-
if isinstance(generated_numbers, str):
|
|
505
|
-
return generated_numbers
|
|
506
|
-
|
|
507
|
-
ta_pattern = re.compile(r"(\d+)(?=\s*تا\s*(\d+))")
|
|
508
|
-
|
|
509
|
-
i = 0
|
|
510
|
-
while i < digit_mapping.MAX_ITERATIONS_CHECK and len(generated_numbers) >= 2:
|
|
511
|
-
current_number = generated_numbers[i]
|
|
512
|
-
if ta_pattern.search(current_number):
|
|
513
|
-
new_combinations = self.__first_to_in_phone_number(current_number)
|
|
514
|
-
generated_numbers.pop(i)
|
|
515
|
-
generated_numbers[i:i] = new_combinations
|
|
516
|
-
else:
|
|
517
|
-
i += 1
|
|
518
|
-
# Select the appropriate number from the generated list
|
|
519
|
-
#edit version 1.16.2 (dont have zero first)
|
|
520
|
-
|
|
521
|
-
selected_number = next(
|
|
522
|
-
(
|
|
523
|
-
num
|
|
524
|
-
for num in generated_numbers
|
|
525
|
-
if len("".join(num.split())) == digit_mapping.PHONE_NUMBER_LENGTH_FULL
|
|
526
|
-
),
|
|
527
|
-
next(
|
|
528
|
-
(
|
|
529
|
-
num
|
|
530
|
-
for num in generated_numbers
|
|
531
|
-
if len("".join(num.split()))
|
|
532
|
-
== digit_mapping.PHONE_NUMBER_LENGTH_PARTIAL
|
|
533
|
-
),
|
|
534
|
-
None,
|
|
535
|
-
),
|
|
536
|
-
)
|
|
537
|
-
return "".join(selected_number.split()) if selected_number else number_str
|
|
538
|
-
|
|
539
|
-
def __generate_prefix_combinations(self, prefix1: str, prefix2: str):
|
|
540
|
-
"""
|
|
541
|
-
Generate all possible combinations of the prefixes of `prefix1` and `prefix2`
|
|
542
|
-
using itertools to avoid explicit loops.
|
|
543
|
-
"""
|
|
544
|
-
len1, len2 = len(prefix1), len(prefix2)
|
|
545
|
-
indices_product = itertools.product(range(1, len1 + 1), range(1, len2 + 1))
|
|
546
|
-
|
|
547
|
-
# Generate the combinations using the indices
|
|
548
|
-
return [(prefix1[:i], prefix2[:j]) for i, j in indices_product]
|
|
549
|
-
|
|
550
|
-
def __add_spaces_around_to(self, input_text):
|
|
551
|
-
"""
|
|
552
|
-
Ensure "تا" has spaces around it and remove any redundant spaces.
|
|
553
|
-
"""
|
|
554
|
-
# Use regular expressions to add spaces around "تا"
|
|
555
|
-
modified_text = re.sub(r"\s*تا\s*", " تا ", input_text)
|
|
556
|
-
|
|
557
|
-
return modified_text
|
|
558
|
-
|
|
559
|
-
def __generate_next_numbers(self, num):
|
|
560
|
-
str_num = str(num)
|
|
561
|
-
next_numbers = set()
|
|
562
|
-
for i in range(len(str_num) - 1):
|
|
563
|
-
if i <= digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH :
|
|
564
|
-
current_digit = int(str_num[i])
|
|
565
|
-
next_digit = int(str_num[i + 1])
|
|
566
|
-
new_num = (
|
|
567
|
-
str_num[:i]
|
|
568
|
-
+ str(int(current_digit) * str(next_digit))
|
|
569
|
-
+ str_num[i + 2 :]
|
|
570
|
-
)
|
|
571
|
-
if digit_mapping.MIN_NUMBER_FOR_GENERATE <= len(new_num) <= digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH:
|
|
572
|
-
next_numbers.add(str(new_num))
|
|
573
|
-
return next_numbers
|
|
574
|
-
|
|
575
|
-
def __find_seven_chain_numbers(self, start_num):
|
|
576
|
-
current_numbers = {start_num}
|
|
577
|
-
all_numbers = set(current_numbers)
|
|
578
|
-
while len(all_numbers) < digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH:
|
|
579
|
-
next_numbers = set()
|
|
580
|
-
|
|
581
|
-
# Using a for loop to generate new numbers
|
|
582
|
-
for num in current_numbers:
|
|
583
|
-
generated_numbers = self.__generate_next_numbers(num)
|
|
584
|
-
new_numbers = generated_numbers - all_numbers
|
|
585
|
-
next_numbers.update(new_numbers)
|
|
586
|
-
|
|
587
|
-
if not next_numbers: # Exit loop if no new numbers are generated
|
|
588
|
-
break
|
|
589
|
-
|
|
590
|
-
all_numbers.update(next_numbers)
|
|
591
|
-
current_numbers = next_numbers
|
|
592
|
-
|
|
593
|
-
# Early exit if we find any 7-digit number
|
|
594
|
-
if any(
|
|
595
|
-
len(str(num)) == digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH
|
|
596
|
-
for num in next_numbers
|
|
597
|
-
):
|
|
598
|
-
break
|
|
599
|
-
# Collect all 7-digit numbers and return them sorted
|
|
600
|
-
seven_digit_numbers = sorted(
|
|
601
|
-
num
|
|
602
|
-
for num in all_numbers
|
|
603
|
-
if len(str(num)) == digit_mapping.SEVEN_DIGIT_PREFIX_LENGTH
|
|
604
|
-
)
|
|
605
|
-
return seven_digit_numbers
|
|
606
|
-
#remove 1.6.x
|
|
607
|
-
#def __apply_filter(self, numbers, index, condition):
|
|
608
|
-
# return (
|
|
609
|
-
# [num for num in numbers if condition(num)] if len(numbers) >= 2 else numbers
|
|
610
|
-
# )
|
|
611
|
-
|
|
612
|
-
def __generate_valid_numbers(self, start_num):
|
|
613
|
-
numbers_list = self.__find_seven_chain_numbers(start_num)
|
|
614
|
-
# If the number list length matches the valid criteria, return the first number
|
|
615
|
-
if len(numbers_list) == digit_mapping.GENERATIVE_VALID_NUMBER_COUNT:
|
|
616
|
-
return numbers_list[0] if numbers_list else start_num
|
|
617
|
-
|
|
618
|
-
start_num_str = str(start_num)
|
|
619
|
-
unique_count = len(set(start_num_str))
|
|
217
|
+
#start_num_str = str(start_num)
|
|
218
|
+
#unique_count = len(set(start_num_str))
|
|
620
219
|
# Step 1: Filter numbers with the same or one less unique digit count
|
|
621
220
|
# Remove version 1.6.x
|
|
622
221
|
# Apply additional filters only if there are at least 2 candidates remaining
|
|
@@ -624,8 +223,8 @@ class PhoneNumberNormalizer:
|
|
|
624
223
|
# Return the first valid number or fallback
|
|
625
224
|
# version 1.16.x remove two generate number
|
|
626
225
|
return (
|
|
627
|
-
numbers_list[1]
|
|
628
|
-
if numbers_list and len(numbers_list) <= 2
|
|
226
|
+
numbers_list[0]
|
|
227
|
+
if (numbers_list and len(numbers_list) == 1)
|
|
629
228
|
else start_num)
|
|
630
229
|
|
|
631
230
|
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: sharedkernel
|
|
3
|
-
Version: 1.6.8
|
|
3
|
+
Version: 1.6.10
|
|
4
4
|
Summary: sharekernel is a shared package between all python projects
|
|
5
5
|
Author: Smilinno
|
|
6
6
|
Description-Content-Type: text/markdown
|
|
@@ -9,7 +9,6 @@ Requires-Dist: requests
|
|
|
9
9
|
Requires-Dist: pymongo
|
|
10
10
|
Requires-Dist: fastapi==0.111.0
|
|
11
11
|
Requires-Dist: PyJWT
|
|
12
|
-
Requires-Dist: pymilvus
|
|
13
12
|
Requires-Dist: chromadb
|
|
14
13
|
Requires-Dist: persian_tools
|
|
15
14
|
Requires-Dist: sentry-sdk
|
|
@@ -20,6 +19,11 @@ Requires-Dist: persiantools
|
|
|
20
19
|
this a shared kernel package
|
|
21
20
|
|
|
22
21
|
# Change Log
|
|
22
|
+
### Version 1.6.10
|
|
23
|
+
- Add updated_on to BaseDocument
|
|
24
|
+
- Update phonenumber normalizer
|
|
25
|
+
### Version 1.6.9
|
|
26
|
+
- Delete milvus
|
|
23
27
|
### Version 1.6.8
|
|
24
28
|
- Fix Date Converter Bug
|
|
25
29
|
### Version 1.6.7
|
|
@@ -62,10 +66,10 @@ this a shared kernel package
|
|
|
62
66
|
### Version 1.2.0
|
|
63
67
|
- Implement Regex Masking
|
|
64
68
|
# Create Package
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
+
python -m pip install --upgrade build
|
|
70
|
+
python -m build
|
|
71
|
+
python -m pip install --upgrade twine
|
|
72
|
+
python -m twine upload dist/*
|
|
69
73
|
|
|
70
74
|
# Pypi
|
|
71
75
|
pip install sharedkernel
|
|
@@ -14,7 +14,6 @@ sharedkernel/database/__init__.py
|
|
|
14
14
|
sharedkernel/database/mongo_generic_repository.py
|
|
15
15
|
sharedkernel/database/vector_database_repository/__init__.py
|
|
16
16
|
sharedkernel/database/vector_database_repository/chroma_startegy.py
|
|
17
|
-
sharedkernel/database/vector_database_repository/milvus_strategy.py
|
|
18
17
|
sharedkernel/database/vector_database_repository/vector_database_repository.py
|
|
19
18
|
sharedkernel/database/vector_database_repository/vector_database_strategy.py
|
|
20
19
|
sharedkernel/enum/__init__.py
|
|
@@ -1,50 +0,0 @@
|
|
|
1
|
-
import numpy as np
|
|
2
|
-
from pymilvus import connections, Collection, FieldSchema, CollectionSchema, DataType
|
|
3
|
-
|
|
4
|
-
from .vector_database_strategy import VectorDatabaseStrategy
|
|
5
|
-
import uuid
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
class MilvusStrategy(VectorDatabaseStrategy):
|
|
9
|
-
def __init__(self, collection_name: str):
|
|
10
|
-
self.collection_name = collection_name
|
|
11
|
-
self.collection = None
|
|
12
|
-
|
|
13
|
-
def connect(self, host: str = "localhost", port: str = "19530"):
|
|
14
|
-
connections.connect(alias="default", host=host, port=port)
|
|
15
|
-
# Define fields
|
|
16
|
-
fields = [
|
|
17
|
-
FieldSchema(
|
|
18
|
-
name="id", dtype=DataType.VARCHAR, max_length=36, is_primary=True
|
|
19
|
-
),
|
|
20
|
-
FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=128),
|
|
21
|
-
]
|
|
22
|
-
schema = CollectionSchema(
|
|
23
|
-
fields, description="Vector collection", enable_dynamic_field=True
|
|
24
|
-
)
|
|
25
|
-
self.collection = Collection(name=self.collection_name, schema=schema)
|
|
26
|
-
|
|
27
|
-
if not self.collection.has_index():
|
|
28
|
-
self.collection.create_index(field_name="vector", index_params={"index_type": "IVF_FLAT", "metric_type": "L2", "params": {"nlist": 128}})
|
|
29
|
-
|
|
30
|
-
self.collection.load()
|
|
31
|
-
|
|
32
|
-
def insert_vector(self, vector: np.ndarray, metadata: dict) -> str:
|
|
33
|
-
id = str(uuid.uuid4())
|
|
34
|
-
self.collection.insert(data={"id": id, "vector": vector.tolist()})
|
|
35
|
-
|
|
36
|
-
return id
|
|
37
|
-
|
|
38
|
-
def search_vector(self, vector: np.ndarray, top_k: int):
|
|
39
|
-
search_params = {"metric_type": "L2", "params": {"nprobe": 10}}
|
|
40
|
-
results = self.collection.search(
|
|
41
|
-
[vector.tolist()], "vector", search_params, top_k
|
|
42
|
-
)
|
|
43
|
-
return results
|
|
44
|
-
|
|
45
|
-
def get_vector_by_id(self, id: str):
|
|
46
|
-
result = self.collection.query(expr=f"id=='{id}'",output_fields=["vector"])
|
|
47
|
-
return result
|
|
48
|
-
|
|
49
|
-
def delete_vector(self, id: str):
|
|
50
|
-
self.collection.delete(expr=f"id=='{id}'")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{sharedkernel-1.6.8 → sharedkernel-1.6.10}/sharedkernel/database/mongo_generic_repository.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|