telugu-language-tools 4.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- telugu_language_tools-4.0.2.dist-info/METADATA +956 -0
- telugu_language_tools-4.0.2.dist-info/RECORD +14 -0
- telugu_language_tools-4.0.2.dist-info/WHEEL +5 -0
- telugu_language_tools-4.0.2.dist-info/licenses/LICENSE +21 -0
- telugu_language_tools-4.0.2.dist-info/top_level.txt +1 -0
- telugu_lib/__init__.py +197 -0
- telugu_lib/advanced.py +717 -0
- telugu_lib/cluster_generator.py +399 -0
- telugu_lib/context_rules.py +568 -0
- telugu_lib/enhanced_dictionary.py +516 -0
- telugu_lib/iso15919_mappings.py +430 -0
- telugu_lib/sentence_tools.py +214 -0
- telugu_lib/text_tools.py +108 -0
- telugu_lib/transliterate.py +972 -0
telugu_lib/text_tools.py
ADDED
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
def count_telugu_chars(text):
    """
    Return how many characters of ``text`` lie in the Telugu Unicode block.

    Args:
        text (str): Input text. ``None`` and non-string values are tolerated.

    Returns:
        int: Number of characters in the range U+0C00..U+0C7F, or 0 when
            ``text`` is not a string.
    """
    # isinstance() already rejects None, so one guard covers both cases.
    if not isinstance(text, str):
        return 0

    total = 0
    for char in text:
        if '\u0C00' <= char <= '\u0C7F':
            total += 1
    return total
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def count_english_chars(text):
    """
    Return how many ASCII letters ``text`` contains.

    Args:
        text (str): Input text. ``None`` and non-string values are tolerated.

    Returns:
        int: Number of characters matching a-z or A-Z, or 0 when ``text``
            is not a string.
    """
    if not isinstance(text, str):
        return 0

    import re

    ascii_letter = re.compile(r'[a-zA-Z]')
    # Each match is exactly one character, so counting matches counts letters.
    return sum(1 for _ in ascii_letter.finditer(text))
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def count_digits(text):
    """
    Return how many digit characters ``text`` contains.

    A character counts as a digit when ``str.isdigit()`` is true for it, so
    non-ASCII digits (for example Telugu digits) are included.

    Args:
        text (str): Input text. ``None`` and non-string values are tolerated.

    Returns:
        int: Number of digit characters, or 0 when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return 0

    digit_chars = [char for char in text if char.isdigit()]
    return len(digit_chars)
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def is_telugu_text(text):
    """
    Report whether ``text`` contains at least one Telugu character.

    Args:
        text (str): Input text. ``None`` and non-string values are tolerated.

    Returns:
        bool: True if any character lies in U+0C00..U+0C7F; False otherwise,
            including when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return False

    for char in text:
        if '\u0C00' <= char <= '\u0C7F':
            # One hit is enough; stop scanning.
            return True
    return False
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def split_telugu_words(text):
    """
    Extract the Telugu words from ``text``.

    A word is a maximal run of characters from the Telugu block
    (U+0C00..U+0C7F). ZERO WIDTH NON-JOINER (U+200C) and ZERO WIDTH JOINER
    (U+200D) are kept when they sit between Telugu characters: they are
    invisible shaping controls used inside Telugu words, not word
    separators. Joiners at the very start or end of a run are not included.

    Args:
        text (str): Input text. ``None`` and non-string values are tolerated.

    Returns:
        list: Telugu words in order of appearance; an empty list when there
            are none or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return []

    import re

    # Telugu run, optionally continued by joiner(s) + another Telugu run.
    # Without the joiner clause a word containing ZWNJ/ZWJ would be split
    # into two pieces and inflate word counts.
    return re.findall(
        r'[\u0C00-\u0C7F]+(?:[\u200C\u200D]+[\u0C00-\u0C7F]+)*',
        text,
    )
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def get_text_stats(text):
    """
    Collect character and word statistics for ``text``.

    Args:
        text (str): Input text. ``None`` and non-string values are tolerated.

    Returns:
        dict: Mapping with the keys ``total_chars``, ``telugu_chars``,
            ``english_chars``, ``digits``, ``telugu_words`` (all int) and
            ``is_telugu`` (bool). For non-string input every count is 0 and
            ``is_telugu`` is False.
    """
    if not isinstance(text, str):
        # Same shape as the real result so callers can index it blindly.
        empty = dict.fromkeys(
            ('total_chars', 'telugu_chars', 'english_chars', 'digits', 'telugu_words'),
            0,
        )
        empty['is_telugu'] = False
        return empty

    summary = {}
    summary['total_chars'] = len(text)
    summary['telugu_chars'] = count_telugu_chars(text)
    summary['english_chars'] = count_english_chars(text)
    summary['digits'] = count_digits(text)
    summary['telugu_words'] = len(split_telugu_words(text))
    summary['is_telugu'] = is_telugu_text(text)
    return summary
|