stringextn 1.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stringextn/__init__.py +7 -0
- stringextn/cases.py +105 -0
- stringextn/clean.py +135 -0
- stringextn/contains.py +48 -0
- stringextn/fuzzy.py +28 -0
- stringextn/replace.py +30 -0
- stringextn/security.py +48 -0
- stringextn/slug.py +33 -0
- stringextn-1.0.1.dist-info/LICENSE +21 -0
- stringextn-1.0.1.dist-info/METADATA +170 -0
- stringextn-1.0.1.dist-info/RECORD +13 -0
- stringextn-1.0.1.dist-info/WHEEL +5 -0
- stringextn-1.0.1.dist-info/top_level.txt +1 -0
stringextn/__init__.py
ADDED
stringextn/cases.py
ADDED
@@ -0,0 +1,105 @@
import re

def to_snake(s: str) -> str:
    """Convert a string to snake_case format.

    Converts various string formats (camelCase, PascalCase, kebab-case, etc.)
    to snake_case by inserting underscores before uppercase letters and
    converting to lowercase.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to snake_case format with all characters in lowercase
        and words separated by underscores.

    Raises:
        None

    Edge cases:
        - Runs of consecutive uppercase letters (e.g. acronyms) are kept together
          rather than split into individual letters.
        - Spaces are converted to underscores.
        - Empty string returns an empty string.
        - Non-ASCII characters are preserved but not affected by case conversion.
    """
    s = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', s)
    s = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s)
    return s.replace(" ", "_").lower()

def to_camel(s: str) -> str:
    """Convert a string to camelCase format.

    Converts various string formats (snake_case, kebab-case, PascalCase, etc.)
    to camelCase where the first word is lowercase and subsequent words are
    title-cased without separators.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to camelCase format with the first character
        lowercase and subsequent words capitalized without separators.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with only separators returns an empty string.
        - Single word returns the word in lowercase.
        - Separators recognized: underscores (_), hyphens (-), and spaces ( ).
    """
    parts = re.split(r'[_\-\s]', s)
    return parts[0].lower() + "".join(p.title() for p in parts[1:])

def to_pascal(s: str) -> str:
    """Convert a string to PascalCase format.

    Converts various string formats (snake_case, kebab-case, camelCase, etc.)
    to PascalCase where each word is title-cased and concatenated without
    separators.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to PascalCase format with the first character
        and first character of each word capitalized without separators.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with only separators returns an empty string.
        - Single word returns the word with first character capitalized.
        - Separators recognized: underscores (_), hyphens (-), and spaces ( ).
    """
    parts = re.split(r'[_\-\s]', s)
    return "".join(p.title() for p in parts)

def to_kebab(s: str) -> str:
    """Convert a string to kebab-case format.

    Converts various string formats to kebab-case by first converting to
    snake_case, then replacing underscores with hyphens. The result has all
    lowercase letters with words separated by hyphens.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to kebab-case format with all characters in lowercase
        and words separated by hyphens.

    Raises:
        None

    Edge cases:
        - Runs of consecutive uppercase letters (e.g. acronyms) are kept together
          rather than split into individual letters.
        - Spaces are converted to hyphens.
        - Empty string returns an empty string.
        - Non-ASCII characters are preserved but not affected by case conversion.
    """
    return to_snake(s).replace("_", "-")
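An illustrative sketch of the expected behaviour of the module above (not part of the package); the outputs follow from the two regex passes (snake/kebab) and the split-and-title pass (camel/Pascal) shown in the implementation:

```python
from stringextn.cases import to_snake, to_camel, to_pascal, to_kebab

# camelCase input: an underscore is inserted before each capitalized word
print(to_snake("myVariableName"))     # my_variable_name

# Acronym runs stay together rather than being split letter by letter
print(to_snake("HTTPServer"))         # http_server

# snake_case / kebab-case inputs are split on _, - and whitespace
print(to_camel("user_profile_id"))    # userProfileId
print(to_pascal("my-variable-name"))  # MyVariableName

# kebab-case is snake_case with hyphens swapped in
print(to_kebab("myVariableName"))     # my-variable-name
```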
stringextn/clean.py
ADDED
@@ -0,0 +1,135 @@
import re
import html
import unicodedata

EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "]+", flags=re.UNICODE
)

def remove_html(s: str) -> str:
    """Remove all HTML tags from a string.

    Removes any content enclosed in angle brackets (<...>) which represents
    HTML/XML tags, leaving only the text content.

    Args:
        s: The input string potentially containing HTML tags.

    Returns:
        The string with all HTML tags removed, preserving the text content.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with no HTML tags returns the original string unchanged.
        - Malformed tags are handled by the non-greedy regex pattern.
        - HTML entities (e.g., &lt;) are NOT unescaped; use html.unescape separately.
    """
    return re.sub(r'<.*?>', '', s)

def remove_emoji(s: str) -> str:
    """Remove all emoji characters from a string.

    Removes emoji characters in the Unicode ranges defined by EMOJI_PATTERN,
    including emoticons, symbols, and flag sequences.

    Args:
        s: The input string potentially containing emoji characters.

    Returns:
        The string with all emoji characters removed.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with no emoji returns the original string unchanged.
        - Emoji in skin tone or zero-width-joiner sequences may not all be removed.
        - Non-emoji Unicode characters are preserved.
    """
    return EMOJI_PATTERN.sub('', s)

def normalize_spaces(s: str) -> str:
    """Normalize whitespace in a string.

    Replaces consecutive whitespace characters (spaces, tabs, newlines, etc.)
    with a single space and removes leading/trailing whitespace.

    Args:
        s: The input string with potentially irregular whitespace.

    Returns:
        The string with normalized whitespace: single spaces between words
        and no leading or trailing whitespace.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with only whitespace returns an empty string.
        - Non-breaking spaces and other Unicode whitespace are treated as whitespace.
    """
    return re.sub(r'\s+', ' ', s).strip()

def normalize_unicode(s: str) -> str:
    """Normalize Unicode characters to their canonical decomposed form.

    Applies NFKD (Compatibility Decomposition) normalization, which decomposes
    characters into their constituent parts and applies compatibility mappings.
    Useful for handling accented characters and compatibility characters.

    Args:
        s: The input string with potentially non-normalized Unicode characters.

    Returns:
        The string with Unicode characters normalized to NFKD form.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - ASCII-only strings are unchanged.
        - Accented characters are decomposed into base character + combining marks.
        - Some characters may be converted to different representations (e.g., ligatures).
    """
    return unicodedata.normalize("NFKD", s)

def clean_text(s: str) -> str:
    """Perform comprehensive text cleaning on a string.

    Applies a series of cleaning operations in sequence: HTML entity unescaping,
    HTML tag removal, emoji removal, Unicode normalization, and whitespace
    normalization. Provides a complete text sanitization pipeline.

    Args:
        s: The input string to clean.

    Returns:
        The cleaned string with HTML entities unescaped, tags removed, emoji
        removed, Unicode normalized, and whitespace normalized.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - Order of operations matters: HTML is processed before emoji and Unicode.
        - HTML entities are decoded before tag removal (e.g., &lt;tag&gt; becomes <tag> and is then removed).
        - The function calls remove_html, remove_emoji, normalize_unicode, and normalize_spaces internally.
    """
    s = html.unescape(s)
    s = remove_html(s)
    s = remove_emoji(s)
    s = normalize_unicode(s)
    s = normalize_spaces(s)
    return s
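An illustrative run of the pipeline above (not part of the package): `clean_text()` unescapes entities before stripping tags, so HTML-encoded markup is still removed, then emoji are dropped and whitespace is collapsed.

```python
from stringextn.clean import clean_text, remove_html, normalize_spaces

# Entities are decoded first, so &lt;b&gt; becomes <b> and is stripped as a tag
print(clean_text("&lt;b&gt;Hello &amp; 😀  world&lt;/b&gt;"))  # Hello & world

# Tag stripping alone does not decode entities
print(remove_html("<p>5 &lt; 10</p>"))  # 5 &lt; 10

# Runs of tabs, newlines, and spaces collapse to single spaces
print(normalize_spaces("  a\t b\n\nc  "))  # a b c
```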
stringextn/contains.py
ADDED
@@ -0,0 +1,48 @@
def contains_any(s: str, items) -> bool:
    """Check if a string contains any of the given items.

    Returns True if the string contains at least one of the items in the
    provided iterable. Uses substring matching for string items.

    Args:
        s: The string to search in.
        items: An iterable of items to check for in the string.

    Returns:
        True if the string contains any of the items, False otherwise.

    Raises:
        TypeError: If items is not iterable.

    Edge cases:
        - Empty items iterable returns False.
        - Empty string only returns True if items contains empty string.
        - Case-sensitive substring matching.
        - Matching is performed using the 'in' operator.
    """
    return any(i in s for i in items)

def contains_all(s: str, items) -> bool:
    """Check if a string contains all of the given items.

    Returns True if the string contains every item in the provided iterable.
    Uses substring matching for string items. Order does not matter.

    Args:
        s: The string to search in.
        items: An iterable of items to check for in the string.

    Returns:
        True if the string contains all of the items, False otherwise.

    Raises:
        TypeError: If items is not iterable.

    Edge cases:
        - Empty items iterable returns True.
        - Empty string only returns True if items is empty.
        - Case-sensitive substring matching.
        - Order of items in the string does not matter.
        - Matching is performed using the 'in' operator.
    """
    return all(i in s for i in items)
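A quick illustrative sketch of the any/all semantics above (not part of the package), including the vacuous-truth case for an empty `items` iterable:

```python
from stringextn.contains import contains_any, contains_all

text = "error: connection timed out"

print(contains_any(text, ["timed out", "refused"]))  # True  (substring match)
print(contains_any(text, ["ok", "success"]))         # False
print(contains_all(text, ["error", "connection"]))   # True
print(contains_all(text, ["error", "ERROR"]))        # False (case-sensitive)
print(contains_all(text, []))                         # True  (vacuous truth)
print(contains_any(text, []))                         # False
```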
stringextn/fuzzy.py
ADDED
@@ -0,0 +1,28 @@
from difflib import SequenceMatcher

def similarity(a: str, b: str) -> float:
    """Calculate the similarity ratio between two strings.

    Computes a similarity score between 0 and 1 using SequenceMatcher from
    the difflib module. The ratio represents the proportion of matching
    characters and sequences. Result is rounded to 3 decimal places.

    Args:
        a: The first string to compare.
        b: The second string to compare.

    Returns:
        A float between 0 and 1 representing the similarity ratio, rounded to
        3 decimal places. 1.0 indicates identical strings, 0.0 indicates no similarity.

    Raises:
        None

    Edge cases:
        - Empty strings: two empty strings return 1.0 (identical).
        - One empty string: the ratio is 0.0 regardless of the other string's length.
        - Case-sensitive comparison: 'abc' and 'ABC' are treated as different.
        - Whitespace is significant: leading/trailing spaces affect the result.
        - The function uses longest contiguous matching subsequences for comparison.
    """
    return round(SequenceMatcher(None, a, b).ratio(), 3)
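An illustrative sketch (not part of the package): difflib's ratio is 2*M/T, where M is the total length of the matching blocks and T is the combined length of both strings.

```python
from stringextn.fuzzy import similarity

print(similarity("color", "colour"))  # 0.909 -> 2 * 5 / 11 ("colo" + "r" match)
print(similarity("hello", "hello"))   # 1.0   -> identical strings
print(similarity("abc", "xyz"))       # 0.0   -> no characters in common
```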
stringextn/replace.py
ADDED
@@ -0,0 +1,30 @@
import re

def multi_replace(s: str, mapping: dict) -> str:
    """Replace multiple substrings in a string using a mapping dictionary.

    Performs simultaneous replacement of multiple substrings based on the
    provided mapping dictionary. Uses compiled regex pattern for efficient
    substitution. All keys are escaped to be treated as literal strings.

    Args:
        s: The input string to perform replacements on.
        mapping: A dictionary where keys are substrings to find and values
            are the replacements. Keys are treated as literal strings.

    Returns:
        The string with all mapped substrings replaced according to the mapping.

    Raises:
        None

    Edge cases:
        - Empty mapping dictionary raises KeyError: the joined pattern is empty,
          which matches at every position but has no replacement value.
        - Empty string returns an empty string.
        - Empty string keys are not ignored: the empty alternative matches at
          every position in the input.
        - Overlapping matches are not replaced multiple times; first match wins.
        - All special regex characters in keys are escaped automatically.
        - Order of replacements is determined by the order keys appear in the pattern.
    """
    pattern = re.compile("|".join(map(re.escape, mapping.keys())))
    return pattern.sub(lambda m: mapping[m.group(0)], s)
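An illustrative sketch of the single-pass, non-cascading behaviour above (not part of the package): because every key is folded into one compiled pattern, a replacement value is never itself re-matched.

```python
from stringextn.replace import multi_replace

# Single pass: "cat" -> "dog" and "dog" -> "cat" swap cleanly, no cascade
print(multi_replace("cat chases dog", {"cat": "dog", "dog": "cat"}))
# dog chases cat

# Keys are escaped, so regex metacharacters are matched literally
print(multi_replace("1+1=2", {"+": " plus ", "=": " equals "}))
# 1 plus 1 equals 2
```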
stringextn/security.py
ADDED
@@ -0,0 +1,48 @@
def mask_email(email: str) -> str:
    """Mask an email address for privacy by hiding most of the local part.

    Replaces most characters in the local part (before @) with asterisks,
    keeping only the first character visible. The domain part remains unchanged.
    Useful for displaying email addresses in logs or UI without full exposure.

    Args:
        email: A valid email address string containing exactly one @ symbol.

    Returns:
        The masked email with format: [first_char]***@[domain]

    Raises:
        ValueError: If the email does not contain exactly one @ symbol.
        IndexError: If the local part (before @) is empty.

    Edge cases:
        - Single character local part still shows that character, e.g. "u***@domain".
        - Email with no domain part after @ returns the masked local part with an empty domain.
        - Email with multiple @ symbols raises ValueError.
        - No validation is performed on email format beyond the @ requirement.
    """
    name, domain = email.split("@")
    return name[0] + "***@" + domain

def mask_phone(phone: str) -> str:
    """Mask a phone number for privacy by hiding all but the last 4 digits.

    Replaces all but the final 4 characters with asterisks. Useful for displaying
    phone numbers in logs or UI while maintaining minimal identifier information.

    Args:
        phone: A phone number string (any length, typically 10+ digits).

    Returns:
        The masked phone number with format: ****[last_4_chars]

    Raises:
        None

    Edge cases:
        - Phone number with 4 or fewer characters returns the original string unchanged.
        - Phone number with 5 characters returns one asterisk plus last 4 chars.
        - No validation is performed on phone format; any string is accepted.
        - Special characters and spaces are preserved (treated as regular characters).
    """
    return "*" * (len(phone) - 4) + phone[-4:]
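An illustrative sketch (not part of the package): `mask_phone()`'s asterisk count follows the input length, and `mask_email()` does no validation beyond the split on "@".

```python
from stringextn.security import mask_email, mask_phone

print(mask_email("user@example.com"))  # u***@example.com
print(mask_email("a@example.com"))     # a***@example.com (first char always shown)

print(mask_phone("5551234567"))        # ******4567 (6 asterisks for a 10-char input)
print(mask_phone("0134"))              # 0134 (4 chars or fewer returned unchanged)
```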
stringextn/slug.py
ADDED
@@ -0,0 +1,33 @@
import re
from .clean import clean_text

def slugify(s: str) -> str:
    """Convert a string to a URL-friendly slug format.

    Converts a string into a slug suitable for URLs by cleaning text,
    converting to lowercase, replacing non-alphanumeric characters with hyphens,
    and removing leading/trailing hyphens. Useful for generating URL-safe identifiers
    from titles or descriptions.

    Args:
        s: The input string to convert to a slug.

    Returns:
        A URL-friendly slug with lowercase alphanumeric characters separated
        by hyphens, with no leading or trailing hyphens.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with only special characters returns an empty string.
        - Consecutive special characters are collapsed into a single hyphen.
        - Leading/trailing hyphens are removed via strip.
        - HTML tags and emoji are removed by clean_text.
        - Unicode characters are normalized before conversion.
        - Spaces are converted to hyphens as part of the non-alphanumeric replacement.
    """
    s = clean_text(s).lower()
    s = re.sub(r'[^a-z0-9]+', '-', s)
    return s.strip('-')
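An illustrative sketch (not part of the package): `slugify()` runs the `clean_text()` pipeline first, so HTML and emoji disappear before the slug characters are chosen.

```python
from stringextn.slug import slugify

print(slugify("Hello, World!"))                     # hello-world
print(slugify("<h1>10 Tips &amp; Tricks 🚀</h1>"))  # 10-tips-tricks
print(slugify("!!!"))                               # "" (only special characters)
```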
stringextn-1.0.1.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Balaji Katta Venkatarathnam

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
stringextn-1.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,170 @@
Metadata-Version: 2.1
Name: stringextn
Version: 1.0.1
Summary: Pragmatic string utilities for APIs and data cleaning
Author: Balaji Katta Venkatarathnam
License: MIT
Project-URL: Homepage, https://github.com/balaji-kv/stringextn
Project-URL: Source, https://github.com/balaji-kv/stringextn
Project-URL: Issues, https://github.com/balaji-kv/stringextn/issues
Keywords: string,text,utilities,slug,mask,cleaning
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE

# stringextn

A pragmatic, zero-dependency Python library for practical string manipulation and text cleaning. `stringextn` provides battle-tested utilities for case conversion, HTML/emoji removal, substring matching, fuzzy comparison, security masking, URL slug generation, and multi-string replacement, all designed with real-world edge cases in mind.

## Installation

Install via pip:

```bash
pip install stringextn
```

Requires Python 3.8 or higher. No external dependencies.

## Quick Start

```python
from stringextn import (
    to_snake, to_camel, to_pascal, to_kebab,
    clean_text, remove_html, remove_emoji,
    contains_any, contains_all,
    similarity,
    multi_replace,
    mask_email, mask_phone,
    slugify
)

# Case conversion
to_snake("myVariableName")     # "my_variable_name"
to_camel("my_variable_name")   # "myVariableName"
to_pascal("my-variable-name")  # "MyVariableName"
to_kebab("myVariableName")     # "my-variable-name"

# Text cleaning
clean_text("<p>Hello & goodbye!</p>")  # "Hello & goodbye!"
remove_html("<div>Content</div>")      # "Content"
remove_emoji("Hello 😀 World 🚀")      # "Hello World "

# Substring matching
contains_any("hello world", ["world", "foo"])    # True
contains_all("hello world", ["hello", "world"])  # True

# Fuzzy string matching
similarity("kitten", "sitting")  # 0.615

# Multi-replace
multi_replace("abc abc abc", {"a": "X", "b": "Y"})  # "XYc XYc XYc"

# Privacy masking
mask_email("user@example.com")  # "u***@example.com"
mask_phone("5551234567")        # "******4567"

# URL-safe slugs
slugify("Hello, World! ✨")  # "hello-world"
```

## Features

### Case Conversion
- **`to_snake(s)`** – Converts to snake_case
- **`to_camel(s)`** – Converts to camelCase
- **`to_pascal(s)`** – Converts to PascalCase
- **`to_kebab(s)`** – Converts to kebab-case

Supports mixed input formats (camelCase, PascalCase, kebab-case, snake_case, space-separated).

### Text Cleaning
- **`clean_text(s)`** – Comprehensive cleaning pipeline: HTML entity unescaping, tag removal, emoji removal, Unicode normalization, and whitespace normalization
- **`remove_html(s)`** – Strips HTML/XML tags
- **`remove_emoji(s)`** – Removes emoji characters
- **`normalize_spaces(s)`** – Collapses whitespace and trims
- **`normalize_unicode(s)`** – Applies NFKD normalization for consistent character representation

### Substring Operations
- **`contains_any(s, items)`** – Returns True if string contains any item
- **`contains_all(s, items)`** – Returns True if string contains all items

Case-sensitive substring matching using Python's `in` operator.

### Fuzzy Matching
- **`similarity(a, b)`** – Returns similarity score (0.0–1.0) using difflib's SequenceMatcher
  - 1.0 = identical strings
  - 0.0 = no similarity
  - Rounded to 3 decimal places

### String Replacement
- **`multi_replace(s, mapping)`** – Performs simultaneous multi-string replacement
  - All keys are treated as literal strings (regex special chars auto-escaped)
  - Non-cascading: each substring is replaced exactly once

### Security & Privacy
- **`mask_email(email)`** – Hides all but first character of email local part
  - Format: `u***@example.com`
  - Raises `ValueError` if email doesn't contain exactly one `@`
- **`mask_phone(phone)`** – Hides all but last 4 digits
  - Format: `****1234`

### URL Slugs
- **`slugify(s)`** – Generates URL-safe slugs
  - Cleans text, lowercases, replaces non-alphanumeric with hyphens
  - Strips leading/trailing hyphens
  - Example: `"Hello, World! ✨"` → `"hello-world"`

## Performance & Behavior Notes

### Unicode Handling
- **NFKD Normalization**: The `clean_text()` and `slugify()` functions apply NFKD (Compatibility Decomposition) normalization, which:
  - Decomposes accented characters (é → e + ´)
  - Applies compatibility mappings (ﬁ → fi)
  - Ensures consistent character representation across different input encodings
- Emoji removal uses Unicode ranges and handles most emoticons and symbols; complex emoji sequences (skin tones, zero-width-joiner) may not be fully removed
- Non-ASCII characters in `to_snake()` and `to_camel()` are preserved but not affected by case conversion

### Edge Cases
- **Empty strings**: Most functions return empty strings; `contains_all("", [])` returns True (vacuous truth)
- **Whitespace**: Leading/trailing whitespace is preserved in case conversion; use `normalize_spaces()` first if needed
- **Consecutive separators**: `multi_replace()` and `slugify()` handle consecutive delimiters correctly (collapsed in slugs, replaced individually in multi_replace)
- **Special regex characters**: `multi_replace()` automatically escapes all regex special characters in mapping keys
- **Email masking**: No format validation; only checks for single `@` symbol
- **Phone masking**: Works with any string; no validation of format

### Performance
- All functions use compiled regular expressions or built-in operations for efficiency
- No external dependencies; pure Python implementation
- Suitable for high-volume text processing in APIs and data pipelines

## Testing

Run the test suite with pytest:

```bash
pytest tests/
```

## License

MIT License. See LICENSE file for details.

## Contributing

Contributions are welcome. Please ensure all tests pass and add tests for new functionality.

---

**Package**: stringextn v1.0.1
**GitHub**: [stringextn](https://github.com/balaji-kv/stringextn)
stringextn-1.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,13 @@
stringextn/__init__.py,sha256=tFSBvaarjSukVEPA7SjThaJNnjRt0F5f8HCq5wFxau4,154
stringextn/cases.py,sha256=lQ_SkuW1_im1szFHBD2Fw4uKXbwMrpWg0JWNDZjVYgM,3419
stringextn/clean.py,sha256=b6aAf0v8TIUnaaTBgPo9o2H_Sd46prwEaR5sYtrVOVI,4311
stringextn/contains.py,sha256=yV91oA-qMC6FkY1NLDa_c_38ZOHhQPZEWPkBUQwouEc,1603
stringextn/fuzzy.py,sha256=OeSuNjEkaXs6SmhZ1AVOIl3ipUgA6NFQ9ZUdT9vomX0,1122
stringextn/replace.py,sha256=ylqCw2DoRWrbR6SEaNkVyKZRl58Y16BsC5Tf6YZaN5U,1276
stringextn/security.py,sha256=jQRV8lF6hwoKsX8MpiAgrmV9hPrwVdrjGl7xtfKWygc,1887
stringextn/slug.py,sha256=MYDmAGlm0N8et-Njnik-deVn45EZ9MrVD5wtLH2M4bE,1208
stringextn-1.0.1.dist-info/LICENSE,sha256=PWxsZvoTQzW_6Oo3dR9F4Xc3B0UzzMyC55PVfOQb5r8,1084
stringextn-1.0.1.dist-info/METADATA,sha256=9SDLfhOReJOC4geE1nCl-0Fh7FZEhlNYinhjJMkdwwU,6476
stringextn-1.0.1.dist-info/WHEEL,sha256=WnJ8fYhv8N4SYVK2lLYNI6N0kVATA7b0piVUNvqIIJE,91
stringextn-1.0.1.dist-info/top_level.txt,sha256=tCN4hz31Y52rDem2AoOHJKNVhD-RdPysvksM9HpJMXI,11
stringextn-1.0.1.dist-info/RECORD,,
stringextn-1.0.1.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
stringextn