stringextn 1.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- stringextn-1.0.1/LICENSE +21 -0
- stringextn-1.0.1/PKG-INFO +170 -0
- stringextn-1.0.1/README.md +146 -0
- stringextn-1.0.1/pyproject.toml +40 -0
- stringextn-1.0.1/setup.cfg +4 -0
- stringextn-1.0.1/stringextn/__init__.py +7 -0
- stringextn-1.0.1/stringextn/cases.py +105 -0
- stringextn-1.0.1/stringextn/clean.py +135 -0
- stringextn-1.0.1/stringextn/contains.py +48 -0
- stringextn-1.0.1/stringextn/fuzzy.py +28 -0
- stringextn-1.0.1/stringextn/replace.py +30 -0
- stringextn-1.0.1/stringextn/security.py +48 -0
- stringextn-1.0.1/stringextn/slug.py +33 -0
- stringextn-1.0.1/stringextn.egg-info/PKG-INFO +170 -0
- stringextn-1.0.1/stringextn.egg-info/SOURCES.txt +22 -0
- stringextn-1.0.1/stringextn.egg-info/dependency_links.txt +1 -0
- stringextn-1.0.1/stringextn.egg-info/top_level.txt +2 -0
- stringextn-1.0.1/tests/test_cases.py +383 -0
- stringextn-1.0.1/tests/test_clean.py +546 -0
- stringextn-1.0.1/tests/test_contains.py +664 -0
- stringextn-1.0.1/tests/test_fuzzy.py +667 -0
- stringextn-1.0.1/tests/test_replace.py +730 -0
- stringextn-1.0.1/tests/test_security.py +700 -0
- stringextn-1.0.1/tests/test_slug.py +801 -0
stringextn-1.0.1/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Balaji Katta Venkatarathnam

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
stringextn-1.0.1/PKG-INFO
ADDED
@@ -0,0 +1,170 @@
Metadata-Version: 2.1
Name: stringextn
Version: 1.0.1
Summary: Pragmatic string utilities for APIs and data cleaning
Author: Balaji Katta Venkatarathnam
License: MIT
Project-URL: Homepage, https://github.com/balaji-kv/stringextn
Project-URL: Source, https://github.com/balaji-kv/stringextn
Project-URL: Issues, https://github.com/balaji-kv/stringextn/issues
Keywords: string,text,utilities,slug,mask,cleaning
Classifier: Development Status :: 5 - Production/Stable
Classifier: Intended Audience :: Developers
Classifier: Topic :: Software Development :: Libraries
Classifier: License :: OSI Approved :: MIT License
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Programming Language :: Python :: 3.11
Classifier: Programming Language :: Python :: 3.12
Requires-Python: >=3.8
Description-Content-Type: text/markdown
License-File: LICENSE

# stringextn

A pragmatic, zero-dependency Python library for practical string manipulation and text cleaning. `stringextn` provides battle-tested utilities for case conversion, HTML/emoji removal, substring matching, fuzzy comparison, security masking, URL slug generation, and multi-string replacement—all designed with real-world edge cases in mind.

## Installation

Install via pip:

```bash
pip install stringextn
```

Requires Python 3.8 or higher. No external dependencies.

## Quick Start

```python
from stringextn import (
    to_snake, to_camel, to_pascal, to_kebab,
    clean_text, remove_html, remove_emoji,
    contains_any, contains_all,
    similarity,
    multi_replace,
    mask_email, mask_phone,
    slugify
)

# Case conversion
to_snake("myVariableName")     # "my_variable_name"
to_camel("my_variable_name")   # "myVariableName"
to_pascal("my-variable-name")  # "MyVariableName"
to_kebab("myVariableName")     # "my-variable-name"

# Text cleaning
clean_text("<p>Hello & goodbye!</p>")  # "Hello & goodbye!"
remove_html("<div>Content</div>")      # "Content"
remove_emoji("Hello 👋 World 🌍")       # "Hello World "

# Substring matching
contains_any("hello world", ["world", "foo"])    # True
contains_all("hello world", ["hello", "world"])  # True

# Fuzzy string matching
similarity("kitten", "sitting")  # 0.571

# Multi-replace
multi_replace("abc abc abc", {"a": "X", "b": "Y"})  # "XYc XYc XYc"

# Privacy masking
mask_email("user@example.com")  # "u***@example.com"
mask_phone("5551234567")        # "****1234"

# URL-safe slugs
slugify("Hello, World! ✨")  # "hello-world"
```

## Features

### Case Conversion
- **`to_snake(s)`** – Converts to snake_case
- **`to_camel(s)`** – Converts to camelCase
- **`to_pascal(s)`** – Converts to PascalCase
- **`to_kebab(s)`** – Converts to kebab-case

Supports mixed input formats (camelCase, PascalCase, kebab-case, snake_case, space-separated).

### Text Cleaning
- **`clean_text(s)`** – Comprehensive cleaning pipeline: HTML entity unescaping, tag removal, emoji removal, Unicode normalization, and whitespace normalization
- **`remove_html(s)`** – Strips HTML/XML tags
- **`remove_emoji(s)`** – Removes emoji characters
- **`normalize_spaces(s)`** – Collapses whitespace and trims
- **`normalize_unicode(s)`** – Applies NFKD normalization for consistent character representation

### Substring Operations
- **`contains_any(s, items)`** – Returns True if string contains any item
- **`contains_all(s, items)`** – Returns True if string contains all items

Case-sensitive substring matching using Python's `in` operator.

### Fuzzy Matching
- **`similarity(a, b)`** – Returns similarity score (0.0–1.0) using difflib's SequenceMatcher
  - 1.0 = identical strings
  - 0.0 = no similarity
  - Rounded to 3 decimal places

### String Replacement
- **`multi_replace(s, mapping)`** – Performs simultaneous multi-string replacement
  - All keys are treated as literal strings (regex special chars auto-escaped)
  - Non-cascading: each substring is replaced exactly once

### Security & Privacy
- **`mask_email(email)`** – Hides all but first character of email local part
  - Format: `u***@example.com`
  - Raises `ValueError` if email doesn't contain exactly one `@`
- **`mask_phone(phone)`** – Hides all but last 4 digits
  - Format: `****1234`

### URL Slugs
- **`slugify(s)`** – Generates URL-safe slugs
  - Cleans text, lowercases, replaces non-alphanumeric with hyphens
  - Strips leading/trailing hyphens
  - Example: `"Hello, World! ✨"` → `"hello-world"`

## Performance & Behavior Notes

### Unicode Handling
- **NFKD Normalization**: The `clean_text()` and `slugify()` functions apply NFKD (Compatibility Decomposition) normalization, which:
  - Decomposes accented characters (é → e + ´)
  - Applies compatibility mappings (ﬁ → fi)
  - Ensures consistent character representation across different input encodings
- Emoji removal uses Unicode ranges and handles most emoticons and symbols; complex emoji sequences (skin tones, zero-width-joiner) may not be fully removed
- Non-ASCII characters in `to_snake()` and `to_camel()` are preserved but not affected by case conversion

### Edge Cases
- **Empty strings**: Most functions return empty strings; `contains_all("", [])` returns True (vacuous truth)
- **Whitespace**: Leading/trailing whitespace is preserved in case conversion; use `normalize_spaces()` first if needed
- **Consecutive separators**: `multi_replace()` and `slugify()` handle consecutive delimiters correctly (collapsed in slugs, replaced individually in multi_replace)
- **Special regex characters**: `multi_replace()` automatically escapes all regex special characters in mapping keys
- **Email masking**: No format validation; only checks for single `@` symbol
- **Phone masking**: Works with any string; no validation of format

### Performance
- All functions use compiled regular expressions or built-in operations for efficiency
- No external dependencies; pure Python implementation
- Suitable for high-volume text processing in APIs and data pipelines

## Testing

Run the test suite with pytest:

```bash
pytest tests/
```

## License

MIT License. See LICENSE file for details.

## Contributing

Contributions are welcome. Please ensure all tests pass and add tests for new functionality.

---

**Package**: stringextn v1.0.0
**GitHub**: [stringextn](https://github.com/balaji-kv/stringextn)
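The notes above describe `multi_replace()` as a simultaneous, non-cascading replacement whose mapping keys are escaped as regex literals. Since `replace.py` itself is not shown in this diff excerpt, the following is only a minimal sketch of that documented behavior; the helper name `multi_replace_sketch` and the longest-key-first ordering are assumptions, not the package's actual implementation.

```python
import re

def multi_replace_sketch(s: str, mapping: dict) -> str:
    """Illustrative only: simultaneous, non-cascading replacement as described above."""
    if not mapping:
        return s
    # Escape keys so regex metacharacters are treated literally; longer keys first
    # so overlapping keys resolve deterministically.
    pattern = re.compile("|".join(re.escape(k) for k in sorted(mapping, key=len, reverse=True)))
    return pattern.sub(lambda m: mapping[m.group(0)], s)

assert multi_replace_sketch("abc abc abc", {"a": "X", "b": "Y"}) == "XYc XYc XYc"
# Non-cascading: "a" -> "b" and "b" -> "a" swap cleanly without re-replacing results.
assert multi_replace_sketch("ab", {"a": "b", "b": "a"}) == "ba"
```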
stringextn-1.0.1/README.md
ADDED
@@ -0,0 +1,146 @@
# stringextn

A pragmatic, zero-dependency Python library for practical string manipulation and text cleaning. `stringextn` provides battle-tested utilities for case conversion, HTML/emoji removal, substring matching, fuzzy comparison, security masking, URL slug generation, and multi-string replacement—all designed with real-world edge cases in mind.

## Installation

Install via pip:

```bash
pip install stringextn
```

Requires Python 3.8 or higher. No external dependencies.

## Quick Start

```python
from stringextn import (
    to_snake, to_camel, to_pascal, to_kebab,
    clean_text, remove_html, remove_emoji,
    contains_any, contains_all,
    similarity,
    multi_replace,
    mask_email, mask_phone,
    slugify
)

# Case conversion
to_snake("myVariableName")     # "my_variable_name"
to_camel("my_variable_name")   # "myVariableName"
to_pascal("my-variable-name")  # "MyVariableName"
to_kebab("myVariableName")     # "my-variable-name"

# Text cleaning
clean_text("<p>Hello & goodbye!</p>")  # "Hello & goodbye!"
remove_html("<div>Content</div>")      # "Content"
remove_emoji("Hello 👋 World 🌍")       # "Hello World "

# Substring matching
contains_any("hello world", ["world", "foo"])    # True
contains_all("hello world", ["hello", "world"])  # True

# Fuzzy string matching
similarity("kitten", "sitting")  # 0.571

# Multi-replace
multi_replace("abc abc abc", {"a": "X", "b": "Y"})  # "XYc XYc XYc"

# Privacy masking
mask_email("user@example.com")  # "u***@example.com"
mask_phone("5551234567")        # "****1234"

# URL-safe slugs
slugify("Hello, World! ✨")  # "hello-world"
```

## Features

### Case Conversion
- **`to_snake(s)`** – Converts to snake_case
- **`to_camel(s)`** – Converts to camelCase
- **`to_pascal(s)`** – Converts to PascalCase
- **`to_kebab(s)`** – Converts to kebab-case

Supports mixed input formats (camelCase, PascalCase, kebab-case, snake_case, space-separated).

### Text Cleaning
- **`clean_text(s)`** – Comprehensive cleaning pipeline: HTML entity unescaping, tag removal, emoji removal, Unicode normalization, and whitespace normalization
- **`remove_html(s)`** – Strips HTML/XML tags
- **`remove_emoji(s)`** – Removes emoji characters
- **`normalize_spaces(s)`** – Collapses whitespace and trims
- **`normalize_unicode(s)`** – Applies NFKD normalization for consistent character representation

### Substring Operations
- **`contains_any(s, items)`** – Returns True if string contains any item
- **`contains_all(s, items)`** – Returns True if string contains all items

Case-sensitive substring matching using Python's `in` operator.

### Fuzzy Matching
- **`similarity(a, b)`** – Returns similarity score (0.0–1.0) using difflib's SequenceMatcher
  - 1.0 = identical strings
  - 0.0 = no similarity
  - Rounded to 3 decimal places

### String Replacement
- **`multi_replace(s, mapping)`** – Performs simultaneous multi-string replacement
  - All keys are treated as literal strings (regex special chars auto-escaped)
  - Non-cascading: each substring is replaced exactly once

### Security & Privacy
- **`mask_email(email)`** – Hides all but first character of email local part
  - Format: `u***@example.com`
  - Raises `ValueError` if email doesn't contain exactly one `@`
- **`mask_phone(phone)`** – Hides all but last 4 digits
  - Format: `****1234`

### URL Slugs
- **`slugify(s)`** – Generates URL-safe slugs
  - Cleans text, lowercases, replaces non-alphanumeric with hyphens
  - Strips leading/trailing hyphens
  - Example: `"Hello, World! ✨"` → `"hello-world"`

## Performance & Behavior Notes

### Unicode Handling
- **NFKD Normalization**: The `clean_text()` and `slugify()` functions apply NFKD (Compatibility Decomposition) normalization, which:
  - Decomposes accented characters (é → e + ´)
  - Applies compatibility mappings (ﬁ → fi)
  - Ensures consistent character representation across different input encodings
- Emoji removal uses Unicode ranges and handles most emoticons and symbols; complex emoji sequences (skin tones, zero-width-joiner) may not be fully removed
- Non-ASCII characters in `to_snake()` and `to_camel()` are preserved but not affected by case conversion

### Edge Cases
- **Empty strings**: Most functions return empty strings; `contains_all("", [])` returns True (vacuous truth)
- **Whitespace**: Leading/trailing whitespace is preserved in case conversion; use `normalize_spaces()` first if needed
- **Consecutive separators**: `multi_replace()` and `slugify()` handle consecutive delimiters correctly (collapsed in slugs, replaced individually in multi_replace)
- **Special regex characters**: `multi_replace()` automatically escapes all regex special characters in mapping keys
- **Email masking**: No format validation; only checks for single `@` symbol
- **Phone masking**: Works with any string; no validation of format

### Performance
- All functions use compiled regular expressions or built-in operations for efficiency
- No external dependencies; pure Python implementation
- Suitable for high-volume text processing in APIs and data pipelines

## Testing

Run the test suite with pytest:

```bash
pytest tests/
```

## License

MIT License. See LICENSE file for details.

## Contributing

Contributions are welcome. Please ensure all tests pass and add tests for new functionality.

---

**Package**: stringextn v1.0.0
**GitHub**: [stringextn](https://github.com/balaji-kv/stringextn)
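The README describes `slugify()` as: clean the text, lowercase it, replace non-alphanumeric characters with hyphens, and strip leading/trailing hyphens. `slug.py` is not included in this excerpt, so the sketch below only illustrates that description under those assumptions; the name `slugify_sketch` is made up for the example and is not the package's function.

```python
import html
import re
import unicodedata

def slugify_sketch(s: str) -> str:
    """Rough sketch of the slug behavior described in the README above."""
    s = unicodedata.normalize("NFKD", html.unescape(s))  # clean / normalize first
    s = re.sub(r"[^a-zA-Z0-9]+", "-", s.lower())         # non-alphanumeric runs -> hyphen
    return s.strip("-")                                   # drop leading/trailing hyphens

assert slugify_sketch("Hello, World! ✨") == "hello-world"
```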
stringextn-1.0.1/pyproject.toml
ADDED
@@ -0,0 +1,40 @@
[build-system]
requires = ["setuptools>=68.0", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "stringextn"
version = "1.0.1"
description = "Pragmatic string utilities for APIs and data cleaning"
readme = "README.md"
requires-python = ">=3.8"
license = { text = "MIT" }
authors = [
    { name = "Balaji Katta Venkatarathnam" }
]
keywords = ["string", "text", "utilities", "slug", "mask", "cleaning"]

classifiers = [
    "Development Status :: 5 - Production/Stable",
    "Intended Audience :: Developers",
    "Topic :: Software Development :: Libraries",
    "License :: OSI Approved :: MIT License",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.8",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Programming Language :: Python :: 3.12"
]

dependencies = []

[tool.setuptools.packages.find]
where = ["."]
exclude = ["venv*", "tests*"]

[project.urls]
Homepage = "https://github.com/balaji-kv/stringextn"
Source = "https://github.com/balaji-kv/stringextn"
Issues = "https://github.com/balaji-kv/stringextn/issues"
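For readers who want to see what the `[project]` table above exposes at runtime, a small standard-library check is sketched below. It assumes a distribution built from this configuration is installed in the current environment; the expected outputs in the comments are taken from the metadata shown above.

```python
from importlib.metadata import metadata, version

print(version("stringextn"))        # "1.0.1"
meta = metadata("stringextn")
print(meta["Summary"])              # "Pragmatic string utilities for APIs and data cleaning"
print(meta["Requires-Python"])      # ">=3.8"
```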
stringextn-1.0.1/stringextn/cases.py
ADDED
@@ -0,0 +1,105 @@
import re

def to_snake(s: str) -> str:
    """Convert a string to snake_case format.

    Converts various string formats (camelCase, PascalCase, kebab-case, etc.)
    to snake_case by inserting underscores before uppercase letters and
    converting to lowercase.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to snake_case format with all characters in lowercase
        and words separated by underscores.

    Raises:
        None

    Edge cases:
        - Consecutive uppercase letters are treated individually.
        - Spaces are converted to underscores.
        - Empty string returns an empty string.
        - Non-ASCII characters are preserved but not affected by case conversion.
    """
    s = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', s)
    s = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s)
    return s.replace(" ", "_").lower()

def to_camel(s: str) -> str:
    """Convert a string to camelCase format.

    Converts various string formats (snake_case, kebab-case, PascalCase, etc.)
    to camelCase where the first word is lowercase and subsequent words are
    title-cased without separators.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to camelCase format with the first character
        lowercase and subsequent words capitalized without separators.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with only separators returns an empty string.
        - Single word returns the word in lowercase.
        - Separators recognized: underscores (_), hyphens (-), and spaces ( ).
    """
    parts = re.split(r'[_\-\s]', s)
    return parts[0].lower() + "".join(p.title() for p in parts[1:])

def to_pascal(s: str) -> str:
    """Convert a string to PascalCase format.

    Converts various string formats (snake_case, kebab-case, camelCase, etc.)
    to PascalCase where each word is title-cased and concatenated without
    separators.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to PascalCase format with the first character
        and first character of each word capitalized without separators.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with only separators returns an empty string.
        - Single word returns the word with first character capitalized.
        - Separators recognized: underscores (_), hyphens (-), and spaces ( ).
    """
    parts = re.split(r'[_\-\s]', s)
    return "".join(p.title() for p in parts)

def to_kebab(s: str) -> str:
    """Convert a string to kebab-case format.

    Converts various string formats to kebab-case by first converting to
    snake_case, then replacing underscores with hyphens. The result has all
    lowercase letters with words separated by hyphens.

    Args:
        s: The input string to convert.

    Returns:
        The string converted to kebab-case format with all characters in lowercase
        and words separated by hyphens.

    Raises:
        None

    Edge cases:
        - Consecutive uppercase letters are treated individually.
        - Spaces are converted to hyphens.
        - Empty string returns an empty string.
        - Non-ASCII characters are preserved but not affected by case conversion.
    """
    return to_snake(s).replace("_", "-")
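A short usage sketch (not part of the package files) tracing the edge cases the docstrings above call out; the expected values follow directly from the two regex substitutions in `to_snake()` and the separator split in the other converters, assuming the package is installed.

```python
from stringextn import to_snake, to_camel, to_pascal, to_kebab

assert to_snake("myVariableName") == "my_variable_name"   # camelCase input
assert to_snake("hello world") == "hello_world"           # spaces become underscores
assert to_snake("") == ""                                  # empty string stays empty
assert to_camel("my_variable_name") == "myVariableName"    # split on _ - and whitespace
assert to_pascal("my-variable-name") == "MyVariableName"
assert to_kebab("myVariableName") == "my-variable-name"    # snake_case first, then _ -> -
```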
stringextn-1.0.1/stringextn/clean.py
ADDED
@@ -0,0 +1,135 @@
import re
import html
import unicodedata

EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F300-\U0001F5FF"
    "\U0001F680-\U0001F6FF"
    "\U0001F1E0-\U0001F1FF"
    "]+", flags=re.UNICODE
)

def remove_html(s: str) -> str:
    """Remove all HTML tags from a string.

    Removes any content enclosed in angle brackets (<...>) which represents
    HTML/XML tags, leaving only the text content.

    Args:
        s: The input string potentially containing HTML tags.

    Returns:
        The string with all HTML tags removed, preserving the text content.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with no HTML tags returns the original string unchanged.
        - Malformed tags are handled by the non-greedy regex pattern.
        - HTML entities (e.g., &lt;) are NOT unescaped; use html.unescape separately.
    """
    return re.sub(r'<.*?>', '', s)

def remove_emoji(s: str) -> str:
    """Remove all emoji characters from a string.

    Removes emoji characters in the Unicode ranges defined by EMOJI_PATTERN,
    including emoticons, symbols, and flag sequences.

    Args:
        s: The input string potentially containing emoji characters.

    Returns:
        The string with all emoji characters removed.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with no emoji returns the original string unchanged.
        - Emoji in skin tone or zero-width-joiner sequences may not all be removed.
        - Non-emoji Unicode characters are preserved.
    """
    return EMOJI_PATTERN.sub('', s)

def normalize_spaces(s: str) -> str:
    """Normalize whitespace in a string.

    Replaces consecutive whitespace characters (spaces, tabs, newlines, etc.)
    with a single space and removes leading/trailing whitespace.

    Args:
        s: The input string with potentially irregular whitespace.

    Returns:
        The string with normalized whitespace: single spaces between words
        and no leading or trailing whitespace.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - String with only whitespace returns an empty string.
        - Non-breaking spaces and other Unicode whitespace are treated as whitespace.
    """
    return re.sub(r'\s+', ' ', s).strip()

def normalize_unicode(s: str) -> str:
    """Normalize Unicode characters to their canonical decomposed form.

    Applies NFKD (Compatibility Decomposition) normalization, which decomposes
    characters into their constituent parts and applies compatibility mappings.
    Useful for handling accented characters and compatibility characters.

    Args:
        s: The input string with potentially non-normalized Unicode characters.

    Returns:
        The string with Unicode characters normalized to NFKD form.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - ASCII-only strings are unchanged.
        - Accented characters are decomposed into base character + combining marks.
        - Some characters may be converted to different representations (e.g., ligatures).
    """
    return unicodedata.normalize("NFKD", s)

def clean_text(s: str) -> str:
    """Perform comprehensive text cleaning on a string.

    Applies a series of cleaning operations in sequence: HTML entity unescaping,
    HTML tag removal, emoji removal, Unicode normalization, and whitespace
    normalization. Provides a complete text sanitization pipeline.

    Args:
        s: The input string to clean.

    Returns:
        The cleaned string with HTML entities unescaped, tags removed, emoji
        removed, Unicode normalized, and whitespace normalized.

    Raises:
        None

    Edge cases:
        - Empty string returns an empty string.
        - Order of operations matters: HTML is processed before emoji and Unicode.
        - HTML entities are decoded before tag removal (e.g., &lt;tag&gt; becomes <tag> then removed).
        - The function calls remove_html, remove_emoji, normalize_unicode, and normalize_spaces internally.
    """
    s = html.unescape(s)
    s = remove_html(s)
    s = remove_emoji(s)
    s = normalize_unicode(s)
    s = normalize_spaces(s)
    return s
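To make the ordering note in the `clean_text()` docstring concrete, here is a small worked example (not part of the package) that traces each stage of the pipeline defined above, assuming the package is installed.

```python
from stringextn import clean_text

# "&lt;b&gt;Hi&lt;/b&gt; 👋"  -> html.unescape     -> "<b>Hi</b> 👋"   (entities decoded first)
# "<b>Hi</b> 👋"              -> remove_html       -> "Hi 👋"          (tags removed after unescaping)
# "Hi 👋"                     -> remove_emoji      -> "Hi "            (U+1F44B falls in EMOJI_PATTERN)
# "Hi "                       -> normalize_unicode -> "Hi "            (ASCII, unchanged)
# "Hi "                       -> normalize_spaces  -> "Hi"             (trailing space trimmed)
assert clean_text("&lt;b&gt;Hi&lt;/b&gt; 👋") == "Hi"
```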
stringextn-1.0.1/stringextn/contains.py
ADDED
@@ -0,0 +1,48 @@
def contains_any(s: str, items) -> bool:
    """Check if a string contains any of the given items.

    Returns True if the string contains at least one of the items in the
    provided iterable. Uses substring matching for string items.

    Args:
        s: The string to search in.
        items: An iterable of items to check for in the string.

    Returns:
        True if the string contains any of the items, False otherwise.

    Raises:
        TypeError: If items is not iterable.

    Edge cases:
        - Empty items iterable returns False.
        - Empty string only returns True if items contains the empty string.
        - Case-sensitive substring matching.
        - Matching is performed using the 'in' operator.
    """
    return any(i in s for i in items)

def contains_all(s: str, items) -> bool:
    """Check if a string contains all of the given items.

    Returns True if the string contains every item in the provided iterable.
    Uses substring matching for string items. Order does not matter.

    Args:
        s: The string to search in.
        items: An iterable of items to check for in the string.

    Returns:
        True if the string contains all of the items, False otherwise.

    Raises:
        TypeError: If items is not iterable.

    Edge cases:
        - Empty items iterable returns True.
        - Empty string returns True only if items is empty or every item is the empty string.
        - Case-sensitive substring matching.
        - Order of items in the string does not matter.
        - Matching is performed using the 'in' operator.
    """
    return all(i in s for i in items)
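A few illustrative checks (not part of the package tests) for the vacuous-truth and case-sensitivity notes in the docstrings above, assuming the package is installed.

```python
from stringextn import contains_any, contains_all

assert contains_any("abc", []) is False           # no candidates, nothing can match
assert contains_all("abc", []) is True            # vacuous truth over an empty iterable
assert contains_any("Hello", ["hello"]) is False  # matching is case-sensitive
assert contains_all("hello world", ["lo w", "rld"]) is True  # plain 'in' substring checks
```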