tk-normalizer 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tk_normalizer-1.0.0/LICENSE +21 -0
- tk_normalizer-1.0.0/PKG-INFO +231 -0
- tk_normalizer-1.0.0/README.md +195 -0
- tk_normalizer-1.0.0/pyproject.toml +162 -0
- tk_normalizer-1.0.0/setup.cfg +4 -0
- tk_normalizer-1.0.0/src/tk_normalizer/__init__.py +37 -0
- tk_normalizer-1.0.0/src/tk_normalizer/normalizer.py +241 -0
- tk_normalizer-1.0.0/src/tk_normalizer.egg-info/PKG-INFO +231 -0
- tk_normalizer-1.0.0/src/tk_normalizer.egg-info/SOURCES.txt +11 -0
- tk_normalizer-1.0.0/src/tk_normalizer.egg-info/dependency_links.txt +1 -0
- tk_normalizer-1.0.0/src/tk_normalizer.egg-info/requires.txt +9 -0
- tk_normalizer-1.0.0/src/tk_normalizer.egg-info/top_level.txt +1 -0
- tk_normalizer-1.0.0/tests/test_normalizer.py +309 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Terakeet
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tk-normalizer
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: URL normalization library for consistent URL representation
|
|
5
|
+
Author-email: Terakeet <engineering@terakeet.com>
|
|
6
|
+
Maintainer-email: Terakeet <engineering@terakeet.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/terakeet/tk-normalizer
|
|
9
|
+
Project-URL: Repository, https://github.com/terakeet/tk-normalizer.git
|
|
10
|
+
Project-URL: Issues, https://github.com/terakeet/tk-normalizer/issues
|
|
11
|
+
Project-URL: Documentation, https://github.com/terakeet/tk-normalizer/blob/main/docs/ARCHITECTURE.md
|
|
12
|
+
Keywords: url,normalization,canonicalization,web,utilities
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Classifier: Topic :: Text Processing :: Filters
|
|
23
|
+
Classifier: Operating System :: OS Independent
|
|
24
|
+
Requires-Python: >=3.11
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-mock>=3.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: build>=0.10.0; extra == "dev"
|
|
34
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# tk-normalizer
|
|
38
|
+
|
|
39
|
+
[![PyPI version](https://img.shields.io/pypi/v/tk-normalizer.svg)](https://pypi.org/project/tk-normalizer/)
|
|
40
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/tk-normalizer.svg)](https://pypi.org/project/tk-normalizer/)
|
|
41
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
42
|
+
|
|
43
|
+
URL normalization library for creating consistent URL representations.
|
|
44
|
+
|
|
45
|
+
## Purpose
|
|
46
|
+
|
|
47
|
+
URL normalization makes URLs that differ only in letter case, protocol (scheme), `www` prefix, trailing slash, fragment, tracking parameters, or query parameter ordering compare as equivalent. This library produces normalized representations of URLs for consistent storage, comparison, and analysis.
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install tk-normalizer
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from tk_normalizer import normalize_url
|
|
59
|
+
|
|
60
|
+
# Simple usage with the convenience function
|
|
61
|
+
normalized = normalize_url("http://www.Example.com/path?b=2&a=1&utm_source=test")
|
|
62
|
+
print(normalized) # Output: example.com/path?a=1&b=2
|
|
63
|
+
|
|
64
|
+
# Using the class directly for more control
|
|
65
|
+
from tk_normalizer import TkNormalizer
|
|
66
|
+
|
|
67
|
+
normalizer = TkNormalizer("http://www.Example.com/path?b=2&a=1&utm_source=test")
|
|
68
|
+
print(normalizer.normalized_url) # example.com/path?a=1&b=2
|
|
69
|
+
print(normalizer.get_normalized_url()) # Full details including hashes
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Features
|
|
73
|
+
|
|
74
|
+
### URL Normalization
|
|
75
|
+
|
|
76
|
+
The following URLs all normalize to the same form:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
https://example.com/
|
|
80
|
+
http://www.example.com/
|
|
81
|
+
http://www.example.com
|
|
82
|
+
http://www.example.com/#my_search_engine_is_great
|
|
83
|
+
https://www.example.com/?utm_campaign=SomeGoogleCampaign
|
|
84
|
+
https://www.example.com/?utm_source=because&utm_campaign=SomeGoogleCampaign
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
All normalize to: `example.com`
|
|
88
|
+
|
|
89
|
+
### Normalization Process
|
|
90
|
+
|
|
91
|
+
URLs are normalized through the following steps:
|
|
92
|
+
|
|
93
|
+
- ✅ Protocol and www subdomains removed
|
|
94
|
+
- ✅ Lowercased
|
|
95
|
+
- ✅ Trailing slashes removed
|
|
96
|
+
- ✅ Query parameters reordered alphabetically by key
|
|
97
|
+
- ✅ Duplicate query parameter key/value pairs removed
|
|
98
|
+
- ✅ Common tracking parameters removed (utm_*, gclid, fbclid, etc.)
|
|
99
|
+
- ✅ Non-HTTP(S) protocols rejected
|
|
100
|
+
- ✅ Localhost URLs rejected
|
|
101
|
+
|
|
102
|
+
### Tracking Parameters Removed
|
|
103
|
+
|
|
104
|
+
The following tracking parameters are automatically removed during normalization:
|
|
105
|
+
|
|
106
|
+
- `utm_*` (all utm parameters)
|
|
107
|
+
- `gclid`, `fbclid`, `dclid` (click identifiers)
|
|
108
|
+
- `_ga`, `_gid`, `_fbp`, `_hjid` (analytics cookies)
|
|
109
|
+
- `msclkid` (Microsoft Ads)
|
|
110
|
+
- `aff_id`, `affid` (affiliate tracking)
|
|
111
|
+
- `referrer`, `adgroupid`, `srsltid`
|
|
112
|
+
|
|
113
|
+
## Advanced Usage
|
|
114
|
+
|
|
115
|
+
### Getting Full Normalization Details
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from tk_normalizer import TkNormalizer
|
|
119
|
+
|
|
120
|
+
normalizer = TkNormalizer("http://blog.example.com/page?b=2&a=1")
|
|
121
|
+
result = normalizer.get_normalized_url()
|
|
122
|
+
|
|
123
|
+
print(result)
|
|
124
|
+
# {
|
|
125
|
+
# 'normalized_url': 'blog.example.com/page?a=1&b=2',
|
|
126
|
+
# 'parent_normal_url': 'blog.example.com',
|
|
127
|
+
# 'root_normal_url': 'example.com',
|
|
128
|
+
# 'normalized_url_hash': '...',
|
|
129
|
+
# 'parent_normal_url_hash': '...',
|
|
130
|
+
# 'root_normal_url_hash': '...'
|
|
131
|
+
# }
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Error Handling
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from tk_normalizer import normalize_url, InvalidUrlException
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
normalized = normalize_url("not a valid url")
|
|
141
|
+
except InvalidUrlException as e:
|
|
142
|
+
print(f"Invalid URL: {e}")
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Accessing Individual Components
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from tk_normalizer import TkNormalizer
|
|
149
|
+
|
|
150
|
+
normalizer = TkNormalizer("https://blog.example.com/path?a=1")
|
|
151
|
+
|
|
152
|
+
# Access individual normalized components
|
|
153
|
+
print(normalizer.normalized_url) # blog.example.com/path?a=1
|
|
154
|
+
print(normalizer.parent_normal_url) # blog.example.com
|
|
155
|
+
print(normalizer.root_normal_url) # example.com
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Hashing
|
|
159
|
+
|
|
160
|
+
For efficient storage and comparison, SHA-256 hashes are computed for:
|
|
161
|
+
- The normalized URL
|
|
162
|
+
- The parent normal URL (domain without path)
|
|
163
|
+
- The root normal URL (root domain without subdomains)
|
|
164
|
+
|
|
165
|
+
This provides fixed-length representations suitable for database indexing.
|
|
166
|
+
|
|
167
|
+
## Important Caveats
|
|
168
|
+
|
|
169
|
+
While this normalization process works well for most use cases, there are some limitations:
|
|
170
|
+
|
|
171
|
+
1. **www subdomain removal**: Technically, `www.example.com` and `example.com` could serve different content, though this is rare in practice.
|
|
172
|
+
|
|
173
|
+
2. **Case sensitivity**: URLs are lowercased, but some servers are case-sensitive for paths.
|
|
174
|
+
|
|
175
|
+
3. **Tracking parameters**: New tracking parameters emerge over time and may not be in the removal list.
|
|
176
|
+
|
|
177
|
+
4. **Fragment removal**: URL fragments (#anchors) are removed, which may affect single-page applications.
|
|
178
|
+
|
|
179
|
+
## Development
|
|
180
|
+
|
|
181
|
+
### Setting Up Development Environment
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# Clone the repository
|
|
185
|
+
git clone https://github.com/terakeet/tk-normalizer.git
|
|
186
|
+
cd tk-normalizer
|
|
187
|
+
|
|
188
|
+
# Install development dependencies
|
|
189
|
+
pip install -e ".[dev]"
|
|
190
|
+
|
|
191
|
+
# Run tests
|
|
192
|
+
pytest
|
|
193
|
+
|
|
194
|
+
# Run tests with coverage
|
|
195
|
+
pytest --cov=tk_normalizer
|
|
196
|
+
|
|
197
|
+
# Run linting
|
|
198
|
+
ruff check src tests
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Running Tests
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
# Run all tests
|
|
205
|
+
pytest
|
|
206
|
+
|
|
207
|
+
# Run with verbose output
|
|
208
|
+
pytest -v
|
|
209
|
+
|
|
210
|
+
# Run specific test file
|
|
211
|
+
pytest tests/test_normalizer.py
|
|
212
|
+
|
|
213
|
+
# Run with coverage report
|
|
214
|
+
pytest --cov=tk_normalizer --cov-report=html
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Contributing
|
|
218
|
+
|
|
219
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
|
|
223
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
224
|
+
|
|
225
|
+
## Support
|
|
226
|
+
|
|
227
|
+
For issues and questions, please use the [GitHub issue tracker](https://github.com/terakeet/tk-normalizer/issues).
|
|
228
|
+
|
|
229
|
+
## Credits
|
|
230
|
+
|
|
231
|
+
Based on the URL normalization functionality from [tk-core](https://github.com/terakeet/tk-core), extracted and packaged for standalone use.
|
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
# tk-normalizer
|
|
2
|
+
|
|
3
|
+
[![PyPI version](https://img.shields.io/pypi/v/tk-normalizer.svg)](https://pypi.org/project/tk-normalizer/)
|
|
4
|
+
[![Python versions](https://img.shields.io/pypi/pyversions/tk-normalizer.svg)](https://pypi.org/project/tk-normalizer/)
|
|
5
|
+
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
URL normalization library for creating consistent URL representations.
|
|
8
|
+
|
|
9
|
+
## Purpose
|
|
10
|
+
|
|
11
|
+
URL normalization makes URLs that differ only in letter case, protocol (scheme), `www` prefix, trailing slash, fragment, tracking parameters, or query parameter ordering compare as equivalent. This library produces normalized representations of URLs for consistent storage, comparison, and analysis.
|
|
12
|
+
|
|
13
|
+
## Installation
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install tk-normalizer
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick Start
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from tk_normalizer import normalize_url
|
|
23
|
+
|
|
24
|
+
# Simple usage with the convenience function
|
|
25
|
+
normalized = normalize_url("http://www.Example.com/path?b=2&a=1&utm_source=test")
|
|
26
|
+
print(normalized) # Output: example.com/path?a=1&b=2
|
|
27
|
+
|
|
28
|
+
# Using the class directly for more control
|
|
29
|
+
from tk_normalizer import TkNormalizer
|
|
30
|
+
|
|
31
|
+
normalizer = TkNormalizer("http://www.Example.com/path?b=2&a=1&utm_source=test")
|
|
32
|
+
print(normalizer.normalized_url) # example.com/path?a=1&b=2
|
|
33
|
+
print(normalizer.get_normalized_url()) # Full details including hashes
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Features
|
|
37
|
+
|
|
38
|
+
### URL Normalization
|
|
39
|
+
|
|
40
|
+
The following URLs all normalize to the same form:
|
|
41
|
+
|
|
42
|
+
```
|
|
43
|
+
https://example.com/
|
|
44
|
+
http://www.example.com/
|
|
45
|
+
http://www.example.com
|
|
46
|
+
http://www.example.com/#my_search_engine_is_great
|
|
47
|
+
https://www.example.com/?utm_campaign=SomeGoogleCampaign
|
|
48
|
+
https://www.example.com/?utm_source=because&utm_campaign=SomeGoogleCampaign
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
All normalize to: `example.com`
|
|
52
|
+
|
|
53
|
+
### Normalization Process
|
|
54
|
+
|
|
55
|
+
URLs are normalized through the following steps:
|
|
56
|
+
|
|
57
|
+
- ✅ Protocol and www subdomains removed
|
|
58
|
+
- ✅ Lowercased
|
|
59
|
+
- ✅ Trailing slashes removed
|
|
60
|
+
- ✅ Query parameters reordered alphabetically by key
|
|
61
|
+
- ✅ Duplicate query parameter key/value pairs removed
|
|
62
|
+
- ✅ Common tracking parameters removed (utm_*, gclid, fbclid, etc.)
|
|
63
|
+
- ✅ Non-HTTP(S) protocols rejected
|
|
64
|
+
- ✅ Localhost URLs rejected
|
|
65
|
+
|
|
66
|
+
### Tracking Parameters Removed
|
|
67
|
+
|
|
68
|
+
The following tracking parameters are automatically removed during normalization:
|
|
69
|
+
|
|
70
|
+
- `utm_*` (all utm parameters)
|
|
71
|
+
- `gclid`, `fbclid`, `dclid` (click identifiers)
|
|
72
|
+
- `_ga`, `_gid`, `_fbp`, `_hjid` (analytics cookies)
|
|
73
|
+
- `msclkid` (Microsoft Ads)
|
|
74
|
+
- `aff_id`, `affid` (affiliate tracking)
|
|
75
|
+
- `referrer`, `adgroupid`, `srsltid`
|
|
76
|
+
|
|
77
|
+
## Advanced Usage
|
|
78
|
+
|
|
79
|
+
### Getting Full Normalization Details
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
from tk_normalizer import TkNormalizer
|
|
83
|
+
|
|
84
|
+
normalizer = TkNormalizer("http://blog.example.com/page?b=2&a=1")
|
|
85
|
+
result = normalizer.get_normalized_url()
|
|
86
|
+
|
|
87
|
+
print(result)
|
|
88
|
+
# {
|
|
89
|
+
# 'normalized_url': 'blog.example.com/page?a=1&b=2',
|
|
90
|
+
# 'parent_normal_url': 'blog.example.com',
|
|
91
|
+
# 'root_normal_url': 'example.com',
|
|
92
|
+
# 'normalized_url_hash': '...',
|
|
93
|
+
# 'parent_normal_url_hash': '...',
|
|
94
|
+
# 'root_normal_url_hash': '...'
|
|
95
|
+
# }
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
### Error Handling
|
|
99
|
+
|
|
100
|
+
```python
|
|
101
|
+
from tk_normalizer import normalize_url, InvalidUrlException
|
|
102
|
+
|
|
103
|
+
try:
|
|
104
|
+
normalized = normalize_url("not a valid url")
|
|
105
|
+
except InvalidUrlException as e:
|
|
106
|
+
print(f"Invalid URL: {e}")
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
### Accessing Individual Components
|
|
110
|
+
|
|
111
|
+
```python
|
|
112
|
+
from tk_normalizer import TkNormalizer
|
|
113
|
+
|
|
114
|
+
normalizer = TkNormalizer("https://blog.example.com/path?a=1")
|
|
115
|
+
|
|
116
|
+
# Access individual normalized components
|
|
117
|
+
print(normalizer.normalized_url) # blog.example.com/path?a=1
|
|
118
|
+
print(normalizer.parent_normal_url) # blog.example.com
|
|
119
|
+
print(normalizer.root_normal_url) # example.com
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
## Hashing
|
|
123
|
+
|
|
124
|
+
For efficient storage and comparison, SHA-256 hashes are computed for:
|
|
125
|
+
- The normalized URL
|
|
126
|
+
- The parent normal URL (domain without path)
|
|
127
|
+
- The root normal URL (root domain without subdomains)
|
|
128
|
+
|
|
129
|
+
This provides fixed-length representations suitable for database indexing.
|
|
130
|
+
|
|
131
|
+
## Important Caveats
|
|
132
|
+
|
|
133
|
+
While this normalization process works well for most use cases, there are some limitations:
|
|
134
|
+
|
|
135
|
+
1. **www subdomain removal**: Technically, `www.example.com` and `example.com` could serve different content, though this is rare in practice.
|
|
136
|
+
|
|
137
|
+
2. **Case sensitivity**: URLs are lowercased, but some servers are case-sensitive for paths.
|
|
138
|
+
|
|
139
|
+
3. **Tracking parameters**: New tracking parameters emerge over time and may not be in the removal list.
|
|
140
|
+
|
|
141
|
+
4. **Fragment removal**: URL fragments (#anchors) are removed, which may affect single-page applications.
|
|
142
|
+
|
|
143
|
+
## Development
|
|
144
|
+
|
|
145
|
+
### Setting Up Development Environment
|
|
146
|
+
|
|
147
|
+
```bash
|
|
148
|
+
# Clone the repository
|
|
149
|
+
git clone https://github.com/terakeet/tk-normalizer.git
|
|
150
|
+
cd tk-normalizer
|
|
151
|
+
|
|
152
|
+
# Install development dependencies
|
|
153
|
+
pip install -e ".[dev]"
|
|
154
|
+
|
|
155
|
+
# Run tests
|
|
156
|
+
pytest
|
|
157
|
+
|
|
158
|
+
# Run tests with coverage
|
|
159
|
+
pytest --cov=tk_normalizer
|
|
160
|
+
|
|
161
|
+
# Run linting
|
|
162
|
+
ruff check src tests
|
|
163
|
+
```
|
|
164
|
+
|
|
165
|
+
### Running Tests
|
|
166
|
+
|
|
167
|
+
```bash
|
|
168
|
+
# Run all tests
|
|
169
|
+
pytest
|
|
170
|
+
|
|
171
|
+
# Run with verbose output
|
|
172
|
+
pytest -v
|
|
173
|
+
|
|
174
|
+
# Run specific test file
|
|
175
|
+
pytest tests/test_normalizer.py
|
|
176
|
+
|
|
177
|
+
# Run with coverage report
|
|
178
|
+
pytest --cov=tk_normalizer --cov-report=html
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
## Contributing
|
|
182
|
+
|
|
183
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
184
|
+
|
|
185
|
+
## License
|
|
186
|
+
|
|
187
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
188
|
+
|
|
189
|
+
## Support
|
|
190
|
+
|
|
191
|
+
For issues and questions, please use the [GitHub issue tracker](https://github.com/terakeet/tk-normalizer/issues).
|
|
192
|
+
|
|
193
|
+
## Credits
|
|
194
|
+
|
|
195
|
+
Based on the URL normalization functionality from [tk-core](https://github.com/terakeet/tk-core), extracted and packaged for standalone use.
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tk-normalizer"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "URL normalization library for consistent URL representation"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.11"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Terakeet", email = "engineering@terakeet.com"}
|
|
14
|
+
]
|
|
15
|
+
maintainers = [
|
|
16
|
+
{name = "Terakeet", email = "engineering@terakeet.com"}
|
|
17
|
+
]
|
|
18
|
+
keywords = ["url", "normalization", "canonicalization", "web", "utilities"]
|
|
19
|
+
classifiers = [
|
|
20
|
+
"Development Status :: 4 - Beta",
|
|
21
|
+
"Intended Audience :: Developers",
|
|
22
|
+
"License :: OSI Approved :: MIT License",
|
|
23
|
+
"Programming Language :: Python :: 3",
|
|
24
|
+
"Programming Language :: Python :: 3.11",
|
|
25
|
+
"Programming Language :: Python :: 3.12",
|
|
26
|
+
"Programming Language :: Python :: 3.13",
|
|
27
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
28
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
29
|
+
"Topic :: Text Processing :: Filters",
|
|
30
|
+
"Operating System :: OS Independent",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/terakeet/tk-normalizer"
|
|
35
|
+
Repository = "https://github.com/terakeet/tk-normalizer.git"
|
|
36
|
+
Issues = "https://github.com/terakeet/tk-normalizer/issues"
|
|
37
|
+
Documentation = "https://github.com/terakeet/tk-normalizer/blob/main/docs/ARCHITECTURE.md"
|
|
38
|
+
|
|
39
|
+
[project.optional-dependencies]
|
|
40
|
+
dev = [
|
|
41
|
+
"pytest>=7.0.0",
|
|
42
|
+
"pytest-mock>=3.0.0",
|
|
43
|
+
"pytest-cov>=4.0.0",
|
|
44
|
+
"ruff>=0.1.0",
|
|
45
|
+
"pre-commit>=3.0.0",
|
|
46
|
+
"build>=0.10.0",
|
|
47
|
+
"twine>=4.0.0",
|
|
48
|
+
]
|
|
49
|
+
|
|
50
|
+
[tool.setuptools.packages.find]
|
|
51
|
+
where = ["src"]
|
|
52
|
+
|
|
53
|
+
[tool.setuptools.package-data]
|
|
54
|
+
"*" = ["py.typed"]
|
|
55
|
+
|
|
56
|
+
[tool.pytest.ini_options]
|
|
57
|
+
pythonpath = ["src"]
|
|
58
|
+
testpaths = ["tests"]
|
|
59
|
+
python_files = ["test_*.py", "*_test.py"]
|
|
60
|
+
python_classes = ["Test*"]
|
|
61
|
+
python_functions = ["test_*"]
|
|
62
|
+
addopts = [
|
|
63
|
+
"-ra",
|
|
64
|
+
"--strict-markers",
|
|
65
|
+
"--strict-config",
|
|
66
|
+
"--cov=tk_normalizer",
|
|
67
|
+
"--cov-report=term-missing",
|
|
68
|
+
"--cov-branch",
|
|
69
|
+
"-vv",
|
|
70
|
+
]
|
|
71
|
+
log_cli = true
|
|
72
|
+
log_cli_level = "INFO"
|
|
73
|
+
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
|
|
74
|
+
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
|
|
75
|
+
filterwarnings = [
|
|
76
|
+
"ignore::DeprecationWarning",
|
|
77
|
+
"ignore::PendingDeprecationWarning",
|
|
78
|
+
]
|
|
79
|
+
markers = [
|
|
80
|
+
"unit: Unit tests",
|
|
81
|
+
"integration: Integration tests",
|
|
82
|
+
"slow: Slow tests",
|
|
83
|
+
]
|
|
84
|
+
|
|
85
|
+
[tool.ruff]
|
|
86
|
+
line-length = 120
|
|
87
|
+
target-version = "py311"
|
|
88
|
+
exclude = [
|
|
89
|
+
".git",
|
|
90
|
+
".venv",
|
|
91
|
+
"venv",
|
|
92
|
+
"__pycache__",
|
|
93
|
+
".ruff_cache",
|
|
94
|
+
"build",
|
|
95
|
+
"dist",
|
|
96
|
+
"*.egg-info",
|
|
97
|
+
"htmlcov",
|
|
98
|
+
]
|
|
99
|
+
|
|
100
|
+
[tool.ruff.lint]
|
|
101
|
+
select = [
|
|
102
|
+
"E", # pycodestyle errors
|
|
103
|
+
"W", # pycodestyle warnings
|
|
104
|
+
"F", # pyflakes
|
|
105
|
+
"I", # isort
|
|
106
|
+
"B", # flake8-bugbear
|
|
107
|
+
"C4", # flake8-comprehensions
|
|
108
|
+
"UP", # pyupgrade
|
|
109
|
+
"ARG", # flake8-unused-arguments
|
|
110
|
+
"SIM", # flake8-simplify
|
|
111
|
+
"S", # flake8-bandit
|
|
112
|
+
"ANN", # annotations
|
|
113
|
+
]
|
|
114
|
+
ignore = [
|
|
115
|
+
"E501", # line too long (handled by formatter)
|
|
116
|
+
"B008", # do not perform function calls in argument defaults
|
|
117
|
+
"S101", # use of assert in tests is fine
|
|
118
|
+
"ANN002", # Args type hint skip
|
|
119
|
+
"ANN003", # Kwargs type hint skip
|
|
120
|
+
"ANN204", # Missing return type annotation for special method `__init__`
|
|
121
|
+
"ANN401", # Allow Any type declaration
|
|
122
|
+
"SIM108", # ternary operator required
|
|
123
|
+
]
|
|
124
|
+
|
|
125
|
+
[tool.ruff.lint.per-file-ignores]
|
|
126
|
+
"tests/*" = ["S101", "ARG001", "ARG002", "S105", "S106", "ANN201"]
|
|
127
|
+
"test_*.py" = ["S101", "ARG001", "ARG002", "S105", "S106", "ANN201"]
|
|
128
|
+
|
|
129
|
+
[tool.ruff.format]
|
|
130
|
+
quote-style = "double"
|
|
131
|
+
indent-style = "space"
|
|
132
|
+
skip-magic-trailing-comma = false
|
|
133
|
+
line-ending = "lf"
|
|
134
|
+
|
|
135
|
+
[tool.coverage.run]
|
|
136
|
+
branch = true
|
|
137
|
+
source = ["src/tk_normalizer"]
|
|
138
|
+
omit = [
|
|
139
|
+
"tests/*",
|
|
140
|
+
"test_*.py",
|
|
141
|
+
".venv/*",
|
|
142
|
+
"venv/*",
|
|
143
|
+
"*/site-packages/*",
|
|
144
|
+
"*/__init__.py",
|
|
145
|
+
]
|
|
146
|
+
|
|
147
|
+
[tool.coverage.report]
|
|
148
|
+
exclude_lines = [
|
|
149
|
+
"pragma: no cover",
|
|
150
|
+
"def __repr__",
|
|
151
|
+
"if self.debug:",
|
|
152
|
+
"if __name__ == .__main__.:",
|
|
153
|
+
"raise AssertionError",
|
|
154
|
+
"raise NotImplementedError",
|
|
155
|
+
"pass",
|
|
156
|
+
"except ImportError:",
|
|
157
|
+
"if TYPE_CHECKING:",
|
|
158
|
+
]
|
|
159
|
+
show_missing = true
|
|
160
|
+
skip_covered = false
|
|
161
|
+
precision = 2
|
|
162
|
+
fail_under = 0
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
tk-normalizer: URL normalization library for consistent URL representation.
|
|
3
|
+
|
|
4
|
+
This library provides URL normalization functionality to create normalized
|
|
5
|
+
representations of URLs, handling variations in protocols, subdomains,
|
|
6
|
+
query parameters, and more.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .normalizer import InvalidUrlException, TkNormalizer
|
|
10
|
+
|
|
11
|
+
# Runtime-visible package version. Must match `version` in pyproject.toml
# (1.0.0 for this release); it previously lagged behind at "0.1.0".
__version__ = "1.0.0"
|
|
12
|
+
__all__ = ["TkNormalizer", "InvalidUrlException", "normalize_url"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def normalize_url(url: str) -> str:
    """
    Return the normalized string form of a URL.

    Convenience wrapper for callers that only need the normalized string:
    it builds a ``TkNormalizer`` for ``url`` and returns that instance's
    ``normalized_url`` attribute.

    Args:
        url: The URL string to normalize.

    Returns:
        The normalized URL string.

    Raises:
        InvalidUrlException: If the URL is invalid or cannot be normalized.

    Example:
        >>> from tk_normalizer import normalize_url
        >>> normalize_url("http://www.Example.com/path?b=2&a=1&utm_source=test")
        'example.com/path?a=1&b=2'
    """
    # The instance is only needed long enough to read its normalized string.
    return TkNormalizer(url).normalized_url
|