tk-normalizer 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tk_normalizer/__init__.py +37 -0
- tk_normalizer/normalizer.py +241 -0
- tk_normalizer-1.0.0.dist-info/METADATA +231 -0
- tk_normalizer-1.0.0.dist-info/RECORD +7 -0
- tk_normalizer-1.0.0.dist-info/WHEEL +5 -0
- tk_normalizer-1.0.0.dist-info/licenses/LICENSE +21 -0
- tk_normalizer-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
tk-normalizer: URL normalization library for consistent URL representation.
|
|
3
|
+
|
|
4
|
+
This library provides URL normalization functionality to create normalized
|
|
5
|
+
representations of URLs, handling variations in protocols, subdomains,
|
|
6
|
+
query parameters, and more.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from .normalizer import InvalidUrlException, TkNormalizer
|
|
10
|
+
|
|
11
|
+
# Keep in sync with the distribution version (pyproject / wheel metadata is 1.0.0;
# the previous value "0.1.0" was stale).
__version__ = "1.0.0"
__all__ = ["TkNormalizer", "InvalidUrlException", "normalize_url"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def normalize_url(url: str) -> str:
    """
    Return the normalized form of *url*.

    Thin convenience wrapper: builds a TkNormalizer for *url* and hands back
    its normalized URL string.

    Args:
        url: The URL string to normalize.

    Returns:
        The normalized URL string.

    Raises:
        InvalidUrlException: If the URL is invalid or cannot be normalized.

    Example:
        >>> from tk_normalizer import normalize_url
        >>> normalize_url("http://www.Example.com/path?b=2&a=1&utm_source=test")
        'example.com/path?a=1&b=2'
    """
    return TkNormalizer(url).normalized_url
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
import fnmatch
|
|
2
|
+
import hashlib
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
from urllib.parse import ParseResult, parse_qsl, urlencode, urlparse
|
|
6
|
+
|
|
7
|
+
# Uses https://gist.github.com/Integralist/edcfb88c925658a13fc3e51f581fe4bc as a starting point
# Modified for more current rules regarding host/domain/tld naming.

# Octet fragments shared by the private- and public-IP alternatives below.
ip_middle_octet = r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5]))"
ip_last_octet = r"(?:\.(?:[0-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"

# Validating regex for whole URLs: requires an explicit protocol, then one of
# a private IP, a public IP, or a dotted hostname with a TLD, then optional
# port, path and query.
regex = re.compile(
    r"^"
    # protocol identifier
    r"(?:(?:https?|ftp)://)"
    # user:pass authentication (updated to avoid catastrophic backtracking)
    r"(?:[a-zA-Z0-9._%+-]+(?::[^\s@]*)?@)?"
    r"(?:"
    r"(?P<private_ip>"
    # IP address exclusion
    # private & local networks
    r"(?:(?:10|127)" + ip_middle_octet + "{2}" + ip_last_octet + ")|"
    r"(?:(?:169\.254|192\.168)" + ip_middle_octet + ip_last_octet + ")|"
    r"(?:172\.(?:1[6-9]|2\d|3[0-1])" + ip_middle_octet + ip_last_octet + "))"
    r"|"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?P<public_ip>"
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"" + ip_middle_octet + "{2}"
    r"" + ip_last_octet + ")"
    r"|"
    # host name (modified: handles multiple hyphens, underscores in hostnames, and trailing hyphens)
    r"(?:(?:[a-z\-_\u00a1-\uffff0-9]-?)*[a-z_\u00a1-\uffff0-9\-]+)"
    # domain name (modified: handles multiple hyphens, and also underscores in domain names, and trailing hyphens)
    r"(?:\.(?:[a-z\-_\u00a1-\uffff0-9]-?)*[a-z_\u00a1-\uffff0-9\-]+)*"
    # TLD identifier (modified: handles oddities like site.xn--p1ai/)
    r"(?:\.(?:[a-z0-9\-\u00a1-\uffff]{2,}))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path
    r"(?:/\S*)?"
    # query string
    r"(?:\?\S*)?"
    r"$",
    re.UNICODE | re.IGNORECASE,
)

# ``regex`` is already a compiled Pattern; the previous ``re.compile(regex)``
# was a redundant no-op (re.compile returns a Pattern argument unchanged),
# so a plain alias is equivalent and clearer.
pattern = regex
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class InvalidUrlException(Exception):
    """Raised when a URL cannot be parsed or normalized.

    Keeps a reference to the underlying exception so callers can inspect
    the root cause via ``original_exception``.
    """

    def __init__(self, message: str, original_exception: Exception) -> None:
        # Stash the root cause first, then delegate the message to Exception.
        self.original_exception = original_exception
        super().__init__(message)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class TkNormalizer:
|
|
64
|
+
# Define the list of query parameters to remove
|
|
65
|
+
query_params_to_remove: list[str] = [
|
|
66
|
+
"utm_*",
|
|
67
|
+
"gclid",
|
|
68
|
+
"fbclid",
|
|
69
|
+
"dclid",
|
|
70
|
+
"_ga",
|
|
71
|
+
"_gid",
|
|
72
|
+
"_fbp",
|
|
73
|
+
"_hjid",
|
|
74
|
+
"msclkid",
|
|
75
|
+
"aff_id",
|
|
76
|
+
"affid",
|
|
77
|
+
"referrer",
|
|
78
|
+
"adgroupid",
|
|
79
|
+
"srsltid",
|
|
80
|
+
]
|
|
81
|
+
|
|
82
|
+
def __init__(self, url: str, log_errors: bool = True) -> None:
|
|
83
|
+
try:
|
|
84
|
+
url = url.strip()
|
|
85
|
+
self.log_errors: bool = log_errors
|
|
86
|
+
self.logger = logging.getLogger(__name__)
|
|
87
|
+
self.original_url: str = url
|
|
88
|
+
self.normalized_url: str
|
|
89
|
+
self.parent_normal_url: str
|
|
90
|
+
self.root_normal_url: str
|
|
91
|
+
self.normalized_url, self.parent_normal_url, self.root_normal_url = self.normalize_url(url)
|
|
92
|
+
self.url_hashes: dict[str, str] = self.compute_hashes()
|
|
93
|
+
except InvalidUrlException as e:
|
|
94
|
+
if self.log_errors:
|
|
95
|
+
self.logger.warning(f"{e}")
|
|
96
|
+
raise e
|
|
97
|
+
except Exception as e:
|
|
98
|
+
m = f"Invalid URL (exception): {url}"
|
|
99
|
+
if self.log_errors:
|
|
100
|
+
self.logger.error(m)
|
|
101
|
+
raise InvalidUrlException(m, e) from e
|
|
102
|
+
|
|
103
|
+
def normalize_url(self, url: str) -> tuple[str, str, str]:
|
|
104
|
+
url = self.lowercase_url(url)
|
|
105
|
+
parsed_url = self.parse_url(url)
|
|
106
|
+
netloc, path, query = self.validate_url(parsed_url)
|
|
107
|
+
netloc = self.remove_www_subdomain(netloc)
|
|
108
|
+
path = self.remove_trailing_slash(path)
|
|
109
|
+
query_params = self.parse_query_params(query)
|
|
110
|
+
query_params = self.remove_unwanted_params(query_params)
|
|
111
|
+
query_params = self.sort_query_params(query_params)
|
|
112
|
+
unique_params = self.remove_duplicate_params(query_params)
|
|
113
|
+
normalized_url = self.rebuild_url(netloc, path, unique_params)
|
|
114
|
+
parent_normal_url = self.get_parent_normal_url(netloc)
|
|
115
|
+
root_normal_url = self.get_root_normal_url(netloc)
|
|
116
|
+
return normalized_url, parent_normal_url, root_normal_url
|
|
117
|
+
|
|
118
|
+
@staticmethod
|
|
119
|
+
def lowercase_url(url: str) -> str:
|
|
120
|
+
return url.lower() if url else url
|
|
121
|
+
|
|
122
|
+
@staticmethod
|
|
123
|
+
def parse_url(url: str) -> ParseResult:
|
|
124
|
+
parsed_url: ParseResult = urlparse(url)
|
|
125
|
+
|
|
126
|
+
# Handle URLs with neither scheme nor netloc
|
|
127
|
+
if not parsed_url.scheme and not parsed_url.netloc:
|
|
128
|
+
parsed_url = urlparse(f"http://{url}")
|
|
129
|
+
|
|
130
|
+
return parsed_url
|
|
131
|
+
|
|
132
|
+
@staticmethod
|
|
133
|
+
def validate_url(parsed_url: ParseResult) -> tuple[str, str, str]:
|
|
134
|
+
# This will also capture localhost by its nature
|
|
135
|
+
if "." not in parsed_url.netloc:
|
|
136
|
+
raise InvalidUrlException(
|
|
137
|
+
f"Invalid URL provided (no dots) '{parsed_url.geturl()}'",
|
|
138
|
+
ValueError(f"URLs without a tld are forbidden. '{parsed_url.netloc}'"),
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# Only http(s) urls are currently allowed; this means no file, ftp, etc.
|
|
142
|
+
if not str(parsed_url.scheme).startswith("http"):
|
|
143
|
+
raise InvalidUrlException(
|
|
144
|
+
f"Invalid URL provided (non-HTTP) '{parsed_url.geturl()}'",
|
|
145
|
+
ValueError(f"Only http(s) URLs are currently allowed, received: '{parsed_url.scheme}'"),
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
# Some oddball or broken URLs will be caught here
|
|
149
|
+
if not parsed_url.scheme or not parsed_url.netloc:
|
|
150
|
+
raise InvalidUrlException(
|
|
151
|
+
f"Invalid URL provided: (empty) '{parsed_url.geturl()}'",
|
|
152
|
+
ValueError(f"URL could not be parsed. '{parsed_url.netloc}'"),
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
# Check again the mega regular-expression
|
|
156
|
+
if not pattern.match(parsed_url.geturl()):
|
|
157
|
+
raise InvalidUrlException(
|
|
158
|
+
f"Invalid URL provided (regex.fail) '{parsed_url.geturl()}'", ValueError("URL failed regex check.")
|
|
159
|
+
)
|
|
160
|
+
|
|
161
|
+
return parsed_url.netloc, parsed_url.path, parsed_url.query
|
|
162
|
+
|
|
163
|
+
@staticmethod
|
|
164
|
+
def remove_www_subdomain(netloc: str) -> str:
|
|
165
|
+
return netloc[4:] if str(netloc).startswith("www.") else netloc
|
|
166
|
+
|
|
167
|
+
@staticmethod
|
|
168
|
+
def remove_trailing_slash(path: str) -> str:
|
|
169
|
+
return path.rstrip("/")
|
|
170
|
+
|
|
171
|
+
@staticmethod
|
|
172
|
+
def parse_query_params(query: str) -> list[tuple[str, str]]:
|
|
173
|
+
return parse_qsl(query, keep_blank_values=True)
|
|
174
|
+
|
|
175
|
+
def remove_unwanted_params(self, query_params: list[tuple[str, str]]) -> list[tuple[str, str]]:
|
|
176
|
+
def is_unwanted_param(param: str) -> bool:
|
|
177
|
+
return any(fnmatch.fnmatch(param, pattern) for pattern in self.query_params_to_remove)
|
|
178
|
+
|
|
179
|
+
return [(k, v) for k, v in query_params if not is_unwanted_param(k)]
|
|
180
|
+
|
|
181
|
+
@staticmethod
|
|
182
|
+
def sort_query_params(query_params: list[tuple[str, str]]) -> list[tuple[str, str]]:
|
|
183
|
+
return sorted(query_params, key=lambda x: (x[0], x[1]))
|
|
184
|
+
|
|
185
|
+
@staticmethod
|
|
186
|
+
def remove_duplicate_params(query_params: list[tuple[str, str]]) -> list[tuple[str, str]]:
|
|
187
|
+
seen_params: set = set()
|
|
188
|
+
unique_params: list[tuple[str, str]] = []
|
|
189
|
+
for param in query_params:
|
|
190
|
+
if param not in seen_params:
|
|
191
|
+
seen_params.add(param)
|
|
192
|
+
unique_params.append(param)
|
|
193
|
+
return unique_params
|
|
194
|
+
|
|
195
|
+
@staticmethod
|
|
196
|
+
def rebuild_url(netloc: str, path: str, query_params: list[tuple[str, str]]) -> str:
|
|
197
|
+
query_string: str = urlencode(query_params)
|
|
198
|
+
return f"{netloc}{path}?{query_string}" if query_string else f"{netloc}{path}"
|
|
199
|
+
|
|
200
|
+
@staticmethod
|
|
201
|
+
def get_parent_normal_url(netloc: str) -> str:
|
|
202
|
+
return netloc
|
|
203
|
+
|
|
204
|
+
@staticmethod
|
|
205
|
+
def get_root_normal_url(netloc: str) -> str:
|
|
206
|
+
return ".".join(netloc.split(".")[-2:])
|
|
207
|
+
|
|
208
|
+
def compute_hashes(self) -> dict[str, str]:
|
|
209
|
+
def sha256_hash(value: str) -> str:
|
|
210
|
+
return hashlib.sha256(value.encode()).hexdigest()
|
|
211
|
+
|
|
212
|
+
return {
|
|
213
|
+
"normalized_url_hash": sha256_hash(self.normalized_url),
|
|
214
|
+
"parent_normal_url_hash": sha256_hash(self.parent_normal_url),
|
|
215
|
+
"root_normal_url_hash": sha256_hash(self.root_normal_url),
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
def get_normalized_url(self) -> dict[str, str | dict[str, str]]:
|
|
219
|
+
return {
|
|
220
|
+
"normalized_url": self.normalized_url,
|
|
221
|
+
"parent_normal_url": self.parent_normal_url,
|
|
222
|
+
"root_normal_url": self.root_normal_url,
|
|
223
|
+
**self.url_hashes,
|
|
224
|
+
}
|
|
225
|
+
|
|
226
|
+
|
|
227
|
+
if __name__ == "__main__":  # pragma: no cover
    # Quick manual demo: normalize two sample URLs and print the results.
    sample_urls = [
        "http://www.Example.com/some-sub-folder/or_page.html?b=2&a=1&a=1&b=2&c=3&bad_param=some_value",
        "http://blog.example.com/some-folder/some-page.html?b=2&a=1&a=1&b=2&c=3&bad_param=another_value",
    ]
    for sample in sample_urls:
        try:
            print(TkNormalizer(sample).get_normalized_url())
        except InvalidUrlException as e:
            print(f"Error: {e}")
|
|
@@ -0,0 +1,231 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tk-normalizer
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: URL normalization library for consistent URL representation
|
|
5
|
+
Author-email: Terakeet <engineering@terakeet.com>
|
|
6
|
+
Maintainer-email: Terakeet <engineering@terakeet.com>
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/terakeet/tk-normalizer
|
|
9
|
+
Project-URL: Repository, https://github.com/terakeet/tk-normalizer.git
|
|
10
|
+
Project-URL: Issues, https://github.com/terakeet/tk-normalizer/issues
|
|
11
|
+
Project-URL: Documentation, https://github.com/terakeet/tk-normalizer/blob/main/docs/ARCHITECTURE.md
|
|
12
|
+
Keywords: url,normalization,canonicalization,web,utilities
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
21
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
22
|
+
Classifier: Topic :: Text Processing :: Filters
|
|
23
|
+
Classifier: Operating System :: OS Independent
|
|
24
|
+
Requires-Python: >=3.11
|
|
25
|
+
Description-Content-Type: text/markdown
|
|
26
|
+
License-File: LICENSE
|
|
27
|
+
Provides-Extra: dev
|
|
28
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
29
|
+
Requires-Dist: pytest-mock>=3.0.0; extra == "dev"
|
|
30
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
31
|
+
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
32
|
+
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: build>=0.10.0; extra == "dev"
|
|
34
|
+
Requires-Dist: twine>=4.0.0; extra == "dev"
|
|
35
|
+
Dynamic: license-file
|
|
36
|
+
|
|
37
|
+
# tk-normalizer
|
|
38
|
+
|
|
39
|
+
[](https://pypi.org/project/tk-normalizer/)
|
|
40
|
+
[](https://pypi.org/project/tk-normalizer/)
|
|
41
|
+
[](https://opensource.org/licenses/MIT)
|
|
42
|
+
|
|
43
|
+
URL normalization library for creating consistent URL representations.
|
|
44
|
+
|
|
45
|
+
## Purpose
|
|
46
|
+
|
|
47
|
+
URL normalization provides equivalence between URLs that differ in letter case, protocol/scheme, "www" subdomain, and query parameter ordering. This library helps create normalized representations of URLs for consistent storage, comparison, and analysis.
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install tk-normalizer
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## Quick Start
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from tk_normalizer import normalize_url
|
|
59
|
+
|
|
60
|
+
# Simple usage with the convenience function
|
|
61
|
+
normalized = normalize_url("http://www.Example.com/path?b=2&a=1&utm_source=test")
|
|
62
|
+
print(normalized) # Output: example.com/path?a=1&b=2
|
|
63
|
+
|
|
64
|
+
# Using the class directly for more control
|
|
65
|
+
from tk_normalizer import TkNormalizer
|
|
66
|
+
|
|
67
|
+
normalizer = TkNormalizer("http://www.Example.com/path?b=2&a=1&utm_source=test")
|
|
68
|
+
print(normalizer.normalized_url) # example.com/path?a=1&b=2
|
|
69
|
+
print(normalizer.get_normalized_url()) # Full details including hashes
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## Features
|
|
73
|
+
|
|
74
|
+
### URL Normalization
|
|
75
|
+
|
|
76
|
+
The following URLs all normalize to the same normalized form:
|
|
77
|
+
|
|
78
|
+
```
|
|
79
|
+
https://example.com/
|
|
80
|
+
http://www.example.com/
|
|
81
|
+
http://www.example.com
|
|
82
|
+
http://www.example.com/#my_search_engine_is_great
|
|
83
|
+
https://www.example.com/?utm_campaign=SomeGoogleCampaign
|
|
84
|
+
https://www.example.com/?utm_source=because&utm_campaign=SomeGoogleCampaign
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
All normalize to: `example.com`
|
|
88
|
+
|
|
89
|
+
### Normalization Process
|
|
90
|
+
|
|
91
|
+
URLs are normalized through the following steps:
|
|
92
|
+
|
|
93
|
+
- ✅ Protocol and www subdomains removed
|
|
94
|
+
- ✅ Lowercased
|
|
95
|
+
- ✅ Trailing slashes removed
|
|
96
|
+
- ✅ Query parameters reordered alphabetically by key
|
|
97
|
+
- ✅ Duplicate query parameter key/value pairs removed
|
|
98
|
+
- ✅ Common tracking parameters removed (utm_*, gclid, fbclid, etc.)
|
|
99
|
+
- ✅ Non-HTTP(S) protocols rejected
|
|
100
|
+
- ✅ Localhost URLs rejected
|
|
101
|
+
|
|
102
|
+
### Tracking Parameters Removed
|
|
103
|
+
|
|
104
|
+
The following tracking parameters are automatically removed during normalization:
|
|
105
|
+
|
|
106
|
+
- `utm_*` (all utm parameters)
|
|
107
|
+
- `gclid`, `fbclid`, `dclid` (click identifiers)
|
|
108
|
+
- `_ga`, `_gid`, `_fbp`, `_hjid` (analytics cookies)
|
|
109
|
+
- `msclkid` (Microsoft Ads)
|
|
110
|
+
- `aff_id`, `affid` (affiliate tracking)
|
|
111
|
+
- `referrer`, `adgroupid`, `srsltid`
|
|
112
|
+
|
|
113
|
+
## Advanced Usage
|
|
114
|
+
|
|
115
|
+
### Getting Full Normalization Details
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
from tk_normalizer import TkNormalizer
|
|
119
|
+
|
|
120
|
+
normalizer = TkNormalizer("http://blog.example.com/page?b=2&a=1")
|
|
121
|
+
result = normalizer.get_normalized_url()
|
|
122
|
+
|
|
123
|
+
print(result)
|
|
124
|
+
# {
|
|
125
|
+
# 'normalized_url': 'blog.example.com/page?a=1&b=2',
|
|
126
|
+
# 'parent_normal_url': 'blog.example.com',
|
|
127
|
+
# 'root_normal_url': 'example.com',
|
|
128
|
+
# 'normalized_url_hash': '...',
|
|
129
|
+
# 'parent_normal_url_hash': '...',
|
|
130
|
+
# 'root_normal_url_hash': '...'
|
|
131
|
+
# }
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
### Error Handling
|
|
135
|
+
|
|
136
|
+
```python
|
|
137
|
+
from tk_normalizer import normalize_url, InvalidUrlException
|
|
138
|
+
|
|
139
|
+
try:
|
|
140
|
+
normalized = normalize_url("not a valid url")
|
|
141
|
+
except InvalidUrlException as e:
|
|
142
|
+
print(f"Invalid URL: {e}")
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
### Accessing Individual Components
|
|
146
|
+
|
|
147
|
+
```python
|
|
148
|
+
from tk_normalizer import TkNormalizer
|
|
149
|
+
|
|
150
|
+
normalizer = TkNormalizer("https://blog.example.com/path?a=1")
|
|
151
|
+
|
|
152
|
+
# Access individual normalized components
|
|
153
|
+
print(normalizer.normalized_url) # blog.example.com/path?a=1
|
|
154
|
+
print(normalizer.parent_normal_url) # blog.example.com
|
|
155
|
+
print(normalizer.root_normal_url) # example.com
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Hashing
|
|
159
|
+
|
|
160
|
+
For efficient storage and comparison, SHA-256 hashes are computed for:
|
|
161
|
+
- The normalized URL
|
|
162
|
+
- The parent normal URL (domain without path)
|
|
163
|
+
- The root normal URL (root domain without subdomains)
|
|
164
|
+
|
|
165
|
+
This provides fixed-length representations suitable for database indexing.
|
|
166
|
+
|
|
167
|
+
## Important Caveats
|
|
168
|
+
|
|
169
|
+
While this normalization process works well for most use cases, there are some limitations:
|
|
170
|
+
|
|
171
|
+
1. **www subdomain removal**: Technically, `www.example.com` and `example.com` could serve different content, though this is rare in practice.
|
|
172
|
+
|
|
173
|
+
2. **Case sensitivity**: URLs are lowercased, but some servers are case-sensitive for paths.
|
|
174
|
+
|
|
175
|
+
3. **Tracking parameters**: New tracking parameters emerge over time and may not be in the removal list.
|
|
176
|
+
|
|
177
|
+
4. **Fragment removal**: URL fragments (#anchors) are removed, which may affect single-page applications.
|
|
178
|
+
|
|
179
|
+
## Development
|
|
180
|
+
|
|
181
|
+
### Setting Up Development Environment
|
|
182
|
+
|
|
183
|
+
```bash
|
|
184
|
+
# Clone the repository
|
|
185
|
+
git clone https://github.com/terakeet/tk-normalizer.git
|
|
186
|
+
cd tk-normalizer
|
|
187
|
+
|
|
188
|
+
# Install development dependencies
|
|
189
|
+
pip install -e ".[dev]"
|
|
190
|
+
|
|
191
|
+
# Run tests
|
|
192
|
+
pytest
|
|
193
|
+
|
|
194
|
+
# Run tests with coverage
|
|
195
|
+
pytest --cov=tk_normalizer
|
|
196
|
+
|
|
197
|
+
# Run linting
|
|
198
|
+
ruff check src tests
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### Running Tests
|
|
202
|
+
|
|
203
|
+
```bash
|
|
204
|
+
# Run all tests
|
|
205
|
+
pytest
|
|
206
|
+
|
|
207
|
+
# Run with verbose output
|
|
208
|
+
pytest -v
|
|
209
|
+
|
|
210
|
+
# Run specific test file
|
|
211
|
+
pytest tests/test_normalizer.py
|
|
212
|
+
|
|
213
|
+
# Run with coverage report
|
|
214
|
+
pytest --cov=tk_normalizer --cov-report=html
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
## Contributing
|
|
218
|
+
|
|
219
|
+
Contributions are welcome! Please feel free to submit a Pull Request.
|
|
220
|
+
|
|
221
|
+
## License
|
|
222
|
+
|
|
223
|
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
|
224
|
+
|
|
225
|
+
## Support
|
|
226
|
+
|
|
227
|
+
For issues and questions, please use the [GitHub issue tracker](https://github.com/terakeet/tk-normalizer/issues).
|
|
228
|
+
|
|
229
|
+
## Credits
|
|
230
|
+
|
|
231
|
+
Based on the URL normalization functionality from [tk-core](https://github.com/terakeet/tk-core), extracted and packaged for standalone use.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
tk_normalizer/__init__.py,sha256=yUNyiw5QMY_r766L6RebLL-el1rnKLZrrC_1W4LnPK4,1067
|
|
2
|
+
tk_normalizer/normalizer.py,sha256=ytLYdGQlqcEwrpuN4eZS72Dnt53moHkR0P2zGUW7iOU,9059
|
|
3
|
+
tk_normalizer-1.0.0.dist-info/licenses/LICENSE,sha256=7rybr3ioo2HAWA784BIwucFSrECCwJB4mSztAF3krtc,1065
|
|
4
|
+
tk_normalizer-1.0.0.dist-info/METADATA,sha256=p5zaBitT681qCSapFF3WjIyDBlWksAsLFZ_eFuia18k,7022
|
|
5
|
+
tk_normalizer-1.0.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
6
|
+
tk_normalizer-1.0.0.dist-info/top_level.txt,sha256=YFjFkT1gCcO4_rGcNtcTI0CO4KDutwzQpvw0FcqEMnQ,14
|
|
7
|
+
tk_normalizer-1.0.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2024 Terakeet
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
tk_normalizer
|