seoslug 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- seoslug-1.0.0/LICENSE +21 -0
- seoslug-1.0.0/PKG-INFO +77 -0
- seoslug-1.0.0/README.md +55 -0
- seoslug-1.0.0/pyproject.toml +35 -0
- seoslug-1.0.0/setup.cfg +4 -0
- seoslug-1.0.0/src/seoslug/__init__.py +16 -0
- seoslug-1.0.0/src/seoslug/builder.py +81 -0
- seoslug-1.0.0/src/seoslug/config.py +97 -0
- seoslug-1.0.0/src/seoslug/jsonld.py +13 -0
- seoslug-1.0.0/src/seoslug/normalization.py +85 -0
- seoslug-1.0.0/src/seoslug/schemas.py +87 -0
- seoslug-1.0.0/src/seoslug/text.py +34 -0
- seoslug-1.0.0/src/seoslug.egg-info/PKG-INFO +77 -0
- seoslug-1.0.0/src/seoslug.egg-info/SOURCES.txt +22 -0
- seoslug-1.0.0/src/seoslug.egg-info/dependency_links.txt +1 -0
- seoslug-1.0.0/src/seoslug.egg-info/top_level.txt +1 -0
- seoslug-1.0.0/tests/test_builder.py +105 -0
- seoslug-1.0.0/tests/test_fallbacks.py +79 -0
- seoslug-1.0.0/tests/test_jsonld.py +28 -0
- seoslug-1.0.0/tests/test_normalization.py +86 -0
- seoslug-1.0.0/tests/test_regression_fixtures.py +179 -0
- seoslug-1.0.0/tests/test_robots_rules.py +38 -0
- seoslug-1.0.0/tests/test_schemas.py +36 -0
- seoslug-1.0.0/tests/test_text.py +29 -0
seoslug-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 seoslug contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
seoslug-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seoslug
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Framework-agnostic canonical URL normalization and SEO payload generation
|
|
5
|
+
Author: seoslug contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Documentation, https://deepwiki.com/emiliano-gandini-outeda/seoslug/
|
|
8
|
+
Keywords: seo,canonical,urls,metadata,open-graph,twitter-cards
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# seoslug
|
|
24
|
+
|
|
25
|
+
[DeepWiki documentation](https://deepwiki.com/emiliano-gandini-outeda/seoslug/)
|
|
26
|
+
|
|
27
|
+
Canonical URL normalization and deterministic SEO payload generation for content platforms.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install seoslug
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
For local development:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick usage
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from seoslug import SEOConfig, URLPolicy, SEOEntity, build_seo_payload
|
|
45
|
+
|
|
46
|
+
config = SEOConfig(
|
|
47
|
+
canonical_host="portal.example.com",
|
|
48
|
+
public_base_url="https://portal.example.com",
|
|
49
|
+
url_policy=URLPolicy(
|
|
50
|
+
enforce_https=True,
|
|
51
|
+
lowercase_paths=True,
|
|
52
|
+
trailing_slash="never",
|
|
53
|
+
collapse_duplicate_slashes=True,
|
|
54
|
+
strip_tracking_params=True,
|
|
55
|
+
allowed_query_params=["page", "q"],
|
|
56
|
+
),
|
|
57
|
+
default_og_image="https://cdn.example.com/default.jpg",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
entity = SEOEntity(
|
|
61
|
+
entity_type="post",
|
|
62
|
+
slug="my-post",
|
|
63
|
+
title="My Post",
|
|
64
|
+
excerpt="Example excerpt",
|
|
65
|
+
body_html="<p>Body content</p>",
|
|
66
|
+
status="published",
|
|
67
|
+
featured_image="https://cdn.example.com/post.jpg",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
payload = build_seo_payload(entity, "/posts/my-post", config)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Full docs, API reference, and usage examples are in `docs/` and published with MkDocs.
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT, see `LICENSE`.
|
seoslug-1.0.0/README.md
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# seoslug
|
|
2
|
+
|
|
3
|
+
[DeepWiki documentation](https://deepwiki.com/emiliano-gandini-outeda/seoslug/)
|
|
4
|
+
|
|
5
|
+
Canonical URL normalization and deterministic SEO payload generation for content platforms.
|
|
6
|
+
|
|
7
|
+
## Installation
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
pip install seoslug
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
For local development:
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
pip install -e .
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Quick usage
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
from seoslug import SEOConfig, URLPolicy, SEOEntity, build_seo_payload
|
|
23
|
+
|
|
24
|
+
config = SEOConfig(
|
|
25
|
+
canonical_host="portal.example.com",
|
|
26
|
+
public_base_url="https://portal.example.com",
|
|
27
|
+
url_policy=URLPolicy(
|
|
28
|
+
enforce_https=True,
|
|
29
|
+
lowercase_paths=True,
|
|
30
|
+
trailing_slash="never",
|
|
31
|
+
collapse_duplicate_slashes=True,
|
|
32
|
+
strip_tracking_params=True,
|
|
33
|
+
allowed_query_params=["page", "q"],
|
|
34
|
+
),
|
|
35
|
+
default_og_image="https://cdn.example.com/default.jpg",
|
|
36
|
+
)
|
|
37
|
+
|
|
38
|
+
entity = SEOEntity(
|
|
39
|
+
entity_type="post",
|
|
40
|
+
slug="my-post",
|
|
41
|
+
title="My Post",
|
|
42
|
+
excerpt="Example excerpt",
|
|
43
|
+
body_html="<p>Body content</p>",
|
|
44
|
+
status="published",
|
|
45
|
+
featured_image="https://cdn.example.com/post.jpg",
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
payload = build_seo_payload(entity, "/posts/my-post", config)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Full docs, API reference, and usage examples are in `docs/` and published with MkDocs.
|
|
52
|
+
|
|
53
|
+
## License
|
|
54
|
+
|
|
55
|
+
MIT, see `LICENSE`.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "seoslug"
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
description = "Framework-agnostic canonical URL normalization and SEO payload generation"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
authors = [{ name = "seoslug contributors" }]
|
|
12
|
+
license = "MIT"
|
|
13
|
+
keywords = ["seo", "canonical", "urls", "metadata", "open-graph", "twitter-cards"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 5 - Production/Stable",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
19
|
+
"Programming Language :: Python :: 3.10",
|
|
20
|
+
"Programming Language :: Python :: 3.11",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
23
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
24
|
+
]
|
|
25
|
+
dependencies = []
|
|
26
|
+
license-files = ["LICENSE"]
|
|
27
|
+
|
|
28
|
+
[project.urls]
|
|
29
|
+
Documentation = "https://deepwiki.com/emiliano-gandini-outeda/seoslug/"
|
|
30
|
+
|
|
31
|
+
[tool.setuptools]
|
|
32
|
+
package-dir = {"" = "src"}
|
|
33
|
+
|
|
34
|
+
[tool.setuptools.packages.find]
|
|
35
|
+
where = ["src"]
|
seoslug-1.0.0/setup.cfg
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Public API for seoslug."""
|
|
2
|
+
|
|
3
|
+
from .builder import build_seo_payload
|
|
4
|
+
from .config import SEOConfig, URLPolicy
|
|
5
|
+
from .normalization import normalize_path, normalize_public_url
|
|
6
|
+
from .schemas import SEOEntity, SEOOverrides
|
|
7
|
+
|
|
8
|
+
# Names re-exported as the package's public API.
__all__ = [
    # Configuration models.
    "SEOConfig",
    "URLPolicy",
    # Input schemas.
    "SEOEntity",
    "SEOOverrides",
    # URL normalization helpers.
    "normalize_public_url",
    "normalize_path",
    # Payload builder.
    "build_seo_payload",
]
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
"""SEO payload builder for seoslug."""
|
|
2
|
+
|
|
3
|
+
from .config import SEOConfig
|
|
4
|
+
from .jsonld import normalize_schema_jsonld
|
|
5
|
+
from .normalization import normalize_public_url
|
|
6
|
+
from .schemas import SEOEntity, SEOOverrides
|
|
7
|
+
from .text import build_description_snippet
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _pick(*values: str | None) -> str | None:
|
|
11
|
+
for value in values:
|
|
12
|
+
if isinstance(value, str) and value.strip():
|
|
13
|
+
return value.strip()
|
|
14
|
+
return None
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _entity_default_robots(entity: SEOEntity, config: SEOConfig) -> str:
    """Choose the robots directive used when no override is supplied.

    Search entities get ``config.search_robots``. Any other entity whose
    status equals "published" (compared case-insensitively) gets
    "index,follow". Everything else gets ``config.default_robots``.
    """
    is_search = entity.entity_type == "search"
    if is_search:
        return config.search_robots
    status = entity.status.lower() if entity.status else ""
    return "index,follow" if status == "published" else config.default_robots
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _og_type(entity: SEOEntity) -> str:
    """Map the entity type to the value reported as the Open Graph type.

    Posts and videos are reported as "article"; every other entity type is
    reported as "website".
    """
    article_like = {"post", "video"}
    return "article" if entity.entity_type in article_like else "website"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def build_seo_payload(
    entity: SEOEntity,
    route_path: str,
    config: SEOConfig,
    overrides: SEOOverrides | None = None,
) -> dict:
    """Assemble the SEO payload dictionary for one entity.

    Every field is resolved through a fallback chain in which the first
    non-blank value wins:

    * title: override, entity title, then "Untitled"; the result is passed
      through ``config.title_template`` when one is set.
    * description: override, entity excerpt, then a snippet built from the
      entity body.
    * canonical: override, then ``route_path`` normalized against ``config``.
    * robots: override, then the entity-type/status default.
    * Open Graph values fall back to the plain values; Twitter values fall
      back to the Open Graph values.

    Returns:
        A dict with the keys ``title``, ``description``, ``canonical``,
        ``robots``, ``og``, ``twitter`` and ``schema_jsonld``.
    """
    applied = overrides if overrides else SEOOverrides()

    page_title = _pick(applied.meta_title, entity.title, "Untitled")
    template = config.title_template
    if template:
        page_title = template.format(title=page_title)

    body_snippet = build_description_snippet(entity.body_html)
    page_description = _pick(
        applied.meta_description, entity.excerpt, body_snippet, ""
    )

    canonical_url = _pick(
        applied.canonical_url, normalize_public_url(route_path, config)
    )
    robots_value = _pick(applied.robots, _entity_default_robots(entity, config))

    og_title = _pick(applied.og_title, page_title)
    og_description = _pick(applied.og_description, page_description)
    og_image = _pick(
        applied.og_image, entity.featured_image, config.default_og_image
    )

    # Twitter card values inherit from the Open Graph values when not overridden.
    twitter_block = {
        "card": _pick(applied.twitter_card, "summary_large_image"),
        "title": _pick(applied.twitter_title, og_title),
        "description": _pick(applied.twitter_description, og_description),
        "image": _pick(applied.twitter_image, og_image),
    }
    og_block = {
        "type": _og_type(entity),
        "title": og_title,
        "description": og_description,
        "url": canonical_url,
        "image": og_image,
    }

    return {
        "title": page_title,
        "description": page_description,
        "canonical": canonical_url,
        "robots": robots_value,
        "og": og_block,
        "twitter": twitter_block,
        "schema_jsonld": normalize_schema_jsonld(applied.schema_jsonld),
    }
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
"""Configuration models for seoslug."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from typing import Literal
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass(slots=True)
|
|
9
|
+
class URLPolicy:
|
|
10
|
+
enforce_https: bool = True
|
|
11
|
+
lowercase_paths: bool = True
|
|
12
|
+
trailing_slash: Literal["always", "never", "preserve"] = "never"
|
|
13
|
+
collapse_duplicate_slashes: bool = True
|
|
14
|
+
strip_tracking_params: bool = True
|
|
15
|
+
allowed_query_params: list[str] = field(default_factory=list)
|
|
16
|
+
|
|
17
|
+
def __post_init__(self) -> None:
|
|
18
|
+
if self.trailing_slash not in {"always", "never", "preserve"}:
|
|
19
|
+
raise ValueError(
|
|
20
|
+
"trailing_slash must be one of: 'always', 'never', 'preserve'"
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
cleaned_params: list[str] = []
|
|
24
|
+
seen: set[str] = set()
|
|
25
|
+
for param in self.allowed_query_params:
|
|
26
|
+
if not isinstance(param, str):
|
|
27
|
+
raise ValueError("allowed_query_params must contain only strings")
|
|
28
|
+
normalized = param.strip()
|
|
29
|
+
if not normalized:
|
|
30
|
+
continue
|
|
31
|
+
if normalized not in seen:
|
|
32
|
+
seen.add(normalized)
|
|
33
|
+
cleaned_params.append(normalized)
|
|
34
|
+
self.allowed_query_params = cleaned_params
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@dataclass(slots=True)
class SEOConfig:
    """Site-wide settings used for URL normalization and payload building.

    ``canonical_host`` and ``public_base_url`` are validated and stored in
    their cleaned form (see ``_validate_canonical_host`` and
    ``_validate_public_base_url``).

    Raises:
        ValueError: if any field fails validation. This includes a
            ``title_template`` that lacks the "{title}" placeholder or that
            cannot be formatted when given only ``title``.
    """

    # Host name placed in every generated canonical URL (host only).
    canonical_host: str
    # Absolute http(s) base URL of the public site.
    public_base_url: str
    # Path/query normalization rules.
    url_policy: URLPolicy
    # Robots directive used when no more specific rule applies.
    default_robots: str = "index,follow"
    # Fallback Open Graph image; None means no fallback.
    default_og_image: str | None = None
    # Optional site name; only validated here.
    site_name: str | None = None
    # str.format template for titles; must contain "{title}".
    title_template: str | None = "{title}"
    # Robots directive used for search entities.
    search_robots: str = "noindex,follow"

    def __post_init__(self) -> None:
        self.canonical_host = _validate_canonical_host(self.canonical_host)
        self.public_base_url = _validate_public_base_url(self.public_base_url)

        if not isinstance(self.url_policy, URLPolicy):
            raise ValueError("url_policy must be a URLPolicy instance")

        if not _is_nonempty_string(self.default_robots):
            raise ValueError("default_robots must be a non-empty string")
        if not _is_nonempty_string(self.search_robots):
            raise ValueError("search_robots must be a non-empty string")

        if self.default_og_image is not None and not _is_nonempty_string(
            self.default_og_image
        ):
            raise ValueError("default_og_image must be a non-empty string when set")

        if self.site_name is not None and not _is_nonempty_string(self.site_name):
            raise ValueError("site_name must be a non-empty string when set")

        if self.title_template is not None:
            if not _is_nonempty_string(self.title_template):
                raise ValueError("title_template must be a non-empty string when set")
            if "{title}" not in self.title_template:
                raise ValueError("title_template must include '{title}' placeholder")
            # The builder calls template.format(title=...). Probe that call now
            # so a template with stray braces or unknown placeholders fails at
            # configuration time instead of on the first payload build.
            try:
                self.title_template.format(title="Untitled")
            except (KeyError, IndexError, ValueError, AttributeError) as exc:
                raise ValueError(
                    "title_template must be formattable using only the "
                    "'{title}' placeholder"
                ) from exc
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _is_nonempty_string(value: object) -> bool:
|
|
76
|
+
return isinstance(value, str) and bool(value.strip())
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def _validate_canonical_host(canonical_host: str) -> str:
|
|
80
|
+
if not _is_nonempty_string(canonical_host):
|
|
81
|
+
raise ValueError("canonical_host must be a non-empty string")
|
|
82
|
+
|
|
83
|
+
value = canonical_host.strip().lower()
|
|
84
|
+
if "://" in value or "/" in value or "?" in value or "#" in value:
|
|
85
|
+
raise ValueError("canonical_host must be host-only (no scheme/path/query)")
|
|
86
|
+
return value
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _validate_public_base_url(public_base_url: str) -> str:
|
|
90
|
+
if not _is_nonempty_string(public_base_url):
|
|
91
|
+
raise ValueError("public_base_url must be a non-empty string")
|
|
92
|
+
|
|
93
|
+
value = public_base_url.strip()
|
|
94
|
+
parsed = urlparse(value)
|
|
95
|
+
if parsed.scheme not in {"http", "https"} or not parsed.netloc:
|
|
96
|
+
raise ValueError("public_base_url must be an absolute http(s) URL")
|
|
97
|
+
return value
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""JSON-LD helpers for seoslug."""
|
|
2
|
+
|
|
3
|
+
from copy import deepcopy
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def normalize_schema_jsonld(value: dict | list[dict] | None) -> dict | list[dict]:
|
|
7
|
+
if value is None:
|
|
8
|
+
return {}
|
|
9
|
+
if isinstance(value, dict):
|
|
10
|
+
return deepcopy(value)
|
|
11
|
+
if isinstance(value, list) and all(isinstance(item, dict) for item in value):
|
|
12
|
+
return deepcopy(value)
|
|
13
|
+
raise ValueError("schema_jsonld must be dict, list[dict], or None")
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""URL normalization functions for seoslug."""
|
|
2
|
+
|
|
3
|
+
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
|
4
|
+
|
|
5
|
+
from .config import SEOConfig, URLPolicy
|
|
6
|
+
|
|
7
|
+
# Query keys treated as tracking parameters; _filter_query compares them with
# the lowercased key. Keys starting with "utm_" are matched separately there.
_TRACKING_KEYS = {"gclid", "fbclid"}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _collapse_duplicate_slashes(path: str) -> str:
|
|
11
|
+
out: list[str] = []
|
|
12
|
+
prev_slash = False
|
|
13
|
+
for char in path:
|
|
14
|
+
if char == "/":
|
|
15
|
+
if prev_slash:
|
|
16
|
+
continue
|
|
17
|
+
prev_slash = True
|
|
18
|
+
out.append(char)
|
|
19
|
+
else:
|
|
20
|
+
prev_slash = False
|
|
21
|
+
out.append(char)
|
|
22
|
+
return "".join(out)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _apply_trailing_slash(path: str, mode: str) -> str:
|
|
26
|
+
if mode == "preserve":
|
|
27
|
+
return path
|
|
28
|
+
if path == "/":
|
|
29
|
+
return path
|
|
30
|
+
if mode == "always":
|
|
31
|
+
return path if path.endswith("/") else path + "/"
|
|
32
|
+
return path.rstrip("/") or "/"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def normalize_path(path: str, policy: URLPolicy) -> str:
    """Normalize a URL path according to ``policy``.

    The path is stripped, given a leading "/" (a blank path becomes "/"),
    optionally has duplicate slashes collapsed, is optionally lowercased, and
    finally has the trailing-slash policy applied.

    Raises:
        ValueError: if ``path`` is not a string.
    """
    if not isinstance(path, str):
        raise ValueError("path must be a string")

    cleaned = path.strip()
    if not cleaned:
        cleaned = "/"
    elif not cleaned.startswith("/"):
        cleaned = f"/{cleaned}"

    if policy.collapse_duplicate_slashes:
        cleaned = _collapse_duplicate_slashes(cleaned)
    if policy.lowercase_paths:
        cleaned = cleaned.lower()
    return _apply_trailing_slash(cleaned, policy.trailing_slash)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _filter_query(query: str, policy: URLPolicy) -> str:
    """Return ``query`` re-encoded with disallowed parameters removed.

    When ``policy.strip_tracking_params`` is set, keys that (lowercased) start
    with "utm_" or appear in ``_TRACKING_KEYS`` are dropped. When the policy
    has a non-empty allowlist, any key not in it (exact, case-sensitive match)
    is dropped too. Blank values are kept and the surviving pairs keep their
    original order.
    """
    parsed_pairs = parse_qsl(query, keep_blank_values=True)
    allowed = set(policy.allowed_query_params)

    def _keep(name: str) -> bool:
        lowered = name.lower()
        is_tracking = lowered.startswith("utm_") or lowered in _TRACKING_KEYS
        if policy.strip_tracking_params and is_tracking:
            return False
        return not allowed or name in allowed

    kept = [(name, val) for name, val in parsed_pairs if _keep(name)]
    return urlencode(kept, doseq=True)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def normalize_public_url(url_or_path: str, config: SEOConfig) -> str:
    """Build the canonical public URL for an absolute URL or a site path.

    The result always uses ``config.canonical_host`` as its host; the input's
    own scheme and host are discarded. The scheme is "https" when
    ``config.url_policy.enforce_https`` is set, otherwise the scheme of
    ``config.public_base_url`` (falling back to "https"). The path goes through
    ``normalize_path`` and the query through ``_filter_query``. A fragment is
    never part of the result, for absolute and relative input alike.

    Raises:
        ValueError: if ``url_or_path`` is not a non-blank string, or if it
            parses to a scheme without a host.
    """
    if not isinstance(url_or_path, str) or not url_or_path.strip():
        raise ValueError("url_or_path must be a non-empty string")

    value = url_or_path.strip()
    parsed_input = urlsplit(value)
    parsed_base = urlsplit(config.public_base_url)

    if parsed_input.scheme and not parsed_input.netloc:
        raise ValueError("Malformed URL input")

    path = parsed_input.path
    query = parsed_input.query
    if not parsed_input.scheme and not parsed_input.netloc:
        # Relative input is split by hand. Cut the fragment off first so that
        # "/a#x" does not keep "#x" in the path and "/a?q=1#x" does not turn
        # into the query value "1#x"; absolute input already loses its
        # fragment through urlsplit.
        without_fragment = value.partition("#")[0]
        path, _, query = without_fragment.partition("?")

    normalized_path = normalize_path(path or "/", config.url_policy)
    normalized_query = _filter_query(query, config.url_policy)

    scheme = "https" if config.url_policy.enforce_https else (parsed_base.scheme or "https")
    netloc = config.canonical_host
    return urlunsplit((scheme, netloc, normalized_path, normalized_query, ""))
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""Input schemas for seoslug."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Literal
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
# Accepted values for SEOEntity.entity_type; mirrors the Literal on that field.
_ENTITY_TYPES = {"home", "post", "page", "video", "taxonomy", "search", "other"}
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def _normalize_optional_string(value: object, field_name: str) -> str | None:
|
|
11
|
+
if value is None:
|
|
12
|
+
return None
|
|
13
|
+
if not isinstance(value, str):
|
|
14
|
+
raise ValueError(f"{field_name} must be a string or None")
|
|
15
|
+
normalized = value.strip()
|
|
16
|
+
return normalized or None
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass(slots=True)
class SEOEntity:
    """Content item for which an SEO payload is generated.

    On construction every optional text field is stripped, and a blank value
    is stored as ``None``.

    Raises:
        ValueError: if ``entity_type`` is not one of the accepted types, or if
            an optional field is neither a string nor ``None``.
    """

    entity_type: Literal["home", "post", "page", "video", "taxonomy", "search", "other"]
    slug: str | None = None
    title: str | None = None
    excerpt: str | None = None
    body_html: str | None = None
    status: str | None = None
    featured_image: str | None = None
    published_at: str | None = None
    updated_at: str | None = None

    def __post_init__(self) -> None:
        if self.entity_type not in _ENTITY_TYPES:
            raise ValueError("entity_type must be one of home/post/page/video/taxonomy/search/other")

        # Normalized in declaration order so the first invalid field is the
        # one reported.
        optional_text_fields = (
            "slug",
            "title",
            "excerpt",
            "body_html",
            "status",
            "featured_image",
            "published_at",
            "updated_at",
        )
        for name in optional_text_fields:
            setattr(self, name, _normalize_optional_string(getattr(self, name), name))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@dataclass(slots=True)
class SEOOverrides:
    """Optional per-page values that take precedence over derived ones.

    On construction every text field is stripped, and a blank value is stored
    as ``None``. ``schema_jsonld`` is only type-checked; it is stored as given.

    Raises:
        ValueError: if a text field is neither a string nor ``None``, or if
            ``schema_jsonld`` is not ``None``, a dict, or a list of dicts.
    """

    meta_title: str | None = None
    meta_description: str | None = None
    canonical_url: str | None = None
    robots: str | None = None
    og_title: str | None = None
    og_description: str | None = None
    og_image: str | None = None
    twitter_card: str | None = None
    twitter_title: str | None = None
    twitter_description: str | None = None
    twitter_image: str | None = None
    schema_jsonld: dict | list[dict] | None = None

    def __post_init__(self) -> None:
        # Normalized in declaration order so the first invalid field is the
        # one reported.
        text_fields = (
            "meta_title",
            "meta_description",
            "canonical_url",
            "robots",
            "og_title",
            "og_description",
            "og_image",
            "twitter_card",
            "twitter_title",
            "twitter_description",
            "twitter_image",
        )
        for name in text_fields:
            setattr(self, name, _normalize_optional_string(getattr(self, name), name))

        schema = self.schema_jsonld
        if schema is None or isinstance(schema, dict):
            return
        if not isinstance(schema, list) or any(
            not isinstance(entry, dict) for entry in schema
        ):
            raise ValueError("schema_jsonld must be dict, list[dict], or None")
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Text extraction utilities for seoslug."""
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from html import unescape
|
|
5
|
+
|
|
6
|
+
_SCRIPT_STYLE_RE = re.compile(
|
|
7
|
+
r"<(script|style)\b[^>]*>.*?</\1>", re.IGNORECASE | re.DOTALL
|
|
8
|
+
)
|
|
9
|
+
_TAG_RE = re.compile(r"<[^>]+>")
|
|
10
|
+
_WS_RE = re.compile(r"\s+")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def html_to_text(html: str | None) -> str:
    """Reduce an HTML fragment to plain, whitespace-normalized text.

    Script and style elements are removed together with their contents, the
    remaining tags are replaced by spaces, character references are decoded,
    and runs of whitespace collapse to one space. ``None`` and the empty
    string both yield "".

    Raises:
        ValueError: if ``html`` is neither a string nor ``None``.
    """
    if html is None:
        return ""
    if not isinstance(html, str):
        raise ValueError("html must be a string or None")
    if html == "":
        return ""

    without_code = _SCRIPT_STYLE_RE.sub(" ", html)
    without_tags = _TAG_RE.sub(" ", without_code)
    decoded = unescape(without_tags)
    return _WS_RE.sub(" ", decoded).strip()
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def build_description_snippet(body_html: str | None, max_length: int = 160) -> str:
    """Return the plain text of ``body_html`` limited to ``max_length`` characters.

    Text that already fits is returned unchanged. Longer text is cut so that,
    with a trailing "...", it does not exceed ``max_length``; whitespace just
    before the ellipsis is removed. When ``max_length`` is 3 or less, the
    result for over-long text is that many dots.

    Raises:
        ValueError: if ``max_length`` is not a positive integer.
    """
    if not (isinstance(max_length, int) and max_length > 0):
        raise ValueError("max_length must be a positive integer")

    plain = html_to_text(body_html)
    if len(plain) <= max_length:
        return plain
    if max_length <= 3:
        return "." * max_length
    head = plain[: max_length - 3].rstrip()
    return f"{head}..."
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: seoslug
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Framework-agnostic canonical URL normalization and SEO payload generation
|
|
5
|
+
Author: seoslug contributors
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Documentation, https://deepwiki.com/emiliano-gandini-outeda/seoslug/
|
|
8
|
+
Keywords: seo,canonical,urls,metadata,open-graph,twitter-cards
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Dynamic: license-file
|
|
22
|
+
|
|
23
|
+
# seoslug
|
|
24
|
+
|
|
25
|
+
[DeepWiki documentation](https://deepwiki.com/emiliano-gandini-outeda/seoslug/)
|
|
26
|
+
|
|
27
|
+
Canonical URL normalization and deterministic SEO payload generation for content platforms.
|
|
28
|
+
|
|
29
|
+
## Installation
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install seoslug
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
For local development:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
pip install -e .
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Quick usage
|
|
42
|
+
|
|
43
|
+
```python
|
|
44
|
+
from seoslug import SEOConfig, URLPolicy, SEOEntity, build_seo_payload
|
|
45
|
+
|
|
46
|
+
config = SEOConfig(
|
|
47
|
+
canonical_host="portal.example.com",
|
|
48
|
+
public_base_url="https://portal.example.com",
|
|
49
|
+
url_policy=URLPolicy(
|
|
50
|
+
enforce_https=True,
|
|
51
|
+
lowercase_paths=True,
|
|
52
|
+
trailing_slash="never",
|
|
53
|
+
collapse_duplicate_slashes=True,
|
|
54
|
+
strip_tracking_params=True,
|
|
55
|
+
allowed_query_params=["page", "q"],
|
|
56
|
+
),
|
|
57
|
+
default_og_image="https://cdn.example.com/default.jpg",
|
|
58
|
+
)
|
|
59
|
+
|
|
60
|
+
entity = SEOEntity(
|
|
61
|
+
entity_type="post",
|
|
62
|
+
slug="my-post",
|
|
63
|
+
title="My Post",
|
|
64
|
+
excerpt="Example excerpt",
|
|
65
|
+
body_html="<p>Body content</p>",
|
|
66
|
+
status="published",
|
|
67
|
+
featured_image="https://cdn.example.com/post.jpg",
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
payload = build_seo_payload(entity, "/posts/my-post", config)
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
Full docs, API reference, and usage examples are in `docs/` and published with MkDocs.
|
|
74
|
+
|
|
75
|
+
## License
|
|
76
|
+
|
|
77
|
+
MIT, see `LICENSE`.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
src/seoslug/__init__.py
|
|
5
|
+
src/seoslug/builder.py
|
|
6
|
+
src/seoslug/config.py
|
|
7
|
+
src/seoslug/jsonld.py
|
|
8
|
+
src/seoslug/normalization.py
|
|
9
|
+
src/seoslug/schemas.py
|
|
10
|
+
src/seoslug/text.py
|
|
11
|
+
src/seoslug.egg-info/PKG-INFO
|
|
12
|
+
src/seoslug.egg-info/SOURCES.txt
|
|
13
|
+
src/seoslug.egg-info/dependency_links.txt
|
|
14
|
+
src/seoslug.egg-info/top_level.txt
|
|
15
|
+
tests/test_builder.py
|
|
16
|
+
tests/test_fallbacks.py
|
|
17
|
+
tests/test_jsonld.py
|
|
18
|
+
tests/test_normalization.py
|
|
19
|
+
tests/test_regression_fixtures.py
|
|
20
|
+
tests/test_robots_rules.py
|
|
21
|
+
tests/test_schemas.py
|
|
22
|
+
tests/test_text.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
seoslug
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
"""Tests for SEO payload builder."""
|
|
2
|
+
|
|
3
|
+
from seoslug import SEOConfig, SEOEntity, SEOOverrides, URLPolicy, build_seo_payload
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _config() -> SEOConfig:
    """Build the SEOConfig shared by the tests in this module."""
    policy = URLPolicy()
    settings = SEOConfig(
        canonical_host="portal.example.com",
        public_base_url="https://portal.example.com",
        url_policy=policy,
        default_og_image="https://cdn.example.com/default.jpg",
        site_name="Portal",
        title_template="{title}",
    )
    return settings
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def test_payload_contract_shape() -> None:
    """The payload has exactly the documented top-level keys and URL wiring."""
    post = SEOEntity(
        entity_type="post",
        slug="my-post",
        title="My Post",
        excerpt="Example excerpt",
        body_html="<p>Body</p>",
        status="published",
        featured_image="https://cdn.example.com/post.jpg",
    )
    result = build_seo_payload(post, "/posts/my-post", _config())

    expected_keys = {
        "title",
        "description",
        "canonical",
        "robots",
        "og",
        "twitter",
        "schema_jsonld",
    }
    assert set(result) == expected_keys
    assert result["canonical"] == "https://portal.example.com/posts/my-post"
    assert result["og"]["url"] == result["canonical"]
    assert result["twitter"]["card"] == "summary_large_image"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_canonical_override_and_schema_passthrough() -> None:
    """A canonical override wins over the route, and JSON-LD is passed through."""
    page = SEOEntity(entity_type="page", title="About")
    custom = SEOOverrides(
        canonical_url="https://portal.example.com/custom-about",
        schema_jsonld={"@context": "https://schema.org", "@type": "WebPage"},
    )
    result = build_seo_payload(page, "/about", _config(), custom)
    assert result["canonical"] == "https://portal.example.com/custom-about"
    assert result["schema_jsonld"]["@type"] == "WebPage"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def test_twitter_override_precedence() -> None:
    """Twitter-specific overrides are used for the twitter block; og_title for og."""
    post = SEOEntity(
        entity_type="post",
        title="Entity Title",
        excerpt="Entity Excerpt",
    )
    custom = SEOOverrides(
        og_title="OG Title",
        twitter_title="Twitter Title",
        twitter_description="Twitter Description",
    )

    result = build_seo_payload(post, "/posts/t", _config(), custom)
    twitter_block = result["twitter"]

    assert result["og"]["title"] == "OG Title"
    assert twitter_block["title"] == "Twitter Title"
    assert twitter_block["description"] == "Twitter Description"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_schema_list_passthrough() -> None:
    """A list-valued schema override comes back equal in the payload."""
    schema_items = [{"@type": "BreadcrumbList"}, {"@type": "WebPage"}]
    docs_page = SEOEntity(entity_type="page", title="Docs")
    custom = SEOOverrides(schema_jsonld=schema_items)

    result = build_seo_payload(docs_page, "/docs", _config(), custom)

    assert result["schema_jsonld"] == schema_items
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def test_title_template_is_applied() -> None:
    """The configured title_template is used to render the final title."""
    templated_config = SEOConfig(
        canonical_host="portal.example.com",
        public_base_url="https://portal.example.com",
        url_policy=URLPolicy(),
        title_template="{title} | Portal",
    )
    about_page = SEOEntity(entity_type="page", title="About")

    result = build_seo_payload(about_page, "/about", templated_config)

    assert result["title"] == "About | Portal"
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def test_description_prefers_excerpt_over_body_snippet() -> None:
    """When both excerpt and body_html are set, the excerpt is the description."""
    post = SEOEntity(
        entity_type="post",
        excerpt="Excerpt text",
        body_html="<p>Body fallback text</p>",
    )

    result = build_seo_payload(post, "/posts/p", _config())

    assert result["description"] == "Excerpt text"
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def test_twitter_falls_back_to_og_values() -> None:
    """Without twitter overrides, the twitter block mirrors the og overrides."""
    og_only = SEOOverrides(
        og_title="OG T",
        og_description="OG D",
        og_image="https://cdn.example.com/og.jpg",
    )
    post = SEOEntity(entity_type="post", title="Entity")

    twitter_block = build_seo_payload(post, "/posts/p", _config(), og_only)["twitter"]

    assert twitter_block["title"] == "OG T"
    assert twitter_block["description"] == "OG D"
    assert twitter_block["image"] == "https://cdn.example.com/og.jpg"
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Tests for fallback hierarchy behavior."""
|
|
2
|
+
|
|
3
|
+
from seoslug import SEOConfig, SEOEntity, SEOOverrides, URLPolicy, build_seo_payload
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _config() -> SEOConfig:
    """Return the SEOConfig shared by the fallback tests in this module."""
    settings = {
        "canonical_host": "portal.example.com",
        "public_base_url": "https://portal.example.com",
        "url_policy": URLPolicy(),
        "default_og_image": "https://cdn.example.com/default.jpg",
    }
    return SEOConfig(**settings)
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_title_and_description_fallbacks() -> None:
    """Missing title yields "Untitled"; missing excerpt falls back to body text."""
    bare_post = SEOEntity(
        entity_type="post",
        title=None,
        excerpt=None,
        body_html="<p>Hello body</p>",
    )

    result = build_seo_payload(bare_post, "/x", _config())

    assert result["title"] == "Untitled"
    assert result["description"] == "Hello body"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_override_precedence() -> None:
    """meta_title / meta_description overrides win over the entity's own values."""
    post = SEOEntity(entity_type="post", title="Entity title", excerpt="Entity desc")
    meta_overrides = SEOOverrides(
        meta_title="Override title",
        meta_description="Override desc",
    )

    result = build_seo_payload(post, "/x", _config(), meta_overrides)

    assert result["title"] == "Override title"
    assert result["description"] == "Override desc"
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_og_and_twitter_image_fallbacks() -> None:
    """With no featured image, og and twitter both use the config default image."""
    imageless_post = SEOEntity(entity_type="post", featured_image=None)

    result = build_seo_payload(imageless_post, "/x", _config())

    assert result["og"]["image"] == "https://cdn.example.com/default.jpg"
    assert result["twitter"]["image"] == "https://cdn.example.com/default.jpg"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def test_canonical_fallback_uses_normalized_route() -> None:
    """Without an override, canonical is built from the normalized route."""
    messy_route = "/About//Team?utm_source=x"
    about_page = SEOEntity(entity_type="page", title="About")

    result = build_seo_payload(about_page, messy_route, _config())

    assert result["canonical"] == "https://portal.example.com/about/team"
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def test_og_image_precedence_override_then_entity_then_default() -> None:
    """og.image resolves as: override first, then entity image, then config default."""
    post_with_image = SEOEntity(
        entity_type="post",
        featured_image="https://cdn.example.com/entity.jpg",
    )

    # 1) An explicit og_image override beats the entity's featured image.
    image_override = SEOOverrides(og_image="https://cdn.example.com/override.jpg")
    overridden = build_seo_payload(post_with_image, "/x", _config(), image_override)
    assert overridden["og"]["image"] == "https://cdn.example.com/override.jpg"

    # 2) No override: the entity's featured image is used.
    from_entity = build_seo_payload(post_with_image, "/x", _config())
    assert from_entity["og"]["image"] == "https://cdn.example.com/entity.jpg"

    # 3) No override and no entity image: the configured default is used.
    post_without_image = SEOEntity(entity_type="post")
    from_default = build_seo_payload(post_without_image, "/x", _config())
    assert from_default["og"]["image"] == "https://cdn.example.com/default.jpg"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def test_twitter_override_fields_take_highest_precedence() -> None:
    """Twitter overrides win even when og overrides are supplied alongside them."""
    post = SEOEntity(entity_type="post", title="Entity", excerpt="Excerpt")
    both_overrides = SEOOverrides(
        twitter_card="summary",
        twitter_title="Tw Title",
        twitter_description="Tw Desc",
        twitter_image="https://cdn.example.com/tw.jpg",
        og_title="OG Title",
        og_description="OG Desc",
        og_image="https://cdn.example.com/og.jpg",
    )

    twitter_block = build_seo_payload(post, "/x", _config(), both_overrides)["twitter"]

    assert twitter_block["card"] == "summary"
    assert twitter_block["title"] == "Tw Title"
    assert twitter_block["description"] == "Tw Desc"
    assert twitter_block["image"] == "https://cdn.example.com/tw.jpg"
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Tests for JSON-LD normalization helpers."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from seoslug.jsonld import normalize_schema_jsonld
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_normalize_schema_jsonld_none_to_empty_dict() -> None:
    """Passing None yields an empty dict."""
    normalized = normalize_schema_jsonld(None)
    assert normalized == {}
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def test_normalize_schema_jsonld_returns_copy_for_dict() -> None:
    """A dict input comes back equal but as a distinct object."""
    original = {"@type": "WebPage", "name": "Home"}

    result = normalize_schema_jsonld(original)

    assert result == original
    assert result is not original
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def test_normalize_schema_jsonld_returns_copy_for_list() -> None:
    """A list input comes back equal but as a distinct list object."""
    original = [{"@type": "WebPage"}, {"@type": "BreadcrumbList"}]

    result = normalize_schema_jsonld(original)

    assert result == original
    assert result is not original
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_normalize_schema_jsonld_rejects_invalid_type() -> None:
    """A plain string is not an accepted schema value and raises ValueError."""
    raises_value_error = pytest.raises(ValueError)
    with raises_value_error:
        normalize_schema_jsonld("bad")  # type: ignore[arg-type]
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""Tests for URL normalization."""
|
|
2
|
+
|
|
3
|
+
from seoslug import SEOConfig, URLPolicy, normalize_path, normalize_public_url
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def _config(policy: URLPolicy | None = None) -> SEOConfig:
    """Return an SEOConfig for the portal host, using ``policy`` when one is given."""
    # Fall back to a default-constructed policy when the caller supplies none.
    effective_policy = policy or URLPolicy()
    return SEOConfig(
        canonical_host="portal.example.com",
        public_base_url="https://portal.example.com",
        url_policy=effective_policy,
    )
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def test_normalize_path_lowercase_and_slashes() -> None:
    """Lowercasing and duplicate-slash collapsing are both applied to the path."""
    strict_policy = URLPolicy(lowercase_paths=True, collapse_duplicate_slashes=True)

    normalized = normalize_path("//Blog//My-Post//", strict_policy)

    assert normalized == "/blog/my-post"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_trailing_slash_modes() -> None:
    """Check the "always", "never" and "preserve" trailing_slash modes in turn."""
    cases = (
        # (input path, trailing_slash mode, expected output)
        ("/blog/post", "always", "/blog/post/"),
        ("/blog/post/", "never", "/blog/post"),
        ("/blog/post/", "preserve", "/blog/post/"),
    )
    for raw_path, mode, expected in cases:
        assert normalize_path(raw_path, URLPolicy(trailing_slash=mode)) == expected
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_normalize_public_url_enforces_host_https_and_query_rules() -> None:
    """Host and scheme are rewritten and only allow-listed query params survive."""
    allowlist_policy = URLPolicy(allowed_query_params=["page", "q"])
    raw_url = "http://other.example.com//Blog/Post?utm_source=x&gclid=1&page=2&q=abc&bad=1"

    normalized = normalize_public_url(raw_url, _config(allowlist_policy))

    assert normalized == "https://portal.example.com/blog/post?page=2&q=abc"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_normalize_public_url_idempotent() -> None:
    """Normalizing an already-normalized URL returns it unchanged."""
    cfg = _config()

    once = normalize_public_url("/A//B/?utm_campaign=x", cfg)
    twice = normalize_public_url(once, cfg)

    assert once == twice
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def test_accepts_relative_path_with_query() -> None:
    """A relative path without a leading slash is accepted and fully normalized."""
    relative_input = "posts/My-Post?fbclid=123&page=1"

    normalized = normalize_public_url(relative_input, _config())

    assert normalized == "https://portal.example.com/posts/my-post?page=1"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def test_enforce_https_toggle_uses_public_base_scheme() -> None:
    """With enforce_https off, the result takes the scheme of public_base_url."""
    http_config = SEOConfig(
        canonical_host="portal.example.com",
        public_base_url="http://portal.example.com",
        url_policy=URLPolicy(enforce_https=False),
    )

    normalized = normalize_public_url("https://other.example.com/a", http_config)

    assert normalized == "http://portal.example.com/a"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def test_tracking_strip_toggle() -> None:
    """With strip_tracking_params off, tracking parameters are left in place."""
    keep_tracking = _config(URLPolicy(strip_tracking_params=False))

    normalized = normalize_public_url("/p?utm_source=x&gclid=1&fbclid=2", keep_tracking)

    assert normalized == "https://portal.example.com/p?utm_source=x&gclid=1&fbclid=2"
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_no_allowlist_keeps_non_tracking_params() -> None:
    """With a default policy, ordinary params stay and utm_campaign is dropped."""
    default_config = _config(URLPolicy())

    normalized = normalize_public_url("/p?a=1&b=2&utm_campaign=x", default_config)

    assert normalized == "https://portal.example.com/p?a=1&b=2"
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def test_malformed_url_raises_value_error() -> None:
    """An absolute URL with an empty host is rejected with ValueError."""
    cfg = _config()
    broken_url = "https:///broken"
    with pytest.raises(ValueError):
        normalize_public_url(broken_url, cfg)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def test_host_is_always_enforced_for_absolute_input() -> None:
    """A foreign host on an absolute URL is replaced by the canonical host."""
    foreign_url = "https://evil.example.org/path"

    result = normalize_public_url(foreign_url, _config())

    assert result == "https://portal.example.com/path"
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
def test_can_disable_duplicate_slash_collapse_and_lowercase() -> None:
    """With every normalization switched off, the path is returned untouched."""
    raw_path = "//Blog//Post//"
    passthrough_policy = URLPolicy(
        collapse_duplicate_slashes=False,
        lowercase_paths=False,
        trailing_slash="preserve",
    )

    assert normalize_path(raw_path, passthrough_policy) == "//Blog//Post//"
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Regression fixtures for representative entity types."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from seoslug import SEOConfig, SEOEntity, URLPolicy, build_seo_payload
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _config() -> SEOConfig:
    """Return the SEOConfig shared by the regression snapshots in this module."""
    settings = {
        "canonical_host": "portal.example.com",
        "public_base_url": "https://portal.example.com",
        "url_policy": URLPolicy(),
        "default_og_image": "https://cdn.example.com/default.jpg",
        "search_robots": "noindex,follow",
    }
    return SEOConfig(**settings)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _expected_snapshot(
    entity_type: str,
    canonical: str,
    og_type: str,
    robots: str = "index,follow",
) -> dict:
    """Build the full expected payload for one entity-type snapshot.

    Title and description follow the ``"<entity_type> title"`` /
    ``"<entity_type> excerpt"`` pattern that the test uses when it
    constructs the entity; the image is always the config default because
    the test entities carry no featured image.
    """
    title = f"{entity_type} title"
    description = f"{entity_type} excerpt"
    image = "https://cdn.example.com/default.jpg"
    return {
        "title": title,
        "description": description,
        "canonical": canonical,
        "robots": robots,
        "og": {
            "type": og_type,
            "title": title,
            "description": description,
            "url": canonical,
            "image": image,
        },
        "twitter": {
            "card": "summary_large_image",
            "title": title,
            "description": description,
            "image": image,
        },
        "schema_jsonld": {},
    }


@pytest.mark.parametrize(
    ("entity_type", "route", "expected"),
    [
        (
            "home",
            "/",
            _expected_snapshot("home", "https://portal.example.com/", "website"),
        ),
        (
            "post",
            "/posts/p",
            _expected_snapshot("post", "https://portal.example.com/posts/p", "article"),
        ),
        (
            "page",
            "/about",
            _expected_snapshot("page", "https://portal.example.com/about", "website"),
        ),
        (
            "video",
            "/videos/v",
            _expected_snapshot("video", "https://portal.example.com/videos/v", "article"),
        ),
        (
            "taxonomy",
            "/topics/python",
            _expected_snapshot(
                "taxonomy",
                "https://portal.example.com/topics/python",
                "website",
            ),
        ),
        (
            "search",
            "/search?q=x",
            # Search is the only snapshot whose robots value is "noindex,follow".
            _expected_snapshot(
                "search",
                "https://portal.example.com/search?q=x",
                "website",
                robots="noindex,follow",
            ),
        ),
    ],
)
def test_regression_entity_type_snapshots(
    entity_type: str,
    route: str,
    expected: dict,
) -> None:
    """Compare the whole payload for each entity type against its snapshot."""
    published_entity = SEOEntity(
        entity_type=entity_type,
        title=f"{entity_type} title",
        excerpt=f"{entity_type} excerpt",
        status="published",
    )

    actual = build_seo_payload(published_entity, route, _config())

    assert actual == expected
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"""Tests for robots rule behavior."""
|
|
2
|
+
|
|
3
|
+
from seoslug import SEOConfig, SEOEntity, SEOOverrides, URLPolicy, build_seo_payload
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _config() -> SEOConfig:
    """Return the SEOConfig shared by the robots tests, with explicit robots values."""
    settings = {
        "canonical_host": "portal.example.com",
        "public_base_url": "https://portal.example.com",
        "url_policy": URLPolicy(),
        "default_robots": "index,follow",
        "search_robots": "noindex,follow",
    }
    return SEOConfig(**settings)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def test_published_content_defaults_to_index_follow() -> None:
    """A published post with no override gets "index,follow"."""
    published_post = SEOEntity(entity_type="post", status="published")

    result = build_seo_payload(published_post, "/posts/x", _config())

    assert result["robots"] == "index,follow"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def test_search_uses_search_robots() -> None:
    """A search entity gets the configured search_robots value."""
    search_entity = SEOEntity(entity_type="search", status="published")

    result = build_seo_payload(search_entity, "/search?q=x", _config())

    assert result["robots"] == "noindex,follow"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_override_robots_wins() -> None:
    """An explicit robots override is returned as-is."""
    draft_post = SEOEntity(entity_type="post", status="draft")
    robots_override = SEOOverrides(robots="noindex,nofollow")

    result = build_seo_payload(draft_post, "/posts/x", _config(), robots_override)

    assert result["robots"] == "noindex,nofollow"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def test_non_published_uses_default_robots() -> None:
    """A draft post with no override still gets the configured default robots."""
    draft_post = SEOEntity(entity_type="post", status="draft")

    result = build_seo_payload(draft_post, "/posts/x", _config())

    assert result["robots"] == "index,follow"
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Tests for SEOEntity and SEOOverrides models."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from seoslug import SEOEntity, SEOOverrides
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_entity_normalizes_optional_string_fields() -> None:
    """Optional strings are stripped, and a whitespace-only value becomes None."""
    raw_fields = {
        "slug": " my-post ",
        "title": " Hello ",
        "excerpt": " ",
    }

    post = SEOEntity(entity_type="post", **raw_fields)

    assert post.slug == "my-post"
    assert post.title == "Hello"
    assert post.excerpt is None
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_entity_rejects_invalid_entity_type() -> None:
    """Constructing an entity with an unknown entity_type raises ValueError."""
    unknown_type = "invalid"
    with pytest.raises(ValueError):
        SEOEntity(entity_type=unknown_type)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_entity_rejects_non_string_optional_field() -> None:
    """A non-string value for an optional string field raises ValueError."""
    raises_value_error = pytest.raises(ValueError)
    with raises_value_error:
        SEOEntity(entity_type="post", title=123)  # type: ignore[arg-type]
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def test_overrides_normalize_and_validate_schema_type() -> None:
    """Override strings are normalized, and a non-dict schema list item is rejected."""
    # Normalization: surrounding whitespace is stripped, blank values become None.
    normalized = SEOOverrides(meta_title=" A ", robots=" ")
    assert normalized.meta_title == "A"
    assert normalized.robots is None

    # Validation: a list mixing a dict with a plain string is refused.
    with pytest.raises(ValueError):
        SEOOverrides(schema_jsonld=[{"@type": "WebPage"}, "bad"])  # type: ignore[list-item]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Tests for text extraction utilities."""
|
|
2
|
+
|
|
3
|
+
import pytest
|
|
4
|
+
|
|
5
|
+
from seoslug.text import build_description_snippet, html_to_text
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def test_html_to_text_strips_tags_and_script_content() -> None:
    """Tags are removed and the contents of <script> are dropped entirely."""
    markup = "<h1>Hello</h1><script>alert('x')</script><p>World</p>"

    plain = html_to_text(markup)

    assert plain == "Hello World"
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def test_snippet_truncation() -> None:
    """A long body is cut to exactly max_length characters, ending in "..."."""
    limit = 20
    long_html = f"<p>{'a' * 200}</p>"

    result = build_description_snippet(long_html, max_length=limit)

    assert result.endswith("...")
    assert len(result) == limit
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def test_html_to_text_normalizes_whitespace_and_style() -> None:
    """<style> content is dropped and runs of whitespace collapse to one space."""
    markup = "<style>.x{color:red;}</style><p> Hello\n\tWorld </p>"

    plain = html_to_text(markup)

    assert plain == "Hello World"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_invalid_inputs_raise_value_error() -> None:
    """A non-string input and a zero max_length each raise ValueError."""
    invalid_calls = (
        lambda: html_to_text(123),  # type: ignore[arg-type]
        lambda: build_description_snippet("<p>ok</p>", max_length=0),
    )
    for invalid_call in invalid_calls:
        with pytest.raises(ValueError):
            invalid_call()
|