py-sitemap-parser 2.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,193 @@
1
+ Metadata-Version: 2.3
2
+ Name: py-sitemap-parser
3
+ Version: 2.0.0
4
+ Summary: Simple sitemap parser for Python
5
+ Keywords: sitemap,xml,parser,seo,sitemap-parser,sitemap-index
6
+ Author: Joakim Hellsén
7
+ Author-email: Joakim Hellsén <tlovinator@gmail.com>
8
+ License: MIT
9
+ Classifier: Development Status :: 5 - Production/Stable
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
20
+ Classifier: Topic :: Text Processing :: Markup :: XML
21
+ Classifier: Typing :: Typed
22
+ Requires-Dist: lxml
23
+ Requires-Dist: niquests
24
+ Requires-Dist: python-dateutil
25
+ Requires-Dist: types-xmltodict
26
+ Requires-Dist: xmltodict
27
+ Requires-Python: >=3.9
28
+ Project-URL: Homepage, https://github.com/M4hbod/sitemap-parser
29
+ Project-URL: Repository, https://github.com/M4hbod/sitemap-parser
30
+ Project-URL: Issues, https://github.com/M4hbod/sitemap-parser/issues
31
+ Project-URL: Documentation, https://github.com/M4hbod/sitemap-parser#readme
32
+ Description-Content-Type: text/markdown
33
+
34
+ # Sitemap Parser
35
+
36
+ <p align="center">
37
+ <img src="https://github.com/M4hbod/sitemap-parser/blob/master/.github/logo.png?raw=true" title="Robot searching for sitemaps" alt="Robot searching for sitemaps" width="300" height="300" />
38
+ </p>
39
+
40
+ This is a Python library designed to parse XML sitemaps and sitemap index files from a given URL. It supports both standard XML sitemaps (which contain URLs) and sitemap index files (which contain links to other sitemaps). This tool is useful for extracting data such as URLs and modification dates from website sitemaps.
41
+
42
+ ## Acknowledgments
43
+
44
+ This is a fork of [Dave O'Connor](https://github.com/daveoconnor)'s [site-map-parser](https://github.com/daveoconnor/site-map-parser). I couldn't have done this without his original work.
45
+
46
+
47
+ ## Installation
48
+
49
+ ```sh
50
+ uv add py-sitemap-parser
51
+ ```
52
+
53
+ ## Usage
54
+
55
+ The library provides a SiteMapParser class that can be used to parse sitemaps and sitemap indexes. You can pass a URL or raw XML data to the parser to extract the URLs or links to other sitemaps.
56
+
57
+ ### Parsing a Sitemap from a URL
58
+
59
+ ```python
60
+ import logging
61
+ from typing import TYPE_CHECKING
62
+
63
+ from sitemap_parser import SiteMapParser
64
+
65
+ if TYPE_CHECKING:
66
+ from sitemap_parser import SitemapIndex
67
+ from sitemap_parser import UrlSet
68
+
69
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
70
+ logger: logging.Logger = logging.getLogger(__name__)
71
+
72
+
73
+ # url = "https://ttvdrops.lovinator.space/sitemap.xml" # Sitemap index
74
+ url = "https://ttvdrops.lovinator.space/sitemap-static.xml" # Sitemap with URLs
75
+ parser = SiteMapParser(source=url)
76
+
77
+ if parser.has_sitemaps():
78
+ sitemaps: SitemapIndex = parser.get_sitemaps()
79
+ for sitemap in sitemaps:
80
+ logger.info(sitemap)
81
+
82
+ elif parser.has_urls():
83
+ urls: UrlSet = parser.get_urls()
84
+ for url in urls:
85
+ logger.info(url)
86
+ ```
87
+
88
+ ### Parsing a Raw XML String
89
+
90
+ ```python
91
+ from sitemap_parser import SiteMapParser, UrlSet
92
+
93
+ xml_data = """
94
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
95
+ <url>
96
+ <loc>https://example.com/</loc>
97
+ <lastmod>2023-09-27</lastmod>
98
+ <changefreq>daily</changefreq>
99
+ <priority>1.0</priority>
100
+ </url>
101
+ <url>
102
+ <loc>https://example.com/about</loc>
103
+ <lastmod>2023-09-27</lastmod>
104
+ <changefreq>daily</changefreq>
105
+ <priority>0.8</priority>
106
+ </url>
107
+ </urlset>
108
+ """
109
+ parser = SiteMapParser(source=xml_data, is_data_string=True)
110
+ urls: UrlSet = parser.get_urls()
111
+ for url in urls:
112
+ print(url)
113
+
114
+ # Output:
115
+ # - https://example.com/
116
+ # - https://example.com/about
117
+ ```
118
+
119
+ ### Exporting Sitemap Data to JSON
120
+
121
+ You can export the parsed sitemap data to a JSON string using the JSONExporter class.
122
+
123
+ ```python
124
+ import json
125
+ import logging
126
+
127
+ from sitemap_parser import JSONExporter
128
+ from sitemap_parser import SiteMapParser
129
+
130
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
131
+ logger: logging.Logger = logging.getLogger(__name__)
132
+
133
+ # Sitemap with URLs to other sitemaps
134
+ parser = SiteMapParser(source="https://ttvdrops.lovinator.space/sitemap.xml")
135
+
136
+ if parser.has_sitemaps():
137
+ json_data: str = JSONExporter(data=parser).export_sitemaps()
138
+ json_data = json.loads(json_data)
139
+ logger.info("Exported sitemaps: %s", json_data)
140
+
141
+ logger.info("----" * 10)
142
+
143
+ # Sitemap with "real" URLs
144
+ parser2 = SiteMapParser(
145
+ source="https://ttvdrops.lovinator.space/sitemap-static.xml",
146
+ )
147
+
148
+ if parser2.has_urls():
149
+ json_data: str = JSONExporter(data=parser2).export_urls()
150
+ json_data = json.loads(json_data)
151
+ logger.info("Exported URLs: %s", json_data)
152
+ ```
153
+
154
+ ### Converting Sitemap XML to a Python dict
155
+
156
+ If you'd like to work with the parsed sitemap as a plain Python dictionary, you can use `SiteMapParser.to_dict()`.
157
+
158
+ ```python
159
+ from sitemap_parser import SiteMapParser
160
+
161
+ xml = """
162
+ <urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">
163
+ <url>
164
+ <loc>https://example.com/</loc>
165
+ </url>
166
+ </urlset>
167
+ """
168
+
169
+ parser = SiteMapParser(source=xml, is_data_string=True)
170
+ parsed = parser.to_dict()
171
+
172
+ # xmltodict represents repeated elements as lists
173
+ print(parsed["urlset"]["url"][0]["loc"])
174
+ ```
175
+
176
+ You can also enable namespace processing for expanded namespace keys:
177
+
178
+ ```python
179
+ parsed = parser.to_dict(process_namespaces=True)
180
+ ```
181
+
182
+ ## Disabling Logging
183
+
184
+ If you want to disable logging, you can adjust the logging level to logging.CRITICAL or higher. This will suppress all log messages below the CRITICAL level.
185
+
186
+ Here's an example of how to do this:
187
+
188
+ ```python
189
+ import logging
190
+
191
+ # Set the logging level to CRITICAL to disable logging
192
+ logging.getLogger("sitemap_parser").setLevel(logging.CRITICAL)
193
+ ```
@@ -0,0 +1,160 @@
1
+ # Sitemap Parser
2
+
3
+ <p align="center">
4
+ <img src="https://github.com/M4hbod/sitemap-parser/blob/master/.github/logo.png?raw=true" title="Robot searching for sitemaps" alt="Robot searching for sitemaps" width="300" height="300" />
5
+ </p>
6
+
7
+ This is a Python library designed to parse XML sitemaps and sitemap index files from a given URL. It supports both standard XML sitemaps (which contain URLs) and sitemap index files (which contain links to other sitemaps). This tool is useful for extracting data such as URLs and modification dates from website sitemaps.
8
+
9
+ ## Acknowledgments
10
+
11
+ This is a fork of [Dave O'Connor](https://github.com/daveoconnor)'s [site-map-parser](https://github.com/daveoconnor/site-map-parser). I couldn't have done this without his original work.
12
+
13
+
14
+ ## Installation
15
+
16
+ ```sh
17
+ uv add py-sitemap-parser
18
+ ```
19
+
20
+ ## Usage
21
+
22
+ The library provides a SiteMapParser class that can be used to parse sitemaps and sitemap indexes. You can pass a URL or raw XML data to the parser to extract the URLs or links to other sitemaps.
23
+
24
+ ### Parsing a Sitemap from a URL
25
+
26
+ ```python
27
+ import logging
28
+ from typing import TYPE_CHECKING
29
+
30
+ from sitemap_parser import SiteMapParser
31
+
32
+ if TYPE_CHECKING:
33
+ from sitemap_parser import SitemapIndex
34
+ from sitemap_parser import UrlSet
35
+
36
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
37
+ logger: logging.Logger = logging.getLogger(__name__)
38
+
39
+
40
+ # url = "https://ttvdrops.lovinator.space/sitemap.xml" # Sitemap index
41
+ url = "https://ttvdrops.lovinator.space/sitemap-static.xml" # Sitemap with URLs
42
+ parser = SiteMapParser(source=url)
43
+
44
+ if parser.has_sitemaps():
45
+ sitemaps: SitemapIndex = parser.get_sitemaps()
46
+ for sitemap in sitemaps:
47
+ logger.info(sitemap)
48
+
49
+ elif parser.has_urls():
50
+ urls: UrlSet = parser.get_urls()
51
+ for url in urls:
52
+ logger.info(url)
53
+ ```
54
+
55
+ ### Parsing a Raw XML String
56
+
57
+ ```python
58
+ from sitemap_parser import SiteMapParser, UrlSet
59
+
60
+ xml_data = """
61
+ <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
62
+ <url>
63
+ <loc>https://example.com/</loc>
64
+ <lastmod>2023-09-27</lastmod>
65
+ <changefreq>daily</changefreq>
66
+ <priority>1.0</priority>
67
+ </url>
68
+ <url>
69
+ <loc>https://example.com/about</loc>
70
+ <lastmod>2023-09-27</lastmod>
71
+ <changefreq>daily</changefreq>
72
+ <priority>0.8</priority>
73
+ </url>
74
+ </urlset>
75
+ """
76
+ parser = SiteMapParser(source=xml_data, is_data_string=True)
77
+ urls: UrlSet = parser.get_urls()
78
+ for url in urls:
79
+ print(url)
80
+
81
+ # Output:
82
+ # - https://example.com/
83
+ # - https://example.com/about
84
+ ```
85
+
86
+ ### Exporting Sitemap Data to JSON
87
+
88
+ You can export the parsed sitemap data to a JSON string using the JSONExporter class.
89
+
90
+ ```python
91
+ import json
92
+ import logging
93
+
94
+ from sitemap_parser import JSONExporter
95
+ from sitemap_parser import SiteMapParser
96
+
97
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
98
+ logger: logging.Logger = logging.getLogger(__name__)
99
+
100
+ # Sitemap with URLs to other sitemaps
101
+ parser = SiteMapParser(source="https://ttvdrops.lovinator.space/sitemap.xml")
102
+
103
+ if parser.has_sitemaps():
104
+ json_data: str = JSONExporter(data=parser).export_sitemaps()
105
+ json_data = json.loads(json_data)
106
+ logger.info("Exported sitemaps: %s", json_data)
107
+
108
+ logger.info("----" * 10)
109
+
110
+ # Sitemap with "real" URLs
111
+ parser2 = SiteMapParser(
112
+ source="https://ttvdrops.lovinator.space/sitemap-static.xml",
113
+ )
114
+
115
+ if parser2.has_urls():
116
+ json_data: str = JSONExporter(data=parser2).export_urls()
117
+ json_data = json.loads(json_data)
118
+ logger.info("Exported URLs: %s", json_data)
119
+ ```
120
+
121
+ ### Converting Sitemap XML to a Python dict
122
+
123
+ If you'd like to work with the parsed sitemap as a plain Python dictionary, you can use `SiteMapParser.to_dict()`.
124
+
125
+ ```python
126
+ from sitemap_parser import SiteMapParser
127
+
128
+ xml = """
129
+ <urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">
130
+ <url>
131
+ <loc>https://example.com/</loc>
132
+ </url>
133
+ </urlset>
134
+ """
135
+
136
+ parser = SiteMapParser(source=xml, is_data_string=True)
137
+ parsed = parser.to_dict()
138
+
139
+ # xmltodict represents repeated elements as lists
140
+ print(parsed["urlset"]["url"][0]["loc"])
141
+ ```
142
+
143
+ You can also enable namespace processing for expanded namespace keys:
144
+
145
+ ```python
146
+ parsed = parser.to_dict(process_namespaces=True)
147
+ ```
148
+
149
+ ## Disabling Logging
150
+
151
+ If you want to disable logging, you can adjust the logging level to logging.CRITICAL or higher. This will suppress all log messages below the CRITICAL level.
152
+
153
+ Here's an example of how to do this:
154
+
155
+ ```python
156
+ import logging
157
+
158
+ # Set the logging level to CRITICAL to disable logging
159
+ logging.getLogger("sitemap_parser").setLevel(logging.CRITICAL)
160
+ ```
@@ -0,0 +1,113 @@
1
+ [project]
2
+ name = "py-sitemap-parser"
3
+ version = "2.0.0"
4
+ description = "Simple sitemap parser for Python"
5
+ readme = "README.md"
6
+ authors = [{ name = "Joakim Hellsén", email = "tlovinator@gmail.com" }]
7
+ license = { text = "MIT" }
8
+ requires-python = ">=3.9"
9
+ keywords = ["sitemap", "xml", "parser", "seo", "sitemap-parser", "sitemap-index"]
10
+ classifiers = [
11
+ "Development Status :: 5 - Production/Stable",
12
+ "Intended Audience :: Developers",
13
+ "License :: OSI Approved :: MIT License",
14
+ "Operating System :: OS Independent",
15
+ "Programming Language :: Python :: 3",
16
+ "Programming Language :: Python :: 3.9",
17
+ "Programming Language :: Python :: 3.10",
18
+ "Programming Language :: Python :: 3.11",
19
+ "Programming Language :: Python :: 3.12",
20
+ "Programming Language :: Python :: 3.13",
21
+ "Topic :: Software Development :: Libraries :: Python Modules",
22
+ "Topic :: Text Processing :: Markup :: XML",
23
+ "Typing :: Typed",
24
+ ]
25
+ dependencies = [
26
+ "lxml",
27
+ "niquests",
28
+ "python-dateutil",
29
+ "types-xmltodict",
30
+ "xmltodict",
31
+ ]
32
+
33
+ [project.urls]
34
+ Homepage = "https://github.com/M4hbod/sitemap-parser"
35
+ Repository = "https://github.com/M4hbod/sitemap-parser"
36
+ Issues = "https://github.com/M4hbod/sitemap-parser/issues"
37
+ Documentation = "https://github.com/M4hbod/sitemap-parser#readme"
38
+
39
+ [build-system]
40
+ requires = ["uv_build>=0.10.11,<0.11.0"]
41
+ build-backend = "uv_build"
42
+
43
+ [dependency-groups]
44
+ dev = ["lxml-stubs", "pytest", "pytest-cov"]
45
+
46
+
47
+ [tool.ruff]
48
+ fix = true
49
+ preview = true
50
+ unsafe-fixes = true
51
+
52
+ format.docstring-code-format = true
53
+ format.preview = true
54
+
55
+ lint.future-annotations = true
56
+ lint.isort.force-single-line = true
57
+ lint.pycodestyle.ignore-overlong-task-comments = true
58
+ lint.pydocstyle.convention = "google"
59
+ lint.select = ["ALL"]
60
+
61
+ # Don't automatically remove unused variables
62
+ lint.unfixable = ["F841"]
63
+
64
+ lint.ignore = [
65
+ "ANN002", # Checks that function *args arguments have type annotations.
66
+ "ANN003", # Checks that function **kwargs arguments have type annotations.
67
+ "C901", # Checks for functions with a high McCabe complexity.
68
+ "CPY001", # Checks for the absence of copyright notices within Python files.
69
+ "D100", # Checks for undocumented public module definitions.
70
+ "D104", # Checks for undocumented public package definitions.
71
+ "D105", # Checks for undocumented magic method definitions.
72
+ "D106", # Checks for undocumented public class definitions, for nested classes.
73
+ "E501", # Checks for lines that exceed the specified maximum character length.
74
+ "ERA001", # Checks for commented-out Python code.
75
+ "FIX002", # Checks for "TODO" comments.
76
+ "PLR0911", # Checks for functions or methods with too many return statements.
77
+ "PLR0912", # Checks for functions or methods with too many branches, including (nested) if, elif, and else branches, for loops, try-except clauses, and match and case statements.
78
+ "PLR6301", # Checks for the presence of unused self parameter in methods definitions.
79
+ "RUF012", # Checks for mutable default values in class attributes.
80
+ "RUF067", # Detects the presence of code in __init__.py files.
81
+ "S405", # Checks for imports of the xml.etree.cElementTree and xml.etree.ElementTree modules
82
+
83
+ # Conflicting lint rules when using Ruff's formatter
84
+ # https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
85
+ "COM812", # Checks for the absence of trailing commas.
86
+ "COM819", # Checks for the presence of prohibited trailing commas.
87
+ "D206", # Checks for docstrings that are indented with tabs.
88
+ "D300", # Checks for docstrings that use '''triple single quotes''' instead of """triple double quotes""".
89
+ "E111", # Checks for indentation with a non-multiple of 4 spaces.
90
+ "E114", # Checks for indentation of comments with a non-multiple of 4 spaces.
91
+ "E117", # Checks for over-indented code.
92
+ "ISC001", # Checks for implicitly concatenated strings on a single line.
93
+ "ISC002", # Checks for implicitly concatenated strings that span multiple lines.
94
+ "Q000", # Checks for inline strings that use single quotes or double quotes, depending on the value of the lint.flake8-quotes.inline-quotes option.
95
+ "Q001", # Checks for multiline strings that use single quotes or double quotes, depending on the value of the lint.flake8-quotes.multiline-quotes setting.
96
+ "Q002", # Checks for docstrings that use single quotes or double quotes, depending on the value of the lint.flake8-quotes.docstring-quotes setting.
97
+ "Q003", # Checks for strings that include escaped quotes, and suggests changing the quote style to avoid the need to escape them.
98
+ "W191", # Checks for indentation that uses tabs.
99
+ ]
100
+
101
+ [tool.ruff.lint.per-file-ignores]
102
+ "**/tests/**" = [
103
+ "ARG",
104
+ "FBT",
105
+ "PLR0904",
106
+ "PLR2004",
107
+ "PLR6301",
108
+ "S101",
109
+ "S105",
110
+ "S106",
111
+ "S311",
112
+ "SLF001",
113
+ ]
@@ -0,0 +1,688 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import re
5
+ import typing
6
+ from datetime import datetime
7
+ from json import dumps
8
+ from typing import Any
9
+ from typing import Literal
10
+
11
+ import niquests
12
+ import xmltodict
13
+ from dateutil import parser
14
+
15
+ if typing.TYPE_CHECKING:
16
+ from collections.abc import Generator
17
+ from collections.abc import Iterator
18
+
19
+ from niquests import Response
20
+
21
+ __all__: list[str] = [
22
+ "JSONExporter",
23
+ "SiteMapParser",
24
+ "Sitemap",
25
+ "SitemapIndex",
26
+ "Url",
27
+ "UrlSet",
28
+ ]
29
+
30
+ logger: logging.Logger = logging.getLogger("sitemap_parser")
31
+
32
# Type aliases used throughout this module.
#
# NOTE: plain assignments are used instead of PEP 695 ``type`` statements
# because the ``type X = ...`` statement requires Python 3.12+, while this
# project declares ``requires-python = ">=3.9"`` — the module would fail to
# import with a SyntaxError on 3.9-3.11 otherwise.

# Valid values for a <url>'s <changefreq> element.
Freqs = Literal[
    "always",
    "hourly",
    "daily",
    "weekly",
    "monthly",
    "yearly",
    "never",
]

# Tuple type listing every allowed change frequency, in document order.
ValidFreqs = tuple[
    Literal["always"],
    Literal["hourly"],
    Literal["daily"],
    Literal["weekly"],
    Literal["monthly"],
    Literal["yearly"],
    Literal["never"],
]

# Field names recognised on a <url> element.
Fields = tuple[
    Literal["loc"],
    Literal["lastmod"],
    Literal["changefreq"],
    Literal["priority"],
]

# Field names recognised on a <url> element (identical to ``Fields``).
UrlFields = tuple[
    Literal["loc"],
    Literal["lastmod"],
    Literal["changefreq"],
    Literal["priority"],
]

# Field names recognised on a <sitemap> element.
SitemapFields = tuple[
    Literal["loc"],
    Literal["lastmod"],
]
66
+
67
+
68
# MARK: BaseData
class BaseData:
    """Shared behaviour for sitemap entries.

    Holds the two properties common to <sitemap> and <url> elements:
    the location URL (``loc``) and the last-modification time (``lastmod``).
    """

    def __init__(self) -> None:
        self._lastmod: datetime | None = None
        self._loc: str | None = None

    @property
    def lastmod(self) -> datetime | None:
        """The datetime the resource was last modified, or None if unset."""
        return self._lastmod

    @lastmod.setter
    def lastmod(self, value: str | None) -> None:
        """Parse and store the last-modified timestamp.

        Args:
            value (str | None): An ISO-8601 formatted datetime string, or None.
        """
        # dateutil's isoparse handles the full ISO-8601 grammar (dates,
        # datetimes, offsets) used by sitemap <lastmod> values.
        self._lastmod = None if value is None else parser.isoparse(value)

    @property
    def loc(self) -> str | None:
        """The URL of the resource, or None if unset."""
        return self._loc

    @loc.setter
    def loc(self, value: str | None) -> None:
        """Validate and store the resource URL.

        Args:
            value (str | None): The URL to store.

        Raises:
            TypeError: If ``value`` is not a string.
            ValueError: If ``value`` does not start with http:// or https://.
        """
        if not isinstance(value, str):
            error = "URL must be a string"
            raise TypeError(error)

        if re.match(r"http[s]?://", value) is None:
            error = f"{value} is not a valid URL"
            raise ValueError(error)

        self._loc = value
131
+
132
+
133
def download_uri_data(
    uri: str,
    *,
    raise_on_http_error: bool = True,
    **kwargs: Any,  # noqa: ANN401
) -> bytes:
    """Download the data from the uri.

    Args:
        uri (str): The uri to download. Expected format: HTTP/HTTPS URL.
        raise_on_http_error (bool): Whether to raise an exception on HTTP
            error status codes (4xx/5xx).
        **kwargs: Additional keyword arguments passed to niquests.get().

    Returns:
        bytes: The data from the uri.

    Raises:
        HTTPError: (from niquests) If ``raise_on_http_error`` is True and the
            response status code indicates an error.
        ValueError: If no content was found at the uri.
    """
    logger.info("Downloading from %s", uri)
    response: Response = niquests.get(uri, **kwargs)

    if raise_on_http_error:
        # Surface 4xx/5xx responses to the caller instead of silently
        # returning an error-page body as sitemap data.
        response.raise_for_status()

    logger.debug("Downloaded data from %s", uri)

    content: bytes | None = response.content
    if not content:
        logger.warning("No content found at %s", uri)
        msg: str = f"No content found at {uri}"
        raise ValueError(msg)

    return content
167
+
168
+
169
# MARK: Sitemap
class Sitemap(BaseData):
    """Representation of the <sitemap> element."""

    # Element names this entry exposes.
    fields: tuple[Literal["loc"], Literal["lastmod"]] = ("loc", "lastmod")

    def __init__(self, loc: str, lastmod: str | None = None) -> None:
        """Create a Sitemap entry.

        Args:
            loc: String, URL of the page.
            lastmod: str | None, The date of last modification of the file.
        """
        self.loc = loc
        self.lastmod = lastmod

    def __str__(self) -> str:
        """Return the sitemap's URL.

        Returns:
            The URL of the page.

        Raises:
            ValueError: If loc is None.
        """
        location = self.loc
        if location is None:
            error = "loc cannot be None"
            raise ValueError(error)
        return location

    def __repr__(self) -> str:
        """Return a debug representation of this Sitemap."""
        return f"<Sitemap {self.loc}>"
207
+
208
+
209
# MARK: Url
class Url(BaseData):
    """Representation of the <url> element.

    Raises:
        ValueError: If `changefreq` is not an allowed value.
        ValueError: If `priority` is not between 0.0 and 1.0.
    """

    # <url> sub-elements this entry exposes.
    fields: Fields = ("loc", "lastmod", "changefreq", "priority")
    # Allowed <changefreq> values per the sitemap protocol.
    valid_freqs: ValidFreqs = (
        "always",
        "hourly",
        "daily",
        "weekly",
        "monthly",
        "yearly",
        "never",
    )

    def __init__(
        self,
        loc: str | None,
        lastmod: str | None = None,
        changefreq: str | None = None,
        priority: str | float | None = None,
    ) -> None:
        """Create a Url entry.

        Args:
            loc: Location URL of the page.
            lastmod: Date of last modification.
            changefreq: How frequently the page is likely to change.
            priority: Priority of this URL relative to other URLs.
        """
        self.loc = loc
        self.lastmod = lastmod
        self.changefreq = changefreq
        # xmltodict yields strings; normalise to float before validation.
        self.priority = None if priority is None else float(priority)

    @property
    def changefreq(self) -> Freqs | None:
        """How frequently the page is likely to change, or None if unset."""
        return self._changefreq

    @changefreq.setter
    def changefreq(self, frequency: str | None) -> None:
        """Validate and store the change frequency.

        Args:
            frequency: One of ``Url.valid_freqs``, or None.

        Raises:
            ValueError: If ``frequency`` is not an allowed value.
        """
        if frequency is not None and frequency not in Url.valid_freqs:
            error: str = f"'{frequency}' is not an allowed value: {Url.valid_freqs}"
            raise ValueError(error)
        self._changefreq: Freqs | None = frequency

    @property
    def priority(self) -> float | None:
        """Priority of this URL relative to other URLs, or None if unset."""
        return self._priority

    @priority.setter
    def priority(self, priority: float | None) -> None:
        """Validate and store the priority.

        Args:
            priority: Priority value, or None.

        Raises:
            ValueError: If ``priority`` is outside the range 0.0-1.0.
        """
        if priority is not None:
            lower_bound = 0.0
            upper_bound = 1.0
            if priority < lower_bound or priority > upper_bound:
                error: str = f"'{priority}' is not between 0.0 and 1.0"
                raise ValueError(error)

        self._priority: float | None = priority

    def __str__(self) -> str:
        """Return the URL, or an empty string when loc is unset."""
        return self.loc or ""

    def __repr__(self) -> str:
        """Return a debug representation of this Url."""
        return f"Url(loc={self.loc}, lastmod={self.lastmod}, changefreq={self.changefreq}, priority={self.priority})"
308
+
309
+
310
# MARK: UrlSet
class UrlSet:
    r"""Represents a <urlset\> element.

    Wraps the xmltodict output for a <urlset> element and yields one Url
    instance per contained <url> entry when iterated.

    Example usage:
        ```python
        # Dict shape as produced by xmltodict for a <urlset> element.
        urlset_data = {
            "url": [
                {
                    "loc": "https://example.com/",
                    "lastmod": "2024-01-01T00:00:00Z",
                    "changefreq": "daily",
                    "priority": "0.8",
                },
                {
                    "loc": "https://example.com/about",
                    "lastmod": "2024-01-02T00:00:00Z",
                    "changefreq": "weekly",
                    "priority": "0.5",
                },
            ]
        }
        for url in UrlSet(urlset_data):
            print(url)
        ```
    """

    # <url> sub-elements that are copied into Url instances.
    allowed_fields: typing.ClassVar[tuple[str, ...]] = (
        "loc",
        "lastmod",
        "changefreq",
        "priority",
    )

    def __init__(self, urlset_data: dict[str, Any]) -> None:
        r"""Store the parsed <urlset\> data for later iteration."""
        self.urlset_data: dict[str, Any] = urlset_data

    @staticmethod
    def url_from_dict(url_dict: dict[str, Any]) -> Url:
        """Creates a Url instance from a dict representing a <url> element.

        Args:
            url_dict: A dict as returned by xmltodict for a <url> element.

        Returns:
            Url: A Url instance populated from the provided dict.
        """
        logger.debug("url_from_dict %s", url_dict)
        # Only forward fields that are present and non-None so Url's
        # defaults apply to everything else.
        kwargs: dict[str, str | None] = {
            field: url_dict[field]
            for field in UrlSet.allowed_fields
            if url_dict.get(field) is not None
        }
        logger.debug("url_data %s", kwargs)
        return Url(**kwargs)

    @staticmethod
    def urls_from_url_set_data(
        url_set_data: dict[str, Any],
    ) -> Generator[Url, typing.Any]:
        r"""Generate Url instances from xmltodict output for a <urlset\>.

        Args:
            url_set_data: Parsed xmltodict output for the <urlset\> element.

        Yields:
            Url: A Url instance for each <url\> entry.
        """
        logger.debug("urls_from_url_set_data %s", url_set_data)

        entries: list[dict[str, Any]] | dict[str, Any] = url_set_data.get("url", [])
        # xmltodict emits a bare dict when only one <url> is present.
        if isinstance(entries, dict):
            entries = [entries]

        for entry in entries:
            yield UrlSet.url_from_dict(entry)

    def __iter__(self) -> Iterator[Url]:
        """Iterate over the Url entries of this <urlset>.

        Returns:
            Iterator yielding Url instances.
        """
        return UrlSet.urls_from_url_set_data(self.urlset_data)
402
+
403
+
404
# MARK: SitemapIndex
class SitemapIndex:
    """Represents a <sitemapindex> element."""

    def __init__(self, index_data: dict[str, Any]) -> None:
        """Initialize the SitemapIndex instance with the parsed <sitemapindex> data."""
        self.index_data: dict[str, Any] = index_data

    @staticmethod
    def sitemap_from_dict(sitemap_dict: dict[str, Any]) -> Sitemap:
        """Creates a Sitemap instance from a dict representing a <sitemap> element.

        Args:
            sitemap_dict: A dict as returned by xmltodict for a <sitemap> element.

        Returns:
            Sitemap: A Sitemap instance populated from the provided dict.
        """
        sitemap_data: dict[str, str] = {}
        for fld in ("loc", "lastmod"):
            value = sitemap_dict.get(fld)
            if value is not None:
                sitemap_data[fld] = value

        # BUG FIX: the message previously used a str.format-style "{}"
        # placeholder with %-style logging args, so the data was never
        # interpolated into the log record.
        logger.debug("Returning sitemap object with data: %s", sitemap_data)
        return Sitemap(**sitemap_data)

    @staticmethod
    def sitemaps_from_index_data(index_data: dict[str, Any]) -> Generator[Sitemap, Any]:
        """Generate Sitemap instances from xmltodict output for a <sitemapindex>.

        Args:
            index_data: Parsed xmltodict output for the <sitemapindex> element.

        Yields:
            Sitemap: A Sitemap instance for each <sitemap> entry.
        """
        logger.debug("Generating sitemaps from %s", index_data)

        sitemap_items: list[dict[str, Any]] | dict[str, Any] = index_data.get(
            "sitemap",
            [],
        )
        # xmltodict emits a bare dict when only one <sitemap> is present.
        if isinstance(sitemap_items, dict):
            sitemap_items = [sitemap_items]

        for sitemap_dict in sitemap_items:
            yield SitemapIndex.sitemap_from_dict(sitemap_dict)

    def __iter__(self) -> Iterator[Sitemap]:
        """Generator for Sitemap instances from a <sitemapindex> element.

        Returns:
            Iterator yielding Sitemap instances.
        """
        return SitemapIndex.sitemaps_from_index_data(self.index_data)

    def __str__(self) -> str:
        """Return a debug-friendly string showing the raw index data."""
        return f"<SitemapIndex: {self.index_data}>"
467
+
468
+
469
+ # MARK: SiteMapParser
470
class SiteMapParser:
    """Parses a sitemap or sitemap index and returns the appropriate object."""

    def __init__(
        self,
        source: str,
        *,
        is_data_string: bool = False,
    ) -> None:
        """Initialize the SiteMapParser instance with the URI.

        The source can be a URL or a raw XML string. The parser will determine
        whether to download the data or use the provided string.

        Args:
            source: The URL of the sitemap or raw XML string.
            is_data_string: Whether the source is a raw XML string or not.
        """
        self.source: str = source
        self.is_sitemap_index: bool = False
        self._sitemaps: SitemapIndex | None = None
        self._url_set: UrlSet | None = None
        self._parsed_dict: dict[str, Any] | None = None
        self._is_data_string: bool = is_data_string
        self._xml_bytes: bytes | None = None
        self._initialize()

    def _initialize(self) -> None:
        """Fetch (or accept) the XML, parse it, and build the result object."""
        # Either use the raw XML string directly or download it from the URL;
        # both paths produce the raw bytes we keep for to_dict().
        data: bytes = (
            self.source.encode("utf-8")
            if self._is_data_string
            else download_uri_data(uri=self.source)
        )
        self._xml_bytes = data

        # Use xmltodict to parse sitemap content into a dictionary.
        # force_list guarantees <url>/<sitemap> always arrive as lists, even
        # when the document contains a single entry, so iteration downstream
        # never has to special-case a lone dict.
        parsed: dict[str, Any] = xmltodict.parse(data, force_list=("url", "sitemap"))
        self._parsed_dict = parsed

        # The root element decides the document type. Strip any namespace
        # prefix (e.g. "ns0:sitemapindex" -> "sitemapindex") before comparing.
        root_key = next(iter(parsed))
        root_tag = root_key.split(":")[-1]
        self.is_sitemap_index = root_tag == "sitemapindex"

        if self.is_sitemap_index:
            self._sitemaps = SitemapIndex(index_data=parsed[root_key])
        else:
            self._url_set = UrlSet(urlset_data=parsed[root_key])

    def get_sitemaps(self) -> SitemapIndex:
        """Retrieve the sitemaps.

        Can check if 'has_sitemaps()' returns True to determine
        if this should be used without calling it

        Returns:
            SitemapIndex: The sitemaps as a SitemapIndex instance

        Raises:
            KeyError: If the root is not a <sitemapindex>
        """
        if not self.has_sitemaps():
            error_msg = "Method called when root is not a <sitemapindex>"
            logger.critical(error_msg)
            raise KeyError(error_msg)

        # Defensive: _initialize always sets _sitemaps when is_sitemap_index
        # is True, so this only fires if internal state was corrupted.
        if self._sitemaps is None:
            msg = "Sitemaps are not available"
            raise KeyError(msg)

        return self._sitemaps

    def get_urls(self) -> UrlSet:
        """Retrieve the URLs from the sitemap.

        Returns:
            UrlSet: The URLs as a UrlSet instance.

        Raises:
            KeyError: If the root is not a <urlset>.
        """
        if not self.has_urls():
            # When has_urls() is False the root must be a <sitemapindex>
            # (the two are mutually exclusive), so point the caller at the
            # right accessor. Log the same message that is raised.
            error_msg = "Method called when root is a <sitemapindex>. Use 'get_sitemaps()' instead"
            logger.critical(error_msg)
            raise KeyError(error_msg)

        # Defensive: _initialize always sets _url_set when the root is a
        # <urlset>, so this only fires if internal state was corrupted.
        if self._url_set is None:
            msg = "URLs are not available"
            raise KeyError(msg)

        return self._url_set

    def has_sitemaps(self) -> bool:
        """Determine if the URL's data contained sitemaps.

        A sitemap can contain other sitemaps. For example: <https://www.webhallen.com/sitemap.xml>

        Returns:
            Boolean
        """
        return self.is_sitemap_index

    def has_urls(self) -> bool:
        """Determine if the URL's data contained urls.

        Returns:
            Boolean
        """
        return not self.is_sitemap_index

    def to_dict(
        self,
        *,
        process_namespaces: bool = False,
        **xmltodict_kwargs: object,
    ) -> dict[str, Any]:
        """Parse the underlying XML input into a Python dict.

        Args:
            process_namespaces (bool): Expand namespaces into the returned dict.
            **xmltodict_kwargs: Additional keyword arguments passed to :func:`xmltodict.parse`.

        Returns:
            dict[str, Any]: The parsed XML as a Python dictionary.

        Raises:
            RuntimeError: If the parser does not have XML data available.
        """
        xml_bytes: bytes | None = self._xml_bytes
        if xml_bytes is None:
            msg = "No XML data available to parse"
            raise RuntimeError(msg)

        # If we have already parsed the XML and the caller is using default
        # options, just return the cached parse.
        if (
            not process_namespaces
            and not xmltodict_kwargs
            and self._parsed_dict is not None
        ):
            return self._parsed_dict

        kwargs: dict[str, Any] = {"process_namespaces": process_namespaces}
        kwargs.update(xmltodict_kwargs)  # type: ignore[arg-type]

        return xmltodict.parse(xml_bytes, **kwargs)

    def __str__(self) -> str:
        """String representation of the SiteMapParser instance.

        Returns:
            str
        """
        return str(self._sitemaps if self.has_sitemaps() else self._url_set)
631
+
632
+
633
+ # MARK: JSONExporter
634
class JSONExporter:
    """Export site map data to JSON format."""

    def __init__(self, data: SiteMapParser) -> None:
        """Initializes the JSONExporter instance with the site map data."""
        self.data: SiteMapParser = data

    @staticmethod
    def _collate(
        fields: SitemapFields | UrlFields,
        row_data: SitemapIndex | UrlSet,
    ) -> list[dict[str, Any]]:
        """Turn an iterable of Sitemap/Url objects into plain dictionaries.

        Args:
            fields (SitemapFields | UrlFields): Attribute names copied onto
                each output dictionary.
            row_data (SitemapIndex | UrlSet): An iterable yielding Sitemap or
                Url objects.

        Returns:
            list: One dictionary per object; datetime values are rendered as
            ISO-8601 strings so the result is JSON-serialisable.
        """

        def _jsonable(value: Fields) -> Any:
            # json cannot encode datetime directly; other values pass through.
            return value.isoformat() if isinstance(value, datetime) else value

        return [
            {name: _jsonable(getattr(item, name)) for name in fields}
            for item in row_data
        ]

    def export_sitemaps(self) -> str:
        """Export site map data to JSON format.

        Returns:
            str: JSON data as a string
        """
        fallback: SitemapFields = ("loc", "lastmod")
        sitemap_fields: SitemapFields = getattr(Sitemap, "fields", fallback)
        rows = self._collate(sitemap_fields, self.data.get_sitemaps())
        return dumps(rows)

    def export_urls(self) -> str:
        """Export site map data to JSON format.

        Returns:
            str: JSON data as a string
        """
        fallback: UrlFields = ("loc", "lastmod", "changefreq", "priority")
        url_fields: UrlFields = getattr(Url, "fields", fallback)
        rows = self._collate(url_fields, self.data.get_urls())
        return dumps(rows)
File without changes