py-sitemap-parser 2.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -0,0 +1,193 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: py-sitemap-parser
|
|
3
|
+
Version: 2.0.0
|
|
4
|
+
Summary: Simple sitemap parser for Python
|
|
5
|
+
Keywords: sitemap,xml,parser,seo,sitemap-parser,sitemap-index
|
|
6
|
+
Author: Joakim Helleśen
|
|
7
|
+
Author-email: Joakim Helleśen <tlovinator@gmail.com>
|
|
8
|
+
License: MIT
|
|
9
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
12
|
+
Classifier: Operating System :: OS Independent
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
21
|
+
Classifier: Typing :: Typed
|
|
22
|
+
Requires-Dist: lxml
|
|
23
|
+
Requires-Dist: niquests
|
|
24
|
+
Requires-Dist: python-dateutil
|
|
25
|
+
Requires-Dist: types-xmltodict
|
|
26
|
+
Requires-Dist: xmltodict
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Project-URL: Homepage, https://github.com/M4hbod/sitemap-parser
|
|
29
|
+
Project-URL: Repository, https://github.com/M4hbod/sitemap-parser
|
|
30
|
+
Project-URL: Issues, https://github.com/M4hbod/sitemap-parser/issues
|
|
31
|
+
Project-URL: Documentation, https://github.com/M4hbod/sitemap-parser#readme
|
|
32
|
+
Description-Content-Type: text/markdown
|
|
33
|
+
|
|
34
|
+
# Sitemap Parser
|
|
35
|
+
|
|
36
|
+
<p align="center">
|
|
37
|
+
<img src="https://github.com/M4hbod/sitemap-parser/blob/master/.github/logo.png?raw=true" title="Robot searching for sitemaps" alt="Robot searching for sitemaps" width="300" height="300" />
|
|
38
|
+
</p>
|
|
39
|
+
|
|
40
|
+
This is a Python library designed to parse XML sitemaps and sitemap index files from a given URL. It supports both standard XML sitemaps (which contain URLs) and sitemap index files (which contain links to other sitemaps). This tool is useful for extracting data such as URLs and modification dates from website sitemaps.
|
|
41
|
+
|
|
42
|
+
## Acknowledgments
|
|
43
|
+
|
|
44
|
+
This is a fork of [Dave O'Connor](https://github.com/daveoconnor)'s [site-map-parser](https://github.com/daveoconnor/site-map-parser). I couldn't have done this without his original work.
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
## Installation
|
|
48
|
+
|
|
49
|
+
```sh
|
|
50
|
+
uv add py-sitemap-parser
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Usage
|
|
54
|
+
|
|
55
|
+
The library provides a SiteMapParser class that can be used to parse sitemaps and sitemap indexes. You can pass a URL or raw XML data to the parser to extract the URLs or links to other sitemaps.
|
|
56
|
+
|
|
57
|
+
### Parsing a Sitemap from a URL
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import logging
|
|
61
|
+
from typing import TYPE_CHECKING
|
|
62
|
+
|
|
63
|
+
from sitemap_parser import SiteMapParser
|
|
64
|
+
|
|
65
|
+
if TYPE_CHECKING:
|
|
66
|
+
from sitemap_parser import SitemapIndex
|
|
67
|
+
from sitemap_parser import UrlSet
|
|
68
|
+
|
|
69
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
70
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
# url = "https://ttvdrops.lovinator.space/sitemap.xml" # Sitemap index
|
|
74
|
+
url = "https://ttvdrops.lovinator.space/sitemap-static.xml" # Sitemap with URLs
|
|
75
|
+
parser = SiteMapParser(source=url)
|
|
76
|
+
|
|
77
|
+
if parser.has_sitemaps():
|
|
78
|
+
sitemaps: SitemapIndex = parser.get_sitemaps()
|
|
79
|
+
for sitemap in sitemaps:
|
|
80
|
+
logger.info(sitemap)
|
|
81
|
+
|
|
82
|
+
elif parser.has_urls():
|
|
83
|
+
urls: UrlSet = parser.get_urls()
|
|
84
|
+
for url in urls:
|
|
85
|
+
logger.info(url)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### Parsing a Raw XML String
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from sitemap_parser import SiteMapParser, UrlSet
|
|
92
|
+
|
|
93
|
+
xml_data = """
|
|
94
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
95
|
+
<url>
|
|
96
|
+
<loc>https://example.com/</loc>
|
|
97
|
+
<lastmod>2023-09-27</lastmod>
|
|
98
|
+
<changefreq>daily</changefreq>
|
|
99
|
+
<priority>1.0</priority>
|
|
100
|
+
</url>
|
|
101
|
+
<url>
|
|
102
|
+
<loc>https://example.com/about</loc>
|
|
103
|
+
<lastmod>2023-09-27</lastmod>
|
|
104
|
+
<changefreq>daily</changefreq>
|
|
105
|
+
<priority>0.8</priority>
|
|
106
|
+
</url>
|
|
107
|
+
</urlset>
|
|
108
|
+
"""
|
|
109
|
+
parser = SiteMapParser(source=xml_data, is_data_string=True)
|
|
110
|
+
urls: UrlSet = parser.get_urls()
|
|
111
|
+
for url in urls:
|
|
112
|
+
print(url)
|
|
113
|
+
|
|
114
|
+
# Output:
|
|
115
|
+
# - https://example.com/
|
|
116
|
+
# - https://example.com/about
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
### Exporting Sitemap Data to JSON
|
|
120
|
+
|
|
121
|
+
You can export the parsed sitemap data to a JSON file using the JSONExporter class.
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
import json
|
|
125
|
+
import logging
|
|
126
|
+
|
|
127
|
+
from sitemap_parser import JSONExporter
|
|
128
|
+
from sitemap_parser import SiteMapParser
|
|
129
|
+
|
|
130
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
131
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
132
|
+
|
|
133
|
+
# Sitemap with URLs to other sitemaps
|
|
134
|
+
parser = SiteMapParser(source="https://ttvdrops.lovinator.space/sitemap.xml")
|
|
135
|
+
|
|
136
|
+
if parser.has_sitemaps():
|
|
137
|
+
json_data: str = JSONExporter(data=parser).export_sitemaps()
|
|
138
|
+
json_data = json.loads(json_data)
|
|
139
|
+
logger.info("Exported sitemaps: %s", json_data)
|
|
140
|
+
|
|
141
|
+
logger.info("----" * 10)
|
|
142
|
+
|
|
143
|
+
# Sitemap with "real" URLs
|
|
144
|
+
parser2 = SiteMapParser(
|
|
145
|
+
source="https://ttvdrops.lovinator.space/sitemap-static.xml",
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
if parser2.has_urls():
|
|
149
|
+
json_data: str = JSONExporter(data=parser2).export_urls()
|
|
150
|
+
json_data = json.loads(json_data)
|
|
151
|
+
logger.info("Exported URLs: %s", json_data)
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
### Converting Sitemap XML to a Python dict
|
|
155
|
+
|
|
156
|
+
If you'd like to work with the parsed sitemap as a plain Python dictionary, you can use `SiteMapParser.to_dict()`.
|
|
157
|
+
|
|
158
|
+
```python
|
|
159
|
+
from sitemap_parser import SiteMapParser
|
|
160
|
+
|
|
161
|
+
xml = """
|
|
162
|
+
<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">
|
|
163
|
+
<url>
|
|
164
|
+
<loc>https://example.com/</loc>
|
|
165
|
+
</url>
|
|
166
|
+
</urlset>
|
|
167
|
+
"""
|
|
168
|
+
|
|
169
|
+
parser = SiteMapParser(source=xml, is_data_string=True)
|
|
170
|
+
parsed = parser.to_dict()
|
|
171
|
+
|
|
172
|
+
# xmltodict represents repeated elements as lists
|
|
173
|
+
print(parsed["urlset"]["url"][0]["loc"])
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
You can also enable namespace processing for expanded namespace keys:
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
parsed = parser.to_dict(process_namespaces=True)
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
## Disabling Logging
|
|
183
|
+
|
|
184
|
+
If you want to disable logging, you can adjust the logging level to logging.CRITICAL or higher. This will suppress all log messages below the CRITICAL level.
|
|
185
|
+
|
|
186
|
+
Here's an example of how to do this:
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
import logging
|
|
190
|
+
|
|
191
|
+
# Set the logging level to CRITICAL to disable logging
|
|
192
|
+
logging.getLogger("sitemap_parser").setLevel(logging.CRITICAL)
|
|
193
|
+
```
|
|
@@ -0,0 +1,160 @@
|
|
|
1
|
+
# Sitemap Parser
|
|
2
|
+
|
|
3
|
+
<p align="center">
|
|
4
|
+
<img src="https://github.com/M4hbod/sitemap-parser/blob/master/.github/logo.png?raw=true" title="Robot searching for sitemaps" alt="Robot searching for sitemaps" width="300" height="300" />
|
|
5
|
+
</p>
|
|
6
|
+
|
|
7
|
+
This is a Python library designed to parse XML sitemaps and sitemap index files from a given URL. It supports both standard XML sitemaps (which contain URLs) and sitemap index files (which contain links to other sitemaps). This tool is useful for extracting data such as URLs and modification dates from website sitemaps.
|
|
8
|
+
|
|
9
|
+
## Acknowledgments
|
|
10
|
+
|
|
11
|
+
This is a fork of [Dave O'Connor](https://github.com/daveoconnor)'s [site-map-parser](https://github.com/daveoconnor/site-map-parser). I couldn't have done this without his original work.
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
## Installation
|
|
15
|
+
|
|
16
|
+
```sh
|
|
17
|
+
uv add py-sitemap-parser
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
## Usage
|
|
21
|
+
|
|
22
|
+
The library provides a SiteMapParser class that can be used to parse sitemaps and sitemap indexes. You can pass a URL or raw XML data to the parser to extract the URLs or links to other sitemaps.
|
|
23
|
+
|
|
24
|
+
### Parsing a Sitemap from a URL
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
import logging
|
|
28
|
+
from typing import TYPE_CHECKING
|
|
29
|
+
|
|
30
|
+
from sitemap_parser import SiteMapParser
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from sitemap_parser import SitemapIndex
|
|
34
|
+
from sitemap_parser import UrlSet
|
|
35
|
+
|
|
36
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
37
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
# url = "https://ttvdrops.lovinator.space/sitemap.xml" # Sitemap index
|
|
41
|
+
url = "https://ttvdrops.lovinator.space/sitemap-static.xml" # Sitemap with URLs
|
|
42
|
+
parser = SiteMapParser(source=url)
|
|
43
|
+
|
|
44
|
+
if parser.has_sitemaps():
|
|
45
|
+
sitemaps: SitemapIndex = parser.get_sitemaps()
|
|
46
|
+
for sitemap in sitemaps:
|
|
47
|
+
logger.info(sitemap)
|
|
48
|
+
|
|
49
|
+
elif parser.has_urls():
|
|
50
|
+
urls: UrlSet = parser.get_urls()
|
|
51
|
+
for url in urls:
|
|
52
|
+
logger.info(url)
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Parsing a Raw XML String
|
|
56
|
+
|
|
57
|
+
```python
|
|
58
|
+
from sitemap_parser import SiteMapParser, UrlSet
|
|
59
|
+
|
|
60
|
+
xml_data = """
|
|
61
|
+
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
|
|
62
|
+
<url>
|
|
63
|
+
<loc>https://example.com/</loc>
|
|
64
|
+
<lastmod>2023-09-27</lastmod>
|
|
65
|
+
<changefreq>daily</changefreq>
|
|
66
|
+
<priority>1.0</priority>
|
|
67
|
+
</url>
|
|
68
|
+
<url>
|
|
69
|
+
<loc>https://example.com/about</loc>
|
|
70
|
+
<lastmod>2023-09-27</lastmod>
|
|
71
|
+
<changefreq>daily</changefreq>
|
|
72
|
+
<priority>0.8</priority>
|
|
73
|
+
</url>
|
|
74
|
+
</urlset>
|
|
75
|
+
"""
|
|
76
|
+
parser = SiteMapParser(source=xml_data, is_data_string=True)
|
|
77
|
+
urls: UrlSet = parser.get_urls()
|
|
78
|
+
for url in urls:
|
|
79
|
+
print(url)
|
|
80
|
+
|
|
81
|
+
# Output:
|
|
82
|
+
# - https://example.com/
|
|
83
|
+
# - https://example.com/about
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
### Exporting Sitemap Data to JSON
|
|
87
|
+
|
|
88
|
+
You can export the parsed sitemap data to a JSON file using the JSONExporter class.
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
import json
|
|
92
|
+
import logging
|
|
93
|
+
|
|
94
|
+
from sitemap_parser import JSONExporter
|
|
95
|
+
from sitemap_parser import SiteMapParser
|
|
96
|
+
|
|
97
|
+
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
|
98
|
+
logger: logging.Logger = logging.getLogger(__name__)
|
|
99
|
+
|
|
100
|
+
# Sitemap with URLs to other sitemaps
|
|
101
|
+
parser = SiteMapParser(source="https://ttvdrops.lovinator.space/sitemap.xml")
|
|
102
|
+
|
|
103
|
+
if parser.has_sitemaps():
|
|
104
|
+
json_data: str = JSONExporter(data=parser).export_sitemaps()
|
|
105
|
+
json_data = json.loads(json_data)
|
|
106
|
+
logger.info("Exported sitemaps: %s", json_data)
|
|
107
|
+
|
|
108
|
+
logger.info("----" * 10)
|
|
109
|
+
|
|
110
|
+
# Sitemap with "real" URLs
|
|
111
|
+
parser2 = SiteMapParser(
|
|
112
|
+
source="https://ttvdrops.lovinator.space/sitemap-static.xml",
|
|
113
|
+
)
|
|
114
|
+
|
|
115
|
+
if parser2.has_urls():
|
|
116
|
+
json_data: str = JSONExporter(data=parser2).export_urls()
|
|
117
|
+
json_data = json.loads(json_data)
|
|
118
|
+
logger.info("Exported URLs: %s", json_data)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Converting Sitemap XML to a Python dict
|
|
122
|
+
|
|
123
|
+
If you'd like to work with the parsed sitemap as a plain Python dictionary, you can use `SiteMapParser.to_dict()`.
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
from sitemap_parser import SiteMapParser
|
|
127
|
+
|
|
128
|
+
xml = """
|
|
129
|
+
<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">
|
|
130
|
+
<url>
|
|
131
|
+
<loc>https://example.com/</loc>
|
|
132
|
+
</url>
|
|
133
|
+
</urlset>
|
|
134
|
+
"""
|
|
135
|
+
|
|
136
|
+
parser = SiteMapParser(source=xml, is_data_string=True)
|
|
137
|
+
parsed = parser.to_dict()
|
|
138
|
+
|
|
139
|
+
# xmltodict represents repeated elements as lists
|
|
140
|
+
print(parsed["urlset"]["url"][0]["loc"])
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
You can also enable namespace processing for expanded namespace keys:
|
|
144
|
+
|
|
145
|
+
```python
|
|
146
|
+
parsed = parser.to_dict(process_namespaces=True)
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Disabling Logging
|
|
150
|
+
|
|
151
|
+
If you want to disable logging, you can adjust the logging level to logging.CRITICAL or higher. This will suppress all log messages below the CRITICAL level.
|
|
152
|
+
|
|
153
|
+
Here's an example of how to do this:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import logging
|
|
157
|
+
|
|
158
|
+
# Set the logging level to CRITICAL to disable logging
|
|
159
|
+
logging.getLogger("sitemap_parser").setLevel(logging.CRITICAL)
|
|
160
|
+
```
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "py-sitemap-parser"
|
|
3
|
+
version = "2.0.0"
|
|
4
|
+
description = "Simple sitemap parser for Python"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
authors = [{ name = "Joakim Helleśen", email = "tlovinator@gmail.com" }]
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
requires-python = ">=3.9"
|
|
9
|
+
keywords = ["sitemap", "xml", "parser", "seo", "sitemap-parser", "sitemap-index"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 5 - Production/Stable",
|
|
12
|
+
"Intended Audience :: Developers",
|
|
13
|
+
"License :: OSI Approved :: MIT License",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.9",
|
|
17
|
+
"Programming Language :: Python :: 3.10",
|
|
18
|
+
"Programming Language :: Python :: 3.11",
|
|
19
|
+
"Programming Language :: Python :: 3.12",
|
|
20
|
+
"Programming Language :: Python :: 3.13",
|
|
21
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
22
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
23
|
+
"Typing :: Typed",
|
|
24
|
+
]
|
|
25
|
+
dependencies = [
|
|
26
|
+
"lxml",
|
|
27
|
+
"niquests",
|
|
28
|
+
"python-dateutil",
|
|
29
|
+
"types-xmltodict",
|
|
30
|
+
"xmltodict",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.urls]
|
|
34
|
+
Homepage = "https://github.com/M4hbod/sitemap-parser"
|
|
35
|
+
Repository = "https://github.com/M4hbod/sitemap-parser"
|
|
36
|
+
Issues = "https://github.com/M4hbod/sitemap-parser/issues"
|
|
37
|
+
Documentation = "https://github.com/M4hbod/sitemap-parser#readme"
|
|
38
|
+
|
|
39
|
+
[build-system]
|
|
40
|
+
requires = ["uv_build>=0.10.11,<0.11.0"]
|
|
41
|
+
build-backend = "uv_build"
|
|
42
|
+
|
|
43
|
+
[dependency-groups]
|
|
44
|
+
dev = ["lxml-stubs", "pytest", "pytest-cov"]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
fix = true
|
|
49
|
+
preview = true
|
|
50
|
+
unsafe-fixes = true
|
|
51
|
+
|
|
52
|
+
format.docstring-code-format = true
|
|
53
|
+
format.preview = true
|
|
54
|
+
|
|
55
|
+
lint.future-annotations = true
|
|
56
|
+
lint.isort.force-single-line = true
|
|
57
|
+
lint.pycodestyle.ignore-overlong-task-comments = true
|
|
58
|
+
lint.pydocstyle.convention = "google"
|
|
59
|
+
lint.select = ["ALL"]
|
|
60
|
+
|
|
61
|
+
# Don't automatically remove unused variables
|
|
62
|
+
lint.unfixable = ["F841"]
|
|
63
|
+
|
|
64
|
+
lint.ignore = [
|
|
65
|
+
"ANN002", # Checks that function *args arguments have type annotations.
|
|
66
|
+
"ANN003", # Checks that function **kwargs arguments have type annotations.
|
|
67
|
+
"C901", # Checks for functions with a high McCabe complexity.
|
|
68
|
+
"CPY001", # Checks for the absence of copyright notices within Python files.
|
|
69
|
+
"D100", # Checks for undocumented public module definitions.
|
|
70
|
+
"D104", # Checks for undocumented public package definitions.
|
|
71
|
+
"D105", # Checks for undocumented magic method definitions.
|
|
72
|
+
"D106", # Checks for undocumented public class definitions, for nested classes.
|
|
73
|
+
"E501", # Checks for lines that exceed the specified maximum character length.
|
|
74
|
+
"ERA001", # Checks for commented-out Python code.
|
|
75
|
+
"FIX002", # Checks for "TODO" comments.
|
|
76
|
+
"PLR0911", # Checks for functions or methods with too many return statements.
|
|
77
|
+
"PLR0912", # Checks for functions or methods with too many branches, including (nested) if, elif, and else branches, for loops, try-except clauses, and match and case statements.
|
|
78
|
+
"PLR6301", # Checks for the presence of unused self parameter in methods definitions.
|
|
79
|
+
"RUF012", # Checks for mutable default values in class attributes.
|
|
80
|
+
"RUF067", # Detects the presence of code in __init__.py files.
|
|
81
|
+
"S405", # Checks for imports of the xml.etree.cElementTree and xml.etree.ElementTree modules
|
|
82
|
+
|
|
83
|
+
# Conflicting lint rules when using Ruff's formatter
|
|
84
|
+
# https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
|
|
85
|
+
"COM812", # Checks for the absence of trailing commas.
|
|
86
|
+
"COM819", # Checks for the presence of prohibited trailing commas.
|
|
87
|
+
"D206", # Checks for docstrings that are indented with tabs.
|
|
88
|
+
"D300", # Checks for docstrings that use '''triple single quotes''' instead of """triple double quotes""".
|
|
89
|
+
"E111", # Checks for indentation with a non-multiple of 4 spaces.
|
|
90
|
+
"E114", # Checks for indentation of comments with a non-multiple of 4 spaces.
|
|
91
|
+
"E117", # Checks for over-indented code.
|
|
92
|
+
"ISC001", # Checks for implicitly concatenated strings on a single line.
|
|
93
|
+
"ISC002", # Checks for implicitly concatenated strings that span multiple lines.
|
|
94
|
+
"Q000", # Checks for inline strings that use single quotes or double quotes, depending on the value of the lint.flake8-quotes.inline-quotes option.
|
|
95
|
+
"Q001", # Checks for multiline strings that use single quotes or double quotes, depending on the value of the lint.flake8-quotes.multiline-quotes setting.
|
|
96
|
+
"Q002", # Checks for docstrings that use single quotes or double quotes, depending on the value of the lint.flake8-quotes.docstring-quotes setting.
|
|
97
|
+
"Q003", # Checks for strings that include escaped quotes, and suggests changing the quote style to avoid the need to escape them.
|
|
98
|
+
"W191", # Checks for indentation that uses tabs.
|
|
99
|
+
]
|
|
100
|
+
|
|
101
|
+
[tool.ruff.lint.per-file-ignores]
|
|
102
|
+
"**/tests/**" = [
|
|
103
|
+
"ARG",
|
|
104
|
+
"FBT",
|
|
105
|
+
"PLR0904",
|
|
106
|
+
"PLR2004",
|
|
107
|
+
"PLR6301",
|
|
108
|
+
"S101",
|
|
109
|
+
"S105",
|
|
110
|
+
"S106",
|
|
111
|
+
"S311",
|
|
112
|
+
"SLF001",
|
|
113
|
+
]
|
|
@@ -0,0 +1,688 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import re
|
|
5
|
+
import typing
|
|
6
|
+
from datetime import datetime
|
|
7
|
+
from json import dumps
|
|
8
|
+
from typing import Any
|
|
9
|
+
from typing import Literal
|
|
10
|
+
|
|
11
|
+
import niquests
|
|
12
|
+
import xmltodict
|
|
13
|
+
from dateutil import parser
|
|
14
|
+
|
|
15
|
+
if typing.TYPE_CHECKING:
|
|
16
|
+
from collections.abc import Generator
|
|
17
|
+
from collections.abc import Iterator
|
|
18
|
+
|
|
19
|
+
from niquests import Response
|
|
20
|
+
|
|
21
|
+
__all__: list[str] = [
|
|
22
|
+
"JSONExporter",
|
|
23
|
+
"SiteMapParser",
|
|
24
|
+
"Sitemap",
|
|
25
|
+
"SitemapIndex",
|
|
26
|
+
"Url",
|
|
27
|
+
"UrlSet",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
logger: logging.Logger = logging.getLogger("sitemap_parser")
|
|
31
|
+
|
|
32
|
+
type Freqs = Literal[
|
|
33
|
+
"always",
|
|
34
|
+
"hourly",
|
|
35
|
+
"daily",
|
|
36
|
+
"weekly",
|
|
37
|
+
"monthly",
|
|
38
|
+
"yearly",
|
|
39
|
+
"never",
|
|
40
|
+
]
|
|
41
|
+
type ValidFreqs = tuple[
|
|
42
|
+
Literal["always"],
|
|
43
|
+
Literal["hourly"],
|
|
44
|
+
Literal["daily"],
|
|
45
|
+
Literal["weekly"],
|
|
46
|
+
Literal["monthly"],
|
|
47
|
+
Literal["yearly"],
|
|
48
|
+
Literal["never"],
|
|
49
|
+
]
|
|
50
|
+
type Fields = tuple[
|
|
51
|
+
Literal["loc"],
|
|
52
|
+
Literal["lastmod"],
|
|
53
|
+
Literal["changefreq"],
|
|
54
|
+
Literal["priority"],
|
|
55
|
+
]
|
|
56
|
+
type UrlFields = tuple[
|
|
57
|
+
Literal["loc"],
|
|
58
|
+
Literal["lastmod"],
|
|
59
|
+
Literal["changefreq"],
|
|
60
|
+
Literal["priority"],
|
|
61
|
+
]
|
|
62
|
+
type SitemapFields = tuple[
|
|
63
|
+
Literal["loc"],
|
|
64
|
+
Literal["lastmod"],
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# MARK: BaseData
|
|
69
|
+
class BaseData:
|
|
70
|
+
"""Base class for sitemap data.
|
|
71
|
+
|
|
72
|
+
Provides common properties and methods for sitemap and sitemap index entries,
|
|
73
|
+
such as location (`loc`) and last modified time (`lastmod`).
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
def __init__(self) -> None:
|
|
77
|
+
self._lastmod: datetime | None = None
|
|
78
|
+
self._loc: str | None = None
|
|
79
|
+
|
|
80
|
+
@property
|
|
81
|
+
def lastmod(self) -> datetime | None:
|
|
82
|
+
"""Get the last modified datetime of the resource.
|
|
83
|
+
|
|
84
|
+
Returns:
|
|
85
|
+
datetime | None: The datetime when the resource was last modified, or None if not set.
|
|
86
|
+
"""
|
|
87
|
+
return self._lastmod
|
|
88
|
+
|
|
89
|
+
@lastmod.setter
|
|
90
|
+
def lastmod(self, value: str | None) -> None:
|
|
91
|
+
"""Set the last modified datetime of the resource.
|
|
92
|
+
|
|
93
|
+
Parses an ISO-8601 datetime string into a datetime object.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
value (str | None): An ISO-8601 formatted datetime string, or None.
|
|
97
|
+
"""
|
|
98
|
+
self._lastmod = parser.isoparse(value) if value is not None else None
|
|
99
|
+
|
|
100
|
+
@property
|
|
101
|
+
def loc(self) -> str | None:
|
|
102
|
+
"""Get the location URL of the resource.
|
|
103
|
+
|
|
104
|
+
Returns:
|
|
105
|
+
str | None: The URL of the resource.
|
|
106
|
+
"""
|
|
107
|
+
return self._loc
|
|
108
|
+
|
|
109
|
+
@loc.setter
|
|
110
|
+
def loc(self, value: str | None) -> None:
|
|
111
|
+
"""Set the location URL of the resource.
|
|
112
|
+
|
|
113
|
+
Validates that the provided value is a valid URL.
|
|
114
|
+
|
|
115
|
+
Args:
|
|
116
|
+
value (str | None): The URL to set.
|
|
117
|
+
|
|
118
|
+
Raises:
|
|
119
|
+
TypeError: If the value is not a string.
|
|
120
|
+
ValueError: If the value is not a valid URL.
|
|
121
|
+
"""
|
|
122
|
+
if not isinstance(value, str):
|
|
123
|
+
msg = "URL must be a string"
|
|
124
|
+
raise TypeError(msg)
|
|
125
|
+
|
|
126
|
+
if not re.match(r"http[s]?://", value):
|
|
127
|
+
msg: str = f"{value} is not a valid URL"
|
|
128
|
+
raise ValueError(msg)
|
|
129
|
+
|
|
130
|
+
self._loc = value
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def download_uri_data(
|
|
134
|
+
uri: str,
|
|
135
|
+
*,
|
|
136
|
+
raise_on_http_error: bool = True,
|
|
137
|
+
**kwargs: Any, # noqa: ANN401
|
|
138
|
+
) -> bytes:
|
|
139
|
+
"""Download the data from the uri.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
uri(str): The uri to download. Expected format: HTTP/HTTPS URL.
|
|
143
|
+
**kwargs: Additional keyword arguments passed to niquests.get().
|
|
144
|
+
raise_on_http_error (bool): Whether to raise an exception on HTTP errors.
|
|
145
|
+
|
|
146
|
+
Returns:
|
|
147
|
+
bytes: The data from the uri
|
|
148
|
+
|
|
149
|
+
Raises:
|
|
150
|
+
ValueError: If no content was found at the uri.
|
|
151
|
+
"""
|
|
152
|
+
logger.info("Downloading from %s", uri)
|
|
153
|
+
r: Response = niquests.get(uri, **kwargs)
|
|
154
|
+
|
|
155
|
+
if raise_on_http_error:
|
|
156
|
+
r.raise_for_status()
|
|
157
|
+
|
|
158
|
+
logger.debug("Downloaded data from %s", uri)
|
|
159
|
+
|
|
160
|
+
content: bytes | None = r.content
|
|
161
|
+
if not content:
|
|
162
|
+
logger.warning("No content found at %s", uri)
|
|
163
|
+
msg: str = f"No content found at {uri}"
|
|
164
|
+
raise ValueError(msg)
|
|
165
|
+
|
|
166
|
+
return content
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# MARK: Sitemap
|
|
170
|
+
class Sitemap(BaseData):
|
|
171
|
+
"""Representation of the <sitemap> element."""
|
|
172
|
+
|
|
173
|
+
fields: tuple[Literal["loc"], Literal["lastmod"]] = "loc", "lastmod"
|
|
174
|
+
|
|
175
|
+
def __init__(self, loc: str, lastmod: str | None = None) -> None:
|
|
176
|
+
"""Representation of the <sitemap> element.
|
|
177
|
+
|
|
178
|
+
Args:
|
|
179
|
+
loc: String, URL of the page.
|
|
180
|
+
lastmod: str | None, The date of last modification of the file.
|
|
181
|
+
"""
|
|
182
|
+
self.loc = loc
|
|
183
|
+
self.lastmod = lastmod
|
|
184
|
+
|
|
185
|
+
def __str__(self) -> str:
|
|
186
|
+
"""String representation of the Sitemap instance.
|
|
187
|
+
|
|
188
|
+
Returns:
|
|
189
|
+
The URL of the page.
|
|
190
|
+
|
|
191
|
+
Raises:
|
|
192
|
+
ValueError: If loc is None.
|
|
193
|
+
"""
|
|
194
|
+
if self.loc is None:
|
|
195
|
+
msg = "loc cannot be None"
|
|
196
|
+
raise ValueError(msg)
|
|
197
|
+
|
|
198
|
+
return self.loc
|
|
199
|
+
|
|
200
|
+
def __repr__(self) -> str:
|
|
201
|
+
"""String representation of the Sitemap instance.
|
|
202
|
+
|
|
203
|
+
Returns:
|
|
204
|
+
The URL of the page.
|
|
205
|
+
"""
|
|
206
|
+
return f"<Sitemap {self.loc}>"
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
# MARK: Url
|
|
210
|
+
class Url(BaseData):
|
|
211
|
+
"""Representation of the <url> element.
|
|
212
|
+
|
|
213
|
+
Args:
|
|
214
|
+
BaseData: Base class for all data classes
|
|
215
|
+
|
|
216
|
+
Raises:
|
|
217
|
+
ValueError: If `changefreq` is not an allowed value.
|
|
218
|
+
ValueError: If `priority` is not between 0.0 and 1.0.
|
|
219
|
+
"""
|
|
220
|
+
|
|
221
|
+
fields: Fields = ("loc", "lastmod", "changefreq", "priority")
|
|
222
|
+
valid_freqs: ValidFreqs = (
|
|
223
|
+
"always",
|
|
224
|
+
"hourly",
|
|
225
|
+
"daily",
|
|
226
|
+
"weekly",
|
|
227
|
+
"monthly",
|
|
228
|
+
"yearly",
|
|
229
|
+
"never",
|
|
230
|
+
)
|
|
231
|
+
|
|
232
|
+
def __init__(
|
|
233
|
+
self: Url,
|
|
234
|
+
loc: str | None,
|
|
235
|
+
lastmod: str | None = None,
|
|
236
|
+
changefreq: str | None = None,
|
|
237
|
+
priority: str | float | None = None,
|
|
238
|
+
) -> None:
|
|
239
|
+
"""Creates a Url instance.
|
|
240
|
+
|
|
241
|
+
Args:
|
|
242
|
+
loc: Location.
|
|
243
|
+
lastmod: Last modified.
|
|
244
|
+
changefreq: Change frequency.
|
|
245
|
+
priority: Priority.
|
|
246
|
+
"""
|
|
247
|
+
self.loc = loc
|
|
248
|
+
self.lastmod = lastmod
|
|
249
|
+
self.changefreq = changefreq
|
|
250
|
+
self.priority = float(priority) if priority is not None else None
|
|
251
|
+
|
|
252
|
+
@property
|
|
253
|
+
def changefreq(self: Url) -> Freqs | None:
|
|
254
|
+
"""Get changefreq."""
|
|
255
|
+
return self._changefreq
|
|
256
|
+
|
|
257
|
+
@changefreq.setter
|
|
258
|
+
def changefreq(self: Url, frequency: str | None) -> None:
|
|
259
|
+
"""Set changefreq.
|
|
260
|
+
|
|
261
|
+
Args:
|
|
262
|
+
self: The Url instance
|
|
263
|
+
frequency: Change frequency.
|
|
264
|
+
|
|
265
|
+
Raises:
|
|
266
|
+
ValueError: Value is not an allowed value
|
|
267
|
+
"""
|
|
268
|
+
if frequency is not None and frequency not in Url.valid_freqs:
|
|
269
|
+
msg: str = f"'{frequency}' is not an allowed value: {Url.valid_freqs}"
|
|
270
|
+
raise ValueError(msg)
|
|
271
|
+
self._changefreq: Freqs | None = frequency
|
|
272
|
+
|
|
273
|
+
@property
|
|
274
|
+
def priority(self: Url) -> float | None:
|
|
275
|
+
"""Get priority.
|
|
276
|
+
|
|
277
|
+
Returns:
|
|
278
|
+
Priority
|
|
279
|
+
"""
|
|
280
|
+
return self._priority
|
|
281
|
+
|
|
282
|
+
@priority.setter
|
|
283
|
+
def priority(self: Url, priority: float | None) -> None:
|
|
284
|
+
if priority is not None:
|
|
285
|
+
min_value = 0.0
|
|
286
|
+
max_value = 1.0
|
|
287
|
+
if priority < min_value or priority > max_value:
|
|
288
|
+
msg: str = f"'{priority}' is not between 0.0 and 1.0"
|
|
289
|
+
raise ValueError(msg)
|
|
290
|
+
|
|
291
|
+
self._priority: float | None = priority
|
|
292
|
+
|
|
293
|
+
def __str__(self: Url) -> str:
|
|
294
|
+
"""Return a string representation of the Url instance.
|
|
295
|
+
|
|
296
|
+
Returns:
|
|
297
|
+
String representation of the Url instance
|
|
298
|
+
"""
|
|
299
|
+
return self.loc or ""
|
|
300
|
+
|
|
301
|
+
def __repr__(self: Url) -> str:
|
|
302
|
+
"""Return a string representation of the Url instance.
|
|
303
|
+
|
|
304
|
+
Returns:
|
|
305
|
+
String representation of the Url instance
|
|
306
|
+
"""
|
|
307
|
+
return f"Url(loc={self.loc}, lastmod={self.lastmod}, changefreq={self.changefreq}, priority={self.priority})"
|
|
308
|
+
|
|
309
|
+
|
|
310
|
+
# MARK: UrlSet
|
|
311
|
+
class UrlSet:
    r"""Represents a <urlset\> element.

    It contains multiple <url> entries, each represented by a Url instance.

    Example usage:
    ```python
    # xmltodict output for a <urlset> element looks like this:
    urlset_data = {
        "url": [
            {
                "loc": "https://example.com/",
                "lastmod": "2024-01-01T00:00:00Z",
                "changefreq": "daily",
                "priority": "0.8",
            },
            {
                "loc": "https://example.com/about",
                "lastmod": "2024-01-02T00:00:00Z",
                "changefreq": "weekly",
                "priority": "0.5",
            },
        ]
    }
    # Iterating a UrlSet yields one Url instance per entry.
    for url in UrlSet(urlset_data):
        print(url)
    ```
    """

    # The only <url> child elements that are copied into Url instances.
    allowed_fields: typing.ClassVar[tuple[str, ...]] = (
        "loc",
        "lastmod",
        "changefreq",
        "priority",
    )

    def __init__(self, urlset_data: dict[str, Any]) -> None:
        r"""Initialize the UrlSet instance with the parsed <urlset\> data."""
        # Raw xmltodict output; converted to Url objects lazily on iteration.
        self.urlset_data: dict[str, Any] = urlset_data

    @staticmethod
    def url_from_dict(url_dict: dict[str, Any]) -> Url:
        """Creates a Url instance from a dict representing a <url> element.

        Args:
            url_dict: A dict as returned by xmltodict for a <url> element.

        Returns:
            Url: A Url instance populated from the provided dict.
        """
        logger.debug("url_from_dict %s", url_dict)
        # Keep only the recognized fields that are actually present;
        # absent fields fall back to Url's own defaults.
        url_data: dict[str, str | None] = {
            name: url_dict[name]
            for name in UrlSet.allowed_fields
            if url_dict.get(name) is not None
        }
        logger.debug("url_data %s", url_data)
        return Url(**url_data)

    @staticmethod
    def urls_from_url_set_data(
        url_set_data: dict[str, Any],
    ) -> Generator[Url, typing.Any]:
        r"""Generate Url instances from xmltodict output for a <urlset\>.

        Args:
            url_set_data: Parsed xmltodict output for the <urlset\> element.

        Yields:
            Url: A Url instance for each <url\> entry.
        """
        logger.debug("urls_from_url_set_data %s", url_set_data)

        # xmltodict emits a bare dict (not a list) when only one <url> exists.
        entries: list[dict[str, Any]] | dict[str, Any] = url_set_data.get("url", [])
        if isinstance(entries, dict):
            entries = [entries]

        yield from map(UrlSet.url_from_dict, entries)

    def __iter__(self) -> Iterator[Url]:
        """Iterate over the Url instances contained in this <urlset>.

        Returns:
            Iterator of Url instances.
        """
        return self.urls_from_url_set_data(self.urlset_data)
|
|
402
|
+
|
|
403
|
+
|
|
404
|
+
# MARK: SitemapIndex
|
|
405
|
+
class SitemapIndex:
    """Represents a <sitemapindex> element.

    A sitemap index points at other sitemap files; iterating a SitemapIndex
    yields one Sitemap instance per <sitemap> entry.
    """

    def __init__(self, index_data: dict[str, Any]) -> None:
        """Initialize the SitemapIndex instance with the parsed <sitemapindex> data."""
        # Raw xmltodict output; converted to Sitemap objects lazily on iteration.
        self.index_data: dict[str, Any] = index_data

    @staticmethod
    def sitemap_from_dict(sitemap_dict: dict[str, Any]) -> Sitemap:
        """Creates a Sitemap instance from a dict representing a <sitemap> element.

        Args:
            sitemap_dict: A dict as returned by xmltodict for a <sitemap> element.

        Returns:
            Sitemap: A Sitemap instance populated from the provided dict.
        """
        # Only <loc> and <lastmod> are valid children of <sitemap>;
        # absent fields fall back to Sitemap's own defaults.
        sitemap_data: dict[str, str] = {}
        for fld in ("loc", "lastmod"):
            value = sitemap_dict.get(fld)
            if value is not None:
                sitemap_data[fld] = value

        # BUG FIX: this previously used a "{}" (str.format) placeholder with
        # logger.debug(msg, sitemap_data); logging interpolates %-style, so
        # the argument was never substituted and logging reported a
        # formatting error when the record was emitted.
        logger.debug("Returning sitemap object with data: %s", sitemap_data)
        return Sitemap(**sitemap_data)

    @staticmethod
    def sitemaps_from_index_data(index_data: dict[str, Any]) -> Generator[Sitemap, Any]:
        """Generate Sitemap instances from xmltodict output for a <sitemapindex>.

        Args:
            index_data: Parsed xmltodict output for the <sitemapindex> element.

        Yields:
            Sitemap: A Sitemap instance for each <sitemap> entry.
        """
        logger.debug("Generating sitemaps from %s", index_data)

        # xmltodict emits a bare dict (not a list) when only one <sitemap> exists.
        sitemap_items: list[dict[str, Any]] | dict[str, Any] = index_data.get(
            "sitemap",
            [],
        )
        if isinstance(sitemap_items, dict):
            sitemap_items = [sitemap_items]

        for sitemap_dict in sitemap_items:
            yield SitemapIndex.sitemap_from_dict(sitemap_dict)

    def __iter__(self) -> Iterator[Sitemap]:
        """Generator for Sitemap instances from a <sitemapindex> element.

        Returns:
            Iterator of Sitemap instances.
        """
        return SitemapIndex.sitemaps_from_index_data(self.index_data)

    def __str__(self) -> str:
        """Return a human-readable representation of the index data."""
        return f"<SitemapIndex: {self.index_data}>"
|
|
467
|
+
|
|
468
|
+
|
|
469
|
+
# MARK: SiteMapParser
|
|
470
|
+
class SiteMapParser:
    """Parses a sitemap or sitemap index and returns the appropriate object."""

    def __init__(
        self,
        source: str,
        *,
        is_data_string: bool = False,
    ) -> None:
        """Initialize the SiteMapParser instance with the URI.

        The source can be a URL or a raw XML string. The parser will determine
        whether to download the data or use the provided string.

        Args:
            source: The URL of the sitemap or raw XML string.
            is_data_string: Whether the source is a raw XML string or not.
        """
        self.source: str = source
        self.is_sitemap_index: bool = False
        self._sitemaps: SitemapIndex | None = None
        self._url_set: UrlSet | None = None
        self._parsed_dict: dict[str, Any] | None = None
        self._is_data_string: bool = is_data_string
        self._xml_bytes: bytes | None = None
        self._initialize()

    def _initialize(self) -> None:
        """Fetch (or decode) the XML and parse it into sitemaps or URLs."""
        # Determine if we're using raw XML data or downloading from a URL.
        if self._is_data_string:
            data: bytes = self.source.encode("utf-8")
        else:
            # FIX: the "data: bytes" annotation was repeated in this branch,
            # which is a redefinition error under static type checkers.
            data = download_uri_data(uri=self.source)

        self._xml_bytes = data

        # Use xmltodict to parse sitemap content into a dictionary.
        # This avoids relying on ElementTree/XPath for extraction.
        # force_list guarantees "url"/"sitemap" entries are always lists.
        parsed: dict[str, Any] = xmltodict.parse(data, force_list=("url", "sitemap"))
        self._parsed_dict = parsed

        # The root element decides the document type; strip any namespace
        # prefix (e.g. "ns0:sitemapindex" -> "sitemapindex").
        root_key = next(iter(parsed))
        root_tag = root_key.split(":")[-1]
        self.is_sitemap_index = root_tag == "sitemapindex"

        if self.is_sitemap_index:
            self._sitemaps = SitemapIndex(index_data=parsed[root_key])
        else:
            self._url_set = UrlSet(urlset_data=parsed[root_key])

    def get_sitemaps(self) -> SitemapIndex:
        """Retrieve the sitemaps.

        Can check if 'has_sitemaps()' returns True to determine
        if this should be used without calling it

        Returns:
            SitemapIndex: The sitemaps as a SitemapIndex instance

        Raises:
            KeyError: If the root is not a <sitemapindex>
        """
        if not self.has_sitemaps():
            error_msg = "Method called when root is not a <sitemapindex>"
            logger.critical(error_msg)
            raise KeyError(error_msg)

        if self._sitemaps is None:
            msg = "Sitemaps are not available"
            raise KeyError(msg)

        return self._sitemaps

    def get_urls(self) -> UrlSet:
        """Retrieve the URLs from the sitemap.

        Returns:
            UrlSet: The URLs as a UrlSet instance.

        Raises:
            KeyError: If the root is not a <urlset>.
        """
        if not self.has_urls():
            # FIX: has_urls() is False exactly when the root is a
            # <sitemapindex>, so log and raise directly. The previous code
            # re-tested self.is_sitemap_index in a nested `if` whose false
            # branch silently fell through without raising.
            logger.critical("Method called when root is not a <urlset>")
            error_msg = "Method called when root is a <sitemapindex>. Use 'get_sitemaps()' instead"
            raise KeyError(error_msg)

        if self._url_set is None:
            msg = "URLs are not available"
            raise KeyError(msg)

        return self._url_set

    def has_sitemaps(self) -> bool:
        """Determine if the URL's data contained sitemaps.

        A sitemap can contain other sitemaps. For example: <https://www.webhallen.com/sitemap.xml>

        Returns:
            Boolean
        """
        return self.is_sitemap_index

    def has_urls(self) -> bool:
        """Determine if the URL's data contained urls.

        Returns:
            Boolean
        """
        return not self.is_sitemap_index

    def to_dict(
        self,
        *,
        process_namespaces: bool = False,
        **xmltodict_kwargs: object,
    ) -> dict[str, Any]:
        """Parse the underlying XML input into a Python dict.

        Args:
            process_namespaces (bool): Expand namespaces into the returned dict.
            **xmltodict_kwargs: Additional keyword arguments passed to :func:`xmltodict.parse`.

        Returns:
            dict[str, Any]: The parsed XML as a Python dictionary.

        Raises:
            RuntimeError: If the parser does not have XML data available.
        """
        xml_bytes: bytes | None = self._xml_bytes
        if xml_bytes is None:
            msg = "No XML data available to parse"
            raise RuntimeError(msg)

        # If we have already parsed the XML and the caller is using default
        # options, just return the cached parse.
        if (
            not process_namespaces
            and not xmltodict_kwargs
            and self._parsed_dict is not None
        ):
            return self._parsed_dict

        kwargs: dict[str, Any] = {"process_namespaces": process_namespaces}
        kwargs.update(xmltodict_kwargs)  # type: ignore[arg-type]

        return xmltodict.parse(xml_bytes, **kwargs)

    def __str__(self) -> str:
        """String representation of the SiteMapParser instance.

        Returns:
            str
        """
        return str(self._sitemaps if self.has_sitemaps() else self._url_set)
|
|
631
|
+
|
|
632
|
+
|
|
633
|
+
# MARK: JSONExporter
|
|
634
|
+
class JSONExporter:
    """Export site map data to JSON format."""

    def __init__(self, data: SiteMapParser) -> None:
        """Initializes the JSONExporter instance with the site map data."""
        self.data: SiteMapParser = data

    @staticmethod
    def _collate(
        fields: SitemapFields | UrlFields,
        row_data: SitemapIndex | UrlSet,
    ) -> list[dict[str, Any]]:
        """Collate data from SitemapIndex or UrlSet into a list of dictionaries.

        Args:
            fields (SitemapFields | UrlFields): The fields to include in the output.
            row_data (SitemapIndex | UrlSet): An iterable containing Sitemap or Url objects.

        Returns:
            list: A list of dictionaries where each dictionary represents a Sitemap or Url object.
        """

        def _jsonable(value):
            # datetimes are not JSON-serializable; emit ISO 8601 strings.
            return value.isoformat() if isinstance(value, datetime) else value

        return [
            {name: _jsonable(getattr(entry, name)) for name in fields}
            for entry in row_data
        ]

    def export_sitemaps(self) -> str:
        """Export site map data to JSON format.

        Returns:
            str: JSON data as a string
        """
        fallback: SitemapFields = ("loc", "lastmod")
        # Prefer the field list declared on the Sitemap class, if any.
        sitemap_fields: SitemapFields = getattr(Sitemap, "fields", fallback)
        return dumps(self._collate(sitemap_fields, self.data.get_sitemaps()))

    def export_urls(self) -> str:
        """Export site map data to JSON format.

        Returns:
            str: JSON data as a string
        """
        fallback: UrlFields = ("loc", "lastmod", "changefreq", "priority")
        # Prefer the field list declared on the Url class, if any.
        url_fields: UrlFields = getattr(Url, "fields", fallback)
        return dumps(self._collate(url_fields, self.data.get_urls()))
|
|
File without changes
|