ultimate-sitemap-parser 1.3.1__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic. Click here for more details.
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/PKG-INFO +1 -1
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/pyproject.toml +2 -2
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/fetch_parse.py +18 -3
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/helpers.py +2 -7
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/README.rst +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/__init__.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/cli/__init__.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/cli/_ls.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/cli/_util.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/cli/cli.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/exceptions.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/objects/__init__.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/objects/page.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/objects/sitemap.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/tree.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/web_client/__init__.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/web_client/abstract_client.py +0 -0
- {ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/web_client/requests_client.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[project]
|
|
2
2
|
name = "ultimate-sitemap-parser"
|
|
3
|
-
version = "1.3.1"
|
|
3
|
+
version = "1.4.0"
|
|
4
4
|
description = "A performant library for parsing and crawling sitemaps"
|
|
5
5
|
authors = [
|
|
6
6
|
{ name = "Linas Valiukas", email = "linas@media.mit.edu"},
|
|
@@ -53,7 +53,7 @@ python = ">=3.9,<4.0"
|
|
|
53
53
|
[tool.poetry.group.dev.dependencies]
|
|
54
54
|
requests-mock = ">=1.6.0,<2.0"
|
|
55
55
|
pytest = "^8.3.0"
|
|
56
|
-
ruff = "^0.
|
|
56
|
+
ruff = "^0.11.6"
|
|
57
57
|
vcrpy = "6.0.1"
|
|
58
58
|
pytest-mock = "^3.14.0"
|
|
59
59
|
|
|
@@ -366,6 +366,7 @@ class XMLSitemapParser(AbstractSitemapParser):
|
|
|
366
366
|
|
|
367
367
|
__slots__ = [
|
|
368
368
|
"_concrete_parser",
|
|
369
|
+
"_is_non_ns_sitemap",
|
|
369
370
|
]
|
|
370
371
|
|
|
371
372
|
def __init__(
|
|
@@ -386,6 +387,8 @@ class XMLSitemapParser(AbstractSitemapParser):
|
|
|
386
387
|
|
|
387
388
|
# Will be initialized when the type of sitemap is known
|
|
388
389
|
self._concrete_parser = None
|
|
390
|
+
# Whether this is a malformed sitemap with no namespace
|
|
391
|
+
self._is_non_ns_sitemap = False
|
|
389
392
|
|
|
390
393
|
def sitemap(self) -> AbstractSitemap:
|
|
391
394
|
parser = xml.parsers.expat.ParserCreate(
|
|
@@ -411,8 +414,7 @@ class XMLSitemapParser(AbstractSitemapParser):
|
|
|
411
414
|
|
|
412
415
|
return self._concrete_parser.sitemap()
|
|
413
416
|
|
|
414
|
-
@classmethod
|
|
415
|
-
def __normalize_xml_element_name(cls, name: str):
|
|
417
|
+
def __normalize_xml_element_name(self, name: str):
|
|
416
418
|
"""
|
|
417
419
|
Replace the namespace URL in the argument element name with internal namespace.
|
|
418
420
|
|
|
@@ -428,7 +430,7 @@ class XMLSitemapParser(AbstractSitemapParser):
|
|
|
428
430
|
:return: Internal namespace name plus element name, e.g. "sitemap loc"
|
|
429
431
|
"""
|
|
430
432
|
|
|
431
|
-
name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR)
|
|
433
|
+
name_parts = name.split(self.__XML_NAMESPACE_SEPARATOR)
|
|
432
434
|
|
|
433
435
|
if len(name_parts) == 1:
|
|
434
436
|
namespace_url = ""
|
|
@@ -451,6 +453,19 @@ class XMLSitemapParser(AbstractSitemapParser):
|
|
|
451
453
|
name = f"image:{name}"
|
|
452
454
|
elif "/sitemap-video/" in namespace_url:
|
|
453
455
|
name = f"video:{name}"
|
|
456
|
+
elif name in {"urlset", "sitemapindex"}:
|
|
457
|
+
# XML sitemap root tag but namespace is not set
|
|
458
|
+
self._is_non_ns_sitemap = True
|
|
459
|
+
log.warning(
|
|
460
|
+
f'XML sitemap root tag {name} detected without expected xmlns (value is "{namespace_url}"), '
|
|
461
|
+
f"assuming is an XML sitemap."
|
|
462
|
+
)
|
|
463
|
+
name = f"sitemap:{name}"
|
|
464
|
+
elif self._is_non_ns_sitemap:
|
|
465
|
+
# Flag has previously been set and no other namespace matched,
|
|
466
|
+
# assume this should be in the sitemap namespace
|
|
467
|
+
log.debug(f"Assuming {name} should be in sitemap namespace")
|
|
468
|
+
name = f"sitemap:{name}"
|
|
454
469
|
else:
|
|
455
470
|
# We don't care about the rest of the namespaces, so just keep the plain element name
|
|
456
471
|
pass
|
|
@@ -194,13 +194,8 @@ def __response_is_gzipped_data(
|
|
|
194
194
|
uri = urlparse(url)
|
|
195
195
|
url_path = unquote_plus(uri.path)
|
|
196
196
|
content_type = response.header("content-type") or ""
|
|
197
|
-
content_encoding = response.header("content-encoding") or ""
|
|
198
197
|
|
|
199
|
-
if (
|
|
200
|
-
url_path.lower().endswith(".gz")
|
|
201
|
-
or "gzip" in content_type.lower()
|
|
202
|
-
or "gzip" in content_encoding.lower()
|
|
203
|
-
):
|
|
198
|
+
if url_path.lower().endswith(".gz") or "gzip" in content_type.lower():
|
|
204
199
|
return True
|
|
205
200
|
|
|
206
201
|
else:
|
|
@@ -260,7 +255,7 @@ def ungzipped_response_content(
|
|
|
260
255
|
except GunzipException as ex:
|
|
261
256
|
# In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
|
|
262
257
|
log.warning(
|
|
263
|
-
f"Unable to gunzip response {
|
|
258
|
+
f"Unable to gunzip response for {url}, maybe it's a non-gzipped sitemap: {ex}"
|
|
264
259
|
)
|
|
265
260
|
|
|
266
261
|
# FIXME other encodings
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/web_client/abstract_client.py
RENAMED
|
File without changes
|
{ultimate_sitemap_parser-1.3.1 → ultimate_sitemap_parser-1.4.0}/usp/web_client/requests_client.py
RENAMED
|
File without changes
|