ultimate-sitemap-parser 1.3.1__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ultimate-sitemap-parser might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ultimate-sitemap-parser
3
- Version: 1.3.1
3
+ Version: 1.4.0
4
4
  Summary: A performant library for parsing and crawling sitemaps
5
5
  License: GPL-3.0-or-later
6
6
  Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
@@ -4,8 +4,8 @@ usp/cli/_ls.py,sha256=V0pMsDiQK_9RZ5MyUS2toW8b6e2FJ4spb3Grw6PayAI,3419
4
4
  usp/cli/_util.py,sha256=OrT9en350tATnaUrUn0peXr7aFPyYaaHGbEXGY6O4wI,2015
5
5
  usp/cli/cli.py,sha256=2byuqhBUhb7c1qUpBfTTufG-jvtiEWWq97GvCgv-s44,777
6
6
  usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
7
- usp/fetch_parse.py,sha256=69U1uAKawUym41N4nwJXLW9tQ0WXO4Pi63hnljYCXPM,43524
8
- usp/helpers.py,sha256=FeIZcEuEM3Uz8tHeNucgoB3_27Ax6qCatfalPIHHGUY,8862
7
+ usp/fetch_parse.py,sha256=COngyIf0ifmv_XAegUKzEn74pwdak9F6Thi7DSbglRs,44312
8
+ usp/helpers.py,sha256=OSP5W9N1WFc7MDZxYnGLvHKLodkkKUrq2ND65h9l-9Q,8726
9
9
  usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
11
11
  usp/objects/sitemap.py,sha256=_t0ej2UmNsIb0NkxYkwYGxBqX_LHEJfNc-cRulQXyIk,11495
@@ -13,9 +13,9 @@ usp/tree.py,sha256=MdnVxfIIMqWrudsYxFI8yQTXnlmNLFEcQEOkXbnuBr4,4395
13
13
  usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
14
  usp/web_client/abstract_client.py,sha256=EWY4lPYJqpV7ge0DTZESTAOofAjNMIJnDm_2PPeZ9z4,7007
15
15
  usp/web_client/requests_client.py,sha256=sFYtJ8Q5z27WlTG1PgBzcvbS75pJ0pYUastEFmxa95U,5888
16
- ultimate_sitemap_parser-1.3.1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
17
- ultimate_sitemap_parser-1.3.1.dist-info/METADATA,sha256=GUU8qLo24ZGBtAd4CYaHxY927eFFGvKlVPhc6jfg5so,4397
18
- ultimate_sitemap_parser-1.3.1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
19
- ultimate_sitemap_parser-1.3.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
20
- ultimate_sitemap_parser-1.3.1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
21
- ultimate_sitemap_parser-1.3.1.dist-info/RECORD,,
16
+ ultimate_sitemap_parser-1.4.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
17
+ ultimate_sitemap_parser-1.4.0.dist-info/METADATA,sha256=L2nASfrjc5-hZL71JAmf6h6SUeeAVHyppi5fbuDt5lo,4397
18
+ ultimate_sitemap_parser-1.4.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
19
+ ultimate_sitemap_parser-1.4.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
20
+ ultimate_sitemap_parser-1.4.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
21
+ ultimate_sitemap_parser-1.4.0.dist-info/RECORD,,
usp/fetch_parse.py CHANGED
@@ -366,6 +366,7 @@ class XMLSitemapParser(AbstractSitemapParser):
366
366
 
367
367
  __slots__ = [
368
368
  "_concrete_parser",
369
+ "_is_non_ns_sitemap",
369
370
  ]
370
371
 
371
372
  def __init__(
@@ -386,6 +387,8 @@ class XMLSitemapParser(AbstractSitemapParser):
386
387
 
387
388
  # Will be initialized when the type of sitemap is known
388
389
  self._concrete_parser = None
390
+ # Whether this is a malformed sitemap with no namespace
391
+ self._is_non_ns_sitemap = False
389
392
 
390
393
  def sitemap(self) -> AbstractSitemap:
391
394
  parser = xml.parsers.expat.ParserCreate(
@@ -411,8 +414,7 @@ class XMLSitemapParser(AbstractSitemapParser):
411
414
 
412
415
  return self._concrete_parser.sitemap()
413
416
 
414
- @classmethod
415
- def __normalize_xml_element_name(cls, name: str):
417
+ def __normalize_xml_element_name(self, name: str):
416
418
  """
417
419
  Replace the namespace URL in the argument element name with internal namespace.
418
420
 
@@ -428,7 +430,7 @@ class XMLSitemapParser(AbstractSitemapParser):
428
430
  :return: Internal namespace name plus element name, e.g. "sitemap loc"
429
431
  """
430
432
 
431
- name_parts = name.split(cls.__XML_NAMESPACE_SEPARATOR)
433
+ name_parts = name.split(self.__XML_NAMESPACE_SEPARATOR)
432
434
 
433
435
  if len(name_parts) == 1:
434
436
  namespace_url = ""
@@ -451,6 +453,19 @@ class XMLSitemapParser(AbstractSitemapParser):
451
453
  name = f"image:{name}"
452
454
  elif "/sitemap-video/" in namespace_url:
453
455
  name = f"video:{name}"
456
+ elif name in {"urlset", "sitemapindex"}:
457
+ # XML sitemap root tag but namespace is not set
458
+ self._is_non_ns_sitemap = True
459
+ log.warning(
460
+ f'XML sitemap root tag {name} detected without expected xmlns (value is "{namespace_url}"), '
461
+ f"assuming is an XML sitemap."
462
+ )
463
+ name = f"sitemap:{name}"
464
+ elif self._is_non_ns_sitemap:
465
+ # Flag has previously been set and no other namespace matched,
466
+ # assume this should be in the sitemap namespace
467
+ log.debug(f"Assuming {name} should be in sitemap namespace")
468
+ name = f"sitemap:{name}"
454
469
  else:
455
470
  # We don't care about the rest of the namespaces, so just keep the plain element name
456
471
  pass
usp/helpers.py CHANGED
@@ -194,13 +194,8 @@ def __response_is_gzipped_data(
194
194
  uri = urlparse(url)
195
195
  url_path = unquote_plus(uri.path)
196
196
  content_type = response.header("content-type") or ""
197
- content_encoding = response.header("content-encoding") or ""
198
197
 
199
- if (
200
- url_path.lower().endswith(".gz")
201
- or "gzip" in content_type.lower()
202
- or "gzip" in content_encoding.lower()
203
- ):
198
+ if url_path.lower().endswith(".gz") or "gzip" in content_type.lower():
204
199
  return True
205
200
 
206
201
  else:
@@ -260,7 +255,7 @@ def ungzipped_response_content(
260
255
  except GunzipException as ex:
261
256
  # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
262
257
  log.warning(
263
- f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
258
+ f"Unable to gunzip response for {url}, maybe it's a non-gzipped sitemap: {ex}"
264
259
  )
265
260
 
266
261
  # FIXME other encodings