ultimate-sitemap-parser 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release has been flagged as potentially problematic.


This version of ultimate-sitemap-parser might be problematic; consult the package registry's advisory page for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: ultimate-sitemap-parser
3
- Version: 1.1.0
3
+ Version: 1.1.1
4
4
  Summary: A performant library for parsing and crawling sitemaps
5
5
  License: GPL-3.0-or-later
6
6
  Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
@@ -0,0 +1,21 @@
1
+ usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
2
+ usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
3
+ usp/cli/_ls.py,sha256=BjF5bGuhe_E_Ak-yyY0cDM83LFstl5tA3XNIrGZJujs,2954
4
+ usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
5
+ usp/cli/cli.py,sha256=ySNyYHoCQ440KfxmpTkzLXgqtbnt5ru-TgPs2Zw2-LI,592
6
+ usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
7
+ usp/fetch_parse.py,sha256=VJrJSAG1X8oQyW2p9wSepuGWfHlMDNoJG8jn3an2XUY,41396
8
+ usp/helpers.py,sha256=S9d8fEhHzZqVCx3SkcWVTgW1JYKujH-tM86urjORNWA,8482
9
+ usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
11
+ usp/objects/sitemap.py,sha256=yt5qe6fyKfmvJmV60mB8kc7yooGcpYhuIcNlmUqFGFA,11486
12
+ usp/tree.py,sha256=AmK0TptwNAexwSBAjrziYvx9cueQDMt5w9_1m8d4edI,4055
13
+ usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
+ usp/web_client/abstract_client.py,sha256=7MpIfqQpi1_yojEmuReT8iy9kFUWCD3i2LMpHmBOwV0,6291
15
+ usp/web_client/requests_client.py,sha256=xxkBUHvakBN-Guw_DqGElZJVS42xgUwWHxM7jA_QEPI,5593
16
+ ultimate_sitemap_parser-1.1.1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
17
+ ultimate_sitemap_parser-1.1.1.dist-info/METADATA,sha256=9sK5LCSHHPuSvdDjakIqOm-Gv3_Lgm1tsZdDDFs8vSE,4447
18
+ ultimate_sitemap_parser-1.1.1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
19
+ ultimate_sitemap_parser-1.1.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
20
+ ultimate_sitemap_parser-1.1.1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
21
+ ultimate_sitemap_parser-1.1.1.dist-info/RECORD,,
usp/cli/_ls.py CHANGED
@@ -2,7 +2,7 @@ import argparse
2
2
  import sys
3
3
  from typing import Iterator
4
4
 
5
- from usp.cli._util import tabs, format_help
5
+ from usp.cli._util import format_help, tabs
6
6
  from usp.objects.sitemap import AbstractSitemap
7
7
  from usp.tree import sitemap_tree_for_homepage
8
8
 
usp/cli/cli.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from argparse import ArgumentParser
2
2
 
3
- from usp.cli import _ls as ls_cmd
4
3
  from usp import __version__
4
+ from usp.cli import _ls as ls_cmd
5
5
 
6
6
 
7
7
  def main():
usp/fetch_parse.py CHANGED
@@ -8,49 +8,49 @@
8
8
  """
9
9
 
10
10
  import abc
11
+ import logging
11
12
  import re
12
13
  import xml.parsers.expat
13
14
  from collections import OrderedDict
14
15
  from decimal import Decimal, InvalidOperation
15
- from typing import Optional, Dict, Union
16
-
16
+ from typing import Dict, Optional, Union
17
17
 
18
18
  from .exceptions import SitemapException, SitemapXMLParsingException
19
19
  from .helpers import (
20
- html_unescape_strip,
21
- parse_iso8601_date,
22
20
  get_url_retry_on_client_errors,
23
- ungzipped_response_content,
21
+ html_unescape_strip,
24
22
  is_http_url,
23
+ parse_iso8601_date,
25
24
  parse_rfc2822_date,
25
+ ungzipped_response_content,
26
26
  )
27
- from .log import create_logger
28
27
  from .objects.page import (
28
+ SITEMAP_PAGE_DEFAULT_PRIORITY,
29
29
  SitemapImage,
30
- SitemapPage,
31
30
  SitemapNewsStory,
31
+ SitemapPage,
32
32
  SitemapPageChangeFrequency,
33
- SITEMAP_PAGE_DEFAULT_PRIORITY,
34
33
  )
35
34
  from .objects.sitemap import (
36
35
  AbstractSitemap,
37
- InvalidSitemap,
38
36
  IndexRobotsTxtSitemap,
39
37
  IndexXMLSitemap,
40
- PagesXMLSitemap,
41
- PagesTextSitemap,
42
- PagesRSSSitemap,
38
+ InvalidSitemap,
43
39
  PagesAtomSitemap,
40
+ PagesRSSSitemap,
41
+ PagesTextSitemap,
42
+ PagesXMLSitemap,
44
43
  )
45
44
  from .web_client.abstract_client import (
46
45
  AbstractWebClient,
47
46
  AbstractWebClientSuccessResponse,
47
+ LocalWebClient,
48
+ NoWebClientException,
48
49
  WebClientErrorResponse,
49
50
  )
50
- from .web_client.abstract_client import LocalWebClient, NoWebClientException
51
51
  from .web_client.requests_client import RequestsWebClient
52
52
 
53
- log = create_logger(__name__)
53
+ log = logging.getLogger(__name__)
54
54
 
55
55
 
56
56
  class SitemapFetcher:
usp/helpers.py CHANGED
@@ -3,24 +3,25 @@
3
3
  import datetime
4
4
  import gzip as gzip_lib
5
5
  import html
6
+ import logging
6
7
  import re
7
8
  import sys
8
9
  import time
9
10
  from typing import Optional
10
- from urllib.parse import urlparse, unquote_plus, urlunparse
11
- from dateutil.parser import parse as dateutil_parse
11
+ from urllib.parse import unquote_plus, urlparse, urlunparse
12
+
12
13
  from dateutil.parser import isoparse as dateutil_isoparse
14
+ from dateutil.parser import parse as dateutil_parse
13
15
 
14
- from .exceptions import SitemapException, GunzipException, StripURLToHomepageException
15
- from .log import create_logger
16
+ from .exceptions import GunzipException, SitemapException, StripURLToHomepageException
16
17
  from .web_client.abstract_client import (
17
18
  AbstractWebClient,
19
+ AbstractWebClientResponse,
18
20
  AbstractWebClientSuccessResponse,
19
21
  WebClientErrorResponse,
20
- AbstractWebClientResponse,
21
22
  )
22
23
 
23
- log = create_logger(__name__)
24
+ log = logging.getLogger(__name__)
24
25
 
25
26
  __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
26
27
  """Regular expression to match HTTP(s) URLs."""
@@ -247,7 +248,7 @@ def ungzipped_response_content(
247
248
  data = gunzip(data)
248
249
  except GunzipException as ex:
249
250
  # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
250
- log.error(
251
+ log.warning(
251
252
  f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
252
253
  )
253
254
 
usp/objects/sitemap.py CHANGED
@@ -9,11 +9,11 @@
9
9
  """
10
10
 
11
11
  import abc
12
- from functools import lru_cache
13
12
  import os
14
13
  import pickle
15
14
  import tempfile
16
- from typing import List, Iterator, Tuple
15
+ from functools import lru_cache
16
+ from typing import Iterator, List, Tuple
17
17
 
18
18
  from .page import SitemapPage
19
19
 
@@ -72,7 +72,7 @@ class AbstractSitemap(metaclass=abc.ABCMeta):
72
72
  return hash((self.url,))
73
73
 
74
74
  def __repr__(self):
75
- return f"{self.__class__.__name__}(" f"url={self.url}" ")"
75
+ return f"{self.__class__.__name__}(url={self.url})"
76
76
 
77
77
  @property
78
78
  def url(self) -> str:
@@ -167,12 +167,7 @@ class InvalidSitemap(AbstractSitemap):
167
167
  return True
168
168
 
169
169
  def __repr__(self):
170
- return (
171
- f"{self.__class__.__name__}("
172
- f"url={self.url}, "
173
- f"reason={self.reason}"
174
- ")"
175
- )
170
+ return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"
176
171
 
177
172
  def to_dict(self, with_pages=True) -> dict:
178
173
  return {
usp/tree.py CHANGED
@@ -1,19 +1,20 @@
1
1
  """Helpers to generate a sitemap tree."""
2
2
 
3
+ import logging
3
4
  from typing import Optional
5
+
4
6
  from .exceptions import SitemapException
5
7
  from .fetch_parse import SitemapFetcher, SitemapStrParser
6
8
  from .helpers import is_http_url, strip_url_to_homepage
7
- from .log import create_logger
8
9
  from .objects.sitemap import (
9
10
  AbstractSitemap,
10
- InvalidSitemap,
11
- IndexWebsiteSitemap,
12
11
  IndexRobotsTxtSitemap,
12
+ IndexWebsiteSitemap,
13
+ InvalidSitemap,
13
14
  )
14
15
  from .web_client.abstract_client import AbstractWebClient
15
16
 
16
- log = create_logger(__name__)
17
+ log = logging.getLogger(__name__)
17
18
 
18
19
  _UNPUBLISHED_SITEMAP_PATHS = {
19
20
  "sitemap.xml",
@@ -2,8 +2,8 @@
2
2
 
3
3
  import abc
4
4
  import random
5
- from http import HTTPStatus
6
5
  import time
6
+ from http import HTTPStatus
7
7
  from typing import Optional
8
8
 
9
9
  RETRYABLE_HTTP_STATUS_CODES = {
@@ -1,20 +1,23 @@
1
1
  """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
2
2
 
3
- from http import HTTPStatus
4
3
  import logging
5
- from typing import Optional, Dict, Tuple, Union
4
+ from http import HTTPStatus
5
+ from typing import Dict, Optional, Tuple, Union
6
6
 
7
7
  import requests
8
8
 
9
+ from usp import __version__
10
+
9
11
  from .abstract_client import (
12
+ RETRYABLE_HTTP_STATUS_CODES,
10
13
  AbstractWebClient,
11
14
  AbstractWebClientResponse,
12
15
  AbstractWebClientSuccessResponse,
13
16
  RequestWaiter,
14
17
  WebClientErrorResponse,
15
- RETRYABLE_HTTP_STATUS_CODES,
16
18
  )
17
- from usp import __version__
19
+
20
+ log = logging.getLogger(__name__)
18
21
 
19
22
 
20
23
  class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
@@ -153,7 +156,7 @@ class RequestsWebClient(AbstractWebClient):
153
156
  )
154
157
  else:
155
158
  message = f"{response.status_code} {response.reason}"
156
- logging.info(f"Response content: {response.text}")
159
+ log.info(f"Response content: {response.text}")
157
160
 
158
161
  if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
159
162
  return RequestsWebClientErrorResponse(
@@ -1,22 +0,0 @@
1
- usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
2
- usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
3
- usp/cli/_ls.py,sha256=YyDmtBjK02_26Qv8-3NLf87b1C4Wt0GzZ1XkdF2fllQ,2954
4
- usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
5
- usp/cli/cli.py,sha256=D1tXZyhiG0sIwtepdPdglW5gUlPWyx4LNeBmaM700Yc,592
6
- usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
7
- usp/fetch_parse.py,sha256=_AmrKQa_5dupqWWmXr3MXuPW3u_V4KBfLwCC0YaY3kk,41440
8
- usp/helpers.py,sha256=QJH3ETapqqbwRnjX_LM0EhWqeta9LTqVvW5OAkBKUOc,8491
9
- usp/log.py,sha256=BS0AtURK62TPGVqEuIu8kwGtIJDYoGsK5_N-b60VOpE,1631
10
- usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
12
- usp/objects/sitemap.py,sha256=XWgke1SJNA79qnOEvaY2nJbnlidWxqBvfuRcF4GhBHI,11564
13
- usp/tree.py,sha256=2cuHOpdYX5aKZ4XuUQPaKjILnoPnFKZwPNn0g8cxT18,4066
14
- usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
- usp/web_client/abstract_client.py,sha256=m6glqfJyY_i63HhRb0SQLCx-i9yt-nwssiN_I-sCvQk,6291
16
- usp/web_client/requests_client.py,sha256=ywOovvBG4gnk0f_AJW8ZOv5r1DCrfk97fPjUT_f2US4,5561
17
- ultimate_sitemap_parser-1.1.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
18
- ultimate_sitemap_parser-1.1.0.dist-info/METADATA,sha256=ya5PV0x-dJpeTpK4J5UUsgiMhzXlQ9DyH4QTv-ozQXk,4447
19
- ultimate_sitemap_parser-1.1.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
20
- ultimate_sitemap_parser-1.1.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
21
- ultimate_sitemap_parser-1.1.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
22
- ultimate_sitemap_parser-1.1.0.dist-info/RECORD,,
usp/log.py DELETED
@@ -1,77 +0,0 @@
1
- """Logging utilities."""
2
-
3
- import logging
4
-
5
-
6
- class Logger:
7
- """
8
- Logging helper class.
9
- """
10
-
11
- __LEVELS = {
12
- "CRITICAL": logging.CRITICAL,
13
- "ERROR": logging.ERROR,
14
- "WARNING": logging.WARNING,
15
- "INFO": logging.INFO,
16
- "DEBUG": logging.DEBUG,
17
- }
18
- """Valid logging levels and their "logging" counterparts."""
19
-
20
- __DEFAULT_LEVEL = "INFO"
21
- """Default logging level."""
22
-
23
- __slots__ = [
24
- # "logging" object
25
- "__l",
26
- ]
27
-
28
- def __init__(self, name: str):
29
- """
30
- Initialize logger object for a given name.
31
-
32
- :param name: Module name that the logger should be initialized for.
33
- """
34
-
35
- self.__l = logging.getLogger(name)
36
-
37
- def error(self, message: str) -> None:
38
- """
39
- Log error message.
40
-
41
- :param message: Message to log.
42
- """
43
- self.__l.error(message)
44
-
45
- def warning(self, message: str) -> None:
46
- """
47
- Log warning message.
48
-
49
- :param message: Message to log.
50
- """
51
- self.__l.warning(message)
52
-
53
- def info(self, message: str) -> None:
54
- """
55
- Log informational message.
56
-
57
- :param message: Message to log.
58
- """
59
- self.__l.info(message)
60
-
61
- def debug(self, message: str) -> None:
62
- """
63
- Log debugging message.
64
-
65
- :param message: Message to log.
66
- """
67
- self.__l.debug(message)
68
-
69
-
70
- def create_logger(name: str) -> Logger:
71
- """
72
- Create and return Logger object.
73
-
74
- :param name: Module name that the logger should be initialized for.
75
- :return: Logger object.
76
- """
77
- return Logger(name=name)