ultimate-sitemap-parser 1.1.0__py3-none-any.whl → 1.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic. Click here for more details.
- {ultimate_sitemap_parser-1.1.0.dist-info → ultimate_sitemap_parser-1.1.1.dist-info}/METADATA +1 -1
- ultimate_sitemap_parser-1.1.1.dist-info/RECORD +21 -0
- usp/cli/_ls.py +1 -1
- usp/cli/cli.py +1 -1
- usp/fetch_parse.py +14 -14
- usp/helpers.py +8 -7
- usp/objects/sitemap.py +4 -9
- usp/tree.py +5 -4
- usp/web_client/abstract_client.py +1 -1
- usp/web_client/requests_client.py +8 -5
- ultimate_sitemap_parser-1.1.0.dist-info/RECORD +0 -22
- usp/log.py +0 -77
- {ultimate_sitemap_parser-1.1.0.dist-info → ultimate_sitemap_parser-1.1.1.dist-info}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.1.0.dist-info → ultimate_sitemap_parser-1.1.1.dist-info}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.1.0.dist-info → ultimate_sitemap_parser-1.1.1.dist-info}/WHEEL +0 -0
- {ultimate_sitemap_parser-1.1.0.dist-info → ultimate_sitemap_parser-1.1.1.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
|
|
2
|
+
usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
|
|
3
|
+
usp/cli/_ls.py,sha256=BjF5bGuhe_E_Ak-yyY0cDM83LFstl5tA3XNIrGZJujs,2954
|
|
4
|
+
usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
|
|
5
|
+
usp/cli/cli.py,sha256=ySNyYHoCQ440KfxmpTkzLXgqtbnt5ru-TgPs2Zw2-LI,592
|
|
6
|
+
usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
|
|
7
|
+
usp/fetch_parse.py,sha256=VJrJSAG1X8oQyW2p9wSepuGWfHlMDNoJG8jn3an2XUY,41396
|
|
8
|
+
usp/helpers.py,sha256=S9d8fEhHzZqVCx3SkcWVTgW1JYKujH-tM86urjORNWA,8482
|
|
9
|
+
usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
|
+
usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
|
|
11
|
+
usp/objects/sitemap.py,sha256=yt5qe6fyKfmvJmV60mB8kc7yooGcpYhuIcNlmUqFGFA,11486
|
|
12
|
+
usp/tree.py,sha256=AmK0TptwNAexwSBAjrziYvx9cueQDMt5w9_1m8d4edI,4055
|
|
13
|
+
usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
|
+
usp/web_client/abstract_client.py,sha256=7MpIfqQpi1_yojEmuReT8iy9kFUWCD3i2LMpHmBOwV0,6291
|
|
15
|
+
usp/web_client/requests_client.py,sha256=xxkBUHvakBN-Guw_DqGElZJVS42xgUwWHxM7jA_QEPI,5593
|
|
16
|
+
ultimate_sitemap_parser-1.1.1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
|
|
17
|
+
ultimate_sitemap_parser-1.1.1.dist-info/METADATA,sha256=9sK5LCSHHPuSvdDjakIqOm-Gv3_Lgm1tsZdDDFs8vSE,4447
|
|
18
|
+
ultimate_sitemap_parser-1.1.1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
|
|
19
|
+
ultimate_sitemap_parser-1.1.1.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
|
20
|
+
ultimate_sitemap_parser-1.1.1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
|
|
21
|
+
ultimate_sitemap_parser-1.1.1.dist-info/RECORD,,
|
usp/cli/_ls.py
CHANGED
usp/cli/cli.py
CHANGED
usp/fetch_parse.py
CHANGED
|
@@ -8,49 +8,49 @@
|
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
import abc
|
|
11
|
+
import logging
|
|
11
12
|
import re
|
|
12
13
|
import xml.parsers.expat
|
|
13
14
|
from collections import OrderedDict
|
|
14
15
|
from decimal import Decimal, InvalidOperation
|
|
15
|
-
from typing import
|
|
16
|
-
|
|
16
|
+
from typing import Dict, Optional, Union
|
|
17
17
|
|
|
18
18
|
from .exceptions import SitemapException, SitemapXMLParsingException
|
|
19
19
|
from .helpers import (
|
|
20
|
-
html_unescape_strip,
|
|
21
|
-
parse_iso8601_date,
|
|
22
20
|
get_url_retry_on_client_errors,
|
|
23
|
-
|
|
21
|
+
html_unescape_strip,
|
|
24
22
|
is_http_url,
|
|
23
|
+
parse_iso8601_date,
|
|
25
24
|
parse_rfc2822_date,
|
|
25
|
+
ungzipped_response_content,
|
|
26
26
|
)
|
|
27
|
-
from .log import create_logger
|
|
28
27
|
from .objects.page import (
|
|
28
|
+
SITEMAP_PAGE_DEFAULT_PRIORITY,
|
|
29
29
|
SitemapImage,
|
|
30
|
-
SitemapPage,
|
|
31
30
|
SitemapNewsStory,
|
|
31
|
+
SitemapPage,
|
|
32
32
|
SitemapPageChangeFrequency,
|
|
33
|
-
SITEMAP_PAGE_DEFAULT_PRIORITY,
|
|
34
33
|
)
|
|
35
34
|
from .objects.sitemap import (
|
|
36
35
|
AbstractSitemap,
|
|
37
|
-
InvalidSitemap,
|
|
38
36
|
IndexRobotsTxtSitemap,
|
|
39
37
|
IndexXMLSitemap,
|
|
40
|
-
|
|
41
|
-
PagesTextSitemap,
|
|
42
|
-
PagesRSSSitemap,
|
|
38
|
+
InvalidSitemap,
|
|
43
39
|
PagesAtomSitemap,
|
|
40
|
+
PagesRSSSitemap,
|
|
41
|
+
PagesTextSitemap,
|
|
42
|
+
PagesXMLSitemap,
|
|
44
43
|
)
|
|
45
44
|
from .web_client.abstract_client import (
|
|
46
45
|
AbstractWebClient,
|
|
47
46
|
AbstractWebClientSuccessResponse,
|
|
47
|
+
LocalWebClient,
|
|
48
|
+
NoWebClientException,
|
|
48
49
|
WebClientErrorResponse,
|
|
49
50
|
)
|
|
50
|
-
from .web_client.abstract_client import LocalWebClient, NoWebClientException
|
|
51
51
|
from .web_client.requests_client import RequestsWebClient
|
|
52
52
|
|
|
53
|
-
log = create_logger(__name__)
|
|
53
|
+
log = logging.getLogger(__name__)
|
|
54
54
|
|
|
55
55
|
|
|
56
56
|
class SitemapFetcher:
|
usp/helpers.py
CHANGED
|
@@ -3,24 +3,25 @@
|
|
|
3
3
|
import datetime
|
|
4
4
|
import gzip as gzip_lib
|
|
5
5
|
import html
|
|
6
|
+
import logging
|
|
6
7
|
import re
|
|
7
8
|
import sys
|
|
8
9
|
import time
|
|
9
10
|
from typing import Optional
|
|
10
|
-
from urllib.parse import
|
|
11
|
-
|
|
11
|
+
from urllib.parse import unquote_plus, urlparse, urlunparse
|
|
12
|
+
|
|
12
13
|
from dateutil.parser import isoparse as dateutil_isoparse
|
|
14
|
+
from dateutil.parser import parse as dateutil_parse
|
|
13
15
|
|
|
14
|
-
from .exceptions import
|
|
15
|
-
from .log import create_logger
|
|
16
|
+
from .exceptions import GunzipException, SitemapException, StripURLToHomepageException
|
|
16
17
|
from .web_client.abstract_client import (
|
|
17
18
|
AbstractWebClient,
|
|
19
|
+
AbstractWebClientResponse,
|
|
18
20
|
AbstractWebClientSuccessResponse,
|
|
19
21
|
WebClientErrorResponse,
|
|
20
|
-
AbstractWebClientResponse,
|
|
21
22
|
)
|
|
22
23
|
|
|
23
|
-
log = create_logger(__name__)
|
|
24
|
+
log = logging.getLogger(__name__)
|
|
24
25
|
|
|
25
26
|
__URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
|
|
26
27
|
"""Regular expression to match HTTP(s) URLs."""
|
|
@@ -247,7 +248,7 @@ def ungzipped_response_content(
|
|
|
247
248
|
data = gunzip(data)
|
|
248
249
|
except GunzipException as ex:
|
|
249
250
|
# In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
|
|
250
|
-
log.
|
|
251
|
+
log.warning(
|
|
251
252
|
f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
|
|
252
253
|
)
|
|
253
254
|
|
usp/objects/sitemap.py
CHANGED
|
@@ -9,11 +9,11 @@
|
|
|
9
9
|
"""
|
|
10
10
|
|
|
11
11
|
import abc
|
|
12
|
-
from functools import lru_cache
|
|
13
12
|
import os
|
|
14
13
|
import pickle
|
|
15
14
|
import tempfile
|
|
16
|
-
from
|
|
15
|
+
from functools import lru_cache
|
|
16
|
+
from typing import Iterator, List, Tuple
|
|
17
17
|
|
|
18
18
|
from .page import SitemapPage
|
|
19
19
|
|
|
@@ -72,7 +72,7 @@ class AbstractSitemap(metaclass=abc.ABCMeta):
|
|
|
72
72
|
return hash((self.url,))
|
|
73
73
|
|
|
74
74
|
def __repr__(self):
|
|
75
|
-
return f"{self.__class__.__name__}(
|
|
75
|
+
return f"{self.__class__.__name__}(url={self.url})"
|
|
76
76
|
|
|
77
77
|
@property
|
|
78
78
|
def url(self) -> str:
|
|
@@ -167,12 +167,7 @@ class InvalidSitemap(AbstractSitemap):
|
|
|
167
167
|
return True
|
|
168
168
|
|
|
169
169
|
def __repr__(self):
|
|
170
|
-
return (
|
|
171
|
-
f"{self.__class__.__name__}("
|
|
172
|
-
f"url={self.url}, "
|
|
173
|
-
f"reason={self.reason}"
|
|
174
|
-
")"
|
|
175
|
-
)
|
|
170
|
+
return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"
|
|
176
171
|
|
|
177
172
|
def to_dict(self, with_pages=True) -> dict:
|
|
178
173
|
return {
|
usp/tree.py
CHANGED
|
@@ -1,19 +1,20 @@
|
|
|
1
1
|
"""Helpers to generate a sitemap tree."""
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from typing import Optional
|
|
5
|
+
|
|
4
6
|
from .exceptions import SitemapException
|
|
5
7
|
from .fetch_parse import SitemapFetcher, SitemapStrParser
|
|
6
8
|
from .helpers import is_http_url, strip_url_to_homepage
|
|
7
|
-
from .log import create_logger
|
|
8
9
|
from .objects.sitemap import (
|
|
9
10
|
AbstractSitemap,
|
|
10
|
-
InvalidSitemap,
|
|
11
|
-
IndexWebsiteSitemap,
|
|
12
11
|
IndexRobotsTxtSitemap,
|
|
12
|
+
IndexWebsiteSitemap,
|
|
13
|
+
InvalidSitemap,
|
|
13
14
|
)
|
|
14
15
|
from .web_client.abstract_client import AbstractWebClient
|
|
15
16
|
|
|
16
|
-
log = create_logger(__name__)
|
|
17
|
+
log = logging.getLogger(__name__)
|
|
17
18
|
|
|
18
19
|
_UNPUBLISHED_SITEMAP_PATHS = {
|
|
19
20
|
"sitemap.xml",
|
usp/web_client/requests_client.py
CHANGED
|
@@ -1,20 +1,23 @@
|
|
|
1
1
|
"""Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
|
|
2
2
|
|
|
3
|
-
from http import HTTPStatus
|
|
4
3
|
import logging
|
|
5
|
-
from
|
|
4
|
+
from http import HTTPStatus
|
|
5
|
+
from typing import Dict, Optional, Tuple, Union
|
|
6
6
|
|
|
7
7
|
import requests
|
|
8
8
|
|
|
9
|
+
from usp import __version__
|
|
10
|
+
|
|
9
11
|
from .abstract_client import (
|
|
12
|
+
RETRYABLE_HTTP_STATUS_CODES,
|
|
10
13
|
AbstractWebClient,
|
|
11
14
|
AbstractWebClientResponse,
|
|
12
15
|
AbstractWebClientSuccessResponse,
|
|
13
16
|
RequestWaiter,
|
|
14
17
|
WebClientErrorResponse,
|
|
15
|
-
RETRYABLE_HTTP_STATUS_CODES,
|
|
16
18
|
)
|
|
17
|
-
|
|
19
|
+
|
|
20
|
+
log = logging.getLogger(__name__)
|
|
18
21
|
|
|
19
22
|
|
|
20
23
|
class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
|
|
@@ -153,7 +156,7 @@ class RequestsWebClient(AbstractWebClient):
|
|
|
153
156
|
)
|
|
154
157
|
else:
|
|
155
158
|
message = f"{response.status_code} {response.reason}"
|
|
156
|
-
|
|
159
|
+
log.info(f"Response content: {response.text}")
|
|
157
160
|
|
|
158
161
|
if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
|
|
159
162
|
return RequestsWebClientErrorResponse(
|
|
@@ -1,22 +0,0 @@
|
|
|
1
|
-
usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
|
|
2
|
-
usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
|
|
3
|
-
usp/cli/_ls.py,sha256=YyDmtBjK02_26Qv8-3NLf87b1C4Wt0GzZ1XkdF2fllQ,2954
|
|
4
|
-
usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
|
|
5
|
-
usp/cli/cli.py,sha256=D1tXZyhiG0sIwtepdPdglW5gUlPWyx4LNeBmaM700Yc,592
|
|
6
|
-
usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
|
|
7
|
-
usp/fetch_parse.py,sha256=_AmrKQa_5dupqWWmXr3MXuPW3u_V4KBfLwCC0YaY3kk,41440
|
|
8
|
-
usp/helpers.py,sha256=QJH3ETapqqbwRnjX_LM0EhWqeta9LTqVvW5OAkBKUOc,8491
|
|
9
|
-
usp/log.py,sha256=BS0AtURK62TPGVqEuIu8kwGtIJDYoGsK5_N-b60VOpE,1631
|
|
10
|
-
usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
-
usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
|
|
12
|
-
usp/objects/sitemap.py,sha256=XWgke1SJNA79qnOEvaY2nJbnlidWxqBvfuRcF4GhBHI,11564
|
|
13
|
-
usp/tree.py,sha256=2cuHOpdYX5aKZ4XuUQPaKjILnoPnFKZwPNn0g8cxT18,4066
|
|
14
|
-
usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
-
usp/web_client/abstract_client.py,sha256=m6glqfJyY_i63HhRb0SQLCx-i9yt-nwssiN_I-sCvQk,6291
|
|
16
|
-
usp/web_client/requests_client.py,sha256=ywOovvBG4gnk0f_AJW8ZOv5r1DCrfk97fPjUT_f2US4,5561
|
|
17
|
-
ultimate_sitemap_parser-1.1.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
|
|
18
|
-
ultimate_sitemap_parser-1.1.0.dist-info/METADATA,sha256=ya5PV0x-dJpeTpK4J5UUsgiMhzXlQ9DyH4QTv-ozQXk,4447
|
|
19
|
-
ultimate_sitemap_parser-1.1.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
|
|
20
|
-
ultimate_sitemap_parser-1.1.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
|
21
|
-
ultimate_sitemap_parser-1.1.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
|
|
22
|
-
ultimate_sitemap_parser-1.1.0.dist-info/RECORD,,
|
usp/log.py
DELETED
|
@@ -1,77 +0,0 @@
|
|
|
1
|
-
"""Logging utilities."""
|
|
2
|
-
|
|
3
|
-
import logging
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
class Logger:
|
|
7
|
-
"""
|
|
8
|
-
Logging helper class.
|
|
9
|
-
"""
|
|
10
|
-
|
|
11
|
-
__LEVELS = {
|
|
12
|
-
"CRITICAL": logging.CRITICAL,
|
|
13
|
-
"ERROR": logging.ERROR,
|
|
14
|
-
"WARNING": logging.WARNING,
|
|
15
|
-
"INFO": logging.INFO,
|
|
16
|
-
"DEBUG": logging.DEBUG,
|
|
17
|
-
}
|
|
18
|
-
"""Valid logging levels and their "logging" counterparts."""
|
|
19
|
-
|
|
20
|
-
__DEFAULT_LEVEL = "INFO"
|
|
21
|
-
"""Default logging level."""
|
|
22
|
-
|
|
23
|
-
__slots__ = [
|
|
24
|
-
# "logging" object
|
|
25
|
-
"__l",
|
|
26
|
-
]
|
|
27
|
-
|
|
28
|
-
def __init__(self, name: str):
|
|
29
|
-
"""
|
|
30
|
-
Initialize logger object for a given name.
|
|
31
|
-
|
|
32
|
-
:param name: Module name that the logger should be initialized for.
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
self.__l = logging.getLogger(name)
|
|
36
|
-
|
|
37
|
-
def error(self, message: str) -> None:
|
|
38
|
-
"""
|
|
39
|
-
Log error message.
|
|
40
|
-
|
|
41
|
-
:param message: Message to log.
|
|
42
|
-
"""
|
|
43
|
-
self.__l.error(message)
|
|
44
|
-
|
|
45
|
-
def warning(self, message: str) -> None:
|
|
46
|
-
"""
|
|
47
|
-
Log warning message.
|
|
48
|
-
|
|
49
|
-
:param message: Message to log.
|
|
50
|
-
"""
|
|
51
|
-
self.__l.warning(message)
|
|
52
|
-
|
|
53
|
-
def info(self, message: str) -> None:
|
|
54
|
-
"""
|
|
55
|
-
Log informational message.
|
|
56
|
-
|
|
57
|
-
:param message: Message to log.
|
|
58
|
-
"""
|
|
59
|
-
self.__l.info(message)
|
|
60
|
-
|
|
61
|
-
def debug(self, message: str) -> None:
|
|
62
|
-
"""
|
|
63
|
-
Log debugging message.
|
|
64
|
-
|
|
65
|
-
:param message: Message to log.
|
|
66
|
-
"""
|
|
67
|
-
self.__l.debug(message)
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
def create_logger(name: str) -> Logger:
|
|
71
|
-
"""
|
|
72
|
-
Create and return Logger object.
|
|
73
|
-
|
|
74
|
-
:param name: Module name that the logger should be initialized for.
|
|
75
|
-
:return: Logger object.
|
|
76
|
-
"""
|
|
77
|
-
return Logger(name=name)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{ultimate_sitemap_parser-1.1.0.dist-info → ultimate_sitemap_parser-1.1.1.dist-info}/entry_points.txt
RENAMED
|
File without changes
|