ultimate-sitemap-parser 1.1.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ultimate-sitemap-parser might be problematic.
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/PKG-INFO +1 -1
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/pyproject.toml +16 -8
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/_ls.py +1 -1
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/fetch_parse.py +14 -14
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/helpers.py +8 -7
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/sitemap.py +4 -9
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/tree.py +10 -5
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/requests_client.py +16 -7
- ultimate_sitemap_parser-1.1.0/usp/log.py +0 -77
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/README.rst +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/_util.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/cli.py +1 -1
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/exceptions.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/page.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/__init__.py +0 -0
- {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/abstract_client.py +1 -1
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/pyproject.toml
RENAMED
@@ -1,6 +1,6 @@
 [project]
 name = "ultimate-sitemap-parser"
-version = "1.1.0"
+version = "1.2.0"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
     { name = "Linas Valiukas", email = "linas@media.mit.edu"},
@@ -53,7 +53,7 @@ python = ">=3.8,<4.0"
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
-ruff = "^0.…
+ruff = "^0.9.3"
 vcrpy = "6.0.1"
 pytest-mock = "^3.14.0"
 
@@ -83,12 +83,20 @@ extend-exclude = ["docs/*"]
 
 [tool.ruff.lint]
 select = [
-    "E4",
-    "E7",
-    "E9",
-    "F",
-    "UP",
-    "PT"
+    "E4",  # pycodestyle Import
+    "E7",  # pycodestyle Statement
+    "E9",  # pycodestyle Runtime
+    "F",   # pyflakes
+    "UP",  # pyupgrade
+    "PT",  # flake8-pytest-style
+    "I",   # isort
+    "T20", # flake8-print
+    "LOG", # flake8-logging
+]
+
+[tool.ruff.lint.per-file-ignores]
+"**/tests/*" = [
+    "T20", # Allow print in tests
 ]
 
 [tool.pytest.ini_options]
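The expanded Ruff `select` list adds import sorting (`I`), print detection (`T20`), and logging checks (`LOG`), which accounts for the reordered imports and logging changes throughout the rest of this diff. A hypothetical snippet showing code the new rules would flag:

import logging

log = logging.getLogger(__name__)


def fetch(url: str) -> None:
    print(f"fetching {url}")  # flagged by flake8-print (T201)
    log.log(logging.WARN, "slow response")  # flagged by flake8-logging: WARN is an undocumented alias of WARNING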
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/fetch_parse.py
RENAMED
@@ -8,49 +8,49 @@
 """
 
 import abc
+import logging
 import re
 import xml.parsers.expat
 from collections import OrderedDict
 from decimal import Decimal, InvalidOperation
-from typing import …
-
+from typing import Dict, Optional, Union
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
-    html_unescape_strip,
-    parse_iso8601_date,
     get_url_retry_on_client_errors,
-    ungzipped_response_content,
+    html_unescape_strip,
     is_http_url,
+    parse_iso8601_date,
     parse_rfc2822_date,
+    ungzipped_response_content,
 )
-from .log import create_logger
 from .objects.page import (
+    SITEMAP_PAGE_DEFAULT_PRIORITY,
     SitemapImage,
-    SitemapPage,
     SitemapNewsStory,
+    SitemapPage,
     SitemapPageChangeFrequency,
-    SITEMAP_PAGE_DEFAULT_PRIORITY,
 )
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
     IndexRobotsTxtSitemap,
     IndexXMLSitemap,
-    PagesXMLSitemap,
-    PagesTextSitemap,
-    PagesRSSSitemap,
+    InvalidSitemap,
     PagesAtomSitemap,
+    PagesRSSSitemap,
+    PagesTextSitemap,
+    PagesXMLSitemap,
 )
 from .web_client.abstract_client import (
     AbstractWebClient,
     AbstractWebClientSuccessResponse,
+    LocalWebClient,
+    NoWebClientException,
     WebClientErrorResponse,
 )
-from .web_client.abstract_client import LocalWebClient, NoWebClientException
 from .web_client.requests_client import RequestsWebClient
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 
 class SitemapFetcher:
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/helpers.py
RENAMED
@@ -3,24 +3,25 @@
 import datetime
 import gzip as gzip_lib
 import html
+import logging
 import re
 import sys
 import time
 from typing import Optional
-from urllib.parse import …
-
+from urllib.parse import unquote_plus, urlparse, urlunparse
+
 from dateutil.parser import isoparse as dateutil_isoparse
+from dateutil.parser import parse as dateutil_parse
 
-from .exceptions import …
-from .log import create_logger
+from .exceptions import GunzipException, SitemapException, StripURLToHomepageException
 from .web_client.abstract_client import (
     AbstractWebClient,
+    AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     WebClientErrorResponse,
-    AbstractWebClientResponse,
 )
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
 """Regular expression to match HTTP(s) URLs."""
@@ -247,7 +248,7 @@ def ungzipped_response_content(
         data = gunzip(data)
     except GunzipException as ex:
         # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
-        log.…
+        log.warning(
             f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
         )
 
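The fallback above reflects a quirk of real-world sitemaps: some servers publish plain XML under a `.gz` URL, so a failed gunzip is treated as "already uncompressed" rather than as a fatal error. A standalone sketch of the same idea using only the standard library (`gunzip_or_passthrough` is an illustrative name, not the library's API):

import gzip


def gunzip_or_passthrough(data: bytes) -> bytes:
    """Return gunzipped bytes, or the input unchanged if it is not gzip data."""
    try:
        return gzip.decompress(data)
    except OSError:  # gzip.BadGzipFile is a subclass of OSError
        return data


assert gunzip_or_passthrough(gzip.compress(b"<urlset/>")) == b"<urlset/>"
assert gunzip_or_passthrough(b"<urlset/>") == b"<urlset/>"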
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/sitemap.py
RENAMED
@@ -9,11 +9,11 @@
 """
 
 import abc
-from functools import lru_cache
 import os
 import pickle
 import tempfile
-from …
+from functools import lru_cache
+from typing import Iterator, List, Tuple
 
 from .page import SitemapPage
 
@@ -72,7 +72,7 @@ class AbstractSitemap(metaclass=abc.ABCMeta):
         return hash((self.url,))
 
     def __repr__(self):
-        return f"{self.__class__.__name__}(…
+        return f"{self.__class__.__name__}(url={self.url})"
 
     @property
     def url(self) -> str:
@@ -167,12 +167,7 @@ class InvalidSitemap(AbstractSitemap):
         return True
 
     def __repr__(self):
-        return (
-            f"{self.__class__.__name__}("
-            f"url={self.url}, "
-            f"reason={self.reason}"
-            ")"
-        )
+        return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"
 
     def to_dict(self, with_pages=True) -> dict:
         return {
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/tree.py
RENAMED
@@ -1,19 +1,20 @@
 """Helpers to generate a sitemap tree."""
 
+import logging
 from typing import Optional
+
 from .exceptions import SitemapException
 from .fetch_parse import SitemapFetcher, SitemapStrParser
 from .helpers import is_http_url, strip_url_to_homepage
-from .log import create_logger
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
-    IndexWebsiteSitemap,
     IndexRobotsTxtSitemap,
+    IndexWebsiteSitemap,
+    InvalidSitemap,
 )
 from .web_client.abstract_client import AbstractWebClient
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 _UNPUBLISHED_SITEMAP_PATHS = {
     "sitemap.xml",
@@ -39,6 +40,7 @@ def sitemap_tree_for_homepage(
     web_client: Optional[AbstractWebClient] = None,
     use_robots: bool = True,
     use_known_paths: bool = True,
+    extra_known_paths: Optional[set] = None,
 ) -> AbstractSitemap:
     """
     Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
@@ -48,12 +50,15 @@
         If ``None``, a :class:`~.RequestsWebClient` will be used.
     :param use_robots: Whether to discover sitemaps through robots.txt.
     :param use_known_paths: Whether to discover sitemaps through common known paths.
+    :param extra_known_paths: Extra paths to check for sitemaps.
     :return: Root sitemap object of the fetched sitemap tree.
     """
 
     if not is_http_url(homepage_url):
         raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")
 
+    extra_known_paths = extra_known_paths or set()
+
     stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
     if homepage_url != stripped_homepage_url:
         log.warning(
@@ -81,7 +86,7 @@ def sitemap_tree_for_homepage(
             sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)
 
     if use_known_paths:
-        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
+        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS | extra_known_paths:
            unpublished_sitemap_url = homepage_url + unpublished_sitemap_path

            # Don't refetch URLs already found in robots.txt
{ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/requests_client.py
RENAMED
@@ -1,20 +1,23 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
-from http import HTTPStatus
 import logging
-from …
+from http import HTTPStatus
+from typing import Dict, Optional, Tuple, Union
 
 import requests
 
+from usp import __version__
+
 from .abstract_client import (
+    RETRYABLE_HTTP_STATUS_CODES,
     AbstractWebClient,
     AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     RequestWaiter,
     WebClientErrorResponse,
-    RETRYABLE_HTTP_STATUS_CODES,
 )
-
+
+log = logging.getLogger(__name__)
 
 
 class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
@@ -89,18 +92,24 @@ class RequestsWebClient(AbstractWebClient):
     ]
 
     def __init__(
-        self, …
+        self,
+        verify=True,
+        wait: Optional[float] = None,
+        random_wait: bool = False,
+        session: Optional[requests.Session] = None,
     ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
         :param wait: time to wait between requests, in seconds.
         :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        :param session: a custom session object to use, or None to create a new one.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
         self.__waiter = RequestWaiter(wait, random_wait)
+        self.__session = session or requests.Session()
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
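The new `session` parameter lets callers supply a preconfigured `requests.Session`, e.g. to share a connection pool or set custom headers. A minimal sketch (the User-Agent value is illustrative):

import requests

from usp.tree import sitemap_tree_for_homepage
from usp.web_client.requests_client import RequestsWebClient

session = requests.Session()
session.headers["User-Agent"] = "my-crawler/1.0"  # illustrative value

client = RequestsWebClient(session=session)
tree = sitemap_tree_for_homepage("https://example.com/", web_client=client)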
@@ -129,7 +138,7 @@ class RequestsWebClient(AbstractWebClient):
     def get(self, url: str) -> AbstractWebClientResponse:
         self.__waiter.wait()
         try:
-            response = …
+            response = self.__session.get(
                 url,
                 timeout=self.__timeout,
                 stream=True,
@@ -153,7 +162,7 @@ class RequestsWebClient(AbstractWebClient):
             )
         else:
             message = f"{response.status_code} {response.reason}"
-
+            log.info(f"Response content: {response.text}")
 
             if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                 return RequestsWebClientErrorResponse(
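Because the client now logs through a standard module-level logger, applications can tune this output per module. For example, the INFO-level response-content messages above could be silenced like this (a sketch; the logger name follows from `logging.getLogger(__name__)` in this module):

import logging

# Module loggers follow the package hierarchy, so the requests
# client's logger is named after its module path.
logging.getLogger("usp.web_client.requests_client").setLevel(logging.WARNING)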
ultimate_sitemap_parser-1.1.0/usp/log.py
DELETED
@@ -1,77 +0,0 @@
-"""Logging utilities."""
-
-import logging
-
-
-class Logger:
-    """
-    Logging helper class.
-    """
-
-    __LEVELS = {
-        "CRITICAL": logging.CRITICAL,
-        "ERROR": logging.ERROR,
-        "WARNING": logging.WARNING,
-        "INFO": logging.INFO,
-        "DEBUG": logging.DEBUG,
-    }
-    """Valid logging levels and their "logging" counterparts."""
-
-    __DEFAULT_LEVEL = "INFO"
-    """Default logging level."""
-
-    __slots__ = [
-        # "logging" object
-        "__l",
-    ]
-
-    def __init__(self, name: str):
-        """
-        Initialize logger object for a given name.
-
-        :param name: Module name that the logger should be initialized for.
-        """
-
-        self.__l = logging.getLogger(name)
-
-    def error(self, message: str) -> None:
-        """
-        Log error message.
-
-        :param message: Message to log.
-        """
-        self.__l.error(message)
-
-    def warning(self, message: str) -> None:
-        """
-        Log warning message.
-
-        :param message: Message to log.
-        """
-        self.__l.warning(message)
-
-    def info(self, message: str) -> None:
-        """
-        Log informational message.
-
-        :param message: Message to log.
-        """
-        self.__l.info(message)
-
-    def debug(self, message: str) -> None:
-        """
-        Log debugging message.
-
-        :param message: Message to log.
-        """
-        self.__l.debug(message)
-
-
-def create_logger(name: str) -> Logger:
-    """
-    Create and return Logger object.
-
-    :param name: Module name that the logger should be initialized for.
-    :return: Logger object.
-    """
-    return Logger(name=name)
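With the custom `Logger` wrapper removed, all usp modules log via `logging.getLogger(__name__)`, so output is controlled entirely by the host application's logging configuration. A minimal sketch for enabling debug output from the library:

import logging

# Standard library configuration now applies to all usp loggers,
# which live under the "usp" namespace.
logging.basicConfig(level=logging.WARNING)
logging.getLogger("usp").setLevel(logging.DEBUG)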