ultimate_sitemap_parser-1.2.0-py3-none-any.whl → ultimate_sitemap_parser-1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic.
- {ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/METADATA +2 -3
- ultimate_sitemap_parser-1.3.0.dist-info/RECORD +21 -0
- usp/cli/_ls.py +19 -2
- usp/cli/_util.py +57 -1
- usp/cli/cli.py +8 -2
- usp/fetch_parse.py +70 -17
- usp/helpers.py +12 -1
- usp/objects/sitemap.py +6 -3
- usp/tree.py +6 -1
- usp/web_client/abstract_client.py +30 -0
- usp/web_client/requests_client.py +4 -1
- ultimate_sitemap_parser-1.2.0.dist-info/RECORD +0 -21
- {ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/WHEEL +0 -0
- {ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/entry_points.txt +0 -0
{ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/METADATA
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.2.0
+Version: 1.3.0
 Summary: A performant library for parsing and crawling sitemaps
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
@@ -8,7 +8,7 @@ Author: Linas Valiukas
 Author-email: linas@media.mit.edu
 Maintainer: Freddy Heppell
 Maintainer-email: f.heppell@sheffield.ac.uk
-Requires-Python: >=3.8
+Requires-Python: >=3.9
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -16,7 +16,6 @@ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (G
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python
 Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
ultimate_sitemap_parser-1.3.0.dist-info/RECORD
ADDED

@@ -0,0 +1,21 @@
+usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
+usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
+usp/cli/_ls.py,sha256=V0pMsDiQK_9RZ5MyUS2toW8b6e2FJ4spb3Grw6PayAI,3419
+usp/cli/_util.py,sha256=OrT9en350tATnaUrUn0peXr7aFPyYaaHGbEXGY6O4wI,2015
+usp/cli/cli.py,sha256=2byuqhBUhb7c1qUpBfTTufG-jvtiEWWq97GvCgv-s44,777
+usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
+usp/fetch_parse.py,sha256=69U1uAKawUym41N4nwJXLW9tQ0WXO4Pi63hnljYCXPM,43524
+usp/helpers.py,sha256=FeIZcEuEM3Uz8tHeNucgoB3_27Ax6qCatfalPIHHGUY,8862
+usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
+usp/objects/sitemap.py,sha256=KSMjgHDTG3Tx2LCIk_x3C4Q-05DNDNFt92gUwQiU7u8,11522
+usp/tree.py,sha256=MdnVxfIIMqWrudsYxFI8yQTXnlmNLFEcQEOkXbnuBr4,4395
+usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+usp/web_client/abstract_client.py,sha256=EWY4lPYJqpV7ge0DTZESTAOofAjNMIJnDm_2PPeZ9z4,7007
+usp/web_client/requests_client.py,sha256=sFYtJ8Q5z27WlTG1PgBzcvbS75pJ0pYUastEFmxa95U,5888
+ultimate_sitemap_parser-1.3.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
+ultimate_sitemap_parser-1.3.0.dist-info/METADATA,sha256=MkV1nfPlwy-cRTcqTHmLTxplbw8_jDSKEIKZacCg2lE,4397
+ultimate_sitemap_parser-1.3.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
+ultimate_sitemap_parser-1.3.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
+ultimate_sitemap_parser-1.3.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
+ultimate_sitemap_parser-1.3.0.dist-info/RECORD,,
usp/cli/_ls.py
CHANGED

@@ -2,7 +2,7 @@ import argparse
 import sys
 from typing import Iterator
 
-from usp.cli._util import format_help, tabs
+from usp.cli._util import CountAction, format_help, setup_logging, tabs
 from usp.objects.sitemap import AbstractSitemap
 from usp.tree import sitemap_tree_for_homepage
 
@@ -26,7 +26,7 @@ def register(subparsers):
         choices=LS_FORMATS,
         default="tabtree",
         help=format_help(LS_FORMATS, "set output format"),
-        metavar="",
+        metavar="FORMAT",
     )
     ls_parser.add_argument(
         "-r",
@@ -46,6 +46,21 @@ def register(subparsers):
         action="store_true",
         help="strip the supplied URL from each page and sitemap URL",
     )
+    ls_parser.add_argument(
+        "-v",
+        "--verbose",
+        action=CountAction,
+        help="increase output verbosity (-v=INFO, -vv=DEBUG)",
+        dest="verbosity",
+        default=0,
+        max_count=2,
+    )
+    ls_parser.add_argument(
+        "-l",
+        "--log-file",
+        type=str,
+        help="write log to this file and suppress console output",
+    )
     ls_parser.set_defaults(no_robots=False, no_known=False, strip_url=False)
 
     ls_parser.set_defaults(func=ls)
@@ -85,6 +100,8 @@ def _output_pages(sitemap: AbstractSitemap, strip_prefix: str = ""):
 
 
 def ls(args):
+    setup_logging(args.verbosity, args.log_file)
+
     tree = sitemap_tree_for_homepage(
         args.url,
         use_robots=not args.no_robots,
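Note: together with the logging helpers below, these flags let `usp ls` report crawl progress. A minimal sketch of driving the new flags from Python rather than the shell (the URL and log file name are placeholders; `main()` accepting an argument list is added in usp/cli/cli.py below):

# Equivalent to `usp ls -vv -l crawl.log https://example.org/` on the command line.
from usp.cli.cli import main

main(["ls", "-vv", "-l", "crawl.log", "https://example.org/"])  # -vv selects DEBUG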
usp/cli/_util.py
CHANGED

@@ -1,4 +1,6 @@
-from typing import Dict
+import logging
+from argparse import Action
+from typing import Dict, Optional
 
 
 def format_help(choices: Dict[str, str], opt_help: str) -> str:
@@ -19,3 +21,57 @@ def format_help(choices: Dict[str, str], opt_help: str) -> str:
 def tabs(n: int):
     """Generate n tabs."""
     return "\t" * n
+
+
+_log_levels = {
+    0: logging.WARNING,
+    1: logging.INFO,
+    2: logging.DEBUG,
+}
+
+
+class CountAction(Action):
+    """Modified version of argparse._CountAction to output better help."""
+
+    def __init__(
+        self,
+        option_strings,
+        dest,
+        default=None,
+        required=False,
+        help=None,
+        max_count=None,
+    ):
+        super().__init__(
+            option_strings=option_strings,
+            dest=dest,
+            nargs=0,
+            default=default,
+            required=required,
+            help=help,
+        )
+        self.max_count = max_count
+
+    def __call__(self, parser, namespace, values, option_string=None):
+        count = getattr(namespace, self.dest, None)
+        if count is None:
+            count = 0
+        if self.max_count:
+            count = min(count, self.max_count)
+        setattr(namespace, self.dest, count + 1)
+
+    def format_usage(self):
+        option_str = self.option_strings[0]
+        if self.max_count is None:
+            return option_str
+        letter = self.option_strings[0][1]
+        usages = [f"-{letter * i}" for i in range(1, self.max_count + 1)]
+        return "/".join(usages)
+
+
+def setup_logging(verbosity: int, log_path: Optional[str]) -> None:
+    log_level = _log_levels.get(verbosity, logging.DEBUG)
+    if log_path is not None:
+        logging.basicConfig(level=log_level, filename=log_path)
+    else:
+        logging.basicConfig(level=log_level)
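CountAction counts repeated flags like argparse's built-in counting action, but renders its usage as -v/-vv when max_count is set; setup_logging then maps the count (0/1/2) to WARNING/INFO/DEBUG via _log_levels. A standalone sketch with a throwaway parser (not part of the package):

from argparse import ArgumentParser
from usp.cli._util import CountAction, setup_logging

parser = ArgumentParser()
parser.add_argument(
    "-v", action=CountAction, dest="verbosity", default=0, max_count=2
)

args = parser.parse_args(["-vv"])
assert args.verbosity == 2  # _log_levels maps 2 to logging.DEBUG

setup_logging(args.verbosity, None)  # None: basicConfig without a log file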
usp/cli/cli.py
CHANGED

@@ -1,10 +1,11 @@
 from argparse import ArgumentParser
+from typing import Optional
 
 from usp import __version__
 from usp.cli import _ls as ls_cmd
 
 
-def main():
+def parse_args(arg_list: Optional[list[str]]):
     parser = ArgumentParser(prog="usp", description="Ultimate Sitemap Parser")
     parser.add_argument(
         "-v", "--version", action="version", version=f"%(prog)s v{__version__}"
@@ -13,7 +14,12 @@ def main():
     subparsers = parser.add_subparsers(required=False, title="commands", metavar="")
     ls_cmd.register(subparsers)
 
-    args = parser.parse_args()
+    args = parser.parse_args(arg_list)
+    return args, parser
+
+
+def main(arg_list: Optional[list[str]] = None):
+    args, parser = parse_args(arg_list)
 
     if "func" in args:
         args.func(args)
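Splitting parse_args() out of main() and threading arg_list through parser.parse_args(arg_list) makes the CLI callable programmatically (for example from tests) without patching sys.argv. A sketch with a placeholder URL:

from usp.cli.cli import parse_args

args, parser = parse_args(["ls", "https://example.org/"])
assert "func" in args  # the ls handler was resolved
# args.func(args) would now run the crawl (performs network requests)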
usp/fetch_parse.py
CHANGED

@@ -13,7 +13,7 @@ import re
 import xml.parsers.expat
 from collections import OrderedDict
 from decimal import Decimal, InvalidOperation
-from typing import Dict, Optional,
+from typing import Dict, Optional, Set
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
@@ -43,8 +43,10 @@ from .objects.sitemap import (
 )
 from .web_client.abstract_client import (
     AbstractWebClient,
+    AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     LocalWebClient,
+    LocalWebClientSuccessResponse,
     NoWebClientException,
     WebClientErrorResponse,
 )
@@ -64,12 +66,17 @@ class SitemapFetcher:
     Spec says it might be up to 50 MB but let's go for the full 100 MB here."""
 
     __MAX_RECURSION_LEVEL = 11
-    """Max.
+    """Max. depth level in iterating over sub-sitemaps.
+
+    Recursive sitemaps (i.e. child sitemaps pointing to their parent) are stopped immediately.
+    """
 
     __slots__ = [
         "_url",
         "_recursion_level",
         "_web_client",
+        "_parent_urls",
+        "_quiet_404",
     ]
 
     def __init__(
@@ -77,14 +84,19 @@ class SitemapFetcher:
         url: str,
         recursion_level: int,
         web_client: Optional[AbstractWebClient] = None,
+        parent_urls: Optional[Set[str]] = None,
+        quiet_404: bool = False,
     ):
         """
 
         :param url: URL of the sitemap to fetch and parse.
         :param recursion_level: current recursion level of parser
         :param web_client: Web client to use. If ``None``, a :class:`~.RequestsWebClient` will be used.
+        :param parent_urls: Set of parent URLs that led to this sitemap.
+        :param quiet_404: Whether 404 errors are expected and should be logged at a reduced level, useful for speculative fetching of known URLs.
 
         :raises SitemapException: If the maximum recursion depth is exceeded.
+        :raises SitemapException: If the URL is in the parent URLs set.
         :raises SitemapException: If the URL is not an HTTP(S) URL
         """
         if recursion_level > self.__MAX_RECURSION_LEVEL:
@@ -92,9 +104,19 @@ class SitemapFetcher:
                 f"Recursion level exceeded {self.__MAX_RECURSION_LEVEL} for URL {url}."
             )
 
+        log.debug(f"Parent URLs is {parent_urls}")
+
         if not is_http_url(url):
             raise SitemapException(f"URL {url} is not a HTTP(s) URL.")
 
+        parent_urls = parent_urls or set()
+
+        if url in parent_urls:
+            # Likely a sitemap index points to itself/a higher level index
+            raise SitemapException(
+                f"Recursion detected in URL {url} with parent URLs {parent_urls}."
+            )
+
         if not web_client:
             web_client = RequestsWebClient()
 
@@ -103,19 +125,15 @@ class SitemapFetcher:
         self._url = url
         self._web_client = web_client
         self._recursion_level = recursion_level
+        self._parent_urls = parent_urls or set()
+        self._quiet_404 = quiet_404
 
-    def _fetch(self) ->
+    def _fetch(self) -> AbstractWebClientResponse:
         log.info(f"Fetching level {self._recursion_level} sitemap from {self._url}...")
         response = get_url_retry_on_client_errors(
-            url=self._url, web_client=self._web_client
+            url=self._url, web_client=self._web_client, quiet_404=self._quiet_404
         )
-
-        if isinstance(response, WebClientErrorResponse):
-            return response
-
-        assert isinstance(response, AbstractWebClientSuccessResponse)
-
-        return ungzipped_response_content(url=self._url, response=response)
+        return response
 
     def sitemap(self) -> AbstractSitemap:
         """
@@ -124,13 +142,27 @@ class SitemapFetcher:
         :return: the parsed sitemap. Will be a child of :class:`~.AbstractSitemap`.
             If an HTTP error is encountered, or the sitemap cannot be parsed, will be :class:`~.InvalidSitemap`.
         """
-        response_content = self._fetch()
+        response = self._fetch()
 
-        if isinstance(response_content, WebClientErrorResponse):
+        if isinstance(response, WebClientErrorResponse):
             return InvalidSitemap(
                 url=self._url,
-                reason=f"Unable to fetch sitemap from {self._url}: {response_content.message()}",
+                reason=f"Unable to fetch sitemap from {self._url}: {response.message()}",
             )
+        assert isinstance(response, AbstractWebClientSuccessResponse)
+
+        response_url = response.url()
+        log.debug(f"Response URL is {response_url}")
+        if response_url in self._parent_urls:
+            # Likely a sitemap has redirected to a parent URL
+            return InvalidSitemap(
+                url=self._url,
+                reason=f"Recursion detected when {self._url} redirected to {response_url} with parent URLs {self._parent_urls}.",
+            )
+
+        self._url = response_url
+
+        response_content = ungzipped_response_content(url=self._url, response=response)
 
         # MIME types returned in Content-Type are unpredictable, so peek into the content instead
         if response_content[:20].strip().startswith("<"):
@@ -140,6 +172,7 @@ class SitemapFetcher:
                 content=response_content,
                 recursion_level=self._recursion_level,
                 web_client=self._web_client,
+                parent_urls=self._parent_urls,
             )
 
         else:
@@ -150,6 +183,7 @@ class SitemapFetcher:
                     content=response_content,
                     recursion_level=self._recursion_level,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls,
                 )
             else:
                 parser = PlainTextSitemapParser(
@@ -157,6 +191,7 @@ class SitemapFetcher:
                     content=response_content,
                     recursion_level=self._recursion_level,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls,
                 )
 
         log.info(f"Parsing sitemap from URL {self._url}...")
@@ -186,8 +221,8 @@ class SitemapStrParser(SitemapFetcher):
         )
         self._static_content = static_content
 
-    def _fetch(self) ->
-        return self._static_content
+    def _fetch(self) -> AbstractWebClientResponse:
+        return LocalWebClientSuccessResponse(url=self._url, data=self._static_content)
 
 
 class AbstractSitemapParser(metaclass=abc.ABCMeta):
@@ -198,6 +233,7 @@ class AbstractSitemapParser(metaclass=abc.ABCMeta):
         "_content",
         "_web_client",
         "_recursion_level",
+        "_parent_urls",
     ]
 
     def __init__(
@@ -206,11 +242,13 @@ class AbstractSitemapParser(metaclass=abc.ABCMeta):
         content: str,
         recursion_level: int,
         web_client: AbstractWebClient,
+        parent_urls: Set[str],
     ):
         self._url = url
         self._content = content
         self._recursion_level = recursion_level
         self._web_client = web_client
+        self._parent_urls = parent_urls
 
     @abc.abstractmethod
     def sitemap(self) -> AbstractSitemap:
@@ -231,12 +269,14 @@ class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
         content: str,
         recursion_level: int,
         web_client: AbstractWebClient,
+        parent_urls: Set[str],
     ):
         super().__init__(
             url=url,
             content=content,
             recursion_level=recursion_level,
             web_client=web_client,
+            parent_urls=parent_urls,
         )
 
         if not self._url.endswith("/robots.txt"):
@@ -271,6 +311,7 @@ class IndexRobotsTxtSitemapParser(AbstractSitemapParser):
                     url=sitemap_url,
                     recursion_level=self._recursion_level + 1,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls | {self._url},
                 )
                 fetched_sitemap = fetcher.sitemap()
             except NoWebClientException:
@@ -333,12 +374,14 @@ class XMLSitemapParser(AbstractSitemapParser):
         content: str,
         recursion_level: int,
         web_client: AbstractWebClient,
+        parent_urls: Set[str],
     ):
         super().__init__(
             url=url,
             content=content,
             recursion_level=recursion_level,
             web_client=web_client,
+            parent_urls=parent_urls,
         )
 
         # Will be initialized when the type of sitemap is known
@@ -432,6 +475,7 @@ class XMLSitemapParser(AbstractSitemapParser):
                     url=self._url,
                     web_client=self._web_client,
                     recursion_level=self._recursion_level,
+                    parent_urls=self._parent_urls,
                 )
 
             elif name == "rss":
@@ -545,14 +589,22 @@ class IndexXMLSitemapParser(AbstractXMLSitemapParser):
         "_recursion_level",
         # List of sub-sitemap URLs found in this index sitemap
         "_sub_sitemap_urls",
+        "_parent_urls",
     ]
 
-    def __init__(
+    def __init__(
+        self,
+        url: str,
+        web_client: AbstractWebClient,
+        recursion_level: int,
+        parent_urls: Set[str],
+    ):
         super().__init__(url=url)
 
         self._web_client = web_client
         self._recursion_level = recursion_level
         self._sub_sitemap_urls = []
+        self._parent_urls = parent_urls
 
     def xml_element_end(self, name: str) -> None:
         if name == "sitemap:loc":
@@ -578,6 +630,7 @@ class IndexXMLSitemapParser(AbstractXMLSitemapParser):
                     url=sub_sitemap_url,
                     recursion_level=self._recursion_level + 1,
                     web_client=self._web_client,
+                    parent_urls=self._parent_urls | {self._url},
                 )
                 fetched_sitemap = fetcher.sitemap()
             except NoWebClientException:
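The common thread in these hunks is loop protection: every fetch now carries the set of ancestor sitemap URLs, each parser hands parent_urls | {self._url} down to child fetchers, and a child whose URL (or post-redirect URL) is already in the set is rejected, so circular sitemap indexes terminate without burning through __MAX_RECURSION_LEVEL. A sketch of that guard in isolation, not the library's classes (fetch_child_urls is a hypothetical stand-in for the fetch-and-parse step):

def fetch_child_urls(url: str) -> list[str]:
    """Hypothetical: fetch a sitemap and return the sub-sitemap URLs it lists."""
    return []

def crawl(url: str, parents: set[str]) -> None:
    if url in parents:
        # mirrors the new SitemapException / InvalidSitemap recursion paths above
        raise RuntimeError(f"Recursion detected in URL {url}")
    for child in fetch_child_urls(url):
        crawl(child, parents | {url})  # each branch carries its own ancestor set

crawl("https://example.org/sitemap_index.xml", set())  # placeholder URL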
usp/helpers.py
CHANGED

@@ -7,6 +7,7 @@ import logging
 import re
 import sys
 import time
+from http import HTTPStatus
 from typing import Optional
 from urllib.parse import unquote_plus, urlparse, urlunparse
 
@@ -130,11 +131,15 @@ def parse_rfc2822_date(date_string: str) -> Optional[datetime.datetime]:
     return None
 
 
+_404_log_message = f"{HTTPStatus.NOT_FOUND} {HTTPStatus.NOT_FOUND.phrase}"
+
+
 def get_url_retry_on_client_errors(
     url: str,
     web_client: AbstractWebClient,
     retry_count: int = 5,
     sleep_between_retries: int = 1,
+    quiet_404: bool = False,
 ) -> AbstractWebClientResponse:
     """
     Fetch URL, retry on retryable errors.
@@ -143,6 +148,8 @@ def get_url_retry_on_client_errors(
     :param web_client: Web client object to use for fetching.
     :param retry_count: How many times to retry fetching the same URL.
     :param sleep_between_retries: How long to sleep between retries, in seconds.
+    :param quiet_404: Whether to log 404 errors at a lower level.
+
     :return: Web client response object.
     """
     assert retry_count > 0, "Retry count must be positive."
@@ -153,7 +160,11 @@ def get_url_retry_on_client_errors(
         response = web_client.get(url)
 
         if isinstance(response, WebClientErrorResponse):
-            log.warning(f"Request for URL {url} failed: {response.message()}")
+            if quiet_404 and response.message() == _404_log_message:
+                log_level = logging.INFO
+            else:
+                log_level = logging.WARNING
+            log.log(log_level, f"Request for URL {url} failed: {response.message()}")
 
             if response.retryable():
                 log.info(f"Retrying URL {url} in {sleep_between_retries} seconds...")
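quiet_404 only downgrades failures whose message is exactly "404 Not Found"; anything else still logs at WARNING, and retry behaviour is unchanged. A sketch of a caller opting in, assuming the package's RequestsWebClient and a placeholder URL that is expected to be missing:

from usp.helpers import get_url_retry_on_client_errors
from usp.web_client.requests_client import RequestsWebClient

response = get_url_retry_on_client_errors(
    url="https://example.org/sitemap_index.xml",
    web_client=RequestsWebClient(),
    quiet_404=True,  # a 404 here is logged at INFO instead of WARNING
)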
usp/objects/sitemap.py
CHANGED

@@ -9,17 +9,19 @@
 """
 
 import abc
+import logging
 import os
 import pickle
 import tempfile
-from functools import
+from functools import cache
 from typing import Iterator, List, Tuple
 
 from .page import SitemapPage
+
+log = logging.getLogger(__name__)
 
-
-@
+
+@cache
 def _all_slots(target_cls):
     mro = target_cls.__mro__
 
@@ -153,6 +155,7 @@ class InvalidSitemap(AbstractSitemap):
         """
         super().__init__(url=url)
         self.__reason = reason
+        log.info(f"Invalid sitemap: {url}, reason: {reason}")
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, InvalidSitemap):
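Since InvalidSitemap now logs its reason at INFO on construction, enabling INFO logging is enough to see why a sitemap was discarded. A sketch with a placeholder homepage:

import logging
from usp.tree import sitemap_tree_for_homepage

logging.basicConfig(level=logging.INFO)  # surfaces the new "Invalid sitemap: ..." records
tree = sitemap_tree_for_homepage("https://example.org/")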
usp/tree.py
CHANGED

@@ -75,7 +75,10 @@ def sitemap_tree_for_homepage(
     sitemap_urls_found_in_robots_txt = set()
     if use_robots:
         robots_txt_fetcher = SitemapFetcher(
-            url=robots_txt_url,
+            url=robots_txt_url,
+            web_client=web_client,
+            recursion_level=0,
+            parent_urls=set(),
         )
         robots_txt_sitemap = robots_txt_fetcher.sitemap()
         if not isinstance(robots_txt_sitemap, InvalidSitemap):
@@ -95,6 +98,8 @@ def sitemap_tree_for_homepage(
             url=unpublished_sitemap_url,
             web_client=web_client,
             recursion_level=0,
+            parent_urls=sitemap_urls_found_in_robots_txt,
+            quiet_404=True,
         )
         unpublished_sitemap = unpublished_sitemap_fetcher.sitemap()
 
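The public entry point is unchanged: the new arguments only seed the recursion guard (parent_urls) and mark the speculative probes of well-known sitemap paths as allowed to 404 quietly. Typical usage, with a placeholder homepage:

from usp.tree import sitemap_tree_for_homepage

tree = sitemap_tree_for_homepage("https://example.org/", use_robots=True)
for page in tree.all_pages():
    print(page.url)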
usp/web_client/abstract_client.py
CHANGED

@@ -101,6 +101,15 @@ class AbstractWebClientSuccessResponse(
         """
         raise NotImplementedError("Abstract method.")
 
+    @abc.abstractmethod
+    def url(self) -> str:
+        """
+        Return the actual URL fetched, after any redirects.
+
+        :return: URL fetched.
+        """
+        raise NotImplementedError("Abstract method.")
+
 
 class WebClientErrorResponse(AbstractWebClientResponse, metaclass=abc.ABCMeta):
     """
@@ -191,6 +200,27 @@ class LocalWebClient(AbstractWebClient):
         raise NoWebClientException
 
 
+class LocalWebClientSuccessResponse(AbstractWebClientSuccessResponse):
+    def __init__(self, url: str, data: str):
+        self._url = url
+        self._data = data
+
+    def status_code(self) -> int:
+        return 200
+
+    def status_message(self) -> str:
+        return "OK"
+
+    def header(self, case_insensitive_name: str) -> Optional[str]:
+        return None
+
+    def raw_data(self) -> bytes:
+        return self._data.encode("utf-8")
+
+    def url(self) -> str:
+        return self._url
+
+
 class RequestWaiter:
     """
     Manages waiting between requests.
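Because url() is abstract on AbstractWebClientSuccessResponse, custom web clients must now report the final (post-redirect) URL of a successful fetch; LocalWebClientSuccessResponse is the in-memory implementation backing SitemapStrParser in usp/fetch_parse.py above. A minimal sketch of the new type (URL and content are stubs):

from usp.web_client.abstract_client import LocalWebClientSuccessResponse

resp = LocalWebClientSuccessResponse(
    url="https://example.org/sitemap.xml",
    data="<?xml version='1.0' encoding='UTF-8'?><urlset></urlset>",
)
assert resp.status_code() == 200
assert resp.url() == "https://example.org/sitemap.xml"
assert resp.raw_data().startswith(b"<?xml")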
usp/web_client/requests_client.py
CHANGED

@@ -62,6 +62,9 @@ class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
 
         return data
 
+    def url(self) -> str:
+        return self.__requests_response.url
+
 
 class RequestsWebClientErrorResponse(WebClientErrorResponse):
     """
@@ -162,7 +165,7 @@ class RequestsWebClient(AbstractWebClient):
             )
         else:
             message = f"{response.status_code} {response.reason}"
-            log.
+            log.debug(f"Response content: {response.text}")
 
         if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
             return RequestsWebClientErrorResponse(
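url() here simply surfaces requests' response.url, the URL after any redirects, which is what SitemapFetcher.sitemap() compares against parent_urls. A sketch of reading it directly (placeholder URL; performs a network request):

from usp.web_client.abstract_client import AbstractWebClientSuccessResponse
from usp.web_client.requests_client import RequestsWebClient

client = RequestsWebClient()
response = client.get("https://example.org/sitemap.xml")
if isinstance(response, AbstractWebClientSuccessResponse):
    print(response.url())  # final URL, e.g. after a 301 redirect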
ultimate_sitemap_parser-1.2.0.dist-info/RECORD
DELETED

@@ -1,21 +0,0 @@
-usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
-usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
-usp/cli/_ls.py,sha256=BjF5bGuhe_E_Ak-yyY0cDM83LFstl5tA3XNIrGZJujs,2954
-usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
-usp/cli/cli.py,sha256=ySNyYHoCQ440KfxmpTkzLXgqtbnt5ru-TgPs2Zw2-LI,592
-usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
-usp/fetch_parse.py,sha256=VJrJSAG1X8oQyW2p9wSepuGWfHlMDNoJG8jn3an2XUY,41396
-usp/helpers.py,sha256=S9d8fEhHzZqVCx3SkcWVTgW1JYKujH-tM86urjORNWA,8482
-usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
-usp/objects/sitemap.py,sha256=yt5qe6fyKfmvJmV60mB8kc7yooGcpYhuIcNlmUqFGFA,11486
-usp/tree.py,sha256=pwSTp1Zok4evzrNFavP-hh5i9xGGzObj_sKUqjk72UU,4237
-usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-usp/web_client/abstract_client.py,sha256=7MpIfqQpi1_yojEmuReT8iy9kFUWCD3i2LMpHmBOwV0,6291
-usp/web_client/requests_client.py,sha256=1nyXXBxiapDNN5jNpCAXRL5rgjptK4oKvaJhV5nhLsA,5816
-ultimate_sitemap_parser-1.2.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
-ultimate_sitemap_parser-1.2.0.dist-info/METADATA,sha256=46wVZspA5eUgbXefu2Fu7xtE03TbFsgjEwLL5BT-mj0,4447
-ultimate_sitemap_parser-1.2.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
-ultimate_sitemap_parser-1.2.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
-ultimate_sitemap_parser-1.2.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
-ultimate_sitemap_parser-1.2.0.dist-info/RECORD,,
{ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/LICENSE
RENAMED
File without changes

{ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/NOTICE
RENAMED
File without changes

{ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/WHEEL
RENAMED
File without changes

{ultimate_sitemap_parser-1.2.0.dist-info → ultimate_sitemap_parser-1.3.0.dist-info}/entry_points.txt
RENAMED
File without changes