ultimate-sitemap-parser 1.1.1__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic. Click here for more details.
- {ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/METADATA +1 -1
- {ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/RECORD +8 -8
- usp/tree.py +5 -1
- usp/web_client/requests_client.py +8 -2
- {ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/WHEEL +0 -0
- {ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/entry_points.txt +0 -0
|
@@ -9,13 +9,13 @@ usp/helpers.py,sha256=S9d8fEhHzZqVCx3SkcWVTgW1JYKujH-tM86urjORNWA,8482
|
|
|
9
9
|
usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
usp/objects/page.py,sha256=vz2QXC9Z3E65Cxf68tKfQkubIc_OB0m6pNYH146Qx_8,14253
|
|
11
11
|
usp/objects/sitemap.py,sha256=yt5qe6fyKfmvJmV60mB8kc7yooGcpYhuIcNlmUqFGFA,11486
|
|
12
|
-
usp/tree.py,sha256=
|
|
12
|
+
usp/tree.py,sha256=pwSTp1Zok4evzrNFavP-hh5i9xGGzObj_sKUqjk72UU,4237
|
|
13
13
|
usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
14
14
|
usp/web_client/abstract_client.py,sha256=7MpIfqQpi1_yojEmuReT8iy9kFUWCD3i2LMpHmBOwV0,6291
|
|
15
|
-
usp/web_client/requests_client.py,sha256=
|
|
16
|
-
ultimate_sitemap_parser-1.
|
|
17
|
-
ultimate_sitemap_parser-1.
|
|
18
|
-
ultimate_sitemap_parser-1.
|
|
19
|
-
ultimate_sitemap_parser-1.
|
|
20
|
-
ultimate_sitemap_parser-1.
|
|
21
|
-
ultimate_sitemap_parser-1.
|
|
15
|
+
usp/web_client/requests_client.py,sha256=1nyXXBxiapDNN5jNpCAXRL5rgjptK4oKvaJhV5nhLsA,5816
|
|
16
|
+
ultimate_sitemap_parser-1.2.0.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
|
|
17
|
+
ultimate_sitemap_parser-1.2.0.dist-info/METADATA,sha256=46wVZspA5eUgbXefu2Fu7xtE03TbFsgjEwLL5BT-mj0,4447
|
|
18
|
+
ultimate_sitemap_parser-1.2.0.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
|
|
19
|
+
ultimate_sitemap_parser-1.2.0.dist-info/WHEEL,sha256=IYZQI976HJqqOpQU6PHkJ8fb3tMNBFjg-Cn-pwAbaFM,88
|
|
20
|
+
ultimate_sitemap_parser-1.2.0.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
|
|
21
|
+
ultimate_sitemap_parser-1.2.0.dist-info/RECORD,,
|
usp/tree.py
CHANGED
|
@@ -40,6 +40,7 @@ def sitemap_tree_for_homepage(
|
|
|
40
40
|
web_client: Optional[AbstractWebClient] = None,
|
|
41
41
|
use_robots: bool = True,
|
|
42
42
|
use_known_paths: bool = True,
|
|
43
|
+
extra_known_paths: Optional[set] = None,
|
|
43
44
|
) -> AbstractSitemap:
|
|
44
45
|
"""
|
|
45
46
|
Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
|
|
@@ -49,12 +50,15 @@ def sitemap_tree_for_homepage(
|
|
|
49
50
|
If ``None``, a :class:`~.RequestsWebClient` will be used.
|
|
50
51
|
:param use_robots: Whether to discover sitemaps through robots.txt.
|
|
51
52
|
:param use_known_paths: Whether to discover sitemaps through common known paths.
|
|
53
|
+
:param extra_known_paths: Extra paths to check for sitemaps.
|
|
52
54
|
:return: Root sitemap object of the fetched sitemap tree.
|
|
53
55
|
"""
|
|
54
56
|
|
|
55
57
|
if not is_http_url(homepage_url):
|
|
56
58
|
raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")
|
|
57
59
|
|
|
60
|
+
extra_known_paths = extra_known_paths or set()
|
|
61
|
+
|
|
58
62
|
stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
|
|
59
63
|
if homepage_url != stripped_homepage_url:
|
|
60
64
|
log.warning(
|
|
@@ -82,7 +86,7 @@ def sitemap_tree_for_homepage(
|
|
|
82
86
|
sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)
|
|
83
87
|
|
|
84
88
|
if use_known_paths:
|
|
85
|
-
for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
|
|
89
|
+
for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS | extra_known_paths:
|
|
86
90
|
unpublished_sitemap_url = homepage_url + unpublished_sitemap_path
|
|
87
91
|
|
|
88
92
|
# Don't refetch URLs already found in robots.txt
|
|
usp/web_client/requests_client.py
CHANGED
|
@@ -92,18 +92,24 @@ class RequestsWebClient(AbstractWebClient):
|
|
|
92
92
|
]
|
|
93
93
|
|
|
94
94
|
def __init__(
|
|
95
|
-
self,
|
|
95
|
+
self,
|
|
96
|
+
verify=True,
|
|
97
|
+
wait: Optional[float] = None,
|
|
98
|
+
random_wait: bool = False,
|
|
99
|
+
session: Optional[requests.Session] = None,
|
|
96
100
|
):
|
|
97
101
|
"""
|
|
98
102
|
:param verify: whether certificates should be verified for HTTPS requests.
|
|
99
103
|
:param wait: time to wait between requests, in seconds.
|
|
100
104
|
:param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
|
|
105
|
+
:param session: a custom session object to use, or None to create a new one.
|
|
101
106
|
"""
|
|
102
107
|
self.__max_response_data_length = None
|
|
103
108
|
self.__timeout = self.__HTTP_REQUEST_TIMEOUT
|
|
104
109
|
self.__proxies = {}
|
|
105
110
|
self.__verify = verify
|
|
106
111
|
self.__waiter = RequestWaiter(wait, random_wait)
|
|
112
|
+
self.__session = session or requests.Session()
|
|
107
113
|
|
|
108
114
|
def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
|
|
109
115
|
"""Set HTTP request timeout.
|
|
@@ -132,7 +138,7 @@ class RequestsWebClient(AbstractWebClient):
|
|
|
132
138
|
def get(self, url: str) -> AbstractWebClientResponse:
|
|
133
139
|
self.__waiter.wait()
|
|
134
140
|
try:
|
|
135
|
-
response =
|
|
141
|
+
response = self.__session.get(
|
|
136
142
|
url,
|
|
137
143
|
timeout=self.__timeout,
|
|
138
144
|
stream=True,
|
|
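{ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/LICENSE
RENAMED
|
File without changes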
|
|
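{ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/NOTICE
RENAMED
|
File without changes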
|
|
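{ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/WHEEL
RENAMED
|
File without changes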
|
{ultimate_sitemap_parser-1.1.1.dist-info → ultimate_sitemap_parser-1.2.0.dist-info}/entry_points.txt
RENAMED
|
File without changes
|