ultimate-sitemap-parser 1.0.0rc1__tar.gz → 1.1.1__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in the public registry. It is provided for informational purposes only.
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/PKG-INFO +6 -5
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/pyproject.toml +42 -22
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/_ls.py +1 -1
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/fetch_parse.py +34 -14
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/helpers.py +8 -7
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/page.py +29 -3
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/sitemap.py +4 -9
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/tree.py +5 -4
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/abstract_client.py +35 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/requests_client.py +23 -5
- ultimate_sitemap_parser-1.0.0rc1/usp/log.py +0 -77
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/LICENSE +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/NOTICE +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/README.rst +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/_util.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/cli/cli.py +1 -1
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/exceptions.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/__init__.py +0 -0
- {ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/__init__.py +0 -0

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/PKG-INFO RENAMED

@@ -1,15 +1,14 @@
-Metadata-Version: 2.
+Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.0.0rc1
+Version: 1.1.1
 Summary: A performant library for parsing and crawling sitemaps
-Home-page: https://ultimate-sitemap-parser.readthedocs.io/
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
 Author: Linas Valiukas
 Author-email: linas@media.mit.edu
 Maintainer: Freddy Heppell
 Maintainer-email: f.heppell@sheffield.ac.uk
-Requires-Python: >=3.8
+Requires-Python: >=3.8
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: Intended Audience :: Information Technology
@@ -22,12 +21,14 @@ Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
 Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
 Classifier: Topic :: Text Processing :: Indexing
 Classifier: Topic :: Text Processing :: Markup :: XML
 Requires-Dist: python-dateutil (>=2.7,<3.0.0)
-Requires-Dist: requests (>=2.2.1)
+Requires-Dist: requests (>=2.2.1,<3.0.0)
 Project-URL: Documentation, https://ultimate-sitemap-parser.readthedocs.io/
+Project-URL: Homepage, https://ultimate-sitemap-parser.readthedocs.io/
 Project-URL: Repository, https://github.com/GateNLP/ultimate-sitemap-parser
 Description-Content-Type: text/x-rst
 

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/pyproject.toml RENAMED

@@ -1,20 +1,36 @@
-[
+[project]
 name = "ultimate-sitemap-parser"
-version = "1.0.0rc1"
+version = "1.1.1"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
-    "Linas Valiukas
-    "Hal Roberts
-    "Freddy Heppell
+    { name = "Linas Valiukas", email = "linas@media.mit.edu"},
+    { name = "Hal Roberts", email = "hroberts@cyber.law.harvard.edu"},
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
 ]
 maintainers = [
-    "Freddy Heppell
+    { name = "Freddy Heppell", email = "f.heppell@sheffield.ac.uk"},
 ]
+license = "GPL-3.0-or-later"
+readme = "README.rst"
+keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
+dynamic = ["classifiers"]
+
+requires-python = ">=3.8"
+dependencies = [
+    "python-dateutil (>=2.7,<3.0.0)",
+    "requests (>=2.2.1,<3.0.0)"
+]
+
+[project.urls]
 homepage = "https://ultimate-sitemap-parser.readthedocs.io/"
 documentation = "https://ultimate-sitemap-parser.readthedocs.io/"
 repository = "https://github.com/GateNLP/ultimate-sitemap-parser"
-
-
+
+[project.scripts]
+usp = 'usp.cli:main'
+
+[tool.poetry]
+requires-poetry = ">=2.0"
 classifiers=[
     'Development Status :: 5 - Production/Stable',
     'Intended Audience :: Developers',
@@ -26,24 +42,20 @@ classifiers=[
     'Topic :: Text Processing :: Indexing',
     'Topic :: Text Processing :: Markup :: XML',
 ]
-keywords = ["sitemap", "crawler", "indexing", "xml", "rss", "atom", "google news"]
 packages = [
     { include = "usp" }
 ]
 
-[tool.poetry.scripts]
-usp = 'usp.cli:main'
-
 [tool.poetry.dependencies]
-
-python
-requests = ">=2.2.1"
+# Specify upper bound for locking
+python = ">=3.8,<4.0"
 
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
-ruff = "^0.
+ruff = "^0.9.3"
 vcrpy = "6.0.1"
+pytest-mock = "^3.14.0"
 
 [tool.poetry.group.perf]
 optional = true
@@ -71,12 +83,20 @@ extend-exclude = ["docs/*"]
 
 [tool.ruff.lint]
 select = [
-    "E4",
-    "E7",
-    "E9",
-    "F",
-    "UP",
-    "PT"
+    "E4", # pycodestyle Import
+    "E7", # pycodestyle Statement
+    "E9", # pycodestyle Runtime
+    "F", # pyflakes
+    "UP", # pyupgrde
+    "PT", # flake8-pytest-style
+    "I", # isort
+    "T20", # flake8-print
+    "LOG", # flake8-logging
+]
+
+[tool.ruff.lint.per-file-ignores]
+"**/tests/*" = [
+    "T20", # Allow print in tests
 ]
 
 [tool.pytest.ini_options]

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/fetch_parse.py RENAMED

@@ -8,49 +8,49 @@
 """
 
 import abc
+import logging
 import re
 import xml.parsers.expat
 from collections import OrderedDict
 from decimal import Decimal, InvalidOperation
-from typing import
-
+from typing import Dict, Optional, Union
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
-    html_unescape_strip,
-    parse_iso8601_date,
     get_url_retry_on_client_errors,
-
+    html_unescape_strip,
     is_http_url,
+    parse_iso8601_date,
     parse_rfc2822_date,
+    ungzipped_response_content,
 )
-from .log import create_logger
 from .objects.page import (
+    SITEMAP_PAGE_DEFAULT_PRIORITY,
     SitemapImage,
-    SitemapPage,
     SitemapNewsStory,
+    SitemapPage,
     SitemapPageChangeFrequency,
-    SITEMAP_PAGE_DEFAULT_PRIORITY,
 )
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
     IndexRobotsTxtSitemap,
     IndexXMLSitemap,
-
-    PagesTextSitemap,
-    PagesRSSSitemap,
+    InvalidSitemap,
     PagesAtomSitemap,
+    PagesRSSSitemap,
+    PagesTextSitemap,
+    PagesXMLSitemap,
 )
 from .web_client.abstract_client import (
     AbstractWebClient,
     AbstractWebClientSuccessResponse,
+    LocalWebClient,
+    NoWebClientException,
     WebClientErrorResponse,
 )
-from .web_client.abstract_client import LocalWebClient, NoWebClientException
 from .web_client.requests_client import RequestsWebClient
 
-log =
+log = logging.getLogger(__name__)
 
 
 class SitemapFetcher:
@@ -643,6 +643,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             "news_keywords",
             "news_stock_tickers",
             "images",
+            "alternates",
         ]
 
         def __init__(self):
@@ -659,6 +660,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
             self.news_keywords = None
             self.news_stock_tickers = None
             self.images = []
+            self.alternates = []
 
         def __hash__(self):
             return hash(
@@ -763,6 +765,10 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 for image in self.images
             ]
 
+            alternates = None
+            if len(self.alternates) > 0:
+                alternates = self.alternates
+
             return SitemapPage(
                 url=url,
                 last_modified=last_modified,
@@ -770,6 +776,7 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                 priority=priority,
                 news_story=sitemap_news_story,
                 images=sitemap_images,
+                alternates=alternates,
             )
 
     __slots__ = ["_current_page", "_pages", "_page_urls", "_current_image"]
@@ -801,6 +808,19 @@ class PagesXMLSitemapParser(AbstractXMLSitemapParser):
                     "Page is expected to be set before <image:image>."
                 )
            self._current_image = self.Image()
+        elif name == "link":
+            if not self._current_page:
+                raise SitemapXMLParsingException(
+                    "Page is expected to be set before <link>."
+                )
+            if "rel" not in attrs or attrs["rel"] != "alternate":
+                log.warning(f"<link> element is missing rel attribute: {attrs}.")
+            elif "hreflang" not in attrs or "href" not in attrs:
+                log.warning(
+                    f"<link> element is missing hreflang or href attributes: {attrs}."
+                )
+            else:
+                self._current_page.alternates.append((attrs["hreflang"], attrs["href"]))
 
         def __require_last_char_data_to_be_set(self, name: str) -> None:
             if not self._last_char_data:
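
The new `elif name == "link":` branch is what adds hreflang alternate support: each `<xhtml:link rel="alternate" hreflang="..." href="..."/>` child of a `<url>` element contributes one (hreflang, href) pair to the current page, and malformed links are logged and skipped rather than aborting the parse. A minimal standalone sketch of that decision logic (the function and driver below are illustrative only, not part of the library):

    import logging
    from typing import Dict, List, Tuple

    log = logging.getLogger(__name__)


    def collect_alternate(attrs: Dict[str, str], alternates: List[Tuple[str, str]]) -> None:
        # Mirrors the new <link> handling: keep valid rel="alternate" links,
        # warn about (rather than fail on) malformed ones.
        if attrs.get("rel") != "alternate":
            log.warning(f"<link> element is missing rel attribute: {attrs}.")
        elif "hreflang" not in attrs or "href" not in attrs:
            log.warning(f"<link> element is missing hreflang or href attributes: {attrs}.")
        else:
            alternates.append((attrs["hreflang"], attrs["href"]))


    alternates: List[Tuple[str, str]] = []
    collect_alternate(
        {"rel": "alternate", "hreflang": "fr", "href": "https://www.example.com/fr/page"},
        alternates,
    )
    print(alternates)  # [('fr', 'https://www.example.com/fr/page')]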

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/helpers.py RENAMED

@@ -3,24 +3,25 @@
 import datetime
 import gzip as gzip_lib
 import html
+import logging
 import re
 import sys
 import time
 from typing import Optional
-from urllib.parse import
-
+from urllib.parse import unquote_plus, urlparse, urlunparse
+
 from dateutil.parser import isoparse as dateutil_isoparse
+from dateutil.parser import parse as dateutil_parse
 
-from .exceptions import
-from .log import create_logger
+from .exceptions import GunzipException, SitemapException, StripURLToHomepageException
 from .web_client.abstract_client import (
     AbstractWebClient,
+    AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     WebClientErrorResponse,
-    AbstractWebClientResponse,
 )
 
-log =
+log = logging.getLogger(__name__)
 
 __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
 """Regular expression to match HTTP(s) URLs."""
@@ -247,7 +248,7 @@ def ungzipped_response_content(
        data = gunzip(data)
    except GunzipException as ex:
        # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
-       log.
+       log.warning(
            f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
        )
 
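
helpers.py now imports both `isoparse` and `parse` from `dateutil.parser`. `isoparse` accepts only ISO 8601 strings, the format sitemap `<lastmod>` values are supposed to use; the diff only shows the import, but presumably the lenient `parse` remains the fallback for nonconforming dates. A small standalone illustration of the difference between the two dateutil functions:

    from dateutil.parser import isoparse, parse

    # Strict ISO 8601 parsing, e.g. a typical <lastmod> value:
    print(isoparse("2024-06-01T12:30:00+00:00"))   # 2024-06-01 12:30:00+00:00

    # The lenient parser also copes with non-ISO spellings:
    print(parse("01 Jun 2024 12:30 +0000"))        # 2024-06-01 12:30:00+00:00

    # ...but isoparse rejects them outright:
    try:
        isoparse("01 Jun 2024 12:30 +0000")
    except ValueError as exc:
        print(f"isoparse refused it: {exc}")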

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/page.py RENAMED

@@ -3,7 +3,7 @@
 import datetime
 from decimal import Decimal
 from enum import Enum, unique
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 SITEMAP_PAGE_DEFAULT_PRIORITY = Decimal("0.5")
 """Default sitemap page priority, as per the spec."""
@@ -331,6 +331,7 @@ class SitemapPage:
         "__change_frequency",
         "__news_story",
         "__images",
+        "__alternates",
     ]
 
     def __init__(
@@ -341,6 +342,7 @@ class SitemapPage:
         change_frequency: Optional[SitemapPageChangeFrequency] = None,
         news_story: Optional[SitemapNewsStory] = None,
         images: Optional[List[SitemapImage]] = None,
+        alternates: Optional[List[Tuple[str, str]]] = None,
     ):
         """
         Initialize a new sitemap-derived page.
@@ -357,6 +359,7 @@ class SitemapPage:
         self.__change_frequency = change_frequency
         self.__news_story = news_story
         self.__images = images
+        self.__alternates = alternates
 
     def __eq__(self, other) -> bool:
         if not isinstance(other, SitemapPage):
@@ -380,6 +383,9 @@ class SitemapPage:
         if self.images != other.images:
             return False
 
+        if self.alternates != other.alternates:
+            return False
+
         return True
 
     def __hash__(self):
@@ -442,10 +448,30 @@ class SitemapPage:
 
     @property
     def news_story(self) -> Optional[SitemapNewsStory]:
-        """Get the Google News story attached to the URL.
+        """Get the Google News story attached to the URL.
+
+        See :ref:`google-news-ext` reference
+        """
         return self.__news_story
 
     @property
     def images(self) -> Optional[List[SitemapImage]]:
-        """Get the images attached to the URL.
+        """Get the images attached to the URL.
+
+        See :ref:`google-image-ext` reference
+        """
         return self.__images
+
+    @property
+    def alternates(self) -> Optional[List[Tuple[str, str]]]:
+        """Get the alternate URLs for the URL.
+
+        A tuple of (language code, URL) for each ``<xhtml:link>`` element with ``rel="alternate"`` attribute.
+
+        See :ref:`sitemap-extra-localisation` reference
+
+        Example::
+
+            [('fr', 'https://www.example.com/fr/page'), ('de', 'https://www.example.com/de/page')]
+        """
+        return self.__alternates
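
`SitemapPage` gains an `alternates` slot, constructor argument, equality check, and read-only property, so alternate-language URLs survive into the public object model. A short sketch of how the new field behaves, based only on the constructor and property shown above:

    from usp.objects.page import SitemapPage

    page = SitemapPage(
        url="https://www.example.com/page",
        alternates=[
            ("fr", "https://www.example.com/fr/page"),
            ("de", "https://www.example.com/de/page"),
        ],
    )

    for hreflang, href in page.alternates:
        print(hreflang, href)

    # alternates take part in equality: an otherwise identical page
    # without them compares unequal.
    assert page != SitemapPage(url="https://www.example.com/page")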

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/objects/sitemap.py RENAMED

@@ -9,11 +9,11 @@
 """
 
 import abc
-from functools import lru_cache
 import os
 import pickle
 import tempfile
-from
+from functools import lru_cache
+from typing import Iterator, List, Tuple
 
 from .page import SitemapPage
 
@@ -72,7 +72,7 @@ class AbstractSitemap(metaclass=abc.ABCMeta):
         return hash((self.url,))
 
     def __repr__(self):
-        return f"{self.__class__.__name__}(
+        return f"{self.__class__.__name__}(url={self.url})"
 
     @property
     def url(self) -> str:
@@ -167,12 +167,7 @@ class InvalidSitemap(AbstractSitemap):
         return True
 
     def __repr__(self):
-        return (
-            f"{self.__class__.__name__}("
-            f"url={self.url}, "
-            f"reason={self.reason}"
-            ")"
-        )
+        return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"
 
     def to_dict(self, with_pages=True) -> dict:
         return {

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/tree.py RENAMED

@@ -1,19 +1,20 @@
 """Helpers to generate a sitemap tree."""
 
+import logging
 from typing import Optional
+
 from .exceptions import SitemapException
 from .fetch_parse import SitemapFetcher, SitemapStrParser
 from .helpers import is_http_url, strip_url_to_homepage
-from .log import create_logger
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
-    IndexWebsiteSitemap,
     IndexRobotsTxtSitemap,
+    IndexWebsiteSitemap,
+    InvalidSitemap,
 )
 from .web_client.abstract_client import AbstractWebClient
 
-log =
+log = logging.getLogger(__name__)
 
 _UNPUBLISHED_SITEMAP_PATHS = {
     "sitemap.xml",
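
tree.py keeps the same public entry point; only the logging setup and import ordering change here. For context, typical usage of the crawler, through which the new per-page fields become visible (a sketch: `sitemap_tree_for_homepage` is the library's documented entry point, and the attribute names follow the diffs above):

    from usp.tree import sitemap_tree_for_homepage

    # Discovers robots.txt and known sitemap locations, then walks the tree.
    tree = sitemap_tree_for_homepage("https://www.example.com/")

    for page in tree.all_pages():
        print(page.url, page.alternates)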

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/abstract_client.py RENAMED

@@ -1,6 +1,8 @@
 """Abstract web client class."""
 
 import abc
+import random
+import time
 from http import HTTPStatus
 from typing import Optional
 
@@ -187,3 +189,36 @@ class LocalWebClient(AbstractWebClient):
 
     def get(self, url: str) -> AbstractWebClientResponse:
         raise NoWebClientException
+
+
+class RequestWaiter:
+    """
+    Manages waiting between requests.
+    """
+
+    def __init__(self, wait: Optional[float] = None, random_wait: bool = True):
+        """
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        """
+        self.wait_s = wait or 0
+        self.random_wait = random_wait
+        self.is_first = True
+
+    def wait(self) -> None:
+        """Perform a wait if needed. Should be called before each request.
+
+        Will skip wait if this is the first request.
+        """
+        if self.wait_s == 0:
+            return
+
+        if self.is_first:
+            self.is_first = False
+            return
+
+        wait_f = 1.0
+        if self.random_wait:
+            wait_f = random.uniform(0.5, 1.5)
+
+        time.sleep(self.wait_s * wait_f)
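
`RequestWaiter` centralises optional rate limiting for web clients: it does nothing when no wait is configured, skips the delay before the very first request, and otherwise sleeps for the configured interval, optionally jittered by a factor drawn from 0.5 to 1.5. A small usage sketch built directly on the class shown above (the surrounding loop and URLs are illustrative only):

    import requests

    from usp.web_client.abstract_client import RequestWaiter

    waiter = RequestWaiter(wait=2.0, random_wait=True)

    for url in ["https://www.example.com/sitemap1.xml",
                "https://www.example.com/sitemap2.xml"]:
        waiter.wait()  # no-op before the first request, roughly 1-3 s afterwards
        response = requests.get(url, timeout=30)
        print(url, response.status_code)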

{ultimate_sitemap_parser-1.0.0rc1 → ultimate_sitemap_parser-1.1.1}/usp/web_client/requests_client.py RENAMED

@@ -1,18 +1,23 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
+import logging
 from http import HTTPStatus
-from typing import
+from typing import Dict, Optional, Tuple, Union
 
 import requests
 
+from usp import __version__
+
 from .abstract_client import (
+    RETRYABLE_HTTP_STATUS_CODES,
     AbstractWebClient,
     AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
+    RequestWaiter,
     WebClientErrorResponse,
-    RETRYABLE_HTTP_STATUS_CODES,
 )
-
+
+log = logging.getLogger(__name__)
 
 
 class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
@@ -78,16 +83,27 @@ class RequestsWebClient(AbstractWebClient):
     Some webservers might be generating huge sitemaps on the fly, so this is why it's rather big.
     """
 
-    __slots__ = [
+    __slots__ = [
+        "__max_response_data_length",
+        "__timeout",
+        "__proxies",
+        "__verify",
+        "__waiter",
+    ]
 
-    def __init__(
+    def __init__(
+        self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+    ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
+        :param wait: time to wait between requests, in seconds.
+        :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
+        self.__waiter = RequestWaiter(wait, random_wait)
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
@@ -114,6 +130,7 @@ class RequestsWebClient(AbstractWebClient):
         self.__max_response_data_length = max_response_data_length
 
     def get(self, url: str) -> AbstractWebClientResponse:
+        self.__waiter.wait()
         try:
             response = requests.get(
                 url,
@@ -139,6 +156,7 @@ class RequestsWebClient(AbstractWebClient):
             )
         else:
             message = f"{response.status_code} {response.reason}"
+            log.info(f"Response content: {response.text}")
 
             if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                 return RequestsWebClientErrorResponse(
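
The bundled Requests client exposes the waiter through two new constructor arguments, so polite crawling no longer needs a custom client subclass. A hedged example of wiring it into a crawl (assuming `sitemap_tree_for_homepage` accepts a `web_client` argument, as in earlier releases):

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    # Wait roughly 0.5-1.5 seconds (1 s with +/- 50 % jitter) between HTTP requests.
    client = RequestsWebClient(wait=1.0, random_wait=True)
    client.set_timeout(60)

    tree = sitemap_tree_for_homepage("https://www.example.com/", web_client=client)
    print(sum(1 for _ in tree.all_pages()), "pages found")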

ultimate_sitemap_parser-1.0.0rc1/usp/log.py DELETED

@@ -1,77 +0,0 @@
-"""Logging utilities."""
-
-import logging
-
-
-class Logger:
-    """
-    Logging helper class.
-    """
-
-    __LEVELS = {
-        "CRITICAL": logging.CRITICAL,
-        "ERROR": logging.ERROR,
-        "WARNING": logging.WARNING,
-        "INFO": logging.INFO,
-        "DEBUG": logging.DEBUG,
-    }
-    """Valid logging levels and their "logging" counterparts."""
-
-    __DEFAULT_LEVEL = "INFO"
-    """Default logging level."""
-
-    __slots__ = [
-        # "logging" object
-        "__l",
-    ]
-
-    def __init__(self, name: str):
-        """
-        Initialize logger object for a given name.
-
-        :param name: Module name that the logger should be initialized for.
-        """
-
-        self.__l = logging.getLogger(name)
-
-    def error(self, message: str) -> None:
-        """
-        Log error message.
-
-        :param message: Message to log.
-        """
-        self.__l.error(message)
-
-    def warning(self, message: str) -> None:
-        """
-        Log warning message.
-
-        :param message: Message to log.
-        """
-        self.__l.warning(message)
-
-    def info(self, message: str) -> None:
-        """
-        Log informational message.
-
-        :param message: Message to log.
-        """
-        self.__l.info(message)
-
-    def debug(self, message: str) -> None:
-        """
-        Log debugging message.
-
-        :param message: Message to log.
-        """
-        self.__l.debug(message)
-
-
-def create_logger(name: str) -> Logger:
-    """
-    Create and return Logger object.
-
-    :param name: Module name that the logger should be initialized for.
-    :return: Logger object.
-    """
-    return Logger(name=name)
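
With usp/log.py gone, every module now logs through the standard `logging` library (the `log = logging.getLogger(__name__)` lines above), so applications control verbosity with ordinary logging configuration rather than a USP-specific wrapper. For example, to surface the library's warnings about malformed `<link>` elements or failed gunzip attempts:

    import logging

    # Route all loggers, including the usp.* hierarchy, to stderr at INFO level...
    logging.basicConfig(level=logging.INFO)

    # ...or adjust just the library, e.g. silence it below WARNING:
    logging.getLogger("usp").setLevel(logging.WARNING)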

The remaining files (LICENSE, NOTICE, README.rst, usp/__init__.py, usp/cli/__init__.py, usp/cli/_util.py, usp/exceptions.py, usp/objects/__init__.py, usp/web_client/__init__.py) were renamed without content changes.