ultimate-sitemap-parser 1.1.0__tar.gz → 1.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (21)
  1. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/PKG-INFO +1 -1
  2. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/pyproject.toml +16 -8
  3. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/_ls.py +1 -1
  4. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/fetch_parse.py +14 -14
  5. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/helpers.py +8 -7
  6. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/sitemap.py +4 -9
  7. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/tree.py +10 -5
  8. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/requests_client.py +16 -7
  9. ultimate_sitemap_parser-1.1.0/usp/log.py +0 -77
  10. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/LICENSE +0 -0
  11. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/NOTICE +0 -0
  12. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/README.rst +0 -0
  13. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/__init__.py +0 -0
  14. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/__init__.py +0 -0
  15. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/_util.py +0 -0
  16. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/cli/cli.py +1 -1
  17. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/exceptions.py +0 -0
  18. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/__init__.py +0 -0
  19. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/objects/page.py +0 -0
  20. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/__init__.py +0 -0
  21. {ultimate_sitemap_parser-1.1.0 → ultimate_sitemap_parser-1.2.0}/usp/web_client/abstract_client.py +1 -1
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: ultimate-sitemap-parser
-Version: 1.1.0
+Version: 1.2.0
 Summary: A performant library for parsing and crawling sitemaps
 License: GPL-3.0-or-later
 Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "ultimate-sitemap-parser"
-version = "1.1.0"
+version = "1.2.0"
 description = "A performant library for parsing and crawling sitemaps"
 authors = [
     { name = "Linas Valiukas", email = "linas@media.mit.edu"},
@@ -53,7 +53,7 @@ python = ">=3.8,<4.0"
 [tool.poetry.group.dev.dependencies]
 requests-mock = ">=1.6.0,<2.0"
 pytest = "^8.3.0"
-ruff = "^0.6.1"
+ruff = "^0.9.3"
 vcrpy = "6.0.1"
 pytest-mock = "^3.14.0"
 
@@ -83,12 +83,20 @@ extend-exclude = ["docs/*"]
 
 [tool.ruff.lint]
 select = [
-    "E4",
-    "E7",
-    "E9",
-    "F",
-    "UP",
-    "PT"
+    "E4", # pycodestyle Import
+    "E7", # pycodestyle Statement
+    "E9", # pycodestyle Runtime
+    "F", # pyflakes
+    "UP", # pyupgrde
+    "PT", # flake8-pytest-style
+    "I", # isort
+    "T20", # flake8-print
+    "LOG", # flake8-logging
+]
+
+[tool.ruff.lint.per-file-ignores]
+"**/tests/*" = [
+    "T20", # Allow print in tests
 ]
 
 [tool.pytest.ini_options]
usp/cli/_ls.py
@@ -2,7 +2,7 @@ import argparse
 import sys
 from typing import Iterator
 
-from usp.cli._util import tabs, format_help
+from usp.cli._util import format_help, tabs
 from usp.objects.sitemap import AbstractSitemap
 from usp.tree import sitemap_tree_for_homepage
 
usp/fetch_parse.py
@@ -8,49 +8,49 @@
 """
 
 import abc
+import logging
 import re
 import xml.parsers.expat
 from collections import OrderedDict
 from decimal import Decimal, InvalidOperation
-from typing import Optional, Dict, Union
-
+from typing import Dict, Optional, Union
 
 from .exceptions import SitemapException, SitemapXMLParsingException
 from .helpers import (
-    html_unescape_strip,
-    parse_iso8601_date,
     get_url_retry_on_client_errors,
-    ungzipped_response_content,
+    html_unescape_strip,
     is_http_url,
+    parse_iso8601_date,
     parse_rfc2822_date,
+    ungzipped_response_content,
 )
-from .log import create_logger
 from .objects.page import (
+    SITEMAP_PAGE_DEFAULT_PRIORITY,
     SitemapImage,
-    SitemapPage,
     SitemapNewsStory,
+    SitemapPage,
     SitemapPageChangeFrequency,
-    SITEMAP_PAGE_DEFAULT_PRIORITY,
 )
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
     IndexRobotsTxtSitemap,
     IndexXMLSitemap,
-    PagesXMLSitemap,
-    PagesTextSitemap,
-    PagesRSSSitemap,
+    InvalidSitemap,
     PagesAtomSitemap,
+    PagesRSSSitemap,
+    PagesTextSitemap,
+    PagesXMLSitemap,
 )
 from .web_client.abstract_client import (
     AbstractWebClient,
     AbstractWebClientSuccessResponse,
+    LocalWebClient,
+    NoWebClientException,
     WebClientErrorResponse,
 )
-from .web_client.abstract_client import LocalWebClient, NoWebClientException
 from .web_client.requests_client import RequestsWebClient
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 
 class SitemapFetcher:
usp/helpers.py
@@ -3,24 +3,25 @@
 import datetime
 import gzip as gzip_lib
 import html
+import logging
 import re
 import sys
 import time
 from typing import Optional
-from urllib.parse import urlparse, unquote_plus, urlunparse
-from dateutil.parser import parse as dateutil_parse
+from urllib.parse import unquote_plus, urlparse, urlunparse
+
 from dateutil.parser import isoparse as dateutil_isoparse
+from dateutil.parser import parse as dateutil_parse
 
-from .exceptions import SitemapException, GunzipException, StripURLToHomepageException
-from .log import create_logger
+from .exceptions import GunzipException, SitemapException, StripURLToHomepageException
 from .web_client.abstract_client import (
     AbstractWebClient,
+    AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     WebClientErrorResponse,
-    AbstractWebClientResponse,
 )
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 __URL_REGEX = re.compile(r"^https?://[^\s/$.?#].[^\s]*$", re.IGNORECASE)
 """Regular expression to match HTTP(s) URLs."""
@@ -247,7 +248,7 @@ def ungzipped_response_content(
             data = gunzip(data)
         except GunzipException as ex:
             # In case of an error, just assume that it's one of the non-gzipped sitemaps with ".gz" extension
-            log.error(
+            log.warning(
                 f"Unable to gunzip response {response}, maybe it's a non-gzipped sitemap: {ex}"
             )
 
usp/objects/sitemap.py
@@ -9,11 +9,11 @@
 """
 
 import abc
-from functools import lru_cache
 import os
 import pickle
 import tempfile
-from typing import List, Iterator, Tuple
+from functools import lru_cache
+from typing import Iterator, List, Tuple
 
 from .page import SitemapPage
 
@@ -72,7 +72,7 @@ class AbstractSitemap(metaclass=abc.ABCMeta):
         return hash((self.url,))
 
     def __repr__(self):
-        return f"{self.__class__.__name__}(" f"url={self.url}" ")"
+        return f"{self.__class__.__name__}(url={self.url})"
 
     @property
     def url(self) -> str:
@@ -167,12 +167,7 @@ class InvalidSitemap(AbstractSitemap):
         return True
 
     def __repr__(self):
-        return (
-            f"{self.__class__.__name__}("
-            f"url={self.url}, "
-            f"reason={self.reason}"
-            ")"
-        )
+        return f"{self.__class__.__name__}(url={self.url}, reason={self.reason})"
 
     def to_dict(self, with_pages=True) -> dict:
         return {
usp/tree.py
@@ -1,19 +1,20 @@
 """Helpers to generate a sitemap tree."""
 
+import logging
 from typing import Optional
+
 from .exceptions import SitemapException
 from .fetch_parse import SitemapFetcher, SitemapStrParser
 from .helpers import is_http_url, strip_url_to_homepage
-from .log import create_logger
 from .objects.sitemap import (
     AbstractSitemap,
-    InvalidSitemap,
-    IndexWebsiteSitemap,
     IndexRobotsTxtSitemap,
+    IndexWebsiteSitemap,
+    InvalidSitemap,
 )
 from .web_client.abstract_client import AbstractWebClient
 
-log = create_logger(__name__)
+log = logging.getLogger(__name__)
 
 _UNPUBLISHED_SITEMAP_PATHS = {
     "sitemap.xml",
@@ -39,6 +40,7 @@ def sitemap_tree_for_homepage(
     web_client: Optional[AbstractWebClient] = None,
     use_robots: bool = True,
     use_known_paths: bool = True,
+    extra_known_paths: Optional[set] = None,
 ) -> AbstractSitemap:
     """
     Using a homepage URL, fetch the tree of sitemaps and pages listed in them.
@@ -48,12 +50,15 @@
         If ``None``, a :class:`~.RequestsWebClient` will be used.
     :param use_robots: Whether to discover sitemaps through robots.txt.
     :param use_known_paths: Whether to discover sitemaps through common known paths.
+    :param extra_known_paths: Extra paths to check for sitemaps.
     :return: Root sitemap object of the fetched sitemap tree.
     """
 
     if not is_http_url(homepage_url):
        raise SitemapException(f"URL {homepage_url} is not a HTTP(s) URL.")
 
+    extra_known_paths = extra_known_paths or set()
+
     stripped_homepage_url = strip_url_to_homepage(url=homepage_url)
     if homepage_url != stripped_homepage_url:
         log.warning(
@@ -81,7 +86,7 @@
                 sitemap_urls_found_in_robots_txt.add(sub_sitemap.url)
 
     if use_known_paths:
-        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS:
+        for unpublished_sitemap_path in _UNPUBLISHED_SITEMAP_PATHS | extra_known_paths:
             unpublished_sitemap_url = homepage_url + unpublished_sitemap_path
 
             # Don't refetch URLs already found in robots.txt
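
The new extra_known_paths argument extends the built-in _UNPUBLISHED_SITEMAP_PATHS set with caller-supplied candidates when use_known_paths is enabled. A minimal sketch of how it might be called; the homepage URL and the "sitemap-custom.xml" path are illustrative assumptions, not values from the package:

    # Sketch: check an extra, non-standard sitemap path alongside the defaults.
    # "https://www.example.com/" and "sitemap-custom.xml" are made-up examples.
    from usp.tree import sitemap_tree_for_homepage

    tree = sitemap_tree_for_homepage(
        "https://www.example.com/",
        extra_known_paths={"sitemap-custom.xml"},
    )
    for page in tree.all_pages():
        print(page.url)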
usp/web_client/requests_client.py
@@ -1,20 +1,23 @@
 """Implementation of :mod:`usp.web_client.abstract_client` with Requests."""
 
-from http import HTTPStatus
 import logging
-from typing import Optional, Dict, Tuple, Union
+from http import HTTPStatus
+from typing import Dict, Optional, Tuple, Union
 
 import requests
 
+from usp import __version__
+
 from .abstract_client import (
+    RETRYABLE_HTTP_STATUS_CODES,
     AbstractWebClient,
     AbstractWebClientResponse,
     AbstractWebClientSuccessResponse,
     RequestWaiter,
     WebClientErrorResponse,
-    RETRYABLE_HTTP_STATUS_CODES,
 )
-from usp import __version__
+
+log = logging.getLogger(__name__)
 
 
 class RequestsWebClientSuccessResponse(AbstractWebClientSuccessResponse):
@@ -89,18 +92,24 @@ class RequestsWebClient(AbstractWebClient):
     ]
 
     def __init__(
-        self, verify=True, wait: Optional[float] = None, random_wait: bool = False
+        self,
+        verify=True,
+        wait: Optional[float] = None,
+        random_wait: bool = False,
+        session: Optional[requests.Session] = None,
     ):
         """
         :param verify: whether certificates should be verified for HTTPS requests.
         :param wait: time to wait between requests, in seconds.
         :param random_wait: if true, wait time is multiplied by a random number between 0.5 and 1.5.
+        :param session: a custom session object to use, or None to create a new one.
         """
         self.__max_response_data_length = None
         self.__timeout = self.__HTTP_REQUEST_TIMEOUT
         self.__proxies = {}
         self.__verify = verify
         self.__waiter = RequestWaiter(wait, random_wait)
+        self.__session = session or requests.Session()
 
     def set_timeout(self, timeout: Union[int, Tuple[int, int], None]) -> None:
         """Set HTTP request timeout.
@@ -129,7 +138,7 @@ class RequestsWebClient(AbstractWebClient):
     def get(self, url: str) -> AbstractWebClientResponse:
         self.__waiter.wait()
         try:
-            response = requests.get(
+            response = self.__session.get(
                 url,
                 timeout=self.__timeout,
                 stream=True,
@@ -153,7 +162,7 @@ class RequestsWebClient(AbstractWebClient):
                 )
             else:
                 message = f"{response.status_code} {response.reason}"
-                logging.info(f"Response content: {response.text}")
+                log.info(f"Response content: {response.text}")
 
             if response.status_code in RETRYABLE_HTTP_STATUS_CODES:
                 return RequestsWebClientErrorResponse(
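
Because RequestsWebClient now accepts a session and routes get() through it, callers can reuse a preconfigured requests.Session (custom headers, adapters, connection pooling) across all sitemap fetches. A hedged sketch, assuming the header and adapter settings shown are caller preferences rather than package defaults:

    # Sketch: supply a custom requests.Session to RequestsWebClient.
    # The header and adapter values are illustrative, not package defaults.
    import requests
    from requests.adapters import HTTPAdapter

    from usp.tree import sitemap_tree_for_homepage
    from usp.web_client.requests_client import RequestsWebClient

    session = requests.Session()
    session.headers.update({"Accept-Language": "en"})
    session.mount("https://", HTTPAdapter(pool_connections=4, pool_maxsize=4))

    client = RequestsWebClient(session=session)
    tree = sitemap_tree_for_homepage("https://www.example.com/", web_client=client)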
usp/log.py (removed)
@@ -1,77 +0,0 @@
-"""Logging utilities."""
-
-import logging
-
-
-class Logger:
-    """
-    Logging helper class.
-    """
-
-    __LEVELS = {
-        "CRITICAL": logging.CRITICAL,
-        "ERROR": logging.ERROR,
-        "WARNING": logging.WARNING,
-        "INFO": logging.INFO,
-        "DEBUG": logging.DEBUG,
-    }
-    """Valid logging levels and their "logging" counterparts."""
-
-    __DEFAULT_LEVEL = "INFO"
-    """Default logging level."""
-
-    __slots__ = [
-        # "logging" object
-        "__l",
-    ]
-
-    def __init__(self, name: str):
-        """
-        Initialize logger object for a given name.
-
-        :param name: Module name that the logger should be initialized for.
-        """
-
-        self.__l = logging.getLogger(name)
-
-    def error(self, message: str) -> None:
-        """
-        Log error message.
-
-        :param message: Message to log.
-        """
-        self.__l.error(message)
-
-    def warning(self, message: str) -> None:
-        """
-        Log warning message.
-
-        :param message: Message to log.
-        """
-        self.__l.warning(message)
-
-    def info(self, message: str) -> None:
-        """
-        Log informational message.
-
-        :param message: Message to log.
-        """
-        self.__l.info(message)
-
-    def debug(self, message: str) -> None:
-        """
-        Log debugging message.
-
-        :param message: Message to log.
-        """
-        self.__l.debug(message)
-
-
-def create_logger(name: str) -> Logger:
-    """
-    Create and return Logger object.
-
-    :param name: Module name that the logger should be initialized for.
-    :return: Logger object.
-    """
-    return Logger(name=name)
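
With the custom Logger wrapper removed, every module now logs through the standard library directly (log = logging.getLogger(__name__)), so applications control verbosity with ordinary logging configuration. A minimal sketch, assuming the "usp" logger namespace implied by the package's module names:

    # Sketch: configure the library's log output via the stdlib logging module.
    # The "usp" namespace follows from the package name; adjust to taste.
    import logging

    logging.basicConfig(format="%(asctime)s %(name)s %(levelname)s: %(message)s")
    logging.getLogger("usp").setLevel(logging.DEBUG)  # show fetch/parse details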
usp/cli/cli.py
@@ -1,7 +1,7 @@
 from argparse import ArgumentParser
 
-from usp.cli import _ls as ls_cmd
 from usp import __version__
+from usp.cli import _ls as ls_cmd
 
 
 def main():
usp/web_client/abstract_client.py
@@ -2,8 +2,8 @@
 
 import abc
 import random
-from http import HTTPStatus
 import time
+from http import HTTPStatus
 from typing import Optional
 
 RETRYABLE_HTTP_STATUS_CODES = {