ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ultimate-sitemap-parser might be problematic. Click here for more details.

@@ -0,0 +1,109 @@
1
+ Metadata-Version: 2.1
2
+ Name: ultimate-sitemap-parser
3
+ Version: 1.0.0rc1
4
+ Summary: A performant library for parsing and crawling sitemaps
5
+ Home-page: https://ultimate-sitemap-parser.readthedocs.io/
6
+ License: GPL-3.0-or-later
7
+ Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
8
+ Author: Linas Valiukas
9
+ Author-email: linas@media.mit.edu
10
+ Maintainer: Freddy Heppell
11
+ Maintainer-email: f.heppell@sheffield.ac.uk
12
+ Requires-Python: >=3.8,<4.0
13
+ Classifier: Development Status :: 5 - Production/Stable
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Intended Audience :: Information Technology
16
+ Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.8
21
+ Classifier: Programming Language :: Python :: 3.9
22
+ Classifier: Programming Language :: Python :: 3.10
23
+ Classifier: Programming Language :: Python :: 3.11
24
+ Classifier: Programming Language :: Python :: 3.12
25
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
26
+ Classifier: Topic :: Text Processing :: Indexing
27
+ Classifier: Topic :: Text Processing :: Markup :: XML
28
+ Requires-Dist: python-dateutil (>=2.7,<3.0.0)
29
+ Requires-Dist: requests (>=2.2.1)
30
+ Project-URL: Documentation, https://ultimate-sitemap-parser.readthedocs.io/
31
+ Project-URL: Repository, https://github.com/GateNLP/ultimate-sitemap-parser
32
+ Description-Content-Type: text/x-rst
33
+
34
+ Ultimate Sitemap Parser
35
+ -----------------------
36
+
37
+ .. image:: https://img.shields.io/pypi/pyversions/ultimate-sitemap-parser
38
+ :alt: PyPI - Python Version
39
+ :target: https://github.com/GateNLP/ultimate-sitemap-parser
40
+
41
+ .. image:: https://img.shields.io/pypi/v/ultimate-sitemap-parser
42
+ :alt: PyPI - Version
43
+ :target: https://pypi.org/project/ultimate-sitemap-parser/
44
+
45
+ .. image:: https://img.shields.io/conda/vn/conda-forge/ultimate-sitemap-parser
46
+ :alt: Conda Version
47
+ :target: https://anaconda.org/conda-forge/ultimate-sitemap-parser
48
+
49
+ .. image:: https://img.shields.io/pepy/dt/ultimate-sitemap-parser
50
+ :target: https://pepy.tech/project/ultimate-sitemap-parser
51
+ :alt: Pepy Total Downloads
52
+
53
+
54
+ **Ultimate Sitemap Parser (USP) is a performant and robust Python library for parsing and crawling sitemaps.**
55
+
56
+
57
+ Features
58
+ ========
59
+
60
+ - Supports all sitemap formats:
61
+
62
+ - `XML sitemaps <https://www.sitemaps.org/protocol.html#xmlTagDefinitions>`_
63
+ - `Google News sitemaps <https://developers.google.com/search/docs/crawling-indexing/sitemaps/news-sitemap>`_ and `Image sitemaps <https://developers.google.com/search/docs/advanced/sitemaps/image-sitemaps>`_
64
+ - `plain text sitemaps <https://www.sitemaps.org/protocol.html#otherformats>`_
65
+ - `RSS 2.0 / Atom 0.3 / Atom 1.0 sitemaps <https://www.sitemaps.org/protocol.html#otherformats>`_
66
+ - `Sitemaps linked from robots.txt <https://developers.google.com/search/reference/robots_txt#sitemap>`_
67
+
68
+ - Field-tested with ~1 million URLs as part of the `Media Cloud project <https://mediacloud.org/>`_
69
+ - Tolerant of the most common sitemap bugs
70
+ - Tries to find sitemaps not listed in ``robots.txt``
71
+ - Uses fast and memory efficient Expat XML parsing
72
+ - Doesn't consume much memory even with massive sitemap hierarchies
73
+ - Provides the generated sitemap tree as an easy-to-use object tree
74
+ - Supports using a custom web client
75
+ - Uses a small number of actively maintained third-party modules
76
+ - Reasonably tested
77
+
78
+
79
+ Installation
80
+ ============
81
+
82
+ .. code:: sh
83
+
84
+ pip install ultimate-sitemap-parser
85
+
86
+ or using Anaconda:
87
+
88
+ .. code:: sh
89
+
90
+ conda install -c conda-forge ultimate-sitemap-parser
91
+
92
+
93
+ Usage
94
+ =====
95
+
96
+ .. code:: python
97
+
98
+ from usp.tree import sitemap_tree_for_homepage
99
+
100
+ tree = sitemap_tree_for_homepage('https://www.example.org/')
101
+
102
+ for page in tree.all_pages():
103
+ print(page.url)
104
+
105
+ ``sitemap_tree_for_homepage()`` will return a tree of ``AbstractSitemap`` subclass objects that represent the sitemap
106
+ hierarchy found on the website; see a `reference of AbstractSitemap subclasses <https://ultimate-sitemap-parser.readthedocs.io/en/latest/reference/api/usp.objects.sitemap.html>`_. ``AbstractSitemap.all_pages()`` returns a generator to efficiently iterate over pages without loading the entire tree into memory.
107
+
108
+ For more examples and details, see the `documentation <https://ultimate-sitemap-parser.readthedocs.io/en/latest/>`_.
109
+
@@ -0,0 +1,12 @@
1
+ Copyright (C) 2018 Linas Valiukas, Hal Roberts, Media Cloud project,
2
+ Freddy Heppell, The University of Sheffield, and other contributors.
3
+
4
+ This program is free software: you can redistribute it and/or modify
5
+ it under the terms of the GNU General Public License as published by
6
+ the Free Software Foundation, either version 3 of the License, or
7
+ any later version.
8
+
9
+ This program is distributed in the hope that it will be useful,
10
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12
+ GNU General Public License for more details <http://www.gnu.org/licenses/>.
@@ -0,0 +1,22 @@
1
+ usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
2
+ usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
3
+ usp/cli/_ls.py,sha256=YyDmtBjK02_26Qv8-3NLf87b1C4Wt0GzZ1XkdF2fllQ,2954
4
+ usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
5
+ usp/cli/cli.py,sha256=D1tXZyhiG0sIwtepdPdglW5gUlPWyx4LNeBmaM700Yc,592
6
+ usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
7
+ usp/fetch_parse.py,sha256=aE0ohttrce_gcZE8iBFvjy6iRJEEij0SzIOX3PEsDXw,40571
8
+ usp/helpers.py,sha256=QJH3ETapqqbwRnjX_LM0EhWqeta9LTqVvW5OAkBKUOc,8491
9
+ usp/log.py,sha256=BS0AtURK62TPGVqEuIu8kwGtIJDYoGsK5_N-b60VOpE,1631
10
+ usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ usp/objects/page.py,sha256=PNuYGburGu6AqoIdI8NdNd_6XG7nyyG8qlXKekuIinY,13483
12
+ usp/objects/sitemap.py,sha256=XWgke1SJNA79qnOEvaY2nJbnlidWxqBvfuRcF4GhBHI,11564
13
+ usp/tree.py,sha256=2cuHOpdYX5aKZ4XuUQPaKjILnoPnFKZwPNn0g8cxT18,4066
14
+ usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ usp/web_client/abstract_client.py,sha256=4OdmM9UEjK3LUYZCRwYVDrmapKkPXHR7fYch47W5xys,5381
16
+ usp/web_client/requests_client.py,sha256=N2nY15Aj9wd-qF0ujS4-YvMWaRntfWnX8ze5I_h7FnI,5079
17
+ ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
18
+ ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA,sha256=af_IqHD8nO8DzsiiG17faK52sMh6Br0imarNxZthhmY,4385
19
+ ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
20
+ ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
21
+ ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
22
+ ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: poetry-core 1.9.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,3 @@
1
+ [console_scripts]
2
+ usp=usp.cli:main
3
+
usp/__init__.py ADDED
@@ -0,0 +1,5 @@
1
from importlib.metadata import PackageNotFoundError, version

try:
    # Resolve the version from the installed distribution's metadata so
    # `usp.__version__` always matches what pip/conda actually installed.
    __version__ = version("ultimate-sitemap-parser")
except PackageNotFoundError:
    # Running from a source checkout without the package installed
    # (tests, editable development): don't make `import usp` itself fail.
    __version__ = "0+unknown"

__all__ = ["tree", "__version__"]
usp/cli/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from usp.cli.cli import main as main
usp/cli/_ls.py ADDED
@@ -0,0 +1,105 @@
1
+ import argparse
2
+ import sys
3
+ from typing import Iterator
4
+
5
+ from usp.cli._util import tabs, format_help
6
+ from usp.objects.sitemap import AbstractSitemap
7
+ from usp.tree import sitemap_tree_for_homepage
8
+
9
# Output formats for the `ls` subcommand: maps each `--format` choice to the
# one-line description shown in the generated argparse help text.
LS_FORMATS = {
    "tabtree": "Sitemaps and pages, nested with tab indentation",
    "pages": "Flat list of pages, one per line",
}
13
+
14
+
15
def register(subparsers):
    """Register the ``ls`` subcommand on the CLI's subparser collection.

    :param subparsers: the action object returned by
        ``ArgumentParser.add_subparsers()``.
    """
    ls_parser = subparsers.add_parser(
        "ls",
        help="List sitemap pages",
        description="download, parse and list the sitemap structure",
        # Raw formatter preserves the newlines produced by format_help().
        formatter_class=argparse.RawTextHelpFormatter,
    )
    ls_parser.add_argument("url", type=str, help="URL of the site including protocol")
    ls_parser.add_argument(
        "-f",
        "--format",
        choices=LS_FORMATS,
        default="tabtree",
        help=format_help(LS_FORMATS, "set output format"),
        metavar="",
    )
    ls_parser.add_argument(
        "-r",
        "--no-robots",
        action="store_true",
        help="don't discover sitemaps through robots.txt",
    )
    ls_parser.add_argument(
        "-k",
        "--no-known",
        action="store_true",
        help="don't discover sitemaps through well-known URLs",
    )
    ls_parser.add_argument(
        "-u",
        "--strip-url",
        action="store_true",
        help="strip the supplied URL from each page and sitemap URL",
    )
    # NOTE: the previous set_defaults(no_robots=False, no_known=False,
    # strip_url=False) call was removed — `store_true` already defaults
    # each flag to False, so it was dead code.
    ls_parser.set_defaults(func=ls)
52
+
53
+
54
+ def _strip_url(url: str, prefix: str):
55
+ url = url.removeprefix(prefix)
56
+
57
+ if not url.startswith("/") and prefix != "":
58
+ return "/" + url
59
+ return url
60
+
61
+
62
+ def _list_page_urls(sitemap: AbstractSitemap, prefix: str = "") -> Iterator[str]:
63
+ for page in sitemap.all_pages():
64
+ yield prefix + page.url
65
+
66
+
67
def _output_sitemap_nested(
    sitemap: "AbstractSitemap", strip_prefix: str = "", depth: int = 0
):
    """Print *sitemap* to stdout as a tab-indented tree, recursing into children.

    The root sitemap's URL (depth 0) is printed verbatim; every nested sitemap
    and page URL has *strip_prefix* removed.
    """
    url_text = sitemap.url if depth == 0 else _strip_url(sitemap.url, strip_prefix)
    sys.stdout.write(tabs(depth) + url_text + "\n")

    for child in sitemap.sub_sitemaps:
        _output_sitemap_nested(child, strip_prefix, depth + 1)

    child_indent = tabs(depth + 1)
    for page in sitemap.pages:
        sys.stdout.write(child_indent + _strip_url(page.url, strip_prefix) + "\n")
80
+
81
+
82
def _output_pages(sitemap: "AbstractSitemap", strip_prefix: str = ""):
    """Print every page URL in the tree to stdout, one per line, with *strip_prefix* removed."""
    write = sys.stdout.write
    for page in sitemap.all_pages():
        write(_strip_url(page.url, strip_prefix) + "\n")
85
+
86
+
87
def ls(args):
    """Entry point of the ``ls`` subcommand: crawl a site and print its sitemap tree.

    :param args: parsed argparse namespace with ``url``, ``format``,
        ``no_robots``, ``no_known`` and ``strip_url`` attributes.
    """
    tree = sitemap_tree_for_homepage(
        args.url,
        use_robots=not args.no_robots,
        use_known_paths=not args.no_known,
    )

    strip_prefix = ""
    if args.strip_url:
        strip_prefix = tree.url

    if args.format == "pages":
        _output_pages(tree, strip_prefix)
    elif args.format == "tabtree":
        _output_sitemap_nested(tree, strip_prefix)
    else:
        # argparse's `choices` should make this unreachable; guard anyway.
        raise NotImplementedError(f"Format '{args.format}' not implemented")

    # Use sys.exit() rather than the builtin exit(): the latter is injected by
    # the `site` module and is not guaranteed to exist in every environment.
    sys.exit(0)
usp/cli/_util.py ADDED
@@ -0,0 +1,21 @@
1
+ from typing import Dict
2
+
3
+
4
def format_help(choices: Dict[str, str], opt_help: str) -> str:
    """Generate argparse help text listing each choice with its description.

    :param choices: mapping of choice name to its one-line description.
    :param opt_help: help text for the option itself.
    :return: multi-line help text suitable for ``RawTextHelpFormatter``.
    """
    lines = [f"{opt_help} (default: %(default)s)", "choices:"]

    # The dict value is a human-readable description, not a key — name the
    # loop variables accordingly (the old `fmt, key` naming was misleading).
    for choice, description in choices.items():
        lines.append(f"  {choice}: {description}")

    return "\n".join(lines) + "\n"
17
+
18
+
19
def tabs(n: int):
    """Return a string of *n* tab characters (indentation helper)."""
    return n * "\t"
usp/cli/cli.py ADDED
@@ -0,0 +1,27 @@
1
+ from argparse import ArgumentParser
2
+
3
+ from usp.cli import _ls as ls_cmd
4
+ from usp import __version__
5
+
6
+
7
def main():
    """CLI entry point: parse arguments and dispatch to the chosen subcommand."""
    parser = ArgumentParser(prog="usp", description="Ultimate Sitemap Parser")
    parser.add_argument(
        "-v", "--version", action="version", version=f"%(prog)s v{__version__}"
    )

    subparsers = parser.add_subparsers(required=False, title="commands", metavar="")
    ls_cmd.register(subparsers)

    args = parser.parse_args()

    # argparse.Namespace supports `in`; `func` is only present when a
    # subcommand was selected (set via set_defaults(func=...)).
    if "func" in args:
        args.func(args)
    else:
        parser.print_help()

    # The builtin exit() is injected by the `site` module and is not
    # guaranteed to exist; raising SystemExit directly is equivalent and
    # needs no extra import.
    raise SystemExit(0)
24
+
25
+
26
# Allow running this module directly, e.g. `python -m usp.cli.cli`.
if __name__ == "__main__":
    main()
usp/exceptions.py ADDED
@@ -0,0 +1,35 @@
1
+ """Exceptions used by the sitemap parser."""
2
+
3
+
4
class SitemapException(Exception):
    """Fatal problem that prevents the parser from running further,
    e.g. wrong input parameters."""
10
+
11
+
12
class SitemapXMLParsingException(Exception):
    """Raised on XML parsing problems that should be handled gracefully."""
18
+
19
+
20
class GunzipException(Exception):
    """Raised when seemingly gzipped content fails to decompress.

    See :func:`usp.helpers.gunzip`.
    """
27
+
28
+
29
class StripURLToHomepageException(Exception):
    """Raised when a URL cannot be parsed and stripped down to its homepage.

    See :func:`usp.helpers.strip_url_to_homepage`.
    """