ultimate-sitemap-parser 1.0.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ultimate-sitemap-parser might be problematic. Click here for more details.
- ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE +674 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA +109 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE +12 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD +22 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL +4 -0
- ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt +3 -0
- usp/__init__.py +5 -0
- usp/cli/__init__.py +1 -0
- usp/cli/_ls.py +105 -0
- usp/cli/_util.py +21 -0
- usp/cli/cli.py +27 -0
- usp/exceptions.py +35 -0
- usp/fetch_parse.py +1182 -0
- usp/helpers.py +293 -0
- usp/log.py +77 -0
- usp/objects/__init__.py +0 -0
- usp/objects/page.py +451 -0
- usp/objects/sitemap.py +436 -0
- usp/tree.py +114 -0
- usp/web_client/__init__.py +0 -0
- usp/web_client/abstract_client.py +189 -0
- usp/web_client/requests_client.py +150 -0
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: ultimate-sitemap-parser
|
|
3
|
+
Version: 1.0.0rc1
|
|
4
|
+
Summary: A performant library for parsing and crawling sitemaps
|
|
5
|
+
Home-page: https://ultimate-sitemap-parser.readthedocs.io/
|
|
6
|
+
License: GPL-3.0-or-later
|
|
7
|
+
Keywords: sitemap,crawler,indexing,xml,rss,atom,google news
|
|
8
|
+
Author: Linas Valiukas
|
|
9
|
+
Author-email: linas@media.mit.edu
|
|
10
|
+
Maintainer: Freddy Heppell
|
|
11
|
+
Maintainer-email: f.heppell@sheffield.ac.uk
|
|
12
|
+
Requires-Python: >=3.8,<4.0
|
|
13
|
+
Classifier: Development Status :: 5 - Production/Stable
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Intended Audience :: Information Technology
|
|
16
|
+
Classifier: License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
25
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
26
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
27
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
28
|
+
Requires-Dist: python-dateutil (>=2.7,<3.0.0)
|
|
29
|
+
Requires-Dist: requests (>=2.2.1)
|
|
30
|
+
Project-URL: Documentation, https://ultimate-sitemap-parser.readthedocs.io/
|
|
31
|
+
Project-URL: Repository, https://github.com/GateNLP/ultimate-sitemap-parser
|
|
32
|
+
Description-Content-Type: text/x-rst
|
|
33
|
+
|
|
34
|
+
Ultimate Sitemap Parser
|
|
35
|
+
-----------------------
|
|
36
|
+
|
|
37
|
+
.. image:: https://img.shields.io/pypi/pyversions/ultimate-sitemap-parser
|
|
38
|
+
:alt: PyPI - Python Version
|
|
39
|
+
:target: https://github.com/GateNLP/ultimate-sitemap-parser
|
|
40
|
+
|
|
41
|
+
.. image:: https://img.shields.io/pypi/v/ultimate-sitemap-parser
|
|
42
|
+
:alt: PyPI - Version
|
|
43
|
+
:target: https://pypi.org/project/ultimate-sitemap-parser/
|
|
44
|
+
|
|
45
|
+
.. image:: https://img.shields.io/conda/vn/conda-forge/ultimate-sitemap-parser
|
|
46
|
+
:alt: Conda Version
|
|
47
|
+
:target: https://anaconda.org/conda-forge/ultimate-sitemap-parser
|
|
48
|
+
|
|
49
|
+
.. image:: https://img.shields.io/pepy/dt/ultimate-sitemap-parser
|
|
50
|
+
:target: https://pepy.tech/project/ultimate-sitemap-parser
|
|
51
|
+
:alt: Pepy Total Downloads
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
**Ultimate Sitemap Parser (USP) is a performant and robust Python library for parsing and crawling sitemaps.**
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
Features
|
|
58
|
+
========
|
|
59
|
+
|
|
60
|
+
- Supports all sitemap formats:
|
|
61
|
+
|
|
62
|
+
- `XML sitemaps <https://www.sitemaps.org/protocol.html#xmlTagDefinitions>`_
|
|
63
|
+
- `Google News sitemaps <https://developers.google.com/search/docs/crawling-indexing/sitemaps/news-sitemap>`_ and `Image sitemaps <https://developers.google.com/search/docs/advanced/sitemaps/image-sitemaps>`_
|
|
64
|
+
- `plain text sitemaps <https://www.sitemaps.org/protocol.html#otherformats>`_
|
|
65
|
+
- `RSS 2.0 / Atom 0.3 / Atom 1.0 sitemaps <https://www.sitemaps.org/protocol.html#otherformats>`_
|
|
66
|
+
- `Sitemaps linked from robots.txt <https://developers.google.com/search/reference/robots_txt#sitemap>`_
|
|
67
|
+
|
|
68
|
+
- Field-tested with ~1 million URLs as part of the `Media Cloud project <https://mediacloud.org/>`_
|
|
69
|
+
- Error-tolerant of common sitemap bugs
|
|
70
|
+
- Tries to find sitemaps not listed in ``robots.txt``
|
|
71
|
+
- Uses fast and memory efficient Expat XML parsing
|
|
72
|
+
- Doesn't consume much memory even with massive sitemap hierarchies
|
|
73
|
+
- Provides a generated sitemap tree as easy to use object tree
|
|
74
|
+
- Supports using a custom web client
|
|
75
|
+
- Uses a small number of actively maintained third-party modules
|
|
76
|
+
- Reasonably tested
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
Installation
|
|
80
|
+
============
|
|
81
|
+
|
|
82
|
+
.. code:: sh
|
|
83
|
+
|
|
84
|
+
pip install ultimate-sitemap-parser
|
|
85
|
+
|
|
86
|
+
or using Anaconda:
|
|
87
|
+
|
|
88
|
+
.. code:: sh
|
|
89
|
+
|
|
90
|
+
conda install -c conda-forge ultimate-sitemap-parser
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
Usage
|
|
94
|
+
=====
|
|
95
|
+
|
|
96
|
+
.. code:: python
|
|
97
|
+
|
|
98
|
+
from usp.tree import sitemap_tree_for_homepage
|
|
99
|
+
|
|
100
|
+
tree = sitemap_tree_for_homepage('https://www.example.org/')
|
|
101
|
+
|
|
102
|
+
for page in tree.all_pages():
|
|
103
|
+
print(page.url)
|
|
104
|
+
|
|
105
|
+
``sitemap_tree_for_homepage()`` will return a tree of ``AbstractSitemap`` subclass objects that represent the sitemap
|
|
106
|
+
hierarchy found on the website; see a `reference of AbstractSitemap subclasses <https://ultimate-sitemap-parser.readthedocs.io/en/latest/reference/api/usp.objects.sitemap.html>`_. ``AbstractSitemap.all_pages()`` returns a generator to efficiently iterate over pages without loading the entire tree into memory.
|
|
107
|
+
|
|
108
|
+
For more examples and details, see the `documentation <https://ultimate-sitemap-parser.readthedocs.io/en/latest/>`_.
|
|
109
|
+
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Copyright (C) 2018 Linas Valiukas, Hal Roberts, Media Cloud project,
|
|
2
|
+
Freddy Heppell, The University of Sheffield, and other contributors.
|
|
3
|
+
|
|
4
|
+
This program is free software: you can redistribute it and/or modify
|
|
5
|
+
it under the terms of the GNU General Public License as published by
|
|
6
|
+
the Free Software Foundation, either version 3 of the License, or
|
|
7
|
+
any later version.
|
|
8
|
+
|
|
9
|
+
This program is distributed in the hope that it will be useful,
|
|
10
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
11
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
12
|
+
GNU General Public License for more details <http://www.gnu.org/licenses/>.
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
usp/__init__.py,sha256=_jshbOBBUHRZ5ko4SdI7GRFiF9xKGJVCEPgL9lZJ81o,124
|
|
2
|
+
usp/cli/__init__.py,sha256=mGrjSftUYfM2SGp9yEN2dTJndl5thOdv77-EAe6ocWo,37
|
|
3
|
+
usp/cli/_ls.py,sha256=YyDmtBjK02_26Qv8-3NLf87b1C4Wt0GzZ1XkdF2fllQ,2954
|
|
4
|
+
usp/cli/_util.py,sha256=UL5WiRZlpiDOI_QvSU1PdjcS6iCmfcLQlO1Mm1wjSAw,505
|
|
5
|
+
usp/cli/cli.py,sha256=D1tXZyhiG0sIwtepdPdglW5gUlPWyx4LNeBmaM700Yc,592
|
|
6
|
+
usp/exceptions.py,sha256=9KTgnocYYZCfyaCf9BrBN7Ok4cwn7_DlrNFbhUfFsGM,634
|
|
7
|
+
usp/fetch_parse.py,sha256=aE0ohttrce_gcZE8iBFvjy6iRJEEij0SzIOX3PEsDXw,40571
|
|
8
|
+
usp/helpers.py,sha256=QJH3ETapqqbwRnjX_LM0EhWqeta9LTqVvW5OAkBKUOc,8491
|
|
9
|
+
usp/log.py,sha256=BS0AtURK62TPGVqEuIu8kwGtIJDYoGsK5_N-b60VOpE,1631
|
|
10
|
+
usp/objects/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
|
+
usp/objects/page.py,sha256=PNuYGburGu6AqoIdI8NdNd_6XG7nyyG8qlXKekuIinY,13483
|
|
12
|
+
usp/objects/sitemap.py,sha256=XWgke1SJNA79qnOEvaY2nJbnlidWxqBvfuRcF4GhBHI,11564
|
|
13
|
+
usp/tree.py,sha256=2cuHOpdYX5aKZ4XuUQPaKjILnoPnFKZwPNn0g8cxT18,4066
|
|
14
|
+
usp/web_client/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
15
|
+
usp/web_client/abstract_client.py,sha256=4OdmM9UEjK3LUYZCRwYVDrmapKkPXHR7fYch47W5xys,5381
|
|
16
|
+
usp/web_client/requests_client.py,sha256=N2nY15Aj9wd-qF0ujS4-YvMWaRntfWnX8ze5I_h7FnI,5079
|
|
17
|
+
ultimate_sitemap_parser-1.0.0rc1.dist-info/LICENSE,sha256=ixuiBLtpoK3iv89l7ylKkg9rs2GzF9ukPH7ynZYzK5s,35148
|
|
18
|
+
ultimate_sitemap_parser-1.0.0rc1.dist-info/METADATA,sha256=af_IqHD8nO8DzsiiG17faK52sMh6Br0imarNxZthhmY,4385
|
|
19
|
+
ultimate_sitemap_parser-1.0.0rc1.dist-info/NOTICE,sha256=3ANZA5R9rYnCOnUoroGfFUOZ__ww_yG01NUAx0X6J7E,632
|
|
20
|
+
ultimate_sitemap_parser-1.0.0rc1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
|
21
|
+
ultimate_sitemap_parser-1.0.0rc1.dist-info/entry_points.txt,sha256=v60w5WzqYlPOucntZUy0ydzlYwuAPSwoQY0KdT5ragQ,36
|
|
22
|
+
ultimate_sitemap_parser-1.0.0rc1.dist-info/RECORD,,
|
usp/__init__.py
ADDED
usp/cli/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from usp.cli.cli import main as main
|
usp/cli/_ls.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
from typing import Iterator
|
|
4
|
+
|
|
5
|
+
from usp.cli._util import tabs, format_help
|
|
6
|
+
from usp.objects.sitemap import AbstractSitemap
|
|
7
|
+
from usp.tree import sitemap_tree_for_homepage
|
|
8
|
+
|
|
9
|
+
# Output formats supported by the ``ls`` subcommand: maps the ``--format``
# choice name to the description shown in the generated help text.
LS_FORMATS = {
    "tabtree": "Sitemaps and pages, nested with tab indentation",
    "pages": "Flat list of pages, one per line",
}
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def register(subparsers):
    """Register the ``ls`` subcommand on the CLI's subparsers object.

    :param subparsers: The action object returned by
        ``ArgumentParser.add_subparsers()``.
    """
    ls_parser = subparsers.add_parser(
        "ls",
        help="List sitemap pages",
        description="download, parse and list the sitemap structure",
        # RawTextHelpFormatter preserves the newlines emitted by format_help()
        formatter_class=argparse.RawTextHelpFormatter,
    )
    ls_parser.add_argument("url", type=str, help="URL of the site including protocol")
    ls_parser.add_argument(
        "-f",
        "--format",
        choices=LS_FORMATS,
        default="tabtree",
        help=format_help(LS_FORMATS, "set output format"),
        # Hide the choice list from the usage line; format_help() already lists them
        metavar="",
    )
    ls_parser.add_argument(
        "-r",
        "--no-robots",
        action="store_true",
        help="don't discover sitemaps through robots.txt",
    )
    ls_parser.add_argument(
        "-k",
        "--no-known",
        action="store_true",
        help="don't discover sitemaps through well-known URLs",
    )
    ls_parser.add_argument(
        "-u",
        "--strip-url",
        action="store_true",
        help="strip the supplied URL from each page and sitemap URL",
    )

    # NOTE: store_true actions already default to False, so only the dispatch
    # callback needs to be registered explicitly.
    ls_parser.set_defaults(func=ls)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _strip_url(url: str, prefix: str):
|
|
55
|
+
url = url.removeprefix(prefix)
|
|
56
|
+
|
|
57
|
+
if not url.startswith("/") and prefix != "":
|
|
58
|
+
return "/" + url
|
|
59
|
+
return url
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _list_page_urls(sitemap: AbstractSitemap, prefix: str = "") -> Iterator[str]:
    """Yield the URL of every page in the sitemap tree, prepended with *prefix*."""
    yield from (prefix + page.url for page in sitemap.all_pages())
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _output_sitemap_nested(
    sitemap: AbstractSitemap, strip_prefix: str = "", depth: int = 0
):
    """Print the sitemap tree recursively, one URL per line, tab-indented.

    The root sitemap (depth 0) keeps its full URL; nested sitemap and page
    URLs have *strip_prefix* removed.
    """
    out = sys.stdout.write

    # Keep the root's full URL so the site being listed stays identifiable.
    url = sitemap.url if depth == 0 else _strip_url(sitemap.url, strip_prefix)
    out(tabs(depth) + url + "\n")

    for child in sitemap.sub_sitemaps:
        _output_sitemap_nested(child, strip_prefix, depth + 1)

    for page in sitemap.pages:
        out(tabs(depth + 1) + _strip_url(page.url, strip_prefix) + "\n")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _output_pages(sitemap: AbstractSitemap, strip_prefix: str = ""):
    """Print a flat list of page URLs, one per line, with *strip_prefix* removed."""
    sys.stdout.writelines(
        _strip_url(page.url, strip_prefix) + "\n" for page in sitemap.all_pages()
    )
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def ls(args):
    """Handle the ``ls`` subcommand: crawl the site's sitemaps and print them.

    :param args: Parsed argparse namespace with ``url``, ``format``,
        ``no_robots``, ``no_known`` and ``strip_url`` attributes.
    :raises NotImplementedError: If ``args.format`` names an unknown format.
    """
    tree = sitemap_tree_for_homepage(
        args.url,
        use_robots=not args.no_robots,
        use_known_paths=not args.no_known,
    )

    # When requested, strip the (normalised) homepage URL from every output URL.
    strip_prefix = ""
    if args.strip_url:
        strip_prefix = tree.url

    if args.format == "pages":
        _output_pages(tree, strip_prefix)
    elif args.format == "tabtree":
        _output_sitemap_nested(tree, strip_prefix)
    else:
        raise NotImplementedError(f"Format '{args.format}' not implemented")

    # Use sys.exit() rather than the site-module exit() helper, which is
    # intended for interactive sessions and is not guaranteed to be available.
    sys.exit(0)
|
usp/cli/_util.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Dict
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def format_help(choices: Dict[str, str], opt_help: str) -> str:
    """Generate help text for argparse choices.

    :param choices: Dictionary of choices {choice: help}
    :param opt_help: Help text for the option
    :return: Help text for argparse choices.
    """
    lines = [f"{opt_help} (default: %(default)s)", "choices:"]
    lines.extend(f"  {name}: {desc}" for name, desc in choices.items())
    return "\n".join(lines) + "\n"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def tabs(n: int):
    """Return a string of *n* tab characters for indentation."""
    return n * "\t"
|
usp/cli/cli.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
from argparse import ArgumentParser
|
|
2
|
+
|
|
3
|
+
from usp.cli import _ls as ls_cmd
|
|
4
|
+
from usp import __version__
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
def main():
    """Entry point for the ``usp`` command line interface.

    Builds the argument parser, registers the subcommands, then dispatches to
    the chosen subcommand's handler — or prints help if none was given.
    """
    parser = ArgumentParser(prog="usp", description="Ultimate Sitemap Parser")
    parser.add_argument(
        "-v", "--version", action="version", version=f"%(prog)s v{__version__}"
    )

    # metavar="" hides the "{ls,...}" choice list from the usage line.
    subparsers = parser.add_subparsers(required=False, title="commands", metavar="")
    ls_cmd.register(subparsers)

    args = parser.parse_args()

    # argparse.Namespace supports "in"; "func" is only set by set_defaults()
    # when a subcommand was selected.
    if "func" in args:
        args.func(args)
    else:
        parser.print_help()

    # Raise SystemExit directly instead of calling the site-module exit()
    # helper, which is meant for interactive use and may not be available.
    raise SystemExit(0)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Allow invoking the CLI module directly, e.g. ``python -m usp.cli.cli``.
if __name__ == "__main__":
    main()
|
usp/exceptions.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Exceptions used by the sitemap parser."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class SitemapException(Exception):
    """
    Raised for problems due to which we can't run further, e.g. wrong input
    parameters.
    """
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SitemapXMLParsingException(Exception):
    """
    Raised for XML parsing errors that are to be handled gracefully.
    """
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class GunzipException(Exception):
    """
    Raised when seemingly gzipped content fails to decompress.

    See :func:`usp.helpers.gunzip`.
    """
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class StripURLToHomepageException(Exception):
    """
    Raised when a URL can't be parsed and stripped down to its homepage.

    See :func:`usp.helpers.strip_url_to_homepage`.
    """
|