sitemap2atom 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sitemap2atom-0.1.0/.flake8 +9 -0
- sitemap2atom-0.1.0/.github/workflows/test.yml +33 -0
- sitemap2atom-0.1.0/.github/workflows/workflow.yml +30 -0
- sitemap2atom-0.1.0/.gitignore +25 -0
- sitemap2atom-0.1.0/CHANGELOG.md +22 -0
- sitemap2atom-0.1.0/CONTRIBUTING.md +55 -0
- sitemap2atom-0.1.0/LICENSE +21 -0
- sitemap2atom-0.1.0/PKG-INFO +130 -0
- sitemap2atom-0.1.0/README.md +102 -0
- sitemap2atom-0.1.0/SECURITY.md +31 -0
- sitemap2atom-0.1.0/pyproject.toml +62 -0
- sitemap2atom-0.1.0/src/sitemap2atom/__init__.py +22 -0
- sitemap2atom-0.1.0/src/sitemap2atom/__main__.py +6 -0
- sitemap2atom-0.1.0/src/sitemap2atom/cli.py +83 -0
- sitemap2atom-0.1.0/src/sitemap2atom/core.py +330 -0
- sitemap2atom-0.1.0/tests/fixtures/sample.html +25 -0
- sitemap2atom-0.1.0/tests/test_core.py +126 -0
- sitemap2atom-0.1.0/uv.lock +803 -0
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
name: Test
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [main]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
fail-fast: false
|
|
14
|
+
matrix:
|
|
15
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
16
|
+
|
|
17
|
+
steps:
|
|
18
|
+
- uses: actions/checkout@v6
|
|
19
|
+
|
|
20
|
+
- name: Install uv
|
|
21
|
+
uses: astral-sh/setup-uv@v8.1.0
|
|
22
|
+
with:
|
|
23
|
+
python-version: ${{ matrix.python-version }}
|
|
24
|
+
enable-cache: true
|
|
25
|
+
|
|
26
|
+
- name: Install dependencies
|
|
27
|
+
run: uv sync
|
|
28
|
+
|
|
29
|
+
- name: Lint
|
|
30
|
+
run: uv run flake8 src tests
|
|
31
|
+
|
|
32
|
+
- name: Run tests
|
|
33
|
+
run: uv run pytest
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
release:
|
|
5
|
+
types: [published]
|
|
6
|
+
|
|
7
|
+
jobs:
|
|
8
|
+
publish:
|
|
9
|
+
runs-on: ubuntu-latest
|
|
10
|
+
environment:
|
|
11
|
+
name: pypi
|
|
12
|
+
url: https://pypi.org/p/sitemap2atom
|
|
13
|
+
permissions:
|
|
14
|
+
# Required for PyPI trusted publishing via OIDC.
|
|
15
|
+
id-token: write
|
|
16
|
+
contents: read
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- uses: actions/checkout@v6
|
|
20
|
+
|
|
21
|
+
- name: Install uv
|
|
22
|
+
uses: astral-sh/setup-uv@v8.1.0
|
|
23
|
+
with:
|
|
24
|
+
python-version: "3.12"
|
|
25
|
+
|
|
26
|
+
- name: Build distributions
|
|
27
|
+
run: uv build
|
|
28
|
+
|
|
29
|
+
- name: Publish to PyPI
|
|
30
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Generated feeds
|
|
2
|
+
*.atom
|
|
3
|
+
enriched_feed.atom
|
|
4
|
+
|
|
5
|
+
# Byte-compiled / optimized
|
|
6
|
+
__pycache__/
|
|
7
|
+
*.py[cod]
|
|
8
|
+
*$py.class
|
|
9
|
+
|
|
10
|
+
# Distribution / packaging
|
|
11
|
+
build/
|
|
12
|
+
dist/
|
|
13
|
+
*.egg-info/
|
|
14
|
+
.eggs/
|
|
15
|
+
|
|
16
|
+
# Test / type-check / coverage caches
|
|
17
|
+
.pytest_cache/
|
|
18
|
+
.mypy_cache/
|
|
19
|
+
.ruff_cache/
|
|
20
|
+
.coverage
|
|
21
|
+
htmlcov/
|
|
22
|
+
|
|
23
|
+
# Virtual environments
|
|
24
|
+
.venv/
|
|
25
|
+
venv/
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to this project will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.1.0] - 2026-06-09
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- Initial public release.
|
|
15
|
+
- `sitemap2atom` command-line tool: fetch an XML sitemap and convert its URLs
|
|
16
|
+
into an enriched Atom feed using OpenGraph and Twitter Card metadata.
|
|
17
|
+
- CLI options: `--output`, `--limit`, `--feed-title`, `--timeout`, `--verbose`.
|
|
18
|
+
- Installable from PyPI and runnable with `uvx sitemap2atom`.
|
|
19
|
+
- Offline test suite covering metadata parsing and Atom feed generation.
|
|
20
|
+
|
|
21
|
+
[Unreleased]: https://github.com/darkflib/sitemap2atom/compare/v0.1.0...HEAD
|
|
22
|
+
[0.1.0]: https://github.com/darkflib/sitemap2atom/releases/tag/v0.1.0
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
# Contributing to sitemap2atom
|
|
2
|
+
|
|
3
|
+
Thanks for your interest in improving sitemap2atom! Contributions of all kinds
|
|
4
|
+
are welcome — bug reports, feature ideas, documentation fixes, and pull requests.
|
|
5
|
+
|
|
6
|
+
## Reporting issues
|
|
7
|
+
|
|
8
|
+
Please open an issue on the
|
|
9
|
+
[issue tracker](https://github.com/darkflib/sitemap2atom/issues) and include:
|
|
10
|
+
|
|
11
|
+
- What you expected to happen and what actually happened.
|
|
12
|
+
- The command you ran (and the sitemap URL, if it can be shared).
|
|
13
|
+
- Your Python version and operating system.
|
|
14
|
+
|
|
15
|
+
## Development setup
|
|
16
|
+
|
|
17
|
+
This project uses [uv](https://docs.astral.sh/uv/) for dependency management.
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
git clone https://github.com/darkflib/sitemap2atom.git
|
|
21
|
+
cd sitemap2atom
|
|
22
|
+
uv sync # create the virtualenv and install runtime + dev deps
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
Run the CLI locally without installing it globally:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
uv run sitemap2atom https://example.com/sitemap.xml --limit 5
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Tests, linting, and types
|
|
32
|
+
|
|
33
|
+
The test suite is offline (no network access needed) and must stay that way so
|
|
34
|
+
it can run in CI:
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
uv run pytest # run the tests
|
|
38
|
+
uv run flake8 src tests # lint
|
|
39
|
+
uv run black --check src tests # check formatting
|
|
40
|
+
uv run isort --check src tests # check import ordering
|
|
41
|
+
uv run mypy src # type-check
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
Please make sure tests and lint pass before opening a pull request, and add
|
|
45
|
+
tests for any new behaviour.
|
|
46
|
+
|
|
47
|
+
## Pull requests
|
|
48
|
+
|
|
49
|
+
1. Fork the repository and create a branch from `main`.
|
|
50
|
+
2. Make your change, with tests and documentation as appropriate.
|
|
51
|
+
3. Add an entry to the **Unreleased** section of [CHANGELOG.md](CHANGELOG.md).
|
|
52
|
+
4. Open a pull request describing the change and the motivation.
|
|
53
|
+
|
|
54
|
+
By contributing, you agree that your contributions will be licensed under the
|
|
55
|
+
[MIT License](LICENSE).
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Mike Preston
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sitemap2atom
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A tool to convert XML sitemaps to Atom feeds
|
|
5
|
+
Project-URL: homepage, https://github.com/darkflib/sitemap2atom
|
|
6
|
+
Project-URL: repository, https://github.com/darkflib/sitemap2atom
|
|
7
|
+
Project-URL: issues, https://github.com/darkflib/sitemap2atom/issues
|
|
8
|
+
Author-email: Mike Preston <darkflib@gmail.com>
|
|
9
|
+
License-Expression: MIT
|
|
10
|
+
License-File: LICENSE
|
|
11
|
+
Keywords: atom,feed,opengraph,rss,sitemap,syndication
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
19
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
21
|
+
Requires-Python: >=3.11
|
|
22
|
+
Requires-Dist: beautifulsoup4>=4.9.3
|
|
23
|
+
Requires-Dist: click>=7.1.2
|
|
24
|
+
Requires-Dist: lxml>=4.6.3
|
|
25
|
+
Requires-Dist: python-dateutil>=2.8.1
|
|
26
|
+
Requires-Dist: requests>=2.25.1
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# sitemap2atom
|
|
30
|
+
|
|
31
|
+
A simple tool to convert an XML sitemap into an [Atom](https://datatracker.ietf.org/doc/html/rfc4287)
|
|
32
|
+
feed — especially useful for sites that don't have a CMS, or where the CMS
|
|
33
|
+
doesn't produce a feed. Each URL in the sitemap is fetched and its OpenGraph and
|
|
34
|
+
Twitter Card metadata (title, description, image, author, dates) is used to build
|
|
35
|
+
a rich Atom entry.
|
|
36
|
+
|
|
37
|
+
## Installation
|
|
38
|
+
|
|
39
|
+
### Run without installing (uvx)
|
|
40
|
+
|
|
41
|
+
The package is published on PyPI, so you can run it directly with
|
|
42
|
+
[uv](https://docs.astral.sh/uv/):
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
uvx sitemap2atom https://example.com/sitemap.xml -o feed.atom
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
To run the latest code straight from GitHub (before a release, or to try `main`):
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
uvx --from git+https://github.com/darkflib/sitemap2atom sitemap2atom https://example.com/sitemap.xml
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Install as a tool / library
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
uv tool install sitemap2atom # installs the `sitemap2atom` command
|
|
58
|
+
# or
|
|
59
|
+
pip install sitemap2atom
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Usage
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
sitemap2atom SITEMAP_URL [OPTIONS]
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
By default the feed is written to standard output; redirect it or use `-o` to
|
|
69
|
+
save it to a file:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
# Print to stdout
|
|
73
|
+
sitemap2atom https://example.com/sitemap.xml
|
|
74
|
+
|
|
75
|
+
# Write to a file, limiting to the first 20 URLs
|
|
76
|
+
sitemap2atom https://example.com/sitemap.xml -o feed.atom --limit 20
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
### Options
|
|
80
|
+
|
|
81
|
+
- `-o, --output PATH` — write the Atom feed to this file (default: stdout).
|
|
82
|
+
- `--limit N` — maximum number of sitemap URLs to process (default: all).
|
|
83
|
+
- `--feed-title TEXT` — title for the generated feed (default: `Enriched URL Feed`).
|
|
84
|
+
- `--timeout SECONDS` — per-request timeout in seconds (default: `10`).
|
|
85
|
+
- `-v, --verbose` — enable info-level logging on stderr.
|
|
86
|
+
- `--version` — show the version and exit.
|
|
87
|
+
|
|
88
|
+
### As a library
|
|
89
|
+
|
|
90
|
+
```python
|
|
91
|
+
from sitemap2atom import fetch_sitemap_urls, enrich_url_list_to_atom, feed_to_pretty_xml
|
|
92
|
+
|
|
93
|
+
urls = fetch_sitemap_urls("https://example.com/sitemap.xml")
|
|
94
|
+
feed = enrich_url_list_to_atom(urls[:10], feed_title="My Feed")
|
|
95
|
+
print(feed_to_pretty_xml(feed))
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
## Example output
|
|
99
|
+
|
|
100
|
+
See this gist for a sample of the kind of enriched Atom feed produced:
|
|
101
|
+
<https://gist.github.com/Darkflib/989b8f3a5a1ea995e8e294669d5e282a>
|
|
102
|
+
|
|
103
|
+
## Limitations
|
|
104
|
+
|
|
105
|
+
This is a simple tool aimed at basic use cases. It does not support
|
|
106
|
+
authentication, sitemap index files / pagination, or dynamic sitemaps, and may
|
|
107
|
+
not handle every sitemap or page format. Treat the sitemap and the pages it
|
|
108
|
+
references as untrusted input, and only run the tool against sources you trust.
|
|
109
|
+
|
|
110
|
+
## Development
|
|
111
|
+
|
|
112
|
+
This project uses [uv](https://docs.astral.sh/uv/).
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
git clone https://github.com/darkflib/sitemap2atom.git
|
|
116
|
+
cd sitemap2atom
|
|
117
|
+
uv sync
|
|
118
|
+
uv run pytest
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for more, and
|
|
122
|
+
[CHANGELOG.md](CHANGELOG.md) for release notes.
|
|
123
|
+
|
|
124
|
+
## License
|
|
125
|
+
|
|
126
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file
|
|
127
|
+
for details.
|
|
128
|
+
|
|
129
|
+
PS. If you do anything interesting with this code, please let me know! I'd love
|
|
130
|
+
to hear about it.
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# sitemap2atom
|
|
2
|
+
|
|
3
|
+
A simple tool to convert an XML sitemap into an [Atom](https://datatracker.ietf.org/doc/html/rfc4287)
|
|
4
|
+
feed — especially useful for sites that don't have a CMS, or where the CMS
|
|
5
|
+
doesn't produce a feed. Each URL in the sitemap is fetched and its OpenGraph and
|
|
6
|
+
Twitter Card metadata (title, description, image, author, dates) is used to build
|
|
7
|
+
a rich Atom entry.
|
|
8
|
+
|
|
9
|
+
## Installation
|
|
10
|
+
|
|
11
|
+
### Run without installing (uvx)
|
|
12
|
+
|
|
13
|
+
The package is published on PyPI, so you can run it directly with
|
|
14
|
+
[uv](https://docs.astral.sh/uv/):
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
uvx sitemap2atom https://example.com/sitemap.xml -o feed.atom
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
To run the latest code straight from GitHub (before a release, or to try `main`):
|
|
21
|
+
|
|
22
|
+
```bash
|
|
23
|
+
uvx --from git+https://github.com/darkflib/sitemap2atom sitemap2atom https://example.com/sitemap.xml
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
### Install as a tool / library
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
uv tool install sitemap2atom # installs the `sitemap2atom` command
|
|
30
|
+
# or
|
|
31
|
+
pip install sitemap2atom
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## Usage
|
|
35
|
+
|
|
36
|
+
```bash
|
|
37
|
+
sitemap2atom SITEMAP_URL [OPTIONS]
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
By default the feed is written to standard output; redirect it or use `-o` to
|
|
41
|
+
save it to a file:
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
# Print to stdout
|
|
45
|
+
sitemap2atom https://example.com/sitemap.xml
|
|
46
|
+
|
|
47
|
+
# Write to a file, limiting to the first 20 URLs
|
|
48
|
+
sitemap2atom https://example.com/sitemap.xml -o feed.atom --limit 20
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Options
|
|
52
|
+
|
|
53
|
+
- `-o, --output PATH` — write the Atom feed to this file (default: stdout).
|
|
54
|
+
- `--limit N` — maximum number of sitemap URLs to process (default: all).
|
|
55
|
+
- `--feed-title TEXT` — title for the generated feed (default: `Enriched URL Feed`).
|
|
56
|
+
- `--timeout SECONDS` — per-request timeout in seconds (default: `10`).
|
|
57
|
+
- `-v, --verbose` — enable info-level logging on stderr.
|
|
58
|
+
- `--version` — show the version and exit.
|
|
59
|
+
|
|
60
|
+
### As a library
|
|
61
|
+
|
|
62
|
+
```python
|
|
63
|
+
from sitemap2atom import fetch_sitemap_urls, enrich_url_list_to_atom, feed_to_pretty_xml
|
|
64
|
+
|
|
65
|
+
urls = fetch_sitemap_urls("https://example.com/sitemap.xml")
|
|
66
|
+
feed = enrich_url_list_to_atom(urls[:10], feed_title="My Feed")
|
|
67
|
+
print(feed_to_pretty_xml(feed))
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
## Example output
|
|
71
|
+
|
|
72
|
+
See this gist for a sample of the kind of enriched Atom feed produced:
|
|
73
|
+
<https://gist.github.com/Darkflib/989b8f3a5a1ea995e8e294669d5e282a>
|
|
74
|
+
|
|
75
|
+
## Limitations
|
|
76
|
+
|
|
77
|
+
This is a simple tool aimed at basic use cases. It does not support
|
|
78
|
+
authentication, sitemap index files / pagination, or dynamic sitemaps, and may
|
|
79
|
+
not handle every sitemap or page format. Treat the sitemap and the pages it
|
|
80
|
+
references as untrusted input, and only run the tool against sources you trust.
|
|
81
|
+
|
|
82
|
+
## Development
|
|
83
|
+
|
|
84
|
+
This project uses [uv](https://docs.astral.sh/uv/).
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
git clone https://github.com/darkflib/sitemap2atom.git
|
|
88
|
+
cd sitemap2atom
|
|
89
|
+
uv sync
|
|
90
|
+
uv run pytest
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for more, and
|
|
94
|
+
[CHANGELOG.md](CHANGELOG.md) for release notes.
|
|
95
|
+
|
|
96
|
+
## License
|
|
97
|
+
|
|
98
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file
|
|
99
|
+
for details.
|
|
100
|
+
|
|
101
|
+
PS. If you do anything interesting with this code, please let me know! I'd love
|
|
102
|
+
to hear about it.
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# Security Policy
|
|
2
|
+
|
|
3
|
+
## Supported versions
|
|
4
|
+
|
|
5
|
+
sitemap2atom is currently pre-1.0. Security fixes are applied to the latest
|
|
6
|
+
released version on PyPI.
|
|
7
|
+
|
|
8
|
+
| Version | Supported |
|
|
9
|
+
| ------- | ------------------ |
|
|
10
|
+
| 0.1.x | :white_check_mark: |
|
|
11
|
+
|
|
12
|
+
## Reporting a vulnerability
|
|
13
|
+
|
|
14
|
+
Please **do not** report security vulnerabilities through public GitHub issues.
|
|
15
|
+
|
|
16
|
+
Instead, report them privately via one of:
|
|
17
|
+
|
|
18
|
+
- GitHub's [private vulnerability reporting](https://github.com/darkflib/sitemap2atom/security/advisories/new)
|
|
19
|
+
("Report a vulnerability" on the Security tab), or
|
|
20
|
+
- email to **darkflib@gmail.com**.
|
|
21
|
+
|
|
22
|
+
Please include a description of the issue, steps to reproduce, and the impact
|
|
23
|
+
you anticipate. You can expect an acknowledgement within a few days. Once the
|
|
24
|
+
issue is confirmed and fixed, a new release will be published and the reporter
|
|
25
|
+
credited (unless anonymity is requested).
|
|
26
|
+
|
|
27
|
+
## Scope
|
|
28
|
+
|
|
29
|
+
sitemap2atom fetches arbitrary URLs listed in a sitemap and parses the returned
|
|
30
|
+
HTML. Treat sitemaps and the pages they reference as untrusted input, and run
|
|
31
|
+
the tool only against sources you trust.
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "sitemap2atom"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A tool to convert XML sitemaps to Atom feeds"
|
|
5
|
+
authors = [{name = "Mike Preston", email = "darkflib@gmail.com"}]
|
|
6
|
+
license = "MIT"
|
|
7
|
+
license-files = ["LICENSE"]
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
requires-python = ">=3.11"
|
|
10
|
+
keywords = ["sitemap", "atom", "feed", "opengraph", "syndication", "rss"]
|
|
11
|
+
classifiers = [
|
|
12
|
+
"Development Status :: 4 - Beta",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"Operating System :: OS Independent",
|
|
15
|
+
"Programming Language :: Python :: 3",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
20
|
+
"Topic :: Text Processing :: Markup :: XML",
|
|
21
|
+
]
|
|
22
|
+
|
|
23
|
+
dependencies = [
|
|
24
|
+
"requests>=2.25.1",
|
|
25
|
+
"lxml>=4.6.3",
|
|
26
|
+
"beautifulsoup4>=4.9.3",
|
|
27
|
+
"python-dateutil>=2.8.1",
|
|
28
|
+
"click>=7.1.2",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
[project.urls]
|
|
32
|
+
homepage = "https://github.com/darkflib/sitemap2atom"
|
|
33
|
+
repository = "https://github.com/darkflib/sitemap2atom"
|
|
34
|
+
issues = "https://github.com/darkflib/sitemap2atom/issues"
|
|
35
|
+
|
|
36
|
+
[project.scripts]
|
|
37
|
+
sitemap2atom = "sitemap2atom.cli:main"
|
|
38
|
+
|
|
39
|
+
[build-system]
|
|
40
|
+
requires = ["hatchling"]
|
|
41
|
+
build-backend = "hatchling.build"
|
|
42
|
+
|
|
43
|
+
[tool.hatch.build.targets.wheel]
|
|
44
|
+
packages = ["src/sitemap2atom"]
|
|
45
|
+
|
|
46
|
+
[dependency-groups]
|
|
47
|
+
dev = [
|
|
48
|
+
"mypy>=1.16.0",
|
|
49
|
+
"pytest>=6.2.4",
|
|
50
|
+
"black>=21.7b0",
|
|
51
|
+
"isort>=5.9.3",
|
|
52
|
+
"flake8>=3.9.2",
|
|
53
|
+
"pre-commit>=2.13.0",
|
|
54
|
+
"coverage>=5.5",
|
|
55
|
+
"types-requests>=2.25.1",
|
|
56
|
+
"types-beautifulsoup4>=4.9.3",
|
|
57
|
+
"types-python-dateutil>=2.8.1",
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
[tool.pytest.ini_options]
|
|
61
|
+
testpaths = ["tests"]
|
|
62
|
+
pythonpath = ["src"]
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""sitemap2atom: convert an XML sitemap into an enriched Atom feed."""
|
|
2
|
+
|
|
3
|
+
__version__ = "0.1.0"
|
|
4
|
+
|
|
5
|
+
from .core import (
|
|
6
|
+
enrich_atom_entry,
|
|
7
|
+
enrich_url_list_to_atom,
|
|
8
|
+
extract_metadata,
|
|
9
|
+
feed_to_pretty_xml,
|
|
10
|
+
fetch_sitemap_urls,
|
|
11
|
+
parse_metadata,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"__version__",
|
|
16
|
+
"enrich_atom_entry",
|
|
17
|
+
"enrich_url_list_to_atom",
|
|
18
|
+
"extract_metadata",
|
|
19
|
+
"feed_to_pretty_xml",
|
|
20
|
+
"fetch_sitemap_urls",
|
|
21
|
+
"parse_metadata",
|
|
22
|
+
]
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Command-line interface for sitemap2atom."""
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
import click
|
|
7
|
+
import requests
|
|
8
|
+
|
|
9
|
+
from . import __version__
|
|
10
|
+
from .core import (
|
|
11
|
+
DEFAULT_FEED_TITLE,
|
|
12
|
+
enrich_url_list_to_atom,
|
|
13
|
+
feed_to_pretty_xml,
|
|
14
|
+
fetch_sitemap_urls,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Entry point for the ``sitemap2atom`` console script. click renders the
# function docstring as the ``--help`` text, so developer-facing notes live in
# comments rather than in the docstring.
@click.command()
@click.argument("sitemap_url")
@click.option(
    "-o",
    "--output",
    type=click.Path(dir_okay=False, writable=True),
    default=None,
    help="Write the Atom feed to this file (default: stdout).",
)
@click.option(
    "--limit",
    # Reject 0 and negative values up front: a negative slice bound would
    # silently drop URLs from the *end* of the list, and 0 would surface as a
    # misleading "No <loc> URLs found" error.
    type=click.IntRange(min=1),
    default=None,
    help="Maximum number of sitemap URLs to process (default: all).",
)
@click.option(
    "--feed-title",
    default=DEFAULT_FEED_TITLE,
    show_default=True,
    help="Title for the generated Atom feed.",
)
@click.option(
    "--timeout",
    # A non-positive per-request timeout is never meaningful.
    type=click.IntRange(min=1),
    default=10,
    show_default=True,
    help="Per-request timeout in seconds.",
)
@click.option("-v", "--verbose", is_flag=True, help="Enable info-level logging.")
@click.version_option(__version__, prog_name="sitemap2atom")
def main(sitemap_url, output, limit, feed_title, timeout, verbose):
    """Convert the XML sitemap at SITEMAP_URL into an enriched Atom feed.

    Each URL in the sitemap is fetched and its OpenGraph/Twitter metadata is
    used to build a rich Atom entry (title, summary, image, author, dates).
    """
    # Log to stderr so that stdout carries nothing but the feed XML.
    logging.basicConfig(
        level=logging.INFO if verbose else logging.WARNING,
        format="%(levelname)s: %(message)s",
        stream=sys.stderr,
    )

    try:
        urls = fetch_sitemap_urls(sitemap_url, timeout=timeout)
    except requests.RequestException as e:
        # Chain the cause so the original network error stays inspectable.
        raise click.ClickException(
            f"Failed to fetch sitemap {sitemap_url}: {e}"
        ) from e

    if limit is not None:
        urls = urls[:limit]

    if not urls:
        raise click.ClickException(f"No <loc> URLs found in sitemap: {sitemap_url}")

    feed = enrich_url_list_to_atom(urls, feed_title=feed_title, timeout=timeout)
    xml = feed_to_pretty_xml(feed)

    if output:
        # click.Path does not validate paths that do not exist yet (e.g. a
        # missing parent directory), so report write failures as a clean CLI
        # error instead of a traceback.
        try:
            with open(output, "w", encoding="utf-8") as f:
                f.write(xml + "\n")
        except OSError as e:
            raise click.ClickException(f"Failed to write {output}: {e}") from e
        click.echo(f"Wrote {output}", err=True)
    else:
        click.echo(xml)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__":
|
|
83
|
+
main()
|