PyPI - sitemap2atom - Versions diffs - 0.1.0__py3-none-any.whl - Mend

sitemap2atom 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

sitemap2atom/__init__.py +22 -0
sitemap2atom/__main__.py +6 -0
sitemap2atom/cli.py +83 -0
sitemap2atom/core.py +330 -0
sitemap2atom-0.1.0.dist-info/METADATA +130 -0
sitemap2atom-0.1.0.dist-info/RECORD +9 -0
sitemap2atom-0.1.0.dist-info/WHEEL +4 -0
sitemap2atom-0.1.0.dist-info/entry_points.txt +2 -0
sitemap2atom-0.1.0.dist-info/licenses/LICENSE +21 -0

sitemap2atom/__init__.py ADDED Viewed

@@ -0,0 +1,22 @@
+"""sitemap2atom: convert an XML sitemap into an enriched Atom feed."""
+__version__ = "0.1.0"
+from .core import (
+    enrich_atom_entry,
+    enrich_url_list_to_atom,
+    extract_metadata,
+    feed_to_pretty_xml,
+    fetch_sitemap_urls,
+    parse_metadata,
+)
+__all__ = [
+    "__version__",
+    "enrich_atom_entry",
+    "enrich_url_list_to_atom",
+    "extract_metadata",
+    "feed_to_pretty_xml",
+    "fetch_sitemap_urls",
+    "parse_metadata",
+]

sitemap2atom/__main__.py ADDED Viewed

@@ -0,0 +1,6 @@
+"""Enable ``python -m sitemap2atom``."""
+from .cli import main
+if __name__ == "__main__":
+    main()

sitemap2atom/cli.py ADDED Viewed

@@ -0,0 +1,83 @@
+"""Command-line interface for sitemap2atom."""
+import logging
+import sys
+import click
+import requests
+from . import __version__
+from .core import (
+    DEFAULT_FEED_TITLE,
+    enrich_url_list_to_atom,
+    feed_to_pretty_xml,
+    fetch_sitemap_urls,
+)
+@click.command()
+@click.argument("sitemap_url")
+@click.option(
+    "-o",
+    "--output",
+    type=click.Path(dir_okay=False, writable=True),
+    default=None,
+    help="Write the Atom feed to this file (default: stdout).",
+)
+@click.option(
+    "--limit",
+    # A negative value would slice from the end of the list (urls[:-n]) and
+    # zero would always end in the "No <loc> URLs found" error, so only
+    # positive counts are accepted.
+    type=click.IntRange(min=1),
+    default=None,
+    help="Maximum number of sitemap URLs to process (default: all).",
+)
+@click.option(
+    "--feed-title",
+    default=DEFAULT_FEED_TITLE,
+    show_default=True,
+    help="Title for the generated Atom feed.",
+)
+@click.option(
+    "--timeout",
+    # Non-positive timeouts are rejected by the HTTP stack with a ValueError,
+    # which the RequestException handler below would not catch.
+    type=click.IntRange(min=1),
+    default=10,
+    show_default=True,
+    help="Per-request timeout in seconds.",
+)
+@click.option("-v", "--verbose", is_flag=True, help="Enable info-level logging.")
+@click.version_option(__version__, prog_name="sitemap2atom")
+def main(sitemap_url, output, limit, feed_title, timeout, verbose):
+    """Convert the XML sitemap at SITEMAP_URL into an enriched Atom feed.
+    Each URL in the sitemap is fetched and its OpenGraph/Twitter metadata is
+    used to build a rich Atom entry (title, summary, image, author, dates).
+    """
+    # NOTE: the docstring above doubles as the ``--help`` text shown by click,
+    # so it is kept user-facing; implementation notes live in comments.
+    # Diagnostics go to stderr so that stdout carries only the feed XML.
+    logging.basicConfig(
+        level=logging.INFO if verbose else logging.WARNING,
+        format="%(levelname)s: %(message)s",
+        stream=sys.stderr,
+    )
+    try:
+        urls = fetch_sitemap_urls(sitemap_url, timeout=timeout)
+    except requests.RequestException as e:
+        # Chain the cause so the original network error stays inspectable.
+        raise click.ClickException(
+            f"Failed to fetch sitemap {sitemap_url}: {e}"
+        ) from e
+    if limit is not None:
+        urls = urls[:limit]
+    if not urls:
+        raise click.ClickException(f"No <loc> URLs found in sitemap: {sitemap_url}")
+    feed = enrich_url_list_to_atom(urls, feed_title=feed_title, timeout=timeout)
+    xml = feed_to_pretty_xml(feed)
+    if output:
+        try:
+            with open(output, "w", encoding="utf-8") as f:
+                f.write(xml + "\n")
+        except OSError as e:
+            # Report write failures as a clean CLI error, not a traceback.
+            raise click.ClickException(f"Failed to write {output}: {e}") from e
+        click.echo(f"Wrote {output}", err=True)
+    else:
+        click.echo(xml)
+if __name__ == "__main__":
+    main()

sitemap2atom/core.py ADDED Viewed

@@ -0,0 +1,330 @@
+"""Core sitemap-to-Atom conversion logic.
+The functions here are split so that the HTML/XML parsing is pure (no network)
+and therefore unit-testable, while the network-facing helpers wrap them.
+"""
+import logging
+import re
+import uuid
+from datetime import datetime
+from urllib.parse import urljoin, urlparse
+from xml.dom import minidom
+from xml.etree.ElementTree import Element, SubElement, tostring
+import dateutil.parser
+import requests
+from bs4 import BeautifulSoup
+logger = logging.getLogger(__name__)
+# A browser-like User-Agent; some sites reject the default requests UA.
+USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+    "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+)
+ATOM_NS = "http://www.w3.org/2005/Atom"
+DEFAULT_FEED_TITLE = "Enriched URL Feed"
+def parse_metadata(html, url):
+    """Extract Twitter and OpenGraph metadata from an HTML document.
+    This is the pure, network-free half of :func:`extract_metadata` and can be
+    tested against static HTML.
+    Args:
+        html (str | bytes): The raw HTML to parse.
+        url (str): The URL the HTML came from (used to resolve relative image
+            URLs and as a fallback site name).
+    Returns:
+        dict: Dictionary with the keys ``url``, ``title``, ``description``,
+        ``image``, ``site_name`` (each ``None`` when nothing was found, except
+        ``url``) plus the raw ``twitter`` and ``opengraph`` mappings.
+    """
+    soup = BeautifulSoup(html, "html.parser")
+    metadata = {
+        "url": url,
+        "title": None,
+        "description": None,
+        "image": None,
+        "site_name": None,
+        "twitter": {},
+        "opengraph": {},
+    }
+    # Extract OpenGraph metadata
+    # Collect both `og:*` and `article:*` properties. The `og:` prefix is
+    # stripped (e.g. og:title -> title), but `article:*` keys are kept intact
+    # because enrich_atom_entry() looks them up by their full name
+    # (article:published_time, article:modified_time, article:author).
+    og_tags = soup.find_all("meta", property=re.compile(r"^(og|article):"))
+    for tag in og_tags:
+        prop = tag.get("property", "")
+        content = tag.get("content", "")
+        if not prop or not content:
+            continue
+        if prop.startswith("og:"):
+            prop = prop[len("og:"):]
+        # A property may legitimately appear several times (e.g. multiple
+        # og:image tags). The Open Graph protocol gives the first occurrence
+        # preference, so keep it instead of letting later tags overwrite it.
+        metadata["opengraph"].setdefault(prop, content)
+    # Extract Twitter metadata (a repeated name still overwrites earlier ones)
+    twitter_tags = soup.find_all("meta", attrs={"name": re.compile(r"^twitter:")})
+    for tag in twitter_tags:
+        name = tag.get("name", "").replace("twitter:", "")
+        content = tag.get("content", "")
+        if name and content:
+            metadata["twitter"][name] = content
+    # Populate main fields from OG or Twitter data, falling back to the
+    # document's own <title> / <meta name="description">.
+    title_tag = soup.find("title")
+    metadata["title"] = (
+        metadata["opengraph"].get("title")
+        or metadata["twitter"].get("title")
+        or (title_tag.get_text().strip() if title_tag else None)
+    )
+    metadata["description"] = (
+        metadata["opengraph"].get("description")
+        or metadata["twitter"].get("description")
+        or (soup.find("meta", attrs={"name": "description"}) or {}).get("content")
+    )
+    # Handle image URLs (make absolute if relative)
+    image_url = metadata["opengraph"].get("image") or metadata["twitter"].get("image")
+    if image_url:
+        metadata["image"] = urljoin(url, image_url)
+    metadata["site_name"] = (
+        metadata["opengraph"].get("site_name")
+        or metadata["twitter"].get("site")
+        or urlparse(url).netloc
+    )
+    return metadata
+def extract_metadata(url, timeout=10):
+    """Download ``url`` and return its Twitter/OpenGraph metadata.
+    Failures are reported through the return value rather than raised, so a
+    caller can test for an ``"error"`` key and move on to the next URL.
+    Args:
+        url (str): Page to inspect. ``https://`` is prepended when the value
+            starts with neither ``http://`` nor ``https://``.
+        timeout (int): Request timeout in seconds.
+    Returns:
+        dict: The result of :func:`parse_metadata` on success, otherwise
+        ``{'error': ..., 'url': ...}`` describing the failed request or parse.
+    """
+    try:
+        # Bare host names / paths are assumed to be reachable over HTTPS.
+        has_scheme = url.startswith(("http://", "https://"))
+        if not has_scheme:
+            url = "https://" + url
+        request_headers = {"User-Agent": USER_AGENT}
+        page = requests.get(url, headers=request_headers, timeout=timeout)
+        page.raise_for_status()
+        return parse_metadata(page.content, url)
+    except requests.RequestException as exc:
+        # Network-level problems and non-2xx responses.
+        return {"error": f"Request failed: {str(exc)}", "url": url}
+    except Exception as exc:
+        # Anything else is reported as a parsing failure.
+        return {"error": f"Parsing failed: {str(exc)}", "url": url}
+def _utcnow_iso():
+    """Return the current UTC time as an Atom-friendly ISO 8601 string.
+    The value has second precision and a literal ``Z`` suffix, e.g.
+    ``2024-01-31T12:00:00Z``.
+    Returns:
+        str: The current UTC timestamp.
+    """
+    # Imported here so the fix does not depend on the module-level
+    # ``from datetime import datetime`` line also importing ``timezone``.
+    from datetime import timezone
+    # ``datetime.now()`` without a tz is *local* time; labelling that with
+    # "Z" would be wrong on any machine whose clock is not set to UTC.
+    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+def enrich_atom_entry(metadata, base_entry=None):
+    """Create or enrich an Atom ``<entry>`` element from extracted metadata.
+    ``<title>``, ``<summary>`` and ``<updated>`` are reused when the entry
+    already has them; all other elements (links, category, published, author,
+    source) are appended unconditionally.
+    Args:
+        metadata (dict): Metadata in the shape produced by
+            :func:`parse_metadata` / :func:`extract_metadata`.
+        base_entry (Element, optional): Existing entry to enrich in place. A
+            new ``<entry>`` element is created when omitted.
+    Returns:
+        Element: The Atom entry element (``base_entry`` itself when given).
+    """
+    entry = Element("entry") if base_entry is None else base_entry
+    opengraph = metadata.get("opengraph", {})
+    twitter = metadata.get("twitter", {})
+    # Title
+    if metadata.get("title"):
+        title_elem = entry.find("title")
+        if title_elem is None:
+            title_elem = SubElement(entry, "title")
+        title_elem.text = metadata["title"]
+        title_elem.set("type", "text")
+    # Summary/Description
+    if metadata.get("description"):
+        summary_elem = entry.find("summary")
+        if summary_elem is None:
+            summary_elem = SubElement(entry, "summary")
+        summary_elem.text = metadata["description"]
+        summary_elem.set("type", "text")
+    # Link to original content
+    if metadata.get("url"):
+        link_elem = SubElement(entry, "link")
+        link_elem.set("rel", "alternate")
+        link_elem.set("type", "text/html")
+        link_elem.set("href", metadata["url"])
+    # Image as enclosure
+    if metadata.get("image"):
+        enclosure_elem = SubElement(entry, "link")
+        enclosure_elem.set("rel", "enclosure")
+        # TODO: detect the actual image type instead of assuming JPEG.
+        enclosure_elem.set("type", "image/jpeg")
+        enclosure_elem.set("href", metadata["image"])
+    # Content type as category
+    og_type = opengraph.get("type")
+    if og_type:
+        category_elem = SubElement(entry, "category")
+        category_elem.set("term", og_type)
+        category_elem.set("scheme", "http://ogp.me/ns#")
+    # Published date (from article metadata)
+    published_time = opengraph.get("article:published_time")
+    if published_time:
+        try:
+            pub_date = dateutil.parser.parse(published_time)
+        except Exception as exc:
+            # The value comes from an arbitrary web page, so dropping a date
+            # we cannot parse is intentional -- but leave a trace of it.
+            logger.debug(
+                "Ignoring unparseable article:published_time %r: %s",
+                published_time,
+                exc,
+            )
+        else:
+            SubElement(entry, "published").text = pub_date.isoformat()
+    # Updated date
+    modified_time = opengraph.get("article:modified_time")
+    if modified_time:
+        try:
+            mod_date = dateutil.parser.parse(modified_time)
+        except Exception as exc:
+            logger.debug(
+                "Ignoring unparseable article:modified_time %r: %s",
+                modified_time,
+                exc,
+            )
+        else:
+            updated_elem = entry.find("updated")
+            if updated_elem is None:
+                updated_elem = SubElement(entry, "updated")
+            updated_elem.text = mod_date.isoformat()
+    # Author (from article metadata)
+    author_name = opengraph.get("article:author")
+    twitter_creator = twitter.get("creator")
+    # Always add author element (required by Atom spec). ``site_name`` may be
+    # present but None/empty, so fall back with ``or`` rather than a .get()
+    # default, which only applies when the key is missing altogether.
+    author_elem = SubElement(entry, "author")
+    SubElement(author_elem, "name").text = (
+        author_name or twitter_creator or metadata.get("site_name") or "Unknown"
+    )
+    # Site name as source - properly structured according to Atom spec
+    if metadata.get("site_name"):
+        source_elem = SubElement(entry, "source")
+        # URI is required
+        if metadata.get("url"):
+            source_link = SubElement(source_elem, "link")
+            source_link.set("rel", "alternate")
+            source_link.set("type", "text/html")
+            source_link.set("href", metadata["url"])
+        # Required sub-elements for source
+        source_title = SubElement(source_elem, "title")
+        source_title.text = metadata["site_name"]
+        source_id = SubElement(source_elem, "id")
+        # ``or ""`` also covers a ``url`` key that is present but None.
+        source_id.text = "urn:source:" + urlparse(metadata.get("url") or "").netloc
+        source_updated = SubElement(source_elem, "updated")
+        source_updated.text = _utcnow_iso()
+    return entry
+def enrich_url_list_to_atom(urls, feed_title=DEFAULT_FEED_TITLE, timeout=10):
+    """Build an Atom ``<feed>`` element with one enriched entry per URL.
+    Each URL is fetched with :func:`extract_metadata`; URLs whose fetch or
+    parse fails are logged as warnings and left out of the feed.
+    Args:
+        urls (Iterable[str]): URLs to fetch and enrich.
+        feed_title (str): The feed's ``<title>``.
+        timeout (int): Per-request timeout in seconds.
+    Returns:
+        Element: The Atom ``<feed>`` root element.
+    """
+    root = Element("feed")
+    root.set("xmlns", ATOM_NS)
+    # Feed-level metadata: title, a freshly generated UUID id, and timestamp.
+    SubElement(root, "title").text = feed_title
+    SubElement(root, "id").text = "urn:uuid:" + str(uuid.uuid4())
+    SubElement(root, "updated").text = _utcnow_iso()
+    # Self link (required by validators); the href is a fixed placeholder.
+    SubElement(
+        root,
+        "link",
+        {
+            "rel": "self",
+            "type": "application/atom+xml",
+            "href": "file:///enriched_feed.atom",
+        },
+    )
+    for page_url in urls:
+        page_meta = extract_metadata(page_url, timeout=timeout)
+        if "error" in page_meta:
+            logger.warning("Skipping %s: %s", page_url, page_meta["error"])
+            continue
+        atom_entry = enrich_atom_entry(page_meta)
+        # Backfill the elements every Atom entry must carry.
+        if atom_entry.find("id") is None:
+            # The (whitespace-trimmed) URL doubles as the entry id.
+            SubElement(atom_entry, "id").text = page_url.strip()
+        if atom_entry.find("updated") is None:
+            SubElement(atom_entry, "updated").text = _utcnow_iso()
+        root.append(atom_entry)
+    return root
+def fetch_sitemap_urls(sitemap_url, timeout=10):
+    """Fetch a sitemap and return the list of ``<loc>`` URLs it contains.
+    Every ``<loc>`` element found in the document is returned in document
+    order; no distinction is made between a URL set and a sitemap index.
+    Args:
+        sitemap_url (str): URL of the XML sitemap.
+        timeout (int): Request timeout in seconds.
+    Returns:
+        list[str]: The non-empty, whitespace-trimmed URLs found in the sitemap.
+    Raises:
+        requests.RequestException: If the request fails or the server answers
+            with an error status.
+    """
+    logger.info("Fetching sitemap: %s", sitemap_url)
+    response = requests.get(
+        sitemap_url, timeout=timeout, headers={"User-Agent": USER_AGENT}
+    )
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, "xml")
+    # Pretty-printed sitemaps often wrap the URL in newlines/indentation, and
+    # an empty <loc/> carries no URL at all. Trim the text and drop blanks so
+    # downstream requests receive clean URLs.
+    stripped = (loc.text.strip() for loc in soup.find_all("loc"))
+    urls = [text for text in stripped if text]
+    logger.info("Found %d URLs in the sitemap.", len(urls))
+    return urls
+def feed_to_pretty_xml(feed):
+    """Render an Atom feed element as an indented XML string.
+    Args:
+        feed (Element): The Atom feed element.
+    Returns:
+        str: XML text indented with four spaces, starting with a UTF-8 XML
+        declaration and containing no blank lines.
+    """
+    raw_bytes = tostring(feed, encoding="utf-8")
+    # Round-trip through minidom, which provides the pretty printer.
+    document = minidom.parseString(raw_bytes)
+    pretty_bytes = document.toprettyxml(indent="    ", encoding="utf-8")
+    # Keep only lines with visible content (minidom emits blank ones).
+    kept_lines = []
+    for line in pretty_bytes.decode("utf-8").split("\n"):
+        if line.strip():
+            kept_lines.append(line)
+    return "\n".join(kept_lines)

sitemap2atom-0.1.0.dist-info/METADATA ADDED Viewed

@@ -0,0 +1,130 @@
+Metadata-Version: 2.4
+Name: sitemap2atom
+Version: 0.1.0
+Summary: A tool to convert XML sitemaps to Atom feeds
+Project-URL: homepage, https://github.com/darkflib/sitemap2atom
+Project-URL: repository, https://github.com/darkflib/sitemap2atom
+Project-URL: issues, https://github.com/darkflib/sitemap2atom/issues
+Author-email: Mike Preston <darkflib@gmail.com>
+License-Expression: MIT
+License-File: LICENSE
+Keywords: atom,feed,opengraph,rss,sitemap,syndication
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Programming Language :: Python :: 3.13
+Classifier: Topic :: Internet :: WWW/HTTP
+Classifier: Topic :: Text Processing :: Markup :: XML
+Requires-Python: >=3.11
+Requires-Dist: beautifulsoup4>=4.9.3
+Requires-Dist: click>=7.1.2
+Requires-Dist: lxml>=4.6.3
+Requires-Dist: python-dateutil>=2.8.1
+Requires-Dist: requests>=2.25.1
+Description-Content-Type: text/markdown
+# sitemap2atom
+A simple tool to convert an XML sitemap into an [Atom](https://datatracker.ietf.org/doc/html/rfc4287)
+feed — especially useful for sites that don't have a CMS, or where the CMS
+doesn't produce a feed. Each URL in the sitemap is fetched and its OpenGraph and
+Twitter Card metadata (title, description, image, author, dates) is used to build
+a rich Atom entry.
+## Installation
+### Run without installing (uvx)
+Once published to PyPI you can run it directly with
+[uv](https://docs.astral.sh/uv/):
+```bash
+uvx sitemap2atom https://example.com/sitemap.xml -o feed.atom
+```
+To run the latest code straight from GitHub (before a release, or to try `main`):
+```bash
+uvx --from git+https://github.com/darkflib/sitemap2atom sitemap2atom https://example.com/sitemap.xml
+```
+### Install as a tool / library
+```bash
+uv tool install sitemap2atom      # installs the `sitemap2atom` command
+# or
+pip install sitemap2atom
+```
+## Usage
+```bash
+sitemap2atom SITEMAP_URL [OPTIONS]
+```
+By default the feed is written to standard output; redirect it or use `-o` to
+save it to a file:
+```bash
+# Print to stdout
+sitemap2atom https://example.com/sitemap.xml
+# Write to a file, limiting to the first 20 URLs
+sitemap2atom https://example.com/sitemap.xml -o feed.atom --limit 20
+```
+### Options
+- `-o, --output PATH` — write the Atom feed to this file (default: stdout).
+- `--limit N` — maximum number of sitemap URLs to process (default: all).
+- `--feed-title TEXT` — title for the generated feed (default: `Enriched URL Feed`).
+- `--timeout SECONDS` — per-request timeout in seconds (default: `10`).
+- `-v, --verbose` — enable info-level logging on stderr.
+- `--version` — show the version and exit.
+### As a library
+```python
+from sitemap2atom import fetch_sitemap_urls, enrich_url_list_to_atom, feed_to_pretty_xml
+urls = fetch_sitemap_urls("https://example.com/sitemap.xml")
+feed = enrich_url_list_to_atom(urls[:10], feed_title="My Feed")
+print(feed_to_pretty_xml(feed))
+```
+## Example output
+See this gist for a sample of the kind of enriched Atom feed produced:
+<https://gist.github.com/Darkflib/989b8f3a5a1ea995e8e294669d5e282a>
+## Limitations
+This is a simple tool aimed at basic use cases. It does not support
+authentication, sitemap index files / pagination, or dynamic sitemaps, and may
+not handle every sitemap or page format. Treat the sitemap and the pages it
+references as untrusted input and run it against sources you trust.
+## Development
+This project uses [uv](https://docs.astral.sh/uv/).
+```bash
+git clone https://github.com/darkflib/sitemap2atom.git
+cd sitemap2atom
+uv sync
+uv run pytest
+```
+See [CONTRIBUTING.md](CONTRIBUTING.md) for more, and
+[CHANGELOG.md](CHANGELOG.md) for release notes.
+## License
+This project is licensed under the MIT License — see the [LICENSE](LICENSE) file
+for details.
+PS. If you do anything interesting with this code, please let me know! I'd love
+to hear about it.

sitemap2atom-0.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,9 @@
+sitemap2atom/__init__.py,sha256=j9PPwb-KmQ1X738FtMKaWWtZtVknr-6K1eD7zAsCuGQ,447
+sitemap2atom/__main__.py,sha256=rsK2pRG6HCvqTIbbg4KFBZ9YpDqSDvdaXwglXttcyyU,103
+sitemap2atom/cli.py,sha256=82bl8kNXdeY6sbWl9GPa-63GFpTJs_IUwskxZWPYC2E,2194
+sitemap2atom/core.py,sha256=YDQZQO6Z-8GkM6jMID6enbNBGuo-ixw5HhE7PF94QFo,11120
+sitemap2atom-0.1.0.dist-info/METADATA,sha256=fpEm1aGEUox53g9Rgc52JaW29-CBBHmuJOONUELDN50,4066
+sitemap2atom-0.1.0.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
+sitemap2atom-0.1.0.dist-info/entry_points.txt,sha256=hr-tATkjEjaYlqqn1D8zALcDkaDjGnvig-nKBT8wVks,55
+sitemap2atom-0.1.0.dist-info/licenses/LICENSE,sha256=Rb2AZV2we4Key-5FTjEh9ip-0Rao6s5Raj4_vVrxHgk,1069
+sitemap2atom-0.1.0.dist-info/RECORD,,

sitemap2atom-0.1.0.dist-info/WHEEL ADDED Viewed

@@ -0,0 +1,4 @@
+Wheel-Version: 1.0
+Generator: hatchling 1.30.1
+Root-Is-Purelib: true
+Tag: py3-none-any

sitemap2atom-0.1.0.dist-info/entry_points.txt ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ [console_scripts]
2	+ sitemap2atom = sitemap2atom.cli:main

sitemap2atom-0.1.0.dist-info/licenses/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2026 Mike Preston
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.