crawl4md 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawl4md/__init__.py +11 -0
- crawl4md/check.py +20 -0
- crawl4md/cli.py +93 -0
- crawl4md/config.py +54 -0
- crawl4md/convert/__init__.py +1 -0
- crawl4md/convert/markdown.py +63 -0
- crawl4md/convert/preprocessing/__init__.py +1 -0
- crawl4md/convert/preprocessing/helpers/__init__.py +1 -0
- crawl4md/convert/preprocessing/helpers/title_html_parser.py +40 -0
- crawl4md/convert/preprocessing/markdown.py +62 -0
- crawl4md/convert/preprocessing/rules/__init__.py +1 -0
- crawl4md/convert/preprocessing/rules/base/__init__.py +0 -0
- crawl4md/convert/preprocessing/rules/base/rule_base.py +83 -0
- crawl4md/convert/preprocessing/rules/ensure_h1.py +45 -0
- crawl4md/convert/preprocessing/rules/normalize_whitespace.py +140 -0
- crawl4md/convert/preprocessing/rules/remove_html_comments.py +28 -0
- crawl4md/convert/preprocessing/rules/remove_jump_to_content.py +68 -0
- crawl4md/convert/preprocessing/rules/remove_reference_sections.py +47 -0
- crawl4md/convert/preprocessing/rules/remove_wiki_loves_earth_banner.py +49 -0
- crawl4md/convert/preprocessing/rules/remove_wikipedia_subtitle.py +40 -0
- crawl4md/fetch/__init__.py +1 -0
- crawl4md/fetch/html.py +57 -0
- crawl4md/fetch/markdown.py +59 -0
- crawl4md/fetch/normalize/__init__.py +0 -0
- crawl4md/fetch/normalize/base/__init__.py +2 -0
- crawl4md/fetch/normalize/base/normalizer_base.py +16 -0
- crawl4md/fetch/normalize/mediawiki_entity.py +31 -0
- crawl4md/fetch/normalize/mediawiki_hidden_span.py +31 -0
- crawl4md/fetch/normalize/url.py +42 -0
- crawl4md/paths.py +24 -0
- crawl4md/sitemap.py +34 -0
- crawl4md/writer.py +17 -0
- crawl4md-0.1.2.dist-info/METADATA +336 -0
- crawl4md-0.1.2.dist-info/RECORD +37 -0
- crawl4md-0.1.2.dist-info/WHEEL +4 -0
- crawl4md-0.1.2.dist-info/entry_points.txt +3 -0
- crawl4md-0.1.2.dist-info/licenses/LICENSE.md +21 -0
crawl4md/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .config import MarkdownPreprocessingConfig, ParseType
|
|
2
|
+
from .convert.markdown import MarkdownConverter
|
|
3
|
+
from .fetch.markdown import MarkdownFetcher
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Names re-exported as the package's public API.
__all__ = [
    "MarkdownConverter",
    "MarkdownFetcher",
    "MarkdownPreprocessingConfig",
    "ParseType",
]
|
crawl4md/check.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def main() -> int:
    """Run the project's quality gates (unit tests, then ruff) in sequence.

    Stops at the first failing command.

    Returns:
        0 when every command succeeded, otherwise the exit code of the first
        failing command, or 127 when a command's executable is not installed.
    """
    commands = [
        [sys.executable, "-m", "unittest", "discover", "-s", "tests", "-v"],
        ["ruff", "check"],
    ]

    for command in commands:
        try:
            result = subprocess.run(command)
        except FileNotFoundError:
            # e.g. "ruff" not installed: report cleanly instead of a traceback.
            # 127 mirrors the shell convention for "command not found".
            print(f"command not found: {command[0]}", file=sys.stderr)
            return 127
        if result.returncode != 0:
            return result.returncode

    return 0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
if __name__ == "__main__":
    # Propagate the aggregated exit status to the shell when run directly.
    raise SystemExit(main())
|
crawl4md/cli.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import typer
|
|
13
|
+
import asyncio
|
|
14
|
+
import warnings
|
|
15
|
+
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from urllib.parse import urlparse
|
|
18
|
+
|
|
19
|
+
from .config import load_config
|
|
20
|
+
from .fetch.markdown import MarkdownFetcher
|
|
21
|
+
from .paths import url_to_path
|
|
22
|
+
from .sitemap import parse_sitemap
|
|
23
|
+
from .writer import write_markdown
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Suppress SyntaxWarning noise originating from the crawl4ai package so the
# CLI output stays clean.
warnings.filterwarnings(
    "ignore",
    category=SyntaxWarning,
    module="crawl4ai"
)

# Typer application object; commands are registered via @app.command().
app = typer.Typer()
|
|
33
|
+
|
|
34
|
+
def pretty_name(url: str) -> str:
    """Return the last path segment of *url*, or ``"index"`` for bare roots."""
    last_segment = Path(urlparse(url).path).name
    return last_segment if last_segment else "index"
|
|
36
|
+
|
|
37
|
+
@app.command()
def crawl(project: str):
    """Crawl every URL of *project* and write each page as markdown below docs/.

    Args:
        project: Key of a project defined in the configuration file.
    """
    config = load_config()

    if project not in config.projects:
        typer.echo(f"Project '{project}' not found")
        raise typer.Exit(1)

    proj = config.projects[project]
    fetcher = MarkdownFetcher(
        config=proj.preprocessing.markdown,
        parse_type=proj.crawl.parse_type,
    )

    # Collect URLs
    urls: list[str] = []

    if proj.type == "pages":
        # "pages" projects list the page URLs directly.
        urls = proj.sources

    elif proj.type == "sitemap":
        # "sitemap" projects list sitemap URLs whose entries are expanded here.
        for sitemap_url in proj.sources:
            urls.extend(parse_sitemap(sitemap_url))

    # deduplicate (dict.fromkeys preserves first-seen order)
    urls = list(dict.fromkeys(urls))

    total = len(urls)
    success = 0
    failed = 0

    for i, url in enumerate(urls, start=1):
        name = pretty_name(url)
        typer.echo(f"[{i}/{total}] {name}")

        try:
            typer.echo(" → Fetching...", nl=False)
            # Each fetch runs in its own short-lived event loop.
            md = asyncio.run(fetcher.fetch(url))
            typer.echo(" done")

            path = url_to_path(Path("docs"), project, url)

            typer.echo(f" → Writing... {path}")
            write_markdown(path, md)

            success += 1

        except Exception as e:
            # Best-effort crawl: report the failure and continue with the next URL.
            typer.echo(f" → Error: {e}")
            failed += 1

        typer.echo("")

    typer.echo("Done.")
    typer.echo(f"✔ Success: {success}")
    typer.echo(f"✖ Failed: {failed}")
    typer.echo(f"Output: docs/{project}")
|
crawl4md/config.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
from typing import Literal
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Markdown extraction mode: plain markdown or content-pruned "fit" markdown.
ParseType = Literal["markdown", "markdown-fit"]
|
|
19
|
+
|
|
20
|
+
class CrawlConfig(BaseModel):
    """Crawler behaviour for a single project."""

    # Markdown extraction mode passed through to the fetcher.
    parse_type: ParseType = "markdown"
|
|
22
|
+
|
|
23
|
+
class MarkdownPreprocessingConfig(BaseModel):
    """Opt-in feature flags for the markdown preprocessing pipeline.

    Nothing runs unless ``enabled`` is True; each flag below activates the
    correspondingly named rule in ``convert/preprocessing/rules``.
    """

    # Master switch: when False, preprocessing is skipped entirely.
    enabled: bool = False

    # Enable the ensure_h1 rule (prepend a level-1 heading when missing).
    ensure_h1: bool = False
    # Enable the remove_jump_to_content rule.
    remove_jump_to_content: bool = False
    # Enable the remove_wikipedia_subtitle rule.
    remove_wikipedia_subtitle: bool = False
    # Enable the remove_wiki_loves_earth_banner rule.
    remove_wiki_loves_earth_banner: bool = False
    # Enable the remove_reference_sections rule.
    remove_reference_sections: bool = False
    # Enable the remove_html_comments rule.
    remove_html_comments: bool = False
    # Enable the normalize_whitespace rule.
    normalize_whitespace: bool = False

    # Extra heading titles consumed by the reference-sections rule
    # (see rules/remove_reference_sections.py for the matching logic).
    reference_headings: list[str] = Field(default_factory=list)
|
|
35
|
+
|
|
36
|
+
class PreprocessingConfig(BaseModel):
    """Container for per-format preprocessing settings."""

    # Settings for the markdown preprocessing pipeline.
    markdown: MarkdownPreprocessingConfig = Field(
        default_factory=MarkdownPreprocessingConfig
    )
|
|
40
|
+
|
|
41
|
+
class ProjectConfig(BaseModel):
    """Configuration of a single crawl project."""

    # How *sources* is interpreted: sitemap URLs or direct page URLs.
    type: Literal["sitemap", "pages"]
    # Sitemap URLs (type == "sitemap") or page URLs (type == "pages").
    sources: list[str]

    # Crawler behaviour (extraction mode).
    crawl: CrawlConfig = Field(default_factory=CrawlConfig)
    # Preprocessing applied to the crawled output.
    preprocessing: PreprocessingConfig = Field(default_factory=PreprocessingConfig)
|
|
47
|
+
|
|
48
|
+
class AppConfig(BaseModel):
    """Top-level application configuration: all projects keyed by name."""

    # Project name -> project configuration.
    projects: dict[str, ProjectConfig]
|
|
50
|
+
|
|
51
|
+
def load_config(path: str = "crawl.yml") -> AppConfig:
    """Load and validate the application configuration from a YAML file.

    Args:
        path: Path to the YAML configuration file.

    Returns:
        The parsed and validated configuration.

    Raises:
        FileNotFoundError: If *path* does not exist.
        pydantic.ValidationError: If the content does not match the schema.
    """
    # Explicit encoding avoids locale-dependent decoding of the YAML file.
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty file; normalize to a dict so the
    # user gets a clear validation error ("projects missing") instead of a
    # TypeError from ** unpacking None.
    return AppConfig(**(data or {}))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
13
|
+
from crawl4ai.content_filter_strategy import PruningContentFilter
|
|
14
|
+
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
15
|
+
import asyncio
|
|
16
|
+
|
|
17
|
+
from ..config import MarkdownPreprocessingConfig, ParseType
|
|
18
|
+
from .preprocessing import MarkdownPreprocessing
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MarkdownConverter:
    """Convert raw HTML to markdown via crawl4ai, then run preprocessing."""

    def __init__(
        self,
        config: MarkdownPreprocessingConfig,
        parse_type: ParseType = "markdown",
    ) -> None:
        self.config = config
        self.parse_type = parse_type

    def _build_run_config(self) -> "CrawlerRunConfig":
        """Assemble the crawler run configuration for the chosen parse type."""
        if self.parse_type != "markdown-fit":
            return CrawlerRunConfig()
        # "markdown-fit" prunes low-value content before markdown generation.
        generator = DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.5),
            options={"ignore_links": False},
        )
        return CrawlerRunConfig(markdown_generator=generator)

    def _pick_markdown(self, result) -> str:
        """Select the markdown variant matching the parse type, never None."""
        if self.parse_type == "markdown-fit":
            return result.markdown.fit_markdown or result.markdown.raw_markdown or ""
        return result.markdown.raw_markdown or ""

    async def convert(
        self,
        html: str,
        url: str | None = None,
    ) -> str:
        """Render *html* to markdown and apply the configured preprocessing.

        Args:
            html: The raw HTML document.
            url: Optional source URL, forwarded to preprocessing rules.

        Returns:
            The (optionally preprocessed) markdown text.
        """
        # crawl4ai's "raw:" scheme feeds literal HTML instead of fetching a URL.
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=f"raw:{html}",
                config=self._build_run_config(),
            )

        markdown = self._pick_markdown(result)
        return MarkdownPreprocessing(self.config).process(markdown, url=url, html=html)

    def convert_sync(
        self,
        html: str,
        url: str | None = None,
    ) -> str:
        """Blocking wrapper around :meth:`convert`; spawns its own event loop."""
        return asyncio.run(self.convert(html=html, url=url))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .markdown import MarkdownPreprocessing as MarkdownPreprocessing
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from html.parser import HTMLParser
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class _TitleHTMLParser(HTMLParser):
|
|
5
|
+
def __init__(self) -> None:
|
|
6
|
+
super().__init__(convert_charrefs=True)
|
|
7
|
+
self._active_tag: str | None = None
|
|
8
|
+
self._capturing_h1 = False
|
|
9
|
+
self._seen_h1 = False
|
|
10
|
+
self._h1_parts: list[str] = []
|
|
11
|
+
self._title_parts: list[str] = []
|
|
12
|
+
|
|
13
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
14
|
+
self._active_tag = tag
|
|
15
|
+
|
|
16
|
+
if tag == "h1" and not self._seen_h1:
|
|
17
|
+
self._capturing_h1 = True
|
|
18
|
+
|
|
19
|
+
def handle_endtag(self, tag: str) -> None:
|
|
20
|
+
if tag == "h1" and self._capturing_h1:
|
|
21
|
+
self._capturing_h1 = False
|
|
22
|
+
self._seen_h1 = True
|
|
23
|
+
|
|
24
|
+
if tag == self._active_tag:
|
|
25
|
+
self._active_tag = None
|
|
26
|
+
|
|
27
|
+
def handle_data(self, data: str) -> None:
|
|
28
|
+
if self._capturing_h1:
|
|
29
|
+
self._h1_parts.append(data)
|
|
30
|
+
|
|
31
|
+
if self._active_tag == "title":
|
|
32
|
+
self._title_parts.append(data)
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def h1_text(self) -> str:
|
|
36
|
+
return "".join(self._h1_parts)
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def title_text(self) -> str:
|
|
40
|
+
return "".join(self._title_parts)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
from .rules.base.rule_base import RuleBase
|
|
13
|
+
from .rules.ensure_h1 import RuleEnsureH1
|
|
14
|
+
from .rules.normalize_whitespace import RuleNormalizeWhitespace
|
|
15
|
+
from .rules.remove_html_comments import RuleRemoveHtmlComments
|
|
16
|
+
from .rules.remove_jump_to_content import RuleRemoveJumpToContent
|
|
17
|
+
from .rules.remove_reference_sections import RuleRemoveReferenceSections
|
|
18
|
+
from .rules.remove_wiki_loves_earth_banner import RuleRemoveWikiLovesEarthBanner
|
|
19
|
+
from .rules.remove_wikipedia_subtitle import RuleRemoveWikipediaSubtitle
|
|
20
|
+
from crawl4md.config import MarkdownPreprocessingConfig
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MarkdownPreprocessing:
    """Apply the configured preprocessing rules to a markdown document."""

    def __init__(self, config: MarkdownPreprocessingConfig):
        self.config = config

        # (flag, rule class) pairs in the exact order the rules must run.
        rule_table: list[tuple[bool, type[RuleBase]]] = [
            (config.remove_jump_to_content, RuleRemoveJumpToContent),
            (config.remove_html_comments, RuleRemoveHtmlComments),
            (config.remove_wikipedia_subtitle, RuleRemoveWikipediaSubtitle),
            (config.remove_wiki_loves_earth_banner, RuleRemoveWikiLovesEarthBanner),
            (config.remove_reference_sections, RuleRemoveReferenceSections),
            (config.ensure_h1, RuleEnsureH1),
            (config.normalize_whitespace, RuleNormalizeWhitespace),
        ]
        self.rules: list[RuleBase] = [
            rule_cls(config) for active, rule_cls in rule_table if active
        ]

    def process(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        """Run every active rule over *markdown*.

        Args:
            markdown: The document to transform.
            url: Optional source URL, forwarded to each rule.
            html: Optional original HTML, forwarded to each rule.

        Returns:
            The transformed markdown, or the input unchanged when
            preprocessing is disabled.
        """
        if not self.config.enabled:
            return markdown

        for rule in self.rules:
            markdown = rule.apply(markdown, url=url, html=html)

        return markdown
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
File without changes
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from html import unescape
|
|
15
|
+
from urllib.parse import unquote, urljoin, urlparse
|
|
16
|
+
from urllib.request import urlopen
|
|
17
|
+
|
|
18
|
+
from crawl4md.config import MarkdownPreprocessingConfig
|
|
19
|
+
from crawl4md.convert.preprocessing.helpers.title_html_parser import _TitleHTMLParser
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RuleBase:
    """Shared helpers and compiled patterns for markdown preprocessing rules."""

    # Inline markdown link, optionally with a quoted title part.
    MARKDOWN_LINK_PATTERN = re.compile(
        r"\[(.*?)\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)",
        re.DOTALL,
    )
    # A line starting a level-1 heading.
    H1_PATTERN = re.compile(r"^# ", re.MULTILINE)
    # Any ATX heading line, capturing the hashes and the title text.
    HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.*?)\s*$")
    # Trailing anchor such as "{#section-id}".
    TRAILING_ANCHOR_PATTERN = re.compile(r"\s*\{#[^}]+\}\s*$")
    # Leading enumeration such as "1. ", "2) " or "3 ".
    LEADING_NUMBER_PATTERN = re.compile(r"^\d+(?:[.)]\s*|\s+)")

    def __init__(self, config: MarkdownPreprocessingConfig):
        self.config = config

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        """Transform *markdown*; concrete rules must override this."""
        raise NotImplementedError

    def join_lines(self, lines: list[str], original: str) -> str:
        """Join *lines*, preserving the trailing newline of *original* if present."""
        trailing = "\n" if original.endswith("\n") else ""
        return "\n".join(lines) + trailing

    def normalize_heading(self, heading: str) -> str:
        """Canonicalize a heading for comparison: strip anchors and leading
        numbering, casefold, and collapse internal whitespace."""
        text = self.TRAILING_ANCHOR_PATTERN.sub("", heading).strip().casefold()
        text = self.LEADING_NUMBER_PATTERN.sub("", text)
        return " ".join(text.split())

    def has_h1(self, markdown: str) -> bool:
        """Return True when the document already contains a level-1 heading."""
        return self.H1_PATTERN.search(markdown) is not None

    def normalize_title(self, value: str) -> str | None:
        """Unescape entities and collapse whitespace; None when nothing remains."""
        collapsed = " ".join(unescape(value).split()).strip()
        return collapsed if collapsed else None

    def extract_title_from_html(self, html: str) -> str | None:
        """Extract a title from *html*: first <h1> preferred, then <title>."""
        parser = _TitleHTMLParser()
        parser.feed(html)
        parser.close()
        from_h1 = self.normalize_title(parser.h1_text)
        return from_h1 or self.normalize_title(parser.title_text)

    def fallback_title_from_url(self, url: str) -> str:
        """Derive a human-readable title from the last URL path segment."""
        parsed = urlparse(url)
        last_segment = parsed.path.rstrip("/").rsplit("/", maxsplit=1)[-1]
        readable = unquote(last_segment).replace("-", " ").replace("_", " ")
        title = self.normalize_title(readable)

        if title:
            return title

        return parsed.netloc or "index"

    def fetch_html(self, url: str) -> str | None:
        """Download *url* and decode the body with its declared charset
        (UTF-8 fallback, undecodable bytes replaced)."""
        with urlopen(url, timeout=30) as response:
            encoding = response.headers.get_content_charset() or "utf-8"
            return response.read().decode(encoding, errors="replace")

    def resolve_url(self, page_url: str, link_target: str):
        """Resolve *link_target* against *page_url*; returns the parsed result."""
        return urlparse(urljoin(page_url, link_target))
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
from .base.rule_base import RuleBase
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RuleEnsureH1(RuleBase):
    """Guarantee that the markdown starts with a level-1 heading.

    The heading text is resolved, in order, from: the supplied HTML, HTML
    fetched from the URL, the URL's last path segment, and finally the
    literal "index".
    """

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        if self.has_h1(markdown):
            return markdown

        title = self._determine_title(url=url, html=html)
        return f"# {title}\n\n{markdown}"

    def _determine_title(self, *, url: str | None, html: str | None) -> str:
        """Resolve a heading title via HTML, fetched HTML, URL, then 'index'."""
        if html:
            found = self.extract_title_from_html(html)
            if found:
                return found

        if url:
            try:
                page_html = self.fetch_html(url)
            except Exception:
                # Network problems must not break preprocessing.
                page_html = None

            if page_html:
                found = self.extract_title_from_html(page_html)
                if found:
                    return found

            # fallback_title_from_url never returns an empty string.
            return self.fallback_title_from_url(url)

        return "index"
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from .base.rule_base import RuleBase
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# A single table-separator cell: at least three dashes, optional alignment colons.
TABLE_CELL_PATTERN = re.compile(r"^:?-{3,}:?$")
# "(" glued directly to a word character or ")" — i.e. a missing space before it.
MISSING_SPACE_BEFORE_PAREN_PATTERN = re.compile(r"(?<=[\w\)])\(")


class RuleNormalizeWhitespace(RuleBase):
    """Re-flow the document into blocks separated by exactly one blank line.

    Code fences and tables are kept together as single verbatim blocks,
    headings stand alone, and each remaining plain line is promoted to its
    own paragraph.
    """

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        """Return *markdown* re-flowed; *url* and *html* are part of the common
        rule interface and unused here."""
        lines = markdown.splitlines()
        blocks: list[str] = []
        index = 0

        while index < len(lines):
            line = lines[index]

            # Blank lines only delimit blocks; they are never emitted directly.
            if not line.strip():
                index += 1
                continue

            # Code fence: copy everything up to and including the closing
            # fence without normalization.
            if self._is_fence(line):
                block_lines = [line]
                index += 1

                while index < len(lines):
                    block_lines.append(lines[index])
                    if self._is_fence(lines[index]):
                        index += 1
                        break
                    index += 1

                blocks.append("\n".join(block_lines))
                continue

            # A heading is always its own block.
            if self.HEADING_PATTERN.match(line):
                blocks.append(self._normalize_line(line))
                index += 1
                continue

            # Table: consume contiguous pipe-lines as one block joined by
            # single newlines so the table stays intact.
            if self._is_table_start(lines, index):
                block_lines = [self._normalize_line(lines[index])]
                index += 1

                while index < len(lines):
                    current = lines[index]
                    if not current.strip():
                        break
                    if self.HEADING_PATTERN.match(current) or self._is_fence(current):
                        break
                    if "|" not in current:
                        break

                    block_lines.append(self._normalize_line(current))
                    index += 1

                blocks.append("\n".join(block_lines))
                continue

            # Paragraph run: gather lines until the next structural element;
            # the "\n\n" join makes each source line its own paragraph.
            block_lines = [self._normalize_line(line)]
            index += 1

            while index < len(lines):
                current = lines[index]
                if not current.strip():
                    index += 1
                    break
                if self.HEADING_PATTERN.match(current):
                    break
                if self._is_fence(current):
                    break
                if self._is_table_start(lines, index):
                    break

                block_lines.append(self._normalize_line(current))
                index += 1

            blocks.append("\n\n".join(block_lines))

        if not blocks:
            return ""

        # Blocks are separated by one blank line; always end with a newline.
        return "\n\n".join(blocks) + "\n"

    def _is_fence(self, line: str) -> bool:
        """Return True for ``` or ~~~ fence lines (leading indent ignored)."""
        stripped = line.lstrip()
        return stripped.startswith("```") or stripped.startswith("~~~")

    def _is_table_start(self, lines: list[str], index: int) -> bool:
        """Return True when *index* begins a pipe table (header + separator row)."""
        if index + 1 >= len(lines):
            return False

        if "|" not in lines[index]:
            return False

        # Split the candidate separator row into its cells, dropping outer pipes.
        separator_cells = [
            cell.strip()
            for cell in lines[index + 1].strip().strip("|").split("|")
        ]

        if not separator_cells or any(not cell for cell in separator_cells):
            return False

        return all(TABLE_CELL_PATTERN.match(cell) for cell in separator_cells)

    def _normalize_line(self, line: str) -> str:
        """Trim trailing whitespace and insert missing spaces before inline
        links and opening parentheses."""
        line = line.rstrip()
        parts: list[str] = []
        last_end = 0

        # Insert a space before a markdown link glued to the preceding text,
        # except directly after "!" (the image syntax).
        for match in self.MARKDOWN_LINK_PATTERN.finditer(line):
            parts.append(line[last_end:match.start()])

            if match.start() > 0 and not line[match.start() - 1].isspace() and line[match.start() - 1] != "!":
                parts.append(" ")

            parts.append(match.group(0))
            last_end = match.end()

        parts.append(line[last_end:])
        normalized = "".join(parts)
        return MISSING_SPACE_BEFORE_PAREN_PATTERN.sub(" (", normalized)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from .base.rule_base import RuleBase
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Matches HTML comments, including ones spanning multiple lines (DOTALL).
HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)


class RuleRemoveHtmlComments(RuleBase):
    """Delete every HTML comment (``<!-- ... -->``) from the markdown."""

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        cleaned = HTML_COMMENT_PATTERN.sub("", markdown)
        return cleaned
|