crawl4md-0.1.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. crawl4md/__init__.py +11 -0
  2. crawl4md/check.py +20 -0
  3. crawl4md/cli.py +93 -0
  4. crawl4md/config.py +54 -0
  5. crawl4md/convert/__init__.py +1 -0
  6. crawl4md/convert/markdown.py +63 -0
  7. crawl4md/convert/preprocessing/__init__.py +1 -0
  8. crawl4md/convert/preprocessing/helpers/__init__.py +1 -0
  9. crawl4md/convert/preprocessing/helpers/title_html_parser.py +40 -0
  10. crawl4md/convert/preprocessing/markdown.py +62 -0
  11. crawl4md/convert/preprocessing/rules/__init__.py +1 -0
  12. crawl4md/convert/preprocessing/rules/base/__init__.py +0 -0
  13. crawl4md/convert/preprocessing/rules/base/rule_base.py +83 -0
  14. crawl4md/convert/preprocessing/rules/ensure_h1.py +45 -0
  15. crawl4md/convert/preprocessing/rules/normalize_whitespace.py +140 -0
  16. crawl4md/convert/preprocessing/rules/remove_html_comments.py +28 -0
  17. crawl4md/convert/preprocessing/rules/remove_jump_to_content.py +68 -0
  18. crawl4md/convert/preprocessing/rules/remove_reference_sections.py +47 -0
  19. crawl4md/convert/preprocessing/rules/remove_wiki_loves_earth_banner.py +49 -0
  20. crawl4md/convert/preprocessing/rules/remove_wikipedia_subtitle.py +40 -0
  21. crawl4md/fetch/__init__.py +1 -0
  22. crawl4md/fetch/html.py +57 -0
  23. crawl4md/fetch/markdown.py +59 -0
  24. crawl4md/fetch/normalize/__init__.py +0 -0
  25. crawl4md/fetch/normalize/base/__init__.py +2 -0
  26. crawl4md/fetch/normalize/base/normalizer_base.py +16 -0
  27. crawl4md/fetch/normalize/mediawiki_entity.py +31 -0
  28. crawl4md/fetch/normalize/mediawiki_hidden_span.py +31 -0
  29. crawl4md/fetch/normalize/url.py +42 -0
  30. crawl4md/paths.py +24 -0
  31. crawl4md/sitemap.py +34 -0
  32. crawl4md/writer.py +17 -0
  33. crawl4md-0.1.2.dist-info/METADATA +336 -0
  34. crawl4md-0.1.2.dist-info/RECORD +37 -0
  35. crawl4md-0.1.2.dist-info/WHEEL +4 -0
  36. crawl4md-0.1.2.dist-info/entry_points.txt +3 -0
  37. crawl4md-0.1.2.dist-info/licenses/LICENSE.md +21 -0
crawl4md/__init__.py ADDED
@@ -0,0 +1,11 @@
+ from .config import MarkdownPreprocessingConfig, ParseType
+ from .convert.markdown import MarkdownConverter
+ from .fetch.markdown import MarkdownFetcher
+
+
+ __all__ = [
+     "MarkdownConverter",
+     "MarkdownFetcher",
+     "MarkdownPreprocessingConfig",
+     "ParseType",
+ ]
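For orientation, a minimal sketch of how the re-exported names above could be used together; the sample HTML and the chosen flags are illustrative, not part of the package, and running it requires crawl4ai and its browser backend to be installed.

from crawl4md import MarkdownConverter, MarkdownPreprocessingConfig

# All preprocessing flags default to False; enable only what is needed.
config = MarkdownPreprocessingConfig(enabled=True, ensure_h1=True, normalize_whitespace=True)

converter = MarkdownConverter(config=config, parse_type="markdown")
markdown = converter.convert_sync("<html><body><h1>Demo</h1><p>Hello</p></body></html>")
print(markdown)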
crawl4md/check.py ADDED
@@ -0,0 +1,20 @@
+ import subprocess
+ import sys
+
+
+ def main() -> int:
+     commands = [
+         [sys.executable, "-m", "unittest", "discover", "-s", "tests", "-v"],
+         ["ruff", "check"],
+     ]
+
+     for command in commands:
+         result = subprocess.run(command)
+         if result.returncode != 0:
+             return result.returncode
+
+     return 0
+
+
+ if __name__ == "__main__":
+     raise SystemExit(main())
crawl4md/cli.py ADDED
@@ -0,0 +1,93 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ import typer
+ import asyncio
+ import warnings
+
+ from pathlib import Path
+ from urllib.parse import urlparse
+
+ from .config import load_config
+ from .fetch.markdown import MarkdownFetcher
+ from .paths import url_to_path
+ from .sitemap import parse_sitemap
+ from .writer import write_markdown
+
+
+ warnings.filterwarnings(
+     "ignore",
+     category=SyntaxWarning,
+     module="crawl4ai"
+ )
+
+ app = typer.Typer()
+
+ def pretty_name(url: str) -> str:
+     return Path(urlparse(url).path).name or "index"
+
+ @app.command()
+ def crawl(project: str):
+     config = load_config()
+
+     if project not in config.projects:
+         typer.echo(f"Project '{project}' not found")
+         raise typer.Exit(1)
+
+     proj = config.projects[project]
+     fetcher = MarkdownFetcher(
+         config=proj.preprocessing.markdown,
+         parse_type=proj.crawl.parse_type,
+     )
+
+     # Collect URLs
+     urls: list[str] = []
+
+     if proj.type == "pages":
+         urls = proj.sources
+
+     elif proj.type == "sitemap":
+         for sitemap_url in proj.sources:
+             urls.extend(parse_sitemap(sitemap_url))
+
+     # deduplicate
+     urls = list(dict.fromkeys(urls))
+
+     total = len(urls)
+     success = 0
+     failed = 0
+
+     for i, url in enumerate(urls, start=1):
+         name = pretty_name(url)
+         typer.echo(f"[{i}/{total}] {name}")
+
+         try:
+             typer.echo(" → Fetching...", nl=False)
+             md = asyncio.run(fetcher.fetch(url))
+             typer.echo(" done")
+
+             path = url_to_path(Path("docs"), project, url)
+
+             typer.echo(f" → Writing... {path}")
+             write_markdown(path, md)
+
+             success += 1
+
+         except Exception as e:
+             typer.echo(f" → Error: {e}")
+             failed += 1
+
+         typer.echo("")
+
+     typer.echo("Done.")
+     typer.echo(f"✔ Success: {success}")
+     typer.echo(f"✖ Failed: {failed}")
+     typer.echo(f"Output: docs/{project}")
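A hedged sketch of exercising the command without the console script; the project name "demo" and the crawl.yml it refers to are hypothetical, and the installed command name comes from entry_points.txt (not shown in this section). Because Typer's @app.command() returns the original function, it can also be called directly:

# Illustrative only: expects a crawl.yml with a "demo" project in the working directory.
from crawl4md.cli import crawl

crawl("demo")  # fetches each configured URL and writes Markdown under docs/demo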
crawl4md/config.py ADDED
@@ -0,0 +1,54 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ import yaml
+
+ from pydantic import BaseModel, Field
+ from typing import Literal
+
+
+ ParseType = Literal["markdown", "markdown-fit"]
+
+ class CrawlConfig(BaseModel):
+     parse_type: ParseType = "markdown"
+
+ class MarkdownPreprocessingConfig(BaseModel):
+     enabled: bool = False
+
+     ensure_h1: bool = False
+     remove_jump_to_content: bool = False
+     remove_wikipedia_subtitle: bool = False
+     remove_wiki_loves_earth_banner: bool = False
+     remove_reference_sections: bool = False
+     remove_html_comments: bool = False
+     normalize_whitespace: bool = False
+
+     reference_headings: list[str] = Field(default_factory=list)
+
+ class PreprocessingConfig(BaseModel):
+     markdown: MarkdownPreprocessingConfig = Field(
+         default_factory=MarkdownPreprocessingConfig
+     )
+
+ class ProjectConfig(BaseModel):
+     type: Literal["sitemap", "pages"]
+     sources: list[str]
+
+     crawl: CrawlConfig = Field(default_factory=CrawlConfig)
+     preprocessing: PreprocessingConfig = Field(default_factory=PreprocessingConfig)
+
+ class AppConfig(BaseModel):
+     projects: dict[str, ProjectConfig]
+
+ def load_config(path: str = "crawl.yml") -> AppConfig:
+     with open(path, "r") as f:
+         data = yaml.safe_load(f)
+     return AppConfig(**data)
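To illustrate the shape load_config expects in crawl.yml, a small sketch that feeds an inline YAML document through the same Pydantic models; the project name and URL are made up.

import yaml
from crawl4md.config import AppConfig

# Hypothetical configuration mirroring the models above; crawl.yml would hold the same structure.
raw = """
projects:
  demo:
    type: pages
    sources:
      - https://example.org/page
    crawl:
      parse_type: markdown-fit
    preprocessing:
      markdown:
        enabled: true
        ensure_h1: true
"""

config = AppConfig(**yaml.safe_load(raw))
print(config.projects["demo"].crawl.parse_type)  # markdown-fit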
crawl4md/convert/__init__.py ADDED
@@ -0,0 +1 @@
+
crawl4md/convert/markdown.py ADDED
@@ -0,0 +1,63 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+ from crawl4ai.content_filter_strategy import PruningContentFilter
+ from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+ import asyncio
+
+ from ..config import MarkdownPreprocessingConfig, ParseType
+ from .preprocessing import MarkdownPreprocessing
+
+
+ class MarkdownConverter:
+     def __init__(
+         self,
+         config: MarkdownPreprocessingConfig,
+         parse_type: ParseType = "markdown",
+     ) -> None:
+         self.config = config
+         self.parse_type = parse_type
+
+     async def convert(
+         self,
+         html: str,
+         url: str | None = None,
+     ) -> str:
+         raw_html_url = f"raw:{html}"
+
+         if self.parse_type == "markdown-fit":
+             markdown_generator = DefaultMarkdownGenerator(
+                 content_filter=PruningContentFilter(threshold=0.5),
+                 options={"ignore_links": False},
+             )
+             crawler_config = CrawlerRunConfig(markdown_generator=markdown_generator)
+         else:
+             crawler_config = CrawlerRunConfig()
+
+         async with AsyncWebCrawler() as crawler:
+             result = await crawler.arun(url=raw_html_url, config=crawler_config)
+
+         if self.parse_type == "markdown-fit":
+             markdown = result.markdown.fit_markdown or result.markdown.raw_markdown or ""
+         else:
+             markdown = result.markdown.raw_markdown or ""
+
+         preprocessing = MarkdownPreprocessing(self.config)
+
+         return preprocessing.process(markdown, url=url, html=html)
+
+     def convert_sync(
+         self,
+         html: str,
+         url: str | None = None,
+     ) -> str:
+         return asyncio.run(self.convert(html=html, url=url))
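A short sketch of the converter's async path, assuming crawl4ai and its browser backend are installed; the HTML snippet and URL are illustrative.

import asyncio
from crawl4md import MarkdownConverter, MarkdownPreprocessingConfig

async def demo() -> None:
    converter = MarkdownConverter(
        config=MarkdownPreprocessingConfig(enabled=True, remove_html_comments=True),
        parse_type="markdown",
    )
    html = "<html><body><h1>Title</h1><!-- hidden --><p>Body</p></body></html>"
    # The HTML is passed to crawl4ai via the raw: URL scheme, then preprocessed.
    print(await converter.convert(html, url="https://example.org/title"))

asyncio.run(demo())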
crawl4md/convert/preprocessing/__init__.py ADDED
@@ -0,0 +1 @@
+ from .markdown import MarkdownPreprocessing as MarkdownPreprocessing
crawl4md/convert/preprocessing/helpers/title_html_parser.py ADDED
@@ -0,0 +1,40 @@
+ from html.parser import HTMLParser
+
+
+ class _TitleHTMLParser(HTMLParser):
+     def __init__(self) -> None:
+         super().__init__(convert_charrefs=True)
+         self._active_tag: str | None = None
+         self._capturing_h1 = False
+         self._seen_h1 = False
+         self._h1_parts: list[str] = []
+         self._title_parts: list[str] = []
+
+     def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+         self._active_tag = tag
+
+         if tag == "h1" and not self._seen_h1:
+             self._capturing_h1 = True
+
+     def handle_endtag(self, tag: str) -> None:
+         if tag == "h1" and self._capturing_h1:
+             self._capturing_h1 = False
+             self._seen_h1 = True
+
+         if tag == self._active_tag:
+             self._active_tag = None
+
+     def handle_data(self, data: str) -> None:
+         if self._capturing_h1:
+             self._h1_parts.append(data)
+
+         if self._active_tag == "title":
+             self._title_parts.append(data)
+
+     @property
+     def h1_text(self) -> str:
+         return "".join(self._h1_parts)
+
+     @property
+     def title_text(self) -> str:
+         return "".join(self._title_parts)
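A quick sketch of this helper in isolation: it captures the text of the first h1 element and of the title element, which RuleEnsureH1 later prefers in that order. The sample markup is illustrative.

from crawl4md.convert.preprocessing.helpers.title_html_parser import _TitleHTMLParser

parser = _TitleHTMLParser()
parser.feed("<html><head><title>Fallback</title></head><body><h1>Main Heading</h1></body></html>")
parser.close()

print(parser.h1_text)     # Main Heading
print(parser.title_text)  # Fallback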
crawl4md/convert/preprocessing/markdown.py ADDED
@@ -0,0 +1,62 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ from .rules.base.rule_base import RuleBase
+ from .rules.ensure_h1 import RuleEnsureH1
+ from .rules.normalize_whitespace import RuleNormalizeWhitespace
+ from .rules.remove_html_comments import RuleRemoveHtmlComments
+ from .rules.remove_jump_to_content import RuleRemoveJumpToContent
+ from .rules.remove_reference_sections import RuleRemoveReferenceSections
+ from .rules.remove_wiki_loves_earth_banner import RuleRemoveWikiLovesEarthBanner
+ from .rules.remove_wikipedia_subtitle import RuleRemoveWikipediaSubtitle
+ from crawl4md.config import MarkdownPreprocessingConfig
+
+
+ class MarkdownPreprocessing:
+     def __init__(self, config: MarkdownPreprocessingConfig):
+         self.config = config
+         self.rules: list[RuleBase] = []
+
+         if config.remove_jump_to_content:
+             self.rules.append(RuleRemoveJumpToContent(config))
+
+         if config.remove_html_comments:
+             self.rules.append(RuleRemoveHtmlComments(config))
+
+         if config.remove_wikipedia_subtitle:
+             self.rules.append(RuleRemoveWikipediaSubtitle(config))
+
+         if config.remove_wiki_loves_earth_banner:
+             self.rules.append(RuleRemoveWikiLovesEarthBanner(config))
+
+         if config.remove_reference_sections:
+             self.rules.append(RuleRemoveReferenceSections(config))
+
+         if config.ensure_h1:
+             self.rules.append(RuleEnsureH1(config))
+
+         if config.normalize_whitespace:
+             self.rules.append(RuleNormalizeWhitespace(config))
+
+     def process(
+         self,
+         markdown: str,
+         *,
+         url: str | None = None,
+         html: str | None = None,
+     ) -> str:
+         if not self.config.enabled:
+             return markdown
+
+         for rule in self.rules:
+             markdown = rule.apply(markdown, url=url, html=html)
+
+         return markdown
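A minimal sketch of the pipeline above: rules are only instantiated for flags that are set, and process() is a no-op unless enabled is true. The input text, URL, and HTML are illustrative.

from crawl4md.config import MarkdownPreprocessingConfig
from crawl4md.convert.preprocessing import MarkdownPreprocessing

config = MarkdownPreprocessingConfig(
    enabled=True,
    remove_html_comments=True,
    ensure_h1=True,
)

pipeline = MarkdownPreprocessing(config)
result = pipeline.process(
    "Some text<!-- comment -->",
    url="https://example.org/demo-page",
    html="<html><head><title>Demo Page</title></head><body></body></html>",
)
print(result)  # "# Demo Page" is prepended from the HTML title and the comment is removed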
crawl4md/convert/preprocessing/rules/base/__init__.py ADDED
File without changes
crawl4md/convert/preprocessing/rules/base/rule_base.py ADDED
@@ -0,0 +1,83 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ import re
+
+ from html import unescape
+ from urllib.parse import unquote, urljoin, urlparse
+ from urllib.request import urlopen
+
+ from crawl4md.config import MarkdownPreprocessingConfig
+ from crawl4md.convert.preprocessing.helpers.title_html_parser import _TitleHTMLParser
+
+
+ class RuleBase:
+     MARKDOWN_LINK_PATTERN = re.compile(
+         r"\[(.*?)\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)",
+         re.DOTALL,
+     )
+     H1_PATTERN = re.compile(r"^# ", re.MULTILINE)
+     HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.*?)\s*$")
+     TRAILING_ANCHOR_PATTERN = re.compile(r"\s*\{#[^}]+\}\s*$")
+     LEADING_NUMBER_PATTERN = re.compile(r"^\d+(?:[.)]\s*|\s+)")
+
+     def __init__(self, config: MarkdownPreprocessingConfig):
+         self.config = config
+
+     def apply(
+         self,
+         markdown: str,
+         *,
+         url: str | None = None,
+         html: str | None = None,
+     ) -> str:
+         raise NotImplementedError
+
+     def join_lines(self, lines: list[str], original: str) -> str:
+         suffix = "\n" if original.endswith("\n") else ""
+         return "\n".join(lines) + suffix
+
+     def normalize_heading(self, heading: str) -> str:
+         normalized = self.TRAILING_ANCHOR_PATTERN.sub("", heading).strip().casefold()
+         normalized = self.LEADING_NUMBER_PATTERN.sub("", normalized)
+         return " ".join(normalized.split())
+
+     def has_h1(self, markdown: str) -> bool:
+         return bool(self.H1_PATTERN.search(markdown))
+
+     def normalize_title(self, value: str) -> str | None:
+         normalized = " ".join(unescape(value).split()).strip()
+         return normalized or None
+
+     def extract_title_from_html(self, html: str) -> str | None:
+         parser = _TitleHTMLParser()
+         parser.feed(html)
+         parser.close()
+         return self.normalize_title(parser.h1_text) or self.normalize_title(parser.title_text)
+
+     def fallback_title_from_url(self, url: str) -> str:
+         parsed = urlparse(url)
+         segment = parsed.path.rstrip("/").rsplit("/", maxsplit=1)[-1]
+         candidate = unquote(segment).replace("-", " ").replace("_", " ")
+         normalized = self.normalize_title(candidate)
+
+         if normalized:
+             return normalized
+
+         return parsed.netloc or "index"
+
+     def fetch_html(self, url: str) -> str | None:
+         with urlopen(url, timeout=30) as response:
+             charset = response.headers.get_content_charset() or "utf-8"
+             return response.read().decode(charset, errors="replace")
+
+     def resolve_url(self, page_url: str, link_target: str):
+         return urlparse(urljoin(page_url, link_target))
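RuleBase leaves apply() abstract and supplies the shared patterns and helpers. As a sketch of how a rule builds on it, here is a hypothetical extra rule that is not part of the package:

from crawl4md.config import MarkdownPreprocessingConfig
from crawl4md.convert.preprocessing.rules.base.rule_base import RuleBase

class RuleStripTrailingSpaces(RuleBase):
    """Illustrative rule: trims trailing whitespace from every line."""

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        lines = [line.rstrip() for line in markdown.splitlines()]
        # join_lines preserves the original trailing newline, if any.
        return self.join_lines(lines, markdown)

rule = RuleStripTrailingSpaces(MarkdownPreprocessingConfig())
print(rule.apply("# Title   \ntext  \n"))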
crawl4md/convert/preprocessing/rules/ensure_h1.py ADDED
@@ -0,0 +1,45 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ from .base.rule_base import RuleBase
+
+
+ class RuleEnsureH1(RuleBase):
+     def apply(
+         self,
+         markdown: str,
+         *,
+         url: str | None = None,
+         html: str | None = None,
+     ) -> str:
+         if self.has_h1(markdown):
+             return markdown
+
+         title = None
+         if html:
+             title = self.extract_title_from_html(html)
+
+         if not title and url:
+             try:
+                 fetched_html = self.fetch_html(url)
+             except Exception:
+                 fetched_html = None
+
+             if fetched_html:
+                 title = self.extract_title_from_html(fetched_html)
+
+         if not title and url:
+             title = self.fallback_title_from_url(url)
+
+         if not title:
+             title = "index"
+
+         return f"# {title}\n\n{markdown}"
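A small sketch of the fallback chain: an existing H1 wins, then the first h1 or title of the supplied HTML, then HTML fetched from the URL, then a slug derived from the URL path. The inputs below are illustrative; HTML is supplied so the sketch does not hit the network.

from crawl4md.config import MarkdownPreprocessingConfig
from crawl4md.convert.preprocessing.rules.ensure_h1 import RuleEnsureH1

rule = RuleEnsureH1(MarkdownPreprocessingConfig(ensure_h1=True))

markdown = rule.apply(
    "Some body text.",
    url="https://example.org/wiki/Some_Article",
    html="<html><head><title>Some Article - Example Wiki</title></head><body></body></html>",
)
print(markdown)  # "# Some Article - Example Wiki" followed by the original text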
crawl4md/convert/preprocessing/rules/normalize_whitespace.py ADDED
@@ -0,0 +1,140 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ import re
+
+ from .base.rule_base import RuleBase
+
+
+ TABLE_CELL_PATTERN = re.compile(r"^:?-{3,}:?$")
+ MISSING_SPACE_BEFORE_PAREN_PATTERN = re.compile(r"(?<=[\w\)])\(")
+
+
+ class RuleNormalizeWhitespace(RuleBase):
+     def apply(
+         self,
+         markdown: str,
+         *,
+         url: str | None = None,
+         html: str | None = None,
+     ) -> str:
+         lines = markdown.splitlines()
+         blocks: list[str] = []
+         index = 0
+
+         while index < len(lines):
+             line = lines[index]
+
+             if not line.strip():
+                 index += 1
+                 continue
+
+             if self._is_fence(line):
+                 block_lines = [line]
+                 index += 1
+
+                 while index < len(lines):
+                     block_lines.append(lines[index])
+                     if self._is_fence(lines[index]):
+                         index += 1
+                         break
+                     index += 1
+
+                 blocks.append("\n".join(block_lines))
+                 continue
+
+             if self.HEADING_PATTERN.match(line):
+                 blocks.append(self._normalize_line(line))
+                 index += 1
+                 continue
+
+             if self._is_table_start(lines, index):
+                 block_lines = [self._normalize_line(lines[index])]
+                 index += 1
+
+                 while index < len(lines):
+                     current = lines[index]
+                     if not current.strip():
+                         break
+                     if self.HEADING_PATTERN.match(current) or self._is_fence(current):
+                         break
+                     if "|" not in current:
+                         break
+
+                     block_lines.append(self._normalize_line(current))
+                     index += 1
+
+                 blocks.append("\n".join(block_lines))
+                 continue
+
+             block_lines = [self._normalize_line(line)]
+             index += 1
+
+             while index < len(lines):
+                 current = lines[index]
+                 if not current.strip():
+                     index += 1
+                     break
+                 if self.HEADING_PATTERN.match(current):
+                     break
+                 if self._is_fence(current):
+                     break
+                 if self._is_table_start(lines, index):
+                     break
+
+                 block_lines.append(self._normalize_line(current))
+                 index += 1
+
+             blocks.append("\n\n".join(block_lines))
+
+         if not blocks:
+             return ""
+
+         return "\n\n".join(blocks) + "\n"
+
+     def _is_fence(self, line: str) -> bool:
+         stripped = line.lstrip()
+         return stripped.startswith("```") or stripped.startswith("~~~")
+
+     def _is_table_start(self, lines: list[str], index: int) -> bool:
+         if index + 1 >= len(lines):
+             return False
+
+         if "|" not in lines[index]:
+             return False
+
+         separator_cells = [
+             cell.strip()
+             for cell in lines[index + 1].strip().strip("|").split("|")
+         ]
+
+         if not separator_cells or any(not cell for cell in separator_cells):
+             return False
+
+         return all(TABLE_CELL_PATTERN.match(cell) for cell in separator_cells)
+
+     def _normalize_line(self, line: str) -> str:
+         line = line.rstrip()
+         parts: list[str] = []
+         last_end = 0
+
+         for match in self.MARKDOWN_LINK_PATTERN.finditer(line):
+             parts.append(line[last_end:match.start()])
+
+             if match.start() > 0 and not line[match.start() - 1].isspace() and line[match.start() - 1] != "!":
+                 parts.append(" ")
+
+             parts.append(match.group(0))
+             last_end = match.end()
+
+         parts.append(line[last_end:])
+         normalized = "".join(parts)
+         return MISSING_SPACE_BEFORE_PAREN_PATTERN.sub(" (", normalized)
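To make the block logic concrete, a small before/after sketch with an illustrative input: the heading and the fenced block each stay intact as single blocks, consecutive text lines are separated into their own paragraphs, and a missing space before an opening parenthesis is inserted.

from crawl4md.config import MarkdownPreprocessingConfig
from crawl4md.convert.preprocessing.rules.normalize_whitespace import RuleNormalizeWhitespace

rule = RuleNormalizeWhitespace(MarkdownPreprocessingConfig(normalize_whitespace=True))

text = "# Heading\nFirst line(no space before paren)\nSecond line\n\n~~~\ncode stays as-is\n~~~\n"
print(rule.apply(text))
# # Heading
#
# First line (no space before paren)
#
# Second line
#
# ~~~
# code stays as-is
# ~~~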
crawl4md/convert/preprocessing/rules/remove_html_comments.py ADDED
@@ -0,0 +1,28 @@
+ # This file is part of the https://github.com/ixnode/crawl4md project.
+ #
+ # (c) 2026 Björn Hempel <bjoern@hempel.li>
+ #
+ # For the full copyright and license information, please view the LICENSE.md
+ # file that was distributed with this source code.
+ #
+ # @author: Björn Hempel <bjoern@hempel.li>
+ # @version: 1.0.0 (2026-05-02)
+ # @since 1.0.0 (2026-05-02) First version
+
+ import re
+
+ from .base.rule_base import RuleBase
+
+
+ HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)
+
+
+ class RuleRemoveHtmlComments(RuleBase):
+     def apply(
+         self,
+         markdown: str,
+         *,
+         url: str | None = None,
+         html: str | None = None,
+     ) -> str:
+         return HTML_COMMENT_PATTERN.sub("", markdown)
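A one-line sketch of the rule with an illustrative input; because the pattern uses re.DOTALL, comments spanning multiple lines are removed as well.

from crawl4md.config import MarkdownPreprocessingConfig
from crawl4md.convert.preprocessing.rules.remove_html_comments import RuleRemoveHtmlComments

rule = RuleRemoveHtmlComments(MarkdownPreprocessingConfig())
print(rule.apply("Keep this.<!-- drop\nthis -->Keep that."))  # Keep this.Keep that.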