crawl4md 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crawl4md/__init__.py +11 -0
- crawl4md/check.py +20 -0
- crawl4md/cli.py +93 -0
- crawl4md/config.py +54 -0
- crawl4md/convert/__init__.py +1 -0
- crawl4md/convert/markdown.py +63 -0
- crawl4md/convert/preprocessing/__init__.py +1 -0
- crawl4md/convert/preprocessing/helpers/__init__.py +1 -0
- crawl4md/convert/preprocessing/helpers/title_html_parser.py +40 -0
- crawl4md/convert/preprocessing/markdown.py +62 -0
- crawl4md/convert/preprocessing/rules/__init__.py +1 -0
- crawl4md/convert/preprocessing/rules/base/__init__.py +0 -0
- crawl4md/convert/preprocessing/rules/base/rule_base.py +83 -0
- crawl4md/convert/preprocessing/rules/ensure_h1.py +45 -0
- crawl4md/convert/preprocessing/rules/normalize_whitespace.py +140 -0
- crawl4md/convert/preprocessing/rules/remove_html_comments.py +28 -0
- crawl4md/convert/preprocessing/rules/remove_jump_to_content.py +68 -0
- crawl4md/convert/preprocessing/rules/remove_reference_sections.py +47 -0
- crawl4md/convert/preprocessing/rules/remove_wiki_loves_earth_banner.py +49 -0
- crawl4md/convert/preprocessing/rules/remove_wikipedia_subtitle.py +40 -0
- crawl4md/fetch/__init__.py +1 -0
- crawl4md/fetch/html.py +57 -0
- crawl4md/fetch/markdown.py +59 -0
- crawl4md/fetch/normalize/__init__.py +0 -0
- crawl4md/fetch/normalize/base/__init__.py +2 -0
- crawl4md/fetch/normalize/base/normalizer_base.py +16 -0
- crawl4md/fetch/normalize/mediawiki_entity.py +31 -0
- crawl4md/fetch/normalize/mediawiki_hidden_span.py +31 -0
- crawl4md/fetch/normalize/url.py +42 -0
- crawl4md/paths.py +24 -0
- crawl4md/sitemap.py +34 -0
- crawl4md/writer.py +17 -0
- crawl4md-0.1.2.dist-info/METADATA +336 -0
- crawl4md-0.1.2.dist-info/RECORD +37 -0
- crawl4md-0.1.2.dist-info/WHEEL +4 -0
- crawl4md-0.1.2.dist-info/entry_points.txt +3 -0
- crawl4md-0.1.2.dist-info/licenses/LICENSE.md +21 -0
crawl4md/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
from .config import MarkdownPreprocessingConfig, ParseType
|
|
2
|
+
from .convert.markdown import MarkdownConverter
|
|
3
|
+
from .fetch.markdown import MarkdownFetcher
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
# Names re-exported as the package's public API.
__all__ = [
    "MarkdownConverter",
    "MarkdownFetcher",
    "MarkdownPreprocessingConfig",
    "ParseType",
]
|
crawl4md/check.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import subprocess
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
def main() -> int:
    """Run the project's quality gates (unit tests, then ruff) in sequence.

    Stops at the first failing command.

    Returns:
        0 when every command succeeded, otherwise the exit code of the first
        failing command, or 127 when a command's executable is not installed.
    """
    commands = [
        [sys.executable, "-m", "unittest", "discover", "-s", "tests", "-v"],
        ["ruff", "check"],
    ]

    for command in commands:
        try:
            result = subprocess.run(command)
        except FileNotFoundError:
            # e.g. "ruff" not installed: report cleanly instead of a traceback.
            # 127 mirrors the shell convention for "command not found".
            print(f"command not found: {command[0]}", file=sys.stderr)
            return 127
        if result.returncode != 0:
            return result.returncode

    return 0
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
if __name__ == "__main__":
    # Propagate the aggregated exit status to the shell when run directly.
    raise SystemExit(main())
|
crawl4md/cli.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import typer
|
|
13
|
+
import asyncio
|
|
14
|
+
import warnings
|
|
15
|
+
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
from urllib.parse import urlparse
|
|
18
|
+
|
|
19
|
+
from .config import load_config
|
|
20
|
+
from .fetch.markdown import MarkdownFetcher
|
|
21
|
+
from .paths import url_to_path
|
|
22
|
+
from .sitemap import parse_sitemap
|
|
23
|
+
from .writer import write_markdown
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Suppress SyntaxWarning noise originating from the crawl4ai package so the
# CLI output stays clean.
warnings.filterwarnings(
    "ignore",
    category=SyntaxWarning,
    module="crawl4ai"
)

# Typer application object; commands are registered via @app.command().
app = typer.Typer()
|
|
33
|
+
|
|
34
|
+
def pretty_name(url: str) -> str:
    """Return the last path segment of *url*, or ``"index"`` for bare roots."""
    last_segment = Path(urlparse(url).path).name
    return last_segment if last_segment else "index"
|
|
36
|
+
|
|
37
|
+
@app.command()
def crawl(project: str):
    """Crawl every URL of *project* and write each page as markdown below docs/.

    Args:
        project: Key of a project defined in the configuration file.
    """
    config = load_config()

    if project not in config.projects:
        typer.echo(f"Project '{project}' not found")
        raise typer.Exit(1)

    proj = config.projects[project]
    fetcher = MarkdownFetcher(
        config=proj.preprocessing.markdown,
        parse_type=proj.crawl.parse_type,
    )

    # Collect URLs
    urls: list[str] = []

    if proj.type == "pages":
        # "pages" projects list the page URLs directly.
        urls = proj.sources

    elif proj.type == "sitemap":
        # "sitemap" projects list sitemap URLs whose entries are expanded here.
        for sitemap_url in proj.sources:
            urls.extend(parse_sitemap(sitemap_url))

    # deduplicate (dict.fromkeys preserves first-seen order)
    urls = list(dict.fromkeys(urls))

    total = len(urls)
    success = 0
    failed = 0

    for i, url in enumerate(urls, start=1):
        name = pretty_name(url)
        typer.echo(f"[{i}/{total}] {name}")

        try:
            typer.echo(" → Fetching...", nl=False)
            # Each fetch runs in its own short-lived event loop.
            md = asyncio.run(fetcher.fetch(url))
            typer.echo(" done")

            path = url_to_path(Path("docs"), project, url)

            typer.echo(f" → Writing... {path}")
            write_markdown(path, md)

            success += 1

        except Exception as e:
            # Best-effort crawl: report the failure and continue with the next URL.
            typer.echo(f" → Error: {e}")
            failed += 1

        typer.echo("")

    typer.echo("Done.")
    typer.echo(f"✔ Success: {success}")
    typer.echo(f"✖ Failed: {failed}")
    typer.echo(f"Output: docs/{project}")
|
crawl4md/config.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import yaml
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
from typing import Literal
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Markdown extraction mode: plain markdown or content-pruned "fit" markdown.
ParseType = Literal["markdown", "markdown-fit"]
|
|
19
|
+
|
|
20
|
+
class CrawlConfig(BaseModel):
    """Crawler behaviour for a single project."""

    # Markdown extraction mode passed through to the fetcher.
    parse_type: ParseType = "markdown"
|
|
22
|
+
|
|
23
|
+
class MarkdownPreprocessingConfig(BaseModel):
    """Opt-in feature flags for the markdown preprocessing pipeline.

    Nothing runs unless ``enabled`` is True; each flag below activates the
    correspondingly named rule in ``convert/preprocessing/rules``.
    """

    # Master switch: when False, preprocessing is skipped entirely.
    enabled: bool = False

    # Enable the ensure_h1 rule (prepend a level-1 heading when missing).
    ensure_h1: bool = False
    # Enable the remove_jump_to_content rule.
    remove_jump_to_content: bool = False
    # Enable the remove_wikipedia_subtitle rule.
    remove_wikipedia_subtitle: bool = False
    # Enable the remove_wiki_loves_earth_banner rule.
    remove_wiki_loves_earth_banner: bool = False
    # Enable the remove_reference_sections rule.
    remove_reference_sections: bool = False
    # Enable the remove_html_comments rule.
    remove_html_comments: bool = False
    # Enable the normalize_whitespace rule.
    normalize_whitespace: bool = False

    # Extra heading titles consumed by the reference-sections rule
    # (see rules/remove_reference_sections.py for the matching logic).
    reference_headings: list[str] = Field(default_factory=list)
|
|
35
|
+
|
|
36
|
+
class PreprocessingConfig(BaseModel):
    """Container for per-format preprocessing settings."""

    # Settings for the markdown preprocessing pipeline.
    markdown: MarkdownPreprocessingConfig = Field(
        default_factory=MarkdownPreprocessingConfig
    )
|
|
40
|
+
|
|
41
|
+
class ProjectConfig(BaseModel):
    """Configuration of a single crawl project."""

    # How *sources* is interpreted: sitemap URLs or direct page URLs.
    type: Literal["sitemap", "pages"]
    # Sitemap URLs (type == "sitemap") or page URLs (type == "pages").
    sources: list[str]

    # Crawler behaviour (extraction mode).
    crawl: CrawlConfig = Field(default_factory=CrawlConfig)
    # Preprocessing applied to the crawled output.
    preprocessing: PreprocessingConfig = Field(default_factory=PreprocessingConfig)
|
|
47
|
+
|
|
48
|
+
class AppConfig(BaseModel):
    """Top-level application configuration: all projects keyed by name."""

    # Project name -> project configuration.
    projects: dict[str, ProjectConfig]
|
|
50
|
+
|
|
51
|
+
def load_config(path: str = "crawl.yml") -> AppConfig:
    """Load and validate the application configuration from a YAML file.

    Args:
        path: Path to the YAML configuration file.

    Returns:
        The parsed and validated configuration.

    Raises:
        FileNotFoundError: If *path* does not exist.
        pydantic.ValidationError: If the content does not match the schema.
    """
    # Explicit encoding avoids locale-dependent decoding of the YAML file.
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f)
    # safe_load returns None for an empty file; normalize to a dict so the
    # user gets a clear validation error ("projects missing") instead of a
    # TypeError from ** unpacking None.
    return AppConfig(**(data or {}))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
13
|
+
from crawl4ai.content_filter_strategy import PruningContentFilter
|
|
14
|
+
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
15
|
+
import asyncio
|
|
16
|
+
|
|
17
|
+
from ..config import MarkdownPreprocessingConfig, ParseType
|
|
18
|
+
from .preprocessing import MarkdownPreprocessing
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MarkdownConverter:
    """Convert raw HTML to markdown via crawl4ai, then run preprocessing."""

    def __init__(
        self,
        config: MarkdownPreprocessingConfig,
        parse_type: ParseType = "markdown",
    ) -> None:
        self.config = config
        self.parse_type = parse_type

    def _build_run_config(self) -> "CrawlerRunConfig":
        """Assemble the crawler run configuration for the chosen parse type."""
        if self.parse_type != "markdown-fit":
            return CrawlerRunConfig()
        # "markdown-fit" prunes low-value content before markdown generation.
        generator = DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.5),
            options={"ignore_links": False},
        )
        return CrawlerRunConfig(markdown_generator=generator)

    def _pick_markdown(self, result) -> str:
        """Select the markdown variant matching the parse type, never None."""
        if self.parse_type == "markdown-fit":
            return result.markdown.fit_markdown or result.markdown.raw_markdown or ""
        return result.markdown.raw_markdown or ""

    async def convert(
        self,
        html: str,
        url: str | None = None,
    ) -> str:
        """Render *html* to markdown and apply the configured preprocessing.

        Args:
            html: The raw HTML document.
            url: Optional source URL, forwarded to preprocessing rules.

        Returns:
            The (optionally preprocessed) markdown text.
        """
        # crawl4ai's "raw:" scheme feeds literal HTML instead of fetching a URL.
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun(
                url=f"raw:{html}",
                config=self._build_run_config(),
            )

        markdown = self._pick_markdown(result)
        return MarkdownPreprocessing(self.config).process(markdown, url=url, html=html)

    def convert_sync(
        self,
        html: str,
        url: str | None = None,
    ) -> str:
        """Blocking wrapper around :meth:`convert`; spawns its own event loop."""
        return asyncio.run(self.convert(html=html, url=url))
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from .markdown import MarkdownPreprocessing as MarkdownPreprocessing
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
from html.parser import HTMLParser
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class _TitleHTMLParser(HTMLParser):
|
|
5
|
+
def __init__(self) -> None:
|
|
6
|
+
super().__init__(convert_charrefs=True)
|
|
7
|
+
self._active_tag: str | None = None
|
|
8
|
+
self._capturing_h1 = False
|
|
9
|
+
self._seen_h1 = False
|
|
10
|
+
self._h1_parts: list[str] = []
|
|
11
|
+
self._title_parts: list[str] = []
|
|
12
|
+
|
|
13
|
+
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
14
|
+
self._active_tag = tag
|
|
15
|
+
|
|
16
|
+
if tag == "h1" and not self._seen_h1:
|
|
17
|
+
self._capturing_h1 = True
|
|
18
|
+
|
|
19
|
+
def handle_endtag(self, tag: str) -> None:
|
|
20
|
+
if tag == "h1" and self._capturing_h1:
|
|
21
|
+
self._capturing_h1 = False
|
|
22
|
+
self._seen_h1 = True
|
|
23
|
+
|
|
24
|
+
if tag == self._active_tag:
|
|
25
|
+
self._active_tag = None
|
|
26
|
+
|
|
27
|
+
def handle_data(self, data: str) -> None:
|
|
28
|
+
if self._capturing_h1:
|
|
29
|
+
self._h1_parts.append(data)
|
|
30
|
+
|
|
31
|
+
if self._active_tag == "title":
|
|
32
|
+
self._title_parts.append(data)
|
|
33
|
+
|
|
34
|
+
@property
|
|
35
|
+
def h1_text(self) -> str:
|
|
36
|
+
return "".join(self._h1_parts)
|
|
37
|
+
|
|
38
|
+
@property
|
|
39
|
+
def title_text(self) -> str:
|
|
40
|
+
return "".join(self._title_parts)
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
from .rules.base.rule_base import RuleBase
|
|
13
|
+
from .rules.ensure_h1 import RuleEnsureH1
|
|
14
|
+
from .rules.normalize_whitespace import RuleNormalizeWhitespace
|
|
15
|
+
from .rules.remove_html_comments import RuleRemoveHtmlComments
|
|
16
|
+
from .rules.remove_jump_to_content import RuleRemoveJumpToContent
|
|
17
|
+
from .rules.remove_reference_sections import RuleRemoveReferenceSections
|
|
18
|
+
from .rules.remove_wiki_loves_earth_banner import RuleRemoveWikiLovesEarthBanner
|
|
19
|
+
from .rules.remove_wikipedia_subtitle import RuleRemoveWikipediaSubtitle
|
|
20
|
+
from crawl4md.config import MarkdownPreprocessingConfig
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class MarkdownPreprocessing:
    """Apply the configured preprocessing rules to a markdown document."""

    def __init__(self, config: MarkdownPreprocessingConfig):
        self.config = config

        # (flag, rule class) pairs in the exact order the rules must run.
        rule_table: list[tuple[bool, type[RuleBase]]] = [
            (config.remove_jump_to_content, RuleRemoveJumpToContent),
            (config.remove_html_comments, RuleRemoveHtmlComments),
            (config.remove_wikipedia_subtitle, RuleRemoveWikipediaSubtitle),
            (config.remove_wiki_loves_earth_banner, RuleRemoveWikiLovesEarthBanner),
            (config.remove_reference_sections, RuleRemoveReferenceSections),
            (config.ensure_h1, RuleEnsureH1),
            (config.normalize_whitespace, RuleNormalizeWhitespace),
        ]
        self.rules: list[RuleBase] = [
            rule_cls(config) for active, rule_cls in rule_table if active
        ]

    def process(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        """Run every active rule over *markdown*.

        Args:
            markdown: The document to transform.
            url: Optional source URL, forwarded to each rule.
            html: Optional original HTML, forwarded to each rule.

        Returns:
            The transformed markdown, or the input unchanged when
            preprocessing is disabled.
        """
        if not self.config.enabled:
            return markdown

        for rule in self.rules:
            markdown = rule.apply(markdown, url=url, html=html)

        return markdown
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
File without changes
|
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from html import unescape
|
|
15
|
+
from urllib.parse import unquote, urljoin, urlparse
|
|
16
|
+
from urllib.request import urlopen
|
|
17
|
+
|
|
18
|
+
from crawl4md.config import MarkdownPreprocessingConfig
|
|
19
|
+
from crawl4md.convert.preprocessing.helpers.title_html_parser import _TitleHTMLParser
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class RuleBase:
    """Shared helpers and compiled patterns for markdown preprocessing rules."""

    # Inline markdown link, optionally with a quoted title part.
    MARKDOWN_LINK_PATTERN = re.compile(
        r"\[(.*?)\]\(([^)\s]+)(?:\s+\"[^\"]*\")?\)",
        re.DOTALL,
    )
    # A line starting a level-1 heading.
    H1_PATTERN = re.compile(r"^# ", re.MULTILINE)
    # Any ATX heading line, capturing the hashes and the title text.
    HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.*?)\s*$")
    # Trailing anchor such as "{#section-id}".
    TRAILING_ANCHOR_PATTERN = re.compile(r"\s*\{#[^}]+\}\s*$")
    # Leading enumeration such as "1. ", "2) " or "3 ".
    LEADING_NUMBER_PATTERN = re.compile(r"^\d+(?:[.)]\s*|\s+)")

    def __init__(self, config: MarkdownPreprocessingConfig):
        self.config = config

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        """Transform *markdown*; concrete rules must override this."""
        raise NotImplementedError

    def join_lines(self, lines: list[str], original: str) -> str:
        """Join *lines*, preserving the trailing newline of *original* if present."""
        trailing = "\n" if original.endswith("\n") else ""
        return "\n".join(lines) + trailing

    def normalize_heading(self, heading: str) -> str:
        """Canonicalize a heading for comparison: strip anchors and leading
        numbering, casefold, and collapse internal whitespace."""
        text = self.TRAILING_ANCHOR_PATTERN.sub("", heading).strip().casefold()
        text = self.LEADING_NUMBER_PATTERN.sub("", text)
        return " ".join(text.split())

    def has_h1(self, markdown: str) -> bool:
        """Return True when the document already contains a level-1 heading."""
        return self.H1_PATTERN.search(markdown) is not None

    def normalize_title(self, value: str) -> str | None:
        """Unescape entities and collapse whitespace; None when nothing remains."""
        collapsed = " ".join(unescape(value).split()).strip()
        return collapsed if collapsed else None

    def extract_title_from_html(self, html: str) -> str | None:
        """Extract a title from *html*: first <h1> preferred, then <title>."""
        parser = _TitleHTMLParser()
        parser.feed(html)
        parser.close()
        from_h1 = self.normalize_title(parser.h1_text)
        return from_h1 or self.normalize_title(parser.title_text)

    def fallback_title_from_url(self, url: str) -> str:
        """Derive a human-readable title from the last URL path segment."""
        parsed = urlparse(url)
        last_segment = parsed.path.rstrip("/").rsplit("/", maxsplit=1)[-1]
        readable = unquote(last_segment).replace("-", " ").replace("_", " ")
        title = self.normalize_title(readable)

        if title:
            return title

        return parsed.netloc or "index"

    def fetch_html(self, url: str) -> str | None:
        """Download *url* and decode the body with its declared charset
        (UTF-8 fallback, undecodable bytes replaced)."""
        with urlopen(url, timeout=30) as response:
            encoding = response.headers.get_content_charset() or "utf-8"
            return response.read().decode(encoding, errors="replace")

    def resolve_url(self, page_url: str, link_target: str):
        """Resolve *link_target* against *page_url*; returns the parsed result."""
        return urlparse(urljoin(page_url, link_target))
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
from .base.rule_base import RuleBase
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class RuleEnsureH1(RuleBase):
    """Guarantee that the markdown starts with a level-1 heading.

    The heading text is resolved, in order, from: the supplied HTML, HTML
    fetched from the URL, the URL's last path segment, and finally the
    literal "index".
    """

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        if self.has_h1(markdown):
            return markdown

        title = self._determine_title(url=url, html=html)
        return f"# {title}\n\n{markdown}"

    def _determine_title(self, *, url: str | None, html: str | None) -> str:
        """Resolve a heading title via HTML, fetched HTML, URL, then 'index'."""
        if html:
            found = self.extract_title_from_html(html)
            if found:
                return found

        if url:
            try:
                page_html = self.fetch_html(url)
            except Exception:
                # Network problems must not break preprocessing.
                page_html = None

            if page_html:
                found = self.extract_title_from_html(page_html)
                if found:
                    return found

            # fallback_title_from_url never returns an empty string.
            return self.fallback_title_from_url(url)

        return "index"
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from .base.rule_base import RuleBase
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# A single table-separator cell: at least three dashes, optional alignment colons.
TABLE_CELL_PATTERN = re.compile(r"^:?-{3,}:?$")
# "(" glued directly to a word character or ")" — i.e. a missing space before it.
MISSING_SPACE_BEFORE_PAREN_PATTERN = re.compile(r"(?<=[\w\)])\(")


class RuleNormalizeWhitespace(RuleBase):
    """Re-flow the document into blocks separated by exactly one blank line.

    Code fences and tables are kept together as single verbatim blocks,
    headings stand alone, and each remaining plain line is promoted to its
    own paragraph.
    """

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        """Return *markdown* re-flowed; *url* and *html* are part of the common
        rule interface and unused here."""
        lines = markdown.splitlines()
        blocks: list[str] = []
        index = 0

        while index < len(lines):
            line = lines[index]

            # Blank lines only delimit blocks; they are never emitted directly.
            if not line.strip():
                index += 1
                continue

            # Code fence: copy everything up to and including the closing
            # fence without normalization.
            if self._is_fence(line):
                block_lines = [line]
                index += 1

                while index < len(lines):
                    block_lines.append(lines[index])
                    if self._is_fence(lines[index]):
                        index += 1
                        break
                    index += 1

                blocks.append("\n".join(block_lines))
                continue

            # A heading is always its own block.
            if self.HEADING_PATTERN.match(line):
                blocks.append(self._normalize_line(line))
                index += 1
                continue

            # Table: consume contiguous pipe-lines as one block joined by
            # single newlines so the table stays intact.
            if self._is_table_start(lines, index):
                block_lines = [self._normalize_line(lines[index])]
                index += 1

                while index < len(lines):
                    current = lines[index]
                    if not current.strip():
                        break
                    if self.HEADING_PATTERN.match(current) or self._is_fence(current):
                        break
                    if "|" not in current:
                        break

                    block_lines.append(self._normalize_line(current))
                    index += 1

                blocks.append("\n".join(block_lines))
                continue

            # Paragraph run: gather lines until the next structural element;
            # the "\n\n" join makes each source line its own paragraph.
            block_lines = [self._normalize_line(line)]
            index += 1

            while index < len(lines):
                current = lines[index]
                if not current.strip():
                    index += 1
                    break
                if self.HEADING_PATTERN.match(current):
                    break
                if self._is_fence(current):
                    break
                if self._is_table_start(lines, index):
                    break

                block_lines.append(self._normalize_line(current))
                index += 1

            blocks.append("\n\n".join(block_lines))

        if not blocks:
            return ""

        # Blocks are separated by one blank line; always end with a newline.
        return "\n\n".join(blocks) + "\n"

    def _is_fence(self, line: str) -> bool:
        """Return True for ``` or ~~~ fence lines (leading indent ignored)."""
        stripped = line.lstrip()
        return stripped.startswith("```") or stripped.startswith("~~~")

    def _is_table_start(self, lines: list[str], index: int) -> bool:
        """Return True when *index* begins a pipe table (header + separator row)."""
        if index + 1 >= len(lines):
            return False

        if "|" not in lines[index]:
            return False

        # Split the candidate separator row into its cells, dropping outer pipes.
        separator_cells = [
            cell.strip()
            for cell in lines[index + 1].strip().strip("|").split("|")
        ]

        if not separator_cells or any(not cell for cell in separator_cells):
            return False

        return all(TABLE_CELL_PATTERN.match(cell) for cell in separator_cells)

    def _normalize_line(self, line: str) -> str:
        """Trim trailing whitespace and insert missing spaces before inline
        links and opening parentheses."""
        line = line.rstrip()
        parts: list[str] = []
        last_end = 0

        # Insert a space before a markdown link glued to the preceding text,
        # except directly after "!" (the image syntax).
        for match in self.MARKDOWN_LINK_PATTERN.finditer(line):
            parts.append(line[last_end:match.start()])

            if match.start() > 0 and not line[match.start() - 1].isspace() and line[match.start() - 1] != "!":
                parts.append(" ")

            parts.append(match.group(0))
            last_end = match.end()

        parts.append(line[last_end:])
        normalized = "".join(parts)
        return MISSING_SPACE_BEFORE_PAREN_PATTERN.sub(" (", normalized)
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# This file is part of the https://github.com/ixnode/crawl4md project.
|
|
2
|
+
#
|
|
3
|
+
# (c) 2026 Björn Hempel <bjoern@hempel.li>
|
|
4
|
+
#
|
|
5
|
+
# For the full copyright and license information, please view the LICENSE.md
|
|
6
|
+
# file that was distributed with this source code.
|
|
7
|
+
#
|
|
8
|
+
# @author: Björn Hempel <bjoern@hempel.li>
|
|
9
|
+
# @version: 1.0.0 (2026-05-02)
|
|
10
|
+
# @since 1.0.0 (2026-05-02) First version
|
|
11
|
+
|
|
12
|
+
import re
|
|
13
|
+
|
|
14
|
+
from .base.rule_base import RuleBase
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
# Matches HTML comments, including ones spanning multiple lines (DOTALL).
HTML_COMMENT_PATTERN = re.compile(r"<!--.*?-->", re.DOTALL)


class RuleRemoveHtmlComments(RuleBase):
    """Delete every HTML comment (``<!-- ... -->``) from the markdown."""

    def apply(
        self,
        markdown: str,
        *,
        url: str | None = None,
        html: str | None = None,
    ) -> str:
        cleaned = HTML_COMMENT_PATTERN.sub("", markdown)
        return cleaned
|