llms-generator 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- llms_generator/__init__.py +1 -0
- llms_generator/__main__.py +4 -0
- llms_generator/_version.py +1 -0
- llms_generator/cli.py +77 -0
- llms_generator/crawler.py +196 -0
- llms_generator/generator.py +111 -0
- llms_generator/page_analyzer.py +110 -0
- llms_generator/section_grouper.py +38 -0
- llms_generator-0.1.0.dist-info/METADATA +203 -0
- llms_generator-0.1.0.dist-info/RECORD +14 -0
- llms_generator-0.1.0.dist-info/WHEEL +5 -0
- llms_generator-0.1.0.dist-info/entry_points.txt +2 -0
- llms_generator-0.1.0.dist-info/licenses/LICENSE +21 -0
- llms_generator-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from llms_generator._version import __version__
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.0"
|
llms_generator/cli.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import argparse
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from llms_generator.crawler import Crawler
|
|
5
|
+
from llms_generator.section_grouper import group_pages
|
|
6
|
+
from llms_generator.generator import generate_llms_txt
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
|
10
|
+
parser = argparse.ArgumentParser(
|
|
11
|
+
prog="llms-gen",
|
|
12
|
+
description="Crawl a website and generate llms.txt",
|
|
13
|
+
)
|
|
14
|
+
parser.add_argument("url", help="Target website URL")
|
|
15
|
+
parser.add_argument(
|
|
16
|
+
"--depth",
|
|
17
|
+
type=int,
|
|
18
|
+
default=2,
|
|
19
|
+
help="Maximum crawl depth (default: 2)",
|
|
20
|
+
)
|
|
21
|
+
parser.add_argument(
|
|
22
|
+
"--output",
|
|
23
|
+
default="llms.txt",
|
|
24
|
+
help="Output file path (default: llms.txt)",
|
|
25
|
+
)
|
|
26
|
+
parser.add_argument(
|
|
27
|
+
"--full",
|
|
28
|
+
action="store_true",
|
|
29
|
+
help="Also generate llms-full.txt with full page content",
|
|
30
|
+
)
|
|
31
|
+
parser.add_argument(
|
|
32
|
+
"--no-js",
|
|
33
|
+
action="store_true",
|
|
34
|
+
help="Skip Playwright JavaScript rendering fallback",
|
|
35
|
+
)
|
|
36
|
+
parser.add_argument(
|
|
37
|
+
"--delay",
|
|
38
|
+
type=float,
|
|
39
|
+
default=1.0,
|
|
40
|
+
help="Seconds between requests (default: 1.0)",
|
|
41
|
+
)
|
|
42
|
+
return parser.parse_args(argv)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def main(argv: list[str] | None = None) -> None:
    """Crawl the requested site and write ``llms.txt`` (and optionally the full variant).

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 1 when the crawl yields no pages, or from
            argparse on invalid arguments.
    """
    args = parse_args(argv)

    crawler = Crawler(
        start_url=args.url,
        max_depth=args.depth,
        delay=args.delay,
        use_js=not args.no_js,
    )
    pages = crawler.run()

    if not pages:
        print("No pages found. Check the URL and try again.", file=sys.stderr)
        sys.exit(1)

    sections = group_pages(pages)

    # The index file is always the compact form. Previously --full made this
    # file carry the inline page text too, so both outputs were identical.
    with open(args.output, "w", encoding="utf-8") as f:
        f.write(generate_llms_txt(sections, full=False))
    print(f"Wrote {args.output} with {len(pages)} pages across {len(sections)} sections.")

    if args.full:
        full_path = args.output.replace("llms.txt", "llms-full.txt")
        if full_path == args.output:
            # Output name does not contain "llms.txt"; use the default name.
            full_path = "llms-full.txt"
        with open(full_path, "w", encoding="utf-8") as f:
            f.write(generate_llms_txt(sections, full=True))
        print(f"Wrote {full_path}.")
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
if __name__ == "__main__":
|
|
77
|
+
main()
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import time
|
|
4
|
+
import urllib.robotparser
|
|
5
|
+
from collections import deque
|
|
6
|
+
from typing import Optional
|
|
7
|
+
from urllib.parse import urljoin, urlparse
|
|
8
|
+
|
|
9
|
+
import requests
|
|
10
|
+
from bs4 import BeautifulSoup, Tag
|
|
11
|
+
|
|
12
|
+
from llms_generator.page_analyzer import (
|
|
13
|
+
PageInfo,
|
|
14
|
+
RobotsDirectives,
|
|
15
|
+
extract_page_info,
|
|
16
|
+
parse_meta_robots,
|
|
17
|
+
parse_robots_header,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
USER_AGENT = "llms-generator/0.1.0"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class Crawler:
    """Breadth-first crawler limited to the host of the start URL.

    Pages are fetched with ``requests`` (falling back to Playwright when the
    request fails and ``use_js`` is set), filtered through robots.txt, the
    ``X-Robots-Tag`` header and ``<meta name="robots">``, and turned into
    ``PageInfo`` records.
    """

    def __init__(
        self,
        start_url: str,
        max_depth: int = 2,
        delay: float = 1.0,
        use_js: bool = True,
    ):
        """Store crawl settings and prepare an HTTP session.

        Args:
            start_url: URL to start from; a trailing ``/`` is removed.
            max_depth: Largest link distance from ``start_url`` to fetch.
            delay: Minimum number of seconds between page fetches.
            use_js: Whether the Playwright fallback may be used.
        """
        self.start_url = start_url.rstrip("/")
        self.max_depth = max_depth
        self.delay = delay
        self.use_js = use_js
        self._session = requests.Session()
        self._session.headers.update({"User-Agent": USER_AGENT})

        parsed = urlparse(self.start_url)
        self._base = f"{parsed.scheme}://{parsed.netloc}"

        self._rp: Optional[urllib.robotparser.RobotFileParser] = None
        self._visited: set[str] = set()
        self._pages: list[PageInfo] = []
        # URLs whose X-Robots-Tag header said "nofollow": analysed, not expanded.
        self._header_nofollow: set[str] = set()

    def run(self) -> list[PageInfo]:
        """Crawl from ``start_url`` and return the collected pages.

        Returns:
            The ``PageInfo`` objects of every fetched, indexable page in
            breadth-first order.
        """
        self._load_robots_txt()
        pause = self._request_delay()

        queue: deque[tuple[str, int]] = deque([(self.start_url, 0)])
        self._visited.add(self.start_url)
        fetched_any = False

        while queue:
            url, depth = queue.popleft()
            if depth > self.max_depth:
                continue
            if not self._is_allowed(url):
                continue

            # Pause before every fetch except the first, so failed or
            # non-HTML responses are rate-limited as well.
            if fetched_any and pause > 0:
                time.sleep(pause)
            fetched_any = True

            page = self._fetch_and_analyze(url, depth)
            if page is None:
                continue
            self._pages.append(page)

            if depth < self.max_depth and url not in self._header_nofollow:
                for link in self._extract_links(url, page.raw_html):
                    if link not in self._visited:
                        self._visited.add(link)
                        queue.append((link, depth + 1))

        return self._pages

    # ------------------------------------------------------------------
    # Robots.txt
    # ------------------------------------------------------------------
    def _load_robots_txt(self) -> None:
        """Fetch and parse ``/robots.txt``; on any failure crawl unrestricted."""
        rp: Optional[urllib.robotparser.RobotFileParser]
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(urljoin(self._base, "/robots.txt"))
        try:
            rp.read()
        except Exception:
            # Best effort: an unreadable robots.txt is treated as "no rules".
            rp = None
        self._rp = rp

    def _is_allowed(self, url: str) -> bool:
        """Return whether robots.txt permits fetching ``url`` (True if unknown)."""
        if self._rp is None:
            return True
        try:
            return self._rp.can_fetch(USER_AGENT, url)
        except Exception:
            return True

    def _request_delay(self) -> float:
        """Return the pause between fetches in seconds.

        This is the larger of the configured ``delay`` and the ``Crawl-delay``
        that robots.txt declares for this user agent, when one is present.
        """
        pause = self.delay
        if self._rp is None:
            return pause
        try:
            advertised = self._rp.crawl_delay(USER_AGENT)
        except Exception:
            advertised = None
        if advertised is not None:
            pause = max(pause, float(advertised))
        return pause

    # ------------------------------------------------------------------
    # Fetch + analyze
    # ------------------------------------------------------------------
    def _fetch_and_analyze(self, url: str, depth: int) -> PageInfo | None:
        """Fetch ``url`` and build its ``PageInfo``.

        Returns:
            ``None`` when the page could not be fetched or is marked
            ``noindex`` by its meta robots tag.
        """
        html = self._fetch(url)
        if html is None:
            return None

        # Check the exclusion first so excluded pages are not analysed.
        if self._check_robots_directives(html).noindex:
            return None

        return extract_page_info(url, html, depth)

    def _fetch(self, url: str) -> str | None:
        """Return the HTML of ``url``.

        Returns:
            ``None`` for non-HTML responses, for responses whose
            ``X-Robots-Tag`` header says ``noindex``, and when both the HTTP
            request and the Playwright fallback fail.
        """
        try:
            resp = self._session.get(url, timeout=30)
            resp.raise_for_status()
        except requests.RequestException:
            return self._fetch_with_playwright(url)

        content_type = (resp.headers.get("Content-Type") or "").lower()
        if "text/html" not in content_type:
            return None

        # Check X-Robots-Tag header
        x_robots = resp.headers.get("X-Robots-Tag")
        if x_robots:
            directives = parse_robots_header(x_robots)
            if directives.noindex:
                return None
            if directives.nofollow:
                # Remember it so run() keeps the page but skips its links.
                self._header_nofollow.add(url)

        return resp.text

    def _fetch_with_playwright(self, url: str) -> str | None:
        """Render ``url`` in headless Chromium and return the resulting HTML.

        Returns:
            ``None`` when JS rendering is disabled, Playwright is not
            installed, or navigation fails.
        """
        if not self.use_js:
            return None
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            return None

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page(user_agent=USER_AGENT)
                try:
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    content = page.content()
                except Exception:
                    return None
                finally:
                    browser.close()
                return content
        except Exception:
            return None

    # ------------------------------------------------------------------
    # Link extraction
    # ------------------------------------------------------------------
    def _extract_links(self, base_url: str, html: str) -> list[str]:
        """Return the same-host HTTP(S) links found in ``html``.

        Links are resolved against ``base_url``, stripped of fragments and
        de-duplicated in document order. A meta robots ``nofollow`` yields an
        empty list.
        """
        soup = BeautifulSoup(html, "html.parser")
        # Reuse the parsed document instead of parsing the HTML a second time.
        if parse_meta_robots(soup).nofollow:
            return []

        own_host = urlparse(self._base).netloc
        links: list[str] = []

        for a_tag in soup.find_all("a", href=True):
            if not isinstance(a_tag, Tag):
                continue
            href = a_tag["href"]
            if isinstance(href, (list, tuple)):
                href = href[0] if href else ""

            # Resolve relative URLs
            full = urljoin(base_url, str(href))
            parsed = urlparse(full)

            # Same domain only, skip non-HTTP(S)
            if parsed.netloc != own_host:
                continue
            if parsed.scheme not in ("http", "https"):
                continue

            links.append(self._strip_fragment(full))

        return list(dict.fromkeys(links))  # deduplicate, preserve order

    @staticmethod
    def _strip_fragment(url: str) -> str:
        """Return ``url`` without its ``#fragment`` part.

        ``str.rstrip`` must not be used here: it removes a *set of characters*
        from the end, so ``/page#age`` would be cut down to ``/p``.
        """
        return url.partition("#")[0]

    # ------------------------------------------------------------------
    # Robots directives from HTML
    # ------------------------------------------------------------------
    @staticmethod
    def _check_robots_directives(html: str) -> RobotsDirectives:
        """Parse ``html`` and return the directives of its meta robots tag."""
        soup = BeautifulSoup(html, "html.parser")
        return parse_meta_robots(soup)
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
from llms_generator.page_analyzer import PageInfo
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def generate_llms_txt(
    sections: dict[str, list[PageInfo]],
    full: bool = False,
) -> str:
    """Render grouped pages as llms.txt markdown.

    Args:
        sections: Mapping of section name to the pages in that section.
        full: When true, each page's ``full_text`` is emitted after its entry.

    Returns:
        The document text: an H1 site name, an optional blockquote summary,
        a fixed context paragraph, then one ``##`` section per non-empty group.
    """

    def one_line(text: str) -> str:
        # Headings, blockquotes and list entries must stay on a single line,
        # but <title> and meta descriptions may contain line breaks.
        return " ".join(text.split())

    lines: list[str] = []

    # H1 — site name taken from the first page title / H1 (domain as fallback)
    lines.append(f"# {one_line(_pick_site_name(sections))}")
    lines.append("")

    # Blockquote summary — first non-empty description
    summary = one_line(_pick_summary(sections))
    if summary:
        lines.append(f"> {summary}")
        lines.append("")

    # Optional context paragraph
    lines.append(
        "This file provides AI systems with a structured summary of this website. "
        "It is maintained automatically by llms-generator."
    )
    lines.append("")

    # Sections
    for section_name in _order_sections(sections):
        pages = sections[section_name]
        if not pages:
            continue

        lines.append(f"## {section_name}")
        lines.append("")

        for page in pages:
            label = one_line(page.title) or one_line(page.h1) or page.url
            desc = one_line(page.summary or page.h1 or page.title)
            entry = f"- [{label}]({page.url})"
            if desc:
                # Notes are optional; leave out the colon when there are none.
                entry += f": {desc}"
            lines.append(entry)

            if full and page.full_text:
                lines.append("")
                lines.append(page.full_text)
                lines.append("")

        lines.append("")

    return "\n".join(lines)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _pick_site_name(sections: dict[str, list[PageInfo]]) -> str:
|
|
56
|
+
for pages in sections.values():
|
|
57
|
+
for p in pages:
|
|
58
|
+
if p.title:
|
|
59
|
+
return p.title.split("—")[0].split("|")[0].strip()
|
|
60
|
+
if p.h1:
|
|
61
|
+
return p.h1
|
|
62
|
+
# Fallback: domain name
|
|
63
|
+
for pages in sections.values():
|
|
64
|
+
for p in pages:
|
|
65
|
+
netloc = urlparse(p.url).netloc
|
|
66
|
+
return netloc.replace("www.", "").split(".")[0].title()
|
|
67
|
+
return "Untitled Site"
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _pick_summary(sections: dict[str, list[PageInfo]]) -> str:
|
|
71
|
+
for name in ("Home", "About", "Docs"):
|
|
72
|
+
pages = sections.get(name)
|
|
73
|
+
if pages:
|
|
74
|
+
for p in pages:
|
|
75
|
+
if p.description:
|
|
76
|
+
return p.description
|
|
77
|
+
if p.summary:
|
|
78
|
+
return p.summary
|
|
79
|
+
# Fallback: any non-empty summary
|
|
80
|
+
for pages in sections.values():
|
|
81
|
+
for p in pages:
|
|
82
|
+
if p.description:
|
|
83
|
+
return p.description
|
|
84
|
+
return ""
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
SECTION_PRIORITY = [
|
|
88
|
+
"Home", "About", "Docs", "Documentation",
|
|
89
|
+
"Guide", "Guides", "Tutorial", "Tutorials",
|
|
90
|
+
"Api", "Api Reference",
|
|
91
|
+
"Blog", "News",
|
|
92
|
+
"Features", "Pricing",
|
|
93
|
+
"Support", "Faq",
|
|
94
|
+
"Contact",
|
|
95
|
+
]
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _order_sections(sections: dict[str, list[PageInfo]]) -> list[str]:
|
|
99
|
+
custom: list[str] = []
|
|
100
|
+
remaining: list[str] = []
|
|
101
|
+
|
|
102
|
+
for name in sections:
|
|
103
|
+
if name in SECTION_PRIORITY:
|
|
104
|
+
custom.append(name)
|
|
105
|
+
else:
|
|
106
|
+
remaining.append(name)
|
|
107
|
+
|
|
108
|
+
custom.sort(key=lambda n: SECTION_PRIORITY.index(n))
|
|
109
|
+
remaining.sort()
|
|
110
|
+
|
|
111
|
+
return custom + remaining
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from dataclasses import dataclass, field
|
|
5
|
+
from urllib.parse import urlparse
|
|
6
|
+
|
|
7
|
+
from bs4 import BeautifulSoup, Tag
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
@dataclass
class PageInfo:
    """Metadata collected for one crawled page.

    Only ``url`` is required; every other field defaults to an empty value.
    """

    # Address the page was fetched from.
    url: str
    # Text of the <title> element.
    title: str = ""
    # Text of the first <h1> element.
    h1: str = ""
    # Content of <meta name="description">.
    description: str = ""
    # One-line blurb: description, else h1, else title.
    summary: str = ""
    # Section label inferred from the URL path.
    section: str = ""
    # Link distance from the crawl's start URL.
    depth: int = 0
    # Leading body text used by the "full" output variant.
    full_text: str = ""
    # Unmodified HTML source of the page.
    raw_html: str = ""
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
@dataclass
class RobotsDirectives:
    """Robots flags parsed from an ``X-Robots-Tag`` header or a meta robots tag."""

    # True when the page must be left out of the generated output.
    noindex: bool = False
    # True when links on the page must not be followed.
    nofollow: bool = False
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
_X_ROBOTS_TAG_RE = re.compile(
|
|
30
|
+
r"(?:noindex|nofollow|index|follow|none|all|noarchive)",
|
|
31
|
+
re.IGNORECASE,
|
|
32
|
+
)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def parse_robots_header(header_value: str) -> RobotsDirectives:
    """Translate an ``X-Robots-Tag`` header value into ``RobotsDirectives``.

    Matching is a case-insensitive substring test: ``none`` sets both flags,
    ``noindex`` and ``nofollow`` set their respective flag.
    """
    lowered = header_value.lower()
    blocks_everything = "none" in lowered
    return RobotsDirectives(
        noindex=blocks_everything or "noindex" in lowered,
        nofollow=blocks_everything or "nofollow" in lowered,
    )
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def parse_meta_robots(soup: BeautifulSoup) -> RobotsDirectives:
    """Read the first ``<meta name="robots">`` tag of a parsed document.

    Returns default (all-false) directives when no such tag exists. The
    ``content`` attribute is matched by case-insensitive substring: ``none``
    sets both flags, ``noindex`` / ``nofollow`` set their own flag.
    """
    tag = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
    if not (tag and isinstance(tag, Tag)):
        return RobotsDirectives()

    value = (tag.get("content") or "").lower()
    blocks_everything = "none" in value
    return RobotsDirectives(
        noindex=blocks_everything or "noindex" in value,
        nofollow=blocks_everything or "nofollow" in value,
    )
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def extract_page_info(url: str, html: str, depth: int) -> PageInfo:
    """Parse ``html`` and collect the metadata used to build llms.txt.

    Args:
        url: Address the HTML was fetched from.
        html: Page source.
        depth: Link distance of the page from the crawl's start URL.

    Returns:
        A ``PageInfo`` with title, first H1, meta description, summary
        (description, else H1, else title), leading body text and section.
    """
    soup = BeautifulSoup(html, "html.parser")
    info = PageInfo(url=url, depth=depth, raw_html=html)

    # get_text(strip=True) without a separator joins the text of nested tags
    # with nothing in between ("See<a>the docs</a>" -> "Seethe docs"), so a
    # single space is used as the separator everywhere below.
    title_tag = soup.find("title")
    if title_tag and isinstance(title_tag, Tag):
        info.title = title_tag.get_text(" ", strip=True)

    h1_tag = soup.find("h1")
    if h1_tag:
        info.h1 = h1_tag.get_text(" ", strip=True)

    meta_desc = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
    if meta_desc and isinstance(meta_desc, Tag):
        info.description = (meta_desc.get("content") or "").strip()

    info.summary = info.description or info.h1 or info.title

    # Use the first paragraph when it is long enough; otherwise (paragraph
    # missing or too short) search the page for other meaningful text.
    lead = ""
    p_tag = soup.find("p")
    if p_tag:
        lead = p_tag.get_text(" ", strip=True)
    if len(lead) > 20:
        info.full_text = lead
    else:
        info.full_text = _find_first_meaningful_text(soup)

    info.section = _infer_section(url, info.h1)
    return info
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def _find_first_meaningful_text(soup: BeautifulSoup) -> str:
|
|
95
|
+
for tag in soup.find_all(["p", "li", "div", "section"]):
|
|
96
|
+
text = tag.get_text(strip=True)
|
|
97
|
+
if len(text) > 50:
|
|
98
|
+
return text[:500]
|
|
99
|
+
return ""
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _infer_section(url: str, h1: str) -> str:
|
|
103
|
+
path = urlparse(url).path.strip("/")
|
|
104
|
+
if not path:
|
|
105
|
+
return "Home"
|
|
106
|
+
parts = path.split("/")
|
|
107
|
+
top = parts[0].replace("-", " ").replace("_", " ").title()
|
|
108
|
+
if top:
|
|
109
|
+
return top
|
|
110
|
+
return h1 or "Other"
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
|
|
6
|
+
from llms_generator.page_analyzer import PageInfo
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def group_pages(pages: list[PageInfo]) -> dict[str, list[PageInfo]]:
    """Bucket ``pages`` by section name.

    Returns:
        A plain dict mapping each section name to its pages; sections and
        pages keep the order in which they were first encountered.
    """
    grouped: dict[str, list[PageInfo]] = {}
    for entry in pages:
        grouped.setdefault(_assign_section(entry), []).append(entry)
    return grouped
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _assign_section(page: PageInfo) -> str:
    """Return the normalised section name for ``page``.

    A section already set on the page (other than ``"Other"``) is used as is.
    Otherwise the root path maps to ``"Home"``, a page with an H1 uses that
    H1, and anything else uses its first path segment (or ``"Other"``).
    """
    declared = page.section
    if declared and declared != "Other":
        return _normalise(declared)

    trimmed = urlparse(page.url).path.strip("/")
    if not trimmed:
        return "Home"

    if page.h1:
        return _normalise(page.h1)

    segments = [segment for segment in trimmed.split("/") if segment]
    return _normalise(segments[0]) or "Other"
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def _normalise(name: str) -> str:
|
|
38
|
+
return name.replace("-", " ").replace("_", " ").strip().title()
|
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: llms-generator
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Crawl any website and generate llms.txt — the AI-ready site map standard.
|
|
5
|
+
Author: aouwalitshikkha
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/aouwalitshikkha/llms-generator
|
|
8
|
+
Project-URL: Repository, https://github.com/aouwalitshikkha/llms-generator
|
|
9
|
+
Keywords: llms.txt,crawler,seo,ai,llm
|
|
10
|
+
Classifier: Development Status :: 3 - Alpha
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Programming Language :: Python :: 3
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.10
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: requests>=2.31
|
|
22
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
23
|
+
Provides-Extra: js
|
|
24
|
+
Requires-Dist: playwright>=1.40; extra == "js"
|
|
25
|
+
Provides-Extra: all
|
|
26
|
+
Requires-Dist: llms-generator[js]; extra == "all"
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
|
|
29
|
+
# llms-generator
|
|
30
|
+
|
|
31
|
+
[PyPI package](https://pypi.org/project/llms-generator/)
|
|
32
|
+
[Supported Python versions](https://pypi.org/project/llms-generator/)
|
|
33
|
+
[License: MIT](LICENSE)
|
|
34
|
+
|
|
35
|
+
**Crawl any website and generate `llms.txt`** — the AI-ready site map standard.
|
|
36
|
+
|
|
37
|
+
`llms.txt` is a markdown file placed at a website's root (`/llms.txt`) that helps AI assistants like ChatGPT, Claude, and Perplexity understand your site's content structure. Think of it as *robots.txt for AI*.
|
|
38
|
+
|
|
39
|
+
This tool crawls your site, extracts page metadata, groups pages into logical sections, and outputs a spec-compliant `llms.txt` file.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Why llms.txt?
|
|
44
|
+
|
|
45
|
+
AI systems struggle to navigate large, noisy websites. An `llms.txt` file gives them a curated map of your most important content — leading to:
|
|
46
|
+
|
|
47
|
+
- Accurate citations in AI-generated responses
|
|
48
|
+
- Better brand representation in ChatGPT, Perplexity, Google AI Overviews
|
|
49
|
+
- Less server load from AI crawlers wandering your site
|
|
50
|
+
- Control over how AI systems reference your content
|
|
51
|
+
|
|
52
|
+
The [llms.txt specification](https://llmstxt.org) was proposed by Jeremy Howard in 2024 and is actively supported by Perplexity, Anthropic, and other AI platforms.
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## Installation
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
pip install llms-generator
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
For JavaScript-heavy sites (optional):
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
pip install llms-generator[js]
|
|
66
|
+
playwright install chromium
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Usage
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
llms-gen https://example.com
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
That's it. The tool crawls your site and creates `llms.txt` in the current directory.
|
|
78
|
+
|
|
79
|
+
### Options
|
|
80
|
+
|
|
81
|
+
| Flag | Default | Description |
|
|
82
|
+
|---|---|---|
|
|
83
|
+
| `URL` | required | Target website URL |
|
|
84
|
+
| `--depth` | `2` | Maximum crawl depth |
|
|
85
|
+
| `--output` | `llms.txt` | Output file path |
|
|
86
|
+
| `--full` | `False` | Also generate `llms-full.txt` with full page content |
|
|
87
|
+
| `--no-js` | `False` | Skip Playwright JavaScript rendering fallback |
|
|
88
|
+
| `--delay` | `1.0` | Seconds between requests (be polite) |
|
|
89
|
+
|
|
90
|
+
### Examples
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
# Basic crawl (2 levels deep)
|
|
94
|
+
llms-gen https://example.com
|
|
95
|
+
|
|
96
|
+
# Crawl deeper, output to custom path
|
|
97
|
+
llms-gen https://docs.example.com --depth 3 --output site-llms.txt
|
|
98
|
+
|
|
99
|
+
# Generate both standard and full versions
|
|
100
|
+
llms-gen https://example.com --full
|
|
101
|
+
|
|
102
|
+
# Fast crawl without JS rendering
|
|
103
|
+
llms-gen https://example.com --no-js --delay 0.5
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
## How it works
|
|
109
|
+
|
|
110
|
+
### Per-page robot check
|
|
111
|
+
|
|
112
|
+
Every page is checked against three layers before being included or followed:
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
robots.txt ──┬── disallowed? → skip
|
|
116
|
+
└── allowed? ──→ check HTTP X-Robots-Tag header
|
|
117
|
+
│
|
|
118
|
+
noindex? ──→ skip
|
|
119
|
+
nofollow? ──→ still analyze, don't follow links
|
|
120
|
+
│
|
|
121
|
+
absent ──→ check <meta name="robots">
|
|
122
|
+
│
|
|
123
|
+
noindex? ──→ skip
|
|
124
|
+
nofollow? ──→ still analyze, don't follow links
|
|
125
|
+
│
|
|
126
|
+
absent/index,follow ──→ analyze + follow links
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
Pages with `noindex` are **excluded from `llms.txt`**. Pages with `nofollow` are still analyzed for their content but their child links are not crawled.
|
|
130
|
+
|
|
131
|
+
### Crawl strategy
|
|
132
|
+
|
|
133
|
+
1. Parse `robots.txt` — respect `Disallow` and `Crawl-Delay`
|
|
134
|
+
2. BFS from the start URL up to `--depth` levels
|
|
135
|
+
3. For each page:
|
|
136
|
+
- Fetch with `requests` (handles most sites)
|
|
137
|
+
- If content is empty (JS-rendered), fall back to Playwright headless browser
|
|
138
|
+
- Extract: `<title>`, `<h1>`, `<meta name="description">`, first meaningful paragraph, directory path
|
|
139
|
+
4. Group pages into sections (directory-based, with H1 fallback)
|
|
140
|
+
5. Assemble `llms.txt` per the spec
|
|
141
|
+
|
|
142
|
+
### Section grouping
|
|
143
|
+
|
|
144
|
+
Pages are grouped into `##` sections by their top-level directory path:
|
|
145
|
+
|
|
146
|
+
```
|
|
147
|
+
/docs/getting-started → ## Docs
|
|
148
|
+
/blog/hello-world → ## Blog
|
|
149
|
+
/api/v1/users → ## Api
|
|
150
|
+
```
|
|
151
|
+
|
|
152
|
+
Pages without a clear directory path use their `<h1>` as the section name.
|
|
153
|
+
|
|
154
|
+
---
|
|
155
|
+
|
|
156
|
+
## Output format
|
|
157
|
+
|
|
158
|
+
The generated `llms.txt` follows the [llmstxt.org](https://llmstxt.org) specification:
|
|
159
|
+
|
|
160
|
+
```markdown
|
|
161
|
+
# Example Site
|
|
162
|
+
|
|
163
|
+
> A great example site with documentation and blog content.
|
|
164
|
+
|
|
165
|
+
This file provides AI systems with a structured summary of this website.
|
|
166
|
+
|
|
167
|
+
## Docs
|
|
168
|
+
|
|
169
|
+
- [Getting Started](https://example.com/docs/getting-started): How to get started with our platform.
|
|
170
|
+
- [API Reference](https://example.com/docs/api): Complete API documentation.
|
|
171
|
+
|
|
172
|
+
## Blog
|
|
173
|
+
|
|
174
|
+
- [Hello World](https://example.com/blog/hello): Our first blog post.
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
### llms-full.txt
|
|
178
|
+
|
|
179
|
+
With `--full`, an expanded version is also generated that includes the full text content of each page inline — useful for providing complete context to LLMs in a single file.
|
|
180
|
+
|
|
181
|
+
---
|
|
182
|
+
|
|
183
|
+
## Development
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
git clone https://github.com/aouwalitshikkha/llms-generator.git
|
|
187
|
+
cd llms-generator
|
|
188
|
+
pip install -e .
|
|
189
|
+
pip install -e ".[js]" # with Playwright support
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
Run tests:
|
|
193
|
+
|
|
194
|
+
```bash
|
|
195
|
+
pip install pytest
|
|
196
|
+
pytest
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
---
|
|
200
|
+
|
|
201
|
+
## License
|
|
202
|
+
|
|
203
|
+
MIT
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
llms_generator/__init__.py,sha256=EObI8QaandukD-LUGiPaKXk-UwBT_XG19YKqNUCDbW4,48
|
|
2
|
+
llms_generator/__main__.py,sha256=FIIppT24g8YwqftTHcvnX0A18ylqpUZmcW6aKGOQKI8,75
|
|
3
|
+
llms_generator/_version.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
|
|
4
|
+
llms_generator/cli.py,sha256=yu21yldgodFF9UakbvvYcF5nN0Zj4YRKRaI8yL2Jrr0,2131
|
|
5
|
+
llms_generator/crawler.py,sha256=eaqhuBRnPbwrMb4CVqlh1vHLOJ1vJrw9Z2tjQpRWKTg,6128
|
|
6
|
+
llms_generator/generator.py,sha256=w2VDiZxewUwRCFKcMHaOtqVgfL7x5ZvKUA29awfhhfQ,3015
|
|
7
|
+
llms_generator/page_analyzer.py,sha256=QJVgAcxFtg9grfNZm1Wy9wVrlcIfLT5EdLOFCnAukEs,2835
|
|
8
|
+
llms_generator/section_grouper.py,sha256=UiqtlF62zc6myQn5MbixhQKdsGdgbpgO1cO0wqhy9Mc,932
|
|
9
|
+
llms_generator-0.1.0.dist-info/licenses/LICENSE,sha256=Oczh2qn7dQ5PR6mYsb-QXZ1wsZv5IGFGCerMBHG5PeA,1072
|
|
10
|
+
llms_generator-0.1.0.dist-info/METADATA,sha256=YUbFcWXXF2Qt9QTc55GqCGl9NfaTkBlPolhqDeQybUU,6459
|
|
11
|
+
llms_generator-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
12
|
+
llms_generator-0.1.0.dist-info/entry_points.txt,sha256=Q4klthIOtQMi9Z23c8AbJFg9c-xnibppCtZnmwFH9_4,53
|
|
13
|
+
llms_generator-0.1.0.dist-info/top_level.txt,sha256=w2v6vqKUn9ffmSwzBKLKDl5ygkE39h-6NgMHgQqkrxY,15
|
|
14
|
+
llms_generator-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 aouwalitshikkha
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
llms_generator
|