llms-generator 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ from llms_generator._version import __version__
@@ -0,0 +1,4 @@
1
# Package entry point: lets the CLI be started with ``python -m llms_generator``.
from llms_generator.cli import main

# Only run the CLI when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@@ -0,0 +1 @@
1
+ __version__ = "0.1.0"
llms_generator/cli.py ADDED
@@ -0,0 +1,77 @@
1
+ import argparse
2
+ import sys
3
+
4
+ from llms_generator.crawler import Crawler
5
+ from llms_generator.section_grouper import group_pages
6
+ from llms_generator.generator import generate_llms_txt
7
+
8
+
9
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command line of the ``llms-gen`` tool.

    Args:
        argv: Arguments to parse. ``None`` lets argparse fall back to
            ``sys.argv[1:]``.

    Returns:
        A namespace with the attributes ``url``, ``depth``, ``output``,
        ``full``, ``no_js`` and ``delay``.
    """
    cli = argparse.ArgumentParser(
        prog="llms-gen",
        description="Crawl a website and generate llms.txt",
    )
    cli.add_argument("url", help="Target website URL")

    # Optional flags as (flag, add_argument keyword arguments), in help order.
    optional_flags = (
        (
            "--depth",
            {"type": int, "default": 2, "help": "Maximum crawl depth (default: 2)"},
        ),
        (
            "--output",
            {"default": "llms.txt", "help": "Output file path (default: llms.txt)"},
        ),
        (
            "--full",
            {
                "action": "store_true",
                "help": "Also generate llms-full.txt with full page content",
            },
        ),
        (
            "--no-js",
            {
                "action": "store_true",
                "help": "Skip Playwright JavaScript rendering fallback",
            },
        ),
        (
            "--delay",
            {
                "type": float,
                "default": 1.0,
                "help": "Seconds between requests (default: 1.0)",
            },
        ),
    )
    for flag, spec in optional_flags:
        cli.add_argument(flag, **spec)

    return cli.parse_args(argv)
43
+
44
+
45
def main(argv: list[str] | None = None) -> None:
    """Crawl the target site and write ``llms.txt`` (and optionally ``llms-full.txt``).

    Args:
        argv: Command-line arguments; ``None`` means ``sys.argv[1:]``.

    Exits with status 1, after printing a message to stderr, when the crawl
    returns no pages.
    """
    args = parse_args(argv)

    crawler = Crawler(
        start_url=args.url,
        max_depth=args.depth,
        delay=args.delay,
        use_js=not args.no_js,
    )
    pages = crawler.run()

    if not pages:
        print("No pages found. Check the URL and try again.", file=sys.stderr)
        sys.exit(1)

    sections = group_pages(pages)

    # The standard file is always the plain link index. Previously ``--full``
    # leaked the full page text into it, making both output files identical.
    output = generate_llms_txt(sections, False)
    with open(args.output, "w", encoding="utf-8") as f:
        f.write(output)
    print(f"Wrote {args.output} with {len(pages)} pages across {len(sections)} sections.")

    if args.full:
        # Derive the companion path from the output path; when the output name
        # does not contain "llms.txt", fall back to "llms-full.txt" in the
        # current directory.
        full_path = args.output.replace("llms.txt", "llms-full.txt")
        if full_path == args.output:
            full_path = "llms-full.txt"
        with open(full_path, "w", encoding="utf-8") as f:
            f.write(generate_llms_txt(sections, True))
        print(f"Wrote {full_path}.")


if __name__ == "__main__":
    main()
@@ -0,0 +1,196 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ import urllib.robotparser
5
+ from collections import deque
6
+ from typing import Optional
7
+ from urllib.parse import urljoin, urlparse
8
+
9
+ import requests
10
+ from bs4 import BeautifulSoup, Tag
11
+
12
+ from llms_generator.page_analyzer import (
13
+ PageInfo,
14
+ RobotsDirectives,
15
+ extract_page_info,
16
+ parse_meta_robots,
17
+ parse_robots_header,
18
+ )
19
+
20
USER_AGENT = "llms-generator/0.1.0"


class Crawler:
    """Breadth-first crawler restricted to the host of the start URL.

    For every reachable page it honours robots.txt, the ``X-Robots-Tag``
    response header (``noindex``) and ``<meta name="robots">``
    (``noindex`` / ``nofollow``), and collects one ``PageInfo`` per page that
    may be indexed.
    """

    def __init__(
        self,
        start_url: str,
        max_depth: int = 2,
        delay: float = 1.0,
        use_js: bool = True,
    ):
        """Configure the crawl.

        Args:
            start_url: URL the crawl starts from; trailing slashes are removed.
            max_depth: Maximum link distance from ``start_url`` to fetch.
            delay: Minimum number of seconds to wait between requests.
            use_js: Whether a failed request may be retried with Playwright.
        """
        self.start_url = start_url.rstrip("/")
        self.max_depth = max_depth
        self.delay = delay
        self.use_js = use_js
        self._session = requests.Session()
        self._session.headers.update({"User-Agent": USER_AGENT})

        parsed = urlparse(self.start_url)
        self._base = f"{parsed.scheme}://{parsed.netloc}"

        self._rp: Optional[urllib.robotparser.RobotFileParser] = None
        self._visited: set[str] = set()
        self._pages: list[PageInfo] = []

    def run(self) -> list[PageInfo]:
        """Crawl breadth-first from the start URL and return the collected pages."""
        self._load_robots_txt()
        pause = self._effective_delay()

        queue: deque[tuple[str, int]] = deque([(self.start_url, 0)])
        self._visited.add(self.start_url)

        while queue:
            url, depth = queue.popleft()
            if depth > self.max_depth or not self._is_allowed(url):
                continue

            page = self._fetch_and_analyze(url, depth)
            if page is not None:
                self._pages.append(page)
                if depth < self.max_depth:
                    for link in self._extract_links(url, page.raw_html):
                        if link not in self._visited:
                            self._visited.add(link)
                            queue.append((link, depth + 1))

            # Throttle after every request, including failed or excluded ones,
            # so the server is never hit in a burst. There is nothing to wait
            # for once the queue is drained.
            if queue and pause > 0:
                time.sleep(pause)

        return self._pages

    # ------------------------------------------------------------------
    # Robots.txt
    # ------------------------------------------------------------------
    def _load_robots_txt(self) -> None:
        """Fetch and parse ``/robots.txt``; on any failure crawl without it."""
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(urljoin(self._base, "/robots.txt"))
        try:
            rp.read()
        except Exception:
            # Best effort: an unreachable or unreadable robots.txt must not
            # abort the crawl, so treat it as absent.
            rp = None
        self._rp = rp

    def _is_allowed(self, url: str) -> bool:
        """Return whether robots.txt permits fetching *url* (True when unknown)."""
        if self._rp is None:
            return True
        try:
            return self._rp.can_fetch(USER_AGENT, url)
        except Exception:
            return True

    def _effective_delay(self) -> float:
        """Return the pause between requests in seconds.

        This is the configured ``delay``, raised to the robots.txt
        ``Crawl-delay`` for this user agent when that value is larger.
        """
        if self._rp is None:
            return self.delay
        robots_delay = self._rp.crawl_delay(USER_AGENT)
        if robots_delay is None:
            return self.delay
        return max(self.delay, float(robots_delay))

    # ------------------------------------------------------------------
    # Fetch + analyze
    # ------------------------------------------------------------------
    def _fetch_and_analyze(self, url: str, depth: int) -> PageInfo | None:
        """Fetch *url* and build its ``PageInfo``.

        Returns ``None`` when the page cannot be fetched or is marked
        ``noindex`` by its meta robots tag.
        """
        html = self._fetch(url)
        if html is None:
            return None

        # Check the exclusion first so excluded pages are not analysed at all.
        if self._check_robots_directives(html).noindex:
            return None

        return extract_page_info(url, html, depth)

    def _fetch(self, url: str) -> str | None:
        """Return the HTML of *url*, or ``None`` if it should be skipped.

        Any ``requests`` error, including HTTP error statuses raised by
        ``raise_for_status``, triggers the Playwright fallback. Responses that
        are not HTML, or that carry ``X-Robots-Tag: noindex``, are skipped.
        """
        try:
            resp = self._session.get(url, timeout=30)
            resp.raise_for_status()
        except requests.RequestException:
            return self._fetch_with_playwright(url)

        content_type = (resp.headers.get("Content-Type") or "").lower()
        if "text/html" not in content_type:
            return None

        # Check X-Robots-Tag header
        x_robots = resp.headers.get("X-Robots-Tag")
        if x_robots and parse_robots_header(x_robots).noindex:
            return None

        return resp.text

    def _fetch_with_playwright(self, url: str) -> str | None:
        """Render *url* in headless Chromium and return the resulting HTML.

        Returns ``None`` when JS rendering is disabled, Playwright is not
        installed, or the browser fails for any reason.
        """
        if not self.use_js:
            return None
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            return None

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                page = browser.new_page(user_agent=USER_AGENT)
                try:
                    page.goto(url, timeout=30000, wait_until="domcontentloaded")
                    content = page.content()
                except Exception:
                    return None
                finally:
                    # Always release the browser, even when navigation fails.
                    browser.close()
                return content
        except Exception:
            return None

    # ------------------------------------------------------------------
    # Link extraction
    # ------------------------------------------------------------------
    def _extract_links(self, base_url: str, html: str) -> list[str]:
        """Return the unique same-host links found in *html*, in document order.

        Returns an empty list when the page's meta robots tag says ``nofollow``.
        """
        if self._check_robots_directives(html).nofollow:
            return []

        soup = BeautifulSoup(html, "html.parser")
        site_netloc = urlparse(self._base).netloc
        links: list[str] = []

        for a_tag in soup.find_all("a", href=True):
            if not isinstance(a_tag, Tag):
                continue
            href = a_tag["href"]
            if isinstance(href, (list, tuple)):
                href = href[0] if href else ""

            link = self._resolve_link(base_url, str(href), site_netloc)
            if link is not None:
                links.append(link)

        return list(dict.fromkeys(links))  # deduplicate, preserve order

    @staticmethod
    def _resolve_link(base_url: str, href: str, site_netloc: str) -> str | None:
        """Resolve *href* against *base_url* and return a crawlable URL or ``None``.

        A link is crawlable when it resolves to an http(s) URL whose netloc
        equals *site_netloc*. The fragment, if any, is removed so that
        ``/page#a`` and ``/page#b`` count as the same page.
        """
        try:
            full = urljoin(base_url, href)
            parsed = urlparse(full)
        except ValueError:
            # Malformed hrefs (e.g. a broken IPv6 literal) must not abort the crawl.
            return None

        # Same domain only, skip non-HTTP(S)
        if parsed.netloc != site_netloc:
            return None
        if parsed.scheme not in ("http", "https"):
            return None
        if parsed.fragment:
            # Rebuild the URL without its fragment. str.rstrip would strip a
            # *set of characters* and eat into the path (".../docs#s" -> ".../doc").
            full = parsed._replace(fragment="").geturl()
        return full

    # ------------------------------------------------------------------
    # Robots directives from HTML
    # ------------------------------------------------------------------
    @staticmethod
    def _check_robots_directives(html: str) -> RobotsDirectives:
        """Parse *html* and return the directives of its ``<meta name="robots">`` tag."""
        soup = BeautifulSoup(html, "html.parser")
        return parse_meta_robots(soup)
@@ -0,0 +1,111 @@
1
+ from __future__ import annotations
2
+
3
+ from urllib.parse import urlparse
4
+
5
+ from llms_generator.page_analyzer import PageInfo
6
+
7
+
8
def generate_llms_txt(
    sections: dict[str, list[PageInfo]],
    full: bool = False,
) -> str:
    """Render the grouped pages as llms.txt markdown.

    The document consists of an H1 with the site name, an optional blockquote
    summary, a fixed context paragraph, and one ``##`` section per non-empty
    group with one bullet per page.

    Args:
        sections: Pages keyed by section name.
        full: When true, each page's ``full_text`` (if any) is inserted after
            its bullet.

    Returns:
        The markdown document as a single newline-joined string.
    """
    # H1: site name chosen from the crawled pages.
    out: list[str] = [f"# {_pick_site_name(sections)}", ""]

    # Blockquote summary, only when one is available.
    blurb = _pick_summary(sections)
    if blurb:
        out += [f"> {blurb}", ""]

    # Fixed context paragraph.
    out.append(
        "This file provides AI systems with a structured summary of this website. "
        "It is maintained automatically by llms-generator."
    )
    out.append("")

    for heading in _order_sections(sections):
        entries = sections[heading]
        if not entries:
            continue

        out.extend((f"## {heading}", ""))

        for entry in entries:
            label = entry.title or entry.h1 or entry.url
            note = entry.summary or entry.h1 or entry.title
            out.append(f"- [{label}]({entry.url}): {note}")

            if full and entry.full_text:
                out.extend(("", entry.full_text, ""))

        out.append("")

    return "\n".join(out)
53
+
54
+
55
+ def _pick_site_name(sections: dict[str, list[PageInfo]]) -> str:
56
+ for pages in sections.values():
57
+ for p in pages:
58
+ if p.title:
59
+ return p.title.split("—")[0].split("|")[0].strip()
60
+ if p.h1:
61
+ return p.h1
62
+ # Fallback: domain name
63
+ for pages in sections.values():
64
+ for p in pages:
65
+ netloc = urlparse(p.url).netloc
66
+ return netloc.replace("www.", "").split(".")[0].title()
67
+ return "Untitled Site"
68
+
69
+
70
+ def _pick_summary(sections: dict[str, list[PageInfo]]) -> str:
71
+ for name in ("Home", "About", "Docs"):
72
+ pages = sections.get(name)
73
+ if pages:
74
+ for p in pages:
75
+ if p.description:
76
+ return p.description
77
+ if p.summary:
78
+ return p.summary
79
+ # Fallback: any non-empty summary
80
+ for pages in sections.values():
81
+ for p in pages:
82
+ if p.description:
83
+ return p.description
84
+ return ""
85
+
86
+
87
+ SECTION_PRIORITY = [
88
+ "Home", "About", "Docs", "Documentation",
89
+ "Guide", "Guides", "Tutorial", "Tutorials",
90
+ "Api", "Api Reference",
91
+ "Blog", "News",
92
+ "Features", "Pricing",
93
+ "Support", "Faq",
94
+ "Contact",
95
+ ]
96
+
97
+
98
+ def _order_sections(sections: dict[str, list[PageInfo]]) -> list[str]:
99
+ custom: list[str] = []
100
+ remaining: list[str] = []
101
+
102
+ for name in sections:
103
+ if name in SECTION_PRIORITY:
104
+ custom.append(name)
105
+ else:
106
+ remaining.append(name)
107
+
108
+ custom.sort(key=lambda n: SECTION_PRIORITY.index(n))
109
+ remaining.sort()
110
+
111
+ return custom + remaining
@@ -0,0 +1,110 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from dataclasses import dataclass, field
5
+ from urllib.parse import urlparse
6
+
7
+ from bs4 import BeautifulSoup, Tag
8
+
9
+
10
@dataclass
class PageInfo:
    """Metadata collected for one crawled page.

    Only ``url`` is required; every other field defaults to an empty value.
    """

    # Address the page was fetched from.
    url: str
    # Text of the <title> element.
    title: str = ""
    # Text of the first <h1> element.
    h1: str = ""
    # Content of <meta name="description">.
    description: str = ""
    # One-line blurb shown next to the link in llms.txt.
    summary: str = ""
    # Section name the page is filed under.
    section: str = ""
    # Link distance from the start URL.
    depth: int = 0
    # Body text emitted in the "full" output.
    full_text: str = ""
    # Unparsed HTML of the page, kept for link extraction.
    raw_html: str = ""
21
+
22
+
23
@dataclass
class RobotsDirectives:
    """Indexing restrictions declared by a page or its response headers."""

    # True when the page must be left out of the generated output.
    noindex: bool = False
    # True when links on the page must not be crawled.
    nofollow: bool = False


# Not referenced by the code in this module; kept so existing imports of the
# name keep working.
_X_ROBOTS_TAG_RE = re.compile(
    r"(?:noindex|nofollow|index|follow|none|all|noarchive)",
    re.IGNORECASE,
)

# Directives of the form "name: value". Their value may itself be the word
# "none" (e.g. "max-image-preview:none") and must not be read as a directive.
_VALUED_DIRECTIVES = frozenset(
    {"max-snippet", "max-image-preview", "max-video-preview", "unavailable_after"}
)


def _parse_directive_list(value: str) -> RobotsDirectives:
    """Parse a robots directive list into a ``RobotsDirectives``.

    Directives are matched as whole, case-insensitive words, separated by
    commas and/or whitespace. A "name: value" directive listed in
    ``_VALUED_DIRECTIVES`` is ignored. Any other "prefix: directive" is
    treated as a user-agent-scoped directive and honoured for every agent,
    which errs on the side of respecting the site's wishes.
    """
    d = RobotsDirectives()
    for part in value.lower().split(","):
        prefix, colon, rest = part.partition(":")
        if colon:
            if prefix.strip() in _VALUED_DIRECTIVES:
                continue
            part = rest
        for word in part.split():
            if word == "none":
                d.noindex = True
                d.nofollow = True
            elif word == "noindex":
                d.noindex = True
            elif word == "nofollow":
                d.nofollow = True
    return d


def parse_robots_header(header_value: str) -> RobotsDirectives:
    """Return the directives found in an ``X-Robots-Tag`` header value."""
    return _parse_directive_list(header_value)


def parse_meta_robots(soup: BeautifulSoup) -> RobotsDirectives:
    """Return the directives of the first ``<meta name="robots">`` tag in *soup*.

    A document without such a tag yields the default, which imposes no
    restrictions.
    """
    meta = soup.find("meta", attrs={"name": re.compile(r"^robots$", re.I)})
    if not meta or not isinstance(meta, Tag):
        return RobotsDirectives()
    return _parse_directive_list(str(meta.get("content") or ""))
62
+
63
+
64
def extract_page_info(url: str, html: str, depth: int) -> PageInfo:
    """Parse *html* and collect the metadata of the page at *url*.

    Args:
        url: Address the HTML was fetched from.
        html: Raw HTML of the page; stored unchanged in ``raw_html``.
        depth: Link distance of the page from the crawl's start URL.

    Returns:
        A ``PageInfo`` with title, first H1, meta description, summary
        (description, else H1, else title), body text and section filled in.
    """
    soup = BeautifulSoup(html, "html.parser")
    info = PageInfo(url=url, depth=depth, raw_html=html)

    title_tag = soup.find("title")
    if title_tag and isinstance(title_tag, Tag):
        info.title = title_tag.get_text(strip=True)

    h1_tag = soup.find("h1")
    if h1_tag:
        info.h1 = h1_tag.get_text(strip=True)

    meta_desc = soup.find("meta", attrs={"name": re.compile(r"^description$", re.I)})
    if meta_desc and isinstance(meta_desc, Tag):
        info.description = str(meta_desc.get("content") or "").strip()

    info.summary = info.description or info.h1 or info.title

    # Use the first paragraph when it is long enough to be real content.
    # Otherwise, including when the page has no <p> at all, scan for the
    # first sizeable text block instead of leaving the body empty.
    first_p = soup.find("p")
    lead = first_p.get_text(strip=True) if first_p else ""
    if len(lead) > 20:
        info.full_text = lead
    else:
        info.full_text = _find_first_meaningful_text(soup)

    info.section = _infer_section(url, info.h1)
    return info
92
+
93
+
94
+ def _find_first_meaningful_text(soup: BeautifulSoup) -> str:
95
+ for tag in soup.find_all(["p", "li", "div", "section"]):
96
+ text = tag.get_text(strip=True)
97
+ if len(text) > 50:
98
+ return text[:500]
99
+ return ""
100
+
101
+
102
+ def _infer_section(url: str, h1: str) -> str:
103
+ path = urlparse(url).path.strip("/")
104
+ if not path:
105
+ return "Home"
106
+ parts = path.split("/")
107
+ top = parts[0].replace("-", " ").replace("_", " ").title()
108
+ if top:
109
+ return top
110
+ return h1 or "Other"
@@ -0,0 +1,38 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from urllib.parse import urlparse
5
+
6
+ from llms_generator.page_analyzer import PageInfo
7
+
8
+
9
def group_pages(pages: list[PageInfo]) -> dict[str, list[PageInfo]]:
    """Bucket *pages* by section name.

    Returns a plain dict that maps each section name to its pages. Sections
    appear in order of first occurrence and pages keep their input order.
    """
    grouped: dict[str, list[PageInfo]] = {}
    for page in pages:
        grouped.setdefault(_assign_section(page), []).append(page)
    return grouped
17
+
18
+
19
+ def _assign_section(page: PageInfo) -> str:
20
+ if page.section and page.section != "Other":
21
+ return _normalise(page.section)
22
+
23
+ path = urlparse(page.url).path.strip("/")
24
+ if not path:
25
+ return "Home"
26
+
27
+ parts = [p for p in path.split("/") if p]
28
+ top = _normalise(parts[0])
29
+
30
+ if page.h1:
31
+ h1_section = _normalise(page.h1)
32
+ return h1_section
33
+
34
+ return top or "Other"
35
+
36
+
37
+ def _normalise(name: str) -> str:
38
+ return name.replace("-", " ").replace("_", " ").strip().title()
@@ -0,0 +1,203 @@
1
+ Metadata-Version: 2.4
2
+ Name: llms-generator
3
+ Version: 0.1.0
4
+ Summary: Crawl any website and generate llms.txt — the AI-ready site map standard.
5
+ Author: aouwalitshikkha
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/aouwalitshikkha/llms-generator
8
+ Project-URL: Repository, https://github.com/aouwalitshikkha/llms-generator
9
+ Keywords: llms.txt,crawler,seo,ai,llm
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Topic :: Internet :: WWW/HTTP
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Requires-Python: >=3.10
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: requests>=2.31
22
+ Requires-Dist: beautifulsoup4>=4.12
23
+ Provides-Extra: js
24
+ Requires-Dist: playwright>=1.40; extra == "js"
25
+ Provides-Extra: all
26
+ Requires-Dist: llms-generator[js]; extra == "all"
27
+ Dynamic: license-file
28
+
29
+ # llms-generator
30
+
31
+ [![PyPI](https://img.shields.io/pypi/v/llms-generator)](https://pypi.org/project/llms-generator/)
32
+ [![Python](https://img.shields.io/pypi/pyversions/llms-generator)](https://pypi.org/project/llms-generator/)
33
+ [![License](https://img.shields.io/github/license/aouwalitshikkha/llms-generator)](LICENSE)
34
+
35
+ **Crawl any website and generate `llms.txt`** — the AI-ready site map standard.
36
+
37
+ `llms.txt` is a markdown file placed at a website's root (`/llms.txt`) that helps AI assistants like ChatGPT, Claude, and Perplexity understand your site's content structure. Think of it as *robots.txt for AI*.
38
+
39
+ This tool crawls your site, extracts page metadata, groups pages into logical sections, and outputs a spec-compliant `llms.txt` file.
40
+
41
+ ---
42
+
43
+ ## Why llms.txt?
44
+
45
+ AI systems struggle to navigate large, noisy websites. An `llms.txt` file gives them a curated map of your most important content — leading to:
46
+
47
+ - Accurate citations in AI-generated responses
48
+ - Better brand representation in ChatGPT, Perplexity, Google AI Overviews
49
+ - Less server load from AI crawlers wandering your site
50
+ - Control over how AI systems reference your content
51
+
52
+ The [llms.txt specification](https://llmstxt.org) was proposed by Jeremy Howard in 2024 and is actively supported by Perplexity, Anthropic, and other AI platforms.
53
+
54
+ ---
55
+
56
+ ## Installation
57
+
58
+ ```bash
59
+ pip install llms-generator
60
+ ```
61
+
62
+ For JavaScript-heavy sites (optional):
63
+
64
+ ```bash
65
+ pip install llms-generator[js]
66
+ playwright install chromium
67
+ ```
68
+
69
+ ---
70
+
71
+ ## Usage
72
+
73
+ ```bash
74
+ llms-gen https://example.com
75
+ ```
76
+
77
+ That's it. The tool crawls your site and creates `llms.txt` in the current directory.
78
+
79
+ ### Options
80
+
81
+ | Flag | Default | Description |
82
+ |---|---|---|
83
+ | `URL` | required | Target website URL |
84
+ | `--depth` | `2` | Maximum crawl depth |
85
+ | `--output` | `llms.txt` | Output file path |
86
+ | `--full` | `False` | Also generate `llms-full.txt` with full page content |
87
+ | `--no-js` | `False` | Skip Playwright JavaScript rendering fallback |
88
+ | `--delay` | `1.0` | Seconds between requests (be polite) |
89
+
90
+ ### Examples
91
+
92
+ ```bash
93
+ # Basic crawl (2 levels deep)
94
+ llms-gen https://example.com
95
+
96
+ # Crawl deeper, output to custom path
97
+ llms-gen https://docs.example.com --depth 3 --output site-llms.txt
98
+
99
+ # Generate both standard and full versions
100
+ llms-gen https://example.com --full
101
+
102
+ # Fast crawl without JS rendering
103
+ llms-gen https://example.com --no-js --delay 0.5
104
+ ```
105
+
106
+ ---
107
+
108
+ ## How it works
109
+
110
+ ### Per-page robot check
111
+
112
+ Every page is checked against three layers before being included or followed:
113
+
114
+ ```
115
+ robots.txt ──┬── disallowed? → skip
116
+ └── allowed? ──→ check HTTP X-Robots-Tag header
117
+
118
+ noindex? ──→ skip
119
+ nofollow? ──→ still analyze, don't follow links
120
+
121
+ absent ──→ check <meta name="robots">
122
+
123
+ noindex? ──→ skip
124
+ nofollow? ──→ still analyze, don't follow links
125
+
126
+ absent/index,follow ──→ analyze + follow links
127
+ ```
128
+
129
+ Pages with `noindex` are **excluded from `llms.txt`**. Pages with `nofollow` are still analyzed for their content but their child links are not crawled.
130
+
131
+ ### Crawl strategy
132
+
133
+ 1. Parse `robots.txt` — respect `Disallow` and `Crawl-Delay`
134
+ 2. BFS from the start URL up to `--depth` levels
135
+ 3. For each page:
136
+ - Fetch with `requests` (handles most sites)
137
+ - If the request fails, fall back to a Playwright headless browser (requires the optional `js` extra; disable with `--no-js`)
138
+ - Extract: `<title>`, `<h1>`, `<meta name="description">`, first meaningful paragraph, directory path
139
+ 4. Group pages into sections (directory-based, with H1 fallback)
140
+ 5. Assemble `llms.txt` per the spec
141
+
142
+ ### Section grouping
143
+
144
+ Pages are grouped into `##` sections by their top-level directory path:
145
+
146
+ ```
147
+ /docs/getting-started → ## Docs
148
+ /blog/hello-world → ## Blog
149
+ /api/v1/users → ## Api
150
+ ```
151
+
152
+ Pages without a clear directory path use their `<h1>` as the section name.
153
+
154
+ ---
155
+
156
+ ## Output format
157
+
158
+ The generated `llms.txt` follows the [llmstxt.org](https://llmstxt.org) specification:
159
+
160
+ ```markdown
161
+ # Example Site
162
+
163
+ > A great example site with documentation and blog content.
164
+
165
+ This file provides AI systems with a structured summary of this website.
166
+
167
+ ## Docs
168
+
169
+ - [Getting Started](https://example.com/docs/getting-started): How to get started with our platform.
170
+ - [API Reference](https://example.com/docs/api): Complete API documentation.
171
+
172
+ ## Blog
173
+
174
+ - [Hello World](https://example.com/blog/hello): Our first blog post.
175
+ ```
176
+
177
+ ### llms-full.txt
178
+
179
+ With `--full`, an expanded version is also generated that includes the full text content of each page inline — useful for providing complete context to LLMs in a single file.
180
+
181
+ ---
182
+
183
+ ## Development
184
+
185
+ ```bash
186
+ git clone https://github.com/aouwalitshikkha/llms-generator.git
187
+ cd llms-generator
188
+ pip install -e .
189
+ pip install -e ".[js]" # with Playwright support
190
+ ```
191
+
192
+ Run tests:
193
+
194
+ ```bash
195
+ pip install pytest
196
+ pytest
197
+ ```
198
+
199
+ ---
200
+
201
+ ## License
202
+
203
+ MIT
@@ -0,0 +1,14 @@
1
+ llms_generator/__init__.py,sha256=EObI8QaandukD-LUGiPaKXk-UwBT_XG19YKqNUCDbW4,48
2
+ llms_generator/__main__.py,sha256=FIIppT24g8YwqftTHcvnX0A18ylqpUZmcW6aKGOQKI8,75
3
+ llms_generator/_version.py,sha256=kUR5RAFc7HCeiqdlX36dZOHkUI5wI6V_43RpEcD8b-0,22
4
+ llms_generator/cli.py,sha256=yu21yldgodFF9UakbvvYcF5nN0Zj4YRKRaI8yL2Jrr0,2131
5
+ llms_generator/crawler.py,sha256=eaqhuBRnPbwrMb4CVqlh1vHLOJ1vJrw9Z2tjQpRWKTg,6128
6
+ llms_generator/generator.py,sha256=w2VDiZxewUwRCFKcMHaOtqVgfL7x5ZvKUA29awfhhfQ,3015
7
+ llms_generator/page_analyzer.py,sha256=QJVgAcxFtg9grfNZm1Wy9wVrlcIfLT5EdLOFCnAukEs,2835
8
+ llms_generator/section_grouper.py,sha256=UiqtlF62zc6myQn5MbixhQKdsGdgbpgO1cO0wqhy9Mc,932
9
+ llms_generator-0.1.0.dist-info/licenses/LICENSE,sha256=Oczh2qn7dQ5PR6mYsb-QXZ1wsZv5IGFGCerMBHG5PeA,1072
10
+ llms_generator-0.1.0.dist-info/METADATA,sha256=YUbFcWXXF2Qt9QTc55GqCGl9NfaTkBlPolhqDeQybUU,6459
11
+ llms_generator-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
12
+ llms_generator-0.1.0.dist-info/entry_points.txt,sha256=Q4klthIOtQMi9Z23c8AbJFg9c-xnibppCtZnmwFH9_4,53
13
+ llms_generator-0.1.0.dist-info/top_level.txt,sha256=w2v6vqKUn9ffmSwzBKLKDl5ygkE39h-6NgMHgQqkrxY,15
14
+ llms_generator-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ llms-gen = llms_generator.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 aouwalitshikkha
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ llms_generator