getdocs 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- getdocs/__init__.py +0 -0
- getdocs/__main__.py +3 -0
- getdocs/api.py +95 -0
- getdocs/cli.py +220 -0
- getdocs/config.py +36 -0
- getdocs/engine.py +418 -0
- getdocs/extract.py +190 -0
- getdocs/identity.py +32 -0
- getdocs/jobs.py +204 -0
- getdocs/navharvest.py +242 -0
- getdocs/output.py +191 -0
- getdocs/scope.py +84 -0
- getdocs/sitemap.py +35 -0
- getdocs/source.py +238 -0
- getdocs/urlnorm.py +34 -0
- getdocs-0.1.0.dist-info/METADATA +169 -0
- getdocs-0.1.0.dist-info/RECORD +21 -0
- getdocs-0.1.0.dist-info/WHEEL +5 -0
- getdocs-0.1.0.dist-info/entry_points.txt +2 -0
- getdocs-0.1.0.dist-info/licenses/LICENSE +21 -0
- getdocs-0.1.0.dist-info/top_level.txt +1 -0
getdocs/scope.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Scope: decides whether a discovered URL belongs to a Crawl.
|
|
2
|
+
|
|
3
|
+
Default: same host + path prefix of each Seed URL, loosened by explicit
|
|
4
|
+
overrides. Discovery method never matters — Scope gates fetching no matter
|
|
5
|
+
how a URL was found.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from fnmatch import fnmatch
|
|
10
|
+
from urllib.parse import urlsplit
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _segments(path: str) -> list[str]:
|
|
14
|
+
return [s for s in path.split("/") if s]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
|
|
18
|
+
class _SeedRule:
|
|
19
|
+
host: str
|
|
20
|
+
path_segments: tuple[str, ...]
|
|
21
|
+
|
|
22
|
+
def matches_host(self, host: str, allow_subdomains: bool) -> bool:
|
|
23
|
+
if host == self.host:
|
|
24
|
+
return True
|
|
25
|
+
return allow_subdomains and host.endswith("." + self.host)
|
|
26
|
+
|
|
27
|
+
def matches_path(self, segments: list[str], allow_backward: bool) -> bool:
|
|
28
|
+
if allow_backward:
|
|
29
|
+
return True
|
|
30
|
+
return tuple(segments[: len(self.path_segments)]) == self.path_segments
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class Scope:
    """Decides whether a discovered URL belongs to a Crawl.

    A URL is allowed when it is http(s), matches the host and path prefix of
    at least one seed rule (loosened by ``allow_subdomains`` and
    ``allow_backward``), matches at least one ``include_paths`` glob when any
    are configured, and matches no ``exclude_paths`` glob. Globs are
    ``fnmatch`` patterns applied to the URL path.
    """

    rules: tuple[_SeedRule, ...]
    allow_backward: bool = False
    allow_subdomains: bool = False
    include_paths: tuple[str, ...] = ()
    exclude_paths: tuple[str, ...] = ()

    @staticmethod
    def _host_key(parts) -> str:
        """Return the host string used to compare a ``urlsplit`` result with a rule.

        Userinfo and the scheme's default port are dropped so that
        ``https://example.com:443/`` and ``https://user@example.com/`` compare
        equal to ``https://example.com/``. A non-default port stays part of
        the key.
        """
        default_ports = {"http": 80, "https": 443}
        try:
            port = parts.port
        except ValueError:
            # Unparsable port: keep the raw authority rather than guessing.
            return parts.netloc.lower()
        host = (parts.hostname or "").lower()
        if ":" in host:
            # ``hostname`` strips the brackets of an IPv6 literal; restore them
            # so the address cannot be confused with a host:port pair.
            host = f"[{host}]"
        if port is not None and port != default_ports.get(parts.scheme):
            host = f"{host}:{port}"
        return host

    @classmethod
    def from_seeds(
        cls,
        seeds: list[str],
        allow_backward: bool = False,
        allow_subdomains: bool = False,
        include_paths: list[str] | tuple[str, ...] = (),
        exclude_paths: list[str] | tuple[str, ...] = (),
    ) -> "Scope":
        """Build a Scope with one rule per seed URL (its host and path segments)."""
        rules = [
            _SeedRule(
                host=cls._host_key(parts),
                path_segments=tuple(_segments(parts.path)),
            )
            for parts in map(urlsplit, seeds)
        ]
        return cls(
            rules=tuple(rules),
            allow_backward=allow_backward,
            allow_subdomains=allow_subdomains,
            include_paths=tuple(include_paths),
            exclude_paths=tuple(exclude_paths),
        )

    def allows(self, url: str) -> bool:
        """Return True when ``url`` is inside this Scope."""
        try:
            parts = urlsplit(url)
        except ValueError:
            # A discovered link that cannot be parsed is never in scope.
            return False
        if parts.scheme not in ("http", "https"):
            return False
        host = self._host_key(parts)
        segments = _segments(parts.path)

        in_seed_scope = any(
            rule.matches_host(host, self.allow_subdomains)
            and rule.matches_path(segments, self.allow_backward)
            for rule in self.rules
        )
        if not in_seed_scope:
            return False

        # Globs see "/" for an empty path so a pattern like "/*" can match the root.
        path = parts.path or "/"
        if self.include_paths and not any(fnmatch(path, g) for g in self.include_paths):
            return False
        if any(fnmatch(path, g) for g in self.exclude_paths):
            return False
        return True
|
getdocs/sitemap.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Sitemap parsing: urlsets, sitemap indexes, and robots.txt Sitemap lines.
|
|
2
|
+
|
|
3
|
+
Discovery only — Scope decides what actually gets fetched.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import xml.etree.ElementTree as ET
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_sitemap_xml(body: str) -> tuple[list[str], list[str]]:
    """Parse a sitemap document into ``(page_urls, nested_sitemap_urls)``.

    A ``<urlset>`` root yields page URLs; a ``<sitemapindex>`` root yields
    nested sitemap URLs. Unparsable XML or any other root element yields two
    empty lists. Only ``<loc>`` elements in the same XML namespace as the root
    are collected, so extension entries such as ``<image:loc>`` are not
    reported as pages.
    """
    try:
        # A BOM or whitespace ahead of the XML declaration is a parse error,
        # so drop it before handing the text to the parser.
        root = ET.fromstring(body.lstrip("\ufeff \t\r\n"))
    except ET.ParseError:
        return [], []

    # Tags look like "{namespace}local"; rpartition gives ("{namespace", "}", "local")
    # or ("", "", "local") when the element has no namespace.
    root_ns, _, root_name = root.tag.rpartition("}")
    locs = []
    for element in root.iter():
        element_ns, _, local_name = element.tag.rpartition("}")
        if local_name != "loc" or element_ns != root_ns:
            continue
        text = (element.text or "").strip()
        if text:
            locs.append(text)

    if root_name == "urlset":
        return locs, []
    if root_name == "sitemapindex":
        return [], locs
    return [], []
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_robots_sitemaps(robots_txt: str) -> list[str]:
    """Return the values of every ``Sitemap:`` line in a robots.txt body.

    The key is matched case-insensitively, ``#`` comments are removed before
    parsing, and lines with an empty value are skipped. Values are returned in
    file order, without de-duplication.
    """
    sitemaps = []
    # A leading BOM would otherwise become part of the first line's key.
    for line in robots_txt.lstrip("\ufeff").splitlines():
        # "#" starts a comment that runs to the end of the line.
        directive = line.partition("#")[0]
        key, _, value = directive.partition(":")
        value = value.strip()
        if key.strip().lower() == "sitemap" and value:
            sitemaps.append(value)
    return sitemaps
|
getdocs/source.py
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""Source-first: before crawling, check whether the docs site is open-source.
|
|
2
|
+
|
|
3
|
+
Most documentation generators embed a link back to the repository that hosts
|
|
4
|
+
the docs source — MkDocs/Material and Docusaurus render an "Edit this page"
|
|
5
|
+
link, Sphinx/Read-the-Docs an "Edit on GitHub". When we can find that repo we
|
|
6
|
+
clone it and write an mkdocs.yml so the docs can be served locally, which is
|
|
7
|
+
faster and higher-fidelity than crawling the rendered HTML.
|
|
8
|
+
|
|
9
|
+
Detection is a pure function over the seed page's HTML (`detect_repo`); the
|
|
10
|
+
network and git side-effects live in `fetch_html`, `clone_repo`, and the
|
|
11
|
+
`clone_source_for` orchestrator the CLI calls before `run_crawl`.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import importlib.util
|
|
15
|
+
import shutil
|
|
16
|
+
import subprocess
|
|
17
|
+
import sys
|
|
18
|
+
import urllib.request
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from urllib.parse import urljoin, urlsplit
|
|
21
|
+
|
|
22
|
+
import yaml
|
|
23
|
+
from bs4 import BeautifulSoup
|
|
24
|
+
|
|
25
|
+
from getdocs.config import CrawlConfig
|
|
26
|
+
from getdocs.identity import build_user_agent
|
|
27
|
+
|
|
28
|
+
# Hosts whose /ORG/REPO paths we recognize as clonable repositories.
|
|
29
|
+
_GIT_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
|
|
30
|
+
|
|
31
|
+
# First path segments on github.com that are product pages, not orgs/users.
|
|
32
|
+
_RESERVED_OWNERS = {
|
|
33
|
+
"about", "apps", "collections", "contact", "customer-stories", "explore",
|
|
34
|
+
"features", "join", "login", "marketplace", "new", "notifications",
|
|
35
|
+
"organizations", "orgs", "pricing", "readme", "security", "settings",
|
|
36
|
+
"site", "sponsors", "topics",
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
# Sub-paths a repo URL deep-links through (github.com/o/r/edit/main/docs/x.md).
|
|
40
|
+
_REPO_SUBPATHS = {"edit", "blob", "tree", "raw", "commits", "blame", "wiki"}
|
|
41
|
+
|
|
42
|
+
# Where doc sources commonly live inside a repo, in preference order.
|
|
43
|
+
_DOCS_CANDIDATES = ["docs", "doc", "documentation", "site/docs", "website/docs", "content"]
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def repo_root_from_url(url: str) -> str | None:
|
|
47
|
+
"""Reduce any URL on a known Git host to its canonical repo root, or None.
|
|
48
|
+
|
|
49
|
+
https://github.com/org/repo/edit/main/docs/x.md -> https://github.com/org/repo
|
|
50
|
+
Anything off a known host, or shallower than /ORG/REPO, returns None.
|
|
51
|
+
"""
|
|
52
|
+
parts = urlsplit(url)
|
|
53
|
+
if parts.scheme not in ("http", "https"):
|
|
54
|
+
return None
|
|
55
|
+
host = parts.netloc.lower().rsplit("@", 1)[-1].split(":", 1)[0]
|
|
56
|
+
if host.startswith("www."):
|
|
57
|
+
host = host[4:]
|
|
58
|
+
if host not in _GIT_HOSTS:
|
|
59
|
+
return None
|
|
60
|
+
segments = [s for s in parts.path.split("/") if s]
|
|
61
|
+
if len(segments) < 2:
|
|
62
|
+
return None
|
|
63
|
+
owner, repo = segments[0], segments[1]
|
|
64
|
+
if owner.lower() in _RESERVED_OWNERS:
|
|
65
|
+
return None
|
|
66
|
+
if repo.endswith(".git"):
|
|
67
|
+
repo = repo[:-4]
|
|
68
|
+
if not repo:
|
|
69
|
+
return None
|
|
70
|
+
return f"https://{host}/{owner}/{repo}"
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _link_score(url: str, blob: str) -> int:
|
|
74
|
+
"""Rank a repo link by how strongly it signals 'this is the docs source'.
|
|
75
|
+
|
|
76
|
+
An explicit "edit this page" affordance is the gold signal; a "source"/
|
|
77
|
+
"github" label is next; a bare deep-link into the repo beats a plain
|
|
78
|
+
repo-root link that might just be a footer cross-reference.
|
|
79
|
+
"""
|
|
80
|
+
if "edit" in blob:
|
|
81
|
+
return 100
|
|
82
|
+
if any(word in blob for word in ("source", "github", "gitlab", "view on", "improve")):
|
|
83
|
+
return 80
|
|
84
|
+
segments = [s for s in urlsplit(url).path.split("/") if s]
|
|
85
|
+
if len(segments) >= 3 and segments[2] in _REPO_SUBPATHS:
|
|
86
|
+
return 60
|
|
87
|
+
return 20
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def detect_repo(html: str, base_url: str = "") -> str | None:
    """Find the repository a docs page links back to, as a canonical repo URL.

    Scans anchors for links onto a known Git host, scores each by how clearly
    it names the docs source, and returns the highest-scoring repo root
    (ties broken by how often that repo is linked). Relative hrefs are
    resolved against ``base_url`` when one is given. Anchors whose href cannot
    be parsed as a URL are skipped. Returns None when nothing matches.
    """
    soup = BeautifulSoup(html, "html.parser")
    # root -> [best score seen, link count]
    candidates: dict[str, list[int]] = {}
    for anchor in soup.find_all("a", href=True):
        href = anchor["href"].strip()
        try:
            if base_url and not urlsplit(href).scheme:
                href = urljoin(base_url, href)
            root = repo_root_from_url(href)
        except ValueError:
            # One malformed link (e.g. "http://[x") must not abort detection
            # for the whole page.
            continue
        if not root:
            continue
        # Everything that describes the link to a reader: visible text, title,
        # CSS classes, rel values and the aria-label, lower-cased for matching.
        blob = " ".join([
            anchor.get_text(" ", strip=True),
            anchor.get("title", ""),
            " ".join(anchor.get("class") or []),
            " ".join(anchor.get("rel") or []),
            anchor.get("aria-label", ""),
        ]).lower()
        score = _link_score(href, blob)
        entry = candidates.setdefault(root, [0, 0])
        entry[0] = max(entry[0], score)
        entry[1] += 1
    if not candidates:
        return None
    return max(candidates, key=lambda root: (candidates[root][0], candidates[root][1]))
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def fetch_html(
    url: str, user_agent: str | None = None,
    timeout: float = 15.0, max_bytes: int = 3_000_000,
) -> str | None:
    """Fetch a single page's HTML for detection; None on any error/non-HTML.

    At most ``max_bytes`` of the body are read. The text is decoded with the
    charset declared in the ``Content-Type`` header, falling back to UTF-8
    when none is declared or the declared one is unknown; undecodable bytes
    are replaced rather than raising.
    """
    request = urllib.request.Request(
        url, headers={"User-Agent": user_agent or build_user_agent()}
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:  # noqa: S310
            content_type = response.headers.get("Content-Type", "").lower()
            if any(kind in content_type for kind in ("json", "image/", "pdf", "octet-stream")):
                return None
            data = response.read(max_bytes)
    except Exception:
        # Deliberate best-effort: any failure here just means "crawl instead".
        return None

    charset = "utf-8"
    # Content-Type looks like "text/html; charset=iso-8859-1".
    for param in content_type.split(";")[1:]:
        key, _, value = param.partition("=")
        declared = value.strip().strip("\"'")
        if key.strip() == "charset" and declared:
            charset = declared
            break
    try:
        return data.decode(charset, errors="replace")
    except (LookupError, ValueError):
        # Unknown or malformed charset label.
        return data.decode("utf-8", errors="replace")
|
|
140
|
+
|
|
141
|
+
|
|
142
|
+
def clone_repo(repo_url: str, dest_parent: Path, timeout: float = 180.0) -> Path | None:
|
|
143
|
+
"""Shallow-clone repo_url under dest_parent; return the clone dir or None.
|
|
144
|
+
|
|
145
|
+
Returns an existing clone untouched (idempotent for re-runs); None when git
|
|
146
|
+
is missing or the clone fails.
|
|
147
|
+
"""
|
|
148
|
+
if shutil.which("git") is None:
|
|
149
|
+
return None
|
|
150
|
+
name = repo_url.rstrip("/").rsplit("/", 1)[-1]
|
|
151
|
+
if name.endswith(".git"):
|
|
152
|
+
name = name[:-4]
|
|
153
|
+
dest = dest_parent / name
|
|
154
|
+
if dest.exists():
|
|
155
|
+
return dest if (dest / ".git").exists() else None
|
|
156
|
+
clone_url = repo_url if repo_url.endswith(".git") else repo_url + ".git"
|
|
157
|
+
dest_parent.mkdir(parents=True, exist_ok=True)
|
|
158
|
+
try:
|
|
159
|
+
subprocess.run(
|
|
160
|
+
["git", "clone", "--depth", "1", clone_url, str(dest)],
|
|
161
|
+
check=True, capture_output=True, timeout=timeout,
|
|
162
|
+
)
|
|
163
|
+
except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
|
|
164
|
+
return None
|
|
165
|
+
return dest
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def find_docs_dir(repo_dir: Path) -> Path | None:
    """Return the directory holding the markdown docs of a cloned repo, or None.

    The candidate sub-directories are tried in preference order; the first one
    that exists and contains a ``.md`` or ``.mdx`` file at any depth wins. If
    none qualifies, the repo root itself is returned when it directly contains
    a ``.md`` file.
    """
    def has_markdown(folder: Path) -> bool:
        return any(folder.rglob("*.md")) or any(folder.rglob("*.mdx"))

    for relative in _DOCS_CANDIDATES:
        folder = repo_dir / relative
        if not folder.is_dir():
            continue
        if has_markdown(folder):
            return folder
    return repo_dir if any(repo_dir.glob("*.md")) else None
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
def write_mkdocs_config(output_dir: Path, docs_dir: Path, site_name: str) -> Path:
    """Write an mkdocs.yml in output_dir that serves docs_dir locally.

    ``output_dir`` is created if needed. The config records the absolute path
    of ``docs_dir`` and uses the "material" theme when that package is
    importable, otherwise the built-in "mkdocs" theme. Returns the path of the
    written file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    theme = "material" if importlib.util.find_spec("material") else "mkdocs"
    config = {
        "site_name": site_name,
        "docs_dir": str(docs_dir.resolve()),
        "theme": {"name": theme},
        "use_directory_urls": True,
    }
    path = output_dir / "mkdocs.yml"
    # allow_unicode=True emits non-ASCII characters verbatim, so the file must
    # be written as UTF-8 explicitly instead of in the locale's default encoding.
    path.write_text(
        yaml.safe_dump(config, sort_keys=False, allow_unicode=True),
        encoding="utf-8",
    )
    return path
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def clone_source_for(config: CrawlConfig) -> Path | None:
    """Try to satisfy a crawl by cloning the docs' source repo instead.

    Only the first seed is inspected. Returns the clone directory when a
    source repository was found and cloned (the caller should then skip
    crawling); returns None to fall back to crawling. When the clone has no
    mkdocs.yml of its own but does contain markdown docs, an mkdocs.yml is
    written into ``config.output_dir``. Progress and outcomes are reported on
    stderr (stdout is the jsonl stream).
    """
    def report(message: str) -> None:
        print(message, file=sys.stderr)

    if not config.seeds:
        return None
    seed = config.seeds[0]
    seed_parts = urlsplit(seed)
    if seed_parts.scheme not in ("http", "https"):
        return None
    host = seed_parts.netloc or seed

    report(f"checking whether {host} is open-source…")
    html = fetch_html(seed, build_user_agent(config.contact, config.user_agent))
    if html is None:
        report("could not fetch the seed page — crawling instead")
        return None

    repo_url = detect_repo(html, seed)
    if repo_url is None:
        report("no source repository linked from the page — crawling instead")
        return None

    report(f"found source repository {repo_url} — cloning…")
    repo_dir = clone_repo(repo_url, config.output_dir)
    if repo_dir is None:
        report("clone failed (git missing or repo unreachable) — crawling instead")
        return None

    # A repo that ships its own mkdocs.yml is served with that file as-is.
    own_config = repo_dir / "mkdocs.yml"
    if own_config.exists():
        report(f"cloned to {repo_dir} (ships its own mkdocs.yml)")
        report(f"serve it with: mkdocs serve -f {own_config}")
        return repo_dir

    docs_dir = find_docs_dir(repo_dir)
    if docs_dir is not None:
        written = write_mkdocs_config(config.output_dir, docs_dir, host)
        report(f"cloned to {repo_dir}; wrote {written}")
        report(f"serve it with: mkdocs serve -f {written}")
    else:
        report(f"cloned to {repo_dir}, but found no markdown docs to serve")
    return repo_dir
|
getdocs/urlnorm.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""URL normalization for frontier dedup.
|
|
2
|
+
|
|
3
|
+
Two URLs that normalize identically are the same Page. rel=canonical is
|
|
4
|
+
deliberately NOT part of this (ADR-0003).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
|
|
8
|
+
|
|
9
|
+
_TRACKING_PREFIXES = ("utm_",)
|
|
10
|
+
_TRACKING_PARAMS = {"fbclid", "gclid", "msclkid", "mc_cid", "mc_eid"}
|
|
11
|
+
_DEFAULT_PORTS = {"http": "80", "https": "443"}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _is_tracking(param: str) -> bool:
    """Return True when a query parameter name is a known tracking parameter.

    A name qualifies either by starting with one of the tracking prefixes or
    by being listed exactly in the tracking-parameter set.
    """
    if param.startswith(_TRACKING_PREFIXES):
        return True
    return param in _TRACKING_PARAMS
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def normalize(url: str) -> str:
    """Return the canonical form of ``url`` used for frontier dedup.

    The scheme and host are lower-cased, userinfo and the scheme's default
    port are dropped, trailing slashes are stripped from the path, tracking
    query parameters are removed, the remaining parameters are sorted, and the
    fragment is discarded.
    """
    parts = urlsplit(url)
    scheme = parts.scheme.lower()

    host = parts.hostname.lower() if parts.hostname else ""
    if ":" in host:
        # ``hostname`` strips the brackets of an IPv6 literal; without them
        # the rebuilt URL would be invalid ("http://::1:8080/").
        host = f"[{host}]"
    if parts.port is not None and str(parts.port) != _DEFAULT_PORTS.get(scheme):
        host = f"{host}:{parts.port}"

    # "/docs/" and "/docs" are the same Page, as are "/" and "".
    path = parts.path.rstrip("/")

    kept = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if not _is_tracking(key)
    ]
    query = urlencode(sorted(kept))

    return urlunsplit((scheme, host, path, query, ""))
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: getdocs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
|
|
5
|
+
Author-email: jonbakerfish <jonbakerfish@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jonbakerfish/getdocs
|
|
8
|
+
Project-URL: Repository, https://github.com/jonbakerfish/getdocs
|
|
9
|
+
Project-URL: Issues, https://github.com/jonbakerfish/getdocs/issues
|
|
10
|
+
Project-URL: Documentation, https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md
|
|
11
|
+
Keywords: documentation,crawler,scraper,markdown,docs,llm,agents,rag
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
19
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Requires-Python: >=3.12
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: scrapy>=2.11
|
|
25
|
+
Requires-Dist: markdownify>=0.13
|
|
26
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: trafilatura>=1.12
|
|
29
|
+
Requires-Dist: scrapy-playwright>=0.0.40
|
|
30
|
+
Provides-Extra: server
|
|
31
|
+
Requires-Dist: fastapi>=0.110; extra == "server"
|
|
32
|
+
Requires-Dist: uvicorn>=0.27; extra == "server"
|
|
33
|
+
Requires-Dist: httpx>=0.27; extra == "server"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
36
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
37
|
+
Requires-Dist: fastapi>=0.110; extra == "dev"
|
|
38
|
+
Requires-Dist: uvicorn>=0.27; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# getdocs
|
|
42
|
+
|
|
43
|
+
**Turn any documentation site into a clean, local markdown copy your coding agent can actually read.**
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
getdocs crawl https://example.com/docs -o ./out
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Coding agents are only as good as the docs they can see. Pointing an agent at a
|
|
50
|
+
live docs URL means it burns tokens on nav bars, cookie banners, and HTML
|
|
51
|
+
chrome — or can't reach the page at all. `getdocs` gives the agent a local,
|
|
52
|
+
offline, markdown mirror instead: the actual content, structured to match the
|
|
53
|
+
original site, ready to drop into a repo or feed to a model.
|
|
54
|
+
|
|
55
|
+
## Why getdocs
|
|
56
|
+
|
|
57
|
+
- **Richer context for coding agents.** A local copy is greppable, indexable,
|
|
58
|
+
and always available — the agent reads the whole library at once instead of
|
|
59
|
+
fetching one rendered page at a time. No rate limits, no network flakiness,
|
|
60
|
+
no JS that won't hydrate.
|
|
61
|
+
- **Clean markdown → fewer tokens.** Each page is reduced to its content (the
|
|
62
|
+
nav, headers, footers, and ad chrome stripped) and written as plain markdown.
|
|
63
|
+
Agents consume it directly, and you spend tokens on docs, not `<div>` soup.
|
|
64
|
+
- **Structure preserved.** Files mirror the URL hierarchy
|
|
65
|
+
(`example.com/docs/auth` → `out/docs/auth.md`), each with YAML frontmatter
|
|
66
|
+
(`url`, `title`, `crawled_at`, `status`), plus a `crawl.json` Manifest that
|
|
67
|
+
captures the site's nav order and reading order — so an agent can follow the
|
|
68
|
+
docs in the order the authors intended.
|
|
69
|
+
- **Source-first: clone over crawl.** If the docs site is open-source, getdocs
|
|
70
|
+
detects the "Edit this page" link, clones the repo, and serves you the
|
|
71
|
+
original markdown source instead of scraping HTML — the highest-fidelity copy
|
|
72
|
+
there is. Falls back to crawling automatically when there's no repo.
|
|
73
|
+
|
|
74
|
+
## When to reach for it
|
|
75
|
+
|
|
76
|
+
- **Coding against an unfamiliar library or API.** Mirror its docs into your
|
|
77
|
+
repo (or a scratch dir) so your agent can ground its answers in the real
|
|
78
|
+
reference instead of hallucinating from memory.
|
|
79
|
+
- **RAG / knowledge bases.** Get a clean markdown corpus to chunk and embed,
|
|
80
|
+
without writing a bespoke scraper-and-cleaner for every site.
|
|
81
|
+
- **Offline or air-gapped work.** Take a docs set with you; read and search it
|
|
82
|
+
with no network.
|
|
83
|
+
- **Pinning a version.** Snapshot today's docs so your agent isn't tripped up
|
|
84
|
+
when the upstream site changes underneath you.
|
|
85
|
+
- **Migrating or archiving docs.** Pull an entire site down as markdown to move,
|
|
86
|
+
diff, or keep.
|
|
87
|
+
|
|
88
|
+
## Output
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
out/
|
|
92
|
+
├── crawl.json ← the Manifest: nav order, reading order, what ran
|
|
93
|
+
└── docs/
|
|
94
|
+
├── index.md
|
|
95
|
+
├── auth.md
|
|
96
|
+
└── guide/
|
|
97
|
+
└── intro.md
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
```markdown
|
|
101
|
+
---
|
|
102
|
+
url: https://example.com/docs/auth
|
|
103
|
+
title: Authentication
|
|
104
|
+
status: 200
|
|
105
|
+
crawled_at: 2026-06-12T10:00:00Z
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
# Authentication
|
|
109
|
+
...
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Sitemap discovery, JavaScript rendering, source-repo cloning, polite
|
|
113
|
+
throttling, JSONL output, and resumable crawls are all built in — see
|
|
114
|
+
[docs/USAGE.md](https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md).
|
|
115
|
+
|
|
116
|
+
## Install
|
|
117
|
+
|
|
118
|
+
Requires **Python 3.12+**.
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
pip install getdocs
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Or from source, for the latest unreleased changes:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
git clone https://github.com/jonbakerfish/getdocs
|
|
128
|
+
cd getdocs
|
|
129
|
+
pip install -e .
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
That's enough to crawl. Two optional pieces unlock more:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
# JavaScript rendering — the headless browser used to hydrate SPA docs
|
|
136
|
+
playwright install chromium
|
|
137
|
+
|
|
138
|
+
# Serve a crawled/cloned copy locally as a browsable site
|
|
139
|
+
pip install mkdocs mkdocs-material
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**`git`** must be on your `PATH` for source-first cloning (it almost always
|
|
143
|
+
already is); without it, getdocs simply falls back to crawling. To run the
|
|
144
|
+
optional API service, install the server extra: `pip install "getdocs[server]"`.
|
|
145
|
+
|
|
146
|
+
## Development
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
git clone https://github.com/jonbakerfish/getdocs
|
|
150
|
+
cd getdocs
|
|
151
|
+
pip install -e ".[dev]"
|
|
152
|
+
pytest
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Responsible use
|
|
156
|
+
|
|
157
|
+
getdocs is a tool; how you point it is on you. By default it **honors
|
|
158
|
+
`robots.txt`**, throttles itself politely, and **identifies itself honestly**
|
|
159
|
+
in the `User-Agent` (`getdocs/<version> (+project-url)`) — please keep it that
|
|
160
|
+
way. For high-volume crawls, add `--contact you@example.com` so site operators
|
|
161
|
+
can reach you (it's appended to the User-Agent; optional but courteous).
|
|
162
|
+
|
|
163
|
+
getdocs is intended for personal, reference, and agent/RAG use on documentation
|
|
164
|
+
you have the right to access. **You are solely responsible for complying with
|
|
165
|
+
each site's Terms of Service, its `robots.txt`, applicable law, and the
|
|
166
|
+
copyright of the content you fetch** — getdocs is provided as-is, with no
|
|
167
|
+
warranty (see [LICENSE](https://github.com/jonbakerfish/getdocs/blob/main/LICENSE)). Crawled documentation belongs to its authors:
|
|
168
|
+
use it for your own reference, but don't redistribute someone else's docs as
|
|
169
|
+
your own. Crawl only what you have the right to.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
getdocs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
2
|
+
getdocs/__main__.py,sha256=_Tn13GrLH9DQGu82AfWWm7dShwaORqV5PnRTfl7Q03Q,55
|
|
3
|
+
getdocs/api.py,sha256=y-jNi2-OJ7iQFlMPyxc5AeX5Cg1yo-r6eI3gLn3b0d0,3098
|
|
4
|
+
getdocs/cli.py,sha256=fNQz2hv0EijfaqYOH4kfifA9_k9lQnDZs8KqmLkHA0M,8794
|
|
5
|
+
getdocs/config.py,sha256=c1DHYj5OheMEQJqQVUjc85Y1S8kM5CMIrlsakiQSh64,1671
|
|
6
|
+
getdocs/engine.py,sha256=jNaAcfXiz8BGNzvu_5ohbDA_pf_7Ubitw5GkJk1K978,17814
|
|
7
|
+
getdocs/extract.py,sha256=EBkzt8w9z1Fr_ZRc6IEFYBDn68L4SsyL7xiOeCw8j28,6671
|
|
8
|
+
getdocs/identity.py,sha256=5g8tZmgzlzCWqWQNDWVnTe9k2xVz_bSBYvmLoXh8OUE,1228
|
|
9
|
+
getdocs/jobs.py,sha256=SC6c2Isfn_nyFSTsGfJOGLvGsr5_RLHJa5G1uUhoJYk,7371
|
|
10
|
+
getdocs/navharvest.py,sha256=Ol0coVL4Z_HlZCarotADf09t2eCGq80NkHt6eVWIxcI,8510
|
|
11
|
+
getdocs/output.py,sha256=FPvCFTzL4m9HT1MsqRc8NBfnESGvOIMPWdmfPEDE-rM,6105
|
|
12
|
+
getdocs/scope.py,sha256=lrPOsxtQIrg7HrV87qf7tnXwTAXlV0yY0mBhyNciG3c,2622
|
|
13
|
+
getdocs/sitemap.py,sha256=buPx7HteGLnba-8ZdvE2nR8WHHcZVJKKq859K8Ap4Yw,992
|
|
14
|
+
getdocs/source.py,sha256=AJOhOosaZ9PTbvix8ob3BB3Fnuvk-DUrAm-AkRJX6wc,9354
|
|
15
|
+
getdocs/urlnorm.py,sha256=6EQicHbfKmW0ZtXeT5SgpVTBADXNg6kFRNP5sm50uFY,1043
|
|
16
|
+
getdocs-0.1.0.dist-info/licenses/LICENSE,sha256=Rccj8_h8ZBIXibP7auL1DDSOPRlZhYRMeVxLqbtsFRI,1069
|
|
17
|
+
getdocs-0.1.0.dist-info/METADATA,sha256=lJ90Zf55F27iI6u5cOXB3wRNVIOo7N55dmT5Dm5BApY,6382
|
|
18
|
+
getdocs-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
|
|
19
|
+
getdocs-0.1.0.dist-info/entry_points.txt,sha256=OTldHBzF-MGYD2brSgt7FOTQZ5rimQkDn6ghb3UujXg,45
|
|
20
|
+
getdocs-0.1.0.dist-info/top_level.txt,sha256=KMBRopk15nwErQ-PZQSe8xAQiHhLuRdQU1qJzeYpVIQ,8
|
|
21
|
+
getdocs-0.1.0.dist-info/RECORD,,
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 jonbakerfish
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
getdocs
|