getdocs 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
getdocs/scope.py ADDED
@@ -0,0 +1,84 @@
1
+ """Scope: decides whether a discovered URL belongs to a Crawl.
2
+
3
+ Default: same host + path prefix of each Seed URL, loosened by explicit
4
+ overrides. Discovery method never matters — Scope gates fetching no matter
5
+ how a URL was found.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from fnmatch import fnmatch
10
+ from urllib.parse import urlsplit
11
+
12
+
13
+ def _segments(path: str) -> list[str]:
14
+ return [s for s in path.split("/") if s]
15
+
16
+
17
+ @dataclass(frozen=True)
18
+ class _SeedRule:
19
+ host: str
20
+ path_segments: tuple[str, ...]
21
+
22
+ def matches_host(self, host: str, allow_subdomains: bool) -> bool:
23
+ if host == self.host:
24
+ return True
25
+ return allow_subdomains and host.endswith("." + self.host)
26
+
27
+ def matches_path(self, segments: list[str], allow_backward: bool) -> bool:
28
+ if allow_backward:
29
+ return True
30
+ return tuple(segments[: len(self.path_segments)]) == self.path_segments
31
+
32
+
33
@dataclass(frozen=True)
class Scope:
    """Gate deciding whether a URL belongs to a Crawl.

    A URL is allowed when it is http(s), matches the host and path prefix of
    at least one seed rule (subject to ``allow_subdomains`` and
    ``allow_backward``), matches at least one ``include_paths`` glob when any
    are configured, and matches no ``exclude_paths`` glob.
    """

    rules: tuple[_SeedRule, ...]
    allow_backward: bool = False
    allow_subdomains: bool = False
    include_paths: tuple[str, ...] = ()
    exclude_paths: tuple[str, ...] = ()

    @staticmethod
    def _host_key(parts) -> str:
        """Return the host used for scope comparison from a ``urlsplit`` result.

        Userinfo and the scheme's default port are dropped so that
        ``example.com``, ``example.com:443`` (https) and ``user@example.com``
        compare equal; a non-default port is kept as ``host:port``. When the
        port is not a valid number the raw lower-cased netloc is returned.
        """
        try:
            port = parts.port
        except ValueError:
            # Reading .port raises on a malformed port; keep the raw netloc.
            return parts.netloc.lower()
        host = (parts.hostname or "").lower()
        default_port = {"http": 80, "https": 443}.get(parts.scheme.lower())
        if port is not None and port != default_port:
            return f"{host}:{port}"
        return host

    @classmethod
    def from_seeds(
        cls,
        seeds: list[str],
        allow_backward: bool = False,
        allow_subdomains: bool = False,
        include_paths: list[str] | tuple[str, ...] = (),
        exclude_paths: list[str] | tuple[str, ...] = (),
    ) -> "Scope":
        """Build a Scope with one rule per seed URL.

        Each rule records the seed's comparison host and its non-empty path
        segments. The remaining arguments are stored on the Scope, with the
        path globs converted to tuples.
        """
        rules = []
        for seed in seeds:
            parts = urlsplit(seed)
            rules.append(
                _SeedRule(
                    host=cls._host_key(parts),
                    path_segments=tuple(_segments(parts.path)),
                )
            )
        return cls(
            rules=tuple(rules),
            allow_backward=allow_backward,
            allow_subdomains=allow_subdomains,
            include_paths=tuple(include_paths),
            exclude_paths=tuple(exclude_paths),
        )

    def allows(self, url: str) -> bool:
        """Return True when *url* is inside this Scope.

        A URL that ``urlsplit`` cannot parse is rejected rather than raising.
        """
        try:
            parts = urlsplit(url)
        except ValueError:
            return False
        if parts.scheme not in ("http", "https"):
            return False
        host = self._host_key(parts)
        segments = _segments(parts.path)

        in_seed_scope = any(
            rule.matches_host(host, self.allow_subdomains)
            and rule.matches_path(segments, self.allow_backward)
            for rule in self.rules
        )
        if not in_seed_scope:
            return False

        # Globs are matched against the raw path; an empty path counts as "/".
        path = parts.path or "/"
        if self.include_paths and not any(fnmatch(path, g) for g in self.include_paths):
            return False
        if any(fnmatch(path, g) for g in self.exclude_paths):
            return False
        return True
getdocs/sitemap.py ADDED
@@ -0,0 +1,35 @@
1
+ """Sitemap parsing: urlsets, sitemap indexes, and robots.txt Sitemap lines.
2
+
3
+ Discovery only — Scope decides what actually gets fetched.
4
+ """
5
+
6
+ import xml.etree.ElementTree as ET
7
+
8
+
9
def _local_name(tag: str) -> str:
    """Return an element tag without its ``{namespace}`` prefix."""
    return tag.rpartition("}")[2]


def parse_sitemap_xml(body: str) -> tuple[list[str], list[str]]:
    """Returns (page_urls, nested_sitemap_urls).

    A ``urlset`` document yields page URLs and a ``sitemapindex`` document
    yields nested sitemap URLs; the other list is always empty. Unparseable
    XML or any other root element yields ``([], [])``.

    Only the ``<loc>`` directly inside each ``<url>`` / ``<sitemap>`` entry is
    read, so ``<loc>`` elements nested in extension blocks (for example
    ``<image:image><image:loc>``) are not reported as pages.
    """
    # NOTE: body is fetched from the network. ElementTree is not hardened
    # against maliciously constructed XML; see the Python `xml` security notes.
    try:
        # Tolerate a BOM or blank lines ahead of the document.
        root = ET.fromstring(body.lstrip("\ufeff \t\r\n"))
    except ET.ParseError:
        return [], []

    kind = _local_name(root.tag)
    entry_tag = {"urlset": "url", "sitemapindex": "sitemap"}.get(kind)
    if entry_tag is None:
        return [], []

    locs: list[str] = []
    for entry in root:
        if _local_name(entry.tag) != entry_tag:
            continue
        for child in entry:
            if _local_name(child.tag) != "loc":
                continue
            text = (child.text or "").strip()
            if text:
                locs.append(text)

    if kind == "urlset":
        return locs, []
    return [], locs
27
+
28
+
29
def parse_robots_sitemaps(robots_txt: str) -> list[str]:
    """Collect the values of every ``Sitemap:`` line in a robots.txt body.

    The field name is matched case-insensitively and surrounding whitespace
    is ignored; lines whose value is empty are skipped. Values are returned
    in file order.
    """
    found: list[str] = []
    for raw_line in robots_txt.splitlines():
        # Split on the first colon only, so the URL's own "://" survives.
        field, _, rest = raw_line.partition(":")
        if field.strip().lower() != "sitemap":
            continue
        location = rest.strip()
        if location:
            found.append(location)
    return found
getdocs/source.py ADDED
@@ -0,0 +1,238 @@
1
+ """Source-first: before crawling, check whether the docs site is open-source.
2
+
3
+ Most documentation generators embed a link back to the repository that hosts
4
+ the docs source — MkDocs/Material and Docusaurus render an "Edit this page"
5
+ link, Sphinx/Read-the-Docs an "Edit on GitHub". When we can find that repo we
6
+ clone it and write an mkdocs.yml so the docs can be served locally, which is
7
+ faster and higher-fidelity than crawling the rendered HTML.
8
+
9
+ Detection is a pure function over the seed page's HTML (`detect_repo`); the
10
+ network and git side-effects live in `fetch_html`, `clone_repo`, and the
11
+ `clone_source_for` orchestrator the CLI calls before `run_crawl`.
12
+ """
13
+
14
+ import importlib.util
15
+ import shutil
16
+ import subprocess
17
+ import sys
18
+ import urllib.request
19
+ from pathlib import Path
20
+ from urllib.parse import urljoin, urlsplit
21
+
22
+ import yaml
23
+ from bs4 import BeautifulSoup
24
+
25
+ from getdocs.config import CrawlConfig
26
+ from getdocs.identity import build_user_agent
27
+
28
# Hosts whose /ORG/REPO paths we recognize as clonable repositories.
# repo_root_from_url compares against the lower-cased host with any leading
# "www." removed.
_GIT_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}

# First path segments on github.com that are product pages, not orgs/users.
# repo_root_from_url checks the first segment case-insensitively against this
# set for every host in _GIT_HOSTS, not only github.com.
_RESERVED_OWNERS = {
    "about", "apps", "collections", "contact", "customer-stories", "explore",
    "features", "join", "login", "marketplace", "new", "notifications",
    "organizations", "orgs", "pricing", "readme", "security", "settings",
    "site", "sponsors", "topics",
}

# Sub-paths a repo URL deep-links through (github.com/o/r/edit/main/docs/x.md).
# _link_score looks for one of these as the third path segment.
_REPO_SUBPATHS = {"edit", "blob", "tree", "raw", "commits", "blame", "wiki"}

# Where doc sources commonly live inside a repo, in preference order.
# find_docs_dir returns the first entry that is a directory containing
# .md or .mdx files.
_DOCS_CANDIDATES = ["docs", "doc", "documentation", "site/docs", "website/docs", "content"]
44
+
45
+
46
def repo_root_from_url(url: str) -> str | None:
    """Reduce any URL on a known Git host to its canonical repo root, or None.

    https://github.com/org/repo/edit/main/docs/x.md -> https://github.com/org/repo
    https://gitlab.com/g/sub/proj/-/edit/main/x.md  -> https://gitlab.com/g/sub/proj
    Anything off a known host, or shallower than /ORG/REPO, returns None.

    The result always uses the https scheme and the host without userinfo,
    port, or a leading "www."; a trailing ".git" on the repo name is removed.
    """
    parts = urlsplit(url)
    if parts.scheme not in ("http", "https"):
        return None
    # Strip userinfo ("user@") and port (":443") from the authority.
    host = parts.netloc.lower().rsplit("@", 1)[-1].split(":", 1)[0]
    if host.startswith("www."):
        host = host[4:]
    if host not in _GIT_HOSTS:
        return None
    segments = [s for s in parts.path.split("/") if s]
    if host == "gitlab.com" and "-" in segments:
        # GitLab projects may sit below nested subgroups; its deep links mark
        # the end of the project path with a literal "-" segment.
        project = segments[: segments.index("-")]
    else:
        project = segments[:2]
    if len(project) < 2:
        return None
    if project[0].lower() in _RESERVED_OWNERS:
        return None
    if project[-1].endswith(".git"):
        project[-1] = project[-1][:-4]
    if not project[-1]:
        return None
    return f"https://{host}/" + "/".join(project)
71
+
72
+
73
+ def _link_score(url: str, blob: str) -> int:
74
+ """Rank a repo link by how strongly it signals 'this is the docs source'.
75
+
76
+ An explicit "edit this page" affordance is the gold signal; a "source"/
77
+ "github" label is next; a bare deep-link into the repo beats a plain
78
+ repo-root link that might just be a footer cross-reference.
79
+ """
80
+ if "edit" in blob:
81
+ return 100
82
+ if any(word in blob for word in ("source", "github", "gitlab", "view on", "improve")):
83
+ return 80
84
+ segments = [s for s in urlsplit(url).path.split("/") if s]
85
+ if len(segments) >= 3 and segments[2] in _REPO_SUBPATHS:
86
+ return 60
87
+ return 20
88
+
89
+
90
def detect_repo(html: str, base_url: str = "") -> str | None:
    """Pick the repository a docs page most plausibly links back to.

    Every ``<a href>`` is resolved against *base_url* when it has no scheme
    and a base is given, then reduced to a repo root by ``repo_root_from_url``.
    Each root keeps the best ``_link_score`` seen and a count of links to it.
    The root with the highest (score, count) wins; None when no anchor points
    at a recognised repository.
    """
    tree = BeautifulSoup(html, "html.parser")
    best_score: dict[str, int] = {}
    link_count: dict[str, int] = {}
    for link in tree.find_all("a", href=True):
        target = link["href"].strip()
        if base_url and not urlsplit(target).scheme:
            target = urljoin(base_url, target)
        repo = repo_root_from_url(target)
        if not repo:
            continue
        # Everything that might label the link: visible text, title, CSS
        # classes, rel values and aria-label, lower-cased for matching.
        label_parts = [
            link.get_text(" ", strip=True),
            link.get("title", ""),
            " ".join(link.get("class") or []),
            " ".join(link.get("rel") or []),
            link.get("aria-label", ""),
        ]
        label = " ".join(label_parts).lower()
        points = _link_score(target, label)
        best_score[repo] = max(best_score.get(repo, 0), points)
        link_count[repo] = link_count.get(repo, 0) + 1
    if not best_score:
        return None
    return max(best_score, key=lambda repo: (best_score[repo], link_count[repo]))
121
+
122
+
123
def fetch_html(
    url: str, user_agent: str | None = None,
    timeout: float = 15.0, max_bytes: int = 3_000_000,
) -> str | None:
    """Download one page for repo detection and return it as text.

    Sends *user_agent* (or the default from ``build_user_agent()``), reads at
    most *max_bytes*, and decodes as UTF-8 with undecodable bytes replaced.
    Returns None when the request raises, or when the Content-Type mentions
    json, image/, pdf or octet-stream.
    """
    agent = user_agent or build_user_agent()
    request = urllib.request.Request(url, headers={"User-Agent": agent})
    rejected_kinds = ("json", "image/", "pdf", "octet-stream")
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:  # noqa: S310
            declared = response.headers.get("Content-Type", "").lower()
            for kind in rejected_kinds:
                if kind in declared:
                    return None
            payload = response.read(max_bytes)
    except Exception:
        # Best effort: any failure just means "no page to inspect".
        return None
    return payload.decode("utf-8", errors="replace")
140
+
141
+
142
+ def clone_repo(repo_url: str, dest_parent: Path, timeout: float = 180.0) -> Path | None:
143
+ """Shallow-clone repo_url under dest_parent; return the clone dir or None.
144
+
145
+ Returns an existing clone untouched (idempotent for re-runs); None when git
146
+ is missing or the clone fails.
147
+ """
148
+ if shutil.which("git") is None:
149
+ return None
150
+ name = repo_url.rstrip("/").rsplit("/", 1)[-1]
151
+ if name.endswith(".git"):
152
+ name = name[:-4]
153
+ dest = dest_parent / name
154
+ if dest.exists():
155
+ return dest if (dest / ".git").exists() else None
156
+ clone_url = repo_url if repo_url.endswith(".git") else repo_url + ".git"
157
+ dest_parent.mkdir(parents=True, exist_ok=True)
158
+ try:
159
+ subprocess.run(
160
+ ["git", "clone", "--depth", "1", clone_url, str(dest)],
161
+ check=True, capture_output=True, timeout=timeout,
162
+ )
163
+ except (subprocess.CalledProcessError, subprocess.TimeoutExpired, OSError):
164
+ return None
165
+ return dest
166
+
167
+
168
def find_docs_dir(repo_dir: Path) -> Path | None:
    """Return the directory holding a cloned repo's markdown docs, or None.

    The first ``_DOCS_CANDIDATES`` entry that is a directory containing any
    ``.md`` or ``.mdx`` file (at any depth) wins. Otherwise the repo root is
    returned when it directly contains a ``.md`` file.
    """
    def has_markdown(folder: Path) -> bool:
        return any(folder.rglob("*.md")) or any(folder.rglob("*.mdx"))

    for relative in _DOCS_CANDIDATES:
        folder = repo_dir / relative
        if not folder.is_dir():
            continue
        if has_markdown(folder):
            return folder
    return repo_dir if any(repo_dir.glob("*.md")) else None
177
+
178
+
179
def write_mkdocs_config(output_dir: Path, docs_dir: Path, site_name: str) -> Path:
    """Write an mkdocs.yml in output_dir that serves docs_dir locally.

    Creates *output_dir* if needed and overwrites any mkdocs.yml already
    there. The theme is "material" when a ``material`` package is importable,
    otherwise "mkdocs". Returns the path of the written file.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    theme = "material" if importlib.util.find_spec("material") else "mkdocs"
    config = {
        "site_name": site_name,
        "docs_dir": str(docs_dir.resolve()),
        "theme": {"name": theme},
        "use_directory_urls": True,
    }
    path = output_dir / "mkdocs.yml"
    # allow_unicode=True emits non-ASCII characters verbatim, so pin the file
    # encoding rather than depending on the platform's locale default.
    path.write_text(
        yaml.safe_dump(config, sort_keys=False, allow_unicode=True),
        encoding="utf-8",
    )
    return path
192
+
193
+
194
def clone_source_for(config: CrawlConfig) -> Path | None:
    """Attempt to replace a crawl with a clone of the docs' source repo.

    Only the first seed is inspected. Its page is fetched, scanned for a
    repository link, and that repository is cloned under ``config.output_dir``.
    If the clone has no mkdocs.yml of its own but a docs directory is found,
    an mkdocs.yml is written into ``config.output_dir``.

    Returns the clone directory on success (even when no markdown docs were
    found in it); None when there are no seeds, the seed is not http(s), the
    page cannot be fetched, no repository is linked, or the clone fails.
    Progress messages go to stderr.
    """
    def report(message: str) -> None:
        print(message, file=sys.stderr)

    if not config.seeds:
        return None
    first_seed = config.seeds[0]
    seed_parts = urlsplit(first_seed)
    if seed_parts.scheme not in ("http", "https"):
        return None

    host = seed_parts.netloc or first_seed
    report(f"checking whether {host} is open-source…")
    page = fetch_html(first_seed, build_user_agent(config.contact, config.user_agent))
    if page is None:
        report("could not fetch the seed page — crawling instead")
        return None

    repo_url = detect_repo(page, first_seed)
    if repo_url is None:
        report("no source repository linked from the page — crawling instead")
        return None

    report(f"found source repository {repo_url} — cloning…")
    repo_dir = clone_repo(repo_url, config.output_dir)
    if repo_dir is None:
        report("clone failed (git missing or repo unreachable) — crawling instead")
        return None

    # A repo that ships its own mkdocs.yml is served with that file as-is.
    own_config = repo_dir / "mkdocs.yml"
    if own_config.exists():
        report(f"cloned to {repo_dir} (ships its own mkdocs.yml)")
        report(f"serve it with: mkdocs serve -f {own_config}")
        return repo_dir

    docs_dir = find_docs_dir(repo_dir)
    if docs_dir is None:
        report(f"cloned to {repo_dir}, but found no markdown docs to serve")
        return repo_dir

    written = write_mkdocs_config(config.output_dir, docs_dir, host)
    report(f"cloned to {repo_dir}; wrote {written}")
    report(f"serve it with: mkdocs serve -f {written}")
    return repo_dir
getdocs/urlnorm.py ADDED
@@ -0,0 +1,34 @@
1
+ """URL normalization for frontier dedup.
2
+
3
+ Two URLs that normalize identically are the same Page. rel=canonical is
4
+ deliberately NOT part of this (ADR-0003).
5
+ """
6
+
7
+ from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
8
+
9
# Query parameters whose name starts with one of these prefixes are dropped.
_TRACKING_PREFIXES = ("utm_",)
# Query parameters with exactly one of these names are dropped.
_TRACKING_PARAMS = {"fbclid", "gclid", "msclkid", "mc_cid", "mc_eid"}
# Default port per scheme; kept as strings because normalize() compares them
# with str(port).
_DEFAULT_PORTS = {"http": "80", "https": "443"}
12
+
13
+
14
def _is_tracking(param: str) -> bool:
    """Return whether a query-parameter name is a known tracking parameter.

    True for an exact name in ``_TRACKING_PARAMS`` or any name starting with
    one of ``_TRACKING_PREFIXES``; the comparison is case-sensitive.
    """
    if param in _TRACKING_PARAMS:
        return True
    return param.startswith(_TRACKING_PREFIXES)
16
+
17
+
18
def normalize(url: str) -> str:
    """Return the canonical form of *url* used for frontier dedup.

    The scheme and host are lower-cased; userinfo, the scheme's default port
    and the fragment are dropped; trailing slashes are removed from the path;
    tracking query parameters are removed and the rest sorted. IPv6 literal
    hosts keep their square brackets so the result is still a valid URL.

    Raises:
        ValueError: if the URL's port is not a valid port number (raised by
            reading the parsed ``port``).
    """
    parts = urlsplit(url)
    scheme = parts.scheme.lower()

    host = parts.hostname.lower() if parts.hostname else ""
    if ":" in host:
        # .hostname strips the brackets from an IPv6 literal; put them back,
        # otherwise the address's colons merge with the port separator.
        host = f"[{host}]"
    if parts.port is not None and str(parts.port) != _DEFAULT_PORTS.get(scheme):
        host = f"{host}:{parts.port}"

    # "/docs/" and "/docs" are the same Page; "/" itself becomes "".
    path = parts.path.rstrip("/")

    query = urlencode(
        sorted(
            (k, v)
            for k, v in parse_qsl(parts.query, keep_blank_values=True)
            if not _is_tracking(k)
        )
    )

    return urlunsplit((scheme, host, path, query, ""))
@@ -0,0 +1,169 @@
1
+ Metadata-Version: 2.4
2
+ Name: getdocs
3
+ Version: 0.1.0
4
+ Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
5
+ Author-email: jonbakerfish <jonbakerfish@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/jonbakerfish/getdocs
8
+ Project-URL: Repository, https://github.com/jonbakerfish/getdocs
9
+ Project-URL: Issues, https://github.com/jonbakerfish/getdocs/issues
10
+ Project-URL: Documentation, https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md
11
+ Keywords: documentation,crawler,scraper,markdown,docs,llm,agents,rag
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
19
+ Classifier: Topic :: Software Development :: Documentation
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
21
+ Requires-Python: >=3.12
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: scrapy>=2.11
25
+ Requires-Dist: markdownify>=0.13
26
+ Requires-Dist: beautifulsoup4>=4.12
27
+ Requires-Dist: pyyaml>=6.0
28
+ Requires-Dist: trafilatura>=1.12
29
+ Requires-Dist: scrapy-playwright>=0.0.40
30
+ Provides-Extra: server
31
+ Requires-Dist: fastapi>=0.110; extra == "server"
32
+ Requires-Dist: uvicorn>=0.27; extra == "server"
33
+ Requires-Dist: httpx>=0.27; extra == "server"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=8.0; extra == "dev"
36
+ Requires-Dist: httpx>=0.27; extra == "dev"
37
+ Requires-Dist: fastapi>=0.110; extra == "dev"
38
+ Requires-Dist: uvicorn>=0.27; extra == "dev"
39
+ Dynamic: license-file
40
+
41
+ # getdocs
42
+
43
+ **Turn any documentation site into a clean, local markdown copy your coding agent can actually read.**
44
+
45
+ ```bash
46
+ getdocs crawl https://example.com/docs -o ./out
47
+ ```
48
+
49
+ Coding agents are only as good as the docs they can see. Pointing an agent at a
50
+ live docs URL means it burns tokens on nav bars, cookie banners, and HTML
51
+ chrome — or can't reach the page at all. `getdocs` gives the agent a local,
52
+ offline, markdown mirror instead: the actual content, structured to match the
53
+ original site, ready to drop into a repo or feed to a model.
54
+
55
+ ## Why getdocs
56
+
57
+ - **Richer context for coding agents.** A local copy is greppable, indexable,
58
+ and always available — the agent reads the whole library at once instead of
59
+ fetching one rendered page at a time. No rate limits, no network flakiness,
60
+ no JS that won't hydrate.
61
+ - **Clean markdown → fewer tokens.** Each page is reduced to its content (the
62
+ nav, headers, footers, and ad chrome stripped) and written as plain markdown.
63
+ Agents consume it directly, and you spend tokens on docs, not `<div>` soup.
64
+ - **Structure preserved.** Files mirror the URL hierarchy
65
+ (`example.com/docs/auth` → `out/docs/auth.md`), each with YAML frontmatter
66
+ (`url`, `title`, `crawled_at`, `status`), plus a `crawl.json` Manifest that
67
+ captures the site's nav order and reading order — so an agent can follow the
68
+ docs in the order the authors intended.
69
+ - **Source-first: clone over crawl.** If the docs site is open-source, getdocs
70
+ detects the "Edit this page" link, clones the repo, and serves you the
71
+ original markdown source instead of scraping HTML — the highest-fidelity copy
72
+ there is. Falls back to crawling automatically when there's no repo.
73
+
74
+ ## When to reach for it
75
+
76
+ - **Coding against an unfamiliar library or API.** Mirror its docs into your
77
+ repo (or a scratch dir) so your agent can ground its answers in the real
78
+ reference instead of hallucinating from memory.
79
+ - **RAG / knowledge bases.** Get a clean markdown corpus to chunk and embed,
80
+ without writing a bespoke scraper-and-cleaner for every site.
81
+ - **Offline or air-gapped work.** Take a docs set with you; read and search it
82
+ with no network.
83
+ - **Pinning a version.** Snapshot today's docs so your agent isn't tripped up
84
+ when the upstream site changes underneath you.
85
+ - **Migrating or archiving docs.** Pull an entire site down as markdown to move,
86
+ diff, or keep.
87
+
88
+ ## Output
89
+
90
+ ```
91
+ out/
92
+ ├── crawl.json ← the Manifest: nav order, reading order, what ran
93
+ └── docs/
94
+ ├── index.md
95
+ ├── auth.md
96
+ └── guide/
97
+ └── intro.md
98
+ ```
99
+
100
+ ```markdown
101
+ ---
102
+ url: https://example.com/docs/auth
103
+ title: Authentication
104
+ status: 200
105
+ crawled_at: 2026-06-12T10:00:00Z
106
+ ---
107
+
108
+ # Authentication
109
+ ...
110
+ ```
111
+
112
+ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
113
+ throttling, JSONL output, and resumable crawls are all built in — see
114
+ [docs/USAGE.md](docs/USAGE.md).
115
+
116
+ ## Install
117
+
118
+ Requires **Python 3.12+**.
119
+
120
+ ```bash
121
+ pip install getdocs
122
+ ```
123
+
124
+ Or from source, for the latest unreleased changes:
125
+
126
+ ```bash
127
+ git clone https://github.com/jonbakerfish/getdocs
128
+ cd getdocs
129
+ pip install -e .
130
+ ```
131
+
132
+ That's enough to crawl. Two optional pieces unlock more:
133
+
134
+ ```bash
135
+ # JavaScript rendering — the headless browser used to hydrate SPA docs
136
+ playwright install chromium
137
+
138
+ # Serve a crawled/cloned copy locally as a browsable site
139
+ pip install mkdocs mkdocs-material
140
+ ```
141
+
142
+ **`git`** must be on your `PATH` for source-first cloning (it almost always
143
+ already is); without it, getdocs simply falls back to crawling. To run the
144
+ optional API service, install the server extra: `pip install "getdocs[server]"`.
145
+
146
+ ## Development
147
+
148
+ ```bash
149
+ git clone https://github.com/jonbakerfish/getdocs
150
+ cd getdocs
151
+ pip install -e ".[dev]"
152
+ pytest
153
+ ```
154
+
155
+ ## Responsible use
156
+
157
+ getdocs is a tool; how you point it is on you. By default it **honors
158
+ `robots.txt`**, throttles itself politely, and **identifies itself honestly**
159
+ in the `User-Agent` (`getdocs/<version> (+project-url)`) — please keep it that
160
+ way. For high-volume crawls, add `--contact you@example.com` so site operators
161
+ can reach you (it's appended to the User-Agent; optional but courteous).
162
+
163
+ getdocs is intended for personal, reference, and agent/RAG use on documentation
164
+ you have the right to access. **You are solely responsible for complying with
165
+ each site's Terms of Service, its `robots.txt`, applicable law, and the
166
+ copyright of the content you fetch** — getdocs is provided as-is, with no
167
+ warranty (see [LICENSE](LICENSE)). Crawled documentation belongs to its authors:
168
+ use it for your own reference, but don't redistribute someone else's docs as
169
+ your own. Crawl only what you have the right to.
@@ -0,0 +1,21 @@
1
+ getdocs/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ getdocs/__main__.py,sha256=_Tn13GrLH9DQGu82AfWWm7dShwaORqV5PnRTfl7Q03Q,55
3
+ getdocs/api.py,sha256=y-jNi2-OJ7iQFlMPyxc5AeX5Cg1yo-r6eI3gLn3b0d0,3098
4
+ getdocs/cli.py,sha256=fNQz2hv0EijfaqYOH4kfifA9_k9lQnDZs8KqmLkHA0M,8794
5
+ getdocs/config.py,sha256=c1DHYj5OheMEQJqQVUjc85Y1S8kM5CMIrlsakiQSh64,1671
6
+ getdocs/engine.py,sha256=jNaAcfXiz8BGNzvu_5ohbDA_pf_7Ubitw5GkJk1K978,17814
7
+ getdocs/extract.py,sha256=EBkzt8w9z1Fr_ZRc6IEFYBDn68L4SsyL7xiOeCw8j28,6671
8
+ getdocs/identity.py,sha256=5g8tZmgzlzCWqWQNDWVnTe9k2xVz_bSBYvmLoXh8OUE,1228
9
+ getdocs/jobs.py,sha256=SC6c2Isfn_nyFSTsGfJOGLvGsr5_RLHJa5G1uUhoJYk,7371
10
+ getdocs/navharvest.py,sha256=Ol0coVL4Z_HlZCarotADf09t2eCGq80NkHt6eVWIxcI,8510
11
+ getdocs/output.py,sha256=FPvCFTzL4m9HT1MsqRc8NBfnESGvOIMPWdmfPEDE-rM,6105
12
+ getdocs/scope.py,sha256=lrPOsxtQIrg7HrV87qf7tnXwTAXlV0yY0mBhyNciG3c,2622
13
+ getdocs/sitemap.py,sha256=buPx7HteGLnba-8ZdvE2nR8WHHcZVJKKq859K8Ap4Yw,992
14
+ getdocs/source.py,sha256=AJOhOosaZ9PTbvix8ob3BB3Fnuvk-DUrAm-AkRJX6wc,9354
15
+ getdocs/urlnorm.py,sha256=6EQicHbfKmW0ZtXeT5SgpVTBADXNg6kFRNP5sm50uFY,1043
16
+ getdocs-0.1.0.dist-info/licenses/LICENSE,sha256=Rccj8_h8ZBIXibP7auL1DDSOPRlZhYRMeVxLqbtsFRI,1069
17
+ getdocs-0.1.0.dist-info/METADATA,sha256=lJ90Zf55F27iI6u5cOXB3wRNVIOo7N55dmT5Dm5BApY,6382
18
+ getdocs-0.1.0.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
19
+ getdocs-0.1.0.dist-info/entry_points.txt,sha256=OTldHBzF-MGYD2brSgt7FOTQZ5rimQkDn6ghb3UujXg,45
20
+ getdocs-0.1.0.dist-info/top_level.txt,sha256=KMBRopk15nwErQ-PZQSe8xAQiHhLuRdQU1qJzeYpVIQ,8
21
+ getdocs-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (82.0.1)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ getdocs = getdocs.cli:main
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 jonbakerfish
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1 @@
1
+ getdocs