getdocs 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- getdocs/__init__.py +0 -0
- getdocs/__main__.py +3 -0
- getdocs/api.py +95 -0
- getdocs/cli.py +220 -0
- getdocs/config.py +36 -0
- getdocs/engine.py +418 -0
- getdocs/extract.py +190 -0
- getdocs/identity.py +32 -0
- getdocs/jobs.py +204 -0
- getdocs/navharvest.py +242 -0
- getdocs/output.py +191 -0
- getdocs/scope.py +84 -0
- getdocs/sitemap.py +35 -0
- getdocs/source.py +238 -0
- getdocs/urlnorm.py +34 -0
- getdocs-0.1.0.dist-info/METADATA +169 -0
- getdocs-0.1.0.dist-info/RECORD +21 -0
- getdocs-0.1.0.dist-info/WHEEL +5 -0
- getdocs-0.1.0.dist-info/entry_points.txt +2 -0
- getdocs-0.1.0.dist-info/licenses/LICENSE +21 -0
- getdocs-0.1.0.dist-info/top_level.txt +1 -0
getdocs/__init__.py
ADDED
|
File without changes
|
getdocs/__main__.py
ADDED
getdocs/api.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""API service: Firecrawl-style async Crawl jobs over the engine (ADR-0002)."""
|
|
2
|
+
|
|
3
|
+
from fastapi import FastAPI, HTTPException, WebSocket
|
|
4
|
+
from pydantic import BaseModel, model_validator
|
|
5
|
+
|
|
6
|
+
from getdocs.jobs import CrawlJob, JobManager
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CrawlRequest(BaseModel):
    """Request body describing a Crawl to start.

    At least one of ``url`` / ``urls`` is required. Every other field is
    optional; ``sitemap`` and ``render`` are restricted to their documented
    value sets so a typo is rejected at the request boundary instead of being
    forwarded downstream.
    """

    # Seeds: a single URL, a list of URLs, or both.
    url: str | None = None
    urls: list[str] | None = None
    # Crawl bounds.
    limit: int | None = None
    depth: int | None = None
    # Scope controls.
    allow_backward: bool = False
    allow_subdomains: bool = False
    include_paths: list[str] | None = None
    exclude_paths: list[str] | None = None
    sitemap: str | None = None  # "both" | "off" | "only"
    render: str | None = None  # "auto" | "always" | "never"
    selector: str | None = None
    ignore_robots: bool = False
    keep_html: bool = False
    delay: float | None = None
    concurrency: int | None = None
    webhook: str | None = None  # URL POSTed started/page/completed events

    @model_validator(mode="after")
    def _require_some_url(self):
        """Fail validation when neither ``url`` nor ``urls`` carries a seed."""
        if not self.url and not self.urls:
            raise ValueError("either url or urls is required")
        return self

    @model_validator(mode="after")
    def _check_mode_values(self):
        """Fail validation when ``sitemap`` / ``render`` is outside its allowed set."""
        allowed_by_field = (
            ("sitemap", ("both", "off", "only")),
            ("render", ("auto", "always", "never")),
        )
        for field_name, allowed in allowed_by_field:
            value = getattr(self, field_name)
            # None means "not supplied" and stays valid.
            if value is not None and value not in allowed:
                raise ValueError(
                    f"{field_name} must be one of: {', '.join(allowed)}"
                )
        return self
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _serialize(job: CrawlJob) -> dict:
|
|
35
|
+
return {
|
|
36
|
+
"id": job.id,
|
|
37
|
+
"status": job.status,
|
|
38
|
+
"seeds": job.seeds,
|
|
39
|
+
"page_count": len(job.pages),
|
|
40
|
+
"pages": job.pages,
|
|
41
|
+
"manifest": job.manifest,
|
|
42
|
+
"error": job.error,
|
|
43
|
+
"webhook_failures": job.webhook_failures,
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def create_app(manager: JobManager | None = None) -> FastAPI:
    """Build the FastAPI application exposing the Crawl-job endpoints.

    Args:
        manager: Job manager backing every endpoint. A new ``JobManager`` is
            created only when this is ``None``.

    Returns:
        The configured application; the manager in use is also stored on
        ``app.state.manager``.
    """
    # Test identity, not truthiness: a caller-supplied manager must never be
    # swapped for a fresh one just because it happens to evaluate falsy.
    if manager is None:
        manager = JobManager()
    app = FastAPI(title="getdocs", version="0.1.0")
    app.state.manager = manager

    # Handlers are documented with comments rather than docstrings so the
    # generated OpenAPI descriptions stay exactly as they were.

    # Start a job from the request's non-None fields; answers 202 with id/status.
    @app.post("/v1/crawl", status_code=202)
    async def start_crawl(request: CrawlRequest):
        job = manager.start(request.model_dump(exclude_none=True))
        return {"id": job.id, "status": job.status}

    # List every known job in summary form (no pages, manifest or error).
    @app.get("/v1/crawl")
    async def list_crawls():
        return {
            "jobs": [
                {
                    "id": job.id,
                    "status": job.status,
                    "seeds": job.seeds,
                    "page_count": len(job.pages),
                }
                for job in manager.jobs.values()
            ]
        }

    # Full view of one job; 404 when the id is unknown.
    @app.get("/v1/crawl/{job_id}")
    async def get_crawl(job_id: str):
        job = manager.get(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="no such Crawl job")
        return _serialize(job)

    # Forward the job's events as JSON messages, then close the socket.
    @app.websocket("/v1/crawl/{job_id}/ws")
    async def stream_crawl(websocket: WebSocket, job_id: str):
        # The connection is accepted first, then closed with the
        # application-level code 4404 when the job id is unknown.
        await websocket.accept()
        if manager.get(job_id) is None:
            await websocket.close(code=4404, reason="no such Crawl job")
            return
        async for event in manager.stream(job_id):
            await websocket.send_json(event)
        await websocket.close()

    # Cancel a job; 404 when the id is unknown, else its id and status.
    @app.delete("/v1/crawl/{job_id}")
    async def cancel_crawl(job_id: str):
        job = manager.cancel(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="no such Crawl job")
        return {"id": job.id, "status": job.status}

    return app
|
getdocs/cli.py
ADDED
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""CLI: argument parsing to CrawlConfig, engine invocation, exit-code mapping."""
|
|
2
|
+
|
|
3
|
+
import argparse
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
from getdocs.config import CrawlConfig, ServeConfig
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def parse_args(argv: list[str] | None = None) -> CrawlConfig | ServeConfig:
    """Parse the command line into a config object.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        A ``ServeConfig`` for the ``serve`` subcommand, otherwise a
        ``CrawlConfig`` built from the ``crawl`` options. Seeds are the
        positional URLs followed by the non-blank, non-``#`` lines of
        ``--seeds-file``.

    Raises:
        SystemExit: Via argparse (exit status 2) on invalid usage, when the
            seeds file is missing or cannot be read, or when no seed is given
            without ``--resume``.
    """
    parser = argparse.ArgumentParser(
        prog="getdocs",
        description="Crawl a documentation site and emit clean markdown.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    crawl = subparsers.add_parser("crawl", help="Run a Crawl from one or more seed URLs")
    crawl.add_argument(
        "seeds", nargs="*", metavar="URL",
        help="Seed URL(s) for the Crawl (omit with --resume to reuse saved seeds)",
    )
    crawl.add_argument(
        "-f", "--seeds-file", type=Path, metavar="FILE",
        help="File of additional Seed URLs, one per line (# comments and blank lines ignored)",
    )
    crawl.add_argument(
        "--resume", action="store_true",
        help="Continue the interrupted Crawl whose state lives in the output directory",
    )
    crawl.add_argument(
        "-o", "--output-dir", type=Path, default=Path("./out"),
        help="Directory the Pages and Manifest are written to (default: ./out)",
    )
    crawl.add_argument(
        "--allow-backward", action="store_true",
        help="Widen Scope from the seed's path prefix to its whole host",
    )
    crawl.add_argument(
        "--allow-subdomains", action="store_true",
        help="Widen Scope to subdomains of the seed host",
    )
    crawl.add_argument(
        "--include-paths", action="append", default=[], metavar="GLOB",
        help="Only crawl paths matching at least one glob (repeatable)",
    )
    crawl.add_argument(
        "--exclude-paths", action="append", default=[], metavar="GLOB",
        help="Never crawl paths matching a glob (repeatable)",
    )
    crawl.add_argument(
        "--depth", type=int, default=0, metavar="N",
        help="Maximum link-hops from any seed (default: 0 = unlimited)",
    )
    crawl.add_argument(
        "--limit", type=int, default=1000, metavar="N",
        help="Maximum Pages per Crawl (default: 1000; 0 = unlimited)",
    )
    # Both flags write to the same dest; the default lives on the first one.
    sitemap_group = crawl.add_mutually_exclusive_group()
    sitemap_group.add_argument(
        "--no-sitemap", dest="sitemap", action="store_const", const="off", default="both",
        help="Discover pages by link traversal only",
    )
    sitemap_group.add_argument(
        "--sitemap-only", dest="sitemap", action="store_const", const="only",
        help="Crawl exactly the in-Scope sitemap URLs; follow no links",
    )
    crawl.add_argument(
        "--format", choices=["files", "jsonl"], default="files",
        help="files: .md tree + crawl.json; jsonl: one record per Page on stdout",
    )
    crawl.add_argument(
        "--selector", metavar="CSS",
        help="CSS selector for the content container (overrides auto-detection)",
    )
    crawl.add_argument(
        "--render", choices=["auto", "always", "never"], default="auto",
        help="JavaScript rendering: auto re-fetches detected SPA shells via a "
        "headless browser, always renders everything, never disables it",
    )
    crawl.add_argument(
        "--ignore-robots", action="store_true",
        help="Consciously override robots.txt rules — only for sites you own "
        "or have permission to crawl",
    )
    crawl.add_argument(
        "--delay", type=float, default=1.0, metavar="SECONDS",
        help="Adaptive-throttle start delay between requests (default: 1.0; 0 = no throttle)",
    )
    crawl.add_argument(
        "--concurrency", type=int, default=4, metavar="N",
        help="Concurrent requests per domain (default: 4)",
    )
    crawl.add_argument(
        "--download-media", action="store_true",
        help="Download referenced images/documents (Assets) into _media/ and "
        "rewrite links to the local copies",
    )
    crawl.add_argument(
        "--media-max-size", type=float, default=50.0, metavar="MB",
        help="Per-Asset size cap for --download-media (default: 50); larger files stay linked",
    )
    crawl.add_argument(
        "--keep-html", action="store_true",
        help="Also keep each Page's raw HTML (sidecar file / jsonl field)",
    )
    crawl.add_argument(
        "--no-clone-source", dest="clone_source", action="store_false", default=True,
        help="Always crawl, even when the docs site links a public source repo "
        "(by default getdocs clones that repo instead of crawling)",
    )
    crawl.add_argument(
        "--contact", metavar="EMAIL_OR_URL",
        help="Contact appended to the User-Agent so site operators can reach you "
        "(recommended for high-volume crawls; optional)",
    )
    crawl.add_argument(
        "--user-agent", metavar="STRING",
        help="Override the User-Agent string getdocs sends entirely",
    )

    serve = subparsers.add_parser("serve", help="Run the getdocs API service")
    serve.add_argument("--host", default="127.0.0.1", help="Bind address (default: 127.0.0.1)")
    serve.add_argument("--port", type=int, default=8000, help="Port (default: 8000)")

    args = parser.parse_args(argv)
    if args.command == "serve":
        return ServeConfig(host=args.host, port=args.port)

    seeds = list(args.seeds)
    if args.seeds_file is not None:
        # Read first and report failures through argparse: an exists() check
        # lets directories and unreadable files through, which would then
        # surface as a raw traceback instead of a usage error.
        try:
            seeds_text = args.seeds_file.read_text()
        except FileNotFoundError:
            crawl.error(f"seeds file not found: {args.seeds_file}")
        except (OSError, UnicodeDecodeError) as exc:
            crawl.error(f"cannot read seeds file {args.seeds_file}: {exc}")
        seeds += [
            line.strip()
            for line in seeds_text.splitlines()
            if line.strip() and not line.lstrip().startswith("#")
        ]
    if not seeds and not args.resume:
        crawl.error("at least one seed URL is required (or --seeds-file / --resume)")
    return CrawlConfig(
        seeds=seeds,
        resume=args.resume,
        output_dir=args.output_dir,
        allow_backward=args.allow_backward,
        allow_subdomains=args.allow_subdomains,
        include_paths=args.include_paths,
        exclude_paths=args.exclude_paths,
        depth=args.depth,
        limit=args.limit,
        format=args.format,
        keep_html=args.keep_html,
        sitemap=args.sitemap,
        selector=args.selector,
        render=args.render,
        ignore_robots=args.ignore_robots,
        delay=args.delay,
        concurrency=args.concurrency,
        download_media=args.download_media,
        media_max_size=args.media_max_size,
        clone_source=args.clone_source,
        contact=args.contact,
        user_agent=args.user_agent,
    )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return the process exit code.

    Args:
        argv: Argument list handed to ``parse_args``; ``None`` means the
            process arguments.

    Returns:
        ``0`` when the server returned, the source repo was cloned, or the
        Crawl produced at least one Page; ``1`` when the Crawl produced no
        Pages; ``2`` for environment problems (missing server extra, missing
        renderer, missing or unreadable resume state).
    """
    import json
    import sys
    from dataclasses import replace

    from getdocs.engine import playwright_available, run_crawl, state_file_for

    config = parse_args(argv)
    if isinstance(config, ServeConfig):
        try:
            import uvicorn

            from getdocs.api import create_app
        except ImportError:
            print(
                'error: getdocs serve needs the server extra (pip install "getdocs[server]")',
                file=sys.stderr,
            )
            return 2
        uvicorn.run(create_app(), host=config.host, port=config.port)
        return 0
    if config.render == "always" and not playwright_available():
        print(
            "error: --render always needs scrapy-playwright "
            "(pip install scrapy-playwright && playwright install chromium)",
            file=sys.stderr,
        )
        return 2
    # Source-first (ADR-0006): if the docs site is open-source, clone its repo
    # instead of crawling. Files-mode only — jsonl is a page stream with no
    # place for a clone; --resume continues an existing crawl.
    if config.format == "files" and config.clone_source and not config.resume and config.seeds:
        from getdocs.source import clone_source_for

        if clone_source_for(config) is not None:
            return 0
    state_file = state_file_for(config)
    if config.resume:
        # Read directly instead of checking exists() first: this closes the
        # check/read race and lets a truncated or malformed state file be
        # reported as an error rather than crash with a traceback.
        try:
            saved_seeds = json.loads(state_file.read_text())["seeds"]
        except FileNotFoundError:
            print(f"error: no crawl state found in {config.output_dir}", file=sys.stderr)
            return 2
        except (OSError, ValueError, KeyError, TypeError) as exc:
            # ValueError covers invalid JSON and undecodable bytes;
            # KeyError/TypeError cover a document without a "seeds" entry.
            print(
                f"error: crawl state in {config.output_dir} is unreadable ({exc!r})",
                file=sys.stderr,
            )
            return 2
        # The saved seeds replace whatever was given on the command line.
        config = replace(config, seeds=saved_seeds)
    elif state_file.exists():
        print(
            f"note: found crawl state from an earlier run in {config.output_dir} — "
            "starting over (use --resume to continue it)",
            file=sys.stderr,
        )
        state_file.unlink()

    page_count = run_crawl(config)
    if page_count == 0:
        # stderr: stdout belongs to the jsonl stream (ADR-0002)
        print("error: no Pages produced — seed(s) unreachable?", file=sys.stderr, flush=True)
        return 1
    return 0
|
getdocs/config.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""CrawlConfig: the value object describing a Crawl — the engine boundary (ADR-0002)."""
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from pathlib import Path
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@dataclass(frozen=True)
|
|
8
|
+
class CrawlConfig:
|
|
9
|
+
seeds: list[str] = field(default_factory=list)
|
|
10
|
+
output_dir: Path = Path("./out")
|
|
11
|
+
allow_backward: bool = False
|
|
12
|
+
allow_subdomains: bool = False
|
|
13
|
+
include_paths: list[str] = field(default_factory=list)
|
|
14
|
+
exclude_paths: list[str] = field(default_factory=list)
|
|
15
|
+
depth: int = 0 # link-hops from any seed; 0 = unlimited
|
|
16
|
+
limit: int = 1000 # max Pages per Crawl; 0 = unlimited
|
|
17
|
+
format: str = "files" # "files" or "jsonl"
|
|
18
|
+
keep_html: bool = False
|
|
19
|
+
sitemap: str = "both" # "both", "off" (--no-sitemap), or "only" (--sitemap-only)
|
|
20
|
+
selector: str | None = None # CSS selector naming the content container
|
|
21
|
+
render: str = "auto" # "auto" (escalate on Shells), "always", or "never"
|
|
22
|
+
ignore_robots: bool = False
|
|
23
|
+
resume: bool = False # continue an interrupted Crawl from saved state
|
|
24
|
+
delay: float = 1.0 # throttle start delay in seconds; 0 disables throttling
|
|
25
|
+
concurrency: int = 4 # concurrent requests per domain
|
|
26
|
+
download_media: bool = False # fetch referenced Assets into _media/
|
|
27
|
+
media_max_size: float = 50.0 # per-Asset cap in MB; larger files stay hotlinked
|
|
28
|
+
clone_source: bool = True # clone the docs' source repo if the site is open-source
|
|
29
|
+
contact: str | None = None # email/URL appended to the User-Agent (crawling etiquette)
|
|
30
|
+
user_agent: str | None = None # override the User-Agent string entirely
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass(frozen=True)
class ServeConfig:
    """Immutable host/port pair for the API service."""

    host: str = "127.0.0.1"  # defaults to the loopback address
    port: int = 8000
|