getdocs 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
getdocs/__init__.py ADDED
File without changes
getdocs/__main__.py ADDED
@@ -0,0 +1,3 @@
1
+ from getdocs.cli import main
2
+
3
# Entry point for `python -m getdocs`: run the CLI and exit the process with
# whatever status code it returned.
exit_code = main()
raise SystemExit(exit_code)
getdocs/api.py ADDED
@@ -0,0 +1,95 @@
1
+ """API service: Firecrawl-style async Crawl jobs over the engine (ADR-0002)."""
2
+
3
+ from fastapi import FastAPI, HTTPException, WebSocket
4
+ from pydantic import BaseModel, model_validator
5
+
6
+ from getdocs.jobs import CrawlJob, JobManager
7
+
8
+
9
class CrawlRequest(BaseModel):
    """Request body describing a Crawl to start.

    Every field is optional on its own, but at least one of ``url`` / ``urls``
    must be supplied. ``sitemap`` and ``render`` are closed sets; anything
    outside the documented values is rejected at validation time rather than
    being passed on to the job manager.
    """

    url: str | None = None
    urls: list[str] | None = None
    limit: int | None = None
    depth: int | None = None
    allow_backward: bool = False
    allow_subdomains: bool = False
    include_paths: list[str] | None = None
    exclude_paths: list[str] | None = None
    sitemap: str | None = None  # "both" | "off" | "only"
    render: str | None = None  # "auto" | "always" | "never"
    selector: str | None = None
    ignore_robots: bool = False
    keep_html: bool = False
    delay: float | None = None
    concurrency: int | None = None
    webhook: str | None = None  # URL POSTed started/page/completed events

    @model_validator(mode="after")
    def _require_some_url(self):
        """Reject requests that name no seed at all (neither url nor urls)."""
        if not self.url and not self.urls:
            raise ValueError("either url or urls is required")
        return self

    @model_validator(mode="after")
    def _check_enumerated_modes(self):
        """Reject ``sitemap`` / ``render`` values outside their documented sets.

        ``None`` stays allowed: it means "not specified" and is dropped by
        callers that dump the model with ``exclude_none=True``.
        """
        allowed = {
            "sitemap": ("both", "off", "only"),
            "render": ("auto", "always", "never"),
        }
        for field_name, choices in allowed.items():
            value = getattr(self, field_name)
            if value is not None and value not in choices:
                raise ValueError(f"{field_name} must be one of {', '.join(choices)}")
        return self
32
+
33
+
34
def _serialize(job: CrawlJob) -> dict:
    """Return the full JSON-ready view of a Crawl job.

    The result carries the job's id, status and seeds, the number of pages
    (``page_count``), then the pages, manifest, error and webhook_failures
    attributes as they are on the job.
    """
    payload = {"id": job.id, "status": job.status, "seeds": job.seeds}
    payload["page_count"] = len(job.pages)
    # Remaining attributes are copied through unchanged, in response order.
    for attr in ("pages", "manifest", "error", "webhook_failures"):
        payload[attr] = getattr(job, attr)
    return payload
45
+
46
+
47
def create_app(manager: JobManager | None = None) -> FastAPI:
    """Build the FastAPI application exposing Crawl jobs under ``/v1/crawl``.

    Args:
        manager: Job manager backing every endpoint. A new ``JobManager()`` is
            created only when this is ``None``.

    Returns:
        The configured application. The manager in use is also stored on
        ``app.state.manager``.
    """
    # Explicit None check: `manager or JobManager()` would silently replace a
    # caller-supplied manager that happens to evaluate as falsy.
    if manager is None:
        manager = JobManager()
    app = FastAPI(title="getdocs", version="0.1.0")
    app.state.manager = manager

    @app.post("/v1/crawl", status_code=202)
    async def start_crawl(request: CrawlRequest):
        """Start a Crawl job; responds 202 with the new job's id and status."""
        # Fields left as None are omitted from the dict handed to the manager.
        job = manager.start(request.model_dump(exclude_none=True))
        return {"id": job.id, "status": job.status}

    @app.get("/v1/crawl")
    async def list_crawls():
        """List all known jobs as summaries (no pages, manifest or error)."""
        return {
            "jobs": [
                {
                    "id": job.id,
                    "status": job.status,
                    "seeds": job.seeds,
                    "page_count": len(job.pages),
                }
                for job in manager.jobs.values()
            ]
        }

    @app.get("/v1/crawl/{job_id}")
    async def get_crawl(job_id: str):
        """Return the full serialized job, or 404 when the id is unknown."""
        job = manager.get(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="no such Crawl job")
        return _serialize(job)

    @app.websocket("/v1/crawl/{job_id}/ws")
    async def stream_crawl(websocket: WebSocket, job_id: str):
        """Forward each event from ``manager.stream(job_id)`` as a JSON message."""
        # The socket is accepted before the lookup so an unknown job can be
        # reported through a close frame with application code 4404.
        await websocket.accept()
        if manager.get(job_id) is None:
            await websocket.close(code=4404, reason="no such Crawl job")
            return
        async for event in manager.stream(job_id):
            await websocket.send_json(event)
        await websocket.close()

    @app.delete("/v1/crawl/{job_id}")
    async def cancel_crawl(job_id: str):
        """Cancel a job; 404 when ``manager.cancel`` returns None for the id."""
        job = manager.cancel(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail="no such Crawl job")
        return {"id": job.id, "status": job.status}

    return app
getdocs/cli.py ADDED
@@ -0,0 +1,220 @@
1
+ """CLI: argument parsing to CrawlConfig, engine invocation, exit-code mapping."""
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+
6
+ from getdocs.config import CrawlConfig, ServeConfig
7
+
8
+
9
def parse_args(argv: list[str] | None = None) -> CrawlConfig | ServeConfig:
    """Parse command-line arguments into a config object.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        A ``ServeConfig`` for the ``serve`` subcommand, otherwise a
        ``CrawlConfig`` built from the ``crawl`` options. Seeds are the
        positional URLs followed by the non-blank, non-``#`` lines of
        ``--seeds-file``.

    Usage problems (no seeds without ``--resume``, a missing or unreadable
    seeds file) are reported through the ``crawl`` subparser's ``error()``,
    which prints usage and exits the process.
    """
    parser = argparse.ArgumentParser(
        prog="getdocs",
        description="Crawl a documentation site and emit clean markdown.",
    )
    subparsers = parser.add_subparsers(dest="command", required=True)

    crawl = subparsers.add_parser("crawl", help="Run a Crawl from one or more seed URLs")
    crawl.add_argument(
        "seeds", nargs="*", metavar="URL",
        help="Seed URL(s) for the Crawl (omit with --resume to reuse saved seeds)",
    )
    crawl.add_argument(
        "-f", "--seeds-file", type=Path, metavar="FILE",
        help="File of additional Seed URLs, one per line (# comments and blank lines ignored)",
    )
    crawl.add_argument(
        "--resume", action="store_true",
        help="Continue the interrupted Crawl whose state lives in the output directory",
    )
    crawl.add_argument(
        "-o", "--output-dir", type=Path, default=Path("./out"),
        help="Directory the Pages and Manifest are written to (default: ./out)",
    )
    crawl.add_argument(
        "--allow-backward", action="store_true",
        help="Widen Scope from the seed's path prefix to its whole host",
    )
    crawl.add_argument(
        "--allow-subdomains", action="store_true",
        help="Widen Scope to subdomains of the seed host",
    )
    crawl.add_argument(
        "--include-paths", action="append", default=[], metavar="GLOB",
        help="Only crawl paths matching at least one glob (repeatable)",
    )
    crawl.add_argument(
        "--exclude-paths", action="append", default=[], metavar="GLOB",
        help="Never crawl paths matching a glob (repeatable)",
    )
    crawl.add_argument(
        "--depth", type=int, default=0, metavar="N",
        help="Maximum link-hops from any seed (default: 0 = unlimited)",
    )
    crawl.add_argument(
        "--limit", type=int, default=1000, metavar="N",
        help="Maximum Pages per Crawl (default: 1000; 0 = unlimited)",
    )
    # Both flags write to `sitemap`; the default "both" is declared on the
    # first one and applies when neither flag is given.
    sitemap_group = crawl.add_mutually_exclusive_group()
    sitemap_group.add_argument(
        "--no-sitemap", dest="sitemap", action="store_const", const="off", default="both",
        help="Discover pages by link traversal only",
    )
    sitemap_group.add_argument(
        "--sitemap-only", dest="sitemap", action="store_const", const="only",
        help="Crawl exactly the in-Scope sitemap URLs; follow no links",
    )
    crawl.add_argument(
        "--format", choices=["files", "jsonl"], default="files",
        help="files: .md tree + crawl.json; jsonl: one record per Page on stdout",
    )
    crawl.add_argument(
        "--selector", metavar="CSS",
        help="CSS selector for the content container (overrides auto-detection)",
    )
    crawl.add_argument(
        "--render", choices=["auto", "always", "never"], default="auto",
        help="JavaScript rendering: auto re-fetches detected SPA shells via a "
        "headless browser, always renders everything, never disables it",
    )
    crawl.add_argument(
        "--ignore-robots", action="store_true",
        help="Consciously override robots.txt rules — only for sites you own "
        "or have permission to crawl",
    )
    crawl.add_argument(
        "--delay", type=float, default=1.0, metavar="SECONDS",
        help="Adaptive-throttle start delay between requests (default: 1.0; 0 = no throttle)",
    )
    crawl.add_argument(
        "--concurrency", type=int, default=4, metavar="N",
        help="Concurrent requests per domain (default: 4)",
    )
    crawl.add_argument(
        "--download-media", action="store_true",
        help="Download referenced images/documents (Assets) into _media/ and "
        "rewrite links to the local copies",
    )
    crawl.add_argument(
        "--media-max-size", type=float, default=50.0, metavar="MB",
        help="Per-Asset size cap for --download-media (default: 50); larger files stay linked",
    )
    crawl.add_argument(
        "--keep-html", action="store_true",
        help="Also keep each Page's raw HTML (sidecar file / jsonl field)",
    )
    crawl.add_argument(
        "--no-clone-source", dest="clone_source", action="store_false", default=True,
        help="Always crawl, even when the docs site links a public source repo "
        "(by default getdocs clones that repo instead of crawling)",
    )
    crawl.add_argument(
        "--contact", metavar="EMAIL_OR_URL",
        help="Contact appended to the User-Agent so site operators can reach you "
        "(recommended for high-volume crawls; optional)",
    )
    crawl.add_argument(
        "--user-agent", metavar="STRING",
        help="Override the User-Agent string getdocs sends entirely",
    )

    serve = subparsers.add_parser("serve", help="Run the getdocs API service")
    serve.add_argument("--host", default="127.0.0.1", help="Bind address (default: 127.0.0.1)")
    serve.add_argument("--port", type=int, default=8000, help="Port (default: 8000)")

    args = parser.parse_args(argv)
    if args.command == "serve":
        return ServeConfig(host=args.host, port=args.port)

    seeds = list(args.seeds)
    if args.seeds_file is not None:
        # Read first and handle the failure, instead of checking exists() and
        # then reading: a directory, an unreadable file or undecodable bytes
        # become a usage error rather than a traceback. The encoding is pinned
        # so the result does not depend on the platform's locale.
        try:
            seeds_text = args.seeds_file.read_text(encoding="utf-8")
        except FileNotFoundError:
            crawl.error(f"seeds file not found: {args.seeds_file}")
        except (OSError, UnicodeDecodeError) as exc:
            crawl.error(f"cannot read seeds file {args.seeds_file}: {exc}")
        stripped = (line.strip() for line in seeds_text.splitlines())
        seeds += [entry for entry in stripped if entry and not entry.startswith("#")]
    if not seeds and not args.resume:
        crawl.error("at least one seed URL is required (or --seeds-file / --resume)")
    return CrawlConfig(
        seeds=seeds,
        resume=args.resume,
        output_dir=args.output_dir,
        allow_backward=args.allow_backward,
        allow_subdomains=args.allow_subdomains,
        include_paths=args.include_paths,
        exclude_paths=args.exclude_paths,
        depth=args.depth,
        limit=args.limit,
        format=args.format,
        keep_html=args.keep_html,
        sitemap=args.sitemap,
        selector=args.selector,
        render=args.render,
        ignore_robots=args.ignore_robots,
        delay=args.delay,
        concurrency=args.concurrency,
        download_media=args.download_media,
        media_max_size=args.media_max_size,
        clone_source=args.clone_source,
        contact=args.contact,
        user_agent=args.user_agent,
    )
162
+
163
+
164
def main(argv: list[str] | None = None) -> int:
    """Run the CLI and return the process exit code.

    Args:
        argv: Argument list forwarded to ``parse_args``.

    Returns:
        0 on success (server stopped, source repo cloned, or at least one Page
        produced); 1 when the crawl produced no Pages; 2 for setup problems
        (missing server extra, ``--render always`` without Playwright support,
        or missing/unreadable crawl state on ``--resume``).
    """
    import json
    import sys
    from dataclasses import replace

    from getdocs.engine import playwright_available, run_crawl, state_file_for

    config = parse_args(argv)
    if isinstance(config, ServeConfig):
        try:
            import uvicorn

            from getdocs.api import create_app
        except ImportError:
            print(
                'error: getdocs serve needs the server extra (pip install "getdocs[server]")',
                file=sys.stderr,
            )
            return 2
        uvicorn.run(create_app(), host=config.host, port=config.port)
        return 0
    if config.render == "always" and not playwright_available():
        print(
            "error: --render always needs scrapy-playwright "
            "(pip install scrapy-playwright && playwright install chromium)",
            file=sys.stderr,
        )
        return 2
    # Source-first (ADR-0006): if the docs site is open-source, clone its repo
    # instead of crawling. Files-mode only — jsonl is a page stream with no
    # place for a clone; --resume continues an existing crawl.
    if config.format == "files" and config.clone_source and not config.resume and config.seeds:
        from getdocs.source import clone_source_for

        if clone_source_for(config) is not None:
            return 0
    state_file = state_file_for(config)
    if config.resume:
        if not state_file.exists():
            print(f"error: no crawl state found in {config.output_dir}", file=sys.stderr)
            return 2
        # A truncated or hand-edited state file must map to an exit code, not
        # a traceback: ValueError covers invalid JSON, KeyError a missing
        # "seeds" entry, TypeError a non-object top level.
        try:
            saved_seeds = json.loads(state_file.read_text())["seeds"]
        except (OSError, ValueError, KeyError, TypeError) as exc:
            print(
                f"error: crawl state in {config.output_dir} is unreadable ({exc!r})",
                file=sys.stderr,
            )
            return 2
        # Saved seeds replace whatever was given on the command line.
        config = replace(config, seeds=saved_seeds)
    elif state_file.exists():
        print(
            f"note: found crawl state from an earlier run in {config.output_dir} — "
            "starting over (use --resume to continue it)",
            file=sys.stderr,
        )
        state_file.unlink()

    page_count = run_crawl(config)
    if page_count == 0:
        # stderr: stdout belongs to the jsonl stream (ADR-0002)
        print("error: no Pages produced — seed(s) unreachable?", file=sys.stderr, flush=True)
        return 1
    return 0
getdocs/config.py ADDED
@@ -0,0 +1,36 @@
1
+ """CrawlConfig: the value object describing a Crawl — the engine boundary (ADR-0002)."""
2
+
3
+ from dataclasses import dataclass, field
4
+ from pathlib import Path
5
+
6
+
7
+ @dataclass(frozen=True)
8
+ class CrawlConfig:
9
+ seeds: list[str] = field(default_factory=list)
10
+ output_dir: Path = Path("./out")
11
+ allow_backward: bool = False
12
+ allow_subdomains: bool = False
13
+ include_paths: list[str] = field(default_factory=list)
14
+ exclude_paths: list[str] = field(default_factory=list)
15
+ depth: int = 0 # link-hops from any seed; 0 = unlimited
16
+ limit: int = 1000 # max Pages per Crawl; 0 = unlimited
17
+ format: str = "files" # "files" or "jsonl"
18
+ keep_html: bool = False
19
+ sitemap: str = "both" # "both", "off" (--no-sitemap), or "only" (--sitemap-only)
20
+ selector: str | None = None # CSS selector naming the content container
21
+ render: str = "auto" # "auto" (escalate on Shells), "always", or "never"
22
+ ignore_robots: bool = False
23
+ resume: bool = False # continue an interrupted Crawl from saved state
24
+ delay: float = 1.0 # throttle start delay in seconds; 0 disables throttling
25
+ concurrency: int = 4 # concurrent requests per domain
26
+ download_media: bool = False # fetch referenced Assets into _media/
27
+ media_max_size: float = 50.0 # per-Asset cap in MB; larger files stay hotlinked
28
+ clone_source: bool = True # clone the docs' source repo if the site is open-source
29
+ contact: str | None = None # email/URL appended to the User-Agent (crawling etiquette)
30
+ user_agent: str | None = None # override the User-Agent string entirely
31
+
32
+
33
@dataclass(frozen=True)
class ServeConfig:
    """Immutable bind settings for the API service."""

    # Address the server binds to.
    host: str = "127.0.0.1"
    # TCP port the server listens on.
    port: int = 8000