getdocs 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. getdocs-0.1.0/LICENSE +21 -0
  2. getdocs-0.1.0/PKG-INFO +169 -0
  3. getdocs-0.1.0/README.md +129 -0
  4. getdocs-0.1.0/pyproject.toml +52 -0
  5. getdocs-0.1.0/setup.cfg +4 -0
  6. getdocs-0.1.0/src/getdocs/__init__.py +0 -0
  7. getdocs-0.1.0/src/getdocs/__main__.py +3 -0
  8. getdocs-0.1.0/src/getdocs/api.py +95 -0
  9. getdocs-0.1.0/src/getdocs/cli.py +220 -0
  10. getdocs-0.1.0/src/getdocs/config.py +36 -0
  11. getdocs-0.1.0/src/getdocs/engine.py +418 -0
  12. getdocs-0.1.0/src/getdocs/extract.py +190 -0
  13. getdocs-0.1.0/src/getdocs/identity.py +32 -0
  14. getdocs-0.1.0/src/getdocs/jobs.py +204 -0
  15. getdocs-0.1.0/src/getdocs/navharvest.py +242 -0
  16. getdocs-0.1.0/src/getdocs/output.py +191 -0
  17. getdocs-0.1.0/src/getdocs/scope.py +84 -0
  18. getdocs-0.1.0/src/getdocs/sitemap.py +35 -0
  19. getdocs-0.1.0/src/getdocs/source.py +238 -0
  20. getdocs-0.1.0/src/getdocs/urlnorm.py +34 -0
  21. getdocs-0.1.0/src/getdocs.egg-info/PKG-INFO +169 -0
  22. getdocs-0.1.0/src/getdocs.egg-info/SOURCES.txt +52 -0
  23. getdocs-0.1.0/src/getdocs.egg-info/dependency_links.txt +1 -0
  24. getdocs-0.1.0/src/getdocs.egg-info/entry_points.txt +2 -0
  25. getdocs-0.1.0/src/getdocs.egg-info/requires.txt +17 -0
  26. getdocs-0.1.0/src/getdocs.egg-info/top_level.txt +1 -0
  27. getdocs-0.1.0/tests/test_api.py +103 -0
  28. getdocs-0.1.0/tests/test_cli.py +123 -0
  29. getdocs-0.1.0/tests/test_crawl_e2e.py +38 -0
  30. getdocs-0.1.0/tests/test_extract.py +28 -0
  31. getdocs-0.1.0/tests/test_extract_pipeline.py +147 -0
  32. getdocs-0.1.0/tests/test_identity.py +20 -0
  33. getdocs-0.1.0/tests/test_identity_e2e.py +47 -0
  34. getdocs-0.1.0/tests/test_jobs.py +105 -0
  35. getdocs-0.1.0/tests/test_jsonl_e2e.py +43 -0
  36. getdocs-0.1.0/tests/test_jsonl_output.py +62 -0
  37. getdocs-0.1.0/tests/test_limits_errors_e2e.py +99 -0
  38. getdocs-0.1.0/tests/test_media_e2e.py +113 -0
  39. getdocs-0.1.0/tests/test_navharvest.py +142 -0
  40. getdocs-0.1.0/tests/test_navorder_e2e.py +75 -0
  41. getdocs-0.1.0/tests/test_output.py +114 -0
  42. getdocs-0.1.0/tests/test_politeness_e2e.py +68 -0
  43. getdocs-0.1.0/tests/test_relink_e2e.py +36 -0
  44. getdocs-0.1.0/tests/test_render_e2e.py +80 -0
  45. getdocs-0.1.0/tests/test_resume_e2e.py +77 -0
  46. getdocs-0.1.0/tests/test_scope.py +40 -0
  47. getdocs-0.1.0/tests/test_shell_detection.py +27 -0
  48. getdocs-0.1.0/tests/test_sitemap.py +47 -0
  49. getdocs-0.1.0/tests/test_sitemap_e2e.py +70 -0
  50. getdocs-0.1.0/tests/test_source.py +203 -0
  51. getdocs-0.1.0/tests/test_traversal_e2e.py +68 -0
  52. getdocs-0.1.0/tests/test_urlnorm.py +23 -0
  53. getdocs-0.1.0/tests/test_webhook_api.py +60 -0
  54. getdocs-0.1.0/tests/test_ws_api.py +69 -0
getdocs-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 jonbakerfish
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
getdocs-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,169 @@
1
+ Metadata-Version: 2.4
2
+ Name: getdocs
3
+ Version: 0.1.0
4
+ Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
5
+ Author-email: jonbakerfish <jonbakerfish@gmail.com>
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/jonbakerfish/getdocs
8
+ Project-URL: Repository, https://github.com/jonbakerfish/getdocs
9
+ Project-URL: Issues, https://github.com/jonbakerfish/getdocs/issues
10
+ Project-URL: Documentation, https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md
11
+ Keywords: documentation,crawler,scraper,markdown,docs,llm,agents,rag
12
+ Classifier: Development Status :: 4 - Beta
13
+ Classifier: Environment :: Console
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
19
+ Classifier: Topic :: Software Development :: Documentation
20
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
21
+ Requires-Python: >=3.12
22
+ Description-Content-Type: text/markdown
23
+ License-File: LICENSE
24
+ Requires-Dist: scrapy>=2.11
25
+ Requires-Dist: markdownify>=0.13
26
+ Requires-Dist: beautifulsoup4>=4.12
27
+ Requires-Dist: pyyaml>=6.0
28
+ Requires-Dist: trafilatura>=1.12
29
+ Requires-Dist: scrapy-playwright>=0.0.40
30
+ Provides-Extra: server
31
+ Requires-Dist: fastapi>=0.110; extra == "server"
32
+ Requires-Dist: uvicorn>=0.27; extra == "server"
33
+ Requires-Dist: httpx>=0.27; extra == "server"
34
+ Provides-Extra: dev
35
+ Requires-Dist: pytest>=8.0; extra == "dev"
36
+ Requires-Dist: httpx>=0.27; extra == "dev"
37
+ Requires-Dist: fastapi>=0.110; extra == "dev"
38
+ Requires-Dist: uvicorn>=0.27; extra == "dev"
39
+ Dynamic: license-file
40
+
41
+ # getdocs
42
+
43
+ **Turn any documentation site into a clean, local markdown copy your coding agent can actually read.**
44
+
45
+ ```bash
46
+ getdocs crawl https://example.com/docs -o ./out
47
+ ```
48
+
49
+ Coding agents are only as good as the docs they can see. Pointing an agent at a
50
+ live docs URL means it burns tokens on nav bars, cookie banners, and HTML
51
+ chrome — or can't reach the page at all. `getdocs` gives the agent a local,
52
+ offline, markdown mirror instead: the actual content, structured to match the
53
+ original site, ready to drop into a repo or feed to a model.
54
+
55
+ ## Why getdocs
56
+
57
+ - **Richer context for coding agents.** A local copy is greppable, indexable,
58
+ and always available — the agent reads the whole library at once instead of
59
+ fetching one rendered page at a time. No rate limits, no network flakiness,
60
+ no JS that won't hydrate.
61
+ - **Clean markdown → fewer tokens.** Each page is reduced to its content (the
62
+ nav, headers, footers, and ad chrome stripped) and written as plain markdown.
63
+ Agents consume it directly, and you spend tokens on docs, not `<div>` soup.
64
+ - **Structure preserved.** Files mirror the URL hierarchy
65
+ (`example.com/docs/auth` → `out/docs/auth.md`), each with YAML frontmatter
66
+ (`url`, `title`, `crawled_at`, `status`), plus a `crawl.json` Manifest that
67
+ captures the site's nav order and reading order — so an agent can follow the
68
+ docs in the order the authors intended.
69
+ - **Source-first: clone over crawl.** If the docs site is open-source, getdocs
70
+ detects the "Edit this page" link, clones the repo, and serves you the
71
+ original markdown source instead of scraping HTML — the highest-fidelity copy
72
+ there is. Falls back to crawling automatically when there's no repo.
73
+
74
+ ## When to reach for it
75
+
76
+ - **Coding against an unfamiliar library or API.** Mirror its docs into your
77
+ repo (or a scratch dir) so your agent can ground its answers in the real
78
+ reference instead of hallucinating from memory.
79
+ - **RAG / knowledge bases.** Get a clean markdown corpus to chunk and embed,
80
+ without writing a bespoke scraper-and-cleaner for every site.
81
+ - **Offline or air-gapped work.** Take a docs set with you; read and search it
82
+ with no network.
83
+ - **Pinning a version.** Snapshot today's docs so your agent isn't tripped up
84
+ when the upstream site changes underneath you.
85
+ - **Migrating or archiving docs.** Pull an entire site down as markdown to move,
86
+ diff, or keep.
87
+
88
+ ## Output
89
+
90
+ ```
91
+ out/
92
+ ├── crawl.json ← the Manifest: nav order, reading order, what ran
93
+ └── docs/
94
+ ├── index.md
95
+ ├── auth.md
96
+ └── guide/
97
+ └── intro.md
98
+ ```
99
+
100
+ ```markdown
101
+ ---
102
+ url: https://example.com/docs/auth
103
+ title: Authentication
104
+ status: 200
105
+ crawled_at: 2026-06-12T10:00:00Z
106
+ ---
107
+
108
+ # Authentication
109
+ ...
110
+ ```
111
+
112
+ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
113
+ throttling, JSONL output, and resumable crawls are all built in — see
114
+ [docs/USAGE.md](https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md).
115
+
116
+ ## Install
117
+
118
+ Requires **Python 3.12+**.
119
+
120
+ ```bash
121
+ pip install getdocs
122
+ ```
123
+
124
+ Or from source, for the latest unreleased changes:
125
+
126
+ ```bash
127
+ git clone https://github.com/jonbakerfish/getdocs
128
+ cd getdocs
129
+ pip install -e .
130
+ ```
131
+
132
+ That's enough to crawl. Two optional pieces unlock more:
133
+
134
+ ```bash
135
+ # JavaScript rendering — the headless browser used to hydrate SPA docs
136
+ playwright install chromium
137
+
138
+ # Serve a crawled/cloned copy locally as a browsable site
139
+ pip install mkdocs mkdocs-material
140
+ ```
141
+
142
+ **`git`** must be on your `PATH` for source-first cloning (it almost always
143
+ already is); without it, getdocs simply falls back to crawling. To run the
144
+ optional API service, install the server extra: `pip install "getdocs[server]"`.
145
+
146
+ ## Development
147
+
148
+ ```bash
149
+ git clone https://github.com/jonbakerfish/getdocs
150
+ cd getdocs
151
+ pip install -e ".[dev]"
152
+ pytest
153
+ ```
154
+
155
+ ## Responsible use
156
+
157
+ getdocs is a tool; how you point it is on you. By default it **honors
158
+ `robots.txt`**, throttles itself politely, and **identifies itself honestly**
159
+ in the `User-Agent` (`getdocs/<version> (+project-url)`) — please keep it that
160
+ way. For high-volume crawls, add `--contact you@example.com` so site operators
161
+ can reach you (it's appended to the User-Agent; optional but courteous).
162
+
163
+ getdocs is intended for personal, reference, and agent/RAG use on documentation
164
+ you have the right to access. **You are solely responsible for complying with
165
+ each site's Terms of Service, its `robots.txt`, applicable law, and the
166
+ copyright of the content you fetch** — getdocs is provided as-is, with no
167
+ warranty (see [LICENSE](LICENSE)). Crawled documentation belongs to its authors:
168
+ use it for your own reference, but don't redistribute someone else's docs as
169
+ your own. Crawl only what you have the right to.
@@ -0,0 +1,129 @@
1
+ # getdocs
2
+
3
+ **Turn any documentation site into a clean, local markdown copy your coding agent can actually read.**
4
+
5
+ ```bash
6
+ getdocs crawl https://example.com/docs -o ./out
7
+ ```
8
+
9
+ Coding agents are only as good as the docs they can see. Pointing an agent at a
10
+ live docs URL means it burns tokens on nav bars, cookie banners, and HTML
11
+ chrome — or can't reach the page at all. `getdocs` gives the agent a local,
12
+ offline, markdown mirror instead: the actual content, structured to match the
13
+ original site, ready to drop into a repo or feed to a model.
14
+
15
+ ## Why getdocs
16
+
17
+ - **Richer context for coding agents.** A local copy is greppable, indexable,
18
+ and always available — the agent reads the whole library at once instead of
19
+ fetching one rendered page at a time. No rate limits, no network flakiness,
20
+ no JS that won't hydrate.
21
+ - **Clean markdown → fewer tokens.** Each page is reduced to its content (the
22
+ nav, headers, footers, and ad chrome stripped) and written as plain markdown.
23
+ Agents consume it directly, and you spend tokens on docs, not `<div>` soup.
24
+ - **Structure preserved.** Files mirror the URL hierarchy
25
+ (`example.com/docs/auth` → `out/docs/auth.md`), each with YAML frontmatter
26
+ (`url`, `title`, `crawled_at`, `status`), plus a `crawl.json` Manifest that
27
+ captures the site's nav order and reading order — so an agent can follow the
28
+ docs in the order the authors intended.
29
+ - **Source-first: clone over crawl.** If the docs site is open-source, getdocs
30
+ detects the "Edit this page" link, clones the repo, and serves you the
31
+ original markdown source instead of scraping HTML — the highest-fidelity copy
32
+ there is. Falls back to crawling automatically when there's no repo.
33
+
34
+ ## When to reach for it
35
+
36
+ - **Coding against an unfamiliar library or API.** Mirror its docs into your
37
+ repo (or a scratch dir) so your agent can ground its answers in the real
38
+ reference instead of hallucinating from memory.
39
+ - **RAG / knowledge bases.** Get a clean markdown corpus to chunk and embed,
40
+ without writing a bespoke scraper-and-cleaner for every site.
41
+ - **Offline or air-gapped work.** Take a docs set with you; read and search it
42
+ with no network.
43
+ - **Pinning a version.** Snapshot today's docs so your agent isn't tripped up
44
+ when the upstream site changes underneath you.
45
+ - **Migrating or archiving docs.** Pull an entire site down as markdown to move,
46
+ diff, or keep.
47
+
48
+ ## Output
49
+
50
+ ```
51
+ out/
52
+ ├── crawl.json ← the Manifest: nav order, reading order, what ran
53
+ └── docs/
54
+ ├── index.md
55
+ ├── auth.md
56
+ └── guide/
57
+ └── intro.md
58
+ ```
59
+
60
+ ```markdown
61
+ ---
62
+ url: https://example.com/docs/auth
63
+ title: Authentication
64
+ status: 200
65
+ crawled_at: 2026-06-12T10:00:00Z
66
+ ---
67
+
68
+ # Authentication
69
+ ...
70
+ ```
71
+
72
+ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
73
+ throttling, JSONL output, and resumable crawls are all built in — see
74
+ [docs/USAGE.md](https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md).
75
+
76
+ ## Install
77
+
78
+ Requires **Python 3.12+**.
79
+
80
+ ```bash
81
+ pip install getdocs
82
+ ```
83
+
84
+ Or from source, for the latest unreleased changes:
85
+
86
+ ```bash
87
+ git clone https://github.com/jonbakerfish/getdocs
88
+ cd getdocs
89
+ pip install -e .
90
+ ```
91
+
92
+ That's enough to crawl. Two optional pieces unlock more:
93
+
94
+ ```bash
95
+ # JavaScript rendering — the headless browser used to hydrate SPA docs
96
+ playwright install chromium
97
+
98
+ # Serve a crawled/cloned copy locally as a browsable site
99
+ pip install mkdocs mkdocs-material
100
+ ```
101
+
102
+ **`git`** must be on your `PATH` for source-first cloning (it almost always
103
+ already is); without it, getdocs simply falls back to crawling. To run the
104
+ optional API service, install the server extra: `pip install "getdocs[server]"`.
105
+
106
+ ## Development
107
+
108
+ ```bash
109
+ git clone https://github.com/jonbakerfish/getdocs
110
+ cd getdocs
111
+ pip install -e ".[dev]"
112
+ pytest
113
+ ```
114
+
115
+ ## Responsible use
116
+
117
+ getdocs is a tool; how you point it is on you. By default it **honors
118
+ `robots.txt`**, throttles itself politely, and **identifies itself honestly**
119
+ in the `User-Agent` (`getdocs/<version> (+project-url)`) — please keep it that
120
+ way. For high-volume crawls, add `--contact you@example.com` so site operators
121
+ can reach you (it's appended to the User-Agent; optional but courteous).
122
+
123
+ getdocs is intended for personal, reference, and agent/RAG use on documentation
124
+ you have the right to access. **You are solely responsible for complying with
125
+ each site's Terms of Service, its `robots.txt`, applicable law, and the
126
+ copyright of the content you fetch** — getdocs is provided as-is, with no
127
+ warranty (see [LICENSE](LICENSE)). Crawled documentation belongs to its authors:
128
+ use it for your own reference, but don't redistribute someone else's docs as
129
+ your own. Crawl only what you have the right to.
@@ -0,0 +1,52 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "getdocs"
7
+ version = "0.1.0"
8
+ description = "Documentation crawler: recursively crawl a docs site and emit clean markdown"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = "MIT"
12
+ license-files = ["LICENSE"]
13
+ authors = [{ name = "jonbakerfish", email = "jonbakerfish@gmail.com" }]
14
+ keywords = ["documentation", "crawler", "scraper", "markdown", "docs", "llm", "agents", "rag"]
15
+ classifiers = [
16
+ "Development Status :: 4 - Beta",
17
+ "Environment :: Console",
18
+ "Intended Audience :: Developers",
19
+ "Operating System :: OS Independent",
20
+ "Programming Language :: Python :: 3",
21
+ "Programming Language :: Python :: 3.12",
22
+ "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
23
+ "Topic :: Software Development :: Documentation",
24
+ "Topic :: Text Processing :: Markup :: Markdown",
25
+ ]
26
+ dependencies = [
27
+ "scrapy>=2.11",
28
+ "markdownify>=0.13",
29
+ "beautifulsoup4>=4.12",
30
+ "pyyaml>=6.0",
31
+ "trafilatura>=1.12",
32
+ "scrapy-playwright>=0.0.40",
33
+ ]
34
+
35
+ [project.urls]
36
+ Homepage = "https://github.com/jonbakerfish/getdocs"
37
+ Repository = "https://github.com/jonbakerfish/getdocs"
38
+ Issues = "https://github.com/jonbakerfish/getdocs/issues"
39
+ Documentation = "https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md"
40
+
41
+ [project.optional-dependencies]
42
+ server = ["fastapi>=0.110", "uvicorn>=0.27", "httpx>=0.27"]
43
+ dev = ["pytest>=8.0", "httpx>=0.27", "fastapi>=0.110", "uvicorn>=0.27"]
44
+
45
+ [project.scripts]
46
+ getdocs = "getdocs.cli:main"
47
+
48
+ [tool.setuptools.packages.find]
49
+ where = ["src"]
50
+
51
+ [tool.pytest.ini_options]
52
+ testpaths = ["tests"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
@@ -0,0 +1,3 @@
1
# Script entry for the package: delegates straight to the CLI's main().
from getdocs.cli import main

# Whatever main() returns is handed to SystemExit, which ends the process.
exit_status = main()
raise SystemExit(exit_status)
@@ -0,0 +1,95 @@
1
+ """API service: Firecrawl-style async Crawl jobs over the engine (ADR-0002)."""
2
+
3
+ from fastapi import FastAPI, HTTPException, WebSocket
4
+ from pydantic import BaseModel, model_validator
5
+
6
+ from getdocs.jobs import CrawlJob, JobManager
7
+
8
+
9
class CrawlRequest(BaseModel):
    # Request model for starting a Crawl job. Every field has a default, so
    # none is required on its own; the validator at the bottom insists that
    # at least one of `url` / `urls` carries a value.
    # (Documented with comments on purpose: a class docstring would be
    # published as the model's description in the generated JSON schema.)

    # Seed(s): a single URL and/or a list of URLs.
    url: str | None = None
    urls: list[str] | None = None

    # Crawl bounds and scope switches.
    limit: int | None = None
    depth: int | None = None
    allow_backward: bool = False
    allow_subdomains: bool = False
    include_paths: list[str] | None = None
    exclude_paths: list[str] | None = None

    # Expected values: "both" | "off" | "only"
    sitemap: str | None = None
    # Expected values: "auto" | "always" | "never"
    render: str | None = None
    selector: str | None = None
    ignore_robots: bool = False
    keep_html: bool = False

    # Pacing.
    delay: float | None = None
    concurrency: int | None = None

    # URL that gets POSTed the started/page/completed events.
    webhook: str | None = None

    @model_validator(mode="after")
    def _require_some_url(self):
        """Reject a request in which both ``url`` and ``urls`` are empty/unset.

        Returns:
            The validated model itself, unchanged.

        Raises:
            ValueError: When neither ``url`` nor ``urls`` is truthy.
        """
        if self.url or self.urls:
            return self
        raise ValueError("either url or urls is required")
32
+
33
+
34
def _serialize(job: CrawlJob) -> dict:
    """Return the full dict view of a Crawl job.

    The dict mirrors the job's ``id``, ``status``, ``seeds``, ``pages``,
    ``manifest``, ``error`` and ``webhook_failures`` attributes, plus a
    derived ``page_count`` equal to ``len(job.pages)``.

    Args:
        job: The Crawl job to describe.

    Returns:
        A new dict; the values are the job's own attribute objects, not copies.
    """
    payload = {"id": job.id, "status": job.status, "seeds": job.seeds}
    payload["page_count"] = len(job.pages)
    payload["pages"] = job.pages
    # The remaining keys are named exactly like the attributes they mirror.
    for field in ("manifest", "error", "webhook_failures"):
        payload[field] = getattr(job, field)
    return payload
45
+
46
+
47
def create_app(manager: JobManager | None = None) -> FastAPI:
    """Assemble the FastAPI application that exposes the Crawl job endpoints.

    Args:
        manager: Job manager the routes operate on. A falsy value (including
            the default ``None``) makes the app create its own ``JobManager``.

    Returns:
        The configured application. The manager in use is also stored on
        ``app.state.manager``.
    """
    if not manager:
        manager = JobManager()

    app = FastAPI(title="getdocs", version="0.1.0")
    app.state.manager = manager

    # NOTE: the route handlers below are described with comments rather than
    # docstrings, because FastAPI publishes a handler's docstring as the
    # endpoint description in the generated OpenAPI schema.

    def _summarize(job):
        # Compact per-job entry used by the listing endpoint.
        return {
            "id": job.id,
            "status": job.status,
            "seeds": job.seeds,
            "page_count": len(job.pages),
        }

    # Start a job from the request's non-None fields; replies 202 with the
    # new job's id and status.
    @app.post("/v1/crawl", status_code=202)
    async def start_crawl(request: CrawlRequest):
        options = request.model_dump(exclude_none=True)
        job = manager.start(options)
        return {"id": job.id, "status": job.status}

    # List a short summary of every job the manager holds.
    @app.get("/v1/crawl")
    async def list_crawls():
        return {"jobs": [_summarize(job) for job in manager.jobs.values()]}

    # Full view of one job, or 404 when the id is unknown.
    @app.get("/v1/crawl/{job_id}")
    async def get_crawl(job_id: str):
        job = manager.get(job_id)
        if job is not None:
            return _serialize(job)
        raise HTTPException(status_code=404, detail="no such Crawl job")

    # Forward each event from manager.stream() to the client as JSON, then
    # close. The socket is accepted before the lookup so that an unknown id
    # can be reported with close code 4404.
    @app.websocket("/v1/crawl/{job_id}/ws")
    async def stream_crawl(websocket: WebSocket, job_id: str):
        await websocket.accept()
        if manager.get(job_id) is not None:
            async for event in manager.stream(job_id):
                await websocket.send_json(event)
            await websocket.close()
        else:
            await websocket.close(code=4404, reason="no such Crawl job")

    # Cancel a job and report its id and status, or 404 when the id is unknown.
    @app.delete("/v1/crawl/{job_id}")
    async def cancel_crawl(job_id: str):
        job = manager.cancel(job_id)
        if job is not None:
            return {"id": job.id, "status": job.status}
        raise HTTPException(status_code=404, detail="no such Crawl job")

    return app