getdocs 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- getdocs-0.1.0/LICENSE +21 -0
- getdocs-0.1.0/PKG-INFO +169 -0
- getdocs-0.1.0/README.md +129 -0
- getdocs-0.1.0/pyproject.toml +52 -0
- getdocs-0.1.0/setup.cfg +4 -0
- getdocs-0.1.0/src/getdocs/__init__.py +0 -0
- getdocs-0.1.0/src/getdocs/__main__.py +3 -0
- getdocs-0.1.0/src/getdocs/api.py +95 -0
- getdocs-0.1.0/src/getdocs/cli.py +220 -0
- getdocs-0.1.0/src/getdocs/config.py +36 -0
- getdocs-0.1.0/src/getdocs/engine.py +418 -0
- getdocs-0.1.0/src/getdocs/extract.py +190 -0
- getdocs-0.1.0/src/getdocs/identity.py +32 -0
- getdocs-0.1.0/src/getdocs/jobs.py +204 -0
- getdocs-0.1.0/src/getdocs/navharvest.py +242 -0
- getdocs-0.1.0/src/getdocs/output.py +191 -0
- getdocs-0.1.0/src/getdocs/scope.py +84 -0
- getdocs-0.1.0/src/getdocs/sitemap.py +35 -0
- getdocs-0.1.0/src/getdocs/source.py +238 -0
- getdocs-0.1.0/src/getdocs/urlnorm.py +34 -0
- getdocs-0.1.0/src/getdocs.egg-info/PKG-INFO +169 -0
- getdocs-0.1.0/src/getdocs.egg-info/SOURCES.txt +52 -0
- getdocs-0.1.0/src/getdocs.egg-info/dependency_links.txt +1 -0
- getdocs-0.1.0/src/getdocs.egg-info/entry_points.txt +2 -0
- getdocs-0.1.0/src/getdocs.egg-info/requires.txt +17 -0
- getdocs-0.1.0/src/getdocs.egg-info/top_level.txt +1 -0
- getdocs-0.1.0/tests/test_api.py +103 -0
- getdocs-0.1.0/tests/test_cli.py +123 -0
- getdocs-0.1.0/tests/test_crawl_e2e.py +38 -0
- getdocs-0.1.0/tests/test_extract.py +28 -0
- getdocs-0.1.0/tests/test_extract_pipeline.py +147 -0
- getdocs-0.1.0/tests/test_identity.py +20 -0
- getdocs-0.1.0/tests/test_identity_e2e.py +47 -0
- getdocs-0.1.0/tests/test_jobs.py +105 -0
- getdocs-0.1.0/tests/test_jsonl_e2e.py +43 -0
- getdocs-0.1.0/tests/test_jsonl_output.py +62 -0
- getdocs-0.1.0/tests/test_limits_errors_e2e.py +99 -0
- getdocs-0.1.0/tests/test_media_e2e.py +113 -0
- getdocs-0.1.0/tests/test_navharvest.py +142 -0
- getdocs-0.1.0/tests/test_navorder_e2e.py +75 -0
- getdocs-0.1.0/tests/test_output.py +114 -0
- getdocs-0.1.0/tests/test_politeness_e2e.py +68 -0
- getdocs-0.1.0/tests/test_relink_e2e.py +36 -0
- getdocs-0.1.0/tests/test_render_e2e.py +80 -0
- getdocs-0.1.0/tests/test_resume_e2e.py +77 -0
- getdocs-0.1.0/tests/test_scope.py +40 -0
- getdocs-0.1.0/tests/test_shell_detection.py +27 -0
- getdocs-0.1.0/tests/test_sitemap.py +47 -0
- getdocs-0.1.0/tests/test_sitemap_e2e.py +70 -0
- getdocs-0.1.0/tests/test_source.py +203 -0
- getdocs-0.1.0/tests/test_traversal_e2e.py +68 -0
- getdocs-0.1.0/tests/test_urlnorm.py +23 -0
- getdocs-0.1.0/tests/test_webhook_api.py +60 -0
- getdocs-0.1.0/tests/test_ws_api.py +69 -0
getdocs-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 jonbakerfish
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
getdocs-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: getdocs
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
|
|
5
|
+
Author-email: jonbakerfish <jonbakerfish@gmail.com>
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/jonbakerfish/getdocs
|
|
8
|
+
Project-URL: Repository, https://github.com/jonbakerfish/getdocs
|
|
9
|
+
Project-URL: Issues, https://github.com/jonbakerfish/getdocs/issues
|
|
10
|
+
Project-URL: Documentation, https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md
|
|
11
|
+
Keywords: documentation,crawler,scraper,markdown,docs,llm,agents,rag
|
|
12
|
+
Classifier: Development Status :: 4 - Beta
|
|
13
|
+
Classifier: Environment :: Console
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
|
|
19
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
20
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
21
|
+
Requires-Python: >=3.12
|
|
22
|
+
Description-Content-Type: text/markdown
|
|
23
|
+
License-File: LICENSE
|
|
24
|
+
Requires-Dist: scrapy>=2.11
|
|
25
|
+
Requires-Dist: markdownify>=0.13
|
|
26
|
+
Requires-Dist: beautifulsoup4>=4.12
|
|
27
|
+
Requires-Dist: pyyaml>=6.0
|
|
28
|
+
Requires-Dist: trafilatura>=1.12
|
|
29
|
+
Requires-Dist: scrapy-playwright>=0.0.40
|
|
30
|
+
Provides-Extra: server
|
|
31
|
+
Requires-Dist: fastapi>=0.110; extra == "server"
|
|
32
|
+
Requires-Dist: uvicorn>=0.27; extra == "server"
|
|
33
|
+
Requires-Dist: httpx>=0.27; extra == "server"
|
|
34
|
+
Provides-Extra: dev
|
|
35
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
36
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
37
|
+
Requires-Dist: fastapi>=0.110; extra == "dev"
|
|
38
|
+
Requires-Dist: uvicorn>=0.27; extra == "dev"
|
|
39
|
+
Dynamic: license-file
|
|
40
|
+
|
|
41
|
+
# getdocs
|
|
42
|
+
|
|
43
|
+
**Turn any documentation site into a clean, local markdown copy your coding agent can actually read.**
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
getdocs crawl https://example.com/docs -o ./out
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Coding agents are only as good as the docs they can see. Pointing an agent at a
|
|
50
|
+
live docs URL means it burns tokens on nav bars, cookie banners, and HTML
|
|
51
|
+
chrome — or can't reach the page at all. `getdocs` gives the agent a local,
|
|
52
|
+
offline, markdown mirror instead: the actual content, structured to match the
|
|
53
|
+
original site, ready to drop into a repo or feed to a model.
|
|
54
|
+
|
|
55
|
+
## Why getdocs
|
|
56
|
+
|
|
57
|
+
- **Richer context for coding agents.** A local copy is greppable, indexable,
|
|
58
|
+
and always available — the agent reads the whole library at once instead of
|
|
59
|
+
fetching one rendered page at a time. No rate limits, no network flakiness,
|
|
60
|
+
no JS that won't hydrate.
|
|
61
|
+
- **Clean markdown → fewer tokens.** Each page is reduced to its content (the
|
|
62
|
+
nav, headers, footers, and ad chrome stripped) and written as plain markdown.
|
|
63
|
+
Agents consume it directly, and you spend tokens on docs, not `<div>` soup.
|
|
64
|
+
- **Structure preserved.** Files mirror the URL hierarchy
|
|
65
|
+
(`example.com/docs/auth` → `out/docs/auth.md`), each with YAML frontmatter
|
|
66
|
+
(`url`, `title`, `crawled_at`, `status`), plus a `crawl.json` Manifest that
|
|
67
|
+
captures the site's nav order and reading order — so an agent can follow the
|
|
68
|
+
docs in the order the authors intended.
|
|
69
|
+
- **Source-first: clone over crawl.** If the docs site is open-source, getdocs
|
|
70
|
+
detects the "Edit this page" link, clones the repo, and serves you the
|
|
71
|
+
original markdown source instead of scraping HTML — the highest-fidelity copy
|
|
72
|
+
there is. Falls back to crawling automatically when there's no repo.
|
|
73
|
+
|
|
74
|
+
## When to reach for it
|
|
75
|
+
|
|
76
|
+
- **Coding against an unfamiliar library or API.** Mirror its docs into your
|
|
77
|
+
repo (or a scratch dir) so your agent can ground its answers in the real
|
|
78
|
+
reference instead of hallucinating from memory.
|
|
79
|
+
- **RAG / knowledge bases.** Get a clean markdown corpus to chunk and embed,
|
|
80
|
+
without writing a bespoke scraper-and-cleaner for every site.
|
|
81
|
+
- **Offline or air-gapped work.** Take a docs set with you; read and search it
|
|
82
|
+
with no network.
|
|
83
|
+
- **Pinning a version.** Snapshot today's docs so your agent isn't tripped up
|
|
84
|
+
when the upstream site changes underneath you.
|
|
85
|
+
- **Migrating or archiving docs.** Pull an entire site down as markdown to move,
|
|
86
|
+
diff, or keep.
|
|
87
|
+
|
|
88
|
+
## Output
|
|
89
|
+
|
|
90
|
+
```
|
|
91
|
+
out/
|
|
92
|
+
├── crawl.json ← the Manifest: nav order, reading order, what ran
|
|
93
|
+
└── docs/
|
|
94
|
+
├── index.md
|
|
95
|
+
├── auth.md
|
|
96
|
+
└── guide/
|
|
97
|
+
└── intro.md
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
```markdown
|
|
101
|
+
---
|
|
102
|
+
url: https://example.com/docs/auth
|
|
103
|
+
title: Authentication
|
|
104
|
+
status: 200
|
|
105
|
+
crawled_at: 2026-06-12T10:00:00Z
|
|
106
|
+
---
|
|
107
|
+
|
|
108
|
+
# Authentication
|
|
109
|
+
...
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
Sitemap discovery, JavaScript rendering, source-repo cloning, polite
|
|
113
|
+
throttling, JSONL output, and resumable crawls are all built in — see
|
|
114
|
+
[docs/USAGE.md](docs/USAGE.md).
|
|
115
|
+
|
|
116
|
+
## Install
|
|
117
|
+
|
|
118
|
+
Requires **Python 3.12+**.
|
|
119
|
+
|
|
120
|
+
```bash
|
|
121
|
+
pip install getdocs
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
Or from source, for the latest unreleased changes:
|
|
125
|
+
|
|
126
|
+
```bash
|
|
127
|
+
git clone https://github.com/jonbakerfish/getdocs
|
|
128
|
+
cd getdocs
|
|
129
|
+
pip install -e .
|
|
130
|
+
```
|
|
131
|
+
|
|
132
|
+
That's enough to crawl. Two optional pieces unlock more:
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
# JavaScript rendering — the headless browser used to hydrate SPA docs
|
|
136
|
+
playwright install chromium
|
|
137
|
+
|
|
138
|
+
# Serve a crawled/cloned copy locally as a browsable site
|
|
139
|
+
pip install mkdocs mkdocs-material
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**`git`** must be on your `PATH` for source-first cloning (it almost always
|
|
143
|
+
already is); without it, getdocs simply falls back to crawling. To run the
|
|
144
|
+
optional API service, install the server extra: `pip install "getdocs[server]"`.
|
|
145
|
+
|
|
146
|
+
## Development
|
|
147
|
+
|
|
148
|
+
```bash
|
|
149
|
+
git clone https://github.com/jonbakerfish/getdocs
|
|
150
|
+
cd getdocs
|
|
151
|
+
pip install -e ".[dev]"
|
|
152
|
+
pytest
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
## Responsible use
|
|
156
|
+
|
|
157
|
+
getdocs is a tool; how you point it is on you. By default it **honors
|
|
158
|
+
`robots.txt`**, throttles itself politely, and **identifies itself honestly**
|
|
159
|
+
in the `User-Agent` (`getdocs/<version> (+project-url)`) — please keep it that
|
|
160
|
+
way. For high-volume crawls, add `--contact you@example.com` so site operators
|
|
161
|
+
can reach you (it's appended to the User-Agent; optional but courteous).
|
|
162
|
+
|
|
163
|
+
getdocs is intended for personal, reference, and agent/RAG use on documentation
|
|
164
|
+
you have the right to access. **You are solely responsible for complying with
|
|
165
|
+
each site's Terms of Service, its `robots.txt`, applicable law, and the
|
|
166
|
+
copyright of the content you fetch** — getdocs is provided as-is, with no
|
|
167
|
+
warranty (see [LICENSE](LICENSE)). Crawled documentation belongs to its authors:
|
|
168
|
+
use it for your own reference, but don't redistribute someone else's docs as
|
|
169
|
+
your own. Crawl only what you have the right to.
|
getdocs-0.1.0/README.md
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
# getdocs
|
|
2
|
+
|
|
3
|
+
**Turn any documentation site into a clean, local markdown copy your coding agent can actually read.**
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
getdocs crawl https://example.com/docs -o ./out
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
Coding agents are only as good as the docs they can see. Pointing an agent at a
|
|
10
|
+
live docs URL means it burns tokens on nav bars, cookie banners, and HTML
|
|
11
|
+
chrome — or can't reach the page at all. `getdocs` gives the agent a local,
|
|
12
|
+
offline, markdown mirror instead: the actual content, structured to match the
|
|
13
|
+
original site, ready to drop into a repo or feed to a model.
|
|
14
|
+
|
|
15
|
+
## Why getdocs
|
|
16
|
+
|
|
17
|
+
- **Richer context for coding agents.** A local copy is greppable, indexable,
|
|
18
|
+
and always available — the agent reads the whole library at once instead of
|
|
19
|
+
fetching one rendered page at a time. No rate limits, no network flakiness,
|
|
20
|
+
no JS that won't hydrate.
|
|
21
|
+
- **Clean markdown → fewer tokens.** Each page is reduced to its content (the
|
|
22
|
+
nav, headers, footers, and ad chrome stripped) and written as plain markdown.
|
|
23
|
+
Agents consume it directly, and you spend tokens on docs, not `<div>` soup.
|
|
24
|
+
- **Structure preserved.** Files mirror the URL hierarchy
|
|
25
|
+
(`example.com/docs/auth` → `out/docs/auth.md`), each with YAML frontmatter
|
|
26
|
+
(`url`, `title`, `crawled_at`, `status`), plus a `crawl.json` Manifest that
|
|
27
|
+
captures the site's nav order and reading order — so an agent can follow the
|
|
28
|
+
docs in the order the authors intended.
|
|
29
|
+
- **Source-first: clone over crawl.** If the docs site is open-source, getdocs
|
|
30
|
+
detects the "Edit this page" link, clones the repo, and serves you the
|
|
31
|
+
original markdown source instead of scraping HTML — the highest-fidelity copy
|
|
32
|
+
there is. Falls back to crawling automatically when there's no repo.
|
|
33
|
+
|
|
34
|
+
## When to reach for it
|
|
35
|
+
|
|
36
|
+
- **Coding against an unfamiliar library or API.** Mirror its docs into your
|
|
37
|
+
repo (or a scratch dir) so your agent can ground its answers in the real
|
|
38
|
+
reference instead of hallucinating from memory.
|
|
39
|
+
- **RAG / knowledge bases.** Get a clean markdown corpus to chunk and embed,
|
|
40
|
+
without writing a bespoke scraper-and-cleaner for every site.
|
|
41
|
+
- **Offline or air-gapped work.** Take a docs set with you; read and search it
|
|
42
|
+
with no network.
|
|
43
|
+
- **Pinning a version.** Snapshot today's docs so your agent isn't tripped up
|
|
44
|
+
when the upstream site changes underneath you.
|
|
45
|
+
- **Migrating or archiving docs.** Pull an entire site down as markdown to move,
|
|
46
|
+
diff, or keep.
|
|
47
|
+
|
|
48
|
+
## Output
|
|
49
|
+
|
|
50
|
+
```
|
|
51
|
+
out/
|
|
52
|
+
├── crawl.json ← the Manifest: nav order, reading order, what ran
|
|
53
|
+
└── docs/
|
|
54
|
+
├── index.md
|
|
55
|
+
├── auth.md
|
|
56
|
+
└── guide/
|
|
57
|
+
└── intro.md
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
```markdown
|
|
61
|
+
---
|
|
62
|
+
url: https://example.com/docs/auth
|
|
63
|
+
title: Authentication
|
|
64
|
+
status: 200
|
|
65
|
+
crawled_at: 2026-06-12T10:00:00Z
|
|
66
|
+
---
|
|
67
|
+
|
|
68
|
+
# Authentication
|
|
69
|
+
...
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
Sitemap discovery, JavaScript rendering, source-repo cloning, polite
|
|
73
|
+
throttling, JSONL output, and resumable crawls are all built in — see
|
|
74
|
+
[docs/USAGE.md](docs/USAGE.md).
|
|
75
|
+
|
|
76
|
+
## Install
|
|
77
|
+
|
|
78
|
+
Requires **Python 3.12+**.
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install getdocs
|
|
82
|
+
```
|
|
83
|
+
|
|
84
|
+
Or from source, for the latest unreleased changes:
|
|
85
|
+
|
|
86
|
+
```bash
|
|
87
|
+
git clone https://github.com/jonbakerfish/getdocs
|
|
88
|
+
cd getdocs
|
|
89
|
+
pip install -e .
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
That's enough to crawl. Two optional pieces unlock more:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
# JavaScript rendering — the headless browser used to hydrate SPA docs
|
|
96
|
+
playwright install chromium
|
|
97
|
+
|
|
98
|
+
# Serve a crawled/cloned copy locally as a browsable site
|
|
99
|
+
pip install mkdocs mkdocs-material
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**`git`** must be on your `PATH` for source-first cloning (it almost always
|
|
103
|
+
already is); without it, getdocs simply falls back to crawling. To run the
|
|
104
|
+
optional API service, install the server extra: `pip install "getdocs[server]"`.
|
|
105
|
+
|
|
106
|
+
## Development
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
git clone https://github.com/jonbakerfish/getdocs
|
|
110
|
+
cd getdocs
|
|
111
|
+
pip install -e ".[dev]"
|
|
112
|
+
pytest
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
## Responsible use
|
|
116
|
+
|
|
117
|
+
getdocs is a tool; how you point it is on you. By default it **honors
|
|
118
|
+
`robots.txt`**, throttles itself politely, and **identifies itself honestly**
|
|
119
|
+
in the `User-Agent` (`getdocs/<version> (+project-url)`) — please keep it that
|
|
120
|
+
way. For high-volume crawls, add `--contact you@example.com` so site operators
|
|
121
|
+
can reach you (it's appended to the User-Agent; optional but courteous).
|
|
122
|
+
|
|
123
|
+
getdocs is intended for personal, reference, and agent/RAG use on documentation
|
|
124
|
+
you have the right to access. **You are solely responsible for complying with
|
|
125
|
+
each site's Terms of Service, its `robots.txt`, applicable law, and the
|
|
126
|
+
copyright of the content you fetch** — getdocs is provided as-is, with no
|
|
127
|
+
warranty (see [LICENSE](LICENSE)). Crawled documentation belongs to its authors:
|
|
128
|
+
use it for your own reference, but don't redistribute someone else's docs as
|
|
129
|
+
your own. Crawl only what you have the right to.
|
getdocs-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=77"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "getdocs"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Documentation crawler: recursively crawl a docs site and emit clean markdown"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
authors = [{ name = "jonbakerfish", email = "jonbakerfish@gmail.com" }]
|
|
14
|
+
keywords = ["documentation", "crawler", "scraper", "markdown", "docs", "llm", "agents", "rag"]
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Development Status :: 4 - Beta",
|
|
17
|
+
"Environment :: Console",
|
|
18
|
+
"Intended Audience :: Developers",
|
|
19
|
+
"Operating System :: OS Independent",
|
|
20
|
+
"Programming Language :: Python :: 3",
|
|
21
|
+
"Programming Language :: Python :: 3.12",
|
|
22
|
+
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
|
|
23
|
+
"Topic :: Software Development :: Documentation",
|
|
24
|
+
"Topic :: Text Processing :: Markup :: Markdown",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"scrapy>=2.11",
|
|
28
|
+
"markdownify>=0.13",
|
|
29
|
+
"beautifulsoup4>=4.12",
|
|
30
|
+
"pyyaml>=6.0",
|
|
31
|
+
"trafilatura>=1.12",
|
|
32
|
+
"scrapy-playwright>=0.0.40",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
[project.urls]
|
|
36
|
+
Homepage = "https://github.com/jonbakerfish/getdocs"
|
|
37
|
+
Repository = "https://github.com/jonbakerfish/getdocs"
|
|
38
|
+
Issues = "https://github.com/jonbakerfish/getdocs/issues"
|
|
39
|
+
Documentation = "https://github.com/jonbakerfish/getdocs/blob/main/docs/USAGE.md"
|
|
40
|
+
|
|
41
|
+
[project.optional-dependencies]
|
|
42
|
+
server = ["fastapi>=0.110", "uvicorn>=0.27", "httpx>=0.27"]
|
|
43
|
+
dev = ["pytest>=8.0", "httpx>=0.27", "fastapi>=0.110", "uvicorn>=0.27"]
|
|
44
|
+
|
|
45
|
+
[project.scripts]
|
|
46
|
+
getdocs = "getdocs.cli:main"
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.packages.find]
|
|
49
|
+
where = ["src"]
|
|
50
|
+
|
|
51
|
+
[tool.pytest.ini_options]
|
|
52
|
+
testpaths = ["tests"]
|
getdocs-0.1.0/setup.cfg
ADDED
|
File without changes
|
getdocs-0.1.0/src/getdocs/api.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""API service: Firecrawl-style async Crawl jobs over the engine (ADR-0002)."""
|
|
2
|
+
|
|
3
|
+
from fastapi import FastAPI, HTTPException, WebSocket
|
|
4
|
+
from pydantic import BaseModel, model_validator
|
|
5
|
+
|
|
6
|
+
from getdocs.jobs import CrawlJob, JobManager
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class CrawlRequest(BaseModel):
    """Request body for starting a Crawl job.

    Every field has a default, so each one is optional on its own; the
    model-level validator additionally rejects a request in which both
    ``url`` and ``urls`` are missing or empty.
    """

    url: str | None = None
    urls: list[str] | None = None
    limit: int | None = None
    depth: int | None = None
    allow_backward: bool = False
    allow_subdomains: bool = False
    include_paths: list[str] | None = None
    exclude_paths: list[str] | None = None
    sitemap: str | None = None  # "both" | "off" | "only"
    render: str | None = None  # "auto" | "always" | "never"
    selector: str | None = None
    ignore_robots: bool = False
    keep_html: bool = False
    delay: float | None = None
    concurrency: int | None = None
    webhook: str | None = None  # URL POSTed started/page/completed events

    @model_validator(mode="after")
    def _require_some_url(self):
        """Return the model unchanged if ``url`` or ``urls`` is truthy.

        Raises:
            ValueError: when neither ``url`` nor ``urls`` carries a value.
        """
        # Guard clause: any truthy seed field is enough to accept the request.
        if self.url or self.urls:
            return self
        raise ValueError("either url or urls is required")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _serialize(job: CrawlJob) -> dict:
    """Return the detailed dict view of ``job``.

    The result carries the job's ``id``, ``status``, ``seeds``, ``pages``
    (with their count under ``page_count``), ``manifest``, ``error`` and
    ``webhook_failures``, in that key order.
    """
    view = {}
    view["id"] = job.id
    view["status"] = job.status
    view["seeds"] = job.seeds
    view["page_count"] = len(job.pages)
    view["pages"] = job.pages
    view["manifest"] = job.manifest
    view["error"] = job.error
    view["webhook_failures"] = job.webhook_failures
    return view
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def create_app(manager: JobManager | None = None) -> FastAPI:
    """Build the FastAPI application that exposes Crawl jobs under ``/v1/crawl``.

    Args:
        manager: Job manager backing every endpoint. A new ``JobManager()``
            is created only when this argument is ``None``.

    Returns:
        The configured application. The manager in use is also stored on
        ``app.state.manager``.
    """
    # Compare against None explicitly. A truthiness test
    # (`manager or JobManager()`) would silently replace a caller-supplied
    # manager that happens to evaluate falsy, e.g. an object defining
    # __len__ or __bool__.
    if manager is None:
        manager = JobManager()
    app = FastAPI(title="getdocs", version="0.1.0")
    app.state.manager = manager

    # Shared detail text for the 404 responses and the WebSocket close reason.
    not_found = "no such Crawl job"

    @app.post("/v1/crawl", status_code=202)
    async def start_crawl(request: CrawlRequest):
        """Start a job; fields left as ``None`` are omitted from the options."""
        job = manager.start(request.model_dump(exclude_none=True))
        return {"id": job.id, "status": job.status}

    @app.get("/v1/crawl")
    async def list_crawls():
        """List every known job as a summary (no pages, manifest or error)."""
        return {
            "jobs": [
                {
                    "id": job.id,
                    "status": job.status,
                    "seeds": job.seeds,
                    "page_count": len(job.pages),
                }
                for job in manager.jobs.values()
            ]
        }

    @app.get("/v1/crawl/{job_id}")
    async def get_crawl(job_id: str):
        """Return the detailed view of one job, or 404 if the id is unknown."""
        job = manager.get(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail=not_found)
        return _serialize(job)

    @app.websocket("/v1/crawl/{job_id}/ws")
    async def stream_crawl(websocket: WebSocket, job_id: str):
        """Send each event from ``manager.stream(job_id)`` as JSON, then close."""
        # The socket is accepted before the lookup, so an unknown id is
        # answered with a close carrying the application code 4404.
        await websocket.accept()
        if manager.get(job_id) is None:
            await websocket.close(code=4404, reason=not_found)
            return
        async for event in manager.stream(job_id):
            await websocket.send_json(event)
        await websocket.close()

    @app.delete("/v1/crawl/{job_id}")
    async def cancel_crawl(job_id: str):
        """Cancel a job via the manager; 404 when the manager returns ``None``."""
        job = manager.cancel(job_id)
        if job is None:
            raise HTTPException(status_code=404, detail=not_found)
        return {"id": job.id, "status": job.status}

    return app
|