getdocs 0.1.0__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {getdocs-0.1.0/src/getdocs.egg-info → getdocs-0.2.0}/PKG-INFO +49 -1
  2. {getdocs-0.1.0 → getdocs-0.2.0}/README.md +48 -0
  3. {getdocs-0.1.0 → getdocs-0.2.0}/pyproject.toml +1 -1
  4. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/cli.py +19 -4
  5. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/config.py +1 -0
  6. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/engine.py +11 -4
  7. getdocs-0.2.0/src/getdocs/outcome.py +75 -0
  8. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/source.py +14 -7
  9. {getdocs-0.1.0 → getdocs-0.2.0/src/getdocs.egg-info}/PKG-INFO +49 -1
  10. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs.egg-info/SOURCES.txt +3 -0
  11. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_source.py +7 -2
  12. getdocs-0.2.0/tests/test_summary_clone_e2e.py +88 -0
  13. getdocs-0.2.0/tests/test_summary_e2e.py +138 -0
  14. {getdocs-0.1.0 → getdocs-0.2.0}/LICENSE +0 -0
  15. {getdocs-0.1.0 → getdocs-0.2.0}/setup.cfg +0 -0
  16. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/__init__.py +0 -0
  17. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/__main__.py +0 -0
  18. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/api.py +0 -0
  19. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/extract.py +0 -0
  20. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/identity.py +0 -0
  21. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/jobs.py +0 -0
  22. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/navharvest.py +0 -0
  23. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/output.py +0 -0
  24. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/scope.py +0 -0
  25. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/sitemap.py +0 -0
  26. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/urlnorm.py +0 -0
  27. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs.egg-info/dependency_links.txt +0 -0
  28. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs.egg-info/entry_points.txt +0 -0
  29. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs.egg-info/requires.txt +0 -0
  30. {getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs.egg-info/top_level.txt +0 -0
  31. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_api.py +0 -0
  32. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_cli.py +0 -0
  33. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_crawl_e2e.py +0 -0
  34. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_extract.py +0 -0
  35. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_extract_pipeline.py +0 -0
  36. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_identity.py +0 -0
  37. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_identity_e2e.py +0 -0
  38. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_jobs.py +0 -0
  39. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_jsonl_e2e.py +0 -0
  40. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_jsonl_output.py +0 -0
  41. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_limits_errors_e2e.py +0 -0
  42. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_media_e2e.py +0 -0
  43. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_navharvest.py +0 -0
  44. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_navorder_e2e.py +0 -0
  45. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_output.py +0 -0
  46. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_politeness_e2e.py +0 -0
  47. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_relink_e2e.py +0 -0
  48. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_render_e2e.py +0 -0
  49. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_resume_e2e.py +0 -0
  50. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_scope.py +0 -0
  51. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_shell_detection.py +0 -0
  52. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_sitemap.py +0 -0
  53. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_sitemap_e2e.py +0 -0
  54. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_traversal_e2e.py +0 -0
  55. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_urlnorm.py +0 -0
  56. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_webhook_api.py +0 -0
  57. {getdocs-0.1.0 → getdocs-0.2.0}/tests/test_ws_api.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: getdocs
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
5
5
  Author-email: jonbakerfish <jonbakerfish@gmail.com>
6
6
  License-Expression: MIT
@@ -113,6 +113,54 @@ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
113
113
  throttling, JSONL output, and resumable crawls are all built in — see
114
114
  [docs/USAGE.md](docs/USAGE.md).
115
115
 
116
+ ## Use with your agent
117
+
118
+ getdocs is built to be driven by a coding agent: it's an ordinary CLI whose
119
+ `out/` tree + `crawl.json` Manifest *is* the return value (no MCP server, no job
120
+ protocol — see [ADR-0007](docs/adr/0007-agent-integration-is-the-cli-not-an-mcp-surface.md)).
121
+ Two patterns cover most uses.
122
+
123
+ **Synchronous — fetch one docs section.** Scope defaults to the seed's host +
124
+ path prefix, so pointing at a subtree fetches just that subtree. This blocks
125
+ until done and works under any agent:
126
+
127
+ ```bash
128
+ getdocs crawl https://example.com/docs/auth -o ./out --summary-json
129
+ ```
130
+
131
+ **Background — mirror a whole site.** Kick the crawl off as a background task
132
+ and keep working. Under **Claude Code** the agent is resumed automatically when
133
+ the crawl finishes; every other agent surveyed blocks or polls the output path
134
+ instead (this is a harness feature, not a getdocs one):
135
+
136
+ ```bash
137
+ getdocs crawl https://example.com/docs -o ./out --summary-json &
138
+ ```
139
+
140
+ **Read the summary, branch on the Outcome.** Every run ends with a one-line
141
+ summary on stderr; `--summary-json` also prints a machine-readable object to
141
+ stdout (in the default `files` format), discriminated by `outcome`. A run produces exactly one Outcome — a Crawl or a Clone:
143
+
144
+ ```jsonc
145
+ // outcome: "crawled" — getdocs scraped the rendered site
146
+ { "outcome": "crawled", "status": "ok", "pages": 42,
147
+ "output_dir": "./out", "manifest": "./out/crawl.json", "truncated": false }
148
+
149
+ // outcome: "cloned" — the docs were open-source, so getdocs cloned the repo
150
+ // (no pages / no manifest: a Clone is not a Crawl)
151
+ { "outcome": "cloned", "status": "ok", "repo": "acme/docs",
152
+ "output_dir": "./out/docs", "mkdocs_config": "./out/mkdocs.yml" }
153
+ ```
154
+
155
+ Have the agent switch on `outcome`:
156
+
157
+ - **`crawled`** → grep and read the Pages under `output_dir` and follow the nav
158
+ / reading order in `manifest` (`crawl.json`).
159
+ - **`cloned`** → you have the original markdown source; serve it with
160
+ `mkdocs serve -f <mkdocs_config>` (or just read the files under `output_dir`).
161
+ - **`status: "truncated"`** → the crawl hit its page cap; re-run with a higher
162
+ `--limit` (or `0` for unlimited) to get the rest.
163
+
116
164
  ## Install
117
165
 
118
166
  Requires **Python 3.12+**.
@@ -73,6 +73,54 @@ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
73
73
  throttling, JSONL output, and resumable crawls are all built in — see
74
74
  [docs/USAGE.md](docs/USAGE.md).
75
75
 
76
+ ## Use with your agent
77
+
78
+ getdocs is built to be driven by a coding agent: it's an ordinary CLI whose
79
+ `out/` tree + `crawl.json` Manifest *is* the return value (no MCP server, no job
80
+ protocol — see [ADR-0007](docs/adr/0007-agent-integration-is-the-cli-not-an-mcp-surface.md)).
81
+ Two patterns cover most uses.
82
+
83
+ **Synchronous — fetch one docs section.** Scope defaults to the seed's host +
84
+ path prefix, so pointing at a subtree fetches just that subtree. This blocks
85
+ until done and works under any agent:
86
+
87
+ ```bash
88
+ getdocs crawl https://example.com/docs/auth -o ./out --summary-json
89
+ ```
90
+
91
+ **Background — mirror a whole site.** Kick the crawl off as a background task
92
+ and keep working. Under **Claude Code** the agent is resumed automatically when
93
+ the crawl finishes; every other agent surveyed blocks or polls the output path
94
+ instead (this is a harness feature, not a getdocs one):
95
+
96
+ ```bash
97
+ getdocs crawl https://example.com/docs -o ./out --summary-json &
98
+ ```
99
+
100
+ **Read the summary, branch on the Outcome.** Every run ends with a one-line
101
+ summary on stderr; `--summary-json` also prints a machine-readable object to
102
+ stdout (in the default `files` format), discriminated by `outcome`. A run produces exactly one Outcome — a Crawl or a Clone:
103
+
104
+ ```jsonc
105
+ // outcome: "crawled" — getdocs scraped the rendered site
106
+ { "outcome": "crawled", "status": "ok", "pages": 42,
107
+ "output_dir": "./out", "manifest": "./out/crawl.json", "truncated": false }
108
+
109
+ // outcome: "cloned" — the docs were open-source, so getdocs cloned the repo
110
+ // (no pages / no manifest: a Clone is not a Crawl)
111
+ { "outcome": "cloned", "status": "ok", "repo": "acme/docs",
112
+ "output_dir": "./out/docs", "mkdocs_config": "./out/mkdocs.yml" }
113
+ ```
114
+
115
+ Have the agent switch on `outcome`:
116
+
117
+ - **`crawled`** → grep and read the Pages under `output_dir` and follow the nav
118
+ / reading order in `manifest` (`crawl.json`).
119
+ - **`cloned`** → you have the original markdown source; serve it with
120
+ `mkdocs serve -f <mkdocs_config>` (or just read the files under `output_dir`).
121
+ - **`status: "truncated"`** → the crawl hit its page cap; re-run with a higher
122
+ `--limit` (or `0` for unlimited) to get the rest.
123
+
76
124
  ## Install
77
125
 
78
126
  Requires **Python 3.12+**.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "getdocs"
7
- version = "0.1.0"
7
+ version = "0.2.0"
8
8
  description = "Documentation crawler: recursively crawl a docs site and emit clean markdown"
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.12"
@@ -67,6 +67,11 @@ def parse_args(argv: list[str] | None = None) -> CrawlConfig | ServeConfig:
67
67
  "--format", choices=["files", "jsonl"], default="files",
68
68
  help="files: .md tree + crawl.json; jsonl: one record per Page on stdout",
69
69
  )
70
+ crawl.add_argument(
71
+ "--summary-json", dest="summary_json", action="store_true",
72
+ help="Print a machine-readable Outcome summary (files mode: to stdout; "
73
+ "jsonl mode: already carried by the final Manifest record)",
74
+ )
70
75
  crawl.add_argument(
71
76
  "--selector", metavar="CSS",
72
77
  help="CSS selector for the content container (overrides auto-detection)",
@@ -146,6 +151,7 @@ def parse_args(argv: list[str] | None = None) -> CrawlConfig | ServeConfig:
146
151
  depth=args.depth,
147
152
  limit=args.limit,
148
153
  format=args.format,
154
+ summary_json=args.summary_json,
149
155
  keep_html=args.keep_html,
150
156
  sitemap=args.sitemap,
151
157
  selector=args.selector,
@@ -195,7 +201,11 @@ def main(argv: list[str] | None = None) -> int:
195
201
  if config.format == "files" and config.clone_source and not config.resume and config.seeds:
196
202
  from getdocs.source import clone_source_for
197
203
 
198
- if clone_source_for(config) is not None:
204
+ clone = clone_source_for(config)
205
+ if clone is not None:
206
+ print(clone.stderr_line(), file=sys.stderr, flush=True)
207
+ if config.summary_json:
208
+ print(json.dumps(clone.summary_json()), flush=True)
199
209
  return 0
200
210
  state_file = state_file_for(config)
201
211
  if config.resume:
@@ -212,9 +222,14 @@ def main(argv: list[str] | None = None) -> int:
212
222
  )
213
223
  state_file.unlink()
214
224
 
215
- page_count = run_crawl(config)
216
- if page_count == 0:
217
- # stderr: stdout belongs to the jsonl stream (ADR-0002)
225
+ outcome = run_crawl(config)
226
+ # Always-on one-line summary on stderr (stdout belongs to the jsonl stream,
227
+ # ADR-0002). The opt-in --summary-json object goes to stdout in files mode;
228
+ # in jsonl mode the final Manifest record already carries the same facts.
229
+ print(outcome.stderr_line(), file=sys.stderr, flush=True)
230
+ if config.summary_json and config.format == "files":
231
+ print(json.dumps(outcome.summary_json()), flush=True)
232
+ if outcome.status == "empty":
218
233
  print("error: no Pages produced — seed(s) unreachable?", file=sys.stderr, flush=True)
219
234
  return 1
220
235
  return 0
@@ -15,6 +15,7 @@ class CrawlConfig:
15
15
  depth: int = 0 # link-hops from any seed; 0 = unlimited
16
16
  limit: int = 1000 # max Pages per Crawl; 0 = unlimited
17
17
  format: str = "files" # "files" or "jsonl"
18
+ summary_json: bool = False # emit a machine-readable Outcome summary object
18
19
  keep_html: bool = False
19
20
  sitemap: str = "both" # "both", "off" (--no-sitemap), or "only" (--sitemap-only)
20
21
  selector: str | None = None # CSS selector naming the content container
@@ -25,6 +25,7 @@ from getdocs.config import CrawlConfig
25
25
  from getdocs.extract import extract_page, is_shell
26
26
  from getdocs.identity import build_user_agent
27
27
  from getdocs.navharvest import harvest_nav, merge_harvests
28
+ from getdocs.outcome import CrawlOutcome
28
29
  from getdocs.output import AssetStore, FileTreeWriter, JsonlWriter, PageRecord, relink_pages
29
30
  from getdocs.scope import Scope
30
31
  from getdocs.sitemap import parse_robots_sitemaps, parse_sitemap_xml
@@ -343,8 +344,8 @@ class _CrawlSpider(scrapy.Spider):
343
344
  )
344
345
 
345
346
 
346
- def run_crawl(config: CrawlConfig) -> int:
347
- """Run a Crawl to completion; returns the number of Pages produced."""
347
+ def run_crawl(config: CrawlConfig) -> CrawlOutcome:
348
+ """Run a Crawl to completion; returns the structured Outcome it produced."""
348
349
  if config.format == "jsonl":
349
350
  writer = JsonlWriter(sys.stdout)
350
351
  else:
@@ -405,7 +406,7 @@ def run_crawl(config: CrawlConfig) -> int:
405
406
  if isinstance(writer, FileTreeWriter):
406
407
  relink_pages(writer, outcome["crawl_sequence"])
407
408
  nav, reading_order = merge_harvests(outcome["harvests"], outcome["crawl_sequence"])
408
- writer.write_manifest(
409
+ manifest_path = writer.write_manifest(
409
410
  seeds=config.seeds,
410
411
  errors=outcome["errors"],
411
412
  truncated=outcome["truncated"],
@@ -415,4 +416,10 @@ def run_crawl(config: CrawlConfig) -> int:
415
416
  reading_order=reading_order,
416
417
  media_skipped=outcome["media_skipped"],
417
418
  )
418
- return writer.page_count
419
+ return CrawlOutcome(
420
+ pages=writer.page_count,
421
+ output_dir=config.output_dir,
422
+ manifest=manifest_path,
423
+ truncated=outcome["truncated"],
424
+ format=config.format,
425
+ )
@@ -0,0 +1,75 @@
1
+ """Outcome: the structured result of a getdocs run — a Crawl or a Clone (ADR-0007).
2
+
3
+ A run produces exactly one Outcome — a Crawl or a Clone. The always-on stderr
4
+ summary line and the opt-in --summary-json object are both rendered from the
5
+ same Outcome value, so the two surfaces cannot disagree.
6
+ """
7
+
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+
11
+
12
+ @dataclass(frozen=True)
13
+ class CrawlOutcome:
14
+ """What a Crawl produced: how many Pages, where they landed, whether capped."""
15
+
16
+ pages: int
17
+ output_dir: Path
18
+ manifest: Path | None # crawl.json path in files mode; None in jsonl mode
19
+ truncated: bool
20
+ format: str = "files" # "files" or "jsonl"
21
+
22
+ @property
23
+ def status(self) -> str:
24
+ """ok | truncated | empty — derived so the line and JSON always agree."""
25
+ if self.pages == 0:
26
+ return "empty"
27
+ if self.truncated:
28
+ return "truncated"
29
+ return "ok"
30
+
31
+ def stderr_line(self) -> str:
32
+ if self.format == "jsonl":
33
+ dest = "stdout (jsonl)"
34
+ else:
35
+ dest = f"{self.output_dir} ({self.manifest.name})"
36
+ note = " [truncated]" if self.truncated else ""
37
+ return f"getdocs: crawled {self.pages} Pages → {dest}{note}"
38
+
39
+ def summary_json(self) -> dict:
40
+ return {
41
+ "outcome": "crawled",
42
+ "status": self.status,
43
+ "pages": self.pages,
44
+ "output_dir": str(self.output_dir),
45
+ "manifest": str(self.manifest),
46
+ "truncated": self.truncated,
47
+ }
48
+
49
+
50
+ @dataclass(frozen=True)
51
+ class CloneOutcome:
52
+ """What a Clone produced: the source repo, where it landed, its serve config.
53
+
54
+ A Clone is a sibling of a Crawl, not a kind of Crawl (CONTEXT.md): it carries
55
+ no Pages and no Manifest, so the summary omits those keys entirely.
56
+ """
57
+
58
+ repo: str # source-repo identity, e.g. "acme/docs"
59
+ output_dir: Path # where the clone landed, e.g. ./out/docs
60
+ mkdocs_config: Path | None # generated/own serve config; None if none was written
61
+
62
+ status: str = "ok" # producing a Clone at all means it succeeded
63
+
64
+ def stderr_line(self) -> str:
65
+ cfg = f" ({self.mkdocs_config.name})" if self.mkdocs_config else ""
66
+ return f"getdocs: cloned {self.repo} → {self.output_dir}{cfg}"
67
+
68
+ def summary_json(self) -> dict:
69
+ return {
70
+ "outcome": "cloned",
71
+ "status": self.status,
72
+ "repo": self.repo,
73
+ "output_dir": str(self.output_dir),
74
+ "mkdocs_config": str(self.mkdocs_config) if self.mkdocs_config else None,
75
+ }
@@ -24,6 +24,7 @@ from bs4 import BeautifulSoup
24
24
 
25
25
  from getdocs.config import CrawlConfig
26
26
  from getdocs.identity import build_user_agent
27
+ from getdocs.outcome import CloneOutcome
27
28
 
28
29
  # Hosts whose /ORG/REPO paths we recognize as clonable repositories.
29
30
  _GIT_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
@@ -191,12 +192,17 @@ def write_mkdocs_config(output_dir: Path, docs_dir: Path, site_name: str) -> Pat
191
192
  return path
192
193
 
193
194
 
194
- def clone_source_for(config: CrawlConfig) -> Path | None:
195
+ def _repo_identity(repo_url: str) -> str:
196
+ """Short owner/repo identity for a canonical repo URL ("acme/docs")."""
197
+ return "/".join(s for s in urlsplit(repo_url).path.split("/") if s)
198
+
199
+
200
+ def clone_source_for(config: CrawlConfig) -> CloneOutcome | None:
195
201
  """Try to satisfy a crawl by cloning the docs' source repo instead.
196
202
 
197
- Returns the clone directory when the site is open-source and was cloned
198
- (the caller should then skip crawling); None to fall back to crawling.
199
- Progress and outcomes are reported on stderr (stdout is the jsonl stream).
203
+ Returns a CloneOutcome when the site is open-source and was cloned (the
204
+ caller should then skip crawling and report it); None to fall back to
205
+ crawling. Progress is reported on stderr (stdout is the jsonl stream).
200
206
  """
201
207
  if not config.seeds:
202
208
  return None
@@ -220,19 +226,20 @@ def clone_source_for(config: CrawlConfig) -> Path | None:
220
226
  if repo_dir is None:
221
227
  print("clone failed (git missing or repo unreachable) — crawling instead", file=sys.stderr)
222
228
  return None
229
+ repo = _repo_identity(repo_url)
223
230
 
224
231
  own_config = repo_dir / "mkdocs.yml"
225
232
  if own_config.exists():
226
233
  print(f"cloned to {repo_dir} (ships its own mkdocs.yml)", file=sys.stderr)
227
234
  print(f"serve it with: mkdocs serve -f {own_config}", file=sys.stderr)
228
- return repo_dir
235
+ return CloneOutcome(repo=repo, output_dir=repo_dir, mkdocs_config=own_config)
229
236
 
230
237
  docs_dir = find_docs_dir(repo_dir)
231
238
  if docs_dir is None:
232
239
  print(f"cloned to {repo_dir}, but found no markdown docs to serve", file=sys.stderr)
233
- return repo_dir
240
+ return CloneOutcome(repo=repo, output_dir=repo_dir, mkdocs_config=None)
234
241
 
235
242
  written = write_mkdocs_config(config.output_dir, docs_dir, host)
236
243
  print(f"cloned to {repo_dir}; wrote {written}", file=sys.stderr)
237
244
  print(f"serve it with: mkdocs serve -f {written}", file=sys.stderr)
238
- return repo_dir
245
+ return CloneOutcome(repo=repo, output_dir=repo_dir, mkdocs_config=written)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: getdocs
3
- Version: 0.1.0
3
+ Version: 0.2.0
4
4
  Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
5
5
  Author-email: jonbakerfish <jonbakerfish@gmail.com>
6
6
  License-Expression: MIT
@@ -113,6 +113,54 @@ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
113
113
  throttling, JSONL output, and resumable crawls are all built in — see
114
114
  [docs/USAGE.md](docs/USAGE.md).
115
115
 
116
+ ## Use with your agent
117
+
118
+ getdocs is built to be driven by a coding agent: it's an ordinary CLI whose
119
+ `out/` tree + `crawl.json` Manifest *is* the return value (no MCP server, no job
120
+ protocol — see [ADR-0007](docs/adr/0007-agent-integration-is-the-cli-not-an-mcp-surface.md)).
121
+ Two patterns cover most uses.
122
+
123
+ **Synchronous — fetch one docs section.** Scope defaults to the seed's host +
124
+ path prefix, so pointing at a subtree fetches just that subtree. This blocks
125
+ until done and works under any agent:
126
+
127
+ ```bash
128
+ getdocs crawl https://example.com/docs/auth -o ./out --summary-json
129
+ ```
130
+
131
+ **Background — mirror a whole site.** Kick the crawl off as a background task
132
+ and keep working. Under **Claude Code** the agent is resumed automatically when
133
+ the crawl finishes; every other agent surveyed blocks or polls the output path
134
+ instead (this is a harness feature, not a getdocs one):
135
+
136
+ ```bash
137
+ getdocs crawl https://example.com/docs -o ./out --summary-json &
138
+ ```
139
+
140
+ **Read the summary, branch on the Outcome.** Every run ends with a one-line
141
+ summary on stderr; `--summary-json` also prints a machine-readable object to
141
+ stdout (in the default `files` format), discriminated by `outcome`. A run produces exactly one Outcome — a Crawl or a Clone:
143
+
144
+ ```jsonc
145
+ // outcome: "crawled" — getdocs scraped the rendered site
146
+ { "outcome": "crawled", "status": "ok", "pages": 42,
147
+ "output_dir": "./out", "manifest": "./out/crawl.json", "truncated": false }
148
+
149
+ // outcome: "cloned" — the docs were open-source, so getdocs cloned the repo
150
+ // (no pages / no manifest: a Clone is not a Crawl)
151
+ { "outcome": "cloned", "status": "ok", "repo": "acme/docs",
152
+ "output_dir": "./out/docs", "mkdocs_config": "./out/mkdocs.yml" }
153
+ ```
154
+
155
+ Have the agent switch on `outcome`:
156
+
157
+ - **`crawled`** → grep and read the Pages under `output_dir` and follow the nav
158
+ / reading order in `manifest` (`crawl.json`).
159
+ - **`cloned`** → you have the original markdown source; serve it with
160
+ `mkdocs serve -f <mkdocs_config>` (or just read the files under `output_dir`).
161
+ - **`status: "truncated"`** → the crawl hit its page cap; re-run with a higher
162
+ `--limit` (or `0` for unlimited) to get the rest.
163
+
116
164
  ## Install
117
165
 
118
166
  Requires **Python 3.12+**.
@@ -11,6 +11,7 @@ src/getdocs/extract.py
11
11
  src/getdocs/identity.py
12
12
  src/getdocs/jobs.py
13
13
  src/getdocs/navharvest.py
14
+ src/getdocs/outcome.py
14
15
  src/getdocs/output.py
15
16
  src/getdocs/scope.py
16
17
  src/getdocs/sitemap.py
@@ -46,6 +47,8 @@ tests/test_shell_detection.py
46
47
  tests/test_sitemap.py
47
48
  tests/test_sitemap_e2e.py
48
49
  tests/test_source.py
50
+ tests/test_summary_clone_e2e.py
51
+ tests/test_summary_e2e.py
49
52
  tests/test_traversal_e2e.py
50
53
  tests/test_urlnorm.py
51
54
  tests/test_webhook_api.py
@@ -170,7 +170,9 @@ def test_clone_source_for_clones_and_writes_config(tmp_path, monkeypatch):
170
170
  config = CrawlConfig(seeds=["https://docs.acme.io/intro"], output_dir=out)
171
171
  result = source.clone_source_for(config)
172
172
 
173
- assert result == out / "docs"
173
+ assert result.repo == "acme/docs"
174
+ assert result.output_dir == out / "docs"
175
+ assert result.mkdocs_config == out / "mkdocs.yml"
174
176
  assert (out / "mkdocs.yml").exists()
175
177
 
176
178
 
@@ -198,6 +200,9 @@ def test_clone_source_for_uses_repos_own_mkdocs_yml(tmp_path, monkeypatch):
198
200
  monkeypatch.setattr(source, "clone_repo", fake_clone)
199
201
  config = CrawlConfig(seeds=["https://docs.acme.io/"], output_dir=out)
200
202
 
201
- assert source.clone_source_for(config) == out / "docs"
203
+ result = source.clone_source_for(config)
204
+ assert result.output_dir == out / "docs"
205
+ # The repo's own config is used as the serve config, not one we generate.
206
+ assert result.mkdocs_config == out / "docs" / "mkdocs.yml"
202
207
  # We don't overwrite a repo that already ships its own config.
203
208
  assert not (out / "mkdocs.yml").exists()
@@ -0,0 +1,88 @@
1
+ """Clone completion summary (#22): the source-first Outcome report.
2
+
3
+ When a run clones the docs' source repo instead of crawling (ADR-0006), it
4
+ reports a `cloned` Outcome — distinct from a Crawl — so an agent can branch on
5
+ it. Verified at the in-process cli.main() seam with the source-first
6
+ orchestrator's network/git collaborators stubbed (prior art: test_source.py
7
+ monkeypatches fetch_html and clone_repo).
8
+ """
9
+
10
+ import json
11
+
12
+ from getdocs import cli, source
13
+
14
+ EDIT_LINK_HTML = (
15
+ '<a href="https://github.com/acme/docs/edit/main/p.md" '
16
+ 'title="Edit this page">edit</a>'
17
+ )
18
+
19
+
20
def fake_clone_with_docs(tmp_path):
    """Build a clone_repo stand-in that fabricates a repo dir with markdown docs.

    The returned callable ignores the repo URL and timeout: it creates
    ``<dest_parent>/docs/docs/index.md`` and returns ``<dest_parent>/docs``.
    """
    def _clone(repo_url, dest_parent, timeout=180.0):
        checkout = dest_parent / "docs"
        docs_folder = checkout / "docs"
        docs_folder.mkdir(parents=True)
        index_page = docs_folder / "index.md"
        index_page.write_text("# Home")
        return checkout

    return _clone
28
+
29
+
30
def test_clone_prints_one_line_stderr_summary(tmp_path, monkeypatch, capsys):
    """A cloned run names the repo, the clone dir and the serve config on stderr."""
    out_dir = tmp_path / "out"
    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
    argv = ["crawl", "https://docs.acme.io/", "-o", str(out_dir)]

    exit_code = cli.main(argv)

    assert exit_code == 0
    stderr_text = capsys.readouterr().err
    # Names the source repo and the cloned / serve-config locations.
    expected_fragments = (
        "getdocs: cloned acme/docs",
        str(out_dir / "docs"),
        "mkdocs.yml",
    )
    for fragment in expected_fragments:
        assert fragment in stderr_text
43
+
44
+
45
def test_clone_summary_json_emits_cloned_object(tmp_path, monkeypatch, capsys):
    """--summary-json on a cloned run prints a `cloned` object on stdout."""
    out_dir = tmp_path / "out"
    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
    argv = ["crawl", "https://docs.acme.io/", "-o", str(out_dir), "--summary-json"]

    exit_code = cli.main(argv)

    assert exit_code == 0
    payload = json.loads(capsys.readouterr().out)
    expected = {
        "outcome": "cloned",
        "status": "ok",
        "repo": "acme/docs",
        "output_dir": str(out_dir / "docs"),
        "mkdocs_config": str(out_dir / "mkdocs.yml"),
    }
    for key, value in expected.items():
        assert payload[key] == value
    # A Clone has no Pages and no Manifest (CONTEXT.md): those keys are absent.
    for absent_key in ("pages", "manifest"):
        assert absent_key not in payload
62
+
63
+
64
def test_clone_stderr_line_and_json_carry_the_same_facts(tmp_path, monkeypatch, capsys):
    """The repo and output dir in the JSON object also appear in the stderr line."""
    out_dir = tmp_path / "out"
    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
    argv = ["crawl", "https://docs.acme.io/", "-o", str(out_dir), "--summary-json"]

    exit_code = cli.main(argv)

    assert exit_code == 0
    streams = capsys.readouterr()
    payload = json.loads(streams.out)
    stderr_text = streams.err
    assert f"cloned {payload['repo']}" in stderr_text
    assert payload["output_dir"] in stderr_text
76
+
77
+
78
def test_agent_can_branch_on_outcome_clone_vs_crawl(tmp_path, monkeypatch, capsys):
    """The `outcome` key alone tells a cloned run apart from a crawled one."""
    # A cloned run is discriminable from a crawled run purely by `outcome`.
    out_dir = tmp_path / "out"
    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
    argv = ["crawl", "https://docs.acme.io/", "-o", str(out_dir), "--summary-json"]

    cli.main(argv)
    payload = json.loads(capsys.readouterr().out)
    discriminator = payload["outcome"]

    assert discriminator == "cloned"
    assert discriminator != "crawled"
@@ -0,0 +1,138 @@
1
+ """Completion summary: the agent-native Outcome report (#21).
2
+
3
+ A finished run reports what it produced — an always-on one-line summary on
4
+ stderr, and an opt-in structured object via --summary-json. Both surfaces are
5
+ rendered from the same Outcome so they cannot disagree (ADR-0007). Verified at
6
+ the subprocess CLI seam (prior art: test_crawl_e2e, test_limits_errors_e2e).
7
+ """
8
+
9
+ import json
10
+ import subprocess
11
+ import sys
12
+ from pathlib import Path
13
+
14
+ FIXTURE_HTML = (Path(__file__).parent / "fixtures" / "basic_docs_page.html").read_text()
15
+
16
+
17
def page(title, body):
    """Return a minimal HTML document with *title* in <title> and *body* in <main>."""
    head = f"<head><title>{title}</title></head>"
    content = f"<body><main>{body}</main></body>"
    return "<html>" + head + content + "</html>"
19
+
20
+
21
def run_getdocs(*args):
    """Run ``python -m getdocs`` with *args* plus ``--delay 0`` and capture its output.

    Returns the CompletedProcess with stdout/stderr captured as text; the
    call is bounded by a 120-second timeout.
    """
    command = [sys.executable, "-m", "getdocs"]
    command.extend(args)
    command += ["--delay", "0"]
    return subprocess.run(command, capture_output=True, text=True, timeout=120)
26
+
27
+
28
def test_crawl_prints_one_line_stderr_summary(site, tmp_path):
    """A one-Page crawl reports the page count and output location on stderr."""
    site.add("/docs/auth", FIXTURE_HTML)
    seed_url = f"{site.url}/docs/auth"

    proc = run_getdocs("crawl", seed_url, "-o", str(tmp_path), "--no-clone-source")
    stderr_text = proc.stderr

    assert proc.returncode == 0, stderr_text
    # Names the page count and where the Pages landed.
    assert "getdocs: crawled 1 Pages" in stderr_text
    assert str(tmp_path) in stderr_text
38
+
39
+
40
def test_summary_json_emits_crawled_object_to_stdout(site, tmp_path):
    """--summary-json in files mode prints a `crawled` object on stdout."""
    site.add("/docs/auth", FIXTURE_HTML)
    seed_url = f"{site.url}/docs/auth"
    cli_args = ("crawl", seed_url, "-o", str(tmp_path), "--no-clone-source", "--summary-json")

    proc = run_getdocs(*cli_args)

    assert proc.returncode == 0, proc.stderr
    payload = json.loads(proc.stdout)
    expected = {
        "outcome": "crawled",
        "status": "ok",
        "pages": 1,
        "output_dir": str(tmp_path),
        "manifest": str(tmp_path / "crawl.json"),
    }
    for key, value in expected.items():
        assert payload[key] == value
    assert payload["truncated"] is False
56
+
57
+
58
def test_jsonl_summary_emits_no_stdout_object_but_keeps_stderr_line(site):
    """In jsonl mode stdout stays the record stream; the stderr line still appears."""
    site.add("/docs/", page("Home", '<h1>Home</h1><a href="/docs/auth">Auth</a>'))
    site.add("/docs/auth", page("Auth", "<h1>Auth</h1>"))
    seed_url = f"{site.url}/docs/"

    proc = run_getdocs(
        "crawl", seed_url, "--format", "jsonl",
        "--summary-json", "--no-clone-source",
    )

    assert proc.returncode == 0, proc.stderr
    raw_lines = proc.stdout.strip().split("\n")
    records = list(map(json.loads, raw_lines))
    # stdout stays the page stream: Page records + the final Manifest, nothing else.
    assert records[-1]["type"] == "manifest"
    allowed_types = ("page", "manifest")
    assert not any(record["type"] not in allowed_types for record in records)
    # The stderr line is still emitted in jsonl mode.
    assert "getdocs: crawled 2 Pages" in proc.stderr
74
+
75
+
76
def test_truncated_crawl_reports_truncated_status(site, tmp_path):
    """A crawl stopped by --limit reports `truncated` in both the JSON and the stderr line."""
    site.add("/docs/", page("Home", '<h1>Home</h1><a href="/docs/p0">start</a>'))
    # Chain p0 → p1 → … → p5; the last page has no onward link.
    final_index = 5
    for idx in range(final_index + 1):
        if idx == final_index:
            onward = ""
        else:
            onward = f'<a href="/docs/p{idx + 1}">next</a>'
        site.add(f"/docs/p{idx}", page(f"P{idx}", f"<h1>P{idx}</h1>{onward}"))

    proc = run_getdocs(
        "crawl", f"{site.url}/docs/", "-o", str(tmp_path),
        "--limit", "2", "--summary-json", "--no-clone-source",
    )

    assert proc.returncode == 0, proc.stderr
    payload = json.loads(proc.stdout)
    assert payload["status"] == "truncated"
    assert payload["truncated"] is True
    assert "[truncated]" in proc.stderr
92
+
93
+
94
def test_empty_crawl_reports_empty_status_and_exits_nonzero(tmp_path):
    """A crawl that produces no Pages exits 1 and still prints an `empty` summary."""
    unreachable_seed = "http://127.0.0.1:1/none"

    proc = run_getdocs(
        "crawl", unreachable_seed, "-o", str(tmp_path),
        "--summary-json", "--no-clone-source",
    )

    assert proc.returncode == 1
    payload = json.loads(proc.stdout)
    expected = {"outcome": "crawled", "status": "empty", "pages": 0}
    for key, value in expected.items():
        assert payload[key] == value
105
+
106
+
107
def test_stderr_line_and_json_carry_the_same_facts(site, tmp_path):
    """The page count and truncation flag in the JSON match the stderr line."""
    site.add("/docs/auth", FIXTURE_HTML)
    seed_url = f"{site.url}/docs/auth"
    cli_args = ("crawl", seed_url, "-o", str(tmp_path), "--no-clone-source", "--summary-json")

    proc = run_getdocs(*cli_args)
    stderr_text = proc.stderr

    assert proc.returncode == 0, stderr_text
    payload = json.loads(proc.stdout)
    assert f"crawled {payload['pages']} Pages" in stderr_text
    line_says_truncated = "[truncated]" in stderr_text
    assert line_says_truncated == payload["truncated"]
119
+
120
+
121
def test_resume_run_produces_a_crawled_summary(site, tmp_path):
    """A --resume run after a limited crawl also reports a `crawled` Outcome."""
    site.add("/docs/", page("Home", '<h1>Home</h1><a href="/docs/p0">start</a>'))
    # Chain p0 → p1 → p2 → p3; the last page has no onward link.
    final_index = 3
    for idx in range(final_index + 1):
        if idx == final_index:
            onward = ""
        else:
            onward = f'<a href="/docs/p{idx + 1}">next</a>'
        site.add(f"/docs/p{idx}", page(f"P{idx}", f"<h1>P{idx}</h1>{onward}"))
    out_dir = str(tmp_path)

    initial = run_getdocs(
        "crawl", f"{site.url}/docs/", "-o", out_dir,
        "--limit", "2", "--no-clone-source",
    )
    assert initial.returncode == 0, initial.stderr

    resumed = run_getdocs("crawl", "--resume", "-o", out_dir, "--summary-json")

    assert resumed.returncode == 0, resumed.stderr
    payload = json.loads(resumed.stdout)
    assert payload["outcome"] == "crawled"
    assert "getdocs: crawled" in resumed.stderr
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes