PyPI - getdocs - Versions diffs - 0.1.0__tar.gz → 0.2.0__tar.gz - Mend

getdocs 0.1.0tar.gz → 0.2.0tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

{getdocs-0.1.0/src/getdocs.egg-info → getdocs-0.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: getdocs
-Version: 0.1.0
+Version: 0.2.0
 Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
 Author-email: jonbakerfish <jonbakerfish@gmail.com>
 License-Expression: MIT
@@ -113,6 +113,54 @@ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
 throttling, JSONL output, and resumable crawls are all built in — see
 [docs/USAGE.md](docs/USAGE.md).
+## Use with your agent
+getdocs is built to be driven by a coding agent: it's an ordinary CLI whose
+`out/` tree + `crawl.json` Manifest *is* the return value (no MCP server, no job
+protocol — see [ADR-0007](docs/adr/0007-agent-integration-is-the-cli-not-an-mcp-surface.md)).
+Two patterns cover most uses.
+**Synchronous — fetch one docs section.** Scope defaults to the seed's host +
+path prefix, so pointing at a subtree fetches just that subtree. This blocks
+until done and works under any agent:
+```bash
+getdocs crawl https://example.com/docs/auth -o ./out --summary-json
+```
+**Background — mirror a whole site.** Kick the crawl off as a background task
+and keep working. Under **Claude Code** the agent is resumed automatically when
+the crawl finishes; every other agent surveyed blocks or polls the output path
+instead (this is a harness feature, not a getdocs one):
+```bash
+getdocs crawl https://example.com/docs -o ./out --summary-json &
+```
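+**Read the summary, branch on the Outcome.** Every run ends with a one-line
+summary on stderr; `--summary-json` adds a machine-readable object on stdout,
+discriminated by `outcome` (with `--format jsonl` no extra object is printed —
+the final Manifest record already carries the same facts). A run produces
+exactly one Outcome — a Crawl or a Clone: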
+```jsonc
+// outcome: "crawled" — getdocs scraped the rendered site
+{ "outcome": "crawled", "status": "ok", "pages": 42,
+  "output_dir": "./out", "manifest": "./out/crawl.json", "truncated": false }
+// outcome: "cloned" — the docs were open-source, so getdocs cloned the repo
+// (no pages / no manifest: a Clone is not a Crawl)
+{ "outcome": "cloned", "status": "ok", "repo": "acme/docs",
+  "output_dir": "./out/docs", "mkdocs_config": "./out/mkdocs.yml" }
+```
+Have the agent switch on `outcome`:
+- **`crawled`** → grep and read the Pages under `output_dir` and follow the nav
+  / reading order in `manifest` (`crawl.json`).
+- **`cloned`** → you have the original markdown source; serve it with
+  `mkdocs serve -f <mkdocs_config>` (or just read the files under `output_dir`).
+- **`status: "truncated"`** → the crawl hit its page cap; re-run with a higher
+  `--limit` (or `0` for unlimited) to get the rest.
 ## Install
 Requires **Python 3.12+**.

{getdocs-0.1.0 → getdocs-0.2.0}/README.md RENAMED Viewed

@@ -73,6 +73,54 @@ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
 throttling, JSONL output, and resumable crawls are all built in — see
 [docs/USAGE.md](docs/USAGE.md).
+## Use with your agent
+getdocs is built to be driven by a coding agent: it's an ordinary CLI whose
+`out/` tree + `crawl.json` Manifest *is* the return value (no MCP server, no job
+protocol — see [ADR-0007](docs/adr/0007-agent-integration-is-the-cli-not-an-mcp-surface.md)).
+Two patterns cover most uses.
+**Synchronous — fetch one docs section.** Scope defaults to the seed's host +
+path prefix, so pointing at a subtree fetches just that subtree. This blocks
+until done and works under any agent:
+```bash
+getdocs crawl https://example.com/docs/auth -o ./out --summary-json
+```
+**Background — mirror a whole site.** Kick the crawl off as a background task
+and keep working. Under **Claude Code** the agent is resumed automatically when
+the crawl finishes; every other agent surveyed blocks or polls the output path
+instead (this is a harness feature, not a getdocs one):
+```bash
+getdocs crawl https://example.com/docs -o ./out --summary-json &
+```
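+**Read the summary, branch on the Outcome.** Every run ends with a one-line
+summary on stderr; `--summary-json` adds a machine-readable object on stdout,
+discriminated by `outcome` (with `--format jsonl` no extra object is printed —
+the final Manifest record already carries the same facts). A run produces
+exactly one Outcome — a Crawl or a Clone: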
+```jsonc
+// outcome: "crawled" — getdocs scraped the rendered site
+{ "outcome": "crawled", "status": "ok", "pages": 42,
+  "output_dir": "./out", "manifest": "./out/crawl.json", "truncated": false }
+// outcome: "cloned" — the docs were open-source, so getdocs cloned the repo
+// (no pages / no manifest: a Clone is not a Crawl)
+{ "outcome": "cloned", "status": "ok", "repo": "acme/docs",
+  "output_dir": "./out/docs", "mkdocs_config": "./out/mkdocs.yml" }
+```
+Have the agent switch on `outcome`:
+- **`crawled`** → grep and read the Pages under `output_dir` and follow the nav
+  / reading order in `manifest` (`crawl.json`).
+- **`cloned`** → you have the original markdown source; serve it with
+  `mkdocs serve -f <mkdocs_config>` (or just read the files under `output_dir`).
+- **`status: "truncated"`** → the crawl hit its page cap; re-run with a higher
+  `--limit` (or `0` for unlimited) to get the rest.
 ## Install
 Requires **Python 3.12+**.

{getdocs-0.1.0 → getdocs-0.2.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "getdocs"
-version = "0.1.0"
+version = "0.2.0"
 description = "Documentation crawler: recursively crawl a docs site and emit clean markdown"
 readme = "README.md"
 requires-python = ">=3.12"

{getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/cli.py RENAMED Viewed

@@ -67,6 +67,11 @@ def parse_args(argv: list[str] | None = None) -> CrawlConfig | ServeConfig:
         "--format", choices=["files", "jsonl"], default="files",
         help="files: .md tree + crawl.json; jsonl: one record per Page on stdout",
     )
+    crawl.add_argument(
+        "--summary-json", dest="summary_json", action="store_true",
+        help="Print a machine-readable Outcome summary (files mode: to stdout; "
+             "jsonl mode: already carried by the final Manifest record)",
+    )
     crawl.add_argument(
         "--selector", metavar="CSS",
         help="CSS selector for the content container (overrides auto-detection)",
@@ -146,6 +151,7 @@ def parse_args(argv: list[str] | None = None) -> CrawlConfig | ServeConfig:
         depth=args.depth,
         limit=args.limit,
         format=args.format,
+        summary_json=args.summary_json,
         keep_html=args.keep_html,
         sitemap=args.sitemap,
         selector=args.selector,
@@ -195,7 +201,11 @@ def main(argv: list[str] | None = None) -> int:
     if config.format == "files" and config.clone_source and not config.resume and config.seeds:
         from getdocs.source import clone_source_for
-        if clone_source_for(config) is not None:
+        clone = clone_source_for(config)
+        if clone is not None:
+            print(clone.stderr_line(), file=sys.stderr, flush=True)
+            if config.summary_json:
+                print(json.dumps(clone.summary_json()), flush=True)
             return 0
     state_file = state_file_for(config)
     if config.resume:
@@ -212,9 +222,14 @@ def main(argv: list[str] | None = None) -> int:
         )
         state_file.unlink()
-    page_count = run_crawl(config)
-    if page_count == 0:
-        # stderr: stdout belongs to the jsonl stream (ADR-0002)
+    outcome = run_crawl(config)
+    # Always-on one-line summary on stderr (stdout belongs to the jsonl stream,
+    # ADR-0002). The opt-in --summary-json object goes to stdout in files mode;
+    # in jsonl mode the final Manifest record already carries the same facts.
+    print(outcome.stderr_line(), file=sys.stderr, flush=True)
+    if config.summary_json and config.format == "files":
+        print(json.dumps(outcome.summary_json()), flush=True)
+    if outcome.status == "empty":
         print("error: no Pages produced — seed(s) unreachable?", file=sys.stderr, flush=True)
         return 1
     return 0

{getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/config.py RENAMED Viewed

@@ -15,6 +15,7 @@ class CrawlConfig:
     depth: int = 0  # link-hops from any seed; 0 = unlimited
     limit: int = 1000  # max Pages per Crawl; 0 = unlimited
     format: str = "files"  # "files" or "jsonl"
+    summary_json: bool = False  # emit a machine-readable Outcome summary object
     keep_html: bool = False
     sitemap: str = "both"  # "both", "off" (--no-sitemap), or "only" (--sitemap-only)
     selector: str | None = None  # CSS selector naming the content container

{getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/engine.py RENAMED Viewed

@@ -25,6 +25,7 @@ from getdocs.config import CrawlConfig
 from getdocs.extract import extract_page, is_shell
 from getdocs.identity import build_user_agent
 from getdocs.navharvest import harvest_nav, merge_harvests
+from getdocs.outcome import CrawlOutcome
 from getdocs.output import AssetStore, FileTreeWriter, JsonlWriter, PageRecord, relink_pages
 from getdocs.scope import Scope
 from getdocs.sitemap import parse_robots_sitemaps, parse_sitemap_xml
@@ -343,8 +344,8 @@ class _CrawlSpider(scrapy.Spider):
         )
-def run_crawl(config: CrawlConfig) -> int:
-    """Run a Crawl to completion; returns the number of Pages produced."""
+def run_crawl(config: CrawlConfig) -> CrawlOutcome:
+    """Run a Crawl to completion; returns the structured Outcome it produced."""
     if config.format == "jsonl":
         writer = JsonlWriter(sys.stdout)
     else:
@@ -405,7 +406,7 @@ def run_crawl(config: CrawlConfig) -> int:
     if isinstance(writer, FileTreeWriter):
         relink_pages(writer, outcome["crawl_sequence"])
     nav, reading_order = merge_harvests(outcome["harvests"], outcome["crawl_sequence"])
-    writer.write_manifest(
+    manifest_path = writer.write_manifest(
         seeds=config.seeds,
         errors=outcome["errors"],
         truncated=outcome["truncated"],
@@ -415,4 +416,10 @@ def run_crawl(config: CrawlConfig) -> int:
         reading_order=reading_order,
         media_skipped=outcome["media_skipped"],
     )
-    return writer.page_count
+    return CrawlOutcome(
+        pages=writer.page_count,
+        output_dir=config.output_dir,
+        manifest=manifest_path,
+        truncated=outcome["truncated"],
+        format=config.format,
+    )

getdocs-0.2.0/src/getdocs/outcome.py ADDED Viewed

@@ -0,0 +1,75 @@
+"""Outcome: the structured result of a getdocs run — a Crawl or a Clone (ADR-0007).
+A run produces exactly one Outcome — a Crawl or a Clone. The always-on stderr
+summary line and the opt-in --summary-json object are both rendered from the
+same Outcome value, so the two surfaces cannot disagree.
+"""
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass(frozen=True)
+class CrawlOutcome:
+    """What a Crawl produced: how many Pages, where they landed, whether capped.
+    Both the stderr summary line and the --summary-json object are rendered
+    from this one value. `manifest` is optional, so every rendering guards
+    against None instead of assuming a path is present.
+    """
+    pages: int
+    output_dir: Path
+    manifest: Path | None  # crawl.json path in files mode; None in jsonl mode
+    truncated: bool
+    format: str = "files"  # "files" or "jsonl"
+    @property
+    def status(self) -> str:
+        """ok | truncated | empty — derived so the line and JSON always agree."""
+        # "empty" wins over "truncated": zero Pages is reported as empty
+        # regardless of the truncated flag.
+        if self.pages == 0:
+            return "empty"
+        if self.truncated:
+            return "truncated"
+        return "ok"
+    def stderr_line(self) -> str:
+        """Return the one-line human-readable summary of this Crawl."""
+        if self.format == "jsonl":
+            dest = "stdout (jsonl)"
+        elif self.manifest is not None:
+            dest = f"{self.output_dir} ({self.manifest.name})"
+        else:
+            # No manifest path to name: report the output directory alone
+            # rather than raising AttributeError on None.
+            dest = f"{self.output_dir}"
+        note = " [truncated]" if self.truncated else ""
+        return f"getdocs: crawled {self.pages} Pages → {dest}{note}"
+    def summary_json(self) -> dict:
+        """Return the JSON-serialisable summary object (outcome == "crawled")."""
+        return {
+            "outcome": "crawled",
+            "status": self.status,
+            "pages": self.pages,
+            "output_dir": str(self.output_dir),
+            # A missing manifest serialises as JSON null, not the string "None".
+            "manifest": str(self.manifest) if self.manifest is not None else None,
+            "truncated": self.truncated,
+        }
+@dataclass(frozen=True)
+class CloneOutcome:
+    """The result of a run that cloned the docs' source repo instead of crawling.
+    A Clone sits beside a Crawl rather than under it (CONTEXT.md): there are no
+    Pages and no Manifest, so neither key appears in the summary object.
+    """
+    repo: str  # source-repo identity, e.g. "acme/docs"
+    output_dir: Path  # where the clone landed, e.g. ./out/docs
+    mkdocs_config: Path | None  # generated/own serve config; None if none was written
+    status: str = "ok"  # producing a Clone at all means it succeeded
+    def stderr_line(self) -> str:
+        """Return the one-line human-readable summary of this Clone."""
+        line = f"getdocs: cloned {self.repo} → {self.output_dir}"
+        if self.mkdocs_config:
+            # Name the serve config only when one exists.
+            line += f" ({self.mkdocs_config.name})"
+        return line
+    def summary_json(self) -> dict:
+        """Return the JSON-serialisable summary object (outcome == "cloned")."""
+        serve_config = str(self.mkdocs_config) if self.mkdocs_config else None
+        return dict(
+            outcome="cloned",
+            status=self.status,
+            repo=self.repo,
+            output_dir=str(self.output_dir),
+            mkdocs_config=serve_config,
+        )

{getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs/source.py RENAMED Viewed

@@ -24,6 +24,7 @@ from bs4 import BeautifulSoup
 from getdocs.config import CrawlConfig
 from getdocs.identity import build_user_agent
+from getdocs.outcome import CloneOutcome
 # Hosts whose /ORG/REPO paths we recognize as clonable repositories.
 _GIT_HOSTS = {"github.com", "gitlab.com", "bitbucket.org", "codeberg.org"}
@@ -191,12 +192,17 @@ def write_mkdocs_config(output_dir: Path, docs_dir: Path, site_name: str) -> Pat
     return path
-def clone_source_for(config: CrawlConfig) -> Path | None:
+def _repo_identity(repo_url: str) -> str:
+    """Reduce a repo URL to its path segments joined by "/" (e.g. "acme/docs")."""
+    # Empty segments (leading/trailing or doubled slashes) are dropped.
+    segments = [part for part in urlsplit(repo_url).path.split("/") if part]
+    return "/".join(segments)
+def clone_source_for(config: CrawlConfig) -> CloneOutcome | None:
     """Try to satisfy a crawl by cloning the docs' source repo instead.
-    Returns the clone directory when the site is open-source and was cloned
-    (the caller should then skip crawling); None to fall back to crawling.
-    Progress and outcomes are reported on stderr (stdout is the jsonl stream).
+    Returns a CloneOutcome when the site is open-source and was cloned (the
+    caller should then skip crawling and report it); None to fall back to
+    crawling. Progress is reported on stderr (stdout is the jsonl stream).
     """
     if not config.seeds:
         return None
@@ -220,19 +226,20 @@ def clone_source_for(config: CrawlConfig) -> Path | None:
     if repo_dir is None:
         print("clone failed (git missing or repo unreachable) — crawling instead", file=sys.stderr)
         return None
+    repo = _repo_identity(repo_url)
     own_config = repo_dir / "mkdocs.yml"
     if own_config.exists():
         print(f"cloned to {repo_dir} (ships its own mkdocs.yml)", file=sys.stderr)
         print(f"serve it with: mkdocs serve -f {own_config}", file=sys.stderr)
-        return repo_dir
+        return CloneOutcome(repo=repo, output_dir=repo_dir, mkdocs_config=own_config)
     docs_dir = find_docs_dir(repo_dir)
     if docs_dir is None:
         print(f"cloned to {repo_dir}, but found no markdown docs to serve", file=sys.stderr)
-        return repo_dir
+        return CloneOutcome(repo=repo, output_dir=repo_dir, mkdocs_config=None)
     written = write_mkdocs_config(config.output_dir, docs_dir, host)
     print(f"cloned to {repo_dir}; wrote {written}", file=sys.stderr)
     print(f"serve it with: mkdocs serve -f {written}", file=sys.stderr)
-    return repo_dir
+    return CloneOutcome(repo=repo, output_dir=repo_dir, mkdocs_config=written)

{getdocs-0.1.0 → getdocs-0.2.0/src/getdocs.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: getdocs
-Version: 0.1.0
+Version: 0.2.0
 Summary: Documentation crawler: recursively crawl a docs site and emit clean markdown
 Author-email: jonbakerfish <jonbakerfish@gmail.com>
 License-Expression: MIT
@@ -113,6 +113,54 @@ Sitemap discovery, JavaScript rendering, source-repo cloning, polite
 throttling, JSONL output, and resumable crawls are all built in — see
 [docs/USAGE.md](docs/USAGE.md).
+## Use with your agent
+getdocs is built to be driven by a coding agent: it's an ordinary CLI whose
+`out/` tree + `crawl.json` Manifest *is* the return value (no MCP server, no job
+protocol — see [ADR-0007](docs/adr/0007-agent-integration-is-the-cli-not-an-mcp-surface.md)).
+Two patterns cover most uses.
+**Synchronous — fetch one docs section.** Scope defaults to the seed's host +
+path prefix, so pointing at a subtree fetches just that subtree. This blocks
+until done and works under any agent:
+```bash
+getdocs crawl https://example.com/docs/auth -o ./out --summary-json
+```
+**Background — mirror a whole site.** Kick the crawl off as a background task
+and keep working. Under **Claude Code** the agent is resumed automatically when
+the crawl finishes; every other agent surveyed blocks or polls the output path
+instead (this is a harness feature, not a getdocs one):
+```bash
+getdocs crawl https://example.com/docs -o ./out --summary-json &
+```
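+**Read the summary, branch on the Outcome.** Every run ends with a one-line
+summary on stderr; `--summary-json` adds a machine-readable object on stdout,
+discriminated by `outcome` (with `--format jsonl` no extra object is printed —
+the final Manifest record already carries the same facts). A run produces
+exactly one Outcome — a Crawl or a Clone: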
+```jsonc
+// outcome: "crawled" — getdocs scraped the rendered site
+{ "outcome": "crawled", "status": "ok", "pages": 42,
+  "output_dir": "./out", "manifest": "./out/crawl.json", "truncated": false }
+// outcome: "cloned" — the docs were open-source, so getdocs cloned the repo
+// (no pages / no manifest: a Clone is not a Crawl)
+{ "outcome": "cloned", "status": "ok", "repo": "acme/docs",
+  "output_dir": "./out/docs", "mkdocs_config": "./out/mkdocs.yml" }
+```
+Have the agent switch on `outcome`:
+- **`crawled`** → grep and read the Pages under `output_dir` and follow the nav
+  / reading order in `manifest` (`crawl.json`).
+- **`cloned`** → you have the original markdown source; serve it with
+  `mkdocs serve -f <mkdocs_config>` (or just read the files under `output_dir`).
+- **`status: "truncated"`** → the crawl hit its page cap; re-run with a higher
+  `--limit` (or `0` for unlimited) to get the rest.
 ## Install
 Requires **Python 3.12+**.

{getdocs-0.1.0 → getdocs-0.2.0}/src/getdocs.egg-info/SOURCES.txt RENAMED Viewed

@@ -11,6 +11,7 @@ src/getdocs/extract.py
 src/getdocs/identity.py
 src/getdocs/jobs.py
 src/getdocs/navharvest.py
+src/getdocs/outcome.py
 src/getdocs/output.py
 src/getdocs/scope.py
 src/getdocs/sitemap.py
@@ -46,6 +47,8 @@ tests/test_shell_detection.py
 tests/test_sitemap.py
 tests/test_sitemap_e2e.py
 tests/test_source.py
+tests/test_summary_clone_e2e.py
+tests/test_summary_e2e.py
 tests/test_traversal_e2e.py
 tests/test_urlnorm.py
 tests/test_webhook_api.py

{getdocs-0.1.0 → getdocs-0.2.0}/tests/test_source.py RENAMED Viewed

@@ -170,7 +170,9 @@ def test_clone_source_for_clones_and_writes_config(tmp_path, monkeypatch):
     config = CrawlConfig(seeds=["https://docs.acme.io/intro"], output_dir=out)
     result = source.clone_source_for(config)
-    assert result == out / "docs"
+    assert result.repo == "acme/docs"
+    assert result.output_dir == out / "docs"
+    assert result.mkdocs_config == out / "mkdocs.yml"
     assert (out / "mkdocs.yml").exists()
@@ -198,6 +200,9 @@ def test_clone_source_for_uses_repos_own_mkdocs_yml(tmp_path, monkeypatch):
     monkeypatch.setattr(source, "clone_repo", fake_clone)
     config = CrawlConfig(seeds=["https://docs.acme.io/"], output_dir=out)
-    assert source.clone_source_for(config) == out / "docs"
+    result = source.clone_source_for(config)
+    assert result.output_dir == out / "docs"
+    # The repo's own config is used as the serve config, not one we generate.
+    assert result.mkdocs_config == out / "docs" / "mkdocs.yml"
     # We don't overwrite a repo that already ships its own config.
     assert not (out / "mkdocs.yml").exists()

getdocs-0.2.0/tests/test_summary_clone_e2e.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""Clone completion summary (#22): the source-first Outcome report.
+When a run clones the docs' source repo instead of crawling (ADR-0006), it
+reports a `cloned` Outcome — distinct from a Crawl — so an agent can branch on
+it. Verified at the in-process cli.main() seam with the source-first
+orchestrator's network/git collaborators stubbed (prior art: test_source.py
+monkeypatches fetch_html and clone_repo).
+"""
+import json
+from getdocs import cli, source
+EDIT_LINK_HTML = (
+    '<a href="https://github.com/acme/docs/edit/main/p.md" '
+    'title="Edit this page">edit</a>'
+)
+def fake_clone_with_docs(tmp_path):
+    """Build a clone_repo replacement that fakes a checkout containing markdown docs."""
+    def _clone(repo_url, dest_parent, timeout=180.0):
+        # Lay out <dest_parent>/docs/docs/index.md and hand back the checkout root.
+        checkout = dest_parent / "docs"
+        docs_root = checkout / "docs"
+        docs_root.mkdir(parents=True)
+        (docs_root / "index.md").write_text("# Home")
+        return checkout
+    return _clone
+def test_clone_prints_one_line_stderr_summary(tmp_path, monkeypatch, capsys):
+    """A cloned run exits 0 and writes the `getdocs: cloned ...` line to stderr.
+    fetch_html and clone_repo are stubbed, so no network or git is touched.
+    """
+    out = tmp_path / "out"
+    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
+    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
+    rc = cli.main(["crawl", "https://docs.acme.io/", "-o", str(out)])
+    assert rc == 0
+    err = capsys.readouterr().err
+    # Names the source repo and the cloned / serve-config locations.
+    assert "getdocs: cloned acme/docs" in err
+    assert str(out / "docs") in err
+    assert "mkdocs.yml" in err
+def test_clone_summary_json_emits_cloned_object(tmp_path, monkeypatch, capsys):
+    """With --summary-json, a cloned run prints a `cloned` object on stdout.
+    The whole of captured stdout must parse as one JSON document.
+    """
+    out = tmp_path / "out"
+    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
+    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
+    rc = cli.main(["crawl", "https://docs.acme.io/", "-o", str(out), "--summary-json"])
+    assert rc == 0
+    summary = json.loads(capsys.readouterr().out)
+    assert summary["outcome"] == "cloned"
+    assert summary["status"] == "ok"
+    assert summary["repo"] == "acme/docs"
+    assert summary["output_dir"] == str(out / "docs")
+    assert summary["mkdocs_config"] == str(out / "mkdocs.yml")
+    # A Clone has no Pages and no Manifest (CONTEXT.md): those keys are absent.
+    assert "pages" not in summary
+    assert "manifest" not in summary
+def test_clone_stderr_line_and_json_carry_the_same_facts(tmp_path, monkeypatch, capsys):
+    """The stderr line and the JSON object of one cloned run agree on repo and path."""
+    out = tmp_path / "out"
+    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
+    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
+    rc = cli.main(["crawl", "https://docs.acme.io/", "-o", str(out), "--summary-json"])
+    assert rc == 0
+    # Read both streams from a single capture so they come from the same run.
+    captured = capsys.readouterr()
+    summary = json.loads(captured.out)
+    assert f"cloned {summary['repo']}" in captured.err
+    assert summary["output_dir"] in captured.err
+def test_agent_can_branch_on_outcome_clone_vs_crawl(tmp_path, monkeypatch, capsys):
+    """A cloned run is discriminable from a crawled run purely by `outcome`."""
+    out = tmp_path / "out"
+    monkeypatch.setattr(source, "fetch_html", lambda url, ua=None: EDIT_LINK_HTML)
+    monkeypatch.setattr(source, "clone_repo", fake_clone_with_docs(tmp_path))
+    rc = cli.main(["crawl", "https://docs.acme.io/", "-o", str(out), "--summary-json"])
+    # Check the exit code first so a failed run is reported as such, not as a
+    # JSON decode error on empty stdout.
+    assert rc == 0
+    cloned = json.loads(capsys.readouterr().out)
+    assert cloned["outcome"] == "cloned"
+    assert cloned["outcome"] != "crawled"

getdocs-0.2.0/tests/test_summary_e2e.py ADDED Viewed

@@ -0,0 +1,138 @@
+"""Completion summary: the agent-native Outcome report (#21).
+A finished run reports what it produced — an always-on one-line summary on
+stderr, and an opt-in structured object via --summary-json. Both surfaces are
+rendered from the same Outcome so they cannot disagree (ADR-0007). Verified at
+the subprocess CLI seam (prior art: test_crawl_e2e, test_limits_errors_e2e).
+"""
+import json
+import subprocess
+import sys
+from pathlib import Path
+FIXTURE_HTML = (Path(__file__).parent / "fixtures" / "basic_docs_page.html").read_text()
+def page(title, body):
+    """Wrap `body` in a minimal HTML document whose <title> is `title`."""
+    head = f"<head><title>{title}</title></head>"
+    content = f"<body><main>{body}</main></body>"
+    return f"<html>{head}{content}</html>"
+def run_getdocs(*args):
+    """Run `python -m getdocs <args> --delay 0` and return the CompletedProcess.
+    stdout and stderr are captured as text; the run is capped at 120 seconds.
+    """
+    command = [sys.executable, "-m", "getdocs", *args, "--delay", "0"]
+    return subprocess.run(command, capture_output=True, text=True, timeout=120)
+def test_crawl_prints_one_line_stderr_summary(site, tmp_path):
+    """A single-page crawl exits 0 and reports `crawled 1 Pages` on stderr.
+    `site` is a fixture defined outside this file; presumably a local test
+    HTTP server (TODO confirm).
+    """
+    site.add("/docs/auth", FIXTURE_HTML)
+    seed = f"{site.url}/docs/auth"
+    result = run_getdocs("crawl", seed, "-o", str(tmp_path), "--no-clone-source")
+    assert result.returncode == 0, result.stderr
+    # Names the page count and where the Pages landed.
+    assert "getdocs: crawled 1 Pages" in result.stderr
+    assert str(tmp_path) in result.stderr
+def test_summary_json_emits_crawled_object_to_stdout(site, tmp_path):
+    """In files mode, --summary-json prints one `crawled` object on stdout.
+    The whole of stdout must parse as a single JSON document.
+    """
+    site.add("/docs/auth", FIXTURE_HTML)
+    seed = f"{site.url}/docs/auth"
+    result = run_getdocs(
+        "crawl", seed, "-o", str(tmp_path), "--no-clone-source", "--summary-json"
+    )
+    assert result.returncode == 0, result.stderr
+    summary = json.loads(result.stdout)
+    assert summary["outcome"] == "crawled"
+    assert summary["status"] == "ok"
+    assert summary["pages"] == 1
+    assert summary["output_dir"] == str(tmp_path)
+    assert summary["manifest"] == str(tmp_path / "crawl.json")
+    assert summary["truncated"] is False
+def test_jsonl_summary_emits_no_stdout_object_but_keeps_stderr_line(site):
+    """In jsonl mode, --summary-json adds nothing to stdout; stderr still has the line."""
+    site.add("/docs/", page("Home", '<h1>Home</h1><a href="/docs/auth">Auth</a>'))
+    site.add("/docs/auth", page("Auth", "<h1>Auth</h1>"))
+    result = run_getdocs(
+        "crawl", f"{site.url}/docs/", "--format", "jsonl",
+        "--summary-json", "--no-clone-source",
+    )
+    assert result.returncode == 0, result.stderr
+    # Every stdout line must be its own JSON record.
+    records = [json.loads(line) for line in result.stdout.strip().split("\n")]
+    # stdout stays the page stream: Page records + the final Manifest, nothing else.
+    assert records[-1]["type"] == "manifest"
+    assert all(r["type"] in ("page", "manifest") for r in records)
+    # The stderr line is still emitted in jsonl mode.
+    assert "getdocs: crawled 2 Pages" in result.stderr
+def test_truncated_crawl_reports_truncated_status(site, tmp_path):
+    """A crawl capped by --limit reports `truncated` in both the JSON and the stderr line."""
+    # Home links to p0, and p0..p5 form a chain: more pages than --limit 2 allows.
+    site.add("/docs/", page("Home", '<h1>Home</h1><a href="/docs/p0">start</a>'))
+    for i in range(6):
+        link = f'<a href="/docs/p{i + 1}">next</a>' if i + 1 < 6 else ""
+        site.add(f"/docs/p{i}", page(f"P{i}", f"<h1>P{i}</h1>{link}"))
+    result = run_getdocs(
+        "crawl", f"{site.url}/docs/", "-o", str(tmp_path),
+        "--limit", "2", "--summary-json", "--no-clone-source",
+    )
+    # A truncated crawl is still a successful run.
+    assert result.returncode == 0, result.stderr
+    summary = json.loads(result.stdout)
+    assert summary["status"] == "truncated"
+    assert summary["truncated"] is True
+    assert "[truncated]" in result.stderr
+def test_empty_crawl_reports_empty_status_and_exits_nonzero(tmp_path):
+    """A crawl that yields no Pages exits 1 but still prints an `empty` summary object."""
+    # The seed is port 1 on localhost; the test expects nothing to be fetched from it.
+    result = run_getdocs(
+        "crawl", "http://127.0.0.1:1/none", "-o", str(tmp_path),
+        "--summary-json", "--no-clone-source",
+    )
+    assert result.returncode == 1
+    summary = json.loads(result.stdout)
+    assert summary["outcome"] == "crawled"
+    assert summary["status"] == "empty"
+    assert summary["pages"] == 0
+def test_stderr_line_and_json_carry_the_same_facts(site, tmp_path):
+    """The stderr line and the JSON object of one crawl agree on page count and truncation."""
+    site.add("/docs/auth", FIXTURE_HTML)
+    seed = f"{site.url}/docs/auth"
+    result = run_getdocs(
+        "crawl", seed, "-o", str(tmp_path), "--no-clone-source", "--summary-json"
+    )
+    assert result.returncode == 0, result.stderr
+    summary = json.loads(result.stdout)
+    assert f"crawled {summary['pages']} Pages" in result.stderr
+    # The "[truncated]" marker is present exactly when the JSON flag is true.
+    assert ("[truncated]" in result.stderr) == summary["truncated"]
+def test_resume_run_produces_a_crawled_summary(site, tmp_path):
+    """A --resume run after a capped first run also reports a `crawled` Outcome."""
+    site.add("/docs/", page("Home", '<h1>Home</h1><a href="/docs/p0">start</a>'))
+    for i in range(4):
+        link = f'<a href="/docs/p{i + 1}">next</a>' if i + 1 < 4 else ""
+        site.add(f"/docs/p{i}", page(f"P{i}", f"<h1>P{i}</h1>{link}"))
+    # First run stops at --limit 2, leaving work for the resumed run.
+    first = run_getdocs(
+        "crawl", f"{site.url}/docs/", "-o", str(tmp_path),
+        "--limit", "2", "--no-clone-source",
+    )
+    assert first.returncode == 0, first.stderr
+    # The resumed run is given no seed URL, only the output directory.
+    second = run_getdocs("crawl", "--resume", "-o", str(tmp_path), "--summary-json")
+    assert second.returncode == 0, second.stderr
+    summary = json.loads(second.stdout)
+    assert summary["outcome"] == "crawled"
+    assert "getdocs: crawled" in second.stderr