afs-connector-sdk 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,41 @@
1
+ # --- OS / editor ---
2
+ .DS_Store
3
+ Thumbs.db
4
+ *.swp
5
+ .idea/
6
+ .vscode/
7
+
8
+ # --- Python (packages land in M0+) ---
9
+ __pycache__/
10
+ *.py[cod]
11
+ .venv/
12
+ venv/
13
+ .uv/
14
+ *.egg-info/
15
+ .ruff_cache/
16
+ .pytest_cache/
17
+ .mypy_cache/
18
+ .ty_cache/
19
+ dist/
20
+ build/
21
+
22
+ # --- Node (workers/mcp-edge lands later) ---
23
+ node_modules/
24
+ npm-debug.log*
25
+
26
+ # --- Secrets / local env ---
27
+ .env
28
+ .env.*
29
+ !.env.example
30
+ *.secret.*
31
+
32
+ # --- Terraform ---
33
+ # Detailed Terraform ignores live in terraform/.gitignore; these catch any
34
+ # stray state/plan artifacts produced outside that tree.
35
+ *.tfstate
36
+ *.tfstate.*
37
+ *.tfplan
38
+ .terraform/
39
+
40
+ # Agent worktrees (isolated background-agent checkouts)
41
+ .claude/worktrees/
@@ -0,0 +1,66 @@
1
+ Metadata-Version: 2.4
2
+ Name: afs-connector-sdk
3
+ Version: 0.1.0
4
+ Summary: agentic-fs connector SDK: crawl a source and ingest into agentic-fs. Ships the fs-crawler CLI plus Local FS and S3 connectors.
5
+ Project-URL: Homepage, https://github.com/vivekkhimani/agentic-fs
6
+ Project-URL: Repository, https://github.com/vivekkhimani/agentic-fs
7
+ Project-URL: Issues, https://github.com/vivekkhimani/agentic-fs/issues
8
+ Author-email: Vivek Khimani <vivekkhimani07@gmail.com>
9
+ License-Expression: Apache-2.0
10
+ Keywords: agentic-fs,agents,connector,etl,ingestion,s3
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: License :: OSI Approved :: Apache Software License
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Typing :: Typed
15
+ Requires-Python: >=3.12
16
+ Requires-Dist: afs-core
17
+ Requires-Dist: httpx>=0.27
18
+ Provides-Extra: aws
19
+ Requires-Dist: boto3>=1.34; extra == 'aws'
20
+ Description-Content-Type: text/markdown
21
+
22
+ # afs-connector-sdk
23
+
24
+ Crawl a source and ingest its documents into [agentic-fs](https://github.com/vivekkhimani/agentic-fs).
25
+ Ships the `fs-crawler` CLI plus **Local FS** and **S3** connectors.
26
+
27
+ ```bash
28
+ pip install afs-connector-sdk # Local FS connector + unauthenticated/bearer APIs
29
+ pip install "afs-connector-sdk[aws]" # adds the S3 connector + SigV4 signing for AWS_IAM Function URLs
30
+ ```
31
+
32
+ ## CLI
33
+
34
+ ```bash
35
+ # Crawl a local folder into a dev server
36
+ fs-crawler --connector local --source ./docs \
37
+ --api-url http://localhost:8080 --namespace docs
38
+
39
+ # Crawl an S3 prefix into the deployed (AWS_IAM) Function URL
40
+ fs-crawler --connector s3 --source s3://my-bucket/reports/ \
41
+ --api-url "$FUNCTION_URL" --namespace reports --auth sigv4 --region us-east-1
42
+
43
+ # Mirror exactly (also delete docs no longer at the source)
44
+ fs-crawler --connector local --source ./docs --api-url "$URL" --namespace docs --prune
45
+ ```
46
+
47
+ Re-runs are cheap and idempotent: a document is skipped unless its content
48
+ checksum differs from what agentic-fs already has, so nothing is re-extracted
49
+ needlessly.
50
+
51
+ ## Library
52
+
53
+ ```python
54
+ from afs_connector_sdk import IngestClient, SyncEngine, SigV4Signer, build_connector
55
+
56
+ connector = build_connector("local", "./docs")
57
+ async with IngestClient(api_url, signer=SigV4Signer(region="us-east-1")) as client:
58
+ report = await SyncEngine(client).sync(connector, namespace="docs")
59
+ ```
60
+
61
+ ## Writing a connector
62
+
63
+ Implement `afs_core.contracts.Connector` (`discover()` → `SourceItem`s, `fetch(item)` →
64
+ bytes), certify it against `afs_core.testing.ConnectorConformance`, and register an
65
+ `afs.connectors` entry point. Source-side auth lives in your connector; the SDK
66
+ handles everything else. See [the connector swap guide](https://github.com/vivekkhimani/agentic-fs/blob/HEAD/docs/swap-guides/connectors.md).
@@ -0,0 +1,45 @@
1
+ # afs-connector-sdk
2
+
3
+ Crawl a source and ingest its documents into [agentic-fs](https://github.com/vivekkhimani/agentic-fs).
4
+ Ships the `fs-crawler` CLI plus **Local FS** and **S3** connectors.
5
+
6
+ ```bash
7
+ pip install afs-connector-sdk # Local FS connector + unauthenticated/bearer APIs
8
+ pip install "afs-connector-sdk[aws]" # adds the S3 connector + SigV4 signing for AWS_IAM Function URLs
9
+ ```
10
+
11
+ ## CLI
12
+
13
+ ```bash
14
+ # Crawl a local folder into a dev server
15
+ fs-crawler --connector local --source ./docs \
16
+ --api-url http://localhost:8080 --namespace docs
17
+
18
+ # Crawl an S3 prefix into the deployed (AWS_IAM) Function URL
19
+ fs-crawler --connector s3 --source s3://my-bucket/reports/ \
20
+ --api-url "$FUNCTION_URL" --namespace reports --auth sigv4 --region us-east-1
21
+
22
+ # Mirror exactly (also delete docs no longer at the source)
23
+ fs-crawler --connector local --source ./docs --api-url "$URL" --namespace docs --prune
24
+ ```
25
+
26
+ Re-runs are cheap and idempotent: a document is skipped unless its content
27
+ checksum differs from what agentic-fs already has, so nothing is re-extracted
28
+ needlessly.
29
+
30
+ ## Library
31
+
32
+ ```python
33
+ from afs_connector_sdk import IngestClient, SyncEngine, SigV4Signer, build_connector
34
+
35
+ connector = build_connector("local", "./docs")
36
+ async with IngestClient(api_url, signer=SigV4Signer(region="us-east-1")) as client:
37
+ report = await SyncEngine(client).sync(connector, namespace="docs")
38
+ ```
39
+
40
+ ## Writing a connector
41
+
42
+ Implement `afs_core.contracts.Connector` (`discover()` → `SourceItem`s, `fetch(item)` →
43
+ bytes), certify it against `afs_core.testing.ConnectorConformance`, and register an
44
+ `afs.connectors` entry point. Source-side auth lives in your connector; the SDK
45
+ handles everything else. See [the connector swap guide](https://github.com/vivekkhimani/agentic-fs/blob/HEAD/docs/swap-guides/connectors.md).
@@ -0,0 +1,49 @@
1
+ [project]
2
+ name = "afs-connector-sdk"
3
+ version = "0.1.0"
4
+ description = "agentic-fs connector SDK: crawl a source and ingest into agentic-fs. Ships the fs-crawler CLI plus Local FS and S3 connectors."
5
+ readme = "README.md"
6
+ requires-python = ">=3.12"
7
+ license = "Apache-2.0"
8
+ authors = [{ name = "Vivek Khimani", email = "vivekkhimani07@gmail.com" }]
9
+ keywords = [
10
+ "agentic-fs",
11
+ "agents",
12
+ "connector",
13
+ "ingestion",
14
+ "s3",
15
+ "etl",
16
+ ]
17
+ classifiers = [
18
+ "Development Status :: 3 - Alpha",
19
+ "License :: OSI Approved :: Apache Software License",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Typing :: Typed",
22
+ ]
23
+ dependencies = [
24
+ "afs-core",
25
+ "httpx>=0.27",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ # The S3 connector (boto3) and SigV4 signing for AWS_IAM Function URLs. The Local
30
+ # FS connector and unauthenticated/bearer endpoints need none of this.
31
+ aws = ["boto3>=1.34"]
32
+
33
+ [project.scripts]
34
+ fs-crawler = "afs_connector_sdk.cli:main"
35
+
36
+ [project.urls]
37
+ Homepage = "https://github.com/vivekkhimani/agentic-fs"
38
+ Repository = "https://github.com/vivekkhimani/agentic-fs"
39
+ Issues = "https://github.com/vivekkhimani/agentic-fs/issues"
40
+
41
+ [build-system]
42
+ requires = ["hatchling"]
43
+ build-backend = "hatchling.build"
44
+
45
+ [tool.hatch.build.targets.wheel]
46
+ packages = ["src/afs_connector_sdk"]
47
+
48
+ [tool.uv.sources]
49
+ afs-core = { workspace = true }
@@ -0,0 +1,22 @@
1
+ """agentic-fs connector SDK — crawl a source and ingest into agentic-fs.
2
+
3
+ Public surface: the ``IngestClient`` (HTTP), the ``SyncEngine`` (discover → skip
4
+ unchanged → ingest → prune), request signers, and ``build_connector``. The
5
+ ``fs-crawler`` CLI wires them together. Source-specific logic is a
6
+ `afs_core.contracts.Connector`; this package ships Local FS and S3.
7
+ """
8
+
9
+ from afs_connector_sdk.auth import NoAuth, RequestSigner, SigV4Signer
10
+ from afs_connector_sdk.client import IngestClient
11
+ from afs_connector_sdk.engine import SyncEngine, SyncReport
12
+ from afs_connector_sdk.registry import build_connector
13
+
14
+ __all__ = [
15
+ "IngestClient",
16
+ "NoAuth",
17
+ "RequestSigner",
18
+ "SigV4Signer",
19
+ "SyncEngine",
20
+ "SyncReport",
21
+ "build_connector",
22
+ ]
@@ -0,0 +1,61 @@
1
+ """How the connector authenticates **to the agentic-fs API** (not to the source).
2
+
3
+ A signer returns the extra headers a request needs. Source-side auth (reaching
4
+ S3 / Google Drive / SharePoint) lives inside each connector, not here. Two
5
+ signers ship today:
6
+
7
+ - ``NoAuth`` — local dev or an unauthenticated endpoint.
8
+ - ``SigV4Signer`` — the default AWS deployment, whose Function URL uses
9
+ ``AWS_IAM`` auth. Needs the ``[aws]`` extra. Bearer-token (OAuth) auth arrives
10
+ with the resource server.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from typing import Any, Protocol, runtime_checkable
16
+
17
+
18
@runtime_checkable
class RequestSigner(Protocol):
    """Structural interface for whatever authenticates a request to the API.

    Any object exposing a matching ``headers_for`` method qualifies; the
    protocol is runtime-checkable, so ``isinstance`` works on it.
    """

    def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
        """Return the headers to add so the API accepts the request (may be empty)."""
        ...
23
+
24
+
25
class NoAuth:
    """Signer that adds nothing — for local dev or an unauthenticated endpoint."""

    def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
        """Return a new empty header mapping; the request details are ignored."""
        no_headers: dict[str, str] = {}
        return no_headers
28
+
29
+
30
class SigV4Signer:
    """Signs requests with AWS SigV4 for an ``AWS_IAM`` Lambda Function URL.

    Credentials come from the standard AWS chain (env, profile, role) unless
    passed explicitly. The signer is built once and reused across requests.
    """

    def __init__(self, *, region: str, service: str = "lambda", credentials: Any = None) -> None:
        """Resolve credentials once and remember the signing scope.

        Args:
            region: AWS region used in the signature scope.
            service: AWS service name used in the signature scope.
            credentials: Credentials object to sign with; when ``None`` they
                are looked up through a botocore ``Session``.

        Raises:
            RuntimeError: If botocore is not installed (the ``[aws]`` extra is
                missing) or no credentials can be found.
        """
        try:
            from botocore.session import Session
        except ModuleNotFoundError as err:  # pragma: no cover - import guard
            raise RuntimeError(
                "SigV4 signing needs the optional extra: pip install 'afs-connector-sdk[aws]'"
            ) from err
        self._region = region
        self._service = service
        if credentials is None:
            credentials = Session().get_credentials()
        if credentials is None:
            raise RuntimeError(
                "no AWS credentials found — configure the standard AWS credential chain "
                "(env vars, a shared profile, or an instance/role)"
            )
        self._credentials = credentials

    def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
        """Return the SigV4 headers for one request.

        Args:
            method: HTTP method of the request being signed.
            url: The exact URL that will be sent.
            body: Request payload; an empty value is signed as ``b""``.

        Returns:
            The headers present on the signed botocore request, as a plain dict.
        """
        from botocore.auth import SigV4Auth
        from botocore.awsrequest import AWSRequest

        # Sign with one consistent snapshot. Refreshable credentials may rotate
        # between reads of access key / secret / token, which would yield a
        # signature built from mismatched parts. Objects without the method
        # (e.g. an already-frozen tuple) are used unchanged.
        credentials = self._credentials
        freeze = getattr(credentials, "get_frozen_credentials", None)
        if callable(freeze):
            credentials = freeze()

        request = AWSRequest(method=method, url=url, data=body or b"")
        SigV4Auth(credentials, self._service, self._region).add_auth(request)
        return dict(request.headers)
@@ -0,0 +1,90 @@
1
+ """``fs-crawler`` — crawl a source and ingest its documents into agentic-fs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import argparse
6
+ import asyncio
7
+ import sys
8
+ from collections.abc import Sequence
9
+
10
+ from afs_connector_sdk.client import IngestClient
11
+ from afs_connector_sdk.engine import SyncEngine, SyncReport
12
+ from afs_connector_sdk.registry import build_connector
13
+
14
+
15
+ def _parse_options(pairs: list[str]) -> dict[str, str]:
16
+ options: dict[str, str] = {}
17
+ for pair in pairs:
18
+ if "=" not in pair:
19
+ raise ValueError(f"--opt expects KEY=VALUE, got {pair!r}")
20
+ key, value = pair.split("=", 1)
21
+ options[key] = value
22
+ return options
23
+
24
+
25
+ def _build_parser() -> argparse.ArgumentParser:
26
+ parser = argparse.ArgumentParser(
27
+ prog="fs-crawler",
28
+ description="Crawl a source and ingest its documents into agentic-fs.",
29
+ )
30
+ add = parser.add_argument
31
+ add("--connector", default="local", help="connector name (local, s3, or a plugin)")
32
+ add("--source", required=True, help="connector source (a directory, or s3://bucket/prefix)")
33
+ add("--api-url", required=True, help="agentic-fs API base URL")
34
+ add("--namespace", required=True, help="target namespace")
35
+ add("--auth", choices=["none", "sigv4"], default="none", help="how to authenticate to the API")
36
+ add("--region", default="us-east-1", help="AWS region (for --auth sigv4)")
37
+ add("--concurrency", type=int, default=8, help="max documents in flight")
38
+ add("--prune", action="store_true", help="delete agentic-fs docs no longer at the source")
39
+ add("--dry-run", action="store_true", help="report what would change without writing")
40
+ parser.add_argument(
41
+ "--opt",
42
+ action="append",
43
+ default=[],
44
+ metavar="KEY=VALUE",
45
+ help="connector-specific option (repeatable)",
46
+ )
47
+ return parser
48
+
49
+
50
def main(argv: Sequence[str] | None = None) -> int:
    """Run the ``fs-crawler`` CLI and return its process exit code.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` when the sync reported no failures, ``1`` when at least one
        document failed, ``2`` for a setup error (bad ``--opt``, unknown
        connector, invalid ``--concurrency``, missing extra or credentials).
    """
    args = _build_parser().parse_args(argv)

    # The engine gates work on a semaphore sized by this value; with zero slots
    # no document would ever be admitted and the run would hang.
    if args.concurrency < 1:
        print(f"error: --concurrency must be >= 1, got {args.concurrency}", file=sys.stderr)
        return 2

    try:
        connector = build_connector(args.connector, args.source, **_parse_options(args.opt))
    except (ValueError, RuntimeError, TypeError) as err:
        # TypeError covers an --opt key the chosen connector does not accept.
        print(f"error: {err}", file=sys.stderr)
        return 2

    signer = None
    if args.auth == "sigv4":
        # Imported only when requested, alongside the optional AWS extra.
        from afs_connector_sdk.auth import SigV4Signer

        try:
            signer = SigV4Signer(region=args.region)
        except RuntimeError as err:
            print(f"error: {err}", file=sys.stderr)
            return 2

    report = asyncio.run(_run(connector, args, signer))
    tag = " (dry-run)" if args.dry_run else ""
    print(
        f"{args.connector} -> {args.namespace}{tag}: "
        f"ingested={report.ingested} skipped={report.skipped} "
        f"deleted={report.deleted} failed={report.failed}"
    )
    # Cap the error listing, but say so when lines were left out.
    shown = report.errors[:20]
    for line in shown:
        print(f"  ! {line}", file=sys.stderr)
    omitted = len(report.errors) - len(shown)
    if omitted > 0:
        print(f"  ! ... and {omitted} more", file=sys.stderr)
    return 1 if report.failed else 0
79
+
80
+
81
async def _run(connector: object, args: argparse.Namespace, signer: object) -> SyncReport:
    """Open an API client, sync ``connector`` into ``args.namespace``, return the report."""
    engine_options = {
        "concurrency": args.concurrency,
        "prune": args.prune,
        "dry_run": args.dry_run,
    }
    async with IngestClient(args.api_url, signer=signer) as api:  # type: ignore[arg-type]
        engine = SyncEngine(api, **engine_options)
        report = await engine.sync(connector, args.namespace)  # type: ignore[arg-type]
        return report
87
+
88
+
89
# Script entry point: hand main()'s return value to the interpreter as the exit status.
if __name__ == "__main__":
    sys.exit(main())
@@ -0,0 +1,99 @@
1
+ """Async HTTP client for the agentic-fs ingest + read API.
2
+
3
+ Signing is done by building the httpx request first, then signing its *final*
4
+ URL and attaching the result — so the bytes signed are exactly the bytes sent.
5
+ That avoids the SigV4 query-encoding mismatch you hit when the signed URL and
6
+ the transmitted URL disagree on how a path like ``a/b.md`` is escaped.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from types import TracebackType
12
+ from typing import Any
13
+
14
+ import httpx
15
+
16
+ from afs_connector_sdk.auth import NoAuth, RequestSigner
17
+
18
+
19
class IngestClient:
    """Async client for the agentic-fs ingest and read endpoints.

    Each request is built first and then passed to the signer, so the signer
    is given the final URL of the very request object that gets sent. Use it
    as an async context manager, or call ``aclose()`` when finished.
    """

    def __init__(
        self, base_url: str, *, signer: RequestSigner | None = None, timeout: float = 30.0
    ) -> None:
        """Store the API base (trailing ``/`` removed) and open the HTTP client.

        A missing signer falls back to ``NoAuth``.
        """
        self._base = base_url.rstrip("/")
        self._signer = signer if signer else NoAuth()
        self._http = httpx.AsyncClient(timeout=timeout)

    async def __aenter__(self) -> IngestClient:
        """Enter the context; the client itself is the managed value."""
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        """Close the HTTP client on exit; exceptions are not suppressed."""
        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._http.aclose()

    async def _send(
        self,
        method: str,
        path: str,
        *,
        params: dict[str, Any] | None = None,
        content: bytes | None = None,
        content_type: str | None = None,
    ) -> httpx.Response:
        """Build a request, add the signer's headers, send it, return the response."""
        headers: dict[str, str] = {}
        if content_type:
            headers["content-type"] = content_type
        request = self._http.build_request(
            method, self._base + path, params=params, content=content, headers=headers
        )
        # Sign after building so the signer sees the request's final URL.
        auth_headers = self._signer.headers_for(
            method=request.method,
            url=str(request.url),
            body=content if content else b"",
        )
        request.headers.update(auth_headers)
        return await self._http.send(request)

    async def put_document(
        self, namespace: str, path: str, data: bytes, *, content_type: str | None = None
    ) -> dict[str, Any]:
        """PUT ``data`` as the document at ``path``; return the decoded JSON reply.

        Raises the HTTP client's status error on an error response.
        """
        endpoint = f"/v1/ingest/{namespace}/doc"
        response = await self._send(
            "PUT", endpoint, params={"path": path}, content=data, content_type=content_type
        )
        response.raise_for_status()
        return response.json()

    async def stat(self, namespace: str, path: str) -> dict[str, Any] | None:
        """Return the stat JSON for ``path``, or ``None`` when the API answers 404."""
        response = await self._send("GET", f"/v1/fs/{namespace}/stat", params={"path": path})
        if response.status_code != httpx.codes.NOT_FOUND:
            response.raise_for_status()
            return response.json()
        return None

    async def delete_document(self, namespace: str, path: str) -> None:
        """Delete the document at ``path``; 200, 202 and 204 are accepted as-is."""
        response = await self._send(
            "DELETE", f"/v1/ingest/{namespace}/doc", params={"path": path}
        )
        if response.status_code in (200, 202, 204):
            return
        response.raise_for_status()

    async def list_paths(self, namespace: str, prefix: str = "") -> list[str]:
        """Collect every entry path under ``prefix``, following ``next_cursor`` pages."""
        endpoint = f"/v1/fs/{namespace}/entries"
        collected: list[str] = []
        token: str | None = None
        more = True
        while more:
            query: dict[str, Any] = {"prefix": prefix, "limit": 200}
            if token:
                query["cursor"] = token
            response = await self._send("GET", endpoint, params=query)
            response.raise_for_status()
            payload = response.json()
            for entry in payload.get("items", []):
                collected.append(entry["path"])
            token = payload.get("next_cursor")
            # A missing or empty cursor marks the last page.
            more = bool(token)
        return collected
@@ -0,0 +1 @@
1
+ """Builtin connectors. Each is a small `afs_core.contracts.Connector` impl."""
@@ -0,0 +1,46 @@
1
+ """Local filesystem connector — crawl a directory tree.
2
+
3
+ Zero dependencies; the reference connector and the easiest way to dogfood the
4
+ whole ingest path (point it at a folder of docs and read them back over MCP).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import mimetypes
10
+ from collections.abc import Iterator
11
+ from pathlib import Path
12
+
13
+ from afs_core.models import SourceItem
14
+
15
+
16
class LocalConnector:
    """Connector that walks a local directory and serves file bytes from disk."""

    name = "local"

    def __init__(self, source: str, *, follow_symlinks: str = "false") -> None:
        """Resolve ``source`` to an absolute directory and parse the symlink option.

        Raises:
            ValueError: If the resolved ``source`` is not a directory.
        """
        root = Path(source).expanduser().resolve()
        if not root.is_dir():
            raise ValueError(f"source is not a directory: {root}")
        self._root = root
        # Options arrive as strings from the CLI (`--opt follow_symlinks=true`).
        truthy = {"1", "true", "yes"}
        self._follow = str(follow_symlinks).lower() in truthy

    def discover(self) -> Iterator[SourceItem]:
        """Yield one ``SourceItem`` per visible regular file, in sorted path order."""
        for candidate in sorted(self._root.rglob("*")):
            relative = candidate.relative_to(self._root)
            # Hidden files and anything under a dot-directory (.git, .DS_Store, …)
            # are skipped, as are symlinks unless following was requested, and
            # anything that is not a file.
            if (
                any(part.startswith(".") for part in relative.parts)
                or (candidate.is_symlink() and not self._follow)
                or not candidate.is_file()
            ):
                continue
            info = candidate.stat()
            guessed_type, _encoding = mimetypes.guess_type(candidate.name)
            yield SourceItem(
                path=relative.as_posix(),
                locator=str(candidate),
                size=info.st_size,
                content_type=guessed_type,
                version=f"mtime:{int(info.st_mtime)}:{info.st_size}",
            )

    def fetch(self, item: SourceItem) -> bytes:
        """Read and return the bytes of the file named by ``item.locator``."""
        target = Path(item.locator)
        return target.read_bytes()
@@ -0,0 +1,59 @@
1
+ """S3 connector — crawl an ``s3://bucket/prefix`` of documents.
2
+
3
+ Source-side auth is the standard boto3 chain (env / profile / role), so reading
4
+ from S3 needs no special handling here — that's the connector pattern: each
5
+ source owns its own auth. Needs the ``[aws]`` extra (boto3).
6
+
7
+ Pass a prefix ending in ``/`` for folder semantics (``s3://bucket/docs/``); the
8
+ prefix is stripped from each key to form the agentic-fs path.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from collections.abc import Iterator
14
+ from typing import Any
15
+ from urllib.parse import urlparse
16
+
17
+ from afs_core.models import SourceItem
18
+
19
+
20
class S3Connector:
    """Connector that lists objects under an ``s3://bucket/prefix`` and fetches them."""

    name = "s3"

    def __init__(
        self, source: str, *, endpoint_url: str | None = None, region: str | None = None
    ) -> None:
        """Parse the S3 URL and create the boto3 client.

        Raises:
            ValueError: If ``source`` is not an ``s3://`` URL with a bucket.
            RuntimeError: If boto3 is not installed (the ``[aws]`` extra is missing).
        """
        location = urlparse(source)
        if not (location.scheme == "s3" and location.netloc):
            raise ValueError(f"source must be s3://bucket/prefix, got {source!r}")
        self._bucket = location.netloc
        self._prefix = location.path.lstrip("/")
        # boto3 is imported only after the URL checks, so a bad source is
        # reported even when the optional extra is not installed.
        try:
            import boto3
        except ModuleNotFoundError as err:  # pragma: no cover - import guard
            raise RuntimeError(
                "the s3 connector needs the optional extra: pip install 'afs-connector-sdk[aws]'"
            ) from err
        self._s3: Any = boto3.client("s3", endpoint_url=endpoint_url, region_name=region)

    def discover(self) -> Iterator[SourceItem]:
        """Yield a ``SourceItem`` for every non-placeholder object under the prefix."""
        pages = self._s3.get_paginator("list_objects_v2").paginate(
            Bucket=self._bucket, Prefix=self._prefix
        )
        prefix_length = len(self._prefix)
        for page in pages:
            for entry in page.get("Contents", []):
                key = entry["Key"]
                if key.endswith("/"):
                    continue  # skip "folder" placeholder objects
                # Drop the configured prefix, then any leading slash left behind.
                relative = key[prefix_length:].lstrip("/")
                if not relative:
                    continue
                etag = (entry.get("ETag") or "").strip('"')
                yield SourceItem(
                    path=relative,
                    locator=key,
                    size=entry.get("Size"),
                    version=etag or None,
                )

    def fetch(self, item: SourceItem) -> bytes:
        """Download and return the full body of the object at ``item.locator``."""
        response = self._s3.get_object(Bucket=self._bucket, Key=item.locator)
        body = response["Body"]
        return body.read()
@@ -0,0 +1,92 @@
1
+ """The source-agnostic sync engine.
2
+
3
+ For each item a connector discovers: fetch the bytes, and skip the (expensive)
4
+ ingest if the document is already present unchanged — decided by comparing the
5
+ content checksum against the catalog's, so re-runs are cheap and idempotent and
6
+ nothing is re-extracted needlessly. With ``prune``, documents that have vanished
7
+ from the source are deleted from agentic-fs (the source stays the source of
8
+ truth).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import asyncio
14
+ import hashlib
15
+ from dataclasses import dataclass, field
16
+ from typing import TYPE_CHECKING
17
+
18
+ if TYPE_CHECKING:
19
+ from afs_connector_sdk.client import IngestClient
20
+ from afs_core.contracts import Connector
21
+ from afs_core.models import SourceItem
22
+
23
+
24
@dataclass
class SyncReport:
    """Running tally of one sync: per-outcome counters plus error messages."""

    # Counters, all starting at zero.
    ingested: int = 0
    skipped: int = 0
    deleted: int = 0
    failed: int = 0
    # One message per recorded problem; each report gets its own list.
    errors: list[str] = field(default_factory=list)
31
+
32
+
33
+ class SyncEngine:
34
+ def __init__(
35
+ self,
36
+ client: IngestClient,
37
+ *,
38
+ concurrency: int = 8,
39
+ prune: bool = False,
40
+ dry_run: bool = False,
41
+ ) -> None:
42
+ self._client = client
43
+ self._sem = asyncio.Semaphore(concurrency)
44
+ self._prune = prune
45
+ self._dry = dry_run
46
+
47
+ async def sync(self, connector: Connector, namespace: str) -> SyncReport:
48
+ report = SyncReport()
49
+ items = list(connector.discover())
50
+ await asyncio.gather(*(self._process(connector, namespace, it, report) for it in items))
51
+ if self._prune:
52
+ await self._prune_missing(namespace, {it.path for it in items}, report)
53
+ return report
54
+
55
+ async def _process(
56
+ self, connector: Connector, namespace: str, item: SourceItem, report: SyncReport
57
+ ) -> None:
58
+ async with self._sem:
59
+ try:
60
+ data = await asyncio.to_thread(connector.fetch, item)
61
+ checksum = hashlib.sha256(data).hexdigest()
62
+ existing = await self._client.stat(namespace, item.path)
63
+ if existing and existing.get("checksum") == checksum:
64
+ report.skipped += 1
65
+ return
66
+ if not self._dry:
67
+ await self._client.put_document(
68
+ namespace, item.path, data, content_type=item.content_type
69
+ )
70
+ report.ingested += 1
71
+ except Exception as err:
72
+ report.failed += 1
73
+ report.errors.append(f"{item.path}: {err}")
74
+
75
+ async def _prune_missing(self, namespace: str, seen: set[str], report: SyncReport) -> None:
76
+ try:
77
+ existing = await self._client.list_paths(namespace)
78
+ except Exception as err:
79
+ report.errors.append(f"prune-list: {err}")
80
+ return
81
+ for path in existing:
82
+ if path in seen:
83
+ continue
84
+ if self._dry:
85
+ report.deleted += 1
86
+ continue
87
+ try:
88
+ await self._client.delete_document(namespace, path)
89
+ report.deleted += 1
90
+ except Exception as err:
91
+ report.failed += 1
92
+ report.errors.append(f"delete {path}: {err}")
File without changes
@@ -0,0 +1,40 @@
1
+ """Connector registry — builtins + the ``afs.connectors`` entry-point group.
2
+
3
+ Same pattern as the store and normalizer registries: pick a connector by name.
4
+ Third-party connectors (Google Drive, SharePoint, …) register an entry point
5
+ whose value is a callable ``(source, **options) -> Connector``; they need no
6
+ change here.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from importlib.metadata import entry_points
12
+ from typing import TYPE_CHECKING
13
+
14
+ if TYPE_CHECKING:
15
+ from afs_core.contracts import Connector
16
+
17
# Entry-point group that build_connector() scans for connectors that are not builtins.
_ENTRY_GROUP = "afs.connectors"
18
+
19
+
20
def _builtins() -> dict[str, object]:
    """Return the connectors shipped with this package, keyed by connector name."""
    # Imported lazily so a connector's optional deps (boto3 for s3) aren't needed
    # just to load the registry or use a different connector.
    from afs_connector_sdk.connectors.local import LocalConnector
    from afs_connector_sdk.connectors.s3 import S3Connector

    shipped: dict[str, object] = {}
    shipped["local"] = LocalConnector
    shipped["s3"] = S3Connector
    return shipped
27
+
28
+
29
def build_connector(name: str, source: str, **options: str) -> Connector:
    """Construct a connector by name over ``source`` (with connector-specific options).

    Builtins are looked up first; otherwise the first entry point in the
    ``afs.connectors`` group with a matching name is loaded.

    Raises:
        ValueError: If no builtin or entry point provides ``name``.
    """
    factory = _builtins().get(name)
    if factory is None:
        matching = (ep for ep in entry_points(group=_ENTRY_GROUP) if ep.name == name)
        plugin = next(matching, None)
        if plugin is not None:
            factory = plugin.load()
    if factory is None:
        builtin_names = sorted(_builtins())
        plugin_names = [ep.name for ep in entry_points(group=_ENTRY_GROUP)]
        available = builtin_names + plugin_names
        raise ValueError(f"unknown connector {name!r}; available: {', '.join(available)}")
    return factory(source, **options)  # type: ignore[operator]
@@ -0,0 +1,74 @@
1
+ """IngestClient HTTP behavior + the sign-the-final-URL property (via MockTransport)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import httpx
6
+
7
+ from afs_connector_sdk.client import IngestClient
8
+
9
+
10
+ class _CaptureSigner:
11
+ def __init__(self) -> None:
12
+ self.signed_urls: list[str] = []
13
+
14
+ def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
15
+ self.signed_urls.append(url)
16
+ return {"authorization": "SIGNED"}
17
+
18
+
19
def _client_with(handler, signer=None) -> IngestClient:
    """Build an IngestClient for ``http://api.test`` whose traffic goes to ``handler``."""
    mock_transport = httpx.MockTransport(handler)
    client = IngestClient("http://api.test", signer=signer)
    # Swap the real HTTP client for one backed by the mock transport.
    client._http = httpx.AsyncClient(transport=mock_transport)
    return client
23
+
24
+
25
async def test_put_signs_the_exact_url_it_sends() -> None:
    """A PUT carries the signer's header and is signed over the URL actually sent."""
    observed: dict[str, str] = {}

    def handler(request: httpx.Request) -> httpx.Response:
        observed.update(
            url=str(request.url),
            auth=request.headers.get("authorization", ""),
            ctype=request.headers.get("content-type", ""),
        )
        return httpx.Response(201, json={"path": "a/b.md", "checksum": "abc"})

    capture = _CaptureSigner()
    client = _client_with(handler, capture)
    result = await client.put_document("ns", "a/b.md", b"hello", content_type="text/markdown")
    await client.aclose()

    assert result["checksum"] == "abc"
    assert observed["auth"] == "SIGNED"
    assert observed["ctype"] == "text/markdown"
    # The signer must see the byte-identical URL the transport sent (no
    # re-encoding between signing and sending) — the crux of SigV4 over query paths.
    assert capture.signed_urls == [observed["url"]]
45
+
46
+
47
async def test_stat_404_is_none() -> None:
    """``stat`` maps a 404 reply to ``None`` instead of raising."""

    def not_found(request: httpx.Request) -> httpx.Response:
        return httpx.Response(404, json={"detail": "nope"})

    client = _client_with(not_found)
    outcome = await client.stat("ns", "missing.md")
    assert outcome is None
    await client.aclose()
51
+
52
+
53
async def test_list_paths_follows_pagination() -> None:
    """``list_paths`` keeps requesting pages until ``next_cursor`` is empty."""
    first_page = {"items": [{"path": "a.md"}], "next_cursor": "c1"}
    last_page = {"items": [{"path": "b.md"}], "next_cursor": None}

    def handler(request: httpx.Request) -> httpx.Response:
        on_second_page = request.url.params.get("cursor") == "c1"
        return httpx.Response(200, json=last_page if on_second_page else first_page)

    client = _client_with(handler)
    listed = await client.list_paths("ns")
    assert listed == ["a.md", "b.md"]
    await client.aclose()
62
+
63
+
64
async def test_no_auth_adds_no_authorization_header() -> None:
    """With the default signer, no ``authorization`` header reaches the transport."""
    captured: dict[str, str | None] = {}

    def handler(request: httpx.Request) -> httpx.Response:
        captured["auth"] = request.headers.get("authorization")
        return httpx.Response(202)

    client = _client_with(handler)  # default NoAuth
    await client.delete_document("ns", "a.md")
    await client.aclose()
    assert captured["auth"] is None
@@ -0,0 +1,94 @@
1
+ """The sync engine's decisions: ingest new, skip unchanged, prune, dry-run, errors."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+
7
+ from afs_connector_sdk.engine import SyncEngine
8
+ from afs_core.models import SourceItem
9
+
10
+
11
+ class _FakeClient:
12
+ """Stands in for IngestClient. `existing` maps path -> stored checksum."""
13
+
14
+ def __init__(self, existing: dict[str, str] | None = None) -> None:
15
+ self.existing = existing or {}
16
+ self.put: list[tuple[str, bytes]] = []
17
+ self.deleted: list[str] = []
18
+
19
+ async def stat(self, namespace: str, path: str) -> dict[str, str] | None:
20
+ checksum = self.existing.get(path)
21
+ return {"checksum": checksum} if checksum is not None else None
22
+
23
+ async def put_document(
24
+ self, namespace: str, path: str, data: bytes, *, content_type: str | None = None
25
+ ) -> dict[str, str]:
26
+ self.put.append((path, data))
27
+ return {}
28
+
29
+ async def list_paths(self, namespace: str, prefix: str = "") -> list[str]:
30
+ return list(self.existing)
31
+
32
+ async def delete_document(self, namespace: str, path: str) -> None:
33
+ self.deleted.append(path)
34
+
35
+
36
+ class _FakeConnector:
37
+ name = "fake"
38
+
39
+ def __init__(self, items: dict[str, bytes], *, broken: set[str] | None = None) -> None:
40
+ self._items = items
41
+ self._broken = broken or set()
42
+
43
+ def discover(self) -> list[SourceItem]:
44
+ return [SourceItem(path=p, locator=p) for p in self._items]
45
+
46
+ def fetch(self, item: SourceItem) -> bytes:
47
+ if item.locator in self._broken:
48
+ raise OSError("unreadable")
49
+ return self._items[item.locator]
50
+
51
+
52
+ def _sha(data: bytes) -> str:
53
+ return hashlib.sha256(data).hexdigest()
54
+
55
+
56
async def test_ingests_new_documents() -> None:
    """Documents unknown to the API are all ingested."""
    api = _FakeClient()
    source = _FakeConnector({"a.md": b"alpha", "sub/b.txt": b"beta"})
    report = await SyncEngine(api).sync(source, "ns")
    assert report.ingested == 2 and report.skipped == 0
    written_paths = {path for path, _data in api.put}
    assert written_paths == {"a.md", "sub/b.txt"}
62
+
63
+
64
async def test_skips_unchanged_by_checksum() -> None:
    """A document whose stored checksum matches is skipped; a new one is ingested."""
    api = _FakeClient(existing={"a.md": _sha(b"alpha")})
    source = _FakeConnector({"a.md": b"alpha", "b.md": b"new"})
    report = await SyncEngine(api).sync(source, "ns")
    assert report.skipped == 1 and report.ingested == 1
    written_paths = [path for path, _data in api.put]
    assert written_paths == ["b.md"]
70
+
71
+
72
async def test_dry_run_writes_nothing() -> None:
    """Dry-run still counts the would-be ingest but performs no write."""
    api = _FakeClient()
    source = _FakeConnector({"a.md": b"alpha"})
    engine = SyncEngine(api, dry_run=True)
    report = await engine.sync(source, "ns")
    assert report.ingested == 1
    assert api.put == []
78
+
79
+
80
async def test_prune_deletes_documents_absent_from_source() -> None:
    """With prune on, an API document missing from the source is deleted."""
    api = _FakeClient(existing={"gone.md": _sha(b"old")})
    source = _FakeConnector({"a.md": b"alpha"})
    engine = SyncEngine(api, prune=True)
    report = await engine.sync(source, "ns")
    assert report.ingested == 1 and report.deleted == 1
    assert api.deleted == ["gone.md"]
86
+
87
+
88
async def test_one_bad_document_does_not_abort_the_crawl() -> None:
    """A fetch failure is recorded as failed while other documents still ingest."""
    api = _FakeClient()
    source = _FakeConnector({"good.md": b"ok", "bad.md": b"x"}, broken={"bad.md"})
    report = await SyncEngine(api).sync(source, "ns")
    assert report.ingested == 1 and report.failed == 1
    written_paths = [path for path, _data in api.put]
    assert written_paths == ["good.md"]
    assert any("bad.md" in message for message in report.errors)
@@ -0,0 +1,53 @@
1
+ """Local FS connector — certified against the afs-core kit, plus its specifics."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pytest
8
+
9
+ from afs_connector_sdk.connectors.local import LocalConnector
10
+ from afs_core.testing import ConnectorConformance
11
+
12
+
13
+ def _populate(root: Path) -> None:
14
+ (root / "a.md").write_text("alpha")
15
+ (root / "sub").mkdir()
16
+ (root / "sub" / "b.txt").write_text("beta beta")
17
+
18
+
19
class TestLocalConnector(ConnectorConformance):
    """Runs the afs-core connector conformance kit against the Local FS connector."""

    @pytest.fixture
    def connector(self, tmp_path: Path) -> LocalConnector:
        """A connector rooted at a freshly populated temporary directory."""
        _populate(tmp_path)
        root = str(tmp_path)
        return LocalConnector(root)
24
+
25
+
26
def test_discovers_nested_relative_paths(tmp_path: Path) -> None:
    """Discovered paths are relative to the root and use ``/`` separators."""
    _populate(tmp_path)
    connector = LocalConnector(str(tmp_path))
    discovered = {item.path for item in connector.discover()}
    assert discovered == {"a.md", "sub/b.txt"}
30
+
31
+
32
def test_skips_hidden_files_and_dot_dirs(tmp_path: Path) -> None:
    """Dot-files and everything under a dot-directory are left out of discovery."""
    _populate(tmp_path)
    hidden_file = tmp_path / ".secret"
    hidden_file.write_text("nope")
    git_dir = tmp_path / ".git"
    git_dir.mkdir()
    (git_dir / "config").write_text("nope")
    connector = LocalConnector(str(tmp_path))
    discovered = {item.path for item in connector.discover()}
    assert discovered == {"a.md", "sub/b.txt"}
39
+
40
+
41
def test_fetch_roundtrips_bytes(tmp_path: Path) -> None:
    """Fetching a discovered item returns the file's bytes; ``.md`` gets a markdown type."""
    _populate(tmp_path)
    connector = LocalConnector(str(tmp_path))
    by_path = (found for found in connector.discover() if found.path == "a.md")
    item = next(by_path)
    assert connector.fetch(item) == b"alpha"
    assert item.content_type == "text/markdown"
47
+
48
+
49
def test_rejects_non_directory(tmp_path: Path) -> None:
    """Pointing the connector at a regular file raises ``ValueError``."""
    plain_file = tmp_path / "file.txt"
    plain_file.write_text("x")
    with pytest.raises(ValueError, match="not a directory"):
        LocalConnector(str(plain_file))
@@ -0,0 +1,42 @@
1
+ """S3 connector — certified against the afs-core kit using moto."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Iterator
6
+
7
+ import boto3
8
+ import pytest
9
+ from moto import mock_aws
10
+
11
+ from afs_connector_sdk.connectors.s3 import S3Connector
12
+ from afs_core.testing import ConnectorConformance
13
+
14
+
15
@pytest.fixture
def s3_source() -> Iterator[str]:
    """Yield an ``s3://`` URL for a mocked bucket holding two docs and a placeholder."""
    seeded_objects = [
        ("reports/a.md", b"alpha"),
        ("reports/sub/b.txt", b"beta beta"),
        ("reports/", b""),  # folder placeholder
    ]
    with mock_aws():
        s3 = boto3.client("s3", region_name="us-east-1")
        s3.create_bucket(Bucket="docs")
        for key, body in seeded_objects:
            s3.put_object(Bucket="docs", Key=key, Body=body)
        yield "s3://docs/reports/"
24
+
25
+
26
class TestS3Connector(ConnectorConformance):
    """Runs the afs-core connector conformance kit against the S3 connector."""

    @pytest.fixture
    def connector(self, s3_source: str) -> S3Connector:
        """A connector over the mocked bucket prefix."""
        region = "us-east-1"
        return S3Connector(s3_source, region=region)
30
+
31
+
32
def test_strips_prefix_and_skips_placeholders(s3_source: str) -> None:
    """Paths drop the source prefix and the folder placeholder is not listed."""
    connector = S3Connector(s3_source, region="us-east-1")
    by_path = {}
    for found in connector.discover():
        by_path[found.path] = found
    assert set(by_path) == {"a.md", "sub/b.txt"}
    markdown_item = by_path["a.md"]
    assert connector.fetch(markdown_item) == b"alpha"
    assert markdown_item.version  # ETag carried as the change token
38
+
39
+
40
def test_rejects_non_s3_source() -> None:
    """A source that is not an ``s3://`` URL is rejected with ``ValueError``."""
    not_an_s3_url = "/local/path"
    with pytest.raises(ValueError, match="s3://bucket/prefix"):
        S3Connector(not_an_s3_url)