afs-connector-sdk 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- afs_connector_sdk-0.1.0/.gitignore +41 -0
- afs_connector_sdk-0.1.0/PKG-INFO +66 -0
- afs_connector_sdk-0.1.0/README.md +45 -0
- afs_connector_sdk-0.1.0/pyproject.toml +49 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/__init__.py +22 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/auth.py +61 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/cli.py +90 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/client.py +99 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/connectors/__init__.py +1 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/connectors/local.py +46 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/connectors/s3.py +59 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/engine.py +92 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/py.typed +0 -0
- afs_connector_sdk-0.1.0/src/afs_connector_sdk/registry.py +40 -0
- afs_connector_sdk-0.1.0/tests/test_client.py +74 -0
- afs_connector_sdk-0.1.0/tests/test_engine.py +94 -0
- afs_connector_sdk-0.1.0/tests/test_local_connector.py +53 -0
- afs_connector_sdk-0.1.0/tests/test_s3_connector.py +42 -0
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
# --- OS / editor ---
|
|
2
|
+
.DS_Store
|
|
3
|
+
Thumbs.db
|
|
4
|
+
*.swp
|
|
5
|
+
.idea/
|
|
6
|
+
.vscode/
|
|
7
|
+
|
|
8
|
+
# --- Python (packages land in M0+) ---
|
|
9
|
+
__pycache__/
|
|
10
|
+
*.py[cod]
|
|
11
|
+
.venv/
|
|
12
|
+
venv/
|
|
13
|
+
.uv/
|
|
14
|
+
*.egg-info/
|
|
15
|
+
.ruff_cache/
|
|
16
|
+
.pytest_cache/
|
|
17
|
+
.mypy_cache/
|
|
18
|
+
.ty_cache/
|
|
19
|
+
dist/
|
|
20
|
+
build/
|
|
21
|
+
|
|
22
|
+
# --- Node (workers/mcp-edge lands later) ---
|
|
23
|
+
node_modules/
|
|
24
|
+
npm-debug.log*
|
|
25
|
+
|
|
26
|
+
# --- Secrets / local env ---
|
|
27
|
+
.env
|
|
28
|
+
.env.*
|
|
29
|
+
!.env.example
|
|
30
|
+
*.secret.*
|
|
31
|
+
|
|
32
|
+
# --- Terraform ---
|
|
33
|
+
# Detailed Terraform ignores live in terraform/.gitignore; these catch any
|
|
34
|
+
# stray state/plan artifacts produced outside that tree.
|
|
35
|
+
*.tfstate
|
|
36
|
+
*.tfstate.*
|
|
37
|
+
*.tfplan
|
|
38
|
+
.terraform/
|
|
39
|
+
|
|
40
|
+
# Agent worktrees (isolated background-agent checkouts)
|
|
41
|
+
.claude/worktrees/
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: afs-connector-sdk
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: agentic-fs connector SDK: crawl a source and ingest into agentic-fs. Ships the fs-crawler CLI plus Local FS and S3 connectors.
|
|
5
|
+
Project-URL: Homepage, https://github.com/vivekkhimani/agentic-fs
|
|
6
|
+
Project-URL: Repository, https://github.com/vivekkhimani/agentic-fs
|
|
7
|
+
Project-URL: Issues, https://github.com/vivekkhimani/agentic-fs/issues
|
|
8
|
+
Author-email: Vivek Khimani <vivekkhimani07@gmail.com>
|
|
9
|
+
License-Expression: Apache-2.0
|
|
10
|
+
Keywords: agentic-fs,agents,connector,etl,ingestion,s3
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Typing :: Typed
|
|
15
|
+
Requires-Python: >=3.12
|
|
16
|
+
Requires-Dist: afs-core
|
|
17
|
+
Requires-Dist: httpx>=0.27
|
|
18
|
+
Provides-Extra: aws
|
|
19
|
+
Requires-Dist: boto3>=1.34; extra == 'aws'
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
|
|
22
|
+
# afs-connector-sdk
|
|
23
|
+
|
|
24
|
+
Crawl a source and ingest its documents into [agentic-fs](https://github.com/vivekkhimani/agentic-fs).
|
|
25
|
+
Ships the `fs-crawler` CLI plus **Local FS** and **S3** connectors.
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install afs-connector-sdk # Local FS connector + unauthenticated/bearer APIs
|
|
29
|
+
pip install "afs-connector-sdk[aws]" # adds the S3 connector + SigV4 signing for AWS_IAM Function URLs
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## CLI
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
# Crawl a local folder into a dev server
|
|
36
|
+
fs-crawler --connector local --source ./docs \
|
|
37
|
+
--api-url http://localhost:8080 --namespace docs
|
|
38
|
+
|
|
39
|
+
# Crawl an S3 prefix into the deployed (AWS_IAM) Function URL
|
|
40
|
+
fs-crawler --connector s3 --source s3://my-bucket/reports/ \
|
|
41
|
+
--api-url "$FUNCTION_URL" --namespace reports --auth sigv4 --region us-east-1
|
|
42
|
+
|
|
43
|
+
# Mirror exactly (also delete docs no longer at the source)
|
|
44
|
+
fs-crawler --connector local --source ./docs --api-url "$URL" --namespace docs --prune
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
Re-runs are cheap and idempotent: a document is skipped unless its content
|
|
48
|
+
checksum differs from what agentic-fs already has, so nothing is re-extracted
|
|
49
|
+
needlessly.
|
|
50
|
+
|
|
51
|
+
## Library
|
|
52
|
+
|
|
53
|
+
```python
|
|
54
|
+
from afs_connector_sdk import IngestClient, SyncEngine, SigV4Signer, build_connector
|
|
55
|
+
|
|
56
|
+
connector = build_connector("local", "./docs")
|
|
57
|
+
async with IngestClient(api_url, signer=SigV4Signer(region="us-east-1")) as client:
|
|
58
|
+
report = await SyncEngine(client).sync(connector, namespace="docs")
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Writing a connector
|
|
62
|
+
|
|
63
|
+
Implement `afs_core.contracts.Connector` (`discover()` → `SourceItem`s, `fetch(item)` →
|
|
64
|
+
bytes), certify it against `afs_core.testing.ConnectorConformance`, and register an
|
|
65
|
+
`afs.connectors` entry point. Source-side auth lives in your connector; the SDK
|
|
66
|
+
handles everything else. See [the connector swap guide](https://github.com/vivekkhimani/agentic-fs/blob/HEAD/docs/swap-guides/connectors.md).
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
# afs-connector-sdk
|
|
2
|
+
|
|
3
|
+
Crawl a source and ingest its documents into [agentic-fs](https://github.com/vivekkhimani/agentic-fs).
|
|
4
|
+
Ships the `fs-crawler` CLI plus **Local FS** and **S3** connectors.
|
|
5
|
+
|
|
6
|
+
```bash
|
|
7
|
+
pip install afs-connector-sdk # Local FS connector + unauthenticated/bearer APIs
|
|
8
|
+
pip install "afs-connector-sdk[aws]" # adds the S3 connector + SigV4 signing for AWS_IAM Function URLs
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## CLI
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
# Crawl a local folder into a dev server
|
|
15
|
+
fs-crawler --connector local --source ./docs \
|
|
16
|
+
--api-url http://localhost:8080 --namespace docs
|
|
17
|
+
|
|
18
|
+
# Crawl an S3 prefix into the deployed (AWS_IAM) Function URL
|
|
19
|
+
fs-crawler --connector s3 --source s3://my-bucket/reports/ \
|
|
20
|
+
--api-url "$FUNCTION_URL" --namespace reports --auth sigv4 --region us-east-1
|
|
21
|
+
|
|
22
|
+
# Mirror exactly (also delete docs no longer at the source)
|
|
23
|
+
fs-crawler --connector local --source ./docs --api-url "$URL" --namespace docs --prune
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
Re-runs are cheap and idempotent: a document is skipped unless its content
|
|
27
|
+
checksum differs from what agentic-fs already has, so nothing is re-extracted
|
|
28
|
+
needlessly.
|
|
29
|
+
|
|
30
|
+
## Library
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from afs_connector_sdk import IngestClient, SyncEngine, SigV4Signer, build_connector
|
|
34
|
+
|
|
35
|
+
connector = build_connector("local", "./docs")
|
|
36
|
+
async with IngestClient(api_url, signer=SigV4Signer(region="us-east-1")) as client:
|
|
37
|
+
report = await SyncEngine(client).sync(connector, namespace="docs")
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Writing a connector
|
|
41
|
+
|
|
42
|
+
Implement `afs_core.contracts.Connector` (`discover()` → `SourceItem`s, `fetch(item)` →
|
|
43
|
+
bytes), certify it against `afs_core.testing.ConnectorConformance`, and register an
|
|
44
|
+
`afs.connectors` entry point. Source-side auth lives in your connector; the SDK
|
|
45
|
+
handles everything else. See [the connector swap guide](https://github.com/vivekkhimani/agentic-fs/blob/HEAD/docs/swap-guides/connectors.md).
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "afs-connector-sdk"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "agentic-fs connector SDK: crawl a source and ingest into agentic-fs. Ships the fs-crawler CLI plus Local FS and S3 connectors."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
license = "Apache-2.0"
|
|
8
|
+
authors = [{ name = "Vivek Khimani", email = "vivekkhimani07@gmail.com" }]
|
|
9
|
+
keywords = [
|
|
10
|
+
"agentic-fs",
|
|
11
|
+
"agents",
|
|
12
|
+
"connector",
|
|
13
|
+
"ingestion",
|
|
14
|
+
"s3",
|
|
15
|
+
"etl",
|
|
16
|
+
]
|
|
17
|
+
classifiers = [
|
|
18
|
+
"Development Status :: 3 - Alpha",
|
|
19
|
+
"License :: OSI Approved :: Apache Software License",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Typing :: Typed",
|
|
22
|
+
]
|
|
23
|
+
dependencies = [
|
|
24
|
+
"afs-core",
|
|
25
|
+
"httpx>=0.27",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
# The S3 connector (boto3) and SigV4 signing for AWS_IAM Function URLs. The Local
|
|
30
|
+
# FS connector and unauthenticated/bearer endpoints need none of this.
|
|
31
|
+
aws = ["boto3>=1.34"]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
fs-crawler = "afs_connector_sdk.cli:main"
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/vivekkhimani/agentic-fs"
|
|
38
|
+
Repository = "https://github.com/vivekkhimani/agentic-fs"
|
|
39
|
+
Issues = "https://github.com/vivekkhimani/agentic-fs/issues"
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = ["hatchling"]
|
|
43
|
+
build-backend = "hatchling.build"
|
|
44
|
+
|
|
45
|
+
[tool.hatch.build.targets.wheel]
|
|
46
|
+
packages = ["src/afs_connector_sdk"]
|
|
47
|
+
|
|
48
|
+
[tool.uv.sources]
|
|
49
|
+
afs-core = { workspace = true }
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""agentic-fs connector SDK — crawl a source and ingest into agentic-fs.
|
|
2
|
+
|
|
3
|
+
Public surface: the ``IngestClient`` (HTTP), the ``SyncEngine`` (discover → skip
|
|
4
|
+
unchanged → ingest → prune), request signers, and ``build_connector``. The
|
|
5
|
+
``fs-crawler`` CLI wires them together. Source-specific logic is a
|
|
6
|
+
`afs_core.contracts.Connector`; this package ships Local FS and S3.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from afs_connector_sdk.auth import NoAuth, RequestSigner, SigV4Signer
|
|
10
|
+
from afs_connector_sdk.client import IngestClient
|
|
11
|
+
from afs_connector_sdk.engine import SyncEngine, SyncReport
|
|
12
|
+
from afs_connector_sdk.registry import build_connector
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"IngestClient",
|
|
16
|
+
"NoAuth",
|
|
17
|
+
"RequestSigner",
|
|
18
|
+
"SigV4Signer",
|
|
19
|
+
"SyncEngine",
|
|
20
|
+
"SyncReport",
|
|
21
|
+
"build_connector",
|
|
22
|
+
]
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"""How the connector authenticates **to the agentic-fs API** (not to the source).
|
|
2
|
+
|
|
3
|
+
A signer returns the extra headers a request needs. Source-side auth (reaching
|
|
4
|
+
S3 / Google Drive / SharePoint) lives inside each connector, not here. Two
|
|
5
|
+
signers ship today:
|
|
6
|
+
|
|
7
|
+
- ``NoAuth`` — local dev or an unauthenticated endpoint.
|
|
8
|
+
- ``SigV4Signer`` — the default AWS deployment, whose Function URL uses
|
|
9
|
+
``AWS_IAM`` auth. Needs the ``[aws]`` extra. Bearer-token (OAuth) auth arrives
|
|
10
|
+
with the resource server.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from typing import Any, Protocol, runtime_checkable
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@runtime_checkable
class RequestSigner(Protocol):
    """Structural interface for objects that authorize a request to the API.

    Any object exposing a compatible ``headers_for`` method satisfies the
    protocol; because it is ``runtime_checkable``, ``isinstance`` checks work.
    """

    def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
        """Return the headers to add so the API accepts the request.

        The returned mapping may be empty when no extra headers are needed.
        """
        ...
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NoAuth:
    """Signer that contributes no headers (local dev / unauthenticated endpoint)."""

    def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
        """Return a new, empty header mapping; the request details are ignored."""
        no_headers: dict[str, str] = {}
        return no_headers
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class SigV4Signer:
    """Signs requests with AWS SigV4 for an ``AWS_IAM`` Lambda Function URL.

    Credentials come from the standard AWS chain (env, profile, role) unless
    passed explicitly. The signer is built once and reused across requests.
    """

    def __init__(self, *, region: str, service: str = "lambda", credentials: Any = None) -> None:
        """Resolve credentials once and remember the signing scope.

        Args:
            region: AWS region used in the signature scope.
            service: AWS service name used in the signature scope.
            credentials: Optional botocore-style credentials object; when
                omitted, the default botocore session chain is consulted.

        Raises:
            RuntimeError: if botocore is not installed, or no credentials
                could be found.
        """
        try:
            from botocore.session import Session
        except ModuleNotFoundError as err:  # pragma: no cover - import guard
            raise RuntimeError(
                "SigV4 signing needs the optional extra: pip install 'afs-connector-sdk[aws]'"
            ) from err
        self._region = region
        self._service = service
        if credentials is None:
            credentials = Session().get_credentials()
        if credentials is None:
            raise RuntimeError(
                "no AWS credentials found — configure the standard AWS credential chain "
                "(env vars, a shared profile, or an instance/role)"
            )
        self._credentials = credentials

    def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
        """Return the SigV4 headers for one request.

        Args:
            method: HTTP method of the request being sent.
            url: The exact URL that will be transmitted.
            body: Request payload (``b""`` when there is none).
        """
        from botocore.auth import SigV4Auth
        from botocore.awsrequest import AWSRequest

        # Refreshable credentials (role / instance profile) can rotate between
        # reads of the access key, secret and token. Take one immutable
        # snapshot per request, as botocore's own request signer does.
        # Objects without the method (e.g. already-frozen ones) are used as-is.
        credentials = self._credentials
        freeze = getattr(credentials, "get_frozen_credentials", None)
        if callable(freeze):
            credentials = freeze()

        request = AWSRequest(method=method, url=url, data=body or b"")
        SigV4Auth(credentials, self._service, self._region).add_auth(request)
        return dict(request.headers)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""``fs-crawler`` — crawl a source and ingest its documents into agentic-fs."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import asyncio
|
|
7
|
+
import sys
|
|
8
|
+
from collections.abc import Sequence
|
|
9
|
+
|
|
10
|
+
from afs_connector_sdk.client import IngestClient
|
|
11
|
+
from afs_connector_sdk.engine import SyncEngine, SyncReport
|
|
12
|
+
from afs_connector_sdk.registry import build_connector
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _parse_options(pairs: list[str]) -> dict[str, str]:
|
|
16
|
+
options: dict[str, str] = {}
|
|
17
|
+
for pair in pairs:
|
|
18
|
+
if "=" not in pair:
|
|
19
|
+
raise ValueError(f"--opt expects KEY=VALUE, got {pair!r}")
|
|
20
|
+
key, value = pair.split("=", 1)
|
|
21
|
+
options[key] = value
|
|
22
|
+
return options
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def _build_parser() -> argparse.ArgumentParser:
|
|
26
|
+
parser = argparse.ArgumentParser(
|
|
27
|
+
prog="fs-crawler",
|
|
28
|
+
description="Crawl a source and ingest its documents into agentic-fs.",
|
|
29
|
+
)
|
|
30
|
+
add = parser.add_argument
|
|
31
|
+
add("--connector", default="local", help="connector name (local, s3, or a plugin)")
|
|
32
|
+
add("--source", required=True, help="connector source (a directory, or s3://bucket/prefix)")
|
|
33
|
+
add("--api-url", required=True, help="agentic-fs API base URL")
|
|
34
|
+
add("--namespace", required=True, help="target namespace")
|
|
35
|
+
add("--auth", choices=["none", "sigv4"], default="none", help="how to authenticate to the API")
|
|
36
|
+
add("--region", default="us-east-1", help="AWS region (for --auth sigv4)")
|
|
37
|
+
add("--concurrency", type=int, default=8, help="max documents in flight")
|
|
38
|
+
add("--prune", action="store_true", help="delete agentic-fs docs no longer at the source")
|
|
39
|
+
add("--dry-run", action="store_true", help="report what would change without writing")
|
|
40
|
+
parser.add_argument(
|
|
41
|
+
"--opt",
|
|
42
|
+
action="append",
|
|
43
|
+
default=[],
|
|
44
|
+
metavar="KEY=VALUE",
|
|
45
|
+
help="connector-specific option (repeatable)",
|
|
46
|
+
)
|
|
47
|
+
return parser
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Run the ``fs-crawler`` CLI and return its process exit code.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        ``0`` on a clean sync, ``1`` when the sync reported any failure or
        error, ``2`` when the connector or signer could not be set up.
    """
    args = _build_parser().parse_args(argv)

    try:
        connector = build_connector(args.connector, args.source, **_parse_options(args.opt))
    except (ValueError, RuntimeError, TypeError) as err:
        # TypeError covers an ``--opt`` key the connector does not accept,
        # which would otherwise escape as a raw traceback.
        print(f"error: {err}", file=sys.stderr)
        return 2

    signer = None
    if args.auth == "sigv4":
        from afs_connector_sdk.auth import SigV4Signer

        try:
            signer = SigV4Signer(region=args.region)
        except RuntimeError as err:
            print(f"error: {err}", file=sys.stderr)
            return 2

    report = asyncio.run(_run(connector, args, signer))
    tag = " (dry-run)" if args.dry_run else ""
    print(
        f"{args.connector} -> {args.namespace}{tag}: "
        f"ingested={report.ingested} skipped={report.skipped} "
        f"deleted={report.deleted} failed={report.failed}"
    )
    shown = report.errors[:20]
    for line in shown:
        print(f" ! {line}", file=sys.stderr)
    hidden = len(report.errors) - len(shown)
    if hidden > 0:
        # Make the truncation visible instead of silently dropping errors.
        print(f" ! ... and {hidden} more", file=sys.stderr)
    # An error recorded without a per-document failure (e.g. the prune
    # listing failed) must still produce a non-zero exit status.
    return 1 if report.failed or report.errors else 0
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
async def _run(connector: object, args: argparse.Namespace, signer: object) -> SyncReport:
    """Open an ``IngestClient`` on ``args.api_url`` and run one sync of ``connector``.

    The engine is configured from ``args.concurrency``, ``args.prune`` and
    ``args.dry_run``; the client is closed when the sync finishes or raises.
    """
    client = IngestClient(args.api_url, signer=signer)  # type: ignore[arg-type]
    async with client:
        engine = SyncEngine(
            client,
            concurrency=args.concurrency,
            prune=args.prune,
            dry_run=args.dry_run,
        )
        report = await engine.sync(connector, args.namespace)  # type: ignore[arg-type]
    return report
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    exit_code = main()
    raise SystemExit(exit_code)
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
"""Async HTTP client for the agentic-fs ingest + read API.
|
|
2
|
+
|
|
3
|
+
Signing is done by building the httpx request first, then signing its *final*
|
|
4
|
+
URL and attaching the result — so the bytes signed are exactly the bytes sent.
|
|
5
|
+
That avoids the SigV4 query-encoding mismatch you hit when the signed URL and
|
|
6
|
+
the transmitted URL disagree on how a path like ``a/b.md`` is escaped.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from types import TracebackType
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
import httpx
|
|
15
|
+
|
|
16
|
+
from afs_connector_sdk.auth import NoAuth, RequestSigner
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class IngestClient:
    """Async client for the agentic-fs ingest (write) and fs (read) endpoints.

    Every request is built first and then signed over its final URL, so the
    signer sees exactly what is transmitted. Use it as an async context
    manager, or call ``aclose()`` when done.
    """

    def __init__(
        self, base_url: str, *, signer: RequestSigner | None = None, timeout: float = 30.0
    ) -> None:
        """Create a client for ``base_url``.

        Args:
            base_url: API base URL; trailing slashes are dropped.
            signer: Supplies auth headers per request; defaults to ``NoAuth``.
            timeout: Timeout handed to the underlying ``httpx.AsyncClient``.
        """
        self._base = base_url.rstrip("/")
        self._signer = signer or NoAuth()
        self._http = httpx.AsyncClient(timeout=timeout)

    async def __aenter__(self) -> IngestClient:
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        await self.aclose()

    async def aclose(self) -> None:
        """Close the underlying HTTP client."""
        await self._http.aclose()

    async def _send(
        self,
        method: str,
        path: str,
        *,
        params: dict[str, Any] | None = None,
        content: bytes | None = None,
        content_type: str | None = None,
    ) -> httpx.Response:
        """Build, sign and send one request; return the raw response.

        The request is built before signing so the signer receives the final
        URL (query string included) that httpx will actually send.
        """
        headers = {"content-type": content_type} if content_type else {}
        request = self._http.build_request(
            method, f"{self._base}{path}", params=params, content=content, headers=headers
        )
        request.headers.update(
            self._signer.headers_for(
                method=request.method, url=str(request.url), body=content or b""
            )
        )
        return await self._http.send(request)

    async def put_document(
        self, namespace: str, path: str, data: bytes, *, content_type: str | None = None
    ) -> dict[str, Any]:
        """Upload ``data`` as the document at ``path``; return the decoded JSON reply.

        Raises:
            httpx.HTTPStatusError: if the API answers with an error status.
        """
        resp = await self._send(
            "PUT",
            f"/v1/ingest/{namespace}/doc",
            params={"path": path},
            content=data,
            content_type=content_type,
        )
        resp.raise_for_status()
        return resp.json()

    async def stat(self, namespace: str, path: str) -> dict[str, Any] | None:
        """Return the API's stat JSON for ``path``, or ``None`` on a 404.

        Raises:
            httpx.HTTPStatusError: on any other error status.
        """
        resp = await self._send("GET", f"/v1/fs/{namespace}/stat", params={"path": path})
        if resp.status_code == httpx.codes.NOT_FOUND:
            return None
        resp.raise_for_status()
        return resp.json()

    async def delete_document(self, namespace: str, path: str) -> None:
        """Delete the document at ``path``; 200, 202 and 204 count as success."""
        resp = await self._send("DELETE", f"/v1/ingest/{namespace}/doc", params={"path": path})
        if resp.status_code not in (200, 202, 204):
            resp.raise_for_status()

    async def list_paths(self, namespace: str, prefix: str = "") -> list[str]:
        """Return every entry path under ``prefix``, following pagination.

        Pages of up to 200 entries are requested until the reply carries no
        ``next_cursor``.

        Raises:
            httpx.HTTPStatusError: if a page request fails.
            RuntimeError: if the server hands back a cursor it already
                issued, which would otherwise make this loop endless.
        """
        paths: list[str] = []
        cursor: str | None = None
        seen_cursors: set[str] = set()
        while True:
            params: dict[str, Any] = {"prefix": prefix, "limit": 200}
            if cursor:
                params["cursor"] = cursor
            resp = await self._send("GET", f"/v1/fs/{namespace}/entries", params=params)
            resp.raise_for_status()
            page = resp.json()
            paths.extend(entry["path"] for entry in page.get("items", []))
            cursor = page.get("next_cursor")
            if not cursor:
                return paths
            if cursor in seen_cursors:
                raise RuntimeError(
                    f"pagination cursor {cursor!r} repeated; aborting to avoid an endless loop"
                )
            seen_cursors.add(cursor)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Builtin connectors. Each is a small `afs_core.contracts.Connector` impl."""
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Local filesystem connector — crawl a directory tree.
|
|
2
|
+
|
|
3
|
+
Zero dependencies; the reference connector and the easiest way to dogfood the
|
|
4
|
+
whole ingest path (point it at a folder of docs and read them back over MCP).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import mimetypes
|
|
10
|
+
from collections.abc import Iterator
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
|
|
13
|
+
from afs_core.models import SourceItem
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class LocalConnector:
    """Connector that walks a local directory tree and yields its files."""

    name = "local"

    def __init__(self, source: str, *, follow_symlinks: str = "false") -> None:
        """Resolve ``source`` to an absolute directory.

        Args:
            source: Directory to crawl; ``~`` is expanded.
            follow_symlinks: ``"1"``, ``"true"`` or ``"yes"`` (any case)
                enables symlinked entries; anything else disables them.

        Raises:
            ValueError: if the resolved ``source`` is not a directory.
        """
        root = Path(source).expanduser().resolve()
        if not root.is_dir():
            raise ValueError(f"source is not a directory: {root}")
        self._root = root
        # The CLI delivers options as text (`--opt follow_symlinks=true`).
        truthy = {"1", "true", "yes"}
        self._follow = str(follow_symlinks).lower() in truthy

    def discover(self) -> Iterator[SourceItem]:
        """Yield one ``SourceItem`` per regular file, in sorted path order."""
        for candidate in sorted(self._root.rglob("*")):
            relative = candidate.relative_to(self._root)
            # Anything at or below a dot-prefixed name (.git, .DS_Store, …) is ignored.
            hidden = any(part.startswith(".") for part in relative.parts)
            if hidden:
                continue
            unwanted_link = not self._follow and candidate.is_symlink()
            if unwanted_link or not candidate.is_file():
                continue
            info = candidate.stat()
            guessed_type, _encoding = mimetypes.guess_type(candidate.name)
            yield SourceItem(
                path=relative.as_posix(),
                locator=str(candidate),
                size=info.st_size,
                content_type=guessed_type,
                version=f"mtime:{int(info.st_mtime)}:{info.st_size}",
            )

    def fetch(self, item: SourceItem) -> bytes:
        """Read and return the bytes of the file named by ``item.locator``."""
        target = Path(item.locator)
        return target.read_bytes()
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""S3 connector — crawl an ``s3://bucket/prefix`` of documents.
|
|
2
|
+
|
|
3
|
+
Source-side auth is the standard boto3 chain (env / profile / role), so reading
|
|
4
|
+
from S3 needs no special handling here — that's the connector pattern: each
|
|
5
|
+
source owns its own auth. Needs the ``[aws]`` extra (boto3).
|
|
6
|
+
|
|
7
|
+
Pass a prefix ending in ``/`` for folder semantics (``s3://bucket/docs/``); the
|
|
8
|
+
prefix is stripped from each key to form the agentic-fs path.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from collections.abc import Iterator
|
|
14
|
+
from typing import Any
|
|
15
|
+
from urllib.parse import urlparse
|
|
16
|
+
|
|
17
|
+
from afs_core.models import SourceItem
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class S3Connector:
    """Connector that lists and downloads the objects under an ``s3://`` prefix."""

    name = "s3"

    def __init__(
        self, source: str, *, endpoint_url: str | None = None, region: str | None = None
    ) -> None:
        """Parse ``source`` and create the boto3 S3 client.

        Args:
            source: ``s3://bucket/prefix``; the prefix may be empty.
            endpoint_url: Passed to ``boto3.client`` as ``endpoint_url``.
            region: Passed to ``boto3.client`` as ``region_name``.

        Raises:
            ValueError: if ``source`` is not an ``s3://`` URL with a bucket.
            RuntimeError: if boto3 is not installed.
        """
        parsed = urlparse(source)
        if parsed.scheme != "s3" or not parsed.netloc:
            raise ValueError(f"source must be s3://bucket/prefix, got {source!r}")
        self._bucket = parsed.netloc
        self._prefix = parsed.path.lstrip("/")
        try:
            import boto3
        except ModuleNotFoundError as err:  # pragma: no cover - import guard
            raise RuntimeError(
                "the s3 connector needs the optional extra: pip install 'afs-connector-sdk[aws]'"
            ) from err
        self._s3: Any = boto3.client("s3", endpoint_url=endpoint_url, region_name=region)

    def discover(self) -> Iterator[SourceItem]:
        """Yield a ``SourceItem`` for each object under the configured prefix.

        The item path is the key with the prefix (and any leading ``/``)
        removed; keys ending in ``/`` and keys equal to the prefix are
        skipped. The content type is guessed from the key's extension, as
        the local connector does from the file name, so uploads from either
        source carry the same content-type information.
        """
        import mimetypes

        paginator = self._s3.get_paginator("list_objects_v2")
        for page in paginator.paginate(Bucket=self._bucket, Prefix=self._prefix):
            for obj in page.get("Contents", []):
                key = obj["Key"]
                if key.endswith("/"):
                    continue  # skip "folder" placeholder objects
                rel = key[len(self._prefix) :] if self._prefix else key
                rel = rel.lstrip("/")
                if not rel:
                    continue
                yield SourceItem(
                    path=rel,
                    locator=key,
                    size=obj.get("Size"),
                    content_type=mimetypes.guess_type(key)[0],
                    # ETags arrive wrapped in double quotes; an empty one becomes None.
                    version=(obj.get("ETag") or "").strip('"') or None,
                )

    def fetch(self, item: SourceItem) -> bytes:
        """Download and return the full body of the object at ``item.locator``."""
        resp = self._s3.get_object(Bucket=self._bucket, Key=item.locator)
        return resp["Body"].read()
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""The source-agnostic sync engine.
|
|
2
|
+
|
|
3
|
+
For each item a connector discovers: fetch the bytes, and skip the (expensive)
|
|
4
|
+
ingest if the document is already present unchanged — decided by comparing the
|
|
5
|
+
content checksum against the catalog's, so re-runs are cheap and idempotent and
|
|
6
|
+
nothing is re-extracted needlessly. With ``prune``, documents that have vanished
|
|
7
|
+
from the source are deleted from agentic-fs (the source stays the source of
|
|
8
|
+
truth).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import asyncio
|
|
14
|
+
import hashlib
|
|
15
|
+
from dataclasses import dataclass, field
|
|
16
|
+
from typing import TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from afs_connector_sdk.client import IngestClient
|
|
20
|
+
from afs_core.contracts import Connector
|
|
21
|
+
from afs_core.models import SourceItem
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass
class SyncReport:
    """Mutable tallies for one sync run, plus the error messages collected."""

    # Counters start at zero and are incremented by the sync engine.
    ingested: int = 0
    skipped: int = 0
    deleted: int = 0
    failed: int = 0
    # One human-readable message per recorded error; a fresh list per report.
    errors: list[str] = field(default_factory=list)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class SyncEngine:
|
|
34
|
+
def __init__(
|
|
35
|
+
self,
|
|
36
|
+
client: IngestClient,
|
|
37
|
+
*,
|
|
38
|
+
concurrency: int = 8,
|
|
39
|
+
prune: bool = False,
|
|
40
|
+
dry_run: bool = False,
|
|
41
|
+
) -> None:
|
|
42
|
+
self._client = client
|
|
43
|
+
self._sem = asyncio.Semaphore(concurrency)
|
|
44
|
+
self._prune = prune
|
|
45
|
+
self._dry = dry_run
|
|
46
|
+
|
|
47
|
+
async def sync(self, connector: Connector, namespace: str) -> SyncReport:
|
|
48
|
+
report = SyncReport()
|
|
49
|
+
items = list(connector.discover())
|
|
50
|
+
await asyncio.gather(*(self._process(connector, namespace, it, report) for it in items))
|
|
51
|
+
if self._prune:
|
|
52
|
+
await self._prune_missing(namespace, {it.path for it in items}, report)
|
|
53
|
+
return report
|
|
54
|
+
|
|
55
|
+
async def _process(
|
|
56
|
+
self, connector: Connector, namespace: str, item: SourceItem, report: SyncReport
|
|
57
|
+
) -> None:
|
|
58
|
+
async with self._sem:
|
|
59
|
+
try:
|
|
60
|
+
data = await asyncio.to_thread(connector.fetch, item)
|
|
61
|
+
checksum = hashlib.sha256(data).hexdigest()
|
|
62
|
+
existing = await self._client.stat(namespace, item.path)
|
|
63
|
+
if existing and existing.get("checksum") == checksum:
|
|
64
|
+
report.skipped += 1
|
|
65
|
+
return
|
|
66
|
+
if not self._dry:
|
|
67
|
+
await self._client.put_document(
|
|
68
|
+
namespace, item.path, data, content_type=item.content_type
|
|
69
|
+
)
|
|
70
|
+
report.ingested += 1
|
|
71
|
+
except Exception as err:
|
|
72
|
+
report.failed += 1
|
|
73
|
+
report.errors.append(f"{item.path}: {err}")
|
|
74
|
+
|
|
75
|
+
async def _prune_missing(self, namespace: str, seen: set[str], report: SyncReport) -> None:
|
|
76
|
+
try:
|
|
77
|
+
existing = await self._client.list_paths(namespace)
|
|
78
|
+
except Exception as err:
|
|
79
|
+
report.errors.append(f"prune-list: {err}")
|
|
80
|
+
return
|
|
81
|
+
for path in existing:
|
|
82
|
+
if path in seen:
|
|
83
|
+
continue
|
|
84
|
+
if self._dry:
|
|
85
|
+
report.deleted += 1
|
|
86
|
+
continue
|
|
87
|
+
try:
|
|
88
|
+
await self._client.delete_document(namespace, path)
|
|
89
|
+
report.deleted += 1
|
|
90
|
+
except Exception as err:
|
|
91
|
+
report.failed += 1
|
|
92
|
+
report.errors.append(f"delete {path}: {err}")
|
|
File without changes
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Connector registry — builtins + the ``afs.connectors`` entry-point group.
|
|
2
|
+
|
|
3
|
+
Same pattern as the store and normalizer registries: pick a connector by name.
|
|
4
|
+
Third-party connectors (Google Drive, SharePoint, …) register an entry point
|
|
5
|
+
whose value is a callable ``(source, **options) -> Connector``; they need no
|
|
6
|
+
change here.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from importlib.metadata import entry_points
|
|
12
|
+
from typing import TYPE_CHECKING
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from afs_core.contracts import Connector
|
|
16
|
+
|
|
17
|
+
_ENTRY_GROUP = "afs.connectors"
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _builtins() -> dict[str, object]:
    """Return the builtin connectors, keyed by the name used to select them."""
    # The connector modules are imported on each call rather than at module
    # import, so loading the registry (or picking another connector) does not
    # require a connector's optional dependencies such as boto3 for s3.
    from afs_connector_sdk.connectors import local, s3

    registry: dict[str, object] = {
        "local": local.LocalConnector,
        "s3": s3.S3Connector,
    }
    return registry
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def build_connector(name: str, source: str, **options: str) -> Connector:
    """Construct a connector by name over ``source`` (with connector-specific options).

    Builtins win over plugins; the ``afs.connectors`` entry-point group is
    consulted only when no builtin has the requested name.

    Raises:
        ValueError: if neither a builtin nor an entry point matches ``name``.
            The message lists each available name once, sorted.
    """
    builtins = _builtins()
    factory = builtins.get(name)
    if factory is None:
        # Read the entry points once and reuse them for the error message.
        plugins = list(entry_points(group=_ENTRY_GROUP))
        factory = next((ep.load() for ep in plugins if ep.name == name), None)
        if factory is None:
            available = sorted({*builtins, *(ep.name for ep in plugins)})
            raise ValueError(f"unknown connector {name!r}; available: {', '.join(available)}")
    return factory(source, **options)  # type: ignore[operator]
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""IngestClient HTTP behavior + the sign-the-final-URL property (via MockTransport)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import httpx
|
|
6
|
+
|
|
7
|
+
from afs_connector_sdk.client import IngestClient
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class _CaptureSigner:
|
|
11
|
+
def __init__(self) -> None:
|
|
12
|
+
self.signed_urls: list[str] = []
|
|
13
|
+
|
|
14
|
+
def headers_for(self, *, method: str, url: str, body: bytes) -> dict[str, str]:
|
|
15
|
+
self.signed_urls.append(url)
|
|
16
|
+
return {"authorization": "SIGNED"}
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _client_with(handler, signer=None) -> IngestClient:
    """Build an ``IngestClient`` for ``http://api.test`` whose requests go to ``handler``.

    ``signer`` is forwarded to the ``IngestClient`` constructor unchanged.
    """
    mock_transport = httpx.MockTransport(handler)
    ingest = IngestClient("http://api.test", signer=signer)
    # Replace the client's private HTTP client so requests are served by ``handler``.
    ingest._http = httpx.AsyncClient(transport=mock_transport)
    return ingest
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
async def test_put_signs_the_exact_url_it_sends() -> None:
    """``put_document`` sends the signer's header and signs the same URL it transmits."""
    captured: dict[str, str] = {}

    def handler(request: httpx.Request) -> httpx.Response:
        headers = request.headers
        captured.update(
            url=str(request.url),
            auth=headers.get("authorization", ""),
            ctype=headers.get("content-type", ""),
        )
        return httpx.Response(201, json={"path": "a/b.md", "checksum": "abc"})

    signer = _CaptureSigner()
    client = _client_with(handler, signer)
    entry = await client.put_document("ns", "a/b.md", b"hello", content_type="text/markdown")
    await client.aclose()

    assert entry["checksum"] == "abc"
    assert captured["auth"] == "SIGNED"
    assert captured["ctype"] == "text/markdown"
    # What was signed must be byte-for-byte the URL the transport sent (no
    # re-encoding between signing and sending) — the crux of SigV4 over query paths.
    assert signer.signed_urls == [captured["url"]]
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
async def test_stat_404_is_none() -> None:
    """A 404 response makes ``stat`` return ``None``."""

    def not_found(request: httpx.Request) -> httpx.Response:
        return httpx.Response(404, json={"detail": "nope"})

    client = _client_with(not_found)
    result = await client.stat("ns", "missing.md")
    assert result is None
    await client.aclose()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
async def test_list_paths_follows_pagination() -> None:
    """``list_paths`` keeps requesting pages until ``next_cursor`` is null."""

    def handler(request: httpx.Request) -> httpx.Response:
        on_second_page = request.url.params.get("cursor") == "c1"
        if not on_second_page:
            # First page: hand out the cursor that selects the second page.
            return httpx.Response(200, json={"items": [{"path": "a.md"}], "next_cursor": "c1"})
        return httpx.Response(200, json={"items": [{"path": "b.md"}], "next_cursor": None})

    client = _client_with(handler)
    paths = await client.list_paths("ns")
    assert paths == ["a.md", "b.md"]
    await client.aclose()
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
async def test_no_auth_adds_no_authorization_header() -> None:
    """Without a signer, the outgoing request carries no ``authorization`` header."""
    observed: dict[str, str | None] = {}

    def handler(request: httpx.Request) -> httpx.Response:
        observed["auth"] = request.headers.get("authorization")
        return httpx.Response(202)

    # No signer passed, so the client falls back to its default (NoAuth).
    client = _client_with(handler)
    await client.delete_document("ns", "a.md")
    await client.aclose()
    assert observed["auth"] is None
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""The sync engine's decisions: ingest new, skip unchanged, prune, dry-run, errors."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
|
|
7
|
+
from afs_connector_sdk.engine import SyncEngine
|
|
8
|
+
from afs_core.models import SourceItem
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class _FakeClient:
|
|
12
|
+
"""Stands in for IngestClient. `existing` maps path -> stored checksum."""
|
|
13
|
+
|
|
14
|
+
def __init__(self, existing: dict[str, str] | None = None) -> None:
|
|
15
|
+
self.existing = existing or {}
|
|
16
|
+
self.put: list[tuple[str, bytes]] = []
|
|
17
|
+
self.deleted: list[str] = []
|
|
18
|
+
|
|
19
|
+
async def stat(self, namespace: str, path: str) -> dict[str, str] | None:
|
|
20
|
+
checksum = self.existing.get(path)
|
|
21
|
+
return {"checksum": checksum} if checksum is not None else None
|
|
22
|
+
|
|
23
|
+
async def put_document(
|
|
24
|
+
self, namespace: str, path: str, data: bytes, *, content_type: str | None = None
|
|
25
|
+
) -> dict[str, str]:
|
|
26
|
+
self.put.append((path, data))
|
|
27
|
+
return {}
|
|
28
|
+
|
|
29
|
+
async def list_paths(self, namespace: str, prefix: str = "") -> list[str]:
|
|
30
|
+
return list(self.existing)
|
|
31
|
+
|
|
32
|
+
async def delete_document(self, namespace: str, path: str) -> None:
|
|
33
|
+
self.deleted.append(path)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class _FakeConnector:
|
|
37
|
+
name = "fake"
|
|
38
|
+
|
|
39
|
+
def __init__(self, items: dict[str, bytes], *, broken: set[str] | None = None) -> None:
|
|
40
|
+
self._items = items
|
|
41
|
+
self._broken = broken or set()
|
|
42
|
+
|
|
43
|
+
def discover(self) -> list[SourceItem]:
|
|
44
|
+
return [SourceItem(path=p, locator=p) for p in self._items]
|
|
45
|
+
|
|
46
|
+
def fetch(self, item: SourceItem) -> bytes:
|
|
47
|
+
if item.locator in self._broken:
|
|
48
|
+
raise OSError("unreadable")
|
|
49
|
+
return self._items[item.locator]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _sha(data: bytes) -> str:
|
|
53
|
+
return hashlib.sha256(data).hexdigest()
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
async def test_ingests_new_documents() -> None:
    """With nothing stored on the client side, every discovered document is uploaded."""
    fake = _FakeClient()
    source = _FakeConnector({"a.md": b"alpha", "sub/b.txt": b"beta"})
    engine = SyncEngine(fake)
    report = await engine.sync(source, "ns")
    assert report.ingested == 2
    assert report.skipped == 0
    uploaded = {path for path, _data in fake.put}
    assert uploaded == {"a.md", "sub/b.txt"}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
async def test_skips_unchanged_by_checksum() -> None:
    """A document whose stored checksum matches its content is skipped, not re-uploaded."""
    already_stored = {"a.md": _sha(b"alpha")}
    fake = _FakeClient(existing=already_stored)
    source = _FakeConnector({"a.md": b"alpha", "b.md": b"new"})
    engine = SyncEngine(fake)
    report = await engine.sync(source, "ns")
    assert report.skipped == 1
    assert report.ingested == 1
    uploaded = [path for path, _data in fake.put]
    assert uploaded == ["b.md"]
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
async def test_dry_run_writes_nothing() -> None:
    """In dry-run mode the report still counts the ingest, but no upload reaches the client."""
    fake = _FakeClient()
    source = _FakeConnector({"a.md": b"alpha"})
    engine = SyncEngine(fake, dry_run=True)
    report = await engine.sync(source, "ns")
    assert report.ingested == 1
    assert fake.put == []
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
async def test_prune_deletes_documents_absent_from_source() -> None:
    """With ``prune=True``, a stored path the connector no longer discovers is deleted."""
    fake = _FakeClient(existing={"gone.md": _sha(b"old")})
    source = _FakeConnector({"a.md": b"alpha"})
    engine = SyncEngine(fake, prune=True)
    report = await engine.sync(source, "ns")
    assert report.ingested == 1
    assert report.deleted == 1
    assert fake.deleted == ["gone.md"]
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
async def test_one_bad_document_does_not_abort_the_crawl() -> None:
    """A fetch failure on one document is reported while the other is still ingested."""
    fake = _FakeClient()
    source = _FakeConnector({"good.md": b"ok", "bad.md": b"x"}, broken={"bad.md"})
    engine = SyncEngine(fake)
    report = await engine.sync(source, "ns")
    assert report.ingested == 1
    assert report.failed == 1
    uploaded = [path for path, _data in fake.put]
    assert uploaded == ["good.md"]
    # The failing path must be named in at least one reported error.
    assert any("bad.md" in message for message in report.errors)
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Local FS connector — certified against the afs-core kit, plus its specifics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from afs_connector_sdk.connectors.local import LocalConnector
|
|
10
|
+
from afs_core.testing import ConnectorConformance
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _populate(root: Path) -> None:
|
|
14
|
+
(root / "a.md").write_text("alpha")
|
|
15
|
+
(root / "sub").mkdir()
|
|
16
|
+
(root / "sub" / "b.txt").write_text("beta beta")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class TestLocalConnector(ConnectorConformance):
    """Runs the ``ConnectorConformance`` suite against a ``LocalConnector``."""

    @pytest.fixture
    def connector(self, tmp_path: Path) -> LocalConnector:
        """Provide a ``LocalConnector`` rooted at a freshly populated temporary directory."""
        _populate(tmp_path)
        root = str(tmp_path)
        return LocalConnector(root)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def test_discovers_nested_relative_paths(tmp_path: Path) -> None:
    """Discovered paths are relative to the root and include files in subdirectories."""
    _populate(tmp_path)
    connector = LocalConnector(str(tmp_path))
    discovered = set()
    for item in connector.discover():
        discovered.add(item.path)
    assert discovered == {"a.md", "sub/b.txt"}
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_skips_hidden_files_and_dot_dirs(tmp_path: Path) -> None:
    """Dot-files and the contents of dot-directories are left out of discovery."""
    _populate(tmp_path)
    hidden_file = tmp_path / ".secret"
    hidden_dir = tmp_path / ".git"
    hidden_file.write_text("nope")
    hidden_dir.mkdir()
    (hidden_dir / "config").write_text("nope")
    connector = LocalConnector(str(tmp_path))
    discovered = set()
    for item in connector.discover():
        discovered.add(item.path)
    assert discovered == {"a.md", "sub/b.txt"}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def test_fetch_roundtrips_bytes(tmp_path: Path) -> None:
    """Fetching ``a.md`` returns the bytes written to disk; its content type is markdown."""
    _populate(tmp_path)
    connector = LocalConnector(str(tmp_path))
    a_md = next(filter(lambda found: found.path == "a.md", connector.discover()))
    payload = connector.fetch(a_md)
    assert payload == b"alpha"
    assert a_md.content_type == "text/markdown"
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def test_rejects_non_directory(tmp_path: Path) -> None:
    """Pointing ``LocalConnector`` at a regular file raises ``ValueError``."""
    regular_file = tmp_path / "file.txt"
    regular_file.write_text("x")
    source = str(regular_file)
    with pytest.raises(ValueError, match="not a directory"):
        LocalConnector(source)
|
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
"""S3 connector — certified against the afs-core kit using moto."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Iterator
|
|
6
|
+
|
|
7
|
+
import boto3
|
|
8
|
+
import pytest
|
|
9
|
+
from moto import mock_aws
|
|
10
|
+
|
|
11
|
+
from afs_connector_sdk.connectors.s3 import S3Connector
|
|
12
|
+
from afs_core.testing import ConnectorConformance
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
@pytest.fixture
def s3_source() -> Iterator[str]:
    """Yield an ``s3://`` source URL over a moto-mocked bucket seeded with test objects."""
    seed_objects = {
        "reports/a.md": b"alpha",
        "reports/sub/b.txt": b"beta beta",
        "reports/": b"",  # folder placeholder
    }
    with mock_aws():
        s3 = boto3.client("s3", region_name="us-east-1")
        s3.create_bucket(Bucket="docs")
        for key, body in seed_objects.items():
            s3.put_object(Bucket="docs", Key=key, Body=body)
        # Yield inside the ``with`` so the mock stays active while the test runs.
        yield "s3://docs/reports/"
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class TestS3Connector(ConnectorConformance):
    """Runs the ``ConnectorConformance`` suite against an ``S3Connector``."""

    @pytest.fixture
    def connector(self, s3_source: str) -> S3Connector:
        """Provide an ``S3Connector`` over the ``s3_source`` fixture's URL."""
        source_url = s3_source
        return S3Connector(source_url, region="us-east-1")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def test_strips_prefix_and_skips_placeholders(s3_source: str) -> None:
    """Paths come back without the source prefix, and the placeholder key is not listed."""
    connector = S3Connector(s3_source, region="us-east-1")
    by_path = {}
    for item in connector.discover():
        by_path[item.path] = item
    assert set(by_path) == {"a.md", "sub/b.txt"}
    a_md = by_path["a.md"]
    assert connector.fetch(a_md) == b"alpha"
    assert a_md.version  # ETag carried as the change token
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def test_rejects_non_s3_source() -> None:
    """A source that is not an ``s3://`` URL is rejected with ``ValueError``."""
    local_path = "/local/path"
    with pytest.raises(ValueError, match="s3://bucket/prefix"):
        S3Connector(local_path)
|