payload-documents-worker-builder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,39 @@
1
+ """Public API for `payload-documents-worker-builder`.
2
+
3
+ Mirrors the shape of `agno_agent_builder.__init__`: a single import surface
4
+ that exposes the factory, the config, the clients and the tasks. Consumers
5
+ should never reach into submodules.
6
+ """
7
+
8
+ from payload_documents_worker_builder.app import WorkerApp, create_app
9
+ from payload_documents_worker_builder.broker import create_broker
10
+ from payload_documents_worker_builder.clients import (
11
+ LlamaParseClient,
12
+ LlamaParseError,
13
+ LlamaParseJob,
14
+ LlamaParseStatus,
15
+ PayloadClient,
16
+ PayloadError,
17
+ )
18
+ from payload_documents_worker_builder.config import RuntimeConfig
19
+ from payload_documents_worker_builder.tasks import (
20
+ PARSE_DOCUMENT_TASK_NAME,
21
+ register_parse_document_task,
22
+ )
23
+
24
+ __all__ = [
25
+ "PARSE_DOCUMENT_TASK_NAME",
26
+ "LlamaParseClient",
27
+ "LlamaParseError",
28
+ "LlamaParseJob",
29
+ "LlamaParseStatus",
30
+ "PayloadClient",
31
+ "PayloadError",
32
+ "RuntimeConfig",
33
+ "WorkerApp",
34
+ "create_app",
35
+ "create_broker",
36
+ "register_parse_document_task",
37
+ ]
38
+
39
+ __version__ = "0.1.0"
@@ -0,0 +1,49 @@
1
+ """Top-level factory.
2
+
3
+ Consumers call ``create_app(config)`` and get back two ready-to-run handles:
4
+
5
+ * ``broker`` — the taskiq broker. Pass it to the taskiq CLI:
6
+ ``taskiq worker my_worker.main:broker``.
7
+ * ``app`` — the FastAPI HTTP kicker. Run with uvicorn:
8
+ ``uvicorn my_worker.main:app``.
9
+
10
+ Both share the same ``RuntimeConfig`` so logs, retries, and credentials line
11
+ up across processes.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from collections.abc import Iterator
17
+ from dataclasses import dataclass
18
+
19
+ from fastapi import FastAPI
20
+ from taskiq import AsyncBroker
21
+
22
+ from payload_documents_worker_builder.broker import create_broker
23
+ from payload_documents_worker_builder.config import RuntimeConfig
24
+ from payload_documents_worker_builder.http import create_http_app
25
+ from payload_documents_worker_builder.lifecycle import configure_logging
26
+ from payload_documents_worker_builder.tasks import register_parse_document_task
27
+
28
+
29
+ @dataclass(slots=True, frozen=True)
30
+ class WorkerApp:
31
+ """Bundle returned by :func:`create_app`. Exposed as a dataclass so consumers
32
+ can ``app, broker = create_app(config)`` (`__iter__` below) or address
33
+ fields by name explicitly."""
34
+
35
+ app: FastAPI
36
+ broker: AsyncBroker
37
+
38
+ def __iter__(self) -> Iterator[FastAPI | AsyncBroker]:
39
+ yield self.app
40
+ yield self.broker
41
+
42
+
43
def create_app(config: RuntimeConfig) -> WorkerApp:
    """Assemble the worker from ``config``.

    Configures logging, creates the broker, registers the parse-document
    task on it, builds the FastAPI kicker around that same broker, and
    returns both handles bundled as a :class:`WorkerApp`.
    """
    configure_logging(config)

    task_broker = create_broker(config)
    register_parse_document_task(task_broker, config)

    return WorkerApp(
        app=create_http_app(task_broker, config),
        broker=task_broker,
    )
@@ -0,0 +1,19 @@
1
+ """Broker factory.
2
+
3
+ Returns a configured `RedisStreamBroker` with `SmartRetryMiddleware`. Mirrors
4
+ the pattern from nixon's `nixon_worker_core.broker_factory.create_broker`:
5
+ one place that owns broker config so consumers never instantiate
6
+ `RedisStreamBroker` directly.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from taskiq import AsyncBroker, SmartRetryMiddleware
12
+ from taskiq_redis import RedisStreamBroker
13
+
14
+ from payload_documents_worker_builder.config import RuntimeConfig
15
+
16
+
17
def create_broker(config: RuntimeConfig) -> AsyncBroker:
    """Return a ``RedisStreamBroker`` for ``config.redis_url`` with ``SmartRetryMiddleware`` attached."""
    redis_broker = RedisStreamBroker(url=config.redis_url)
    retry_middleware = SmartRetryMiddleware()
    return redis_broker.with_middlewares(retry_middleware)
@@ -0,0 +1,18 @@
1
+ """HTTP clients used by built-in tasks (Payload + LlamaParse)."""
2
+
3
+ from payload_documents_worker_builder.clients.llama_parse import (
4
+ LlamaParseClient,
5
+ LlamaParseError,
6
+ LlamaParseJob,
7
+ LlamaParseStatus,
8
+ )
9
+ from payload_documents_worker_builder.clients.payload import PayloadClient, PayloadError
10
+
11
+ __all__ = [
12
+ "LlamaParseClient",
13
+ "LlamaParseError",
14
+ "LlamaParseJob",
15
+ "LlamaParseStatus",
16
+ "PayloadClient",
17
+ "PayloadError",
18
+ ]
@@ -0,0 +1,28 @@
1
+ """Shared error-handling helpers for HTTP clients in this package."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from collections.abc import Callable
6
+
7
+ import httpx
8
+
9
_DETAIL_TRUNCATE = 500


def make_raise_for_status(
    exc_cls: type[Exception], prefix: str
) -> Callable[[httpx.Response, str], None]:
    """Build a status checker that raises ``exc_cls`` for unsuccessful responses.

    The returned callable takes ``(response, op)``. It does nothing when
    ``response.is_success`` is true; otherwise it raises ``exc_cls`` with the
    message ``<prefix> <op> failed: HTTP <status> — <body>``, where the body
    is cut to its first ``_DETAIL_TRUNCATE`` characters.
    """

    def _raise_for_status(response: httpx.Response, op: str) -> None:
        if not response.is_success:
            # Cap the echoed body so a large error page cannot bloat the message.
            body_excerpt = response.text[:_DETAIL_TRUNCATE]
            message = f"{prefix} {op} failed: HTTP {response.status_code} — {body_excerpt}"
            raise exc_cls(message)

    return _raise_for_status
@@ -0,0 +1,136 @@
1
+ """Minimal async LlamaParse client.
2
+
3
+ Ported from `packages/payload-documents/src/llama-parse/client.ts`. Only the
4
+ endpoints the parse-document task needs:
5
+
6
+ * ``POST /api/parsing/upload`` — kick off a parse job
7
+ * ``GET /api/parsing/job/{id}`` — poll status
8
+ * ``GET /api/parsing/job/{id}/result/markdown`` — fetch parsed markdown
9
+
10
+ Use as an async context manager so the underlying ``httpx.AsyncClient`` (and
11
+ its connection pool) is shared across all calls within one task instead of a
12
+ new TLS handshake per request::
13
+
14
+ async with LlamaParseClient(api_key=...) as client:
15
+ job = await client.upload(...)
16
+ ...
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from dataclasses import dataclass
22
+ from types import TracebackType
23
+ from typing import Any, Literal, Self
24
+
25
+ import httpx
26
+
27
+ from ._errors import make_raise_for_status
28
+
29
+ LlamaParseStatus = Literal["PENDING", "SUCCESS", "ERROR", "CANCELLED"]
30
+ DEFAULT_BASE_URL = "https://api.cloud.llamaindex.ai"
31
+
32
+
33
class LlamaParseError(Exception):
    """Raised for LlamaParse failures detected by this package.

    In this module it signals a missing API key, a non-2xx HTTP response
    (via ``_raise_for_status``), a result body without markdown, or use of
    the client outside ``async with``. Transport-level ``httpx`` exceptions
    (timeouts, connection errors) are not converted here; they propagate as
    raised by ``httpx``.
    """
35
+
36
+
37
+ _raise_for_status = make_raise_for_status(LlamaParseError, "LlamaParse")
38
+
39
+
40
+ @dataclass(slots=True)
41
+ class LlamaParseJob:
42
+ id: str
43
+ status: LlamaParseStatus
44
+ error: str | None = None
45
+
46
+
47
class LlamaParseClient:
    """Small httpx-backed LlamaParse client.

    Must be used via ``async with``: the ``httpx.AsyncClient`` is created on
    entry and closed on exit, so every call made inside the block goes
    through the same client instance.
    """

    def __init__(
        self,
        *,
        api_key: str,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 60.0,
    ) -> None:
        """Store connection settings; the HTTP client is not created here.

        Raises:
            LlamaParseError: if ``api_key`` is empty.
        """
        if not api_key:
            raise LlamaParseError("LlamaParse API key is required")
        # Drop trailing slashes so the f-string URLs below never contain "//".
        self._base_url = base_url.rstrip("/")
        self._headers = {"Authorization": f"Bearer {api_key}"}
        self._timeout = timeout
        # Populated by ``__aenter__`` and reset to ``None`` by ``__aexit__``.
        self._http: httpx.AsyncClient | None = None

    async def __aenter__(self) -> Self:
        """Create the underlying ``httpx.AsyncClient`` and return ``self``."""
        self._http = httpx.AsyncClient(timeout=self._timeout)
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        """Close the underlying client, if one is open."""
        if self._http is not None:
            await self._http.aclose()
            self._http = None

    async def upload(
        self,
        *,
        file_bytes: bytes,
        filename: str,
        language: str | None = None,
        parsing_instruction: str | None = None,
        mode: str | None = None,
    ) -> LlamaParseJob:
        """POST the file to ``/api/parsing/upload`` and return the created job.

        Optional arguments are only sent as form fields when they are not
        ``None``; ``mode`` is sent under the field name ``parse_mode``.

        Raises:
            LlamaParseError: on a non-2xx response, or when called outside
                ``async with``.
        """
        data: dict[str, Any] = {}
        if language is not None:
            data["language"] = language
        if parsing_instruction is not None:
            data["parsing_instruction"] = parsing_instruction
        if mode is not None:
            data["parse_mode"] = mode

        response = await self._client().post(
            f"{self._base_url}/api/parsing/upload",
            headers=self._headers,
            files={"file": (filename, file_bytes)},
            data=data,
        )
        _raise_for_status(response, "upload")
        return _parse_job(response.json())

    async def status(self, job_id: str) -> LlamaParseJob:
        """GET ``/api/parsing/job/{job_id}`` and return the job it describes.

        Raises:
            LlamaParseError: on a non-2xx response, or when called outside
                ``async with``.
        """
        response = await self._client().get(
            f"{self._base_url}/api/parsing/job/{job_id}",
            headers=self._headers,
        )
        _raise_for_status(response, "status")
        return _parse_job(response.json())

    async def fetch_markdown(self, job_id: str) -> str:
        """GET ``/api/parsing/job/{job_id}/result/markdown`` and return the markdown.

        Raises:
            LlamaParseError: on a non-2xx response, when the body carries no
                string ``markdown`` value, or when called outside
                ``async with``.
        """
        response = await self._client().get(
            f"{self._base_url}/api/parsing/job/{job_id}/result/markdown",
            headers=self._headers,
        )
        _raise_for_status(response, "fetch_markdown")
        body = response.json()
        # A 2xx body that is not a JSON object (list, string, null, ...) has no
        # ``.get``; treat it like a missing ``markdown`` key so callers get a
        # LlamaParseError rather than an AttributeError.
        markdown = body.get("markdown") if isinstance(body, dict) else None
        if not isinstance(markdown, str):
            raise LlamaParseError(f"LlamaParse returned no markdown for job {job_id}")
        return markdown

    def _client(self) -> httpx.AsyncClient:
        """Return the open ``httpx.AsyncClient`` or raise if there is none."""
        if self._http is None:
            raise LlamaParseError(
                "LlamaParseClient must be used inside `async with` (httpx pool not initialised)"
            )
        return self._http
129
+
130
+
131
def _parse_job(payload: dict[str, Any]) -> LlamaParseJob:
    """Convert a LlamaParse job JSON object into a :class:`LlamaParseJob`.

    ``id`` is mandatory (``KeyError`` if absent); ``status`` falls back to
    ``"PENDING"`` and ``error`` to ``None`` when those keys are missing.
    """
    job_id = payload["id"]
    job_status = payload.get("status", "PENDING")
    job_error = payload.get("error")
    return LlamaParseJob(id=job_id, status=job_status, error=job_error)
@@ -0,0 +1,126 @@
1
+ """Tiny Payload CMS REST client used by built-in tasks.
2
+
3
+ The worker only needs:
4
+
5
+ * fetch the document context (parser knobs + filename)
6
+ * fetch the binary attached to that document
7
+ * stamp parse results back (parsed_text / parse_status / parse_error / ...)
8
+
9
+ All Payload-side calls go through dedicated internal endpoints exposed by the
10
+ ``payload-documents`` plugin and authenticated with the shared
11
+ ``X-Internal-Secret`` header. The endpoints use Payload's local API with
12
+ ``overrideAccess: true`` server-side and the binary endpoint defers to a
13
+ host-provided resolver for the actual storage read, so the plugin stays
14
+ storage-agnostic and host apps can keep the documents collection's access
15
+ control honestly locked down (multi-tenant filters, admin-only writes, etc.)
16
+ without poking a service-account bypass into it.
17
+
18
+ Use as an async context manager so the underlying ``httpx.AsyncClient`` (and
19
+ its connection pool) is shared across all calls within one task instead of a
20
+ new TLS handshake per request::
21
+
22
+ async with PayloadClient(base_url=..., internal_secret=...) as client:
23
+ ctx = await client.fetch_parse_context(slug, doc_id)
24
+ ...
25
+ """
26
+
27
+ from __future__ import annotations
28
+
29
+ from types import TracebackType
30
+ from typing import Self
31
+
32
+ import httpx
33
+
34
+ from ._errors import make_raise_for_status
35
+ from .types import ParseContext, ParseResultUpdate
36
+
37
+ INTERNAL_SECRET_HEADER = "X-Internal-Secret" # noqa: S105 — header name, not a secret value
38
+
39
+
40
class PayloadError(Exception):
    """Error raised by the Payload client, e.g. when Payload answers with a non-2xx status."""
42
+
43
+
44
+ _raise_for_status = make_raise_for_status(PayloadError, "Payload")
45
+
46
+
47
+ class PayloadClient:
48
+ """Async REST client. Use via ``async with`` to share the httpx pool."""
49
+
50
+ def __init__(
51
+ self,
52
+ *,
53
+ base_url: str,
54
+ internal_secret: str,
55
+ timeout: float = 60.0,
56
+ ) -> None:
57
+ if not base_url:
58
+ raise PayloadError("Payload base URL is required")
59
+ if not internal_secret:
60
+ raise PayloadError("Internal secret is required for plugin endpoints")
61
+ self._base_url = base_url.rstrip("/")
62
+ self._headers = {INTERNAL_SECRET_HEADER: internal_secret}
63
+ self._timeout = timeout
64
+ self._http: httpx.AsyncClient | None = None
65
+
66
+ async def __aenter__(self) -> Self:
67
+ self._http = httpx.AsyncClient(timeout=self._timeout)
68
+ return self
69
+
70
+ async def __aexit__(
71
+ self,
72
+ exc_type: type[BaseException] | None,
73
+ exc: BaseException | None,
74
+ tb: TracebackType | None,
75
+ ) -> None:
76
+ if self._http is not None:
77
+ await self._http.aclose()
78
+ self._http = None
79
+
80
+ async def fetch_parse_context(self, collection: str, doc_id: str | int) -> ParseContext:
81
+ """GET the plugin's internal read endpoint.
82
+
83
+ Returns a projection containing only the fields the worker needs to
84
+ drive the parse: ``id, url, filename, mimeType, language,
85
+ parsing_instruction, mode``.
86
+ """
87
+ path = self._endpoint(collection, doc_id, "parse-context")
88
+ response = await self._client().get(path, headers=self._headers)
89
+ _raise_for_status(response, f"GET {path}")
90
+ return response.json()
91
+
92
+ async def fetch_parse_file(self, collection: str, doc_id: str | int) -> bytes:
93
+ """GET the plugin's internal binary endpoint.
94
+
95
+ The plugin defers the actual storage read to a host-provided resolver
96
+ (S3/R2 GetObject, local fs, ...) and streams the result back.
97
+ """
98
+ path = self._endpoint(collection, doc_id, "parse-file")
99
+ response = await self._client().get(path, headers=self._headers)
100
+ _raise_for_status(response, f"GET {path}")
101
+ return response.content
102
+
103
+ async def submit_parse_result(
104
+ self,
105
+ collection: str,
106
+ doc_id: str | int,
107
+ data: ParseResultUpdate,
108
+ ) -> None:
109
+ """POST to the plugin's internal write endpoint.
110
+
111
+ Body is whitelisted server-side; only the parse_* fields are accepted
112
+ regardless of what we send.
113
+ """
114
+ path = self._endpoint(collection, doc_id, "parse-result")
115
+ response = await self._client().post(path, headers=self._headers, json=dict(data))
116
+ _raise_for_status(response, f"POST {path}")
117
+
118
+ def _client(self) -> httpx.AsyncClient:
119
+ if self._http is None:
120
+ raise PayloadError(
121
+ "PayloadClient must be used inside `async with` (httpx pool not initialised)"
122
+ )
123
+ return self._http
124
+
125
+ def _endpoint(self, collection: str, doc_id: str | int, op: str) -> str:
126
+ return f"{self._base_url}/api/{collection}/{doc_id}/{op}"
@@ -0,0 +1,44 @@
1
+ """Typed dicts for the JSON contracts the Payload plugin endpoints expose.
2
+
3
+ Mirrors the TypeScript types in `packages/payload-documents/src/plugin/types.ts`
4
+ and `endpoints/parse-{context,result}-endpoint.ts`. Worth duplicating because
5
+ the alternative — `dict[str, Any]` everywhere — drops type info at every
6
+ boundary.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Literal, NotRequired, TypedDict
12
+
13
+ ParseStatus = Literal["idle", "pending", "processing", "done", "error"]
14
+
15
+
16
+ class ParseContext(TypedDict):
17
+ """Response shape of `GET /api/<collection>/<id>/parse-context`.
18
+
19
+ Field set is hard-coded server-side (see `parse-context-endpoint.ts`); the
20
+ plugin only returns what the worker needs to drive the LlamaParse upload.
21
+ """
22
+
23
+ id: str | int
24
+ url: NotRequired[str | None]
25
+ filename: NotRequired[str | None]
26
+ mimeType: NotRequired[str | None]
27
+ language: NotRequired[str | None]
28
+ parsing_instruction: NotRequired[str | None]
29
+ mode: NotRequired[str | None]
30
+
31
+
32
+ class ParseResultUpdate(TypedDict, total=False):
33
+ """Request body for `POST /api/<collection>/<id>/parse-result`.
34
+
35
+ Fields are whitelisted server-side (see `parse-result-endpoint.ts`); any
36
+ keys outside this set are silently dropped, but typing them here means the
37
+ caller catches typos at lint/check time.
38
+ """
39
+
40
+ parsed_text: str | None
41
+ parse_status: ParseStatus
42
+ parse_error: str | None
43
+ parse_job_id: str | None
44
+ parsed_at: str | None
@@ -0,0 +1,67 @@
1
+ """Runtime configuration consumed by the worker library.
2
+
3
+ Mirrors the shape of `agno_agent_builder.RuntimeConfig`: one pydantic model
4
+ populated by the consumer, no env loading inside the lib so multi-tenant
5
+ deploys can build several `RuntimeConfig` instances from a single env file.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pydantic import BaseModel, Field, HttpUrl, SecretStr
11
+
12
+
13
class RuntimeConfig(BaseModel):
    """All knobs the consumer needs to fill in to run a worker.

    The attribute split mirrors the surface area of the upstream
    `agno-agent-builder` so consumers using both libraries see the same
    shape twice (just with `redis_url`/`payload_url`/`llama_cloud_api_key`
    instead of an `agent_source`).
    """

    app_name: str = Field(
        description="FastAPI title and structlog identity. Shows up in logs and /health.",
    )

    # ── Broker ─────────────────────────────────────────────────────────────
    redis_url: str = Field(
        description="Redis connection URL used by taskiq-redis as the broker (e.g. redis://redis:6379).",
    )

    # ── Payload CMS ────────────────────────────────────────────────────────
    payload_url: HttpUrl = Field(
        description="Base URL for the Payload REST API (e.g. http://app:3000).",
    )
    documents_collection_slug: str = Field(
        default="documents",
        description="Payload collection slug for documents. Must expose the `parse_*` fields shipped by `@zetesis/payload-documents`.",
    )

    # ── LlamaParse ─────────────────────────────────────────────────────────
    llama_cloud_api_key: SecretStr = Field(
        description="LlamaCloud API key used to upload + poll parsing jobs.",
    )
    llama_parse_base_url: HttpUrl = Field(
        default=HttpUrl("https://api.cloud.llamaindex.ai"),
        description="Override only if you point to a self-hosted LlamaCloud-compatible service.",
    )
    llama_parse_poll_interval_s: float = Field(
        default=5.0,
        description="Seconds between successive LlamaCloud status polls.",
    )
    llama_parse_poll_timeout_s: float = Field(
        default=600.0,
        description="Hard cap on how long a single parse task waits before failing.",
    )

    # ── Internal HTTP kicker ──────────────────────────────────────────────
    internal_secret: SecretStr = Field(
        description="Shared secret required by every `POST /tasks/*` request (X-Internal-Secret header).",
    )
    public_paths: tuple[str, ...] = Field(
        default=("/health", "/ready", "/docs", "/openapi.json"),
        description="Paths the InternalAuthMiddleware lets through without the secret.",
    )

    # ── Logging ───────────────────────────────────────────────────────────
    log_level: str = Field(
        default="INFO",
        description="Name of a stdlib `logging` level (e.g. DEBUG, INFO, WARNING); matched case-insensitively.",
    )
@@ -0,0 +1,87 @@
1
+ """HTTP kicker.
2
+
3
+ Tiny FastAPI app the consumer process exposes so the Next.js side (or any
4
+ HTTP client) can enqueue a task without speaking the taskiq Redis protocol.
5
+
6
+ * ``GET /health`` and ``GET /ready`` — Kubernetes / Compose probes (no auth).
7
+ * ``POST /tasks/parse-document`` — body ``{"document_id": "<id>"}``, gated by
8
+ the ``X-Internal-Secret`` header (matched against ``config.internal_secret``).
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import hmac
14
+ from typing import Any
15
+
16
+ import structlog
17
+ from fastapi import FastAPI, HTTPException, Request, status
18
+ from fastapi.responses import JSONResponse
19
+ from pydantic import BaseModel
20
+ from taskiq import AsyncBroker
21
+
22
+ from payload_documents_worker_builder.config import RuntimeConfig
23
+ from payload_documents_worker_builder.tasks import PARSE_DOCUMENT_TASK_NAME
24
+
25
+ logger = structlog.get_logger("payload_documents_worker_builder.http")
26
+
27
+
28
class ParseDocumentRequest(BaseModel):
    """Request body accepted by ``POST /tasks/parse-document``."""

    # Payload document id; passed unchanged to the task as its single argument.
    document_id: str
30
+
31
+
32
def create_http_app(broker: AsyncBroker, config: RuntimeConfig) -> FastAPI:
    """Build the FastAPI app the consumer hands to uvicorn.

    The app exposes ``GET /health`` and ``GET /ready`` plus
    ``POST /tasks/parse-document``, which enqueues the parse-document task on
    ``broker``. An HTTP middleware rejects with 403 any request whose path is
    not in ``config.public_paths`` and whose ``X-Internal-Secret`` header does
    not match ``config.internal_secret``.
    """
    app = FastAPI(title=config.app_name)

    @app.middleware("http")
    async def _internal_auth(request: Request, call_next: Any) -> Any:
        if request.url.path in config.public_paths:
            return await call_next(request)
        provided = request.headers.get("x-internal-secret", "")
        expected = config.internal_secret.get_secret_value()
        # hmac.compare_digest() accepts ``str`` operands only when both are
        # pure ASCII and raises TypeError otherwise, which would turn a header
        # containing non-ASCII characters into a 500. Comparing the UTF-8
        # bytes keeps the constant-time check and yields a plain 403 instead.
        if not hmac.compare_digest(provided.encode("utf-8"), expected.encode("utf-8")):
            return JSONResponse(
                {"error": "Forbidden"},
                status_code=status.HTTP_403_FORBIDDEN,
            )
        return await call_next(request)

    # NOTE(review): FastAPI documents ``on_event`` as deprecated in favour of a
    # ``lifespan`` handler; kept as-is here to avoid changing startup wiring.
    @app.on_event("startup")
    async def _startup() -> None:  # pyright: ignore[reportUnusedFunction]
        # Only start the broker when this process is the kicker, not a worker.
        if not broker.is_worker_process:
            await broker.startup()
            # NOTE(review): the URL is logged verbatim; if it embeds a password
            # that ends up in the logs — confirm this is acceptable.
            logger.info("Broker connected (kicker side)", url=config.redis_url)

    @app.on_event("shutdown")
    async def _shutdown() -> None:  # pyright: ignore[reportUnusedFunction]
        if not broker.is_worker_process:
            await broker.shutdown()

    @app.get("/health")
    async def health() -> dict[str, str]:  # pyright: ignore[reportUnusedFunction]
        return {"status": "ok"}

    @app.get("/ready")
    async def ready() -> dict[str, str]:  # pyright: ignore[reportUnusedFunction]
        return {"status": "ok"}

    @app.post("/tasks/parse-document", status_code=status.HTTP_202_ACCEPTED)
    async def kick_parse_document(  # pyright: ignore[reportUnusedFunction]
        body: ParseDocumentRequest,
    ) -> dict[str, str]:
        # Whitespace-only ids pass model validation (any ``str`` does), so
        # reject them explicitly.
        if not body.document_id.strip():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="document_id is required",
            )
        task = broker.find_task(PARSE_DOCUMENT_TASK_NAME)
        if task is None:
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Task {PARSE_DOCUMENT_TASK_NAME} is not registered",
            )
        await task.kiq(body.document_id)
        logger.info("Enqueued parse-document task", document_id=body.document_id)
        return {"status": "queued", "task": PARSE_DOCUMENT_TASK_NAME}

    return app
@@ -0,0 +1,32 @@
1
+ """Logging + structured boot for the worker process.
2
+
3
+ Mirrors the spirit of `agno_agent_builder.app.create_app` lifespan but kept
4
+ much simpler — there's no registry/listener to bootstrap. We just configure
5
+ structlog so taskiq + FastAPI logs share the same JSON sink.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+
12
+ import structlog
13
+
14
+ from payload_documents_worker_builder.config import RuntimeConfig
15
+
16
+
17
def configure_logging(config: RuntimeConfig) -> None:
    """Configure stdlib logging and structlog from ``config.log_level``.

    The level name is upper-cased and looked up on the ``logging`` module,
    falling back to ``logging.INFO`` when no such attribute exists. structlog
    is set up to render JSON and to filter below that level.
    """
    level_name = config.log_level.upper()
    level = getattr(logging, level_name, logging.INFO)
    logging.basicConfig(level=level, format="%(message)s")

    processor_chain = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
        structlog.processors.format_exc_info,
        # Renderer goes last: it turns the event dict into the final JSON line.
        structlog.processors.JSONRenderer(),
    ]
    structlog.configure(
        processors=processor_chain,
        wrapper_class=structlog.make_filtering_bound_logger(level),
        cache_logger_on_first_use=True,
    )
File without changes
@@ -0,0 +1,12 @@
1
+ """Built-in tasks.
2
+
3
+ Currently exposes the LlamaParse parse-document task. Adding more tasks later
4
+ is the same pattern: define them here next to a matching `register_*_task` function.
5
+ """
6
+
7
+ from payload_documents_worker_builder.tasks.parse_document import (
8
+ PARSE_DOCUMENT_TASK_NAME,
9
+ register_parse_document_task,
10
+ )
11
+
12
+ __all__ = ["PARSE_DOCUMENT_TASK_NAME", "register_parse_document_task"]
@@ -0,0 +1,195 @@
1
+ """Parse-document task.
2
+
3
+ Single responsibility: take a Payload document id, download its file, kick a
4
+ LlamaParse upload, poll until the result is ready, write the parsed markdown
5
+ back into Payload, and stamp `parse_status` accordingly.
6
+
7
+ The task is registered against a broker via ``register_parse_document_task``
8
+ so consumers can compose multiple workers on the same broker without us
9
+ hard-coding the binding.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import asyncio
15
+ import contextlib
16
+ from datetime import UTC, datetime
17
+
18
+ import httpx
19
+ import structlog
20
+ from taskiq import AsyncBroker
21
+
22
+ from payload_documents_worker_builder.clients.llama_parse import (
23
+ LlamaParseClient,
24
+ LlamaParseError,
25
+ LlamaParseJob,
26
+ )
27
+ from payload_documents_worker_builder.clients.payload import PayloadClient, PayloadError
28
+ from payload_documents_worker_builder.clients.types import ParseContext
29
+ from payload_documents_worker_builder.config import RuntimeConfig
30
+
31
+ PARSE_DOCUMENT_TASK_NAME = "documents.parse"
32
+ DEFAULT_FILENAME = "upload.bin"
33
+
34
+ logger = structlog.get_logger("payload_documents_worker_builder.parse_document")
35
+
36
+
37
def register_parse_document_task(broker: AsyncBroker, config: RuntimeConfig) -> None:
    """Register the ``documents.parse`` task on ``broker``.

    The task takes one string argument, the Payload document id, and runs
    the parse pipeline with ``config``. It is registered with
    ``retry_on_error=True`` and ``max_retries=2``.
    """

    @broker.task(
        task_name=PARSE_DOCUMENT_TASK_NAME,
        retry_on_error=True,
        max_retries=2,
    )
    async def parse_document(document_id: str) -> None:
        # ``config`` is captured from the enclosing call, so each registration
        # is bound to the config it was created with.
        await _run_parse_document(document_id, config)
55
+
56
+
57
async def _run_parse_document(document_id: str, config: RuntimeConfig) -> None:
    """Run the whole parse pipeline for one document.

    Steps, in order: mark the document ``processing``, fetch the parse
    context and file bytes from Payload, upload to LlamaParse, record the job
    id, poll until the job finishes, and write the markdown back.

    If a step fails with ``LlamaParseError``, ``PayloadError`` or
    ``httpx.HTTPError``, the failure is logged, a best-effort ``error``
    status is stamped on the document, and the exception is re-raised so the
    caller still sees it.
    """
    log = logger.bind(document_id=document_id, collection=config.documents_collection_slug)
    log.info("Parse document task started")

    async with (
        PayloadClient(
            base_url=str(config.payload_url),
            internal_secret=config.internal_secret.get_secret_value(),
        ) as payload,
        LlamaParseClient(
            api_key=config.llama_cloud_api_key.get_secret_value(),
            base_url=str(config.llama_parse_base_url),
        ) as llama,
    ):
        try:
            await _mark_processing(payload, config, document_id)
            ctx, file_bytes = await _fetch_inputs(payload, config, document_id, log)
            job = await _submit_to_llama(llama, ctx, file_bytes, log)
            await _record_job_id(payload, config, document_id, job)
            markdown = await _poll_until_done(llama, job.id, config, log)
            await _writeback_success(payload, config, document_id, markdown, log)
            log.info("Parse document task succeeded")
        except (LlamaParseError, PayloadError, httpx.HTTPError) as exc:
            # httpx.HTTPError covers timeouts and connection failures, which
            # the clients do not convert to their own error types. Without it
            # such a failure would leave the document in "processing" with no
            # error recorded.
            log.exception("Parse document task failed")
            await _stamp_error(payload, config, document_id, str(exc))
            raise
84
+
85
+
86
async def _mark_processing(payload: PayloadClient, config: RuntimeConfig, document_id: str) -> None:
    """Set the document's ``parse_status`` to ``processing`` and clear ``parse_error``."""
    await payload.submit_parse_result(
        collection=config.documents_collection_slug,
        doc_id=document_id,
        data={"parse_status": "processing", "parse_error": None},
    )
92
+
93
+
94
async def _fetch_inputs(
    payload: PayloadClient,
    config: RuntimeConfig,
    document_id: str,
    log: structlog.stdlib.BoundLogger,
) -> tuple[ParseContext, bytes]:
    """Fetch the parse context, then the file bytes, for ``document_id``.

    Returns ``(context, file_bytes)``. The context is fetched first so the
    resolved filename can be logged before the download starts.
    """
    collection = config.documents_collection_slug
    context = await payload.fetch_parse_context(collection, document_id)
    log.info("Downloading upload from Payload", filename=_resolve_filename(context))
    content = await payload.fetch_parse_file(collection, document_id)
    return context, content
104
+
105
+
106
async def _submit_to_llama(
    llama: LlamaParseClient,
    ctx: ParseContext,
    file_bytes: bytes,
    log: structlog.stdlib.BoundLogger,
) -> LlamaParseJob:
    """Upload ``file_bytes`` to LlamaParse and return the created job.

    The filename and the optional ``language`` / ``parsing_instruction`` /
    ``mode`` settings are taken from ``ctx``.
    """
    upload_name = _resolve_filename(ctx)
    log.info("Uploading to LlamaParse", filename=upload_name, size=len(file_bytes))

    # Missing context keys come back as None, which ``upload`` treats as "not set".
    created = await llama.upload(
        file_bytes=file_bytes,
        filename=upload_name,
        language=ctx.get("language"),
        parsing_instruction=ctx.get("parsing_instruction"),
        mode=ctx.get("mode"),
    )

    log.info("LlamaParse job created", llama_job_id=created.id)
    return created
123
+
124
+
125
async def _record_job_id(
    payload: PayloadClient, config: RuntimeConfig, document_id: str, job: LlamaParseJob
) -> None:
    """Store the LlamaParse job id on the document as ``parse_job_id``."""
    await payload.submit_parse_result(
        collection=config.documents_collection_slug,
        doc_id=document_id,
        data={"parse_job_id": job.id},
    )
133
+
134
+
135
async def _writeback_success(
    payload: PayloadClient,
    config: RuntimeConfig,
    document_id: str,
    markdown: str,
    log: structlog.stdlib.BoundLogger,
) -> None:
    """Write the parsed markdown back to Payload and mark the document ``done``.

    Also clears ``parse_error`` and sets ``parsed_at`` to the current UTC
    timestamp.
    """
    log.info("Parse complete; writing back to Payload", chars=len(markdown))
    await payload.submit_parse_result(
        collection=config.documents_collection_slug,
        doc_id=document_id,
        data={
            "parsed_text": markdown,
            "parse_status": "done",
            "parse_error": None,
            "parsed_at": _now_iso(),
        },
    )
153
+
154
+
155
async def _poll_until_done(
    client: LlamaParseClient,
    job_id: str,
    config: RuntimeConfig,
    log: structlog.stdlib.BoundLogger,
) -> str:
    """Poll the job until it finishes and return its markdown.

    Polls ``client.status`` every ``config.llama_parse_poll_interval_s``
    seconds until the job reports ``SUCCESS`` or the event-loop clock passes
    ``config.llama_parse_poll_timeout_s`` seconds from the start.

    Raises:
        LlamaParseError: if the job ends in ``ERROR`` or ``CANCELLED``, or if
            the deadline passes first.
    """
    # get_running_loop() is the supported way to reach the loop from inside a
    # coroutine; fetch it once instead of on every iteration.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + config.llama_parse_poll_timeout_s
    while loop.time() <= deadline:
        job = await client.status(job_id)
        if job.status == "SUCCESS":
            return await client.fetch_markdown(job_id)
        if job.status in ("ERROR", "CANCELLED"):
            raise LlamaParseError(
                f"LlamaParse job {job_id} ended in {job.status}: {job.error or 'no detail'}"
            )
        # Any other status is treated as "still running".
        log.debug("Polling LlamaParse", status=job.status)
        await asyncio.sleep(config.llama_parse_poll_interval_s)
    raise LlamaParseError(
        f"LlamaParse job {job_id} timed out after {config.llama_parse_poll_timeout_s}s"
    )
175
+
176
+
177
def _resolve_filename(ctx: ParseContext) -> str:
    """Return the context's ``filename`` if it is a non-empty string, else ``DEFAULT_FILENAME``."""
    candidate = ctx.get("filename")
    if isinstance(candidate, str) and candidate:
        return candidate
    return DEFAULT_FILENAME
180
+
181
+
182
async def _stamp_error(
    payload: PayloadClient, config: RuntimeConfig, document_id: str, message: str
) -> None:
    """Best-effort: mark the document as ``error`` with a truncated message.

    ``PayloadError`` and ``httpx.HTTPError`` raised by the write are swallowed
    so they cannot replace the exception the caller is about to re-raise.
    """
    # Keep only the first 500 characters of the message.
    truncated = message[:500]
    try:
        await payload.submit_parse_result(
            config.documents_collection_slug,
            document_id,
            {"parse_status": "error", "parse_error": truncated},
        )
    except (PayloadError, httpx.HTTPError):
        # Deliberately ignored — see docstring.
        pass
192
+
193
+
194
+ def _now_iso() -> str:
195
+ return datetime.now(UTC).isoformat()
@@ -0,0 +1,91 @@
1
+ Metadata-Version: 2.4
2
+ Name: payload-documents-worker-builder
3
+ Version: 0.1.0
4
+ Summary: Parametrizable taskiq-based worker for Payload CMS — FastAPI kick app + Redis stream broker + LlamaParse parse-document task.
5
+ Project-URL: Homepage, https://github.com/Zetesis-Labs/PayloadAgents
6
+ Project-URL: Repository, https://github.com/Zetesis-Labs/PayloadAgents
7
+ Project-URL: Issues, https://github.com/Zetesis-Labs/PayloadAgents/issues
8
+ Author: Zetesis Labs
9
+ License: MIT
10
+ Keywords: llamaparse,payload,postgres,taskiq,worker
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Framework :: FastAPI
13
+ Classifier: Intended Audience :: Developers
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Programming Language :: Python :: 3.12
16
+ Classifier: Programming Language :: Python :: 3.13
17
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
18
+ Classifier: Typing :: Typed
19
+ Requires-Python: >=3.12
20
+ Requires-Dist: fastapi>=0.115
21
+ Requires-Dist: httpx>=0.28
22
+ Requires-Dist: pydantic>=2.9
23
+ Requires-Dist: structlog>=24.1
24
+ Requires-Dist: taskiq-redis>=1.1.1
25
+ Requires-Dist: taskiq>=0.11.18
26
+ Requires-Dist: uvicorn[standard]>=0.34
27
+ Description-Content-Type: text/markdown
28
+
29
+ # payload-documents-worker-builder
30
+
31
+ Parametrizable [taskiq](https://taskiq-python.github.io/) worker for Payload CMS, ready to drop into a workspace.
32
+
33
+ Provides:
34
+
35
+ - `RuntimeConfig` — pydantic config (one place for everything: Redis URL, Payload base URL and service token, internal secret, LlamaCloud API key).
36
+ - `create_broker(config)` — Redis stream broker (via `taskiq-redis`) with smart retry middleware.
37
+ - `create_app(config)` — bundles broker + tasks + a small FastAPI HTTP "kicker" that exposes `POST /tasks/parse-document` so the Next.js side can enqueue parses without speaking the taskiq protocol directly.
38
+ - `register_parse_document_task` / `PARSE_DOCUMENT_TASK_NAME` — the built-in parse-document task: uploads a Payload document to LlamaParse, polls until done, and writes `parsed_text` + `parse_status` back via Payload REST.
39
+
40
+ ## Usage
41
+
42
+ ```python
43
+ from payload_documents_worker_builder import RuntimeConfig, create_app
44
+ from pydantic import SecretStr
45
+
46
+ config = RuntimeConfig(
47
+ app_name="my-worker",
48
+ redis_url="redis://redis:6379",
49
+ payload_url="http://app:3000",
50
+ payload_service_token=SecretStr("..."), # Payload API key with write access
51
+ llama_cloud_api_key=SecretStr("..."),
52
+ internal_secret=SecretStr("dev"),
53
+ documents_collection_slug="documents",
54
+ )
55
+
56
+ app, broker = create_app(config)
57
+ ```
58
+
59
+ Run two processes side by side:
60
+
61
+ ```bash
62
+ uvicorn my_worker.main:app --host 0.0.0.0 --port 8001 # HTTP kicker
63
+ taskiq worker my_worker.main:broker # task consumer
64
+ ```
65
+
66
+ ## Architecture
67
+
68
+ ```
69
+ Next.js (Payload) payload-documents-worker (uvicorn) payload-documents-worker (taskiq)
70
+ ───────────────────────── ──────────────────────────── ────────────────────────────
71
+ POST /api/documents/{id}/parse POST /tasks/parse-document consume `parse_document` task
72
+ ├ stamps parse_status='queued' ├ broker.kiq() ├ download file from Payload
73
+ └ HTTP→ kicker └ returns 202 ├ upload to LlamaCloud
74
+ ├ poll status
75
+ └ PATCH parsed_text/status
76
+ │ │
77
+ └─────────── Redis Stream ─────────┘
78
+ ```
79
+
80
+ ## Public API
81
+
82
+ ```python
83
+ from payload_documents_worker_builder import (
84
+ create_app,
85
+ create_broker,
86
+ RuntimeConfig,
87
+ LlamaParseClient,
88
+ PayloadClient,
89
+ register_parse_document_task,
90
+ )
91
+ ```
@@ -0,0 +1,17 @@
1
+ payload_documents_worker_builder/__init__.py,sha256=t7H8O1ZUnykr2f2hoqgfJUHw9FG_xHHaxHZwiXhDvDc,1063
2
+ payload_documents_worker_builder/app.py,sha256=WaEyrMC6zYuznED5-C-DOwAFVT_hJQSUq0FjbHvcrqs,1663
3
+ payload_documents_worker_builder/broker.py,sha256=ym7h9ZpSFSV86NQ9tPzL83ad0DIwRI1XMu7jzv79xB0,700
4
+ payload_documents_worker_builder/config.py,sha256=9kCybTsTpvcb6VqARED4RfnT3PQ5gIH2WeLuO3Q6mdE,3357
5
+ payload_documents_worker_builder/http.py,sha256=WVhTlKQ3S-TXy6kuiVYiY_w0SE1TZD5n46hgo6GKRYo,3302
6
+ payload_documents_worker_builder/lifecycle.py,sha256=x7n5lyy8plcT3o8QlLjMkze9ZplXX7seHWnNlN3mhHw,1157
7
+ payload_documents_worker_builder/clients/__init__.py,sha256=-GWLn6bOAc7L6kfhIEBnPnuqEQrsXxXERzNlg6o3Np0,457
8
+ payload_documents_worker_builder/clients/_errors.py,sha256=Ew7tPfrkVsFMDgdw6WsdGBcFUMi_Z4Lh7IlTRTHEVfo,898
9
+ payload_documents_worker_builder/clients/llama_parse.py,sha256=14rS_9Xi19NH0FLB8Ag2_m1Wo7q7kF8Zxxu_VThcBhk,4277
10
+ payload_documents_worker_builder/clients/payload.py,sha256=_9iuF3XSHDYAXFEY5XEa-VeDjxtVy7QjvDXHfetjJdE,4663
11
+ payload_documents_worker_builder/clients/types.py,sha256=teYbccve1IzuE_6l2sbsNwJUQ-zEMsEdq4VVEFG70YY,1475
12
+ payload_documents_worker_builder/tasks/__init__.py,sha256=4QSOzdDP8vCc_3VGwXzuRZWYK5inpQ8i0b9vIIBzAr8,384
13
+ payload_documents_worker_builder/tasks/parse_document.py,sha256=KWk-O4HQRG-C-5654cyDdAzxBpnZvtcU7cI-fydTXDQ,6776
14
+ payload_documents_worker_builder/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
15
+ payload_documents_worker_builder-0.1.0.dist-info/METADATA,sha256=MKb3Y3qTIGiOTK3kRJITp3wefkZY0aH883ozjgTUpJ0,3907
16
+ payload_documents_worker_builder-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
17
+ payload_documents_worker_builder-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any