payload-documents-worker-builder 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- payload_documents_worker_builder-0.1.0/.gitignore +7 -0
- payload_documents_worker_builder-0.1.0/PKG-INFO +91 -0
- payload_documents_worker_builder-0.1.0/README.md +63 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/__init__.py +39 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/app.py +49 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/broker.py +19 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/clients/__init__.py +18 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/clients/_errors.py +28 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/clients/llama_parse.py +136 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/clients/payload.py +126 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/clients/types.py +44 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/config.py +67 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/http.py +87 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/lifecycle.py +32 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/py.typed +0 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/tasks/__init__.py +12 -0
- payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/tasks/parse_document.py +195 -0
- payload_documents_worker_builder-0.1.0/pyproject.toml +43 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: payload-documents-worker-builder
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Parametrizable taskiq-based worker for Payload CMS — FastAPI kicker app + Redis Streams broker + LlamaParse parse-document task.
|
|
5
|
+
Project-URL: Homepage, https://github.com/Zetesis-Labs/PayloadAgents
|
|
6
|
+
Project-URL: Repository, https://github.com/Zetesis-Labs/PayloadAgents
|
|
7
|
+
Project-URL: Issues, https://github.com/Zetesis-Labs/PayloadAgents/issues
|
|
8
|
+
Author: Zetesis Labs
|
|
9
|
+
License: MIT
|
|
10
|
+
Keywords: llamaparse,payload,postgres,taskiq,worker
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Framework :: FastAPI
|
|
13
|
+
Classifier: Intended Audience :: Developers
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Classifier: Typing :: Typed
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Requires-Dist: fastapi>=0.115
|
|
21
|
+
Requires-Dist: httpx>=0.28
|
|
22
|
+
Requires-Dist: pydantic>=2.9
|
|
23
|
+
Requires-Dist: structlog>=24.1
|
|
24
|
+
Requires-Dist: taskiq-redis>=1.1.1
|
|
25
|
+
Requires-Dist: taskiq>=0.11.18
|
|
26
|
+
Requires-Dist: uvicorn[standard]>=0.34
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
|
|
29
|
+
# payload-documents-worker-builder
|
|
30
|
+
|
|
31
|
+
Parametrizable [taskiq](https://taskiq-python.github.io/) worker for Payload CMS, ready to drop into a workspace.
|
|
32
|
+
|
|
33
|
+
Provides:
|
|
34
|
+
|
|
35
|
+
- `RuntimeConfig` — pydantic config (one place for everything: Redis URL, Payload base URL, internal secret, LlamaCloud API key).
|
|
36
|
+
- `create_broker(config)` — Redis stream broker (via `taskiq-redis`) with smart retry middleware.
|
|
37
|
+
- `create_app(config)` — bundles broker + tasks + a small FastAPI HTTP "kicker" that exposes `POST /tasks/parse-document` so the Next.js side can enqueue parses without speaking the taskiq protocol directly.
|
|
38
|
+
- `register_parse_document_task(broker, config)` — registers the built-in `parse_document` task, which uploads a Payload document to LlamaParse, polls until done, and writes `parsed_text` + `parse_status` back via Payload REST.
|
|
39
|
+
|
|
40
|
+
## Usage
|
|
41
|
+
|
|
42
|
+
```python
|
|
43
|
+
from payload_documents_worker_builder import RuntimeConfig, create_app
|
|
44
|
+
from pydantic import SecretStr
|
|
45
|
+
|
|
46
|
+
config = RuntimeConfig(
|
|
47
|
+
app_name="my-worker",
|
|
48
|
+
redis_url="redis://redis:6379",
|
|
49
|
+
payload_url="http://app:3000",
|
|
50
|
+
payload_service_token=SecretStr("..."), # Payload API key with write access
|
|
51
|
+
llama_cloud_api_key=SecretStr("..."),
|
|
52
|
+
internal_secret=SecretStr("dev"),
|
|
53
|
+
documents_collection_slug="documents",
|
|
54
|
+
)
|
|
55
|
+
|
|
56
|
+
app, broker = create_app(config)
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
Run two processes side by side:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
uvicorn my_worker.main:app --host 0.0.0.0 --port 8001 # HTTP kicker
|
|
63
|
+
taskiq worker my_worker.main:broker # task consumer
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Architecture
|
|
67
|
+
|
|
68
|
+
```
|
|
69
|
+
Next.js (Payload) payload-documents-worker (uvicorn) payload-documents-worker (taskiq)
|
|
70
|
+
───────────────────────── ──────────────────────────── ────────────────────────────
|
|
71
|
+
POST /api/documents/{id}/parse POST /tasks/parse-document consume `parse_document` task
|
|
72
|
+
├ stamps parse_status='queued' ├ broker.kiq() ├ download file from Payload
|
|
73
|
+
└ HTTP→ kicker └ returns 202 ├ upload to LlamaCloud
|
|
74
|
+
├ poll status
|
|
75
|
+
└ PATCH parsed_text/status
|
|
76
|
+
│ │
|
|
77
|
+
└─────────── Redis Stream ─────────┘
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Public API
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from payload_documents_worker_builder import (
|
|
84
|
+
create_app,
|
|
85
|
+
create_broker,
|
|
86
|
+
RuntimeConfig,
|
|
87
|
+
LlamaParseClient,
|
|
88
|
+
PayloadClient,
|
|
89
|
+
register_parse_document_task,
|
|
90
|
+
)
|
|
91
|
+
```
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
# payload-documents-worker-builder
|
|
2
|
+
|
|
3
|
+
Parametrizable [taskiq](https://taskiq-python.github.io/) worker for Payload CMS, ready to drop into a workspace.
|
|
4
|
+
|
|
5
|
+
Provides:
|
|
6
|
+
|
|
7
|
+
- `RuntimeConfig` — pydantic config (one place for everything: Redis URL, Payload base URL, internal secret, LlamaCloud API key).
|
|
8
|
+
- `create_broker(config)` — Redis stream broker (via `taskiq-redis`) with smart retry middleware.
|
|
9
|
+
- `create_app(config)` — bundles broker + tasks + a small FastAPI HTTP "kicker" that exposes `POST /tasks/parse-document` so the Next.js side can enqueue parses without speaking the taskiq protocol directly.
|
|
10
|
+
- `register_parse_document_task(broker, config)` — registers the built-in `parse_document` task, which uploads a Payload document to LlamaParse, polls until done, and writes `parsed_text` + `parse_status` back via Payload REST.
|
|
11
|
+
|
|
12
|
+
## Usage
|
|
13
|
+
|
|
14
|
+
```python
|
|
15
|
+
from payload_documents_worker_builder import RuntimeConfig, create_app
|
|
16
|
+
from pydantic import SecretStr
|
|
17
|
+
|
|
18
|
+
config = RuntimeConfig(
|
|
19
|
+
app_name="my-worker",
|
|
20
|
+
redis_url="redis://redis:6379",
|
|
21
|
+
payload_url="http://app:3000",
|
|
22
|
+
payload_service_token=SecretStr("..."), # Payload API key with write access
|
|
23
|
+
llama_cloud_api_key=SecretStr("..."),
|
|
24
|
+
internal_secret=SecretStr("dev"),
|
|
25
|
+
documents_collection_slug="documents",
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
app, broker = create_app(config)
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
Run two processes side by side:
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
uvicorn my_worker.main:app --host 0.0.0.0 --port 8001 # HTTP kicker
|
|
35
|
+
taskiq worker my_worker.main:broker # task consumer
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Architecture
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
Next.js (Payload) payload-documents-worker (uvicorn) payload-documents-worker (taskiq)
|
|
42
|
+
───────────────────────── ──────────────────────────── ────────────────────────────
|
|
43
|
+
POST /api/documents/{id}/parse POST /tasks/parse-document consume `parse_document` task
|
|
44
|
+
├ stamps parse_status='queued' ├ broker.kiq() ├ download file from Payload
|
|
45
|
+
└ HTTP→ kicker └ returns 202 ├ upload to LlamaCloud
|
|
46
|
+
├ poll status
|
|
47
|
+
└ PATCH parsed_text/status
|
|
48
|
+
│ │
|
|
49
|
+
└─────────── Redis Stream ─────────┘
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Public API
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from payload_documents_worker_builder import (
|
|
56
|
+
create_app,
|
|
57
|
+
create_broker,
|
|
58
|
+
RuntimeConfig,
|
|
59
|
+
LlamaParseClient,
|
|
60
|
+
PayloadClient,
|
|
61
|
+
register_parse_document_task,
|
|
62
|
+
)
|
|
63
|
+
```
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Public API for `payload-documents-worker-builder`.
|
|
2
|
+
|
|
3
|
+
Mirrors the shape of `agno_agent_builder.__init__`: a single import surface
|
|
4
|
+
that exposes the factory, the config, the clients and the tasks. Consumers
|
|
5
|
+
should never reach into submodules.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from payload_documents_worker_builder.app import WorkerApp, create_app
|
|
9
|
+
from payload_documents_worker_builder.broker import create_broker
|
|
10
|
+
from payload_documents_worker_builder.clients import (
|
|
11
|
+
LlamaParseClient,
|
|
12
|
+
LlamaParseError,
|
|
13
|
+
LlamaParseJob,
|
|
14
|
+
LlamaParseStatus,
|
|
15
|
+
PayloadClient,
|
|
16
|
+
PayloadError,
|
|
17
|
+
)
|
|
18
|
+
from payload_documents_worker_builder.config import RuntimeConfig
|
|
19
|
+
from payload_documents_worker_builder.tasks import (
|
|
20
|
+
PARSE_DOCUMENT_TASK_NAME,
|
|
21
|
+
register_parse_document_task,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
__all__ = [
|
|
25
|
+
"PARSE_DOCUMENT_TASK_NAME",
|
|
26
|
+
"LlamaParseClient",
|
|
27
|
+
"LlamaParseError",
|
|
28
|
+
"LlamaParseJob",
|
|
29
|
+
"LlamaParseStatus",
|
|
30
|
+
"PayloadClient",
|
|
31
|
+
"PayloadError",
|
|
32
|
+
"RuntimeConfig",
|
|
33
|
+
"WorkerApp",
|
|
34
|
+
"create_app",
|
|
35
|
+
"create_broker",
|
|
36
|
+
"register_parse_document_task",
|
|
37
|
+
]
|
|
38
|
+
|
|
39
|
+
__version__ = "0.1.0"
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Top-level factory.
|
|
2
|
+
|
|
3
|
+
Consumers call ``create_app(config)`` and get back two ready-to-run handles:
|
|
4
|
+
|
|
5
|
+
* ``broker`` — the taskiq broker. Pass it to the taskiq CLI:
|
|
6
|
+
``taskiq worker my_worker.main:broker``.
|
|
7
|
+
* ``app`` — the FastAPI HTTP kicker. Run with uvicorn:
|
|
8
|
+
``uvicorn my_worker.main:app``.
|
|
9
|
+
|
|
10
|
+
Both share the same ``RuntimeConfig`` so logs, retries, and credentials line
|
|
11
|
+
up across processes.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from collections.abc import Iterator
|
|
17
|
+
from dataclasses import dataclass
|
|
18
|
+
|
|
19
|
+
from fastapi import FastAPI
|
|
20
|
+
from taskiq import AsyncBroker
|
|
21
|
+
|
|
22
|
+
from payload_documents_worker_builder.broker import create_broker
|
|
23
|
+
from payload_documents_worker_builder.config import RuntimeConfig
|
|
24
|
+
from payload_documents_worker_builder.http import create_http_app
|
|
25
|
+
from payload_documents_worker_builder.lifecycle import configure_logging
|
|
26
|
+
from payload_documents_worker_builder.tasks import register_parse_document_task
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(slots=True, frozen=True)
|
|
30
|
+
class WorkerApp:
|
|
31
|
+
"""Bundle returned by :func:`create_app`. Exposed as a dataclass so consumers
|
|
32
|
+
can ``app, broker = create_app(config)`` (`__iter__` below) or address
|
|
33
|
+
fields by name explicitly."""
|
|
34
|
+
|
|
35
|
+
app: FastAPI
|
|
36
|
+
broker: AsyncBroker
|
|
37
|
+
|
|
38
|
+
def __iter__(self) -> Iterator[FastAPI | AsyncBroker]:
|
|
39
|
+
yield self.app
|
|
40
|
+
yield self.broker
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def create_app(config: RuntimeConfig) -> WorkerApp:
    """Assemble the worker from a single ``RuntimeConfig``.

    Logging is configured first, then the broker is created, the built-in
    parse-document task is registered on it, and finally the FastAPI kicker
    is built around that same broker.

    Args:
        config: Runtime settings shared by the broker, the task and the
            HTTP app.

    Returns:
        A :class:`WorkerApp` holding the FastAPI app and the broker.
    """
    configure_logging(config)
    task_broker = create_broker(config)
    register_parse_document_task(task_broker, config)
    return WorkerApp(
        app=create_http_app(task_broker, config),
        broker=task_broker,
    )
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
"""Broker factory.
|
|
2
|
+
|
|
3
|
+
Returns a configured `RedisStreamBroker` with `SmartRetryMiddleware`. Mirrors
|
|
4
|
+
the pattern from nixon's `nixon_worker_core.broker_factory.create_broker`:
|
|
5
|
+
one place that owns broker config so consumers never instantiate
|
|
6
|
+
`RedisStreamBroker` directly.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from taskiq import AsyncBroker, SmartRetryMiddleware
|
|
12
|
+
from taskiq_redis import RedisStreamBroker
|
|
13
|
+
|
|
14
|
+
from payload_documents_worker_builder.config import RuntimeConfig
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def create_broker(config: RuntimeConfig) -> AsyncBroker:
    """Create the taskiq broker that the consumer's ``main.py`` exposes.

    Args:
        config: Runtime settings; only ``redis_url`` is read here.

    Returns:
        A ``RedisStreamBroker`` connected to ``config.redis_url`` with a
        ``SmartRetryMiddleware`` attached.
    """
    stream_broker = RedisStreamBroker(url=config.redis_url)
    return stream_broker.with_middlewares(SmartRetryMiddleware())
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""HTTP clients used by built-in tasks (Payload + LlamaParse)."""
|
|
2
|
+
|
|
3
|
+
from payload_documents_worker_builder.clients.llama_parse import (
|
|
4
|
+
LlamaParseClient,
|
|
5
|
+
LlamaParseError,
|
|
6
|
+
LlamaParseJob,
|
|
7
|
+
LlamaParseStatus,
|
|
8
|
+
)
|
|
9
|
+
from payload_documents_worker_builder.clients.payload import PayloadClient, PayloadError
|
|
10
|
+
|
|
11
|
+
__all__ = [
|
|
12
|
+
"LlamaParseClient",
|
|
13
|
+
"LlamaParseError",
|
|
14
|
+
"LlamaParseJob",
|
|
15
|
+
"LlamaParseStatus",
|
|
16
|
+
"PayloadClient",
|
|
17
|
+
"PayloadError",
|
|
18
|
+
]
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Shared error-handling helpers for HTTP clients in this package."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from collections.abc import Callable
|
|
6
|
+
|
|
7
|
+
import httpx
|
|
8
|
+
|
|
9
|
+
# Maximum number of response-body characters copied into an error message.
_DETAIL_TRUNCATE = 500


def make_raise_for_status(
    exc_cls: type[Exception], prefix: str
) -> Callable[[httpx.Response, str], None]:
    """Build a status checker bound to one exception class and message prefix.

    Every client in this package reports non-success responses the same way
    and differs only in the exception type and the label at the start of the
    message, so both are captured here once.

    Args:
        exc_cls: Exception type the returned checker raises.
        prefix: Label placed at the start of the error message.

    Returns:
        A callable ``(response, op)`` that returns ``None`` when
        ``response.is_success`` is true and otherwise raises ``exc_cls`` with
        ``<prefix> <op> failed: HTTP <code> — <body>``, where the body is cut
        to ``_DETAIL_TRUNCATE`` characters.
    """

    def _raise_for_status(response: httpx.Response, op: str) -> None:
        if not response.is_success:
            body_excerpt = response.text[:_DETAIL_TRUNCATE]
            message = f"{prefix} {op} failed: HTTP {response.status_code} — {body_excerpt}"
            raise exc_cls(message)

    return _raise_for_status
|
payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/clients/llama_parse.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
"""Minimal async LlamaParse client.
|
|
2
|
+
|
|
3
|
+
Ported from `packages/payload-documents/src/llama-parse/client.ts`. Only the
|
|
4
|
+
endpoints the parse-document task needs:
|
|
5
|
+
|
|
6
|
+
* ``POST /api/parsing/upload`` — kick off a parse job
|
|
7
|
+
* ``GET /api/parsing/job/{id}`` — poll status
|
|
8
|
+
* ``GET /api/parsing/job/{id}/result/markdown`` — fetch parsed markdown
|
|
9
|
+
|
|
10
|
+
Use as an async context manager so the underlying ``httpx.AsyncClient`` (and
|
|
11
|
+
its connection pool) is shared across all calls within one task instead of a
|
|
12
|
+
new TLS handshake per request::
|
|
13
|
+
|
|
14
|
+
async with LlamaParseClient(api_key=...) as client:
|
|
15
|
+
job = await client.upload(...)
|
|
16
|
+
...
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from types import TracebackType
|
|
23
|
+
from typing import Any, Literal, Self
|
|
24
|
+
|
|
25
|
+
import httpx
|
|
26
|
+
|
|
27
|
+
from ._errors import make_raise_for_status
|
|
28
|
+
|
|
29
|
+
LlamaParseStatus = Literal["PENDING", "SUCCESS", "ERROR", "CANCELLED"]
|
|
30
|
+
DEFAULT_BASE_URL = "https://api.cloud.llamaindex.ai"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class LlamaParseError(Exception):
    """Error raised by the LlamaParse client.

    Raised for a non-2xx response, a missing API key, a result without
    markdown, or use of the client outside ``async with``.
    """
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
_raise_for_status = make_raise_for_status(LlamaParseError, "LlamaParse")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass(slots=True)
|
|
41
|
+
class LlamaParseJob:
|
|
42
|
+
id: str
|
|
43
|
+
status: LlamaParseStatus
|
|
44
|
+
error: str | None = None
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class LlamaParseClient:
    """Small asynchronous LlamaParse client built on ``httpx``.

    Enter it with ``async with`` before calling any request method: the
    ``httpx.AsyncClient`` is created on entry and closed on exit, so every
    call made inside the block shares one client.
    """

    def __init__(
        self,
        *,
        api_key: str,
        base_url: str = DEFAULT_BASE_URL,
        timeout: float = 60.0,
    ) -> None:
        """Store connection settings; no HTTP client is created here.

        Args:
            api_key: LlamaCloud API key, sent as a bearer token.
            base_url: Service root; trailing slashes are stripped.
            timeout: Timeout passed to ``httpx.AsyncClient``.

        Raises:
            LlamaParseError: If ``api_key`` is empty.
        """
        if not api_key:
            raise LlamaParseError("LlamaParse API key is required")
        self._base_url = base_url.rstrip("/")
        self._timeout = timeout
        self._headers = {"Authorization": f"Bearer {api_key}"}
        self._http: httpx.AsyncClient | None = None

    async def __aenter__(self) -> Self:
        """Create the shared ``httpx.AsyncClient`` and return this client."""
        self._http = httpx.AsyncClient(timeout=self._timeout)
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc: BaseException | None,
        tb: TracebackType | None,
    ) -> None:
        """Close the shared HTTP client, if one is open, and forget it."""
        if self._http is None:
            return
        await self._http.aclose()
        self._http = None

    async def upload(
        self,
        *,
        file_bytes: bytes,
        filename: str,
        language: str | None = None,
        parsing_instruction: str | None = None,
        mode: str | None = None,
    ) -> LlamaParseJob:
        """Upload a file to ``/api/parsing/upload`` and return the created job.

        Optional parser settings are sent as form fields only when they are
        not ``None``; ``mode`` is sent under the ``parse_mode`` field name.

        Raises:
            LlamaParseError: On a non-2xx response or when called outside
                ``async with``.
        """
        optional_fields = (
            ("language", language),
            ("parsing_instruction", parsing_instruction),
            ("parse_mode", mode),
        )
        form: dict[str, Any] = {
            key: value for key, value in optional_fields if value is not None
        }
        upload_url = f"{self._base_url}/api/parsing/upload"

        response = await self._client().post(
            upload_url,
            headers=self._headers,
            files={"file": (filename, file_bytes)},
            data=form,
        )
        _raise_for_status(response, "upload")
        return _parse_job(response.json())

    async def status(self, job_id: str) -> LlamaParseJob:
        """Fetch the current state of job ``job_id``.

        Raises:
            LlamaParseError: On a non-2xx response or when called outside
                ``async with``.
        """
        job_url = f"{self._base_url}/api/parsing/job/{job_id}"
        response = await self._client().get(job_url, headers=self._headers)
        _raise_for_status(response, "status")
        return _parse_job(response.json())

    async def fetch_markdown(self, job_id: str) -> str:
        """Return the parsed markdown for job ``job_id``.

        Raises:
            LlamaParseError: On a non-2xx response, when the JSON body has no
                string under ``markdown``, or when called outside
                ``async with``.
        """
        result_url = f"{self._base_url}/api/parsing/job/{job_id}/result/markdown"
        response = await self._client().get(result_url, headers=self._headers)
        _raise_for_status(response, "fetch_markdown")
        body = response.json()
        markdown = body.get("markdown")
        if isinstance(markdown, str):
            return markdown
        raise LlamaParseError(f"LlamaParse returned no markdown for job {job_id}")

    def _client(self) -> httpx.AsyncClient:
        """Return the open HTTP client, or raise if ``async with`` was not entered."""
        http = self._http
        if http is not None:
            return http
        raise LlamaParseError(
            "LlamaParseClient must be used inside `async with` (httpx pool not initialised)"
        )
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _parse_job(payload: dict[str, Any]) -> LlamaParseJob:
    """Convert a decoded job JSON object into a :class:`LlamaParseJob`.

    ``id`` is mandatory (a missing key raises ``KeyError``); ``status`` falls
    back to ``"PENDING"`` and ``error`` to ``None`` when absent.
    """
    job_id = payload["id"]
    state = payload.get("status", "PENDING")
    failure = payload.get("error")
    return LlamaParseJob(id=job_id, status=state, error=failure)
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Tiny Payload CMS REST client used by built-in tasks.
|
|
2
|
+
|
|
3
|
+
The worker only needs:
|
|
4
|
+
|
|
5
|
+
* fetch the document context (parser knobs + filename)
|
|
6
|
+
* fetch the binary attached to that document
|
|
7
|
+
* stamp parse results back (parsed_text / parse_status / parse_error / ...)
|
|
8
|
+
|
|
9
|
+
All Payload-side calls go through dedicated internal endpoints exposed by the
|
|
10
|
+
``payload-documents`` plugin and authenticated with the shared
|
|
11
|
+
``X-Internal-Secret`` header. The endpoints use Payload's local API with
|
|
12
|
+
``overrideAccess: true`` server-side and the binary endpoint defers to a
|
|
13
|
+
host-provided resolver for the actual storage read, so the plugin stays
|
|
14
|
+
storage-agnostic and host apps can keep the documents collection's access
|
|
15
|
+
control honestly locked down (multi-tenant filters, admin-only writes, etc.)
|
|
16
|
+
without poking a service-account bypass into it.
|
|
17
|
+
|
|
18
|
+
Use as an async context manager so the underlying ``httpx.AsyncClient`` (and
|
|
19
|
+
its connection pool) is shared across all calls within one task instead of a
|
|
20
|
+
new TLS handshake per request::
|
|
21
|
+
|
|
22
|
+
async with PayloadClient(base_url=..., internal_secret=...) as client:
|
|
23
|
+
ctx = await client.fetch_parse_context(slug, doc_id)
|
|
24
|
+
...
|
|
25
|
+
"""
|
|
26
|
+
|
|
27
|
+
from __future__ import annotations
|
|
28
|
+
|
|
29
|
+
from types import TracebackType
|
|
30
|
+
from typing import Self
|
|
31
|
+
|
|
32
|
+
import httpx
|
|
33
|
+
|
|
34
|
+
from ._errors import make_raise_for_status
|
|
35
|
+
from .types import ParseContext, ParseResultUpdate
|
|
36
|
+
|
|
37
|
+
INTERNAL_SECRET_HEADER = "X-Internal-Secret" # noqa: S105 — header name, not a secret value
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class PayloadError(Exception):
    """Error raised by the Payload client.

    Raised for a non-2xx Payload response, missing constructor arguments, or
    use of the client outside ``async with``.
    """
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
_raise_for_status = make_raise_for_status(PayloadError, "Payload")
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class PayloadClient:
|
|
48
|
+
"""Async REST client. Use via ``async with`` to share the httpx pool."""
|
|
49
|
+
|
|
50
|
+
def __init__(
|
|
51
|
+
self,
|
|
52
|
+
*,
|
|
53
|
+
base_url: str,
|
|
54
|
+
internal_secret: str,
|
|
55
|
+
timeout: float = 60.0,
|
|
56
|
+
) -> None:
|
|
57
|
+
if not base_url:
|
|
58
|
+
raise PayloadError("Payload base URL is required")
|
|
59
|
+
if not internal_secret:
|
|
60
|
+
raise PayloadError("Internal secret is required for plugin endpoints")
|
|
61
|
+
self._base_url = base_url.rstrip("/")
|
|
62
|
+
self._headers = {INTERNAL_SECRET_HEADER: internal_secret}
|
|
63
|
+
self._timeout = timeout
|
|
64
|
+
self._http: httpx.AsyncClient | None = None
|
|
65
|
+
|
|
66
|
+
async def __aenter__(self) -> Self:
|
|
67
|
+
self._http = httpx.AsyncClient(timeout=self._timeout)
|
|
68
|
+
return self
|
|
69
|
+
|
|
70
|
+
async def __aexit__(
|
|
71
|
+
self,
|
|
72
|
+
exc_type: type[BaseException] | None,
|
|
73
|
+
exc: BaseException | None,
|
|
74
|
+
tb: TracebackType | None,
|
|
75
|
+
) -> None:
|
|
76
|
+
if self._http is not None:
|
|
77
|
+
await self._http.aclose()
|
|
78
|
+
self._http = None
|
|
79
|
+
|
|
80
|
+
async def fetch_parse_context(self, collection: str, doc_id: str | int) -> ParseContext:
|
|
81
|
+
"""GET the plugin's internal read endpoint.
|
|
82
|
+
|
|
83
|
+
Returns a projection containing only the fields the worker needs to
|
|
84
|
+
drive the parse: ``id, url, filename, mimeType, language,
|
|
85
|
+
parsing_instruction, mode``.
|
|
86
|
+
"""
|
|
87
|
+
path = self._endpoint(collection, doc_id, "parse-context")
|
|
88
|
+
response = await self._client().get(path, headers=self._headers)
|
|
89
|
+
_raise_for_status(response, f"GET {path}")
|
|
90
|
+
return response.json()
|
|
91
|
+
|
|
92
|
+
async def fetch_parse_file(self, collection: str, doc_id: str | int) -> bytes:
|
|
93
|
+
"""GET the plugin's internal binary endpoint.
|
|
94
|
+
|
|
95
|
+
The plugin defers the actual storage read to a host-provided resolver
|
|
96
|
+
(S3/R2 GetObject, local fs, ...) and streams the result back.
|
|
97
|
+
"""
|
|
98
|
+
path = self._endpoint(collection, doc_id, "parse-file")
|
|
99
|
+
response = await self._client().get(path, headers=self._headers)
|
|
100
|
+
_raise_for_status(response, f"GET {path}")
|
|
101
|
+
return response.content
|
|
102
|
+
|
|
103
|
+
async def submit_parse_result(
|
|
104
|
+
self,
|
|
105
|
+
collection: str,
|
|
106
|
+
doc_id: str | int,
|
|
107
|
+
data: ParseResultUpdate,
|
|
108
|
+
) -> None:
|
|
109
|
+
"""POST to the plugin's internal write endpoint.
|
|
110
|
+
|
|
111
|
+
Body is whitelisted server-side; only the parse_* fields are accepted
|
|
112
|
+
regardless of what we send.
|
|
113
|
+
"""
|
|
114
|
+
path = self._endpoint(collection, doc_id, "parse-result")
|
|
115
|
+
response = await self._client().post(path, headers=self._headers, json=dict(data))
|
|
116
|
+
_raise_for_status(response, f"POST {path}")
|
|
117
|
+
|
|
118
|
+
def _client(self) -> httpx.AsyncClient:
|
|
119
|
+
if self._http is None:
|
|
120
|
+
raise PayloadError(
|
|
121
|
+
"PayloadClient must be used inside `async with` (httpx pool not initialised)"
|
|
122
|
+
)
|
|
123
|
+
return self._http
|
|
124
|
+
|
|
125
|
+
def _endpoint(self, collection: str, doc_id: str | int, op: str) -> str:
|
|
126
|
+
return f"{self._base_url}/api/{collection}/{doc_id}/{op}"
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""Typed dicts for the JSON contracts the Payload plugin endpoints expose.
|
|
2
|
+
|
|
3
|
+
Mirrors the TypeScript types in `packages/payload-documents/src/plugin/types.ts`
|
|
4
|
+
and `endpoints/parse-{context,result}-endpoint.ts`. Worth duplicating because
|
|
5
|
+
the alternative — `dict[str, Any]` everywhere — drops type info at every
|
|
6
|
+
boundary.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Literal, NotRequired, TypedDict
|
|
12
|
+
|
|
13
|
+
ParseStatus = Literal["idle", "pending", "processing", "done", "error"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ParseContext(TypedDict):
|
|
17
|
+
"""Response shape of `GET /api/<collection>/<id>/parse-context`.
|
|
18
|
+
|
|
19
|
+
Field set is hard-coded server-side (see `parse-context-endpoint.ts`); the
|
|
20
|
+
plugin only returns what the worker needs to drive the LlamaParse upload.
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
id: str | int
|
|
24
|
+
url: NotRequired[str | None]
|
|
25
|
+
filename: NotRequired[str | None]
|
|
26
|
+
mimeType: NotRequired[str | None]
|
|
27
|
+
language: NotRequired[str | None]
|
|
28
|
+
parsing_instruction: NotRequired[str | None]
|
|
29
|
+
mode: NotRequired[str | None]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class ParseResultUpdate(TypedDict, total=False):
|
|
33
|
+
"""Request body for `POST /api/<collection>/<id>/parse-result`.
|
|
34
|
+
|
|
35
|
+
Fields are whitelisted server-side (see `parse-result-endpoint.ts`); any
|
|
36
|
+
keys outside this set are silently dropped, but typing them here means the
|
|
37
|
+
caller catches typos at lint/check time.
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
parsed_text: str | None
|
|
41
|
+
parse_status: ParseStatus
|
|
42
|
+
parse_error: str | None
|
|
43
|
+
parse_job_id: str | None
|
|
44
|
+
parsed_at: str | None
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Runtime configuration consumed by the worker library.
|
|
2
|
+
|
|
3
|
+
Mirrors the shape of `agno_agent_builder.RuntimeConfig`: one pydantic model
|
|
4
|
+
populated by the consumer, no env loading inside the lib so multi-tenant
|
|
5
|
+
deploys can build several `RuntimeConfig` instances from a single env file.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pydantic import BaseModel, Field, HttpUrl, SecretStr
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class RuntimeConfig(BaseModel):
    """All knobs the consumer needs to fill in to run a worker.

    The attribute split mirrors the surface area of the upstream
    `agno-agent-builder` so consumers using both libraries see the same
    shape twice (just with `redis_url`/`payload_url`/`llama_cloud_api_key`
    instead of an `agent_source`).

    Required fields are `app_name`, `redis_url`, `payload_url`,
    `llama_cloud_api_key` and `internal_secret`; every other field has a
    default.
    """

    app_name: str = Field(
        description="FastAPI title and structlog identity. Shows up in logs and /health.",
    )

    # ── Broker ─────────────────────────────────────────────────────────────
    redis_url: str = Field(
        description="Redis connection URL used by taskiq-redis as the broker (e.g. redis://redis:6379).",
    )

    # ── Payload CMS ────────────────────────────────────────────────────────
    payload_url: HttpUrl = Field(
        description="Base URL for the Payload REST API (e.g. http://app:3000).",
    )
    documents_collection_slug: str = Field(
        default="documents",
        description="Payload collection slug for documents. Must expose the `parse_*` fields shipped by `@zetesis/payload-documents`.",
    )

    # ── LlamaParse ─────────────────────────────────────────────────────────
    llama_cloud_api_key: SecretStr = Field(
        description="LlamaCloud API key used to upload + poll parsing jobs.",
    )
    llama_parse_base_url: HttpUrl = Field(
        default=HttpUrl("https://api.cloud.llamaindex.ai"),
        description="Override only if you point to a self-hosted LlamaCloud-compatible service.",
    )
    llama_parse_poll_interval_s: float = Field(
        default=5.0,
        description="Seconds between successive LlamaCloud status polls.",
    )
    llama_parse_poll_timeout_s: float = Field(
        default=600.0,
        description="Hard cap on how long a single parse task waits before failing.",
    )

    # ── Internal HTTP kicker ──────────────────────────────────────────────
    internal_secret: SecretStr = Field(
        description="Shared secret required by every `POST /tasks/*` request (X-Internal-Secret header).",
    )
    public_paths: tuple[str, ...] = Field(
        default=("/health", "/ready", "/docs", "/openapi.json"),
        description="Paths the InternalAuthMiddleware lets through without the secret.",
    )

    # ── Logging ───────────────────────────────────────────────────────────
    log_level: str = Field(default="INFO")
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
"""HTTP kicker.
|
|
2
|
+
|
|
3
|
+
Tiny FastAPI app the consumer process exposes so the Next.js side (or any
|
|
4
|
+
HTTP client) can enqueue a task without speaking the taskiq Redis protocol.
|
|
5
|
+
|
|
6
|
+
* ``GET /health`` and ``GET /ready`` — Kubernetes / Compose probes (no auth).
|
|
7
|
+
* ``POST /tasks/parse-document`` — body ``{"document_id": "<id>"}``, gated by
|
|
8
|
+
the ``X-Internal-Secret`` header (matched against ``config.internal_secret``).
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import hmac
|
|
14
|
+
from typing import Any
|
|
15
|
+
|
|
16
|
+
import structlog
|
|
17
|
+
from fastapi import FastAPI, HTTPException, Request, status
|
|
18
|
+
from fastapi.responses import JSONResponse
|
|
19
|
+
from pydantic import BaseModel
|
|
20
|
+
from taskiq import AsyncBroker
|
|
21
|
+
|
|
22
|
+
from payload_documents_worker_builder.config import RuntimeConfig
|
|
23
|
+
from payload_documents_worker_builder.tasks import PARSE_DOCUMENT_TASK_NAME
|
|
24
|
+
|
|
25
|
+
# Module-level structlog logger used by the HTTP kicker app built below.
logger = structlog.get_logger("payload_documents_worker_builder.http")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
# Request body accepted by ``POST /tasks/parse-document``.
class ParseDocumentRequest(BaseModel):
    # Id of the Payload document to parse. The route handler rejects values
    # that are empty or whitespace-only with a 400 response.
    document_id: str
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def create_http_app(broker: AsyncBroker, config: RuntimeConfig) -> FastAPI:
    """Build the FastAPI app the consumer hands to uvicorn.

    Args:
        broker: taskiq broker used to look up and enqueue tasks. It is started
            on app startup and shut down on app shutdown unless
            ``broker.is_worker_process`` is set.
        config: Runtime settings. Supplies the app title, the shared secret,
            the paths that skip authentication, and the Redis URL that is
            logged once the broker is connected.

    Returns:
        The configured FastAPI application exposing ``/health``, ``/ready``
        and ``POST /tasks/parse-document``.
    """
    app = FastAPI(title=config.app_name)

    @app.middleware("http")
    async def _internal_auth(request: Request, call_next: Any) -> Any:
        # Probe / docs paths are let through without the shared secret.
        if request.url.path in config.public_paths:
            return await call_next(request)
        provided = request.headers.get("x-internal-secret", "")
        expected = config.internal_secret.get_secret_value()
        # Compare as bytes: ``hmac.compare_digest`` raises ``TypeError`` for
        # ``str`` arguments that contain non-ASCII characters, which would turn
        # a malformed header into a 500 instead of a clean 403.
        # An empty configured secret is rejected outright; otherwise a request
        # without the header (defaulting to "") would be accepted.
        authorised = bool(expected) and hmac.compare_digest(
            provided.encode("utf-8"),
            expected.encode("utf-8"),
        )
        if not authorised:
            return JSONResponse(
                {"error": "Forbidden"},
                status_code=status.HTTP_403_FORBIDDEN,
            )
        return await call_next(request)

    # NOTE(review): ``on_event`` is deprecated in recent FastAPI releases in
    # favour of a ``lifespan`` handler — consider migrating.
    @app.on_event("startup")
    async def _startup() -> None:  # pyright: ignore[reportUnusedFunction]
        # Only the kicker side connects the broker here; a worker process is
        # skipped via ``is_worker_process``.
        if not broker.is_worker_process:
            await broker.startup()
            logger.info("Broker connected (kicker side)", url=config.redis_url)

    @app.on_event("shutdown")
    async def _shutdown() -> None:  # pyright: ignore[reportUnusedFunction]
        if not broker.is_worker_process:
            await broker.shutdown()

    @app.get("/health")
    async def health() -> dict[str, str]:  # pyright: ignore[reportUnusedFunction]
        return {"status": "ok"}

    @app.get("/ready")
    async def ready() -> dict[str, str]:  # pyright: ignore[reportUnusedFunction]
        return {"status": "ok"}

    @app.post("/tasks/parse-document", status_code=status.HTTP_202_ACCEPTED)
    async def kick_parse_document(  # pyright: ignore[reportUnusedFunction]
        body: ParseDocumentRequest,
    ) -> dict[str, str]:
        # Reject ids that are empty or whitespace-only before touching the broker.
        if not body.document_id.strip():
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail="document_id is required",
            )
        task = broker.find_task(PARSE_DOCUMENT_TASK_NAME)
        if task is None:
            # The task was never registered on this broker: a deployment error,
            # not a client error.
            raise HTTPException(
                status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
                detail=f"Task {PARSE_DOCUMENT_TASK_NAME} is not registered",
            )
        await task.kiq(body.document_id)
        logger.info("Enqueued parse-document task", document_id=body.document_id)
        return {"status": "queued", "task": PARSE_DOCUMENT_TASK_NAME}

    return app
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Logging + structured boot for the worker process.
|
|
2
|
+
|
|
3
|
+
Mirrors the spirit of `agno_agent_builder.app.create_app` lifespan but kept
|
|
4
|
+
much simpler — there's no registry/listener to bootstrap. We just configure
|
|
5
|
+
structlog so taskiq + FastAPI logs share the same JSON sink.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
import structlog
|
|
13
|
+
|
|
14
|
+
from payload_documents_worker_builder.config import RuntimeConfig
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def configure_logging(config: RuntimeConfig) -> None:
    """Configure stdlib logging and structlog from ``config.log_level``.

    The level name is matched case-insensitively against the logging module's
    registered level names; an unknown name falls back to ``INFO``. structlog
    is set up to render every event as a JSON line with log level, ISO
    timestamp, merged context variables and formatted exception info.

    The function may be called more than once. ``logging.basicConfig`` does
    nothing when the root logger already has handlers, so in that case only
    the structlog configuration is (re)applied.

    Args:
        config: Runtime settings; only ``log_level`` is read.
    """
    # Resolve through logging's own name -> number table rather than
    # ``getattr(logging, ...)``, which would also match unrelated module
    # attributes (e.g. ``BASIC_FORMAT``) and hand a non-integer "level" on.
    level = logging.getLevelNamesMapping().get(config.log_level.upper(), logging.INFO)
    logging.basicConfig(level=level, format="%(message)s")
    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso"),
            structlog.processors.StackInfoRenderer(),
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(level),
        cache_logger_on_first_use=True,
    )
|
|
File without changes
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Built-in tasks.
|
|
2
|
+
|
|
3
|
+
Currently exposes the LlamaParse parse-document task. Adding more tasks later
|
|
4
|
+
is the same pattern: define them here and have `register_tasks` wire them.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from payload_documents_worker_builder.tasks.parse_document import (
|
|
8
|
+
PARSE_DOCUMENT_TASK_NAME,
|
|
9
|
+
register_parse_document_task,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
# Names re-exported from ``tasks.parse_document`` as this package's public API.
__all__ = ["PARSE_DOCUMENT_TASK_NAME", "register_parse_document_task"]
|
payload_documents_worker_builder-0.1.0/payload_documents_worker_builder/tasks/parse_document.py
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
1
|
+
"""Parse-document task.
|
|
2
|
+
|
|
3
|
+
Single responsibility: take a Payload document id, download its file, kick a
|
|
4
|
+
LlamaParse upload, poll until the result is ready, write the parsed markdown
|
|
5
|
+
back into Payload, and stamp `parse_status` accordingly.
|
|
6
|
+
|
|
7
|
+
The task is registered against a broker via ``register_parse_document_task``
|
|
8
|
+
so consumers can compose multiple workers on the same broker without us
|
|
9
|
+
hard-coding the binding.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import contextlib
|
|
16
|
+
from datetime import UTC, datetime
|
|
17
|
+
|
|
18
|
+
import httpx
|
|
19
|
+
import structlog
|
|
20
|
+
from taskiq import AsyncBroker
|
|
21
|
+
|
|
22
|
+
from payload_documents_worker_builder.clients.llama_parse import (
|
|
23
|
+
LlamaParseClient,
|
|
24
|
+
LlamaParseError,
|
|
25
|
+
LlamaParseJob,
|
|
26
|
+
)
|
|
27
|
+
from payload_documents_worker_builder.clients.payload import PayloadClient, PayloadError
|
|
28
|
+
from payload_documents_worker_builder.clients.types import ParseContext
|
|
29
|
+
from payload_documents_worker_builder.config import RuntimeConfig
|
|
30
|
+
|
|
31
|
+
# Name under which the parse task is registered on the broker.
PARSE_DOCUMENT_TASK_NAME = "documents.parse"
# Fallback filename used when the parse context carries no usable filename.
DEFAULT_FILENAME = "upload.bin"

# Module-level structlog logger; the task binds per-document context onto it.
logger = structlog.get_logger("payload_documents_worker_builder.parse_document")
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def register_parse_document_task(broker: AsyncBroker, config: RuntimeConfig) -> None:
    """Register the ``documents.parse`` task on ``broker``.

    The registered task takes one string argument, the Payload document id,
    and delegates to ``_run_parse_document`` with the ``config`` captured
    here. It is registered with ``retry_on_error=True`` and ``max_retries=2``.

    Args:
        broker: Broker the task is bound to.
        config: Runtime settings passed to every task execution.
    """

    @broker.task(
        task_name=PARSE_DOCUMENT_TASK_NAME,
        retry_on_error=True,
        max_retries=2,
    )
    async def parse_document(document_id: str) -> None:  # pyright: ignore[reportUnusedFunction]
        await _run_parse_document(document_id, config)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
async def _run_parse_document(document_id: str, config: RuntimeConfig) -> None:
    """Run the whole parse pipeline for one Payload document.

    Phases, in order: mark the document as processing, fetch its parse context
    and file bytes, upload to LlamaParse, record the job id, poll until the
    job finishes, and write the markdown back. Each phase is its own coroutine
    for unit-testability.

    Args:
        document_id: Id of the document in the configured documents collection.
        config: Runtime settings (service URLs, credentials, poll timings).

    Raises:
        Exception: Whatever a phase raised. The failure is logged and stamped
            on the document (best effort) before being re-raised unchanged.
    """
    log = logger.bind(document_id=document_id, collection=config.documents_collection_slug)
    log.info("Parse document task started")

    async with (
        PayloadClient(
            base_url=str(config.payload_url),
            internal_secret=config.internal_secret.get_secret_value(),
        ) as payload,
        LlamaParseClient(
            api_key=config.llama_cloud_api_key.get_secret_value(),
            base_url=str(config.llama_parse_base_url),
        ) as llama,
    ):
        try:
            await _mark_processing(payload, config, document_id)
            ctx, file_bytes = await _fetch_inputs(payload, config, document_id, log)
            job = await _submit_to_llama(llama, ctx, file_bytes, log)
            await _record_job_id(payload, config, document_id, job)
            markdown = await _poll_until_done(llama, job.id, config, log)
            await _writeback_success(payload, config, document_id, markdown, log)
            log.info("Parse document task succeeded")
        except Exception as exc:
            # Task boundary: stamp *any* failure, not only the client error
            # types, so the document is never left stuck in "processing"
            # (``_stamp_error`` itself anticipates ``httpx.HTTPError`` from the
            # same client). The exception is always re-raised unchanged.
            log.exception("Parse document task failed")
            # Some exceptions stringify to "", so fall back to the type name.
            message = str(exc) or type(exc).__name__
            await _stamp_error(payload, config, document_id, message)
            raise
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
async def _mark_processing(payload: PayloadClient, config: RuntimeConfig, document_id: str) -> None:
    """Set the document's ``parse_status`` to ``processing`` and clear ``parse_error``."""
    patch = {"parse_status": "processing", "parse_error": None}
    await payload.submit_parse_result(config.documents_collection_slug, document_id, patch)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
async def _fetch_inputs(
    payload: PayloadClient,
    config: RuntimeConfig,
    document_id: str,
    log: structlog.stdlib.BoundLogger,
) -> tuple[ParseContext, bytes]:
    """Fetch the parse context and the raw file bytes for a document.

    Returns:
        A ``(context, file_bytes)`` pair, both read from the configured
        documents collection.
    """
    slug = config.documents_collection_slug
    context = await payload.fetch_parse_context(slug, document_id)
    log.info("Downloading upload from Payload", filename=_resolve_filename(context))
    content = await payload.fetch_parse_file(slug, document_id)
    return context, content
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
async def _submit_to_llama(
    llama: LlamaParseClient,
    ctx: ParseContext,
    file_bytes: bytes,
    log: structlog.stdlib.BoundLogger,
) -> LlamaParseJob:
    """Upload the file to LlamaParse and return the job it created.

    The filename comes from ``_resolve_filename``; ``language``,
    ``parsing_instruction`` and ``mode`` are forwarded from ``ctx`` as-is
    (``None`` when absent).
    """
    name = _resolve_filename(ctx)
    log.info("Uploading to LlamaParse", filename=name, size=len(file_bytes))
    language = ctx.get("language")
    instruction = ctx.get("parsing_instruction")
    mode = ctx.get("mode")
    created = await llama.upload(
        file_bytes=file_bytes,
        filename=name,
        language=language,
        parsing_instruction=instruction,
        mode=mode,
    )
    log.info("LlamaParse job created", llama_job_id=created.id)
    return created
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
async def _record_job_id(
    payload: PayloadClient, config: RuntimeConfig, document_id: str, job: LlamaParseJob
) -> None:
    """Store the LlamaParse job id on the document as ``parse_job_id``."""
    patch = {"parse_job_id": job.id}
    await payload.submit_parse_result(config.documents_collection_slug, document_id, patch)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
async def _writeback_success(
    payload: PayloadClient,
    config: RuntimeConfig,
    document_id: str,
    markdown: str,
    log: structlog.stdlib.BoundLogger,
) -> None:
    """Write the parsed markdown back and mark the document as ``done``.

    Also clears ``parse_error`` and sets ``parsed_at`` to the current UTC time.
    """
    log.info("Parse complete; writing back to Payload", chars=len(markdown))
    slug = config.documents_collection_slug
    result = {
        "parsed_text": markdown,
        "parse_status": "done",
        "parse_error": None,
        "parsed_at": _now_iso(),
    }
    await payload.submit_parse_result(slug, document_id, result)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
async def _poll_until_done(
    client: LlamaParseClient,
    job_id: str,
    config: RuntimeConfig,
    log: structlog.stdlib.BoundLogger,
) -> str:
    """Poll a LlamaParse job until it finishes and return its markdown.

    The job status is always checked at least once. Between checks the
    coroutine sleeps for ``config.llama_parse_poll_interval_s``, shortened to
    whatever time is left so the wait never runs past
    ``config.llama_parse_poll_timeout_s``.

    Args:
        client: LlamaParse client used for status checks and the final fetch.
        job_id: Id of the job to wait for.
        config: Runtime settings; supplies the poll interval and timeout.
        log: Bound logger for per-poll debug output.

    Returns:
        The markdown fetched once the job reports ``SUCCESS``.

    Raises:
        LlamaParseError: If the job ends in ``ERROR`` or ``CANCELLED``, or if
            it has not finished when the timeout elapses.
    """
    # Inside a coroutine the running loop is the right monotonic clock source.
    loop = asyncio.get_running_loop()
    deadline = loop.time() + config.llama_parse_poll_timeout_s
    while True:
        job = await client.status(job_id)
        if job.status == "SUCCESS":
            return await client.fetch_markdown(job_id)
        if job.status in ("ERROR", "CANCELLED"):
            raise LlamaParseError(
                f"LlamaParse job {job_id} ended in {job.status}: {job.error or 'no detail'}"
            )
        remaining = deadline - loop.time()
        if remaining <= 0:
            break
        log.debug("Polling LlamaParse", status=job.status)
        # Never sleep past the deadline; one final status check follows.
        await asyncio.sleep(min(config.llama_parse_poll_interval_s, remaining))
    raise LlamaParseError(
        f"LlamaParse job {job_id} timed out after {config.llama_parse_poll_timeout_s}s"
    )
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _resolve_filename(ctx: ParseContext) -> str:
    """Return the context's ``filename`` if it is a non-empty string, else ``DEFAULT_FILENAME``."""
    candidate = ctx.get("filename")
    if isinstance(candidate, str) and candidate:
        return candidate
    return DEFAULT_FILENAME
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
async def _stamp_error(
    payload: PayloadClient, config: RuntimeConfig, document_id: str, message: str
) -> None:
    """Best-effort: mark the document as failed with a truncated error message.

    ``PayloadError`` and ``httpx.HTTPError`` raised by the write are swallowed
    so they do not shadow the exception that caused the failure. Any other
    exception type still propagates.

    Args:
        message: Error text; only its first 500 characters are stored.
    """
    truncated = message[:500]
    patch = {"parse_status": "error", "parse_error": truncated}
    with contextlib.suppress(PayloadError, httpx.HTTPError):
        await payload.submit_parse_result(config.documents_collection_slug, document_id, patch)
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _now_iso() -> str:
|
|
195
|
+
return datetime.now(UTC).isoformat()
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "payload-documents-worker-builder"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Parametrizable taskiq-based worker for Payload CMS — FastAPI kick app + Redis broker + LlamaParse parse-document task."
|
|
5
|
+
requires-python = ">=3.12"
|
|
6
|
+
readme = "README.md"
|
|
7
|
+
license = { text = "MIT" }
|
|
8
|
+
authors = [{ name = "Zetesis Labs" }]
|
|
9
|
+
keywords = ["payload", "worker", "taskiq", "llamaparse", "redis"]
|
|
10
|
+
classifiers = [
|
|
11
|
+
"Development Status :: 4 - Beta",
|
|
12
|
+
"Framework :: FastAPI",
|
|
13
|
+
"Intended Audience :: Developers",
|
|
14
|
+
"License :: OSI Approved :: MIT License",
|
|
15
|
+
"Programming Language :: Python :: 3.12",
|
|
16
|
+
"Programming Language :: Python :: 3.13",
|
|
17
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
18
|
+
"Typing :: Typed",
|
|
19
|
+
]
|
|
20
|
+
dependencies = [
|
|
21
|
+
"fastapi>=0.115",
|
|
22
|
+
"httpx>=0.28",
|
|
23
|
+
"pydantic>=2.9",
|
|
24
|
+
"structlog>=24.1",
|
|
25
|
+
"taskiq>=0.11.18",
|
|
26
|
+
"taskiq-redis>=1.1.1",
|
|
27
|
+
"uvicorn[standard]>=0.34",
|
|
28
|
+
]
|
|
29
|
+
|
|
30
|
+
[project.urls]
|
|
31
|
+
Homepage = "https://github.com/Zetesis-Labs/PayloadAgents"
|
|
32
|
+
Repository = "https://github.com/Zetesis-Labs/PayloadAgents"
|
|
33
|
+
Issues = "https://github.com/Zetesis-Labs/PayloadAgents/issues"
|
|
34
|
+
|
|
35
|
+
[build-system]
|
|
36
|
+
requires = ["hatchling"]
|
|
37
|
+
build-backend = "hatchling.build"
|
|
38
|
+
|
|
39
|
+
[tool.hatch.build.targets.wheel]
|
|
40
|
+
packages = ["payload_documents_worker_builder"]
|
|
41
|
+
|
|
42
|
+
[tool.hatch.build.targets.wheel.force-include]
|
|
43
|
+
"payload_documents_worker_builder/py.typed" = "payload_documents_worker_builder/py.typed"
|