abstractvision 0.1.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,325 @@
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import json
5
+ import uuid
6
+ from dataclasses import dataclass
7
+ from typing import Any, Dict, Optional, Tuple
8
+ from urllib.request import Request, urlopen
9
+
10
+ from ..errors import CapabilityNotSupportedError
11
+ from ..types import (
12
+ GeneratedAsset,
13
+ ImageEditRequest,
14
+ ImageGenerationRequest,
15
+ ImageToVideoRequest,
16
+ VideoGenerationRequest,
17
+ VisionBackendCapabilities,
18
+ )
19
+ from .base_backend import VisionBackend
20
+
21
+
22
+ def _join_url(base_url: str, path: str) -> str:
23
+ b = str(base_url or "").rstrip("/")
24
+ p = str(path or "").strip()
25
+ if not p:
26
+ return b
27
+ if not p.startswith("/"):
28
+ p = "/" + p
29
+ return b + p
30
+
31
+
32
+ def _sniff_mime_type(content: bytes, fallback: str) -> str:
33
+ b = bytes(content or b"")
34
+ if b.startswith(b"\x89PNG\r\n\x1a\n"):
35
+ return "image/png"
36
+ if b.startswith(b"\xff\xd8\xff"):
37
+ return "image/jpeg"
38
+ if len(b) >= 12 and b[4:8] == b"ftyp":
39
+ return "video/mp4"
40
+ return str(fallback or "application/octet-stream")
41
+
42
+
43
+ def _decode_b64(s: str) -> bytes:
44
+ raw = str(s or "").strip()
45
+ raw = "".join(raw.split())
46
+ pad = (-len(raw)) % 4
47
+ if pad:
48
+ raw = raw + ("=" * pad)
49
+ return base64.b64decode(raw, validate=False)
50
+
51
+
52
+ def _first_data_item(resp: Dict[str, Any]) -> Dict[str, Any]:
53
+ data = resp.get("data")
54
+ if isinstance(data, list) and data and isinstance(data[0], dict):
55
+ return data[0]
56
+ return {}
57
+
58
+
59
+ def _multipart_form(
60
+ *,
61
+ fields: Dict[str, str],
62
+ files: Dict[str, Tuple[str, bytes, str]],
63
+ ) -> Tuple[bytes, str]:
64
+ boundary = f"----abstractvision-{uuid.uuid4().hex}"
65
+ parts: list[bytes] = []
66
+
67
+ def _add(b: bytes) -> None:
68
+ parts.append(b)
69
+
70
+ for name, value in fields.items():
71
+ _add(f"--{boundary}\r\n".encode("utf-8"))
72
+ _add(f'Content-Disposition: form-data; name="{name}"\r\n\r\n'.encode("utf-8"))
73
+ _add(str(value).encode("utf-8"))
74
+ _add(b"\r\n")
75
+
76
+ for name, (filename, content, content_type) in files.items():
77
+ _add(f"--{boundary}\r\n".encode("utf-8"))
78
+ _add(
79
+ f'Content-Disposition: form-data; name="{name}"; filename="{filename}"\r\n'.encode(
80
+ "utf-8"
81
+ )
82
+ )
83
+ _add(f"Content-Type: {content_type}\r\n\r\n".encode("utf-8"))
84
+ _add(bytes(content))
85
+ _add(b"\r\n")
86
+
87
+ _add(f"--{boundary}--\r\n".encode("utf-8"))
88
+ body = b"".join(parts)
89
+ return body, boundary
90
+
91
+
92
@dataclass
class OpenAICompatibleBackendConfig:
    """Settings for talking to an OpenAI-shaped HTTP server."""

    # Root URL; endpoint paths below are appended to it.
    base_url: str
    # Sent as a Bearer token when set.
    api_key: Optional[str] = None
    # Forwarded as the "model" request parameter when set.
    model_id: Optional[str] = None
    # Timeout, in seconds, applied to each HTTP call.
    timeout_s: float = 300.0

    # Image endpoints (OpenAI-shaped HTTP).
    image_generations_path: str = "/images/generations"
    image_edits_path: str = "/images/edits"
    # Video endpoints; leaving a path unset disables that capability.
    text_to_video_path: Optional[str] = None
    image_to_video_path: Optional[str] = None

    # How the image-to-video request is encoded once that endpoint is enabled.
    image_to_video_mode: str = "multipart"  # "multipart" | "json_b64"
107
+
108
+
109
class OpenAICompatibleVisionBackend(VisionBackend):
    """Backend adapter for OpenAI-compatible endpoints (OpenAI-shaped HTTP).

    Notes:
    - Image endpoints are widely implemented (`/images/generations`, `/images/edits`).
    - Video endpoints are not standardized; they are optional and must be configured explicitly.
    - Results are read from `data[0]` of the JSON response, which must carry either a
      non-empty `b64_json` payload or an http(s) `url` that is then downloaded.
    """

    def __init__(self, *, config: OpenAICompatibleBackendConfig):
        self._cfg = config

    def get_capabilities(self) -> VisionBackendCapabilities:
        """Report supported tasks; video tasks appear only when their path is configured."""
        tasks = {"text_to_image", "image_to_image"}
        if self._cfg.text_to_video_path:
            tasks.add("text_to_video")
        if self._cfg.image_to_video_path:
            tasks.add("image_to_video")
        return VisionBackendCapabilities(
            supported_tasks=sorted(tasks),
            supports_mask=True,
        )

    def _headers(self, *, content_type: str) -> Dict[str, str]:
        """Build request headers, adding a Bearer token when an API key is configured."""
        headers = {"Content-Type": str(content_type)}
        if self._cfg.api_key:
            headers["Authorization"] = f"Bearer {self._cfg.api_key}"
        return headers

    def _post(self, *, path: str, body: bytes, content_type: str) -> Dict[str, Any]:
        """POST ``body`` to ``path`` and return the decoded JSON object.

        Raises:
            ValueError: If the response is not JSON or its top level is not an object.
        """
        url = _join_url(self._cfg.base_url, path)
        req = Request(url=url, data=body, method="POST", headers=self._headers(content_type=content_type))
        with urlopen(req, timeout=float(self._cfg.timeout_s)) as resp:
            raw = resp.read()
        data = json.loads(raw.decode("utf-8"))
        if not isinstance(data, dict):
            raise ValueError("Invalid response: expected JSON object")
        return data

    def _post_json(self, *, path: str, payload: Dict[str, Any]) -> Dict[str, Any]:
        """Send ``payload`` as a JSON request body."""
        body = json.dumps(payload).encode("utf-8")
        return self._post(path=path, body=body, content_type="application/json")

    def _post_multipart(self, *, path: str, fields: Dict[str, str], files: Dict[str, Tuple[str, bytes, str]]) -> Dict[str, Any]:
        """Send ``fields`` and ``files`` as a multipart/form-data request body."""
        body, boundary = _multipart_form(fields=fields, files=files)
        return self._post(path=path, body=body, content_type=f"multipart/form-data; boundary={boundary}")

    @staticmethod
    def _media_type_for(mime: str) -> str:
        """Map a MIME type to the asset media type ("video" or "image")."""
        return "video" if mime.startswith("video/") else "image"

    @staticmethod
    def _video_options(request: Any) -> Dict[str, Any]:
        """Collect the optional tuning parameters shared by both video requests."""
        options: Dict[str, Any] = {}
        for key in ("width", "height", "fps", "num_frames", "seed", "steps"):
            value = getattr(request, key)
            if value is not None:
                options[key] = int(value)
        if request.guidance_scale is not None:
            options["guidance_scale"] = float(request.guidance_scale)
        return options

    def _parse_media(self, resp: Dict[str, Any], *, fallback_mime: str) -> GeneratedAsset:
        """Turn an OpenAI-shaped response into a ``GeneratedAsset``.

        Args:
            resp: Decoded JSON response.
            fallback_mime: MIME type used when the bytes cannot be identified.

        Raises:
            ValueError: If ``data[0]`` has neither a usable ``b64_json`` nor ``url``,
                or if the ``url`` is not http(s).
        """
        item = _first_data_item(resp)

        # Some servers send both keys with one set to null, so test the value
        # rather than key presence before falling through to `url`.
        b64_payload = item.get("b64_json")
        if b64_payload:
            content = _decode_b64(str(b64_payload))
            mime = _sniff_mime_type(content, fallback_mime)
            return GeneratedAsset(
                media_type=self._media_type_for(mime),
                data=content,
                mime_type=mime,
                metadata={"source": "b64_json"},
            )

        u = item.get("url")
        if isinstance(u, str) and u:
            # The URL comes from the remote server; urlopen would also follow
            # file:/ftp: URLs, so only plain web URLs are downloaded.
            if not u.lower().startswith(("http://", "https://")):
                raise ValueError("Invalid response: data[0].url must be an http(s) URL")
            req = Request(url=u, method="GET")
            with urlopen(req, timeout=float(self._cfg.timeout_s)) as resp2:
                content = resp2.read()
                header = resp2.headers.get("Content-Type") or ""
            # Drop header parameters such as "; charset=..." from the MIME type.
            declared = header.split(";", 1)[0].strip() or fallback_mime
            mime = _sniff_mime_type(content, str(declared))
            return GeneratedAsset(
                media_type=self._media_type_for(mime),
                data=content,
                mime_type=mime,
                metadata={"source": "url", "url": u},
            )

        raise ValueError("Invalid response: missing data[0].b64_json or data[0].url")

    def generate_image(self, request: ImageGenerationRequest) -> GeneratedAsset:
        """Generate one image from a text prompt via the image generations endpoint."""
        payload: Dict[str, Any] = {
            "prompt": request.prompt,
            "response_format": "b64_json",
            "n": 1,
        }
        if self._cfg.model_id:
            payload["model"] = self._cfg.model_id
        if request.negative_prompt is not None:
            payload["negative_prompt"] = request.negative_prompt
        # Dimensions are only sent when both are given; they are expressed both
        # as an OpenAI-style "size" string and as separate width/height keys.
        if request.width is not None and request.height is not None:
            payload["size"] = f"{int(request.width)}x{int(request.height)}"
            payload["width"] = int(request.width)
            payload["height"] = int(request.height)
        if request.seed is not None:
            payload["seed"] = int(request.seed)
        if request.steps is not None:
            payload["steps"] = int(request.steps)
        if request.guidance_scale is not None:
            payload["guidance_scale"] = float(request.guidance_scale)
        # Applied last so caller-supplied extras override the defaults above.
        if isinstance(request.extra, dict) and request.extra:
            payload.update(dict(request.extra))

        resp = self._post_json(path=self._cfg.image_generations_path, payload=payload)
        return self._parse_media(resp, fallback_mime="image/png")

    def edit_image(self, request: ImageEditRequest) -> GeneratedAsset:
        """Edit an image (optionally with a mask) via the image edits endpoint."""
        # OpenAI-style image edits use multipart form data.
        fields: Dict[str, str] = {"prompt": request.prompt}
        if self._cfg.model_id:
            fields["model"] = self._cfg.model_id
        if request.negative_prompt is not None:
            fields["negative_prompt"] = request.negative_prompt

        files: Dict[str, Tuple[str, bytes, str]] = {
            "image": ("image.png", bytes(request.image), "image/png"),
        }
        if request.mask is not None:
            files["mask"] = ("mask.png", bytes(request.mask), "image/png")

        # Best-effort extra fields.
        if request.seed is not None:
            fields["seed"] = str(int(request.seed))
        if request.steps is not None:
            fields["steps"] = str(int(request.steps))
        if request.guidance_scale is not None:
            fields["guidance_scale"] = str(float(request.guidance_scale))
        if isinstance(request.extra, dict) and request.extra:
            for k, v in request.extra.items():
                if v is None:
                    continue
                fields[str(k)] = str(v)

        resp = self._post_multipart(path=self._cfg.image_edits_path, fields=fields, files=files)
        return self._parse_media(resp, fallback_mime="image/png")

    def generate_angles(self, request) -> list[GeneratedAsset]:
        """Always raises: multi-view generation is not implemented by this backend."""
        raise CapabilityNotSupportedError("OpenAICompatibleVisionBackend does not implement multi-view generation.")

    def generate_video(self, request: VideoGenerationRequest) -> GeneratedAsset:
        """Generate a video from a text prompt via the configured text-to-video path.

        Raises:
            CapabilityNotSupportedError: If no text-to-video path is configured.
        """
        if not self._cfg.text_to_video_path:
            raise CapabilityNotSupportedError("text_to_video is not configured for this backend.")
        payload: Dict[str, Any] = {"prompt": request.prompt, "response_format": "b64_json", "n": 1}
        if self._cfg.model_id:
            payload["model"] = self._cfg.model_id
        if request.negative_prompt is not None:
            payload["negative_prompt"] = request.negative_prompt
        payload.update(self._video_options(request))
        # Applied last so caller-supplied extras override the values above.
        if isinstance(request.extra, dict) and request.extra:
            payload.update(dict(request.extra))
        resp = self._post_json(path=str(self._cfg.text_to_video_path), payload=payload)
        return self._parse_media(resp, fallback_mime="video/mp4")

    def image_to_video(self, request: ImageToVideoRequest) -> GeneratedAsset:
        """Animate an input image via the configured image-to-video path.

        The request is sent as JSON with a base64 image when the configured mode
        is "json_b64"; any other mode value uses multipart form data.

        Raises:
            CapabilityNotSupportedError: If no image-to-video path is configured.
        """
        if not self._cfg.image_to_video_path:
            raise CapabilityNotSupportedError("image_to_video is not configured for this backend.")

        path = str(self._cfg.image_to_video_path)
        options = self._video_options(request)

        if str(self._cfg.image_to_video_mode) == "json_b64":
            payload: Dict[str, Any] = {"image_b64": base64.b64encode(bytes(request.image)).decode("ascii")}
            if self._cfg.model_id:
                payload["model"] = self._cfg.model_id
            if request.prompt is not None:
                payload["prompt"] = request.prompt
            if request.negative_prompt is not None:
                payload["negative_prompt"] = request.negative_prompt
            payload.update(options)
            if isinstance(request.extra, dict) and request.extra:
                payload.update(dict(request.extra))
            resp = self._post_json(path=path, payload=payload)
            return self._parse_media(resp, fallback_mime="video/mp4")

        fields: Dict[str, str] = {}
        if self._cfg.model_id:
            fields["model"] = self._cfg.model_id
        if request.prompt is not None:
            fields["prompt"] = request.prompt
        if request.negative_prompt is not None:
            fields["negative_prompt"] = request.negative_prompt
        # Form fields are text, so the numeric options are stringified.
        fields.update({key: str(value) for key, value in options.items()})
        if isinstance(request.extra, dict) and request.extra:
            for k, v in request.extra.items():
                if v is None:
                    continue
                fields[str(k)] = str(v)

        files = {"image": ("image.png", bytes(request.image), "image/png")}
        resp = self._post_multipart(path=path, fields=fields, files=files)
        return self._parse_media(resp, fallback_mime="video/mp4")