ai-pipeline-core 0.3.4-py3-none-any.whl → 0.4.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- ai_pipeline_core/__init__.py +64 -158
- ai_pipeline_core/deployment/__init__.py +6 -18
- ai_pipeline_core/deployment/base.py +392 -212
- ai_pipeline_core/deployment/contract.py +6 -10
- ai_pipeline_core/{utils → deployment}/deploy.py +50 -69
- ai_pipeline_core/deployment/helpers.py +16 -17
- ai_pipeline_core/{progress.py → deployment/progress.py} +23 -24
- ai_pipeline_core/{utils/remote_deployment.py → deployment/remote.py} +11 -14
- ai_pipeline_core/docs_generator/__init__.py +54 -0
- ai_pipeline_core/docs_generator/__main__.py +5 -0
- ai_pipeline_core/docs_generator/cli.py +196 -0
- ai_pipeline_core/docs_generator/extractor.py +324 -0
- ai_pipeline_core/docs_generator/guide_builder.py +644 -0
- ai_pipeline_core/docs_generator/trimmer.py +35 -0
- ai_pipeline_core/docs_generator/validator.py +114 -0
- ai_pipeline_core/document_store/__init__.py +13 -0
- ai_pipeline_core/document_store/_summary.py +9 -0
- ai_pipeline_core/document_store/_summary_worker.py +170 -0
- ai_pipeline_core/document_store/clickhouse.py +492 -0
- ai_pipeline_core/document_store/factory.py +38 -0
- ai_pipeline_core/document_store/local.py +312 -0
- ai_pipeline_core/document_store/memory.py +85 -0
- ai_pipeline_core/document_store/protocol.py +68 -0
- ai_pipeline_core/documents/__init__.py +12 -14
- ai_pipeline_core/documents/_context_vars.py +85 -0
- ai_pipeline_core/documents/_hashing.py +52 -0
- ai_pipeline_core/documents/attachment.py +85 -0
- ai_pipeline_core/documents/context.py +128 -0
- ai_pipeline_core/documents/document.py +318 -1434
- ai_pipeline_core/documents/mime_type.py +11 -84
- ai_pipeline_core/documents/utils.py +4 -12
- ai_pipeline_core/exceptions.py +10 -62
- ai_pipeline_core/images/__init__.py +32 -85
- ai_pipeline_core/images/_processing.py +5 -11
- ai_pipeline_core/llm/__init__.py +6 -4
- ai_pipeline_core/llm/ai_messages.py +102 -90
- ai_pipeline_core/llm/client.py +229 -183
- ai_pipeline_core/llm/model_options.py +12 -84
- ai_pipeline_core/llm/model_response.py +53 -99
- ai_pipeline_core/llm/model_types.py +8 -23
- ai_pipeline_core/logging/__init__.py +2 -7
- ai_pipeline_core/logging/logging.yml +1 -1
- ai_pipeline_core/logging/logging_config.py +27 -37
- ai_pipeline_core/logging/logging_mixin.py +15 -41
- ai_pipeline_core/observability/__init__.py +32 -0
- ai_pipeline_core/observability/_debug/__init__.py +30 -0
- ai_pipeline_core/observability/_debug/_auto_summary.py +94 -0
- ai_pipeline_core/{debug/config.py → observability/_debug/_config.py} +11 -7
- ai_pipeline_core/{debug/content.py → observability/_debug/_content.py} +133 -75
- ai_pipeline_core/{debug/processor.py → observability/_debug/_processor.py} +16 -17
- ai_pipeline_core/{debug/summary.py → observability/_debug/_summary.py} +113 -37
- ai_pipeline_core/observability/_debug/_types.py +75 -0
- ai_pipeline_core/{debug/writer.py → observability/_debug/_writer.py} +126 -196
- ai_pipeline_core/observability/_document_tracking.py +146 -0
- ai_pipeline_core/observability/_initialization.py +194 -0
- ai_pipeline_core/observability/_logging_bridge.py +57 -0
- ai_pipeline_core/observability/_summary.py +81 -0
- ai_pipeline_core/observability/_tracking/__init__.py +6 -0
- ai_pipeline_core/observability/_tracking/_client.py +178 -0
- ai_pipeline_core/observability/_tracking/_internal.py +28 -0
- ai_pipeline_core/observability/_tracking/_models.py +138 -0
- ai_pipeline_core/observability/_tracking/_processor.py +158 -0
- ai_pipeline_core/observability/_tracking/_service.py +311 -0
- ai_pipeline_core/observability/_tracking/_writer.py +229 -0
- ai_pipeline_core/{tracing.py → observability/tracing.py} +139 -335
- ai_pipeline_core/pipeline/__init__.py +10 -0
- ai_pipeline_core/pipeline/decorators.py +915 -0
- ai_pipeline_core/pipeline/options.py +16 -0
- ai_pipeline_core/prompt_manager.py +16 -102
- ai_pipeline_core/settings.py +26 -31
- ai_pipeline_core/testing.py +9 -0
- ai_pipeline_core-0.4.0.dist-info/METADATA +807 -0
- ai_pipeline_core-0.4.0.dist-info/RECORD +76 -0
- ai_pipeline_core/debug/__init__.py +0 -26
- ai_pipeline_core/documents/document_list.py +0 -420
- ai_pipeline_core/documents/flow_document.py +0 -112
- ai_pipeline_core/documents/task_document.py +0 -117
- ai_pipeline_core/documents/temporary_document.py +0 -74
- ai_pipeline_core/flow/__init__.py +0 -9
- ai_pipeline_core/flow/config.py +0 -494
- ai_pipeline_core/flow/options.py +0 -75
- ai_pipeline_core/pipeline.py +0 -718
- ai_pipeline_core/prefect.py +0 -63
- ai_pipeline_core/prompt_builder/__init__.py +0 -5
- ai_pipeline_core/prompt_builder/documents_prompt.jinja2 +0 -23
- ai_pipeline_core/prompt_builder/global_cache.py +0 -78
- ai_pipeline_core/prompt_builder/new_core_documents_prompt.jinja2 +0 -6
- ai_pipeline_core/prompt_builder/prompt_builder.py +0 -253
- ai_pipeline_core/prompt_builder/system_prompt.jinja2 +0 -41
- ai_pipeline_core/storage/__init__.py +0 -8
- ai_pipeline_core/storage/storage.py +0 -628
- ai_pipeline_core/utils/__init__.py +0 -8
- ai_pipeline_core-0.3.4.dist-info/METADATA +0 -569
- ai_pipeline_core-0.3.4.dist-info/RECORD +0 -57
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/WHEEL +0 -0
- {ai_pipeline_core-0.3.4.dist-info → ai_pipeline_core-0.4.0.dist-info}/licenses/LICENSE +0 -0
ai_pipeline_core/storage/storage.py
@@ -1,628 +0,0 @@
-"""Storage abstraction for local filesystem and Google Cloud Storage.
-
-Provides async storage operations with automatic retry for GCS.
-Supports local filesystem and GCS backends with a unified API.
-"""
-
-from __future__ import annotations
-
-import asyncio
-import os
-import random
-from abc import ABC, abstractmethod
-from pathlib import Path, PurePosixPath
-from typing import Any
-
-from prefect.utilities.asyncutils import run_sync_in_worker_thread
-from prefect_gcp.cloud_storage import GcpCredentials, GcsBucket
-from pydantic import BaseModel, Field
-
-from ai_pipeline_core.logging import get_pipeline_logger
-from ai_pipeline_core.settings import settings
-
-__all__ = ["Storage", "LocalStorage", "GcsStorage", "RetryPolicy", "ObjectInfo"]
-
-logger = get_pipeline_logger(__name__)
-
-
-# ---------- Models ----------
-
-
-class RetryPolicy(BaseModel, frozen=True):
-    """Retry policy for async operations with exponential backoff.
-
-    Args:
-        attempts: Maximum number of attempts (default 3)
-        base_delay: Initial delay in seconds (default 0.5)
-        max_delay: Maximum delay between retries (default 5.0)
-        jitter: Random jitter factor (default 0.15)
-        retry_exceptions: Tuple of exceptions to retry on
-    """
-
-    attempts: int = Field(default=3, ge=1)
-    base_delay: float = Field(default=0.5, ge=0.0)
-    max_delay: float = Field(default=5.0, ge=0.0)
-    jitter: float = Field(default=0.15, ge=0.0)
-    retry_exceptions: tuple[type[BaseException], ...] = Field(default_factory=tuple)
-
-
-class ObjectInfo(BaseModel, frozen=True):
-    """Storage object metadata.
-
-    Attributes:
-        key: Relative path (POSIX-style, no leading slash)
-        size: Size in bytes (-1 if unknown)
-        is_dir: True if this is a directory
-    """
-
-    key: str
-    size: int
-    is_dir: bool
-
-
-# ---------- Helpers ----------
-
-
-def _posix_rel(s: str) -> str:
-    if not s:
-        return ""
-    parts: list[str] = []
-    for t in s.replace("\\", "/").split("/"):
-        if t in ("", "."):
-            continue
-        if t == "..":
-            if parts:
-                parts.pop()
-            continue
-        parts.append(t)
-    return "/".join(parts)
-
-
-def _join_posix(*parts: str) -> str:
-    return _posix_rel("/".join(p for p in map(_posix_rel, parts) if p))
-
-
-# ---------- Abstract facade ----------
-
-
-class Storage(ABC):
-    """Abstract storage interface for file operations.
-
-    Provides a unified API for local filesystem and Google Cloud Storage.
-    Supports async operations with automatic retry for cloud storage.
-
-    Example:
-        >>> # Load from local filesystem
-        >>> storage = await Storage.from_uri("./data")
-        >>>
-        >>> # Load from GCS
-        >>> storage = await Storage.from_uri("gs://bucket/data")
-    """
-
-    @classmethod
-    async def from_uri(cls, uri: str, retry: RetryPolicy | None = None) -> "Storage":
-        """Create storage instance from URI.
-
-        Args:
-            uri: Storage URI (local path, file://, or gs://)
-            retry: Optional retry policy for GCS operations
-
-        Returns:
-            Storage instance for the given URI
-
-        Raises:
-            ValueError: If URI scheme is unsupported or path is invalid
-        """
-        # Handle local paths without file:// prefix
-        if "://" not in uri:
-            # Treat as local filesystem path
-            base = Path(uri).expanduser().resolve()
-            if base.exists() and not base.is_dir():
-                raise ValueError("Local path must point to a directory")
-            return LocalStorage(base)
-
-        scheme, rest = uri.split("://", 1)
-
-        if scheme == "file":
-            base = Path("/" + rest.lstrip("/")).expanduser().resolve()
-            if base.exists() and not base.is_dir():
-                raise ValueError("file:// URI must point to a directory")
-            return LocalStorage(base)
-
-        if scheme == "gs":
-            bucket, *maybe_prefix = rest.split("/", 1)
-            folder = _posix_rel(maybe_prefix[0] if maybe_prefix else "")
-            return GcsStorage(
-                bucket=bucket,
-                bucket_folder=folder,
-                gcp_credentials=None,  # Will try to load from settings if configured
-                retry=retry,
-            )
-
-        raise ValueError(f"Unsupported scheme: {scheme}")
-
-    # Core API — abstract in the base
-    @abstractmethod
-    def url_for(self, path: str) -> str:
-        """Get URL for path."""
-        ...
-
-    @abstractmethod
-    async def exists(self, path: str) -> bool:
-        """Check if path exists."""
-        ...
-
-    @abstractmethod
-    async def list(
-        self, prefix: str = "", *, recursive: bool = True, include_dirs: bool = True
-    ) -> list[ObjectInfo]:
-        """List objects with prefix."""
-        ...
-
-    @abstractmethod
-    async def read_bytes(self, path: str) -> bytes:
-        """Read bytes from path."""
-        ...
-
-    @abstractmethod
-    async def write_bytes(self, path: str, data: bytes) -> None:
-        """Write bytes to path."""
-        ...
-
-    @abstractmethod
-    async def delete(self, path: str, *, missing_ok: bool = True) -> None:
-        """Delete path."""
-        ...
-
-    async def copy_from(
-        self, other: "Storage", *, src_prefix: str = "", dst_prefix: str = ""
-    ) -> None:
-        """Copy from another storage."""
-        items = await other.list(src_prefix, recursive=True, include_dirs=False)
-        for it in items:
-            data = await other.read_bytes(_join_posix(src_prefix, it.key))
-            await self.write_bytes(_join_posix(dst_prefix, it.key), data)
-
-    def with_base(self, subpath: str) -> "Storage":
-        """Create sub-storage with base path."""
-        raise NotImplementedError("Subclasses must implement with_base")
-
-    async def read_text(self, path: str, encoding: str = "utf-8") -> str:
-        """Read text from path.
-
-        Args:
-            path: Path to read from
-            encoding: Text encoding to use
-
-        Returns:
-            Text content of the file
-        """
-        data = await self.read_bytes(path)
-        return data.decode(encoding)
-
-    async def write_text(self, path: str, text: str, encoding: str = "utf-8") -> None:
-        """Write text to path."""
-        await self.write_bytes(path, text.encode(encoding))
-
-
-# ---------- Local filesystem ----------
-
-
-class LocalStorage(Storage):
-    """Local filesystem storage implementation."""
-
-    def __init__(self, base: Path):
-        """Initialize with base path."""
-        self._base = base
-
-    def with_base(self, subpath: str) -> "Storage":
-        """Create sub-storage with base path.
-
-        Args:
-            subpath: Relative path for sub-storage
-
-        Returns:
-            New LocalStorage instance with updated base path
-        """
-        return LocalStorage(self._base / _posix_rel(subpath))
-
-    def _abs(self, rel: str) -> Path:
-        return (self._base / _posix_rel(rel)).resolve()
-
-    def url_for(self, path: str) -> str:
-        """Get file URL for path.
-
-        Args:
-            path: Relative path
-
-        Returns:
-            File URL for the path
-        """
-        return self._abs(path).as_uri()
-
-    async def exists(self, path: str) -> bool:
-        """Check if path exists.
-
-        Args:
-            path: Path to check
-
-        Returns:
-            True if path exists, False otherwise
-        """
-        return self._abs(path).exists()
-
-    async def list(
-        self, prefix: str = "", *, recursive: bool = True, include_dirs: bool = True
-    ) -> list[ObjectInfo]:
-        """List objects with prefix.
-
-        Args:
-            prefix: Path prefix to list
-            recursive: Whether to list recursively
-            include_dirs: Whether to include directories
-
-        Returns:
-            List of object information
-        """
-        base = self._abs(prefix)
-        if not base.exists():
-            return []
-        if base.is_file():
-            return [ObjectInfo(key="", size=base.stat().st_size, is_dir=False)]
-
-        out: list[ObjectInfo] = []
-        if recursive:
-            for root, dirs, files in os.walk(base):
-                r = Path(root)
-                if include_dirs:
-                    for d in dirs:
-                        out.append(
-                            ObjectInfo(
-                                key=(r / d).relative_to(base).as_posix(), size=-1, is_dir=True
-                            )
-                        )
-                for f in files:
-                    fp = r / f
-                    out.append(
-                        ObjectInfo(
-                            key=fp.relative_to(base).as_posix(),
-                            size=fp.stat().st_size,
-                            is_dir=False,
-                        )
-                    )
-            return out
-
-        with os.scandir(base) as it:
-            for e in it:
-                if e.is_dir():
-                    if include_dirs:
-                        out.append(ObjectInfo(key=e.name, size=-1, is_dir=True))
-                else:
-                    out.append(ObjectInfo(key=e.name, size=e.stat().st_size, is_dir=False))
-        return out
-
-    async def read_bytes(self, path: str) -> bytes:
-        """Read bytes from path.
-
-        Args:
-            path: Path to read from
-
-        Returns:
-            Binary content of the file
-        """
-        return self._abs(path).read_bytes()
-
-    async def write_bytes(self, path: str, data: bytes) -> None:
-        """Write bytes to path.
-
-        Args:
-            path: Path to write to
-            data: Binary data to write
-        """
-        p = self._abs(path)
-        p.parent.mkdir(parents=True, exist_ok=True)
-        p.write_bytes(data)
-
-    async def delete(self, path: str, *, missing_ok: bool = True) -> None:
-        """Delete path.
-
-        Args:
-            path: Path to delete
-            missing_ok: If True, don't raise error if path doesn't exist
-
-        Raises:
-            FileNotFoundError: If path doesn't exist and missing_ok is False
-        """
-        p = self._abs(path)
-        if not p.exists():
-            if not missing_ok:
-                raise FileNotFoundError(str(p))
-            return
-        if p.is_dir():
-            for root, dirs, files in os.walk(p, topdown=False):
-                for f in files:
-                    Path(root, f).unlink(missing_ok=True)
-                for d in dirs:
-                    Path(root, d).rmdir()
-            p.rmdir()
-        else:
-            p.unlink()
-
-
-# ---------- Google Cloud Storage ----------
-
-
-class GcsStorage(Storage):
-    """Google Cloud Storage implementation."""
-
-    def __init__(
-        self,
-        bucket: str,
-        bucket_folder: str = "",
-        gcp_credentials: GcpCredentials | None = None,
-        retry: RetryPolicy | None = None,
-    ):
-        """Initialize GCS storage.
-
-        Args:
-            bucket: GCS bucket name
-            bucket_folder: Optional folder within bucket
-            gcp_credentials: Optional GCP credentials
-            retry: Optional retry policy for operations
-        """
-        # If no credentials provided, try to load from settings
-        if gcp_credentials is None and hasattr(settings, "gcs_service_account_file"):
-            service_account_file = getattr(settings, "gcs_service_account_file", "")
-            if service_account_file:
-                try:
-                    gcp_credentials = GcpCredentials(
-                        service_account_file=Path(service_account_file)
-                    )
-                except Exception:
-                    # If loading fails, pass None to GcsBucket
-                    pass
-
-        if not gcp_credentials:
-            gcp_credentials = GcpCredentials()
-
-        # GcsBucket expects credentials or nothing, not None
-        self.block = GcsBucket(
-            bucket=bucket, bucket_folder=bucket_folder, gcp_credentials=gcp_credentials
-        )
-        self.retry = retry or RetryPolicy()
-
-    async def create_bucket(self) -> None:
-        """Create the GCS bucket if it doesn't exist."""
-        await self.block.create_bucket()  # type: ignore[attr-defined]
-
-    def with_base(self, subpath: str) -> "Storage":
-        """Create sub-storage with base path.
-
-        Args:
-            subpath: Relative path for sub-storage
-
-        Returns:
-            New GcsStorage instance with updated base path
-        """
-        new_folder = _join_posix(self.block.bucket_folder or "", subpath)
-        # Get credentials if they exist
-        creds = getattr(self.block, "gcp_credentials", None)
-        return GcsStorage(
-            bucket=self.block.bucket,  # type: ignore[arg-type]
-            bucket_folder=new_folder,
-            gcp_credentials=creds if creds is not None else None,
-            retry=self.retry,
-        )
-
-    def _base(self) -> str:
-        return self.block.bucket_folder or ""
-
-    def _abs_name(self, rel: str) -> str:
-        base = self._base()
-        return str(PurePosixPath(base) / _posix_rel(rel)) if base else _posix_rel(rel)
-
-    def _rel_from_abs(self, abs_name: str) -> str:
-        base = self._base()
-        if base and abs_name.startswith(base):
-            return _posix_rel(abs_name[len(base) :])
-        return _posix_rel(abs_name)
-
-    def _rex(self) -> tuple[type[BaseException], ...]:
-        return self.retry.retry_exceptions or (Exception,)
-
-    async def _retry(self, label: str, fn) -> Any:  # type: ignore[no-untyped-def]
-        last: BaseException | None = None
-        for i in range(1, self.retry.attempts + 1):
-            try:
-                return await fn()
-            except asyncio.CancelledError:
-                raise
-            except self._rex() as e:  # type: ignore[misc]
-                last = e
-                if i == self.retry.attempts:
-                    break
-                delay = min(self.retry.base_delay * (2 ** (i - 1)), self.retry.max_delay)
-                delay += delay * self.retry.jitter * random.random()
-                logger.warning(
-                    f"GCS {label} failed: {e!s}. "
-                    f"retry {i}/{self.retry.attempts - 1} in {delay:.2f}s"
-                )
-                await asyncio.sleep(delay)
-        assert last is not None
-        logger.error(f"GCS {label} failed after {self.retry.attempts} attempts: {last!s}")
-        raise last
-
-    def url_for(self, path: str) -> str:
-        """Get GCS URL for path.
-
-        Args:
-            path: Relative path
-
-        Returns:
-            GCS URL in format gs://bucket/path
-        """
-        return f"gs://{self.block.bucket}/{self._abs_name(path)}"
-
-    async def exists(self, path: str) -> bool:
-        """Check if path exists.
-
-        Args:
-            path: Path to check
-
-        Returns:
-            True if path exists, False otherwise
-        """
-        name = self._abs_name(path)
-
-        async def blob_exists() -> bool:
-            """Check if blob exists.
-
-            Returns:
-                True if blob exists
-            """
-            bucket = await self.block.get_bucket()  # type: ignore[attr-defined]
-            blob = bucket.blob(name)  # type: ignore[attr-defined]
-            try:
-                return await run_sync_in_worker_thread(blob.exists)
-            except Exception:
-                return False
-
-        async def prefix_exists() -> bool:
-            """Check if prefix exists.
-
-            Returns:
-                True if any objects exist with this prefix
-            """
-            blobs = await self.block.list_blobs(path)  # type: ignore[attr-defined]
-            prefix_name = name.rstrip("/") + "/"
-            return any(
-                getattr(b, "name", None) == name
-                or (getattr(b, "name", "").startswith(prefix_name) if hasattr(b, "name") else False)
-                for b in blobs
-            )
-
-        if await self._retry("exists", blob_exists):
-            return True
-        return await self._retry("exists-prefix", prefix_exists)
-
-    async def list(
-        self, prefix: str = "", *, recursive: bool = True, include_dirs: bool = True
-    ) -> list[ObjectInfo]:
-        """List objects with prefix.
-
-        Args:
-            prefix: Path prefix to list
-            recursive: Whether to list recursively
-            include_dirs: Whether to include directories
-
-        Returns:
-            List of object information
-        """
-        blobs = await self._retry("list", lambda: self.block.list_blobs(prefix))
-        base_abs = self._abs_name(prefix).rstrip("/")
-        out: list[ObjectInfo] = []
-        dirs: set[str] = set()
-
-        def rel(name: str) -> str | None:
-            """Get relative path from name.
-
-            Args:
-                name: Absolute blob name
-
-            Returns:
-                Relative path or None if not under prefix
-            """
-            n = name.rstrip("/")
-            if not base_abs:
-                return n
-            if n == base_abs:
-                return ""
-            if n.startswith(base_abs + "/"):
-                return n[len(base_abs) + 1 :]
-            return None
-
-        for b in blobs:
-            r = rel(b.name)
-            if r is None:
-                continue
-            if not recursive and "/" in r:
-                if include_dirs:
-                    dirs.add(r.split("/", 1)[0])
-                continue
-            size = getattr(b, "size", -1)
-            out.append(ObjectInfo(key=_posix_rel(r), size=size, is_dir=False))
-
-        if include_dirs and not recursive:
-            out.extend(ObjectInfo(key=d, size=-1, is_dir=True) for d in sorted(dirs))
-
-        if not out and prefix:
-            bucket = await self.block.get_bucket()  # type: ignore[attr-defined]
-            blob = bucket.blob(base_abs)  # type: ignore[attr-defined]
-            if await run_sync_in_worker_thread(blob.exists):
-                try:
-                    await run_sync_in_worker_thread(blob.reload)
-                    size = getattr(blob, "size", None)
-                except Exception:
-                    size = None
-                out.append(
-                    ObjectInfo(key="", size=int(size) if size is not None else -1, is_dir=False)
-                )
-
-        return out
-
-    async def read_bytes(self, path: str) -> bytes:
-        """Read bytes from path.
-
-        Args:
-            path: Path to read from
-
-        Returns:
-            Binary content of the file
-        """
-        # GcsBucket.read_path expects a key relative to bucket_folder
-        return await self._retry("read_bytes", lambda: self.block.read_path(path))
-
-    async def write_bytes(self, path: str, data: bytes) -> None:
-        """Write bytes to path.
-
-        Args:
-            path: Path to write to
-            data: Binary data to write
-        """
-        await self._retry("write_bytes", lambda: self.block.write_path(path, data))
-
-    async def delete(self, path: str, *, missing_ok: bool = True) -> None:
-        """Delete path.
-
-        Args:
-            path: Path to delete
-            missing_ok: If True, don't raise error if path doesn't exist
-
-        Raises:
-            FileNotFoundError: If path doesn't exist and missing_ok is False
-        """
-        name = self._abs_name(path)
-        bucket = await self.block.get_bucket()  # type: ignore[attr-defined]
-
-        async def delete_exact() -> bool:
-            """Try to delete exact blob.
-
-            Returns:
-                True if deletion succeeded
-            """
-            try:
-                blob = bucket.blob(name)  # type: ignore[attr-defined]
-                await run_sync_in_worker_thread(blob.delete)
-                return True
-            except Exception:
-                return False
-
-        if await self._retry("delete", delete_exact):
-            return
-
-        blobs = await self._retry("list-for-delete", lambda: self.block.list_blobs(path))
-        if not blobs:
-            if not missing_ok:
-                raise FileNotFoundError(name)
-            return
-        await asyncio.gather(*[run_sync_in_worker_thread(b.delete) for b in blobs])