opteryx_catalog-0.4.4-py3-none-any.whl
This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package as it appears in its public registry.
- opteryx_catalog/__init__.py +31 -0
- opteryx_catalog/catalog/__init__.py +3 -0
- opteryx_catalog/catalog/dataset.py +1221 -0
- opteryx_catalog/catalog/manifest.py +23 -0
- opteryx_catalog/catalog/metadata.py +81 -0
- opteryx_catalog/catalog/metastore.py +68 -0
- opteryx_catalog/catalog/view.py +12 -0
- opteryx_catalog/exceptions.py +38 -0
- opteryx_catalog/iops/__init__.py +6 -0
- opteryx_catalog/iops/base.py +42 -0
- opteryx_catalog/iops/fileio.py +125 -0
- opteryx_catalog/iops/gcs.py +225 -0
- opteryx_catalog/opteryx_catalog.py +923 -0
- opteryx_catalog-0.4.4.dist-info/METADATA +464 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +23 -0
- opteryx_catalog-0.4.4.dist-info/WHEEL +5 -0
- opteryx_catalog-0.4.4.dist-info/licenses/LICENSE +201 -0
- opteryx_catalog-0.4.4.dist-info/top_level.txt +3 -0
- scripts/create_dataset.py +201 -0
- scripts/read_dataset.py +268 -0
- tests/test_dataset_metadata.py +15 -0
- tests/test_import.py +5 -0
- tests/test_pyproject.py +8 -0

opteryx_catalog/catalog/manifest.py
@@ -0,0 +1,23 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Dict
+
+
+@dataclass
+class DataFile:
+    file_path: str
+    file_format: str = "PARQUET"
+    record_count: int = 0
+    file_size_in_bytes: int = 0
+    partition: Dict[str, object] = field(default_factory=dict)
+    lower_bounds: Dict[int, bytes] | None = None
+    upper_bounds: Dict[int, bytes] | None = None
+
+
+@dataclass
+class ManifestEntry:
+    snapshot_id: int
+    data_file: DataFile
+    status: str = "added"  # 'added' | 'deleted'
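
For orientation, a minimal sketch of how these two dataclasses compose; the file path and counts below are hypothetical, not values from the package:

    from opteryx_catalog.catalog.manifest import DataFile, ManifestEntry

    # Describe one data file tracked by the catalog (illustrative values).
    data_file = DataFile(
        file_path="gs://example-bucket/datasets/orders/data/part-000.parquet",
        record_count=1_000,
        file_size_in_bytes=64_000,
    )

    # A manifest entry ties the file to the snapshot that introduced it.
    entry = ManifestEntry(snapshot_id=1, data_file=data_file, status="added")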

opteryx_catalog/catalog/metadata.py
@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+from dataclasses import dataclass
+from dataclasses import field
+from typing import Any
+from typing import List
+from typing import Optional
+
+
+@dataclass
+class Snapshot:
+    snapshot_id: int
+    timestamp_ms: int
+    author: Optional[str] = None
+    # Indicates whether this snapshot was created by a user (True) or internally (False)
+    user_created: Optional[bool] = None
+    # Monotonic sequence number for writes
+    sequence_number: Optional[int] = None
+    manifest_list: Optional[str] = None
+    # Operation metadata
+    operation_type: Optional[str] = None  # e.g., 'append', 'overwrite', 'compact'
+    parent_snapshot_id: Optional[int] = None
+    schema_id: Optional[str] = None
+    # Commit message for the snapshot
+    commit_message: Optional[str] = None
+    # Summary metrics (store zeros when not applicable)
+    summary: dict = field(
+        default_factory=lambda: {
+            "added-data-files": 0,
+            "added-files-size": 0,
+            "added-records": 0,
+            "deleted-data-files": 0,
+            "deleted-files-size": 0,
+            "deleted-records": 0,
+            "total-data-files": 0,
+            "total-files-size": 0,
+            "total-records": 0,
+        }
+    )
+
+
+@dataclass
+class DatasetMetadata:
+    dataset_identifier: str
+    format_version: int = 2
+    location: str = ""
+    schema: Any = None
+    properties: dict = field(default_factory=dict)
+    # Table-level created/updated metadata
+    timestamp_ms: Optional[int] = None
+    author: Optional[str] = None
+    description: Optional[str] = None
+    describer: Optional[str] = None
+    sort_orders: List[dict] = field(default_factory=list)
+    # Maintenance policy: retention settings grouped under a single block
+    maintenance_policy: dict = field(
+        default_factory=lambda: {
+            "retained-snapshot-count": None,
+            "retained-snapshot-age-days": None,
+            "compaction-policy": "performance",
+        }
+    )
+    # Compaction policy lives under maintenance_policy as 'compaction-policy'
+    snapshots: List[Snapshot] = field(default_factory=list)
+    current_snapshot_id: Optional[int] = None
+    # Schema management: schemas are stored in a subcollection in Firestore.
+    # `schemas` contains dicts with keys: schema_id, columns (list of {id,name,type}).
+    # Each schema dict may also include `timestamp-ms` and `author`.
+    schemas: List[dict] = field(default_factory=list)
+    current_schema_id: Optional[str] = None
+
+    def current_snapshot(self) -> Optional[Snapshot]:
+        if self.current_snapshot_id is None:
+            return self.snapshots[-1] if self.snapshots else None
+        for s in self.snapshots:
+            if s.snapshot_id == self.current_snapshot_id:
+                return s
+        return None
+
+
+# Dataset terminology: TableMetadata renamed to DatasetMetadata
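
The `current_snapshot()` resolution rule (an explicit pointer wins, otherwise fall back to the most recent snapshot) in a short sketch; the ids and timestamps are illustrative:

    from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot

    meta = DatasetMetadata(dataset_identifier="workspace.collection.orders")
    meta.snapshots = [
        Snapshot(snapshot_id=1, timestamp_ms=1_700_000_000_000),
        Snapshot(snapshot_id=2, timestamp_ms=1_700_000_060_000),
    ]

    # No current_snapshot_id set: the latest snapshot is returned.
    assert meta.current_snapshot().snapshot_id == 2

    # An explicit pointer overrides the fallback.
    meta.current_snapshot_id = 1
    assert meta.current_snapshot().snapshot_id == 1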

opteryx_catalog/catalog/metastore.py
@@ -0,0 +1,68 @@
+from __future__ import annotations
+
+from typing import Any
+from typing import Iterable
+from typing import Optional
+
+
+class Metastore:
+    """Abstract catalog interface.
+
+    Implementations should provide methods to create, load and manage
+    datasets and views. Terminology in this project follows the mapping:
+    `catalog -> workspace -> collection -> dataset|view`.
+    Signatures are intentionally simple and similar to other catalog
+    implementations to ease future compatibility.
+    """
+
+    def load_dataset(self, identifier: str) -> "Table":
+        raise NotImplementedError()
+
+    def create_dataset(
+        self, identifier: str, schema: Any, properties: dict | None = None
+    ) -> "Table":
+        raise NotImplementedError()
+
+    def drop_dataset(self, identifier: str) -> None:
+        raise NotImplementedError()
+
+    def list_datasets(self, namespace: str) -> Iterable[str]:
+        raise NotImplementedError()
+
+
+class Dataset:
+    """Abstract dataset interface.
+
+    Minimal methods needed by the Opteryx engine and tests: access metadata,
+    list snapshots, append data, and produce a data scan object.
+    """
+
+    @property
+    def metadata(self) -> Any:
+        raise NotImplementedError()
+
+    def snapshots(self) -> Iterable[Any]:
+        raise NotImplementedError()
+
+    def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Any]:
+        """Return a specific snapshot by id or the current snapshot when
+        called with `snapshot_id=None`.
+        """
+        raise NotImplementedError()
+
+    def append(self, table):
+        """Append data (implementations can accept pyarrow.Table or similar)."""
+        raise NotImplementedError()
+
+    def scan(
+        self, row_filter=None, snapshot_id: Optional[int] = None, row_limit: Optional[int] = None
+    ) -> Any:
+        raise NotImplementedError()
+
+
+class View:
+    """Abstract view metadata representation."""
+
+    @property
+    def definition(self) -> str:
+        raise NotImplementedError()
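
A hedged sketch of how an engine-side caller might drive these interfaces, assuming some concrete `Metastore` instance; the identifier is hypothetical and `read_latest` is not part of the package:

    from opteryx_catalog.catalog.metastore import Metastore

    def read_latest(catalog: Metastore, identifier: str):
        # Load the dataset, resolve the current snapshot, and scan it.
        dataset = catalog.load_dataset(identifier)
        current = dataset.snapshot()  # snapshot_id=None -> current snapshot
        return dataset.scan(snapshot_id=current.snapshot_id if current else None)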

opteryx_catalog/exceptions.py
@@ -0,0 +1,38 @@
+"""Catalog-specific exceptions for opteryx_catalog.
+
+Exceptions mirror previous behavior (they subclass KeyError where callers
+may expect KeyError) but provide explicit types for tables, views and
+namespaces.
+"""
+
+
+class CatalogError(Exception):
+    """Base class for catalog errors."""
+
+
+class DatasetError(KeyError, CatalogError):
+    pass
+
+
+class DatasetAlreadyExists(DatasetError):
+    pass
+
+
+class DatasetNotFound(DatasetError):
+    pass
+
+
+class ViewError(KeyError, CatalogError):
+    pass
+
+
+class ViewAlreadyExists(ViewError):
+    pass
+
+
+class ViewNotFound(ViewError):
+    pass
+
+
+class CollectionAlreadyExists(KeyError, CatalogError):
+    pass
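
Because the dataset and view errors also subclass `KeyError`, call sites written against the previous behavior keep working; a small sketch:

    from opteryx_catalog.exceptions import CatalogError, DatasetNotFound

    assert issubclass(DatasetNotFound, CatalogError)

    try:
        raise DatasetNotFound("workspace.collection.missing")
    except KeyError:
        # Legacy handlers that catch KeyError still intercept the new types.
        pass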

opteryx_catalog/iops/base.py
@@ -0,0 +1,42 @@
+from __future__ import annotations
+
+from io import BytesIO
+from typing import BinaryIO
+
+
+class InputFile:
+    def __init__(self, location: str, content: bytes | None = None):
+        self.location = location
+        self._content = content
+
+    def open(self) -> BinaryIO:
+        if self._content is None:
+            raise FileNotFoundError(self.location)
+        return BytesIO(self._content)
+
+
+class OutputFile:
+    def __init__(self, location: str):
+        self.location = location
+
+    def create(self):
+        """Return a file-like object with a `write` method.
+
+        Implementations may return a buffer or a writer that persists on write/close.
+        """
+        raise NotImplementedError()
+
+
+class FileIO:
+    """Minimal FileIO abstraction used by the `opteryx_catalog` layer.
+
+    Concrete implementations should implement `new_input`, `new_output`, and
+    optionally `delete`/`exists`. The abstraction intentionally keeps only the
+    small surface needed by the catalog (read bytes, write bytes).
+    """
+
+    def new_input(self, location: str) -> InputFile:
+        return InputFile(location)
+
+    def new_output(self, location: str) -> OutputFile:
+        return OutputFile(location)
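
To make the contract concrete, a minimal in-memory `FileIO` (an illustration, not part of the package) whose writer persists on `write`, as the `OutputFile.create` docstring permits:

    from opteryx_catalog.iops.base import FileIO, InputFile, OutputFile

    class _MemoryOutputFile(OutputFile):
        def __init__(self, location: str, store: dict):
            super().__init__(location)
            self._store = store

        def create(self):
            store, location = self._store, self.location

            class _Writer:
                def write(self, data: bytes):
                    # Persist immediately on write.
                    store[location] = bytes(data)

                def close(self):
                    pass

            return _Writer()

    class MemoryFileIO(FileIO):
        def __init__(self):
            self._store = {}

        def new_input(self, location: str) -> InputFile:
            # Content is None for unknown locations, so open() raises FileNotFoundError.
            return InputFile(location, self._store.get(location))

        def new_output(self, location: str) -> OutputFile:
            return _MemoryOutputFile(location, self._store)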

opteryx_catalog/iops/fileio.py
@@ -0,0 +1,125 @@
+from __future__ import annotations
+
+from io import BytesIO
+from typing import BinaryIO
+
+
+class InputFile:
+    def __init__(self, location: str, content: bytes | None = None):
+        self.location = location
+        self._content = content
+
+    def open(self) -> BinaryIO:
+        if self._content is None:
+            raise FileNotFoundError(self.location)
+        return BytesIO(self._content)
+
+
+class OutputFile:
+    def __init__(self, location: str):
+        self.location = location
+
+    def create(self):
+        """Return a file-like object with a `write` method.
+
+        Implementations may return a buffer or a writer that persists on write/close.
+        """
+        raise NotImplementedError()
+
+
+class FileIO:
+    """Minimal FileIO abstraction used by the `opteryx_catalog` layer.
+
+    Concrete implementations should implement `new_input`, `new_output`, and
+    optionally `delete`/`exists`. The abstraction intentionally keeps only the
+    small surface needed by the catalog (read bytes, write bytes).
+    """
+
+    def new_input(self, location: str) -> InputFile:
+        return InputFile(location)
+
+    def new_output(self, location: str) -> OutputFile:
+        return OutputFile(location)
+
+
+class _GcsAdapterOutputFile(OutputFile):
+    def __init__(self, location: str, gcs_fileio):
+        super().__init__(location)
+        self._location = location
+        self._gcs_fileio = gcs_fileio
+
+    def create(self):
+        """Return a writer whose `write(data)` uploads the data via the wrapped GCS FileIO.
+
+        We perform the upload on the first write and close the underlying stream
+        afterwards so callers that simply call `out.write(data)` (common pattern
+        in this codebase) will succeed.
+        """
+
+        class _Writer:
+            def __init__(self, location: str, gcs_fileio):
+                self._location = location
+                self._gcs_fileio = gcs_fileio
+                self._stream = None
+
+            def write(self, data: bytes | bytearray):
+                if self._stream is None:
+                    # Create underlying output stream (may be a GcsOutputStream,
+                    # DiscardOutputStream, or CaptureOutputStream depending on
+                    # the wrapped FileIO behaviour).
+                    out = self._gcs_fileio.new_output(self._location)
+                    self._stream = out.create()
+                # Underlying stream implements write/close semantics
+                self._stream.write(data)
+
+            def close(self):
+                if self._stream is not None:
+                    try:
+                        self._stream.close()
+                    except Exception:
+                        pass
+
+        return _Writer(self._location, self._gcs_fileio)
+
+
+class GcsFileIO(FileIO):
+    """GCS-backed FileIO adapter that wraps the existing GCS implementation.
+
+    This adapter delegates to `pyiceberg_firestore_gcs.fileio.gcs_fileio.GcsFileIO`
+    for actual network operations but exposes the small `opteryx_catalog.iops`
+    `FileIO` interface used by the catalog layer.
+    """
+
+    def __init__(self, properties=None):
+        # Lazy import to avoid pulling google libs unless used
+        from pyiceberg_firestore_gcs.fileio.gcs_fileio import GcsFileIO as _GcsImpl
+
+        self._impl = _GcsImpl(properties or {})
+
+    def new_input(self, location: str) -> InputFile:
+        # Read full bytes from the underlying InputFile and return an in-memory InputFile
+        impl_input = self._impl.new_input(location)
+        try:
+            stream = impl_input.open()
+            data = stream.read()
+            return InputFile(location, data)
+        except FileNotFoundError:
+            return InputFile(location, None)
+
+    def new_output(self, location: str) -> OutputFile:
+        return _GcsAdapterOutputFile(location, self._impl)
+
+    def delete(self, location: str) -> None:
+        return self._impl.delete(location)
+
+    def exists(self, location: str) -> bool:
+        try:
+            impl_in = self._impl.new_input(location)
+            # Some implementations provide `exists()`
+            if hasattr(impl_in, "exists"):
+                return impl_in.exists()
+            # Fallback: try to open
+            _ = impl_in.open()
+            return True
+        except Exception:
+            return False
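
Assuming `pyiceberg_firestore_gcs` is installed and GCS credentials are configured, a round trip through the adapter could look like this (bucket and object path are hypothetical):

    from opteryx_catalog.iops.fileio import GcsFileIO

    io = GcsFileIO()

    writer = io.new_output("gs://example-bucket/metadata/manifest.avro").create()
    writer.write(b"manifest bytes")  # first write opens the wrapped GCS stream
    writer.close()                   # close flushes the underlying stream

    data = io.new_input("gs://example-bucket/metadata/manifest.avro").open().read()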

opteryx_catalog/iops/gcs.py
@@ -0,0 +1,225 @@
+"""
+Optimized GCS FileIO for opteryx_catalog.iops
+
+Adapted from pyiceberg_firestore_gcs.fileio.gcs_fileio to provide a fast
+HTTP-backed GCS implementation without depending on pyiceberg types.
+"""
+
+import io
+import logging
+import os
+import urllib.parse
+from typing import Callable
+from typing import Union
+
+import requests
+from google.auth.transport.requests import Request
+from requests.adapters import HTTPAdapter
+
+from .base import FileIO
+from .base import InputFile
+from .base import OutputFile
+
+logger = logging.getLogger(__name__)
+
+
+def _get_storage_credentials():
+    from google.cloud import storage
+
+    if os.environ.get("STORAGE_EMULATOR_HOST"):
+        from google.auth.credentials import AnonymousCredentials
+
+        storage_client = storage.Client(credentials=AnonymousCredentials())
+    else:
+        storage_client = storage.Client()
+    return storage_client._credentials
+
+
+class _GcsInputStream(io.BytesIO):
+    def __init__(
+        self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        # Strip gs://
+        if path.startswith("gs://"):
+            path = path[5:]
+        bucket = path.split("/", 1)[0]
+        object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
+        url = f"https://storage.googleapis.com/{bucket}/{object_full_path}"
+
+        access_token = access_token_getter()
+        headers = {"Accept-Encoding": "identity"}
+        if access_token:
+            headers["Authorization"] = f"Bearer {access_token}"
+
+        response = session.get(
+            url,
+            headers=headers,
+            timeout=30,
+        )
+
+        if response.status_code != 200:
+            raise FileNotFoundError(
+                f"Unable to read '{path}' - status {response.status_code}: {response.text}"
+            )
+
+        super().__init__(response.content)
+
+
+class _GcsOutputStream(io.BytesIO):
+    def __init__(
+        self, path: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        super().__init__()
+        self._path = path
+        self._session = session
+        self._access_token_getter = access_token_getter
+        self._closed = False
+
+    def close(self):
+        if self._closed:
+            return
+
+        path = self._path
+        if path.startswith("gs://"):
+            path = path[5:]
+
+        bucket = path.split("/", 1)[0]
+        url = f"https://storage.googleapis.com/upload/storage/v1/b/{bucket}/o"
+
+        data = self.getvalue()
+        object_name = path[(len(bucket) + 1) :]
+
+        token = self._access_token_getter()
+        headers = {
+            "Content-Type": "application/octet-stream",
+            "Content-Length": str(len(data)),
+        }
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+
+        response = self._session.post(
+            url,
+            params={"uploadType": "media", "name": object_name},
+            headers=headers,
+            data=data,
+            timeout=60,
+        )
+
+        if response.status_code not in (200, 201):
+            raise IOError(
+                f"Failed to write '{self._path}' - status {response.status_code}: {response.text}"
+            )
+
+        self._closed = True
+        super().close()
+
+
+class _GcsInputFile(InputFile):
+    def __init__(
+        self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        # read entire bytes via optimized session
+        try:
+            stream = _GcsInputStream(location, session, access_token_getter)
+            data = stream.read()
+            super().__init__(location, data)
+        except FileNotFoundError:
+            super().__init__(location, None)
+
+
+class _GcsOutputFile(OutputFile):
+    def __init__(
+        self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
+    ):
+        super().__init__(location)
+        self._location = location
+        self._session = session
+        self._access_token_getter = access_token_getter
+
+    def create(self):
+        return _GcsOutputStream(self._location, self._session, self._access_token_getter)
+
+
+class GcsFileIO(FileIO):
+    """Optimized HTTP-backed GCS FileIO.
+
+    Implements a blackhole/capture pattern for manifest files and exposes
+    `new_input`, `new_output`, `delete`, `exists`.
+    """
+
+    def __init__(self):
+        # Track manifest paths and captured manifests
+        self.manifest_paths: list[str] = []
+        self.captured_manifests: list[tuple[str, bytes]] = []
+
+        # Prepare requests session and set up credential refresh helper (token may expire)
+        self._credentials = _get_storage_credentials()
+        self._access_token = None
+
+        def _refresh_credentials():
+            try:
+                if not self._credentials.valid:
+                    req = Request()
+                    self._credentials.refresh(req)
+                self._access_token = self._credentials.token
+            except Exception as e:
+                logger.warning("Failed to refresh GCS credentials: %s", e)
+                self._access_token = None
+
+        self._refresh_credentials = _refresh_credentials
+
+        def get_access_token():
+            # Refresh credentials on demand to avoid using expired tokens
+            self._refresh_credentials()
+            return self._access_token
+
+        self.get_access_token = get_access_token
+
+        self._session = requests.session()
+        adapter = HTTPAdapter(pool_connections=100, pool_maxsize=100)
+        self._session.mount("https://", adapter)
+
+    def new_input(self, location: str) -> InputFile:
+        return _GcsInputFile(location, self._session, self.get_access_token)
+
+    def new_output(self, location: str) -> OutputFile:
+        logger.info(f"new_output -> {location}")
+
+        return _GcsOutputFile(location, self._session, self.get_access_token)
+
+    def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
+        if isinstance(location, (InputFile, OutputFile)):
+            location = location.location
+
+        path = location
+        if path.startswith("gs://"):
+            path = path[5:]
+
+        bucket = path.split("/", 1)[0]
+        object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
+        url = f"https://storage.googleapis.com/storage/v1/b/{bucket}/o/{object_full_path}"
+
+        token = self.get_access_token()
+        headers = {}
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+        response = self._session.delete(url, headers=headers, timeout=10)
+
+        if response.status_code not in (204, 404):
+            raise IOError(f"Failed to delete '{location}' - status {response.status_code}")
+
+    def exists(self, location: str) -> bool:
+        path = location
+        if path.startswith("gs://"):
+            path = path[5:]
+
+        bucket = path.split("/", 1)[0]
+        object_full_path = urllib.parse.quote(path[(len(bucket) + 1) :], safe="")
+        url = f"https://storage.googleapis.com/{bucket}/{object_full_path}"
+
+        token = self.get_access_token()
+        headers = {}
+        if token:
+            headers["Authorization"] = f"Bearer {token}"
+        response = self._session.head(url, headers=headers, timeout=10)
+        return response.status_code == 200
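
In this optimized implementation, writes buffer in memory and the upload is a single POST issued from `close()`, so an unclosed stream never reaches the bucket; a final sketch with hypothetical paths:

    from opteryx_catalog.iops.gcs import GcsFileIO

    io = GcsFileIO()

    stream = io.new_output("gs://example-bucket/datasets/orders/part-001.parquet").create()
    stream.write(b"parquet bytes")
    stream.close()  # the POST to storage.googleapis.com happens here

    if io.exists("gs://example-bucket/datasets/orders/part-001.parquet"):
        io.delete("gs://example-bucket/datasets/orders/part-001.parquet")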