tab-cli 0.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tab_cli/__init__.py +3 -0
- tab_cli/cli.py +171 -0
- tab_cli/config.py +14 -0
- tab_cli/formats/__init__.py +15 -0
- tab_cli/formats/avro.py +47 -0
- tab_cli/formats/base.py +63 -0
- tab_cli/formats/csv.py +45 -0
- tab_cli/formats/jsonl.py +41 -0
- tab_cli/formats/parquet.py +57 -0
- tab_cli/handlers/__init__.py +87 -0
- tab_cli/handlers/base.py +259 -0
- tab_cli/handlers/cli_table.py +55 -0
- tab_cli/storage/__init__.py +83 -0
- tab_cli/storage/aws.py +223 -0
- tab_cli/storage/az.py +249 -0
- tab_cli/storage/base.py +36 -0
- tab_cli/storage/fsspec.py +60 -0
- tab_cli/storage/gcloud.py +215 -0
- tab_cli/storage/local.py +25 -0
- tab_cli/style.py +4 -0
- tab_cli/url_parser.py +97 -0
- tab_cli-0.1.1.dist-info/METADATA +27 -0
- tab_cli-0.1.1.dist-info/RECORD +26 -0
- tab_cli-0.1.1.dist-info/WHEEL +4 -0
- tab_cli-0.1.1.dist-info/entry_points.txt +2 -0
- tab_cli-0.1.1.dist-info/licenses/LICENSE +21 -0
tab_cli/storage/az.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from enum import Enum
|
|
3
|
+
|
|
4
|
+
from typing import Any, BinaryIO, Iterator
|
|
5
|
+
from loguru import logger
|
|
6
|
+
|
|
7
|
+
from tab_cli.storage import StorageBackend, FileInfo
|
|
8
|
+
from tab_cli.url_parser import parse_url
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class AzAuthMethod(Enum):
    """Identifies which credential source an Azure backend authenticated with."""

    # Full connection string taken from the environment.
    CONNECTION_STRING = 1
    # Storage account key taken from the environment.
    ACCOUNT_KEY = 2
    # Shared access signature token taken from the environment.
    SAS_TOKEN = 3
    # Azure AD / RBAC credential chain.
    AZURE_AD = 4
    # Account key fetched through the Azure CLI.
    AZURE_CLI = 5
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class AzBackend(StorageBackend):
    """Storage backend for Azure Blob Storage with configurable URL interpretation.

    When az_url_authority_is_account=True:
    - az://account/container/path - authority is the storage account name
    - az:///container/path - account inferred from AZURE_STORAGE_ACCOUNT

    When az_url_authority_is_account=False (default adlfs behavior):
    - az://container/path - authority is the container name

    Authentication is attempted in this order, stopping at the first that works:
    1. AZURE_STORAGE_CONNECTION_STRING
    2. AZURE_STORAGE_KEY
    3. AZURE_STORAGE_SAS_TOKEN
    4. Azure AD / RBAC via DefaultAzureCredential (requires azure-identity)
    5. Account key fetched through the Azure CLI
    """

    def __init__(self, account: str | None, container: str | None, az_url_authority_is_account: bool = False) -> None:
        """Initialize the Azure Blob Storage backend.

        Args:
            account: Storage account name. When None, AZURE_STORAGE_ACCOUNT is used.
            container: Container listed to verify credentials for the Azure AD
                and Azure CLI authentication paths.
            az_url_authority_is_account: If True, interpret the URL authority as the
                storage account name. If False, interpret it as the container name.

        Raises:
            ImportError: If the 'adlfs' package is not installed.
            ValueError: If no account name is available or no authentication
                method succeeds.
        """
        try:
            import adlfs
        except ImportError as e:
            raise ImportError("Package 'adlfs' is required for az:// URLs. Install with: pip install adlfs") from e

        self.adlfs = adlfs
        self.fs = None
        # Always defined so storage_options() never hits a missing attribute.
        self.method = None
        self.url_authority_is_account = az_url_authority_is_account

        if account is None:
            account = os.environ.get("AZURE_STORAGE_ACCOUNT")
        connection_string = os.environ.get("AZURE_STORAGE_CONNECTION_STRING")
        account_key = os.environ.get("AZURE_STORAGE_KEY")
        sas_token = os.environ.get("AZURE_STORAGE_SAS_TOKEN")

        self.account = account
        if account is None:
            raise ValueError("Storage account name must be specified. Use az://account/container/path and use `tab --az-url-authority-is-account`, or set the environment variable.")

        # 1. Try connection string from environment
        if connection_string:
            logger.debug("Authenticating to Azure storage account '{}' using connection string", account)
            self.fs = self.adlfs.AzureBlobFileSystem(connection_string=connection_string)
            self.connection_string = connection_string
            self.method = AzAuthMethod.CONNECTION_STRING

        # 2. Try account key from environment
        elif account_key:
            logger.debug("Authenticating to Azure storage account '{}' using account key", account)
            self.fs = self.adlfs.AzureBlobFileSystem(
                account_name=account,
                account_key=account_key,
            )
            self.account_key = account_key
            self.method = AzAuthMethod.ACCOUNT_KEY

        # 3. Try SAS token from environment
        elif sas_token:
            logger.debug("Authenticating to Azure storage account '{}' using SAS token", account)
            self.fs = self.adlfs.AzureBlobFileSystem(
                account_name=account,
                sas_token=sas_token,
            )
            self.sas_token = sas_token
            self.method = AzAuthMethod.SAS_TOKEN

        # 4. Azure AD / RBAC, then 5. account key via the Azure CLI
        elif not self._connect_with_azure_ad(account, container):
            self._connect_with_cli_key(account, container)

        if self.fs is None:
            raise ValueError(
                f"Could not authenticate to storage account '{account}'. "
                "Set AZURE_STORAGE_CONNECTION_STRING, AZURE_STORAGE_KEY, AZURE_STORAGE_SAS_TOKEN, "
                "configure Azure AD RBAC, or run 'az login'."
            )

    def _connect_with_azure_ad(self, account: str, container: str | None) -> bool:
        """Try DefaultAzureCredential; set self.fs/self.method only if listing the container works."""
        try:
            from azure.identity.aio import DefaultAzureCredential  # Async version
        except ImportError:
            logger.debug("azure-identity not installed, skipping Azure AD / RBAC authentication")
            return False

        logger.debug("Authenticating to Azure storage account '{}' using Azure AD / RBAC", account)
        try:
            # Keep the candidate in a local: assigning self.fs before the probe
            # would leave a non-working filesystem in place when the probe fails
            # and would suppress the Azure CLI fallback.
            candidate = self.adlfs.AzureBlobFileSystem(
                account_name=account,
                credential=DefaultAzureCredential(),
            )
            candidate.ls(container)
        except Exception as e:
            logger.debug("Azure AD / RBAC authentication failed: {}", e)
            return False

        self.fs = candidate
        self.method = AzAuthMethod.AZURE_AD
        return True

    def _connect_with_cli_key(self, account: str, container: str | None) -> bool:
        """Try an account key obtained from the Azure CLI; set self.fs/self.method only on success."""
        logger.debug("Attempting to fetch account key via Azure CLI for '{}'", account)
        try:
            account_key = self._get_account_key_via_cli(account)
        except Exception as e:
            logger.debug("Fetching account key via Azure CLI failed: {}", e)
            return False
        if not account_key:
            return False

        logger.debug("Authenticating to Azure storage account '{}' using key from Azure CLI", account)
        try:
            candidate = self.adlfs.AzureBlobFileSystem(
                account_name=account,
                account_key=account_key,
            )
            candidate.ls(container)
        except Exception as e:
            logger.debug("Authenticating to Azure storage account '{}' using key from Azure CLI failed: {}", account, e)
            return False

        self.fs = candidate
        self.account_key = account_key
        self.method = AzAuthMethod.AZURE_CLI
        return True

    def _get_account_key_via_cli(self, account: str) -> str | None:
        """Try to get storage account key via Azure CLI.

        Returns:
            The first key reported by `az storage account keys list`, or None
            when the CLI is missing, times out, fails, or prints nothing.
        """
        import subprocess

        try:
            result = subprocess.run(
                ["az", "storage", "account", "keys", "list",
                 "--account-name", account,
                 "--query", "[0].value",
                 "-o", "tsv"],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except (subprocess.TimeoutExpired, FileNotFoundError):
            return None

        key = result.stdout.strip()
        if result.returncode == 0 and key:
            return key
        return None

    def normalize_for_polars(self, url: str) -> str:
        """Normalize URL to a format Polars understands.

        Polars expects az://container/path format.

        Returns:
            Normalized URL in az://container/path format.
        """
        parsed = parse_url(url)
        return f"az://{parsed.bucket}/{parsed.path}"

    def storage_options(self, url: str) -> dict[str, str] | None:
        """Return storage options for Polars Azure access.

        Returns:
            Dict with appropriate authentication options for Azure.
            Includes both adlfs-style keys and Rust object_store keys for compatibility.
            Priority: connection_string > account_key > sas_token > CLI key > account_name only
        """
        if self.method == AzAuthMethod.CONNECTION_STRING:
            return {
                "connection_string": self.connection_string,
                "azure_storage_connection_string": self.connection_string,
            }

        # A key from the environment and a key fetched via the CLI are passed on identically.
        if self.method in (AzAuthMethod.ACCOUNT_KEY, AzAuthMethod.AZURE_CLI):
            return {
                "account_name": self.account,
                "account_key": self.account_key,
                "azure_storage_account_name": self.account,
                "azure_storage_account_key": self.account_key,
            }

        if self.method == AzAuthMethod.SAS_TOKEN:
            return {
                "account_name": self.account,
                "sas_token": self.sas_token,
                "azure_storage_account_name": self.account,
                "azure_storage_sas_token": self.sas_token,
            }

        # Fallback: account name only (will use DefaultAzureCredential)
        return {
            "account_name": self.account,
            "azure_storage_account_name": self.account,
            "anon": "false",
            "use_azure_cli": "true",
            "azure_use_azure_cli": "true",
        }

    def _to_internal(self, url: str) -> str:
        """Convert URL to the 'container/path' string used for adlfs operations."""
        parsed = parse_url(url)
        return f"{parsed.bucket}/{parsed.path}"

    def _to_uri(self, account: str, internal_path: str) -> str:
        """Convert internal path back to az:// URL."""
        if self.url_authority_is_account:
            return f"az://{account}/{internal_path}"
        return f"az://{internal_path}"

    def open(self, url: str) -> BinaryIO:
        """Open the blob at `url` for binary reading."""
        return self.fs.open(self._to_internal(url), "rb")

    def list_files(self, url: str, extension: str) -> Iterator[FileInfo]:
        """Yield a FileInfo for every blob under `url` whose name ends with `extension`, in sorted order."""
        # Drop trailing slashes so a container root ("container/") or a URL
        # written with a trailing "/" does not produce a "//" in the pattern.
        internal_path = self._to_internal(url).rstrip("/")
        pattern = f"{internal_path}/**/*{extension}"
        for path in sorted(self.fs.glob(pattern)):
            info = self.fs.info(path)
            yield FileInfo(url=self._to_uri(self.account, path), size=info["size"])

    def size(self, url: str) -> int:
        """Return the size reported by the filesystem for `url`."""
        return self.fs.size(self._to_internal(url))

    def is_directory(self, url: str) -> bool:
        """Return True when `url` is a directory or a prefix that has contents."""
        path = self._to_internal(url)
        try:
            info = self.fs.info(path)
            return info.get("type") == "directory"
        except FileNotFoundError:
            # No entry of its own: treat it as a directory if listing it yields anything.
            try:
                contents = self.fs.ls(path, detail=False)
                return len(contents) > 0
            except Exception:
                return False

    def __del__(self):
        try:
            # Check if fs exists and is set before closing it
            if hasattr(self, 'fs') and self.fs is not None:
                self.fs.close()
        except Exception:
            # Silently fail as the interpreter is likely in mid-teardown
            pass
|
tab_cli/storage/base.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
"""Base storage backend interface."""
|
|
2
|
+
|
|
3
|
+
from abc import ABC, abstractmethod
|
|
4
|
+
from dataclasses import dataclass
|
|
5
|
+
from typing import BinaryIO, Iterator, Any
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
class FileInfo:
    """A single file reported by a storage backend."""

    # Location of the file, in the form the backend hands back to callers.
    url: str
    # Size of the file as reported by the backend.
    size: int
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class StorageBackend(ABC):
    """Abstract interface that every storage backend implements.

    Subclasses must provide open, list_files, size and is_directory.
    normalize_for_polars and storage_options have pass-through defaults.
    """

    @abstractmethod
    def open(self, url: str) -> BinaryIO:
        """Open `url` and return a binary file object."""

    @abstractmethod
    def list_files(self, url: str, extension: str) -> Iterator[FileInfo]:
        """Iterate over the files under `url` that match `extension`."""

    @abstractmethod
    def size(self, url: str) -> int:
        """Return the size of the file at `url`."""

    @abstractmethod
    def is_directory(self, url: str) -> bool:
        """Report whether `url` refers to a directory."""

    def normalize_for_polars(self, url: str) -> str:
        """Return `url` unchanged; backends override this when a rewrite is needed."""
        return url

    def storage_options(self, url: str) -> dict[str, Any] | None:
        """Return None; backends override this to supply storage options."""
        return None
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
"""Cloud storage backend using fsspec.
|
|
2
|
+
|
|
3
|
+
Fallback for any fsspec-supported protocol not handled by dedicated backends.
|
|
4
|
+
|
|
5
|
+
Note: The following have dedicated backends:
|
|
6
|
+
- az://, abfs://, abfss:// -> AzBackend
|
|
7
|
+
- gs:// -> GcsBackend
|
|
8
|
+
- s3:// -> S3Backend
|
|
9
|
+
|
|
10
|
+
The appropriate protocol handler package must be installed separately.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from typing import BinaryIO, Iterator, Any
|
|
14
|
+
import fsspec
|
|
15
|
+
|
|
16
|
+
from tab_cli.storage.base import FileInfo, StorageBackend
|
|
17
|
+
|
|
18
|
+
# Package hints for common protocols
|
|
19
|
+
# Maps a protocol name to the pip package suggested when its handler cannot
# be loaded. It is empty here, so lookups return None for every protocol.
_PACKAGE_HINTS: dict[str, str] = {}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class FsspecBackend(StorageBackend):
    """Generic storage backend that delegates every operation to an fsspec filesystem."""

    def __init__(self, protocol: str) -> None:
        """Create the fsspec filesystem for `protocol`.

        Raises:
            ImportError: If fsspec cannot provide a filesystem for the protocol.
        """
        self._protocol = protocol

        try:
            self._fs: fsspec.AbstractFileSystem = fsspec.filesystem(protocol)
        except (ImportError, ValueError) as e:
            hint = _PACKAGE_HINTS.get(protocol)
            if hint is None:
                raise ImportError(f"No handler found for {protocol}:// URLs") from e
            raise ImportError(f"Package '{hint}' is required for {protocol}:// URLs. Install with: pip install {hint}") from e

    def open(self, url: str) -> BinaryIO:
        """Open `url` for binary reading."""
        return self._fs.open(url, "rb")

    def list_files(self, url: str, extension: str) -> Iterator[FileInfo]:
        """Yield a FileInfo for each file under `url` ending with `extension`, in sorted order."""
        prefix = f"{self._protocol}://"
        matches = sorted(self._fs.glob(f"{url}/**/*{extension}"))
        for match in matches:
            details = self._fs.info(match)
            # Glob results may come back without the protocol; restore it when missing.
            if match.startswith(prefix):
                full_uri = match
            else:
                full_uri = f"{self._protocol}://{match}"
            yield FileInfo(url=full_uri, size=details["size"])

    def size(self, url: str) -> int:
        """Return the size reported by the filesystem for `url`."""
        return self._fs.size(url)

    def is_directory(self, url: str) -> bool:
        """Return True when `url` is a directory or a prefix that has contents."""
        try:
            return self._fs.info(url).get("type") == "directory"
        except FileNotFoundError:
            pass

        # No entry of its own: treat it as a directory if listing it yields anything.
        try:
            listing = self._fs.ls(url, detail=False)
        except Exception:
            return False
        return len(listing) > 0

    def storage_options(self, url: str) -> dict[str, Any] | None:
        """Return None: this backend supplies no storage options."""
        return None
|
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from enum import Enum
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from typing import BinaryIO, Iterator
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
from tab_cli.storage.base import StorageBackend, FileInfo
|
|
9
|
+
from tab_cli.url_parser import parse_url
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class GcloudAuthMethod(Enum):
    """Identifies which credential source the GCS backend authenticated with."""

    # GOOGLE_APPLICATION_CREDENTIALS
    SERVICE_ACCOUNT = 1
    # Application Default Credentials file
    ADC = 2
    # gcloud auth login (non-ADC)
    GCLOUD_CLI = 3
    # google.auth.default()
    GOOGLE_DEFAULT = 4
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class GcloudBackend(StorageBackend):
    """Storage backend for Google Cloud Storage.

    URL format: gs://bucket/path

    Authentication order:
    1. GOOGLE_APPLICATION_CREDENTIALS environment variable (service account JSON)
    2. ADC file (~/.config/gcloud/application_default_credentials.json)
    3. gcloud CLI login credentials (gcloud auth print-access-token)
    4. google.auth.default() via gcsfs token="google_default"
    """

    def __init__(self) -> None:
        """Initialize the Google Cloud Storage backend.

        Raises:
            ImportError: If the 'gcsfs' package is not installed.
            ValueError: If none of the authentication methods succeeds.
        """
        try:
            import gcsfs
        except ImportError as e:
            raise ImportError("Package 'gcsfs' is required for gs:// URLs. Install with: pip install gcsfs") from e

        self.gcsfs = gcsfs
        self.fs = None
        # Always defined so storage_options() never hits a missing attribute.
        self.method = None
        self.token = None

        # 1. Try GOOGLE_APPLICATION_CREDENTIALS environment variable
        credentials_file = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        if credentials_file and os.path.exists(credentials_file):
            logger.debug("Authenticating to GCS using GOOGLE_APPLICATION_CREDENTIALS: {}", credentials_file)
            if self._connect(credentials_file, GcloudAuthMethod.SERVICE_ACCOUNT, "GOOGLE_APPLICATION_CREDENTIALS"):
                return

        # 2. Try ADC file (application_default_credentials.json)
        adc_path = self._get_adc_path()
        if adc_path and adc_path.exists():
            logger.debug("Authenticating to GCS using ADC file: {}", adc_path)
            if self._connect(str(adc_path), GcloudAuthMethod.ADC, "ADC file"):
                return

        # 3. Fallback to gcloud CLI login (gcloud auth print-access-token)
        logger.debug("Attempting to get access token from gcloud CLI")
        access_token = self._get_access_token_via_cli()
        if access_token:
            logger.debug("Authenticating to GCS using gcloud CLI access token")
            if self._connect(access_token, GcloudAuthMethod.GCLOUD_CLI, "gcloud CLI access token"):
                return

        # 4. Try google.auth.default() via token="google_default"
        logger.debug("Authenticating to GCS using google.auth.default()")
        if self._connect("google_default", GcloudAuthMethod.GOOGLE_DEFAULT, "google.auth.default()"):
            return

        raise ValueError(
            "Could not authenticate to Google Cloud Storage. "
            "Set GOOGLE_APPLICATION_CREDENTIALS, run 'gcloud auth application-default login', "
            "or run 'gcloud auth login'."
        )

    def _connect(self, token: str, method: GcloudAuthMethod, label: str) -> bool:
        """Create the filesystem with `token`; record method and token only when construction succeeds."""
        try:
            self.fs = self.gcsfs.GCSFileSystem(token=token)
        except Exception as e:
            logger.debug("{} authentication failed: {}", label, e)
            return False
        self.method = method
        self.token = token
        return True

    def _get_adc_path(self) -> Path | None:
        """Get the path to the Application Default Credentials file."""
        filename = "application_default_credentials.json"

        # Check CLOUDSDK_CONFIG for custom config directory
        config_dir = os.environ.get("CLOUDSDK_CONFIG")
        if config_dir:
            return Path(config_dir) / filename

        # Default locations
        if os.name == "nt":  # Windows
            appdata = os.environ.get("APPDATA")
            if appdata:
                return Path(appdata) / "gcloud" / filename
            return None

        # Linux/macOS
        return Path.home() / ".config" / "gcloud" / filename

    def _get_access_token_via_cli(self) -> str | None:
        """Get access token from gcloud CLI (gcloud auth print-access-token).

        Returns:
            The printed token, or None when the CLI is missing, times out,
            exits non-zero, or prints nothing.
        """
        import subprocess

        try:
            result = subprocess.run(
                ["gcloud", "auth", "print-access-token"],
                capture_output=True,
                text=True,
                timeout=30,
            )
        except (subprocess.TimeoutExpired, FileNotFoundError):
            logger.debug("gcloud CLI not available or timed out")
            return None

        token = result.stdout.strip()
        if result.returncode == 0 and token:
            return token
        return None

    def normalize_for_polars(self, url: str) -> str:
        """Normalize URL to a format Polars understands.

        Polars expects gs://bucket/path format.

        Returns:
            Normalized URL in gs://bucket/path format.
        """
        parsed = parse_url(url)
        return f"gs://{parsed.bucket}/{parsed.path}"

    def storage_options(self, url: str) -> dict[str, str]:
        """Return storage options for Polars GCS access.

        Returns:
            Dict with appropriate authentication options for GCS.
            Includes both gcsfs-style keys and Rust object_store keys for compatibility.
        """
        # Both file-based methods pass the credentials file path under every key.
        if self.method in (GcloudAuthMethod.SERVICE_ACCOUNT, GcloudAuthMethod.ADC):
            return {
                "token": self.token,
                "service_account": self.token,
                "google_service_account": self.token,
            }
        elif self.method == GcloudAuthMethod.GCLOUD_CLI:
            # For CLI token, we need to refresh it for Polars
            # since the token may have expired
            fresh_token = self._get_access_token_via_cli()
            return {"token": fresh_token if fresh_token else self.token}
        elif self.method == GcloudAuthMethod.GOOGLE_DEFAULT:
            return {
                "token": "google_default",
            }

    def _to_internal(self, url: str) -> str:
        """Convert URL to internal path for gcsfs operations."""
        parsed = parse_url(url)
        return f"{parsed.bucket}/{parsed.path}"

    def _to_uri(self, internal_path: str) -> str:
        """Convert internal path back to gs:// URL."""
        return f"gs://{internal_path}"

    def open(self, url: str) -> BinaryIO:
        """Open the object at `url` for binary reading."""
        return self.fs.open(self._to_internal(url), "rb")

    def list_files(self, url: str, extension: str) -> Iterator[FileInfo]:
        """Yield a FileInfo for every object under `url` whose name ends with `extension`, in sorted order."""
        # Drop trailing slashes so a bucket root ("bucket/") or a URL written
        # with a trailing "/" does not produce a "//" in the pattern.
        internal_path = self._to_internal(url).rstrip("/")
        pattern = f"{internal_path}/**/*{extension}"
        for path in sorted(self.fs.glob(pattern)):
            info = self.fs.info(path)
            yield FileInfo(url=self._to_uri(path), size=info["size"])

    def size(self, url: str) -> int:
        """Return the size reported by the filesystem for `url`."""
        return self.fs.size(self._to_internal(url))

    def is_directory(self, url: str) -> bool:
        """Return True when `url` is a directory or a prefix that has contents."""
        path = self._to_internal(url)
        try:
            info = self.fs.info(path)
            return info.get("type") == "directory"
        except FileNotFoundError:
            # No entry of its own: treat it as a directory if listing it yields anything.
            try:
                contents = self.fs.ls(path, detail=False)
                return len(contents) > 0
            except Exception:
                return False

    def __del__(self):
        # No explicit cleanup is performed here. The previous body only read
        # self.fs.session and then did nothing with it.
        pass
|
tab_cli/storage/local.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""Local filesystem storage backend."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from glob import glob
|
|
5
|
+
from typing import BinaryIO, Iterator
|
|
6
|
+
|
|
7
|
+
from tab_cli.storage.base import FileInfo, StorageBackend
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class LocalBackend(StorageBackend):
    """Storage backend that reads from the local filesystem."""

    def open(self, url: str) -> BinaryIO:
        """Open the local path `url` for binary reading."""
        return open(url, "rb")

    def list_files(self, url: str, extension: str) -> Iterator[FileInfo]:
        """Yield a FileInfo for each file under `url` ending with `extension`, in sorted order."""
        matches = glob(os.path.join(url, "**", f"*{extension}"), recursive=True)
        for match in sorted(matches):
            yield FileInfo(url=match, size=os.path.getsize(match))

    def size(self, url: str) -> int:
        """Return the size in bytes of the local file `url`."""
        return os.path.getsize(url)

    def is_directory(self, url: str) -> bool:
        """Return True when `url` is an existing directory."""
        return os.path.isdir(url)
|
tab_cli/style.py
ADDED
tab_cli/url_parser.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
from dataclasses import dataclass
|
|
2
|
+
import os
|
|
3
|
+
from urllib.parse import urlparse
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
@dataclass
|
|
7
|
+
class ParsedUrl:
|
|
8
|
+
"""Parsed URL components for cloud storage."""
|
|
9
|
+
|
|
10
|
+
scheme: str
|
|
11
|
+
"""URL scheme (e.g., 's3', 'gs', 'az', 'abfs', 'abfss', 'file', or '' for local)."""
|
|
12
|
+
|
|
13
|
+
bucket: str | None
|
|
14
|
+
"""Bucket name for s3:// and gs://, container for az://, abfs://, and abfss://."""
|
|
15
|
+
|
|
16
|
+
account: str | None
|
|
17
|
+
"""Storage account name for Azure (az://, abfs://, abfss://)."""
|
|
18
|
+
|
|
19
|
+
path: str
|
|
20
|
+
"""Path within the bucket/container, or full local path."""
|
|
21
|
+
|
|
22
|
+
original: str
|
|
23
|
+
"""Original URL string."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_url(url: str) -> ParsedUrl:
    """Parse a storage URL into its components.

    Supports:
    - Local paths: /path/to/file, ./relative/path, file:///path/to/file,
      and Windows drive paths such as C:\\data\\file.csv
    - S3: s3://bucket/path
    - GCS: gs://bucket/path
    - Azure Blob (az://):
        - With --az-url-authority-is-account: az://account/container/path
        - Without (default): az://container/path (account from AZURE_STORAGE_ACCOUNT)
    - Azure Data Lake (abfs://, abfss://):
        - abfs[s]://container@account.dfs.core.windows.net/path
        - abfs[s]://container/path (account from AZURE_STORAGE_ACCOUNT)

    Returns:
        ParsedUrl with scheme, bucket, account, and path components.

    Raises:
        ValueError: If the URL uses a scheme other than those listed above.
    """
    parsed = urlparse(url)
    scheme = parsed.scheme.lower() if parsed.scheme else ""

    # Local file paths
    if scheme == "file":
        # file:///path or file://localhost/path -> /path
        return ParsedUrl(scheme="file", bucket=None, account=None, path=parsed.path, original=url)
    if not scheme or len(scheme) == 1:
        # No scheme -> treat as local path. A one-letter "scheme" is a Windows
        # drive letter (urlparse reads "C:\\dir" as scheme "c"), so the whole
        # string is kept as the local path.
        return ParsedUrl(scheme="file", bucket=None, account=None, path=url, original=url)

    # S3/GCS: s3://bucket/path
    if scheme in {"s3", "gs"}:
        return ParsedUrl(
            scheme=scheme,
            bucket=parsed.netloc,
            account=None,
            path=parsed.path.lstrip("/"),
            original=url,
        )

    # Azure Blob: az://container/path or az://account/container/path
    if scheme == "az":
        # Imported here rather than at module level, as in the original code.
        from tab_cli import config

        if config.config.az_url_authority_is_account:
            # az://account/container/path or az:///container/path
            account = parsed.netloc if parsed.netloc else os.environ.get("AZURE_STORAGE_ACCOUNT")
            # Path is /container/path, first segment is container
            container, _, path = parsed.path.lstrip("/").partition("/")
        else:
            # az://container/path (default adlfs behavior)
            account = os.environ.get("AZURE_STORAGE_ACCOUNT")
            container = parsed.netloc
            path = parsed.path.lstrip("/")
        return ParsedUrl(scheme=scheme, bucket=container, account=account, path=path, original=url)

    # Azure Data Lake: abfs[s]://container@account.dfs.core.windows.net/path
    # or abfs[s]://container/path
    if scheme in ("abfs", "abfss"):
        netloc = parsed.netloc
        if "@" in netloc:
            # container@account.dfs.core.windows.net
            container, host = netloc.split("@", 1)
            # Extract account from host (account.dfs.core.windows.net)
            account = host.split(".", 1)[0]
        else:
            # container only, account from env
            container = netloc
            account = os.environ.get("AZURE_STORAGE_ACCOUNT")
        path = parsed.path.lstrip("/")
        return ParsedUrl(scheme=scheme, bucket=container, account=account, path=path, original=url)

    # Any other scheme is rejected.
    raise ValueError(f"Unsupported scheme: {scheme}")
|