FlowerPower 0.11.6.20-py3-none-any.whl → 0.21.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/__init__.py +2 -6
- flowerpower/cfg/__init__.py +7 -14
- flowerpower/cfg/base.py +29 -25
- flowerpower/cfg/pipeline/__init__.py +8 -6
- flowerpower/cfg/pipeline/_schedule.py +32 -0
- flowerpower/cfg/pipeline/adapter.py +0 -5
- flowerpower/cfg/pipeline/builder.py +377 -0
- flowerpower/cfg/pipeline/run.py +36 -0
- flowerpower/cfg/project/__init__.py +11 -24
- flowerpower/cfg/project/adapter.py +0 -12
- flowerpower/cli/__init__.py +2 -21
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/mqtt.py +0 -6
- flowerpower/cli/pipeline.py +22 -415
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +345 -146
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +21 -12
- flowerpower/pipeline/io.py +58 -54
- flowerpower/pipeline/manager.py +165 -726
- flowerpower/pipeline/pipeline.py +643 -0
- flowerpower/pipeline/registry.py +285 -18
- flowerpower/pipeline/visualizer.py +5 -6
- flowerpower/plugins/io/__init__.py +8 -0
- flowerpower/plugins/mqtt/__init__.py +7 -11
- flowerpower/settings/__init__.py +0 -2
- flowerpower/settings/{backend.py → _backend.py} +0 -21
- flowerpower/settings/logging.py +1 -1
- flowerpower/utils/logging.py +24 -12
- flowerpower/utils/misc.py +17 -256
- flowerpower/utils/monkey.py +1 -83
- flowerpower-0.21.0.dist-info/METADATA +463 -0
- flowerpower-0.21.0.dist-info/RECORD +44 -0
- flowerpower/cfg/pipeline/schedule.py +0 -74
- flowerpower/cfg/project/job_queue.py +0 -238
- flowerpower/cli/job_queue.py +0 -1061
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/__init__.py +0 -294
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/job_queue/base.py +0 -413
- flowerpower/job_queue/rq/__init__.py +0 -10
- flowerpower/job_queue/rq/_trigger.py +0 -37
- flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +0 -226
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -231
- flowerpower/job_queue/rq/manager.py +0 -1582
- flowerpower/job_queue/rq/setup.py +0 -154
- flowerpower/job_queue/rq/utils.py +0 -69
- flowerpower/mqtt.py +0 -12
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/plugins/mqtt/cfg.py +0 -17
- flowerpower/plugins/mqtt/manager.py +0 -962
- flowerpower/settings/job_queue.py +0 -87
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.20.dist-info/METADATA +0 -537
- flowerpower-0.11.6.20.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/top_level.txt +0 -0
flowerpower/fs/__init__.py
DELETED
@@ -1,29 +0,0 @@
```python
import importlib

has_orjson = importlib.util.find_spec("orjson") is not None
has_polars = importlib.util.find_spec("polars") is not None

if has_orjson and has_polars:
    from .ext import AbstractFileSystem
else:
    from fsspec import AbstractFileSystem

from .base import DirFileSystem, get_filesystem  # noqa: E402
from .storage_options import AwsStorageOptions  # noqa: E402
from .storage_options import AzureStorageOptions  # noqa: E402
from .storage_options import GcsStorageOptions  # noqa: E402
from .storage_options import (BaseStorageOptions, GitHubStorageOptions,
                              GitLabStorageOptions, StorageOptions)

__all__ = [
    "get_filesystem",
    "DirFileSystem",
    "AbstractFileSystem",
    "StorageOptions",
    "AwsStorageOptions",
    "AzureStorageOptions",
    "GcsStorageOptions",
    "GitHubStorageOptions",
    "GitLabStorageOptions",
    "BaseStorageOptions",
]
```
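The deleted `__init__.py` gated its fast-path import on optional dependencies via `importlib.util.find_spec`, which checks whether a package is installed without importing it. A minimal standalone sketch of that pattern (the `dumps` wrapper is illustrative only, not part of FlowerPower):

```python
import importlib.util
import json

# find_spec returns None when the package is not installed,
# so optional extras can be detected without importing them.
has_orjson = importlib.util.find_spec("orjson") is not None

if has_orjson:
    import orjson

    def dumps(obj) -> str:
        # orjson.dumps returns bytes; decode to match json.dumps.
        return orjson.dumps(obj).decode()
else:
    def dumps(obj) -> str:
        return json.dumps(obj)

print(dumps({"fast_path": has_orjson}))
```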
flowerpower/fs/base.py
DELETED
@@ -1,662 +0,0 @@
```python
import base64
import inspect
import os
import posixpath
import urllib
from pathlib import Path
from typing import Any

import fsspec
import requests
from fsspec import filesystem
from fsspec.implementations.cache_mapper import AbstractCacheMapper
from fsspec.implementations.cached import SimpleCacheFileSystem
from fsspec.implementations.dirfs import DirFileSystem
from fsspec.implementations.memory import MemoryFile
from fsspec.utils import infer_storage_options
from loguru import logger

from ..utils.logging import setup_logging
from . import has_orjson, has_polars

if has_orjson and has_polars:
    from .ext import AbstractFileSystem
else:
    from fsspec import AbstractFileSystem

from .storage_options import BaseStorageOptions
from .storage_options import from_dict as storage_options_from_dict

setup_logging()


class FileNameCacheMapper(AbstractCacheMapper):
    """Maps remote file paths to local cache paths while preserving directory structure.

    This cache mapper maintains the original file path structure in the cache directory,
    creating necessary subdirectories as needed.

    Attributes:
        directory (str): Base directory for cached files

    Example:
        >>> # Create cache mapper for S3 files
        >>> mapper = FileNameCacheMapper("/tmp/cache")
        >>>
        >>> # Map remote path to cache path
        >>> cache_path = mapper("bucket/data/file.csv")
        >>> print(cache_path)  # Preserves structure
        'bucket/data/file.csv'
    """

    def __init__(self, directory: str):
        """Initialize cache mapper with base directory.

        Args:
            directory: Base directory where cached files will be stored
        """
        self.directory = directory

    def __call__(self, path: str) -> str:
        """Map remote file path to cache file path.

        Creates necessary subdirectories in the cache directory to maintain
        the original path structure.

        Args:
            path: Original file path from remote filesystem

        Returns:
            str: Cache file path that preserves original structure

        Example:
            >>> mapper = FileNameCacheMapper("/tmp/cache")
            >>> # Maps maintain directory structure
            >>> print(mapper("data/nested/file.txt"))
            'data/nested/file.txt'
        """
        os.makedirs(
            posixpath.dirname(posixpath.join(self.directory, path)), exist_ok=True
        )
        return path


class MonitoredSimpleCacheFileSystem(SimpleCacheFileSystem):
    """Enhanced caching filesystem with monitoring and improved path handling.

    This filesystem extends SimpleCacheFileSystem to provide:
    - Verbose logging of cache operations
    - Improved path mapping for cache files
    - Enhanced synchronization capabilities
    - Better handling of parallel operations

    Attributes:
        _verbose (bool): Whether to print verbose cache operations
        _mapper (FileNameCacheMapper): Maps remote paths to cache paths
        storage (list[str]): List of cache storage locations
        fs (AbstractFileSystem): Underlying filesystem being cached

    Example:
        >>> from fsspec import filesystem
        >>> # Create monitored cache for S3
        >>> s3 = filesystem("s3", key="ACCESS_KEY", secret="SECRET_KEY")
        >>> cached_fs = MonitoredSimpleCacheFileSystem(
        ...     fs=s3,
        ...     cache_storage="/tmp/s3_cache",
        ...     verbose=True
        ... )
        >>>
        >>> # Read file (downloads and caches)
        >>> with cached_fs.open("bucket/data.csv") as f:
        ...     data = f.read()
        Downloading s3://bucket/data.csv
        >>>
        >>> # Second read uses cache
        >>> with cached_fs.open("bucket/data.csv") as f:
        ...     data = f.read()  # No download message
    """

    def __init__(self, **kwargs: Any):
        """Initialize monitored cache filesystem.

        Args:
            **kwargs: Configuration options including:
                fs (AbstractFileSystem): Filesystem to cache
                cache_storage (str): Cache directory path
                verbose (bool): Enable verbose logging
                And any other SimpleCacheFileSystem options

        Example:
            >>> # Cache with custom settings
            >>> cached_fs = MonitoredSimpleCacheFileSystem(
            ...     fs=remote_fs,
            ...     cache_storage="/tmp/cache",
            ...     verbose=True,
            ...     same_names=True  # Use original filenames
            ... )
        """
        self._verbose = kwargs.get("verbose", False)
        super().__init__(**kwargs)
        self._mapper = FileNameCacheMapper(kwargs.get("cache_storage"))

    def _check_file(self, path: str) -> str | None:
        """Check if file exists in cache and download if needed.

        Args:
            path: Path to file in the remote filesystem

        Returns:
            str | None: Path to cached file if found/downloaded, None otherwise

        Example:
            >>> fs = MonitoredSimpleCacheFileSystem(
            ...     fs=remote_fs,
            ...     cache_storage="/tmp/cache"
            ... )
            >>> cached_path = fs._check_file("data.csv")
            >>> print(cached_path)
            '/tmp/cache/data.csv'
        """
        self._check_cache()
        cache_path = self._mapper(path)
        for storage in self.storage:
            fn = posixpath.join(storage, cache_path)
            if posixpath.exists(fn):
                return fn
        if self._verbose:
            logger.info(f"Downloading {self.protocol[0]}://{path}")

    def size(self, path: str) -> int:
        """Get size of file in bytes.

        Checks cache first, falls back to remote filesystem.

        Args:
            path: Path to file

        Returns:
            int: Size of file in bytes

        Example:
            >>> fs = MonitoredSimpleCacheFileSystem(
            ...     fs=remote_fs,
            ...     cache_storage="/tmp/cache"
            ... )
            >>> size = fs.size("large_file.dat")
            >>> print(f"File size: {size} bytes")
        """
        cached_file = self._check_file(self._strip_protocol(path))
        if cached_file is None:
            return self.fs.size(path)
        else:
            return posixpath.getsize(cached_file)

    def sync_cache(self, reload: bool = False) -> None:
        """Synchronize cache with remote filesystem.

        Downloads all files in remote path to cache if not present.

        Args:
            reload: Whether to force reload all files, ignoring existing cache

        Example:
            >>> fs = MonitoredSimpleCacheFileSystem(
            ...     fs=remote_fs,
            ...     cache_storage="/tmp/cache"
            ... )
            >>> # Initial sync
            >>> fs.sync_cache()
            >>>
            >>> # Force reload all files
            >>> fs.sync_cache(reload=True)
        """
        if reload:
            self.clear_cache()
        content = self.glob("**/*")
        [self.open(f).close() for f in content if self.isfile(f)]

    def __getattribute__(self, item):
        if item in {
            # new items
            "size",
            "glob",
            "sync_cache",
            # previous
            "load_cache",
            "_open",
            "save_cache",
            "close_and_update",
            "__init__",
            "__getattribute__",
            "__reduce__",
            "_make_local_details",
            "open",
            "cat",
            "cat_file",
            "cat_ranges",
            "get",
            "read_block",
            "tail",
            "head",
            "info",
            "ls",
            "exists",
            "isfile",
            "isdir",
            "_check_file",
            "_check_cache",
            "_mkcache",
            "clear_cache",
            "clear_expired_cache",
            "pop_from_cache",
            "local_file",
            "_paths_from_path",
            "get_mapper",
            "open_many",
            "commit_many",
            "hash_name",
            "__hash__",
            "__eq__",
            "to_json",
            "to_dict",
            "cache_size",
            "pipe_file",
            "pipe",
            "start_transaction",
            "end_transaction",
        }:
            # all the methods defined in this class. Note `open` here, since
            # it calls `_open`, but is actually in superclass
            return lambda *args, **kw: getattr(type(self), item).__get__(self)(
                *args, **kw
            )
        if item in ["__reduce_ex__"]:
            raise AttributeError
        if item in ["transaction"]:
            # property
            return type(self).transaction.__get__(self)
        if item in ["_cache", "transaction_type"]:
            # class attributes
            return getattr(type(self), item)
        if item == "__class__":
            return type(self)
        d = object.__getattribute__(self, "__dict__")
        fs = d.get("fs", None)  # fs is not immediately defined
        if item in d:
            return d[item]
        elif fs is not None:
            if item in fs.__dict__:
                # attribute of instance
                return fs.__dict__[item]
            # attribute belonging to the target filesystem
            cls = type(fs)
            m = getattr(cls, item)
            if (inspect.isfunction(m) or inspect.isdatadescriptor(m)) and (
                not hasattr(m, "__self__") or m.__self__ is None
            ):
                # instance method
                return m.__get__(fs, cls)
            return m  # class method or attribute
        else:
            # attributes of the superclass, while target is being set up
            return super().__getattribute__(item)


class GitLabFileSystem(AbstractFileSystem):
    """FSSpec-compatible filesystem interface for GitLab repositories.

    Provides access to files in GitLab repositories through the GitLab API,
    supporting read operations with authentication.

    Attributes:
        project_name (str): Name of the GitLab project
        project_id (str): ID of the GitLab project
        access_token (str): GitLab personal access token
        branch (str): Git branch to read from
        base_url (str): GitLab instance URL

    Example:
        >>> # Access public project
        >>> fs = GitLabFileSystem(
        ...     project_name="my-project",
        ...     access_token="glpat-xxxx"
        ... )
        >>>
        >>> # Read file contents
        >>> with fs.open("path/to/file.txt") as f:
        ...     content = f.read()
        >>>
        >>> # List directory
        >>> files = fs.ls("path/to/dir")
        >>>
        >>> # Access enterprise GitLab
        >>> fs = GitLabFileSystem(
        ...     project_id="12345",
        ...     access_token="glpat-xxxx",
        ...     base_url="https://gitlab.company.com",
        ...     branch="develop"
        ... )
    """

    def __init__(
        self,
        project_name: str | None = None,
        project_id: str | None = None,
        access_token: str | None = None,
        branch: str = "main",
        base_url: str = "https://gitlab.com",
        **kwargs,
    ):
        """Initialize GitLab filesystem.

        Args:
            project_name: Name of the GitLab project. Required if project_id not provided.
            project_id: ID of the GitLab project. Required if project_name not provided.
            access_token: GitLab personal access token for authentication.
                Required for private repositories.
            branch: Git branch to read from. Defaults to "main".
            base_url: GitLab instance URL. Defaults to "https://gitlab.com".
            **kwargs: Additional arguments passed to AbstractFileSystem.

        Raises:
            ValueError: If neither project_name nor project_id is provided
            requests.RequestException: If GitLab API request fails
        """
        super().__init__(**kwargs)
        self.project_name = project_name
        self.project_id = project_id
        self.access_token = access_token
        self.branch = branch
        self.base_url = base_url.rstrip("/")
        self._validate_init()
        if not self.project_id:
            self.project_id = self._get_project_id()

    def _validate_init(self) -> None:
        """Validate initialization parameters.

        Ensures that either project_id or project_name is provided.

        Raises:
            ValueError: If neither project_id nor project_name is provided
        """
        if not self.project_id and not self.project_name:
            raise ValueError("Either 'project_id' or 'project_name' must be provided")

    def _get_project_id(self) -> str:
        """Retrieve project ID from GitLab API using project name.

        Makes an API request to search for projects and find the matching project ID.

        Returns:
            str: The GitLab project ID

        Raises:
            ValueError: If project not found
            requests.RequestException: If API request fails
        """
        url = f"{self.base_url}/api/v4/projects"
        headers = {"PRIVATE-TOKEN": self.access_token}
        params = {"search": self.project_name}
        response = requests.get(url, headers=headers, params=params)

        if response.status_code == 200:
            projects = response.json()
            for project in projects:
                if project["name"] == self.project_name:
                    return project["id"]
            raise ValueError(f"Project '{self.project_name}' not found")
        else:
            response.raise_for_status()

    def _open(self, path: str, mode: str = "rb", **kwargs) -> MemoryFile:
        """Open a file from GitLab repository.

        Retrieves file content from GitLab API and returns it as a memory file.

        Args:
            path: Path to file within repository
            mode: File open mode. Only "rb" (read binary) is supported.
            **kwargs: Additional arguments (unused)

        Returns:
            MemoryFile: File-like object containing file content

        Raises:
            NotImplementedError: If mode is not "rb"
            requests.RequestException: If API request fails

        Example:
            >>> fs = GitLabFileSystem(project_id="12345", access_token="glpat-xxxx")
            >>> with fs.open("README.md") as f:
            ...     content = f.read()
            ...     print(content.decode())
        """
        if mode != "rb":
            raise NotImplementedError("Only read mode is supported")

        url = (
            f"{self.base_url}/api/v4/projects/{self.project_id}/repository/files/"
            f"{urllib.parse.quote_plus(path)}?ref={self.branch}"
        )
        headers = {"PRIVATE-TOKEN": self.access_token}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            file_content = base64.b64decode(response.json()["content"])
            return MemoryFile(None, None, file_content)
        else:
            response.raise_for_status()

    def _ls(self, path: str, detail: bool = False, **kwargs) -> list[str] | list[dict]:
        """List contents of a directory in GitLab repository.

        Args:
            path: Directory path within repository
            detail: Whether to return detailed information about each entry.
                If True, returns list of dicts with file metadata.
                If False, returns list of filenames.
            **kwargs: Additional arguments (unused)

        Returns:
            list[str] | list[dict]: List of file/directory names or detailed info

        Raises:
            requests.RequestException: If API request fails

        Example:
            >>> fs = GitLabFileSystem(project_id="12345", access_token="glpat-xxxx")
            >>> # List filenames
            >>> files = fs.ls("docs")
            >>> print(files)
            ['README.md', 'API.md']
            >>>
            >>> # List with details
            >>> details = fs.ls("docs", detail=True)
            >>> for item in details:
            ...     print(f"{item['name']}: {item['type']}")
        """
        url = f"{self.base_url}/api/v4/projects/{self.project_id}/repository/tree?path={path}&ref={self.branch}"
        headers = {"PRIVATE-TOKEN": self.access_token}
        response = requests.get(url, headers=headers)

        if response.status_code == 200:
            files = response.json()
            if detail:
                return files
            else:
                return [file["name"] for file in files]
        else:
            response.raise_for_status()


try:
    fsspec.register_implementation("gitlab", GitLabFileSystem)
except ValueError as e:
    _ = e


# Save the original ls methods
dirfs_ls_o = DirFileSystem.ls
mscf_ls_o = MonitoredSimpleCacheFileSystem.ls


# Define the new ls methods
def dir_ls_p(self, path, detail=False, **kwargs):
    return dirfs_ls_o(self, path, detail=detail, **kwargs)


def mscf_ls_p(self, path, detail=False, **kwargs):
    return mscf_ls_o(self, path, detail=detail, **kwargs)


# Patch them in
DirFileSystem.ls = dir_ls_p
MonitoredSimpleCacheFileSystem.ls = mscf_ls_p


def get_filesystem(
    path: str | Path | None = None,
    storage_options: BaseStorageOptions | dict[str, str] | None = None,
    dirfs: bool = True,
    cached: bool = False,
    cache_storage: str | None = None,
    fs: AbstractFileSystem | None = None,
    **storage_options_kwargs,
) -> AbstractFileSystem:
    """Get a filesystem instance based on path or configuration.

    This function creates and configures a filesystem instance based on the provided path
    and options. It supports various filesystem types including local, S3, GCS, Azure,
    and Git-based filesystems.

    Args:
        path: URI or path to the filesystem location. Examples:
            - Local: "/path/to/data"
            - S3: "s3://bucket/path"
            - GCS: "gs://bucket/path"
            - Azure: "abfs://container/path"
            - GitHub: "github://org/repo/path"
        storage_options: Configuration options for the filesystem. Can be:
            - BaseStorageOptions object with protocol-specific settings
            - Dictionary of key-value pairs for authentication/configuration
            - None to use environment variables or default credentials
        dirfs: Whether to wrap filesystem in DirFileSystem for path-based operations.
            Set to False when you need direct protocol-specific features.
        cached: Whether to enable local caching of remote files.
            Useful for frequently accessed remote files.
        cache_storage: Directory path for cached files. Defaults to path-based location
            in current directory if not specified.
        fs: Existing filesystem instance to wrap with caching or dirfs.
            Use this to customize an existing filesystem instance.
        **storage_options_kwargs: Additional keyword arguments for storage options.
            Alternative to passing storage_options dictionary.

    Returns:
        AbstractFileSystem: Configured filesystem instance with requested features.

    Raises:
        ValueError: If storage protocol or options are invalid
        FSSpecError: If filesystem initialization fails
        ImportError: If required filesystem backend is not installed

    Example:
        >>> # Local filesystem
        >>> fs = get_filesystem("/path/to/data")
        >>>
        >>> # S3 with credentials
        >>> fs = get_filesystem(
        ...     "s3://bucket/data",
        ...     storage_options={
        ...         "key": "ACCESS_KEY",
        ...         "secret": "SECRET_KEY"
        ...     }
        ... )
        >>>
        >>> # Cached GCS filesystem
        >>> fs = get_filesystem(
        ...     "gs://bucket/data",
        ...     storage_options=GcsStorageOptions(
        ...         token="service_account.json"
        ...     ),
        ...     cached=True,
        ...     cache_storage="/tmp/gcs_cache"
        ... )
        >>>
        >>> # Azure with environment credentials
        >>> fs = get_filesystem(
        ...     "abfs://container/data",
        ...     storage_options=AzureStorageOptions.from_env()
        ... )
        >>>
        >>> # Wrap existing filesystem
        >>> base_fs = filesystem("s3", key="ACCESS", secret="SECRET")
        >>> cached_fs = get_filesystem(
        ...     fs=base_fs,
        ...     cached=True
        ... )
    """
    if fs is not None:
        if dirfs:
            base_path = path.split("://")[-1]
            if fs.protocol == "dir":
                if base_path != fs.path:
                    fs = DirFileSystem(
                        path=posixpath.join(
                            fs.path, base_path.replace(fs.path, "").lstrip("/")
                        ),
                        fs=fs.fs,
                    )
            else:
                fs = DirFileSystem(path=base_path, fs=fs)
        if cached:
            if fs.is_cache_fs:
                return fs
            fs = MonitoredSimpleCacheFileSystem(fs=fs, cache_storage=cache_storage)

        return fs

    pp = infer_storage_options(str(path) if isinstance(path, Path) else path)
    protocol = (
        storage_options_kwargs.get("protocol", None)
        or (
            storage_options.get("protocol", None)
            if isinstance(storage_options, dict)
            else getattr(storage_options, "protocol", None)
        )
        or pp.get("protocol", "file")
    )

    if protocol == "file" or protocol == "local":
        fs = filesystem(protocol)
        fs.is_cache_fs = False
        if dirfs:
            fs = DirFileSystem(path=path, fs=fs)
            fs.is_cache_fs = False
        return fs

    host = pp.get("host", "")
    path = pp.get("path", "").lstrip("/")
    if len(host) and host not in path:
        path = posixpath.join(host, path)
    if "." in path:
        path = posixpath.dirname(path)

    if isinstance(storage_options, dict):
        storage_options = storage_options_from_dict(protocol, storage_options)

    if storage_options is None:
        storage_options = storage_options_from_dict(protocol, storage_options_kwargs)

    fs = storage_options.to_filesystem()
    fs.is_cache_fs = False
    if dirfs and len(path):
        fs = DirFileSystem(path=path, fs=fs)
        fs.is_cache_fs = False
    if cached:
        if cache_storage is None:
            cache_storage = (Path.cwd() / path).as_posix()
        fs = MonitoredSimpleCacheFileSystem(fs=fs, cache_storage=cache_storage)
        fs.is_cache_fs = True

    return fs
```
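Since both `flowerpower/fs` modules are removed in 0.21.0, the docstrings above are the last record of the old entry point. A minimal usage sketch of the deleted `get_filesystem` helper, assembled from those docstring examples (bucket name, credentials, and paths are placeholders), valid only against the 0.11.x API:

```python
from flowerpower.fs import get_filesystem  # removed in 0.21.0

# Local directory, wrapped in a DirFileSystem rooted at the given path.
fs = get_filesystem("/path/to/data")
print(fs.ls(""))

# Remote S3 data with a local cache; repeat reads are served from disk.
cached = get_filesystem(
    "s3://my-bucket/data",
    storage_options={"key": "ACCESS_KEY", "secret": "SECRET_KEY"},
    cached=True,
    cache_storage="/tmp/s3_cache",
)
with cached.open("file.csv") as f:
    data = f.read()
```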