FlowerPower 0.9.13.1__py3-none-any.whl → 1.0.0b2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/__init__.py +17 -2
- flowerpower/cfg/__init__.py +201 -149
- flowerpower/cfg/base.py +122 -24
- flowerpower/cfg/pipeline/__init__.py +254 -0
- flowerpower/cfg/pipeline/adapter.py +66 -0
- flowerpower/cfg/pipeline/run.py +40 -11
- flowerpower/cfg/pipeline/schedule.py +69 -79
- flowerpower/cfg/project/__init__.py +149 -0
- flowerpower/cfg/project/adapter.py +57 -0
- flowerpower/cfg/project/job_queue.py +165 -0
- flowerpower/cli/__init__.py +92 -37
- flowerpower/cli/job_queue.py +878 -0
- flowerpower/cli/mqtt.py +32 -1
- flowerpower/cli/pipeline.py +559 -406
- flowerpower/cli/utils.py +29 -18
- flowerpower/flowerpower.py +12 -8
- flowerpower/fs/__init__.py +20 -2
- flowerpower/fs/base.py +350 -26
- flowerpower/fs/ext.py +797 -216
- flowerpower/fs/storage_options.py +1097 -55
- flowerpower/io/base.py +13 -18
- flowerpower/io/loader/__init__.py +28 -0
- flowerpower/io/loader/deltatable.py +7 -10
- flowerpower/io/metadata.py +1 -0
- flowerpower/io/saver/__init__.py +28 -0
- flowerpower/io/saver/deltatable.py +4 -3
- flowerpower/job_queue/__init__.py +252 -0
- flowerpower/job_queue/apscheduler/__init__.py +11 -0
- flowerpower/job_queue/apscheduler/_setup/datastore.py +110 -0
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +93 -0
- flowerpower/job_queue/apscheduler/manager.py +1063 -0
- flowerpower/job_queue/apscheduler/setup.py +524 -0
- flowerpower/job_queue/apscheduler/trigger.py +169 -0
- flowerpower/job_queue/apscheduler/utils.py +309 -0
- flowerpower/job_queue/base.py +382 -0
- flowerpower/job_queue/rq/__init__.py +10 -0
- flowerpower/job_queue/rq/_trigger.py +37 -0
- flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +226 -0
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +231 -0
- flowerpower/job_queue/rq/manager.py +1449 -0
- flowerpower/job_queue/rq/setup.py +150 -0
- flowerpower/job_queue/rq/utils.py +69 -0
- flowerpower/pipeline/__init__.py +5 -0
- flowerpower/pipeline/base.py +118 -0
- flowerpower/pipeline/io.py +407 -0
- flowerpower/pipeline/job_queue.py +505 -0
- flowerpower/pipeline/manager.py +1586 -0
- flowerpower/pipeline/registry.py +560 -0
- flowerpower/pipeline/runner.py +560 -0
- flowerpower/pipeline/visualizer.py +142 -0
- flowerpower/plugins/mqtt/__init__.py +12 -0
- flowerpower/plugins/mqtt/cfg.py +16 -0
- flowerpower/plugins/mqtt/manager.py +789 -0
- flowerpower/settings.py +110 -0
- flowerpower/utils/logging.py +21 -0
- flowerpower/utils/misc.py +57 -9
- flowerpower/utils/sql.py +122 -24
- flowerpower/utils/templates.py +2 -142
- flowerpower-1.0.0b2.dist-info/METADATA +324 -0
- flowerpower-1.0.0b2.dist-info/RECORD +94 -0
- flowerpower/_web/__init__.py +0 -61
- flowerpower/_web/routes/config.py +0 -103
- flowerpower/_web/routes/pipelines.py +0 -173
- flowerpower/_web/routes/scheduler.py +0 -136
- flowerpower/cfg/pipeline/tracker.py +0 -14
- flowerpower/cfg/project/open_telemetry.py +0 -8
- flowerpower/cfg/project/tracker.py +0 -11
- flowerpower/cfg/project/worker.py +0 -19
- flowerpower/cli/scheduler.py +0 -309
- flowerpower/cli/web.py +0 -44
- flowerpower/event_handler.py +0 -23
- flowerpower/mqtt.py +0 -609
- flowerpower/pipeline.py +0 -2499
- flowerpower/scheduler.py +0 -680
- flowerpower/tui.py +0 -79
- flowerpower/utils/datastore.py +0 -186
- flowerpower/utils/eventbroker.py +0 -127
- flowerpower/utils/executor.py +0 -58
- flowerpower/utils/trigger.py +0 -140
- flowerpower-0.9.13.1.dist-info/METADATA +0 -586
- flowerpower-0.9.13.1.dist-info/RECORD +0 -76
- /flowerpower/{cfg/pipeline/params.py → cli/worker.py} +0 -0
- {flowerpower-0.9.13.1.dist-info → flowerpower-1.0.0b2.dist-info}/WHEEL +0 -0
- {flowerpower-0.9.13.1.dist-info → flowerpower-1.0.0b2.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.9.13.1.dist-info → flowerpower-1.0.0b2.dist-info}/top_level.txt +0 -0
flowerpower/cli/utils.py
CHANGED
@@ -1,15 +1,20 @@
|
|
1
1
|
import ast
|
2
|
+
import importlib
|
2
3
|
import json
|
3
|
-
|
4
|
+
import posixpath
|
4
5
|
import re
|
5
|
-
import
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
6
8
|
from typing import Callable
|
9
|
+
|
7
10
|
from loguru import logger
|
8
|
-
import sys
|
9
|
-
import posixpath
|
10
11
|
|
11
12
|
from flowerpower.pipeline import PipelineManager
|
12
13
|
|
14
|
+
from ..utils.logging import setup_logging
|
15
|
+
|
16
|
+
setup_logging()
|
17
|
+
|
13
18
|
|
14
19
|
# Parse additional parameters
|
15
20
|
def parse_param_dict(param_str: str | None) -> dict:
|
@@ -99,12 +104,13 @@ def parse_dict_or_list_param(
|
|
99
104
|
logger.warning(f"Could not parse {param_type} parameter: {value}")
|
100
105
|
return None
|
101
106
|
|
107
|
+
|
102
108
|
def load_hook(
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
109
|
+
pipeline_name: str,
|
110
|
+
function_path: str,
|
111
|
+
base_dir=None,
|
112
|
+
storage_options: str | None = None,
|
113
|
+
) -> Callable:
|
108
114
|
"""
|
109
115
|
Load a hook function from a specified path.
|
110
116
|
This function dynamically imports the module and retrieves the function
|
@@ -118,21 +124,26 @@ def load_hook(
|
|
118
124
|
Returns:
|
119
125
|
Callable: The loaded hook function
|
120
126
|
"""
|
121
|
-
with PipelineManager(
|
122
|
-
|
123
|
-
) as pm:
|
124
|
-
path_segments = function_path.rsplit('.', 2)
|
127
|
+
with PipelineManager(storage_options=storage_options, base_dir=base_dir) as pm:
|
128
|
+
path_segments = function_path.rsplit(".", 2)
|
125
129
|
if len(path_segments) == 2:
|
126
130
|
# If the function path is in the format 'module_name.function_name'
|
127
131
|
module_name, function_name = path_segments
|
128
|
-
module_path =
|
132
|
+
module_path = ""
|
129
133
|
elif len(path_segments) == 3:
|
130
134
|
# If the function path is in the format 'package.[subpackage.]module_name.function_name'
|
131
135
|
module_path, module_name, function_name = path_segments
|
132
136
|
|
133
|
-
|
134
|
-
|
135
|
-
|
137
|
+
logger.debug(
|
138
|
+
posixpath.join(
|
139
|
+
pm._fs.path, "hooks", pipeline_name, module_path.replace(".", "/")
|
140
|
+
)
|
141
|
+
)
|
142
|
+
sys.path.append(
|
143
|
+
posixpath.join(
|
144
|
+
pm._fs.path, "hooks", pipeline_name, module_path.replace(".", "/")
|
145
|
+
)
|
146
|
+
)
|
136
147
|
hook_module = importlib.import_module(module_name)
|
137
148
|
hook_function = getattr(hook_module, function_name)
|
138
|
-
return hook_function
|
149
|
+
return hook_function
|
flowerpower/flowerpower.py
CHANGED
@@ -6,15 +6,19 @@ from pathlib import Path
|
|
6
6
|
import rich
|
7
7
|
from fsspec.spec import AbstractFileSystem
|
8
8
|
|
9
|
-
from .cfg import
|
9
|
+
from .cfg import ProjectConfig
|
10
10
|
from .fs import get_filesystem
|
11
|
-
|
11
|
+
from . import settings
|
12
12
|
|
13
13
|
def init(
|
14
14
|
name: str | None = None,
|
15
15
|
base_dir: str | None = None,
|
16
16
|
storage_options: dict = {},
|
17
17
|
fs: AbstractFileSystem | None = None,
|
18
|
+
job_queue_type: str = settings.DEFAULT_JOB_QUEUE,
|
19
|
+
cfg_dir: str = settings.CONFIG_DIR,
|
20
|
+
pipelines_dir: str = settings.PIPELINES_DIR,
|
21
|
+
hooks_dir: str = settings.HOOKS_DIR,
|
18
22
|
):
|
19
23
|
if name is None:
|
20
24
|
name = str(Path.cwd().name)
|
@@ -25,11 +29,11 @@ def init(
|
|
25
29
|
|
26
30
|
fs = get_filesystem(posixpath.join(base_dir, name), **storage_options)
|
27
31
|
|
28
|
-
fs.makedirs("
|
29
|
-
fs.makedirs(
|
30
|
-
fs.makedirs(
|
32
|
+
fs.makedirs(f"{cfg_dir}/pipelines", exist_ok=True)
|
33
|
+
fs.makedirs(pipelines_dir, exist_ok=True)
|
34
|
+
fs.makedirs(hooks_dir, exist_ok=True)
|
31
35
|
|
32
|
-
cfg =
|
36
|
+
cfg = ProjectConfig.load(base_dir=posixpath.join(base_dir, name), name=name, job_queue_type=job_queue_type)
|
33
37
|
|
34
38
|
with open(posixpath.join(base_dir, name, "README.md"), "w") as f:
|
35
39
|
f.write(
|
@@ -55,9 +59,9 @@ def init(
|
|
55
59
|
[dim]More options:[/dim]
|
56
60
|
[blue underline]https://docs.astral.sh/uv/getting-started/installation/[/blue underline]
|
57
61
|
|
58
|
-
🚀 Initialize your project:
|
62
|
+
🚀 Initialize uv in your flowerpower project:
|
59
63
|
[dim]Run the following in your project directory:[/dim]
|
60
|
-
[bold white]uv init --
|
64
|
+
[bold white]uv init --bare --no-readme[/bold white]
|
61
65
|
"""
|
62
66
|
)
|
63
67
|
|
flowerpower/fs/__init__.py
CHANGED
@@ -1,10 +1,28 @@
|
|
1
1
|
import importlib
|
2
2
|
|
3
3
|
has_orjson = importlib.util.find_spec("orjson") is not None
|
4
|
+
has_polars = importlib.util.find_spec("polars") is not None
|
4
5
|
|
5
|
-
if has_orjson:
|
6
|
+
if has_orjson and has_polars:
|
6
7
|
from .ext import AbstractFileSystem
|
7
8
|
else:
|
8
9
|
from fsspec import AbstractFileSystem
|
9
10
|
|
10
|
-
from .base import get_filesystem
|
11
|
+
from .base import get_filesystem # noqa: E402
|
12
|
+
from .storage_options import AwsStorageOptions # noqa: E402
|
13
|
+
from .storage_options import AzureStorageOptions # noqa: E402
|
14
|
+
from .storage_options import (BaseStorageOptions, GcsStorageOptions,
|
15
|
+
GitHubStorageOptions, GitLabStorageOptions,
|
16
|
+
StorageOptions)
|
17
|
+
|
18
|
+
__all__ = [
|
19
|
+
"get_filesystem",
|
20
|
+
"AbstractFileSystem",
|
21
|
+
"StorageOptions",
|
22
|
+
"AwsStorageOptions",
|
23
|
+
"AzureStorageOptions",
|
24
|
+
"GcsStorageOptions",
|
25
|
+
"GitHubStorageOptions",
|
26
|
+
"GitLabStorageOptions",
|
27
|
+
"BaseStorageOptions",
|
28
|
+
]
|
flowerpower/fs/base.py
CHANGED
@@ -4,6 +4,7 @@ import os
|
|
4
4
|
import posixpath
|
5
5
|
import urllib
|
6
6
|
from pathlib import Path
|
7
|
+
from typing import Any
|
7
8
|
|
8
9
|
import fsspec
|
9
10
|
import requests
|
@@ -15,16 +16,65 @@ from fsspec.implementations.memory import MemoryFile
|
|
15
16
|
from fsspec.utils import infer_storage_options
|
16
17
|
from loguru import logger
|
17
18
|
|
18
|
-
from .
|
19
|
+
from ..utils.logging import setup_logging
|
20
|
+
from . import has_orjson, has_polars
|
21
|
+
|
22
|
+
if has_orjson and has_polars:
|
23
|
+
from .ext import AbstractFileSystem
|
24
|
+
else:
|
25
|
+
from fsspec import AbstractFileSystem
|
26
|
+
|
19
27
|
from .storage_options import BaseStorageOptions
|
20
28
|
from .storage_options import from_dict as storage_options_from_dict
|
21
29
|
|
30
|
+
setup_logging()
|
31
|
+
|
22
32
|
|
23
33
|
class FileNameCacheMapper(AbstractCacheMapper):
|
24
|
-
|
34
|
+
"""Maps remote file paths to local cache paths while preserving directory structure.
|
35
|
+
|
36
|
+
This cache mapper maintains the original file path structure in the cache directory,
|
37
|
+
creating necessary subdirectories as needed.
|
38
|
+
|
39
|
+
Attributes:
|
40
|
+
directory (str): Base directory for cached files
|
41
|
+
|
42
|
+
Example:
|
43
|
+
>>> # Create cache mapper for S3 files
|
44
|
+
>>> mapper = FileNameCacheMapper("/tmp/cache")
|
45
|
+
>>>
|
46
|
+
>>> # Map remote path to cache path
|
47
|
+
>>> cache_path = mapper("bucket/data/file.csv")
|
48
|
+
>>> print(cache_path) # Preserves structure
|
49
|
+
'bucket/data/file.csv'
|
50
|
+
"""
|
51
|
+
|
52
|
+
def __init__(self, directory: str):
|
53
|
+
"""Initialize cache mapper with base directory.
|
54
|
+
|
55
|
+
Args:
|
56
|
+
directory: Base directory where cached files will be stored
|
57
|
+
"""
|
25
58
|
self.directory = directory
|
26
59
|
|
27
60
|
def __call__(self, path: str) -> str:
|
61
|
+
"""Map remote file path to cache file path.
|
62
|
+
|
63
|
+
Creates necessary subdirectories in the cache directory to maintain
|
64
|
+
the original path structure.
|
65
|
+
|
66
|
+
Args:
|
67
|
+
path: Original file path from remote filesystem
|
68
|
+
|
69
|
+
Returns:
|
70
|
+
str: Cache file path that preserves original structure
|
71
|
+
|
72
|
+
Example:
|
73
|
+
>>> mapper = FileNameCacheMapper("/tmp/cache")
|
74
|
+
>>> # Maps maintain directory structure
|
75
|
+
>>> print(mapper("data/nested/file.txt"))
|
76
|
+
'data/nested/file.txt'
|
77
|
+
"""
|
28
78
|
os.makedirs(
|
29
79
|
posixpath.dirname(posixpath.join(self.directory, path)), exist_ok=True
|
30
80
|
)
|
@@ -32,15 +82,81 @@ class FileNameCacheMapper(AbstractCacheMapper):
|
|
32
82
|
|
33
83
|
|
34
84
|
class MonitoredSimpleCacheFileSystem(SimpleCacheFileSystem):
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
85
|
+
"""Enhanced caching filesystem with monitoring and improved path handling.
|
86
|
+
|
87
|
+
This filesystem extends SimpleCacheFileSystem to provide:
|
88
|
+
- Verbose logging of cache operations
|
89
|
+
- Improved path mapping for cache files
|
90
|
+
- Enhanced synchronization capabilities
|
91
|
+
- Better handling of parallel operations
|
92
|
+
|
93
|
+
Attributes:
|
94
|
+
_verbose (bool): Whether to print verbose cache operations
|
95
|
+
_mapper (FileNameCacheMapper): Maps remote paths to cache paths
|
96
|
+
storage (list[str]): List of cache storage locations
|
97
|
+
fs (AbstractFileSystem): Underlying filesystem being cached
|
98
|
+
|
99
|
+
Example:
|
100
|
+
>>> from fsspec import filesystem
|
101
|
+
>>> # Create monitored cache for S3
|
102
|
+
>>> s3 = filesystem("s3", key="ACCESS_KEY", secret="SECRET_KEY")
|
103
|
+
>>> cached_fs = MonitoredSimpleCacheFileSystem(
|
104
|
+
... fs=s3,
|
105
|
+
... cache_storage="/tmp/s3_cache",
|
106
|
+
... verbose=True
|
107
|
+
... )
|
108
|
+
>>>
|
109
|
+
>>> # Read file (downloads and caches)
|
110
|
+
>>> with cached_fs.open("bucket/data.csv") as f:
|
111
|
+
... data = f.read()
|
112
|
+
Downloading s3://bucket/data.csv
|
113
|
+
>>>
|
114
|
+
>>> # Second read uses cache
|
115
|
+
>>> with cached_fs.open("bucket/data.csv") as f:
|
116
|
+
... data = f.read() # No download message
|
117
|
+
"""
|
118
|
+
|
119
|
+
def __init__(self, **kwargs: Any):
|
120
|
+
"""Initialize monitored cache filesystem.
|
121
|
+
|
122
|
+
Args:
|
123
|
+
**kwargs: Configuration options including:
|
124
|
+
fs (AbstractFileSystem): Filesystem to cache
|
125
|
+
cache_storage (str): Cache directory path
|
126
|
+
verbose (bool): Enable verbose logging
|
127
|
+
And any other SimpleCacheFileSystem options
|
128
|
+
|
129
|
+
Example:
|
130
|
+
>>> # Cache with custom settings
|
131
|
+
>>> cached_fs = MonitoredSimpleCacheFileSystem(
|
132
|
+
... fs=remote_fs,
|
133
|
+
... cache_storage="/tmp/cache",
|
134
|
+
... verbose=True,
|
135
|
+
... same_names=True # Use original filenames
|
136
|
+
... )
|
137
|
+
"""
|
39
138
|
self._verbose = kwargs.get("verbose", False)
|
40
139
|
super().__init__(**kwargs)
|
41
140
|
self._mapper = FileNameCacheMapper(kwargs.get("cache_storage"))
|
42
141
|
|
43
|
-
def _check_file(self, path):
|
142
|
+
def _check_file(self, path: str) -> str | None:
|
143
|
+
"""Check if file exists in cache and download if needed.
|
144
|
+
|
145
|
+
Args:
|
146
|
+
path: Path to file in the remote filesystem
|
147
|
+
|
148
|
+
Returns:
|
149
|
+
str | None: Path to cached file if found/downloaded, None otherwise
|
150
|
+
|
151
|
+
Example:
|
152
|
+
>>> fs = MonitoredSimpleCacheFileSystem(
|
153
|
+
... fs=remote_fs,
|
154
|
+
... cache_storage="/tmp/cache"
|
155
|
+
... )
|
156
|
+
>>> cached_path = fs._check_file("data.csv")
|
157
|
+
>>> print(cached_path)
|
158
|
+
'/tmp/cache/data.csv'
|
159
|
+
"""
|
44
160
|
self._check_cache()
|
45
161
|
cache_path = self._mapper(path)
|
46
162
|
for storage in self.storage:
|
@@ -50,17 +166,50 @@ class MonitoredSimpleCacheFileSystem(SimpleCacheFileSystem):
|
|
50
166
|
if self._verbose:
|
51
167
|
logger.info(f"Downloading {self.protocol[0]}://{path}")
|
52
168
|
|
53
|
-
|
54
|
-
|
169
|
+
def size(self, path: str) -> int:
|
170
|
+
"""Get size of file in bytes.
|
171
|
+
|
172
|
+
Checks cache first, falls back to remote filesystem.
|
173
|
+
|
174
|
+
Args:
|
175
|
+
path: Path to file
|
55
176
|
|
56
|
-
|
177
|
+
Returns:
|
178
|
+
int: Size of file in bytes
|
179
|
+
|
180
|
+
Example:
|
181
|
+
>>> fs = MonitoredSimpleCacheFileSystem(
|
182
|
+
... fs=remote_fs,
|
183
|
+
... cache_storage="/tmp/cache"
|
184
|
+
... )
|
185
|
+
>>> size = fs.size("large_file.dat")
|
186
|
+
>>> print(f"File size: {size} bytes")
|
187
|
+
"""
|
57
188
|
cached_file = self._check_file(self._strip_protocol(path))
|
58
189
|
if cached_file is None:
|
59
190
|
return self.fs.size(path)
|
60
191
|
else:
|
61
192
|
return posixpath.getsize(cached_file)
|
62
193
|
|
63
|
-
def
|
194
|
+
def sync_cache(self, reload: bool = False) -> None:
|
195
|
+
"""Synchronize cache with remote filesystem.
|
196
|
+
|
197
|
+
Downloads all files in remote path to cache if not present.
|
198
|
+
|
199
|
+
Args:
|
200
|
+
reload: Whether to force reload all files, ignoring existing cache
|
201
|
+
|
202
|
+
Example:
|
203
|
+
>>> fs = MonitoredSimpleCacheFileSystem(
|
204
|
+
... fs=remote_fs,
|
205
|
+
... cache_storage="/tmp/cache"
|
206
|
+
... )
|
207
|
+
>>> # Initial sync
|
208
|
+
>>> fs.sync_cache()
|
209
|
+
>>>
|
210
|
+
>>> # Force reload all files
|
211
|
+
>>> fs.sync_cache(reload=True)
|
212
|
+
"""
|
64
213
|
if reload:
|
65
214
|
self.clear_cache()
|
66
215
|
content = self.glob("**/*")
|
@@ -154,6 +303,41 @@ class MonitoredSimpleCacheFileSystem(SimpleCacheFileSystem):
|
|
154
303
|
|
155
304
|
|
156
305
|
class GitLabFileSystem(AbstractFileSystem):
|
306
|
+
"""FSSpec-compatible filesystem interface for GitLab repositories.
|
307
|
+
|
308
|
+
Provides access to files in GitLab repositories through the GitLab API,
|
309
|
+
supporting read operations with authentication.
|
310
|
+
|
311
|
+
Attributes:
|
312
|
+
project_name (str): Name of the GitLab project
|
313
|
+
project_id (str): ID of the GitLab project
|
314
|
+
access_token (str): GitLab personal access token
|
315
|
+
branch (str): Git branch to read from
|
316
|
+
base_url (str): GitLab instance URL
|
317
|
+
|
318
|
+
Example:
|
319
|
+
>>> # Access public project
|
320
|
+
>>> fs = GitLabFileSystem(
|
321
|
+
... project_name="my-project",
|
322
|
+
... access_token="glpat-xxxx"
|
323
|
+
... )
|
324
|
+
>>>
|
325
|
+
>>> # Read file contents
|
326
|
+
>>> with fs.open("path/to/file.txt") as f:
|
327
|
+
... content = f.read()
|
328
|
+
>>>
|
329
|
+
>>> # List directory
|
330
|
+
>>> files = fs.ls("path/to/dir")
|
331
|
+
>>>
|
332
|
+
>>> # Access enterprise GitLab
|
333
|
+
>>> fs = GitLabFileSystem(
|
334
|
+
... project_id="12345",
|
335
|
+
... access_token="glpat-xxxx",
|
336
|
+
... base_url="https://gitlab.company.com",
|
337
|
+
... branch="develop"
|
338
|
+
... )
|
339
|
+
"""
|
340
|
+
|
157
341
|
def __init__(
|
158
342
|
self,
|
159
343
|
project_name: str | None = None,
|
@@ -163,6 +347,21 @@ class GitLabFileSystem(AbstractFileSystem):
|
|
163
347
|
base_url: str = "https://gitlab.com",
|
164
348
|
**kwargs,
|
165
349
|
):
|
350
|
+
"""Initialize GitLab filesystem.
|
351
|
+
|
352
|
+
Args:
|
353
|
+
project_name: Name of the GitLab project. Required if project_id not provided.
|
354
|
+
project_id: ID of the GitLab project. Required if project_name not provided.
|
355
|
+
access_token: GitLab personal access token for authentication.
|
356
|
+
Required for private repositories.
|
357
|
+
branch: Git branch to read from. Defaults to "main".
|
358
|
+
base_url: GitLab instance URL. Defaults to "https://gitlab.com".
|
359
|
+
**kwargs: Additional arguments passed to AbstractFileSystem.
|
360
|
+
|
361
|
+
Raises:
|
362
|
+
ValueError: If neither project_name nor project_id is provided
|
363
|
+
requests.RequestException: If GitLab API request fails
|
364
|
+
"""
|
166
365
|
super().__init__(**kwargs)
|
167
366
|
self.project_name = project_name
|
168
367
|
self.project_id = project_id
|
@@ -173,11 +372,29 @@ class GitLabFileSystem(AbstractFileSystem):
|
|
173
372
|
if not self.project_id:
|
174
373
|
self.project_id = self._get_project_id()
|
175
374
|
|
176
|
-
def _validate_init(self):
|
375
|
+
def _validate_init(self) -> None:
|
376
|
+
"""Validate initialization parameters.
|
377
|
+
|
378
|
+
Ensures that either project_id or project_name is provided.
|
379
|
+
|
380
|
+
Raises:
|
381
|
+
ValueError: If neither project_id nor project_name is provided
|
382
|
+
"""
|
177
383
|
if not self.project_id and not self.project_name:
|
178
384
|
raise ValueError("Either 'project_id' or 'project_name' must be provided")
|
179
385
|
|
180
|
-
def _get_project_id(self):
|
386
|
+
def _get_project_id(self) -> str:
|
387
|
+
"""Retrieve project ID from GitLab API using project name.
|
388
|
+
|
389
|
+
Makes an API request to search for projects and find the matching project ID.
|
390
|
+
|
391
|
+
Returns:
|
392
|
+
str: The GitLab project ID
|
393
|
+
|
394
|
+
Raises:
|
395
|
+
ValueError: If project not found
|
396
|
+
requests.RequestException: If API request fails
|
397
|
+
"""
|
181
398
|
url = f"{self.base_url}/api/v4/projects"
|
182
399
|
headers = {"PRIVATE-TOKEN": self.access_token}
|
183
400
|
params = {"search": self.project_name}
|
@@ -192,7 +409,29 @@ class GitLabFileSystem(AbstractFileSystem):
|
|
192
409
|
else:
|
193
410
|
response.raise_for_status()
|
194
411
|
|
195
|
-
def _open(self, path, mode="rb", **kwargs):
|
412
|
+
def _open(self, path: str, mode: str = "rb", **kwargs) -> MemoryFile:
|
413
|
+
"""Open a file from GitLab repository.
|
414
|
+
|
415
|
+
Retrieves file content from GitLab API and returns it as a memory file.
|
416
|
+
|
417
|
+
Args:
|
418
|
+
path: Path to file within repository
|
419
|
+
mode: File open mode. Only "rb" (read binary) is supported.
|
420
|
+
**kwargs: Additional arguments (unused)
|
421
|
+
|
422
|
+
Returns:
|
423
|
+
MemoryFile: File-like object containing file content
|
424
|
+
|
425
|
+
Raises:
|
426
|
+
NotImplementedError: If mode is not "rb"
|
427
|
+
requests.RequestException: If API request fails
|
428
|
+
|
429
|
+
Example:
|
430
|
+
>>> fs = GitLabFileSystem(project_id="12345", access_token="glpat-xxxx")
|
431
|
+
>>> with fs.open("README.md") as f:
|
432
|
+
... content = f.read()
|
433
|
+
... print(content.decode())
|
434
|
+
"""
|
196
435
|
if mode != "rb":
|
197
436
|
raise NotImplementedError("Only read mode is supported")
|
198
437
|
|
@@ -209,7 +448,34 @@ class GitLabFileSystem(AbstractFileSystem):
|
|
209
448
|
else:
|
210
449
|
response.raise_for_status()
|
211
450
|
|
212
|
-
def _ls(self, path, detail=False, **kwargs):
|
451
|
+
def _ls(self, path: str, detail: bool = False, **kwargs) -> list[str] | list[dict]:
|
452
|
+
"""List contents of a directory in GitLab repository.
|
453
|
+
|
454
|
+
Args:
|
455
|
+
path: Directory path within repository
|
456
|
+
detail: Whether to return detailed information about each entry.
|
457
|
+
If True, returns list of dicts with file metadata.
|
458
|
+
If False, returns list of filenames.
|
459
|
+
**kwargs: Additional arguments (unused)
|
460
|
+
|
461
|
+
Returns:
|
462
|
+
list[str] | list[dict]: List of file/directory names or detailed info
|
463
|
+
|
464
|
+
Raises:
|
465
|
+
requests.RequestException: If API request fails
|
466
|
+
|
467
|
+
Example:
|
468
|
+
>>> fs = GitLabFileSystem(project_id="12345", access_token="glpat-xxxx")
|
469
|
+
>>> # List filenames
|
470
|
+
>>> files = fs.ls("docs")
|
471
|
+
>>> print(files)
|
472
|
+
['README.md', 'API.md']
|
473
|
+
>>>
|
474
|
+
>>> # List with details
|
475
|
+
>>> details = fs.ls("docs", detail=True)
|
476
|
+
>>> for item in details:
|
477
|
+
... print(f"{item['name']}: {item['type']}")
|
478
|
+
"""
|
213
479
|
url = f"{self.base_url}/api/v4/projects/{self.project_id}/repository/tree?path={path}&ref={self.branch}"
|
214
480
|
headers = {"PRIVATE-TOKEN": self.access_token}
|
215
481
|
response = requests.get(url, headers=headers)
|
@@ -258,19 +524,77 @@ def get_filesystem(
|
|
258
524
|
fs: AbstractFileSystem | None = None,
|
259
525
|
**storage_options_kwargs,
|
260
526
|
) -> AbstractFileSystem:
|
261
|
-
"""
|
262
|
-
Get a filesystem based on the given path.
|
527
|
+
"""Get a filesystem instance based on path or configuration.
|
263
528
|
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
GcsStorageOptions | AzureStorageOptions | dict[str, str], optional) Storage options.
|
268
|
-
Defaults to None.
|
269
|
-
dirfs: (bool, optional) If True, return a DirFileSystem. Defaults to True.
|
270
|
-
cached: (bool, optional) If True, use a cached filesystem. Defaults to False.
|
271
|
-
cache_storage: (str, optional) Path to the cache storage. Defaults to None.
|
272
|
-
**storage_options_kwargs: Additional keyword arguments for the storage options.
|
529
|
+
This function creates and configures a filesystem instance based on the provided path
|
530
|
+
and options. It supports various filesystem types including local, S3, GCS, Azure,
|
531
|
+
and Git-based filesystems.
|
273
532
|
|
533
|
+
Args:
|
534
|
+
path: URI or path to the filesystem location. Examples:
|
535
|
+
- Local: "/path/to/data"
|
536
|
+
- S3: "s3://bucket/path"
|
537
|
+
- GCS: "gs://bucket/path"
|
538
|
+
- Azure: "abfs://container/path"
|
539
|
+
- GitHub: "github://org/repo/path"
|
540
|
+
storage_options: Configuration options for the filesystem. Can be:
|
541
|
+
- BaseStorageOptions object with protocol-specific settings
|
542
|
+
- Dictionary of key-value pairs for authentication/configuration
|
543
|
+
- None to use environment variables or default credentials
|
544
|
+
dirfs: Whether to wrap filesystem in DirFileSystem for path-based operations.
|
545
|
+
Set to False when you need direct protocol-specific features.
|
546
|
+
cached: Whether to enable local caching of remote files.
|
547
|
+
Useful for frequently accessed remote files.
|
548
|
+
cache_storage: Directory path for cached files. Defaults to path-based location
|
549
|
+
in current directory if not specified.
|
550
|
+
fs: Existing filesystem instance to wrap with caching or dirfs.
|
551
|
+
Use this to customize an existing filesystem instance.
|
552
|
+
**storage_options_kwargs: Additional keyword arguments for storage options.
|
553
|
+
Alternative to passing storage_options dictionary.
|
554
|
+
|
555
|
+
Returns:
|
556
|
+
AbstractFileSystem: Configured filesystem instance with requested features.
|
557
|
+
|
558
|
+
Raises:
|
559
|
+
ValueError: If storage protocol or options are invalid
|
560
|
+
FSSpecError: If filesystem initialization fails
|
561
|
+
ImportError: If required filesystem backend is not installed
|
562
|
+
|
563
|
+
Example:
|
564
|
+
>>> # Local filesystem
|
565
|
+
>>> fs = get_filesystem("/path/to/data")
|
566
|
+
>>>
|
567
|
+
>>> # S3 with credentials
|
568
|
+
>>> fs = get_filesystem(
|
569
|
+
... "s3://bucket/data",
|
570
|
+
... storage_options={
|
571
|
+
... "key": "ACCESS_KEY",
|
572
|
+
... "secret": "SECRET_KEY"
|
573
|
+
... }
|
574
|
+
... )
|
575
|
+
>>>
|
576
|
+
>>> # Cached GCS filesystem
|
577
|
+
>>> fs = get_filesystem(
|
578
|
+
... "gs://bucket/data",
|
579
|
+
... storage_options=GcsStorageOptions(
|
580
|
+
... token="service_account.json"
|
581
|
+
... ),
|
582
|
+
... cached=True,
|
583
|
+
... cache_storage="/tmp/gcs_cache"
|
584
|
+
... )
|
585
|
+
>>>
|
586
|
+
>>> # Azure with environment credentials
|
587
|
+
>>> fs = get_filesystem(
|
588
|
+
... "abfs://container/data",
|
589
|
+
... storage_options=AzureStorageOptions.from_env()
|
590
|
+
... )
|
591
|
+
>>>
|
592
|
+
>>> # Wrap existing filesystem
|
593
|
+
>>> base_fs = filesystem("s3", key="ACCESS", secret="SECRET")
|
594
|
+
>>> cached_fs = get_filesystem(
|
595
|
+
... fs=base_fs,
|
596
|
+
... cached=True
|
597
|
+
... )
|
274
598
|
"""
|
275
599
|
if fs is not None:
|
276
600
|
if cached:
|