FlowerPower: flowerpower-0.11.6.19-py3-none-any.whl → flowerpower-0.20.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/cfg/__init__.py +3 -3
- flowerpower/cfg/pipeline/__init__.py +5 -3
- flowerpower/cfg/project/__init__.py +3 -3
- flowerpower/cfg/project/job_queue.py +1 -128
- flowerpower/cli/__init__.py +5 -5
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/job_queue.py +401 -133
- flowerpower/cli/pipeline.py +14 -413
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +537 -28
- flowerpower/job_queue/__init__.py +5 -94
- flowerpower/job_queue/base.py +201 -3
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
- flowerpower/job_queue/rq/manager.py +388 -77
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +2 -2
- flowerpower/pipeline/io.py +14 -16
- flowerpower/pipeline/manager.py +21 -642
- flowerpower/pipeline/pipeline.py +571 -0
- flowerpower/pipeline/registry.py +242 -10
- flowerpower/pipeline/visualizer.py +1 -2
- flowerpower/plugins/_io/__init__.py +8 -0
- flowerpower/plugins/mqtt/manager.py +6 -6
- flowerpower/settings/backend.py +0 -2
- flowerpower/settings/job_queue.py +1 -57
- flowerpower/utils/misc.py +0 -256
- flowerpower/utils/monkey.py +1 -83
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
- flowerpower-0.20.0.dist-info/RECORD +58 -0
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.19.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/pipeline/registry.py
CHANGED
@@ -4,9 +4,11 @@
 import datetime as dt
 import os
 import posixpath
-
+import sys
+from typing import TYPE_CHECKING, Any, Dict

 import rich
+from fsspec_utils import AbstractFileSystem, filesystem
 from loguru import logger
 from rich.console import Console
 from rich.panel import Panel
@@ -17,15 +19,16 @@ from rich.tree import Tree
 from .. import settings
 # Import necessary config types and utility functions
 from ..cfg import PipelineConfig, ProjectConfig
-from ..fs import AbstractFileSystem
 from ..utils.logging import setup_logging
 # Assuming view_img might be used indirectly or needed later
 from ..utils.templates import (HOOK_TEMPLATE__MQTT_BUILD_CONFIG,
                                PIPELINE_PY_TEMPLATE)
+# Import base utilities
+from .base import load_module

 if TYPE_CHECKING:
-
-
+    from .pipeline import Pipeline
+    from ..flowerpower import FlowerPowerProject

 from enum import Enum

@@ -54,8 +57,8 @@ class PipelineRegistry:
         self,
         project_cfg: ProjectConfig,
         fs: AbstractFileSystem,
-
-
+        base_dir: str | None = None,
+        storage_options: dict | None = None,
     ):
         """
         Initializes the PipelineRegistry.
@@ -63,15 +66,244 @@ class PipelineRegistry:
         Args:
             project_cfg: The project configuration object.
             fs: The filesystem instance.
-
-
+            base_dir: The base directory path.
+            storage_options: Storage options for filesystem operations.
         """
         self.project_cfg = project_cfg
         self._fs = fs
-        self._cfg_dir =
-        self._pipelines_dir =
+        self._cfg_dir = settings.CONFIG_DIR
+        self._pipelines_dir = settings.PIPELINES_DIR
+        self._base_dir = base_dir
+        self._storage_options = storage_options or {}
         self._console = Console()

+        # Cache for loaded pipelines
+        self._pipeline_cache: Dict[str, "Pipeline"] = {}
+        self._config_cache: Dict[str, PipelineConfig] = {}
+        self._module_cache: Dict[str, Any] = {}
+
+        # Ensure module paths are added
+        self._add_modules_path()
+
+    @classmethod
+    def from_filesystem(
+        cls,
+        base_dir: str,
+        fs: AbstractFileSystem | None = None,
+        storage_options: dict | None = None,
+    ) -> "PipelineRegistry":
+        """
+        Create a PipelineRegistry from filesystem parameters.
+
+        This factory method creates a complete PipelineRegistry instance by:
+        1. Creating the filesystem if not provided
+        2. Loading the ProjectConfig from the base directory
+        3. Initializing the registry with the loaded configuration
+
+        Args:
+            base_dir: The base directory path for the FlowerPower project
+            fs: Optional filesystem instance. If None, will be created from base_dir
+            storage_options: Optional storage options for filesystem access
+
+        Returns:
+            PipelineRegistry: A fully configured registry instance
+
+        Raises:
+            ValueError: If base_dir is invalid or ProjectConfig cannot be loaded
+            RuntimeError: If filesystem creation fails
+
+        Example:
+            ```python
+            # Create registry from local directory
+            registry = PipelineRegistry.from_filesystem("/path/to/project")
+
+            # Create registry with S3 storage
+            registry = PipelineRegistry.from_filesystem(
+                "s3://my-bucket/project",
+                storage_options={"key": "secret"}
+            )
+            ```
+        """
+        # Create filesystem if not provided
+        if fs is None:
+            fs = filesystem(
+                base_dir,
+                storage_options=storage_options,
+                cached=storage_options is not None,
+            )
+
+        # Load project configuration
+        project_cfg = ProjectConfig.load(base_dir=base_dir, fs=fs)
+
+        # Ensure we have a ProjectConfig instance
+        if not isinstance(project_cfg, ProjectConfig):
+            raise TypeError(f"Expected ProjectConfig, got {type(project_cfg)}")
+
+        # Create and return registry instance
+        return cls(
+            project_cfg=project_cfg,
+            fs=fs,
+            base_dir=base_dir,
+            storage_options=storage_options,
+        )
+
+    def _add_modules_path(self) -> None:
+        """Add pipeline module paths to Python path."""
+        try:
+            if hasattr(self._fs, "is_cache_fs") and self._fs.is_cache_fs:
+                self._fs.sync_cache()
+                project_path = self._fs._mapper.directory
+                modules_path = posixpath.join(project_path, self._pipelines_dir)
+            else:
+                # Use the base directory directly if not using cache
+                if hasattr(self._fs, "path"):
+                    project_path = self._fs.path
+                elif self._base_dir:
+                    project_path = self._base_dir
+                else:
+                    # Fallback for mocked filesystems
+                    project_path = "."
+                modules_path = posixpath.join(project_path, self._pipelines_dir)
+
+            if project_path not in sys.path:
+                sys.path.insert(0, project_path)
+
+            if modules_path not in sys.path:
+                sys.path.insert(0, modules_path)
+        except (AttributeError, TypeError):
+            # Handle case where filesystem is mocked or doesn't have required properties
+            logger.debug("Could not add modules path - using default Python path")
+
+    # --- Pipeline Factory Methods ---
+
+    def get_pipeline(
+        self, name: str, project_context: "FlowerPowerProject", reload: bool = False
+    ) -> "Pipeline":
+        """Get a Pipeline instance for the given name.
+
+        This method creates a fully-formed Pipeline object by loading its configuration
+        and Python module, then injecting the project context.
+
+        Args:
+            name: Name of the pipeline to get
+            project_context: Reference to the FlowerPowerProject
+            reload: Whether to reload configuration and module from disk
+
+        Returns:
+            Pipeline instance ready for execution
+
+        Raises:
+            FileNotFoundError: If pipeline configuration or module doesn't exist
+            ImportError: If pipeline module cannot be imported
+            ValueError: If pipeline configuration is invalid
+        """
+        # Use cache if available and not reloading
+        if not reload and name in self._pipeline_cache:
+            logger.debug(f"Returning cached pipeline '{name}'")
+            return self._pipeline_cache[name]
+
+        logger.debug(f"Creating pipeline instance for '{name}'")
+
+        # Load pipeline configuration
+        config = self.load_config(name, reload=reload)
+
+        # Load pipeline module
+        module = self.load_module(name, reload=reload)
+
+        # Import Pipeline class here to avoid circular import
+        from .pipeline import Pipeline
+
+        # Create Pipeline instance
+        pipeline = Pipeline(
+            name=name,
+            config=config,
+            module=module,
+            project_context=project_context,
+        )
+
+        # Cache the pipeline instance
+        self._pipeline_cache[name] = pipeline
+
+        logger.debug(f"Successfully created pipeline instance for '{name}'")
+        return pipeline
+
+    def load_config(self, name: str, reload: bool = False) -> PipelineConfig:
+        """Load pipeline configuration from disk.
+
+        Args:
+            name: Name of the pipeline
+            reload: Whether to reload from disk even if cached
+
+        Returns:
+            PipelineConfig instance
+        """
+        # Use cache if available and not reloading
+        if not reload and name in self._config_cache:
+            logger.debug(f"Returning cached config for pipeline '{name}'")
+            return self._config_cache[name]
+
+        logger.debug(f"Loading configuration for pipeline '{name}'")
+
+        # Load configuration from disk
+        config = PipelineConfig.load(
+            base_dir=self._base_dir,
+            name=name,
+            fs=self._fs,
+            storage_options=self._storage_options,
+        )
+
+        # Cache the configuration
+        self._config_cache[name] = config
+
+        return config
+
+    def load_module(self, name: str, reload: bool = False) -> Any:
+        """Load pipeline module from disk.
+
+        Args:
+            name: Name of the pipeline
+            reload: Whether to reload from disk even if cached
+
+        Returns:
+            Loaded Python module
+        """
+        # Use cache if available and not reloading
+        if not reload and name in self._module_cache:
+            logger.debug(f"Returning cached module for pipeline '{name}'")
+            return self._module_cache[name]
+
+        logger.debug(f"Loading module for pipeline '{name}'")
+
+        # Convert pipeline name to module name
+        formatted_name = name.replace(".", "/").replace("-", "_")
+        module_name = f"pipelines.{formatted_name}"
+
+        # Load the module
+        module = load_module(module_name, reload=reload)
+
+        # Cache the module
+        self._module_cache[name] = module
+
+        return module
+
+    def clear_cache(self, name: str | None = None):
+        """Clear cached pipelines, configurations, and modules.
+
+        Args:
+            name: If provided, clear cache only for this pipeline.
+                  If None, clear entire cache.
+        """
+        if name:
+            logger.debug(f"Clearing cache for pipeline '{name}'")
+            self._pipeline_cache.pop(name, None)
+            self._config_cache.pop(name, None)
+            self._module_cache.pop(name, None)
+        else:
+            logger.debug("Clearing entire pipeline cache")
+            self._pipeline_cache.clear()
+            self._config_cache.clear()
+            self._module_cache.clear()
+
     # --- Methods moved from PipelineManager ---
     def new(self, name: str, overwrite: bool = False):
         """
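Note: the registry above now memoizes configs, modules, and Pipeline objects per name and gains a `from_filesystem` factory. A minimal usage sketch based only on the signatures shown in this diff; the project path and pipeline name are placeholders:

```python
from flowerpower.pipeline.registry import PipelineRegistry

# Build a registry straight from a project directory; the filesystem and
# ProjectConfig are created inside the from_filesystem factory shown above.
registry = PipelineRegistry.from_filesystem("/path/to/project")  # placeholder path

# Configs and modules are cached per pipeline name after the first load.
cfg = registry.load_config("my_pipeline")          # read from disk, then cached
cfg_cached = registry.load_config("my_pipeline")   # served from the cache
cfg_fresh = registry.load_config("my_pipeline", reload=True)  # force a re-read

# Drop cached entries for one pipeline, or everything when called with no name.
registry.clear_cache("my_pipeline")
```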
flowerpower/pipeline/visualizer.py
CHANGED
@@ -1,12 +1,11 @@
 import posixpath
-from typing import Any

+from fsspec_utils import AbstractFileSystem
 from hamilton import driver
 from rich import print

 # Import necessary config types and utility functions
 from ..cfg import PipelineConfig, ProjectConfig
-from ..fs import AbstractFileSystem
 from ..utils.misc import view_img
 from .base import load_module  # Import module loading utility

flowerpower/plugins/mqtt/manager.py
CHANGED
@@ -7,6 +7,7 @@ from types import TracebackType
 from typing import Any, Callable

 import mmh3
+from fsspec_utils import AbstractFileSystem, BaseStorageOptions, filesystem
 from loguru import logger
 from munch import Munch
 from paho.mqtt.client import (MQTT_ERR_SUCCESS, CallbackAPIVersion, Client,
@@ -16,9 +17,7 @@ from paho.mqtt.reasoncodes import ReasonCode
 from ...cfg import ProjectConfig
 from ...cfg.pipeline.run import ExecutorConfig, WithAdapterConfig
 from ...cfg.project.adapter import AdapterConfig
-from ...fs import AbstractFileSystem, BaseStorageOptions, get_filesystem
 from ...pipeline.manager import PipelineManager
-from ...utils.callback import run_with_callback
 from ...utils.logging import setup_logging
 from .cfg import MqttConfig

@@ -132,8 +131,9 @@ class MqttManager:
         import os

         if fs is None:
-            fs =
-
+            fs = filesystem(
+                protocol_or_path=os.path.dirname(path),
+                storage_options=storage_options,
             )

         cfg = MqttConfig.from_yaml(path=os.path.basename(path), fs=fs)
@@ -637,7 +637,7 @@ class MqttManager:
             storage_options=storage_options, fs=fs, base_dir=base_dir
         ) as pipeline:
             if as_job:
-
+                pipeline.add_job(
                     name=name,
                     inputs=inputs,
                     final_vars=final_vars,
@@ -664,7 +664,7 @@ class MqttManager:
                 )

             else:
-
+                pipeline.run(
                     name=name,
                     inputs=inputs,
                     final_vars=final_vars,
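Note: the manager now builds its filesystem with `fsspec_utils.filesystem` instead of the removed `...fs.get_filesystem` helper. A rough sketch of the call pattern used above when loading a config from a path; the path value is a placeholder and the surrounding method is omitted:

```python
import os

from fsspec_utils import filesystem

path = "conf/mqtt.yml"   # placeholder config location
storage_options = None   # or a dict of credentials for remote storage

# The directory becomes the filesystem root; the file name is then read
# relative to it, mirroring the filesystem(...) / MqttConfig.from_yaml(...)
# pair in the hunk above.
fs = filesystem(
    protocol_or_path=os.path.dirname(path),
    storage_options=storage_options,
)
# cfg = MqttConfig.from_yaml(path=os.path.basename(path), fs=fs)
```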
flowerpower/settings/job_queue.py
CHANGED
@@ -1,7 +1,7 @@
 import os

 from .backend import BACKEND_PROPERTIES
-from .executor import
+from .executor import EXECUTOR_NUM_CPUS

 # WORKER
 JOB_QUEUE_TYPE = os.getenv("FP_JOB_QUEUE_TYPE", "rq")
@@ -29,59 +29,3 @@ RQ_QUEUES = (
     .split(",")
 )
 RQ_NUM_WORKERS = int(os.getenv("FP_RQ_NUM_WORKERS", EXECUTOR_NUM_CPUS))
-
-# APS WORKER
-APS_BACKEND_DS = os.getenv("FP_APS_BACKEND_DS", "memory")
-
-APS_BACKEND_DS_HOST = os.getenv(
-    "FP_APS_BACKEND_DS_HOST",
-    BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_host", None),
-)
-APS_BACKEND_DS_PORT = int(
-    os.getenv(
-        "FP_APS_BACKEND_DS_PORT",
-        BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_port", 0),
-    )
-)
-APS_BACKEND_DS_DB = os.getenv(
-    "FP_APS_BACKEND_DS_DB",
-    BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_database", None),
-)
-APS_BACKEND_DS_USERNAME = os.getenv(
-    "FP_APS_BACKEND_DS_USERNAME",
-    BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_username", None),
-)
-APS_BACKEND_DS_PASSWORD = os.getenv(
-    "FP_APS_BACKEND_DS_PASSWORD",
-    BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_password", None),
-)
-APS_BACKEND_DS_SCHEMA = os.getenv("FP_APS_BACKEND_DS_SCHEMA", "flowerpower")
-
-APS_BACKEND_EB = os.getenv("FP_APS_BACKEND_EB", "memory")
-APS_BACKEND_EB_HOST = os.getenv(
-    "FP_APS_BACKEND_EB_HOST",
-    BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_host", None),
-)
-APS_BACKEND_EB_PORT = int(
-    os.getenv(
-        "FP_APS_BACKEND_EB_PORT",
-        BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_port", 0),
-    )
-)
-APS_BACKEND_EB_DB = os.getenv(
-    "FP_APS_BACKEND_EB_DB",
-    BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_database", None),
-)
-APS_BACKEND_EB_USERNAME = os.getenv(
-    "FP_APS_BACKEND_EB_USERNAME",
-    BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_username", None),
-)
-APS_BACKEND_EB_PASSWORD = os.getenv(
-    "FP_APS_BACKEND_EB_PASSWORD",
-    BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_password", None),
-)
-
-APS_CLEANUP_INTERVAL = int(os.getenv("FP_APS_CLEANUP_INTERVAL", 300))
-APS_MAX_CONCURRENT_JOBS = int(os.getenv("FP_APS_MAX_CONCURRENT_JOBS", 10))
-APS_DEFAULT_EXECUTOR = os.getenv("FP_APS_DEFAULT_EXECUTOR", EXECUTOR)
-APS_NUM_WORKERS = int(os.getenv("FP_APS_NUM_WORKERS", EXECUTOR_MAX_WORKERS))
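Note: with the APScheduler block removed, this module only derives RQ defaults from environment variables at import time. A small sketch of overriding those defaults before the settings module is first imported; the variable names come from the lines above, the values are illustrative:

```python
import os

# Must be set before flowerpower.settings.job_queue is first imported,
# since the defaults are read at import time via os.getenv(...).
os.environ["FP_JOB_QUEUE_TYPE"] = "rq"
os.environ["FP_RQ_NUM_WORKERS"] = "8"

from flowerpower.settings import job_queue  # noqa: E402

print(job_queue.JOB_QUEUE_TYPE, job_queue.RQ_NUM_WORKERS)
```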
flowerpower/utils/misc.py
CHANGED
@@ -8,262 +8,6 @@ from typing import Any

 import msgspec

-if importlib.util.find_spec("pyarrow"):
-    import pyarrow as pa
-
-    def convert_large_types_to_standard(schema: pa.Schema) -> pa.Schema:
-        # Define mapping of large types to standard types
-        type_mapping = {
-            pa.large_string(): pa.string(),
-            pa.large_binary(): pa.binary(),
-            pa.large_list(pa.null()): pa.list_(pa.null()),
-        }
-
-        # Convert fields
-        new_fields = []
-        for field in schema:
-            field_type = field.type
-            # Check if type exists in mapping
-            if field_type in type_mapping:
-                new_field = pa.field(
-                    name=field.name,
-                    type=type_mapping[field_type],
-                    nullable=field.nullable,
-                    metadata=field.metadata,
-                )
-                new_fields.append(new_field)
-            # Handle large lists with nested types
-            elif isinstance(field_type, pa.LargeListType):
-                new_field = pa.field(
-                    name=field.name,
-                    type=pa.list_(field_type.value_type),
-                    nullable=field.nullable,
-                    metadata=field.metadata,
-                )
-                new_fields.append(new_field)
-            else:
-                new_fields.append(field)
-
-        return pa.schema(new_fields)
-
-
-else:
-
-    def convert_large_types_to_standard(*args, **kwargs):
-        raise ImportError("pyarrow not installed")
-
-
-if importlib.util.find_spec("polars"):
-    import polars as pl
-
-    def _dict_to_dataframe(
-        data: dict | list[dict], unique: bool | list[str] | str = False
-    ) -> pl.DataFrame:
-        """
-        Convert a dictionary or list of dictionaries to a polars DataFrame.
-
-        Args:
-            data: (dict | list[dict]) Data to convert.
-
-        Returns:
-            pl.DataFrame: Converted data.
-
-        Examples:
-            >>> # Single dict with list values
-            >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6]}
-            >>> _dict_to_dataframe(data)
-            shape: (3, 2)
-            ┌─────┬─────┐
-            │ a   ┆ b   │
-            │ --- ┆ --- │
-            │ i64 ┆ i64 │
-            ╞═════╪═════╡
-            │ 1   ┆ 4   │
-            │ 2   ┆ 5   │
-            │ 3   ┆ 6   │
-            └─────┴─────┘
-
-            >>> # Single dict with scalar values
-            >>> data = {'a': 1, 'b': 2}
-            >>> _dict_to_dataframe(data)
-            shape: (1, 2)
-            ┌─────┬─────┐
-            │ a   ┆ b   │
-            │ --- ┆ --- │
-            │ i64 ┆ i64 │
-            ╞═════╪═════╡
-            │ 1   ┆ 2   │
-            └─────┴─────┘
-
-            >>> # List of dicts with scalar values
-            >>> data = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
-            >>> _dict_to_dataframe(data)
-            shape: (2, 2)
-            ┌─────┬─────┐
-            │ a   ┆ b   │
-            │ --- ┆ --- │
-            │ i64 ┆ i64 │
-            ╞═════╪═════╡
-            │ 1   ┆ 2   │
-            │ 3   ┆ 4   │
-            └─────┴─────┘
-
-            >>> # List of dicts with list values
-            >>> data = [{'a': [1, 2], 'b': [3, 4]}, {'a': [5, 6], 'b': [7, 8]}]
-            >>> _dict_to_dataframe(data)
-            shape: (2, 2)
-            ┌───────┬───────┐
-            │ a     ┆ b     │
-            │ ---   ┆ ---   │
-            │ list  ┆ list  │
-            ╞═══════╪═══════╡
-            │ [1,2] ┆ [3,4] │
-            │ [5,6] ┆ [7,8] │
-            └───────┴───────┘
-        """
-        if isinstance(data, list):
-            # If it's a single-element list, just use the first element
-            if len(data) == 1:
-                data = data[0]
-            # If it's a list of dicts
-            else:
-                first_item = data[0]
-                # Check if the dict values are lists/tuples
-                if any(isinstance(v, (list, tuple)) for v in first_item.values()):
-                    # Each dict becomes a row with list/tuple values
-                    data = pl.DataFrame(data)
-                else:
-                    # If values are scalars, convert list of dicts to DataFrame
-                    data = pl.DataFrame(data)
-
-                if unique:
-                    data = data.unique(
-                        subset=None if not isinstance(unique, str | list) else unique,
-                        maintain_order=True,
-                    )
-                return data
-
-        # If it's a single dict
-        if isinstance(data, dict):
-            # Check if values are lists/tuples
-            if any(isinstance(v, (list, tuple)) for v in data.values()):
-                # Get the length of any list value (assuming all lists have same length)
-                length = len(
-                    next(v for v in data.values() if isinstance(v, (list, tuple)))
-                )
-                # Convert to DataFrame where each list element becomes a row
-                data = pl.DataFrame({
-                    k: v if isinstance(v, (list, tuple)) else [v] * length
-                    for k, v in data.items()
-                })
-            else:
-                # If values are scalars, wrap them in a list to create a single row
-                data = pl.DataFrame({k: [v] for k, v in data.items()})
-
-            if unique:
-                data = data.unique(
-                    subset=None if not isinstance(unique, str | list) else unique,
-                    maintain_order=True,
-                )
-            return data
-
-        raise ValueError("Input must be a dictionary or list of dictionaries")
-
-else:
-
-    def _dict_to_dataframe(*args, **kwargs):
-        raise ImportError("polars not installed")
-
-
-if (
-    importlib.util.find_spec("pandas")
-    and importlib.util.find_spec("polars")
-    and importlib.util.find_spec("pyarrow")
-):
-    from typing import Generator
-
-    import pandas as pd
-
-    def to_pyarrow_table(
-        data: pl.DataFrame
-        | pl.LazyFrame
-        | pd.DataFrame
-        | dict
-        | list[pl.DataFrame | pl.LazyFrame | pd.DataFrame | dict],
-        concat: bool = False,
-        unique: bool | list[str] | str = False,
-    ) -> pa.Table:
-        if isinstance(data, dict):
-            data = _dict_to_dataframe(data)
-        if isinstance(data, list):
-            if isinstance(data[0], dict):
-                data = _dict_to_dataframe(data, unique=unique)
-
-        if not isinstance(data, list):
-            data = [data]
-
-        if isinstance(data[0], pl.LazyFrame):
-            data = [dd.collect() for dd in data]
-
-        if isinstance(data[0], pl.DataFrame):
-            if concat:
-                data = pl.concat(data, how="diagonal_relaxed")
-                if unique:
-                    data = data.unique(
-                        subset=None if not isinstance(unique, str | list) else unique,
-                        maintain_order=True,
-                    )
-                data = data.to_arrow()
-                data = data.cast(convert_large_types_to_standard(data.schema))
-            else:
-                data = [dd.to_arrow() for dd in data]
-                data = [
-                    dd.cast(convert_large_types_to_standard(dd.schema)) for dd in data
-                ]
-
-        elif isinstance(data[0], pd.DataFrame):
-            data = [pa.Table.from_pandas(dd, preserve_index=False) for dd in data]
-            if concat:
-                data = pa.concat_tables(data, promote_options="permissive")
-                if unique:
-                    data = (
-                        pl.from_arrow(data)
-                        .unique(
-                            subset=None
-                            if not isinstance(unique, str | list)
-                            else unique,
-                            maintain_order=True,
-                        )
-                        .to_arrow()
-                    )
-                data = data.cast(convert_large_types_to_standard(data.schema))
-
-        elif isinstance(data[0], pa.RecordBatch | pa.RecordBatchReader | Generator):
-            if concat:
-                data = pa.Table.from_batches(data)
-                if unique:
-                    data = (
-                        pl.from_arrow(data)
-                        .unique(
-                            subset=None
-                            if not isinstance(unique, str | list)
-                            else unique,
-                            maintain_order=True,
-                        )
-                        .to_arrow()
-                    )
-                data = data.cast(convert_large_types_to_standard(data.schema))
-            else:
-                data = [pa.Table.from_batches([dd]) for dd in data]
-
-        return data
-
-else:
-
-    def to_pyarrow_table(*args, **kwargs):
-        raise ImportError("pandas, polars, or pyarrow not installed")
-
-
 if importlib.util.find_spec("joblib"):
     from joblib import Parallel, delayed
     from rich.progress import (BarColumn, Progress, TextColumn,
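Note: every helper removed above, and the joblib block that remains, follows the same optional-dependency guard: probe for the package with `importlib.util.find_spec` and fall back to a stub that raises `ImportError`. A minimal, self-contained sketch of that pattern; `make_schema` is a hypothetical name, not part of FlowerPower:

```python
import importlib.util

if importlib.util.find_spec("pyarrow"):
    import pyarrow as pa

    def make_schema() -> "pa.Schema":
        # Real implementation is only defined when pyarrow can be imported.
        return pa.schema([pa.field("id", pa.int64())])

else:

    def make_schema(*args, **kwargs):
        # Stub keeps the module importable; calling it reports the missing extra.
        raise ImportError("pyarrow not installed")
```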