flowerpower-0.11.6.19-py3-none-any.whl → flowerpower-0.20.0-py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published.
Files changed (80)
  1. flowerpower/cfg/__init__.py +3 -3
  2. flowerpower/cfg/pipeline/__init__.py +5 -3
  3. flowerpower/cfg/project/__init__.py +3 -3
  4. flowerpower/cfg/project/job_queue.py +1 -128
  5. flowerpower/cli/__init__.py +5 -5
  6. flowerpower/cli/cfg.py +0 -3
  7. flowerpower/cli/job_queue.py +401 -133
  8. flowerpower/cli/pipeline.py +14 -413
  9. flowerpower/cli/utils.py +0 -1
  10. flowerpower/flowerpower.py +537 -28
  11. flowerpower/job_queue/__init__.py +5 -94
  12. flowerpower/job_queue/base.py +201 -3
  13. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
  14. flowerpower/job_queue/rq/manager.py +388 -77
  15. flowerpower/pipeline/__init__.py +2 -0
  16. flowerpower/pipeline/base.py +2 -2
  17. flowerpower/pipeline/io.py +14 -16
  18. flowerpower/pipeline/manager.py +21 -642
  19. flowerpower/pipeline/pipeline.py +571 -0
  20. flowerpower/pipeline/registry.py +242 -10
  21. flowerpower/pipeline/visualizer.py +1 -2
  22. flowerpower/plugins/_io/__init__.py +8 -0
  23. flowerpower/plugins/mqtt/manager.py +6 -6
  24. flowerpower/settings/backend.py +0 -2
  25. flowerpower/settings/job_queue.py +1 -57
  26. flowerpower/utils/misc.py +0 -256
  27. flowerpower/utils/monkey.py +1 -83
  28. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
  29. flowerpower-0.20.0.dist-info/RECORD +58 -0
  30. flowerpower/fs/__init__.py +0 -29
  31. flowerpower/fs/base.py +0 -662
  32. flowerpower/fs/ext.py +0 -2143
  33. flowerpower/fs/storage_options.py +0 -1420
  34. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  35. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  36. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  37. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  38. flowerpower/job_queue/apscheduler/setup.py +0 -554
  39. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  40. flowerpower/job_queue/apscheduler/utils.py +0 -311
  41. flowerpower/pipeline/job_queue.py +0 -583
  42. flowerpower/pipeline/runner.py +0 -603
  43. flowerpower/plugins/io/base.py +0 -2520
  44. flowerpower/plugins/io/helpers/datetime.py +0 -298
  45. flowerpower/plugins/io/helpers/polars.py +0 -875
  46. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  47. flowerpower/plugins/io/helpers/sql.py +0 -202
  48. flowerpower/plugins/io/loader/__init__.py +0 -28
  49. flowerpower/plugins/io/loader/csv.py +0 -37
  50. flowerpower/plugins/io/loader/deltatable.py +0 -190
  51. flowerpower/plugins/io/loader/duckdb.py +0 -19
  52. flowerpower/plugins/io/loader/json.py +0 -37
  53. flowerpower/plugins/io/loader/mqtt.py +0 -159
  54. flowerpower/plugins/io/loader/mssql.py +0 -26
  55. flowerpower/plugins/io/loader/mysql.py +0 -26
  56. flowerpower/plugins/io/loader/oracle.py +0 -26
  57. flowerpower/plugins/io/loader/parquet.py +0 -35
  58. flowerpower/plugins/io/loader/postgres.py +0 -26
  59. flowerpower/plugins/io/loader/pydala.py +0 -19
  60. flowerpower/plugins/io/loader/sqlite.py +0 -23
  61. flowerpower/plugins/io/metadata.py +0 -244
  62. flowerpower/plugins/io/saver/__init__.py +0 -28
  63. flowerpower/plugins/io/saver/csv.py +0 -36
  64. flowerpower/plugins/io/saver/deltatable.py +0 -186
  65. flowerpower/plugins/io/saver/duckdb.py +0 -19
  66. flowerpower/plugins/io/saver/json.py +0 -36
  67. flowerpower/plugins/io/saver/mqtt.py +0 -28
  68. flowerpower/plugins/io/saver/mssql.py +0 -26
  69. flowerpower/plugins/io/saver/mysql.py +0 -26
  70. flowerpower/plugins/io/saver/oracle.py +0 -26
  71. flowerpower/plugins/io/saver/parquet.py +0 -36
  72. flowerpower/plugins/io/saver/postgres.py +0 -26
  73. flowerpower/plugins/io/saver/pydala.py +0 -20
  74. flowerpower/plugins/io/saver/sqlite.py +0 -24
  75. flowerpower/utils/scheduler.py +0 -311
  76. flowerpower-0.11.6.19.dist-info/RECORD +0 -102
  77. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
  78. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
  79. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
  80. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
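The headline changes: the bundled `flowerpower.fs` module and the APScheduler job-queue backend are removed, the `flowerpower.plugins.io` package moves to the separate `flowerpower-io` distribution, and filesystem handling is delegated to the external `fsspec_utils` package. A minimal sketch of the import migration implied by the hunks below (illustrative only; the bucket and credentials are placeholders):

```python
# 0.11.x: filesystem helpers shipped inside flowerpower
# from flowerpower.fs import AbstractFileSystem, get_filesystem

# 0.20.0: the equivalent names come from the external fsspec_utils package
from fsspec_utils import AbstractFileSystem, filesystem

# filesystem() replaces get_filesystem(); the hunks below show it accepting
# a path or protocol, storage_options, and a `cached` flag
fs: AbstractFileSystem = filesystem(
    "s3://my-bucket/project",           # placeholder bucket
    storage_options={"key": "secret"},  # placeholder credentials
    cached=True,
)
```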
flowerpower/pipeline/registry.py CHANGED
@@ -4,9 +4,11 @@
  import datetime as dt
  import os
  import posixpath
- from typing import TYPE_CHECKING
+ import sys
+ from typing import TYPE_CHECKING, Any, Dict

  import rich
+ from fsspec_utils import AbstractFileSystem, filesystem
  from loguru import logger
  from rich.console import Console
  from rich.panel import Panel
@@ -17,15 +19,16 @@ from rich.tree import Tree
  from .. import settings
  # Import necessary config types and utility functions
  from ..cfg import PipelineConfig, ProjectConfig
- from ..fs import AbstractFileSystem
  from ..utils.logging import setup_logging
  # Assuming view_img might be used indirectly or needed later
  from ..utils.templates import (HOOK_TEMPLATE__MQTT_BUILD_CONFIG,
                                 PIPELINE_PY_TEMPLATE)
+ # Import base utilities
+ from .base import load_module

  if TYPE_CHECKING:
-     # Keep this for type hinting if needed elsewhere, though Config is imported directly now
-     pass
+     from .pipeline import Pipeline
+     from ..flowerpower import FlowerPowerProject

  from enum import Enum

@@ -54,8 +57,8 @@ class PipelineRegistry:
          self,
          project_cfg: ProjectConfig,
          fs: AbstractFileSystem,
-         cfg_dir: str,
-         pipelines_dir: str,
+         base_dir: str | None = None,
+         storage_options: dict | None = None,
      ):
          """
          Initializes the PipelineRegistry.
@@ -63,15 +66,244 @@
          Args:
              project_cfg: The project configuration object.
              fs: The filesystem instance.
-             cfg_dir: The configuration directory path.
-             pipelines_dir: The pipelines directory path.
+             base_dir: The base directory path.
+             storage_options: Storage options for filesystem operations.
          """
          self.project_cfg = project_cfg
          self._fs = fs
-         self._cfg_dir = cfg_dir
-         self._pipelines_dir = pipelines_dir
+         self._cfg_dir = settings.CONFIG_DIR
+         self._pipelines_dir = settings.PIPELINES_DIR
+         self._base_dir = base_dir
+         self._storage_options = storage_options or {}
          self._console = Console()

+         # Cache for loaded pipelines
+         self._pipeline_cache: Dict[str, "Pipeline"] = {}
+         self._config_cache: Dict[str, PipelineConfig] = {}
+         self._module_cache: Dict[str, Any] = {}
+
+         # Ensure module paths are added
+         self._add_modules_path()
+
+     @classmethod
+     def from_filesystem(
+         cls,
+         base_dir: str,
+         fs: AbstractFileSystem | None = None,
+         storage_options: dict | None = None,
+     ) -> "PipelineRegistry":
+         """
+         Create a PipelineRegistry from filesystem parameters.
+
+         This factory method creates a complete PipelineRegistry instance by:
+         1. Creating the filesystem if not provided
+         2. Loading the ProjectConfig from the base directory
+         3. Initializing the registry with the loaded configuration
+
+         Args:
+             base_dir: The base directory path for the FlowerPower project
+             fs: Optional filesystem instance. If None, will be created from base_dir
+             storage_options: Optional storage options for filesystem access
+
+         Returns:
+             PipelineRegistry: A fully configured registry instance
+
+         Raises:
+             ValueError: If base_dir is invalid or ProjectConfig cannot be loaded
+             RuntimeError: If filesystem creation fails
+
+         Example:
+             ```python
+             # Create registry from local directory
+             registry = PipelineRegistry.from_filesystem("/path/to/project")
+
+             # Create registry with S3 storage
+             registry = PipelineRegistry.from_filesystem(
+                 "s3://my-bucket/project",
+                 storage_options={"key": "secret"}
+             )
+             ```
+         """
+         # Create filesystem if not provided
+         if fs is None:
+             fs = filesystem(
+                 base_dir,
+                 storage_options=storage_options,
+                 cached=storage_options is not None,
+             )
+
+         # Load project configuration
+         project_cfg = ProjectConfig.load(base_dir=base_dir, fs=fs)
+
+         # Ensure we have a ProjectConfig instance
+         if not isinstance(project_cfg, ProjectConfig):
+             raise TypeError(f"Expected ProjectConfig, got {type(project_cfg)}")
+
+         # Create and return registry instance
+         return cls(
+             project_cfg=project_cfg,
+             fs=fs,
+             base_dir=base_dir,
+             storage_options=storage_options,
+         )
+
+     def _add_modules_path(self) -> None:
+         """Add pipeline module paths to Python path."""
+         try:
+             if hasattr(self._fs, "is_cache_fs") and self._fs.is_cache_fs:
+                 self._fs.sync_cache()
+                 project_path = self._fs._mapper.directory
+                 modules_path = posixpath.join(project_path, self._pipelines_dir)
+             else:
+                 # Use the base directory directly if not using cache
+                 if hasattr(self._fs, "path"):
+                     project_path = self._fs.path
+                 elif self._base_dir:
+                     project_path = self._base_dir
+                 else:
+                     # Fallback for mocked filesystems
+                     project_path = "."
+                 modules_path = posixpath.join(project_path, self._pipelines_dir)
+
+             if project_path not in sys.path:
+                 sys.path.insert(0, project_path)
+
+             if modules_path not in sys.path:
+                 sys.path.insert(0, modules_path)
+         except (AttributeError, TypeError):
+             # Handle case where filesystem is mocked or doesn't have required properties
+             logger.debug("Could not add modules path - using default Python path")
+
+     # --- Pipeline Factory Methods ---
+
+     def get_pipeline(
+         self, name: str, project_context: "FlowerPowerProject", reload: bool = False
+     ) -> "Pipeline":
+         """Get a Pipeline instance for the given name.
+
+         This method creates a fully-formed Pipeline object by loading its configuration
+         and Python module, then injecting the project context.
+
+         Args:
+             name: Name of the pipeline to get
+             project_context: Reference to the FlowerPowerProject
+             reload: Whether to reload configuration and module from disk
+
+         Returns:
+             Pipeline instance ready for execution
+
+         Raises:
+             FileNotFoundError: If pipeline configuration or module doesn't exist
+             ImportError: If pipeline module cannot be imported
+             ValueError: If pipeline configuration is invalid
+         """
+         # Use cache if available and not reloading
+         if not reload and name in self._pipeline_cache:
+             logger.debug(f"Returning cached pipeline '{name}'")
+             return self._pipeline_cache[name]
+
+         logger.debug(f"Creating pipeline instance for '{name}'")
+
+         # Load pipeline configuration
+         config = self.load_config(name, reload=reload)
+
+         # Load pipeline module
+         module = self.load_module(name, reload=reload)
+
+         # Import Pipeline class here to avoid circular import
+         from .pipeline import Pipeline
+
+         # Create Pipeline instance
+         pipeline = Pipeline(
+             name=name,
+             config=config,
+             module=module,
+             project_context=project_context,
+         )
+
+         # Cache the pipeline instance
+         self._pipeline_cache[name] = pipeline
+
+         logger.debug(f"Successfully created pipeline instance for '{name}'")
+         return pipeline
+
+     def load_config(self, name: str, reload: bool = False) -> PipelineConfig:
+         """Load pipeline configuration from disk.
+
+         Args:
+             name: Name of the pipeline
+             reload: Whether to reload from disk even if cached
+
+         Returns:
+             PipelineConfig instance
+         """
+         # Use cache if available and not reloading
+         if not reload and name in self._config_cache:
+             logger.debug(f"Returning cached config for pipeline '{name}'")
+             return self._config_cache[name]
+
+         logger.debug(f"Loading configuration for pipeline '{name}'")
+
+         # Load configuration from disk
+         config = PipelineConfig.load(
+             base_dir=self._base_dir,
+             name=name,
+             fs=self._fs,
+             storage_options=self._storage_options,
+         )
+
+         # Cache the configuration
+         self._config_cache[name] = config
+
+         return config
+
+     def load_module(self, name: str, reload: bool = False) -> Any:
+         """Load pipeline module from disk.
+
+         Args:
+             name: Name of the pipeline
+             reload: Whether to reload from disk even if cached
+
+         Returns:
+             Loaded Python module
+         """
+         # Use cache if available and not reloading
+         if not reload and name in self._module_cache:
+             logger.debug(f"Returning cached module for pipeline '{name}'")
+             return self._module_cache[name]
+
+         logger.debug(f"Loading module for pipeline '{name}'")
+
+         # Convert pipeline name to module name
+         formatted_name = name.replace(".", "/").replace("-", "_")
+         module_name = f"pipelines.{formatted_name}"
+
+         # Load the module
+         module = load_module(module_name, reload=reload)
+
+         # Cache the module
+         self._module_cache[name] = module
+
+         return module
+
+     def clear_cache(self, name: str | None = None):
+         """Clear cached pipelines, configurations, and modules.
+
+         Args:
+             name: If provided, clear cache only for this pipeline.
+                   If None, clear entire cache.
+         """
+         if name:
+             logger.debug(f"Clearing cache for pipeline '{name}'")
+             self._pipeline_cache.pop(name, None)
+             self._config_cache.pop(name, None)
+             self._module_cache.pop(name, None)
+         else:
+             logger.debug("Clearing entire pipeline cache")
+             self._pipeline_cache.clear()
+             self._config_cache.clear()
+             self._module_cache.clear()
+
      # --- Methods moved from PipelineManager ---
      def new(self, name: str, overwrite: bool = False):
          """
flowerpower/pipeline/visualizer.py CHANGED
@@ -1,12 +1,11 @@
  import posixpath
- from typing import Any

+ from fsspec_utils import AbstractFileSystem
  from hamilton import driver
  from rich import print

  # Import necessary config types and utility functions
  from ..cfg import PipelineConfig, ProjectConfig
- from ..fs import AbstractFileSystem
  from ..utils.misc import view_img
  from .base import load_module  # Import module loading utility

flowerpower/plugins/_io/__init__.py CHANGED
@@ -0,0 +1,8 @@
+ import warnings
+
+ warnings.warn(
+     "The flowerpower.plugins._io module is deprecated. "
+     "Please use 'flowerpower-io' instead. Install it with 'pip install flowerpower-io'.",
+     DeprecationWarning,
+     stacklevel=2,
+ )
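The old `flowerpower.plugins.io` package is deleted outright (files 43–74 in the list above); this new `_io` stub exists only to warn at import time. A quick way to observe the shim, assuming 0.20.0 is installed:

```python
import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    import flowerpower.plugins._io  # noqa: F401  # importing triggers the warning

# The shim emits a DeprecationWarning pointing at `pip install flowerpower-io`
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```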
flowerpower/plugins/mqtt/manager.py CHANGED
@@ -7,6 +7,7 @@ from types import TracebackType
  from typing import Any, Callable

  import mmh3
+ from fsspec_utils import AbstractFileSystem, BaseStorageOptions, filesystem
  from loguru import logger
  from munch import Munch
  from paho.mqtt.client import (MQTT_ERR_SUCCESS, CallbackAPIVersion, Client,
@@ -16,9 +17,7 @@ from paho.mqtt.reasoncodes import ReasonCode
  from ...cfg import ProjectConfig
  from ...cfg.pipeline.run import ExecutorConfig, WithAdapterConfig
  from ...cfg.project.adapter import AdapterConfig
- from ...fs import AbstractFileSystem, BaseStorageOptions, get_filesystem
  from ...pipeline.manager import PipelineManager
- from ...utils.callback import run_with_callback
  from ...utils.logging import setup_logging
  from .cfg import MqttConfig

@@ -132,8 +131,9 @@ class MqttManager:
          import os

          if fs is None:
-             fs = get_filesystem(
-                 path=os.path.dirname(path), storage_options=storage_options
+             fs = filesystem(
+                 protocol_or_path=os.path.dirname(path),
+                 storage_options=storage_options,
              )

          cfg = MqttConfig.from_yaml(path=os.path.basename(path), fs=fs)
@@ -637,7 +637,7 @@
              storage_options=storage_options, fs=fs, base_dir=base_dir
          ) as pipeline:
              if as_job:
-                 res = pipeline.add_job(
+                 pipeline.add_job(
                      name=name,
                      inputs=inputs,
                      final_vars=final_vars,
@@ -664,7 +664,7 @@
                  )

              else:
-                 res = pipeline.run(
+                 pipeline.run(
                      name=name,
                      inputs=inputs,
                      final_vars=final_vars,
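Two small things change in `MqttManager`: the filesystem is now built with `filesystem(protocol_or_path=...)` from `fsspec_utils` instead of the removed `get_filesystem(path=...)`, and the unused `res =` bindings around `pipeline.add_job(...)`/`pipeline.run(...)` are dropped. A hedged before/after sketch for callers that constructed the filesystem themselves (the config path is illustrative):

```python
import os
from fsspec_utils import filesystem

path = "conf/mqtt.yml"  # hypothetical config location

# 0.11.x:
# fs = get_filesystem(path=os.path.dirname(path), storage_options=None)

# 0.20.0:
fs = filesystem(
    protocol_or_path=os.path.dirname(path),
    storage_options=None,
)
```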
flowerpower/settings/backend.py CHANGED
@@ -1,5 +1,3 @@
- import os
-
  # Define backend properties in a dictionary for easier maintenance

  BACKEND_PROPERTIES = {
flowerpower/settings/job_queue.py CHANGED
@@ -1,7 +1,7 @@
  import os

  from .backend import BACKEND_PROPERTIES
- from .executor import EXECUTOR, EXECUTOR_MAX_WORKERS, EXECUTOR_NUM_CPUS
+ from .executor import EXECUTOR_NUM_CPUS

  # WORKER
  JOB_QUEUE_TYPE = os.getenv("FP_JOB_QUEUE_TYPE", "rq")
@@ -29,59 +29,3 @@ RQ_QUEUES = (
      .split(",")
  )
  RQ_NUM_WORKERS = int(os.getenv("FP_RQ_NUM_WORKERS", EXECUTOR_NUM_CPUS))
-
- # APS WORKER
- APS_BACKEND_DS = os.getenv("FP_APS_BACKEND_DS", "memory")
-
- APS_BACKEND_DS_HOST = os.getenv(
-     "FP_APS_BACKEND_DS_HOST",
-     BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_host", None),
- )
- APS_BACKEND_DS_PORT = int(
-     os.getenv(
-         "FP_APS_BACKEND_DS_PORT",
-         BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_port", 0),
-     )
- )
- APS_BACKEND_DS_DB = os.getenv(
-     "FP_APS_BACKEND_DS_DB",
-     BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_database", None),
- )
- APS_BACKEND_DS_USERNAME = os.getenv(
-     "FP_APS_BACKEND_DS_USERNAME",
-     BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_username", None),
- )
- APS_BACKEND_DS_PASSWORD = os.getenv(
-     "FP_APS_BACKEND_DS_PASSWORD",
-     BACKEND_PROPERTIES.get(APS_BACKEND_DS, {}).get("default_password", None),
- )
- APS_BACKEND_DS_SCHEMA = os.getenv("FP_APS_BACKEND_DS_SCHEMA", "flowerpower")
-
- APS_BACKEND_EB = os.getenv("FP_APS_BACKEND_EB", "memory")
- APS_BACKEND_EB_HOST = os.getenv(
-     "FP_APS_BACKEND_EB_HOST",
-     BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_host", None),
- )
- APS_BACKEND_EB_PORT = int(
-     os.getenv(
-         "FP_APS_BACKEND_EB_PORT",
-         BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_port", 0),
-     )
- )
- APS_BACKEND_EB_DB = os.getenv(
-     "FP_APS_BACKEND_EB_DB",
-     BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_database", None),
- )
- APS_BACKEND_EB_USERNAME = os.getenv(
-     "FP_APS_BACKEND_EB_USERNAME",
-     BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_username", None),
- )
- APS_BACKEND_EB_PASSWORD = os.getenv(
-     "FP_APS_BACKEND_EB_PASSWORD",
-     BACKEND_PROPERTIES.get(APS_BACKEND_EB, {}).get("default_password", None),
- )
-
- APS_CLEANUP_INTERVAL = int(os.getenv("FP_APS_CLEANUP_INTERVAL", 300))
- APS_MAX_CONCURRENT_JOBS = int(os.getenv("FP_APS_MAX_CONCURRENT_JOBS", 10))
- APS_DEFAULT_EXECUTOR = os.getenv("FP_APS_DEFAULT_EXECUTOR", EXECUTOR)
- APS_NUM_WORKERS = int(os.getenv("FP_APS_NUM_WORKERS", EXECUTOR_MAX_WORKERS))
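With the APScheduler backend gone, every `FP_APS_*` environment variable is now silently ignored; only the RQ settings remain. The survivors keep the same resolve-from-environment pattern, shown here in isolation (using `os.cpu_count()` as a stand-in for the real `EXECUTOR_NUM_CPUS` default, which is defined in `.executor`):

```python
import os

# Environment variable first, computed default second, int coercion last --
# the same shape as RQ_NUM_WORKERS in flowerpower/settings/job_queue.py.
EXECUTOR_NUM_CPUS = os.cpu_count() or 1  # stand-in default
RQ_NUM_WORKERS = int(os.getenv("FP_RQ_NUM_WORKERS", EXECUTOR_NUM_CPUS))
```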
flowerpower/utils/misc.py CHANGED
@@ -8,262 +8,6 @@ from typing import Any

  import msgspec

- if importlib.util.find_spec("pyarrow"):
-     import pyarrow as pa
-
-     def convert_large_types_to_standard(schema: pa.Schema) -> pa.Schema:
-         # Define mapping of large types to standard types
-         type_mapping = {
-             pa.large_string(): pa.string(),
-             pa.large_binary(): pa.binary(),
-             pa.large_list(pa.null()): pa.list_(pa.null()),
-         }
-
-         # Convert fields
-         new_fields = []
-         for field in schema:
-             field_type = field.type
-             # Check if type exists in mapping
-             if field_type in type_mapping:
-                 new_field = pa.field(
-                     name=field.name,
-                     type=type_mapping[field_type],
-                     nullable=field.nullable,
-                     metadata=field.metadata,
-                 )
-                 new_fields.append(new_field)
-             # Handle large lists with nested types
-             elif isinstance(field_type, pa.LargeListType):
-                 new_field = pa.field(
-                     name=field.name,
-                     type=pa.list_(field_type.value_type),
-                     nullable=field.nullable,
-                     metadata=field.metadata,
-                 )
-                 new_fields.append(new_field)
-             else:
-                 new_fields.append(field)
-
-         return pa.schema(new_fields)
-
-
- else:
-
-     def convert_large_types_to_standard(*args, **kwargs):
-         raise ImportError("pyarrow not installed")
-
-
- if importlib.util.find_spec("polars"):
-     import polars as pl
-
-     def _dict_to_dataframe(
-         data: dict | list[dict], unique: bool | list[str] | str = False
-     ) -> pl.DataFrame:
-         """
-         Convert a dictionary or list of dictionaries to a polars DataFrame.
-
-         Args:
-             data: (dict | list[dict]) Data to convert.
-
-         Returns:
-             pl.DataFrame: Converted data.
-
-         Examples:
-             >>> # Single dict with list values
-             >>> data = {'a': [1, 2, 3], 'b': [4, 5, 6]}
-             >>> _dict_to_dataframe(data)
-             shape: (3, 2)
-             ┌─────┬─────┐
-             │ a   ┆ b   │
-             │ --- ┆ --- │
-             │ i64 ┆ i64 │
-             ╞═════╪═════╡
-             │ 1   ┆ 4   │
-             │ 2   ┆ 5   │
-             │ 3   ┆ 6   │
-             └─────┴─────┘
-
-             >>> # Single dict with scalar values
-             >>> data = {'a': 1, 'b': 2}
-             >>> _dict_to_dataframe(data)
-             shape: (1, 2)
-             ┌─────┬─────┐
-             │ a   ┆ b   │
-             │ --- ┆ --- │
-             │ i64 ┆ i64 │
-             ╞═════╪═════╡
-             │ 1   ┆ 2   │
-             └─────┴─────┘
-
-             >>> # List of dicts with scalar values
-             >>> data = [{'a': 1, 'b': 2}, {'a': 3, 'b': 4}]
-             >>> _dict_to_dataframe(data)
-             shape: (2, 2)
-             ┌─────┬─────┐
-             │ a   ┆ b   │
-             │ --- ┆ --- │
-             │ i64 ┆ i64 │
-             ╞═════╪═════╡
-             │ 1   ┆ 2   │
-             │ 3   ┆ 4   │
-             └─────┴─────┘
-
-             >>> # List of dicts with list values
-             >>> data = [{'a': [1, 2], 'b': [3, 4]}, {'a': [5, 6], 'b': [7, 8]}]
-             >>> _dict_to_dataframe(data)
-             shape: (2, 2)
-             ┌───────┬───────┐
-             │ a     ┆ b     │
-             │ ---   ┆ ---   │
-             │ list  ┆ list  │
-             ╞═══════╪═══════╡
-             │ [1,2] ┆ [3,4] │
-             │ [5,6] ┆ [7,8] │
-             └───────┴───────┘
-         """
-         if isinstance(data, list):
-             # If it's a single-element list, just use the first element
-             if len(data) == 1:
-                 data = data[0]
-             # If it's a list of dicts
-             else:
-                 first_item = data[0]
-                 # Check if the dict values are lists/tuples
-                 if any(isinstance(v, (list, tuple)) for v in first_item.values()):
-                     # Each dict becomes a row with list/tuple values
-                     data = pl.DataFrame(data)
-                 else:
-                     # If values are scalars, convert list of dicts to DataFrame
-                     data = pl.DataFrame(data)
-
-                 if unique:
-                     data = data.unique(
-                         subset=None if not isinstance(unique, str | list) else unique,
-                         maintain_order=True,
-                     )
-                 return data
-
-         # If it's a single dict
-         if isinstance(data, dict):
-             # Check if values are lists/tuples
-             if any(isinstance(v, (list, tuple)) for v in data.values()):
-                 # Get the length of any list value (assuming all lists have same length)
-                 length = len(
-                     next(v for v in data.values() if isinstance(v, (list, tuple)))
-                 )
-                 # Convert to DataFrame where each list element becomes a row
-                 data = pl.DataFrame({
-                     k: v if isinstance(v, (list, tuple)) else [v] * length
-                     for k, v in data.items()
-                 })
-             else:
-                 # If values are scalars, wrap them in a list to create a single row
-                 data = pl.DataFrame({k: [v] for k, v in data.items()})
-
-             if unique:
-                 data = data.unique(
-                     subset=None if not isinstance(unique, str | list) else unique,
-                     maintain_order=True,
-                 )
-             return data
-
-         raise ValueError("Input must be a dictionary or list of dictionaries")
-
- else:
-
-     def _dict_to_dataframe(*args, **kwargs):
-         raise ImportError("polars not installed")
-
-
- if (
-     importlib.util.find_spec("pandas")
-     and importlib.util.find_spec("polars")
-     and importlib.util.find_spec("pyarrow")
- ):
-     from typing import Generator
-
-     import pandas as pd
-
-     def to_pyarrow_table(
-         data: pl.DataFrame
-         | pl.LazyFrame
-         | pd.DataFrame
-         | dict
-         | list[pl.DataFrame | pl.LazyFrame | pd.DataFrame | dict],
-         concat: bool = False,
-         unique: bool | list[str] | str = False,
-     ) -> pa.Table:
-         if isinstance(data, dict):
-             data = _dict_to_dataframe(data)
-         if isinstance(data, list):
-             if isinstance(data[0], dict):
-                 data = _dict_to_dataframe(data, unique=unique)
-
-         if not isinstance(data, list):
-             data = [data]
-
-         if isinstance(data[0], pl.LazyFrame):
-             data = [dd.collect() for dd in data]
-
-         if isinstance(data[0], pl.DataFrame):
-             if concat:
-                 data = pl.concat(data, how="diagonal_relaxed")
-                 if unique:
-                     data = data.unique(
-                         subset=None if not isinstance(unique, str | list) else unique,
-                         maintain_order=True,
-                     )
-                 data = data.to_arrow()
-                 data = data.cast(convert_large_types_to_standard(data.schema))
-             else:
-                 data = [dd.to_arrow() for dd in data]
-                 data = [
-                     dd.cast(convert_large_types_to_standard(dd.schema)) for dd in data
-                 ]
-
-         elif isinstance(data[0], pd.DataFrame):
-             data = [pa.Table.from_pandas(dd, preserve_index=False) for dd in data]
-             if concat:
-                 data = pa.concat_tables(data, promote_options="permissive")
-                 if unique:
-                     data = (
-                         pl.from_arrow(data)
-                         .unique(
-                             subset=None
-                             if not isinstance(unique, str | list)
-                             else unique,
-                             maintain_order=True,
-                         )
-                         .to_arrow()
-                     )
-                 data = data.cast(convert_large_types_to_standard(data.schema))
-
-         elif isinstance(data[0], pa.RecordBatch | pa.RecordBatchReader | Generator):
-             if concat:
-                 data = pa.Table.from_batches(data)
-                 if unique:
-                     data = (
-                         pl.from_arrow(data)
-                         .unique(
-                             subset=None
-                             if not isinstance(unique, str | list)
-                             else unique,
-                             maintain_order=True,
-                         )
-                         .to_arrow()
-                     )
-                 data = data.cast(convert_large_types_to_standard(data.schema))
-             else:
-                 data = [pa.Table.from_batches([dd]) for dd in data]
-
-         return data
-
- else:
-
-     def to_pyarrow_table(*args, **kwargs):
-         raise ImportError("pandas, polars, or pyarrow not installed")
-
-
  if importlib.util.find_spec("joblib"):
      from joblib import Parallel, delayed
      from rich.progress import (BarColumn, Progress, TextColumn,
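The optional-dependency helpers `convert_large_types_to_standard`, `_dict_to_dataframe`, and `to_pyarrow_table` are removed from `flowerpower.utils.misc`; this diff does not show a replacement. Callers that still need the large-type downcast can replicate the core of the removed helper directly with pyarrow; a minimal sketch (the function name is ours, not the library's):

```python
import pyarrow as pa

def downcast_large_types(schema: pa.Schema) -> pa.Schema:
    """Map large_string/large_binary/large_list fields to their standard
    counterparts, mirroring the removed helper."""
    fields = []
    for field in schema:
        t = field.type
        if pa.types.is_large_string(t):
            t = pa.string()
        elif pa.types.is_large_binary(t):
            t = pa.binary()
        elif pa.types.is_large_list(t):
            t = pa.list_(t.value_type)
        fields.append(field.with_type(t))
    return pa.schema(fields)

table = pa.table({"s": pa.array(["a", "b"], type=pa.large_string())})
print(table.cast(downcast_large_types(table.schema)).schema)  # s: string
```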