FlowerPower 0.11.6.20__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (80)
  1. flowerpower/cfg/__init__.py +3 -3
  2. flowerpower/cfg/pipeline/__init__.py +5 -3
  3. flowerpower/cfg/project/__init__.py +3 -3
  4. flowerpower/cfg/project/job_queue.py +1 -128
  5. flowerpower/cli/__init__.py +5 -5
  6. flowerpower/cli/cfg.py +0 -3
  7. flowerpower/cli/job_queue.py +400 -132
  8. flowerpower/cli/pipeline.py +14 -413
  9. flowerpower/cli/utils.py +0 -1
  10. flowerpower/flowerpower.py +537 -28
  11. flowerpower/job_queue/__init__.py +5 -94
  12. flowerpower/job_queue/base.py +201 -3
  13. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
  14. flowerpower/job_queue/rq/manager.py +388 -77
  15. flowerpower/pipeline/__init__.py +2 -0
  16. flowerpower/pipeline/base.py +2 -2
  17. flowerpower/pipeline/io.py +14 -16
  18. flowerpower/pipeline/manager.py +21 -642
  19. flowerpower/pipeline/pipeline.py +571 -0
  20. flowerpower/pipeline/registry.py +242 -10
  21. flowerpower/pipeline/visualizer.py +1 -2
  22. flowerpower/plugins/_io/__init__.py +8 -0
  23. flowerpower/plugins/mqtt/manager.py +6 -6
  24. flowerpower/settings/backend.py +0 -2
  25. flowerpower/settings/job_queue.py +1 -57
  26. flowerpower/utils/misc.py +0 -256
  27. flowerpower/utils/monkey.py +1 -83
  28. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
  29. flowerpower-0.20.0.dist-info/RECORD +58 -0
  30. flowerpower/fs/__init__.py +0 -29
  31. flowerpower/fs/base.py +0 -662
  32. flowerpower/fs/ext.py +0 -2143
  33. flowerpower/fs/storage_options.py +0 -1420
  34. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  35. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  36. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  37. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  38. flowerpower/job_queue/apscheduler/setup.py +0 -554
  39. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  40. flowerpower/job_queue/apscheduler/utils.py +0 -311
  41. flowerpower/pipeline/job_queue.py +0 -583
  42. flowerpower/pipeline/runner.py +0 -603
  43. flowerpower/plugins/io/base.py +0 -2520
  44. flowerpower/plugins/io/helpers/datetime.py +0 -298
  45. flowerpower/plugins/io/helpers/polars.py +0 -875
  46. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  47. flowerpower/plugins/io/helpers/sql.py +0 -202
  48. flowerpower/plugins/io/loader/__init__.py +0 -28
  49. flowerpower/plugins/io/loader/csv.py +0 -37
  50. flowerpower/plugins/io/loader/deltatable.py +0 -190
  51. flowerpower/plugins/io/loader/duckdb.py +0 -19
  52. flowerpower/plugins/io/loader/json.py +0 -37
  53. flowerpower/plugins/io/loader/mqtt.py +0 -159
  54. flowerpower/plugins/io/loader/mssql.py +0 -26
  55. flowerpower/plugins/io/loader/mysql.py +0 -26
  56. flowerpower/plugins/io/loader/oracle.py +0 -26
  57. flowerpower/plugins/io/loader/parquet.py +0 -35
  58. flowerpower/plugins/io/loader/postgres.py +0 -26
  59. flowerpower/plugins/io/loader/pydala.py +0 -19
  60. flowerpower/plugins/io/loader/sqlite.py +0 -23
  61. flowerpower/plugins/io/metadata.py +0 -244
  62. flowerpower/plugins/io/saver/__init__.py +0 -28
  63. flowerpower/plugins/io/saver/csv.py +0 -36
  64. flowerpower/plugins/io/saver/deltatable.py +0 -186
  65. flowerpower/plugins/io/saver/duckdb.py +0 -19
  66. flowerpower/plugins/io/saver/json.py +0 -36
  67. flowerpower/plugins/io/saver/mqtt.py +0 -28
  68. flowerpower/plugins/io/saver/mssql.py +0 -26
  69. flowerpower/plugins/io/saver/mysql.py +0 -26
  70. flowerpower/plugins/io/saver/oracle.py +0 -26
  71. flowerpower/plugins/io/saver/parquet.py +0 -36
  72. flowerpower/plugins/io/saver/postgres.py +0 -26
  73. flowerpower/plugins/io/saver/pydala.py +0 -20
  74. flowerpower/plugins/io/saver/sqlite.py +0 -24
  75. flowerpower/utils/scheduler.py +0 -311
  76. flowerpower-0.11.6.20.dist-info/RECORD +0 -102
  77. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
  78. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
  79. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
  80. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0

flowerpower/plugins/io/loader/mysql.py
@@ -1,26 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatabaseReader
-
-
- # @attrs.define
- class MySQLReader(BaseDatabaseReader, gc=False):
-     """MySQL loader.
-
-     This class is responsible for loading dataframes from MySQL database.
-
-     Examples:
-         ```python
-         loader = MySQLReader(table_name="table", host="localhost",
-             port=5432, username="user", password="password",
-             database="database")
-         df = loader.to_polars()
-
-         # or
-         loader = MySQLReader(table_name="table",
-             connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-         df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-         ```
-     """
-
-     type_: str = field(default="mysql")

flowerpower/plugins/io/loader/oracle.py
@@ -1,26 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatabaseReader
-
-
- # @attrs.define
- class OracleDBReader(BaseDatabaseReader, gc=False):
-     """OracleDB loader.
-
-     This class is responsible for loading dataframes from OracleDB database.
-
-     Examples:
-         ```python
-         loader = OracleDBReader(table_name="table", host="localhost",
-             port=5432, username="user", password="password",
-             database="database")
-         df = loader.to_polars()
-
-         # or
-         loader = OracleDBReader(table_name="table",
-             connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-         df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-         ```
-     """
-
-     type_: str = field(default="oracle")

flowerpower/plugins/io/loader/parquet.py
@@ -1,35 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatasetReader, BaseFileReader
-
-
- # @attrs.define
- class ParquetFileReader(BaseFileReader, gc=False):
-     """Parquet file loader.
-
-     This class is responsible for loading dataframes from Parquet files.
-
-     Examples:
-         ```python
-         loader = ParquetFileReader("data.parquet")
-         df = loader.load()
-         ```
-     """
-
-     format: str = field(default="parquet")
-
-
- # @attrs.define
- class ParquetDatasetReader(BaseDatasetReader, gc=False):
-     """Parquet dataset loader.
-
-     This class is responsible for loading dataframes from Parquet dataset.
-
-     Examples:
-         ```python
-         loader = ParquetDatasetReader("parquet_data/")
-         df = loader.load()
-         ```
-     """
-
-     format: str = field(default="parquet")

flowerpower/plugins/io/loader/postgres.py
@@ -1,26 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatabaseReader
-
-
- # @attrs.define
- class PostgreSQLReader(BaseDatabaseReader, gc=False):
-     """PostgreSQL loader.
-
-     This class is responsible for loading dataframes from PostgreSQL database.
-
-     Examples:
-         ```python
-         loader = PostgreSQLReader(table_name="table", host="localhost",
-             port=5432, username="user", password="password",
-             database="database")
-         df = loader.to_polars()
-
-         # or
-         loader = PostgreSQLReader(table_name="table",
-             connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-         df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-         ```
-     """
-
-     type_: str = field(default="postgres")

flowerpower/plugins/io/loader/pydala.py
@@ -1,19 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatasetReader
-
-
- # @attrs.define
- class PydalaDatasetReader(BaseDatasetReader, gc=False):
-     """Pydala dataset loader.
-
-     This class is responsible for loading dataframes from Pydala dataset.
-
-     Examples:
-         ```python
-         loader = PydalaDatasetReader("pydala_data/")
-         df = loader.load()
-         ```
-     """
-
-     format: str = field(default="parquet")

flowerpower/plugins/io/loader/sqlite.py
@@ -1,23 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatabaseReader
-
-
- # @attrs.define
- class SQLiteReader(BaseDatabaseReader, gc=False):
-     """SQLite loader.
-
-     This class is responsible for loading dataframes from SQLite database.
-
-     Examples:
-         ```python
-         loader = SQLiteReader(table_name="table", path="data.db")
-         df = loader.to_polars("SELECT * FROM table WHERE column = 'value'")
-
-         # or
-         loader = SQLiteReader(table_name="table", connection_string="sqlite://data.db")
-         df = loader.to_pyarrow_table()
-         ```
-     """
-
-     type_: str = field(default="sqlite")

flowerpower/plugins/io/metadata.py
@@ -1,244 +0,0 @@
- import datetime as dt
- import importlib
- import os
-
- import duckdb
- import pandas as pd
- import polars as pl
- import pyarrow as pa
- import pyarrow.dataset as pds
- from deltalake import DeltaTable
- from fsspec import AbstractFileSystem
-
- from ...fs.ext import path_to_glob
-
-
- def get_serializable_schema(
-     data: (
-         pd.DataFrame
-         | pl.DataFrame
-         | pl.LazyFrame
-         | duckdb.DuckDBPyRelation
-         | pa.Table
-         | pa.Schema
-         | pa.RecordBatch
-         | pa.RecordBatchReader
-         | pds.Dataset
-     ),
- ) -> dict[str, str]:
-     """
-     Convert DataFrame dtypes to a serializable dictionary.
-
-     Args:
-         data: DataFrame
-
-     Returns:
-         dict mapping column names to dtype strings
-     """
-     if isinstance(data, pd.DataFrame):
-         return {col: str(dtype) for col, dtype in data.dtypes.items()}
-     elif isinstance(data, pl.DataFrame):
-         return data.schema.to_python()
-     elif isinstance(data, pl.LazyFrame):
-         return data.collect_schema().to_python()
-     elif isinstance(data, duckdb.DuckDBPyRelation):
-         return dict(zip(data.columns, [str(dtype) for dtype in data.types]))
-     elif isinstance(
-         data, pa.Table | pa.RecordBatch | pa.RecordBatchReader | pds.Dataset
-     ):
-         return dict(zip(data.schema.names, [str(dtype) for dtype in data.schema.types]))
-     elif isinstance(data, pa.Schema):
-         return dict(zip(data.names, [str(dtype) for dtype in data.types]))
-
-
- def get_dataframe_metadata(
-     df: pd.DataFrame
-     | pl.DataFrame
-     | pl.LazyFrame
-     | pa.Table
-     | pa.RecordBatch
-     | pa.RecordBatchReader
-     | list[
-         pd.DataFrame
-         | pl.DataFrame
-         | pl.LazyFrame
-         | pa.Table
-         | pa.RecordBatch
-         | pa.RecordBatchReader
-     ],
-     path: str | list[str] | None = None,
-     format: str | None = None,
-     topic: str | None = None,
-     num_files: int | None = None,
-     partition_columns: list[str] | None = None,
-     fs: AbstractFileSystem | None = None,
-     **kwargs,
- ) -> dict:
-     """
-     Get metadata for a DataFrame.
-
-     Args:
-         df: DataFrame
-         path: Path to the file(s) that the DataFrame was loaded from
-         fs: Optional filesystem
-         kwargs: Additional metadata fields
-
-     Returns:
-         dict: DataFrame metadata
-     """
-     if isinstance(df, list):
-         schema = get_serializable_schema(df[0])
-         num_rows = sum(df_.shape[0] for df_ in df)
-     else:
-         schema = get_serializable_schema(df)
-         num_rows = df.shape[0] if hasattr(df, "shape") else None
-
-     if path is not None and num_files is None:
-         if isinstance(path, list):
-             num_files = len(path)
-         else:
-             path_ = path_to_glob(path=path, format=format)
-             num_files = len(fs.glob(path_)) if fs is not None else None
-
-     if partition_columns is not None:
-         schema = {k: v for k, v in schema.items() if k not in partition_columns}
-
-     metadata = {
-         "path": path,
-         "topic": topic,
-         "format": format,
-         "timestamp": int(dt.datetime.now().timestamp() * 1000),
-         "schema": schema,
-         "partition_columns": partition_columns,
-         "num_columns": len(schema),
-         "num_rows": num_rows,
-         "num_files": num_files,
-     }
-     metadata.update(kwargs)
-     return {k: v for k, v in metadata.items() if v is not None}
-
-
- def get_duckdb_metadata(
-     rel: duckdb.DuckDBPyRelation,
-     path: str,
-     format: str,
-     fs: AbstractFileSystem | None = None,
-     include_shape: bool = False,
-     include_num_files: bool = False,
-     partition_columns: list[str] | None = None,
-     **kwargs,
- ) -> dict:
-     """
-     Get metadata for a DuckDBPyRelation.
-
-     Args:
-         rel: DuckDBPyRelation
-         path: Path to the file(s) that the DuckDBPyRelation was loaded from
-         fs: Filesystem
-         include_shape: Include shape in metadata
-         include_num_files: Include number of files in metadata
-         kwargs: Additional metadata fields
-
-     Returns:
-         dict: DuckDBPyRelation metadata
-     """
-
-     schema = get_serializable_schema(rel)
-     if include_shape:
-         shape = rel.shape
-     else:
-         shape = None
-     if partition_columns is not None:
-         schema = {k: v for k, v in schema.items() if k not in partition_columns}
-
-     metadata = {
-         "path": path,
-         "format": format,
-         "timestamp": dt.datetime.now().timestamp(),
-         "schema": schema,
-         "partition_columns": partition_columns,
-         "num_columns": shape[1] if shape else None,
-         "num_rows": shape[0] if shape else None,
-         "num_files": len(fs.glob(path)) if include_num_files else None,
-     }
-     metadata.update(kwargs)
-     return {k: v for k, v in metadata.items() if v is not None}
-
-
- def get_pyarrow_dataset_metadata(
-     ds: pds.Dataset,
-     path: str,
-     format: str,
-     **kwargs,
- ) -> dict:
-     schema = get_serializable_schema(ds.schema)
-     files = ds.files
-
-     metadata = {
-         "path": path or os.path.dirname(files[0]),
-         "format": format,
-         "timestamp": dt.datetime.now().timestamp(),
-         "schema": schema,
-         "partition_columns": ds.partitioning.schema.names if ds.partitioning else None,
-         "num_columns": len(ds.schema),
-         "num_rows": None,
-         "num_files": len(files),
-     }
-     metadata.update(kwargs)
-     return metadata
-
-
- def get_delta_metadata(
-     dtable: DeltaTable,
-     path: str,
-     **kwargs,
- ) -> dict:
-     dt_meta = dtable.metadata()
-     dt_schema = dtable.schema().to_pyarrow()
-     metadata = {
-         "path": path,
-         "format": "delta",
-         "timestamp": dt.datetime.now().timestamp(),
-         "schema": dict(zip(dt_schema.names, [str(x) for x in dt_schema.types])),
-         "partition_columns": dt_meta.partition_columns
-         if hasattr(dt_meta, "partition_columns")
-         else None,
-         "num_columns": len(dt_schema),
-         "num_files": len(dtable.files()),
-         "name": dt_meta.name or kwargs.get("name", None),
-         "description": dt_meta.description or kwargs.get("description", None),
-         "id": dt_meta.id or kwargs.get("id", None),
-     }
-
-     return {k: v for k, v in metadata.items() if v is not None}
-
-
- if importlib.util.find_spec("orjson"):
-     import orjson
-
-     def get_mqtt_metadata(
-         payload: bytes | dict[str, any],
-         topic: str | None = None,
-         **kwargs,
-     ) -> dict:
-         if isinstance(payload, bytes):
-             payload = orjson.loads(payload)
-
-         schema = get_serializable_schema(payload)
-         metadata = {
-             "topic": topic,
-             "format": "mqtt",
-             "timestamp": dt.datetime.now().timestamp(),
-             "schema": schema,
-             "num_columns": len(schema),
-             "num_rows": len(payload),
-             "name": kwargs.get("name", None),
-             "description": kwargs.get("description", None),
-             "id": kwargs.get("id", None),
-         }
-         return metadata
-
- else:
-
-     def get_mqtt_metadata(*args, **kwargs):
-         raise ImportError("orjson not installed")
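
The removed `get_serializable_schema` helper above reduces any supported frame-like object to a plain `{column: dtype-string}` mapping that can be embedded in the metadata dicts returned by the other helpers. A minimal standalone sketch of the same idea, restricted to pandas and pyarrow inputs (the `serializable_schema` function below is illustrative only and is not part of either package version):

```python
import pandas as pd
import pyarrow as pa


def serializable_schema(data) -> dict[str, str]:
    """Map column names to dtype strings, as the removed helper did."""
    if isinstance(data, pd.DataFrame):
        return {col: str(dtype) for col, dtype in data.dtypes.items()}
    if isinstance(data, (pa.Table, pa.RecordBatch)):
        return dict(zip(data.schema.names, map(str, data.schema.types)))
    if isinstance(data, pa.Schema):
        return dict(zip(data.names, map(str, data.types)))
    raise TypeError(f"unsupported type: {type(data)!r}")


print(serializable_schema(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]})))
# -> {'id': 'int64', 'name': 'object'}
```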

flowerpower/plugins/io/saver/__init__.py
@@ -1,28 +0,0 @@
- from .csv import CSVDatasetWriter, CSVFileWriter
- from .deltatable import DeltaTableWriter
- from .duckdb import DuckDBWriter
- from .json import JsonDatasetWriter, JsonFileWriter
- from .mssql import MSSQLWriter
- from .mysql import MySQLWriter
- from .oracle import OracleDBWriter
- from .parquet import ParquetDatasetWriter, ParquetFileWriter
- from .postgres import PostgreSQLWriter
- from .pydala import PydalaDatasetWriter
- from .sqlite import SQLiteWriter
-
- __all__ = [
-     "CSVFileWriter",
-     "CSVDatasetWriter",
-     "DeltaTableWriter",
-     "DuckDBWriter",
-     "JsonFileWriter",
-     "JsonDatasetWriter",
-     "MSSQLWriter",
-     "MySQLWriter",
-     "OracleDBWriter",
-     "ParquetFileWriter",
-     "ParquetDatasetWriter",
-     "PostgreSQLWriter",
-     "PydalaDatasetWriter",
-     "SQLiteWriter",
- ]

flowerpower/plugins/io/saver/csv.py
@@ -1,36 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatasetWriter, BaseFileWriter
-
-
- # @attrs.define
- class CSVFileWriter(BaseFileWriter, gc=False):
-     """CSV file writer.
-
-     This class is responsible for writing dataframes to CSV files.
-
-     Examples:
-         ```python
-         writer = CSVFileWriter(df, "data.csv")
-         writer.write()
-         ```
-     """
-
-     format: str = field(default="csv")
-
-
- # @attrs.define
- class CSVDatasetWriter(BaseDatasetWriter, gc=False):
-     """CSV dataset writer.
-
-     This class is responsible for writing dataframes to CSV dataset.
-
-     Examples:
-         ```python
-         writer = CSVDatasetWriter(df, "csv_data/")
-         writer.write()
-         ```
-
-     """
-
-     format: str = field(default="csv")

flowerpower/plugins/io/saver/deltatable.py
@@ -1,186 +0,0 @@
- from typing import Any
-
- import pandas as pd
- import polars as pl
- import pyarrow as pa
- from deltalake.transaction import CommitProperties, PostCommitHookProperties
- from deltalake.writer import (ColumnProperties, WriterProperties,
-                               write_deltalake)
- from msgspec import field
- from redis import Redis, StrictRedis
- from sherlock import RedisLock
-
- from ....utils.misc import _dict_to_dataframe
- from ..base import BaseDatasetWriter
- from ..metadata import get_dataframe_metadata
-
-
- # @attrs.define
- class DeltaTableWriter(BaseDatasetWriter, gc=False):
-     """Delta table writer.
-
-     This class is responsible for writing dataframes to Delta tables.
-
-     Examples:
-         ```python
-         writer = DeltaTableWriter("data/")
-         writer.write(df)
-         ```
-     """
-
-     description: str | None = None
-     with_lock: bool = False
-     redis: StrictRedis | Redis | None = None
-     format: str = field(default="delta")
-
-     def __post_init__(self):
-         super().__post_init__()
-         if self.with_lock and self.redis is None:
-             raise ValueError("Redis connection is required when using locks.")
-
-     def write(
-         self,
-         data: (
-             pl.DataFrame
-             | pl.LazyFrame
-             | pa.Table
-             | pa.RecordBatch
-             | pa.RecordBatchReader
-             | pd.DataFrame
-             | dict[str, Any]
-             | list[
-                 pl.DataFrame
-                 | pl.LazyFrame
-                 | pa.Table
-                 | pa.RecordBatch
-                 | pa.RecordBatchReader
-                 | pd.DataFrame
-                 | dict[str, Any]
-             ]
-         )
-         | None = None,
-         mode: str = "append", # "overwrite" | "append" | "error | "ignore"
-         # schema: pa.Schema | None = None,
-         schema_mode: str | None = None, # "merge" | "overwrite"
-         partition_by: list[str] | None = None,
-         # partition_filters: list[tuple[str, str, Any]] | None = None,
-         predicate: str | None = None,
-         target_file_size: int | None = None,
-         # large_dtypes: bool = False,
-         # custom_metadata: dict[str, Any] | None = None,
-         post_commithook_properties: PostCommitHookProperties | None = None,
-         commit_properties: CommitProperties | None = None,
-         # writerproperties
-         data_page_size_limit: int | None = None,
-         dictionary_page_size_limit: int | None = None,
-         data_page_row_count_limit: int | None = None,
-         write_batch_size: int | None = None,
-         max_row_group_size: int | None = None,
-         compression: str | None = None,
-         compression_level: int | None = None,
-         statistics_truncate_length: int | None = None,
-         default_column_properties: ColumnProperties | None = None,
-         column_properties: dict[str, ColumnProperties] | None = None,
-     ) -> dict[str, Any]:
-         """
-         Write data to a Delta table.
-
-         Args:
-             data: Data to write
-             mode: Write mode
-             schema: Schema of the data
-             schema_mode: Schema mode
-             partition_by: Columns to partition by
-             partition_filters: Filters to apply to the partitions
-             predicate: Predicate to apply to the data
-             target_file_size: Target file size
-             large_dtypes: Whether to use large dtypes
-             custom_metadata: Custom metadata
-             post_commithook_properties: Post-commit hook properties
-             commit_properties: Commit properties
-             data_page_size_limit: Data page size limit
-             dictionary_page_size_limit: Dictionary page size limit
-             data_page_row_count_limit: Data page row count limit
-             write_batch_size: Write batch size
-             max_row_group_size: Maximum row group size
-             compression: Compression method
-             compression_level: Compression level
-             statistics_truncate_length: Statistics truncate length
-             default_column_properties: Default column properties
-             column_properties: Column properties
-
-         Returns:
-             Metadata
-         """
-         if data is None:
-             data = self.data
-         if isinstance(data, dict):
-             data = _dict_to_dataframe(data)
-         if not isinstance(data, list):
-             data = [data]
-         if isinstance(data[0], dict):
-             data = [_dict_to_dataframe(d) for d in data]
-         if isinstance(data[0], pl.LazyFrame):
-             data = [d.collect() for d in data]
-         if isinstance(data[0], pl.DataFrame):
-             data = pl.concat(data, how="diagonal_relaxed").to_arrow()
-         if isinstance(data[0], pd.DataFrame):
-             data = pa.concat_tables(
-                 [pa.Table.from_pandas(d, preserve_index=False) for d in data],
-                 promote_options="permissive",
-             )
-         if isinstance(data[0], pa.RecordBatch | pa.RecordBatchReader):
-             data = pa.Table.from_batches(data)
-         if isinstance(data[0], pa.Table):
-             data = pa.concat_tables(data, promote_options="permissive")
-
-         metadata = get_dataframe_metadata(
-             data, path=self._base_path, format=self.format
-         )
-
-         writer_properties = WriterProperties(
-             data_page_size_limit=data_page_size_limit,
-             dictionary_page_size_limit=dictionary_page_size_limit,
-             data_page_row_count_limit=data_page_row_count_limit,
-             write_batch_size=write_batch_size,
-             max_row_group_size=max_row_group_size or self.row_group_size,
-             compression=compression or self.compression.upper(),
-             compression_level=compression_level,
-             statistics_truncate_length=statistics_truncate_length,
-             default_column_properties=default_column_properties,
-             column_properties=column_properties,
-         )
-
-         def _write():
-             write_deltalake(
-                 self._base_path,
-                 data,
-                 mode=mode,
-                 # schema=schema or self.schema_,
-                 partition_by=partition_by or self.partition_by,
-                 storage_options=self.storage_options.to_object_store_kwargs(),
-                 description=self.description,
-                 schema_mode=schema_mode,
-                 # partition_filters=partition_filters,
-                 predicate=predicate,
-                 target_file_size=target_file_size,
-                 # large_dtypes=large_dtypes,
-                 # custom_metadata=custom_metadata,
-                 post_commithook_properties=post_commithook_properties,
-                 commit_properties=commit_properties,
-                 writer_properties=writer_properties,
-             )
-
-         if self.with_lock:
-             with RedisLock(
-                 lock_name=self._base_path,
-                 namespace="flowerpower",
-                 client=self.redis,
-                 expire=10,
-                 timeout=5,
-                 retry_interval=0.1,
-             ):
-                 _write()
-         else:
-             _write()
-         return metadata
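
The removed `DeltaTableWriter.write` above first normalizes its input (polars, pandas, or Arrow objects, singly or in lists) into a single `pyarrow.Table` before handing it to `write_deltalake`, optionally serializing writers with a Redis lock. A standalone sketch of that normalization step only, assuming recent polars and pyarrow (`promote_options` requires pyarrow 14+); the `to_arrow_table` helper below is illustrative and not part of either package version:

```python
import pandas as pd
import polars as pl
import pyarrow as pa


def to_arrow_table(frames) -> pa.Table:
    """Collect one frame or a list of frames into a single pyarrow Table."""
    frames = frames if isinstance(frames, list) else [frames]
    if isinstance(frames[0], pl.LazyFrame):
        frames = [f.collect() for f in frames]
    if isinstance(frames[0], pl.DataFrame):
        # diagonal_relaxed tolerates mismatched columns/dtypes across frames
        return pl.concat(frames, how="diagonal_relaxed").to_arrow()
    if isinstance(frames[0], pd.DataFrame):
        return pa.concat_tables(
            [pa.Table.from_pandas(f, preserve_index=False) for f in frames],
            promote_options="permissive",
        )
    if isinstance(frames[0], pa.RecordBatch):
        return pa.Table.from_batches(frames)
    return pa.concat_tables(frames, promote_options="permissive")


table = to_arrow_table([pl.DataFrame({"a": [1]}), pl.DataFrame({"a": [2], "b": ["x"]})])
print(table.num_rows)  # 2
```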

flowerpower/plugins/io/saver/duckdb.py
@@ -1,19 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatabaseWriter
-
-
- # @attrs.define
- class DuckDBWriter(BaseDatabaseWriter, gc=False):
-     """DuckDB writer.
-
-     This class is responsible for writing dataframes to DuckDB database.
-
-     Examples:
-         ```python
-         writer = DuckDBWriter(table_name="table", path="data.db")
-         writer.write(df)
-         ```
-     """
-
-     type_: str = field(default="duckdb")