sibi-dst 2025.9.3-py3-none-any.whl → 2025.9.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +6 -4
- sibi_dst/df_helper/__init__.py +1 -0
- sibi_dst/df_helper/_parquet_artifact.py +533 -113
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -281
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +349 -142
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -0
- sibi_dst/tests/test_baseclass.py +403 -0
- sibi_dst/utils/base.py +0 -254
- sibi_dst/utils/boilerplate/__init__.py +4 -1
- sibi_dst/utils/boilerplate/hybrid_data_loader.py +144 -0
- sibi_dst/utils/data_wrapper.py +460 -61
- sibi_dst/utils/parquet_saver.py +403 -161
- sibi_dst/utils/update_planner.py +553 -319
- sibi_dst/utils/write_gatekeeper.py +18 -0 (see the sketch after this list)
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/METADATA +2 -2
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/RECORD +17 -14
- {sibi_dst-2025.9.3.dist-info → sibi_dst-2025.9.5.dist-info}/WHEEL +0 -0
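The rewritten parquet_saver.py (shown in full below) coordinates concurrent writes through the new sibi_dst/utils/write_gatekeeper.py (+18/-0, apparently added in this release). All the diff reveals about that module is its usage: get_write_sem(key, max) returns an object used as a context manager so that at most write_gate_max save_to_parquet calls can target the same gate key at once. The gatekeeper's own code is not part of this diff; the following is only a hypothetical sketch of the shape such a keyed-semaphore registry could take, inferred from how parquet_saver.py calls it.

# Hypothetical sketch; sibi_dst/utils/write_gatekeeper.py itself is not reproduced in
# this diff. parquet_saver.py only relies on get_write_sem(key, max) returning a
# context manager that caps how many writers may target the same key concurrently.
import threading
from typing import Dict

_registry_lock = threading.Lock()
_semaphores: Dict[str, threading.BoundedSemaphore] = {}


def get_write_sem(key: str, max_concurrent: int) -> threading.BoundedSemaphore:
    """Return a process-wide semaphore shared by every writer using the same key."""
    with _registry_lock:
        sem = _semaphores.get(key)
        if sem is None:
            # The first caller fixes the limit for this key; later callers reuse it.
            sem = threading.BoundedSemaphore(max_concurrent)
            _semaphores[key] = sem
        return sem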
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -1,224 +1,466 @@
The file was rewritten wholesale (224 lines removed, 466 added). The new module is reproduced below; the previous implementation is retained at the end of the new file as a commented-out reference block.

from __future__ import annotations

import warnings
from concurrent.futures import ThreadPoolExecutor
from contextlib import contextmanager
from functools import partial
from multiprocessing.pool import ThreadPool
from typing import Any, Dict, Optional

import dask
import dask.dataframe as dd
import pandas as pd
import pyarrow as pa

from . import ManagedResource
from .write_gatekeeper import get_write_sem

warnings.filterwarnings("ignore", message="Passing 'overwrite=True' to to_parquet is deprecated")


def _coerce_partition(pdf: pd.DataFrame, target: Dict[str, pa.DataType]) -> pd.DataFrame:
    """
    Applies type conversions to a single pandas partition.
    This function is defined at module level to ensure Dask serialization compatibility.
    """
    for col, pa_type in target.items():
        if col not in pdf.columns:
            continue

        try:
            current_dtype_str = str(pdf[col].dtype)
            if pa.types.is_string(pa_type) and current_dtype_str != "string[pyarrow]":
                pdf[col] = pdf[col].astype("string[pyarrow]")
            elif pa.types.is_boolean(pa_type) and current_dtype_str != "boolean[pyarrow]":
                pdf[col] = pdf[col].astype("boolean[pyarrow]")
            elif pa.types.is_integer(pa_type) and current_dtype_str != "int64[pyarrow]":
                pdf[col] = pd.to_numeric(pdf[col], errors="coerce").astype("int64[pyarrow]")
            elif pa.types.is_floating(pa_type) and current_dtype_str != "float64[pyarrow]":
                pdf[col] = pd.to_numeric(pdf[col], errors="coerce").astype("float64[pyarrow]")
            elif pa.types.is_timestamp(pa_type):
                if hasattr(pdf[col].dtype, "pyarrow_dtype") and pa.types.is_timestamp(pdf[col].dtype.pyarrow_dtype):
                    pdf[col] = pdf[col].astype("datetime64[ns]")
                pdf[col] = pd.to_datetime(pdf[col], errors="coerce")
                pdf[col] = pdf[col].astype("timestamp[ns][pyarrow]")
        except Exception:
            pass
    return pdf


class ParquetSaver(ManagedResource):
    """
    Production-grade Dask → Parquet writer with bounded concurrency.
    This version is refactored to be fully pyarrow-aware, ensuring metadata
    consistency from data source to parquet sink.
    """
    logger_extra = {"sibi_dst_component": __name__}

    def __init__(
        self,
        df_result: dd.DataFrame,
        parquet_storage_path: str,
        *,
        repartition_size: Optional[str] = "128MB",
        persist: bool = True,
        write_index: bool = False,
        write_metadata_file: bool = True,
        pyarrow_args: Optional[Dict[str, Any]] = None,
        writer_threads: int = 8,
        arrow_cpu: Optional[int] = None,
        partitions_per_round: int = 24,
        max_delete_workers: int = 8,
        write_gate_max: int = 2,
        write_gate_key: Optional[str] = None,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)

        if not isinstance(df_result, dd.DataFrame):
            raise TypeError("df_result must be a Dask DataFrame")
        if not self.fs:
            raise ValueError("File system (fs) must be provided to ParquetSaver.")

        self.df_result = df_result
        self.parquet_storage_path = parquet_storage_path.rstrip("/")
        self.repartition_size = repartition_size
        self.persist = persist
        self.write_index = write_index
        self.write_metadata_file = write_metadata_file
        self.pyarrow_args = dict(pyarrow_args or {})
        self.writer_threads = max(1, int(writer_threads))
        self.arrow_cpu = None if arrow_cpu is None else max(1, int(arrow_cpu))
        self.partitions_per_round = max(1, int(partitions_per_round))
        self.max_delete_workers = max(1, int(max_delete_workers))
        self.write_gate_max = max(1, int(write_gate_max))
        self.write_gate_key = (write_gate_key or self.parquet_storage_path).rstrip("/")

        # Fix: Remove deprecated coerce_timestamps parameter
        self.pyarrow_args.setdefault("compression", "zstd")

        self.protocol = "file"
        if "://" in self.parquet_storage_path:
            self.protocol = self.parquet_storage_path.split(":", 1)[0]

    # ---------- public API ----------
    def save_to_parquet(self, output_directory_name: str = "default_output", overwrite: bool = True) -> str:
        target_path = f"{self.parquet_storage_path}/{output_directory_name}".rstrip("/")

        sem = get_write_sem(self.write_gate_key, self.write_gate_max)
        with sem:
            if overwrite and self.fs.exists(target_path):
                self._clear_directory_safely(target_path)
            self.fs.mkdirs(target_path, exist_ok=True)

            # Define a pyarrow schema and coerce the Dask frame to match it.
            schema = self._define_schema()
            ddf = self._coerce_ddf_to_schema(self.df_result, schema)

            if self.repartition_size:
                ddf = ddf.repartition(partition_size=self.repartition_size)

            if self.persist:
                with self._local_dask_pool():
                    ddf = ddf.persist(scheduler="threads")

            old_arrow_cpu = None
            if self.arrow_cpu:
                old_arrow_cpu = pa.get_cpu_count()
                pa.set_cpu_count(self.arrow_cpu)

            try:
                with self._local_dask_pool():
                    ddf.to_parquet(
                        path=target_path,
                        engine="pyarrow",
                        schema=schema,
                        overwrite=False,
                        filesystem=self.fs,
                        write_index=self.write_index,
                        write_metadata_file=self.write_metadata_file,
                        **self.pyarrow_args,
                    )
            finally:
                if old_arrow_cpu is not None:
                    pa.set_cpu_count(old_arrow_cpu)

        self.logger.info(
            f"Parquet dataset written: {target_path}",
            extra=self.logger_extra,
        )
        return target_path

    @contextmanager
    def _local_dask_pool(self):
        """Limit Dask threads only within persist/write phases."""
        prev_pool = dask.config.get("pool", None)
        try:
            dask.config.set(pool=ThreadPool(self.writer_threads), scheduler="threads")
            yield
        finally:
            if prev_pool is None:
                dask.config.refresh()
            else:
                dask.config.set(pool=prev_pool)

    def _clear_directory_safely(self, directory: str) -> None:
        """Robustly clear a directory, with optimizations for S3."""
        if self.protocol.startswith("s3"):
            entries = [p for p in self.fs.glob(f"{directory}/**") if p != directory]
            if not entries:
                return

            def _rm_one(p: str) -> None:
                try:
                    self.fs.rm_file(p)
                except Exception as e:
                    self.logger.warning(f"Delete failed '{p}': {e}", extra=self.logger_extra)

            with ThreadPoolExecutor(max_workers=self.max_delete_workers) as ex:
                list(ex.map(_rm_one, entries))
            try:
                self.fs.rm(directory, recursive=False)
            except Exception:
                pass
        else:
            self.fs.rm(directory, recursive=True)

    # ---------- REFACTORED SCHEMA METHODS ----------

    def _define_schema(self) -> pa.Schema:
        """
        Defines a PyArrow schema from the DataFrame's dtypes.
        """
        pandas_dtype_to_pa = {
            "string": pa.string(),
            "Int64": pa.int64(),
            "boolean": pa.bool_(),
            "datetime64[ns]": pa.timestamp("ns"),
            "string[pyarrow]": pa.string(),
            "int64[pyarrow]": pa.int64(),
            "boolean[pyarrow]": pa.bool_(),
            "date32[pyarrow]": pa.date32(),
            "timestamp[ns][pyarrow]": pa.timestamp("ns"),
            "time64[ns][pyarrow]": pa.time64("ns"),
            "object": pa.string(),
            "float64": pa.float64(),
            "int32": pa.int32(),
        }
        fields = [
            pa.field(name, pandas_dtype_to_pa.get(str(dtype), pa.string()))
            for name, dtype in self.df_result.dtypes.items()
        ]
        return pa.schema(fields)

    def _coerce_ddf_to_schema(self, ddf: dd.DataFrame, schema: pa.Schema) -> dd.DataFrame:
        """
        Coerces DataFrame partitions to a target schema.
        """
        target = {f.name: f.type for f in schema}

        # Build the new meta object with pyarrow-backed dtypes
        meta_cols: Dict[str, pd.Series] = {}
        for name, typ in target.items():
            if pa.types.is_string(typ):
                meta_cols[name] = pd.Series([], dtype="string[pyarrow]")
            elif pa.types.is_boolean(typ):
                meta_cols[name] = pd.Series([], dtype="boolean[pyarrow]")
            elif pa.types.is_integer(typ):
                meta_cols[name] = pd.Series([], dtype="int64[pyarrow]")
            elif pa.types.is_floating(typ):
                meta_cols[name] = pd.Series([], dtype="float64[pyarrow]")
            elif pa.types.is_timestamp(typ):
                meta_cols[name] = pd.Series([], dtype="timestamp[ns][pyarrow]")
            else:
                meta_cols[name] = pd.Series([], dtype="string[pyarrow]")  # Safe default

        new_meta = pd.DataFrame(meta_cols, index=ddf._meta.index)

        # Use partial to pass the target dictionary
        coerce_fn = partial(_coerce_partition, target=target)

        return ddf.map_partitions(coerce_fn, meta=new_meta)

(The remaining ~220 lines of the new file keep the previous ParquetSaver implementation as a commented-out reference block: the old constructor that read persist/write_index/write_metadata_file from kwargs, the one-file-at-a-time S3 deletion loop, the pandas-dtype → Arrow schema map covering object/int/float/bool/datetime/category, and the closure-based per-partition coercion with its nullable-dtype meta. This is the same code that the diff shows as removed from version 2025.9.3.)
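For context, a minimal usage sketch of the new writer follows. It is a hypothetical example, not part of the diff: it assumes ManagedResource accepts an fsspec filesystem through an fs keyword and supplies a default logger, which is consistent with the constructor validating self.fs and the methods logging via self.logger above; the local path and table name are illustrative.

# Hypothetical usage sketch; the fs keyword and default logger are assumptions
# about ManagedResource, which is not shown in this diff.
import dask.dataframe as dd
import fsspec
import pandas as pd

from sibi_dst.utils.parquet_saver import ParquetSaver

fs = fsspec.filesystem("file")
ddf = dd.from_pandas(
    pd.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]}),
    npartitions=1,
)

saver = ParquetSaver(
    df_result=ddf,
    parquet_storage_path="/tmp/datasets",
    repartition_size="128MB",  # optional; None skips repartitioning
    writer_threads=4,          # bounds the Dask thread pool during persist/write
    write_gate_max=2,          # at most two concurrent writes per gate key
    fs=fs,                     # assumed ManagedResource keyword
)
path = saver.save_to_parquet("my_table", overwrite=True)
print(path)  # e.g. /tmp/datasets/my_table

Note that save_to_parquet now returns the target path and acquires the write gate for the whole clear/coerce/persist/write sequence, so callers sharing a write_gate_key are serialized down to write_gate_max concurrent writers.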