sibi-dst 2025.9.2__py3-none-any.whl → 2025.9.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,224 +1,466 @@
1
+ from __future__ import annotations
2
+
1
3
  import warnings
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from contextlib import contextmanager
6
+ from functools import partial
7
+ from multiprocessing.pool import ThreadPool
8
+ from typing import Any, Dict, Optional
2
9
 
3
- from pandas.api.types import is_period_dtype, is_bool_dtype, is_string_dtype
4
- import pandas as pd
10
+ import dask
5
11
  import dask.dataframe as dd
12
+ import pandas as pd
6
13
  import pyarrow as pa
7
14
 
8
15
  from . import ManagedResource
16
+ from .write_gatekeeper import get_write_sem
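The write_gatekeeper module itself is not part of this diff. Judging only from the call site below, get_write_sem(key, max) returns a context manager that caps concurrent writers per destination. A minimal sketch of one possible implementation, assuming a process-local keyed semaphore (the body and the max_concurrent parameter name are assumptions, not the package's actual code):

import threading
from typing import Dict

_write_sems: Dict[str, threading.BoundedSemaphore] = {}
_registry_lock = threading.Lock()

def get_write_sem(key: str, max_concurrent: int) -> threading.BoundedSemaphore:
    # Lazily create one bounded semaphore per destination key and reuse it,
    # so all writers targeting the same path share one concurrency cap.
    with _registry_lock:
        if key not in _write_sems:
            _write_sems[key] = threading.BoundedSemaphore(max_concurrent)
        return _write_sems[key]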
9
17
 
10
18
  warnings.filterwarnings("ignore", message="Passing 'overwrite=True' to to_parquet is deprecated")
11
19
 
12
20
 
13
- class ParquetSaver(ManagedResource):
21
+ def _coerce_partition(pdf: pd.DataFrame, target: Dict[str, pa.DataType]) -> pd.DataFrame:
14
22
  """
15
- Saves Dask DataFrames to Parquet, with a workaround for S3-compatible
16
- storage providers that misbehave on batch delete operations.
23
+ Applies type conversions to a single pandas partition.
24
+ This function is defined at module level to ensure Dask serialization compatibility.
25
+ """
26
+ for col, pa_type in target.items():
27
+ if col not in pdf.columns:
28
+ continue
29
+
30
+ try:
31
+ current_dtype_str = str(pdf[col].dtype)
32
+ if pa.types.is_string(pa_type) and current_dtype_str != "string[pyarrow]":
33
+ pdf[col] = pdf[col].astype("string[pyarrow]")
34
+ elif pa.types.is_boolean(pa_type) and current_dtype_str != "boolean[pyarrow]":
35
+ pdf[col] = pdf[col].astype("boolean[pyarrow]")
36
+ elif pa.types.is_integer(pa_type) and current_dtype_str != "int64[pyarrow]":
37
+ pdf[col] = pd.to_numeric(pdf[col], errors="coerce").astype("int64[pyarrow]")
38
+ elif pa.types.is_floating(pa_type) and current_dtype_str != "float64[pyarrow]":
39
+ pdf[col] = pd.to_numeric(pdf[col], errors="coerce").astype("float64[pyarrow]")
40
+ elif pa.types.is_timestamp(pa_type):
41
+ if hasattr(pdf[col].dtype, 'pyarrow_dtype') and pa.types.is_timestamp(pdf[col].dtype.pyarrow_dtype):
42
+ pdf[col] = pdf[col].astype('datetime64[ns]')
43
+ pdf[col] = pd.to_datetime(pdf[col], errors="coerce")
44
+ pdf[col] = pdf[col].astype("timestamp[ns][pyarrow]")
45
+ except Exception:
46
+ pass  # best-effort coercion: leave the column unchanged if the cast fails
47
+ return pdf
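For illustration, a minimal example of the helper above applied to a plain pandas partition (the column names and values are hypothetical):

import pandas as pd
import pyarrow as pa

# A partition with NumPy/object-backed dtypes, as it might arrive from a loader.
pdf = pd.DataFrame({"qty": ["1", "2", "3"], "active": [True, False, True]})

# Target Arrow types, as produced by ParquetSaver._define_schema().
target = {"qty": pa.int64(), "active": pa.bool_()}

coerced = _coerce_partition(pdf, target)
print(coerced.dtypes)  # qty and active now carry pyarrow-backed int64 / boolean dtypes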
48
+
17
49
 
18
- Assumes `df_result` is a Dask DataFrame.
50
+ class ParquetSaver(ManagedResource):
51
+ """
52
+ Production-grade Dask → Parquet writer with bounded concurrency.
53
+ Fully pyarrow-aware, preserving dtype and metadata consistency from the
54
+ data source to the Parquet sink.
19
55
  """
20
56
  logger_extra = {"sibi_dst_component": __name__}
21
57
 
22
58
  def __init__(
23
- self,
24
- df_result: dd.DataFrame,
25
- parquet_storage_path: str,
26
- **kwargs,
59
+ self,
60
+ df_result: dd.DataFrame,
61
+ parquet_storage_path: str,
62
+ *,
63
+ repartition_size: Optional[str] = "128MB",
64
+ persist: bool = True,
65
+ write_index: bool = False,
66
+ write_metadata_file: bool = True,
67
+ pyarrow_args: Optional[Dict[str, Any]] = None,
68
+ writer_threads: int = 8,
69
+ arrow_cpu: Optional[int] = None,
70
+ partitions_per_round: int = 24,
71
+ max_delete_workers: int = 8,
72
+ write_gate_max: int = 2,
73
+ write_gate_key: Optional[str] = None,
74
+ **kwargs: Any,
27
75
  ):
28
76
  super().__init__(**kwargs)
29
- self.df_result = df_result
30
- self.parquet_storage_path = parquet_storage_path.rstrip("/")
77
+
78
+ if not isinstance(df_result, dd.DataFrame):
79
+ raise TypeError("df_result must be a Dask DataFrame")
31
80
  if not self.fs:
32
81
  raise ValueError("File system (fs) must be provided to ParquetSaver.")
33
82
 
83
+ self.df_result = df_result
84
+ self.parquet_storage_path = parquet_storage_path.rstrip("/")
85
+ self.repartition_size = repartition_size
86
+ self.persist = persist
87
+ self.write_index = write_index
88
+ self.write_metadata_file = write_metadata_file
89
+ self.pyarrow_args = dict(pyarrow_args or {})
90
+ self.writer_threads = max(1, int(writer_threads))
91
+ self.arrow_cpu = None if arrow_cpu is None else max(1, int(arrow_cpu))
92
+ self.partitions_per_round = max(1, int(partitions_per_round))
93
+ self.max_delete_workers = max(1, int(max_delete_workers))
94
+ self.write_gate_max = max(1, int(write_gate_max))
95
+ self.write_gate_key = (write_gate_key or self.parquet_storage_path).rstrip("/")
96
+
97
+ # Default to zstd compression; the deprecated coerce_timestamps option is intentionally not set.
98
+ self.pyarrow_args.setdefault("compression", "zstd")
99
+
34
100
  self.protocol = "file"
35
101
  if "://" in self.parquet_storage_path:
36
102
  self.protocol = self.parquet_storage_path.split(":", 1)[0]
37
103
 
38
- self.persist = kwargs.get("persist",True)
39
- self.write_index = kwargs.get("write_index", False)
40
- self.write_metadata_file = kwargs.get("write_metadata_file", True)
104
+ # ---------- public API ----------
105
+ def save_to_parquet(self, output_directory_name: str = "default_output", overwrite: bool = True) -> str:
106
+ target_path = f"{self.parquet_storage_path}/{output_directory_name}".rstrip("/")
41
107
 
42
- def save_to_parquet(self, output_directory_name: str = "default_output", overwrite: bool = True):
43
- """
44
- Saves the Dask DataFrame to a Parquet dataset.
108
+ sem = get_write_sem(self.write_gate_key, self.write_gate_max)
109
+ with sem:
110
+ if overwrite and self.fs.exists(target_path):
111
+ self._clear_directory_safely(target_path)
112
+ self.fs.mkdirs(target_path, exist_ok=True)
45
113
 
46
- If overwrite is True, it manually clears the destination directory before
47
- writing to avoid issues with certain S3-compatible storage providers.
48
- """
49
- full_path = f"{self.parquet_storage_path}/{output_directory_name}"
114
+ # Define a pyarrow schema and coerce the Dask frame to match it.
115
+ schema = self._define_schema()
116
+ ddf = self._coerce_ddf_to_schema(self.df_result, schema)
117
+
118
+ if self.repartition_size:
119
+ ddf = ddf.repartition(partition_size=self.repartition_size)
50
120
 
51
- if overwrite and self.fs and self.fs.exists(full_path):
52
- self.logger.info(f"Overwrite is True, clearing destination path: {full_path}", extra=self.logger_extra)
53
- self._clear_directory_safely(full_path)
121
+ if self.persist:
122
+ with self._local_dask_pool():
123
+ ddf = ddf.persist(scheduler="threads")
54
124
 
55
- # Ensure the base directory exists after clearing
56
- self.fs.mkdirs(full_path, exist_ok=True)
125
+ old_arrow_cpu = None
126
+ if self.arrow_cpu:
127
+ old_arrow_cpu = pa.get_cpu_count()
128
+ pa.set_cpu_count(self.arrow_cpu)
57
129
 
58
- schema = self._define_schema()
59
- self.logger.info(f"Saving DataFrame to Parquet dataset at: {full_path}", extra=self.logger_extra)
60
- # 1) Normalize to declared schema (fixes bool→string, Period→string, etc.)
61
- ddf = self._coerce_ddf_to_schema(self.df_result, schema)
130
+ try:
131
+ with self._local_dask_pool():
132
+ ddf.to_parquet(
133
+ path=target_path,
134
+ engine="pyarrow",
135
+ schema=schema,
136
+ overwrite=False,
137
+ filesystem=self.fs,
138
+ write_index=self.write_index,
139
+ write_metadata_file=self.write_metadata_file,
140
+ **self.pyarrow_args,
141
+ )
142
+ finally:
143
+ if old_arrow_cpu is not None:
144
+ pa.set_cpu_count(old_arrow_cpu)
62
145
 
63
- # 2) Persist after coercion so all partitions share the coerced dtypes
64
- ddf = ddf.persist() if self.persist else ddf
146
+ self.logger.info(
147
+ f"Parquet dataset written: {target_path}",
148
+ extra=self.logger_extra,
149
+ )
150
+ return target_path
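A minimal usage sketch of the public API, assuming the ManagedResource base class wires the filesystem (and logger) from keyword arguments; the local path and options are placeholders:

import fsspec
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"id": [1, 2], "name": ["a", "b"]}), npartitions=1)

saver = ParquetSaver(
    df_result=ddf,
    parquet_storage_path="/tmp/sibi-dst-demo",  # placeholder destination
    fs=fsspec.filesystem("file"),
    writer_threads=4,
    write_gate_max=2,
)
dataset_path = saver.save_to_parquet("default_output", overwrite=True)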
65
151
 
152
+ @contextmanager
153
+ def _local_dask_pool(self):
154
+ """Limit Dask threads only within persist/write phases."""
155
+ prev_pool = dask.config.get("pool", None)
66
156
  try:
67
- ddf.to_parquet(
68
- path=full_path,
69
- engine="pyarrow",
70
- schema=schema,
71
- overwrite=False, # we've handled deletion already
72
- filesystem=self.fs,
73
- write_index=self.write_index, # whether to write the index
74
- write_metadata_file=self.write_metadata_file, # write _metadata for easier reading later
75
- )
76
- self.logger.info(f"Successfully saved Parquet dataset to: {full_path}", extra=self.logger_extra)
77
- except Exception as e:
78
- self.logger.error(f"Failed to save Parquet dataset to {full_path}: {e}", extra=self.logger_extra)
79
- raise
80
-
81
- def _clear_directory_safely(self, directory: str):
82
- """
83
- Clears the contents of a directory robustly.
84
- - For S3, deletes files one-by-one to bypass brittle multi-delete.
85
- - For other filesystems, uses the standard recursive remove.
86
- """
87
- if self.protocol == "s3":
88
- self.logger.warning(
89
- "Using single-file S3 deletion for compatibility. "
90
- "This may be slow for directories with many files."
91
- )
92
- # Glob all contents (files and subdirs) and delete them individually.
93
- all_paths = self.fs.glob(f"{directory}/**")
94
- # delete contents (deepest first)
95
- for path in sorted([p for p in all_paths if p != directory], key=len, reverse=True):
96
- self.logger.debug(f"Deleting: {path}")
157
+ dask.config.set(pool=ThreadPool(self.writer_threads), scheduler="threads")
158
+ yield
159
+ finally:
160
+ if prev_pool is None:
161
+ dask.config.refresh()  # rebuild from defaults/yaml/env, dropping the temporary pool
162
+ else:
163
+ dask.config.set(pool=prev_pool)
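The manual save/restore above could also be written with dask.config.set used as a context manager, which reverts its changes automatically on exit. A small sketch of that equivalent pattern (not the package's code; the sample frame is a placeholder):

import dask
import dask.dataframe as dd
import pandas as pd
from multiprocessing.pool import ThreadPool

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=2)

# dask.config.set(...) as a context manager restores the prior pool/scheduler on exit.
with dask.config.set(pool=ThreadPool(8), scheduler="threads"):
    ddf = ddf.persist()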
164
+
165
+ def _clear_directory_safely(self, directory: str) -> None:
166
+ """Robustly clear a directory, with optimizations for S3."""
167
+ if self.protocol.startswith("s3"):
168
+ entries = [p for p in self.fs.glob(f"{directory}/**") if p != directory]
169
+ if not entries:
170
+ return
171
+
172
+ def _rm_one(p: str) -> None:
97
173
  try:
98
- # prefer rm_file if available (minio, s3fs expose it)
99
- if hasattr(self.fs, "rm_file"):
100
- self.fs.rm_file(path)
101
- else:
102
- self.fs.rm(path, recursive=False)
174
+ self.fs.rm_file(p)
103
175
  except Exception as e:
104
- self.logger.warning(f"Failed to delete '{path}': {e}", extra=self.logger_extra)
105
- # remove the (now empty) directory if present
176
+ self.logger.warning(f"Delete failed '{p}': {e}", extra=self.logger_extra)
177
+
178
+ with ThreadPoolExecutor(max_workers=self.max_delete_workers) as ex:
179
+ list(ex.map(_rm_one, entries))
106
180
  try:
107
181
  self.fs.rm(directory, recursive=False)
108
182
  except Exception:
109
183
  pass
110
184
  else:
111
- # Standard, fast deletion for other filesystems (local, etc.)
112
185
  self.fs.rm(directory, recursive=True)
113
186
 
187
+ # ---------- schema helpers ----------
188
+
114
189
  def _define_schema(self) -> pa.Schema:
115
190
  """
116
- Defines a PyArrow schema dynamically based on DataFrame's column types.
117
- Works for Dask by using known dtypes on the collection.
191
+ Defines a PyArrow schema from the DataFrame's dtypes.
118
192
  """
119
193
  pandas_dtype_to_pa = {
120
- "object": pa.string(), "string": pa.string(),
121
- "int64": pa.int64(), "Int64": pa.int64(),
122
- "int32": pa.int32(), "Int32": pa.int32(),
123
- "float64": pa.float64(), "float32": pa.float32(),
124
- "bool": pa.bool_(), "boolean": pa.bool_(),
194
+ "string": pa.string(),
195
+ "Int64": pa.int64(),
196
+ "boolean": pa.bool_(),
125
197
  "datetime64[ns]": pa.timestamp("ns"),
126
- "datetime64[ns, UTC]": pa.timestamp("ns", tz="UTC"),
127
- "category": pa.string(),
198
+ "string[pyarrow]": pa.string(),
199
+ "int64[pyarrow]": pa.int64(),
200
+ "boolean[pyarrow]": pa.bool_(),
201
+ "date32[pyarrow]": pa.date32(),
202
+ "timestamp[ns][pyarrow]": pa.timestamp("ns"),
203
+ "time64[ns][pyarrow]": pa.time64("ns"),
204
+ "object": pa.string(),
205
+ "float64": pa.float64(),
206
+ "int32": pa.int32(),
128
207
  }
129
208
  fields = [
130
- pa.field(c, pandas_dtype_to_pa.get(str(d), pa.string()))
131
- for c, d in self.df_result.dtypes.items()
209
+ pa.field(name, pandas_dtype_to_pa.get(str(dtype), pa.string()))
210
+ for name, dtype in self.df_result.dtypes.items()
132
211
  ]
133
212
  return pa.schema(fields)
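To illustrate the mapping, consider a hypothetical frame with mixed dtype backends (any dtype missing from the table falls back to pa.string()); this assumes a pandas build with pyarrow-backed dtypes available:

import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame(
    {
        "name": pd.Series(["a"], dtype="string[pyarrow]"),
        "qty": pd.Series([1], dtype="int64[pyarrow]"),
        "created": pd.Series([pd.Timestamp("2025-01-01")], dtype="datetime64[ns]"),
        "note": pd.Series(["free text"], dtype="object"),
    }
)
ddf = dd.from_pandas(pdf, npartitions=1)
# With these dtypes, _define_schema() would produce:
#   name: string, qty: int64, created: timestamp[ns], note: string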
134
213
 
135
-
136
214
  def _coerce_ddf_to_schema(self, ddf: dd.DataFrame, schema: pa.Schema) -> dd.DataFrame:
137
215
  """
138
- Coerce Dask DataFrame columns to match the provided PyArrow schema.
139
- - Ensures cross-partition consistency.
140
- - Converts troublesome dtypes (Period, mixed object/bool) to the declared type.
216
+ Coerces DataFrame partitions to a target schema.
141
217
  """
142
- # Build a map: name -> target kind
143
- target = {field.name: field.type for field in schema}
144
-
145
- def _coerce_partition(pdf: pd.DataFrame) -> pd.DataFrame:
146
- for col, typ in target.items():
147
- if col not in pdf.columns:
148
- continue
149
-
150
- pa_type = typ
151
-
152
- # String targets
153
- if pa.types.is_string(pa_type) or pa.types.is_large_string(pa_type):
154
- # Convert Period or any dtype to string with NA-preservation
155
- s = pdf[col]
156
- if is_period_dtype(s):
157
- pdf[col] = s.astype(str)
158
- elif not is_string_dtype(s):
159
- # astype("string") keeps NA; str(s) can produce "NaT" strings
160
- try:
161
- pdf[col] = s.astype("string")
162
- except Exception:
163
- pdf[col] = s.astype(str).astype("string")
164
- continue
165
-
166
- # Boolean targets
167
- if pa.types.is_boolean(pa_type):
168
- s = pdf[col]
169
- # Allow object/bool mixtures; coerce via pandas nullable boolean then to bool
170
- try:
171
- pdf[col] = s.astype("boolean").astype(bool)
172
- except Exception:
173
- pdf[col] = s.astype(bool)
174
- continue
175
-
176
- # Integer targets
177
- if pa.types.is_integer(pa_type):
178
- s = pdf[col]
179
- # Go through pandas nullable Int64 to preserve NA, then to int64 if clean
180
- s2 = pd.to_numeric(s, errors="coerce").astype("Int64")
181
- # If there are no nulls, downcast to numpy int64 for speed
182
- if not s2.isna().any():
183
- s2 = s2.astype("int64")
184
- pdf[col] = s2
185
- continue
186
-
187
- # Floating targets
188
- if pa.types.is_floating(pa_type):
189
- pdf[col] = pd.to_numeric(pdf[col], errors="coerce").astype("float64")
190
- continue
191
-
192
- # Timestamp[ns] (optionally with tz)
193
- if pa.types.is_timestamp(pa_type):
194
- # If tz in Arrow type, you may want to localize; here we just ensure ns
195
- pdf[col] = pd.to_datetime(pdf[col], errors="coerce")
196
- continue
197
-
198
- # Fallback: leave as-is
199
- return pdf
200
-
201
- # Provide a meta with target dtypes to avoid meta mismatch warnings
202
- meta = {}
218
+ target = {f.name: f.type for f in schema}
219
+
220
+ # Build the new meta object with pyarrow-backed dtypes
221
+ meta_cols: Dict[str, pd.Series] = {}
203
222
  for name, typ in target.items():
204
- # Rough meta mapping; Arrow large_string vs string both → 'string'
205
- if pa.types.is_string(typ) or pa.types.is_large_string(typ):
206
- meta[name] = pd.Series([], dtype="string")
223
+ if pa.types.is_string(typ):
224
+ meta_cols[name] = pd.Series([], dtype="string[pyarrow]")
207
225
  elif pa.types.is_boolean(typ):
208
- meta[name] = pd.Series([], dtype="bool")
226
+ meta_cols[name] = pd.Series([], dtype="boolean[pyarrow]")
209
227
  elif pa.types.is_integer(typ):
210
- meta[name] = pd.Series([], dtype="Int64") # nullable int
228
+ meta_cols[name] = pd.Series([], dtype="int64[pyarrow]")
211
229
  elif pa.types.is_floating(typ):
212
- meta[name] = pd.Series([], dtype="float64")
230
+ meta_cols[name] = pd.Series([], dtype="float64[pyarrow]")
213
231
  elif pa.types.is_timestamp(typ):
214
- meta[name] = pd.Series([], dtype="datetime64[ns]")
232
+ meta_cols[name] = pd.Series([], dtype="timestamp[ns][pyarrow]")
215
233
  else:
216
- meta[name] = pd.Series([], dtype="object")
234
+ meta_cols[name] = pd.Series([], dtype="string[pyarrow]") # Safe default
235
+
236
+ new_meta = pd.DataFrame(meta_cols, index=ddf._meta.index)
237
+
238
+ # Use partial to pass the target dictionary
239
+ coerce_fn = partial(_coerce_partition, target=target)
217
240
 
218
- # Start from current meta and update known columns
219
- new_meta = ddf._meta.copy()
220
- for k, v in meta.items():
221
- if k in new_meta.columns:
222
- new_meta[k] = v
241
+ return ddf.map_partitions(coerce_fn, meta=new_meta)
223
242
 
224
- return ddf.map_partitions(_coerce_partition, meta=new_meta)
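A standalone illustration of the map_partitions pattern used above, with the target bound via partial and an Arrow-backed meta so Dask does not warn about dtype mismatches (hypothetical frame; assumes _coerce_partition from this module is in scope):

from functools import partial

import dask.dataframe as dd
import pandas as pd
import pyarrow as pa

ddf = dd.from_pandas(pd.DataFrame({"qty": ["1", "2"]}), npartitions=1)
target = {"qty": pa.int64()}
meta = pd.DataFrame({"qty": pd.Series([], dtype="int64[pyarrow]")})

coerced = ddf.map_partitions(partial(_coerce_partition, target=target), meta=meta)
print(coerced.dtypes)  # qty -> int64[pyarrow]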