sibi-dst 2025.9.5.tar.gz → 2025.9.7.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/PKG-INFO +1 -1
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/pyproject.toml +1 -1
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/__init__.py +2 -0
- sibi_dst-2025.9.7/sibi_dst/utils/boilerplate/base_pipeline.py +178 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/clickhouse_writer.py +138 -13
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/dask_utils.py +1 -1
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/README.md +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/tests/test_baseclass.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/async_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/hybrid_data_loader.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/iceberg_saver.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/jobs.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/sse_runner.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_hive.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/utils/write_gatekeeper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.9.5 → sibi_dst-2025.9.7}/sibi_dst/v2/utils/log_utils.py +0 -0
sibi_dst/utils/boilerplate/__init__.py (+2 -0)

```diff
@@ -3,6 +3,7 @@ from .base_data_cube import BaseDataCube
 from .base_attacher import make_attacher
 from .base_parquet_reader import BaseParquetReader
 from .hybrid_data_loader import HybridDataLoader
+from .base_pipeline import BasePipeline
 
 __all__ = [
     "BaseDataCube",
@@ -10,5 +11,6 @@ __all__ = [
     "make_attacher",
     "BaseParquetReader",
     "HybridDataLoader",
+    "BasePipeline",
 ]
 
```
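With this change, `BasePipeline` is re-exported from the boilerplate package next to the existing helpers. A minimal import sketch, limited to the names listed in `__all__` above:

```python
# The boilerplate package now exposes BasePipeline alongside the existing helpers.
from sibi_dst.utils.boilerplate import (
    BaseDataCube,
    BaseParquetReader,
    BasePipeline,
    HybridDataLoader,
    make_attacher,
)
```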
sibi_dst-2025.9.7/sibi_dst/utils/boilerplate/base_pipeline.py (new file, +178 lines)

```python
from __future__ import annotations

import asyncio
from concurrent.futures import ThreadPoolExecutor
from typing import Type, Any, Callable, List

import pandas as pd
import dask.dataframe as dd

from sibi_dst.utils import ManagedResource, ParquetSaver
from sibi_dst.df_helper import ParquetReader
from sibi_dst.utils.dask_utils import dask_is_empty


class DateRangeHelper:
    @staticmethod
    def generate_daily_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[str]:
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        return [d.strftime(date_format) for d in pd.date_range(start, end, freq="D")]

    @staticmethod
    def generate_monthly_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[tuple[str, str]]:
        """
        Generate (start_date, end_date) tuples for each calendar month in range.
        Always includes the first and last month, even if partial.
        """
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        ranges = []
        current = start.replace(day=1)
        while current <= end:
            month_end = (current + pd.offsets.MonthEnd(0)).normalize()
            ranges.append((
                current.strftime(date_format),
                min(month_end, end).strftime(date_format)
            ))
            current += pd.DateOffset(months=1)
        return ranges


class BasePipeline(ManagedResource):
    def __init__(
        self,
        start_date: str,
        end_date: str,
        dataset_cls: Type,
        parquet_storage_path: str,
        *,
        fs: Any,
        filename: str = "dataset",
        date_field: str = "date",
        max_workers: int = 4,
        dataset_kwargs: dict = None,
        **kwargs,
    ):
        kwargs["fs"] = fs
        super().__init__(**kwargs)

        self.start_date = start_date
        self.end_date = end_date
        self.fs = fs
        self.filename = filename
        self.date_field = date_field
        self.max_workers = max_workers
        self.storage_path = parquet_storage_path.rstrip("/")
        self.df: dd.DataFrame | None = None

        self.ds = dataset_cls(
            start_date=self.start_date,
            end_date=self.end_date,
            debug=self.debug,
            logger=self.logger,
            **(dataset_kwargs or {}),
        )

    def _get_storage_path_for_date(self, date: pd.Timestamp) -> str:
        return f"{self.storage_path}/{date.year}/{date.month:02d}/{date.day:02d}"

    def _get_output_filename(self, fmt: str = "parquet") -> str:
        return f"{self.filename}.{fmt}"

    async def aload(self, **kwargs) -> dd.DataFrame:
        await self.emit("status", message="Loading dataset...", progress=5)
        self.df = await self.ds.aload(**kwargs)
        return self.df

    async def to_parquet(self, **kwargs) -> None:
        df = await self.aload(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to save.")
            return

        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        dates = DateRangeHelper.generate_daily_ranges(self.start_date, self.end_date)

        tasks = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date_str in dates:
                date_obj = pd.to_datetime(date_str).date()
                df_day = df[df[self.date_field].dt.date == date_obj]
                if dask_is_empty(df_day):
                    self.logger.info(f"No data for {date_obj}, skipping.")
                    continue

                path = self._get_storage_path_for_date(pd.Timestamp(date_obj))
                await self.emit("status", message=f"Saving data for {date_obj}")

                saver = ParquetSaver(
                    df_result=df_day,
                    parquet_storage_path=path,
                    fs=self.fs,
                    debug=self.debug,
                    logger=self.logger,
                )

                tasks.append(
                    asyncio.get_running_loop().run_in_executor(
                        executor, saver.save_to_parquet, self._get_output_filename()
                    )
                )

            await asyncio.gather(*tasks)
        await self.emit("complete", message="All partitions written.")

    async def from_parquet(self, **kwargs) -> dd.DataFrame:
        reader = ParquetReader(
            parquet_start_date=self.start_date,
            parquet_end_date=self.end_date,
            parquet_storage_path=self.storage_path,
            parquet_filename=self._get_output_filename(),
            fs=self.fs,
            debug=self.debug,
            logger=self.logger,
        )
        return await reader.aload(**kwargs)

    async def to_clickhouse(self, clk_conf: dict, **kwargs):
        """
        Writes daily-partitioned data to ClickHouse using concurrent threads.
        """
        from sibi_dst.utils import ClickHouseWriter

        df = await self.from_parquet(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to write to ClickHouse.")
            return

        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        df = df.persist()

        unique_dates = df[self.date_field].dt.date.dropna().unique().compute()
        if len(unique_dates) == 0:
            self.logger.warning("No valid dates found for partitioning.")
            return

        clk = ClickHouseWriter(**clk_conf)
        loop = asyncio.get_running_loop()
        tasks = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date in unique_dates:
                df_day = df[df[self.date_field].dt.date == date]
                if dask_is_empty(df_day):
                    self.logger.info(f"[ClickHouse] No data for {date}, skipping.")
                    continue

                self.logger.info(f"[ClickHouse] Writing {len(df_day)} rows for {date}")

                tasks.append(
                    loop.run_in_executor(executor, clk.save_to_clickhouse, df_day)
                )

            await asyncio.gather(*tasks)

        self.logger.info(f"ClickHouse write complete for {len(unique_dates)} daily partitions.")


__all__ = ["BasePipeline"]
```
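For orientation, a hedged usage sketch of the new class follows. `OrdersDataset`, the paths, and the fsspec local filesystem are illustrative assumptions; only the constructor keywords and the `to_parquet` behaviour (one parquet file per day under `<path>/YYYY/MM/DD/`) come from the file above, and `ManagedResource` may expect additional keyword arguments not shown here.

```python
# Hypothetical driver for BasePipeline. OrdersDataset and the paths are invented;
# fs is assumed to be an fsspec-style filesystem because ParquetSaver/ParquetReader
# receive it unchanged.
import asyncio

import dask.dataframe as dd
import fsspec
import pandas as pd

from sibi_dst.utils.boilerplate import BasePipeline


class OrdersDataset:
    """Hypothetical dataset: any class accepting these kwargs whose aload()
    coroutine returns a dask DataFrame satisfies the dataset_cls contract."""

    def __init__(self, start_date, end_date, debug=False, logger=None, **kwargs):
        self.start_date, self.end_date = start_date, end_date

    async def aload(self, **kwargs) -> dd.DataFrame:
        pdf = pd.DataFrame({
            "date": pd.date_range("2025-09-01", "2025-09-07", freq="D"),
            "amount": range(7),
        })
        return dd.from_pandas(pdf, npartitions=2)


async def main() -> None:
    pipeline = BasePipeline(
        start_date="2025-09-01",
        end_date="2025-09-07",
        dataset_cls=OrdersDataset,
        parquet_storage_path="/tmp/orders",
        fs=fsspec.filesystem("file"),
        filename="orders",
        date_field="date",
        max_workers=4,
    )
    # Writes one parquet file per day under /tmp/orders/YYYY/MM/DD/orders.parquet
    await pipeline.to_parquet()


asyncio.run(main())
```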
sibi_dst/utils/clickhouse_writer.py (+138 -13)

```diff
@@ -7,6 +7,7 @@ from typing import ClassVar, Dict, Optional, Any, Iterable, Tuple
 import pandas as pd
 import dask.dataframe as dd
 import clickhouse_connect
+import numpy as np
 
 from . import ManagedResource
 
@@ -27,6 +28,7 @@ class ClickHouseWriter(ManagedResource):
     - Optional overwrite (drop + recreate)
     - Partitioned, batched inserts
     - Per-thread clients to avoid session conflicts
+    - Proper PyArrow dtype handling
     """
 
     # Default dtype mapping (pandas/dask → ClickHouse)
```
```diff
@@ -109,7 +111,11 @@ class ClickHouseWriter(ManagedResource):
             return
 
         # lazily fill missing values per-partition (no global compute)
-
+        # Use the new method that ensures correct types for ClickHouse
+        df = df.map_partitions(
+            type(self)._process_partition_for_clickhouse_compatible,
+            meta=df._meta
+        )
 
         # (re)create table
         ow = self.overwrite if overwrite is None else bool(overwrite)
```
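The cleanup stays lazy: `map_partitions` schedules the function against each pandas partition and `meta` declares the expected output schema without computing anything. A generic, self-contained illustration of the same pattern, with an invented `_fill` helper standing in for the writer's partition processor:

```python
# Generic map_partitions pattern; _fill is a stand-in for the real per-partition function.
import dask.dataframe as dd
import pandas as pd


def _fill(pdf: pd.DataFrame) -> pd.DataFrame:
    # Runs once per partition on a plain pandas DataFrame.
    return pdf.fillna(0)


ddf = dd.from_pandas(pd.DataFrame({"x": [1.0, None, 3.0]}), npartitions=2)
ddf = ddf.map_partitions(_fill, meta=ddf._meta)  # schema is unchanged, so reuse _meta
print(ddf.compute())
```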
```diff
@@ -121,7 +127,7 @@ class ClickHouseWriter(ManagedResource):
             self._command(f"DROP TABLE IF EXISTS {self._ident(self.table)}")
             self.logger.info(f"Dropped table {self.table} (overwrite=True)")
 
-        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}
+        create_sql = f"CREATE TABLE IF NOT EXISTS {self._ident(self.table)} ({schema_sql}) {engine_sql}"
         self._command(create_sql)
         self.logger.info(f"Ensured table {self.table} exists")
 
```
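The change above restores the closing quote on the f-string, so `create_sql` renders as a complete statement. A hypothetical rendering with invented identifier, schema, and engine fragments (in the writer these come from `self._ident(...)`, the schema builder, and `engine_sql`):

```python
# Illustrative only: the fragments below are made up for the example.
table_ident = "`analytics`.`orders`"
schema_sql = "`id` Int64, `order_date` Nullable(DateTime), `amount` Float64"
engine_sql = "ENGINE = MergeTree ORDER BY tuple()"

create_sql = f"CREATE TABLE IF NOT EXISTS {table_ident} ({schema_sql}) {engine_sql}"
print(create_sql)
```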
```diff
@@ -159,6 +165,26 @@ class ClickHouseWriter(ManagedResource):
         return ", ".join(pieces)
 
     def _map_dtype(self, dtype: Any) -> str:
+        dtype_str = str(dtype).lower()
+        # Handle PyArrow dtypes
+        if "[pyarrow]" in dtype_str:
+            if "int64" in dtype_str:
+                return "Int64"
+            elif "int32" in dtype_str:
+                return "Int32"
+            elif "float64" in dtype_str or "double" in dtype_str:
+                return "Float64"
+            elif "float32" in dtype_str:
+                return "Float32"
+            elif "bool" in dtype_str:
+                return "UInt8"
+            elif "timestamp" in dtype_str:  # PyArrow timestamp
+                return "DateTime"
+            elif "string" in dtype_str:  # PyArrow string
+                return "String"
+            else:
+                return "String"  # fallback
+
         # Handle pandas extension dtypes explicitly
         if isinstance(dtype, pd.Int64Dtype):
             return "Int64"
```
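The new branch keys off the string form of the dtype because pandas renders Arrow-backed dtypes with a `[pyarrow]` suffix. A quick check of that assumption (requires pandas ≥ 2.0 with pyarrow installed):

```python
# str(dtype) of an Arrow-backed pandas column carries a "[pyarrow]" suffix,
# which is exactly what the new _map_dtype branch matches on.
import pandas as pd
import pyarrow as pa

ints = pd.Series([1, 2, None], dtype=pd.ArrowDtype(pa.int64()))
texts = pd.Series(["a", None], dtype=pd.ArrowDtype(pa.string()))

print(str(ints.dtype).lower())   # int64[pyarrow]  -> mapped to ClickHouse Int64
print(str(texts.dtype).lower())  # string[pyarrow] -> mapped to ClickHouse String
```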
```diff
@@ -170,19 +196,29 @@ class ClickHouseWriter(ManagedResource):
             return "Float64"
         if isinstance(dtype, pd.StringDtype):
             return "String"
-        if "datetime64" in
+        if "datetime64" in dtype_str:
             return "DateTime"
 
         return self.DTYPE_MAP.get(str(dtype), "String")
 
     def _should_mark_nullable(self, dtype: Any) -> bool:
-
+        dtype_str = str(dtype).lower()
+        # PyArrow types are generally nullable, but let's be specific
+        if "[pyarrow]" in dtype_str:
+            # For PyArrow, make strings and timestamps nullable, numbers usually not unless data has nulls
+            base_type = dtype_str.replace("[pyarrow]", "")
+            if base_type in ["string", "large_string"] or "timestamp" in base_type:
+                return True
+            # For numeric PyArrow, check if the actual data contains nulls (hard to do here)
+            # Let's default to not nullable for numeric unless explicitly needed
+            return False  # Conservative for PyArrow numerics
+
         if isinstance(dtype, (pd.StringDtype, pd.BooleanDtype, pd.Int64Dtype, pd.Int32Dtype, pd.Float64Dtype)):
             return True
-        if "datetime64" in
+        if "datetime64" in dtype_str:
             return True
         # object/category almost always nullable
-        if
+        if dtype_str in ("object", "category", "string"):
             return True
         return False
 
```
```diff
@@ -203,6 +239,10 @@ class ClickHouseWriter(ManagedResource):
         # Ensure column ordering is stable
         cols = list(pdf.columns)
 
+        # --- CRITICAL FIX: Ensure datetime columns are compatible BEFORE insertion ---
+        # This is the key step to prevent the numpy.datetime64 error
+        pdf = self._ensure_clickhouse_compatible_datetime_types(pdf)
+
         # Split into batches (to avoid giant single insert)
         for start in range(0, len(pdf), self.insert_chunksize):
             batch = pdf.iloc[start:start + self.insert_chunksize]
```
```diff
@@ -215,30 +255,116 @@ class ClickHouseWriter(ManagedResource):
     def _insert_df(self, cols: Iterable[str], df: pd.DataFrame) -> None:
         client = self._get_client()
         # clickhouse-connect supports insert_df
+        # The df passed here should now have compatible datetime types
         client.insert_df(self.table, df[cols], settings={"async_insert": 1, "wait_end_of_query": 1})
 
-    # ------------- missing values (lazy) -------------
+    # ------------- missing values & type conversion (lazy) -------------
 
     @staticmethod
-    def
-
+    def _process_partition_for_clickhouse_compatible(pdf: pd.DataFrame) -> pd.DataFrame:
+        """
+        Process a partition to fill missing values and ensure initial data types are consistent.
+        This is the first step of data preparation.
+        """
+        pdf = pdf.copy()  # Avoid modifying original
+
         for col in pdf.columns:
             s = pdf[col]
-
+            dtype_str = str(s.dtype).lower()
+
+            # --- Handle PyArrow dtypes ---
+            if "[pyarrow]" in dtype_str:
+                try:
+                    if "string" in dtype_str:
+                        # Convert PyArrow string to object, fillna with empty string
+                        pdf[col] = s.astype('object').fillna("")
+                    elif "timestamp" in dtype_str:
+                        # Convert PyArrow timestamp to pandas datetime, NaT for nulls
+                        pdf[col] = pd.to_datetime(s, errors='coerce')  # errors='coerce' handles conversion issues
+                    elif "int" in dtype_str:
+                        # Convert PyArrow int to pandas int, fillna with 0 for non-nullable
+                        pdf[col] = s.fillna(0)
+                    elif "float" in dtype_str or "double" in dtype_str:
+                        pdf[col] = s.fillna(0.0)
+                    elif "bool" in dtype_str:
+                        pdf[col] = s.fillna(False)  # Or pd.NA if you prefer
+                    else:
+                        # Fallback: convert to object and then to string
+                        pdf[col] = s.astype('object').astype(str).fillna("")
+                except Exception as e:
+                    # If conversion fails, fall back to object and string
+                    pdf[col] = s.astype('object').astype(str).fillna("")
+
+            # --- Handle standard pandas dtypes ---
+            elif pd.api.types.is_integer_dtype(s.dtype):
                 if pd.api.types.is_extension_array_dtype(s.dtype):
                     pdf[col] = s.fillna(pd.NA)
                 else:
                     pdf[col] = s.fillna(0)
             elif pd.api.types.is_bool_dtype(s.dtype):
-                pdf[col] = s.fillna(pd.NA)
+                pdf[col] = s.fillna(pd.NA)  # Or False
             elif pd.api.types.is_float_dtype(s.dtype):
                 pdf[col] = s.fillna(0.0)
             elif pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # Datetimes - leave as is for now, will be handled in final step
                 pass
             else:
-
+                # For object/string/category columns, ensure they're strings
+                pdf[col] = s.astype(str).fillna("")
+
         return pdf
 
+    def _ensure_clickhouse_compatible_datetime_types(self, df: pd.DataFrame) -> pd.DataFrame:
+        """
+        Final conversion step: Ensure datetime columns are in a format compatible
+        with clickhouse-connect driver. Specifically, convert numpy.datetime64 to
+        pandas.Timestamp or Python datetime objects.
+        This is called just before insertion.
+        """
+        df = df.copy()
+        for col in df.columns:
+            s = df[col]
+            # Check if the column is datetime-like
+            if pd.api.types.is_datetime64_any_dtype(s.dtype):
+                # --- Robust conversion to ensure compatibility ---
+                # 1. Convert to pandas datetime explicitly
+                df[col] = pd.to_datetime(s, utc=True)  # Ensures timezone handling
+
+                # 2. Replace NaT with None for nullable columns (clickhouse-connect handles this)
+                # 3. Ensure the underlying objects are pandas.Timestamp (which have .timestamp());
+                #    pd.to_datetime should handle this, and accessing the .dt accessor reinforces it.
+                #    If there are still issues, object conversion can be forced with
+                #    df[col] = df[col].dt.to_pydatetime(), but pd.Timestamp is better.
+                try:
+                    _ = df[col].dt  # Accessing .dt confirms it's datetime-like
+                except:
+                    # If .dt fails, it means conversion wasn't clean, force it
+                    self.logger.debug(f"Forcing datetime conversion for column {col}")
+                    df[col] = pd.to_datetime(df[col].astype('object'), utc=True)
+
+                # --- Final check and explicit conversion if needed ---
+                # If numpy.datetime64 values persist, convert the array elements explicitly
+                # by checking the first non-null element in a sample:
+                sample_series = df[col].dropna()
+                if len(sample_series) > 0:
+                    first_val = sample_series.iloc[0]
+                    if isinstance(first_val, np.datetime64):
+                        self.logger.warning(f"Column {col} still contains numpy.datetime64 after conversion. Forcing object conversion.")
+                        # Force conversion to object array of pandas.Timestamp or None
+                        def convert_val(v):
+                            if pd.isna(v):
+                                return None
+                            if isinstance(v, np.datetime64):
+                                # Convert numpy.datetime64 to pandas.Timestamp
+                                return pd.Timestamp(v)
+                            return v
+
+                        df[col] = df[col].apply(convert_val)
+
+        return df
+
+
     # ------------- low-level helpers -------------
 
     def _get_client(self):
```
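The helper exists because plain `numpy.datetime64` values do not carry the `pandas.Timestamp` interface the comments above refer to; the element-wise fallback promotes them. A standalone illustration of that promotion:

```python
# numpy.datetime64 values are promoted to pandas.Timestamp (NaT becomes None),
# mirroring the convert_val fallback above.
import numpy as np
import pandas as pd

raw = np.datetime64("2025-09-07T12:00:00")
print(type(pd.Timestamp(raw)).__name__)   # Timestamp
print(pd.isna(np.datetime64("NaT")))      # True -> written as None before insertion
```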
```diff
@@ -284,4 +410,3 @@ class ClickHouseWriter(ManagedResource):
         finally:
             if hasattr(self._tlocal, "client"):
                 delattr(self._tlocal, "client")
-
```
sibi_dst/utils/dask_utils.py (+1 -1)

```diff
@@ -31,7 +31,7 @@ def dask_is_empty(ddf: dd.DataFrame, *, sample: int = 4) -> bool:
     k = min(max(sample, 1), ddf.npartitions)
     probes = dask.compute(*[
         ddf.get_partition(i).map_partitions(len) for i in range(k)
-    ])
+    ], scheduler="threads")
 
     if any(_to_int_safe(n) > 0 for n in probes):
         return False
```
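The only change to `dask_is_empty` is pinning the probe computation to the threaded scheduler. A self-contained sketch of the probing pattern with an invented sample frame:

```python
# Probe the first few partition lengths with the threaded scheduler, as
# dask_is_empty now does; any positive length means the frame is not empty.
import dask
import dask.dataframe as dd
import pandas as pd

ddf = dd.from_pandas(pd.DataFrame({"x": range(10)}), npartitions=4)
k = min(4, ddf.npartitions)
probes = dask.compute(*[
    ddf.get_partition(i).map_partitions(len) for i in range(k)
], scheduler="threads")
print(probes)  # one small length result per probed partition
```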