sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +418 -118
- sibi_dst/df_helper/_parquet_artifact.py +275 -283
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +235 -100
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +392 -88
- sibi_dst/utils/date_utils.py +711 -393
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/utils/df_utils.py
CHANGED
@@ -1,44 +1,88 @@
+import warnings
+from typing import Union, List, Dict, Tuple, Iterable
+
 import dask.dataframe as dd
 import pandas as pd
 
 from .log_utils import Logger
-
+
 warnings.filterwarnings("ignore", message="Sorting a Dask DataFrame is expensive and may not be efficient")
 
+
 class DfUtils:
-
-
-
+    """
+    Utilities that work with both pandas and Dask DataFrames, with Dask-first behavior.
+    """
 
-
-        logger (Logger, optional): Logger instance for logging information.
-        """
+    def __init__(self, logger=None, *, debug: bool = False):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = debug
+
+    # -------------------------
+    # helpers
+    # -------------------------
+    @staticmethod
+    def _is_dask(obj) -> bool:
+        return isinstance(obj, (dd.DataFrame, dd.Series))
 
     @classmethod
     def compute_to_list(cls, series):
         return series.compute().tolist() if hasattr(series, "compute") else series.tolist()
 
-    def
-
+    def _astype_safe(self, df, col: str, dtype) -> None:
+        """
+        Cast a single column in-place; handles Dask meta generation implicitly by letting Dask infer.
+        """
+        if col not in df.columns:
+            return
+        if self._is_dask(df):
+            df[col] = df[col].astype(dtype)
+        else:
+            df[col] = df[col].astype(dtype)
 
-    def
+    def _df_len_zero(self, df) -> bool:
+        """
+        Dask-safe emptiness check (avoids df.empty with Dask).
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self._is_dask(df):
+            try:
+                # Much faster than materializing the whole df
+                n = df.map_partitions(len).sum().compute()
+                return int(n) == 0
+            except Exception as e:
+                self.logger.error(f"Error computing Dask length: {e}")
+                return False
+        return df.empty
+
+    # -------------------------
+    # public API
+    # -------------------------
+    def extract_unique_values(self, df, *columns):
+        result: Dict[str, List] = {}
+        for col in columns:
+            if col not in df.columns:
+                result[col] = []
+                continue
+            vals = df[col].dropna()
+            # Prefer drop_duplicates over unique() for Dask robustness
+            if self._is_dask(vals):
+                vals = vals.drop_duplicates().compute()
+            else:
+                vals = vals.drop_duplicates()
+            result[col] = vals.tolist()
+        return result
+
+    def align_and_merge_by_type(self, df_left, df_right, type_mapping: Dict[str, Iterable[Tuple[str, str]]], how='left'):
+        """
+        Align dtypes for pairs of columns then merge on aligned pairs.
+        type_mapping example:
+            {
+                'integer': [('customer_id','temp1'), ('product_type_id','temp2')],
+                'string': [('group2','temp4')],
+                'datetime':[('ts','ts2')],
+                'boolean':[('is_ok','flag')]
+            }
         """
-        # Map string keys to actual dtypes
         dtype_map = {
            'integer': 'int64',
            'float': 'float64',
@@ -47,238 +91,174 @@ class DfUtils:
            'boolean': 'bool',
        }
 
-        #
-        for target_type,
+        # Cast columns as requested
+        for target_type, pairs in (type_mapping or {}).items():
            if target_type not in dtype_map:
                self.logger.error(f"Unsupported type: {target_type}")
-
-                for left_col, right_col in
-
-
-
-                    df_right
-
-
-
-
-
-
-
-
-
-
-
-
-
+                continue
+            for left_col, right_col in pairs:
+                if left_col in df_left.columns:
+                    self._astype_safe(df_left, left_col, dtype_map[target_type])
+                if right_col in df_right.columns:
+                    self._astype_safe(df_right, right_col, dtype_map[target_type])
+
+        all_pairs = [p for pairs in (type_mapping or {}).values() for p in pairs]
+        left_keys = [p[0] for p in all_pairs]
+        right_keys = [p[1] for p in all_pairs]
+
+        # Dask merge works fine if both are Dask; if mixed, coerce right to Dask for scalability.
+        if self._is_dask(df_left) and not self._is_dask(df_right):
+            df_right = dd.from_pandas(df_right, npartitions=max(1, df_left.npartitions))
+        if self._is_dask(df_right) and not self._is_dask(df_left):
+            df_left = dd.from_pandas(df_left, npartitions=max(1, df_right.npartitions))
+
+        return df_left.merge(df_right, how=how, left_on=left_keys, right_on=right_keys)
+
+    def exclude_from_dataframe(self, df, conditions: List[Tuple[str, str, object]]):
        """
-
-
-
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame to filter.
-        - conditions (list of tuples): List of conditions to apply for filtering.
-          Each condition is a tuple: (column_name, operator, value).
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Filtered DataFrame.
+        Filter rows out based on combined conditions (AND). Returns df[~combined].
+        conditions: list of (column, operator, value)
+        operators supported: ==, !=, <, <=, >, >=
        """
        import operator
+        ops = {"==": operator.eq, "!=": operator.ne, "<": operator.lt, "<=": operator.le, ">": operator.gt, ">=": operator.ge}
 
-
-        ops = {
-            "==": operator.eq,
-            "!=": operator.ne,
-            "<": operator.lt,
-            "<=": operator.le,
-            ">": operator.gt,
-            ">=": operator.ge,
-        }
-        # Ensure all specified columns exist in the DataFrame
-        missing_columns = [col for col, _, _ in conditions if col not in df.columns]
-        if missing_columns:
-            self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+        if not conditions:
            return df
 
-
-
-
+        missing = [c for c, _, _ in conditions if c not in df.columns]
+        if missing:
+            self.logger.debug(f"Missing columns in DataFrame: {', '.join(missing)}")
+            return df
+
+        combined = None
+        for col, op, val in conditions:
            if op not in ops:
                raise ValueError(f"Unsupported operator: {op}")
+            cond = ops[op](df[col], val)
+            combined = cond if combined is None else (combined & cond)
 
-
-
-
-            # Combine the condition with AND (&)
-            combined_condition = condition if combined_condition is None else (combined_condition & condition)
-
-        # Apply the filtering and return the DataFrame
-        return df[~combined_condition]
+        if combined is None:
+            return df
+        return df[~combined]
 
-
-
-
+    # ---- numeric/boolean casting
+    @staticmethod
+    def _transform_column(series, fill_value, dtype):
+        return pd.to_numeric(series, errors="coerce").fillna(fill_value).astype(dtype)
 
-
-
-
-
-
+    def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning("No columns specified.")
+            return df
+        columns = [c for c in columns if c in df.columns]
+        for col in columns:
+            if self._is_dask(df):
+                df[col] = df[col].map_partitions(self._transform_column, fill_value, dtype, meta=(col, dtype))
+            else:
+                df[col] = self._transform_column(df[col], fill_value, dtype)
+        return df
 
-
-
-
-        if debug:
-            self.logger.debug(f"Grouping by: {group_by_expr}")
+    # kept for backward-compat
+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=dtype)
 
-
-        return
+    def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
+        return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
 
-
+    # ---- duplicate handling
+    def eval_duplicate_removal(self, df, duplicate_expr, sort_field: str | None = None, keep='last', debug=False):
        """
-
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame from which duplicates are to be removed.
-            duplicate_expr (str or list): Column(s) to identify duplicates.
-            sort_field (str, optional): Column to sort by before dropping duplicates.
-            keep (str): Which duplicate to keep ('first' or 'last').
-            debug (bool): If True, logs duplicate rows.
-
-        Returns:
-            DataFrame: DataFrame with duplicates removed.
+        Drop duplicates. For Dask, uses its shuffle-based drop_duplicates.
+        If sort_field is provided, we avoid global sorts for Dask.
        """
        if duplicate_expr is None:
            return df
 
        if debug:
-
-
+            try:
+                dups = df[df.duplicated(subset=duplicate_expr)]
+                # Do not .compute() here; just log that duplicates exist in Dask
+                self.logger.debug(f"Duplicate rows based on {duplicate_expr}: (preview only)")
+                if not self._is_dask(dups):
+                    self.logger.debug(f"\n{dups}")
+            except Exception:
+                pass
 
        if sort_field:
-            if
-                self.logger.warning("Sorting a Dask DataFrame is expensive
-
+            if self._is_dask(df):
+                self.logger.warning("Sorting a Dask DataFrame is expensive; skipping global sort.")
+            else:
+                df = df.sort_values(sort_field)
 
-
-
-            df = df.drop_duplicates(subset=duplicate_expr, keep=keep
+        if self._is_dask(df):
+            # Let Dask handle the global de-dup with a shuffle under the hood
+            df = df.drop_duplicates(subset=duplicate_expr, keep=keep)
        else:
            df = df.drop_duplicates(subset=duplicate_expr, keep=keep)
-
        return df
 
    def load_latest(self, df, duplicate_expr, sort_field=None, debug=False):
-        """
-        Removes duplicates keeping the latest occurrence.
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            duplicate_expr (str or list): Column(s) to identify duplicates.
-            sort_field (str, optional): Column to sort by before dropping duplicates.
-            debug (bool): If True, logs duplicate rows.
-
-        Returns:
-            DataFrame: DataFrame with latest duplicates removed.
-        """
        return self.eval_duplicate_removal(df, duplicate_expr, sort_field=sort_field, keep='last', debug=debug)
 
    def load_earliest(self, df, duplicate_expr, sort_field=None, debug=False):
-        """
-        Removes duplicates keeping the earliest occurrence.
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            duplicate_expr (str or list): Column(s) to identify duplicates.
-            sort_field (str, optional): Column to sort by before dropping duplicates.
-            debug (bool): If True, logs duplicate rows.
-
-        Returns:
-            DataFrame: DataFrame with the earliest duplicates removed.
-        """
        return self.eval_duplicate_removal(df, duplicate_expr, sort_field=sort_field, keep='first', debug=debug)
 
-
-    def add_df_totals(df):
+    # ---- totals
+    def add_df_totals(self, df):
        """
-        Adds
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-
-        Returns:
-            DataFrame: DataFrame with total row and column added.
+        Adds totals; for Dask, this computes to pandas (be careful with large frames).
        """
-        if
-
+        if self._is_dask(df):
+            self.logger.warning("add_df_totals will compute to pandas; may be large.")
            col_totals = df.sum(numeric_only=True).compute()
            row_totals = df.sum(axis=1, numeric_only=True).compute()
-
-
-
-
+            pdf = df.compute()
+            pdf.loc['Total'] = col_totals
+            pdf['Total'] = row_totals
+            return pdf
        else:
            df.loc['Total'] = df.sum(numeric_only=True)
            df['Total'] = df.sum(axis=1, numeric_only=True)
-
+            return df
 
+    # ---- summarization / resampling
    def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
        """
-
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            summary_column (str or list): Column(s) for summarization.
-            values_column (str or list): Column(s) to aggregate.
-            rule (str): Resampling frequency (e.g., 'D' for daily).
-            agg_func (str or function): Aggregation function.
-
-        Returns:
-            DataFrame: Resampled pivot table.
-        """
-        if isinstance(df, dd.core.DataFrame):
-            # Implement Dask-compatible pivot and resample
-            self.logger.debug("Performing summarization with Dask DataFrame.")
-            # Ensure the index is a datetime for resampling
-            if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
-                self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")
-                df = df.set_index(dd.to_datetime(df.index))
-
-            # Group by index and summary columns
-            df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
-                agg_func).reset_index()
-
-            # Pivot the table
-            df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
-                                              aggfunc='sum').fillna(0)
-
-            # Resample
-            df_pivot.index = dd.to_datetime(df_pivot.index)
-            df_pivot = df_pivot.repartition(freq=rule)
-            df_resampled = df_pivot.map_partitions(lambda df: df.resample(rule).sum())
-
-            return df_resampled.compute()
-        else:
-            df_pivot = df.pivot_table(
-                index=df.index,
-                columns=summary_column,
-                values=values_column,
-                aggfunc=agg_func
-            ).fillna(0)
-            df_resampled = df_pivot.resample(rule).sum()
-            return df_resampled
-
-    @staticmethod
-    def summarize_and_resample_data(df, summary_columns, value_columns, rule='D', agg_func='count'):
-        """
-        Summarizes and resamples data.
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            summary_columns (str or list): Column(s) for summarization.
-            value_columns (str or list): Column(s) to aggregate.
-            rule (str): Resampling frequency.
-            agg_func (str or function): Aggregation function.
-
-        Returns:
-            DataFrame: Resampled pivot table.
+        For pandas: pivot+resample on DatetimeIndex.
+        For Dask: create time bins and aggregate in Dask, then (optionally) pivot in pandas.
        """
-
+        # pandas path
+        if not self._is_dask(df):
+            idx = df.index
+            if not isinstance(idx, pd.DatetimeIndex):
+                self.logger.warning("Index is not DatetimeIndex; converting from current index.")
+                df = df.copy()
+                df.index = pd.to_datetime(idx, errors="coerce")
+            pivot = df.pivot_table(index=df.index, columns=summary_column, values=values_column, aggfunc=agg_func).fillna(0)
+            return pivot.resample(rule).sum()
+
+        # Dask path
+        # 1) Build a datetime column from index (no global sort)
+        ddf = df
+        ddf = ddf.assign(_ts_bin=dd.to_datetime(ddf.index, errors="coerce"))
+
+        # 2) Bucket to rule using floor; do it per partition
+        def _floor_partition(pdf: pd.DataFrame, col: str, rule: str) -> pd.DataFrame:
+            out = pdf.copy()
+            out[col] = pd.to_datetime(out[col], errors="coerce")
+            out['_bin'] = out[col].dt.floor(rule)
+            return out
+
+        ddf = ddf.map_partitions(_floor_partition, col="_ts_bin", rule=rule, meta=dd.utils.make_meta(ddf))
+
+        # 3) Group in Dask on ['_bin', summary_column] and aggregate
+        grouped = ddf.groupby(['_bin', summary_column])[values_column].agg(agg_func).reset_index()
+
+        # 4) If you need a pivoted result, compute to pandas then pivot (Dask pivot_table is not generally supported)
+        gpdf = grouped.compute()
+        pivot = gpdf.pivot_table(index="_bin", columns=summary_column, values=values_column, aggfunc='sum').fillna(0)
+
+        # 5) Ensure regular resample (already bucketed; resampling is now cheap in pandas)
+        pivot.index = pd.to_datetime(pivot.index)
+        return pivot.asfreq(rule, fill_value=0)