sibi-dst 0.3.10__py3-none-any.whl → 0.3.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -164,7 +164,7 @@ class DataWrapper:
 
 date_range = self.generate_date_range()
 if self.show_progress:
- date_range = tqdm(date_range, desc="Evaluating update plan", unit="date")
+ date_range = tqdm(date_range, desc=f"Evaluating update plan {self.__class__.__name__}", unit="date")
 
 for current_date in date_range:
 folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
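The only functional change in this hunk is the tqdm progress-bar label: the update-plan pass now includes the wrapper's class name, which disambiguates output when several DataWrapper-style classes run back to back. A minimal sketch of the effect (SalesWrapper is a hypothetical class used only for illustration):

```python
from tqdm import tqdm


class SalesWrapper:
    """Hypothetical wrapper; only the progress-bar label matters here."""

    show_progress = True

    def evaluate(self, date_range):
        # 0.3.10 label: "Evaluating update plan"
        # 0.3.12 label: "Evaluating update plan SalesWrapper"
        return tqdm(date_range, desc=f"Evaluating update plan {self.__class__.__name__}", unit="date")


for _ in SalesWrapper().evaluate(range(3)):
    pass
```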
@@ -207,325 +207,7 @@ class DataWrapper:
 return update_plan_table
 
 
- # import datetime
- # from typing import Type, Any, Dict, Optional
- # import fsspec
- # import pandas as pd
- # from IPython.display import display
- #
- # from sibi_dst.utils import Logger
- # from tqdm import tqdm
- # from sibi_dst.utils import ParquetSaver
- #
- # class DataWrapper:
- # DEFAULT_MAX_AGE_MINUTES = 1440
- # DEFAULT_HISTORY_DAYS_THRESHOLD = 30
- #
- # def __init__(self,
- # dataclass: Type,
- # date_field: str,
- # data_path: str,
- # parquet_filename: str,
- # start_date: Any,
- # end_date: Any,
- # filesystem_type: str = "file",
- # filesystem_options: Optional[Dict] = None,
- # verbose: bool = False,
- # class_params: Optional[Dict] = None,
- # load_params: Optional[Dict] = None,
- # reverse_order: bool = False,
- # overwrite: bool = False,
- # ignore_missing: bool = False,
- # logger: Optional[Logger] = None,
- # max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
- # history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
- # show_progress: bool = False):
- # self.dataclass = dataclass
- # self.date_field = date_field
- # self.data_path = self.ensure_forward_slash(data_path)
- # self.parquet_filename = parquet_filename
- # self.filesystem_type = filesystem_type
- # self.filesystem_options = filesystem_options or {}
- # self.fs = fsspec.filesystem(filesystem_type, **self.filesystem_options)
- # self.verbose = verbose
- # self.class_params = class_params or {}
- # self.load_params = load_params or {}
- # self.reverse_order = reverse_order
- # self.overwrite = overwrite
- # self.ignore_missing = ignore_missing
- # self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
- # self.max_age_minutes = max_age_minutes
- # self.history_days_threshold = history_days_threshold
- # self.show_progress = show_progress
- #
- # self.start_date = self.convert_to_date(start_date)
- # self.end_date = self.convert_to_date(end_date)
- #
- #
- # def convert_to_date(self, date: Any) -> datetime.date:
- # try:
- # return datetime.datetime.strptime(date, '%Y-%m-%d').date() if isinstance(date, str) else date
- # except ValueError as e:
- # self.logger.error(f"Error converting {date} to datetime: {e}")
- # raise
- #
- # @staticmethod
- # def ensure_forward_slash(path: str) -> str:
- # return path if path.endswith('/') else path + '/'
- #
- # def generate_date_range(self):
- # step = -1 if self.reverse_order else 1
- # start, end = (self.end_date, self.start_date) if self.reverse_order else (self.start_date, self.end_date)
- # current_date = start
- # while current_date != end + datetime.timedelta(days=step):
- # yield current_date
- # current_date += datetime.timedelta(days=step)
- #
- # def process(self):
- # """Execute the update plan following the specified hierarchy."""
- # update_plan, update_plan_table = self.generate_update_plan_with_conditions()
- #
- # # Display the update plan table to the user
- #
- # display(update_plan_table)
- #
- # # Process files according to the hierarchy, considering only `update_required` dates
- # for category, description in [
- # ("overwrite", "Processing files due to overwrite=True"),
- # ("history_days", "Processing files within history_days_threshold"),
- # ("missing_files", "Processing missing files")
- # ]:
- # # Filter dates in the category where `update_required` is True
- # dates_to_process = update_plan_table[
- # (update_plan_table["update_category"] == category) & (update_plan_table["update_required"])
- # ]["date"].tolist()
- #
- # for current_date in tqdm(dates_to_process, desc=description, unit="date"):
- # self.process_date(current_date)
- #
- # def is_file_older_than(self, file_path: str, current_date: datetime.date) -> bool:
- # """
- # Check if a file is older than the specified max_age_minutes.
- # """
- # if not self.fs.exists(file_path):
- # return True # Treat missing files as old
- #
- # # Get the file modification time
- # file_modification_time = self.fs.info(file_path)['mtime']
- # file_modification_datetime = datetime.datetime.fromtimestamp(file_modification_time, tz=datetime.timezone.utc)
- #
- # # Get the current UTC time as a timezone-aware object
- # current_time = datetime.datetime.now(datetime.timezone.utc)
- #
- # # Calculate file age in seconds and minutes
- # file_age_seconds = (current_time - file_modification_datetime).total_seconds()
- # file_age_minutes = file_age_seconds / 60
- #
- # if self.verbose:
- # self.logger.info(
- # f"File {file_path} is {round(file_age_minutes, 2)} minutes old (threshold: {self.max_age_minutes} minutes)")
- #
- # # Check if the file date is within the history threshold
- # history_start_date = datetime.date.today() - datetime.timedelta(days=self.history_days_threshold)
- # within_history_threshold = current_date >= history_start_date
- #
- # # File is considered old if it exceeds max_age_minutes and is within the history threshold
- # return file_age_minutes > self.max_age_minutes and within_history_threshold
- #
- # def process_date(self, date: datetime.date):
- # """Process a specific date by regenerating data as necessary."""
- # folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
- # full_parquet_filename = f"{folder}{self.parquet_filename}"
- #
- # start_time = datetime.datetime.now()
- #
- # if self.verbose:
- # self.logger.info(f"Processing {full_parquet_filename}...")
- #
- # data_object = self.dataclass(**self.class_params)
- # #date_filter_params = {
- # # f'{self.date_field}__year': date.year,
- # # f'{self.date_field}__month': date.month,
- # # f'{self.date_field}__day': date.day
- # #}
- # df=data_object.load_period(dt_field=self.date_field, start=date, end=date)
- # #df = data_object.load(**self.load_params, **date_filter_params)
- #
- # if len(df.index) == 0:
- # if self.verbose:
- # self.logger.info("No data found for the specified date.")
- # return
- #
- # parquet_saver = ParquetSaver(df, folder, self.logger)
- # parquet_saver.save_to_parquet(self.parquet_filename, clear_existing=True)
- #
- # end_time = datetime.datetime.now()
- # duration_seconds = (end_time - start_time).total_seconds()
- #
- # if self.verbose:
- # self.logger.info(f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds")
- #
- #
- # def remove_empty_directories(self, path: str):
- # if not self.fs.isdir(path) or self.fs.abspath(path) == self.fs.abspath(self.data_path):
- # return
- #
- # if not self.fs.ls(path): # Check if directory is empty
- # try:
- # self.fs.rmdir(path)
- # if self.verbose:
- # self.logger.info(f"Removed empty directory: {path}")
- # self.remove_empty_directories(self.fs.path.dirname(path))
- # except Exception as e:
- # if self.verbose:
- # self.logger.error(f"Error removing directory {path}: {e}")
- # else:
- # if self.verbose:
- # self.logger.info(f"Directory not empty, stopping: {path}")
- #
- # def generate_update_plan_with_conditions(self):
- # """
- # Generate an update plan that evaluates files based on the specified hierarchy:
- # 1. Overwrite (all files regenerated).
- # 2. History threshold: Files within `history_days_threshold` are evaluated for `max_age_minutes`.
- # 3. Missing files: Detect missing files, ignoring future dates.
- # """
- # update_plan = {
- # "overwrite": [],
- # "history_days": [],
- # "missing_files": []
- # }
- # rows = []
- #
- # today = datetime.date.today()
- # history_start_date = today - datetime.timedelta(
- # days=self.history_days_threshold) if self.history_days_threshold else None
- #
- # for current_date in tqdm(self.generate_date_range(), desc="Evaluating update plan", unit="date"):
- # folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
- # full_parquet_filename = f"{folder}{self.parquet_filename}"
- #
- # file_exists = self.fs.exists(full_parquet_filename)
- # file_age_minutes = None # Initialize file_age_minutes as None
- # file_is_old = False
- # within_history = False
- # missing_file = not file_exists and not self.ignore_missing
- # category = None
- #
- # if file_exists:
- # # Calculate file age in minutes
- # file_modification_time = self.fs.info(full_parquet_filename)['mtime']
- # file_modification_datetime = datetime.datetime.fromtimestamp(file_modification_time,
- # tz=datetime.timezone.utc)
- # current_time = datetime.datetime.now(datetime.timezone.utc)
- # file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
- #
- # # Determine if the file is old
- # file_is_old = file_age_minutes > self.max_age_minutes
- #
- # # Determine if the file is within the history threshold
- # if self.history_days_threshold and history_start_date and history_start_date <= current_date <= today:
- # within_history = True
- #
- # # Hierarchy 1: Overwrite (all files are marked for regeneration)
- # if self.overwrite:
- # category = "overwrite"
- #
- # # Hierarchy 2: History threshold evaluation
- # elif within_history and (missing_file or file_is_old):
- # category = "history_days"
- #
- # # Hierarchy 3: Detect missing files, ignoring future dates
- # elif missing_file and current_date <= today:
- # category = "missing_files"
- #
- # # Append to update plan
- # if category:
- # update_plan[category].append(current_date)
- #
- # # Collect condition descriptions for the update plan table
- # rows.append({
- # "date": current_date,
- # "file_exists": file_exists,
- # "file_age_minutes": file_age_minutes, # Add file age to the table
- # "file_is_old": file_is_old,
- # "within_history": within_history,
- # "missing_file": missing_file,
- # "update_required": category is not None, # Mark as true if a category is assigned
- # "update_category": category
- # })
- #
- # # Sort dates in descending order if reverse_order is True
- # if self.reverse_order:
- # for key in update_plan:
- # update_plan[key].sort(reverse=True)
- #
- # update_plan_table = pd.DataFrame(rows)
- # return update_plan, update_plan_table
- # # def generate_update_plan_with_conditions(self):
- # # """
- # # Generate an update plan that evaluates files based on the specified hierarchy:
- # # 1. Overwrite (all files regenerated).
- # # 2. History threshold: Files within `history_days_threshold` are evaluated for `max_age_minutes`.
- # # 3. Missing files: Detect missing files, ignoring future dates.
- # # """
- # # update_plan = {
- # # "overwrite": [],
- # # "history_days": [],
- # # "missing_files": []
- # # }
- # # rows = []
- # #
- # # today = datetime.date.today()
- # # history_start_date = today - datetime.timedelta(days=self.history_days_threshold) if self.history_days_threshold else None
- # #
- # # for current_date in tqdm(self.generate_date_range(), desc="Evaluating update plan", unit="date"):
- # # folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
- # # full_parquet_filename = f"{folder}{self.parquet_filename}"
- # #
- # # file_exists = self.fs.exists(full_parquet_filename)
- # # file_is_old = file_exists and self.is_file_older_than(full_parquet_filename, current_date)
- # # within_history = False
- # # missing_file = not file_exists and not self.ignore_missing
- # # category = None
- # #
- # # # Hierarchy 1: Overwrite (all files are marked for regeneration)
- # # if self.overwrite:
- # # category = "overwrite"
- # #
- # # # Hierarchy 2: History threshold evaluation
- # # elif self.history_days_threshold and history_start_date and history_start_date <= current_date <= today:
- # # within_history = True
- # # if missing_file or self.is_file_older_than(full_parquet_filename, current_date):
- # # category = "history_days"
- # #
- # # # Hierarchy 3: Detect missing files, ignoring future dates
- # # elif missing_file and current_date <= today:
- # # category = "missing_files"
- # #
- # # # Append to update plan
- # # if category:
- # # update_plan[category].append(current_date)
- # #
- # # # Collect condition descriptions for the update plan table
- # # rows.append({
- # # "date": current_date,
- # # "file_exists": file_exists,
- # # "file_is_old": file_is_old,
- # # "within_history": within_history,
- # # "missing_file": missing_file,
- # # "update_required": category is not None,
- # # "update_category": category
- # # })
- # #
- # # # Sort dates in descending order if reverse_order is True
- # # if self.reverse_order:
- # # for key in update_plan:
- # # update_plan[key].sort(reverse=True)
- # #
- # # update_plan_table = pd.DataFrame(rows)
- # # return update_plan, update_plan_table
- #
+
 # # Usage:
 # # wrapper = DataWrapper(
 # # dataclass=YourDataClass,
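The deletion above removes a commented-out copy of the DataWrapper implementation (constructor, date-range generator, update-plan builder and processing loop), so this hunk changes no runtime behaviour in sibi_dst/utils/_data_wrapper.py (per the RECORD section below). For orientation, a hedged construction sketch based only on the parameter names visible in the removed comments and the surviving `# # Usage:` context; the import path, DummySource, the field name and the paths are assumptions:

```python
import datetime

import pandas as pd

from sibi_dst.utils import DataWrapper  # assumed re-export; the module is sibi_dst/utils/_data_wrapper.py


class DummySource:
    """Hypothetical data class; DataWrapper instantiates it and asks it for one day's rows at a time."""

    def load_period(self, dt_field, start, end):
        return pd.DataFrame(columns=[dt_field])  # empty frame: the wrapper simply skips the date


wrapper = DataWrapper(
    dataclass=DummySource,
    date_field="created_at",            # assumed date field name
    data_path="/tmp/sibi_dst_demo/",    # assumed output root; one folder per year/month/day
    parquet_filename="data.parquet",
    start_date=datetime.date(2024, 1, 1),
    end_date=datetime.date(2024, 1, 7),
    show_progress=True,
)
wrapper.process()  # evaluates the update plan, then regenerates stale or missing daily parquet files
```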
@@ -1,5 +1,7 @@
 import datetime
 from typing import Union, Tuple, Callable, Dict, Any
+
+ import numpy as np
 import pandas as pd
 from sibi_dst.utils import Logger
 
@@ -118,6 +120,134 @@ class DateUtils:
 'ytd': lambda: (datetime.date(today().year, 1, 1), today()),
 }
 
+ class BusinessDays:
+ def __init__(self, holiday_list, logger):
+ """
+ Initialize a BusinessDays object with a given holiday list.
+ """
+ self.logger = logger
+ self.HOLIDAY_LIST = holiday_list
+ bd_holidays = [day for year in self.HOLIDAY_LIST for day in self.HOLIDAY_LIST[year]]
+ self.bd_cal = np.busdaycalendar(holidays=bd_holidays, weekmask="1111100")
+ self.holidays = self.bd_cal.holidays
+ self.week_mask = self.bd_cal.weekmask
+
+ def get_business_days_count(self, begin_date, end_date):
+ """
+ Calculate the number of business days between two dates.
+ """
+ try:
+ begin_date = pd.to_datetime(begin_date)
+ end_date = pd.to_datetime(end_date)
+ except Exception as e:
+ raise ValueError(f"Invalid date format: {e}")
+
+ years = [str(year) for year in range(begin_date.year, end_date.year + 1)]
+ if not all(year in self.HOLIDAY_LIST for year in years):
+ raise ValueError("Not all years in date range are in the holiday list")
+
+ return np.busday_count(
+ begin_date.strftime("%Y-%m-%d"),
+ end_date.strftime("%Y-%m-%d"),
+ busdaycal=self.bd_cal,
+ )
+
+ def calc_business_days_from_df(self, df, begin_date_col, end_date_col, result_col="business_days"):
+ """
+ Add a column to a Dask DataFrame with the number of business days between two date columns.
+ """
+ if not all(col in df.columns for col in [begin_date_col, end_date_col]):
+ self.logger.error("Column names not found in DataFrame")
+ raise ValueError("Required columns are missing")
+
+ # Extract holidays and weekmask to recreate the busdaycalendar
+ holidays = self.bd_cal.holidays
+ weekmask = self.bd_cal.weekmask
+
+ # Define a function to calculate business days
+ def calculate_business_days(row, holidays, weekmask):
+ begin_date = pd.to_datetime(row[begin_date_col])
+ end_date = pd.to_datetime(row[end_date_col])
+ busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
+ return np.busday_count(
+ begin_date.strftime("%Y-%m-%d"),
+ end_date.strftime("%Y-%m-%d"),
+ busdaycal=busdaycal,
+ )
+
+ # Define a wrapper function for partition-wise operations
+ def apply_business_days(partition, holidays, weekmask):
+ return partition.apply(
+ calculate_business_days, axis=1, holidays=holidays, weekmask=weekmask
+ )
+
+ # Apply the function using map_partitions
+ df[result_col] = df.map_partitions(
+ apply_business_days,
+ holidays,
+ weekmask,
+ meta=(result_col, "int64"),
+ )
+
+ return df
+
+ def add_business_days(self, start_date, n_days):
+ """
+ Add n_days business days to start_date.
+ """
+ try:
+ start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
+ except ValueError:
+ raise ValueError("Date should be a string in the format YYYY-MM-DD")
+
+ if str(start_date.year) not in self.HOLIDAY_LIST:
+ self.logger.warning(f"Year {start_date.year} is not in the holiday list")
+
+ return np.busday_offset(
+ start_date.strftime("%Y-%m-%d"),
+ n_days,
+ roll="forward",
+ busdaycal=self.bd_cal,
+ )
+
+ def calc_sla_end_date(self, df, start_date_col, n_days_col, result_col="sla_end_date"):
+ """
+ Add a column to a Dask DataFrame with SLA end dates based on start date and SLA days.
+ """
+ if not all(col in df.columns for col in [start_date_col, n_days_col]):
+ raise ValueError("Column names not found in DataFrame")
+
+ # Extract holidays and weekmask to recreate the busdaycalendar
+ holidays = self.bd_cal.holidays
+ weekmask = self.bd_cal.weekmask
+
+ # Define a function to calculate SLA end dates
+ def calculate_sla_end_date(row, holidays, weekmask):
+ start_date = pd.to_datetime(row[start_date_col])
+ n_days = row[n_days_col]
+ busdaycal = np.busdaycalendar(holidays=holidays, weekmask=weekmask)
+ return np.busday_offset(
+ start_date.strftime("%Y-%m-%d"),
+ n_days,
+ roll="forward",
+ busdaycal=busdaycal,
+ )
+
+ # Define a wrapper for partition-wise operation
+ def apply_sla_end_date(partition, holidays, weekmask):
+ return partition.apply(
+ calculate_sla_end_date, axis=1, holidays=holidays, weekmask=weekmask
+ )
+
+ # Apply the function using map_partitions
+ df[result_col] = df.map_partitions(
+ apply_sla_end_date,
+ holidays,
+ weekmask,
+ meta=(result_col, "object"),
+ )
+
+ return df
 # Class enhancements
 # DateUtils.register_period('next_week', lambda: (datetime.date.today() + datetime.timedelta(days=7),
 # datetime.date.today() + datetime.timedelta(days=13)))
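The new BusinessDays helper added to _date_utils.py wraps numpy's business-day calendar (weekmask "1111100" plus a per-year holiday dictionary) and exposes scalar helpers alongside Dask map_partitions variants (calc_business_days_from_df, calc_sla_end_date). A short usage sketch of the scalar helpers, assuming the class is defined at module level in sibi_dst/utils/_date_utils.py (the import path and the holiday dates are assumptions):

```python
from sibi_dst.utils import Logger
from sibi_dst.utils._date_utils import BusinessDays  # assumed import path for the new class

holiday_list = {
    "2024": ["2024-01-01", "2024-12-25"],
    "2025": ["2025-01-01"],
}
bd = BusinessDays(holiday_list, Logger.default_logger(logger_name="business_days"))

# np.busday_count semantics: the end date is exclusive; weekends and listed holidays are skipped.
print(bd.get_business_days_count("2024-01-02", "2024-01-31"))  # 21

# np.busday_offset with roll="forward": lands on 2024-12-27 because 2024-12-25 is a listed holiday.
print(bd.add_business_days("2024-12-23", 3))
```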
@@ -12,6 +12,97 @@ class DfUtils:
 """
 self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
 
+ def align_and_merge_by_type(self, df_left, df_right, type_mapping, how='left'):
+ """
+ Align column data types in two DataFrames based on a type mapping dictionary and perform the merge.
+
+ Parameters:
+ - df_left (pd.DataFrame or dd.DataFrame): Left DataFrame
+ - df_right (pd.DataFrame or dd.DataFrame): Right DataFrame
+ - type_mapping (dict): Dictionary mapping target dtypes to column pairs.
+ Example: {
+ 'integer': [('customer_id', 'temp1'), ('product_type_id', 'temp2')],
+ 'string': [('group2', 'temp4')]
+ }
+
+ Returns:
+ - Merged DataFrame
+ """
+ # Map string keys to actual dtypes
+ dtype_map = {
+ 'integer': 'int64',
+ 'float': 'float64',
+ 'string': 'string',
+ 'datetime': 'datetime64[ns]',
+ 'boolean': 'bool',
+ }
+
+ # Iterate over each dtype and align the column pairs
+ for target_type, column_pairs in type_mapping.items():
+ if target_type not in dtype_map:
+ self.logger.error(f"Unsupported type: {target_type}")
+
+ for left_col, right_col in column_pairs:
+ # Align dtypes in left and right DataFrames
+ if left_col in df_left.columns and right_col in df_right.columns:
+ df_left[left_col] = df_left[left_col].astype(dtype_map[target_type])
+ df_right[right_col] = df_right[right_col].astype(dtype_map[target_type])
+
+ # Flatten all column pairs for the merge operation
+ all_pairs = [pair for pairs in type_mapping.values() for pair in pairs]
+
+ # Perform the merge
+ return df_left.merge(
+ df_right,
+ how=how,
+ left_on=[pair[0] for pair in all_pairs],
+ right_on=[pair[1] for pair in all_pairs]
+ )
+
+ def exclude_from_dataframe(self, df, conditions):
+ """
+ Generic function to filter rows from a DataFrame (Pandas or Dask).
+
+ Parameters:
+ - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame to filter.
+ - conditions (list of tuples): List of conditions to apply for filtering.
+ Each condition is a tuple: (column_name, operator, value).
+
+ Returns:
+ - pandas.DataFrame or dask.dataframe.DataFrame: Filtered DataFrame.
+ """
+ import operator
+
+ # Mapping string operators to actual Python operators
+ ops = {
+ "==": operator.eq,
+ "!=": operator.ne,
+ "<": operator.lt,
+ "<=": operator.le,
+ ">": operator.gt,
+ ">=": operator.ge,
+ }
+ # Ensure all specified columns exist in the DataFrame
+ missing_columns = [col for col, _, _ in conditions if col not in df.columns]
+ if missing_columns:
+ self.logger.info(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+ return df
+
+ # Build the combined filtering condition
+ combined_condition = None
+ for col, op, value in conditions:
+ if op not in ops:
+ raise ValueError(f"Unsupported operator: {op}")
+
+ # Get the individual condition
+ condition = ops[op](df[col], value)
+
+ # Combine the condition with AND (&)
+ combined_condition = condition if combined_condition is None else (combined_condition & condition)
+
+ # Apply the filtering and return the DataFrame
+ return df[~combined_condition]
+
 def load_grouped_activity(self, df, group_by_expr, group_expr='count', debug=False):
 """
 Groups the DataFrame by the specified expression and computes the size.
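DfUtils gains two helpers: align_and_merge_by_type, which casts paired join columns to a common dtype before merging, and exclude_from_dataframe, which drops rows matching a list of (column, operator, value) conditions combined with AND. A brief sketch against plain pandas frames; the import path, the no-argument DfUtils() construction and the sample data are assumptions:

```python
import pandas as pd

from sibi_dst.utils import DfUtils  # assumed re-export; the class lives in sibi_dst/utils/_df_utils.py

utils = DfUtils()  # assumed: the logger argument defaults as the context line above shows

orders = pd.DataFrame({
    "status": ["open", "closed", "open", "cancelled"],
    "amount": [100, 250, 0, 75],
})

# Drop rows where status == "cancelled" AND amount < 100; everything else is kept.
kept = utils.exclude_from_dataframe(orders, [("status", "==", "cancelled"), ("amount", "<", 100)])

# Cast the join keys to int64 on both sides, then left-merge on them.
merged = utils.align_and_merge_by_type(
    df_left=pd.DataFrame({"customer_id": ["1", "2"]}),
    df_right=pd.DataFrame({"temp1": [1, 2], "name": ["a", "b"]}),
    type_mapping={"integer": [("customer_id", "temp1")]},
)
```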
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: sibi-dst
- Version: 0.3.10
- Summary:
+ Version: 0.3.12
+ Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
 Requires-Python: >=3.11,<4.0
@@ -9,6 +9,8 @@ Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
+ Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
 Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
 Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
 Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
@@ -29,6 +31,7 @@ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
 Requires-Dist: sqlmodel (>=0.0.22,<0.0.23)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
+ Requires-Dist: uvicorn (>=0.32.1,<0.33.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst
@@ -1,45 +1,45 @@
 sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
 sibi_dst/df_helper/__init__.py,sha256=JXJBY47G6wOYhzNI646OBl3pSGWIy4282-3qPGYHU7w,167
- sibi_dst/df_helper/_df_helper.py,sha256=Pvu1kByZhUCAY9LGKFrcyasTq1MeeIBeMoeCgScStPM,12507
+ sibi_dst/df_helper/_df_helper.py,sha256=43-eY9mDU-j-QFeAtdMjIb3KuC2_hYzLjVi177_EKAo,13006
 sibi_dst/df_helper/_parquet_artifact.py,sha256=f5oHwXtsNW6-ONSFsRB0AniVefA0THzP92J-nugp9vo,4973
- sibi_dst/df_helper/core/__init__.py,sha256=UXGUGGSjjrcJRrs25zPV-xgJoyYy1WjVQAExcJDWgV0,254
- sibi_dst/df_helper/core/_defaults.py,sha256=AVNT_Vk8K7dLKOnPX_-Cygi-Nuku65CIn0baE0Wn6dI,1877
+ sibi_dst/df_helper/core/__init__.py,sha256=NSYY_evzq6XEkO06Nz6xLH5KznzRGI44cLbrnN3zHXQ,503
+ sibi_dst/df_helper/core/_defaults.py,sha256=pJU-lX7w4nrt0Anx35j08mVr_0oMGn1bTA_iCl_p1qI,6700
 sibi_dst/df_helper/core/_params_config.py,sha256=hO-PddoaGjFebqJFgtn76WwVHcCjzPW3z5i3NyK6mDw,3475
 sibi_dst/df_helper/core/_query_config.py,sha256=HEiyR_fBJjIMum-PSQroY3KaefQ2SpW1w1SQS8oT-NU,489
 sibi_dst/df_helper/plugins/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/plugins/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
 sibi_dst/df_helper/plugins/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
- sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=ZWVwJCJY7xmlZwDCZG3vNlEMyTGKJ8CoEtwgKYX0ofQ,2918
+ sibi_dst/df_helper/plugins/django/_django_load_from_db.py,sha256=NrkBb5LM1A_vo3wAotqj2sVVYIuTfFGrQqIXk3xOoDs,5162
 sibi_dst/df_helper/plugins/django/_django_sql_model_builder.py,sha256=GprCh2c6PFlRBUCir4hh8pmf4Eqb-4OIK6Vz_xXQjMw,14864
 sibi_dst/df_helper/plugins/django/_io_dask.py,sha256=jryDojeA62rB3seRaWWMjsAmekKacK5xctwCQGVklPQ,9063
 sibi_dst/df_helper/plugins/django/_io_dask_alt.py,sha256=zDjLyYxBeL0ffn3yfE_7vqMLMpeEEk2o-zMr66sKkDw,6827
 sibi_dst/df_helper/plugins/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGnzWsfKgNLgPpSXjY,102
- sibi_dst/df_helper/plugins/http/_http_config.py,sha256=TaoI0F5S-Gf9jiWJp3ngQZTw2jlks-_WNDzKX1Wybtc,2165
+ sibi_dst/df_helper/plugins/http/_http_config.py,sha256=WH0d4vsxfZRhWrWI4iTVAnhsdY3421SBr9kXYZVfeYQ,2126
 sibi_dst/df_helper/plugins/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
- sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=45mHID1azAg5PmaYWbuRlghoRd3H2aTLj1XcycfLJo0,3497
- sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=cKyRj0UCby9-iYPPFnlel1H03x8MnAoEv8k1tp7kHXw,4277
+ sibi_dst/df_helper/plugins/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
+ sibi_dst/df_helper/plugins/parquet/_parquet_options.py,sha256=suJC7LfNEWAo-7_R62YTMSRku3k8orysft83VxRUems,4394
 sibi_dst/df_helper/plugins/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
- sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=ET9cih0Frc5izMOsdvNlLhjJWtUQbwZhRtsdo5dRckQ,5059
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=UXcZ1shS7shcjkSqIIduAnb1Lhzc6pZ6NEcbkcnwgWk,4606
+ sibi_dst/df_helper/plugins/sql_alchemy/_io_sqlalchemy_dask.py,sha256=6IjQEREXqTAzSJE95FKfXjRkTlEjRMS4hJ_yMpyKDTg,5223
+ sibi_dst/df_helper/plugins/sql_alchemy/_sqlachemy_filter_handler.py,sha256=H8ypUjLKzYYl9BerfJjX_Uv9qBVkBR-wZiQlh3uRQXg,4669
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
- sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=f1zqBISVn8OjZJs8hu6IvRZSwMX7_DIZMIbhxV6uV80,3179
+ sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=myrtEzK12DvA73x7QFaqXFb_TxOPMrsVj-mxYHJD2dg,2371
 sibi_dst/df_helper/plugins/sql_alchemy/_sqlalchemy_model_builder.py,sha256=vrTTeFcrf8sFpzqLlQfspjduKuR7Wgn7mDPUASsQs6s,4355
 sibi_dst/df_helper/plugins/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
 sibi_dst/df_helper/plugins/sql_model/_sqlmodel_load_from_db.py,sha256=bLD4tEcGDKkJCfSO4b13_89tzVJcpz55I6uw9D4ERnE,3751
- sibi_dst/utils/__init__.py,sha256=jiXJSnmsaGZTRhUThtIo6cssWXBWXNij8ffYmv77QK4,797
+ sibi_dst/utils/__init__.py,sha256=nkX7tASNn57kw998YdqQQGY8qXv2J4LC4-g0GoQSiic,831
 sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3fVwNZk,7524
 sibi_dst/utils/_clickhouse_writer.py,sha256=mdgszbyVluhGvDmvsHY4XDTZrp42L3xtdmiyn3z2bYM,8534
 sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
- sibi_dst/utils/_data_utils.py,sha256=XB0xjW2LbCmoZjgDbNQQpWaf4upmSoTXeJZ3QMVqbsQ,7056
- sibi_dst/utils/_data_wrapper.py,sha256=pZnylBFTvsLGfYGv2tTyQHzyb6IbIahfaXR-PxHdivk,24099
- sibi_dst/utils/_date_utils.py,sha256=6HCrcTiuYLNsbgrNB3eAVAAgXbfx7Ce1qNc3OJla9nM,5621
- sibi_dst/utils/_df_utils.py,sha256=o2bK5-xMGKqIG4i9xfavYRxIkiHLA0nz5TQTN78998k,7350
+ sibi_dst/utils/_data_utils.py,sha256=BvmjMNSkICy671BmjW68RhvDMfN5uAXwhffSV-wEwmk,9185
+ sibi_dst/utils/_data_wrapper.py,sha256=SmNv1UoZLq7ovRVy4wipsWLMidKJXcRTp4HtxmaCQdk,9399
+ sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
+ sibi_dst/utils/_df_utils.py,sha256=9_dNYoZ9_ofU0t_sxMdsXALWCuh02gvqUrei-6Lhr6w,10910
 sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
 sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixWnn_68_n0,6686
 sibi_dst/utils/_log_utils.py,sha256=AAenyubYUjk77WqiaNkjgkxws3dnAMIdaGl2Ryz_cA4,2245
 sibi_dst/utils/_parquet_saver.py,sha256=-A0o_vucyYe7wlwiby_0_yS-ZfT2GHwImyQHrCIBNwk,9051
 sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
- sibi_dst-0.3.10.dist-info/METADATA,sha256=lrVYU1PPBuHQrEDl_-SURTkE0ip_0xWsJc58AiihHZs,1877
- sibi_dst-0.3.10.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- sibi_dst-0.3.10.dist-info/RECORD,,
+ sibi_dst-0.3.12.dist-info/METADATA,sha256=5mezOBAiUV2pMgNsVqI7iCZZgmxeZpLuYWDYUAZCTVk,2030
+ sibi_dst-0.3.12.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+ sibi_dst-0.3.12.dist-info/RECORD,,