sibi-dst 2025.8.2__tar.gz → 2025.8.3__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (84)
  1. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/PKG-INFO +1 -1
  2. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/pyproject.toml +1 -1
  3. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_df_helper.py +25 -6
  4. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_filter_handler.py +116 -37
  5. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/README.md +0 -0
  6. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/__init__.py +0 -0
  7. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/__init__.py +0 -0
  8. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  9. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  10. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  11. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  12. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/__init__.py +0 -0
  13. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  14. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  15. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  16. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  17. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  18. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  19. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  20. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  21. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  22. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  23. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  24. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  25. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/__init__.py +0 -0
  26. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_defaults.py +0 -0
  27. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_params_config.py +0 -0
  28. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/core/_query_config.py +0 -0
  29. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/df_helper/data_cleaner.py +0 -0
  30. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/__init__.py +0 -0
  31. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  32. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/geopy_helper/utils.py +0 -0
  33. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/__init__.py +0 -0
  34. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  35. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  36. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  37. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  38. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  39. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  40. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/osmnx_helper/utils.py +0 -0
  41. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/tests/__init__.py +0 -0
  42. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  43. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/__init__.py +0 -0
  44. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/base.py +0 -0
  45. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/business_days.py +0 -0
  46. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/clickhouse_writer.py +0 -0
  47. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/credentials.py +0 -0
  48. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/data_from_http_source.py +0 -0
  49. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/data_utils.py +0 -0
  50. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/data_wrapper.py +0 -0
  51. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/date_utils.py +0 -0
  52. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/df_utils.py +0 -0
  53. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/file_age_checker.py +0 -0
  54. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/file_utils.py +0 -0
  55. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/filepath_generator.py +0 -0
  56. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/log_utils.py +0 -0
  57. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/manifest_manager.py +0 -0
  58. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/parquet_saver.py +0 -0
  59. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/periods.py +0 -0
  60. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/phone_formatter.py +0 -0
  61. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/storage_config.py +0 -0
  62. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/storage_manager.py +0 -0
  63. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/update_planner.py +0 -0
  64. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/utils/webdav_client.py +0 -0
  65. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/__init__.py +0 -0
  66. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/__init__.py +0 -0
  67. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  68. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  69. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  70. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  71. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  72. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  73. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  74. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  75. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  76. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  77. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  78. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  79. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  80. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  81. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  82. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  83. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/utils/__init__.py +0 -0
  84. {sibi_dst-2025.8.2 → sibi_dst-2025.8.3}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.8.2
3
+ Version: 2025.8.3
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "2025.8.2"
3
+ version = "2025.8.3"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -55,23 +55,42 @@ class ParquetBackend(BaseBackend):
55
55
  def load(self, **options):
56
56
  try:
57
57
  df = self.helper.backend_parquet.load_files()
58
- if len(df.head(1)) == 0:
59
- return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
58
+ if self._is_empty(df):
59
+ return -1, self._empty_like(df)
60
60
 
61
61
  if options and df is not None:
62
62
  df = FilterHandler("dask", logger=self.logger, debug=False).apply_filters(df, filters=options)
63
- if len(df.head(1)) == 0:
63
+ nrows = self._row_count(df)
64
+ if nrows == 0:
64
65
  self.logger.debug("No records after filters; returning empty DataFrame.")
65
- return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
66
+ return 0, self._empty_like(df)
66
67
 
67
68
  df = df.persist()
68
- self.total_records = len(df) or -1
69
+ self.total_records = self._row_count(df) or -1
69
70
  return self.total_records, df
71
+
70
72
  except Exception as e:
71
- self.total_records = -1
73
+ self.total_records = -1 # Reset total_records on failure
72
74
  self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
73
75
  return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
74
76
 
77
+ def _is_empty(self, ddf) -> bool:
78
+ """True if no rows across all partitions."""
79
+ try:
80
+ # head with npartitions=-1 walks partitions until it gets n rows
81
+ return ddf.head(1, npartitions=-1).shape[0] == 0
82
+ except Exception:
83
+ return True
84
+
85
+ def _row_count(self, ddf) -> int:
86
+ """Reliable row count for Dask DataFrame."""
87
+ return int(ddf.map_partitions(len).sum().compute())
88
+
89
+ def _empty_like(self, ddf):
90
+ """Return an empty Dask DF with the SAME columns/dtypes."""
91
+ empty_pdf = ddf._meta.iloc[0:0]
92
+ return dd.from_pandas(empty_pdf, npartitions=1)
93
+
75
94
 
76
95
  class HttpBackend(BaseBackend):
77
96
  def load(self, **options):
@@ -89,20 +89,16 @@ class FilterHandler:
89
89
  return field_name, casting, operation
90
90
 
91
91
  def _parse_filter_value(self, casting, value):
92
- """
93
- Convert filter value to appropriate type based on the casting (e.g., date).
94
- """
95
92
  if casting == "date":
96
93
  if isinstance(value, str):
97
- parsed = pd.Timestamp(value) # Convert to datetime64[ns]
98
- return parsed
94
+ return pd.Timestamp(value)
95
+ if isinstance(value, list):
96
+ return [pd.Timestamp(v) for v in value]
97
+ elif casting == "time":
98
+ # convert to seconds since midnight
99
99
  if isinstance(value, list):
100
- parsed = [pd.Timestamp(v) for v in value]
101
- return parsed
102
- elif casting == "time" and isinstance(value, str):
103
- parsed = datetime.time.fromisoformat(value)
104
- self.logger.debug(f"Parsed value (time): {parsed}")
105
- return parsed
100
+ return [self._time_to_seconds(v) for v in value]
101
+ return self._time_to_seconds(value)
106
102
  return value
107
103
 
108
104
  @staticmethod
@@ -153,24 +149,20 @@ class FilterHandler:
153
149
 
154
150
  @staticmethod
155
151
  def _get_dask_column(df, field_name, casting):
156
- """
157
- Retrieve and optionally cast a column for Dask based on the field name and casting.
152
+ needs_dt = casting in (FilterHandler._dt_operators() + FilterHandler._date_operators())
153
+ column = dd.to_datetime(df[field_name], errors="coerce") if needs_dt else df[field_name]
158
154
 
159
- Args:
160
- df: The Dask DataFrame.
161
- field_name: The name of the field/column in the DataFrame.
162
- casting: The casting type ('date', 'time', etc.).
163
-
164
- Returns:
165
- The Dask Series object, optionally cast or transformed.
166
- """
167
- column = dd.to_datetime(df[field_name], errors="coerce") if casting in FilterHandler._dt_operators() else df[
168
- field_name]
155
+ if needs_dt:
156
+ column = FilterHandler._strip_tz(column)
169
157
 
170
158
  if casting == "date":
171
- column = column.dt.floor("D") # Ensure truncation to the date level
159
+ column = column.dt.floor("D")
160
+ elif casting == "time":
161
+ # compare as "seconds since midnight"
162
+ column = (column.dt.hour * 3600 + column.dt.minute * 60 + column.dt.second)
172
163
  elif casting in FilterHandler._date_operators():
173
- column = getattr(column.dt, casting)
164
+ attr = "weekday" if casting == "week_day" else casting
165
+ column = getattr(column.dt, attr)
174
166
 
175
167
  return column
176
168
 
@@ -221,23 +213,61 @@ class FilterHandler:
221
213
  "gte": lambda col, val: col >= val,
222
214
  "lt": lambda col, val: col < val,
223
215
  "lte": lambda col, val: col <= val,
224
- "in": lambda col, val: col.isin(val),
216
+
217
+ # <-- type-safe "in" and "not_in"
218
+ "in": lambda col, val: FilterHandler._align_in_types(col, val)[0].isin(
219
+ FilterHandler._align_in_types(col, val)[1]),
220
+ "not_in": lambda col, val: ~FilterHandler._align_in_types(col, val)[0].isin(
221
+ FilterHandler._align_in_types(col, val)[1]),
222
+
225
223
  "range": lambda col, val: (col >= val[0]) & (col <= val[1]),
226
- "contains": lambda col, val: col.str.contains(val, regex=True),
227
- "startswith": lambda col, val: col.str.startswith(val),
228
- "endswith": lambda col, val: col.str.endswith(val),
224
+
225
+ # robust string ops (dtype-agnostic)
226
+ "contains": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
227
+ "startswith": lambda col, val: FilterHandler._as_str(col).str.startswith(val, na=False),
228
+ "endswith": lambda col, val: FilterHandler._as_str(col).str.endswith(val, na=False),
229
+ "not_contains": lambda col, val: ~FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
230
+ "regex": lambda col, val: FilterHandler._as_str(col).str.contains(val, regex=True, na=False),
231
+ "icontains": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True,
232
+ na=False),
233
+ "istartswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.startswith(str(val).lower(),
234
+ na=False),
235
+ "iendswith": lambda col, val: FilterHandler._as_str(col).str.lower().str.endswith(str(val).lower(),
236
+ na=False),
237
+ "iexact": lambda col, val: FilterHandler._as_str(col).str.lower() == str(val).lower(),
238
+ "iregex": lambda col, val: FilterHandler._as_str(col).str.contains(val, case=False, regex=True, na=False),
239
+
229
240
  "isnull": lambda col, val: col.isnull() if val else col.notnull(),
230
241
  "not_exact": lambda col, val: col != val,
231
- "not_contains": lambda col, val: ~col.str.contains(val, regex=True),
232
- "not_in": lambda col, val: ~col.isin(val), # Custom operation
233
- "regex": lambda col, val: col.str.contains(val, regex=True), # Custom operation
234
- "icontains": lambda col, val: col.str.contains(val, case=False, regex=True), # Custom operation
235
- "istartswith": lambda col, val: col.str.startswith(val, case=False), # Custom operation
236
- "iendswith": lambda col, val: col.str.endswith(val, case=False), # Custom operation
237
- "iexact": lambda col, val: col.str.contains(f"^{val}$", case=False, regex=True), # Added iexact
238
- "iregex": lambda col, val: col.str.contains(val, case=False, regex=True), # Added iregex
239
242
  }
240
243
 
244
+ @staticmethod
245
+ def _as_str(col):
246
+ # Force a reliable string view (works with object, categorical, etc.)
247
+ return col.astype("string").fillna("")
248
+
249
+ @staticmethod
250
+ def _strip_tz(col):
251
+ # Make tz-aware datetimes naive so they compare to tz-naive filter values
252
+ import pandas as pd
253
+ def _part(s: pd.Series) -> pd.Series:
254
+ try:
255
+ return s.dt.tz_convert("UTC").dt.tz_localize(None)
256
+ except Exception:
257
+ try:
258
+ return s.dt.tz_localize(None)
259
+ except Exception:
260
+ return s
261
+
262
+ return col.map_partitions(_part, meta=col._meta)
263
+
264
+ @staticmethod
265
+ def _time_to_seconds(t):
266
+ # t can be datetime.time or a "HH:MM[:SS]" str
267
+ if isinstance(t, str):
268
+ t = datetime.time.fromisoformat(t)
269
+ return t.hour * 3600 + t.minute * 60 + t.second
270
+
241
271
  @staticmethod
242
272
  def _dt_operators():
243
273
  return ["date", "time"]
@@ -255,3 +285,52 @@ class FilterHandler:
255
285
  "regex", "icontains", "istartswith", "iendswith",
256
286
  "iexact", "iregex"
257
287
  ]
288
+
289
+ @staticmethod
290
+ def _align_in_types(col, val):
291
+ """
292
+ Return (coerced_col, coerced_values) with compatible dtypes
293
+ so that .isin(...) behaves as expected across partitions.
294
+ """
295
+ # normalize val to a list
296
+ if isinstance(val, (set, tuple)):
297
+ vals = list(val)
298
+ elif isinstance(val, list):
299
+ vals = val
300
+ else:
301
+ vals = [val]
302
+
303
+ # try numeric alignment first if column is numeric-like
304
+ kind = getattr(getattr(col, "dtype", None), "kind", None)
305
+ if kind in ("i", "u"): # integer
306
+ def to_ints(xs):
307
+ out = []
308
+ for x in xs:
309
+ try:
310
+ out.append(int(x))
311
+ except Exception:
312
+ # if any value can't be int, fall back to strings below
313
+ return None
314
+ return out
315
+
316
+ ints = to_ints(vals)
317
+ if ints is not None:
318
+ # nullable Int64 handles missing values
319
+ return col.astype("Int64"), ints
320
+
321
+ if kind in ("f",): # float
322
+ def to_floats(xs):
323
+ out = []
324
+ for x in xs:
325
+ try:
326
+ out.append(float(x))
327
+ except Exception:
328
+ return None
329
+ return out
330
+
331
+ flts = to_floats(vals)
332
+ if flts is not None:
333
+ return col.astype("float64"), flts
334
+
335
+ # fallback: compare as strings (robust across object/categorical/mixed)
336
+ return FilterHandler._as_str(col), [str(x) for x in vals]
File without changes