sibi-dst 2025.9.6__tar.gz → 2025.9.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/PKG-INFO +1 -1
  2. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/pyproject.toml +1 -1
  3. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/__init__.py +2 -0
  4. sibi_dst-2025.9.7/sibi_dst/utils/boilerplate/base_pipeline.py +178 -0
  5. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/README.md +0 -0
  6. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/__init__.py +0 -0
  7. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/__init__.py +0 -0
  8. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  9. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  10. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_df_helper.py +0 -0
  11. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  12. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  13. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/__init__.py +0 -0
  14. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  15. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  16. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  17. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  18. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  19. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  20. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  21. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  22. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  23. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  24. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  25. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/__init__.py +0 -0
  26. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_defaults.py +0 -0
  27. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  28. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_params_config.py +0 -0
  29. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/core/_query_config.py +0 -0
  30. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/df_helper/data_cleaner.py +0 -0
  31. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/__init__.py +0 -0
  32. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  33. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/geopy_helper/utils.py +0 -0
  34. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/__init__.py +0 -0
  35. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  36. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  37. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  38. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  39. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  40. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  41. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/osmnx_helper/utils.py +0 -0
  42. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/tests/__init__.py +0 -0
  43. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/tests/test_baseclass.py +0 -0
  44. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  45. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/__init__.py +0 -0
  46. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/async_utils.py +0 -0
  47. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/base.py +0 -0
  48. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_attacher.py +0 -0
  49. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_data_cube.py +0 -0
  50. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_parquet_artifact.py +0 -0
  51. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/base_parquet_reader.py +0 -0
  52. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/boilerplate/hybrid_data_loader.py +0 -0
  53. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/business_days.py +0 -0
  54. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/clickhouse_writer.py +0 -0
  55. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/credentials.py +0 -0
  56. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/dask_utils.py +0 -0
  57. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/data_from_http_source.py +0 -0
  58. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/data_utils.py +0 -0
  59. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/data_wrapper.py +0 -0
  60. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/date_utils.py +0 -0
  61. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/df_utils.py +0 -0
  62. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/file_age_checker.py +0 -0
  63. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/file_utils.py +0 -0
  64. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/filepath_generator.py +0 -0
  65. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/iceberg_saver.py +0 -0
  66. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/log_utils.py +0 -0
  67. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/manifest_manager.py +0 -0
  68. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/parquet_saver.py +0 -0
  69. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/periods.py +0 -0
  70. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/phone_formatter.py +0 -0
  71. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/__init__.py +0 -0
  72. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/jobs.py +0 -0
  73. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/progress/sse_runner.py +0 -0
  74. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_config.py +0 -0
  75. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_hive.py +0 -0
  76. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/storage_manager.py +0 -0
  77. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/update_planner.py +0 -0
  78. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/webdav_client.py +0 -0
  79. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/utils/write_gatekeeper.py +0 -0
  80. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/__init__.py +0 -0
  81. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/__init__.py +0 -0
  82. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  83. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  84. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  85. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  86. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  87. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  88. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  89. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  90. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  91. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  92. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  93. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  94. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  95. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  96. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  97. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  98. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/utils/__init__.py +0 -0
  99. {sibi_dst-2025.9.6 → sibi_dst-2025.9.7}/sibi_dst/v2/utils/log_utils.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.9.6
3
+ Version: 2025.9.7
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "2025.9.6"
3
+ version = "2025.9.7"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -3,6 +3,7 @@ from .base_data_cube import BaseDataCube
3
3
  from .base_attacher import make_attacher
4
4
  from .base_parquet_reader import BaseParquetReader
5
5
  from .hybrid_data_loader import HybridDataLoader
6
+ from .base_pipeline import BasePipeline
6
7
 
7
8
  __all__ = [
8
9
  "BaseDataCube",
@@ -10,5 +11,6 @@ __all__ = [
10
11
  "make_attacher",
11
12
  "BaseParquetReader",
12
13
  "HybridDataLoader",
14
+ "BasePipeline",
13
15
  ]
14
16
 
@@ -0,0 +1,178 @@
1
+ from __future__ import annotations
2
+
3
+ import asyncio
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from typing import Type, Any, Callable, List
6
+
7
+ import pandas as pd
8
+ import dask.dataframe as dd
9
+
10
+ from sibi_dst.utils import ManagedResource, ParquetSaver
11
+ from sibi_dst.df_helper import ParquetReader
12
+ from sibi_dst.utils.dask_utils import dask_is_empty
13
+
14
+
15
class DateRangeHelper:
    """Static helpers that split a date window into daily or monthly pieces."""

    @staticmethod
    def generate_daily_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[str]:
        """
        Return every calendar day from ``start_date`` to ``end_date`` (inclusive).

        :param start_date: First day of the window; parsed with ``pd.to_datetime``.
        :param end_date: Last day of the window; parsed with ``pd.to_datetime``.
        :param date_format: ``strftime`` pattern applied to each generated day.
        :return: Formatted day strings in ascending order; empty when
            ``start_date`` is after ``end_date``.
        """
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        return [d.strftime(date_format) for d in pd.date_range(start, end, freq="D")]

    @staticmethod
    def generate_monthly_ranges(start_date: str, end_date: str, date_format: str = "%Y-%m-%d") -> List[tuple[str, str]]:
        """
        Generate (start_date, end_date) tuples for each calendar month in range.

        The first and last months are always included, even if partial: the
        first tuple starts at ``start_date`` and the last tuple ends at
        ``end_date``, so no tuple reaches outside the requested window.

        :param start_date: Start of the window; parsed with ``pd.to_datetime``.
        :param end_date: End of the window; parsed with ``pd.to_datetime``.
        :param date_format: ``strftime`` pattern applied to both tuple members.
        :return: One ``(range_start, range_end)`` tuple per calendar month, in
            ascending order; empty when ``start_date`` is after ``end_date``.
        """
        start = pd.to_datetime(start_date)
        end = pd.to_datetime(end_date)
        ranges: List[tuple[str, str]] = []

        # Iterate from midnight on the 1st of the starting month. Dropping the
        # time-of-day matters: a cursor such as "03-01 12:00" would compare
        # greater than an end of "03-01 00:00" and silently skip the last month.
        current = start.normalize().replace(day=1)
        while current <= end:
            # MonthEnd(0) rolls forward to the last day of the cursor's month.
            month_end = current + pd.offsets.MonthEnd(0)
            ranges.append((
                # Clamp both edges so partial first/last months stay in-window.
                max(current, start).strftime(date_format),
                min(month_end, end).strftime(date_format),
            ))
            current += pd.DateOffset(months=1)
        return ranges
40
+
41
class BasePipeline(ManagedResource):
    """
    Date-window pipeline: load a dataset, write it to day-partitioned Parquet
    storage, read it back, and optionally push it to ClickHouse.

    NOTE(review): ``self.debug``, ``self.logger`` and ``self.emit`` are used
    here but not defined in this class; they are presumably supplied by
    ``ManagedResource`` -- confirm against that base class.
    """

    def __init__(
        self,
        start_date: str,
        end_date: str,
        dataset_cls: Type,
        parquet_storage_path: str,
        *,
        fs: Any,
        filename: str = "dataset",
        date_field: str = "date",
        max_workers: int = 4,
        dataset_kwargs: dict | None = None,
        **kwargs,
    ):
        """
        :param start_date: Start of the processing window.
        :param end_date: End of the processing window.
        :param dataset_cls: Class instantiated as the data source; it is called
            with ``start_date``, ``end_date``, ``debug``, ``logger`` and any
            ``dataset_kwargs``, and must expose an awaitable ``aload``.
        :param parquet_storage_path: Root path for Parquet output; trailing
            slashes are stripped.
        :param fs: Filesystem object, forwarded to the base class and to the
            Parquet saver/reader.
        :param filename: Base name (without extension) of each output file.
        :param date_field: Column used to split the data into days.
        :param max_workers: Thread-pool size used for concurrent writes.
        :param dataset_kwargs: Extra keyword arguments for ``dataset_cls``.
        :param kwargs: Forwarded to the base-class constructor.
        """
        kwargs["fs"] = fs
        super().__init__(**kwargs)

        self.start_date = start_date
        self.end_date = end_date
        self.fs = fs
        self.filename = filename
        self.date_field = date_field
        self.max_workers = max_workers
        self.storage_path = parquet_storage_path.rstrip("/")
        # Last frame returned by aload(); None until the first load.
        self.df: dd.DataFrame | None = None

        self.ds = dataset_cls(
            start_date=self.start_date,
            end_date=self.end_date,
            debug=self.debug,
            logger=self.logger,
            **(dataset_kwargs or {}),
        )

    def _get_storage_path_for_date(self, date: pd.Timestamp) -> str:
        """Return the ``<storage_path>/YYYY/MM/DD`` directory for ``date``."""
        return f"{self.storage_path}/{date.year}/{date.month:02d}/{date.day:02d}"

    def _get_output_filename(self, fmt: str = "parquet") -> str:
        """Return the output file name, i.e. ``<filename>.<fmt>``."""
        return f"{self.filename}.{fmt}"

    async def aload(self, **kwargs) -> dd.DataFrame:
        """
        Load the dataset through ``self.ds.aload`` and cache it on ``self.df``.

        :param kwargs: Forwarded unchanged to ``self.ds.aload``.
        :return: The loaded frame.
        """
        await self.emit("status", message="Loading dataset...", progress=5)
        self.df = await self.ds.aload(**kwargs)
        return self.df

    async def to_parquet(self, **kwargs) -> None:
        """
        Load the dataset and write one Parquet file per day of the window.

        Days with no rows are skipped. Each day's write runs in a worker
        thread via ``ParquetSaver.save_to_parquet``.

        :param kwargs: Forwarded to :meth:`aload`.
        """
        df = await self.aload(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to save.")
            return

        # Unparseable values become NaT and therefore match no day below.
        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        dates = DateRangeHelper.generate_daily_ranges(self.start_date, self.end_date)

        # Loop-invariant lookups, hoisted out of the per-day loop.
        loop = asyncio.get_running_loop()
        output_filename = self._get_output_filename()

        tasks = []
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date_str in dates:
                date_obj = pd.to_datetime(date_str).date()
                df_day = df[df[self.date_field].dt.date == date_obj]
                if dask_is_empty(df_day):
                    self.logger.info(f"No data for {date_obj}, skipping.")
                    continue

                path = self._get_storage_path_for_date(pd.Timestamp(date_obj))
                await self.emit("status", message=f"Saving data for {date_obj}")

                saver = ParquetSaver(
                    df_result=df_day,
                    parquet_storage_path=path,
                    fs=self.fs,
                    debug=self.debug,
                    logger=self.logger,
                )

                tasks.append(
                    loop.run_in_executor(executor, saver.save_to_parquet, output_filename)
                )

            # Await inside the executor context: leaving the ``with`` block
            # calls shutdown(wait=True), which would block the event loop if
            # writes were still running at that point.
            await asyncio.gather(*tasks)

        await self.emit("complete", message="All partitions written.")

    async def from_parquet(self, **kwargs) -> dd.DataFrame:
        """
        Read the window back from Parquet storage through ``ParquetReader``.

        :param kwargs: Forwarded to the reader's ``aload``.
        :return: Whatever ``ParquetReader.aload`` returns for the window.
        """
        reader = ParquetReader(
            parquet_start_date=self.start_date,
            parquet_end_date=self.end_date,
            parquet_storage_path=self.storage_path,
            parquet_filename=self._get_output_filename(),
            fs=self.fs,
            debug=self.debug,
            logger=self.logger,
        )
        return await reader.aload(**kwargs)

    async def to_clickhouse(self, clk_conf: dict, **kwargs):
        """
        Writes daily-partitioned data to ClickHouse using concurrent threads.

        The data is read with :meth:`from_parquet`, split by the distinct
        dates found in ``date_field``, and each day's slice is handed to
        ``ClickHouseWriter.save_to_clickhouse`` in a worker thread.

        :param clk_conf: Keyword arguments used to build ``ClickHouseWriter``.
        :param kwargs: Forwarded to :meth:`from_parquet`.
        """
        from sibi_dst.utils import ClickHouseWriter

        df = await self.from_parquet(**kwargs)
        if dask_is_empty(df):
            self.logger.warning("No data to write to ClickHouse.")
            return

        df[self.date_field] = dd.to_datetime(df[self.date_field], errors="coerce")
        # Persist once so the per-day filters below reuse the computed frame.
        df = df.persist()

        unique_dates = df[self.date_field].dt.date.dropna().unique().compute()
        # Explicit length check: the computed result is array-like, so plain
        # truthiness would be ambiguous.
        if len(unique_dates) == 0:
            self.logger.warning("No valid dates found for partitioning.")
            return

        # NOTE(review): one writer instance is shared by all worker threads;
        # confirm ClickHouseWriter is safe to use that way.
        clk = ClickHouseWriter(**clk_conf)
        loop = asyncio.get_running_loop()
        tasks = []

        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            for date in unique_dates:
                df_day = df[df[self.date_field].dt.date == date]
                if dask_is_empty(df_day):
                    self.logger.info(f"[ClickHouse] No data for {date}, skipping.")
                    continue

                self.logger.info(f"[ClickHouse] Writing {len(df_day)} rows for {date}")

                tasks.append(
                    loop.run_in_executor(executor, clk.save_to_clickhouse, df_day)
                )

            # Await inside the executor context so its blocking
            # shutdown(wait=True) on exit finds nothing left to wait for.
            await asyncio.gather(*tasks)

        # Report partitions actually submitted, not dates that were skipped.
        self.logger.info(f"ClickHouse write complete for {len(tasks)} daily partitions.")
176
+
177
+
178
+ __all__ = ["BasePipeline"]
File without changes