sibi-dst 0.3.52__py3-none-any.whl → 0.3.54__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -74,7 +74,7 @@ class DfHelper:
74
74
  logger: Logger
75
75
  default_config: Dict = None
76
76
 
77
- def __init__(self, backend='django_db', **kwargs):
77
+ def __init__(self, backend='sqlalchemy', **kwargs):
78
78
  # Ensure default_config is not shared across instances
79
79
  self.default_config = self.default_config or {}
80
80
  kwargs = {**self.default_config.copy(), **kwargs}
@@ -209,6 +209,7 @@ class DfHelper:
209
209
  else:
210
210
  self.logger.debug("Regular asyncio run...")
211
211
  return asyncio.run(self.__load_from_http(**options))
212
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
212
213
 
213
214
  def __load_from_sqlalchemy(self, **options):
214
215
  """
@@ -151,9 +151,21 @@ class ParquetArtifact(DfHelper):
151
151
  'parquet_end_date': datetime.date.today().strftime('%Y-%m-%d')
152
152
  }
153
153
 
154
+ def custom_config():
155
+ try:
156
+ start_date = kwargs.pop('start_on')
157
+ end_date = kwargs.pop('end_on')
158
+ except KeyError:
159
+ raise ValueError("For period 'custom', you must provide 'start_on' in kwargs.")
160
+ return {
161
+ 'parquet_start_date': start_date,
162
+ 'parquet_end_date': end_date
163
+ }
164
+
154
165
  config_map = {
155
166
  'itd': itd_config,
156
- 'ytd': ytd_config
167
+ 'ytd': ytd_config,
168
+ 'custom': custom_config,
157
169
  }
158
170
 
159
171
  if period in config_map:
@@ -29,7 +29,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
29
29
  connection_url: str
30
30
  table: Optional[str] = None
31
31
  model: Any = None
32
- engine: Optional[Any] = None # Save engine to reuse it
32
+ engine: Optional[Any] = None
33
+ pool_size: int = 10
34
+ max_overflow: int = 5
35
+ pool_timeout: int = 30
33
36
 
34
37
  @model_validator(mode="after")
35
38
  def validate_and_initialize(self):
@@ -41,7 +44,10 @@ class SqlAlchemyConnectionConfig(BaseModel):
41
44
  raise ValueError("`connection_url` must be provided.")
42
45
 
43
46
  # Initialize the engine
44
- self.engine = create_engine(self.connection_url)
47
+ self.engine = create_engine(self.connection_url,
48
+ pool_size=self.pool_size,
49
+ max_overflow=self.max_overflow,
50
+ pool_timeout=self.pool_timeout)
45
51
 
46
52
  # Validate the connection
47
53
  self.validate_connection()
@@ -47,7 +47,8 @@ class DataWrapper:
47
47
  show_progress: bool = False,
48
48
  timeout: float = 60,
49
49
  reference_date: datetime.date = None,
50
- custom_priority_map: Dict[str, int] = None):
50
+ custom_priority_map: Dict[str, int] = None,
51
+ max_threads: int = 3):
51
52
  self.dataclass = dataclass
52
53
  self.date_field = date_field
53
54
  self.data_path = self._ensure_forward_slash(data_path)
@@ -70,12 +71,14 @@ class DataWrapper:
70
71
  self.timeout = timeout
71
72
  self.reference_date = reference_date or datetime.date.today()
72
73
  self.priority_map = custom_priority_map or self.DEFAULT_PRIORITY_MAP
74
+ self.max_threads = max_threads
73
75
 
74
76
  self.start_date = self._convert_to_date(start_date)
75
77
  self.end_date = self._convert_to_date(end_date)
76
78
  self._lock = Lock()
77
79
  self.processed_dates = []
78
80
  self.age_checker = FileAgeChecker(logger=self.logger)
81
+ self.data_class_instance = self.dataclass(**self.class_params) or None
79
82
 
80
83
  def _init_filesystem(self) -> fsspec.AbstractFileSystem:
81
84
  with self._lock:
@@ -138,8 +141,9 @@ class DataWrapper:
138
141
 
139
142
  desc = f"Processing {self.dataclass.__name__}, task: {self._priority_label(priority)}"
140
143
  self.logger.info(f"Starting {desc.lower()}")
141
-
142
- with ThreadPoolExecutor() as executor:
144
+ max_threads = min(len(dates), self.max_threads)
145
+ self.logger.info(f"DataWrapper Max threads set at: {max_threads}")
146
+ with ThreadPoolExecutor(max_workers=max_threads) as executor:
143
147
  futures = {
144
148
  executor.submit(self._process_date_with_retry, date, max_retries): date
145
149
  for date in dates
@@ -262,8 +266,9 @@ class DataWrapper:
262
266
  try:
263
267
  self.logger.debug(f"Class Params: {self.class_params}")
264
268
  self.logger.debug(f"Load Params: {self.load_params}")
265
- data = self.dataclass(**self.class_params)
266
- df = data.load_period(
269
+
270
+ #data = self.dataclass(**self.class_params)
271
+ df = self.data_class_instance.load_period(
267
272
  dt_field=self.date_field,
268
273
  start=date,
269
274
  end=date,
@@ -1,15 +1,10 @@
1
1
  import base64
2
+ import hashlib
2
3
  import logging
3
- from pathlib import Path
4
+ import warnings
4
5
  from typing import Optional
5
6
 
6
7
  import pyarrow as pa
7
- import fsspec
8
- import warnings
9
- import hashlib
10
- from s3fs import S3FileSystem
11
-
12
- from fsspec import filesystem
13
8
 
14
9
  # Suppress the specific UserWarning message
15
10
  warnings.filterwarnings("ignore")
@@ -0,0 +1,97 @@
1
+ Metadata-Version: 2.1
2
+ Name: sibi-dst
3
+ Version: 0.3.54
4
+ Summary: Data Science Toolkit
5
+ Author: Luis Valverde
6
+ Author-email: lvalverdeb@gmail.com
7
+ Requires-Python: >=3.11,<4.0
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: Programming Language :: Python :: 3.11
10
+ Classifier: Programming Language :: Python :: 3.12
11
+ Classifier: Programming Language :: Python :: 3.13
12
+ Provides-Extra: complete
13
+ Provides-Extra: df-helper
14
+ Provides-Extra: geospatial
15
+ Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
16
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
17
+ Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
18
+ Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
19
+ Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
20
+ Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
21
+ Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
22
+ Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
23
+ Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "geospatial" or extra == "complete"
24
+ Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "geospatial" or extra == "complete"
25
+ Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
26
+ Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
27
+ Requires-Dist: httpx (>=0.27.2,<0.28.0)
28
+ Requires-Dist: ipython (>=8.29.0,<9.0.0)
29
+ Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
30
+ Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
31
+ Requires-Dist: nltk (>=3.9.1,<4.0.0)
32
+ Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
33
+ Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "geospatial" or extra == "complete"
34
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
35
+ Requires-Dist: paramiko (>=3.5.0,<4.0.0)
36
+ Requires-Dist: psutil (>=6.1.0,<7.0.0)
37
+ Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
38
+ Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
39
+ Requires-Dist: pydantic (>=2.9.2,<3.0.0)
40
+ Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
41
+ Requires-Dist: pytest (>=8.3.3,<9.0.0)
42
+ Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
43
+ Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
44
+ Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
45
+ Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
46
+ Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
47
+ Requires-Dist: tornado (>=6.4.1,<7.0.0)
48
+ Requires-Dist: tqdm (>=4.67.0,<5.0.0)
49
+ Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
50
+ Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
51
+ Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
52
+ Description-Content-Type: text/markdown
53
+
54
+ ---
55
+ title: "Sibi Data Science Toolkit"
56
+ description: "Data Science Toolkit built with Python, Pandas, Dask, GeoPandas, OpenStreetMaps, SQLAlchemy"
57
+ author: "Luis Valverde"
58
+ current_maintainer: "Luis Valverde"
59
+ ---
60
+
61
+ ### SIBI-DST
62
+
63
+ Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, NetworkX, SQLAlchemy, GeoPandas, and Folium.
64
+
65
+ ## Example Use Cases
66
+
67
+ 1. **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs**.
68
+ 2. **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
69
+ 3. **Flexible Data Sharing** with client applications by writing to **Data Warehouses in Clickhouse, local filesystems, and cloud storage platforms** such as **S3**.
70
+ 4. **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`)** for high-performance data exchange.
71
+ 5. **Geospatial Analysis** – Utilize **OpenStreetMaps** and **GeoPandas** for advanced geospatial data processing and visualization.
72
+
73
+ ## Supported Technologies
74
+
75
+ - **Data Processing**: Pandas, Dask
76
+ - **Databases & Storage**: SQLAlchemy, Parquet, S3, Clickhouse
77
+ - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
78
+ - **API Development**: Django REST Framework, FastAPI
79
+
80
+ ## Installation
81
+
82
+ ```bash
83
+ # with pip
84
+
85
+ pip install "sibi-dst[complete]" # Install all dependencies
86
+ pip install "sibi-dst[df-helper]" # Install only df_helper dependencies
87
+ pip install "sibi-dst[geospatial]" # Install only geospatial dependencies
88
+
89
+ # with poetry
90
+
91
+ poetry add "sibi-dst[complete]" # Install all dependencies
92
+ poetry add "sibi-dst[df-helper]" # Install only df_helper dependencies
93
+ poetry add "sibi-dst[geospatial]" # Install only geospatial dependencies
94
+
95
+
96
+ ```
97
+
@@ -1,8 +1,8 @@
1
1
  sibi_dst/__init__.py,sha256=3pbriM7Ym5f9gew7n9cO4G_p9n-0bnxdmQ0hwBdJjr4,253
2
2
  sibi_dst/df_helper/__init__.py,sha256=McYrw2N0MsMgtawLrONXTGdyHfQWVOBUvIDbklfjb54,342
3
3
  sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=toH2QvNF-CQNJ4Bc8xreytuWr37G0EWz4ciWVdFMVqU,11646
4
- sibi_dst/df_helper/_df_helper.py,sha256=IS1m9r9U-eJ7EVMBqmITmre6S0JfIl6nJtPIwNI3xKY,29771
5
- sibi_dst/df_helper/_parquet_artifact.py,sha256=qt3WZXrE2EZs3KI0biGzm3znIZazzTh8fgiipCVr_Ic,10196
4
+ sibi_dst/df_helper/_df_helper.py,sha256=D85n4oUdu92IN2QaPc6k9uJJ_Vm197me1aoHojuWEYs,29833
5
+ sibi_dst/df_helper/_parquet_artifact.py,sha256=dDoR5Tq0EJViW52tD9XW_vno7hkOfA8WeAilR1mAb_g,10636
6
6
  sibi_dst/df_helper/_parquet_reader.py,sha256=L6mr2FeKtTeIn37G9EGpvOx8PwMqXb6qnEECqBaiwxo,3954
7
7
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
8
8
  sibi_dst/df_helper/backends/django/__init__.py,sha256=uWHi-DtQX5re7b2HcqoXUH3_FZWOw1VTmDf552FAkNs,256
@@ -16,7 +16,7 @@ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1u
16
16
  sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
17
17
  sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
18
18
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=TuVp8Ce49dCIIxtyrtFGRblarQUl8QGcS-TDZd515IE,348
19
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=Kli83IEg5SFVqkhsK4w45cV6PbZnfdGanfsyiW6Xw00,2502
19
+ sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=bQxQAA2GNSmNsRWTZbsLmpvK6pnkbVzECBXQ9H2iqRc,2750
20
20
  sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py,sha256=58RCda1Hg_nsuJw-2V36IstsT8O84IQFgsdE7FnqvMk,4655
21
21
  sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=BtiRSYA4kFIM-mBCdrwE20vzByfq8_Biv_jPLUCDv58,5466
22
22
  sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=I2Us3RrxHci561yyZYBuUCrLVOhB0F3KBnae78m_ARw,6259
@@ -44,13 +44,13 @@ sibi_dst/utils/clickhouse_writer.py,sha256=iAUe4_Kn2WR1xZjpLW2FOWCWfOTw6fCGMTUcW
44
44
  sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
45
45
  sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
46
46
  sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
47
- sibi_dst/utils/data_wrapper.py,sha256=pIIQxeHknUeQd0YbISkAhL-xYBK4OdijoATBY-oBznw,12114
47
+ sibi_dst/utils/data_wrapper.py,sha256=-RqK_sU3uuc6U9dFfnICFZkaIetk3JOvJain6_lTWlo,12446
48
48
  sibi_dst/utils/date_utils.py,sha256=OCJqkWl5e8fE7z11Ufz4206DUeuLMd_Gf_JGZu914Pg,18539
49
49
  sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
50
50
  sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
51
51
  sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
52
52
  sibi_dst/utils/log_utils.py,sha256=eSAbi_jmMpJ8RpycakzT4S4zNkqVZDj3FY8WwnxpdXc,4623
53
- sibi_dst/utils/parquet_saver.py,sha256=Tucxv9jRX66VuLQZn0dPQBN7JOttBou6SF8FxqufeGE,8169
53
+ sibi_dst/utils/parquet_saver.py,sha256=EBtd9blzk7Wb65aDBVVU0ZMHFtqjfWi_fCUt1LvyAC4,8069
54
54
  sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
55
55
  sibi_dst/utils/storage_config.py,sha256=Cg8EOGLZ_5v9sunaQHZLYHdp5FDkgPrCVVNHF-ys5sQ,2181
56
56
  sibi_dst/utils/storage_manager.py,sha256=btecX7ggNb7rfu5EK9Xuu2q_FZA7r_rB_tfhQ8V96qc,6567
@@ -75,6 +75,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
75
75
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
76
76
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
77
77
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
78
- sibi_dst-0.3.52.dist-info/METADATA,sha256=VIiTzkdO-FsYBkie1cfuwFKEwFFSE5TL5Ory-VzPa0g,7221
79
- sibi_dst-0.3.52.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
80
- sibi_dst-0.3.52.dist-info/RECORD,,
78
+ sibi_dst-0.3.54.dist-info/METADATA,sha256=i2guOpX-oxuWb7-Jk2GNLQrSY83VjaAulNA_9AyuVzk,4502
79
+ sibi_dst-0.3.54.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
80
+ sibi_dst-0.3.54.dist-info/RECORD,,
@@ -1,200 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: sibi-dst
3
- Version: 0.3.52
4
- Summary: Data Science Toolkit
5
- Author: Luis Valverde
6
- Author-email: lvalverdeb@gmail.com
7
- Requires-Python: >=3.11,<4.0
8
- Classifier: Programming Language :: Python :: 3
9
- Classifier: Programming Language :: Python :: 3.11
10
- Classifier: Programming Language :: Python :: 3.12
11
- Classifier: Programming Language :: Python :: 3.13
12
- Provides-Extra: complete
13
- Provides-Extra: df-helper
14
- Provides-Extra: geopy-helper
15
- Provides-Extra: osmnx-helper
16
- Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
17
- Requires-Dist: chardet (>=5.2.0,<6.0.0)
18
- Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
19
- Requires-Dist: clickhouse-connect (>=0.8.7,<0.9.0)
20
- Requires-Dist: clickhouse-driver (>=0.2.9,<0.3.0)
21
- Requires-Dist: dask[complete] (>=2025.3.0,<2026.0.0)
22
- Requires-Dist: django (>=5.1.4,<6.0.0) ; extra == "df-helper" or extra == "complete"
23
- Requires-Dist: djangorestframework (>=3.15.2,<4.0.0) ; extra == "df-helper" or extra == "complete"
24
- Requires-Dist: folium (>=0.19.4,<0.20.0) ; extra == "osmnx-helper" or extra == "complete"
25
- Requires-Dist: geopandas (>=1.0.1,<2.0.0) ; extra == "osmnx-helper" or extra == "complete"
26
- Requires-Dist: geopy (>=2.4.1,<3.0.0) ; extra == "geopy-helper" or extra == "complete"
27
- Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
28
- Requires-Dist: httpx (>=0.27.2,<0.28.0)
29
- Requires-Dist: ipython (>=8.29.0,<9.0.0)
30
- Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
31
- Requires-Dist: mysqlclient (>=2.2.6,<3.0.0) ; extra == "df-helper" or extra == "complete"
32
- Requires-Dist: nltk (>=3.9.1,<4.0.0)
33
- Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
34
- Requires-Dist: osmnx (>=2.0.1,<3.0.0) ; extra == "osmnx-helper" or extra == "complete"
35
- Requires-Dist: pandas (>=2.2.3,<3.0.0)
36
- Requires-Dist: paramiko (>=3.5.0,<4.0.0)
37
- Requires-Dist: psutil (>=6.1.0,<7.0.0)
38
- Requires-Dist: psycopg2 (>=2.9.10,<3.0.0) ; extra == "df-helper" or extra == "complete"
39
- Requires-Dist: pyarrow (>=18.0.0,<19.0.0)
40
- Requires-Dist: pydantic (>=2.9.2,<3.0.0)
41
- Requires-Dist: pymysql (>=1.1.1,<2.0.0) ; extra == "df-helper" or extra == "complete"
42
- Requires-Dist: pytest (>=8.3.3,<9.0.0)
43
- Requires-Dist: pytest-mock (>=3.14.0,<4.0.0)
44
- Requires-Dist: python-dotenv (>=1.0.1,<2.0.0)
45
- Requires-Dist: s3fs (>=2024.12.0,<2025.0.0)
46
- Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0) ; extra == "df-helper" or extra == "complete"
47
- Requires-Dist: sqlmodel (>=0.0.22,<0.0.23) ; extra == "df-helper" or extra == "complete"
48
- Requires-Dist: tornado (>=6.4.1,<7.0.0)
49
- Requires-Dist: tqdm (>=4.67.0,<5.0.0)
50
- Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
51
- Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
52
- Requires-Dist: webdav4[fsspec] (>=0.10.0,<0.11.0)
53
- Description-Content-Type: text/markdown
54
-
55
- # sibi-dst
56
-
57
- Data Science Toolkit
58
- ---------------------
59
- Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFramework, FastAPI
60
-
61
- Major Functionality
62
- --------------------
63
- 1) **Build DataCubes, DataSets, and DataObjects** from diverse data sources, including **relational databases, Parquet files, Excel (`.xlsx`), delimited tables (`.csv`, `.tsv`), JSON, and RESTful APIs (`JSON API REST`)**.
64
- 2) **Comprehensive DataFrame Management** utilities for efficient data handling, transformation, and optimization using **Pandas** and **Dask**.
65
- 3) **Flexible Data Sharing** with client applications by writing to **Data Warehouses, local filesystems, and cloud storage platforms** such as **Amazon S3, Google Cloud Storage (GCS), and Azure Blob Storage**.
66
- 4) **Microservices for Data Access** – Build scalable **API-driven services** using **RESTful APIs (`Django REST Framework`, `FastAPI`) and gRPC** for high-performance data exchange.
67
-
68
- Supported Technologies
69
- --------------------
70
- - **Data Processing**: Pandas, Dask
71
- - **Machine Learning**: Scikit-Learn, XGBoost
72
- - **Databases & Storage**: SQLAlchemy, Django ORM, Parquet, Amazon S3, GCS, Azure Blob Storage
73
- - **Mapping & Geospatial Analysis**: OpenStreetMaps, OSMnx, Geopy
74
- - **API Development**: Django REST Framework, gRPC, FastAPI
75
-
76
- Installation
77
- ---------------------
78
- ```bash
79
- pip install sibi-dst
80
- ```
81
-
82
- Usage
83
- ---------------------
84
- ### Loading Data from SQLAlchemy
85
- ```python
86
- from sibi_dst.df_helper import DfHelper
87
- from conf.transforms.fields.crm import customer_fields
88
- from conf.credentials import replica_db_conf
89
- from conf.storage import get_fs_instance
90
-
91
- config = {
92
- 'backend': 'sqlalchemy',
93
- 'connection_url': replica_db_conf.get('db_url'),
94
- 'table': 'crm_clientes_archivo',
95
- 'field_map': customer_fields,
96
- 'legacy_filters': True,
97
- 'fs': get_fs_instance()
98
- }
99
-
100
- df_helper = DfHelper(**config)
101
- result = df_helper.load(id__gte=1)
102
- ```
103
-
104
- ### Saving Data to ClickHouse
105
- ```python
106
- clk_creds = {
107
- 'host': '192.168.3.171',
108
- 'port': 18123,
109
- 'user': 'username',
110
- 'database': 'xxxxxxx',
111
- 'table': 'customer_file',
112
- 'order_by': 'id'
113
- }
114
-
115
- df_helper.save_to_clickhouse(**clk_creds)
116
- ```
117
-
118
- ### Saving Data to Parquet
119
- ```python
120
- df_helper.save_to_parquet(
121
- parquet_filename='filename.parquet',
122
- parquet_storage_path='/path/to/my/files/'
123
- )
124
- ```
125
-
126
- Backends Supported
127
- ---------------------
128
- | Backend | Description |
129
- |--------------|-------------|
130
- | `sqlalchemy` | Load data from SQL databases using SQLAlchemy. |
131
- | `django_db` | Load data from Django ORM models. |
132
- | `parquet` | Load and save data from Parquet files. |
133
- | `http` | Fetch data from HTTP endpoints. |
134
- | `osmnx` | Geospatial mapping and routing using OpenStreetMap. |
135
- | `geopy` | Geolocation services for address lookup and reverse geocoding. |
136
-
137
- Geospatial Utilities
138
- ---------------------
139
- ### **OSMnx Helper (`sibi_dst.osmnx_helper`)
140
- **
141
- Provides **OpenStreetMap-based mapping utilities** using `osmnx` and `folium`.
142
-
143
- #### 🔹 Key Features
144
- - **BaseOsmMap**: Manages interactive Folium-based maps.
145
- - **PBFHandler**: Loads `.pbf` (Protocolbuffer Binary Format) files for network graphs.
146
-
147
- #### Example: Generating an OSM Map
148
- ```python
149
- from sibi_dst.osmnx_helper import BaseOsmMap
150
- osm_map = BaseOsmMap(osmnx_graph=my_graph, df=my_dataframe)
151
- osm_map.generate_map()
152
- ```
153
-
154
- ### **Geopy Helper (`sibi_dst.geopy_helper`)
155
- **
156
- Provides **geolocation services** using `Geopy` for forward and reverse geocoding.
157
-
158
- #### 🔹 Key Features
159
- - **GeolocationService**: Interfaces with `Nominatim` API for geocoding.
160
- - **Error Handling**: Manages `GeocoderTimedOut` and `GeocoderServiceError` gracefully.
161
- - **Singleton Geolocator**: Efficiently reuses a global geolocator instance.
162
-
163
- #### Example: Reverse Geocoding
164
- ```python
165
- from sibi_dst.geopy_helper import GeolocationService
166
- gs = GeolocationService()
167
- location = gs.reverse((9.935,-84.091))
168
- print(location)
169
- ```
170
-
171
- Advanced Features
172
- ---------------------
173
- ### Querying with Custom Filters
174
- Filters can be applied dynamically using Django-style syntax:
175
- ```python
176
- result = df_helper.load(date__gte='2023-01-01', status='active')
177
- ```
178
-
179
- ### Parallel Processing
180
- Leverage Dask for parallel execution:
181
- ```python
182
- result = df_helper.load_parallel(status='active')
183
- ```
184
-
185
- Testing
186
- ---------------------
187
- To run unit tests, use:
188
- ```bash
189
- pytest tests/
190
- ```
191
-
192
- Contributing
193
- ---------------------
194
- Contributions are welcome! Please submit pull requests or open issues for discussions.
195
-
196
- License
197
- ---------------------
198
- sibi-dst is licensed under the MIT License.
199
-
200
-