sibi-dst 0.3.40__py3-none-any.whl → 0.3.43__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/__init__.py +2 -0
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +262 -0
- sibi_dst/df_helper/_df_helper.py +5 -2
- sibi_dst/df_helper/_parquet_artifact.py +8 -2
- sibi_dst/df_helper/_parquet_reader.py +5 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +1 -0
- sibi_dst/osmnx_helper/__init__.py +2 -2
- sibi_dst/osmnx_helper/v1/basemaps/__init__.py +0 -0
- sibi_dst/osmnx_helper/{basemaps → v1/basemaps}/router_plotter.py +85 -30
- sibi_dst/osmnx_helper/v2/__init__.py +0 -0
- sibi_dst/osmnx_helper/v2/base_osm_map.py +153 -0
- sibi_dst/osmnx_helper/v2/basemaps/__init__.py +0 -0
- sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
- sibi_dst/utils/__init__.py +3 -0
- sibi_dst/utils/data_utils.py +66 -25
- sibi_dst/utils/data_wrapper.py +222 -285
- sibi_dst/utils/date_utils.py +118 -113
- sibi_dst/utils/df_utils.py +7 -0
- sibi_dst/utils/log_utils.py +57 -18
- sibi_dst/utils/parquet_saver.py +4 -2
- sibi_dst/utils/phone_formatter.py +127 -0
- sibi_dst/utils/storage_manager.py +14 -7
- sibi_dst-0.3.43.dist-info/METADATA +194 -0
- {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.43.dist-info}/RECORD +29 -22
- sibi_dst-0.3.40.dist-info/METADATA +0 -62
- /sibi_dst/osmnx_helper/{basemaps → v1}/__init__.py +0 -0
- /sibi_dst/osmnx_helper/{base_osm_map.py → v1/base_osm_map.py} +0 -0
- /sibi_dst/osmnx_helper/{basemaps → v1/basemaps}/calendar_html.py +0 -0
- /sibi_dst/osmnx_helper/{utils.py → v1/utils.py} +0 -0
- {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.43.dist-info}/WHEEL +0 -0
@@ -0,0 +1,153 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import html
|
4
|
+
from abc import ABC, abstractmethod
|
5
|
+
from typing import Optional
|
6
|
+
|
7
|
+
import folium
|
8
|
+
import geopandas as gpd
|
9
|
+
import numpy as np
|
10
|
+
import osmnx as ox
|
11
|
+
import pandas as pd
|
12
|
+
from folium.plugins import Fullscreen
|
13
|
+
from networkx import MultiDiGraph
|
14
|
+
|
15
|
+
|
16
|
+
class BaseOsmMap(ABC):
    """Abstract base class for folium maps built from GPS points and an OSMnx graph.

    On construction the input DataFrame is copied, optionally sorted, and the
    latitude/longitude columns are extracted into ``self.gps_points``. A folium
    map centred on the mean of those points is created, and ``self.G`` is set
    to the part of the supplied graph that lies inside the points' bounding
    box. Subclasses implement :meth:`process_map`; callers use
    :meth:`generate_map` to run the full pipeline.
    """

    # Define available tile options for the map (display name -> folium tiles value)
    tile_options = {
        "OpenStreetMap": "OpenStreetMap",
        "CartoDB": "cartodbpositron",
        "CartoDB Voyager": "cartodbvoyager"
    }
    # Default geographical bounds (Costa Rica), as [[south, west], [north, east]]
    bounds = [[8.0340, -85.9417], [11.2192, -82.5566]]

    def __init__(
        self,
        osmnx_graph: MultiDiGraph,
        df: pd.DataFrame,
        lat_col: str = "latitude",
        lon_col: str = "longitude",
        map_html_title: str = "OSM Basemap",
        zoom_start: int = 13,
        fullscreen: bool = True,
        fullscreen_position: str = "topright",
        tiles: str = "OpenStreetMap",
        verbose: bool = False,
        sort_keys: Optional[list[str]] = None,
        dt_field: Optional[str] = None,
        calc_nearest_nodes: bool = False,
        max_bounds: bool = False,
    ):
        """Store the configuration, prepare the data and create the folium map.

        :param osmnx_graph: Graph used for nearest-node lookup and sub-graph extraction.
        :param df: Source data; a copy is stored so the caller's frame is not modified.
        :param lat_col: Name of the latitude column in ``df``.
        :param lon_col: Name of the longitude column in ``df``.
        :param map_html_title: Title text; HTML-escaped before being stored.
        :param zoom_start: Initial zoom level passed to ``folium.Map``.
        :param fullscreen: Whether a fullscreen control is added in post-processing.
        :param fullscreen_position: Position passed to the fullscreen control.
        :param tiles: Tile set passed to ``folium.Map``.
        :param verbose: Stored on the instance; not used by this base class.
        :param sort_keys: Optional columns to sort the stored DataFrame by.
        :param dt_field: Optional column whose values are exposed as ``self.dt``.
        :param calc_nearest_nodes: Whether to compute nearest graph nodes for each row.
        :param max_bounds: Passed to ``folium.Map``; also enables fitting to ``bounds``.
        :raises ValueError: If ``df`` is empty.
        """
        if df.empty:
            raise ValueError("df must not be empty")

        # Store attributes
        self.df = df.copy()
        self.osmnx_graph = osmnx_graph
        self.lat_col = lat_col
        self.lon_col = lon_col
        self.map_html_title = self._sanitize_html(map_html_title)
        self.zoom_start = zoom_start
        self.fullscreen = fullscreen
        self.fullscreen_position = fullscreen_position
        self.tiles = tiles
        self.verbose = verbose
        self.sort_keys = sort_keys
        self.dt_field = dt_field
        self.calc_nearest_nodes = calc_nearest_nodes
        self.max_bounds = max_bounds
        self.dt = None
        self.nearest_nodes = None
        self.G = None
        self.osm_map = None

        self._prepare_df()
        # Read the datetime values only after sorting so that self.dt stays
        # row-aligned with self.df and self.gps_points.
        self.dt = self.df[self.dt_field].to_list() if self.dt_field else None
        self._initialize_map()

    def _prepare_df(self):
        """Sort the stored DataFrame and derive the GPS point array.

        Sets ``self.gps_points`` to a NumPy array of ``[lat, lon]`` rows and,
        when ``calc_nearest_nodes`` is set, ``self.nearest_nodes``.
        """
        if self.sort_keys:
            self.df.sort_values(by=self.sort_keys, inplace=True, ignore_index=True)
        self.gps_points = self.df[[self.lat_col, self.lon_col]].to_numpy()

        # Compute nearest nodes if required
        if self.calc_nearest_nodes and not self.df.empty:
            self.nearest_nodes = ox.distance.nearest_nodes(
                self.osmnx_graph, X=self.df[self.lon_col], Y=self.df[self.lat_col]
            )

    def _initialize_map(self):
        """Create the folium map centred on the mean GPS point and extract ``self.G``.

        :raises ValueError: If there are no GPS points.
        """
        if self.gps_points.size == 0:
            raise ValueError("No valid GPS points available for map initialization")

        center = self.gps_points.mean(axis=0).tolist()
        # Only create a map when none has been set yet.
        if self.osm_map is None:
            self.osm_map = folium.Map(
                location=center, zoom_start=self.zoom_start, tiles=self.tiles, max_bounds=self.max_bounds
            )
        self.G = self._extract_subgraph(*self._get_bounding_box_from_points())

    def _get_bounding_box_from_points(self, margin: float = 0.001) -> tuple[float, float, float, float]:
        """Compute the bounding box of the GPS points, widened by ``margin``.

        :param margin: Amount added to / subtracted from each extreme.
        :return: ``(north, south, east, west)`` as plain floats.
        """
        latitudes, longitudes = self.gps_points[:, 0], self.gps_points[:, 1]
        # Array reductions instead of the Python builtins: a single pass in
        # NumPy rather than an element-by-element Python loop.
        north = float(latitudes.max()) + margin
        south = float(latitudes.min()) - margin
        east = float(longitudes.max()) + margin
        west = float(longitudes.min()) - margin
        return north, south, east, west

    def _extract_subgraph(self, north: float, south: float, east: float, west: float) -> MultiDiGraph:
        """Return the part of ``self.osmnx_graph`` whose nodes lie within the bounding box.

        :param north: Upper latitude bound.
        :param south: Lower latitude bound.
        :param east: Upper longitude bound.
        :param west: Lower longitude bound.
        :return: Subgraph of ``self.osmnx_graph`` over the nodes inside the box.
        """
        bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly((west, south, east, north))])
        nodes_gdf = ox.graph_to_gdfs(self.osmnx_graph, nodes=True, edges=False)
        nodes_within_bbox = gpd.sjoin(nodes_gdf, gpd.GeoDataFrame(geometry=bbox_poly), predicate="within")
        return self.osmnx_graph.subgraph(nodes_within_bbox.index)

    def _post_process_map(self):
        """Attach tile layers, layer control, fullscreen control and title, then fit bounds."""
        self._attach_supported_tiles()
        self.add_tile_layer()
        self._add_fullscreen()
        self._add_map_title()
        if self.max_bounds and self.bounds:
            self.osm_map.fit_bounds(self.bounds)

    def _attach_supported_tiles(self):
        """Add every entry of ``tile_options`` except the active tile set as a hidden layer."""
        for name, tile in self.tile_options.items():
            if tile.lower() != self.tiles.lower():
                folium.TileLayer(name=name, tiles=tile, show=False).add_to(self.osm_map)

    def _add_fullscreen(self):
        """Add a fullscreen control when ``self.fullscreen`` is set."""
        if self.fullscreen:
            Fullscreen(position=self.fullscreen_position).add_to(self.osm_map)

    def _add_map_title(self):
        """Add the (already escaped) title to the map's root HTML when it is non-empty."""
        if self.map_html_title:
            self.osm_map.get_root().html.add_child(folium.Element(self.map_html_title))

    @staticmethod
    def _sanitize_html(input_html: Optional[str]) -> str:
        """Escape HTML special characters in ``input_html``.

        :param input_html: Text to escape; ``None`` is treated as "no title".
        :return: The escaped text, or an empty string for ``None``.
        """
        if input_html is None:
            # The title is optional (see _add_map_title); html.escape(None)
            # would otherwise raise AttributeError.
            return ""
        return html.escape(input_html)

    @abstractmethod
    def process_map(self):
        """Abstract method to define map processing logic in subclasses."""
        pass

    def pre_process_map(self):
        """Optional hook run before :meth:`process_map`; does nothing by default."""
        pass

    def add_tile_layer(self):
        """Add a layer control to the map."""
        folium.LayerControl().add_to(self.osm_map)

    def generate_map(self) -> folium.Map:
        """Run pre-processing, processing and post-processing, then return the map.

        :return: The folium map held in ``self.osm_map``.
        """
        self.pre_process_map()
        self.process_map()
        self._post_process_map()
        return self.osm_map
|
File without changes
|
File without changes
|
sibi_dst/utils/__init__.py
CHANGED
@@ -4,6 +4,7 @@ from .log_utils import Logger
|
|
4
4
|
from .date_utils import *
|
5
5
|
from .data_utils import DataUtils
|
6
6
|
from .file_utils import FileUtils
|
7
|
+
from .phone_formatter import PhoneNumberFormatter
|
7
8
|
from .filepath_generator import FilePathGenerator
|
8
9
|
from .df_utils import DfUtils
|
9
10
|
from .storage_manager import StorageManager
|
@@ -18,8 +19,10 @@ __all__ = [
|
|
18
19
|
"ConfigManager",
|
19
20
|
"ConfigLoader",
|
20
21
|
"DateUtils",
|
22
|
+
"FileAgeChecker",
|
21
23
|
"BusinessDays",
|
22
24
|
"FileUtils",
|
25
|
+
"PhoneNumberFormatter",
|
23
26
|
"DataWrapper",
|
24
27
|
"DataUtils",
|
25
28
|
"FilePathGenerator",
|
sibi_dst/utils/data_utils.py
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
from typing import Union, List
|
2
|
+
|
1
3
|
import dask.dataframe as dd
|
2
4
|
import pandas as pd
|
3
5
|
|
@@ -23,6 +25,58 @@ class DataUtils:
|
|
23
25
|
self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
24
26
|
self.debug = kwargs.get('debug', False)
|
25
27
|
|
28
|
+
@staticmethod
|
29
|
+
def _transform_column(series, fill_value, dtype):
|
30
|
+
"""
|
31
|
+
Helper method to transform a column by converting it to numeric, filling missing values,
|
32
|
+
and casting to the specified dtype.
|
33
|
+
|
34
|
+
:param series: The column to transform.
|
35
|
+
:type series: pd.Series or dd.Series
|
36
|
+
:param fill_value: Value to replace missing or invalid data.
|
37
|
+
:type fill_value: int or float
|
38
|
+
:param dtype: Target data type for the column.
|
39
|
+
:type dtype: type
|
40
|
+
:return: Transformed column.
|
41
|
+
:rtype: pd.Series or dd.Series
|
42
|
+
"""
|
43
|
+
return (
|
44
|
+
pd.to_numeric(series, errors="coerce") # Convert to numeric, invalid to NaN
|
45
|
+
.fillna(fill_value) # Replace NaN with fill_value
|
46
|
+
.astype(dtype) # Convert to target dtype
|
47
|
+
)
|
48
|
+
|
49
|
+
def transform_numeric_columns(self, df: "Union[pd.DataFrame, dd.DataFrame]", columns: List[str], fill_value=0,
                              dtype=int):
    """
    Transform specified numeric columns in the DataFrame by converting their data types
    to the specified dtype and replacing missing values with the given fill_value.

    Columns listed in ``columns`` that are not present in ``df`` are ignored. The
    transformed columns are assigned back onto ``df``, which is also returned.

    :param df: DataFrame to be transformed.
    :type df: pd.DataFrame or dd.DataFrame
    :param columns: List of column names to transform.
    :type columns: list[str]
    :param fill_value: Value to replace missing or invalid data. Default is 0.
    :type fill_value: int or float
    :param dtype: Target data type for the columns. Default is int.
    :type dtype: type
    :return: Transformed DataFrame.
    :rtype: pd.DataFrame or dd.DataFrame
    """
    if not columns:
        self.logger.warning("No columns specified.")
        return df

    self.logger.debug(f"DataFrame type: {type(df)}")
    columns = [col for col in columns if col in df.columns]

    for col in columns:
        series = df[col]
        if hasattr(series, "map_partitions"):
            # Dask series: apply the transformation lazily, partition by partition
            df[col] = series.map_partitions(
                self._transform_column, fill_value, dtype, meta=(col, dtype)
            )
        else:
            # Pandas series has no map_partitions; transform it directly
            df[col] = self._transform_column(series, fill_value, dtype)

    return df
|
79
|
+
|
26
80
|
def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
|
27
81
|
"""
|
28
82
|
This function transforms the specified numeric columns in the given dataframe by converting
|
@@ -57,34 +111,21 @@ class DataUtils:
|
|
57
111
|
|
58
112
|
return df
|
59
113
|
|
60
|
-
def transform_boolean_columns(self, df, columns=
|
114
|
+
def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
    """
    Cast the given columns of the DataFrame to boolean.

    This delegates to ``transform_numeric_columns`` with ``dtype=bool``, so missing
    or invalid values are replaced by ``fill_value`` before the cast.

    :param df: DataFrame to be transformed.
    :type df: pd.DataFrame or dd.DataFrame
    :param columns: List of column names to transform.
    :type columns: list[str]
    :param fill_value: Value to replace missing or invalid data. Default is 0.
    :type fill_value: int or float
    :return: Transformed DataFrame.
    :rtype: pd.DataFrame or dd.DataFrame
    """
    as_boolean = self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
    return as_boolean
|
88
129
|
|
89
130
|
def merge_lookup_data(self, classname, df, **kwargs):
|
90
131
|
"""
|