sibi-dst 0.3.40__py3-none-any.whl → 0.3.43__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30) hide show
  1. sibi_dst/df_helper/__init__.py +2 -0
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +262 -0
  3. sibi_dst/df_helper/_df_helper.py +5 -2
  4. sibi_dst/df_helper/_parquet_artifact.py +8 -2
  5. sibi_dst/df_helper/_parquet_reader.py +5 -1
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +1 -0
  7. sibi_dst/osmnx_helper/__init__.py +2 -2
  8. sibi_dst/osmnx_helper/v1/basemaps/__init__.py +0 -0
  9. sibi_dst/osmnx_helper/{basemaps → v1/basemaps}/router_plotter.py +85 -30
  10. sibi_dst/osmnx_helper/v2/__init__.py +0 -0
  11. sibi_dst/osmnx_helper/v2/base_osm_map.py +153 -0
  12. sibi_dst/osmnx_helper/v2/basemaps/__init__.py +0 -0
  13. sibi_dst/osmnx_helper/v2/basemaps/utils.py +0 -0
  14. sibi_dst/utils/__init__.py +3 -0
  15. sibi_dst/utils/data_utils.py +66 -25
  16. sibi_dst/utils/data_wrapper.py +222 -285
  17. sibi_dst/utils/date_utils.py +118 -113
  18. sibi_dst/utils/df_utils.py +7 -0
  19. sibi_dst/utils/log_utils.py +57 -18
  20. sibi_dst/utils/parquet_saver.py +4 -2
  21. sibi_dst/utils/phone_formatter.py +127 -0
  22. sibi_dst/utils/storage_manager.py +14 -7
  23. sibi_dst-0.3.43.dist-info/METADATA +194 -0
  24. {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.43.dist-info}/RECORD +29 -22
  25. sibi_dst-0.3.40.dist-info/METADATA +0 -62
  26. /sibi_dst/osmnx_helper/{basemaps → v1}/__init__.py +0 -0
  27. /sibi_dst/osmnx_helper/{base_osm_map.py → v1/base_osm_map.py} +0 -0
  28. /sibi_dst/osmnx_helper/{basemaps → v1/basemaps}/calendar_html.py +0 -0
  29. /sibi_dst/osmnx_helper/{utils.py → v1/utils.py} +0 -0
  30. {sibi_dst-0.3.40.dist-info → sibi_dst-0.3.43.dist-info}/WHEEL +0 -0
@@ -0,0 +1,153 @@
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ from abc import ABC, abstractmethod
5
+ from typing import Optional
6
+
7
+ import folium
8
+ import geopandas as gpd
9
+ import numpy as np
10
+ import osmnx as ox
11
+ import pandas as pd
12
+ from folium.plugins import Fullscreen
13
+ from networkx import MultiDiGraph
14
+
15
+
16
class BaseOsmMap(ABC):
    """Abstract base class for folium maps built from a DataFrame of GPS points.

    On construction the input frame is copied, optionally sorted, reduced to a
    ``(lat, lon)`` array, and used to centre a ``folium.Map``.  A subgraph of
    the supplied OSMnx graph covering the points' bounding box is stored in
    ``self.G``.  Subclasses implement :meth:`process_map` to draw on
    ``self.osm_map``; :meth:`generate_map` runs the full pipeline.
    """

    # Define available tile options for the map
    tile_options = {
        "OpenStreetMap": "OpenStreetMap",
        "CartoDB": "cartodbpositron",
        "CartoDB Voyager": "cartodbvoyager",
    }
    # Default geographical bounds (Costa Rica)
    bounds = [[8.0340, -85.9417], [11.2192, -82.5566]]

    def __init__(
        self,
        osmnx_graph: MultiDiGraph,
        df: pd.DataFrame,
        lat_col: str = "latitude",
        lon_col: str = "longitude",
        map_html_title: str = "OSM Basemap",
        zoom_start: int = 13,
        fullscreen: bool = True,
        fullscreen_position: str = "topright",
        tiles: str = "OpenStreetMap",
        verbose: bool = False,
        sort_keys: Optional[list[str]] = None,
        dt_field: Optional[str] = None,
        calc_nearest_nodes: bool = False,
        max_bounds: bool = False,
    ):
        """Store the configuration, prepare the data and create the folium map.

        :param osmnx_graph: Graph the subgraph and nearest nodes are taken from.
        :param df: Source rows; copied, so the caller's frame is not modified.
        :param lat_col: Name of the latitude column in ``df``.
        :param lon_col: Name of the longitude column in ``df``.
        :param map_html_title: Title text; HTML-escaped before it is stored.
        :param zoom_start: Initial zoom passed to ``folium.Map``.
        :param fullscreen: Whether to add the fullscreen control.
        :param fullscreen_position: Position passed to the fullscreen control.
        :param tiles: Tile set passed to ``folium.Map``.
        :param verbose: Stored on the instance; not read by this class.
        :param sort_keys: Column names to sort ``df`` by, if any.
        :param dt_field: Column whose values are exposed as ``self.dt``.
        :param calc_nearest_nodes: Whether to compute nearest graph nodes.
        :param max_bounds: Passed to ``folium.Map``; also enables fitting the
            map to the class-level ``bounds`` during post-processing.
        :raises ValueError: If ``df`` is empty.
        """
        if df.empty:
            raise ValueError("df must not be empty")

        # Store attributes
        self.df = df.copy()
        self.osmnx_graph = osmnx_graph
        self.lat_col = lat_col
        self.lon_col = lon_col
        self.map_html_title = self._sanitize_html(map_html_title)
        self.zoom_start = zoom_start
        self.fullscreen = fullscreen
        self.fullscreen_position = fullscreen_position
        self.tiles = tiles
        self.verbose = verbose
        self.sort_keys = sort_keys
        self.dt_field = dt_field
        self.calc_nearest_nodes = calc_nearest_nodes
        self.max_bounds = max_bounds
        # Filled in by _prepare_df() once the frame has been sorted, so the
        # values stay row-aligned with gps_points and nearest_nodes.
        self.dt = None
        self.nearest_nodes = None
        self.G = None
        self.osm_map = None

        self._prepare_df()
        self._initialize_map()

    def _prepare_df(self):
        """Sort the frame and derive ``dt``, ``gps_points`` and nearest nodes."""
        if self.sort_keys:
            self.df.sort_values(by=self.sort_keys, inplace=True, ignore_index=True)

        # Read the datetime column only after sorting; reading it earlier
        # leaves it in the caller's original row order.
        self.dt = self.df[self.dt_field].to_list() if self.dt_field else None
        self.gps_points = self.df[[self.lat_col, self.lon_col]].to_numpy()

        # Compute nearest nodes if required
        if self.calc_nearest_nodes and not self.df.empty:
            self.nearest_nodes = ox.distance.nearest_nodes(
                self.osmnx_graph, X=self.df[self.lon_col], Y=self.df[self.lat_col]
            )

    def _initialize_map(self):
        """Create the folium map centred on the mean GPS point and extract ``self.G``.

        :raises ValueError: If there are no GPS points.
        """
        if self.gps_points.size == 0:
            raise ValueError("No valid GPS points available for map initialization")

        center = self.gps_points.mean(axis=0).tolist()
        if self.osm_map is None:
            self.osm_map = folium.Map(
                location=center, zoom_start=self.zoom_start, tiles=self.tiles, max_bounds=self.max_bounds
            )
        self.G = self._extract_subgraph(*self._get_bounding_box_from_points())

    def _get_bounding_box_from_points(self, margin: float = 0.001) -> tuple[float, float, float, float]:
        """Return ``(north, south, east, west)`` of the GPS points, padded by ``margin``."""
        latitudes, longitudes = self.gps_points[:, 0], self.gps_points[:, 1]
        # Array reductions avoid iterating the arrays element by element in
        # Python; float() yields plain Python floats as the annotation states.
        return (
            float(latitudes.max()) + margin,
            float(latitudes.min()) - margin,
            float(longitudes.max()) + margin,
            float(longitudes.min()) - margin,
        )

    def _extract_subgraph(self, north: float, south: float, east: float, west: float) -> MultiDiGraph:
        """Return the subgraph of ``osmnx_graph`` whose nodes lie within the bounding box."""
        bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly((west, south, east, north))])
        nodes_gdf = ox.graph_to_gdfs(self.osmnx_graph, nodes=True, edges=False)
        nodes_within_bbox = gpd.sjoin(nodes_gdf, gpd.GeoDataFrame(geometry=bbox_poly), predicate="within")
        return self.osmnx_graph.subgraph(nodes_within_bbox.index)

    def _post_process_map(self):
        """Add tile layers, layer control, fullscreen control and title; optionally fit bounds."""
        self._attach_supported_tiles()
        self.add_tile_layer()
        self._add_fullscreen()
        self._add_map_title()
        if self.max_bounds and self.bounds:
            self.osm_map.fit_bounds(self.bounds)

    def _attach_supported_tiles(self):
        """Attach every entry of ``tile_options`` other than the active tile set as a hidden layer."""
        for name, tile in self.tile_options.items():
            if tile.lower() != self.tiles.lower():
                folium.TileLayer(name=name, tiles=tile, show=False).add_to(self.osm_map)

    def _add_fullscreen(self):
        """Enable fullscreen control if required."""
        if self.fullscreen:
            Fullscreen(position=self.fullscreen_position).add_to(self.osm_map)

    def _add_map_title(self):
        """Add the (already escaped) title to the map's HTML if one is set."""
        if self.map_html_title:
            self.osm_map.get_root().html.add_child(folium.Element(self.map_html_title))

    @staticmethod
    def _sanitize_html(input_html: str) -> str:
        """Return ``input_html`` with HTML special characters escaped."""
        return html.escape(input_html)

    @abstractmethod
    def process_map(self):
        """Abstract method to define map processing logic in subclasses."""
        pass

    def pre_process_map(self):
        """Optional preprocessing step before main processing."""
        pass

    def add_tile_layer(self):
        """Add a layer control to the map."""
        folium.LayerControl().add_to(self.osm_map)

    def generate_map(self) -> folium.Map:
        """Run pre-processing, processing and post-processing, then return the map."""
        self.pre_process_map()
        self.process_map()
        self._post_process_map()
        return self.osm_map
File without changes
File without changes
@@ -4,6 +4,7 @@ from .log_utils import Logger
4
4
  from .date_utils import *
5
5
  from .data_utils import DataUtils
6
6
  from .file_utils import FileUtils
7
+ from .phone_formatter import PhoneNumberFormatter
7
8
  from .filepath_generator import FilePathGenerator
8
9
  from .df_utils import DfUtils
9
10
  from .storage_manager import StorageManager
@@ -18,8 +19,10 @@ __all__ = [
18
19
  "ConfigManager",
19
20
  "ConfigLoader",
20
21
  "DateUtils",
22
+ "FileAgeChecker",
21
23
  "BusinessDays",
22
24
  "FileUtils",
25
+ "PhoneNumberFormatter",
23
26
  "DataWrapper",
24
27
  "DataUtils",
25
28
  "FilePathGenerator",
@@ -1,3 +1,5 @@
1
+ from typing import Union, List
2
+
1
3
  import dask.dataframe as dd
2
4
  import pandas as pd
3
5
 
@@ -23,6 +25,58 @@ class DataUtils:
23
25
  self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
24
26
  self.debug = kwargs.get('debug', False)
25
27
 
28
+ @staticmethod
29
+ def _transform_column(series, fill_value, dtype):
30
+ """
31
+ Helper method to transform a column by converting it to numeric, filling missing values,
32
+ and casting to the specified dtype.
33
+
34
+ :param series: The column to transform.
35
+ :type series: pd.Series or dd.Series
36
+ :param fill_value: Value to replace missing or invalid data.
37
+ :type fill_value: int or float
38
+ :param dtype: Target data type for the column.
39
+ :type dtype: type
40
+ :return: Transformed column.
41
+ :rtype: pd.Series or dd.Series
42
+ """
43
+ return (
44
+ pd.to_numeric(series, errors="coerce") # Convert to numeric, invalid to NaN
45
+ .fillna(fill_value) # Replace NaN with fill_value
46
+ .astype(dtype) # Convert to target dtype
47
+ )
48
+
49
def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0,
                              dtype=int):
    """
    Transform specified numeric columns in the DataFrame by converting their data types
    to the specified dtype and replacing missing values with the given fill_value.

    Names in ``columns`` that are not present in ``df`` are skipped. The columns are
    reassigned on ``df`` itself, and that same object is returned.

    :param df: DataFrame to be transformed.
    :type df: pd.DataFrame or dd.DataFrame
    :param columns: List of column names to transform.
    :type columns: list[str]
    :param fill_value: Value to replace missing or invalid data. Default is 0.
    :type fill_value: int or float
    :param dtype: Target data type for the columns. Default is int.
    :type dtype: type
    :return: Transformed DataFrame.
    :rtype: pd.DataFrame or dd.DataFrame
    """
    if not columns:
        self.logger.warning("No columns specified.")
        return df

    self.logger.debug(f"DataFrame type: {type(df)}")
    columns = [col for col in columns if col in df.columns]

    # A pandas Series has no ``map_partitions``; only the non-pandas
    # (Dask) path goes through it.
    is_pandas = isinstance(df, pd.DataFrame)

    for col in columns:
        if is_pandas:
            df[col] = self._transform_column(df[col], fill_value, dtype)
        else:
            df[col] = df[col].map_partitions(
                self._transform_column, fill_value, dtype, meta=(col, dtype)
            )

    return df
79
+
26
80
  def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
27
81
  """
28
82
  This function transforms the specified numeric columns in the given dataframe by converting
@@ -57,34 +111,21 @@ class DataUtils:
57
111
 
58
112
  return df
59
113
 
60
- def transform_boolean_columns(self, df, columns=None):
114
def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
    """
    Cast the given columns to boolean, substituting ``fill_value`` for missing data.

    This delegates to ``transform_numeric_columns`` with ``dtype=bool``.

    :param df: DataFrame to be transformed.
    :type df: pd.DataFrame or dd.DataFrame
    :param columns: List of column names to transform.
    :type columns: list[str]
    :param fill_value: Value to replace missing or invalid data. Default is 0.
    :type fill_value: int or float
    :return: Transformed DataFrame.
    :rtype: pd.DataFrame or dd.DataFrame
    """
    converted = self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
    return converted
88
129
 
89
130
  def merge_lookup_data(self, classname, df, **kwargs):
90
131
  """