sibi-dst 0.3.27__py3-none-any.whl → 0.3.28__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,9 +3,11 @@ from __future__ import annotations
  from ._df_helper import DfHelper
  from ._parquet_artifact import ParquetArtifact
  from ._parquet_reader import ParquetReader
+ #from .data_cleaner import DataCleaner

  __all__ = [
  'DfHelper',
  'ParquetArtifact',
  'ParquetReader',
+ #'DataCleaner'
  ]
@@ -6,6 +6,7 @@ from typing import Any, Dict, TypeVar
  from typing import Union, Optional

  import dask.dataframe as dd
+ from dask import delayed, compute
  import pandas as pd
  from pydantic import BaseModel

@@ -29,6 +30,38 @@ warnings.filterwarnings(


  class DfHelper:
+ """
+ DfHelper is a utility class for managing, loading, and processing data from
+ various backends, such as Django databases, Parquet files, HTTP sources, and
+ SQLAlchemy-based databases. The class abstracts the complexities of handling
+ different backends and provides a unified interface for data operations.
+
+ The class is particularly useful for projects that require flexibility in
+ data source configuration and seamless integration with both Dask and Pandas
+ for handling data frames. It includes robust mechanisms for post-processing
+ data, filtering columns, renaming, and setting indices.
+
+ :ivar df: The DataFrame currently being processed or loaded.
+ :type df: Union[dd.DataFrame, pd.DataFrame]
+ :ivar backend_django: Configuration for interacting with Django database backends.
+ :type backend_django: Optional[DjangoConnectionConfig]
+ :ivar _backend_query: Internal configuration for query handling.
+ :type _backend_query: Optional[QueryConfig]
+ :ivar _backend_params: Internal parameters configuration for DataFrame handling.
+ :type _backend_params: Optional[ParamsConfig]
+ :ivar backend_parquet: Configuration for Parquet file handling.
+ :type backend_parquet: Optional[ParquetConfig]
+ :ivar backend_http: Configuration for interacting with HTTP-based backends.
+ :type backend_http: Optional[HttpConfig]
+ :ivar backend_sqlalchemy: Configuration for interacting with SQLAlchemy-based databases.
+ :type backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig]
+ :ivar parquet_filename: The filename for a Parquet file, if applicable.
+ :type parquet_filename: str
+ :ivar logger: Logger instance used for debugging and information logging.
+ :type logger: Logger
+ :ivar default_config: Default configuration dictionary that can be overridden.
+ :type default_config: Dict
+ """
  df: Union[dd.DataFrame, pd.DataFrame] = None
  backend_django: Optional[DjangoConnectionConfig] = None
  _backend_query: Optional[QueryConfig] = None
@@ -60,7 +93,20 @@ class DfHelper:
  def __str__(self):
  return self.__class__.__name__

+ def __call__(self, **options):
+ return self.load(**options)
+
  def __post_init(self, **kwargs):
+ """
+ Initializes backend-specific configurations based on the provided backend type and other
+ parameters. This method performs configuration setup dependent on the selected backend,
+ such as 'django_db', 'parquet', 'http', or 'sqlalchemy'. Configuration for each backend
+ type is fetched or instantiated as necessary using provided parameters or default values.
+
+ :param kwargs: Dictionary of arguments passed during initialization of backend configurations.
+ Additional parameters for specific backend types are extracted here.
+ :return: None
+ """
  self.logger.debug(f"backend used: {self.backend}")
  self._backend_query = self.__get_config(QueryConfig, kwargs)
  self._backend_params = self.__get_config(ParamsConfig, kwargs)
@@ -88,7 +134,35 @@ class DfHelper:
  model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
  return model(**model_kwargs)

+ def load_parallel(self, **options):
+ """
+ Executes the `load` method in parallel using Dask, allowing multiple instances
+ to run concurrently. This function leverages Dask's `delayed` and `compute`
+ methods to schedule and process tasks in parallel. It is designed to handle
+ concurrent workloads efficiently by utilizing up to 4 parallel executions of
+ the `load` function.
+
+ :param options: Keyword arguments to be passed to the `load` method. These options
+ will be applied to all parallel instances of the `load` method.
+ :return: A list of results, where each element represents the output
+ from one of the parallel executions of the `load` method.
+ """
+ # Define tasks using Dask's delayed
+ tasks = [delayed(self.load)(**options) for _ in range(4)]
+ results = compute(*tasks)
+ return results
+
  def load(self, **options):
+ """
+ Loads data from a dataframe backend, ensuring compatibility with multiple
+ data processing backends. Provides the data in a pandas dataframe format
+ if the `as_pandas` attribute is set to True.
+
+ :param options: Arbitrary keyword arguments for dataframe loading customization.
+ :type options: dict
+ :return: The loaded dataframe, computed as a pandas dataframe if
+ `as_pandas` is set to True, or kept in its native backend format otherwise.
+ """
  # this will be the universal method to load data from a df irrespective of the backend
  df = self.__load(**options)
@@ -96,7 +170,23 @@ class DfHelper:
  return df

  def __load(self, **options):
-
+ """
+ Private method responsible for loading data using a specified backend. This method
+ abstracts away the details of interacting with the backend and dynamically calls the
+ appropriate function depending on the backend type. It supports multiple backend
+ types, such as `django_db`, `sqlalchemy`, `parquet`, and `http`. If the `http` backend
+ is selected, it checks whether the asyncio event loop is running and either runs the
+ process as a new asyncio task or synchronously.
+
+ :param options: Arbitrary keyword arguments provided for backend-specific configurations.
+ These should align with the requirements of the chosen backend.
+ :type options: dict
+
+ :return: The data loaded from the specified backend. The return type is dependent on
+ the particular backend being used.
+ :rtype: Depending on backend implementation; could be `Task`, `List`, `Dict`, or
+ another format defined by the backend.
+ """
  if self.backend == 'django_db':
  self._backend_params.parse_params(options)
  return self.__load_from_db(**options)
@@ -167,8 +257,13 @@ class DfHelper:

  def __post_process_df(self):
  """
- Efficiently process the DataFrame by filtering, renaming, and setting indices.
- Optimized for large datasets with Dask compatibility.
+ Processes a DataFrame according to the provided parameters defined within the
+ `self._backend_params.df_params` dictionary. This involves filtering columns,
+ renaming columns, setting an index column, and handling datetime indexing.
+ The method modifies the DataFrame in place.
+
+ :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
+ or if the specified `index_col` is not found in the DataFrame.
  """
  df_params = self._backend_params.df_params
  fieldnames = df_params.get("fieldnames", None)
@@ -205,6 +300,21 @@ class DfHelper:
  self.logger.debug("Post-processing of DataFrame completed.")

  def __process_loaded_data(self):
+ """
+ Processes the dataframe by applying renaming logic based on the given field map
+ configuration. Inspects the dataframe for missing columns referenced in the field
+ map and flags them with a warning. Applies renaming only for columns that exist
+ in the dataframe while ensuring that no operations take place if the dataframe
+ is empty.
+
+ :param self: The instance of the class where the dataframe is being processed.
+ :type self: object with attributes `df`, `_backend_params`, and `logger`.
+
+ :raises Warning: Logs a warning if specified columns in the `field_map` are not
+ present in the dataframe.
+
+ :return: None
+ """
  self.logger.debug(f"Type of self.df: {type(self.df)}")
  if self.df.map_partitions(len).compute().sum() > 0:
  field_map = self._backend_params.field_map or {}
@@ -239,20 +349,54 @@ class DfHelper:
  self.logger.debug("Save to ClickHouse completed.")

  def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
- self.df = self.backend_parquet.load_files()
- if options:
- """
- deprecated specific filter handling to a generic one
- self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
-
- """
- self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
- return self.df
+ try:
+ self.df = self.backend_parquet.load_files()
+ if options and self.df is not None:
+ """
+ deprecated specific filter handling to a generic one
+ self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+ """
+ self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
+ return self.df
+ except Exception as e:
+ self.logger.debug(f"Failed to load data from parquet: {e}")
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)

  def load_period(self, **kwargs):
  return self.__load_period(**kwargs)

  def __load_period(self, **kwargs):
+ """
+ Validates and processes the temporal filtering parameters `start` and `end` for querying,
+ ensuring correctness and compatibility with a specified backend (Django or SQLAlchemy).
+ This method dynamically maps and validates the provided datetime or date field from the
+ model according to the configured backend, and applies the appropriate filters to query objects.
+
+ This function enforces that both `start` and `end` are provided and checks if the start date
+ is earlier or the same as the end date. It supports parsing string representations of dates
+ and validates them against the date or datetime fields associated with the chosen backend.
+ If the backend or field is incompatible or missing, appropriate errors are raised.
+
+ The resulting filter conditions are integrated into `kwargs` for querying with the
+ appropriate backend model.
+
+ :param kwargs: Keyword arguments, including temporal filtering parameters and optionally a
+ datetime or date field name. Supported parameters include:
+ - **dt_field**: The name of the date or datetime field to use in filtering. Defaults
+ to an internally set field if not explicitly provided.
+ - **start**: The starting date or datetime for the query range. Can be a `str` or
+ `datetime.date/datetime.datetime` object.
+ - **end**: The ending date or datetime for the query range. Can be a `str` or
+ `datetime.date/datetime.datetime` object.
+
+ :return: Queryset or result of the load function with the applied temporal filters.
+ :rtype: Any
+
+ :raises ValueError: If the `dt_field` is not provided, if `start` or `end`
+ are missing, if the `start` date is later than `end`, or if the `dt_field`
+ does not exist in the backend model or its metadata.
+ """
  dt_field = kwargs.pop("dt_field", self.dt_field)
  if dt_field is None:
  raise ValueError("dt_field must be provided")
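Continuing the sketch above, `load_period` wraps the validation this docstring describes; the field name and date values below are illustrative only.

    df_jan = helper.load_period(
        dt_field='created_at',
        start='2025-01-01',
        end='2025-01-31 23:59:59',
    )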
@@ -316,6 +460,30 @@ class DfHelper:

  @staticmethod
  def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
+ """
+ Parses a date string and converts it to a `datetime.datetime` or
+ `datetime.date` object.
+
+ This method attempts to parse the given string in two distinct formats:
+ 1. First, it tries to interpret the string as a datetime with the format
+ ``%Y-%m-%d %H:%M:%S``. If successful, it returns a `datetime.datetime`
+ object.
+ 2. If the first format parsing fails, it attempts to parse the string as
+ a date with the format ``%Y-%m-%d``. If successful, it returns a
+ `datetime.date` object.
+
+ If the string cannot be parsed in either of these formats, the method will
+ raise a `ValueError`.
+
+ :param date_str: The date string to be parsed. Expected to match one of the
+ formats: ``%Y-%m-%d %H:%M:%S`` or ``%Y-%m-%d``.
+ :type date_str: str
+ :return: A `datetime.datetime` object if the string matches the first format,
+ or a `datetime.date` object if the string matches the second format.
+ :rtype: Union[datetime.datetime, datetime.date]
+ :raises ValueError: Raised if neither date format can be successfully parsed
+ from the provided string.
+ """
  try:
  return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
  except ValueError:
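The two formats accepted by `parse_date` behave as follows:

    DfHelper.parse_date('2025-01-15 08:30:00')  # -> datetime.datetime(2025, 1, 15, 8, 30)
    DfHelper.parse_date('2025-01-15')           # -> datetime.date(2025, 1, 15)
    DfHelper.parse_date('15/01/2025')           # raises ValueError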
@@ -9,6 +9,22 @@ from sibi_dst.utils import Logger


  class FilterHandler:
+ """
+ Handles the application of filters to data sources with support for SQLAlchemy and Dask backends.
+
+ The FilterHandler class abstracts the process of applying filters to various backends, specifically
+ SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
+ exact matches, comparisons, and string-related operations such as contains and regex. The handler
+ automatically determines and applies backend-specific processing, enabling seamless integration with
+ different data models or backends.
+
+ :ivar backend: The backend in use ('sqlalchemy' or 'dask').
+ :type backend: str
+ :ivar logger: An optional logger instance for debugging and logging purposes.
+ :type logger: Logger
+ :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
+ :type backend_methods: dict
+ """
  def __init__(self, backend, logger=None):
  """
  Initialize the FilterHandler.
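A rough sketch of the Dask path of `FilterHandler`, as used by `__load_from_parquet` above; the import path and the Django-style lookup keys (`field`, `field__gte`, `field__contains`, ...) are assumptions, not documented API.

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.df_helper.core._filter_handler import FilterHandler  # module path taken from RECORD below

    ddf = dd.from_pandas(pd.DataFrame({'status': ['open', 'closed'], 'amount': [10, 250]}), npartitions=1)

    handler = FilterHandler(backend='dask')
    filtered = handler.apply_filters(ddf, filters={'status': 'open', 'amount__gte': 5})  # lookup syntax assumed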
@@ -0,0 +1,132 @@
+ import re
+ from nltk.corpus import stopwords
+ from nltk.stem import SnowballStemmer
+ import dask.dataframe as dd
+ from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
+ import nltk
+
+ class DataCleaner:
+ def __init__(self, dataframe):
+ self.original_df = dataframe
+ self.df = dataframe.copy()
+ self.duplicates_df = None
+
+ def handle_missing_values(self, strategy='mean'):
+ if strategy == 'mean':
+ self.df = self.df.fillna(self.df.mean())
+ elif strategy == 'median':
+ self.df = self.df.fillna(self.df.median())
+ elif strategy == 'mode':
+ self.df = self.df.fillna(self.df.mode().iloc[0])
+ elif strategy == 'drop':
+ self.df = self.df.dropna()
+ return self
+
+ def identify_duplicates(self, subset=None):
+ self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
+ return self.duplicates_df
+
+ def remove_duplicates(self):
+ if self.duplicates_df is not None:
+ self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
+ return self
+
+ def validate_date_fields(self, date_columns=None):
+ if date_columns is None:
+ date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
+ for col in date_columns:
+ print('Validating date field: ', col)
+ self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
+ return self
+
+ def clean_text(self, text_columns=None, language='english'):
+ nltk.download('stopwords')
+ stop_words = set(stopwords.words(language))
+ stemmer = SnowballStemmer(language)
+
+ def clean_text(text):
+ if isinstance(text, str):
+ text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
+ text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
+ words = text.split()
+ words = [word for word in words if word not in stop_words]  # Remove stop words
+ words = [stemmer.stem(word) for word in words]  # Apply stemming
+ return ' '.join(words)
+ return text
+
+ if text_columns is None:
+ text_columns = self.df.select_dtypes(include=['object', 'string']).columns
+ text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']
+
+ for col in text_columns:
+ print('Cleaning text field: ', col)
+ self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
+ return self
+
+ def validate_numeric_fields(self, int_columns=None, float_columns=None):
+ if int_columns is None:
+ int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
+ if float_columns is None:
+ float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns
+
+ for col in int_columns:
+ print('Validating integer field: ', col)
+ self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')
+
+ for col in float_columns:
+ print('Validating float field: ', col)
+ self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')
+
+ return self
+
+ def detect_categorical_columns(self, threshold=0.05):
+ """
+ Detect columns that can be converted to 'category' dtype.
+
+ Parameters:
+ threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.
+
+ Returns:
+ List of column names that can be converted to 'category' dtype.
+ """
+ categorical_columns = []
+
+ def unique_ratio(partition, col):
+ return partition[col].nunique() / len(partition)
+
+ for col in self.df.columns:
+ print("Detecting categorical columns: ", col)
+ unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
+ overall_unique_ratio = unique_ratios.sum() / len(self.df)
+ if overall_unique_ratio < threshold:
+ print(f'Column {col} is categorical')
+ categorical_columns.append(col)
+
+ return categorical_columns
+
+ def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
+ if columns is None:
+ columns = self.detect_categorical_columns(threshold)
+
+ if method == 'onehot':
+ for col in columns:
+ self.df[col] = self.df[col].astype('category')
+ encoder = OneHotEncoder(sparse_output=False)
+ self.df = encoder.fit_transform(self.df)
+ elif method == 'label':
+ encoder = LabelEncoder()
+ for col in columns:
+ self.df[col] = encoder.fit_transform(self.df[col])
+ return self
+
+ def analyze_dtypes(self):
+ return self.df.dtypes
+
+ def get_cleaned_dataframe(self):
+ return self.df
+
+ def get_original_dataframe(self):
+ return self.original_df
+
+ def get_duplicates_dataframe(self):
+ return self.duplicates_df
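`DataCleaner` methods return `self`, so they chain; a small usage sketch (the class is not re-exported from `sibi_dst.df_helper`, so it is imported from its module directly):

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.df_helper.data_cleaner import DataCleaner

    ddf = dd.from_pandas(pd.DataFrame({'name': ['  Foo ', 'Bar'], 'qty': [1.0, None]}), npartitions=1)

    cleaner = (DataCleaner(ddf)
               .handle_missing_values(strategy='drop')
               .validate_numeric_fields()
               .clean_text())                       # downloads NLTK stopwords on first use
    cleaned = cleaner.get_cleaned_dataframe()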
@@ -0,0 +1,9 @@
+ from __future__ import annotations
+
+ from .base_osm_map import BaseOsmMap
+ from .utils import PBFHandler
+
+ __all__ = [
+ "BaseOsmMap",
+ "PBFHandler",
+ ]
@@ -0,0 +1,165 @@
+ from __future__ import annotations
+
+ import html
+ from abc import abstractmethod
+
+ import folium
+ import geopandas as gpd
+ import numpy as np
+ import osmnx as ox
+ from folium.plugins import Fullscreen
+
+
+ class BaseOsmMap:
+ tile_options = {
+ "OpenStreetMap": "OpenStreetMap",
+ "CartoDB": "cartodbpositron",
+ "CartoDB Voyager": "cartodbvoyager"
+ }
+ # Set default bounds for Costa Rica
+ bounds = [[8.0340, -85.9417], [11.2192, -82.5566]]
+
+ def __init__(self, osmnx_graph=None, df=None, **kwargs):
+ if osmnx_graph is None:
+ raise ValueError('osmnx_graph must be provided')
+ if df is None:
+ raise ValueError('df must be provided')
+ if df.empty:
+ raise ValueError('df must not be empty')
+ self.df = df.copy()
+ self.osmnx_graph = osmnx_graph
+ self.lat_col = kwargs.get('lat_col', 'latitude')
+ self.lon_col = kwargs.get('lon_col', 'longitude')
+ self.osm_map = None
+ self.G = None
+ self.map_html_title = self._sanitize_html(kwargs.get('map_html_title', 'OSM Basemap'))
+
+ self.zoom_start = kwargs.pop('zoom_start', 13)
+ self.fullscreen = kwargs.pop('fullscreen', True)
+ self.fullscreen_position = kwargs.pop('fullscreen_position', 'topright')
+ self.tiles = kwargs.pop('tiles', 'OpenStreetMap')
+ self.verbose = kwargs.pop('verbose', False)
+ self.sort_keys = kwargs.pop('sort_keys', None)
+ self.dt_field = kwargs.pop('dt_field', None)
+ self.dt = None
+ self.calc_nearest_nodes = kwargs.pop('calc_nearest_nodes', False)
+ self.nearest_nodes = None
+ self.max_bounds = kwargs.pop('max_bounds', False)
+ self._prepare_df()
+ self._initialise_map()
+
+
+ def _prepare_df(self):
+ if self.sort_keys:
+ self.df.sort_values(by=self.sort_keys, inplace=True)
+ self.df.reset_index(drop=True, inplace=True)
+ self.gps_points = self.df[[self.lat_col, self.lon_col]].values.tolist()
+ if self.dt_field is not None:
+ self.dt = self.df[self.dt_field].tolist()
+
+ if self.calc_nearest_nodes:
+ self.nearest_nodes = ox.distance.nearest_nodes(self.osmnx_graph, X=self.df[self.lon_col],
+ Y=self.df[self.lat_col])
+
+
+ def _initialise_map(self):
+ gps_array = np.array(self.gps_points)
+ mean_latitude = np.mean(gps_array[:, 0])
+ mean_longitude = np.mean(gps_array[:, 1])
+ self.osm_map = folium.Map(location=[mean_latitude, mean_longitude], zoom_start=self.zoom_start,
+ tiles=self.tiles, max_bounds=self.max_bounds)
+ north, south, east, west = self._get_bounding_box_from_points(margin=0.001)
+ self.G = self._extract_subgraph(north, south, east, west)
+
+
+ def _attach_supported_tiles(self):
+ # Normalize the default tile name to lowercase for comparison
+ normalized_default_tile = self.tiles.lower()
+
+ # Filter out the default tile layer from the options to avoid duplication
+ tile_options_filtered = {k: v for k, v in self.tile_options.items() if v.lower() != normalized_default_tile}
+
+ for tile, description in tile_options_filtered.items():
+ folium.TileLayer(name=tile, tiles=description, show=False).add_to(self.osm_map)
+
+
+ def _get_bounding_box_from_points(self, margin=0.001):
+ latitudes = [point[0] for point in self.gps_points]
+ longitudes = [point[1] for point in self.gps_points]
+
+ north = max(latitudes) + margin
+ south = min(latitudes) - margin
+ east = max(longitudes) + margin
+ west = min(longitudes) - margin
+
+ return north, south, east, west
+
+
+ def _extract_subgraph(self, north, south, east, west):
+ # Create a bounding box polygon
+ # from osmnx v2 this is how it is done
+ if ox.__version__ >= '2.0':
+ bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(bbox=(west, south, east, north))])
+ else:
+ bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(north, south, east, west)])
+
+ # Get nodes GeoDataFrame
+ nodes_gdf = ox.graph_to_gdfs(self.osmnx_graph, nodes=True, edges=False)
+
+ # Find nodes within the bounding box
+ nodes_within_bbox = nodes_gdf[nodes_gdf.geometry.within(bbox_poly.geometry.unary_union)]
+
+ # Create subgraph
+ subgraph = self.osmnx_graph.subgraph(nodes_within_bbox.index)
+
+ return subgraph
+
+
+ @abstractmethod
+ def process_map(self):
+ # this is to be implemented at the subclass level
+ # implement here your specific map logic.
+ ...
+
+
+ def pre_process_map(self):
+ # this is to be implemented at the subclass level
+ # call super().pre_process_map first to inherit the following behaviour
+ ...
+
+
+ def _post_process_map(self):
+ self._attach_supported_tiles()
+ self.add_tile_layer()
+ self._add_fullscreen()
+ self._add_map_title()
+ if self.max_bounds:
+ self.osm_map.fit_bounds(self.bounds)
+
+
+ def add_tile_layer(self):
+ # Override in subclass and call super().add_tile_layer at the end
+ folium.LayerControl().add_to(self.osm_map)
+
+
+ def _add_fullscreen(self):
+ if self.fullscreen:
+ Fullscreen(position=self.fullscreen_position).add_to(self.osm_map)
+
+
+ def _add_map_title(self):
+ if self.map_html_title:
+ self.osm_map.get_root().html.add_child(folium.Element(self.map_html_title))
+
+
+ @staticmethod
+ def _sanitize_html(input_html):
+ return html.escape(input_html)
+
+
+ def generate_map(self):
+ self.pre_process_map()
+ self.process_map()
+ self._post_process_map()
+
+ return self.osm_map
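`BaseOsmMap` is abstract; only `process_map` must be supplied. A minimal subclass sketch, assuming a graph from `PBFHandler`/`get_graph` (see utils below) and a dataframe with `latitude`/`longitude` columns:

    import folium
    from sibi_dst.osmnx_helper import BaseOsmMap

    class SimplePointsMap(BaseOsmMap):
        def process_map(self):
            # Plot every GPS point gathered by _prepare_df on the folium map.
            for lat, lon in self.gps_points:
                folium.CircleMarker(location=[lat, lon], radius=3).add_to(self.osm_map)

    # m = SimplePointsMap(osmnx_graph=graph, df=points_df, map_html_title='Stops').generate_map()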
File without changes
@@ -0,0 +1,122 @@
+
+ # HTML and CSS for the calendar button and popup
+ calendar_html = """
+ <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/flatpickr/dist/flatpickr.min.css">
+ <script src="https://cdn.jsdelivr.net/npm/flatpickr"></script>
+
+ <style>
+ /* Style for the calendar button */
+ .calendar-btn {
+ background-color: white;
+ border: 1px solid gray;
+ border-radius: 3px;
+ padding: 5px;
+ font-size: 16px;
+ cursor: pointer;
+ position: fixed; /* Changed from absolute to fixed */
+ bottom: 50px; /* Adjust position relative to the viewport */
+ left: 10px; /* Adjust position relative to the viewport */
+ z-index: 10000; /* Ensure it stays on top of other elements */
+ }
+
+ /* Calendar popup with sufficient size */
+ .calendar-popup {
+ display: none;
+ position: fixed; /* Keep the popup fixed so it stays in view */
+ bottom: 100px;
+ left: 10px;
+ background-color: white;
+ padding: 10px;
+ border: 1px solid gray;
+ border-radius: 3px;
+ z-index: 10000; /* Ensure it stays on top of other elements */
+ width: 250px;
+ height: 300px;
+ }
+
+ /* Ensure the calendar fits properly */
+ #calendar {
+ width: 100%;
+ height: auto;
+ }
+ </style>
+
+ <!-- Calendar Button -->
+ <div class="calendar-btn">📅 Select Date</div>
+
+ <!-- Calendar Popup -->
+ <div class="calendar-popup" id="calendar-popup">
+ <div id="calendar"></div>
+ </div>
+
+ <script>
+ // Initialize Flatpickr calendar
+ const today = new Date().toISOString().split('T')[0];
+ // Function to show the "Please wait" message
+ function showLoadingMessage() {
+ let loadingMessage = document.createElement("div");
+ loadingMessage.id = "loading-message";
+ loadingMessage.style.position = "fixed";
+ loadingMessage.style.top = "50%";
+ loadingMessage.style.left = "50%";
+ loadingMessage.style.transform = "translate(-50%, -50%)";
+ loadingMessage.style.backgroundColor = "rgba(0, 0, 0, 0.8)";
+ loadingMessage.style.color = "white";
+ loadingMessage.style.padding = "20px";
+ loadingMessage.style.borderRadius = "5px";
+ loadingMessage.style.zIndex = "9999";
+ loadingMessage.innerText = "Please wait...";
+ document.body.appendChild(loadingMessage);
+ }
+
+ // Function to remove the "Please wait" message
+ function removeLoadingMessage() {
+ let loadingMessage = document.getElementById("loading-message");
+ if (loadingMessage) {
+ loadingMessage.remove();
+ }
+ }
+
+
+ flatpickr("#calendar", {
+ inline: true, // Render the calendar inline within the container
+ maxDate: today, // Disable future dates
+ onChange: function(selectedDates, dateStr, instance) {
+ console.log("Selected date: " + dateStr); // Debugging: Log the selected date
+ // Get the current URL and create a URL object to manipulate the query parameters
+ // Get the current URL from the parent window
+ showLoadingMessage();
+ let currentUrl = window.parent.location.href;
+
+ // If the URL contains "srcdoc", remove it and use the correct base path
+ if (currentUrl.includes("srcdoc")) {
+ currentUrl = currentUrl.replace("srcdoc", "");
+ }
+
+ const url = new URL(currentUrl);
+
+ // Set or update the 'date' parameter while preserving existing parameters
+ url.searchParams.set('date', dateStr);
+
+ console.log("Updated URL: " + url.toString()); // Debugging: Log the updated URL
+
+ // Update the parent window's location with the new URL
+ window.parent.location.href = url.toString();
+ }
+ });
+ // Remove the "Please wait" message once the page has finished loading
+ window.addEventListener("load", function() {
+ removeLoadingMessage();
+ });
+
+ // Toggle the calendar popup when the button is clicked
+ document.querySelector(".calendar-btn").addEventListener("click", function() {
+ var popup = document.getElementById("calendar-popup");
+ if (popup.style.display === "none" || popup.style.display === "") {
+ popup.style.display = "block";
+ } else {
+ popup.style.display = "none";
+ }
+ });
+ </script>
+ """
@@ -0,0 +1,186 @@
+ from __future__ import annotations
+ from sibi_dst.osmnx_helper.utils import get_distance_between_points, add_arrows
+ from collections import defaultdict
+ import folium
+ from folium.plugins import AntPath
+ import networkx as nx
+
+ from sibi_dst.osmnx_helper import BaseOsmMap
+ from sibi_dst.osmnx_helper.basemaps.calendar_html import calendar_html
+
+ class RoutePlotter(BaseOsmMap):
+ def __init__(self, osmnx_graph, df, **kwargs):
+ self.action_field = kwargs.pop('action_field', '')
+ self.action_groups = kwargs.pop('action_groups', {})
+ self.action_styles = kwargs.pop('action_styles', {})
+ self.use_ant_path = kwargs.pop('use_ant_path', True)
+ self.show_calendar = kwargs.pop('show_calendar', True)
+ self.show_map_title = kwargs.pop('show_map_title', True)
+ self.sort_keys = kwargs.pop('sort_keys', None)
+ self.main_route_layer = folium.FeatureGroup(name="Main Route")
+ self.feature_groups = {}
+ self.feature_group_counts = {}
+ self.total_distance = 0.0
+ self.actions = []
+ self.action_group_counts = {action_group: 0 for action_group in self.action_groups}
+ self.marker_count = 1
+ kwargs.update({'calc_nearest_nodes': True})
+ kwargs['dt_field'] = 'date_time'
+ super().__init__(osmnx_graph, df, **kwargs)
+
+ def pre_process_map(self):
+ super().pre_process_map()
+ self.actions = self.df[self.action_field].tolist()
+
+ def process_map(self):
+ self._calculate_routes()
+ self._plot_routes()
+ self._add_markers()
+ self.main_route_layer.add_to(self.osm_map)
+ if self.show_calendar:
+ self._add_calendar()
+
+ def _calculate_routes(self):
+ if self.verbose:
+ print("Calculating routes and markers...")
+ distances = [
+ get_distance_between_points(tuple(self.gps_points[0]), tuple(coord), 'm')
+ for coord in self.gps_points
+ ]
+ self.max_distance_index = distances.index(max(distances))
+ self.max_time_index = self.dt.index(max(self.dt))
+ self.route_polylines = []
+ self.markers = defaultdict(list)  # Store markers for action groups
+ for i in range(len(self.gps_points) - 1):
+ polyline, color, markers = self._calculate_route(i)
+ if polyline:
+ self.route_polylines.append((polyline, color))
+ for action_group, action_markers in markers.items():
+ self.markers[action_group].extend(action_markers)
+ self.action_group_counts[action_group] += 1
+ self.marker_count += 1
+ if self.verbose:
+ print("Route and marker calculation complete.")
+
+ for action_group in self.action_groups:
+ count = self.action_group_counts[action_group]
+ self.feature_groups[action_group] = folium.FeatureGroup(name=f"{action_group} ({count})").add_to(
+ self.osm_map)
+ self.osm_map.add_child(self.feature_groups[action_group])
+
+ def _calculate_route(self, i):
+ if self.verbose:
+ print(f"Calculating for item:{i}")
+ orig = self.nearest_nodes[i]
+ dest = self.nearest_nodes[i + 1]
+ try:
+ route = nx.shortest_path(self.G, orig, dest, weight='length')
+ route_length = sum(self.G[u][v][0]['length'] for u, v in zip(route[:-1], route[1:]))
+ self.total_distance += route_length
+ offset = 0 if i < self.max_distance_index else 0.0005
+ lats, lons = zip(*[(self.G.nodes[node]['y'] + offset, self.G.nodes[node]['x']) for node in route])
+ color = 'blue' if i < self.max_distance_index else 'red'
+ polyline = list(zip(lats, lons))
+ markers = self._calculate_markers(i)
+ return polyline, color, markers
+ except nx.NetworkXNoPath:
+ if self.verbose:
+ print(f"Item:{i}-No path found for {orig} to {dest}")
+ return None, None, {}
+ except nx.NodeNotFound:
+ if self.verbose:
+ print(f"Item:{i}-No path found for {orig} to {dest}")
+ return None, None, {}
+
+ def _calculate_markers(self, i):
+ # Calculate markers for action groups
+ markers = defaultdict(list)
+ for action_group in self.action_groups:
+ action_indices = [idx for idx, action in enumerate(self.actions) if action == action_group]
+ for idx in action_indices:
+ if idx == i:
+ location = self.gps_points[i]
+ tooltip = f"Result {self.marker_count}: {action_group}<br>Date/time:{self.dt[i]}"
+ popup_data = self._get_data(i)
+ action_style = self.action_styles.get(action_group,
+ {'color': 'blue', 'icon': 'marker', 'prefix': 'fa'})
+ markers[action_group].append((location, tooltip, popup_data, action_style))
+ return markers
+
+ def _plot_routes(self):
+ if self.verbose:
+ print("Plotting routes and markers...")
+ # self.action_group_counts = {action_group: 0 for action_group in self.feature_groups.keys()}
+ for polyline, color in self.route_polylines:
+ if self.use_ant_path:
+ AntPath(
+ locations=polyline,
+ color=color,
+ weight=3,  # Increase line thickness
+ opacity=10,  # Increase opacity
+ # pulse_color=color,
+ delay=1000,  # Slower animation to reduce flickering
+ # dash_array=[20, 30] # Adjust dash pattern if needed
+ ).add_to(self.main_route_layer)
+ else:
+ folium.PolyLine(locations=polyline, color=color).add_to(self.main_route_layer)
+ self.osm_map = add_arrows(self.osm_map, polyline, color, n_arrows=3)
+ # Plot markers for action groups
+ for action_group, action_markers in self.markers.items():
+ for location, tooltip, popup_data, action_style in action_markers:
+ folium.Marker(
+ location=location,
+ popup=folium.Popup(popup_data, max_width=600),
+ tooltip=tooltip,
+ icon=folium.Icon(
+ icon=action_style.get("icon"),
+ color=action_style.get("color"),
+ prefix=action_style.get("prefix")
+ )
+ ).add_to(self.feature_groups[action_group])
+
+ if self.verbose:
+ print("Route and marker plotting complete.")
+
+ def _add_markers(self):
+ if self.verbose:
+ print("Adding markers...")
+ # Add start marker
+ start_popup = folium.Popup(f"Start of route at {self.dt[0]}", max_width=300)
+ folium.Marker(location=self.gps_points[0], popup=start_popup,
+ icon=folium.Icon(icon='flag-checkered', prefix='fa')).add_to(self.osm_map)
+ # Add total distance marker at the end
+ folium.Marker(
+ self.gps_points[-1],
+ popup=f"End of Route at {self.dt[self.max_time_index]}. Total Distance Travelled: {self.total_distance / 1000:.2f} km",
+ icon=folium.Icon(color="red", icon="flag-checkered", prefix="fa")
+ ).add_to(self.osm_map)
+ if self.verbose:
+ print("Marker addition complete.")
+
+ def _add_calendar(self):
+ calendar_element = folium.Element(calendar_html)
+ self.osm_map.get_root().html.add_child(calendar_element)
+
+ def _add_map_title(self):
+ if self.map_html_title and self.show_map_title:
+ title_html = f'''
+ <div style="position: fixed;
+ top: 10px;
+ left: 50%;
+ transform: translate(-50%, 0%);
+ z-index: 9999;
+ font-size: 24px;
+ font-weight: bold;
+ background-color: white;
+ padding: 10px;
+ border: 2px solid black;
+ border-radius: 5px;">
+ {self.map_html_title}
+ </div>
+ '''
+ self.osm_map.get_root().html.add_child(folium.Element(title_html))
+
+ def _get_data(self, index):
+ # implement in subclass to populate popups
+ ...
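A hypothetical `RoutePlotter` invocation, inferred from the kwargs popped in `__init__` above (the dataframe needs `latitude`, `longitude`, a `date_time` column because `dt_field` is forced, and the chosen action field; column names and styles below are illustrative):

    plotter = RoutePlotter(
        osmnx_graph=graph,                  # e.g. from get_graph() in utils below
        df=trips_df,
        action_field='event_type',
        action_groups=('pickup', 'dropoff'),
        action_styles={'pickup': {'color': 'green', 'icon': 'play', 'prefix': 'fa'}},
        map_html_title='Route for 2025-01-15',
    )
    route_map = plotter.generate_map()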
@@ -0,0 +1,267 @@
+ import math
+ import os
+ import pickle
+ from urllib.parse import urlencode, urlsplit, urlunsplit
+
+ import folium
+ import geopandas as gpd
+ import numpy as np
+ import osmnx as ox
+ from geopy.distance import geodesic
+
+
+ #
+ # options = {
+ # 'ox_files_save_path': ox_files_save_path,
+ # 'network_type': 'drive',
+ # 'place': 'Costa Rica',
+ # 'files_prefix': 'costa-rica-',
+ # }
+ # Usage example
+ # handler = PBFHandler(**options)
+ # handler.load()
+
+
+ class PBFHandler:
+ def __init__(self, **kwargs):
+ self.graph = None
+ self.nodes = None
+ self.edges = None
+ self.rebuild = kwargs.setdefault("rebuild", False)
+ self.verbose = kwargs.setdefault("verbose", False)
+ self.place = kwargs.setdefault('place', 'Costa Rica')
+ self.filepath = kwargs.setdefault('ox_files_save_path', "gis_data/")
+ self.file_prefix = kwargs.setdefault('file_prefix', 'costa-rica-')
+ self.network_type = kwargs.setdefault('network_type', 'all')
+ self.graph_file = f"{self.filepath}{self.file_prefix}graph.pkl"
+ self.node_file = f"{self.filepath}{self.file_prefix}nodes.pkl"
+ self.edge_file = f"{self.filepath}{self.file_prefix}edges.pkl"
+
+ def load(self):
+ if self.verbose:
+ print("Loading data...")
+
+ files_to_check = [self.graph_file, self.node_file, self.edge_file]
+
+ if self.rebuild:
+ for file in files_to_check:
+ if os.path.exists(file):
+ os.remove(file)
+ if not os.path.exists(self.filepath):
+ os.makedirs(self.filepath, exist_ok=True)
+ # self.process_pbf()
+ # self.save_to_pickle()
+ if not all(os.path.exists(f) for f in files_to_check):
+ self.process_pbf()
+ self.save_to_pickle()
+ else:
+ self.load_from_pickle()
+
+ if self.verbose:
+ print("Data loaded successfully.")
+
+ def process_pbf(self):
+ """
+ Load a PBF file and create a graph.
+ """
+ try:
+ if self.verbose:
+ print(f"Processing PBF for {self.place}...")
+
+ self.graph = ox.graph_from_place(self.place, network_type=self.network_type)
+ self.nodes, self.edges = ox.graph_to_gdfs(self.graph)
+
+ if self.verbose:
+ print("PBF processed successfully.")
+ except Exception as e:
+ print(f"Error processing PBF: {e}")
+ raise
+
+ def save_to_pickle(self):
+ """
+ Save the graph, nodes, and edges to pickle files.
+ """
+ try:
+ if self.verbose:
+ print("Saving data to pickle files...")
+
+ data_to_save = {
+ self.graph_file: self.graph,
+ self.node_file: self.nodes,
+ self.edge_file: self.edges
+ }
+
+ for file, data in data_to_save.items():
+ if data is not None:
+ with open(file, 'wb') as f:
+ pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
+
+ if self.verbose:
+ print("Data saved to pickle files successfully.")
+ except Exception as e:
+ print(f"Error saving to pickle: {e}")
+ raise
+
+ def load_from_pickle(self):
+ """
+ Load the graph, nodes, and edges from pickle files.
+ """
+ try:
+ if self.verbose:
+ print("Loading data from pickle files...")
+
+ files_to_load = {
+ self.graph_file: 'graph',
+ self.node_file: 'nodes',
+ self.edge_file: 'edges'
+ }
+
+ for file, attr in files_to_load.items():
+ with open(file, 'rb') as f:
+ setattr(self, attr, pickle.load(f))
+
+ if self.verbose:
+ print("Data loaded from pickle files successfully.")
+ except Exception as e:
+ print(f"Error loading from pickle: {e}")
+ raise
+
+ def plot_graph(self):
+ """
+ Plot the graph.
+ """
+ try:
+ if self.graph is not None:
+ if self.verbose:
+ print("Plotting the graph...")
+ ox.plot_graph(self.graph)
+ if self.verbose:
+ print("Graph plotted successfully.")
+ else:
+ print("Graph is not loaded. Please load a PBF file first.")
+ except Exception as e:
+ print(f"Error plotting the graph: {e}")
+ raise
+
+
+ def get_bounding_box_from_points(gps_points, margin=0.001):
+ latitudes = [point[0] for point in gps_points]
+ longitudes = [point[1] for point in gps_points]
+
+ north = max(latitudes) + margin
+ south = min(latitudes) - margin
+ east = max(longitudes) + margin
+ west = min(longitudes) - margin
+
+ return north, south, east, west
+
+
+ def add_arrows(map_object, locations, color, n_arrows):
+ # Get the number of locations
+ n = len(locations)
+
+ # If there are more than two points...
+ if n > 2:
+ # Add arrows along the path
+ for i in range(0, n - 1, n // n_arrows):
+ # Get the start and end point for this segment
+ start, end = locations[i], locations[i + 1]
+
+ # Calculate the direction in which to place the arrow
+ rotation = -np.arctan2((end[1] - start[1]), (end[0] - start[0])) * 180 / np.pi
+
+ folium.RegularPolygonMarker(location=end,
+ fill_color=color,
+ number_of_sides=2,
+ radius=6,
+ rotation=rotation).add_to(map_object)
+ return map_object
+
+
+ def extract_subgraph(G, north, south, east, west):
+ # Create a bounding box polygon
+ # from osmnx v2 this is how it is done
+ if ox.__version__ >= '2.0':
+ bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(bbox=(west, south, east, north))])
+ else:
+ bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(north, south, east, west)])
+
+ # Get nodes GeoDataFrame
+ nodes_gdf = ox.graph_to_gdfs(G, nodes=True, edges=False)
+
+ # Find nodes within the bounding box
+ nodes_within_bbox = nodes_gdf[nodes_gdf.geometry.within(bbox_poly.geometry.unary_union)]
+
+ # Create subgraph
+ subgraph = G.subgraph(nodes_within_bbox.index)
+
+ return subgraph
+
+
+ def get_distance_between_points(point_a, point_b, unit='km'):
+ if not isinstance(point_a, tuple) or len(point_a) != 2:
+ return 0
+ if not all(isinstance(x, float) and not math.isnan(x) for x in point_a):
+ return 0
+ if not isinstance(point_b, tuple) or len(point_b) != 2:
+ return 0
+ if not all(isinstance(x, float) and not math.isnan(x) for x in point_b):
+ return 0
+ distance = geodesic(point_a, point_b)
+ if unit == 'km':
+ return distance.kilometers
+ elif unit == 'm':
+ return distance.meters
+ elif unit == 'mi':
+ return distance.miles
+ else:
+ return 0
+
+
+ tile_options = {
+ "OpenStreetMap": "OpenStreetMap",
+ "CartoDB": "cartodbpositron",
+ "CartoDB Voyager": "cartodbvoyager"
+ }
+
+
+ def attach_supported_tiles(map_object, default_tile="OpenStreetMap"):
+ # Normalize the default tile name to lowercase for comparison
+ normalized_default_tile = default_tile.lower()
+
+ # Filter out the default tile layer from the options to avoid duplication
+ tile_options_filtered = {k: v for k, v in tile_options.items() if v.lower() != normalized_default_tile}
+
+ for tile, description in tile_options_filtered.items():
+ folium.TileLayer(name=tile, tiles=description, show=False).add_to(map_object)
+
+
+ def get_graph(**options):
+ handler = PBFHandler(**options)
+ handler.load()
+ return handler.graph, handler.nodes, handler.edges
+
+
+ def add_query_params(url, params):
+ # Parse the original URL
+ url_components = urlsplit(url)
+
+ # Parse original query parameters and update with new params
+ original_params = dict([tuple(pair.split('=')) for pair in url_components.query.split('&') if pair])
+ original_params.update(params)
+
+ # Construct the new query string
+ new_query_string = urlencode(original_params)
+
+ # Construct the new URL
+ new_url = urlunsplit((
+ url_components.scheme,
+ url_components.netloc,
+ url_components.path,
+ new_query_string,
+ url_components.fragment
+ ))
+
+ return new_url
+
+
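Rounding out the new module: `get_graph` is a thin wrapper over `PBFHandler` (keyword names taken from `__init__` above), and `add_query_params` preserves existing query parameters while adding new ones.

    from sibi_dst.osmnx_helper.utils import get_graph, add_query_params

    graph, nodes, edges = get_graph(
        ox_files_save_path='gis_data/',
        file_prefix='costa-rica-',
        place='Costa Rica',
        network_type='drive',
        verbose=True,
    )

    add_query_params('https://example.com/map?zoom=12', {'date': '2025-01-15'})
    # -> 'https://example.com/map?zoom=12&date=2025-01-15'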
File without changes
@@ -0,0 +1,78 @@
+ import unittest
+ from unittest.mock import patch, MagicMock
+ import datetime
+ import pandas as pd
+ from sibi_dst.utils import Logger, ParquetSaver
+ from sibi_dst.utils.data_wrapper import DataWrapper
+
+
+ class TestDataWrapper(unittest.TestCase):
+
+ def setUp(self):
+ self.dataclass = MagicMock()
+ self.date_field = "created_at"
+ self.data_path = "/path/to/data"
+ #self.data_path = "s3://your-bucket-name/path/to/data"
+ self.parquet_filename = "data.parquet"
+ self.start_date = "2022-01-01"
+ self.end_date = "2022-12-31"
+ self.filesystem_type = "file"
+ self.filesystem_options = {
+ #"key": "your_aws_access_key",
+ #"secret": "your_aws_secret_key",
+ #"client_kwargs": {"endpoint_url": "https://s3.amazonaws.com"}
+ }
+ self.logger = Logger.default_logger(logger_name="TestLogger")
+
+ def test_initialization(self):
+ wrapper = DataWrapper(
+ dataclass=self.dataclass,
+ date_field=self.date_field,
+ data_path=self.data_path,
+ parquet_filename=self.parquet_filename,
+ start_date=self.start_date,
+ end_date=self.end_date,
+ filesystem_type=self.filesystem_type,
+ filesystem_options=self.filesystem_options,
+ logger=self.logger
+ )
+ self.assertEqual(wrapper.dataclass, self.dataclass)
+ self.assertEqual(wrapper.date_field, self.date_field)
+ self.assertEqual(wrapper.data_path, "/path/to/data/")
+ self.assertEqual(wrapper.parquet_filename, self.parquet_filename)
+ self.assertEqual(wrapper.start_date, datetime.date(2022, 1, 1))
+ self.assertEqual(wrapper.end_date, datetime.date(2022, 12, 31))
+ self.assertEqual(wrapper.filesystem_type, self.filesystem_type)
+ self.assertEqual(wrapper.filesystem_options, self.filesystem_options)
+ self.assertEqual(wrapper.logger, self.logger)
+
+ def test_convert_to_date(self):
+ self.assertEqual(DataWrapper.convert_to_date("2022-01-01"), datetime.date(2022, 1, 1))
+ self.assertEqual(DataWrapper.convert_to_date(datetime.date(2022, 1, 1)), datetime.date(2022, 1, 1))
+ with self.assertRaises(ValueError):
+ DataWrapper.convert_to_date("invalid-date")
+
+ @patch('fsspec.filesystem')
+ def test_is_file_older_than(self, mock_filesystem):
+ mock_fs = mock_filesystem.return_value
+ mock_fs.info.return_value = {'mtime': (datetime.datetime.now() - datetime.timedelta(minutes=1500)).timestamp()}
+
+ wrapper = DataWrapper(
+ dataclass=self.dataclass,
+ date_field=self.date_field,
+ data_path=self.data_path,
+ parquet_filename=self.parquet_filename,
+ start_date=self.start_date,
+ end_date=self.end_date,
+ filesystem_type=self.filesystem_type,
+ filesystem_options=self.filesystem_options,
+ logger=self.logger
+ )
+
+ self.assertTrue(wrapper.is_file_older_than("some/file/path"))
+ mock_fs.info.return_value = {'mtime': (datetime.datetime.now() - datetime.timedelta(minutes=1000)).timestamp()}
+ self.assertFalse(wrapper.is_file_older_than("some/file/path"))
+
+
+ if __name__ == '__main__':
+ unittest.main()
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.27
+ Version: 0.3.28
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -8,6 +8,7 @@ Requires-Python: >=3.11,<4.0
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
  Requires-Dist: chardet (>=5.2.0,<6.0.0)
  Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
@@ -17,10 +18,13 @@ Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
  Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
  Requires-Dist: django (>=5.1.4,<6.0.0)
  Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
+ Requires-Dist: folium (>=0.19.4,<0.20.0)
+ Requires-Dist: geopandas (>=1.0.1,<2.0.0)
  Requires-Dist: httpx (>=0.27.2,<0.28.0)
  Requires-Dist: ipython (>=8.29.0,<9.0.0)
  Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
  Requires-Dist: mysqlclient (>=2.2.6,<3.0.0)
+ Requires-Dist: nltk (>=3.9.1,<4.0.0)
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
  Requires-Dist: paramiko (>=3.5.0,<4.0.0)
@@ -1,6 +1,6 @@
  sibi_dst/__init__.py,sha256=CLHfzrFNqklNx5uMKAPtbZfkbBbVYR5qsiMro0RTfmA,252
- sibi_dst/df_helper/__init__.py,sha256=aiAu7j1SWDiw3RVI4UJmvLcADP34OfrJTCYpdupPGII,234
- sibi_dst/df_helper/_df_helper.py,sha256=vG-Lb9lj8s5cACTvfYp7JhXt1PajttHVhKYzBWR-9Vc,13953
+ sibi_dst/df_helper/__init__.py,sha256=5yzslP6zYYOHsTtAzHnNDXHYjf_T6yW7baxwgtduWqQ,292
+ sibi_dst/df_helper/_df_helper.py,sha256=MttqHot8dlHzo4G522JL-z6LOFWYVXqqz06k-4YcvRM,23447
  sibi_dst/df_helper/_parquet_artifact.py,sha256=nx1wTEyrjARpCCPNwBxYiBROee3CSb6c-u7Cpme_tdk,4978
  sibi_dst/df_helper/_parquet_reader.py,sha256=sbe8DsScNT2h6huNsz8mUxVnUGpJeRzbaONZ3u2sQeQ,1685
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,9 +22,18 @@ sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ML-m_WeTR1_UMgiDR
  sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=Bmhh6VvmBfNfBA2JpuEdsYD_193yJ768Si2TvkY9HmU,4405
  sibi_dst/df_helper/core/__init__.py,sha256=o4zDwgVmaijde3oix0ezb6KLxI5QFy-SGUhFTDVFLT4,569
  sibi_dst/df_helper/core/_defaults.py,sha256=eNpHD2sZxir-2xO0b3_V16ryw8YP_5FfpIKK0HNuiN4,7011
- sibi_dst/df_helper/core/_filter_handler.py,sha256=g9FMcB_koT724ggcWt98jow2XgUnmupK_fNhF95W5bQ,10217
+ sibi_dst/df_helper/core/_filter_handler.py,sha256=t3uLLJX5hWO_dWKCCz8Dwpc9RZ5PMHBIWkHSELCpFXI,11131
  sibi_dst/df_helper/core/_params_config.py,sha256=Og3GYth0GVWpcOYWZWRy7CZ5PDsg63Nmqo-W7TUrA_0,3503
  sibi_dst/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
+ sibi_dst/df_helper/data_cleaner.py,sha256=lkxQoXLvGzXCicFUimnA5nen5qkrO1oxgl_p2Be2o8w,5183
+ sibi_dst/osmnx_helper/__init__.py,sha256=QeAKEeVXZk_qn8o0d3BOoGgv2lzatcI2yBqY3ZqviKI,153
+ sibi_dst/osmnx_helper/base_osm_map.py,sha256=s2OY_XfwjZA3ImJNtCgevGBCbwRVe3dY3QVkTHEulB0,5794
+ sibi_dst/osmnx_helper/basemaps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sibi_dst/osmnx_helper/basemaps/calendar_html.py,sha256=UArt6FDgoCgoRte45Xo3IHqd-RNzW0YgitgZYfOFasY,4031
+ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=QznnBGsUwhl8ZITcVNBrQDm-MXAd0jpJGPuyozKyQg0,8537
+ sibi_dst/osmnx_helper/utils.py,sha256=8sF-wNSL38WzhWS3DceZ1cP8BM11i7D0bI-E4XYD8K4,8449
+ sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sibi_dst/tests/test_data_wrapper_class.py,sha256=Nkup5OFH5Cos2fxPaU7g9IEyINJM0uJ5-rOZ-eNtd20,3275
  sibi_dst/utils/__init__.py,sha256=z51o5sjIo_gTjnDXk5SBniCxWJIrDBMS7df0dTs8VMk,775
  sibi_dst/utils/airflow_manager.py,sha256=-d44EKUZNYJyp4wuNwRvilRQktunArPOB5fZuWdQv10,7526
  sibi_dst/utils/clickhouse_writer.py,sha256=xUhFDOuZt0eZDpVJNuLb7pfTHUV06NCYrNUx_a7qrSM,8580
@@ -38,6 +47,6 @@ sibi_dst/utils/filepath_generator.py,sha256=hjI7gQwfwRToPeuzoUQDayHKQrr4Ivhi4Chl
  sibi_dst/utils/log_utils.py,sha256=4eLmoV8VC7wDwPr1mRfDKP24_-laGO6ogE4U0u3DUuA,2315
  sibi_dst/utils/parquet_saver.py,sha256=hLrWr1G132y94eLopDPPGQGDsAiR1lQ8id4QQtGYPE4,4349
  sibi_dst/utils/storage_manager.py,sha256=7nkfeBW_2xlF59pGj7V2aY5TLwpJnPQuPVclqjavJOA,3856
- sibi_dst-0.3.27.dist-info/METADATA,sha256=YFb0ZGbz2m0-aczvItyKK4Iqf1wn6pSVGE41ZUQ6YI8,2265
- sibi_dst-0.3.27.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
- sibi_dst-0.3.27.dist-info/RECORD,,
+ sibi_dst-0.3.28.dist-info/METADATA,sha256=9xBeLwWalUf7exDK-0NZfnYmUQnOIdV2xa0PYNTd85I,2436
+ sibi_dst-0.3.28.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+ sibi_dst-0.3.28.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 1.9.0
+ Generator: poetry-core 1.9.1
  Root-Is-Purelib: true
  Tag: py3-none-any