sibi-dst 0.3.27__tar.gz → 0.3.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/PKG-INFO +5 -1
  2. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/pyproject.toml +4 -1
  3. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/__init__.py +2 -0
  4. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/_df_helper.py +180 -12
  5. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/core/_filter_handler.py +16 -0
  6. sibi_dst-0.3.28/sibi_dst/df_helper/data_cleaner.py +132 -0
  7. sibi_dst-0.3.28/sibi_dst/osmnx_helper/__init__.py +9 -0
  8. sibi_dst-0.3.28/sibi_dst/osmnx_helper/base_osm_map.py +165 -0
  9. sibi_dst-0.3.28/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  10. sibi_dst-0.3.28/sibi_dst/osmnx_helper/basemaps/calendar_html.py +122 -0
  11. sibi_dst-0.3.28/sibi_dst/osmnx_helper/basemaps/router_plotter.py +186 -0
  12. sibi_dst-0.3.28/sibi_dst/osmnx_helper/utils.py +267 -0
  13. sibi_dst-0.3.28/sibi_dst/tests/__init__.py +0 -0
  14. sibi_dst-0.3.28/sibi_dst/tests/test_data_wrapper_class.py +78 -0
  15. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/README.md +0 -0
  16. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/__init__.py +0 -0
  17. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  18. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  19. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/__init__.py +0 -0
  20. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  21. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/django/_db_connection.py +0 -0
  22. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/django/_io_dask.py +0 -0
  23. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/django/_load_from_db.py +0 -0
  24. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -0
  25. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  26. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  27. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  28. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -0
  29. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -0
  30. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  31. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  32. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  33. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  34. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  35. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  36. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/core/__init__.py +0 -0
  37. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/core/_defaults.py +0 -0
  38. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/core/_params_config.py +0 -0
  39. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/df_helper/core/_query_config.py +0 -0
  40. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/__init__.py +0 -0
  41. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/airflow_manager.py +0 -0
  42. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/clickhouse_writer.py +0 -0
  43. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/credentials.py +0 -0
  44. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/data_utils.py +0 -0
  45. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/data_wrapper.py +0 -0
  46. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/date_utils.py +0 -0
  47. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/df_utils.py +0 -0
  48. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/file_utils.py +0 -0
  49. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/filepath_generator.py +0 -0
  50. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/log_utils.py +0 -0
  51. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/parquet_saver.py +0 -0
  52. {sibi_dst-0.3.27 → sibi_dst-0.3.28}/sibi_dst/utils/storage_manager.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 0.3.27
+ Version: 0.3.28
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
@@ -8,6 +8,7 @@ Requires-Python: >=3.11,<4.0
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3.11
  Classifier: Programming Language :: Python :: 3.12
+ Classifier: Programming Language :: Python :: 3.13
  Requires-Dist: apache-airflow-client (>=2.10.0,<3.0.0)
  Requires-Dist: chardet (>=5.2.0,<6.0.0)
  Requires-Dist: charset-normalizer (>=3.4.0,<4.0.0)
@@ -17,10 +18,13 @@ Requires-Dist: dask-expr (>=1.1.20,<2.0.0)
  Requires-Dist: dask[complete] (>=2024.11.1,<2025.0.0)
  Requires-Dist: django (>=5.1.4,<6.0.0)
  Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
+ Requires-Dist: folium (>=0.19.4,<0.20.0)
+ Requires-Dist: geopandas (>=1.0.1,<2.0.0)
  Requires-Dist: httpx (>=0.27.2,<0.28.0)
  Requires-Dist: ipython (>=8.29.0,<9.0.0)
  Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
  Requires-Dist: mysqlclient (>=2.2.6,<3.0.0)
+ Requires-Dist: nltk (>=3.9.1,<4.0.0)
  Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
  Requires-Dist: pandas (>=2.2.3,<3.0.0)
  Requires-Dist: paramiko (>=3.5.0,<4.0.0)
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sibi-dst"
- version = "0.3.27"
+ version = "0.3.28"
  description = "Data Science Toolkit"
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
  readme = "README.md"
@@ -37,6 +37,9 @@ psycopg2 = "^2.9.10"
  uvicorn = "^0.34.0"
  pytest-mock = "^3.14.0"
  s3fs = "^2024.12.0"
+ nltk = "^3.9.1"
+ folium = "^0.19.4"
+ geopandas = "^1.0.1"


  [build-system]
@@ -3,9 +3,11 @@ from __future__ import annotations
  from ._df_helper import DfHelper
  from ._parquet_artifact import ParquetArtifact
  from ._parquet_reader import ParquetReader
+ #from .data_cleaner import DataCleaner

  __all__ = [
      'DfHelper',
      'ParquetArtifact',
      'ParquetReader',
+     #'DataCleaner'
  ]
@@ -6,6 +6,7 @@ from typing import Any, Dict, TypeVar
  from typing import Union, Optional

  import dask.dataframe as dd
+ from dask import delayed, compute
  import pandas as pd
  from pydantic import BaseModel

@@ -29,6 +30,38 @@ warnings.filterwarnings(


  class DfHelper:
+     """
+     DfHelper is a utility class for managing, loading, and processing data from
+     various backends, such as Django databases, Parquet files, HTTP sources, and
+     SQLAlchemy-based databases. The class abstracts the complexities of handling
+     different backends and provides a unified interface for data operations.
+
+     The class is particularly useful for projects that require flexibility in
+     data source configuration and seamless integration with both Dask and Pandas
+     for handling data frames. It includes robust mechanisms for post-processing
+     data, filtering columns, renaming, and setting indices.
+
+     :ivar df: The DataFrame currently being processed or loaded.
+     :type df: Union[dd.DataFrame, pd.DataFrame]
+     :ivar backend_django: Configuration for interacting with Django database backends.
+     :type backend_django: Optional[DjangoConnectionConfig]
+     :ivar _backend_query: Internal configuration for query handling.
+     :type _backend_query: Optional[QueryConfig]
+     :ivar _backend_params: Internal parameters configuration for DataFrame handling.
+     :type _backend_params: Optional[ParamsConfig]
+     :ivar backend_parquet: Configuration for Parquet file handling.
+     :type backend_parquet: Optional[ParquetConfig]
+     :ivar backend_http: Configuration for interacting with HTTP-based backends.
+     :type backend_http: Optional[HttpConfig]
+     :ivar backend_sqlalchemy: Configuration for interacting with SQLAlchemy-based databases.
+     :type backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig]
+     :ivar parquet_filename: The filename for a Parquet file, if applicable.
+     :type parquet_filename: str
+     :ivar logger: Logger instance used for debugging and information logging.
+     :type logger: Logger
+     :ivar default_config: Default configuration dictionary that can be overridden.
+     :type default_config: Dict
+     """
      df: Union[dd.DataFrame, pd.DataFrame] = None
      backend_django: Optional[DjangoConnectionConfig] = None
      _backend_query: Optional[QueryConfig] = None
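Note: the docstring above describes the unified interface, but no call site appears in this diff. The sketch below is a hypothetical usage example: the backend labels, the `as_pandas` flag, and the `load()`/`__call__` behaviour come from code shown in this file, while the exact constructor arguments and filter keyword are assumptions and may differ from the real signature.

    # Hypothetical usage sketch of DfHelper; constructor arguments are assumed.
    from sibi_dst.df_helper import DfHelper

    helper = DfHelper(
        backend="sqlalchemy",   # one of 'django_db', 'sqlalchemy', 'parquet', 'http'
        as_pandas=True,         # compute the Dask result into a pandas DataFrame
    )

    df = helper.load(status="active")   # helper(...) works too, since __call__ delegates to load()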
@@ -60,7 +93,20 @@ class DfHelper:
      def __str__(self):
          return self.__class__.__name__

+     def __call__(self, **options):
+         return self.load(**options)
+
      def __post_init(self, **kwargs):
+         """
+         Initializes backend-specific configurations based on the provided backend type and other
+         parameters. This method performs configuration setup dependent on the selected backend,
+         such as 'django_db', 'parquet', 'http', or 'sqlalchemy'. Configuration for each backend
+         type is fetched or instantiated as necessary using provided parameters or default values.
+
+         :param kwargs: Dictionary of arguments passed during initialization of backend configurations.
+             Additional parameters for specific backend types are extracted here.
+         :return: None
+         """
          self.logger.debug(f"backend used: {self.backend}")
          self._backend_query = self.__get_config(QueryConfig, kwargs)
          self._backend_params = self.__get_config(ParamsConfig, kwargs)
@@ -88,7 +134,35 @@ class DfHelper:
          model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
          return model(**model_kwargs)

+     def load_parallel(self, **options):
+         """
+         Executes the `load` method in parallel using Dask, allowing multiple instances
+         to run concurrently. This function leverages Dask's `delayed` and `compute`
+         methods to schedule and process tasks in parallel. It is designed to handle
+         concurrent workloads efficiently by utilizing up to 4 parallel executions of
+         the `load` function.
+
+         :param options: Keyword arguments to be passed to the `load` method. These options
+             will be applied to all parallel instances of the `load` method.
+         :return: A list of results, where each element represents the output
+             from one of the parallel executions of the `load` method.
+         """
+         # Define tasks using Dask's delayed
+         tasks = [delayed(self.load)(**options) for _ in range(4)]
+         results = compute(*tasks)
+         return results
+
      def load(self, **options):
+         """
+         Loads data from a dataframe backend, ensuring compatibility with multiple
+         data processing backends. Provides the data in a pandas dataframe format
+         if the `as_pandas` attribute is set to True.
+
+         :param options: Arbitrary keyword arguments for dataframe loading customization.
+         :type options: dict
+         :return: The loaded dataframe, computed as a pandas dataframe if
+             `as_pandas` is set to True, or kept in its native backend format otherwise.
+         """
          # this will be the universal method to load data from a df irrespective of the backend
          df = self.__load(**options)
          if self.as_pandas:
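Note: for readers unfamiliar with the `delayed`/`compute` pattern that `load_parallel` relies on, the standalone sketch below reproduces the same four-task fan-out with a plain function standing in for `self.load`. It uses only Dask's public API and is not code from the package.

    from dask import delayed, compute

    @delayed
    def load_chunk(chunk_id: int) -> list[int]:
        # Stand-in for an expensive load; each call becomes one lazy task.
        return list(range(chunk_id * 10, chunk_id * 10 + 10))

    # Build four lazy tasks, then run them concurrently with a single compute() call,
    # which is the scheduling pattern load_parallel applies to delayed(self.load).
    tasks = [load_chunk(i) for i in range(4)]
    results = compute(*tasks)   # returns a tuple with one result per task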
@@ -96,7 +170,23 @@ class DfHelper:
          return df

      def __load(self, **options):
-
+         """
+         Private method responsible for loading data using a specified backend. This method
+         abstracts away the details of interacting with the backend and dynamically calls the
+         appropriate function depending on the backend type. It supports multiple backend
+         types, such as `django_db`, `sqlalchemy`, `parquet`, and `http`. If the `http` backend
+         is selected, it checks whether the asyncio event loop is running and either runs the
+         process as a new asyncio task or synchronously.
+
+         :param options: Arbitrary keyword arguments provided for backend-specific configurations.
+             These should align with the requirements of the chosen backend.
+         :type options: dict
+
+         :return: The data loaded from the specified backend. The return type is dependent on
+             the particular backend being used.
+         :rtype: Depending on backend implementation; could be `Task`, `List`, `Dict`, or
+             another format defined by the backend.
+         """
          if self.backend == 'django_db':
              self._backend_params.parse_params(options)
              return self.__load_from_db(**options)
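Note: the docstring above says the `http` backend checks whether an asyncio event loop is already running before deciding how to execute the request. The snippet below is one common way to implement that kind of dispatch; it is illustrative and not the package's actual code.

    import asyncio

    async def fetch_remote():
        # Stand-in for the HTTP backend's asynchronous fetch.
        await asyncio.sleep(0)
        return {"status": "ok"}

    def dispatch_http_load():
        # Inside a running loop (e.g. Jupyter), schedule a Task and return it;
        # otherwise run the coroutine to completion synchronously.
        try:
            asyncio.get_running_loop()
        except RuntimeError:
            return asyncio.run(fetch_remote())
        return asyncio.create_task(fetch_remote())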
@@ -167,8 +257,13 @@ class DfHelper:

      def __post_process_df(self):
          """
-         Efficiently process the DataFrame by filtering, renaming, and setting indices.
-         Optimized for large datasets with Dask compatibility.
+         Processes a DataFrame according to the provided parameters defined within the
+         `self._backend_params.df_params` dictionary. This involves filtering columns,
+         renaming columns, setting an index column, and handling datetime indexing.
+         The method modifies the DataFrame in place.
+
+         :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
+             or if the specified `index_col` is not found in the DataFrame.
          """
          df_params = self._backend_params.df_params
          fieldnames = df_params.get("fieldnames", None)
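Note: to make the parameters referenced above concrete, a hypothetical `df_params` payload is shown below. The key names come from the docstring and the surrounding code (`fieldnames`, `column_names`, `index_col`); the values are invented for illustration.

    df_params = {
        "fieldnames": ["id", "created_dt", "total"],                # columns to keep from the source
        "column_names": ["order_id", "created_dt", "order_total"],  # new names, same length as fieldnames
        "index_col": "order_id",                                    # column to set as the index
    }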
@@ -205,6 +300,21 @@ class DfHelper:
          self.logger.debug("Post-processing of DataFrame completed.")

      def __process_loaded_data(self):
+         """
+         Processes the dataframe by applying renaming logic based on the given field map
+         configuration. Inspects the dataframe for missing columns referenced in the field
+         map and flags them with a warning. Applies renaming only for columns that exist
+         in the dataframe while ensuring that no operations take place if the dataframe
+         is empty.
+
+         :param self: The instance of the class where the dataframe is being processed.
+         :type self: object with attributes `df`, `_backend_params`, and `logger`.
+
+         :raises Warning: Logs a warning if specified columns in the `field_map` are not
+             present in the dataframe.
+
+         :return: None
+         """
          self.logger.debug(f"Type of self.df: {type(self.df)}")
          if self.df.map_partitions(len).compute().sum() > 0:
              field_map = self._backend_params.field_map or {}
@@ -239,20 +349,54 @@ class DfHelper:
          self.logger.debug("Save to ClickHouse completed.")

      def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-         self.df = self.backend_parquet.load_files()
-         if options:
-             """
-             deprecated specific filter handling to a generic one
-             self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
-
-             """
-             self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
-         return self.df
+         try:
+             self.df = self.backend_parquet.load_files()
+             if options and self.df is not None:
+                 """
+                 deprecated specific filter handling to a generic one
+                 self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+                 """
+                 self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
+             return self.df
+         except Exception as e:
+             self.logger.debug(f"Failed to load data from parquet: {e}")
+             return dd.from_pandas(pd.DataFrame(), npartitions=1)

      def load_period(self, **kwargs):
          return self.__load_period(**kwargs)

      def __load_period(self, **kwargs):
+         """
+         Validates and processes the temporal filtering parameters `start` and `end` for querying,
+         ensuring correctness and compatibility with a specified backend (Django or SQLAlchemy).
+         This method dynamically maps and validates the provided datetime or date field from the
+         model according to the configured backend, and applies the appropriate filters to query objects.
+
+         This function enforces that both `start` and `end` are provided and checks if the start date
+         is earlier or the same as the end date. It supports parsing string representations of dates
+         and validates them against the date or datetime fields associated with the chosen backend.
+         If the backend or field is incompatible or missing, appropriate errors are raised.
+
+         The resulting filter conditions are integrated into `kwargs` for querying with the
+         appropriate backend model.
+
+         :param kwargs: Keyword arguments, including temporal filtering parameters and optionally a
+             datetime or date field name. Supported parameters include:
+             - **dt_field**: The name of the date or datetime field to use in filtering. Defaults
+               to an internally set field if not explicitly provided.
+             - **start**: The starting date or datetime for the query range. Can be a `str` or
+               `datetime.date/datetime.datetime` object.
+             - **end**: The ending date or datetime for the query range. Can be a `str` or
+               `datetime.date/datetime.datetime` object.
+
+         :return: Queryset or result of the load function with the applied temporal filters.
+         :rtype: Any
+
+         :raises ValueError: If the `dt_field` is not provided, if `start` or `end`
+             are missing, if the `start` date is later than `end`, or if the `dt_field`
+             does not exist in the backend model or its metadata.
+         """
          dt_field = kwargs.pop("dt_field", self.dt_field)
          if dt_field is None:
              raise ValueError("dt_field must be provided")
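Note: a hedged example of the period load described above; the field name is a placeholder and `helper` follows the earlier usage sketch.

    df = helper.load_period(
        dt_field="created_at",       # date or datetime field on the backend model
        start="2025-01-01",
        end="2025-01-31 23:59:59",   # %Y-%m-%d or %Y-%m-%d %H:%M:%S (see parse_date below)
    )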
@@ -316,6 +460,30 @@ class DfHelper:

      @staticmethod
      def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
+         """
+         Parses a date string and converts it to a `datetime.datetime` or
+         `datetime.date` object.
+
+         This method attempts to parse the given string in two distinct formats:
+         1. First, it tries to interpret the string as a datetime with the format
+            ``%Y-%m-%d %H:%M:%S``. If successful, it returns a `datetime.datetime`
+            object.
+         2. If the first format parsing fails, it attempts to parse the string as
+            a date with the format ``%Y-%m-%d``. If successful, it returns a
+            `datetime.date` object.
+
+         If the string cannot be parsed in either of these formats, the method will
+         raise a `ValueError`.
+
+         :param date_str: The date string to be parsed. Expected to match one of the
+             formats: ``%Y-%m-%d %H:%M:%S`` or ``%Y-%m-%d``.
+         :type date_str: str
+         :return: A `datetime.datetime` object if the string matches the first format,
+             or a `datetime.date` object if the string matches the second format.
+         :rtype: Union[datetime.datetime, datetime.date]
+         :raises ValueError: Raised if neither date format can be successfully parsed
+             from the provided string.
+         """
          try:
              return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
          except ValueError:
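Note: according to the docstring, the two accepted formats behave as follows (illustrative calls, not taken from the package's tests):

    DfHelper.parse_date("2025-01-31 23:59:59")   # -> datetime.datetime(2025, 1, 31, 23, 59, 59)
    DfHelper.parse_date("2025-01-31")            # -> a datetime.date, per the docstring
    DfHelper.parse_date("31/01/2025")            # -> raises ValueError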
@@ -9,6 +9,22 @@ from sibi_dst.utils import Logger


  class FilterHandler:
+     """
+     Handles the application of filters to data sources with support for SQLAlchemy and Dask backends.
+
+     The FilterHandler class abstracts the process of applying filters to various backends, specifically
+     SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
+     exact matches, comparisons, and string-related operations such as contains and regex. The handler
+     automatically determines and applies backend-specific processing, enabling seamless integration with
+     different data models or backends.
+
+     :ivar backend: The backend in use ('sqlalchemy' or 'dask').
+     :type backend: str
+     :ivar logger: An optional logger instance for debugging and logging purposes.
+     :type logger: Logger
+     :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
+     :type backend_methods: dict
+     """
      def __init__(self, backend, logger=None):
          """
          Initialize the FilterHandler.
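Note: the `__load_from_parquet` hunk earlier in this diff shows how FilterHandler is invoked against a Dask DataFrame. The standalone sketch below repeats that call pattern; the `FilterHandler(backend=...)` and `apply_filters(df, filters=...)` calls are taken from the diff, while the filter-key spelling is an assumption.

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.df_helper.core._filter_handler import FilterHandler  # module path from the file list

    ddf = dd.from_pandas(
        pd.DataFrame({"status": ["open", "closed"], "total": [10, 25]}),
        npartitions=1,
    )

    handler = FilterHandler(backend="dask")
    filtered = handler.apply_filters(ddf, filters={"status": "open"})  # plain key assumed to mean exact match
    print(filtered.compute())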
@@ -0,0 +1,132 @@
+ import re
+ from nltk.corpus import stopwords
+ from nltk.stem import SnowballStemmer
+ import dask.dataframe as dd
+ from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
+ import nltk
+
+ class DataCleaner:
+     def __init__(self, dataframe):
+         self.original_df = dataframe
+         self.df = dataframe.copy()
+         self.duplicates_df = None
+
+     def handle_missing_values(self, strategy='mean'):
+         if strategy == 'mean':
+             self.df = self.df.fillna(self.df.mean())
+         elif strategy == 'median':
+             self.df = self.df.fillna(self.df.median())
+         elif strategy == 'mode':
+             self.df = self.df.fillna(self.df.mode().iloc[0])
+         elif strategy == 'drop':
+             self.df = self.df.dropna()
+         return self
+
+     def identify_duplicates(self, subset=None):
+         self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
+         return self.duplicates_df
+
+     def remove_duplicates(self):
+         if self.duplicates_df is not None:
+             self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
+         return self
+
+     def validate_date_fields(self, date_columns=None):
+         if date_columns is None:
+             date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
+         for col in date_columns:
+             print('Validating date field: ', col)
+             self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
+         return self
+
+     def clean_text(self, text_columns=None, language='english'):
+         nltk.download('stopwords')
+         stop_words = set(stopwords.words(language))
+         stemmer = SnowballStemmer(language)
+
+         def clean_text(text):
+             if isinstance(text, str):
+                 text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
+                 text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
+                 words = text.split()
+                 words = [word for word in words if word not in stop_words]  # Remove stop words
+                 words = [stemmer.stem(word) for word in words]  # Apply stemming
+                 return ' '.join(words)
+             return text
+
+         if text_columns is None:
+             text_columns = self.df.select_dtypes(include=['object', 'string']).columns
+             text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']
+
+         for col in text_columns:
+             print('Cleaning text field: ', col)
+             self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
+         return self
+
+     def validate_numeric_fields(self, int_columns=None, float_columns=None):
+         if int_columns is None:
+             int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
+         if float_columns is None:
+             float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns
+
+         for col in int_columns:
+             print('Validating integer field: ', col)
+             self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')
+
+         for col in float_columns:
+             print('Validating float field: ', col)
+             self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')
+
+         return self
+
+     def detect_categorical_columns(self, threshold=0.05):
+         """
+         Detect columns that can be converted to 'category' dtype.
+
+         Parameters:
+             threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.
+
+         Returns:
+             List of column names that can be converted to 'category' dtype.
+         """
+         categorical_columns = []
+
+         def unique_ratio(partition, col):
+             return partition[col].nunique() / len(partition)
+
+         for col in self.df.columns:
+             print("Detecting categorical columns: ", col)
+             unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
+             overall_unique_ratio = unique_ratios.sum() / len(self.df)
+             if overall_unique_ratio < threshold:
+                 print(f'Column {col} is categorical')
+                 categorical_columns.append(col)
+
+         return categorical_columns
+
+     def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
+         if columns is None:
+             columns = self.detect_categorical_columns(threshold)
+
+         if method == 'onehot':
+             for col in columns:
+                 self.df[col] = self.df[col].astype('category')
+             encoder = OneHotEncoder(sparse_output=False)
+             self.df = encoder.fit_transform(self.df)
+         elif method == 'label':
+             encoder = LabelEncoder()
+             for col in columns:
+                 self.df[col] = encoder.fit_transform(self.df[col])
+         return self
+
+     def analyze_dtypes(self):
+         return self.df.dtypes
+
+     def get_cleaned_dataframe(self):
+         return self.df
+
+     def get_original_dataframe(self):
+         return self.original_df
+
+     def get_duplicates_dataframe(self):
+         return self.duplicates_df
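Note: `DataCleaner` ships in this release even though its import remains commented out in `df_helper/__init__.py`, so it has to be imported from its module path directly. The usage sketch below exercises the fluent interface on a small Dask frame; the module also imports `nltk` and `dask_ml` at import time, so both must be installed even though `dask_ml` is not in the declared dependencies.

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.df_helper.data_cleaner import DataCleaner

    ddf = dd.from_pandas(
        pd.DataFrame({"customer_id": [1, 1, 2, 3], "amount": [10.0, 10.0, None, 7.5]}),
        npartitions=1,
    )

    cleaner = DataCleaner(ddf)
    cleaner.handle_missing_values(strategy="mean")                  # fill the missing amount with the column mean
    cleaner.identify_duplicates(subset=["customer_id", "amount"])   # flags both copies of the duplicated row
    cleaned = cleaner.remove_duplicates().get_cleaned_dataframe()
    print(cleaned.compute())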
@@ -0,0 +1,9 @@
+ from __future__ import annotations
+
+ from .base_osm_map import BaseOsmMap
+ from .utils import PBFHandler
+
+ __all__ = [
+     "BaseOsmMap",
+     "PBFHandler",
+ ]
@@ -0,0 +1,165 @@
+ from __future__ import annotations
+
+ import html
+ from abc import abstractmethod
+
+ import folium
+ import geopandas as gpd
+ import numpy as np
+ import osmnx as ox
+ from folium.plugins import Fullscreen
+
+
+ class BaseOsmMap:
+     tile_options = {
+         "OpenStreetMap": "OpenStreetMap",
+         "CartoDB": "cartodbpositron",
+         "CartoDB Voyager": "cartodbvoyager"
+     }
+     # Set default bounds for Costa Rica
+     bounds = [[8.0340, -85.9417], [11.2192, -82.5566]]
+
+     def __init__(self, osmnx_graph=None, df=None, **kwargs):
+         if osmnx_graph is None:
+             raise ValueError('osmnx_graph must be provided')
+         if df is None:
+             raise ValueError('df must be provided')
+         if df.empty:
+             raise ValueError('df must not be empty')
+         self.df = df.copy()
+         self.osmnx_graph = osmnx_graph
+         self.lat_col = kwargs.get('lat_col', 'latitude')
+         self.lon_col = kwargs.get('lon_col', 'longitude')
+         self.osm_map = None
+         self.G = None
+         self.map_html_title = self._sanitize_html(kwargs.get('map_html_title', 'OSM Basemap'))
+
+         self.zoom_start = kwargs.pop('zoom_start', 13)
+         self.fullscreen = kwargs.pop('fullscreen', True)
+         self.fullscreen_position = kwargs.pop('fullscreen_position', 'topright')
+         self.tiles = kwargs.pop('tiles', 'OpenStreetMap')
+         self.verbose = kwargs.pop('verbose', False)
+         self.sort_keys = kwargs.pop('sort_keys', None)
+         self.dt_field = kwargs.pop('dt_field', None)
+         self.dt = None
+         self.calc_nearest_nodes = kwargs.pop('calc_nearest_nodes', False)
+         self.nearest_nodes = None
+         self.max_bounds = kwargs.pop('max_bounds', False)
+         self._prepare_df()
+         self._initialise_map()
+
+
+     def _prepare_df(self):
+         if self.sort_keys:
+             self.df.sort_values(by=self.sort_keys, inplace=True)
+         self.df.reset_index(drop=True, inplace=True)
+         self.gps_points = self.df[[self.lat_col, self.lon_col]].values.tolist()
+         if self.dt_field is not None:
+             self.dt = self.df[self.dt_field].tolist()
+
+         if self.calc_nearest_nodes:
+             self.nearest_nodes = ox.distance.nearest_nodes(self.osmnx_graph, X=self.df[self.lon_col],
+                                                            Y=self.df[self.lat_col])
+
+
+     def _initialise_map(self):
+         gps_array = np.array(self.gps_points)
+         mean_latitude = np.mean(gps_array[:, 0])
+         mean_longitude = np.mean(gps_array[:, 1])
+         self.osm_map = folium.Map(location=[mean_latitude, mean_longitude], zoom_start=self.zoom_start,
+                                   tiles=self.tiles, max_bounds=self.max_bounds)
+         north, south, east, west = self._get_bounding_box_from_points(margin=0.001)
+         self.G = self._extract_subgraph(north, south, east, west)
+
+
+     def _attach_supported_tiles(self):
+         # Normalize the default tile name to lowercase for comparison
+         normalized_default_tile = self.tiles.lower()
+
+         # Filter out the default tile layer from the options to avoid duplication
+         tile_options_filtered = {k: v for k, v in self.tile_options.items() if v.lower() != normalized_default_tile}
+
+         for tile, description in tile_options_filtered.items():
+             folium.TileLayer(name=tile, tiles=description, show=False).add_to(self.osm_map)
+
+
+     def _get_bounding_box_from_points(self, margin=0.001):
+         latitudes = [point[0] for point in self.gps_points]
+         longitudes = [point[1] for point in self.gps_points]
+
+         north = max(latitudes) + margin
+         south = min(latitudes) - margin
+         east = max(longitudes) + margin
+         west = min(longitudes) - margin
+
+         return north, south, east, west
+
+
+     def _extract_subgraph(self, north, south, east, west):
+         # Create a bounding box polygon
+         # from osmnx v2 this is how it is done
+         if ox.__version__ >= '2.0':
+             bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(bbox=(west, south, east, north))])
+         else:
+             bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(north, south, east, west)])
+
+         # Get nodes GeoDataFrame
+         nodes_gdf = ox.graph_to_gdfs(self.osmnx_graph, nodes=True, edges=False)
+
+         # Find nodes within the bounding box
+         nodes_within_bbox = nodes_gdf[nodes_gdf.geometry.within(bbox_poly.geometry.unary_union)]
+
+         # Create subgraph
+         subgraph = self.osmnx_graph.subgraph(nodes_within_bbox.index)
+
+         return subgraph
+
+
+     @abstractmethod
+     def process_map(self):
+         # this is to be implemented at the subclass level
+         # implement here your specific map logic.
+         ...
+
+
+     def pre_process_map(self):
+         # this is to be implemented at the subclass level
+         # call super().pre_process_map first to inherit the following behaviour
+         ...
+
+
+     def _post_process_map(self):
+         self._attach_supported_tiles()
+         self.add_tile_layer()
+         self._add_fullscreen()
+         self._add_map_title()
+         if self.max_bounds:
+             self.osm_map.fit_bounds(self.bounds)
+
+
+     def add_tile_layer(self):
+         # Override in subclass and call super().add_tile_layer at the end
+         folium.LayerControl().add_to(self.osm_map)
+
+
+     def _add_fullscreen(self):
+         if self.fullscreen:
+             Fullscreen(position=self.fullscreen_position).add_to(self.osm_map)
+
+
+     def _add_map_title(self):
+         if self.map_html_title:
+             self.osm_map.get_root().html.add_child(folium.Element(self.map_html_title))
+
+
+     @staticmethod
+     def _sanitize_html(input_html):
+         return html.escape(input_html)
+
+
+     def generate_map(self):
+         self.pre_process_map()
+         self.process_map()
+         self._post_process_map()
+
+         return self.osm_map
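Note: `BaseOsmMap` leaves `process_map` abstract, so concrete maps are built by subclassing it (`router_plotter.py` in the file list is presumably such a subclass). The minimal hypothetical subclass below drops one marker per GPS point; the marker logic is illustrative and not taken from the package.

    import folium
    from sibi_dst.osmnx_helper import BaseOsmMap

    class PointsMap(BaseOsmMap):
        """Hypothetical subclass that adds one marker per GPS point."""

        def process_map(self):
            # self.gps_points is populated by _prepare_df() from lat_col/lon_col.
            for lat, lon in self.gps_points:
                folium.Marker(location=[lat, lon]).add_to(self.osm_map)

    # Usage sketch: `graph` is an osmnx graph covering the area, `df` has latitude/longitude columns.
    # osm_map = PointsMap(osmnx_graph=graph, df=df, zoom_start=12).generate_map()
    # osm_map.save("points_map.html")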