sibi-dst 0.3.26 → 0.3.28 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/__init__.py +2 -0
- sibi_dst/df_helper/_df_helper.py +180 -12
- sibi_dst/df_helper/core/_filter_handler.py +37 -0
- sibi_dst/df_helper/core/_params_config.py +2 -2
- sibi_dst/df_helper/data_cleaner.py +132 -0
- sibi_dst/osmnx_helper/__init__.py +9 -0
- sibi_dst/osmnx_helper/base_osm_map.py +165 -0
- sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- sibi_dst/osmnx_helper/basemaps/calendar_html.py +122 -0
- sibi_dst/osmnx_helper/basemaps/router_plotter.py +186 -0
- sibi_dst/osmnx_helper/utils.py +267 -0
- sibi_dst/tests/__init__.py +0 -0
- sibi_dst/tests/test_data_wrapper_class.py +78 -0
- sibi_dst/utils/clickhouse_writer.py +1 -1
- sibi_dst/utils/data_utils.py +1 -1
- sibi_dst/utils/data_wrapper.py +0 -26
- {sibi_dst-0.3.26.dist-info → sibi_dst-0.3.28.dist-info}/METADATA +6 -1
- {sibi_dst-0.3.26.dist-info → sibi_dst-0.3.28.dist-info}/RECORD +19 -10
- {sibi_dst-0.3.26.dist-info → sibi_dst-0.3.28.dist-info}/WHEEL +1 -1
sibi_dst/df_helper/__init__.py
CHANGED
@@ -3,9 +3,11 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
+#from .data_cleaner import DataCleaner
 
 __all__ = [
     'DfHelper',
     'ParquetArtifact',
     'ParquetReader',
+    #'DataCleaner'
 ]
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -6,6 +6,7 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional
 
 import dask.dataframe as dd
+from dask import delayed, compute
 import pandas as pd
 from pydantic import BaseModel
 
@@ -29,6 +30,38 @@ warnings.filterwarnings(
 
 
 class DfHelper:
+    """
+    DfHelper is a utility class for managing, loading, and processing data from
+    various backends, such as Django databases, Parquet files, HTTP sources, and
+    SQLAlchemy-based databases. The class abstracts the complexities of handling
+    different backends and provides a unified interface for data operations.
+
+    The class is particularly useful for projects that require flexibility in
+    data source configuration and seamless integration with both Dask and Pandas
+    for handling data frames. It includes robust mechanisms for post-processing
+    data, filtering columns, renaming, and setting indices.
+
+    :ivar df: The DataFrame currently being processed or loaded.
+    :type df: Union[dd.DataFrame, pd.DataFrame]
+    :ivar backend_django: Configuration for interacting with Django database backends.
+    :type backend_django: Optional[DjangoConnectionConfig]
+    :ivar _backend_query: Internal configuration for query handling.
+    :type _backend_query: Optional[QueryConfig]
+    :ivar _backend_params: Internal parameters configuration for DataFrame handling.
+    :type _backend_params: Optional[ParamsConfig]
+    :ivar backend_parquet: Configuration for Parquet file handling.
+    :type backend_parquet: Optional[ParquetConfig]
+    :ivar backend_http: Configuration for interacting with HTTP-based backends.
+    :type backend_http: Optional[HttpConfig]
+    :ivar backend_sqlalchemy: Configuration for interacting with SQLAlchemy-based databases.
+    :type backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig]
+    :ivar parquet_filename: The filename for a Parquet file, if applicable.
+    :type parquet_filename: str
+    :ivar logger: Logger instance used for debugging and information logging.
+    :type logger: Logger
+    :ivar default_config: Default configuration dictionary that can be overridden.
+    :type default_config: Dict
+    """
     df: Union[dd.DataFrame, pd.DataFrame] = None
     backend_django: Optional[DjangoConnectionConfig] = None
     _backend_query: Optional[QueryConfig] = None
@@ -60,7 +93,20 @@ class DfHelper:
     def __str__(self):
         return self.__class__.__name__
 
+    def __call__(self, **options):
+        return self.load(**options)
+
     def __post_init(self, **kwargs):
+        """
+        Initializes backend-specific configurations based on the provided backend type and other
+        parameters. This method performs configuration setup dependent on the selected backend,
+        such as 'django_db', 'parquet', 'http', or 'sqlalchemy'. Configuration for each backend
+        type is fetched or instantiated as necessary using provided parameters or default values.
+
+        :param kwargs: Dictionary of arguments passed during initialization of backend configurations.
+            Additional parameters for specific backend types are extracted here.
+        :return: None
+        """
         self.logger.debug(f"backend used: {self.backend}")
         self._backend_query = self.__get_config(QueryConfig, kwargs)
         self._backend_params = self.__get_config(ParamsConfig, kwargs)
@@ -88,7 +134,35 @@ class DfHelper:
         model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
         return model(**model_kwargs)
 
+    def load_parallel(self, **options):
+        """
+        Executes the `load` method in parallel using Dask, allowing multiple instances
+        to run concurrently. This function leverages Dask's `delayed` and `compute`
+        methods to schedule and process tasks in parallel. It is designed to handle
+        concurrent workloads efficiently by utilizing up to 4 parallel executions of
+        the `load` function.
+
+        :param options: Keyword arguments to be passed to the `load` method. These options
+            will be applied to all parallel instances of the `load` method.
+        :return: A list of results, where each element represents the output
+            from one of the parallel executions of the `load` method.
+        """
+        # Define tasks using Dask's delayed
+        tasks = [delayed(self.load)(**options) for _ in range(4)]
+        results = compute(*tasks)
+        return results
+
     def load(self, **options):
+        """
+        Loads data from a dataframe backend, ensuring compatibility with multiple
+        data processing backends. Provides the data in a pandas dataframe format
+        if the `as_pandas` attribute is set to True.
+
+        :param options: Arbitrary keyword arguments for dataframe loading customization.
+        :type options: dict
+        :return: The loaded dataframe, computed as a pandas dataframe if
+            `as_pandas` is set to True, or kept in its native backend format otherwise.
+        """
         # this will be the universal method to load data from a df irrespective of the backend
         df = self.__load(**options)
         if self.as_pandas:
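For orientation, a minimal usage sketch of the new `__call__` and `load_parallel` additions follows. The constructor keywords shown here (`backend`, `as_pandas`) and the filter keyword are assumptions inferred from the docstrings in this diff, not a confirmed DfHelper signature.

from sibi_dst.df_helper import DfHelper

helper = DfHelper(backend='parquet', as_pandas=True)     # assumed constructor kwargs
df = helper(status__exact='active')                      # __call__ now forwards to load()
results = helper.load_parallel(status__exact='active')   # four delayed load() calls computed together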
@@ -96,7 +170,23 @@ class DfHelper:
         return df
 
     def __load(self, **options):
-
+        """
+        Private method responsible for loading data using a specified backend. This method
+        abstracts away the details of interacting with the backend and dynamically calls the
+        appropriate function depending on the backend type. It supports multiple backend
+        types, such as `django_db`, `sqlalchemy`, `parquet`, and `http`. If the `http` backend
+        is selected, it checks whether the asyncio event loop is running and either runs the
+        process as a new asyncio task or synchronously.
+
+        :param options: Arbitrary keyword arguments provided for backend-specific configurations.
+            These should align with the requirements of the chosen backend.
+        :type options: dict
+
+        :return: The data loaded from the specified backend. The return type is dependent on
+            the particular backend being used.
+        :rtype: Depending on backend implementation; could be `Task`, `List`, `Dict`, or
+            another format defined by the backend.
+        """
         if self.backend == 'django_db':
             self._backend_params.parse_params(options)
             return self.__load_from_db(**options)
@@ -167,8 +257,13 @@ class DfHelper:
 
     def __post_process_df(self):
         """
-
-
+        Processes a DataFrame according to the provided parameters defined within the
+        `self._backend_params.df_params` dictionary. This involves filtering columns,
+        renaming columns, setting an index column, and handling datetime indexing.
+        The method modifies the DataFrame in place.
+
+        :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
+            or if the specified `index_col` is not found in the DataFrame.
         """
         df_params = self._backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
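A hedged sketch of the post-processing parameters the docstring above describes; the `df_params` keys follow the docstring (`fieldnames`, `column_names`, `index_col`), the values are invented, and whether `df_params` is supplied at construction (as shown) or per `load()` call is an assumption.

helper = DfHelper(
    backend='parquet',
    df_params={
        'fieldnames': ('id', 'created_at', 'total'),                  # columns to keep
        'column_names': ['order_id', 'created_at', 'order_total'],    # rename targets, same length
        'index_col': 'order_id',                                      # must exist after renaming
    },
)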
@@ -205,6 +300,21 @@ class DfHelper:
         self.logger.debug("Post-processing of DataFrame completed.")
 
     def __process_loaded_data(self):
+        """
+        Processes the dataframe by applying renaming logic based on the given field map
+        configuration. Inspects the dataframe for missing columns referenced in the field
+        map and flags them with a warning. Applies renaming only for columns that exist
+        in the dataframe while ensuring that no operations take place if the dataframe
+        is empty.
+
+        :param self: The instance of the class where the dataframe is being processed.
+        :type self: object with attributes `df`, `_backend_params`, and `logger`.
+
+        :raises Warning: Logs a warning if specified columns in the `field_map` are not
+            present in the dataframe.
+
+        :return: None
+        """
         self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self._backend_params.field_map or {}
@@ -239,20 +349,54 @@ class DfHelper:
         self.logger.debug("Save to ClickHouse completed.")
 
     def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-
-
-
-
-
-
-
-
-
+        try:
+            self.df = self.backend_parquet.load_files()
+            if options and self.df is not None:
+                """
+                deprecated specific filter handling to a generic one
+                self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+                """
+                self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
+            return self.df
+        except Exception as e:
+            self.logger.debug(f"Failed to load data from parquet: {e}")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
     def load_period(self, **kwargs):
         return self.__load_period(**kwargs)
 
     def __load_period(self, **kwargs):
+        """
+        Validates and processes the temporal filtering parameters `start` and `end` for querying,
+        ensuring correctness and compatibility with a specified backend (Django or SQLAlchemy).
+        This method dynamically maps and validates the provided datetime or date field from the
+        model according to the configured backend, and applies the appropriate filters to query objects.
+
+        This function enforces that both `start` and `end` are provided and checks if the start date
+        is earlier or the same as the end date. It supports parsing string representations of dates
+        and validates them against the date or datetime fields associated with the chosen backend.
+        If the backend or field is incompatible or missing, appropriate errors are raised.
+
+        The resulting filter conditions are integrated into `kwargs` for querying with the
+        appropriate backend model.
+
+        :param kwargs: Keyword arguments, including temporal filtering parameters and optionally a
+            datetime or date field name. Supported parameters include:
+            - **dt_field**: The name of the date or datetime field to use in filtering. Defaults
+              to an internally set field if not explicitly provided.
+            - **start**: The starting date or datetime for the query range. Can be a `str` or
+              `datetime.date/datetime.datetime` object.
+            - **end**: The ending date or datetime for the query range. Can be a `str` or
+              `datetime.date/datetime.datetime` object.
+
+        :return: Queryset or result of the load function with the applied temporal filters.
+        :rtype: Any
+
+        :raises ValueError: If the `dt_field` is not provided, if `start` or `end`
+            are missing, if the `start` date is later than `end`, or if the `dt_field`
+            does not exist in the backend model or its metadata.
+        """
         dt_field = kwargs.pop("dt_field", self.dt_field)
         if dt_field is None:
             raise ValueError("dt_field must be provided")
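A short, hedged sketch of the period loading described above; the field name and date range are illustrative only.

df = helper.load_period(dt_field='created_at', start='2025-01-01', end='2025-01-31')
# Passing start later than end, or omitting either bound, raises ValueError per the docstring.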
@@ -316,6 +460,30 @@ class DfHelper:
 
     @staticmethod
     def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
+        """
+        Parses a date string and converts it to a `datetime.datetime` or
+        `datetime.date` object.
+
+        This method attempts to parse the given string in two distinct formats:
+        1. First, it tries to interpret the string as a datetime with the format
+           ``%Y-%m-%d %H:%M:%S``. If successful, it returns a `datetime.datetime`
+           object.
+        2. If the first format parsing fails, it attempts to parse the string as
+           a date with the format ``%Y-%m-%d``. If successful, it returns a
+           `datetime.date` object.
+
+        If the string cannot be parsed in either of these formats, the method will
+        raise a `ValueError`.
+
+        :param date_str: The date string to be parsed. Expected to match one of the
+            formats: ``%Y-%m-%d %H:%M:%S`` or ``%Y-%m-%d``.
+        :type date_str: str
+        :return: A `datetime.datetime` object if the string matches the first format,
+            or a `datetime.date` object if the string matches the second format.
+        :rtype: Union[datetime.datetime, datetime.date]
+        :raises ValueError: Raised if neither date format can be successfully parsed
+            from the provided string.
+        """
         try:
             return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
         except ValueError:
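Per the docstring above, the two accepted formats behave as follows (illustrative values):

DfHelper.parse_date('2025-01-31 13:45:00')   # -> datetime.datetime(2025, 1, 31, 13, 45)
DfHelper.parse_date('2025-01-31')            # -> datetime.date(2025, 1, 31), per the docstring
DfHelper.parse_date('31/01/2025')            # -> raises ValueError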
sibi_dst/df_helper/core/_filter_handler.py
CHANGED
@@ -9,6 +9,22 @@ from sibi_dst.utils import Logger
 
 
 class FilterHandler:
+    """
+    Handles the application of filters to data sources with support for SQLAlchemy and Dask backends.
+
+    The FilterHandler class abstracts the process of applying filters to various backends, specifically
+    SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
+    exact matches, comparisons, and string-related operations such as contains and regex. The handler
+    automatically determines and applies backend-specific processing, enabling seamless integration with
+    different data models or backends.
+
+    :ivar backend: The backend in use ('sqlalchemy' or 'dask').
+    :type backend: str
+    :ivar logger: An optional logger instance for debugging and logging purposes.
+    :type logger: Logger
+    :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
+    :type backend_methods: dict
+    """
     def __init__(self, backend, logger=None):
         """
         Initialize the FilterHandler.
@@ -185,6 +201,15 @@ class FilterHandler:
             "startswith": lambda col, val: col.like(f"{val}%"),
             "endswith": lambda col, val: col.like(f"%{val}"),
             "isnull": lambda col, val: col.is_(None) if val else col.isnot(None),
+            "not_exact": lambda col, val: col != val,
+            "not_contains": lambda col, val: ~col.like(f"%{val}%"),
+            "not_in": lambda col, val: ~col.in_(val),  # Custom operation
+            "regex": lambda col, val: col.op("~")(val),  # Custom operation
+            "icontains": lambda col, val: col.ilike(f"%{val}%"),  # Custom operation
+            "istartswith": lambda col, val: col.ilike(f"{val}%"),  # Custom operation
+            "iendswith": lambda col, val: col.ilike(f"%{val}"),  # Custom operation
+            "iexact": lambda col, val: col.ilike(val),  # Added iexact
+            "iregex": lambda col, val: col.op("~*")(val),  # Added iregex
         }
 
     @staticmethod
@@ -201,6 +226,15 @@ class FilterHandler:
             "startswith": lambda col, val: col.str.startswith(val),
             "endswith": lambda col, val: col.str.endswith(val),
             "isnull": lambda col, val: col.isnull() if val else col.notnull(),
+            "not_exact": lambda col, val: col != val,
+            "not_contains": lambda col, val: ~col.str.contains(val, regex=True),
+            "not_in": lambda col, val: ~col.isin(val),  # Custom operation
+            "regex": lambda col, val: col.str.contains(val, regex=True),  # Custom operation
+            "icontains": lambda col, val: col.str.contains(val, case=False, regex=True),  # Custom operation
+            "istartswith": lambda col, val: col.str.startswith(val, case=False),  # Custom operation
+            "iendswith": lambda col, val: col.str.endswith(val, case=False),  # Custom operation
+            "iexact": lambda col, val: col.str.contains(f"^{val}$", case=False, regex=True),  # Added iexact
+            "iregex": lambda col, val: col.str.contains(val, case=False, regex=True),  # Added iregex
         }
 
     @staticmethod
@@ -216,4 +250,7 @@ class FilterHandler:
         return [
             "gte", "lte", "gt", "lt", "exact", "in", "range",
            "contains", "startswith", "endswith", "isnull",
+            "not_exact", "not_contains", "not_in",
+            "regex", "icontains", "istartswith", "iendswith",
+            "iexact", "iregex"
         ]
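A hedged sketch of the new negated and case-insensitive operations applied through the Dask backend. The `field__operation` key style mirrors the Django-style lookups used elsewhere in the package (LOOKUP_SEP = "__"), and the import path is an assumption; only the `apply_filters(df, filters=...)` call shape is taken from the diff above.

import pandas as pd
import dask.dataframe as dd
from sibi_dst.df_helper.core import FilterHandler   # import path assumed

ddf = dd.from_pandas(
    pd.DataFrame({'name': ['Alice', 'alicia', 'Bob'], 'status': ['open', 'closed', 'open']}),
    npartitions=1,
)
handler = FilterHandler(backend='dask')
result = handler.apply_filters(ddf, filters={'name__icontains': 'ali', 'status__not_in': ['closed']})
print(result.compute())   # rows whose name contains 'ali' (any case) and whose status is not 'closed'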
sibi_dst/df_helper/core/_params_config.py
CHANGED
@@ -29,8 +29,8 @@ LOOKUP_SEP = "__"
 class ParamsConfig(BaseModel):
     field_map: Optional[Dict] = Field(default_factory=dict)
     legacy_filters: bool = False
-    sticky_filters: Dict[str, Union[str, bool, int, float]] = Field(default_factory=dict)
-    filters: Dict[str, Union[str, Dict, bool, int, float]] = Field(default_factory=dict)
+    sticky_filters: Dict[str, Union[str, bool, int, float, list, tuple]] = Field(default_factory=dict)
+    filters: Dict[str, Union[str, Dict, bool, int, float, list, tuple]] = Field(default_factory=dict)
     df_params: Dict[str, Union[tuple, str, bool, None]] = Field(default_factory=dict)
     df_options: Dict[str, Union[bool, str, None]] = Field(default_factory=dict)
     params: Dict[str, Union[str, bool, int, float, List[Union[str, int, bool, float]]]] = Field(default_factory=dict)
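The widened Union types mean list and tuple filter values (for example `__in` or `__range` lookups) now pass Pydantic validation instead of being rejected. A minimal sketch, with invented field names and an assumed import path:

from sibi_dst.df_helper.core import ParamsConfig   # import path assumed

params = ParamsConfig(filters={'status__in': ['open', 'pending'], 'id__range': (100, 200)})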
sibi_dst/df_helper/data_cleaner.py
ADDED
@@ -0,0 +1,132 @@
+import re
+from nltk.corpus import stopwords
+from nltk.stem import SnowballStemmer
+import dask.dataframe as dd
+from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
+import nltk
+
+class DataCleaner:
+    def __init__(self, dataframe):
+        self.original_df = dataframe
+        self.df = dataframe.copy()
+        self.duplicates_df = None
+
+    def handle_missing_values(self, strategy='mean'):
+        if strategy == 'mean':
+            self.df = self.df.fillna(self.df.mean())
+        elif strategy == 'median':
+            self.df = self.df.fillna(self.df.median())
+        elif strategy == 'mode':
+            self.df = self.df.fillna(self.df.mode().iloc[0])
+        elif strategy == 'drop':
+            self.df = self.df.dropna()
+        return self
+
+    def identify_duplicates(self, subset=None):
+        self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
+        return self.duplicates_df
+
+    def remove_duplicates(self):
+        if self.duplicates_df is not None:
+            self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
+        return self
+
+    def validate_date_fields(self, date_columns=None):
+        if date_columns is None:
+            date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
+        for col in date_columns:
+            print('Validating date field: ', col)
+            self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
+        return self
+
+    def clean_text(self, text_columns=None, language='english'):
+        nltk.download('stopwords')
+        stop_words = set(stopwords.words(language))
+        stemmer = SnowballStemmer(language)
+
+        def clean_text(text):
+            if isinstance(text, str):
+                text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
+                text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
+                words = text.split()
+                words = [word for word in words if word not in stop_words]  # Remove stop words
+                words = [stemmer.stem(word) for word in words]  # Apply stemming
+                return ' '.join(words)
+            return text
+
+        if text_columns is None:
+            text_columns = self.df.select_dtypes(include=['object', 'string']).columns
+            text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']
+
+        for col in text_columns:
+            print('Cleaning text field: ', col)
+            self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
+        return self
+
+    def validate_numeric_fields(self, int_columns=None, float_columns=None):
+        if int_columns is None:
+            int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
+        if float_columns is None:
+            float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns
+
+        for col in int_columns:
+            print('Validating integer field: ', col)
+            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')
+
+        for col in float_columns:
+            print('Validating float field: ', col)
+            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')
+
+        return self
+
+    def detect_categorical_columns(self, threshold=0.05):
+        """
+        Detect columns that can be converted to 'category' dtype.
+
+        Parameters:
+        threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.
+
+        Returns:
+        List of column names that can be converted to 'category' dtype.
+        """
+        categorical_columns = []
+
+        def unique_ratio(partition, col):
+            return partition[col].nunique() / len(partition)
+
+        for col in self.df.columns:
+            print("Detecting categorical columns: ", col)
+            unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
+            overall_unique_ratio = unique_ratios.sum() / len(self.df)
+            if overall_unique_ratio < threshold:
+                print(f'Column {col} is categorical')
+                categorical_columns.append(col)
+
+        return categorical_columns
+
+    def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
+        if columns is None:
+            columns = self.detect_categorical_columns(threshold)
+
+        if method == 'onehot':
+            for col in columns:
+                self.df[col] = self.df[col].astype('category')
+            encoder = OneHotEncoder(sparse_output=False)
+            self.df = encoder.fit_transform(self.df)
+        elif method == 'label':
+            encoder = LabelEncoder()
+            for col in columns:
+                self.df[col] = encoder.fit_transform(self.df[col])
+        return self
+
+    def analyze_dtypes(self):
+        return self.df.dtypes
+
+    def get_cleaned_dataframe(self):
+        return self.df
+
+    def get_original_dataframe(self):
+        return self.original_df
+
+    def get_duplicates_dataframe(self):
+        return self.duplicates_df
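A hedged usage sketch of the new DataCleaner: each cleaning step returns self, so calls chain. The re-export from `sibi_dst.df_helper` is still commented out in the `__init__.py` hunk above, so the direct module import is shown; the sample data is illustrative.

import pandas as pd
import dask.dataframe as dd
from sibi_dst.df_helper.data_cleaner import DataCleaner

ddf = dd.from_pandas(
    pd.DataFrame({'qty': [1.0, None, 3.0], 'note': ['Hello, World!', 'Great Service', None]}),
    npartitions=1,
)
cleaned = (
    DataCleaner(ddf)
    .handle_missing_values(strategy='drop')   # drop rows with missing values
    .clean_text(text_columns=['note'])        # lowercase, strip punctuation, stopwords, stemming
    .get_cleaned_dataframe()
)
print(cleaned.compute())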
sibi_dst/osmnx_helper/base_osm_map.py
ADDED
@@ -0,0 +1,165 @@
+from __future__ import annotations
+
+import html
+from abc import abstractmethod
+
+import folium
+import geopandas as gpd
+import numpy as np
+import osmnx as ox
+from folium.plugins import Fullscreen
+
+
+class BaseOsmMap:
+    tile_options = {
+        "OpenStreetMap": "OpenStreetMap",
+        "CartoDB": "cartodbpositron",
+        "CartoDB Voyager": "cartodbvoyager"
+    }
+    # Set default bounds for Costa Rica
+    bounds = [[8.0340, -85.9417], [11.2192, -82.5566]]
+
+    def __init__(self, osmnx_graph=None, df=None, **kwargs):
+        if osmnx_graph is None:
+            raise ValueError('osmnx_graph must be provided')
+        if df is None:
+            raise ValueError('df must be provided')
+        if df.empty:
+            raise ValueError('df must not be empty')
+        self.df = df.copy()
+        self.osmnx_graph = osmnx_graph
+        self.lat_col = kwargs.get('lat_col', 'latitude')
+        self.lon_col = kwargs.get('lon_col', 'longitude')
+        self.osm_map = None
+        self.G = None
+        self.map_html_title = self._sanitize_html(kwargs.get('map_html_title', 'OSM Basemap'))
+
+        self.zoom_start = kwargs.pop('zoom_start', 13)
+        self.fullscreen = kwargs.pop('fullscreen', True)
+        self.fullscreen_position = kwargs.pop('fullscreen_position', 'topright')
+        self.tiles = kwargs.pop('tiles', 'OpenStreetMap')
+        self.verbose = kwargs.pop('verbose', False)
+        self.sort_keys = kwargs.pop('sort_keys', None)
+        self.dt_field = kwargs.pop('dt_field', None)
+        self.dt = None
+        self.calc_nearest_nodes = kwargs.pop('calc_nearest_nodes', False)
+        self.nearest_nodes = None
+        self.max_bounds = kwargs.pop('max_bounds', False)
+        self._prepare_df()
+        self._initialise_map()
+
+
+    def _prepare_df(self):
+        if self.sort_keys:
+            self.df.sort_values(by=self.sort_keys, inplace=True)
+        self.df.reset_index(drop=True, inplace=True)
+        self.gps_points = self.df[[self.lat_col, self.lon_col]].values.tolist()
+        if self.dt_field is not None:
+            self.dt = self.df[self.dt_field].tolist()
+
+        if self.calc_nearest_nodes:
+            self.nearest_nodes = ox.distance.nearest_nodes(self.osmnx_graph, X=self.df[self.lon_col],
+                                                           Y=self.df[self.lat_col])
+
+
+    def _initialise_map(self):
+        gps_array = np.array(self.gps_points)
+        mean_latitude = np.mean(gps_array[:, 0])
+        mean_longitude = np.mean(gps_array[:, 1])
+        self.osm_map = folium.Map(location=[mean_latitude, mean_longitude], zoom_start=self.zoom_start,
+                                  tiles=self.tiles, max_bounds=self.max_bounds)
+        north, south, east, west = self._get_bounding_box_from_points(margin=0.001)
+        self.G = self._extract_subgraph(north, south, east, west)
+
+
+    def _attach_supported_tiles(self):
+        # Normalize the default tile name to lowercase for comparison
+        normalized_default_tile = self.tiles.lower()
+
+        # Filter out the default tile layer from the options to avoid duplication
+        tile_options_filtered = {k: v for k, v in self.tile_options.items() if v.lower() != normalized_default_tile}
+
+        for tile, description in tile_options_filtered.items():
+            folium.TileLayer(name=tile, tiles=description, show=False).add_to(self.osm_map)
+
+
+    def _get_bounding_box_from_points(self, margin=0.001):
+        latitudes = [point[0] for point in self.gps_points]
+        longitudes = [point[1] for point in self.gps_points]
+
+        north = max(latitudes) + margin
+        south = min(latitudes) - margin
+        east = max(longitudes) + margin
+        west = min(longitudes) - margin
+
+        return north, south, east, west
+
+
+    def _extract_subgraph(self, north, south, east, west):
+        # Create a bounding box polygon
+        # from osmnx v2 this is how it is done
+        if ox.__version__ >= '2.0':
+            bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(bbox=(west, south, east, north))])
+        else:
+            bbox_poly = gpd.GeoSeries([ox.utils_geo.bbox_to_poly(north, south, east, west)])
+
+        # Get nodes GeoDataFrame
+        nodes_gdf = ox.graph_to_gdfs(self.osmnx_graph, nodes=True, edges=False)
+
+        # Find nodes within the bounding box
+        nodes_within_bbox = nodes_gdf[nodes_gdf.geometry.within(bbox_poly.geometry.unary_union)]
+
+        # Create subgraph
+        subgraph = self.osmnx_graph.subgraph(nodes_within_bbox.index)
+
+        return subgraph
+
+
+    @abstractmethod
+    def process_map(self):
+        # this is to be implemented at the subclass level
+        # implement here your specific map logic.
+        ...
+
+
+    def pre_process_map(self):
+        # this is to be implemented at the subclass level
+        # call super().pre_process_map first to inherit the following behaviour
+        ...
+
+
+    def _post_process_map(self):
+        self._attach_supported_tiles()
+        self.add_tile_layer()
+        self._add_fullscreen()
+        self._add_map_title()
+        if self.max_bounds:
+            self.osm_map.fit_bounds(self.bounds)
+
+
+    def add_tile_layer(self):
+        # Override in subclass and call super().add_tile_layer at the end
+        folium.LayerControl().add_to(self.osm_map)
+
+
+    def _add_fullscreen(self):
+        if self.fullscreen:
+            Fullscreen(position=self.fullscreen_position).add_to(self.osm_map)
+
+
+    def _add_map_title(self):
+        if self.map_html_title:
+            self.osm_map.get_root().html.add_child(folium.Element(self.map_html_title))
+
+
+    @staticmethod
+    def _sanitize_html(input_html):
+        return html.escape(input_html)
+
+
+    def generate_map(self):
+        self.pre_process_map()
+        self.process_map()
+        self._post_process_map()
+
+        return self.osm_map
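A hedged sketch of how BaseOsmMap is meant to be extended: a subclass implements process_map and callers use generate_map. Whether BaseOsmMap is re-exported from `sibi_dst.osmnx_helper` is assumed from the updated `__init__.py` listed in the summary, and the place name and coordinates are illustrative only.

import folium
import osmnx as ox
import pandas as pd
from sibi_dst.osmnx_helper import BaseOsmMap   # re-export assumed

class PointMap(BaseOsmMap):
    def process_map(self):
        # Subclass hook: draw one marker per GPS point prepared by the base class.
        for lat, lon in self.gps_points:
            folium.CircleMarker(location=[lat, lon], radius=3).add_to(self.osm_map)

graph = ox.graph_from_place('San José, Costa Rica', network_type='drive')
points = pd.DataFrame({'latitude': [9.9281, 9.9333], 'longitude': [-84.0907, -84.0833]})
osm_map = PointMap(osmnx_graph=graph, df=points, zoom_start=14).generate_map()
osm_map.save('example_map.html')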