sibi-dst 0.3.31__py3-none-any.whl → 0.3.33__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_parquet_artifact.py +68 -0
- sibi_dst/df_helper/_parquet_reader.py +45 -1
- sibi_dst/df_helper/backends/django/_db_connection.py +41 -1
- sibi_dst/df_helper/backends/django/_io_dask.py +211 -3
- sibi_dst/df_helper/backends/django/_load_from_db.py +96 -1
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +132 -6
- sibi_dst/df_helper/backends/http/_http_config.py +52 -1
- sibi_dst/df_helper/backends/parquet/_filter_handler.py +28 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +105 -1
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +17 -0
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +80 -2
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +90 -29
- sibi_dst/df_helper/core/_params_config.py +59 -0
- sibi_dst/geopy_helper/geo_location_service.py +14 -0
- sibi_dst/geopy_helper/utils.py +37 -3
- sibi_dst/osmnx_helper/base_osm_map.py +254 -0
- sibi_dst/osmnx_helper/utils.py +226 -4
- sibi_dst/utils/clickhouse_writer.py +27 -0
- sibi_dst/utils/data_utils.py +32 -1
- sibi_dst/utils/data_wrapper.py +94 -6
- sibi_dst/utils/date_utils.py +35 -0
- sibi_dst/utils/log_utils.py +19 -2
- sibi_dst/utils/parquet_saver.py +1 -0
- sibi_dst/utils/storage_manager.py +4 -1
- {sibi_dst-0.3.31.dist-info → sibi_dst-0.3.33.dist-info}/METADATA +3 -1
- {sibi_dst-0.3.31.dist-info → sibi_dst-0.3.33.dist-info}/RECORD +27 -27
- {sibi_dst-0.3.31.dist-info → sibi_dst-0.3.33.dist-info}/WHEEL +0 -0
sibi_dst/osmnx_helper/utils.py
CHANGED
@@ -23,6 +23,37 @@ from geopy.distance import geodesic
 
 
 class PBFHandler:
+    """
+    Handles the creation, management, and visualization of graph data derived
+    from .pbf (Protocolbuffer Binary Format) files. This class enables the
+    loading, processing, saving, and reutilization of graph, node, and edge
+    data for geographical regions, supporting verbose mode for detailed outputs.
+
+    :ivar graph: The generated graph object representing the spatial network; can be None if not yet loaded or processed.
+    :type graph: Optional[NetworkX.Graph]
+    :ivar nodes: GeoDataFrame representing the nodes of the graph; can be None if not yet loaded or processed.
+    :type nodes: Optional[geopandas.GeoDataFrame]
+    :ivar edges: GeoDataFrame representing the edges of the graph; can be None if not yet loaded or processed.
+    :type edges: Optional[geopandas.GeoDataFrame]
+    :ivar rebuild: Indicates whether to rebuild the graph data, ignoring any existing cached files. Default is ``False``.
+    :type rebuild: bool
+    :ivar verbose: Enables verbose mode to provide detailed status messages during operations. Default is ``False``.
+    :type verbose: bool
+    :ivar place: The name of the geographical region to process with OpenStreetMap. Default is ``Costa Rica``.
+    :type place: str
+    :ivar filepath: The path to the directory where the graph, nodes, and edges pickle files are saved. Default is ``gis_data/``.
+    :type filepath: str
+    :ivar file_prefix: The prefix for the filenames of the saved graph, node, and edge pickle files. Default is ``costa-rica-``.
+    :type file_prefix: str
+    :ivar network_type: The type of network to extract from OpenStreetMap, such as "all" or other specific network types. Default is ``all``.
+    :type network_type: str
+    :ivar graph_file: Full path of the file to save or load the graph data as a pickle file.
+    :type graph_file: str
+    :ivar node_file: Full path of the file to save or load the graph's node data as a pickle file.
+    :type node_file: str
+    :ivar edge_file: Full path of the file to save or load the graph's edge data as a pickle file.
+    :type edge_file: str
+    """
     def __init__(self, **kwargs):
         self.graph = None
         self.nodes = None
@@ -38,6 +69,23 @@ class PBFHandler:
         self.edge_file = f"{self.filepath}{self.file_prefix}edges.pkl"
 
     def load(self):
+        """
+        Loads the required data files for processing. If the files do not exist or
+        if the `rebuild` flag is set to True, it will process and recreate the
+        necessary data from the source. Otherwise, it will load the data from
+        existing pickle files. This function ensures the target directory exists,
+        and processes files conditionally based on their presence.
+
+        :param verbose: Flag to control the verbosity of the function's output.
+        :param rebuild: Indicates whether the data should be rebuilt from the raw
+            source files.
+        :param graph_file: Path to the graph file to be loaded or rebuilt.
+        :param node_file: Path to the node file to be loaded or rebuilt.
+        :param edge_file: Path to the edge file to be loaded or rebuilt.
+        :param filepath: Path to the directory where files are processed and saved.
+
+        :return: None
+        """
         if self.verbose:
             print("Loading data...")
 
@@ -62,7 +110,31 @@ class PBFHandler:
 
     def process_pbf(self):
         """
-
+        Processes the Protocolbuffer Binary Format (PBF) data specified for a given place by
+        utilizing the OSMnx library to create a graph representation and extracts nodes and
+        edges into GeoDataFrames. The function provides verbose output if enabled.
+
+        :param self: Refers to the current instance of the class containing this method.
+
+        :param self.verbose: bool
+            A flag to control verbose output. If True, detailed processing status messages are
+            logged to the console.
+
+        :param self.place: str
+            The name or description of the geographic place for which PBF data is processed. It
+            is used to construct a graph representation of the place.
+
+        :param self.network_type: str
+            The type of network graph to be created, typically one of 'all', 'walk', 'drive',
+            etc., reflecting the type of paths or streets included in the graph.
+
+        :return: None
+            This function does not return a value, but updates class attributes ``graph``,
+            ``nodes``, and ``edges``.
+
+        :raises Exception:
+            Raises a general exception when there is an error in processing the PBF data. Error
+            details are printed when verbose output is enabled.
         """
         try:
             if self.verbose:
@@ -79,7 +151,20 @@ class PBFHandler:
 
     def save_to_pickle(self):
         """
-
+        Saves data, including graph, nodes, and edges, to pickle files. Each data object is
+        saved to its corresponding file if available. If verbose mode is enabled, prints
+        messages indicating the saving progress and success.
+
+        :param self:
+            Represents the instance of the class that contains attributes `graph_file`,
+            `graph`, `node_file`, `nodes`, `edge_file`, `edges`, and `verbose`. These
+            attributes determine the files to save to and the data to save.
+
+        :raises Exception:
+            Raises an exception if an error occurs during the saving process.
+
+        :return:
+            None
         """
         try:
             if self.verbose:
@@ -104,7 +189,13 @@ class PBFHandler:
 
     def load_from_pickle(self):
         """
-
+        Loads data from pickle files specified by the attributes `graph_file`, `node_file`,
+        and `edge_file` and assigns them to the corresponding attributes `graph`,
+        `nodes`, and `edges`, respectively. Displays verbose messages during the load
+        process if the `verbose` attribute is set to True.
+
+        :raises Exception: If an error occurs during reading or deserialization of the
+            pickle files.
         """
         try:
             if self.verbose:
@@ -128,7 +219,13 @@ class PBFHandler:
 
     def plot_graph(self):
         """
-
+        Plots the loaded graph using the OSMnx library.
+
+        This method checks if a graph is loaded and, if available, plots it. Outputs
+        verbose messages during the process if verbosity is enabled.
+
+        :raises Exception: Raises if an error occurs during the plotting process.
+        :return: None
         """
         try:
             if self.graph is not None:
@@ -145,6 +242,23 @@ class PBFHandler:
 
 
 def get_bounding_box_from_points(gps_points, margin=0.001):
+    """
+    Calculates a bounding box from a list of GPS points, with an optional margin added
+    to expand the bounding box in all directions. The function iterates over the GPS
+    points to determine the maximum and minimum latitude and longitude values, then
+    applies the specified margin to calculate the bounding box's boundaries.
+
+    :param gps_points: A list of GPS points, where each point is represented as a tuple
+        containing a latitude and a longitude (latitude, longitude).
+    :type gps_points: list[tuple[float, float]]
+    :param margin: An optional margin value to expand the bounding box in all directions.
+        Default value is 0.001.
+    :type margin: float
+    :return: A tuple containing the bounding box boundaries in the following order:
+        north (maximum latitude), south (minimum latitude), east (maximum longitude),
+        and west (minimum longitude), each adjusted with the margin.
+    :rtype: tuple[float, float, float, float]
+    """
     latitudes = [point[0] for point in gps_points]
     longitudes = [point[1] for point in gps_points]
 
@@ -157,6 +271,28 @@ def get_bounding_box_from_points(gps_points, margin=0.001):
 
 
 def add_arrows(map_object, locations, color, n_arrows):
+    """
+    Adds directional arrows to a map object to indicate paths or flows along a polyline
+    defined by the given locations.
+
+    The function computes directional arrows based on the locations list, places them
+    along the defined path at intervals determined by the number of arrows, and adds
+    these arrows to the specified `map_object`.
+
+    .. note::
+        The function works optimally when the number of locations is greater than two.
+
+    :param map_object: The folium map object to which the directional arrows will be added.
+    :param locations: A list containing tuples of latitude and longitude values that define
+        the polyline. Each tuple represents a geographic point.
+    :type locations: list[tuple[float, float]]
+    :param color: The color to be used for the directional arrows.
+    :type color: str
+    :param n_arrows: The number of arrows to be drawn along the path.
+    :type n_arrows: int
+    :return: The modified folium map object containing the added arrows.
+    :rtype: folium.Map
+    """
     # Get the number of locations
     n = len(locations)
 
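The add_arrows docstring above describes a folium-based helper; a minimal usage sketch follows. The coordinates are illustrative, and the import path is assumed from this file's location in the package rather than stated by the package docs.

import folium

from sibi_dst.osmnx_helper.utils import add_arrows  # assumed import path

# Illustrative GPS fixes along a short route (lat, lon).
locations = [(9.9281, -84.0907), (9.9305, -84.0862), (9.9330, -84.0811)]

m = folium.Map(location=locations[0], zoom_start=14)
folium.PolyLine(locations, color="blue", weight=3).add_to(m)

# Place three direction markers along the polyline; per the docstring the helper returns the map.
m = add_arrows(m, locations, color="blue", n_arrows=3)
m.save("route_with_arrows.html")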
@@ -179,6 +315,26 @@ def add_arrows(map_object, locations, color, n_arrows):
 
 
 def extract_subgraph(G, north, south, east, west):
+    """
+    Extracts a subgraph from the input graph `G` within a specified bounding box. The bounding
+    box is defined by its north, south, east, and west coordinates. The function identifies
+    nodes from the graph that lie within this bounding box and creates a subgraph containing
+    only these nodes and their corresponding edges.
+
+    :param G: The input graph representing the original main graph.
+    :type G: networkx.Graph
+    :param north: The northern latitude that defines the upper boundary of the bounding box.
+    :type north: float
+    :param south: The southern latitude that defines the lower boundary of the bounding box.
+    :type south: float
+    :param east: The eastern longitude that defines the right boundary of the bounding box.
+    :type east: float
+    :param west: The western longitude that defines the left boundary of the bounding box.
+    :type west: float
+    :return: A subgraph extracted from the input graph `G` containing nodes and edges within
+        the specified bounding box.
+    :rtype: networkx.Graph
+    """
     # Create a bounding box polygon
     # from osmnx v2 this is how it is done
     if ox.__version__ >= '2.0':
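Taken together, get_bounding_box_from_points and extract_subgraph can clip a street network to the area covered by a set of GPS fixes. A minimal sketch, assuming the helpers are importable from sibi_dst.osmnx_helper.utils and using OSMnx's graph_from_place to obtain a graph (a network download is required; the place name and coordinates are illustrative):

import osmnx as ox

from sibi_dst.osmnx_helper.utils import extract_subgraph, get_bounding_box_from_points  # assumed import path

gps_points = [(9.9281, -84.0907), (9.9347, -84.0875), (9.9402, -84.0789)]
north, south, east, west = get_bounding_box_from_points(gps_points, margin=0.005)

# Any OSMnx graph with node x/y attributes will do; this download is just an example.
G = ox.graph_from_place("San José, Costa Rica", network_type="drive")
sub_G = extract_subgraph(G, north, south, east, west)
print(sub_G.number_of_nodes(), "nodes fall inside the bounding box")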
@@ -199,6 +355,26 @@ def extract_subgraph(G, north, south, east, west):
 
 
 def get_distance_between_points(point_a, point_b, unit='km'):
+    """
+    Calculate the geographical distance between two points on Earth.
+
+    This function computes the distance between two points on the Earth's surface
+    specified in their geographical coordinates (latitude, longitude). The calculation
+    employs the geodesic distance, which represents the shortest distance between
+    two points on the Earth's surface. The distance can be returned in different units of
+    measurement depending on the provided parameter.
+
+    :param point_a: A tuple representing the latitude and longitude of the first
+        point in decimal degrees (e.g., (latitude, longitude)). Must be a tuple of
+        two float values.
+    :param point_b: A tuple representing the latitude and longitude of the second
+        point in decimal degrees (e.g., (latitude, longitude)). Must be a tuple of
+        two float values.
+    :param unit: A string value representing the unit of the calculated distance. Can be
+        'km' for kilometers (default), 'm' for meters, or 'mi' for miles.
+    :return: A float value of the distance between the two points in the specified unit.
+        Returns 0 if the input validation fails or the specified unit is invalid.
+    """
     if not isinstance(point_a, tuple) or len(point_a) != 2:
         return 0
     if not all(isinstance(x, float) and not math.isnan(x) for x in point_a):
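A short usage sketch of the documented behaviour (import path assumed; the coordinates are approximate and only illustrative):

from sibi_dst.osmnx_helper.utils import get_distance_between_points  # assumed import path

san_jose = (9.9281, -84.0907)
liberia = (10.6346, -85.4407)

print(get_distance_between_points(san_jose, liberia, unit='km'))  # geodesic distance in kilometers
print(get_distance_between_points(san_jose, liberia, unit='mi'))  # same distance in miles

# Per the docstring, invalid input returns 0 instead of raising.
assert get_distance_between_points((9.9281,), liberia) == 0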
@@ -226,6 +402,20 @@ tile_options = {
 
 
 def attach_supported_tiles(map_object, default_tile="OpenStreetMap"):
+    """
+    Attaches supported tile layers to a given folium map object, excluding the
+    default tile layer, to provide layer selection functionality in the map.
+
+    This function allows dynamic addition of multiple tile layers to the map
+    object while avoiding duplication of the default tile. By filtering out the
+    default tile, it prevents redundancy and ensures a cleaner map interface.
+
+    :param map_object: The folium map object to which the tile layers will be added.
+        It must be an instance of Folium's Map class or a compatible map object.
+    :param default_tile: The name of the default tile layer to exclude from the
+        list of tiles added to the map. If not specified, defaults to 'OpenStreetMap'.
+    :return: None. The function modifies the provided map object in place.
+    """
     # Normalize the default tile name to lowercase for comparison
     normalized_default_tile = default_tile.lower()
 
@@ -237,12 +427,44 @@ def attach_supported_tiles(map_object, default_tile="OpenStreetMap"):
 
 
 def get_graph(**options):
+    """
+    Generates and returns a graph along with its nodes and edges based on the
+    provided options. The function initializes a PBFHandler instance with the
+    given options, processes any data required, and retrieves the resulting
+    graph structure.
+
+    :param options: Variable-length keyword arguments passed to initialize the
+                    PBFHandler instance. These parameters play a role in
+                    determining how the graph data is processed and structured.
+    :return: Returns a tuple containing three elements:
+             - The generated graph object
+             - The list or collection of nodes within the graph
+             - The list or collection of edges that describe relationships
+               between nodes in the graph
+    """
     handler = PBFHandler(**options)
     handler.load()
     return handler.graph, handler.nodes, handler.edges
 
 
 def add_query_params(url, params):
+    """
+    Update the query parameters of a given URL with new parameters.
+
+    This function takes a URL and a dictionary of parameters, merges these
+    parameters with the existing parameters in the URL, and returns a new URL
+    with updated query parameters.
+
+    :param url: The original URL whose query parameters are to be updated,
+        including the scheme, netloc, path, and optional query string and fragment.
+    :type url: str
+    :param params: A dictionary containing the new parameters to be added or updated
+        in the query string of the given URL.
+    :type params: dict
+    :return: A new URL with updated query parameters after merging the original
+        and new parameters.
+    :rtype: str
+    """
     # Parse the original URL
     url_components = urlsplit(url)
 
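The new docstrings describe the module's two convenience entry points. A hedged usage sketch: the keyword names passed to get_graph mirror the PBFHandler attributes documented above, but since __init__ only declares **kwargs, treating them as accepted options is an assumption of this sketch; the URL and its parameters are hypothetical.

from sibi_dst.osmnx_helper.utils import add_query_params, get_graph  # assumed import path

graph, nodes, edges = get_graph(
    place="Costa Rica",          # region to download/process
    filepath="gis_data/",        # where the pickle cache lives
    file_prefix="costa-rica-",
    network_type="drive",
    rebuild=False,               # reuse cached pickles when present
    verbose=True,
)

tile_url = add_query_params(
    "https://tiles.example.com/layer?zoom=12",
    {"apikey": "MY_KEY", "zoom": "14"},  # hypothetical parameters; zoom is overridden
)
print(tile_url)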
sibi_dst/utils/clickhouse_writer.py
CHANGED
@@ -9,6 +9,33 @@ from sibi_dst.utils import Logger
 
 
 class ClickHouseWriter:
+    """
+    Provides functionality to write a Dask DataFrame to a ClickHouse database using
+    a specified schema. This class handles the creation of tables, schema generation,
+    data transformation, and data insertion. It ensures compatibility between Dask
+    data types and ClickHouse types.
+
+    :ivar clickhouse_host: Host address of the ClickHouse database.
+    :type clickhouse_host: str
+    :ivar clickhouse_port: Port of the ClickHouse database.
+    :type clickhouse_port: int
+    :ivar clickhouse_dbname: Name of the database to connect to in ClickHouse.
+    :type clickhouse_dbname: str
+    :ivar clickhouse_user: Username for database authentication.
+    :type clickhouse_user: str
+    :ivar clickhouse_password: Password for database authentication.
+    :type clickhouse_password: str
+    :ivar clickhouse_table: Name of the table to store the data in.
+    :type clickhouse_table: str
+    :ivar logger: Logger instance for logging messages.
+    :type logger: logging.Logger
+    :ivar client: Instance of the ClickHouse database client.
+    :type client: clickhouse_connect.Client or None
+    :ivar df: Dask DataFrame to be written into ClickHouse.
+    :type df: dask.dataframe.DataFrame
+    :ivar order_by: Field or column name to use for table ordering.
+    :type order_by: str
+    """
     dtype_to_clickhouse = {
         'int64': 'Int64',
         'int32': 'Int32',
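The docstring lists the writer's connection attributes, but this diff does not show its constructor or write method, so the following is only a sketch under the assumption that those attributes can be supplied as keyword arguments; the connection values are placeholders.

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.clickhouse_writer import ClickHouseWriter  # assumed import path

ddf = dd.from_pandas(
    pd.DataFrame({"id": [1, 2, 3], "amount": [10.5, 20.0, 7.25]}),
    npartitions=1,
)

# Keyword names follow the documented attributes; the real signature is not shown in this diff.
writer = ClickHouseWriter(
    clickhouse_host="localhost",
    clickhouse_port=8123,
    clickhouse_dbname="analytics",
    clickhouse_user="default",
    clickhouse_password="",
    clickhouse_table="events",
    order_by="id",
)
# The method that performs the actual insert of `ddf` is not part of this diff; consult the module source.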
sibi_dst/utils/data_utils.py
CHANGED
@@ -5,12 +5,43 @@ from sibi_dst.utils import Logger
 
 
 class DataUtils:
-
+    """
+    Utility class for data transformation, manipulation, and merging.
+
+    This class provides functionalities for transforming numeric and boolean columns, merging
+    lookup data, checking DataFrame emptiness, and converting columns to datetime format in
+    Pandas or Dask DataFrames. It is designed to handle data preprocessing steps efficiently
+    for both small-scale and large-scale datasets. Logging and debug options are available
+    to trace execution and monitor operations.
+
+    :ivar logger: Logger instance for logging messages.
+    :type logger: logging.Logger
+    :ivar debug: Flag to enable or disable debug mode.
+    :type debug: bool
+    """
     def __init__(self, logger=None, **kwargs):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
         self.debug = kwargs.get('debug', False)
 
     def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        """
+        This function transforms the specified numeric columns in the given dataframe by converting
+        their data types to the specified dtype, with an optional parameter for replacing missing
+        values. It first checks if the provided columns exist in the dataframe, processes each column
+        to replace non-numeric values with NaN, fills NaN values with the given fill_value, and finally
+        converts the column to the specified dtype.
+
+        :param df: DataFrame to be transformed.
+        :type df: dask.dataframe.DataFrame
+        :param columns: List of column names to be transformed.
+        :type columns: list[str]
+        :param fill_value: Value used to replace missing or invalid data. Default is 0.
+        :type fill_value: int or float
+        :param dtype: Target data type for the columns after transformation. Default is int.
+        :type dtype: type
+        :return: Transformed dataframe with the specified numeric columns converted and modified.
+        :rtype: dask.dataframe.DataFrame
+        """
         if not columns:
             self.logger.warning('No columns specified')
         self.logger.debug(f'Dataframe type:{type(df)}')
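A minimal sketch of the documented transform_numeric_cols behaviour on a small Dask DataFrame (values are illustrative; whether the method also mutates the input in place is not stated, so the return value is reassigned):

import dask.dataframe as dd
import pandas as pd

from sibi_dst.utils.data_utils import DataUtils  # assumed import path

pdf = pd.DataFrame({"qty": ["5", "n/a", None], "price": ["1.5", "2", "bad"]})
ddf = dd.from_pandas(pdf, npartitions=1)

du = DataUtils(debug=True)
# Non-numeric values become NaN, NaN is filled with 0, and the columns are cast to int.
ddf = du.transform_numeric_cols(ddf, columns=["qty", "price"], fill_value=0, dtype=int)
print(ddf.compute())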
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -12,6 +12,62 @@ from sibi_dst.utils import ParquetSaver
 
 
 class DataWrapper:
+    """
+    Utility class for handling file-based operations, including processing and saving data
+    in Parquet format, while managing a hierarchy of conditions such as overwrite, history
+    threshold, and missing file detection.
+
+    This class aims to simplify the process of managing large datasets stored in a filesystem.
+    It allows for controlled updates to data files based on parameters set by the user, with
+    support for different filesystem types and options.
+
+    It also provides features like logging actions, managing processing threads, generating
+    update plans, checking file age, and dynamically creating date ranges for data operations.
+
+    The design supports flexible integration with user-defined classes (dataclasses) to define
+    custom loading and processing behavior.
+
+    :ivar dataclass: The user-defined class for data processing.
+    :type dataclass: Type
+    :ivar date_field: The name of the date field in the user-defined class.
+    :type date_field: str
+    :ivar data_path: Base path for the dataset storage.
+    :type data_path: str
+    :ivar parquet_filename: File name for the Parquet file.
+    :type parquet_filename: str
+    :ivar start_date: Start date for processing.
+    :type start_date: datetime.date
+    :ivar end_date: End date for processing.
+    :type end_date: datetime.date
+    :ivar fs: File system object for managing files.
+    :type fs: Optional[fsspec.AbstractFileSystem]
+    :ivar filesystem_type: Type of the filesystem (e.g., "file", "s3").
+    :type filesystem_type: str
+    :ivar filesystem_options: Additional options for initializing the filesystem.
+    :type filesystem_options: Optional[Dict]
+    :ivar verbose: Flag to enable verbose logging.
+    :type verbose: bool
+    :ivar class_params: Parameters to initialize the dataclass.
+    :type class_params: Optional[Dict]
+    :ivar load_params: Additional parameters for loading functions.
+    :type load_params: Optional[Dict]
+    :ivar reverse_order: Flag to reverse the order of date range generation.
+    :type reverse_order: bool
+    :ivar overwrite: Whether to overwrite all files during processing.
+    :type overwrite: bool
+    :ivar ignore_missing: Whether to ignore missing files.
+    :type ignore_missing: bool
+    :ivar logger: Logger instance for logging information.
+    :type logger: Optional[Logger]
+    :ivar max_age_minutes: Maximum file age threshold in minutes.
+    :type max_age_minutes: int
+    :ivar history_days_threshold: Number of days for the history threshold.
+    :type history_days_threshold: int
+    :ivar show_progress: Flag to enable progress display.
+    :type show_progress: bool
+    :ivar timeout: Timeout in seconds for processing tasks with threads.
+    :type timeout: Optional[int]
+    """
     DEFAULT_MAX_AGE_MINUTES = 1440
     DEFAULT_HISTORY_DAYS_THRESHOLD = 30
 
@@ -80,7 +136,19 @@ class DataWrapper:
             yield date.date()
 
     def process(self):
-        """
+        """
+        Processes update tasks by generating an update plan, filtering required updates, and distributing
+        the workload across threads based on priority levels.
+
+        This method operates by assessing required updates through generated conditions,
+        grouping them by priority levels, and processing them in parallel threads.
+        Each thread handles the updates for a specific priority level, ensuring a streamlined approach
+        to handling the updates efficiently.
+
+        :raises TimeoutError: If a thread processing a priority level exceeds the allowed timeout duration.
+
+        :return: None
+        """
         update_plan_table = self.generate_update_plan_with_conditions()
 
         # Display the update plan table to the user if requested
|
             return True #
 
     def process_date(self, date: datetime.date):
-        """
+        """
+        Processes data for a given date and saves it as a Parquet file.
+
+        This method processes data for the specified date by loading the data
+        corresponding to that day, saving it into a structured storage format
+        (Parquet), and logging relevant information such as processing time
+        and errors that may occur during the process. It uses provided
+        dataclass and parameters to operate and ensures the data is stored
+        in a structured folder hierarchy.
+
+        :param date: The specific date for which data processing and saving should occur
+        :type date: datetime.date
+        :return: None
+        """
         folder = f'{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/'
         full_parquet_filename = f"{folder}{self.parquet_filename}"
 
@@ -196,10 +277,17 @@ class DataWrapper:
 
 
     def generate_update_plan_with_conditions(self):
         """
-
-
-
-
+        Generates an update plan for data files based on specific conditions. The function evaluates the need for updating or
+        overwriting data files for a given date range. Conditions include file existence, whether the file falls within a
+        specified historical threshold, and the necessity to overwrite or handle missing files. A priority map is utilized to
+        assign priority levels to update categories.
+
+        :raises FileNotFoundError: If any file is referenced that does not exist and the ``ignore_missing`` property is set to False.
+        :raises AttributeError: If any required attribute like ``fs``, ``dataclass``, or others are not properly set or initialized.
+
+        :return: A Pandas DataFrame representing the update plan, where each row contains information about a date, the conditions
+            evaluated for that date, and the determined update priority.
+        :rtype: pandas.DataFrame
         """
         rows = []
 
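The DataWrapper docstring enumerates its attributes and process() is documented above, but the constructor signature itself is not part of this diff; the sketch below therefore assumes the attributes can be passed as keyword arguments, and MyDataset stands in for a user-defined loader class.

import datetime

from sibi_dst.utils.data_wrapper import DataWrapper  # assumed import path

class MyDataset:
    """Placeholder for a user-defined dataclass that knows how to load one day of data."""

wrapper = DataWrapper(
    dataclass=MyDataset,
    date_field="created_at",
    data_path="s3://my-bucket/datasets/orders/",   # hypothetical location
    parquet_filename="orders.parquet",
    start_date=datetime.date(2024, 1, 1),
    end_date=datetime.date(2024, 1, 31),
    filesystem_type="s3",
    filesystem_options={"anon": False},
    overwrite=False,
    ignore_missing=True,
    show_progress=True,
)
wrapper.process()  # builds the update plan and processes dates grouped by priority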
sibi_dst/utils/date_utils.py
CHANGED
@@ -8,6 +8,24 @@ from sibi_dst.utils import Logger
 
 
 class DateUtils:
+    """
+    Utility class for date-related operations.
+
+    The DateUtils class provides a variety of operations to manipulate and retrieve
+    information about dates, such as calculating week ranges, determining start or
+    end dates for specific periods (quarters, months, years), and dynamically
+    registering custom time period functions. It also supports parsing specific
+    periods for date range computations and ensuring the input date is correctly
+    converted to the desired format.
+
+    :ivar logger: Logger instance used for logging messages. Defaults to the logger
+        for the current class if not provided.
+    :type logger: Logger
+
+    :ivar _PERIOD_FUNCTIONS: Stores dynamically registered period functions that
+        return start and end dates.
+    :type _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]]
+    """
     _PERIOD_FUNCTIONS: Dict[str, Callable[[], Tuple[datetime.date, datetime.date]]] = {}
 
     def __init__(self, logger=None):
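The _PERIOD_FUNCTIONS annotation above documents the shape of the period registry: a name mapped to a zero-argument callable returning a (start, end) date pair. The standalone sketch below mirrors that pattern without asserting DateUtils' actual registration API, which this diff does not show:

import datetime
from typing import Callable, Dict, Tuple

PeriodFn = Callable[[], Tuple[datetime.date, datetime.date]]
PERIOD_FUNCTIONS: Dict[str, PeriodFn] = {}

def register_period(name: str, func: PeriodFn) -> None:
    # Register a custom period under a lookup name.
    PERIOD_FUNCTIONS[name] = func

def last_7_days() -> Tuple[datetime.date, datetime.date]:
    today = datetime.date.today()
    return today - datetime.timedelta(days=7), today

register_period("last_7_days", last_7_days)
start, end = PERIOD_FUNCTIONS["last_7_days"]()
print(start, end)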
@@ -127,6 +145,23 @@ class DateUtils:
 
 
 class BusinessDays:
+    """
+    Provides functionality for handling business days calculations with a custom
+    holiday list. The class includes methods for calculating the number of
+    business days, modifying dates by adding business days, and applying these
+    operations to Dask DataFrames.
+
+    :ivar logger: Logger instance for logging error, warning, and debug messages.
+    :type logger: logging.Logger
+    :ivar HOLIDAY_LIST: Dictionary mapping years to lists of holiday dates.
+    :type HOLIDAY_LIST: dict
+    :ivar bd_cal: Numpy busdaycalendar object containing holidays and week mask.
+    :type bd_cal: numpy.busdaycalendar
+    :ivar holidays: Array of holiday dates used by the business day calendar.
+    :type holidays: numpy.ndarray
+    :ivar week_mask: Boolean array indicating working days within a week.
+    :type week_mask: numpy.ndarray
+    """
     def __init__(self, holiday_list, logger):
         """
         Initialize a BusinessDays object with a given holiday list.
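BusinessDays wraps numpy's business-day calendar (its bd_cal attribute is a numpy.busdaycalendar). The class's own calculation methods are not shown in this diff, so the sketch below uses the underlying numpy primitives directly; the holiday dates are illustrative.

import numpy as np

# Holiday list shaped as documented above: year -> list of ISO dates.
holiday_list = {"2024": ["2024-01-01", "2024-04-11", "2024-12-25"]}

holidays = np.array(
    [d for dates in holiday_list.values() for d in dates], dtype="datetime64[D]"
)
bd_cal = np.busdaycalendar(holidays=holidays)

# Business days between two dates, and a date shifted forward by 5 business days.
n_days = np.busday_count("2024-01-01", "2024-01-31", busdaycal=bd_cal)
shifted = np.busday_offset("2024-01-02", 5, roll="forward", busdaycal=bd_cal)
print(n_days, shifted)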
sibi_dst/utils/log_utils.py
CHANGED
@@ -1,11 +1,28 @@
-# Copyright (c) 2023. ISTMO Center S.A. All Rights Reserved
-#
 import logging
 import os
 import sys
 
 
 class Logger:
+    """
+    Handles the creation, setup, and management of logging functionalities.
+
+    This class facilitates logging by creating and managing a logger instance with
+    customizable logging directory, name, and file. It ensures logs from a script
+    are stored in a well-defined directory and file, and provides various logging
+    methods for different log levels. The logger automatically formats and handles
+    log messages. Additionally, this class provides a class method to initialize a
+    logger with default behaviors.
+
+    :ivar log_dir: Path to the directory where log files are stored.
+    :type log_dir: str
+    :ivar logger_name: Name of the logger instance.
+    :type logger_name: str
+    :ivar log_file: Base name of the log file.
+    :type log_file: str
+    :ivar logger: The initialized logger instance used for logging messages.
+    :type logger: logging.Logger
+    """
     def __init__(self, log_dir, logger_name, log_file):
         self.log_dir = log_dir
         self.logger_name = logger_name
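A short usage sketch. Logger.default_logger is the classmethod already used elsewhere in this release (for example in data_utils.py), and the explicit constructor takes log_dir, logger_name and log_file as shown above; the log-level methods used here (debug/warning) are the ones visible in this diff.

from sibi_dst.utils import Logger  # import path as used in data_utils.py

logger = Logger.default_logger(logger_name="nightly_etl")
logger.debug("starting nightly ETL run")
logger.warning("no rows found for 2024-01-15")  # illustrative message

file_logger = Logger(log_dir="logs", logger_name="nightly_etl", log_file="etl")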
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -38,6 +38,7 @@ class ParquetSaver:
         schema = self._define_schema()
         self._convert_dtypes(schema)
         self._save_dataframe_to_parquet(full_path, schema)
+        self.fs.close()
 
     def _define_schema(self) -> pa.Schema:
         """Define a PyArrow schema dynamically based on df_result column types."""
sibi_dst/utils/storage_manager.py
CHANGED
@@ -90,4 +90,7 @@ class StorageManager:
         """
         print("Rebuilding depot structure...")
         self.rebuild_depot_paths(depots, clear_existing=clear_existing)
-        print("Rebuild complete.")
+        print("Rebuild complete.")
+
+    def get_fs_instance(self):
+        return fsspec.filesystem(self.fs_type, **self.fs_options)
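The new get_fs_instance helper simply returns fsspec.filesystem(self.fs_type, **self.fs_options). A minimal sketch of what that call yields for a local filesystem (paths are illustrative):

import fsspec

fs = fsspec.filesystem("file")          # equivalent of fs_type="file" with no extra options
fs.makedirs("gis_data/depots", exist_ok=True)
print(fs.ls("gis_data"))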
{sibi_dst-0.3.31.dist-info → sibi_dst-0.3.33.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 0.3.31
+Version: 0.3.33
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -20,6 +20,7 @@ Requires-Dist: django (>=5.1.4,<6.0.0)
 Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
 Requires-Dist: folium (>=0.19.4,<0.20.0)
 Requires-Dist: geopandas (>=1.0.1,<2.0.0)
+Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
 Requires-Dist: httpx (>=0.27.2,<0.28.0)
 Requires-Dist: ipython (>=8.29.0,<9.0.0)
 Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -42,6 +43,7 @@ Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
 Requires-Dist: tornado (>=6.4.1,<7.0.0)
 Requires-Dist: tqdm (>=4.67.0,<5.0.0)
 Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
+Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
 Description-Content-Type: text/markdown
 
 # sibi-dst