sibi-dst 0.3.31__tar.gz → 0.3.33__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/PKG-INFO +3 -1
  2. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/pyproject.toml +3 -1
  3. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/_parquet_artifact.py +68 -0
  4. sibi_dst-0.3.33/sibi_dst/df_helper/_parquet_reader.py +94 -0
  5. sibi_dst-0.3.33/sibi_dst/df_helper/backends/django/_db_connection.py +88 -0
  6. sibi_dst-0.3.33/sibi_dst/df_helper/backends/django/_io_dask.py +450 -0
  7. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/django/_load_from_db.py +96 -1
  8. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/django/_sql_model_builder.py +132 -6
  9. sibi_dst-0.3.33/sibi_dst/df_helper/backends/http/_http_config.py +101 -0
  10. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/parquet/_filter_handler.py +28 -0
  11. sibi_dst-0.3.33/sibi_dst/df_helper/backends/parquet/_parquet_options.py +205 -0
  12. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +17 -0
  13. sibi_dst-0.3.33/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +141 -0
  14. sibi_dst-0.3.33/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +192 -0
  15. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/core/_params_config.py +59 -0
  16. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/geopy_helper/geo_location_service.py +14 -0
  17. sibi_dst-0.3.33/sibi_dst/geopy_helper/utils.py +89 -0
  18. sibi_dst-0.3.33/sibi_dst/osmnx_helper/base_osm_map.py +419 -0
  19. sibi_dst-0.3.33/sibi_dst/osmnx_helper/utils.py +489 -0
  20. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/clickhouse_writer.py +27 -0
  21. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/data_utils.py +32 -1
  22. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/data_wrapper.py +94 -6
  23. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/date_utils.py +35 -0
  24. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/log_utils.py +19 -2
  25. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/parquet_saver.py +1 -0
  26. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/storage_manager.py +4 -1
  27. sibi_dst-0.3.31/sibi_dst/df_helper/_parquet_reader.py +0 -50
  28. sibi_dst-0.3.31/sibi_dst/df_helper/backends/django/_db_connection.py +0 -48
  29. sibi_dst-0.3.31/sibi_dst/df_helper/backends/django/_io_dask.py +0 -242
  30. sibi_dst-0.3.31/sibi_dst/df_helper/backends/http/_http_config.py +0 -50
  31. sibi_dst-0.3.31/sibi_dst/df_helper/backends/parquet/_parquet_options.py +0 -101
  32. sibi_dst-0.3.31/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -63
  33. sibi_dst-0.3.31/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -131
  34. sibi_dst-0.3.31/sibi_dst/geopy_helper/utils.py +0 -55
  35. sibi_dst-0.3.31/sibi_dst/osmnx_helper/base_osm_map.py +0 -165
  36. sibi_dst-0.3.31/sibi_dst/osmnx_helper/utils.py +0 -267
  37. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/README.md +0 -0
  38. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/__init__.py +0 -0
  39. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/__init__.py +0 -0
  40. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/_df_helper.py +0 -0
  41. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/__init__.py +0 -0
  42. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/django/__init__.py +0 -0
  43. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  44. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -0
  45. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  46. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -0
  47. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  48. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/core/__init__.py +0 -0
  49. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/core/_defaults.py +0 -0
  50. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/core/_filter_handler.py +0 -0
  51. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/core/_query_config.py +0 -0
  52. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/df_helper/data_cleaner.py +0 -0
  53. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/geopy_helper/__init__.py +0 -0
  54. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/osmnx_helper/__init__.py +0 -0
  55. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  56. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  57. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  58. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/tests/__init__.py +0 -0
  59. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  60. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/__init__.py +0 -0
  61. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/airflow_manager.py +0 -0
  62. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/credentials.py +0 -0
  63. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/df_utils.py +0 -0
  64. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/file_utils.py +0 -0
  65. {sibi_dst-0.3.31 → sibi_dst-0.3.33}/sibi_dst/utils/filepath_generator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 0.3.31
3
+ Version: 0.3.33
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -20,6 +20,7 @@ Requires-Dist: django (>=5.1.4,<6.0.0)
20
20
  Requires-Dist: djangorestframework (>=3.15.2,<4.0.0)
21
21
  Requires-Dist: folium (>=0.19.4,<0.20.0)
22
22
  Requires-Dist: geopandas (>=1.0.1,<2.0.0)
23
+ Requires-Dist: gunicorn (>=23.0.0,<24.0.0)
23
24
  Requires-Dist: httpx (>=0.27.2,<0.28.0)
24
25
  Requires-Dist: ipython (>=8.29.0,<9.0.0)
25
26
  Requires-Dist: jinja2 (>=3.1.4,<4.0.0)
@@ -42,6 +43,7 @@ Requires-Dist: sqlalchemy (>=2.0.36,<3.0.0)
42
43
  Requires-Dist: tornado (>=6.4.1,<7.0.0)
43
44
  Requires-Dist: tqdm (>=4.67.0,<5.0.0)
44
45
  Requires-Dist: uvicorn (>=0.34.0,<0.35.0)
46
+ Requires-Dist: uvicorn-worker (>=0.3.0,<0.4.0)
45
47
  Description-Content-Type: text/markdown
46
48
 
47
49
  # sibi-dst
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "sibi-dst"
3
- version = "0.3.31"
3
+ version = "0.3.33"
4
4
  description = "Data Science Toolkit"
5
5
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
6
6
  readme = "README.md"
@@ -41,6 +41,8 @@ nltk = "^3.9.1"
41
41
  folium = "^0.19.4"
42
42
  geopandas = "^1.0.1"
43
43
  osmnx = "^2.0.1"
44
+ gunicorn = "^23.0.0"
45
+ uvicorn-worker = "^0.3.0"
44
46
 
45
47
 
46
48
  [build-system]
@@ -9,11 +9,74 @@ from sibi_dst.utils import DateUtils
9
9
 
10
10
 
11
11
  class ParquetArtifact(DfHelper):
12
+ """
13
+ Class designed to manage Parquet data storage and retrieval using a specified
14
+ DataWrapper class for data processing. It provides functionality for loading,
15
+ updating, rebuilding, and generating Parquet files within a configurable
16
+ storage filesystem. The class ensures that all essential configurations and
17
+ filesystems are properly set up before operations.
18
+
19
+ Detailed functionality includes support for dynamically managing and generating
20
+ Parquet files based on time periods, with customizable options for paths,
21
+ filenames, date fields, and more. It is an abstraction for efficiently handling
22
+ storage tasks related to distributed or local file systems.
23
+
24
+ :ivar config: Configuration dictionary containing all configurable parameters
25
+ for managing Parquet data storage, such as paths, filenames,
26
+ and date ranges.
27
+ :type config: dict
28
+ :ivar df: Cached Dask DataFrame used to store and manipulate data loaded
29
+ from the Parquet file.
30
+ :type df: Optional[dask.dataframe.DataFrame]
31
+ :ivar data_wrapper_class: Class responsible for abstracting data processing
32
+ operations required for Parquet file generation.
33
+ :type data_wrapper_class: type
34
+ :ivar date_field: Name of the field used to identify and process data by date.
35
+ :type date_field: Optional[str]
36
+ :ivar parquet_storage_path: Filesystem path to store Parquet files.
37
+ :type parquet_storage_path: Optional[str]
38
+ :ivar parquet_filename: Name of the Parquet file to be generated and managed.
39
+ :type parquet_filename: Optional[str]
40
+ :ivar parquet_start_date: Date string specifying the start date for data range
41
+ processing.
42
+ :type parquet_start_date: Optional[str]
43
+ :ivar parquet_end_date: Date string specifying the end date for data range
44
+ processing.
45
+ :type parquet_end_date: Optional[str]
46
+ :ivar filesystem_type: Type of the filesystem used for managing storage
47
+ operations (e.g., `file`, `s3`, etc.).
48
+ :type filesystem_type: str
49
+ :ivar filesystem_options: Additional options for configuring the filesystem.
50
+ :type filesystem_options: dict
51
+ :ivar fs: Filesystem object used for storage operations.
52
+ :type fs: fsspec.AbstractFileSystem
53
+ """
12
54
  DEFAULT_CONFIG = {
13
55
  'backend': 'parquet'
14
56
  }
15
57
 
16
58
  def __init__(self, data_wrapper_class, **kwargs):
59
+ """
60
+ Initializes an instance of the class with given configuration and validates
61
+ required parameters. Sets up the filesystem to handle storage, ensuring
62
+ necessary directories exist. The configuration supports a variety of options
63
+ to manage parquet storage requirements, including paths, filenames, and date
64
+ ranges.
65
+
66
+ :param data_wrapper_class: The class responsible for wrapping data to be managed
67
+ by this instance.
68
+ :type data_wrapper_class: type
69
+ :param kwargs: Arbitrary keyword arguments to override default configuration.
70
+ Includes settings for `date_field`, `parquet_storage_path`,
71
+ `parquet_filename`, `parquet_start_date`, `parquet_end_date`,
72
+ `filesystem_type`, `filesystem_options`, and `fs`.
73
+ :type kwargs: dict
74
+
75
+ :raises ValueError: If any of the required configuration options
76
+ (`date_field`, `parquet_storage_path`,
77
+ `parquet_filename`, `parquet_start_date`,
78
+ or `parquet_end_date`) are missing or not set properly.
79
+ """
17
80
  self.config = {
18
81
  **self.DEFAULT_CONFIG,
19
82
  **kwargs,
@@ -61,6 +124,11 @@ class ParquetArtifact(DfHelper):
61
124
  dw = DataWrapper(self.data_wrapper_class, **params)
62
125
  dw.process()
63
126
 
127
+ def __exit__(self, exc_type, exc_value, traceback):
128
+ # Ensure resources are cleaned up
129
+ if self.fs:
130
+ self.fs.close()
131
+
64
132
  def update_parquet(self, period: str = 'today', **kwargs) -> None:
65
133
  """Update the Parquet file with data from a specific period."""
66
134
  kwargs.update(self.parse_parquet_period(period=period))
@@ -0,0 +1,94 @@
1
+ from typing import Optional
2
+
3
+ import dask.dataframe as dd
4
+ import fsspec
5
+
6
+ from sibi_dst.df_helper import DfHelper
7
+
8
+
9
+ class ParquetReader(DfHelper):
10
+ """
11
+ This class is a specialized helper for reading and managing Parquet files.
12
+
13
+ The `ParquetReader` class is designed to facilitate working with Parquet
14
+ datasets stored across different filesystems. It initializes the required
15
+ resources, ensures the existence of the specified Parquet directory,
16
+ and provides an abstraction to load the data into a Dask DataFrame.
17
+
18
+ The class requires configuration for the storage path and dates defining
19
+ a range of interest. It also supports various filesystem types through
20
+ `fsspec`.
21
+
22
+ :ivar config: Holds the final configuration for this instance, combining
23
+ `DEFAULT_CONFIG` with user-provided configuration.
24
+ :type config: dict
25
+ :ivar df: Stores the loaded Dask DataFrame after the `load()` method is
26
+ invoked. Initially set to None.
27
+ :type df: Optional[dd.DataFrame]
28
+ :ivar parquet_storage_path: The path to the Parquet storage directory.
29
+ :type parquet_storage_path: str
30
+ :ivar parquet_start_date: Start date for Parquet data selection. Must
31
+ be set in the configuration.
32
+ :type parquet_start_date: str
33
+ :ivar parquet_end_date: End date for Parquet data selection. Must be
34
+ set in the configuration.
35
+ :type parquet_end_date: str
36
+ :ivar filesystem_type: The type of filesystem the Parquet files are
37
+ stored on (e.g., "file", "s3").
38
+ :type filesystem_type: str
39
+ :ivar filesystem_options: Any additional options required for the
40
+ specified filesystem type.
41
+ :type filesystem_options: dict
42
+ :ivar fs: Instance of `fsspec` filesystem used to interact with the
43
+ Parquet storage.
44
+ :type fs: fsspec.AbstractFileSystem
45
+ """
46
+ DEFAULT_CONFIG = {
47
+ 'backend': 'parquet'
48
+ }
49
+
50
+ def __init__(self, filesystem_type="file", filesystem_options=None, **kwargs):
51
+ self.config = {
52
+ **self.DEFAULT_CONFIG,
53
+ **kwargs,
54
+ }
55
+ self.df: Optional[dd.DataFrame] = None
56
+ self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
57
+ if self.parquet_storage_path is None:
58
+ raise ValueError('parquet_storage_path must be set')
59
+ self.parquet_start_date = self.config.setdefault('parquet_start_date', None)
60
+ if self.parquet_start_date is None:
61
+ raise ValueError('parquet_start_date must be set')
62
+
63
+ self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
64
+ if self.parquet_end_date is None:
65
+ raise ValueError('parquet_end_date must be set')
66
+
67
+ # Filesystem setup
68
+ self.filesystem_type = filesystem_type
69
+ self.filesystem_options = filesystem_options or {}
70
+ self.fs = self.config.setdefault('fs', None)
71
+ if self.fs is None:
72
+ self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
73
+ self.config.setdefault('fs', self.fs)
74
+
75
+ if not self.directory_exists():
76
+ raise ValueError(f"{self.parquet_storage_path} does not exist")
77
+
78
+ super().__init__(**self.config)
79
+
80
+ def load(self, **kwargs):
81
+ self.df = super().load(**kwargs)
82
+ return self.df
83
+
84
+ def directory_exists(self):
85
+ try:
86
+ info = self.fs.info(self.parquet_storage_path)
87
+ return info['type'] == 'directory'
88
+ except FileNotFoundError:
89
+ return False
90
+
91
+ def __exit__(self, exc_type, exc_value, traceback):
92
+ # Ensure resources are cleaned up
93
+ if self.fs:
94
+ self.fs.close()
@@ -0,0 +1,88 @@
1
+ from typing import Any
2
+
3
+ from pydantic import BaseModel, model_validator
4
+
5
+ from ._sql_model_builder import DjangoSqlModelBuilder
6
+
7
+
8
+ class DjangoConnectionConfig(BaseModel):
9
+ """
10
+ Represents a configuration for establishing a Django database connection.
11
+
12
+ This class is used for defining the configurations necessary to establish a Django
13
+ database connection. It supports dynamic model generation if the model is not
14
+ provided explicitly. It also validates the connection configuration to ensure it
15
+ is properly set up before being used.
16
+
17
+ :ivar live: Indicates whether the connection is live. Automatically set to False if
18
+ a table is provided without a pre-built model.
19
+ :type live: bool
20
+ :ivar connection_name: The name of the database connection to use. This is a mandatory
21
+ parameter and must be provided.
22
+ :type connection_name: str
23
+ :ivar table: The name of the database table to use. Required for dynamic model
24
+ generation when no model is provided.
25
+ :type table: str
26
+ :ivar model: The Django model that represents the database table. If not provided,
27
+ this can be generated dynamically by using the table name.
28
+ :type model: Any
29
+ """
30
+ live: bool = False
31
+ connection_name: str = None
32
+ table: str = None
33
+ model: Any = None
34
+
35
+ @model_validator(mode="after")
36
+ def check_model(self):
37
+ """
38
+ Validates and modifies the instance based on the provided attributes and conditions.
39
+ This method ensures that all required parameters are populated and consistent, and it
40
+ dynamically builds a model if necessary. The method also ensures the connection is
41
+ validated after the model preparation process.
42
+
43
+ :raises ValueError: If `connection_name` is not provided.
44
+ :raises ValueError: If `table` name is not specified when building the model dynamically.
45
+ :raises ValueError: If there are errors during the dynamic model-building process.
46
+ :raises ValueError: If `validate_connection` fails due to invalid configuration.
47
+ :return: The validated and potentially mutated instance.
48
+ """
49
+ # connection_name is mandatory
50
+ if self.connection_name is None:
51
+ raise ValueError("Connection name must be specified")
52
+
53
+ # If table is provided, enforce live=False
54
+ if self.table:
55
+ self.live = False
56
+
57
+ # If model is not provided, build it dynamically
58
+ if not self.model:
59
+ if not self.table:
60
+ raise ValueError("Table name must be specified to build the model")
61
+ try:
62
+ self.model = DjangoSqlModelBuilder(
63
+ connection_name=self.connection_name, table=self.table
64
+ ).build_model()
65
+ except Exception as e:
66
+ raise ValueError(f"Failed to build model: {e}")
67
+ else:
68
+ self.live = True
69
+ # Validate the connection after building the model
70
+ self.validate_connection()
71
+ return self
72
+
73
+ def validate_connection(self):
74
+ """
75
+ Ensures the database connection is valid by performing a simple
76
+ query. Raises a ValueError if the connection is broken or if any
77
+ other exception occurs during the query.
78
+
79
+ :raises ValueError: If the connection to the database cannot be
80
+ established or if the query fails.
81
+ """
82
+ try:
83
+ # Perform a simple query to test the connection
84
+ self.model.objects.using(self.connection_name).exists()
85
+ except Exception as e:
86
+ raise ValueError(
87
+ f"Failed to connect to the database '{self.connection_name}': {e}"
88
+ )