sibi-dst 0.3.63__py3-none-any.whl → 2025.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +186 -591
- sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -2
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +161 -115
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +291 -97
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +34 -105
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +175 -162
- sibi_dst/df_helper/core/__init__.py +0 -4
- sibi_dst/df_helper/core/_defaults.py +1 -50
- sibi_dst/df_helper/core/_query_config.py +2 -2
- sibi_dst/utils/__init__.py +0 -2
- sibi_dst/utils/data_wrapper.py +9 -12
- sibi_dst/utils/log_utils.py +15 -11
- sibi_dst/utils/update_planner.py +2 -0
- sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +325 -50
- sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +2 -2
- sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +330 -51
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +91 -0
- sibi_dst-2025.1.1.dist-info/METADATA +55 -0
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/RECORD +23 -26
- sibi_dst/df_helper/backends/django/__init__.py +0 -11
- sibi_dst/df_helper/backends/django/_db_connection.py +0 -88
- sibi_dst/df_helper/backends/django/_io_dask.py +0 -450
- sibi_dst/df_helper/backends/django/_load_from_db.py +0 -227
- sibi_dst/df_helper/backends/django/_sql_model_builder.py +0 -493
- sibi_dst/df_helper/backends/sqlalchemy/_filter_handler.py +0 -119
- sibi_dst/utils/airflow_manager.py +0 -212
- sibi_dst-0.3.63.dist-info/METADATA +0 -90
- {sibi_dst-0.3.63.dist-info → sibi_dst-2025.1.1.dist-info}/WHEEL +0 -0
@@ -1,193 +1,206 @@
|
|
1
1
|
import re
|
2
|
+
import keyword
|
3
|
+
import threading
|
4
|
+
from sqlalchemy import MetaData, Engine
|
5
|
+
from sqlalchemy.orm import DeclarativeBase
|
2
6
|
|
3
|
-
from sqlalchemy import MetaData, Table
|
4
|
-
from sqlalchemy.orm import declarative_base, relationship
|
5
7
|
|
6
|
-
|
7
|
-
|
8
|
+
class Base(DeclarativeBase):
|
9
|
+
"""Shared declarative base for all ORM models."""
|
10
|
+
pass
|
8
11
|
|
9
|
-
|
12
|
+
|
13
|
+
apps_label = "datacubes.models"
|
10
14
|
|
11
15
|
|
12
16
|
class SqlAlchemyModelBuilder:
|
13
17
|
"""
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
database tables.
|
18
|
-
|
19
|
-
The primary purpose of this class is to simplify the process of creating
|
20
|
-
SQLAlchemy ORM models by reflecting tables from a connected database,
|
21
|
-
dynamically generating model classes, and handling relationships between
|
22
|
-
tables.
|
23
|
-
|
24
|
-
:ivar engine: SQLAlchemy engine connected to the database.
|
25
|
-
:type engine: Engine
|
26
|
-
:ivar table_name: Name of the table for which the model is generated.
|
27
|
-
:type table_name: str
|
28
|
-
:ivar metadata: SQLAlchemy MetaData instance for reflecting tables.
|
29
|
-
:type metadata: MetaData
|
30
|
-
:ivar table: Reflected SQLAlchemy Table object for the specified table name.
|
31
|
-
:type table: Optional[Table]
|
32
|
-
:ivar class_name: Dynamically normalized class name derived from table_name.
|
33
|
-
:type class_name: str
|
18
|
+
Builds a single SQLAlchemy ORM model from a specific database table.
|
19
|
+
This class is thread-safe and caches reflected table metadata to
|
20
|
+
improve performance across multiple instantiations.
|
34
21
|
"""
|
35
|
-
|
22
|
+
_lock = threading.Lock()
|
23
|
+
_metadata_cache: dict[str, MetaData] = {}
|
36
24
|
|
37
|
-
def __init__(self, engine, table_name):
|
25
|
+
def __init__(self, engine: Engine, table_name: str):
|
38
26
|
"""
|
39
|
-
|
27
|
+
Initializes the model builder for a specific table.
|
40
28
|
|
41
29
|
Args:
|
42
|
-
engine: SQLAlchemy engine connected to the database.
|
43
|
-
table_name
|
30
|
+
engine: The SQLAlchemy engine connected to the database.
|
31
|
+
table_name: The name of the table to generate the model for.
|
44
32
|
"""
|
45
33
|
self.engine = engine
|
46
34
|
self.table_name = table_name
|
47
|
-
self.
|
48
|
-
self.table = None # Placeholder for the specific table
|
49
|
-
self.class_name = self.normalize_class_name(self.table_name)
|
35
|
+
self.class_name = self._normalize_class_name(self.table_name)
|
50
36
|
|
51
|
-
|
52
|
-
"""
|
53
|
-
Builds and returns a database model class corresponding to the specified table name.
|
54
|
-
The method checks if the model is already registered in the ORM's registry. If not,
|
55
|
-
it reflects the database schema of the specified table and dynamically creates the
|
56
|
-
model class.
|
57
|
-
|
58
|
-
:raises ValueError: If the specified table does not exist in the database.
|
59
|
-
:return: A database model class corresponding to the specified table name.
|
60
|
-
:rtype: type
|
61
|
-
"""
|
62
|
-
# Check if the model is already registered
|
63
|
-
model = Base.registry._class_registry.get(self.class_name)
|
64
|
-
if model:
|
65
|
-
return model
|
37
|
+
engine_key = str(engine.url)
|
66
38
|
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
39
|
+
# ✅ REFACTOR: Acquire lock to make cache access and creation atomic,
|
40
|
+
# preventing a race condition between multiple threads.
|
41
|
+
with self._lock:
|
42
|
+
if engine_key not in self._metadata_cache:
|
43
|
+
self._metadata_cache[engine_key] = MetaData()
|
44
|
+
self.metadata = self._metadata_cache[engine_key]
|
71
45
|
|
72
|
-
|
73
|
-
return model
|
74
|
-
|
75
|
-
def create_model(self) -> type:
|
46
|
+
def build_model(self) -> type:
|
76
47
|
"""
|
77
|
-
|
78
|
-
|
79
|
-
attributes, and creates the model class if it doesn't already exist in the
|
80
|
-
SQLAlchemy base registry.
|
48
|
+
Builds and returns a database model class for the specified table.
|
49
|
+
This process is atomic and thread-safe.
|
81
50
|
|
82
|
-
:
|
83
|
-
|
84
|
-
:
|
85
|
-
|
86
|
-
:return: The dynamically created or fetched model class.
|
87
|
-
:rtype: type
|
51
|
+
Raises:
|
52
|
+
ValueError: If the specified table does not exist in the database.
|
53
|
+
Returns:
|
54
|
+
The dynamically created ORM model class.
|
88
55
|
"""
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
56
|
+
with self._lock:
|
57
|
+
# ✅ REFACTOR: Add a comment acknowledging the risk of using an
|
58
|
+
# internal API. This is a maintenance warning for future developers.
|
59
|
+
# NOTE: Using a private SQLAlchemy API. This is a performance
|
60
|
+
# optimization but may break in future versions of the library.
|
61
|
+
registered_model = Base.registry._class_registry.get(self.class_name)
|
62
|
+
if registered_model:
|
63
|
+
return registered_model
|
64
|
+
|
65
|
+
# Check if the table's schema is in our metadata cache
|
66
|
+
table = self.metadata.tables.get(self.table_name)
|
67
|
+
|
68
|
+
# If not cached, reflect it from the database
|
69
|
+
if table is None:
|
70
|
+
self.metadata.reflect(bind=self.engine, only=[self.table_name])
|
71
|
+
table = self.metadata.tables.get(self.table_name)
|
72
|
+
|
73
|
+
if table is None:
|
74
|
+
raise ValueError(
|
75
|
+
f"Table '{self.table_name}' does not exist in the database."
|
76
|
+
)
|
77
|
+
|
78
|
+
# Create the model class dynamically.
|
79
|
+
attrs = {
|
80
|
+
"__tablename__": table.name,
|
81
|
+
"__table__": table,
|
82
|
+
"__module__": apps_label,
|
83
|
+
}
|
105
84
|
model = type(self.class_name, (Base,), attrs)
|
106
|
-
# Add the class to Base.registry so it is registered
|
107
|
-
Base.registry._class_registry[self.class_name] = model
|
108
|
-
return model
|
109
|
-
|
110
|
-
def get_columns(self, table: Table):
|
111
|
-
"""
|
112
|
-
Extracts and returns a dictionary of column names and their corresponding column
|
113
|
-
objects from a given table, excluding reserved names. Reserved names are used
|
114
|
-
internally and should not overlap with column names in the provided table. The
|
115
|
-
method ensures sanitized column names through normalization and filters out any
|
116
|
-
column matching reserved keywords.
|
117
|
-
|
118
|
-
:param table: The table object from which columns are to be extracted.
|
119
|
-
:type table: Table
|
120
|
-
:return: A dictionary containing the sanitized column names as keys and their
|
121
|
-
corresponding column objects as values, excluding reserved names.
|
122
|
-
:rtype: dict
|
123
|
-
"""
|
124
|
-
columns = {}
|
125
|
-
reserved_names = ["metadata", "class_", "table"]
|
126
|
-
|
127
|
-
for column in table.columns:
|
128
|
-
column_name = self.normalize_column_name(column.name)
|
129
|
-
if column_name not in reserved_names:
|
130
|
-
columns[column_name] = column
|
131
|
-
return columns
|
132
|
-
|
133
|
-
def add_relationships(self, attrs, table: Table):
|
134
|
-
"""
|
135
|
-
Adds relationships to the provided attributes dictionary for a given database table.
|
136
|
-
|
137
|
-
This method iterates through the foreign keys of the provided table, constructs
|
138
|
-
relationship attributes, and updates the attributes dictionary with relationships
|
139
|
-
that connect the current table to related tables.
|
140
|
-
|
141
|
-
:param attrs: Dictionary of attributes to which relationships will be added.
|
142
|
-
The dictionary will be updated with new relationship mappings.
|
143
|
-
:type attrs: dict
|
144
|
-
:param table: A database table object containing foreign key relationships.
|
145
|
-
The method will use this table to establish relationships.
|
146
|
-
:return: None
|
147
|
-
"""
|
148
|
-
for fk in table.foreign_keys:
|
149
|
-
related_table_name = fk.column.table.name
|
150
|
-
related_class_name = self.normalize_class_name(related_table_name)
|
151
|
-
relationship_name = self.normalize_column_name(related_table_name)
|
152
|
-
attrs[relationship_name] = relationship(related_class_name, back_populates=None)
|
153
85
|
|
86
|
+
return model
|
154
87
|
|
155
88
|
@staticmethod
|
156
|
-
def
|
157
|
-
"""
|
158
|
-
Generate a normalized class name from a given table name by capitalizing
|
159
|
-
each word separated by underscores and concatenating them.
|
160
|
-
|
161
|
-
This static method takes a string representation of a table name, where
|
162
|
-
words are separated by underscores, and converts it into a camel case
|
163
|
-
class name. It processes the string by capitalizing the first letter of
|
164
|
-
each word and removing the underscores. The normalized class name
|
165
|
-
returned can be used programmatically for various purposes, such as
|
166
|
-
class generation or naming conventions.
|
167
|
-
|
168
|
-
:param table_name: The table name to normalize, with words separated by
|
169
|
-
underscores. E.g., 'sample_table' becomes 'SampleTable'.
|
170
|
-
:type table_name: str
|
171
|
-
:return: A normalized class name in camel case format.
|
172
|
-
:rtype: str
|
173
|
-
"""
|
89
|
+
def _normalize_class_name(table_name: str) -> str:
|
90
|
+
"""Converts a snake_case table_name to a CamelCase class name."""
|
174
91
|
return "".join(word.capitalize() for word in table_name.split("_"))
|
175
92
|
|
176
93
|
@staticmethod
|
177
|
-
def
|
94
|
+
def _normalize_column_name(column_name: str) -> str:
|
178
95
|
"""
|
179
|
-
|
180
|
-
|
181
|
-
such as 'class', 'def', 'return', etc. If the normalized name conflicts with
|
182
|
-
a Python reserved keyword, "_field" is appended to it.
|
183
|
-
|
184
|
-
:param column_name: The original name of the column to be normalized.
|
185
|
-
:type column_name: str
|
186
|
-
:return: A normalized column name that is safe and compatible for usage
|
187
|
-
in various contexts such as database columns or Python code.
|
188
|
-
:rtype: str
|
96
|
+
Sanitizes a column name to be a valid Python identifier.
|
97
|
+
(Kept for utility, though not used in the final model creation).
|
189
98
|
"""
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
99
|
+
sane_name = re.sub(r"\W", "_", column_name)
|
100
|
+
sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
|
101
|
+
|
102
|
+
if keyword.iskeyword(sane_name):
|
103
|
+
return f"{sane_name}_field"
|
104
|
+
return sane_name
|
105
|
+
|
106
|
+
# import re
|
107
|
+
# import keyword
|
108
|
+
# import threading
|
109
|
+
# from sqlalchemy import MetaData, Engine
|
110
|
+
# from sqlalchemy.orm import DeclarativeBase
|
111
|
+
#
|
112
|
+
#
|
113
|
+
#
|
114
|
+
# class Base(DeclarativeBase):
|
115
|
+
# """shared declarative base for all ORM models."""
|
116
|
+
# pass
|
117
|
+
#
|
118
|
+
#
|
119
|
+
# apps_label = "datacubes.models"
|
120
|
+
#
|
121
|
+
#
|
122
|
+
# class SqlAlchemyModelBuilder:
|
123
|
+
# """
|
124
|
+
# Builds a single SQLAlchemy ORM model from a specific database table.
|
125
|
+
# This class is thread-safe and caches reflected table metadata to
|
126
|
+
# improve performance across multiple instantiations.
|
127
|
+
# """
|
128
|
+
# _lock = threading.Lock()
|
129
|
+
# _metadata_cache: dict[str, MetaData] = {}
|
130
|
+
#
|
131
|
+
# def __init__(self, engine: Engine, table_name: str):
|
132
|
+
# """
|
133
|
+
# Initializes the model builder for a specific table.
|
134
|
+
#
|
135
|
+
# Args:
|
136
|
+
# engine: The SQLAlchemy engine connected to the database.
|
137
|
+
# table_name: The name of the table to generate the model for.
|
138
|
+
# """
|
139
|
+
# self.engine = engine
|
140
|
+
# self.table_name = table_name
|
141
|
+
# self.class_name = self._normalize_class_name(self.table_name)
|
142
|
+
#
|
143
|
+
# # Use or create a cached MetaData object for this engine to avoid
|
144
|
+
# # re-reading the schema for tables that are already known.
|
145
|
+
# engine_key = str(engine.url)
|
146
|
+
# if engine_key not in self._metadata_cache:
|
147
|
+
# self._metadata_cache[engine_key] = MetaData()
|
148
|
+
# self.metadata = self._metadata_cache[engine_key]
|
149
|
+
#
|
150
|
+
# def build_model(self) -> type:
|
151
|
+
# """
|
152
|
+
# Builds and returns a database model class for the specified table.
|
153
|
+
# This process is atomic and thread-safe.
|
154
|
+
#
|
155
|
+
# Raises:
|
156
|
+
# ValueError: If the specified table does not exist in the database.
|
157
|
+
# Returns:
|
158
|
+
# The dynamically created ORM model class.
|
159
|
+
# """
|
160
|
+
# with self._lock:
|
161
|
+
# # First, check if the model class is already registered in SQLAlchemy
|
162
|
+
# registered_model = Base.registry._class_registry.get(self.class_name)
|
163
|
+
# if registered_model:
|
164
|
+
# return registered_model
|
165
|
+
#
|
166
|
+
# # Next, check if the table's schema is in our metadata cache
|
167
|
+
# table = self.metadata.tables.get(self.table_name)
|
168
|
+
#
|
169
|
+
# # If not cached, reflect it from the database
|
170
|
+
# if table is None:
|
171
|
+
# self.metadata.reflect(bind=self.engine, only=[self.table_name])
|
172
|
+
# table = self.metadata.tables.get(self.table_name)
|
173
|
+
#
|
174
|
+
# if table is None:
|
175
|
+
# raise ValueError(
|
176
|
+
# f"Table '{self.table_name}' does not exist in the database."
|
177
|
+
# )
|
178
|
+
#
|
179
|
+
# # Create the model class dynamically.
|
180
|
+
# # No need to add columns manually; __table__ handles it.
|
181
|
+
# attrs = {
|
182
|
+
# "__tablename__": table.name,
|
183
|
+
# "__table__": table,
|
184
|
+
# "__module__": apps_label,
|
185
|
+
# }
|
186
|
+
# model = type(self.class_name, (Base,), attrs)
|
187
|
+
#
|
188
|
+
# return model
|
189
|
+
#
|
190
|
+
# @staticmethod
|
191
|
+
# def _normalize_class_name(table_name: str) -> str:
|
192
|
+
# """Converts a snake_case table_name to a CamelCase class name."""
|
193
|
+
# return "".join(word.capitalize() for word in table_name.split("_"))
|
194
|
+
#
|
195
|
+
# @staticmethod
|
196
|
+
# def _normalize_column_name(column_name: str) -> str:
|
197
|
+
# """
|
198
|
+
# Sanitizes a column name to be a valid Python identifier.
|
199
|
+
# (Kept for utility, though not used in the final model creation).
|
200
|
+
# """
|
201
|
+
# sane_name = re.sub(r"\W", "_", column_name)
|
202
|
+
# sane_name = re.sub(r"^\d", r"_\g<0>", sane_name)
|
203
|
+
#
|
204
|
+
# if keyword.iskeyword(sane_name):
|
205
|
+
# return f"{sane_name}_field"
|
206
|
+
# return sane_name
|
@@ -1,8 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
from ._defaults import (
|
4
|
-
django_field_conversion_map_pandas,
|
5
|
-
django_field_conversion_map_dask,
|
6
4
|
sqlalchemy_field_conversion_map_dask,
|
7
5
|
normalize_sqlalchemy_type)
|
8
6
|
from ._filter_handler import FilterHandler
|
@@ -12,8 +10,6 @@ from ._query_config import QueryConfig
|
|
12
10
|
__all__ = [
|
13
11
|
"ParamsConfig",
|
14
12
|
"QueryConfig",
|
15
|
-
"django_field_conversion_map_pandas",
|
16
|
-
"django_field_conversion_map_dask",
|
17
13
|
"sqlalchemy_field_conversion_map_dask",
|
18
14
|
"normalize_sqlalchemy_type",
|
19
15
|
"FilterHandler",
|
@@ -13,56 +13,7 @@ from sqlalchemy.dialects.mysql import TINYINT, MEDIUMTEXT
|
|
13
13
|
# conversion_map is a dictionary that maps the field types to their corresponding data type conversion functions.
|
14
14
|
# Each entry in the dictionary is a pair of a field type (as a string) and a callable function that performs the
|
15
15
|
# conversion. This mapping is used to convert the values in a pandas DataFrame to the appropriate data types based on
|
16
|
-
# the
|
17
|
-
|
18
|
-
django_field_conversion_map_pandas: Dict[str, callable] = {
|
19
|
-
"CharField": lambda x: x.astype(str),
|
20
|
-
"TextField": lambda x: x.astype(str),
|
21
|
-
"IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
22
|
-
"AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
|
23
|
-
"BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
|
24
|
-
"BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
25
|
-
"SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
26
|
-
"PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
27
|
-
"PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
28
|
-
"FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
|
29
|
-
"DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
|
30
|
-
"BooleanField": lambda x: x.astype(bool),
|
31
|
-
"NullBooleanField": lambda x: x.astype(bool),
|
32
|
-
"DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
|
33
|
-
"DateField": lambda x: pd.to_datetime(x, errors="coerce").dt.date,
|
34
|
-
"TimeField": lambda x: pd.to_datetime(x, errors="coerce").dt.time,
|
35
|
-
"DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
|
36
|
-
# for JSONField, assuming JSON objects are represented as string in df
|
37
|
-
"JSONField": lambda x: x.apply(json.loads),
|
38
|
-
"ArrayField": lambda x: x.apply(eval),
|
39
|
-
"UUIDField": lambda x: x.astype(str),
|
40
|
-
}
|
41
|
-
|
42
|
-
django_field_conversion_map_dask: Dict[str, callable] = {
|
43
|
-
"CharField": lambda x: x.astype(str),
|
44
|
-
"TextField": lambda x: x.astype(str),
|
45
|
-
"IntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
46
|
-
"AutoField": lambda x: pd.to_numeric(x, errors="coerce"),
|
47
|
-
"BigAutoField": lambda x: pd.to_numeric(x, errors="coerce"),
|
48
|
-
"BigIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
49
|
-
"SmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
50
|
-
"PositiveIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
51
|
-
"PositiveSmallIntegerField": lambda x: pd.to_numeric(x, errors="coerce"),
|
52
|
-
"FloatField": lambda x: pd.to_numeric(x, errors="coerce"),
|
53
|
-
"DecimalField": lambda x: pd.to_numeric(x, errors="coerce"),
|
54
|
-
"BooleanField": lambda x: x.astype(bool),
|
55
|
-
"NullBooleanField": lambda x: x.astype(bool),
|
56
|
-
"DateTimeField": lambda x: pd.to_datetime(x, errors="coerce"),
|
57
|
-
"DateField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.date,
|
58
|
-
meta=("date", "object")),
|
59
|
-
"TimeField": lambda x: pd.to_datetime(x, errors="coerce").map_partitions(lambda x: x.dt.time,
|
60
|
-
meta=("time", "object")),
|
61
|
-
"DurationField": lambda x: pd.to_timedelta(x, errors="coerce"),
|
62
|
-
"JSONField": lambda x: x.map_partitions(lambda s: s.apply(json.loads), meta=("json", "object")),
|
63
|
-
"ArrayField": lambda x: x.map_partitions(lambda s: s.apply(eval), meta=("array", "object")),
|
64
|
-
"UUIDField": lambda x: x.astype(str),
|
65
|
-
}
|
16
|
+
# the db field type.
|
66
17
|
|
67
18
|
sqlalchemy_field_conversion_map_dask: Dict[str, callable] = {
|
68
19
|
String.__name__: lambda x: x.astype(str).fillna(""),
|
@@ -7,8 +7,8 @@ class QueryConfig(BaseModel):
|
|
7
7
|
use_exclude: bool = False
|
8
8
|
n_records: int = 100
|
9
9
|
dt_field: Optional[str] = None
|
10
|
-
use_dask: bool =
|
11
|
-
as_dask: bool =
|
10
|
+
use_dask: bool = True
|
11
|
+
as_dask: bool = True
|
12
12
|
|
13
13
|
@model_validator(mode='after')
|
14
14
|
def check_n_records(self):
|
sibi_dst/utils/__init__.py
CHANGED
@@ -10,7 +10,6 @@ from .df_utils import DfUtils
|
|
10
10
|
from .storage_manager import StorageManager
|
11
11
|
from .parquet_saver import ParquetSaver
|
12
12
|
from .clickhouse_writer import ClickHouseWriter
|
13
|
-
from .airflow_manager import AirflowDAGManager
|
14
13
|
from .credentials import *
|
15
14
|
from .update_planner import UpdatePlanner
|
16
15
|
from .data_wrapper import DataWrapper
|
@@ -35,7 +34,6 @@ __all__ = [
|
|
35
34
|
"StorageManager",
|
36
35
|
"DfUtils",
|
37
36
|
"ClickHouseWriter",
|
38
|
-
"AirflowDAGManager",
|
39
37
|
"StorageConfig",
|
40
38
|
"FsRegistry",
|
41
39
|
"DataFromHttpSource",
|
sibi_dst/utils/data_wrapper.py
CHANGED
@@ -38,7 +38,7 @@ class DataWrapper:
|
|
38
38
|
logger: Logger = None,
|
39
39
|
show_progress: bool = False,
|
40
40
|
timeout: float = 30,
|
41
|
-
max_threads: int =
|
41
|
+
max_threads: int = 3,
|
42
42
|
**kwargs: Any,
|
43
43
|
):
|
44
44
|
self.dataclass = dataclass
|
@@ -66,6 +66,7 @@ class DataWrapper:
|
|
66
66
|
self.benchmarks: Dict[datetime.date, Dict[str, float]] = {}
|
67
67
|
self.mmanifest = kwargs.get("mmanifest", None)
|
68
68
|
self.update_planner=kwargs.get("update_planner", None)
|
69
|
+
self.datacls = self.dataclass(**self.class_params)
|
69
70
|
|
70
71
|
def __enter__(self):
|
71
72
|
"""Context manager entry"""
|
@@ -164,28 +165,24 @@ class DataWrapper:
|
|
164
165
|
def _process_single_date(self, date: datetime.date):
|
165
166
|
"""Core date processing logic with load/save timing and thread reporting"""
|
166
167
|
path = f"{self.data_path}{date.year}/{date.month:02d}/{date.day:02d}/"
|
167
|
-
self.logger.
|
168
|
-
# self.logger.info(f"Path {path} in {self.skipped}: {path in self.skipped}")
|
168
|
+
self.logger.debug(f"Processing date {date.isoformat()} for {path}")
|
169
169
|
if path in self.update_planner.skipped and self.update_planner.ignore_missing:
|
170
170
|
self.logger.info(f"Skipping {date} as it exists in the skipped list")
|
171
171
|
return
|
172
172
|
full_path = f"{path}{self.parquet_filename}"
|
173
173
|
|
174
174
|
thread_name = threading.current_thread().name
|
175
|
-
self.logger.
|
175
|
+
self.logger.debug(f"[{thread_name}] Executing date: {date} -> saving to: {full_path}")
|
176
176
|
|
177
177
|
overall_start = time.perf_counter()
|
178
178
|
try:
|
179
179
|
load_start = time.perf_counter()
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
**self.load_params
|
186
|
-
)
|
180
|
+
date_filter = {f"{self.date_field}__date": {date.isoformat()}}
|
181
|
+
self.logger.debug(f"Loading data for {date} with filter: {date_filter}")
|
182
|
+
# Load data using the dataclass with the provided date filter
|
183
|
+
self.load_params.update(date_filter)
|
184
|
+
df = self.datacls.load(**self.load_params)
|
187
185
|
load_time = time.perf_counter() - load_start
|
188
|
-
|
189
186
|
if df.head(1, compute=True).empty:
|
190
187
|
if self.mmanifest:
|
191
188
|
schema = df._meta.dtypes.astype(str).to_dict()
|
sibi_dst/utils/log_utils.py
CHANGED
@@ -115,22 +115,26 @@ class Logger:
|
|
115
115
|
"""
|
116
116
|
self.logger.setLevel(level)
|
117
117
|
|
118
|
-
def debug(self, msg: str):
|
118
|
+
def debug(self, msg: str, *args, **kwargs):
|
119
119
|
"""Log a debug message."""
|
120
|
-
self.logger.debug(msg)
|
120
|
+
self.logger.debug(msg, *args, **kwargs)
|
121
121
|
|
122
|
-
def info(self, msg: str):
|
122
|
+
def info(self, msg: str, *args, **kwargs):
|
123
123
|
"""Log an info message."""
|
124
|
-
self.logger.info(msg)
|
124
|
+
self.logger.info(msg, *args, **kwargs)
|
125
125
|
|
126
|
-
def warning(self, msg: str):
|
126
|
+
def warning(self, msg: str, *args, **kwargs):
|
127
127
|
"""Log a warning message."""
|
128
|
-
self.logger.warning(msg)
|
128
|
+
self.logger.warning(msg, *args, **kwargs)
|
129
129
|
|
130
|
-
def error(self, msg: str):
|
131
|
-
"""
|
132
|
-
|
130
|
+
def error(self, msg: str, *args, **kwargs):
|
131
|
+
"""
|
132
|
+
Log an error message.
|
133
|
+
|
134
|
+
To log exception information, use the `exc_info=True` keyword argument.
|
135
|
+
"""
|
136
|
+
self.logger.error(msg, *args, **kwargs)
|
133
137
|
|
134
|
-
def critical(self, msg: str):
|
138
|
+
def critical(self, msg: str, *args, **kwargs):
|
135
139
|
"""Log a critical message."""
|
136
|
-
self.logger.critical(msg)
|
140
|
+
self.logger.critical(msg, *args, **kwargs)
|
sibi_dst/utils/update_planner.py
CHANGED
@@ -73,6 +73,8 @@ class UpdatePlanner:
|
|
73
73
|
self.show_progress = show_progress
|
74
74
|
self.logger = logger or Logger.default_logger(logger_name="update_planner")
|
75
75
|
self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
|
76
|
+
self.debug = debug
|
77
|
+
self.verbose = verbose
|
76
78
|
|
77
79
|
# Filesystem and age helper
|
78
80
|
self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))
|