sibi-dst 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/__init__.py +2 -0
- sibi_dst/df_helper/_df_helper.py +3 -0
- sibi_dst/df_helper/_parquet_reader.py +49 -0
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -1
- sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py +13 -18
- sibi_dst/utils/_data_wrapper.py +16 -38
- {sibi_dst-0.3.18.dist-info → sibi_dst-0.3.19.dist-info}/METADATA +4 -4
- {sibi_dst-0.3.18.dist-info → sibi_dst-0.3.19.dist-info}/RECORD +9 -8
- {sibi_dst-0.3.18.dist-info → sibi_dst-0.3.19.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/__init__.py
CHANGED
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -56,6 +56,9 @@ class DfHelper:
|
|
56
56
|
kwargs.setdefault("logger", self.logger)
|
57
57
|
self.post_init(**kwargs)
|
58
58
|
|
59
|
+
def __str__(self):
|
60
|
+
return self.__class__.__name__
|
61
|
+
|
59
62
|
def post_init(self, **kwargs):
|
60
63
|
self.logger.debug(f"backend used: {self.backend}")
|
61
64
|
self.backend_query = self.__get_config(QueryConfig, kwargs)
|
@@ -0,0 +1,49 @@
|
|
1
|
+
from typing import Optional
|
2
|
+
|
3
|
+
import dask.dataframe as dd
|
4
|
+
import fsspec
|
5
|
+
|
6
|
+
from sibi_dst.df_helper import DfHelper
|
7
|
+
|
8
|
+
class ParquetReader(DfHelper):
|
9
|
+
DEFAULT_CONFIG = {
|
10
|
+
'backend': 'parquet'
|
11
|
+
}
|
12
|
+
|
13
|
+
def __init__(self, filesystem_type="file", filesystem_options=None, **kwargs):
|
14
|
+
self.config = {
|
15
|
+
**self.DEFAULT_CONFIG,
|
16
|
+
**kwargs,
|
17
|
+
}
|
18
|
+
self.df: Optional[dd.DataFrame] = None
|
19
|
+
self.parquet_storage_path = self.config.setdefault('parquet_storage_path', None)
|
20
|
+
if self.parquet_storage_path is None:
|
21
|
+
raise ValueError('parquet_storage_path must be set')
|
22
|
+
self.parquet_start_date = self.config.setdefault('parquet_start_date', None)
|
23
|
+
if self.parquet_start_date is None:
|
24
|
+
raise ValueError('parquet_start_date must be set')
|
25
|
+
|
26
|
+
self.parquet_end_date = self.config.setdefault('parquet_end_date', None)
|
27
|
+
if self.parquet_end_date is None:
|
28
|
+
raise ValueError('parquet_end_date must be set')
|
29
|
+
|
30
|
+
# Filesystem setup
|
31
|
+
self.filesystem_type = filesystem_type
|
32
|
+
self.filesystem_options = filesystem_options or {}
|
33
|
+
self.fs = fsspec.filesystem(self.filesystem_type, **self.filesystem_options)
|
34
|
+
|
35
|
+
if not self.directory_exists():
|
36
|
+
raise ValueError(f"{self.parquet_storage_path} does not exist")
|
37
|
+
|
38
|
+
super().__init__(**self.config)
|
39
|
+
|
40
|
+
def load(self, **kwargs):
|
41
|
+
self.df = super().load(**kwargs)
|
42
|
+
return self.df
|
43
|
+
|
44
|
+
def directory_exists(self):
|
45
|
+
try:
|
46
|
+
info = self.fs.info(self.parquet_storage_path)
|
47
|
+
return info['type'] == 'directory'
|
48
|
+
except FileNotFoundError:
|
49
|
+
return False
|
@@ -52,7 +52,7 @@ class ParquetConfig(BaseModel):
|
|
52
52
|
raise ValueError('Parquet end date must be greater than start date')
|
53
53
|
|
54
54
|
# Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
|
55
|
-
self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path)).generate_file_paths(start_date, end_date)
|
55
|
+
self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), logger=self.logger).generate_file_paths(start_date, end_date)
|
56
56
|
self.parquet_size_bytes = self.get_parquet_size_bytes()
|
57
57
|
self.load_parquet = True
|
58
58
|
#self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
|
@@ -9,6 +9,7 @@ Base = declarative_base()
|
|
9
9
|
apps_label = "datacubes"
|
10
10
|
|
11
11
|
class SqlAlchemyModelBuilder:
|
12
|
+
_model_cache = {} # Local cache for model classes
|
12
13
|
def __init__(self, engine, table_name):
|
13
14
|
"""
|
14
15
|
Initialize the model builder with a database engine and specific table.
|
@@ -21,28 +22,21 @@ class SqlAlchemyModelBuilder:
|
|
21
22
|
self.table_name = table_name
|
22
23
|
self.metadata = MetaData()
|
23
24
|
self.table = None # Placeholder for the specific table
|
25
|
+
self.class_name = self.normalize_class_name(self.table_name)
|
24
26
|
|
25
27
|
def build_model(self) -> type:
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
type: Dynamically generated SQLAlchemy ORM model class.
|
31
|
-
"""
|
32
|
-
# Check if the class is already registered
|
33
|
-
class_name = self.normalize_class_name(self.table_name)
|
34
|
-
mapper_registry = Base.registry
|
35
|
-
if class_name in mapper_registry._class_registry:
|
36
|
-
return mapper_registry._class_registry[class_name]
|
28
|
+
# Check if the model is already registered
|
29
|
+
model = Base.registry._class_registry.get(self.class_name)
|
30
|
+
if model:
|
31
|
+
return model
|
37
32
|
|
38
|
-
# Reflect only the specified table
|
39
33
|
self.metadata.reflect(only=[self.table_name], bind=self.engine)
|
40
34
|
self.table = self.metadata.tables.get(self.table_name)
|
41
|
-
|
42
35
|
if self.table is None:
|
43
36
|
raise ValueError(f"Table '{self.table_name}' does not exist in the database.")
|
44
37
|
|
45
|
-
|
38
|
+
model = self.create_model()
|
39
|
+
return model
|
46
40
|
|
47
41
|
def create_model(self) -> type:
|
48
42
|
"""
|
@@ -52,7 +46,6 @@ class SqlAlchemyModelBuilder:
|
|
52
46
|
type: Dynamically generated SQLAlchemy ORM model class.
|
53
47
|
"""
|
54
48
|
# Normalize the class name from the table name
|
55
|
-
class_name = self.normalize_class_name(self.table_name)
|
56
49
|
columns = self.get_columns(self.table)
|
57
50
|
|
58
51
|
# Define attributes for the model class
|
@@ -66,9 +59,11 @@ class SqlAlchemyModelBuilder:
|
|
66
59
|
# Add columns and relationships to the model
|
67
60
|
attrs.update(columns)
|
68
61
|
#self.add_relationships(attrs, self.table)
|
69
|
-
|
70
|
-
|
71
|
-
|
62
|
+
model = Base.registry._class_registry.get(self.class_name)
|
63
|
+
if not model:
|
64
|
+
model = type(self.class_name, (Base,), attrs)
|
65
|
+
# Add the class to Base.registry so it is registered
|
66
|
+
Base.registry._class_registry[self.class_name] = model
|
72
67
|
return model
|
73
68
|
|
74
69
|
def get_columns(self, table: Table):
|
sibi_dst/utils/_data_wrapper.py
CHANGED
@@ -1,15 +1,12 @@
|
|
1
1
|
import datetime
|
2
2
|
from typing import Type, Any, Dict, Optional
|
3
|
-
|
4
3
|
import fsspec
|
5
4
|
import pandas as pd
|
6
5
|
from IPython.display import display
|
7
|
-
from tqdm import tqdm
|
8
|
-
|
9
6
|
from sibi_dst.utils import Logger
|
7
|
+
from tqdm import tqdm
|
10
8
|
from sibi_dst.utils import ParquetSaver
|
11
9
|
|
12
|
-
|
13
10
|
class DataWrapper:
|
14
11
|
DEFAULT_MAX_AGE_MINUTES = 1440
|
15
12
|
DEFAULT_HISTORY_DAYS_THRESHOLD = 30
|
@@ -46,7 +43,7 @@ class DataWrapper:
|
|
46
43
|
self.reverse_order = reverse_order
|
47
44
|
self.overwrite = overwrite
|
48
45
|
self.ignore_missing = ignore_missing
|
49
|
-
self.logger = logger or Logger.default_logger(logger_name=self.
|
46
|
+
self.logger = logger or Logger.default_logger(logger_name=self.dataclass.__name__)
|
50
47
|
self.max_age_minutes = max_age_minutes
|
51
48
|
self.history_days_threshold = history_days_threshold
|
52
49
|
self.show_progress = show_progress
|
@@ -96,7 +93,7 @@ class DataWrapper:
|
|
96
93
|
|
97
94
|
date_iterator = dates_to_process
|
98
95
|
if self.show_progress:
|
99
|
-
date_iterator = tqdm(date_iterator, desc=description, unit="date")
|
96
|
+
date_iterator = tqdm(date_iterator, desc=f"{description}:{self.dataclass.__name__}", unit="date")
|
100
97
|
|
101
98
|
for current_date in date_iterator:
|
102
99
|
self.process_date(current_date)
|
@@ -113,16 +110,14 @@ class DataWrapper:
|
|
113
110
|
)
|
114
111
|
current_time = datetime.datetime.now(datetime.timezone.utc)
|
115
112
|
file_age_minutes = (current_time - file_modification_datetime).total_seconds() / 60
|
116
|
-
|
117
|
-
|
118
|
-
self.
|
119
|
-
|
120
|
-
f"(threshold: {self.max_age_minutes} minutes)"
|
121
|
-
)
|
113
|
+
self.logger.info(
|
114
|
+
f"File {file_path} is {round(file_age_minutes, 2)} minutes old "
|
115
|
+
f"(threshold: {self.max_age_minutes} minutes)"
|
116
|
+
)
|
122
117
|
|
123
118
|
return file_age_minutes > self.max_age_minutes
|
124
119
|
except FileNotFoundError:
|
125
|
-
return True
|
120
|
+
return True
|
126
121
|
|
127
122
|
def process_date(self, date: datetime.date):
|
128
123
|
"""Process a specific date by regenerating data as necessary."""
|
@@ -130,16 +125,13 @@ class DataWrapper:
|
|
130
125
|
full_parquet_filename = f"{folder}{self.parquet_filename}"
|
131
126
|
|
132
127
|
start_time = datetime.datetime.now()
|
133
|
-
|
134
|
-
if self.verbose:
|
135
|
-
self.logger.debug(f"Processing {full_parquet_filename}...")
|
128
|
+
self.logger.info(f"Processing {full_parquet_filename}...")
|
136
129
|
|
137
130
|
data_object = self.dataclass(**self.class_params)
|
138
131
|
df = data_object.load_period(dt_field=self.date_field, start=date, end=date)
|
139
132
|
|
140
133
|
if len(df.index)==0:
|
141
|
-
|
142
|
-
self.logger.debug("No data found for the specified date.")
|
134
|
+
self.logger.error("No data found for the specified date.")
|
143
135
|
return
|
144
136
|
|
145
137
|
parquet_saver = ParquetSaver(df, folder, self.logger)
|
@@ -147,11 +139,9 @@ class DataWrapper:
|
|
147
139
|
|
148
140
|
end_time = datetime.datetime.now()
|
149
141
|
duration_seconds = (end_time - start_time).total_seconds()
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
|
154
|
-
)
|
142
|
+
self.logger.info(
|
143
|
+
f"Data saved to {full_parquet_filename}. Processing time: {duration_seconds:.2f} seconds"
|
144
|
+
)
|
155
145
|
|
156
146
|
def generate_update_plan_with_conditions(self):
|
157
147
|
"""
|
@@ -167,7 +157,7 @@ class DataWrapper:
|
|
167
157
|
|
168
158
|
date_range = self.generate_date_range()
|
169
159
|
if self.show_progress:
|
170
|
-
date_range = tqdm(date_range, desc=f"Evaluating update plan
|
160
|
+
date_range = tqdm(date_range, desc=f"Evaluating update plan:{self.dataclass.__name__}", unit="date")
|
171
161
|
|
172
162
|
for current_date in date_range:
|
173
163
|
folder = f'{self.data_path}{current_date.year}/{current_date.month:02d}/{current_date.day:02d}/'
|
@@ -203,25 +193,13 @@ class DataWrapper:
|
|
203
193
|
"within_history": within_history,
|
204
194
|
"missing_file": missing_file,
|
205
195
|
"update_required": update_required,
|
206
|
-
"update_category": category
|
196
|
+
"update_category": category,
|
197
|
+
"datawrapper class":self.dataclass.__name__
|
207
198
|
})
|
208
199
|
|
209
200
|
update_plan_table = pd.DataFrame(rows)
|
210
201
|
return update_plan_table
|
211
202
|
|
212
|
-
|
213
|
-
|
214
|
-
# # Usage:
|
215
|
-
# # wrapper = DataWrapper(
|
216
|
-
# # dataclass=YourDataClass,
|
217
|
-
# # date_field="created_at",
|
218
|
-
# # data_path="/path/to/data",
|
219
|
-
# # parquet_filename="data.parquet",
|
220
|
-
# # start_date="2022-01-01",
|
221
|
-
# # end_date="2022-12-31",
|
222
|
-
# # filesystem_type="file",
|
223
|
-
# # verbose=True
|
224
|
-
# # )
|
225
203
|
# # wrapper.process()
|
226
204
|
# # wrapper = DataWrapper(
|
227
205
|
# # dataclass=YourDataClass,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sibi-dst
|
3
|
-
Version: 0.3.
|
3
|
+
Version: 0.3.19
|
4
4
|
Summary: Data Science Toolkit
|
5
5
|
Author: Luis Valverde
|
6
6
|
Author-email: lvalverdeb@gmail.com
|
@@ -40,13 +40,13 @@ Description-Content-Type: text/markdown
|
|
40
40
|
|
41
41
|
Data Science Toolkit
|
42
42
|
---------------------
|
43
|
-
Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, DjangoRestFrameWork
|
43
|
+
Data Science Toolkit built with Python, Pandas, Dask, OpenStreetMaps, Scikit-Learn, XGBOOST, Django ORM, SQLAlchemy, DjangoRestFrameWork
|
44
44
|
|
45
45
|
Major Functionality
|
46
46
|
--------------------
|
47
47
|
1) Build DataCubes, DataSets and DataObjects from different datasources. These include relational databases, parquet files, xlsx, delimited tables, json, json API REST.
|
48
|
-
2)
|
49
|
-
3) Share Data with client applications by write to
|
48
|
+
2) Common dataframe management utilities.
|
49
|
+
3) Share Data with client applications by write to Data Warehouses in local filesystems as well as other supported platforms.
|
50
50
|
4) Build microservices to communicate/share data via API-REST, gRPC.
|
51
51
|
|
52
52
|
|
@@ -1,7 +1,8 @@
|
|
1
1
|
sibi_dst/__init__.py,sha256=1KaC0LYTHxjpENq-NXI325WcEYZ8GCBrHGkLoFxEcu0,251
|
2
|
-
sibi_dst/df_helper/__init__.py,sha256=
|
3
|
-
sibi_dst/df_helper/_df_helper.py,sha256=
|
2
|
+
sibi_dst/df_helper/__init__.py,sha256=rbTr9CqwbJhu8pbZabwfcOqhm-5hm2iXk0vVBtK01bA,231
|
3
|
+
sibi_dst/df_helper/_df_helper.py,sha256=e6e32CRTCKjFVvYMytWTuBVpwB1VcnVQ1T4Rg8KXWvY,13917
|
4
4
|
sibi_dst/df_helper/_parquet_artifact.py,sha256=ctISmwxP9icFCXsELBjbPiz-FK3CEojN7yNIlStdOWw,4974
|
5
|
+
sibi_dst/df_helper/_parquet_reader.py,sha256=A8qWuWQiaiS7pk4sD5EDAvGs-qz7VfziINXpSA7o00U,1683
|
5
6
|
sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
7
|
sibi_dst/df_helper/backends/django/__init__.py,sha256=VkvYql-PUyCKOfoKx5aGdUAki8X-kULfhnCvgSQzHDQ,311
|
7
8
|
sibi_dst/df_helper/backends/django/_django_db_connection.py,sha256=9fGvXXgqPk_TC7BvaSljxTgNDWoXi_LZxKabEgExznM,1654
|
@@ -13,13 +14,13 @@ sibi_dst/df_helper/backends/http/__init__.py,sha256=AG9JSDRyVna2r1yxCQ9HcY32EaGn
|
|
13
14
|
sibi_dst/df_helper/backends/http/_http_config.py,sha256=NN3bol7NgBTDv70yOX7hJkazt1-dAAdFWVkYyHdIXsI,2128
|
14
15
|
sibi_dst/df_helper/backends/parquet/__init__.py,sha256=ClkyIsIh_ovEwqm0dTrkXImbPjLDTVHW2NQqqfQwWAw,187
|
15
16
|
sibi_dst/df_helper/backends/parquet/_parquet_filter_handler.py,sha256=6iFvblnVq0qj89QvieQuYxe_2RPX5ArKfq5zBcEIj90,3660
|
16
|
-
sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=
|
17
|
+
sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=09b9yLPREvx6ebs62B9qEqJt1cCKJz97plGW82i4630,4414
|
17
18
|
sibi_dst/df_helper/backends/sql_alchemy/__init__.py,sha256=FHorj40SbHc0OBzQ_ieG6MG-HLbf0tw6I_5eoIjJkOI,369
|
18
19
|
sibi_dst/df_helper/backends/sql_alchemy/_io_sqlalchemy_dask.py,sha256=YYhjt5rL1yomcrby4i4bD5wPVDzRJpZZbxHp5CM40tQ,5414
|
19
20
|
sibi_dst/df_helper/backends/sql_alchemy/_sqlachemy_filter_handler.py,sha256=KShsLJYGVxN0ps9Wot7fF0nR0wW9WzcPIcWZ9f5vdBo,4654
|
20
21
|
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_db_connection.py,sha256=HtMsfH5com4dLVJxh3wdMUpQI3mz0cKDJz0CmFS2S8U,1648
|
21
22
|
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_load_from_db.py,sha256=QkR-_S4zqJpwH9dJ5cqXW8iy9XoAFUXmcsgUSm3PbLo,2251
|
22
|
-
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py,sha256=
|
23
|
+
sibi_dst/df_helper/backends/sql_alchemy/_sqlalchemy_model_builder.py,sha256=RjtKEk-i8EmX98rwqkq1Bg7IgPwYDduL967gsl9T73c,4401
|
23
24
|
sibi_dst/df_helper/backends/sql_model/__init__.py,sha256=MXd4OOdTqR4cENSV733SGodPO6eQMCexANs-3w0qL5U,226
|
24
25
|
sibi_dst/df_helper/backends/sql_model/_sqlmodel_db_connection.py,sha256=6jmMjKIv5Btysj3kZMaXQ98IqKQkhnOC-JWtb1B8rus,4265
|
25
26
|
sibi_dst/df_helper/backends/sql_model/_sqlmodel_load_from_db.py,sha256=jYwkIz7_E9Z6Mqw1a9TCWKWD146Tbx7mcQFxIpmKgKU,3686
|
@@ -33,7 +34,7 @@ sibi_dst/utils/_airflow_manager.py,sha256=rlt3eolR5QvtxWhAtBTCpHXvxftnKM-ibPMv3f
|
|
33
34
|
sibi_dst/utils/_clickhouse_writer.py,sha256=JcnWN2635ATCOaFiB6NYglNXDwqKw0jC7Urs9WOZE20,8571
|
34
35
|
sibi_dst/utils/_credentials.py,sha256=8i6z7y3y5S-6mSk4xrT2AwhzCA32mTn1n1iYX9IVyHk,1724
|
35
36
|
sibi_dst/utils/_data_utils.py,sha256=ch4j5FEs8ZnniUzpbeLO-b4Yco_6nwCu71xHaVqMGi4,7050
|
36
|
-
sibi_dst/utils/_data_wrapper.py,sha256=
|
37
|
+
sibi_dst/utils/_data_wrapper.py,sha256=cvUkGRiPfCyLD4XcoX7FWLYzM8gnHBGR1pJ08PMneCk,9010
|
37
38
|
sibi_dst/utils/_date_utils.py,sha256=KYB07puKDrSG8tOm_i1HGX0TjLNUtSWjwfsCYBmW9co,10619
|
38
39
|
sibi_dst/utils/_df_utils.py,sha256=pjEfkof9hggXQgYerG0p4DXrwBeIRynJFg4IX3Yrb4c,10919
|
39
40
|
sibi_dst/utils/_file_utils.py,sha256=5EN90c8N1n9d-_xwz2RzaYcXRMQY_rws2Q3EA3pNAog,1254
|
@@ -41,6 +42,6 @@ sibi_dst/utils/_filepath_generator.py,sha256=ytPSZ9GYOnnSP25zwA-0NjFHupPRZyXwixW
|
|
41
42
|
sibi_dst/utils/_log_utils.py,sha256=rPp8z1UglwvqzBOOAvMOct0syQZ-54gGYafnJDRYZN4,2313
|
42
43
|
sibi_dst/utils/_parquet_saver.py,sha256=3BK0XXgMOOAdIw4OzbwMxmDrzDw3_MKi8RTpulIVUe0,4367
|
43
44
|
sibi_dst/utils/_storage_manager.py,sha256=KP2HBXnLUMMquqcO30ecfuoU7g1z8RtaV3Dv0TvEXoY,3856
|
44
|
-
sibi_dst-0.3.
|
45
|
-
sibi_dst-0.3.
|
46
|
-
sibi_dst-0.3.
|
45
|
+
sibi_dst-0.3.19.dist-info/METADATA,sha256=IDeMqZZHRsAV-v5TngSTKaB7y7SQhMjfEduHozqhOsk,2134
|
46
|
+
sibi_dst-0.3.19.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
|
47
|
+
sibi_dst-0.3.19.dist-info/RECORD,,
|
File without changes
|