sibi-dst 2025.1.4 (sibi_dst-2025.1.4-py3-none-any.whl) → 2025.1.5 (sibi_dst-2025.1.5-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +4 -1
- sibi_dst/df_helper/__init__.py +2 -2
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +355 -163
- sibi_dst/df_helper/_df_helper.py +47 -30
- sibi_dst/df_helper/_parquet_artifact.py +57 -47
- sibi_dst/df_helper/_parquet_reader.py +9 -13
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +15 -11
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +23 -16
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +17 -11
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +1 -103
- sibi_dst/utils/__init__.py +3 -2
- sibi_dst/utils/base.py +97 -0
- sibi_dst/utils/clickhouse_writer.py +5 -4
- sibi_dst/utils/data_wrapper.py +69 -84
- sibi_dst/utils/date_utils.py +2 -1
- sibi_dst/utils/log_utils.py +309 -77
- sibi_dst/utils/manifest_manager.py +94 -373
- sibi_dst/utils/parquet_saver.py +98 -173
- sibi_dst/utils/storage_config.py +6 -0
- sibi_dst/utils/storage_manager.py +2 -1
- sibi_dst/utils/update_planner.py +72 -22
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/METADATA +2 -1
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/RECORD +24 -27
- sibi_dst/v3/__init__.py +0 -0
- sibi_dst/v3/backends/__init__.py +0 -0
- sibi_dst/v3/df_helper/__init__.py +0 -0
- sibi_dst/v3/df_helper/_df_helper.py +0 -91
- {sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/WHEEL +0 -0
sibi_dst/utils/parquet_saver.py
CHANGED
@@ -1,196 +1,121 @@
-import base64
-import hashlib
 import logging
-import threading
 import warnings
 from typing import Optional
 
+import dask.dataframe as dd
 import pyarrow as pa
+from fsspec import AbstractFileSystem
 
-
-warnings.filterwarnings("ignore")
-from .log_utils import Logger
+from . import ManagedResource
 
+warnings.filterwarnings("ignore", message="Passing 'overwrite=True' to to_parquet is deprecated")
 
-… (9 removed lines not legible in this extract)
+
+class ParquetSaver(ManagedResource):
+    """
+    Saves Dask DataFrames to Parquet, with a workaround for S3-compatible
+    storage that fails on batch delete operations.
+    """
+
+    def __init__(
+        self,
+        df_result: dd.DataFrame,
+        parquet_storage_path: str,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
         self.df_result = df_result
         self.parquet_storage_path = parquet_storage_path.rstrip("/")
-        self.debug = debug
-        self.logger = logger or Logger.default_logger(
-        self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
-        self.fs = fs
-… (4 removed lines not legible in this extract)
+        #self.debug = debug
+        #self.logger = logger or Logger.default_logger(self.__class__.__name__)
+        #self.logger.set_level(logging.DEBUG if self.debug else logging.INFO)
+        #self.fs = fs
+        # Determine protocol for special handling (e.g., 's3')
+        if not self.fs:
+            raise ValueError("File system (fs) must be provided to ParquetSaver.")
+
+        self.protocol = "file"
+        if "://" in self.parquet_storage_path:
+            self.protocol = self.parquet_storage_path.split(":", 1)[0]
+
+    def save_to_parquet(self, output_directory_name: str = "default_output", overwrite: bool = True):
         """
-… (3 removed docstring lines not legible in this extract)
+        Saves the DataFrame to a Parquet dataset.
+
+        If overwrite is True, it manually clears the destination directory before
+        writing to avoid issues with certain S3-compatible storage providers.
         """
-
-        full_path = self._construct_full_path(parquet_filename)
-        self.logger.info(f"Save method for :{full_path}")
-        # Ensure directory exists and clear if necessary
-        self._ensure_directory_exists(full_path, clear_existing=clear_existing)
-
-        # Define schema and save DataFrame to Parquet
-        schema = self._define_schema()
-        self._convert_dtypes(schema)
-        self._save_dataframe_to_parquet(full_path, schema)
-        # Close the filesystem if the close method exists
-        if hasattr(self.fs, 'close') and callable(getattr(self.fs, 'close', None)):
-            self.fs.close()
+        full_path = f"{self.parquet_storage_path}/{output_directory_name}"
 
-… (3 removed lines not legible in this extract)
-            "object": pa.string(),
-            "string": pa.string(),
-            "Int64": pa.int64(),
-            "int64": pa.int64(),
-            "float64": pa.float64(),
-            "float32": pa.float32(),
-            "bool": pa.bool_(),
-            "boolean": pa.bool_(),  # pandas nullable boolean
-            "datetime64[ns]": pa.timestamp("ns"),
-            "timedelta[ns]": pa.duration("ns"),
-        }
+        if overwrite and self.fs and self.fs.exists(full_path):
+            self.logger.info(f"Overwrite is True, clearing destination path: {full_path}")
+            self._clear_directory_safely(full_path)
 
-
+        # Ensure the base directory exists after clearing
+        self.fs.mkdirs(full_path, exist_ok=True)
 
-… (5 removed lines not legible in this extract)
+        schema = self._define_schema()
+        self.logger.info(f"Saving DataFrame to Parquet dataset at: {full_path}")
+        self.df_result = self.df_result.persist()
+        try:
+            # We call to_parquet with overwrite=False because we have already
+            # handled the directory clearing manually.
+            self.df_result.to_parquet(
+                path=full_path,
+                engine="pyarrow",
+                schema=schema,
+                overwrite=False,
+                filesystem=self.fs,
+                write_index=False,
+                compute=True,  # Use compute=True over persisted ddf for immediate execution.
+            )
+            self.logger.info(f"Successfully saved Parquet dataset to: {full_path}")
+        except Exception as e:
+            self.logger.error(f"Failed to save Parquet dataset to {full_path}: {e}")
+            raise
 
-    def _convert_dtypes(self, schema):
-        """Convert DataFrame columns to match the specified schema."""
-        dtype_mapping = {}
-        for field in schema:
-            col_name = field.name
-            if col_name in self.df_result.columns:
-                if pa.types.is_string(field.type):
-                    dtype_mapping[col_name] = "string"
-                elif pa.types.is_int64(field.type):
-                    dtype_mapping[col_name] = "Int64"
-                elif pa.types.is_float64(field.type):
-                    dtype_mapping[col_name] = "float64"
-                elif pa.types.is_float32(field.type):
-                    dtype_mapping[col_name] = "float32"
-                elif pa.types.is_boolean(field.type):
-                    dtype_mapping[col_name] = "boolean"
-                elif pa.types.is_timestamp(field.type):
-                    dtype_mapping[col_name] = "datetime64[ns]"
-                else:
-                    dtype_mapping[col_name] = "object"
-        self.df_result = self.df_result.astype(dtype_mapping)
-
-    def _construct_full_path(self, parquet_filename: Optional[str]) -> str:
-        """Construct and return the full path for the Parquet file."""
-        parquet_filename = parquet_filename or "default.parquet"
-        return f"{self.parquet_storage_path}/{parquet_filename}"
-
-    def _ensure_directory_exists(self, full_path: str, clear_existing=False):
+    def _clear_directory_safely(self, directory: str):
         """
-… (3 removed docstring lines not legible in this extract)
+        Clears the contents of a directory robustly.
+        - For S3, it deletes files one-by-one to bypass the 'MissingContentMD5' error.
+        - For other filesystems, it uses the standard recursive remove.
         """
-… (4 removed lines not legible in this extract)
-        self._clear_directory(directory)
-
-        if not self.fs.exists(directory):
-            self.logger.info(f"Creating directory: {directory}")
-            self.fs.mkdirs(directory, exist_ok=True)
-
-    def _save_dataframe_to_parquet(self, full_path: str, schema: pa.Schema):
-        """Save the DataFrame to Parquet using the specified schema."""
-        #self._clear_directory(full_path)
-        self.logger.info(f"Saving/Overwrite Parquet file to: {full_path}")
-        self.df_result.to_parquet(
-            path=full_path,
-            engine="pyarrow",
-            schema=schema,
-            #overwrite=True,
-            filesystem=self.fs,
-            write_index=False,
-        )
-
-    def _clear_directory(self, directory: str):
-        """
-        Clears the specified directory by removing all the files within it. If the
-        directory is not associated with the "s3" protocol, it will be removed using
-        the local filesystem's functionality. For directories using the "s3" protocol,
-        the bucket name and prefix are parsed, and files are deleted using the S3
-        client's `delete_objects` method.
-
-        :param directory: The directory path to clear. If the protocol is not "s3", it
-                          represents a local filesystem path. Otherwise, it is assumed
-                          to be an S3 path (e.g., "s3://bucket_name/prefix").
-        :type directory: str
-        """
-        if self.protocol != "s3":
-            if self.fs.exists(directory):
-                self.logger.info(f"Clearing directory: {directory}")
-                self.fs.rm(directory, recursive=True)
-            return
-        # Parse bucket name and prefix
-        bucket_name, prefix = self._parse_s3_path(directory)
-
-        # List files in the directory
-        files = self.fs.ls(directory, detail=True)
-        if not files:
-            self.logger.info(f"No files to delete in directory: {directory}")
-            return
-
-        # Generate the delete payload
-        objects_to_delete = [{"Key": file["name"].replace(f"{bucket_name}/", "", 1)} for file in files]
-        delete_payload = {
-            "Objects": objects_to_delete,
-            "Quiet": True,
-        }
-
-        # Calculate Content-MD5
-        payload_bytes = str(delete_payload).encode("utf-8")
-        md5_hash = hashlib.md5(payload_bytes).digest()
-        content_md5 = base64.b64encode(md5_hash).decode("utf-8")
-
-        # Use the underlying s3 client to delete objects
-        try:
-            self.fs.s3.delete_objects(
-                Bucket=bucket_name,
-                Delete=delete_payload,
-                ContentMD5=content_md5,
+        if self.protocol == "s3":
+            self.logger.warning(
+                "Using single-file S3 deletion for compatibility. "
+                "This may be slow for directories with many files."
             )
-… (4 removed lines not legible in this extract)
+            # Glob all contents (files and subdirs) and delete them individually.
+            # Calling fs.rm() on a single file path should trigger a single
+            # DeleteObject call, avoiding the faulty batch operation.
+            # We sort by length descending to delete contents of subdirectories first.
+            all_paths = self.fs.glob(f"{directory}/**")
+            paths_to_delete = sorted([p for p in all_paths if p != directory], key=len, reverse=True)
+
+            for path in paths_to_delete:
+                self.logger.debug(f"Deleting: {path}")
+                self.fs.rm_file(path)
+        else:
+            # Standard, fast deletion for other filesystems (local, etc.)
+            self.fs.rm(directory, recursive=True)
 
-
-    def _parse_s3_path(s3_path: str):
+    def _define_schema(self) -> pa.Schema:
         """
-
-        :param s3_path: Full S3 path (e.g., s3://bucket-name/path/).
-        :return: Tuple of bucket name and prefix.
+        Defines a PyArrow schema dynamically based on DataFrame's column types.
         """
-… (6 removed lines not legible in this extract)
+        pandas_dtype_to_pa = {
+            "object": pa.string(), "string": pa.string(),
+            "int64": pa.int64(), "Int64": pa.int64(),
+            "int32": pa.int32(), "Int32": pa.int32(),
+            "float64": pa.float64(), "float32": pa.float32(),
+            "bool": pa.bool_(), "boolean": pa.bool_(),
+            "datetime64[ns]": pa.timestamp("ns"),
+            "datetime64[ns, UTC]": pa.timestamp("ns", tz="UTC"),
+            "category": pa.string(),
+        }
+        fields = [
+            pa.field(c, pandas_dtype_to_pa.get(str(d), pa.string()))
+            for c, d in self.df_result.dtypes.items()
+        ]
+        return pa.schema(fields)
 
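The S3 branch above trades speed for compatibility: one DeleteObject request per key instead of the batch delete that some S3-compatible stores reject. A minimal standalone sketch of the same pattern, runnable against a local filesystem (the function name and paths are illustrative, not part of the sibi_dst API):

    import fsspec

    def clear_directory_safely(fs: fsspec.AbstractFileSystem, directory: str, protocol: str) -> None:
        if protocol == "s3":
            # One DeleteObject call per key sidesteps the batch delete_objects
            # request that triggers 'MissingContentMD5' on some providers.
            all_paths = fs.glob(f"{directory}/**")
            # Longest paths first, so subdirectory contents go before their parents.
            for path in sorted((p for p in all_paths if p != directory), key=len, reverse=True):
                fs.rm_file(path)
        else:
            fs.rm(directory, recursive=True)

    # Local smoke test using the non-S3 branch:
    fs = fsspec.filesystem("file")
    fs.mkdirs("/tmp/demo_dataset/part=1", exist_ok=True)
    fs.touch("/tmp/demo_dataset/part=1/data.parquet")
    clear_directory_safely(fs, "/tmp/demo_dataset", protocol="file")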
sibi_dst/utils/storage_config.py
CHANGED
@@ -31,6 +31,12 @@ class StorageConfig:
                 "use_listings_cache": False,
                 "client_kwargs": {
                     "endpoint_url": self.conf.get('fs_endpoint')
+                },
+                "config_kwargs": {
+                    "signature_version": "s3v4",
+                    's3': {
+                        'addressing_style': 'path'
+                    }
                 }
             }
         elif self.filesystem_type == "webdav":
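The new config_kwargs block feeds botocore's Config object when s3fs builds its client. A sketch of the equivalent explicit construction, with placeholder endpoint and credentials (signature v4 and path-style addressing are what MinIO-style endpoints typically require):

    import fsspec

    fs = fsspec.filesystem(
        "s3",
        key="ACCESS_KEY",        # placeholder
        secret="SECRET_KEY",     # placeholder
        use_listings_cache=False,
        client_kwargs={"endpoint_url": "http://minio.local:9000"},  # placeholder endpoint
        config_kwargs={
            "signature_version": "s3v4",
            "s3": {"addressing_style": "path"},  # bucket in the URL path, not the hostname
        },
    )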
sibi_dst/utils/storage_manager.py
CHANGED
@@ -20,6 +20,7 @@ class StorageManager:
             self._initialize_webdav()
         else:
             self.fs = fsspec.filesystem(fs_type, **self.fs_options)
+
         self.depot_paths = {}
         self.depot_name = None
 
@@ -161,4 +162,4 @@ class StorageManager:
             self.webdav_client.download_file(remote_path, local_path)
         else:
             # Use fsspec's get method for other filesystems
-            self.fs.get(remote_path, local_path)
+            self.fs.get(remote_path, local_path)
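For the non-WebDAV branch above, fs.get is the generic fsspec download; a tiny sketch with hypothetical paths:

    import fsspec

    fs = fsspec.filesystem("file")  # stands in for any non-WebDAV fsspec filesystem
    fs.touch("/tmp/remote_report.parquet")  # pretend this is the remote file
    fs.get("/tmp/remote_report.parquet", "/tmp/local_report.parquet")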
sibi_dst/utils/update_planner.py
CHANGED
@@ -1,13 +1,38 @@
 import datetime
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import List, Optional, Dict, Union, Tuple, Set
+from typing import List, Optional, Dict, Union, Tuple, Set, Iterator
 import pandas as pd
-import fsspec
-from sibi_dst.utils import Logger
 from .date_utils import FileAgeChecker
+from pydantic import BaseModel, Field
+from rich.console import Console
+from rich.table import Table
 
+from sibi_dst.utils import ManagedResource
 
-class UpdatePlanner:
+
+class UpdateConfig(BaseModel):
+    """
+    A unified Pydantic model for the data update process configuration.
+    Acts as a single source of truth for all settings.
+    """
+    overwrite: bool = False
+    reverse_order: bool = True
+    ignore_missing: bool = False
+    history_days_threshold: int = 30
+    max_age_minutes: int = 1440  # 24 hours
+    show_progress: bool = False
+    verbose: bool = False
+    debug: bool = False
+    start_date: datetime.date
+    end_date: datetime.date
+    custom_priority_map: Optional[Dict[str, int]] = None
+    max_threads: int = 3
+    timeout: float = 30.0
+
+    class Config:
+        arbitrary_types_allowed = True
+
+class UpdatePlanner(ManagedResource):
     """
     A utility class to scan a date-partitioned filesystem and
     generate an update plan indicating which dates need processing.
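For reference, the new UpdateConfig model would be instantiated like this (a minimal sketch assuming the module path shown in this diff; the field values are illustrative, and start_date/end_date are required because they have no defaults):

    import datetime

    from sibi_dst.utils.update_planner import UpdateConfig  # module path per this diff

    cfg = UpdateConfig(
        start_date=datetime.date(2025, 1, 1),
        end_date=datetime.date(2025, 1, 31),
        overwrite=True,
        max_threads=4,
    )
    print(cfg.model_dump())  # pydantic v2 API; METADATA pins pydantic >=2.11.7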
@@ -47,9 +72,9 @@ class UpdatePlanner:
         data_path: str,
         filename: str,
         description: str = "Update Planner",
-        fs: Optional[fsspec.AbstractFileSystem] = None,
-        filesystem_type: str = "file",
-        filesystem_options: Optional[Dict] = None,
+        #fs: Optional[fsspec.AbstractFileSystem] = None,
+        #filesystem_type: str = "file",
+        #filesystem_options: Optional[Dict] = None,
         reference_date: Union[str, datetime.date] = None,
         history_days_threshold: int = DEFAULT_HISTORY_DAYS_THRESHOLD,
         max_age_minutes: int = DEFAULT_MAX_AGE_MINUTES,
@@ -58,12 +83,11 @@ class UpdatePlanner:
         custom_priority_map: Optional[Dict[str, int]] = None,
         reverse_order: bool = False,
         show_progress: bool = False,
-        verbose: bool = False,
-        debug: bool = False,
-        logger: Optional[Logger] = None,
         skipped: Optional[List[str]] = None,
+        **kwargs
     ):
         # Initialize state
+        super().__init__(**kwargs)
         self.plan: pd.DataFrame = pd.DataFrame()
         self.df_req: pd.DataFrame = pd.DataFrame()
         self.description = description
@@ -71,14 +95,7 @@ class UpdatePlanner:
         self.filename = filename
         self.reverse_order = reverse_order
         self.show_progress = show_progress
-        self.
-        self.logger.set_level(Logger.DEBUG if debug else Logger.INFO)
-        self.debug = debug
-        self.verbose = verbose
-
-        # Filesystem and age helper
-        self.fs = fs or fsspec.filesystem(filesystem_type, **(filesystem_options or {}))
-        self.age_checker = FileAgeChecker(debug=debug, logger=self.logger)
+        self.age_checker = FileAgeChecker(debug=self.debug, logger=self.logger)
 
         # Normalize reference date
         if reference_date is None:
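This release moves logger/fs/debug wiring into the new ManagedResource base (sibi_dst/utils/base.py, +97 lines, not shown in this extract). The call sites here only reveal that subclasses forward **kwargs and then read self.fs, self.logger and self.debug, so the following is a hypothetical reconstruction of that contract, not the actual base class:

    import logging
    from typing import Optional

    import fsspec

    class ManagedResource:
        """Hypothetical sketch: accepts fs/logger/debug via **kwargs and exposes them."""
        def __init__(
            self,
            fs: Optional[fsspec.AbstractFileSystem] = None,
            logger: Optional[logging.Logger] = None,
            debug: bool = False,
            verbose: bool = False,
            **_: object,
        ) -> None:
            self.fs = fs
            self.debug = debug
            self.verbose = verbose
            self.logger = logger or logging.getLogger(self.__class__.__name__)
            self.logger.setLevel(logging.DEBUG if debug else logging.INFO)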
@@ -127,8 +144,12 @@ class UpdatePlanner:
         )
         for future in iterator:
             d = futures[future]
-… (2 removed lines not legible in this extract)
+            try:
+                exists, age = future.result()
+                rows.append(self._make_row(d, history_start, exists, age))
+            except Exception as exc:
+                self.logger.error(f"Error processing date {d}: {exc}")
+                rows.append(self._make_row(d, history_start, False, None))
 
         df = pd.DataFrame(rows)
         df = df.sort_values(
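The try/except added above keeps one failed probe from aborting the whole scan. The same pattern in isolation (check_date and its return shape are illustrative):

    import datetime
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def check_date(d: datetime.date) -> tuple:
        if d.day == 2:
            raise RuntimeError("simulated probe failure")
        return True, 12.5  # (exists, file age in minutes)

    dates = [datetime.date(2025, 1, n) for n in (1, 2, 3)]
    rows = []
    with ThreadPoolExecutor(max_workers=3) as pool:
        futures = {pool.submit(check_date, d): d for d in dates}
        for future in as_completed(futures):
            d = futures[future]
            try:
                exists, age = future.result()
                rows.append((d, exists, age))
            except Exception as exc:
                rows.append((d, False, None))  # keep the date, marked as missing
    print(rows)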
@@ -169,8 +190,37 @@ class UpdatePlanner:
         if self.plan.empty:
             self.logger.warning("No update plan available. Call generate_plan() first.")
             return
-… (2 removed lines not legible in this extract)
+
+        console = Console(record=True)
+
+        table = Table(title=f"Update Plan for {self.data_path}", show_header=True, header_style="bold magenta")
+        for column in self.plan.columns:
+            table.add_column(column, justify="left")
+        for _, row in self.plan.iterrows():
+            table.add_row(*(str(item) for item in row))
+
+        console.print(table)
+
+        plan_string = console.export_text()
+
+        self.logger.info(f"Full Update Plan:\n{plan_string}")
+
+    def get_tasks_by_priority(self) -> Iterator[Tuple[int, List[datetime.date]]]:
+        """Yields batches of dates to be processed, grouped and sorted by priority."""
+        if self.plan.empty:
+            return
+
+        required_updates = self.plan[self.plan['update_required']].copy()
+        if required_updates.empty:
+            return
+
+        for priority in sorted(required_updates["update_priority"].unique()):
+            dates_df = required_updates[required_updates["update_priority"] == priority]
+            # Sort dates within the priority group
+            sorted_dates = dates_df.sort_values(by="date", ascending=not self.reverse_order)
+            dates = sorted_dates["date"].tolist()
+            if dates:
+                yield priority, dates
 
     def _get_file_status(
         self,
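A standalone sketch combining the two additions above — rendering a plan with rich and batching dates by priority — over a toy DataFrame with the column names the planner uses:

    import datetime

    import pandas as pd
    from rich.console import Console
    from rich.table import Table

    plan = pd.DataFrame({
        "date": [datetime.date(2025, 1, d) for d in (1, 2, 3)],
        "update_required": [True, False, True],
        "update_priority": [2, 1, 1],
    })

    console = Console(record=True)
    table = Table(title="Update Plan", show_header=True, header_style="bold magenta")
    for column in plan.columns:
        table.add_column(column, justify="left")
    for _, row in plan.iterrows():
        table.add_row(*(str(item) for item in row))
    console.print(table)
    print(console.export_text())  # the string the planner hands to its logger

    required = plan[plan["update_required"]]
    for priority in sorted(required["update_priority"].unique()):
        dates = required.loc[required["update_priority"] == priority, "date"].tolist()
        print(priority, dates)  # 1 [2025-01-03], then 2 [2025-01-01]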
{sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sibi-dst
-Version: 2025.1.4
+Version: 2025.1.5
 Summary: Data Science Toolkit
 Author: Luis Valverde
 Author-email: lvalverdeb@gmail.com
@@ -18,6 +18,7 @@ Requires-Dist: psycopg2 (>=2.9.10,<3.0.0)
 Requires-Dist: pyarrow (>=20.0.0,<21.0.0)
 Requires-Dist: pydantic (>=2.11.7,<3.0.0)
 Requires-Dist: pymysql (>=1.1.1,<2.0.0)
+Requires-Dist: rich (>=14.0.0,<15.0.0)
 Requires-Dist: s3fs (>=2025.5.1,<2026.0.0)
 Requires-Dist: sqlalchemy (>=2.0.41,<3.0.0)
 Requires-Dist: tqdm (>=4.67.1,<5.0.0)
{sibi_dst-2025.1.4.dist-info → sibi_dst-2025.1.5.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-sibi_dst/__init__.py,sha256=
-sibi_dst/df_helper/__init__.py,sha256=
-sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256
-sibi_dst/df_helper/_df_helper.py,sha256=
-sibi_dst/df_helper/_parquet_artifact.py,sha256=
-sibi_dst/df_helper/_parquet_reader.py,sha256=
+sibi_dst/__init__.py,sha256=j8lZpGCJlxlLgEgeIMxZnWdqJ0g3MCs7-gsnbvPn_KY,285
+sibi_dst/df_helper/__init__.py,sha256=VJE1qvKO-7QsFADZxSY5s4LVoWnPKfz0rP3nYO2ljhA,358
+sibi_dst/df_helper/_artifact_updater_multi_wrapper.py,sha256=LvFCNr4VKFV-b-NS_TeRkaoKsM4tdsPtAgSIwMvKgGA,18043
+sibi_dst/df_helper/_df_helper.py,sha256=iBoWz2iVgLzQ3hA1EwllL62dkraKamRx2sXseu30FVI,11914
+sibi_dst/df_helper/_parquet_artifact.py,sha256=vDZOtSugBuWuZ3W6l2Y7IBO6RohIrA_sVNuPHp8e8h0,15438
+sibi_dst/df_helper/_parquet_reader.py,sha256=o5ijxHtD1EMzUUD9e6PIoGMeuSLHDItvZIouGfVZhgA,3817
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
@@ -11,10 +11,10 @@ sibi_dst/df_helper/backends/parquet/__init__.py,sha256=esWJ9aSuYC26d-T01z9dPrJ1u
 sibi_dst/df_helper/backends/parquet/_filter_handler.py,sha256=TvDf0RXta7mwJv11GNQttYJsXgFf2XDj4oLIjt4xTzA,5219
 sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=TaU5_wG1Y3lQC8DVCItVvMnc6ZJmECLu3avssVEMbaM,10591
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
-sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=
-sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=
-sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=
-sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=
+sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=Rsvh1nfVtqzfMhv968vNTYYIqVxYsEs4PB-O5CTSYdk,10935
+sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py,sha256=NqBSHqeYv_1vHt6J0tez0GdMwKrP_sIRcXYXu869ZkY,13313
+sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py,sha256=ibxeVqpIEsSVusP2bgcd1MNV_wJIoNgXwacltUbwTas,3194
+sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py,sha256=d_-ip-dQnWOlM8btCjoywAXpaiSuN6AaavkTGJsVQfY,3576
 sibi_dst/df_helper/core/__init__.py,sha256=LfmTqFh6GUZup-g95bcXgAxX7J5Hkve7ftLE_CJg_AE,409
 sibi_dst/df_helper/core/_defaults.py,sha256=9UMEMu2wXznO5UzEhnQ82f_ZazZ20JRyRXIi3HP3gDw,4043
 sibi_dst/df_helper/core/_filter_handler.py,sha256=Pmbzygry2mpkNPVS7DBMulHpAb1yYZNFqUU0bJTWJF0,11214
@@ -32,23 +32,24 @@ sibi_dst/osmnx_helper/basemaps/router_plotter.py,sha256=UAiijn-J-jjX4YnL0_P9SFqT
 sibi_dst/osmnx_helper/utils.py,sha256=BzuY8CtYnBAAO8UAr_M7EOk6CP1zcifNLs8pkdFZEFg,20577
 sibi_dst/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUHMLFB_PqqUq7k,3336
-sibi_dst/utils/__init__.py,sha256=
-sibi_dst/utils/
+sibi_dst/utils/__init__.py,sha256=PQsG188_lnqgSFljkCc15Nyv933HnvmQ7XYs02m77Vc,1217
+sibi_dst/utils/base.py,sha256=MBshlQKg-WNeTeuE_aMLQjcBRaa4O_TFED7bVKDhJ-o,3783
+sibi_dst/utils/clickhouse_writer.py,sha256=5XDRjXU9d0Vhb9GFdCiRoNXrucJvTm6h8auCAQbEwW0,9917
 sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
 sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
 sibi_dst/utils/data_utils.py,sha256=MqbwXk33BuANWeKKmsabHouhb8GZswSmbM-VetWWE-M,10357
-sibi_dst/utils/data_wrapper.py,sha256=
-sibi_dst/utils/date_utils.py,sha256=
+sibi_dst/utils/data_wrapper.py,sha256=Vx3At8SlAoMCTaXmVsTiTGynfjV2isB9W6yL0cWZ7g4,9346
+sibi_dst/utils/date_utils.py,sha256=8fwPpOYqSdM3nHeNykh7Ftk-uPdFa44cEAy5S8iUNw4,18667
 sibi_dst/utils/df_utils.py,sha256=TzIAUCLbgOn3bvCFvzkc1S9YU-OlZTImdCj-88dtg8g,11401
 sibi_dst/utils/file_utils.py,sha256=Z99CZ_4nPDIaZqbCfzzUDfAYJjSudWDj-mwEO8grhbc,1253
 sibi_dst/utils/filepath_generator.py,sha256=-HHO0U-PR8fysDDFwnWdHRlgqksh_RkmgBZLWv9hM7s,6669
-sibi_dst/utils/log_utils.py,sha256=
-sibi_dst/utils/manifest_manager.py,sha256=
-sibi_dst/utils/parquet_saver.py,sha256=
+sibi_dst/utils/log_utils.py,sha256=_YnpCnMcjT--ou3BU0EGJma1xMULrA4V5v5UU4IbjAo,14102
+sibi_dst/utils/manifest_manager.py,sha256=Rw7i2phoKJjGlPHYLg_1kr40syVKxd9LJEmfxvZPeDg,8544
+sibi_dst/utils/parquet_saver.py,sha256=zau_s0Mn2ccz9ivVtRbTkBmCghUgCofI1LsCdy1dfDw,4943
 sibi_dst/utils/phone_formatter.py,sha256=tsVTDamuthFYgy4-5UwmQkPQ-FGTGH7MjZyH8utAkIY,4945
-sibi_dst/utils/storage_config.py,sha256=
-sibi_dst/utils/storage_manager.py,sha256=
-sibi_dst/utils/update_planner.py,sha256=
+sibi_dst/utils/storage_config.py,sha256=uaCBF8rgCeYkk-lxVSCjsic8O8HJKAu455MR-OBliCo,4325
+sibi_dst/utils/storage_manager.py,sha256=yyZqT8XjTf4MKFrfznCmxXxOYz_TiWgtQhzqPoXR9So,6569
+sibi_dst/utils/update_planner.py,sha256=2ZVsuZlghKDRv7IhqaraS-7GRIY6nGRpFnpBdXYo7Io,11538
 sibi_dst/utils/webdav_client.py,sha256=pYF1UsGOuxYeGLq7aBfwZFvkvD4meOcbbaiZ4d6GW9I,7107
 sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/v2/df_helper/__init__.py,sha256=XuH6jKYAPg2DdRbsxxBSxp9X3x-ARyaT0xe27uILrVo,99
@@ -70,10 +71,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst/v3/__init__.py,sha256=
-sibi_dst/v3/backends/__init__.py,sha256=
-sibi_dst/v3/df_helper/__init__.py,sha256=
-sibi_dst/v3/df_helper/_df_helper.py,sha256=NKIQ4Y-Tn-e841sbZxzLh3Q071_Zo9Vu4y3OAXcsO98,3900
-sibi_dst-2025.1.4.dist-info/METADATA,sha256=1YbANTdPXYdETuWFKKldnB237pYywn7rw53Oww3NiC0,2459
-sibi_dst-2025.1.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-sibi_dst-2025.1.4.dist-info/RECORD,,
+sibi_dst-2025.1.5.dist-info/METADATA,sha256=T0zFKtNSQ7if1S590EwTZ_CN96oiDe8t559-xFQ-XWM,2498
+sibi_dst-2025.1.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.1.5.dist-info/RECORD,,
sibi_dst/v3/__init__.py
DELETED
File without changes

sibi_dst/v3/backends/__init__.py
DELETED
File without changes

sibi_dst/v3/df_helper/__init__.py
DELETED
File without changes