sibi-dst 2025.8.4__py3-none-any.whl → 2025.8.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +8 -6
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +56 -15
- sibi_dst/utils/clickhouse_writer.py +4 -241
- sibi_dst/utils/storage_config.py +2 -2
- sibi_dst/utils/storage_manager.py +3 -2
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.6.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.6.dist-info}/RECORD +8 -8
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.6.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -182,18 +182,20 @@ class DfHelper(ManagedResource):
|
|
182
182
|
return model(**model_kwargs)
|
183
183
|
|
184
184
|
# ---------- load/aload ----------
|
185
|
-
def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
185
|
+
def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
186
186
|
self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
|
187
187
|
self.total_records, df = self.backend_strategy.load(**options)
|
188
188
|
df = self._process_loaded_data(df)
|
189
189
|
df = self._post_process_df(df)
|
190
|
-
self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
|
190
|
+
#self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
|
191
|
+
df = df.persist() if persist else df
|
191
192
|
return df.compute() if as_pandas else df
|
192
193
|
|
193
|
-
async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
194
|
+
async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
|
194
195
|
self.total_records, df = await self.backend_strategy.aload(**options)
|
195
196
|
df = self._process_loaded_data(df)
|
196
197
|
df = self._post_process_df(df)
|
198
|
+
df = df.persist() if persist else df
|
197
199
|
return df.compute() if as_pandas else df
|
198
200
|
|
199
201
|
# ---------- dataframe post-processing ----------
|
@@ -239,8 +241,8 @@ class DfHelper(ManagedResource):
|
|
239
241
|
|
240
242
|
# ---------- sinks ----------
|
241
243
|
def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
|
242
|
-
fs: AbstractFileSystem = kwargs.
|
243
|
-
path: str = kwargs.
|
244
|
+
fs: AbstractFileSystem = kwargs.pop("fs", self.fs)
|
245
|
+
path: str = kwargs.pop("parquet_storage_path")
|
244
246
|
if not fs:
|
245
247
|
raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
|
246
248
|
if not path:
|
@@ -266,7 +268,7 @@ class DfHelper(ManagedResource):
|
|
266
268
|
if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
|
267
269
|
self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
|
268
270
|
return
|
269
|
-
with ClickHouseWriter(debug=self.debug, logger=self.logger,
|
271
|
+
with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
|
270
272
|
writer.save_to_clickhouse(df)
|
271
273
|
self.logger.debug("Save to ClickHouse completed.")
|
272
274
|
|
@@ -6,6 +6,8 @@ import dask.dataframe as dd
|
|
6
6
|
import fsspec
|
7
7
|
import pandas as pd
|
8
8
|
from pydantic import BaseModel, model_validator, ConfigDict
|
9
|
+
|
10
|
+
from sibi_dst.df_helper.core import FilterHandler
|
9
11
|
from sibi_dst.utils import FilePathGenerator
|
10
12
|
from sibi_dst.utils import Logger
|
11
13
|
|
@@ -177,38 +179,77 @@ class ParquetConfig(BaseModel):
|
|
177
179
|
|
178
180
|
def load_files(self, **filters):
|
179
181
|
"""
|
180
|
-
Loads parquet files into a Dask DataFrame based on the specified conditions.
|
181
|
-
|
182
|
-
parquet folder paths or a single specified parquet path.
|
183
|
-
|
184
|
-
:return: A Dask DataFrame containing loaded parquet file data.
|
185
|
-
:rtype: dask.dataframe.DataFrame
|
182
|
+
Loads parquet files into a Dask DataFrame based on the specified conditions.
|
183
|
+
Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
|
186
184
|
"""
|
187
185
|
if not self.load_parquet:
|
188
186
|
self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
|
189
187
|
return dd.from_pandas(pd.DataFrame(), npartitions=1)
|
190
188
|
|
189
|
+
# Resolve paths
|
191
190
|
paths_to_load = []
|
192
191
|
if self.parquet_folder_list:
|
193
|
-
|
194
|
-
paths_to_load = [p for p in self.parquet_folder_list if p is not None]
|
192
|
+
paths_to_load = [p for p in self.parquet_folder_list if p]
|
195
193
|
elif self.parquet_full_path:
|
196
|
-
# Treat the single path as a list with one item
|
197
194
|
paths_to_load = [self.parquet_full_path]
|
198
195
|
|
199
196
|
if not paths_to_load:
|
200
197
|
self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
|
201
198
|
return dd.from_pandas(pd.DataFrame(), npartitions=1)
|
202
199
|
|
200
|
+
# Prepare filters
|
201
|
+
fh = None
|
202
|
+
expr = None
|
203
|
+
pq_filters = None
|
204
|
+
residual_filters = None
|
205
|
+
if filters:
|
206
|
+
fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
|
207
|
+
|
208
|
+
# Use the compiler + pushdown split so we don't double-apply
|
209
|
+
try:
|
210
|
+
# If you added split_pushdown_and_residual earlier:
|
211
|
+
pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
|
212
|
+
expr = fh.compile_filters(residual_filters) if residual_filters else None
|
213
|
+
except AttributeError:
|
214
|
+
# Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
|
215
|
+
expr = fh.compile_filters(filters)
|
216
|
+
pq_filters = expr.to_parquet_filters()
|
217
|
+
|
203
218
|
try:
|
204
219
|
self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
220
|
+
|
221
|
+
# Optional: prune columns. Keep it simple unless you want to compute from filters.
|
222
|
+
columns = None # or a concrete list if you know it
|
223
|
+
|
224
|
+
if fh and pq_filters:
|
225
|
+
self.logger.debug(f"Applying Parquet filters: {pq_filters}")
|
226
|
+
dd_result = dd.read_parquet(
|
227
|
+
paths_to_load,
|
228
|
+
engine="pyarrow",
|
229
|
+
filesystem=self.fs, # your fsspec filesystem (e.g., s3fs)
|
230
|
+
filters=pq_filters,
|
231
|
+
columns=columns,
|
232
|
+
gather_statistics=False, # uncomment if you have *many* files and don't need global stats
|
233
|
+
)
|
234
|
+
# Apply only residual mask (if any)
|
235
|
+
if expr is not None:
|
236
|
+
dd_result = dd_result[expr.mask(dd_result)]
|
237
|
+
else:
|
238
|
+
dd_result = dd.read_parquet(
|
239
|
+
paths_to_load,
|
240
|
+
engine="pyarrow",
|
241
|
+
filesystem=self.fs,
|
242
|
+
columns=columns,
|
243
|
+
gather_statistics=False,
|
244
|
+
)
|
245
|
+
# If we didn't push down, but have filters, apply them here
|
246
|
+
if expr is None and fh and filters:
|
247
|
+
expr = fh.compile_filters(filters)
|
248
|
+
if expr is not None:
|
249
|
+
dd_result = dd_result[expr.mask(dd_result)]
|
250
|
+
|
211
251
|
return dd_result
|
252
|
+
|
212
253
|
except FileNotFoundError as e:
|
213
254
|
self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
|
214
255
|
self.logger.debug("Returning empty DataFrame due to missing parquet files.")
|
@@ -91,7 +91,7 @@ class ClickHouseWriter(ManagedResource):
|
|
91
91
|
return
|
92
92
|
|
93
93
|
# lazily fill missing values per-partition (no global compute)
|
94
|
-
df = df.map_partitions(self._fill_missing_partition, meta=df)
|
94
|
+
df = df.map_partitions(type(self)._fill_missing_partition, meta=df._meta)
|
95
95
|
|
96
96
|
# (re)create table
|
97
97
|
ow = self.overwrite if overwrite is None else bool(overwrite)
|
@@ -201,23 +201,21 @@ class ClickHouseWriter(ManagedResource):
|
|
201
201
|
|
202
202
|
# ------------- missing values (lazy) -------------
|
203
203
|
|
204
|
-
|
205
|
-
|
204
|
+
@staticmethod
|
205
|
+
def _fill_missing_partition(pdf: pd.DataFrame) -> pd.DataFrame:
|
206
|
+
# (unchanged body)
|
206
207
|
for col in pdf.columns:
|
207
208
|
s = pdf[col]
|
208
209
|
if pd.api.types.is_integer_dtype(s.dtype):
|
209
|
-
# pandas nullable IntX supports NA → fill where needed
|
210
210
|
if pd.api.types.is_extension_array_dtype(s.dtype):
|
211
211
|
pdf[col] = s.fillna(pd.NA)
|
212
212
|
else:
|
213
213
|
pdf[col] = s.fillna(0)
|
214
214
|
elif pd.api.types.is_bool_dtype(s.dtype):
|
215
|
-
# boolean pandas extension supports NA, ClickHouse uses UInt8; keep NA → Nullable
|
216
215
|
pdf[col] = s.fillna(pd.NA)
|
217
216
|
elif pd.api.types.is_float_dtype(s.dtype):
|
218
217
|
pdf[col] = s.fillna(0.0)
|
219
218
|
elif pd.api.types.is_datetime64_any_dtype(s.dtype):
|
220
|
-
# keep NaT; ClickHouse Nullable(DateTime) will take NULL
|
221
219
|
pass
|
222
220
|
else:
|
223
221
|
pdf[col] = s.fillna("")
|
@@ -264,238 +262,3 @@ class ClickHouseWriter(ManagedResource):
|
|
264
262
|
if hasattr(self._tlocal, "client"):
|
265
263
|
delattr(self._tlocal, "client")
|
266
264
|
|
267
|
-
# from concurrent.futures import ThreadPoolExecutor
|
268
|
-
# from typing import ClassVar, Dict
|
269
|
-
#
|
270
|
-
# import clickhouse_connect
|
271
|
-
# import pandas as pd
|
272
|
-
# from clickhouse_driver import Client
|
273
|
-
# import dask.dataframe as dd
|
274
|
-
#
|
275
|
-
# from . import ManagedResource
|
276
|
-
#
|
277
|
-
#
|
278
|
-
# class ClickHouseWriter(ManagedResource):
|
279
|
-
# """
|
280
|
-
# Provides functionality to write a Dask DataFrame to a ClickHouse database using
|
281
|
-
# a specified schema. This class handles the creation of tables, schema generation,
|
282
|
-
# data transformation, and data insertion. It ensures compatibility between Dask
|
283
|
-
# data types and ClickHouse types.
|
284
|
-
#
|
285
|
-
# :ivar clickhouse_host: Host address of the ClickHouse database.
|
286
|
-
# :type clickhouse_host: str
|
287
|
-
# :ivar clickhouse_port: Port of the ClickHouse database.
|
288
|
-
# :type clickhouse_port: int
|
289
|
-
# :ivar clickhouse_dbname: Name of the database to connect to in ClickHouse.
|
290
|
-
# :type clickhouse_dbname: str
|
291
|
-
# :ivar clickhouse_user: Username for database authentication.
|
292
|
-
# :type clickhouse_user: str
|
293
|
-
# :ivar clickhouse_password: Password for database authentication.
|
294
|
-
# :type clickhouse_password: str
|
295
|
-
# :ivar clickhouse_table: Name of the table to store the data in.
|
296
|
-
# :type clickhouse_table: str
|
297
|
-
# :ivar logger: Logger instance for logging messages.
|
298
|
-
# :type logger: logging.Logger
|
299
|
-
# :ivar client: Instance of the ClickHouse database client.
|
300
|
-
# :type client: clickhouse_connect.Client or None
|
301
|
-
# :ivar df: Dask DataFrame to be written into ClickHouse.
|
302
|
-
# :type df: dask.dataframe.DataFrame
|
303
|
-
# :ivar order_by: Field or column name to use for table ordering.
|
304
|
-
# :type order_by: str
|
305
|
-
# """
|
306
|
-
# dtype_to_clickhouse: ClassVar[Dict[str, str]] = {
|
307
|
-
# 'int64': 'Int64',
|
308
|
-
# 'int32': 'Int32',
|
309
|
-
# 'float64': 'Float64',
|
310
|
-
# 'float32': 'Float32',
|
311
|
-
# 'bool': 'UInt8',
|
312
|
-
# 'datetime64[ns]': 'DateTime',
|
313
|
-
# 'object': 'String',
|
314
|
-
# 'category': 'String',
|
315
|
-
# }
|
316
|
-
# df: dd.DataFrame
|
317
|
-
#
|
318
|
-
# def __init__(self, **kwargs):
|
319
|
-
# super().__init__(**kwargs)
|
320
|
-
# self.clickhouse_host = kwargs.setdefault('host', "localhost")
|
321
|
-
# self.clickhouse_port = kwargs.setdefault('port', 8123)
|
322
|
-
# self.clickhouse_dbname = kwargs.setdefault('database', 'sibi_data')
|
323
|
-
# self.clickhouse_user = kwargs.setdefault('user', 'default')
|
324
|
-
# self.clickhouse_password = kwargs.setdefault('password', '')
|
325
|
-
# self.clickhouse_table = kwargs.setdefault('table', 'test_sibi_table')
|
326
|
-
#
|
327
|
-
# #self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
|
328
|
-
# self.client = None
|
329
|
-
# self.order_by = kwargs.setdefault('order_by', 'id')
|
330
|
-
#
|
331
|
-
# def save_to_clickhouse(self, df, **kwargs):
|
332
|
-
# self.df = df.copy()
|
333
|
-
# self.order_by = kwargs.setdefault('order_by', self.order_by)
|
334
|
-
# if len(self.df.head().index) == 0:
|
335
|
-
# self.logger.debug("Dataframe is empty")
|
336
|
-
# return
|
337
|
-
# self._handle_missing_values()
|
338
|
-
# self._connect()
|
339
|
-
# self._drop_table()
|
340
|
-
# self._create_table_from_dask()
|
341
|
-
# self._write_data()
|
342
|
-
#
|
343
|
-
# def _connect(self):
|
344
|
-
# try:
|
345
|
-
# self.client = clickhouse_connect.get_client(
|
346
|
-
# host=self.clickhouse_host,
|
347
|
-
# port=self.clickhouse_port,
|
348
|
-
# database=self.clickhouse_dbname,
|
349
|
-
# user=self.clickhouse_user,
|
350
|
-
# password=self.clickhouse_password
|
351
|
-
# )
|
352
|
-
# self.logger.debug("Connected to ClickHouse")
|
353
|
-
# except Exception as e:
|
354
|
-
# self.logger.error(e)
|
355
|
-
# raise
|
356
|
-
#
|
357
|
-
# @staticmethod
|
358
|
-
# def _generate_clickhouse_schema(dask_dtypes, dtype_map):
|
359
|
-
# schema = []
|
360
|
-
# for col, dtype in dask_dtypes.items():
|
361
|
-
# # Handle pandas nullable types explicitly
|
362
|
-
# if isinstance(dtype, pd.Int64Dtype): # pandas nullable Int64
|
363
|
-
# clickhouse_type = 'Int64'
|
364
|
-
# elif isinstance(dtype, pd.Float64Dtype): # pandas nullable Float64
|
365
|
-
# clickhouse_type = 'Float64'
|
366
|
-
# elif isinstance(dtype, pd.BooleanDtype): # pandas nullable Boolean
|
367
|
-
# clickhouse_type = 'UInt8'
|
368
|
-
# elif isinstance(dtype, pd.DatetimeTZDtype) or 'datetime' in str(dtype): # Nullable datetime
|
369
|
-
# clickhouse_type = 'Nullable(DateTime)'
|
370
|
-
# elif isinstance(dtype, pd.StringDtype): # pandas nullable String
|
371
|
-
# clickhouse_type = 'String'
|
372
|
-
# else:
|
373
|
-
# # Default mapping using the provided dtype_map
|
374
|
-
# clickhouse_type = dtype_map.get(str(dtype), 'String')
|
375
|
-
# schema.append(f"`{col}` {clickhouse_type}")
|
376
|
-
# return ', '.join(schema)
|
377
|
-
#
|
378
|
-
# def _drop_table(self):
|
379
|
-
# if self.client:
|
380
|
-
# self.client.command('DROP TABLE IF EXISTS {}'.format(self.clickhouse_table))
|
381
|
-
# self.logger.debug(f"Dropped table {self.clickhouse_table}")
|
382
|
-
#
|
383
|
-
# def _create_table_from_dask(self, engine=None):
|
384
|
-
# if engine is None:
|
385
|
-
# engine = f"ENGINE = MergeTree() order by {self.order_by}"
|
386
|
-
# dtypes = self.df.dtypes
|
387
|
-
# clickhouse_schema = self._generate_clickhouse_schema(dtypes, self.dtype_to_clickhouse)
|
388
|
-
# create_table_sql = f"CREATE TABLE IF NOT EXISTS {self.clickhouse_table} ({clickhouse_schema}) {engine};"
|
389
|
-
# self.logger.debug(f"Creating table SQL:{create_table_sql}")
|
390
|
-
# if self.client:
|
391
|
-
# self.client.command(create_table_sql)
|
392
|
-
# self.logger.debug("Created table '{}'".format(self.clickhouse_table))
|
393
|
-
#
|
394
|
-
# def _handle_missing_values(self):
|
395
|
-
# """
|
396
|
-
# Handle missing values in the Dask DataFrame before writing to ClickHouse.
|
397
|
-
# """
|
398
|
-
# self.logger.debug("Checking for missing values...")
|
399
|
-
# missing_counts = self.df.isnull().sum().compute()
|
400
|
-
# self.logger.debug(f"Missing values per column:\n{missing_counts}")
|
401
|
-
#
|
402
|
-
# # Replace missing values based on column types
|
403
|
-
# def replace_missing_values(df):
|
404
|
-
# for col in df.columns:
|
405
|
-
# if pd.api.types.is_integer_dtype(df[col]):
|
406
|
-
# df[col] = df[col].fillna(0) # Replace NA with 0 for integers
|
407
|
-
# elif pd.api.types.is_float_dtype(df[col]):
|
408
|
-
# df[col] = df[col].fillna(0.0) # Replace NA with 0.0 for floats
|
409
|
-
# elif pd.api.types.is_bool_dtype(df[col]):
|
410
|
-
# df[col] = df[col].fillna(False) # Replace NA with False for booleans
|
411
|
-
# else:
|
412
|
-
# df[col] = df[col].fillna('') # Replace NA with empty string for other types
|
413
|
-
# return df
|
414
|
-
#
|
415
|
-
# # Apply replacement
|
416
|
-
# self.df = replace_missing_values(self.df)
|
417
|
-
# self.logger.debug("Missing values replaced.")
|
418
|
-
#
|
419
|
-
# def _write_data(self):
|
420
|
-
# """
|
421
|
-
# Writes the Dask DataFrame to a ClickHouse table partition by partition.
|
422
|
-
# """
|
423
|
-
# if len(self.df.index) == 0:
|
424
|
-
# self.logger.debug("No data found. Nothing written.")
|
425
|
-
# return
|
426
|
-
#
|
427
|
-
# for i, partition in enumerate(self.df.to_delayed()):
|
428
|
-
# try:
|
429
|
-
# # Compute the current partition into a pandas DataFrame
|
430
|
-
# df = partition.compute()
|
431
|
-
#
|
432
|
-
# if df.empty:
|
433
|
-
# self.logger.debug(f"Partition {i} is empty. Skipping...")
|
434
|
-
# continue
|
435
|
-
#
|
436
|
-
# self.logger.debug(f"Writing partition {i} with {len(df)} rows to ClickHouse.")
|
437
|
-
#
|
438
|
-
# # Write the partition to the ClickHouse table
|
439
|
-
# self.client.insert_df(self.clickhouse_table, df)
|
440
|
-
# except Exception as e:
|
441
|
-
# self.logger.error(f"Error writing partition {i}: {e}")
|
442
|
-
#
|
443
|
-
# def _write_data_multi_not_working_yet(self):
|
444
|
-
# """
|
445
|
-
# Writes the Dask DataFrame to a ClickHouse table partition by partition.
|
446
|
-
# Ensures a separate client instance is used per thread to avoid session conflicts.
|
447
|
-
# """
|
448
|
-
# if len(self.df.index) == 0:
|
449
|
-
# self.logger.debug("No data found. Nothing written.")
|
450
|
-
# return
|
451
|
-
#
|
452
|
-
# def create_client():
|
453
|
-
# client = Client(
|
454
|
-
# host=self.clickhouse_host,
|
455
|
-
# port=self.clickhouse_port,
|
456
|
-
# database=self.clickhouse_dbname,
|
457
|
-
# user=self.clickhouse_user,
|
458
|
-
# password=self.clickhouse_password
|
459
|
-
# )
|
460
|
-
# """
|
461
|
-
# Create a new instance of the ClickHouse client for each thread.
|
462
|
-
# This avoids session conflicts during concurrent writes.
|
463
|
-
# """
|
464
|
-
# return client
|
465
|
-
#
|
466
|
-
# def write_partition(partition, index):
|
467
|
-
# """
|
468
|
-
# Write a single partition to ClickHouse using a separate client instance.
|
469
|
-
# """
|
470
|
-
# try:
|
471
|
-
# self.logger.debug(f"Starting to process partition {index}")
|
472
|
-
# client = create_client() # Create a new client for the thread
|
473
|
-
#
|
474
|
-
# # Compute the Dask partition into a Pandas DataFrame
|
475
|
-
# df = partition.compute()
|
476
|
-
# if df.empty:
|
477
|
-
# self.logger.debug(f"Partition {index} is empty. Skipping...")
|
478
|
-
# return
|
479
|
-
#
|
480
|
-
# # Convert DataFrame to list of tuples
|
481
|
-
# data = [tuple(row) for row in df.to_numpy()]
|
482
|
-
# columns = df.columns.tolist()
|
483
|
-
#
|
484
|
-
# # Perform the insert
|
485
|
-
# self.logger.debug(f"Writing partition {index} with {len(df)} rows to ClickHouse.")
|
486
|
-
# client.execute(f"INSERT INTO {self.clickhouse_table} ({', '.join(columns)}) VALUES", data)
|
487
|
-
#
|
488
|
-
# except Exception as e:
|
489
|
-
# self.logger.error(f"Error writing partition {index}: {e}")
|
490
|
-
# finally:
|
491
|
-
# if 'client' in locals() and hasattr(client, 'close'):
|
492
|
-
# client.close()
|
493
|
-
# self.logger.debug(f"Closed client for partition {index}")
|
494
|
-
#
|
495
|
-
# try:
|
496
|
-
# # Get delayed partitions and enumerate them
|
497
|
-
# partitions = self.df.to_delayed()
|
498
|
-
# with ThreadPoolExecutor() as executor:
|
499
|
-
# executor.map(write_partition, partitions, range(len(partitions)))
|
500
|
-
# except Exception as e:
|
501
|
-
# self.logger.error(f"Error during multi-partition write: {e}")
|
sibi_dst/utils/storage_config.py
CHANGED
@@ -6,13 +6,13 @@ from .storage_manager import StorageManager
|
|
6
6
|
from .credentials import ConfigManager
|
7
7
|
|
8
8
|
class StorageConfig:
|
9
|
-
def __init__(self, config:ConfigManager, depots:dict=None):
|
9
|
+
def __init__(self, config:ConfigManager, depots:dict=None, clear_existing=False, write_mode="full-access"):
|
10
10
|
self.conf = config
|
11
11
|
self.depots = depots
|
12
12
|
self._initialize_storage()
|
13
13
|
self.storage_manager = StorageManager(self.base_storage, self.filesystem_type, self.filesystem_options)
|
14
14
|
if self.depots is not None:
|
15
|
-
self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots)
|
15
|
+
self.depot_paths, self.depot_names = self.storage_manager.rebuild_depot_paths(depots, clear_existing=clear_existing, write_mode=write_mode)
|
16
16
|
else:
|
17
17
|
self.depot_paths = None
|
18
18
|
self.depot_names = None
|
@@ -83,7 +83,7 @@ class StorageManager:
|
|
83
83
|
self.fs.rm(sub_path, recursive=True)
|
84
84
|
self.fs.mkdirs(sub_path, exist_ok=True)
|
85
85
|
|
86
|
-
def rebuild_depot_paths(self, depots, clear_existing=False):
|
86
|
+
def rebuild_depot_paths(self, depots, clear_existing=False, write_mode="full-access"):
|
87
87
|
"""
|
88
88
|
Rebuilds depot_paths (dictionary) and depot_name (SimpleNamespace).
|
89
89
|
Handles clear_existing scenario by resetting directories when required.
|
@@ -96,7 +96,8 @@ class StorageManager:
|
|
96
96
|
depot_path = self.join_paths(self.storage_path, depot)
|
97
97
|
if self.debug:
|
98
98
|
print(f"Rebuilding depot at: {depot_path}")
|
99
|
-
|
99
|
+
if write_mode == "full-access":
|
100
|
+
self.setup_directories(depot_path, sub_directories, clear_existing=clear_existing)
|
100
101
|
|
101
102
|
# Generate depot_paths dictionary
|
102
103
|
self.depot_paths = {
|
@@ -2,14 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
|
|
2
2
|
sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
|
3
3
|
sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
|
4
4
|
sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
|
5
|
-
sibi_dst/df_helper/_df_helper.py,sha256=
|
5
|
+
sibi_dst/df_helper/_df_helper.py,sha256=TS8nQV6QExSz5rNh94zmawNOvQ6eBEzsAcJkiiKXAb0,12945
|
6
6
|
sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
|
7
7
|
sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
|
8
8
|
sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
9
|
sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
|
10
10
|
sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
|
11
11
|
sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
|
12
|
-
sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=
|
12
|
+
sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=V6y1Vco3_uY4UBF79_JPd1CFK5DpNsnGYHCc5PDPGZo,13798
|
13
13
|
sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
|
14
14
|
sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
|
15
15
|
sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
|
@@ -39,7 +39,7 @@ sibi_dst/tests/test_data_wrapper_class.py,sha256=6uFmZR2DxnxQz49L5jT2ehlKvlLnpUH
|
|
39
39
|
sibi_dst/utils/__init__.py,sha256=vShNCOMPw8KKwlb4tq5XGrpjqakJ_OE8YDc_xDAWAxI,1302
|
40
40
|
sibi_dst/utils/base.py,sha256=IyObjZ7AaE-YjVU0RLIXNCnQKWwzi5NH2I6D1KfcIyk,8716
|
41
41
|
sibi_dst/utils/business_days.py,sha256=dP0Xj4FhTBIvZZrZYLOHZl5zOpDAgWkD4p_1a7BOT7I,8461
|
42
|
-
sibi_dst/utils/clickhouse_writer.py,sha256=
|
42
|
+
sibi_dst/utils/clickhouse_writer.py,sha256=NngJyJpx2PjUQWsX0YmwCuGdeViK77Wi3HmYqHz3jTc,9544
|
43
43
|
sibi_dst/utils/credentials.py,sha256=cHJPPsmVyijqbUQIq7WWPe-lIallA-mI5RAy3YUuRME,1724
|
44
44
|
sibi_dst/utils/data_from_http_source.py,sha256=AcpKNsqTgN2ClNwuhgUpuNCx62r5_DdsAiKY8vcHEBA,1867
|
45
45
|
sibi_dst/utils/data_utils.py,sha256=7bLidEjppieNoozDFb6OuRY0W995cxg4tiGAlkGfePI,7768
|
@@ -54,8 +54,8 @@ sibi_dst/utils/manifest_manager.py,sha256=9y4cV-Ig8O-ekhApp_UObTY-cTsl-bGnvKIThI
|
|
54
54
|
sibi_dst/utils/parquet_saver.py,sha256=aYBlijqPAn-yuJXhmaRIteAN_IAQZvPh8I8Os2TLGgI,4861
|
55
55
|
sibi_dst/utils/periods.py,sha256=8eTGi-bToa6_a8Vwyg4fkBPryyzft9Nzy-3ToxjqC8c,1434
|
56
56
|
sibi_dst/utils/phone_formatter.py,sha256=oeM22nLjhObENrpItCNeVpkYS4pXRm5hSxdk0M4nvwU,4580
|
57
|
-
sibi_dst/utils/storage_config.py,sha256=
|
58
|
-
sibi_dst/utils/storage_manager.py,sha256=
|
57
|
+
sibi_dst/utils/storage_config.py,sha256=DLtP5jKVM0mdFdgRw6LQfRqyavMjJcCVU7GhsUCRH78,4427
|
58
|
+
sibi_dst/utils/storage_manager.py,sha256=La1NY79bhRAmHWXp7QcXJZtbHoRboJMgoXOSXbIl1SA,6643
|
59
59
|
sibi_dst/utils/update_planner.py,sha256=smlMHpr1p8guZnP5SyzCe6RsC-XkPOJWIsdeospUyb0,11471
|
60
60
|
sibi_dst/utils/webdav_client.py,sha256=D9J5d1f1qQwHGm5FE5AMVpOPwcU5oD7K8JZoKGP8NpM,5811
|
61
61
|
sibi_dst/v2/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -78,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
|
|
78
78
|
sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
|
79
79
|
sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
|
80
80
|
sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
|
81
|
-
sibi_dst-2025.8.
|
82
|
-
sibi_dst-2025.8.
|
83
|
-
sibi_dst-2025.8.
|
81
|
+
sibi_dst-2025.8.6.dist-info/METADATA,sha256=fFvtxHyXl8FCryToGB4H91n_NZ3hzJNRos0O2FUNVBQ,2610
|
82
|
+
sibi_dst-2025.8.6.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
83
|
+
sibi_dst-2025.8.6.dist-info/RECORD,,
|
File without changes
|