sibi-dst 2025.8.7__py3-none-any.whl → 2025.8.8__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in the public registry.
@@ -1,11 +1,13 @@
1
- import datetime
1
+ import datetime as dt
2
+ import logging
3
+ import posixpath
2
4
  from pathlib import Path
3
- from typing import Optional, List
5
+ from typing import Optional, List, Tuple
4
6
 
5
7
  import dask.dataframe as dd
6
8
  import fsspec
7
9
  import pandas as pd
8
- from pydantic import BaseModel, model_validator, ConfigDict
10
+ from pydantic import BaseModel, ConfigDict, model_validator
9
11
 
10
12
  from sibi_dst.df_helper.core import FilterHandler
11
13
  from sibi_dst.utils import FilePathGenerator
@@ -14,262 +16,550 @@ from sibi_dst.utils import Logger
14
16
 
15
17
  class ParquetConfig(BaseModel):
16
18
  """
17
- Represents configuration for managing and validating parquet file operations.
18
-
19
- The `ParquetConfig` class provides attributes and methods necessary to handle operations
20
- on parquet files in a file system. It includes functionalities for ensuring file paths
21
- and extensions, validating storage paths and parameters, determining file recency,
22
- and calculating the size of parquet files. This class is designed with flexibility to handle
23
- different file systems through the integration with `fsspec` and allows storage path validations
24
- with optional logging support.
25
-
26
- :ivar load_parquet: Indicates whether parquet data should be loaded based on the
27
- current configuration and validation.
28
- :type load_parquet: bool
29
- :ivar parquet_filename: The name of the parquet file, optional if folders are used.
30
- :type parquet_filename: Optional[str]
31
- :ivar parquet_storage_path: The base path for storing or retrieving parquet files.
32
- :type parquet_storage_path: Optional[str]
33
- :ivar parquet_full_path: The full path to a specific parquet file, derived from the
34
- storage path and filename when applicable.
35
- :type parquet_full_path: Optional[str]
36
- :ivar parquet_folder_list: A list of folder paths to parquet data, derived from start
37
- and end dates if specified.
38
- :type parquet_folder_list: Optional[List[str]]
39
- :ivar parquet_size_bytes: The total size of the parquet files, in bytes.
40
- :type parquet_size_bytes: int
41
- :ivar parquet_max_age_minutes: Maximum acceptable age of the most recent parquet file, in minutes.
42
- :type parquet_max_age_minutes: int
43
- :ivar parquet_is_recent: Indicates whether the parquet file is considered recent based
44
- on the `parquet_max_age_minutes` condition.
45
- :type parquet_is_recent: bool
46
- :ivar parquet_start_date: The start date for parquet file validation or file path generation.
47
- :type parquet_start_date: Optional[str]
48
- :ivar parquet_end_date: The end date for parquet file validation or file path generation.
49
- :type parquet_end_date: Optional[str]
50
- :ivar fs: The file system object used for storage operations, compliant with `fsspec`.
51
- :type fs: Optional[fsspec.spec.AbstractFileSystem]
52
- :ivar logger: A logger for handling logging operations.
53
- :type logger: Optional[Logger]
19
+ Configuration and helpers for reading Parquet datasets with fsspec + Dask.
20
+
21
+ Heavy I/O (exists/size/listing) is deferred to explicit methods.
22
+ The validator only normalizes and validates inputs.
54
23
  """
55
- load_parquet: bool = False
56
- parquet_filename: Optional[str] = None
24
+
25
+ # ---- Inputs / knobs ----
57
26
  parquet_storage_path: Optional[str] = None
58
- parquet_full_path: Optional[str] = None
59
- parquet_folder_list: Optional[List[str]] = None
60
- parquet_size_bytes: int = 0
61
- parquet_max_age_minutes: int = 0
62
- parquet_is_recent: bool = False
63
- parquet_start_date: Optional[str] = None
64
- parquet_end_date: Optional[str] = None
65
- fs: Optional[fsspec.spec.AbstractFileSystem] = None # Your fsspec filesystem object
27
+ parquet_filename: Optional[str] = None
28
+ parquet_start_date: Optional[str] = None # YYYY-MM-DD
29
+ parquet_end_date: Optional[str] = None # YYYY-MM-DD
30
+ parquet_max_age_minutes: int = 0 # 0 => no recency limit
31
+ fs: Optional[fsspec.spec.AbstractFileSystem] = None
66
32
  logger: Optional[Logger] = None
67
33
  debug: bool = False
34
+
35
+ # ---- Derived / runtime fields (lazy) ----
36
+ parquet_full_path: Optional[str] = None # file or directory
37
+ parquet_folder_list: Optional[List[str]] = None
38
+ parquet_is_recent: bool = False
39
+ parquet_size_bytes: int = 0
40
+ load_parquet: bool = False # computed when loading
41
+
68
42
  model_config = ConfigDict(arbitrary_types_allowed=True)
69
43
 
70
- @model_validator(mode='after')
71
- def check_parquet_params(self):
72
- """
73
- Validates and configures the parameters required for managing parquet files. This includes
74
- configuring paths through `fsspec`, identifying file storage paths, checking the validity of
75
- dates related to parquet files, ensuring proper parquet file extensions, and determining
76
- whether existing parquet files are recent and loadable.
77
-
78
- :return: The current instance with validated and migrated attributes configured for
79
- handling parquet files.
80
-
81
- :raises ValueError: If certain conditions are not met, such as missing or invalid
82
- `parquet_storage_path`, providing only one of
83
- `parquet_start_date` or `parquet_end_date`, or if the
84
- `parquet_end_date` is earlier than the `parquet_start_date`.
85
- """
86
- # Configure paths based on fsspec
44
+ # ------------------------- validation -------------------------
45
+
46
+ @model_validator(mode="after")
47
+ def _normalize_and_validate(self):
48
+ # logger
87
49
  if self.logger is None:
88
50
  self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
89
- self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
51
+ import logging as _logging
52
+ self.logger.set_level(_logging.DEBUG if self.debug else _logging.INFO)
53
+
54
+ # fs
90
55
  if self.fs is None:
91
- raise ValueError('Parquet Options: File system (fs) must be specified')
92
-
93
- if self.parquet_storage_path is None:
94
- raise ValueError('Parquet storage path must be specified')
95
- self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
96
- if not self.fs.exists(self.parquet_storage_path):
97
- self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
98
- # raise ValueError('Parquet storage path does not exist')
99
- self.load_parquet = False
100
- if self.parquet_filename is not None:
101
- self.parquet_full_path = self.ensure_file_extension(
102
- filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
103
- extension='parquet'
56
+ raise ValueError("File system (fs) must be specified")
57
+
58
+ # base path
59
+ if not self.parquet_storage_path:
60
+ raise ValueError("Parquet storage path must be specified")
61
+ self.parquet_storage_path = self.parquet_storage_path.rstrip("/")
62
+
63
+ # dates: both or none
64
+ if self.parquet_start_date and not self.parquet_end_date:
65
+ raise ValueError("Parquet end date must be specified if start date is provided")
66
+ if self.parquet_end_date and not self.parquet_start_date:
67
+ raise ValueError("Parquet start date must be specified if end date is provided")
68
+
69
+ # date ordering
70
+ if self.parquet_start_date and self.parquet_end_date:
71
+ start = dt.datetime.strptime(self.parquet_start_date, "%Y-%m-%d").date()
72
+ end = dt.datetime.strptime(self.parquet_end_date, "%Y-%m-%d").date()
73
+ if end < start:
74
+ raise ValueError("Parquet end date must be greater than start date")
75
+
76
+ # generate day-wise folders (no I/O)
77
+ fpg = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs, logger=self.logger)
78
+ self.parquet_folder_list = fpg.generate_file_paths(
79
+ dt.datetime.combine(start, dt.time.min),
80
+ dt.datetime.combine(end, dt.time.min),
104
81
  )
105
- self.parquet_is_recent = self.is_file_recent()
106
- self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)
107
82
 
108
- if self.parquet_start_date is not None:
109
- if self.parquet_end_date is None:
110
- raise ValueError('Parquet end date must be specified if start date is provided')
111
-
112
- start_date = datetime.datetime.strptime(self.parquet_start_date, '%Y-%m-%d')
113
- end_date = datetime.datetime.strptime(self.parquet_end_date, '%Y-%m-%d')
114
- if end_date < start_date:
115
- raise ValueError('Parquet end date must be greater than start date')
116
-
117
- # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
118
- self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
119
- logger=self.logger).generate_file_paths(start_date, end_date)
120
-
121
- self.parquet_size_bytes = self.get_parquet_size_bytes()
122
- self.load_parquet = True
123
- # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
124
- elif self.parquet_end_date is not None:
125
- raise ValueError('Parquet start date must be specified if end date is provided')
83
+ # file vs dataset-at-root
84
+ if self.parquet_filename:
85
+ self.parquet_full_path = self.ensure_file_extension(
86
+ posixpath.join(str(self.parquet_storage_path), str(self.parquet_filename)),
87
+ "parquet",
88
+ )
89
+ else:
90
+ # treat storage path as a directory dataset
91
+ self.parquet_full_path = self.parquet_storage_path
126
92
 
127
93
  return self
128
94
 
129
- def is_file_recent(self):
95
+ # ------------------------- public helpers -------------------------
96
+
97
+ def determine_recency(self) -> bool:
130
98
  """
131
- Determines whether the file at the specified parquet path is considered recent
132
- based on its modification time and the maximum age limit defined.
133
-
134
- The function first checks for the existence of the file at the specified
135
- `parquet_full_path`. If the file does not exist, the function will return
136
- False. If `parquet_max_age_minutes` is set to 0, it implies no maximum age
137
- limit, and the function will return True. Otherwise, it retrieves the file's
138
- last modified time and calculates the age of the file by comparing it with the
139
- current time. The function returns True if the file's age does not exceed the
140
- maximum age specified by `parquet_max_age_minutes`, otherwise it returns
141
- False.
142
-
143
- :return: Whether the file is considered recent based on its existence,
144
- modification time, and maximum age limit.
145
- :rtype: bool
99
+ Returns True if parquet_full_path exists and is within parquet_max_age_minutes.
100
+ File recency applies only when full_path points to a file.
146
101
  """
147
- if not self.fs.exists(self.parquet_full_path):
102
+ path = self.parquet_full_path
103
+ if not path:
148
104
  return False
105
+
106
+ # If path is a directory dataset, skip recency check
107
+ if not path.endswith(".parquet"):
108
+ self.parquet_is_recent = True
109
+ return True
110
+
111
+ if not self._exists(path):
112
+ self.parquet_is_recent = False
113
+ return False
114
+
149
115
  if self.parquet_max_age_minutes == 0:
116
+ self.parquet_is_recent = True
150
117
  return True
151
- file_time = datetime.datetime.fromtimestamp(self.fs.modified(self.parquet_full_path))
152
- return (datetime.datetime.now() - file_time) <= datetime.timedelta(minutes=self.parquet_max_age_minutes)
153
118
 
154
- def get_parquet_size_bytes(self):
119
+ mdt = self._get_mtime(path)
120
+ if not mdt:
121
+ self.parquet_is_recent = False
122
+ return False
123
+
124
+ now = dt.datetime.now(dt.timezone.utc)
125
+ if mdt.tzinfo is None:
126
+ mdt = mdt.replace(tzinfo=dt.timezone.utc)
127
+ self.parquet_is_recent = (now - mdt) <= dt.timedelta(minutes=self.parquet_max_age_minutes)
128
+ return self.parquet_is_recent
129
+
130
+ def compute_parquet_size_bytes(self) -> int:
155
131
  """
156
- Calculate the total size, in bytes, of all Parquet files within the defined
157
- folders specified by `parquet_folder_list`. The function iteratively goes
158
- through each folder in the provided list, applying a recursive wildcard
159
- search to include all levels of nested directories, and calculates the
160
- cumulative size of all found Parquet files using the file system's size
161
- retrieval method.
162
-
163
- :raises AttributeError: If `fs` or `parquet_folder_list` attributes are not set
164
- or improperly configured when the method is called.
165
- :raises NotImplementedError: If the `fs.size` or `fs.glob` methods are
166
- unimplemented in the provided file system object or it otherwise lacks
167
- necessary support for these operations.
168
-
169
- :return: The cumulative size of all Parquet files located in the folders
170
- defined by `parquet_folder_list`, measured in bytes.
171
- :rtype: int
132
+ Computes total size of *.parquet files under parquet_folder_list.
133
+ No-op if folder list is missing.
172
134
  """
173
- total_size = 0
174
- for folder in self.parquet_folder_list:
175
- # Use a double wildcard ** to match any level of nested directories
176
- for path in self.fs.glob(f"{folder}/**/*.parquet"):
177
- total_size += self.fs.size(path)
178
- return total_size
135
+ if not self.parquet_folder_list:
136
+ self.parquet_size_bytes = 0
137
+ return 0
179
138
 
180
- def load_files(self, **filters):
139
+ total = 0
140
+ for folder in self.parquet_folder_list:
141
+ try:
142
+ # Preferred: find (recursive)
143
+ for path in self.fs.find(folder):
144
+ if path.endswith(".parquet"):
145
+ info = self.fs.info(path)
146
+ total += int(info.get("size", 0))
147
+ except Exception:
148
+ # Fallback: glob recursive
149
+ for path in self.fs.glob(f"{folder}/**/*.parquet"):
150
+ info = self.fs.info(path)
151
+ total += int(info.get("size", 0))
152
+
153
+ self.parquet_size_bytes = total
154
+ return total
155
+
156
+ def load_files(self, **filters) -> dd.DataFrame:
181
157
  """
182
- Loads parquet files into a Dask DataFrame based on the specified conditions.
183
- Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
158
+ Load Parquet as a Dask DataFrame with optional pushdown + residual filtering.
159
+ Decides paths lazily. Avoids heavy work in validators.
184
160
  """
185
- if not self.load_parquet:
186
- self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
187
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
161
+ paths_to_load = self._resolve_paths_for_read()
162
+ if not paths_to_load:
163
+ self.logger.warning("No valid parquet paths resolved. Returning empty DataFrame.")
164
+ return self._empty_ddf()
188
165
 
189
- # Resolve paths
190
- paths_to_load = []
166
+ # Determine if loading is allowed
167
+ # If a single file was specified, honor recency; for directories or date ranges, load.
191
168
  if self.parquet_folder_list:
192
- paths_to_load = [p for p in self.parquet_folder_list if p]
193
- elif self.parquet_full_path:
194
- paths_to_load = [self.parquet_full_path]
169
+ self.load_parquet = True
170
+ else:
171
+ # single file or dataset-at-root
172
+ if self.parquet_full_path and self.parquet_full_path.endswith(".parquet"):
173
+ self.load_parquet = self.determine_recency()
174
+ else:
175
+ self.load_parquet = True
195
176
 
196
- if not paths_to_load:
197
- self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
198
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
177
+ if not self.load_parquet:
178
+ self.logger.info("Parquet loading disabled by recency policy. Returning empty DataFrame.")
179
+ return self._empty_ddf()
199
180
 
200
- # Prepare filters
181
+ # Compile filters
201
182
  fh = None
202
- expr = None
203
183
  pq_filters = None
204
- residual_filters = None
184
+ residual_expr = None
205
185
  if filters:
206
186
  fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
207
-
208
- # Use the compiler + pushdown split so we don't double-apply
209
- try:
210
- # If you added split_pushdown_and_residual earlier:
187
+ if hasattr(fh, "split_pushdown_and_residual"):
211
188
  pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
212
- expr = fh.compile_filters(residual_filters) if residual_filters else None
213
- except AttributeError:
214
- # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
215
- expr = fh.compile_filters(filters)
216
- pq_filters = expr.to_parquet_filters()
189
+ if residual_filters:
190
+ residual_expr = fh.compile_filters(residual_filters)
191
+ else:
192
+ residual_expr = fh.compile_filters(filters)
193
+ if hasattr(residual_expr, "to_parquet_filters"):
194
+ pq_filters = residual_expr.to_parquet_filters()
217
195
 
196
+ # Read parquet
218
197
  try:
219
- self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
220
-
221
- # Optional: prune columns. Keep it simple unless you want to compute from filters.
222
- columns = None # or a concrete list if you know it
223
-
224
- if fh and pq_filters:
225
- self.logger.debug(f"Applying Parquet filters: {pq_filters}")
226
- dd_result = dd.read_parquet(
227
- paths_to_load,
228
- engine="pyarrow",
229
- filesystem=self.fs, # your fsspec filesystem (e.g., s3fs)
230
- filters=pq_filters,
231
- columns=columns,
232
- gather_statistics=False, # uncomment if you have *many* files and don't need global stats
233
- )
234
- # Apply only residual mask (if any)
235
- if expr is not None:
236
- dd_result = dd_result[expr.mask(dd_result)]
237
- else:
238
- dd_result = dd.read_parquet(
239
- paths_to_load,
240
- engine="pyarrow",
241
- filesystem=self.fs,
242
- columns=columns,
243
- gather_statistics=False,
244
- )
245
- # If we didn't push down, but have filters, apply them here
246
- if expr is None and fh and filters:
247
- expr = fh.compile_filters(filters)
248
- if expr is not None:
249
- dd_result = dd_result[expr.mask(dd_result)]
198
+ self.logger.debug(f"Reading parquet from: {paths_to_load}")
199
+ if pq_filters:
200
+ self.logger.debug(f"Applying pushdown filters: {pq_filters}")
201
+
202
+ dd_result = dd.read_parquet(
203
+ paths_to_load,
204
+ engine="pyarrow",
205
+ filesystem=self.fs,
206
+ filters=pq_filters,
207
+ # Toggle based on file count; False is safer for many tiny files.
208
+ gather_statistics=False,
209
+ ignore_metadata_file=True,
210
+ )
211
+
212
+ if residual_expr is not None:
213
+ dd_result = dd_result[residual_expr.mask(dd_result)]
250
214
 
251
215
  return dd_result
252
216
 
253
217
  except FileNotFoundError as e:
254
- self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
255
- self.logger.debug("Returning empty DataFrame due to missing parquet files.")
256
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
218
+ self.logger.debug(f"Parquet not found at {paths_to_load}: {e}")
219
+ return self._empty_ddf()
257
220
  except Exception as e:
258
- self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
259
- self.logger.debug("Returning empty DataFrame due to loading error.")
260
- return dd.from_pandas(pd.DataFrame(), npartitions=1)
221
+ self.logger.debug(f"Parquet load failed for {paths_to_load}: {e}")
222
+ return self._empty_ddf()
261
223
 
224
+ # ------------------------- internals -------------------------
262
225
 
263
- @staticmethod
264
- def ensure_file_extension(filepath: str, extension: str) -> str:
226
+ def _resolve_paths_for_read(self) -> List[str]:
265
227
  """
266
- Ensures that the specified file has the desired extension. If the file already has the
267
- specified extension, it returns the filepath unchanged. Otherwise, it updates the file
268
- extension to the given one and returns the modified filepath.
228
+ Builds a list of path patterns for dask.read_parquet.
229
+ """
230
+ # Date-ranged folders
231
+ if self.parquet_folder_list:
232
+ dirs = {self._dirname(p) for p in self.parquet_folder_list}
233
+ return [d.rstrip("/") + "/*.parquet" for d in sorted(dirs)]
269
234
 
270
- :param filepath: The path to the file as a string.
271
- :param extension: The desired file extension, without the leading dot.
272
- :return: The updated file path as a string, ensuring it has the specified extension.
235
+ # Single file or dataset root
236
+ if not self.parquet_full_path:
237
+ return []
238
+
239
+ if self.parquet_full_path.endswith(".parquet"):
240
+ return [self.parquet_full_path]
241
+
242
+ # Directory dataset
243
+ return [self.parquet_full_path.rstrip("/") + "/*.parquet"]
244
+
245
+ def _get_mtime(self, path: str) -> Optional[dt.datetime]:
246
+ """
247
+ Returns a timezone-aware datetime for the path's modification time if available.
273
248
  """
249
+ try:
250
+ info = self.fs.info(path)
251
+ except Exception:
252
+ return None
253
+
254
+ mtime = info.get("mtime") or info.get("last_modified") or info.get("LastModified")
255
+ if isinstance(mtime, (int, float)):
256
+ return dt.datetime.fromtimestamp(mtime, tz=dt.timezone.utc)
257
+ if isinstance(mtime, str):
258
+ # ISO 8601 or RFC 3339 common form
259
+ try:
260
+ return dt.datetime.fromisoformat(mtime.replace("Z", "+00:00"))
261
+ except ValueError:
262
+ return None
263
+ if hasattr(mtime, "tzinfo"):
264
+ return mtime
265
+ return None
266
+
267
+ def _exists(self, path: str) -> bool:
268
+ try:
269
+ return self.fs.exists(path)
270
+ except Exception:
271
+ return False
272
+
273
+ @staticmethod
274
+ def _dirname(p: str) -> str:
275
+ # Keep URL semantics stable (S3/HTTP/…)
276
+ return posixpath.dirname(p.rstrip("/"))
277
+
278
+ @staticmethod
279
+ def _empty_ddf() -> dd.DataFrame:
280
+ return dd.from_pandas(pd.DataFrame(), npartitions=1)
281
+
282
+ @staticmethod
283
+ def ensure_file_extension(filepath: str, extension: str) -> str:
274
284
  path = Path(filepath)
275
285
  return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
286
+
287
+ # import datetime
288
+ # from pathlib import Path
289
+ # from typing import Optional, List
290
+ #
291
+ # import dask.dataframe as dd
292
+ # import fsspec
293
+ # import pandas as pd
294
+ # from pydantic import BaseModel, model_validator, ConfigDict
295
+ #
296
+ # from sibi_dst.df_helper.core import FilterHandler
297
+ # from sibi_dst.utils import FilePathGenerator
298
+ # from sibi_dst.utils import Logger
299
+ #
300
+ #
301
+ # class ParquetConfig(BaseModel):
302
+ # """
303
+ # Represents configuration for managing and validating parquet file operations.
304
+ #
305
+ # The `ParquetConfig` class provides attributes and methods necessary to handle operations
306
+ # on parquet files in a file system. It includes functionalities for ensuring file paths
307
+ # and extensions, validating storage paths and parameters, determining file recency,
308
+ # and calculating the size of parquet files. This class is designed with flexibility to handle
309
+ # different file systems through the integration with `fsspec` and allows storage path validations
310
+ # with optional logging support.
311
+ #
312
+ # :ivar load_parquet: Indicates whether parquet data should be loaded based on the
313
+ # current configuration and validation.
314
+ # :type load_parquet: bool
315
+ # :ivar parquet_filename: The name of the parquet file, optional if folders are used.
316
+ # :type parquet_filename: Optional[str]
317
+ # :ivar parquet_storage_path: The base path for storing or retrieving parquet files.
318
+ # :type parquet_storage_path: Optional[str]
319
+ # :ivar parquet_full_path: The full path to a specific parquet file, derived from the
320
+ # storage path and filename when applicable.
321
+ # :type parquet_full_path: Optional[str]
322
+ # :ivar parquet_folder_list: A list of folder paths to parquet data, derived from start
323
+ # and end dates if specified.
324
+ # :type parquet_folder_list: Optional[List[str]]
325
+ # :ivar parquet_size_bytes: The total size of the parquet files, in bytes.
326
+ # :type parquet_size_bytes: int
327
+ # :ivar parquet_max_age_minutes: Maximum acceptable age of the most recent parquet file, in minutes.
328
+ # :type parquet_max_age_minutes: int
329
+ # :ivar parquet_is_recent: Indicates whether the parquet file is considered recent based
330
+ # on the `parquet_max_age_minutes` condition.
331
+ # :type parquet_is_recent: bool
332
+ # :ivar parquet_start_date: The start date for parquet file validation or file path generation.
333
+ # :type parquet_start_date: Optional[str]
334
+ # :ivar parquet_end_date: The end date for parquet file validation or file path generation.
335
+ # :type parquet_end_date: Optional[str]
336
+ # :ivar fs: The file system object used for storage operations, compliant with `fsspec`.
337
+ # :type fs: Optional[fsspec.spec.AbstractFileSystem]
338
+ # :ivar logger: A logger for handling logging operations.
339
+ # :type logger: Optional[Logger]
340
+ # """
341
+ # load_parquet: bool = False
342
+ # parquet_filename: Optional[str] = None
343
+ # parquet_storage_path: Optional[str] = None
344
+ # parquet_full_path: Optional[str] = None
345
+ # parquet_folder_list: Optional[List[str]] = None
346
+ # parquet_size_bytes: int = 0
347
+ # parquet_max_age_minutes: int = 0
348
+ # parquet_is_recent: bool = False
349
+ # parquet_start_date: Optional[str] = None
350
+ # parquet_end_date: Optional[str] = None
351
+ # fs: Optional[fsspec.spec.AbstractFileSystem] = None # Your fsspec filesystem object
352
+ # logger: Optional[Logger] = None
353
+ # debug: bool = False
354
+ # model_config = ConfigDict(arbitrary_types_allowed=True)
355
+ #
356
+ # @model_validator(mode='after')
357
+ # def check_parquet_params(self):
358
+ # """
359
+ # Validates and configures the parameters required for managing parquet files. This includes
360
+ # configuring paths through `fsspec`, identifying file storage paths, checking the validity of
361
+ # dates related to parquet files, ensuring proper parquet file extensions, and determining
362
+ # whether existing parquet files are recent and loadable.
363
+ #
364
+ # :return: The current instance with validated and migrated attributes configured for
365
+ # handling parquet files.
366
+ #
367
+ # :raises ValueError: If certain conditions are not met, such as missing or invalid
368
+ # `parquet_storage_path`, providing only one of
369
+ # `parquet_start_date` or `parquet_end_date`, or if the
370
+ # `parquet_end_date` is earlier than the `parquet_start_date`.
371
+ # """
372
+ # # Configure paths based on fsspec
373
+ # if self.logger is None:
374
+ # self.logger = Logger.default_logger(logger_name=self.__class__.__name__)
375
+ # self.logger.set_level(Logger.DEBUG if self.debug else Logger.INFO)
376
+ # if self.fs is None:
377
+ # raise ValueError('Parquet Options: File system (fs) must be specified')
378
+ #
379
+ # if self.parquet_storage_path is None:
380
+ # raise ValueError('Parquet storage path must be specified')
381
+ # self.parquet_storage_path = self.parquet_storage_path.rstrip('/')
382
+ # #if not self.fs.exists(self.parquet_storage_path):
383
+ # # self.fs.mkdirs(self.parquet_storage_path, exist_ok=True)
384
+ # # self.logger.debug(f'Parquet storage path {self.parquet_storage_path} does not exist')
385
+ # self.load_parquet = False
386
+ # if self.parquet_filename is not None:
387
+ # self.parquet_full_path = self.ensure_file_extension(
388
+ # filepath=self.fs.sep.join([str(self.parquet_storage_path), str(self.parquet_filename)]),
389
+ # extension='parquet'
390
+ # )
391
+ # self.parquet_is_recent = self.is_file_recent()
392
+ # self.load_parquet = self.parquet_is_recent and self.fs.exists(self.parquet_full_path)
393
+ #
394
+ # if self.parquet_start_date is not None:
395
+ # if self.parquet_end_date is None:
396
+ # raise ValueError('Parquet end date must be specified if start date is provided')
397
+ #
398
+ # start_date = datetime.datetime.strptime(self.parquet_start_date, '%Y-%m-%d')
399
+ # end_date = datetime.datetime.strptime(self.parquet_end_date, '%Y-%m-%d')
400
+ # if end_date < start_date:
401
+ # raise ValueError('Parquet end date must be greater than start date')
402
+ #
403
+ # # Saving to parquet is disabled when start and end dates are provided, as we will load parquet files
404
+ # self.parquet_folder_list = FilePathGenerator(str(self.parquet_storage_path), fs=self.fs,
405
+ # logger=self.logger).generate_file_paths(start_date, end_date)
406
+ #
407
+ # self.parquet_size_bytes = self.get_parquet_size_bytes()
408
+ # self.load_parquet = True
409
+ # # self.load_parquet = all([self.fs.exists(folder) for folder in self.parquet_folder_list]) and self.parquet_size_bytes > 0
410
+ # elif self.parquet_end_date is not None:
411
+ # raise ValueError('Parquet start date must be specified if end date is provided')
412
+ #
413
+ # return self
414
+ #
415
+ # def is_file_recent(self):
416
+ # """
417
+ # Determines whether the file at the specified parquet path is considered recent
418
+ # based on its modification time and the maximum age limit defined.
419
+ #
420
+ # The function first checks for the existence of the file at the specified
421
+ # `parquet_full_path`. If the file does not exist, the function will return
422
+ # False. If `parquet_max_age_minutes` is set to 0, it implies no maximum age
423
+ # limit, and the function will return True. Otherwise, it retrieves the file's
424
+ # last modified time and calculates the age of the file by comparing it with the
425
+ # current time. The function returns True if the file's age does not exceed the
426
+ # maximum age specified by `parquet_max_age_minutes`, otherwise it returns
427
+ # False.
428
+ #
429
+ # :return: Whether the file is considered recent based on its existence,
430
+ # modification time, and maximum age limit.
431
+ # :rtype: bool
432
+ # """
433
+ # if not self.fs.exists(self.parquet_full_path):
434
+ # return False
435
+ # if self.parquet_max_age_minutes == 0:
436
+ # return True
437
+ # file_time = datetime.datetime.fromtimestamp(self.fs.modified(self.parquet_full_path))
438
+ # return (datetime.datetime.now() - file_time) <= datetime.timedelta(minutes=self.parquet_max_age_minutes)
439
+ #
440
+ # def get_parquet_size_bytes(self):
441
+ # """
442
+ # Calculate the total size, in bytes, of all Parquet files within the defined
443
+ # folders specified by `parquet_folder_list`. The function iteratively goes
444
+ # through each folder in the provided list, applying a recursive wildcard
445
+ # search to include all levels of nested directories, and calculates the
446
+ # cumulative size of all found Parquet files using the file system's size
447
+ # retrieval method.
448
+ #
449
+ # :raises AttributeError: If `fs` or `parquet_folder_list` attributes are not set
450
+ # or improperly configured when the method is called.
451
+ # :raises NotImplementedError: If the `fs.size` or `fs.glob` methods are
452
+ # unimplemented in the provided file system object or it otherwise lacks
453
+ # necessary support for these operations.
454
+ #
455
+ # :return: The cumulative size of all Parquet files located in the folders
456
+ # defined by `parquet_folder_list`, measured in bytes.
457
+ # :rtype: int
458
+ # """
459
+ # total_size = 0
460
+ # for folder in self.parquet_folder_list:
461
+ # # Use a double wildcard ** to match any level of nested directories
462
+ # for path in self.fs.glob(f"{folder}/**/*.parquet"):
463
+ # total_size += self.fs.size(path)
464
+ # return total_size
465
+ #
466
+ # def load_files(self, **filters):
467
+ # """
468
+ # Loads parquet files into a Dask DataFrame based on the specified conditions.
469
+ # Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
470
+ # """
471
+ # if not self.load_parquet:
472
+ # self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
473
+ # return dd.from_pandas(pd.DataFrame(), npartitions=1)
474
+ #
475
+ # # Resolve paths
476
+ # paths_to_load = []
477
+ # if self.parquet_folder_list:
478
+ # import posixpath
479
+ # paths_to_load = sorted(set([posixpath.dirname(p) for p in self.parquet_folder_list]))
480
+ # paths_to_load = [p.rstrip("/") + "/*.parquet" for p in paths_to_load]
481
+ # elif self.parquet_full_path:
482
+ # paths_to_load = [self.parquet_full_path]
483
+ #
484
+ # if not paths_to_load:
485
+ # self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
486
+ # return dd.from_pandas(pd.DataFrame(), npartitions=1)
487
+ #
488
+ # # Prepare filters
489
+ # fh = None
490
+ # expr = None
491
+ # pq_filters = None
492
+ # residual_filters = None
493
+ # if filters:
494
+ # fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
495
+ #
496
+ # # Use the compiler + pushdown split so we don't double-apply
497
+ # try:
498
+ # # If you added split_pushdown_and_residual earlier:
499
+ # pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
500
+ # expr = fh.compile_filters(residual_filters) if residual_filters else None
501
+ # except AttributeError:
502
+ # # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
503
+ # expr = fh.compile_filters(filters)
504
+ # pq_filters = expr.to_parquet_filters()
505
+ #
506
+ # try:
507
+ # self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
508
+ #
509
+ # # Optional: prune columns. Keep it simple unless you want to compute from filters.
510
+ # columns = None # or a concrete list if you know it
511
+ #
512
+ # if fh and pq_filters:
513
+ # self.logger.debug(f"Applying Parquet filters: {pq_filters}")
514
+ # dd_result = dd.read_parquet(
515
+ # paths_to_load,
516
+ # engine="pyarrow",
517
+ # filesystem=self.fs, # your fsspec filesystem (e.g., s3fs)
518
+ # filters=pq_filters,
519
+ # columns=columns,
520
+ # gather_statistics=False, # uncomment if you have *many* files and don't need global stats
521
+ # ignore_metadata_file=True
522
+ # )
523
+ # # Apply only residual mask (if any)
524
+ # if expr is not None:
525
+ # dd_result = dd_result[expr.mask(dd_result)]
526
+ # else:
527
+ # dd_result = dd.read_parquet(
528
+ # paths_to_load,
529
+ # engine="pyarrow",
530
+ # filesystem=self.fs,
531
+ # columns=columns,
532
+ # gather_statistics=False,
533
+ # ignore_metadata_file=True
534
+ # )
535
+ # # If we didn't push down, but have filters, apply them here
536
+ # if expr is None and fh and filters:
537
+ # expr = fh.compile_filters(filters)
538
+ # if expr is not None:
539
+ # dd_result = dd_result[expr.mask(dd_result)]
540
+ #
541
+ # return dd_result
542
+ #
543
+ # except FileNotFoundError as e:
544
+ # self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
545
+ # self.logger.debug("Returning empty DataFrame due to missing parquet files.")
546
+ # return dd.from_pandas(pd.DataFrame(), npartitions=1)
547
+ # except Exception as e:
548
+ # self.logger.debug(f"Parquet loading failed for paths {paths_to_load}: {e}")
549
+ # self.logger.debug("Returning empty DataFrame due to loading error.")
550
+ # return dd.from_pandas(pd.DataFrame(), npartitions=1)
551
+ #
552
+ #
553
+ # @staticmethod
554
+ # def ensure_file_extension(filepath: str, extension: str) -> str:
555
+ # """
556
+ # Ensures that the specified file has the desired extension. If the file already has the
557
+ # specified extension, it returns the filepath unchanged. Otherwise, it updates the file
558
+ # extension to the given one and returns the modified filepath.
559
+ #
560
+ # :param filepath: The path to the file as a string.
561
+ # :param extension: The desired file extension, without the leading dot.
562
+ # :return: The updated file path as a string, ensuring it has the specified extension.
563
+ # """
564
+ # path = Path(filepath)
565
+ # return str(path.with_suffix(f".{extension}")) if path.suffix != f".{extension}" else filepath
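
For orientation, here is a minimal usage sketch of the reworked ParquetConfig API shown on the right-hand side of this diff. The import path, storage layout, and filter column are assumptions for illustration; fs can be any fsspec filesystem, and the logger is created by the validator when omitted.

import fsspec
from sibi_dst.df_helper import ParquetConfig  # import path is an assumption

fs = fsspec.filesystem("file")  # any fsspec-compatible filesystem works

cfg = ParquetConfig(
    parquet_storage_path="/data/warehouse/orders",  # assumed dataset root
    parquet_start_date="2025-08-01",
    parquet_end_date="2025-08-07",
    fs=fs,
    debug=True,
)

# Heavy I/O is now explicit rather than happening inside the validator.
cfg.determine_recency()
cfg.compute_parquet_size_bytes()

# Filters may be pushed down to pyarrow; residuals are applied as a Dask mask.
ddf = cfg.load_files(status="active")  # 'status' is an illustrative column
print(len(ddf))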
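The recency check replaces the old naive fs.modified() comparison with timezone-aware handling, because fsspec backends report modification time in different shapes (epoch seconds, ISO strings, datetime objects). A standalone sketch of that normalization and age test, independent of the class, looks like this; the sample values are made up.

import datetime as dt

def normalize_mtime(raw):
    """Coerce an fsspec mtime-style value to an aware UTC datetime, or None."""
    if isinstance(raw, (int, float)):
        return dt.datetime.fromtimestamp(raw, tz=dt.timezone.utc)
    if isinstance(raw, str):
        try:
            return dt.datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            return None
    if hasattr(raw, "tzinfo"):
        return raw if raw.tzinfo else raw.replace(tzinfo=dt.timezone.utc)
    return None

def is_recent(raw_mtime, max_age_minutes: int) -> bool:
    if max_age_minutes == 0:  # 0 means "no recency limit"
        return True
    mdt = normalize_mtime(raw_mtime)
    if mdt is None:
        return False
    return (dt.datetime.now(dt.timezone.utc) - mdt) <= dt.timedelta(minutes=max_age_minutes)

print(is_recent(1723075200, 0))               # epoch seconds, no limit
print(is_recent("2025-08-08T06:00:00Z", 60))  # ISO string with Z suffix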
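Path resolution is now centralized in _resolve_paths_for_read. The rules: a date-ranged folder list is collapsed to its distinct parent directories and globbed for *.parquet, an explicit .parquet file is passed through unchanged, and any other path is treated as a directory dataset. A self-contained sketch of those rules, with illustrative S3-style paths, follows; the real layout comes from FilePathGenerator.

import posixpath
from typing import List, Optional

def resolve_paths(folder_list: Optional[List[str]], full_path: Optional[str]) -> List[str]:
    # Date-ranged paths: glob the distinct parent directories.
    if folder_list:
        dirs = {posixpath.dirname(p.rstrip("/")) for p in folder_list}
        return [d.rstrip("/") + "/*.parquet" for d in sorted(dirs)]
    if not full_path:
        return []
    # Explicit single file.
    if full_path.endswith(".parquet"):
        return [full_path]
    # Directory dataset.
    return [full_path.rstrip("/") + "/*.parquet"]

print(resolve_paths(
    ["s3://bucket/ds/2025/08/01/part.parquet", "s3://bucket/ds/2025/08/02/part.parquet"],
    None,
))
print(resolve_paths(None, "s3://bucket/ds/snapshot.parquet"))
print(resolve_paths(None, "s3://bucket/ds"))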
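load_files splits incoming filters into pyarrow-compatible pushdown predicates and a residual Dask mask (via FilterHandler.split_pushdown_and_residual when that helper is available). Stripped of the helper classes, the underlying pattern is roughly the following; the paths, columns, and predicates are placeholders, and keyword availability in dd.read_parquet can vary by Dask version.

import dask.dataframe as dd

paths = ["/data/warehouse/orders/2025/08/01/*.parquet"]  # placeholder layout

# Predicates pyarrow can evaluate per row group (disjunctive normal form).
pq_filters = [("status", "==", "active"), ("amount", ">", 0)]

ddf = dd.read_parquet(
    paths,
    filters=pq_filters,          # pushdown: skips non-matching row groups
    ignore_metadata_file=True,   # mirrors the setting used in this diff
)

# Residual predicate that Parquet filters cannot express, applied as a mask.
ddf = ddf[ddf["customer_name"].str.contains("ltd", case=False, na=False)]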