oups 2025.9.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of oups might be problematic. Click here for more details.

Files changed (43) hide show
  1. oups/__init__.py +40 -0
  2. oups/date_utils.py +62 -0
  3. oups/defines.py +26 -0
  4. oups/numpy_utils.py +114 -0
  5. oups/stateful_loop/__init__.py +14 -0
  6. oups/stateful_loop/loop_persistence_io.py +55 -0
  7. oups/stateful_loop/stateful_loop.py +654 -0
  8. oups/stateful_loop/validate_loop_usage.py +338 -0
  9. oups/stateful_ops/__init__.py +22 -0
  10. oups/stateful_ops/aggstream/__init__.py +12 -0
  11. oups/stateful_ops/aggstream/aggstream.py +1524 -0
  12. oups/stateful_ops/aggstream/cumsegagg.py +580 -0
  13. oups/stateful_ops/aggstream/jcumsegagg.py +416 -0
  14. oups/stateful_ops/aggstream/segmentby.py +1018 -0
  15. oups/stateful_ops/aggstream/utils.py +71 -0
  16. oups/stateful_ops/asof_merger/__init__.py +11 -0
  17. oups/stateful_ops/asof_merger/asof_merger.py +750 -0
  18. oups/stateful_ops/asof_merger/get_config.py +401 -0
  19. oups/stateful_ops/asof_merger/validate_params.py +285 -0
  20. oups/store/__init__.py +15 -0
  21. oups/store/filepath_utils.py +68 -0
  22. oups/store/indexer.py +457 -0
  23. oups/store/ordered_parquet_dataset/__init__.py +19 -0
  24. oups/store/ordered_parquet_dataset/metadata_filename.py +50 -0
  25. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/__init__.py +15 -0
  26. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/base.py +863 -0
  27. oups/store/ordered_parquet_dataset/ordered_parquet_dataset/read_only.py +252 -0
  28. oups/store/ordered_parquet_dataset/parquet_adapter.py +157 -0
  29. oups/store/ordered_parquet_dataset/write/__init__.py +19 -0
  30. oups/store/ordered_parquet_dataset/write/iter_merge_split_data.py +131 -0
  31. oups/store/ordered_parquet_dataset/write/merge_split_strategies/__init__.py +22 -0
  32. oups/store/ordered_parquet_dataset/write/merge_split_strategies/base.py +784 -0
  33. oups/store/ordered_parquet_dataset/write/merge_split_strategies/n_rows_strategy.py +297 -0
  34. oups/store/ordered_parquet_dataset/write/merge_split_strategies/time_period_strategy.py +319 -0
  35. oups/store/ordered_parquet_dataset/write/write.py +270 -0
  36. oups/store/store/__init__.py +11 -0
  37. oups/store/store/dataset_cache.py +50 -0
  38. oups/store/store/iter_intersections.py +397 -0
  39. oups/store/store/store.py +345 -0
  40. oups-2025.9.5.dist-info/LICENSE +201 -0
  41. oups-2025.9.5.dist-info/METADATA +44 -0
  42. oups-2025.9.5.dist-info/RECORD +43 -0
  43. oups-2025.9.5.dist-info/WHEEL +4 -0
@@ -0,0 +1,68 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Wed Dec 4 21:30:00 2021.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+ from collections.abc import Iterator
9
+ from pathlib import Path
10
+
11
+
12
def files_at_depth(basepath: Path, depth: int = 2) -> Iterator[tuple[Path, list[str]]]:
    """
    Yield file list in dirs.

    Generator yielding a tuple which:
      - 1st value is the path of a non-empty directory at 'depth' sublevel,
        counting from 'basepath'.
      - 2nd value is the list of files in this directory.

    Parameters
    ----------
    basepath : Path
        Path to directory from which scanning.
    depth : int, default 2
        Number of levels for directories to be retained (includes top level).
        By default, at least 2 levels.

    Yields
    ------
    Iterator[tuple[Path, list[str]]]
        List of files within directory specified by the key. Empty directories
        are not returned.

    """
    if not basepath.exists():
        return
    if depth == 0:
        # Target depth reached: emit this directory if it holds any file.
        filenames = [entry.name for entry in basepath.iterdir() if not entry.is_dir()]
        if filenames:
            yield basepath, filenames
    if depth > 0:
        try:
            subdirs = [entry for entry in basepath.iterdir() if entry.is_dir()]
        except FileNotFoundError:
            # Directory vanished since the exists() check: nothing to yield.
            return
        for subdir in subdirs:
            yield from files_at_depth(subdir, depth - 1)
51
+
52
+
53
def remove_dir(path: Path):
    """
    Remove directory and all its contents.

    Symlinks are unlinked, never followed into.

    Parameters
    ----------
    path : Path
        Path to directory to be removed.

    """
    if path.is_symlink() or path.is_file():
        # Plain file or symlink: a single unlink suffices.
        path.unlink()
    else:
        # Depth-first removal of children, then the (now empty) directory.
        for child in path.iterdir():
            remove_dir(child)
        path.rmdir()
oups/store/indexer.py ADDED
@@ -0,0 +1,457 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Wed Dec 1 18:35:00 2021.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+ from collections.abc import Callable
9
+ from collections.abc import Hashable
10
+ from collections.abc import Iterator
11
+ from dataclasses import dataclass
12
+ from dataclasses import fields
13
+ from dataclasses import is_dataclass
14
+ from os.path import sep
15
+ from pathlib import Path
16
+ from typing import Any
17
+ from typing import Protocol
18
+ from typing import runtime_checkable
19
+
20
+
21
@runtime_checkable
class StoreKey(Protocol, Hashable):
    """
    Structural interface for a store key.

    A key is hashable, orderable ('__lt__') and convertible to a filesystem
    path ('to_path').
    """

    def __lt__(self, other: object) -> bool: ...

    def to_path(self) -> Path: ...
26
+
27
+
28
@runtime_checkable
class TopLevelIndexer[K: StoreKey](Protocol):
    """
    Structural interface of a '@toplevel'-decorated indexer class.

    Exposes the fields separator, the number of levels, and alternate
    constructors parsing a key back from its string or path form.
    """

    # Character separating fields within a single path part.
    field_sep: str
    # Number of directory levels encoded by a key (top level included).
    depth: int

    @classmethod
    def from_path(cls, source: str | Path) -> K | None: ...

    @classmethod
    def from_str(cls, source: str) -> K | None: ...
38
+
39
+
40
# Accepted key field types.
# Float removed to prevent having '.' in field values.
TYPE_ACCEPTED = {int, str}
# Default fields separator, if not modified by user.
DEFAULT_FIELD_SEP = "-"
# Characters forbidden in field value.
# 'field_sep' is also included at runtime before check.
FORBIDDEN_CHARS = (sep, ".")
# Attribute / method names looked up or set on '@toplevel'-decorated classes.
KEY_FIELD_SEP = "field_sep"
KEY_FROM_PATH = "from_path"
KEY_DEPTH = "depth"
50
+
51
+
52
+ def _is_dataclass_instance(obj: Any) -> bool:
53
+ # Check if a class is an instance of a dataclass and not a dataclass
54
+ # itself, as per
55
+ # https://docs.python.org/3/library/dataclasses.html#dataclasses.is_dataclass
56
+ return is_dataclass(obj) and not isinstance(obj, type)
57
+
58
+
59
+ def _dataclass_instance_to_dict(obj: Any) -> dict:
60
+ # Shallow copy, not possible to use 'asdict()', as per
61
+ # https://docs.python.org/3/library/dataclasses.html#dataclasses.asdict
62
+ return {field.name: getattr(obj, field.name) for field in fields(obj)}
63
+
64
+
65
def _dataclass_instance_to_lists(obj: Any) -> Iterator[list[Any]]:
    """
    Yield items as lists of fields values.

    If a new dataclass instance is the last field, its fields values are
    yielded as next item, and so on...

    Parameters
    ----------
    obj : dataclass instance
        Top-level key or nested sublevel dataclass instance. May contain
        nested dataclass instances.

    Returns
    -------
    Iterator[list[Any]]
        Yields list of fields values, one list per nesting level.

    """
    # Named 'values' (not 'fields') so the 'dataclasses.fields' module import
    # is not shadowed.
    values = list(_dataclass_instance_to_dict(obj).values())
    if values:
        yield values
        # Only the last field may hold a nested sublevel dataclass; recurse
        # into it to yield deeper levels.
        if _is_dataclass_instance(values[-1]):
            yield from _dataclass_instance_to_lists(values[-1])
89
+
90
+
91
def _validate_toplevel_instance(toplevel: StoreKey):
    """
    Validate a 'toplevel'-decorated data class instance.

    Check field type is only among 'int', 'str' or another dataclass instance.
    Check that there is at most only one dataclass instance per nesting level,
    and if present, it is neither the only field of the level nor anywhere but
    in last position.

    Parameters
    ----------
    toplevel : StoreKey
        Top-level key instance produced by '@toplevel'.

    Raises
    ------
    TypeError
        If a nested dataclass field is misplaced, or a field type is not
        accepted.
    ValueError
        If a field value contains a forbidden character.

    """
    # 'field_sep' is forbidden inside field values, otherwise parsing the
    # string form back would be ambiguous.
    forbidden_chars = (toplevel.field_sep, *FORBIDDEN_CHARS)
    for fields_ in _dataclass_instance_to_lists(toplevel):
        number_of_fields = len(fields_)
        for counter, field in enumerate(fields_):
            if _is_dataclass_instance(field):
                if number_of_fields == 1:
                    # A dataclass instance cannot be the only field.
                    raise TypeError(
                        "a dataclass instance cannot be the only field of a level.",
                    )
                if counter + 1 != number_of_fields:
                    # A dataclass instance cannot be anywhere but last.
                    raise TypeError(
                        "a dataclass instance is only possible in last position.",
                    )
            else:
                # Plain field: check value then type.
                field_as_str = str(field)
                if any(symb in field_as_str for symb in forbidden_chars):
                    raise ValueError(
                        f"use of a forbidden character among {forbidden_chars} "
                        f"is not possible in {field_as_str}.",
                    )
                # In this branch 'field' is known not to be a dataclass
                # instance, so only the accepted-types membership matters.
                if type(field) not in TYPE_ACCEPTED:
                    raise TypeError(f"field type {type(field)} not possible.")
136
+
137
+
138
def _dataclass_instance_format(toplevel: StoreKey, to_path: bool = False) -> str | Path:
    """
    Return a key instance formatted as a string or Path object.

    Also bound as '__str__' on '@toplevel'-decorated classes.

    Parameters
    ----------
    toplevel : StoreKey
        Top-level key instance.
    to_path : bool, default False
        If True, return a Path object;
        If False, return a string.

    Returns
    -------
    str | Path
        Formatted representation of the key instance.

    """
    fields_lists = list(_dataclass_instance_to_lists(toplevel))
    # Relying on the fact that only the tail can be a dataclass instance:
    # each level's last field is dropped here, as it is either the nested
    # sublevel (rendered by the next list) or the very last plain field
    # (re-appended just below).
    path_parts = [toplevel.field_sep.join(map(str, fields_[:-1])) for fields_ in fields_lists]
    # Handle the last field of the final level. If the final level has a
    # single field, its part is still empty and no separator is prepended.
    path_parts[-1] += (
        f"{toplevel.field_sep}{fields_lists[-1][-1]!s}" if path_parts[-1] else str(fields_lists[-1][-1])
    )
    return Path(*path_parts) if to_path else toplevel.field_sep.join(path_parts)
164
+
165
+
166
def _dataclass_instance_format_to_path(toplevel: StoreKey) -> Path:
    """
    Return a key instance formatted as a Path object.

    Thin wrapper over '_dataclass_instance_format' with 'to_path=True';
    bound as the 'to_path' instance method of '@toplevel'-decorated classes.

    Parameters
    ----------
    toplevel : StoreKey
        Top-level key instance.

    Returns
    -------
    Path
        Path object representing the key instance.

    """
    return _dataclass_instance_format(toplevel, to_path=True)
182
+
183
+
184
def _dataclass_fields_types_to_lists(cls: TopLevelIndexer[StoreKey]) -> list[list[Any]]:
    """
    Type of fields in dataclass returned in lists.

    Return the type of each field, one list per level, and all levels in a
    list.

    Parameters
    ----------
    cls : TopLevelIndexer[StoreKey]
        Top-level indexer class produced by '@toplevel'.

    Returns
    -------
    list[list[type]]
        List of field-types lists, one list per level.

    """
    per_level_types = [[f.type for f in fields(cls)]]
    # Only the last field of a level can be a nested sublevel dataclass;
    # descend through the tail until it is a plain type.
    while True:
        tail_type = per_level_types[-1][-1]
        if not is_dataclass(tail_type):
            break
        per_level_types.append([f.type for f in fields(tail_type)])
    return per_level_types
206
+
207
+
208
def _dataclass_instance_from_source[K: StoreKey](
    cls: TopLevelIndexer[K],
    source: str | Path,
) -> K | None:
    """
    Return a dataclass instance derived from input string or Path object.

    If dataclass '__init__' fails, `None` is returned.

    Parameters
    ----------
    cls : TopLevelIndexer[K]
        Top-level indexer class to be used for generating a key instance.
    source : str | Path
        String or Path representation of the key instance.

    Returns
    -------
    K | None
        Key instance derived from input, or None if parsing fails.

    """
    types = _dataclass_fields_types_to_lists(cls)
    # Split string into different fields, depending on the type of source.
    field_sep = cls.field_sep
    if isinstance(source, Path):
        # Path case: split each part by field_sep
        strings_as_list = [substring for part in source.parts for substring in part.split(field_sep)]
    else:
        # String case: split only by field_sep (no directory separator).
        strings_as_list = source.split(field_sep)
    # Manages last level first: the deepest level is built, then wrapped as
    # the last field of the level above, and so on up to the top level.
    level_types = types.pop()  # remove last element
    level_length = len(level_types)
    try:
        level = [
            field_type(field_as_string)
            for field_type, field_as_string in zip(level_types, strings_as_list[-level_length:], strict=False)
        ]
        while types:
            # Drop the substrings already consumed by the deeper level.
            strings_as_list = strings_as_list[:-level_length]
            level_types = types.pop()  # remove last element
            level_length = len(level_types) - 1
            # Relying on the fact that a dataclass is necessarily the last
            # field.
            level = [
                field_type(field_as_string)
                for field_type, field_as_string in zip(
                    level_types[:-1],
                    strings_as_list[-level_length:],
                    strict=False,
                )
            ] + [level_types[-1](*level)]
        # 'check=False': skip '_validate_toplevel_instance', the source being
        # parsed is presumably a previously-validated key — TODO confirm.
        return cls(*level, check=False)
    except (TypeError, ValueError):
        # TypeError if the number of arguments for instantiation of a
        # dataclass is not correct (meaning the split has not been done
        # with the right 'field_sep' character).
        # ValueError if there is a type mismatch, for instance when 'int'
        # is initialized from a string.
        return None
269
+
270
+
271
+ def _get_depth(obj: type) -> int:
272
+ """
273
+ Return number of levels, including 'toplevel'.
274
+
275
+ To be decorated with '@property'.
276
+
277
+ Parameters
278
+ ----------
279
+ obj : type
280
+ Top-level indexer class.
281
+
282
+ Returns
283
+ -------
284
+ int
285
+ Number of levels (including toplevel).
286
+
287
+ """
288
+ depth = 1
289
+ level = obj
290
+ while is_dataclass(level := fields(level)[-1].type):
291
+ depth += 1
292
+ return depth
293
+
294
+
295
def _reduce(obj: TopLevelIndexer[StoreKey]) -> tuple[Callable, tuple[str]]:
    """
    Reduce function for making 'Indexer' serializable.

    Pickles a key through its string form: unpickling re-parses the string
    with 'from_str'.

    Parameters
    ----------
    obj : TopLevelIndexer[StoreKey]
        Top-level indexer class.

    Returns
    -------
    tuple[Callable, tuple[str]]
        See '__reduce__' standard interface.
        https://docs.python.org/3/library/pickle.html#object.__reduce__

    """
    reconstructor = obj.from_str
    state = (str(obj),)
    return reconstructor, state
312
+
313
+
314
class TopLevel(type):
    """
    Metaclass defining class properties of '@toplevel'-decorated class.

    Implementing 'field_sep' and 'depth' as metaclass properties makes them
    read-only attributes reachable on the class object itself.
    """

    @property
    def field_sep(cls) -> str:
        """
        Return field separator.
        """
        return cls._field_sep

    @property
    def depth(cls) -> int:
        """
        Return depth, i.e. number of levels.
        """
        return cls._depth
332
+
333
+
334
def toplevel(index_class=None, *, field_sep: str = DEFAULT_FIELD_SEP) -> type | Callable:
    """
    Turn decorated class into an indexing schema.

    Decorated class is equipped with methods and attributes to use with a
    ``Store`` instance.
    It has to be defined as one would define a class decorated by
    ``@dataclass``.

    Parameters
    ----------
    index_class : type, optional
        Class being decorated; None when the decorator is called with
        parameters, e.g. ``@toplevel(field_sep='.')``.
    field_sep : str, default '-'
        Character to use as separator between fields of the dataclass.

    Returns
    -------
    type
        Decorated class (a toplevel indexer class).

    Attributes
    ----------
    field_sep: str
        Fields separator (can't assign).
    depth: int
        Number of levels, including 'toplevel' (can't assign).

    Notes
    -----
    ``@dataclass`` is actually called when decorating with ``@toplevel`` with
    parameters set to:

      - ``order=True``,
      - ``frozen=True``

    When class is instantiated, a validation step is conducted on attributes
    types and values.

      - An instance can only be composed with ``int``, ``str`` or a dataclass
        object coming in last position;
      - Value of attribute can not incorporate forbidden characters like ``/``
        and ``self.field_sep``.

    """

    def create_toplevel_class(index_class):
        # Re-create 'index_class' as a 'TopLevel'-inheriting class to equip it
        # with class properties 'depth' and 'field_sep'
        # (as per https://stackoverflow.com/questions/5120688)
        # Explicitly add property to OtherClass.__dict__
        # (as per https://stackoverflow.com/questions/70233891)
        d = dict(index_class.__dict__)
        d.update({KEY_FIELD_SEP: TopLevel.field_sep, KEY_DEPTH: TopLevel.depth})
        index_class = TopLevel(index_class.__name__, index_class.__bases__, d)
        # Wrap with `@dataclass`.
        # TODO: when python 3.10 is more wide spread, set 'slot=True' to save
        # RAM.
        index_class = dataclass(index_class, order=True, frozen=True)

        # Equip 'index_class' with what is needed to be a 'toplevel'.
        # Dunders: modified '__init__', modified '__str__'
        # Copy of original __init__ to call it without recursion.
        index_class_init = index_class.__init__

        def __init__(self, *args, check: bool = True, **kws):
            # 'check=False' lets internal callers (e.g. 'from_str') skip
            # re-validation of already-parsed values.
            index_class_init(self, *args, **kws)
            if check:
                # Validate dataclass instance.
                _validate_toplevel_instance(self)

        index_class.__init__ = __init__
        index_class.__str__ = _dataclass_instance_format

        # Class properties: 'field_sep', 'depth'
        index_class._field_sep = field_sep
        index_class._depth = _get_depth(index_class)

        # Class instance method: 'to_path'
        index_class.to_path = _dataclass_instance_format_to_path

        # Classmethods: 'from_str', 'from_path'.
        # Same parser handles both forms: it dispatches on the source type.
        index_class.from_path = classmethod(_dataclass_instance_from_source)
        index_class.from_str = classmethod(_dataclass_instance_from_source)

        # Serialization.
        index_class.__reduce__ = _reduce

        return index_class

    if index_class:
        # Calling decorator without other parameters.
        return create_toplevel_class(index_class)
    # Calling decorator with other parameters.
    return create_toplevel_class
428
+
429
+
430
def is_toplevel(toplevel: Any) -> bool:
    """
    Return `True` if `toplevel`-decorated class.

    Returns 'True' if 'toplevel' (class or instance) has been decorated with
    '@toplevel'. It checks presence of a 'field_sep' attribute and a callable
    'from_path' attribute.

    """
    from_path = getattr(toplevel, KEY_FROM_PATH, None)
    return callable(from_path) and hasattr(toplevel, KEY_FIELD_SEP)
439
+
440
+
441
def sublevel(index_class):
    """
    Define a subdirectory level.

    This decorator really is an alias of ``@dataclass`` decorator, with
    parameters set to:

      - ``order=True``,
      - ``frozen=True``

    """
    # TODO: when python 3.10 is more wide spread, set 'slot=True' to save RAM.
    frozen_ordered_dataclass = dataclass(order=True, frozen=True)
    return frozen_ordered_dataclass(index_class)
455
+
456
+
457
+ # TODO: deep copy of an Indexer is currently not possible. Work this out.
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Sun May 18 16:00:00 2025.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+ from .ordered_parquet_dataset import OrderedParquetDataset
9
+ from .parquet_adapter import check_cmidx
10
+ from .parquet_adapter import conform_cmidx
11
+ from .write import write
12
+
13
+
14
+ __all__ = [
15
+ "OrderedParquetDataset",
16
+ "check_cmidx",
17
+ "conform_cmidx",
18
+ "write",
19
+ ]
@@ -0,0 +1,50 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Sat May 24 18:00:00 2025.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+ from pathlib import Path
9
+
10
+
11
# Suffix identifying an opd metadata file, appended to the dataset dir name.
OPDMD_EXTENSION = "_opdmd"


def get_md_filepath(dirpath: Path) -> Path:
    """
    Get standardized opd metadata file path.

    The metadata file sits next to 'dirpath', carrying the same name plus
    the opd metadata suffix.

    Parameters
    ----------
    dirpath : Path
        The directory path to use in the file path.

    Returns
    -------
    Path
        The formatted file name.

    """
    md_name = f"{dirpath.name}{OPDMD_EXTENSION}"
    return dirpath.parent / md_name
30
+
31
+
32
def get_md_basename(filepath: str | Path) -> str | None:
    """
    Get the basename of the opd metadata file.

    Parameters
    ----------
    filepath : str | Path
        The file path from which extract the basename.

    Returns
    -------
    str | None
        The file basename stripped of the opd metadata extension if this
        extension is present, None otherwise.

    """
    if isinstance(filepath, str):
        filepath = Path(filepath)
    # Only a name ending with the opd metadata extension identifies an opd
    # metadata file; anything else yields None.
    return filepath.name[: -len(OPDMD_EXTENSION)] if filepath.name.endswith(OPDMD_EXTENSION) else None
@@ -0,0 +1,15 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Created on Tue Jun 10 18:00:00 2025.
4
+
5
+ @author: pierrot
6
+
7
+ """
8
+ from .base import OrderedParquetDataset
9
+ from .base import create_custom_opd
10
+
11
+
12
+ __all__ = [
13
+ "OrderedParquetDataset",
14
+ "create_custom_opd",
15
+ ]