kaparoo-python 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kaparoo/data/__init__.py CHANGED
@@ -2,9 +2,12 @@ __all__ = (
2
2
  "ConcatSequence",
3
3
  "DataSequence",
4
4
  "FileFolderSequence",
5
+ "FileListSequence",
5
6
  "SingleFileSequence",
6
7
  "SlicedSequence",
8
+ "TransformedSequence",
7
9
  "WindowedSequence",
10
+ "ZippedSequence",
8
11
  "generate_batches",
9
12
  )
10
13
 
@@ -12,8 +15,11 @@ from kaparoo.data.sequences import (
12
15
  ConcatSequence,
13
16
  DataSequence,
14
17
  FileFolderSequence,
18
+ FileListSequence,
15
19
  SingleFileSequence,
16
20
  SlicedSequence,
21
+ TransformedSequence,
17
22
  WindowedSequence,
23
+ ZippedSequence,
18
24
  generate_batches,
19
25
  )
@@ -4,9 +4,12 @@ __all__ = (
4
4
  "ConcatSequence",
5
5
  "DataSequence",
6
6
  "FileFolderSequence",
7
+ "FileListSequence",
7
8
  "SingleFileSequence",
8
9
  "SlicedSequence",
10
+ "TransformedSequence",
9
11
  "WindowedSequence",
12
+ "ZippedSequence",
10
13
  "generate_batches",
11
14
  )
12
15
 
@@ -14,10 +17,13 @@ from kaparoo.data.sequences.base import DataSequence
14
17
  from kaparoo.data.sequences.composers import (
15
18
  ConcatSequence,
16
19
  SlicedSequence,
20
+ TransformedSequence,
17
21
  WindowedSequence,
22
+ ZippedSequence,
18
23
  )
19
24
  from kaparoo.data.sequences.templates import (
20
25
  FileFolderSequence,
26
+ FileListSequence,
21
27
  SingleFileSequence,
22
28
  )
23
29
  from kaparoo.data.sequences.utils import generate_batches
@@ -1,15 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ("ConcatSequence", "SlicedSequence", "WindowedSequence")
3
+ __all__ = (
4
+ "ConcatSequence",
5
+ "SlicedSequence",
6
+ "TransformedSequence",
7
+ "WindowedSequence",
8
+ "ZippedSequence",
9
+ )
4
10
 
5
11
  from abc import abstractmethod
6
12
  from bisect import bisect_right
7
- from typing import TYPE_CHECKING
13
+ from typing import TYPE_CHECKING, cast
8
14
 
9
15
  from kaparoo.data.sequences.base import DataSequence
10
16
 
11
17
  if TYPE_CHECKING:
12
- from collections.abc import Sequence
18
+ from collections.abc import Callable, Sequence
13
19
 
14
20
 
15
21
  class SlicedSequence[T, M](DataSequence[T, M]):
@@ -59,6 +65,61 @@ class SlicedSequence[T, M](DataSequence[T, M]):
59
65
  return self._source.get_meta(self._indices[index])
60
66
 
61
67
 
68
+ class TransformedSequence[T_in, M_in, T_out = T_in, M_out = M_in](
69
+ DataSequence[T_out, M_out]
70
+ ):
71
+ """A view of `source` with `transform` applied lazily to each item.
72
+
73
+ `transform` is called on demand in `get_item`; nothing is loaded or
74
+ converted at construction time. `get_meta` passes through
75
+ `source.get_meta` unchanged by default -- override it in a subclass
76
+ when `M_out` differs from `M_in`.
77
+
78
+ Type Parameters:
79
+ T_in: Item type of `source`.
80
+ M_in: Metadata type of `source`.
81
+ T_out: Item type after the transform. Defaults to `T_in`.
82
+ M_out: Metadata type exposed by this view. Defaults to `M_in`.
83
+ When `M_out != M_in`, override `get_meta` in a subclass;
84
+ the default passthrough is only safe when `M_out == M_in`.
85
+
86
+ Example:
87
+ >>> # Item-only transform; metadata passes through unchanged.
88
+ >>> normalized = TransformedSequence(image_folder, normalize)
89
+
90
+ >>> # Meta transform via subclassing:
91
+ >>> class Augmented(TransformedSequence[ndarray, Path, ndarray, AugMeta]):
92
+ ... def get_meta(self, index: int) -> AugMeta:
93
+ ... return AugMeta(
94
+ ... path=self.source.get_meta(index),
95
+ ... applied="normalize",
96
+ ... )
97
+ """
98
+
99
+ def __init__(
100
+ self,
101
+ source: DataSequence[T_in, M_in],
102
+ transform: Callable[[T_in], T_out],
103
+ ) -> None:
104
+ self._source = source
105
+ self._transform = transform
106
+
107
+ @property
108
+ def source(self) -> DataSequence[T_in, M_in]:
109
+ """The wrapped sequence."""
110
+ return self._source
111
+
112
+ def __len__(self) -> int:
113
+ return len(self._source)
114
+
115
+ def get_item(self, index: int) -> T_out:
116
+ return self._transform(self._source.get_item(index))
117
+
118
+ def get_meta(self, index: int) -> M_out:
119
+ # Passthrough by default. Override when M_out != M_in.
120
+ return cast("M_out", self._source.get_meta(index))
121
+
122
+
62
123
  class ConcatSequence[T, M](DataSequence[T, M]):
63
124
  """The end-to-end concatenation of zero or more `sources`.
64
125
 
@@ -112,7 +173,7 @@ class ConcatSequence[T, M](DataSequence[T, M]):
112
173
  return source.get_meta(local)
113
174
 
114
175
 
115
- class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
176
+ class WindowedSequence[T, M_in, M_out = M_in](DataSequence[tuple[T, ...], M_out]):
116
177
  """An abstract sliding-window view over `source`.
117
178
 
118
179
  Each item is a tuple of `size` items from `source`, starting at
@@ -130,8 +191,8 @@ class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
130
191
  T: Item type of `source` (also the per-frame type within each
131
192
  window).
132
193
  M_in: Metadata type of `source` (per-frame metadata).
133
- M_out: Metadata type of the window. Determined by the
134
- subclass's `get_meta` return.
194
+ M_out: Metadata type of the window. Defaults to `M_in`.
195
+ Determined by the subclass's `get_meta` return.
135
196
 
136
197
  Args:
137
198
  source: The sequence to window over.
@@ -219,3 +280,115 @@ class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
219
280
  @abstractmethod
220
281
  def get_meta(self, index: int) -> M_out:
221
282
  raise NotImplementedError
283
+
284
+
285
+ class ZippedSequence[T1, T2, M1 = None, M2 = None](
286
+ DataSequence[tuple[T1, T2], tuple[M1, M2]]
287
+ ):
288
+ """Element-wise zip of two sequences.
289
+
290
+ Item `i` is `(first[i], second[i])` and metadata `i` is
291
+ `(first.get_meta(i), second.get_meta(i))` -- the "paired image + label"
292
+ pattern that `ConcatSequence` (end-to-end) cannot express.
293
+
294
+ With `strict=True` (the default) the two sequences must have the same
295
+ length; a mismatch raises `ValueError` at construction. With
296
+ `strict=False` the view is truncated to the shorter length, like the
297
+ builtin `zip`. For a different combined-metadata shape, subclass and
298
+ override `get_meta`.
299
+
300
+ Type Parameters:
301
+ T1: Item type of the first source.
302
+ T2: Item type of the second source.
303
+ M1: Metadata type of the first source. Defaults to `None`.
304
+ M2: Metadata type of the second source. Defaults to `None`.
305
+
306
+ Args:
307
+ first: The first sequence.
308
+ second: The second sequence.
309
+ strict: When True (default), require equal lengths and raise on a
310
+ mismatch. When False, truncate to the shorter length.
311
+
312
+ Raises:
313
+ ValueError: If `strict` is True and the sequences differ in length.
314
+
315
+ Example:
316
+ >>> pairs = ZippedSequence(images, labels)
317
+ >>> pairs[0] # (images[0], labels[0])
318
+ >>> pairs.get_meta(0) # (images.get_meta(0), labels.get_meta(0))
319
+ """
320
+
321
+ def __init__(
322
+ self,
323
+ first: DataSequence[T1, M1],
324
+ second: DataSequence[T2, M2],
325
+ *,
326
+ strict: bool = True,
327
+ ) -> None:
328
+ if strict and len(first) != len(second):
329
+ msg = f"sequences differ in length: {len(first)} != {len(second)}"
330
+ raise ValueError(msg)
331
+ self._first = first
332
+ self._second = second
333
+ self._length = len(first) if strict else min(len(first), len(second))
334
+
335
+ @property
336
+ def first(self) -> DataSequence[T1, M1]:
337
+ """The first wrapped sequence."""
338
+ return self._first
339
+
340
+ @property
341
+ def second(self) -> DataSequence[T2, M2]:
342
+ """The second wrapped sequence."""
343
+ return self._second
344
+
345
+ def __len__(self) -> int:
346
+ return self._length
347
+
348
+ def _normalize_index(self, index: int) -> int:
349
+ """Normalize a possibly-negative index and validate range.
350
+
351
+ Indices resolve against the zipped length (the shorter source when
352
+ `strict=False`), so they address the same position in both sources.
353
+
354
+ Raises:
355
+ IndexError: If `index` is outside `[-len(self), len(self))`.
356
+ """
357
+ n = self._length
358
+ original = index
359
+ if index < 0:
360
+ index += n
361
+ if not 0 <= index < n:
362
+ msg = f"index {original} out of range for length {n}"
363
+ raise IndexError(msg)
364
+ return index
365
+
366
+ def get_item(self, index: int) -> tuple[T1, T2]:
367
+ index = self._normalize_index(index)
368
+ return self._first.get_item(index), self._second.get_item(index)
369
+
370
+ def get_items(self, indices: Sequence[int]) -> Sequence[tuple[T1, T2]]:
371
+ # Normalize, then bulk-delegate so each source's `get_items`
372
+ # optimization is used.
373
+ normalized = [self._normalize_index(i) for i in indices]
374
+ return list(
375
+ zip(
376
+ self._first.get_items(normalized),
377
+ self._second.get_items(normalized),
378
+ strict=True,
379
+ )
380
+ )
381
+
382
+ def get_meta(self, index: int) -> tuple[M1, M2]:
383
+ index = self._normalize_index(index)
384
+ return self._first.get_meta(index), self._second.get_meta(index)
385
+
386
+ def get_metas(self, indices: Sequence[int]) -> Sequence[tuple[M1, M2]]:
387
+ normalized = [self._normalize_index(i) for i in indices]
388
+ return list(
389
+ zip(
390
+ self._first.get_metas(normalized),
391
+ self._second.get_metas(normalized),
392
+ strict=True,
393
+ )
394
+ )
@@ -1,6 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ("FileFolderSequence", "SingleFileSequence")
3
+ __all__ = ("FileFolderSequence", "FileListSequence", "SingleFileSequence")
4
4
 
5
5
  from abc import abstractmethod
6
6
  from pathlib import Path
@@ -11,14 +11,95 @@ from kaparoo.filesystem.existence import ensure_dir_exists, ensure_file_exists
11
11
  from kaparoo.filesystem.utils import stringify_paths, wrap_path
12
12
 
13
13
  if TYPE_CHECKING:
14
- from kaparoo.filesystem.types import StrPath
14
+ from kaparoo.filesystem.types import StrPath, StrPaths
15
15
 
16
16
 
17
- class FileFolderSequence[T, M = Path](DataSequence[T, M]):
18
- """A folder-rooted `DataSequence` whose items live in individual files.
17
+ class FileListSequence[T, M = Path](DataSequence[T, M]):
18
+ """A `DataSequence` over an explicit, ordered list of files.
19
19
 
20
- The base class handles file discovery, indexing, and root-relative
21
- path bookkeeping. Subclasses are responsible for three things:
20
+ Items live one-per-file; subclasses implement `load_file` and `get_meta`.
21
+ The files are given directly rather than discovered under a `root`, so
22
+ they may live in unrelated directories -- or, on Windows, on different
23
+ drives. (`FileFolderSequence` is the special case where the list is
24
+ discovered under a single root and stored relative to it.)
25
+
26
+ The given order is preserved verbatim and duplicates are kept; sort the
27
+ input yourself (`sorted(files, key=...)`) if a particular order is
28
+ needed. Paths are not checked for existence at construction; `load_file`
29
+ is called lazily on each `get_item`.
30
+
31
+ The base exposes:
32
+
33
+ - `files: tuple[Path, ...]` — full paths as an immutable snapshot.
34
+ - `get_file(index) -> Path` — full path of the i-th file.
35
+
36
+ Type Parameters:
37
+ T: Item type returned by `get_item`.
38
+ M: Per-item metadata type. Defaults to `Path`; override when the
39
+ metadata is something else (label, line number, ...).
40
+
41
+ Args:
42
+ files: The file paths to expose, in order.
43
+
44
+ Example:
45
+ >>> from pathlib import Path
46
+ >>> class BytesList(FileListSequence[bytes]):
47
+ ... def get_meta(self, index: int) -> Path:
48
+ ... return self.get_file(index)
49
+ ...
50
+ ... def load_file(self, path: Path) -> bytes:
51
+ ... return path.read_bytes()
52
+ >>>
53
+ >>> data = BytesList(["images/a.png", "/other/b.png"])
54
+ """
55
+
56
+ def __init__(self, files: StrPaths) -> None:
57
+ self._files = list(stringify_paths(files))
58
+
59
+ def __len__(self) -> int:
60
+ return len(self._files)
61
+
62
+ @property
63
+ def files(self) -> tuple[Path, ...]:
64
+ """Immutable snapshot of the full file paths, in order.
65
+
66
+ Returns a fresh `tuple[Path, ...]` on each access.
67
+ """
68
+ return tuple(self.get_file(i) for i in range(len(self)))
69
+
70
+ def get_file(self, index: int) -> Path:
71
+ """Full Path of the file at `index`."""
72
+ return Path(self._files[index])
73
+
74
+ def get_item(self, index: int) -> T:
75
+ return self.load_file(self.get_file(index))
76
+
77
+ @abstractmethod
78
+ def get_meta(self, index: int) -> M:
79
+ raise NotImplementedError
80
+
81
+ @abstractmethod
82
+ def load_file(self, path: Path) -> T:
83
+ """Decode a single file into an item of type `T`.
84
+
85
+ Called lazily on each `get_item` -- not at construction time.
86
+ Subclasses may freely use external libraries (PIL, librosa,
87
+ cv2, ...) to decode.
88
+ """
89
+ raise NotImplementedError
90
+
91
+
92
+ class FileFolderSequence[T, M = Path](FileListSequence[T, M]):
93
+ """A `FileListSequence` whose file list is discovered under a root.
94
+
95
+ The special case of `FileListSequence` where every file lives under one
96
+ base directory. The list is produced by `list_files(root)`, validated to
97
+ be under `root`, and stored in root-relative form so memory stays low for
98
+ large datasets and the paths survive a `root` relocation; `get_file`
99
+ transparently re-prepends `root`. `load_file`, `get_item`, `files`, and
100
+ `__len__` are inherited unchanged.
101
+
102
+ Subclasses are responsible for three things:
22
103
 
23
104
  - **`list_files(self, root)`** (abstract): return the full `Path`
24
105
  of every file to expose, in the desired order. Called once from
@@ -33,16 +114,9 @@ class FileFolderSequence[T, M = Path](DataSequence[T, M]):
33
114
  to `Path` and `get_meta(i)` can be the one-liner
34
115
  `return self.get_file(i)`.
35
116
 
36
- The base exposes:
117
+ The base adds, on top of `FileListSequence`:
37
118
 
38
119
  - `root: Path` — the base directory.
39
- - `files: tuple[Path, ...]` — full paths as an immutable snapshot.
40
- - `get_file(index) -> Path` — full path of the i-th file.
41
-
42
- Paths are kept internally in their root-relative form so that
43
- memory stays low for large datasets and the sequence survives
44
- `root` relocations; the conversion is transparent to subclasses
45
- and external callers.
46
120
 
47
121
  Parameterized subclasses:
48
122
  When a subclass needs instance-level options (e.g. `pattern`,
@@ -94,48 +168,20 @@ class FileFolderSequence[T, M = Path](DataSequence[T, M]):
94
168
 
95
169
  def __init__(self, root: StrPath) -> None:
96
170
  self._root = ensure_dir_exists(root)
97
- self._files = list(
98
- stringify_paths(self.list_files(self._root), after=self._root)
99
- )
100
-
101
- def __len__(self) -> int:
102
- return len(self._files)
171
+ # `after=root` makes each path root-relative and raises ValueError if
172
+ # any file is not under `root`. The base then stores the relative
173
+ # form; `get_file` re-prepends `root`.
174
+ super().__init__(stringify_paths(self.list_files(self._root), after=self._root))
103
175
 
104
176
  @property
105
177
  def root(self) -> Path:
106
178
  """The base directory the sequence was constructed from."""
107
179
  return self._root
108
180
 
109
- @property
110
- def files(self) -> tuple[Path, ...]:
111
- """Immutable snapshot of the full file paths this sequence exposes.
112
-
113
- Returns a fresh `tuple[Path, ...]` on each access, in the order
114
- established by `list_files`.
115
- """
116
- return tuple(self.get_file(i) for i in range(len(self)))
117
-
118
181
  def get_file(self, index: int) -> Path:
119
182
  """Full Path of the file at `index`."""
120
183
  return wrap_path(self._files[index], prepend=self._root)
121
184
 
122
- def get_item(self, index: int) -> T:
123
- return self.load_file(self.get_file(index))
124
-
125
- @abstractmethod
126
- def get_meta(self, index: int) -> M:
127
- raise NotImplementedError
128
-
129
- @abstractmethod
130
- def load_file(self, path: Path) -> T:
131
- """Decode a single file into an item of type `T`.
132
-
133
- Called lazily on each `get_item` -- not at construction time.
134
- Subclasses may freely use external libraries (PIL, librosa,
135
- cv2, ...) to decode.
136
- """
137
- raise NotImplementedError
138
-
139
185
  @abstractmethod
140
186
  def list_files(self, root: Path) -> list[Path]:
141
187
  """Return the full Path of every file to expose, in order.
@@ -38,6 +38,20 @@ if TYPE_CHECKING:
38
38
  # ========================== #
39
39
 
40
40
 
41
def _ensure_directory_target(path: Path, *, clean: bool) -> None:
    """Raise if `path` cannot be used as a directory target.

    Two situations are rejected with `NotADirectoryError`:

    - `path` exists but is not a directory;
    - `clean` is True and `path` is a symlink -- cleaning must act on a real
      directory, never through a link (which would otherwise reach the
      link's target).

    A symlink that points at a directory is therefore accepted only when
    `clean` is False. A path that does not exist is accepted.

    Args:
        path: The candidate directory path.
        clean: Whether the caller intends to wipe the directory.

    Raises:
        NotADirectoryError: In either of the situations listed above.
    """
    occupied_by_non_dir = path.exists() and not path.is_dir()
    # The symlink probe is only relevant (and only evaluated) when cleaning.
    if not occupied_by_non_dir and not (clean and path.is_symlink()):
        return
    msg = f"not a usable directory target: {path}"
    raise NotADirectoryError(msg)
53
+
54
+
41
55
  @overload
42
56
  def make_dir(
43
57
  path: StrPath,
@@ -88,9 +102,9 @@ def make_dir(
88
102
  Defaults to False.
89
103
  clean: Whether to recreate the directory empty when it already exists,
90
104
  removing its contents first. Only an existing *directory* is wiped;
91
- a non-directory still raises. Because the directory is removed and
92
- remade, `clean=True` makes `exist_ok` moot. **Destructive.**
93
- Defaults to False.
105
+ a non-directory -- or a symlink -- still raises. Because the
106
+ directory is removed and remade, `clean=True` makes `exist_ok`
107
+ moot. **Destructive.** Defaults to False.
94
108
  stringify: Whether to return the path as a string. Defaults to False.
95
109
 
96
110
  Returns:
@@ -100,15 +114,14 @@ def make_dir(
100
114
  Raises:
101
115
  ValueError: If `mode` is outside the range 0o1-0o7777
102
116
  (not checked on Windows, where the mode is ignored).
103
- NotADirectoryError: If the path exists but is not a directory.
117
+ NotADirectoryError: If the path exists but is not a directory, or
118
+ `clean` is True and the path is a symlink.
104
119
  OSError: If `exist_ok` is False, `clean` is False, and the path
105
120
  already exists.
106
121
  """
107
122
  _validate_mode(mode)
108
123
  path = Path(path)
109
- if path.exists() and not path.is_dir():
110
- msg = f"not a directory: {path}"
111
- raise NotADirectoryError(msg)
124
+ _ensure_directory_target(path, clean=clean)
112
125
  if clean and path.is_dir():
113
126
  shutil.rmtree(path)
114
127
  path.mkdir(mode=mode, parents=True, exist_ok=exist_ok)
@@ -170,9 +183,9 @@ def make_dirs(
170
183
  Defaults to False.
171
184
  clean: Whether to recreate each directory empty when it already exists,
172
185
  removing its contents first. Only an existing *directory* is wiped;
173
- a non-directory still raises. Because the directory is removed and
174
- remade, `clean=True` makes `exist_ok` moot. **Destructive.**
175
- Defaults to False.
186
+ a non-directory -- or a symlink -- still raises. Because the
187
+ directory is removed and remade, `clean=True` makes `exist_ok`
188
+ moot. **Destructive.** Defaults to False.
176
189
  stringify: Whether to return the paths as strings. Defaults to False.
177
190
 
178
191
  Returns:
@@ -183,15 +196,26 @@ def make_dirs(
183
196
  ValueError: If `mode` is outside the range 0o1-0o7777
184
197
  (not checked on Windows, where the mode is ignored).
185
198
  DirectoryNotFoundError: If `root` is provided and does not exist.
186
- NotADirectoryError: If `root` is provided and is not a directory.
199
+ NotADirectoryError: If `root` is provided and is not a directory, if
200
+ any path exists but is not a directory, or `clean` is True and
201
+ any path is a symlink.
187
202
  ValueError: If `root` is provided and any of the paths are absolute.
188
203
  OSError: If `exist_ok` is False, `clean` is False, and any of the
189
204
  paths already exist.
190
- OSError: If any of the paths are not directories.
205
+
206
+ Note:
207
+ Every path is validated (the non-directory / symlink checks above)
208
+ *before* any directory is wiped or created, so a deterministically
209
+ bad entry -- e.g. a file in the list -- fails without partially
210
+ cleaning earlier entries. Creation/cleanup is otherwise per-path and
211
+ not transactional, so a runtime failure (a race, a permission error)
212
+ partway through can still leave earlier entries created or cleaned.
191
213
  """
192
214
  _validate_mode(mode)
193
215
  paths = _join_root_if_provided(paths, root)
194
216
  directories = [Path(p) for p in paths]
217
+ for directory in directories:
218
+ _ensure_directory_target(directory, clean=clean)
195
219
  for directory in directories:
196
220
  if clean and directory.is_dir():
197
221
  shutil.rmtree(directory)
@@ -58,6 +58,23 @@ def _default_dir_mode() -> int:
58
58
  return _umask_default(0o777)
59
59
 
60
60
 
61
def _fsync_parent(path: Path) -> None:
    """Best-effort fsync of the directory that contains `path`.

    Intended to make a just-completed rename/link into `path` durable across
    a crash on POSIX (the file's own data is fsynced separately). When the
    parent directory cannot be opened -- e.g. on Windows -- the function
    silently does nothing. An error raised by `os.fsync` itself is not
    suppressed.

    Args:
        path: The path whose parent directory entry should be flushed.
    """
    parent = path.parent
    try:
        dir_fd = os.open(parent, os.O_RDONLY)
    except OSError:
        # Directory handles are unavailable here; nothing to flush.
        return None
    try:
        os.fsync(dir_fd)
    finally:
        # Always release the descriptor, even if fsync raised.
        os.close(dir_fd)
    return None
76
+
77
+
61
78
  class StagedFile[AnyStrT: (str, bytes)]:
62
79
  """Write a file safely: stage to a temp file, then commit by atomic move.
63
80
 
@@ -87,10 +104,13 @@ class StagedFile[AnyStrT: (str, bytes)]:
87
104
  ```
88
105
 
89
106
  With `overwrite=False` (the default) an existing destination is a
90
- fail-fast `FileExistsError`, and the commit creates the file atomically --
91
- it never clobbers a file that appeared meanwhile. With `overwrite=True`
92
- the destination is atomically replaced, inheriting its previous
93
- permissions.
107
+ fail-fast `FileExistsError`, and the commit creates the file atomically
108
+ via a hardlink -- it never clobbers a file that appeared meanwhile. On a
109
+ filesystem without hardlink support (FAT/exFAT, some network mounts) the
110
+ commit falls back to a best-effort existence check plus replace, leaving
111
+ a small window where a file appearing concurrently could be clobbered.
112
+ With `overwrite=True` the destination is atomically replaced, inheriting
113
+ its previous permissions.
94
114
 
95
115
  The committed file gets the usual umask-based permissions (not the
96
116
  restrictive mode of the internal temp file). The destination's parent
@@ -254,15 +274,26 @@ class StagedFile[AnyStrT: (str, bytes)]:
254
274
  if self._overwrite:
255
275
  self._temp_path.replace(self._path)
256
276
  else:
277
+ # Atomic exclusive create via hardlink where supported. A
278
+ # filesystem without hardlinks (FAT/exFAT, some network mounts)
279
+ # raises a non-`FileExistsError` `OSError`; fall back to a
280
+ # best-effort existence check plus `replace` (which leaves a
281
+ # small TOCTOU window where a file appearing meanwhile could be
282
+ # clobbered -- unavoidable without an atomic no-clobber move).
257
283
  try:
258
284
  self._path.hardlink_to(self._temp_path)
259
- except FileExistsError:
260
- msg = (
261
- f"file already exists, pass overwrite=True to replace: {self._path}"
262
- )
263
- raise FileExistsError(msg) from None
264
- finally:
265
- self._temp_path.unlink(missing_ok=True)
285
+ except OSError as exc:
286
+ if isinstance(exc, FileExistsError) or self._path.exists():
287
+ self._temp_path.unlink(missing_ok=True)
288
+ msg = (
289
+ "file already exists, pass overwrite=True to replace: "
290
+ f"{self._path}"
291
+ )
292
+ raise FileExistsError(msg) from None
293
+ self._temp_path.replace(self._path)
294
+ else:
295
+ self._temp_path.unlink()
296
+ _fsync_parent(self._path)
266
297
  self._committed = True
267
298
  self._finalizer.detach()
268
299
  return self._path
@@ -318,9 +349,9 @@ class StagedDirectory:
318
349
  staged directory is moved into place with a single rename, and an existing
319
350
  destination is a fail-fast `FileExistsError`. Replacing an existing one
320
351
  (`overwrite=True`) is *not* fully atomic -- the old directory is swapped
321
- aside and then removed, leaving a brief window where the destination is
322
- absent and, on a rare failure mid-swap, the previous contents in a sibling
323
- ``<name>.old`` directory for recovery.
352
+ aside, the staged one moved in, then the old removed. A failed move
353
+ restores the original; only a crash *between* the two renames leaves the
354
+ previous contents in a sibling ``<name>.old`` directory for recovery.
324
355
 
325
356
  The committed directory gets the usual umask-based permissions. Pass
326
357
  `make_parents=True` to create the destination's parent if it is missing.
@@ -395,6 +426,8 @@ class StagedDirectory:
395
426
  appeared after this builder opened.
396
427
  NotADirectoryError: If `overwrite` is True and the destination
397
428
  exists but is not a directory.
429
+ OSError: If replacing an existing directory and moving the staged
430
+ one into place fails; the original is restored first.
398
431
  """
399
432
  if self._committed:
400
433
  return self._path
@@ -420,16 +453,24 @@ class StagedDirectory:
420
453
  mode = stat.S_IMODE(self._path.stat().st_mode)
421
454
  self._workdir.chmod(mode)
422
455
  if exists:
423
- # Replacing an existing directory. No portable atomic dir replace:
424
- # swap the old one aside, move the staged one in, then remove the
425
- # old. A failure between the renames leaves the previous contents
426
- # in `<name>.old`.
456
+ # Replacing an existing directory. There is no portable atomic
457
+ # directory replace, so swap the old one aside, move the staged one
458
+ # in, then remove the old. If the second move fails, restore the
459
+ # original; removing the backup is best-effort (the destination is
460
+ # already correct). A crash *between* the two moves is the residual
461
+ # non-atomic window -- the previous contents remain in a sibling
462
+ # `<name>.old` directory for manual recovery.
427
463
  backup = self._path.with_name(f"{self._workdir.name}.old")
428
464
  self._path.rename(backup)
429
- self._workdir.rename(self._path)
430
- shutil.rmtree(backup)
465
+ try:
466
+ self._workdir.rename(self._path)
467
+ except OSError:
468
+ backup.rename(self._path)
469
+ raise
470
+ shutil.rmtree(backup, ignore_errors=True)
431
471
  else:
432
472
  self._workdir.rename(self._path)
473
+ _fsync_parent(self._path)
433
474
  self._committed = True
434
475
  self._finalizer.detach()
435
476
  return self._path
@@ -266,6 +266,10 @@ def reserve_path(
266
266
  an exclusive file create, `open(path, "x")` raises the same
267
267
  `FileExistsError` directly.
268
268
 
269
+ A symlink counts as occupying the path -- including a *broken* one,
270
+ which `Path.exists` alone reports as absent yet still takes the name
271
+ (so `open(path, "x")` would fail). Such a path is treated as existing.
272
+
269
273
  Args:
270
274
  path: The path that should not yet exist.
271
275
  exist_ok: Whether to allow an already-existing path. Defaults to False.
@@ -277,9 +281,13 @@ def reserve_path(
277
281
  The path as a Path object or a string, depending on `stringify`.
278
282
 
279
283
  Raises:
280
- FileExistsError: If the path exists and `exist_ok` is False.
284
+ FileExistsError: If the path exists (or is a symlink) and `exist_ok`
285
+ is False.
286
+ OSError: If `make_parents` is True and the parent cannot be created
287
+ (e.g. an ancestor along the path is a file).
281
288
  """
282
- if (path := Path(path)).exists() and not exist_ok:
289
+ path = Path(path)
290
+ if (path.exists() or path.is_symlink()) and not exist_ok:
283
291
  msg = f"path already exists: {path}"
284
292
  raise FileExistsError(msg)
285
293
  if make_parents:
kaparoo/utils/__init__.py CHANGED
@@ -8,9 +8,11 @@ __all__ = (
8
8
  "Reduction",
9
9
  "SegmentRecord",
10
10
  "SegmentTimer",
11
+ "Std",
11
12
  "Sum",
12
13
  "Timer",
13
14
  "UnweightedReduction",
15
+ "Var",
14
16
  "factory_if_none",
15
17
  "replace_if_none",
16
18
  "unwrap_or_default",
@@ -27,8 +29,10 @@ from kaparoo.utils.aggregate import (
27
29
  Mean,
28
30
  Min,
29
31
  Reduction,
32
+ Std,
30
33
  Sum,
31
34
  UnweightedReduction,
35
+ Var,
32
36
  )
33
37
  from kaparoo.utils.optional import (
34
38
  factory_if_none,
@@ -16,10 +16,13 @@ __all__ = (
16
16
  "Mean",
17
17
  "Min",
18
18
  "Reduction",
19
+ "Std",
19
20
  "Sum",
20
21
  "UnweightedReduction",
22
+ "Var",
21
23
  )
22
24
 
25
+ import math
23
26
  from abc import ABC, abstractmethod
24
27
  from dataclasses import dataclass
25
28
  from typing import TYPE_CHECKING
@@ -108,6 +111,65 @@ class Mean(Reduction[tuple[float, float]]):
108
111
  return state[0] / state[1] if state[1] else float("nan")
109
112
 
110
113
 
114
@dataclass(frozen=True)
class Var(Reduction[tuple[float, float, float]]):
    """Weighted population variance; state is `(weight, mean, M2)`.

    Accumulated online (Welford) and merged exactly (Chan's parallel
    algorithm), so it nests across loop levels like the other reductions.
    Uses the population convention -- M2 over the total weight, as in
    numpy's default `ddof=0` -- which stays well-defined under weighting.
    Empty -> `nan`.

    A zero-weight observation is ignored by `step`: it carries no
    information, and folding it into an empty state would otherwise divide
    by a zero total weight.
    """

    def identity(self) -> tuple[float, float, float]:
        """Return the empty state: zero weight, zero mean, zero M2."""
        return (0.0, 0.0, 0.0)

    def step(
        self, state: tuple[float, float, float], value: float, weight: float
    ) -> tuple[float, float, float]:
        """Fold one weighted observation into `state`.

        Args:
            state: Current `(weight, mean, M2)` accumulator.
            value: The observed value.
            weight: The observation's weight.

        Returns:
            The updated `(weight, mean, M2)`; `state` itself when `weight`
            is zero.
        """
        if weight == 0:
            # Nothing to add. Returning early also avoids `weight / total`
            # with `total == 0` when the state is still empty.
            return state
        total, mean, m2 = state
        total += weight
        delta = value - mean
        mean += (weight / total) * delta
        # Uses the *updated* mean for the second factor (Welford's update).
        m2 += weight * delta * (value - mean)
        return (total, mean, m2)

    def merge(
        self,
        a: tuple[float, float, float],
        b: tuple[float, float, float],
    ) -> tuple[float, float, float]:
        """Combine two partial states into one.

        Returns the empty state when the combined weight is zero.
        """
        total_a, mean_a, m2_a = a
        total_b, mean_b, m2_b = b
        total = total_a + total_b
        if total == 0:
            return (0.0, 0.0, 0.0)
        delta = mean_b - mean_a
        mean = mean_a + delta * total_b / total
        m2 = m2_a + m2_b + delta * delta * total_a * total_b / total
        return (total, mean, m2)

    def result(self, state: tuple[float, float, float]) -> float:
        """Project `state` to the variance `M2 / weight`; `nan` when empty."""
        total, _mean, m2 = state
        return m2 / total if total else float("nan")
156
+
157
+
158
@dataclass(frozen=True)
class Std(Var):
    """Weighted population standard deviation: the square root of `Var`.

    Reuses `Var`'s online, mergeable moments unchanged; only the final
    projection in `result` differs. Empty -> `nan`.
    """

    def result(self, state: tuple[float, float, float]) -> float:
        """Return the square root of the variance, or `nan` when empty."""
        variance = super().result(state)
        # `nan` marks the empty state and is passed through as-is; otherwise
        # clamp at zero before taking the root so a slightly negative
        # variance cannot produce a complex result.
        return variance if math.isnan(variance) else max(variance, 0.0) ** 0.5
171
+
172
+
111
173
  @dataclass(frozen=True)
112
174
  class Sum(UnweightedReduction[float]):
113
175
  """Running sum of values (weight ignored). Empty -> `0.0`."""
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kaparoo-python
3
- Version: 0.4.0
3
+ Version: 0.6.0
4
4
  Summary: Personally common and useful Python features
5
5
  Keywords: filesystem,pathlib,paths,utilities
6
6
  Author: Jaewoo Park
@@ -67,16 +67,17 @@ hook for custom filter kinds.
67
67
 
68
68
  `Timer` / `SegmentTimer` context-manager-and-decorator timers (with
69
69
  `lap`-split and `measure`-block timings); `Aggregator` for nested,
70
- pluggable metric aggregation (the batch → epoch → run pattern); plus a
71
- small family of helpers for working with `Optional[T]` values
72
- (`replace_if_none`, `unwrap_or_default`, ...).
70
+ pluggable metric aggregation (the batch → epoch → run pattern;
71
+ experimental); plus a small family of helpers for working with
72
+ `Optional[T]` values (`replace_if_none`, `unwrap_or_default`, ...).
73
73
 
74
74
  ### [`kaparoo.data`](https://github.com/kaparoo/kaparoo-python/tree/main/kaparoo/data)
75
75
 
76
76
  Building blocks for dataset code: `DataSequence[T, M]` ABC (item +
77
77
  metadata), composers (`SlicedSequence`, `ConcatSequence`,
78
- `WindowedSequence`), file-backed templates (`FileFolderSequence`,
79
- `SingleFileSequence`), and `generate_batches`.
78
+ `TransformedSequence`, `WindowedSequence`, `ZippedSequence`), file-backed
79
+ templates (`FileFolderSequence`, `FileListSequence`, `SingleFileSequence`),
80
+ and `generate_batches`.
80
81
 
81
82
  ## 🎯 Quick example
82
83
 
@@ -1,12 +1,12 @@
1
1
  kaparoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- kaparoo/data/__init__.py,sha256=jCQNbgwE5eDLJaytQB-HWWkd2qRVYM9ndvZmBIh2MQY,368
3
- kaparoo/data/sequences/__init__.py,sha256=rokLPUrJ1bxEx9c9wNH9ekNjEfwBW0z4o1-UEW9Qrj0,534
2
+ kaparoo/data/__init__.py,sha256=dgbmVOVq_As2ovxRw_JQRdVjQ8d1UTkMFt_A50YfCfM,508
3
+ kaparoo/data/sequences/__init__.py,sha256=3WPq8yDzGvGFXRxu0Dtbp0WPLga32qMDDkL_CZlmFpE,674
4
4
  kaparoo/data/sequences/base.py,sha256=m2JcIcT-SLrTzsjFCtgrQ9I5XVpB6PYBigyooEpg4VE,2628
5
- kaparoo/data/sequences/composers.py,sha256=NjQ-2fxSffE5FjTA1xfZMS65w8N13ZOd_BHj1-NiC1w,7364
6
- kaparoo/data/sequences/templates.py,sha256=bY_znZ3vcQwRL6F--1Cs_A3e34UvDkQr1L_71l1lPLo,7611
5
+ kaparoo/data/sequences/composers.py,sha256=eaWW8VKyVq4zzJAvIh_LKYt6g-R-HqzzpoV4NKtFeqc,13373
6
+ kaparoo/data/sequences/templates.py,sha256=9a_vM-c9OaF-9qY0ybqRaGkVkywJzrQbNlEHfe39MHU,9572
7
7
  kaparoo/data/sequences/utils.py,sha256=oe0qWwnAjsf-9CBUPSlkxkeuQS8kjn1sYhx2eDIwPKI,2808
8
8
  kaparoo/filesystem/__init__.py,sha256=uES_e8DYBE0db5z-_E7N2-vSGvi9-uJiSOWnKHdtuPs,1797
9
- kaparoo/filesystem/directory.py,sha256=8NZyQt40Lcdp8_vlfNGDDkyC8FPzTCBuKO3zFq_KGL8,9166
9
+ kaparoo/filesystem/directory.py,sha256=Pr15aMl0tz2-VU14rkaFzsV0Zo2oqaevDM0JLW-ZOwk,10421
10
10
  kaparoo/filesystem/exceptions.py,sha256=dWzD7d30bUEAxKWMYtJWmIg8tK3meEFz84NhWmWXb6k,464
11
11
  kaparoo/filesystem/existence.py,sha256=6AafbDmY_sXrePLs9Vg8gQgT7jS8ETAqxz7TysV8WJ4,12564
12
12
  kaparoo/filesystem/search/__init__.py,sha256=4MXwO1l6n-jq-wswcAM8S7gJ8A6TMNRO_JbfZyOya_c,1503
@@ -20,15 +20,15 @@ kaparoo/filesystem/search/filters/pattern.py,sha256=GT8tI7falx62Ydrq_CuYnuoWUMn_
20
20
  kaparoo/filesystem/search/filters/types.py,sha256=jrRxhVZwyMgQ0_o4zOvmBB_uDCcCDEGFvIgmQH1X45A,1115
21
21
  kaparoo/filesystem/search/filters/utils.py,sha256=j-AfVu298t5GuQDb1UCOtb0L7R2Hm9gMD7UuFbIe2yE,1594
22
22
  kaparoo/filesystem/search/wrappers.py,sha256=CDLvw9IhVNZLtMSdGnldJLum9tt8UTSQoRphOxg0qD8,10782
23
- kaparoo/filesystem/staged.py,sha256=u1CaWVU02_iaf8FKMruHGvyKq2oCQakOayOneDmqQjI,16269
23
+ kaparoo/filesystem/staged.py,sha256=VXjzszo-FAjQaO9aSPelorIVOyxIUS5SPDank5wQupM,18275
24
24
  kaparoo/filesystem/types.py,sha256=Cm1MLEAujVRyB9uciJ-uUxC0hw6j-ycxVPKS9KFmJnw,202
25
- kaparoo/filesystem/utils.py,sha256=cPdK7K8DOWyVXwg66xEAdbhDvMOp0IWkKMLKtG5np2I,10437
25
+ kaparoo/filesystem/utils.py,sha256=nSy_7RDPljP5FTZjRipkWEi6PAOLsYr0MubwCNfXm6s,10850
26
26
  kaparoo/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
27
- kaparoo/utils/__init__.py,sha256=Z8nDsWJUkUsP728AuzNPGkEjkZRWNcWn_LVA7O384zc,745
28
- kaparoo/utils/aggregate.py,sha256=e6pyC5m9O8q5uJi4SOTM-RpzILsS5aARoZNFXwmD1Es,11949
27
+ kaparoo/utils/__init__.py,sha256=QR-aXuDvxCtOXZLiqHNrKozLlzg_v60UaKI6x2S3YtU,785
28
+ kaparoo/utils/aggregate.py,sha256=8apzZiqLAxoSO51DDDMsOnkrsEaejudXDTc6h3uKRZc,13953
29
29
  kaparoo/utils/optional.py,sha256=UgNhGDzl317PE_ESt9hW7yl9MYcdL0nV6Ly0mpqIz0U,4224
30
30
  kaparoo/utils/timer.py,sha256=n2RenrYik51v1Dmo9JmZpG3_cPafRDgGMbxdvNoRhgs,17001
31
- kaparoo_python-0.4.0.dist-info/licenses/LICENSE,sha256=hb6LWYP2rtcoz4V2HpawmblDfHwjwsg9N3cz0c5JQJE,1067
32
- kaparoo_python-0.4.0.dist-info/WHEEL,sha256=Q9FtwzuR2QE37l-JIkuyklGnJJiCBHKnsPVQ9vzCMzQ,81
33
- kaparoo_python-0.4.0.dist-info/METADATA,sha256=kIbzjQhdCCeRLVQ_7GlZwVm2rjkRQwPJR1U_4_EnJa8,4252
34
- kaparoo_python-0.4.0.dist-info/RECORD,,
31
+ kaparoo_python-0.6.0.dist-info/licenses/LICENSE,sha256=hb6LWYP2rtcoz4V2HpawmblDfHwjwsg9N3cz0c5JQJE,1067
32
+ kaparoo_python-0.6.0.dist-info/WHEEL,sha256=V5-3dKee3Zs8C4JP6swr6zdqriLsOpItBEQxe6_oWpY,81
33
+ kaparoo_python-0.6.0.dist-info/METADATA,sha256=I9qqiuRdVQeIh0cHdadR_3UQlyLgtKpTLJ-ycLtYWjQ,4327
34
+ kaparoo_python-0.6.0.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: uv 0.11.17
2
+ Generator: uv 0.11.18
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any