kaparoo-python 0.5.0__tar.gz → 0.6.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/PKG-INFO +7 -6
  2. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/README.md +6 -5
  3. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/data/README.md +57 -4
  4. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/data/__init__.py +4 -0
  5. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/data/sequences/__init__.py +4 -0
  6. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/data/sequences/composers.py +179 -6
  7. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/data/sequences/templates.py +90 -120
  8. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/pyproject.toml +1 -1
  9. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/LICENSE +0 -0
  10. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/__init__.py +0 -0
  11. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/data/sequences/base.py +0 -0
  12. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/data/sequences/utils.py +0 -0
  13. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/README.md +0 -0
  14. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/__init__.py +0 -0
  15. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/directory.py +0 -0
  16. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/exceptions.py +0 -0
  17. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/existence.py +0 -0
  18. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/README.md +0 -0
  19. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/__init__.py +0 -0
  20. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/classes.py +0 -0
  21. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/deprecated.py +0 -0
  22. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/filters/__init__.py +0 -0
  23. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/filters/base.py +0 -0
  24. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/filters/logical.py +0 -0
  25. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/filters/multi_pattern.py +0 -0
  26. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/filters/pattern.py +0 -0
  27. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/filters/types.py +0 -0
  28. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/filters/utils.py +0 -0
  29. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/search/wrappers.py +0 -0
  30. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/staged.py +0 -0
  31. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/types.py +0 -0
  32. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/filesystem/utils.py +0 -0
  33. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/py.typed +0 -0
  34. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/utils/README.md +0 -0
  35. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/utils/__init__.py +0 -0
  36. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/utils/aggregate.py +0 -0
  37. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/utils/optional.py +0 -0
  38. {kaparoo_python-0.5.0 → kaparoo_python-0.6.0}/kaparoo/utils/timer.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kaparoo-python
3
- Version: 0.5.0
3
+ Version: 0.6.0
4
4
  Summary: Personally common and useful Python features
5
5
  Keywords: filesystem,pathlib,paths,utilities
6
6
  Author: Jaewoo Park
@@ -67,16 +67,17 @@ hook for custom filter kinds.
67
67
 
68
68
  `Timer` / `SegmentTimer` context-manager-and-decorator timers (with
69
69
  `lap`-split and `measure`-block timings); `Aggregator` for nested,
70
- pluggable metric aggregation (the batch → epoch → run pattern); plus a
71
- small family of helpers for working with `Optional[T]` values
72
- (`replace_if_none`, `unwrap_or_default`, ...).
70
+ pluggable metric aggregation (the batch → epoch → run pattern;
71
+ experimental); plus a small family of helpers for working with
72
+ `Optional[T]` values (`replace_if_none`, `unwrap_or_default`, ...).
73
73
 
74
74
  ### [`kaparoo.data`](https://github.com/kaparoo/kaparoo-python/tree/main/kaparoo/data)
75
75
 
76
76
  Building blocks for dataset code: `DataSequence[T, M]` ABC (item +
77
77
  metadata), composers (`SlicedSequence`, `ConcatSequence`,
78
- `WindowedSequence`), file-backed templates (`FileFolderSequence`,
79
- `SingleFileSequence`), and `generate_batches`.
78
+ `TransformedSequence`, `WindowedSequence`, `ZippedSequence`), file-backed
79
+ templates (`FileFolderSequence`, `FileListSequence`, `SingleFileSequence`),
80
+ and `generate_batches`.
80
81
 
81
82
  ## 🎯 Quick example
82
83
 
@@ -46,16 +46,17 @@ hook for custom filter kinds.
46
46
 
47
47
  `Timer` / `SegmentTimer` context-manager-and-decorator timers (with
48
48
  `lap`-split and `measure`-block timings); `Aggregator` for nested,
49
- pluggable metric aggregation (the batch → epoch → run pattern); plus a
50
- small family of helpers for working with `Optional[T]` values
51
- (`replace_if_none`, `unwrap_or_default`, ...).
49
+ pluggable metric aggregation (the batch → epoch → run pattern;
50
+ experimental); plus a small family of helpers for working with
51
+ `Optional[T]` values (`replace_if_none`, `unwrap_or_default`, ...).
52
52
 
53
53
  ### [`kaparoo.data`](https://github.com/kaparoo/kaparoo-python/tree/main/kaparoo/data)
54
54
 
55
55
  Building blocks for dataset code: `DataSequence[T, M]` ABC (item +
56
56
  metadata), composers (`SlicedSequence`, `ConcatSequence`,
57
- `WindowedSequence`), file-backed templates (`FileFolderSequence`,
58
- `SingleFileSequence`), and `generate_batches`.
57
+ `TransformedSequence`, `WindowedSequence`, `ZippedSequence`), file-backed
58
+ templates (`FileFolderSequence`, `FileListSequence`, `SingleFileSequence`),
59
+ and `generate_batches`.
59
60
 
60
61
  ## 🎯 Quick example
61
62
 
@@ -7,7 +7,8 @@ small set of composers, and ready-to-subclass file-backed templates.
7
7
 
8
8
  - [`sequences/base`](./sequences/base.py) — `DataSequence[T, M]` abstract base
9
9
  - [`sequences/composers`](./sequences/composers.py) — `SlicedSequence`,
10
- `ConcatSequence`, `WindowedSequence`
10
+ `TransformedSequence`, `ConcatSequence`, `WindowedSequence`,
11
+ `ZippedSequence`
11
12
  - [`sequences/templates`](./sequences/templates.py) — `FileFolderSequence`,
12
13
  `FileListSequence`, `SingleFileSequence`
13
14
  - [`sequences/utils`](./sequences/utils.py) — `generate_batches`
@@ -83,18 +84,49 @@ combined = ConcatSequence(train_a, train_b, train_c)
83
84
  len(combined) # == len(train_a) + len(train_b) + len(train_c)
84
85
  ```
85
86
 
87
+ ### `TransformedSequence`
88
+
89
+ A lazy view that applies a `transform` callable to each item of
90
+ `source`. The transform is called on demand in `get_item` -- nothing
91
+ is computed at construction. `get_meta` passes through `source.get_meta`
92
+ unchanged by default; override it in a subclass when `M_out` differs
93
+ from `M_in`.
94
+
95
+ ```python
96
+ from kaparoo.data.sequences import TransformedSequence
97
+
98
+ # Item transform only -- metadata type is unchanged.
99
+ normalized = TransformedSequence(image_folder, normalize_fn)
100
+
101
+ # Meta transform via subclassing:
102
+ class Augmented(TransformedSequence[ndarray, Path, ndarray, AugMeta]):
103
+ def get_meta(self, index: int) -> AugMeta:
104
+ return AugMeta(path=self.source.get_meta(index), applied="normalize")
105
+ ```
106
+
107
+ Chaining two `TransformedSequence` instances applies the transforms in
108
+ order:
109
+
110
+ ```python
111
+ resized = TransformedSequence(raw, resize)
112
+ normalized = TransformedSequence(resized, normalize)
113
+ ```
114
+
115
+ `T_out` and `M_out` default to `T_in` and `M_in` respectively (PEP 696),
116
+ so you only need to specify them when the type actually changes.
117
+
86
118
  ### `WindowedSequence`
87
119
 
88
120
  An abstract sliding-window view: each item is a `tuple[T, ...]` of
89
121
  `size` frames from `source`. Per-frame `M_in` and window-level
90
- `M_out` are independent type parameters, so subclasses decide how
91
- metadata aggregates.
122
+ `M_out` are independent type parameters (`M_out` defaults to `M_in`),
123
+ so subclasses decide how metadata aggregates.
92
124
 
93
125
  ```python
94
126
  from pathlib import Path
95
127
  from kaparoo.data.sequences import WindowedSequence
96
128
 
97
- class FirstFrameMeta(WindowedSequence[bytes, Path, Path]):
129
+ class FirstFrameMeta(WindowedSequence[bytes, Path]):
98
130
  def get_meta(self, index):
99
131
  # window's metadata is its first frame's metadata
100
132
  index = self._normalize_index(index)
@@ -109,6 +141,27 @@ windows.get_meta(0) # frames.get_meta(0)
109
141
  `size`, `step`, `skip` follow the same semantics as
110
142
  [`generate_batches`](#generate_batches).
111
143
 
144
+ ### `ZippedSequence`
145
+
146
+ Element-wise zip of two sequences — item `i` is `(first[i], second[i])`
147
+ and metadata `i` is the `(M1, M2)` tuple. This is the "paired image +
148
+ label" pattern that `ConcatSequence` (end-to-end) cannot express. With
149
+ `strict=True` (the default) the lengths must match or construction raises
150
+ `ValueError`; pass `strict=False` to truncate to the shorter length, like
151
+ the builtin `zip`. For a different combined metadata shape, subclass and
152
+ override `get_meta`.
153
+
154
+ ```python
155
+ from kaparoo.data.sequences import ZippedSequence
156
+
157
+ pairs = ZippedSequence(images, labels)
158
+ pairs[0] # (images[0], labels[0])
159
+ pairs.get_meta(0) # (images.get_meta(0), labels.get_meta(0))
160
+ ```
161
+
162
+ For three or more, nest: `ZippedSequence(a, ZippedSequence(b, c))` yields
163
+ `(a[i], (b[i], c[i]))`.
164
+
112
165
  ## Templates
113
166
 
114
167
  ### `FileFolderSequence`
@@ -5,7 +5,9 @@ __all__ = (
5
5
  "FileListSequence",
6
6
  "SingleFileSequence",
7
7
  "SlicedSequence",
8
+ "TransformedSequence",
8
9
  "WindowedSequence",
10
+ "ZippedSequence",
9
11
  "generate_batches",
10
12
  )
11
13
 
@@ -16,6 +18,8 @@ from kaparoo.data.sequences import (
16
18
  FileListSequence,
17
19
  SingleFileSequence,
18
20
  SlicedSequence,
21
+ TransformedSequence,
19
22
  WindowedSequence,
23
+ ZippedSequence,
20
24
  generate_batches,
21
25
  )
@@ -7,7 +7,9 @@ __all__ = (
7
7
  "FileListSequence",
8
8
  "SingleFileSequence",
9
9
  "SlicedSequence",
10
+ "TransformedSequence",
10
11
  "WindowedSequence",
12
+ "ZippedSequence",
11
13
  "generate_batches",
12
14
  )
13
15
 
@@ -15,7 +17,9 @@ from kaparoo.data.sequences.base import DataSequence
15
17
  from kaparoo.data.sequences.composers import (
16
18
  ConcatSequence,
17
19
  SlicedSequence,
20
+ TransformedSequence,
18
21
  WindowedSequence,
22
+ ZippedSequence,
19
23
  )
20
24
  from kaparoo.data.sequences.templates import (
21
25
  FileFolderSequence,
@@ -1,15 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
- __all__ = ("ConcatSequence", "SlicedSequence", "WindowedSequence")
3
+ __all__ = (
4
+ "ConcatSequence",
5
+ "SlicedSequence",
6
+ "TransformedSequence",
7
+ "WindowedSequence",
8
+ "ZippedSequence",
9
+ )
4
10
 
5
11
  from abc import abstractmethod
6
12
  from bisect import bisect_right
7
- from typing import TYPE_CHECKING
13
+ from typing import TYPE_CHECKING, cast
8
14
 
9
15
  from kaparoo.data.sequences.base import DataSequence
10
16
 
11
17
  if TYPE_CHECKING:
12
- from collections.abc import Sequence
18
+ from collections.abc import Callable, Sequence
13
19
 
14
20
 
15
21
  class SlicedSequence[T, M](DataSequence[T, M]):
@@ -59,6 +65,61 @@ class SlicedSequence[T, M](DataSequence[T, M]):
59
65
  return self._source.get_meta(self._indices[index])
60
66
 
61
67
 
68
+ class TransformedSequence[T_in, M_in, T_out = T_in, M_out = M_in](
69
+ DataSequence[T_out, M_out]
70
+ ):
71
+ """A view of `source` with `transform` applied lazily to each item.
72
+
73
+ `transform` is called on demand in `get_item`; nothing is loaded or
74
+ converted at construction time. `get_meta` passes through
75
+ `source.get_meta` unchanged by default -- override it in a subclass
76
+ when `M_out` differs from `M_in`.
77
+
78
+ Type Parameters:
79
+ T_in: Item type of `source`.
80
+ M_in: Metadata type of `source`.
81
+ T_out: Item type after the transform. Defaults to `T_in`.
82
+ M_out: Metadata type exposed by this view. Defaults to `M_in`.
83
+ When `M_out != M_in`, override `get_meta` in a subclass;
84
+ the default passthrough is only safe when `M_out == M_in`.
85
+
86
+ Example:
87
+ >>> # Item-only transform; metadata passes through unchanged.
88
+ >>> normalized = TransformedSequence(image_folder, normalize)
89
+
90
+ >>> # Meta transform via subclassing:
91
+ >>> class Augmented(TransformedSequence[ndarray, Path, ndarray, AugMeta]):
92
+ ... def get_meta(self, index: int) -> AugMeta:
93
+ ... return AugMeta(
94
+ ... path=self.source.get_meta(index),
95
+ ... applied="normalize",
96
+ ... )
97
+ """
98
+
99
+ def __init__(
100
+ self,
101
+ source: DataSequence[T_in, M_in],
102
+ transform: Callable[[T_in], T_out],
103
+ ) -> None:
104
+ self._source = source
105
+ self._transform = transform
106
+
107
+ @property
108
+ def source(self) -> DataSequence[T_in, M_in]:
109
+ """The wrapped sequence."""
110
+ return self._source
111
+
112
+ def __len__(self) -> int:
113
+ return len(self._source)
114
+
115
+ def get_item(self, index: int) -> T_out:
116
+ return self._transform(self._source.get_item(index))
117
+
118
+ def get_meta(self, index: int) -> M_out:
119
+ # Passthrough by default. Override when M_out != M_in.
120
+ return cast("M_out", self._source.get_meta(index))
121
+
122
+
62
123
  class ConcatSequence[T, M](DataSequence[T, M]):
63
124
  """The end-to-end concatenation of zero or more `sources`.
64
125
 
@@ -112,7 +173,7 @@ class ConcatSequence[T, M](DataSequence[T, M]):
112
173
  return source.get_meta(local)
113
174
 
114
175
 
115
- class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
176
+ class WindowedSequence[T, M_in, M_out = M_in](DataSequence[tuple[T, ...], M_out]):
116
177
  """An abstract sliding-window view over `source`.
117
178
 
118
179
  Each item is a tuple of `size` items from `source`, starting at
@@ -130,8 +191,8 @@ class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
130
191
  T: Item type of `source` (also the per-frame type within each
131
192
  window).
132
193
  M_in: Metadata type of `source` (per-frame metadata).
133
- M_out: Metadata type of the window. Determined by the
134
- subclass's `get_meta` return.
194
+ M_out: Metadata type of the window. Defaults to `M_in`.
195
+ Determined by the subclass's `get_meta` return.
135
196
 
136
197
  Args:
137
198
  source: The sequence to window over.
@@ -219,3 +280,115 @@ class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
219
280
  @abstractmethod
220
281
  def get_meta(self, index: int) -> M_out:
221
282
  raise NotImplementedError
283
+
284
+
285
+ class ZippedSequence[T1, T2, M1 = None, M2 = None](
286
+ DataSequence[tuple[T1, T2], tuple[M1, M2]]
287
+ ):
288
+ """Element-wise zip of two sequences.
289
+
290
+ Item `i` is `(first[i], second[i])` and metadata `i` is
291
+ `(first.get_meta(i), second.get_meta(i))` -- the "paired image + label"
292
+ pattern that `ConcatSequence` (end-to-end) cannot express.
293
+
294
+ With `strict=True` (the default) the two sequences must have the same
295
+ length; a mismatch raises `ValueError` at construction. With
296
+ `strict=False` the view is truncated to the shorter length, like the
297
+ builtin `zip`. For a different combined-metadata shape, subclass and
298
+ override `get_meta`.
299
+
300
+ Type Parameters:
301
+ T1: Item type of the first source.
302
+ T2: Item type of the second source.
303
+ M1: Metadata type of the first source. Defaults to `None`.
304
+ M2: Metadata type of the second source. Defaults to `None`.
305
+
306
+ Args:
307
+ first: The first sequence.
308
+ second: The second sequence.
309
+ strict: When True (default), require equal lengths and raise on a
310
+ mismatch. When False, truncate to the shorter length.
311
+
312
+ Raises:
313
+ ValueError: If `strict` is True and the sequences differ in length.
314
+
315
+ Example:
316
+ >>> pairs = ZippedSequence(images, labels)
317
+ >>> pairs[0] # (images[0], labels[0])
318
+ >>> pairs.get_meta(0) # (images.get_meta(0), labels.get_meta(0))
319
+ """
320
+
321
+ def __init__(
322
+ self,
323
+ first: DataSequence[T1, M1],
324
+ second: DataSequence[T2, M2],
325
+ *,
326
+ strict: bool = True,
327
+ ) -> None:
328
+ if strict and len(first) != len(second):
329
+ msg = f"sequences differ in length: {len(first)} != {len(second)}"
330
+ raise ValueError(msg)
331
+ self._first = first
332
+ self._second = second
333
+ self._length = len(first) if strict else min(len(first), len(second))
334
+
335
+ @property
336
+ def first(self) -> DataSequence[T1, M1]:
337
+ """The first wrapped sequence."""
338
+ return self._first
339
+
340
+ @property
341
+ def second(self) -> DataSequence[T2, M2]:
342
+ """The second wrapped sequence."""
343
+ return self._second
344
+
345
+ def __len__(self) -> int:
346
+ return self._length
347
+
348
+ def _normalize_index(self, index: int) -> int:
349
+ """Normalize a possibly-negative index and validate range.
350
+
351
+ Indices resolve against the zipped length (the shorter source when
352
+ `strict=False`), so they address the same position in both sources.
353
+
354
+ Raises:
355
+ IndexError: If `index` is outside `[-len(self), len(self))`.
356
+ """
357
+ n = self._length
358
+ original = index
359
+ if index < 0:
360
+ index += n
361
+ if not 0 <= index < n:
362
+ msg = f"index {original} out of range for length {n}"
363
+ raise IndexError(msg)
364
+ return index
365
+
366
+ def get_item(self, index: int) -> tuple[T1, T2]:
367
+ index = self._normalize_index(index)
368
+ return self._first.get_item(index), self._second.get_item(index)
369
+
370
+ def get_items(self, indices: Sequence[int]) -> Sequence[tuple[T1, T2]]:
371
+ # Normalize, then bulk-delegate so each source's `get_items`
372
+ # optimization is used.
373
+ normalized = [self._normalize_index(i) for i in indices]
374
+ return list(
375
+ zip(
376
+ self._first.get_items(normalized),
377
+ self._second.get_items(normalized),
378
+ strict=True,
379
+ )
380
+ )
381
+
382
+ def get_meta(self, index: int) -> tuple[M1, M2]:
383
+ index = self._normalize_index(index)
384
+ return self._first.get_meta(index), self._second.get_meta(index)
385
+
386
+ def get_metas(self, indices: Sequence[int]) -> Sequence[tuple[M1, M2]]:
387
+ normalized = [self._normalize_index(i) for i in indices]
388
+ return list(
389
+ zip(
390
+ self._first.get_metas(normalized),
391
+ self._second.get_metas(normalized),
392
+ strict=True,
393
+ )
394
+ )
@@ -14,11 +14,92 @@ if TYPE_CHECKING:
14
14
  from kaparoo.filesystem.types import StrPath, StrPaths
15
15
 
16
16
 
17
- class FileFolderSequence[T, M = Path](DataSequence[T, M]):
18
- """A folder-rooted `DataSequence` whose items live in individual files.
17
+ class FileListSequence[T, M = Path](DataSequence[T, M]):
18
+ """A `DataSequence` over an explicit, ordered list of files.
19
+
20
+ Items live one-per-file; subclasses implement `load_file` and `get_meta`.
21
+ The files are given directly rather than discovered under a `root`, so
22
+ they may live in unrelated directories -- or, on Windows, on different
23
+ drives. (`FileFolderSequence` is the special case where the list is
24
+ discovered under a single root and stored relative to it.)
25
+
26
+ The given order is preserved verbatim and duplicates are kept; sort the
27
+ input yourself (`sorted(files, key=...)`) if a particular order is
28
+ needed. Paths are not checked for existence at construction; `load_file`
29
+ is called lazily on each `get_item`.
30
+
31
+ The base exposes:
32
+
33
+ - `files: tuple[Path, ...]` — full paths as an immutable snapshot.
34
+ - `get_file(index) -> Path` — full path of the i-th file.
19
35
 
20
- The base class handles file discovery, indexing, and root-relative
21
- path bookkeeping. Subclasses are responsible for three things:
36
+ Type Parameters:
37
+ T: Item type returned by `get_item`.
38
+ M: Per-item metadata type. Defaults to `Path`; override when the
39
+ metadata is something else (label, line number, ...).
40
+
41
+ Args:
42
+ files: The file paths to expose, in order.
43
+
44
+ Example:
45
+ >>> from pathlib import Path
46
+ >>> class BytesList(FileListSequence[bytes]):
47
+ ... def get_meta(self, index: int) -> Path:
48
+ ... return self.get_file(index)
49
+ ...
50
+ ... def load_file(self, path: Path) -> bytes:
51
+ ... return path.read_bytes()
52
+ >>>
53
+ >>> data = BytesList(["images/a.png", "/other/b.png"])
54
+ """
55
+
56
+ def __init__(self, files: StrPaths) -> None:
57
+ self._files = list(stringify_paths(files))
58
+
59
+ def __len__(self) -> int:
60
+ return len(self._files)
61
+
62
+ @property
63
+ def files(self) -> tuple[Path, ...]:
64
+ """Immutable snapshot of the full file paths, in order.
65
+
66
+ Returns a fresh `tuple[Path, ...]` on each access.
67
+ """
68
+ return tuple(self.get_file(i) for i in range(len(self)))
69
+
70
+ def get_file(self, index: int) -> Path:
71
+ """Full Path of the file at `index`."""
72
+ return Path(self._files[index])
73
+
74
+ def get_item(self, index: int) -> T:
75
+ return self.load_file(self.get_file(index))
76
+
77
+ @abstractmethod
78
+ def get_meta(self, index: int) -> M:
79
+ raise NotImplementedError
80
+
81
+ @abstractmethod
82
+ def load_file(self, path: Path) -> T:
83
+ """Decode a single file into an item of type `T`.
84
+
85
+ Called lazily on each `get_item` -- not at construction time.
86
+ Subclasses may freely use external libraries (PIL, librosa,
87
+ cv2, ...) to decode.
88
+ """
89
+ raise NotImplementedError
90
+
91
+
92
+ class FileFolderSequence[T, M = Path](FileListSequence[T, M]):
93
+ """A `FileListSequence` whose file list is discovered under a root.
94
+
95
+ The special case of `FileListSequence` where every file lives under one
96
+ base directory. The list is produced by `list_files(root)`, validated to
97
+ be under `root`, and stored in root-relative form so memory stays low for
98
+ large datasets and the paths survive a `root` relocation; `get_file`
99
+ transparently re-prepends `root`. `load_file`, `get_item`, `files`, and
100
+ `__len__` are inherited unchanged.
101
+
102
+ Subclasses are responsible for three things:
22
103
 
23
104
  - **`list_files(self, root)`** (abstract): return the full `Path`
24
105
  of every file to expose, in the desired order. Called once from
@@ -33,16 +114,9 @@ class FileFolderSequence[T, M = Path](DataSequence[T, M]):
33
114
  to `Path` and `get_meta(i)` can be the one-liner
34
115
  `return self.get_file(i)`.
35
116
 
36
- The base exposes:
117
+ The base adds, on top of `FileListSequence`:
37
118
 
38
119
  - `root: Path` — the base directory.
39
- - `files: tuple[Path, ...]` — full paths as an immutable snapshot.
40
- - `get_file(index) -> Path` — full path of the i-th file.
41
-
42
- Paths are kept internally in their root-relative form so that
43
- memory stays low for large datasets and the sequence survives
44
- `root` relocations; the conversion is transparent to subclasses
45
- and external callers.
46
120
 
47
121
  Parameterized subclasses:
48
122
  When a subclass needs instance-level options (e.g. `pattern`,
@@ -94,48 +168,20 @@ class FileFolderSequence[T, M = Path](DataSequence[T, M]):
94
168
 
95
169
  def __init__(self, root: StrPath) -> None:
96
170
  self._root = ensure_dir_exists(root)
97
- self._files = list(
98
- stringify_paths(self.list_files(self._root), after=self._root)
99
- )
100
-
101
- def __len__(self) -> int:
102
- return len(self._files)
171
+ # `after=root` makes each path root-relative and raises ValueError if
172
+ # any file is not under `root`. The base then stores the relative
173
+ # form; `get_file` re-prepends `root`.
174
+ super().__init__(stringify_paths(self.list_files(self._root), after=self._root))
103
175
 
104
176
  @property
105
177
  def root(self) -> Path:
106
178
  """The base directory the sequence was constructed from."""
107
179
  return self._root
108
180
 
109
- @property
110
- def files(self) -> tuple[Path, ...]:
111
- """Immutable snapshot of the full file paths this sequence exposes.
112
-
113
- Returns a fresh `tuple[Path, ...]` on each access, in the order
114
- established by `list_files`.
115
- """
116
- return tuple(self.get_file(i) for i in range(len(self)))
117
-
118
181
  def get_file(self, index: int) -> Path:
119
182
  """Full Path of the file at `index`."""
120
183
  return wrap_path(self._files[index], prepend=self._root)
121
184
 
122
- def get_item(self, index: int) -> T:
123
- return self.load_file(self.get_file(index))
124
-
125
- @abstractmethod
126
- def get_meta(self, index: int) -> M:
127
- raise NotImplementedError
128
-
129
- @abstractmethod
130
- def load_file(self, path: Path) -> T:
131
- """Decode a single file into an item of type `T`.
132
-
133
- Called lazily on each `get_item` -- not at construction time.
134
- Subclasses may freely use external libraries (PIL, librosa,
135
- cv2, ...) to decode.
136
- """
137
- raise NotImplementedError
138
-
139
185
  @abstractmethod
140
186
  def list_files(self, root: Path) -> list[Path]:
141
187
  """Return the full Path of every file to expose, in order.
@@ -149,82 +195,6 @@ class FileFolderSequence[T, M = Path](DataSequence[T, M]):
149
195
  raise NotImplementedError
150
196
 
151
197
 
152
- class FileListSequence[T, M = Path](DataSequence[T, M]):
153
- """A `DataSequence` over an explicit, ordered list of files.
154
-
155
- Like `FileFolderSequence`, items live one-per-file and subclasses
156
- implement `load_file` and `get_meta`. Unlike it, the files are given
157
- directly rather than discovered under a `root`, so they may live in
158
- unrelated directories -- or, on Windows, on different drives -- which
159
- `FileFolderSequence` cannot represent (it stores paths relative to one
160
- root). There is no `list_files`: the input list *is* the listing.
161
-
162
- The given order is preserved verbatim and duplicates are kept; sort the
163
- input yourself (`sorted(files, key=...)`) if a particular order is
164
- needed. Paths are not checked for existence at construction; `load_file`
165
- is called lazily on each `get_item`.
166
-
167
- The base exposes:
168
-
169
- - `files: tuple[Path, ...]` — full paths as an immutable snapshot.
170
- - `get_file(index) -> Path` — full path of the i-th file.
171
-
172
- Type Parameters:
173
- T: Item type returned by `get_item`.
174
- M: Per-item metadata type. Defaults to `Path`; override when the
175
- metadata is something else (label, line number, ...).
176
-
177
- Args:
178
- files: The file paths to expose, in order.
179
-
180
- Example:
181
- >>> from pathlib import Path
182
- >>> class BytesList(FileListSequence[bytes]):
183
- ... def get_meta(self, index: int) -> Path:
184
- ... return self.get_file(index)
185
- ...
186
- ... def load_file(self, path: Path) -> bytes:
187
- ... return path.read_bytes()
188
- >>>
189
- >>> data = BytesList(["images/a.png", "/other/b.png"])
190
- """
191
-
192
- def __init__(self, files: StrPaths) -> None:
193
- self._files = list(stringify_paths(files))
194
-
195
- def __len__(self) -> int:
196
- return len(self._files)
197
-
198
- @property
199
- def files(self) -> tuple[Path, ...]:
200
- """Immutable snapshot of the full file paths, in the given order.
201
-
202
- Returns a fresh `tuple[Path, ...]` on each access.
203
- """
204
- return tuple(self.get_file(i) for i in range(len(self)))
205
-
206
- def get_file(self, index: int) -> Path:
207
- """Full Path of the file at `index`."""
208
- return Path(self._files[index])
209
-
210
- def get_item(self, index: int) -> T:
211
- return self.load_file(self.get_file(index))
212
-
213
- @abstractmethod
214
- def get_meta(self, index: int) -> M:
215
- raise NotImplementedError
216
-
217
- @abstractmethod
218
- def load_file(self, path: Path) -> T:
219
- """Decode a single file into an item of type `T`.
220
-
221
- Called lazily on each `get_item` -- not at construction time.
222
- Subclasses may freely use external libraries (PIL, librosa,
223
- cv2, ...) to decode.
224
- """
225
- raise NotImplementedError
226
-
227
-
228
198
  class SingleFileSequence[T, M = None](DataSequence[T, M]):
229
199
  """A `DataSequence` backed by a single file that holds multiple records.
230
200
 
@@ -12,7 +12,7 @@ build-backend = "uv_build"
12
12
 
13
13
  [project]
14
14
  name = "kaparoo-python"
15
- version = "0.5.0"
15
+ version = "0.6.0"
16
16
  description = "Personally common and useful Python features"
17
17
  readme = "README.md"
18
18
  requires-python = ">=3.14"
File without changes