kaparoo-python 0.2.1__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kaparoo/data/__init__.py CHANGED
@@ -0,0 +1,19 @@
1
+ __all__ = (
2
+ "ConcatSequence",
3
+ "DataSequence",
4
+ "FileFolderSequence",
5
+ "SingleFileSequence",
6
+ "SlicedSequence",
7
+ "WindowedSequence",
8
+ "generate_batches",
9
+ )
10
+
11
+ from kaparoo.data.sequences import (
12
+ ConcatSequence,
13
+ DataSequence,
14
+ FileFolderSequence,
15
+ SingleFileSequence,
16
+ SlicedSequence,
17
+ WindowedSequence,
18
+ generate_batches,
19
+ )
kaparoo/data/sequences/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = (
4
+ "ConcatSequence",
5
+ "DataSequence",
6
+ "FileFolderSequence",
7
+ "SingleFileSequence",
8
+ "SlicedSequence",
9
+ "WindowedSequence",
10
+ "generate_batches",
11
+ )
12
+
13
+ from kaparoo.data.sequences.base import DataSequence
14
+ from kaparoo.data.sequences.composers import (
15
+ ConcatSequence,
16
+ SlicedSequence,
17
+ WindowedSequence,
18
+ )
19
+ from kaparoo.data.sequences.templates import (
20
+ FileFolderSequence,
21
+ SingleFileSequence,
22
+ )
23
+ from kaparoo.data.sequences.utils import generate_batches
kaparoo/data/sequences/base.py ADDED
@@ -0,0 +1,73 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = ("DataSequence",)
4
+
5
+ from abc import abstractmethod
6
+ from collections.abc import Sequence
7
+ from typing import overload
8
+
9
+
10
+ class DataSequence[T, M = None](Sequence[T]):
11
+ """An ordered, lazily-loaded, read-only sequence with per-item metadata.
12
+
13
+ Subclasses implement `get_item` (and `__len__`) to fetch a single
14
+ item by index. Sequence operations (`ds[i]`, `ds[i:j]`, `for x in
15
+ ds`, `x in ds`, `reversed(ds)`, ...) come from the inherited
16
+ `collections.abc.Sequence` protocol; only `get_item` need be
17
+ overridden, and `get_items` may be overridden for batch-fetch
18
+ optimization.
19
+
20
+ The second type parameter `M` carries per-item metadata (labels,
21
+ source paths, timestamps, ...). Subclasses implement `get_meta`.
22
+ When the data has no metadata, parameterize as `DataSequence[T]`
23
+ (so `M` defaults to `None`) and let `get_meta` simply return
24
+ `None`.
25
+
26
+ Type Parameters:
27
+ T: Element type. `ds[i]` and `get_item(i)` return `T`.
28
+ M: Per-item metadata type. `get_meta(i)` returns `M`. Defaults
29
+ to `None` -- meaning "no metadata", in which case
30
+ subclasses still implement `get_meta` but as a no-op.
31
+ """
32
+
33
+ @abstractmethod
34
+ def __len__(self) -> int:
35
+ raise NotImplementedError
36
+
37
+ # --- item access -------------------------------------------------------
38
+
39
+ @overload
40
+ def __getitem__(self, index: int, /) -> T: ...
41
+
42
+ @overload
43
+ def __getitem__(self, index: slice, /) -> Sequence[T]: ...
44
+
45
+ def __getitem__(self, index: int | slice, /) -> T | Sequence[T]:
46
+ if isinstance(index, slice):
47
+ start, stop, step = index.indices(len(self))
48
+ return self.get_items(range(start, stop, step))
49
+ return self.get_item(index)
50
+
51
+ @abstractmethod
52
+ def get_item(self, index: int) -> T:
53
+ raise NotImplementedError
54
+
55
+ def get_items(self, indices: Sequence[int]) -> Sequence[T]:
56
+ return [self.get_item(index) for index in indices]
57
+
58
+ # --- metadata access ---------------------------------------------------
59
+
60
+ @abstractmethod
61
+ def get_meta(self, index: int) -> M:
62
+ raise NotImplementedError
63
+
64
+ def get_metas(self, indices: Sequence[int]) -> Sequence[M]:
65
+ return [self.get_meta(index) for index in indices]
66
+
67
+ # --- combined item + metadata ------------------------------------------
68
+
69
+ def get_pair(self, index: int) -> tuple[T, M]:
70
+ return self.get_item(index), self.get_meta(index)
71
+
72
+ def get_pairs(self, indices: Sequence[int]) -> Sequence[tuple[T, M]]:
73
+ return [self.get_pair(index) for index in indices]
kaparoo/data/sequences/composers.py ADDED
@@ -0,0 +1,221 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = ("ConcatSequence", "SlicedSequence", "WindowedSequence")
4
+
5
+ from abc import abstractmethod
6
+ from bisect import bisect_right
7
+ from typing import TYPE_CHECKING
8
+
9
+ from kaparoo.data.sequences.base import DataSequence
10
+
11
+ if TYPE_CHECKING:
12
+ from collections.abc import Sequence
13
+
14
+
15
+ class SlicedSequence[T, M](DataSequence[T, M]):
16
+ """A view of `source` exposing only items at the given `indices`.
17
+
18
+ `indices` is materialized as a tuple at construction time so that the
19
+ view has a stable length and supports O(1) random access. Negative
20
+ and out-of-range indices delegate to Python's tuple semantics
21
+ (negative wraps, out-of-range raises `IndexError`).
22
+
23
+ `indices` is taken as-is: duplicates are allowed (the same source
24
+ item is yielded multiple times) and order is preserved (no sorting).
25
+ Bounds against `source` are not validated at construction; an
26
+ out-of-range entry surfaces only when that position is accessed.
27
+
28
+ Example:
29
+ >>> sliced = SlicedSequence(full_dataset, [3, 7, 11])
30
+ >>> sliced[0] # == full_dataset[3]
31
+ >>> sliced[1] # == full_dataset[7]
32
+ """
33
+
34
+ def __init__(
35
+ self,
36
+ source: DataSequence[T, M],
37
+ indices: Sequence[int],
38
+ ) -> None:
39
+ self._source = source
40
+ self._indices = tuple(indices)
41
+
42
+ @property
43
+ def source(self) -> DataSequence[T, M]:
44
+ """The wrapped sequence."""
45
+ return self._source
46
+
47
+ @property
48
+ def indices(self) -> tuple[int, ...]:
49
+ """The index map into `source`, frozen at construction."""
50
+ return self._indices
51
+
52
+ def __len__(self) -> int:
53
+ return len(self._indices)
54
+
55
+ def get_item(self, index: int) -> T:
56
+ return self._source.get_item(self._indices[index])
57
+
58
+ def get_meta(self, index: int) -> M:
59
+ return self._source.get_meta(self._indices[index])
60
+
61
+
62
+ class ConcatSequence[T, M](DataSequence[T, M]):
63
+ """The end-to-end concatenation of zero or more `sources`.
64
+
65
+ Indexing maps to `(source, local_index)` via a precomputed cumulative
66
+ length array and `bisect`, so a lookup is O(log N) in the number of
67
+ sources. Negative indices are normalized; out-of-range indices raise
68
+ `IndexError`.
69
+
70
+ Example:
71
+ >>> combined = ConcatSequence(train_a, train_b, train_c)
72
+ >>> len(combined) # == len(train_a) + len(train_b) + len(train_c)
73
+ """
74
+
75
+ def __init__(self, *sources: DataSequence[T, M]) -> None:
76
+ self._sources = sources
77
+ cumulative = [0]
78
+ for s in sources:
79
+ cumulative.append(cumulative[-1] + len(s))
80
+ self._cumulative = tuple(cumulative)
81
+
82
+ @property
83
+ def sources(self) -> tuple[DataSequence[T, M], ...]:
84
+ """The wrapped sequences, in the order they were passed in."""
85
+ return self._sources
86
+
87
+ def __len__(self) -> int:
88
+ return self._cumulative[-1]
89
+
90
+ def _locate(self, index: int) -> tuple[DataSequence[T, M], int]:
91
+ """Resolve a logical index to `(source, local_index)`.
92
+
93
+ Raises:
94
+ IndexError: If `index` is outside `[-len(self), len(self))`.
95
+ """
96
+ n = self._cumulative[-1]
97
+ original = index
98
+ if index < 0:
99
+ index += n
100
+ if not 0 <= index < n:
101
+ msg = f"index {original} out of range for length {n}"
102
+ raise IndexError(msg)
103
+ i = bisect_right(self._cumulative, index) - 1
104
+ return self._sources[i], index - self._cumulative[i]
105
+
106
+ def get_item(self, index: int) -> T:
107
+ source, local = self._locate(index)
108
+ return source.get_item(local)
109
+
110
+ def get_meta(self, index: int) -> M:
111
+ source, local = self._locate(index)
112
+ return source.get_meta(local)
113
+
114
+
115
+ class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
116
+ """An abstract sliding-window view over `source`.
117
+
118
+ Each item is a tuple of `size` items from `source`, starting at
119
+ position `i * step`, with intra-window stride `skip`. Indexed item
120
+ access (`get_item`) is implemented; **the window's metadata
121
+ strategy is intentionally left abstract** so the relationship
122
+ between per-frame `M_in` and window-level `M_out` is decided at
123
+ subclass-definition time.
124
+
125
+ Subclasses use the `source`, `size`, `step`, `skip` properties and
126
+ should call `_normalize_index` from `get_meta` so negative and
127
+ out-of-range window indices behave the same way as in `get_item`.
128
+
129
+ Type Parameters:
130
+ T: Item type of `source` (also the per-frame type within each
131
+ window).
132
+ M_in: Metadata type of `source` (per-frame metadata).
133
+ M_out: Metadata type of the window. Determined by the
134
+ subclass's `get_meta` return.
135
+
136
+ Args:
137
+ source: The sequence to window over.
138
+ size: Number of items per window. Must be positive.
139
+ step: Position advance between consecutive windows. Defaults
140
+ to 1 (adjacent windows share `size - 1` items when `skip` is 1).
141
+ skip: Intra-window stride. Defaults to 1 (consecutive frames).
142
+
143
+ Raises:
144
+ ValueError: If `size`, `step`, or `skip` is non-positive.
145
+ """
146
+
147
+ def __init__(
148
+ self,
149
+ source: DataSequence[T, M_in],
150
+ size: int,
151
+ *,
152
+ step: int = 1,
153
+ skip: int = 1,
154
+ ) -> None:
155
+ if size <= 0 or step <= 0 or skip <= 0:
156
+ msg = (
157
+ f"size, step, skip must be positive "
158
+ f"(got size={size}, step={step}, skip={skip})"
159
+ )
160
+ raise ValueError(msg)
161
+ self._source = source
162
+ self._size = size
163
+ self._step = step
164
+ self._skip = skip
165
+ # The window spans `(size - 1) * skip + 1` source positions; the
166
+ # number of complete windows is then `(len(source) - span) // step + 1`.
167
+ span = (size - 1) * skip + 1
168
+ self._length = max(0, (len(source) - span) // step + 1)
169
+
170
+ @property
171
+ def source(self) -> DataSequence[T, M_in]:
172
+ """The wrapped sequence."""
173
+ return self._source
174
+
175
+ @property
176
+ def size(self) -> int:
177
+ """Number of items per window."""
178
+ return self._size
179
+
180
+ @property
181
+ def step(self) -> int:
182
+ """Position advance between consecutive windows."""
183
+ return self._step
184
+
185
+ @property
186
+ def skip(self) -> int:
187
+ """Intra-window stride."""
188
+ return self._skip
189
+
190
+ def __len__(self) -> int:
191
+ return self._length
192
+
193
+ def _normalize_index(self, index: int) -> int:
194
+ """Normalize a possibly-negative window index and validate range.
195
+
196
+ Subclasses should call this from `get_meta` to apply the same
197
+ negative-index handling and bounds checking that `get_item`
198
+ performs.
199
+
200
+ Raises:
201
+ IndexError: If `index` is outside `[-len(self), len(self))`.
202
+ """
203
+ n = self._length
204
+ original = index
205
+ if index < 0:
206
+ index += n
207
+ if not 0 <= index < n:
208
+ msg = f"index {original} out of range for length {n}"
209
+ raise IndexError(msg)
210
+ return index
211
+
212
+ def get_item(self, index: int) -> tuple[T, ...]:
213
+ index = self._normalize_index(index)
214
+ start = index * self._step
215
+ return tuple(
216
+ self._source.get_item(start + j * self._skip) for j in range(self._size)
217
+ )
218
+
219
+ @abstractmethod
220
+ def get_meta(self, index: int) -> M_out:
221
+ raise NotImplementedError
kaparoo/data/sequences/templates.py ADDED
@@ -0,0 +1,196 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = ("FileFolderSequence", "SingleFileSequence")
4
+
5
+ from abc import abstractmethod
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ from kaparoo.data.sequences.base import DataSequence
10
+ from kaparoo.filesystem.existence import ensure_dir_exists, ensure_file_exists
11
+ from kaparoo.filesystem.utils import stringify_paths, wrap_path
12
+
13
+ if TYPE_CHECKING:
14
+ from kaparoo.filesystem.types import StrPath
15
+
16
+
17
+ class FileFolderSequence[T, M = Path](DataSequence[T, M]):
18
+ """A folder-rooted `DataSequence` whose items live in individual files.
19
+
20
+ The base class handles file discovery, indexing, and root-relative
21
+ path bookkeeping. Subclasses are responsible for three things:
22
+
23
+ - **`list_files(self, root)`** (abstract): return the full `Path`
24
+ of every file to expose, in the desired order. Called once from
25
+ `__init__` after `root` has been validated. Every returned path
26
+ must be under `root`; otherwise construction raises `ValueError`.
27
+ Subclasses can read instance state to parameterize the listing
28
+ (see "Parameterized subclasses" below).
29
+ - **`load_file(self, path)`** (abstract): decode a single file.
30
+ Called lazily on each `get_item`, never at construction time.
31
+ - **`get_meta(self, index)`** (abstract): produce per-item
32
+ metadata. When the metadata IS the source path, `M` defaults
33
+ to `Path` and `get_meta(i)` can be the one-liner
34
+ `return self.get_file(i)`.
35
+
36
+ The base exposes:
37
+
38
+ - `root: Path` — the base directory.
39
+ - `files: tuple[Path, ...]` — full paths as an immutable snapshot.
40
+ - `get_file(index) -> Path` — full path of the i-th file.
41
+
42
+ Paths are kept internally in their root-relative form so that
43
+ memory stays low for large datasets and the sequence survives
44
+ `root` relocations; the conversion is transparent to subclasses
45
+ and external callers.
46
+
47
+ Parameterized subclasses:
48
+ When a subclass needs instance-level options (e.g. `pattern`,
49
+ `recursive`, label maps), set them on `self` **before** calling
50
+ `super().__init__(root)` -- the base class invokes
51
+ `self.list_files(root)` from its own `__init__`, so any state
52
+ `list_files` will read must already be in place. State that
53
+ `list_files` does *not* read (caches, label tables, ...) can
54
+ be set after `super().__init__(root)` as usual.
55
+
56
+ Type Parameters:
57
+ T: Item type returned by `get_item`.
58
+ M: Per-item metadata type. Defaults to `Path`; override when
59
+ the metadata is something else (label, line number, ...).
60
+
61
+ Args:
62
+ root: The base directory. Must exist and be a directory.
63
+
64
+ Raises:
65
+ DirectoryNotFoundError: If `root` does not exist.
66
+ NotADirectoryError: If `root` exists but is not a directory.
67
+ ValueError: If any path returned by `list_files` is not under
68
+ `root`.
69
+
70
+ Example:
71
+ >>> from pathlib import Path
72
+ >>> class GlobFolder(FileFolderSequence[bytes]):
73
+ ... def __init__(
74
+ ... self, root, *, pattern: str = "*", recursive: bool = False
75
+ ... ) -> None:
76
+ ... # Set state BEFORE super().__init__() so list_files
77
+ ... # can read it.
78
+ ... self._pattern = pattern
79
+ ... self._recursive = recursive
80
+ ... super().__init__(root)
81
+ ...
82
+ ... def list_files(self, root: Path) -> list[Path]:
83
+ ... glob_fn = root.rglob if self._recursive else root.glob
84
+ ... return sorted(p for p in glob_fn(self._pattern) if p.is_file())
85
+ ...
86
+ ... def get_meta(self, index: int) -> Path:
87
+ ... return self.get_file(index)
88
+ ...
89
+ ... def load_file(self, path: Path) -> bytes:
90
+ ... return path.read_bytes()
91
+ >>>
92
+ >>> folder = GlobFolder("data", pattern="*.png", recursive=True)
93
+ """
94
+
95
+ def __init__(self, root: StrPath) -> None:
96
+ self._root = ensure_dir_exists(root)
97
+ self._files = list(
98
+ stringify_paths(self.list_files(self._root), after=self._root)
99
+ )
100
+
101
+ def __len__(self) -> int:
102
+ return len(self._files)
103
+
104
+ @property
105
+ def root(self) -> Path:
106
+ """The base directory the sequence was constructed from."""
107
+ return self._root
108
+
109
+ @property
110
+ def files(self) -> tuple[Path, ...]:
111
+ """Immutable snapshot of the full file paths this sequence exposes.
112
+
113
+ Returns a fresh `tuple[Path, ...]` on each access, in the order
114
+ established by `list_files`.
115
+ """
116
+ return tuple(self.get_file(i) for i in range(len(self)))
117
+
118
+ def get_file(self, index: int) -> Path:
119
+ """Full Path of the file at `index`."""
120
+ return wrap_path(self._files[index], prepend=self._root)
121
+
122
+ def get_item(self, index: int) -> T:
123
+ return self.load_file(self.get_file(index))
124
+
125
+ @abstractmethod
126
+ def get_meta(self, index: int) -> M:
127
+ raise NotImplementedError
128
+
129
+ @abstractmethod
130
+ def load_file(self, path: Path) -> T:
131
+ """Decode a single file into an item of type `T`.
132
+
133
+ Called lazily on each `get_item` -- not at construction time.
134
+ Subclasses may freely use external libraries (PIL, librosa,
135
+ cv2, ...) to decode.
136
+ """
137
+ raise NotImplementedError
138
+
139
+ @abstractmethod
140
+ def list_files(self, root: Path) -> list[Path]:
141
+ """Return the full Path of every file to expose, in order.
142
+
143
+ Called once from `__init__` after `root` has been validated.
144
+ Every returned path must be under `root`; construction raises
145
+ `ValueError` otherwise. May read instance state set before
146
+ `super().__init__(root)` -- see the class docstring's
147
+ "Parameterized subclasses" note.
148
+ """
149
+ raise NotImplementedError
150
+
151
+
152
+ class SingleFileSequence[T, M = None](DataSequence[T, M]):
153
+ """A `DataSequence` backed by a single file that holds multiple records.
154
+
155
+ Thin abstract base for the "one file, many records" pattern
156
+ (a video file with many frames; a CSV with many rows; a binary
157
+ blob with fixed-size records; ...). Indexing strategies vary too
158
+ widely across formats to abstract here -- subclasses are
159
+ responsible for opening, indexing, and decoding the file.
160
+
161
+ `__init__` validates that `path` exists and is a regular file and
162
+ makes it available via the `path` property. Subclasses typically
163
+ override `__init__` to additionally open or pre-scan the file,
164
+ calling `super().__init__(path)` first.
165
+
166
+ Args:
167
+ path: The file to read. Must exist and be a regular file.
168
+
169
+ Raises:
170
+ FileNotFoundError: If `path` does not exist.
171
+ NotAFileError: If `path` exists but is not a regular file.
172
+
173
+ Example:
174
+ >>> from pathlib import Path
175
+ >>> class LinesFile(SingleFileSequence[str, int]):
176
+ ... def __init__(self, path) -> None:
177
+ ... super().__init__(path)
178
+ ... self._lines = tuple(self.path.read_text().splitlines())
179
+ ...
180
+ ... def __len__(self) -> int:
181
+ ... return len(self._lines)
182
+ ...
183
+ ... def get_item(self, index: int) -> str:
184
+ ... return self._lines[index]
185
+ ...
186
+ ... def get_meta(self, index: int) -> int:
187
+ ... return index + 1 # 1-based line number
188
+ """
189
+
190
+ def __init__(self, path: StrPath) -> None:
191
+ self._path = ensure_file_exists(path)
192
+
193
+ @property
194
+ def path(self) -> Path:
195
+ """The wrapped file's path."""
196
+ return self._path
kaparoo/data/sequences/utils.py ADDED
@@ -0,0 +1,79 @@
1
+ from __future__ import annotations
2
+
3
+ __all__ = ("generate_batches",)
4
+
5
+ from typing import TYPE_CHECKING
6
+
7
+ if TYPE_CHECKING:
8
+ from collections.abc import Iterator, Sequence
9
+
10
+
11
+ def generate_batches[T](
12
+ sequence: Sequence[T],
13
+ size: int,
14
+ *,
15
+ step: int = 1,
16
+ skip: int = 1,
17
+ start: int = 0,
18
+ stop: int | None = None,
19
+ drop_last: bool = True,
20
+ ) -> Iterator[Sequence[T]]:
21
+ """Yield sliding windows from `sequence`.
22
+
23
+ Each yielded batch is `sequence[head : tail : skip]` where `head`
24
+ advances by `step` per iteration. `size` is required; with the
25
+ defaults (`step=1, skip=1, drop_last=True`) and e.g. `size=3`, this
26
+ produces overlapping consecutive-item windows. Pass `step >= size`
27
+ (keeping `skip=1`) for a classic non-overlapping batch loader.
28
+
29
+ Traversal is constrained to the index range `[start, stop)`.
30
+ `stop=None` defaults to `len(sequence)`. An empty range (`start ==
31
+ stop`) yields nothing -- the function returns without error.
32
+
33
+ Args:
34
+ sequence: The sequence to slide windows over.
35
+ size: Number of items per window. Must be positive.
36
+ step: Position advance between consecutive windows. Defaults
37
+ to 1 (adjacent windows share `size - 1` items when `skip` is 1).
38
+ skip: Intra-window stride. Defaults to 1 (consecutive items).
39
+ start: Inclusive lower bound on source indices. Defaults to 0.
40
+ Must satisfy `0 <= start <= stop`.
41
+ stop: Exclusive upper bound on source indices. Defaults to
42
+ `len(sequence)`. The partial window (when `drop_last=False`)
43
+ respects `stop` and never extends past it.
44
+ drop_last: If False, yield a final partial (possibly shorter
45
+ than `size`) window when items remain after the last full
46
+ window. Defaults to True.
47
+
48
+ Yields:
49
+ Sub-sequences of `sequence` obtained by slicing.
50
+
51
+ Raises:
52
+ ValueError: If `size`, `step`, or `skip` is non-positive, or
53
+ if the range is not `0 <= start <= stop <= len(sequence)`.
54
+ """
55
+ if size <= 0 or step <= 0 or skip <= 0:
56
+ msg = (
57
+ f"size, step, skip must be positive "
58
+ f"(got size={size}, step={step}, skip={skip})"
59
+ )
60
+ raise ValueError(msg)
61
+
62
+ length = len(sequence)
63
+ stop = stop if stop is not None else length
64
+ if not 0 <= start <= stop <= length:
65
+ msg = f"invalid range [{start}, {stop}) for sequence of length {length}"
66
+ raise ValueError(msg)
67
+
68
+ head = start
69
+ tail = head + (size - 1) * skip + 1
70
+
71
+ while tail <= stop:
72
+ yield sequence[head:tail:skip]
73
+ head += step
74
+ tail += step
75
+
76
+ # Final partial window must respect `stop` (not `tail`, which has
77
+ # advanced past `stop` by the time we get here).
78
+ if not drop_last and head < stop:
79
+ yield sequence[head:stop:skip]
kaparoo/filesystem/search/filters/multi_pattern.py CHANGED
@@ -38,7 +38,13 @@ class MultiPatternFilter(Filter, ABC):
38
38
  tuple is deduped at construction time; the normalized form is
39
39
  what gets stored and serialized.
40
40
  case_sensitive: If False, matching is performed case-insensitively
41
- via Unicode `casefold`. Defaults to True.
41
+ via Unicode `str.casefold()`. Note that `casefold()` is more
42
+ aggressive than `str.lower()` (e.g. ``"ß".casefold() == "ss"``,
43
+ ``"ﬁ".casefold() == "fi"``), so two filenames that the
44
+ underlying filesystem treats as distinct may still match each
45
+ other here. This is the "caseless linguistic equivalence"
46
+ interpretation that Python recommends for case-insensitive
47
+ string matching. Defaults to True.
42
48
 
43
49
  Raises:
44
50
  ValueError: If `patterns` is empty.
kaparoo/filesystem/search/filters/pattern.py CHANGED
@@ -45,7 +45,13 @@ class PatternFilter(Filter, ABC):
45
45
  construction time and the normalized form is what gets
46
46
  stored and serialized.
47
47
  case_sensitive: If False, matching is performed case-insensitively
48
- via Unicode `casefold`. Defaults to True.
48
+ via Unicode `str.casefold()`. Note that `casefold()` is more
49
+ aggressive than `str.lower()` (e.g. ``"ß".casefold() == "ss"``,
50
+ ``"ﬁ".casefold() == "fi"``), so two filenames that the
51
+ underlying filesystem treats as distinct may still match each
52
+ other here. This is the "caseless linguistic equivalence"
53
+ interpretation that Python recommends for case-insensitive
54
+ string matching. Defaults to True.
49
55
  """
50
56
 
51
57
  pattern: str
kaparoo/filesystem/search/filters/utils.py CHANGED
@@ -19,7 +19,7 @@ Treated as private -- mutate only through `register_filter`.
19
19
  """
20
20
 
21
21
 
22
- def register_filter(kind: str) -> Callable[[type[Filter]], type[Filter]]:
22
+ def register_filter[F: Filter](kind: str) -> Callable[[type[F]], type[F]]:
23
23
  """Register a `Filter` subclass under `kind` (decorator).
24
24
 
25
25
  The registered class becomes discoverable by `Filter.from_dict`
@@ -37,7 +37,7 @@ def register_filter(kind: str) -> Callable[[type[Filter]], type[Filter]]:
37
37
  ValueError: If `kind` is already registered to another class.
38
38
  """
39
39
 
40
- def decorator(cls: type[Filter]) -> type[Filter]:
40
+ def decorator(cls: type[F]) -> type[F]:
41
41
  existing = _FILTER_REGISTRY.get(kind)
42
42
  if existing is not None and existing is not cls:
43
43
  msg = (
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kaparoo-python
3
- Version: 0.2.1
3
+ Version: 0.3.0
4
4
  Summary: Personally common and useful Python features
5
5
  Keywords: filesystem,pathlib,paths,utilities
6
6
  Author: Jaewoo Park
@@ -46,37 +46,48 @@ pip install kaparoo-python
46
46
 
47
47
  ## 🧩 Modules
48
48
 
49
- ### `kaparoo.filesystem`
49
+ Each submodule ships its own README with focused examples.
50
50
 
51
- `pathlib`-based filesystem helpers.
51
+ ### [`kaparoo.filesystem`](https://github.com/kaparoo/kaparoo-python/tree/main/kaparoo/filesystem)
52
52
 
53
- **`existence`** — existence checks (`*_exists`) and `ensure_*` validators.
54
- **`directory`** — `make_dir(s)`, `dir_empty(s)` (with `_unsafe` variants).
55
- **`utils`** — `stringify_path(s)`, `wrap_path(s)`.
56
- - **`exceptions`** — `DirectoryNotFoundError`, `NotAFileError`.
57
- - **`types`** — `StrPath`, `StrPaths`.
53
+ `pathlib`-based filesystem helpers: existence checks (`*_exists`),
54
+ `ensure_*` validators, `make_dir(s)`, `dir_empty(s)`, path
55
+ stringification, and a small exception hierarchy.
58
56
 
59
- ### `kaparoo.filesystem.search`
57
+ ### [`kaparoo.filesystem.search`](https://github.com/kaparoo/kaparoo-python/tree/main/kaparoo/filesystem/search)
60
58
 
61
- Filesystem traversal with composable filters.
59
+ Filesystem traversal with composable filters. Includes `search_paths` /
60
+ `search_files` / `search_dirs`, a `Filter` family (pattern, multi-pattern,
61
+ logical) that round-trips through JSON-friendly dicts, and an extension
62
+ hook for custom filter kinds.
62
63
 
63
- - **Entry points** — `search_paths`, `search_files`, `search_dirs`.
64
- - **Pattern filters** — `Equals`, `StartsWith`, `EndsWith`, `Contains`,
65
- `Regex`, `Glob`.
66
- - **Multi-pattern filters** — `EqualsAny`, `StartsWithAny`, `EndsWithAny`,
67
- `ContainsAny`.
68
- - **Logical filters** — `And`, `Or`, `Not`.
69
- - **Serialization** — `Filter.to_dict()` / `Filter.from_dict()` round-trip
70
- via a `"kind"` discriminator; `Filter.parse()` accepts a `Filter` or a
71
- `FilterDict`; `register_filter(kind)` extends the dispatcher with
72
- custom subclasses. `FilterDict` family lives at
73
- `kaparoo.filesystem.search.filters.types`.
74
- - **Deprecated** — `get_paths`, `get_files`, `get_dirs` (use `search_*`).
64
+ ### [`kaparoo.utils`](https://github.com/kaparoo/kaparoo-python/tree/main/kaparoo/utils)
75
65
 
76
- ### `kaparoo.utils`
66
+ `Timer` / `LapTimer` context-manager-and-decorator timers, plus a small
67
+ family of helpers for working with `Optional[T]` values
68
+ (`replace_if_none`, `unwrap_or_default`, ...).
77
69
 
78
- - **`timer`** — `Timer` and `LapTimer` context-manager / decorator timers.
79
- - **`optional`** — `replace_if_none`, `factory_if_none`, `unwrap_or_*`.
70
+ ### [`kaparoo.data`](https://github.com/kaparoo/kaparoo-python/tree/main/kaparoo/data)
71
+
72
+ Building blocks for dataset code: `DataSequence[T, M]` ABC (item +
73
+ metadata), composers (`SlicedSequence`, `ConcatSequence`,
74
+ `WindowedSequence`), file-backed templates (`FileFolderSequence`,
75
+ `SingleFileSequence`), and `generate_batches`.
76
+
77
+ ## 🎯 Quick example
78
+
79
+ ```python
80
+ from kaparoo.filesystem import search_files
81
+ from kaparoo.filesystem.search.filters import And, EndsWith, Equals, Not
82
+
83
+ # All .py files except __init__.py
84
+ py_files = search_files(
85
+ "src",
86
+ name_filter=And((EndsWith(".py"), Not(Equals("__init__.py")))),
87
+ )
88
+ ```
89
+
90
+ See each submodule's README for more.
80
91
 
81
92
  ## 📋 TODO
82
93
 
@@ -1,7 +1,10 @@
1
1
  kaparoo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
- kaparoo/data/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
3
- kaparoo/data/sequence.py,sha256=4i5rrHTQOx4vpVWbmCC3BushQgiuE5_rcraoWAm4Mko,1094
4
- kaparoo/data/utils.py,sha256=8g_DgZeKs4L1Dp0wGHG64UkTq8wyK6FPV7FDw5BOM0Q,1192
2
+ kaparoo/data/__init__.py,sha256=jCQNbgwE5eDLJaytQB-HWWkd2qRVYM9ndvZmBIh2MQY,368
3
+ kaparoo/data/sequences/__init__.py,sha256=rokLPUrJ1bxEx9c9wNH9ekNjEfwBW0z4o1-UEW9Qrj0,534
4
+ kaparoo/data/sequences/base.py,sha256=m2JcIcT-SLrTzsjFCtgrQ9I5XVpB6PYBigyooEpg4VE,2628
5
+ kaparoo/data/sequences/composers.py,sha256=NjQ-2fxSffE5FjTA1xfZMS65w8N13ZOd_BHj1-NiC1w,7364
6
+ kaparoo/data/sequences/templates.py,sha256=bY_znZ3vcQwRL6F--1Cs_A3e34UvDkQr1L_71l1lPLo,7611
7
+ kaparoo/data/sequences/utils.py,sha256=oe0qWwnAjsf-9CBUPSlkxkeuQS8kjn1sYhx2eDIwPKI,2808
5
8
  kaparoo/filesystem/__init__.py,sha256=OYSpqRSkEMnpnz2dpRxXnnwyQuvcnWRdHw6shJWoLDU,1420
6
9
  kaparoo/filesystem/directory.py,sha256=xHHE4ckPS4a8WHPLbXi2427pa9ePf0IQzbmRe7VPSUo,6268
7
10
  kaparoo/filesystem/exceptions.py,sha256=dWzD7d30bUEAxKWMYtJWmIg8tK3meEFz84NhWmWXb6k,464
@@ -12,10 +15,10 @@ kaparoo/filesystem/search/deprecated.py,sha256=pNJ7GflMS5d-xkVPjOwq4kAGizdtnDMDi
12
15
  kaparoo/filesystem/search/filters/__init__.py,sha256=XYxnTklHUctpkCusW0dlHMneQFOI8AJJCIR9n7eCc8U,1416
13
16
  kaparoo/filesystem/search/filters/base.py,sha256=NSnRJGFFK15R4-eSMi3_HRbpJ_dRWXC2Lxrn6xCrYPA,3309
14
17
  kaparoo/filesystem/search/filters/logical.py,sha256=aAyJHRB87yBdpUJIwNTOQG6jsPEcNefVSpcNejaTSSk,3872
15
- kaparoo/filesystem/search/filters/multi_pattern.py,sha256=He7_bgaPy931jH7NXknKqodywTGWbDhAcpumqrwL5kQ,4995
16
- kaparoo/filesystem/search/filters/pattern.py,sha256=PnRRy9IvJWz-FJaSIzZj6D3CARRcJNoeSM9MDpI2Hl0,6595
18
+ kaparoo/filesystem/search/filters/multi_pattern.py,sha256=xH7_9Lfla477yeUaWV0g8O98OcN8lJ_fjheYApvegik,5422
19
+ kaparoo/filesystem/search/filters/pattern.py,sha256=GT8tI7falx62Ydrq_CuYnuoWUMn_TUDREIHP8pi9ETo,7022
17
20
  kaparoo/filesystem/search/filters/types.py,sha256=jrRxhVZwyMgQ0_o4zOvmBB_uDCcCDEGFvIgmQH1X45A,1115
18
- kaparoo/filesystem/search/filters/utils.py,sha256=yA2KuhBjI4FSdLfy1hr4AyBA98LFtTtTkPJ9aKK54Cg,1603
21
+ kaparoo/filesystem/search/filters/utils.py,sha256=j-AfVu298t5GuQDb1UCOtb0L7R2Hm9gMD7UuFbIe2yE,1594
19
22
  kaparoo/filesystem/search/wrappers.py,sha256=CDLvw9IhVNZLtMSdGnldJLum9tt8UTSQoRphOxg0qD8,10782
20
23
  kaparoo/filesystem/types.py,sha256=Cm1MLEAujVRyB9uciJ-uUxC0hw6j-ycxVPKS9KFmJnw,202
21
24
  kaparoo/filesystem/utils.py,sha256=kP-4GgfFr6iwUmnNJmV2QIO_5vBWpsU9uBxZMJKqjwY,6400
@@ -23,7 +26,7 @@ kaparoo/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
26
  kaparoo/utils/__init__.py,sha256=BJLpxE8OIJpNsR7FP7YpfXNajynKAS3Jd3qv6xDaBsI,445
24
27
  kaparoo/utils/optional.py,sha256=UgNhGDzl317PE_ESt9hW7yl9MYcdL0nV6Ly0mpqIz0U,4224
25
28
  kaparoo/utils/timer.py,sha256=P2xKbPrTeYPlg06vwyJIQGF7T9xs6yF5Hgm56KfLh5s,12916
26
- kaparoo_python-0.2.1.dist-info/licenses/LICENSE,sha256=hb6LWYP2rtcoz4V2HpawmblDfHwjwsg9N3cz0c5JQJE,1067
27
- kaparoo_python-0.2.1.dist-info/WHEEL,sha256=f5fWSvWsg5Knq5GWa6t1nJIug0Tqo69GqAWD_9LbBKw,81
28
- kaparoo_python-0.2.1.dist-info/METADATA,sha256=zTMvwww6w7ZBNhjiQzG0LAoGpFHZebjvlZKePfqU9iE,3647
29
- kaparoo_python-0.2.1.dist-info/RECORD,,
29
+ kaparoo_python-0.3.0.dist-info/licenses/LICENSE,sha256=hb6LWYP2rtcoz4V2HpawmblDfHwjwsg9N3cz0c5JQJE,1067
30
+ kaparoo_python-0.3.0.dist-info/WHEEL,sha256=f5fWSvWsg5Knq5GWa6t1nJIug0Tqo69GqAWD_9LbBKw,81
31
+ kaparoo_python-0.3.0.dist-info/METADATA,sha256=uJa9RUSbQ17apuTfYb_kKR2Iyd0Gq_K5Ve8838z_lQI,3947
32
+ kaparoo_python-0.3.0.dist-info/RECORD,,
kaparoo/data/sequence.py DELETED
@@ -1,39 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = ("DataSequence",)
4
-
5
- from abc import abstractmethod
6
- from collections.abc import Sequence
7
- from typing import TYPE_CHECKING, overload
8
-
9
- if TYPE_CHECKING:
10
- from kaparoo.filesystem.types import StrPath
11
-
12
-
13
- class DataSequence[T](Sequence[T]):
14
- @abstractmethod
15
- def __init__(self, path: StrPath) -> None:
16
- raise NotImplementedError
17
-
18
- @abstractmethod
19
- def __len__(self) -> int:
20
- raise NotImplementedError
21
-
22
- @overload
23
- def __getitem__(self, index: int, /) -> T: ...
24
-
25
- @overload
26
- def __getitem__(self, index: slice, /) -> Sequence[T]: ...
27
-
28
- def __getitem__(self, index: int | slice, /) -> T | Sequence[T]:
29
- if isinstance(index, slice):
30
- start, stop, step = index.indices(len(self))
31
- return self.by_indices(range(start, stop, step))
32
- return self.by_index(index)
33
-
34
- @abstractmethod
35
- def by_index(self, index: int) -> T:
36
- raise NotImplementedError
37
-
38
- def by_indices(self, indices: Sequence[int]) -> Sequence[T]:
39
- return [self.by_index(index) for index in indices]
kaparoo/data/utils.py DELETED
@@ -1,46 +0,0 @@
1
- from __future__ import annotations
2
-
3
- __all__ = ("generate_batches",)
4
-
5
- from typing import TYPE_CHECKING
6
-
7
- from kaparoo.utils.optional import replace_if_none
8
-
9
- if TYPE_CHECKING:
10
- from collections.abc import Iterator, Sequence
11
-
12
-
13
- def generate_batches[T](
14
- sequence: Sequence[T],
15
- size: int,
16
- step: int = 1,
17
- skip: int = 1,
18
- start: int = 0,
19
- stop: int | None = None,
20
- *,
21
- drop_last: bool = True,
22
- ) -> Iterator[Sequence[T]]:
23
- def die_if_not_positive(name: str, value: int) -> None:
24
- if value <= 0:
25
- msg = f"{name} must be positive (got {value})"
26
- raise ValueError(msg)
27
-
28
- die_if_not_positive("size", size)
29
- die_if_not_positive("step", step)
30
- die_if_not_positive("skip", skip)
31
-
32
- stop = replace_if_none(stop, len_ := len(sequence))
33
- if not (start < stop <= len_ and start >= 0):
34
- msg = f"invalid range [{start}, {stop}) for sequence of length {len_}"
35
- raise ValueError(msg)
36
-
37
- head = start
38
- tail = head + (size - 1) * skip + 1
39
-
40
- while tail <= stop:
41
- yield sequence[head:tail:skip]
42
- head += step
43
- tail += step
44
-
45
- if not drop_last and head < stop:
46
- yield sequence[head:tail:skip]