kaparoo-python 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kaparoo/data/__init__.py +19 -0
- kaparoo/data/sequences/__init__.py +23 -0
- kaparoo/data/sequences/base.py +73 -0
- kaparoo/data/sequences/composers.py +221 -0
- kaparoo/data/sequences/templates.py +196 -0
- kaparoo/data/sequences/utils.py +79 -0
- kaparoo/filesystem/__init__.py +73 -73
- kaparoo/filesystem/directory.py +226 -226
- kaparoo/filesystem/exceptions.py +17 -17
- kaparoo/filesystem/existence.py +392 -392
- kaparoo/filesystem/search/__init__.py +2 -0
- kaparoo/filesystem/search/classes.py +208 -199
- kaparoo/filesystem/search/deprecated.py +289 -289
- kaparoo/filesystem/search/filters/__init__.py +73 -0
- kaparoo/filesystem/search/filters/base.py +92 -0
- kaparoo/filesystem/search/filters/logical.py +138 -0
- kaparoo/filesystem/search/filters/multi_pattern.py +160 -0
- kaparoo/filesystem/search/filters/pattern.py +216 -0
- kaparoo/filesystem/search/filters/types.py +47 -0
- kaparoo/filesystem/search/filters/utils.py +51 -0
- kaparoo/filesystem/search/wrappers.py +318 -311
- kaparoo/filesystem/types.py +9 -9
- kaparoo/filesystem/utils.py +208 -208
- kaparoo/utils/__init__.py +21 -21
- kaparoo/utils/optional.py +129 -129
- kaparoo/utils/timer.py +374 -374
- {kaparoo_python-0.2.0.dist-info → kaparoo_python-0.3.0.dist-info}/METADATA +36 -20
- kaparoo_python-0.3.0.dist-info/RECORD +32 -0
- {kaparoo_python-0.2.0.dist-info → kaparoo_python-0.3.0.dist-info}/WHEEL +1 -1
- kaparoo/data/sequence.py +0 -39
- kaparoo/data/utils.py +0 -46
- kaparoo/filesystem/search/filters.py +0 -322
- kaparoo_python-0.2.0.dist-info/RECORD +0 -23
- {kaparoo_python-0.2.0.dist-info → kaparoo_python-0.3.0.dist-info}/licenses/LICENSE +0 -0
kaparoo/data/__init__.py
CHANGED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
__all__ = (
|
|
2
|
+
"ConcatSequence",
|
|
3
|
+
"DataSequence",
|
|
4
|
+
"FileFolderSequence",
|
|
5
|
+
"SingleFileSequence",
|
|
6
|
+
"SlicedSequence",
|
|
7
|
+
"WindowedSequence",
|
|
8
|
+
"generate_batches",
|
|
9
|
+
)
|
|
10
|
+
|
|
11
|
+
from kaparoo.data.sequences import (
|
|
12
|
+
ConcatSequence,
|
|
13
|
+
DataSequence,
|
|
14
|
+
FileFolderSequence,
|
|
15
|
+
SingleFileSequence,
|
|
16
|
+
SlicedSequence,
|
|
17
|
+
WindowedSequence,
|
|
18
|
+
generate_batches,
|
|
19
|
+
)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
__all__ = (
|
|
4
|
+
"ConcatSequence",
|
|
5
|
+
"DataSequence",
|
|
6
|
+
"FileFolderSequence",
|
|
7
|
+
"SingleFileSequence",
|
|
8
|
+
"SlicedSequence",
|
|
9
|
+
"WindowedSequence",
|
|
10
|
+
"generate_batches",
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
from kaparoo.data.sequences.base import DataSequence
|
|
14
|
+
from kaparoo.data.sequences.composers import (
|
|
15
|
+
ConcatSequence,
|
|
16
|
+
SlicedSequence,
|
|
17
|
+
WindowedSequence,
|
|
18
|
+
)
|
|
19
|
+
from kaparoo.data.sequences.templates import (
|
|
20
|
+
FileFolderSequence,
|
|
21
|
+
SingleFileSequence,
|
|
22
|
+
)
|
|
23
|
+
from kaparoo.data.sequences.utils import generate_batches
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
__all__ = ("DataSequence",)
|
|
4
|
+
|
|
5
|
+
from abc import abstractmethod
|
|
6
|
+
from collections.abc import Sequence
|
|
7
|
+
from typing import overload
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DataSequence[T, M = None](Sequence[T]):
|
|
11
|
+
"""An ordered, lazily-loaded, read-only sequence with per-item metadata.
|
|
12
|
+
|
|
13
|
+
Subclasses implement `get_item` (and `__len__`) to fetch a single
|
|
14
|
+
item by index. Sequence operations (`ds[i]`, `ds[i:j]`, `for x in
|
|
15
|
+
ds`, `x in ds`, `reversed(ds)`, ...) come from the inherited
|
|
16
|
+
`collections.abc.Sequence` protocol; only `get_item` need be
|
|
17
|
+
overridden, and `get_items` may be overridden for batch-fetch
|
|
18
|
+
optimization.
|
|
19
|
+
|
|
20
|
+
The second type parameter `M` carries per-item metadata (labels,
|
|
21
|
+
source paths, timestamps, ...). Subclasses implement `get_meta`.
|
|
22
|
+
When the data has no metadata, parameterize as `DataSequence[T]`
|
|
23
|
+
(so `M` defaults to `None`) and let `get_meta` simply return
|
|
24
|
+
`None`.
|
|
25
|
+
|
|
26
|
+
Type Parameters:
|
|
27
|
+
T: Element type. `ds[i]` and `get_item(i)` return `T`.
|
|
28
|
+
M: Per-item metadata type. `get_meta(i)` returns `M`. Defaults
|
|
29
|
+
to `None` -- meaning "no metadata", in which case
|
|
30
|
+
subclasses still implement `get_meta` but as a no-op.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
@abstractmethod
|
|
34
|
+
def __len__(self) -> int:
|
|
35
|
+
raise NotImplementedError
|
|
36
|
+
|
|
37
|
+
# --- item access -------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
@overload
|
|
40
|
+
def __getitem__(self, index: int, /) -> T: ...
|
|
41
|
+
|
|
42
|
+
@overload
|
|
43
|
+
def __getitem__(self, index: slice, /) -> Sequence[T]: ...
|
|
44
|
+
|
|
45
|
+
def __getitem__(self, index: int | slice, /) -> T | Sequence[T]:
|
|
46
|
+
if isinstance(index, slice):
|
|
47
|
+
start, stop, step = index.indices(len(self))
|
|
48
|
+
return self.get_items(range(start, stop, step))
|
|
49
|
+
return self.get_item(index)
|
|
50
|
+
|
|
51
|
+
@abstractmethod
|
|
52
|
+
def get_item(self, index: int) -> T:
|
|
53
|
+
raise NotImplementedError
|
|
54
|
+
|
|
55
|
+
def get_items(self, indices: Sequence[int]) -> Sequence[T]:
|
|
56
|
+
return [self.get_item(index) for index in indices]
|
|
57
|
+
|
|
58
|
+
# --- metadata access ---------------------------------------------------
|
|
59
|
+
|
|
60
|
+
@abstractmethod
|
|
61
|
+
def get_meta(self, index: int) -> M:
|
|
62
|
+
raise NotImplementedError
|
|
63
|
+
|
|
64
|
+
def get_metas(self, indices: Sequence[int]) -> Sequence[M]:
|
|
65
|
+
return [self.get_meta(index) for index in indices]
|
|
66
|
+
|
|
67
|
+
# --- combined item + metadata ------------------------------------------
|
|
68
|
+
|
|
69
|
+
def get_pair(self, index: int) -> tuple[T, M]:
|
|
70
|
+
return self.get_item(index), self.get_meta(index)
|
|
71
|
+
|
|
72
|
+
def get_pairs(self, indices: Sequence[int]) -> Sequence[tuple[T, M]]:
|
|
73
|
+
return [self.get_pair(index) for index in indices]
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
__all__ = ("ConcatSequence", "SlicedSequence", "WindowedSequence")
|
|
4
|
+
|
|
5
|
+
from abc import abstractmethod
|
|
6
|
+
from bisect import bisect_right
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from kaparoo.data.sequences.base import DataSequence
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from collections.abc import Sequence
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class SlicedSequence[T, M](DataSequence[T, M]):
|
|
16
|
+
"""A view of `source` exposing only items at the given `indices`.
|
|
17
|
+
|
|
18
|
+
`indices` is materialized as a tuple at construction time so that the
|
|
19
|
+
view has a stable length and supports O(1) random access. Negative
|
|
20
|
+
and out-of-range indices delegate to Python's tuple semantics
|
|
21
|
+
(negative wraps, out-of-range raises `IndexError`).
|
|
22
|
+
|
|
23
|
+
`indices` is taken as-is: duplicates are allowed (the same source
|
|
24
|
+
item is yielded multiple times) and order is preserved (no sorting).
|
|
25
|
+
Bounds against `source` are not validated at construction; an
|
|
26
|
+
out-of-range entry surfaces only when that position is accessed.
|
|
27
|
+
|
|
28
|
+
Example:
|
|
29
|
+
>>> sliced = SlicedSequence(full_dataset, [3, 7, 11])
|
|
30
|
+
>>> sliced[0] # == full_dataset[3]
|
|
31
|
+
>>> sliced[1] # == full_dataset[7]
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
def __init__(
|
|
35
|
+
self,
|
|
36
|
+
source: DataSequence[T, M],
|
|
37
|
+
indices: Sequence[int],
|
|
38
|
+
) -> None:
|
|
39
|
+
self._source = source
|
|
40
|
+
self._indices = tuple(indices)
|
|
41
|
+
|
|
42
|
+
@property
|
|
43
|
+
def source(self) -> DataSequence[T, M]:
|
|
44
|
+
"""The wrapped sequence."""
|
|
45
|
+
return self._source
|
|
46
|
+
|
|
47
|
+
@property
|
|
48
|
+
def indices(self) -> tuple[int, ...]:
|
|
49
|
+
"""The index map into `source`, frozen at construction."""
|
|
50
|
+
return self._indices
|
|
51
|
+
|
|
52
|
+
def __len__(self) -> int:
|
|
53
|
+
return len(self._indices)
|
|
54
|
+
|
|
55
|
+
def get_item(self, index: int) -> T:
|
|
56
|
+
return self._source.get_item(self._indices[index])
|
|
57
|
+
|
|
58
|
+
def get_meta(self, index: int) -> M:
|
|
59
|
+
return self._source.get_meta(self._indices[index])
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class ConcatSequence[T, M](DataSequence[T, M]):
|
|
63
|
+
"""The end-to-end concatenation of zero or more `sources`.
|
|
64
|
+
|
|
65
|
+
Indexing maps to `(source, local_index)` via a precomputed cumulative
|
|
66
|
+
length array and `bisect`, so a lookup is O(log N) in the number of
|
|
67
|
+
sources. Negative indices are normalized; out-of-range indices raise
|
|
68
|
+
`IndexError`.
|
|
69
|
+
|
|
70
|
+
Example:
|
|
71
|
+
>>> combined = ConcatSequence(train_a, train_b, train_c)
|
|
72
|
+
>>> len(combined) # == len(train_a) + len(train_b) + len(train_c)
|
|
73
|
+
"""
|
|
74
|
+
|
|
75
|
+
def __init__(self, *sources: DataSequence[T, M]) -> None:
|
|
76
|
+
self._sources = sources
|
|
77
|
+
cumulative = [0]
|
|
78
|
+
for s in sources:
|
|
79
|
+
cumulative.append(cumulative[-1] + len(s))
|
|
80
|
+
self._cumulative = tuple(cumulative)
|
|
81
|
+
|
|
82
|
+
@property
|
|
83
|
+
def sources(self) -> tuple[DataSequence[T, M], ...]:
|
|
84
|
+
"""The wrapped sequences, in the order they were passed in."""
|
|
85
|
+
return self._sources
|
|
86
|
+
|
|
87
|
+
def __len__(self) -> int:
|
|
88
|
+
return self._cumulative[-1]
|
|
89
|
+
|
|
90
|
+
def _locate(self, index: int) -> tuple[DataSequence[T, M], int]:
|
|
91
|
+
"""Resolve a logical index to `(source, local_index)`.
|
|
92
|
+
|
|
93
|
+
Raises:
|
|
94
|
+
IndexError: If `index` is outside `[-len(self), len(self))`.
|
|
95
|
+
"""
|
|
96
|
+
n = self._cumulative[-1]
|
|
97
|
+
original = index
|
|
98
|
+
if index < 0:
|
|
99
|
+
index += n
|
|
100
|
+
if not 0 <= index < n:
|
|
101
|
+
msg = f"index {original} out of range for length {n}"
|
|
102
|
+
raise IndexError(msg)
|
|
103
|
+
i = bisect_right(self._cumulative, index) - 1
|
|
104
|
+
return self._sources[i], index - self._cumulative[i]
|
|
105
|
+
|
|
106
|
+
def get_item(self, index: int) -> T:
|
|
107
|
+
source, local = self._locate(index)
|
|
108
|
+
return source.get_item(local)
|
|
109
|
+
|
|
110
|
+
def get_meta(self, index: int) -> M:
|
|
111
|
+
source, local = self._locate(index)
|
|
112
|
+
return source.get_meta(local)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
class WindowedSequence[T, M_in, M_out](DataSequence[tuple[T, ...], M_out]):
|
|
116
|
+
"""An abstract sliding-window view over `source`.
|
|
117
|
+
|
|
118
|
+
Each item is a tuple of `size` items from `source`, starting at
|
|
119
|
+
position `i * step`, with intra-window stride `skip`. Indexed item
|
|
120
|
+
access (`get_item`) is implemented; **the window's metadata
|
|
121
|
+
strategy is intentionally left abstract** so the relationship
|
|
122
|
+
between per-frame `M_in` and window-level `M_out` is decided at
|
|
123
|
+
subclass-definition time.
|
|
124
|
+
|
|
125
|
+
Subclasses use the `source`, `size`, `step`, `skip` properties and
|
|
126
|
+
should call `_normalize_index` from `get_meta` so negative and
|
|
127
|
+
out-of-range window indices behave the same way as in `get_item`.
|
|
128
|
+
|
|
129
|
+
Type Parameters:
|
|
130
|
+
T: Item type of `source` (also the per-frame type within each
|
|
131
|
+
window).
|
|
132
|
+
M_in: Metadata type of `source` (per-frame metadata).
|
|
133
|
+
M_out: Metadata type of the window. Determined by the
|
|
134
|
+
subclass's `get_meta` return.
|
|
135
|
+
|
|
136
|
+
Args:
|
|
137
|
+
source: The sequence to window over.
|
|
138
|
+
size: Number of items per window. Must be positive.
|
|
139
|
+
step: Position advance between consecutive windows. Defaults
|
|
140
|
+
to 1 (overlapping windows by `size - 1`).
|
|
141
|
+
skip: Intra-window stride. Defaults to 1 (consecutive frames).
|
|
142
|
+
|
|
143
|
+
Raises:
|
|
144
|
+
ValueError: If `size`, `step`, or `skip` is non-positive.
|
|
145
|
+
"""
|
|
146
|
+
|
|
147
|
+
def __init__(
|
|
148
|
+
self,
|
|
149
|
+
source: DataSequence[T, M_in],
|
|
150
|
+
size: int,
|
|
151
|
+
*,
|
|
152
|
+
step: int = 1,
|
|
153
|
+
skip: int = 1,
|
|
154
|
+
) -> None:
|
|
155
|
+
if size <= 0 or step <= 0 or skip <= 0:
|
|
156
|
+
msg = (
|
|
157
|
+
f"size, step, skip must be positive "
|
|
158
|
+
f"(got size={size}, step={step}, skip={skip})"
|
|
159
|
+
)
|
|
160
|
+
raise ValueError(msg)
|
|
161
|
+
self._source = source
|
|
162
|
+
self._size = size
|
|
163
|
+
self._step = step
|
|
164
|
+
self._skip = skip
|
|
165
|
+
# The window spans `(size - 1) * skip + 1` source positions; the
|
|
166
|
+
# number of complete windows is then `(len(source) - span) // step + 1`.
|
|
167
|
+
span = (size - 1) * skip + 1
|
|
168
|
+
self._length = max(0, (len(source) - span) // step + 1)
|
|
169
|
+
|
|
170
|
+
@property
|
|
171
|
+
def source(self) -> DataSequence[T, M_in]:
|
|
172
|
+
"""The wrapped sequence."""
|
|
173
|
+
return self._source
|
|
174
|
+
|
|
175
|
+
@property
|
|
176
|
+
def size(self) -> int:
|
|
177
|
+
"""Number of items per window."""
|
|
178
|
+
return self._size
|
|
179
|
+
|
|
180
|
+
@property
|
|
181
|
+
def step(self) -> int:
|
|
182
|
+
"""Position advance between consecutive windows."""
|
|
183
|
+
return self._step
|
|
184
|
+
|
|
185
|
+
@property
|
|
186
|
+
def skip(self) -> int:
|
|
187
|
+
"""Intra-window stride."""
|
|
188
|
+
return self._skip
|
|
189
|
+
|
|
190
|
+
def __len__(self) -> int:
|
|
191
|
+
return self._length
|
|
192
|
+
|
|
193
|
+
def _normalize_index(self, index: int) -> int:
|
|
194
|
+
"""Normalize a possibly-negative window index and validate range.
|
|
195
|
+
|
|
196
|
+
Subclasses should call this from `get_meta` to apply the same
|
|
197
|
+
negative-index handling and bounds checking that `get_item`
|
|
198
|
+
performs.
|
|
199
|
+
|
|
200
|
+
Raises:
|
|
201
|
+
IndexError: If `index` is outside `[-len(self), len(self))`.
|
|
202
|
+
"""
|
|
203
|
+
n = self._length
|
|
204
|
+
original = index
|
|
205
|
+
if index < 0:
|
|
206
|
+
index += n
|
|
207
|
+
if not 0 <= index < n:
|
|
208
|
+
msg = f"index {original} out of range for length {n}"
|
|
209
|
+
raise IndexError(msg)
|
|
210
|
+
return index
|
|
211
|
+
|
|
212
|
+
def get_item(self, index: int) -> tuple[T, ...]:
|
|
213
|
+
index = self._normalize_index(index)
|
|
214
|
+
start = index * self._step
|
|
215
|
+
return tuple(
|
|
216
|
+
self._source.get_item(start + j * self._skip) for j in range(self._size)
|
|
217
|
+
)
|
|
218
|
+
|
|
219
|
+
@abstractmethod
|
|
220
|
+
def get_meta(self, index: int) -> M_out:
|
|
221
|
+
raise NotImplementedError
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
__all__ = ("FileFolderSequence", "SingleFileSequence")
|
|
4
|
+
|
|
5
|
+
from abc import abstractmethod
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
from kaparoo.data.sequences.base import DataSequence
|
|
10
|
+
from kaparoo.filesystem.existence import ensure_dir_exists, ensure_file_exists
|
|
11
|
+
from kaparoo.filesystem.utils import stringify_paths, wrap_path
|
|
12
|
+
|
|
13
|
+
if TYPE_CHECKING:
|
|
14
|
+
from kaparoo.filesystem.types import StrPath
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class FileFolderSequence[T, M = Path](DataSequence[T, M]):
|
|
18
|
+
"""A folder-rooted `DataSequence` whose items live in individual files.
|
|
19
|
+
|
|
20
|
+
The base class handles file discovery, indexing, and root-relative
|
|
21
|
+
path bookkeeping. Subclasses are responsible for three things:
|
|
22
|
+
|
|
23
|
+
- **`list_files(self, root)`** (abstract): return the full `Path`
|
|
24
|
+
of every file to expose, in the desired order. Called once from
|
|
25
|
+
`__init__` after `root` has been validated. Every returned path
|
|
26
|
+
must be under `root`; otherwise construction raises `ValueError`.
|
|
27
|
+
Subclasses can read instance state to parameterize the listing
|
|
28
|
+
(see "Parameterized subclasses" below).
|
|
29
|
+
- **`load_file(self, path)`** (abstract): decode a single file.
|
|
30
|
+
Called lazily on each `get_item`, never at construction time.
|
|
31
|
+
- **`get_meta(self, index)`** (abstract): produce per-item
|
|
32
|
+
metadata. When the metadata IS the source path, `M` defaults
|
|
33
|
+
to `Path` and `get_meta(i)` can be the one-liner
|
|
34
|
+
`return self.get_file(i)`.
|
|
35
|
+
|
|
36
|
+
The base exposes:
|
|
37
|
+
|
|
38
|
+
- `root: Path` — the base directory.
|
|
39
|
+
- `files: tuple[Path, ...]` — full paths as an immutable snapshot.
|
|
40
|
+
- `get_file(index) -> Path` — full path of the i-th file.
|
|
41
|
+
|
|
42
|
+
Paths are kept internally in their root-relative form so that
|
|
43
|
+
memory stays low for large datasets and the sequence survives
|
|
44
|
+
`root` relocations; the conversion is transparent to subclasses
|
|
45
|
+
and external callers.
|
|
46
|
+
|
|
47
|
+
Parameterized subclasses:
|
|
48
|
+
When a subclass needs instance-level options (e.g. `pattern`,
|
|
49
|
+
`recursive`, label maps), set them on `self` **before** calling
|
|
50
|
+
`super().__init__(root)` -- the base class invokes
|
|
51
|
+
`self.list_files(root)` from its own `__init__`, so any state
|
|
52
|
+
`list_files` will read must already be in place. State that
|
|
53
|
+
`list_files` does *not* read (caches, label tables, ...) can
|
|
54
|
+
be set after `super().__init__(root)` as usual.
|
|
55
|
+
|
|
56
|
+
Type Parameters:
|
|
57
|
+
T: Item type returned by `get_item`.
|
|
58
|
+
M: Per-item metadata type. Defaults to `Path`; override when
|
|
59
|
+
the metadata is something else (label, line number, ...).
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
root: The base directory. Must exist and be a directory.
|
|
63
|
+
|
|
64
|
+
Raises:
|
|
65
|
+
DirectoryNotFoundError: If `root` does not exist.
|
|
66
|
+
NotADirectoryError: If `root` exists but is not a directory.
|
|
67
|
+
ValueError: If any path returned by `list_files` is not under
|
|
68
|
+
`root`.
|
|
69
|
+
|
|
70
|
+
Example:
|
|
71
|
+
>>> from pathlib import Path
|
|
72
|
+
>>> class GlobFolder(FileFolderSequence[bytes]):
|
|
73
|
+
... def __init__(
|
|
74
|
+
... self, root, *, pattern: str = "*", recursive: bool = False
|
|
75
|
+
... ) -> None:
|
|
76
|
+
... # Set state BEFORE super().__init__() so list_files
|
|
77
|
+
... # can read it.
|
|
78
|
+
... self._pattern = pattern
|
|
79
|
+
... self._recursive = recursive
|
|
80
|
+
... super().__init__(root)
|
|
81
|
+
...
|
|
82
|
+
... def list_files(self, root: Path) -> list[Path]:
|
|
83
|
+
... glob_fn = root.rglob if self._recursive else root.glob
|
|
84
|
+
... return sorted(p for p in glob_fn(self._pattern) if p.is_file())
|
|
85
|
+
...
|
|
86
|
+
... def get_meta(self, index: int) -> Path:
|
|
87
|
+
... return self.get_file(index)
|
|
88
|
+
...
|
|
89
|
+
... def load_file(self, path: Path) -> bytes:
|
|
90
|
+
... return path.read_bytes()
|
|
91
|
+
>>>
|
|
92
|
+
>>> folder = GlobFolder("data", pattern="*.png", recursive=True)
|
|
93
|
+
"""
|
|
94
|
+
|
|
95
|
+
def __init__(self, root: StrPath) -> None:
|
|
96
|
+
self._root = ensure_dir_exists(root)
|
|
97
|
+
self._files = list(
|
|
98
|
+
stringify_paths(self.list_files(self._root), after=self._root)
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
def __len__(self) -> int:
|
|
102
|
+
return len(self._files)
|
|
103
|
+
|
|
104
|
+
@property
|
|
105
|
+
def root(self) -> Path:
|
|
106
|
+
"""The base directory the sequence was constructed from."""
|
|
107
|
+
return self._root
|
|
108
|
+
|
|
109
|
+
@property
|
|
110
|
+
def files(self) -> tuple[Path, ...]:
|
|
111
|
+
"""Immutable snapshot of the full file paths this sequence exposes.
|
|
112
|
+
|
|
113
|
+
Returns a fresh `tuple[Path, ...]` on each access, in the order
|
|
114
|
+
established by `list_files`.
|
|
115
|
+
"""
|
|
116
|
+
return tuple(self.get_file(i) for i in range(len(self)))
|
|
117
|
+
|
|
118
|
+
def get_file(self, index: int) -> Path:
|
|
119
|
+
"""Full Path of the file at `index`."""
|
|
120
|
+
return wrap_path(self._files[index], prepend=self._root)
|
|
121
|
+
|
|
122
|
+
def get_item(self, index: int) -> T:
|
|
123
|
+
return self.load_file(self.get_file(index))
|
|
124
|
+
|
|
125
|
+
@abstractmethod
|
|
126
|
+
def get_meta(self, index: int) -> M:
|
|
127
|
+
raise NotImplementedError
|
|
128
|
+
|
|
129
|
+
@abstractmethod
|
|
130
|
+
def load_file(self, path: Path) -> T:
|
|
131
|
+
"""Decode a single file into an item of type `T`.
|
|
132
|
+
|
|
133
|
+
Called lazily on each `get_item` -- not at construction time.
|
|
134
|
+
Subclasses may freely use external libraries (PIL, librosa,
|
|
135
|
+
cv2, ...) to decode.
|
|
136
|
+
"""
|
|
137
|
+
raise NotImplementedError
|
|
138
|
+
|
|
139
|
+
@abstractmethod
|
|
140
|
+
def list_files(self, root: Path) -> list[Path]:
|
|
141
|
+
"""Return the full Path of every file to expose, in order.
|
|
142
|
+
|
|
143
|
+
Called once from `__init__` after `root` has been validated.
|
|
144
|
+
Every returned path must be under `root`; construction raises
|
|
145
|
+
`ValueError` otherwise. May read instance state set before
|
|
146
|
+
`super().__init__(root)` -- see the class docstring's
|
|
147
|
+
"Parameterized subclasses" note.
|
|
148
|
+
"""
|
|
149
|
+
raise NotImplementedError
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class SingleFileSequence[T, M = None](DataSequence[T, M]):
|
|
153
|
+
"""A `DataSequence` backed by a single file that holds multiple records.
|
|
154
|
+
|
|
155
|
+
Thin abstract base for the "one file, many records" pattern
|
|
156
|
+
(a video file with many frames; a CSV with many rows; a binary
|
|
157
|
+
blob with fixed-size records; ...). Indexing strategies vary too
|
|
158
|
+
widely across formats to abstract here -- subclasses are
|
|
159
|
+
responsible for opening, indexing, and decoding the file.
|
|
160
|
+
|
|
161
|
+
`__init__` validates that `path` exists and is a regular file and
|
|
162
|
+
makes it available via the `path` property. Subclasses typically
|
|
163
|
+
override `__init__` to additionally open or pre-scan the file,
|
|
164
|
+
calling `super().__init__(path)` first.
|
|
165
|
+
|
|
166
|
+
Args:
|
|
167
|
+
path: The file to read. Must exist and be a regular file.
|
|
168
|
+
|
|
169
|
+
Raises:
|
|
170
|
+
FileNotFoundError: If `path` does not exist.
|
|
171
|
+
NotAFileError: If `path` exists but is not a regular file.
|
|
172
|
+
|
|
173
|
+
Example:
|
|
174
|
+
>>> from pathlib import Path
|
|
175
|
+
>>> class LinesFile(SingleFileSequence[str, int]):
|
|
176
|
+
... def __init__(self, path) -> None:
|
|
177
|
+
... super().__init__(path)
|
|
178
|
+
... self._lines = tuple(self.path.read_text().splitlines())
|
|
179
|
+
...
|
|
180
|
+
... def __len__(self) -> int:
|
|
181
|
+
... return len(self._lines)
|
|
182
|
+
...
|
|
183
|
+
... def get_item(self, index: int) -> str:
|
|
184
|
+
... return self._lines[index]
|
|
185
|
+
...
|
|
186
|
+
... def get_meta(self, index: int) -> int:
|
|
187
|
+
... return index + 1 # 1-based line number
|
|
188
|
+
"""
|
|
189
|
+
|
|
190
|
+
def __init__(self, path: StrPath) -> None:
|
|
191
|
+
self._path = ensure_file_exists(path)
|
|
192
|
+
|
|
193
|
+
@property
|
|
194
|
+
def path(self) -> Path:
|
|
195
|
+
"""The wrapped file's path."""
|
|
196
|
+
return self._path
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
__all__ = ("generate_batches",)
|
|
4
|
+
|
|
5
|
+
from typing import TYPE_CHECKING
|
|
6
|
+
|
|
7
|
+
if TYPE_CHECKING:
|
|
8
|
+
from collections.abc import Iterator, Sequence
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def generate_batches[T](
|
|
12
|
+
sequence: Sequence[T],
|
|
13
|
+
size: int,
|
|
14
|
+
*,
|
|
15
|
+
step: int = 1,
|
|
16
|
+
skip: int = 1,
|
|
17
|
+
start: int = 0,
|
|
18
|
+
stop: int | None = None,
|
|
19
|
+
drop_last: bool = True,
|
|
20
|
+
) -> Iterator[Sequence[T]]:
|
|
21
|
+
"""Yield sliding windows from `sequence`.
|
|
22
|
+
|
|
23
|
+
Each yielded batch is `sequence[head : tail : skip]` where `head`
|
|
24
|
+
advances by `step` per iteration. With the defaults (`size=3,
|
|
25
|
+
step=1, skip=1, drop_last=True`), this produces overlapping
|
|
26
|
+
consecutive-frame windows; pair this with a non-overlapping `step
|
|
27
|
+
>= size` for a classic non-overlapping batch loader.
|
|
28
|
+
|
|
29
|
+
Traversal is constrained to the index range `[start, stop)`.
|
|
30
|
+
`stop=None` defaults to `len(sequence)`. An empty range (`start ==
|
|
31
|
+
stop`) yields nothing -- the function returns without error.
|
|
32
|
+
|
|
33
|
+
Args:
|
|
34
|
+
sequence: The sequence to slide windows over.
|
|
35
|
+
size: Number of items per window. Must be positive.
|
|
36
|
+
step: Position advance between consecutive windows. Defaults
|
|
37
|
+
to 1 (overlapping windows by `size - 1`).
|
|
38
|
+
skip: Intra-window stride. Defaults to 1 (consecutive items).
|
|
39
|
+
start: Inclusive lower bound on source indices. Defaults to 0.
|
|
40
|
+
Must satisfy `0 <= start <= stop`.
|
|
41
|
+
stop: Exclusive upper bound on source indices. Defaults to
|
|
42
|
+
`len(sequence)`. The partial window (when `drop_last=False`)
|
|
43
|
+
respects `stop` and never extends past it.
|
|
44
|
+
drop_last: If False, yield a final partial (possibly shorter
|
|
45
|
+
than `size`) window when items remain after the last full
|
|
46
|
+
window. Defaults to True.
|
|
47
|
+
|
|
48
|
+
Yields:
|
|
49
|
+
Sub-sequences of `sequence` obtained by slicing.
|
|
50
|
+
|
|
51
|
+
Raises:
|
|
52
|
+
ValueError: If `size`, `step`, or `skip` is non-positive, or
|
|
53
|
+
if the range is not `0 <= start <= stop <= len(sequence)`.
|
|
54
|
+
"""
|
|
55
|
+
if size <= 0 or step <= 0 or skip <= 0:
|
|
56
|
+
msg = (
|
|
57
|
+
f"size, step, skip must be positive "
|
|
58
|
+
f"(got size={size}, step={step}, skip={skip})"
|
|
59
|
+
)
|
|
60
|
+
raise ValueError(msg)
|
|
61
|
+
|
|
62
|
+
length = len(sequence)
|
|
63
|
+
stop = stop if stop is not None else length
|
|
64
|
+
if not 0 <= start <= stop <= length:
|
|
65
|
+
msg = f"invalid range [{start}, {stop}) for sequence of length {length}"
|
|
66
|
+
raise ValueError(msg)
|
|
67
|
+
|
|
68
|
+
head = start
|
|
69
|
+
tail = head + (size - 1) * skip + 1
|
|
70
|
+
|
|
71
|
+
while tail <= stop:
|
|
72
|
+
yield sequence[head:tail:skip]
|
|
73
|
+
head += step
|
|
74
|
+
tail += step
|
|
75
|
+
|
|
76
|
+
# Final partial window must respect `stop` (not `tail`, which has
|
|
77
|
+
# advanced past `stop` by the time we get here).
|
|
78
|
+
if not drop_last and head < stop:
|
|
79
|
+
yield sequence[head:stop:skip]
|