data-forager 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_forager-0.1.3 → data_forager-0.1.5}/PKG-INFO +1 -1
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/datasets/common.py +54 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager.egg-info/PKG-INFO +1 -1
- {data_forager-0.1.3 → data_forager-0.1.5}/pyproject.toml +1 -1
- {data_forager-0.1.3 → data_forager-0.1.5}/LICENSE +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/README.md +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/__init__.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/datasets/__init__.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/datasets/jsonl.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/datasets/tokens.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/index_stores/__init__.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/index_stores/common.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/index_stores/fs_based.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/indexers/__init__.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/indexers/jsonl_indexer.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/indexers/text_lines.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/indexers/tokenization_indexer.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/sample_index.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/unzip_files.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager/utils.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager.egg-info/SOURCES.txt +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager.egg-info/dependency_links.txt +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager.egg-info/requires.txt +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/data_forager.egg-info/top_level.txt +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/setup.cfg +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/tests/test_jsonl_indexing.py +0 -0
- {data_forager-0.1.3 → data_forager-0.1.5}/tests/test_tokenizing_indexing_jsonl.py +0 -0
|
@@ -1,6 +1,8 @@
|
|
|
1
1
|
import abc
|
|
2
2
|
from typing import Optional, Union, Dict, List, Protocol, Any
|
|
3
3
|
|
|
4
|
+
import numpy as np
|
|
5
|
+
|
|
4
6
|
from basics.base import Base
|
|
5
7
|
|
|
6
8
|
from data_forager.sample_index import SampleIndex, SampleLocation
|
|
@@ -128,3 +130,55 @@ class Dataset(Base, metaclass=abc.ABCMeta):
|
|
|
128
130
|
|
|
129
131
|
def __del__(self):
    """Call ``_close_files()`` when the instance is being finalized."""
    self._close_files()
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
class SubsampledDataset:
    """
    Wrapper that provides a subsampled view of a dataset.

    Randomly selects a subset of indices from the wrapped dataset (without
    replacement), allowing for faster iteration through epochs when testing
    or debugging.

    :param dataset: The dataset to wrap (must support __len__ and __getitem__).
    :param subsample_factor: Fraction of the dataset to use (must be between 0
        and 1; 0 is excluded, 1 is allowed). The view contains
        ``int(subsample_factor * len(dataset))`` samples, but never fewer than
        one sample when the wrapped dataset is non-empty.
    :param seed: Random seed for reproducibility. If None, sampling is random.
    :param random_order: If False (default), indices are sorted for better disk
        read locality. If True, indices are kept in random order, which can be
        used as a randomizer.
    :raises ValueError: If ``subsample_factor`` is not in the interval (0, 1].
    """

    def __init__(
        self,
        dataset,
        subsample_factor: float,
        seed: Optional[int] = None,
        random_order: bool = False,
    ):
        if not 0 < subsample_factor <= 1:
            raise ValueError(
                f"subsample_factor must be between 0 (exclusive) and 1 (inclusive), "
                f"got {subsample_factor}"
            )

        self._dataset = dataset
        self._subsample_factor = subsample_factor

        n_full = len(dataset)
        n_sub = int(subsample_factor * n_full)
        # A strictly positive factor should never silently produce an empty
        # view: truncation towards zero would do so for small datasets
        # (e.g. factor 0.1 on 3 samples), so keep at least one sample.
        if n_full > 0:
            n_sub = max(n_sub, 1)

        # Sample indices without replacement
        rng = np.random.default_rng(seed)
        indices = rng.choice(n_full, size=n_sub, replace=False)

        # Sort for cache locality unless random order is requested
        if not random_order:
            indices.sort()

        # Convert to Python list of ints for underlying dataset compatibility
        self._indices: List[int] = indices.tolist()

    def __len__(self) -> int:
        """Return the number of samples in the subsampled view."""
        return len(self._indices)

    def __getitem__(self, idx: int):
        """Return the wrapped dataset's item at the ``idx``-th selected index."""
        return self._dataset[self._indices[idx]]
|
|
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "data-forager"
|
|
7
|
-
version = "0.1.3"
|
|
7
|
+
version = "0.1.5"
|
|
8
8
|
description = "Enabling random access to large datasets on disk for PyTorch training and other use cases"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|