data_forager-0.1.4-py3-none-any.whl → data_forager-0.1.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_forager/datasets/common.py +5 -2
- {data_forager-0.1.4.dist-info → data_forager-0.1.5.dist-info}/METADATA +1 -1
- {data_forager-0.1.4.dist-info → data_forager-0.1.5.dist-info}/RECORD +6 -6
- {data_forager-0.1.4.dist-info → data_forager-0.1.5.dist-info}/WHEEL +0 -0
- {data_forager-0.1.4.dist-info → data_forager-0.1.5.dist-info}/licenses/LICENSE +0 -0
- {data_forager-0.1.4.dist-info → data_forager-0.1.5.dist-info}/top_level.txt +0 -0
data_forager/datasets/common.py
CHANGED
|
@@ -168,11 +168,14 @@ class SubsampledDataset:
|
|
|
168
168
|
|
|
169
169
|
# Sample indices without replacement
|
|
170
170
|
rng = np.random.default_rng(seed)
|
|
171
|
-
|
|
171
|
+
indices = rng.choice(n_full, size=n_sub, replace=False)
|
|
172
172
|
|
|
173
173
|
# Sort for cache locality unless random order is requested
|
|
174
174
|
if not random_order:
|
|
175
|
-
|
|
175
|
+
indices.sort()
|
|
176
|
+
|
|
177
|
+
# Convert to Python list of ints for underlying dataset compatibility
|
|
178
|
+
self._indices: list[int] = indices.tolist()
|
|
176
179
|
|
|
177
180
|
def __len__(self) -> int:
|
|
178
181
|
return len(self._indices)
|
|
@@ -3,7 +3,7 @@ data_forager/sample_index.py,sha256=72J4_AZtmgyMd6AXMxkfz5BnZ3tf6iZBk962DeFGVcI,
|
|
|
3
3
|
data_forager/unzip_files.py,sha256=f3rUUN31NdScQiau_uiw1fNeIHobvGfExSG0KqW9kok,2695
|
|
4
4
|
data_forager/utils.py,sha256=Vbp-wA4Tf0Y4rHRIFaf_uU7MA6xzfFI2jjzmnlNGwRk,454
|
|
5
5
|
data_forager/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
data_forager/datasets/common.py,sha256=
|
|
6
|
+
data_forager/datasets/common.py,sha256=gA9Q_2nXp1cvYm7zK99puAVg6rWARZ0eqSO7YrP8rr4,5865
|
|
7
7
|
data_forager/datasets/jsonl.py,sha256=enOjWRT-AJTF3tWtNlonCqdDpZfVsK8If6yEtlA8tns,630
|
|
8
8
|
data_forager/datasets/tokens.py,sha256=OP5MNb9uBDSX_Of6lNVLs5CAj46RwkP4gGDk-94lD40,597
|
|
9
9
|
data_forager/index_stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -13,8 +13,8 @@ data_forager/indexers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
|
|
|
13
13
|
data_forager/indexers/jsonl_indexer.py,sha256=xvogFjEcKPNr-GBkKEr8WnWPVvJbkWQzUBrGwVELlA4,1395
|
|
14
14
|
data_forager/indexers/text_lines.py,sha256=XMm5oc0btP7I16z87g1fmq9AqJyVhDOvR2cDu_zFZio,5093
|
|
15
15
|
data_forager/indexers/tokenization_indexer.py,sha256=t-7Q3PLAJ0DYZT6LWdHeahk9Hz9OQsvWfoPvhHIneMk,13927
|
|
16
|
-
data_forager-0.1.
|
|
17
|
-
data_forager-0.1.
|
|
18
|
-
data_forager-0.1.
|
|
19
|
-
data_forager-0.1.
|
|
20
|
-
data_forager-0.1.
|
|
16
|
+
data_forager-0.1.5.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
|
|
17
|
+
data_forager-0.1.5.dist-info/METADATA,sha256=V2844BjLgxIwaLuRb-78AUDFXpW9YfMb2ieyUDrKx4Q,9090
|
|
18
|
+
data_forager-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
data_forager-0.1.5.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
|
|
20
|
+
data_forager-0.1.5.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|