data-forager 0.1.4__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the contents of those package versions as they appear in their respective public registries.
- data_forager/datasets/common.py +5 -2
- data_forager/index_stores/fs_based.py +14 -4
- {data_forager-0.1.4.dist-info → data_forager-0.1.6.dist-info}/METADATA +1 -1
- {data_forager-0.1.4.dist-info → data_forager-0.1.6.dist-info}/RECORD +7 -7
- {data_forager-0.1.4.dist-info → data_forager-0.1.6.dist-info}/WHEEL +0 -0
- {data_forager-0.1.4.dist-info → data_forager-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {data_forager-0.1.4.dist-info → data_forager-0.1.6.dist-info}/top_level.txt +0 -0
data_forager/datasets/common.py
CHANGED
|
@@ -168,11 +168,14 @@ class SubsampledDataset:
|
|
|
168
168
|
|
|
169
169
|
# Sample indices without replacement
|
|
170
170
|
rng = np.random.default_rng(seed)
|
|
171
|
-
|
|
171
|
+
indices = rng.choice(n_full, size=n_sub, replace=False)
|
|
172
172
|
|
|
173
173
|
# Sort for cache locality unless random order is requested
|
|
174
174
|
if not random_order:
|
|
175
|
-
|
|
175
|
+
indices.sort()
|
|
176
|
+
|
|
177
|
+
# Convert to Python list of ints for underlying dataset compatibility
|
|
178
|
+
self._indices: list[int] = indices.tolist()
|
|
176
179
|
|
|
177
180
|
def __len__(self) -> int:
|
|
178
181
|
return len(self._indices)
|
|
@@ -16,13 +16,15 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
18
|
:param base_path: Base path where the index files are stored.
|
|
19
|
+
File paths in file_location.txt are stored relative to this path.
|
|
19
20
|
|
|
20
21
|
:param name: Name of instance, if not provided, the classname will be used
|
|
21
22
|
"""
|
|
22
23
|
|
|
23
24
|
super().__init__(pybase_logger_name=name)
|
|
24
25
|
|
|
25
|
-
self.
|
|
26
|
+
self._base_path = os.path.abspath(base_path)
|
|
27
|
+
self._index_data_path = os.path.join(self._base_path, index_data_folder)
|
|
26
28
|
self._file_locations = []
|
|
27
29
|
|
|
28
30
|
# File handles for buffered writing
|
|
@@ -45,14 +47,17 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
45
47
|
|
|
46
48
|
def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
|
|
47
49
|
"""
|
|
48
|
-
:param file_location:
|
|
50
|
+
:param file_location: Absolute or relative path to the sample file.
|
|
51
|
+
Will be stored as a path relative to base_path.
|
|
49
52
|
:param byte_offset:
|
|
50
53
|
:param num_bytes:
|
|
51
54
|
:return:
|
|
52
55
|
"""
|
|
53
56
|
if file_location not in self._file_locations:
|
|
54
57
|
self._file_locations.append(file_location)
|
|
55
|
-
|
|
58
|
+
# Store as relative path for portability
|
|
59
|
+
relative_path = os.path.relpath(file_location, self._base_path)
|
|
60
|
+
self._file_location_handle.write(relative_path + '\n')
|
|
56
61
|
|
|
57
62
|
file_index = self._file_locations.index(file_location)
|
|
58
63
|
|
|
@@ -74,7 +79,12 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
74
79
|
|
|
75
80
|
def load(self) -> SampleIndex:
|
|
76
81
|
with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
|
|
77
|
-
|
|
82
|
+
relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
|
|
83
|
+
|
|
84
|
+
# Resolve relative paths against base_path
|
|
85
|
+
file_locations = [
|
|
86
|
+
os.path.join(self._base_path, loc) for loc in relative_locations
|
|
87
|
+
]
|
|
78
88
|
|
|
79
89
|
with open(os.path.join(self._index_data_path, "sample_locations.bin"), "rb") as f:
|
|
80
90
|
data = f.read()
|
|
@@ -3,18 +3,18 @@ data_forager/sample_index.py,sha256=72J4_AZtmgyMd6AXMxkfz5BnZ3tf6iZBk962DeFGVcI,
|
|
|
3
3
|
data_forager/unzip_files.py,sha256=f3rUUN31NdScQiau_uiw1fNeIHobvGfExSG0KqW9kok,2695
|
|
4
4
|
data_forager/utils.py,sha256=Vbp-wA4Tf0Y4rHRIFaf_uU7MA6xzfFI2jjzmnlNGwRk,454
|
|
5
5
|
data_forager/datasets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
6
|
-
data_forager/datasets/common.py,sha256=
|
|
6
|
+
data_forager/datasets/common.py,sha256=gA9Q_2nXp1cvYm7zK99puAVg6rWARZ0eqSO7YrP8rr4,5865
|
|
7
7
|
data_forager/datasets/jsonl.py,sha256=enOjWRT-AJTF3tWtNlonCqdDpZfVsK8If6yEtlA8tns,630
|
|
8
8
|
data_forager/datasets/tokens.py,sha256=OP5MNb9uBDSX_Of6lNVLs5CAj46RwkP4gGDk-94lD40,597
|
|
9
9
|
data_forager/index_stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
data_forager/index_stores/common.py,sha256=HYPZfCtmbLDxjPgCoUseahicTEI1el7V1s5tSyZxkfs,780
|
|
11
|
-
data_forager/index_stores/fs_based.py,sha256=
|
|
11
|
+
data_forager/index_stores/fs_based.py,sha256=usTE_eUdcfvBTgIX6fMrdXWqrHuUgNPmtRbGweKh2g8,4106
|
|
12
12
|
data_forager/indexers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
data_forager/indexers/jsonl_indexer.py,sha256=xvogFjEcKPNr-GBkKEr8WnWPVvJbkWQzUBrGwVELlA4,1395
|
|
14
14
|
data_forager/indexers/text_lines.py,sha256=XMm5oc0btP7I16z87g1fmq9AqJyVhDOvR2cDu_zFZio,5093
|
|
15
15
|
data_forager/indexers/tokenization_indexer.py,sha256=t-7Q3PLAJ0DYZT6LWdHeahk9Hz9OQsvWfoPvhHIneMk,13927
|
|
16
|
-
data_forager-0.1.
|
|
17
|
-
data_forager-0.1.
|
|
18
|
-
data_forager-0.1.
|
|
19
|
-
data_forager-0.1.
|
|
20
|
-
data_forager-0.1.
|
|
16
|
+
data_forager-0.1.6.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
|
|
17
|
+
data_forager-0.1.6.dist-info/METADATA,sha256=djaXS9HRCi_ei2Sm-f_yqWgGwe2JX6UZphbTfsHN2vw,9090
|
|
18
|
+
data_forager-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
data_forager-0.1.6.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
|
|
20
|
+
data_forager-0.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|