data-forager 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl
This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- data_forager/index_stores/fs_based.py +14 -4
- {data_forager-0.1.5.dist-info → data_forager-0.1.6.dist-info}/METADATA +1 -1
- {data_forager-0.1.5.dist-info → data_forager-0.1.6.dist-info}/RECORD +6 -6
- {data_forager-0.1.5.dist-info → data_forager-0.1.6.dist-info}/WHEEL +0 -0
- {data_forager-0.1.5.dist-info → data_forager-0.1.6.dist-info}/licenses/LICENSE +0 -0
- {data_forager-0.1.5.dist-info → data_forager-0.1.6.dist-info}/top_level.txt +0 -0
|
@@ -16,13 +16,15 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
16
16
|
"""
|
|
17
17
|
|
|
18
18
|
:param base_path: Base path where the index files are stored.
|
|
19
|
+
File paths in file_location.txt are stored relative to this path.
|
|
19
20
|
|
|
20
21
|
:param name: Name of instance, if not provided, the classname will be used
|
|
21
22
|
"""
|
|
22
23
|
|
|
23
24
|
super().__init__(pybase_logger_name=name)
|
|
24
25
|
|
|
25
|
-
self.
|
|
26
|
+
self._base_path = os.path.abspath(base_path)
|
|
27
|
+
self._index_data_path = os.path.join(self._base_path, index_data_folder)
|
|
26
28
|
self._file_locations = []
|
|
27
29
|
|
|
28
30
|
# File handles for buffered writing
|
|
@@ -45,14 +47,17 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
45
47
|
|
|
46
48
|
def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
|
|
47
49
|
"""
|
|
48
|
-
:param file_location:
|
|
50
|
+
:param file_location: Absolute or relative path to the sample file.
|
|
51
|
+
Will be stored as a path relative to base_path.
|
|
49
52
|
:param byte_offset:
|
|
50
53
|
:param num_bytes:
|
|
51
54
|
:return:
|
|
52
55
|
"""
|
|
53
56
|
if file_location not in self._file_locations:
|
|
54
57
|
self._file_locations.append(file_location)
|
|
55
|
-
|
|
58
|
+
# Store as relative path for portability
|
|
59
|
+
relative_path = os.path.relpath(file_location, self._base_path)
|
|
60
|
+
self._file_location_handle.write(relative_path + '\n')
|
|
56
61
|
|
|
57
62
|
file_index = self._file_locations.index(file_location)
|
|
58
63
|
|
|
@@ -74,7 +79,12 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
74
79
|
|
|
75
80
|
def load(self) -> SampleIndex:
|
|
76
81
|
with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
|
|
77
|
-
|
|
82
|
+
relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
|
|
83
|
+
|
|
84
|
+
# Resolve relative paths against base_path
|
|
85
|
+
file_locations = [
|
|
86
|
+
os.path.join(self._base_path, loc) for loc in relative_locations
|
|
87
|
+
]
|
|
78
88
|
|
|
79
89
|
with open(os.path.join(self._index_data_path, "sample_locations.bin"), "rb") as f:
|
|
80
90
|
data = f.read()
|
|
@@ -8,13 +8,13 @@ data_forager/datasets/jsonl.py,sha256=enOjWRT-AJTF3tWtNlonCqdDpZfVsK8If6yEtlA8tn
|
|
|
8
8
|
data_forager/datasets/tokens.py,sha256=OP5MNb9uBDSX_Of6lNVLs5CAj46RwkP4gGDk-94lD40,597
|
|
9
9
|
data_forager/index_stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
10
10
|
data_forager/index_stores/common.py,sha256=HYPZfCtmbLDxjPgCoUseahicTEI1el7V1s5tSyZxkfs,780
|
|
11
|
-
data_forager/index_stores/fs_based.py,sha256=
|
|
11
|
+
data_forager/index_stores/fs_based.py,sha256=usTE_eUdcfvBTgIX6fMrdXWqrHuUgNPmtRbGweKh2g8,4106
|
|
12
12
|
data_forager/indexers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
13
13
|
data_forager/indexers/jsonl_indexer.py,sha256=xvogFjEcKPNr-GBkKEr8WnWPVvJbkWQzUBrGwVELlA4,1395
|
|
14
14
|
data_forager/indexers/text_lines.py,sha256=XMm5oc0btP7I16z87g1fmq9AqJyVhDOvR2cDu_zFZio,5093
|
|
15
15
|
data_forager/indexers/tokenization_indexer.py,sha256=t-7Q3PLAJ0DYZT6LWdHeahk9Hz9OQsvWfoPvhHIneMk,13927
|
|
16
|
-
data_forager-0.1.
|
|
17
|
-
data_forager-0.1.
|
|
18
|
-
data_forager-0.1.
|
|
19
|
-
data_forager-0.1.
|
|
20
|
-
data_forager-0.1.
|
|
16
|
+
data_forager-0.1.6.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
|
|
17
|
+
data_forager-0.1.6.dist-info/METADATA,sha256=djaXS9HRCi_ei2Sm-f_yqWgGwe2JX6UZphbTfsHN2vw,9090
|
|
18
|
+
data_forager-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
19
|
+
data_forager-0.1.6.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
|
|
20
|
+
data_forager-0.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|