data-forager 0.1.5__py3-none-any.whl → 0.1.6__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
@@ -16,13 +16,15 @@ class IndexStore(Base, IndexStoreInterface):
16
16
  """
17
17
 
18
18
  :param base_path: Base path where the index files are stored.
19
+ File paths in file_location.txt are stored relative to this path.
19
20
 
20
21
  :param name: Name of instance, if not provided, the classname will be used
21
22
  """
22
23
 
23
24
  super().__init__(pybase_logger_name=name)
24
25
 
25
- self._index_data_path = os.path.join(base_path, index_data_folder)
26
+ self._base_path = os.path.abspath(base_path)
27
+ self._index_data_path = os.path.join(self._base_path, index_data_folder)
26
28
  self._file_locations = []
27
29
 
28
30
  # File handles for buffered writing
@@ -45,14 +47,17 @@ class IndexStore(Base, IndexStoreInterface):
45
47
 
46
48
  def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
47
49
  """
48
- :param file_location:
50
+ :param file_location: Absolute or relative path to the sample file.
51
+ Will be stored as a path relative to base_path.
49
52
  :param byte_offset:
50
53
  :param num_bytes:
51
54
  :return:
52
55
  """
53
56
  if file_location not in self._file_locations:
54
57
  self._file_locations.append(file_location)
55
- self._file_location_handle.write(file_location + '\n')
58
+ # Store as relative path for portability
59
+ relative_path = os.path.relpath(file_location, self._base_path)
60
+ self._file_location_handle.write(relative_path + '\n')
56
61
 
57
62
  file_index = self._file_locations.index(file_location)
58
63
 
@@ -74,7 +79,12 @@ class IndexStore(Base, IndexStoreInterface):
74
79
 
75
80
  def load(self) -> SampleIndex:
76
81
  with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
77
- file_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
82
+ relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
83
+
84
+ # Resolve relative paths against base_path
85
+ file_locations = [
86
+ os.path.join(self._base_path, loc) for loc in relative_locations
87
+ ]
78
88
 
79
89
  with open(os.path.join(self._index_data_path, "sample_locations.bin"), "rb") as f:
80
90
  data = f.read()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-forager
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: Enabling random access to large datasets on disk for PyTorch training and other use cases
5
5
  Author-email: Freddy Snijder <forager@visionscapers.com>
6
6
  License-Expression: MIT
@@ -8,13 +8,13 @@ data_forager/datasets/jsonl.py,sha256=enOjWRT-AJTF3tWtNlonCqdDpZfVsK8If6yEtlA8tn
8
8
  data_forager/datasets/tokens.py,sha256=OP5MNb9uBDSX_Of6lNVLs5CAj46RwkP4gGDk-94lD40,597
9
9
  data_forager/index_stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
10
  data_forager/index_stores/common.py,sha256=HYPZfCtmbLDxjPgCoUseahicTEI1el7V1s5tSyZxkfs,780
11
- data_forager/index_stores/fs_based.py,sha256=1MB_iZPZS187rlLRUNPIJcQK-lU9EqEAg3sARnT2vSk,3564
11
+ data_forager/index_stores/fs_based.py,sha256=usTE_eUdcfvBTgIX6fMrdXWqrHuUgNPmtRbGweKh2g8,4106
12
12
  data_forager/indexers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  data_forager/indexers/jsonl_indexer.py,sha256=xvogFjEcKPNr-GBkKEr8WnWPVvJbkWQzUBrGwVELlA4,1395
14
14
  data_forager/indexers/text_lines.py,sha256=XMm5oc0btP7I16z87g1fmq9AqJyVhDOvR2cDu_zFZio,5093
15
15
  data_forager/indexers/tokenization_indexer.py,sha256=t-7Q3PLAJ0DYZT6LWdHeahk9Hz9OQsvWfoPvhHIneMk,13927
16
- data_forager-0.1.5.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
17
- data_forager-0.1.5.dist-info/METADATA,sha256=V2844BjLgxIwaLuRb-78AUDFXpW9YfMb2ieyUDrKx4Q,9090
18
- data_forager-0.1.5.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
- data_forager-0.1.5.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
20
- data_forager-0.1.5.dist-info/RECORD,,
16
+ data_forager-0.1.6.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
17
+ data_forager-0.1.6.dist-info/METADATA,sha256=djaXS9HRCi_ei2Sm-f_yqWgGwe2JX6UZphbTfsHN2vw,9090
18
+ data_forager-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
19
+ data_forager-0.1.6.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
20
+ data_forager-0.1.6.dist-info/RECORD,,