data-forager 0.1.5__tar.gz → 0.1.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. {data_forager-0.1.5 → data_forager-0.1.6}/PKG-INFO +1 -1
  2. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/index_stores/fs_based.py +14 -4
  3. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager.egg-info/PKG-INFO +1 -1
  4. {data_forager-0.1.5 → data_forager-0.1.6}/pyproject.toml +1 -1
  5. {data_forager-0.1.5 → data_forager-0.1.6}/LICENSE +0 -0
  6. {data_forager-0.1.5 → data_forager-0.1.6}/README.md +0 -0
  7. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/__init__.py +0 -0
  8. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/datasets/__init__.py +0 -0
  9. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/datasets/common.py +0 -0
  10. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/datasets/jsonl.py +0 -0
  11. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/datasets/tokens.py +0 -0
  12. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/index_stores/__init__.py +0 -0
  13. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/index_stores/common.py +0 -0
  14. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/indexers/__init__.py +0 -0
  15. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/indexers/jsonl_indexer.py +0 -0
  16. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/indexers/text_lines.py +0 -0
  17. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/indexers/tokenization_indexer.py +0 -0
  18. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/sample_index.py +0 -0
  19. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/unzip_files.py +0 -0
  20. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager/utils.py +0 -0
  21. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager.egg-info/SOURCES.txt +0 -0
  22. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager.egg-info/dependency_links.txt +0 -0
  23. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager.egg-info/requires.txt +0 -0
  24. {data_forager-0.1.5 → data_forager-0.1.6}/data_forager.egg-info/top_level.txt +0 -0
  25. {data_forager-0.1.5 → data_forager-0.1.6}/setup.cfg +0 -0
  26. {data_forager-0.1.5 → data_forager-0.1.6}/tests/test_jsonl_indexing.py +0 -0
  27. {data_forager-0.1.5 → data_forager-0.1.6}/tests/test_tokenizing_indexing_jsonl.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-forager
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: Enabling random access to large datasets on disk for PyTorch training and other use cases
5
5
  Author-email: Freddy Snijder <forager@visionscapers.com>
6
6
  License-Expression: MIT
@@ -16,13 +16,15 @@ class IndexStore(Base, IndexStoreInterface):
16
16
  """
17
17
 
18
18
  :param base_path: Base path where the index files are stored.
19
+ File paths in file_location.txt are stored relative to this path.
19
20
 
20
21
  :param name: Name of instance, if not provided, the classname will be used
21
22
  """
22
23
 
23
24
  super().__init__(pybase_logger_name=name)
24
25
 
25
- self._index_data_path = os.path.join(base_path, index_data_folder)
26
+ self._base_path = os.path.abspath(base_path)
27
+ self._index_data_path = os.path.join(self._base_path, index_data_folder)
26
28
  self._file_locations = []
27
29
 
28
30
  # File handles for buffered writing
@@ -45,14 +47,17 @@ class IndexStore(Base, IndexStoreInterface):
45
47
 
46
48
  def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
47
49
  """
48
- :param file_location:
50
+ :param file_location: Absolute or relative path to the sample file.
51
+ Will be stored as a path relative to base_path.
49
52
  :param byte_offset:
50
53
  :param num_bytes:
51
54
  :return:
52
55
  """
53
56
  if file_location not in self._file_locations:
54
57
  self._file_locations.append(file_location)
55
- self._file_location_handle.write(file_location + '\n')
58
+ # Store as relative path for portability
59
+ relative_path = os.path.relpath(file_location, self._base_path)
60
+ self._file_location_handle.write(relative_path + '\n')
56
61
 
57
62
  file_index = self._file_locations.index(file_location)
58
63
 
@@ -74,7 +79,12 @@ class IndexStore(Base, IndexStoreInterface):
74
79
 
75
80
  def load(self) -> SampleIndex:
76
81
  with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
77
- file_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
82
+ relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
83
+
84
+ # Resolve relative paths against base_path
85
+ file_locations = [
86
+ os.path.join(self._base_path, loc) for loc in relative_locations
87
+ ]
78
88
 
79
89
  with open(os.path.join(self._index_data_path, "sample_locations.bin"), "rb") as f:
80
90
  data = f.read()
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: data-forager
3
- Version: 0.1.5
3
+ Version: 0.1.6
4
4
  Summary: Enabling random access to large datasets on disk for PyTorch training and other use cases
5
5
  Author-email: Freddy Snijder <forager@visionscapers.com>
6
6
  License-Expression: MIT
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "data-forager"
7
- version = "0.1.5"
7
+ version = "0.1.6"
8
8
  description = "Enabling random access to large datasets on disk for PyTorch training and other use cases"
9
9
  readme = "README.md"
10
10
  license = "MIT"
File without changes
File without changes
File without changes