data-forager 0.1.6__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,91 @@
1
+ """
2
+ Dataset for reading tokenized samples with auxiliary data.
3
+
4
+ This module provides TokensWithAuxDataset which reads samples containing
5
+ both tokens and auxiliary data (e.g., loss masks) using the schema stored
6
+ in the sample index.
7
+ """
8
+
9
+ from typing import Dict, Optional
10
+
11
+ import numpy as np
12
+
13
+ from data_forager.sample_index import SampleIndex
14
+ from data_forager.datasets.common import Dataset
15
+
16
+
17
class TokensWithAuxDataset(Dataset):
    """
    Dataset that returns tokens with auxiliary data.

    Reads samples containing multiple arrays (tokens + auxiliary data, e.g.
    loss masks) and uses the schema from the sample index to split the
    concatenated sample bytes into named numpy arrays.

    Requires sample_schema in the SampleIndex. Use TokensDataset for indexes
    without auxiliary data.
    """

    @classmethod
    def create_from_index_on_filesystem(
        cls,
        base_path: str,
        name: Optional[str] = None,
    ) -> "TokensWithAuxDataset":
        """
        Create a TokensWithAuxDataset from an index stored on the filesystem.

        :param base_path: Base path where the index is stored.
        :param name: Optional name for logging.

        :return: TokensWithAuxDataset instance.

        :raises ValueError: If the loaded index has no sample_schema.
        """
        # Function-local import; presumably this avoids a circular import
        # between the datasets and index_stores packages -- TODO confirm.
        from data_forager.index_stores.fs_based import IndexStore

        index_store = IndexStore(base_path=base_path)
        sample_index = index_store.load()

        return cls(sample_index=sample_index, name=name)

    def __init__(
        self,
        sample_index: SampleIndex,
        name: Optional[str] = None,
        **kwargs,
    ):
        """
        Initialize the dataset.

        :param sample_index: SampleIndex with sample_schema describing the
            structure of samples.
        :param name: Optional name for logging.
        :param kwargs: Extra keyword arguments, forwarded unchanged to the
            base Dataset constructor.

        :raises ValueError: If sample_index has no sample_schema.
        """
        super().__init__(sample_index, name=name, **kwargs)

        if sample_index.sample_schema is None:
            raise ValueError(
                "SampleIndex has no sample_schema. "
                "Use TokensDataset for indexes without auxiliary data."
            )

        self._schema = sample_index.sample_schema

    def _process_sample(self, sample_bytes: bytes) -> Dict[str, np.ndarray]:
        """
        Parse concatenated bytes into named arrays.

        Each array in the schema spans ``sample_size`` elements of its own
        dtype, starting at the array's ``offset`` within ``sample_bytes``.

        :param sample_bytes: Raw bytes containing all arrays concatenated.

        :return: Dict mapping array names to numpy arrays.

        :raises ValueError: If sample_bytes is too short to hold one of the
            arrays described by the schema.
        """
        available = len(sample_bytes)
        result = {}
        for array_spec in self._schema.arrays:
            dtype = np.dtype(array_spec.dtype)
            start = array_spec.offset
            end = start + self._schema.sample_size * dtype.itemsize
            if end > available:
                # Slicing past the end would silently clip the data and
                # yield an array with fewer than sample_size elements.
                raise ValueError(
                    f"Sample holds {available} bytes, but array "
                    f"'{array_spec.name}' requires bytes [{start}, {end})."
                )
            result[array_spec.name] = np.frombuffer(
                sample_bytes[start:end],
                dtype=dtype,
            )
        return result
@@ -1,3 +1,11 @@
1
+ """
2
+ Filesystem-based index store for persisting sample indexes.
3
+
4
+ This module provides IndexStore which saves and loads sample indexes
5
+ to/from the filesystem.
6
+ """
7
+
8
+ import json
1
9
  import os.path
2
10
  import shutil
3
11
  from typing import Optional, TextIO, BinaryIO
@@ -8,19 +16,28 @@ from basics.base import Base
8
16
 
9
17
  from data_forager.index_stores.common import IndexStoreInterface
10
18
  from data_forager.sample_index import SampleIndex
19
+ from data_forager.sample_generators.schema import SampleSchema, ArraySpec
11
20
 
12
21
 
13
22
  class IndexStore(Base, IndexStoreInterface):
23
+ """
24
+ Filesystem-based index store for saving and loading sample indexes.
25
+
26
+ Stores index data in a directory with the following structure:
27
+ - file_location.txt: List of file paths (relative to base_path)
28
+ - sample_locations.bin: Binary array of (file_index, byte_offset, num_bytes)
29
+ - sample_schema.json: Optional schema for structured samples
30
+ """
14
31
 
15
32
  def __init__(self, base_path: str, index_data_folder: str = "index", name: Optional[str] = None):
16
33
  """
34
+ Initialize the index store.
17
35
 
18
36
  :param base_path: Base path where the index files are stored.
19
37
  File paths in file_location.txt are stored relative to this path.
20
-
21
- :param name: Name of instance, if not provided, the classname will be used
38
+ :param index_data_folder: Name of the folder within base_path for index files.
39
+ :param name: Name of instance, if not provided, the classname will be used.
22
40
  """
23
-
24
41
  super().__init__(pybase_logger_name=name)
25
42
 
26
43
  self._base_path = os.path.abspath(base_path)
@@ -31,6 +48,9 @@ class IndexStore(Base, IndexStoreInterface):
31
48
  self._file_location_handle: Optional[TextIO] = None
32
49
  self._sample_locations_handle: Optional[BinaryIO] = None
33
50
 
51
+ # Optional schema for structured samples
52
+ self._sample_schema: Optional[SampleSchema] = None
53
+
34
54
  def init_store(self):
35
55
  if os.path.exists(self._index_data_path):
36
56
  raise ValueError(f"Provided index path already exists: {self._index_data_path}")
@@ -64,6 +84,15 @@ class IndexStore(Base, IndexStoreInterface):
64
84
  sample_location_bytes = np.array([file_index, byte_offset, num_bytes], dtype=np.uint64).tobytes()
65
85
  self._sample_locations_handle.write(sample_location_bytes)
66
86
 
87
def set_sample_schema(self, schema: SampleSchema) -> None:
    """
    Remember the sample schema on this store and persist it.

    :param schema: SampleSchema describing the structure of samples.
    """
    # Assign before saving: _save_schema() reads the schema from the instance.
    self._sample_schema = schema
    self._save_schema()
95
+
67
96
  def close(self):
68
97
  """Close file handles and flush buffered data."""
69
98
  if self._file_location_handle is not None:
@@ -78,6 +107,11 @@ class IndexStore(Base, IndexStoreInterface):
78
107
  self.close()
79
108
 
80
109
  def load(self) -> SampleIndex:
110
+ """
111
+ Load the sample index from disk.
112
+
113
+ :return: SampleIndex with file locations, sample locations, and optional schema.
114
+ """
81
115
  with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
82
116
  relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
83
117
 
@@ -91,7 +125,46 @@ class IndexStore(Base, IndexStoreInterface):
91
125
  sample_locations = np.frombuffer(data, dtype=np.uint64)
92
126
  sample_locations = sample_locations.reshape((-1, 3))
93
127
 
94
- return SampleIndex(file_locations, sample_locations)
128
+ # Load schema if it exists
129
+ sample_schema = self._load_schema()
130
+
131
+ return SampleIndex(file_locations, sample_locations, sample_schema)
132
+
133
def _save_schema(self) -> None:
    """Write the sample schema, if one is set, to sample_schema.json."""
    schema = self._sample_schema
    if schema is None:
        # No schema set: nothing to persist.
        return

    target_path = os.path.join(self._index_data_path, "sample_schema.json")

    sample_size = schema.sample_size
    array_entries = []
    for spec in schema.arrays:
        array_entries.append(
            {"name": spec.name, "dtype": spec.dtype, "offset": spec.offset}
        )

    # Key order is kept stable so the written file layout does not change.
    payload = {
        "sample_size": sample_size,
        "arrays": array_entries,
        "total_bytes_per_sample": schema.total_bytes_per_sample,
    }
    with open(target_path, "w") as f:
        json.dump(payload, f, indent=2)
149
+
150
def _load_schema(self) -> Optional[SampleSchema]:
    """
    Read sample_schema.json from the index data folder, if present.

    :return: The SampleSchema rebuilt from the JSON file, or None when the
        file does not exist.
    """
    source_path = os.path.join(self._index_data_path, "sample_schema.json")
    if os.path.exists(source_path):
        with open(source_path, "r") as f:
            stored = json.load(f)

        specs = []
        for entry in stored["arrays"]:
            specs.append(
                ArraySpec(name=entry["name"], dtype=entry["dtype"], offset=entry["offset"])
            )

        return SampleSchema(
            sample_size=stored["sample_size"],
            arrays=specs,
            total_bytes_per_sample=stored["total_bytes_per_sample"],
        )

    # No schema file: the index was stored without a schema.
    return None
95
168
 
96
169
  def exists(self) -> bool:
97
170
  """Check if the index already exists."""
@@ -1,5 +1,12 @@
1
- from typing import Optional, List, Tuple, Protocol
2
- from dataclasses import dataclass
1
+ """
2
+ File text lines indexer for building sample indexes.
3
+
4
+ This module provides FileTextLinesIndexer which scans text files line by line,
5
+ uses a sample generator to transform lines into samples, and builds an index
6
+ for random access.
7
+ """
8
+
9
+ from typing import Optional, List
3
10
 
4
11
  import os
5
12
  import sys
@@ -8,77 +15,19 @@ from basics.base import Base
8
15
  from tqdm import tqdm
9
16
 
10
17
  from data_forager.index_stores.common import IndexStoreInterface
11
-
12
-
13
- @dataclass
14
- class SampleData:
15
-
16
- sample_bytes: bytes
17
- file_path: str
18
-
19
- class SampleGeneratorInterface(Protocol):
20
-
21
- def prepare(self, text_file_path: str):
22
- """
23
- Prepare sample generation from a new input text file
24
-
25
- :param text_file_path: path to text file
26
-
27
- :return:
28
- """
29
- ...
30
-
31
- def create_samples(self, text_line: bytes) -> List[SampleData]:
32
- """
33
- Creates one or more samples from the given text_line and stores it in one or multiple different files.
34
- The path to the file(s) in which the samples are stores are also returned.
35
-
36
- IMPORTANT: it is assumed that each sample returned is stored in a file sequentially in the same order.
37
- This must also hold over multiple function calls. This is important because the byte offset
38
- of a sample is derived from the order the samples are returned.
39
-
40
- :param text_line: Text line in bytes from text_file_path, provided in the prepare phase.
41
- The function needs to choose a text encoding itself
42
-
43
- :return: List of DataSample objects. For each created sample the following is given:
44
- * Its representation in bytes, as used to store the sample
45
- * The file path to where the sample is stored
46
-
47
- """
48
- ...
49
-
50
- def finish(self, is_last_file: bool):
51
- """
52
- Finish generation of samples from text lines of input file at the `text_file_path` given in the prepare() phase.
53
-
54
- is_last_file: indicates if the input text file was the last file to be processed
55
-
56
- :return:
57
- """
58
- ...
59
-
60
-
61
- class NOOPSampleGenerator(SampleGeneratorInterface):
62
-
63
- def __init__(self):
64
- self._current_text_file = None
65
-
66
- def prepare(self, text_file_path: str):
67
- self._current_text_file = text_file_path
68
-
69
- def create_samples(self, text_line: bytes) -> List[SampleData]:
70
- return [SampleData(text_line, self._current_text_file)]
71
-
72
- def finish(self, is_last_file: bool):
73
- self._current_text_file = None
74
-
75
-
76
- def noop_sample_processing(text_line: bytes, text_file_path: str) -> List[SampleData]:
77
-
78
- return [SampleData(text_line, text_file_path)]
18
+ from data_forager.sample_generators.common import (
19
+ SampleGeneratorInterface,
20
+ NOOPSampleGenerator,
21
+ )
79
22
 
80
23
 
81
24
  class FileTextLinesIndexer(Base):
25
+ """
26
+ Indexes text files by scanning lines and building a sample index.
27
+
28
+ Uses a sample generator to transform text lines into samples, then records
29
+ byte offsets in an index store for O(1) random access during training.
30
+ """
82
31
 
83
32
  def __init__(
84
33
  self,
@@ -104,13 +53,19 @@ class FileTextLinesIndexer(Base):
104
53
 
105
54
  def __call__(self):
106
55
  """
107
- IMPORTANT: input files are always read in binary mode; applying a text encoding is up to the user.
108
- E.g. through process_sample_func and/or when processing the data Dataset::_process_sample()
56
+ Run the indexing process.
109
57
 
110
- :return:
58
+ IMPORTANT: Input files are always read in binary mode; applying a text
59
+ encoding is up to the user, e.g., through process_sample_func and/or
60
+ when processing the data in Dataset._process_sample().
111
61
  """
112
62
  self._index_store.init_store()
113
63
 
64
+ # Set schema if the sample generator provides one
65
+ sample_schema = self._sample_generator.get_sample_schema()
66
+ if sample_schema is not None:
67
+ self._index_store.set_sample_schema(sample_schema)
68
+
114
69
  byte_offset_map = {}
115
70
 
116
71
  for input_file_path in self._input_file_paths: