data-forager 0.1.6__tar.gz → 0.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {data_forager-0.1.6 → data_forager-0.2.0}/PKG-INFO +1 -1
  2. data_forager-0.2.0/data_forager/datasets/tokens_with_aux.py +91 -0
  3. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/fs_based.py +77 -4
  4. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/text_lines.py +28 -73
  5. data_forager-0.2.0/data_forager/indexers/tokenization_indexer.py +310 -0
  6. data_forager-0.2.0/data_forager/sample_generators/__init__.py +30 -0
  7. data_forager-0.2.0/data_forager/sample_generators/aux/__init__.py +18 -0
  8. data_forager-0.2.0/data_forager/sample_generators/aux/common.py +77 -0
  9. data_forager-0.2.0/data_forager/sample_generators/aux/loss_mask.py +78 -0
  10. data_forager-0.2.0/data_forager/sample_generators/common.py +117 -0
  11. data_forager-0.2.0/data_forager/sample_generators/schema.py +54 -0
  12. data_forager-0.2.0/data_forager/sample_generators/tokenization.py +210 -0
  13. data_forager-0.2.0/data_forager/sample_generators/tokenization_with_aux.py +250 -0
  14. data_forager-0.2.0/data_forager/sample_index.py +74 -0
  15. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/PKG-INFO +1 -1
  16. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/SOURCES.txt +10 -0
  17. {data_forager-0.1.6 → data_forager-0.2.0}/pyproject.toml +1 -1
  18. data_forager-0.2.0/tests/test_tokenization_with_aux.py +670 -0
  19. data_forager-0.1.6/data_forager/indexers/tokenization_indexer.py +0 -343
  20. data_forager-0.1.6/data_forager/sample_index.py +0 -42
  21. {data_forager-0.1.6 → data_forager-0.2.0}/LICENSE +0 -0
  22. {data_forager-0.1.6 → data_forager-0.2.0}/README.md +0 -0
  23. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/__init__.py +0 -0
  24. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/__init__.py +0 -0
  25. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/common.py +0 -0
  26. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/jsonl.py +0 -0
  27. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/tokens.py +0 -0
  28. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/__init__.py +0 -0
  29. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/common.py +0 -0
  30. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/__init__.py +0 -0
  31. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/jsonl_indexer.py +0 -0
  32. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/unzip_files.py +0 -0
  33. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/utils.py +0 -0
  34. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/dependency_links.txt +0 -0
  35. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/requires.txt +0 -0
  36. {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/top_level.txt +0 -0
  37. {data_forager-0.1.6 → data_forager-0.2.0}/setup.cfg +0 -0
  38. {data_forager-0.1.6 → data_forager-0.2.0}/tests/test_jsonl_indexing.py +0 -0
  39. {data_forager-0.1.6 → data_forager-0.2.0}/tests/test_tokenizing_indexing_jsonl.py +0 -0
{data_forager-0.1.6 → data_forager-0.2.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: data-forager
- Version: 0.1.6
+ Version: 0.2.0
  Summary: Enabling random access to large datasets on disk for PyTorch training and other use cases
  Author-email: Freddy Snijder <forager@visionscapers.com>
  License-Expression: MIT
data_forager-0.2.0/data_forager/datasets/tokens_with_aux.py (new file)
@@ -0,0 +1,91 @@
+ """
+ Dataset for reading tokenized samples with auxiliary data.
+
+ This module provides TokensWithAuxDataset which reads samples containing
+ both tokens and auxiliary data (e.g., loss masks) using the schema stored
+ in the sample index.
+ """
+
+ from typing import Dict, Optional
+
+ import numpy as np
+
+ from data_forager.sample_index import SampleIndex
+ from data_forager.datasets.common import Dataset
+
+
+ class TokensWithAuxDataset(Dataset):
+     """
+     Dataset that returns tokens with auxiliary data.
+
+     Reads samples containing multiple arrays (tokens + auxiliary data) using
+     the schema from the sample index to parse the concatenated bytes.
+
+     Requires sample_schema in the SampleIndex. Use TokensDataset for indexes
+     without auxiliary data.
+     """
+
+     @classmethod
+     def create_from_index_on_filesystem(
+         cls,
+         base_path: str,
+         name: Optional[str] = None,
+     ) -> "TokensWithAuxDataset":
+         """
+         Create a TokensWithAuxDataset from an index stored on the filesystem.
+
+         :param base_path: Base path where the index is stored.
+         :param name: Optional name for logging.
+
+         :return: TokensWithAuxDataset instance.
+         """
+         from data_forager.index_stores.fs_based import IndexStore
+
+         index_store = IndexStore(base_path=base_path)
+         sample_index = index_store.load()
+
+         return cls(sample_index=sample_index, name=name)
+
+     def __init__(
+         self,
+         sample_index: SampleIndex,
+         name: Optional[str] = None,
+         **kwargs,
+     ):
+         """
+         Initialize the dataset.
+
+         :param sample_index: SampleIndex with sample_schema describing the
+             structure of samples.
+         :param name: Optional name for logging.
+
+         :raises ValueError: If sample_index has no sample_schema.
+         """
+         super().__init__(sample_index, name=name, **kwargs)
+
+         if sample_index.sample_schema is None:
+             raise ValueError(
+                 "SampleIndex has no sample_schema. "
+                 "Use TokensDataset for indexes without auxiliary data."
+             )
+
+         self._schema = sample_index.sample_schema
+
+     def _process_sample(self, sample_bytes: bytes) -> Dict[str, np.ndarray]:
+         """
+         Parse concatenated bytes into named arrays.
+
+         :param sample_bytes: Raw bytes containing all arrays concatenated.
+
+         :return: Dict mapping array names to numpy arrays.
+         """
+         result = {}
+         for array_spec in self._schema.arrays:
+             dtype = np.dtype(array_spec.dtype)
+             start = array_spec.offset
+             length = self._schema.sample_size * dtype.itemsize
+             result[array_spec.name] = np.frombuffer(
+                 sample_bytes[start:start + length],
+                 dtype=dtype,
+             )
+         return result
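
The parsing above relies on the schema persisted alongside the index. A rough usage sketch follows; the indexing/iteration behaviour comes from the `Dataset` base class in `data_forager.datasets.common`, which is not part of this diff, so the exact access pattern and the `"tokens"`/`"loss_mask"` array names are assumptions here.

```python
# Sketch only: assumes the Dataset base class exposes indexed access that
# returns the output of _process_sample(); that base class is not in this diff.
from data_forager.datasets.tokens_with_aux import TokensWithAuxDataset

dataset = TokensWithAuxDataset.create_from_index_on_filesystem(base_path="./output")

sample = dataset[0]                  # Dict[str, np.ndarray], keyed by ArraySpec.name
tokens = sample["tokens"]            # array name depends on the generator's schema
loss_mask = sample.get("loss_mask")  # auxiliary array, if one was generated
```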
{data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/fs_based.py
@@ -1,3 +1,11 @@
+ """
+ Filesystem-based index store for persisting sample indexes.
+
+ This module provides IndexStore which saves and loads sample indexes
+ to/from the filesystem.
+ """
+
+ import json
  import os.path
  import shutil
  from typing import Optional, TextIO, BinaryIO
@@ -8,19 +16,28 @@ from basics.base import Base

  from data_forager.index_stores.common import IndexStoreInterface
  from data_forager.sample_index import SampleIndex
+ from data_forager.sample_generators.schema import SampleSchema, ArraySpec


  class IndexStore(Base, IndexStoreInterface):
+     """
+     Filesystem-based index store for saving and loading sample indexes.
+
+     Stores index data in a directory with the following structure:
+     - file_location.txt: List of file paths (relative to base_path)
+     - sample_locations.bin: Binary array of (file_index, byte_offset, num_bytes)
+     - sample_schema.json: Optional schema for structured samples
+     """

      def __init__(self, base_path: str, index_data_folder: str = "index", name: Optional[str] = None):
          """
+         Initialize the index store.

          :param base_path: Base path where the index files are stored.
              File paths in file_location.txt are stored relative to this path.
-
-         :param name: Name of instance, if not provided, the classname will be used
+         :param index_data_folder: Name of the folder within base_path for index files.
+         :param name: Name of instance, if not provided, the classname will be used.
          """
-
          super().__init__(pybase_logger_name=name)

          self._base_path = os.path.abspath(base_path)
@@ -31,6 +48,9 @@ class IndexStore(Base, IndexStoreInterface):
          self._file_location_handle: Optional[TextIO] = None
          self._sample_locations_handle: Optional[BinaryIO] = None

+         # Optional schema for structured samples
+         self._sample_schema: Optional[SampleSchema] = None
+
      def init_store(self):
          if os.path.exists(self._index_data_path):
              raise ValueError(f"Provided index path already exists: {self._index_data_path}")
@@ -64,6 +84,15 @@
          sample_location_bytes = np.array([file_index, byte_offset, num_bytes], dtype=np.uint64).tobytes()
          self._sample_locations_handle.write(sample_location_bytes)

+     def set_sample_schema(self, schema: SampleSchema) -> None:
+         """
+         Set and persist the sample schema.
+
+         :param schema: SampleSchema describing the structure of samples.
+         """
+         self._sample_schema = schema
+         self._save_schema()
+
      def close(self):
          """Close file handles and flush buffered data."""
          if self._file_location_handle is not None:
@@ -78,6 +107,11 @@
          self.close()

      def load(self) -> SampleIndex:
+         """
+         Load the sample index from disk.
+
+         :return: SampleIndex with file locations, sample locations, and optional schema.
+         """
          with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
              relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]

@@ -91,7 +125,46 @@
          sample_locations = np.frombuffer(data, dtype=np.uint64)
          sample_locations = sample_locations.reshape((-1, 3))

-         return SampleIndex(file_locations, sample_locations)
+         # Load schema if it exists
+         sample_schema = self._load_schema()
+
+         return SampleIndex(file_locations, sample_locations, sample_schema)
+
+     def _save_schema(self) -> None:
+         """Save the sample schema to sample_schema.json."""
+         if self._sample_schema is None:
+             return
+
+         schema_path = os.path.join(self._index_data_path, "sample_schema.json")
+         schema_dict = {
+             "sample_size": self._sample_schema.sample_size,
+             "arrays": [
+                 {"name": arr.name, "dtype": arr.dtype, "offset": arr.offset}
+                 for arr in self._sample_schema.arrays
+             ],
+             "total_bytes_per_sample": self._sample_schema.total_bytes_per_sample,
+         }
+         with open(schema_path, "w") as f:
+             json.dump(schema_dict, f, indent=2)
+
+     def _load_schema(self) -> Optional[SampleSchema]:
+         """Load the sample schema from sample_schema.json if it exists."""
+         schema_path = os.path.join(self._index_data_path, "sample_schema.json")
+         if not os.path.exists(schema_path):
+             return None
+
+         with open(schema_path, "r") as f:
+             schema_dict = json.load(f)
+
+         arrays = [
+             ArraySpec(name=arr["name"], dtype=arr["dtype"], offset=arr["offset"])
+             for arr in schema_dict["arrays"]
+         ]
+         return SampleSchema(
+             sample_size=schema_dict["sample_size"],
+             arrays=arrays,
+             total_bytes_per_sample=schema_dict["total_bytes_per_sample"],
+         )

      def exists(self) -> bool:
          """Check if the index already exists."""
{data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/text_lines.py
@@ -1,5 +1,12 @@
- from typing import Optional, List, Tuple, Protocol
- from dataclasses import dataclass
+ """
+ File text lines indexer for building sample indexes.
+
+ This module provides FileTextLinesIndexer which scans text files line by line,
+ uses a sample generator to transform lines into samples, and builds an index
+ for random access.
+ """
+
+ from typing import Optional, List

  import os
  import sys
@@ -8,77 +15,19 @@ from basics.base import Base
  from tqdm import tqdm

  from data_forager.index_stores.common import IndexStoreInterface
-
-
- @dataclass
- class SampleData:
-
-     sample_bytes: bytes
-     file_path: str
-
- class SampleGeneratorInterface(Protocol):
-
-     def prepare(self, text_file_path: str):
-         """
-         Prepare sample generation from a new input text file
-
-         :param text_file_path: path to text file
-
-         :return:
-         """
-         ...
-
-     def create_samples(self, text_line: bytes) -> List[SampleData]:
-         """
-         Creates one or more samples from the given text_line and stores it in one or multiple different files.
-         The path to the file(s) in which the samples are stores are also returned.
-
-         IMPORTANT: it is assumed that each sample returned is stored in a file sequentially in the same order.
-         This must also hold over multiple function calls. This is important because the byte offset
-         of a sample is derived from the order the samples are returned.
-
-         :param text_line: Text line in bytes from text_file_path, provided in the prepare phase.
-             The function needs to choose a text encoding itself
-
-         :return: List of DataSample objects. For each created sample the following is given:
-             * Its representation in bytes, as used to store the sample
-             * The file path to where the sample is stored
-
-         """
-         ...
-
-     def finish(self, is_last_file: bool):
-         """
-         Finish generation of samples from text lines of input file at the `text_file_path` given in the prepare() phase.
-
-         is_last_file: indicates if the input text file was the last file to be processed
-
-         :return:
-         """
-         ...
-
-
- class NOOPSampleGenerator(SampleGeneratorInterface):
-
-     def __init__(self):
-         self._current_text_file = None
-
-     def prepare(self, text_file_path: str):
-         self._current_text_file = text_file_path
-
-     def create_samples(self, text_line: bytes) -> List[SampleData]:
-         return [SampleData(text_line, self._current_text_file)]
-
-     def finish(self, is_last_file: bool):
-         self._current_text_file = None
-
-
- def noop_sample_processing(text_line: bytes, text_file_path: str) -> List[SampleData]:
-
-     return [SampleData(text_line, text_file_path)]
+ from data_forager.sample_generators.common import (
+     SampleGeneratorInterface,
+     NOOPSampleGenerator,
+ )


  class FileTextLinesIndexer(Base):
+     """
+     Indexes text files by scanning lines and building a sample index.
+
+     Uses a sample generator to transform text lines into samples, then records
+     byte offsets in an index store for O(1) random access during training.
+     """

      def __init__(
          self,
@@ -104,13 +53,19 @@

      def __call__(self):
          """
-         IMPORTANT: input files are always read in binary mode; applying a text encoding is up to the user.
-         E.g. through process_sample_func and/or when processing the data Dataset::_process_sample()
+         Run the indexing process.

-         :return:
+         IMPORTANT: Input files are always read in binary mode; applying a text
+         encoding is up to the user, e.g., through process_sample_func and/or
+         when processing the data in Dataset._process_sample().
          """
          self._index_store.init_store()

+         # Set schema if the sample generator provides one
+         sample_schema = self._sample_generator.get_sample_schema()
+         if sample_schema is not None:
+             self._index_store.set_sample_schema(sample_schema)
+
          byte_offset_map = {}

          for input_file_path in self._input_file_paths:
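
With the generator machinery moved to `data_forager.sample_generators.common`, the indexer now also asks the generator for an optional `SampleSchema` via `get_sample_schema()`. Below is a rough sketch of a custom generator against that contract; it assumes the moved protocol keeps the `prepare()`/`create_samples()`/`finish()` methods documented in the removed code and adds `get_sample_schema()`, none of which are shown in this section.

```python
# Rough sketch of a custom sample generator. Assumes the protocol moved to
# data_forager.sample_generators.common also declares get_sample_schema(),
# which FileTextLinesIndexer.__call__ invokes above.
from typing import List, Optional

from data_forager.sample_generators.common import SampleData
from data_forager.sample_generators.schema import SampleSchema


class UppercaseLineGenerator:
    """Writes each input line back out upper-cased, one sample per line."""

    def __init__(self, output_file_path: str):
        self._output_file_path = output_file_path
        self._handle = None

    def prepare(self, text_file_path: str):
        if self._handle is None:
            self._handle = open(self._output_file_path, "ab")

    def create_samples(self, text_line: bytes) -> List[SampleData]:
        sample_bytes = text_line.strip().upper()
        # Samples must be written sequentially, in the order they are returned,
        # because the indexer derives byte offsets from that order.
        self._handle.write(sample_bytes)
        return [SampleData(sample_bytes, self._output_file_path)]

    def finish(self, is_last_file: bool):
        if is_last_file and self._handle is not None:
            self._handle.close()

    def get_sample_schema(self) -> Optional[SampleSchema]:
        return None  # plain byte samples, no structured schema
```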
data_forager-0.2.0/data_forager/indexers/tokenization_indexer.py (new file)
@@ -0,0 +1,310 @@
+ """
+ Factory function for creating tokenization and indexing pipelines.
+
+ This module provides a convenience function for setting up the complete pipeline
+ to tokenize JSONL text files and create an index for random access.
+ """
+
+ from typing import Callable, Dict, List, Optional
+
+ import json
+ import logging
+ import os
+
+ from basics.logging import get_logger
+
+ from data_forager.index_stores.common import IndexStoreInterface
+ from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore
+ from data_forager.indexers.text_lines import FileTextLinesIndexer
+ from data_forager.sample_generators.tokenization import (
+     TokenizedSampleGenerator,
+     TokenizerFunc,
+     ProcessTextLineFunc,
+ )
+ from data_forager.sample_generators.aux.common import Part, AuxDataGenerator
+ from data_forager.utils import find_files_recursive, natural_sort
+
+
+ ProcessPartsFunc = Callable[[bytes], List[Part]]
+
+
+ module_logger = get_logger(os.path.basename(__file__))
+
+
+ def get_text_from_jsonl(jsonl_bytes: bytes, text_key: str = "text", text_encoding: str = "utf-8") -> str:
+     """
+     Extract text from a JSONL line.
+
+     :param jsonl_bytes: Raw bytes of the JSONL line.
+     :param text_key: Key in the JSON object containing the text.
+     :param text_encoding: Text encoding to use for decoding.
+
+     :return: The extracted text string.
+     """
+     jsonl_text = jsonl_bytes.decode(text_encoding)
+     data = json.loads(jsonl_text)
+     return data[text_key]
+
+
+ def create_tokenize_and_index_jsonl_text_func(
+     tokenizer_func: TokenizerFunc,
+     eos_idx: int,
+     input_base_path: Optional[str] = None,
+     input_file_paths: Optional[List[str]] = None,
+     output_base_path: Optional[str] = None,
+     index_store: Optional[IndexStoreInterface] = None,
+     process_text_line_func: Optional[ProcessTextLineFunc] = None,
+     logger: Optional[logging.Logger] = None,
+     name: Optional[str] = None,
+     **sample_generator_kwargs,
+ ) -> FileTextLinesIndexer:
+     """
+     Create a pipeline to tokenize text from JSONL files and create an index for random access.
+
+     The pipeline:
+     * Tokenizes text from input JSONL objects
+     * Stores the token data in bin files under "tokenized-samples" folder
+     * Stores index data under "index" folder
+
+     Usage:
+     ```python
+     import tiktoken
+
+     enc = tiktoken.get_encoding("gpt2")
+     def tokenize_text(text: str) -> List[int]:
+         return enc.encode_ordinary(text)
+
+     # Option 1: Scan directory for JSONL files, output to same directory
+     indexer = create_tokenize_and_index_jsonl_text_func(
+         tokenizer_func=tokenize_text,
+         eos_idx=enc.eot_token,
+         input_base_path='./data',
+         sample_size=1024,
+     )
+
+     # Option 2: Explicit input files and output path
+     indexer = create_tokenize_and_index_jsonl_text_func(
+         tokenizer_func=tokenize_text,
+         eos_idx=enc.eot_token,
+         input_file_paths=['./data/train.jsonl'],
+         output_base_path='./output',
+         sample_size=1024,
+     )
+
+     # Run tokenization and indexing
+     indexer()
+     ```
+
+     :param tokenizer_func: Function used to tokenize text.
+     :param eos_idx: EOS token index, known by the used Tokenizer.
+     :param input_base_path: Path to directory containing JSONL files (searched recursively).
+         Used as fallback for output if `output_base_path` is not provided.
+     :param input_file_paths: List of file paths to process. If provided, these are used
+         instead of scanning `input_base_path` for JSONL files.
+     :param output_base_path: Base path for output (index and tokenized samples).
+         If not provided, `input_base_path` is used.
+     :param index_store: Index store to use. If provided, this is used instead of
+         creating a new FSBasedIndexStore.
+     :param process_text_line_func: Function used to process text lines.
+         By default, this converts input JSON lines to dicts and returns the "text" field.
+         See function get_text_from_jsonl().
+     :param logger: Logger to use. If not provided, uses module logger.
+     :param name: Name of the indexer, used for logging purposes.
+     :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleGenerator
+         (e.g., sample_size, token_dtype, base_output_path).
+
+     :raises ValueError: If both `input_base_path` and `input_file_paths` are None.
+     :raises ValueError: If `index_store` is None and both `output_base_path` and
+         `input_base_path` are None.
+
+     :return: FileTextLinesIndexer instance that can be called to run tokenization
+         and indexing.
+     """
+     if logger is None:
+         logger = module_logger
+
+     # Validate input source
+     if input_base_path is None and input_file_paths is None:
+         raise ValueError(
+             "Either input_base_path or input_file_paths must be provided"
+         )
+
+     # Determine output base path
+     effective_output_base_path = output_base_path or input_base_path
+
+     # Validate output destination
+     if index_store is None and effective_output_base_path is None:
+         raise ValueError(
+             "Either index_store, output_base_path, or input_base_path must be provided "
+             "to determine where to store the index"
+         )
+
+     logger.info(f"Output base path: {effective_output_base_path}")
+
+     if process_text_line_func is None:
+         process_text_line_func = get_text_from_jsonl
+
+     if index_store is None:
+         index_store = FSBasedIndexStore(
+             base_path=effective_output_base_path,
+         )
+
+     if input_file_paths is None:
+         logger.info(f"Scanning for JSONL files in: {input_base_path}")
+         input_file_paths = find_files_recursive(
+             input_base_path,
+             extension_patterns=['*.jsonl', '*.JSONL']
+         )
+         # Assuming numbered files
+         input_file_paths = natural_sort(input_file_paths)
+         logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+     # Set default base_output_path for tokenized samples if not provided in kwargs
+     if 'base_output_path' not in sample_generator_kwargs:
+         default_base_output_path = os.path.join(
+             effective_output_base_path, "tokenized-samples"
+         )
+         logger.info(f"Tokenized samples output path: {default_base_output_path}")
+         sample_generator_kwargs['base_output_path'] = default_base_output_path
+
+     sample_generator = TokenizedSampleGenerator(
+         process_text_line_func=process_text_line_func,
+         tokenizer_func=tokenizer_func,
+         eos_idx=eos_idx,
+         **sample_generator_kwargs
+     )
+
+     return FileTextLinesIndexer(
+         input_file_paths=input_file_paths,
+         index_store=index_store,
+         sample_generator=sample_generator,
+         description="Tokenizing and indexing",
+         name=name,
+     )
+
+
+ def create_tokenize_and_index_with_aux_func(
+     process_parts_func: ProcessPartsFunc,
+     tokenizer_func: TokenizerFunc,
+     eos_idx: int,
+     aux_generators: Dict[str, AuxDataGenerator],
+     input_base_path: Optional[str] = None,
+     input_file_paths: Optional[List[str]] = None,
+     output_base_path: Optional[str] = None,
+     index_store: Optional[IndexStoreInterface] = None,
+     logger: Optional[logging.Logger] = None,
+     name: Optional[str] = None,
+     **sample_generator_kwargs,
+ ) -> FileTextLinesIndexer:
+     """
+     Create a pipeline to tokenize structured samples with auxiliary data.
+
+     This function creates a pipeline that:
+     * Processes structured input (parts with types) from JSONL files
+     * Tokenizes each part and generates auxiliary data (e.g., loss masks)
+     * Stores concatenated token + aux data in bin files
+     * Creates an index with schema for random access
+
+     Usage:
+     ```python
+     from data_forager.sample_generators.aux import Part, LossMaskGenerator
+
+     def parse_parts(line_bytes: bytes) -> List[Part]:
+         data = json.loads(line_bytes.decode('utf-8'))
+         return [Part(type=p['type'], text=p['text']) for p in data['parts']]
+
+     indexer = create_tokenize_and_index_with_aux_func(
+         process_parts_func=parse_parts,
+         tokenizer_func=tokenizer.encode,
+         eos_idx=tokenizer.eos_token_id,
+         aux_generators={'loss_mask': LossMaskGenerator()},
+         input_base_path='./data',
+         sample_size=4096,
+     )
+
+     indexer()
+     ```
+
+     :param process_parts_func: Function to extract typed parts from input bytes.
+         Takes JSONL bytes and returns List[Part].
+     :param tokenizer_func: Function used to tokenize text.
+     :param eos_idx: EOS token index, known by the used Tokenizer.
+     :param aux_generators: Dict mapping names to AuxDataGenerator instances.
+         Example: {'loss_mask': LossMaskGenerator()}
+     :param input_base_path: Path to directory containing JSONL files.
+     :param input_file_paths: List of file paths to process.
+     :param output_base_path: Base path for output (index and tokenized samples).
+     :param index_store: Index store to use. Must support set_sample_schema().
+     :param logger: Logger to use.
+     :param name: Name of the indexer for logging.
+     :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleWithAuxGenerator
+         (e.g., sample_size, token_dtype).
+
+     :raises ValueError: If both input_base_path and input_file_paths are None.
+     :raises ValueError: If output destination cannot be determined.
+
+     :return: FileTextLinesIndexer instance that can be called to run the pipeline.
+     """
+     # Import here to avoid circular dependency
+     from data_forager.sample_generators.tokenization_with_aux import (
+         TokenizedSampleWithAuxGenerator,
+     )
+
+     if logger is None:
+         logger = module_logger
+
+     # Validate input source
+     if input_base_path is None and input_file_paths is None:
+         raise ValueError(
+             "Either input_base_path or input_file_paths must be provided"
+         )
+
+     # Determine output base path
+     effective_output_base_path = output_base_path or input_base_path
+
+     # Validate output destination
+     if index_store is None and effective_output_base_path is None:
+         raise ValueError(
+             "Either index_store, output_base_path, or input_base_path must be provided "
+             "to determine where to store the index"
+         )
+
+     logger.info(f"Output base path: {effective_output_base_path}")
+
+     if index_store is None:
+         index_store = FSBasedIndexStore(
+             base_path=effective_output_base_path,
+         )
+
+     if input_file_paths is None:
+         logger.info(f"Scanning for JSONL files in: {input_base_path}")
+         input_file_paths = find_files_recursive(
+             input_base_path,
+             extension_patterns=['*.jsonl', '*.JSONL']
+         )
+         input_file_paths = natural_sort(input_file_paths)
+         logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+     # Set default base_output_path for tokenized samples if not provided
+     if 'base_output_path' not in sample_generator_kwargs:
+         default_base_output_path = os.path.join(
+             effective_output_base_path, "tokenized-samples"
+         )
+         logger.info(f"Tokenized samples output path: {default_base_output_path}")
+         sample_generator_kwargs['base_output_path'] = default_base_output_path
+
+     sample_generator = TokenizedSampleWithAuxGenerator(
+         process_parts_func=process_parts_func,
+         tokenizer_func=tokenizer_func,
+         eos_idx=eos_idx,
+         aux_generators=aux_generators,
+         **sample_generator_kwargs
+     )
+
+     return FileTextLinesIndexer(
+         input_file_paths=input_file_paths,
+         index_store=index_store,
+         sample_generator=sample_generator,
+         description="Tokenizing with aux data and indexing",
+         name=name,
+     )
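
The second factory expects each JSONL line to carry typed parts that `process_parts_func` turns into `Part` objects. Working from the `parse_parts()` example in the docstring above, an input record could look roughly like the following; the `prompt`/`response` type names are placeholders, not something this diff defines.

```python
# Illustrative input record for create_tokenize_and_index_with_aux_func(),
# shaped to match the parse_parts() example in the docstring above.
# The "prompt"/"response" type names are assumptions, not defined in this diff.
import json

example_line = json.dumps({
    "parts": [
        {"type": "prompt", "text": "What does data-forager do?"},
        {"type": "response", "text": "It enables random access to large datasets on disk."},
    ]
})
# Each such line is tokenized per part, auxiliary arrays (e.g. a loss mask) are
# generated, and the result can later be read back with TokensWithAuxDataset.
```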
data_forager-0.2.0/data_forager/sample_generators/__init__.py (new file)
@@ -0,0 +1,30 @@
+ """
+ Sample generators for transforming input data into samples.
+
+ This package contains:
+ - SampleGeneratorInterface: Protocol for sample generators
+ - SampleData: Data class for sample information
+ - SampleSchema, ArraySpec: Schema classes for structured samples
+ - TokenizedSampleGenerator: Tokenizes text into fixed-length samples
+ - TokenizedSampleWithAuxGenerator: Tokenizes with auxiliary data (loss masks, etc.)
+ """
+
+ from data_forager.sample_generators.common import (
+     SampleData,
+     SampleGeneratorInterface,
+     NOOPSampleGenerator,
+     noop_sample_processing,
+ )
+ from data_forager.sample_generators.schema import (
+     ArraySpec,
+     SampleSchema,
+ )
+
+ __all__ = [
+     "ArraySpec",
+     "NOOPSampleGenerator",
+     "SampleData",
+     "SampleGeneratorInterface",
+     "SampleSchema",
+     "noop_sample_processing",
+ ]
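
The schema classes re-exported here are the ones `IndexStore._load_schema()` reconstructs. A minimal sketch of building and persisting a schema by hand follows, with constructor keywords inferred from that loader; the actual definitions in `sample_generators/schema.py` are not shown in this section, so the exact signatures are assumptions, and the sizes are example values.

```python
# Minimal sketch; keyword names mirror IndexStore._load_schema() above, but the
# ArraySpec/SampleSchema definitions themselves are not part of this diff.
from data_forager.index_stores.fs_based import IndexStore
from data_forager.sample_generators import ArraySpec, SampleSchema

schema = SampleSchema(
    sample_size=1024,
    arrays=[
        ArraySpec(name="tokens", dtype="uint16", offset=0),
        ArraySpec(name="loss_mask", dtype="uint8", offset=2048),
    ],
    total_bytes_per_sample=3072,
)

index_store = IndexStore(base_path="./output")
index_store.init_store()               # creates the index folder
index_store.set_sample_schema(schema)  # persists index/sample_schema.json
```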