data-forager 0.1.6__tar.gz → 0.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {data_forager-0.1.6 → data_forager-0.2.0}/PKG-INFO +1 -1
- data_forager-0.2.0/data_forager/datasets/tokens_with_aux.py +91 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/fs_based.py +77 -4
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/text_lines.py +28 -73
- data_forager-0.2.0/data_forager/indexers/tokenization_indexer.py +310 -0
- data_forager-0.2.0/data_forager/sample_generators/__init__.py +30 -0
- data_forager-0.2.0/data_forager/sample_generators/aux/__init__.py +18 -0
- data_forager-0.2.0/data_forager/sample_generators/aux/common.py +77 -0
- data_forager-0.2.0/data_forager/sample_generators/aux/loss_mask.py +78 -0
- data_forager-0.2.0/data_forager/sample_generators/common.py +117 -0
- data_forager-0.2.0/data_forager/sample_generators/schema.py +54 -0
- data_forager-0.2.0/data_forager/sample_generators/tokenization.py +210 -0
- data_forager-0.2.0/data_forager/sample_generators/tokenization_with_aux.py +250 -0
- data_forager-0.2.0/data_forager/sample_index.py +74 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/PKG-INFO +1 -1
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/SOURCES.txt +10 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/pyproject.toml +1 -1
- data_forager-0.2.0/tests/test_tokenization_with_aux.py +670 -0
- data_forager-0.1.6/data_forager/indexers/tokenization_indexer.py +0 -343
- data_forager-0.1.6/data_forager/sample_index.py +0 -42
- {data_forager-0.1.6 → data_forager-0.2.0}/LICENSE +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/README.md +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/__init__.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/__init__.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/common.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/jsonl.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/datasets/tokens.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/__init__.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/common.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/__init__.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/jsonl_indexer.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/unzip_files.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager/utils.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/dependency_links.txt +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/requires.txt +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/data_forager.egg-info/top_level.txt +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/setup.cfg +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/tests/test_jsonl_indexing.py +0 -0
- {data_forager-0.1.6 → data_forager-0.2.0}/tests/test_tokenizing_indexing_jsonl.py +0 -0

data_forager-0.2.0/data_forager/datasets/tokens_with_aux.py
@@ -0,0 +1,91 @@
+"""
+Dataset for reading tokenized samples with auxiliary data.
+
+This module provides TokensWithAuxDataset which reads samples containing
+both tokens and auxiliary data (e.g., loss masks) using the schema stored
+in the sample index.
+"""
+
+from typing import Dict, Optional
+
+import numpy as np
+
+from data_forager.sample_index import SampleIndex
+from data_forager.datasets.common import Dataset
+
+
+class TokensWithAuxDataset(Dataset):
+    """
+    Dataset that returns tokens with auxiliary data.
+
+    Reads samples containing multiple arrays (tokens + auxiliary data) using
+    the schema from the sample index to parse the concatenated bytes.
+
+    Requires sample_schema in the SampleIndex. Use TokensDataset for indexes
+    without auxiliary data.
+    """
+
+    @classmethod
+    def create_from_index_on_filesystem(
+        cls,
+        base_path: str,
+        name: Optional[str] = None,
+    ) -> "TokensWithAuxDataset":
+        """
+        Create a TokensWithAuxDataset from an index stored on the filesystem.
+
+        :param base_path: Base path where the index is stored.
+        :param name: Optional name for logging.
+
+        :return: TokensWithAuxDataset instance.
+        """
+        from data_forager.index_stores.fs_based import IndexStore
+
+        index_store = IndexStore(base_path=base_path)
+        sample_index = index_store.load()
+
+        return cls(sample_index=sample_index, name=name)
+
+    def __init__(
+        self,
+        sample_index: SampleIndex,
+        name: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Initialize the dataset.
+
+        :param sample_index: SampleIndex with sample_schema describing the
+            structure of samples.
+        :param name: Optional name for logging.
+
+        :raises ValueError: If sample_index has no sample_schema.
+        """
+        super().__init__(sample_index, name=name, **kwargs)
+
+        if sample_index.sample_schema is None:
+            raise ValueError(
+                "SampleIndex has no sample_schema. "
+                "Use TokensDataset for indexes without auxiliary data."
+            )
+
+        self._schema = sample_index.sample_schema
+
+    def _process_sample(self, sample_bytes: bytes) -> Dict[str, np.ndarray]:
+        """
+        Parse concatenated bytes into named arrays.
+
+        :param sample_bytes: Raw bytes containing all arrays concatenated.
+
+        :return: Dict mapping array names to numpy arrays.
+        """
+        result = {}
+        for array_spec in self._schema.arrays:
+            dtype = np.dtype(array_spec.dtype)
+            start = array_spec.offset
+            length = self._schema.sample_size * dtype.itemsize
+            result[array_spec.name] = np.frombuffer(
+                sample_bytes[start:start + length],
+                dtype=dtype,
+            )
+        return result
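
For reference, the byte layout `_process_sample` expects can be reproduced without the package itself: the sketch below packs two arrays the way a schema would describe them (a per-array `offset`, a shared `sample_size`) and splits them back with the same `np.frombuffer` arithmetic. The array names and dtypes (`tokens` as uint16, `loss_mask` as uint8) are illustrative assumptions, not fixed by this diff.

```python
import numpy as np

# Hypothetical layout: 4 uint16 tokens followed by 4 uint8 loss-mask flags per sample.
sample_size = 4
specs = [
    {"name": "tokens", "dtype": "uint16", "offset": 0},
    {"name": "loss_mask", "dtype": "uint8", "offset": sample_size * np.dtype("uint16").itemsize},
]

tokens = np.array([10, 11, 12, 13], dtype=np.uint16)
loss_mask = np.array([1, 1, 0, 1], dtype=np.uint8)
sample_bytes = tokens.tobytes() + loss_mask.tobytes()

# Same parsing arithmetic as TokensWithAuxDataset._process_sample above.
result = {}
for spec in specs:
    dtype = np.dtype(spec["dtype"])
    start = spec["offset"]
    length = sample_size * dtype.itemsize
    result[spec["name"]] = np.frombuffer(sample_bytes[start:start + length], dtype=dtype)

print(result["tokens"])     # [10 11 12 13]
print(result["loss_mask"])  # [1 1 0 1]
```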

{data_forager-0.1.6 → data_forager-0.2.0}/data_forager/index_stores/fs_based.py
@@ -1,3 +1,11 @@
+"""
+Filesystem-based index store for persisting sample indexes.
+
+This module provides IndexStore which saves and loads sample indexes
+to/from the filesystem.
+"""
+
+import json
 import os.path
 import shutil
 from typing import Optional, TextIO, BinaryIO
@@ -8,19 +16,28 @@ from basics.base import Base

 from data_forager.index_stores.common import IndexStoreInterface
 from data_forager.sample_index import SampleIndex
+from data_forager.sample_generators.schema import SampleSchema, ArraySpec


 class IndexStore(Base, IndexStoreInterface):
+    """
+    Filesystem-based index store for saving and loading sample indexes.
+
+    Stores index data in a directory with the following structure:
+    - file_location.txt: List of file paths (relative to base_path)
+    - sample_locations.bin: Binary array of (file_index, byte_offset, num_bytes)
+    - sample_schema.json: Optional schema for structured samples
+    """

     def __init__(self, base_path: str, index_data_folder: str = "index", name: Optional[str] = None):
         """
+        Initialize the index store.

         :param base_path: Base path where the index files are stored.
             File paths in file_location.txt are stored relative to this path.
-
-        :param name: Name of instance, if not provided, the classname will be used
+        :param index_data_folder: Name of the folder within base_path for index files.
+        :param name: Name of instance, if not provided, the classname will be used.
         """
-
         super().__init__(pybase_logger_name=name)

         self._base_path = os.path.abspath(base_path)
@@ -31,6 +48,9 @@ class IndexStore(Base, IndexStoreInterface):
         self._file_location_handle: Optional[TextIO] = None
         self._sample_locations_handle: Optional[BinaryIO] = None

+        # Optional schema for structured samples
+        self._sample_schema: Optional[SampleSchema] = None
+
     def init_store(self):
         if os.path.exists(self._index_data_path):
             raise ValueError(f"Provided index path already exists: {self._index_data_path}")
@@ -64,6 +84,15 @@ class IndexStore(Base, IndexStoreInterface):
         sample_location_bytes = np.array([file_index, byte_offset, num_bytes], dtype=np.uint64).tobytes()
         self._sample_locations_handle.write(sample_location_bytes)

+    def set_sample_schema(self, schema: SampleSchema) -> None:
+        """
+        Set and persist the sample schema.
+
+        :param schema: SampleSchema describing the structure of samples.
+        """
+        self._sample_schema = schema
+        self._save_schema()
+
     def close(self):
         """Close file handles and flush buffered data."""
         if self._file_location_handle is not None:
@@ -78,6 +107,11 @@ class IndexStore(Base, IndexStoreInterface):
         self.close()

     def load(self) -> SampleIndex:
+        """
+        Load the sample index from disk.
+
+        :return: SampleIndex with file locations, sample locations, and optional schema.
+        """
         with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
             relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]

@@ -91,7 +125,46 @@ class IndexStore(Base, IndexStoreInterface):
         sample_locations = np.frombuffer(data, dtype=np.uint64)
         sample_locations = sample_locations.reshape((-1, 3))

-        return SampleIndex(file_locations, sample_locations)
+        # Load schema if it exists
+        sample_schema = self._load_schema()
+
+        return SampleIndex(file_locations, sample_locations, sample_schema)
+
+    def _save_schema(self) -> None:
+        """Save the sample schema to sample_schema.json."""
+        if self._sample_schema is None:
+            return
+
+        schema_path = os.path.join(self._index_data_path, "sample_schema.json")
+        schema_dict = {
+            "sample_size": self._sample_schema.sample_size,
+            "arrays": [
+                {"name": arr.name, "dtype": arr.dtype, "offset": arr.offset}
+                for arr in self._sample_schema.arrays
+            ],
+            "total_bytes_per_sample": self._sample_schema.total_bytes_per_sample,
+        }
+        with open(schema_path, "w") as f:
+            json.dump(schema_dict, f, indent=2)
+
+    def _load_schema(self) -> Optional[SampleSchema]:
+        """Load the sample schema from sample_schema.json if it exists."""
+        schema_path = os.path.join(self._index_data_path, "sample_schema.json")
+        if not os.path.exists(schema_path):
+            return None
+
+        with open(schema_path, "r") as f:
+            schema_dict = json.load(f)
+
+        arrays = [
+            ArraySpec(name=arr["name"], dtype=arr["dtype"], offset=arr["offset"])
+            for arr in schema_dict["arrays"]
+        ]
+        return SampleSchema(
+            sample_size=schema_dict["sample_size"],
+            arrays=arrays,
+            total_bytes_per_sample=schema_dict["total_bytes_per_sample"],
+        )

     def exists(self) -> bool:
         """Check if the index already exists."""
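
Going by `_save_schema` above, `sample_schema.json` is a plain JSON object with `sample_size`, an `arrays` list of `{name, dtype, offset}` entries, and `total_bytes_per_sample`. A sketch of what the file could look like for a hypothetical tokens-plus-loss-mask schema (names and sizes are illustrative):

```python
import json

# Illustrative content of <base_path>/index/sample_schema.json:
# 1024 uint16 tokens followed by 1024 uint8 loss-mask flags per sample.
schema_dict = {
    "sample_size": 1024,
    "arrays": [
        {"name": "tokens", "dtype": "uint16", "offset": 0},
        {"name": "loss_mask", "dtype": "uint8", "offset": 2048},
    ],
    "total_bytes_per_sample": 3072,
}
print(json.dumps(schema_dict, indent=2))
```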

{data_forager-0.1.6 → data_forager-0.2.0}/data_forager/indexers/text_lines.py
@@ -1,5 +1,12 @@
-
-
+"""
+File text lines indexer for building sample indexes.
+
+This module provides FileTextLinesIndexer which scans text files line by line,
+uses a sample generator to transform lines into samples, and builds an index
+for random access.
+"""
+
+from typing import Optional, List

 import os
 import sys
@@ -8,77 +15,19 @@ from basics.base import Base
 from tqdm import tqdm

 from data_forager.index_stores.common import IndexStoreInterface
-
-
-
-
-
-    sample_bytes: bytes
-    file_path: str
-
-class SampleGeneratorInterface(Protocol):
-
-    def prepare(self, text_file_path: str):
-        """
-        Prepare sample generation from a new input text file
-
-        :param text_file_path: path to text file
-
-        :return:
-        """
-        ...
-
-    def create_samples(self, text_line: bytes) -> List[SampleData]:
-        """
-        Creates one or more samples from the given text_line and stores it in one or multiple different files.
-        The path to the file(s) in which the samples are stores are also returned.
-
-        IMPORTANT: it is assumed that each sample returned is stored in a file sequentially in the same order.
-        This must also hold over multiple function calls. This is important because the byte offset
-        of a sample is derived from the order the samples are returned.
-
-        :param text_line: Text line in bytes from text_file_path, provided in the prepare phase.
-            The function needs to choose a text encoding itself
-
-        :return: List of DataSample objects. For each created sample the following is given:
-            * Its representation in bytes, as used to store the sample
-            * The file path to where the sample is stored
-
-        """
-        ...
-
-    def finish(self, is_last_file: bool):
-        """
-        Finish generation of samples from text lines of input file at the `text_file_path` given in the prepare() phase.
-
-        is_last_file: indicates if the input text file was the last file to be processed
-
-        :return:
-        """
-        ...
-
-
-class NOOPSampleGenerator(SampleGeneratorInterface):
-
-    def __init__(self):
-        self._current_text_file = None
-
-    def prepare(self, text_file_path: str):
-        self._current_text_file = text_file_path
-
-    def create_samples(self, text_line: bytes) -> List[SampleData]:
-        return [SampleData(text_line, self._current_text_file)]
-
-    def finish(self, is_last_file: bool):
-        self._current_text_file = None
-
-
-def noop_sample_processing(text_line: bytes, text_file_path: str) -> List[SampleData]:
-
-    return [SampleData(text_line, text_file_path)]
+from data_forager.sample_generators.common import (
+    SampleGeneratorInterface,
+    NOOPSampleGenerator,
+)


 class FileTextLinesIndexer(Base):
+    """
+    Indexes text files by scanning lines and building a sample index.
+
+    Uses a sample generator to transform text lines into samples, then records
+    byte offsets in an index store for O(1) random access during training.
+    """

     def __init__(
         self,
@@ -104,13 +53,19 @@ class FileTextLinesIndexer(Base):

     def __call__(self):
         """
-
-        E.g. through process_sample_func and/or when processing the data Dataset::_process_sample()
+        Run the indexing process.

-        :
+        IMPORTANT: Input files are always read in binary mode; applying a text
+        encoding is up to the user, e.g., through process_sample_func and/or
+        when processing the data in Dataset._process_sample().
         """
         self._index_store.init_store()

+        # Set schema if the sample generator provides one
+        sample_schema = self._sample_generator.get_sample_schema()
+        if sample_schema is not None:
+            self._index_store.set_sample_schema(sample_schema)
+
         byte_offset_map = {}

         for input_file_path in self._input_file_paths:
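
The `SampleData`/`SampleGeneratorInterface` definitions that used to live here now come from `data_forager.sample_generators.common`, and `__call__` above additionally asks the generator for a schema via `get_sample_schema()`. A rough sketch of a custom generator, assuming the moved protocol keeps the `prepare`/`create_samples`/`finish` methods and the `SampleData(sample_bytes, file_path)` shape shown in the removed code, plus a `get_sample_schema()` that may return `None`:

```python
from typing import List, Optional

from data_forager.sample_generators.common import SampleData
from data_forager.sample_generators.schema import SampleSchema


class UppercaseSampleGenerator:
    """Toy generator: writes each input line, upper-cased, to a single output file."""

    def __init__(self, output_file_path: str):
        self._output_file_path = output_file_path
        self._output_handle = None

    def prepare(self, text_file_path: str):
        # Called once per input file, before its lines are processed.
        if self._output_handle is None:
            self._output_handle = open(self._output_file_path, "ab")

    def create_samples(self, text_line: bytes) -> List[SampleData]:
        # Samples must be written sequentially, in the order they are returned,
        # because the index derives byte offsets from that order.
        sample_bytes = text_line.upper()
        self._output_handle.write(sample_bytes)
        return [SampleData(sample_bytes, self._output_file_path)]

    def finish(self, is_last_file: bool):
        if is_last_file and self._output_handle is not None:
            self._output_handle.close()
            self._output_handle = None

    def get_sample_schema(self) -> Optional[SampleSchema]:
        # No fixed per-sample structure here, so no schema is registered.
        return None
```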

data_forager-0.2.0/data_forager/indexers/tokenization_indexer.py
@@ -0,0 +1,310 @@
+"""
+Factory function for creating tokenization and indexing pipelines.
+
+This module provides a convenience function for setting up the complete pipeline
+to tokenize JSONL text files and create an index for random access.
+"""
+
+from typing import Callable, Dict, List, Optional
+
+import json
+import logging
+import os
+
+from basics.logging import get_logger
+
+from data_forager.index_stores.common import IndexStoreInterface
+from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore
+from data_forager.indexers.text_lines import FileTextLinesIndexer
+from data_forager.sample_generators.tokenization import (
+    TokenizedSampleGenerator,
+    TokenizerFunc,
+    ProcessTextLineFunc,
+)
+from data_forager.sample_generators.aux.common import Part, AuxDataGenerator
+from data_forager.utils import find_files_recursive, natural_sort
+
+
+ProcessPartsFunc = Callable[[bytes], List[Part]]
+
+
+module_logger = get_logger(os.path.basename(__file__))
+
+
+def get_text_from_jsonl(jsonl_bytes: bytes, text_key: str = "text", text_encoding: str = "utf-8") -> str:
+    """
+    Extract text from a JSONL line.
+
+    :param jsonl_bytes: Raw bytes of the JSONL line.
+    :param text_key: Key in the JSON object containing the text.
+    :param text_encoding: Text encoding to use for decoding.
+
+    :return: The extracted text string.
+    """
+    jsonl_text = jsonl_bytes.decode(text_encoding)
+    data = json.loads(jsonl_text)
+    return data[text_key]
+
+
+def create_tokenize_and_index_jsonl_text_func(
+    tokenizer_func: TokenizerFunc,
+    eos_idx: int,
+    input_base_path: Optional[str] = None,
+    input_file_paths: Optional[List[str]] = None,
+    output_base_path: Optional[str] = None,
+    index_store: Optional[IndexStoreInterface] = None,
+    process_text_line_func: Optional[ProcessTextLineFunc] = None,
+    logger: Optional[logging.Logger] = None,
+    name: Optional[str] = None,
+    **sample_generator_kwargs,
+) -> FileTextLinesIndexer:
+    """
+    Create a pipeline to tokenize text from JSONL files and create an index for random access.
+
+    The pipeline:
+    * Tokenizes text from input JSONL objects
+    * Stores the token data in bin files under "tokenized-samples" folder
+    * Stores index data under "index" folder
+
+    Usage:
+    ```python
+    import tiktoken
+
+    enc = tiktoken.get_encoding("gpt2")
+    def tokenize_text(text: str) -> List[int]:
+        return enc.encode_ordinary(text)
+
+    # Option 1: Scan directory for JSONL files, output to same directory
+    indexer = create_tokenize_and_index_jsonl_text_func(
+        tokenizer_func=tokenize_text,
+        eos_idx=enc.eot_token,
+        input_base_path='./data',
+        sample_size=1024,
+    )
+
+    # Option 2: Explicit input files and output path
+    indexer = create_tokenize_and_index_jsonl_text_func(
+        tokenizer_func=tokenize_text,
+        eos_idx=enc.eot_token,
+        input_file_paths=['./data/train.jsonl'],
+        output_base_path='./output',
+        sample_size=1024,
+    )
+
+    # Run tokenization and indexing
+    indexer()
+    ```
+
+    :param tokenizer_func: Function used to tokenize text.
+    :param eos_idx: EOS token index, known by the used Tokenizer.
+    :param input_base_path: Path to directory containing JSONL files (searched recursively).
+        Used as fallback for output if `output_base_path` is not provided.
+    :param input_file_paths: List of file paths to process. If provided, these are used
+        instead of scanning `input_base_path` for JSONL files.
+    :param output_base_path: Base path for output (index and tokenized samples).
+        If not provided, `input_base_path` is used.
+    :param index_store: Index store to use. If provided, this is used instead of
+        creating a new FSBasedIndexStore.
+    :param process_text_line_func: Function used to process text lines.
+        By default, this converts input JSON lines to dicts and returns the "text" field.
+        See function get_text_from_jsonl().
+    :param logger: Logger to use. If not provided, uses module logger.
+    :param name: Name of the indexer, used for logging purposes.
+    :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleGenerator
+        (e.g., sample_size, token_dtype, base_output_path).
+
+    :raises ValueError: If both `input_base_path` and `input_file_paths` are None.
+    :raises ValueError: If `index_store` is None and both `output_base_path` and
+        `input_base_path` are None.
+
+    :return: FileTextLinesIndexer instance that can be called to run tokenization
+        and indexing.
+    """
+    if logger is None:
+        logger = module_logger
+
+    # Validate input source
+    if input_base_path is None and input_file_paths is None:
+        raise ValueError(
+            "Either input_base_path or input_file_paths must be provided"
+        )
+
+    # Determine output base path
+    effective_output_base_path = output_base_path or input_base_path
+
+    # Validate output destination
+    if index_store is None and effective_output_base_path is None:
+        raise ValueError(
+            "Either index_store, output_base_path, or input_base_path must be provided "
+            "to determine where to store the index"
+        )
+
+    logger.info(f"Output base path: {effective_output_base_path}")
+
+    if process_text_line_func is None:
+        process_text_line_func = get_text_from_jsonl
+
+    if index_store is None:
+        index_store = FSBasedIndexStore(
+            base_path=effective_output_base_path,
+        )
+
+    if input_file_paths is None:
+        logger.info(f"Scanning for JSONL files in: {input_base_path}")
+        input_file_paths = find_files_recursive(
+            input_base_path,
+            extension_patterns=['*.jsonl', '*.JSONL']
+        )
+        # Assuming numbered files
+        input_file_paths = natural_sort(input_file_paths)
+        logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+    # Set default base_output_path for tokenized samples if not provided in kwargs
+    if 'base_output_path' not in sample_generator_kwargs:
+        default_base_output_path = os.path.join(
+            effective_output_base_path, "tokenized-samples"
+        )
+        logger.info(f"Tokenized samples output path: {default_base_output_path}")
+        sample_generator_kwargs['base_output_path'] = default_base_output_path
+
+    sample_generator = TokenizedSampleGenerator(
+        process_text_line_func=process_text_line_func,
+        tokenizer_func=tokenizer_func,
+        eos_idx=eos_idx,
+        **sample_generator_kwargs
+    )
+
+    return FileTextLinesIndexer(
+        input_file_paths=input_file_paths,
+        index_store=index_store,
+        sample_generator=sample_generator,
+        description="Tokenizing and indexing",
+        name=name,
+    )
+
+
+def create_tokenize_and_index_with_aux_func(
+    process_parts_func: ProcessPartsFunc,
+    tokenizer_func: TokenizerFunc,
+    eos_idx: int,
+    aux_generators: Dict[str, AuxDataGenerator],
+    input_base_path: Optional[str] = None,
+    input_file_paths: Optional[List[str]] = None,
+    output_base_path: Optional[str] = None,
+    index_store: Optional[IndexStoreInterface] = None,
+    logger: Optional[logging.Logger] = None,
+    name: Optional[str] = None,
+    **sample_generator_kwargs,
+) -> FileTextLinesIndexer:
+    """
+    Create a pipeline to tokenize structured samples with auxiliary data.
+
+    This function creates a pipeline that:
+    * Processes structured input (parts with types) from JSONL files
+    * Tokenizes each part and generates auxiliary data (e.g., loss masks)
+    * Stores concatenated token + aux data in bin files
+    * Creates an index with schema for random access
+
+    Usage:
+    ```python
+    from data_forager.sample_generators.aux import Part, LossMaskGenerator
+
+    def parse_parts(line_bytes: bytes) -> List[Part]:
+        data = json.loads(line_bytes.decode('utf-8'))
+        return [Part(type=p['type'], text=p['text']) for p in data['parts']]
+
+    indexer = create_tokenize_and_index_with_aux_func(
+        process_parts_func=parse_parts,
+        tokenizer_func=tokenizer.encode,
+        eos_idx=tokenizer.eos_token_id,
+        aux_generators={'loss_mask': LossMaskGenerator()},
+        input_base_path='./data',
+        sample_size=4096,
+    )
+
+    indexer()
+    ```
+
+    :param process_parts_func: Function to extract typed parts from input bytes.
+        Takes JSONL bytes and returns List[Part].
+    :param tokenizer_func: Function used to tokenize text.
+    :param eos_idx: EOS token index, known by the used Tokenizer.
+    :param aux_generators: Dict mapping names to AuxDataGenerator instances.
+        Example: {'loss_mask': LossMaskGenerator()}
+    :param input_base_path: Path to directory containing JSONL files.
+    :param input_file_paths: List of file paths to process.
+    :param output_base_path: Base path for output (index and tokenized samples).
+    :param index_store: Index store to use. Must support set_sample_schema().
+    :param logger: Logger to use.
+    :param name: Name of the indexer for logging.
+    :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleWithAuxGenerator
+        (e.g., sample_size, token_dtype).
+
+    :raises ValueError: If both input_base_path and input_file_paths are None.
+    :raises ValueError: If output destination cannot be determined.
+
+    :return: FileTextLinesIndexer instance that can be called to run the pipeline.
+    """
+    # Import here to avoid circular dependency
+    from data_forager.sample_generators.tokenization_with_aux import (
+        TokenizedSampleWithAuxGenerator,
+    )
+
+    if logger is None:
+        logger = module_logger
+
+    # Validate input source
+    if input_base_path is None and input_file_paths is None:
+        raise ValueError(
+            "Either input_base_path or input_file_paths must be provided"
+        )
+
+    # Determine output base path
+    effective_output_base_path = output_base_path or input_base_path
+
+    # Validate output destination
+    if index_store is None and effective_output_base_path is None:
+        raise ValueError(
+            "Either index_store, output_base_path, or input_base_path must be provided "
+            "to determine where to store the index"
+        )
+
+    logger.info(f"Output base path: {effective_output_base_path}")
+
+    if index_store is None:
+        index_store = FSBasedIndexStore(
+            base_path=effective_output_base_path,
+        )
+
+    if input_file_paths is None:
+        logger.info(f"Scanning for JSONL files in: {input_base_path}")
+        input_file_paths = find_files_recursive(
+            input_base_path,
+            extension_patterns=['*.jsonl', '*.JSONL']
+        )
+        input_file_paths = natural_sort(input_file_paths)
+        logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+    # Set default base_output_path for tokenized samples if not provided
+    if 'base_output_path' not in sample_generator_kwargs:
+        default_base_output_path = os.path.join(
+            effective_output_base_path, "tokenized-samples"
+        )
+        logger.info(f"Tokenized samples output path: {default_base_output_path}")
+        sample_generator_kwargs['base_output_path'] = default_base_output_path
+
+    sample_generator = TokenizedSampleWithAuxGenerator(
+        process_parts_func=process_parts_func,
+        tokenizer_func=tokenizer_func,
+        eos_idx=eos_idx,
+        aux_generators=aux_generators,
+        **sample_generator_kwargs
+    )
+
+    return FileTextLinesIndexer(
+        input_file_paths=input_file_paths,
+        index_store=index_store,
+        sample_generator=sample_generator,
+        description="Tokenizing with aux data and indexing",
+        name=name,
+    )
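
To tie the new pieces together, here is a sketch of running the aux pipeline and reading the result back with `TokensWithAuxDataset`. The `parse_parts` helper mirrors the docstring example above; the read-back step assumes the base `Dataset` class (not part of this diff) supports integer indexing, and the array names used at the end (`tokens`, `loss_mask`) are assumptions about the schema the generator produces.

```python
import json
from typing import List

import tiktoken

from data_forager.datasets.tokens_with_aux import TokensWithAuxDataset
from data_forager.indexers.tokenization_indexer import create_tokenize_and_index_with_aux_func
from data_forager.sample_generators.aux import Part, LossMaskGenerator

enc = tiktoken.get_encoding("gpt2")


def parse_parts(line_bytes: bytes) -> List[Part]:
    # Placeholder: expects JSONL objects of the form {"parts": [{"type": ..., "text": ...}, ...]}
    data = json.loads(line_bytes.decode("utf-8"))
    return [Part(type=p["type"], text=p["text"]) for p in data["parts"]]


# Tokenize + index (writes "tokenized-samples" and "index" under ./data)
indexer = create_tokenize_and_index_with_aux_func(
    process_parts_func=parse_parts,
    tokenizer_func=enc.encode_ordinary,
    eos_idx=enc.eot_token,
    aux_generators={"loss_mask": LossMaskGenerator()},
    input_base_path="./data",
    sample_size=1024,
)
indexer()

# Read back: each sample is parsed into a dict of numpy arrays keyed by the schema's array names
dataset = TokensWithAuxDataset.create_from_index_on_filesystem(base_path="./data")
first_sample = dataset[0]  # assumes the base Dataset supports integer indexing
print(first_sample["tokens"].shape, first_sample["loss_mask"].shape)  # key names are assumed
```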

data_forager-0.2.0/data_forager/sample_generators/__init__.py
@@ -0,0 +1,30 @@
+"""
+Sample generators for transforming input data into samples.
+
+This package contains:
+- SampleGeneratorInterface: Protocol for sample generators
+- SampleData: Data class for sample information
+- SampleSchema, ArraySpec: Schema classes for structured samples
+- TokenizedSampleGenerator: Tokenizes text into fixed-length samples
+- TokenizedSampleWithAuxGenerator: Tokenizes with auxiliary data (loss masks, etc.)
+"""
+
+from data_forager.sample_generators.common import (
+    SampleData,
+    SampleGeneratorInterface,
+    NOOPSampleGenerator,
+    noop_sample_processing,
+)
+from data_forager.sample_generators.schema import (
+    ArraySpec,
+    SampleSchema,
+)
+
+__all__ = [
+    "ArraySpec",
+    "NOOPSampleGenerator",
+    "SampleData",
+    "SampleGeneratorInterface",
+    "SampleSchema",
+    "noop_sample_processing",
+]
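
With these re-exports, downstream code can pull the schema types from the package root instead of the individual modules. A small sketch (constructor keywords follow the usage in `IndexStore._load_schema` above; the concrete values are illustrative):

```python
from data_forager.sample_generators import ArraySpec, SampleSchema

# Hypothetical schema: 1024 uint16 tokens followed by 1024 uint8 loss-mask flags per sample.
schema = SampleSchema(
    sample_size=1024,
    arrays=[
        ArraySpec(name="tokens", dtype="uint16", offset=0),
        ArraySpec(name="loss_mask", dtype="uint8", offset=2048),
    ],
    total_bytes_per_sample=3072,
)
```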