data-forager 0.1.6__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_forager/datasets/tokens_with_aux.py +91 -0
- data_forager/index_stores/fs_based.py +77 -4
- data_forager/indexers/text_lines.py +28 -73
- data_forager/indexers/tokenization_indexer.py +158 -191
- data_forager/sample_generators/__init__.py +30 -0
- data_forager/sample_generators/aux/__init__.py +18 -0
- data_forager/sample_generators/aux/common.py +77 -0
- data_forager/sample_generators/aux/loss_mask.py +78 -0
- data_forager/sample_generators/common.py +117 -0
- data_forager/sample_generators/schema.py +54 -0
- data_forager/sample_generators/tokenization.py +210 -0
- data_forager/sample_generators/tokenization_with_aux.py +250 -0
- data_forager/sample_index.py +34 -2
- {data_forager-0.1.6.dist-info → data_forager-0.2.0.dist-info}/METADATA +1 -1
- data_forager-0.2.0.dist-info/RECORD +29 -0
- {data_forager-0.1.6.dist-info → data_forager-0.2.0.dist-info}/WHEEL +1 -1
- data_forager-0.1.6.dist-info/RECORD +0 -20
- {data_forager-0.1.6.dist-info → data_forager-0.2.0.dist-info}/licenses/LICENSE +0 -0
- {data_forager-0.1.6.dist-info → data_forager-0.2.0.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Dataset for reading tokenized samples with auxiliary data.
|
|
3
|
+
|
|
4
|
+
This module provides TokensWithAuxDataset which reads samples containing
|
|
5
|
+
both tokens and auxiliary data (e.g., loss masks) using the schema stored
|
|
6
|
+
in the sample index.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Dict, Optional
|
|
10
|
+
|
|
11
|
+
import numpy as np
|
|
12
|
+
|
|
13
|
+
from data_forager.sample_index import SampleIndex
|
|
14
|
+
from data_forager.datasets.common import Dataset
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class TokensWithAuxDataset(Dataset):
    """
    Dataset that returns tokens together with auxiliary data.

    Each stored sample is a single byte string in which several equally long
    arrays (tokens plus auxiliary arrays such as loss masks) are concatenated.
    The ``sample_schema`` of the sample index lists, per array, its name,
    dtype and byte offset; ``_process_sample`` uses that schema to split the
    bytes back into named numpy arrays.

    Requires ``sample_schema`` in the SampleIndex. Use TokensDataset for
    indexes without auxiliary data.
    """

    @classmethod
    def create_from_index_on_filesystem(
        cls,
        base_path: str,
        name: Optional[str] = None,
    ) -> "TokensWithAuxDataset":
        """
        Create a TokensWithAuxDataset from an index stored on the filesystem.

        :param base_path: Base path where the index is stored.
        :param name: Optional name for logging.

        :return: TokensWithAuxDataset instance.

        :raises ValueError: If the loaded index has no sample_schema.
        """
        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid an import cycle -- confirm before moving it.
        from data_forager.index_stores.fs_based import IndexStore

        index_store = IndexStore(base_path=base_path)
        sample_index = index_store.load()

        return cls(sample_index=sample_index, name=name)

    def __init__(
        self,
        sample_index: SampleIndex,
        name: Optional[str] = None,
        **kwargs,
    ):
        """
        Initialize the dataset.

        :param sample_index: SampleIndex with sample_schema describing the
            structure of samples.
        :param name: Optional name for logging.
        :param kwargs: Forwarded unchanged to the Dataset base class.

        :raises ValueError: If sample_index has no sample_schema.
        """
        super().__init__(sample_index, name=name, **kwargs)

        if sample_index.sample_schema is None:
            raise ValueError(
                "SampleIndex has no sample_schema. "
                "Use TokensDataset for indexes without auxiliary data."
            )

        self._schema = sample_index.sample_schema

    def _process_sample(self, sample_bytes: bytes) -> Dict[str, np.ndarray]:
        """
        Parse concatenated bytes into named arrays.

        Every array in the schema holds ``sample_size`` elements of its own
        dtype and starts at its own byte offset within ``sample_bytes``.

        :param sample_bytes: Raw bytes containing all arrays concatenated.

        :return: Dict mapping array names to numpy arrays. The arrays are
            views on ``sample_bytes`` (no copy is made).

        :raises ValueError: If ``sample_bytes`` is too short to hold an array
            described by the schema.
        """
        sample_size = self._schema.sample_size
        available = len(sample_bytes)

        result = {}
        for array_spec in self._schema.arrays:
            dtype = np.dtype(array_spec.dtype)
            start = array_spec.offset
            end = start + sample_size * dtype.itemsize

            # Slicing a too-short buffer would silently clamp and hand back a
            # truncated array; fail loudly instead so corrupt or mismatched
            # samples are detected where they are read.
            if end > available:
                raise ValueError(
                    f"Sample has {available} bytes, but array "
                    f"'{array_spec.name}' needs bytes {start}..{end} "
                    f"according to the sample schema."
                )

            # count/offset read directly from the buffer, avoiding the
            # intermediate copy a bytes slice would create.
            result[array_spec.name] = np.frombuffer(
                sample_bytes,
                dtype=dtype,
                count=sample_size,
                offset=start,
            )
        return result
|
|
@@ -1,3 +1,11 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Filesystem-based index store for persisting sample indexes.
|
|
3
|
+
|
|
4
|
+
This module provides IndexStore which saves and loads sample indexes
|
|
5
|
+
to/from the filesystem.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import json
|
|
1
9
|
import os.path
|
|
2
10
|
import shutil
|
|
3
11
|
from typing import Optional, TextIO, BinaryIO
|
|
@@ -8,19 +16,28 @@ from basics.base import Base
|
|
|
8
16
|
|
|
9
17
|
from data_forager.index_stores.common import IndexStoreInterface
|
|
10
18
|
from data_forager.sample_index import SampleIndex
|
|
19
|
+
from data_forager.sample_generators.schema import SampleSchema, ArraySpec
|
|
11
20
|
|
|
12
21
|
|
|
13
22
|
class IndexStore(Base, IndexStoreInterface):
|
|
23
|
+
"""
|
|
24
|
+
Filesystem-based index store for saving and loading sample indexes.
|
|
25
|
+
|
|
26
|
+
Stores index data in a directory with the following structure:
|
|
27
|
+
- file_location.txt: List of file paths (relative to base_path)
|
|
28
|
+
- sample_locations.bin: Binary array of (file_index, byte_offset, num_bytes)
|
|
29
|
+
- sample_schema.json: Optional schema for structured samples
|
|
30
|
+
"""
|
|
14
31
|
|
|
15
32
|
def __init__(self, base_path: str, index_data_folder: str = "index", name: Optional[str] = None):
|
|
16
33
|
"""
|
|
34
|
+
Initialize the index store.
|
|
17
35
|
|
|
18
36
|
:param base_path: Base path where the index files are stored.
|
|
19
37
|
File paths in file_location.txt are stored relative to this path.
|
|
20
|
-
|
|
21
|
-
:param name: Name of instance, if not provided, the classname will be used
|
|
38
|
+
:param index_data_folder: Name of the folder within base_path for index files.
|
|
39
|
+
:param name: Name of instance, if not provided, the classname will be used.
|
|
22
40
|
"""
|
|
23
|
-
|
|
24
41
|
super().__init__(pybase_logger_name=name)
|
|
25
42
|
|
|
26
43
|
self._base_path = os.path.abspath(base_path)
|
|
@@ -31,6 +48,9 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
31
48
|
self._file_location_handle: Optional[TextIO] = None
|
|
32
49
|
self._sample_locations_handle: Optional[BinaryIO] = None
|
|
33
50
|
|
|
51
|
+
# Optional schema for structured samples
|
|
52
|
+
self._sample_schema: Optional[SampleSchema] = None
|
|
53
|
+
|
|
34
54
|
def init_store(self):
|
|
35
55
|
if os.path.exists(self._index_data_path):
|
|
36
56
|
raise ValueError(f"Provided index path already exists: {self._index_data_path}")
|
|
@@ -64,6 +84,15 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
64
84
|
sample_location_bytes = np.array([file_index, byte_offset, num_bytes], dtype=np.uint64).tobytes()
|
|
65
85
|
self._sample_locations_handle.write(sample_location_bytes)
|
|
66
86
|
|
|
87
|
+
def set_sample_schema(self, schema: SampleSchema) -> None:
    """
    Remember the given sample schema and write it to disk right away.

    The schema is kept on the instance and then persisted through
    ``_save_schema()``.

    :param schema: SampleSchema describing the structure of samples.
    """
    # Keep the in-memory copy first; _save_schema() reads it from the instance.
    self._sample_schema = schema
    self._save_schema()
|
|
95
|
+
|
|
67
96
|
def close(self):
|
|
68
97
|
"""Close file handles and flush buffered data."""
|
|
69
98
|
if self._file_location_handle is not None:
|
|
@@ -78,6 +107,11 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
78
107
|
self.close()
|
|
79
108
|
|
|
80
109
|
def load(self) -> SampleIndex:
|
|
110
|
+
"""
|
|
111
|
+
Load the sample index from disk.
|
|
112
|
+
|
|
113
|
+
:return: SampleIndex with file locations, sample locations, and optional schema.
|
|
114
|
+
"""
|
|
81
115
|
with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
|
|
82
116
|
relative_locations = [loc[:-1] if loc[-1]=='\n' else loc for loc in f.readlines()]
|
|
83
117
|
|
|
@@ -91,7 +125,46 @@ class IndexStore(Base, IndexStoreInterface):
|
|
|
91
125
|
sample_locations = np.frombuffer(data, dtype=np.uint64)
|
|
92
126
|
sample_locations = sample_locations.reshape((-1, 3))
|
|
93
127
|
|
|
94
|
-
|
|
128
|
+
# Load schema if it exists
|
|
129
|
+
sample_schema = self._load_schema()
|
|
130
|
+
|
|
131
|
+
return SampleIndex(file_locations, sample_locations, sample_schema)
|
|
132
|
+
|
|
133
|
+
def _save_schema(self) -> None:
    """
    Write the current sample schema to ``sample_schema.json``.

    Does nothing when no schema has been set. The JSON document holds the
    sample size, one ``{name, dtype, offset}`` entry per array, and the total
    number of bytes per sample.
    """
    schema = self._sample_schema
    if schema is None:
        # Nothing to persist for indexes without structured samples.
        return

    target_path = os.path.join(self._index_data_path, "sample_schema.json")

    # Built key by key so the JSON key order stays
    # sample_size -> arrays -> total_bytes_per_sample.
    payload = {"sample_size": schema.sample_size}

    array_entries = []
    for spec in schema.arrays:
        array_entries.append(
            {"name": spec.name, "dtype": spec.dtype, "offset": spec.offset}
        )
    payload["arrays"] = array_entries

    payload["total_bytes_per_sample"] = schema.total_bytes_per_sample

    with open(target_path, "w") as schema_file:
        json.dump(payload, schema_file, indent=2)
|
|
149
|
+
|
|
150
|
+
def _load_schema(self) -> Optional[SampleSchema]:
    """
    Read ``sample_schema.json`` from the index folder, if present.

    :return: The SampleSchema rebuilt from the JSON document, or None when
        the index folder contains no ``sample_schema.json``.
    """
    json_path = os.path.join(self._index_data_path, "sample_schema.json")

    if os.path.exists(json_path):
        with open(json_path, "r") as schema_file:
            raw = json.load(schema_file)

        # Rebuild the per-array specs before assembling the schema itself.
        specs = []
        for entry in raw["arrays"]:
            specs.append(
                ArraySpec(
                    name=entry["name"],
                    dtype=entry["dtype"],
                    offset=entry["offset"],
                )
            )

        return SampleSchema(
            sample_size=raw["sample_size"],
            arrays=specs,
            total_bytes_per_sample=raw["total_bytes_per_sample"],
        )

    # No schema file: the index stores unstructured samples.
    return None
|
|
95
168
|
|
|
96
169
|
def exists(self) -> bool:
|
|
97
170
|
"""Check if the index already exists."""
|
|
@@ -1,5 +1,12 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
"""
|
|
2
|
+
File text lines indexer for building sample indexes.
|
|
3
|
+
|
|
4
|
+
This module provides FileTextLinesIndexer which scans text files line by line,
|
|
5
|
+
uses a sample generator to transform lines into samples, and builds an index
|
|
6
|
+
for random access.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from typing import Optional, List
|
|
3
10
|
|
|
4
11
|
import os
|
|
5
12
|
import sys
|
|
@@ -8,77 +15,19 @@ from basics.base import Base
|
|
|
8
15
|
from tqdm import tqdm
|
|
9
16
|
|
|
10
17
|
from data_forager.index_stores.common import IndexStoreInterface
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
sample_bytes: bytes
|
|
17
|
-
file_path: str
|
|
18
|
-
|
|
19
|
-
class SampleGeneratorInterface(Protocol):
|
|
20
|
-
|
|
21
|
-
def prepare(self, text_file_path: str):
|
|
22
|
-
"""
|
|
23
|
-
Prepare sample generation from a new input text file
|
|
24
|
-
|
|
25
|
-
:param text_file_path: path to text file
|
|
26
|
-
|
|
27
|
-
:return:
|
|
28
|
-
"""
|
|
29
|
-
...
|
|
30
|
-
|
|
31
|
-
def create_samples(self, text_line: bytes) -> List[SampleData]:
|
|
32
|
-
"""
|
|
33
|
-
Creates one or more samples from the given text_line and stores it in one or multiple different files.
|
|
34
|
-
The path to the file(s) in which the samples are stores are also returned.
|
|
35
|
-
|
|
36
|
-
IMPORTANT: it is assumed that each sample returned is stored in a file sequentially in the same order.
|
|
37
|
-
This must also hold over multiple function calls. This is important because the byte offset
|
|
38
|
-
of a sample is derived from the order the samples are returned.
|
|
39
|
-
|
|
40
|
-
:param text_line: Text line in bytes from text_file_path, provided in the prepare phase.
|
|
41
|
-
The function needs to choose a text encoding itself
|
|
42
|
-
|
|
43
|
-
:return: List of DataSample objects. For each created sample the following is given:
|
|
44
|
-
* Its representation in bytes, as used to store the sample
|
|
45
|
-
* The file path to where the sample is stored
|
|
46
|
-
|
|
47
|
-
"""
|
|
48
|
-
...
|
|
49
|
-
|
|
50
|
-
def finish(self, is_last_file: bool):
|
|
51
|
-
"""
|
|
52
|
-
Finish generation of samples from text lines of input file at the `text_file_path` given in the prepare() phase.
|
|
53
|
-
|
|
54
|
-
is_last_file: indicates if the input text file was the last file to be processed
|
|
55
|
-
|
|
56
|
-
:return:
|
|
57
|
-
"""
|
|
58
|
-
...
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
class NOOPSampleGenerator(SampleGeneratorInterface):
|
|
62
|
-
|
|
63
|
-
def __init__(self):
|
|
64
|
-
self._current_text_file = None
|
|
65
|
-
|
|
66
|
-
def prepare(self, text_file_path: str):
|
|
67
|
-
self._current_text_file = text_file_path
|
|
68
|
-
|
|
69
|
-
def create_samples(self, text_line: bytes) -> List[SampleData]:
|
|
70
|
-
return [SampleData(text_line, self._current_text_file)]
|
|
71
|
-
|
|
72
|
-
def finish(self, is_last_file: bool):
|
|
73
|
-
self._current_text_file = None
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
def noop_sample_processing(text_line: bytes, text_file_path: str) -> List[SampleData]:
|
|
77
|
-
|
|
78
|
-
return [SampleData(text_line, text_file_path)]
|
|
18
|
+
from data_forager.sample_generators.common import (
|
|
19
|
+
SampleGeneratorInterface,
|
|
20
|
+
NOOPSampleGenerator,
|
|
21
|
+
)
|
|
79
22
|
|
|
80
23
|
|
|
81
24
|
class FileTextLinesIndexer(Base):
|
|
25
|
+
"""
|
|
26
|
+
Indexes text files by scanning lines and building a sample index.
|
|
27
|
+
|
|
28
|
+
Uses a sample generator to transform text lines into samples, then records
|
|
29
|
+
byte offsets in an index store for O(1) random access during training.
|
|
30
|
+
"""
|
|
82
31
|
|
|
83
32
|
def __init__(
|
|
84
33
|
self,
|
|
@@ -104,13 +53,19 @@ class FileTextLinesIndexer(Base):
|
|
|
104
53
|
|
|
105
54
|
def __call__(self):
|
|
106
55
|
"""
|
|
107
|
-
|
|
108
|
-
E.g. through process_sample_func and/or when processing the data Dataset::_process_sample()
|
|
56
|
+
Run the indexing process.
|
|
109
57
|
|
|
110
|
-
:
|
|
58
|
+
IMPORTANT: Input files are always read in binary mode; applying a text
|
|
59
|
+
encoding is up to the user, e.g., through process_sample_func and/or
|
|
60
|
+
when processing the data in Dataset._process_sample().
|
|
111
61
|
"""
|
|
112
62
|
self._index_store.init_store()
|
|
113
63
|
|
|
64
|
+
# Set schema if the sample generator provides one
|
|
65
|
+
sample_schema = self._sample_generator.get_sample_schema()
|
|
66
|
+
if sample_schema is not None:
|
|
67
|
+
self._index_store.set_sample_schema(sample_schema)
|
|
68
|
+
|
|
114
69
|
byte_offset_map = {}
|
|
115
70
|
|
|
116
71
|
for input_file_path in self._input_file_paths:
|