data-forager 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data_forager/index_stores/common.py +9 -0
- data_forager/index_stores/fs_based.py +29 -7
- data_forager/indexers/text_lines.py +2 -0
- data_forager/indexers/tokenization_indexer.py +106 -32
- {data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/METADATA +1 -1
- {data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/RECORD +9 -9
- {data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/WHEEL +0 -0
- {data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/licenses/LICENSE +0 -0
- {data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/top_level.txt +0 -0
data_forager/index_stores/common.py
@@ -6,16 +6,25 @@ from data_forager.sample_index import SampleIndex
 class IndexStoreInterface(Protocol):

     def init_store(self):
+        """Initialize the store for writing. Must be called before add_sample()."""
         ...

     def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
+        """Add a sample location to the index."""
+        ...
+
+    def close(self):
+        """Close the store, flushing any buffered data. Must be called after all samples are added."""
         ...

     def load(self) -> SampleIndex:
+        """Load the index from the store."""
         ...

     def exists(self) -> bool:
+        """Check if the index already exists."""
         ...

     def clear(self) -> None:
+        """Remove the index."""
         ...
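The protocol now spells out a write/read lifecycle: init_store() before the first add_sample(), close() after the last one, then load(). As a rough, hypothetical illustration (the helper below is not part of data-forager; only the method names and the IndexStore constructor signature come from this diff), a caller indexing the byte span of each line in a file might drive an implementation like this:

```python
# Illustrative sketch only: index_lines is not part of the package.
# It exercises the write-side lifecycle documented above: init_store() once,
# add_sample() per record, close() to flush the buffered handles (new in 0.1.3).
from data_forager.index_stores.fs_based import IndexStore


def index_lines(file_path: str, store: IndexStore) -> None:
    """Record the byte span of every line in file_path into the given store."""
    store.init_store()
    offset = 0
    with open(file_path, "rb") as f:
        for line in f:
            store.add_sample(file_path, offset, len(line))
            offset += len(line)
    store.close()


# Usage (paths are placeholders):
# index_lines("./data/train.jsonl", IndexStore(base_path="./index-output"))
```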
data_forager/index_stores/fs_based.py
@@ -1,6 +1,6 @@
 import os.path
 import shutil
-from typing import Optional
+from typing import Optional, TextIO, BinaryIO

 import numpy as np

@@ -10,7 +10,7 @@ from data_forager.index_stores.common import IndexStoreInterface
 from data_forager.sample_index import SampleIndex


-class IndexStore(
+class IndexStore(Base, IndexStoreInterface):

     def __init__(self, base_path: str, index_data_folder: str = "index", name: Optional[str] = None):
         """
@@ -25,12 +25,24 @@ class IndexStore(IndexStoreInterface, Base):
         self._index_data_path = os.path.join(base_path, index_data_folder)
         self._file_locations = []

+        # File handles for buffered writing
+        self._file_location_handle: Optional[TextIO] = None
+        self._sample_locations_handle: Optional[BinaryIO] = None
+
     def init_store(self):
         if os.path.exists(self._index_data_path):
             raise ValueError(f"Provided index path already exists: {self._index_data_path}")

         os.makedirs(self._index_data_path, exist_ok=True)

+        # Open file handles for writing
+        self._file_location_handle = open(
+            os.path.join(self._index_data_path, "file_location.txt"), "a"
+        )
+        self._sample_locations_handle = open(
+            os.path.join(self._index_data_path, "sample_locations.bin"), "ab"
+        )
+
     def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
         """
         :param file_location:
@@ -40,15 +52,25 @@ class IndexStore(IndexStoreInterface, Base):
         """
         if file_location not in self._file_locations:
             self._file_locations.append(file_location)
-
-            f.writelines([file_location+'\n'])
+            self._file_location_handle.write(file_location + '\n')

         file_index = self._file_locations.index(file_location)

-
-
+        sample_location_bytes = np.array([file_index, byte_offset, num_bytes], dtype=np.uint64).tobytes()
+        self._sample_locations_handle.write(sample_location_bytes)
+
+    def close(self):
+        """Close file handles and flush buffered data."""
+        if self._file_location_handle is not None:
+            self._file_location_handle.close()
+            self._file_location_handle = None
+
+        if self._sample_locations_handle is not None:
+            self._sample_locations_handle.close()
+            self._sample_locations_handle = None

-
+    def __del__(self):
+        self.close()

     def load(self) -> SampleIndex:
         with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
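With the new handles, add_sample() appends one path per line to file_location.txt and one (file_index, byte_offset, num_bytes) triple of uint64 values to sample_locations.bin. A standalone sketch of reading that raw layout back (the function name is illustrative; the package's own load() presumably does the equivalent and wraps the result in a SampleIndex, which this diff does not show):

```python
# Standalone sketch, not part of the package: read back the raw on-disk layout
# written by add_sample() above.
import os
import numpy as np


def read_raw_index(index_data_path: str):
    with open(os.path.join(index_data_path, "file_location.txt"), "r") as f:
        file_locations = [line.rstrip("\n") for line in f]

    raw = np.fromfile(os.path.join(index_data_path, "sample_locations.bin"), dtype=np.uint64)
    # Each sample was written as three uint64 values: file_index, byte_offset, num_bytes.
    sample_locations = raw.reshape(-1, 3)
    return file_locations, sample_locations
```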
data_forager/indexers/tokenization_indexer.py
@@ -1,5 +1,6 @@
 from typing import Callable, List, Optional

+import logging
 import os
 from pathlib import Path

@@ -8,7 +9,11 @@ import json
 import numpy as np

 from basics.base import Base
+from basics.logging import get_logger

+module_logger = get_logger(os.path.basename(__file__))
+
+from data_forager.index_stores.common import IndexStoreInterface
 from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore
 from data_forager.indexers.text_lines import SampleData, FileTextLinesIndexer, SampleGeneratorInterface
 from data_forager.utils import find_files_recursive, natural_sort
@@ -24,61 +29,125 @@ def get_text_from_jsonl(jsonl_bytes: bytes, text_key: str = "text", text_encodin


 def create_tokenize_and_index_jsonl_text_func(
-    input_base_path: str,
     tokenizer_func: TokenizerFunc,
     eos_idx: int,
+    input_base_path: Optional[str] = None,
+    input_file_paths: Optional[List[str]] = None,
+    output_base_path: Optional[str] = None,
+    index_store: Optional[IndexStoreInterface] = None,
     process_text_line_func: Optional[ProcessTextLineFunc] = None,
+    logger: Optional[logging.Logger] = None,
     name: Optional[str] = None,
     **sample_generator_kwargs,
 ) -> FileTextLinesIndexer:
     """
-    Create
-
-
-    *
+    Create a pipeline to tokenize text from JSONL files and create an index for random access.
+
+    The pipeline:
+    * Tokenizes text from input JSONL objects
+    * Stores the token data in bin files under "tokenized-samples" folder
+    * Stores index data under "index" folder

     Usage:
-
+    ```python
     import tiktoken

     enc = tiktoken.get_encoding("gpt2")
     def tokenize_text(text: str) -> List[int]:
-        return
+        return enc.encode_ordinary(text)
+
+    # Option 1: Scan directory for JSONL files, output to same directory
+    indexer = create_tokenize_and_index_jsonl_text_func(
+        tokenizer_func=tokenize_text,
+        eos_idx=enc.eot_token,
+        input_base_path='./data',
+        sample_size=1024,
+    )

-
-
+    # Option 2: Explicit input files and output path
+    indexer = create_tokenize_and_index_jsonl_text_func(
         tokenizer_func=tokenize_text,
-
+        eos_idx=enc.eot_token,
+        input_file_paths=['./data/train.jsonl'],
+        output_base_path='./output',
+        sample_size=1024,
     )

-    #
-
+    # Run tokenization and indexing
+    indexer()
+    ```

-    :param input_base_path: Path to directory containing JSONL files (searched recursively).
     :param tokenizer_func: Function used to tokenize text.
-    :param eos_idx: EOS token index, known by the used Tokenizer
+    :param eos_idx: EOS token index, known by the used Tokenizer.
+    :param input_base_path: Path to directory containing JSONL files (searched recursively).
+        Used as fallback for output if `output_base_path` is not provided.
+    :param input_file_paths: List of file paths to process. If provided, these are used
+        instead of scanning `input_base_path` for JSONL files.
+    :param output_base_path: Base path for output (index and tokenized samples).
+        If not provided, `input_base_path` is used.
+    :param index_store: Index store to use. If provided, this is used instead of
+        creating a new FSBasedIndexStore.
     :param process_text_line_func: Function used to process text lines.
         By default, this converts input JSON lines to dicts and returns the "text" field.
         See function get_text_from_jsonl().
-    :param
-    :param name:
+    :param logger: Logger to use. If not provided, uses module logger.
+    :param name: Name of the indexer, used for logging purposes.
+    :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleGenerator
+        (e.g., sample_size, token_dtype, base_output_path).
+
+    :raises ValueError: If both `input_base_path` and `input_file_paths` are None.
+    :raises ValueError: If `index_store` is None and both `output_base_path` and
+        `input_base_path` are None.

-    :return: FileTextLinesIndexer instance that can be
-
+    :return: FileTextLinesIndexer instance that can be called to run tokenization
+        and indexing.
     """
-    if
-
+    if logger is None:
+        logger = module_logger

-
-
-
-
-
-
-
+    # Validate input source
+    if input_base_path is None and input_file_paths is None:
+        raise ValueError(
+            "Either input_base_path or input_file_paths must be provided"
+        )
+
+    # Determine output base path
+    effective_output_base_path = output_base_path or input_base_path

-    #
-
+    # Validate output destination
+    if index_store is None and effective_output_base_path is None:
+        raise ValueError(
+            "Either index_store, output_base_path, or input_base_path must be provided "
+            "to determine where to store the index"
+        )
+
+    logger.info(f"Output base path: {effective_output_base_path}")
+
+    if process_text_line_func is None:
+        process_text_line_func = get_text_from_jsonl
+
+    if index_store is None:
+        index_store = FSBasedIndexStore(
+            base_path=effective_output_base_path,
+        )
+
+    if input_file_paths is None:
+        logger.info(f"Scanning for JSONL files in: {input_base_path}")
+        input_file_paths = find_files_recursive(
+            input_base_path,
+            extension_patterns=['*.jsonl', '*.JSONL']
+        )
+        # Assuming numbered files
+        input_file_paths = natural_sort(input_file_paths)
+        logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+    # Set default base_output_path for tokenized samples if not provided in kwargs
+    if 'base_output_path' not in sample_generator_kwargs:
+        default_base_output_path = os.path.join(
+            effective_output_base_path, "tokenized-samples"
+        )
+        logger.info(f"Tokenized samples output path: {default_base_output_path}")
+        sample_generator_kwargs['base_output_path'] = default_base_output_path

     sample_generator = TokenizedSampleGenerator(
         process_text_line_func=process_text_line_func,
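The new index_store parameter lets a caller supply a pre-built store instead of having the function create an FSBasedIndexStore under the output path. A hedged sketch of such a call, reusing tokenize_text and enc from the docstring example above (the paths are placeholders, and passing base_output_path explicitly is an assumption based on the changed code, which only derives a "tokenized-samples" directory when an output or input base path is available):

```python
# Hedged sketch: an "Option 3" call style that is not in the docstring above.
# It passes a pre-built store via the new index_store parameter; tokenize_text
# and enc are assumed to be defined as in the docstring example.
from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore

store = FSBasedIndexStore(base_path="./index-output")

indexer = create_tokenize_and_index_jsonl_text_func(
    tokenizer_func=tokenize_text,
    eos_idx=enc.eot_token,
    input_file_paths=["./data/train.jsonl"],
    index_store=store,
    # Without output_base_path, the default tokenized-samples location is not
    # derived, so pass base_output_path through to TokenizedSampleGenerator:
    base_output_path="./tokenized-output",
    sample_size=1024,
)
indexer()
```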
@@ -96,7 +165,7 @@ def create_tokenize_and_index_jsonl_text_func(
     )


-class TokenizedSampleGenerator(
+class TokenizedSampleGenerator(Base, SampleGeneratorInterface):

     def __init__(
         self,
@@ -215,6 +284,10 @@ class TokenizedSampleGenerator(SampleGeneratorInterface, Base):
         tokenized_text = self._tokenizer_func(input_text)

         if self._sample_size is not None:
+            # Always append EOS after each document to mark document boundary
+            tokenized_text = tokenized_text + [self._eos_idx]
+
+            # Prepend any leftover tokens from previous document
             if self._rest_tokens is not None:
                 tokenized_text = self._rest_tokens + tokenized_text
                 self._rest_tokens = None
@@ -224,8 +297,9 @@ class TokenizedSampleGenerator(SampleGeneratorInterface, Base):
             num_rest_tokens = num_tokens % self._sample_size

             if num_rest_tokens > 0:
-
-
+                # Store remainder tokens (includes EOS from this document)
+                self._rest_tokens = tokenized_text[-num_rest_tokens:]
+                tokenized_text = tokenized_text[:num_samples * self._sample_size]

             tokenized_samples = np.array(tokenized_text, dtype=self._token_dtype)
             tokenized_samples = tokenized_samples.reshape(-1, self._sample_size)
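Taken together, these two hunks make TokenizedSampleGenerator terminate every document with the EOS token and carry partial chunks over to the next document instead of dropping them mid-stream. A standalone sketch of that packing behaviour (the function name, dtype, and toy values are illustrative, not from the package):

```python
# Standalone sketch of the packing logic shown above: each document gets an EOS
# appended, leftovers carry over to the next document, and only full
# sample_size chunks are emitted.
from typing import List, Optional

import numpy as np


def pack_documents(docs_tokens: List[List[int]], eos_idx: int, sample_size: int) -> np.ndarray:
    rest_tokens: Optional[List[int]] = None
    chunks = []
    for tokens in docs_tokens:
        tokens = tokens + [eos_idx]            # mark the document boundary
        if rest_tokens is not None:
            tokens = rest_tokens + tokens      # carry-over from the previous document
            rest_tokens = None

        num_samples = len(tokens) // sample_size
        num_rest = len(tokens) % sample_size
        if num_rest > 0:
            rest_tokens = tokens[-num_rest:]   # keep the remainder for the next document
            tokens = tokens[:num_samples * sample_size]
        if num_samples > 0:
            chunks.append(np.array(tokens, dtype=np.uint16).reshape(-1, sample_size))

    return np.concatenate(chunks) if chunks else np.empty((0, sample_size), dtype=np.uint16)


# Two short "documents" packed into samples of 4 tokens each:
# pack_documents([[1, 2, 3], [4, 5, 6, 7, 8]], eos_idx=0, sample_size=4)
# -> [[1, 2, 3, 0], [4, 5, 6, 7]]   (tokens [8, 0] stay in rest_tokens)
```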
{data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/RECORD
@@ -7,14 +7,14 @@ data_forager/datasets/common.py,sha256=OB0DzuxuLfd9BP68K9vTNzzBfK-L6ZzIFqe-FZTwQ
 data_forager/datasets/jsonl.py,sha256=enOjWRT-AJTF3tWtNlonCqdDpZfVsK8If6yEtlA8tns,630
 data_forager/datasets/tokens.py,sha256=OP5MNb9uBDSX_Of6lNVLs5CAj46RwkP4gGDk-94lD40,597
 data_forager/index_stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-data_forager/index_stores/common.py,sha256=
-data_forager/index_stores/fs_based.py,sha256=
+data_forager/index_stores/common.py,sha256=HYPZfCtmbLDxjPgCoUseahicTEI1el7V1s5tSyZxkfs,780
+data_forager/index_stores/fs_based.py,sha256=1MB_iZPZS187rlLRUNPIJcQK-lU9EqEAg3sARnT2vSk,3564
 data_forager/indexers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 data_forager/indexers/jsonl_indexer.py,sha256=xvogFjEcKPNr-GBkKEr8WnWPVvJbkWQzUBrGwVELlA4,1395
-data_forager/indexers/text_lines.py,sha256=
-data_forager/indexers/tokenization_indexer.py,sha256=
-data_forager-0.1.
-data_forager-0.1.
-data_forager-0.1.
-data_forager-0.1.
-data_forager-0.1.
+data_forager/indexers/text_lines.py,sha256=XMm5oc0btP7I16z87g1fmq9AqJyVhDOvR2cDu_zFZio,5093
+data_forager/indexers/tokenization_indexer.py,sha256=t-7Q3PLAJ0DYZT6LWdHeahk9Hz9OQsvWfoPvhHIneMk,13927
+data_forager-0.1.3.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
+data_forager-0.1.3.dist-info/METADATA,sha256=S8OWa1JSkQB77oVn3QMVEPN6obnvePn12jlojzZvELo,9090
+data_forager-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+data_forager-0.1.3.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
+data_forager-0.1.3.dist-info/RECORD,,
{data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/WHEEL: file without changes
{data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/licenses/LICENSE: file without changes
{data_forager-0.1.1.dist-info → data_forager-0.1.3.dist-info}/top_level.txt: file without changes