data-forager 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff shows the changes between two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their public registry.
--- a/data_forager/index_stores/common.py
+++ b/data_forager/index_stores/common.py
@@ -6,16 +6,25 @@ from data_forager.sample_index import SampleIndex
 class IndexStoreInterface(Protocol):
 
     def init_store(self):
+        """Initialize the store for writing. Must be called before add_sample()."""
         ...
 
     def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
+        """Add a sample location to the index."""
+        ...
+
+    def close(self):
+        """Close the store, flushing any buffered data. Must be called after all samples are added."""
         ...
 
     def load(self) -> SampleIndex:
+        """Load the index from the store."""
         ...
 
     def exists(self) -> bool:
+        """Check if the index already exists."""
         ...
 
     def clear(self) -> None:
+        """Remove the index."""
         ...
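Taken together, the expanded protocol implies an explicit write/read lifecycle: `init_store()` before writing, `add_sample()` per sample, `close()` to flush, then `load()` for reading. Below is a minimal usage sketch with the package's FS-based store; the paths and byte ranges are made up for illustration.

```python
from data_forager.index_stores.fs_based import IndexStore

store = IndexStore(base_path="./my-dataset")  # illustrative path

# Write phase
store.init_store()
store.add_sample(file_location="shard-000.jsonl", byte_offset=0, num_bytes=128)
store.add_sample(file_location="shard-000.jsonl", byte_offset=128, num_bytes=256)
store.close()  # flushes the buffered index data

# Read phase
if store.exists():
    sample_index = store.load()
```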
--- a/data_forager/index_stores/fs_based.py
+++ b/data_forager/index_stores/fs_based.py
@@ -1,6 +1,6 @@
 import os.path
 import shutil
-from typing import Optional
+from typing import Optional, TextIO, BinaryIO
 
 import numpy as np
 
@@ -10,7 +10,7 @@ from data_forager.index_stores.common import IndexStoreInterface
 from data_forager.sample_index import SampleIndex
 
 
-class IndexStore(IndexStoreInterface, Base):
+class IndexStore(Base, IndexStoreInterface):
 
     def __init__(self, base_path: str, index_data_folder: str = "index", name: Optional[str] = None):
         """
@@ -25,12 +25,24 @@ class IndexStore(IndexStoreInterface, Base):
         self._index_data_path = os.path.join(base_path, index_data_folder)
         self._file_locations = []
 
+        # File handles for buffered writing
+        self._file_location_handle: Optional[TextIO] = None
+        self._sample_locations_handle: Optional[BinaryIO] = None
+
     def init_store(self):
         if os.path.exists(self._index_data_path):
             raise ValueError(f"Provided index path already exists: {self._index_data_path}")
 
         os.makedirs(self._index_data_path, exist_ok=True)
 
+        # Open file handles for writing
+        self._file_location_handle = open(
+            os.path.join(self._index_data_path, "file_location.txt"), "a"
+        )
+        self._sample_locations_handle = open(
+            os.path.join(self._index_data_path, "sample_locations.bin"), "ab"
+        )
+
     def add_sample(self, file_location: str, byte_offset: int, num_bytes: int):
         """
         :param file_location:
@@ -40,15 +52,25 @@ class IndexStore(IndexStoreInterface, Base):
         """
         if file_location not in self._file_locations:
            self._file_locations.append(file_location)
-            with open(os.path.join(self._index_data_path, "file_location.txt"), "a") as f:
-                f.writelines([file_location+'\n'])
+            self._file_location_handle.write(file_location + '\n')
 
         file_index = self._file_locations.index(file_location)
 
-        with open(os.path.join(self._index_data_path, "sample_locations.bin"), "ab") as f:
-            sample_location_bytes = np.array([file_index, byte_offset, num_bytes], dtype=np.uint64).tobytes()
+        sample_location_bytes = np.array([file_index, byte_offset, num_bytes], dtype=np.uint64).tobytes()
+        self._sample_locations_handle.write(sample_location_bytes)
+
+    def close(self):
+        """Close file handles and flush buffered data."""
+        if self._file_location_handle is not None:
+            self._file_location_handle.close()
+            self._file_location_handle = None
+
+        if self._sample_locations_handle is not None:
+            self._sample_locations_handle.close()
+            self._sample_locations_handle = None
 
-            f.write(sample_location_bytes)
+    def __del__(self):
+        self.close()
 
     def load(self) -> SampleIndex:
         with open(os.path.join(self._index_data_path, "file_location.txt"), "r") as f:
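For reference, the on-disk layout the buffered writer produces is simple: `file_location.txt` holds one source file path per line, and `sample_locations.bin` is a flat sequence of `uint64` triplets `(file_index, byte_offset, num_bytes)`. The sketch below reads that layout directly with NumPy; it assumes the default `index` folder name and is only meant to illustrate the format — `IndexStore.load()` is the supported way to read it.

```python
import os
import numpy as np

index_data_path = "./my-dataset/index"  # <base_path>/<index_data_folder>, illustrative

# One source file path per line; its position is the file_index used below
with open(os.path.join(index_data_path, "file_location.txt"), "r") as f:
    file_locations = [line.rstrip("\n") for line in f]

# Flat uint64 triplets: (file_index, byte_offset, num_bytes)
raw = np.fromfile(os.path.join(index_data_path, "sample_locations.bin"), dtype=np.uint64)
sample_locations = raw.reshape(-1, 3)

file_index, byte_offset, num_bytes = sample_locations[0]
print(file_locations[int(file_index)], int(byte_offset), int(num_bytes))
```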
--- a/data_forager/indexers/text_lines.py
+++ b/data_forager/indexers/text_lines.py
@@ -155,3 +155,5 @@ class FileTextLinesIndexer(Base):
 
         sys.stdout.write('\n\n')
         sys.stdout.flush()
+
+        self._index_store.close()
--- a/data_forager/indexers/tokenization_indexer.py
+++ b/data_forager/indexers/tokenization_indexer.py
@@ -1,5 +1,6 @@
 from typing import Callable, List, Optional
 
+import logging
 import os
 from pathlib import Path
 
@@ -8,7 +9,11 @@ import json
 import numpy as np
 
 from basics.base import Base
+from basics.logging import get_logger
 
+module_logger = get_logger(os.path.basename(__file__))
+
+from data_forager.index_stores.common import IndexStoreInterface
 from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore
 from data_forager.indexers.text_lines import SampleData, FileTextLinesIndexer, SampleGeneratorInterface
 from data_forager.utils import find_files_recursive, natural_sort
@@ -24,61 +29,125 @@ def get_text_from_jsonl(jsonl_bytes: bytes, text_key: str = "text", text_encodin
 
 
 def create_tokenize_and_index_jsonl_text_func(
-        input_base_path: str,
         tokenizer_func: TokenizerFunc,
         eos_idx: int,
+        input_base_path: Optional[str] = None,
+        input_file_paths: Optional[List[str]] = None,
+        output_base_path: Optional[str] = None,
+        index_store: Optional[IndexStoreInterface] = None,
         process_text_line_func: Optional[ProcessTextLineFunc] = None,
+        logger: Optional[logging.Logger] = None,
         name: Optional[str] = None,
         **sample_generator_kwargs,
 ) -> FileTextLinesIndexer:
     """
-    Create function to:
-    * Tokenize text from input JSONL objects, loaded from files at input_base_path (recursively),
-    * Store the token data in bin files under folder "tokenized-samples" in input_base_path
-    * Store index data under folder "index" in input_base_path
+    Create a pipeline to tokenize text from JSONL files and create an index for random access.
+
+    The pipeline:
+    * Tokenizes text from input JSONL objects
+    * Stores the token data in bin files under "tokenized-samples" folder
+    * Stores index data under "index" folder
 
     Usage:
-        # Create pipeline to tokenize text from input JSONL objects and index the token samples
+        ```python
         import tiktoken
 
         enc = tiktoken.get_encoding("gpt2")
         def tokenize_text(text: str) -> List[int]:
-            return tiktoken.enc.encode_ordinary(text)
+            return enc.encode_ordinary(text)
+
+        # Option 1: Scan directory for JSONL files, output to same directory
+        indexer = create_tokenize_and_index_jsonl_text_func(
+            tokenizer_func=tokenize_text,
+            eos_idx=enc.eot_token,
+            input_base_path='./data',
+            sample_size=1024,
+        )
 
-        tokenize_and_index_jsonl_text_func = create_jsonl_text_tokenization_and_indexing_pipeline(
-            input_base_path='.',
+        # Option 2: Explicit input files and output path
+        indexer = create_tokenize_and_index_jsonl_text_func(
             tokenizer_func=tokenize_text,
-            sample_size=1024
+            eos_idx=enc.eot_token,
+            input_file_paths=['./data/train.jsonl'],
+            output_base_path='./output',
+            sample_size=1024,
         )
 
-        # Start tokenization and indexing
-        tokenize_and_index_jsonl_text_func()
+        # Run tokenization and indexing
+        indexer()
+        ```
 
-    :param input_base_path: Path to directory containing JSONL files (searched recursively).
     :param tokenizer_func: Function used to tokenize text.
-    :param eos_idx: EOS token index, known by the used Tokenizer
+    :param eos_idx: EOS token index, known by the used Tokenizer.
+    :param input_base_path: Path to directory containing JSONL files (searched recursively).
+        Used as fallback for output if `output_base_path` is not provided.
+    :param input_file_paths: List of file paths to process. If provided, these are used
+        instead of scanning `input_base_path` for JSONL files.
+    :param output_base_path: Base path for output (index and tokenized samples).
+        If not provided, `input_base_path` is used.
+    :param index_store: Index store to use. If provided, this is used instead of
+        creating a new FSBasedIndexStore.
     :param process_text_line_func: Function used to process text lines.
        By default, this converts input JSON lines to dicts and returns the "text" field.
        See function get_text_from_jsonl().
-    :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleGenerator.
-    :param name: Optional: name of the indexer to create, used for logging purposes
+    :param logger: Logger to use. If not provided, uses module logger.
+    :param name: Name of the indexer, used for logging purposes.
+    :param sample_generator_kwargs: Other kwargs passed to TokenizedSampleGenerator
+        (e.g., sample_size, token_dtype, base_output_path).
+
+    :raises ValueError: If both `input_base_path` and `input_file_paths` are None.
+    :raises ValueError: If `index_store` is None and both `output_base_path` and
+        `input_base_path` are None.
 
-    :return: FileTextLinesIndexer instance that can be used to tokenize and index text from jsonl objects, from
-        JSONL files at input_base_path (recursively)
+    :return: FileTextLinesIndexer instance that can be called to run tokenization
+        and indexing.
     """
-    if process_text_line_func is None:
-        process_text_line_func=get_text_from_jsonl
+    if logger is None:
+        logger = module_logger
 
-    index_store = FSBasedIndexStore(
-        base_path=input_base_path,
-    )
-    input_file_paths = find_files_recursive(
-        input_base_path,
-        extension_patterns=['*.jsonl', '*.JSONL']
-    )
+    # Validate input source
+    if input_base_path is None and input_file_paths is None:
+        raise ValueError(
+            "Either input_base_path or input_file_paths must be provided"
+        )
+
+    # Determine output base path
+    effective_output_base_path = output_base_path or input_base_path
 
-    # Assuming numbered files
-    input_file_paths = natural_sort(input_file_paths)
+    # Validate output destination
+    if index_store is None and effective_output_base_path is None:
+        raise ValueError(
+            "Either index_store, output_base_path, or input_base_path must be provided "
+            "to determine where to store the index"
+        )
+
+    logger.info(f"Output base path: {effective_output_base_path}")
+
+    if process_text_line_func is None:
+        process_text_line_func = get_text_from_jsonl
+
+    if index_store is None:
+        index_store = FSBasedIndexStore(
+            base_path=effective_output_base_path,
+        )
+
+    if input_file_paths is None:
+        logger.info(f"Scanning for JSONL files in: {input_base_path}")
+        input_file_paths = find_files_recursive(
+            input_base_path,
+            extension_patterns=['*.jsonl', '*.JSONL']
+        )
+        # Assuming numbered files
+        input_file_paths = natural_sort(input_file_paths)
+        logger.info(f"Found {len(input_file_paths)} JSONL file(s)")
+
+    # Set default base_output_path for tokenized samples if not provided in kwargs
+    if 'base_output_path' not in sample_generator_kwargs:
+        default_base_output_path = os.path.join(
+            effective_output_base_path, "tokenized-samples"
+        )
+        logger.info(f"Tokenized samples output path: {default_base_output_path}")
+        sample_generator_kwargs['base_output_path'] = default_base_output_path
 
     sample_generator = TokenizedSampleGenerator(
         process_text_line_func=process_text_line_func,
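The new `index_store` parameter lets callers bypass the automatically created store. A usage sketch, assuming the same tiktoken setup as the docstring example; the paths and the `index-v2` folder name are illustrative only.

```python
import tiktoken
from typing import List

from data_forager.indexers.tokenization_indexer import create_tokenize_and_index_jsonl_text_func
from data_forager.index_stores.fs_based import IndexStore as FSBasedIndexStore

enc = tiktoken.get_encoding("gpt2")

def tokenize_text(text: str) -> List[int]:
    return enc.encode_ordinary(text)

# A pre-built store is used as-is instead of creating one from output_base_path
custom_store = FSBasedIndexStore(base_path="./output", index_data_folder="index-v2")

indexer = create_tokenize_and_index_jsonl_text_func(
    tokenizer_func=tokenize_text,
    eos_idx=enc.eot_token,
    input_file_paths=['./data/train.jsonl'],
    output_base_path='./output',   # still used to derive the tokenized-samples folder
    index_store=custom_store,
    sample_size=1024,
)
indexer()
```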
@@ -96,7 +165,7 @@ def create_tokenize_and_index_jsonl_text_func(
     )
 
 
-class TokenizedSampleGenerator(SampleGeneratorInterface, Base):
+class TokenizedSampleGenerator(Base, SampleGeneratorInterface):
 
     def __init__(
             self,
@@ -215,6 +284,10 @@ class TokenizedSampleGenerator(SampleGeneratorInterface, Base):
         tokenized_text = self._tokenizer_func(input_text)
 
         if self._sample_size is not None:
+            # Always append EOS after each document to mark document boundary
+            tokenized_text = tokenized_text + [self._eos_idx]
+
+            # Prepend any leftover tokens from previous document
             if self._rest_tokens is not None:
                 tokenized_text = self._rest_tokens + tokenized_text
                 self._rest_tokens = None
@@ -224,8 +297,9 @@ class TokenizedSampleGenerator(SampleGeneratorInterface, Base):
             num_rest_tokens = num_tokens % self._sample_size
 
             if num_rest_tokens > 0:
-                self._rest_tokens = tokenized_text[-num_rest_tokens:] + [self._eos_idx]
-                tokenized_text = tokenized_text[:num_samples*self._sample_size]
+                # Store remainder tokens (includes EOS from this document)
+                self._rest_tokens = tokenized_text[-num_rest_tokens:]
+                tokenized_text = tokenized_text[:num_samples * self._sample_size]
 
             tokenized_samples = np.array(tokenized_text, dtype=self._token_dtype)
             tokenized_samples = tokenized_samples.reshape(-1, self._sample_size)
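The net effect of these two hunks: an EOS token is appended after every document, full samples of `sample_size` tokens are emitted, and any remainder (EOS included) is carried over and prepended to the next document. A standalone sketch of that behaviour, not the package's internal code:

```python
from typing import List, Optional, Tuple

EOS = 0
SAMPLE_SIZE = 4

def chunk_document(tokens: List[int], rest: Optional[List[int]]) -> Tuple[List[List[int]], Optional[List[int]]]:
    tokens = tokens + [EOS]                 # mark the document boundary
    if rest is not None:
        tokens = rest + tokens              # prepend leftover tokens from the previous document
    num_samples = len(tokens) // SAMPLE_SIZE
    num_rest = len(tokens) % SAMPLE_SIZE
    new_rest = tokens[-num_rest:] if num_rest > 0 else None
    samples = [tokens[i * SAMPLE_SIZE:(i + 1) * SAMPLE_SIZE] for i in range(num_samples)]
    return samples, new_rest

samples, rest = chunk_document([1, 2, 3, 4, 5], rest=None)
# samples == [[1, 2, 3, 4]], rest == [5, 0]  -> the EOS travels with the remainder
samples, rest = chunk_document([6, 7], rest=rest)
# tokens become [5, 0, 6, 7, 0] -> samples == [[5, 0, 6, 7]], rest == [0]
```

Compared with 0.1.1, the EOS is now appended before chunking rather than only attached to the leftover tokens, so document boundaries are preserved even when a document's length is an exact multiple of `sample_size`.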
--- a/data_forager-0.1.1.dist-info/METADATA
+++ b/data_forager-0.1.3.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: data-forager
-Version: 0.1.1
+Version: 0.1.3
 Summary: Enabling random access to large datasets on disk for PyTorch training and other use cases
 Author-email: Freddy Snijder <forager@visionscapers.com>
 License-Expression: MIT
--- a/data_forager-0.1.1.dist-info/RECORD
+++ b/data_forager-0.1.3.dist-info/RECORD
@@ -7,14 +7,14 @@ data_forager/datasets/common.py,sha256=OB0DzuxuLfd9BP68K9vTNzzBfK-L6ZzIFqe-FZTwQ
 data_forager/datasets/jsonl.py,sha256=enOjWRT-AJTF3tWtNlonCqdDpZfVsK8If6yEtlA8tns,630
 data_forager/datasets/tokens.py,sha256=OP5MNb9uBDSX_Of6lNVLs5CAj46RwkP4gGDk-94lD40,597
 data_forager/index_stores/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-data_forager/index_stores/common.py,sha256=4ak_LGOWp1jhRVkjyxuYPQ7vf7qNiSIfJB6UwKl-X0I,383
-data_forager/index_stores/fs_based.py,sha256=apwSQegacadBr9GB1amcelz8TseeAahYNA1cNv1gCjs,2798
+data_forager/index_stores/common.py,sha256=HYPZfCtmbLDxjPgCoUseahicTEI1el7V1s5tSyZxkfs,780
+data_forager/index_stores/fs_based.py,sha256=1MB_iZPZS187rlLRUNPIJcQK-lU9EqEAg3sARnT2vSk,3564
 data_forager/indexers/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 data_forager/indexers/jsonl_indexer.py,sha256=xvogFjEcKPNr-GBkKEr8WnWPVvJbkWQzUBrGwVELlA4,1395
-data_forager/indexers/text_lines.py,sha256=Z5XmlhfR1DiLAF_2GUO88H5SJFOhUGwQ0vkcGfltRXg,5058
-data_forager/indexers/tokenization_indexer.py,sha256=gzAuDwIJenNklfHkegDoZz-JzIOXmvuwj4r10wtbJpo,10908
-data_forager-0.1.1.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
-data_forager-0.1.1.dist-info/METADATA,sha256=P_ZlNLS3x9O2OkPyNLgsB58k_jooKMlnpDYkh7l9x9w,9090
-data_forager-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-data_forager-0.1.1.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
-data_forager-0.1.1.dist-info/RECORD,,
+data_forager/indexers/text_lines.py,sha256=XMm5oc0btP7I16z87g1fmq9AqJyVhDOvR2cDu_zFZio,5093
+data_forager/indexers/tokenization_indexer.py,sha256=t-7Q3PLAJ0DYZT6LWdHeahk9Hz9OQsvWfoPvhHIneMk,13927
+data_forager-0.1.3.dist-info/licenses/LICENSE,sha256=If0vYAiJJUtbASoyZPVhvTu3e3m4WB1cQmUpvo9HRTc,1071
+data_forager-0.1.3.dist-info/METADATA,sha256=S8OWa1JSkQB77oVn3QMVEPN6obnvePn12jlojzZvELo,9090
+data_forager-0.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+data_forager-0.1.3.dist-info/top_level.txt,sha256=mLcF2mYnfdaeJ_vIa7hT-MtTpUvn7kgyaWNuxXZ1Ds8,13
+data_forager-0.1.3.dist-info/RECORD,,