mldataforge 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mldataforge-0.1.5 → mldataforge-0.1.7}/PKG-INFO +1 -1
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/commands/join.py +1 -1
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/compression.py +5 -1
- mldataforge-0.1.7/mldataforge/mds.py +448 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/snappy.py +0 -54
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/utils.py +12 -7
- {mldataforge-0.1.5 → mldataforge-0.1.7}/pyproject.toml +1 -1
- mldataforge-0.1.5/mldataforge/mds.py +0 -95
- {mldataforge-0.1.5 → mldataforge-0.1.7}/.gitignore +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/LICENSE +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/README.md +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/__main__.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/brotli.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/commands/__init__.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/commands/convert/__init__.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/commands/convert/jsonl.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/commands/convert/mds.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/commands/convert/parquet.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/commands/split.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/options.py +0 -0
- {mldataforge-0.1.5 → mldataforge-0.1.7}/mldataforge/pigz.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: mldataforge
|
3
|
-
Version: 0.1.5
|
3
|
+
Version: 0.1.7
|
4
4
|
Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
|
5
5
|
Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
|
6
6
|
Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
|
@@ -53,7 +53,7 @@ def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes
|
|
53
53
|
compression=compression,
|
54
54
|
buf_size=buf_size,
|
55
55
|
shard_size=shard_size,
|
56
|
-
pigz=use_pigz(compression, no_pigz)
|
56
|
+
pigz=use_pigz(compression, no_pigz),
|
57
57
|
)
|
58
58
|
|
59
59
|
@join.command()
|
@@ -30,7 +30,7 @@ JSONL_COMPRESSIONS = dict(
|
|
30
30
|
)
|
31
31
|
MDS_COMPRESSIONS = dict(
|
32
32
|
default=None,
|
33
|
-
choices=["none", "brotli", "bz2", "gzip", "pigz", "snappy", "zstd"],
|
33
|
+
choices=["none", "brotli", "bz2", "gzip", "pigz", "snappy", "zstd", "sample::brotli", "sample::bz2", "sample::gzip", "sample::snappy", "sample::zstd"],
|
34
34
|
)
|
35
35
|
PARQUET_COMPRESSIONS = dict(
|
36
36
|
default="snappy",
|
@@ -55,6 +55,10 @@ def determine_compression(fmt, file_path, compression="infer", no_pigz=False):
|
|
55
55
|
return "gz"
|
56
56
|
if compression == "brotli":
|
57
57
|
return "br"
|
58
|
+
if compression == "sample::gzip":
|
59
|
+
return "gz"
|
60
|
+
if compression == "sample::brotli":
|
61
|
+
return "br"
|
58
62
|
return compression
|
59
63
|
if fmt == "parquet":
|
60
64
|
return compression
|
@@ -0,0 +1,448 @@
|
|
1
|
+
from copy import deepcopy
|
2
|
+
import json
|
3
|
+
import numpy as np
|
4
|
+
import os
|
5
|
+
import shutil
|
6
|
+
from streaming.base.compression import compress, decompress, get_compression_extension, is_compression
|
7
|
+
from streaming.base.format import _readers
|
8
|
+
from streaming.base.format.base.reader import FileInfo, JointReader
|
9
|
+
from streaming.base.format.index import get_index_basename
|
10
|
+
from streaming.base.format.mds.encodings import mds_decode, mds_encode, is_mds_encoding, is_mds_encoding_safe, get_mds_encodings, get_mds_encoded_size
|
11
|
+
from streaming.base.hashing import get_hash, is_hash
|
12
|
+
from streaming.base.util import bytes_to_int
|
13
|
+
from typing import Any, Optional, Generator, Self, Union
|
14
|
+
|
15
|
+
from .utils import open_compression
|
16
|
+
|
17
|
+
__all__ = [
|
18
|
+
"MDSBulkReader",
|
19
|
+
"MDSBulkShardReader",
|
20
|
+
"MDSReader",
|
21
|
+
"MDSWriter",
|
22
|
+
]
|
23
|
+
|
24
|
+
class MDSBulkReader:
    """Iterate over every sample of one or more MDS directories, shard by shard.

    On construction the ``index.json`` of each directory is read and the shard
    files it lists are recorded; no shard file is opened until iteration.

    Args:
        dirnames: Directories that each contain an ``index.json``.
        split: Optional sub-directory joined onto every entry of ``dirnames``;
            ``None`` uses the directories as given.

    Raises:
        OSError: If an ``index.json`` cannot be opened.
        KeyError: If an index or shard entry lacks one of the keys read here
            (``shards``, ``raw_data``, ``zip_data``, ``compression``, ``samples``).
    """

    def __init__(
        self,
        dirnames: list[str],
        split: Optional[str],
    ) -> None:
        # One {"filename", "compression"} dict per shard, in index order.
        self.shards = []
        # Total sample count summed over all shards of all directories.
        self.samples = 0
        for dirname in dirnames:
            if split is not None:
                dirname = os.path.join(dirname, split)
            # Close the index file deterministically instead of leaving it to
            # the garbage collector.
            with open(os.path.join(dirname, "index.json"), 'rt') as index_file:
                index = json.load(index_file)
            for shard in index["shards"]:
                # Use the compressed file when the index lists one, otherwise the raw file.
                file_info = shard['raw_data'] if shard['zip_data'] is None else shard['zip_data']
                self.shards.append({
                    "filename": os.path.join(dirname, file_info['basename']),
                    "compression": shard['compression'],
                })
                self.samples += shard['samples']

    def __len__(self) -> int:
        """Return the total number of samples announced by the indexes."""
        return self.samples

    def __iter__(self) -> Generator[dict[str, Any], None, None]:
        """Yield decoded samples, reading the shards one after another."""
        for shard in self.shards:
            # The context manager closes each shard file before the next one is opened.
            with MDSBulkShardReader(**shard) as reader:
                yield from reader
|
53
|
+
|
54
|
+
class MDSBulkShardReader:
    """Read the samples of a single MDS shard file in one forward pass.

    As parsed here, a shard starts with a ``uint32`` sample count, followed by
    ``samples + 1`` ``uint32`` byte offsets, a JSON header carrying the column
    metadata, and finally the sample payloads. Samples must be requested in
    increasing order because the file is only ever read forwards (no seeking).

    Args:
        filename: Path of the shard file.
        compression: Compression name handed to ``open_compression``. A value
            of the form ``"sample::<algo>"`` means the file itself is opened
            without compression and each sample is decompressed with ``<algo>``.
    """

    def __init__(
        self,
        filename: str,
        compression: Optional[str],
    ) -> None:
        self.sample_compression = None
        if compression is not None and compression.startswith("sample::"):
            compression, self.sample_compression = None, compression.removeprefix("sample::")
        self.fp = open_compression(filename, "rb", compression=compression)
        try:
            # Plain ints keep later arithmetic out of fixed-width numpy scalars.
            self.samples = int(np.frombuffer(self.fp.read(4), np.uint32)[0])
            self.index = np.frombuffer(self.fp.read((1 + self.samples) * 4), np.uint32)
            # The JSON header fills the gap between the offset table and the first sample.
            info = json.loads(self.fp.read(int(self.index[0]) - self.fp.tell()))
            self.column_encodings = info["column_encodings"]
            self.column_names = info["column_names"]
            self.column_sizes = info["column_sizes"]
            assert self.fp.tell() == self.index[0]
        except BaseException:
            # Do not leak the open file when the header is truncated or malformed.
            self.fp.close()
            raise

    def decode_sample(self, data: bytes) -> dict[str, Any]:
        """Decode one uncompressed sample payload into a ``{column: value}`` dict.

        The payload begins with one ``uint32`` length for every column whose
        entry in ``column_sizes`` is falsy, followed by the column values in
        column order.
        """
        sizes = []
        idx = 0
        for _, size in zip(self.column_names, self.column_sizes):
            if not size:
                # Variable-sized column: its length is stored in the sample head.
                size, = np.frombuffer(data[idx:idx + 4], np.uint32)
                idx += 4
            sizes.append(size)
        sample = {}
        for key, encoding, size in zip(self.column_names, self.column_encodings, sizes):
            sample[key] = mds_decode(encoding, data[idx:idx + size])
            idx += size
        return sample

    def get_sample_data(self, idx: int) -> bytes:
        """Return the stored bytes of sample ``idx``.

        The file position must already be at the start of that sample, which
        holds when samples are read strictly in order; the assertions guard
        this invariant.
        """
        begin, end = (int(offset) for offset in self.index[idx:idx + 2])
        assert self.fp.tell() == begin
        data = self.fp.read(end - begin)
        assert self.fp.tell() == end
        assert data
        return data

    def get_item(self, idx: int) -> dict[str, Any]:
        """Read, optionally decompress, and decode sample ``idx``."""
        data = self.get_sample_data(idx)
        if self.sample_compression is not None:
            data = decompress(self.sample_compression, data)
        return self.decode_sample(data)

    def __iter__(self) -> Generator[dict[str, Any], None, None]:
        """Yield every sample of the shard in storage order."""
        for i in range(self.samples):
            yield self.get_item(i)

    def __enter__(self) -> "MDSBulkShardReader":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the shard file; exceptions from the ``with`` body propagate."""
        self.fp.close()
|
112
|
+
|
113
|
+
|
114
|
+
class MDSWriter:
|
115
|
+
|
116
|
+
format = 'mds'
|
117
|
+
extra_bytes_per_sample = 4
|
118
|
+
|
119
|
+
def __init__(self,
|
120
|
+
*,
|
121
|
+
columns: dict[str, str],
|
122
|
+
out: Union[str, tuple[str, str]],
|
123
|
+
compression: Optional[str] = None,
|
124
|
+
hashes: Optional[list[str]] = None,
|
125
|
+
size_limit: Optional[Union[int, str]] = 1 << 26,
|
126
|
+
**kwargs: Any) -> None:
|
127
|
+
compression = compression or None
|
128
|
+
sample_compression = None
|
129
|
+
if compression is not None and compression.startswith("sample::"):
|
130
|
+
compression, sample_compression = None, compression.removeprefix("sample::")
|
131
|
+
if compression:
|
132
|
+
if not is_compression(compression):
|
133
|
+
raise ValueError(f'Invalid compression: {compression}.')
|
134
|
+
if sample_compression:
|
135
|
+
if not is_compression(sample_compression):
|
136
|
+
raise ValueError(f'Invalid sample compression: {sample_compression}.')
|
137
|
+
hashes = hashes or []
|
138
|
+
if list(hashes) != sorted(hashes):
|
139
|
+
raise ValueError('Hashes must be unique and in sorted order.')
|
140
|
+
for algo in hashes:
|
141
|
+
if not is_hash(algo):
|
142
|
+
raise ValueError(f'Invalid hash: {algo}.')
|
143
|
+
|
144
|
+
size_limit_value = None
|
145
|
+
if size_limit:
|
146
|
+
size_limit_value = bytes_to_int(size_limit)
|
147
|
+
if size_limit_value < 0:
|
148
|
+
raise ValueError(f'`size_limit` must be greater than zero, instead, ' +
|
149
|
+
f'found as {size_limit_value}.')
|
150
|
+
if size_limit_value >= 2**32:
|
151
|
+
raise ValueError(f'`size_limit` must be less than 2**32, instead, ' +
|
152
|
+
f'found as {size_limit_value}. This is because sample ' +
|
153
|
+
f'byte offsets are stored with uint32.')
|
154
|
+
|
155
|
+
# Validate keyword arguments
|
156
|
+
invalid_kwargs = [
|
157
|
+
arg for arg in kwargs.keys()
|
158
|
+
if arg not in ('progress_bar', 'exist_ok')
|
159
|
+
]
|
160
|
+
if invalid_kwargs:
|
161
|
+
raise ValueError(f'Invalid Writer argument(s): {invalid_kwargs} ')
|
162
|
+
|
163
|
+
self.compression = compression
|
164
|
+
self.sample_compression = sample_compression
|
165
|
+
self.hashes = hashes
|
166
|
+
self.size_limit = size_limit_value
|
167
|
+
self.new_samples: list[bytes]
|
168
|
+
self.new_shard_size: int
|
169
|
+
|
170
|
+
self.shards = []
|
171
|
+
|
172
|
+
# Remove local directory if requested prior to creating writer
|
173
|
+
self.local = os.path.expanduser(out)
|
174
|
+
if os.path.exists(self.local) and len(os.listdir(self.local)) != 0:
|
175
|
+
if kwargs.get('exist_ok', False):
|
176
|
+
raise FileExistsError(f'Directory is not empty: {self.local}')
|
177
|
+
shutil.rmtree(self.local)
|
178
|
+
os.makedirs(self.local, exist_ok=True)
|
179
|
+
|
180
|
+
self.columns = columns
|
181
|
+
self.column_names = []
|
182
|
+
self.column_encodings = []
|
183
|
+
self.column_sizes = []
|
184
|
+
for name in sorted(columns):
|
185
|
+
encoding = columns[name]
|
186
|
+
if not is_mds_encoding(encoding):
|
187
|
+
raise TypeError(f'MDSWriter passed column `{name}` with encoding `{encoding}` ' +
|
188
|
+
f'is unsupported. Supported encodings are {get_mds_encodings()}')
|
189
|
+
size = get_mds_encoded_size(encoding)
|
190
|
+
self.column_names.append(name)
|
191
|
+
self.column_encodings.append(encoding)
|
192
|
+
self.column_sizes.append(size)
|
193
|
+
|
194
|
+
obj = self.get_config()
|
195
|
+
text = json.dumps(obj, sort_keys=True)
|
196
|
+
self.config_data = text.encode('utf-8')
|
197
|
+
self.extra_bytes_per_shard = 4 + 4 + len(self.config_data)
|
198
|
+
self._reset_cache()
|
199
|
+
|
200
|
+
def encode_sample(self, sample: dict[str, Any]) -> bytes:
|
201
|
+
sizes = []
|
202
|
+
data = []
|
203
|
+
for key, encoding, size in zip(self.column_names, self.column_encodings,
|
204
|
+
self.column_sizes):
|
205
|
+
value = sample[key]
|
206
|
+
datum = mds_encode(encoding, value)
|
207
|
+
if size is None:
|
208
|
+
size = len(datum)
|
209
|
+
sizes.append(size)
|
210
|
+
else:
|
211
|
+
if size != len(datum):
|
212
|
+
raise KeyError(f'Unexpected data size; was this data typed with the correct ' +
|
213
|
+
f'encoding ({encoding})?')
|
214
|
+
data.append(datum)
|
215
|
+
head = np.array(sizes, np.uint32).tobytes()
|
216
|
+
body = b''.join(data)
|
217
|
+
sample_data = head + body
|
218
|
+
if self.sample_compression:
|
219
|
+
sample_data = compress(self.sample_compression, sample_data)
|
220
|
+
return sample_data
|
221
|
+
|
222
|
+
def encode_joint_shard(self) -> bytes:
|
223
|
+
num_samples = np.uint32(len(self.new_samples))
|
224
|
+
sizes = list(map(len, self.new_samples))
|
225
|
+
offsets = np.array([0] + sizes).cumsum().astype(np.uint32)
|
226
|
+
offsets += len(num_samples.tobytes()) + len(offsets.tobytes()) + len(self.config_data)
|
227
|
+
sample_data = b''.join(self.new_samples)
|
228
|
+
return num_samples.tobytes() + offsets.tobytes() + self.config_data + sample_data
|
229
|
+
|
230
|
+
def flush_shard(self) -> None:
|
231
|
+
raw_data_basename, zip_data_basename = self._name_next_shard()
|
232
|
+
raw_data = self.encode_joint_shard()
|
233
|
+
raw_data_info, zip_data_info = self._process_file(raw_data, raw_data_basename,
|
234
|
+
zip_data_basename)
|
235
|
+
obj = {
|
236
|
+
'samples': len(self.new_samples),
|
237
|
+
'raw_data': raw_data_info,
|
238
|
+
'zip_data': zip_data_info
|
239
|
+
}
|
240
|
+
obj.update(self.get_config())
|
241
|
+
self.shards.append(obj)
|
242
|
+
|
243
|
+
def _reset_cache(self) -> None:
|
244
|
+
self.new_samples = []
|
245
|
+
self.new_shard_size = self.extra_bytes_per_shard
|
246
|
+
|
247
|
+
def _name_next_shard(self, extension: Optional[str] = None) -> tuple[str, Optional[str]]:
|
248
|
+
shard = len(self.shards)
|
249
|
+
parts = ['shard', f'{shard:05}', self.format]
|
250
|
+
if extension:
|
251
|
+
parts.append(extension)
|
252
|
+
raw_basename = '.'.join(parts)
|
253
|
+
if self.compression:
|
254
|
+
ext = get_compression_extension(self.compression)
|
255
|
+
parts.append(ext)
|
256
|
+
zip_basename = '.'.join(parts)
|
257
|
+
else:
|
258
|
+
zip_basename = None
|
259
|
+
return raw_basename, zip_basename
|
260
|
+
|
261
|
+
def _hash(self, data: bytes, basename: str) -> dict[str, Any]:
|
262
|
+
hashes = {}
|
263
|
+
for algo in self.hashes:
|
264
|
+
hashes[algo] = get_hash(algo, data)
|
265
|
+
return {'basename': basename, 'bytes': len(data), 'hashes': hashes}
|
266
|
+
|
267
|
+
def _process_file(self, raw_data: bytes, raw_basename: str,
|
268
|
+
zip_basename: Optional[str]) -> tuple[dict, Optional[dict]]:
|
269
|
+
raw_info = self._hash(raw_data, raw_basename)
|
270
|
+
if zip_basename:
|
271
|
+
zip_data = compress(self.compression, raw_data)
|
272
|
+
zip_info = self._hash(zip_data, zip_basename)
|
273
|
+
data = zip_data
|
274
|
+
basename = zip_basename
|
275
|
+
else:
|
276
|
+
zip_info = None
|
277
|
+
data = raw_data
|
278
|
+
basename = raw_basename
|
279
|
+
filename = os.path.join(self.local, basename)
|
280
|
+
with open(filename, 'wb') as out:
|
281
|
+
out.write(data)
|
282
|
+
return raw_info, zip_info
|
283
|
+
|
284
|
+
def get_config(self) -> dict[str, Any]:
|
285
|
+
return {
|
286
|
+
'version': 2,
|
287
|
+
'format': self.format,
|
288
|
+
'compression': self.compression if self.sample_compression is None else f"sample::{self.sample_compression}",
|
289
|
+
'hashes': self.hashes,
|
290
|
+
'size_limit': self.size_limit,
|
291
|
+
'column_names': self.column_names,
|
292
|
+
'column_encodings': self.column_encodings,
|
293
|
+
'column_sizes': self.column_sizes,
|
294
|
+
}
|
295
|
+
|
296
|
+
def write(self, sample: dict[str, Any]) -> None:
|
297
|
+
new_sample = self.encode_sample(sample)
|
298
|
+
new_sample_size = len(new_sample) + self.extra_bytes_per_sample
|
299
|
+
if self.size_limit and self.size_limit < self.new_shard_size + new_sample_size:
|
300
|
+
self.flush_shard()
|
301
|
+
self._reset_cache()
|
302
|
+
self.new_samples.append(new_sample)
|
303
|
+
self.new_shard_size += new_sample_size
|
304
|
+
|
305
|
+
def _write_index(self) -> None:
|
306
|
+
if self.new_samples:
|
307
|
+
raise RuntimeError('Internal error: not all samples have been written.')
|
308
|
+
basename = get_index_basename()
|
309
|
+
filename = os.path.join(self.local, basename)
|
310
|
+
obj = {
|
311
|
+
'version': 2,
|
312
|
+
'shards': self.shards,
|
313
|
+
}
|
314
|
+
with open(filename, 'w') as out:
|
315
|
+
json.dump(obj, out, sort_keys=True)
|
316
|
+
|
317
|
+
def finish(self) -> None:
|
318
|
+
if self.new_samples:
|
319
|
+
self.flush_shard()
|
320
|
+
self._reset_cache()
|
321
|
+
self._write_index()
|
322
|
+
|
323
|
+
def __enter__(self) -> Self:
|
324
|
+
return self
|
325
|
+
|
326
|
+
def __exit__(self, exc_type, exc, traceback):
|
327
|
+
self.finish()
|
328
|
+
|
329
|
+
class MDSReader(JointReader):
    """Shard reader for the ``mds`` format that also understands per-sample compression.

    A ``compression`` value of the form ``"sample::<algo>"`` is not forwarded
    to the base reader; instead each sample is decompressed with ``<algo>``
    when it is read.
    """

    def __init__(
        self,
        dirname: str,
        split: Optional[str],
        column_encodings: list[str],
        column_names: list[str],
        column_sizes: list[Optional[int]],
        compression: Optional[str],
        hashes: list[str],
        raw_data: FileInfo,
        samples: int,
        size_limit: Optional[Union[int, str]],
        zip_data: Optional[FileInfo],
    ) -> None:
        self.sample_compression = None
        if compression and compression.startswith("sample::"):
            # The shard file is stored uncompressed; only its samples are compressed.
            self.sample_compression = compression.removeprefix("sample::")
            compression = None
        super().__init__(dirname, split, compression, hashes, raw_data, samples, size_limit,
                         zip_data)
        self.column_encodings = column_encodings
        self.column_names = column_names
        self.column_sizes = column_sizes

    @classmethod
    def from_json(cls, dirname: str, split: Optional[str], obj: dict[str, Any]) -> Self:
        """Build a reader from one shard entry of an index.

        Args:
            dirname (str): Local directory containing shards.
            split (str, optional): Which dataset split to use, if any.
            obj (Dict[str, Any]): JSON object describing the shard.

        Returns:
            Self: The constructed reader.

        Raises:
            ValueError: If the entry's version is not 2 or its format is not ``mds``.
        """
        # Work on a copy so the caller's object is left untouched.
        params = deepcopy(obj)
        version = params.pop('version')
        if version != 2:
            raise ValueError(
                f'Unsupported streaming data version: {version}. Expected version 2.')
        data_format = params.pop('format')
        if data_format != 'mds':
            raise ValueError(f'Unsupported data format: {data_format}. Expected to be `mds`.')
        params['dirname'] = dirname
        params['split'] = split
        for field in ('raw_data', 'zip_data'):
            info = params[field]
            params[field] = FileInfo(**info) if info else None
        return cls(**params)

    def validate(self, allow_unsafe_types: bool) -> None:
        """Reject shards with unsafe column encodings unless explicitly allowed.

        Args:
            allow_unsafe_types (bool): When ``False``, raise if any column
                encoding is reported unsafe by ``is_mds_encoding_safe``; when
                ``True``, perform no check.

        Raises:
            ValueError: If an unsafe encoding is found and not allowed.
        """
        if allow_unsafe_types:
            return
        for position, encoding in enumerate(self.column_encodings):
            if is_mds_encoding_safe(encoding):
                continue
            name = self.column_names[position]
            raise ValueError(f'Column {name} contains an unsafe type: {encoding}. To ' +
                             f'proceed anyway, set ``allow_unsafe_types=True``.')

    def decode_sample(self, data: bytes) -> dict[str, Any]:
        """Turn the raw bytes of one sample into a dict of column values.

        Args:
            data (bytes): The (already decompressed) encoded sample.

        Returns:
            Dict[str, Any]: Mapping of column name to decoded value.
        """
        cursor = 0
        lengths = []
        # First pass: collect each column's length, reading a uint32 from the
        # sample head for every column without a fixed size.
        for _, fixed in zip(self.column_names, self.column_sizes):
            if fixed:
                lengths.append(fixed)
                continue
            (length,) = np.frombuffer(data[cursor:cursor + 4], np.uint32)
            lengths.append(length)
            cursor += 4
        # Second pass: slice and decode the values, which follow the head.
        decoded = {}
        for name, encoding, length in zip(self.column_names, self.column_encodings, lengths):
            decoded[name] = mds_decode(encoding, data[cursor:cursor + length])
            cursor += length
        return decoded

    def get_sample_data(self, idx: int) -> bytes:
        """Read one sample's bytes from the raw shard file.

        Args:
            idx (int): Sample index within this shard.

        Returns:
            bytes: Sample data, decompressed if sample compression is in use.

        Raises:
            IndexError: If no bytes are stored for the sample.
        """
        shard_path = os.path.join(self.dirname, self.split, self.raw_data.basename)
        with open(shard_path, 'rb', 0) as shard:
            # Skip the 4-byte sample count; entries idx and idx + 1 of the
            # offset table delimit the sample.
            shard.seek((1 + idx) * 4)
            begin, end = np.frombuffer(shard.read(8), np.uint32)
            shard.seek(begin)
            payload = shard.read(end - begin)
        if not payload:
            raise IndexError(
                f'Relative sample index {idx} is not present in the {self.raw_data.basename} file.'
            )
        if self.sample_compression:
            payload = decompress(self.sample_compression, payload)
        return payload
|
447
|
+
|
448
|
+
_readers["mds"] = MDSReader
|
@@ -36,60 +36,6 @@ class _SnappyWriteWrapper(io.RawIOBase):
|
|
36
36
|
def writable(self):
|
37
37
|
return True
|
38
38
|
|
39
|
-
|
40
|
-
# class _SnappyReadWrapper(io.RawIOBase):
|
41
|
-
# def __init__(self, fileobj):
|
42
|
-
# self.fileobj = fileobj
|
43
|
-
# self.buffer = io.BytesIO()
|
44
|
-
# self.eof = False
|
45
|
-
|
46
|
-
# def _fill_buffer_if_needed(self, min_bytes):
|
47
|
-
# self.buffer.seek(0, io.SEEK_END)
|
48
|
-
# while not self.eof and self.buffer.tell() < min_bytes:
|
49
|
-
# length_bytes = self.fileobj.read(4)
|
50
|
-
# if not length_bytes:
|
51
|
-
# self.eof = True
|
52
|
-
# break
|
53
|
-
# if len(length_bytes) < 4:
|
54
|
-
# self.eof = True # mark as EOF even if last chunk is malformed
|
55
|
-
# break
|
56
|
-
|
57
|
-
# try:
|
58
|
-
# length = struct.unpack(">I", length_bytes)[0]
|
59
|
-
# compressed = self.fileobj.read(length)
|
60
|
-
# if len(compressed) < length:
|
61
|
-
# self.eof = True
|
62
|
-
# break
|
63
|
-
|
64
|
-
# decompressed = snappy.decompress(compressed)
|
65
|
-
# self.buffer.write(decompressed)
|
66
|
-
# except Exception:
|
67
|
-
# self.eof = True
|
68
|
-
# break
|
69
|
-
|
70
|
-
# self.buffer.seek(0)
|
71
|
-
|
72
|
-
# def read(self, size=-1):
|
73
|
-
# if size == -1:
|
74
|
-
# while not self.eof:
|
75
|
-
# self._fill_buffer_if_needed(_CHUNK_SIZE)
|
76
|
-
# result = self.buffer.read()
|
77
|
-
# self.buffer = io.BytesIO()
|
78
|
-
# return result
|
79
|
-
|
80
|
-
# self._fill_buffer_if_needed(size)
|
81
|
-
# data = self.buffer.read(size)
|
82
|
-
# rest = self.buffer.read()
|
83
|
-
# self.buffer = io.BytesIO()
|
84
|
-
# self.buffer.write(rest)
|
85
|
-
# return data
|
86
|
-
|
87
|
-
# def readable(self):
|
88
|
-
# return True
|
89
|
-
|
90
|
-
# def close(self):
|
91
|
-
# self.fileobj.close()
|
92
|
-
|
93
39
|
class _SnappyReadWrapper(io.RawIOBase):
|
94
40
|
def __init__(self, fileobj):
|
95
41
|
self.fileobj = fileobj
|
@@ -6,11 +6,11 @@ import pyarrow as pa
|
|
6
6
|
import pyarrow.parquet as pq
|
7
7
|
import os
|
8
8
|
import shutil
|
9
|
-
from streaming import
|
9
|
+
from streaming import StreamingDataset
|
10
10
|
from tqdm import tqdm
|
11
11
|
|
12
12
|
from .compression import determine_compression, open_compression, pigz_compress
|
13
|
-
from .mds import MDSBulkReader
|
13
|
+
from .mds import MDSBulkReader, MDSWriter
|
14
14
|
from .pigz import pigz_open
|
15
15
|
|
16
16
|
__all__ = [
|
@@ -23,6 +23,11 @@ __all__ = [
|
|
23
23
|
"save_parquet",
|
24
24
|
]
|
25
25
|
|
26
|
+
_NO_PROGESS = False
|
27
|
+
def set_progress(value):
|
28
|
+
global _NO_PROGESS
|
29
|
+
_NO_PROGESS = value
|
30
|
+
|
26
31
|
def _batch_iterable(iterable, batch_size):
|
27
32
|
batch = []
|
28
33
|
for item in iterable:
|
@@ -73,7 +78,7 @@ def _infer_mds_encoding(value):
|
|
73
78
|
return 'pkl'
|
74
79
|
|
75
80
|
def _streaming_jsonl(jsonl_files, compressions):
|
76
|
-
for jsonl_file, compression in tqdm(zip(jsonl_files, compressions), desc="Loading JSONL files", unit="file"):
|
81
|
+
for jsonl_file, compression in tqdm(zip(jsonl_files, compressions), desc="Loading JSONL files", unit="file", disable=_NO_PROGESS):
|
77
82
|
for line in open_compression(jsonl_file, mode="rt", compression=compression):
|
78
83
|
yield json.loads(line)
|
79
84
|
|
@@ -109,7 +114,7 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True
|
|
109
114
|
def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=None, overwrite=True, yes=True):
|
110
115
|
f = None
|
111
116
|
part = 0
|
112
|
-
for item in tqdm(iterable, desc="Writing to JSONL", unit="sample"):
|
117
|
+
for item in tqdm(iterable, desc="Writing to JSONL", unit="sample", disable=_NO_PROGESS):
|
113
118
|
if f is None:
|
114
119
|
part_file = output_file.format(part=part)
|
115
120
|
check_arguments(part_file, overwrite, yes)
|
@@ -127,7 +132,7 @@ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pig
|
|
127
132
|
writer = None
|
128
133
|
part = 0
|
129
134
|
files = []
|
130
|
-
for sample in tqdm(it, desc="Writing to MDS", unit="sample"):
|
135
|
+
for sample in tqdm(it, desc="Writing to MDS", unit="sample", disable=_NO_PROGESS):
|
131
136
|
if writer is None:
|
132
137
|
part_dir = output_dir.format(part=part)
|
133
138
|
check_arguments(part_dir, overwrite, yes)
|
@@ -151,7 +156,7 @@ def save_mds(it, output_dir, processes=64, compression=None, buf_size=2**24, pig
|
|
151
156
|
name2info = {shard["raw_data"]["basename"]: shard for shard in index["shards"]}
|
152
157
|
file_names = [file for file in os.listdir(output_dir) if file.endswith(".mds")]
|
153
158
|
assert set(file_names) == set(name2info.keys())
|
154
|
-
for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file"):
|
159
|
+
for file_name in tqdm(file_names, desc="Compressing with pigz", unit="file", disable=_NO_PROGESS):
|
155
160
|
compressed_file_name = file_name + ".gz"
|
156
161
|
file_path = os.path.join(output_dir, file_name)
|
157
162
|
compressed_file_path = os.path.join(output_dir, compressed_file_name)
|
@@ -169,7 +174,7 @@ def save_parquet(it, output_file, compression=None, batch_size=2**16, size_hint=
|
|
169
174
|
compression = determine_compression("parquet", output_file, compression)
|
170
175
|
writer = None
|
171
176
|
part = 0
|
172
|
-
it = tqdm(it, desc="Writing to Parquet", unit="sample")
|
177
|
+
it = tqdm(it, desc="Writing to Parquet", unit="sample", disable=_NO_PROGESS)
|
173
178
|
for batch in _batch_iterable(it, batch_size):
|
174
179
|
table = pa.Table.from_pylist(batch)
|
175
180
|
if writer is None:
|
@@ -1,95 +0,0 @@
|
|
1
|
-
import gzip
|
2
|
-
import json
|
3
|
-
from mltiming import timing
|
4
|
-
import numpy as np
|
5
|
-
import os
|
6
|
-
import snappy
|
7
|
-
from streaming.base.format.mds.encodings import mds_decode
|
8
|
-
from typing import Any, Optional, Generator
|
9
|
-
|
10
|
-
from .options import MDS_COMPRESSIONS
|
11
|
-
from .utils import open_compression
|
12
|
-
|
13
|
-
class MDSBulkReader:
|
14
|
-
def __init__(
|
15
|
-
self,
|
16
|
-
dirnames: list[str],
|
17
|
-
split: Optional[str],
|
18
|
-
) -> None:
|
19
|
-
self.shards = []
|
20
|
-
self.samples = 0
|
21
|
-
for dirname in dirnames:
|
22
|
-
if split is not None:
|
23
|
-
dirname = os.path.join(dirname, split)
|
24
|
-
index = json.load(open(os.path.join(dirname, "index.json"), 'rt'))
|
25
|
-
for shard in index["shards"]:
|
26
|
-
basename = shard['raw_data']['basename'] if shard['zip_data'] is None else shard['zip_data']['basename']
|
27
|
-
filename = os.path.join(dirname, basename)
|
28
|
-
self.shards.append({
|
29
|
-
"filename": filename,
|
30
|
-
"compression": shard['compression'],
|
31
|
-
})
|
32
|
-
self.samples += shard['samples']
|
33
|
-
|
34
|
-
def __len__(self) -> int:
|
35
|
-
return self.samples
|
36
|
-
|
37
|
-
def __iter__(self) -> Generator[dict[str, Any], None, None]:
|
38
|
-
for shard in self.shards:
|
39
|
-
with MDSShardReader(**shard) as reader:
|
40
|
-
for sample in reader:
|
41
|
-
yield sample
|
42
|
-
|
43
|
-
class MDSShardReader:
|
44
|
-
def __init__(
|
45
|
-
self,
|
46
|
-
filename: str,
|
47
|
-
compression: Optional[str],
|
48
|
-
) -> None:
|
49
|
-
self.fp = open_compression(filename, "rb", compression=compression)
|
50
|
-
self.samples = np.frombuffer(self.fp.read(4), np.uint32)[0]
|
51
|
-
self.index = np.frombuffer(self.fp.read((1+self.samples)*4), np.uint32)
|
52
|
-
info = json.loads(self.fp.read(self.index[0]-self.fp.tell()))
|
53
|
-
self.column_encodings = info["column_encodings"]
|
54
|
-
self.column_names = info["column_names"]
|
55
|
-
self.column_sizes = info["column_sizes"]
|
56
|
-
assert self.fp.tell() == self.index[0]
|
57
|
-
|
58
|
-
def decode_sample(self, data: bytes) -> dict[str, Any]:
|
59
|
-
sizes = []
|
60
|
-
idx = 0
|
61
|
-
for key, size in zip(self.column_names, self.column_sizes):
|
62
|
-
if size:
|
63
|
-
sizes.append(size)
|
64
|
-
else:
|
65
|
-
size, = np.frombuffer(data[idx:idx + 4], np.uint32)
|
66
|
-
sizes.append(size)
|
67
|
-
idx += 4
|
68
|
-
sample = {}
|
69
|
-
for key, encoding, size in zip(self.column_names, self.column_encodings, sizes):
|
70
|
-
value = data[idx:idx + size]
|
71
|
-
sample[key] = mds_decode(encoding, value)
|
72
|
-
idx += size
|
73
|
-
return sample
|
74
|
-
|
75
|
-
def get_sample_data(self, idx: int) -> bytes:
|
76
|
-
begin, end = self.index[idx:idx+2]
|
77
|
-
assert self.fp.tell() == begin
|
78
|
-
data = self.fp.read(end - begin)
|
79
|
-
assert self.fp.tell() == end
|
80
|
-
assert data
|
81
|
-
return data
|
82
|
-
|
83
|
-
def get_item(self, idx: int) -> dict[str, Any]:
|
84
|
-
data = self.get_sample_data(idx)
|
85
|
-
return self.decode_sample(data)
|
86
|
-
|
87
|
-
def __iter__(self) -> Generator[dict[str, Any], None, None]:
|
88
|
-
for i in range(self.samples):
|
89
|
-
yield self.get_item(i)
|
90
|
-
|
91
|
-
def __enter__(self) -> "MDSShardReader":
|
92
|
-
return self
|
93
|
-
|
94
|
-
def __exit__(self, exc_type, exc_value, traceback) -> None:
|
95
|
-
self.fp.close()
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|