mldataforge 0.2.1__tar.gz → 0.2.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mldataforge-0.2.1 → mldataforge-0.2.3}/PKG-INFO +1 -2
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/brotli.py +1 -6
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/mds.py +6 -4
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/join.py +3 -2
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/split.py +3 -2
- mldataforge-0.2.3/mldataforge/indexing.py +25 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/options.py +12 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/trafos.py +4 -23
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/utils.py +10 -1
- {mldataforge-0.2.1 → mldataforge-0.2.3}/pyproject.toml +1 -2
- {mldataforge-0.2.1 → mldataforge-0.2.3}/.gitignore +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/LICENSE +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/README.md +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/__main__.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/__init__.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/__init__.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/jsonl.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/parquet.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/compression.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/mds.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/pigz.py +0 -0
- {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/snappy.py +0 -0
{mldataforge-0.2.1 → mldataforge-0.2.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mldataforge
-Version: 0.2.1
+Version: 0.2.3
 Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
 Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
 Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
@@ -10,7 +10,6 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.12
-Requires-Dist: brotlicffi
 Requires-Dist: click
 Requires-Dist: datasets
 Requires-Dist: isal
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/brotli.py
@@ -1,4 +1,4 @@
-import brotlicffi as brotli
+import brotli
 import io
 
 __all__ = ["brotli_open"]
@@ -6,11 +6,6 @@ __all__ = ["brotli_open"]
 def brotli_open(filename, mode='rb', encoding='utf-8', compress_level=11):
     return BrotliFile(filename, mode=mode, encoding=encoding, compress_level=11)
 
-import brotlicffi as brotli
-import io
-
-__all__ = ["brotli_open"]
-
 class BrotliFile:
     def __init__(self, filename, mode='rb', encoding='utf-8', compress_level=11):
         self.filename = filename
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/mds.py
@@ -20,12 +20,13 @@ def mds():
 @batch_size_option()
 @no_bulk_option()
 @trafo_option()
+@shuffle_option()
 def jsonl(**kwargs):
     mds_to_jsonl(**kwargs)
-def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk, trafo):
+def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk, trafo, shuffle):
     check_arguments(output_file, overwrite, yes, mds_directories)
     save_jsonl(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_file,
         compression=compression,
         processes=processes,
@@ -41,12 +42,13 @@ def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite
 @batch_size_option()
 @no_bulk_option()
 @trafo_option()
+@shuffle_option()
 def parquet(**kwargs):
     mds_to_parquet(**kwargs)
-def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk, trafo):
+def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk, trafo, shuffle):
     check_arguments(output_file, overwrite, yes, mds_directories)
     save_parquet(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_file,
         compression=compression,
         batch_size=batch_size,
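In practice this adds a --shuffle option to both converters. A hypothetical invocation (paths and seed invented; --no-bulk is required because the bulk reader rejects shuffling, see utils.py below):

    python -m mldataforge convert mds jsonl out.jsonl path/to/mds --no-bulk --shuffle 42

A negative seed applies the inverse of the permutation drawn from abs(seed), so a copy shuffled with --shuffle 42 can be restored to its original order with --shuffle -42.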
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/join.py
@@ -43,13 +43,14 @@ def join_jsonl(output_file, jsonl_files, compression, processes, overwrite, yes,
 @shard_size_option()
 @no_pigz_option()
 @trafo_option()
+@shuffle_option()
 def mds(**kwargs):
     print(kwargs)
     join_mds(**kwargs)
-def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz, trafo):
+def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz, trafo, shuffle):
     check_arguments(output_dir, overwrite, yes, mds_directories)
     save_mds(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_dir,
         processes=processes,
         compression=compression,
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/split.py
@@ -50,11 +50,12 @@ def split_jsonl(jsonl_files, prefix, output_dir, size_hint, compression, process
 @shard_size_option()
 @no_pigz_option()
 @trafo_option()
+@shuffle_option()
 def mds(*args, **kwargs):
     split_mds(*args, **kwargs)
-def split_mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz, trafo):
+def split_mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz, trafo, shuffle):
     save_mds(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_dir=f"{output_dir}/{prefix}{{part:04d}}",
         processes=processes,
         compression=compression,
mldataforge-0.2.3/mldataforge/indexing.py (new file)
@@ -0,0 +1,25 @@
+import numpy as np
+
+__all__ = ['IndexedDatasetView', 'shuffle_permutation']
+
+class IndexedDatasetView:
+    def __init__(self, dataset, indices):
+        self.dataset = dataset
+        self.indices = list(indices) # ensure repeatable access
+
+    def __iter__(self):
+        for idx in self.indices:
+            yield self.dataset[idx]
+
+    def __len__(self):
+        return len(self.indices)
+
+def shuffle_permutation(n, seed: int):
+    rng = np.random.default_rng(seed)
+    return rng.permutation(n)
+
+def reverse_permutation(indices):
+    n = len(indices)
+    reverse_indices = np.empty(n, dtype=int)
+    reverse_indices[indices] = np.arange(n)
+    return reverse_indices
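A minimal sketch of how these pieces compose (the list-backed dataset and seed are invented for illustration): a permutation applied through IndexedDatasetView is undone by viewing the result under reverse_permutation of the same permutation, which is what makes negative --shuffle seeds work as de-shuffling below.

    from mldataforge.indexing import IndexedDatasetView, reverse_permutation, shuffle_permutation

    data = list(range(10))
    perm = shuffle_permutation(len(data), seed=42)     # seeded permutation of range(10)
    shuffled = list(IndexedDatasetView(data, perm))    # data viewed in permuted order
    restored = list(IndexedDatasetView(shuffled, reverse_permutation(perm)))
    assert restored == data                            # the inverse permutation round-trips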
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/options.py
@@ -13,6 +13,7 @@ __all__ = [
     "processes_option",
     "prefix_option",
     "shard_size_option",
+    "shuffle_option",
     "size_hint_option",
     "trafo_option",
     "yes_option",
@@ -120,6 +121,17 @@ def shard_size_option(default=2**26):
         help=f"Shard size for the dataset (default: {default}).",
     )
 
+def shuffle_option():
+    """
+    Option for specifying whether to shuffle the dataset by providing a random seed.
+    """
+    return click.option(
+        "--shuffle",
+        default=None,
+        type=int,
+        help="Shuffle the dataset by providing a random seed.",
+    )
+
 def size_hint_option(default=2**26):
     """
     Option for specifying the size hint.
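shuffle_option follows the same factory pattern as the other options in this module; a minimal sketch of attaching it to a standalone click command (the command itself is invented):

    import click
    from mldataforge.options import shuffle_option

    @click.command()
    @shuffle_option()
    def show(shuffle):
        # shuffle is None when --shuffle is omitted, otherwise an int seed
        click.echo(f"shuffle seed: {shuffle}")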
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/trafos.py
@@ -1,18 +1,6 @@
 import re
 
-__all__ = ['IndexedDatasetView', 'Transformation', 'Transformations', 'flatten_json', 'identity', 'unflatten_json']
-
-class IndexedDatasetView:
-    def __init__(self, dataset, indices):
-        self.dataset = dataset
-        self.indices = list(indices) # ensure repeatable access
-
-    def __iter__(self):
-        for idx in self.indices:
-            yield self.dataset[idx]
-
-    def __len__(self):
-        return len(self.indices)
+__all__ = ['Transformation', 'Transformations', 'flatten_json', 'identity', 'unflatten_json']
 
 class Transformation:
     def __init__(self, code: str):
@@ -56,33 +44,23 @@ class Transformation:
             return self._last_input_len
         raise TypeError("Length is not available for this transformation.")
 
-
 class Transformations:
     def __init__(self, codes: list[str], indices=None):
         self.pipeline = [Transformation(code) for code in codes]
-        self.indices = indices # Optional index iterable
 
     def __call__(self, dataset):
-        # Wrap dataset with IndexedDatasetView if indices are provided
-        if self.indices is not None:
-            dataset = IndexedDatasetView(dataset, self.indices)
-
         result = dataset
         for transform in self.pipeline:
             result = transform(result)
         return result
 
     def __len__(self):
-        # Return the input length to the pipeline
         if self.indices is not None:
             return len(self.indices)
         elif hasattr(self.pipeline[0], '_last_input_len') and self.pipeline[0]._last_input_len is not None:
             return self.pipeline[0]._last_input_len
         raise TypeError("Transformations length is not available until __call__ is used on a sized input.")
 
-def identity(obj):
-    return obj
-
 def flatten_json(obj, parent_key='', sep='.', escape_char='\\'):
     def escape(key):
         return key.replace(escape_char, escape_char * 2)\
@@ -110,6 +88,9 @@ def flatten_json(obj, parent_key='', sep='.', escape_char='\\'):
         items.append((parent_key, obj))
     return dict(items)
 
+def identity(obj):
+    return obj
+
 def unflatten_json(flat_dict, sep='.', escape_char='\\'):
     def check_flat_json(obj):
         assert isinstance(obj, dict), "Input must be a dictionary"
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/utils.py
@@ -10,6 +10,7 @@ from streaming import StreamingDataset
 from tqdm import tqdm
 
 from .compression import determine_compression, open_compression, pigz_compress
+from .indexing import IndexedDatasetView, reverse_permutation, shuffle_permutation
 from .mds import MDSBulkReader, MDSWriter
 from .pigz import pigz_open
 from .trafos import Transformations
@@ -89,7 +90,9 @@ def load_jsonl_files(jsonl_files):
         return _streaming_jsonl(jsonl_files, compressions)
     return load_dataset("json", data_files=jsonl_files, split="train")
 
-def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True):
+def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True, shuffle=None):
+    if bulk and shuffle is not None:
+        raise ValueError("Bulk reader does not support shuffling by design.")
     if bulk:
         return MDSBulkReader(mds_directories, split=split)
     dss = []
@@ -110,6 +113,12 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True
     else:
         with timing(message=f"Concatenating {len(dss)} datasets"):
             ds = concatenate_datasets(dsets=dss)
+    if shuffle is not None:
+        with timing(message="Creating shuffle indices"):
+            indices = shuffle_permutation(len(ds), seed=abs(shuffle))
+            if shuffle < 0:
+                indices = reverse_permutation(indices)
+        ds = IndexedDatasetView(ds, indices)
     return ds
 
 def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=None, overwrite=True, yes=True, trafo=None):
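For library use, the new keyword surfaces directly on load_mds_directories; a minimal sketch (the directory path is invented):

    from mldataforge.utils import load_mds_directories

    # Shuffling needs the streaming reader: bulk=True with a seed raises ValueError.
    ds = load_mds_directories(["path/to/mds"], bulk=False, shuffle=42)
    for sample in ds:  # IndexedDatasetView yields samples in permuted order
        ...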
{mldataforge-0.2.1 → mldataforge-0.2.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mldataforge"
-version = "0.2.1"
+version = "0.2.3"
 authors = [
   { name = "Peter Schneider-Kamp" }
 ]
@@ -19,7 +19,6 @@ classifiers = [
 ]
 
 dependencies = [
-    'brotlicffi',
     'click',
     'datasets',
     'isal',
|