mldataforge 0.1.1__tar.gz → 0.1.2__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- mldataforge-0.1.2/PKG-INFO +59 -0
- mldataforge-0.1.2/README.md +30 -0
- mldataforge-0.1.2/mldataforge/__main__.py +4 -0
- mldataforge-0.1.2/mldataforge/commands/__init__.py +13 -0
- mldataforge-0.1.2/mldataforge/commands/join.py +64 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/pyproject.toml +6 -1
- mldataforge-0.1.1/PKG-INFO +0 -20
- mldataforge-0.1.1/README.md +0 -2
- mldataforge-0.1.1/mldataforge/__main__.py +0 -12
- mldataforge-0.1.1/mldataforge/commands/__init__.py +0 -3
- {mldataforge-0.1.1 → mldataforge-0.1.2}/.gitignore +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/LICENSE +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/commands/convert/__init__.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/commands/convert/jsonl.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/commands/convert/mds.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/commands/convert/parquet.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/mds.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/options.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/pigz.py +0 -0
- {mldataforge-0.1.1 → mldataforge-0.1.2}/mldataforge/utils.py +0 -0
mldataforge-0.1.2/PKG-INFO
ADDED
@@ -0,0 +1,59 @@
+Metadata-Version: 2.4
+Name: mldataforge
+Version: 0.1.2
+Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
+Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
+Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
+Author: Peter Schneider-Kamp
+License-File: LICENSE
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python :: 3
+Requires-Python: >=3.12
+Requires-Dist: click
+Requires-Dist: datasets
+Requires-Dist: mltiming
+Requires-Dist: mosaicml-streaming
+Provides-Extra: all
+Requires-Dist: build; extra == 'all'
+Requires-Dist: pytest; extra == 'all'
+Requires-Dist: pytest-dependency; extra == 'all'
+Requires-Dist: twine; extra == 'all'
+Provides-Extra: dev
+Requires-Dist: build; extra == 'dev'
+Requires-Dist: twine; extra == 'dev'
+Provides-Extra: test
+Requires-Dist: pytest; extra == 'test'
+Requires-Dist: pytest-dependency; extra == 'test'
+Description-Content-Type: text/markdown
+
+# mldatasets
+swiss army knife of scripts for transforming and processing datasets for machine learning
+
+## scope
+Currently, mldataforge provides space- and time-efficient conversions between JSONL (with or without compression), MosaiclML Dataset (MDS format), and Parquet. The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. Consequently, mldataforge is an excellent choice when transforming TB-scale datasets on data processing nodes with many cores.
+
+## installation and general usage
+```
+pip install mldataforge
+python -m mldataforge --help
+```
+
+## usage example: converting MosaiclML Dataset (MDS) to Parquet format
+```
+Usage: python -m mldataforge convert mds parquet [OPTIONS] OUTPUT_FILE
+                                                 MDS_DIRECTORIES...
+
+Options:
+  --compression [snappy|gzip|zstd]
+                        Compress the output file (default: snappy).
+  --overwrite           Overwrite existing path.
+  --yes                 Assume yes to all prompts. Use with caution
+                        as it will remove files or even entire
+                        directories without confirmation.
+  --batch-size INTEGER  Batch size for loading data and writing
+                        files (default: 65536).
+  --no-bulk             Use a custom space and time-efficient bulk
+                        reader (only gzip and no compression).
+  --help                Show this message and exit.
+```
mldataforge-0.1.2/README.md
ADDED
@@ -0,0 +1,30 @@
+# mldatasets
+swiss army knife of scripts for transforming and processing datasets for machine learning
+
+## scope
+Currently, mldataforge provides space- and time-efficient conversions between JSONL (with or without compression), MosaiclML Dataset (MDS format), and Parquet. The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. Consequently, mldataforge is an excellent choice when transforming TB-scale datasets on data processing nodes with many cores.
+
+## installation and general usage
+```
+pip install mldataforge
+python -m mldataforge --help
+```
+
+## usage example: converting MosaiclML Dataset (MDS) to Parquet format
+```
+Usage: python -m mldataforge convert mds parquet [OPTIONS] OUTPUT_FILE
+                                                 MDS_DIRECTORIES...
+
+Options:
+  --compression [snappy|gzip|zstd]
+                        Compress the output file (default: snappy).
+  --overwrite           Overwrite existing path.
+  --yes                 Assume yes to all prompts. Use with caution
+                        as it will remove files or even entire
+                        directories without confirmation.
+  --batch-size INTEGER  Batch size for loading data and writing
+                        files (default: 65536).
+  --no-bulk             Use a custom space and time-efficient bulk
+                        reader (only gzip and no compression).
+  --help                Show this message and exit.
+```
mldataforge-0.1.2/mldataforge/commands/join.py
ADDED
@@ -0,0 +1,64 @@
+import click
+from datasets import load_dataset
+
+from ..options import *
+from ..utils import *
+
+__all__ = ["join"]
+
+@click.group()
+def join():
+    pass
+
+@join.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("jsonl_files", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+@processes_option()
+@overwrite_option()
+@yes_option()
+def jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
+    check_arguments(output_file, overwrite, yes, jsonl_files)
+    save_jsonl(
+        load_dataset("json", data_files=jsonl_files, split="train"),
+        output_file,
+        compression=compression,
+        processes=processes,
+    )
+
+@join.command()
+@click.argument("output_dir", type=click.Path(exists=False), required=True)
+@click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+@processes_option()
+@overwrite_option()
+@yes_option()
+@batch_size_option()
+@buf_size_option()
+@no_bulk_option()
+def mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk):
+    check_arguments(output_dir, overwrite, yes, mds_directories)
+    save_mds(
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        output_dir,
+        processes=processes,
+        compression=compression,
+        buf_size=buf_size,
+        pigz=use_pigz(compression),
+    )
+
+@join.command()
+@click.argument("output_file", type=click.Path(exists=False), required=True)
+@click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
+@compression_option("snappy", ["snappy", "gzip", "zstd"])
+@overwrite_option()
+@yes_option()
+@batch_size_option()
+def parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
+    check_arguments(output_file, overwrite, yes, parquet_files)
+    save_parquet(
+        load_dataset("parquet", data_files=parquet_files, split="train"),
+        output_file,
+        compression=compression,
+        batch_size=batch_size,
+    )
{mldataforge-0.1.1 → mldataforge-0.1.2}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mldataforge"
-version = "0.1.1"
+version = "0.1.2"
 authors = [
     { name = "Peter Schneider-Kamp" }
 ]
@@ -25,6 +25,11 @@ dependencies = [
     'mosaicml-streaming'
 ]
 
+[project.optional-dependencies]
+test = ["pytest", "pytest-dependency"]
+dev = ["build", "twine"]
+all = ["build", "twine", "pytest", "pytest-dependency"]
+
 [project.urls]
 "Homepage" = "https://github.com/schneiderkamplab/mldataforge"
 "Bug Tracker" = "https://github.com/schneiderkamplab/mldataforge/issues"
mldataforge-0.1.1/PKG-INFO
DELETED
@@ -1,20 +0,0 @@
-Metadata-Version: 2.4
-Name: mldataforge
-Version: 0.1.1
-Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
-Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
-Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
-Author: Peter Schneider-Kamp
-License-File: LICENSE
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.12
-Requires-Dist: click
-Requires-Dist: datasets
-Requires-Dist: mltiming
-Requires-Dist: mosaicml-streaming
-Description-Content-Type: text/markdown
-
-# mldatasets
-swiss army knife of scripts for transforming and processing datasets for machine learning
mldataforge-0.1.1/README.md
DELETED