mldataforge 0.1.1__tar.gz → 0.1.2__tar.gz

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,59 @@
+ Metadata-Version: 2.4
+ Name: mldataforge
+ Version: 0.1.2
+ Summary: Swiss Army knife of scripts for transforming and processing datasets for machine learning.
+ Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
+ Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
+ Author: Peter Schneider-Kamp
+ License-File: LICENSE
+ Classifier: License :: OSI Approved :: MIT License
+ Classifier: Operating System :: OS Independent
+ Classifier: Programming Language :: Python :: 3
+ Requires-Python: >=3.12
+ Requires-Dist: click
+ Requires-Dist: datasets
+ Requires-Dist: mltiming
+ Requires-Dist: mosaicml-streaming
+ Provides-Extra: all
+ Requires-Dist: build; extra == 'all'
+ Requires-Dist: pytest; extra == 'all'
+ Requires-Dist: pytest-dependency; extra == 'all'
+ Requires-Dist: twine; extra == 'all'
+ Provides-Extra: dev
+ Requires-Dist: build; extra == 'dev'
+ Requires-Dist: twine; extra == 'dev'
+ Provides-Extra: test
+ Requires-Dist: pytest; extra == 'test'
+ Requires-Dist: pytest-dependency; extra == 'test'
+ Description-Content-Type: text/markdown
+
+ # mldataforge
+ Swiss Army knife of scripts for transforming and processing datasets for machine learning
+
+ ## scope
+ Currently, mldataforge provides space- and time-efficient conversions between JSONL (with or without compression), MosaicML Dataset (MDS format), and Parquet. The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. Consequently, mldataforge is an excellent choice when transforming TB-scale datasets on data-processing nodes with many cores.
+
+ ## installation and general usage
+ ```
+ pip install mldataforge
+ python -m mldataforge --help
+ ```
+
+ ## usage example: converting MosaicML Dataset (MDS) to Parquet format
+ ```
+ Usage: python -m mldataforge convert mds parquet [OPTIONS] OUTPUT_FILE
+                                                  MDS_DIRECTORIES...
+
+ Options:
+   --compression [snappy|gzip|zstd]
+                         Compress the output file (default: snappy).
+   --overwrite           Overwrite existing path.
+   --yes                 Assume yes to all prompts. Use with caution,
+                         as it will remove files or even entire
+                         directories without confirmation.
+   --batch-size INTEGER  Batch size for loading data and writing
+                         files (default: 65536).
+   --no-bulk             Use a custom space- and time-efficient bulk
+                         reader (gzip and no compression only).
+   --help                Show this message and exit.
+ ```
@@ -0,0 +1,30 @@
+ # mldataforge
+ Swiss Army knife of scripts for transforming and processing datasets for machine learning
+
+ ## scope
+ Currently, mldataforge provides space- and time-efficient conversions between JSONL (with or without compression), MosaicML Dataset (MDS format), and Parquet. The implementations handle conversions by individual samples or small batches of samples and make efficient use of multi-core architectures where possible. Consequently, mldataforge is an excellent choice when transforming TB-scale datasets on data-processing nodes with many cores.
+
+ ## installation and general usage
+ ```
+ pip install mldataforge
+ python -m mldataforge --help
+ ```
+
+ ## usage example: converting MosaicML Dataset (MDS) to Parquet format
+ ```
+ Usage: python -m mldataforge convert mds parquet [OPTIONS] OUTPUT_FILE
+                                                  MDS_DIRECTORIES...
+
+ Options:
+   --compression [snappy|gzip|zstd]
+                         Compress the output file (default: snappy).
+   --overwrite           Overwrite existing path.
+   --yes                 Assume yes to all prompts. Use with caution,
+                         as it will remove files or even entire
+                         directories without confirmation.
+   --batch-size INTEGER  Batch size for loading data and writing
+                         files (default: 65536).
+   --no-bulk             Use a custom space- and time-efficient bulk
+                         reader (gzip and no compression only).
+   --help                Show this message and exit.
+ ```
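For concreteness, a conversion per the usage above might be invoked as follows; the output file and input directories are placeholders, and the options shown are those documented in the help text:

```
python -m mldataforge convert mds parquet train.parquet ./mds-part-1 ./mds-part-2 --compression zstd --batch-size 65536
```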
@@ -0,0 +1,4 @@
+ from .commands import cli
+
+ if __name__ == "__main__":
+     cli()
@@ -0,0 +1,13 @@
+ import click
+
+ from .convert import convert
+ from .join import join
+
+ __all__ = ["cli"]
+
+ @click.group()
+ def cli():
+     pass
+
+ cli.add_command(convert)
+ cli.add_command(join)
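As context, this is the standard click composition pattern: `cli` is a `click.Group`, and `add_command` attaches each imported group as a subcommand, which is what makes invocations like `python -m mldataforge join ...` resolve. A minimal self-contained sketch of the same pattern (the `version` command is hypothetical, for illustration only, and not part of mldataforge):

```
import click

@click.group()
def cli():
    """Top-level command group, analogous to mldataforge's cli."""
    pass

# hypothetical subcommand for illustration -- not part of mldataforge
@cli.command()
def version():
    """Print a version string."""
    click.echo("0.1.2")

if __name__ == "__main__":
    cli()  # `python example.py version` would print 0.1.2
```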
@@ -0,0 +1,64 @@
+ import click
+ from datasets import load_dataset
+
+ from ..options import *
+ from ..utils import *
+
+ __all__ = ["join"]
+
+ @click.group()
+ def join():
+     pass
+
+ @join.command()
+ @click.argument("output_file", type=click.Path(exists=False), required=True)
+ @click.argument("jsonl_files", type=click.Path(exists=True), required=True, nargs=-1)
+ @compression_option("infer", ["none", "infer", "pigz", "gzip", "bz2", "xz"])
+ @processes_option()
+ @overwrite_option()
+ @yes_option()
+ def jsonl(output_file, jsonl_files, compression, processes, overwrite, yes):
+     check_arguments(output_file, overwrite, yes, jsonl_files)
+     save_jsonl(
+         load_dataset("json", data_files=jsonl_files, split="train"),
+         output_file,
+         compression=compression,
+         processes=processes,
+     )
+
+ @join.command()
+ @click.argument("output_dir", type=click.Path(exists=False), required=True)
+ @click.argument("mds_directories", type=click.Path(exists=True), required=True, nargs=-1)
+ @compression_option(None, ['none', 'br', 'bz2', 'gzip', 'pigz', 'snappy', 'zstd'])
+ @processes_option()
+ @overwrite_option()
+ @yes_option()
+ @batch_size_option()
+ @buf_size_option()
+ @no_bulk_option()
+ def mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk):
+     check_arguments(output_dir, overwrite, yes, mds_directories)
+     save_mds(
+         load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+         output_dir,
+         processes=processes,
+         compression=compression,
+         buf_size=buf_size,
+         pigz=use_pigz(compression),
+     )
+
+ @join.command()
+ @click.argument("output_file", type=click.Path(exists=False), required=True)
+ @click.argument("parquet_files", type=click.Path(exists=True), required=True, nargs=-1)
+ @compression_option("snappy", ["snappy", "gzip", "zstd"])
+ @overwrite_option()
+ @yes_option()
+ @batch_size_option()
+ def parquet(output_file, parquet_files, compression, overwrite, yes, batch_size):
+     check_arguments(output_file, overwrite, yes, parquet_files)
+     save_parquet(
+         load_dataset("parquet", data_files=parquet_files, split="train"),
+         output_file,
+         compression=compression,
+         batch_size=batch_size,
+     )
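Given the arguments and option decorators above, the new join subcommands would be invoked roughly as follows; the file names are placeholders, and the long-option spellings assume each `*_option` decorator exposes a flag of the corresponding name (as `--compression`, `--overwrite`, `--yes`, `--batch-size`, and `--no-bulk` do in the help text above):

```
python -m mldataforge join jsonl merged.jsonl.gz part1.jsonl part2.jsonl --compression gzip
python -m mldataforge join parquet merged.parquet part1.parquet part2.parquet --compression snappy --batch-size 65536
```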
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
  
  [project]
  name = "mldataforge"
- version = "0.1.1"
+ version = "0.1.2"
  authors = [
      { name = "Peter Schneider-Kamp" }
  ]
@@ -25,6 +25,11 @@ dependencies = [
      'mosaicml-streaming'
  ]
  
+ [project.optional-dependencies]
+ test = ["pytest", "pytest-dependency"]
+ dev = ["build", "twine"]
+ all = ["build", "twine", "pytest", "pytest-dependency"]
+
  [project.urls]
  "Homepage" = "https://github.com/schneiderkamplab/mldataforge"
  "Bug Tracker" = "https://github.com/schneiderkamplab/mldataforge/issues"
@@ -1,20 +0,0 @@
- Metadata-Version: 2.4
- Name: mldataforge
- Version: 0.1.1
- Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
- Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
- Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
- Author: Peter Schneider-Kamp
- License-File: LICENSE
- Classifier: License :: OSI Approved :: MIT License
- Classifier: Operating System :: OS Independent
- Classifier: Programming Language :: Python :: 3
- Requires-Python: >=3.12
- Requires-Dist: click
- Requires-Dist: datasets
- Requires-Dist: mltiming
- Requires-Dist: mosaicml-streaming
- Description-Content-Type: text/markdown
-
- # mldatasets
- swiss army knife of scripts for transforming and processing datasets for machine learning
@@ -1,2 +0,0 @@
- # mldatasets
- swiss army knife of scripts for transforming and processing datasets for machine learning
@@ -1,12 +0,0 @@
- import click
-
- from .commands import convert
-
- @click.group()
- def cli():
-     pass
-
- cli.add_command(convert)
-
- if __name__ == "__main__":
-     cli()
@@ -1,3 +0,0 @@
- from .convert import convert
-
- __all__ = ["convert"]