mldataforge 0.2.1__tar.gz → 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22)
  1. {mldataforge-0.2.1 → mldataforge-0.2.3}/PKG-INFO +1 -2
  2. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/brotli.py +1 -6
  3. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/mds.py +6 -4
  4. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/join.py +3 -2
  5. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/split.py +3 -2
  6. mldataforge-0.2.3/mldataforge/indexing.py +25 -0
  7. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/options.py +12 -0
  8. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/trafos.py +4 -23
  9. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/utils.py +10 -1
  10. {mldataforge-0.2.1 → mldataforge-0.2.3}/pyproject.toml +1 -2
  11. {mldataforge-0.2.1 → mldataforge-0.2.3}/.gitignore +0 -0
  12. {mldataforge-0.2.1 → mldataforge-0.2.3}/LICENSE +0 -0
  13. {mldataforge-0.2.1 → mldataforge-0.2.3}/README.md +0 -0
  14. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/__main__.py +0 -0
  15. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/__init__.py +0 -0
  16. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/__init__.py +0 -0
  17. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/jsonl.py +0 -0
  18. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/parquet.py +0 -0
  19. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/compression.py +0 -0
  20. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/mds.py +0 -0
  21. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/pigz.py +0 -0
  22. {mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/snappy.py +0 -0
{mldataforge-0.2.1 → mldataforge-0.2.3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: mldataforge
-Version: 0.2.1
+Version: 0.2.3
 Summary: swiss army knife of scripts for transforming and processing datasets for machine learning.
 Project-URL: Homepage, https://github.com/schneiderkamplab/mldataforge
 Project-URL: Bug Tracker, https://github.com/schneiderkamplab/mldataforge/issues
@@ -10,7 +10,6 @@ Classifier: License :: OSI Approved :: MIT License
 Classifier: Operating System :: OS Independent
 Classifier: Programming Language :: Python :: 3
 Requires-Python: >=3.12
-Requires-Dist: brotlicffi
 Requires-Dist: click
 Requires-Dist: datasets
 Requires-Dist: isal
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/brotli.py
@@ -1,4 +1,4 @@
-import brotlicffi as brotli
+import brotli
 import io
 
 __all__ = ["brotli_open"]
@@ -6,11 +6,6 @@ __all__ = ["brotli_open"]
 def brotli_open(filename, mode='rb', encoding='utf-8', compress_level=11):
     return BrotliFile(filename, mode=mode, encoding=encoding, compress_level=11)
 
-import brotlicffi as brotli
-import io
-
-__all__ = ["brotli_open"]
-
 class BrotliFile:
     def __init__(self, filename, mode='rb', encoding='utf-8', compress_level=11):
         self.filename = filename
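
For the brotlicffi → brotli switch above, the two packages expose the same module-level API for the calls in question, so the import swap is a drop-in change. A minimal round-trip sketch (the payload is made up):

import brotli

payload = b'{"text": "hello world"}' * 100
compressed = brotli.compress(payload, quality=11)  # quality=11 is brotli's maximum compression setting
assert brotli.decompress(compressed) == payload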
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/convert/mds.py
@@ -20,12 +20,13 @@ def mds():
 @batch_size_option()
 @no_bulk_option()
 @trafo_option()
+@shuffle_option()
 def jsonl(**kwargs):
     mds_to_jsonl(**kwargs)
-def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk, trafo):
+def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite, yes, batch_size, no_bulk, trafo, shuffle):
     check_arguments(output_file, overwrite, yes, mds_directories)
     save_jsonl(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_file,
         compression=compression,
         processes=processes,
@@ -41,12 +42,13 @@ def mds_to_jsonl(output_file, mds_directories, compression, processes, overwrite
 @batch_size_option()
 @no_bulk_option()
 @trafo_option()
+@shuffle_option()
 def parquet(**kwargs):
     mds_to_parquet(**kwargs)
-def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk, trafo):
+def mds_to_parquet(output_file, mds_directories, compression, overwrite, yes, batch_size, no_bulk, trafo, shuffle):
     check_arguments(output_file, overwrite, yes, mds_directories)
     save_parquet(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_file,
         compression=compression,
         batch_size=batch_size,
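
The same shuffle parameter is threaded through the join and split commands below. As a hedged usage sketch, the underlying converter can also be called directly from Python; the argument values here are invented, while the keyword names and the bulk/shuffle interaction come from this diff (see utils.py further down):

from mldataforge.commands.convert.mds import mds_to_jsonl

# Hypothetical example values; only the parameter names are taken from the diff.
mds_to_jsonl(
    output_file="dataset.jsonl.gz",
    mds_directories=["mds/part-0000", "mds/part-0001"],
    compression=None,
    processes=4,
    overwrite=True,
    yes=True,
    batch_size=2**16,
    no_bulk=True,   # shuffling needs the non-bulk reader (bulk + shuffle raises ValueError)
    trafo=None,
    shuffle=42,     # seed for a deterministic permutation; None disables shuffling
)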
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/join.py
@@ -43,13 +43,14 @@ def join_jsonl(output_file, jsonl_files, compression, processes, overwrite, yes,
 @shard_size_option()
 @no_pigz_option()
 @trafo_option()
+@shuffle_option()
 def mds(**kwargs):
     print(kwargs)
     join_mds(**kwargs)
-def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz, trafo):
+def join_mds(output_dir, mds_directories, compression, processes, overwrite, yes, batch_size, buf_size, no_bulk, shard_size, no_pigz, trafo, shuffle):
     check_arguments(output_dir, overwrite, yes, mds_directories)
     save_mds(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_dir,
         processes=processes,
         compression=compression,
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/commands/split.py
@@ -50,11 +50,12 @@ def split_jsonl(jsonl_files, prefix, output_dir, size_hint, compression, process
 @shard_size_option()
 @no_pigz_option()
 @trafo_option()
+@shuffle_option()
 def mds(*args, **kwargs):
     split_mds(*args, **kwargs)
-def split_mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz, trafo):
+def split_mds(mds_directories, prefix, output_dir, size_hint, compression, processes, overwrite, yes, buf_size, batch_size, no_bulk, shard_size, no_pigz, trafo, shuffle):
     save_mds(
-        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk),
+        load_mds_directories(mds_directories, batch_size=batch_size, bulk=not no_bulk, shuffle=shuffle),
         output_dir=f"{output_dir}/{prefix}{{part:04d}}",
         processes=processes,
         compression=compression,
mldataforge-0.2.3/mldataforge/indexing.py
@@ -0,0 +1,25 @@
+import numpy as np
+
+__all__ = ['IndexedDatasetView', 'shuffle_permutation']
+
+class IndexedDatasetView:
+    def __init__(self, dataset, indices):
+        self.dataset = dataset
+        self.indices = list(indices)  # ensure repeatable access
+
+    def __iter__(self):
+        for idx in self.indices:
+            yield self.dataset[idx]
+
+    def __len__(self):
+        return len(self.indices)
+
+def shuffle_permutation(n, seed=int):
+    rng = np.random.default_rng(seed)
+    return rng.permutation(n)
+
+def reverse_permutation(indices):
+    n = len(indices)
+    reverse_indices = np.empty(n, dtype=int)
+    reverse_indices[indices] = np.arange(n)
+    return reverse_indices
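
A quick, self-contained check of what these helpers do: shuffle_permutation draws a deterministic permutation from a seed, and reverse_permutation computes its inverse, which is what lets a negative --shuffle seed undo an earlier shuffle with the positive seed. This is a sketch with a toy array, not library code:

import numpy as np

def shuffle_permutation(n, seed):
    return np.random.default_rng(seed).permutation(n)

def reverse_permutation(indices):
    rev = np.empty(len(indices), dtype=int)
    rev[indices] = np.arange(len(indices))
    return rev

data = np.arange(5) * 10                       # stands in for a dataset: [0, 10, 20, 30, 40]
idx = shuffle_permutation(len(data), seed=42)
shuffled = data[idx]                           # same order IndexedDatasetView(data, idx) iterates
restored = shuffled[reverse_permutation(idx)]  # the inverse permutation restores the original order
assert (restored == data).all()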
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/options.py
@@ -13,6 +13,7 @@ __all__ = [
     "processes_option",
     "prefix_option",
     "shard_size_option",
+    "shuffle_option",
     "size_hint_option",
     "trafo_option",
     "yes_option",
@@ -120,6 +121,17 @@ def shard_size_option(default=2**26):
         help=f"Shard size for the dataset (default: {default}).",
     )
 
+def shuffle_option():
+    """
+    Option for specifying whether to shuffle the dataset by providing a random seed.
+    """
+    return click.option(
+        "--shuffle",
+        default=None,
+        type=int,
+        help="Shuffle the dataset by providing a random seed.",
+    )
+
 def size_hint_option(default=2**26):
     """
     Option for specifying the size hint.
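
shuffle_option follows the option-factory pattern used throughout options.py: a zero-argument function that returns a click.option decorator, so any command can opt in with a single decorator line. A minimal standalone sketch of that pattern (the demo command is hypothetical):

import click

def shuffle_option():
    # Same pattern as the factory added above: return a reusable click option.
    return click.option(
        "--shuffle",
        default=None,
        type=int,
        help="Shuffle the dataset by providing a random seed.",
    )

@click.command()
@shuffle_option()
def demo(shuffle):
    # shuffle is None when the flag is omitted, otherwise the integer seed.
    click.echo(f"shuffle={shuffle!r}")

if __name__ == "__main__":
    demo()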
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/trafos.py
@@ -1,18 +1,6 @@
 import re
 
-__all__ = ['IndexedDatasetView', 'Transformation', 'Transformations', 'flatten_json', 'unflatten_json']
-
-class IndexedDatasetView:
-    def __init__(self, dataset, indices):
-        self.dataset = dataset
-        self.indices = list(indices)  # ensure repeatable access
-
-    def __iter__(self):
-        for idx in self.indices:
-            yield self.dataset[idx]
-
-    def __len__(self):
-        return len(self.indices)
+__all__ = ['Transformation', 'Transformations', 'flatten_json', 'identity', 'unflatten_json']
 
 class Transformation:
     def __init__(self, code: str):
@@ -56,33 +44,23 @@ class Transformation:
             return self._last_input_len
         raise TypeError("Length is not available for this transformation.")
 
-
 class Transformations:
     def __init__(self, codes: list[str], indices=None):
         self.pipeline = [Transformation(code) for code in codes]
-        self.indices = indices  # Optional index iterable
 
     def __call__(self, dataset):
-        # Wrap dataset with IndexedDatasetView if indices are provided
-        if self.indices is not None:
-            dataset = IndexedDatasetView(dataset, self.indices)
-
         result = dataset
         for transform in self.pipeline:
             result = transform(result)
         return result
 
     def __len__(self):
-        # Return the input length to the pipeline
         if self.indices is not None:
             return len(self.indices)
         elif hasattr(self.pipeline[0], '_last_input_len') and self.pipeline[0]._last_input_len is not None:
             return self.pipeline[0]._last_input_len
         raise TypeError("Transformations length is not available until __call__ is used on a sized input.")
 
-def identity(obj):
-    return obj
-
 def flatten_json(obj, parent_key='', sep='.', escape_char='\\'):
     def escape(key):
         return key.replace(escape_char, escape_char * 2)\
@@ -110,6 +88,9 @@ def flatten_json(obj, parent_key='', sep='.', escape_char='\\'):
         items.append((parent_key, obj))
     return dict(items)
 
+def identity(obj):
+    return obj
+
 def unflatten_json(flat_dict, sep='.', escape_char='\\'):
     def check_flat_json(obj):
         assert isinstance(obj, dict), "Input must be a dictionary"
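
For context on the flatten_json/unflatten_json pair that identity now sits between: the signatures suggest dotted-key flattening of nested dicts. A simplified, self-contained illustration of that idea follows (escape handling and list support omitted; this is not the library implementation):

def flatten(obj, parent_key="", sep="."):
    # Recursively map {"a": {"b": 1}} to {"a.b": 1}.
    items = {}
    if isinstance(obj, dict) and obj:
        for key, value in obj.items():
            new_key = f"{parent_key}{sep}{key}" if parent_key else key
            items.update(flatten(value, new_key, sep))
    else:
        items[parent_key] = obj
    return items

def unflatten(flat, sep="."):
    # Rebuild the nested structure from dotted keys.
    nested = {}
    for key, value in flat.items():
        parts = key.split(sep)
        node = nested
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return nested

doc = {"meta": {"id": 7, "tags": {"lang": "en"}}, "text": "hello"}
assert unflatten(flatten(doc)) == doc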
{mldataforge-0.2.1 → mldataforge-0.2.3}/mldataforge/utils.py
@@ -10,6 +10,7 @@ from streaming import StreamingDataset
 from tqdm import tqdm
 
 from .compression import determine_compression, open_compression, pigz_compress
+from .indexing import IndexedDatasetView, reverse_permutation, shuffle_permutation
 from .mds import MDSBulkReader, MDSWriter
 from .pigz import pigz_open
 from .trafos import Transformations
@@ -89,7 +90,9 @@ def load_jsonl_files(jsonl_files):
         return _streaming_jsonl(jsonl_files, compressions)
     return load_dataset("json", data_files=jsonl_files, split="train")
 
-def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True):
+def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True, shuffle=None):
+    if bulk and shuffle is not None:
+        raise ValueError("Bulk reader does not support shuffling by design.")
     if bulk:
         return MDSBulkReader(mds_directories, split=split)
     dss = []
@@ -110,6 +113,12 @@ def load_mds_directories(mds_directories, split='.', batch_size=2**16, bulk=True
     else:
         with timing(message=f"Concatenating {len(dss)} datasets"):
             ds = concatenate_datasets(dsets=dss)
+    if shuffle is not None:
+        with timing(message="Creating shuffle indices"):
+            indices = shuffle_permutation(len(ds), seed=abs(shuffle))
+            if shuffle < 0:
+                indices = reverse_permutation(indices)
+            ds = IndexedDatasetView(ds, indices)
     return ds
 
 def save_jsonl(iterable, output_file, compression=None, processes=64, size_hint=None, overwrite=True, yes=True, trafo=None):
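
A hedged usage sketch of the new shuffle parameter in load_mds_directories; the directory names are made up, while the signature, the bulk/shuffle ValueError, and the negative-seed behaviour are taken from the hunks above:

from mldataforge.utils import load_mds_directories

# Shuffle with a fixed seed; bulk=False is required because the bulk reader cannot shuffle.
ds = load_mds_directories(["mds/part-0000", "mds/part-0001"], bulk=False, shuffle=42)

# The default bulk reader rejects shuffling outright.
try:
    load_mds_directories(["mds/part-0000"], shuffle=42)  # bulk defaults to True
except ValueError:
    pass  # "Bulk reader does not support shuffling by design."

# A negative seed selects the inverse permutation of abs(seed), so shuffling a dataset
# with shuffle=42 and then processing the result with shuffle=-42 restores the original order.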
{mldataforge-0.2.1 → mldataforge-0.2.3}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "mldataforge"
-version = "0.2.1"
+version = "0.2.3"
 authors = [
     { name = "Peter Schneider-Kamp" }
 ]
@@ -19,7 +19,6 @@ classifiers = [
 ]
 
 dependencies = [
-    'brotlicffi',
     'click',
     'datasets',
     'isal',