datachain 0.3.6__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of datachain might be problematic. Click here for more details.

datachain/lib/hf.py ADDED
@@ -0,0 +1,166 @@
1
+ try:
2
+ from datasets import (
3
+ Array2D,
4
+ Array3D,
5
+ Array4D,
6
+ Array5D,
7
+ Audio,
8
+ ClassLabel,
9
+ Dataset,
10
+ DatasetDict,
11
+ Image,
12
+ IterableDataset,
13
+ IterableDatasetDict,
14
+ Sequence,
15
+ Value,
16
+ load_dataset,
17
+ )
18
+ from datasets.features.features import string_to_arrow
19
+ from datasets.features.image import image_to_bytes
20
+
21
+ except ImportError as exc:
22
+ raise ImportError(
23
+ "Missing dependencies for huggingface datasets:\n"
24
+ "To install run:\n\n"
25
+ " pip install 'datachain[hf]'\n"
26
+ ) from exc
27
+
28
+ from io import BytesIO
29
+ from typing import TYPE_CHECKING, Any, Union
30
+
31
+ import PIL
32
+ from tqdm import tqdm
33
+
34
+ from datachain.lib.arrow import arrow_type_mapper
35
+ from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
36
+ from datachain.lib.udf import Generator
37
+
38
+ if TYPE_CHECKING:
39
+ from pydantic import BaseModel
40
+
41
+
42
+ HFDatasetType = Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]
43
+
44
+
45
class HFClassLabel(DataModel):
    """A Hugging Face ``ClassLabel`` value, kept as both its name and its index."""

    # Class name looked up from the feature's ``names`` list.
    string: str
    # Integer class index as stored in the dataset row.
    integer: int

    def read(self):
        """Return the integer class index."""
        class_index = self.integer
        return class_index
51
+
52
+
53
class HFImage(DataModel):
    """An image from a Hugging Face dataset, stored as encoded bytes."""

    # Encoded image bytes.
    img: bytes

    def read(self):
        """Decode the stored bytes and return the result of ``PIL.Image.open``."""
        # A bare ``import PIL`` does not load the ``PIL.Image`` submodule, so
        # ``PIL.Image.open`` only works if some other module imported it first.
        # Import it explicitly; it is aliased because ``Image`` in this module
        # already names the Hugging Face feature class.
        from PIL import Image as PILImage

        return PILImage.open(BytesIO(self.img))
58
+
59
+
60
class HFAudio(DataModel):
    """An audio record with ``path``, ``array`` and ``sampling_rate`` fields."""

    # Path string carried with the audio value.
    path: str
    # Presumably the decoded waveform samples -- confirm against the
    # Hugging Face ``Audio`` feature documentation.
    array: list[float]
    # Sampling rate of ``array`` (unit presumably Hz -- confirm).
    sampling_rate: int
64
+
65
+
66
class HFGenerator(Generator):
    """Generator UDF that yields one ``output_schema`` object per dataset row."""

    def __init__(
        self,
        ds: Union[str, HFDatasetType],
        output_schema: type["BaseModel"],
        *args,
        **kwargs,
    ):
        """
        Store the dataset reference and loading options.

        Parameters:
            ds : Dataset name (string) or an already loaded Hugging Face
                dataset object.
            output_schema : Model class instantiated for every row.
            *args, **kwargs : Forwarded unchanged to `stream_splits` in
                `setup()`.
        """
        super().__init__()
        self.ds, self.output_schema = ds, output_schema
        self.args, self.kwargs = args, kwargs

    def setup(self):
        """Resolve ``self.ds`` into a mapping of split name to dataset."""
        self.ds_dict = stream_splits(self.ds, *self.args, **self.kwargs)

    def process(self, split: str = ""):
        """
        Yield one ``output_schema`` instance for each row of the given split.

        When `split` is non-empty it is also stored in the ``split`` field of
        every yielded object.
        """
        desc = "Parsed Hugging Face dataset"
        dataset = self.ds_dict[split]
        if split:
            desc += f" split '{split}'"
        schema = self.output_schema
        with tqdm(desc=desc, unit=" rows") as pbar:
            for row in dataset:
                # Seed the record with the split name only for named splits.
                record = {"split": split} if split else {}
                for name, feat in dataset.features.items():
                    anno = schema.model_fields[name].annotation
                    record[name] = _convert_feature(row[name], feat, anno)
                yield schema(**record)
                pbar.update(1)
98
+
99
+
100
def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
    """
    Return a mapping of split name to dataset.

    A string `ds` is first loaded with ``load_dataset(..., streaming=True)``,
    passing `args` and `kwargs` through. Dataset dictionaries are returned
    as-is; a single dataset is wrapped under the empty-string key.
    """
    dataset = (
        load_dataset(ds, *args, streaming=True, **kwargs)
        if isinstance(ds, str)
        else ds
    )
    split_containers = (DatasetDict, IterableDatasetDict)
    return dataset if isinstance(dataset, split_containers) else {"": dataset}
106
+
107
+
108
def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
    """
    Convert one Hugging Face row value into the value stored in the output model.

    Parameters:
        val : Raw value taken from the dataset row.
        feat : Hugging Face feature object describing `val`.
        anno : Annotation of the matching output-model field; it is only used
            for sequences of dicts, where it is instantiated with the
            converted members.

    Returns:
        `val` unchanged for scalar, array and plain-sequence features; an
        `HFClassLabel`, `HFImage`, `HFAudio` or `anno` instance for the
        corresponding feature kinds; ``None`` for any other feature type.
    """
    passthrough_kinds = (Value, Array2D, Array3D, Array4D, Array5D)
    if isinstance(feat, passthrough_kinds):
        return val
    if isinstance(feat, ClassLabel):
        return HFClassLabel(string=feat.names[val], integer=val)
    if isinstance(feat, Sequence):
        if not isinstance(feat.feature, dict):
            return val
        converted = {}
        for key in val:
            item_feat = feat.feature[key]
            item_anno = anno.model_fields[key].annotation
            converted[key] = [
                _convert_feature(item, item_feat, item_anno) for item in val[key]
            ]
        return anno(**converted)
    if isinstance(feat, Image):
        return HFImage(img=image_to_bytes(val))
    if isinstance(feat, Audio):
        return HFAudio(**val)
    # No branch matched: unrecognised feature types produce None.
    return None
126
+
127
+
128
def get_output_schema(
    ds: Union[Dataset, IterableDataset], model_name: str = ""
) -> dict[str, DataType]:
    """
    Map every feature name of `ds` to its DataChain type.

    `model_name` is accepted but not used by this function.
    """
    return {  # type: ignore[return-value]
        feat_name: _feature_to_chain_type(feat_name, feat)
        for feat_name, feat in ds.features.items()
    }
135
+
136
+
137
def _feature_to_chain_type(name: str, val: Any) -> type:  # noqa: PLR0911
    """
    Translate a Hugging Face feature object into a DataChain type.

    Parameters:
        name : Feature name; used as the model name when a sequence of dicts
            is turned into a data model.
        val : Hugging Face feature object.

    Raises:
        TypeError: If `val` is not one of the handled feature kinds.
    """
    if isinstance(val, Value):
        return arrow_type_mapper(val.pa_type)
    if isinstance(val, ClassLabel):
        return HFClassLabel
    if isinstance(val, Sequence):
        if isinstance(val.feature, dict):
            # Every member of a dict-valued sequence becomes a list field.
            members = {
                sub_name: list[_feature_to_chain_type(sub_name, sub_feat)]  # type: ignore[misc]
                for sub_name, sub_feat in val.feature.items()
            }
            return dict_to_data_model(name, members)  # type: ignore[arg-type]
        return list[_feature_to_chain_type(name, val.feature)]  # type: ignore[arg-type,misc,return-value]
    # An N-dimensional array feature maps to N nested ``list[...]`` levels
    # around its element type.
    array_ranks = ((Array2D, 2), (Array3D, 3), (Array4D, 4), (Array5D, 5))
    for array_cls, rank in array_ranks:
        if isinstance(val, array_cls):
            nested = arrow_type_mapper(string_to_arrow(val.dtype))
            for _ in range(rank):
                nested = list[nested]  # type: ignore[valid-type]
            return nested
    if isinstance(val, Image):
        return HFImage
    if isinstance(val, Audio):
        return HFAudio
    raise TypeError(f"Unknown huggingface datasets type {type(val)}")
datachain/lib/image.py CHANGED
@@ -10,6 +10,7 @@ def convert_image(
10
10
  size: Optional[tuple[int, int]] = None,
11
11
  transform: Optional[Callable] = None,
12
12
  encoder: Optional[Callable] = None,
13
+ device: Optional[Union[str, torch.device]] = None,
13
14
  ) -> Union[Image.Image, torch.Tensor]:
14
15
  """
15
16
  Resize, transform, and otherwise convert an image.
@@ -20,6 +21,7 @@ def convert_image(
20
21
  size (tuple[int, int]): Size in (width, height) pixels for resizing.
21
22
  transform (Callable): Torchvision transform or huggingface processor to apply.
22
23
  encoder (Callable): Encode image using model.
24
+ device (str or torch.device): Device to use.
23
25
  """
24
26
  if mode:
25
27
  img = img.convert(mode)
@@ -35,6 +37,8 @@ def convert_image(
35
37
  img = torch.tensor(img.pixel_values[0]) # type: ignore[assignment,attr-defined]
36
38
  except ImportError:
37
39
  pass
40
+ if device:
41
+ img = img.to(device) # type: ignore[attr-defined]
38
42
  if encoder:
39
43
  img = img.unsqueeze(0) # type: ignore[attr-defined]
40
44
  if encoder:
@@ -48,6 +52,7 @@ def convert_images(
48
52
  size: Optional[tuple[int, int]] = None,
49
53
  transform: Optional[Callable] = None,
50
54
  encoder: Optional[Callable] = None,
55
+ device: Optional[Union[str, torch.device]] = None,
51
56
  ) -> Union[list[Image.Image], torch.Tensor]:
52
57
  """
53
58
  Resize, transform, and otherwise convert one or more images.
@@ -58,11 +63,14 @@ def convert_images(
58
63
  size (tuple[int, int]): Size in (width, height) pixels for resizing.
59
64
  transform (Callable): Torchvision transform or huggingface processor to apply.
60
65
  encoder (Callable): Encode image using model.
66
+ device (str or torch.device): Device to use.
61
67
  """
62
68
  if isinstance(images, Image.Image):
63
69
  images = [images]
64
70
 
65
- converted = [convert_image(img, mode, size, transform) for img in images]
71
+ converted = [
72
+ convert_image(img, mode, size, transform, device=device) for img in images
73
+ ]
66
74
 
67
75
  if isinstance(converted[0], torch.Tensor):
68
76
  converted = torch.stack(converted) # type: ignore[assignment,arg-type]
datachain/lib/pytorch.py CHANGED
@@ -10,7 +10,6 @@ from torchvision.transforms import v2
10
10
 
11
11
  from datachain.catalog import Catalog, get_catalog
12
12
  from datachain.lib.dc import DataChain
13
- from datachain.lib.file import File
14
13
  from datachain.lib.text import convert_text
15
14
 
16
15
  if TYPE_CHECKING:
@@ -97,7 +96,7 @@ class PytorchDataset(IterableDataset):
97
96
  for row_features in ds.collect():
98
97
  row = []
99
98
  for fr in row_features:
100
- if isinstance(fr, File):
99
+ if hasattr(fr, "read"):
101
100
  row.append(fr.read()) # type: ignore[unreachable]
102
101
  else:
103
102
  row.append(fr)
@@ -1,4 +1,5 @@
1
1
  import copy
2
+ import warnings
2
3
  from collections.abc import Iterator, Sequence
3
4
  from dataclasses import dataclass
4
5
  from datetime import datetime
@@ -42,6 +43,8 @@ NAMES_TO_TYPES = {
42
43
  "dict": dict,
43
44
  "bytes": bytes,
44
45
  "datetime": datetime,
46
+ "Literal": Literal,
47
+ "Union": Union,
45
48
  }
46
49
 
47
50
 
@@ -49,6 +52,10 @@ class SignalSchemaError(DataChainParamsError):
49
52
  pass
50
53
 
51
54
 
55
class SignalSchemaWarning(RuntimeWarning):
    """Warning category emitted when a signal's type cannot be resolved."""
57
+
58
+
52
59
  class SignalResolvingError(SignalSchemaError):
53
60
  def __init__(self, path: Optional[list[str]], msg: str):
54
61
  name = " '" + ".".join(path) + "'" if path else ""
@@ -69,6 +76,28 @@ class SignalResolvingTypeError(SignalResolvingError):
69
76
  )
70
77
 
71
78
 
79
def create_feature_model(
    name: str, fields: dict[str, Union[type, tuple[type, Any]]]
) -> type[BaseModel]:
    """
    Create and return a dynamic feature model built from `fields`.

    This is used to restore a model from the custom_types stored within a
    serialized SignalSchema when the original class definition is not
    available, e.g. in Studio, or when a dataset created with a custom model
    is read from a script that does not declare that model.

    Any "@" in `name` is replaced with "_" before the model is created. A
    field given as a bare annotation gets a default of None; a field given as
    a tuple is passed through as (annotation, default).
    """
    model_name = name.replace("@", "_")
    field_definitions = {}
    for field_name, anno in fields.items():
        # create_model takes each field as a tuple of: annotation, default.
        if isinstance(anno, tuple):
            field_definitions[field_name] = anno
        else:
            field_definitions[field_name] = (anno, None)
    return create_model(
        model_name,
        __base__=DataModel,  # type: ignore[call-overload]
        **field_definitions,
    )
99
+
100
+
72
101
  @dataclass
73
102
  class SignalSchema:
74
103
  values: dict[str, DataType]
@@ -117,40 +146,115 @@ class SignalSchema:
117
146
  )
118
147
  return SignalSchema(signals)
119
148
 
120
- def serialize(self) -> dict[str, str]:
121
- signals = {}
149
@staticmethod
def _get_name_original_type(fr_type: type) -> tuple[str, type]:
    """
    Return the serialized name of a type together with its unwrapped type.

    If `fr_type` is an Optional (a two-member Union containing None), it is
    first unwrapped to its non-None member; the returned type is that member.
    Literal and Union types are named "Literal" and "Union"; every other type
    is named by its ``__name__``.
    """
    orig = get_origin(fr_type)
    args = get_args(fr_type)
    # Check if fr_type is Optional
    if orig == Union and len(args) == 2 and (type(None) in args):
        # get_args keeps the declared order, so for Union[None, X] the first
        # argument is NoneType. Always pick the member that is not None.
        fr_type = args[1] if args[0] is type(None) else args[0]
        orig = get_origin(fr_type)
    if orig in (Literal, LiteralEx):
        # Literal has no __name__ in Python 3.9
        type_name = "Literal"
    elif orig == Union:
        # Union also has no __name__ in Python 3.9
        type_name = "Union"
    else:
        type_name = str(fr_type.__name__)  # type: ignore[union-attr]
    return type_name, fr_type
168
+
169
@staticmethod
def serialize_custom_model_fields(
    name: str, fr: type, custom_types: dict[str, Any]
) -> str:
    """
    Record the field layout of a model type in `custom_types`.

    Parameters:
        name : Type name returned unchanged when `fr` is not a model class.
        fr : Type to inspect.
        custom_types : Dict updated in place; maps a model's versioned name to
            a dict of field name -> field type name.

    Returns:
        `name` for non-model types, otherwise the model's versioned name.
    """
    is_model = not hasattr(fr, "__origin__") and issubclass(fr, BaseModel)
    if not is_model:
        # Only feature (model) types are stored.
        return name
    version_name = ModelStore.get_name(fr)
    if version_name in custom_types:
        # Already recorded; nothing more to do.
        return version_name
    fields = {}
    for field_name, info in fr.model_fields.items():
        annotation = info.annotation
        # All fields should be typed.
        assert annotation
        nested_name, nested_type = SignalSchema._get_name_original_type(annotation)
        # Nested custom models are recorded recursively as well.
        fields[field_name] = SignalSchema.serialize_custom_model_fields(
            nested_name, nested_type, custom_types
        )
    # The entry is added only after all of its fields have been walked.
    custom_types[version_name] = fields
    return version_name
196
+
197
+ def serialize(self) -> dict[str, Any]:
198
+ signals: dict[str, Any] = {}
199
+ custom_types: dict[str, Any] = {}
122
200
  for name, fr_type in self.values.items():
123
201
  if (fr := ModelStore.to_pydantic(fr_type)) is not None:
124
202
  ModelStore.register(fr)
125
203
  signals[name] = ModelStore.get_name(fr)
204
+ type_name, fr_type = SignalSchema._get_name_original_type(fr)
126
205
  else:
127
- orig = get_origin(fr_type)
128
- args = get_args(fr_type)
129
- # Check if fr_type is Optional
130
- if orig == Union and len(args) == 2 and (type(None) in args):
131
- fr_type = args[0]
132
- signals[name] = str(fr_type.__name__) # type: ignore[union-attr]
206
+ type_name, fr_type = SignalSchema._get_name_original_type(fr_type)
207
+ signals[name] = type_name
208
+ self.serialize_custom_model_fields(type_name, fr_type, custom_types)
209
+ if custom_types:
210
+ signals["_custom_types"] = custom_types
133
211
  return signals
134
212
 
135
213
  @staticmethod
136
- def deserialize(schema: dict[str, str]) -> "SignalSchema":
214
def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:
    """
    Convert a string-based type back into a python type.

    Lookup order: `NAMES_TO_TYPES`, then `ModelStore`, then `custom_types`.
    For a custom type, each field is resolved recursively and a model is
    built with `create_feature_model`. Returns None when nothing matches.
    """
    builtin = NAMES_TO_TYPES.get(type_name)
    if builtin:
        return builtin  # type: ignore[return-value]

    model_name, version = ModelStore.parse_name_version(type_name)
    registered = ModelStore.get(model_name, version)
    if registered:
        return registered

    if type_name not in custom_types:
        return None
    resolved_fields = {
        field_name: SignalSchema._resolve_type(field_type_str, custom_types)
        for field_name, field_type_str in custom_types[type_name].items()
    }
    return create_feature_model(type_name, resolved_fields)
233
+
234
+ @staticmethod
235
+ def deserialize(schema: dict[str, Any]) -> "SignalSchema":
137
236
  if not isinstance(schema, dict):
138
237
  raise SignalSchemaError(f"cannot deserialize signal schema: {schema}")
139
238
 
140
239
  signals: dict[str, DataType] = {}
240
+ custom_types: dict[str, Any] = schema.get("_custom_types", {})
141
241
  for signal, type_name in schema.items():
242
+ if signal == "_custom_types":
243
+ # This entry is used as a lookup for custom types,
244
+ # and is not an actual field.
245
+ continue
142
246
  try:
143
- fr = NAMES_TO_TYPES.get(type_name)
144
- if not fr:
145
- type_name, version = ModelStore.parse_name_version(type_name)
146
- fr = ModelStore.get(type_name, version)
147
-
148
- if not fr:
149
- raise SignalSchemaError(
150
- f"cannot deserialize '{signal}': "
151
- f"unknown type '{type_name}'."
152
- f" Try to add it with `ModelStore.register({type_name})`."
153
- )
247
+ fr = SignalSchema._resolve_type(type_name, custom_types)
248
+ if fr is None:
249
+ # Skip if the type is not found, so all data can be displayed.
250
+ warnings.warn(
251
+ f"In signal '{signal}': "
252
+ f"unknown type '{type_name}'."
253
+ f" Try to add it with `ModelStore.register({type_name})`.",
254
+ SignalSchemaWarning,
255
+ stacklevel=2,
256
+ )
257
+ continue
154
258
  except TypeError as err:
155
259
  raise SignalSchemaError(
156
260
  f"cannot deserialize '{signal}': {err}"
datachain/lib/text.py CHANGED
@@ -9,6 +9,7 @@ def convert_text(
9
9
  tokenizer: Optional[Callable] = None,
10
10
  tokenizer_kwargs: Optional[dict[str, Any]] = None,
11
11
  encoder: Optional[Callable] = None,
12
+ device: Optional[Union[str, torch.device]] = None,
12
13
  ) -> Union[str, list[str], torch.Tensor]:
13
14
  """
14
15
  Tokenize and otherwise transform text.
@@ -18,6 +19,7 @@ def convert_text(
18
19
  tokenizer (Callable): Tokenizer to use to tokenize objects.
19
20
  tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
20
21
  encoder (Callable): Encode text using model.
22
+ device (str or torch.device): Device to use.
21
23
  """
22
24
  if not tokenizer:
23
25
  return text
@@ -32,6 +34,8 @@ def convert_text(
32
34
 
33
35
  tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
34
36
  tokens = torch.tensor(tokens)
37
+ if device:
38
+ tokens = tokens.to(device)
35
39
 
36
40
  if not encoder:
37
41
  return tokens
datachain/lib/udf.py CHANGED
@@ -242,26 +242,8 @@ class UDFBase(AbstractUDF):
242
242
  if not self.is_output_batched:
243
243
  result_objs = [result_objs]
244
244
 
245
- if len(self.output.values) > 1:
246
- res = []
247
- for tuple_ in result_objs:
248
- flat = []
249
- for obj in tuple_:
250
- if isinstance(obj, BaseModel):
251
- flat.extend(flatten(obj))
252
- else:
253
- flat.append(obj)
254
- res.append(tuple(flat))
255
- else:
256
- # Generator expression is required, otherwise the value will be materialized
257
- res = (
258
- flatten(obj)
259
- if isinstance(obj, BaseModel)
260
- else obj
261
- if isinstance(obj, tuple)
262
- else (obj,)
263
- for obj in result_objs
264
- )
245
+ # Generator expression is required, otherwise the value will be materialized
246
+ res = (self._flatten_row(row) for row in result_objs)
265
247
 
266
248
  if not self.is_output_batched:
267
249
  res = list(res)
@@ -282,6 +264,18 @@ class UDFBase(AbstractUDF):
282
264
 
283
265
  return res
284
266
 
267
def _flatten_row(self, row):
    """
    Turn one UDF result row into a flat tuple of values.

    With a single output signal, or when `row` is itself a model, a tuple
    row is returned as-is and anything else is converted via `_obj_to_list`.
    Otherwise `row` is iterated and each member is expanded with
    `_obj_to_list`.
    """
    if len(self.output.values) <= 1 or isinstance(row, BaseModel):
        if isinstance(row, tuple):
            return row
        return tuple(self._obj_to_list(row))
    return tuple(item for obj in row for item in self._obj_to_list(obj))
274
+
275
@staticmethod
def _obj_to_list(obj):
    """Return `flatten(obj)` for a model instance, otherwise `[obj]`."""
    if isinstance(obj, BaseModel):
        return flatten(obj)
    return [obj]
278
+
285
279
  def _parse_rows(self, rows, cache, download_cb):
286
280
  objs = []
287
281
  for row in rows:
@@ -24,6 +24,7 @@ from typing import (
24
24
  )
25
25
 
26
26
  import attrs
27
+ import psutil
27
28
  import sqlalchemy
28
29
  import sqlalchemy as sa
29
30
  from attrs import frozen
@@ -383,7 +384,7 @@ def process_udf_outputs(
383
384
  udf_table: "Table",
384
385
  udf_results: Iterator[Iterable["UDFResult"]],
385
386
  udf: UDFBase,
386
- batch_size=INSERT_BATCH_SIZE,
387
+ batch_size: int = INSERT_BATCH_SIZE,
387
388
  cb: Callback = DEFAULT_CALLBACK,
388
389
  ) -> None:
389
390
  rows: list[UDFResult] = []
@@ -396,7 +397,9 @@ def process_udf_outputs(
396
397
  for row in udf_output:
397
398
  cb.relative_update()
398
399
  rows.append(adjust_outputs(warehouse, row, udf_col_types))
399
- if len(rows) >= batch_size:
400
+ if len(rows) >= batch_size or (
401
+ len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
402
+ ):
400
403
  for row_chunk in batched(rows, batch_size):
401
404
  warehouse.insert_rows(udf_table, row_chunk)
402
405
  rows.clear()
@@ -1775,6 +1778,10 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
1775
1778
  save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
1776
1779
  save_as = os.getenv("DATACHAIN_QUERY_SAVE_AS")
1777
1780
 
1781
+ is_session_temp_dataset = dataset_query.name and dataset_query.name.startswith(
1782
+ dataset_query.session.get_temp_prefix()
1783
+ )
1784
+
1778
1785
  if save_as:
1779
1786
  if dataset_query.attached:
1780
1787
  dataset_name = dataset_query.name
@@ -1801,7 +1808,7 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
1801
1808
  )
1802
1809
  else:
1803
1810
  dataset_query = dataset_query.save(save_as)
1804
- elif save and not dataset_query.attached:
1811
+ elif save and (is_session_temp_dataset or not dataset_query.attached):
1805
1812
  name = catalog.generate_query_dataset_name()
1806
1813
  dataset_query = dataset_query.save(name)
1807
1814
 
@@ -74,11 +74,13 @@ class Session:
74
74
  self.catalog.id_generator.close_on_exit()
75
75
 
76
76
  def generate_temp_dataset_name(self) -> str:
77
- tmp_table_uid = uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
78
- return f"{self.DATASET_PREFIX}{self.name}_{tmp_table_uid}"
77
+ return self.get_temp_prefix() + uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
78
+
79
def get_temp_prefix(self) -> str:
    """Return ``DATASET_PREFIX`` followed by the session name and "_"."""
    prefix, session_name = self.DATASET_PREFIX, self.name
    return f"{prefix}{session_name}_"
79
81
 
80
82
  def _cleanup_temp_datasets(self) -> None:
81
- prefix = f"{self.DATASET_PREFIX}{self.name}"
83
+ prefix = self.get_temp_prefix()
82
84
  try:
83
85
  for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
84
86
  self.catalog.remove_dataset(dataset.name, force=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: datachain
3
- Version: 0.3.6
3
+ Version: 0.3.8
4
4
  Summary: Wrangle unstructured AI data at scale
5
5
  Author-email: Dmitry Petrov <support@dvc.org>
6
6
  License: Apache-2.0
@@ -41,10 +41,11 @@ Requires-Dist: jmespath >=1.0
41
41
  Requires-Dist: datamodel-code-generator >=0.25
42
42
  Requires-Dist: Pillow <11,>=10.0.0
43
43
  Requires-Dist: msgpack <2,>=1.0.4
44
+ Requires-Dist: psutil
44
45
  Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
45
46
  Provides-Extra: dev
46
47
  Requires-Dist: datachain[docs,tests] ; extra == 'dev'
47
- Requires-Dist: mypy ==1.11.1 ; extra == 'dev'
48
+ Requires-Dist: mypy ==1.11.2 ; extra == 'dev'
48
49
  Requires-Dist: types-python-dateutil ; extra == 'dev'
49
50
  Requires-Dist: types-pytz ; extra == 'dev'
50
51
  Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -64,11 +65,14 @@ Requires-Dist: accelerate ; extra == 'examples'
64
65
  Requires-Dist: unstructured[pdf] ; extra == 'examples'
65
66
  Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
66
67
  Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
68
+ Provides-Extra: hf
69
+ Requires-Dist: numba >=0.60.0 ; extra == 'hf'
70
+ Requires-Dist: datasets[audio,vision] ; extra == 'hf'
67
71
  Provides-Extra: remote
68
72
  Requires-Dist: lz4 ; extra == 'remote'
69
73
  Requires-Dist: requests >=2.22.0 ; extra == 'remote'
70
74
  Provides-Extra: tests
71
- Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
75
+ Requires-Dist: datachain[hf,remote,torch,vector] ; extra == 'tests'
72
76
  Requires-Dist: pytest <9,>=8 ; extra == 'tests'
73
77
  Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
74
78
  Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
@@ -83,6 +87,7 @@ Requires-Dist: hypothesis ; extra == 'tests'
83
87
  Requires-Dist: open-clip-torch ; extra == 'tests'
84
88
  Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
85
89
  Requires-Dist: requests-mock ; extra == 'tests'
90
+ Requires-Dist: scipy ; extra == 'tests'
86
91
  Provides-Extra: torch
87
92
  Requires-Dist: torch >=2.1.0 ; extra == 'torch'
88
93
  Requires-Dist: torchvision ; extra == 'torch'