datachain 0.3.7__py3-none-any.whl → 0.3.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/catalog/catalog.py CHANGED
@@ -1540,87 +1540,6 @@ class Catalog:
         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)
 
-    def merge_datasets(
-        self,
-        src: DatasetRecord,
-        dst: DatasetRecord,
-        src_version: int,
-        dst_version: Optional[int] = None,
-    ) -> DatasetRecord:
-        """
-        Merges records from source to destination dataset.
-        It will create new version
-        of a dataset with records merged from old version and the source, unless
-        existing version is specified for destination in which case it must
-        be in non final status as datasets are immutable
-        """
-        if (
-            dst_version
-            and not dst.is_valid_next_version(dst_version)
-            and dst.get_version(dst_version).is_final_status()
-        ):
-            raise DatasetInvalidVersionError(
-                f"Version {dst_version} must be higher than the current latest one"
-            )
-
-        src_dep = self.get_dataset_dependencies(src.name, src_version)
-        dst_dep = self.get_dataset_dependencies(
-            dst.name,
-            dst.latest_version,  # type: ignore[arg-type]
-        )
-
-        if dst.has_version(dst_version):  # type: ignore[arg-type]
-            # case where we don't create new version, but append to the existing one
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version=dst_version,  # type: ignore[arg-type]
-            )
-            merged_schema = src.serialized_schema | dst.serialized_schema
-            self.update_dataset(dst, schema=merged_schema)
-            self.update_dataset_version_with_warehouse_info(
-                dst,
-                dst_version,  # type: ignore[arg-type]
-                schema=merged_schema,
-            )
-            for dep in src_dep:
-                if dep and dep not in dst_dep:
-                    self.metastore.add_dependency(
-                        dep,
-                        dst.name,
-                        dst_version,  # type: ignore[arg-type]
-                    )
-        else:
-            # case where we create new version of merged results
-            src_dr = self.warehouse.dataset_rows(src, src_version)
-            dst_dr = self.warehouse.dataset_rows(dst)
-
-            merge_result_columns = list(
-                {
-                    c.name: c for c in list(src_dr.table.c) + list(dst_dr.table.c)
-                }.values()
-            )
-
-            dst_version = dst_version or dst.next_version
-            dst = self.create_new_dataset_version(
-                dst,
-                dst_version,
-                columns=merge_result_columns,
-            )
-            self.warehouse.merge_dataset_rows(
-                src,
-                dst,
-                src_version,
-                dst_version,
-            )
-            self.update_dataset_version_with_warehouse_info(dst, dst_version)
-            for dep in set(src_dep + dst_dep):
-                if dep:
-                    self.metastore.add_dependency(dep, dst.name, dst_version)
-
-        return dst
-
     def get_file_signals(
         self, dataset_name: str, dataset_version: int, row: RowDict
     ) -> Optional[dict]:
datachain/cli.py CHANGED
@@ -336,36 +336,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
         help="Display size using powers of 1000 not 1024",
     )
 
-    parse_merge_datasets = subp.add_parser(
-        "merge-datasets", parents=[parent_parser], description="Merges datasets"
-    )
-    parse_merge_datasets.add_argument(
-        "--src",
-        action="store",
-        default=None,
-        help="Source dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst",
-        action="store",
-        default=None,
-        help="Destination dataset name",
-    )
-    parse_merge_datasets.add_argument(
-        "--src-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Source dataset version",
-    )
-    parse_merge_datasets.add_argument(
-        "--dst-version",
-        action="store",
-        default=None,
-        type=int,
-        help="Destination dataset version",
-    )
-
     parse_ls = subp.add_parser(
         "ls", parents=[parent_parser], description="List storage contents"
     )
@@ -996,13 +966,6 @@ def main(argv: Optional[list[str]] = None) -> int: # noqa: C901, PLR0912, PLR09
            new_name=args.new_name,
            labels=args.labels,
        )
-    elif args.command == "merge-datasets":
-        catalog.merge_datasets(
-            catalog.get_dataset(args.src),
-            catalog.get_dataset(args.dst),
-            args.src_version,
-            dst_version=args.dst_version,
-        )
     elif args.command == "ls":
         ls(
             args.sources,
datachain/lib/arrow.py CHANGED
@@ -95,7 +95,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
         if not column:
             column = f"c{default_column}"
             default_column += 1
-        dtype = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+        dtype = arrow_type_mapper(field.type)  # type: ignore[assignment]
         if field.nullable:
             dtype = Optional[dtype]  # type: ignore[assignment]
         output[column] = dtype
@@ -103,7 +103,7 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
     return output


-def _arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
+def arrow_type_mapper(col_type: pa.DataType) -> type:  # noqa: PLR0911
    """Convert pyarrow types to basic types."""
    from datetime import datetime

@@ -122,11 +122,11 @@ def _arrow_type_mapper(col_type: pa.DataType) -> type: # noqa: PLR0911
    if pa.types.is_string(col_type) or pa.types.is_large_string(col_type):
        return str
    if pa.types.is_list(col_type):
-        return list[_arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
+        return list[arrow_type_mapper(col_type.value_type)]  # type: ignore[return-value, misc]
    if pa.types.is_struct(col_type) or pa.types.is_map(col_type):
        return dict
    if isinstance(col_type, pa.lib.DictionaryType):
-        return _arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
+        return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
    raise TypeError(f"{col_type!r} datatypes not supported")

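Note: `arrow_type_mapper` loses its leading underscore because the new `datachain/lib/hf.py` module (added below) imports it. A minimal sketch of the mapping, exercising only the branches visible in this hunk:

```py
# Sketch only: demonstrates branches shown in the diff above.
import pyarrow as pa

from datachain.lib.arrow import arrow_type_mapper

assert arrow_type_mapper(pa.large_string()) is str
assert arrow_type_mapper(pa.list_(pa.string())) == list[str]
assert arrow_type_mapper(pa.map_(pa.string(), pa.int32())) is dict
```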
datachain/lib/clip.py CHANGED
@@ -1,5 +1,5 @@
 import inspect
-from typing import TYPE_CHECKING, Any, Callable, Literal, Union
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Union
 
 import torch
 from transformers.modeling_utils import PreTrainedModel
@@ -39,6 +39,7 @@ def clip_similarity_scores(
     tokenizer: Callable,
     prob: bool = False,
     image_to_text: bool = True,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> list[list[float]]:
     """
     Calculate CLIP similarity scores between one or more images and/or text.
@@ -52,6 +53,7 @@
         prob : Compute softmax probabilities.
         image_to_text : Whether to compute for image-to-text or text-to-image. Ignored
            if only one of images or text provided.
+        device : Device to use. Defaults is None - use model's device.
 
 
    Example:
@@ -130,17 +132,26 @@
    ```
    """
 
+    if device is None:
+        if hasattr(model, "device"):
+            device = model.device
+        else:
+            device = next(model.parameters()).device
+    else:
+        model = model.to(device)
    with torch.no_grad():
        if images is not None:
            encoder = _get_encoder(model, "image")
            image_features = convert_images(
-                images, transform=preprocess, encoder=encoder
+                images, transform=preprocess, encoder=encoder, device=device
            )
            image_features /= image_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
 
        if text is not None:
            encoder = _get_encoder(model, "text")
-            text_features = convert_text(text, tokenizer, encoder=encoder)
+            text_features = convert_text(
+                text, tokenizer, encoder=encoder, device=device
+            )
            text_features /= text_features.norm(dim=-1, keepdim=True)  # type: ignore[union-attr]
 
        if images is not None and text is not None:
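The new `device` argument lets callers control tensor placement: when omitted, the model's own device is used; otherwise the model is moved with `model.to(device)`. A hedged usage sketch (the open_clip setup and the positional argument order are assumptions based on the function's docstring, which is not shown in this hunk):

```py
# Sketch, assuming an open_clip model; only the `device` keyword is new in 0.3.8.
import open_clip
from PIL import Image

from datachain.lib.clip import clip_similarity_scores

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion2b_s34b_b79k"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")
img = Image.open("cat.jpg")  # placeholder image path

scores = clip_similarity_scores(
    img, ["a cat", "a dog"], model, preprocess, tokenizer, prob=True, device="cpu"
)
```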
datachain/lib/convert/python_to_sql.py CHANGED
@@ -73,6 +73,9 @@ def python_to_sql(typ): # noqa: PLR0911
     if len(args) == 2 and (type(None) in args):
         return python_to_sql(args[0])
 
+    if _is_union_str_literal(orig, args):
+        return String
+
     if _is_json_inside_union(orig, args):
         return JSON
 
@@ -94,3 +97,9 @@ def _is_json_inside_union(orig, args) -> bool:
     if any(inspect.isclass(arg) and issubclass(arg, BaseModel) for arg in args):
         return True
     return False
+
+
+def _is_union_str_literal(orig, args) -> bool:
+    if orig != Union:
+        return False
+    return all(arg is str or get_origin(arg) in (Literal, LiteralEx) for arg in args)
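`_is_union_str_literal` extends the Union handling: a union whose members are all `str` or `Literal[...]` now maps to the SQL `String` type. A small illustration of the intent:

```py
# Sketch: unions of str and Literal members now resolve to the SQL String type.
from typing import Literal, Union

from datachain.lib.convert.python_to_sql import python_to_sql

split_type = Union[str, Literal["train", "val", "test"]]
sql_type = python_to_sql(split_type)  # -> String
```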
datachain/lib/data_model.py CHANGED
@@ -2,7 +2,7 @@ from collections.abc import Sequence
 from datetime import datetime
 from typing import ClassVar, Union, get_args, get_origin
 
-from pydantic import BaseModel
+from pydantic import BaseModel, create_model
 
 from datachain.lib.model_store import ModelStore
 
@@ -57,3 +57,12 @@ def is_chain_type(t: type) -> bool:
         return is_chain_type(args[0])
 
     return False
+
+
+def dict_to_data_model(name: str, data_dict: dict[str, DataType]) -> type[BaseModel]:
+    fields = {name: (anno, ...) for name, anno in data_dict.items()}
+    return create_model(
+        name,
+        __base__=(DataModel,),  # type: ignore[call-overload]
+        **fields,
+    )  # type: ignore[call-overload]
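`dict_to_data_model` (hoisted here from `DataChain._dict_to_data_model`, which is removed further down in `dc.py`) turns a plain name-to-type mapping into a `DataModel` subclass. A minimal sketch:

```py
# Sketch: build a DataModel subclass dynamically from a field mapping.
from datachain.lib.data_model import dict_to_data_model

Pet = dict_to_data_model("Pet", {"name": str, "age": int})
pet = Pet(name="rex", age=3)
```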
datachain/lib/dc.py CHANGED
@@ -18,14 +18,13 @@ from typing import (
 import pandas as pd
 import sqlalchemy
-from pydantic import BaseModel, create_model
+from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
 from sqlalchemy.sql.sqltypes import NullType
 
-from datachain import DataModel
 from datachain.lib.convert.python_to_sql import python_to_sql
 from datachain.lib.convert.values_to_tuples import values_to_tuples
-from datachain.lib.data_model import DataType
+from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
 from datachain.lib.dataset_info import DatasetInfo
 from datachain.lib.file import ExportPlacement as FileExportPlacement
 from datachain.lib.file import File, IndexedFile, get_file
@@ -55,6 +54,8 @@ from datachain.utils import inside_notebook
 if TYPE_CHECKING:
     from typing_extensions import Concatenate, ParamSpec, Self
 
+    from datachain.lib.hf import HFDatasetType
+
     P = ParamSpec("P")
 
 C = Column
@@ -77,12 +78,12 @@ def resolve_columns(
     @wraps(method)
     def _inner(self: D, *args: "P.args", **kwargs: "P.kwargs") -> D:
         resolved_args = self.signals_schema.resolve(
-            *[arg for arg in args if not isinstance(arg, GenericFunction)]
+            *[arg for arg in args if not isinstance(arg, GenericFunction)]  # type: ignore[arg-type]
         ).db_signals()
 
         for idx, arg in enumerate(args):
             if isinstance(arg, GenericFunction):
-                resolved_args.insert(idx, arg)
+                resolved_args.insert(idx, arg)  # type: ignore[arg-type]
 
         return method(self, *resolved_args, **kwargs)
 
@@ -208,23 +209,28 @@ class DataChain(DatasetQuery):
         "size": 0,
     }
 
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, settings: Optional[dict] = None, **kwargs):
         """This method needs to be redefined as a part of Dataset and DataChain
         decoupling.
         """
-        super().__init__(
+        super().__init__(  # type: ignore[misc]
             *args,
             **kwargs,
             indexing_column_types=File._datachain_column_types,
         )
-        self._settings = Settings()
-        self._setup = {}
+        if settings:
+            self._settings = Settings(**settings)
+        else:
+            self._settings = Settings()
+        self._setup: dict = {}
 
         self.signals_schema = SignalSchema({"sys": Sys})
         if self.feature_schema:
             self.signals_schema |= SignalSchema.deserialize(self.feature_schema)
         else:
-            self.signals_schema |= SignalSchema.from_column_types(self.column_types)
+            self.signals_schema |= SignalSchema.from_column_types(
+                self.column_types or {}
+            )
 
         self._sys = False
 
@@ -309,6 +315,7 @@ class DataChain(DatasetQuery):
         *,
         type: Literal["binary", "text", "image"] = "binary",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         recursive: Optional[bool] = True,
         object_name: str = "file",
@@ -336,6 +343,7 @@ class DataChain(DatasetQuery):
            cls(
                path,
                session=session,
+                settings=settings,
                recursive=recursive,
                update=update,
                in_memory=in_memory,
@@ -489,6 +497,7 @@ class DataChain(DatasetQuery):
     def datasets(
         cls,
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         object_name: str = "dataset",
     ) -> "DataChain":
@@ -513,6 +522,7 @@ class DataChain(DatasetQuery):
 
         return cls.from_values(
             session=session,
+            settings=settings,
             in_memory=in_memory,
             output={object_name: DatasetInfo},
             **{object_name: datasets},  # type: ignore[arg-type]
@@ -895,7 +905,7 @@ class DataChain(DatasetQuery):
            if isinstance(value, Column):
                # renaming existing column
                for signal in schema.db_signals(name=value.name, as_columns=True):
-                    mutated[signal.name.replace(value.name, name, 1)] = signal
+                    mutated[signal.name.replace(value.name, name, 1)] = signal  # type: ignore[union-attr]
            else:
                # adding new signal
                mutated[name] = value
@@ -1086,7 +1096,7 @@ class DataChain(DatasetQuery):
        )
 
        signals_schema = self.signals_schema.clone_without_sys_signals()
-        on_columns = signals_schema.resolve(*on).db_signals()
+        on_columns: list[str] = signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
 
        right_signals_schema = right_ds.signals_schema.clone_without_sys_signals()
        if right_on is not None:
@@ -1105,7 +1115,9 @@ class DataChain(DatasetQuery):
                    on, right_on, "'on' and 'right_on' must have the same length'"
                )
 
-            right_on_columns = right_signals_schema.resolve(*right_on).db_signals()
+            right_on_columns: list[str] = right_signals_schema.resolve(
+                *right_on
+            ).db_signals()  # type: ignore[assignment]
 
            if len(right_on_columns) != len(on_columns):
                on_str = ", ".join(right_on_columns)
@@ -1169,7 +1181,7 @@ class DataChain(DatasetQuery):
                "'on' cannot be empty",
            )
        else:
-            signals = self.signals_schema.resolve(*on).db_signals()
+            signals = self.signals_schema.resolve(*on).db_signals()  # type: ignore[assignment]
        return super()._subtract(other, signals)  # type: ignore[arg-type]
 
     @classmethod
@@ -1177,6 +1189,7 @@ class DataChain(DatasetQuery):
         cls,
         ds_name: str = "",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         output: OutputType = None,
         object_name: str = "",
@@ -1195,10 +1208,13 @@ class DataChain(DatasetQuery):
            yield from tuples
 
        chain = DataChain.from_records(
-            DataChain.DEFAULT_FILE_RECORD, session=session, in_memory=in_memory
+            DataChain.DEFAULT_FILE_RECORD,
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
        )
        if object_name:
-            output = {object_name: DataChain._dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
+            output = {object_name: dict_to_data_model(object_name, output)}  # type: ignore[arg-type]
        return chain.gen(_func_fr, output=output)
 
     @classmethod
@@ -1207,6 +1223,7 @@ class DataChain(DatasetQuery):
         df: "pd.DataFrame",
         name: str = "",
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         object_name: str = "",
     ) -> "DataChain":
@@ -1236,7 +1253,12 @@ class DataChain(DatasetQuery):
        )
 
        return cls.from_values(
-            name, session, object_name=object_name, in_memory=in_memory, **fr_map
+            name,
+            session,
+            settings=settings,
+            object_name=object_name,
+            in_memory=in_memory,
+            **fr_map,
        )
 
    def to_pandas(self, flatten=False) -> "pd.DataFrame":
@@ -1306,6 +1328,59 @@ class DataChain(DatasetQuery):
        if len(df) == limit:
            print(f"\n[Limited by {len(df)} rows]")
 
+    @classmethod
+    def from_hf(
+        cls,
+        dataset: Union[str, "HFDatasetType"],
+        *args,
+        session: Optional[Session] = None,
+        settings: Optional[dict] = None,
+        object_name: str = "",
+        model_name: str = "",
+        **kwargs,
+    ) -> "DataChain":
+        """Generate chain from huggingface hub dataset.
+
+        Parameters:
+            dataset : Path or name of the dataset to read from Hugging Face Hub,
+                or an instance of `datasets.Dataset`-like object.
+            session : Session to use for the chain.
+            settings : Settings to use for the chain.
+            object_name : Generated object column name.
+            model_name : Generated model name.
+            kwargs : Parameters to pass to datasets.load_dataset.
+
+        Example:
+            Load from Hugging Face Hub:
+            ```py
+            DataChain.from_hf("beans", split="train")
+            ```
+
+            Generate chain from loaded dataset:
+            ```py
+            from datasets import load_dataset
+            ds = load_dataset("beans", split="train")
+            DataChain.from_hf(ds)
+            ```
+        """
+        from datachain.lib.hf import HFGenerator, get_output_schema, stream_splits
+
+        output: dict[str, DataType] = {}
+        ds_dict = stream_splits(dataset, *args, **kwargs)
+        if len(ds_dict) > 1:
+            output = {"split": str}
+
+        model_name = model_name or object_name or ""
+        output = output | get_output_schema(next(iter(ds_dict.values())), model_name)
+        model = dict_to_data_model(model_name, output)
+        if object_name:
+            output = {object_name: model}
+
+        chain = DataChain.from_values(
+            split=list(ds_dict.keys()), session=session, settings=settings
+        )
+        return chain.gen(HFGenerator(dataset, model, *args, **kwargs), output=output)
+
     def parse_tabular(
         self,
         output: OutputType = None,
@@ -1367,7 +1442,7 @@ class DataChain(DatasetQuery):
 
        if isinstance(output, dict):
            model_name = model_name or object_name or ""
-            model = DataChain._dict_to_data_model(model_name, output)
+            model = dict_to_data_model(model_name, output)
        else:
            model = output  # type: ignore[assignment]
 
@@ -1384,17 +1459,6 @@ class DataChain(DatasetQuery):
            ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
        )
 
-    @staticmethod
-    def _dict_to_data_model(
-        name: str, data_dict: dict[str, DataType]
-    ) -> type[BaseModel]:
-        fields = {name: (anno, ...) for name, anno in data_dict.items()}
-        return create_model(
-            name,
-            __base__=(DataModel,),  # type: ignore[call-overload]
-            **fields,
-        )  # type: ignore[call-overload]
-
     @classmethod
     def from_csv(
         cls,
@@ -1543,6 +1607,7 @@ class DataChain(DatasetQuery):
         cls,
         to_insert: Optional[Union[dict, list[dict]]],
         session: Optional[Session] = None,
+        settings: Optional[dict] = None,
         in_memory: bool = False,
         schema: Optional[dict[str, DataType]] = None,
     ) -> "DataChain":
@@ -1597,7 +1662,7 @@ class DataChain(DatasetQuery):
        insert_q = dr.get_table().insert()
        for record in to_insert:
            db.execute(insert_q.values(**record))
-        return DataChain(name=dsr.name)
+        return DataChain(name=dsr.name, settings=settings)
 
    def sum(self, fr: DataType):  # type: ignore[override]
        """Compute the sum of a column."""
datachain/lib/hf.py ADDED
@@ -0,0 +1,166 @@
+try:
+    from datasets import (
+        Array2D,
+        Array3D,
+        Array4D,
+        Array5D,
+        Audio,
+        ClassLabel,
+        Dataset,
+        DatasetDict,
+        Image,
+        IterableDataset,
+        IterableDatasetDict,
+        Sequence,
+        Value,
+        load_dataset,
+    )
+    from datasets.features.features import string_to_arrow
+    from datasets.features.image import image_to_bytes
+
+except ImportError as exc:
+    raise ImportError(
+        "Missing dependencies for huggingface datasets:\n"
+        "To install run:\n\n"
+        " pip install 'datachain[hf]'\n"
+    ) from exc
+
+from io import BytesIO
+from typing import TYPE_CHECKING, Any, Union
+
+import PIL
+from tqdm import tqdm
+
+from datachain.lib.arrow import arrow_type_mapper
+from datachain.lib.data_model import DataModel, DataType, dict_to_data_model
+from datachain.lib.udf import Generator
+
+if TYPE_CHECKING:
+    from pydantic import BaseModel
+
+
+HFDatasetType = Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset]
+
+
+class HFClassLabel(DataModel):
+    string: str
+    integer: int
+
+    def read(self):
+        return self.integer
+
+
+class HFImage(DataModel):
+    img: bytes
+
+    def read(self):
+        return PIL.Image.open(BytesIO(self.img))
+
+
+class HFAudio(DataModel):
+    path: str
+    array: list[float]
+    sampling_rate: int
+
+
+class HFGenerator(Generator):
+    def __init__(
+        self,
+        ds: Union[str, HFDatasetType],
+        output_schema: type["BaseModel"],
+        *args,
+        **kwargs,
+    ):
+        super().__init__()
+        self.ds = ds
+        self.output_schema = output_schema
+        self.args = args
+        self.kwargs = kwargs
+
+    def setup(self):
+        self.ds_dict = stream_splits(self.ds, *self.args, **self.kwargs)
+
+    def process(self, split: str = ""):
+        desc = "Parsed Hugging Face dataset"
+        ds = self.ds_dict[split]
+        if split:
+            desc += f" split '{split}'"
+        with tqdm(desc=desc, unit=" rows") as pbar:
+            for row in ds:
+                output_dict = {}
+                if split:
+                    output_dict["split"] = split
+                for name, feat in ds.features.items():
+                    anno = self.output_schema.model_fields[name].annotation
+                    output_dict[name] = _convert_feature(row[name], feat, anno)
+                yield self.output_schema(**output_dict)
+                pbar.update(1)
+
+
+def stream_splits(ds: Union[str, HFDatasetType], *args, **kwargs):
+    if isinstance(ds, str):
+        ds = load_dataset(ds, *args, streaming=True, **kwargs)
+    if isinstance(ds, (DatasetDict, IterableDatasetDict)):
+        return ds
+    return {"": ds}
+
+
+def _convert_feature(val: Any, feat: Any, anno: Any) -> Any:
+    if isinstance(feat, (Value, Array2D, Array3D, Array4D, Array5D)):
+        return val
+    if isinstance(feat, ClassLabel):
+        return HFClassLabel(string=feat.names[val], integer=val)
+    if isinstance(feat, Sequence):
+        if isinstance(feat.feature, dict):
+            sdict = {}
+            for sname in val:
+                sfeat = feat.feature[sname]
+                sanno = anno.model_fields[sname].annotation
+                sdict[sname] = [_convert_feature(v, sfeat, sanno) for v in val[sname]]
+            return anno(**sdict)
+        return val
+    if isinstance(feat, Image):
+        return HFImage(img=image_to_bytes(val))
+    if isinstance(feat, Audio):
+        return HFAudio(**val)
+
+
+def get_output_schema(
+    ds: Union[Dataset, IterableDataset], model_name: str = ""
+) -> dict[str, DataType]:
+    fields_dict = {}
+    for name, val in ds.features.items():
+        fields_dict[name] = _feature_to_chain_type(name, val)  # type: ignore[assignment]
+    return fields_dict  # type: ignore[return-value]
+
+
+def _feature_to_chain_type(name: str, val: Any) -> type:  # noqa: PLR0911
+    if isinstance(val, Value):
+        return arrow_type_mapper(val.pa_type)
+    if isinstance(val, ClassLabel):
+        return HFClassLabel
+    if isinstance(val, Sequence):
+        if isinstance(val.feature, dict):
+            sequence_dict = {}
+            for sname, sval in val.feature.items():
+                dtype = _feature_to_chain_type(sname, sval)
+                sequence_dict[sname] = list[dtype]  # type: ignore[valid-type]
+            return dict_to_data_model(name, sequence_dict)  # type: ignore[arg-type]
+        return list[_feature_to_chain_type(name, val.feature)]  # type: ignore[arg-type,misc,return-value]
+    if isinstance(val, Array2D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[dtype]]  # type: ignore[valid-type]
+    if isinstance(val, Array3D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[list[dtype]]]  # type: ignore[valid-type]
+    if isinstance(val, Array4D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[list[list[dtype]]]]  # type: ignore[valid-type]
+    if isinstance(val, Array5D):
+        dtype = arrow_type_mapper(string_to_arrow(val.dtype))
+        return list[list[list[list[list[dtype]]]]]  # type: ignore[valid-type]
+    if isinstance(val, Image):
+        return HFImage
+    if isinstance(val, Audio):
+        return HFAudio
+    raise TypeError(f"Unknown huggingface datasets type {type(val)}")
datachain/lib/image.py CHANGED
@@ -10,6 +10,7 @@ def convert_image(
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
     encoder: Optional[Callable] = None,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> Union[Image.Image, torch.Tensor]:
     """
     Resize, transform, and otherwise convert an image.
@@ -20,6 +21,7 @@
        size (tuple[int, int]): Size in (width, height) pixels for resizing.
        transform (Callable): Torchvision transform or huggingface processor to apply.
        encoder (Callable): Encode image using model.
+        device (str or torch.device): Device to use.
    """
    if mode:
        img = img.convert(mode)
@@ -35,6 +37,8 @@
                img = torch.tensor(img.pixel_values[0])  # type: ignore[assignment,attr-defined]
        except ImportError:
            pass
+    if device:
+        img = img.to(device)  # type: ignore[attr-defined]
    if encoder:
        img = img.unsqueeze(0)  # type: ignore[attr-defined]
    if encoder:
@@ -48,6 +52,7 @@ def convert_images(
     size: Optional[tuple[int, int]] = None,
     transform: Optional[Callable] = None,
     encoder: Optional[Callable] = None,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> Union[list[Image.Image], torch.Tensor]:
     """
     Resize, transform, and otherwise convert one or more images.
@@ -58,11 +63,14 @@
        size (tuple[int, int]): Size in (width, height) pixels for resizing.
        transform (Callable): Torchvision transform or huggingface processor to apply.
        encoder (Callable): Encode image using model.
+        device (str or torch.device): Device to use.
    """
    if isinstance(images, Image.Image):
        images = [images]
 
-    converted = [convert_image(img, mode, size, transform) for img in images]
+    converted = [
+        convert_image(img, mode, size, transform, device=device) for img in images
+    ]
 
    if isinstance(converted[0], torch.Tensor):
        converted = torch.stack(converted)  # type: ignore[assignment,arg-type]
datachain/lib/pytorch.py CHANGED
@@ -10,7 +10,6 @@ from torchvision.transforms import v2
 
 from datachain.catalog import Catalog, get_catalog
 from datachain.lib.dc import DataChain
-from datachain.lib.file import File
 from datachain.lib.text import convert_text
 
 if TYPE_CHECKING:
@@ -97,7 +96,7 @@ class PytorchDataset(IterableDataset):
        for row_features in ds.collect():
            row = []
            for fr in row_features:
-                if isinstance(fr, File):
+                if hasattr(fr, "read"):
                    row.append(fr.read())  # type: ignore[unreachable]
                else:
                    row.append(fr)
datachain/lib/signal_schema.py CHANGED
@@ -1,4 +1,5 @@
 import copy
+import warnings
 from collections.abc import Iterator, Sequence
 from dataclasses import dataclass
 from datetime import datetime
@@ -42,6 +43,8 @@ NAMES_TO_TYPES = {
     "dict": dict,
     "bytes": bytes,
     "datetime": datetime,
+    "Literal": Literal,
+    "Union": Union,
 }
 
 
@@ -49,6 +52,10 @@ class SignalSchemaError(DataChainParamsError):
     pass
 
 
+class SignalSchemaWarning(RuntimeWarning):
+    pass
+
+
 class SignalResolvingError(SignalSchemaError):
     def __init__(self, path: Optional[list[str]], msg: str):
         name = " '" + ".".join(path) + "'" if path else ""
@@ -69,6 +76,28 @@ class SignalResolvingTypeError(SignalResolvingError):
         )
 
 
+def create_feature_model(
+    name: str, fields: dict[str, Union[type, tuple[type, Any]]]
+) -> type[BaseModel]:
+    """
+    This gets or returns a dynamic feature model for use in restoring a model
+    from the custom_types stored within a serialized SignalSchema. This is useful
+    when using a custom feature model where the original definition is not available.
+    This happens in Studio and if a custom model is used in a dataset, then that dataset
+    is used in a DataChain in a separate script where that model is not declared.
+    """
+    name = name.replace("@", "_")
+    return create_model(
+        name,
+        __base__=DataModel,  # type: ignore[call-overload]
+        # These are tuples for each field of: annotation, default (if any)
+        **{
+            field_name: anno if isinstance(anno, tuple) else (anno, None)
+            for field_name, anno in fields.items()
+        },
+    )
+
+
 @dataclass
 class SignalSchema:
     values: dict[str, DataType]
@@ -117,40 +146,115 @@ class SignalSchema:
            )
        return SignalSchema(signals)
 
-    def serialize(self) -> dict[str, str]:
-        signals = {}
+    @staticmethod
+    def _get_name_original_type(fr_type: type) -> tuple[str, type]:
+        """Returns the name of and the original type for the given type,
+        based on whether the type is Optional or not."""
+        orig = get_origin(fr_type)
+        args = get_args(fr_type)
+        # Check if fr_type is Optional
+        if orig == Union and len(args) == 2 and (type(None) in args):
+            fr_type = args[0]
+            orig = get_origin(fr_type)
+        if orig in (Literal, LiteralEx):
+            # Literal has no __name__ in Python 3.9
+            type_name = "Literal"
+        elif orig == Union:
+            # Union also has no __name__ in Python 3.9
+            type_name = "Union"
+        else:
+            type_name = str(fr_type.__name__)  # type: ignore[union-attr]
+        return type_name, fr_type
+
+    @staticmethod
+    def serialize_custom_model_fields(
+        name: str, fr: type, custom_types: dict[str, Any]
+    ) -> str:
+        """This serializes any custom type information to the provided custom_types
+        dict, and returns the name of the type provided."""
+        if hasattr(fr, "__origin__") or not issubclass(fr, BaseModel):
+            # Don't store non-feature types.
+            return name
+        version_name = ModelStore.get_name(fr)
+        if version_name in custom_types:
+            # This type is already stored in custom_types.
+            return version_name
+        fields = {}
+        for field_name, info in fr.model_fields.items():
+            field_type = info.annotation
+            # All fields should be typed.
+            assert field_type
+            field_type_name, field_type = SignalSchema._get_name_original_type(
+                field_type
+            )
+            # Serialize this type to custom_types if it is a custom type as well.
+            fields[field_name] = SignalSchema.serialize_custom_model_fields(
+                field_type_name, field_type, custom_types
+            )
+        custom_types[version_name] = fields
+        return version_name
+
+    def serialize(self) -> dict[str, Any]:
+        signals: dict[str, Any] = {}
+        custom_types: dict[str, Any] = {}
        for name, fr_type in self.values.items():
            if (fr := ModelStore.to_pydantic(fr_type)) is not None:
                ModelStore.register(fr)
                signals[name] = ModelStore.get_name(fr)
+                type_name, fr_type = SignalSchema._get_name_original_type(fr)
            else:
-                orig = get_origin(fr_type)
-                args = get_args(fr_type)
-                # Check if fr_type is Optional
-                if orig == Union and len(args) == 2 and (type(None) in args):
-                    fr_type = args[0]
-                signals[name] = str(fr_type.__name__)  # type: ignore[union-attr]
+                type_name, fr_type = SignalSchema._get_name_original_type(fr_type)
+                signals[name] = type_name
+            self.serialize_custom_model_fields(type_name, fr_type, custom_types)
+        if custom_types:
+            signals["_custom_types"] = custom_types
        return signals
 
     @staticmethod
-    def deserialize(schema: dict[str, str]) -> "SignalSchema":
+    def _resolve_type(type_name: str, custom_types: dict[str, Any]) -> Optional[type]:
+        """Convert a string-based type back into a python type."""
+        fr = NAMES_TO_TYPES.get(type_name)
+        if fr:
+            return fr  # type: ignore[return-value]
+
+        model_name, version = ModelStore.parse_name_version(type_name)
+        fr = ModelStore.get(model_name, version)
+        if fr:
+            return fr
+
+        if type_name in custom_types:
+            fields = custom_types[type_name]
+            fields = {
+                field_name: SignalSchema._resolve_type(field_type_str, custom_types)
+                for field_name, field_type_str in fields.items()
+            }
+            return create_feature_model(type_name, fields)
+        return None
+
+    @staticmethod
+    def deserialize(schema: dict[str, Any]) -> "SignalSchema":
        if not isinstance(schema, dict):
            raise SignalSchemaError(f"cannot deserialize signal schema: {schema}")
 
        signals: dict[str, DataType] = {}
+        custom_types: dict[str, Any] = schema.get("_custom_types", {})
        for signal, type_name in schema.items():
+            if signal == "_custom_types":
+                # This entry is used as a lookup for custom types,
+                # and is not an actual field.
+                continue
            try:
-                fr = NAMES_TO_TYPES.get(type_name)
-                if not fr:
-                    type_name, version = ModelStore.parse_name_version(type_name)
-                    fr = ModelStore.get(type_name, version)
-
-                if not fr:
-                    raise SignalSchemaError(
-                        f"cannot deserialize '{signal}': "
-                        f"unknown type '{type_name}'."
-                        f" Try to add it with `ModelStore.register({type_name})`."
-                    )
+                fr = SignalSchema._resolve_type(type_name, custom_types)
+                if fr is None:
+                    # Skip if the type is not found, so all data can be displayed.
+                    warnings.warn(
+                        f"In signal '{signal}': "
+                        f"unknown type '{type_name}'."
+                        f" Try to add it with `ModelStore.register({type_name})`.",
+                        SignalSchemaWarning,
+                        stacklevel=2,
+                    )
+                    continue
            except TypeError as err:
                raise SignalSchemaError(
                    f"cannot deserialize '{signal}': {err}"
datachain/lib/text.py CHANGED
@@ -9,6 +9,7 @@ def convert_text(
     tokenizer: Optional[Callable] = None,
     tokenizer_kwargs: Optional[dict[str, Any]] = None,
     encoder: Optional[Callable] = None,
+    device: Optional[Union[str, torch.device]] = None,
 ) -> Union[str, list[str], torch.Tensor]:
     """
     Tokenize and otherwise transform text.
@@ -18,6 +19,7 @@
        tokenizer (Callable): Tokenizer to use to tokenize objects.
        tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
        encoder (Callable): Encode text using model.
+        device (str or torch.device): Device to use.
    """
    if not tokenizer:
        return text
@@ -32,6 +34,8 @@
 
    tokens = res.input_ids if isinstance(tokenizer, PreTrainedTokenizerBase) else res
    tokens = torch.tensor(tokens)
+    if device:
+        tokens = tokens.to(device)
 
    if not encoder:
        return tokens
datachain/lib/udf.py CHANGED
@@ -242,26 +242,8 @@ class UDFBase(AbstractUDF):
        if not self.is_output_batched:
            result_objs = [result_objs]
 
-        if len(self.output.values) > 1:
-            res = []
-            for tuple_ in result_objs:
-                flat = []
-                for obj in tuple_:
-                    if isinstance(obj, BaseModel):
-                        flat.extend(flatten(obj))
-                    else:
-                        flat.append(obj)
-                res.append(tuple(flat))
-        else:
-            # Generator expression is required, otherwise the value will be materialized
-            res = (
-                flatten(obj)
-                if isinstance(obj, BaseModel)
-                else obj
-                if isinstance(obj, tuple)
-                else (obj,)
-                for obj in result_objs
-            )
+        # Generator expression is required, otherwise the value will be materialized
+        res = (self._flatten_row(row) for row in result_objs)
 
        if not self.is_output_batched:
            res = list(res)
@@ -282,6 +264,18 @@ class UDFBase(AbstractUDF):
 
        return res
 
+    def _flatten_row(self, row):
+        if len(self.output.values) > 1 and not isinstance(row, BaseModel):
+            flat = []
+            for obj in row:
+                flat.extend(self._obj_to_list(obj))
+            return tuple(flat)
+        return row if isinstance(row, tuple) else tuple(self._obj_to_list(row))
+
+    @staticmethod
+    def _obj_to_list(obj):
+        return flatten(obj) if isinstance(obj, BaseModel) else [obj]
+
     def _parse_rows(self, rows, cache, download_cb):
         objs = []
         for row in rows:
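The two branches of the old flattening code collapse into `_flatten_row`/`_obj_to_list`: every output row is flattened the same way, whether the UDF declares one output or several. A simplified mirror of that rule, using `model_dump()` in place of datachain's `flatten` helper (which also handles nested models):

```py
# Simplified sketch of the flattening rule; not the actual datachain helpers.
from pydantic import BaseModel

def obj_to_list(obj):
    return list(obj.model_dump().values()) if isinstance(obj, BaseModel) else [obj]

class Point(BaseModel):
    x: int
    y: int

row = (Point(x=1, y=2), "label")
flat = tuple(v for obj in row for v in obj_to_list(obj))  # (1, 2, "label")
```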
datachain/query/dataset.py CHANGED
@@ -24,6 +24,7 @@ from typing import (
 )
 
 import attrs
+import psutil
 import sqlalchemy
 import sqlalchemy as sa
 from attrs import frozen
@@ -383,7 +384,7 @@
     udf_table: "Table",
     udf_results: Iterator[Iterable["UDFResult"]],
     udf: UDFBase,
-    batch_size=INSERT_BATCH_SIZE,
+    batch_size: int = INSERT_BATCH_SIZE,
     cb: Callback = DEFAULT_CALLBACK,
 ) -> None:
     rows: list[UDFResult] = []
@@ -396,7 +397,9 @@
        for row in udf_output:
            cb.relative_update()
            rows.append(adjust_outputs(warehouse, row, udf_col_types))
-            if len(rows) >= batch_size:
+            if len(rows) >= batch_size or (
+                len(rows) % 10 == 0 and psutil.virtual_memory().percent > 80
+            ):
                for row_chunk in batched(rows, batch_size):
                    warehouse.insert_rows(udf_table, row_chunk)
                rows.clear()
@@ -1775,6 +1778,10 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
     save = bool(os.getenv("DATACHAIN_QUERY_SAVE"))
     save_as = os.getenv("DATACHAIN_QUERY_SAVE_AS")
 
+    is_session_temp_dataset = dataset_query.name and dataset_query.name.startswith(
+        dataset_query.session.get_temp_prefix()
+    )
+
     if save_as:
         if dataset_query.attached:
             dataset_name = dataset_query.name
@@ -1801,7 +1808,7 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:
            )
        else:
            dataset_query = dataset_query.save(save_as)
-    elif save and not dataset_query.attached:
+    elif save and (is_session_temp_dataset or not dataset_query.attached):
        name = catalog.generate_query_dataset_name()
        dataset_query = dataset_query.save(name)
 
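Besides the fixed `INSERT_BATCH_SIZE`, the insert loop above now also flushes early under memory pressure, checking `psutil.virtual_memory()` every 10 buffered rows. The condition in isolation:

```py
# Sketch of the flush condition added above (thresholds copied from the hunk).
import psutil

def should_flush(n_buffered: int, batch_size: int) -> bool:
    return n_buffered >= batch_size or (
        n_buffered % 10 == 0 and psutil.virtual_memory().percent > 80
    )
```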
datachain/query/session.py CHANGED
@@ -74,11 +74,13 @@ class Session:
        self.catalog.id_generator.close_on_exit()
 
    def generate_temp_dataset_name(self) -> str:
-        tmp_table_uid = uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
-        return f"{self.DATASET_PREFIX}{self.name}_{tmp_table_uid}"
+        return self.get_temp_prefix() + uuid4().hex[: self.TEMP_TABLE_UUID_LEN]
+
+    def get_temp_prefix(self) -> str:
+        return f"{self.DATASET_PREFIX}{self.name}_"
 
    def _cleanup_temp_datasets(self) -> None:
-        prefix = f"{self.DATASET_PREFIX}{self.name}"
+        prefix = self.get_temp_prefix()
        try:
            for dataset in list(self.catalog.metastore.list_datasets_by_prefix(prefix)):
                self.catalog.remove_dataset(dataset.name, force=True)
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.3.7
+Version: 0.3.8
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -41,10 +41,11 @@ Requires-Dist: jmespath >=1.0
 Requires-Dist: datamodel-code-generator >=0.25
 Requires-Dist: Pillow <11,>=10.0.0
 Requires-Dist: msgpack <2,>=1.0.4
+Requires-Dist: psutil
 Requires-Dist: numpy <2,>=1 ; sys_platform == "win32"
 Provides-Extra: dev
 Requires-Dist: datachain[docs,tests] ; extra == 'dev'
-Requires-Dist: mypy ==1.11.1 ; extra == 'dev'
+Requires-Dist: mypy ==1.11.2 ; extra == 'dev'
 Requires-Dist: types-python-dateutil ; extra == 'dev'
 Requires-Dist: types-pytz ; extra == 'dev'
 Requires-Dist: types-PyYAML ; extra == 'dev'
@@ -64,11 +65,14 @@ Requires-Dist: accelerate ; extra == 'examples'
 Requires-Dist: unstructured[pdf] ; extra == 'examples'
 Requires-Dist: pdfplumber ==0.11.4 ; extra == 'examples'
 Requires-Dist: huggingface-hub[hf_transfer] ; extra == 'examples'
+Provides-Extra: hf
+Requires-Dist: numba >=0.60.0 ; extra == 'hf'
+Requires-Dist: datasets[audio,vision] ; extra == 'hf'
 Provides-Extra: remote
 Requires-Dist: lz4 ; extra == 'remote'
 Requires-Dist: requests >=2.22.0 ; extra == 'remote'
 Provides-Extra: tests
-Requires-Dist: datachain[remote,torch,vector] ; extra == 'tests'
+Requires-Dist: datachain[hf,remote,torch,vector] ; extra == 'tests'
 Requires-Dist: pytest <9,>=8 ; extra == 'tests'
 Requires-Dist: pytest-sugar >=0.9.6 ; extra == 'tests'
 Requires-Dist: pytest-cov >=4.1.0 ; extra == 'tests'
@@ -83,6 +87,7 @@ Requires-Dist: hypothesis ; extra == 'tests'
 Requires-Dist: open-clip-torch ; extra == 'tests'
 Requires-Dist: aiotools >=1.7.0 ; extra == 'tests'
 Requires-Dist: requests-mock ; extra == 'tests'
+Requires-Dist: scipy ; extra == 'tests'
 Provides-Extra: torch
 Requires-Dist: torch >=2.1.0 ; extra == 'torch'
 Requires-Dist: torchvision ; extra == 'torch'
@@ -2,7 +2,7 @@ datachain/__init__.py,sha256=GeyhE-5LgfJav2OKYGaieP2lBvf2Gm-ihj7thnK9zjI,800
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=biF8M8fQujtj5xs0VLi8S16eBtzG6kceWlO_NILbCsg,8197
 datachain/cache.py,sha256=wznC2pge6RhlPTaJfBVGjmBc6bxWCPThu4aTFMltvFU,4076
-datachain/cli.py,sha256=DbmI1sXs7-KCQz6RdLE_JAp3XO3yrTSRJ71LdUzx-XE,33099
+datachain/cli.py,sha256=otR2eN0JL-JhZ9SOTPcPwt_-_TiT-vHifx2h4YzD6Tg,32052
 datachain/cli_utils.py,sha256=jrn9ejGXjybeO1ur3fjdSiAyCHZrX0qsLLbJzN9ErPM,2418
 datachain/config.py,sha256=PfC7W5yO6HFO6-iMB4YB-0RR88LPiGmD6sS_SfVbGso,1979
 datachain/dataset.py,sha256=MZezyuJWNj_3PEtzr0epPMNyWAOTrhTSPI5FmemV6L4,14470
@@ -17,7 +17,7 @@ datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/storage.py,sha256=RiSJLYdHUjnrEWkLBKPcETHpAxld_B2WxLg711t0aZI,3733
 datachain/utils.py,sha256=ROVCLwb37VmFRzgTlSGUDw4eJNgYGiQ4yMX581HfUX8,12988
 datachain/catalog/__init__.py,sha256=g2iAAFx_gEIrqshXlhSEbrc8qDaEH11cjU40n3CHDz4,409
-datachain/catalog/catalog.py,sha256=dSEpktnwnpx1yY_QMvUexZVvvn6085olV7bnyImPM_k,81280
+datachain/catalog/catalog.py,sha256=6S4AnDos4sGYGhy4wNSyV2pKPQNXvo819cd3Dl8Htgg,78271
 datachain/catalog/datasource.py,sha256=D-VWIVDCM10A8sQavLhRXdYSCG7F4o4ifswEF80_NAQ,1412
 datachain/catalog/loader.py,sha256=-6VelNfXUdgUnwInVyA8g86Boxv2xqhTh9xNS-Zlwig,8242
 datachain/catalog/subclass.py,sha256=B5R0qxeTYEyVAAPM1RutBPSoXZc8L5mVVZeSGXki9Sw,2096
@@ -38,21 +38,22 @@ datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2kru
 datachain/data_storage/sqlite.py,sha256=jLgkvikYkENQUO_ykoNFfsBc2ofZXwFHLMa1nyWP3aw,28316
 datachain/data_storage/warehouse.py,sha256=cvlfa-nyIxqrrpSRtCdeVjlTwhn7rcIoWjOq91HhItU,33668
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=D8N7zCppRdc5sTYT1hNIbROc-sKA_8FN5J_m-KjD3Us,4929
-datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
-datachain/lib/data_model.py,sha256=ZvtMRMcPpBxI-rOhkXb-ry1PkGYcEFFK1w1wH12vs4g,1718
+datachain/lib/arrow.py,sha256=W8bIxMIe_b3dqMFYKGWmfbC_7Xe0gV3UiJjQ2i4EYLA,4925
+datachain/lib/clip.py,sha256=33RL11OIqfbwyhvBgiMGM8rDAnZx1IRmxk9dY89ls3Q,6130
+datachain/lib/data_model.py,sha256=gHIjlow84GMRDa78yLL1Ud-N18or21fnTyPEwsatpXY,2045
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=DkVhbjlxpl-HgHenIK1msofU2tUwsSiKPtNim5ai6OE,60136
+datachain/lib/dc.py,sha256=wdMzFLglOhwWKHwh4qcLA0ezMrjuRJq2il2WnkHjyag,62490
 datachain/lib/file.py,sha256=ZHpdilDPYCob8uqtwUPtBvBNxVvQRq4AC_0IGg5m-G4,12003
-datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
+datachain/lib/hf.py,sha256=mYaHFPS4CW2-stRZHBMWW-NKN4dhrnhjZobBgRocnvo,5317
+datachain/lib/image.py,sha256=WbcwSaFzuyqjg4x4hH5CUogeUQjkZFjQHqw_oDEV1nA,2655
 datachain/lib/listing.py,sha256=nXLmGae_oQke4hnurzzWiHTEjHjWiqqHdB41Wb-hMTk,3521
 datachain/lib/meta_formats.py,sha256=Hels85LJmNCz1aYVJvhymNdAt3qdJ2-qoxsIiUezrow,7198
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
-datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
+datachain/lib/pytorch.py,sha256=vK3GbWCy7kunN7ubul6w1hrWmJLja56uTCiMG_7XVQA,5623
 datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
-datachain/lib/signal_schema.py,sha256=MS8qkOIl-3Qh3KyYTCtuSgF9nP5PeaGccbtGqfWo2wI,15902
-datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
-datachain/lib/udf.py,sha256=n3x6No-7l5LAciPJPWwZbA8WtTnGUU7d0wRL6CyfZh8,11847
+datachain/lib/signal_schema.py,sha256=rW1R6nIzdtmqWzpXk7aNAfrQD58_gbvkvEGyNTQ4WNM,20099
+datachain/lib/text.py,sha256=vqs1SQdsw1vCzfvOanIeT4xY2R2TmPonElBgYDVeZmY,1241
+datachain/lib/udf.py,sha256=nG7DDuPgZ5ZuijwvDoCq-OZMxlDM8vFNzyxMmik0Y1c,11716
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -60,20 +61,20 @@ datachain/lib/webdataset.py,sha256=SsjCKLSKEkHRRfeTHQhjoGqNPqIWw_SCWQcUwgUWWP0,8
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/convert/flatten.py,sha256=Uebc5CeqCsacp-nr6IG9i6OGuUavXqdqnoGctZBk3RQ,1384
-datachain/lib/convert/python_to_sql.py,sha256=4gplGlr_Kg-Z40OpJUzJiarDWj7pwbUOk-dPOYYCJ9Q,2629
+datachain/lib/convert/python_to_sql.py,sha256=40SAOdoOgikZRhn8iomCPDRoxC3RFxjJLivEAA9MHDU,2880
 datachain/lib/convert/sql_to_python.py,sha256=lGnKzSF_tz9Y_5SSKkrIU95QEjpcDzvOxIRkEKTQag0,443
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=YOdbjzHq-uj6-cV2Qq43G72eN2avMNDGl4x5t6yQMl8,3931
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=-vlpINJiertlnaoUVv1C95RatU0F6zuhpIYRufJRo1M,3660
 datachain/query/builtins.py,sha256=EmKPYsoQ46zwdyOn54MuCzvYFmfsBn5F8zyF7UBUfrc,2550
-datachain/query/dataset.py,sha256=4F_Q101Lbpc0YxOAcP3rc3GtKv8HwxpqF9lpJ0OoUEk,60818
+datachain/query/dataset.py,sha256=G6xA3ItIGUJTXhizdAb6S3L1zFwTf8I0w0jHa1A6F4A,61103
 datachain/query/dispatch.py,sha256=GBh3EZHDp5AaXxrjOpfrpfsuy7Umnqxu-MAXcK9X3gc,12945
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
 datachain/query/schema.py,sha256=BvHipN79CnSTbVFcfIEwzo1npe7HmThnk0iY-CSLEkM,7899
-datachain/query/session.py,sha256=PkOLANS0s8KPz4wO17tAab-CMzIt7FK8RPzJiibExds,4290
+datachain/query/session.py,sha256=UPH5Z4fzCDsvj81ji0e8GA6Mgra3bOAEpVq4htqOtis,4317
 datachain/query/udf.py,sha256=j3NhmKK5rYG5TclcM2Sr0LhS1tmYLMjzMugx9G9iFLM,8100
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
@@ -94,9 +95,9 @@ datachain/sql/sqlite/base.py,sha256=WLPHBhZbXbiqPoRV1VgDrXJqku4UuvJpBhYeQ0k5rI8,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.3.7.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.3.7.dist-info/METADATA,sha256=m7NZ31iEMCD2xOF8HZNp8YvGu05TmF_3UiZQQPUVmmc,16719
-datachain-0.3.7.dist-info/WHEEL,sha256=Mdi9PDNwEZptOjTlUcAth7XJDFtKrHYaQMPulZeBCiQ,91
-datachain-0.3.7.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.3.7.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.3.7.dist-info/RECORD,,
+datachain-0.3.8.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.3.8.dist-info/METADATA,sha256=ivteXQrJgp8dKgIO2pdwUj6Qdg96rbI3Gq0kx5fyxtk,16903
+datachain-0.3.8.dist-info/WHEEL,sha256=UvcQYKBHoFqaQd6LKyqHw9fxEolWLQnlzP0h_LgJAfI,91
+datachain-0.3.8.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.3.8.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.3.8.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (73.0.1)
+Generator: setuptools (74.0.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 