datachain 0.2.14__py3-none-any.whl → 0.2.15__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of datachain might be problematic.

datachain/data_storage/sqlite.py CHANGED
@@ -42,6 +42,7 @@ if TYPE_CHECKING:
     from sqlalchemy.dialects.sqlite import Insert
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql.elements import ColumnClause, ColumnElement, TextClause
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine


@@ -705,3 +706,23 @@ class SQLiteWarehouse(AbstractWarehouse):
         client_config=None,
     ) -> list[str]:
         raise NotImplementedError("Exporting dataset table not implemented for SQLite")
+
+    def create_pre_udf_table(self, query: "Select") -> "Table":
+        """
+        Create a temporary table from a query for use in a UDF.
+        """
+        columns = [
+            sqlalchemy.Column(c.name, c.type)
+            for c in query.selected_columns
+            if c.name != "sys__id"
+        ]
+        table = self.create_udf_table(columns)
+
+        select_q = query.with_only_columns(
+            *[c for c in query.selected_columns if c.name != "sys__id"]
+        )
+        self.db.execute(
+            table.insert().from_select(list(select_q.selected_columns), select_q)
+        )
+
+        return table
datachain/data_storage/warehouse.py CHANGED
@@ -2,6 +2,8 @@ import glob
 import json
 import logging
 import posixpath
+import random
+import string
 from abc import ABC, abstractmethod
 from collections.abc import Generator, Iterable, Iterator, Sequence
 from typing import TYPE_CHECKING, Any, Optional, Union
@@ -24,6 +26,7 @@ from datachain.utils import sql_escape_like
 if TYPE_CHECKING:
     from sqlalchemy.sql._typing import _ColumnsClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine

     from datachain.data_storage import AbstractIDGenerator, schema
@@ -252,6 +255,12 @@ class AbstractWarehouse(ABC, Serializable):
         prefix = self.DATASET_SOURCE_TABLE_PREFIX
         return f"{prefix}{dataset_name}_{version}"

+    def temp_table_name(self) -> str:
+        return self.TMP_TABLE_NAME_PREFIX + _random_string(6)
+
+    def udf_table_name(self) -> str:
+        return self.UDF_TABLE_NAME_PREFIX + _random_string(6)
+
     #
     # Datasets
     #
@@ -869,8 +878,8 @@ class AbstractWarehouse(ABC, Serializable):

     def create_udf_table(
         self,
-        name: str,
         columns: Sequence["sa.Column"] = (),
+        name: Optional[str] = None,
     ) -> "sa.Table":
         """
         Create a temporary table for storing custom signals generated by a UDF.
@@ -878,7 +887,7 @@ class AbstractWarehouse(ABC, Serializable):
         and UDFs are run in other processes when run in parallel.
         """
         tbl = sa.Table(
-            name,
+            name or self.udf_table_name(),
             sa.MetaData(),
             sa.Column("sys__id", Int, primary_key=True),
             *columns,
@@ -886,6 +895,12 @@ class AbstractWarehouse(ABC, Serializable):
         self.db.create_table(tbl, if_not_exists=True)
         return tbl

+    @abstractmethod
+    def create_pre_udf_table(self, query: "Select") -> "Table":
+        """
+        Create a temporary table from a query for use in a UDF.
+        """
+
     def is_temp_table_name(self, name: str) -> bool:
         """Returns if the given table name refers to a temporary
         or no longer needed table."""
@@ -937,3 +952,10 @@ class AbstractWarehouse(ABC, Serializable):
                 & (tq.c.is_latest == true())
             )
         )
+
+
+def _random_string(length: int) -> str:
+    return "".join(
+        random.choice(string.ascii_letters + string.digits)  # noqa: S311
+        for i in range(length)
+    )
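Illustrative sketch (not part of the diff) of how the relocated naming helpers and the reordered `create_udf_table()` signature are meant to be used, assuming `warehouse` is a concrete `AbstractWarehouse` implementation:

```python
# Minimal sketch, assuming `warehouse` is a concrete AbstractWarehouse instance.
import sqlalchemy as sa

# Random temp/UDF table names are now generated by the warehouse itself,
# e.g. something like "tmp_x9Y8z7" and "udf_a1B2c3".
tmp_name = warehouse.temp_table_name()
udf_name = warehouse.udf_table_name()

# `columns` now comes first and `name` is optional; omitting it falls back to
# warehouse.udf_table_name() inside create_udf_table().
signals_tbl = warehouse.create_udf_table([sa.Column("score", sa.Float)])
```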
datachain/lib/arrow.py CHANGED
@@ -10,13 +10,17 @@ from datachain.lib.file import File, IndexedFile
 from datachain.lib.udf import Generator

 if TYPE_CHECKING:
+    from pydantic import BaseModel
+
     from datachain.lib.dc import DataChain


 class ArrowGenerator(Generator):
     def __init__(
         self,
-        schema: Optional["pa.Schema"] = None,
+        input_schema: Optional["pa.Schema"] = None,
+        output_schema: Optional[type["BaseModel"]] = None,
+        source: bool = True,
         nrows: Optional[int] = None,
         **kwargs,
     ):
@@ -25,24 +29,36 @@ class ArrowGenerator(Generator):

        Parameters:

-       schema : Optional pyarrow schema for validation.
+       input_schema : Optional pyarrow schema for validation.
+       output_schema : Optional pydantic model for validation.
+       source : Whether to include info about the source file.
        nrows : Optional row limit.
        kwargs: Parameters to pass to pyarrow.dataset.dataset.
        """
        super().__init__()
-       self.schema = schema
+       self.input_schema = input_schema
+       self.output_schema = output_schema
+       self.source = source
        self.nrows = nrows
        self.kwargs = kwargs

    def process(self, file: File):
        path = file.get_path()
-       ds = dataset(path, filesystem=file.get_fs(), schema=self.schema, **self.kwargs)
+       ds = dataset(
+           path, filesystem=file.get_fs(), schema=self.input_schema, **self.kwargs
+       )
        index = 0
        with tqdm(desc="Parsed by pyarrow", unit=" rows") as pbar:
-           for record_batch in ds.to_batches():
+           for record_batch in ds.to_batches(use_threads=False):
                for record in record_batch.to_pylist():
-                   source = IndexedFile(file=file, index=index)
-                   yield [source, *record.values()]
+                   vals = list(record.values())
+                   if self.output_schema:
+                       fields = self.output_schema.model_fields
+                       vals = [self.output_schema(**dict(zip(fields, vals)))]
+                   if self.source:
+                       yield [IndexedFile(file=file, index=index), *vals]
+                   else:
+                       yield vals
                    index += 1
                    if self.nrows and index >= self.nrows:
                        return
@@ -76,7 +92,10 @@ def schema_to_output(schema: pa.Schema, col_names: Optional[Sequence[str]] = Non
        if not column:
            column = f"c{default_column}"
            default_column += 1
-       output[column] = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+       dtype = _arrow_type_mapper(field.type)  # type: ignore[assignment]
+       if field.nullable:
+           dtype = Optional[dtype]  # type: ignore[assignment]
+       output[column] = dtype

    return output
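A usage sketch for the renamed and extended `ArrowGenerator` arguments; the pyarrow schema and pydantic model below are hypothetical, not taken from the diff:

```python
# Sketch only: hypothetical schemas for the new ArrowGenerator parameters.
import pyarrow as pa
from pydantic import BaseModel

from datachain.lib.arrow import ArrowGenerator


class Row(BaseModel):  # hypothetical output model
    name: str
    size: int


gen = ArrowGenerator(
    input_schema=pa.schema([("name", pa.string()), ("size", pa.int64())]),
    output_schema=Row,  # each record is wrapped into a single Row object
    source=False,       # drop the IndexedFile source info from yielded rows
    nrows=100,
    format="parquet",   # forwarded to pyarrow.dataset.dataset via **kwargs
)
```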
datachain/lib/convert/flatten.py CHANGED
@@ -41,17 +41,22 @@ def flatten_list(obj_list):
     )


+def _flatten_list_field(value: list):
+    assert isinstance(value, list)
+    if value and ModelStore.is_pydantic(type(value[0])):
+        return [val.model_dump() for val in value]
+    if value and isinstance(value[0], list):
+        return [_flatten_list_field(v) for v in value]
+    return value
+
+
 def _flatten_fields_values(fields, obj: BaseModel):
     for name, f_info in fields.items():
         anno = f_info.annotation
         # Optimization: Access attributes directly to skip the model_dump() call.
         value = getattr(obj, name)
-
         if isinstance(value, list):
-            if value and ModelStore.is_pydantic(type(value[0])):
-                yield [val.model_dump() for val in value]
-            else:
-                yield value
+            yield _flatten_list_field(value)
         elif isinstance(value, dict):
             yield {
                 key: val.model_dump() if ModelStore.is_pydantic(type(val)) else val
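A behaviour sketch for the new `_flatten_list_field` helper, assuming `ModelStore.is_pydantic` treats plain `BaseModel` subclasses as pydantic models:

```python
# Sketch of the recursive flattening added here.
from pydantic import BaseModel

from datachain.lib.convert.flatten import _flatten_list_field


class Point(BaseModel):  # hypothetical model
    x: int
    y: int


_flatten_list_field([Point(x=1, y=2)])    # -> [{"x": 1, "y": 2}]
_flatten_list_field([[Point(x=1, y=2)]])  # nested lists are now handled recursively
_flatten_list_field([1, 2, 3])            # plain values pass through unchanged
```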
datachain/lib/convert/python_to_sql.py CHANGED
@@ -82,7 +82,7 @@ def python_to_sql(typ):  # noqa: PLR0911
 def _is_json_inside_union(orig, args) -> bool:
     if orig == Union and len(args) >= 2:
         # List in JSON: Union[dict, list[dict]]
-        args_no_nones = [arg for arg in args if arg != type(None)]
+        args_no_nones = [arg for arg in args if arg != type(None)]  # noqa: E721
         if len(args_no_nones) == 2:
             args_no_dicts = [arg for arg in args_no_nones if arg is not dict]
             if len(args_no_dicts) == 1 and get_origin(args_no_dicts[0]) is list:
datachain/lib/data_model.py CHANGED
@@ -47,7 +47,12 @@ def is_chain_type(t: type) -> bool:
     if any(t is ft or t is get_args(ft)[0] for ft in get_args(StandardType)):
         return True

-    if get_origin(t) is list and len(get_args(t)) == 1:
+    orig = get_origin(t)
+    args = get_args(t)
+    if orig is list and len(args) == 1:
         return is_chain_type(get_args(t)[0])

+    if orig is Union and len(args) == 2 and (type(None) in args):
+        return is_chain_type(args[0])
+
     return False
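A short sketch of what the added `Union`/`Optional` branch enables:

```python
# Sketch: Optional[...] of a supported type is now accepted by is_chain_type().
from typing import Optional

from datachain.lib.data_model import is_chain_type

is_chain_type(Optional[str])        # now True: the None arm is stripped first
is_chain_type(Optional[list[str]])  # also True, via the existing list branch
```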
datachain/lib/dc.py CHANGED
@@ -33,6 +33,7 @@ from datachain.lib.settings import Settings
 from datachain.lib.signal_schema import SignalSchema
 from datachain.lib.udf import (
     Aggregator,
+    BatchMapper,
     Generator,
     Mapper,
     UDFBase,
@@ -192,6 +193,8 @@ class DataChain(DatasetQuery):
     ```
     """

+    max_row_count: Optional[int] = None
+
     DEFAULT_FILE_RECORD: ClassVar[dict] = {
         "source": "",
         "name": "",
@@ -237,7 +240,6 @@ class DataChain(DatasetQuery):
     def settings(
         self,
         cache=None,
-        batch=None,
         parallel=None,
         workers=None,
         min_task_size=None,
@@ -250,7 +252,6 @@ class DataChain(DatasetQuery):

        Parameters:
            cache : data caching (default=False)
-           batch : size of the batch (default=1000)
            parallel : number of thread for processors. True is a special value to
                enable all available CPUs (default=1)
            workers : number of distributed workers. Only for Studio mode. (default=1)
@@ -268,7 +269,7 @@ class DataChain(DatasetQuery):
        chain = self.clone()
        if sys is not None:
            chain._sys = sys
-       chain._settings.add(Settings(cache, batch, parallel, workers, min_task_size))
+       chain._settings.add(Settings(cache, parallel, workers, min_task_size))
        return chain

    def reset_settings(self, settings: Optional[Settings] = None) -> "Self":
@@ -344,7 +345,7 @@ class DataChain(DatasetQuery):
        jmespath: Optional[str] = None,
        object_name: Optional[str] = "",
        model_name: Optional[str] = None,
-       show_schema: Optional[bool] = False,
+       print_schema: Optional[bool] = False,
        meta_type: Optional[str] = "json",
        nrows=None,
        **kwargs,
@@ -359,7 +360,7 @@ class DataChain(DatasetQuery):
            schema_from : path to sample to infer spec (if schema not provided)
            object_name : generated object column name
            model_name : optional generated model name
-           show_schema : print auto-generated schema
+           print_schema : print auto-generated schema
            jmespath : optional JMESPATH expression to reduce JSON
            nrows : optional row limit for jsonl and JSON arrays

@@ -392,7 +393,7 @@ class DataChain(DatasetQuery):
            meta_type=meta_type,
            spec=spec,
            model_name=model_name,
-           show_schema=show_schema,
+           print_schema=print_schema,
            jmespath=jmespath,
            nrows=nrows,
        )
@@ -409,7 +410,7 @@ class DataChain(DatasetQuery):
        jmespath: Optional[str] = None,
        object_name: Optional[str] = "",
        model_name: Optional[str] = None,
-       show_schema: Optional[bool] = False,
+       print_schema: Optional[bool] = False,
        meta_type: Optional[str] = "jsonl",
        nrows=None,
        **kwargs,
@@ -424,7 +425,7 @@ class DataChain(DatasetQuery):
            schema_from : path to sample to infer spec (if schema not provided)
            object_name : generated object column name
            model_name : optional generated model name
-           show_schema : print auto-generated schema
+           print_schema : print auto-generated schema
            jmespath : optional JMESPATH expression to reduce JSON
            nrows : optional row limit for jsonl and JSON arrays

@@ -452,7 +453,7 @@ class DataChain(DatasetQuery):
            meta_type=meta_type,
            spec=spec,
            model_name=model_name,
-           show_schema=show_schema,
+           print_schema=print_schema,
            jmespath=jmespath,
            nrows=nrows,
        )
@@ -488,7 +489,7 @@ class DataChain(DatasetQuery):
            **{object_name: datasets},  # type: ignore[arg-type]
        )

-   def show_json_schema(  # type: ignore[override]
+   def print_json_schema(  # type: ignore[override]
        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
    ) -> "DataChain":
        """Print JSON data model and save it. It returns the chain itself.
@@ -513,7 +514,7 @@ class DataChain(DatasetQuery):
            output=str,
        )

-   def show_jsonl_schema(  # type: ignore[override]
+   def print_jsonl_schema(  # type: ignore[override]
        self, jmespath: Optional[str] = None, model_name: Optional[str] = None
    ) -> "DataChain":
        """Print JSON data model and save it. It returns the chain itself.
@@ -598,14 +599,16 @@ class DataChain(DatasetQuery):

        Using func and output as a map:
        ```py
-       chain = chain.map(lambda name: name[:-4] + ".json", output={"res": str})
+       chain = chain.map(
+           lambda name: name.split("."), output={"stem": str, "ext": str}
+       )
        chain.save("new_dataset")
        ```
        """
        udf_obj = self._udf_to_obj(Mapper, func, params, output, signal_map)

        chain = self.add_signals(
-           udf_obj.to_udf_wrapper(self._settings.batch),
+           udf_obj.to_udf_wrapper(),
            **self._settings.to_dict(),
        )
@@ -618,7 +621,7 @@ class DataChain(DatasetQuery):
        output: OutputType = None,
        **signal_map,
    ) -> "Self":
-       """Apply a function to each row to create new rows (with potentially new
+       r"""Apply a function to each row to create new rows (with potentially new
        signals). The function needs to return a new objects for each of the new rows.
        It returns a chain itself with new signals.

@@ -628,11 +631,20 @@ class DataChain(DatasetQuery):
        one key differences: It produces a sequence of rows for each input row (like
        extracting multiple file records from a single tar file or bounding boxes from a
        single image file).
+
+       Example:
+       ```py
+       chain = chain.gen(
+           line=lambda file: [l for l in file.read().split("\n")],
+           output=str,
+       )
+       chain.save("new_dataset")
+       ```
        """
        udf_obj = self._udf_to_obj(Generator, func, params, output, signal_map)
        chain = DatasetQuery.generate(
            self,
-           udf_obj.to_udf_wrapper(self._settings.batch),
+           udf_obj.to_udf_wrapper(),
            **self._settings.to_dict(),
        )

@@ -652,23 +664,68 @@ class DataChain(DatasetQuery):

        Input-output relationship: N:M

-       This method bears similarity to `gen()` and map(), employing a comparable set of
-       parameters, yet differs in two crucial aspects:
+       This method bears similarity to `gen()` and `map()`, employing a comparable set
+       of parameters, yet differs in two crucial aspects:
        1. The `partition_by` parameter: This specifies the column name or a list of
           column names that determine the grouping criteria for aggregation.
        2. Group-based UDF function input: Instead of individual rows, the function
           receives a list all rows within each group defined by `partition_by`.
+
+       Example:
+       ```py
+       chain = chain.agg(
+           total=lambda category, amount: [sum(amount)],
+           output=float,
+           partition_by="category",
+       )
+       chain.save("new_dataset")
+       ```
        """
        udf_obj = self._udf_to_obj(Aggregator, func, params, output, signal_map)
        chain = DatasetQuery.generate(
            self,
-           udf_obj.to_udf_wrapper(self._settings.batch),
+           udf_obj.to_udf_wrapper(),
            partition_by=partition_by,
            **self._settings.to_dict(),
        )

        return chain.reset_schema(udf_obj.output).reset_settings(self._settings)

+   def batch_map(
+       self,
+       func: Optional[Callable] = None,
+       params: Union[None, str, Sequence[str]] = None,
+       output: OutputType = None,
+       batch: int = 1000,
+       **signal_map,
+   ) -> "Self":
+       """This is a batch version of `map()`.
+
+       Input-output relationship: N:N
+
+       It accepts the same parameters plus an
+       additional parameter:
+
+           batch : Size of each batch passed to `func`. Defaults to 1000.
+
+       Example:
+       ```py
+       chain = chain.batch_map(
+           sqrt=lambda size: np.sqrt(size),
+           output=float
+       )
+       chain.save("new_dataset")
+       ```
+       """
+       udf_obj = self._udf_to_obj(BatchMapper, func, params, output, signal_map)
+       chain = DatasetQuery.add_signals(
+           self,
+           udf_obj.to_udf_wrapper(batch),
+           **self._settings.to_dict(),
+       )
+
+       return chain.add_schema(udf_obj.output).reset_settings(self._settings)
+
    def _udf_to_obj(
        self,
        target_class: type[UDFBase],
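An illustrative call of the new `batch_map()`; the storage path and signal names are hypothetical, and as in `map()`, parameter names refer to existing signals:

```python
# Sketch: batch_map() passes whole columns (one sequence per signal) to the
# function and expects one output value per input row.
from datachain.lib.dc import DataChain

chain = (
    DataChain.from_storage("s3://bucket/files/")  # hypothetical source
    .batch_map(
        name_len=lambda name: [len(n) for n in name],  # one result per row
        output=int,
        batch=500,  # rows per call; defaults to 1000
    )
)
chain.save("files_with_name_len")
```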
@@ -1176,6 +1233,7 @@ class DataChain(DatasetQuery):
        output: OutputType = None,
        object_name: str = "",
        model_name: str = "",
+       source: bool = True,
        nrows: Optional[int] = None,
        **kwargs,
    ) -> "DataChain":
@@ -1187,8 +1245,9 @@ class DataChain(DatasetQuery):
                case types will be inferred.
            object_name : Generated object column name.
            model_name : Generated model name.
-           kwargs : Parameters to pass to pyarrow.dataset.dataset.
+           source : Whether to include info about the source file.
            nrows : Optional row limit.
+           kwargs : Parameters to pass to pyarrow.dataset.dataset.

        Example:
            Reading a json lines file:
@@ -1215,18 +1274,24 @@ class DataChain(DatasetQuery):
        except ValueError as e:
            raise DatasetPrepareError(self.name, e) from e

+       if isinstance(output, dict):
+           model_name = model_name or object_name or ""
+           model = DataChain._dict_to_data_model(model_name, output)
+       else:
+           model = output  # type: ignore[assignment]
+
        if object_name:
-           if isinstance(output, dict):
-               model_name = model_name or object_name
-               output = DataChain._dict_to_data_model(model_name, output)
-           output = {object_name: output}  # type: ignore[dict-item]
+           output = {object_name: model}  # type: ignore[dict-item]
        elif isinstance(output, type(BaseModel)):
            output = {
                name: info.annotation  # type: ignore[misc]
                for name, info in output.model_fields.items()
            }
-       output = {"source": IndexedFile} | output  # type: ignore[assignment,operator]
-       return self.gen(ArrowGenerator(schema, nrows, **kwargs), output=output)
+       if source:
+           output = {"source": IndexedFile} | output  # type: ignore[assignment,operator]
+       return self.gen(
+           ArrowGenerator(schema, model, source, nrows, **kwargs), output=output
+       )

    @staticmethod
    def _dict_to_data_model(
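A sketch of the new `source` switch on tabular parsing (the path is hypothetical):

```python
# Sketch: source=False skips the IndexedFile "source" column in the output schema.
from datachain.lib.dc import DataChain

chain = DataChain.from_storage("s3://bucket/tables/")            # hypothetical source
with_src = chain.parse_tabular(format="parquet")                 # keeps the source column
data_only = chain.parse_tabular(format="parquet", source=False)  # data columns only
```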
@@ -1245,10 +1310,10 @@ class DataChain(DatasetQuery):
        path,
        delimiter: str = ",",
        header: bool = True,
-       column_names: Optional[list[str]] = None,
        output: OutputType = None,
        object_name: str = "",
        model_name: str = "",
+       source: bool = True,
        nrows=None,
        **kwargs,
    ) -> "DataChain":
@@ -1264,6 +1329,7 @@ class DataChain(DatasetQuery):
                case types will be inferred.
            object_name : Created object column name.
            model_name : Generated model name.
+           source : Whether to include info about the source file.
            nrows : Optional row limit.

        Example:
@@ -1282,6 +1348,7 @@ class DataChain(DatasetQuery):

        chain = DataChain.from_storage(path, **kwargs)

+       column_names = None
        if not header:
            if not output:
                msg = "error parsing csv - provide output if no header"
@@ -1303,6 +1370,7 @@ class DataChain(DatasetQuery):
            output=output,
            object_name=object_name,
            model_name=model_name,
+           source=source,
            nrows=nrows,
            format=format,
        )
@@ -1315,6 +1383,7 @@ class DataChain(DatasetQuery):
        output: Optional[dict[str, DataType]] = None,
        object_name: str = "",
        model_name: str = "",
+       source: bool = True,
        nrows=None,
        **kwargs,
    ) -> "DataChain":
@@ -1327,6 +1396,7 @@ class DataChain(DatasetQuery):
            output : Dictionary defining column names and their corresponding types.
            object_name : Created object column name.
            model_name : Generated model name.
+           source : Whether to include info about the source file.
            nrows : Optional row limit.

        Example:
@@ -1345,6 +1415,7 @@ class DataChain(DatasetQuery):
            output=output,
            object_name=object_name,
            model_name=model_name,
+           source=source,
            nrows=None,
            format="parquet",
            partitioning=partitioning,
1531
1602
  @detach
1532
1603
  def limit(self, n: int) -> "Self":
1533
1604
  """Return the first n rows of the chain."""
1534
- return super().limit(n)
1605
+ n = max(n, 0)
1606
+
1607
+ if self.max_row_count is None:
1608
+ self.max_row_count = n
1609
+ return super().limit(n)
1610
+
1611
+ limit = min(n, self.max_row_count)
1612
+ if limit == self.max_row_count:
1613
+ return self
1614
+
1615
+ self.max_row_count = limit
1616
+ return super().limit(self.max_row_count)
1535
1617
 
1536
1618
  @detach
1537
1619
  def offset(self, offset: int) -> "Self":
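The reworked `limit()` keeps the most restrictive limit seen so far; a behaviour sketch:

```python
# Sketch: repeated limits can only narrow the chain, never widen it.
from datachain.lib.dc import DataChain

chain = DataChain.from_storage("s3://bucket/files/")  # hypothetical source

first10 = chain.limit(100).limit(10)  # effectively limit(10)
still10 = chain.limit(10).limit(100)  # the larger value is ignored
empty = chain.limit(-5)               # negative values are clamped to 0
```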
datachain/lib/meta_formats.py CHANGED
@@ -101,7 +101,7 @@ def read_meta(  # noqa: C901
    schema_from=None,
    meta_type="json",
    jmespath=None,
-   show_schema=False,
+   print_schema=False,
    model_name=None,
    nrows=None,
 ) -> Callable:
@@ -129,7 +129,7 @@ def read_meta(  # noqa: C901
    model_output = captured_output.getvalue()
    captured_output.close()

-   if show_schema:
+   if print_schema:
        print(f"{model_output}")
    # Below 'spec' should be a dynamically converted DataModel from Pydantic
    if not spec:
@@ -153,13 +153,13 @@ def read_meta(  # noqa: C901
        jmespath=jmespath,
        nrows=nrows,
    ) -> Iterator[spec]:
-       def validator(json_object: dict) -> spec:
+       def validator(json_object: dict, nrow=0) -> spec:
            json_string = json.dumps(json_object)
            try:
                data_instance = data_model.model_validate_json(json_string)
                yield data_instance
            except ValidationError as e:
-               print(f"Validation error occurred in file {file.name}:", e)
+               print(f"Validation error occurred in row {nrow} file {file.name}:", e)

        if meta_type == "csv":
            with (
@@ -185,7 +185,7 @@ def read_meta(  # noqa: C901
                nrow = nrow + 1
                if nrows is not None and nrow > nrows:
                    return
-               yield from validator(json_dict)
+               yield from validator(json_dict, nrow)

        if meta_type == "jsonl":
            try:
@@ -198,7 +198,7 @@ def read_meta(  # noqa: C901
                    return
                json_object = process_json(data_string, jmespath)
                data_string = fd.readline()
-               yield from validator(json_object)
+               yield from validator(json_object, nrow)
        except OSError as e:
            print(f"An unexpected file error occurred in file {file.name}: {e}")

datachain/lib/settings.py CHANGED
@@ -7,11 +7,8 @@ class SettingsError(DataChainParamsError):


 class Settings:
-    def __init__(
-        self, cache=None, batch=None, parallel=None, workers=None, min_task_size=None
-    ):
+    def __init__(self, cache=None, parallel=None, workers=None, min_task_size=None):
         self._cache = cache
-        self._batch = batch
         self.parallel = parallel
         self._workers = workers
         self.min_task_size = min_task_size
@@ -22,12 +19,6 @@ class Settings:
                f" while {cache.__class__.__name__} was given"
            )

-       if not isinstance(batch, int) and batch is not None:
-           raise SettingsError(
-               "'batch' argument must be int or None"
-               f" while {batch.__class__.__name__} was given"
-           )
-
        if not isinstance(parallel, int) and parallel is not None:
            raise SettingsError(
                "'parallel' argument must be int or None"
@@ -54,10 +45,6 @@ class Settings:
    def cache(self):
        return self._cache if self._cache is not None else False

-   @property
-   def batch(self):
-       return self._batch if self._batch is not None else 1
-
    @property
    def workers(self):
        return self._workers if self._workers is not None else False
@@ -66,8 +53,6 @@ class Settings:
        res = {}
        if self._cache is not None:
            res["cache"] = self.cache
-       if self._batch is not None:
-           res["batch"] = self.batch
        if self.parallel is not None:
            res["parallel"] = self.parallel
        if self._workers is not None:
@@ -78,7 +63,6 @@ class Settings:

    def add(self, settings: "Settings"):
        self._cache = settings._cache or self._cache
-       self._batch = settings._batch or self._batch
        self.parallel = settings.parallel or self.parallel
        self._workers = settings._workers or self._workers
        self.min_task_size = settings.min_task_size or self.min_task_size
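With `batch` removed from `Settings`, batching is configured where it is used; a sketch (path and signal name are hypothetical):

```python
# Sketch: chain-level settings no longer take `batch`; pass it to batch_map() instead.
from datachain.lib.dc import DataChain

chain = DataChain.from_storage("s3://bucket/files/")  # hypothetical source
chain = chain.settings(cache=True, parallel=4)        # no batch= argument anymore
chain = chain.batch_map(
    doubled=lambda size: [s * 2 for s in size],
    output=int,
    batch=2000,
)
```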
datachain/lib/udf.py CHANGED
@@ -225,11 +225,10 @@ class UDFBase(AbstractUDF):
    def __call__(self, *rows, cache, download_cb):
        if self.is_input_grouped:
            objs = self._parse_grouped_rows(rows[0], cache, download_cb)
+       elif self.is_input_batched:
+           objs = zip(*self._parse_rows(rows[0], cache, download_cb))
        else:
-           objs = self._parse_rows(rows, cache, download_cb)
-
-       if not self.is_input_batched:
-           objs = objs[0]
+           objs = self._parse_rows([rows], cache, download_cb)[0]

        result_objs = self.process_safe(objs)

@@ -259,17 +258,24 @@ class UDFBase(AbstractUDF):

        if not self.is_output_batched:
            res = list(res)
-           assert len(res) == 1, (
-               f"{self.name} returns {len(res)} " f"rows while it's not batched"
-           )
+           assert (
+               len(res) == 1
+           ), f"{self.name} returns {len(res)} rows while it's not batched"
            if isinstance(res[0], tuple):
                res = res[0]
+       elif (
+           self.is_input_batched
+           and self.is_output_batched
+           and not self.is_input_grouped
+       ):
+           res = list(res)
+           assert len(res) == len(
+               rows[0]
+           ), f"{self.name} returns {len(res)} rows while len(rows[0]) expected"

        return res

    def _parse_rows(self, rows, cache, download_cb):
-       if not self.is_input_batched:
-           rows = [rows]
        objs = []
        for row in rows:
            obj_row = self.params.row_to_objs(row)
@@ -330,7 +336,9 @@ class Mapper(UDFBase):
    """Inherit from this class to pass to `DataChain.map()`."""


-class BatchMapper(Mapper):
+class BatchMapper(UDFBase):
+    """Inherit from this class to pass to `DataChain.batch_map()`."""
+
    is_input_batched = True
    is_output_batched = True
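Since `BatchMapper` now derives from `UDFBase` directly, class-based batch UDFs subclass it and implement `process()` over whole columns; a minimal sketch (the signal name is illustrative):

```python
# Sketch: a class-based UDF for DataChain.batch_map().
from collections.abc import Sequence

from datachain.lib.udf import BatchMapper


class NameLength(BatchMapper):
    def process(self, name: Sequence[str]) -> Sequence[int]:
        # one sequence per input signal comes in; one value per row goes out
        return [len(n) for n in name]
```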
datachain/query/dataset.py CHANGED
@@ -262,9 +262,7 @@ class DatasetDiffOperation(Step):
        temp_tables.extend(self.dq.temp_table_names)

        # creating temp table that will hold subtract results
-       temp_table_name = self.catalog.warehouse.TMP_TABLE_NAME_PREFIX + _random_string(
-           6
-       )
+       temp_table_name = self.catalog.warehouse.temp_table_name()
        temp_tables.append(temp_table_name)

        columns = [
@@ -448,9 +446,6 @@ class UDFStep(Step, ABC):
        to select
        """

-   def udf_table_name(self) -> str:
-       return self.catalog.warehouse.UDF_TABLE_NAME_PREFIX + _random_string(6)
-
    def populate_udf_table(self, udf_table: "Table", query: Select) -> None:
        use_partitioning = self.partition_by is not None
        batching = self.udf.properties.get_batching(use_partitioning)
@@ -574,9 +569,7 @@ class UDFStep(Step, ABC):
            list_partition_by = [self.partition_by]

        # create table with partitions
-       tbl = self.catalog.warehouse.create_udf_table(
-           self.udf_table_name(), partition_columns()
-       )
+       tbl = self.catalog.warehouse.create_udf_table(partition_columns())

        # fill table with partitions
        cols = [
@@ -638,37 +631,12 @@ class UDFSignal(UDFStep):
            for (col_name, col_type) in self.udf.output.items()
        ]

-       return self.catalog.warehouse.create_udf_table(
-           self.udf_table_name(), udf_output_columns
-       )
-
-   def create_pre_udf_table(self, query: Select) -> "Table":
-       columns = [
-           sqlalchemy.Column(c.name, c.type)
-           for c in query.selected_columns
-           if c.name != "sys__id"
-       ]
-       table = self.catalog.warehouse.create_udf_table(self.udf_table_name(), columns)
-       select_q = query.with_only_columns(
-           *[c for c in query.selected_columns if c.name != "sys__id"]
-       )
-
-       # if there is order by clause we need row_number to preserve order
-       # if there is no order by clause we still need row_number to generate
-       # unique ids as uniqueness is important for this table
-       select_q = select_q.add_columns(
-           f.row_number().over(order_by=select_q._order_by_clauses).label("sys__id")
-       )
-
-       self.catalog.warehouse.db.execute(
-           table.insert().from_select(list(select_q.selected_columns), select_q)
-       )
-       return table
+       return self.catalog.warehouse.create_udf_table(udf_output_columns)

    def process_input_query(self, query: Select) -> tuple[Select, list["Table"]]:
        if os.getenv("DATACHAIN_DISABLE_QUERY_CACHE", "") not in ("", "0"):
            return query, []
-       table = self.create_pre_udf_table(query)
+       table = self.catalog.warehouse.create_pre_udf_table(query)
        q: Select = sqlalchemy.select(*table.c)
        if query._order_by_clauses:
            # we are adding ordering only if it's explicitly added by user in
@@ -732,7 +700,7 @@ class RowGenerator(UDFStep):
    def create_udf_table(self, query: Select) -> "Table":
        warehouse = self.catalog.warehouse

-       table_name = self.udf_table_name()
+       table_name = self.catalog.warehouse.udf_table_name()
        columns: tuple[Column, ...] = tuple(
            Column(name, typ) for name, typ in self.udf.output.items()
        )
@@ -1802,10 +1770,3 @@ def query_wrapper(dataset_query: DatasetQuery) -> DatasetQuery:

    _send_result(dataset_query)
    return dataset_query
-
-
-def _random_string(length: int) -> str:
-    return "".join(
-        random.choice(string.ascii_letters + string.digits)  # noqa: S311
-        for i in range(length)
-    )
datachain/sql/types.py CHANGED
@@ -12,6 +12,7 @@ for sqlite we can use `sqlite.register_converter`
 ( https://docs.python.org/3/library/sqlite3.html#sqlite3.register_converter )
 """

+import json
 from datetime import datetime
 from types import MappingProxyType
 from typing import Any, Union
@@ -247,7 +248,10 @@ class Array(SQLType):
        return type_defaults(dialect).array()

    def on_read_convert(self, value, dialect):
-       return read_converter(dialect).array(value, self.item_type, dialect)
+       r = read_converter(dialect).array(value, self.item_type, dialect)
+       if isinstance(self.item_type, JSON):
+           r = [json.loads(item) if isinstance(item, str) else item for item in r]
+       return r


 class JSON(SQLType):
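The `Array` change decodes JSON items that arrive from the database as strings; the added step boils down to:

```python
# Sketch of the decoding added in Array.on_read_convert for JSON item types.
import json

raw = ['{"a": 1}', {"b": 2}]  # items may come back encoded or already decoded
decoded = [json.loads(item) if isinstance(item, str) else item for item in raw]
assert decoded == [{"a": 1}, {"b": 2}]
```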
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: datachain
-Version: 0.2.14
+Version: 0.2.15
 Summary: Wrangle unstructured AI data at scale
 Author-email: Dmitry Petrov <support@dvc.org>
 License: Apache-2.0
@@ -35,38 +35,38 @@ datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s
 datachain/data_storage/metastore.py,sha256=wVcT8MiSH_paWEXN6eZ8Z3msrHY6vWtVFTH5kwHteRE,54852
 datachain/data_storage/schema.py,sha256=FQvt5MUMSnI5ZAE7Nthae4aaJpt8JC4nH8KiWDuhJkk,8135
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
-datachain/data_storage/sqlite.py,sha256=i4h8ZY15A2YNXd2PU5BZPoRaBqqs9lOdPtBjC0BZy3s,24935
-datachain/data_storage/warehouse.py,sha256=fQO6UZc2MFgFPRnpCQW7c1GCl3FJBYE4dtA_ZXWuA8M,32627
+datachain/data_storage/sqlite.py,sha256=w0d_cZ2u9LpQYFFXll22mnxHaxPOoJdHlsKAZmONQpA,25605
+datachain/data_storage/warehouse.py,sha256=WGHWBuBmNmK-qHwhvMfAwtXZ-fQKwk8w1dadN_4dugA,33293
 datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/arrow.py,sha256=WBZ4iVU0CcmCgog1wS-Nrtqhzvf2I4_QqDJtzhaECeA,3641
+datachain/lib/arrow.py,sha256=9C5AVH6tLo9hwzav-1tLLnmWP-3_SReYCOfcOC54pu0,4437
 datachain/lib/clip.py,sha256=16u4b_y2Y15nUS2UN_8ximMo6r_-_4IQpmct2ol-e-g,5730
-datachain/lib/data_model.py,sha256=jPYDmTYbixy4LhdToOyvldYGYZxblhp6Tn4MF-VAd-o,1495
+datachain/lib/data_model.py,sha256=qfTtQNncS5pt9SvXdMEa5kClniaT6XBGBfO7onEz2TI,1632
 datachain/lib/dataset_info.py,sha256=lONGr71ozo1DS4CQEhnpKORaU4qFb6Ketv8Xm8CVm2U,2188
-datachain/lib/dc.py,sha256=I3BLJJK17kB8velBSCTjtoR8CcPZOHPgFTibS9OclmY,54155
+datachain/lib/dc.py,sha256=alJwK7z5JoUmGc1Kj74dGtlH2MJ0jeSyS2dnInemnnA,56386
 datachain/lib/file.py,sha256=n9GBmZ1CjzDjHkbUBsUrs8JOJrAoh3MV2Cc8hBkex20,11957
 datachain/lib/image.py,sha256=TgYhRhzd4nkytfFMeykQkPyzqb5Le_-tU81unVMPn4Q,2328
-datachain/lib/meta_formats.py,sha256=WRjUzaBKo0IJFHhKz7dxzAKXjR4OvuzsLjkdjyewL6Q,7001
+datachain/lib/meta_formats.py,sha256=jlSYWRUeDMjun_YCsQ2JxyaDJpEpokzHDPmKUAoCXnU,7034
 datachain/lib/model_store.py,sha256=c4USXsBBjrGH8VOh4seIgOiav-qHOwdoixtxfLgU63c,2409
 datachain/lib/pytorch.py,sha256=9PsypKseyKfIimTmTQOgb-pbNXgeeAHLdlWx0qRPULY,5660
-datachain/lib/settings.py,sha256=6Nkoh8riETrftYwDp3aniK53Dsjc07MdztL8N0cW1D8,2849
+datachain/lib/settings.py,sha256=39thOpYJw-zPirzeNO6pmRC2vPrQvt4eBsw1xLWDFsw,2344
 datachain/lib/signal_schema.py,sha256=lKGlpRRUHOUFLcpk-pLQd9kGAJ8FPy0Q2bk--UlVemU,14559
 datachain/lib/text.py,sha256=dVe2Ilc_gW2EV0kun0UwegiCkapWcd20cef7CgINWHU,1083
-datachain/lib/udf.py,sha256=mo3NoyYy7fY2UZtZOtAN_jR1e5a803b1dlnD5ztduzk,11454
+datachain/lib/udf.py,sha256=IjuDt2B8E3xEHhcJnaK_ZhmivdrOYPXz5uf7ylpktws,11815
 datachain/lib/udf_signature.py,sha256=gMStcEeYJka5M6cg50Z9orC6y6HzCAJ3MkFqqn1fjZg,7137
 datachain/lib/utils.py,sha256=5-kJlAZE0D9nXXweAjo7-SP_AWGo28feaDByONYaooQ,463
 datachain/lib/vfile.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/webdataset.py,sha256=nIa6ubv94CwnATeeSdE7f_F9Zkz9LuBTfbXvFg3_-Ak,8295
 datachain/lib/webdataset_laion.py,sha256=PQP6tQmUP7Xu9fPuAGK1JDBYA6T5UufYMUTGaxgspJA,2118
 datachain/lib/convert/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/lib/convert/flatten.py,sha256=vrj2Kg-I1YAq2OGAFIwFUqtIesGpweve3c1ipeFOvDQ,1615
-datachain/lib/convert/python_to_sql.py,sha256=54G6dsMhxo1GKCzPziOqCKo2d4VRWmsJhJYRJxt1Thw,2615
+datachain/lib/convert/flatten.py,sha256=YMoC00BqEy3zSpvCp6Q0DfxihuPmgjUJj1g2cesWGPs,1790
+datachain/lib/convert/python_to_sql.py,sha256=4gplGlr_Kg-Z40OpJUzJiarDWj7pwbUOk-dPOYYCJ9Q,2629
 datachain/lib/convert/sql_to_python.py,sha256=HK414fexSQ4Ur-OY7_pKvDKEGdtos1CeeAFa4RxH4nU,532
 datachain/lib/convert/unflatten.py,sha256=Ogvh_5wg2f38_At_1lN0D_e2uZOOpYEvwvB2xdq56Tw,2012
 datachain/lib/convert/values_to_tuples.py,sha256=aVoHWMOUGLAiS6_BBwKJqVIne91VffOW6-dWyNE7oHg,3715
 datachain/query/__init__.py,sha256=tv-spkjUCYamMN9ys_90scYrZ8kJ7C7d1MTYVmxGtk4,325
 datachain/query/batch.py,sha256=j-_ZcuQra2Ro3Wj4crtqQCg-7xuv-p84hr4QHdvT7as,3479
 datachain/query/builtins.py,sha256=ZKNs49t8Oa_OaboCBIEqtXZt7c1Qe9OR_C_HpoDriIU,2781
-datachain/query/dataset.py,sha256=VhsbHTOps-E4_trLzkJWGQV3zblN6LdlyHED9-3H5Vo,61388
+datachain/query/dataset.py,sha256=PJFVasYhCU0XvF7OrbxlAHLdm_PnhIQBp3TUDVHNHVY,60054
 datachain/query/dispatch.py,sha256=oGX9ZuoKWPB_EyqAZD_eULcO3OejY44_keSmFS6SHT0,13315
 datachain/query/metrics.py,sha256=vsECqbZfoSDBnvC3GQlziKXmISVYDLgHP1fMPEOtKyo,640
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
@@ -77,7 +77,7 @@ datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
 datachain/remote/studio.py,sha256=f5s6qSZ9uB4URGUoU_8_W1KZRRQQVSm6cgEBkBUEfuE,7226
 datachain/sql/__init__.py,sha256=A2djrbQwSMUZZEIKGnm-mnRA-NDSbiDJNpAmmwGNyIo,303
 datachain/sql/selectable.py,sha256=fBM-wS1TUA42kVEAAiwqGtibIevyZAEritwt8PZGyLQ,1589
-datachain/sql/types.py,sha256=BzUm0nCcMPASvdqpQouX5bdVcK3G3DBfeeNhau7X_hA,10234
+datachain/sql/types.py,sha256=SShudhdIpdfTKDxWDDqOajYRkTCkIgQbilA94g4i-4E,10389
 datachain/sql/utils.py,sha256=rzlJw08etivdrcuQPqNVvVWhuVSyUPUQEEc6DOhu258,818
 datachain/sql/default/__init__.py,sha256=XQ2cEZpzWiABqjV-6yYHUBGI9vN_UHxbxZENESmVAWw,45
 datachain/sql/default/base.py,sha256=h44005q3qtMc9cjWmRufWwcBr5CfK_dnvG4IrcSQs_8,536
@@ -92,9 +92,9 @@ datachain/sql/sqlite/base.py,sha256=Jb1csbIARjEvwbylnvgNA7ChozSyoL3CQzOGBUf8QAw,
 datachain/sql/sqlite/types.py,sha256=yzvp0sXSEoEYXs6zaYC_2YubarQoZH-MiUNXcpuEP4s,1573
 datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR0,469
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.2.14.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
-datachain-0.2.14.dist-info/METADATA,sha256=UiBiVmF8nF2aIimMNPn3XB14OhIbRj0w4w5q72qTaRM,14577
-datachain-0.2.14.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
-datachain-0.2.14.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
-datachain-0.2.14.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
-datachain-0.2.14.dist-info/RECORD,,
+datachain-0.2.15.dist-info/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.2.15.dist-info/METADATA,sha256=kKdEsDFle6KQ55q9RlWsAd6DUTgAg40A8L5YWE9fbMg,14577
+datachain-0.2.15.dist-info/WHEEL,sha256=R0nc6qTxuoLk7ShA2_Y-UWkN8ZdfDBG2B6Eqpz2WXbs,91
+datachain-0.2.15.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.2.15.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.2.15.dist-info/RECORD,,
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (71.1.0)
+Generator: setuptools (72.1.0)
 Root-Is-Purelib: true
 Tag: py3-none-any