datachain 0.14.5__py3-none-any.whl → 0.16.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.

datachain/__init__.py CHANGED
@@ -5,8 +5,10 @@ from datachain.lib.dc import (
      DataChain,
      Sys,
      datasets,
+     delete_dataset,
      listings,
      read_csv,
+     read_database,
      read_dataset,
      read_hf,
      read_json,
@@ -61,11 +63,13 @@ __all__ = [
      "VideoFragment",
      "VideoFrame",
      "datasets",
+     "delete_dataset",
      "is_chain_type",
      "listings",
      "metrics",
      "param",
      "read_csv",
+     "read_database",
      "read_dataset",
      "read_hf",
      "read_json",
datachain/catalog/catalog.py CHANGED
@@ -776,7 +776,7 @@ class Catalog:
          listing: Optional[bool] = False,
          uuid: Optional[str] = None,
          description: Optional[str] = None,
-         labels: Optional[list[str]] = None,
+         attrs: Optional[list[str]] = None,
      ) -> "DatasetRecord":
          """
          Creates new dataset of a specific version.
@@ -794,16 +794,16 @@ class Catalog:
              dataset = self.get_dataset(name)
              default_version = dataset.next_version

-             if (description or labels) and (
-                 dataset.description != description or dataset.labels != labels
+             if (description or attrs) and (
+                 dataset.description != description or dataset.attrs != attrs
              ):
                  description = description or dataset.description
-                 labels = labels or dataset.labels
+                 attrs = attrs or dataset.attrs

                  self.update_dataset(
                      dataset,
                      description=description,
-                     labels=labels,
+                     attrs=attrs,
                  )

          except DatasetNotFoundError:
@@ -817,7 +817,7 @@ class Catalog:
                  schema=schema,
                  ignore_if_exists=True,
                  description=description,
-                 labels=labels,
+                 attrs=attrs,
              )

              version = version or default_version
@@ -1299,7 +1299,17 @@ class Catalog:
          name: str,
          version: Optional[int] = None,
          force: Optional[bool] = False,
+         studio: Optional[bool] = False,
      ):
+         from datachain.remote.studio import StudioClient
+
+         if studio:
+             client = StudioClient()
+             response = client.rm_dataset(name, version=version, force=force)
+             if not response.ok:
+                 raise DataChainError(response.message)
+             return
+
          dataset = self.get_dataset(name)
          if not version and not force:
              raise ValueError(f"Missing dataset version from input for dataset {name}")
@@ -1324,15 +1334,15 @@ class Catalog:
          name: str,
          new_name: Optional[str] = None,
          description: Optional[str] = None,
-         labels: Optional[list[str]] = None,
+         attrs: Optional[list[str]] = None,
      ) -> DatasetRecord:
          update_data = {}
          if new_name:
              update_data["name"] = new_name
          if description is not None:
              update_data["description"] = description
-         if labels is not None:
-             update_data["labels"] = labels  # type: ignore[assignment]
+         if attrs is not None:
+             update_data["attrs"] = attrs  # type: ignore[assignment]

          dataset = self.get_dataset(name)
          return self.update_dataset(dataset, **update_data)
datachain/catalog/loader.py CHANGED
@@ -1,4 +1,5 @@
  import os
+ import sys
  from importlib import import_module
  from typing import TYPE_CHECKING, Any, Optional

@@ -15,6 +16,7 @@ METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
  WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
  WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
  WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
+ DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
  DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"

  IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
@@ -100,19 +102,21 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
      return warehouse_class(**warehouse_args)


- def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
-     distributed_import_path = os.environ.get(DISTRIBUTED_IMPORT_PATH)
+ def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+     if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
+         return None

-     if not distributed_import_path:
-         raise RuntimeError(
-             f"{DISTRIBUTED_IMPORT_PATH} import path is required "
-             "for distributed UDF processing."
-         )
      # Distributed class paths are specified as (for example): module.classname
      if "." not in distributed_import_path:
          raise RuntimeError(
              f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
          )
+
+     # Optional: set the Python path to look for the module
+     distributed_import_pythonpath = os.environ.get(DISTRIBUTED_IMPORT_PYTHONPATH)
+     if distributed_import_pythonpath and distributed_import_pythonpath not in sys.path:
+         sys.path.insert(0, distributed_import_pythonpath)
+
      module_name, _, class_name = distributed_import_path.rpartition(".")
      distributed = import_module(module_name)
      return getattr(distributed, class_name)
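For context, the distributor class is now resolved lazily and `get_udf_distributor_class()` returns `None` when nothing is configured, instead of raising. A minimal sketch of wiring one up via the two environment variables; the module path `my_plugins.MyDistributor` and directory `/opt/plugins` are hypothetical:

```python
import os

# Hypothetical plugin; any importable module exposing an AbstractUDFDistributor
# subclass works here.
os.environ["DATACHAIN_DISTRIBUTED"] = "my_plugins.MyDistributor"
# New in this release: an extra directory prepended to sys.path for the lookup.
os.environ["DATACHAIN_DISTRIBUTED_PYTHONPATH"] = "/opt/plugins"

from datachain.catalog.loader import get_udf_distributor_class

cls = get_udf_distributor_class()  # None when DATACHAIN_DISTRIBUTED is unset
```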
datachain/cli/__init__.py CHANGED
@@ -149,7 +149,7 @@ def handle_dataset_command(args, catalog):
          args.name,
          new_name=args.new_name,
          description=args.description,
-         labels=args.labels,
+         attrs=args.attrs,
          studio=args.studio,
          local=args.local,
          all=args.all,
datachain/cli/commands/datasets.py CHANGED
@@ -154,7 +154,7 @@ def edit_dataset(
      name: str,
      new_name: Optional[str] = None,
      description: Optional[str] = None,
-     labels: Optional[list[str]] = None,
+     attrs: Optional[list[str]] = None,
      studio: bool = False,
      local: bool = False,
      all: bool = True,
@@ -167,9 +167,9 @@ def edit_dataset(

      if all or local:
          try:
-             catalog.edit_dataset(name, new_name, description, labels)
+             catalog.edit_dataset(name, new_name, description, attrs)
          except DatasetNotFoundError:
              print("Dataset not found in local", file=sys.stderr)

      if (all or studio) and token:
-         edit_studio_dataset(team, name, new_name, description, labels)
+         edit_studio_dataset(team, name, new_name, description, attrs)
datachain/cli/commands/show.py CHANGED
@@ -42,8 +42,8 @@ def show(
      print("Name: ", name)
      if dataset.description:
          print("Description: ", dataset.description)
-     if dataset.labels:
-         print("Labels: ", ",".join(dataset.labels))
+     if dataset.attrs:
+         print("Attributes: ", ",".join(dataset.attrs))
      print("\n")

      show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
datachain/cli/parser/__init__.py CHANGED
@@ -217,9 +217,9 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
          help="Dataset description",
      )
      parse_edit_dataset.add_argument(
-         "--labels",
+         "--attrs",
          nargs="+",
-         help="Dataset labels",
+         help="Dataset attributes",
      )
      parse_edit_dataset.add_argument(
          "--studio",
datachain/data_storage/metastore.py CHANGED
@@ -120,7 +120,7 @@ class AbstractMetastore(ABC, Serializable):
          schema: Optional[dict[str, Any]] = None,
          ignore_if_exists: bool = False,
          description: Optional[str] = None,
-         labels: Optional[list[str]] = None,
+         attrs: Optional[list[str]] = None,
      ) -> DatasetRecord:
          """Creates new dataset."""

@@ -326,7 +326,7 @@ class AbstractDBMetastore(AbstractMetastore):
              Column("id", Integer, primary_key=True),
              Column("name", Text, nullable=False),
              Column("description", Text),
-             Column("labels", JSON, nullable=True),
+             Column("attrs", JSON, nullable=True),
              Column("status", Integer, nullable=False),
              Column("feature_schema", JSON, nullable=True),
              Column("created_at", DateTime(timezone=True)),
@@ -521,7 +521,7 @@ class AbstractDBMetastore(AbstractMetastore):
          schema: Optional[dict[str, Any]] = None,
          ignore_if_exists: bool = False,
          description: Optional[str] = None,
-         labels: Optional[list[str]] = None,
+         attrs: Optional[list[str]] = None,
          **kwargs,  # TODO registered = True / False
      ) -> DatasetRecord:
          """Creates new dataset."""
@@ -538,7 +538,7 @@ class AbstractDBMetastore(AbstractMetastore):
              query_script=query_script,
              schema=json.dumps(schema or {}),
              description=description,
-             labels=json.dumps(labels or []),
+             attrs=json.dumps(attrs or []),
          )
          if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
              # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
@@ -621,7 +621,7 @@ class AbstractDBMetastore(AbstractMetastore):
          dataset_values = {}
          for field, value in kwargs.items():
              if field in self._dataset_fields[1:]:
-                 if field in ["labels", "schema"]:
+                 if field in ["attrs", "schema"]:
                      values[field] = json.dumps(value) if value else None
                  else:
                      values[field] = value
datachain/dataset.py CHANGED
@@ -329,7 +329,7 @@ class DatasetRecord:
      id: int
      name: str
      description: Optional[str]
-     labels: list[str]
+     attrs: list[str]
      schema: dict[str, Union[SQLType, type[SQLType]]]
      feature_schema: dict
      versions: list[DatasetVersion]
@@ -357,7 +357,7 @@ class DatasetRecord:
          id: int,
          name: str,
          description: Optional[str],
-         labels: str,
+         attrs: str,
          status: int,
          feature_schema: Optional[str],
          created_at: datetime,
@@ -387,7 +387,7 @@ class DatasetRecord:
          version_schema: str,
          version_job_id: Optional[str] = None,
      ) -> "DatasetRecord":
-         labels_lst: list[str] = json.loads(labels) if labels else []
+         attrs_lst: list[str] = json.loads(attrs) if attrs else []
          schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
          version_schema_dct: dict[str, str] = (
              json.loads(version_schema) if version_schema else {}
@@ -418,7 +418,7 @@ class DatasetRecord:
              id,
              name,
              description,
-             labels_lst,
+             attrs_lst,
              cls.parse_schema(schema_dct),  # type: ignore[arg-type]
              json.loads(feature_schema) if feature_schema else {},
              [dataset_version],
@@ -562,7 +562,7 @@ class DatasetListRecord:
      id: int
      name: str
      description: Optional[str]
-     labels: list[str]
+     attrs: list[str]
      versions: list[DatasetListVersion]
      created_at: Optional[datetime] = None

@@ -572,7 +572,7 @@ class DatasetListRecord:
          id: int,
          name: str,
          description: Optional[str],
-         labels: str,
+         attrs: str,
          created_at: datetime,
          version_id: int,
          version_uuid: str,
@@ -588,7 +588,7 @@ class DatasetListRecord:
          version_query_script: Optional[str],
          version_job_id: Optional[str] = None,
      ) -> "DatasetListRecord":
-         labels_lst: list[str] = json.loads(labels) if labels else []
+         attrs_lst: list[str] = json.loads(attrs) if attrs else []

          dataset_version = DatasetListVersion.parse(
              version_id,
@@ -610,7 +610,7 @@ class DatasetListRecord:
              id,
              name,
              description,
-             labels_lst,
+             attrs_lst,
              [dataset_version],
              created_at,
          )
datachain/lib/convert/values_to_tuples.py CHANGED
@@ -1,5 +1,6 @@
+ import itertools
  from collections.abc import Sequence
- from typing import Any, Union
+ from typing import Any, Optional, Union

  from datachain.lib.data_model import (
      DataType,
@@ -66,21 +67,29 @@ def values_to_tuples( # noqa: C901, PLR0912
                  f"signal '{k}' is not present in the output",
              )
          else:
-             if len_ == 0:
-                 raise ValuesToTupleError(ds_name, f"signal '{k}' is empty list")
-
-             first_element = next(iter(v))
-             typ = type(first_element)
-             if not is_chain_type(typ):
-                 raise ValuesToTupleError(
-                     ds_name,
-                     f"signal '{k}' has unsupported type '{typ.__name__}'."
-                     f" Please use DataModel types: {DataTypeNames}",
+             # FIXME: Stops as soon as it finds the first non-None value.
+             # If a non-None value appears early, it won't check the remaining items for
+             # `None` values.
+             try:
+                 pos, first_not_none_element = next(
+                     itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
                  )
-             if isinstance(first_element, list):
-                 types_map[k] = list[type(first_element[0])]  # type: ignore[assignment, misc]
+             except StopIteration:
+                 typ = str  # default to str if all values are None or has length 0
+                 nullable = True
              else:
-                 types_map[k] = typ
+                 nullable = pos > 0
+                 typ = type(first_not_none_element)  # type: ignore[assignment]
+                 if not is_chain_type(typ):
+                     raise ValuesToTupleError(
+                         ds_name,
+                         f"signal '{k}' has unsupported type '{typ.__name__}'."
+                         f" Please use DataModel types: {DataTypeNames}",
+                     )
+                 if isinstance(first_not_none_element, list):
+                     typ = list[type(first_not_none_element[0])]  # type: ignore[assignment, misc]
+
+             types_map[k] = Optional[typ] if nullable else typ  # type: ignore[assignment]

      if length < 0:
          length = len_
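In effect, leading `None` values now widen the inferred signal type to `Optional[...]` instead of raising. A sketch of the new behavior; the signal names are illustrative, and per the FIXME above a `None` appearing after the first non-None value is not detected:

```python
from datachain.lib.convert.values_to_tuples import values_to_tuples

_, schema, _ = values_to_tuples(
    "demo",
    a=[1, 2, 3],            # no leading None: inferred as int, non-nullable
    b=[None, "x", "y"],     # None before the first real value: Optional[str]
    c=[None, None, None],   # all None: defaults to nullable str, Optional[str]
)
```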
datachain/lib/dataset_info.py CHANGED
@@ -32,11 +32,28 @@ class DatasetInfo(DataModel):
      metrics: dict[str, Any] = Field(default={})
      error_message: str = Field(default="")
      error_stack: str = Field(default="")
+     attrs: list[str] = Field(default=[])

      @property
      def is_temp(self) -> bool:
          return Session.is_temp_dataset(self.name)

+     def has_attr(self, attr: str) -> bool:
+         s = attr.split("=")
+         if len(s) == 1:
+             return attr in self.attrs
+
+         name = s[0]
+         value = s[1]
+         for a in self.attrs:
+             s = a.split("=")
+             if value == "*" and s[0] == name:
+                 return True
+             if len(s) == 2 and s[0] == name and s[1] == value:
+                 return True
+
+         return False
+
      @staticmethod
      def _validate_dict(
          v: Optional[Union[str, dict]],
@@ -83,4 +100,5 @@ class DatasetInfo(DataModel):
              metrics=job.metrics if job else {},
              error_message=version.error_message,
              error_stack=version.error_stack,
+             attrs=dataset.attrs,
          )
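The matching rules of the new `DatasetInfo.has_attr`, as a standalone runnable sketch; the helper name `matches` is ours, not part of the API:

```python
def matches(attrs: list[str], attr: str) -> bool:
    """Mirror of DatasetInfo.has_attr: plain names, name=value pairs, "*" wildcards."""
    name, _, value = attr.partition("=")
    if not value:
        return attr in attrs              # plain attribute: exact membership test
    for a in attrs:
        k, _, v = a.partition("=")
        if k == name and (value == "*" or v == value):
            return True                   # "*" accepts any value for that name
    return False

assert matches(["NLP", "location=US"], "NLP")
assert matches(["NLP", "location=US"], "location=US")
assert matches(["NLP", "location=US"], "location=*")
assert not matches(["NLP", "location=US"], "location=EU")
```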
datachain/lib/dc/__init__.py CHANGED
@@ -1,6 +1,7 @@
  from .csv import read_csv
+ from .database import read_database
  from .datachain import C, Column, DataChain
- from .datasets import datasets, read_dataset
+ from .datasets import datasets, delete_dataset, read_dataset
  from .hf import read_hf
  from .json import read_json
  from .listings import listings
@@ -19,8 +20,10 @@ __all__ = [
      "DatasetPrepareError",
      "Sys",
      "datasets",
+     "delete_dataset",
      "listings",
      "read_csv",
+     "read_database",
      "read_dataset",
      "read_hf",
      "read_json",
datachain/lib/dc/database.py ADDED
@@ -0,0 +1,151 @@
+ import contextlib
+ import itertools
+ import os
+ import sqlite3
+ from typing import TYPE_CHECKING, Any, Optional, Union
+
+ import sqlalchemy
+
+ if TYPE_CHECKING:
+     from collections.abc import Iterator, Mapping, Sequence
+
+     import sqlalchemy.orm  # noqa: TC004
+
+     from datachain.lib.data_model import DataType
+     from datachain.query import Session
+
+     from .datachain import DataChain
+
+ ConnectionType = Union[
+     str,
+     sqlalchemy.engine.URL,
+     sqlalchemy.engine.interfaces.Connectable,
+     sqlalchemy.engine.Engine,
+     sqlalchemy.engine.Connection,
+     sqlalchemy.orm.Session,
+     sqlite3.Connection,
+ ]
+
+
+ @contextlib.contextmanager
+ def _connect(
+     connection: "ConnectionType",
+ ) -> "Iterator[Union[sqlalchemy.engine.Connection, sqlalchemy.orm.Session]]":
+     import sqlalchemy.orm
+
+     with contextlib.ExitStack() as stack:
+         engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
+         if isinstance(connection, (str, sqlalchemy.URL)):
+             engine = sqlalchemy.create_engine(connection, **engine_kwargs)
+             stack.callback(engine.dispose)
+             yield stack.enter_context(engine.connect())
+         elif isinstance(connection, sqlite3.Connection):
+             engine = sqlalchemy.create_engine(
+                 "sqlite://", creator=lambda: connection, **engine_kwargs
+             )
+             # do not close the connection, as it is managed by the caller
+             yield engine.connect()
+         elif isinstance(connection, sqlalchemy.Engine):
+             yield stack.enter_context(connection.connect())
+         elif isinstance(connection, (sqlalchemy.Connection, sqlalchemy.orm.Session)):
+             # do not close the connection, as it is managed by the caller
+             yield connection
+         else:
+             raise TypeError(f"Unsupported connection type: {type(connection).__name__}")
+
+
+ def _infer_schema(
+     result: "sqlalchemy.engine.Result",
+     to_infer: list[str],
+     infer_schema_length: Optional[int] = 100,
+ ) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
+     from datachain.lib.convert.values_to_tuples import values_to_tuples
+
+     if not to_infer:
+         return [], {}
+
+     rows = list(itertools.islice(result, infer_schema_length))
+     values = {col: [row._mapping[col] for row in rows] for col in to_infer}
+     _, output_schema, _ = values_to_tuples("", **values)
+     return rows, output_schema
+
+
+ def read_database(
+     query: Union[str, "sqlalchemy.sql.expression.Executable"],
+     connection: "ConnectionType",
+     params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
+     *,
+     output: Optional["dict[str, DataType]"] = None,
+     session: Optional["Session"] = None,
+     settings: Optional[dict] = None,
+     in_memory: bool = False,
+     infer_schema_length: Optional[int] = 100,
+ ) -> "DataChain":
+     """
+     Read the results of a SQL query into a DataChain, using a given database connection.
+
+     Args:
+         query:
+             The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
+             `Executable` object.
+         connection: SQLAlchemy connectable, str, or a sqlite3 connection
+             Using SQLAlchemy makes it possible to use any DB supported by that
+             library. If a DBAPI2 object, only sqlite3 is supported. The user is
+             responsible for engine disposal and connection closure for the
+             SQLAlchemy connectable; str connections are closed automatically.
+         params: Parameters to pass to execute method.
+         output: A dictionary mapping column names to types, used to override the
+             schema inferred from the query results.
+         session: Session to use for the chain.
+         settings: Settings to use for the chain.
+         in_memory: If True, creates an in-memory session. Defaults to False.
+         infer_schema_length:
+             The maximum number of rows to scan for inferring schema.
+             If set to `None`, the full data may be scanned.
+             The rows used for schema inference are stored in memory,
+             so large values can lead to high memory usage.
+             Only applies if the `output` parameter is not set for the given column.
+
+     Examples:
+         Reading from a SQL query against a user-supplied connection:
+         ```python
+         query = "SELECT key, value FROM tbl"
+         chain = dc.read_database(query, connection, output={"value": float})
+         ```
+
+         Load data from a SQLAlchemy driver/engine:
+         ```python
+         from sqlalchemy import create_engine
+         engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
+         chain = dc.read_database("select * from tbl", engine)
+         ```
+
+         Load data from a parameterized SQLAlchemy query:
+         ```python
+         query = "SELECT key, value FROM tbl WHERE value > :value"
+         dc.read_database(query, engine, params={"value": 50})
+         ```
+
+     Notes:
+         This function works with a variety of databases — including, but not limited to,
+         SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is
+         installed.
+     """
+     from datachain.lib.dc.records import read_records
+
+     output = output or {}
+     if isinstance(query, str):
+         query = sqlalchemy.text(query)
+     kw = {"execution_options": {"stream_results": True}}  # use server-side cursors
+     with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
+         cols = result.keys()
+         to_infer = [k for k in cols if k not in output]  # preserve the order
+         rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
+         records = (row._asdict() for row in itertools.chain(rows, result))
+         return read_records(
+             records,
+             session=session,
+             settings=settings,
+             in_memory=in_memory,
+             schema=inferred_schema | output,
+         )
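A self-contained example of the new `read_database` against an in-memory sqlite3 connection; the table and values are illustrative:

```python
import sqlite3

import datachain as dc

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE tbl (key TEXT, value REAL)")
conn.executemany("INSERT INTO tbl VALUES (?, ?)", [("a", 1.0), ("b", 2.5)])

# Schema is inferred from the first rows; per the docstring, sqlite3
# connections stay open and are managed by the caller.
chain = dc.read_database("SELECT key, value FROM tbl", conn)
```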
datachain/lib/dc/datachain.py CHANGED
@@ -133,7 +133,7 @@ class DataChain:
                  .choices[0]
                  .message.content,
              )
-             .save()
+             .persist()
          )

          try:
@@ -443,22 +443,33 @@ class DataChain:
          )
          return listings(*args, **kwargs)

+     def persist(self) -> "Self":
+         """Saves temporary chain that will be removed after the process ends.
+         Temporary datasets are useful for optimization, for example when we have
+         multiple chains starting with identical sub-chain. We can then persist that
+         common chain and use it to calculate other chains, to avoid re-calculation
+         every time.
+         It returns the chain itself.
+         """
+         schema = self.signals_schema.clone_without_sys_signals().serialize()
+         return self._evolve(query=self._query.save(feature_schema=schema))
+
      def save(  # type: ignore[override]
          self,
-         name: Optional[str] = None,
+         name: str,
          version: Optional[int] = None,
          description: Optional[str] = None,
-         labels: Optional[list[str]] = None,
+         attrs: Optional[list[str]] = None,
          **kwargs,
      ) -> "Self":
          """Save to a Dataset. It returns the chain itself.

          Parameters:
-             name : dataset name. Empty name saves to a temporary dataset that will be
-                 removed after process ends. Temp dataset are useful for optimization.
+             name : dataset name.
              version : version of a dataset. Default - the last version that exist.
              description : description of a dataset.
-             labels : labels of a dataset.
+             attrs : attributes of a dataset. They can be without value, e.g "NLP",
+                 or with a value, e.g "location=US".
          """
          schema = self.signals_schema.clone_without_sys_signals().serialize()
          return self._evolve(
@@ -466,7 +477,7 @@ class DataChain:
              name=name,
              version=version,
              description=description,
-             labels=labels,
+             attrs=attrs,
              feature_schema=schema,
              **kwargs,
          )
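`save()` now requires a name, and the old empty-name behavior moved to `persist()`. A short sketch; the file name and dataset name are hypothetical:

```python
import datachain as dc

# Materialize a shared sub-chain once as a temporary dataset...
base = dc.read_csv("reviews.csv").persist()

# ...then derive named datasets from it; attrs replace the old labels.
base.save("reviews", description="customer reviews", attrs=["NLP", "location=US"])
```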
@@ -1112,7 +1123,7 @@ class DataChain:
          if self._query.attached:
              chain = self
          else:
-             chain = self.save()
+             chain = self.persist()
          assert chain.name is not None  # for mypy
          return PytorchDataset(
              chain.name,
datachain/lib/dc/datasets.py CHANGED
@@ -102,6 +102,7 @@ def datasets(
      column: Optional[str] = None,
      include_listing: bool = False,
      studio: bool = False,
+     attrs: Optional[list[str]] = None,
  ) -> "DataChain":
      """Generate chain with list of registered datasets.

@@ -114,6 +115,10 @@ def datasets(
          include_listing: If True, includes listing datasets. Defaults to False.
          studio: If True, returns datasets from Studio only,
              otherwise returns all local datasets. Defaults to False.
+         attrs: Optional list of attributes to filter datasets on. It can be just
+             attribute without value e.g "NLP", or attribute with value
+             e.g "location=US". Attribute with value can also accept "*" to target
+             all that have specific name e.g "location=*"

      Returns:
          DataChain: A new DataChain instance containing dataset information.
@@ -139,6 +144,10 @@ def datasets(
      ]
      datasets_values = [d for d in datasets_values if not d.is_temp]

+     if attrs:
+         for attr in attrs:
+             datasets_values = [d for d in datasets_values if d.has_attr(attr)]
+
      if not column:
          # flattening dataset fields
          schema = {
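Usage sketch for the new `attrs` filter: every attribute in the list must match, so the call below keeps only datasets tagged "NLP" that also carry any `location` value:

```python
import datachain as dc

chain = dc.datasets(attrs=["NLP", "location=*"])
```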
@@ -166,3 +175,46 @@ def datasets(
          output={column: DatasetInfo},
          **{column: datasets_values},  # type: ignore[arg-type]
      )
+
+
+ def delete_dataset(
+     name: str,
+     version: Optional[int] = None,
+     force: Optional[bool] = False,
+     studio: Optional[bool] = False,
+     session: Optional[Session] = None,
+     in_memory: bool = False,
+ ) -> None:
+     """Removes specific dataset version or all dataset versions, depending on
+     a force flag.
+
+     Args:
+         name : Dataset name
+         version : Optional dataset version
+         force: If true, all datasets versions will be removed. Defaults to False.
+         studio: If True, removes dataset from Studio only,
+             otherwise remove from local. Defaults to False.
+         session: Optional session instance. If not provided, uses default session.
+         in_memory: If True, creates an in-memory session. Defaults to False.
+
+     Returns: None
+
+     Example:
+         ```py
+         import datachain as dc
+         dc.delete_dataset("cats")
+         ```
+
+         ```py
+         import datachain as dc
+         dc.delete_dataset("cats", version=1)
+         ```
+     """
+
+     session = Session.get(session, in_memory=in_memory)
+     catalog = session.catalog
+     if not force:
+         version = version or catalog.get_dataset(name).latest_version
+     else:
+         version = None
+     catalog.remove_dataset(name, version=version, force=force, studio=studio)
datachain/lib/dc/pandas.py CHANGED
@@ -37,7 +37,14 @@ def read_pandas(  # type: ignore[override]
      """
      from .utils import DatasetPrepareError

-     fr_map = {col.lower(): df[col].tolist() for col in df.columns}
+     def get_col_name(col):
+         if isinstance(col, tuple):
+             # Join tuple elements with underscore for MultiIndex columns
+             return "_".join(map(str, col)).lower()
+         # Handle regular string column names
+         return str(col).lower()
+
+     fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}

      for c in fr_map:
          if not c.isidentifier():
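A sketch of what this fixes: MultiIndex column tuples are flattened into underscore-joined signal names, assuming `read_pandas` is exposed as `dc.read_pandas`:

```python
import pandas as pd

import datachain as dc

df = pd.DataFrame(
    [[1.0, 0.9], [2.0, 1.8]],
    columns=pd.MultiIndex.from_tuples([("price", "usd"), ("price", "eur")]),
)
# Columns become "price_usd" and "price_eur" instead of failing on tuple.lower().
chain = dc.read_pandas(df)
```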
datachain/lib/dc/records.py CHANGED
@@ -1,8 +1,5 @@
- from typing import (
-     TYPE_CHECKING,
-     Optional,
-     Union,
- )
+ from collections.abc import Iterable
+ from typing import TYPE_CHECKING, Optional, Union

  import sqlalchemy

@@ -12,6 +9,7 @@ from datachain.lib.file import (
  )
  from datachain.lib.signal_schema import SignalSchema
  from datachain.query import Session
+ from datachain.query.schema import Column

  if TYPE_CHECKING:
      from typing_extensions import ParamSpec
@@ -22,7 +20,7 @@ if TYPE_CHECKING:


  def read_records(
-     to_insert: Optional[Union[dict, list[dict]]],
+     to_insert: Optional[Union[dict, Iterable[dict]]],
      session: Optional[Session] = None,
      settings: Optional[dict] = None,
      in_memory: bool = False,
@@ -54,10 +52,11 @@ def read_records(

      if schema:
          signal_schema = SignalSchema(schema)
-         columns = [
-             sqlalchemy.Column(c.name, c.type)  # type: ignore[union-attr]
-             for c in signal_schema.db_signals(as_columns=True)  # type: ignore[assignment]
-         ]
+         columns = []
+         for c in signal_schema.db_signals(as_columns=True):
+             assert isinstance(c, Column)
+             kw = {"nullable": c.nullable} if c.nullable is not None else {}
+             columns.append(sqlalchemy.Column(c.name, c.type, **kw))
      else:
          columns = [
              sqlalchemy.Column(name, typ)
@@ -83,8 +82,7 @@ def read_records(

      warehouse = catalog.warehouse
      dr = warehouse.dataset_rows(dsr)
-     db = warehouse.db
-     insert_q = dr.get_table().insert()
-     for record in to_insert:
-         db.execute(insert_q.values(**record))
+     table = dr.get_table()
+     warehouse.insert_rows(table, to_insert)
+     warehouse.insert_rows_done(table)
      return read_dataset(name=dsr.name, session=session, settings=settings)
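With `to_insert` widened to `Iterable[dict]` and the batched `insert_rows` path replacing per-row `execute` calls, records can now be streamed from a generator. A sketch, importing `read_records` from the module changed above:

```python
from datachain.lib.dc.records import read_records

# Illustrative records; a generator avoids building the full list in memory.
records = ({"key": str(i), "value": i * 1.5} for i in range(10_000))
chain = read_records(records, schema={"key": str, "value": float})
```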
datachain/lib/signal_schema.py CHANGED
@@ -581,7 +581,11 @@ class SignalSchema:
          signals = [
              DEFAULT_DELIMITER.join(path)
              if not as_columns
-             else Column(DEFAULT_DELIMITER.join(path), python_to_sql(_type))
+             else Column(
+                 DEFAULT_DELIMITER.join(path),
+                 python_to_sql(_type),
+                 nullable=is_optional(_type),
+             )
              for path, _type, has_subtree, _ in self.get_flat_tree(
                  include_hidden=include_hidden
              )
@@ -990,3 +994,8 @@ class SignalSchema:
          }

          return SignalSchema.deserialize(schema)
+
+
+ def is_optional(type_: Any) -> bool:
+     """Check if a type is Optional."""
+     return get_origin(type_) is Union and type(None) in get_args(type_)
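The new module-level helper in action:

```python
from typing import Optional, Union

from datachain.lib.signal_schema import is_optional

assert is_optional(Optional[int])     # Optional[int] is Union[int, None]
assert is_optional(Union[str, None])
assert not is_optional(int)
```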
datachain/lib/udf.py CHANGED
@@ -474,8 +474,9 @@ class Generator(UDFBase):
              remove_prefetched=bool(self.prefetch) and not cache,
          )
          with closing(prepared_inputs):
-             for row in processed_cb.wrap(prepared_inputs):
+             for row in prepared_inputs:
                  yield _process_row(row)
+                 processed_cb.relative_update(1)

          self.teardown()

datachain/query/dataset.py CHANGED
@@ -437,9 +437,17 @@ class UDFStep(Step, ABC):
                  "distributed processing."
              )

-         from datachain.catalog.loader import get_udf_distributor_class
+         from datachain.catalog.loader import (
+             DISTRIBUTED_IMPORT_PATH,
+             get_udf_distributor_class,
+         )
+
+         if not (udf_distributor_class := get_udf_distributor_class()):
+             raise RuntimeError(
+                 f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+                 "for distributed UDF processing."
+             )

-         udf_distributor_class = get_udf_distributor_class()
          udf_distributor = udf_distributor_class(
              catalog=catalog,
              table=udf_table,
@@ -1162,16 +1170,6 @@ class DatasetQuery:
          )
          return sqlalchemy.table(table_name)

-     @staticmethod
-     def delete(
-         name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
-     ) -> None:
-         from datachain.catalog import get_catalog
-
-         catalog = catalog or get_catalog()
-         version = version or catalog.get_dataset(name).latest_version
-         catalog.remove_dataset(name, version)
-
      @property
      def attached(self) -> bool:
          """
@@ -1682,7 +1680,7 @@ class DatasetQuery:
          version: Optional[int] = None,
          feature_schema: Optional[dict] = None,
          description: Optional[str] = None,
-         labels: Optional[list[str]] = None,
+         attrs: Optional[list[str]] = None,
          **kwargs,
      ) -> "Self":
          """Save the query as a dataset."""
@@ -1716,7 +1714,7 @@ class DatasetQuery:
              feature_schema=feature_schema,
              columns=columns,
              description=description,
-             labels=labels,
+             attrs=attrs,
              **kwargs,
          )
          version = version or dataset.latest_version
datachain/query/dispatch.py CHANGED
@@ -13,7 +13,7 @@ from multiprocess import get_context

  from datachain.catalog import Catalog
  from datachain.catalog.catalog import clone_catalog_with_cache
- from datachain.catalog.loader import get_udf_distributor_class
+ from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
  from datachain.lib.udf import _get_cache
  from datachain.query.batch import RowsOutput, RowsOutputBatch
  from datachain.query.dataset import (
@@ -91,7 +91,12 @@ def udf_entrypoint() -> int:


  def udf_worker_entrypoint() -> int:
-     return get_udf_distributor_class().run_worker()
+     if not (udf_distributor_class := get_udf_distributor_class()):
+         raise RuntimeError(
+             f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+             "for distributed UDF processing."
+         )
+     return udf_distributor_class.run_worker()


  class UDFDispatcher:
datachain/query/schema.py CHANGED
@@ -40,12 +40,15 @@ class ColumnMeta(type):
  class Column(sa.ColumnClause, metaclass=ColumnMeta):
      inherit_cache: Optional[bool] = True

-     def __init__(self, text, type_=None, is_literal=False, _selectable=None):
+     def __init__(
+         self, text, type_=None, is_literal=False, nullable=None, _selectable=None
+     ):
          """Dataset column."""
          self.name = ColumnMeta.to_db_name(text)
          super().__init__(
              self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
          )
+         self.nullable = nullable

      def __getattr__(self, name: str):
          return Column(self.name + DEFAULT_DELIMITER + name)
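`Column` now carries nullability as plain metadata, which `read_records` consults when creating the backing table; a minimal sketch:

```python
from datachain.query.schema import Column

col = Column("value", nullable=True)
assert col.nullable is True  # defaults to None when not specified
```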
datachain/remote/studio.py CHANGED
@@ -290,13 +290,13 @@ class StudioClient:
          name: str,
          new_name: Optional[str] = None,
          description: Optional[str] = None,
-         labels: Optional[list[str]] = None,
+         attrs: Optional[list[str]] = None,
      ) -> Response[DatasetInfoData]:
          body = {
              "new_name": new_name,
              "dataset_name": name,
              "description": description,
-             "labels": labels,
+             "attrs": attrs,
          }

          return self._send_request(
datachain/studio.py CHANGED
@@ -187,10 +187,10 @@ def edit_studio_dataset(
      name: str,
      new_name: Optional[str] = None,
      description: Optional[str] = None,
-     labels: Optional[list[str]] = None,
+     attrs: Optional[list[str]] = None,
  ):
      client = StudioClient(team=team_name)
-     response = client.edit_dataset(name, new_name, description, labels)
+     response = client.edit_dataset(name, new_name, description, attrs)
      if not response.ok:
          raise DataChainError(response.message)

datachain-0.14.5.dist-info/METADATA → datachain-0.16.0.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: datachain
- Version: 0.14.5
+ Version: 0.16.0
  Summary: Wrangle unstructured AI data at scale
  Author-email: Dmitry Petrov <support@dvc.org>
  License-Expression: Apache-2.0
datachain-0.14.5.dist-info/RECORD → datachain-0.16.0.dist-info/RECORD RENAMED
@@ -1,9 +1,9 @@
- datachain/__init__.py,sha256=h3W0agyTcpXOfMA26jZyHo-Gs7vLXhbR-9uEkzK8Szk,1414
+ datachain/__init__.py,sha256=Dx_Dw6AuvC_CZtXxfRv0Z-ND6ieC4Cz-tZkMW-Rvmz4,1496
  datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
  datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
  datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
  datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
- datachain/dataset.py,sha256=ZfgsGlddTXsSqCohNSRSChdH6Jjw7wrkso1Am166k-M,19391
+ datachain/dataset.py,sha256=msBC62M_HAv3hT4tKFEGOlH3sMCMg5DVd5lhmqkDGB4,19379
  datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
  datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
  datachain/listing.py,sha256=kNSCFYWo2iM1wWg1trwq4WpYZxYqz4RKxkTtsppEzAw,7079
@@ -13,24 +13,24 @@ datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2Sm
  datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
  datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
- datachain/studio.py,sha256=9MEpFPLKI3gG4isKklcfD5BMLeNsSXhtOUboOjW4Fdc,10017
+ datachain/studio.py,sha256=CwXrZ3PXJFIoilelIHblDV05kzcWj9vbV3KanMPVrRQ,10015
  datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
  datachain/utils.py,sha256=8Qz8lRrX0bUTGvwYd-OR-l6ElVRsQBdBO5QMvwt56T4,15190
  datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
- datachain/catalog/catalog.py,sha256=05_JplTuoyqDWtxUeu324ogaHVqXGPSaPxtUXtuMljk,60682
+ datachain/catalog/catalog.py,sha256=drCemStFXk2MZgexbUsSIBJuUvn0YwL1tJO69KrWeeg,61004
  datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
- datachain/catalog/loader.py,sha256=wCOWeDwuFNKr_frZRkqTZhkCAiB0CBCRJio3LF2zKPA,5765
- datachain/cli/__init__.py,sha256=YPVkuQ7IezNhtzo5xrfca1hEIiZtFxOlJCOzAOEuxmA,8335
+ datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
+ datachain/cli/__init__.py,sha256=i40xHzVZP3iZFBw3UixQ2OU-s_GQq6OyvQ-_6opwIYc,8333
  datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
  datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
- datachain/cli/commands/datasets.py,sha256=865ui6q4UVPbL_-jk18C-lYi_bGMlh7XhfRaHbbNyhk,5796
+ datachain/cli/commands/datasets.py,sha256=sQ83zxHLuP04cXqBYD3iVcsr49LHA3lnjYxdL142HMk,5793
  datachain/cli/commands/du.py,sha256=9edEzDEs98K2VYk8Wf-ZMpUzALcgm9uD6YtoqbvtUGU,391
  datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR88,198
  datachain/cli/commands/ls.py,sha256=dSD2_MHng4t9HRFJZWMOCjPL4XU3qaBV3piNl8UXP08,5275
  datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibVE,600
  datachain/cli/commands/query.py,sha256=2S7hQxialt1fkbocxi6JXZI6jS5QnFrD1aOjKgZkzfI,1471
- datachain/cli/commands/show.py,sha256=P6e6bYiRCyVKO0ggnoFkLkwGmBWlrlm8W5c_sBNxBBw,1604
- datachain/cli/parser/__init__.py,sha256=rtjlqSsDd4LZH9WdgvluO27M4sID1wD7YkQ4cKhNXzw,15721
+ datachain/cli/commands/show.py,sha256=K__cCLDJLTRt-sBTMxDID0A_4dFgRRMvjDrrVWcbMUQ,1606
+ datachain/cli/parser/__init__.py,sha256=SKB94ZS9kRHV7UOrQcIXsSQ7BOFlp4U2To4wseXXcaI,15724
  datachain/cli/parser/job.py,sha256=kvQkSfieyUmvJpOK8p78UgS8sygHhQXztRlOtVcgtaU,3449
  datachain/cli/parser/studio.py,sha256=Y-1OlQGecLVi9QofvWUfSlPd2ISyaESf7QFGZqGsrdw,3609
  datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
@@ -45,7 +45,7 @@ datachain/client/s3.py,sha256=YCtDhKVO_jGsMPeyqe3xk5QsF5lqMabqkt0tPFWUHOM,7286
  datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
  datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
  datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
- datachain/data_storage/metastore.py,sha256=19LP15xT2Fmz0aIZ1sIajq8i1-KnFgCBEZeU2Ka9-mc,37780
+ datachain/data_storage/metastore.py,sha256=bhfAaijM7p_D5ltMWg-CVEv9lTflL3bGUWqAmJ8qFbc,37774
  datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
  datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
  datachain/data_storage/sqlite.py,sha256=f4tvq0gzYQP7aYGnfL3j4IBUNvctpBxI_ioFU-B1LFc,24540
@@ -69,7 +69,7 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
  datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
  datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
- datachain/lib/dataset_info.py,sha256=Jnjy7vq4iNVkq1e-SYjqxdojlxIDXvZ352NCLLZg59k,2633
+ datachain/lib/dataset_info.py,sha256=Mmo3r_MWRb-47H4QueSaUqgeENJiJZmjkTYBMpRuKM8,3128
  datachain/lib/file.py,sha256=HLQXS_WULm7Y-fkHMy0WpibVAcrkLPRS6CrZy6rwFe0,30450
  datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
  datachain/lib/image.py,sha256=butvUY_33PVEYPKX2nVCPeJjJVcBaptZwsE9REQsTS8,3247
@@ -79,10 +79,10 @@ datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A
  datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
  datachain/lib/pytorch.py,sha256=YS6yR13iVlrAXo5wzJswFFUHwWOql9KTdWIa86DXB-k,7712
  datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
- datachain/lib/signal_schema.py,sha256=uIBHYXtu_XpLbOUVC-kq-GduEOCfz9hQORi9ZG3JFqo,35820
+ datachain/lib/signal_schema.py,sha256=rt5DpL6DptQEZ8NYe2x_v1C_QFO-lDVEUawxzSswKXw,36062
  datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
  datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
- datachain/lib/udf.py,sha256=JJwjvy41N65PtWGUAq7TYnhdOOR6RiMDUJEKl5xtwLc,16199
+ datachain/lib/udf.py,sha256=zCdO5__gLMCgrdHmOvIa0eoWKCDAU1uO-MMAu_EU13o,16228
  datachain/lib/udf_signature.py,sha256=2EtsOPDNSPqcOlYwqbCdy6RF5MldI-7smii8aLy8p7Y,7543
  datachain/lib/utils.py,sha256=QrjVs_oLRXEotOPUYurBJypBFi_ReTJmxcnJeH4j2Uk,1596
  datachain/lib/video.py,sha256=suH_8Mi8VYk4-IVb1vjSduF_njs64ji1WGKHxDLnGYw,6629
@@ -93,17 +93,18 @@ datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1
  datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djMP3t8CHJLo,3188
  datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
  datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
- datachain/lib/convert/values_to_tuples.py,sha256=EFfIGBiVVltJQG8blzsQ1dGXneh4D3wdLfSUeoK10OI,3931
- datachain/lib/dc/__init__.py,sha256=6rKKHS6MA3mS6UJXiysrv4TURs4R_UWAQK2tJ2t1QMs,743
+ datachain/lib/convert/values_to_tuples.py,sha256=CJ7x91ZYrRMc1lr-BR5AYi7EkWHbzPu1bVqCiP6jLoY,4491
+ datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
  datachain/lib/dc/csv.py,sha256=asWPAxhMgIoLAdD2dObDlnGL8CTSD3TAuFuM4ci89bQ,4374
- datachain/lib/dc/datachain.py,sha256=PDkB1fvmokJr-Tmyn0CuFGgZSxPn25FMjjUVHbrx6-c,76326
- datachain/lib/dc/datasets.py,sha256=K-GCTZ6Ps_XNpzKz19my8VijXb-b0b3eZASoavKk1Uc,5157
+ datachain/lib/dc/database.py,sha256=gYKh1iO5hOWMPFTU1vZC5kOXkJzVse14TYTWE4_1iEA,5940
+ datachain/lib/dc/datachain.py,sha256=aRTHaYMk2C1A3dslGpaaEmTvhwvbqnMNaWIBgdIWUX8,76847
+ datachain/lib/dc/datasets.py,sha256=u6hlz0Eodh_s39TOW6kz0VIL3nGfadqu8FLoWqDxSJs,6890
  datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
  datachain/lib/dc/json.py,sha256=ZUThPDAaP2gBFIL5vsQTwKBcuN_dhvC_O44wdDv0jEc,2683
  datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
- datachain/lib/dc/pandas.py,sha256=mM2y44s1-3dwkxjVe6RdfT6PVoeRHS9OgsGaSz4YsqQ,1219
+ datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
  datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
- datachain/lib/dc/records.py,sha256=DOFkQV7A7kZnMiCS4mHOzee2ibWIhz-mWQpgVsU78SE,2524
+ datachain/lib/dc/records.py,sha256=br5MTtD8mCrPpWXiyHXpYL-ChH9_tg0S-7ttAa8hH80,2634
  datachain/lib/dc/storage.py,sha256=QLf3-xMV2Gmy3AA8qF9WqAsb7R8Rk87l4s5hBoiCH98,5285
  datachain/lib/dc/utils.py,sha256=Ct-0FqCaDhNWHx09gJFcCXJGPjMI-VZr4t-GJyqTi44,3984
  datachain/lib/dc/values.py,sha256=cBQubhmPNEDMJldUXzGh-UKbdim4P6O2B91Gp39roKw,1389
@@ -118,17 +119,17 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
  datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
  datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
  datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
- datachain/query/dataset.py,sha256=caUsFzaVZXOz8NmeTMeOdyRQLQP8KCnxYMxF-pG4yFQ,58712
- datachain/query/dispatch.py,sha256=ErdK-biHYhRLDsm7on6vAHSjX-hAHgEHsBRHmuMS_4E,12979
+ datachain/query/dataset.py,sha256=0SKm8VaXYuzm06j53WK-vnB3-55jauJwq3QULPOooVU,58687
+ datachain/query/dispatch.py,sha256=5p_jXxKJVCfIA4jLSQ0tAY1IhZUS3oJvyQXUH0Dk3bc,13215
  datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
  datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
  datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
- datachain/query/schema.py,sha256=b_KnVy6B26Ol4nYG0LqNNpeQ1QYPk95YRGUjXfdaQWs,6606
+ datachain/query/schema.py,sha256=fo_MdPXblMAtbB3kcZAQDzAUHWP2RfuPX2JWndeGGt8,6668
  datachain/query/session.py,sha256=wNdOHAi4HrsEihfzdcTlfB5i1xyj0dw6rlUz84StOoU,6512
  datachain/query/udf.py,sha256=ljAYaF-J77t7iS4zc1-g1ssYd4c6Q-ccKGEc3VQQmeM,1322
  datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
  datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- datachain/remote/studio.py,sha256=kzpOWnmtaeXlRXgHbZ7pxno-r0pSgwq2LJFGSY0u1UY,13110
+ datachain/remote/studio.py,sha256=SCmsYURwqYTXfxQpizOoyxlPE2ECJv-sZWVitStRPgc,13107
  datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
  datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
  datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -150,9 +151,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
  datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
  datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
  datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
- datachain-0.14.5.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
- datachain-0.14.5.dist-info/METADATA,sha256=y6sL0tB9tFRXF_LnjkPLM7cmtBBhXWxTvtNWRnmgfb4,11328
- datachain-0.14.5.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- datachain-0.14.5.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
- datachain-0.14.5.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
- datachain-0.14.5.dist-info/RECORD,,
+ datachain-0.16.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+ datachain-0.16.0.dist-info/METADATA,sha256=om4GIGxM-IQkuTWdISiHploZfvi4BmhAY8ywNdHtqYM,11328
+ datachain-0.16.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+ datachain-0.16.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+ datachain-0.16.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+ datachain-0.16.0.dist-info/RECORD,,