datachain 0.14.5__py3-none-any.whl → 0.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/catalog/catalog.py +19 -9
- datachain/catalog/loader.py +11 -7
- datachain/cli/__init__.py +1 -1
- datachain/cli/commands/datasets.py +3 -3
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +2 -2
- datachain/data_storage/metastore.py +5 -5
- datachain/dataset.py +8 -8
- datachain/lib/convert/values_to_tuples.py +23 -14
- datachain/lib/dataset_info.py +18 -0
- datachain/lib/dc/__init__.py +4 -1
- datachain/lib/dc/database.py +151 -0
- datachain/lib/dc/datachain.py +19 -8
- datachain/lib/dc/datasets.py +52 -0
- datachain/lib/dc/pandas.py +8 -1
- datachain/lib/dc/records.py +12 -14
- datachain/lib/signal_schema.py +10 -1
- datachain/lib/udf.py +2 -1
- datachain/query/dataset.py +12 -14
- datachain/query/dispatch.py +7 -2
- datachain/query/schema.py +4 -1
- datachain/remote/studio.py +2 -2
- datachain/studio.py +2 -2
- {datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/METADATA +1 -1
- {datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/RECORD +30 -29
- {datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/WHEEL +0 -0
- {datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/top_level.txt +0 -0
datachain/__init__.py
CHANGED
@@ -5,8 +5,10 @@ from datachain.lib.dc import (
     DataChain,
     Sys,
     datasets,
+    delete_dataset,
     listings,
     read_csv,
+    read_database,
     read_dataset,
     read_hf,
     read_json,
@@ -61,11 +63,13 @@ __all__ = [
     "VideoFragment",
     "VideoFrame",
     "datasets",
+    "delete_dataset",
     "is_chain_type",
     "listings",
     "metrics",
     "param",
     "read_csv",
+    "read_database",
     "read_dataset",
     "read_hf",
     "read_json",
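A minimal sketch of the new top-level exports in use (hypothetical dataset name, assuming the default local session):

```python
import datachain as dc

# Build and save a small dataset, then remove it again via the new helper.
dc.read_values(num=[1, 2, 3]).save("numbers")
dc.delete_dataset("numbers", force=True)
```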
datachain/catalog/catalog.py
CHANGED
@@ -776,7 +776,7 @@ class Catalog:
         listing: Optional[bool] = False,
         uuid: Optional[str] = None,
         description: Optional[str] = None,
-
+        attrs: Optional[list[str]] = None,
     ) -> "DatasetRecord":
         """
         Creates new dataset of a specific version.
@@ -794,16 +794,16 @@
             dataset = self.get_dataset(name)
             default_version = dataset.next_version

-            if (description or
-                dataset.description != description or dataset.
+            if (description or attrs) and (
+                dataset.description != description or dataset.attrs != attrs
             ):
                 description = description or dataset.description
-
+                attrs = attrs or dataset.attrs

                 self.update_dataset(
                     dataset,
                     description=description,
-
+                    attrs=attrs,
                 )

         except DatasetNotFoundError:
@@ -817,7 +817,7 @@
                 schema=schema,
                 ignore_if_exists=True,
                 description=description,
-
+                attrs=attrs,
             )

             version = version or default_version
@@ -1299,7 +1299,17 @@
         name: str,
         version: Optional[int] = None,
         force: Optional[bool] = False,
+        studio: Optional[bool] = False,
     ):
+        from datachain.remote.studio import StudioClient
+
+        if studio:
+            client = StudioClient()
+            response = client.rm_dataset(name, version=version, force=force)
+            if not response.ok:
+                raise DataChainError(response.message)
+            return
+
         dataset = self.get_dataset(name)
         if not version and not force:
             raise ValueError(f"Missing dataset version from input for dataset {name}")
@@ -1324,15 +1334,15 @@
         name: str,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
-
+        attrs: Optional[list[str]] = None,
     ) -> DatasetRecord:
         update_data = {}
         if new_name:
             update_data["name"] = new_name
         if description is not None:
             update_data["description"] = description
-        if
-        update_data["
+        if attrs is not None:
+            update_data["attrs"] = attrs  # type: ignore[assignment]

         dataset = self.get_dataset(name)
         return self.update_dataset(dataset, **update_data)
datachain/catalog/loader.py
CHANGED
@@ -1,4 +1,5 @@
 import os
+import sys
 from importlib import import_module
 from typing import TYPE_CHECKING, Any, Optional

@@ -15,6 +16,7 @@ METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
 WAREHOUSE_SERIALIZED = "DATACHAIN__WAREHOUSE"
 WAREHOUSE_IMPORT_PATH = "DATACHAIN_WAREHOUSE"
 WAREHOUSE_ARG_PREFIX = "DATACHAIN_WAREHOUSE_ARG_"
+DISTRIBUTED_IMPORT_PYTHONPATH = "DATACHAIN_DISTRIBUTED_PYTHONPATH"
 DISTRIBUTED_IMPORT_PATH = "DATACHAIN_DISTRIBUTED"

 IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
@@ -100,19 +102,21 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
     return warehouse_class(**warehouse_args)


-def get_udf_distributor_class() -> type["AbstractUDFDistributor"]:
-    distributed_import_path
+def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+    if not (distributed_import_path := os.environ.get(DISTRIBUTED_IMPORT_PATH)):
+        return None

-    if not distributed_import_path:
-        raise RuntimeError(
-            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
-            "for distributed UDF processing."
-        )
     # Distributed class paths are specified as (for example): module.classname
     if "." not in distributed_import_path:
         raise RuntimeError(
             f"Invalid {DISTRIBUTED_IMPORT_PATH} import path: {distributed_import_path}"
         )
+
+    # Optional: set the Python path to look for the module
+    distributed_import_pythonpath = os.environ.get(DISTRIBUTED_IMPORT_PYTHONPATH)
+    if distributed_import_pythonpath and distributed_import_pythonpath not in sys.path:
+        sys.path.insert(0, distributed_import_pythonpath)
+
     module_name, _, class_name = distributed_import_path.rpartition(".")
     distributed = import_module(module_name)
     return getattr(distributed, class_name)
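With the new `DATACHAIN_DISTRIBUTED_PYTHONPATH` variable, a distributor class living outside the installed packages can be picked up by prepending its directory to `sys.path`. A sketch, with hypothetical paths and class names:

```python
import os

# Hypothetical plugin directory and class; the loader only needs
# "module.ClassName" to be importable once the extra path is prepended.
os.environ["DATACHAIN_DISTRIBUTED_PYTHONPATH"] = "/opt/datachain_plugins"
os.environ["DATACHAIN_DISTRIBUTED"] = "my_distributor.MyUDFDistributor"

from datachain.catalog.loader import get_udf_distributor_class

# Returns the class, or None when DATACHAIN_DISTRIBUTED is not set.
distributor_cls = get_udf_distributor_class()
```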
datachain/cli/__init__.py
CHANGED
@@ -154,7 +154,7 @@ def edit_dataset(
     name: str,
     new_name: Optional[str] = None,
     description: Optional[str] = None,
-
+    attrs: Optional[list[str]] = None,
     studio: bool = False,
     local: bool = False,
     all: bool = True,
@@ -167,9 +167,9 @@

     if all or local:
         try:
-            catalog.edit_dataset(name, new_name, description,
+            catalog.edit_dataset(name, new_name, description, attrs)
         except DatasetNotFoundError:
             print("Dataset not found in local", file=sys.stderr)

     if (all or studio) and token:
-        edit_studio_dataset(team, name, new_name, description,
+        edit_studio_dataset(team, name, new_name, description, attrs)
datachain/cli/commands/show.py
CHANGED
@@ -42,8 +42,8 @@ def show(
     print("Name: ", name)
     if dataset.description:
         print("Description: ", dataset.description)
-    if dataset.
-        print("
+    if dataset.attrs:
+        print("Attributes: ", ",".join(dataset.attrs))
     print("\n")

     show_records(records, collapse_columns=not no_collapse, hidden_fields=hidden_fields)
datachain/cli/parser/__init__.py
CHANGED
@@ -217,9 +217,9 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
         help="Dataset description",
     )
     parse_edit_dataset.add_argument(
-        "--
+        "--attrs",
         nargs="+",
-        help="Dataset
+        help="Dataset attributes",
     )
     parse_edit_dataset.add_argument(
         "--studio",
datachain/data_storage/metastore.py
CHANGED
@@ -120,7 +120,7 @@ class AbstractMetastore(ABC, Serializable):
         schema: Optional[dict[str, Any]] = None,
         ignore_if_exists: bool = False,
         description: Optional[str] = None,
-
+        attrs: Optional[list[str]] = None,
     ) -> DatasetRecord:
         """Creates new dataset."""

@@ -326,7 +326,7 @@ class AbstractDBMetastore(AbstractMetastore):
             Column("id", Integer, primary_key=True),
             Column("name", Text, nullable=False),
             Column("description", Text),
-            Column("
+            Column("attrs", JSON, nullable=True),
             Column("status", Integer, nullable=False),
             Column("feature_schema", JSON, nullable=True),
             Column("created_at", DateTime(timezone=True)),
@@ -521,7 +521,7 @@ class AbstractDBMetastore(AbstractMetastore):
         schema: Optional[dict[str, Any]] = None,
         ignore_if_exists: bool = False,
         description: Optional[str] = None,
-
+        attrs: Optional[list[str]] = None,
         **kwargs,  # TODO registered = True / False
     ) -> DatasetRecord:
         """Creates new dataset."""
@@ -538,7 +538,7 @@ class AbstractDBMetastore(AbstractMetastore):
             query_script=query_script,
             schema=json.dumps(schema or {}),
             description=description,
-
+            attrs=json.dumps(attrs or []),
         )
         if ignore_if_exists and hasattr(query, "on_conflict_do_nothing"):
             # SQLite and PostgreSQL both support 'on_conflict_do_nothing',
@@ -621,7 +621,7 @@ class AbstractDBMetastore(AbstractMetastore):
         dataset_values = {}
         for field, value in kwargs.items():
             if field in self._dataset_fields[1:]:
-                if field in ["
+                if field in ["attrs", "schema"]:
                     values[field] = json.dumps(value) if value else None
                 else:
                     values[field] = value
datachain/dataset.py
CHANGED
@@ -329,7 +329,7 @@ class DatasetRecord:
     id: int
     name: str
     description: Optional[str]
-
+    attrs: list[str]
     schema: dict[str, Union[SQLType, type[SQLType]]]
     feature_schema: dict
     versions: list[DatasetVersion]
@@ -357,7 +357,7 @@ class DatasetRecord:
         id: int,
         name: str,
         description: Optional[str],
-
+        attrs: str,
         status: int,
         feature_schema: Optional[str],
         created_at: datetime,
@@ -387,7 +387,7 @@ class DatasetRecord:
         version_schema: str,
         version_job_id: Optional[str] = None,
     ) -> "DatasetRecord":
-
+        attrs_lst: list[str] = json.loads(attrs) if attrs else []
         schema_dct: dict[str, Any] = json.loads(schema) if schema else {}
         version_schema_dct: dict[str, str] = (
             json.loads(version_schema) if version_schema else {}
@@ -418,7 +418,7 @@ class DatasetRecord:
             id,
             name,
             description,
-
+            attrs_lst,
             cls.parse_schema(schema_dct),  # type: ignore[arg-type]
             json.loads(feature_schema) if feature_schema else {},
             [dataset_version],
@@ -562,7 +562,7 @@ class DatasetListRecord:
     id: int
     name: str
     description: Optional[str]
-
+    attrs: list[str]
     versions: list[DatasetListVersion]
     created_at: Optional[datetime] = None

@@ -572,7 +572,7 @@ class DatasetListRecord:
         id: int,
         name: str,
         description: Optional[str],
-
+        attrs: str,
         created_at: datetime,
         version_id: int,
         version_uuid: str,
@@ -588,7 +588,7 @@ class DatasetListRecord:
         version_query_script: Optional[str],
         version_job_id: Optional[str] = None,
     ) -> "DatasetListRecord":
-
+        attrs_lst: list[str] = json.loads(attrs) if attrs else []

         dataset_version = DatasetListVersion.parse(
             version_id,
@@ -610,7 +610,7 @@ class DatasetListRecord:
             id,
             name,
             description,
-
+            attrs_lst,
             [dataset_version],
             created_at,
         )
datachain/lib/convert/values_to_tuples.py
CHANGED
@@ -1,5 +1,6 @@
+import itertools
 from collections.abc import Sequence
-from typing import Any, Union
+from typing import Any, Optional, Union

 from datachain.lib.data_model import (
     DataType,
@@ -66,21 +67,29 @@ def values_to_tuples(  # noqa: C901, PLR0912
                 f"signal '{k}' is not present in the output",
             )
         else:
-
-
-
-
-
-
-            raise ValuesToTupleError(
-                ds_name,
-                f"signal '{k}' has unsupported type '{typ.__name__}'."
-                f" Please use DataModel types: {DataTypeNames}",
+            # FIXME: Stops as soon as it finds the first non-None value.
+            # If a non-None value appears early, it won't check the remaining items for
+            # `None` values.
+            try:
+                pos, first_not_none_element = next(
+                    itertools.dropwhile(lambda pair: pair[1] is None, enumerate(v))
                 )
-
-
+            except StopIteration:
+                typ = str  # default to str if all values are None or has length 0
+                nullable = True
             else:
-
+                nullable = pos > 0
+                typ = type(first_not_none_element)  # type: ignore[assignment]
+                if not is_chain_type(typ):
+                    raise ValuesToTupleError(
+                        ds_name,
+                        f"signal '{k}' has unsupported type '{typ.__name__}'."
+                        f" Please use DataModel types: {DataTypeNames}",
+                    )
+                if isinstance(first_not_none_element, list):
+                    typ = list[type(first_not_none_element[0])]  # type: ignore[assignment, misc]
+
+            types_map[k] = Optional[typ] if nullable else typ  # type: ignore[assignment]

         if length < 0:
             length = len_
datachain/lib/dataset_info.py
CHANGED
@@ -32,11 +32,28 @@ class DatasetInfo(DataModel):
     metrics: dict[str, Any] = Field(default={})
     error_message: str = Field(default="")
     error_stack: str = Field(default="")
+    attrs: list[str] = Field(default=[])

     @property
     def is_temp(self) -> bool:
         return Session.is_temp_dataset(self.name)

+    def has_attr(self, attr: str) -> bool:
+        s = attr.split("=")
+        if len(s) == 1:
+            return attr in self.attrs
+
+        name = s[0]
+        value = s[1]
+        for a in self.attrs:
+            s = a.split("=")
+            if value == "*" and s[0] == name:
+                return True
+            if len(s) == 2 and s[0] == name and s[1] == value:
+                return True
+
+        return False
+
     @staticmethod
     def _validate_dict(
         v: Optional[Union[str, dict]],
@@ -83,4 +100,5 @@ class DatasetInfo(DataModel):
             metrics=job.metrics if job else {},
             error_message=version.error_message,
             error_stack=version.error_stack,
+            attrs=dataset.attrs,
         )
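The matching rules behind `has_attr` are worth spelling out; the following standalone sketch re-implements them for illustration only (it is not the library code itself):

```python
def matches(dataset_attrs: list[str], query: str) -> bool:
    name, sep, value = query.partition("=")
    if not sep:                      # plain attribute, e.g. "NLP"
        return query in dataset_attrs
    for a in dataset_attrs:
        a_name, a_sep, a_value = a.partition("=")
        if a_name != name:
            continue
        if value == "*" or (a_sep and a_value == value):
            return True              # "location=*" matches any value
    return False

assert matches(["NLP", "location=US"], "NLP")
assert matches(["NLP", "location=US"], "location=US")
assert matches(["NLP", "location=US"], "location=*")
assert not matches(["NLP", "location=US"], "location=EU")
```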
datachain/lib/dc/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from .csv import read_csv
+from .database import read_database
 from .datachain import C, Column, DataChain
-from .datasets import datasets, read_dataset
+from .datasets import datasets, delete_dataset, read_dataset
 from .hf import read_hf
 from .json import read_json
 from .listings import listings
@@ -19,8 +20,10 @@ __all__ = [
     "DatasetPrepareError",
     "Sys",
     "datasets",
+    "delete_dataset",
     "listings",
     "read_csv",
+    "read_database",
     "read_dataset",
     "read_hf",
     "read_json",
datachain/lib/dc/database.py
ADDED
@@ -0,0 +1,151 @@
+import contextlib
+import itertools
+import os
+import sqlite3
+from typing import TYPE_CHECKING, Any, Optional, Union
+
+import sqlalchemy
+
+if TYPE_CHECKING:
+    from collections.abc import Iterator, Mapping, Sequence
+
+    import sqlalchemy.orm  # noqa: TC004
+
+    from datachain.lib.data_model import DataType
+    from datachain.query import Session
+
+    from .datachain import DataChain
+
+ConnectionType = Union[
+    str,
+    sqlalchemy.engine.URL,
+    sqlalchemy.engine.interfaces.Connectable,
+    sqlalchemy.engine.Engine,
+    sqlalchemy.engine.Connection,
+    sqlalchemy.orm.Session,
+    sqlite3.Connection,
+]
+
+
+@contextlib.contextmanager
+def _connect(
+    connection: "ConnectionType",
+) -> "Iterator[Union[sqlalchemy.engine.Connection, sqlalchemy.orm.Session]]":
+    import sqlalchemy.orm
+
+    with contextlib.ExitStack() as stack:
+        engine_kwargs = {"echo": bool(os.environ.get("DEBUG_SHOW_SQL_QUERIES"))}
+        if isinstance(connection, (str, sqlalchemy.URL)):
+            engine = sqlalchemy.create_engine(connection, **engine_kwargs)
+            stack.callback(engine.dispose)
+            yield stack.enter_context(engine.connect())
+        elif isinstance(connection, sqlite3.Connection):
+            engine = sqlalchemy.create_engine(
+                "sqlite://", creator=lambda: connection, **engine_kwargs
+            )
+            # do not close the connection, as it is managed by the caller
+            yield engine.connect()
+        elif isinstance(connection, sqlalchemy.Engine):
+            yield stack.enter_context(connection.connect())
+        elif isinstance(connection, (sqlalchemy.Connection, sqlalchemy.orm.Session)):
+            # do not close the connection, as it is managed by the caller
+            yield connection
+        else:
+            raise TypeError(f"Unsupported connection type: {type(connection).__name__}")
+
+
+def _infer_schema(
+    result: "sqlalchemy.engine.Result",
+    to_infer: list[str],
+    infer_schema_length: Optional[int] = 100,
+) -> tuple[list["sqlalchemy.Row"], dict[str, "DataType"]]:
+    from datachain.lib.convert.values_to_tuples import values_to_tuples
+
+    if not to_infer:
+        return [], {}
+
+    rows = list(itertools.islice(result, infer_schema_length))
+    values = {col: [row._mapping[col] for row in rows] for col in to_infer}
+    _, output_schema, _ = values_to_tuples("", **values)
+    return rows, output_schema
+
+
+def read_database(
+    query: Union[str, "sqlalchemy.sql.expression.Executable"],
+    connection: "ConnectionType",
+    params: Union["Sequence[Mapping[str, Any]]", "Mapping[str, Any]", None] = None,
+    *,
+    output: Optional["dict[str, DataType]"] = None,
+    session: Optional["Session"] = None,
+    settings: Optional[dict] = None,
+    in_memory: bool = False,
+    infer_schema_length: Optional[int] = 100,
+) -> "DataChain":
+    """
+    Read the results of a SQL query into a DataChain, using a given database connection.
+
+    Args:
+        query:
+            The SQL query to execute. Can be a raw SQL string or a SQLAlchemy
+            `Executable` object.
+        connection: SQLAlchemy connectable, str, or a sqlite3 connection
+            Using SQLAlchemy makes it possible to use any DB supported by that
+            library. If a DBAPI2 object, only sqlite3 is supported. The user is
+            responsible for engine disposal and connection closure for the
+            SQLAlchemy connectable; str connections are closed automatically.
+        params: Parameters to pass to execute method.
+        output: A dictionary mapping column names to types, used to override the
+            schema inferred from the query results.
+        session: Session to use for the chain.
+        settings: Settings to use for the chain.
+        in_memory: If True, creates an in-memory session. Defaults to False.
+        infer_schema_length:
+            The maximum number of rows to scan for inferring schema.
+            If set to `None`, the full data may be scanned.
+            The rows used for schema inference are stored in memory,
+            so large values can lead to high memory usage.
+            Only applies if the `output` parameter is not set for the given column.
+
+    Examples:
+        Reading from a SQL query against a user-supplied connection:
+        ```python
+        query = "SELECT key, value FROM tbl"
+        chain = dc.read_database(query, connection, output={"value": float})
+        ```
+
+        Load data from a SQLAlchemy driver/engine:
+        ```python
+        from sqlalchemy import create_engine
+        engine = create_engine("postgresql+psycopg://myuser:mypassword@localhost:5432/mydb")
+        chain = dc.read_database("select * from tbl", engine)
+        ```
+
+        Load data from a parameterized SQLAlchemy query:
+        ```python
+        query = "SELECT key, value FROM tbl WHERE value > :value"
+        dc.read_database(query, engine, params={"value": 50})
+        ```
+
+    Notes:
+        This function works with a variety of databases — including, but not limited to,
+        SQLite, DuckDB, PostgreSQL, and Snowflake, provided the appropriate driver is
+        installed.
+    """
+    from datachain.lib.dc.records import read_records
+
+    output = output or {}
+    if isinstance(query, str):
+        query = sqlalchemy.text(query)
+    kw = {"execution_options": {"stream_results": True}}  # use server-side cursors
+    with _connect(connection) as conn, conn.execute(query, params, **kw) as result:
+        cols = result.keys()
+        to_infer = [k for k in cols if k not in output]  # preserve the order
+        rows, inferred_schema = _infer_schema(result, to_infer, infer_schema_length)
+        records = (row._asdict() for row in itertools.chain(rows, result))
+        return read_records(
+            records,
+            session=session,
+            settings=settings,
+            in_memory=in_memory,
+            schema=inferred_schema | output,
+        )
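Beyond the docstring examples, a fully self-contained sketch against an in-memory SQLite database (hypothetical table and column names) could look like this:

```python
import sqlite3

import datachain as dc

# Stdlib sqlite3 connection; read_database leaves closing it to the caller.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE tbl (key TEXT, value REAL)")
conn.executemany("INSERT INTO tbl VALUES (?, ?)", [("a", 1.5), ("b", 2.5)])

# "value" is pinned via `output`; the remaining columns are inferred from the rows.
chain = dc.read_database(
    "SELECT key, value FROM tbl",
    conn,
    output={"value": float},
    in_memory=True,
)
chain.show()
conn.close()
```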
datachain/lib/dc/datachain.py
CHANGED
@@ -133,7 +133,7 @@ class DataChain:
                 .choices[0]
                 .message.content,
             )
-            .
+            .persist()
         )

         try:
@@ -443,22 +443,33 @@
         )
         return listings(*args, **kwargs)

+    def persist(self) -> "Self":
+        """Saves temporary chain that will be removed after the process ends.
+        Temporary datasets are useful for optimization, for example when we have
+        multiple chains starting with identical sub-chain. We can then persist that
+        common chain and use it to calculate other chains, to avoid re-calculation
+        every time.
+        It returns the chain itself.
+        """
+        schema = self.signals_schema.clone_without_sys_signals().serialize()
+        return self._evolve(query=self._query.save(feature_schema=schema))
+
     def save(  # type: ignore[override]
         self,
-        name:
+        name: str,
         version: Optional[int] = None,
         description: Optional[str] = None,
-
+        attrs: Optional[list[str]] = None,
         **kwargs,
     ) -> "Self":
         """Save to a Dataset. It returns the chain itself.

         Parameters:
-            name : dataset name.
-                removed after process ends. Temp dataset are useful for optimization.
+            name : dataset name.
             version : version of a dataset. Default - the last version that exist.
             description : description of a dataset.
-
+            attrs : attributes of a dataset. They can be without value, e.g "NLP",
+                or with a value, e.g "location=US".
         """
         schema = self.signals_schema.clone_without_sys_signals().serialize()
         return self._evolve(
@@ -466,7 +477,7 @@
             name=name,
             version=version,
             description=description,
-
+            attrs=attrs,
             feature_schema=schema,
             **kwargs,
         )
@@ -1112,7 +1123,7 @@
         if self._query.attached:
             chain = self
         else:
-            chain = self.
+            chain = self.persist()
         assert chain.name is not None  # for mypy
         return PytorchDataset(
             chain.name,
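A short sketch of how `persist()` and the new `attrs` argument to `save()` might be combined (hypothetical signal and dataset names):

```python
import datachain as dc

base = dc.read_values(num=list(range(10)))

# Persist the shared prefix once as a temporary dataset (removed when the
# process ends), then branch from it without re-computing it per derived chain.
base = base.persist()
small = base.filter(dc.C("num") < 5)
large = base.filter(dc.C("num") >= 5)

# attrs can be plain ("demo") or key=value ("location=US").
large.save("large_numbers", attrs=["demo", "location=US"])
```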
datachain/lib/dc/datasets.py
CHANGED
@@ -102,6 +102,7 @@ def datasets(
     column: Optional[str] = None,
     include_listing: bool = False,
     studio: bool = False,
+    attrs: Optional[list[str]] = None,
 ) -> "DataChain":
     """Generate chain with list of registered datasets.

@@ -114,6 +115,10 @@ def datasets(
         include_listing: If True, includes listing datasets. Defaults to False.
         studio: If True, returns datasets from Studio only,
             otherwise returns all local datasets. Defaults to False.
+        attrs: Optional list of attributes to filter datasets on. It can be just
+            attribute without value e.g "NLP", or attribute with value
+            e.g "location=US". Attribute with value can also accept "*" to target
+            all that have specific name e.g "location=*"

     Returns:
         DataChain: A new DataChain instance containing dataset information.
@@ -139,6 +144,10 @@
     ]
     datasets_values = [d for d in datasets_values if not d.is_temp]

+    if attrs:
+        for attr in attrs:
+            datasets_values = [d for d in datasets_values if d.has_attr(attr)]
+
     if not column:
         # flattening dataset fields
         schema = {
@@ -166,3 +175,46 @@
         output={column: DatasetInfo},
         **{column: datasets_values},  # type: ignore[arg-type]
     )
+
+
+def delete_dataset(
+    name: str,
+    version: Optional[int] = None,
+    force: Optional[bool] = False,
+    studio: Optional[bool] = False,
+    session: Optional[Session] = None,
+    in_memory: bool = False,
+) -> None:
+    """Removes specific dataset version or all dataset versions, depending on
+    a force flag.
+
+    Args:
+        name : Dataset name
+        version : Optional dataset version
+        force: If true, all datasets versions will be removed. Defaults to False.
+        studio: If True, removes dataset from Studio only,
+            otherwise remove from local. Defaults to False.
+        session: Optional session instance. If not provided, uses default session.
+        in_memory: If True, creates an in-memory session. Defaults to False.
+
+    Returns: None
+
+    Example:
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats")
+        ```
+
+        ```py
+        import datachain as dc
+        dc.delete_dataset("cats", version=1)
+        ```
+    """
+
+    session = Session.get(session, in_memory=in_memory)
+    catalog = session.catalog
+    if not force:
+        version = version or catalog.get_dataset(name).latest_version
+    else:
+        version = None
+    catalog.remove_dataset(name, version=version, force=force, studio=studio)
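Filtering the dataset listing by attribute then looks roughly like this (hypothetical attribute values):

```python
import datachain as dc

# Only datasets carrying both attributes.
nlp_us = dc.datasets(attrs=["NLP", "location=US"])

# "*" matches any value for the given attribute name.
located = dc.datasets(attrs=["location=*"])
located.show()
```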
datachain/lib/dc/pandas.py
CHANGED
@@ -37,7 +37,14 @@ def read_pandas(  # type: ignore[override]
     """
     from .utils import DatasetPrepareError

-
+    def get_col_name(col):
+        if isinstance(col, tuple):
+            # Join tuple elements with underscore for MultiIndex columns
+            return "_".join(map(str, col)).lower()
+        # Handle regular string column names
+        return str(col).lower()
+
+    fr_map = {get_col_name(col): df[col].tolist() for col in df.columns}

     for c in fr_map:
         if not c.isidentifier():
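The effect of the new column-name handling is easiest to see with a MultiIndex frame; a small sketch (hypothetical column labels):

```python
import pandas as pd

import datachain as dc

# MultiIndex columns ("metrics", "f1") flatten to the signal name "metrics_f1".
df = pd.DataFrame({("metrics", "f1"): [0.9, 0.8], ("metrics", "recall"): [0.7, 0.6]})
chain = dc.read_pandas(df, in_memory=True)
print(chain.to_pandas().columns)  # expect metrics_f1 and metrics_recall
```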
datachain/lib/dc/records.py
CHANGED
@@ -1,8 +1,5 @@
-from
-
-    Optional,
-    Union,
-)
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Optional, Union

 import sqlalchemy

@@ -12,6 +9,7 @@ from datachain.lib.file import (
 )
 from datachain.lib.signal_schema import SignalSchema
 from datachain.query import Session
+from datachain.query.schema import Column

 if TYPE_CHECKING:
     from typing_extensions import ParamSpec
@@ -22,7 +20,7 @@


 def read_records(
-    to_insert: Optional[Union[dict,
+    to_insert: Optional[Union[dict, Iterable[dict]]],
     session: Optional[Session] = None,
     settings: Optional[dict] = None,
     in_memory: bool = False,
@@ -54,10 +52,11 @@ def read_records(

     if schema:
         signal_schema = SignalSchema(schema)
-        columns = [
-
-
-
+        columns = []
+        for c in signal_schema.db_signals(as_columns=True):
+            assert isinstance(c, Column)
+            kw = {"nullable": c.nullable} if c.nullable is not None else {}
+            columns.append(sqlalchemy.Column(c.name, c.type, **kw))
     else:
         columns = [
             sqlalchemy.Column(name, typ)
@@ -83,8 +82,7 @@ def read_records(

     warehouse = catalog.warehouse
     dr = warehouse.dataset_rows(dsr)
-
-
-
-        db.execute(insert_q.values(**record))
+    table = dr.get_table()
+    warehouse.insert_rows(table, to_insert)
+    warehouse.insert_rows_done(table)
     return read_dataset(name=dsr.name, session=session, settings=settings)
datachain/lib/signal_schema.py
CHANGED
@@ -581,7 +581,11 @@
         signals = [
             DEFAULT_DELIMITER.join(path)
             if not as_columns
-            else Column(
+            else Column(
+                DEFAULT_DELIMITER.join(path),
+                python_to_sql(_type),
+                nullable=is_optional(_type),
+            )
             for path, _type, has_subtree, _ in self.get_flat_tree(
                 include_hidden=include_hidden
             )
@@ -990,3 +994,8 @@
         }

         return SignalSchema.deserialize(schema)
+
+
+def is_optional(type_: Any) -> bool:
+    """Check if a type is Optional."""
+    return get_origin(type_) is Union and type(None) in get_args(type_)
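The new module-level helper is a thin check on typing unions; for example:

```python
from typing import Optional, Union

from datachain.lib.signal_schema import is_optional

assert is_optional(Optional[int])        # Union[int, None]
assert is_optional(Union[str, None])
assert not is_optional(int)              # not a Union at all
assert not is_optional(Union[int, str])  # a Union without None
```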
datachain/lib/udf.py
CHANGED
@@ -474,8 +474,9 @@ class Generator(UDFBase):
             remove_prefetched=bool(self.prefetch) and not cache,
         )
         with closing(prepared_inputs):
-            for row in
+            for row in prepared_inputs:
                 yield _process_row(row)
+                processed_cb.relative_update(1)

         self.teardown()

datachain/query/dataset.py
CHANGED
@@ -437,9 +437,17 @@ class UDFStep(Step, ABC):
                 "distributed processing."
             )

-            from datachain.catalog.loader import
+            from datachain.catalog.loader import (
+                DISTRIBUTED_IMPORT_PATH,
+                get_udf_distributor_class,
+            )
+
+            if not (udf_distributor_class := get_udf_distributor_class()):
+                raise RuntimeError(
+                    f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+                    "for distributed UDF processing."
+                )

-            udf_distributor_class = get_udf_distributor_class()
             udf_distributor = udf_distributor_class(
                 catalog=catalog,
                 table=udf_table,
@@ -1162,16 +1170,6 @@ class DatasetQuery:
         )
         return sqlalchemy.table(table_name)

-    @staticmethod
-    def delete(
-        name: str, version: Optional[int] = None, catalog: Optional["Catalog"] = None
-    ) -> None:
-        from datachain.catalog import get_catalog
-
-        catalog = catalog or get_catalog()
-        version = version or catalog.get_dataset(name).latest_version
-        catalog.remove_dataset(name, version)
-
     @property
     def attached(self) -> bool:
         """
@@ -1682,7 +1680,7 @@
         version: Optional[int] = None,
         feature_schema: Optional[dict] = None,
         description: Optional[str] = None,
-
+        attrs: Optional[list[str]] = None,
         **kwargs,
     ) -> "Self":
         """Save the query as a dataset."""
@@ -1716,7 +1714,7 @@
             feature_schema=feature_schema,
             columns=columns,
             description=description,
-
+            attrs=attrs,
             **kwargs,
         )
         version = version or dataset.latest_version
datachain/query/dispatch.py
CHANGED
@@ -13,7 +13,7 @@ from multiprocess import get_context

 from datachain.catalog import Catalog
 from datachain.catalog.catalog import clone_catalog_with_cache
-from datachain.catalog.loader import get_udf_distributor_class
+from datachain.catalog.loader import DISTRIBUTED_IMPORT_PATH, get_udf_distributor_class
 from datachain.lib.udf import _get_cache
 from datachain.query.batch import RowsOutput, RowsOutputBatch
 from datachain.query.dataset import (
@@ -91,7 +91,12 @@ def udf_entrypoint() -> int:


 def udf_worker_entrypoint() -> int:
-
+    if not (udf_distributor_class := get_udf_distributor_class()):
+        raise RuntimeError(
+            f"{DISTRIBUTED_IMPORT_PATH} import path is required "
+            "for distributed UDF processing."
+        )
+    return udf_distributor_class.run_worker()


 class UDFDispatcher:
datachain/query/schema.py
CHANGED
@@ -40,12 +40,15 @@ class ColumnMeta(type):
 class Column(sa.ColumnClause, metaclass=ColumnMeta):
     inherit_cache: Optional[bool] = True

-    def __init__(
+    def __init__(
+        self, text, type_=None, is_literal=False, nullable=None, _selectable=None
+    ):
         """Dataset column."""
         self.name = ColumnMeta.to_db_name(text)
         super().__init__(
             self.name, type_=type_, is_literal=is_literal, _selectable=_selectable
         )
+        self.nullable = nullable

     def __getattr__(self, name: str):
         return Column(self.name + DEFAULT_DELIMITER + name)
datachain/remote/studio.py
CHANGED
@@ -290,13 +290,13 @@ class StudioClient:
         name: str,
         new_name: Optional[str] = None,
         description: Optional[str] = None,
-
+        attrs: Optional[list[str]] = None,
     ) -> Response[DatasetInfoData]:
         body = {
             "new_name": new_name,
             "dataset_name": name,
             "description": description,
-            "
+            "attrs": attrs,
         }

         return self._send_request(
datachain/studio.py
CHANGED
@@ -187,10 +187,10 @@ def edit_studio_dataset(
     name: str,
     new_name: Optional[str] = None,
     description: Optional[str] = None,
-
+    attrs: Optional[list[str]] = None,
 ):
     client = StudioClient(team=team_name)
-    response = client.edit_dataset(name, new_name, description,
+    response = client.edit_dataset(name, new_name, description, attrs)
     if not response.ok:
         raise DataChainError(response.message)

{datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-datachain/__init__.py,sha256=
+datachain/__init__.py,sha256=Dx_Dw6AuvC_CZtXxfRv0Z-ND6ieC4Cz-tZkMW-Rvmz4,1496
 datachain/__main__.py,sha256=hG3Y4ARGEqe1AWwNMd259rBlqtphx1Wk39YbueQ0yV8,91
 datachain/asyn.py,sha256=RH_jFwJcTXxhEFomaI9yL6S3Onau6NZ6FSKfKFGtrJE,9689
 datachain/cache.py,sha256=yQblPhOh_Mq74Ma7xT1CL1idLJ0HgrQxpGVYvRy_9Eg,3623
 datachain/config.py,sha256=g8qbNV0vW2VEKpX-dGZ9pAn0DAz6G2ZFcr7SAV3PoSM,4272
-datachain/dataset.py,sha256=
+datachain/dataset.py,sha256=msBC62M_HAv3hT4tKFEGOlH3sMCMg5DVd5lhmqkDGB4,19379
 datachain/error.py,sha256=bxAAL32lSeMgzsQDEHbGTGORj-mPzzpCRvWDPueJNN4,1092
 datachain/job.py,sha256=x5PB6d5sqx00hePNNkirESlOVAvnmkEM5ygUgQmAhsk,1262
 datachain/listing.py,sha256=kNSCFYWo2iM1wWg1trwq4WpYZxYqz4RKxkTtsppEzAw,7079
@@ -13,24 +13,24 @@ datachain/nodes_thread_pool.py,sha256=mdo0s-VybuSZkRUARcUO4Tjh8KFfZr9foHqmupx2Sm
 datachain/progress.py,sha256=lRzxoYP4Qv2XBwD78sOkmYRzHFpZ2ExVNJF8wAeICtY,770
 datachain/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/script_meta.py,sha256=V-LaFOZG84pD0Zc0NvejYdzwDgzITv6yHvAHggDCnuY,4978
-datachain/studio.py,sha256=
+datachain/studio.py,sha256=CwXrZ3PXJFIoilelIHblDV05kzcWj9vbV3KanMPVrRQ,10015
 datachain/telemetry.py,sha256=0A4IOPPp9VlP5pyW9eBfaTK3YhHGzHl7dQudQjUAx9A,994
 datachain/utils.py,sha256=8Qz8lRrX0bUTGvwYd-OR-l6ElVRsQBdBO5QMvwt56T4,15190
 datachain/catalog/__init__.py,sha256=cMZzSz3VoUi-6qXSVaHYN-agxQuAcz2XSqnEPZ55crE,353
-datachain/catalog/catalog.py,sha256=
+datachain/catalog/catalog.py,sha256=drCemStFXk2MZgexbUsSIBJuUvn0YwL1tJO69KrWeeg,61004
 datachain/catalog/datasource.py,sha256=IkGMh0Ttg6Q-9DWfU_H05WUnZepbGa28HYleECi6K7I,1353
-datachain/catalog/loader.py,sha256=
-datachain/cli/__init__.py,sha256=
+datachain/catalog/loader.py,sha256=UXjYD6BNRoupPvkiz3-b04jepXhtLHCA4gzKFnXxOtQ,5987
+datachain/cli/__init__.py,sha256=i40xHzVZP3iZFBw3UixQ2OU-s_GQq6OyvQ-_6opwIYc,8333
 datachain/cli/utils.py,sha256=wrLnAh7Wx8O_ojZE8AE4Lxn5WoxHbOj7as8NWlLAA74,3036
 datachain/cli/commands/__init__.py,sha256=zp3bYIioO60x_X04A4-IpZqSYVnpwOa1AdERQaRlIhI,493
-datachain/cli/commands/datasets.py,sha256=
+datachain/cli/commands/datasets.py,sha256=sQ83zxHLuP04cXqBYD3iVcsr49LHA3lnjYxdL142HMk,5793
 datachain/cli/commands/du.py,sha256=9edEzDEs98K2VYk8Wf-ZMpUzALcgm9uD6YtoqbvtUGU,391
 datachain/cli/commands/index.py,sha256=eglNaIe1yyIadUHHumjtNbgIjht6kme7SS7xE3YHR88,198
 datachain/cli/commands/ls.py,sha256=dSD2_MHng4t9HRFJZWMOCjPL4XU3qaBV3piNl8UXP08,5275
 datachain/cli/commands/misc.py,sha256=c0DmkOLwcDI2YhA8ArOuLJk6aGzSMZCiKL_E2JGibVE,600
 datachain/cli/commands/query.py,sha256=2S7hQxialt1fkbocxi6JXZI6jS5QnFrD1aOjKgZkzfI,1471
-datachain/cli/commands/show.py,sha256=
-datachain/cli/parser/__init__.py,sha256=
+datachain/cli/commands/show.py,sha256=K__cCLDJLTRt-sBTMxDID0A_4dFgRRMvjDrrVWcbMUQ,1606
+datachain/cli/parser/__init__.py,sha256=SKB94ZS9kRHV7UOrQcIXsSQ7BOFlp4U2To4wseXXcaI,15724
 datachain/cli/parser/job.py,sha256=kvQkSfieyUmvJpOK8p78UgS8sygHhQXztRlOtVcgtaU,3449
 datachain/cli/parser/studio.py,sha256=Y-1OlQGecLVi9QofvWUfSlPd2ISyaESf7QFGZqGsrdw,3609
 datachain/cli/parser/utils.py,sha256=rETdD-9Hq9A4OolgfT7jQw4aoawtbfmkdtH6E7nkhpI,2888
@@ -45,7 +45,7 @@ datachain/client/s3.py,sha256=YCtDhKVO_jGsMPeyqe3xk5QsF5lqMabqkt0tPFWUHOM,7286
 datachain/data_storage/__init__.py,sha256=9Wit-oe5P46V7CJQTD0BJ5MhOa2Y9h3ddJ4VWTe-Lec,273
 datachain/data_storage/db_engine.py,sha256=n8ojCbvVMPY2e3SG8fUaaD0b9GkVfpl_Naa_6EiHfWg,3788
 datachain/data_storage/job.py,sha256=w-7spowjkOa1P5fUVtJou3OltT0L48P0RYWZ9rSJ9-s,383
-datachain/data_storage/metastore.py,sha256=
+datachain/data_storage/metastore.py,sha256=bhfAaijM7p_D5ltMWg-CVEv9lTflL3bGUWqAmJ8qFbc,37774
 datachain/data_storage/schema.py,sha256=asZYz1cg_WKfe2Q-k5W51E2z2CzHU5B4QEDZDMFr8yo,9346
 datachain/data_storage/serializer.py,sha256=6G2YtOFqqDzJf1KbvZraKGXl2XHZyVml2krunWUum5o,927
 datachain/data_storage/sqlite.py,sha256=f4tvq0gzYQP7aYGnfL3j4IBUNvctpBxI_ioFU-B1LFc,24540
@@ -69,7 +69,7 @@ datachain/lib/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 datachain/lib/arrow.py,sha256=9UBCF-lftQaz0yxdsjbLKbyzVSmrF_QSWdhp2oBDPqs,9486
 datachain/lib/clip.py,sha256=lm5CzVi4Cj1jVLEKvERKArb-egb9j1Ls-fwTItT6vlI,6150
 datachain/lib/data_model.py,sha256=ZwBXELtqROEdLL4DmxTipnwUZmhQvMz_UVDzyf7nQ9Y,2899
-datachain/lib/dataset_info.py,sha256=
+datachain/lib/dataset_info.py,sha256=Mmo3r_MWRb-47H4QueSaUqgeENJiJZmjkTYBMpRuKM8,3128
 datachain/lib/file.py,sha256=HLQXS_WULm7Y-fkHMy0WpibVAcrkLPRS6CrZy6rwFe0,30450
 datachain/lib/hf.py,sha256=gjxuStZBlKtNk3-4yYSlWZDv9zBGblOdvEy_Lwap5hA,5882
 datachain/lib/image.py,sha256=butvUY_33PVEYPKX2nVCPeJjJVcBaptZwsE9REQsTS8,3247
@@ -79,10 +79,10 @@ datachain/lib/meta_formats.py,sha256=Epydbdch1g4CojK8wd_ePzmwmljC4fVWlJtZ16jsX-A
 datachain/lib/model_store.py,sha256=DNIv8Y6Jtk1_idNLzIpsThOsdW2BMAudyUCbPUcgcxk,2515
 datachain/lib/pytorch.py,sha256=YS6yR13iVlrAXo5wzJswFFUHwWOql9KTdWIa86DXB-k,7712
 datachain/lib/settings.py,sha256=ZELRCTLbi5vzRPiDX6cQ9LLg9TefJ_A05gIGni0lll8,2535
-datachain/lib/signal_schema.py,sha256=
+datachain/lib/signal_schema.py,sha256=rt5DpL6DptQEZ8NYe2x_v1C_QFO-lDVEUawxzSswKXw,36062
 datachain/lib/tar.py,sha256=3WIzao6yD5fbLqXLTt9GhPGNonbFIs_fDRu-9vgLgsA,1038
 datachain/lib/text.py,sha256=UNHm8fhidk7wdrWqacEWaA6I9ykfYqarQ2URby7jc7M,1261
-datachain/lib/udf.py,sha256=
+datachain/lib/udf.py,sha256=zCdO5__gLMCgrdHmOvIa0eoWKCDAU1uO-MMAu_EU13o,16228
 datachain/lib/udf_signature.py,sha256=2EtsOPDNSPqcOlYwqbCdy6RF5MldI-7smii8aLy8p7Y,7543
 datachain/lib/utils.py,sha256=QrjVs_oLRXEotOPUYurBJypBFi_ReTJmxcnJeH4j2Uk,1596
 datachain/lib/video.py,sha256=suH_8Mi8VYk4-IVb1vjSduF_njs64ji1WGKHxDLnGYw,6629
@@ -93,17 +93,18 @@ datachain/lib/convert/flatten.py,sha256=IZFiUYbgXSxXhPSG5Cqf5IjnJ4ZDZKXMr4o_yCR1
 datachain/lib/convert/python_to_sql.py,sha256=wg-O5FRKX3x3Wh8ZL1b9ntMlgf1zRO4djMP3t8CHJLo,3188
 datachain/lib/convert/sql_to_python.py,sha256=XXCBYDQFUXJIBNWkjEP944cnCfJ8GF2Tji0DLF3A_zQ,315
 datachain/lib/convert/unflatten.py,sha256=ysMkstwJzPMWUlnxn-Z-tXJR3wmhjHeSN_P-sDcLS6s,2010
-datachain/lib/convert/values_to_tuples.py,sha256=
-datachain/lib/dc/__init__.py,sha256=
+datachain/lib/convert/values_to_tuples.py,sha256=CJ7x91ZYrRMc1lr-BR5AYi7EkWHbzPu1bVqCiP6jLoY,4491
+datachain/lib/dc/__init__.py,sha256=HD0NYrdy44u6kkpvgGjJcvGz-UGTHui2azghcT8ZUg0,838
 datachain/lib/dc/csv.py,sha256=asWPAxhMgIoLAdD2dObDlnGL8CTSD3TAuFuM4ci89bQ,4374
-datachain/lib/dc/
-datachain/lib/dc/
+datachain/lib/dc/database.py,sha256=gYKh1iO5hOWMPFTU1vZC5kOXkJzVse14TYTWE4_1iEA,5940
+datachain/lib/dc/datachain.py,sha256=aRTHaYMk2C1A3dslGpaaEmTvhwvbqnMNaWIBgdIWUX8,76847
+datachain/lib/dc/datasets.py,sha256=u6hlz0Eodh_s39TOW6kz0VIL3nGfadqu8FLoWqDxSJs,6890
 datachain/lib/dc/hf.py,sha256=PJl2wiLjdRsMz0SYbLT-6H8b-D5i2WjeH7li8HHOk_0,2145
 datachain/lib/dc/json.py,sha256=ZUThPDAaP2gBFIL5vsQTwKBcuN_dhvC_O44wdDv0jEc,2683
 datachain/lib/dc/listings.py,sha256=2na9v63xO1vPUNaoBSzA-TSN49V7zQAb-4iS1wOPLFE,1029
-datachain/lib/dc/pandas.py,sha256=
+datachain/lib/dc/pandas.py,sha256=ObueUXDUFKJGu380GmazdG02ARpKAHPhSaymfmOH13E,1489
 datachain/lib/dc/parquet.py,sha256=zYcSgrWwyEDW9UxGUSVdIVsCu15IGEf0xL8KfWQqK94,1782
-datachain/lib/dc/records.py,sha256=
+datachain/lib/dc/records.py,sha256=br5MTtD8mCrPpWXiyHXpYL-ChH9_tg0S-7ttAa8hH80,2634
 datachain/lib/dc/storage.py,sha256=QLf3-xMV2Gmy3AA8qF9WqAsb7R8Rk87l4s5hBoiCH98,5285
 datachain/lib/dc/utils.py,sha256=Ct-0FqCaDhNWHx09gJFcCXJGPjMI-VZr4t-GJyqTi44,3984
 datachain/lib/dc/values.py,sha256=cBQubhmPNEDMJldUXzGh-UKbdim4P6O2B91Gp39roKw,1389
@@ -118,17 +119,17 @@ datachain/model/ultralytics/pose.py,sha256=gXAWfAk4OWZl93hKcQPKZvqJa3nIrECB4RM8K
 datachain/model/ultralytics/segment.py,sha256=koq1HASo29isf0in6oSlzmU4IzsmOXe87F1ajQQVfh4,2911
 datachain/query/__init__.py,sha256=7DhEIjAA8uZJfejruAVMZVcGFmvUpffuZJwgRqNwe-c,263
 datachain/query/batch.py,sha256=6w8gzLTmLeylststu-gT5jIqEfi4-djS7_yTYyeo-fw,4190
-datachain/query/dataset.py,sha256=
-datachain/query/dispatch.py,sha256=
+datachain/query/dataset.py,sha256=0SKm8VaXYuzm06j53WK-vnB3-55jauJwq3QULPOooVU,58687
+datachain/query/dispatch.py,sha256=5p_jXxKJVCfIA4jLSQ0tAY1IhZUS3oJvyQXUH0Dk3bc,13215
 datachain/query/metrics.py,sha256=r5b0ygYhokbXp8Mg3kCH8iFSRw0jxzyeBe-C-J_bKFc,938
 datachain/query/params.py,sha256=O_j89mjYRLOwWNhYZl-z7mi-rkdP7WyFmaDufsdTryE,863
 datachain/query/queue.py,sha256=waqM_KzavU8C-G95-4211Nd4GXna_u2747Chgwtgz2w,3839
-datachain/query/schema.py,sha256=
+datachain/query/schema.py,sha256=fo_MdPXblMAtbB3kcZAQDzAUHWP2RfuPX2JWndeGGt8,6668
 datachain/query/session.py,sha256=wNdOHAi4HrsEihfzdcTlfB5i1xyj0dw6rlUz84StOoU,6512
 datachain/query/udf.py,sha256=ljAYaF-J77t7iS4zc1-g1ssYd4c6Q-ccKGEc3VQQmeM,1322
 datachain/query/utils.py,sha256=u0A_BwG9PNs0DxoDcvSWgWLpj3ByTUv8CqH13CIuGag,1293
 datachain/remote/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-datachain/remote/studio.py,sha256=
+datachain/remote/studio.py,sha256=SCmsYURwqYTXfxQpizOoyxlPE2ECJv-sZWVitStRPgc,13107
 datachain/sql/__init__.py,sha256=6SQRdbljO3d2hx3EAVXEZrHQKv5jth0Jh98PogT59No,262
 datachain/sql/selectable.py,sha256=cTc60qVoAwqqss0Vop8Lt5Z-ROnM1XrQmL_GLjRxhXs,1765
 datachain/sql/types.py,sha256=ASSPkmM5EzdRindqj2O7WHLXq8VHAgFYedG8lYfGvVI,14045
@@ -150,9 +151,9 @@ datachain/sql/sqlite/vector.py,sha256=ncW4eu2FlJhrP_CIpsvtkUabZlQdl2D5Lgwy_cbfqR
 datachain/toolkit/__init__.py,sha256=eQ58Q5Yf_Fgv1ZG0IO5dpB4jmP90rk8YxUWmPc1M2Bo,68
 datachain/toolkit/split.py,sha256=ktGWzY4kyzjWyR86dhvzw-Zhl0lVk_LOX3NciTac6qo,2914
 datachain/torch/__init__.py,sha256=gIS74PoEPy4TB3X6vx9nLO0Y3sLJzsA8ckn8pRWihJM,579
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
-datachain-0.
+datachain-0.16.0.dist-info/licenses/LICENSE,sha256=8DnqK5yoPI_E50bEg_zsHKZHY2HqPy4rYN338BHQaRA,11344
+datachain-0.16.0.dist-info/METADATA,sha256=om4GIGxM-IQkuTWdISiHploZfvi4BmhAY8ywNdHtqYM,11328
+datachain-0.16.0.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+datachain-0.16.0.dist-info/entry_points.txt,sha256=0GMJS6B_KWq0m3VT98vQI2YZodAMkn4uReZ_okga9R4,49
+datachain-0.16.0.dist-info/top_level.txt,sha256=lZPpdU_2jJABLNIg2kvEOBi8PtsYikbN1OdMLHk8bTg,10
+datachain-0.16.0.dist-info/RECORD,,
{datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/WHEEL
File without changes
{datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/entry_points.txt
File without changes
{datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/licenses/LICENSE
File without changes
{datachain-0.14.5.dist-info → datachain-0.16.0.dist-info}/top_level.txt
File without changes