datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datachain/__init__.py +4 -0
- datachain/asyn.py +11 -12
- datachain/cache.py +5 -5
- datachain/catalog/__init__.py +0 -2
- datachain/catalog/catalog.py +276 -354
- datachain/catalog/dependency.py +164 -0
- datachain/catalog/loader.py +8 -3
- datachain/checkpoint.py +43 -0
- datachain/cli/__init__.py +10 -17
- datachain/cli/commands/__init__.py +1 -8
- datachain/cli/commands/datasets.py +42 -27
- datachain/cli/commands/ls.py +15 -15
- datachain/cli/commands/show.py +2 -2
- datachain/cli/parser/__init__.py +3 -43
- datachain/cli/parser/job.py +1 -1
- datachain/cli/parser/utils.py +1 -2
- datachain/cli/utils.py +2 -15
- datachain/client/azure.py +2 -2
- datachain/client/fsspec.py +34 -23
- datachain/client/gcs.py +3 -3
- datachain/client/http.py +157 -0
- datachain/client/local.py +11 -7
- datachain/client/s3.py +3 -3
- datachain/config.py +4 -8
- datachain/data_storage/db_engine.py +12 -6
- datachain/data_storage/job.py +2 -0
- datachain/data_storage/metastore.py +716 -137
- datachain/data_storage/schema.py +20 -27
- datachain/data_storage/serializer.py +105 -15
- datachain/data_storage/sqlite.py +114 -114
- datachain/data_storage/warehouse.py +140 -48
- datachain/dataset.py +109 -89
- datachain/delta.py +117 -42
- datachain/diff/__init__.py +25 -33
- datachain/error.py +24 -0
- datachain/func/aggregate.py +9 -11
- datachain/func/array.py +12 -12
- datachain/func/base.py +7 -4
- datachain/func/conditional.py +9 -13
- datachain/func/func.py +63 -45
- datachain/func/numeric.py +5 -7
- datachain/func/string.py +2 -2
- datachain/hash_utils.py +123 -0
- datachain/job.py +11 -7
- datachain/json.py +138 -0
- datachain/lib/arrow.py +18 -15
- datachain/lib/audio.py +60 -59
- datachain/lib/clip.py +14 -13
- datachain/lib/convert/python_to_sql.py +6 -10
- datachain/lib/convert/values_to_tuples.py +151 -53
- datachain/lib/data_model.py +23 -19
- datachain/lib/dataset_info.py +7 -7
- datachain/lib/dc/__init__.py +2 -1
- datachain/lib/dc/csv.py +22 -26
- datachain/lib/dc/database.py +37 -34
- datachain/lib/dc/datachain.py +518 -324
- datachain/lib/dc/datasets.py +38 -30
- datachain/lib/dc/hf.py +16 -20
- datachain/lib/dc/json.py +17 -18
- datachain/lib/dc/listings.py +5 -8
- datachain/lib/dc/pandas.py +3 -6
- datachain/lib/dc/parquet.py +33 -21
- datachain/lib/dc/records.py +9 -13
- datachain/lib/dc/storage.py +103 -65
- datachain/lib/dc/storage_pattern.py +251 -0
- datachain/lib/dc/utils.py +17 -14
- datachain/lib/dc/values.py +3 -6
- datachain/lib/file.py +187 -50
- datachain/lib/hf.py +7 -5
- datachain/lib/image.py +13 -13
- datachain/lib/listing.py +5 -5
- datachain/lib/listing_info.py +1 -2
- datachain/lib/meta_formats.py +2 -3
- datachain/lib/model_store.py +20 -8
- datachain/lib/namespaces.py +59 -7
- datachain/lib/projects.py +51 -9
- datachain/lib/pytorch.py +31 -23
- datachain/lib/settings.py +188 -85
- datachain/lib/signal_schema.py +302 -64
- datachain/lib/text.py +8 -7
- datachain/lib/udf.py +103 -63
- datachain/lib/udf_signature.py +59 -34
- datachain/lib/utils.py +20 -0
- datachain/lib/video.py +3 -4
- datachain/lib/webdataset.py +31 -36
- datachain/lib/webdataset_laion.py +15 -16
- datachain/listing.py +12 -5
- datachain/model/bbox.py +3 -1
- datachain/namespace.py +22 -3
- datachain/node.py +6 -6
- datachain/nodes_thread_pool.py +0 -1
- datachain/plugins.py +24 -0
- datachain/project.py +4 -4
- datachain/query/batch.py +10 -12
- datachain/query/dataset.py +376 -194
- datachain/query/dispatch.py +112 -84
- datachain/query/metrics.py +3 -4
- datachain/query/params.py +2 -3
- datachain/query/queue.py +2 -1
- datachain/query/schema.py +7 -6
- datachain/query/session.py +190 -33
- datachain/query/udf.py +9 -6
- datachain/remote/studio.py +90 -53
- datachain/script_meta.py +12 -12
- datachain/sql/sqlite/base.py +37 -25
- datachain/sql/sqlite/types.py +1 -1
- datachain/sql/types.py +36 -5
- datachain/studio.py +49 -40
- datachain/toolkit/split.py +31 -10
- datachain/utils.py +39 -48
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
- datachain-0.39.0.dist-info/RECORD +173 -0
- datachain/cli/commands/query.py +0 -54
- datachain/query/utils.py +0 -36
- datachain-0.30.5.dist-info/RECORD +0 -168
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
- {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/catalog/dependency.py ADDED
@@ -0,0 +1,164 @@
+import builtins
+from dataclasses import dataclass
+from datetime import datetime
+from typing import TypeVar
+
+from datachain.dataset import DatasetDependency
+
+DDN = TypeVar("DDN", bound="DatasetDependencyNode")
+
+
+@dataclass
+class DatasetDependencyNode:
+    namespace: str
+    project: str
+    id: int
+    dataset_id: int | None
+    dataset_version_id: int | None
+    dataset_name: str | None
+    dataset_version: str | None
+    created_at: datetime
+    source_dataset_id: int
+    source_dataset_version_id: int | None
+    depth: int
+
+    @classmethod
+    def parse(
+        cls: builtins.type[DDN],
+        namespace: str,
+        project: str,
+        id: int,
+        dataset_id: int | None,
+        dataset_version_id: int | None,
+        dataset_name: str | None,
+        dataset_version: str | None,
+        created_at: datetime,
+        source_dataset_id: int,
+        source_dataset_version_id: int | None,
+        depth: int,
+    ) -> "DatasetDependencyNode | None":
+        return cls(
+            namespace,
+            project,
+            id,
+            dataset_id,
+            dataset_version_id,
+            dataset_name,
+            dataset_version,
+            created_at,
+            source_dataset_id,
+            source_dataset_version_id,
+            depth,
+        )
+
+    def to_dependency(self) -> "DatasetDependency | None":
+        return DatasetDependency.parse(
+            namespace_name=self.namespace,
+            project_name=self.project,
+            id=self.id,
+            dataset_id=self.dataset_id,
+            dataset_version_id=self.dataset_version_id,
+            dataset_name=self.dataset_name,
+            dataset_version=self.dataset_version,
+            dataset_version_created_at=self.created_at,
+        )
+
+
+def build_dependency_hierarchy(
+    dependency_nodes: list[DatasetDependencyNode | None],
+) -> tuple[
+    dict[int, DatasetDependency | None], dict[tuple[int, int | None], list[int]]
+]:
+    """
+    Build dependency hierarchy from dependency nodes.
+
+    Args:
+        dependency_nodes: List of DatasetDependencyNode objects from the database
+
+    Returns:
+        Tuple of (dependency_map, children_map) where:
+        - dependency_map: Maps dependency_id -> DatasetDependency
+        - children_map: Maps (source_dataset_id, source_version_id) ->
+            list of dependency_ids
+    """
+    dependency_map: dict[int, DatasetDependency | None] = {}
+    children_map: dict[tuple[int, int | None], list[int]] = {}
+
+    for node in dependency_nodes:
+        if node is None:
+            continue
+        dependency = node.to_dependency()
+        parent_key = (node.source_dataset_id, node.source_dataset_version_id)
+
+        if dependency is not None:
+            dependency_map[dependency.id] = dependency
+            children_map.setdefault(parent_key, []).append(dependency.id)
+        else:
+            # Handle case where dependency creation failed (e.g., deleted dependency)
+            dependency_map[node.id] = None
+            children_map.setdefault(parent_key, []).append(node.id)
+
+    return dependency_map, children_map
+
+
+def populate_nested_dependencies(
+    dependency: DatasetDependency,
+    dependency_nodes: list[DatasetDependencyNode | None],
+    dependency_map: dict[int, DatasetDependency | None],
+    children_map: dict[tuple[int, int | None], list[int]],
+) -> None:
+    """
+    Recursively populate nested dependencies for a given dependency.
+
+    Args:
+        dependency: The dependency to populate nested dependencies for
+        dependency_nodes: All dependency nodes from the database
+        dependency_map: Maps dependency_id -> DatasetDependency
+        children_map: Maps (source_dataset_id, source_version_id) ->
+            list of dependency_ids
+    """
+    # Find the target dataset and version for this dependency
+    target_dataset_id, target_version_id = find_target_dataset_version(
+        dependency, dependency_nodes
+    )
+
+    if target_dataset_id is None or target_version_id is None:
+        return
+
+    # Get children for this target
+    target_key = (target_dataset_id, target_version_id)
+    if target_key not in children_map:
+        dependency.dependencies = []
+        return
+
+    child_dependency_ids = children_map[target_key]
+    child_dependencies = [dependency_map[child_id] for child_id in child_dependency_ids]
+
+    dependency.dependencies = child_dependencies
+
+    # Recursively populate children
+    for child_dependency in child_dependencies:
+        if child_dependency is not None:
+            populate_nested_dependencies(
+                child_dependency, dependency_nodes, dependency_map, children_map
+            )
+
+
+def find_target_dataset_version(
+    dependency: DatasetDependency,
+    dependency_nodes: list[DatasetDependencyNode | None],
+) -> tuple[int | None, int | None]:
+    """
+    Find the target dataset ID and version ID for a given dependency.
+
+    Args:
+        dependency: The dependency to find target for
+        dependency_nodes: All dependency nodes from the database
+
+    Returns:
+        Tuple of (target_dataset_id, target_version_id) or (None, None) if not found
+    """
+    for node in dependency_nodes:
+        if node is not None and node.id == dependency.id:
+            return node.dataset_id, node.dataset_version_id
+    return None, None

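The two helpers above are building blocks rather than a public API. Below is a minimal sketch of how they appear intended to compose, assuming datachain 0.39.0 is installed; the `resolve_dependency_tree` wrapper, its arguments, and the `rows` input are hypothetical, and in the package the node rows would come from the metastore query that yields `DatasetDependencyNode` records.

    # Hypothetical wrapper around the helpers defined in dependency.py above.
    from datachain.catalog.dependency import (
        build_dependency_hierarchy,
        populate_nested_dependencies,
    )


    def resolve_dependency_tree(rows, root_dataset_id, root_version_id):
        # Flat node rows -> two lookup maps:
        #   dependency_map: dependency id -> DatasetDependency (or None if deleted)
        #   children_map: (source_dataset_id, source_version_id) -> child dependency ids
        dependency_map, children_map = build_dependency_hierarchy(rows)

        # Direct dependencies of the root dataset version.
        direct = [
            dependency_map[dep_id]
            for dep_id in children_map.get((root_dataset_id, root_version_id), [])
        ]

        # Recursively attach nested dependencies to every resolved child.
        for dep in direct:
            if dep is not None:
                populate_nested_dependencies(dep, rows, dependency_map, children_map)
        return direct
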
datachain/catalog/loader.py CHANGED
@@ -1,8 +1,9 @@
 import os
 import sys
 from importlib import import_module
-from typing import TYPE_CHECKING, Any
+from typing import TYPE_CHECKING, Any

+from datachain.plugins import ensure_plugins_loaded
 from datachain.utils import get_envs_by_prefix

 if TYPE_CHECKING:
@@ -24,6 +25,8 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"


 def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
+    ensure_plugins_loaded()
+
     from datachain.data_storage import AbstractMetastore
     from datachain.data_storage.serializer import deserialize

@@ -64,6 +67,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":


 def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
+    ensure_plugins_loaded()
+
     from datachain.data_storage import AbstractWarehouse
     from datachain.data_storage.serializer import deserialize

@@ -103,7 +108,7 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
     return warehouse_class(**warehouse_args)


-def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+def get_udf_distributor_class() -> type["AbstractUDFDistributor"] | None:
     if os.environ.get(DISTRIBUTED_DISABLED) == "True":
         return None

@@ -127,7 +132,7 @@ def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:


 def get_catalog(
-    client_config:
+    client_config: dict[str, Any] | None = None,
     in_memory: bool = False,
 ) -> "Catalog":
     """

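A short usage sketch based only on the signatures visible in this diff: `get_metastore()` and `get_warehouse()` now call `ensure_plugins_loaded()` before resolving the configured backend, and `get_catalog()` keeps its optional `client_config`. The explicit `ensure_plugins_loaded()` call and the `in_memory=True` flag here are illustrative assumptions, not required usage.

    from datachain.catalog import get_catalog
    from datachain.plugins import ensure_plugins_loaded

    # Also called inside get_metastore()/get_warehouse() per this diff; calling it
    # up front makes plugin-provided storage backends importable before the
    # serialized backend configuration is resolved.
    ensure_plugins_loaded()

    # In-memory catalogs are SQLite-only (see IN_MEMORY_ERROR_MESSAGE above).
    catalog = get_catalog(client_config=None, in_memory=True)
    print(type(catalog).__name__)
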
datachain/checkpoint.py ADDED
@@ -0,0 +1,43 @@
+import uuid
+from dataclasses import dataclass
+from datetime import datetime
+
+
+@dataclass
+class Checkpoint:
+    """
+    Represents a checkpoint within a job run.
+
+    A checkpoint marks a successfully completed stage of execution. In the event
+    of a failure, the job can resume from the most recent checkpoint rather than
+    starting over from the beginning.
+
+    Checkpoints can also be created in a "partial" mode, which indicates that the
+    work at this stage was only partially completed. For example, if a failure
+    occurs halfway through running a UDF, already computed results can still be
+    saved, allowing the job to resume from that partially completed state on
+    restart.
+    """
+
+    id: str
+    job_id: str
+    hash: str
+    partial: bool
+    created_at: datetime
+
+    @classmethod
+    def parse(
+        cls,
+        id: str | uuid.UUID,
+        job_id: str,
+        _hash: str,
+        partial: bool,
+        created_at: datetime,
+    ) -> "Checkpoint":
+        return cls(
+            str(id),
+            job_id,
+            _hash,
+            bool(partial),
+            created_at,
+        )

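A minimal sketch, not taken from the package, of how a `Checkpoint` record might be built from raw database values via `Checkpoint.parse`; the identifiers and hash below are made up.

    import uuid
    from datetime import datetime, timezone

    from datachain.checkpoint import Checkpoint

    row = (
        uuid.uuid4(),                # id may arrive as uuid.UUID; parse() stringifies it
        str(uuid.uuid4()),           # job_id (hypothetical)
        "9f86d081884c7d65",          # hash of the completed stage (hypothetical)
        1,                           # partial flag as stored; coerced to bool
        datetime.now(timezone.utc),  # created_at
    )

    checkpoint = Checkpoint.parse(*row)
    assert checkpoint.partial is True
    assert isinstance(checkpoint.id, str)
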
datachain/cli/__init__.py CHANGED
@@ -3,10 +3,8 @@ import os
 import sys
 import traceback
 from multiprocessing import freeze_support
-from typing import Optional

 from datachain.cli.utils import get_logging_level
-from datachain.error import DataChainError as DataChainError

 from .commands import (
     clear_cache,
@@ -17,7 +15,6 @@ from .commands import (
     index,
     list_datasets,
     ls,
-    query,
     rm_dataset,
     show,
 )
@@ -26,7 +23,7 @@ from .parser import get_parser
 logger = logging.getLogger("datachain")


-def main(argv: Optional[list[str]] = None) -> int:
+def main(argv: list[str] | None = None) -> int:
     from datachain.catalog import get_catalog

     # Required for Windows multiprocessing support
@@ -38,7 +35,7 @@ def main(argv: Optional[list[str]] = None) -> int:
     if args.command == "internal-run-udf":
         return handle_udf()
     if args.command == "internal-run-udf-worker":
-        return handle_udf_runner(
+        return handle_udf_runner()

     if args.command is None:
         datachain_parser.print_help(sys.stderr)
@@ -62,6 +59,7 @@ def main(argv: Optional[list[str]] = None) -> int:

     error = None

+    catalog = None
     try:
         catalog = get_catalog(client_config=client_config)
         return handle_command(args, catalog, client_config)
@@ -72,6 +70,11 @@ def main(argv: Optional[list[str]] = None) -> int:
         error, return_code = handle_general_exception(exc, args, logging_level)
         return return_code
     finally:
+        if catalog is not None:
+            try:
+                catalog.close()
+            except Exception:
+                logger.exception("Failed to close catalog")
         from datachain.telemetry import telemetry

         telemetry.send_cli_call(args.command, error=error)
@@ -92,7 +95,6 @@ def handle_command(args, catalog, client_config) -> int:
         "find": lambda: handle_find_command(args, catalog),
         "index": lambda: handle_index_command(args, catalog),
         "completion": lambda: handle_completion_command(args),
-        "query": lambda: handle_query_command(args, catalog),
         "clear-cache": lambda: clear_cache(catalog),
         "gc": lambda: garbage_collect(catalog),
         "auth": lambda: process_auth_cli_args(args),
@@ -261,15 +263,6 @@ def handle_completion_command(args):
     print(completion(args.shell))


-def handle_query_command(args, catalog):
-    query(
-        catalog,
-        args.script,
-        parallel=args.parallel,
-        params=args.param,
-    )
-
-
 def handle_broken_pipe_error(exc):
     # Python flushes standard streams on exit; redirect remaining output
     # to devnull to avoid another BrokenPipeError at shutdown
@@ -307,7 +300,7 @@ def handle_udf() -> int:
     return udf_entrypoint()


-def handle_udf_runner(
+def handle_udf_runner() -> int:
     from datachain.query.dispatch import udf_worker_entrypoint

-    return udf_worker_entrypoint(
+    return udf_worker_entrypoint()

datachain/cli/commands/__init__.py CHANGED
@@ -1,14 +1,8 @@
-from .datasets import
-    edit_dataset,
-    list_datasets,
-    list_datasets_local,
-    rm_dataset,
-)
+from .datasets import edit_dataset, list_datasets, list_datasets_local, rm_dataset
 from .du import du
 from .index import index
 from .ls import ls
 from .misc import clear_cache, completion, garbage_collect
-from .query import query
 from .show import show

 __all__ = [
@@ -21,7 +15,6 @@ __all__ = [
     "list_datasets",
     "list_datasets_local",
     "ls",
-    "query",
     "rm_dataset",
     "show",
 ]

datachain/cli/commands/datasets.py CHANGED
@@ -1,30 +1,41 @@
 import sys
-from
+from collections.abc import Iterable, Iterator
+from typing import TYPE_CHECKING

 from tabulate import tabulate

-
-from datachain.catalog import Catalog
-
+from datachain import semver
 from datachain.catalog import is_namespace_local
 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
 from datachain.error import DataChainError, DatasetNotFoundError
 from datachain.studio import list_datasets as list_datasets_studio

+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+
+
+def group_dataset_versions(
+    datasets: Iterable[tuple[str, str]], latest_only=True
+) -> dict[str, str | list[str]]:
+    grouped: dict[str, list[tuple[int, int, int]]] = {}

-def group_dataset_versions(datasets, latest_only=True):
-    grouped = {}
     # Sort to ensure groupby works as expected
     # (groupby expects consecutive items with the same key)
     for name, version in sorted(datasets):
-        grouped.setdefault(name, []).append(version)
+        grouped.setdefault(name, []).append(semver.parse(version))

     if latest_only:
         # For each dataset name, pick the highest version.
-        return {
+        return {
+            name: semver.create(*(max(versions))) for name, versions in grouped.items()
+        }
+
     # For each dataset name, return a sorted list of unique versions.
-    return {
+    return {
+        name: [semver.create(*v) for v in sorted(set(versions))]
+        for name, versions in grouped.items()
+    }

 def list_datasets(
@@ -32,10 +43,10 @@ def list_datasets(
     studio: bool = False,
     local: bool = False,
     all: bool = True,
-    team:
+    team: str | None = None,
     latest_only: bool = True,
-    name:
-):
+    name: str | None = None,
+) -> None:
     token = Config().read().get("studio", {}).get("token")
     all, local, studio = determine_flavors(studio, local, all, token)
     if name:
@@ -95,27 +106,31 @@
     print(tabulate(rows, headers="keys"))


-def list_datasets_local(
+def list_datasets_local(
+    catalog: "Catalog", name: str | None = None
+) -> Iterator[tuple[str, str]]:
     if name:
         yield from list_datasets_local_versions(catalog, name)
         return

     for d in catalog.ls_datasets():
         for v in d.versions:
-            yield
+            yield d.full_name, v.version


-def list_datasets_local_versions(
+def list_datasets_local_versions(
+    catalog: "Catalog", name: str
+) -> Iterator[tuple[str, str]]:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     ds = catalog.get_dataset(
         name, namespace_name=namespace_name, project_name=project_name
     )
     for v in ds.versions:
-        yield
+        yield name, v.version


-def _datasets_tabulate_row(name, both, local_version, studio_version):
+def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
     row = {
         "Name": name,
     }
@@ -132,11 +147,11 @@ def _datasets_tabulate_row(name, both, local_version, studio_version):
 def rm_dataset(
     catalog: "Catalog",
     name: str,
-    version:
-    force:
-    studio:
-    team:
-):
+    version: str | None = None,
+    force: bool | None = False,
+    studio: bool | None = False,
+    team: str | None = None,
+) -> None:
     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

     if studio:
@@ -162,11 +177,11 @@ def rm_dataset(
 def edit_dataset(
     catalog: "Catalog",
     name: str,
-    new_name:
-    description:
-    attrs:
-    team:
-):
+    new_name: str | None = None,
+    description: str | None = None,
+    attrs: list[str] | None = None,
+    team: str | None = None,
+) -> None:
     from datachain.lib.dc.utils import is_studio

     namespace_name, project_name, name = catalog.get_full_dataset_name(name)

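The new annotations above imply that `semver.parse()` returns a `(major, minor, patch)` tuple and `semver.create()` turns such a tuple back into a version string, so versions are now compared numerically rather than lexically. A small illustrative call, with made-up dataset names and inferred (not verified) output:

    from datachain.cli.commands.datasets import group_dataset_versions

    datasets = [
        ("cats", "1.0.0"),
        ("cats", "1.10.0"),
        ("cats", "1.2.0"),
        ("dogs", "2.0.1"),
    ]

    # Latest version per dataset; numeric comparison means "1.10.0" beats "1.2.0".
    print(group_dataset_versions(datasets))
    # expected shape: {"cats": "1.10.0", "dogs": "2.0.1"}

    # All unique versions per dataset, sorted numerically.
    print(group_dataset_versions(datasets, latest_only=False))
    # expected shape: {"cats": ["1.0.0", "1.2.0", "1.10.0"], "dogs": ["2.0.1"]}
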
datachain/cli/commands/ls.py CHANGED
@@ -1,13 +1,14 @@
 import shlex
 from collections.abc import Iterable, Iterator
 from itertools import chain
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    from datachain.catalog import Catalog
+from typing import TYPE_CHECKING

 from datachain.cli.utils import determine_flavors
 from datachain.config import Config
+from datachain.query.session import Session
+
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog


 def ls(
@@ -16,7 +17,7 @@ def ls(
     studio: bool = False,
     local: bool = False,
     all: bool = True,
-    team:
+    team: str | None = None,
     **kwargs,
 ):
     token = Config().read().get("studio", {}).get("token")
@@ -32,18 +33,15 @@
 def ls_local(
     sources,
     long: bool = False,
-    catalog
+    catalog=None,
     client_config=None,
     **kwargs,
 ):
     from datachain import listings

     if sources:
-
-
-
-        catalog = get_catalog(client_config=client_config)
-
+        session = Session.get(catalog=catalog, client_config=client_config)
+        catalog = session.catalog
         actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
         if len(actual_sources) == 1:
             for _, entries in actual_sources:
@@ -78,7 +76,7 @@ def format_ls_entry(entry: str) -> str:
 def ls_remote(
     paths: Iterable[str],
     long: bool = False,
-    team:
+    team: str | None = None,
 ):
     from datachain.node import long_line_str
     from datachain.remote.studio import StudioClient
@@ -145,7 +143,7 @@ def _ls_urls_flat(
     long: bool,
     catalog: "Catalog",
     **kwargs,
-) -> Iterator[tuple[str,
+) -> Iterator[tuple[str, Iterable[str]]]:
     from datachain.client import Client
     from datachain.node import long_line_str

@@ -154,7 +152,7 @@ def _ls_urls_flat(
     if client_cls.is_root_url(source):
         buckets = client_cls.ls_buckets(**catalog.client_config)
         if long:
-            values = (
+            values: Iterable[str] = (
+                long_line_str(b.name, b.created) for b in buckets
+            )
         else:
             values = (b.name for b in buckets)
         yield source, values
@@ -164,7 +164,7 @@
         if long:
             fields.append("last_modified")
         for data_source, results in catalog.ls([source], fields=fields, **kwargs):
-            values =
+            values = [_node_data_to_ls_values(r, long) for r in results]
             found = True
             yield data_source.dirname(), values
         if not found:

datachain/cli/commands/show.py CHANGED
@@ -1,5 +1,5 @@
 from collections.abc import Sequence
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING

 from datachain.lib.signal_schema import SignalSchema

@@ -10,7 +10,7 @@ if TYPE_CHECKING:
 def show(
     catalog: "Catalog",
     name: str,
-    version:
+    version: str | None = None,
     limit: int = 10,
     offset: int = 0,
     columns: Sequence[str] = (),

datachain/cli/parser/__init__.py CHANGED
@@ -3,7 +3,7 @@ from importlib.metadata import PackageNotFoundError, version

 import shtab

-from datachain.cli.utils import BooleanOptionalAction
+from datachain.cli.utils import BooleanOptionalAction

 from .job import add_jobs_parser
 from .studio import add_auth_parser
@@ -16,9 +16,7 @@ from .utils import (
     add_update_arg,
     find_columns_type,
 )
-from .utils import
-    CustomArgumentParser as ArgumentParser,
-)
+from .utils import CustomArgumentParser as ArgumentParser


 def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -467,37 +465,6 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     show_parser.add_argument("--schema", action="store_true", help="Show schema")
     add_show_args(show_parser)

-    query_parser = subp.add_parser(
-        "query",
-        parents=[parent_parser],
-        description="Create a new dataset with a query script.",
-        formatter_class=CustomHelpFormatter,
-    )
-    add_anon_arg(query_parser)
-    query_parser.add_argument(
-        "script", metavar="<script.py>", type=str, help="Filepath for script"
-    )
-    query_parser.add_argument(
-        "--parallel",
-        nargs="?",
-        type=int,
-        const=-1,
-        default=None,
-        metavar="N",
-        help=(
-            "Use multiprocessing to run any query script UDFs with N worker processes. "
-            "N defaults to the CPU count"
-        ),
-    )
-    query_parser.add_argument(
-        "-p",
-        "--param",
-        metavar="param=value",
-        nargs=1,
-        action=KeyValueArgs,
-        help="Query parameters",
-    )
-
     parse_clear_cache = subp.add_parser(
         "clear-cache",
         parents=[parent_parser],
@@ -515,14 +482,7 @@ def get_parser() -> ArgumentParser:  # noqa: PLR0915
     add_anon_arg(parse_gc)

     subp.add_parser("internal-run-udf", parents=[parent_parser])
-
-    run_udf_worker.add_argument(
-        "--fd",
-        type=int,
-        action="store",
-        default=None,
-        help="File descriptor to write results to",
-    )
+    subp.add_parser("internal-run-udf-worker", parents=[parent_parser])

     add_completion_parser(subp, [parent_parser])
     return parser

datachain/cli/parser/job.py CHANGED
@@ -83,7 +83,7 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
     studio_run_parser.add_argument(
         "--python-version",
         action="store",
-        help="Python version for the job (e.g., 3.
+        help="Python version for the job (e.g., 3.10, 3.11, 3.12, 3.13)",
     )
     studio_run_parser.add_argument(
         "--repository",