datachain 0.30.5__py3-none-any.whl → 0.39.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (119)
  1. datachain/__init__.py +4 -0
  2. datachain/asyn.py +11 -12
  3. datachain/cache.py +5 -5
  4. datachain/catalog/__init__.py +0 -2
  5. datachain/catalog/catalog.py +276 -354
  6. datachain/catalog/dependency.py +164 -0
  7. datachain/catalog/loader.py +8 -3
  8. datachain/checkpoint.py +43 -0
  9. datachain/cli/__init__.py +10 -17
  10. datachain/cli/commands/__init__.py +1 -8
  11. datachain/cli/commands/datasets.py +42 -27
  12. datachain/cli/commands/ls.py +15 -15
  13. datachain/cli/commands/show.py +2 -2
  14. datachain/cli/parser/__init__.py +3 -43
  15. datachain/cli/parser/job.py +1 -1
  16. datachain/cli/parser/utils.py +1 -2
  17. datachain/cli/utils.py +2 -15
  18. datachain/client/azure.py +2 -2
  19. datachain/client/fsspec.py +34 -23
  20. datachain/client/gcs.py +3 -3
  21. datachain/client/http.py +157 -0
  22. datachain/client/local.py +11 -7
  23. datachain/client/s3.py +3 -3
  24. datachain/config.py +4 -8
  25. datachain/data_storage/db_engine.py +12 -6
  26. datachain/data_storage/job.py +2 -0
  27. datachain/data_storage/metastore.py +716 -137
  28. datachain/data_storage/schema.py +20 -27
  29. datachain/data_storage/serializer.py +105 -15
  30. datachain/data_storage/sqlite.py +114 -114
  31. datachain/data_storage/warehouse.py +140 -48
  32. datachain/dataset.py +109 -89
  33. datachain/delta.py +117 -42
  34. datachain/diff/__init__.py +25 -33
  35. datachain/error.py +24 -0
  36. datachain/func/aggregate.py +9 -11
  37. datachain/func/array.py +12 -12
  38. datachain/func/base.py +7 -4
  39. datachain/func/conditional.py +9 -13
  40. datachain/func/func.py +63 -45
  41. datachain/func/numeric.py +5 -7
  42. datachain/func/string.py +2 -2
  43. datachain/hash_utils.py +123 -0
  44. datachain/job.py +11 -7
  45. datachain/json.py +138 -0
  46. datachain/lib/arrow.py +18 -15
  47. datachain/lib/audio.py +60 -59
  48. datachain/lib/clip.py +14 -13
  49. datachain/lib/convert/python_to_sql.py +6 -10
  50. datachain/lib/convert/values_to_tuples.py +151 -53
  51. datachain/lib/data_model.py +23 -19
  52. datachain/lib/dataset_info.py +7 -7
  53. datachain/lib/dc/__init__.py +2 -1
  54. datachain/lib/dc/csv.py +22 -26
  55. datachain/lib/dc/database.py +37 -34
  56. datachain/lib/dc/datachain.py +518 -324
  57. datachain/lib/dc/datasets.py +38 -30
  58. datachain/lib/dc/hf.py +16 -20
  59. datachain/lib/dc/json.py +17 -18
  60. datachain/lib/dc/listings.py +5 -8
  61. datachain/lib/dc/pandas.py +3 -6
  62. datachain/lib/dc/parquet.py +33 -21
  63. datachain/lib/dc/records.py +9 -13
  64. datachain/lib/dc/storage.py +103 -65
  65. datachain/lib/dc/storage_pattern.py +251 -0
  66. datachain/lib/dc/utils.py +17 -14
  67. datachain/lib/dc/values.py +3 -6
  68. datachain/lib/file.py +187 -50
  69. datachain/lib/hf.py +7 -5
  70. datachain/lib/image.py +13 -13
  71. datachain/lib/listing.py +5 -5
  72. datachain/lib/listing_info.py +1 -2
  73. datachain/lib/meta_formats.py +2 -3
  74. datachain/lib/model_store.py +20 -8
  75. datachain/lib/namespaces.py +59 -7
  76. datachain/lib/projects.py +51 -9
  77. datachain/lib/pytorch.py +31 -23
  78. datachain/lib/settings.py +188 -85
  79. datachain/lib/signal_schema.py +302 -64
  80. datachain/lib/text.py +8 -7
  81. datachain/lib/udf.py +103 -63
  82. datachain/lib/udf_signature.py +59 -34
  83. datachain/lib/utils.py +20 -0
  84. datachain/lib/video.py +3 -4
  85. datachain/lib/webdataset.py +31 -36
  86. datachain/lib/webdataset_laion.py +15 -16
  87. datachain/listing.py +12 -5
  88. datachain/model/bbox.py +3 -1
  89. datachain/namespace.py +22 -3
  90. datachain/node.py +6 -6
  91. datachain/nodes_thread_pool.py +0 -1
  92. datachain/plugins.py +24 -0
  93. datachain/project.py +4 -4
  94. datachain/query/batch.py +10 -12
  95. datachain/query/dataset.py +376 -194
  96. datachain/query/dispatch.py +112 -84
  97. datachain/query/metrics.py +3 -4
  98. datachain/query/params.py +2 -3
  99. datachain/query/queue.py +2 -1
  100. datachain/query/schema.py +7 -6
  101. datachain/query/session.py +190 -33
  102. datachain/query/udf.py +9 -6
  103. datachain/remote/studio.py +90 -53
  104. datachain/script_meta.py +12 -12
  105. datachain/sql/sqlite/base.py +37 -25
  106. datachain/sql/sqlite/types.py +1 -1
  107. datachain/sql/types.py +36 -5
  108. datachain/studio.py +49 -40
  109. datachain/toolkit/split.py +31 -10
  110. datachain/utils.py +39 -48
  111. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/METADATA +26 -38
  112. datachain-0.39.0.dist-info/RECORD +173 -0
  113. datachain/cli/commands/query.py +0 -54
  114. datachain/query/utils.py +0 -36
  115. datachain-0.30.5.dist-info/RECORD +0 -168
  116. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/WHEEL +0 -0
  117. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/entry_points.txt +0 -0
  118. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/licenses/LICENSE +0 -0
  119. {datachain-0.30.5.dist-info → datachain-0.39.0.dist-info}/top_level.txt +0 -0
datachain/catalog/dependency.py ADDED
@@ -0,0 +1,164 @@
+ import builtins
+ from dataclasses import dataclass
+ from datetime import datetime
+ from typing import TypeVar
+
+ from datachain.dataset import DatasetDependency
+
+ DDN = TypeVar("DDN", bound="DatasetDependencyNode")
+
+
+ @dataclass
+ class DatasetDependencyNode:
+     namespace: str
+     project: str
+     id: int
+     dataset_id: int | None
+     dataset_version_id: int | None
+     dataset_name: str | None
+     dataset_version: str | None
+     created_at: datetime
+     source_dataset_id: int
+     source_dataset_version_id: int | None
+     depth: int
+
+     @classmethod
+     def parse(
+         cls: builtins.type[DDN],
+         namespace: str,
+         project: str,
+         id: int,
+         dataset_id: int | None,
+         dataset_version_id: int | None,
+         dataset_name: str | None,
+         dataset_version: str | None,
+         created_at: datetime,
+         source_dataset_id: int,
+         source_dataset_version_id: int | None,
+         depth: int,
+     ) -> "DatasetDependencyNode | None":
+         return cls(
+             namespace,
+             project,
+             id,
+             dataset_id,
+             dataset_version_id,
+             dataset_name,
+             dataset_version,
+             created_at,
+             source_dataset_id,
+             source_dataset_version_id,
+             depth,
+         )
+
+     def to_dependency(self) -> "DatasetDependency | None":
+         return DatasetDependency.parse(
+             namespace_name=self.namespace,
+             project_name=self.project,
+             id=self.id,
+             dataset_id=self.dataset_id,
+             dataset_version_id=self.dataset_version_id,
+             dataset_name=self.dataset_name,
+             dataset_version=self.dataset_version,
+             dataset_version_created_at=self.created_at,
+         )
+
+
+ def build_dependency_hierarchy(
+     dependency_nodes: list[DatasetDependencyNode | None],
+ ) -> tuple[
+     dict[int, DatasetDependency | None], dict[tuple[int, int | None], list[int]]
+ ]:
+     """
+     Build dependency hierarchy from dependency nodes.
+
+     Args:
+         dependency_nodes: List of DatasetDependencyNode objects from the database
+
+     Returns:
+         Tuple of (dependency_map, children_map) where:
+         - dependency_map: Maps dependency_id -> DatasetDependency
+         - children_map: Maps (source_dataset_id, source_version_id) ->
+             list of dependency_ids
+     """
+     dependency_map: dict[int, DatasetDependency | None] = {}
+     children_map: dict[tuple[int, int | None], list[int]] = {}
+
+     for node in dependency_nodes:
+         if node is None:
+             continue
+         dependency = node.to_dependency()
+         parent_key = (node.source_dataset_id, node.source_dataset_version_id)
+
+         if dependency is not None:
+             dependency_map[dependency.id] = dependency
+             children_map.setdefault(parent_key, []).append(dependency.id)
+         else:
+             # Handle case where dependency creation failed (e.g., deleted dependency)
+             dependency_map[node.id] = None
+             children_map.setdefault(parent_key, []).append(node.id)
+
+     return dependency_map, children_map
+
+
+ def populate_nested_dependencies(
+     dependency: DatasetDependency,
+     dependency_nodes: list[DatasetDependencyNode | None],
+     dependency_map: dict[int, DatasetDependency | None],
+     children_map: dict[tuple[int, int | None], list[int]],
+ ) -> None:
+     """
+     Recursively populate nested dependencies for a given dependency.
+
+     Args:
+         dependency: The dependency to populate nested dependencies for
+         dependency_nodes: All dependency nodes from the database
+         dependency_map: Maps dependency_id -> DatasetDependency
+         children_map: Maps (source_dataset_id, source_version_id) ->
+             list of dependency_ids
+     """
+     # Find the target dataset and version for this dependency
+     target_dataset_id, target_version_id = find_target_dataset_version(
+         dependency, dependency_nodes
+     )
+
+     if target_dataset_id is None or target_version_id is None:
+         return
+
+     # Get children for this target
+     target_key = (target_dataset_id, target_version_id)
+     if target_key not in children_map:
+         dependency.dependencies = []
+         return
+
+     child_dependency_ids = children_map[target_key]
+     child_dependencies = [dependency_map[child_id] for child_id in child_dependency_ids]
+
+     dependency.dependencies = child_dependencies
+
+     # Recursively populate children
+     for child_dependency in child_dependencies:
+         if child_dependency is not None:
+             populate_nested_dependencies(
+                 child_dependency, dependency_nodes, dependency_map, children_map
+             )
+
+
+ def find_target_dataset_version(
+     dependency: DatasetDependency,
+     dependency_nodes: list[DatasetDependencyNode | None],
+ ) -> tuple[int | None, int | None]:
+     """
+     Find the target dataset ID and version ID for a given dependency.
+
+     Args:
+         dependency: The dependency to find target for
+         dependency_nodes: All dependency nodes from the database
+
+     Returns:
+         Tuple of (target_dataset_id, target_version_id) or (None, None) if not found
+     """
+     for node in dependency_nodes:
+         if node is not None and node.id == dependency.id:
+             return node.dataset_id, node.dataset_version_id
+     return None, None
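The three helpers above are meant to be composed: build the lookup maps once from the flat rows, then expand each top-level dependency in place. A minimal sketch of that flow, assuming `nodes` is the flat list of DatasetDependencyNode rows already fetched from the metastore and `roots` are the direct dependencies of the dataset being inspected (both names, and the `expand_dependency_tree` wrapper, are hypothetical, not part of the package):

from datachain.catalog.dependency import (
    build_dependency_hierarchy,
    populate_nested_dependencies,
)


def expand_dependency_tree(nodes, roots):
    # Index the flat rows once: dependency id -> DatasetDependency,
    # (source dataset id, source version id) -> child dependency ids.
    dependency_map, children_map = build_dependency_hierarchy(nodes)
    for dep in roots:
        if dep is not None:
            # Recursively fills dep.dependencies from the two lookup maps.
            populate_nested_dependencies(dep, nodes, dependency_map, children_map)
    return roots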
datachain/catalog/loader.py CHANGED
@@ -1,8 +1,9 @@
  import os
  import sys
  from importlib import import_module
- from typing import TYPE_CHECKING, Any, Optional
+ from typing import TYPE_CHECKING, Any

+ from datachain.plugins import ensure_plugins_loaded
  from datachain.utils import get_envs_by_prefix

  if TYPE_CHECKING:
@@ -24,6 +25,8 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"


  def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
+     ensure_plugins_loaded()
+
      from datachain.data_storage import AbstractMetastore
      from datachain.data_storage.serializer import deserialize

@@ -64,6 +67,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":


  def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
+     ensure_plugins_loaded()
+
      from datachain.data_storage import AbstractWarehouse
      from datachain.data_storage.serializer import deserialize

@@ -103,7 +108,7 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
      return warehouse_class(**warehouse_args)


- def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:
+ def get_udf_distributor_class() -> type["AbstractUDFDistributor"] | None:
      if os.environ.get(DISTRIBUTED_DISABLED) == "True":
          return None

@@ -127,7 +132,7 @@ def get_udf_distributor_class() -> Optional[type["AbstractUDFDistributor"]]:


  def get_catalog(
-     client_config: Optional[dict[str, Any]] = None,
+     client_config: dict[str, Any] | None = None,
      in_memory: bool = False,
  ) -> "Catalog":
      """
datachain/checkpoint.py ADDED
@@ -0,0 +1,43 @@
+ import uuid
+ from dataclasses import dataclass
+ from datetime import datetime
+
+
+ @dataclass
+ class Checkpoint:
+     """
+     Represents a checkpoint within a job run.
+
+     A checkpoint marks a successfully completed stage of execution. In the event
+     of a failure, the job can resume from the most recent checkpoint rather than
+     starting over from the beginning.
+
+     Checkpoints can also be created in a "partial" mode, which indicates that the
+     work at this stage was only partially completed. For example, if a failure
+     occurs halfway through running a UDF, already computed results can still be
+     saved, allowing the job to resume from that partially completed state on
+     restart.
+     """
+
+     id: str
+     job_id: str
+     hash: str
+     partial: bool
+     created_at: datetime
+
+     @classmethod
+     def parse(
+         cls,
+         id: str | uuid.UUID,
+         job_id: str,
+         _hash: str,
+         partial: bool,
+         created_at: datetime,
+     ) -> "Checkpoint":
+         return cls(
+             str(id),
+             job_id,
+             _hash,
+             bool(partial),
+             created_at,
+         )
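For illustration, `Checkpoint.parse` above normalizes a raw metastore row into the dataclass: the UUID is coerced to `str` and the `partial` flag (often stored as an integer) to `bool`. The row values below are invented for the example:

import uuid
from datetime import datetime, timezone

from datachain.checkpoint import Checkpoint

# Hypothetical row as it might come back from the database.
row = (uuid.uuid4(), "job-123", "a1b2c3d4", 0, datetime.now(timezone.utc))
ckpt = Checkpoint.parse(*row)
assert isinstance(ckpt.id, str) and ckpt.partial is False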
datachain/cli/__init__.py CHANGED
@@ -3,10 +3,8 @@ import os
  import sys
  import traceback
  from multiprocessing import freeze_support
- from typing import Optional

  from datachain.cli.utils import get_logging_level
- from datachain.error import DataChainError as DataChainError

  from .commands import (
      clear_cache,
@@ -17,7 +15,6 @@ from .commands import (
      index,
      list_datasets,
      ls,
-     query,
      rm_dataset,
      show,
  )
@@ -26,7 +23,7 @@ from .parser import get_parser

  logger = logging.getLogger("datachain")

- def main(argv: Optional[list[str]] = None) -> int:
+ def main(argv: list[str] | None = None) -> int:
      from datachain.catalog import get_catalog

      # Required for Windows multiprocessing support
@@ -38,7 +35,7 @@ def main(argv: Optional[list[str]] = None) -> int:
      if args.command == "internal-run-udf":
          return handle_udf()
      if args.command == "internal-run-udf-worker":
-         return handle_udf_runner(args.fd)
+         return handle_udf_runner()

      if args.command is None:
          datachain_parser.print_help(sys.stderr)
@@ -62,6 +59,7 @@ def main(argv: Optional[list[str]] = None) -> int:

      error = None

+     catalog = None
      try:
          catalog = get_catalog(client_config=client_config)
          return handle_command(args, catalog, client_config)
@@ -72,6 +70,11 @@ def main(argv: Optional[list[str]] = None) -> int:
          error, return_code = handle_general_exception(exc, args, logging_level)
          return return_code
      finally:
+         if catalog is not None:
+             try:
+                 catalog.close()
+             except Exception:
+                 logger.exception("Failed to close catalog")
          from datachain.telemetry import telemetry

          telemetry.send_cli_call(args.command, error=error)
@@ -92,7 +95,6 @@ def handle_command(args, catalog, client_config) -> int:
          "find": lambda: handle_find_command(args, catalog),
          "index": lambda: handle_index_command(args, catalog),
          "completion": lambda: handle_completion_command(args),
-         "query": lambda: handle_query_command(args, catalog),
          "clear-cache": lambda: clear_cache(catalog),
          "gc": lambda: garbage_collect(catalog),
          "auth": lambda: process_auth_cli_args(args),
@@ -261,15 +263,6 @@ def handle_completion_command(args):
      print(completion(args.shell))


- def handle_query_command(args, catalog):
-     query(
-         catalog,
-         args.script,
-         parallel=args.parallel,
-         params=args.param,
-     )
-
-
  def handle_broken_pipe_error(exc):
      # Python flushes standard streams on exit; redirect remaining output
      # to devnull to avoid another BrokenPipeError at shutdown
@@ -307,7 +300,7 @@ def handle_udf() -> int:
      return udf_entrypoint()


- def handle_udf_runner(fd: Optional[int] = None) -> int:
+ def handle_udf_runner() -> int:
      from datachain.query.dispatch import udf_worker_entrypoint

-     return udf_worker_entrypoint(fd)
+     return udf_worker_entrypoint()
datachain/cli/commands/__init__.py CHANGED
@@ -1,14 +1,8 @@
- from .datasets import (
-     edit_dataset,
-     list_datasets,
-     list_datasets_local,
-     rm_dataset,
- )
+ from .datasets import edit_dataset, list_datasets, list_datasets_local, rm_dataset
  from .du import du
  from .index import index
  from .ls import ls
  from .misc import clear_cache, completion, garbage_collect
- from .query import query
  from .show import show

  __all__ = [
@@ -21,7 +15,6 @@ __all__ = [
      "list_datasets",
      "list_datasets_local",
      "ls",
-     "query",
      "rm_dataset",
      "show",
  ]
datachain/cli/commands/datasets.py CHANGED
@@ -1,30 +1,41 @@
  import sys
- from typing import TYPE_CHECKING, Optional
+ from collections.abc import Iterable, Iterator
+ from typing import TYPE_CHECKING

  from tabulate import tabulate

- if TYPE_CHECKING:
-     from datachain.catalog import Catalog
-
+ from datachain import semver
  from datachain.catalog import is_namespace_local
  from datachain.cli.utils import determine_flavors
  from datachain.config import Config
  from datachain.error import DataChainError, DatasetNotFoundError
  from datachain.studio import list_datasets as list_datasets_studio

+ if TYPE_CHECKING:
+     from datachain.catalog import Catalog
+
+
+ def group_dataset_versions(
+     datasets: Iterable[tuple[str, str]], latest_only=True
+ ) -> dict[str, str | list[str]]:
+     grouped: dict[str, list[tuple[int, int, int]]] = {}

- def group_dataset_versions(datasets, latest_only=True):
-     grouped = {}
      # Sort to ensure groupby works as expected
      # (groupby expects consecutive items with the same key)
      for name, version in sorted(datasets):
-         grouped.setdefault(name, []).append(version)
+         grouped.setdefault(name, []).append(semver.parse(version))

      if latest_only:
          # For each dataset name, pick the highest version.
-         return {name: max(versions) for name, versions in grouped.items()}
+         return {
+             name: semver.create(*(max(versions))) for name, versions in grouped.items()
+         }
+
      # For each dataset name, return a sorted list of unique versions.
-     return {name: sorted(set(versions)) for name, versions in grouped.items()}
+     return {
+         name: [semver.create(*v) for v in sorted(set(versions))]
+         for name, versions in grouped.items()
+     }


  def list_datasets(
@@ -32,10 +43,10 @@ def list_datasets(
      studio: bool = False,
      local: bool = False,
      all: bool = True,
-     team: Optional[str] = None,
+     team: str | None = None,
      latest_only: bool = True,
-     name: Optional[str] = None,
- ):
+     name: str | None = None,
+ ) -> None:
      token = Config().read().get("studio", {}).get("token")
      all, local, studio = determine_flavors(studio, local, all, token)
      if name:
@@ -95,27 +106,31 @@ def list_datasets(
      print(tabulate(rows, headers="keys"))


- def list_datasets_local(catalog: "Catalog", name: Optional[str] = None):
+ def list_datasets_local(
+     catalog: "Catalog", name: str | None = None
+ ) -> Iterator[tuple[str, str]]:
      if name:
          yield from list_datasets_local_versions(catalog, name)
          return

      for d in catalog.ls_datasets():
          for v in d.versions:
-             yield (d.full_name, v.version)
+             yield d.full_name, v.version


- def list_datasets_local_versions(catalog: "Catalog", name: str):
+ def list_datasets_local_versions(
+     catalog: "Catalog", name: str
+ ) -> Iterator[tuple[str, str]]:
      namespace_name, project_name, name = catalog.get_full_dataset_name(name)

      ds = catalog.get_dataset(
          name, namespace_name=namespace_name, project_name=project_name
      )
      for v in ds.versions:
-         yield (name, v.version)
+         yield name, v.version


- def _datasets_tabulate_row(name, both, local_version, studio_version):
+ def _datasets_tabulate_row(name, both, local_version, studio_version) -> dict[str, str]:
      row = {
          "Name": name,
      }
@@ -132,11 +147,11 @@ def _datasets_tabulate_row(name, both, local_version, studio_version):
  def rm_dataset(
      catalog: "Catalog",
      name: str,
-     version: Optional[str] = None,
-     force: Optional[bool] = False,
-     studio: Optional[bool] = False,
-     team: Optional[str] = None,
- ):
+     version: str | None = None,
+     force: bool | None = False,
+     studio: bool | None = False,
+     team: str | None = None,
+ ) -> None:
      namespace_name, project_name, name = catalog.get_full_dataset_name(name)

      if studio:
@@ -162,11 +177,11 @@ def rm_dataset(
  def edit_dataset(
      catalog: "Catalog",
      name: str,
-     new_name: Optional[str] = None,
-     description: Optional[str] = None,
-     attrs: Optional[list[str]] = None,
-     team: Optional[str] = None,
- ):
+     new_name: str | None = None,
+     description: str | None = None,
+     attrs: list[str] | None = None,
+     team: str | None = None,
+ ) -> None:
      from datachain.lib.dc.utils import is_studio

      namespace_name, project_name, name = catalog.get_full_dataset_name(name)
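The `semver.parse` / `semver.create` round-trip in `group_dataset_versions` above makes version comparison numeric rather than lexicographic, so "10.0.0" now outranks "9.0.0". A small usage sketch with invented input pairs (it assumes `semver.create` returns the dotted version string, which matches the new return annotation):

from datachain.cli.commands.datasets import group_dataset_versions

pairs = [("dogs", "9.0.0"), ("dogs", "10.0.0"), ("cats", "1.2.3")]

# Latest version per dataset; numeric comparison picks 10.0.0 over 9.0.0.
assert group_dataset_versions(pairs) == {"cats": "1.2.3", "dogs": "10.0.0"}

# All unique versions per dataset, in ascending order.
assert group_dataset_versions(pairs, latest_only=False) == {
    "cats": ["1.2.3"],
    "dogs": ["9.0.0", "10.0.0"],
}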
datachain/cli/commands/ls.py CHANGED
@@ -1,13 +1,14 @@
  import shlex
  from collections.abc import Iterable, Iterator
  from itertools import chain
- from typing import TYPE_CHECKING, Optional
-
- if TYPE_CHECKING:
-     from datachain.catalog import Catalog
+ from typing import TYPE_CHECKING

  from datachain.cli.utils import determine_flavors
  from datachain.config import Config
+ from datachain.query.session import Session
+
+ if TYPE_CHECKING:
+     from datachain.catalog import Catalog


  def ls(
@@ -16,7 +17,7 @@ def ls(
      studio: bool = False,
      local: bool = False,
      all: bool = True,
-     team: Optional[str] = None,
+     team: str | None = None,
      **kwargs,
  ):
      token = Config().read().get("studio", {}).get("token")
@@ -32,18 +33,15 @@ def ls(
  def ls_local(
      sources,
      long: bool = False,
-     catalog: Optional["Catalog"] = None,
+     catalog=None,
      client_config=None,
      **kwargs,
  ):
      from datachain import listings

      if sources:
-         if catalog is None:
-             from datachain.catalog import get_catalog
-
-             catalog = get_catalog(client_config=client_config)
-
+         session = Session.get(catalog=catalog, client_config=client_config)
+         catalog = session.catalog
          actual_sources = list(ls_urls(sources, catalog=catalog, long=long, **kwargs))
          if len(actual_sources) == 1:
              for _, entries in actual_sources:
@@ -78,7 +76,7 @@ def format_ls_entry(entry: str) -> str:
  def ls_remote(
      paths: Iterable[str],
      long: bool = False,
-     team: Optional[str] = None,
+     team: str | None = None,
  ):
      from datachain.node import long_line_str
      from datachain.remote.studio import StudioClient
@@ -145,7 +143,7 @@ def _ls_urls_flat(
      long: bool,
      catalog: "Catalog",
      **kwargs,
- ) -> Iterator[tuple[str, Iterator[str]]]:
+ ) -> Iterator[tuple[str, Iterable[str]]]:
      from datachain.client import Client
      from datachain.node import long_line_str

@@ -154,7 +152,9 @@ def _ls_urls_flat(
          if client_cls.is_root_url(source):
              buckets = client_cls.ls_buckets(**catalog.client_config)
              if long:
-                 values = (long_line_str(b.name, b.created) for b in buckets)
+                 values: Iterable[str] = (
+                     long_line_str(b.name, b.created) for b in buckets
+                 )
              else:
                  values = (b.name for b in buckets)
              yield source, values
@@ -164,7 +164,7 @@ def _ls_urls_flat(
              if long:
                  fields.append("last_modified")
              for data_source, results in catalog.ls([source], fields=fields, **kwargs):
-                 values = (_node_data_to_ls_values(r, long) for r in results)
+                 values = [_node_data_to_ls_values(r, long) for r in results]
                  found = True
                  yield data_source.dirname(), values
      if not found:
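With the change above, `ls_local` obtains its catalog through `Session.get(...)` instead of constructing one per call, so CLI invocations in the same process reuse the session-managed catalog. A minimal sketch of the same pattern (the `client_config` value is only an example):

from datachain.query.session import Session

session = Session.get(client_config={"anon": True})  # example config: anonymous access
catalog = session.catalog  # the same catalog the listing helpers will use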
datachain/cli/commands/show.py CHANGED
@@ -1,5 +1,5 @@
  from collections.abc import Sequence
- from typing import TYPE_CHECKING, Optional
+ from typing import TYPE_CHECKING

  from datachain.lib.signal_schema import SignalSchema

@@ -10,7 +10,7 @@ if TYPE_CHECKING:
  def show(
      catalog: "Catalog",
      name: str,
-     version: Optional[str] = None,
+     version: str | None = None,
      limit: int = 10,
      offset: int = 0,
      columns: Sequence[str] = (),
datachain/cli/parser/__init__.py CHANGED
@@ -3,7 +3,7 @@ from importlib.metadata import PackageNotFoundError, version

  import shtab

- from datachain.cli.utils import BooleanOptionalAction, KeyValueArgs
+ from datachain.cli.utils import BooleanOptionalAction

  from .job import add_jobs_parser
  from .studio import add_auth_parser
@@ -16,9 +16,7 @@ from .utils import (
      add_update_arg,
      find_columns_type,
  )
- from .utils import (
-     CustomArgumentParser as ArgumentParser,
- )
+ from .utils import CustomArgumentParser as ArgumentParser


  def get_parser() -> ArgumentParser:  # noqa: PLR0915
@@ -467,37 +465,6 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
      show_parser.add_argument("--schema", action="store_true", help="Show schema")
      add_show_args(show_parser)

-     query_parser = subp.add_parser(
-         "query",
-         parents=[parent_parser],
-         description="Create a new dataset with a query script.",
-         formatter_class=CustomHelpFormatter,
-     )
-     add_anon_arg(query_parser)
-     query_parser.add_argument(
-         "script", metavar="<script.py>", type=str, help="Filepath for script"
-     )
-     query_parser.add_argument(
-         "--parallel",
-         nargs="?",
-         type=int,
-         const=-1,
-         default=None,
-         metavar="N",
-         help=(
-             "Use multiprocessing to run any query script UDFs with N worker processes. "
-             "N defaults to the CPU count"
-         ),
-     )
-     query_parser.add_argument(
-         "-p",
-         "--param",
-         metavar="param=value",
-         nargs=1,
-         action=KeyValueArgs,
-         help="Query parameters",
-     )
-
      parse_clear_cache = subp.add_parser(
          "clear-cache",
          parents=[parent_parser],
@@ -515,14 +482,7 @@ def get_parser() -> ArgumentParser: # noqa: PLR0915
      add_anon_arg(parse_gc)

      subp.add_parser("internal-run-udf", parents=[parent_parser])
-     run_udf_worker = subp.add_parser("internal-run-udf-worker", parents=[parent_parser])
-     run_udf_worker.add_argument(
-         "--fd",
-         type=int,
-         action="store",
-         default=None,
-         help="File descriptor to write results to",
-     )
+     subp.add_parser("internal-run-udf-worker", parents=[parent_parser])

      add_completion_parser(subp, [parent_parser])
      return parser
datachain/cli/parser/job.py CHANGED
@@ -83,7 +83,7 @@ def add_jobs_parser(subparsers, parent_parser) -> None:
      studio_run_parser.add_argument(
          "--python-version",
          action="store",
-         help="Python version for the job (e.g., 3.9, 3.10, 3.11)",
+         help="Python version for the job (e.g., 3.10, 3.11, 3.12, 3.13)",
      )
      studio_run_parser.add_argument(
          "--repository",