datachain 0.11.11__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.


@@ -25,7 +25,6 @@ from typing import (
 )
 from uuid import uuid4
 
-import requests
 import sqlalchemy as sa
 from sqlalchemy import Column
 from tqdm.auto import tqdm
@@ -54,7 +53,6 @@ from datachain.error import (
 from datachain.lib.listing import get_listing
 from datachain.node import DirType, Node, NodeWithPath
 from datachain.nodes_thread_pool import NodesThreadPool
-from datachain.remote.studio import StudioClient
 from datachain.sql.types import DateTime, SQLType
 from datachain.utils import DataChainDir
 
@@ -162,6 +160,8 @@ class DatasetRowsFetcher(NodesThreadPool):
         max_threads: int = PULL_DATASET_MAX_THREADS,
         progress_bar=None,
     ):
+        from datachain.remote.studio import StudioClient
+
         super().__init__(max_threads)
         self._check_dependencies()
         self.metastore = metastore
@@ -234,6 +234,8 @@ class DatasetRowsFetcher(NodesThreadPool):
         return df.drop("sys__id", axis=1)
 
     def get_parquet_content(self, url: str):
+        import requests
+
         while True:
             if self.should_check_for_status():
                 self.check_for_status()
@@ -1130,6 +1132,8 @@ class Catalog:
         raise DatasetNotFoundError(f"Dataset with version uuid {uuid} not found.")
 
     def get_remote_dataset(self, name: str) -> DatasetRecord:
+        from datachain.remote.studio import StudioClient
+
         studio_client = StudioClient()
 
         info_response = studio_client.dataset_info(name)
@@ -1164,8 +1168,27 @@ class Catalog:
 
         return direct_dependencies
 
-    def ls_datasets(self, include_listing: bool = False) -> Iterator[DatasetListRecord]:
-        datasets = self.metastore.list_datasets()
+    def ls_datasets(
+        self, include_listing: bool = False, studio: bool = False
+    ) -> Iterator[DatasetListRecord]:
+        from datachain.remote.studio import StudioClient
+
+        if studio:
+            client = StudioClient()
+            response = client.ls_datasets()
+            if not response.ok:
+                raise DataChainError(response.message)
+            if not response.data:
+                return
+
+            datasets: Iterator[DatasetListRecord] = (
+                DatasetListRecord.from_dict(d)
+                for d in response.data
+                if not d.get("name", "").startswith(QUERY_DATASET_PREFIX)
+            )
+        else:
+            datasets = self.metastore.list_datasets()
+
         for d in datasets:
             if not d.is_bucket_listing or include_listing:
                 yield d
@@ -1173,9 +1196,12 @@ class Catalog:
     def list_datasets_versions(
         self,
         include_listing: bool = False,
+        studio: bool = False,
     ) -> Iterator[tuple[DatasetListRecord, "DatasetListVersion", Optional["Job"]]]:
         """Iterate over all dataset versions with related jobs."""
-        datasets = list(self.ls_datasets(include_listing=include_listing))
+        datasets = list(
+            self.ls_datasets(include_listing=include_listing, studio=studio)
+        )
 
         # preselect dataset versions jobs from db to avoid multiple queries
         jobs_ids: set[str] = {
@@ -1345,6 +1371,8 @@ class Catalog:
         if cp and not output:
             raise ValueError("Please provide output directory for instantiation")
 
+        from datachain.remote.studio import StudioClient
+
         studio_client = StudioClient()
 
         try:
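For orientation, a hedged usage sketch of the new Studio-backed listing path added above. This is not part of the diff: it assumes `datachain` is installed and a Studio token is configured, and the call raises `DataChainError` when the Studio API reports a failure.

```py
# Sketch: list Studio datasets via the new `studio` flag on Catalog.ls_datasets().
from datachain.catalog import get_catalog

catalog = get_catalog()
for dataset in catalog.ls_datasets(studio=True):
    # each item is a DatasetListRecord built by DatasetListRecord.from_dict()
    print(dataset.name, len(dataset.versions))
```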
@@ -1,19 +1,13 @@
 import os
 from importlib import import_module
-from typing import Any, Optional
-
-from datachain.catalog import Catalog
-from datachain.data_storage import (
-    AbstractMetastore,
-    AbstractWarehouse,
-)
-from datachain.data_storage.serializer import deserialize
-from datachain.data_storage.sqlite import (
-    SQLiteMetastore,
-    SQLiteWarehouse,
-)
+from typing import TYPE_CHECKING, Any, Optional
+
 from datachain.utils import get_envs_by_prefix
 
+if TYPE_CHECKING:
+    from datachain.catalog import Catalog
+    from datachain.data_storage import AbstractMetastore, AbstractWarehouse
+
 METASTORE_SERIALIZED = "DATACHAIN__METASTORE"
 METASTORE_IMPORT_PATH = "DATACHAIN_METASTORE"
 METASTORE_ARG_PREFIX = "DATACHAIN_METASTORE_ARG_"
@@ -27,6 +21,9 @@ IN_MEMORY_ERROR_MESSAGE = "In-memory is only supported on SQLite"
 
 
 def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
+    from datachain.data_storage import AbstractMetastore
+    from datachain.data_storage.serializer import deserialize
+
     metastore_serialized = os.environ.get(METASTORE_SERIALIZED)
     if metastore_serialized:
         metastore_obj = deserialize(metastore_serialized)
@@ -45,6 +42,8 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
     }
 
     if not metastore_import_path:
+        from datachain.data_storage.sqlite import SQLiteMetastore
+
         metastore_args["in_memory"] = in_memory
         return SQLiteMetastore(**metastore_args)
     if in_memory:
@@ -62,6 +61,9 @@ def get_metastore(in_memory: bool = False) -> "AbstractMetastore":
 
 
 def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
+    from datachain.data_storage import AbstractWarehouse
+    from datachain.data_storage.serializer import deserialize
+
     warehouse_serialized = os.environ.get(WAREHOUSE_SERIALIZED)
     if warehouse_serialized:
         warehouse_obj = deserialize(warehouse_serialized)
@@ -80,6 +82,8 @@ def get_warehouse(in_memory: bool = False) -> "AbstractWarehouse":
     }
 
     if not warehouse_import_path:
+        from datachain.data_storage.sqlite import SQLiteWarehouse
+
         warehouse_args["in_memory"] = in_memory
         return SQLiteWarehouse(**warehouse_args)
     if in_memory:
@@ -121,7 +125,7 @@ def get_distributed_class(**kwargs):
 
 def get_catalog(
     client_config: Optional[dict[str, Any]] = None, in_memory: bool = False
-) -> Catalog:
+) -> "Catalog":
    """
    Function that creates Catalog instance with appropriate metastore
    and warehouse classes. Metastore class can be provided with env variable
@@ -133,6 +137,8 @@ def get_catalog(
    and name of variable after, e.g. if it accepts team_id as kwargs
    we can provide DATACHAIN_METASTORE_ARG_TEAM_ID=12345 env variable.
    """
+    from datachain.catalog import Catalog
+
     return Catalog(
         metastore=get_metastore(in_memory=in_memory),
         warehouse=get_warehouse(in_memory=in_memory),
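A short sketch of how this env-driven wiring is exercised; this is an assumption for illustration, not from the diff. With no `DATACHAIN_METASTORE` / `DATACHAIN_WAREHOUSE` variables set, the loader falls back to the SQLite stores, importing `Catalog` lazily as in the hunks above.

```py
# Sketch: default catalog construction. A custom implementation would be
# selected via e.g. DATACHAIN_METASTORE=my_pkg.MyMetastore (hypothetical path)
# plus DATACHAIN_METASTORE_ARG_* variables for its constructor kwargs.
from datachain.catalog import get_catalog

catalog = get_catalog(in_memory=True)  # in-memory SQLite, per the loader logic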
datachain/cli/__init__.py CHANGED
@@ -6,7 +6,6 @@ from multiprocessing import freeze_support
 from typing import Optional
 
 from datachain.cli.utils import get_logging_level
-from datachain.telemetry import telemetry
 
 from .commands import (
     clear_cache,
@@ -70,6 +69,8 @@ def main(argv: Optional[list[str]] = None) -> int:
         error, return_code = handle_general_exception(exc, args, logging_level)
         return return_code
     finally:
+        from datachain.telemetry import telemetry
+
         telemetry.send_cli_call(args.command, error=error)
 
 
@@ -63,19 +63,31 @@ def add_auth_parser(subparsers, parent_parser) -> None:
         default=False,
         help="Use code-based authentication without browser",
     )
+    login_parser.add_argument(
+        "--local",
+        action="store_true",
+        default=False,
+        help="Save the token in the local project config",
+    )
 
     auth_logout_help = "Log out from Studio"
     auth_logout_description = (
         "Remove the Studio authentication token from global config."
    )
 
-    auth_subparser.add_parser(
+    logout_parser = auth_subparser.add_parser(
         "logout",
         parents=[parent_parser],
         description=auth_logout_description,
         help=auth_logout_help,
         formatter_class=CustomHelpFormatter,
     )
+    logout_parser.add_argument(
+        "--local",
+        action="store_true",
+        default=False,
+        help="Remove the token from the local project config",
+    )
 
     auth_team_help = "Set default team for Studio operations"
     auth_team_description = "Set the default team for Studio operations."
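Judging by the flags added above, the auth flows presumably become `datachain auth login --local` and `datachain auth logout --local`, storing or removing the Studio token in the local project config rather than the global one.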
@@ -17,10 +17,10 @@ from typing import (
     ClassVar,
     NamedTuple,
     Optional,
+    Union,
 )
 from urllib.parse import urlparse
 
-from botocore.exceptions import ClientError
 from dvc_objects.fs.system import reflink
 from fsspec.asyn import get_loop, sync
 from fsspec.callbacks import DEFAULT_CALLBACK, Callback
@@ -28,7 +28,6 @@ from tqdm.auto import tqdm
 
 from datachain.cache import Cache
 from datachain.client.fileslice import FileWrapper
-from datachain.error import ClientError as DataChainClientError
 from datachain.nodes_fetcher import NodesFetcher
 from datachain.nodes_thread_pool import NodeChunk
 
@@ -83,19 +82,17 @@ class Client(ABC):
         self.uri = self.get_uri(self.name)
 
     @staticmethod
-    def get_implementation(url: str) -> type["Client"]:
+    def get_implementation(url: Union[str, os.PathLike[str]]) -> type["Client"]:
         from .azure import AzureClient
         from .gcs import GCSClient
         from .hf import HfClient
         from .local import FileClient
         from .s3 import ClientS3
 
-        protocol = urlparse(url).scheme
+        protocol = urlparse(str(url)).scheme
 
-        if not protocol or _is_win_local_path(url):
+        if not protocol or _is_win_local_path(str(url)):
             return FileClient
-
-        protocol = protocol.lower()
         if protocol == ClientS3.protocol:
             return ClientS3
         if protocol == GCSClient.protocol:
@@ -121,9 +118,11 @@ class Client(ABC):
         return cls.get_uri(storage_name), rel_path
 
     @staticmethod
-    def get_client(source: str, cache: Cache, **kwargs) -> "Client":
+    def get_client(
+        source: Union[str, os.PathLike[str]], cache: Cache, **kwargs
+    ) -> "Client":
         cls = Client.get_implementation(source)
-        storage_url, _ = cls.split_url(source)
+        storage_url, _ = cls.split_url(str(source))
         if os.name == "nt":
             storage_url = storage_url.removeprefix("/")
 
@@ -209,7 +208,7 @@ class Client(ABC):
 
     async def get_current_etag(self, file: "File") -> str:
         kwargs = {}
-        if self.fs.version_aware:
+        if getattr(self.fs, "version_aware", False):
             kwargs["version_id"] = file.version
         info = await self.fs._info(
             self.get_full_path(file.path, file.version), **kwargs
@@ -286,11 +285,6 @@ class Client(ABC):
                 worker.cancel()
             if excs:
                 raise excs[0]
-        except ClientError as exc:
-            raise DataChainClientError(
-                exc.response.get("Error", {}).get("Message") or exc,
-                exc.response.get("Error", {}).get("Code"),
-            ) from exc
         finally:
             # This ensures the progress bar is closed before any exceptions are raised
             progress_bar.close()
@@ -333,7 +327,9 @@ class Client(ABC):
         return not (key.startswith("/") or key.endswith("/") or "//" in key)
 
     async def ls_dir(self, path):
-        return await self.fs._ls(path, detail=True, versions=True)
+        if getattr(self.fs, "version_aware", False):
+            kwargs = {"versions": True}
+        return await self.fs._ls(path, detail=True, **kwargs)
 
     def rel_path(self, path: str) -> str:
         return self.fs.split_path(path)[1]
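A hedged sketch of the widened `get_implementation` signature above: with `os.PathLike` accepted, a `pathlib.Path` now resolves to `FileClient`, while URL strings keep resolving by scheme. Not part of the diff; module paths follow the relative imports shown in the hunk.

```py
# Sketch: scheme-based client resolution on a POSIX system.
from pathlib import Path

from datachain.client.fsspec import Client
from datachain.client.local import FileClient
from datachain.client.s3 import ClientS3

assert Client.get_implementation(Path("/tmp/data")) is FileClient  # no scheme
assert Client.get_implementation("s3://bucket/key") is ClientS3    # by protocol
```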
datachain/client/hf.py CHANGED
@@ -1,25 +1,50 @@
-import os
+import functools
 import posixpath
-from typing import Any, cast
-
-from huggingface_hub import HfFileSystem
+from typing import Any
 
 from datachain.lib.file import File
 
 from .fsspec import Client
 
 
+class classproperty:  # noqa: N801
+    def __init__(self, func):
+        self.fget = func
+
+    def __get__(self, instance, owner):
+        return self.fget(owner)
+
+
+@functools.cache
+def get_hf_filesystem_cls():
+    import fsspec
+    from packaging.version import Version, parse
+
+    fsspec_version = parse(fsspec.__version__)
+    minver = Version("2024.12.0")
+
+    if fsspec_version < minver:
+        raise ImportError(
+            f"datachain requires 'fsspec>={minver}' but version "
+            f"{fsspec_version} is installed."
+        )
+
+    from fsspec.implementations.asyn_wrapper import AsyncFileSystemWrapper
+    from huggingface_hub import HfFileSystem
+
+    fs_cls = AsyncFileSystemWrapper.wrap_class(HfFileSystem)
+    # AsyncFileSystemWrapper does not set class properties, so we need to set them back.
+    fs_cls.protocol = HfFileSystem.protocol
+    return fs_cls
+
+
 class HfClient(Client):
-    FS_CLASS = HfFileSystem
     PREFIX = "hf://"
     protocol = "hf"
 
-    @classmethod
-    def create_fs(cls, **kwargs) -> HfFileSystem:
-        if os.environ.get("HF_TOKEN"):
-            kwargs["token"] = os.environ["HF_TOKEN"]
-
-        return cast(HfFileSystem, super().create_fs(**kwargs))
+    @classproperty
+    def FS_CLASS(cls):  # noqa: N802, N805
+        return get_hf_filesystem_cls()
 
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
@@ -31,8 +56,5 @@ class HfClient(Client):
             last_modified=v["last_commit"].date,
         )
 
-    async def ls_dir(self, path):
-        return self.fs.ls(path, detail=True)
-
     def rel_path(self, path):
         return posixpath.relpath(path, self.name)
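A minimal, self-contained sketch of the lazy class-attribute pattern introduced above: the expensive import-and-wrap work happens only on first access to `FS_CLASS`, and `functools.cache` memoizes the result. Names here (`expensive_cls`, `Demo`) are stand-ins, not datachain APIs.

```py
import functools


class classproperty:  # descriptor returning fget(owner) on class-level access
    def __init__(self, func):
        self.fget = func

    def __get__(self, instance, owner):
        return self.fget(owner)


@functools.cache
def expensive_cls():
    print("built once")
    return dict  # stand-in for the wrapped HfFileSystem class


class Demo:
    @classproperty
    def FS_CLASS(cls):
        return expensive_cls()


Demo.FS_CLASS  # prints "built once"
Demo.FS_CLASS  # cached; nothing printed
```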
datachain/client/local.py CHANGED
@@ -67,10 +67,7 @@ class FileClient(Client):
    @classmethod
    def split_url(cls, url: str) -> tuple[str, str]:
        parsed = urlparse(url)
-        if parsed.scheme == "file":
-            scheme, rest = url.split(":", 1)
-            url = f"{scheme.lower()}:{rest}"
-        else:
+        if parsed.scheme != "file":
            url = cls.path_to_uri(url)
 
        fill_path = url[len(cls.PREFIX) :]
@@ -39,13 +39,6 @@ if TYPE_CHECKING:
     from datachain.data_storage.schema import DataTable
     from datachain.lib.file import File
 
-try:
-    import numpy as np
-
-    numpy_imported = True
-except ImportError:
-    numpy_imported = False
-
 
 logger = logging.getLogger("datachain")
 
@@ -96,7 +89,9 @@ class AbstractWarehouse(ABC, Serializable):
        If value is a list or some other iterable, it tries to convert sub elements
        as well
        """
-        if numpy_imported and isinstance(val, (np.ndarray, np.generic)):
+        import numpy as np
+
+        if isinstance(val, (np.ndarray, np.generic)):
            val = val.tolist()
 
        # Optimization: Precompute all the column type variables.
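The effect of the change above, sketched standalone (an illustration, not from the diff): numpy becomes a hard import inside the conversion routine rather than an optional module-level one, and array values are normalized to plain Python lists before storage.

```py
import numpy as np

val = np.array([1, 2, 3])
if isinstance(val, (np.ndarray, np.generic)):
    val = val.tolist()  # [1, 2, 3] — plain Python ints
```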
datachain/dataset.py CHANGED
@@ -302,6 +302,7 @@ class DatasetListVersion:
         size: Optional[int],
         query_script: str = "",
         job_id: Optional[str] = None,
+        **kwargs,
     ):
         return cls(
             id,
@@ -648,6 +649,13 @@ class DatasetListRecord:
     def has_version_with_uuid(self, uuid: str) -> bool:
         return any(v.uuid == uuid for v in self.versions)
 
+    @classmethod
+    def from_dict(cls, d: dict[str, Any]) -> "DatasetListRecord":
+        versions = [DatasetListVersion.parse(**v) for v in d.get("versions", [])]
+        kwargs = {f.name: d[f.name] for f in fields(cls) if f.name in d}
+        kwargs["versions"] = versions
+        return cls(**kwargs)
+
 
 class RowDict(dict):
     pass
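A generic, self-contained sketch of the dict-filtering idiom `from_dict()` uses above: keep only keys matching declared fields, so unknown keys in a remote API payload are dropped instead of raising `TypeError`. `Rec` is a stand-in class, not part of datachain.

```py
from dataclasses import dataclass, fields


@dataclass
class Rec:
    name: str
    description: str = ""


payload = {"name": "my-dataset", "description": "demo", "extra_api_field": 1}
kwargs = {f.name: payload[f.name] for f in fields(Rec) if f.name in payload}
rec = Rec(**kwargs)  # extra_api_field is silently ignored
```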
datachain/error.py CHANGED
@@ -1,15 +1,3 @@
-import botocore.errorfactory
-import botocore.exceptions
-import gcsfs.retry
-
-REMOTE_ERRORS = (
-    gcsfs.retry.HttpError,  # GCS
-    OSError,  # GCS
-    botocore.exceptions.BotoCoreError,  # S3
-    ValueError,  # Azure
-)
-
-
 class DataChainError(RuntimeError):
     pass
 
datachain/fs/utils.py ADDED
@@ -0,0 +1,30 @@
+from typing import TYPE_CHECKING
+
+from fsspec.implementations.local import LocalFileSystem
+
+if TYPE_CHECKING:
+    from fsspec import AbstractFileSystem
+
+
+def _isdir(fs: "AbstractFileSystem", path: str) -> bool:
+    info = fs.info(path)
+    return info["type"] == "directory" or (
+        info["size"] == 0 and info["type"] == "file" and info["name"].endswith("/")
+    )
+
+
+def isfile(fs: "AbstractFileSystem", path: str) -> bool:
+    """
+    Returns True if uri points to a file.
+
+    Supports special directories on object storages, e.g.:
+    Google creates a zero byte file with the same name as the directory with a trailing
+    slash at the end.
+    """
+    if isinstance(fs, LocalFileSystem):
+        return fs.isfile(path)
+
+    try:
+        return not _isdir(fs, path)
+    except FileNotFoundError:
+        return False
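A hedged usage sketch for the new helper, run here against fsspec's in-memory filesystem rather than a real object store:

```py
from fsspec.implementations.memory import MemoryFileSystem

from datachain.fs.utils import isfile

fs = MemoryFileSystem()
fs.pipe("/data/file.txt", b"hello")
print(isfile(fs, "/data/file.txt"))  # True
print(isfile(fs, "/missing.txt"))    # False (FileNotFoundError handled)
```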
@@ -18,6 +18,7 @@ from .aggregate import (
 from .array import contains, cosine_distance, euclidean_distance, length, sip_hash_64
 from .conditional import and_, case, greatest, ifelse, isnone, least, or_
 from .numeric import bit_and, bit_hamming_distance, bit_or, bit_xor, int_hash_64
+from .path import file_ext, file_stem, name, parent
 from .random import rand
 from .string import byte_hamming_distance
 from .window import window
@@ -40,6 +41,8 @@ __all__ = [
     "count",
     "dense_rank",
     "euclidean_distance",
+    "file_ext",
+    "file_stem",
     "first",
     "greatest",
     "ifelse",
@@ -50,7 +53,9 @@ __all__ = [
     "literal",
     "max",
     "min",
+    "name",
     "or_",
+    "parent",
     "path",
     "rand",
     "random",
datachain/func/func.py CHANGED
@@ -3,7 +3,6 @@ from collections.abc import Sequence
 from typing import TYPE_CHECKING, Any, Callable, Optional, Union
 
 from sqlalchemy import BindParameter, Case, ColumnElement, Integer, cast, desc
-from sqlalchemy.ext.hybrid import Comparator
 from sqlalchemy.sql import func as sa_func
 
 from datachain.lib.convert.python_to_sql import python_to_sql
@@ -75,6 +74,8 @@ class Func(Function):
 
     @property
     def _db_cols(self) -> Sequence[ColT]:
+        from sqlalchemy.ext.hybrid import Comparator
+
         return (
             [
                 col
datachain/lib/dc.py CHANGED
@@ -22,7 +22,6 @@ import orjson
 import sqlalchemy
 from pydantic import BaseModel
 from sqlalchemy.sql.functions import GenericFunction
-from sqlalchemy.sql.sqltypes import NullType
 from tqdm import tqdm
 
 from datachain.dataset import DatasetRecord
@@ -55,7 +54,6 @@ from datachain.query import Session
 from datachain.query.dataset import DatasetQuery, PartitionByType
 from datachain.query.schema import DEFAULT_DELIMITER, Column, ColumnMeta
 from datachain.sql.functions import path as pathfunc
-from datachain.telemetry import telemetry
 from datachain.utils import batched_it, inside_notebook, row_to_nested_dict
 
 if TYPE_CHECKING:
@@ -215,7 +213,7 @@ class DataChain:
        from mistralai.client import MistralClient
        from mistralai.models.chat_completion import ChatMessage
 
-        from datachain.dc import DataChain, Column
+        from datachain import DataChain, Column
 
        PROMPT = (
            "Was this bot dialog successful? "
@@ -408,7 +406,7 @@ class DataChain:
    @classmethod
    def from_storage(
        cls,
-        uri,
+        uri: Union[str, os.PathLike[str]],
        *,
        type: FileType = "binary",
        session: Optional[Session] = None,
@@ -550,6 +548,8 @@ class DataChain:
            )
        ```
        """
+        from datachain.telemetry import telemetry
+
        query = DatasetQuery(
            name=name,
            version=version,
@@ -573,7 +573,7 @@ class DataChain:
    @classmethod
    def from_json(
        cls,
-        path,
+        path: Union[str, os.PathLike[str]],
        type: FileType = "text",
        spec: Optional[DataType] = None,
        schema_from: Optional[str] = "auto",
@@ -610,7 +610,7 @@ class DataChain:
        ```
        """
        if schema_from == "auto":
-            schema_from = path
+            schema_from = str(path)
 
        def jmespath_to_name(s: str):
            name_end = re.search(r"\W", s).start() if re.search(r"\W", s) else len(s)  # type: ignore[union-attr]
@@ -701,9 +701,22 @@ class DataChain:
        in_memory: bool = False,
        object_name: str = "dataset",
        include_listing: bool = False,
+        studio: bool = False,
    ) -> "DataChain":
        """Generate chain with list of registered datasets.
 
+        Args:
+            session: Optional session instance. If not provided, uses default session.
+            settings: Optional dictionary of settings to configure the chain.
+            in_memory: If True, creates an in-memory session. Defaults to False.
+            object_name: Name of the output object in the chain. Defaults to "dataset".
+            include_listing: If True, includes listing datasets. Defaults to False.
+            studio: If True, returns datasets from Studio only,
+                otherwise returns all local datasets. Defaults to False.
+
+        Returns:
+            DataChain: A new DataChain instance containing dataset information.
+
        Example:
            ```py
            from datachain import DataChain
@@ -719,7 +732,7 @@ class DataChain:
        datasets = [
            DatasetInfo.from_models(d, v, j)
            for d, v, j in catalog.list_datasets_versions(
-                include_listing=include_listing
+                include_listing=include_listing, studio=studio
            )
        ]
 
@@ -1195,6 +1208,8 @@ class DataChain:
            )
        ```
        """
+        from sqlalchemy.sql.sqltypes import NullType
+
        primitives = (bool, str, int, float)
 
        for col_name, expr in kwargs.items():
@@ -2542,7 +2557,7 @@ class DataChain:
 
    def to_storage(
        self,
-        output: str,
+        output: Union[str, os.PathLike[str]],
        signal: str = "file",
        placement: FileExportPlacement = "fullpath",
        link_type: Literal["copy", "symlink"] = "copy",
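Finally, a hedged sketch of the `os.PathLike` support added to the `DataChain` API above: local locations can now be passed as `pathlib.Path` objects instead of strings. The paths used here are illustrative.

```py
from pathlib import Path

from datachain import DataChain

chain = DataChain.from_storage(Path("data/images"), type="image")
chain.to_storage(Path("exported"), placement="filename")
```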