datachain 0.8.8__py3-none-any.whl → 0.8.10__py3-none-any.whl

This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -1,34 +1,31 @@
-def add_studio_parser(subparsers, parent_parser) -> None:
-    studio_help = "Commands to authenticate DataChain with Iterative Studio"
-    studio_description = (
-        "Authenticate DataChain with Studio and set the token. "
-        "Once this token has been properly configured,\n"
-        "DataChain will utilize it for seamlessly sharing datasets\n"
-        "and using Studio features from CLI"
+def add_auth_parser(subparsers, parent_parser) -> None:
+    auth_help = "Manage Studio authentication"
+    auth_description = (
+        "Manage authentication and settings for Studio. "
+        "Configure tokens for sharing datasets and using Studio features."
     )
 
-    studio_parser = subparsers.add_parser(
-        "studio",
+    auth_parser = subparsers.add_parser(
+        "auth",
         parents=[parent_parser],
-        description=studio_description,
-        help=studio_help,
+        description=auth_description,
+        help=auth_help,
     )
-    studio_subparser = studio_parser.add_subparsers(
+    auth_subparser = auth_parser.add_subparsers(
         dest="cmd",
-        help="Use `DataChain studio CMD --help` to display command-specific help.",
-        required=True,
+        help="Use `datachain auth CMD --help` to display command-specific help",
     )
 
-    studio_login_help = "Authenticate DataChain with Studio host"
-    studio_login_description = (
-        "By default, this command authenticates the DataChain with Studio\n"
-        "using default scopes and assigns a random name as the token name."
+    auth_login_help = "Authenticate with Studio"
+    auth_login_description = (
+        "Authenticate with Studio using default scopes. "
+        "A random name will be assigned as the token name if not specified."
     )
-    login_parser = studio_subparser.add_parser(
+    login_parser = auth_subparser.add_parser(
         "login",
         parents=[parent_parser],
-        description=studio_login_description,
-        help=studio_login_help,
+        description=auth_login_description,
+        help=auth_login_help,
     )
 
     login_parser.add_argument(
@@ -36,14 +33,14 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         "--hostname",
         action="store",
         default=None,
-        help="The hostname of the Studio instance to authenticate with.",
+        help="Hostname of the Studio instance",
     )
     login_parser.add_argument(
         "-s",
         "--scopes",
         action="store",
         default=None,
-        help="The scopes for the authentication token. ",
+        help="Authentication token scopes",
     )
 
     login_parser.add_argument(
@@ -51,76 +48,55 @@ def add_studio_parser(subparsers, parent_parser) -> None:
         "--name",
         action="store",
         default=None,
-        help="The name of the authentication token. It will be used to\n"
-        "identify token shown in Studio profile.",
+        help="Authentication token name (shown in Studio profile)",
     )
 
     login_parser.add_argument(
         "--no-open",
         action="store_true",
         default=False,
-        help="Use authentication flow based on user code.\n"
-        "You will be presented with user code to enter in browser.\n"
-        "DataChain will also use this if it cannot launch browser on your behalf.",
+        help="Use code-based authentication without browser",
    )
 
-    studio_logout_help = "Logout user from Studio"
-    studio_logout_description = "This removes the studio token from your global config."
+    auth_logout_help = "Log out from Studio"
+    auth_logout_description = (
+        "Remove the Studio authentication token from global config."
+    )
 
-    studio_subparser.add_parser(
+    auth_subparser.add_parser(
         "logout",
         parents=[parent_parser],
-        description=studio_logout_description,
-        help=studio_logout_help,
+        description=auth_logout_description,
+        help=auth_logout_help,
     )
 
-    studio_team_help = "Set the default team for DataChain"
-    studio_team_description = (
-        "Set the default team for DataChain to use when interacting with Studio."
-    )
+    auth_team_help = "Set default team for Studio operations"
+    auth_team_description = "Set the default team for Studio operations."
 
-    team_parser = studio_subparser.add_parser(
+    team_parser = auth_subparser.add_parser(
         "team",
         parents=[parent_parser],
-        description=studio_team_description,
-        help=studio_team_help,
+        description=auth_team_description,
+        help=auth_team_help,
     )
     team_parser.add_argument(
         "team_name",
         action="store",
-        help="The name of the team to set as the default.",
+        help="Name of the team to set as default",
     )
     team_parser.add_argument(
         "--global",
         action="store_true",
         default=False,
-        help="Set the team globally for all DataChain projects.",
+        help="Set team globally for all projects",
     )
 
-    studio_token_help = "View the token datachain uses to contact Studio"  # noqa: S105 # nosec B105
+    auth_token_help = "View Studio authentication token"  # noqa: S105
+    auth_token_description = "Display the current authentication token for Studio."  # noqa: S105
 
-    studio_subparser.add_parser(
+    auth_subparser.add_parser(
         "token",
         parents=[parent_parser],
-        description=studio_token_help,
-        help=studio_token_help,
-    )
-
-    studio_ls_dataset_help = "List the available datasets from Studio"
-    studio_ls_dataset_description = (
-        "This command lists all the datasets available in Studio.\n"
-        "It will show the dataset name and the number of versions available."
-    )
-
-    ls_dataset_parser = studio_subparser.add_parser(
-        "dataset",
-        parents=[parent_parser],
-        description=studio_ls_dataset_description,
-        help=studio_ls_dataset_help,
-    )
-    ls_dataset_parser.add_argument(
-        "--team",
-        action="store",
-        default=None,
-        help="The team to list datasets for. By default, it will use team from config.",
+        description=auth_token_description,
+        help=auth_token_help,
     )
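For orientation, here is a minimal sketch of how the renamed `auth` parser could be exercised. Only `add_auth_parser` and its flags come from the hunks above; the top-level parser wiring, the `dest="command"` name, and the hostname value are illustrative assumptions.

```python
# Hypothetical wiring of add_auth_parser (from the hunks above) into an argparse CLI.
import argparse

parser = argparse.ArgumentParser(prog="datachain")
parent = argparse.ArgumentParser(add_help=False)
subparsers = parser.add_subparsers(dest="command")  # top-level wiring is assumed

add_auth_parser(subparsers, parent)

# Roughly equivalent to: datachain auth login --hostname studio.example.com --no-open
args = parser.parse_args(["auth", "login", "--hostname", "studio.example.com", "--no-open"])
print(args.command, args.cmd, args.hostname, args.no_open)
# -> auth login studio.example.com True
```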
@@ -30,7 +30,7 @@ def add_sources_arg(parser: ArgumentParser, nargs: Union[str, int] = "+") -> Act
         "sources",
         type=str,
         nargs=nargs,
-        help="Data sources - paths to cloud storage dirs",
+        help="Data sources - paths to source storage directories or files",
     )
 
 
datachain/cli/utils.py CHANGED
@@ -87,7 +87,7 @@ def get_logging_level(args: Namespace) -> int:
 def determine_flavors(studio: bool, local: bool, all: bool, token: Optional[str]):
     if studio and not token:
         raise DataChainError(
-            "Not logged in to Studio. Log in with 'datachain studio login'."
+            "Not logged in to Studio. Log in with 'datachain auth login'."
         )
 
     if local or studio:
datachain/client/local.py CHANGED
@@ -38,7 +38,7 @@ class FileClient(Client):
     def get_uri(cls, name: str) -> "StorageURI":
         from datachain.dataset import StorageURI
 
-        return StorageURI(f'{cls.PREFIX}/{name.removeprefix("/")}')
+        return StorageURI(f"{cls.PREFIX}/{name.removeprefix('/')}")
 
     @classmethod
     def ls_buckets(cls, **kwargs):
@@ -19,6 +19,7 @@ from sqlalchemy import MetaData, Table, UniqueConstraint, exists, select
 from sqlalchemy.dialects import sqlite
 from sqlalchemy.schema import CreateIndex, CreateTable, DropTable
 from sqlalchemy.sql import func
+from sqlalchemy.sql.elements import BinaryExpression, BooleanClauseList
 from sqlalchemy.sql.expression import bindparam, cast
 from sqlalchemy.sql.selectable import Select
 from tqdm.auto import tqdm
@@ -40,7 +41,6 @@ if TYPE_CHECKING:
     from sqlalchemy.schema import SchemaItem
     from sqlalchemy.sql._typing import _FromClauseArgument, _OnClauseArgument
     from sqlalchemy.sql.elements import ColumnElement
-    from sqlalchemy.sql.selectable import Join
     from sqlalchemy.types import TypeEngine
 
     from datachain.lib.file import File
@@ -654,16 +654,47 @@ class SQLiteWarehouse(AbstractWarehouse):
         right: "_FromClauseArgument",
         onclause: "_OnClauseArgument",
         inner: bool = True,
-    ) -> "Join":
+        full: bool = False,
+        columns=None,
+    ) -> "Select":
         """
         Join two tables together.
         """
-        return sqlalchemy.join(
-            left,
-            right,
-            onclause,
-            isouter=not inner,
+        if not full:
+            join_query = sqlalchemy.join(
+                left,
+                right,
+                onclause,
+                isouter=not inner,
+            )
+            return sqlalchemy.select(*columns).select_from(join_query)
+
+        left_right_join = sqlalchemy.select(*columns).select_from(
+            sqlalchemy.join(left, right, onclause, isouter=True)
         )
+        right_left_join = sqlalchemy.select(*columns).select_from(
+            sqlalchemy.join(right, left, onclause, isouter=True)
+        )
+
+        def add_left_rows_filter(exp: BinaryExpression):
+            """
+            Adds filter to right_left_join to remove unmatched left table rows by
+            getting column names that need to be NULL from BinaryExpressions in onclause
+            """
+            return right_left_join.where(
+                getattr(left.c, exp.left.name) == None  # type: ignore[union-attr] # noqa: E711
+            )
+
+        if isinstance(onclause, BinaryExpression):
+            right_left_join = add_left_rows_filter(onclause)
+
+        if isinstance(onclause, BooleanClauseList):
+            for c in onclause.get_children():
+                if isinstance(c, BinaryExpression):
+                    right_left_join = add_left_rows_filter(c)
+
+        union = sqlalchemy.union(left_right_join, right_left_join).subquery()
+        return sqlalchemy.select(*union.c).select_from(union)
 
     def create_pre_udf_table(self, query: "Select") -> "Table":
         """
@@ -31,7 +31,7 @@ if TYPE_CHECKING:
         _FromClauseArgument,
         _OnClauseArgument,
     )
-    from sqlalchemy.sql.selectable import Join, Select
+    from sqlalchemy.sql.selectable import Select
     from sqlalchemy.types import TypeEngine
 
     from datachain.data_storage import schema
@@ -873,7 +873,7 @@ class AbstractWarehouse(ABC, Serializable):
         right: "_FromClauseArgument",
         onclause: "_OnClauseArgument",
         inner: bool = True,
-    ) -> "Join":
+    ) -> "Select":
         """
         Join two tables together.
         """
datachain/lib/arrow.py CHANGED
@@ -33,7 +33,7 @@ class ReferenceFileSystem(fsspec.implementations.reference.ReferenceFileSystem):
         # reads the whole file in-memory.
         (uri,) = self.references[path]
         protocol, _ = split_protocol(uri)
-        return self.fss[protocol]._open(uri, mode, *args, **kwargs)
+        return self.fss[protocol].open(uri, mode, *args, **kwargs)
 
 
 class ArrowGenerator(Generator):
@@ -52,15 +52,15 @@ def python_to_sql(typ):  # noqa: PLR0911
 
     args = get_args(typ)
     if inspect.isclass(orig) and (issubclass(list, orig) or issubclass(tuple, orig)):
-        if args is None or len(args) != 1:
+        if args is None:
             raise TypeError(f"Cannot resolve type '{typ}' for flattening features")
 
         args0 = args[0]
         if ModelStore.is_pydantic(args0):
             return Array(JSON())
 
-        next_type = python_to_sql(args0)
-        return Array(next_type)
+        list_type = list_of_args_to_type(args)
+        return Array(list_type)
 
     if orig is Annotated:
         # Ignoring annotations
@@ -82,6 +82,18 @@ def python_to_sql(typ):  # noqa: PLR0911
     raise TypeError(f"Cannot recognize type {typ}")
 
 
+def list_of_args_to_type(args) -> SQLType:
+    first_type = python_to_sql(args[0])
+    for next_arg in args[1:]:
+        try:
+            next_type = python_to_sql(next_arg)
+            if next_type != first_type:
+                return JSON()
+        except TypeError:
+            return JSON()
+    return first_type
+
+
 
 def _is_json_inside_union(orig, args) -> bool:
     if orig == Union and len(args) >= 2:
@@ -35,8 +35,7 @@ def unflatten_to_json_pos(
 def _normalize(name: str) -> str:
     if DEFAULT_DELIMITER in name:
         raise RuntimeError(
-            f"variable '{name}' cannot be used "
-            f"because it contains {DEFAULT_DELIMITER}"
+            f"variable '{name}' cannot be used because it contains {DEFAULT_DELIMITER}"
         )
     return _to_snake_case(name)
 
datachain/lib/dc.py CHANGED
@@ -11,6 +11,7 @@ from typing import (
     BinaryIO,
     Callable,
     ClassVar,
+    Literal,
     Optional,
     TypeVar,
     Union,
@@ -1276,7 +1277,12 @@ class DataChain:
             yield ret[0] if len(cols) == 1 else tuple(ret)
 
     def to_pytorch(
-        self, transform=None, tokenizer=None, tokenizer_kwargs=None, num_samples=0
+        self,
+        transform=None,
+        tokenizer=None,
+        tokenizer_kwargs=None,
+        num_samples=0,
+        remove_prefetched: bool = False,
     ):
         """Convert to pytorch dataset format.
 
@@ -1286,6 +1292,7 @@ class DataChain:
             tokenizer_kwargs (dict): Additional kwargs to pass when calling tokenizer.
             num_samples (int): Number of random samples to draw for each epoch.
                 This argument is ignored if `num_samples=0` (the default).
+            remove_prefetched (bool): Whether to remove prefetched files after reading.
 
         Example:
             ```py
@@ -1312,6 +1319,7 @@ class DataChain:
             tokenizer_kwargs=tokenizer_kwargs,
             num_samples=num_samples,
             dc_settings=chain._settings,
+            remove_prefetched=remove_prefetched,
         )
 
     def remove_file_signals(self) -> "Self":  # noqa: D102
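A hedged usage sketch of the new `remove_prefetched` flag; the storage URI is illustrative and `torch` must be installed:

```python
# Illustrative only: stream a chain into a torch DataLoader and drop prefetched
# files from the temporary cache once they have been read.
from torch.utils.data import DataLoader
from datachain import DataChain

chain = DataChain.from_storage("s3://my-bucket/images/")  # hypothetical URI
loader = DataLoader(chain.to_pytorch(remove_prefetched=True), batch_size=16)
```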
@@ -1324,6 +1332,7 @@ class DataChain:
         on: Union[MergeColType, Sequence[MergeColType]],
         right_on: Optional[Union[MergeColType, Sequence[MergeColType]]] = None,
         inner=False,
+        full=False,
         rname="right_",
     ) -> "Self":
         """Merge two chains based on the specified criteria.
@@ -1337,6 +1346,7 @@ class DataChain:
             right_on: Optional predicate or list of Predicates for the `right_ds`
                 to join.
             inner (bool): Whether to run inner join or outer join.
+            full (bool): Whether to run full outer join.
             rname (str): Name prefix for conflicting signal names.
 
         Examples:
@@ -1411,7 +1421,7 @@ class DataChain:
         )
 
         query = self._query.join(
-            right_ds._query, sqlalchemy.and_(*ops), inner, rname + "{name}"
+            right_ds._query, sqlalchemy.and_(*ops), inner, full, rname + "{name}"
         )
         query.feature_schema = None
         ds = self._evolve(query=query)
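A hedged example of the new `full=True` option; the column names and values are made up:

```python
# Illustrative full outer merge: rows unmatched on either side are kept,
# with the missing side's signals left empty.
from datachain import DataChain

left = DataChain.from_values(id=[1, 2, 3], a=["x", "y", "z"])
right = DataChain.from_values(id=[2, 3, 4], b=[20, 30, 40])

merged = left.merge(right, on="id", full=True)
merged.show()
```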
@@ -2415,11 +2425,22 @@ class DataChain:
     def export_files(
         self,
         output: str,
-        signal="file",
+        signal: str = "file",
         placement: FileExportPlacement = "fullpath",
         use_cache: bool = True,
+        link_type: Literal["copy", "symlink"] = "copy",
     ) -> None:
-        """Method that exports all files from chain to some folder."""
+        """Export files from a specified signal to a directory.
+
+        Args:
+            output: Path to the target directory for exporting files.
+            signal: Name of the signal to export files from.
+            placement: The method to use for naming exported files.
+                The possible values are: "filename", "etag", "fullpath", and "checksum".
+            use_cache: If `True`, cache the files before exporting.
+            link_type: Method to use for exporting files.
+                Falls back to `'copy'` if symlinking fails.
+        """
         if placement == "filename" and (
             self._query.distinct(pathfunc.name(C(f"{signal}__path"))).count()
             != self._query.count()
@@ -2427,7 +2448,7 @@ class DataChain:
             raise ValueError("Files with the same name found")
 
         for file in self.collect(signal):
-            file.export(output, placement, use_cache)  # type: ignore[union-attr]
+            file.export(output, placement, use_cache, link_type=link_type)  # type: ignore[union-attr]
 
     def shuffle(self) -> "Self":
         """Shuffle the rows of the chain deterministically."""
datachain/lib/file.py CHANGED
@@ -1,3 +1,4 @@
+import errno
 import hashlib
 import io
 import json
@@ -76,18 +77,18 @@ class TarVFile(VFile):
     def open(cls, file: "File", location: list[dict]):
         """Stream file from tar archive based on location in archive."""
         if len(location) > 1:
-            VFileError(file, "multiple 'location's are not supported yet")
+            raise VFileError(file, "multiple 'location's are not supported yet")
 
         loc = location[0]
 
         if (offset := loc.get("offset", None)) is None:
-            VFileError(file, "'offset' is not specified")
+            raise VFileError(file, "'offset' is not specified")
 
         if (size := loc.get("size", None)) is None:
-            VFileError(file, "'size' is not specified")
+            raise VFileError(file, "'size' is not specified")
 
         if (parent := loc.get("parent", None)) is None:
-            VFileError(file, "'parent' is not specified")
+            raise VFileError(file, "'parent' is not specified")
 
         tar_file = File(**parent)
         tar_file._set_stream(file._catalog)
@@ -236,11 +237,26 @@ class File(DataModel):
         with open(destination, mode="wb") as f:
             f.write(self.read())
 
+    def _symlink_to(self, destination: str):
+        if self.location:
+            raise OSError(errno.ENOTSUP, "Symlinking virtual file is not supported")
+
+        if self._caching_enabled:
+            self.ensure_cached()
+            source = self.get_local_path()
+            assert source, "File was not cached"
+        elif self.source.startswith("file://"):
+            source = self.get_path()
+        else:
+            raise OSError(errno.EXDEV, "can't link across filesystems")
+        return os.symlink(source, destination)
+
     def export(
         self,
         output: str,
         placement: ExportPlacement = "fullpath",
         use_cache: bool = True,
+        link_type: Literal["copy", "symlink"] = "copy",
     ) -> None:
         """Export file to new location."""
         if use_cache:
@@ -249,6 +265,13 @@ class File(DataModel):
         dst_dir = os.path.dirname(dst)
         os.makedirs(dst_dir, exist_ok=True)
 
+        if link_type == "symlink":
+            try:
+                return self._symlink_to(dst)
+            except OSError as exc:
+                if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
+                    raise
+
         self.save(dst)
 
     def _set_stream(
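The errno-based fallback in `File.export` follows a common pattern: attempt a symlink and copy instead when the platform or filesystem cannot support it. A generic, self-contained sketch (not datachain's API):

```python
# Symlink-or-copy fallback: only "symlink not possible here" errors trigger the copy.
import errno
import os
import shutil


def place_file(src: str, dst: str, link_type: str = "copy") -> None:
    if link_type == "symlink":
        try:
            os.symlink(src, dst)
            return
        except OSError as exc:
            if exc.errno not in (errno.ENOTSUP, errno.EXDEV, errno.ENOSYS):
                raise
    shutil.copyfile(src, dst)
```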
datachain/lib/listing.py CHANGED
@@ -113,14 +113,14 @@ def parse_listing_uri(uri: str, cache, client_config) -> tuple[Optional[str], st
     telemetry.log_param("client", client.PREFIX)
 
     if not uri.endswith("/") and _isfile(client, uri):
-        return None, f'{storage_uri}/{path.lstrip("/")}', path
+        return None, f"{storage_uri}/{path.lstrip('/')}", path
     if uses_glob(path):
         lst_uri_path = posixpath.dirname(path)
     else:
-        storage_uri, path = Client.parse_url(f'{uri.rstrip("/")}/')
+        storage_uri, path = Client.parse_url(f"{uri.rstrip('/')}/")
         lst_uri_path = path
 
-    lst_uri = f'{storage_uri}/{lst_uri_path.lstrip("/")}'
+    lst_uri = f"{storage_uri}/{lst_uri_path.lstrip('/')}"
     ds_name = (
         f"{LISTING_PREFIX}{storage_uri}/{posixpath.join(lst_uri_path, '').lstrip('/')}"
     )
@@ -180,7 +180,7 @@ def get_listing(
     # for local file system we need to fix listing path / prefix
     # if we are reusing existing listing
     if isinstance(client, FileClient) and listing and listing.name != ds_name:
-        list_path = f'{ds_name.strip("/").removeprefix(listing.name)}/{list_path}'
+        list_path = f"{ds_name.strip('/').removeprefix(listing.name)}/{list_path}"
 
     ds_name = listing.name if listing else ds_name
 
datachain/lib/pytorch.py CHANGED
@@ -50,6 +50,7 @@ class PytorchDataset(IterableDataset):
         tokenizer_kwargs: Optional[dict[str, Any]] = None,
         num_samples: int = 0,
         dc_settings: Optional[Settings] = None,
+        remove_prefetched: bool = False,
     ):
         """
         Pytorch IterableDataset that streams DataChain datasets.
@@ -84,6 +85,7 @@ class PytorchDataset(IterableDataset):
 
         self._cache = catalog.cache
         self._prefetch_cache: Optional[Cache] = None
+        self._remove_prefetched = remove_prefetched
         if prefetch and not self.cache:
             tmp_dir = catalog.cache.tmp_dir
             assert tmp_dir
@@ -147,7 +149,7 @@ class PytorchDataset(IterableDataset):
             rows,
             self.prefetch,
             download_cb=download_cb,
-            after_prefetch=download_cb.increment_file_count,
+            remove_prefetched=self._remove_prefetched,
         )
 
         with download_cb, closing(rows):